Repository: Vonng/ddia
Branch: main
Commit: 573bb53a0557
Files: 139
Total size: 5.4 MB

Directory structure:
gitextract_zlswtsh8/

├── .github/
│   └── workflows/
│       └── pages.yaml
├── .gitignore
├── .nojekyll
├── LICENSE
├── Makefile
├── README.md
├── assets/
│   └── css/
│       ├── custom.css
│       └── example.css
├── bin/
│   ├── Pipfile
│   ├── doc
│   ├── epub
│   ├── preprocess-epub.py
│   ├── toc.py
│   ├── translate.py
│   └── zh-tw.py
├── content/
│   ├── en/
│   │   ├── _index.md
│   │   ├── ch1.md
│   │   ├── ch10.md
│   │   ├── ch11.md
│   │   ├── ch12.md
│   │   ├── ch13.md
│   │   ├── ch14.md
│   │   ├── ch2.md
│   │   ├── ch3.md
│   │   ├── ch4.md
│   │   ├── ch5.md
│   │   ├── ch6.md
│   │   ├── ch7.md
│   │   ├── ch8.md
│   │   ├── ch9.md
│   │   ├── colophon.md
│   │   ├── glossary.md
│   │   ├── indexes.md
│   │   ├── part-i.md
│   │   ├── part-ii.md
│   │   ├── part-iii.md
│   │   ├── preface.md
│   │   └── toc.md
│   ├── tw/
│   │   ├── _index.md
│   │   ├── ch1.md
│   │   ├── ch10.md
│   │   ├── ch11.md
│   │   ├── ch12.md
│   │   ├── ch13.md
│   │   ├── ch14.md
│   │   ├── ch2.md
│   │   ├── ch3.md
│   │   ├── ch4.md
│   │   ├── ch5.md
│   │   ├── ch6.md
│   │   ├── ch7.md
│   │   ├── ch8.md
│   │   ├── ch9.md
│   │   ├── colophon.md
│   │   ├── contrib.md
│   │   ├── glossary.md
│   │   ├── indexes.md
│   │   ├── part-i.md
│   │   ├── part-ii.md
│   │   ├── part-iii.md
│   │   ├── preface.md
│   │   └── toc.md
│   ├── v1/
│   │   ├── _index.md
│   │   ├── ch1.md
│   │   ├── ch10.md
│   │   ├── ch11.md
│   │   ├── ch12.md
│   │   ├── ch2.md
│   │   ├── ch3.md
│   │   ├── ch4.md
│   │   ├── ch5.md
│   │   ├── ch6.md
│   │   ├── ch7.md
│   │   ├── ch8.md
│   │   ├── ch9.md
│   │   ├── colophon.md
│   │   ├── contrib.md
│   │   ├── glossary.md
│   │   ├── part-i.md
│   │   ├── part-ii.md
│   │   ├── part-iii.md
│   │   ├── preface.md
│   │   └── toc.md
│   ├── v1_tw/
│   │   ├── _index.md
│   │   ├── ch1.md
│   │   ├── ch10.md
│   │   ├── ch11.md
│   │   ├── ch12.md
│   │   ├── ch2.md
│   │   ├── ch3.md
│   │   ├── ch4.md
│   │   ├── ch5.md
│   │   ├── ch6.md
│   │   ├── ch7.md
│   │   ├── ch8.md
│   │   ├── ch9.md
│   │   ├── colophon.md
│   │   ├── contrib.md
│   │   ├── glossary.md
│   │   ├── part-i.md
│   │   ├── part-ii.md
│   │   ├── part-iii.md
│   │   ├── preface.md
│   │   └── toc.md
│   └── zh/
│       ├── _index.md
│       ├── ch1.md
│       ├── ch10.md
│       ├── ch11.md
│       ├── ch12.md
│       ├── ch13.md
│       ├── ch14.md
│       ├── ch2.md
│       ├── ch3.md
│       ├── ch4.md
│       ├── ch5.md
│       ├── ch6.md
│       ├── ch7.md
│       ├── ch8.md
│       ├── ch9.md
│       ├── colophon.md
│       ├── contrib.md
│       ├── glossary.md
│       ├── indexes.md
│       ├── part-i.md
│       ├── part-ii.md
│       ├── part-iii.md
│       ├── preface.md
│       └── toc.md
├── giscus.json
├── go.mod
├── go.sum
├── hugo.yaml
├── i18n/
│   ├── en.yaml
│   ├── tw.yaml
│   ├── v2.yaml
│   └── zh.yaml
├── js/
│   └── epub.css
├── layouts/
│   └── shortcodes/
│       └── figure.html
└── metadata.yaml

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/pages.yaml
================================================
# Builds the Hugo site and publishes it to GitHub Pages
name: Deploy Hugo site to Pages

on:
  # Manual trigger: can be started by hand from the Actions tab
  workflow_dispatch:

  # Automatic trigger: every push to the main branch
  push:
    branches:
      - main

# GITHUB_TOKEN scopes required for a GitHub Pages deployment
permissions:
  id-token: write
  pages: write
  contents: read

# Keep deployments serialized in a single "pages" group. Runs queued between
# the in-progress run and the latest queued run are skipped, but an in-progress
# run is never cancelled so that a production deployment can always finish.
concurrency:
  cancel-in-progress: false
  group: pages

# Every `run` step uses bash
defaults:
  run:
    shell: bash

jobs:
  # Build job: installs Go and Hugo, renders the site, uploads ./public
  build:
    runs-on: ubuntu-latest
    env:
      HUGO_VERSION: "0.155.3"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive
          # fetch all history for .GitInfo and .Lastmod
          fetch-depth: 0
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.26"
      - id: pages
        name: Setup Pages
        uses: actions/configure-pages@v4
      - name: Setup Hugo
        # Downloads the extended Hugo .deb for HUGO_VERSION and installs it
        run: |
          wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \
          && sudo dpkg -i ${{ runner.temp }}/hugo.deb
      - name: Build with Hugo
        env:
          # Both variables are set for maximum backward compatibility with Hugo modules
          HUGO_ENV: production
          HUGO_ENVIRONMENT: production
        run: |
          hugo \
            --gc --minify \
            --baseURL "${{ steps.pages.outputs.base_url }}/"
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: ./public

  # Deployment job: runs after `build` and deploys to GitHub Pages
  deploy:
    needs: build
    runs-on: ubuntu-latest
    environment:
      # URL reported by the `deployment` step below
      url: ${{ steps.deployment.outputs.page_url }}
      name: github-pages
    steps:
      - id: deployment
        name: Deploy to GitHub Pages
        uses: actions/deploy-pages@v4

================================================
FILE: .gitignore
================================================
# Local editor, OS, and Python cache artifacts
.idea/
.code/
__pycache__/
.DS_Store
# Build and scratch output: public/ is the directory the Pages workflow uploads;
# tmp/ and output/ are presumably written by the bin/ scripts (TODO confirm)
tmp/
output/
public/
.hugo_build.lock
# Local-only files that are kept out of version control
.claude
CLAUDE.md
content/cn/
zh.md
en.md

================================================
FILE: .nojekyll
================================================


================================================
FILE: LICENSE
================================================
Attribution 4.0 International

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.

     Considerations for licensors: Our public licenses are
     intended for use by those authorized to give the public
     permission to use material in ways otherwise restricted by
     copyright and certain other rights. Our licenses are
     irrevocable. Licensors should read and understand the terms
     and conditions of the license they choose before applying it.
     Licensors should also secure all rights necessary before
     applying our licenses so that the public can reuse the
     material as expected. Licensors should clearly mark any
     material not subject to the license. This includes other CC-
     licensed material, or material used under an exception or
     limitation to copyright. More considerations for licensors:
    wiki.creativecommons.org/Considerations_for_licensors

     Considerations for the public: By using one of our public
     licenses, a licensor grants the public permission to use the
     licensed material under specified terms and conditions. If
     the licensor's permission is not necessary for any reason--for
     example, because of any applicable exception or limitation to
     copyright--then that use is not regulated by the license. Our
     licenses grant only permissions under copyright and certain
     other rights that a licensor has authority to grant. Use of
     the licensed material may still be restricted for other
     reasons, including because others have copyright or other
     rights in the material. A licensor may make special requests,
     such as asking that all changes be marked or described.
     Although not required by our licenses, you are encouraged to
     respect those requests where reasonable. More considerations
     for the public:
    wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution 4.0 International Public License ("Public License"). To the
extent this Public License may be interpreted as a contract, You are
granted the Licensed Rights in consideration of Your acceptance of
these terms and conditions, and the Licensor grants You such rights in
consideration of benefits the Licensor receives from making the
Licensed Material available under these terms and conditions.


Section 1 -- Definitions.

  a. Adapted Material means material subject to Copyright and Similar
     Rights that is derived from or based upon the Licensed Material
     and in which the Licensed Material is translated, altered,
     arranged, transformed, or otherwise modified in a manner requiring
     permission under the Copyright and Similar Rights held by the
     Licensor. For purposes of this Public License, where the Licensed
     Material is a musical work, performance, or sound recording,
     Adapted Material is always produced where the Licensed Material is
     synched in timed relation with a moving image.

  b. Adapter's License means the license You apply to Your Copyright
     and Similar Rights in Your contributions to Adapted Material in
     accordance with the terms and conditions of this Public License.

  c. Copyright and Similar Rights means copyright and/or similar rights
     closely related to copyright including, without limitation,
     performance, broadcast, sound recording, and Sui Generis Database
     Rights, without regard to how the rights are labeled or
     categorized. For purposes of this Public License, the rights
     specified in Section 2(b)(1)-(2) are not Copyright and Similar
     Rights.

  d. Effective Technological Measures means those measures that, in the
     absence of proper authority, may not be circumvented under laws
     fulfilling obligations under Article 11 of the WIPO Copyright
     Treaty adopted on December 20, 1996, and/or similar international
     agreements.

  e. Exceptions and Limitations means fair use, fair dealing, and/or
     any other exception or limitation to Copyright and Similar Rights
     that applies to Your use of the Licensed Material.

  f. Licensed Material means the artistic or literary work, database,
     or other material to which the Licensor applied this Public
     License.

  g. Licensed Rights means the rights granted to You subject to the
     terms and conditions of this Public License, which are limited to
     all Copyright and Similar Rights that apply to Your use of the
     Licensed Material and that the Licensor has authority to license.

  h. Licensor means the individual(s) or entity(ies) granting rights
     under this Public License.

  i. Share means to provide material to the public by any means or
     process that requires permission under the Licensed Rights, such
     as reproduction, public display, public performance, distribution,
     dissemination, communication, or importation, and to make material
     available to the public including in ways that members of the
     public may access the material from a place and at a time
     individually chosen by them.

  j. Sui Generis Database Rights means rights other than copyright
     resulting from Directive 96/9/EC of the European Parliament and of
     the Council of 11 March 1996 on the legal protection of databases,
     as amended and/or succeeded, as well as other essentially
     equivalent rights anywhere in the world.

  k. You means the individual or entity exercising the Licensed Rights
     under this Public License. Your has a corresponding meaning.


Section 2 -- Scope.

  a. License grant.

       1. Subject to the terms and conditions of this Public License,
          the Licensor hereby grants You a worldwide, royalty-free,
          non-sublicensable, non-exclusive, irrevocable license to
          exercise the Licensed Rights in the Licensed Material to:

            a. reproduce and Share the Licensed Material, in whole or
               in part; and

            b. produce, reproduce, and Share Adapted Material.

       2. Exceptions and Limitations. For the avoidance of doubt, where
          Exceptions and Limitations apply to Your use, this Public
          License does not apply, and You do not need to comply with
          its terms and conditions.

       3. Term. The term of this Public License is specified in Section
          6(a).

       4. Media and formats; technical modifications allowed. The
          Licensor authorizes You to exercise the Licensed Rights in
          all media and formats whether now known or hereafter created,
          and to make technical modifications necessary to do so. The
          Licensor waives and/or agrees not to assert any right or
          authority to forbid You from making technical modifications
          necessary to exercise the Licensed Rights, including
          technical modifications necessary to circumvent Effective
          Technological Measures. For purposes of this Public License,
          simply making modifications authorized by this Section 2(a)
          (4) never produces Adapted Material.

       5. Downstream recipients.

            a. Offer from the Licensor -- Licensed Material. Every
               recipient of the Licensed Material automatically
               receives an offer from the Licensor to exercise the
               Licensed Rights under the terms and conditions of this
               Public License.

            b. No downstream restrictions. You may not offer or impose
               any additional or different terms or conditions on, or
               apply any Effective Technological Measures to, the
               Licensed Material if doing so restricts exercise of the
               Licensed Rights by any recipient of the Licensed
               Material.

       6. No endorsement. Nothing in this Public License constitutes or
          may be construed as permission to assert or imply that You
          are, or that Your use of the Licensed Material is, connected
          with, or sponsored, endorsed, or granted official status by,
          the Licensor or others designated to receive attribution as
          provided in Section 3(a)(1)(A)(i).

  b. Other rights.

       1. Moral rights, such as the right of integrity, are not
          licensed under this Public License, nor are publicity,
          privacy, and/or other similar personality rights; however, to
          the extent possible, the Licensor waives and/or agrees not to
          assert any such rights held by the Licensor to the limited
          extent necessary to allow You to exercise the Licensed
          Rights, but not otherwise.

       2. Patent and trademark rights are not licensed under this
          Public License.

       3. To the extent possible, the Licensor waives any right to
          collect royalties from You for the exercise of the Licensed
          Rights, whether directly or through a collecting society
          under any voluntary or waivable statutory or compulsory
          licensing scheme. In all other cases the Licensor expressly
          reserves any right to collect such royalties.


Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the
following conditions.

  a. Attribution.

       1. If You Share the Licensed Material (including in modified
          form), You must:

            a. retain the following if it is supplied by the Licensor
               with the Licensed Material:

                 i. identification of the creator(s) of the Licensed
                    Material and any others designated to receive
                    attribution, in any reasonable manner requested by
                    the Licensor (including by pseudonym if
                    designated);

                ii. a copyright notice;

               iii. a notice that refers to this Public License;

                iv. a notice that refers to the disclaimer of
                    warranties;

                 v. a URI or hyperlink to the Licensed Material to the
                    extent reasonably practicable;

            b. indicate if You modified the Licensed Material and
               retain an indication of any previous modifications; and

            c. indicate the Licensed Material is licensed under this
               Public License, and include the text of, or the URI or
               hyperlink to, this Public License.

       2. You may satisfy the conditions in Section 3(a)(1) in any
          reasonable manner based on the medium, means, and context in
          which You Share the Licensed Material. For example, it may be
          reasonable to satisfy the conditions by providing a URI or
          hyperlink to a resource that includes the required
          information.

       3. If requested by the Licensor, You must remove any of the
          information required by Section 3(a)(1)(A) to the extent
          reasonably practicable.

       4. If You Share Adapted Material You produce, the Adapter's
          License You apply must not prevent recipients of the Adapted
          Material from complying with this Public License.


Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:

  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
     to extract, reuse, reproduce, and Share all or a substantial
     portion of the contents of the database;

  b. if You include all or a substantial portion of the database
     contents in a database in which You have Sui Generis Database
     Rights, then the database in which You have Sui Generis Database
     Rights (but not its individual contents) is Adapted Material; and

  c. You must comply with the conditions in Section 3(a) if You Share
     all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.


Section 5 -- Disclaimer of Warranties and Limitation of Liability.

  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

  c. The disclaimer of warranties and limitation of liability provided
     above shall be interpreted in a manner that, to the extent
     possible, most closely approximates an absolute disclaimer and
     waiver of all liability.


Section 6 -- Term and Termination.

  a. This Public License applies for the term of the Copyright and
     Similar Rights licensed here. However, if You fail to comply with
     this Public License, then Your rights under this Public License
     terminate automatically.

  b. Where Your right to use the Licensed Material has terminated under
     Section 6(a), it reinstates:

       1. automatically as of the date the violation is cured, provided
          it is cured within 30 days of Your discovery of the
          violation; or

       2. upon express reinstatement by the Licensor.

     For the avoidance of doubt, this Section 6(b) does not affect any
     right the Licensor may have to seek remedies for Your violations
     of this Public License.

  c. For the avoidance of doubt, the Licensor may also offer the
     Licensed Material under separate terms or conditions or stop
     distributing the Licensed Material at any time; however, doing so
     will not terminate this Public License.

  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
     License.


Section 7 -- Other Terms and Conditions.

  a. The Licensor shall not be bound by any additional or different
     terms or conditions communicated by You unless expressly agreed.

  b. Any arrangements, understandings, or agreements regarding the
     Licensed Material not stated herein are separate from and
     independent of the terms and conditions of this Public License.


Section 8 -- Interpretation.

  a. For the avoidance of doubt, this Public License does not, and
     shall not be interpreted to, reduce, limit, restrict, or impose
     conditions on any use of the Licensed Material that could lawfully
     be made without permission under this Public License.

  b. To the extent possible, if any provision of this Public License is
     deemed unenforceable, it shall be automatically reformed to the
     minimum extent necessary to make it enforceable. If the provision
     cannot be reformed, it shall be severed from this Public License
     without affecting the enforceability of the remaining terms and
     conditions.

  c. No term or condition of this Public License will be waived and no
     failure to comply consented to unless expressly agreed to by the
     Licensor.

  d. Nothing in this Public License constitutes or may be interpreted
     as a limitation upon, or waiver of, any privileges and immunities
     that apply to the Licensor or You, including from the legal
     processes of any jurisdiction or authority.


=======================================================================

Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the “Licensor.” The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.

Creative Commons may be contacted at creativecommons.org.


================================================
FILE: Makefile
================================================
# Makefile for the site: local preview, site build, and the helper scripts
# under bin/.

# Running plain `make` starts the local preview server.
default: dev

# d / dev: run Hugo's local development server.
d: dev
dev:
	hugo serve

# b / build: build the site with Hugo.
b: build
build:
	hugo build

# generate zh-tw version
translate:
	bin/zh-tw.py

# epub: run the bin/epub script.
epub:
	bin/epub

# No target here produces a file named after itself, so all of them are
# declared phony. Otherwise a same-named file or directory in the repository
# root would make the target look up to date and its recipe would be skipped.
.PHONY: default d dev b build translate epub


================================================
FILE: README.md
================================================
# 设计数据密集型应用（第二版） - 中文翻译版

[![Website: ddia](https://img.shields.io/badge/在线阅读-第二版-slategray?style=flat)](https://ddia.vonng.com)
[![Website: ddia](https://img.shields.io/badge/在线阅读-第一版-slategray?style=flat)](https://ddia.vonng.com/v1)
[![GitHub Stars](https://img.shields.io/github/stars/Vonng/ddia?style=flat&logo=github&logoColor=black&color=slategray)](https://star-history.com/#Vonng/ddia&Date)

**作者**： [Martin Kleppmann](https://martin.kleppmann.com)，[《Designing Data-Intensive Applications 2nd Edition》](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html)： 英国剑桥大学分布式系统研究员，演讲者，博主和开源贡献者，软件工程师和企业家，曾在 LinkedIn 和 Rapportive 负责数据基础架构。

**译者**：[冯若航](https://vonng.com) / [Vonng](https://github.com/Vonng) (rh@vonng.com) [Pigsty](https://pgsty.com) 创始人，[活跃](https://committers.top/china)[开源贡献者](https://gitstar-ranking.com/Vonng)，PostgreSQL Hacker。开源 RDS PG 发行版 [Pigsty](https://pigsty.cc/zh/) 与公众号《[老冯云数](https://mp.weixin.qq.com/s/p4Ys10ZdEDAuqNAiRmcnIQ)》作者，[数据库老司机](https://pigsty.cc/zh/blog/db)，[云计算泥石流](https://pigsty.cc/zh/blog/cloud)，曾于阿里，苹果，探探担任架构师与DBA。

**校订**： [@yingang](https://github.com/yingang) ｜ [**繁體中文**](content/tw/_index.md) by [@afunTW](https://github.com/afunTW) ｜ [完整贡献者列表](#贡献)

**阅读**：访问 [https://ddia.vonng.com](https://ddia.vonng.com) 阅读本书在线版本，或使用 [hugo](https://gohugo.io/documentation/) / [hextra](https://imfing.github.io/hextra/zh-cn/) 主题自行构建。

> [!NOTE] 
> [**DDIA 第二版**](https://ddia.vonng.com) 正在翻译中（翻译至第十章），欢迎阅览并提出您的宝贵意见！[点击此处阅览第一版](https://ddia.vonng.com/v1)。


---------

## 译序

> 不懂数据库的全栈工程师不是好架构师 
> 
> —— 冯若航 / Vonng

现今，尤其是在互联网领域，大多数应用都属于数据密集型应用。本书从底层数据结构到顶层架构设计，将数据系统设计中的精髓娓娓道来。其中的宝贵经验无论是对架构师、DBA、还是后端工程师、甚至产品经理都会有帮助。

这是一本理论结合实践的书，书中很多问题，译者在实际场景中都曾遇到过，读来让人击节扼腕。如果能早点读到这本书，该少走多少弯路啊！

这也是一本深入浅出的书，讲述概念的来龙去脉而不是卖弄定义，介绍事物发展演化历程而不是事实堆砌，将复杂的概念讲述得浅显易懂，但又直击本质不失深度。每章最后的引用质量非常好，是深入学习各个主题的绝佳索引。

本书为数据系统的设计、实现、与评价提供了很好的概念框架。读完并理解本书内容后，读者可以轻松看破大多数的技术忽悠，与技术砖家撕起来虎虎生风🤣。

这是 2017 年译者读过最好的一本技术类书籍，这么好的书没有中文翻译，实在是遗憾。某不才，愿为先进技术文化的传播贡献一份力量。既可以深入学习有趣的技术主题，又可以锻炼中英文语言文字功底，何乐而不为？


---------

## 前言

> 在我们的社会中，技术是一种强大的力量。数据、软件、通信可以用于坏的方面：不公平的阶级固化，损害公民权利，保护既得利益集团。但也可以用于好的方面：让底层人民发出自己的声音，让每个人都拥有机会，避免灾难。本书献给所有将技术用于善途的人们。

---------

> 计算是一种流行文化，流行文化鄙视历史。流行文化关乎个体身份和参与感，但与合作无关。流行文化活在当下，也与过去和未来无关。我认为大部分（为了钱）编写代码的人就是这样的，他们不知道自己的文化来自哪里。
>
>  —— 阿兰・凯接受 Dobb 博士的杂志采访时（2012 年）


---------

## 目录


* [序言](https://ddia.vonng.com/preface)
* [第一部分：数据系统基础](https://ddia.vonng.com/part-i)
  - [1. 数据系统架构中的权衡](https://ddia.vonng.com/ch1)
  - [2. 定义非功能性需求](https://ddia.vonng.com/ch2)
  - [3. 数据模型与查询语言](https://ddia.vonng.com/ch3)
  - [4. 存储与检索](https://ddia.vonng.com/ch4)
  - [5. 编码与演化](https://ddia.vonng.com/ch5)
* [第二部分：分布式数据](https://ddia.vonng.com/part-ii)
  - [6. 复制](https://ddia.vonng.com/ch6)
  - [7. 分片](https://ddia.vonng.com/ch7)
  - [8. 事务](https://ddia.vonng.com/ch8)
  - [9. 分布式系统的麻烦](https://ddia.vonng.com/ch9)
  - [10. 一致性与共识](https://ddia.vonng.com/ch10)
* [第三部分：派生数据](https://ddia.vonng.com/part-iii)
  - [11. 批处理](https://ddia.vonng.com/ch11)
  - [12. 流处理](https://ddia.vonng.com/ch12)
  - [13. 流处理系统哲学](https://ddia.vonng.com/ch13)
  - [14. 做正确的事](https://ddia.vonng.com/ch14)
* [术语表](https://ddia.vonng.com/glossary)
* [后记](https://ddia.vonng.com/colophon)

![](static/title.jpg)


---------

## 法律声明

从原作者处得知，已经有简体中文的翻译计划，将于 2018 年末完成。[购买地址](https://search.jd.com/Search?keyword=设计数据密集型应用)

译者纯粹出于 **学习目的** 与 **个人兴趣** 翻译本书，不追求任何经济利益。

译者保留对此版本译文的署名权，其他权利以原作者和出版社的主张为准。

本译文只供学习研究参考之用，不得公开发行或用于商业用途，有能力阅读英文书籍者请购买正版支持，本书英文原版在 [O'REILLY](https://learning.oreilly.com/api/v1/continue/9781098119058/) 平台上提供在线免费试预览。


---------

## 贡献

0. 全文校订 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章语法标点校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 与[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex) 
4. 第一部分前言，ch2 校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. 词汇表、后记关于野猪的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. 繁體中文版本与转换脚本 by [@afunTW](https://github.com/afunTW)
7. 多处翻译修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)
8. 感谢所有作出贡献，提出意见的朋友们：

<details>
<summary><a href="https://github.com/Vonng/ddia/pulls">Pull Requests</a> & <a href="https://github.com/Vonng/ddia/issues">Issues</a></summary>

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [386](https://github.com/Vonng/ddia/pull/386)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch2: 优化一处翻译                                                    |
| [384](https://github.com/Vonng/ddia/pull/384)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 优化中文文档的措辞和表达                                              |
| [383](https://github.com/Vonng/ddia/pull/383)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 修正 ch4 中的术语和表达错误                                          |
| [382](https://github.com/Vonng/ddia/pull/382)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch1: 优化一处翻译                                                    |
| [381](https://github.com/Vonng/ddia/pull/381)   | [@Max-Tortoise](https://github.com/Max-Tortoise)           | ch4: 修正一处术语不完整问题                                               |
| [377](https://github.com/Vonng/ddia/pull/377)   | [@huang06](https://github.com/huang06)                     | 优化翻译术语                                                        |
| [375](https://github.com/Vonng/ddia/issues/375) | [@z-soulx](https://github.com/z-soulx)                     | 对于是否100%全中文翻译的必要性讨论？个人-没必要100%，特别是“名词”，有原单词更加适合it人员                 |
| [371](https://github.com/Vonng/ddia/pull/371)   | [@lewiszlw](https://github.com/lewiszlw)                   | CPU core -> CPU 核心                                          |
| [369](https://github.com/Vonng/ddia/pull/369)   | [@bbwang-gl](https://github.com/bbwang-gl)                 | ch7: 可串行化快照隔离检测一个事务何时修改另一个事务的读取                                 |
| [368](https://github.com/Vonng/ddia/pull/368)   | [@yhao3](https://github.com/yhao3)                         | 更新 zh-tw.py 与 zh-tw 内容                                       |
| [367](https://github.com/Vonng/ddia/pull/367)   | [@yhao3](https://github.com/yhao3)                         | 修正拼写、格式和标点问题                                                  |
| [366](https://github.com/Vonng/ddia/pull/366)   | [@yangshangde](https://github.com/yangshangde)             | ch8: 将“电源失败”改为“电源失效”                                           |
| [365](https://github.com/Vonng/ddia/pull/365)   | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [364](https://github.com/Vonng/ddia/issues/364) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [363](https://github.com/Vonng/ddia/pull/363)   | [@xyohn](https://github.com/xyohn)                         | #362: 优化一处翻译                                                 |
| [362](https://github.com/Vonng/ddia/issues/362) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化一处翻译                                                   |
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一处拼写错误                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一处拼写错误                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一处标点错误                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一处格式错误                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一处参考链接                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正两处引用错误                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支持输出为 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一处格式错误                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一处图像链接                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 优化一处翻译                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 优化一处翻译                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 优化两处翻译                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 优化多处翻译                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 优化一处翻译                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一处繁体中文错误                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一处繁体中文错误                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一处翻译错误                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正几处拼写错误                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 优化一处翻译                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一处翻译错误                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一处翻译遗漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 优化一处翻译                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 优化一处翻译                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 优化一处翻译                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 优化一处翻译                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正两处错误                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一处列表错误                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一处错别字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一处公式问题                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多处内部链接错误                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正内部链接错误                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 显示的问题                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 发现了繁体中文版本中的错误翻译                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 链接                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正错别字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 统一了 write skew 的翻译                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 统一了 rebalancing 的翻译                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻译                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正译文中的重复单词                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不准确的翻译                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一处翻译错误                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一处拼写错误                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可串行化”相关内容的多处翻译不当                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重复读”相关内容的多处翻译不当                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“读已提交”相关内容的多处翻译不当                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁体中文的转译错误                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁体中文的转译错误                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通顺的翻译                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通顺的翻译                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正确的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通顺的翻译                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻译                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正错误的图片链接                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁体中文的转译错误：复杂                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正导航栏中的章节名称                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正线性一致的繁体中文翻译                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正错误的翻译                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 优化译文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通顺的翻译                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不准确的翻译                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻译                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正错别字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小标题跳转的问题                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的网址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正错别字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建议 docsify 的主题风格                                               |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻译错误                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁体中文的转译错误                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支持 GitHub Pages 里的公式显示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 语义网相关翻译更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不变式相关翻译更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正确的中文用词和标点符号                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻译                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重复的译文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通顺的翻译                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 发现错误的文献索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正错误的标点符号                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正错误字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建议将 network model 翻译为网状模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正错误字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正缩略图的错别字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修订 sibling 相关的翻译                                          |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一处不准确的翻译                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 识别了当前简繁转译过程中处理不当的地方，暂通过转换脚本规避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻译`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新残留的机翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建议去除段首的制表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 发现一处错误格式的章节引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章节 Summary 中多处不通顺的翻译                                    |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多处不通顺的或错误的翻译                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 优化多处不通顺的或错误的翻译                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 优化多处不通顺的或错误的翻译                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化多处错误的或不通顺的翻译                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化一处容易产生歧义的翻译                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正两处错误的翻译                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正两处强调文本和四处代码变量名称                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一处错误的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一处错误的翻译（功能 -> 函数）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 优化 how best 的翻译（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 统一每章的标题格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重复词语                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改语句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 读已写入数据                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死还是赖活着                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼杀 → 破坏                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改变" rather than "盖面"                   |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:钟-->种，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否则 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->调用，显着->显著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改链接错误                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4几处翻译                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 几个疏漏和格式错误                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 删除一个多余的右括号                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部链接错误                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假简单"->"更加简单"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修复 ch1 中的无序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 纠正多处的翻译小错误                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修复多个链接错误 2.名词优化修订 3.错误修订                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修复一些明显错误                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修复链接错误                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改词语顺序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正错别字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 纠正翻译错误                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目录和本章标题不符的情况                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修复语句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 详细修改了后记中和印度野猪相关的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻译                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01语法微调                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |

</details><br>


---------

## 许可证

[![License: CC-BY 4.0](https://img.shields.io/github/license/Vonng/ddia?logo=opensourceinitiative&logoColor=green&color=slategray)](https://github.com/Vonng/ddia/blob/master/LICENSE)

本项目采用 [CC-BY 4.0](https://github.com/Vonng/ddia/blob/master/LICENSE) 许可证，您可以在这里找到完整说明：

- [署名 4.0 协议国际版 CC BY 4.0 Deed](https://creativecommons.org/licenses/by/4.0/deed.zh-hans)
- [Attribution 4.0 International CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.en)


================================================
FILE: assets/css/custom.css
================================================
/* Adjust the width of the left navigation sidebar */
/* Widen it so that entry text does not wrap */

/* Left sidebar width: raised to 28rem to avoid line wrapping */
.hextra-sidebar {
  width: 28rem !important;
  min-width: 28rem !important;
}

/* Make the navigation content span the full sidebar width */
.hextra-sidebar nav {
  width: 100%;
}

/* Prevent navigation item text from wrapping */
.hextra-sidebar li {
  white-space: nowrap;
}

/* NOTE(review): 20rem here vs. 28rem on .hextra-sidebar above; confirm which element this selector targets */
.sidebar-container {
    width: 20rem !important;
    white-space: nowrap;
}

/* Keep navigation links on a single line; overflow is hidden and marked with an ellipsis */
.hextra-sidebar a {
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  display: block;
}

/* Adjust the width of the right-hand in-page table of contents ("On This Page") */
/* Hextra's default width is about 16rem (256px); 1.5x that gives 24rem (384px) */

/* Width of the right-hand table of contents */
.hextra-toc {
  width: 24rem !important;
}

/* Make the table-of-contents content span its full width */
.hextra-toc nav {
  width: 100%;
}

/* Let long words in table-of-contents items break so they fit the available width */
.hextra-toc li {
  word-wrap: break-word;
}

================================================
FILE: assets/css/example.css
================================================
/* Styles for the bordered `.md-example` box and its parts: a flex caption row
   (label, title, and an anchor pushed to the right edge) plus a trailing note. */
.md-example { margin: 1.25rem 0; padding: 1rem; border: 1px solid var(--border); border-radius: .75rem; }
.md-example__caption { display: flex; gap: .5rem; align-items: baseline; margin-bottom: .5rem; }
.md-example__label { font-weight: 600; opacity: .85; }
.md-example__title { font-weight: 600; }
/* margin-left: auto pushes the anchor to the far end of the flex caption row */
.md-example__anchor { margin-left: auto; text-decoration: none; opacity: .6; }
.md-example__anchor:hover { opacity: 1; }
.md-example__note { margin-top: .5rem; font-size: .95em; opacity: .85; }

================================================
FILE: bin/Pipfile
================================================
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
opencc = "*"
click = "*"

[dev-packages]

[requires]
python_version = "3.6"


================================================
FILE: bin/doc
================================================
#!/bin/bash

#==============================================================#
# File      :   doc
# Ctime     :   2021-08-10
# Mtime     :   2021-08-12
# Desc      :   Serve local doc with docsify, python3, python
# Path      :   bin/doc
# Deps      :   docsify or python3 or python2
# Copyright (C) 2018-2021 Ruohang Feng
#==============================================================#

# Resolve the script directory and its parent (the directory to serve) as
# absolute paths. Every expansion is quoted so paths with spaces stay intact.
PROG_DIR="$(cd "$(dirname "$0")" && pwd)"
DOCS_DIR="$(cd "$(dirname "${PROG_DIR}")" && pwd)"

# node.js (docsify) > python3 (http.server) > python2 (SimpleHTTPServer)

# `command -v` is used only as an existence test, so its output is discarded.
if command -v docsify >/dev/null 2>&1; then
    echo "serve with docsify (click url to view in browser)"
    cd "${DOCS_DIR}" && docsify serve
elif command -v python3 >/dev/null 2>&1; then
    echo "serve http://localhost:3001 (python3 http.server)"
    cd "${DOCS_DIR}" && python3 -m http.server 3001
elif command -v python2 >/dev/null 2>&1; then
    echo "serve http://localhost:3001 (python2 SimpleHTTPServer)"
    cd "${DOCS_DIR}" && python2 -m SimpleHTTPServer 3001
else
    # Report the failure on stderr and with a non-zero status so callers can detect it.
    echo "no available server" >&2
    exit 1
fi

================================================
FILE: bin/epub
================================================
#!/usr/bin/env bash

set -e

# Resolve the script directory to an absolute path first. With a relative "$0"
# such as ./epub, `dirname .` is still ".", which would make the repository
# root resolve to bin/ itself instead of its parent.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
INPUT_DIR=$(dirname "$SCRIPT_DIR")
OUTPUT_DIR="$INPUT_DIR/output"
TEMP_DIR="$OUTPUT_DIR/temp"

# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
mkdir -p "$TEMP_DIR"

# Clean up temporary files on every exit path, including an abort under `set -e`
trap 'rm -rf "$TEMP_DIR"' EXIT

# Preprocess Markdown files to convert Hugo shortcodes
echo "Preprocessing Markdown files..."
python3 "${SCRIPT_DIR}/preprocess-epub.py" "${INPUT_DIR}/content/zh" "$TEMP_DIR"

convert_to_epub() {
	# Combine the preprocessed Markdown files into a single EPUB book
	OUTPUT_BOOK="$OUTPUT_DIR/ddia.epub"
	rm -f "$OUTPUT_BOOK"
	echo "Converting Markdown files into $OUTPUT_BOOK..."

	local meta_file="${INPUT_DIR}/metadata.yaml"
	local css_file="${INPUT_DIR}/js/epub.css"

	# The preprocessor rewrites image targets to paths relative to the
	# repository root (static/...), so run pandoc from there regardless of
	# where the script was invoked. All other paths used here are absolute.
	cd "$INPUT_DIR"

	pandoc -o "$OUTPUT_BOOK" --metadata-file="$meta_file" \
		--toc-depth=2 \
		--top-level-division=chapter \
		--file-scope=true \
		--css="$css_file" \
		--webtex \
		--wrap=preserve \
		"${TEMP_DIR}"/_index.md \
		"${TEMP_DIR}"/preface.md \
		"${TEMP_DIR}"/part-i.md \
		"${TEMP_DIR}"/ch1.md \
		"${TEMP_DIR}"/ch2.md \
		"${TEMP_DIR}"/ch3.md \
		"${TEMP_DIR}"/ch4.md \
		"${TEMP_DIR}"/part-ii.md \
		"${TEMP_DIR}"/ch5.md \
		"${TEMP_DIR}"/ch6.md \
		"${TEMP_DIR}"/ch7.md \
		"${TEMP_DIR}"/ch8.md \
		"${TEMP_DIR}"/ch9.md \
		"${TEMP_DIR}"/part-iii.md \
		"${TEMP_DIR}"/ch10.md \
		"${TEMP_DIR}"/ch11.md \
		"${TEMP_DIR}"/ch12.md \
		"${TEMP_DIR}"/ch13.md \
		"${TEMP_DIR}"/ch14.md \
		"${TEMP_DIR}"/colophon.md \
		"${TEMP_DIR}"/glossary.md

	echo "Converted EPUB book created at $OUTPUT_BOOK."
}

convert_to_epub

# Temporary files are removed by the EXIT trap installed above.


================================================
FILE: bin/preprocess-epub.py
================================================
#!/usr/bin/env python3
"""
预处理 Markdown 文件，将 Hugo shortcode 转换为 Pandoc 可识别的格式

处理两种 shortcode：
1. {{< figure src="/fig/xxx.png" caption="xxx" >}} → ![xxx](static/fig/xxx.png)
2. {{< figure ... >}} (无 src) → 移除（通常用于代码示例）
"""

import os
import re
import sys
from pathlib import Path

# Matches a Hugo figure shortcode `{{< figure ... >}}`; group 1 is the raw
# attribute text (DOTALL lets the shortcode span several lines).
FIGURE_SHORTCODE_RE = re.compile(r"\{\{<\s*figure\b(.*?)>\}\}", re.DOTALL)
# Matches one `name="value"` attribute; groups are the name and the unquoted value.
ATTR_RE = re.compile(r'([\w-]+)="([^"]*)"')
# Matches a Markdown image whose target starts with `/` but not `/static/`;
# groups are the alt text and the path after the leading slash.
ABS_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(/(?!static/)([^)]+)\)')


def _escape_alt_text(text):
    """Escape `]` in alt text to avoid breaking Markdown image syntax."""
    return text.replace("]", r"\]")


def convert_markdown(text):
    """
    Rewrite Hugo figure shortcodes and absolute-path image references.

    Args:
        text: Markdown source text.

    Returns:
        The text with each figure shortcode replaced by a Markdown image (or
        removed when it has no `src`), and with absolute image targets such as
        `/map/ch01.png` rewritten to `static/map/ch01.png`.
    """
    def _figure_to_image(shortcode):
        attributes = {name: value for name, value in ATTR_RE.findall(shortcode.group(1))}
        source = attributes.get("src")

        if source:
            # Absolute resource paths become paths under `static`.
            target = 'static' + source if source.startswith('/') else source
            # Prefer the caption, fall back to the title, else use empty alt text.
            label = attributes.get("caption") or attributes.get("title") or ""
            return '![{}]({})'.format(_escape_alt_text(label), target)

        # A figure without `src` is usually a code-example placeholder: drop it.
        return ""

    without_shortcodes = FIGURE_SHORTCODE_RE.sub(_figure_to_image, text)

    # Plain Markdown images with an absolute target get the same `static/` prefix.
    return ABS_IMAGE_RE.sub(r'![\1](static/\2)', without_shortcodes)

def process_file(input_path, output_path):
    """
    Convert a single Markdown file and write the result.

    Args:
        input_path: Path of the Markdown file to read (UTF-8).
        output_path: Path of the file to write (UTF-8). May equal `input_path`
            for an in-place conversion; missing parent directories are created.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Convert the content
    converted_content = convert_markdown(content)

    # Write the output file. A bare file name has an empty dirname, and
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(converted_content)

    print(f"Processed: {input_path} -> {output_path}")

def main():
    """
    Command-line entry point.

    With a file as the first argument, converts it to the optional second
    argument (in place when omitted). With a directory as the first argument,
    converts every `*.md` file directly inside it into the directory given as
    the second argument. Exits with status 1 on bad usage or an invalid path.
    """
    if len(sys.argv) < 2:
        print("Usage: preprocess-epub.py <input_file> [output_file]")
        print("   or: preprocess-epub.py <input_dir> <output_dir>")
        sys.exit(1)

    input_path = sys.argv[1]

    if os.path.isfile(input_path):
        # Single file: default to converting in place
        output_path = sys.argv[2] if len(sys.argv) > 2 else input_path
        process_file(input_path, output_path)
    elif os.path.isdir(input_path):
        # Directory mode needs an explicit output directory; report that
        # instead of failing with an IndexError on sys.argv[2].
        if len(sys.argv) < 3:
            print("Error: an output directory is required when the input is a directory")
            sys.exit(1)

        output_dir = sys.argv[2]
        input_dir = Path(input_path)

        # Collect all .md files (non-recursive), in sorted order
        md_files = sorted(input_dir.glob('*.md'))

        for md_file in md_files:
            output_file = os.path.join(output_dir, md_file.name)
            process_file(str(md_file), output_file)

        print(f"\nTotal processed: {len(md_files)} files")
    else:
        print(f"Error: {input_path} is not a valid file or directory")
        sys.exit(1)

# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: bin/toc.py
================================================
#!/usr/bin/env python3
"""
TOC Generator for DDIA book
Usage: python toc.py <lang> <depth> [output_file]
Example: python toc.py zh 2
         python toc.py en 3 en-toc.md
"""

import os
import sys
import re
from pathlib import Path


def extract_front_matter_title(content):
    """Return the `title:` value from the first `---`-delimited block, or None.

    The block opens at the first line that is exactly `---` (ignoring
    surrounding whitespace) and closes at the next such line. One pair of
    matching single or double quotes around the value is removed.
    """
    remaining = iter(content.split('\n'))

    # Advance to the opening delimiter; without one there is no title.
    for row in remaining:
        if row.strip() == '---':
            break
    else:
        return None

    # Scan the block until the closing delimiter.
    for row in remaining:
        if row.strip() == '---':
            return None
        if row.startswith('title:'):
            value = row.split(':', 1)[1].strip()
            for quote in ('"', "'"):
                if value.startswith(quote) and value.endswith(quote):
                    return value[1:-1]
            return value

    return None


def extract_headings(content, max_depth):
    """Extract headings up to specified depth from markdown content

    max_depth=1 -> extract H2 only
    max_depth=2 -> extract H2-H3
    max_depth=3 -> extract H2-H4
    max_depth=4 -> extract H2-H5

    Lines inside fenced code blocks (``` or ~~~) are ignored, so a `## ...`
    comment in a code sample is not mistaken for a heading.

    Returns a list of dicts with keys 'level' (2 for H2, 3 for H3, ...),
    'text' (heading text without the trailing `{#id}`) and 'id' (the explicit
    `{#id}` value, or None when the heading has none).
    """
    headings = []
    lines = content.split('\n')

    # Skip front matter
    skip_until = 0
    if lines[0].strip() == '---':
        for i, line in enumerate(lines[1:], 1):
            if line.strip() == '---':
                skip_until = i + 1
                break

    # H1 is never extracted, so depth N maps to heading levels 2..N+1.
    max_level = max_depth + 1

    open_fence = None  # the marker ("```" or "~~~") of the fence we are inside, if any
    for line in lines[skip_until:]:
        stripped = line.lstrip()
        if open_fence is not None:
            # Inside a code fence: only look for the matching closing marker.
            if stripped.startswith(open_fence):
                open_fence = None
            continue
        if stripped.startswith('```') or stripped.startswith('~~~'):
            open_fence = stripped[:3]
            continue

        # Match markdown headings with optional ID
        # Format: ## Heading Text {#heading-id}
        match = re.match(r'^(#{2,5})\s+(.*?)(?:\s*\{#([\w-]+)\})?$', line)
        if match:
            level = len(match.group(1))
            if level <= max_level:
                headings.append({
                    'level': level,  # Keep original level: 2 for H2, 3 for H3, etc.
                    'text': match.group(2).strip(),
                    'id': match.group(3)
                })

    return headings


def generate_toc_entry(file_name, title, lang, depth, content_dir):
    """Build the TOC entries for one content file.

    The first entry (level 1) links the page itself: `/<name>` for lang 'zh',
    `/<lang>/<name>` otherwise. When `depth` is 2 or more, and the file is not
    glossary.md, the file's headings are appended as anchor links, keeping
    their own heading level (2 for H2, 3 for H3, ...).

    Returns a list of dicts with keys 'level', 'text' (Markdown link) and
    'raw_text' (plain title or heading text).
    """
    slug = file_name.replace('.md', '')
    url = f"/{slug}" if lang == 'zh' else f"/{lang}/{slug}"

    entries = [{
        'level': 1,
        'text': f"[{title}]({url})",
        'raw_text': title
    }]

    # User depth 1 means "titles only"; the glossary never lists sub-headings.
    heading_depth = 0 if file_name == 'glossary.md' else depth - 1
    if heading_depth < 1:
        return entries

    source = content_dir / file_name
    if not source.exists():
        return entries

    with open(source, 'r', encoding='utf-8') as handle:
        body = handle.read()

    for heading in extract_headings(body, heading_depth):
        fragment = heading['id']
        if not fragment:
            # No explicit {#id}: derive a simplified anchor from the heading text.
            fragment = re.sub(r'[^\w\s-]', '', heading['text'].lower())
            fragment = re.sub(r'\s+', '-', fragment)

        entries.append({
            'level': heading['level'],
            'text': f"[{heading['text']}]({url}#{fragment})",
            'raw_text': heading['text']
        })

    return entries


def format_toc_entries(entries):
    """Render TOC entries as Markdown lines joined by newlines.

    Level 0 produces a blank separator line, level 1 a `## ` heading, and
    levels 2-5 bullet items indented four more spaces per level. Entries with
    any other level are skipped.
    """
    # Line prefix for each renderable level.
    prefixes = {
        1: '## ',
        2: '- ',
        3: '    - ',
        4: '        - ',
        5: '            - ',
    }

    rendered = []
    for item in entries:
        depth = item['level']
        label = item['text']

        if depth == 0:
            rendered.append('')
        elif depth in prefixes:
            rendered.append(prefixes[depth] + label)

    return '\n'.join(rendered)


def check_file_status(file_path, lang):
    """Return a status marker for a missing or near-empty file, else "".

    The marker depends on `lang` ('zh', 'tw', or anything else). A file counts
    as unpublished when it does not exist, or when its content, with a leading
    `---` front-matter block removed, is shorter than 500 characters once
    stripped.
    """
    marker = {'zh': " (未发布)", 'tw': " (未發布)"}.get(lang, " (WIP)")

    if not file_path.exists():
        return marker

    with open(file_path, 'r', encoding='utf-8') as handle:
        body = handle.read()

    # Drop the front matter before measuring; if the block is never closed,
    # the whole text is measured.
    rows = body.split('\n')
    if rows[0].strip() == '---':
        for idx in range(1, len(rows)):
            if rows[idx].strip() == '---':
                body = '\n'.join(rows[idx + 1:])
                break

    return marker if len(body.strip()) < 500 else ""


def main():
    """Generate a Markdown table of contents for one language edition.

    Command line: `toc.py <lang> <depth> [output_file]`, where lang is one of
    zh, en, tw and depth is 1-4. Reads the chapter files from
    `content/<lang>` next to this script's parent directory and writes the
    TOC to `output_file` (default `<lang>.md`). Exits with status 1 on bad
    arguments or a missing content directory.
    """
    # Parse arguments
    if len(sys.argv) < 3:
        print("Usage: python toc.py <lang> <depth> [output_file]")
        print("Example: python toc.py zh 2")
        sys.exit(1)

    lang = sys.argv[1]
    if lang not in ['zh', 'en', 'tw']:
        print("Error: Language must be one of: zh, en, tw")
        sys.exit(1)

    try:
        depth = int(sys.argv[2])
        if depth not in [1, 2, 3, 4]:
            raise ValueError
    except ValueError:
        print("Error: Depth must be 1, 2, 3, or 4")
        sys.exit(1)

    # Determine output file
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    else:
        output_file = f"{lang}.md"

    # Get content directory
    script_dir = Path(__file__).parent
    project_root = script_dir.parent
    content_dir = project_root / 'content' / lang

    if not content_dir.exists():
        print(f"Error: Content directory {content_dir} does not exist")
        sys.exit(1)

    # Define file order. ch14.md is included because the second edition has
    # 14 chapters; files that do not exist are skipped below.
    file_order = [
        'preface.md',
        'ch1.md', 'ch2.md', 'ch3.md', 'ch4.md', 'ch5.md', 'ch6.md',
        'ch7.md', 'ch8.md', 'ch9.md', 'ch10.md', 'ch11.md', 'ch12.md', 'ch13.md',
        'ch14.md',
        'glossary.md',
        'colophon.md'
    ]

    # Generate TOC
    all_entries = []

    for file_name in file_order:
        file_path = content_dir / file_name
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            title = extract_front_matter_title(content)
            if title:
                entries = generate_toc_entry(file_name, title, lang, depth, content_dir)

                # Add status marker to main entry if needed
                status = check_file_status(file_path, lang)
                if status and entries:
                    # The main entry is "[title](url)", so append the marker
                    # after it. Replacing every ')' would also insert the
                    # marker inside a title that contains parentheses.
                    entries[0]['text'] += status

                all_entries.extend(entries)
                if entries:  # Add blank line between chapters
                    all_entries.append({'level': 0, 'text': ''})

    # Format and write output
    formatted_toc = format_toc_entries(all_entries)

    # Clean up extra blank lines
    formatted_toc = re.sub(r'\n{3,}', '\n\n', formatted_toc)

    # Write to file
    output_path = Path(output_file)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(formatted_toc)

    print(f"TOC generated successfully: {output_path}")
    print(f"Language: {lang}, Depth: {depth}")


# Generate the TOC only when executed as a script.
if __name__ == "__main__":
    main()

================================================
FILE: bin/translate.py
================================================
"""Convert zh-cn to zh-tw
Refer to https://github.com/BYVoid/OpenCC
"""
import click
import opencc

from pathlib import Path
from pprint import pprint


# Root click command group; it has no behavior of its own. The subcommands
# in this file register themselves on it with @cli.command().
@click.group()
def cli():
    pass


def convert(infile: str, outfile: str, cfg: str):
    """Convert one text file with OpenCC and write the result.

    Both files are handled as UTF-8. The file is converted line by line and
    each line keeps its original line ending, so the output has the same line
    structure as the input.

    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): OpenCC config name passed to `opencc.OpenCC`
    """
    converter = opencc.OpenCC(cfg)
    with open(infile, "r", encoding="utf-8") as inf, open(outfile, "w", encoding="utf-8") as outf:
        for line in inf:
            # Lines read from a file still end in "\n"; joining them with "\n"
            # doubled every line break. Convert only the text and re-attach
            # whatever line ending the line had.
            body = line.rstrip("\n")
            outf.write(converter.convert(body) + line[len(body):])
    print(f"Convert to {outfile}")


# `file` subcommand: convert a single file. A thin click wrapper that forwards
# its three options unchanged to convert(). The docstring is left as is
# because click shows it as the command's help text.
@cli.command()
@click.option("-i", "--input", "infile", required=True)
@click.option("-o", "--output", "outfile", required=True)
@click.option("-c", "--config", "cfg", required=True, default="s2twp.json")
def file(infile: str, outfile: str, cfg: str):
    """read >> convert >> write file
    Args:
        infile (str): input file
        outfile (str): output file
        cfg (str): config
    """
    convert(infile, outfile, cfg)


@cli.command()
@click.option("-i", "--input", "infolder", required=True)
@click.option("-o", "--output", "outfolder", required=True)
@click.option("-c", "--config", "cfg", required=True, default="s2twp.json")
def repo(infolder, outfolder, cfg):
    # `repo` subcommand: convert every *.md file directly inside `infolder`
    # into a same-named file in `outfolder`, creating `outfolder` if needed.
    target_dir = Path(outfolder)
    if not target_dir.exists():
        target_dir.mkdir(parents=True)
        print(f"Create {outfolder}")

    # Build the full (source, destination) list before converting anything.
    resolved_target = target_dir.resolve()
    jobs = [
        (str(source), str(resolved_target / source.name))
        for source in Path(infolder).resolve().glob("*.md")
    ]
    for source, destination in jobs:
        convert(source, destination, cfg)


# Dispatch to the click command group only when executed as a script.
if __name__ == "__main__":
    cli()


================================================
FILE: bin/zh-tw.py
================================================
#!/usr/bin/env python3
import os, sys, opencc
import re

def process_urls(text, src_folder, dst_folder):
    """Rewrite site-internal page links in Markdown for the converted edition.

    For every Markdown link whose target ends in one of the known page paths
    (optionally followed by `#anchor`):

    * no folder prefix, e.g. `(/ch1)`: rewritten to `(/tw/ch1)`;
    * prefix equal to `/<src_folder>`: prefix replaced by `/<dst_folder>`;
    * any other prefix: left unchanged, and a notice is printed.

    Args:
        text: Markdown text to process.
        src_folder: Content folder name of the source edition (e.g. 'v1').
        dst_folder: Content folder name of the target edition (e.g. 'v1_tw').

    Returns:
        The text with matching links rewritten.
    """
    # Page paths to handle (without the .md suffix). /ch14 is listed because
    # the second edition has 14 chapters; without it, links to chapter 14
    # never received the /tw prefix.
    page_paths = [
        '/ch1', '/ch2', '/ch3', '/ch4', '/ch5', '/ch6',
        '/ch7', '/ch8', '/ch9', '/ch10', '/ch11', '/ch12', '/ch13', '/ch14',
        '/part-i', '/part-ii', '/part-iii',
        '/preface', '/glossary', '/colophon'
    ]

    def replace_func(match):
        text_part = match.group(1)
        folder_part = match.group(2) or ''
        page_part = match.group(3)
        anchor_part = match.group(4) or ''
        if not folder_part:
            # Default Chinese edition: links carry no /zh prefix, so just prepend /tw.
            return f'[{text_part}](/tw{page_part}{anchor_part})'
        elif folder_part[1:] == src_folder:
            # Other Chinese editions carry a prefix such as /v1; swap it for the target folder.
            return f'[{text_part}](/{dst_folder}{page_part}{anchor_part})'
        else:
            unchanged = f'[{text_part}]({folder_part}{page_part}{anchor_part})'
            print(f'unknown folder part in: {unchanged}, keep it unchanged')
            return unchanged

    # Each path gets its own pattern. The path must be followed directly by an
    # optional anchor and the closing parenthesis, so /ch1 never matches /ch10.
    for page_path in page_paths:
        # Matches [text](page_path) or [text](page_path#anchor), with an optional folder prefix
        pattern = rf'\[([^\]]*)\]\(([^)]*)({re.escape(page_path)})(#[^)]*)?\)'
        text = re.sub(pattern, replace_func, text)

    return text

def convert_file(src_filepath, dst_filepath, src_folder, dst_folder, cfg='s2twp.json'):
    """Convert one Markdown file with OpenCC and write the result.

    Each line is stripped of trailing whitespace, converted with the OpenCC
    config `cfg`, corrected with a fixed list of phrase fix-ups, and has its
    page links rewritten by process_urls(). The lines are written joined by
    newlines, with no trailing newline. Both files are UTF-8.
    """
    print("convert %s to %s" % (src_filepath, dst_filepath))

    # Fix-ups applied after OpenCC, in this exact order: later entries may
    # depend on the output of earlier ones.
    fixups = (
        ('一箇', '一個'),
        ('髮送', '傳送'),
        ('髮布', '釋出'),
        ('髮生', '發生'),
        ('髮出', '發出'),
        ('嚐試', '嘗試'),
        ('線上性一致', '在線性一致'),    # apparently segmented as "在线" (online) first?
        ('復雜', '複雜'),
        ('討論瞭', '討論了'),
        ('瞭解釋', '了解釋'),
        ('瞭如', '了如'),                # e.g. 引入了如, 實現了如, 了如何, 了如果, 了如此
        ('了如指掌', '瞭如指掌'),        # exception to the previous entry
        ('明瞭', '明了'),                # e.g. 闡明了, 聲明了, 指明了
        ('倒黴', '倒楣'),
        ('區域性性', '區域性'),
        ('下麵條件', '下面條件'),        # apparently segmented as "面条" (noodles) first?
        ('當日志', '當日誌'),            # apparently segmented as "当日" (that day) first?
        ('真即時間', '真實時間'),        # apparently segmented as "实时" (real-time) first?
        ('面向物件', '物件導向'),
        ('非規範化', '反正規化'),
        ('規範化', '正規化'),
    )

    converter = opencc.OpenCC(cfg)

    def translate_line(raw_line):
        converted = converter.convert(raw_line.rstrip())
        for wrong, right in fixups:
            converted = converted.replace(wrong, right)
        return process_urls(converted, src_folder, dst_folder)

    with open(src_filepath, "r", encoding='utf-8') as src, open(dst_filepath, "w+", encoding='utf-8') as dst:
        dst.write("\n".join(translate_line(line) for line in src))

def convert(zh_folder, tw_folder):
    """Convert every `.md` file in content/<zh_folder> into content/<tw_folder>.

    The content root is the parent of the directory that holds the running
    script (taken from sys.argv[0]). Only files directly inside the source
    folder are processed, and output files keep their names.
    """
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    home = os.path.abspath(os.path.join(script_dir, '..'))
    source_dir = os.path.join(home, 'content', zh_folder)
    target_dir = os.path.join(home, 'content', tw_folder)

    markdown_names = (name for name in os.listdir(source_dir) if name.endswith('.md'))
    for name in markdown_names:
        convert_file(
            os.path.join(source_dir, name),
            os.path.join(target_dir, name),
            zh_folder,
            tw_folder,
        )

if __name__ == '__main__':
    # Print the raw command-line arguments.
    print(sys.argv)
    # Convert content/zh into content/tw, then content/v1 into content/v1_tw.
    convert('zh', 'tw')
    convert('v1', 'v1_tw')


================================================
FILE: content/en/_index.md
================================================
---
title: "Designing Data-Intensive Applications 2nd Edition"
linkTitle: DDIA
cascade:
  type: docs
breadcrumbs: false
---
 

—— **The Big Ideas Behind Reliable, Scalable, and Maintainable Systems**

[Martin Kleppmann](https://martin.kleppmann.com)

> The en-us version includes only the **intro**, **summary**, and **references** of each chapter, to protect the intellectual property of the author and publisher.

![](/title.jpg)

--------

*Technology is a powerful force in our society. Data, software, and communication can be used for bad: to entrench unfair power structures, to undermine human rights, and to protect vested interests. But they can also be used for good: to make underrepresented people’s voices heard, to create opportunities for everyone, and to avert disasters. This book is dedicated to everyone working toward the good.*

---------

*Computing is pop culture. [...] Pop culture holds a disdain for history. Pop culture is all about identity and feeling like you’re participating. It has nothing to do with cooperation, the past or the future—it’s living in the present. I think the same is true of most people who write code for money. They have no idea where [their culture came from].*

 — [Alan Kay](http://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442), in interview with *Dr Dobb’s Journal* (2012)

---------


## Table of Contents

### [Preface](/en/preface)

### [Part I: Foundations of Data Systems](/en/part-i)
  - [1. Trade-offs in Data Systems Architecture](/en/ch1)
  - [2. Defining Nonfunctional Requirements](/en/ch2)
  - [3. Data Models and Query Languages](/en/ch3)
  - [4. Storage and Retrieval](/en/ch4)
  - [5. Encoding and Evolution](/en/ch5)

### [Part II: Distributed Data](/en/part-ii)
  - [6. Replication](/en/ch6)
  - [7. Partitioning](/en/ch7)
  - [8. Transactions](/en/ch8)
  - [9. The Trouble with Distributed Systems](/en/ch9)
  - [10. Consistency and Consensus](/en/ch10)

### [Part III: Derived Data](/en/part-iii)
  - [11. Batch Processing](/en/ch11)
  - [12. Stream Processing](/en/ch12)
  - [13. A Philosophy of Streaming Systems](/en/ch13)
  - [14. Doing the Right Thing](/en/ch14)

### [Glossary](/en/glossary)

### [Colophon](/en/colophon)


================================================
FILE: content/en/ch1.md
================================================
---
title: "1. Trade-offs in Data Systems Architecture"
weight: 101
breadcrumbs: false
---

<a id="ch_tradeoffs"></a>

> *There are no solutions, there are only trade-offs. […] But you try to get the best
> trade-off you can get, and that’s all you can hope for.*
>
> [Thomas Sowell](https://www.youtube.com/watch?v=2YUtKr8-_Fg), Interview with Fred Barnes (2005)

> [!TIP] A NOTE FOR EARLY RELEASE READERS
> With Early Release ebooks, you get books in their earliest form—the author’s raw and unedited content as they write—so you can take advantage of these technologies long before the official release of these titles.
>
> This will be the 1st chapter of the final book. The GitHub repo for this book is <https://github.com/ept/ddia2-feedback>.
> If you’d like to be actively involved in reviewing and commenting on this draft, please reach out on GitHub.

Data is central to much application development today. With web and mobile apps, software as a
service (SaaS), and cloud services, it has become normal to store data from many different users in
a shared server-based data infrastructure. Data from user activity, business transactions, devices
and sensors needs to be stored and made available for analysis. As users interact with an
application, they both read the data that is stored, and also generate more data.

Small amounts of data, which can be stored and processed on a single machine, are often fairly easy
to deal with. However, as the data volume or the rate of queries grows, it needs to be distributed
across multiple machines, which introduces many challenges. As the needs of the application become
more complex, it is no longer sufficient to store everything in one system, but it might be
necessary to combine multiple storage or processing systems that provide different capabilities.

We call an application *data-intensive* if data management is one of the primary challenges in
developing the application [^1].
While in *compute-intensive* systems the challenge is parallelizing some very large computation, in
data-intensive applications we usually worry more about things like storing and processing large
data volumes, managing changes to data, ensuring consistency in the face of failures and
concurrency, and making sure services are highly available.

Such applications are typically built from standard building blocks that provide commonly needed
functionality. For example, many applications need to:

* Store data so that they, or another application, can find it again later (*databases*)
* Remember the result of an expensive operation, to speed up reads (*caches*)
* Allow users to search data by keyword or filter it in various ways (*search indexes*)
* Handle events and data changes as soon as they occur (*stream processing*)
* Periodically crunch a large amount of accumulated data (*batch processing*)

In building an application we typically take several software systems or services, such as databases
or APIs, and glue them together with some application code. If you are doing exactly what the data
systems were designed for, then this process can be quite easy.

However, as your application becomes more ambitious, challenges arise. There are many database
systems with different characteristics, suitable for different purposes—how do you choose which one
to use? There are various approaches to caching, several ways of building search indexes, and so
on—how do you reason about their trade-offs? You need to figure out which tools and which approaches
are the most appropriate for the task at hand, and it can be difficult to combine tools when you
need to do something that a single tool cannot do alone.

This book is a guide to help you make decisions about which technologies to use and how to combine
them. As you will see, there is no one approach that is fundamentally better than others; everything
has pros and cons. With this book, you will learn to ask the right questions to evaluate and compare
data systems, so that you can figure out which approach will best serve the needs of your particular
application.

We will start our journey by looking at some of the ways that data is typically used in
organizations today. Many of the ideas here have their origin in *enterprise software* (i.e., the
software needs and engineering practices of large organizations, such as big corporations and
governments), since historically, only large organizations had the large data volumes that required
sophisticated technical solutions. If your data volume is small enough, you can simply keep it in a
spreadsheet! However, more recently it has also become common for smaller companies and startups to
manage large data volumes and build data-intensive systems.

One of the key challenges with data systems is that different people need to do very different
things with data. If you are working at a company, you and your team will have one set of
priorities, while another team may have entirely different goals, even though you might be working
with the same dataset! Moreover, those goals might not be explicitly articulated, which can lead to
misunderstandings and disagreement about the right approach.

To help you understand what choices you can make, this chapter compares several contrasting
concepts, and explores their trade-offs:

* the difference between operational and analytical systems ([“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics));
* pros and cons of cloud services and self-hosted systems ([“Cloud versus Self-Hosting”](/en/ch1#sec_introduction_cloud));
* when to move from single-node systems to distributed systems ([“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed)); and
* balancing the needs of the business and the rights of the user ([“Data Systems, Law, and Society”](/en/ch1#sec_introduction_compliance)).

Moreover, this chapter will provide you with terminology that we will need for the rest of the book.

> [!TIP] TERMINOLOGY: FRONTENDS AND BACKENDS
>
> Much of what we will discuss in this book relates to *backend development*. To explain that term:
> for web applications, the client-side code (which runs in a web browser) is called the *frontend*,
> and the server-side code that handles user requests is known as the *backend*. Mobile apps are
> similar to frontends in that they provide user interfaces, which often communicate over the Internet
> with a server-side backend. Frontends sometimes manage data locally on the user’s device [^2],
> but the greatest data infrastructure challenges often lie in the backend: a frontend only needs to
> handle one user’s data, whereas the backend manages data on behalf of *all* of the users.
>
> A backend service is often reachable via HTTP (sometimes WebSocket); it usually consists of some
> application code that reads and writes data in one or more databases, and sometimes interfaces with
> additional data systems such as caches or message queues (which we might collectively call *data
> infrastructure*). The application code is often *stateless* (i.e., when it finishes handling one
> HTTP request, it forgets everything about that request), and any information that needs to persist
> from one request to another needs to be stored either on the client, or in the server-side data
> infrastructure.


## Analytical versus Operational Systems {#sec_introduction_analytics}

If you are working on data systems in an enterprise, you are likely to encounter several different
types of people who work with data. The first type are *backend engineers* who build services that
handle requests for reading and updating data; these services often serve external users, either
directly or indirectly via other services (see [“Microservices and Serverless”](/en/ch1#sec_introduction_microservices)). Sometimes
services are for internal use by other parts of the organization.

In addition to the teams managing backend services, two other groups of people typically require
access to an organization’s data: *business analysts*, who generate reports about the activities of
the organization in order to help the management make better decisions (*business intelligence* or
*BI*), and *data scientists*, who look for novel insights in data or who create user-facing product
features that are enabled by data analysis and machine learning/AI (for example, “people who bought
X also bought Y” recommendations on an e-commerce website, predictive analytics such as risk scoring
or spam filtering, and ranking of search results).

Although business analysts and data scientists tend to use different tools and operate in different
ways, they have some things in common: both perform *analytics*, which means they look at the data
that the users and backend services have generated, but they generally do not modify this data
(except perhaps for fixing mistakes). They might create derived datasets in which the original data
has been processed in some way. This has led to a split between two types of systems—a distinction
that we will use throughout this book:

* *Operational systems* consist of the backend services and data infrastructure where data is
  created, for example by serving external users. Here, the application code both reads and modifies
  the data in its databases, based on the actions performed by the users.
* *Analytical systems* serve the needs of business analysts and data scientists. They contain a
  read-only copy of the data from the operational systems, and they are optimized for the types of
  data processing that are needed for analytics.

As we shall see in the next section, operational and analytical systems are often kept separate, for
good reasons. As these systems have matured, two new specialized roles have emerged: *data
engineers* and *analytics engineers*. Data engineers are the people who know how to integrate the
operational and the analytical systems, and who take responsibility for the organization’s data
infrastructure more widely [^3].
Analytics engineers model and transform data to make it more useful for the business analysts and
data scientists in an organization [^4].

Many engineers specialize in either the operational or the analytical side. However, this book
covers both operational and analytical data systems, since both play an important role in the
lifecycle of data within an organization. We will explore in depth the data infrastructure that is
used to deliver services both to internal and external users, so that you can work better with your
colleagues on the other side of this divide.

### Characterizing Transaction Processing and Analytics {#sec_introduction_oltp}

In the early days of business data processing, a write to the database typically corresponded to a
*commercial transaction* taking place: making a sale, placing an order with a supplier, paying an
employee’s salary, etc. As databases expanded into areas that didn’t involve money changing hands,
the term *transaction* nevertheless stuck, referring to a group of reads and writes that form a
logical unit.

> [!NOTE]
> [Chapter 8](/en/ch8#ch_transactions) explores in detail what we mean by a transaction. This chapter uses the term
> loosely to refer to low-latency reads and writes.

Even though databases started being used for many different kinds of data—posts on social media,
moves in a game, contacts in an address book, and many others—the basic access pattern
remained similar to processing business transactions. An operational system typically looks up a
small number of records by some key (this is called a *point query*). Records are inserted, updated,
or deleted based on the user’s input. Because these applications are interactive, this access
pattern became known as *online transaction processing* (OLTP).

However, databases also started being increasingly used for analytics, which has very different
access patterns compared to OLTP. Usually an analytic query scans over a huge number of records, and
calculates aggregate statistics (such as count, sum, or average) rather than returning the
individual records to the user. For example, a business analyst at a supermarket chain may want to
answer analytic queries such as:

* What was the total revenue of each of our stores in January?
* How many more bananas than usual did we sell during our latest promotion?
* Which brand of baby food is most often purchased together with brand X diapers?

The reports that result from these types of queries are important for business intelligence, helping
the management decide what to do next. In order to differentiate this pattern of using databases
from transaction processing, it has been called *online analytic processing* (OLAP) [^5].
The difference between OLTP and analytics is not always clear-cut, but some typical characteristics are listed in [Table 1-1](/en/ch1#tab_oltp_vs_olap).

{{< figure id="tab_oltp_vs_olap" title="Table 1-1. Comparing characteristics of operational and analytic systems" class="w-full my-4" >}}

| Property            | Operational systems (OLTP)                      | Analytical systems (OLAP)                 |
|---------------------|-------------------------------------------------|-------------------------------------------|
| Main read pattern   | Point queries (fetch individual records by key) | Aggregate over large number of records    |
| Main write pattern  | Create, update, and delete individual records   | Bulk import (ETL) or event stream         |
| Human user example  | End user of web/mobile application              | Internal analyst, for decision support    |
| Machine use example | Checking if an action is authorized             | Detecting fraud/abuse patterns            |
| Type of queries     | Fixed set of queries, predefined by application | Analyst can make arbitrary queries        |
| Data represents     | Latest state of data (current point in time)    | History of events that happened over time |
| Dataset size        | Gigabytes to terabytes                          | Terabytes to petabytes                    |

> [!NOTE]
> The meaning of *online* in *OLAP* is unclear; it probably refers to the fact that queries are not
> just for predefined reports, but that analysts use the OLAP system interactively for explorative
> queries.

With operational systems, users are generally not allowed to construct custom SQL queries and run
them on the database, since that would potentially allow them to read or modify data that they do
not have permission to access. Moreover, they might write queries that are expensive to execute, and
hence affect the database performance for other users. For these reasons, OLTP systems mostly run a
fixed set of queries that are baked into the application code, and use one-off custom queries only
occasionally for maintenance or troubleshooting. On the other hand, analytic databases usually give
their users the freedom to write arbitrary SQL queries by hand, or to generate queries automatically
using a data visualization or dashboard tool such as Tableau, Looker, or Microsoft Power BI.

There is also a type of system that is designed for analytical workloads (queries that aggregate
over many records) but that is embedded into user-facing products. This category is known as
*product analytics* or *real-time analytics*, and systems designed for this type of use include
Pinot, Druid, and ClickHouse [^6].

### Data Warehousing {#sec_introduction_dwh}

At first, the same databases were used for both transaction processing and analytic queries. SQL
turned out to be quite flexible in this regard: it works well for both types of queries.
Nevertheless, in the late 1980s and early 1990s, there was a trend for companies to stop using their
OLTP systems for analytics purposes, and to run the analytics on a separate database system instead.
This separate database was called a *data warehouse*.

A large enterprise may have dozens, even hundreds, of online transaction processing systems:
systems powering the customer-facing website, controlling point of sale (checkout) systems in
physical stores, tracking inventory in warehouses, planning routes for vehicles, managing suppliers,
administering employees, and performing many other tasks. Each of these systems is complex and needs
a team of people to maintain it, so these systems end up operating mostly independently from each
other.

It is usually undesirable for business analysts and data scientists to directly query these OLTP
systems, for several reasons:

* the data of interest may be spread across multiple operational systems, making it difficult to
  combine those datasets in a single query (a problem known as *data silos*);
* the kinds of schemas and data layouts that are good for OLTP are less well suited for analytics
  (see [“Stars and Snowflakes: Schemas for Analytics”](/en/ch3#sec_datamodels_analytics));
* analytic queries can be quite expensive, and running them on an OLTP database would impact the
  performance for other users; and
* the OLTP systems might reside in a separate network that users are not allowed direct access to
  for security or compliance reasons.

A *data warehouse*, by contrast, is a separate database that analysts can query to their hearts’
content, without affecting OLTP operations [^7].
As we shall see in [Chapter 4](/en/ch4#ch_storage), data warehouses often store data in a way that is very different
from OLTP databases, in order to optimize for the types of queries that are common in analytics.

The data warehouse contains a read-only copy of the data in all the various OLTP systems in the
company. Data is extracted from OLTP databases (using either a periodic data dump or a continuous
stream of updates), transformed into an analysis-friendly schema, cleaned up, and then loaded into
the data warehouse. This process of getting data into the data warehouse is known as
*Extract–Transform–Load* (ETL) and is illustrated in [Figure 1-1](/en/ch1#fig_dwh_etl). Sometimes the order of the
*transform* and *load* steps is swapped (i.e., the transformation is done in the data warehouse,
after loading), resulting in *ELT*.

{{< figure src="/fig/ddia_0101.png" id="fig_dwh_etl" caption="Figure 1-1. Simplified outline of ETL into a data warehouse." class="w-full my-4" >}}

In some cases the data sources of the ETL processes are external SaaS products such as customer
relationship management (CRM), email marketing, or credit card processing systems. In those cases,
you do not have direct access to the original database, since it is accessible only via the software
vendor’s API. Bringing the data from these external systems into your own data warehouse can enable
analyses that are not possible via the SaaS API. ETL for SaaS APIs is often implemented by
specialist data connector services such as Fivetran, Singer, or Airbyte.

Some database systems offer *hybrid transactional/analytic processing* (HTAP), which aims to enable
OLTP and analytics in a single system without requiring ETL from one system into another [^8] [^9].
However, many HTAP systems internally consist of an OLTP system coupled with a separate analytical
system, hidden behind a common interface—so the distinction between the two remains important for
understanding how these systems work.

Moreover, even though HTAP exists, it is common to have a separation between transactional and
analytic systems due to their different goals and requirements. In particular, it is considered good
practice for each operational system to have its own database (see
[“Microservices and Serverless”](/en/ch1#sec_introduction_microservices)), leading to hundreds of separate operational databases; on the
other hand, an enterprise usually has a single data warehouse, so that business analysts can combine
data from several operational systems in a single query.

HTAP therefore does not replace data warehouses. Rather, it is useful in scenarios where the same
application needs to both perform analytics queries that scan a large number of rows, and also
read and update individual records with low latency. Fraud detection can involve such workloads, for
example [^10].

The separation between operational and analytical systems is part of a wider trend: as workloads
have become more demanding, systems have become more specialized and optimized for particular
workloads. General-purpose systems can handle small data volumes comfortably, but the greater the
scale, the more specialized systems tend to become [^11].

#### From data warehouse to data lake {#from-data-warehouse-to-data-lake}

A data warehouse often uses a *relational* data model that is queried through SQL (see
[Chapter 3](/en/ch3#ch_datamodels)), perhaps using specialized business intelligence software. This model works well
for the types of queries that business analysts need to make, but it is less well suited to the
needs of data scientists, who might need to perform tasks such as:

* Transform data into a form that is suitable for training a machine learning model; often this
  requires turning the rows and columns of a database table into a vector or matrix of numerical
  values called *features*. The process of performing this transformation in a way that maximizes
  the performance of the trained model is called *feature engineering*, and it often requires custom
  code that is difficult to express using SQL.
* Take textual data (e.g., reviews of a product) and use natural language processing techniques to
  try to extract structured information from it (e.g., the sentiment of the author, or which topics
  they mention). Similarly, they might need to extract structured information from photos using
  computer vision techniques.

Although there have been efforts to add machine learning operators to a SQL data model [^12]
and to build efficient machine learning systems on top of a relational foundation [^13],
many data scientists prefer not to work in a relational database such as a data warehouse. Instead,
many prefer to use Python data analysis libraries such as pandas and scikit-learn, statistical
analysis languages such as R, and distributed analytics frameworks such as Spark [^14].
We discuss these further in [“Dataframes, Matrices, and Arrays”](/en/ch3#sec_datamodels_dataframes).

Consequently, organizations face a need to make data available in a form that is suitable for use by
data scientists. The answer is a *data lake*: a centralized data repository that holds a copy of any
data that might be useful for analysis, obtained from operational systems via ETL processes. The
difference from a data warehouse is that a data lake simply contains files, without imposing any
particular file format or data model. Files in a data lake might be collections of database records,
encoded using a file format such as Avro or Parquet (see [Chapter 5](/en/ch5#ch_encoding)), but they can equally well
contain text, images, videos, sensor readings, sparse matrices, feature vectors, genome sequences,
or any other kind of data [^15].
Besides being more flexible, this is also often cheaper than relational data storage, since the data
lake can use commoditized file storage such as object stores (see [“Cloud-Native System Architecture”](/en/ch1#sec_introduction_cloud_native)).

ETL processes have been generalized to *data pipelines*, and in some cases the data lake has become
an intermediate stop on the path from the operational systems to the data warehouse. The data lake
contains data in a “raw” form produced by the operational systems, without the transformation into a
relational data warehouse schema. This approach has the advantage that each consumer of the data can
transform the raw data into a form that best suits their needs. It has been dubbed the *sushi
principle*: “raw data is better” [^16].

Besides loading data from a data lake into a separate data warehouse, it is also possible to run
typical data warehousing workloads (SQL queries and business analytics) directly on the files in the
data lake, alongside data science/machine learning workloads. This architecture is known as a *data
lakehouse*, and it requires a query execution engine and a metadata (e.g., schema management) layer
that extend the data lake’s file storage [^17].

Apache Hive, Spark SQL, Presto, and Trino are examples of this approach.

#### Beyond the data lake {#beyond-the-data-lake}

As analytics practices have matured, organizations have been increasingly paying attention to the
management and operations of analytics systems and data pipelines, as captured for example in the
DataOps manifesto [^18].
This includes issues of governance, privacy, and compliance with regulations such as GDPR and
CCPA, which we discuss in [“Data Systems, Law, and Society”](/en/ch1#sec_introduction_compliance) and [“Legislation and Self-Regulation”](/en/ch14#sec_future_legislation).

Moreover, analytical data is increasingly made available not only as files and relational tables,
but also as streams of events (see [Chapter 12](/en/ch12#ch_stream)). With file-based data analysis you can re-run the
analysis periodically (e.g., daily) in order to respond to changes in the data, but stream processing
allows analytics systems to respond to events much faster, on the order of seconds. Depending on the
application and how time-sensitive it is, a stream processing approach can be valuable, for example
to identify and block potentially fraudulent or abusive activity.

In some cases the outputs of analytics systems are made available to operational systems (a process
sometimes known as *reverse ETL* [^19]). For example, a machine-learning model that was trained on data in an analytics system may be deployed to
production, so that it can generate recommendations for end-users, such as “people who bought X also
bought Y”. Such deployed outputs of analytics systems are also known as *data products* [^20].
Machine learning models can be deployed to operational systems using specialized tools such as
TFX, Kubeflow, or MLflow.

### Systems of Record and Derived Data {#sec_introduction_derived}

Related to the distinction between operational and analytical systems, this book also distinguishes
between *systems of record* and *derived data systems*. These terms are useful because they can help
you clarify the flow of data through a system:

Systems of record
:   A system of record, also known as *source of truth*, holds the authoritative or *canonical*
    version of some data. When new data comes in, e.g., as user input, it is first written here. Each
    fact is represented exactly once (the representation is typically *normalized*; see
    [“Normalization, Denormalization, and Joins”](/en/ch3#sec_datamodels_normalization)). If there is any discrepancy between another system and the
    system of record, then the value in the system of record is (by definition) the correct one.

Derived data systems
:   Data in a derived system is the result of taking some existing data from another system and
    transforming or processing it in some way. If you lose derived data, you can recreate it from the
    original source. A classic example is a cache: data can be served from the cache if present, but
    if the cache doesn’t contain what you need, you can fall back to the underlying database.
    Denormalized values, indexes, materialized views, transformed data representations, and models
    trained on a dataset also fall into this category.

Technically speaking, derived data is *redundant*, in the sense that it duplicates existing
information. However, it is often essential for getting good performance on read queries. You can
derive several different datasets from a single source, enabling you to look at the data from
different “points of view.”

Analytical systems are usually derived data systems, because they are consumers of data created
elsewhere. Operational services may contain a mixture of systems of record and derived data systems.
The systems of record are the primary databases to which data is first written, whereas the derived
data systems are the indexes and caches that speed up common read operations, especially for queries
that the system of record cannot answer efficiently.

Most databases, storage engines, and query languages are not inherently a system of record or a
derived system. A database is just a tool: how you use it is up to you. The distinction between
system of record and derived data system depends not on the tool, but on how you use it in your
application. By being clear about which data is derived from which other data, you can bring clarity
to an otherwise confusing system architecture.

When the data in one system is derived from the data in another, you need a process for updating the
derived data when the original in the system of record changes. Unfortunately, many databases are
designed based on the assumption that your application only ever needs to use that one database, and
they do not make it easy to integrate multiple systems in order to propagate such updates. In
[“Data Integration”](/en/ch13#sec_future_integration) we will discuss approaches to *data integration*, which allow us to compose multiple
data systems to achieve things that one system alone cannot do.

That brings us to the end of our comparison of analytics and transaction processing. In the next
section, we will examine another trade-off that you might have already seen debated multiple times.


## Cloud versus Self-Hosting {#sec_introduction_cloud}

With anything that an organization needs to do, one of the first questions is: should it be done
in-house, or should it be outsourced? Should you build or should you buy?

Ultimately, this is a question about business priorities. The received management wisdom is that
things that are a core competency or a competitive advantage of your organization should be done
in-house, whereas things that are non-core, routine, or commonplace should be left to a vendor [^21].
To give an extreme example, most companies do not generate their own electricity (unless they are an
energy company, and leaving aside emergency backup power), since it is cheaper to buy electricity from the grid.

With software, two important decisions to be made are who builds the software and who deploys it.
There is a spectrum of possibilities that outsource each decision to various degrees, as illustrated
in [Figure 1-2](/en/ch1#fig_cloud_spectrum). At one extreme is bespoke software that you write and run in-house; at
the other extreme are widely-used cloud services or Software as a Service (SaaS) products that are
implemented and operated by an external vendor, and which you only access through a web interface or API.

{{< figure src="/fig/ddia_0102.png" id="fig_cloud_spectrum" caption="Figure 1-2. A spectrum of types of software and its operations." class="w-full my-4" >}}

The middle ground is off-the-shelf software (open source or commercial) that you *self-host*, i.e.,
deploy yourself—for example, if you download MySQL and install it on a server you control. This
could be on your own hardware (often called *on-premises*, even if the server is actually in a
rented datacenter rack and not literally on your own premises), or on a virtual machine in the cloud
(*Infrastructure as a Service* or IaaS). There are still more points along this spectrum, e.g.,
taking open source software and running a modified version of it.

Separately from this spectrum there is also the question of *how* you deploy services, either in the
cloud or on-premises—for example, whether you use an orchestration framework such as Kubernetes.
However, choice of deployment tooling is out of scope of this book, since other factors have a
greater influence on the architecture of data systems.

### Pros and Cons of Cloud Services {#sec_introduction_cloud_tradeoffs}

Using a cloud service, rather than running comparable software yourself, essentially outsources the
operation of that software to the cloud provider. There are good arguments for and against cloud
services. Cloud providers claim that using their services saves you time and money, and allows you
to move faster compared to setting up your own infrastructure.

Whether a cloud service is actually cheaper and easier than self-hosting depends very much on your
skills and the workload on your systems. If you already have experience setting up and operating the
systems you need, and if your load is quite predictable (i.e., the number of machines you need does
not fluctuate wildly), then it’s often cheaper to buy your own machines and run the software on them
yourself [^22] [^23].

On the other hand, if you need a system that you don’t already know how to deploy and operate, then
adopting a cloud service is often easier and quicker than learning to manage the system yourself. If
you have to hire and train staff specifically to maintain and operate the system, that can get very
expensive. You still need an operations team when you’re using the cloud (see
[“Operations in the Cloud Era”](/en/ch1#sec_introduction_operations)), but outsourcing the basic system administration can free up your
team to focus on higher-level concerns.

When you outsource the operation of a system to a company that specializes in running that service,
that can potentially result in a better service, since the provider gains operational expertise from
providing the service to many customers. On the other hand, if you run the service yourself, you can
configure and tune it to perform well on your particular workload; it is unlikely that a cloud
service would be willing to make such customizations on your behalf.

Cloud services are particularly valuable if the load on your systems varies a lot over time. If you
provision your machines to be able to handle peak load, but those computing resources are idle most
of the time, the system becomes less cost-effective. In this situation, cloud services have the
advantage that they can make it easier to scale your computing resources up or down in response to
changes in demand.

For example, analytics systems often have extremely variable load: running a large analytical query
quickly requires a lot of computing resources in parallel, but once the query completes, those
resources sit idle until the user makes the next query. Predefined queries (e.g., for daily reports)
can be enqueued and scheduled to smooth out the load, but for interactive queries, the faster you
want them to complete, the more variable the workload becomes. If your dataset is so large that
querying it quickly requires significant computing resources, using the cloud can save money, since
you can return unused resources to the provider rather than leaving them idle. For smaller datasets,
this difference is less significant.

The biggest downside of a cloud service is that you have no control over it:

* If it is lacking a feature you need, all you can do is to politely ask the vendor whether they
  will add it; you generally cannot implement it yourself.
* If the service goes down, all you can do is to wait for it to recover.
* If you are using the service in a way that triggers a bug or causes performance problems, it will
  be difficult for you to diagnose the issue. With software that you run yourself, you can get
  performance metrics and debugging information from the operating system to help you understand its
  behavior, and you can look at the server logs, but with a service hosted by a vendor you usually
  do not have access to these internals.
* Moreover, if the service shuts down or becomes unacceptably expensive, or if the vendor decides to
  change their product in a way you don’t like, you are at their mercy—continuing to run an old
  version of the software is usually not an option, so you will be forced to migrate to an
  alternative service [^24].
  This risk is mitigated if there are alternative services that expose a compatible API, but for
  many cloud services there are no standard APIs, which raises the cost of switching, making vendor
  lock-in a problem.
* The cloud provider needs to be trusted to keep the data secure, which can complicate the process
  of complying with privacy and security regulations.

Despite all these risks, it has become more and more popular for organizations to build new
applications on top of cloud services, or to adopt a hybrid approach in which cloud services are
used for some aspects of a system. However, cloud services will not subsume all in-house data
systems: many older systems predate the cloud, and for any services that have specialist
requirements that existing cloud services cannot meet, in-house systems remain necessary. For
example, very latency-sensitive applications such as high-frequency trading require full control of
the hardware.

### Cloud-Native System Architecture {#sec_introduction_cloud_native}

Besides having a different economic model (subscribing to a service instead of buying hardware and
licensing software to run on it), the rise of the cloud has also had a profound effect on how data
systems are implemented on a technical level. The term *cloud-native* is used to describe an
architecture that is designed to take advantage of cloud services.

In principle, almost any software that you can self-host could also be provided as a cloud service,
and indeed such managed services are now available for many popular data systems. However, systems
that have been designed from the ground up to be cloud-native have been shown to have several
advantages: better performance on the same hardware, faster recovery from failures, being able to
quickly scale computing resources to match the load, and supporting larger datasets [^25] [^26] [^27].
[Table 1-2](/en/ch1#tab_cloud_native_dbs) lists some examples of both types of systems.

{{< figure id="tab_cloud_native_dbs" title="Table 1-2. Examples of self-hosted and cloud-native database systems" class="w-full my-4" >}}

| Category         | Self-hosted systems         | Cloud-native systems                                                  |
|------------------|-----------------------------|-----------------------------------------------------------------------|
| Operational/OLTP | MySQL, PostgreSQL, MongoDB  | AWS Aurora [^25], Azure SQL DB Hyperscale [^26], Google Cloud Spanner |
| Analytical/OLAP  | Teradata, ClickHouse, Spark | Snowflake [^27], Google BigQuery, Azure Synapse Analytics             |

#### Layering of cloud services {#layering-of-cloud-services}

Many self-hosted data systems have very simple system requirements: they run on a conventional
operating system such as Linux or Windows, they store their data as files on the filesystem, and
they communicate via standard network protocols such as TCP/IP. A few systems depend on special
hardware such as GPUs (for machine learning) or RDMA network interfaces, but on the whole,
self-hosted software tends to use very generic computing resources: CPU, RAM, a filesystem, and an IP network.

In a cloud, this type of software can be run on an Infrastructure-as-a-Service environment, using
one or more virtual machines (or *instances*) with a certain allocation of CPUs, memory, disk, and
network bandwidth. Compared to physical machines, cloud instances can be provisioned faster and they
come in a greater variety of sizes, but otherwise they are similar to a traditional computer: you
can run any software you like on them, but you are responsible for administering them yourself.

In contrast, the key idea of cloud-native services is to use not only the computing resources
managed by your operating system, but also to build upon lower-level cloud services to create
higher-level services. For example:

* *Object storage* services such as Amazon S3, Azure Blob Storage, and Cloudflare R2 store large
  files. They provide more limited APIs than a typical filesystem (basic file reads and writes), but
  they have the advantage that they hide the underlying physical machines: the service automatically
  distributes the data across many machines, so that you don’t have to worry about running out of
  disk space on any one machine. Even if some machines or their disks fail entirely, no data is
  lost.
* Many other services are in turn built upon object storage and other cloud services: for example,
  Snowflake is a cloud-based analytic database (data warehouse) that relies on S3 for data storage [^27], 
  and some other services in turn build upon Snowflake.

As always with abstractions in computing, there is no one right answer to what you should use. As a
general rule, higher-level abstractions tend to be more oriented towards particular use cases. If
your needs match the situations for which a higher-level system is designed, using the existing
higher-level system will probably provide what you need with much less hassle than building it
yourself from lower-level systems. On the other hand, if there is no high-level system that meets
your needs, then building it yourself from lower-level components is the only option.

#### Separation of storage and compute {#sec_introduction_storage_compute}

In traditional computing, disk storage is regarded as durable (we assume that once something is
written to disk, it will not be lost). To tolerate the failure of an individual hard disk, RAID
(Redundant Array of Independent Disks) is often used to maintain copies of the data on several
disks attached to the same machine. RAID can be performed either in hardware or in software by the
operating system, and it is transparent to the applications accessing the filesystem.

In the cloud, compute instances (virtual machines) may also have local disks attached, but
cloud-native systems typically treat these disks more like an ephemeral cache, and less like
long-term storage. This is because the local disk becomes inaccessible if the associated instance
fails, or if the instance is replaced with a bigger or a smaller one (on a different physical machine) in order to adapt to changes in load.

As an alternative to local disks, cloud services also offer virtual disk storage that can be
detached from one instance and attached to a different one (Amazon EBS, Azure managed disks, and
persistent disks in Google Cloud). Such a virtual disk is not actually a physical disk, but rather a
cloud service provided by a separate set of machines, which emulates the behavior of a disk (a
*block device*, where each block is typically 4 KiB in size). This technology makes it
possible to run traditional disk-based software in the cloud, but the block device emulation
introduces overheads that can be avoided in systems that are designed from the ground up for the cloud [^25]. It also makes the application
very sensitive to network glitches, since every I/O on the virtual block device is actually a network call [^28].

To address this problem, cloud-native services generally avoid using virtual disks, and instead
build on dedicated storage services that are optimized for particular workloads. Object storage
services such as S3 are designed for long-term storage of fairly large files, ranging from hundreds
of kilobytes to several gigabytes in size. The individual rows or values stored in a database are
typically much smaller than this; cloud databases therefore typically manage smaller values in a
separate service, and store larger data blocks (containing many individual values) in an object
store [^26] [^29]. We will see ways of doing this in [Chapter 4](/en/ch4#ch_storage).

In a traditional systems architecture, the same computer is responsible for both storage (disk) and
computation (CPU and RAM), but in cloud-native systems, these two responsibilities have become
somewhat separated or *disaggregated* [^9] [^27] [^30] [^31]:
for example, S3 only stores files, and if you want to analyze that data, you will have to run the
analysis code somewhere outside of S3. This implies transferring the data over the network, which we
will discuss further in [“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed).

Moreover, cloud-native systems are often *multitenant*, which means that rather than having a
separate machine for each customer, data and computation from several different customers are
handled on the same shared hardware by the same service [^32].

Multitenancy can enable better hardware utilization, easier scalability, and easier management by
the cloud provider, but it also requires careful engineering to ensure that one customer’s activity
does not affect the performance or security of the system for other customers [^33].

### Operations in the Cloud Era {#sec_introduction_operations}

Traditionally, the people managing an organization’s server-side data infrastructure were known as
*database administrators* (DBAs) or *system administrators* (sysadmins). More recently, many
organizations have tried to integrate the roles of software development and operations into teams
with a shared responsibility for both backend services and data infrastructure; the *DevOps*
philosophy has guided this trend. *Site Reliability Engineers* (SREs) are Google’s implementation of
this idea [^34].

The role of operations is to ensure services are reliably delivered to users (including configuring
infrastructure and deploying applications), and to ensure a stable production environment (including
monitoring and diagnosing any problems that may affect reliability). For self-hosted systems,
operations traditionally involves a significant amount of work at the level of individual machines,
such as capacity planning (e.g., monitoring available disk space and adding more disks before you
run out of space), provisioning new machines, moving services from one machine to another, and
installing operating system patches.

Many cloud services present an API that hides the individual machines that actually implement the
service. For example, cloud storage replaces fixed-size disks with *metered billing*, where you can
store data without planning your capacity needs in advance, and you are then charged based on the
space actually used. Moreover, many cloud services remain highly available, even when individual
machines have failed (see [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability)).

This shift in emphasis from individual machines to services has been accompanied by a change in the
role of operations. The high-level goal of providing a reliable service remains the same, but the
processes and tools have evolved. The DevOps/SRE philosophy places greater emphasis on:

* automation—preferring repeatable processes over manual one-off jobs,
* preferring ephemeral virtual machines and services over long-running servers,
* enabling frequent application updates,
* learning from incidents, and
* preserving the organization’s knowledge about the system, even as individual people come and go [^35].

With the rise of cloud services, there has been a bifurcation of roles: operations teams at
infrastructure companies specialize in the details of providing a reliable service to a large number
of customers, while the customers of the service spend as little time and effort as possible on infrastructure [^36].

Customers of cloud services still require operations, but they focus on different aspects, such as
choosing the most appropriate service for a given task, integrating different services with each
other, and migrating from one service to another. Even though metered billing removes the need for
capacity planning in the traditional sense, it’s still important to know what resources you are
using for which purpose, so that you don’t waste money on cloud resources that are not needed:
capacity planning becomes financial planning, and performance optimization becomes cost optimization [^37].

Moreover, cloud services do have resource limits or *quotas* (such as the maximum number of
processes you can run concurrently), which you need to know about and plan for before you run into them [^38].

Adopting a cloud service can be easier and quicker than running your own infrastructure, although
even here there is a cost in learning how to use it, and perhaps working around its limitations.
Integration between different services becomes a particular challenge as a growing number of vendors
offer an ever broader range of cloud services targeting different use cases [^39] [^40].

ETL (see [“Data Warehousing”](/en/ch1#sec_introduction_dwh)) is only part of the story; operational cloud services also need
to be integrated with each other. At present, there is a lack of standards that would facilitate
this sort of integration, so it often involves significant manual effort.

Other operational aspects that cannot fully be outsourced to cloud services include maintaining the
security of an application and the libraries it uses, managing the interactions between your own
services, monitoring the load on your services, and tracking down the cause of problems such as
performance degradations or outages. While the cloud is changing the role of operations, the need
for operations is as great as ever.


## Distributed versus Single-Node Systems {#sec_introduction_distributed}

A system that involves several machines communicating via a network is called a *distributed
system*. Each of the processes participating in a distributed system is called a *node*. There are
various reasons why you might want a system to be distributed:

Inherently distributed systems
:   If an application involves two or more interacting users, each using their own device, then the
    system is unavoidably distributed: the communication between the devices will have to go via a
    network.

Requests between cloud services
:   If data is stored in one service but processed in another, it must be transferred over the network
    from one service to the other.

Fault tolerance/high availability
:   If your application needs to continue working even if one machine (or several machines, or
    the network, or an entire datacenter) goes down, you can use multiple machines to give you
    redundancy. When one fails, another one can take over. See [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability) and
    [Chapter 6](/en/ch6#ch_replication) on replication.

Scalability
:   If your data volume or computing requirements grow bigger than a single machine can handle,
    you can potentially spread the load across multiple machines. See
    [“Scalability”](/en/ch2#sec_introduction_scalability).

Latency
:   If you have users around the world, you might want to have servers in various regions
    worldwide so that each user can be served from a server that is geographically close to
    them. That avoids the users having to wait for network packets to travel halfway around the
    world to answer their requests. See [“Describing Performance”](/en/ch2#sec_introduction_percentiles).

Elasticity
:   If your application is busy at some times and idle at other times, a cloud deployment can scale up
    or down to meet the demand, so that you pay only for resources you are actively using. This is more
    difficult on a single machine, which needs to be provisioned to handle the maximum load, even at
    times when it is barely used.

Using specialized hardware
:   Different parts of the system can take advantage of different types of hardware to match their
    workload. For example, an object store may use machines with many disks but few CPUs, whereas a
    data analysis system may use machines with lots of CPU and memory but no disks, and a machine
    learning system may use machines with GPUs (which are much more efficient than CPUs for training
    deep neural networks and other machine learning tasks).

Legal compliance
:   Some countries have data residency laws that require data about people in their jurisdiction to be
    stored and processed geographically within that country [^41].
    The scope of these rules varies—for example, in some cases they apply only to medical or financial
    data, while other cases are broader. A service with users in several such jurisdictions will
    therefore have to distribute their data across servers in several locations.

Sustainability
:   If you have flexibility on where and when to run your jobs, you might be able to run them in a
    time and place where plenty of renewable electricity is available, and avoid running them when the
    power grid is under strain. This can reduce your carbon emissions and allow you to take advantage
    of cheap power when it is available [^42] [^43].

These reasons apply both to services that you write yourself (application code) and services
consisting of off-the-shelf software (such as databases).

### Problems with Distributed Systems {#sec_introduction_dist_sys_problems}

Distributed systems also have downsides. Every request and API call that goes via the network needs
to deal with the possibility of failure: the network may be interrupted, or the service may be
overloaded or crashed, and therefore any request may time out without receiving a response. In this
case, we don’t know whether the service received the request, and simply retrying it might not be
safe. We will discuss these problems in detail in [Chapter 9](/en/ch9#ch_distributed).

Although datacenter networks are fast, making a call to another service is still vastly slower than
calling a function in the same process [^44].

When operating on large volumes of data, rather than transferring the data from storage to a
separate machine that processes it, it can be faster to bring the computation to the machine that
already has the data [^45].

More nodes are not always faster: in some cases, a simple single-threaded program on one computer
can perform significantly better than a cluster with over 100 CPU cores [^46].

Troubleshooting a distributed system is often difficult: if the system is slow to respond, how do
you figure out where the problem lies? Techniques for diagnosing problems in distributed systems are
developed under the heading of *observability* [^47] [^48],
which involves collecting data about the execution of a system, and allowing it to be queried in
ways that allow both high-level metrics and individual events to be analyzed. *Tracing* tools such
as OpenTelemetry, Zipkin, and Jaeger allow you to track which client called which server for which
operation, and how long each call took [^49].

Databases provide various mechanisms for ensuring data consistency, as we shall see in
[Chapter 6](/en/ch6#ch_replication) and [Chapter 8](/en/ch8#ch_transactions). However, when each service has its own database,
maintaining consistency of data across those different services becomes the application’s problem.
Distributed transactions, which we explore in [Chapter 8](/en/ch8#ch_transactions), are a possible technique for
ensuring consistency, but they are rarely used in a microservices context because they run counter
to the goal of making services independent from each other, and many databases don’t support them [^50].

For all these reasons, if you can do something on a single machine, this is often much simpler and
cheaper compared to setting up a distributed system [^23] [^46] [^51].
CPUs, memory, and disks have grown larger, faster, and more reliable. When combined with single-node
databases such as DuckDB, SQLite, and KùzuDB, many workloads can now run on a single node. We will
explore more on this topic in [Chapter 4](/en/ch4#ch_storage).

### Microservices and Serverless {#sec_introduction_microservices}

The most common way of distributing a system across multiple machines is to divide them into clients
and servers, and let the clients make requests to the servers. Most commonly HTTP is used for this
communication, as we will discuss in [“Dataflow Through Services: REST and RPC”](/en/ch5#sec_encoding_dataflow_rpc). The same process may be both a
server (handling incoming requests) and a client (making outbound requests to other services).

This way of building applications has traditionally been called a *service-oriented architecture*
(SOA); more recently the idea has been refined into a *microservices* architecture [^52] [^53].
In this architecture, a service has one well-defined purpose (for example, in the case of S3, this
would be file storage); each service exposes an API that can be called by clients via the network,
and each service has one team that is responsible for its maintenance. A complex application can
thus be decomposed into multiple interacting services, each managed by a separate team.

There are several advantages to breaking down a complex piece of software into multiple services:
each service can be updated independently, reducing coordination effort among teams; each service
can be assigned the hardware resources it needs; and by hiding the implementation details behind an
API, the service owners are free to change the implementation without affecting clients. In terms of
data storage, it is common for each service to have its own databases, and not to share databases
between services: sharing a database would effectively make the entire database structure a part of
the service’s API, and then that structure would be difficult to change. Shared databases could also
cause one service’s queries to negatively impact the performance of other services.

On the other hand, having many services can itself breed complexity: each service requires
infrastructure for deploying new releases, adjusting the allocated hardware resources to match the
load, collecting logs, monitoring service health, and alerting an on-call engineer in the case of a
problem. *Orchestration* frameworks such as Kubernetes have become a popular way of deploying
services, since they provide a foundation for this infrastructure. Testing a service during
development can be complicated, since you also need to run all the other services that it depends on.

Microservice APIs can be challenging to evolve. Clients that call an API expect the API to have
certain fields. Developers might wish to add fields to or remove fields from an API as business needs change,
but doing so can cause clients to fail. Worse still, such failures are often not discovered until
late in the development cycle when the updated service API is deployed to a staging or production
environment. API description standards such as OpenAPI and gRPC help manage the relationship between
client and server APIs; we discuss these further in [Chapter 5](/en/ch5#ch_encoding).

Microservices are primarily a technical solution to a people problem: allowing different teams to
make progress independently without having to coordinate with each other. This is valuable in a large
company, but in a small company where there are not many teams, using microservices is likely to be
unnecessary overhead, and it is preferable to implement the application in the simplest way possible [^52].

*Serverless*, or *function-as-a-service* (FaaS), is another approach to deploying services, in which
the management of the infrastructure is outsourced to a cloud vendor [^33].
When using virtual machines, you have to explicitly choose when to start up or shut down an
instance; in contrast, with the serverless model, the cloud provider automatically allocates and
frees hardware resources as needed, based on the incoming requests to your service [^54]. 
Serverless deployment shifts more of the operational burden to cloud providers and enables flexible billing 
by usage rather than machine instances. To offer such benefits, many serverless infrastructure providers
impose a time limit on function execution, limit runtime environments, and might suffer from slow
start times when a function is first invoked. The term “serverless” can also be misleading: each
serverless function execution still runs on a server, but subsequent executions might run on a
different one. Moreover, infrastructure such as BigQuery and various Kafka offerings have adopted
“serverless” terminology to signal that their services auto-scale and that they bill by usage rather than machine instances.

Just like cloud storage replaced capacity planning (deciding in advance how many disks to buy) with
a metered billing model, the serverless approach is bringing metered billing to code execution: you
only pay for the time that your application code is actually running, rather than having to
provision resources in advance.

### Cloud Computing versus Supercomputing {#id17}

Cloud computing is not the only way of building large-scale computing systems; an alternative is
*high-performance computing* (HPC), also known as *supercomputing*. Although there are overlaps, HPC
often has different priorities and uses different techniques compared to cloud computing and
enterprise datacenter systems. Some of those differences are:

* Supercomputers are typically used for computationally intensive scientific computing tasks, such
  as weather forecasting, climate modeling, molecular dynamics (simulating the movement of atoms and
  molecules), complex optimization problems, and solving partial differential equations. On the
  other hand, cloud computing tends to be used for online services, business data systems, and
  similar systems that need to serve user requests with high availability.
* A supercomputer typically runs large batch jobs that checkpoint the state of their computation to
  disk from time to time. If a node fails, a common solution is to simply stop the entire cluster
  workload, repair the faulty node, and then restart the computation from the last checkpoint [^55] [^56].
  With cloud services, it is usually not desirable to stop the entire cluster, since the services
  need to continually serve users with minimal interruptions.
* Supercomputer nodes typically communicate through shared memory and remote direct memory access
  (RDMA), which support high bandwidth and low latency, but assume a high level of trust among the users of the system [^57].
  In cloud computing, the network and the machines are often shared by mutually untrusting
  organizations, requiring stronger security mechanisms such as resource isolation (e.g., virtual
  machines), encryption, and authentication.
* Cloud datacenter networks are often based on IP and Ethernet, arranged in Clos topologies to
  provide high bisection bandwidth—a commonly used measure of a network’s overall performance [^55] [^58].
  Supercomputers often use specialized network topologies, such as multi-dimensional meshes and toruses [^59],
  which yield better performance for HPC workloads with known communication patterns.
* Cloud computing allows nodes to be distributed across multiple geographic regions, whereas
  supercomputers generally assume that all of their nodes are close together.

Large-scale analytics systems sometimes share some characteristics with supercomputing, which is why
it can be worth knowing about these techniques if you are working in this area. However, this book
is mostly concerned with services that need to be continually available, as discussed in [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability).

## Data Systems, Law, and Society {#sec_introduction_compliance}

So far you’ve seen in this chapter that the architecture of data systems is influenced not only by
technical goals and requirements, but also by the human needs of the organizations that they
support. Increasingly, data systems engineers are realizing that serving the needs of their own
business is not enough: we also have a responsibility towards society at large.

Of particular concern are systems that store data about people and their behavior. Since 2018 the
*General Data Protection Regulation* (GDPR) has given residents of many European countries greater
control and legal rights over their personal data, and similar privacy regulation has been adopted
in various other countries and states around the world, including for example the California
Consumer Privacy Act (CCPA). Regulations around AI, such as the *EU AI Act*, place further
restrictions on how personal data can be used.

Moreover, even in areas that are not directly subject to regulation, there is increasing recognition
of the effects that computer systems have on people and society. Social media has changed how
individuals consume news, which influences their political opinions and hence may affect the outcome
of elections. Automated systems increasingly make decisions that have profound consequences for
individuals, such as deciding who should be given a loan or insurance coverage, who should be
invited to a job interview, or who should be suspected of a crime [^60].

Everyone who works on such systems shares a responsibility for considering the ethical impact and
ensuring that they comply with relevant law. It is not necessary for everybody to become an expert
in law and ethics, but a basic awareness of legal and ethical principles is just as important as,
say, some foundational knowledge in distributed systems.

Legal considerations are influencing the very foundations of how data systems are being designed [^61].
For example, the GDPR grants individuals the right to have their data erased on request (sometimes
known as the *right to be forgotten*). However, as we shall see in this book, many data systems rely
on immutable constructs such as append-only logs as part of their design; how can we ensure deletion
of some data in the middle of a file that is supposed to be immutable? How do we handle deletion of
data that has been incorporated into derived datasets (see [“Systems of Record and Derived Data”](/en/ch1#sec_introduction_derived)), such as
training data for machine learning models? Answering these questions creates new engineering
challenges.

At present we don’t have clear guidelines on which particular technologies or system architectures
should be considered “GDPR-compliant” or not. The regulation deliberately does not mandate
particular technologies, because these may quickly change as technology progresses. Instead, the
legal texts set out high-level principles that are subject to interpretation. This means that there
are no simple answers to the question of how to comply with privacy regulation, but we will look at
some of the technologies in this book through this lens.

In general, we store data because we think that its value is greater than the costs of storing it.
However, it is worth remembering that the costs of storage are not just the bill you pay for Amazon
S3 or another service: the cost-benefit calculation should also take into account the risks of
liability and reputational damage if the data were to be leaked or compromised by adversaries, and
the risk of legal costs and fines if the storage and processing of the data is found not to be
compliant with the law [^51].

Governments or police forces might also compel companies to hand over data. When there is a risk
that the data may reveal criminalized behaviors (for example, homosexuality in several Middle
Eastern and African countries, or seeking an abortion in several US states), storing that data
creates real safety risks for users. Travel to an abortion clinic, for example, could easily be
revealed by location data, perhaps even by a log of the user’s IP addresses over time (which
indicate approximate location).

Once all the risks are taken into account, it might be reasonable to decide that some data is simply
not worth storing, and that it should therefore be deleted. This principle of *data minimization*
(sometimes known by the German term *Datensparsamkeit*) runs counter to the “big data” philosophy of
storing lots of data speculatively in case it turns out to be useful in the future [^62].
But it fits with the GDPR, which mandates that personal data may only be collected for a specified,
explicit purpose, that this data may not later be used for any other purpose, and that the data must
not be kept for longer than necessary for the purposes for which it was collected [^63].

Businesses have also taken notice of privacy and safety concerns. Credit card companies require
payment processing businesses to adhere to strict payment card industry (PCI) standards. Processors
undergo frequent evaluations from independent auditors to verify continued compliance. Software
vendors have also seen increased scrutiny. Many buyers now require their vendors to comply with
Service Organization Control (SOC) Type 2 standards. As with PCI compliance, vendors undergo
third-party audits to verify adherence.

Generally, it is important to balance the needs of your business against the needs of the people
whose data you are collecting and processing. There is much more to this topic; in [Chapter 14](/en/ch14#ch_right_thing) we
will go deeper into the topics of ethics and legal compliance, including the problems of bias and
discrimination.

## Summary {#summary}

The theme of this chapter has been to understand trade-offs: that is, to recognize that for many
questions there is not one right answer, but several different approaches that each have various
pros and cons. We explored some of the most important choices that affect the architecture of data
systems, and introduced terminology that will be needed throughout the rest of this book.

We started by making a distinction between operational (transaction-processing, OLTP) and analytical
(OLAP) systems, and saw their different characteristics: not only managing different types of data
with different access patterns, but also serving different audiences. We encountered the concept of
a data warehouse and data lake, which receive data feeds from operational systems via ETL. In
[Chapter 4](/en/ch4#ch_storage) we will see that operational and analytical systems often use very different internal
data layouts because of the different types of queries they need to serve.

We then compared cloud services, a comparatively recent development, to the traditional paradigm of
self-hosted software that has previously dominated data systems architecture. Which of these
approaches is more cost-effective depends a lot on your particular situation, but it’s undeniable
that cloud-native approaches are bringing big changes to the way data systems are architected, for
example in the way they separate storage and compute.

Cloud systems are intrinsically distributed, and we briefly examined some of the trade-offs of
distributed systems compared to using a single machine. There are situations in which you can’t
avoid going distributed, but it’s advisable not to rush into making a system distributed if it’s
possible to keep it on a single machine. In [Chapter 9](/en/ch9#ch_distributed) we will cover the challenges with
distributed systems in more detail.

Finally, we saw that data systems architecture is determined not only by the needs of the business
deploying the system, but also by privacy regulation that protects the rights of the people whose
data is being processed—an aspect that many engineers are prone to ignoring. How we translate legal
requirements into technical implementations is not yet well understood, but it’s important to keep
this question in mind as we move through the rest of this book.

### References

[^1]: Richard T. Kouzes, Gordon A. Anderson, Stephen T. Elbert, Ian Gorton, and Deborah K. Gracio. [The Changing Paradigm of Data-Intensive Computing](http://www2.ic.uff.br/~boeres/slides_AP/papers/TheChanginParadigmDataIntensiveComputing_2009.pdf). *IEEE Computer*, volume 42, issue 1, January 2009. [doi:10.1109/MC.2009.26](https://doi.org/10.1109/MC.2009.26)
[^2]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: you own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *2019 ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^3]: Joe Reis and Matt Housley. [*Fundamentals of Data Engineering*](https://www.oreilly.com/library/view/fundamentals-of-data/9781098108298/). O’Reilly Media, 2022. ISBN: 9781098108304
[^4]: Rui Pedro Machado and Helder Russa. [*Analytics Engineering with SQL and dbt*](https://www.oreilly.com/library/view/analytics-engineering-with/9781098142377/). O’Reilly Media, 2023. ISBN: 9781098142384
[^5]: Edgar F. Codd, S. B. Codd, and C. T. Salley. [Providing OLAP to User-Analysts: An IT Mandate](https://www.estgv.ipv.pt/PaginasPessoais/jloureiro/ESI_AID2007_2008/fichas/codd.pdf). E. F. Codd Associates, 1993. Archived at [perma.cc/RKX8-2GEE](https://perma.cc/RKX8-2GEE)
[^6]: Chinmay Soman and Neha Pawar. [Comparing Three Real-Time OLAP Databases: Apache Pinot, Apache Druid, and ClickHouse](https://startree.ai/blog/a-tale-of-three-real-time-olap-databases). *startree.ai*, April 2023. Archived at [perma.cc/8BZP-VWPA](https://perma.cc/8BZP-VWPA)
[^7]: Surajit Chaudhuri and Umeshwar Dayal. [An Overview of Data Warehousing and OLAP Technology](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/sigrecord.pdf). *ACM SIGMOD Record*, volume 26, issue 1, pages 65–74, March 1997. [doi:10.1145/248603.248616](https://doi.org/10.1145/248603.248616)
[^8]: Fatma Özcan, Yuanyuan Tian, and Pinar Tözün. [Hybrid Transactional/Analytical Processing: A Survey](https://humming80.github.io/papers/sigmod-htaptut.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3054784](https://doi.org/10.1145/3035918.3054784)
[^9]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/abs/10.1145/3514221.3526055). At *International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055)
[^10]: Chao Zhang, Guoliang Li, Jintao Zhang, Xinning Zhang, and Jianhua Feng. [HTAP Databases: A Survey](https://arxiv.org/pdf/2404.15670). *IEEE Transactions on Knowledge and Data Engineering*, April 2024. [doi:10.1109/TKDE.2024.3389693](https://doi.org/10.1109/TKDE.2024.3389693)
[^11]: Michael Stonebraker and Uğur Çetintemel. [‘One Size Fits All’: An Idea Whose Time Has Come and Gone](https://pages.cs.wisc.edu/~shivaram/cs744-readings/fits_all.pdf). At *21st International Conference on Data Engineering* (ICDE), April 2005. [doi:10.1109/ICDE.2005.1](https://doi.org/10.1109/ICDE.2005.1)
[^12]: Jeffrey Cohen, Brian Dolan, Mark Dunlap, Joseph M. Hellerstein, and Caleb Welton. [MAD Skills: New Analysis Practices for Big Data](https://www.vldb.org/pvldb/vol2/vldb09-219.pdf). *Proceedings of the VLDB Endowment*, volume 2, issue 2, pages 1481–1492, August 2009. [doi:10.14778/1687553.1687576](https://doi.org/10.14778/1687553.1687576)
[^13]: Dan Olteanu. [The Relational Data Borg is Learning](https://www.vldb.org/pvldb/vol13/p3502-olteanu.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, August 2020. [doi:10.14778/3415478.3415572](https://doi.org/10.14778/3415478.3415572)
[^14]: Matt Bornstein, Martin Casado, and Jennifer Li. [Emerging Architectures for Modern Data Infrastructure: 2020](https://future.a16z.com/emerging-architectures-for-modern-data-infrastructure-2020/). *future.a16z.com*, October 2020. Archived at [perma.cc/LF8W-KDCC](https://perma.cc/LF8W-KDCC)
[^15]: Martin Fowler. [DataLake](https://www.martinfowler.com/bliki/DataLake.html). *martinfowler.com*, February 2015. Archived at [perma.cc/4WKN-CZUK](https://perma.cc/4WKN-CZUK)
[^16]: Bobby Johnson and Joseph Adler. [The Sushi Principle: Raw Data Is Better](https://learning.oreilly.com/videos/strata-hadoop/9781491924143/9781491924143-video210840/). At *Strata+Hadoop World*, February 2015.
[^17]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^18]: DataKitchen, Inc. [The DataOps Manifesto](https://dataopsmanifesto.org/en/). *dataopsmanifesto.org*, 2017. Archived at [perma.cc/3F5N-FUQ4](https://perma.cc/3F5N-FUQ4)
[^19]: Tejas Manohar. [What is Reverse ETL: A Definition & Why It’s Taking Off](https://hightouch.io/blog/reverse-etl/). *hightouch.io*, November 2021. Archived at [perma.cc/A7TN-GLYJ](https://perma.cc/A7TN-GLYJ)
[^20]: Simon O’Regan. [Designing Data Products](https://towardsdatascience.com/designing-data-products-b6b93edf3d23). *towardsdatascience.com*, August 2018. Archived at [perma.cc/HU67-3RV8](https://perma.cc/HU67-3RV8)
[^21]: Camille Fournier. [Why is it so hard to decide to buy?](https://skamille.medium.com/why-is-it-so-hard-to-decide-to-buy-d86fee98e88e) *skamille.medium.com*, July 2021. Archived at [perma.cc/6VSG-HQ5X](https://perma.cc/6VSG-HQ5X)
[^22]: David Heinemeier Hansson. [Why we’re leaving the cloud](https://world.hey.com/dhh/why-we-re-leaving-the-cloud-654b47e0). *world.hey.com*, October 2022. Archived at [perma.cc/82E6-UJ65](https://perma.cc/82E6-UJ65)
[^23]: Nima Badizadegan. [Use One Big Server](https://specbranch.com/posts/one-big-server/). *specbranch.com*, August 2022. Archived at [perma.cc/M8NB-95UK](https://perma.cc/M8NB-95UK)
[^24]: Steve Yegge. [Dear Google Cloud: Your Deprecation Policy is Killing You](https://steve-yegge.medium.com/dear-google-cloud-your-deprecation-policy-is-killing-you-ee7525dc05dc). *steve-yegge.medium.com*, August 2020. Archived at [perma.cc/KQP9-SPGU](https://perma.cc/KQP9-SPGU)
[^25]: Alexandre Verbitski, Anurag Gupta, Debanjan Saha, Murali Brahmadesam, Kamal Gupta, Raman Mittal, Sailesh Krishnamurthy, Sandor Maurice, Tengiz Kharatishvili, and Xiaofeng Bao. [Amazon Aurora: Design Considerations for High Throughput Cloud-Native Relational Databases](https://media.amazonwebservices.com/blog/2017/aurora-design-considerations-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1041–1052, May 2017. [doi:10.1145/3035918.3056101](https://doi.org/10.1145/3035918.3056101)
[^26]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047)
[^27]: Midhul Vuppalapati, Justin Miron, Rachit Agarwal, Dan Truong, Ashish Motivala, and Thierry Cruanes. [Building An Elastic Query Engine on Disaggregated Storage](https://www.usenix.org/system/files/nsdi20-paper-vuppalapati.pdf). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020.
[^28]: Nick Van Wiggeren. [The Real Failure Rate of EBS](https://planetscale.com/blog/the-real-fail-rate-of-ebs). *planetscale.com*, March 2025. Archived at [perma.cc/43CR-SAH5](https://perma.cc/43CR-SAH5)
[^29]: Colin Breck. [Predicting the Future of Distributed Systems](https://blog.colinbreck.com/predicting-the-future-of-distributed-systems/). *blog.colinbreck.com*, August 2024. Archived at [perma.cc/K5FC-4XX2](https://perma.cc/K5FC-4XX2)
[^30]: Gwen Shapira. [Compute-Storage Separation Explained](https://www.thenile.dev/blog/storage-compute). *thenile.dev*, January 2023. Archived at [perma.cc/QCV3-XJNZ](https://perma.cc/QCV3-XJNZ)
[^31]: Ravi Murthy and Gurmeet Goindi. [AlloyDB for PostgreSQL under the hood: Intelligent, database-aware storage](https://cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage). *cloud.google.com*, May 2022. Archived at [archive.org](https://web.archive.org/web/20220514021120/https%3A//cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage)
[^32]: Jack Vanlightly. [The Architecture of Serverless Data Systems](https://jack-vanlightly.com/blog/2023/11/14/the-architecture-of-serverless-data-systems). *jack-vanlightly.com*, November 2023. Archived at [perma.cc/UDV4-TNJ5](https://perma.cc/UDV4-TNJ5)
[^33]: Eric Jonas, Johann Schleier-Smith, Vikram Sreekanti, Chia-Che Tsai, Anurag Khandelwal, Qifan Pu, Vaishaal Shankar, Joao Carreira, Karl Krauth, Neeraja Yadwadkar, Joseph E. Gonzalez, Raluca Ada Popa, Ion Stoica, and David A. Patterson. [Cloud Programming Simplified: A Berkeley View on Serverless Computing](https://arxiv.org/abs/1902.03383). *arxiv.org*, February 2019.
[^34]: Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy. [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O’Reilly Media, 2016. ISBN: 9781491929124
[^35]: Thomas Limoncelli. [The Time I Stole $10,000 from Bell Labs](https://queue.acm.org/detail.cfm?id=3434773). *ACM Queue*, volume 18, issue 5, November 2020. [doi:10.1145/3434571.3434773](https://doi.org/10.1145/3434571.3434773)
[^36]: Charity Majors. [The Future of Ops Jobs](https://acloudguru.com/blog/engineering/the-future-of-ops-jobs). *acloudguru.com*, August 2020. Archived at [perma.cc/GRU2-CZG3](https://perma.cc/GRU2-CZG3)
[^37]: Boris Cherkasky. [(Over)Pay As You Go for Your Datastore](https://medium.com/riskified-technology/over-pay-as-you-go-for-your-datastore-11a29ae49a8b). *medium.com*, September 2021. Archived at [perma.cc/Q8TV-2AM2](https://perma.cc/Q8TV-2AM2)
[^38]: Shlomi Kushchi. [Serverless Doesn’t Mean DevOpsLess or NoOps](https://thenewstack.io/serverless-doesnt-mean-devopsless-or-noops/). *thenewstack.io*, February 2023. Archived at [perma.cc/3NJR-AYYU](https://perma.cc/3NJR-AYYU)
[^39]: Erik Bernhardsson. [Storm in the stratosphere: how the cloud will be reshuffled](https://erikbern.com/2021/11/30/storm-in-the-stratosphere-how-the-cloud-will-be-reshuffled.html). *erikbern.com*, November 2021. Archived at [perma.cc/SYB2-99P3](https://perma.cc/SYB2-99P3)
[^40]: Benn Stancil. [The data OS](https://benn.substack.com/p/the-data-os). *benn.substack.com*, September 2021. Archived at [perma.cc/WQ43-FHS6](https://perma.cc/WQ43-FHS6)
[^41]: Maria Korolov. [Data residency laws pushing companies toward residency as a service](https://www.csoonline.com/article/3647761/data-residency-laws-pushing-companies-toward-residency-as-a-service.html). *csoonline.com*, January 2022. Archived at [perma.cc/CHE4-XZZ2](https://perma.cc/CHE4-XZZ2)
[^42]: Severin Borenstein. [Can Data Centers Flex Their Power Demand?](https://energyathaas.wordpress.com/2025/04/14/can-data-centers-flex-their-power-demand/) *energyathaas.wordpress.com*, April 2025. Archived at [perma.cc/MUD3-A6FF](https://perma.cc/MUD3-A6FF)
[^43]: Bilge Acun, Benjamin Lee, Fiodar Kazhamiaka, Aditya Sundarrajan, Kiwan Maeng, Manoj Chakkaravarthy, David Brooks, and Carole-Jean Wu. [Carbon Dependencies in Datacenter Design and Management](https://hotcarbon.org/assets/2022/pdf/hotcarbon22-acun.pdf). *ACM SIGENERGY Energy Informatics Review*, volume 3, issue 3, pages 21–26. [doi:10.1145/3630614.3630619](https://doi.org/10.1145/3630614.3630619)
[^44]: Kousik Nath. [These are the numbers every computer engineer should know](https://www.freecodecamp.org/news/must-know-numbers-for-every-computer-engineer/). *freecodecamp.org*, September 2019. Archived at [perma.cc/RW73-36RL](https://perma.cc/RW73-36RL)
[^45]: Joseph M. Hellerstein, Jose Faleiro, Joseph E. Gonzalez, Johann Schleier-Smith, Vikram Sreekanti, Alexey Tumanov, and Chenggang Wu. [Serverless Computing: One Step Forward, Two Steps Back](https://arxiv.org/abs/1812.03651). At *Conference on Innovative Data Systems Research* (CIDR), January 2019.
[^46]: Frank McSherry, Michael Isard, and Derek G. Murray. [Scalability! But at What COST?](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-mcsherry.pdf) At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^47]: Cindy Sridharan. *[Distributed Systems Observability: A Guide to Building Robust Systems](https://unlimited.humio.com/rs/756-LMY-106/images/Distributed-Systems-Observability-eBook.pdf)*. Report, O’Reilly Media, May 2018. Archived at [perma.cc/M6JL-XKCM](https://perma.cc/M6JL-XKCM)
[^48]: Charity Majors. [Observability — A 3-Year Retrospective](https://thenewstack.io/observability-a-3-year-retrospective/). *thenewstack.io*, August 2019. Archived at [perma.cc/CG62-TJWL](https://perma.cc/CG62-TJWL)
[^49]: Benjamin H. Sigelman, Luiz André Barroso, Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag. [Dapper, a Large-Scale Distributed Systems Tracing Infrastructure](https://research.google/pubs/pub36356/). Google Technical Report dapper-2010-1, April 2010. Archived at [perma.cc/K7KU-2TMH](https://perma.cc/K7KU-2TMH)
[^50]: Rodrigo Laigner, Yongluan Zhou, Marcos Antonio Vaz Salles, Yijian Liu, and Marcos Kalinowski. [Data management in microservices: State of the practice, challenges, and research directions](https://www.vldb.org/pvldb/vol14/p3348-laigner.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 13, pages 3348–3361, September 2021. [doi:10.14778/3484224.3484232](https://doi.org/10.14778/3484224.3484232)
[^51]: Jordan Tigani. [Big Data is Dead](https://motherduck.com/blog/big-data-is-dead/). *motherduck.com*, February 2023. Archived at [perma.cc/HT4Q-K77U](https://perma.cc/HT4Q-K77U)
[^52]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O’Reilly Media, 2021. ISBN: 9781492034025
[^53]: Chris Richardson. [Microservices: Decomposing Applications for Deployability and Scalability](https://www.infoq.com/articles/microservices-intro/). *infoq.com*, May 2014. Archived at [perma.cc/CKN4-YEQ2](https://perma.cc/CKN4-YEQ2)
[^54]: Mohammad Shahrad, Rodrigo Fonseca, Íñigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. [Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider](https://www.usenix.org/system/files/atc20-shahrad.pdf). At *USENIX Annual Technical Conference* (ATC), July 2020.
[^55]: Luiz André Barroso, Urs Hölzle, and Parthasarathy Ranganathan. [The Datacenter as a Computer: Designing Warehouse-Scale Machines](https://www.morganclaypool.com/doi/10.2200/S00874ED3V01Y201809CAC046), third edition. Morgan & Claypool Synthesis Lectures on Computer Architecture, October 2018. [doi:10.2200/S00874ED3V01Y201809CAC046](https://doi.org/10.2200/S00874ED3V01Y201809CAC046)
[^56]: David Fiala, Frank Mueller, Christian Engelmann, Rolf Riesen, Kurt Ferreira, and Ron Brightwell. [Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing](https://arcb.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/sc12.pdf). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2012. [doi:10.1109/SC.2012.49](https://doi.org/10.1109/SC.2012.49)
[^57]: Anna Kornfeld Simpson, Adriana Szekeres, Jacob Nelson, and Irene Zhang. [Securing RDMA for High-Performance Datacenter Storage Systems](https://www.usenix.org/conference/hotcloud20/presentation/kornfeld-simpson). At *12th USENIX Workshop on Hot Topics in Cloud Computing* (HotCloud), July 2020.
[^58]: Arjun Singh, Joon Ong, Amit Agarwal, Glen Anderson, Ashby Armistead, Roy Bannon, Seb Boving, Gaurav Desai, Bob Felderman, Paulie Germano, Anand Kanagala, Jeff Provost, Jason Simmons, Eiichi Tanda, Jim Wanderer, Urs Hölzle, Stephen Stuart, and Amin Vahdat. [Jupiter Rising: A Decade of Clos Topologies and Centralized Control in Google’s Datacenter Network](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p183.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2785956.2787508](https://doi.org/10.1145/2785956.2787508)
[^59]: Glenn K. Lockwood. [Hadoop’s Uncomfortable Fit in HPC](https://blog.glennklockwood.com/2014/05/hadoops-uncomfortable-fit-in-hpc.html). *glennklockwood.blogspot.co.uk*, May 2014. Archived at [perma.cc/S8XX-Y67B](https://perma.cc/S8XX-Y67B)
[^60]: Cathy O’Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 9780553418811
[^61]: Supreeth Shastri, Vinay Banakar, Melissa Wasserman, Arun Kumar, and Vijay Chidambaram. [Understanding and Benchmarking the Impact of GDPR on Database Systems](https://www.vldb.org/pvldb/vol13/p1064-shastri.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 7, pages 1064–1077, March 2020. [doi:10.14778/3384345.3384354](https://doi.org/10.14778/3384345.3384354)
[^62]: Martin Fowler. [Datensparsamkeit](https://www.martinfowler.com/bliki/Datensparsamkeit.html). *martinfowler.com*, December 2013. Archived at [perma.cc/R9QX-CME6](https://perma.cc/R9QX-CME6)
[^63]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 (General Data Protection Regulation)](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&from=EN). *Official Journal of the European Union* L 119/1, May 2016.


================================================
FILE: content/en/ch10.md
================================================
---
title: "10. Consistency and Consensus"
weight: 210
breadcrumbs: false
---

<a id="ch_consistency"></a>

![](/map/ch09.png)

> *An ancient adage warns, “Never go to sea with two chronometers; take one or three.”*
>
> Frederick P. Brooks Jr., *The Mythical Man-Month: Essays on Software Engineering* (1995)

Lots of things can go wrong in distributed systems, as discussed in [Chapter 9](/en/ch9#ch_distributed). If we want a
service to continue working correctly despite those things going wrong, we need to find ways of
tolerating faults.

One of the best tools we have for fault tolerance is *replication*. However, as we saw in
[Chapter 6](/en/ch6#ch_replication), having multiple copies of the data on multiple replicas opens up the risk of
inconsistencies. Reads might be handled by a replica that is not up-to-date, yielding stale results.
If multiple replicas can accept writes, we have to deal with conflicts between values that were
concurrently written on different replicas. At a high level, there are two competing philosophies
for dealing with such issues:

Eventual consistency
: In this philosophy, the fact that a system is replicated is made visible to the application, and
 you as application developer are expected to deal with the inconsistencies and conflicts that may
 arise. This approach is often used in systems with multi-leader (see
 [“Multi-Leader Replication”](/en/ch6#sec_replication_multi_leader)) and leaderless replication (see [“Leaderless Replication”](/en/ch6#sec_replication_leaderless)).

Strong consistency
: This philosophy says that applications should not have to worry about internal details of
 replication, and that the system should behave as if it was single-node. The advantage of this
 approach is that it’s simpler for you, the application developer. The disadvantage is that
 stronger consistency has a performance cost, and some kinds of fault that an eventually consistent
 system can tolerate cause outages in strongly consistent systems.

As always, which approach is better depends on your application. If you have an app where users can
make changes to data while offline, then eventual consistency is inevitable, as discussed in
[“Sync Engines and Local-First Software”](/en/ch6#sec_replication_offline_clients). However, eventual consistency can also be difficult for
applications to deal with. If your replicas are located in datacenters with fast, reliable
communication, then strong consistency is often appropriate because its cost is acceptable.

In this chapter we will dive deeper into the strongly consistent approach, looking at three areas:

1. One challenge is that “strong consistency” is quite vague, so we will develop a more precise
 definition of what we want to achieve: *linearizability*.
2. We will look at the problem of generating IDs and timestamps. This may sound unrelated to
 consistency but is actually closely connected.
3. We will explore how distributed systems can achieve linearizability while still remaining
 fault-tolerant; the answer is *consensus* algorithms.

Along the way, we will see that there are some fundamental limits on what is possible and what is
not in a distributed system.

The topics of this chapter are notorious for being hard to implement correctly; it’s very easy to
build systems that behave fine when there are no faults, but which completely fall apart when faced
with an unlucky combination of faults that the designer of the system hadn’t considered. A lot of
theory has been developed to help us think through those edge cases, which enables us to build
systems that can robustly tolerate faults.

This chapter will only scratch the surface: we will stick with informal intuitions, and avoid the
algorithmic nitty-gritty, formal models, and proofs. If you want to do serious work on consensus
systems and similar infrastructure, you will need to go much deeper into the theory if you want any
chance of your systems being robust. As usual, the literature references in this chapter provide
some initial pointers.


## Linearizability {#sec_consistency_linearizability}

If you want a replicated database to be as simple as possible to use, you should make it behave as
if it wasn’t replicated at all. Then users don’t have to worry about replication lag, conflicts, and
other inconsistencies. That would give us the advantage of fault tolerance, but without the
complexity arising from having to think about multiple replicas.

This is the idea behind *linearizability* [^1] (also known as *atomic consistency* [^2], *strong consistency*, *immediate consistency*, or *external consistency* [^3]).
The exact definition of linearizability is quite subtle, and we will explore it in the rest of this
section. But the basic idea is to make a system appear as if there were only one copy of the data,
and all operations on it are atomic. With this guarantee, even though there may be multiple replicas
in reality, the application does not need to worry about them.

In a linearizable system, as soon as one client successfully completes a write, all clients reading
from the database must be able to see the value just written. Maintaining the illusion of a single
copy of the data means guaranteeing that the value read is the most recent, up-to-date value, and
doesn’t come from a stale cache or replica. In other words, linearizability is a *recency
guarantee*. To clarify this idea, let’s look at an example of a system that is not linearizable.

{{< figure src="/fig/ddia_1001.png" id="fig_consistency_linearizability_0" caption="Figure 10-1. This system is not linearizable: Bryce's request is served by a lagging replica, so he sees a stale result after Aaliyah has already seen the final score." class="w-full my-4" >}}

[Figure 10-1](/en/ch10#fig_consistency_linearizability_0) shows an example of a nonlinearizable sports website [^4].
Aaliyah and Bryce are sitting in the same room, both checking their phones to see the outcome of a
game their favorite team is playing. Just after the final score is announced, Aaliyah refreshes the
page, sees the winner announced, and excitedly tells Bryce about it. Bryce incredulously hits
*reload* on his own phone, but his request goes to a database replica that is lagging, and so his
phone shows that the game is still ongoing.

If Aaliyah and Bryce had hit reload at the same time, it would have been less surprising if they had
gotten two different query results, because they wouldn’t know at exactly what time their respective
requests were processed by the server. However, Bryce knows that he hit the reload button (initiated
his query) *after* he heard Aaliyah exclaim the final score, and therefore he expects his query
result to be at least as recent as Aaliyah’s. The fact that his query returned a stale result is a
violation of linearizability.

### What Makes a System Linearizable? {#sec_consistency_lin_definition}

In order to understand linearizability better, let’s look at some more examples.
[Figure 10-2](/en/ch10#fig_consistency_linearizability_1) shows three clients concurrently reading and writing the same
object *x* in a linearizable database. In distributed systems theory, *x* is called a *register*—in
practice, it could be one key in a key-value store, one row in a relational database, or one
document in a document database, for example.

{{< figure src="/fig/ddia_1002.png" id="fig_consistency_linearizability_1" caption="Figure 10-2. If a read request is concurrent with a write request, it may return either the old or the new value." class="w-full my-4" >}}


For simplicity, [Figure 10-2](/en/ch10#fig_consistency_linearizability_1) shows only the requests from the clients’
point of view, not the internals of the database. Each bar is a request made by a client, where the
start of a bar is the time when the request was sent, and the end of a bar is when the response was
received by the client. Due to variable network delays, a client doesn’t know exactly when the
database processed its request—it only knows that it must have happened sometime between the
client sending the request and receiving the response.

In this example, the register has two types of operations:

* *read*(*x*) ⇒ *v* means the client requested to read the value of register
 *x*, and the database returned the value *v*.
* *write*(*x*, *v*) ⇒ *r* means the client requested to set the
 register *x* to value *v*, and the database returned response *r* (which could be *ok* or *error*).

In [Figure 10-2](/en/ch10#fig_consistency_linearizability_1), the value of *x* is initially 0, and client C performs a
write request to set it to 1. While this is happening, clients A and B are repeatedly polling the
database to read the latest value. What are the possible responses that A and B might get for their
read requests?

* The first read operation by client A completes before the write begins, so it must definitely
 return the old value 0.
* The last read by client A begins after the write has completed, so it must definitely return the
 new value 1 if the database is linearizable, because the read must have been processed after the
 write.
* Any read operations that overlap in time with the write operation might return either 0 or 1,
 because we don’t know whether or not the write has taken effect at the time when the read
 operation is processed. These operations are *concurrent* with the write.

However, that is not yet sufficient to fully describe linearizability: if reads that are concurrent
with a write can return either the old or the new value, then readers could see a value flip back
and forth between the old and the new value several times while a write is going on. That is not
what we expect of a system that emulates a “single copy of the data.”

To make the system linearizable, we need to add another constraint, illustrated in
[Figure 10-3](/en/ch10#fig_consistency_linearizability_2).

{{< figure src="/fig/ddia_1003.png" id="fig_consistency_linearizability_2" caption="Figure 10-3. After any one read has returned the new value, all following reads (on the same or other clients) must also return the new value." class="w-full my-4" >}}


In a linearizable system we imagine that there must be some point in time (between the start and end
of the write operation) at which the value of *x* atomically flips from 0 to 1. Thus, if one
client’s read returns the new value 1, all subsequent reads must also return the new value, even if
the write operation has not yet completed.

This timing dependency is illustrated with an arrow in [Figure 10-3](/en/ch10#fig_consistency_linearizability_2).
Client A is the first to read the new value, 1. Just after A’s read returns, B begins a new read.
Since B’s read occurs strictly after A’s read, it must also return 1, even though the write by C is
still ongoing. (It’s the same situation as with Aaliyah and Bryce in
[Figure 10-1](/en/ch10#fig_consistency_linearizability_0): after Aaliyah has read the new value, Bryce also expects to
read the new value.)

We can further refine this timing diagram to visualize each operation taking effect atomically at
some point in time [^5],
like in the more complex example shown in [Figure 10-4](/en/ch10#fig_consistency_linearizability_3). In this example we
add a third type of operation besides *read* and *write*:

* *cas*(*x*, *v*<sub>old</sub>, *v*<sub>new</sub>) ⇒ *r* means the client
 requested an atomic *compare-and-set* operation (see [“Conditional writes (compare-and-set)”](/en/ch8#sec_transactions_compare_and_set)). If the
 current value of the register *x* equals *v*<sub>old</sub>, it should be atomically set to *v*<sub>new</sub>. If
 the value of *x* is different from *v*<sub>old</sub>, then the operation should leave the register
 unchanged and return an error. *r* is the database’s response (*ok* or *error*).

Each operation in [Figure 10-4](/en/ch10#fig_consistency_linearizability_3) is marked with a vertical line (inside the
bar for each operation) at the time when we think the operation was executed. Those markers are
joined up in a sequential order, and the result must be a valid sequence of reads and writes for a
register (every read must return the value set by the most recent write).

The requirement of linearizability is that the lines joining up the operation markers always move
forward in time (from left to right), never backward. This requirement ensures the recency guarantee we
discussed earlier: once a new value has been written or read, all subsequent reads see the value
that was written, until it is overwritten again.

{{< figure src="/fig/ddia_1004.png" id="fig_consistency_linearizability_3" caption="Figure 10-4. Visualizing the points in time at which the reads and writes appear to have taken effect. The final read by B is not linearizable." class="w-full my-4" >}}


There are a few interesting details to point out in [Figure 10-4](/en/ch10#fig_consistency_linearizability_3):

* First client B sent a request to read *x*, then client D sent a request to set *x* to 0, and then
 client A sent a request to set *x* to 1. Nevertheless, the value returned to B’s read is 1 (the
 value written by A). This is okay: it means that the database first processed D’s write, then A’s
 write, and finally B’s read. Although this is not the order in which the requests were sent, it’s
 an acceptable order, because the three requests are concurrent. Perhaps B’s read request was
 slightly delayed in the network, so it only reached the database after the two writes.
* Client B’s read returned 1 before client A received its response from the database, saying that
 the write of the value 1 was successful. This is also okay: it just means the *ok* response from
 the database to client A was slightly delayed in the network.
* This model doesn’t assume any transaction isolation: another client may change a value at any
 time. For example, C first reads 1 and then reads 2, because the value was changed by B between
 the two reads. An atomic compare-and-set (*cas*) operation can be used to check the value hasn’t
 been concurrently changed by another client: B and C’s *cas* requests succeed, but D’s *cas*
 request fails (by the time the database processes it, the value of *x* is no longer 0).
* The final read by client B (in a shaded bar) is not linearizable. The operation is concurrent with
 C’s *cas* write, which updates *x* from 2 to 4. In the absence of other requests, it would be okay for
 B’s read to return 2. However, client A has already read the new value 4 before B’s read started,
 so B is not allowed to read an older value than A. Again, it’s the same situation as with Aaliyah
 and Bryce in [Figure 10-1](/en/ch10#fig_consistency_linearizability_0).

That is the intuition behind linearizability; the formal definition [^1] describes it more precisely. It is
possible (though computationally expensive) to test whether a system’s behavior is linearizable by
recording the timings of all requests and responses, and checking whether they can be arranged into
a valid sequential order [^6] [^7].

Just as there are various weak isolation levels for transactions besides serializability (see
[“Weak Isolation Levels”](/en/ch8#sec_transactions_isolation_levels)), there are also various weaker consistency models for
replicated systems besides linearizability [^8].
In fact, the *read-after-write*, *monotonic reads*, and *consistent prefix reads* properties we saw
in [“Problems with Replication Lag”](/en/ch6#sec_replication_lag) are examples of such weaker consistency models. Linearizability
guarantees all these weaker properties, and more. In this chapter we will focus on linearizability,
which is the strongest consistency model in common use.


--------

<a id="sidebar_consistency_serializability"></a>

> [!TIP] LINEARIZABILITY VERSUS SERIALIZABILITY

Linearizability is easily confused with serializability (see [“Serializability”](/en/ch8#sec_transactions_serializability)),
as both words seem to mean something like “can be arranged in a sequential order.” However, they are
quite different guarantees, and it is important to distinguish between them:

Serializability
: Serializability is an isolation property of transactions, where every transaction may read and
 write *multiple objects* (rows, documents, records). It guarantees that transactions behave the
 same as if they had executed in *some* serial order: that is, as if you first performed all of one
 transaction’s operations, then all of another transaction’s operations, and so on, without
 interleaving them. It is okay for that serial order to be different from the order in which the
 transactions were actually run [^9].

Linearizability
: Linearizability is a guarantee on reads and writes of a register (an *individual object*). It
 doesn’t group operations together into transactions, so it does not prevent problems such as write
 skew that involve multiple objects (see [“Write Skew and Phantoms”](/en/ch8#sec_transactions_write_skew)). However, linearizability
 is a *recency* guarantee: it requires that if one operation finishes before another one starts,
 then the later operation must observe a state that is at least as new as the earlier operation.
 Serializability does not have that requirement: for example, stale reads are allowed by
 serializability [^10].

(*Sequential consistency* is something else again [^8], but we won’t discuss it here.)

A database may provide both serializability and linearizability, and this combination is known as
*strict serializability* or *strong one-copy serializability* (*strong-1SR*) [^11] [^12].
Single-node databases are typically linearizable. With distributed databases using optimistic
methods like serializable snapshot isolation (see [“Serializable Snapshot Isolation (SSI)”](/en/ch8#sec_transactions_ssi)) the situation is more
complicated: for example, CockroachDB provides serializability, and some recency guarantees on
reads, but not strict serializability [^13]
because this would require expensive coordination between transactions [^14].

It is also possible to combine a weaker isolation level with linearizability, or a weaker
consistency model with serializability; in fact, consistency model and isolation level can be chosen
largely independently from each other [^15] [^16].

--------

### Relying on Linearizability {#sec_consistency_linearizability_usage}

In what circumstances is linearizability useful? Viewing the final score of a sporting match is
perhaps a frivolous example: a result that is outdated by a few seconds is unlikely to cause any
real harm in this situation. However, there are a few areas in which linearizability is an important
requirement for making a system work correctly.

#### Locking and leader election {#locking-and-leader-election}

A system that uses single-leader replication needs to ensure that there is indeed only one leader,
not several (split brain). One way of electing a leader is to use a lease: every node that starts up
tries to acquire the lease, and the one that succeeds becomes the leader [^17].
No matter how this mechanism is implemented, it must be linearizable: it should not be possible for
two different nodes to acquire the lease at the same time.

Coordination services like Apache ZooKeeper [^18]
and etcd are often used to implement distributed leases and leader election. They use consensus
algorithms to implement linearizable operations in a fault-tolerant way (we discuss such algorithms
later in this chapter). There are still many subtle details to implementing leases and leader
election correctly (see for example the fencing issue in [“Distributed Locks and Leases”](/en/ch9#sec_distributed_lock_fencing)), and
libraries like Apache Curator help by providing higher-level recipes on top of ZooKeeper. However, a
linearizable storage service is the basic foundation for these coordination tasks.

--------

> [!NOTE]
> Strictly speaking, ZooKeeper provides linearizable writes, but reads may be stale, since there is no
> guarantee that they are served from the current leader [^18]. etcd since version 3 provides linearizable reads by default.

--------


Distributed locking is also used at a much more granular level in some distributed databases, such as
Oracle Real Application Clusters (RAC) [^19].
RAC uses a lock per disk page, with multiple nodes sharing access
to the same disk storage system. Since these linearizable locks are on the critical path of
transaction execution, RAC deployments usually have a dedicated cluster interconnect network for
communication between database nodes.

#### Constraints and uniqueness guarantees {#sec_consistency_uniqueness}

Uniqueness constraints are common in databases: for example, a username or email address must
uniquely identify one user, and in a file storage service there cannot be two files with the same
path and filename. If you want to enforce this constraint as the data is written (such that if two people
try to concurrently create a user or a file with the same name, one of them will be returned an
error), you need linearizability.

This situation is actually similar to a lock: when a user registers for your service, you can think
of them acquiring a “lock” on their chosen username. The operation is also very similar to an atomic
compare-and-set, setting the username to the ID of the user who claimed it, provided that the
username is not already taken.

Similar issues arise if you want to ensure that a bank account balance never goes negative, or that
you don’t sell more items than you have in stock in the warehouse, or that two people don’t
concurrently book the same seat on a flight or in a theater. These constraints all require there to
be a single up-to-date value (the account balance, the stock level, the seat occupancy) that all
nodes agree on.

In real applications, it is sometimes acceptable to treat such constraints loosely (for example, if
a flight is overbooked, you can move customers to a different flight and offer them compensation for
the inconvenience). In such cases, linearizability may not be needed, and we will discuss such
loosely interpreted constraints in [“Timeliness and Integrity”](/en/ch13#sec_future_integrity).

However, a hard uniqueness constraint, such as the one you typically find in relational databases,
requires linearizability. Other kinds of constraints, such as foreign key or attribute constraints,
can be implemented without linearizability [^20].

#### Cross-channel timing dependencies {#cross-channel-timing-dependencies}

Notice a detail in [Figure 10-1](/en/ch10#fig_consistency_linearizability_0): if Aaliyah hadn’t exclaimed the score,
Bryce wouldn’t have known that the result of his query was stale. He would have just refreshed the
page again a few seconds later, and eventually seen the final score. The linearizability violation
was only noticed because there was an additional communication channel in the system (Aaliyah’s
voice to Bryce’s ears).

Similar situations can arise in computer systems. For example, say you have a website where users
can upload a video, and a background process transcodes the video to a lower quality that can be
streamed on slow internet connections. The architecture and dataflow of this system are illustrated
in [Figure 10-5](/en/ch10#fig_consistency_transcoder).

The video transcoder needs to be explicitly instructed to perform a transcoding job, and this
instruction is sent from the web server to the transcoder via a message queue (see [“Messaging Systems”](/en/ch12#sec_stream_messaging)).
The web server doesn’t place the entire video on the queue, since most message brokers are designed
for small messages, and a video may be many megabytes in size. Instead, the video is first written
to a file storage service, and once the write is complete, the instruction to the transcoder is
placed on the queue.

{{< figure src="/fig/ddia_1005.png" id="fig_consistency_transcoder" caption="Figure 10-5. The web server and video transcoder communicate both through file storage and a message queue, opening the potential for race conditions." class="w-full my-4" >}}


If the file storage service is linearizable, then this system should work fine. If it is not
linearizable, there is the risk of a race condition: the message queue (steps 3 and 4 in
[Figure 10-5](/en/ch10#fig_consistency_transcoder)) might be faster than the internal replication inside the storage
service. In this case, when the transcoder fetches the original video (step 5), it might see an old
version of the file, or nothing at all. If it processes an old version of the video, the original
and transcoded videos in the file storage become permanently inconsistent with each other.

This problem arises because there are two different communication channels between the web server
and the transcoder: the file storage and the message queue. Without the recency guarantee of
linearizability, race conditions between these two channels are possible. This situation is
analogous to [Figure 10-1](/en/ch10#fig_consistency_linearizability_0), where there was also a race condition between
two communication channels: the database replication and the real-life audio channel between
Aaliyah’s mouth and Bryce’s ears.

A similar race condition occurs if you have a mobile app that can receive push notifications, and
the app fetches some data from a server when it receives a push notification. If the data fetch
might go to a lagging replica, it could happen that the push notification goes through quickly, but
the subsequent fetch doesn’t see the data that the push notification was about.

Linearizability is not the only way of avoiding this race condition, but it’s the simplest to
understand. If you control the additional communication channel (like in the case of the message
queue, but not in the case of Aaliyah and Bryce), you can use alternative approaches similar to what
we discussed in [“Reading Your Own Writes”](/en/ch6#sec_replication_ryw), at the cost of additional complexity.


### Implementing Linearizable Systems {#sec_consistency_implementing_linearizable}

Now that we’ve looked at a few examples in which linearizability is useful, let’s think about how we
might implement a system that offers linearizable semantics.

Since linearizability essentially means “behave as though there is only a single copy of the data,
and all operations on it are atomic,” the simplest answer would be to really only use a single copy
of the data. However, that approach would not be able to tolerate faults: if the node holding that
one copy failed, the data would be lost, or at least inaccessible until the node was brought up again.

Let’s revisit the replication methods from [Chapter 6](/en/ch6#ch_replication), and compare whether they can be made linearizable:

Single-leader replication (potentially linearizable)
: In a system with single-leader replication, the leader has the primary copy of the data that is
 used for writes, and the followers maintain backup copies of the data on other nodes. As long as
 you perform all reads and writes on the leader, they are likely to be linearizable. However, this
 assumes that you know for sure who the leader is. As discussed in
 [“Distributed Locks and Leases”](/en/ch9#sec_distributed_lock_fencing), it is quite possible for a node to think that it is the leader,
 when in fact it is not—and if the delusional leader continues to serve requests, it is likely to
 violate linearizability [^21].
 With asynchronous replication, failover may even lose committed writes, which violates both
 durability and linearizability.

 Sharding a single-leader database, with a separate leader per shard, does not affect
 linearizability, since it is only a single-object guarantee. Cross-shard transactions are a
 different matter (see [“Distributed Transactions”](/en/ch8#sec_transactions_distributed)).

Consensus algorithms (likely linearizable)
: Some consensus algorithms are essentially single-leader replication with automatic leader election
 and failover. They are carefully designed to prevent split brain, allowing them to implement
 linearizable storage safely. ZooKeeper uses the Zab consensus algorithm [^22] and etcd uses Raft [^23], for example. 
 However, just because a system uses consensus does not guarantee that all operations on it are
 linearizable: if it allows reads on a node without checking that it is still the leader, the
 results of the read may be stale if a new leader has just been elected.

Multi-leader replication (not linearizable)
: Systems with multi-leader replication are generally not linearizable, because they concurrently
 process writes on multiple nodes and asynchronously replicate them to other nodes. For this
 reason, they can produce conflicting writes that require resolution (see
 [“Dealing with Conflicting Writes”](/en/ch6#sec_replication_write_conflicts)).

Leaderless replication (probably not linearizable)
: For systems with leaderless replication (Dynamo-style; see [“Leaderless Replication”](/en/ch6#sec_replication_leaderless)), people
 sometimes claim that you can obtain “strong consistency” by requiring quorum reads and writes
 (*w* + *r* > *n*). Depending on the exact algorithm, and depending on how you define
 strong consistency, this is not quite true.

 “Last write wins” conflict resolution methods based on time-of-day clocks (e.g., in Cassandra and
 ScyllaDB) are almost certainly nonlinearizable, because clock timestamps cannot be guaranteed to be
 consistent with actual event ordering due to clock skew (see [“Relying on Synchronized Clocks”](/en/ch9#sec_distributed_clocks_relying)).
 Even with quorums, nonlinearizable behavior is possible, as demonstrated in the next section.

#### Linearizability and quorums {#sec_consistency_quorum_linearizable}

Intuitively, it seems as though quorum reads and writes should be linearizable in a
Dynamo-style model. However, when we have variable network delays, it is possible to have race
conditions, as demonstrated in [Figure 10-6](/en/ch10#fig_consistency_leaderless).

{{< figure src="/fig/ddia_1006.png" id="fig_consistency_leaderless" caption="Figure 10-6. Quorums are not sufficient to ensure linearizability if network delays are variable." class="w-full my-4" >}}


In [Figure 10-6](/en/ch10#fig_consistency_leaderless), the initial value of *x* is 0, and a writer client is updating
*x* to 1 by sending the write to all three replicas (*n* = 3, *w* = 3).
Concurrently, client A reads from a quorum of two nodes (*r* = 2) and sees the new value 1
on one of the nodes. Also concurrently with the write, client B reads from a different quorum of two
nodes, and gets back the old value 0 from both.

The quorum condition is met (*w* + *r* > *n*), but this execution is nevertheless not
linearizable: B’s request begins after A’s request completes, but B returns the old value while A
returns the new value. (It’s once again the Aaliyah and Bryce situation from
[Figure 10-1](/en/ch10#fig_consistency_linearizability_0).)

It is possible to make Dynamo-style quorums linearizable at the cost of reduced
performance: a reader must perform read repair (see [“Catching up on missed writes”](/en/ch6#sec_replication_read_repair)) synchronously,
before returning results to the application [^24].
Moreover, before writing, a writer must read the latest state of a quorum of nodes to fetch the
latest timestamp of any prior write, and ensure that the new write has a greater timestamp [^25] [^26].
However, Riak does not perform synchronous read repair due to the performance penalty.
Cassandra does wait for read repair to complete on quorum reads [^27],
but it loses linearizability due to its use of time-of-day clocks for timestamps.

Moreover, only linearizable read and write operations can be implemented in this way; a
linearizable compare-and-set operation cannot, because it requires a consensus algorithm [^28].

In summary, it is safest to assume that a leaderless system with Dynamo-style replication does not
provide linearizability, even with quorum reads and writes.

### The Cost of Linearizability {#sec_linearizability_cost}

As some replication methods can provide linearizability and others cannot, it is interesting to
explore the pros and cons of linearizability in more depth.

We already discussed some use cases for different replication methods in [Chapter 6](/en/ch6#ch_replication); for
example, we saw that multi-leader replication is often a good choice for multi-region
replication (see [“Geographically Distributed Operation”](/en/ch6#sec_replication_multi_dc)). An example of such a deployment is illustrated in
[Figure 10-7](/en/ch10#fig_consistency_cap_availability).

{{< figure src="/fig/ddia_1007.png" id="fig_consistency_cap_availability" caption="Figure 10-7. If clients cannot contact enough replicas due to a network partition, they cannot process writes." class="w-full my-4" >}}


Consider what happens if there is a network interruption between the two regions. Let’s assume
that the network within each region is working, and clients can reach their local region, but the
regions cannot connect to each other. This is known as a *network partition*.

With a multi-leader database, each region can continue operating normally: since writes from one
region are asynchronously replicated to the other, the writes are simply queued up and exchanged
when network connectivity is restored.

On the other hand, if single-leader replication is used, then the leader must be in one of the
regions. Any writes and any linearizable reads must be sent to the leader—thus, for any
clients connected to a follower region, those read and write requests must be sent synchronously
over the network to the leader region.

If the network between regions is interrupted in a single-leader setup, clients connected to
follower regions cannot contact the leader, so they cannot make any writes to the database, nor
any linearizable reads. They can still make reads from the follower, but they might be stale
(nonlinearizable). If the application requires linearizable reads and writes, the network
interruption causes the application to become unavailable in the regions that cannot contact the leader.

If clients can connect directly to the leader region, this is not a problem, since the
application continues to work normally there. But clients that can only reach a follower region
will experience an outage until the network link is repaired.

#### The CAP theorem {#the-cap-theorem}

This issue is not just a consequence of single-leader and multi-leader replication: any linearizable
database has this problem, no matter how it is implemented. The issue also isn’t specific to
multi-region deployments, but can occur on any unreliable network, even within one region.
The trade-off is as follows:

* If your application *requires* linearizability, and some replicas are disconnected from the other
 replicas due to a network problem, then some replicas cannot process requests while they are
 disconnected: they must either wait until the network problem is fixed, or return an error (either
 way, they become *unavailable*). This choice is sometimes known as *CP* (consistent under network partitions).
* If your application *does not require* linearizability, then it can be written in a way that each
 replica can process requests independently, even if it is disconnected from other replicas (e.g.,
 multi-leader). In this case, the application can remain *available* in the face of a network
 problem, but its behavior is not linearizable. This choice is known as *AP* (available under network partitions).

Thus, applications that don’t require linearizability can be more tolerant of network problems. This
insight is popularly known as the *CAP theorem* [^29] [^30] [^31] [^32],
named by Eric Brewer in 2000, although the trade-off had been known to designers of
distributed databases since the 1970s [^33] [^34] [^35].

CAP was originally proposed as a rule of thumb, without precise definitions, with the goal of
starting a discussion about trade-offs in databases. At the time, many distributed databases
focused on providing linearizable semantics on a cluster of machines with shared storage [^19], and CAP encouraged database engineers
to explore a wider design space of distributed shared-nothing systems, which were more suitable for
implementing large-scale web services [^36].
CAP deserves credit for this culture shift—it helped trigger the NoSQL movement, a burst of new
database technologies around the mid-2000s.

> [!TIP] THE UNHELPFUL CAP THEOREM

CAP is sometimes presented as *Consistency, Availability, Partition tolerance: pick 2 out of 3*.
Unfortunately, putting it this way is misleading [^32] because network partitions are a kind of
fault, so they aren’t something about which you have a choice: they will happen whether you like it or not.

At times when the network is working correctly, a system can provide both consistency
(linearizability) and total availability. When a network fault occurs, you have to choose between
either linearizability or total availability. Thus, a better way of phrasing CAP would be
*either Consistent or Available when Partitioned* [^37].
With a more reliable network you need to make this choice less often, but at some point the choice is inevitable.

The CP/AP classification scheme has several further flaws [^4]. *Consistency* is formalized as
linearizability (the theorem doesn’t say anything about weaker consistency models), and the
formalization of *availability* [^30] does not
match the usual meaning of the term [^38]. Many highly available (fault-tolerant) systems actually do not meet CAP’s
idiosyncratic definition of availability. Moreover, some system designers choose (with good reason)
to provide neither linearizability nor the form of availability that the CAP theorem assumes, so
those systems are neither CP nor AP [^39] [^40].

All in all, there is a lot of misunderstanding and confusion around CAP, and it does not help us
understand systems better, so CAP is best avoided.

The CAP theorem as formally defined [^30] is of
very narrow scope: it only considers one consistency model (namely linearizability) and one kind of
fault (network partitions, which according to data from Google are the cause of less than 8% of incidents [^41]).
It doesn’t say anything about network delays, dead nodes, or other trade-offs. Thus, although CAP
has been historically influential, it has little practical value for designing systems [^4] [^38].

There have been efforts to generalize CAP. For example, the *PACELC principle* observes that system
designers might also choose to weaken consistency at times when the network is working fine in order
to reduce latency [^39] [^40] [^42].
Thus, during a network partition (P), we need to choose between availability (A) and consistency (C); 
else (E), when there is no partition, we may choose between low latency (L) and consistency (C).
However, this definition inherits several problems with CAP, such as the counterintuitive definitions of consistency and availability.

There are many more interesting impossibility results in distributed systems [^43], and CAP has now been 
superseded by more precise results [^44] [^45], so it is of mostly historical interest today.

#### Linearizability and network delays {#linearizability-and-network-delays}

Although linearizability is a useful guarantee, surprisingly few systems are actually linearizable
in practice. For example, even RAM on a modern multi-core CPU is not linearizable [^46]:
if a thread running on one CPU core writes to a memory address, and a thread on another CPU core
reads the same address shortly afterward, it is not guaranteed to read the value written by the
first thread (unless a *memory barrier* or *fence* [^47] is used).

The reason for this behavior is that every CPU core has its own memory cache and store buffer.
Memory access first goes to the cache by default, and any changes are asynchronously written out to
main memory. Since accessing data in the cache is much faster than going to main memory [^48], this feature is essential for
good performance on modern CPUs. However, there are now several copies of the data (one in main
memory, and perhaps several more in various caches), and these copies are asynchronously updated, so
linearizability is lost.

Why make this trade-off? It makes no sense to use the CAP theorem to justify the multi-core memory
consistency model: within one computer we usually assume reliable communication, and we don’t expect
one CPU core to be able to continue operating normally if it is disconnected from the rest of the
computer. The reason for dropping linearizability is *performance*, not fault tolerance [^39].

The same is true of many distributed databases that choose not to provide linearizable guarantees:
they do so primarily to increase performance, not so much for fault tolerance [^42].
Linearizability is slow—and this is true all the time, not only during a network fault.

Can’t we maybe find a more efficient implementation of linearizable storage? It seems the answer is
no: Attiya and Welch [^49] prove that if you want linearizability, the response time of read and write requests is at least
proportional to the uncertainty of delays in the network. In a network with highly variable delays,
like most computer networks (see [“Timeouts and Unbounded Delays”](/en/ch9#sec_distributed_queueing)), the response time of linearizable
reads and writes is inevitably going to be high. A faster algorithm for linearizability does not
exist, but weaker consistency models can be much faster, so this trade-off is important for
latency-sensitive systems. In [“Timeliness and Integrity”](/en/ch13#sec_future_integrity) we will discuss some approaches for avoiding
linearizability without sacrificing correctness.


## ID Generators and Logical Clocks {#sec_consistency_logical}

In many applications you need to assign some sort of unique ID to database records when they are
created, which gives you a primary key by which you can refer to those records. In single-node
databases it is common to use an auto-incrementing integer, which has the advantage that it can be
stored in only 64 bits (or even 32 bits if you are sure that you will never have more than 4 billion
records, but that is risky).

Another advantage of such auto-incrementing IDs is that the order of the IDs tells you the order in
which the records were created. For example, [Figure 10-8](/en/ch10#fig_consistency_id_generator) shows a chat
application that assigns auto-incrementing IDs to chat messages as they are posted. You can then
display the messages in order of increasing ID, and the resulting chat threads will make sense:
Aaliyah posts a question that is assigned ID 1, and Bryce’s answer to the question is assigned a
greater ID, namely 3.

{{< figure src="/fig/ddia_1008.png" id="fig_consistency_id_generator" caption="Figure 10-8. Using auto-incrementing IDs from a single-node ID generator to order messages in a chat application." class="w-full my-4" >}}


This single-node ID generator is another example of a linearizable system. Each request to fetch the
ID is an operation that atomically increments a counter and returns the old counter value (a
*fetch-and-add* operation); linearizability ensures that if the posting of Aaliyah’s message
completes before Bryce’s posting begins, then Bryce’s ID must be greater than Aaliyah’s. The
messages by Aaliyah and Caleb in [Figure 10-8](/en/ch10#fig_consistency_id_generator) are concurrent, so linearizability
doesn’t specify how their IDs must be ordered, as long as they are unique.

An in-memory single-node ID generator is easy to implement: you can use the atomic increment
instruction provided by your CPU, which allows multiple threads to safely increment the same
counter. It’s a bit more effort to make the counter persistent, so that the node can crash and
restart without resetting the counter value, which would result in duplicate IDs. But the real
problems are:

* A single-node ID generator is not fault-tolerant because that node is a single point of failure.
* It’s slow if you want to create a record in another region, as you potentially have to make a
 round-trip to the other side of the planet just to get an ID.
* That single node could become a bottleneck if you have high write throughput.

There are various alternative options for ID generators that you can consider:

Sharded ID assignment
: You could have multiple nodes that assign IDs—for example, one that generates only even numbers,
 and one that generates only odd numbers. In general, you can reserve some bits in the ID to
 contain a shard number. Those IDs are still compact, but you lose the ordering property: for
 example, if you have chat messages with IDs 16 and 17, you don’t know whether message 16 was
 actually sent first, because the IDs were assigned by different nodes, and one node might have
 been ahead of the other.

Preallocated blocks of IDs
: Instead of requesting individual IDs from the single-node ID generator, it could hand out blocks
 of IDs. For example, node A might claim the block of IDs from 1 to 1,000, and node B might claim
 the block from 1,001 to 2,000. Then each node can independently hand out IDs from its block, and
 request a new block from the single-node ID generator when its supply of sequence numbers begins
 to run low. However, this scheme doesn’t ensure correct ordering either: it could happen that one
 message is given an ID in the range from 1,001 to 2,000, and a later message is given an ID in the
 range from 1 to 1,000 if the ID was assigned by a different node.

Random UUIDs
: You can use *universally unique identifiers* (UUIDs), also known as *globally unique identifiers*
 (GUIDs). These have the big advantage that they can be generated locally on any node without
 requiring communication, but they require more space (128 bits). There are several different
 versions of UUIDs; the simplest is version 4, which is essentially a random number that is so long
 that it is very unlikely that two nodes would ever pick the same one. Unfortunately, the order of
 such IDs is also random, so comparing two IDs tells you nothing about which one is newer.

Wall-clock timestamp made unique
: If your nodes’ time-of-day clock is kept approximately correct using NTP, you can generate IDs by
 putting a timestamp from that clock in the most significant bits, and filling the remaining bits
 with extra information that ensures the ID is unique even if the timestamp is not—for example, a
 shard number and a per-shard incrementing sequence number, or a long random value. This approach
 is used in Version 7 UUIDs [^50], Twitter’s Snowflake [^51], ULIDs [^52], Hazelcast’s Flake ID generator, 
 MongoDB ObjectIDs, and many similar schemes [^50]. You can implement these ID generators in application code or within a database [^53].

All these schemes generate IDs that are unique (at least with high enough probability that
collisions are vanishingly rare), but they have much weaker ordering guarantees for IDs than the
single-node auto-incrementing scheme.

As discussed in [“Timestamps for ordering events”](/en/ch9#sec_distributed_lww), wall-clock timestamps can provide at best an approximate
ordering: if an earlier write gets a timestamp from a slightly fast clock, and a later write’s
timestamp is from a slightly slow clock, the timestamp order may be inconsistent with the order in
which the events actually happened. With clock jumps due to using a non-monotonic clock, even the
timestamps generated by a single node might be ordered incorrectly. ID generators based on
wall-clock time are therefore unlikely to be linearizable.

You can reduce such ordering inconsistencies by relying on high-precision clock synchronization,
using atomic clocks or GPS receivers. But it would also be nice to be able to generate IDs that are
unique and correctly ordered without relying on special hardware. That’s what *logical clocks* are
about.

### Logical Clocks {#sec_consistency_timestamps}

In [“Unreliable Clocks”](/en/ch9#sec_distributed_clocks) we discussed time-of-day clocks and monotonic clocks. Both of these
are *physical clocks*: they measure the passing of seconds (or milliseconds, microseconds, etc.).

In distributed systems it is common to also use another kind of clock, called a *logical clock*.
While a physical clock is a hardware device that counts the seconds that have elapsed, a logical
clock is an algorithm that counts the events that have occurred. A timestamp from a logical clock
therefore doesn’t tell you what time it is, but you *can* compare two timestamps from a logical
clock to tell which one is earlier and which one is later.

The requirements for a logical clock are typically:

* that its timestamps are compact (a few bytes in size) and unique;
* that you can compare any two timestamps (i.e. they are *totally ordered*); and
* that the order of timestamps is *consistent with causality*: if operation A happened before B,
 then A’s timestamp is less than B’s timestamp. (We discussed causality previously in
 [“The “happens-before” relation and concurrency”](/en/ch6#sec_replication_happens_before).)

A single-node ID generator meets these requirements, but the distributed ID generators we just
discussed do not meet the causal ordering requirement.

#### Lamport timestamps {#lamport-timestamps}

Fortunately, there is a simple method for generating logical timestamps that *is* consistent with
causality, and which you can use as a distributed ID generator. It is called a *Lamport clock*,
proposed in 1978 by Leslie Lamport [^54],
in what is now one of the most-cited papers in the field of distributed systems.

[Figure 10-9](/en/ch10#fig_consistency_lamport_ts) shows how a Lamport clock would work in the chat example of
[Figure 10-8](/en/ch10#fig_consistency_id_generator). Each node has a unique identifier, which in
[Figure 10-9](/en/ch10#fig_consistency_lamport_ts) is the name “Aaliyah”, “Bryce”, or “Caleb”, but which in practice
could be a random UUID or something similar. Moreover, each node keeps a counter of the number of
operations it has processed. A Lamport timestamp is then simply a pair of (*counter*, *node ID*).
Two nodes may sometimes have the same counter value, but by including the node ID in the timestamp,
each timestamp is made unique.

{{< figure src="/fig/ddia_1009.png" id="fig_consistency_lamport_ts" caption="Figure 10-9. Lamport timestamps provide a total ordering consistent with causality." class="w-full my-4" >}}


Every time a node generates a timestamp, it increments its counter value and uses the new value.
Moreover, every time a node sees a timestamp from another node, if the counter value in that
timestamp is greater than its local counter value, it increases its local counter to match the value in the timestamp.

In [Figure 10-9](/en/ch10#fig_consistency_lamport_ts), Aaliyah had not yet seen Caleb’s message when posting her own,
and vice versa. Assuming both users start with an initial counter value of 0, both therefore
increment their local counter and attach the new counter value of 1 to their message. When Bryce
receives those messages, he increases his local counter value to 1. Finally, Bryce sends a reply to
Aaliyah’s message, for which he increments his local counter and attaches the new value of 2 to the
message.

To compare two Lamport timestamps, we first compare their counter value: for example,
(2, “Bryce”) is greater than (1, “Aaliyah”) and also greater than (1, “Caleb”). If
two timestamps have the same counter, we compare their node IDs instead, using the usual
lexicographic string comparison. Thus, the timestamp order in this example is
(1, “Aaliyah”) < (1, “Caleb”) < (2, “Bryce”).

#### Hybrid logical clocks {#hybrid-logical-clocks}

Lamport timestamps are good at capturing the order in which things happened, but they have some
limitations:

* Since they have no direct relation to physical time, you can’t use them to find, say, all the
 messages that were posted on a particular date—you would need to store the physical time
 separately.
* If two nodes never communicate, one node’s counter increments will never be reflected in the other
 one’s counter. As a result, it could happen that events generated around the same time on
 different nodes have wildly different counter values.

A *hybrid logical clock* combines the advantages of physical time-of-day clocks with the ordering
guarantees of Lamport clocks [^55].
Like a physical clock, it counts seconds or microseconds. Like a Lamport clock, when one node sees a
timestamp from another node that is greater than its local clock value, it moves its own local value
forward to match the other node’s timestamp. As a result, if one node’s clock is running fast, the
other nodes will similarly move their clocks forward when they communicate.

Every time a timestamp from a hybrid logical clock is generated, it is also incremented, which
ensures that the clock monotonically moves forward, even if the underlying physical clock jumps
backwards, for example due to NTP adjustments. Thus, the hybrid logical clock might be slightly
ahead of the underlying physical clock. Details of the algorithm ensure that this discrepancy
remains as small as possible.

As a result, you can treat a timestamp from a hybrid logical clock almost like a timestamp from a
conventional time-of-day clock, with the added property that its ordering is consistent with the
happens-before relation. It doesn’t depend on any special hardware, and requires only roughly
synchronized clocks. Hybrid logical clocks are used by CockroachDB, for example.

#### Lamport/hybrid logical clocks vs. vector clocks {#lamporthybrid-logical-clocks-vs-vector-clocks}

In [“Multi-version concurrency control (MVCC)”](/en/ch8#sec_transactions_snapshot_impl) we discussed how snapshot isolation is often implemented:
essentially, by giving each transaction a transaction ID, and allowing each transaction to see
writes made by transactions with a lower ID, but making writes by transactions with higher IDs
invisible. Lamport clocks and hybrid logical clocks are a good way of generating these transaction
IDs, because they ensure that the snapshot is consistent with causality [^56].

When multiple timestamps are generated concurrently, these algorithms order them arbitrarily. This
means that when you look at two timestamps, you generally can’t tell whether they were generated
concurrently or whether one happened before the other. (In the example of
[Figure 10-9](/en/ch10#fig_consistency_lamport_ts) you actually can tell that Aaliyah and Caleb’s messages must have
been concurrent, because they have the same counter value, but when the counter values are different
you can’t tell whether they were concurrent.)

If you want to be able to determine when records were created concurrently, you need a different
algorithm, such as a *vector clock*. The downside is that the timestamps from a vector clock are
much bigger—potentially one integer for every node in the system. See [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent)
for more details on detecting concurrency.

### Linearizable ID Generators {#sec_consistency_linearizable_id}

Although Lamport clocks and hybrid logical clocks provide useful ordering guarantees, that ordering
is still weaker than the linearizable single-node ID generator we talked about previously. Recall
that linearizability requires that if request A completed before request B began, then B must have
the higher ID, even if A and B never communicated with each other. On the other hand, Lamport clocks
can only ensure that a node generates timestamps that are greater than any other timestamp that node
has seen, but they can’t say anything about timestamps that the node hasn’t seen.

[Figure 10-10](/en/ch10#fig_consistency_permissions) shows how a non-linearizable ID generator could cause problems.
Imagine a social media website where user A wants to share an embarrassing photo privately with
their friends. A’s account is initially public, but using their laptop, A first changes their
account settings to private. Then A uses their phone to upload the photo. Since A performed these
updates in sequence, they might reasonably expect the photo upload to be subject to the new,
restricted account permissions.

{{< figure src="/fig/ddia_1010.png" id="fig_consistency_permissions" caption="Figure 10-10. An example of a permission system using Lamport timestamps." class="w-full my-4" >}}


The account permission and the photo are stored in two separate databases (or separate shards of the
same database), and let’s assume they use a Lamport clock or hybrid logical clock to assign a
timestamp to every write. Since the photos database didn’t read from the accounts database, it’s
possible that the local counter in the photos database is slightly behind, and therefore the photo
upload is assigned a lower timestamp than the update of the account settings.

Next, let’s say that a viewer (who is not friends with A) is looking at A’s profile, and their read
uses an MVCC implementation of snapshot isolation. It could happen that the viewer’s read has a
timestamp that is greater than that of the photo upload, but less than that of the account settings
update. As a result, the system will determine that the account is still public at the time of the
read, and therefore show the viewer the embarrassing photo that they were not supposed to see.

You can imagine several possible ways of fixing this problem. Maybe the photos database should have
read the user’s account status before performing the write, but it’s easy to forget such a check.
If A’s actions had been performed on the same device, maybe the app on their device could have
tracked the latest timestamp of that user’s writes—but if the user uses a laptop and a phone, as in
the example, that’s not so easy.

The simplest solution in this case would be to use a linearizable ID generator, which would ensure
that the photo upload is assigned a greater ID than the account permissions change.

#### Implementing a linearizable ID generator {#implementing-a-linearizable-id-generator}

The simplest way of ensuring that ID assignment is linearizable is by actually using a single node
for this purpose. That node only needs to atomically increment a counter and return its value when
requested, persist the counter value (so that it doesn’t generate duplicate IDs if the node crashes
and restarts), and replicate it for fault tolerance (using single-leader replication). This approach
is used in practice: for example, TiDB/TiKV calls it a *timestamp oracle*, inspired by Google’s
Percolator [^57].

As an optimization, you can avoid performing a disk write and replication on every single request.
Instead, the ID generator can write a record describing a batch of IDs; once that record is
persisted and replicated, the node can start handing out those IDs to clients in sequence. Before it
runs out of IDs in that batch, it can persist and replicate the record for the next batch. That way,
some IDs will be skipped if the node crashes and restarts or if you fail over to a follower, but you
won’t issue any duplicate or out-of-order IDs.

You can’t easily shard the ID generator, since if you have multiple shards independently handing out
IDs, you can no longer guarantee that their order is linearizable. You also can’t easily distribute
the ID generator across multiple regions; thus, in a geographically distributed database, all
requests for IDs will have to go to a node in a single region. On the upside, the ID generator’s job
is very simple, so a single node can handle a large request throughput.

If you don’t want to use a single-node ID generator, an alternative is possible: you can do what
Google’s Spanner does, as discussed in [“Synchronized clocks for global snapshots”](/en/ch9#sec_distributed_spanner). It relies on a physical clock
that returns not just a single timestamp, but a range of timestamps indicating the uncertainty in
the clock reading. It then waits for the duration of that uncertainty interval to elapse before
returning.

Assuming that the uncertainty interval is correct (i.e., that the true current physical time always
lies within that interval), this process also ensures that if one request completes before another
begins, the later request will have a greater timestamp. This approach ensures this linearizable ID
assignment without any communication: even requests in different regions will be ordered correctly,
without waiting for cross-region requests. The downside is that you need hardware and software
support for clocks to be tightly synchronized and compute the necessary uncertainty interval.

#### Enforcing constraints using logical clocks {#enforcing-constraints-using-logical-clocks}

In [“Constraints and uniqueness guarantees”](/en/ch10#sec_consistency_uniqueness) we saw that a linearizable compare-and-set operation can be used
to implement locks, uniqueness constraints, and similar constructs in a distributed system. This
raises the question: is a logical clock or a linearizable ID generator also sufficient to implement
these things?

The answer is: not quite. When you have several nodes that are all trying to acquire the
same lock or register the same username, you could use a logical clock to assign timestamps to those
requests, and pick the one with the lowest timestamp as the winner. If the clock is linearizable,
you know that any future requests will always generate greater timestamps, and therefore you can be
sure that no future request will receive an even lower timestamp than the winner.

Unfortunately, part of the problem is still unsolved: how does a node know whether its own timestamp
is the lowest? To be sure, it needs to hear from *every* other node that might have generated a
timestamp [^54]. If one of the other nodes
has failed in the meantime, or cannot be reached due to a network problem, this system would grind
to a halt, because we can’t be sure whether that node might have the lowest timestamp. This is not
the kind of fault-tolerant system that we need.

To implement locks, leases, and similar constructs in a fault-tolerant way, we need something
stronger than logical clocks or ID generators: we need consensus.


## Consensus {#sec_consistency_consensus}

In this chapter we have seen several examples of things that are easy when you have only a single
node, but which get a lot harder if you want fault tolerance:

* A database can be linearizable if you have only a single leader, and you make all reads and writes
 on that leader. But how do you fail over if that leader fails, while avoiding split brain? How do
 you ensure that a node that believes itself to be the leader hasn’t actually been voted out in the meantime?
* A linearizable ID generator on a single node is just a counter with an atomic fetch-and-add
 instruction, but what if it crashes?
* An atomic compare-and-set (CAS) operation is useful for many things, such as deciding who gets a
 lock or lease when several processes are racing to acquire it, or ensuring the uniqueness of a
 file or user with a given name. On a single node, CAS may be as simple as one CPU instruction, but
 how do you make it fault-tolerant?

It turns out that all of these are instances of the same fundamental distributed systems problem:
*consensus*. Consensus is one of the most important and fundamental problems in distributed
computing; it is also infamously difficult to get right [^58] [^59],
and many systems have got it wrong in the past. Now that we have discussed replication
([Chapter 6](/en/ch6#ch_replication)), transactions ([Chapter 8](/en/ch8#ch_transactions)), system models ([Chapter 9](/en/ch9#ch_distributed)), and
linearizability (this chapter), we are finally ready to tackle the consensus problem.

The best-known consensus algorithms are Viewstamped Replication [^60] [^61], Paxos [^58] [^62] [^63] [^64],
Raft [^23] [^65] [^66], and Zab [^18] [^22] [^67]. There are quite a few similarities between these algorithms, but they are not the same [^68] [^69].
These algorithms work in a non-Byzantine system model: that is, network communication may be
arbitrarily delayed or dropped, and nodes may crash, restart, and become disconnected, but the
algorithms assume that nodes otherwise follow the protocol correctly and do not behave maliciously.

There are also consensus algorithms that can tolerate some Byzantine nodes, i.e., nodes that don’t
correctly follow the protocol (for example, by sending contradictory messages to other nodes). A
common assumption is that fewer than one-third of the nodes are Byzantine-faulty [^26] [^70].
Such *Byzantine fault tolerant* (BFT) consensus algorithms are used in blockchains [^71].
However, as explained in [“Byzantine Faults”](/en/ch9#sec_distributed_byzantine), BFT algorithms are beyond the scope of this
book.

--------

> [!TIP] THE IMPOSSIBILITY OF CONSENSUS

You may have heard about the FLP result [^72]—named after the
authors Fischer, Lynch, and Paterson—which proves that there is no algorithm that is always able to
reach consensus if there is a risk that a node may crash. In a distributed system, we must assume
that nodes may crash, so reliable consensus is impossible. Yet, here we are, discussing algorithms
for achieving consensus. What is going on here?

Firstly, FLP doesn’t say that we can never reach consensus—it only says that we can’t guarantee that
a consensus algorithm will *always* terminate. Moreover, the FLP result is proved assuming a
deterministic algorithm in the asynchronous system model (see [“System Model and Reality”](/en/ch9#sec_distributed_system_model)),
which means the algorithm cannot use any clocks or timeouts. If it can use timeouts to suspect that
another node may have crashed (even if the suspicion is sometimes wrong), then consensus becomes solvable [^73].
Even just allowing the algorithm to use random numbers is sufficient to get around the impossibility result [^74].

Thus, although the FLP result about the impossibility of consensus is of great theoretical
importance, distributed systems can usually achieve consensus in practice.

--------

### The Many Faces of Consensus {#sec_consistency_faces}

Consensus can be expressed in several different ways:

* *Single-value consensus* is very similar to an atomic *compare-and-set* operation, and it can be
 used to implement locks, leases, and uniqueness constraints.
* Constructing an *append-only log* also requires consensus; it is usually formalized as *total
 order broadcast*. With a log you can build *state machine replication*, leader-based replication,
 event sourcing, and other useful things.
* *Atomic commitment* of a multi-database or multi-shard transaction requires that all participants
 agree on whether to commit or abort the transaction.

We will explore all of these shortly. In fact, these problems are all equivalent to each other: if
you have an algorithm that solves one of these problems, you can convert it into a solution for any
of the others. This is quite a profound and perhaps surprising insight! And that’s why we can lump
all of these things together under “consensus”, even though they look quite different on the surface.

#### Single-value consensus {#single-value-consensus}

The standard formulation of consensus involves getting multiple nodes to agree on a single value.
For example:

* When a database with single-leader replication first starts up, or when the existing leader fails,
 several nodes may concurrently try to become the leader. Similarly, multiple nodes may race to
 acquire a lock or lease. Consensus allows them to decide which one wins.
* If several people concurrently try to book the last seat on an airplane, or the same seat in a
 theater, or try to register an account with the same username, then a consensus algorithm could
 determine which one should succeed.

More generally, one or more nodes may *propose* values, and the consensus algorithm *decides* on one
of those values. In the examples above, each node could propose its own ID, and the algorithm
decides which node ID should become the new leader, the holder of the lease, or the buyer of the
airplane/theater seat. In this formalism, a consensus algorithm must satisfy the following
properties [^26]:

Uniform agreement
: No two nodes decide differently.

Integrity
: Once a node has decided one value, it cannot change its mind by deciding another value.

Validity
: If a node decides value *v*, then *v* was proposed by some node.

Termination
: Every node that does not crash eventually decides some value.

If you want to decide multiple values, you can run a separate instance of the consensus algorithm
for each. For example, you could have a separate consensus run for each bookable seat in the
theater, so that you get one decision (one buyer) for each seat.

The uniform agreement and integrity properties define the core idea of consensus: everyone decides
on the same outcome, and once you have decided, you cannot change your mind. The validity property
rules out trivial solutions: for example, you could have an algorithm that always decides `null`, no
matter what was proposed; this algorithm would satisfy the agreement and integrity properties, but
not the validity property.

If you don’t care about fault tolerance, then satisfying the first three properties is easy: you can
just hardcode one node to be the “dictator,” and let that node make all of the decisions. However,
if that one node fails, then the system can no longer make any decisions—just like single-leader
replication without failover. All the difficulty arises from the need for fault tolerance.

The termination property formalizes the idea of fault tolerance. It essentially says that a
consensus algorithm cannot simply sit around and do nothing forever—in other words, it must make
progress. Even if some nodes fail, the other nodes must still reach a decision. (Termination is a
liveness property, whereas the other three are safety properties—see
[“Safety and liveness”](/en/ch9#sec_distributed_safety_liveness).)

If a crashed node may recover, you could just wait for it to come back. However, consensus must
ensure that it makes a decision even if a crashed node suddenly disappears and never comes back.
(Instead of a software crash, imagine that there is an earthquake, and the datacenter containing
your node is destroyed by a landslide. You must assume that your node is buried under 30 feet of mud
and is never going to come back online.)

Of course, if *all* nodes crash and none of them are running, then it is not possible for any
algorithm to decide anything. There is a limit to the number of failures that an algorithm can
tolerate: in fact, it can be proved that any consensus algorithm requires at least a majority of
nodes to be functioning correctly in order to assure termination [^73]. That majority can safely form a quorum
(see [“Quorums for reading and writing”](/en/ch6#sec_replication_quorum_condition)).

Thus, the termination property is subject to the assumption that fewer than half of the nodes are
crashed or unreachable. However, most consensus algorithms ensure that the safety
properties—agreement, integrity, and validity—are always met, even if a majority of nodes fail or
there is a severe network problem [^75].
Thus, a large-scale outage can stop the system from being able to process requests, but it cannot
corrupt the consensus system by causing it to make inconsistent decisions.

#### Compare-and-set as consensus {#compare-and-set-as-consensus}

A compare-and-set (CAS) operation checks whether the current value of some object equals some
expected value; if yes, it atomically updates the object to some new value; if no, it leaves the
object unchanged and returns an error.

If you have a fault-tolerant, linearizable CAS operation, it is easy to solve the consensus problem:
initially set the object to a null value; each node that wants to propose a value invokes CAS with
the expected value being null, and the new value being the value it wants to propose (assuming it is
non-null). The decided value is then whatever value the object is set to.

Likewise, if you have a solution for consensus, you can implement CAS: whenever one or more nodes
want to perform CAS with the same expected value, you use the consensus protocol to propose the new
values in the CAS invocation, and then set the object to whatever value was decided by the
consensus. Any CAS invocations whose new value was not decided return an error. CAS invocations with
different expected values use separate runs of the consensus protocol.

This shows that CAS and consensus are equivalent to each other [^28] [^73].
Again, both are straightforward on a single node, but challenging to make fault-tolerant. As an
example of CAS in a distributed setting, we saw conditional write operations for object stores in
[“Databases backed by object storage”](/en/ch6#sec_replication_object_storage), which allow a write to happen only if an object with the same
name has not been created or modified by another client since the current client last read it.

However, a linearizable read-write register is not sufficient to solve consensus. The FLP result
tells us that consensus cannot be solved by a deterministic algorithm in the asynchronous crash-stop
model [^72], but we saw in [“Linearizability and quorums”](/en/ch10#sec_consistency_quorum_linearizable) that a linearizable register can be implemented using quorum
reads/writes in this model [^24] [^25] [^26]. From this it follows that a linearizable register cannot solve consensus.

#### Shared logs as consensus {#sec_consistency_shared_logs}

We have seen several examples of logs, such as replication logs, transaction logs, and write-ahead
logs. A log stores a sequence of *log entries*, and anyone who reads it sees the same entries in the
same order. Sometimes a log has a single writer that is allowed to append new entries, but a *shared
log* is one where multiple nodes can request entries to be appended. An example is single-leader
replication: any client can ask the leader to make a write, which the leader appends to the
replication log, and then all followers apply the writes in the same order as the leader.

More formally, a shared log supports two operations: you can request for a value to be added to the
log, and you can read the entries in the log. It must satisfy the following properties:

Eventual append
: If a node requests for some value to be added to the log, and the node does not crash, then that node
 must eventually read that value in a log entry.

Reliable delivery
: No log entries are lost: if one node reads some log entry, then eventually every node that does
 not crash must also read that log entry.

Append-only
: Once a node has read some log entry, it is immutable, and new log entries can only be added after
 it, but not before. A node may re-read the log, in which case it sees the same log entries in the
 same order as it read them initially (even if the node crashes and restarts).

Agreement
: If two nodes both read some log entry *e*, then prior to *e* they must have read exactly the same
 sequence of log entries in the same order.

Validity
: If a node reads a log entry containing some value, then some node previously requested for that
 value to be added to the log.

--------

> [!NOTE]
> A shared log is formally known as a *total order broadcast*, *atomic broadcast*, or *total order multicast* protocol [^26] [^76] [^77].
> It’s the same thing described in different words: requesting a value to be added to the log is then called “broadcasting” it, and reading a log entry is called “delivering” it.

--------

If you have an implementation of a shared log, it is easy to solve the consensus problem: every node
that wants to propose a value requests for it to be added to the log, and whichever value is read
back in the first log entry is the value that is decided. Since all nodes read log entries in the
same order, they are guaranteed to agree on which value is delivered first [^28].

Conversely, if you have a solution for consensus, you can implement a shared log. The details are a
bit more complicated, but the basic idea is this [^73]:

1. You have a slot in the log for every future log entry, and you run a separate instance of the
 consensus algorithm for every such slot to decide what value should go in that entry.
2. When a node wants to add a value to the log, it proposes that value for one of the slots that has
 not yet been decided.
3. When the consensus algorithm decides for one of the slots, and all the previous slots have
 already been decided, then the decided value is appended as a new log entry, and any consecutive
 slots that have been decided also have their decided value appended to the log.
4. If a proposed value was not chosen for some slot, the node that wanted to add it retries by
 proposing it for a later slot.

This shows that consensus is equivalent to total order broadcast and shared logs. Single-leader
replication without failover does not meet the liveness requirements, since it stops delivering
messages if the leader crashes. As usual, the challenge is in performing failover safely and
automatically.

#### Fetch-and-add as consensus {#fetch-and-add-as-consensus}

The linearizable ID generator we saw in [“Linearizable ID Generators”](/en/ch10#sec_consistency_linearizable_id) comes close to solving
consensus, but it falls slightly short. We can implement such an ID generator using a fetch-and-add
operation, which atomically increments a counter and returns the old counter value.

If you have a CAS operation, it’s easy to implement fetch-and-add: first read the counter value,
then perform a CAS where the expected value is the value you read, and the new value is that value
plus one. If the CAS fails, you retry the whole process until the CAS succeeds. This is less
efficient than a native fetch-and-add operation when there is contention, but it is functionally
equivalent. Since you can implement CAS using consensus, you can also implement fetch-and-add using
consensus.

Conversely, if you have a fault-tolerant fetch-and-add operation, can you solve the consensus
problem? Let’s say you initialize the counter to zero, and every node that wants to propose a value
invokes the fetch-and-add operation to increment the counter. Since the fetch-and-add operation is
atomic, one of the nodes will read the initial value of zero, and the others will all read a value
that has been incremented at least once.

Now let’s say that the node that reads zero is the winner, and its value is decided. That works for
the node that read zero, but the other nodes have a problem: they know that they are not the winner,
but they don’t know which of the other nodes has won. The winner could send a message to the other
nodes to let them know it has won, but what if the winner crashes before it has a chance to send
this message? In that case the other nodes are left hanging, unable to decide any value, and thus
the consensus does not terminate. And the other nodes can’t fall back to another node, because the
node that read zero may yet come back and rightly decide the value it proposed.

An exception is if we know for sure that no more than two nodes will propose a value. In that case,
the nodes can send each other the values they want to propose, and then each perform the
fetch-and-add operation. The node that reads zero decides its own value, and the node that reads one
decides the other node’s value. This solves the consensus problem among two nodes, which is why we
can say that fetch-and-add has a *consensus number* of two [^28].
In contrast, CAS and shared logs solve consensus for any number of nodes that may propose values, so
they have a consensus number of ∞ (infinity).

#### Atomic commitment as consensus {#atomic-commitment-as-consensus}

In [“Distributed Transactions”](/en/ch8#sec_transactions_distributed) we saw the *atomic commitment* problem, which is to ensure that
the databases or shards involved in a distributed transaction all either commit or abort a
transaction. We also saw the *two-phase commit* algorithm, which relies on a coordinator that is a
single point of failure.

What is the relationship between consensus and atomic commitment? At first glance, they seem very
similar—both require nodes to come to some form of agreement. However, there is one important
difference: with consensus it’s okay to decide any value that was proposed, whereas with atomic
commitment the algorithm *must* abort if *any* of the participants voted to abort. More precisely,
atomic commitment requires the following properties [^78]:

Uniform agreement
: No two nodes decide on different outcomes.

Integrity
: Once a node has decided one outcome, it cannot change its mind by deciding another outcome.

Validity
: If a node decides to commit, then all nodes must have previously voted to commit. If any node
 voted to abort, the nodes must abort.

Non-triviality
: If all nodes vote to commit, and no communication timeouts occur, then all nodes must decide to
 commit.

Termination
: Every node that does not crash eventually decides to either commit or abort.

The validity property ensures that a transaction can only commit if all nodes agree; and the
non-triviality property ensures the algorithm can’t simply always abort (but it allows an abort if
any of the communication among the nodes times out). The other three properties are basically the
same as for consensus.

If you have a solution for consensus, there are multiple ways you could solve atomic commitment [^78] [^79].
One works like this: when you want to commit the transaction, every node sends its vote to commit or
abort to every other node. Nodes that receive a vote to commit from themselves and every other node
propose “commit” using the consensus algorithm; nodes that receive a vote to abort, or which
experience a timeout, propose “abort” using the consensus algorithm. When a node finds out what the
consensus algorithm decided, it commits or aborts accordingly.

In this algorithm, “commit” will only be proposed if all nodes voted to commit. If any node voted to
abort, all proposals in the consensus algorithm will be “abort”. It could happen that some nodes
propose “abort” while others propose “commit” if all nodes voted to commit but some communication
timed out; in this case it doesn’t matter whether the nodes commit or abort, as long as they all do the same.

If you have a fault-tolerant atomic commitment protocol, you can also solve consensus. Every node
that wants to propose a value starts a transaction on a quorum of nodes, and at each node it
performs a single-node CAS to set a register to the proposed value if its value has not already been
set by another transaction. If the CAS succeeds, the node votes to commit, otherwise it votes to
abort. If the atomic commit protocol decides to commit a transaction, its value is decided for
consensus; if atomic commit aborts, the proposing node retries with a new transaction.

This shows that atomic commit and consensus are also equivalent to each other.

### Consensus in Practice {#sec_consistency_total_order}

We have seen that single-value consensus, CAS, shared logs, and atomic commitment are all equivalent
to each other: you can convert a solution to one of them into a solution to any of the others. That
is a valuable theoretical insight, but it doesn’t answer the question: which of these many
formulations of consensus is the most useful in practice?

The answer is that most consensus systems provide shared logs, also known as total order broadcast.
Raft, Viewstamped Replication, and Zab provide shared logs right out of the box. Paxos provides
single-value consensus, but in practice most systems using Paxos actually use the extension called
Multi-Paxos, which also provides a shared log.

#### Using shared logs {#sec_consistency_smr}

A shared log is a good fit for database replication: if every log entry represents a write to the
database, and every replica processes the same writes in the same order using deterministic logic,
then the replicas will all end up in a consistent state. This idea is known as *state machine replication* [^80],
and it is the principle behind event sourcing, which we saw in [“Event Sourcing and CQRS”](/en/ch3#sec_datamodels_events). Shared
logs are also useful for stream processing, as we shall see in [Chapter 12](/en/ch12#ch_stream).

Similarly, a shared log can be used to implement serializable transactions: as discussed in
[“Actual Serial Execution”](/en/ch8#sec_transactions_serial), if every log entry represents a deterministic transaction to be
executed as a stored procedure, and if every node executes those transactions in the same order,
then the transactions will be serializable [^81] [^82].

---------

> [!NOTE]
> Sharded databases with a strong consistency model often maintain a separate log per shard, which
> improves scalability, but limits the consistency guarantees (e.g., consistent snapshots, foreign key
> references) they can offer across shards. Serializable transactions across shards are possible, but
> require additional coordination [^83].

--------

A shared log is also powerful because it can easily be adapted to other forms of consensus:

* We saw previously how to use it to implement single-value consensus and CAS: simply decide the
 value that appears first in the log.
* If you want many instances of single-value consensus (e.g. one per seat in a theater that several
 people are trying to book), include the seat number in the log entries, and decide the first log
 entry that contains a given seat number.
* If you want an atomic fetch-and-add, put the number to add to the counter in a log entry, and the
 current counter value is the sum of all of the log entries so far. A simple counter on log entries
 can be used to generate fencing tokens (see [“Fencing off zombies and delayed requests”](/en/ch9#sec_distributed_fencing_tokens)); for example, in
 ZooKeeper, this sequence number is called `zxid` [^18].

#### From single-leader replication to consensus {#from-single-leader-replication-to-consensus}

We saw previously that single-value consensus is easy if you have a single “dictator” node that
makes the decision, and likewise a shared log is easy if a single leader is the only node that is
allowed to append entries to it. The question is how to provide fault tolerance if that node fails.

Traditionally, databases with single-leader replication didn’t solve this problem: they left leader
failover as an action that a human administrator had to perform manually. Unfortunately, this means
a significant amount of downtime, since there is a limit to how fast humans can react, and it
doesn’t satisfy the termination property of consensus. For consensus, we require that the algorithm
can automatically choose a new leader. (Not all consensus algorithms have a leader, but the commonly
used algorithms do [^84].)

However, there is a problem. We previously discussed the problem of split brain, and said that all
nodes need to agree who the leader is—otherwise two different nodes could each believe themselves to
be the leader, and consequently make inconsistent decisions. Thus, it seems like we need consensus
in order to elect a leader, and we need a leader in order to solve consensus. How do we break out of
this conundrum?

In fact, consensus algorithms don’t require that there is only one leader at any one time. Instead,
they make a weaker guarantee: they define an *epoch number* (called the *ballot number* in Paxos,
*view number* in Viewstamped Replication, and *term number* in Raft) and guarantee that within each
epoch, the leader is unique.

When a node believes that the current leader is dead because it hasn’t heard from the leader for
some timeout, it may start a vote to elect a new leader. This election is given a new epoch number
that is greater than any previous epoch. If there is a conflict between two different leaders in two
different epochs (perhaps because the previous leader actually wasn’t dead after all), then the
leader with the higher epoch number prevails.

Before a leader is allowed to append the next entry to the shared log, it must first check that
there isn’t some other leader with a higher epoch number which might append a different entry. It
can do this by collecting votes from a quorum of nodes—typically, but not always, a majority of
nodes [^85]. A node votes yes only if it is not aware of any other leader with a higher epoch.

Thus, we have two rounds of voting: once to choose a leader, and a second time to vote on a leader’s
proposal for the next entry to append to the log. The quorums for those two votes must overlap: if
a vote on a proposal succeeds, at least one of the nodes that voted for it must have also
participated in the most recent successful leader election [^85]. Thus, if the vote on a proposal
passes without revealing any higher-numbered epoch, the current leader can conclude that no leader
with a higher epoch number has been elected, and therefore it can safely append the proposed entry
to the log [^26] [^86].

These two rounds of voting look superficially similar to two-phase commit, but they are very
different protocols. In consensus algorithms, any node can start an election and it requires only a
quorum of nodes to respond; in 2PC, only the coordinator can request votes, and it requires a “yes”
vote from *every* participant before it can commit.

#### Subtleties of consensus {#subtleties-of-consensus}

This basic structure is common to all of Raft, Multi-Paxos, Zab, and Viewstamped Replication: a vote
by a quorum of nodes elects a leader, and then another quorum vote is required for every entry that
the leader wants to append to the log [^68] [^69]. Every new log entry is synchronously replicated
to a quorum of nodes before it is confirmed to the client that requested the write. This ensures
that the log entry won’t be lost if the current leader fails.

However, the devil is in the details, and that’s also where these algorithms take different
approaches. For example, when the old leader fails and a new one is elected, the algorithm needs to
ensure that the new leader honors any log entries that had already been appended by the old leader
before it failed. Raft does this by only allowing a node to become the new leader if its log is at
least as up-to-date as a majority of its followers [^69].
In contrast, Paxos allows any node to become the new leader, but requires it to bring its log
up-to-date with other nodes before it can start appending new entries of its own.


--------

> [!TIP] CONSISTENCY VS. AVAILABILITY IN LEADER ELECTION

If you want the consensus algorithm to strictly guarantee the properties laid out in
[“Shared logs as consensus”](/en/ch10#sec_consistency_shared_logs), it’s essential that the new leader is up-to-date with any confirmed
log entries before it can process any writes or linearizable reads. If a node with stale data were
to become the new leader, it may write a new value to log entries that were already written by the
old leader, violating the shared log’s append-only property.

In some cases, you might choose to weaken the consensus properties in order to recover more quickly
from a leader failure. For example, Kafka offers the option of enabling *unclean leader election*,
which allows any replica to become leader, even if it is not up-to-date. Also, in databases with
asynchronous replication, you cannot guarantee that any follower is up-to-date when the leader
fails.

If you drop the requirement for the new leader to be up-to-date, you may improve performance and
availability, but you are on thin ice, since the theory of consensus no longer applies. While things
will work fine as long as there are no faults, the problems discussed in [Chapter 9](/en/ch9#ch_distributed) can
easily cause a lot of data loss or corruption.

--------

Another subtlety is in how the algorithms deal with log entries that had been proposed by the old
leader before it failed, but for which the vote on appending to the log had not yet completed. You
can find discussions of these details in the references for this chapter [^23] [^69] [^86].

For databases that use a consensus algorithm for replication, not only do writes need to be turned
into log entries and replicated to a quorum. If you want to guarantee linearizable reads, they also
have to go through a quorum vote similarly to a write, to confirm that the node that believes itself to be
the leader really still is up-to-date. Linearizable reads in etcd work like this, for example.

In their standard form, most consensus algorithms assume a fixed set of nodes—that is, nodes may go
down and come back up again, but the set of nodes that is allowed to vote is fixed when the cluster
is created. In practice, it’s often necessary to add new nodes or remove old nodes in a system
configuration. Consensus algorithms have been extended with *reconfiguration* features that make
this possible. This is especially useful when adding new regions to a system, or when migrating from
one location to another (by first adding the new nodes, and then removing the old nodes).

#### Pros and cons of consensus {#pros-and-cons-of-consensus}

Although they are complex and subtle, consensus algorithms are a huge breakthrough for distributed
systems. Consensus is essentially “single-leader replication done right”, with automatic failover on
leader failure, ensuring that no committed data is lost and no split-brain is possible, even in the
face of all the problems we discussed in [Chapter 9](/en/ch9#ch_distributed).

Since single-leader replication with automatic failover is essentially one of the definitions of
consensus, any system that provides automatic failover but does not use a proven consensus algorithm
is likely to be unsafe [^87].
Using a proven consensus algorithm is not a guarantee of correctness of the whole system—there are
still plenty of other places where bugs can lurk—but it’s a good start.

Nevertheless, consensus is not used everywhere, because the benefits come at a cost. Consensus
systems always require a strict majority to operate—three nodes to tolerate one failure, or five
nodes to tolerate two failures. Every operation needs to communicate with a quorum, so you can’t
increase throughput by adding more nodes (in fact, every node you add makes the algorithm slower).
If a network partition cuts off some nodes from the rest, only the majority portion of the network
can make progress, and the rest are blocked.

Consensus systems generally rely on timeouts to detect failed nodes. In environments with highly
variable network delays, especially systems distributed across multiple geographic regions, it can
be difficult to tune these timeouts: if they are too large it takes a long time to recover from a
failure; if they are too small there can be lots of unnecessary leader elections, resulting in
terrible performance as the system can end up spending more time choosing leaders than doing useful
work.

Sometimes, consensus algorithms are particularly sensitive to network problems. For example, Raft
has been shown to have unpleasant edge cases [^88] [^89]:
if the entire network is working correctly except for one particular network link that is
consistently unreliable, Raft can get into situations where leadership continually bounces between
two nodes, or the current leader is continually forced to resign, so the system effectively never
makes progress. Designing algorithms that are more robust to unreliable networks is still an open
research problem.

For systems that want to be highly available, but don’t want to accept the cost of consensus, the
only real alternative is to use a weaker consistency model instead, such as those offered by
leaderless or multi-leader replication as discussed in [Chapter 6](/en/ch6#ch_replication). These approaches
generally don’t offer linearizability, but for applications that don’t need it that is fine.


### Coordination Services {#sec_consistency_coordination}

Consensus algorithms are useful in any distributed database that wants to offer linearizable
operations, and many modern distributed databases use consensus algorithms for replication. But one
family of systems is a particularly prominent user of consensus: *coordination services* such as
ZooKeeper, etcd, or Consul. Although these systems look superficially like any other key-value
store, they are not designed for general-purpose data storage like most databases.

Instead, they are designed to coordinate between nodes of another distributed system. For example,
Kubernetes relies on etcd, while Spark and Flink in high availability mode rely on ZooKeeper running
in the background. Coordination services are designed to hold small amounts of data that can fit
entirely in memory (although they still write to disk for durability), which is replicated across
multiple nodes using a fault-tolerant consensus algorithm.

Coordination services are modeled after Google’s Chubby lock service [^17] [^58].
They combine a consensus algorithm with several other features that turn out to be particularly
useful when building distributed systems:

Locks and leases
: We saw previously how consensus systems can implement an atomic, fault-tolerant compare-and-set
 (CAS) operation. Coordination services rely on this approach to implement locks and leases: if
 several nodes concurrently try to acquire the same lease, only one of them will succeed.

Support for fencing
: As discussed in [“Distributed Locks and Leases”](/en/ch9#sec_distributed_lock_fencing), when a resource is protected by a lease, you
 need *fencing* to prevent clients from interfering with each other in the case of a process pause
 or large network delay. Consensus systems can generate fencing tokens by giving each log entry a
 monotonically increasing ID (`zxid` and `cversion` in ZooKeeper, revision number in etcd).

Failure detection
: Clients maintain a long-lived session on the coordination service, and periodically exchange
 heartbeats to check if the other node is still alive. Even if the connection is temporarily
 interrupted, or a server fails, any leases held by the client remain active. However, if there is
 no heartbeat for longer than the timeout of the lease, the coordination service assumes the client
 is dead and releases the lease (ZooKeeper calls these *ephemeral nodes*).

Change notifications
: A client can request that the coordination service sends it a notification whenever certain keys
 change. This allows a client to find out when another client joins the cluster (based on the value
 it writes to the coordination service), or if another client fails (because its session times out
 and its ephemeral nodes disappear), for example. These notifications save the client from having
 to frequently poll the service to find out about changes.

Failure detection and change notifications do not require consensus, but they are useful for
distributed coordination alongside the atomic operations and fencing support that do require
consensus.

--------

> [!TIP] MANAGING CONFIGURATION WITH COORDINATION SERVICES

Applications and infrastructure often have configuration parameters such as timeouts, thread pool
sizes, and so on. Coordination services are sometimes used to store such configuration data,
represented as key-value pairs. Processes load the latest settings upon startup, and subscribe to
receive notifications of any changes. When a configuration changes, the process can begin using the
new setting immediately or restart itself to load the latest changes.

Configuration management doesn’t need the consensus aspect of a coordination service, but it’s
convenient to use a coordination service and rely on its notification feature if you are already
running the coordination service anyway. Alternatively, a process could periodically poll for
configuration updates from a file or URL, which avoids the need for a specialized service.

--------

#### Allocating work to nodes {#allocating-work-to-nodes}

A coordination service is useful if you have several instances of a process or service, and one
of them needs to be chosen as leader or primary. If the leader fails, one of the other nodes should
take over. This is necessary for single-leader databases, but it’s also appropriate for job
schedulers and similar stateful systems.

Another use case is when you have some sharded resource (database, message streams, file storage,
distributed actor system, etc.) and need to decide which shard to assign to which node. As new nodes
join the cluster, some of the shards need to be moved from existing nodes to the new nodes in order
to rebalance the load. As nodes are removed or fail, other nodes need to take over the failed nodes’
work.

These kinds of tasks can be achieved by judicious use of atomic operations, ephemeral nodes, and
notifications in a coordination service. If done correctly, this approach allows the application to
automatically recover from faults without human intervention. It’s not easy, despite the appearance
of libraries such as Apache Curator that have sprung up to provide higher-level tools on top of the
ZooKeeper client API—but it is still much better than attempting to implement the necessary
consensus algorithms from scratch, which would be very prone to bugs.

A dedicated coordination service also has the advantage that it can run on a fixed set of nodes
(usually three or five), regardless of how many nodes there are in the distributed system that
relies on it for coordination. For example, in a storage system with thousands of shards, it would
be terribly inefficient to run a consensus algorithm over thousands of nodes; it’s much better to
“outsource” the consensus to a small number of nodes running a coordination service.

Normally, the kind of data managed by a coordination service is quite slow-changing: it represents
information like “the node running on IP address 10.1.1.23 is the leader for shard 7,” and such
assignments usually change on a timescale of minutes or hours. Coordination services are not
intended for storing data that may change thousands of times per second. For that, it is better to
use a conventional database; alternatively, tools like Apache BookKeeper [^90] [^91]
can be used to replicate fast-changing internal state of a service.

#### Service discovery {#service-discovery}

ZooKeeper, etcd, and Consul are also often used for *service discovery*—that is, to find out which
IP address you need to connect to in order to reach a particular service (see
[“Load balancers, service discovery, and service meshes”](/en/ch5#sec_encoding_service_discovery)). In cloud environments, where it is common for
virtual machines to continually come and go, you often don’t know the IP addresses of your services
ahead of time. Instead, you can configure your services such that when they start up they register
their network endpoints in a service registry, where they can then be found by other services.

Using a coordination service for service discovery can be convenient, as its failure detection and
change notification features make it easy for clients to keep track of service instances as they
come and go. And if you are already using a coordination service for leases, locking, or leader
election, it makes sense to also use it for service discovery, since it already knows which node
should receive requests for your service.

However, using consensus for service discovery is often overkill: this use case often doesn’t
require linearizability, and it’s more important that service discovery is highly available and
fast, since without it everything would grind to a halt. It’s therefore often preferable to cache
service discovery information and accept that it might be slightly stale. For example, DNS-based
service discovery uses multiple layers of caching to achieve good performance and availability.

To support this use case, ZooKeeper supports *observers*, which are replicas that receive the log
and maintain a copy of the data stored in ZooKeeper, but which do not participate in the consensus
algorithm’s voting process. Reads from an observer are not linearizable as they might be stale, but
they remain available even if the network is interrupted, and they increase the read throughput that
the system can support by caching.

## Summary {#summary}

In this chapter we examined the topic of strong consistency in fault-tolerant systems: what it is,
and how to achieve it. We looked in depth at linearizability, a popular formalization of strong
consistency: it means that replicated data appears as though there were only a single copy, and all
operations act on it atomically. We saw that linearizability is useful when you need some data to be
up-to-date when you read it, or if you need to resolve a race condition (e.g. if multiple nodes are
concurrently trying to do the same thing, such as creating files with the same name).

Although linearizability is appealing because it is easy to understand—it makes a database behave
like a variable in a single-threaded program—it has the downside of being slow, especially in
environments with large network delays. Many replication algorithms don’t guarantee linearizability,
even though they might superficially seem to provide strong consistency.

Next, we applied the concept of linearizability in the context of ID generators. A single-node
auto-incrementing counter is linearizable, but not fault-tolerant. Many distributed ID generation
schemes don’t guarantee that the IDs are ordered consistently with the order in which the events
actually happened. Logical clocks such as Lamport clocks and hybrid logical clocks provide ordering
that is consistent with causality, but no linearizability.

This led us to the concept of consensus. We saw that achieving consensus means deciding something in
such a way that all nodes agree on what was decided, and such that they can’t change their mind. A
wide range of problems are actually reducible to consensus and are equivalent to each other (i.e.,
if you have a solution for one of them, you can transform it into a solution for all of the others).
Such equivalent problems include:

Linearizable compare-and-set operation
: The register needs to atomically *decide* whether to set its value, based on whether its current
 value equals the parameter given in the operation.

Locks and leases
: When several clients are concurrently trying to grab a lock or lease, the lock *decides* which one
 successfully acquired it.

Uniqueness constraints
: When several transactions concurrently try to create conflicting records with the same key, the
 constraint must *decide* which one to allow and which should fail with a constraint violation.

Shared logs
: When several nodes concurrently want to append entries to a log, the log *decides* in which order
 they are appended. Total order broadcast is also equivalent.

Atomic transaction commit
: The database nodes involved in a distributed transaction must all *decide* the same way whether to
 commit or abort the transaction.

Linearizable fetch-and-add operation
: This operation can be used to implement an ID generator. Several nodes can concurrently invoke the
 operation, and it *decides* the order in which they increment the counter. This case actually
 solves consensus only between two nodes, while the others work for any number of nodes.

All of these are straightforward if you only have a single node, or if you are willing to assign the
decision-making capability to a single node. This is what happens in a single-leader database: all
the power to make decisions is vested in the leader, which is why such databases are able to provide
linearizable operations, uniqueness constraints, a replication log, and more.

However, if that single leader fails, or if a network interruption makes the leader unreachable,
such a system becomes unable to make any progress until a human performs a manual failover.
Widely-used consensus algorithms like Raft and Paxos are essentially single-leader replication with
built-in automatic leader election and failover if the current leader fails.

Consensus algorithms are carefully designed to ensure that no committed writes are lost during a
failover, and that the system cannot get into a split brain state in which multiple nodes are
accepting writes. This requires that every write, and every linearizable read, is confirmed by a
quorum (typically a majority) of nodes. This can be expensive, especially across geographic regions,
but it is unavoidable if you want the strong consistency and fault tolerance that consensus provides.

Coordination services like ZooKeeper and etcd are also built on top of consensus algorithms. They
provide locks, leases, failure detection, and change notification features that are useful for
managing the state of distributed applications. If you find yourself wanting to do one of those
things that is reducible to consensus, and you want it to be fault-tolerant, it is advisable to use
a coordination service. It won’t guarantee that you will get it right, but it will probably help.

Consensus algorithms are complicated and subtle, but they are supported by a rich body of theory
that has been developed since the 1980s. This theory makes it possible to build systems that can
tolerate all the faults that we discussed in [Chapter 9](/en/ch9#ch_distributed), and still ensure that your data is
not corrupted. This is an amazing achievement, and the references at the end of this chapter feature
some of the highlights of this work.

Nevertheless, consensus is not always the right tool: in some systems, the strong consistency
properties it provides are not needed, and it is better to have weaker consistency with higher
availability and better performance. In these cases, it is common to use leaderless or multi-leader
replication, which we previously discussed in [Chapter 6](/en/ch6#ch_replication). The logical clocks that we
discussed in this chapter are helpful in that context.

### References

[^1]: Maurice P. Herlihy and Jeannette M. Wing. [Linearizability: A Correctness Condition for Concurrent Objects](https://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 12, issue 3, pages 463–492, July 1990. [doi:10.1145/78969.78972](https://doi.org/10.1145/78969.78972) 
[^2]: Leslie Lamport. [On interprocess communication](https://www.microsoft.com/en-us/research/publication/interprocess-communication-part-basic-formalism-part-ii-algorithms/). *Distributed Computing*, volume 1, issue 2, pages 77–101, June 1986. [doi:10.1007/BF01786228](https://doi.org/10.1007/BF01786228) 
[^3]: David K. Gifford. [Information Storage in a Decentralized Computer System](https://bitsavers.org/pdf/xerox/parc/techReports/CSL-81-8_Information_Storage_in_a_Decentralized_Computer_System.pdf). Xerox Palo Alto Research Centers, CSL-81-8, June 1981. Archived at [perma.cc/2XXP-3JPB](https://perma.cc/2XXP-3JPB) 
[^4]: Martin Kleppmann. [Please Stop Calling Databases CP or AP](https://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html). *martin.kleppmann.com*, May 2015. Archived at [perma.cc/MJ5G-75GL](https://perma.cc/MJ5G-75GL) 
[^5]: Kyle Kingsbury. [Call Me Maybe: MongoDB Stale Reads](https://aphyr.com/posts/322-call-me-maybe-mongodb-stale-reads). *aphyr.com*, April 2015. Archived at [perma.cc/DXB4-J4JC](https://perma.cc/DXB4-J4JC) 
[^6]: Kyle Kingsbury. [Computational Techniques in Knossos](https://aphyr.com/posts/314-computational-techniques-in-knossos). *aphyr.com*, May 2014. Archived at [perma.cc/2X5M-EHTU](https://perma.cc/2X5M-EHTU) 
[^7]: Kyle Kingsbury and Peter Alvaro. [Elle: Inferring Isolation Anomalies from Experimental Observations](https://www.vldb.org/pvldb/vol14/p268-alvaro.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 3, pages 268–280, November 2020. [doi:10.14778/3430915.3430918](https://doi.org/10.14778/3430915.3430918) 
[^8]: Paolo Viotti and Marko Vukolić. [Consistency in Non-Transactional Distributed Storage Systems](https://arxiv.org/abs/1512.00168). *ACM Computing Surveys* (CSUR), volume 49, issue 1, article no. 19, June 2016. [doi:10.1145/2926965](https://doi.org/10.1145/2926965) 
[^9]: Peter Bailis. [Linearizability Versus Serializability](http://www.bailis.org/blog/linearizability-versus-serializability/). *bailis.org*, September 2014. Archived at [perma.cc/386B-KAC3](https://perma.cc/386B-KAC3) 
[^10]: Daniel Abadi. [Correctness Anomalies Under Serializable Isolation](https://dbmsmusings.blogspot.com/2019/06/correctness-anomalies-under.html). *dbmsmusings.blogspot.com*, June 2019. Archived at [perma.cc/JGS7-BZFY](https://perma.cc/JGS7-BZFY) 
[^11]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). *Proceedings of the VLDB Endowment*, volume 7, issue 3, pages 181–192, November 2013. [doi:10.14778/2732232.2732237](https://doi.org/10.14778/2732232.2732237), extended version published as [arXiv:1302.0309](https://arxiv.org/abs/1302.0309) 
[^12]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/). 
[^13]: Andrei Matei. [CockroachDB’s consistency model](https://www.cockroachlabs.com/blog/consistency-model/). *cockroachlabs.com*, February 2021. Archived at [perma.cc/MR38-883B](https://perma.cc/MR38-883B) 
[^14]: Murat Demirbas. [Strict-serializability, but at what cost, for what purpose?](https://muratbuffalo.blogspot.com/2022/08/strict-serializability-but-at-what-cost.html) *muratbuffalo.blogspot.com*, August 2022. Archived at [perma.cc/T8AY-N3U9](https://perma.cc/T8AY-N3U9) 
[^15]: Ben Darnell. [How to talk about consistency and isolation in distributed DBs](https://www.cockroachlabs.com/blog/db-consistency-isolation-terminology/). *cockroachlabs.com*, February 2022. Archived at [perma.cc/53SV-JBGK](https://perma.cc/53SV-JBGK) 
[^16]: Daniel Abadi. [An explanation of the difference between Isolation levels vs. Consistency levels](https://dbmsmusings.blogspot.com/2019/08/an-explanation-of-difference-between.html). *dbmsmusings.blogspot.com*, August 2019. Archived at [perma.cc/QSF2-CD4P](https://perma.cc/QSF2-CD4P) 
[^17]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006. 
[^18]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3 
[^19]: Murali Vallath. [*Oracle 10g RAC Grid, Services & Clustering*](https://www.oreilly.com/library/view/oracle-10g-rac/9781555583217/). Elsevier Digital Press, 2006. ISBN: 978-1-555-58321-7 
[^20]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509) 
[^21]: Kyle Kingsbury. [Call Me Maybe: etcd and Consul](https://aphyr.com/posts/316-call-me-maybe-etcd-and-consul). *aphyr.com*, June 2014. Archived at [perma.cc/XL7U-378K](https://perma.cc/XL7U-378K) 
[^22]: Flavio P. Junqueira, Benjamin C. Reed, and Marco Serafini. [Zab: High-Performance Broadcast for Primary-Backup Systems](https://marcoserafini.github.io/assets/pdf/zab.pdf). At *41st IEEE International Conference on Dependable Systems and Networks* (DSN), June 2011. [doi:10.1109/DSN.2011.5958223](https://doi.org/10.1109/DSN.2011.5958223) 
[^23]: Diego Ongaro and John K. Ousterhout. [In Search of an Understandable Consensus Algorithm](https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf). At *USENIX Annual Technical Conference* (ATC), June 2014. 
[^24]: Hagit Attiya, Amotz Bar-Noy, and Danny Dolev. [Sharing Memory Robustly in Message-Passing Systems](https://www.cs.huji.ac.il/course/2004/dist/p124-attiya.pdf). *Journal of the ACM*, volume 42, issue 1, pages 124–142, January 1995. [doi:10.1145/200836.200869](https://doi.org/10.1145/200836.200869) 
[^25]: Nancy Lynch and Alex Shvartsman. [Robust Emulation of Shared Memory Using Dynamic Quorum-Acknowledged Broadcasts](https://groups.csail.mit.edu/tds/papers/Lynch/FTCS97.pdf). At *27th Annual International Symposium on Fault-Tolerant Computing* (FTCS), June 1997. [doi:10.1109/FTCS.1997.614100](https://doi.org/10.1109/FTCS.1997.614100) 
[^26]: Christian Cachin, Rachid Guerraoui, and Luís Rodrigues. [*Introduction to Reliable and Secure Distributed Programming*](https://www.distributedprogramming.net/), 2nd edition. Springer, 2011. ISBN: 978-3-642-15259-7, [doi:10.1007/978-3-642-15260-3](https://doi.org/10.1007/978-3-642-15260-3) 
[^27]: Niklas Ekström, Mikhail Panchenko, and Jonathan Ellis. [Possible Issue with Read Repair?](https://lists.apache.org/thread/wwsjnnc93mdlpw8nb0d5gn4q1bmpzbon) Email thread on *cassandra-dev* mailing list, October 2012. 
[^28]: Maurice P. Herlihy. [Wait-Free Synchronization](https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 13, issue 1, pages 124–149, January 1991. [doi:10.1145/114005.102808](https://doi.org/10.1145/114005.102808) 
[^29]: Armando Fox and Eric A. Brewer. [Harvest, Yield, and Scalable Tolerant Systems](https://radlab.cs.berkeley.edu/people/fox/static/pubs/pdf/c18.pdf). At *7th Workshop on Hot Topics in Operating Systems* (HotOS), March 1999. [doi:10.1109/HOTOS.1999.798396](https://doi.org/10.1109/HOTOS.1999.798396) 
[^30]: Seth Gilbert and Nancy Lynch. [Brewer’s Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](https://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf). *ACM SIGACT News*, volume 33, issue 2, pages 51–59, June 2002. [doi:10.1145/564585.564601](https://doi.org/10.1145/564585.564601) 
[^31]: Seth Gilbert and Nancy Lynch. [Perspectives on the CAP Theorem](https://groups.csail.mit.edu/tds/papers/Gilbert/Brewer2.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 30–36, February 2012. [doi:10.1109/MC.2011.389](https://doi.org/10.1109/MC.2011.389) 
[^32]: Eric A. Brewer. [CAP Twelve Years Later: How the ‘Rules’ Have Changed](https://sites.cs.ucsb.edu/~rich/class/cs293-cloud/papers/brewer-cap.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 23–29, February 2012. [doi:10.1109/MC.2012.37](https://doi.org/10.1109/MC.2012.37) 
[^33]: Susan B. Davidson, Hector Garcia-Molina, and Dale Skeen. [Consistency in Partitioned Networks](https://www.cs.rice.edu/~alc/old/comp520/papers/DGS85.pdf). *ACM Computing Surveys*, volume 17, issue 3, pages 341–370, September 1985. [doi:10.1145/5505.5508](https://doi.org/10.1145/5505.5508) 
[^34]: Paul R. Johnson and Robert H. Thomas. [RFC 677: The Maintenance of Duplicate Databases](https://tools.ietf.org/html/rfc677). Network Working Group, January 1975. 
[^35]: Michael J. Fischer and Alan Michael. [Sacrificing Serializability to Attain High Availability of Data in an Unreliable Network](https://sites.cs.ucsb.edu/~agrawal/spring2011/ugrad/p70-fischer.pdf). At *1st ACM Symposium on Principles of Database Systems* (PODS), March 1982. [doi:10.1145/588111.588124](https://doi.org/10.1145/588111.588124) 
[^36]: Eric A. Brewer. [NoSQL: Past, Present, Future](https://www.infoq.com/presentations/NoSQL-History/). At *QCon San Francisco*, November 2012. 
[^37]: Adrian Cockcroft. [Migrating to Microservices](https://www.infoq.com/presentations/migration-cloud-native/). At *QCon London*, March 2014. 
[^38]: Martin Kleppmann. [A Critique of the CAP Theorem](https://arxiv.org/abs/1509.05393). arXiv:1509.05393, September 2015. 
[^39]: Daniel Abadi. [Problems with CAP, and Yahoo’s little known NoSQL system](https://dbmsmusings.blogspot.com/2010/04/problems-with-cap-and-yahoos-little.html). *dbmsmusings.blogspot.com*, April 2010. Archived at [perma.cc/4NTZ-CLM9](https://perma.cc/4NTZ-CLM9) 
[^40]: Daniel Abadi. [Hazelcast and the Mythical PA/EC System](https://dbmsmusings.blogspot.com/2017/10/hazelcast-and-mythical-paec-system.html). *dbmsmusings.blogspot.com*, October 2017. Archived at [perma.cc/J5XM-U5C2](https://perma.cc/J5XM-U5C2) 
[^41]: Eric Brewer. [Spanner, TrueTime & The CAP Theorem](https://research.google.com/pubs/archive/45855.pdf). *research.google.com*, February 2017. Archived at [perma.cc/59UW-RH7N](https://perma.cc/59UW-RH7N) 
[^42]: Daniel J. Abadi. [Consistency Tradeoffs in Modern Distributed Database System Design](https://www.cs.umd.edu/~abadi/papers/abadi-pacelc.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 37–42, February 2012. [doi:10.1109/MC.2012.33](https://doi.org/10.1109/MC.2012.33) 
[^43]: Nancy A. Lynch. [A Hundred Impossibility Proofs for Distributed Computing](https://groups.csail.mit.edu/tds/papers/Lynch/podc89.pdf). At *8th ACM Symposium on Principles of Distributed Computing* (PODC), August 1989. [doi:10.1145/72981.72982](https://doi.org/10.1145/72981.72982) 
[^44]: Prince Mahajan, Lorenzo Alvisi, and Mike Dahlin. [Consistency, Availability, and Convergence](https://apps.cs.utexas.edu/tech_reports/reports/tr/TR-2036.pdf). University of Texas at Austin, Department of Computer Science, Tech Report UTCS TR-11-22, May 2011. Archived at [perma.cc/SAV8-9JAJ](https://perma.cc/SAV8-9JAJ) 
[^45]: Hagit Attiya, Faith Ellen, and Adam Morrison. [Limitations of Highly-Available Eventually-Consistent Data Stores](https://www.cs.tau.ac.il/~mad/publications/podc2015-replds.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), July 2015. [doi:10.1145/2767386.2767419](https://doi.org/10.1145/2767386.2767419) 
[^46]: Peter Sewell, Susmit Sarkar, Scott Owens, Francesco Zappa Nardelli, and Magnus O. Myreen. [x86-TSO: A Rigorous and Usable Programmer’s Model for x86 Multiprocessors](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf). *Communications of the ACM*, volume 53, issue 7, pages 89–97, July 2010. [doi:10.1145/1785414.1785443](https://doi.org/10.1145/1785414.1785443) 
[^47]: Martin Thompson. [Memory Barriers/Fences](https://mechanical-sympathy.blogspot.com/2011/07/memory-barriersfences.html). *mechanical-sympathy.blogspot.co.uk*, July 2011. Archived at [perma.cc/7NXM-GC5U](https://perma.cc/7NXM-GC5U) 
[^48]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ) 
[^49]: Hagit Attiya and Jennifer L. Welch. [Sequential Consistency Versus Linearizability](https://courses.csail.mit.edu/6.852/01/papers/p91-attiya.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 12, issue 2, pages 91–122, May 1994. [doi:10.1145/176575.176576](https://doi.org/10.1145/176575.176576) 
[^50]: Kyzer R. Davis, Brad G. Peabody, and Paul J. Leach. [Universally Unique IDentifiers (UUIDs)](https://www.rfc-editor.org/rfc/rfc9562). RFC 9562, IETF, May 2024. 
[^51]: Ryan King. [Announcing Snowflake](https://blog.x.com/engineering/en_us/a/2010/announcing-snowflake). *blog.x.com*, June 2010. Archived at [archive.org](https://web.archive.org/web/20241128214604/https%3A//blog.x.com/engineering/en_us/a/2010/announcing-snowflake) 
[^52]: Alizain Feerasta. [Universally Unique Lexicographically Sortable Identifier](https://github.com/ulid/spec). *github.com*, 2016. Archived at [perma.cc/NV2Y-ZP8U](https://perma.cc/NV2Y-ZP8U) 
[^53]: Rob Conery. [A Better ID Generator for PostgreSQL](https://bigmachine.io/2014/05/29/a-better-id-generator-for-postgresql/). *bigmachine.io*, May 2014. Archived at [perma.cc/K7QV-3KFC](https://perma.cc/K7QV-3KFC) 
[^54]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563) 
[^55]: Sandeep S. Kulkarni, Murat Demirbas, Deepak Madeppa, Bharadwaj Avva, and Marcelo Leone. [Logical Physical Clocks](https://cse.buffalo.edu/~demirbas/publications/hlc.pdf). At *18th International Conference on Principles of Distributed Systems* (OPODIS), December 2014. [doi:10.1007/978-3-319-14472-6\_2](https://doi.org/10.1007/978-3-319-14472-6_2) 
[^56]: Manuel Bravo, Nuno Diegues, Jingna Zeng, Paolo Romano, and Luís Rodrigues. [On the use of Clocks to Enforce Consistency in the Cloud](http://sites.computer.org/debull/A15mar/p18.pdf). *IEEE Data Engineering Bulletin*, volume 38, issue 1, pages 18–31, March 2015. Archived at [perma.cc/68ZU-45SH](https://perma.cc/68ZU-45SH) 
[^57]: Daniel Peng and Frank Dabek. [Large-Scale Incremental Processing Using Distributed Transactions and Notifications](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Peng.pdf). At *9th USENIX Conference on Operating Systems Design and Implementation* (OSDI), October 2010. 
[^58]: Tushar Deepak Chandra, Robert Griesemer, and Joshua Redstone. [Paxos Made Live – An Engineering Perspective](https://www.read.seas.harvard.edu/~kohler/class/08w-dsi/chandra07paxos.pdf). At *26th ACM Symposium on Principles of Distributed Computing* (PODC), June 2007. [doi:10.1145/1281100.1281103](https://doi.org/10.1145/1281100.1281103) 
[^59]: Will Portnoy. [Lessons Learned from Implementing Paxos](https://blog.willportnoy.com/2012/06/lessons-learned-from-paxos.html). *blog.willportnoy.com*, June 2012. Archived at [perma.cc/QHD9-FDD2](https://perma.cc/QHD9-FDD2) 
[^60]: Brian M. Oki and Barbara H. Liskov. [Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems](https://pmg.csail.mit.edu/papers/vr.pdf). At *7th ACM Symposium on Principles of Distributed Computing* (PODC), August 1988. [doi:10.1145/62546.62549](https://doi.org/10.1145/62546.62549) 
[^61]: Barbara H. Liskov and James Cowling. [Viewstamped Replication Revisited](https://pmg.csail.mit.edu/papers/vr-revisited.pdf). Massachusetts Institute of Technology, Tech Report MIT-CSAIL-TR-2012-021, July 2012. Archived at [perma.cc/56SJ-WENQ](https://perma.cc/56SJ-WENQ) 
[^62]: Leslie Lamport. [The Part-Time Parliament](https://www.microsoft.com/en-us/research/publication/part-time-parliament/). *ACM Transactions on Computer Systems*, volume 16, issue 2, pages 133–169, May 1998. [doi:10.1145/279227.279229](https://doi.org/10.1145/279227.279229) 
[^63]: Leslie Lamport. [Paxos Made Simple](https://www.microsoft.com/en-us/research/publication/paxos-made-simple/). *ACM SIGACT News*, volume 32, issue 4, pages 51–58, December 2001. Archived at [perma.cc/82HP-MNKE](https://perma.cc/82HP-MNKE) 
[^64]: Robbert van Renesse and Deniz Altinbuken. [Paxos Made Moderately Complex](https://people.cs.umass.edu/~arun/590CC/papers/paxos-moderately-complex.pdf). *ACM Computing Surveys* (CSUR), volume 47, issue 3, article no. 42, February 2015. [doi:10.1145/2673577](https://doi.org/10.1145/2673577) 
[^65]: Diego Ongaro. [Consensus: Bridging Theory and Practice](https://github.com/ongardie/dissertation). PhD Thesis, Stanford University, August 2014. Archived at [perma.cc/5VTZ-2ADH](https://perma.cc/5VTZ-2ADH) 
[^66]: Heidi Howard, Malte Schwarzkopf, Anil Madhavapeddy, and Jon Crowcroft. [Raft Refloated: Do We Have Consensus?](https://www.cl.cam.ac.uk/research/srg/netos/papers/2015-raftrefloated-osr.pdf) *ACM SIGOPS Operating Systems Review*, volume 49, issue 1, pages 12–21, January 2015. [doi:10.1145/2723872.2723876](https://doi.org/10.1145/2723872.2723876) 
[^67]: André Medeiros. [ZooKeeper’s Atomic Broadcast Protocol: Theory and Practice](http://www.tcs.hut.fi/Studies/T-79.5001/reports/2012-deSouzaMedeiros.pdf). Aalto University School of Science, March 2012. Archived at [perma.cc/FVL4-JMVA](https://perma.cc/FVL4-JMVA) 
[^68]: Robbert van Renesse, Nicolas Schiper, and Fred B. Schneider. [Vive La Différence: Paxos vs. Viewstamped Replication vs. Zab](https://arxiv.org/abs/1309.5671). *IEEE Transactions on Dependable and Secure Computing*, volume 12, issue 4, pages 472–484, September 2014. [doi:10.1109/TDSC.2014.2355848](https://doi.org/10.1109/TDSC.2014.2355848) 
[^69]: Heidi Howard and Richard Mortier. [Paxos vs Raft: Have we reached consensus on distributed consensus?](https://arxiv.org/abs/2004.05074) At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393681](https://doi.org/10.1145/3380787.3393681) 
[^70]: Miguel Castro and Barbara H. Liskov. [Practical Byzantine Fault Tolerance and Proactive Recovery](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/01/p398-castro-bft-tocs.pdf). *ACM Transactions on Computer Systems*, volume 20, issue 4, pages 396–461, November 2002. [doi:10.1145/571637.571640](https://doi.org/10.1145/571637.571640) 
[^71]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458) 
[^72]: Michael J. Fischer, Nancy Lynch, and Michael S. Paterson. [Impossibility of Distributed Consensus with One Faulty Process](https://groups.csail.mit.edu/tds/papers/Lynch/jacm85.pdf). *Journal of the ACM*, volume 32, issue 2, pages 374–382, April 1985. [doi:10.1145/3149.214121](https://doi.org/10.1145/3149.214121) 
[^73]: Tushar Deepak Chandra and Sam Toueg. [Unreliable Failure Detectors for Reliable Distributed Systems](https://courses.csail.mit.edu/6.852/08/papers/CT96-JACM.pdf). *Journal of the ACM*, volume 43, issue 2, pages 225–267, March 1996. [doi:10.1145/226643.226647](https://doi.org/10.1145/226643.226647) 
[^74]: Michael Ben-Or. [Another Advantage of Free Choice: Completely Asynchronous Agreement Protocols](https://homepage.cs.uiowa.edu/~ghosh/BenOr.pdf). At *2nd ACM Symposium on Principles of Distributed Computing* (PODC), August 1983. [doi:10.1145/800221.806707](https://doi.org/10.1145/800221.806707) 
[^75]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283) 
[^76]: Xavier Défago, André Schiper, and Péter Urbán. [Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey](https://dspace.jaist.ac.jp/dspace/bitstream/10119/4883/1/defago_et_al.pdf). *ACM Computing Surveys*, volume 36, issue 4, pages 372–421, December 2004. [doi:10.1145/1041680.1041682](https://doi.org/10.1145/1041680.1041682) 
[^77]: Hagit Attiya and Jennifer Welch. *Distributed Computing: Fundamentals, Simulations and Advanced Topics*, 2nd edition. John Wiley & Sons, 2004. ISBN: 978-0-471-45324-6, [doi:10.1002/0471478210](https://doi.org/10.1002/0471478210) 
[^78]: Rachid Guerraoui. [Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus](https://citeseerx.ist.psu.edu/pdf/5d06489503b6f791aa56d2d7942359c2592e44b0). At *9th International Workshop on Distributed Algorithms* (WDAG), September 1995. [doi:10.1007/BFb0022140](https://doi.org/10.1007/BFb0022140) 
[^79]: Jim N. Gray and Leslie Lamport. [Consensus on Transaction Commit](https://dsf.berkeley.edu/cs286/papers/paxoscommit-tods2006.pdf). *ACM Transactions on Database Systems* (TODS), volume 31, issue 1, pages 133–160, March 2006. [doi:10.1145/1132863.1132867](https://doi.org/10.1145/1132863.1132867) 
[^80]: Fred B. Schneider. [Implementing Fault-Tolerant Services Using the State Machine Approach: A Tutorial](https://www.cs.cornell.edu/fbs/publications/SMSurvey.pdf). *ACM Computing Surveys*, volume 22, issue 4, pages 299–319, December 1990. [doi:10.1145/98163.98167](https://doi.org/10.1145/98163.98167) 
[^81]: Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, Kun Ren, Philip Shao, and Daniel J. Abadi. [Calvin: Fast Distributed Transactions for Partitioned Database Systems](https://cs.yale.edu/homes/thomson/publications/calvin-sigmod12.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2012. [doi:10.1145/2213836.2213838](https://doi.org/10.1145/2213836.2213838) 
[^82]: Mahesh Balakrishnan, Dahlia Malkhi, Ted Wobber, Ming Wu, Vijayan Prabhakaran, Michael Wei, John D. Davis, Sriram Rao, Tao Zou, and Aviad Zuck. [Tango: Distributed Data Structures over a Shared Log](https://www.microsoft.com/en-us/research/publication/tango-distributed-data-structures-over-a-shared-log/). At *24th ACM Symposium on Operating Systems Principles* (SOSP), November 2013. [doi:10.1145/2517349.2522732](https://doi.org/10.1145/2517349.2522732) 
[^83]: Mahesh Balakrishnan, Dahlia Malkhi, Vijayan Prabhakaran, Ted Wobber, Michael Wei, and John D. Davis. [CORFU: A Shared Log Design for Flash Clusters](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final30.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012. 
[^84]: Vasilis Gavrielatos, Antonios Katsarakis, and Vijay Nagarajan. [Odyssey: the impact of modern hardware on strongly-consistent replication protocols](https://vasigavr1.github.io/files/Odyssey_Eurosys_2021.pdf). At *16th European Conference on Computer Systems* (EuroSys), April 2021. [doi:10.1145/3447786.3456240](https://doi.org/10.1145/3447786.3456240) 
[^85]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/opus/volltexte/2017/7094/pdf/LIPIcs-OPODIS-2016-25.pdf). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25) 
[^86]: Martin Kleppmann. [Distributed Systems lecture notes](https://www.cl.cam.ac.uk/teaching/2425/ConcDisSys/dist-sys-notes.pdf). *University of Cambridge*, October 2024. Archived at [perma.cc/SS3Q-FNS5](https://perma.cc/SS3Q-FNS5) 
[^87]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch 1.5.0](https://aphyr.com/posts/323-call-me-maybe-elasticsearch-1-5-0). *aphyr.com*, April 2015. Archived at [perma.cc/37MZ-JT7H](https://perma.cc/37MZ-JT7H) 
[^88]: Heidi Howard and Jon Crowcroft. [Coracle: Evaluating Consensus at the Internet Edge](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p85.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2829988.2790010](https://doi.org/10.1145/2829988.2790010) 
[^89]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY) 
[^90]: Ivan Kelly. [BookKeeper Tutorial](https://github.com/ivankelly/bookkeeper-tutorial). *github.com*, October 2014. Archived at [perma.cc/37Y6-VZWU](https://perma.cc/37Y6-VZWU) 
[^91]: Jack Vanlightly. [Apache BookKeeper Insights Part 1 — External Consensus and Dynamic Membership](https://medium.com/splunk-maas/apache-bookkeeper-insights-part-1-external-consensus-and-dynamic-membership-c259f388da21). *medium.com*, November 2021. Archived at [perma.cc/3MDB-8GFB](https://perma.cc/3MDB-8GFB)


================================================
FILE: content/en/ch11.md
================================================
---
title: "11. Batch Processing"
weight: 311
breadcrumbs: false
---

<a id="ch_batch"></a>

![](/map/ch10.png)

> *A system cannot be successful if it is too strongly influenced by a single person. Once the
> initial design is complete and fairly robust, the real test begins as people with many different
> viewpoints undertake their own experiments.*
>
> Donald Knuth

> [!TIP] A NOTE FOR EARLY RELEASE READERS
> With Early Release ebooks, you get books in their earliest form---the author's raw and unedited
> content as they write---so you can take advantage of these technologies long before the official
> release of these titles.
>
> This will be the 11th chapter of the final book. The GitHub repo for this book is
> *[https://github.com/ept/ddia2-feedback](https://github.com/ept/ddia2-feedback)*.
>
> If you'd like to be actively involved in reviewing and commenting on this draft, please reach out on
> GitHub.

Much of this book so far has talked about *requests* and *queries*, and the corresponding
*responses* or *results*. This style of data processing is assumed in many modern data systems: you
ask for something, or you send an instruction, and the system tries to give you an answer as quickly
as possible.

A web browser requesting a page, a service calling a remote API, databases, caches, search indexes,
and many other systems work this way. We call these *online systems*. Response time is usually their
primary measure of performance, and they often require fault tolerance to ensure high availability.

However, sometimes you need to run a bigger computation or process larger amounts of data than you
can do in an interactive request. Maybe you need to train an AI model, or transform lots of data
from one form into another, or compute analytics over a very large dataset. We call these tasks
*batch processing* jobs, or sometimes *offline systems*.

A batch processing job takes some input data (which is read-only), and produces some output data
(which is generated from scratch every time the job runs). It typically does not mutate data in the
way a read/write transaction would. The output is therefore *derived* from the input (as discussed
in ["Systems of Record and Derived Data"](/en/ch1#sec_introduction_derived)): if you don't like the
output, you can just delete it, adjust the job logic, and run it again. By treating inputs as
immutable and avoiding side effects (such as writing to external databases), batch jobs not only
achieve good performance but also have other benefits:

- If you introduce a bug into the code and the output is wrong or corrupted, you can simply roll
  back to a previous version of the code and rerun the job, and the output will be correct again.
  Or, even simpler, you can keep the old output in a different directory and simply switch back to
  it. Most object stores and open table formats (see ["Cloud Data
  Warehouses"](/en/ch4#sec_cloud_data_warehouses)) support this feature, which is known as *time
  travel*. Most databases with read-write transactions do not have this property: if you deploy
  buggy code that writes bad data to the database, then rolling back the code will do nothing to fix
  the data in the database. The idea of being able to recover from buggy code has been called *human
  fault tolerance* [^1].

- As a consequence of this ease of rolling back, feature development can proceed more quickly than
  in an environment where mistakes could mean irreversible damage. This principle of *minimizing
  irreversibility* is beneficial for Agile software development [^2].

- The same set of files can be used as input for various different jobs, including monitoring jobs
  that calculate metrics and evaluate whether a job's output has the expected characteristics (for
  example, by comparing it to the output from the previous run and measuring discrepancies).

- Batch processing frameworks make efficient use of computing resources. Even though it's possible
  to batch process data using online data systems such as OLTP databases and application servers,
  doing so can be much more expensive in terms of the resources required.

Batch data processing also presents challenges. With most frameworks, output can only be processed
by other jobs after the whole job finishes. Batch processing can also be inefficient: any change to
input data---even a single byte---means the batch job must reprocess the entire input dataset.
Despite these limitations, batch processing has proven useful in a wide range of use cases, which
we'll revisit in ["Batch Use Cases"](/en/ch11#sec_batch_output).

A batch job may take a long time to run: minutes, hours, or even days. Jobs may be scheduled to run
periodically (for example, once per day). The primary measure of performance is usually throughput:
how much data the job can process per unit time. Some batch systems handle faults by simply aborting
and restarting the whole job, while others have fault tolerance so that a job can complete
successfully despite some of its nodes crashing.

> [!NOTE]
> An alternative to batch processing is *stream processing*, in which the job doesn't finish running
> when it has processed the input, but instead continues watching the input and processes changes in
> the input shortly after they happen. We will turn to stream processing in
> [Chapter 12](/en/ch12#ch_stream).

The boundary between online and batch processing systems is not always clear: a long-running
database query looks quite like a batch process. But batch processing also has some particular
characteristics that make it a useful building block for building reliable, scalable, and
maintainable applications. For example, it often plays a role in *data integration*, i.e., composing
multiple data systems to achieve things that one system alone cannot do. ETL, as discussed in ["Data
Warehousing"](/en/ch1#sec_introduction_dwh), is an example of this.

Modern batch processing has been heavily influenced by MapReduce, a batch processing algorithm that
was published by Google in 2004 [^3], and subsequently implemented in various open source
data systems, including Hadoop, CouchDB, and MongoDB. MapReduce is a fairly low-level programming
model, and less sophisticated than the parallel query execution engines found, for example, in data
warehouses [^4], [^5]. When it was new, MapReduce was a step forward in terms of the
scale of processing that could be achieved on commodity hardware, but now it is largely obsolete,
and no longer used at Google [^6], [^7].

Batch processing today is more often done using frameworks such as Spark or Flink, or data warehouse
query engines. Like MapReduce, they rely heavily on sharding (see [Chapter 7](/en/ch7#ch_sharding))
and parallel execution, but they have far more sophisticated caching and execution strategies. As
these systems have matured, operational concerns have been largely solved, so focus has shifted
toward usability. New processing models such as dataflow APIs, query languages, and DataFrame APIs
are now widely supported. Job and workflow orchestration has also matured. Hadoop-centric workflow
schedulers such as Oozie and Azkaban have been replaced with more generalized solutions such as
Airflow, Dagster, and Prefect, which support a wide array of batch processing frameworks and cloud
data warehouses.

Cloud computing has grown ubiquitous. Batch storage layers are shifting from distributed filesystems
(DFSs) like HDFS, GlusterFS, and CephFS to object storage systems such as S3. Scalable cloud data
warehouses like BigQuery and Snowflake are blurring the line between data warehouses and batch
processing.

To build an intuition of what batch processing is about, we will start this chapter with an example
that uses standard Unix tools on a single machine. We will then investigate how we can extend data
processing to multiple machines in a distributed system. We will see that, much like an operating
system, distributed batch processing frameworks have a scheduler and a filesystem. We will then
explore various processing models that we use to write batch jobs. Finally, we discuss common batch
processing use cases.

## Batch Processing with Unix Tools {#sec_batch_unix}

Say you have a web server that appends a line to a log file every time it serves a request. For
example, using the nginx default access log format, one line of the log might look like this:

    216.58.210.78 - - [27/Jun/2025:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
    200 3377 "https://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
    10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

(That is actually one line; it's only broken onto multiple lines here for readability.) There's a
lot of information in that line. In order to interpret it, you need to look at the definition of the
log format, which is as follows:

    $remote_addr - $remote_user [$time_local] "$request"
    $status $body_bytes_sent "$http_referer" "$http_user_agent"

So, this one line of the log indicates that on June 27, 2025, at 17:55:11 UTC, the server received a
request for the file */css/typography.css* from the client IP address 216.58.210.78. The user was
not authenticated, so `$remote_user` is set to a hyphen (`-`). The response status was 200 (i.e.,
the request was successful), and the response was 3,377 bytes in size. The web browser was Chrome
137, and it loaded the file because it was referenced in the page at the URL
*[https://martin.kleppmann.com/](https://martin.kleppmann.com/)*.

Though log parsing might seem contrived, it's actually a critical part of many modern technology
companies, and is used for everything from ad pipelines to payment processing. Indeed, it was a
driving force behind the rapid adoption of MapReduce and the "big data" movement.

### Simple Log Analysis {#sec_batch_log_analysis}

Various tools can take these log files and produce pretty reports about your website traffic, but
for the sake of exercise, let's build our own, using basic Unix tools. For example, say you want to
find the five most popular pages on your website. You can do this in a Unix shell as follows:

``` bash
cat /var/log/nginx/access.log | #1
  awk '{print $7}' | #2
  sort             | #3
  uniq -c          | #4
  sort -r -n       | #5
  head -n 5          #6
```

1.  Read the log file. (Strictly speaking, `cat` is unnecessary here, as the input file could be
    given directly as an argument to `awk`. However, the linear pipeline is more apparent when
    written like this.)

2.  Split each line into fields by whitespace, and output only the seventh such field from each
    line, which happens to be the requested URL. In our example line, this request URL is
    */css/typography.css*.

3.  Alphabetically `sort` the list of requested URLs. If some URL has been requested *n* times, then
    after sorting, the file contains the same URL repeated *n* times in a row.

4.  The `uniq` command filters out repeated lines in its input by checking whether two adjacent
    lines are the same. The `-c` option tells it to also output a counter: for every distinct URL,
    it reports how many times that URL appeared in the input.

5.  The second `sort` sorts by the number (`-n`) at the start of each line, which is the number of
    times the URL was requested. It then returns the results in reverse (`-r`) order, i.e. with the
    largest number first.

6.  Finally, `head` outputs just the first five lines (`-n 5`) of input, and discards the rest.

The output of that series of commands looks something like this:

    4189 /favicon.ico
    3631 /2016/02/08/how-to-do-distributed-locking.html
    2124 /2020/11/18/distributed-systems-and-elliptic-curves.html
    1369 /
     915 /css/typography.css

Although the preceding command line likely looks a bit obscure if you're unfamiliar with Unix tools,
it is incredibly powerful. It will process gigabytes of log files in a matter of seconds, and you
can easily modify the analysis to suit your needs. For example, if you want to omit CSS files from
the report, change the `awk` argument to `'$7 !~ /\.css$/ {print $7}'`. If you want to count top
client IP addresses instead of top pages, change the `awk` argument to `'{print $1}'`. And so on.

We don't have space in this book to explore Unix tools in detail, but they are very much worth
learning about. Surprisingly many data analyses can be done in a few minutes using some combination
of `awk`, `sed`, `grep`, `sort`, `uniq`, and `xargs`, and they perform surprisingly well
[^8].

### Chain of Commands Versus Custom Program {#sec_batch_custom_program}

Instead of the chain of Unix commands, you could write a simple program to do the same thing. For
example, in Python, it might look something like this:

``` python
from collections import defaultdict

counts = defaultdict(int) #1

with open('/var/log/nginx/access.log', 'r') as file:
    for line in file:
        url = line.split()[6] #2
        counts[url] += 1 #3

top5 = sorted(((count, url) for url, count in counts.items()), reverse=True)[:5] #4

for count, url in top5:  #5
    print(f"{count} {url}")
```

1.  `counts` is a hash table that keeps a counter for the number of times we've seen each URL. A
    counter is zero by default.

2.  From each line of the log, we take the URL to be the seventh whitespace-separated field (the
    array index is 6 because Python's arrays are zero-indexed).

3.  Increment the counter for the URL in the current line of the log.

4.  Sort the hash table contents by counter value (descending), and take the top five entries.

5.  Print out those top five entries.

This program is not as concise as the chain of Unix pipes, but it's fairly readable, and which of
the two you prefer is partly a matter of taste. However, besides the superficial syntactic
differences between the two, there is a big difference in the execution flow, which becomes apparent
if you run this analysis on a large file.

### Sorting Versus In-memory Aggregation {#id275}

The Python script keeps an in-memory hash table of URLs, where each URL is mapped to the number of
times it has been seen. The Unix pipeline example does not have such a hash table, but instead
relies on sorting a list of URLs in which multiple occurrences of the same URL are simply repeated.

Which approach is better? It depends how many different URLs you have. For most small to mid-sized
websites, you can probably fit all distinct URLs, and a counter for each URL, in (say) 1 GB of
memory. In this example, the *working set* of the job (the amount of memory to which the job needs
random access) depends only on the number of distinct URLs: if there are a million log entries for a
single URL, the space required in the hash table is still just one URL plus the size of the counter.
If this working set is small enough, an in-memory hash table works fine---even on a laptop.

On the other hand, if the job's working set is larger than the available memory, the sorting
approach has the advantage that it can make efficient use of disks. It's the same principle as we
discussed in ["Log-Structured Storage"](/en/ch4#sec_storage_log_structured): chunks of data can be
sorted in memory and written out to disk as segment files, and then multiple sorted segments can be
merged into a larger sorted file. Mergesort has sequential access patterns that perform well on
disks (see ["Sequential Versus Random Writes on SSDs"](/en/ch4#sidebar_sequential)).

The `sort` utility in GNU Coreutils (Linux) automatically handles larger-than-memory datasets by
spilling to disk, and automatically parallelizes sorting across multiple CPU cores [^9].
This means that the simple chain of Unix commands we saw earlier easily scales to large datasets,
without running out of memory. The bottleneck is likely to be the rate at which the input file can
be read from disk.

A limitation of Unix tools is that they run only on a single machine. Datasets that are too large to
fit in memory or local disk present a problem---and that's where distributed batch processing
frameworks come in.

## Batch Processing in Distributed Systems {#sec_batch_distributed}

The machine that runs our Unix tool example has a number of components that work together to process
the log data:

- Storage devices that are accessed through the operating system's filesystem interface.

- A scheduler that determines when processes get to run, and how to allocate CPU resources to them.

- A series of Unix programs whose `stdin` and `stdout` are connected together by pipes.

These same components exist in distributed data processing frameworks. In fact, you can think of a
distributed processing framework as a distributed operating system; they have filesystems, job
schedulers, and programs that send data to each other through the filesystem or other communication
channels.

### Distributed Filesystems {#sec_batch_dfs}

The filesystem provided by your operating system is composed of several layers:

- At the lowest level, block device drivers speak directly to the disk, and allow the layers above
  to read and write raw blocks.

- Above the block layer sits a page cache that keeps recently accessed blocks in memory for faster
  access.

- The block API is wrapped in a filesystem layer that breaks up large files into blocks, and tracks
  file metadata such as inodes, directories, and files. ext4 and XFS are two common implementations
  on Linux, for example.

- Finally, the operating system exposes different filesystems to applications through a common API
  called the virtual file system (VFS). The VFS is what allows applications to read and write in a
  standard way regardless of the underlying filesystem.

Distributed filesystems work in much the same way. Files are broken up into blocks, which are
distributed across many machines. DFS blocks are typically much larger than local blocks: HDFS
(Hadoop Distributed File System) defaults to 128MB, while JuiceFS and many object stores use 4MB
blocks---much larger than ext4's 4096 bytes. Larger blocks mean less metadata to keep track of,
which makes a big difference on petabyte-sized datasets. Larger blocks also lower the overhead of
seeking to a block relative to reading it.

Most physical storage devices can't write partial blocks, so operating systems require writes to use
an entire block even if the data doesn't take up the whole block. Since distributed filesystems have
larger blocks and are usually implemented on top of operating system filesystems, they don't have
this requirement. For example, a 900MB file stored with 128MB blocks would have 7 blocks that use
128MB and 1 block that uses 4MB.

DFS blocks are read by making network requests to a machine in the cluster that stores the block.
Each machine runs a daemon, exposing an API that allows remote processes to read and write blocks as
files on its local filesystem. HDFS refers to these daemons as DataNodes, while GlusterFS calls them
glusterfsd processes. We'll call them *data nodes* in this book.

Distributed filesystems also implement the distributed equivalent of a page cache. Since DFS blocks
are stored as files on data nodes, reads and writes go through each data node's operating system,
which includes an in-memory page cache. This keeps frequently read data blocks in memory on the data
nodes. Some distributed filesystems also implement more caching tiers such as the client-side and
local-disk caching found in JuiceFS.

Filesystems such as ext4 and XFS keep track of storage metadata including free space, file block
locations, directory structures, permission settings, and more. Distributed filesystems also need a
way to track file locations spread across machines, permission settings, and so on. Hadoop has a
service called the NameNode, which maintains metadata for the cluster. DeepSeek's 3FS has a metadata
service that persists its data to a key-value store such as FoundationDB.

Above the filesystem sits the VFS. A close analogue in batch processing is a distributed
filesystem's protocol. Distributed filesystems must expose a protocol or interface so that batch
processing systems can read and write files. This protocol acts as a pluggable interface: any DFS
may be used so long as it implements the protocol. For example, Amazon S3's API has been widely
adopted by other storage systems such as MinIO, Cloudflare's R2, Tigris, Backblaze's B2, and many
others. Batch processing systems with S3 support can use any of these storage systems.

Some DFSs implement POSIX-compliant filesystems that appear to the operating system's VFS like any
other filesystem. Filesystem in Userspace (FUSE) or the Network File System (NFS) protocol are often
used to integrate into the VFS. NFS is perhaps the most well known distributed filesystem protocol.
The protocol was originally developed to allow multiple clients to read and write data on a single
server. More recently, filesystems such as AWS's Elastic File System (EFS) and Archil provide
NFS-compatible distributed filesystem implementations that are far more scalable. NFS clients still
connect to one end point, but underneath, these systems communicate with distributed metadata
services and data nodes to read and write data.

> [!TIP] DISTRIBUTED FILESYSTEMS AND NETWORK STORAGE
> Distributed filesystems are based on the *shared-nothing* principle (see ["Shared-Memory,
> Shared-Disk, and Shared-Nothing
> Architecture"](/en/ch2#sec_introduction_shared_nothing)), in contrast to the
> shared-disk approach of *Network Attached Storage* (NAS) and *Storage Area Network* (SAN)
> architectures. Shared-disk storage is implemented by a centralized storage appliance, often using
> custom hardware and special network infrastructure such as Fibre Channel. On the other hand, the
> shared-nothing approach requires no special hardware, only computers connected by a conventional
> datacenter network.

Many distributed filesystems are built on commodity hardware, which is less expensive but has higher
failure rates than enterprise-grade hardware. In order to tolerate machine and disk failures, file
blocks are replicated on multiple machines. This also allows schedulers to more evenly distribute
workloads since they can execute a task on any node that contains a replica of the task's input data.
Replication may mean simply several copies of the same data on multiple machines, as in
[Chapter 6](/en/ch6#ch_replication), or an *erasure coding* scheme such as Reed--Solomon codes,
which allows lost data to be recovered with lower storage overhead than full replication
[^10], [^11], [^12]. The techniques are similar to RAID, which provides
redundancy across several disks attached to the same machine; the difference is that in a
distributed filesystem, file access and replication are done over a conventional datacenter network
without special hardware.

### Object Stores {#id277}

Object storage services such as Amazon S3, Google Cloud Storage, Azure Blob Storage, and OpenStack
Swift have become a popular alternative to distributed filesystems for batch processing jobs. In
fact, the line between the two is somewhat blurry. As we saw in the previous section and ["Databases
Backed by Object Storage"](/en/ch6#sec_replication_object_storage), Filesystem in Userspace (FUSE)
drivers allow users to treat object stores such as S3 as a filesystem. Some DFS implementations such
as JuiceFS and Ceph offer both object storage and filesystem APIs. However, their APIs, performance,
and consistency guarantees are very different. Care must be taken when adopting such systems to make
sure they behave as expected, even if they seem to implement the requisite APIs.

Each object in an object store has a URL such as `s3://my-photo-bucket/2025/04/01/birthday.png`. The
host portion of the URL (`my-photo-bucket`) describes the bucket where objects are stored, and the
part that follows is the object's *key* (`/2025/04/01/birthday.png` in our example). A bucket has a
globally unique name, and each object's key must be unique within its bucket.

Objects are read using a `get` call and written using a `put` call. Unlike files on a filesystem,
objects are immutable once written. To update an object, it must be fully rewritten using a `put`
call, similarly to a key-value store. Azure Blob Storage and S3 Express One Zone support appends,
but most other stores do not. There are no file handle APIs with functions like `fopen` and `fseek`.

Objects may look as if they are organized into directories, which is somewhat confusing, since
object stores do not have the concept of directories. The path structure is simply a convention, and
the slashes are a part of the object's key. This convention allows you to perform something similar
to a directory listing by requesting a list of objects with a particular prefix. However, listing
objects by prefix is different from a filesystem directory listing in two ways:

- A prefix `list` operation behaves like a recursive `ls -R` call on a Unix system: it returns all
  objects that start with the prefix---objects in subpaths are included.

- Empty directories are not possible: if you were to remove all objects underneath
  `s3://my-photo-bucket/2025/04/01`, then `01` would no longer appear when we call `list` on
  `s3://my-photo-bucket/2025/04`. It is a common practice to create a zero-byte object as a way to
  represent an empty directory (e.g. creating an empty `s3://my-photo-bucket/2025/04/01` file to
  keep it present when all child objects are deleted).

DFS implementations often support many common filesystem operations such as hard links, symbolic
links, file locking, and atomic renames. Such features are missing from object stores. Linking and
locks are typically not supported, while renames are non-atomic; they're accomplished by copying the
object to the new key, and then deleting the old object. If you want to rename a directory, you have
to individually rename every object within it, since the directory name is a part of the key.

The key-value stores we discussed in [Chapter 4](/en/ch4#ch_storage) are optimized for small values
(typically kilobytes) and frequent, low-latency reads/writes. In contrast, distributed filesystems
and object stores are generally optimized for large objects (megabytes to gigabytes) and less
frequent, larger reads. Recently, however, object stores have begun to add support for frequent and
smaller reads/writes. For example, S3 Express One Zone now offers single-millisecond latency and a
pricing model that is more similar to key-value stores.

Another difference between distributed filesystems and object stores is that DFSs such as HDFS
allow computing tasks to be run on the machine that stores a copy of a particular file. This allows
the task to read that file without having to send it over the network, which saves bandwidth if the
executable code of the task is smaller than the file it needs to read. On the other hand, object
stores usually keep storage and computation separate. Doing so might use more bandwidth, but modern
datacenter networks are very fast, so this is often acceptable. This architecture also allows
machine resources such as CPU and memory to be scaled independently of storage since the two are
decoupled.

### Distributed Job Orchestration {#id278}

Our operating system analogy also applies to job orchestration. When you execute a Unix batch job,
something needs to actually run the `awk`, `sort`, `uniq`, and `head` processes. Data needs to be
transferred from one process's output to another process's input, memory must be allocated for each
process, instructions from each process must be scheduled fairly and executed on the CPU, memory and
I/O boundaries must be enforced, and so on. On a single machine, an operating system's kernel is
responsible for such work. In a distributed environment, this is the role of a job orchestrator.

Batch processing frameworks send a request to an orchestrator's scheduler to run a job. Requests to
start a job contain metadata such as:

- the number of tasks to execute,

- the amount of memory, CPU, and disk needed for each task,

- a job identifier,

- access credentials,

- job parameters such as input and output data,

- required hardware details such as GPUs or disk types, and

- where the job's executable code is located.

Orchestrators such as Kubernetes and Hadoop YARN (Yet Another Resource Negotiator) [^13]
combine this information with cluster metadata to execute the job using the following components:

Task executors

:   An executor daemon such as YARN's *NodeManager* or Kubernetes's *kubelet* runs on each node in
    the cluster. Executors are responsible for running job tasks, sending heartbeats to signal their
    liveness, and tracking task status and resource allocation on the node. When a task-start
    request is sent to an executor, it retrieves the job's executable code and runs a command to
    start the task. The executor then monitors the process until it finishes or fails, at which
    point it updates the task status metadata accordingly.

    Many executors also work with the operating system to provide both security and performance
    isolation. YARN and Kubernetes both use Linux *cgroups*, for example. This prevents tasks from
    accessing data without permission, or from negatively affecting the performance of other tasks
    on the node by using excessive resources.

Resource Manager

:   An orchestrator's resource manager stores metadata about each node, including available hardware
    (CPUs, GPUs, memory, disks, and so on), task statuses, network location, node status, and other
    relevant information. Thus, the manager provides a global view of the cluster's current state.
    The centralized nature of the resource manager can lead to both scalability and availability
    bottlenecks. YARN uses ZooKeeper and Kubernetes uses etcd to store cluster state (see
    ["Coordination Services"](/en/ch10#sec_consistency_coordination)).

Scheduler

:   Orchestrators usually have a centralized scheduler subsystem, which receives requests to start,
    stop, or check on the status of a job. For example, a scheduler might receive a request to start
    a job with 10 tasks using a specific Docker image on nodes that have a specific type of GPU. The
    scheduler uses the information from the request and state of the resource manager to determine
    which tasks to run on which nodes. The task executors are then informed of their assigned work
    and begin execution.

Though each orchestrator uses different terminology, you will find these components in nearly all
orchestration systems.

> [!NOTE]
> Scheduling decisions sometimes require application-specific schedulers that can take into account
> particular requirements, such as auto-scaling read replicas when a certain query threshold is
> reached. The centralized scheduler and application-specific schedulers work together to determine
> how to best execute tasks. YARN refers to its sub-schedulers as *ApplicationMasters*, while
> Kubernetes calls them *operators*.

#### Resource Allocation {#id279}

Schedulers have a particularly challenging role in job orchestration: they must figure out how to
best allocate the cluster's limited resources amongst jobs with competing needs. Fundamentally, their
decisions must balance fairness and efficiency.

Imagine a small cluster with five nodes that has a total of 160 CPU cores available. The cluster's
scheduler receives two job requests, each wanting 100 cores to complete its work. What's the best
way to schedule the workload?

- The scheduler could decide to run 80 tasks for each job, starting the remaining 20 tasks for each
  job as earlier tasks complete.

- The scheduler could run all of one job's tasks, and begin running the second job's tasks only when
  100 cores are available, a strategy known as *gang scheduling*.

- One job request comes before the other. The scheduler has to decide whether to allocate all 100
  cores to that job, or hold some back in anticipation of future jobs.

This is a very simple example, but we already see many difficult trade-offs. In the gang-scheduling
scenario, for example, if the scheduler reserves CPU cores until all 100 are available at the same
time, nodes will sit idle. The cluster's resource utilization will drop and a deadlock might occur
if other jobs also attempt to reserve CPU cores.

On the other hand, if the scheduler simply waits for 100 cores to become available, other jobs might
grab the cores in the meantime. The cluster might not have 100 cores available for a very long time,
which leads to *starvation*. The scheduler could decide to *preempt* some of the first job's tasks,
killing them to make room for the second job. Task preemption decreases cluster efficiency as well,
since the killed tasks will need to be restarted later and re-run.

Now imagine a scheduler that must make allocation decisions for hundreds or even millions of such
job requests. Finding an optimal solution seems intractable. In fact, the problem is *NP-hard*,
which means that it is prohibitively slow to calculate an optimal solution for all but the smallest
examples [^14], [^15].

In practice, schedulers therefore use heuristics to make non-optimal but reasonable decisions.
Several algorithms are commonly used, including first-in first-out (FIFO), dominant resource
fairness (DRF), priority queues, capacity or quota-based scheduling, and various bin-packing
algorithms. The details for such algorithms are beyond the scope of this book, but they're a
fascinating area of research.

#### Scheduling Workflows {#sec_batch_workflows}

The Unix tools example at the start of this chapter involved a chain of several commands, connected
by Unix pipes. The same pattern arises in distributed batch processes: often the output from one job
needs to become the input to one or more other jobs, and each job may have several inputs that are
produced by other jobs. This is called a *workflow* or *directed acyclic graph (DAG)* of jobs.

> [!NOTE]
> In ["Durable Execution and Workflows"](/en/ch5#sec_encoding_dataflow_workflows) we
> saw workflow engines that offer durable execution of a sequence of steps, typically performing RPCs.
> In the context of batch processing, "workflow" has a different meaning: it's a sequence of batch
> processes, each taking input data and producing output data, but normally not making RPCs to
> external services. Durable execution engines typically process less data per request than their
> batch processing counterparts, though the line is somewhat fuzzy.

There are several reasons why a workflow of multiple jobs might be needed:

- If the output of one job needs to become the input to several other jobs, which are maintained by
  different teams, it's best for the first job to first write its output to a location where all the
  other jobs can read it. Those consuming jobs can then be scheduled to run every time that data has
  been updated, or on some other schedule.

- You might want to transfer data from one processing tool to another. For example, a Spark job
  might output its data to HDFS, then a Python script might trigger a Trino SQL query (see ["Cloud
  Data Warehouses"](/en/ch4#sec_cloud_data_warehouses)) that does further processing on the HDFS
  files and outputs to S3.

- Some data pipelines internally require multiple processing stages. For example, if one stage needs
  to shard the data by one key, and the next stage needs to shard by a different key, the first
  stage can output data sharded in the way that is required by the second stage.

In the Unix tools example, the pipe that connects the output of one command to the input of another
uses only a small in-memory buffer, and doesn't write the data into a file. If that buffer fills up,
the producing process needs to wait until the consuming process has read some data from the buffer
before it can output more---a form of backpressure. Spark, Flink, and other batch execution engines
support a similar model where the output of one task is directly passed to another task (over the
network if the tasks are running on different machines).

However, in a workflow it is more usual for one job to write its output to a distributed filesystem
or object store, and for the next job to read it from there. This decouples the jobs from each
other, allowing them to run at different times. If a job has several inputs, a workflow scheduler
typically waits until all of the jobs that produce its inputs have completed successfully before
running the job that consumes those inputs.

Schedulers found in orchestration frameworks such as YARN's ResourceManager or Spark's built-in
scheduler do not manage entire workflows; they do scheduling on a per-job basis. To handle these
dependencies between job executions, various workflow schedulers have been developed, including
Airflow, Dagster, and Prefect. Workflow schedulers have management features that are useful when
maintaining a large collection of batch jobs. Workflows consisting of 50 to 100 jobs are common in
many data pipelines, and in a large organization, many different teams may be running different jobs
or workflows that read each other's output across many different systems. Tool support is important
for managing such complex dataflows.

#### Handling Faults {#id281}

Batch jobs often run for long periods of time. Long-running jobs with many parallel tasks are likely
to experience at least one task failure along the way. As discussed in ["Hardware and Software
Faults"](/en/ch2#sec_introduction_hardware_faults) and ["Unreliable
Networks"](/en/ch9#sec_distributed_networks), there are many reasons why this could happen,
including hardware faults (especially on commodity hardware), or network interruptions.

Another reason why a task might not finish running is that the scheduler may intentionally preempt
(kill) it. Preemption is particularly useful if you have multiple priority levels: low-priority
tasks that are cheaper to run, and high-priority tasks that cost more. Low-priority tasks can run
whenever there is spare computing capacity, but they run the risk of being preempted at any moment
if a higher-priority task arrives. Such cheaper, low-priority virtual machines are called *spot
instances* on Amazon EC2, *spot virtual machines* on Azure, and *preemptible instances* on Google
Cloud [^16].

Since batch processing is often used for jobs that are not time-sensitive, it is well suited for
using low-priority tasks and spot instances to reduce the cost of running jobs. Essentially, those
jobs can use spare computing resources that would otherwise be idle, and thereby increase the
utilization of the cluster. However, this also means that those tasks are more likely to be killed
by the scheduler: preemptions occur more frequently than hardware faults [^17].

Since batch jobs regenerate their output from scratch every time they are run, task failures are
easier to handle than in online systems: the system can delete the partial output from the failed
execution and schedule it to run again on another machine. It would be wasteful to rerun the entire
job due to a single task failure, though. MapReduce and its successors therefore keep the execution
of parallel tasks independent from each other, so that they can retry work at the granularity of an
individual task [^3].

Fault tolerance is trickier when the output of one task becomes the input to another task as part of
a workflow. MapReduce solves this by always writing such intermediate data back to the distributed
filesystem, and waiting for the writing task to complete successfully before allowing other tasks to
read the data. This works, even in an environment where preemption is common, but it means a lot of
writes to the DFS, which can be inefficient.

Spark keeps intermediate data in memory or "spills" to local disk, and only writes the final result
to the DFS. It also keeps track of how the intermediate data was computed, allowing Spark to
recompute it in case it is lost [^18]. Flink uses a different approach based on
periodically checkpointing a snapshot of tasks [^19]. We will return to this topic in
["Dataflow Engines"](/en/ch11#sec_batch_dataflow).

## Batch Processing Models {#id431}

We have seen how batch jobs are scheduled in a distributed environment. Let us now turn our
attention to how batch processing frameworks actually process data. The two most common models are
MapReduce and dataflow engines. Although dataflow engines have largely replaced MapReduce in
practice, it is useful to understand how MapReduce works, since it influenced many modern batch
processing frameworks.

MapReduce and dataflow engines have evolved to support multiple programming models including
low-level programmatic APIs, relational query languages, and DataFrame APIs. A variety of options
enable application engineers, analytics engineers, business analysts, and even non-technical
employees to process company data for various use cases, which we'll discuss in ["Batch Use
Cases"](/en/ch11#sec_batch_output).

### MapReduce {#sec_batch_mapreduce}

The pattern of data processing in MapReduce is very similar to the web server log analysis example
in ["Simple Log Analysis"](/en/ch11#sec_batch_log_analysis):

1.  Read a set of input files, and break it up into *records*. In the web server log example, each
    record is one line in the log (that is, `\n` is the record separator). In Hadoop's MapReduce,
    the input file is stored in a distributed filesystem like HDFS or an object store like S3.
    Various file formats are used, such as Apache Parquet (a columnar format, see ["Column-Oriented
    Storage"](/en/ch4#sec_storage_column)) or Apache Avro (a row-based format, see
    ["Avro"](/en/ch5#sec_encoding_avro)).

2.  Call the mapper function to extract a key and value from each input record. In the Unix tool
    example, the mapper function is `awk '{print $7}'`: it extracts the URL (`$7`) as the key, and
    leaves the value empty.

3.  Sort all of the key-value pairs by key. In the log example, this is done by the first `sort`
    command.

4.  Call the reducer function to iterate over the sorted key-value pairs. If there are multiple
    occurrences of the same key, the sorting has made them adjacent in the list, so it is easy to
    combine those values without having to keep a lot of state in memory. In the Unix tool example,
    the reducer is implemented by the command `uniq -c`, which counts the number of adjacent records
    with the same key.

Those four steps can be performed by one MapReduce job. Steps 2 (map) and 4 (reduce) are where you
write your custom data processing code. Step 1 (breaking files into records) is handled by the input
format parser. Step 3, the `sort` step, is implicit in MapReduce---you don't have to write it,
because the output from the mapper is always sorted before it is given to the reducer. This sorting
step is a foundational batch processing algorithm, which we'll revisit in ["Shuffling
Data"](/en/ch11#sec_shuffle).

To create a MapReduce job, you need to implement two callback functions, the mapper and reducer,
which behave as follows:

Mapper

:   The mapper is called once for every input record, and its job is to extract the key and value
    from the input record. For each input, it may generate any number of key-value pairs (including
    none). It does not keep any state from one input record to the next, so each record is handled
    independently.

Reducer

:   The MapReduce framework takes the key-value pairs produced by the mappers, collects all the
    values belonging to the same key, and calls the reducer with an iterator over that collection of
    values. The reducer can produce output records (such as the number of occurrences of the same
    URL).

In the web server log example, we had a second `sort` command in step 5, which ranked URLs by number
of requests. In MapReduce, if you need a second sorting stage, you can implement it by writing a
second MapReduce job and using the output of the first job as input to the second job. Viewed like
this, the role of the mapper is to prepare the data by putting it into a form that is suitable for
sorting, and the role of the reducer is to process the data that has been sorted.

> [!TIP] MAPREDUCE AND FUNCTIONAL PROGRAMMING
> Though MapReduce is used for batch processing, the programming model comes from functional
> programming. Lisp introduced *map* and *reduce* (or *fold*) as higher-order functions on lists, and
> they have made their way into mainstream languages such as Python, Rust, and Java. Many common data
> processing operations, including those offered by SQL, can be implemented on top of MapReduce. Both
> functions, and functional programming in general, have important properties that MapReduce benefits
> from. Map and reduce are composable, which fits nicely with data processing (as we saw in our Unix
> example). Map is also *embarrassingly parallel* (each input is processed independently), which
> simplifies MapReduce's parallel execution. For reduce, different keys can be processed in parallel.

Implementing a complex processing job using the raw MapReduce APIs is actually quite hard and
laborious---for instance, any join algorithms used by the job would need to be implemented from
scratch [^20]. MapReduce is also quite slow compared to more modern batch processors. One
reason is that its file-based I/O prevents job pipelining, i.e., processing output data in a
downstream job before the upstream job is complete.

### Dataflow Engines {#sec_batch_dataflow}

In order to fix some of MapReduce's problems, several new execution engines for distributed batch
computations were developed, the most well known of which are Spark [^18], [^21] and
Flink [^19]. There are various differences in the way they are designed, but they have one
thing in common: they handle an entire workflow as one job, rather than breaking it up into
independent subjobs.

Since they explicitly model the flow of data through several processing stages, these systems are
known as *dataflow engines*. Like MapReduce, they support a low-level API that repeatedly calls a
user-defined function to process one record at a time, but they also offer higher-level operators
such as *join* and *group by*. They parallelize work by sharding inputs, and they copy the output of
one task over the network to become the input to another task. Unlike in MapReduce, operators need
not take the strict roles of alternating map and reduce, but instead can be assembled in more
flexible ways.

These dataflow APIs generally use relational-style building blocks to express a computation: joining
datasets on the value of some field; grouping tuples by key; filtering by some condition; and
aggregating tuples by counting, summing, or other functions. Internally, these operations are
implemented using the shuffle algorithms that we discuss in the next section.

This style of processing engine is based on research systems like Dryad [^22] and Nephele
[^23], and it offers several advantages compared to the MapReduce model:

- Expensive work such as sorting need only be performed in places where it is actually required,
  rather than always happening by default between every map and reduce stage.

- When there are several operators in a row that don't change the sharding of the dataset (such as
  map or filter), they can be combined into a single task, reducing data copying overheads.

- Because all joins and data dependencies in a workflow are explicitly declared, the scheduler has
  an overview of what data is required where, so it can make locality optimizations. For example, it
  can try to place the task that consumes some data on the same machine as the task that produces
  it, so that the data can be exchanged through a shared memory buffer rather than having to copy it
  over the network.

- It is usually sufficient for intermediate state between operators to be kept in memory or written
  to local disk, which requires less I/O than writing it to a distributed filesystem or object store
  (where it must be replicated to several machines and written to disk on each replica). MapReduce
  already uses this optimization for mapper output, but dataflow engines generalize the idea to all
  intermediate state.

- Operators can start executing as soon as their input is ready; there is no need to wait for the
  entire preceding stage to finish before the next one starts.

- Existing processes can be reused to run new operators, reducing startup overheads compared to
  MapReduce (which launches a new JVM for each task).

You can use dataflow engines to implement the same computations as MapReduce workflows, and they
usually execute significantly faster due to the optimizations described here.

### Shuffling Data {#sec_shuffle}

We saw that both the Unix tools example at the beginning of the chapter and MapReduce are based on
sorting. Batch processors need to be able to sort datasets petabytes in size, which are too large to
fit on a single machine. They therefore require a distributed sorting algorithm where both the input
and the output are sharded. Such an algorithm is called a *shuffle*.

> [!NOTE] SHUFFLE IS NOT RANDOM
> The term *shuffle* is confusing. When you shuffle a deck of cards, you end up with a random order.
> In contrast, the shuffle we're talking about here produces a sorted order, with no randomness.

Shuffling is a foundational algorithm for batch processors, where it is used for joins and
aggregations. MapReduce, Spark, Flink, Daft, Dataflow, and BigQuery [^24] all implement
scalable and performant shuffle algorithms in order to handle large datasets. We'll use the shuffle
in Hadoop MapReduce [^25] for illustration purposes, but the concepts in this section
translate to other systems as well.

[Figure 11-1](/en/ch11#fig_batch_mapreduce) shows the dataflow in a MapReduce job. We assume that
the input to the job is sharded, and the shards are labelled *m 1*, *m 2*, and *m 3*. For example,
each shard may be a separate file on HDFS or a separate object in an object store, and all the
shards belonging to the same dataset are grouped into the same HDFS directory or have the same key
prefix in an object store bucket.

{{< figure src="/fig/ddia_1101.png" id="fig_batch_mapreduce" caption="Figure 11-1. A MapReduce job with three mappers and three reducers." class="w-full my-4" >}}

The framework starts a separate map task for each input shard. A task reads its assigned file,
passing one record at a time to the mapper callback. The reduce side of the computation is also
sharded. While the number of map tasks is determined by the number of input shards, the number of
reduce tasks is configured by the job author (it can be different from the number of map tasks).

The output of the mapper consists of key-value pairs, and the framework needs to ensure that if two
different mappers output the same key, those key-value pairs end up being processed by the same
reducer task. To achieve this, each mapper creates a separate output file on its local disk for
every reducer (for example, the file *m 1, r 2* in [Figure 11-1](/en/ch11#fig_batch_mapreduce) is
the file created by mapper 1 containing the data destined for reducer 2). When the mapper outputs a
key-value pair, a hash of the key typically determines which reducer file it is written to
(similarly to ["Sharding by Hash of Key"](/en/ch7#sec_sharding_hash)).

While a mapper is writing these files, it also sorts the key-value pairs within each file. This can
be done using the techniques we saw in ["Log-Structured
Storage"](/en/ch4#sec_storage_log_structured): batches of key-value pairs are first collected in a
sorted data structure in memory, then written out as sorted segment files, and smaller segment files
are progressively merged into larger ones.

After each mapper finishes, reducers connect to it and copy the appropriate file of sorted key-value
pairs to their local disk. Once the reduce task has its share of the output from all of the mappers,
it merges these files together, preserving the sort order, mergesort-style. Key-value pairs with the
same key are now consecutive, even if they came from different mappers. The reducer function is then
called once per key, each time with an iterator that returns all the values for that key.

Any records output by the reducer function are sequentially written to a file, with one file per
reduce task. These files (*r 1*, *r 2*, *r 3* in [Figure 11-1](/en/ch11#fig_batch_mapreduce)) become
the shards of the job's output dataset, and they are written back to the distributed filesystem or
object store.

Though MapReduce executes the shuffle step between its map and reduce steps, modern dataflow engines
and cloud data warehouses are more sophisticated. Systems such as BigQuery have optimized their
shuffle algorithms to keep data in memory and to write data to external sorting services
[^24]. Such services speed up shuffling and replicate shuffled data to provide resilience.

#### JOIN and GROUP BY {#sec_batch_join}

Let's look at how sorted data simplifies distributed joins and aggregations. We'll continue with
MapReduce for illustration purposes, though these concepts apply to most batch processing systems.

A typical example of a join in a batch job is illustrated in
[Figure 11-2](/en/ch11#fig_batch_join_example). On the left is a log of events describing the things
that logged-in users did on a website (known as *activity events* or *clickstream data*), and on the
right is a database of users. You can think of this example as being part of a star schema (see
["Stars and Snowflakes: Schemas for Analytics"](/en/ch3#sec_datamodels_analytics)): the log of
events is the fact table, and the user database is one of the dimensions.

{{< figure src="/fig/ddia_1102.png" id="fig_batch_join_example" caption="Figure 11-2. A join between a log of user activity events and a database of user profiles." class="w-full my-4" >}}

If you want to perform an analysis of the activity events that takes into account information from
the user database (for example, find out whether certain pages are more popular with younger or
older users, using the date of birth field in the user profile), you need to compute a join between
these two tables. How would you compute that join, assuming both tables are so large that they have
to be sharded?

You can use the fact that in MapReduce, the shuffle brings together all the key-value pairs with the
same key to the same reducer, no matter which shard they were on originally. Here, the user ID can
serve as the key. You can therefore write a mapper that goes over the user activity events, and
emits page view URLs keyed by user ID, as illustrated in
[Figure 11-3](/en/ch11#fig_batch_join_reduce). Another mapper goes over the user database row by
row, extracting the user ID as the key and the user's date of birth as the value.

{{< figure src="/fig/ddia_1103.png" id="fig_batch_join_reduce" caption="Figure 11-3. A sort-merge join on user ID. If the input datasets are sharded into multiple files, each could be processed with multiple mappers in parallel." class="w-full my-4" >}}

The shuffle then ensures that a reducer function can access a particular user's date of birth and
all of that user's page view events at the same time. The MapReduce job can even arrange the records
to be sorted such that the reducer always sees the record from the user database first, followed by
the activity events in timestamp order---this technique is known as a *secondary sort*
[^25].

The reducer can then perform the actual join logic easily. The first value is expected to be the
date of birth, which the reducer stores in a local variable. It then iterates over the activity
events with the same user ID, outputting each viewed URL along with the viewer's date of birth.
Since the reducer processes all of the records for a particular user ID in one go, it only needs to
keep one user record in memory at any one time, and it never needs to make any requests over the
network. This algorithm is known as a *sort-merge join*, since mapper output is sorted by key, and
the reducers then merge together the sorted lists of records from both sides of the join.

The next MapReduce job in the workflow can then calculate the distribution of viewer ages for each
URL. To do so, the job would first shuffle the data using the URL as key. Once sorted, the reducers
would then iterate over all the page views (with viewer birth date) for a single URL, keep a counter
for the number of views by each age group, and increment the appropriate counter for each page view.
This way you can implement a *group by* operation and aggregation.

### Query Languages {#sec_batch_query_lanauges}

Over the years, execution engines for distributed batch processing have matured. By now, the
infrastructure has become robust enough to store and process many petabytes of data on clusters of
over 10,000 machines. As the problem of physically operating batch processes at such scale has been
considered more or less solved, attention has turned to improving the programming model.

MapReduce, dataflow engines, and cloud data warehouses have all embraced SQL as the lingua franca
for batch processing. It's a natural fit: legacy data warehouses used SQL, data analytics and ETL
tools already support SQL, and all developers and analysts know it.

Besides the obvious advantage of requiring less code than handwritten MapReduce jobs, these query
language interfaces also allow interactive use, in which you write analytical queries and run them
from a terminal or GUI. This style of interactive querying is an efficient and natural way for
business analysts, product managers, sales and finance teams, and others to explore data in a batch
processing environment. Though not a classic form of batch processing, SQL support has made
exploratory queries suitable for distributed batch processing systems.

High-level query languages not only make the humans using the system more productive, but they also
improve the job execution efficiency at a machine level. As we saw in ["Cloud Data
Warehouses"](/en/ch4#sec_cloud_data_warehouses), query engines are responsible for converting SQL
queries into batch jobs to be executed in a cluster. This translation step from query to syntax tree
to physical operators allows the engine to optimize queries. Query engines such as Hive, Trino,
Spark, and Flink have cost-based query optimizers that can analyze the properties of join inputs and
automatically decide which algorithm would be most suitable for the task at hand. Optimizers might
even change the order of joins so that the amount of intermediate state is minimized [^19],
[^26], [^27], [^28].

While SQL is the most popular general-purpose batch processing query language, other languages
remain in use for niche use cases. Apache Pig was a language based on relational operators that
allowed data pipelines to be specified step by step, rather than as one big SQL query. DataFrames
(see next section) have similar characteristics, and Morel is a more modern language influenced by
Pig. Other users have adopted JSON query languages such as jq, JMESPath, or JSONPath.

In ["Graph-Like Data Models"](/en/ch3#sec_datamodels_graph) we discussed using graphs for modeling
data, and using graph query languages to traverse the edges and vertices in a graph. Many graph
processing frameworks also support batch computation through query languages such as Apache
TinkerPop's Gremlin. We will look at graph processing use cases in more detail in ["Batch Use
Cases"](/en/ch11#sec_batch_output).

> [!TIP] BATCH PROCESSING AND CLOUD DATA WAREHOUSES CONVERGE
> Historically, data warehouses ran on specialized hardware appliances, and provided SQL analytics
> queries over relational data. In contrast, batch processing frameworks like MapReduce set out to
> provide greater scalability and greater flexibility by supporting processing logic written in a
> general-purpose programming language, allowing it to read and write arbitrary data formats.
>
> Over time, the two have become much more similar. Modern batch processing frameworks now support SQL
> as a language for writing batch jobs, and they achieve good performance on relational queries by
> using columnar storage formats such as Parquet and optimized query execution engines (see ["Query
> Execution: Compilation and Vectorization"](/en/ch4#sec_storage_vectorized)).
> Meanwhile, data warehouses have grown more scalable by moving to the cloud (see ["Cloud Data
> Warehouses"](/en/ch4#sec_cloud_data_warehouses)), and implementing many of the
> same scheduling, fault tolerance, and shuffling techniques that distributed batch frameworks do.
> Many use distributed filesystems as well.
>
> Just as batch processing systems adopted SQL as a processing model, cloud warehouses have adopted
> alternative processing models such as DataFrames as well (discussed in the next section). For
> example, Google Cloud BigQuery offers a BigQuery DataFrames library and Snowflake's Snowpark
> integrates with Pandas. Batch processing workflow orchestrators such as Airflow, Prefect, and
> Dagster also integrate with cloud warehouses.
>
> Not all batch jobs are easily expressed in SQL, though. Iterative graph algorithms such as PageRank,
> complex machine learning, and many other tasks are difficult to express in SQL. AI data processing,
> which includes non-relational and multi-modal data such as images, video, and audio, can also be
> difficult to do in SQL.
>
> Moreover, cloud data warehouses struggle with certain workloads. Row-by-row computation is less
> efficient when using column-oriented storage formats. Alternative warehouse APIs or a batch
> processing system are preferable in such cases. Cloud data warehouses also tend to be more expensive
> than other batch processing systems. It can be more cost-efficient to run large jobs in batch
> processing systems such as Spark or Flink instead.
>
> Ultimately, the decision between processing data in batch systems or data warehouses comes down to
> factors such as cost, convenience, ease of implementation, availability, and so on. Most large
> enterprises have many data processing systems, which give them flexibility in this decision. Smaller
> companies often get by with just one.

### DataFrames {#id287}

As data scientists and statisticians began using distributed batch processing frameworks for machine
learning use cases, they found existing processing models cumbersome, as they were used to working
with the DataFrame data model found in R and Pandas (see ["DataFrames, Matrices, and
Arrays"](/en/ch3#sec_datamodels_dataframes)). A DataFrame is similar to a table in a relational
database: it is a collection of rows, and all the values in the same column have the same type.
Instead of writing one big SQL query, users call functions corresponding to relational operators to
perform filters, joins, sorting, group by, and other operations.

Originally, DataFrame manipulation typically occurred locally, in memory. Consequently, DataFrames
were limited to datasets that fit on a single machine. Data scientists wanted to interact with the
large datasets found in batch processing environments using the DataFrame APIs they were used to.
Distributed data processing frameworks such as Spark, Flink, and Daft have adopted DataFrame APIs to
meet this need. On the other hand, local DataFrames are usually indexed and ordered while
distributed DataFrames are generally not [^29]. This can lead to performance surprises
when migrating to batch frameworks.

DataFrame APIs appear similar to dataflow APIs, but implementations vary. While Pandas executes
operations immediately when the DataFrame methods are called, Apache Spark first translates all the
DataFrame API calls into a query plan and runs query optimization before executing the workflow on
top of its distributed dataflow engine. This allows it to improve performance.

Frameworks such as Daft even support both client and server-side computation. Smaller, in-memory
operations are executed on the client while larger datasets and computation are executed on the
server. Columnar storage formats such as Apache Arrow offer a unified data model that both client
and server-side execution engines can share.

## Batch Use Cases {#sec_batch_output}

Now that we've seen how batch processing works, let's see how it is applied to a range of different
applications. Batch jobs are excellent for processing large datasets in bulk, but they aren't good
for low latency use cases. Consequently, you'll find batch jobs wherever there's a lot of data and
data freshness isn't important. This might sound limiting, but it turns out that a significant
amount of data processing fits this model:

- Accounting and inventory reconciliation, where companies verify that transactions line up with
  their bank accounts and inventory, are often done in batch [^30].

- In manufacturing, demand forecasting is computed in periodic batch jobs [^31].

- Ecommerce, media, and social media companies train their recommendation models using batch jobs
  [^32], [^33].

- Many financial systems are batch-based, as well. For example, the United States's banking network
  runs almost entirely on batch jobs [^34].

In the following sections, we'll discuss some of the batch processing use cases you'll find in
nearly every industry.

### Extract--Transform--Load (ETL) {#sec_batch_etl_usage}

["Data Warehousing"](/en/ch1#sec_introduction_dwh) introduced the idea of ETL and ELT, where a data
processing pipeline extracts data from a production database, transforms it, and loads results into
a downstream system (we'll use "ETL" in this section to represent both ETL and ELT workloads). Batch
jobs are often used for such workloads, especially when the downstream system is a data warehouse.

The parallel nature of batch jobs makes them a great fit for data transformation. Much of data
transformation involves "embarrassingly parallel" workloads. Filtering data, projecting fields, and
many other common data warehouse transformations can all be done in parallel.

Batch processing environments also come with robust workflow schedulers, which make it easy to
schedule, orchestrate, and debug ETL data pipeline jobs. When a failure occurs, schedulers often
retry jobs to mitigate transient issues that might occur. A job that fails repeatedly will be marked
as failed, which helps developers easily see which job in their data pipeline stopped working.
Schedulers like Airflow even come with built-in source, sink, and query operators for MySQL,
PostgreSQL, Snowflake, Spark, Flink, and dozens of other popular systems. A tight integration
between schedulers and data processing systems simplifies data integration.

We've also seen that batch jobs are easy to troubleshoot and fix when things go awry. This feature
is invaluable when debugging data pipelines. Failed files can be easily inspected to see what went
wrong, and ETL batch jobs can be fixed and re-run. For example, an input file might no longer
contain a field that a transformation batch job intends to use. Data engineers will see that the
field is missing, and update the transformation logic or the job that produced the input.

Data pipelines used to be managed by a single data engineering team, as it was considered unfair to
ask other teams working on product features to write and manage complex batch data pipelines.
Recently, improvements in batch processing models and metadata management have made it much easier
for engineers across an organization to contribute to and manage their own data pipelines. *Data
mesh* [^35], [^36], *data contract* [^37], and *data fabric*
[^38] practices provide standards and tools to help teams safely publish their data for
consumption by anybody in the organization.

Data pipelines and analytic queries have begun to share not only processing models, but execution
engines as well. Many batch ETL jobs now run on the same systems as the analytic queries that read
their output. It is not uncommon to see data pipeline transformations and analytic queries both run
as SparkSQL, Trino, or DuckDB queries. Such an architecture further blurs the line between
application engineering, data engineering, analytics engineering, and business analysis.

### Analytics {#sec_batch_olap}

In ["Operational Versus Analytical Systems"](/en/ch1#sec_introduction_analytics), we saw that
analytic queries (OLAP) often scan over a large number of records, performing groupings and
aggregations. It is possible to run such workloads in a batch processing system, alongside other
batch processing workloads. Analysts write SQL queries that execute atop a query engine, which reads
and writes from a distributed file system or object store. Table metadata such as table-to-file
mappings, names, and types are managed with table formats such as Apache Iceberg and catalogs such
as Unity (see ["Cloud Data Warehouses"](/en/ch4#sec_cloud_data_warehouses)). This architecture is
known as a *data lakehouse* [^39].

As with ETL, improvements in SQL query interfaces mean many organizations now use batch frameworks
such as Spark for analytics. Such query patterns come in two styles:

- Pre-aggregation queries, where data is rolled up into OLAP cubes or data marts to speed up queries
  (see ["Materialized Views and Data Cubes"](/en/ch4#sec_storage_materialized_views)).
  Pre-aggregated data is queried in the warehouse or pushed to purpose-built realtime OLAP systems
  such as Apache Druid or Apache Pinot. Pre-aggregation normally takes place at a scheduled
  interval. The workflow schedulers discussed in ["Scheduling
  Workflows"](/en/ch11#sec_batch_workflows) are used to manage these workloads.

- Ad hoc queries that users run to answer specific business questions, investigate user behavior,
  debug operational issues, and much more. Response times are important for this use case. Analysts
  run queries iteratively as they get responses and learn more about the data they're investigating.
  Batch processing frameworks with fast query execution help reduce waiting times for analysts.

SQL support enables batch processing frameworks to integrate with spreadsheets and data
visualization tools such as Tableau, Power BI, Looker, and Apache Superset. For example, Tableau
offers SparkSQL and Presto connectors, while Apache Superset supports Trino, Hive, Spark SQL,
Presto, and many other systems that ultimately execute batch jobs to query data.

### Machine Learning {#id290}

Machine learning (ML) makes frequent use of batch processing. Data scientists, ML engineers, and AI
engineers use batch processing frameworks to investigate data patterns, transform data, and train
machine learning models. Common uses include:

- Feature engineering: Raw data is filtered and transformed into data that models can be trained on.
  Predictive models often need numeric data, so engineers must transform other forms of data (such
  as text or discrete values) into the required format.

- Model training: The training data is the input to the batch process, and the weights of the
  trained model are the output.

- Batch inference: A trained model can then be used to make predictions in bulk if datasets are
  large and realtime results are not required. This includes evaluating the model's predictions on a
  test dataset.

Batch processing frameworks provide tools explicitly for these use cases. For example, Apache
Spark's MLlib and Apache Flink's FlinkML come with a wide variety of feature engineering tools,
statistical functions, and classifiers.

Machine learning applications such as recommendation engines and ranking systems also make heavy use
of graph processing (see ["Graph-Like Data Models"](/en/ch3#sec_datamodels_graph)). Many graph
algorithms are expressed by traversing one edge at a time, joining one vertex with an adjacent
vertex in order to propagate some information, and repeating until some condition is met---for
example, until there are no more edges to follow, or until some metric converges.

The *bulk synchronous parallel* (BSP) model of computation [^40] has become popular for
batch processing graphs. Among others, it is implemented by Apache Giraph [^20], Spark's
GraphX API, and Flink's Gelly API [^41]. It is also known as the *Pregel* model, as
Google's Pregel paper popularized this approach for processing graphs [^42].

Batch processing is also an integral part of large language model (LLM) data preparation and
training. Raw text input data such as websites typically reside in a DFS or object store. This data
must be pre-processed to make it suitable for training. Pre-processing steps that are well-suited
for batch processing frameworks include:

- Plain text must be extracted from HTML and malformed text must be fixed.

- Low quality, irrelevant, and duplicate documents must be detected and removed.

- Text must be tokenized (split into words) and converted into embeddings, which are numeric
  representations of each word.

Batch processing frameworks such as Kubeflow, Flyte, and Ray are purpose-built for such workloads.
OpenAI uses Ray as part of its ChatGPT training process, for example [^43]. These
frameworks have built-in integrations for LLM and AI libraries such as PyTorch, TensorFlow, XGBoost,
and many others. They also offer built-in support for feature engineering, model training, batch
inference, and fine tuning (adjusting a foundational model for specific use cases).

Finally, data scientists often experiment with data in interactive notebooks such as Jupyter or Hex.
Notebooks are made up of *cells*, which are small chunks of markdown, Python, or SQL. Cells are
executed sequentially to produce spreadsheets, graphs, or data. Many notebooks use batch processing
via DataFrame APIs or query such systems using SQL.

### Serving Derived Data {#sec_batch_serving_derived}

Batch jobs are often used to build pre-computed or derived datasets such as product recommendations,
user-facing reports, and features for machine learning models. These datasets are typically served
from a production database, key-value store, or search engine. Regardless of the system used, the
pre-computed data needs to make its way from the batch processor's distributed filesystem or object
store back into the database that's serving live traffic.

The most obvious choice might be to use the client library for your favorite database directly
within a batch job, and to write directly to the database server, one record at a time. This will
work (assuming your firewall rules allow direct access from your batch processing environment to
your production databases), but it is a bad idea for several reasons:

- Making a network request for every single record is orders of magnitude slower than the normal
  throughput of a batch task. Even if the client library supports batching, performance is likely to
  be poor.

- Batch processing frameworks often run many tasks in parallel. If all the tasks concurrently write
  to the same output database, with a rate expected of a batch process, that database can easily be
  overwhelmed, and its performance for queries is likely to suffer. This can in turn cause
  operational problems in other parts of the system [^44].

- Normally, batch jobs provide a clean all-or-nothing guarantee for job output: if a job succeeds,
  the result is the output of running every task exactly once, even if some tasks failed and had to
  be retried along the way; if the entire job fails, no output is produced. However, writing to an
  external system from inside a job produces externally visible side effects that cannot be hidden
  in this way. Thus, you have to worry about the results from partially completed jobs being visible
  to other systems. If a task fails and is restarted, it may duplicate output from the failed
  execution.

A better solution is to have batch jobs push pre-computed datasets to streams such as Kafka topics,
which we discuss further in [Chapter 12](/en/ch12#ch_stream). Search engines like Elasticsearch,
realtime OLAP systems like Apache Pinot and Apache Druid, derived datastores like Venice
[^45], and cloud data warehouses like ClickHouse all have the built-in ability to ingest
data from Kafka into their systems. Pushing data through a streaming system fixes a few of the
problems we discussed above:

- Streaming systems are optimized for sequential writes, which make them better suited for the bulk
  write workload of a batch job.

- Streaming systems can also act as a buffer between the batch job and the production databases.
  Downstream systems can throttle their read rate to ensure they can continue to comfortably serve
  production traffic.

- The output of a single batch job can be consumed by multiple downstream systems.

- Streaming systems can serve as a security boundary between batch processing environments and
  production networks: they can be deployed in a so-called DMZ (demilitarized zone) network that
  sits between the batch processing network and production network.

Pushing data through streams doesn't inherently solve the all-or-nothing guarantee issue we
discussed above. To make this work, batch jobs must send a notification to downstream systems that
their job is complete and the data can now be served. Consumers of the stream need to be able to
keep data they receive invisible to queries, like an uncommitted transaction with *read committed*
isolation (see ["Read Committed"](/en/ch8#sec_transactions_read_committed)), until they are notified
that it is complete.

Another pattern that is more common when bootstrapping databases is to build a brand-new database
*inside* the batch job and bulk load those files directly into the database from a distributed
filesystem, object store, or local filesystem. Many data systems offer bulk import tools such as
TiDB's Lightning tool, or Apache Pinot's and Apache Druid's Hadoop import jobs. RocksDB also offers
an API to bulk import SSTs from batch jobs.

Building databases in batch and bulk importing the data is very fast, and makes it easier for
systems to atomically switch between dataset versions. On the other hand, it can be challenging to
incrementally update datasets from batch jobs that build brand-new databases. It's common to take a
hybrid approach in situations where both bootstrapping and incremental loads are needed. Venice, for
example, supports hybrid stores that allow for batch row-based updates and full dataset swaps.

## Summary {#id292}

In this chapter, we explored the design and implementation of batch processing systems. We began
with the classic Unix toolchain (awk, sort, uniq, etc.), to illustrate fundamental batch processing
primitives such as sorting and counting.

We then scaled up to distributed batch processing systems. We saw that batch-style I/O processes
immutable, bounded input datasets to produce output data, allowing reruns and debugging without side
effects. To process files, we saw that batch frameworks have three main components: an orchestration
layer that determines where and when jobs run, a storage layer to persist data, and a computation
layer that processes the actual data.

We looked at how distributed filesystems and object stores manage large files through block-based
replication, caching, and metadata services, and how modern batch frameworks interact with these
systems using pluggable APIs. We also discussed how orchestrators schedule tasks, allocate
resources, and handle faults in large clusters. We also compared job orchestrators that schedule
jobs with workflow orchestrators that manage the lifecycle of a collection of jobs that run in a
dependency graph.

We surveyed batch processing models, starting with MapReduce and its canonical map and reduce
functions. Next, we turned to dataflow engines like Spark and Flink, which offer simpler-to-use
dataflow APIs and better performance. To understand how batch jobs scale, we covered the shuffle
algorithm, a foundational operation that enables grouping, joining, and aggregation.

As batch systems matured, focus shifted to usability. You learned about high-level query languages
like SQL and DataFrame APIs, which make batch jobs more accessible and easier to optimize. Query
optimizers translate declarative queries into efficient execution plans.

We finished the chapter with common batch processing use cases:

- ETL pipelines, which extract, transform, and load data between different systems using scheduled
  workflows;

- Analytics, where batch jobs support both pre-aggregated dashboards and ad hoc queries;

- Machine learning, where batch jobs prepare and process large training datasets;

- Populating production-facing systems from batch outputs, often via streams or bulk loading tools,
  in order to serve the derived data to users.

In the next chapter, we will turn to stream processing, in which the input is *unbounded*---that is,
you still have a job, but its inputs are never-ending streams of data. In this case, a job is never
complete, because at any time there may still be more work coming in. We shall see that stream and
batch processing are similar in some respects, but the assumption of unbounded streams also changes
a lot about how we build systems.

##### Footnotes

### References {#references}

[^1]: Nathan Marz. [How to Beat the CAP Theorem](http://nathanmarz.com/blog/how-to-beat-the-cap-theorem.html). *nathanmarz.com*, October 2011. Archived at [perma.cc/4BS9-R9A4](https://perma.cc/4BS9-R9A4)
[^2]: Molly Bartlett Dishman and Martin Fowler. [Agile Architecture](https://www.youtube.com/watch?v=VjKYO6DP3fo&list=PL055Epbe6d5aFJdvWNtTeg_UEHZEHdInE). At *O'Reilly Software Architecture Conference*, March 2015.
[^3]: Jeffrey Dean and Sanjay Ghemawat. [MapReduce: Simplified Data Processing on Large Clusters](https://www.usenix.org/legacy/publications/library/proceedings/osdi04/tech/full_papers/dean/dean.pdf). At *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
[^4]: Shivnath Babu and Herodotos Herodotou. [Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf). *Foundations and Trends in Databases*, volume 5, issue 1, pages 1--104, November 2013. [doi:10.1561/1900000036](https://doi.org/10.1561/1900000036)
[^5]: David J. DeWitt and Michael Stonebraker. [MapReduce: A Major Step Backwards](https://homes.cs.washington.edu/~billhowe/mapreduce_a_major_step_backwards.html). Originally published at *databasecolumn.vertica.com*, January 2008. Archived at [perma.cc/U8PA-K48V](https://perma.cc/U8PA-K48V)
[^6]: Henry Robinson. [The Elephant Was a Trojan Horse: On the Death of Map-Reduce at Google](https://www.the-paper-trail.org/post/2014-06-25-the-elephant-was-a-trojan-horse-on-the-death-of-map-reduce-at-google/). *the-paper-trail.org*, June 2014. Archived at [perma.cc/9FEM-X787](https://perma.cc/9FEM-X787)
[^7]: Urs Hölzle. [R.I.P. MapReduce. After having served us well since 2003, today we removed the remaining internal codebase for good](https://twitter.com/uhoelzle/status/1177360023976067077). *twitter.com*, September 2019. Archived at [perma.cc/B34T-LLY7](https://perma.cc/B34T-LLY7)
[^8]: Adam Drake. [Command-Line Tools Can Be 235x Faster than Your Hadoop Cluster](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html). *aadrake.com*, January 2014. Archived at [perma.cc/87SP-ZMCY](https://perma.cc/87SP-ZMCY)
[^9]: [`sort`: Sort text files](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html). GNU Coreutils 9.7 Documentation, Free Software Foundation, Inc., 2025.
[^10]: Michael Ovsiannikov, Silvius Rus, Damian Reeves, Paul Sutter, Sriram Rao, and Jim Kelly. [The Quantcast File System](https://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p808-ovsiannikov.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1092--1101, August 2013. [doi:10.14778/2536222.2536234](https://doi.org/10.14778/2536222.2536234)
[^11]: Andrew Wang, Zhe Zhang, Kai Zheng, Uma Maheswara G., and Vinayakumar B. [Introduction to HDFS Erasure Coding in Apache Hadoop](https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html). *blog.cloudera.com*, September 2015. Archived at [archive.org](https://web.archive.org/web/20250731115546/https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html)
[^12]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V)
[^13]: Vinod Kumar Vavilapalli, Arun C. Murthy, Chris Douglas, Sharad Agarwal, Mahadev Konar, Robert Evans, Thomas Graves, Jason Lowe, Hitesh Shah, Siddharth Seth, Bikas Saha, Carlo Curino, Owen O'Malley, Sanjay Radia, Benjamin Reed, and Eric Baldeschwieler. [Apache Hadoop YARN: Yet Another Resource Negotiator](https://opencourse.inf.ed.ac.uk/sites/default/files/2023-10/yarn-socc13.pdf). At *4th Annual Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523633](https://doi.org/10.1145/2523616.2523633)
[^14]: Richard M. Karp. [Reducibility Among Combinatorial Problems](https://www.cs.purdue.edu/homes/hosking/197/canon/karp.pdf). *Complexity of Computer Computations. The IBM Research Symposia Series*. Springer, 1972. [doi:10.1007/978-1-4684-2001-2_9](https://doi.org/10.1007/978-1-4684-2001-2_9)
[^15]: J. D. Ullman. [NP-Complete Scheduling Problems](https://www.cs.montana.edu/bhz/classes/fall-2018/csci460/paper4.pdf). *Journal of Computer and System Sciences*, volume 10, issue 3, June 1975. [doi:10.1016/S0022-0000(75)80008-0](https://doi.org/10.1016/S0022-0000(75)80008-0)
[^16]: Gilad David Maayan. [The complete guide to spot instances on AWS, Azure and GCP](https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/). *datacenterdynamics.com*, March 2021. Archived at [archive.org](https://web.archive.org/web/20250722114617/https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/)
[^17]: Abhishek Verma, Luis Pedrosa, Madhukar Korupolu, David Oppenheimer, Eric Tune, and John Wilkes. [Large-Scale Cluster Management at Google with Borg](https://dl.acm.org/doi/pdf/10.1145/2741948.2741964). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741964](https://doi.org/10.1145/2741948.2741964)
[^18]: Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauley, Michael J. Franklin, Scott Shenker, and Ion Stoica. [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
[^19]: Paris Carbone, Stephan Ewen, Seif Haridi, Asterios Katsifodimos, Volker Markl, and Kostas Tzoumas. [Apache Flink™: Stream and Batch Processing in a Single Engine](http://sites.computer.org/debull/A15dec/p28.pdf). *Bulletin of the IEEE Computer Society Technical Committee on Data Engineering*, volume 38, issue 4, December 2015. Archived at [perma.cc/G3N3-BKX5](https://perma.cc/G3N3-BKX5)
[^20]: Mark Grover, Ted Malaska, Jonathan Seidman, and Gwen Shapira. *[Hadoop Application Architectures](https://learning.oreilly.com/library/view/hadoop-application-architectures/9781491910313/)*. O'Reilly Media, 2015. ISBN: 978-1-491-90004-8
[^21]: Jules S. Damji, Brooke Wenig, Tathagata Das, and Denny Lee. *[Learning Spark, 2nd Edition](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/)*. O'Reilly Media, 2020. ISBN: 978-1492050049
[^22]: Michael Isard, Mihai Budiu, Yuan Yu, Andrew Birrell, and Dennis Fetterly. [Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks](https://www.microsoft.com/en-us/research/publication/dryad-distributed-data-parallel-programs-from-sequential-building-blocks/). At *2nd European Conference on Computer Systems* (EuroSys), March 2007. [doi:10.1145/1272996.1273005](https://doi.org/10.1145/1272996.1273005)
[^23]: Daniel Warneke and Odej Kao. [Nephele: Efficient Parallel Data Processing in the Cloud](https://stratosphere2.dima.tu-berlin.de/assets/papers/Nephele_09.pdf). At *2nd Workshop on Many-Task Computing on Grids and Supercomputers* (MTAGS), November 2009. [doi:10.1145/1646468.1646476](https://doi.org/10.1145/1646468.1646476)
[^24]: Hossein Ahmadi. [In-memory query execution in Google BigQuery](https://cloud.google.com/blog/products/bigquery/in-memory-query-execution-in-google-bigquery). *cloud.google.com*, August 2016. Archived at [perma.cc/DGG2-FL9W](https://perma.cc/DGG2-FL9W)
[^25]: Tom White. *[Hadoop: The Definitive Guide](https://learning.oreilly.com/library/view/hadoop-the-definitive/9781491901687/)*, 4th edition. O'Reilly Media, 2015. ISBN: 978-1-491-90163-2
[^26]: Fabian Hüske. [Peeking into Apache Flink's Engine Room](https://flink.apache.org/2015/03/13/peeking-into-apache-flinks-engine-room/). *flink.apache.org*, March 2015. Archived at [perma.cc/44BW-ALJX](https://perma.cc/44BW-ALJX)
[^27]: Mostafa Mokhtar. [Hive 0.14 Cost Based Optimizer (CBO) Technical Overview](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/). *hortonworks.com*, March 2015. Archived at [archive.org](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/)
[^28]: Michael Armbrust, Reynold S. Xin, Cheng Lian, Yin Huai, Davies Liu, Joseph K. Bradley, Xiangrui Meng, Tomer Kaftan, Michael J. Franklin, Ali Ghodsi, and Matei Zaharia. [Spark SQL: Relational Data Processing in Spark](https://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742797](https://doi.org/10.1145/2723372.2742797)
[^29]: Kaya Kupferschmidt. [Spark vs Pandas, part 2 -- Spark](https://towardsdatascience.com/spark-vs-pandas-part-2-spark-c57f8ea3a781/). *towardsdatascience.com*, October 2020. Archived at [perma.cc/5BRK-G4N5](https://perma.cc/5BRK-G4N5)
[^30]: Ammar Chalifah. [Tracking payments at scale](https://bolt.eu/en/blog/tracking-payments-at-scale). *bolt.eu*, June 2025. Archived at [perma.cc/Q4KX-8K3J](https://perma.cc/Q4KX-8K3J)
[^31]: Nafi Ahmet Turgut, Hamza Akyıldız, Hasan Burak Yel, Mehmet İkbal Özmen, Mutlu Polatcan, Pinar Baki, and Esra Kayabali. [Demand forecasting at Getir built with Amazon Forecast](https://aws.amazon.com/blogs/machine-learning/demand-forecasting-at-getir-built-with-amazon-forecast). *aws.amazon.com*, May 2023. Archived at [perma.cc/H3H6-GNL7](https://perma.cc/H3H6-GNL7)
[^32]: Jason (Siyu) Zhu. [Enhancing homepage feed relevance by harnessing the power of large corpus sparse ID embeddings](https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar). *linkedin.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20250225094424/https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar)
[^33]: Avery Ching, Sital Kedia, and Shuojie Wang. [Apache Spark \@Scale: A 60 TB+ production use case](https://engineering.fb.com/2016/08/31/core-infra/apache-spark-scale-a-60-tb-production-use-case/). *engineering.fb.com*, August 2016. Archived at [perma.cc/F7R5-YFAV](https://perma.cc/F7R5-YFAV)
[^34]: Edward Kim. [How ACH works: A developer perspective --- Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/F67P-VBLK](https://perma.cc/F67P-VBLK)
[^35]: Zhamak Dehghani. [How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh](https://martinfowler.com/articles/data-monolith-to-mesh.html). *martinfowler.com*, May 2019. Archived at [perma.cc/LN2L-L4VC](https://perma.cc/LN2L-L4VC)
[^36]: Chris Riccomini. [What the Heck is a Data Mesh?!](https://cnr.sh/essays/what-the-heck-data-mesh) *cnr.sh*, June 2021. Archived at [perma.cc/NEJ2-BAX3](https://perma.cc/NEJ2-BAX3)
[^37]: Chad Sanderson, Mark Freeman, and B. E. Schmidt. [*Data Contracts*](https://www.oreilly.com/library/view/data-contracts/9781098157623/). O'Reilly Media, 2025. ISBN: 9781098157623
[^38]: Daniel Abadi. [Data Fabric vs. Data Mesh: What's the Difference?](https://www.starburst.io/blog/data-fabric-vs-data-mesh-whats-the-difference/) *starburst.io*, November 2021. Archived at [perma.cc/RSK3-HXDK](https://perma.cc/RSK3-HXDK)
[^39]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^40]: Leslie G. Valiant. [A Bridging Model for Parallel Computation](https://dl.acm.org/doi/pdf/10.1145/79173.79181). *Communications of the ACM*, volume 33, issue 8, pages 103--111, August 1990. [doi:10.1145/79173.79181](https://doi.org/10.1145/79173.79181)
[^41]: Stephan Ewen, Kostas Tzoumas, Moritz Kaufmann, and Volker Markl. [Spinning Fast Iterative Data Flows](https://vldb.org/pvldb/vol5/p1268_stephanewen_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 11, pages 1268--1279, July 2012. [doi:10.14778/2350229.2350245](https://doi.org/10.14778/2350229.2350245)
[^42]: Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, James C. Dehnert, Ilan Horn, Naty Leiser, and Grzegorz Czajkowski. [Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2010. [doi:10.1145/1807167.1807184](https://doi.org/10.1145/1807167.1807184)
[^43]: Richard MacManus. [OpenAI Chats about Scaling LLMs at Anyscale's Ray Summit](https://thenewstack.io/openai-chats-about-scaling-llms-at-anyscales-ray-summit/). *thenewstack.io*, September 2023. Archived at [perma.cc/YJD6-KUXU](https://perma.cc/YJD6-KUXU)
[^44]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^45]: Félix GV. [Open Sourcing Venice -- LinkedIn's Derived Data Platform](https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform). *linkedin.com*, September 2022. Archived at [archive.org](https://web.archive.org/web/20250226160927/https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform)


================================================
FILE: content/en/ch12.md
================================================
---
title: "12. Stream Processing"
weight: 312
breadcrumbs: false
---

<a id="ch_stream"></a>

![](/map/ch11.png)

> *A complex system that works is invariably found to have evolved from a simple system that works.
> The inverse proposition also appears to be true: A complex system designed from scratch never
> works and cannot be made to work.*
>
> John Gall, *Systemantics* (1975)

> [!TIP] A NOTE FOR EARLY RELEASE READERS
> With Early Release ebooks, you get books in their earliest form---the author's raw and unedited
> content as they write---so you can take advantage of these technologies long before the official
> release of these titles.
>
> This will be the 12th chapter of the final book. The GitHub repo for this book is
> *[https://github.com/ept/ddia2-feedback](https://github.com/ept/ddia2-feedback)*.
>
> If you'd like to be actively involved in reviewing and commenting on this draft, please reach out on
> GitHub.

In [Chapter 11](/en/ch11#ch_batch) we discussed batch processing---techniques that read a set of
files as input and produce a new set of output files. The output is a form of *derived data*; that
is, a dataset that can be recreated by running the batch process again if necessary. We saw how this
simple but powerful idea can be used to create search indexes, recommendation systems, analytics,
and more.

However, one big assumption remained throughout [Chapter 11](/en/ch11#ch_batch): namely, that the
input is bounded---i.e., of a known and finite size---so the batch process knows when it has
finished reading its input. For example, the sorting operation that is central to MapReduce must
read its entire input before it can start producing output: it could happen that the very last input
record is the one with the lowest key, and thus needs to be the very first output record, so
starting the output early is not an option.

In reality, a lot of data is unbounded because it arrives gradually over time: your users produced
data yesterday and today, and they will continue to produce more data tomorrow. Unless you go out of
business, this process never ends, and so the dataset is never "complete" in any meaningful way
[^1]. Thus, batch processors must artificially divide the data into chunks of fixed
duration: for example, processing a day's worth of data at the end of every day, or processing an
hour's worth of data at the end of every hour.

The problem with daily batch processes is that changes in the input are only reflected in the output
a day later, which is too slow for many impatient users. To reduce the delay, we can run the
processing more frequently---say, processing a second's worth of data at the end of every
second---or even continuously, abandoning the fixed time slices entirely and simply processing every
event as it happens. That is the idea behind *stream processing*.

In general, a "stream" refers to data that is incrementally made available over time. The concept
appears in many places: in the `stdin` and `stdout` of Unix, programming languages (lazy lists)
[^2], filesystem APIs (such as Java's `FileInputStream`), TCP connections, delivering
audio and video over the internet, and so on.

In this chapter we will look at *event streams* as a data management mechanism: the unbounded,
incrementally processed counterpart to the batch data we saw in the last chapter. We will first
discuss how streams are represented, stored, and transmitted over a network. In ["Databases and
Streams"](/en/ch12#sec_stream_databases) we will investigate the relationship between streams and
databases. And finally, in ["Processing Streams"](/en/ch12#sec_stream_processing) we will explore
approaches and tools for processing those streams continually, and ways that they can be used to
build applications.

## Transmitting Event Streams {#sec_stream_transmit}

In the batch processing world, the inputs and outputs of a job are files (perhaps on a distributed
filesystem). What does the streaming equivalent look like?

When the input is a file (a sequence of bytes), the first processing step is usually to parse it
into a sequence of records. In a stream processing context, a record is more commonly known as an
*event*, but it is essentially the same thing: a small, self-contained, immutable object containing
the details of something that happened at some point in time. An event usually contains a timestamp
indicating when it happened according to a time-of-day clock (see ["Monotonic Versus Time-of-Day
Clocks"](/en/ch9#sec_distributed_monotonic_timeofday)).

For example, the thing that happened might be an action that a user took, such as viewing a page or
making a purchase. It might also originate from a machine, such as a periodic measurement from a
temperature sensor, or a CPU utilization metric. In the example of ["Batch Processing with Unix
Tools"](/en/ch11#sec_batch_unix), each line of the web server log is an event.

An event may be encoded as a text string, or JSON, or perhaps in some binary form, as discussed in
[Chapter 5](/en/ch5#ch_encoding). This encoding allows you to store an event, for example by
appending it to a file, inserting it into a relational table, or writing it to a document database.
It also allows you to send the event over the network to another node in order to process it.

In batch processing, a file is written once and then potentially read by multiple jobs. Analogously,
in streaming terminology, an event is generated once by a *producer* (also known as a *publisher* or
*sender*), and then potentially processed by multiple *consumers* (*subscribers* or *recipients*)
[^3]. In a filesystem, a filename identifies a set of related records; in a streaming
system, related events are usually grouped together into a *topic* or *stream*.

In principle, a file or database is sufficient to connect producers and consumers: a producer writes
every event that it generates to the datastore, and each consumer periodically polls the datastore
to check for events that have appeared since it last ran. This is essentially what a batch process
does when it processes a day's worth of data at the end of every day.

However, when moving toward continual processing with low delays, polling becomes expensive if the
datastore is not designed for this kind of usage. The more often you poll, the lower the percentage
of requests that return new events, and thus the higher the overheads become. Instead, it is better
for consumers to be notified when new events appear.

Databases have traditionally not supported this kind of notification mechanism very well: relational
databases commonly have *triggers*, which can react to a change (e.g., a row being inserted into a
table), but they are very limited in what they can do and have been somewhat of an afterthought in
database design [^4]. Instead, specialized tools have been developed for the purpose of
delivering event notifications.

### Messaging Systems {#sec_stream_messaging}

A common approach for notifying consumers about new events is to use a *messaging system*: a
producer sends a message containing the event, which is then pushed to consumers. We touched on
these systems previously in ["Event-Driven Architectures"](/en/ch5#sec_encoding_dataflow_msg), but
we will now go into more detail.

A direct communication channel like a Unix pipe or TCP connection between producer and consumer
would be a simple way of implementing a messaging system. However, most messaging systems expand on
this basic model. In particular, Unix pipes and TCP connect exactly one sender with one recipient,
whereas a messaging system allows multiple producer nodes to send messages to the same topic and
allows multiple consumer nodes to receive messages in a topic.

Within this *publish/subscribe* model, different systems take a wide range of approaches, and there
is no one right answer for all purposes. To differentiate the systems, it is particularly helpful to
ask the following two questions:

1.  *What happens if the producers send messages faster than the consumers can process them?*
    Broadly speaking, there are three options: the system can drop messages, buffer messages in a
    queue, or apply *backpressure* (also known as *flow control*; i.e., blocking the producer from
    sending more messages). For example, Unix pipes and TCP use backpressure: they have a small
    fixed-size buffer, and if it fills up, the sender is blocked until the recipient takes data out
    of the buffer (see ["Network congestion and queueing"](/en/ch9#sec_distributed_congestion)).

    If messages are buffered in a queue, it is important to understand what happens as that queue
    grows. Does the system crash if the queue no longer fits in memory, or does it write messages to
    disk? In the latter case, how does the disk access affect the performance of the messaging
    system [^5], and what happens when the disk fills up [^6]?

2.  *What happens if nodes crash or temporarily go offline---are any messages lost?* As with
    databases, durability may require some combination of writing to disk and/or replication (see
    the sidebar ["Replication and Durability"](/en/ch8#sidebar_transactions_durability)), which has
    a cost. If you can afford to sometimes lose messages, you can probably get higher throughput and
    lower latency on the same hardware.

Whether message loss is acceptable depends very much on the application. For example, with sensor
readings and metrics that are transmitted periodically, an occasional missing data point is perhaps
not important, since an updated value will be sent a short time later anyway. However, beware that
if a large number of messages are dropped, it may not be immediately apparent that the metrics are
incorrect [^7]. If you are counting events, it is more important that they are delivered
reliably, since every lost message means incorrect counters.

A nice property of the batch processing systems we explored in [Chapter 11](/en/ch11#ch_batch) is
that they provide a strong reliability guarantee: failed tasks are automatically retried, and
partial output from failed tasks is automatically discarded. This means the output is the same as if
no failures had occurred, which helps simplify the programming model. Later in this chapter we will
examine how we can provide similar guarantees in a streaming context.

#### Direct messaging from producers to consumers {#id296}

A number of messaging systems use direct network communication between producers and consumers
without going via intermediary nodes:

- UDP multicast is widely used in the financial industry for streams such as stock market feeds,
  where low latency is important [^8]. Although UDP itself is unreliable,
  application-level protocols can recover lost packets (the producer must remember packets it has
  sent so that it can retransmit them on demand).

- Brokerless messaging libraries such as ZeroMQ and nanomsg take a similar approach, implementing
  publish/subscribe messaging over TCP or IP multicast.

- Some metrics collection agents, such as StatsD [^9], use unreliable UDP messaging to
  collect metrics from all machines on the network and monitor them. (In the StatsD protocol,
  counter metrics are only correct if all messages are received; using UDP makes the metrics at best
  approximate [^10]. See also ["TCP Versus UDP"](/en/ch9#sidebar_distributed_tcp_udp).)

- If the consumer exposes a service on the network, producers can make a direct HTTP or RPC request
  (see ["Dataflow Through Services: REST and RPC"](/en/ch5#sec_encoding_dataflow_rpc)) to push
  messages to the consumer. This is the idea behind webhooks [^11], a pattern in which a
  callback URL of one service is registered with another service, and it makes a request to that URL
  whenever an event occurs.

Although these direct messaging systems work well in the situations for which they are designed,
they generally require the application code to be aware of the possibility of message loss. The
faults they can tolerate are quite limited: even if the protocols detect and retransmit packets that
are lost in the network, they generally assume that producers and consumers are constantly online.

If a consumer is offline, it may miss messages that were sent while it is unreachable. Some
protocols allow the producer to retry failed message deliveries, but this approach may break down if
the producer crashes, losing the buffer of messages that it was supposed to retry.

#### Message brokers {#id433}

A widely used alternative is to send messages via a *message broker* (also known as a *message
queue*), which is essentially a kind of database that is optimized for handling message streams
[^12]. It runs as a server, with producers and consumers connecting to it as clients.
Producers write messages to the broker, and consumers receive them by reading them from the broker.

By centralizing the data in the broker, these systems can more easily tolerate clients that come and
go (connect, disconnect, and crash), and the question of durability is moved to the broker instead.
Some message brokers only keep messages in memory, while others (depending on configuration) write
them to disk so that they are not lost in case of a broker crash. Faced with slow consumers, they
generally allow unbounded queueing (as opposed to dropping messages or backpressure), although this
choice may also depend on the configuration.

A consequence of queueing is also that consumers are generally *asynchronous*: when a producer sends
a message, it normally only waits for the broker to confirm that it has buffered the message and
does not wait for the message to be processed by consumers. The delivery to consumers will happen at
some undetermined future point in time---often within a fraction of a second, but sometimes
significantly later if there is a queue backlog.

#### Message brokers compared to databases {#id297}

Some message brokers can even participate in two-phase commit protocols using XA or JTA (see
["Distributed Transactions Across Different Systems"](/en/ch8#sec_transactions_xa)). This feature
makes them quite similar in nature to databases, although there are still important practical
differences between message brokers and databases:

- Databases usually keep data until it is explicitly deleted, whereas some message brokers
  automatically delete a message when it has been successfully delivered to its consumers. Such
  message brokers are not suitable for long-term data storage.

- Since they quickly delete messages, most message brokers assume that their working set is fairly
  small---i.e., the queues are short. If the broker needs to buffer a lot of messages because the
  consumers are slow (perhaps spilling messages to disk if they no longer fit in memory), each
  individual message takes longer to process, and the overall throughput may degrade [^5].

- Databases often support secondary indexes and various ways of searching for data using a query
  language, while message brokers often support some way of subscribing to a subset of topics
  matching some pattern. Both are essentially ways for a client to select the portion of the data
  that it wants to know about, but databases typically offer much more advanced query functionality.

- When querying a database, the result is typically based on a point-in-time snapshot of the data;
  if another client subsequently writes something to the database that changes the query result, the
  first client does not find out that its prior result is now outdated (unless it repeats the query,
  or polls for changes). By contrast, message brokers do not support arbitrary queries and don't
  allow message updates once they're sent, but they do notify clients when data changes (i.e., when
  new messages become available).

This is the traditional view of message brokers, which is encapsulated in standards like JMS
[^13] and AMQP [^14] and implemented in software like RabbitMQ, ActiveMQ,
HornetQ, Qpid, TIBCO Enterprise Message Service, IBM MQ, Azure Service Bus, and Google Cloud Pub/Sub
[^15]. Although it is possible to use databases as queues, tuning them to get good
performance is not straightforward [^16].

#### Multiple consumers {#id298}

When multiple consumers read messages in the same topic, two main patterns of messaging are used, as
illustrated in [Figure 12-1](/en/ch12#fig_stream_multi_consumer):

Load balancing

:   Each message is delivered to *one* of the consumers, so the consumers can share the work of
    processing the messages in the topic. The broker may assign messages to consumers arbitrarily.
    This pattern is useful when the messages are expensive to process, and so you want to be able to
    add consumers to parallelize the processing. (In AMQP, you can implement load balancing by
    having multiple clients consuming from the same queue, and in JMS it is called a *shared*
    *subscription*.)

Fan-out

:   Each message is delivered to *all* of the consumers. Fan-out allows several independent
    consumers to each "tune in" to the same broadcast of messages, without affecting each
    other---the streaming equivalent of having several different batch jobs that read the same input
    file. (This feature is provided by topic subscriptions in JMS, and exchange bindings in AMQP.)

{{< figure src="/fig/ddia_1201.png" id="fig_stream_multi_consumer" caption="Figure 12-1. (a) Load balancing: sharing the work of consuming a topic among consumers; (b) fan-out: delivering each message to multiple consumers." class="w-full my-4" >}}

The two patterns can be combined, for example using Kafka's *consumer groups* feature. When a
consumer group subscribes to a topic, each message in the topic is sent to one of the consumers in
the group (load-balancing across the consumers in the group). If two separate consumer groups
subscribe to the same topic, each message is sent to one consumer in each group (providing fan-out
across consumer groups).

#### Acknowledgments and redelivery {#sec_stream_reordering}

Consumers may crash at any time, so it could happen that a broker delivers a message to a consumer
but the consumer never processes it, or only partially processes it before crashing. In order to
ensure that the message is not lost, message brokers use *acknowledgments*: a client must explicitly
tell the broker when it has finished processing a message so that the broker can remove it from the
queue.

If the connection to a client is closed or times out without the broker receiving an acknowledgment,
it assumes that the message was not processed, and therefore it delivers the message again to
another consumer. (Note that it could happen that the message actually *was* fully processed, but
the acknowledgment was lost in the network. Handling this case requires an atomic commit protocol,
as discussed in ["Exactly-once message processing"](/en/ch8#sec_transactions_exactly_once), unless
the operation was idempotent or exactly-once semantics are not required.)

When combined with load balancing, this redelivery behavior has an interesting effect on the
ordering of messages. In [Figure 12-2](/en/ch12#fig_stream_redelivery), the consumers generally
process messages in the order they were sent by producers. However, consumer 2 crashes while
processing message *m3*, at the same time as consumer 1 is processing message *m4*. The
unacknowledged message *m3* is subsequently redelivered to consumer 1, with the result that consumer
1 processes messages in the order *m4*, *m3*, *m5*. Thus, *m3* and *m4* are not delivered in the
same order as they were sent by producer 1.

{{< figure src="/fig/ddia_1202.png" id="fig_stream_redelivery" caption="Figure 12-2. Consumer 2 crashes while processing m3, so it is redelivered to consumer 1 at a later time." class="w-full my-4" >}}

Even if the message broker otherwise tries to preserve the order of messages (as required by both
the JMS and AMQP standards), the combination of load balancing with redelivery inevitably leads to
messages being reordered. To avoid this issue, you can use a separate queue per consumer (i.e., not
use the load balancing feature). Message reordering is not a problem if messages are completely
independent of each other, but it can be important if there are causal dependencies between
messages, as we shall see later in the chapter.

Redelivery can also result in wasted resources, resource starvation, or permanent blockages in a
stream. A common scenario is a producer that improperly serializes a message; for example, by
leaving out a required key in a JSON-encoded object. Any consumer that reads the message will expect
the key, and fail if it's missing. No acknowledgment is sent, so the broker will re-send the
message, which will cause another consumer to fail. This loop repeats itself indefinitely. If the
broker guarantees strong ordering, no further progress can be made. Brokers that allow message
reordering can continue to make progress, but will waste resources on messages that will never be
acknowledged.

Dead letter queues (DLQs) are used to handle this problem. Rather than keeping the message in the
current queue and retrying forever, the message is moved to a different queue to unblock consumers
[^17], [^18]. Monitoring is usually set up on dead letter queues---any message in
the queue is an error. Once a new message is detected, an operator can decide to permanently drop
it, manually modify and re-produce the message, or fix consumer code to handle the message
appropriately. DLQs are common in most queuing systems, but log-based messaging systems such as
Apache Pulsar and stream processing systems such as Kafka Streams now support them as well
[^19].

### Log-based Message Brokers {#sec_stream_log}

Sending a packet over a network or making a request to a network service is normally a transient
operation that leaves no permanent trace. Although it is possible to record it permanently (using
packet capture and logging), we normally don't think of it that way. AMQP/JMS-style message brokers
inherited this transient messaging mindset: even though they may write messages to disk, they
quickly delete the messages again after they have been delivered to consumers.

Databases and filesystems take the opposite approach: everything that is written to a database or
file is normally expected to be permanently recorded, at least until someone explicitly chooses to
delete it again.

This difference in mindset has a big impact on how derived data is created. A key feature of batch
processes, as discussed in [Chapter 11](/en/ch11#ch_batch), is that you can run them repeatedly,
experimenting with the processing steps, without risk of damaging the input (since the input is
read-only). This is not the case with AMQP/JMS-style messaging: receiving a message is destructive
if the acknowledgment causes it to be deleted from the broker, so you cannot run the same consumer
again and expect to get the same result.

If you add a new consumer to a messaging system, it typically only starts receiving messages sent
after the time it was registered; any prior messages are already gone and cannot be recovered.
Contrast this with files and databases, where you can add a new client at any time, and it can read
data written arbitrarily far in the past (as long as it has not been explicitly overwritten or
deleted by the application).

Why can we not have a hybrid, combining the durable storage approach of databases with the
low-latency notification facilities of messaging? This is the idea behind *log-based message
brokers*, which have become very popular in recent years.

#### Using logs for message storage {#id300}

A log is simply an append-only sequence of records on disk. We previously discussed logs in the
context of log-structured storage engines and write-ahead logs in [Chapter 4](/en/ch4#ch_storage),
in the context of replication in [Chapter 6](/en/ch6#ch_replication), and as a form of consensus in
[Chapter 10](/en/ch10#ch_consistency).

The same structure can be used to implement a message broker: a producer sends a message by
appending it to the end of the log, and a consumer receives messages by reading the log
sequentially. If a consumer reaches the end of the log, it waits for a notification that a new
message has been appended. The Unix tool `tail -f`, which watches a file for data being appended,
essentially works like this.

In order to scale to higher throughput than a single disk can offer, the log can be *sharded* (in
the sense of [Chapter 7](/en/ch7#ch_sharding)). Different shards can then be hosted on different
machines, making each shard a separate log that can be read and written independently from other
shards. A topic can then be defined as a group of shards that all carry messages of the same type.
This approach is illustrated in [Figure 12-3](/en/ch12#fig_stream_kafka_partitions).

Within each shard, which Kafka calls a *partition*, the broker assigns a monotonically increasing
sequence number, or *offset*, to every message (in
[Figure 12-3](/en/ch12#fig_stream_kafka_partitions), the numbers in boxes are message offsets). Such
a sequence number makes sense because a partition (shard) is append-only, so the messages within a
partition are totally ordered. There is no ordering guarantee across different partitions.

{{< figure src="/fig/ddia_1203.png" id="fig_stream_kafka_partitions" caption="Figure 12-3. Producers send messages by appending them to a topic-partition file, and consumers read these files sequentially." class="w-full my-4" >}}

Apache Kafka [^20] and Amazon Kinesis Streams are log-based message brokers that work like
this. Google Cloud Pub/Sub is architecturally similar but exposes a JMS-style API rather than a log
abstraction [^15]. Even though these message brokers write all messages to disk, they are
able to achieve throughput of millions of messages per second by sharding across multiple machines,
and fault tolerance by replicating messages [^21], [^22].

#### Logs compared to traditional messaging {#sec_stream_logs_vs_messaging}

The log-based approach trivially supports fan-out messaging, because several consumers can
independently read the log without affecting each other---reading a message does not delete it from
the log. To achieve load balancing across a group of consumers, instead of assigning individual
messages to consumer clients, the broker can assign entire shards to nodes in the consumer group.

Each client then consumes *all* the messages in the shards it has been assigned. Typically, when a
consumer has been assigned a log shard, it reads the messages in the shard sequentially, in a
straightforward single-threaded manner. This coarse-grained load balancing approach has some
downsides:

- The number of nodes sharing the work of consuming a topic can be at most the number of log shards
  in that topic, because messages within the same shard are delivered to the same node. (It's
  possible to create a load balancing scheme in which two consumers share the work of processing a
  shard by having both read the full set of messages, but one of them only considers messages with
  even-numbered offsets while the other deals with the odd-numbered offsets. Alternatively, you
  could spread message processing over a thread pool, but that approach complicates consumer offset
  management. In general, single-threaded processing of a shard is preferable, and parallelism can
  be increased by using more shards.)

- If a single message is slow to process, it holds up the processing of subsequent messages in that
  shard (a form of head-of-line blocking; see ["Describing
  Performance"](/en/ch2#sec_introduction_percentiles)).

Thus, in situations where messages may be expensive to process and you want to parallelize
processing on a message-by-message basis, and where message ordering is not so important, the
JMS/AMQP style of message broker is preferable. On the other hand, in situations with high message
throughput, where each message is fast to process and where message ordering is important, the
log-based approach works very well [^23], [^24]. However, the distinction between
the two architectures is being blurred as log-based messaging systems such as Kafka now support
JMS/AMQP-style consumer groups, which allow multiple consumers to receive messages from the same
partition [^25], [^26].

Since sharded logs typically preserve message ordering only within a single shard, all messages that
need to be ordered consistently need to be routed to the same shard. For example, an application may
require that the events relating to one particular user appear in a fixed order. This can be
achieved by choosing the shard for an event based on the user ID of that event (in other words,
making the user ID the *partition key*).

#### Consumer offsets {#sec_stream_log_offsets}

Consuming a shard sequentially makes it easy to tell which messages have been processed: all
messages with an offset less than a consumer's current offset have already been processed, and all
messages with a greater offset have not yet been seen. Thus, the broker does not need to track
acknowledgments for every single message---it only needs to periodically record the consumer
offsets. The reduced bookkeeping overhead and the opportunities for batching and pipelining in this
approach help increase the throughput of log-based systems. If a consumer fails, however, it will
resume from the last recorded offset rather than the more recent last offset it saw. This can cause
the consumer to see some messages twice.

This offset is in fact very similar to the *log sequence number* that is commonly found in
single-leader database replication, and which we discussed in ["Setting Up New
Followers"](/en/ch6#sec_replication_new_replica). In database replication, the log sequence number
allows a follower to reconnect to a leader after it has become disconnected, and resume replication
without skipping any writes. Exactly the same principle is used here: the message broker behaves
like a leader database, and the consumer like a follower.

If a consumer node fails, another node in the consumer group is assigned the failed consumer's
shards, and it starts consuming messages at the last recorded offset. If the consumer had processed
subsequent messages but not yet recorded their offset, those messages will be processed a second
time upon restart. We will discuss ways of dealing with this issue later in the chapter.

#### Disk space usage {#sec_stream_disk_usage}

If you only ever append to the log, you will eventually run out of disk space. To reclaim disk
space, the log is actually divided into segments, and from time to time old segments are deleted or
moved to archive storage. (We'll discuss a more sophisticated way of freeing disk space in ["Log
compaction"](/en/ch12#sec_stream_log_compaction).)

This means that if a slow consumer cannot keep up with the rate of messages, and it falls so far
behind that its consumer offset points to a deleted segment, it will miss some of the messages.
Effectively, the log implements a bounded-size buffer that discards old messages when it gets full,
also known as a *circular buffer* or *ring buffer*. However, since that buffer is on disk, it can be
quite large.

Let's do a back-of-the-envelope calculation. At the time of writing, a typical large hard drive has
a capacity of 20 TB and a sequential write throughput of 250 MB/s. If you are writing messages at
the fastest possible rate, it takes about 22 hours until the drive is full and you need to start
deleting the oldest messages. That means a disk-based log can always buffer at least 22 hours' worth
of messages, even if you have many disks with many machines (having more disks increases both the
available space and the total write bandwidth). In practice, deployments rarely use the full write
bandwidth of the disk, so the log can typically keep a buffer of several days' or even weeks' worth
of messages.

Many log-based message brokers now store messages in object storage to increase their storage
capacity, similarly to databases as we saw in ["Databases Backed by Object
Storage"](/en/ch6#sec_replication_object_storage). Message brokers such as Apache Kafka and Redpanda
serve older messages from object storage as part of their tiered storage. Others, such as
WarpStream, Confluent Freight, and Bufstream, store all of their data in the object store. In
addition to cost-efficiency, this architecture also makes data integration easier: messages in
object storage are stored as Iceberg tables, which enable batch and data warehouse job execution
directly on the data without having to copy it into another system.

#### When consumers cannot keep up with producers {#id459}

At the beginning of ["Messaging Systems"](/en/ch12#sec_stream_messaging) we discussed three choices
of what to do if a consumer cannot keep up with the rate at which producers are sending messages:
dropping messages, buffering, or applying backpressure. In this taxonomy, the log-based approach is
a form of buffering with a large but fixed-size buffer (limited by the available disk space).

If a consumer falls so far behind that the messages it requires are older than what is retained on
disk, it will not be able to read those messages---so the broker effectively drops old messages that
go back further than the size of the buffer can accommodate. You can monitor how far a consumer is
behind the head of the log, and raise an alert if it falls behind significantly. As the buffer is
large, there is enough time for a human operator to fix the slow consumer and allow it to catch up
before it starts missing messages.

Even if a consumer does fall too far behind and starts missing messages, only that consumer is
affected; it does not disrupt the service for other consumers. This fact is a big operational
advantage: you can experimentally consume a production log for development, testing, or debugging
purposes, without having to worry much about disrupting production services. When a consumer is shut
down or crashes, it stops consuming resources---the only thing that remains is its consumer offset.

This behavior also contrasts with traditional message brokers, where you need to be careful to
delete any queues whose consumers have been shut down---otherwise they continue unnecessarily
accumulating messages and taking away memory from consumers that are still active.

#### Replaying old messages {#sec_stream_replay}

We noted previously that with AMQP- and JMS-style message brokers, processing and acknowledging
messages is a destructive operation, since it causes the messages to be deleted on the broker. On
the other hand, in a log-based message broker, consuming messages is more like reading from a file:
it is a read-only operation that does not change the log.

The only side effect of processing, besides any output of the consumer, is that the consumer offset
moves forward. But the offset is under the consumer's control, so it can easily be manipulated if
necessary: for example, you can start a copy of a consumer with yesterday's offsets and write the
output to a different location, in order to reprocess the last day's worth of messages. You can
repeat this any number of times, varying the processing code.

This aspect makes log-based messaging more like the batch processes of the last chapter, where
derived data is clearly separated from input data through a repeatable transformation process. It
allows more experimentation and easier recovery from errors and bugs, making it a good tool for
integrating dataflows within an organization [^27].

## Databases and Streams {#sec_stream_databases}

We have drawn some comparisons between message brokers and databases. Even though they have
traditionally been considered separate categories of tools, we saw that log-based message brokers
have been successful in taking ideas from databases and applying them to messaging. We can also go
in reverse: take ideas from messaging and streams, and apply them to databases.

One approach is to use an *event stream as the system of record* for storing data (see ["Systems of
Record and Derived Data"](/en/ch1#sec_introduction_derived)). This is what happens in *event
sourcing*, which we discussed in ["Event Sourcing and CQRS"](/en/ch3#sec_datamodels_events): instead
of storing data in a data model that is mutated by updating and deleting, you can model every state
change as an immutable event, and write it to an append-only log. Any read-optimized materialized
views are derived from these events. Log-based message brokers (configured to never delete old
events) are well suited for event sourcing since they use append-only storage, and they can notify
consumers about new events with low latency.

But you don't have to go as far as adopting event sourcing; even with mutable data models, event
streams are useful for databases. In fact, every write to a database is an event that can be
captured, stored, and processed. The connection between databases and streams runs deeper than just
the physical storage of logs on disk---it is quite fundamental.

For example, a replication log (see ["Implementation of Replication
Logs"](/en/ch6#sec_replication_implementation)) is a stream of database write events, produced by
the leader as it processes transactions. The followers apply that stream of writes to their own copy
of the database and thus end up with an accurate copy of the same data. The events in the
replication log describe the data changes that occurred.

We also came across the *state machine replication* principle in ["Using shared
logs"](/en/ch10#sec_consistency_smr), which states: if every event represents a write to the
database, and every replica processes the same events in the same order, then the replicas will all
end up in the same final state. (Processing an event is assumed to be a deterministic operation.)
It's just another case of event streams!

In this section we will first look at a problem that arises in heterogeneous data systems, and then
explore how we can solve it by bringing ideas from event streams to databases.

### Keeping Systems in Sync {#sec_stream_sync}

As we have seen throughout this book, there is no single system that can satisfy all data storage,
querying, and processing needs. In practice, most nontrivial applications need to combine several
different technologies in order to satisfy their requirements: for example, using an OLTP database
to serve user requests, a cache to speed up common requests, a full-text index to handle search
queries, and a data warehouse for analytics. Each of these has its own copy of the data, stored in
its own representation that is optimized for its own purposes.

As the same or related data appears in several different places, they need to be kept in sync with
one another: if an item is updated in the database, it also needs to be updated in the cache, search
indexes, and data warehouse. With data warehouses this synchronization is usually performed by ETL
processes (see ["Data Warehousing"](/en/ch1#sec_introduction_dwh)), often by taking a full copy of a
database, transforming it, and bulk-loading it into the data warehouse---in other words, a batch
process. Similarly, we saw in ["Batch Use Cases"](/en/ch11#sec_batch_output) how search indexes,
recommendation systems, and other derived data systems might be created using batch processes.

If periodic full database dumps are too slow, an alternative that is sometimes used is *dual
writes*, in which the application code explicitly writes to each of the systems when data changes:
for example, first writing to the database, then updating the search index, then invalidating the
cache entries (or even performing those writes concurrently).

However, dual writes have some serious problems, one of which is a race condition illustrated in
[Figure 12-4](/en/ch12#fig_stream_write_order). In this example, two clients concurrently want to
update an item X: client 1 wants to set the value to A, and client 2 wants to set it to B. Both
clients first write the new value to the database, then write it to the search index. Due to unlucky
timing, the requests are interleaved: the database first sees the write from client 1 setting the
value to A, then the write from client 2 setting the value to B, so the final value in the database
is B. The search index first sees the write from client 2, then client 1, so the final value in the
search index is A. The two systems are now permanently inconsistent with each other, even though no
error occurred.

{{< figure src="/fig/ddia_1204.png" id="fig_stream_write_order" caption="Figure 12-4. In the database, X is first set to A and then to B, while at the search index the writes arrive in the opposite order." class="w-full my-4" >}}

Unless you have some additional concurrency detection mechanism, such as the version vectors we
discussed in ["Detecting Concurrent Writes"](/en/ch6#sec_replication_concurrent), you will not even
notice that concurrent writes occurred---one value will simply silently overwrite another value.

Another problem with dual writes is that one of the writes may fail while the other succeeds. This
is a fault-tolerance problem rather than a concurrency problem, but it also has the effect of the
two systems becoming inconsistent with each other. Ensuring that they either both succeed or both
fail is a case of the atomic commit problem, which is expensive to solve (see ["Two-Phase Commit
(2PC)"](/en/ch8#sec_transactions_2pc)).

If you only have one replicated database with a single leader, then that leader determines the order
of writes, so the state machine replication approach works among replicas of the database. However,
in [Figure 12-4](/en/ch12#fig_stream_write_order) there isn't a single leader: the database may have
a leader and the search index may have a leader, but neither follows the other, and so conflicts can
occur (see ["Multi-Leader Replication"](/en/ch6#sec_replication_multi_leader)).

The situation would be better if there really was only one leader---for example, the database---and
if we could make the search index a follower of the database. But is this possible in practice?

### Change Data Capture {#sec_stream_cdc}

The problem with most databases' replication logs is that they have long been considered to be an
internal implementation detail of the database, not a public API. Clients are supposed to query the
database through its data model and query language, not parse the replication logs and try to
extract data from them.

For decades, many databases simply did not have a documented way of getting the log of changes
written to them. For this reason it was difficult to take all the changes made in a database and
replicate them to a different storage technology such as a search index, cache, or data warehouse.

More recently, there has been growing interest in *change data capture* (CDC), which is the process
of observing all data changes written to a database and extracting them in a form in which they can
be replicated to other systems [^28]. CDC is especially interesting if changes are made
available as a stream, immediately as they are written.

For example, you can capture the changes in a database and continually apply the same changes to a
search index. If the log of changes is applied in the same order, you can expect the data in the
search index to match the data in the database. The search index and any other derived data systems
are just consumers of the change stream.

[Figure 12-5](/en/ch12#fig_stream_change_capture) shows how the concurrency problem of
[Figure 12-4](/en/ch12#fig_stream_write_order) is solved with CDC. Even though the two requests to
set X to A and B respectively arrive concurrently at the database, the database decides on some
order in which to execute them, and writes them to its replication log in that order. The search
index picks them up and applies them in the same order. If you need the data in another system, such
as a data warehouse, you can simply add it as another consumer of the CDC event stream.

{{< figure src="/fig/ddia_1205.png" id="fig_stream_change_capture" caption="Figure 12-5. Taking data in the order it was written to one database, and applying the changes to other systems in the same order." class="w-full my-4" >}}

#### Implementing change data capture {#id307}

We can call the log consumers *derived data systems*, as discussed in ["Systems of Record and
Derived Data"](/en/ch1#sec_introduction_derived): the data stored in the search index and the data
warehouse is just another view onto the data in the system of record. Change data capture is a
mechanism for ensuring that all changes made to the system of record are also reflected in the
derived data systems so that the derived systems have an accurate copy of the data.

Essentially, change data capture makes one database the leader (the one from which the changes are
captured), and turns the others into followers. A log-based message broker is well suited for
transporting the change events from the source database to the derived systems, since it preserves
the ordering of messages (avoiding the reordering issue of
[Figure 12-2](/en/ch12#fig_stream_redelivery)).

Logical replication logs can be used to implement change data capture (see ["Logical (row-based) log
replication"](/en/ch6#sec_replication_logical)), although it comes with challenges, such as handling
schema changes and properly modeling updates. The Debezium open source project addresses these
challenges. The project contains *source connectors* for MySQL, PostgreSQL, Oracle, SQL Server, Db2,
Cassandra, and many other databases. These connectors attach to database replication logs and
surface the changes in a standard event schema. Messages can then be transformed and written to
downstream databases. The Kafka Connect framework offers further CDC connectors for various
databases, as well. Maxwell does something similar for MySQL by parsing the binlog [^29],
GoldenGate provides similar facilities for Oracle, and pgcapture does the same for PostgreSQL.

Like message brokers, change data capture is usually asynchronous: the system of record database
does not wait for the change to be applied to consumers before committing it. This design has the
operational advantage that adding a slow consumer does not affect the system of record too much, but
it has the downside that all the issues of replication lag apply (see ["Problems with Replication
Lag"](/en/ch6#sec_replication_lag)).

#### Initial snapshot {#sec_stream_cdc_snapshot}

If you have the log of all changes that were ever made to a database, you can reconstruct the entire
state of the database by replaying the log. However, in many cases, keeping all changes forever
would require too much disk space, and replaying it would take too long, so the log needs to be
truncated.

Building a new full-text index, for example, requires a full copy of the entire database---it is not
sufficient to only apply a log of recent changes, since it would be missing items that were not
recently updated. Thus, if you don't have the entire log history, you need to start with a
consistent snapshot, as previously discussed in ["Setting Up New
Followers"](/en/ch6#sec_replication_new_replica).

The snapshot of the database must correspond to a known position or offset in the change log, so
that you know at which point to start applying changes after the snapshot has been processed. Some
CDC tools integrate this snapshot facility, while others leave it as a manual operation. Debezium
uses Netflix's DBLog watermarking algorithm to provide incremental snapshots [^30],
[^31].

#### Log compaction {#sec_stream_log_compaction}

If you can only keep a limited amount of log history, you need to go through the snapshot process
every time you want to add a new derived data system. However, *log compaction* provides a good
alternative.

We discussed log compaction previously in ["Log-Structured
Storage"](/en/ch4#sec_storage_log_structured), in the context of log-structured storage engines (see
[Figure 4-3](/en/ch4#fig_storage_sstable_merging) for an example). The principle is simple: the
storage engine periodically looks for log records with the same key, throws away any duplicates, and
keeps only the most recent update for each key. This might make log segments much smaller, so
segments may also be merged as part of the compaction process, as shown in
[Figure 12-6](/en/ch12#fig_stream_compaction). This process runs in the background.

{{< figure src="/fig/ddia_1206.png" id="fig_stream_compaction" caption="Figure 12-6. A log of key-value pairs, where the key is the ID of a cat video (mew, purr, scratch, or yawn), and the value is the number of times it has been played. Log compaction retains only the most recent value for each key." class="w-full my-4" >}}

In a log-structured storage engine, an update with a special null value (a *tombstone*) indicates
that a key was deleted, and causes it to be removed during log compaction. But as long as a key is
not overwritten or deleted, it stays in the log forever. The disk space required for such a
compacted log depends only on the current contents of the database, not the number of writes that
have ever occurred in the database. If the same key is frequently overwritten, previous values will
eventually be garbage-collected, and only the latest value will be retained.

The same idea works in the context of log-based message brokers and change data capture. If the CDC
system is set up such that every change has a primary key, and every update for a key replaces the
previous value for that key, then it's sufficient to keep just the most recent write for a
particular key.

Now, whenever you want to rebuild a derived data system such as a search index, you can start a new
consumer from offset 0 of the log-compacted topic, and sequentially scan over all messages in the
log. The log is guaranteed to contain the most recent value for every key in the database (and maybe
some older values)---in other words, you can use it to obtain a full copy of the database contents
without having to take another snapshot of the CDC source database.

This log compaction feature is supported by Apache Kafka. As we shall see later in this chapter, it
allows the message broker to be used for durable storage, not just for transient messaging.

#### API support for change streams {#sec_stream_change_api}

Most popular databases now expose change streams as a first-class interface, rather than the
retrofitted and reverse-engineered CDC efforts of the past. Relational databases such as MySQL and
PostgreSQL typically send changes through the same replication log they use for their own replicas.
Most cloud vendors offer CDC solutions for their products as well: for example, Datastream offers
streaming data access for Google Cloud's relational databases and data warehouses.

Even eventually consistent, quorum-based databases such as Cassandra now support change data
capture. As we saw in ["Linearizability and quorums"](/en/ch10#sec_consistency_quorum_linearizable),
clients must persist writes to a majority of nodes before they're considered visible. CDC support
for quorum writes is challenging because there's no single source of truth to subscribe to. Whether
the data is visible or not depends on each reader's consistency preferences. Cassandra sidesteps
this issue by exposing raw log segments for each node rather than providing a single stream of
mutations. Systems that wish to consume the data must read the raw log segments for each node and
decide how best to merge them into a single stream (much like a quorum reader does) [^32].

Kafka Connect [^33] integrates change data capture tools for a wide range of database
systems with Kafka. Once the stream of change events is in Kafka, it can be used to update derived
data systems such as search indexes, and also feed into stream processing systems as discussed later
in this chapter.

#### Change data capture versus event sourcing {#sec_stream_event_sourcing}

Let's compare change data capture to event sourcing. Similarly to change data capture, event
sourcing involves storing all changes to the application state as a log of change events. The
biggest difference is that event sourcing applies the idea at a different level of abstraction:

- In change data capture, the application uses the database in a mutable way, updating and deleting
  records at will. The log of changes is extracted from the database at a low level (e.g., by
  parsing the replication log), which ensures that the order of writes extracted from the database
  matches the order in which they were actually written, avoiding the race condition in
  [Figure 12-4](/en/ch12#fig_stream_write_order).

- In event sourcing, the application logic is explicitly built on the basis of immutable events that
  are written to an event log. In this case, the event store is append-only, and updates or deletes
  of events are discouraged or prohibited. Events are designed to reflect things that happened at
  the application level, rather than low-level state changes.

Which one is better depends on your situation. Adopting event sourcing is a big change for an
application that is not already doing it; it has a number of pros and cons, which we discussed in
["Event Sourcing and CQRS"](/en/ch3#sec_datamodels_events). In contrast, CDC can be added to an
existing database with minimal changes---the application writing to the database might not even know
that CDC is occurring.

> [!TIP] CHANGE DATA CAPTURE AND DATABASE SCHEMAS
> Though change data capture appears easier to adopt than event sourcing, it comes with its own set of
> challenges.
>
> In a microservices architecture, a database is typically only accessed from one service. Other
> services interact with it through that service's public API, but they don't normally access the
> database directly. This makes the database an internal implementation detail of the service,
> allowing the developers to change its database schema without affecting the public API.
>
> However, CDC systems typically use the upstream database's schema when replicating its data, which
> turns these schemas into public APIs that must be managed much like the public API of the service. A
> developer who removes a table column in their database table will break downstream consumers that
> depend on this field. Such challenges have always existed with data pipelines, but they typically
> only impacted data warehouse ETL. Since CDC is often implemented as a data stream, other production
> services might be consumers. Breaking such consumers can cause a customer-facing outage
> [^34]. Data contracts are often used to prevent these breakages.
>
> A common way to decouple internal from external schemas is to use the *outbox pattern*. Outboxes are
> tables with their own schemas, which are exposed to the CDC system rather than the internal domain
> model in the database [^35], [^36]. Developers can then modify their internal
> schemas as they see fit while leaving their outbox tables untouched. This might look like a dual
> write---it is. However, outboxes avoid the challenges we discussed in ["Keeping Systems in
> Sync"](/en/ch12#sec_stream_sync) by keeping both writes in the same system (the
> database). This design allows both writes to appear in a single transaction.
>
> Outboxes present a few tradeoffs, though. Developers must still maintain the transformation between
> their internal and outbox schemas, which can be challenging. An outbox also increases the amount of
> data that the database has to write to its underlying storage, which might trigger performance
> problems.

As with change data capture, replaying the event log allows you to reconstruct the current state
of the system. However, log compaction needs to be handled differently:

- A CDC event for the update of a record typically contains the entire new version of the record, so
  the current value for a primary key is entirely determined by the most recent event for that
  primary key, and log compaction can discard previous events for the same key.

- On the other hand, with event sourcing, events are modeled at a higher level: an event typically
  expresses the intent of a user action, not the mechanics of the state update that occurred as a
  result of the action. In this case, later events typically do not override prior events, and so
  you need the full history of events to reconstruct the final state. Log compaction is not possible
  in the same way.

Applications that use event sourcing typically have some mechanism for storing snapshots of the
current state that is derived from the log of events, so they don't need to repeatedly reprocess the
full log. However, this is only a performance optimization to speed up reads and recovery from
crashes; the intention is that the system is able to store all raw events forever and reprocess the
full event log whenever required. We discuss this assumption in ["Limitations of
immutability"](/en/ch12#sec_stream_immutability_limitations).

### State, Streams, and Immutability {#sec_stream_immutability}

We saw in [Chapter 11](/en/ch11#ch_batch) that batch processing benefits from the immutability of
its input files, so you can run experimental processing jobs on existing input files without fear of
damaging them. This principle of immutability is also what makes event sourcing and change data
capture so powerful.

We normally think of databases as storing the current state of the application---this representation
is optimized for reads, and it is usually the most convenient for serving queries. The nature of
state is that it changes, so databases support updating and deleting data as well as inserting it.
How does this fit with immutability?

Whenever you have state that changes, that state is the result of the events that mutated it over
time. For example, your list of currently available seats is the result of the reservations you have
processed, the current account balance is the result of the credits and debits on the account, and
the response time graph for your web server is an aggregation of the individual response times of
all web requests that have occurred.

No matter how the state changes, there was always a sequence of events that caused those changes.
Even as things are done and undone, the fact remains true that those events occurred. The key idea
is that mutable state and an append-only log of immutable events do not contradict each other: they
are two sides of the same coin. The log of all changes, the *changelog*, represents the evolution of
state over time.

If you are mathematically inclined, you might say that the application state is what you get when
you integrate an event stream over time, and a change stream is what you get when you differentiate
the state by time, as shown in [Figure 12-7](/en/ch12#fig_stream_integral) [^37],
[^38]. The analogy has limitations (for example, the second derivative of state does not
seem to be meaningful), but it's a useful starting point for thinking about data.

{{< figure src="/fig/ddia_1207.png" id="fig_stream_integral" caption="Figure 12-7. The relationship between the current application state and an event stream." class="w-full my-4" >}}

If you store the changelog durably, that simply has the effect of making the state reproducible. If
you consider the log of events to be your system of record, and any mutable state as being derived
from it, it becomes easier to reason about the flow of data through a system. As Jim Gray and
Andreas Reuter put it in 1992 [^39]:

> \[T\]here is no fundamental need to keep a database at all; the log contains all the information
> there is. The only reason for storing the database (i.e., the current end-of-the-log) is
> performance of retrieval operations.

Log compaction is one way of bridging the distinction between log and database state: it retains
only the latest version of each record, and discards overwritten versions.

#### Advantages of immutable events {#sec_stream_immutability_pros}

Immutability in databases is an old idea. For example, accountants have been using immutability for
centuries in financial bookkeeping. When a transaction occurs, it is recorded in an append-only
*ledger*, which is essentially a log of events describing money, goods, or services that have
changed hands. The accounts, such as profit and loss or the balance sheet, are derived from the
transactions in the ledger by adding them up [^40].

If a mistake is made, accountants don't erase or change the incorrect transaction in the
ledger---instead, they add another transaction that compensates for the mistake, for example
refunding an incorrect charge. The incorrect transaction still remains in the ledger forever,
because it might be important for auditing reasons. If incorrect figures, derived from the incorrect
ledger, have already been published, then the figures for the next accounting period include a
correction. This process is entirely normal in accounting [^41].

Although such auditability is particularly important in financial systems, it is also beneficial for
many other systems that are not subject to such strict regulation. If you accidentally deploy buggy
code that writes bad data to a database, recovery is much harder if the code is able to
destructively overwrite data. With an append-only log of immutable events, it is much easier to
diagnose what happened and recover from the problem. Similarly, customer service can use an audit
log to diagnose customer requests and complaints.

Immutable events also capture more information than just the current state. For example, on a
shopping website, a customer may add an item to their cart and then remove it again. Although the
second event cancels out the first event from the point of view of order fulfillment, it may be
useful to know for analytics purposes that the customer was considering a particular item but then
decided against it. Perhaps they will choose to buy it in the future, or perhaps they found a
substitute. This information is recorded in an event log, but would be lost in a database that
deletes items when they are removed from the cart.

#### Deriving several views from the same event log {#sec_stream_deriving_views}

Moreover, by separating mutable state from the immutable event log, you can derive several different
read-oriented representations from the same log of events. This works just like having multiple
consumers of a stream ([Figure 12-5](/en/ch12#fig_stream_change_capture)): for example, the analytic
database Druid ingests directly from Kafka using this approach, and Kafka Connect sinks can export
data from Kafka to various different databases and indexes [^33].

Having an explicit translation step from an event log to a database makes it easier to evolve your
application over time: if you want to introduce a new feature that presents your existing data in
some new way, you can use the event log to build a separate read-optimized view for the new feature,
and run it alongside the existing systems without having to modify them. Running old and new systems
side by side is often easier than performing a complicated schema migration in an existing system.
Once readers have switched to the new system and the old system is no longer needed, you can simply
shut it down and reclaim its resources [^42], [^43].

This idea of writing data in one write-optimized form, and then translating it into different
read-optimized representations as needed, is the *command query responsibility segregation* (CQRS)
pattern that we already encountered in ["Event Sourcing and CQRS"](/en/ch3#sec_datamodels_events).
It doesn't necessarily require event sourcing: you can just as well build multiple materialized
views from a stream of CDC events [^44].

The traditional approach to database and schema design is based on the fallacy that data must be
written in the same form as it will be queried. Debates about normalization and denormalization (see
["Normalization, Denormalization, and Joins"](/en/ch3#sec_datamodels_normalization)) become largely
irrelevant if you can translate data from a write-optimized event log to read-optimized application
state: it is entirely reasonable to denormalize data in the read-optimized views, as the translation
process gives you a mechanism for keeping it consistent with the event log.

In ["Case Study: Social Network Home Timelines"](/en/ch2#sec_introduction_twitter) we discussed a
social network's home timelines, a cache of recent posts by the people a particular user is
following (like a mailbox). This is another example of read-optimized state: home timelines are
highly denormalized, since your posts are duplicated in all of the timelines of the people following
you. However, the fan-out service keeps this duplicated state in sync with new posts and new
following relationships, which keeps the duplication manageable.

#### Concurrency control {#sec_stream_concurrency}

The biggest downside of CQRS is that the consumers of the event log are usually asynchronous, so
there is a possibility that a user may make a write to the log, then read from a derived view and
find that their write has not yet been reflected in the view. We discussed this problem and
potential solutions previously in ["Reading Your Own Writes"](/en/ch6#sec_replication_ryw).

One solution would be to perform the updates of the read view synchronously with appending the event
to the log. This either requires a distributed transaction across the event log and the derived
view, or some way of waiting until an event has been reflected in the view. Both approaches are
usually impractical, so views are normally updated asynchronously.

On the other hand, deriving the current state from an event log also simplifies some aspects of
concurrency control. Much of the need for multi-object transactions (see ["Single-Object and
Multi-Object Operations"](/en/ch8#sec_transactions_multi_object)) stems from a single user action
requiring data to be changed in several different places. With event sourcing, you can design an
event such that it is a self-contained description of a user action. The user action then requires
only a single write in one place---namely appending the event to the log---which is easy to make
atomic.

If the event log and the application state are sharded in the same way (for example, processing an
event for a customer in shard 3 only requires updating shard 3 of the application state), then a
straightforward single-threaded log consumer needs no concurrency control for writes---by
construction, it only processes a single event at a time (see also ["Actual Serial
Execution"](/en/ch8#sec_transactions_serial)). The log removes the nondeterminism of concurrency by
defining a serial order of events in a shard [^27]. If an event touches multiple state
shards, a bit more work is required, which we will discuss in [Chapter 13](/en/ch13#ch_philosophy).

Many systems that don't use an event-sourced model nevertheless rely on immutability for concurrency
control: various databases internally use immutable data structures or multi-version data to support
point-in-time snapshots (see ["Indexes and snapshot
isolation"](/en/ch8#sec_transactions_snapshot_indexes)). Version control systems such as Git,
Mercurial, and Fossil also rely on immutable data to preserve version history of files.

#### Limitations of immutability {#sec_stream_immutability_limitations}

To what extent is it feasible to keep an immutable history of all changes forever? The answer
depends on the amount of churn in the dataset. Some workloads mostly add data and rarely update or
delete; they are easy to make immutable. Other workloads have a high rate of updates and deletes on
a comparatively small dataset; in these cases, the immutable history may grow prohibitively large,
fragmentation may become an issue, and the performance of compaction and garbage collection becomes
crucial for operational robustness [^45], [^46].

Besides the performance reasons, there may also be circumstances in which you need data to be
deleted for administrative or legal reasons, in spite of all immutability. For example, privacy
regulations such as the European General Data Protection Regulation (GDPR) require that a user's
personal information be deleted and erroneous information be removed on demand, or an accidental
leak of sensitive information may need to be contained.

In these circumstances, it's not sufficient to just append another event to the log to indicate that
the prior data should be considered deleted---you actually want to rewrite history and pretend that
the data was never written in the first place. For example, Datomic calls this feature *excision*
[^47], and the Fossil version control system has a similar concept called *shunning*
[^48].

Truly deleting data is surprisingly hard [^49], since copies can live in many places: for
example, storage engines, filesystems, and SSDs often write to a new location rather than
overwriting in place [^41], and backups are often deliberately immutable to prevent
accidental deletion or corruption.

One way of enabling deletion of immutable data is *crypto-shredding* [^50]: data that you
may want to delete in the future is stored encrypted, and when you want to get rid of it, you forget
the encryption key. The encrypted data is then still there, but nobody can use it. In some sense
this only moves the problem around: the actual data is now immutable, but your key storage is
mutable.

Moreover, you have to decide up front which data is going to be encrypted with the same key, and
when you are going to use different keys---an important decision, since you can later crypto-shred
either all or none of the data encrypted with a particular key, but not some of it. Storing a
separate key for every single data item would get too unwieldy, as the key storage would get as big
as the primary data storage. More sophisticated schemes such as puncturable encryption
[^51] make it possible to selectively revoke a key's decryption abilities, but they are
not widely used.

Overall, deletion is more a matter of "making it harder to retrieve the data" than actually "making
it impossible to retrieve the data." Nevertheless, you sometimes have to try, as we shall see in
["Legislation and Self-Regulation"](/en/ch14#sec_future_legislation).

## Processing Streams {#sec_stream_processing}

So far in this chapter we have talked about where streams come from (user activity events, sensors,
and writes to databases), and we have talked about how streams are transported (through direct
messaging, via message brokers, and in event logs).

What remains is to discuss what you can do with the stream once you have it---namely, you can
process it. Broadly, there are three options:

1.  You can take the data in the events and write it to a database, cache, search index, or similar
    storage system, from where it can then be queried by other clients. As shown in
    [Figure 12-5](/en/ch12#fig_stream_change_capture), this is a good way of keeping a database in
    sync with changes happening in other parts of the system---especially if the stream consumer is
    the only client writing to the database. Writing to a storage system is the streaming equivalent
    of what we discussed in ["Batch Use Cases"](/en/ch11#sec_batch_output).

2.  You can push the events to users in some way, for example by sending email alerts or push
    notifications, or by streaming the events to a real-time dashboard where they are visualized. In
    this case, a human is the ultimate consumer of the stream.

3.  You can process one or more input streams to produce one or more output streams. Streams may go
    through a pipeline consisting of several such processing stages before they eventually end up at
    an output (option 1 or 2).

In the rest of this chapter, we will discuss option 3: processing streams to produce other, derived
streams. A piece of code that processes streams like this is known as an *operator* or a *job*. It
is closely related to the Unix processes and MapReduce jobs we discussed in
[Chapter 11](/en/ch11#ch_batch), and the pattern of dataflow is similar: a stream processor consumes
input streams in a read-only fashion and writes its output to a different location in an append-only
fashion.

The patterns for sharding and parallelization in stream processors are also very similar to those in
MapReduce and the dataflow engines we saw in [Chapter 11](/en/ch11#ch_batch), so we won't repeat
those topics here. Basic mapping operations such as transforming and filtering records also work the
same.

The one crucial difference from batch jobs is that a stream never ends. This difference has many
implications: as discussed at the start of this chapter, sorting does not make sense with an
unbounded dataset, and so sort-merge joins (see ["JOIN and GROUP BY"](/en/ch11#sec_batch_join))
cannot be used. Fault-tolerance mechanisms must also change: with a batch job that has been running
for a few minutes, a failed task can simply be restarted from the beginning, but with a stream job
that has been running for several years, restarting from the beginning after a crash may not be a
viable option.

### Uses of Stream Processing {#sec_stream_uses}

Stream processing has long been used for monitoring purposes, where an organization wants to be
alerted if certain things happen. For example:

- Fraud detection systems need to determine if the usage patterns of a credit card have unexpectedly
  changed, and block the card if it is likely to have been stolen.

- Trading systems need to examine price changes in a financial market and execute trades according
  to specified rules.

- Manufacturing systems need to monitor the status of machines in a factory, and quickly identify
  the problem if there is a malfunction.

- Military and intelligence systems need to track the activities of a potential aggressor, and raise
  the alarm if there are signs of an attack.

These kinds of applications require quite sophisticated pattern matching and correlations. However,
other uses of stream processing have also emerged over time. In this section we will briefly compare
and contrast some of these applications.

#### Complex event processing {#id317}

*Complex event processing* (CEP) is an approach developed in the 1990s for analyzing event streams,
especially geared toward the kind of application that requires searching for certain event patterns
[^52]. Similarly to the way that a regular expression allows you to search for certain
patterns of characters in a string, CEP allows you to specify rules to search for certain patterns
of events in a stream.

CEP systems often use a high-level declarative query language like SQL, or a graphical user
interface, to describe the patterns of events that should be detected. These queries are submitted
to a processing engine that consumes the input streams and internally maintains a state machine that
performs the required matching. When a match is found, the engine emits a *complex event* (hence the
name) with the details of the event pattern that was detected [^53].

In these systems, the relationship between queries and data is reversed compared to normal
databases. Usually, a database stores data persistently and treats queries as transient: when a
query comes in, the database searches for data matching the query, and then forgets about the query
when it has finished. CEP engines reverse these roles: queries are stored long-term; as each event
arrives, the engine checks whether it has now seen an event pattern that matches any of its standing
queries [^54].

Implementations of CEP include Esper, Apama, and TIBCO StreamBase. Distributed stream processors
like Flink and Spark Streaming also have SQL support for declarative queries on streams.

#### Stream analytics {#id318}

Another area in which stream processing is used is for *analytics* on streams. The boundary between
CEP and stream analytics is blurry, but as a general rule, analytics tends to be less interested in
finding specific event sequences and is more oriented toward aggregations and statistical metrics
over a large number of events---for example:

- Measuring the rate of some type of event (how often it occurs per time interval)

- Calculating the rolling average of a value over some time period

- Comparing current statistics to previous time intervals (e.g., to detect trends or to alert on
  metrics that are unusually high or low compared to the same time last week)

Such statistics are usually computed over fixed time intervals---for example, you might want to know
the average number of queries per second to a service over the last 5 minutes, and their 99th
percentile response time during that period. Averaging over a few minutes smooths out irrelevant
fluctuations from one second to the next, while still giving you a timely picture of any changes in
traffic pattern. The time interval over which you aggregate is known as a *window*, and we will look
into windowing in more detail in ["Reasoning About Time"](/en/ch12#sec_stream_time).

Stream analytics systems sometimes use probabilistic algorithms, such as Bloom filters (which we
encountered in ["Bloom filters"](/en/ch4#sec_storage_bloom_filter)) for set membership, HyperLogLog
[^55] for cardinality estimation, and various percentile estimation algorithms (see
["Computing Percentiles"](/en/ch2#sidebar_percentiles)). Probabilistic algorithms produce
approximate results, but have the advantage of requiring significantly less memory in the stream
processor than exact algorithms. This use of approximation algorithms sometimes leads people to
believe that stream processing systems are always lossy and inexact, but that is wrong: there is
nothing inherently approximate about stream processing, and using probabilistic algorithms is merely
an optimization [^56].

Many open source distributed stream processing frameworks are designed with analytics in mind: for
example, Apache Storm, Spark Streaming, Flink, Samza, Apache Beam, and Kafka Streams
[^57]. Hosted services include Google Cloud Dataflow and Azure Stream Analytics.

#### Maintaining materialized views {#sec_stream_mat_view}

We saw that a stream of changes to a database can be used to keep derived data systems, such as
caches, search indexes, and data warehouses, up to date with a source database. These are examples
of maintaining materialized views: deriving an alternative view onto some dataset so that you can
query it efficiently, and updating that view whenever the underlying data changes [^37].

Similarly, in event sourcing, application state is maintained by applying a log of events; here the
application state is also a kind of materialized view. Unlike stream analytics scenarios, it is
usually not sufficient to consider only events within some time window: building the materialized
view potentially requires *all* events over an arbitrary time period, apart from any obsolete events
that may be discarded by log compaction. In effect, you need a window that stretches all the way
back to the beginning of time.

In principle, any stream processor could be used for materialized view maintenance, although the
need to maintain events forever runs counter to the assumptions of some analytics-oriented
frameworks that mostly operate on windows of a limited duration. Kafka Streams and Confluent's
ksqlDB support this kind of usage, building upon Kafka's support for log compaction [^58].

> [!TIP] INCREMENTAL VIEW MAINTENANCE
> Databases might seem well suited for materialized view maintenance; they are designed to keep full
> copies of a dataset, after all. Many also support materialized views. We saw in ["Materialized Views
> and Data Cubes"](/en/ch4#sec_storage_materialized_views) that analytical queries
> typical of a data warehouse can be materialized into OLAP cubes.
>
> Unfortunately, databases often refresh materialized view tables using batch jobs or on-demand
> requests such as PostgreSQL's `REFRESH MATERIALIZED VIEW`. Views are recalculated
> periodically rather than as updates to source data occur. This approach has two significant
> drawbacks that make it inappropriate for stream processing view maintenance:
>
> 1.  Poor efficiency: All data is reprocessed every time the view is updated, though it's likely that
>     most of the data remains unchanged.
>
> 2.  Poor data freshness: Changes in source data are not reflected in a materialized view until its query
>     is re-run during its next scheduled update.
>
> It is possible to write database triggers that update materialized views efficiently in scenarios
> where the data is easily partitioned and the computation is naturally incremental. For example, if a
> materialized view maintains total sales revenue per day, the row for the appropriate day can be
> updated every time a new sale occurs. Bespoke solutions work in a few cases, but many SQL queries
> can't be easily or efficiently converted to incremental computation.
>
> <a id="sec_stream_ivm"></a>
>
> *Incremental view maintenance (IVM)* is a more general solution to the problems listed above. IVM
> techniques convert relational grammars such as SQL into operators capable of incremental
> computations. Rather than processing entire datasets, IVM algorithms recompute and update only data
> that has changed [^38], [^59], [^60]. View computation becomes far more
> efficient. Updates can then be run much more frequently, which dramatically increases data
> freshness.
>
> Databases such as Materialize [^61], RisingWave, ClickHouse, and Feldera all use IVM
> techniques to provide efficient incremental materialized views. These databases ingest streams of
> events to expose materialized views in real time. Recent events are buffered in memory and
> periodically used to update on-disk materialized views. Reads combine the recent events and the
> materialized data to provide a single real-time view. Since reads are often expressed in SQL and
> materialized views are often stored in OLAP-style formats, these systems also support large-scale
> data warehouse-style queries such as those discussed in
> [Chapter 11](/en/ch11#ch_batch).

#### Search on streams {#id320}

Besides CEP, which allows searching for patterns consisting of multiple events, there is also
sometimes a need to search for individual events based on complex criteria, such as full-text search
queries.

For example, media monitoring services subscribe to feeds of news articles and broadcasts from media
outlets, and search for any news mentioning companies, products, or topics of interest. This is done
by formulating a search query in advance, and then continually matching the stream of news items
against this query. Similar features exist on some websites: for example, users of real estate
websites can ask to be notified when a new property matching their search criteria appears on the
market. The percolator feature of Elasticsearch [^62] is one option for implementing this
kind of stream search.

Conventional search engines first index the documents and then run queries over the index. By
contrast, searching a stream turns the processing on its head: the queries are stored, and the
documents run past the queries, like in CEP. In the simplest case, you can test every document
against every query, although this can get slow if you have a large number of queries. To optimize
the process, it is possible to index the queries as well as the documents, and thus narrow down the
set of queries that may match [^63].

#### Event-Driven Architectures and RPC {#sec_stream_actors_drpc}

In ["Event-Driven Architectures"](/en/ch5#sec_encoding_dataflow_msg) we discussed message-passing
systems as an alternative to RPC---i.e., as a mechanism for services to communicate, as used for
example in the actor model. Although these systems are also based on messages and events, we
normally don't think of them as stream processors:

- Actor frameworks are primarily a mechanism for managing concurrency and distributed execution of
  communicating modules, whereas stream processing is primarily a data management technique.

- Communication between actors is often ephemeral and one-to-one, whereas event logs are durable and
  multi-subscriber.

- Actors can communicate in arbitrary ways (including cyclic request/response patterns), but stream
  processors are usually set up in acyclic pipelines where every stream is the output of one
  particular job, and derived from a well-defined set of input streams.

That said, there is some crossover area between RPC-like systems and stream processing. For example,
Apache Storm has a feature called *distributed RPC*, which allows user queries to be farmed out to a
set of nodes that also process event streams; these queries are then interleaved with events from
the input streams, and results can be aggregated and sent back to the user. (See also ["Multi-shard
data processing"](/en/ch13#sec_future_unbundled_multi_shard).)

It is also possible to process streams using actor frameworks. However, many such frameworks do not
guarantee message delivery in the case of crashes, so the processing is not fault-tolerant unless
you implement additional retry logic.

### Reasoning About Time {#sec_stream_time}

Stream processors often need to deal with time, especially when used for analytics purposes, which
frequently use time windows such as "the average over the last five minutes." It might seem that the
meaning of "the last five minutes" should be unambiguous and clear, but unfortunately the notion is
surprisingly tricky.

In a batch process, the processing tasks rapidly crunch through a large collection of historical
events. If some kind of breakdown by time needs to happen, the batch process needs to look at the
timestamp embedded in each event. There is no point in looking at the system clock of the machine
running the batch process, because the time at which the process is run has nothing to do with the
time at which the events actually occurred.

A batch process may read a year's worth of historical events within a few minutes; in most cases,
the timeline of interest is the year of history, not the few minutes of processing. Moreover, using
the timestamps in the events allows the processing to be deterministic: running the same process
again on the same input yields the same result.

On the other hand, many stream processing frameworks use the local system clock on the processing
machine (the *processing time*) to determine windowing [^64]. This approach has the
advantage of being simple, and it is reasonable if the delay between event creation and event
processing is negligibly short. However, it breaks down if there is any significant processing
lag---i.e., if the processing may happen noticeably later than the time at which the event actually
occurred.

#### Event time versus processing time {#id322}

There are many reasons why processing may be delayed: queueing, network faults, a performance issue
leading to contention in the message broker or processor, a restart of the stream consumer, or
reprocessing of past events while recovering from a fault or after fixing a bug in the code.

Moreover, message delays can also lead to unpredictable ordering of messages. For example, say a
user first makes one web request (which is handled by web server A), and then a second request
(which is handled by server B). A and B emit events describing the requests they handled, but B's
event reaches the message broker before A's event does. Now stream processors will first see the B
event and then the A event, even though they actually occurred in the opposite order.

If it helps to have an analogy, consider the *Star Wars* movies: Episode IV was released in 1977,
Episode V in 1980, and Episode VI in 1983, followed by Episodes I, II, and III in 1999, 2002, and
2005, respectively, and Episodes VII, VIII, and IX in 2015, 2017, and 2019 [^65]. If you
watched the movies in the order they came out, the order in which you processed the movies is
inconsistent with the order of their narrative. (The episode number is like the event timestamp, and
the date when you watched the movie is the processing time.) As humans, we are able to cope with
such discontinuities, but stream processing algorithms need to be specifically written to
accommodate such timing and ordering issues.

Confusing event time and processing time leads to bad data. For example, say you have a stream
processor that measures the rate of requests (counting the number of requests per second). If you
redeploy the stream processor, it may be shut down for a minute and process the backlog of events
when it comes back up. If you measure the rate based on the processing time, it will look as if
there was a sudden anomalous spike of requests while processing the backlog, when in fact the real
rate of requests was steady ([Figure 12-8](/en/ch12#fig_stream_processing_time)).

{{< figure src="/fig/ddia_1208.png" id="fig_stream_processing_time" caption="Figure 12-8. Windowing by processing time introduces artifacts due to variations in processing rate." class="w-full my-4" >}}

#### Handling straggler events {#id323}

A tricky problem when defining windows in terms of event time is that you can never be sure when you
have received all of the events for a particular window, or whether there are some events still to
come.

For example, say you're grouping events into one-minute windows so that you can count the number of
requests per minute. You have counted some number of events with timestamps that fall in the 37th
minute of the hour, and time has moved on; now most of the incoming events fall within the 38th and
39th minutes of the hour. When do you declare that you have finished the window for the 37th minute,
and output its counter value?

You can time out and declare a window ready after you have not seen any new events for that window
in a while. However, it could still happen that some events were buffered on another machine
somewhere, delayed due to a network interruption. You need to be able to handle such *straggler*
events that arrive after the window has already been declared complete. Broadly, you have two
options [^1]:

1.  Ignore the straggler events, as they are probably a small percentage of events in normal
    circumstances. You can track the number of dropped events as a metric, and alert if you start
    dropping a significant amount of data.

2.  Publish a *correction*, an updated value for the window with stragglers included. You may also
    need to retract the previous output.

In some cases it is possible to use a special message to indicate, "From now on there will be no
more messages with a timestamp earlier than *t*," which can be used by consumers to trigger windows
[^66]. However, if several producers on different machines are generating events, each
with their own minimum timestamp thresholds, the consumers need to keep track of each producer
individually. Adding and removing producers is trickier in this case.

#### Whose clock are you using, anyway? {#id438}

Assigning timestamps to events is even more difficult when events can be buffered at several points
in the system. For example, consider a mobile app that reports events for usage metrics to a server.
The app may be used while the device is offline, in which case it will buffer events locally on the
device and send them to a server when an internet connection is next available (which may be hours
or even days later). To any consumers of this stream, the events will appear as extremely delayed
stragglers.

In this context, the timestamp on the events should really be the time at which the user interaction
occurred, according to the mobile device's local clock. However, the clock on a user-controlled
device often cannot be trusted, as it may be accidentally or deliberately set to the wrong time (see
["Clock Synchronization and Accuracy"](/en/ch9#sec_distributed_clock_accuracy)). The time at which
the event was received by the server (according to the server's clock) is more likely to be
accurate, since the server is under your control, but less meaningful in terms of describing the
user interaction.

To adjust for incorrect device clocks, one approach is to log three timestamps [^67]:

- The time at which the event occurred, according to the device clock

- The time at which the event was sent to the server, according to the device clock

- The time at which the event was received by the server, according to the server clock

By subtracting the second timestamp from the third, you can estimate the offset between the device
clock and the server clock (assuming the network delay is negligible compared to the required
timestamp accuracy). You can then apply that offset to the event timestamp, and thus estimate the
true time at which the event actually occurred (assuming the device clock offset did not change
between the time the event occurred and the time it was sent to the server).

This problem is not unique to stream processing---batch processing suffers from exactly the same
issues of reasoning about time. It is just more noticeable in a streaming context, where we are more
aware of the passage of time.

#### Types of windows {#id324}

Once you know how the timestamp of an event should be determined, the next step is to decide how
windows over time periods should be defined. The window can then be used for aggregations, for
example to count events, or to calculate the average of values within the window. Several types of
windows are in common use [^64], [^68]:

Tumbling window

:   A tumbling window has a fixed length, and every event belongs to exactly one window. For
    example, if you have a 1-minute tumbling window, all the events with timestamps between 10:03:00
    and 10:03:59 are grouped into one window, events between 10:04:00 and 10:04:59 into the next
    window, and so on. You could implement a 1-minute tumbling window by taking each event timestamp
    and rounding it down to the nearest minute to determine the window that it belongs to.

Hopping window

:   A hopping window also has a fixed length, but allows windows to overlap in order to provide some
    smoothing. For example, a 5-minute window with a hop size of 1 minute would contain the events
    between 10:03:00 and 10:07:59, then the next window would cover events between 10:04:00 and
    10:08:59, and so on. You can implement this hopping window by first calculating 1-minute
    tumbling windows, and then aggregating over several adjacent windows.

Sliding window

:   A sliding window contains all the events that occur within some interval of each other. For
    example, a 5-minute sliding window would cover events at 10:03:39 and 10:08:12, because they are
    less than 5 minutes apart (note that tumbling and hopping 5-minute windows would not have put
    these two events in the same window, as they use fixed boundaries). A sliding window can be
    implemented by keeping a buffer of events sorted by time and removing old events when they
    expire from the window.

Session window

:   Unlike the other window types, a session window has no fixed duration. Instead, it is defined by
    grouping together all events for the same user that occur closely together in time, and the
    window ends when the user has been inactive for some time (for example, if there have been no
    events for 30 minutes). Sessionization is a common requirement for website analytics.

Window operations usually maintain temporary state. In some cases, the state is of a fixed size, no
matter how large the window or how many events occur: for example, a counting operation will only
have one counter regardless of the window size or event count. On the other hand, sliding windows or
stream joins, which we discuss in the next section, require that events be buffered until the window
finishes. Therefore, large window sizes or high-throughput streams can cause stream processors to
keep a lot of temporary state. You must then take care to ensure the machines running stream
processing tasks have enough capacity to maintain this state, whether in-memory or on-disk.

### Stream Joins {#sec_stream_joins}

In ["JOIN and GROUP BY"](/en/ch11#sec_batch_join) we discussed how batch jobs can join datasets by
key, and how such joins form an important part of data pipelines. Since stream processing
generalizes data pipelines to incremental processing of unbounded datasets, there is exactly the
same need for joins on streams.

However, the fact that new events can appear anytime on a stream makes joins on streams more
challenging than in batch jobs. To understand the situation better, let's distinguish three
different types of joins: *stream-stream* joins, *stream-table* joins, and *table-table* joins. In
the following sections we'll illustrate each by example.

#### Stream-stream join (window join) {#id440}

Say you have a search feature on your website, and you want to detect recent trends in searched-for
URLs. Every time someone types a search query, you log an event containing the query and the results
returned. Every time someone clicks one of the search results, you log another event recording the
click. In order to calculate the click-through rate for each URL in the search results, you need to
bring together the events for the search action and the click action, which are connected by having
the same session ID. Similar analyses are needed in advertising systems [^69].

The click may never come if the user abandons their search, and even if it comes, the time between
the search and the click may be highly variable: in many cases it might be a few seconds, but it
could be as long as days or weeks (if a user runs a search, forgets about that browser tab, and then
returns to the tab and clicks a result sometime later). Due to variable network delays, the click
event may even arrive before the search event. You can choose a suitable window for the join---for
example, you may choose to join a click with a search if they occur at most one hour apart.

Note that embedding the details of the search in the click event is not equivalent to joining the
events: doing so would only tell you about the cases where the user clicked a search result, not
about the searches where the user did not click any of the results. In order to measure search
quality, you need accurate click-through rates, for which you need both the search events and the
click events.

To implement this type of join, a stream processor needs to maintain *state*: for example, all the
events that occurred in the last hour, indexed by session ID. Whenever a search event or click event
occurs, it is added to the appropriate index, and the stream processor also checks the other index
to see if another event for the same session ID has already arrived. If there is a matching event,
you emit an event saying which search result was clicked. If the search event expires without you
seeing a matching click event, you emit an event saying which search results were not clicked.

#### Stream-table join (stream enrichment) {#sec_stream_table_joins}

In ["JOIN and GROUP BY"](/en/ch11#sec_batch_join) ([Figure 11-2](/en/ch11#fig_batch_join_example))
we saw an example of a batch job joining two datasets: a set of user activity events and a database
of user profiles. It is natural to think of the user activity events as a stream, and to perform the
same join on a continuous basis in a stream processor: the input is a stream of activity events
containing a user ID, and the output is a stream of activity events in which the user ID has been
augmented with profile information about the user. This process is sometimes known as *enriching*
the activity events with information from the database.

To perform this join, the stream process needs to look at one activity event at a time, look up the
event's user ID in the database, and add the profile information to the activity event. The database
lookup could be implemented by querying a remote database; however, as discussed in ["JOIN and GROUP
BY"](/en/ch11#sec_batch_join), such remote queries are likely to be slow and risk overloading the
database [^58].

Another approach is to load a copy of the database into the stream processor so that it can be
queried locally without a network round-trip. This technique is called a *hash join* since the local
copy of the database might be an in-memory hash table if it is small enough, or an index on the
local disk.

The difference from batch jobs is that a batch job uses a point-in-time snapshot of the database as
input, whereas a stream processor is long-running, and the contents of the database are likely to
change over time, so the stream processor's local copy of the database needs to be kept up to date.
This issue can be solved by change data capture: the stream processor can subscribe to a changelog
of the user profile database as well as the stream of activity events. When a profile is created or
modified, the stream processor updates its local copy. Thus, we obtain a join between two streams:
the activity events and the profile updates.

A stream-table join is actually very similar to a stream-stream join; the biggest difference is that
for the table changelog stream, the join uses a window that reaches back to the "beginning of time"
(a conceptually infinite window), with newer versions of records overwriting older ones. For the
stream input, the join might not maintain a window at all.

#### Table-table join (materialized view maintenance) {#id326}

Consider the social network timeline example that we discussed in ["Case Study: Social Network Home
Timelines"](/en/ch2#sec_introduction_twitter). We said that when a user wants to view their home
timeline, it is too expensive to iterate over all the people the user is following, find their
recent posts, and merge them.

Instead, we want a timeline cache: a kind of per-user "inbox" to which posts are written as they are
sent, so that reading the timeline is a single lookup. Materializing and maintaining this cache
requires the following event processing:

- When user *u* sends a new post, it is added to the timeline of every user who is following *u*.

- When a user deletes a post, or deletes their entire account, the affected posts are removed from
  all users' timelines.

- When user *u*~1~ starts following user *u*~2~, recent posts by *u*~2~ are added to *u*~1~'s
  timeline.

- When user *u*~1~ unfollows user *u*~2~, posts by *u*~2~ are removed from *u*~1~'s timeline.

To implement this cache maintenance in a stream processor, you need streams of events for posts
(sending and deleting) and for follow relationships (following and unfollowing). The stream process
needs to maintain a database containing the set of followers for each user so that it knows which
timelines need to be updated when a new post arrives.

Another way of looking at this stream process is that it maintains a materialized view for a query
that joins two tables (posts and follows), something like the following:

``` sql
SELECT follows.follower_id AS timeline_id,
  array_agg(posts.* ORDER BY posts.timestamp DESC)
FROM posts
JOIN follows ON follows.followee_id = posts.sender_id
GROUP BY follows.follower_id
```

The join of the streams corresponds directly to the join of the tables in that query. The timelines
are effectively a cache of the result of this query, updated every time the underlying tables
change.

> [!NOTE]
> If you regard a stream as the derivative of a table, as in
> [Figure 12-7](/en/ch12#fig_stream_integral), and regard a join as a product of two
> tables *u·v*, something interesting happens: the stream of changes to the materialized join follows
> the product rule (*u·v*)′ = *u*′*v* + *uv*′. In words: any change of posts is joined with the
> current followers, and any change of followers is joined with the current posts [^37].

#### Time-dependence of joins {#sec_stream_join_time}

The three types of joins described here (stream-stream, stream-table, and table-table) have a lot in
common: they all require the stream processor to maintain some state (search and click events, user
profiles, or follower list) based on one join input, and query that state on messages from the other
join input.

The order of the events that maintain the state is important (it matters whether you first follow
and then unfollow, or the other way round). In a sharded event log like Kafka, the ordering of
events within a single shard (partition) is preserved, but there is typically no ordering guarantee
across different streams or shards.

This raises a question: if events on different streams happen around a similar time, in which order
are they processed? In the stream-table join example, if a user updates their profile, which
activity events are joined with the old profile (processed before the profile update), and which are
joined with the new profile (processed after the profile update)? Put another way: if state changes
over time, and you join with some state, what point in time do you use for the join?

Such time dependence can occur in many places. For example, if you sell things, you need to apply
the right tax rate to invoices, which depends on the country or state, the type of product, and the
date of sale (since tax rates change from time to time). When joining sales to a table of tax rates,
you probably want to join with the tax rate at the time of the sale, which may be different from the
current tax rate if you are reprocessing historical data.

If the ordering of events across streams is undetermined, the join becomes nondeterministic
[^70], which means you cannot rerun the same job on the same input and necessarily get the
same result: the events on the input streams may be interleaved in a different way when you run the
job again.

In data warehouses, this issue is known as a *slowly changing dimension* (SCD), and it is often
addressed by using a unique identifier for a particular version of the joined record: for example,
every time the tax rate changes, it is given a new identifier, and the invoice includes the
identifier for the tax rate at the time of sale [^71], [^72]. This change makes the
join deterministic, but has the consequence that log compaction is not possible, since all versions
of the records in the table need to be retained. Alternatively, you can denormalize the data and
include the applicable tax rate directly in every sale event.

### Fault Tolerance {#sec_stream_fault_tolerance}

In the final section of this chapter, let's consider how stream processors can tolerate faults. We
saw in [Chapter 11](/en/ch11#ch_batch) that batch processing frameworks can tolerate faults fairly
easily: if a task fails, it can simply be started again on another machine, and the output of the
failed task is discarded. This transparent retry is possible because input files are immutable, each
task writes its output to a separate file, and output is only made visible when a task completes
successfully.

In particular, the batch approach to fault tolerance ensures that the output of the batch job is the
same as if nothing had gone wrong, even if in fact some tasks did fail. It appears as though every
input record was processed exactly once---no records are skipped, and none are processed twice.
Although restarting tasks means that records may in fact be processed multiple times, the visible
effect in the output is as if they had only been processed once. This principle is known as
*exactly-once semantics*, although *effectively-once* would be a more descriptive term
[^73].

The same issue of fault tolerance arises in stream processing, but it is less straightforward to
handle: waiting until a task is finished before making its output visible is not an option, because
a stream is infinite and so you can never finish processing it.

#### Microbatching and checkpointing {#id329}

One solution is to break the stream into small blocks, and treat each block like a miniature batch
process. This approach is called *microbatching*, and it is used in Spark Streaming [^74].
The batch size is typically around one second, which is the result of a performance compromise:
smaller batches incur greater scheduling and coordination overhead, while larger batches mean a
longer delay before results of the stream processor become visible.

Microbatching also implicitly provides a tumbling window equal to the batch size (windowed by
processing time, not event timestamps); any jobs that require larger windows need to explicitly
carry over state from one microbatch to the next.

A variant approach, used in Apache Flink, is to periodically generate rolling checkpoints of state
and write them to durable storage [^75], [^76]. If a stream operator crashes, it can
restart from its most recent checkpoint and discard any output generated between the last checkpoint
and the crash. The checkpoints are triggered by barriers in the message stream, similar to the
boundaries between microbatches, but without forcing a particular window size.

Within the confines of the stream processing framework, the microbatching and checkpointing
approaches provide the same exactly-once semantics as batch processing. However, as soon as output
leaves the stream processor (for example, by writing to a database, sending messages to an external
message broker, or sending emails), the framework is no longer able to discard the output of a
failed microbatch. In this case, restarting a failed task causes the external side effect to happen
twice, and microbatching or checkpointing alone is not sufficient to prevent this problem.

#### Atomic commit revisited {#sec_stream_atomic_commit}

In order to give the appearance of exactly-once processing in the presence of faults, we need to
ensure that all outputs and side effects of processing an event take effect *if and only if* the
processing is successful. Those effects include any messages sent to downstream operators or
external messaging systems (including email or push notifications), any database writes, any changes
to operator state, and any acknowledgment of input messages (including moving the consumer offset
forward in a log-based message broker).

Either all of those things need to happen atomically, or none of them must happen; they must not
go out of sync with each other. If this approach sounds familiar, it is because we discussed it in
["Exactly-once message processing"](/en/ch8#sec_transactions_exactly_once) in the context of
distributed transactions and two-phase commit.

In [Chapter 10](/en/ch10#ch_consistency) we discussed the problems in the traditional
implementations of distributed transactions, such as XA. However, in more restricted environments it
is possible to implement such an atomic commit facility efficiently. This approach is used in Google
Cloud Dataflow [^66], [^75], VoltDB [^77], and Apache Kafka [^78],
[^79]. Unlike XA, these implementations do not attempt to provide transactions across
heterogeneous technologies, but instead keep the transactions internal by managing both state
changes and messaging within the stream processing framework. The overhead of the transaction
protocol can be amortized by processing several input messages within a single transaction.

#### Idempotence {#sec_stream_idempotence}

Our goal is to discard the partial output of any failed tasks so that they can be safely retried
without taking effect twice. Distributed transactions are one way of achieving that goal, but
another way is to rely on *idempotence*, as we saw in ["Durable Execution and
Workflows"](/en/ch5#sec_encoding_dataflow_workflows) [^80].

An idempotent operation is one that you can perform multiple times, and it has the same effect as if
you performed it only once. For example, deleting a key in a key-value store is idempotent (deleting
the value again has no further effect), whereas incrementing a counter is not idempotent (performing
the increment again means the value is incremented twice).

Even if an operation is not naturally idempotent, it can often be made idempotent with a bit of
extra metadata. For example, when consuming messages from Kafka, every message has a persistent,
monotonically increasing offset. When writing a value to an external database, you can include the
offset of the message that triggered the last write with the value. Thus, you can tell whether an
update has already been applied, and avoid performing the same update again.

The state handling in Storm's Trident is based on a similar idea. Relying on idempotence implies
several assumptions: restarting a failed task must replay the same messages in the same order (a
log-based message broker does this), the processing must be deterministic, and no other node may
concurrently update the same value [^81], [^82].

When failing over from one processing node to another, fencing may be required (see ["Distributed
Locks and Leases"](/en/ch9#sec_distributed_lock_fencing)) to prevent interference from a node that
is thought to be dead but is actually alive. Despite all those caveats, idempotent operations can be
an effective way of achieving exactly-once semantics with only a small overhead.

#### Rebuilding state after a failure {#sec_stream_state_fault_tolerance}

Any stream process that requires state---for example, any windowed aggregations (such as counters,
averages, and histograms) and any tables and indexes used for joins---must ensure that this state
can be recovered after a failure.

One option is to keep the state in a remote datastore and replicate it, although having to query a
remote database for each individual message can be slow. An alternative is to keep state local to
the stream processor, and replicate it periodically. Then, when the stream processor is recovering
from a failure, the new task can read the replicated state and resume processing without data loss.

For example, Flink periodically captures snapshots of operator state and writes them to durable
storage such as a distributed filesystem [^75], [^76], and Kafka Streams replicates
state changes by sending them to a dedicated Kafka topic with log compaction, similar to change data
capture [^83]. VoltDB replicates state by redundantly processing each input message on
several nodes (see ["Actual Serial Execution"](/en/ch8#sec_transactions_serial)).

In some cases, it may not even be necessary to replicate the state, because it can be rebuilt from
the input streams. For example, if the state consists of aggregations over a fairly short window, it
may be fast enough to simply replay the input events corresponding to that window. If the state is a
local replica of a database, maintained by change data capture, the database can also be rebuilt
from the log-compacted change stream.

However, all of these trade-offs depend on the performance characteristics of the underlying
infrastructure: in some systems, network delay may be lower than disk access latency, and network
bandwidth may be comparable to disk bandwidth. There is no universally ideal trade-off for all
situations, and the merits of local versus remote state may also shift as storage and networking
technologies evolve.

## Summary {#id332}

In this chapter we have discussed event streams, what purposes they serve, and how to process them.
In some ways, stream processing is very much like the batch processing we discussed in
[Chapter 11](/en/ch11#ch_batch), but done continuously on unbounded (never-ending) streams rather
than on a fixed-size input [^84]. From this perspective, message brokers and event logs
serve as the streaming equivalent of a filesystem.

We spent some time comparing two types of message brokers:

AMQP/JMS-style message broker

:   The broker assigns individual messages to consumers, and consumers acknowledge individual
    messages when they have been successfully processed. Messages are deleted from the broker once
    they have been acknowledged. This approach is appropriate as an asynchronous form of RPC (see
    also ["Event-Driven Architectures"](/en/ch5#sec_encoding_dataflow_msg)), for example in a task
    queue, where the exact order of message processing is not important and where there is no need
    to go back and read old messages again after they have been processed.

Log-based message broker

:   The broker assigns all messages in a shard to the same consumer node, and always delivers
    messages in the same order. Parallelism is achieved through sharding, and consumers track their
    progress by checkpointing the offset of the last message they have processed. The broker retains
    messages on disk, so it is possible to jump back and reread old messages if necessary.

The log-based approach has similarities to the replication logs found in databases (see
[Chapter 6](/en/ch6#ch_replication)) and log-structured storage engines (see
[Chapter 4](/en/ch4#ch_storage)). It is also a form of consensus, as we saw in
[Chapter 10](/en/ch10#ch_consistency). We saw that this approach is especially appropriate for
stream processing systems that consume input streams and generate derived state or derived output
streams.

In terms of where streams come from, we discussed several possibilities: user activity events,
sensors providing periodic readings, and data feeds (e.g., market data in finance) are naturally
represented as streams. We saw that it can also be useful to think of the writes to a database as a
stream: we can capture the changelog---i.e., the history of all changes made to a database---either
implicitly through change data capture or explicitly through event sourcing. Log compaction allows
the stream to retain a full copy of the contents of a database.

Representing databases as streams opens up powerful opportunities for integrating systems. You can
keep derived data systems such as search indexes, caches, and analytics systems continually up to
date by consuming the log of changes and applying them to the derived system. You can even build
fresh views onto existing data by starting from scratch and consuming the log of changes from the
beginning all the way to the present.

The facilities for maintaining state as streams and replaying messages are also the basis for the
techniques that enable stream joins and fault tolerance in various stream processing frameworks. We
discussed several purposes of stream processing, including searching for event patterns (complex
event processing), computing windowed aggregations (stream analytics), and keeping derived data
systems up to date (materialized views).

We then discussed the difficulties of reasoning about time in a stream processor, including the
distinction between processing time and event timestamps, and the problem of dealing with straggler
events that arrive after you thought your window was complete.

We distinguished three types of joins that may appear in stream processes:

Stream-stream joins

:   Both input streams consist of activity events, and the join operator searches for related events
    that occur within some window of time. For example, it may match two actions taken by the same
    user within 30 minutes of each other. The two join inputs may in fact be the same stream (a
    *self-join*) if you want to find related events within that one stream.

Stream-table joins

:   One input stream consists of activity events, while the other is a database changelog. The
    changelog keeps a local copy of the database up to date. For each activity event, the join
    operator queries the database and outputs an enriched activity event.

Table-table joins

:   Both input streams are database changelogs. In this case, every change on one side is joined
    with the latest state of the other side. The result is a stream of changes to the materialized
    view of the join between the two tables.

Finally, we discussed techniques for achieving fault tolerance and exactly-once semantics in a
stream processor. As with batch processing, we need to discard the partial output of any failed
tasks. However, since a stream process is long-running and produces output continuously, we can't
simply discard all output. Instead, a finer-grained recovery mechanism can be used, based on
microbatching, checkpointing, transactions, or idempotent writes.

##### Footnotes

### References {#references}

[^1]: Tyler Akidau, Robert Bradshaw, Craig Chambers, Slava Chernyak, Rafael J. Fernández-Moctezuma, Reuven Lax, Sam McVeety, Daniel Mills, Frances Perry, Eric Schmidt, and Sam Whittle. [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf). *Proceedings of the VLDB Endowment*, volume 8, issue 12, pages 1792--1803, August 2015. [doi:10.14778/2824032.2824076](https://doi.org/10.14778/2824032.2824076)
[^2]: Harold Abelson, Gerald Jay Sussman, and Julie Sussman. [*Structure and Interpretation of Computer Programs*](https://web.mit.edu/6.001/6.037/sicp.pdf), 2nd edition. MIT Press, 1996. ISBN: 978-0-262-51087-5, archived at [archive.org/details/sicp_20211010](https://archive.org/details/sicp_20211010)
[^3]: Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec. [The Many Faces of Publish/Subscribe](https://www.cs.ru.nl/~pieter/oss/manyfaces.pdf). *ACM Computing Surveys*, volume 35, issue 2, pages 114--131, June 2003. [doi:10.1145/857076.857078](https://doi.org/10.1145/857076.857078)
[^4]: Don Carney, Uğur Çetintemel, Mitch Cherniack, Christian Convey, Sangdon Lee, Greg Seidman, Michael Stonebraker, Nesime Tatbul, and Stan Zdonik. [Monitoring Streams -- A New Class of Data Management Applications](https://www.vldb.org/conf/2002/S07P02.pdf). At *28th International Conference on Very Large Data Bases* (VLDB), August 2002. [doi:10.1016/B978-155860869-6/50027-5](https://doi.org/10.1016/B978-155860869-6/50027-5)
[^5]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY)
[^6]: Thomas Figg (tef). [how (not) to write a pipeline](https://web.archive.org/web/20250107135013/https://cohost.org/tef/post/1764930-how-not-to-write-a). *cohost.org*, June 2023. Archived at [perma.cc/A3V8-NYCM](https://perma.cc/A3V8-NYCM)
[^7]: Vicent Martí. [Brubeck, a statsd-Compatible Metrics Aggregator](https://github.blog/news-insights/the-library/brubeck/). *github.blog*, June 2015. Archived at [perma.cc/TP3Q-DJYM](https://perma.cc/TP3Q-DJYM)
[^8]: Seth Lowenberger. [MoldUDP64 Protocol Specification V 1.00](https://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf). *nasdaqtrader.com*, July 2009. Archived at [perma.cc/7CRQ-QBD7](https://perma.cc/7CRQ-QBD7)
[^9]: Ian Malpass. [Measure Anything, Measure Everything](https://codeascraft.com/2011/02/15/measure-anything-measure-everything/). *codeascraft.com*, February 2011. Archived at [archive.org](https://web.archive.org/web/20250820034209/https://www.etsy.com/codeascraft/measure-anything-measure-everything/)
[^10]: Dieter Plaetinck. [25 Graphite, Grafana and statsd Gotchas](https://grafana.com/blog/2016/03/03/25-graphite-grafana-and-statsd-gotchas/). *grafana.com*, March 2016. Archived at [perma.cc/3NP3-67U7](https://perma.cc/3NP3-67U7)
[^11]: Jeff Lindsay. [Web Hooks to Revolutionize the Web](https://progrium.github.io/blog/2007/05/03/web-hooks-to-revolutionize-the-web/). *progrium.com*, May 2007. Archived at [perma.cc/BF9U-XNX4](https://perma.cc/BF9U-XNX4)
[^12]: Jim N. Gray. [Queues Are Databases](https://arxiv.org/pdf/cs/0701158.pdf). Microsoft Research Technical Report MSR-TR-95-56, December 1995. Archived at [arxiv.org](https://arxiv.org/pdf/cs/0701158)
[^13]: Mark Hapner, Rich Burridge, Rahul Sharma, Joseph Fialli, Kate Stout, and Nigel Deakin. [JSR-343 Java Message Service (JMS) 2.0 Specification](https://jcp.org/en/jsr/detail?id=343). *jms-spec.java.net*, March 2013. Archived at [perma.cc/E4YG-46TA](https://perma.cc/E4YG-46TA)
[^14]: Sanjay Aiyagari, Matthew Arrott, Mark Atwell, Jason Brome, Alan Conway, Robert Godfrey, Robert Greig, Pieter Hintjens, John O'Hara, Matthias Radestock, Alexis Richardson, Martin Ritchie, Shahrokh Sadjadi, Rafael Schloming, Steven Shaw, Martin Sustrik, Carl Trieloff, Kim van der Riet, and Steve Vinoski. [AMQP: Advanced Message Queuing Protocol Specification](https://www.rabbitmq.com/resources/specs/amqp0-9-1.pdf). Version 0-9-1, November 2008. Archived at [perma.cc/6YJJ-GM9X](https://perma.cc/6YJJ-GM9X)
[^15]: [Architectural overview of Pub/Sub](https://cloud.google.com/pubsub/architecture). *cloud.google.com*, 2025. Archived at [perma.cc/VWF5-ABP4](https://perma.cc/VWF5-ABP4)
[^16]: Aris Tzoumas. [Lessons from scaling PostgreSQL queues to 100k events per second](https://www.rudderstack.com/blog/scaling-postgres-queue/). *rudderstack.com*, July 2025. Archived at [perma.cc/QD8C-VA4Y](https://perma.cc/QD8C-VA4Y)
[^17]: Robin Moffatt. [Kafka Connect Deep Dive -- Error Handling and Dead Letter Queues](https://www.confluent.io/blog/kafka-connect-deep-dive-error-handling-dead-letter-queues/). *confluent.io*, March 2019. Archived at [perma.cc/KQ5A-AB28](https://perma.cc/KQ5A-AB28)
[^18]: Dunith Danushka. [Message reprocessing: How to implement the dead letter queue](https://redpanda.com/blog/reliable-message-processing-with-dead-letter-queue). *redpanda.com*. Archived at [perma.cc/R7UB-WEWF](https://perma.cc/R7UB-WEWF)
[^19]: Damien Gasparina, Loic Greffier, and Sebastien Viale. [KIP-1034: Dead letter queue in Kafka Streams](https://cwiki.apache.org/confluence/display/KAFKA/KIP-1034%3A+Dead+letter+queue+in+Kafka+Streams). *cwiki.apache.org*, April 2024. Archived at [perma.cc/3VXV-QXAN](https://perma.cc/3VXV-QXAN)
[^20]: Jay Kreps, Neha Narkhede, and Jun Rao. [Kafka: A Distributed Messaging System for Log Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/09/Kafka.pdf). At *6th International Workshop on Networking Meets Databases* (NetDB), June 2011. Archived at [perma.cc/CSW7-TCQ5](https://perma.cc/CSW7-TCQ5)
[^21]: Jay Kreps. [Benchmarking Apache Kafka: 2 Million Writes Per Second (On Three Cheap Machines)](https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines). *engineering.linkedin.com*, April 2014. Archived at [archive.org](https://web.archive.org/web/20140921000742/https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines)
[^22]: Kartik Paramasivam. [How We're Improving and Advancing Kafka at LinkedIn](https://engineering.linkedin.com/apache-kafka/how-we_re-improving-and-advancing-kafka-linkedin). *engineering.linkedin.com*, September 2015. Archived at [perma.cc/3S3V-JCYJ](https://perma.cc/3S3V-JCYJ)
[^23]: Philippe Dobbelaere and Kyumars Sheykh Esmaili. [Kafka versus RabbitMQ: A comparative study of two industry reference publish/subscribe implementations](https://arxiv.org/abs/1709.00333). At *11th ACM International Conference on Distributed and Event-based Systems* (DEBS), June 2017. [doi:10.1145/3093742.3093908](https://doi.org/10.1145/3093742.3093908)
[^24]: Kate Holterhoff. [Why Message Queues Endure: A History](https://redmonk.com/kholterhoff/2024/12/12/why-message-queues-endure-a-history/). *redmonk.com*, December 2024. Archived at [perma.cc/6DX8-XK4W](https://perma.cc/6DX8-XK4W)
[^25]: Andrew Schofield. [KIP-932: Queues for Kafka](https://cwiki.apache.org/confluence/display/KAFKA/KIP-932%3A+Queues+for+Kafka). *cwiki.apache.org*, May 2023. Archived at [perma.cc/LBE4-BEMK](https://perma.cc/LBE4-BEMK)
[^26]: Jack Vanlightly. [The advantages of queues on logs](https://jack-vanlightly.com/blog/2023/10/2/the-advantages-of-queues-on-logs). *jack-vanlightly.com*, October 2023. Archived at [perma.cc/WJ7V-287K](https://perma.cc/WJ7V-287K)
[^27]: Jay Kreps. [The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying). *engineering.linkedin.com*, December 2013. Archived at [perma.cc/2JHR-FR64](https://perma.cc/2JHR-FR64)
[^28]: Andy Hattemer. [Change Data Capture is having a moment. Why?](https://materialize.com/blog/change-data-capture-is-having-a-moment-why/) *materialize.com*, September 2021. Archived at [perma.cc/AL37-P53C](https://perma.cc/AL37-P53C)
[^29]: Prem Santosh Udaya Shankar. [Streaming MySQL Tables in Real-Time to Kafka](https://engineeringblog.yelp.com/2016/08/streaming-mysql-tables-in-real-time-to-kafka.html). *engineeringblog.yelp.com*, August 2016. Archived at [perma.cc/5ZR3-2GVV](https://perma.cc/5ZR3-2GVV)
[^30]: Andreas Andreakis and Ioannis Papapanagiotou. [DBLog: A Watermark Based Change-Data-Capture Framework](https://arxiv.org/pdf/2010.12597). October 2020. Archived at [arxiv.org](https://arxiv.org/pdf/2010.12597)
[^31]: Jiri Pechanec. [Incremental Snapshots in Debezium](https://debezium.io/blog/2021/10/07/incremental-snapshots/). *debezium.io*, October 2021. Archived at [perma.cc/EQ8E-W6KQ](https://perma.cc/EQ8E-W6KQ)
[^32]: Debezium maintainers. [Debezium Connector for Cassandra](https://debezium.io/documentation/reference/stable/connectors/cassandra.html). *debezium.io*. Archived at [perma.cc/WR6K-EKMD](https://perma.cc/WR6K-EKMD)
[^33]: Neha Narkhede. [Announcing Kafka Connect: Building Large-Scale Low-Latency Data Pipelines](https://www.confluent.io/blog/announcing-kafka-connect-building-large-scale-low-latency-data-pipelines/). *confluent.io*, February 2016. Archived at [perma.cc/8WXJ-L6GF](https://perma.cc/8WXJ-L6GF)
[^34]: Chris Riccomini. [Kafka change data capture breaks database encapsulation](https://cnr.sh/posts/2018-11-05-kafka-change-data-capture-breaks-database-encapsulation/). *cnr.sh*, November 2018. Archived at [perma.cc/P572-9MKF](https://perma.cc/P572-9MKF)
[^35]: Gunnar Morling. ["Change Data Capture Breaks Encapsulation". Does it, though?](https://www.decodable.co/blog/change-data-capture-breaks-encapsulation-does-it-though) *decodable.co*, November 2023. Archived at [perma.cc/YX2P-WNWR](https://perma.cc/YX2P-WNWR)
[^36]: Gunnar Morling. [Revisiting the Outbox Pattern](https://www.decodable.co/blog/revisiting-the-outbox-pattern). *decodable.co*, October 2024. Archived at [perma.cc/M5ZL-RPS9](https://perma.cc/M5ZL-RPS9)
[^37]: Ashish Gupta and Inderpal Singh Mumick. [Maintenance of Materialized Views: Problems, Techniques, and Applications](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5). *IEEE Data Engineering Bulletin*, volume 18, issue 2, pages 3--18, June 1995. Archived at [archive.org](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5)
[^38]: Mihai Budiu, Tej Chajed, Frank McSherry, Leonid Ryzhyk, Val Tannen. [DBSP: Incremental Computation on Streams and Its Applications to Databases](https://sigmodrecord.org/publications/sigmodRecord/2403/pdfs/20_dbsp-budiu.pdf). *SIGMOD Record*, volume 53, issue 1, pages 87--95, March 2024. [doi:10.1145/3665252.3665271](https://doi.org/10.1145/3665252.3665271)
[^39]: Jim Gray and Andreas Reuter. [*Transaction Processing: Concepts and Techniques*](https://learning.oreilly.com/library/view/transaction-processing/9780080519555/). Morgan Kaufmann, 1992. ISBN: 9781558601901
[^40]: Martin Kleppmann. [Accounting for Computer Scientists](https://martin.kleppmann.com/2011/03/07/accounting-for-computer-scientists.html). *martin.kleppmann.com*, March 2011. Archived at [perma.cc/9EGX-P38N](https://perma.cc/9EGX-P38N)
[^41]: Pat Helland. [Immutability Changes Everything](https://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper16.pdf). At *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^42]: Martin Kleppmann. [*Making Sense of Stream Processing*](https://martin.kleppmann.com/papers/stream-processing.pdf). Report, O'Reilly Media, May 2016. Archived at [perma.cc/RAY4-JDVX](https://perma.cc/RAY4-JDVX)
[^43]: Kartik Paramasivam. [Stream Processing Hard Problems -- Part 1: Killing Lambda](https://engineering.linkedin.com/blog/2016/06/stream-processing-hard-problems-part-1-killing-lambda). *engineering.linkedin.com*, June 2016. Archived at [archive.org](https://web.archive.org/web/20240621211312/https://www.linkedin.com/blog/engineering/data-streaming-processing/stream-processing-hard-problems-part-1-killing-lambda)
[^44]: Stéphane Derosiaux. [CQRS: What? Why? How?](https://sderosiaux.medium.com/cqrs-what-why-how-945543482313) *sderosiaux.medium.com*, September 2019. Archived at [perma.cc/FZ3U-HVJ4](https://perma.cc/FZ3U-HVJ4)
[^45]: Baron Schwartz. [Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/). *xaprb.com*, December 2013. Archived at [archive.org](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/)
[^46]: Daniel Eloff, Slava Akhmechet, Jay Kreps, et al. [Re: Turning the Database Inside-out with Apache Samza](https://news.ycombinator.com/item?id=9145197). Hacker News discussion, *news.ycombinator.com*, March 2015. Archived at [perma.cc/ML9E-JC83](https://perma.cc/ML9E-JC83)
[^47]: [Datomic Documentation: Excision](https://docs.datomic.com/operation/excision.html). Cognitect, Inc., *docs.datomic.com*. Archived at [perma.cc/J5QQ-SH32](https://perma.cc/J5QQ-SH32)
[^48]: [Fossil Documentation: Deleting Content from Fossil](https://fossil-scm.org/home/doc/trunk/www/shunning.wiki). *fossil-scm.org*, 2025. Archived at [perma.cc/DS23-GTNG](https://perma.cc/DS23-GTNG)
[^49]: Jay Kreps. [The irony of distributed systems is that data loss is really easy but deleting data is surprisingly hard.](https://x.com/jaykreps/status/582580836425330688) *x.com*, March 2015. Archived at [perma.cc/7RRZ-V7B7](https://perma.cc/7RRZ-V7B7)
[^50]: Brent Robinson. [Crypto shredding: How it can solve modern data retention challenges](https://medium.com/@brentrobinson5/crypto-shredding-how-it-can-solve-modern-data-retention-challenges-da874b01745b). *medium.com*, January 2019. Archived at [perma.cc/4LFK-S6XE](https://perma.cc/4LFK-S6XE)
[^51]: Matthew D. Green and Ian Miers. [Forward Secure Asynchronous Messaging from Puncturable Encryption](https://isi.jhu.edu/~mgreen/forward_sec.pdf). At *IEEE Symposium on Security and Privacy*, May 2015. [doi:10.1109/SP.2015.26](https://doi.org/10.1109/SP.2015.26)
[^52]: David C. Luckham. [What's the Difference Between ESP and CEP?](https://complexevents.com/2020/06/15/whats-the-difference-between-esp-and-cep-2/) *complexevents.com*, June 2019. Archived at [perma.cc/E7PZ-FDEF](https://perma.cc/E7PZ-FDEF)
[^53]: Arvind Arasu, Shivnath Babu, and Jennifer Widom. [The CQL Continuous Query Language: Semantic Foundations and Query Execution](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cql.pdf). *The VLDB Journal*, volume 15, issue 2, pages 121--142, June 2006. [doi:10.1007/s00778-004-0147-z](https://doi.org/10.1007/s00778-004-0147-z)
[^54]: Julian Hyde. [Data in Flight: How Streaming SQL Technology Can Help Solve the Web 2.0 Data Crunch](https://queue.acm.org/detail.cfm?id=1667562). *ACM Queue*, volume 7, issue 11, December 2009. [doi:10.1145/1661785.1667562](https://doi.org/10.1145/1661785.1667562)
[^55]: Philippe Flajolet, Éric Fusy, Olivier Gandouet, and Frédéric Meunier. [HyperLogLog: The Analysis of a Near-Optimal Cardinality Estimation Algorithm](https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf). At *Conference on Analysis of Algorithms* (AofA), June 2007. [doi:10.46298/dmtcs.3545](https://doi.org/10.46298/dmtcs.3545)
[^56]: Jay Kreps. [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture). *oreilly.com*, July 2014. Archived at [perma.cc/2WY5-HC8Y](https://perma.cc/2WY5-HC8Y)
[^57]: Ian Reppel. [An Overview of Apache Streaming Technologies](https://ianreppel.org/an-overview-of-apache-streaming-technologies/). *ianreppel.org*, March 2016. Archived at [perma.cc/BB3E-QJLW](https://perma.cc/BB3E-QJLW)
[^58]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^59]: RisingWave Labs. [Deep Dive Into the RisingWave Stream Processing Engine - Part 2: Computational Model](https://risingwave.com/blog/deep-dive-into-the-risingwave-stream-processing-engine-part-2-computational-model/). *risingwave.com*, November 2023. Archived at [perma.cc/LM74-XDEL](https://perma.cc/LM74-XDEL)
[^60]: Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard. [Differential dataflow](https://www.cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf). At *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
[^61]: Andy Hattemer. [Incremental Computation in the Database](https://materialize.com/guides/incremental-computation/). *materialize.com*, March 2020. Archived at [perma.cc/AL94-YVRN](https://perma.cc/AL94-YVRN)
[^62]: Shay Banon. [Percolator](https://www.elastic.co/blog/percolator). *elastic.co*, February 2011. Archived at [perma.cc/LS5R-4FQX](https://perma.cc/LS5R-4FQX)
[^63]: Alan Woodward and Martin Kleppmann. [Real-Time Full-Text Search with Luwak and Samza](https://martin.kleppmann.com/2015/04/13/real-time-full-text-search-luwak-samza.html). *martin.kleppmann.com*, April 2015. Archived at [perma.cc/2U92-Q7R4](https://perma.cc/2U92-Q7R4)
[^64]: Tyler Akidau. [The World Beyond Batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102). *oreilly.com*, January 2016. Archived at [perma.cc/4XF9-8M2K](https://perma.cc/4XF9-8M2K)
[^65]: Stephan Ewen. [Streaming Analytics with Apache Flink](https://www.slideshare.net/slideshow/advanced-streaming-analytics-with-apache-flink-and-apache-kafka-stephan-ewen/61920008). At *Kafka Summit*, April 2016. Archived at [perma.cc/QBQ4-F9MR](https://perma.cc/QBQ4-F9MR)
[^66]: Tyler Akidau, Alex Balikov, Kaya Bekiroğlu, Slava Chernyak, Josh Haberman, Reuven Lax, Sam McVeety, Daniel Mills, Paul Nordstrom, and Sam Whittle. [MillWheel: Fault-Tolerant Stream Processing at Internet Scale](https://www.vldb.org/pvldb/vol6/p1033-akidau.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1033--1044, August 2013. [doi:10.14778/2536222.2536229](https://doi.org/10.14778/2536222.2536229)
[^67]: Alex Dean. [Improving Snowplow's Understanding of Time](https://snowplow.io/blog/improving-snowplows-understanding-of-time). *snowplow.io*, September 2015. Archived at [perma.cc/6CT9-Z3Q2](https://perma.cc/6CT9-Z3Q2)
[^68]: [Azure Stream Analytics: Windowing functions](https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics). Microsoft Azure Reference, *learn.microsoft.com*, July 2025. Archived at [archive.org](https://web.archive.org/web/20250901140013/https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics)
[^69]: Rajagopal Ananthanarayanan, Venkatesh Basker, Sumit Das, Ashish Gupta, Haifeng Jiang, Tianhao Qiu, Alexey Reznichenko, Deomid Ryabkov, Manpreet Singh, and Shivakumar Venkataraman. [Photon: Fault-Tolerant and Scalable Joining of Continuous Data Streams](https://research.google.com/pubs/archive/41529.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2465272](https://doi.org/10.1145/2463676.2465272)
[^70]: Ben Kirwin. [Doing the Impossible: Exactly-Once Messaging Patterns in Kafka](https://ben.kirw.in/2014/11/28/kafka-patterns/). *ben.kirw.in*, November 2014. Archived at [perma.cc/A5QL-QRX7](https://perma.cc/A5QL-QRX7)
[^71]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^72]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, 2013. ISBN: 978-1-118-53080-1
[^73]: Viktor Klang. [I'm coining the phrase 'effectively-once' for message processing with at-least-once + idempotent operations](https://x.com/viktorklang/status/789036133434978304). *x.com*, October 2016. Archived at [perma.cc/7DT9-TDG2](https://perma.cc/7DT9-TDG2)
[^74]: Matei Zaharia, Tathagata Das, Haoyuan Li, Scott Shenker, and Ion Stoica. [Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](https://www.usenix.org/system/files/conference/hotcloud12/hotcloud12-final28.pdf). At *4th USENIX Conference in Hot Topics in Cloud Computing* (HotCloud), June 2012.
[^75]: Kostas Tzoumas, Stephan Ewen, and Robert Metzger. [High-Throughput, Low-Latency, and Exactly-Once Stream Processing with Apache Flink](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink). *ververica.com*, August 2015. Archived at [archive.org](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink)
[^76]: Paris Carbone, Gyula Fóra, Stephan Ewen, Seif Haridi, and Kostas Tzoumas. [Lightweight Asynchronous Snapshots for Distributed Dataflows](https://arxiv.org/abs/1506.08603). arXiv:1506.08603 \[cs.DC\], June 2015.
[^77]: Ryan Betts and John Hugg. [*Fast Data: Smart and at Scale*](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-ebook-fast-data-smart-and-at-scale.pdf). Report, O'Reilly Media, October 2015. Archived at [perma.cc/VQ6S-XQQY](https://perma.cc/VQ6S-XQQY)
[^78]: Neha Narkhede and Guozhang Wang. [Exactly-Once Semantics Are Possible: Here's How Kafka Does It](https://www.confluent.io/blog/exactly-once-semantics-are-possible-heres-how-apache-kafka-does-it/). *confluent.io*, June 2019. Archived at [perma.cc/Q2AU-Q2ED](https://perma.cc/Q2AU-Q2ED)
[^79]: Jason Gustafson, Flavio Junqueira, Apurva Mehta, Sriram Subramanian, and Guozhang Wang. [KIP-98 -- Exactly Once Delivery and Transactional Messaging](https://cwiki.apache.org/confluence/display/KAFKA/KIP-98+-+Exactly+Once+Delivery+and+Transactional+Messaging). *cwiki.apache.org*, November 2016. Archived at [perma.cc/95PT-RCTG](https://perma.cc/95PT-RCTG)
[^80]: Pat Helland. [Idempotence Is Not a Medical Condition](https://dl.acm.org/doi/pdf/10.1145/2160718.2160734). *Communications of the ACM*, volume 55, issue 5, page 56, May 2012. [doi:10.1145/2160718.2160734](https://doi.org/10.1145/2160718.2160734)
[^81]: Jay Kreps. [Re: Trying to Achieve Deterministic Behavior on Recovery/Rewind](https://lists.apache.org/thread/n0sz6zld72nvjtnytv09pxc57mdcf9ft). Email to *samza-dev* mailing list, September 2014. Archived at [perma.cc/7DPD-GJNL](https://perma.cc/7DPD-GJNL)
[^82]: E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson. [A Survey of Rollback-Recovery Protocols in Message-Passing Systems](https://www.cs.utexas.edu/~lorenzo/papers/SurveyFinal.pdf). *ACM Computing Surveys*, volume 34, issue 3, pages 375--408, September 2002. [doi:10.1145/568522.568525](https://doi.org/10.1145/568522.568525)
[^83]: Adam Warski. [Kafka Streams -- How Does It Fit the Stream Processing Landscape?](https://softwaremill.com/kafka-streams-how-does-it-fit-stream-landscape/) *softwaremill.com*, June 2016. Archived at [perma.cc/WQ5Q-H2J2](https://perma.cc/WQ5Q-H2J2)
[^84]: Stephan Ewen, Fabian Hueske, and Xiaowei Jiang. [Batch as a Special Case of Streaming and Alibaba's contribution of Blink](https://flink.apache.org/2019/02/13/batch-as-a-special-case-of-streaming-and-alibabas-contribution-of-blink/). *flink.apache.org*, February 2019. Archived at [perma.cc/A529-SKA9](https://perma.cc/A529-SKA9)


================================================
FILE: content/en/ch13.md
================================================
---
title: "13. A Philosophy of Streaming Systems"
weight: 313
breadcrumbs: false
---

<a id="ch_philosophy"></a>

![](/map/ch12.png)

> *If a thing be ordained to another as to its end, its last end cannot consist in the preservation
> of its being. Hence a captain does not intend as a last end, the preservation of the ship
> entrusted to him, since a ship is ordained to something else as its end, viz. to navigation.*
>
> *(Often quoted as: If the highest aim of a captain was to preserve his ship, he would keep it in
> port forever.)*
>
> St. Thomas Aquinas, *Summa Theologica* (1265--1274)

> [!TIP] A NOTE FOR EARLY RELEASE READERS
> With Early Release ebooks, you get books in their earliest form---the author's raw and unedited
> content as they write---so you can take advantage of these technologies long before the official
> release of these titles.
>
> This will be the 13th chapter of the final book. The GitHub repo for this book is
> [*https://github.com/ept/ddia2-feedback*](https://github.com/ept/ddia2-feedback).
>
> If you'd like to be actively involved in reviewing and commenting on this draft, please reach out on GitHub.

In [Chapter 2](/en/ch2#ch_nonfunctional) we discussed the goal of creating applications and systems
that are *reliable*, *scalable*, and *maintainable*. These themes have run through all of the
chapters: for example, we discussed many fault-tolerance algorithms that help improve reliability,
sharding to improve scalability, and mechanisms for evolution and abstraction that improve
maintainability.

In this chapter we will bring all of these ideas together, and build on the streaming/event-driven
architecture ideas from [Chapter 12](/en/ch12#ch_stream) in particular to develop a philosophy of
application development that meets those goals. This chapter is more opinionated than previous
chapters, presenting a deep-dive into one particular philosophy rather than comparing multiple
approaches.

## Data Integration {#sec_future_integration}

A recurring theme in this book has been that for any given problem, there are several solutions, all
of which have different pros, cons, and trade-offs. For example, when discussing storage engines in
[Chapter 4](/en/ch4#ch_storage), we saw log-structured storage, B-trees, and column-oriented
storage. When discussing replication in [Chapter 6](/en/ch6#ch_replication), we saw single-leader,
multi-leader, and leaderless approaches.

If you have a problem such as "I want to store some data and look it up again later," there is no
one right solution, but many different approaches that are each appropriate in different
circumstances. A software implementation typically has to pick one particular approach. It's hard
enough to get one code path robust and performing well---trying to do everything in one piece of
software almost guarantees that the implementation will be poor.

Thus, the most appropriate choice of software tool also depends on the circumstances. Every piece of
software, even a so-called "general-purpose" database, is designed for a particular usage pattern.

Faced with this profusion of alternatives, the first challenge is then to figure out the mapping
between the software products and the circumstances in which they are a good fit. Vendors are
understandably reluctant to tell you about the kinds of workloads for which their software is poorly
suited, but hopefully the previous chapters have equipped you with some questions to ask in order to
read between the lines and better understand the trade-offs.

However, even if you perfectly understand the mapping between tools and circumstances for their use,
there is another challenge: in complex applications, data is often used in several different ways.
There is unlikely to be one piece of software that is suitable for *all* the different circumstances
in which the data is used, so you inevitably end up having to cobble together several different
pieces of software in order to provide your application's functionality.

### Combining Specialized Tools by Deriving Data {#id442}

For example, it is common to need to integrate an OLTP database with a full-text search index in
order to handle queries for arbitrary keywords. Although some databases (such as PostgreSQL) include
a full-text indexing feature, which can be sufficient for simple applications [^1], more
sophisticated search facilities require specialist information retrieval tools. Conversely, search
indexes are generally not very suitable as a durable system of record, and so many applications need
to combine two different tools in order to satisfy all of the requirements.

We touched on the issue of integrating data systems in ["Keeping Systems in
Sync"](/en/ch12#sec_stream_sync). As the number of different representations of the data increases,
the integration problem becomes harder. Besides the database and the search index, perhaps you need
to keep copies of the data in analytics systems (data warehouses, or batch and stream processing
systems); maintain caches or denormalized versions of objects that were derived from the original
data; pass the data through machine learning, classification, ranking, or recommendation systems; or
send notifications based on changes to the data.

#### Reasoning about dataflows {#id443}

When copies of the same data need to be maintained in several storage systems in order to satisfy
different access patterns, you need to be very clear about the inputs and outputs: where is data
written first, and which representations are derived from which sources? How do you get data into
all the right places, in the right formats?

For example, you might arrange for data to first be written to a system of record database,
capturing the changes made to that database (see ["Change Data Capture"](/en/ch12#sec_stream_cdc))
and then applying the changes to the search index in the same order. If change data capture (CDC) is
the only way of updating the index, you can be confident that the index is entirely derived from the
system of record, and therefore consistent with it (barring bugs in the software). Writing to the
database is the only way of supplying new input into this system.

Allowing the application to directly write to both the search index and the database introduces the
problem shown in [Figure 12-4](/en/ch12#fig_stream_write_order), in which two clients concurrently
send conflicting writes, and the two storage systems process them in a different order. In this
case, neither the database nor the search index is "in charge" of determining the order of writes,
and so they may make contradictory decisions and become permanently inconsistent with each other.

If it is possible for you to funnel all user input through a single system that decides on an
ordering for all writes, it becomes much easier to derive other representations of the data by
processing the writes in the same order. This is an application of the state machine replication
approach that we saw in ["Consensus in Practice"](/en/ch10#sec_consistency_total_order). Whether you
use change data capture or an event sourcing log is less important than simply the principle of
deciding on a total order.

Updating a derived data system based on an event log can often be made deterministic and idempotent
(see ["Idempotence"](/en/ch12#sec_stream_idempotence)), making it quite easy to recover from faults.

#### Derived data versus distributed transactions {#sec_future_derived_vs_transactions}

The classic approach for keeping different data systems consistent with each other involves
distributed transactions, as discussed in ["Two-Phase Commit (2PC)"](/en/ch8#sec_transactions_2pc).
How does the approach of using derived data systems fare in comparison to distributed transactions?

At an abstract level, they achieve a similar goal by different means. Distributed transactions
decide on an ordering of writes by using locks for mutual exclusion, while CDC and event sourcing
use a log for ordering. Distributed transactions use atomic commit to ensure that changes take
effect exactly once, while log-based systems are often based on deterministic retry and idempotence.

The biggest difference is that transaction systems usually guarantee that after a value is written,
you can immediately read the up-to-date value (see ["Reading Your Own
Writes"](/en/ch6#sec_replication_ryw)). On the other hand, derived data systems are often updated
asynchronously, and so they do not by default guarantee that reads are up-to-date.

Within limited environments that are willing to pay the cost of distributed transactions, they have
been used successfully. However, XA has poor fault tolerance and performance characteristics (see
["Distributed Transactions Across Different Systems"](/en/ch8#sec_transactions_xa)), which severely
limit its usefulness. It might be possible to create a better protocol for distributed transactions,
but getting such a protocol widely adopted and integrated with existing tools would be challenging,
and is unlikely to happen soon.

In the absence of widespread support for a good distributed transaction protocol, log-based derived
data is the most promising approach for integrating different data systems. However, guarantees such
as reading your own writes are useful, and it is not productive to tell everyone "eventual
consistency is inevitable---suck it up and learn to deal with it" (at least not without good
guidance on *how* to deal with it).

Later in this chapter we will discuss some approaches for implementing stronger guarantees on top of
asynchronously derived systems, and work toward a middle ground between distributed transactions and
asynchronous log-based systems.

#### The limits of total ordering {#id335}

With systems that are small enough, constructing a totally ordered event log is entirely feasible
(as demonstrated by the popularity of databases with single-leader replication, which construct
precisely such a log). However, as systems are scaled toward bigger and more complex workloads,
limitations begin to emerge:

- In most cases, constructing a totally ordered log requires all events to pass through a *single
  leader node* that decides on the ordering. If the throughput of events is greater than a single
  machine can handle, you need to shard the log across multiple machines. The order of events in two
  different shards is then ambiguous.

- If the servers are spread across multiple *geographically distributed* regions, for example in
  order to tolerate an entire datacenter going offline, you typically have a separate leader in each
  datacenter, because network delays make synchronous cross-datacenter coordination inefficient.
  This implies an undefined ordering of events that originate in two different datacenters.

- When applications are deployed as *microservices*, a common design choice is to deploy each
  service and its durable state as an independent unit, with no durable state shared between
  services. When two events originate in different services, there is no defined order for those
  events.

- Some applications maintain client-side state that is updated immediately on user input (without
  waiting for confirmation from a server), and even continue to work offline. With such
  applications, clients and servers are very likely to see events in different orders.

In formal terms, deciding on a total order of events is known as *total order broadcast*, which is
equivalent to consensus (see ["The Many Faces of Consensus"](/en/ch10#sec_consistency_faces)). Most
consensus algorithms are designed for situations in which the throughput of a single node is
sufficient to process the entire stream of events, and these algorithms do not provide a mechanism
for multiple nodes to share the work of ordering the events.

#### Ordering events to capture causality {#sec_future_capture_causality}

In cases where there is no causal link between events, the lack of a total order is not a big
problem, since concurrent events can be ordered arbitrarily. Some other cases are easy to handle:
for example, when there are multiple updates of the same object, they can be totally ordered by
routing all updates for a particular object ID to the same log shard. However, causal dependencies
sometimes arise in more subtle ways.

For example, consider a social networking service, and two users who were in a relationship but have
just broken up. One of the users removes the other as a friend, and then sends a message to their
remaining friends complaining about their ex-partner. The user's intention is that their ex-partner
should not see the rude message, since the message was sent after the friend status was revoked.

However, in a system that stores friendship status in one place and messages in another place, that
ordering dependency between the *unfriend* event and the *message-send* event may be lost. If the
causal dependency is not captured, a service that sends notifications about new messages may process
the *message-send* event before the *unfriend* event, and thus incorrectly send a notification to
the ex-partner.

In this example, the notifications are effectively a join between the messages and the friend list,
making it related to the timing issues of joins that we discussed previously (see ["Time-dependence
of joins"](/en/ch12#sec_stream_join_time)). Unfortunately, there does not seem to be a simple answer
to this problem [^2], [^3]. Starting points include:

- Logical timestamps can provide total ordering without coordination (see ["ID Generators and
  Logical Clocks"](/en/ch10#sec_consistency_logical)), so they may help in cases where total order
  broadcast is not feasible. However, they still require recipients to handle events that are
  delivered out of order, and they require additional metadata to be passed around.

- If you can log an event to record the state of the system that the user saw before making a
  decision, and give that event a unique identifier, then any later events can reference that event
  identifier in order to record the causal dependency [^4].

- Conflict resolution algorithms (see ["Automatic conflict
  resolution"](/en/ch6#sec_replication_automatic_resolution)) help with processing events that are
  delivered in an unexpected order. They are useful for maintaining state, but they do not help if
  actions have external side effects (such as sending a notification to a user).

Perhaps patterns for application development will emerge in the future that allow causal
dependencies to be captured efficiently, and derived state to be maintained correctly, without
forcing all events to go through the bottleneck of total order broadcast.

### Batch and Stream Processing {#sec_future_batch_streaming}

The goal of data integration is to make sure that data ends up in the right form in all the right
places. Doing so requires consuming inputs, transforming, joining, filtering, aggregating, training
models, evaluating, and eventually writing to the appropriate outputs. Batch and stream processors
are the tools for achieving this goal. The outputs of batch and stream processes are derived
datasets such as search indexes, materialized views, recommendations to show to users, aggregate
metrics, and so on.

As we saw in [Chapter 11](/en/ch11#ch_batch) and [Chapter 12](/en/ch12#ch_stream), batch and stream
processing have a lot of principles in common, and the main fundamental difference is that stream
processors operate on unbounded datasets whereas batch process inputs are of a known, finite size.

#### Maintaining derived state {#id446}

Batch processing has a quite strong functional flavor (even if the code is not written in a
functional programming language): it encourages deterministic, pure functions whose output depends
only on the input and which have no side effects other than the explicit outputs, treating inputs as
immutable and outputs as append-only. Stream processing is similar, but it extends operators to
allow managed, fault-tolerant state.

The principle of deterministic functions with well-defined inputs and outputs is not only good for
fault tolerance, but also simplifies reasoning about the dataflows in an organization
[^5]. No matter whether the derived data is a search index, a statistical model, or a
cache, it is helpful to think in terms of data pipelines that derive one thing from another, pushing
state changes in one system through functional application code and applying the effects to derived
systems.

In principle, derived data systems could be maintained synchronously, just like a relational
database updates secondary indexes synchronously within the same transaction as writes to the table
being indexed. However, asynchrony is what makes systems based on event logs robust: it allows a
fault in one part of the system to be contained locally, whereas distributed transactions abort if
any one participant fails, so they tend to amplify failures by spreading them to the rest of the
system.

We saw in ["Sharding and Secondary Indexes"](/en/ch7#sec_sharding_secondary_indexes) that secondary
indexes often cross shard boundaries. A sharded system with secondary indexes either needs to send
writes to multiple shards (if the index is term-partitioned) or send reads to all shards (if the
index is document-partitioned). Such cross-shard communication is also most reliable and scalable if
the index is maintained asynchronously [^6].

#### Reprocessing data for application evolution {#sec_future_reprocessing}

When maintaining derived data, batch and stream processing are both useful. Stream processing allows
changes in the input to be reflected in derived views with low delay, whereas batch processing
allows large amounts of accumulated historical data to be reprocessed in order to derive new views
onto an existing dataset.

In particular, reprocessing existing data provides a good mechanism for maintaining a system,
evolving it to support new features and changed requirements. Without reprocessing, schema evolution
is limited to simple changes like adding a new optional field to a record, or adding a new type of
record. On the other hand, with reprocessing it is possible to restructure a dataset into a
completely different model in order to better serve new requirements.

> [!TIP] SCHEMA MIGRATIONS ON RAILWAYS
> Large-scale "schema migrations" occur in noncomputer systems as well. For example, in the early days
> of railway building in 19th-century England there were various competing standards for the gauge
> (the distance between the two rails). Trains built for one gauge couldn't run on tracks of another
> gauge, which restricted the possible interconnections in the train network [^7].
>
> After a single standard gauge was finally decided upon in 1846, tracks with other gauges had to be
> converted---but how do you do this without shutting down the train line for months or years? The
> solution is to first convert the track to *dual gauge* or *mixed gauge* by adding a third rail. This
> conversion can be done gradually, and when it is done, trains of both gauges can run on the line,
> using two of the three rails. Eventually, once all trains have been converted to the standard gauge,
> the rail providing the nonstandard gauge can be removed.
>
> "Reprocessing" the existing tracks in this way, and allowing the old and new versions to exist side
> by side, makes it possible to change the gauge gradually over the course of years. Nevertheless, it
> is an expensive undertaking, which is why nonstandard gauges still exist today. For example, the
> BART system in the San Francisco Bay Area uses a different gauge from the majority of the US.

Derived views allow *gradual* evolution. If you want to restructure a dataset, you do not need to
perform the migration as a sudden switch. Instead, you can maintain the old schema and the new
schema side by side as two independently derived views onto the same underlying data. You can then
start shifting a small number of users to the new view in order to test its performance and find any
bugs, while most users continue to be routed to the old view. Gradually, you can increase the
proportion of users accessing the new view, and eventually you can drop the old view [^8],
[^9].

The beauty of such a gradual migration is that every stage of the process is easily reversible if
something goes wrong: you always have a working system to go back to. By reducing the risk of
irreversible damage, you can be more confident about going ahead, and thus move faster to improve
your system [^10].

#### Unifying batch and stream processing {#id338}

An early proposal for unifying batch and stream processing was the *lambda architecture*
[^11], which had a number of problems [^12] and has fallen out of use. More
recent systems allow batch computations (reprocessing historical data) and stream computations
(processing events as they arrive) to be implemented in the same system [^13], an approach
that is sometimes known as the *kappa architecture* [^12].

Unifying batch and stream processing in one system requires the following features:

- The ability to replay historical events through the same processing engine that handles the stream
  of recent events. For example, log-based message brokers have the ability to replay messages, and
  some stream processors can read input from a distributed filesystem or object storage.

- Exactly-once semantics for stream processors---that is, ensuring that the output is the same as if
  no faults had occurred, even if faults did in fact occur. As with batch processing, this
  requires discarding the partial output of any failed tasks.

- Tools for windowing by event time, not by processing time, since processing time is meaningless
  when reprocessing historical events. For example, Apache Beam provides an API for expressing such
  computations, which can then be run using Apache Flink or Google Cloud Dataflow.

## Unbundling Databases {#sec_future_unbundling}

At the most abstract level, databases, batch/stream processors, and operating systems all perform the
same functions: they store some data, and they allow you to process and query that data
[^14], [^15]. A database stores data in records of some data model (rows in tables,
documents, vertices in a graph, etc.) while an operating system's filesystem stores data in
files---but at their core, both are "information management" systems [^16]. As we saw in
[Chapter 11](/en/ch11#ch_batch), batch processors are like a distributed version of Unix.

Of course, there are many practical differences. For example, many filesystems do not cope very well
with a directory containing 10 million small files, whereas a database containing 10 million small
records is completely normal and unremarkable. Nevertheless, the similarities and differences
between operating systems and databases are worth exploring.

Unix and relational databases have approached the information management problem with very different
philosophies. Unix viewed its purpose as presenting programmers with a logical but fairly low-level
hardware abstraction, whereas relational databases wanted to give application programmers a
high-level abstraction that would hide the complexities of data structures on disk, concurrency,
crash recovery, and so on. Unix developed pipes and files that are just sequences of bytes, whereas
databases developed SQL and transactions.

Which approach is better? Of course, it depends on what you want. Unix is "simpler" in the sense that
it is a fairly thin wrapper around hardware resources; relational databases are "simpler" in the
sense that a short declarative query can draw on a lot of powerful infrastructure (query
optimization, indexes, join methods, concurrency control, replication, etc.) without the author of
the query needing to understand the implementation details.

The tension between these philosophies has lasted for decades (both Unix and the relational model
emerged in the early 1970s) and still isn't resolved. For example, the NoSQL movement could be
interpreted as wanting to apply a Unix-esque approach of low-level abstractions to the domain of
distributed OLTP data storage.

This section attempts to reconcile the two philosophies, in the hope that we can combine the best of
both worlds.

### Composing Data Storage Technologies {#id447}

Over the course of this book we have discussed various features provided by databases and how they
work, including:

- Secondary indexes, which allow you to efficiently search for records based on the value of a
  field;

- Materialized views, which are a kind of precomputed cache of query results;

- Replication logs, which keep copies of the data on other nodes up to date; and

- Full-text search indexes, which allow keyword search in text and which are built into some
  relational databases [^1].

In Chapters [11](/en/ch11#ch_batch) and [12](/en/ch12#ch_stream), similar themes emerged. We talked
about building full-text search indexes, about materialized view maintenance, and about replicating
changes from a database to derived data systems using change data capture.

It seems that there are parallels between the features that are built into databases and the derived
data systems that people are building with batch and stream processors.

#### Creating an index {#id340}

Think about what happens when you run `CREATE INDEX` to create a new index in a relational database.
The database has to scan over a consistent snapshot of a table, pick out all of the field values
being indexed, sort them, and write out the index. Then it must process the backlog of writes that
have been made since the consistent snapshot was taken (assuming the table was not locked while
creating the index, so writes could continue). Once that is done, the database must continue to keep
the index up to date whenever a transaction writes to the table.

This process is remarkably similar to setting up a new follower replica (see ["Setting Up New
Followers"](/en/ch6#sec_replication_new_replica)), and also very similar to bootstrapping change
data capture in a streaming system (see ["Initial snapshot"](/en/ch12#sec_stream_cdc_snapshot)).

Whenever you run `CREATE INDEX`, the database essentially reprocesses the existing dataset and
derives the index as a new view onto the existing data. The existing data may be a snapshot of the
state rather than a log of all changes that ever happened, but the two are closely related.

#### The meta-database of everything {#id341}

In this light, the dataflow across an entire organization starts looking like one huge database
[^5]. Whenever a batch, stream, or ETL process transports data from one place and form to
another place and form, it is acting like the database subsystem that keeps indexes or materialized
views up to date.

Viewed like this, batch and stream processors are like elaborate implementations of triggers, stored
procedures, and materialized view maintenance algorithms. The derived data systems they maintain are
like different index types. For example, a relational database may support B-tree indexes, hash
indexes, spatial indexes, and other types of indexes. In the emerging architecture of derived data
systems, instead of being implemented as features of a single integrated database product, those
facilities are provided by various different pieces of software, running on different machines,
administered by different teams.

Where will these developments take us in the future? If we start from the premise that there is no
single data model or storage format that is suitable for all access patterns, there are two avenues
by which different storage and processing tools can nevertheless be composed into a cohesive system:

Federated databases: unifying reads

:   It is possible to provide a unified query interface to a wide variety of underlying storage
    engines and processing methods---an approach known as a *federated database* or *polystore*
    [^17], [^18]. For example, PostgreSQL's *foreign data wrapper* feature fits this
    pattern, as do federated query engines such as Trino, Hoptimator, and Xorq. Applications that
    need a specialized data model or query interface can still access the underlying storage engines
    directly, while users who want to combine data from disparate places can do so easily through
    the federated interface.

    A federated query interface follows the relational tradition of a single integrated system with
    a high-level query language and elegant semantics, but a complicated implementation.

Unbundled databases: unifying writes

:   While federation addresses read-only querying across several different systems, it does not have
    a good answer to synchronizing writes across those systems. We said that within a single
    database, creating a consistent index is a built-in feature. When we compose several storage
    systems, we similarly need to ensure that all data changes end up in all the right places, even
    in the face of faults. Making it easier to reliably plug together storage systems (e.g., through
    change data capture and event logs) is like *unbundling* a database's index-maintenance features
    in a way that can synchronize writes across disparate technologies [^5], [^19].

    The unbundled approach follows the Unix tradition of small tools that do one thing well
    [^20], that communicate through a uniform low-level API (pipes), and that can be
    composed using a higher-level language (the shell) [^14].

#### Making unbundling work {#sec_future_unbundling_favor}

Federation and unbundling are two sides of the same coin: composing a reliable, scalable, and
maintainable system out of diverse components. Federated read-only querying requires mapping one
data model into another, which takes some thought but is ultimately quite a manageable problem.
Keeping the writes to several storage systems in sync is the harder engineering problem, and so we
will focus on it here.

The traditional approach to synchronizing writes requires distributed transactions across
heterogeneous storage systems [^17], which are problematic, as discussed previously.
Transactions within a single storage or stream processing system are feasible, but when data crosses
the boundary between different technologies, an asynchronous event log with idempotent writes is a
much more robust and practicable approach.

For example, distributed transactions are used within some stream processors to achieve exactly-once
semantics, and this can work quite well. However, when a transaction would need to involve systems
written by different groups of people (e.g., when data is written from a stream processor to a
distributed key-value store or search index), the lack of a standardized transaction protocol makes
integration much harder. An ordered log of events with idempotent consumers is a much simpler
abstraction, and thus much more feasible to implement across heterogeneous systems [^5].

The big advantage of log-based integration is *loose coupling* between the various components, which
manifests itself in two ways:

1.  At a system level, asynchronous event streams make the system as a whole more robust to outages
    or performance degradation of individual components. If a consumer runs slow or fails, the event
    log can buffer messages, allowing the producer and any other consumers to continue running
    unaffected. The faulty consumer can catch up when it is fixed, so it doesn't miss any data, and
    the fault is contained. By contrast, the synchronous interaction of distributed transactions
    tends to escalate local faults into large-scale failures.

2.  At a human level, unbundling data systems allows different software components and services to
    be developed, improved, and maintained independently from each other by different teams.
    Specialization allows each team to focus on doing one thing well, with well-defined interfaces
    to other teams' systems. Event logs provide an interface that is powerful enough to capture
    fairly strong consistency properties (due to durability and ordering of events), but also
    general enough to be applicable to almost any kind of data.

#### Unbundled versus integrated systems {#id448}

If unbundling does indeed become the way of the future, it will not replace databases in their
current form---they will still be needed as much as ever. Databases are still required for
maintaining state in stream processors, and in order to serve queries for the output of batch and
stream processors. Specialized query engines will continue to be important for particular workloads:
for example, query engines in data warehouses are optimized for exploratory analytic queries and
handle this kind of workload very well.

The complexity of running several different pieces of infrastructure can be a problem: each piece of
software has a learning curve, configuration issues, and operational quirks, and so it is worth
deploying as few moving parts as possible. A single integrated software product may also be able to
achieve better and more predictable performance on the kinds of workloads for which it is designed,
compared to a system consisting of several tools that you have composed with application code
[^21]. Building for scale that you don't need is wasted effort and may lock you into an
inflexible design. In effect, it is a form of premature optimization.

The goal of unbundling is not to compete with individual databases on performance for particular
workloads; the goal is to allow you to combine several different databases in order to achieve good
performance for a much wider range of workloads than is possible with a single piece of software.
It's about breadth, not depth.

Thus, if there is a single technology that does everything you need, you're most likely best off
simply using that product rather than trying to reimplement it yourself from lower-level components.
The advantages of unbundling and composition only come into the picture when there is no single
piece of software that satisfies all your requirements.

The tools for composing data systems are getting better: Debezium can extract change streams from
many databases, Kafka's protocol is becoming a de-facto standard for event streams, and incremental
view maintenance engines (see ["Incremental View Maintenance"](/en/ch12#sec_stream_ivm)) make it
possible to precompute and update caches of complex queries.

### Designing Applications Around Dataflow {#sec_future_dataflow}

The general idea of updating derived data when its underlying data changes is nothing new. For
example, spreadsheets have powerful dataflow programming capabilities [^22]: you can put a
formula in one cell (for example, the sum of cells in another column), and whenever any input to the
formula changes, the result of the formula is automatically recalculated. This is exactly what we
want at a data system level: when a record in a database changes, we want any index for that record
to be automatically updated, and any cached views or aggregations that depend on the record to be
automatically refreshed. You should not have to worry about the technical details of how this
refresh happens, but should be able to simply trust that it works correctly.

Thus, most data systems still have something to learn from the features that VisiCalc already had in
1979 [^23]. The difference from spreadsheets is that today's data systems need to be
fault-tolerant, scalable, and store data durably. They also need to be able to integrate disparate
technologies written by different groups of people over time, and reuse existing libraries and
services: it is unrealistic to expect all software to be developed using one particular language,
framework, or tool.

In this section we will expand on these ideas and explore some ways of building applications around
the ideas of unbundled databases and dataflow.

#### Application code as a derivation function {#sec_future_dataflow_derivation}

When one dataset is derived from another, it goes through some kind of transformation function. For
example:

- A secondary index is a kind of derived dataset with a straightforward transformation function: for
  each row or document in the base table, it picks out the values in the columns or fields being
  indexed, and sorts by those values (assuming an SSTable or B-tree index, which are sorted by key).

- A full-text search index is created by applying various natural language processing functions such
  as language detection, word segmentation, stemming or lemmatization, spelling correction, and
  synonym identification, followed by building a data structure for efficient lookups (such as an
  inverted index).

- In a machine learning system, we can consider the model as being derived from the training data by
  applying various feature extraction and statistical analysis functions. When the model is applied
  to new input data, the output of the model is derived from the input and the model (and hence,
  indirectly, from the training data).

- A cache often contains an aggregation of data in the form in which it is going to be displayed in
  a user interface (UI). Populating the cache thus requires knowledge of what fields are referenced
  in the UI; changes in the UI may require updating the definition of how the cache is populated and
  rebuilding the cache.

The derivation function for a secondary index is so commonly required that it is built into many
databases as a core feature, and you can invoke it by merely saying `CREATE INDEX`. For full-text
indexing, basic linguistic features for common languages may be built into a database, but the more
sophisticated features often require domain-specific tuning. In machine learning, feature
engineering is notoriously application-specific, and often has to incorporate detailed knowledge
about the user interaction and deployment of an application [^24].

When the function that creates a derived dataset is not a standard cookie-cutter function like
creating a secondary index, custom code is required to handle the application-specific aspects. And
this custom code is where many databases struggle. Although relational databases commonly support
triggers, stored procedures, and user-defined functions, which can be used to execute application
code within the database, they have been somewhat of an afterthought in database design.

#### Separation of application code and state {#id344}

In theory, databases could be deployment environments for arbitrary application code, like an
operating system. However, in practice they have turned out to be poorly suited for this purpose.
They do not fit well with the requirements of modern application development, such as dependency and
package management, version control, rolling upgrades, evolvability, monitoring, metrics, calls to
network services, and integration with external systems.

On the other hand, deployment and cluster management tools such as Kubernetes, Docker, Mesos, YARN,
and others are designed specifically for the purpose of running application code. By focusing on
doing one thing well, they are able to do it much better than a database that provides execution of
user-defined functions as one of its many features.

Most web applications today are deployed as stateless services, in which any user request can be
routed to any application server, and the server forgets everything about the request once it has
sent the response. This style of deployment is convenient, as servers can be added or removed at
will, but the state has to go somewhere: typically, a database. The trend has been to keep stateless
application logic separate from state management (databases): not putting application logic in the
database and not putting persistent state in the application [^25]. As people in the
functional programming community like to joke, "We believe in the separation of Church and state"
[^26].

> [!NOTE]
> Explaining a joke usually ruins it, but here is an explanation anyway so that nobody feels left out.
> *Church* is a reference to the mathematician Alonzo Church, who created the lambda calculus, an
> early form of computation that is the basis for most functional programming languages. The lambda
> calculus has no mutable state (i.e., no variables that can be overwritten), so one could say that
> mutable state is separate from Church's work.

In this typical web application model, the database acts as a kind of mutable shared variable that
can be accessed synchronously over the network. The application can read and update the variable,
and the database takes care of making it durable, providing some concurrency control and fault
tolerance.

However, in most programming languages you cannot subscribe to changes in a mutable variable---you
can only read it periodically. Unlike in a spreadsheet, readers of the variable don't get notified
if the value of the variable changes. (You can implement such notifications in your own code---this
is known as the *observer pattern*---but most languages do not have this pattern as a built-in
feature.)

Databases have inherited this passive approach to mutable data: if you want to find out whether the
content of the database has changed, often your only option is to poll (i.e., to repeat your query
periodically). Subscribing to changes is only just beginning to emerge as a feature.

#### Dataflow: Interplay between state changes and application code {#id450}

Thinking about applications in terms of dataflow implies renegotiating the relationship between
application code and state management. Instead of treating a database as a passive variable that is
manipulated by the application, we think much more about the interplay and collaboration between
state, state changes, and code that processes them. Application code responds to state changes in
one place by triggering state changes in another place.

We have already seen this idea in change data capture, in the actor model, in triggers, and in
incremental view maintenance. Unbundling the database means taking this idea and applying it to the
creation of derived datasets outside of the primary database: caches, full-text search indexes,
machine learning, or analytics systems. We can use stream processing and messaging systems for this
purpose.

Maintaining derived data requires the following properties, which log-based message brokers can
provide:

- When maintaining derived data, the order of state changes is often important (if several views are
  derived from an event log, they need to process the events in the same order so that they remain
  consistent with each other).

- Fault tolerance is essential: losing just a single message causes the derived dataset to go
  permanently out of sync with its data source. Both message delivery and derived state updates must
  be reliable.

Stable message ordering and fault-tolerant message processing are quite stringent demands, but they
are much less expensive and more operationally robust than distributed transactions. Modern stream
processors can provide these ordering and reliability guarantees at scale, and they allow
application code to be run as stream operators.

This application code can do the arbitrary processing that built-in derivation functions in
databases generally don't provide. Like Unix tools chained by pipes, stream operators can be
composed to build large systems around dataflow. Each operator takes streams of state changes as
input, and produces other streams of state changes as output.

#### Stream processors and services {#id345}

The currently dominant style of application development involves breaking down functionality into a
set of *services* that communicate via synchronous network requests such as REST APIs. The advantage
of such a service-oriented architecture over a single monolithic application is primarily
organizational scalability through loose coupling: different teams can work on different services,
which reduces coordination effort between teams (as long as the services can be deployed and updated
independently).

Composing stream operators into dataflow systems has a lot of similar characteristics to the
microservices approach [^27], [^28]. However, the underlying communication mechanism
is very different: one-directional, asynchronous message streams rather than synchronous
request/response interactions.

Besides the advantages listed in ["Event-Driven Architectures"](/en/ch5#sec_encoding_dataflow_msg),
such as better fault tolerance, dataflow systems can also achieve better performance than
traditional REST APIs or RPC. For example, say a customer is purchasing an item that is priced in
one currency but paid for in another currency. In order to perform the currency conversion, you need
to know the current exchange rate. This operation could be implemented in two ways [^27],
[^29]:

1.  In the microservices approach, the code that processes the purchase would probably query an
    exchange-rate service or database in order to obtain the current rate for a particular currency.

2.  In the dataflow approach, the code that processes purchases would subscribe to a stream of
    exchange rate updates ahead of time, and record the current rate in a local database whenever it
    changes. When it comes to processing the purchase, it only needs to query the local database.

The second approach has replaced a synchronous network request to another service with a query to a
local database (which may be on the same machine, even in the same process). In the microservices
approach, you could avoid the synchronous network request by caching the exchange rate locally in
the service that processes the purchase. However, in order to keep that cache fresh, you would need
to periodically poll for updated exchange rates, or subscribe to a stream of changes---which is
exactly what happens in the dataflow approach.

Not only is the dataflow approach faster, but it is also more robust to the failure of another
service. The fastest and most reliable network request is no network request at all! Instead of RPC,
we now have a stream join between purchase events and exchange rate update events.

The join is time-dependent: if the purchase events are reprocessed at a later point in time, the
exchange rate will have changed. If you want to reconstruct the original output, you will need to
obtain the historical exchange rate at the original time of purchase. No matter whether you query a
service or subscribe to a stream of exchange rate updates, you will need to handle this time
dependence (see ["Time-dependence of joins"](/en/ch12#sec_stream_join_time)).

Subscribing to a stream of changes, rather than querying the current state when needed, brings us
closer to a spreadsheet-like model of computation: when some piece of data changes, any derived data
that depends on it can swiftly be updated. There are still many open questions, for example around
issues like time-dependent joins, but building applications around dataflow ideas is a very
promising direction to explore.

### Observing Derived State {#sec_future_observing}

At an abstract level, the dataflow systems discussed in the last section give you a process for
creating derived datasets (such as search indexes, materialized views, and predictive models) and
keeping them up to date. Let's call that process the *write path*: whenever some piece of
information is written to the system, it may go through multiple stages of batch and stream
processing, and eventually every derived dataset is updated to incorporate the data that was
written. [Figure 13-1](/en/ch13#fig_future_write_read_paths) shows an example of updating a search
index.

{{< figure src="/fig/ddia_1301.png" id="fig_future_write_read_paths" caption="Figure 13-1. In a search index, writes (document updates) meet reads (queries)." class="w-full my-4" >}}

But why do you create the derived dataset in the first place? Most likely because you want to query
it again at a later time. This is the *read path*: when serving a user request you read from the
derived dataset, perhaps perform some more processing on the results, and construct the response to
the user.

Taken together, the write path and the read path encompass the whole journey of the data, from the
point where it is collected to the point where it is consumed (probably by another human). The write
path is the portion of the journey that is precomputed---i.e., that is done eagerly as soon as the
data comes in, regardless of whether anyone has asked to see it. The read path is the portion of the
journey that only happens when someone asks for it. If you are familiar with functional programming
languages, you might notice that the write path is similar to eager evaluation, and the read path is
similar to lazy evaluation.

The derived dataset is the place where the write path and the read path meet, as illustrated in
[Figure 13-1](/en/ch13#fig_future_write_read_paths). It represents a trade-off between the amount of
work that needs to be done at write time and the amount that needs to be done at read time.

#### Materialized views and caching {#id451}

A full-text search index is a good example: the write path updates the index, and the read path
searches the index for keywords. Both reads and writes need to do some work. Writes need to update
the index entries for all terms that appear in the document. Reads need to search for each of the
words in the query, and apply Boolean logic to find documents that contain *all* of the words in the
query (an `AND` operator), or *any* synonym of each of the words (an `OR` operator).

If you didn't have an index, a search query would have to scan over all documents (like `grep`),
which would get very expensive if you had a large number of documents. No index means less work on
the write path (no index to update), but a lot more work on the read path.

On the other hand, you could imagine precomputing the search results for all possible queries. In
that case, you would have less work to do on the read path: no Boolean logic, just find the results
for your query and return them. However, the write path would be a lot more expensive: the set of
possible search queries that could be asked is infinite (or at least exponential in the number of
terms in the corpus), and thus precomputing all possible search results would be infeasible.

Another option would be to precompute the search results for only a fixed set of the most common
queries, so that they can be served quickly without having to go to the index. The uncommon queries
can still be served from the index. This would generally be called a *cache* of common queries,
although we could also call it a materialized view, as it would need to be updated when new
documents appear that should be included in the results of one of the common queries.

From this example we can see that an index is not the only possible boundary between the write path
and the read path. Caching of common search results is possible, and `grep`-like scanning without
the index is also possible on a small number of documents. Viewed like this, the role of caches,
indexes, and materialized views is simple: they shift the boundary between the read path and the
write path. They allow us to do more work on the write path, by precomputing results, in order to
save effort on the read path.

Shifting the boundary between work done on the write path and the read path was in fact the topic of
the social networking example in ["Case Study: Social Network Home
Timelines"](/en/ch2#sec_introduction_twitter). In that example, we also saw how the boundary between
write path and read path might be drawn differently for celebrities compared to ordinary users.
After 500 pages we have come full circle!

#### Stateful, offline-capable clients {#id347}

The idea of a boundary between write and read paths is interesting because we can discuss shifting
that boundary and explore what that shift means in practical terms. Let's look at the idea in a
different context.

In the past, web browsers were stateless clients that could only do useful things when you had an
internet connection (just about the only thing you could do offline was to scroll up and down in a
page that you had previously loaded while online). However, single-page JavaScript web apps now have
a lot of stateful capabilities, including client-side user interface interaction and persistent
local storage in the web browser. Mobile apps can similarly store a lot of state on the device and
don't require a round-trip to the server for most user interactions.

In ["Sync Engines and Local-First Software"](/en/ch6#sec_replication_offline_clients) we saw how
persistent local state enables a class of applications in which users can work offline, without an
internet connection, and sync with remote servers in the background when a network connection is
available [^30]. Since mobile devices sometimes have slow and unreliable cellular internet
connections, it's a big advantage for users if their user interface does not have to wait for
synchronous network requests, and if apps mostly work offline.

When we move away from the assumption of stateless clients talking to a central database and toward
state that is maintained on end-user devices, a world of new opportunities opens up. In particular,
we can think of the on-device state as a *cache of state on the server*. The pixels on the screen
are a materialized view onto model objects in the client app; the model objects are a local replica
of state in a remote datacenter [^31].

#### Pushing state changes to clients {#id348}

In a typical web page, if you load the page in a web browser and the data subsequently changes on
the server, the browser does not find out about the change until you reload the page. The browser
only reads the data at one point in time, assuming that it is static---it does not subscribe to
updates from the server. Thus, the state in the browser is a stale cache that is not updated unless
you explicitly poll for changes. (HTTP-based feed subscription protocols like RSS are really just a
basic form of polling.)

More recent protocols have moved beyond the basic request/response pattern of HTTP: server-sent
events (the EventSource API) and WebSockets provide communication channels by which a web browser
can keep an open TCP connection to a server, and the server can actively push messages to the
browser as long as it remains connected. This provides an opportunity for the server to actively
inform the end-user client about any changes to the state it has stored locally, reducing the
staleness of the client-side state.

In terms of our model of write path and read path, actively pushing state changes all the way to
client devices means extending the write path all the way to the end user. When a client is first
initialized, it would still need to use a read path to get its initial state, but thereafter it
could rely on a stream of state changes sent by the server. The ideas we discussed around stream
processing and messaging are not restricted to running only in a datacenter: we can take the ideas
further, and extend them all the way to end-user devices [^32].

The devices will be offline some of the time, and unable to receive any notifications of state
changes from the server during that time. But we already solved that problem: in ["Consumer
offsets"](/en/ch12#sec_stream_log_offsets) we discussed how a consumer of a log-based message broker
can reconnect after failing or becoming disconnected, and ensure that it doesn't miss any messages
that arrived while it was disconnected. The same technique works for individual users, where each
device is a small subscriber to a small stream of events.

#### End-to-end event streams {#id349}

Tools for developing stateful clients and user interfaces, such as React and Elm [^33],
already have the ability to update the rendered user interface in response to changes in the
underlying state. It would be very natural to extend this programming model to also allow a server
to push state-change events into this client-side event pipeline.

Thus, state changes could flow through an end-to-end write path: from the interaction on one device
that triggers a state change, via event logs and through several derived data systems and stream
processors, all the way to the user interface of a person observing the state on another device.
These state changes could be propagated with fairly low delay---say, under one second end to end.

Some applications, such as instant messaging and online games, already have such a "real-time"
architecture (in the sense of interactions with low delay, not in the sense of response time
guarantees). But why don't we build all applications this way?

The challenge is that the assumption of stateless clients and request/response interactions is very
deeply ingrained in our databases, libraries, frameworks, and protocols. Many datastores support
read and write operations where a request returns one response, but far fewer provide an ability to
subscribe to changes---i.e., a request that returns a stream of responses over time.

In order to extend the write path all the way to the end user, we would need to fundamentally
rethink the way we build many of these systems: moving away from request/response interaction and
toward publish/subscribe dataflow [^31]. This would require effort, but it would have the
advantage of making user interfaces more responsive and providing better offline support.

#### Reads are events too {#sec_future_read_events}

We discussed that when a stream processor writes derived data to a store (database, cache, or
index), and that store is queried, the store acts as the boundary between the write path and the
read path. The store allows random-access read queries to the data that would otherwise require
scanning the whole event log.

In many cases, the data storage is separate from the streaming system. But recall that stream
processors also need to maintain state to perform aggregations and joins. This state is normally
hidden inside the stream processor, but some frameworks allow it to also be queried by outside
clients [^34], turning the stream processor itself into a kind of simple database.

Let's take that idea further. As discussed so far, the writes to the store go through an event log,
while reads are transient network requests that go directly to the nodes that store the data being
queried. This is a reasonable design, but not the only possible one. It is also possible to
represent read requests as streams of events, and send both the read events and the write events
through a stream processor; the processor responds to read events by emitting the result of the read
to an output stream [^35].

When both the writes and the reads are represented as events, and routed to the same stream operator
in order to be handled, we are in fact performing a stream-table join between the stream of read
queries and the database. The read event needs to be sent to the database shard holding the data,
just like batch and stream processors need to copartition inputs on the same key when joining.

This correspondence between serving requests and performing joins is quite fundamental
[^36]. A one-off read request passes through the join operator, which then immediately
forgets the request; a subscribe request is a persistent join with past and future events on the
other side of the join.

Recording a log of read events potentially also has benefits with regard to tracking causal
dependencies and data provenance across a system: it would allow you to reconstruct what the user
saw before they made a particular decision. For example, in an online shop, it is likely that the
predicted shipping date and the inventory status shown to a customer affect whether they choose to
buy an item [^4]. To analyze this connection, you need to record the result of the user's
query of the shipping and inventory status.

Writing read requests to durable storage thus enables better tracking of causal dependencies, but it
incurs additional storage and I/O cost. Optimizing such systems to reduce the overhead is still an
open research problem [^2]. But if you already log read requests for operational purposes,
as a side effect of request processing, it is not such a great change to make the log the source of
the requests instead.

#### Multi-shard data processing {#sec_future_unbundled_multi_shard}

For queries that only touch a single shard, the effort of sending queries through a stream and
collecting a stream of responses is perhaps overkill. However, this idea opens the possibility of
distributed execution of complex queries that need to combine data from several shards, taking
advantage of the infrastructure for message routing, sharding, and joining that is already provided
by stream processors.

Storm's distributed RPC feature supports this usage pattern. For example, it has been used to
compute the number of people who have seen a URL on a social network---i.e., the union of the
follower sets of everyone who has posted that URL [^37]. As the set of users is sharded,
this computation requires combining results from many shards.

Another example of this pattern occurs in fraud prevention: in order to assess the risk of whether a
particular purchase event is fraudulent, you can examine the reputation scores of the user's IP
address, email address, billing address, shipping address, and so on. Each of these reputation
databases is itself sharded, and so collecting the scores for a particular purchase event requires a
sequence of joins with differently sharded datasets [^38].

The internal query execution graphs of data warehouse query engines have similar characteristics. If
you need to perform this kind of multi-shard join, it is probably simpler to use a database that
provides this feature than to implement it using a stream processor. However, treating queries as
streams provides an option for implementing large-scale applications that run against the limits of
conventional off-the-shelf solutions.

## Aiming for Correctness {#sec_future_correctness}

With stateless services that only read data, it is not a big deal if something goes wrong: you can
fix the bug and restart the service, and everything returns to normal. Stateful systems such as
databases are not so simple: they are designed to remember things forever (more or less), so if
something goes wrong, the effects also potentially last forever---which means they require more
careful thought [^39].

We want to build applications that are reliable and *correct* (i.e., programs whose semantics are
well defined and understood, even in the face of various faults). For approximately four decades,
the transaction properties of atomicity, isolation, and durability have been the tools of choice for
building correct applications. However, those foundations are weaker than they seem: witness for
example the confusion of weak isolation levels (see ["Weak Isolation
Levels"](/en/ch8#sec_transactions_isolation_levels)).

In some areas, transactions have been abandoned entirely and replaced with models that offer better
performance and scalability, but much messier semantics. *Consistency* is often talked about, but
poorly defined. Some people assert that we should "embrace weak consistency" for the sake of better
availability, while lacking a clear idea of what that actually means in practice.

For a topic that is so important, our understanding and our engineering methods are surprisingly
flaky. For example, it is very difficult to determine whether it is safe to run a particular
application at a particular transaction isolation level or replication configuration [^40],
[^41]. Often simple solutions appear to work correctly when concurrency is low and there are
no faults, but turn out to have many subtle bugs in more demanding circumstances.

For example, Kyle Kingsbury's Jepsen experiments [^42] have highlighted the stark
discrepancies between some products' claimed safety guarantees and their actual behavior in the
presence of network problems and crashes. Even if infrastructure products like databases were free
from problems, application code would still need to correctly use the features they provide, which
is error-prone if the configuration is hard to understand (which is the case with weak isolation
levels, quorum configurations, and so on).

If your application can tolerate occasionally corrupting or losing data in unpredictable ways, life
is a lot simpler, and you might be able to get away with simply crossing your fingers and hoping for
the best. On the other hand, if you need stronger assurances of correctness, then serializability
and atomic commit are established approaches, but they come at a cost: they typically only work in a
single datacenter (ruling out geographically distributed architectures), and they limit the scale
and fault-tolerance properties you can achieve.

While the traditional transaction approach is not going away, it is not the last word in making
applications correct and resilient to faults. In this section we will explore some ways of thinking
about correctness in the context of dataflow architectures.

### The End-to-End Argument for Databases {#sec_future_end_to_end}

Just because an application uses a data system that provides comparatively strong safety properties,
such as serializable transactions, that does not mean the application is guaranteed to be free from
data loss or corruption. For example, if an application has a bug that causes it to write incorrect
data, or delete data from a database, serializable transactions aren't going to save you. This is an
argument in favor of immutable and append-only data, because it is easier to recover from such
mistakes if you remove the ability of faulty code to destroy good data.

Although immutability is useful, it is not a cure-all by itself. Let's look at a more subtle example
of data corruption that can occur.

#### Exactly-once execution of an operation {#id353}

In ["Fault Tolerance"](/en/ch12#sec_stream_fault_tolerance) we encountered *exactly-once* (or
*effectively-once*) semantics. If something goes wrong while processing a message, you can either
give up (drop the message---i.e., incur data loss) or try again. If you try again, there is the risk
that it actually succeeded the first time, but you just didn't find out about the success, and so
the message ends up being processed twice.

Processing twice is a form of data corruption: it is undesirable to charge a customer twice for the
same service (billing them too much) or increment a counter twice (overstating some metric). In this
context, *exactly-once* means arranging the computation such that the final effect is the same as if
no faults had occurred, even if the operation actually was retried due to some fault. We previously
discussed a few approaches for achieving this goal.

One of the most effective approaches is to make the operation *idempotent*; that is, to ensure that
it has the same effect, no matter whether it is executed once or multiple times. However, taking an
operation that is not naturally idempotent and making it idempotent requires some effort and care:
you may need to maintain some additional metadata (such as the set of operation IDs that have
updated a value), and ensure fencing when failing over from one node to another (see ["Distributed
Locks and Leases"](/en/ch9#sec_distributed_lock_fencing)).

#### Duplicate suppression {#id354}

The same pattern of needing to suppress duplicates occurs in many other places besides stream
processing. For example, TCP uses sequence numbers on packets to put them in the correct order at
the recipient, and to determine whether any packets were lost or duplicated on the network. Any lost
packets are retransmitted and any duplicates are removed by the TCP stack before it hands the data
to an application.

However, this duplicate suppression only works within the context of a single TCP connection.
Imagine the TCP connection is a client's connection to a database, and it is currently executing the
transaction in [Example 13-1](/en/ch13#fig_future_non_idempotent). In many databases, a transaction
is tied to a client connection (if the client sends several queries, the database knows that they
belong to the same transaction because they are sent on the same TCP connection). If the client
suffers a network interruption and connection timeout after sending the `COMMIT`, but before hearing
back from the database server, it does not know whether the transaction has been committed or
aborted ([Figure 9-1](/en/ch9#fig_distributed_network)).

<a id="fig_future_non_idempotent"></a>

##### Example 13-1. A nonidempotent transfer of money from one account to another

``` sql
BEGIN TRANSACTION;
UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

The client can reconnect to the database and retry the transaction, but now it is outside of the
scope of TCP duplicate suppression. Since the transaction in
[Example 13-1](/en/ch13#fig_future_non_idempotent) is not idempotent, it could happen that \$22 is
transferred instead of the desired \$11. Thus, even though
[Example 13-1](/en/ch13#fig_future_non_idempotent) is a standard example for transaction atomicity,
it is actually not correct, and real banks do not work like this [^3].

Two-phase commit (see ["Two-Phase Commit (2PC)"](/en/ch8#sec_transactions_2pc)) protocols break the
1:1 mapping between a TCP connection and a transaction, since they must allow a transaction
coordinator to reconnect to a database after a network fault, and tell it whether to commit or abort
an in-doubt transaction. Is this sufficient to ensure that the transaction will only be executed
once? Unfortunately not.

Even if we can suppress duplicate transactions between the database client and server, we still need
to worry about the network between the end-user device and the application server. For example, if
the end-user client is a web browser, it probably uses an HTTP POST request to submit an instruction
to the server. Perhaps the user is on a weak cellular data connection, and they succeed in sending
the POST, but the signal becomes too weak before they are able to receive the response from the
server.

In this case, the user will probably be shown an error message, and they may retry manually. Web
browsers warn, "Are you sure you want to submit this form again?"---and the user says yes, because
they wanted the operation to happen. (The Post/Redirect/Get pattern [^43] avoids this
warning message in normal operation, but it doesn't help if the POST request times out.) From the
web server's point of view the retry is a separate request, and from the database's point of view it
is a separate transaction. The usual deduplication mechanisms don't help.

#### Uniquely identifying requests {#id355}

To make the request idempotent through several hops of network communication, it is not sufficient
to rely just on a transaction mechanism provided by a database---you need to consider the
*end-to-end* flow of the request.

For example, you could generate a unique identifier for a request (such as a UUID) and include it as
a hidden form field in the client application, or calculate a hash of all the relevant form fields
to derive the request ID [^3]. If the web browser submits the POST request twice, the two
requests will have the same request ID. You can then pass that request ID all the way through to the
database and check that you only ever execute one request with a given ID, as shown in
[Example 13-2](/en/ch13#fig_future_request_id).

<a id="fig_future_request_id"></a>

##### Example 13-2. Suppressing duplicate requests using a unique ID

``` sql
ALTER TABLE requests ADD UNIQUE (request_id);

BEGIN TRANSACTION;

INSERT INTO requests
  (request_id, from_account, to_account, amount)
  VALUES('0286FDB8-D7E1-423F-B40B-792B3608036C', 4321, 1234, 11.00);

UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;

COMMIT;
```

[Example 13-2](/en/ch13#fig_future_request_id) relies on a uniqueness constraint on the `request_id`
column. If a transaction attempts to insert an ID that already exists, the `INSERT` fails and the
transaction is aborted, preventing it from taking effect twice. Relational databases can generally
maintain a uniqueness constraint correctly, even at weak isolation levels (whereas an
application-level check-then-insert may fail under nonserializable isolation, as discussed in
["Write Skew and Phantoms"](/en/ch8#sec_transactions_write_skew)).

Besides suppressing duplicate requests, the `requests` table in
[Example 13-2](/en/ch13#fig_future_request_id) acts as a kind of event log, which can be useful for
event sourcing or change data capture. The updates to the account balances don't actually have to
happen in the same transaction as the insertion of the event, since they are redundant and could be
derived from the request event in a downstream consumer---as long as the event is processed exactly
once, which can again be enforced using the request ID.

#### The end-to-end argument {#sec_future_e2e_argument}

This scenario of suppressing duplicate transactions is just one example of a more general principle
called the *end-to-end argument*, which was articulated by Saltzer, Reed, and Clark in 1984
[^44]:

> The function in question can completely and correctly be implemented only with the knowledge and
> help of the application standing at the endpoints of the communication system. Therefore,
> providing that questioned function as a feature of the communication system itself is not
> possible. (Sometimes an incomplete version of the function provided by the communication system
> may be useful as a performance enhancement.)

In our example, the *function in question* was duplicate suppression. We saw that TCP suppresses
duplicate packets at the TCP connection level, and some stream processors provide so-called
exactly-once semantics at the message processing level, but that is not enough to prevent a user
from submitting a duplicate request if the first one times out. By themselves, TCP, database
transactions, and stream processors cannot entirely rule out these duplicates. Solving the problem
requires an end-to-end solution: a transaction identifier that is passed all the way from the
end-user client to the database.

The end-to-end argument also applies to checking the integrity of data: checksums built into
Ethernet, TCP, and TLS can detect corruption of packets in the network, but they cannot detect
corruption due to bugs in the software at the sending and receiving ends of the network connection,
or corruption on the disks where the data is stored. If you want to catch all possible sources of
data corruption, you also need end-to-end checksums.

A similar argument applies with encryption [^44]: the password on your home WiFi network
protects against people snooping your WiFi traffic, but not against attackers elsewhere on the
internet; TLS/SSL between your client and the server protects against network attackers, but not
against compromises of the server. Only end-to-end encryption and authentication can protect against
all of these things.

Although the low-level features (TCP duplicate suppression, Ethernet checksums, WiFi encryption)
cannot provide the desired end-to-end features by themselves, they are still useful, since they
reduce the probability of problems at the higher levels. For example, HTTP requests would often get
mangled if we didn't have TCP putting the packets back in the right order. We just need to remember
that the low-level reliability features are not by themselves sufficient to ensure end-to-end
correctness.

#### Applying end-to-end thinking in data systems {#id357}

This brings us back to the original thesis: just because an application uses a data system that
provides comparatively strong safety properties, such as serializable transactions, that does not
mean the application is guaranteed to be free from data loss or corruption. The application itself
needs to take end-to-end measures, such as duplicate suppression, as well.

That is a shame, because fault-tolerance mechanisms are hard to get right. Low-level reliability
mechanisms, such as those in TCP, work quite well, and so the remaining higher-level faults occur
fairly rarely. It would be really nice to wrap up the remaining high-level fault-tolerance machinery
in an abstraction so that application code needn't worry about it---but it seems that we have not
yet found the right abstraction.

Transactions have long been seen as a useful abstraction. As discussed in
[Chapter 8](/en/ch8#ch_transactions), they take a wide range of possible issues (concurrent writes,
constraint violations, crashes, network interruptions, disk failures) and collapse them down to two
possible outcomes: commit or abort. That is a huge simplification of the programming model, but it
is not enough.

Transactions are expensive, especially when they involve heterogeneous storage technologies (see
["Distributed Transactions Across Different Systems"](/en/ch8#sec_transactions_xa)). When we refuse
to use distributed transactions because they are too expensive, we end up having to reimplement
fault-tolerance mechanisms in application code. As numerous examples throughout this book have
shown, reasoning about concurrency and partial failure is difficult and counterintuitive, and so
most application-level mechanisms do not work correctly. The consequence is lost or corrupted data.

For these reasons, it is worth exploring fault-tolerance abstractions that make it easy to provide
application-specific end-to-end correctness properties, but also maintain good performance and good
operational characteristics in a large-scale distributed environment.

### Enforcing Constraints {#sec_future_constraints}

Let's think about correctness in the context of the ideas around unbundling databases. We saw that
end-to-end duplicate suppression can be achieved with a request ID that is passed all the way from
the client to the database that records the write. What about other kinds of constraints?

In particular, let's focus on uniqueness constraints---such as the one we relied on in
[Example 13-2](/en/ch13#fig_future_request_id). In ["Constraints and uniqueness
guarantees"](/en/ch10#sec_consistency_uniqueness) we saw several other examples of application
features that need to enforce uniqueness: a username or email address must uniquely identify a user,
a file storage service cannot have more than one file with the same name, and two people cannot book
the same seat on a flight or in a theater.

Other kinds of constraints are very similar: for example, ensuring that an account balance never
goes negative, that you don't sell more items than you have in stock in the warehouse, or that a
meeting room does not have overlapping bookings. Techniques that enforce uniqueness can often be
used for these kinds of constraints as well.

#### Uniqueness constraints require consensus {#id452}

In [Chapter 10](/en/ch10#ch_consistency) we saw that in a distributed setting, enforcing a
uniqueness constraint requires consensus: if there are several concurrent requests with the same
value, the system somehow needs to decide which one of the conflicting operations is accepted, and
reject the others as violations of the constraint.

The most common way of achieving this consensus is to make a single node the leader, and put it in
charge of making all the decisions. That works fine as long as you don't mind funneling all requests
through a single node (even if the client is on the other side of the world), and as long as that
node doesn't fail. Consensus algorithms like Raft tackle the problem of safely electing a new leader
if the current leader has failed (or is believed to have failed due to a network problem), and
preventing split brain.

Uniqueness checking can be scaled out by sharding based on the value that needs to be unique. For
example, if you need to ensure uniqueness by request ID, as in
[Example 13-2](/en/ch13#fig_future_request_id), you can ensure all requests with the same request ID
are routed to the same shard. If you need usernames to be unique, you can shard by hash of username.

However, asynchronous multi-leader replication is ruled out, because it could happen that different
leaders concurrently accept conflicting writes, and thus the values are no longer unique. If you
want to be able to immediately reject any writes that would violate the constraint, synchronous
coordination is unavoidable [^45].

#### Uniqueness in log-based messaging {#sec_future_uniqueness_log}

A shared log ensures that all consumers see messages in the same order---a guarantee that is
formally known as *total order broadcast* and is equivalent to consensus (see ["The Many Faces of
Consensus"](/en/ch10#sec_consistency_faces)). In the unbundled database approach with log-based
messaging, we can use a very similar approach to enforce uniqueness constraints.

A stream processor consumes all the messages in a log shard sequentially on a single thread. Thus,
if the log is sharded based on the value that needs to be unique, a stream processor can
unambiguously and deterministically decide which one of several conflicting operations came first in
the log. For example, in the case of several users trying to claim the same username
[^46]:

1.  Every request for a username is encoded as a message, and appended to a shard determined by the
    hash of the username.

2.  A stream processor sequentially reads the requests in the log, using a local database to keep
    track of which usernames are taken. For every request for a username that is available, it
    records the name as taken and emits a success message to an output stream. For every request for
    a username that is already taken, it emits a rejection message to an output stream.

3.  The client that requested the username watches the output stream and waits for a success or
    rejection message corresponding to its request.

This algorithm is the same as the construction for achieving consensus using a shared log, which we
saw in [Chapter 10](/en/ch10#ch_consistency). It scales easily to a large request throughput by
increasing the number of shards, as each shard can be processed independently.

The approach works not only for uniqueness constraints, but also for many other kinds of
constraints. Its fundamental principle is that any writes that may conflict are routed to the same
shard and processed sequentially. The definition of a conflict may depend on the application, but
the stream processor can use arbitrary logic to validate a request.

#### Multi-shard request processing {#id360}

Ensuring that an operation is executed atomically, while satisfying constraints, becomes more
interesting when several shards are involved. In [Example 13-2](/en/ch13#fig_future_request_id),
there are potentially three shards: the one containing the request ID, the one containing the payee
account, and the one containing the payer account. There is no reason why those three things should
be in the same shard, since they are all independent from each other.

In the traditional approach to databases, executing this transaction would require an atomic commit
across all three shards, which essentially forces it into a total order with respect to all other
transactions on any of those shards. Since there is now cross-shard coordination, different shards
can no longer be processed independently, so throughput is likely to suffer.

However, equivalent correctness can be achieved without cross-shard transactions using sharded logs
and stream processors. [Figure 13-2](/en/ch13#fig_future_multi_shard) shows an example of a payment
transaction that needs to check whether there is sufficient money in the source account, and if so,
atomically transfers some amount to a destination account while deducting fees. It works as follows
[^47]:

{{< figure src="/fig/ddia_1302.png" id="fig_future_multi_shard" caption="Figure 13-2. Checking whether a source account has enough money, and atomically transferring money to a destination account and a fees account, using event logs and stream processors." class="w-full my-4" >}}

1.  The request to transfer money from the source account to the destination account is given a
    unique request ID by the user's client, and appended to a log shard based on the source account
    ID.

2.  A stream processor reads the log of requests and maintains a database containing the state of
    the source account and the IDs of requests it has already processed. The contents of this
    database are entirely derived from the log. When the stream processor encounters a request with
    an ID that it has not seen before, it checks in its local database whether the source account
    has enough money to perform the transfer.

    If yes, it updates its local database to reserve the payment amount on the source account, and
    emits events to several other logs: an outgoing payment event to the log shard for the source
    account (its own input log), an incoming payment event to the log shard for the destination
    account, and an incoming payment event to the log shard for the fees account. The original
    request ID is included in those emitted events.

3.  Eventually the outgoing payment event is delivered back to the source account processor
    (possibly after having received unrelated events in the meantime). The stream processor
    recognizes based on the request ID that this is a payment it previously reserved, and it now
    executes the payment, again updating its local state of the source account. It ignores
    duplicates based on request ID.

4.  The log shards for the destination and fees accounts are consumed by independent stream
    processing tasks. When they receive an incoming payment event, they update their local state to
    reflect the payment, and they deduplicate events based on request ID.

[Figure 13-2](/en/ch13#fig_future_multi_shard) shows the three accounts as being in three separate
shards, but they could just as well be in the same shard---it doesn't matter. All we need is that
the events for any given account are processed strictly in log order with at-least-once semantics,
and that the stream processors are deterministic.

For example, consider what happens if the source account processor crashes while processing a
payment request. The output messages may or may not have been emitted before the crash occurred.
When it recovers from the crash, it will process the same request again (due to at-least-once
semantics), and it will make the same decision on whether to allow the payment (since it's
deterministic). It will therefore emit the same output messages with the same request ID to the
outgoing, incoming, and fees account shards. If they are duplicates, the downstream consumers will
ignore them based on the request ID.

Atomicity in this system comes not from any transactions, but from the fact that writing the initial
request event to the source account log is an atomic action. Once that one event is in the log, all the
downstream events will eventually be written as well---possibly after stream processors have
recovered from crashes, and possibly with duplicates, but they will appear eventually.

With exactly-once semantics this example becomes easier to implement, since it ensures that the
stream processor's local state is consistent with the set of messages it has processed. Thus, if it
crashes and re-processes some messages, its local state is also reset to what it was before those
messages were processed.

If the user in [Figure 13-2](/en/ch13#fig_future_multi_shard) wants to find out whether their
transfer was approved or not, they can subscribe to the source account log shard and wait for the
outgoing payment event. In order to explicitly notify the user if the balance is insufficient, the
stream processor can emit a "declined payment" event to that log shard.

By breaking down the multi-shard transaction into several differently sharded stages and using the
end-to-end request ID, we have achieved the same correctness property (every request is applied
exactly once to both the payer and payee accounts), even in the presence of faults, and without
using an atomic commit protocol.

### Timeliness and Integrity {#sec_future_integrity}

A convenient property of many transactional systems is that as soon as one transaction commits, its
writes are immediately visible to other transactions. This property is formalized as *strict
serializability* (see ["Linearizability Versus
Serializability"](/en/ch10#sidebar_consistency_serializability)).

This is not the case when unbundling an operation across multiple stages of stream processors:
consumers of a log are asynchronous by design, so a sender does not wait until its message has been
processed by consumers. However, it is possible for a client to wait for a message to appear on an
output stream, like the user waiting for an outgoing payment or payment declined event in
[Figure 13-2](/en/ch13#fig_future_multi_shard), which depends on whether there was enough money in
the source account.

In this example, the correctness of the source account balance check does not depend on whether the
user making the request waits for the outcome. The waiting only has the purpose of synchronously
informing the user whether or not the payment succeeded, but this notification is decoupled from the
effects of processing the request.

More generally, the term *consistency* conflates two different requirements that are worth
considering separately:

Timeliness

:   Timeliness means ensuring that users observe the system in an up-to-date state. We saw
    previously that if a user reads from a stale copy of the data, they may observe it in an
    inconsistent state (see ["Problems with Replication Lag"](/en/ch6#sec_replication_lag)).
    However, that inconsistency is temporary, and will eventually be resolved simply by waiting and
    trying again.

    The CAP theorem uses consistency in the sense of linearizability, which is a strong way of
    achieving timeliness. Weaker timeliness properties like *read-after-write* consistency can also
    be useful.

Integrity

:   Integrity means absence of corruption; i.e., no data loss, and no contradictory or false data.
    In particular, if some derived dataset is maintained as a view onto some underlying data, the
    derivation must be correct. For example, a database index must correctly reflect the contents of
    the database---an index in which some records are missing is not very useful.

    If integrity is violated, the inconsistency is permanent: waiting and trying again is not going
    to fix database corruption in most cases. Instead, explicit checking and repair is needed. In
    the context of ACID transactions, "consistency" is usually understood as some kind of
    application-specific notion of integrity. Atomicity and durability are important tools for
    preserving integrity.

In slogan form: violations of timeliness are "eventual consistency," whereas violations of integrity
are "perpetual inconsistency."

In most applications, integrity is much more important than timeliness. Violations of timeliness can
be annoying and confusing, but violations of integrity can be catastrophic.

For example, on your credit card statement, it is not surprising if a transaction that you made
within the last 24 hours does not yet appear---it is normal that these systems have a certain lag.
We know that banks reconcile and settle transactions asynchronously, and timeliness is not very
important here [^3]. However, it would be very bad if the statement balance was not equal
to the sum of the transactions plus the previous statement balance (an error in the sums), or if a
transaction was charged to you but not paid to the merchant (disappearing money). Such problems
would be violations of the integrity of the system.

#### Correctness of dataflow systems {#id453}

ACID transactions usually provide both timeliness (e.g., linearizability) and integrity (e.g.,
atomic commit) guarantees. Thus, if you approach application correctness from the point of view of
ACID transactions, the distinction between timeliness and integrity is fairly inconsequential.

On the other hand, an interesting property of the event-based dataflow systems that we have
discussed in this chapter is that they decouple timeliness and integrity. When processing event
streams asynchronously, there is no guarantee of timeliness, unless you explicitly build consumers
that wait for a message to arrive before returning. For example, a user could request a payment and
then read the state of their account before the stream processor has executed the request; the user
will not see the payment they just requested.

However, integrity is in fact central to streaming systems. *Exactly-once* or *effectively-once*
semantics is a mechanism for preserving integrity. If an event is lost, or if an event takes effect
twice, the integrity of a data system could be violated. Thus, fault-tolerant message delivery and
duplicate suppression (e.g., idempotent operations) are important for maintaining the integrity of a
data system in the face of faults.

As we saw in the last section, reliable stream processing systems can preserve integrity without
requiring distributed transactions and an atomic commit protocol, which means they can potentially
achieve comparable correctness with much better performance and operational robustness. We achieved
this integrity through a combination of mechanisms:

- Representing the content of the write operation as a single message, which can easily be written
  atomically---an approach that fits very well with event sourcing

- Deriving all other state updates from that single message using deterministic derivation
  functions, similarly to stored procedures

- Passing a client-generated request ID through all these levels of processing, enabling end-to-end
  duplicate suppression and idempotence

- Making messages immutable and allowing derived data to be reprocessed from time to time, which
  makes it easier to recover from bugs

#### Loosely interpreted constraints {#id362}

As discussed previously, enforcing a uniqueness constraint requires consensus, typically implemented
by funneling all events in a particular shard through a single node. This limitation is unavoidable
if we want the traditional form of uniqueness constraint, and stream processing cannot get around
it.

However, another thing to realize is that in many real applications there is actually a business
requirement to allow violations of what you might think of as hard constraints:

- If customers order more items than you have in your warehouse, you can order in more stock,
  apologize to customers for the delay, and offer them a discount. This is actually the same as what
  you'd have to do if, say, a forklift truck ran over some of the items in your warehouse, leaving
  you with fewer items in stock than you thought you had [^3]. Thus, the apology workflow
  already needs to be part of your business processes anyway in order to deal with forklift
  incidents, and a hard constraint on the number of items in stock might be unnecessary.

- Similarly, many airlines overbook airplanes in the expectation that some passengers will miss
  their flight, and many hotels overbook rooms, expecting that some guests will cancel. In these
  cases, the constraint of "one person per seat" is deliberately violated for business reasons, and
  compensation processes (refunds, upgrades, providing a complimentary room at a neighboring hotel)
  are put in place to handle situations in which demand exceeds supply. Even if there was no
  overbooking, apology and compensation processes would be needed in order to deal with flights
  being cancelled due to bad weather or staff on strike---recovering from such issues is just a
  normal part of business [^3].

- If someone withdraws more money than they have in their account, the bank can charge them an
  overdraft fee and ask them to pay back what they owe. By limiting the total withdrawals per day,
  the risk to the bank is bounded.

- In systems that integrate data between different organizations, inconsistencies will inevitably
  arise, and correction mechanisms are necessary to handle them. As noted in ["Batch Use
  Cases"](/en/ch11#sec_batch_output), settlement of payments between banks is an example of this.

In many business contexts, it is therefore acceptable to temporarily violate a constraint and fix it
up later by apologizing. This kind of change to correct a mistake is called a *compensating
transaction* [^48], [^49]. The cost of the apology (in terms of money or reputation)
varies, but it is often quite low: you can't unsend an email, but you can send a follow-up email
with a correction. If you accidentally charge a credit card twice, you can refund one of the
charges, and the cost to you is just the processing fees and perhaps a customer complaint. Once
money has been paid out of an ATM, you can't directly get it back, although in principle you can
send debt collectors to recover the money if the account was overdrawn and the customer won't pay it
back.

Whether the cost of the apology is acceptable is a business decision. If it is acceptable, the
traditional model of checking all constraints before even writing the data is unnecessarily
restrictive. It may well be reasonable to go ahead with a write optimistically, and to check the
constraint after the fact. You can still ensure that the validation occurs before doing things that
would be expensive to recover from, but that doesn't imply you must do the validation before you
even write the data.

These applications *do* require integrity: you would not want to lose a reservation, or have money
disappear due to mismatched credits and debits. But they *don't* require timeliness on the
enforcement of the constraint: if you have sold more items than you have in the warehouse, you can
patch up the problem after the fact by apologizing. Doing so is similar to the conflict resolution
approaches we discussed in ["Dealing with Conflicting
Writes"](/en/ch6#sec_replication_write_conflicts).

#### Coordination-avoiding data systems {#id454}

We have now made two interesting observations:

1.  Dataflow systems can maintain integrity guarantees on derived data without atomic commit,
    linearizability, or synchronous cross-shard coordination.

2.  Although strict uniqueness constraints require timeliness and coordination, many applications
    are actually fine with loose constraints that may be temporarily violated and fixed up later, as
    long as integrity is preserved throughout.

Taken together, these observations mean that dataflow systems can provide the data management
services for many applications without requiring coordination, while still giving strong integrity
guarantees. Such *coordination-avoiding* data systems have a lot of appeal: they can achieve better
performance and fault tolerance than systems that need to perform synchronous coordination
[^45].

For example, such a system could operate distributed across multiple datacenters in a multi-leader
configuration, asynchronously replicating between regions. Any one datacenter can continue operating
independently from the others, because no synchronous cross-region coordination is required. Such a
system would have weak timeliness guarantees---it could not be linearizable without introducing
coordination---but it can still have strong integrity guarantees.

In this context, serializable transactions are still useful as part of maintaining derived state,
but they can be run at a small scope where they work well [^6]. Heterogeneous distributed
transactions such as XA transactions are not required. Synchronous coordination can still be
introduced in places where it is needed (for example, to enforce strict constraints before an
operation from which recovery is not possible), but there is no need for everything to pay the cost
of coordination if only a small part of an application needs it [^32].

Another way of looking at coordination and constraints: they reduce the number of apologies you have
to make due to inconsistencies, but potentially also reduce the performance and availability of your
system, and thus potentially increase the number of apologies you have to make due to outages. You
cannot reduce the number of apologies to zero, but you can aim to find the best trade-off for your
needs---the sweet spot where there are neither too many inconsistencies nor too many availability
problems.

### Trust, but Verify {#sec_future_verification}

All of our discussion of correctness, integrity, and fault-tolerance has been under the assumption
that certain things might go wrong, but other things won't. We call these assumptions our *system
model* (see ["System Model and Reality"](/en/ch9#sec_distributed_system_model)): for example, we
should assume that processes can crash, machines can suddenly lose power, and the network can
arbitrarily delay or drop messages. But we might also assume that data written to disk is not lost
after `fsync`, that data in memory is not corrupted, and that the multiplication instruction of our
CPU always returns the correct result.

These assumptions are quite reasonable, as they are true most of the time, and it would be difficult
to get anything done if we had to constantly worry about our computers making mistakes.
Traditionally, system models take a binary approach toward faults: we assume that some things can
happen, and other things can never happen. In reality, it is more a question of probabilities: some
things are more likely, other things less likely. The question is whether violations of our
assumptions happen often enough that we may encounter them in practice.

We have seen that data can become corrupted in memory (see ["Hardware and Software
Faults"](/en/ch2#sec_introduction_hardware_faults)), on disk (see ["Replication and
Durability"](/en/ch8#sidebar_transactions_durability)), and on the network (see ["Weak forms of
lying"](/en/ch9#sec_distributed_weak_lying)). Maybe this is something we should be paying more
attention to? If you are operating at large enough scale, even very unlikely things do happen.

#### Maintaining integrity in the face of software bugs {#id455}

Besides such hardware issues, there is always the risk of software bugs, which would not be caught
by lower-level network, memory, or filesystem checksums. Even widely used database software has
bugs: for example, past versions of MySQL have failed to correctly maintain uniqueness constraints
[^50] and PostgreSQL's serializable isolation level has exhibited write skew anomalies in
the past [^51], even though MySQL and PostgreSQL are robust and well-regarded databases
that have been battle-tested by many people for many years. In less mature software, the situation
is likely to be much worse.

Despite considerable efforts in careful design, testing, and review, bugs still creep in. Although
they are rare, and they eventually get found and fixed, there is still a period during which such
bugs can corrupt data.

When it comes to application code, we have to assume many more bugs, since most applications don't
receive anywhere near the amount of review and testing that database code does. Many applications
don't even correctly use the features that databases offer for preserving integrity, such as foreign
key or uniqueness constraints [^25].

Consistency in the sense of ACID is based on the idea that the database starts off in a consistent
state, and a transaction transforms it from one consistent state to another consistent state. Thus,
we expect the database to always be in a consistent state. However, this notion only makes sense if
you assume that the transaction is free from bugs. If the application uses the database incorrectly
in some way, for example using a weak isolation level unsafely, the integrity of the database cannot
be guaranteed.

#### Don't just blindly trust what they promise {#id364}

With both hardware and software not always living up to the ideal that we would like them to be, it
seems that data corruption is inevitable sooner or later. Thus, we should at least have a way of
finding out if data has been corrupted so that we can fix it and try to track down the source of the
error. Checking the integrity of data is known as *auditing*.

As discussed in ["Advantages of immutable events"](/en/ch12#sec_stream_immutability_pros), auditing
is not just for financial applications. However, auditability is very important in finance precisely
because everyone knows that mistakes happen, and we all recognize the need to be able to detect and
fix problems.

Mature systems similarly tend to consider the possibility of unlikely things going wrong, and manage
that risk. For example, large-scale storage systems such as HDFS and Amazon S3 do not fully trust
disks: they run background processes that continually read back files, compare them to other
replicas, and move files from one disk to another, in order to mitigate the risk of silent
corruption [^52], [^53].

If you want to be sure that your data is still there, you have to actually read it and check. Most
of the time it will still be there, but if it isn't, you really want to find out sooner rather than
later. By the same argument, it is important to try restoring from your backups from time to
time---otherwise you may only find out that your backup is broken when it is too late and you have
already lost data. Don't just blindly trust that it is all working.

Systems like HDFS and S3 still have to assume that disks work correctly most of the time---which is
a reasonable assumption, but not the same as assuming that they *always* work correctly. However,
not many systems currently have this kind of "trust, but verify" approach of continually auditing
themselves. Many assume that correctness guarantees are absolute and make no provision for the
possibility of rare data corruption. In the future we may see more *self-validating* or
*self-auditing* systems that continually check their own integrity, rather than relying on blind
trust [^54].

#### Designing for auditability {#id365}

If a transaction mutates several objects in a database, it is difficult to tell after the fact what
that transaction means. Even if you capture the transaction logs, the insertions, updates, and
deletions in various tables do not necessarily give a clear picture of *why* those mutations were
performed. The invocation of the application logic that decided on those mutations is transient and
cannot be reproduced.

By contrast, event-based systems can provide better auditability. In the event sourcing approach,
user input to the system is represented as a single immutable event, and any resulting state updates
are derived from that event. The derivation can be made deterministic and repeatable, so that
running the same log of events through the same version of the derivation code will result in the
same state updates.

Being explicit about dataflow makes the *provenance* of data much clearer, which makes integrity
checking much more feasible. For the event log, we can use hashes to check that the event storage
has not been corrupted. For any derived state, we can rerun the batch and stream processors that
derived it from the event log in order to check whether we get the same result, or even run a
redundant derivation in parallel.

A deterministic and well-defined dataflow also makes it easier to debug and trace the execution of a
system in order to determine why it did something [^4], [^55]. If something
unexpected occurred, it is valuable to have the diagnostic capability to reproduce the exact
circumstances that led to the unexpected event---a kind of time-travel debugging capability.

#### The end-to-end argument again {#id456}

If we cannot fully trust that every individual component of the system will be free from
corruption---that every piece of hardware is fault-free and that every piece of software is
bug-free---then we must at least periodically check the integrity of our data. If we don't check, we
won't find out about corruption until it is too late and it has caused some downstream damage, at
which point it will be much harder and more expensive to track down the problem.

Checking the integrity of data systems is best done in an end-to-end fashion: the more systems we
can include in an integrity check, the fewer opportunities there are for corruption to go unnoticed
at some stage of the process. If we can check that an entire derived data pipeline is correct end to
end, then any disks, networks, services, and algorithms along the path are implicitly included in
the check.

Having continuous end-to-end integrity checks gives you increased confidence about the correctness
of your systems, which in turn allows you to move faster [^56]. Like automated testing,
auditing increases the chances that bugs will be found quickly, and thus reduces the risk that a
change to the system or a new storage technology will cause damage. If you are not afraid of making
changes, you can much better evolve an application to meet changing requirements.

#### Tools for auditable data systems {#id366}

At present, not many data systems make auditability a top-level concern. Some applications implement
their own audit mechanisms, for example by logging all changes to a separate audit table, but
guaranteeing the integrity of the audit log and the database state is still difficult. A transaction
log can be made tamper-proof by periodically signing it with a hardware security module, but that
does not guarantee that the right transactions went into the log in the first place.

Blockchains such as Bitcoin or Ethereum are shared append-only logs with cryptographic consistency
checks; the transactions they store are events, and smart contracts are basically stream processors.
The consensus protocols they use ensure that all nodes agree on the same sequence of events. The
difference to the consensus protocols of [Chapter 10](/en/ch10#ch_consistency) is that blockchains
are Byzantine fault tolerant, i.e. they still work if some of the participating nodes have corrupted
data because the replicas continually check each other's integrity.

For most applications, blockchains have too high overhead to be useful. However, some of their
cryptographic tools can also be used in a lighter-weight context. For example, *Merkle trees*
[^57] are trees of hashes that can be used to efficiently prove that a record appears in
some dataset (and a few other things). *Certificate transparency* uses cryptographically verified
append-only logs and Merkle trees to check the validity of TLS/SSL certificates [^58],
[^59]; it avoids needing a consensus protocol by having a single leader per log.

Integrity-checking and auditing algorithms, like those of certificate transparency and distributed
ledgers, might become more widely used in data systems in general in the future. Some work will be
needed to make them as scalable as systems without cryptographic auditing, and to keep the
performance penalty as low as possible, but they are nevertheless interesting.

## Summary {#id367}

In this chapter we discussed new approaches to designing data systems based on ideas from stream
processing. We started with the observation that there is no one single tool that can efficiently
serve all possible use cases, and so applications necessarily need to compose several different
pieces of software to accomplish their goals. We discussed how to solve this *data integration*
problem by using batch processing and event streams to let data changes flow between different
systems.

In this approach, certain systems are designated as systems of record, and other data is derived
from them through transformations. In this way we can maintain indexes, materialized views, machine
learning models, statistical summaries, and more. By making these derivations and transformations
asynchronous and loosely coupled, a problem in one area is prevented from spreading to unrelated
parts of the system, increasing the robustness and fault-tolerance of the system as a whole.

Expressing dataflows as transformations from one dataset to another also helps evolve applications:
if you want to change one of the processing steps, for example to change the structure of an index
or cache, you can just rerun the new transformation code on the whole input dataset in order to
rederive the output. Similarly, if something goes wrong, you can fix the code and reprocess the data
in order to recover.

These processes are quite similar to what databases already do internally, so we recast the idea of
dataflow applications as *unbundling* the components of a database, and building an application by
composing these loosely coupled components.

Derived state can be updated by observing changes in the underlying data. Moreover, the derived
state itself can further be observed by downstream consumers. We can even take this dataflow all the
way through to the end-user device that is displaying the data, and thus build user interfaces that
dynamically update to reflect data changes and continue to work offline.

Next, we discussed how to ensure that all of this processing remains correct in the presence of
faults. We saw that strong integrity guarantees can be implemented scalably with asynchronous event
processing, by using end-to-end request identifiers to make operations idempotent and by checking
constraints asynchronously. Clients can either wait until the check has passed, or go ahead without
waiting but risk having to apologize about a constraint violation. This approach is much more
scalable and robust than the traditional approach of using distributed transactions, and fits with
how many business processes work in practice.

By structuring applications around dataflow and checking constraints asynchronously, we can avoid
most coordination and create systems that maintain integrity but still perform well, even in
geographically distributed scenarios and in the presence of faults. We then talked a little about
using audits to verify the integrity of data and detect corruption, and observed that the techniques
used by blockchains also have a similarity to event-based systems.

##### Footnotes

### References {#references}

[^1]: Rachid Belaid. [Postgres Full-Text Search is Good Enough!](https://rachbelaid.com/postgres-full-text-search-is-good-enough/) *rachbelaid.com*, July 2015. Archived at [perma.cc/ZVP9-YDCB](https://perma.cc/ZVP9-YDCB)
[^2]: Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, Wyatt Lloyd, and Kaushik Veeraraghavan. [Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf). At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^3]: Pat Helland and Dave Campbell. [Building on Quicksand](https://arxiv.org/pdf/0909.1788). At *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^4]: Jessica Kerr. [Provenance and Causality in Distributed Systems](https://jessitron.com/2016/09/25/provenance-and-causality-in-distributed-systems/). *jessitron.com*, September 2016. Archived at [perma.cc/DTD2-F8ZM](https://perma.cc/DTD2-F8ZM)
[^5]: Jay Kreps. [The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying). *engineering.linkedin.com*, December 2013. Archived at [perma.cc/2JHR-FR64](https://perma.cc/2JHR-FR64)
[^6]: Pat Helland. [Life Beyond Distributed Transactions: An Apostate's Opinion](https://www.cidrdb.org/cidr2007/papers/cidr07p15.pdf). At *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
[^7]: Lionel A. Smith. [The Broad Gauge Story](https://lionels.orpheusweb.co.uk/RailSteam/GWRBroadG/BGHist.html). *Journal of the Monmouthshire Railway Society*, Summer 1985. Archived at [perma.cc/DDK9-JA6X](https://perma.cc/DDK9-JA6X)
[^8]: Jacqueline Xu. [Online Migrations at Scale](https://stripe.com/blog/online-migrations). *stripe.com*, February 2017. Archived at [perma.cc/ZQY2-EAU2](https://perma.cc/ZQY2-EAU2)
[^9]: Flavio Santos and Robert Stephenson. [Changing the Wheels on a Moving Bus --- Spotify's Event Delivery Migration](https://engineering.atspotify.com/2021/10/changing-the-wheels-on-a-moving-bus-spotify-event-delivery-migration). *engineering.atspotify.com*, October 2021. Archived at [perma.cc/5C4V-G8EV](https://perma.cc/5C4V-G8EV)
[^10]: Molly Bartlett Dishman and Martin Fowler. [Agile Architecture](https://www.youtube.com/watch?v=VjKYO6DP3fo&list=PL055Epbe6d5aFJdvWNtTeg_UEHZEHdInE). At *O'Reilly Software Architecture Conference*, March 2015.
[^11]: Nathan Marz and James Warren. [*Big Data: Principles and Best Practices of Scalable Real-Time Data Systems*](https://www.manning.com/books/big-data). Manning, 2015. ISBN: 978-1-617-29034-3
[^12]: Jay Kreps. [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture). *oreilly.com*, July 2014. Archived at [perma.cc/PGH6-XUCH](https://perma.cc/PGH6-XUCH)
[^13]: Raul Castro Fernandez, Peter Pietzuch, Jay Kreps, Neha Narkhede, Jun Rao, Joel Koshy, Dong Lin, Chris Riccomini, and Guozhang Wang. [Liquid: Unifying Nearline and Offline Big Data Integration](https://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper25u.pdf). At *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^14]: Dennis M. Ritchie and Ken Thompson. [The UNIX Time-Sharing System](https://web.eecs.utk.edu/~qcao1/cs560/papers/paper-unix.pdf). *Communications of the ACM*, volume 17, issue 7, pages 365--375, July 1974. [doi:10.1145/361011.361061](https://doi.org/10.1145/361011.361061)
[^15]: Wes McKinney. [The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future](https://wesmckinney.com/blog/looking-back-15-years/). *wesmckinney.com*, September 2023. Archived at [perma.cc/J9SJ-886N](https://perma.cc/J9SJ-886N)
[^16]: Eric A. Brewer and Joseph M. Hellerstein. [CS262a: Advanced Topics in Computer Systems](https://people.eecs.berkeley.edu/~brewer/cs262/systemr.html). Lecture notes, University of California, Berkeley, *cs.berkeley.edu*, August 2011. Archived at [perma.cc/TE79-LGWU](https://perma.cc/TE79-LGWU)
[^17]: Michael Stonebraker. [The Case for Polystores](https://wp.sigmod.org/?p=1629). *wp.sigmod.org*, July 2015. Archived at [perma.cc/G7J2-KR45](https://perma.cc/G7J2-KR45)
[^18]: Jennie Duggan, Aaron J. Elmore, Michael Stonebraker, Magda Balazinska, Bill Howe, Jeremy Kepner, Sam Madden, David Maier, Tim Mattson, and Stan Zdonik. [The BigDAWG Polystore System](https://sigmod.org/publications/sigmodRecord/1506/pdfs/04_vision_Duggan.pdf). *ACM SIGMOD Record*, volume 44, issue 2, pages 11--16, June 2015. [doi:10.1145/2814710.2814713](https://doi.org/10.1145/2814710.2814713)
[^19]: David B. Lomet, Alan Fekete, Gerhard Weikum, and Mike Zwilling. [Unbundling Transaction Services in the Cloud](https://arxiv.org/pdf/0909.1768). At *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^20]: Martin Kleppmann and Jay Kreps. [Kafka, Samza and the Unix Philosophy of Distributed Data](https://martin.kleppmann.com/papers/kafka-debull15.pdf). *IEEE Data Engineering Bulletin*, volume 38, issue 4, pages 4--14, December 2015.
[^21]: John Hugg. [Winning Now and in the Future: Where Volt Active Data Shines](https://www.voltactivedata.com/blog/2016/03/winning-now-future-voltdb-shines/). *voltactivedata.com*, March 2016. Archived at [perma.cc/44MP-3MWM](https://perma.cc/44MP-3MWM)
[^22]: Felienne Hermans. [Spreadsheets Are Code](https://vimeo.com/145492419). At *Code Mesh*, November 2015.
[^23]: Dan Bricklin and Bob Frankston. [VisiCalc: Information from Its Creators](http://danbricklin.com/visicalc.htm). *danbricklin.com*. Archived at [archive.org](https://web.archive.org/web/20250905040530/http://danbricklin.com/visicalc.htm)
[^24]: D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, and Michael Young. [Machine Learning: The High-Interest Credit Card of Technical Debt](https://research.google.com/pubs/archive/43146.pdf). At *NIPS Workshop on Software Engineering for Machine Learning* (SE4ML), December 2014. Archived at [perma.cc/M3MD-U7WL](https://perma.cc/M3MD-U7WL)
[^25]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](https://doi.org/10.1145/2723372.2737784)
[^26]: Guy Steele. [Re: Need for Macros (Was Re: Icon)](https://people.csail.mit.edu/gregs/ll1-discuss-archive-html/msg01134.html). email to *ll1-discuss* mailing list, *people.csail.mit.edu*, December 2001. Archived at [perma.cc/K9X8-CJ65](https://perma.cc/K9X8-CJ65)
[^27]: Ben Stopford. [Microservices in a Streaming World](https://www.infoq.com/presentations/microservices-streaming). At *QCon London*, March 2016.
[^28]: Adam Bellemare. [*Building Event-Driven Microservices, 2nd Edition*](https://learning.oreilly.com/library/view/building-event-driven-microservices/9798341622180/). O'Reilly Media, 2025.
[^29]: Christian Posta. [Why Microservices Should Be Event Driven: Autonomy vs Authority](https://blog.christianposta.com/microservices/why-microservices-should-be-event-driven-autonomy-vs-authority/). *blog.christianposta.com*, May 2016. Archived at [perma.cc/E6N9-3X92](https://perma.cc/E6N9-3X92)
[^30]: Alex Feyerke. [Designing Offline-First Web Apps](https://alistapart.com/article/offline-first/). *alistapart.com*, December 2013. Archived at [perma.cc/WH7R-S2DS](https://perma.cc/WH7R-S2DS)
[^31]: Martin Kleppmann. [Turning the Database Inside-out with Apache Samza](https://martin.kleppmann.com/2015/03/04/turning-the-database-inside-out.html). At *Strange Loop*, September 2014. Archived at [perma.cc/U6E8-A9MT](https://perma.cc/U6E8-A9MT)
[^32]: Sebastian Burckhardt, Daan Leijen, Jonathan Protzenko, and Manuel Fähndrich. [Global Sequence Protocol: A Robust Abstraction for Replicated Shared State](https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.ECOOP.2015.568). At *29th European Conference on Object-Oriented Programming* (ECOOP), July 2015. [doi:10.4230/LIPIcs.ECOOP.2015.568](https://doi.org/10.4230/LIPIcs.ECOOP.2015.568)
[^33]: Evan Czaplicki and Stephen Chong. [Asynchronous Functional Reactive Programming for GUIs](https://people.seas.harvard.edu/~chong/pubs/pldi13-elm.pdf). At *34th ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2013. [doi:10.1145/2491956.2462161](https://doi.org/10.1145/2491956.2462161)
[^34]: Eno Thereska, Damian Guy, Michael Noll, and Neha Narkhede. [Unifying Stream Processing and Interactive Queries in Apache Kafka](https://www.confluent.io/blog/unifying-stream-processing-and-interactive-queries-in-apache-kafka/). *confluent.io*, October 2016. Archived at [perma.cc/W8JG-EAZF](https://perma.cc/W8JG-EAZF)
[^35]: Frank McSherry. [Dataflow as Database](https://github.com/frankmcsherry/blog/blob/master/posts/2016-07-17.md). *github.com*, July 2016. Archived at [perma.cc/384D-DUFH](https://perma.cc/384D-DUFH)
[^36]: Peter Alvaro. [I See What You Mean](https://www.youtube.com/watch?v=R2Aa4PivG0g). At *Strange Loop*, September 2015.
[^37]: Nathan Marz. [Trident: A High-Level Abstraction for Realtime Computation](https://blog.x.com/engineering/en_us/a/2012/trident-a-high-level-abstraction-for-realtime-computation). *blog.x.com*, August 2012. Archived at [archive.org](https://web.archive.org/web/20250515030808/https://blog.x.com/engineering/en_us/a/2012/trident-a-high-level-abstraction-for-realtime-computation)
[^38]: Edi Bice. [Low Latency Web Scale Fraud Prevention with Apache Samza, Kafka and Friends](https://www.slideshare.net/slideshow/extremely-low-latency-web-scale-fraud-prevention-with-apache-samza-kafka-and-friends/57068078). At *Merchant Risk Council MRC Vegas Conference*, March 2016. Archived at [perma.cc/T3H5-QN3R](https://perma.cc/T3H5-QN3R)
[^39]: Charity Majors. [The Accidental DBA](https://charity.wtf/2016/10/02/the-accidental-dba/). *charity.wtf*, October 2016. Archived at [perma.cc/6ANP-ARB6](https://perma.cc/6ANP-ARB6)
[^40]: Arthur J. Bernstein, Philip M. Lewis, and Shiyong Lu. [Semantic Conditions for Correctness at Different Isolation Levels](https://dsf.berkeley.edu/cs286/papers/isolation-icde2000.pdf). At *16th International Conference on Data Engineering* (ICDE), February 2000. [doi:10.1109/ICDE.2000.839387](https://doi.org/10.1109/ICDE.2000.839387)
[^41]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan. [Automating the Detection of Snapshot Isolation Anomalies](https://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^42]: Kyle Kingsbury. [Jepsen: Distributed Systems Safety Research](https://jepsen.io/). *jepsen.io*.
[^43]: Michael Jouravlev. [Redirect After Post](https://www.theserverside.com/news/1365146/Redirect-After-Post). *theserverside.com*, August 2004. Archived at [archive.org](https://web.archive.org/web/20250904205736/https://www.theserverside.com/news/1365146/Redirect-After-Post)
[^44]: Jerome H. Saltzer, David P. Reed, and David D. Clark. [End-to-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf). *ACM Transactions on Computer Systems*, volume 2, issue 4, pages 277--288, November 1984. [doi:10.1145/357401.357402](https://doi.org/10.1145/357401.357402)
[^45]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185--196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509)
[^46]: Alex Yarmula. [Strong Consistency in Manhattan](https://blog.x.com/engineering/en_us/a/2016/strong-consistency-in-manhattan). *blog.x.com*, March 2016. Archived at [archive.org](https://web.archive.org/web/20250713175819/https://blog.x.com/engineering/en_us/a/2016/strong-consistency-in-manhattan)
[^47]: Martin Kleppmann, Alastair R. Beresford, and Boerge Svingen. [Online Event Processing: Achieving consistency where distributed transactions have failed](https://martin.kleppmann.com/papers/olep-cacm.pdf). *Communications of the ACM*, volume 62, issue 5, pages 43--49, May 2019. [doi:10.1145/3312527](https://doi.org/10.1145/3312527)
[^48]: Jim Gray. [The Transaction Concept: Virtues and Limitations](https://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf). At *7th International Conference on Very Large Data Bases* (VLDB), September 1981. Archived at [perma.cc/8VPT-N5H6](https://perma.cc/8VPT-N5H6)
[^49]: Hector Garcia-Molina and Kenneth Salem. [Sagas](https://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](https://doi.org/10.1145/38713.38742)
[^50]: Annamalai Gurusami and Daniel Price. [Bug #73170: Duplicates in Unique Secondary Index Because of Fix of Bug#68021](https://bugs.mysql.com/bug.php?id=73170). *bugs.mysql.com*, July 2014. Archived at [perma.cc/P6BV-W7JJ](https://perma.cc/P6BV-W7JJ)
[^51]: Gary Fredericks. [Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug). *github.com*, September 2015. Archived at [perma.cc/N8UP-2822](https://perma.cc/N8UP-2822)
[^52]: Xiao Chen. [HDFS DataNode Scanners and Disk Checker Explained](https://www.cloudera.com/blog/technical/hdfs-datanode-scanners-and-disk-checker-explained.html). *blog.cloudera.com*, December 2016. Archived at [perma.cc/6S36-X98L](https://perma.cc/6S36-X98L)
[^53]: Daniel Persson. [How does Ceph scrubbing work?](https://www.youtube.com/watch?v=M9QGMoc3GU8) *youtube.com*, March 2022.
[^54]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW)
[^55]: Martin Fowler. [The LMAX Architecture](https://martinfowler.com/articles/lmax.html). *martinfowler.com*, July 2011. Archived at [perma.cc/5AV4-N6RJ](https://perma.cc/5AV4-N6RJ)
[^56]: Sam Stokes. [Move Fast with Confidence](https://five-eights.com/2016/07/11/move-fast-with-confidence/). *five-eights.com*, July 2016. Archived at [perma.cc/J8C6-DHXB](https://perma.cc/J8C6-DHXB)
[^57]: Ralph C. Merkle. [A Digital Signature Based on a Conventional Encryption Function](https://people.eecs.berkeley.edu/~raluca/cs261-f15/readings/merkle.pdf). At *CRYPTO '87*, August 1987. [doi:10.1007/3-540-48184-2_32](https://doi.org/10.1007/3-540-48184-2_32)
[^58]: Ben Laurie. [Certificate Transparency](https://queue.acm.org/detail.cfm?id=2668154). *ACM Queue*, volume 12, issue 8, pages 10--19, August 2014. [doi:10.1145/2668152.2668154](https://doi.org/10.1145/2668152.2668154)
[^59]: Mark D. Ryan. [Enhanced Certificate Transparency and End-to-End Encrypted Mail](https://www.ndss-symposium.org/wp-content/uploads/2017/09/12_2_1.pdf). At *Network and Distributed System Security Symposium* (NDSS), February 2014. [doi:10.14722/ndss.2014.23379](https://doi.org/10.14722/ndss.2014.23379)


================================================
FILE: content/en/ch14.md
================================================
---
title: "14. Doing the Right Thing"
weight: 314
breadcrumbs: false
---

<a id="ch_right_thing"></a>

![](/map/ch13.png)

> *Feeding AI systems on the world's beauty, ugliness, and cruelty, but expecting it to reflect only
> the beauty is a fantasy.*
>
> Vinay Uday Prabhu and Abeba Birhane, *Large Datasets: A Pyrrhic Win for Computer Vision?* (2020)

> [!TIP] A NOTE FOR EARLY RELEASE READERS
> With Early Release ebooks, you get books in their earliest form---the author's raw and unedited
> content as they write---so you can take advantage of these technologies long before the official
> release of these titles.
>
> This will be the 14th chapter of the final book. The GitHub repo for this book is
> *[https://github.com/ept/ddia2-feedback](https://github.com/ept/ddia2-feedback)*.
>
> If you'd like to be actively involved in reviewing and commenting on this draft, please reach out on GitHub.

In the final chapter of this book, let's take a step back. Throughout this book we have examined a
wide range of different architectures for data systems, evaluated their pros and cons, and explored
techniques for building reliable, scalable, and maintainable applications. However, we have left out
an important and fundamental part of the discussion, which we should now fill in.

Every system is built for a purpose; every action we take has both intended and unintended
consequences. The purpose may be as simple as making money, but the consequences for the world may
reach far beyond that original purpose. We, the engineers building these systems, have a
responsibility to carefully consider those consequences and to consciously decide what kind of world
we want to live in.

We talk about data as an abstract thing, but remember that many datasets are about people: their
behavior, their interests, their identity. We must treat such data with humanity and respect. Users
are humans too, and human dignity is paramount [^1].

Software development increasingly involves making important ethical choices. There are guidelines to
help software engineers navigate these issues, such as the ACM Code of Ethics and Professional
Conduct [^2], but they are rarely discussed, applied, and enforced in practice. As a
result, engineers and product managers sometimes take a very cavalier attitude to privacy and
potential negative consequences of their products [^3], [^4].

A technology is not good or bad in itself---what matters is how it is used and how it affects
people. This is true for a software system like a search engine in much the same way as it is for a
weapon like a gun. It is not sufficient for software engineers to focus exclusively on the technology
and ignore its consequences: the ethical responsibility is ours to bear also. Reasoning about ethics
is difficult, but it is too important to ignore.

However, what makes something "good" or "bad" is not well-defined, and most people in computing
don't even discuss that question [^5]. In contrast to much of computing, the concepts at
the heart of ethics are not fixed or determinate in their precise meaning, and they require
interpretation, which may be subjective [^6]. Ethics is not going through some checklist
to confirm you comply; it's a participatory and iterative process of reflection, in dialog with the
people involved, with accountability for the results [^7].

## Predictive Analytics {#id369}

For example, predictive analytics is a major part of why people are excited about big data and AI.
Using data analysis to predict the weather, or the spread of diseases, is one thing [^8];
it is another matter to predict whether a convict is likely to reoffend, whether an applicant for a
loan is likely to default, or whether an insurance customer is likely to make expensive claims
[^9]. The latter have a direct effect on individual people's lives.

Naturally, payment networks want to prevent fraudulent transactions, banks want to avoid bad loans,
airlines want to avoid hijackings, and companies want to avoid hiring ineffective or untrustworthy
people. From their point of view, the cost of a missed business opportunity is low, but the cost of
a bad loan or a problematic employee is much higher, so it is natural for organizations to want to
be cautious. If in doubt, they are better off saying no.

However, as algorithmic decision-making becomes more widespread, someone who has (accurately or
falsely) been labeled as risky by some algorithm may suffer a large number of those "no" decisions.
Systematically being excluded from jobs, air travel, insurance coverage, property rental, financial
services, and other key aspects of society is such a large constraint of the individual's freedom
that it has been called "algorithmic prison" [^10]. In countries that respect human
rights, the criminal justice system presumes innocence until proven guilty; on the other hand,
automated systems can systematically and arbitrarily exclude a person from participating in society
without any proof of guilt, and with little chance of appeal.

### Bias and Discrimination {#id370}

Decisions made by an algorithm are not necessarily any better or any worse than those made by a
human. Every person is likely to have biases, even if they actively try to counteract them, and
discriminatory practices can become culturally institutionalized. There is hope that basing
decisions on data, rather than subjective and instinctive assessments by people, could be more fair
and give a better chance to people who are often overlooked in the traditional system
[^11].

When we develop predictive analytics and AI systems, we are not merely automating a human's decision
by using software to specify the rules for when to say yes or no; we are even leaving the rules
themselves to be inferred from data. However, the patterns learned by these systems are opaque: even
if there is some correlation in the data, we may not know why. If there is a systematic bias in the
input to an algorithm, the system will most likely learn and amplify that bias in its output
[^12].

In many countries, anti-discrimination laws prohibit treating people differently depending on
protected traits such as ethnicity, age, gender, sexuality, disability, or beliefs. Other features
of a person's data may be analyzed, but what happens if they are correlated with protected traits?
For example, in racially segregated neighborhoods, a person's postal code or even their IP address
is a strong predictor of race. Put like this, it seems ridiculous to believe that an algorithm could
somehow take biased data as input and produce fair and impartial output from it [^13],
[^14]. Yet this belief often seems to be implied by proponents of data-driven decision
making, an attitude that has been satirized as "machine learning is like money laundering for bias"
[^15].

Predictive analytics systems merely extrapolate from the past; if the past is discriminatory, they
codify and amplify that discrimination [^16]. If we want the future to be better than the
past, moral imagination is required, and that's something only humans can provide [^17].
Data and models should be our tools, not our masters.

### Responsibility and Accountability {#id371}

Automated decision making opens the question of responsibility and accountability [^17].
If a human makes a mistake, they can be held accountable, and the person affected by the decision
can appeal. Algorithms make mistakes too, but who is accountable if they go wrong [^18]?
When a self-driving car causes an accident, who is responsible? If an automated credit scoring
algorithm systematically discriminates against people of a particular race or religion, is there any
recourse? If a decision by your machine learning system comes under judicial review, can you explain
to the judge how the algorithm made its decision? People should not be able to evade their
responsibility by blaming an algorithm.

Credit rating agencies are an old example of collecting data to make decisions about people. A bad
credit score makes life difficult, but at least a credit score is normally based on relevant facts
about a person's actual borrowing history, and any errors in the record can be corrected (although
the agencies normally do not make this easy). However, scoring algorithms based on machine learning
typically use a much wider range of inputs and are much more opaque, making it harder to understand
how a particular decision has come about and whether someone is being treated in an unfair or
discriminatory way [^19].

A credit score summarizes "How did you behave in the past?" whereas predictive analytics usually
work on the basis of "Who is similar to you, and how did people like you behave in the past?"
Drawing parallels to others' behavior implies stereotyping people, for example based on where they
live (a close proxy for race and socioeconomic class). What about people who get put in the wrong
bucket? Furthermore, if a decision is incorrect due to erroneous data, recourse is almost impossible
[^17].

Much data is statistical in nature, which means that even if the probability distribution on the
whole is correct, individual cases may well be wrong. For example, if the average life expectancy in
your country is 80 years, that doesn't mean you're expected to drop dead on your 80th birthday. From
the average and the probability distribution, you can't say much about the age to which one
particular person will live. Similarly, the output of a prediction system is probabilistic and may
well be wrong in individual cases.

A blind belief in the supremacy of data for making decisions is not only delusional, it is
positively dangerous. As data-driven decision making becomes more widespread, we will need to figure
out how to make algorithms accountable and transparent, how to avoid reinforcing existing biases,
and how to fix them when they inevitably make mistakes.

We will also need to figure out how to prevent data being used to harm people, and realize its
positive potential instead. For example, analytics can reveal financial and social characteristics
of people's lives. On the one hand, this power could be used to focus aid and support to help those
people who most need it. On the other hand, it is sometimes used by predatory businesses seeking to
identify vulnerable people and sell them risky products such as high-cost loans and worthless
college degrees [^17], [^20].

### Feedback Loops {#id372}

Even with predictive applications that have less immediately far-reaching effects on people, such as
recommendation systems, there are difficult issues that we must confront. When services become good
at predicting what content users want to see, they may end up showing people only opinions they
already agree with, leading to echo chambers in which stereotypes, misinformation, and polarization
can breed. We are already seeing the impact of social media echo chambers on election campaigns.

When predictive analytics affect people's lives, particularly pernicious problems arise due to
self-reinforcing feedback loops. For example, consider the case of employers using credit scores to
evaluate potential hires. You may be a good worker with a good credit score, but suddenly find
yourself in financial difficulties due to a misfortune outside of your control. As you miss payments
on your bills, your credit score suffers, and you will be less likely to find work. Joblessness
pushes you toward poverty, which further worsens your scores, making it even harder to find
employment [^17]. It's a downward spiral due to poisonous assumptions, hidden behind a
camouflage of mathematical rigor and data.

As another example of a feedback loop, economists found that when gas stations in Germany introduced
algorithmic prices, competition was reduced and prices for consumers went up because the algorithms
learned to collude [^21].

We can't always predict when such feedback loops happen. However, many consequences can be predicted
by thinking about the entire system (not just the computerized parts, but also the people
interacting with it)---an approach known as *systems thinking* [^22]. We can try to
understand how a data analysis system responds to different behaviors, structures, or
characteristics. Does the system reinforce and amplify existing differences between people (e.g.,
making the rich richer or the poor poorer), or does it try to combat injustice? And even with the
best intentions, we must beware of unintended consequences.

## Privacy and Tracking {#id373}

Besides the problems of predictive analytics---i.e., using data to make automated decisions about
people---there are ethical problems with data collection itself. What is the relationship between
the organizations collecting data and the people whose data is being collected?

When a system only stores data that a user has explicitly entered, because they want the system to
store and process it in a certain way, the system is performing a service for the user: the user is
the customer. But when a user's activity is tracked and logged as a side effect of other things they
are doing, the relationship is less clear. The service no longer just does what the user tells it to
do, but it takes on interests of its own, which may conflict with the user's interests.

Tracking behavioral data has become increasingly important for user-facing features of many online
services: tracking which search results are clicked helps improve the ranking of search results;
recommending "people who liked X also liked Y" helps users discover interesting and useful things;
A/B tests and user flow analysis can help indicate how a user interface might be improved. Those
features require some amount of tracking of user behavior, and users benefit from them.

However, depending on a company's business model, tracking often doesn't stop there. If the service
is funded through advertising, the advertisers are the actual customers, and the users' interests
take second place. Tracking data becomes more detailed, analyses become further-reaching, and data
is retained for a long time in order to build up detailed profiles of each person for marketing
purposes.

Now the relationship between the company and the user whose data is being collected starts looking
quite different. The user is given a free service and is coaxed into engaging with it as much as
possible. The tracking of the user serves not primarily that individual, but rather the needs of the
advertisers who are funding the service. This relationship can be appropriately described with a
word that has more sinister connotations: *surveillance*.

### Surveillance {#id374}

As a thought experiment, try replacing the word *data* with *surveillance*, and observe if common
phrases still sound so good [^23]. How about this: "In our surveillance-driven
organization we collect real-time surveillance streams and store them in our surveillance warehouse.
Our surveillance scientists use advanced analytics and surveillance processing in order to derive
new insights."

This thought experiment is unusually polemic for this book, *Designing Surveillance-Intensive
Applications*, but strong words are needed to emphasize this point. In our attempts to make software
"eat the world" [^24], we have built the greatest mass surveillance infrastructure the
world has ever seen. We are rapidly approaching a world in which every inhabited space contains at
least one internet-connected microphone, in the form of smartphones, smart TVs, voice-controlled
assistant devices, baby monitors, and even children's toys that use cloud-based speech recognition.
Many of these devices have a terrible security record [^25].

What is new compared to the past is that digitization has made it easy to collect large amounts of
data about people. Surveillance of our location and movements, our social relationships and
communications, our purchases and payments, and data about our health have become almost
unavoidable. A surveillance organization may end up knowing more about a person than that person
knows about themselves---for example, identifying illnesses or economic problems before the person
themselves is aware of them.

Even the most totalitarian and repressive regimes of the past could only dream of putting a
microphone in every room and forcing every person to constantly carry a device capable of tracking
their location and movements. Yet the benefits that we get from digital technology are so great that
we now voluntarily accept this world of total surveillance. The difference is just that the data is
being collected by corporations to provide us with services, rather than government agencies seeking
control [^26].

Not all data collection necessarily qualifies as surveillance, but examining it as such can help us
understand our relationship with the data collector. Why are we seemingly happy to accept
surveillance by corporations? Perhaps you feel you have nothing to hide---in other words, you are
totally in line with existing power structures, you are not a marginalized minority, and you needn't
fear persecution [^27]. Not everyone is so fortunate. Or perhaps it's because the purpose
seems benign---it's not overt coercion and conformance, but merely better recommendations and more
personalized marketing. However, combined with the discussion of predictive analytics from the last
section, that distinction seems less clear.

We are already seeing behavioral data on car driving, tracked by cars without drivers' consent,
affecting their insurance premiums [^28], and health insurance coverage that depends on
people wearing a fitness tracking device. When surveillance is used to determine things that hold
sway over important aspects of life, such as insurance coverage or employment, it starts to appear
less benign. Moreover, data analysis can reveal surprisingly intrusive things: for example, the
movement sensor in a smartwatch or fitness tracker can be used to work out what you are typing (for
example, passwords) with fairly good accuracy [^29]. Sensor accuracy and algorithms for
analysis are only going to get better.

### Consent and Freedom of Choice {#id375}

We might assert that users voluntarily choose to use a service that tracks their activity, and they
have agreed to the terms of service and privacy policy, so they consent to data collection. We might
even claim that users are receiving a valuable service in return for the data they provide, and that
the tracking is necessary in order to provide the service. Undoubtedly, social networks, search
engines, and various other free online services are valuable to users---but there are problems with
this argument.

First, we should ask in what way the tracking is necessary. Some forms of tracking directly feed
into improving features for users: for example, tracking the click-through rate on search results
can help improve a search engine's result ranking and relevance, and tracking which products
customers tend to buy together can help an online shop suggest related products. However, when
tracking user interaction for content recommendations, or to build user profiles for advertising
purposes, it is less clear whether this is genuinely in the user's interest---or is it only
necessary because the ads pay for the service?

Second, users have little knowledge of what data they are feeding into our databases, or how it is
retained and processed---and most privacy policies do more to obscure than to illuminate. Without
understanding what happens to their data, users cannot give any meaningful consent. Often, data from
one user also says things about other people who are not users of the service and who have not
agreed to any terms. The derived datasets that we discussed in this part of the book---in which data
from the entire user base may have been combined with behavioral tracking and external data
sources---are precisely the kinds of data of which users cannot have any meaningful understanding.

Moreover, data is extracted from users through a one-way process, not a relationship with true
reciprocity, and not a fair value exchange. There is no dialog, no option for users to negotiate how
much data they provide and what service they receive in return: the relationship between the service
and the user is very asymmetric and one-sided. The terms are set by the service, not by the user
[^30], [^31].

In the European Union, the *General Data Protection Regulation* (GDPR) requires that consent must be
"freely given, specific, informed, and unambiguous", and that the user must be able to "refuse or
withdraw consent without detriment"---otherwise it is not considered "freely given". Any request for
consent must be written "in an intelligible and easily accessible form, using clear and plain
language". Moreover, "silence, pre-ticked boxes or inactivity \[do not\] constitute consent"
[^32]. There are other bases for lawful processing of personal data besides consent, such
as *legitimate interest*, which permits certain uses of data such as fraud prevention
[^33].

You might argue that a user who does not consent to surveillance can simply choose not to use a
service. But this choice is not free either: if a service is so popular that it is "regarded by most
people as essential for basic social participation" [^30], then it is not reasonable to
expect people to opt out of this service---using it is *de facto* mandatory. For example, in most
Western social communities, it has become the norm to carry a smartphone, to use social networks for
socializing, and to use Google for finding information. Especially when a service has network
effects, there is a social cost to people choosing *not* to use it.

Declining to use a service due to its user tracking policies is easier said than done. These
platforms are designed specifically to engage users. Many use game mechanics and tactics common in
gambling to keep users coming back [^34]. Even if a user gets past this, declining to
engage is only an option for the small number of people who are privileged enough to have the time
and knowledge to understand its privacy policy, and who can afford to potentially miss out on social
participation or professional opportunities that may have arisen if they had participated in the
service. For people in a less privileged position, there is no meaningful freedom of choice:
surveillance becomes inescapable.

### Privacy and Use of Data {#id457}

Sometimes people claim that "privacy is dead" on the grounds that some users are willing to post all
sorts of things about their lives to social media, sometimes mundane and sometimes deeply personal.
However, this claim is false and rests on a misunderstanding of the word *privacy*.

Having privacy does not mean keeping everything secret; it means having the freedom to choose which
things to reveal to whom, what to make public, and what to keep secret. The right to privacy is a
decision right: it enables each person to decide where they want to be on the spectrum between
secrecy and transparency in each situation [^30]. It is an important aspect of a person's
freedom and autonomy.

For example, someone who suffers from a rare medical condition might be very happy to provide their
private medical data to researchers if there is a chance that it might help the development of
treatments for their condition. However, the important thing is that this person has a choice over
who may access this data, and for what purpose. If there was a risk that information about their
medical condition would harm their access to medical insurance or employment or other important
things, this person would probably be much more cautious about sharing their data.

When data is extracted from people through surveillance infrastructure, privacy rights are not
necessarily eroded, but rather transferred to the data collector. Companies that acquire data
essentially say "trust us to do the right thing with your data," which means that the right to
decide what to reveal and what to keep secret is transferred from the individual to the company.

The companies in turn choose to keep much of the outcome of this surveillance secret, because to
reveal it would be perceived as creepy, and would harm their business model (which relies on knowing
more about people than other companies do). Intimate information about users is only revealed
indirectly, for example in the form of tools for targeting advertisements to specific groups of
people (such as those suffering from a particular illness).

Even if particular users cannot be personally reidentified from the bucket of people targeted by a
particular ad, they have lost their agency about the disclosure of some intimate information. It is
not the user who decides what is revealed to whom on the basis of their personal preferences---it is
the company that exercises the privacy right with the goal of maximizing its profit.

Many companies have a goal of not being *perceived* as creepy---avoiding the question of how
intrusive their data collection actually is, and instead focusing on managing user perceptions. And
even these perceptions are often managed poorly: for example, something may be factually correct,
but if it triggers painful memories, the user may not want to be reminded about it [^35].
With any kind of data we should expect the possibility that it is wrong, undesirable, or
inappropriate in some way, and we need to build mechanisms for handling those failures. Whether
something is "undesirable" or "inappropriate" is of course down to human judgment; algorithms are
oblivious to such notions unless we explicitly program them to respect human needs. As engineers of
these systems we must be humble, accepting and planning for such failings.

Privacy settings that allow a user of an online service to control which aspects of their data other
users can see are a starting point for handing back some control to users. However, regardless of
the setting, the service itself still has unfettered access to the data, and is free to use it in
any way permitted by the privacy policy. Even if the service promises not to sell the data to third
parties, it usually grants itself unrestricted rights to process and analyze the data internally,
often going much further than what is overtly visible to users.

This kind of large-scale transfer of privacy rights from individuals to corporations is historically
unprecedented [^30]. Surveillance has always existed, but it used to be expensive and
manual, not scalable and automated. Trust relationships have always existed, for example between a
patient and their doctor, or between a defendant and their attorney---but in these cases the use of
data has been strictly governed by ethical, legal, and regulatory constraints. Internet services
have made it much easier to amass huge amounts of sensitive information without meaningful consent,
and to use it at massive scale without users understanding what is happening to their private data.

### Data as Assets and Power {#id376}

Since behavioral data is a byproduct of users interacting with a service, it is sometimes called
"data exhaust"---suggesting that the data is worthless waste material. Viewed this way, behavioral
and predictive analytics can be seen as a form of recycling that extracts value from data that would
have otherwise been thrown away.

More correct would be to view it the other way round: from an economic point of view, if targeted
advertising is what pays for a service, then the user activity that generates behavioral data could
be regarded as a form of labor [^36]. One could go even further and argue that the
application with which the user interacts is merely a means to lure users into feeding more and more
personal information into the surveillance infrastructure [^30]. The delightful human
creativity and social relationships that often find expression in online services are cynically
exploited by the data extraction machine.

Personal data is a valuable asset, as evidenced by the existence of data brokers, a shady industry
operating in secrecy, purchasing, aggregating, analyzing, inferring, and reselling intrusive
personal data about people, mostly for marketing purposes [^20]. Startups are valued by
their user numbers, by "eyeballs"---i.e., by their surveillance capabilities.

Because the data is valuable, many people want it. Of course companies want it---that's why they
collect it in the first place. But governments want to obtain it too: by means of secret deals,
coercion, legal compulsion, or simply stealing it [^37]. When a company goes bankrupt, the
personal data it has collected is one of the assets that gets sold. Moreover, the data is difficult
to secure, so breaches happen disconcertingly often.

These observations have led critics to say that data is not just an asset, but a "toxic asset"
[^37], or at least "hazardous material" [^38]. Maybe data is not the new gold,
nor the new oil, but rather the new uranium [^39]. Even if we think that we are capable of
preventing abuse of data, whenever we collect data, we need to balance the benefits with the risk of
it falling into the wrong hands: computer systems may be compromised by criminals or hostile foreign
intelligence services, data may be leaked by insiders, the company may fall into the hands of
unscrupulous management that does not share our values, or the country may be taken over by a regime
that has no qualms about compelling us to hand over the data.

When collecting data, we need to consider not just today's political environment, but all possible
future governments. There is no guarantee that every government elected in future will respect human
rights and civil liberties, so "it is poor civic hygiene to install technologies that could someday
facilitate a police state" [^40].

"Knowledge is power," as the old adage goes. And furthermore, "to scrutinize others while avoiding
scrutiny oneself is one of the most important forms of power" [^41]. This is why
totalitarian governments want surveillance: it gives them the power to control the population.
Although today's technology companies are not overtly seeking political power, the data and
knowledge they have accumulated nevertheless gives them a lot of power over our lives, much of which
is surreptitious, outside of public oversight [^42].

### Remembering the Industrial Revolution {#id377}

Data is the defining feature of the information age. The internet, data storage, processing, and
software-driven automation are having a major impact on the global economy and human society. As our
daily lives and social organization have been changed by information technology, and will probably
continue to radically change in the coming decades, comparisons to the Industrial Revolution come to
mind [^17], [^26].

The Industrial Revolution came about through major technological and agricultural advances, and it
brought sustained economic growth and significantly improved living standards in the long run. Yet
it also came with major problems: pollution of the air (due to smoke and chemical processes) and the
water (from industrial and human waste) was dreadful. Factory owners lived in splendor, while urban
workers often lived in very poor housing and worked long hours in harsh conditions. Child labor was
common, including dangerous and poorly paid work in mines.

It took a long time before safeguards were established, such as environmental protection
regulations, safety protocols for workplaces, outlawing child labor, and health inspections for
food. Undoubtedly the cost of doing business increased when factories were no longer allowed to dump
their waste into rivers, sell tainted foods, or exploit workers. But society as a whole benefited
hugely from these regulations, and few of us would want to return to a time before [^17].

Just as the Industrial Revolution had a dark side that needed to be managed, our transition to the
information age has major problems that we need to confront and solve [^43], [^44].
The collection and use of data is one of those problems. In the words of Bruce Schneier
[^26]:

> Data is the pollution problem of the information age, and protecting privacy is the environmental
> challenge. Almost all computers produce information. It stays around, festering. How we deal with
> it---how we contain it and how we dispose of it---is central to the health of our information
> economy. Just as we look back today at the early decades of the industrial age and wonder how our
> ancestors could have ignored pollution in their rush to build an industrial world, our
> grandchildren will look back at us during these early decades of the information age and judge us
> on how we addressed the challenge of data collection and misuse.
>
> We should try to make them proud.

### Legislation and Self-Regulation {#sec_future_legislation}

Data protection laws might be able to help preserve individuals' rights. For example, the European
GDPR states that personal data must be "collected for specified, explicit and legitimate purposes
and not further processed in a manner that is incompatible with those purposes", and furthermore
that data must be "adequate, relevant and limited to what is necessary in relation to the purposes
for which they are processed" [^32].

However, this principle of *data minimization* runs directly counter to the philosophy of Big Data,
which is to maximize data collection, to combine it with other datasets, to experiment and to
explore in order to generate new insights. Exploration means using data for unforeseen purposes,
which is the opposite of the "specified and explicit" purposes for which the data must have been
collected. While the GDPR has had some effect on the online advertising industry [^45],
the regulation has been weakly enforced [^46], and it does not seem to have led to much of
a change in culture and practices across the wider tech industry.

Companies that collect lots of data about people oppose regulation as being a burden and a hindrance
to innovation. To some extent that opposition is justified. For example, when sharing medical data,
there are clear risks to privacy, but there are also potential opportunities: how many deaths could
be prevented if data analysis was able to help us achieve better diagnostics or find better
treatments [^47]? Over-regulation may prevent such breakthroughs. It is difficult to
balance such potential opportunities with the risks [^41].

Fundamentally, we need a culture shift in the tech industry with regard to personal data. We should
stop regarding users as metrics to be optimized, and remember that they are humans who deserve
respect, dignity, and agency. We should self-regulate our data collection and processing practices
in order to establish and maintain the trust of the people who depend on our software
[^48]. And we should take it upon ourselves to educate end users about how their data is
used, rather than keeping them in the dark.

We should allow each individual to maintain their privacy---i.e., their control over their own data---and
not steal that control from them through surveillance. Our individual right to control our data is
like the natural environment of a national park: if we don't explicitly protect and care for it, it
will be destroyed. It will be the tragedy of the commons, and we will all be worse off for it.
Ubiquitous surveillance is not inevitable---we are still able to stop it.

As a first step, we should not retain data forever, but purge it as soon as it is no longer needed,
and minimize what we collect in the first place [^48], [^49]. Data you don't have is
data that can't be leaked, stolen, or compelled by governments to be handed over. Overall, culture
and attitude changes will be necessary. As people working in technology, if we don't consider the
societal impact of our work, we're not doing our job [^50].

## Summary {#id594}

This brings us to the end of the book. We have covered a lot of ground:

- In [Chapter 1](/en/ch1#ch_tradeoffs) we contrasted analytical and operational systems, compared
  the cloud to self-hosting, weighed up distributed and single-node systems, and discussed balancing
  the needs of your business with the needs of your users.

- In [Chapter 2](/en/ch2#ch_nonfunctional) we saw how to define several nonfunctional requirements
  such as performance, reliability, scalability, and maintainability.

- In [Chapter 3](/en/ch3#ch_datamodels) we explored a spectrum of data models, including the
  relational, document, and graph models, event sourcing, and DataFrames. We also looked at examples
  of various query languages, including SQL, Cypher, SPARQL, Datalog, and GraphQL.

- In [Chapter 4](/en/ch4#ch_storage) we discussed storage engines for OLTP (LSM-trees and B-trees),
  for analytics (column-oriented storage), and indexes for information retrieval (full-text and
  vector search).

- In [Chapter 5](/en/ch5#ch_encoding) we examined different ways of encoding data objects as bytes,
  and how to support evolution as requirements change. We also compared several ways in which data flows
  between processes: via databases, service calls, workflow engines, or event-driven architectures.

- In [Chapter 6](/en/ch6#ch_replication) we studied the trade-offs between single-leader,
  multi-leader, and leaderless replication. We also looked at consistency models such as
  read-after-write consistency, and sync engines that allow clients to work offline.

- In [Chapter 7](/en/ch7#ch_sharding) we went into sharding, including strategies for rebalancing,
  request routing, and secondary indexing.

- In [Chapter 8](/en/ch8#ch_transactions) we covered transactions: durability, how various isolation
  levels (read committed, snapshot isolation, and serializable) can be achieved, and how atomicity
  can be ensured in distributed transactions.

- In [Chapter 9](/en/ch9#ch_distributed) we surveyed fundamental problems that occur in distributed
  systems (network faults and delays, clock errors, process pauses, crashes), and saw how they make
  it difficult to correctly implement even something seemingly simple like a lock.

- In [Chapter 10](/en/ch10#ch_consistency) we went on a deep dive into various forms of consensus
  and the consistency model (linearizability) it enables.

- In [Chapter 11](/en/ch11#ch_batch) we dug into batch processing, building up from simple chains of
  Unix tools to large-scale distributed batch processors using distributed filesystems or object
  stores.

- In [Chapter 12](/en/ch12#ch_stream) we generalized batch processing to stream processing,
  discussed the underlying message brokers, change data capture, fault tolerance, and processing
  patterns such as streaming joins.

- In [Chapter 13](/en/ch13#ch_philosophy) we explored a philosophy of streaming systems that allows
  disparate data systems to be integrated, systems to be evolved, and applications to be scaled more
  easily.

Finally, in this last chapter, we took a step back and examined some ethical aspects of building
data-intensive applications. We saw that although data can be used to do good, it can also do
significant harm: making decisions that seriously affect people's lives and are difficult to appeal
against, leading to discrimination and exploitation, normalizing surveillance, and exposing intimate
information. We also run the risk of data breaches, and we may find that a well-intentioned use of
data has unintended consequences.

As software and data are having such a large impact on the world, we as engineers must remember that
we carry a responsibility to work toward the kind of world that we want to live in: a world that
treats people with humanity and respect. Let's work together towards that goal.

##### Footnotes

### References {#references}

[^1]: David Schmudde. [What If Data Is a Bad Idea?](https://schmud.de/posts/2024-08-18-data-is-a-bad-idea.html). *schmud.de*, August 2024. Archived at [perma.cc/ZXU5-XMCT](https://perma.cc/ZXU5-XMCT)
[^2]: [ACM Code of Ethics and Professional Conduct](https://www.acm.org/code-of-ethics). Association for Computing Machinery, *acm.org*, 2018. Archived at [perma.cc/SEA8-CMB8](https://perma.cc/SEA8-CMB8)
[^3]: Igor Perisic. [Making Hard Choices: The Quest for Ethics in Machine Learning](https://www.linkedin.com/blog/engineering/archive/making-hard-choices-the-quest-for-ethics-in-machine-learning). *linkedin.com*, November 2016. Archived at [perma.cc/DGF8-KNT7](https://perma.cc/DGF8-KNT7)
[^4]: John Naughton. [Algorithm Writers Need a Code of Conduct](https://www.theguardian.com/commentisfree/2015/dec/06/algorithm-writers-should-have-code-of-conduct). *theguardian.com*, December 2015. Archived at [perma.cc/TBG2-3NG6](https://perma.cc/TBG2-3NG6)
[^5]: Ben Green. ["Good" isn't good enough](https://www.benzevgreen.com/wp-content/uploads/2019/11/19-ai4sg.pdf). At *NeurIPS Joint Workshop on AI for Social Good*, December 2019. Archived at [perma.cc/H4LN-7VY3](https://perma.cc/H4LN-7VY3)
[^6]: Deborah G. Johnson and Mario Verdicchio. [Ethical AI is Not about AI](https://cacm.acm.org/opinion/ethical-ai-is-not-about-ai/). *Communications of the ACM*, volume 66, issue 2, pages 32--34, January 2023. [doi:10.1145/3576932](https://doi.org/10.1145/3576932)
[^7]: Marc Steen. [Ethics as a Participatory and Iterative Process](https://cacm.acm.org/opinion/ethics-as-a-participatory-and-iterative-process/). *Communications of the ACM*, volume 66, issue 5, pages 27--29, April 2023. [doi:10.1145/3550069](https://doi.org/10.1145/3550069)
[^8]: Logan Kugler. [What Happens When Big Data Blunders?](https://cacm.acm.org/news/what-happens-when-big-data-blunders/) *Communications of the ACM*, volume 59, issue 6, pages 15--16, June 2016. [doi:10.1145/2911975](https://doi.org/10.1145/2911975)
[^9]: Miri Zilka. [Algorithms and the criminal justice system: promises and challenges in deployment and research](https://www.cl.cam.ac.uk/research/security/seminars/archive/video/2023-03-07-t196231.html). At *University of Cambridge Security Seminar Series*, March 2023.
[^10]: Bill Davidow. [Welcome to Algorithmic Prison](https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/). *theatlantic.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20171019201812/https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/)
[^11]: Don Peck. [They're Watching You at Work](https://www.theatlantic.com/magazine/archive/2013/12/theyre-watching-you-at-work/354681/). *theatlantic.com*, December 2013. Archived at [perma.cc/YR9T-6M38](https://perma.cc/YR9T-6M38)
[^12]: Leigh Alexander. [Is an Algorithm Any Less Racist Than a Human?](https://www.theguardian.com/technology/2016/aug/03/algorithm-racist-human-employers-work) *theguardian.com*, August 2016. Archived at [perma.cc/XP93-DSVX](https://perma.cc/XP93-DSVX)
[^13]: Jesse Emspak. [How a Machine Learns Prejudice](https://www.scientificamerican.com/article/how-a-machine-learns-prejudice/). *scientificamerican.com*, December 2016. Archived at [perma.cc/R3L5-55E6](https://perma.cc/R3L5-55E6)
[^14]: Rohit Chopra, Kristen Clarke, Charlotte A. Burrows, and Lina M. Khan. [Joint Statement on Enforcement Efforts Against Discrimination and Bias in Automated Systems](https://www.ftc.gov/system/files/ftc_gov/pdf/EEOC-CRT-FTC-CFPB-AI-Joint-Statement%28final%29.pdf). *ftc.gov*, April 2023. Archived at [perma.cc/YY4Y-RCCA](https://perma.cc/YY4Y-RCCA)
[^15]: Maciej Cegłowski. [The Moral Economy of Tech](https://idlewords.com/talks/sase_panel.htm). *idlewords.com*, June 2016. Archived at [perma.cc/L8XV-BKTD](https://perma.cc/L8XV-BKTD)
[^16]: Greg Nichols. [Artificial Intelligence in healthcare is racist](https://www.zdnet.com/article/artificial-intelligence-in-healthcare-is-racist/). *zdnet.com*, November 2020. Archived at [perma.cc/3MKW-YKRS](https://perma.cc/3MKW-YKRS)
[^17]: Cathy O'Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 978-0-553-41881-1
[^18]: Julia Angwin. [Make Algorithms Accountable](https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html). *nytimes.com*, August 2016. Archived at [archive.org](https://web.archive.org/web/20230819055242/https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html)
[^19]: Bryce Goodman and Seth Flaxman. [European Union Regulations on Algorithmic Decision-Making and a 'Right to Explanation'](https://arxiv.org/abs/1606.08813). At *ICML Workshop on Human Interpretability in Machine Learning*, June 2016. Archived at [arxiv.org/abs/1606.08813](https://arxiv.org/abs/1606.08813)
[^20]: [A Review of the Data Broker Industry: Collection, Use, and Sale of Consumer Data for Marketing Purposes](https://www.commerce.senate.gov/services/files/0d2b3642-6221-4888-a631-08f2f255b577). Staff Report, *United States Senate Committee on Commerce, Science, and Transportation*, *commerce.senate.gov*, December 2013. Archived at [perma.cc/32NV-YWLQ](https://perma.cc/32NV-YWLQ)
[^21]: Stephanie Assad, Robert Clark, Daniel Ershov, and Lei Xu. [Algorithmic Pricing and Competition: Empirical Evidence from the German Retail Gasoline Market](https://economics.yale.edu/sites/default/files/clark_acex_jan_2021.pdf). *Journal of Political Economy*, volume 132, issue 3, pages 723--771, March 2024. [doi:10.1086/726906](https://doi.org/10.1086/726906)
[^22]: Donella H. Meadows and Diana Wright. *Thinking in Systems: A Primer*. Chelsea Green Publishing, 2008. ISBN: 978-1-603-58055-7
[^23]: Daniel J. Bernstein. [Listening to a "big data"/"data science" talk. Mentally translating "data" to "surveillance": "\...everything starts with surveillance\..."](https://x.com/hashbreaker/status/598076230437568512) *x.com*, May 2015. Archived at [perma.cc/EY3D-WBBJ](https://perma.cc/EY3D-WBBJ)
[^24]: Marc Andreessen. [Why Software Is Eating the World](https://a16z.com/why-software-is-eating-the-world/). *a16z.com*, August 2011. Archived at [perma.cc/3DCC-W3G6](https://perma.cc/3DCC-W3G6)
[^25]: J. M. Porup. ['Internet of Things' Security Is Hilariously Broken and Getting Worse](https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/). *arstechnica.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250823001716/https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/)
[^26]: Bruce Schneier. [*Data and Goliath: The Hidden Battles to Collect Your Data and Control Your World*](https://www.schneier.com/books/data_and_goliath/). W. W. Norton, 2015. ISBN: 978-0-393-35217-7
[^27]: The Grugq. [Nothing to Hide](https://grugq.tumblr.com/post/142799983558/nothing-to-hide). *grugq.tumblr.com*, April 2016. Archived at [perma.cc/BL95-8W5M](https://perma.cc/BL95-8W5M)
[^28]: Federal Trade Commission. [FTC Takes Action Against General Motors for Sharing Drivers' Precise Location and Driving Behavior Data Without Consent](https://www.ftc.gov/news-events/news/press-releases/2025/01/ftc-takes-action-against-general-motors-sharing-drivers-precise-location-driving-behavior-data). *ftc.gov*, January 2025. Archived at [perma.cc/3XGV-3HRD](https://perma.cc/3XGV-3HRD)
[^29]: Tony Beltramelli. [Deep-Spying: Spying Using Smartwatch and Deep Learning](https://arxiv.org/abs/1512.05616). Masters Thesis, IT University of Copenhagen, December 2015. Archived at [arxiv.org/abs/1512.05616](https://arxiv.org/abs/1512.05616)
[^30]: Shoshana Zuboff. [Big Other: Surveillance Capitalism and the Prospects of an Information Civilization](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2594754). *Journal of Information Technology*, volume 30, issue 1, pages 75--89, April 2015. [doi:10.1057/jit.2015.5](https://doi.org/10.1057/jit.2015.5)
[^31]: Michiel Rhoen. [Beyond Consent: Improving Data Protection Through Consumer Protection Law](https://policyreview.info/articles/analysis/beyond-consent-improving-data-protection-through-consumer-protection-law). *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.404](https://doi.org/10.14763/2016.1.404)
[^32]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016](https://eur-lex.europa.eu/eli/reg/2016/679/oj/eng). *Official Journal of the European Union*, L 119/1, May 2016.
[^33]: UK Information Commissioner's Office. [What is the 'legitimate interests' basis?](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/lawful-basis/legitimate-interests/what-is-the-legitimate-interests-basis/) *ico.org.uk*. Archived at [perma.cc/W8XR-F7ML](https://perma.cc/W8XR-F7ML)
[^34]: Tristan Harris. [How a handful of tech companies control billions of minds every day](https://www.ted.com/talks/tristan_harris_how_a_handful_of_tech_companies_control_billions_of_minds_every_day). At *TED2017*, April 2017.
[^35]: Carina C. Zona. [Consequences of an Insightful Algorithm](https://www.youtube.com/watch?v=YRI40A4tyWU). At *GOTO Berlin*, November 2016.
[^36]: Imanol Arrieta Ibarra, Leonard Goff, Diego Jiménez Hernández, Jaron Lanier, and E. Glen Weyl. [Should We Treat Data as Labor? Moving Beyond 'Free'](https://www.aeaweb.org/conference/2018/preliminary/paper/2Y7N88na). *American Economic Association Papers Proceedings*, volume 1, issue 1, December 2017.
[^37]: Bruce Schneier. [Data Is a Toxic Asset, So Why Not Throw It Out?](https://www.schneier.com/essays/archives/2016/03/data_is_a_toxic_asse.html) *schneier.com*, March 2016. Archived at [perma.cc/4GZH-WR3D](https://perma.cc/4GZH-WR3D)
[^38]: Cory Scott. [Data is not toxic - which implies no benefit - but rather hazardous material, where we must balance need vs. want](https://x.com/cory_scott/status/706586399483437056). *x.com*, March 2016. Archived at [perma.cc/CLV7-JF2E](https://perma.cc/CLV7-JF2E)
[^39]: Mark Pesce. [Data is the new uranium -- incredibly powerful and amazingly dangerous](https://www.theregister.com/2024/11/20/data_is_the_new_uranium/). *theregister.com*, November 2024. Archived at [perma.cc/NV8B-GYGV](https://perma.cc/NV8B-GYGV)
[^40]: Bruce Schneier. [Mission Creep: When Everything Is Terrorism](https://www.schneier.com/essays/archives/2013/07/mission_creep_when_e.html). *schneier.com*, July 2013. Archived at [perma.cc/QB2C-5RCE](https://perma.cc/QB2C-5RCE)
[^41]: Lena Ulbricht and Maximilian von Grafenstein. [Big Data: Big Power Shifts?](https://policyreview.info/articles/analysis/big-data-big-power-shifts) *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.406](https://doi.org/10.14763/2016.1.406)
[^42]: Ellen P. Goodman and Julia Powles. [Facebook and Google: Most Powerful and Secretive Empires We've Ever Known](https://www.theguardian.com/technology/2016/sep/28/google-facebook-powerful-secretive-empire-transparency). *theguardian.com*, September 2016. Archived at [perma.cc/8UJA-43G6](https://perma.cc/8UJA-43G6)
[^43]: Judy Estrin and Sam Gill. [The World Is Choking on Digital Pollution](https://washingtonmonthly.com/2019/01/13/the-world-is-choking-on-digital-pollution/). *washingtonmonthly.com*, January 2019. Archived at [perma.cc/3VHF-C6UC](https://perma.cc/3VHF-C6UC)
[^44]: A. Michael Froomkin. [Regulating Mass Surveillance as Privacy Pollution: Learning from Environmental Impact Statements](https://repository.law.miami.edu/cgi/viewcontent.cgi?article=1062&context=fac_articles). *University of Illinois Law Review*, volume 2015, issue 5, August 2015. Archived at [perma.cc/24ZL-VK2T](https://perma.cc/24ZL-VK2T)
[^45]: Pengyuan Wang, Li Jiang, and Jian Yang. [The Early Impact of GDPR Compliance on Display Advertising: The Case of an Ad Publisher](https://openreview.net/pdf?id=TUnLHNo19S). *Journal of Marketing Research*, volume 61, issue 1, April 2023. [doi:10.1177/00222437231171848](https://doi.org/10.1177/00222437231171848)
[^46]: Johnny Ryan. [Don't be fooled by Meta's fine for data breaches](https://www.economist.com/by-invitation/2023/05/24/dont-be-fooled-by-metas-fine-for-data-breaches-says-johnny-ryan). *The Economist*, May 2023. Archived at [perma.cc/VCR6-55HR](https://perma.cc/VCR6-55HR)
[^47]: Jessica Leber. [Your Data Footprint Is Affecting Your Life in Ways You Can't Even Imagine](https://www.fastcompany.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine). *fastcompany.com*, March 2016. Archived at [archive.org](https://web.archive.org/web/20161128133016/https://www.fastcoexist.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine)
[^48]: Maciej Cegłowski. [Haunted by Data](https://idlewords.com/talks/haunted_by_data.htm). *idlewords.com*, October 2015. Archived at [archive.org](https://web.archive.org/web/20161130143932/https://idlewords.com/talks/haunted_by_data.htm)
[^49]: Sam Thielman. [You Are Not What You Read: Librarians Purge User Data to Protect Privacy](https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy). *theguardian.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250828224851/https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy)
[^50]: Jez Humble. [It's a cliché that people get into tech to "change the world". So then, you have to actually consider what the impact of your work is on the world. The idea that you can or should exclude societal and political discussions in tech is idiotic. It means you're not doing your job](https://x.com/jezhumble/status/1386758340894597122). *x.com*, April 2021. Archived at [perma.cc/3NYS-MHLC](https://perma.cc/3NYS-MHLC)


================================================
FILE: content/en/ch2.md
================================================
---
title: "2. Defining Nonfunctional Requirements"
weight: 102
breadcrumbs: false
---

<a id="ch_nonfunctional"></a>

![](/map/ch01.png)

> *The Internet was done so well that most people think of it as a natural resource like the Pacific
> Ocean, rather than something that was man-made. When was the last time a technology with a scale
> like that was so error-free?*
>
> [Alan Kay](https://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442),
> in interview with *Dr Dobb’s Journal* (2012)

If you are building an application, you will be driven by a list of requirements. At the top of your
list is most likely the functionality that the application must offer: what screens and what buttons
you need, and what each operation is supposed to do in order to fulfill the purpose of your
software. These are your *functional requirements*.

In addition, you probably also have some *nonfunctional requirements*: for example, the app should
be fast, reliable, secure, legally compliant, and easy to maintain. These requirements might not be
explicitly written down, because they may seem somewhat obvious, but they are just as important as
the app’s functionality: an app that is unbearably slow or unreliable might as well not exist.

Many nonfunctional requirements, such as security, fall outside the scope of this book. But there
are a few nonfunctional requirements that we will consider, and this chapter will help you
articulate them for your own systems:

* How to define and measure the *performance* of a system (see [“Describing Performance”](/en/ch2#sec_introduction_percentiles));
* What it means for a service to be *reliable*—namely, continuing to work correctly, even when
 things go wrong (see [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability));
* Allowing a system to be *scalable* by having efficient ways of adding computing
 capacity as the load on the system grows (see [“Scalability”](/en/ch2#sec_introduction_scalability)); and
* Making it easier to maintain a system in the long term (see [“Maintainability”](/en/ch2#sec_introduction_maintainability)).

The terminology introduced in this chapter will also be useful in the following chapters, when we go
into the details of how data-intensive systems are implemented. However, abstract definitions can be
quite dry; to make the ideas more concrete, we will start this chapter with a case study of how a
social networking service might work, which will provide practical examples of performance and
scalability.

## Case Study: Social Network Home Timelines {#sec_introduction_twitter}

Imagine you are given the task of implementing a social network in the style of X (formerly
Twitter), in which users can post messages and follow other users. This will be a huge
simplification of how such a service actually works [^1] [^2] [^3],
but it will help illustrate some of the issues that arise in large-scale systems.

Let’s assume that users make 500 million posts per day, or 5,700 posts per second on average.
Occasionally, the rate can spike as high as 150,000 posts/second [^4].
Let’s also assume that the average user follows 200 people and has 200 followers (although there is
a very wide range: most people have only a handful of followers, and a few celebrities such as
Barack Obama have over 100 million followers).

### Representing Users, Posts, and Follows {#id20}

Imagine we keep all of the data in a relational database as shown in [Figure 2-1](/en/ch2#fig_twitter_relational). We
have one table for users, one table for posts, and one table for follow relationships.

{{< figure src="/fig/ddia_0201.png" id="fig_twitter_relational" caption="Figure 2-1. Simple relational schema for a social network in which users can follow each other." class="w-full my-4" >}}

Let’s say the main read operation that our social network must support is the *home timeline*, which
displays recent posts by people you are following (for simplicity we will ignore ads, suggested
posts from people you are not following, and other extensions). We could write the following SQL
query to get the home timeline for a particular user:

```sql
SELECT posts.*, users.* FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    JOIN users ON posts.sender_id = users.id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

To execute this query, the database will use the `follows` table to find everybody who
`current_user` is following, look up recent posts by those users, and sort them by timestamp to get
the most recent 1,000 posts by any of the followed users.

Posts are supposed to be timely, so let’s assume that after somebody makes a post, we want their
followers to be able to see it within 5 seconds. One way of doing that would be for the user’s
client to repeat the query above every 5 seconds while the user is online (this is known as
*polling*). If we assume that 10 million users are online and logged in at the same time, that would
mean running the query 2 million times per second. Even if you increase the polling interval, this
is a lot.

Moreover, the query above is quite expensive: if you are following 200 people, it needs to fetch a
list of recent posts by each of those 200 people, and merge those lists. 2 million timeline queries
per second then means that the database needs to look up the recent posts from some sender 400
million times per second—a huge number. And that is the average case. Some users follow tens of
thousands of accounts; for them, this query is very expensive to execute, and difficult to make
fast.

### Materializing and Updating Timelines {#sec_introduction_materializing}

How can we do better? Firstly, instead of polling, it would be better if the server actively pushed
new posts to any followers who are currently online. Secondly, we should precompute the results of
the query above so that a user’s request for their home timeline can be served from a cache.

Imagine that for each user we store a data structure containing their home timeline, i.e., the
recent posts by people they are following. Every time a user makes a post, we look up all of their
followers, and insert that post into the home timeline of each follower—like delivering a message to
a mailbox. Now when a user logs in, we can simply give them this home timeline that we precomputed.
Moreover, to receive a notification about any new posts on their timeline, the user’s client simply
needs to subscribe to the stream of posts being added to their home timeline.

The downside of this approach is that we now need to do more work every time a user makes a post,
because the home timelines are derived data that needs to be updated. The process is illustrated in
[Figure 2-2](/en/ch2#fig_twitter_timelines). When one initial request results in several downstream requests being
carried out, we use the term *fan-out* to describe the factor by which the number of requests
increases.

{{< figure src="/fig/ddia_0202.png" id="fig_twitter_timelines" caption="Figure 2-2. Fan-out: delivering new posts to every follower of the user who made the post." class="w-full my-4" >}}


At a rate of 5,700 posts per second, if the average post reaches 200 followers (i.e., a
fan-out factor of 200), we will need to do just over 1 million home timeline writes per second. This
is a lot, but it’s still a significant saving compared to the 400 million per-sender post lookups
per second that we would otherwise have to do.

If the rate of posts spikes due to some special event, we don’t have to do the timeline
deliveries immediately—we can enqueue them and accept that it will temporarily take a bit longer for
posts to show up in followers’ timelines. Even during such load spikes, timelines remain fast to
load, since we simply serve them from a cache.

This process of precomputing and updating the results of a query is called *materialization*, and
the timeline cache is an example of a *materialized view* (a concept we will discuss further in
[“Maintaining materialized views”](/en/ch12#sec_stream_mat_view)). The materialized view speeds up reads, but in return we have to do more work on
write. The cost of writes for most users is modest, but a social network also has to consider some
extreme cases:

* If a user is following a very large number of accounts, and those accounts post a lot, that user
 will have a high rate of writes to their materialized timeline. However, in this case it’s
 unlikely that the user is actually reading all of the posts in their timeline, and therefore it’s
 okay to simply drop some of their timeline writes and show the user only a sample of the posts
 from the accounts they’re following
 [^5].
* When a celebrity account with a very large number of followers makes a post, we have to do a large
 amount of work to insert that post into the home timelines of each of their millions of followers.
 In this case it’s not okay to drop some of those writes. One way of solving this problem is to
 handle celebrity posts separately from everyone else’s posts: we can save ourselves the effort of
 adding them to millions of timelines by storing the celebrity posts separately and merging them
 with the materialized timeline when it is read. Despite such optimizations, handling celebrities
 on a social network can require a lot of infrastructure
 [^6].

## Describing Performance {#sec_introduction_percentiles}

Most discussions of software performance consider two main types of metric:

Response time
: The elapsed time from the moment when a user makes a request until they receive the requested
 answer. The unit of measurement is seconds (or milliseconds, or microseconds).

Throughput
: The number of requests per second, or the data volume per second, that the system is processing.
 For a given allocation of hardware resources, there is a *maximum throughput* that can be handled.
 The unit of measurement is “somethings per second”.

In the social network case study, “posts per second” and “timeline writes per second” are throughput
metrics, whereas the “time it takes to load the home timeline” or the “time until a post is
delivered to followers” are response time metrics.

There is often a connection between throughput and response time; an example of such a relationship
for an online service is sketched in [Figure 2-3](/en/ch2#fig_throughput). The service has a low response time when
request throughput is low, but response time increases as load increases. This is because of
*queueing*: when a request arrives on a highly loaded system, it’s likely that the CPU is already in
the process of handling an earlier request, and therefore the incoming request needs to wait until
the earlier request has been completed. As throughput approaches the maximum that the hardware can
handle, queueing delays increase sharply.

{{< figure src="/fig/ddia_0203.png" id="fig_throughput" caption="Figure 2-3. As the throughput of a service approaches its capacity, the response time increases dramatically due to queueing." class="w-full my-4" >}}


--------

<a id="sidebar_metastable"></a>

> [!TIP] WHEN AN OVERLOADED SYSTEM WON'T RECOVER

If a system is close to overload, with throughput pushed close to the limit, it can sometimes enter a
vicious cycle where it becomes less efficient and hence even more overloaded. For example, if there
is a long queue of requests waiting to be handled, response times may increase so much that clients
time out and resend their request. This causes the rate of requests to increase even further, making
the problem worse—a *retry storm*. Even when the load is reduced again, such a system may remain in
an overloaded state until it is rebooted or otherwise reset. This phenomenon is called a *metastable
failure*, and it can cause serious outages in production systems [^7] [^8].

To avoid retries overloading a service, you can increase and randomize the time between successive
retries on the client side (*exponential backoff* [^9] [^10]), and temporarily stop sending requests to a service that has returned errors or timed out recently
(using a *circuit breaker* [^11] [^12] or *token bucket* algorithm [^13]).
The server can also detect when it is approaching overload and start proactively rejecting requests
(*load shedding* [^14]), and send back responses asking clients to slow down (*backpressure* [^1] [^15]).
The choice of queueing and load-balancing algorithms can also make a difference [^16].

--------

In terms of performance metrics, the response time is usually what users care about the most,
whereas the throughput determines the required computing resources (e.g., how many servers you need),
and hence the cost of serving a particular workload. If throughput is likely to increase beyond what
the current hardware can handle, the capacity needs to be expanded; a system is said to be
*scalable* if its maximum throughput can be significantly increased by adding computing resources.

In this section we will focus primarily on response times, and we will return to throughput and
scalability in [“Scalability”](/en/ch2#sec_introduction_scalability).

### Latency and Response Time {#id23}

“Latency” and “response time” are sometimes used interchangeably, but in this book we will use the
terms in a specific way (illustrated in [Figure 2-4](/en/ch2#fig_response_time)):

* The *response time* is what the client sees; it includes all delays incurred anywhere in the
 system.
* The *service time* is the duration for which the service is actively processing the user request.
* *Queueing delays* can occur at several points in the flow: for example, after a request is
 received, it might need to wait until a CPU is available before it can be processed; a response
 packet might need to be buffered before it is sent over the network if other tasks on the same
 machine are sending a lot of data via the outbound network interface.
* *Latency* is a catch-all term for time during which a request is not being actively processed,
 i.e., during which it is *latent*. In particular, *network latency* or *network delay* refers to
 the time that request and response spend traveling through the network.

{{< figure src="/fig/ddia_0204.png" id="fig_response_time" caption="Figure 2-4. Response time, service time, network latency, and queueing delay." class="w-full my-4" >}}

In [Figure 2-4](/en/ch2#fig_response_time), time flows from left to right, each communicating node is shown as a
horizontal line, and a request or response message is shown as a thick diagonal arrow from one node
to another. You will encounter this style of diagram frequently over the course of this book.

The response time can vary significantly from one request to the next, even if you keep making the
same request over and over again. Many factors can add random delays: for example, a context switch
to a background process, the loss of a network packet and TCP retransmission, a garbage collection
pause, a page fault forcing a read from disk, mechanical vibrations in the server rack [^17],
or many other causes. We will discuss this topic in more detail in [“Timeouts and Unbounded Delays”](/en/ch9#sec_distributed_queueing).

Queueing delays often account for a large part of the variability in response times. As a server
can only process a small number of things in parallel (limited, for example, by its number of CPU
cores), it only takes a small number of slow requests to hold up the processing of subsequent
requests—an effect known as *head-of-line blocking*. Even if those subsequent requests have fast
service times, the client will see a slow overall response time due to the time waiting for the
prior request to complete. The queueing delay is not part of the service time, and for this reason
it is important to measure response times on the client side.

### Average, Median, and Percentiles {#id24}

Because the response time varies from one request to the next, we need to think of it not as a
single number, but as a *distribution* of values that you can measure. In [Figure 2-5](/en/ch2#fig_lognormal), each
gray bar represents a request to a service, and its height shows how long that request took. Most
requests are reasonably fast, but there are occasional *outliers* that take much longer.
Variation in network delay is also known as *jitter*.

{{< figure src="/fig/ddia_0205.png" id="fig_lognormal" caption="Figure 2-5. Illustrating mean and percentiles: response times for a sample of 100 requests to a service." class="w-full my-4" >}}

It’s common to report the *average* response time of a service (technically, the *arithmetic mean*:
that is, sum all the response times, and divide by the number of requests). The mean response time
is useful for estimating throughput limits [^18].
However, the mean is not a very good metric if you want to know your “typical” response time,
because it doesn’t tell you how many users actually experienced that delay.

Usually it is better to use *percentiles*. If you take your list of response times and sort it from
fastest to slowest, then the *median* is the halfway point: for example, if your median response
time is 200 ms, that means half your requests return in less than 200 ms, and half your
requests take longer than that. This makes the median a good metric if you want to know how long
users typically have to wait. The median is also known as the *50th percentile*, and sometimes
abbreviated as *p50*.

In order to figure out how bad your outliers are, you can look at higher percentiles: the *95th*,
*99th*, and *99.9th* percentiles are common (abbreviated *p95*, *p99*, and *p999*). They are the
response time thresholds at which 95%, 99%, or 99.9% of requests are faster than that particular
threshold. For example, if the 95th percentile response time is 1.5 seconds, that means 95 out of
100 requests take less than 1.5 seconds, and 5 out of 100 requests take 1.5 seconds or more. This is
illustrated in [Figure 2-5](/en/ch2#fig_lognormal).

High percentiles of response times, also known as *tail latencies*, are important because they
directly affect users’ experience of the service. For example, Amazon describes response time
requirements for internal services in terms of the 99.9th percentile, even though it only affects 1
in 1,000 requests. This is because the customers with the slowest requests are often those who have
the most data on their accounts because they have made many purchases—that is, they’re the most
valuable customers [^19].
It’s important to keep those customers happy by ensuring the website is fast for them.

On the other hand, optimizing the 99.99th percentile (the slowest 1 in 10,000 requests) was deemed
too expensive and to not yield enough benefit for Amazon’s purposes. Reducing response times at very
high percentiles is difficult because they are easily affected by random events outside of your
control, and the benefits are diminishing.

--------

> [!TIP] THE USER IMPACT OF RESPONSE TIMES

It seems intuitively obvious that a fast service is better for users than a slow service [^20].
However, it is surprisingly difficult to get hold of reliable data to quantify the effect that
latency has on user behavior.

Some often-cited statistics are unreliable. In 2006 Google reported that a slowdown in search
results from 400 ms to 900 ms was associated with a 20% drop in traffic and revenue [^21].
However, another Google study from 2009 reported that a 400 ms increase in latency resulted in
only 0.6% fewer searches per day [^22],
and in the same year Bing found that a two-second increase in load time reduced ad revenue by 4.3% [^23].
Newer data from these companies appears not to be publicly available.

A more recent Akamai study [^24]
claims that a 100 ms increase in response time reduced the conversion rate of e-commerce sites
by up to 7%; however, on closer inspection, the same study reveals that very *fast* page load times
are also correlated with lower conversion rates! This seemingly paradoxical result is explained by
the fact that the pages that load fastest are often those that have no useful content (e.g., 404
error pages). However, since the study makes no effort to separate the effects of page content from
the effects of load time, its results are probably not meaningful.

A study by Yahoo [^25] compares click-through rates on fast-loading versus slow-loading search results, controlling for
quality of search results. It finds 20–30% more clicks on fast searches when the difference between
fast and slow responses is 1.25 seconds or more.

--------

### Use of Response Time Metrics {#sec_introduction_slo_sla}

High percentiles are especially important in backend services that are called multiple times as
part of serving a single end-user request. Even if you make the calls in parallel, the end-user
request still needs to wait for the slowest of the parallel calls to complete. It takes just one
slow call to make the entire end-user request slow, as illustrated in [Figure 2-6](/en/ch2#fig_tail_amplification).
Even if only a small percentage of backend calls are slow, the chance of getting a slow call
increases if an end-user request requires multiple backend calls, and so a higher proportion of
end-user requests end up being slow (an effect known as *tail latency amplification* [^26]).

{{< figure src="/fig/ddia_0206.png" id="fig_tail_amplification" caption="Figure 2-6. When several backend calls are needed to serve a request, it takes just a single slow backend request to slow down the entire end-user request." class="w-full my-4" >}}

Percentiles are often used in *service level objectives* (SLOs) and *service level agreements*
(SLAs) as ways of defining the expected performance and availability of a service [^27].
For example, an SLO may set a target for a service to have a median response time of less than
200 ms and a 99th percentile under 1 s, and a target that at least 99.9% of valid requests
result in non-error responses. An SLA is a contract that specifies what happens if the SLO is not
met (for example, customers may be entitled to a refund). That is the basic idea, at least; in
practice, defining good availability metrics for SLOs and SLAs is not straightforward [^28] [^29].

--------

<a id="sidebar_percentiles"></a>

> [!TIP] COMPUTING PERCENTILES

If you want to add response time percentiles to the monitoring dashboards for your services, you
need to efficiently calculate them on an ongoing basis. For example, you may want to keep a rolling
window of response times of requests in the last 10 minutes. Every minute, you calculate the median
and various percentiles over the values in that window and plot those metrics on a graph.

The simplest implementation is to keep a list of response times for all requests within the time
window and to sort that list every minute. If that is too inefficient for you, there are algorithms
that can calculate a good approximation of percentiles at minimal CPU and memory cost.
Open source percentile estimation libraries include HdrHistogram,
t-digest [^30] [^31],
OpenHistogram [^32], and DDSketch [^33].

Beware that averaging percentiles, e.g., to reduce the time resolution or to combine data from
several machines, is mathematically meaningless—the right way of aggregating response time data
is to add the histograms [^34].

--------


## Reliability and Fault Tolerance {#sec_introduction_reliability}

Everybody has an intuitive idea of what it means for something to be reliable or unreliable. For
software, typical expectations include:

* The application performs the function that the user expected.
* It can tolerate the user making mistakes or using the software in unexpected ways.
* Its performance is good enough for the required use case, under the expected load and data volume.
* The system prevents any unauthorized access and abuse.

If all those things together mean “working correctly,” then we can understand *reliability* as
meaning, roughly, “continuing to work correctly, even when things go wrong.” To be more precise
about things going wrong, we will distinguish between *faults* and *failures* [^35] [^36] [^37]:

Fault
: A fault is when a particular *part* of a system stops working correctly: for example, if a
 single hard drive malfunctions, or a single machine crashes, or an external service (that the
 system depends on) has an outage.

Failure
: A failure is when the system *as a whole* stops providing the required service to the user; in
 other words, when it does not meet the service level objective (SLO).

The distinction between fault and failure can be confusing because they are the same thing, just at
different levels. For example, if a hard drive stops working, we say that the hard drive has failed:
if the system consists only of that one hard drive, it has stopped providing the required service.
However, if the system you’re talking about contains many hard drives, then the failure of a single
hard drive is only a fault from the point of view of the bigger system, and the bigger system might
be able to tolerate that fault by having a copy of the data on another hard drive.

### Fault Tolerance {#id27}

We call a system *fault-tolerant* if it continues providing the required service to the user in
spite of certain faults occurring. If a system cannot tolerate a certain part becoming faulty, we
call that part a *single point of failure* (SPOF), because a fault in that part escalates to cause
the failure of the whole system.

For example, in the social network case study, a fault that might happen is that during the fan-out
process, a machine involved in updating the materialized timelines crashes or becomes unavailable.
To make this process fault-tolerant, we would need to ensure that another machine can take over this
task without missing any posts that should have been delivered, and without duplicating any posts.
(This idea is known as *exactly-once semantics*, and we will examine it in detail in [“The End-to-End Argument for Databases”](/en/ch13#sec_future_end_to_end).)

Fault tolerance is always limited to a certain number of certain types of faults. For example, a
system might be able to tolerate a maximum of two hard drives failing at the same time, or a maximum
of one out of three nodes crashing. It would not make sense to tolerate any number of faults: if all
nodes crash, there is nothing that can be done. If the entire planet Earth (and all servers on it)
were swallowed by a black hole, tolerance of that fault would require web hosting in space—good luck
getting that budget item approved.

Counter-intuitively, in such fault-tolerant systems, it can make sense to *increase* the rate of
faults by triggering them deliberately—for example, by randomly killing individual processes
without warning. This is called *fault injection*. Many critical bugs are actually due to poor error
handling [^38]; by deliberately inducing faults, you ensure
that the fault-tolerance machinery is continually exercised and tested, which can increase your
confidence that faults will be handled correctly when they occur naturally. *Chaos engineering* is
a discipline that aims to improve confidence in fault-tolerance mechanisms through experiments such
as deliberately injecting faults [^39].

Although we generally prefer tolerating faults over preventing faults, there are cases where
prevention is better than cure (e.g., because no cure exists). This is the case with security
matters, for example: if an attacker has compromised a system and gained access to sensitive data,
that event cannot be undone. However, this book mostly deals with the kinds of faults that can be
cured, as described in the following sections.

### Hardware and Software Faults {#sec_introduction_hardware_faults}

When we think of causes of system failure, hardware faults quickly come to mind:

* Approximately 2–5% of magnetic hard drives fail per year [^40] [^41]; in a storage cluster with 10,000 disks, we should therefore expect on average one disk failure per day.
 Recent data suggests that disks are getting more reliable, but failure rates remain significant [^42].
* Approximately 0.5–1% of solid state drives (SSDs) fail per year [^43]. Small numbers of bit errors are corrected automatically [^44], but uncorrectable errors occur approximately once per year per drive, even in drives that are
 fairly new (i.e., that have experienced little wear); this error rate is higher than that of
 magnetic hard drives [^45] [^46].
* Other hardware components such as power supplies, RAID controllers, and memory modules also fail, although less frequently than hard drives [^47] [^48].
* Approximately one in 1,000 machines has a CPU core that occasionally computes the wrong result,
 likely due to manufacturing defects [^49] [^50] [^51]. In some cases, an erroneous computation leads to a crash, but in other cases it leads to a program simply returning the wrong result.
* Data in RAM can also be corrupted, either due to random events such as cosmic rays, or due to
 permanent physical defects. Even when memory with error-correcting codes (ECC) is used, more than
 1% of machines encounter an uncorrectable error in a given year, which typically leads to a crash
 of the machine and the affected memory module needing to be replaced [^52].
 Moreover, certain pathological memory access patterns can flip bits with high probability [^53].
* An entire datacenter might become unavailable (for example, due to power outage or network
 misconfiguration) or even be permanently destroyed (for example, by fire, flood, or earthquake [^54]).
 A solar storm, which induces large electrical currents in long-distance wires when the sun ejects
 a large mass of charged particles, could damage power grids and undersea network cables [^55].
 Although such large-scale failures are rare, their impact can be catastrophic if a service cannot tolerate the loss of a datacenter [^56].

These events are rare enough that you often don’t need to worry about them when working on a small
system, as long as you can easily replace hardware that becomes faulty. However, in a large-scale
system, hardware faults happen often enough that they become part of the normal system operation.

#### Tolerating hardware faults through redundancy {#tolerating-hardware-faults-through-redundancy}

Our first response to unreliable hardware is usually to add redundancy to the individual hardware
components in order to reduce the failure rate of the system. Disks may be set up in a RAID
configuration (spreading data across multiple disks in the same machine so that a failed disk does
not cause data loss), servers may have dual power supplies and hot-swappable CPUs, and datacenters
may have batteries and diesel generators for backup power. Such redundancy can often keep a machine
running uninterrupted for years.

Redundancy is most effective when component faults are independent, that is, the occurrence of one
fault does not change how likely it is that another fault will occur. However, experience has shown
that there are often significant correlations between component failures [^41] [^57] [^58];
unavailability of an entire server rack or an entire datacenter still happens more often than we
would like.

Hardware redundancy increases the uptime of a single machine; however, as discussed in
[“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed), there are advantages to using a distributed system, such as being
able to tolerate a complete outage of one datacenter.
For this reason, cloud systems tend to focus less on the reliability of individual machines, and
instead aim to make services highly available by tolerating faulty nodes at the software level.
Cloud providers use *availability zones* to identify which resources are physically co-located;
resources in the same place are more likely to fail at the same time than geographically separated
resources.

The fault-tolerance techniques we discuss in this book are designed to tolerate the loss of entire
machines, racks, or availability zones. They generally work by allowing a machine in one datacenter
to take over when a machine in another datacenter fails or becomes unreachable. We will discuss such
techniques for fault tolerance in [Chapter 6](/en/ch6#ch_replication), [Chapter 10](/en/ch10#ch_consistency), and at various other
points in this book.

Systems that can tolerate the loss of entire machines also have operational advantages: a
single-server system requires planned downtime if you need to reboot the machine (to apply operating
system security patches, for example), whereas a multi-node fault-tolerant system can be patched by
restarting one node at a time, without affecting the service for users. This is called a *rolling
upgrade*, and we will discuss it further in [Chapter 5](/en/ch5#ch_encoding).

#### Software faults {#software-faults}

Although hardware failures can be weakly correlated, they are still mostly independent: for
example, if one disk fails, it’s likely that other disks in the same machine will be fine for
another while. On the other hand, software faults are often very highly correlated, because it is
common for many nodes to run the same software and thus have the same bugs [^59] [^60].
Such faults are harder to anticipate, and they tend to cause many more system failures than
uncorrelated hardware faults [^47]. For example:

* A software bug that causes every node to fail at the same time in particular circumstances. For
 example, on June 30, 2012, a leap second caused many Java applications to hang simultaneously due
 to a bug in the Linux kernel, bringing down many Internet services [^61].
 Due to a firmware bug, all SSDs of certain models suddenly fail after precisely 32,768 hours of
 operation (less than 4 years), rendering the data on them unrecoverable [^62].
* A runaway process that uses up some shared, limited resource, such as CPU time, memory, disk
 space, network bandwidth, or threads [^63]. For example, a process that consumes too much memory while processing a large request may be
 killed by the operating system. A bug in a client library could cause a much higher request volume than anticipated [^64].
* A service that the system depends on slows down, becomes unresponsive, or starts returning corrupted responses.
* An interaction between different systems results in emergent behavior that does not occur when each system is tested in isolation [^65].
* Cascading failures, where a problem in one component causes another component to become overloaded
 and slow down, which in turn brings down another component [^66] [^67].

The bugs that cause these kinds of software faults often lie dormant for a long time until they are
triggered by an unusual set of circumstances. In those circumstances, it is revealed that the
software is making some kind of assumption about its environment—and while that assumption is
usually true, it eventually stops being true for some reason [^68] [^69].

There is no quick solution to the problem of systematic faults in software. Lots of small things can
help: carefully thinking about assumptions and interactions in the system; thorough testing; process
isolation; allowing processes to crash and restart; avoiding feedback loops such as retry storms
(see [“When an overloaded system won’t recover”](/en/ch2#sidebar_metastable)); measuring, monitoring, and analyzing system behavior in production.

### Humans and Reliability {#id31}

Humans design and build software systems, and the operators who keep the systems running are also
human. Unlike machines, humans don’t just follow rules; their strength is being creative and
adaptive in getting their job done. However, this characteristic also leads to unpredictability, and
sometimes mistakes that can lead to failures, despite best intentions. For example, one study of
large internet services found that configuration changes by operators were the leading cause of
outages, whereas hardware faults (servers or network) played a role in only 10–25% of outages [^70].

It is tempting to label such problems as “human error” and to wish that they could be solved by
better controlling human behavior through tighter procedures and compliance with rules. However,
blaming people for mistakes is counterproductive. What we call “human error” is not really the cause
of an incident, but rather a symptom of a problem with the sociotechnical system in which people are
trying their best to do their jobs [^71].
Often complex systems have emergent behavior, in which unexpected interactions between components
may also lead to failures [^72].

Various technical measures can help minimize the impact of human mistakes, including thorough
testing (both hand-written tests and *property testing* on lots of random inputs) [^38], rollback mechanisms for quickly
reverting configuration changes, gradual roll-outs of new code, detailed and clear monitoring,
observability tools for diagnosing production issues (see [“Problems with Distributed Systems”](/en/ch1#sec_introduction_dist_sys_problems)),
and well-designed interfaces that encourage “the right thing” and discourage “the wrong thing”.

However, these things require an investment of time and money, and in the pragmatic reality of
everyday business, organizations often prioritize revenue-generating activities over measures that
increase their resilience against mistakes. If there is a choice between more features and more
testing, many organizations understandably choose features. Given this choice, when a preventable
mistake inevitably occurs, it does not make sense to blame the person who made the mistake—the
problem is the organization’s priorities.

Increasingly, organizations are adopting a culture of *blameless postmortems*: after an incident,
the people involved are encouraged to share full details about what happened, without fear of
punishment, since this allows others in the organization to learn how to prevent similar problems in the future [^73].
This process may uncover a need to change business priorities, a need to invest in areas that have
been neglected, a need to change the incentives for the people involved, or some other systemic
issue that needs to be brought to the management’s attention.

As a general principle, when investigating an incident, you should be suspicious of simplistic
answers. “Bob should have been more careful when deploying that change” is not productive, but
neither is “We must rewrite the backend in Haskell.” Instead, management should take the opportunity
to learn the details of how the sociotechnical system works from the point of view of the people who
work with it every day, and take steps to improve it based on this feedback [^71].

--------

<a id="sidebar_reliability_importance"></a>

> [!TIP] HOW IMPORTANT IS RELIABILITY?

Reliability is not just for nuclear power stations and air traffic control—more mundane applications
are also expected to work reliably. Bugs in business applications cause lost productivity (and legal
risks if figures are reported incorrectly), and outages of e-commerce sites can have huge costs in
terms of lost revenue and damage to reputation.

In many applications, a temporary outage of a few minutes or even a few hours is tolerable [^74],
but permanent data loss or corruption would be catastrophic. Consider a parent who stores all their
pictures and videos of their children in your photo application [^75]. How would they
feel if that database was suddenly corrupted? Would they know how to restore it from a backup?

As another example of how unreliable software can harm people, consider the Post Office Horizon
scandal. Between 1999 and 2019, hundreds of people managing Post Office branches in Britain were
convicted of theft or fraud because the accounting software showed a shortfall in their accounts.
Eventually it became clear that many of these shortfalls were due to bugs in the software, and many
convictions have since been overturned [^76].
What led to this, probably the largest miscarriage of justice in British history, is the fact that
English law assumes that computers operate correctly (and hence, evidence produced by computers is
reliable) unless there is evidence to the contrary [^77].
Software engineers may laugh at the idea that software could ever be bug-free, but this is little
solace to the people who were wrongfully imprisoned, were declared bankrupt, or even committed suicide as
a result of a wrongful conviction due to an unreliable computer system.

There are situations in which we may choose to sacrifice reliability in order to reduce development
cost (e.g., when developing a prototype product for an unproven market)—but we should be very
conscious of when we are cutting corners and keep in mind the potential consequences.

--------

## Scalability {#sec_introduction_scalability}

Even if a system is working reliably today, that doesn’t mean it will necessarily work reliably in
the future. One common reason for degradation is increased load: perhaps the system has grown from
10,000 concurrent users to 100,000 concurrent users, or from 1 million to 10 million. Perhaps it is
processing much larger volumes of data than it did before.

*Scalability* is the term we use to describe a system’s ability to cope with increased load.
Sometimes, when discussing scalability, people make comments along the lines of, “You’re not Google
or Amazon. Stop worrying about scale and just use a relational database.” Whether this maxim applies
to you depends on the type of application you are building.

If you are building a new product that currently only has a small number of users, perhaps at a
startup, the overriding engineering goal is usually to keep the system as simple and flexible as
possible, so that you can easily modify and adapt the features of your product as you learn more
about customers’ needs [^78].
In such an environment, it is counterproductive to worry about hypothetical scale that might be
needed in the future: in the best case, investments in scalability are wasted effort and premature
optimization; in the worst case, they lock you into an inflexible design and make it harder to
evolve your application.

The reason is that scalability is not a one-dimensional label: it is meaningless to say “X is
scalable” or “Y doesn’t scale.” Rather, discussing scalability means considering questions like:

* “If the system grows in a particular way, what are our options for coping with the growth?”
* “How can we add computing resources to handle the additional load?”
* “Based on current growth projections, when will we hit the limits of our current architecture?”

If you succeed in making your application popular, and therefore handling a growing amount of load,
you will learn where your performance bottlenecks lie, and therefore you will know along which
dimensions you need to scale. At that point it’s time to start worrying about techniques for
scalability.

### Describing Load {#id33}

First, we need to succinctly describe the current load on the system; only then can we discuss
growth questions (what happens if our load doubles?). Often this will be a measure of throughput:
for example, the number of requests per second to a service, how many gigabytes of new data arrive
per day, or the number of shopping cart checkouts per hour. Sometimes you care about the peak of
some variable quantity, such as the number of simultaneously online users in
[“Case Study: Social Network Home Timelines”](/en/ch2#sec_introduction_twitter).

Often there are other statistical characteristics of the load that also affect the access patterns
and hence the scalability requirements. For example, you may need to know the ratio of reads to
writes in a database, the hit rate on a cache, or the number of data items per user (for example,
the number of followers in the social network case study). Perhaps the average case is what matters
for you, or perhaps your bottleneck is dominated by a small number of extreme cases. It all depends
on the details of your particular application.

Once you have described the load on your system, you can investigate what happens when the load
increases. You can look at it in two ways:

* When you increase the load in a certain way and keep the system resources (CPUs, memory, network
 bandwidth, etc.) unchanged, how is the performance of your system affected?
* When you increase the load in a certain way, how much do you need to increase the resources if you
 want to keep performance unchanged?

Usually our goal is to keep the performance of the system within the requirements of the SLA
(see [“Use of Response Time Metrics”](/en/ch2#sec_introduction_slo_sla)) while also minimizing the cost of running the system. The greater
the required computing resources, the higher the cost. It might be that some types of hardware are
more cost-effective than others, and these factors may change over time as new types of hardware
become available.

If you can double the resources in order to handle twice the load, while keeping performance the
same, we say that you have *linear scalability*, and this is considered a good thing. Occasionally
it is possible to handle twice the load with less than double the resources, due to economies of
scale or a better distribution of peak load
[^79] [^80].
Much more likely is that the cost grows faster than linearly, and there may be many reasons for the
inefficiency. For example, if you have a lot of data, then processing a single write request may
involve more work than if you have a small amount of data, even if the size of the request is the
same.

### Shared-Memory, Shared-Disk, and Shared-Nothing Architecture {#sec_introduction_shared_nothing}

The simplest way of increasing the hardware resources of a service is to move it to a more powerful
machine. Individual CPU cores are no longer getting significantly faster, but you can buy a machine
(or rent a cloud instance) with more CPU cores, more RAM, and more disk space. This approach is
called *vertical scaling* or *scaling up*.

You can get parallelism on a single machine by using multiple processes or threads. All the threads
belonging to the same process can access the same RAM, and hence this approach is also called a
*shared-memory architecture*. The problem with a shared-memory approach is that the cost grows
faster than linearly: a high-end machine with twice the hardware resources typically costs
significantly more than twice as much. And due to bottlenecks, a machine twice the size can often
handle less than twice the load.

Another approach is the *shared-disk architecture*, which uses several machines with independent
CPUs and RAM, but which stores data on an array of disks that is shared between the machines, which
are connected via a fast network: *Network-Attached Storage* (NAS) or *Storage Area Network* (SAN).
This architecture has traditionally been used for on-premises data warehousing workloads, but
contention and the overhead of locking limit the scalability of the shared-disk approach [^81].

By contrast, the *shared-nothing architecture* [^82]
(also called *horizontal scaling* or *scaling out*) has gained a lot of popularity. In this
approach, we use a distributed system with multiple nodes, each of which has its own CPUs, RAM, and
disks. Any coordination between nodes is done at the software level, via a conventional network.

The advantages of shared-nothing are that it has the potential to scale linearly, it can use
whatever hardware offers the best price/performance ratio (especially in the cloud), it can more
easily adjust its hardware resources as load increases or decreases, and it can achieve greater
fault tolerance by distributing the system across multiple datacenters and regions. The downsides
are that it requires explicit sharding (see [Chapter 7](/en/ch7#ch_sharding)), and it incurs all the complexity of
distributed systems ([Chapter 9](/en/ch9#ch_distributed)).

Some cloud-native database systems use separate services for storage and transaction execution (see
[“Separation of storage and compute”](/en/ch1#sec_introduction_storage_compute)), with multiple compute nodes sharing access to the same
storage service. This model has some similarity to a shared-disk architecture, but it avoids the
scalability problems of older systems: instead of providing a filesystem (NAS) or block device (SAN)
abstraction, the storage service offers a specialized API that is designed for the specific needs of
the database [^83].

### Principles for Scalability {#id35}

The architecture of systems that operate at large scale is usually highly specific to the
application—there is no such thing as a generic, one-size-fits-all scalable architecture
(informally known as *magic scaling sauce*). For example, a system that is designed to handle
100,000 requests per second, each 1 kB in size, looks very different from a system that is
designed for 3 requests per minute, each 2 GB in size—even though the two systems have the same
data throughput (100 MB/sec).

Moreover, an architecture that is appropriate for one level of load is unlikely to cope with 10
times that load. If you are working on a fast-growing service, it is therefore likely that you will
need to rethink your architecture on every order-of-magnitude increase in load. As the needs of the
application are likely to evolve, it is usually not worth planning future scaling needs more than
one order of magnitude in advance.

A good general principle for scalability is to break a system down into smaller components that can
operate largely independently from each other. This is the underlying principle behind microservices
(see [“Microservices and Serverless”](/en/ch1#sec_introduction_microservices)), sharding ([Chapter 7](/en/ch7#ch_sharding)), stream processing
([Chapter 12](/en/ch12#ch_stream)), and shared-nothing architectures. However, the challenge is in knowing where to
draw the line between things that should be together, and things that should be apart. Design
guidelines for microservices can be found in other books [^84],
and we discuss sharding of shared-nothing systems in [Chapter 7](/en/ch7#ch_sharding).

Another good principle is not to make things more complicated than necessary. If a single-machine
database will do the job, it’s probably preferable to a complicated distributed setup. Auto-scaling
systems (which automatically add or remove resources in response to demand) are cool, but if your
load is fairly predictable, a manually scaled system may have fewer operational surprises (see
[“Operations: Automatic or Manual Rebalancing”](/en/ch7#sec_sharding_operations)). A system with five services is simpler than one with fifty. Good
architectures usually involve a pragmatic mixture of approaches.

## Maintainability {#sec_introduction_maintainability}

Software does not wear out or suffer material fatigue, so it does not break in the same ways as
mechanical objects do. But the requirements for an application frequently change, the environment
that the software runs in changes (such as its dependencies and the underlying platform), and it has
bugs that need fixing.

It is widely recognized that the majority of the cost of software is not in its initial development,
but in its ongoing maintenance—fixing bugs, keeping its systems operational, investigating failures,
adapting it to new platforms, modifying it for new use cases, repaying technical debt, and adding
new features [^85] [^86].

However, maintenance is also difficult. If a system has been successfully running for a long time,
it may well use outdated technologies that not many engineers understand today (such as mainframes
and COBOL code); institutional knowledge of how and why a system was designed in a certain way may
have been lost as people have left the organization; it might be necessary to fix other people’s
mistakes. Moreover, the computer system is often intertwined with the human organization that it
supports, which means that maintenance of such *legacy* systems is as much a people problem as a
technical one [^87].

Every system we create today will one day become a legacy system if it is valuable enough to survive
for a long time. In order to minimize the pain for future generations who need to maintain our
software, we should design it with maintenance concerns in mind. Although we cannot always predict
which decisions might create maintenance headaches in the future, in this book we will pay attention
to several principles that are widely applicable:

Operability
: Make it easy for the organization to keep the system running smoothly.

Simplicity
: Make it easy for new engineers to understand the system, by implementing it using well-understood,
 consistent patterns and structures, and avoiding unnecessary complexity.

Evolvability
: Make it easy for engineers to make changes to the system in the future, adapting it and extending
 it for unanticipated use cases as requirements change.


### Operability: Making Life Easy for Operations {#id37}

We previously discussed the role of operations in [“Operations in the Cloud Era”](/en/ch1#sec_introduction_operations), and we saw that
human processes are at least as important for reliable operations as software tools. In fact, it has
been suggested that “good operations can often work around the limitations of bad (or incomplete)
software, but good software cannot run reliably with bad operations” [^60].

In large-scale systems consisting of many thousands of machines, manual maintenance would be
unreasonably expensive, and automation is essential. However, automation can be a two-edged sword:
there will always be edge cases (such as rare failure scenarios) that require manual intervention
from the operations team. Since the cases that cannot be handled automatically are the most complex
issues, greater automation requires a *more* skilled operations team that can resolve those issues [^88].

Moreover, if an automated system goes wrong, it is often harder to troubleshoot than a system that
relies on an operator to perform some actions manually. For that reason, it is not the case that
more automation is always better for operability. However, some amount of automation is important,
and the sweet spot will depend on the specifics of your particular application and organization.

Good operability means making routine tasks easy, allowing the operations team to focus their efforts
on high-value activities. Data systems can do various things to make routine tasks easy, including [^89]:

* Allowing monitoring tools to check the system’s key metrics, and supporting observability tools
 (see [“Problems with Distributed Systems”](/en/ch1#sec_introduction_dist_sys_problems)) to give insights into the system’s runtime behavior.
 A variety of commercial and open source tools can help here [^90].
* Avoiding dependency on individual machines (allowing machines to be taken down for maintenance
 while the system as a whole continues running uninterrupted)
* Providing good documentation and an easy-to-understand operational model (“If I do X, Y will happen”)
* Providing good default behavior, but also giving administrators the freedom to override defaults when needed
* Self-healing where appropriate, but also giving administrators manual control over the system state when needed
* Exhibiting predictable behavior, minimizing surprises

### Simplicity: Managing Complexity {#id38}

Small software projects can have delightfully simple and expressive code, but as projects get
larger, they often become very complex and difficult to understand. This complexity slows down
everyone who needs to work on the system, further increasing the cost of maintenance. A software
project mired in complexity is sometimes described as a *big ball of mud* [^91].

When complexity makes maintenance hard, budgets and schedules are often overrun. In complex
software, there is also a greater risk of introducing bugs when making a change: when the system is
harder for developers to understand and reason about, hidden assumptions, unintended consequences,
and unexpected interactions are more easily overlooked [^69].
Conversely, reducing complexity greatly improves the maintainability of software, and thus
simplicity should be a key goal for the systems we build.

Simple systems are easier to understand, and therefore we should try to solve a given problem in the
simplest way possible. Unfortunately, this is easier said than done. Whether something is simple or
not is often a subjective matter of taste, as there is no objective standard of simplicity [^92].
For example, one system may hide a complex implementation behind a simple interface, whereas another
may have a simple implementation that exposes more internal detail to its users—which one is simpler?

One attempt at reasoning about complexity has been to break it down into two categories, *essential* and *accidental* complexity [^93].
The idea is that essential complexity is inherent in the problem domain of the application, while
accidental complexity arises only because of limitations of our tooling. Unfortunately, this
distinction is also flawed, because boundaries between the essential and the accidental shift as our tooling evolves [^94].

One of the best tools we have for managing complexity is *abstraction*. A good abstraction can hide
a great deal of implementation detail behind a clean, simple-to-understand façade. A good
abstraction can also be used for a wide range of different applications. Not only is this reuse more
efficient than reimplementing a similar thing multiple times, but it also leads to higher-quality
software, as quality improvements in the abstracted component benefit all applications that use it.

For example, high-level programming languages are abstractions that hide machine code, CPU registers,
and syscalls. SQL is an abstraction that hides complex on-disk and in-memory data structures,
concurrent requests from other clients, and inconsistencies after crashes. Of course, when
programming in a high-level language, we are still using machine code; we are just not using it
*directly*, because the programming language abstraction saves us from having to think about it.

Abstractions for application code, which aim to reduce its complexity,
can be created using methodologies such as *design patterns* [^95] and *domain-driven design* (DDD) [^96].
This book is not about such application-specific abstractions, but rather about general-purpose
abstractions on top of which you can build your applications, such as database transactions,
indexes, and event logs. If you want to use techniques such as DDD, you can implement them on top of
the foundations described in this book.

### Evolvability: Making Change Easy {#sec_introduction_evolvability}

It’s extremely unlikely that your system’s requirements will remain unchanged forever. They are much more
likely to be in constant flux: you learn new facts, previously unanticipated use cases emerge,
business priorities change, users request new features, new platforms replace old platforms, legal
or regulatory requirements change, growth of the system forces architectural changes, etc.

In terms of organizational processes, *Agile* working patterns provide a framework for adapting to
change. The Agile community has also developed technical tools and processes that are helpful when
developing software in a frequently changing environment, such as test-driven development (TDD) and
refactoring. In this book, we search for ways of increasing agility at the level of a system
consisting of several different applications or services with different characteristics.

The ease with which you can modify a data system, and adapt it to changing requirements, is closely
linked to its simplicity and its abstractions: loosely-coupled, simple systems are usually easier to
modify than tightly-coupled, complex ones. Since this is such an important idea, we will use a
different word to refer to agility on a data system level: *evolvability* [^97].

One major factor that makes change difficult in large systems is when some action is irreversible,
and therefore that action needs to be taken very carefully [^98].
For example, say you are migrating from one database to another: if you cannot switch back to the
old system in case of problems with the new one, the stakes are much higher than if you can easily go
back. Minimizing irreversibility improves flexibility.


## Summary {#summary}

In this chapter we examined several examples of nonfunctional requirements: performance,
reliability, scalability, and maintainability. Through these topics we have also encountered
principles and terminology that we will need throughout the rest of the book. We started with a case
study of how one might implement home timelines in a social network, which illustrated some of the
challenges that arise at scale.

We discussed how to measure performance (e.g., using response time percentiles), the load on a
system (e.g., using throughput metrics), and how they are used in SLAs. Scalability is a closely
related concept: that is, ensuring performance stays the same when the load grows. We saw some
general principles for scalability, such as breaking a task down into smaller parts that can operate
independently, and we will dive into deep technical detail on scalability techniques in the
following chapters.

To achieve reliability, you can use fault tolerance techniques, which allow a system to continue
providing its service even if some component (e.g., a disk, a machine, or another service) is
faulty. We saw examples of hardware faults that can occur, and distinguished them from software
faults, which can be harder to deal with because they are often strongly correlated. Another aspect
of achieving reliability is to build resilience against humans making mistakes, and we saw blameless
postmortems as a technique for learning from incidents.

Finally, we examined several facets of maintainability, including supporting the work of operations
teams, managing complexity, and making it easy to evolve an application’s functionality over time.
There are no easy answers on how to achieve these things, but one thing that can help is to build
applications using well-understood building blocks that provide useful abstractions. The rest of
this book will cover a selection of building blocks that have proved to be valuable in practice.

### References

[^1]: Mike Cvet. [How We Learned to Stop Worrying and Love Fan-In at Twitter](https://www.youtube.com/watch?v=WEgCjwyXvwc). At *QCon San Francisco*, December 2016. 
[^2]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK) 
[^3]: Twitter. [Twitter’s Recommendation Algorithm](https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm). *blog.twitter.com*, March 2023. Archived at [perma.cc/L5GT-229T](https://perma.cc/L5GT-229T) 
[^4]: Raffi Krikorian. [New Tweets per second record, and how!](https://blog.twitter.com/engineering/en_us/a/2013/new-tweets-per-second-record-and-how) *blog.twitter.com*, August 2013. Archived at [perma.cc/6JZN-XJYN](https://perma.cc/6JZN-XJYN) 
[^5]: Jaz Volpert. [When Imperfect Systems are Good, Actually: Bluesky’s Lossy Timelines](https://jazco.dev/2025/02/19/imperfection/). *jazco.dev*, February 2025. Archived at [perma.cc/2PVE-L2MX](https://perma.cc/2PVE-L2MX) 
[^6]: Samuel Axon. [3% of Twitter’s Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX) 
[^7]: Nathan Bronson, Abutalib Aghayev, Aleksey Charapko, and Timothy Zhu. [Metastable Failures in Distributed Systems](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s11-bronson.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), May 2021. [doi:10.1145/3458336.3465286](https://doi.org/10.1145/3458336.3465286) 
[^8]: Marc Brooker. [Metastability and Distributed Systems](https://brooker.co.za/blog/2021/05/24/metastable.html). *brooker.co.za*, May 2021. Archived at [perma.cc/7FGJ-7XRK](https://perma.cc/7FGJ-7XRK) 
[^9]: Marc Brooker. [Exponential Backoff And Jitter](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/). *aws.amazon.com*, March 2015. Archived at [perma.cc/R6MS-AZKH](https://perma.cc/R6MS-AZKH) 
[^10]: Marc Brooker. [What is Backoff For?](https://brooker.co.za/blog/2022/08/11/backoff.html) *brooker.co.za*, August 2022. Archived at [perma.cc/PW9N-55Q5](https://perma.cc/PW9N-55Q5) 
[^11]: Michael T. Nygard. [*Release It!*](https://learning.oreilly.com/library/view/release-it-2nd/9781680504552/), 2nd Edition. Pragmatic Bookshelf, January 2018. ISBN: 9781680502398 
[^12]: Frank Chen. [Slowing Down to Speed Up – Circuit Breakers for Slack’s CI/CD](https://slack.engineering/circuit-breakers/). *slack.engineering*, August 2022. Archived at [perma.cc/5FGS-ZPH3](https://perma.cc/5FGS-ZPH3) 
[^13]: Marc Brooker. [Fixing retries with token buckets and circuit breakers](https://brooker.co.za/blog/2022/02/28/retries.html). *brooker.co.za*, February 2022. Archived at [perma.cc/MD6N-GW26](https://perma.cc/MD6N-GW26) 
[^14]: David Yanacek. [Using load shedding to avoid overload](https://aws.amazon.com/builders-library/using-load-shedding-to-avoid-overload/). Amazon Builders’ Library, *aws.amazon.com*. Archived at [perma.cc/9SAW-68MP](https://perma.cc/9SAW-68MP) 
[^15]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY) 
[^16]: Dmitry Kopytkov and Patrick Lee. [Meet Bandaid, the Dropbox service proxy](https://dropbox.tech/infrastructure/meet-bandaid-the-dropbox-service-proxy). *dropbox.tech*, March 2018. Archived at [perma.cc/KUU6-YG4S](https://perma.cc/KUU6-YG4S) 
[^17]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies*, February 2018. 
[^18]: Marc Brooker. [Is the Mean Really Useless?](https://brooker.co.za/blog/2017/12/28/mean.html) *brooker.co.za*, December 2017. Archived at [perma.cc/U5AE-CVEM](https://perma.cc/U5AE-CVEM) 
[^19]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon’s Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1294261.1294281](https://doi.org/10.1145/1294261.1294281) 
[^20]: Kathryn Whitenton. [The Need for Speed, 23 Years Later](https://www.nngroup.com/articles/the-need-for-speed/). *nngroup.com*, May 2020. Archived at [perma.cc/C4ER-LZYA](https://perma.cc/C4ER-LZYA) 
[^21]: Greg Linden. [Marissa Mayer at Web 2.0](https://glinden.blogspot.com/2006/11/marissa-mayer-at-web-20.html). *glinden.blogspot.com*, November 2006. Archived at [perma.cc/V7EA-3VXB](https://perma.cc/V7EA-3VXB) 
[^22]: Jake Brutlag. [Speed Matters for Google Web Search](https://services.google.com/fh/files/blogs/google_delayexp.pdf). *services.google.com*, June 2009. Archived at [perma.cc/BK7R-X7M2](https://perma.cc/BK7R-X7M2) 
[^23]: Eric Schurman and Jake Brutlag. [Performance Related Changes and their User Impact](https://www.youtube.com/watch?v=bQSE51-gr2s). Talk at *Velocity 2009*. 
[^24]: Akamai Technologies, Inc. [The State of Online Retail Performance](https://web.archive.org/web/20210729180749/https%3A//www.akamai.com/us/en/multimedia/documents/report/akamai-state-of-online-retail-performance-spring-2017.pdf). *akamai.com*, April 2017. Archived at [perma.cc/UEK2-HYCS](https://perma.cc/UEK2-HYCS) 
[^25]: Xiao Bai, Ioannis Arapakis, B. Barla Cambazoglu, and Ana Freire. [Understanding and Leveraging the Impact of Response Latency on User Behaviour in Web Search](https://iarapakis.github.io/papers/TOIS17.pdf). *ACM Transactions on Information Systems*, volume 36, issue 2, article 21, April 2018. [doi:10.1145/3106372](https://doi.org/10.1145/3106372) 
[^26]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794) 
[^27]: Alex Hidalgo. [*Implementing Service Level Objectives: A Practical Guide to SLIs, SLOs, and Error Budgets*](https://www.oreilly.com/library/view/implementing-service-level/9781492076803/). O’Reilly Media, September 2020. ISBN: 1492076813 
[^28]: Jeffrey C. Mogul and John Wilkes. [Nines are Not Enough: Meaningful Metrics for Clouds](https://research.google/pubs/pub48033/). At *17th Workshop on Hot Topics in Operating Systems* (HotOS), May 2019. [doi:10.1145/3317550.3321432](https://doi.org/10.1145/3317550.3321432) 
[^29]: Tamás Hauer, Philipp Hoffmann, John Lunney, Dan Ardelean, and Amer Diwan. [Meaningful Availability](https://www.usenix.org/conference/nsdi20/presentation/hauer). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020. 
[^30]: Ted Dunning. [The t-digest: Efficient estimates of distributions](https://www.sciencedirect.com/science/article/pii/S2665963820300403). *Software Impacts*, volume 7, article 100049, February 2021. [doi:10.1016/j.simpa.2020.100049](https://doi.org/10.1016/j.simpa.2020.100049) 
[^31]: David Kohn. [How percentile approximation works (and why it’s more useful than averages)](https://www.timescale.com/blog/how-percentile-approximation-works-and-why-its-more-useful-than-averages/). *timescale.com*, September 2021. Archived at [perma.cc/3PDP-NR8B](https://perma.cc/3PDP-NR8B) 
[^32]: Heinrich Hartmann and Theo Schlossnagle. [Circllhist — A Log-Linear Histogram Data Structure for IT Infrastructure Monitoring](https://arxiv.org/pdf/2001.06561.pdf). *arxiv.org*, January 2020. 
[^33]: Charles Masson, Jee E. Rim, and Homin K. Lee. [DDSketch: A Fast and Fully-Mergeable Quantile Sketch with Relative-Error Guarantees](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf). *Proceedings of the VLDB Endowment*, volume 12, issue 12, pages 2195–2205, August 2019. [doi:10.14778/3352063.3352135](https://doi.org/10.14778/3352063.3352135) 
[^34]: Baron Schwartz. [Why Percentiles Don’t Work the Way You Think](https://orangematter.solarwinds.com/2016/11/18/why-percentiles-dont-work-the-way-you-think/). *solarwinds.com*, November 2016. Archived at [perma.cc/469T-6UGB](https://perma.cc/469T-6UGB) 
[^35]: Walter L. Heimerdinger and Charles B. Weinstock. [A Conceptual Framework for System Fault Tolerance](https://resources.sei.cmu.edu/asset_files/TechnicalReport/1992_005_001_16112.pdf). Technical Report CMU/SEI-92-TR-033, Software Engineering Institute, Carnegie Mellon University, October 1992. Archived at [perma.cc/GD2V-DMJW](https://perma.cc/GD2V-DMJW) 
[^36]: Felix C. Gärtner. [Fundamentals of fault-tolerant distributed computing in asynchronous environments](https://dl.acm.org/doi/pdf/10.1145/311531.311532). *ACM Computing Surveys*, volume 31, issue 1, pages 1–26, March 1999. [doi:10.1145/311531.311532](https://doi.org/10.1145/311531.311532) 
[^37]: Algirdas Avižienis, Jean-Claude Laprie, Brian Randell, and Carl Landwehr. [Basic Concepts and Taxonomy of Dependable and Secure Computing](https://hdl.handle.net/1903/6459). *IEEE Transactions on Dependable and Secure Computing*, volume 1, issue 1, January 2004. [doi:10.1109/TDSC.2004.2](https://doi.org/10.1109/TDSC.2004.2) 
[^38]: Ding Yuan, Yu Luo, Xin Zhuang, Guilherme Renna Rodrigues, Xu Zhao, Yongle Zhang, Pranay U. Jain, and Michael Stumm. [Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-yuan.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014. 
[^39]: Casey Rosenthal and Nora Jones. [*Chaos Engineering*](https://learning.oreilly.com/library/view/chaos-engineering/9781492043850/). O’Reilly Media, April 2020. ISBN: 9781492043867 
[^40]: Eduardo Pinheiro, Wolf-Dietrich Weber, and Luiz André Barroso. [Failure Trends in a Large Disk Drive Population](https://www.usenix.org/legacy/events/fast07/tech/full_papers/pinheiro/pinheiro_old.pdf). At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007. 
[^41]: Bianca Schroeder and Garth A. Gibson. [Disk failures in the real world: What does an MTTF of 1,000,000 hours mean to you?](https://www.usenix.org/legacy/events/fast07/tech/schroeder/schroeder.pdf) At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007. 
[^42]: Andy Klein. [Backblaze Drive Stats for Q2 2021](https://www.backblaze.com/blog/backblaze-drive-stats-for-q2-2021/). *backblaze.com*, August 2021. Archived at [perma.cc/2943-UD5E](https://perma.cc/2943-UD5E) 
[^43]: Iyswarya Narayanan, Di Wang, Myeongjae Jeon, Bikash Sharma, Laura Caulfield, Anand Sivasubramaniam, Ben Cutler, Jie Liu, Badriddine Khessib, and Kushagra Vaid. [SSD Failures in Datacenters: What? When? and Why?](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/08/a7-narayanan.pdf) At *9th ACM International on Systems and Storage Conference* (SYSTOR), June 2016. [doi:10.1145/2928275.2928278](https://doi.org/10.1145/2928275.2928278) 
[^44]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (1)](https://www.alibabacloud.com/blog/594375). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230522005034/https%3A//www.alibabacloud.com/blog/594375) 
[^45]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016. 
[^46]: Jacob Alter, Ji Xue, Alma Dimnaku, and Evgenia Smirni. [SSD failures in the field: symptoms, causes, and prediction models](https://dl.acm.org/doi/pdf/10.1145/3295500.3356172). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2019. [doi:10.1145/3295500.3356172](https://doi.org/10.1145/3295500.3356172) 
[^47]: Daniel Ford, François Labelle, Florentina I. Popovici, Murray Stokely, Van-Anh Truong, Luiz Barroso, Carrie Grimes, and Sean Quinlan. [Availability in Globally Distributed Storage Systems](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Ford.pdf). At *9th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2010. 
[^48]: Kashi Venkatesh Vishwanath and Nachiappan Nagappan. [Characterizing Cloud Computing Hardware Reliability](https://www.microsoft.com/en-us/research/wp-content/uploads/2010/06/socc088-vishwanath.pdf). At *1st ACM Symposium on Cloud Computing* (SoCC), June 2010. [doi:10.1145/1807128.1807161](https://doi.org/10.1145/1807128.1807161) 
[^49]: Peter H. Hochschild, Paul Turner, Jeffrey C. Mogul, Rama Govindaraju, Parthasarathy Ranganathan, David E. Culler, and Amin Vahdat. [Cores that don’t count](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s01-hochschild.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), June 2021. [doi:10.1145/3458336.3465297](https://doi.org/10.1145/3458336.3465297) 
[^50]: Harish Dattatraya Dixit, Sneha Pendharkar, Matt Beadon, Chris Mason, Tejasvi Chakravarthy, Bharath Muthiah, and Sriram Sankar. [Silent Data Corruptions at Scale](https://arxiv.org/abs/2102.11245). *arXiv:2102.11245*, February 2021. 
[^51]: Diogo Behrens, Marco Serafini, Sergei Arnautov, Flavio P. Junqueira, and Christof Fetzer. [Scalable Error Isolation for Distributed Systems](https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/behrens). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015. 
[^52]: Bianca Schroeder, Eduardo Pinheiro, and Wolf-Dietrich Weber. [DRAM Errors in the Wild: A Large-Scale Field Study](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35162.pdf). At *11th International Joint Conference on Measurement and Modeling of Computer Systems* (SIGMETRICS), June 2009. [doi:10.1145/1555349.1555372](https://doi.org/10.1145/1555349.1555372) 
[^53]: Yoongu Kim, Ross Daly, Jeremie Kim, Chris Fallin, Ji Hye Lee, Donghyuk Lee, Chris Wilkerson, Konrad Lai, and Onur Mutlu. [Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf). At *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.5555/2665671.2665726](https://doi.org/10.5555/2665671.2665726) 
[^54]: Tim Bray. [Worst Case](https://www.tbray.org/ongoing/When/202x/2021/10/08/The-WOrst-Case). *tbray.org*, October 2021. Archived at [perma.cc/4QQM-RTHN](https://perma.cc/4QQM-RTHN) 
[^55]: Sangeetha Abdu Jyothi. [Solar Superstorms: Planning for an Internet Apocalypse](https://ics.uci.edu/~sabdujyo/papers/sigcomm21-cme.pdf). At *ACM SIGCOMM Conference*, August 2021. [doi:10.1145/3452296.3472916](https://doi.org/10.1145/3452296.3472916) 
[^56]: Adrian Cockcroft. [Failure Modes and Continuous Resilience](https://adrianco.medium.com/failure-modes-and-continuous-resilience-6553078caad5). *adrianco.medium.com*, November 2019. Archived at [perma.cc/7SYS-BVJP](https://perma.cc/7SYS-BVJP) 
[^57]: Shujie Han, Patrick P. C. Lee, Fan Xu, Yi Liu, Cheng He, and Jiongzhou Liu. [An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers](https://www.usenix.org/conference/fast21/presentation/han). At *19th USENIX Conference on File and Storage Technologies* (FAST), February 2021. 
[^58]: Edmund B. Nightingale, John R. Douceur, and Vince Orgovan. [Cycles, Cells and Platters: An Empirical Analysis of Hardware Failures on a Million Consumer PCs](https://eurosys2011.cs.uni-salzburg.at/pdf/eurosys2011-nightingale.pdf). At *6th European Conference on Computer Systems* (EuroSys), April 2011. [doi:10.1145/1966445.1966477](https://doi.org/10.1145/1966445.1966477) 
[^59]: Haryadi S. Gunawi, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, Thanh Do, Jeffry Adityatama, Kurnia J. Eliazar, Agung Laksono, Jeffrey F. Lukman, Vincentius Martin, and Anang D. Satria. [What Bugs Live in the Cloud?](https://ucare.cs.uchicago.edu/pdf/socc14-cbs.pdf) At *5th ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670986](https://doi.org/10.1145/2670979.2670986) 
[^60]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW) 
[^61]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU) 
[^62]: Hewlett Packard Enterprise. [Support Alerts – Customer Bulletin a00092491en\_us](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/S5F6-7ZAC](https://perma.cc/S5F6-7ZAC) 
[^63]: Lorin Hochstein. [awesome limits](https://github.com/lorin/awesome-limits). *github.com*, November 2020. Archived at [perma.cc/3R5M-E5Q4](https://perma.cc/3R5M-E5Q4) 
[^64]: Caitie McCaffrey. [Clients Are Jerks: AKA How Halo 4 DoSed the Services at Launch & How We Survived](https://www.caitiem.com/2015/06/23/clients-are-jerks-aka-how-halo-4-dosed-the-services-at-launch-how-we-survived/). *caitiem.com*, June 2015. Archived at [perma.cc/MXX4-W373](https://perma.cc/MXX4-W373) 
[^65]: Lilia Tang, Chaitanya Bhandari, Yongle Zhang, Anna Karanika, Shuyang Ji, Indranil Gupta, and Tianyin Xu. [Fail through the Cracks: Cross-System Interaction Failures in Modern Cloud Systems](https://tianyin.github.io/pub/csi-failures.pdf). At *18th European Conference on Computer Systems* (EuroSys), May 2023. [doi:10.1145/3552326.3587448](https://doi.org/10.1145/3552326.3587448) 
[^66]: Mike Ulrich. [Addressing Cascading Failures](https://sre.google/sre-book/addressing-cascading-failures/). In Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy (ed). [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O’Reilly Media, 2016. ISBN: 9781491929124 
[^67]: Harri Faßbender. [Cascading failures in large-scale distributed systems](https://blog.mi.hdm-stuttgart.de/index.php/2022/03/03/cascading-failures-in-large-scale-distributed-systems/). *blog.mi.hdm-stuttgart.de*, March 2022. Archived at [perma.cc/K7VY-YJRX](https://perma.cc/K7VY-YJRX) 
[^68]: Richard I. Cook. [How Complex Systems Fail](https://www.adaptivecapacitylabs.com/HowComplexSystemsFail.pdf). Cognitive Technologies Laboratory, April 2000. Archived at [perma.cc/RDS6-2YVA](https://perma.cc/RDS6-2YVA) 
[^69]: David D. Woods. [STELLA: Report from the SNAFUcatchers Workshop on Coping With Complexity](https://snafucatchers.github.io/). *snafucatchers.github.io*, March 2017. Archived at [archive.org](https://web.archive.org/web/20230306130131/https%3A//snafucatchers.github.io/) 
[^70]: David Oppenheimer, Archana Ganapathi, and David A. Patterson. [Why Do Internet Services Fail, and What Can Be Done About It?](https://static.usenix.org/events/usits03/tech/full_papers/oppenheimer/oppenheimer.pdf) At *4th USENIX Symposium on Internet Technologies and Systems* (USITS), March 2003. 
[^71]: Sidney Dekker. [*The Field Guide to Understanding ‘Human Error’, 3rd Edition*](https://learning.oreilly.com/library/view/the-field-guide/9781317031833/). CRC Press, November 2017. ISBN: 9781472439055 
[^72]: Sidney Dekker. [*Drift into Failure: From Hunting Broken Components to Understanding Complex Systems*](https://www.taylorfrancis.com/books/mono/10.1201/9781315257396/drift-failure-sidney-dekker). CRC Press, 2011. ISBN: 9781315257396 
[^73]: John Allspaw. [Blameless PostMortems and a Just Culture](https://www.etsy.com/codeascraft/blameless-postmortems/). *etsy.com*, May 2012. Archived at [perma.cc/YMJ7-NTAP](https://perma.cc/YMJ7-NTAP) 
[^74]: Itzy Sabo. [Uptime Guarantees — A Pragmatic Perspective](https://world.hey.com/itzy/uptime-guarantees-a-pragmatic-perspective-736d7ea4). *world.hey.com*, March 2023. Archived at [perma.cc/F7TU-78JB](https://perma.cc/F7TU-78JB) 
[^75]: Michael Jurewitz. [The Human Impact of Bugs](http://jury.me/blog/2013/3/14/the-human-impact-of-bugs). *jury.me*, March 2013. Archived at [perma.cc/5KQ4-VDYL](https://perma.cc/5KQ4-VDYL) 
[^76]: Mark Halper. [How Software Bugs led to ‘One of the Greatest Miscarriages of Justice’ in British History](https://cacm.acm.org/news/how-software-bugs-led-to-one-of-the-greatest-miscarriages-of-justice-in-british-history/). *Communications of the ACM*, January 2025. [doi:10.1145/3703779](https://doi.org/10.1145/3703779) 
[^77]: Nicholas Bohm, James Christie, Peter Bernard Ladkin, Bev Littlewood, Paul Marshall, Stephen Mason, Martin Newby, Steven J. Murdoch, Harold Thimbleby, and Martyn Thomas. [The legal rule that computers are presumed to be operating correctly – unforeseen and unjust consequences](https://www.benthamsgaze.org/wp-content/uploads/2022/06/briefing-presumption-that-computers-are-reliable.pdf). Briefing note, *benthamsgaze.org*, June 2022. Archived at [perma.cc/WQ6X-TMW4](https://perma.cc/WQ6X-TMW4) 
[^78]: Dan McKinley. [Choose Boring Technology](https://mcfunley.com/choose-boring-technology). *mcfunley.com*, March 2015. Archived at [perma.cc/7QW7-J4YP](https://perma.cc/7QW7-J4YP) 
[^79]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V) 
[^80]: Marc Brooker. [Surprising Scalability of Multitenancy](https://brooker.co.za/blog/2023/03/23/economics.html). *brooker.co.za*, March 2023. Archived at [perma.cc/ZZD9-VV8T](https://perma.cc/ZZD9-VV8T) 
[^81]: Ben Stopford. [Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/). *benstopford.com*, November 2009. Archived at [perma.cc/7BXH-EDUR](https://perma.cc/7BXH-EDUR) 
[^82]: Michael Stonebraker. [The Case for Shared Nothing](https://dsf.berkeley.edu/papers/hpts85-nothing.pdf). *IEEE Database Engineering Bulletin*, volume 9, issue 1, pages 4–9, March 1986. 
[^83]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047) 
[^84]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O’Reilly Media, 2021. ISBN: 9781492034025 
[^85]: Nathan Ensmenger. [When Good Software Goes Bad: The Surprising Durability of an Ephemeral Technology](https://themaintainers.wpengine.com/wp-content/uploads/2021/04/ensmenger-maintainers-v2.pdf). At *The Maintainers Conference*, April 2016. Archived at [perma.cc/ZXT4-HGZB](https://perma.cc/ZXT4-HGZB) 
[^86]: Robert L. Glass. [*Facts and Fallacies of Software Engineering*](https://learning.oreilly.com/library/view/facts-and-fallacies/0321117425/). Addison-Wesley Professional, October 2002. ISBN: 9780321117427 
[^87]: Marianne Bellotti. [*Kill It with Fire*](https://learning.oreilly.com/library/view/kill-it-with/9781098128883/). No Starch Press, April 2021. ISBN: 9781718501188 
[^88]: Lisanne Bainbridge. [Ironies of automation](https://www.adaptivecapacitylabs.com/IroniesOfAutomation-Bainbridge83.pdf). *Automatica*, volume 19, issue 6, pages 775–779, November 1983. [doi:10.1016/0005-1098(83)90046-8](https://doi.org/10.1016/0005-1098%2883%2990046-8) 
[^89]: James Hamilton. [On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton.pdf). At *21st Large Installation System Administration Conference* (LISA), November 2007. 
[^90]: Dotan Horovits. [Open Source for Better Observability](https://horovits.medium.com/open-source-for-better-observability-8c65b5630561). *horovits.medium.com*, October 2021. Archived at [perma.cc/R2HD-U2ZT](https://perma.cc/R2HD-U2ZT) 
[^91]: Brian Foote and Joseph Yoder. [Big Ball of Mud](http://www.laputan.org/pub/foote/mud.pdf). At *4th Conference on Pattern Languages of Programs* (PLoP), September 1997. Archived at [perma.cc/4GUP-2PBV](https://perma.cc/4GUP-2PBV) 
[^92]: Marc Brooker. [What is a simple system?](https://brooker.co.za/blog/2022/05/03/simplicity.html) *brooker.co.za*, May 2022. Archived at [perma.cc/U72T-BFVE](https://perma.cc/U72T-BFVE) 
[^93]: Frederick P. Brooks. [No Silver Bullet – Essence and Accident in Software Engineering](https://worrydream.com/refs/Brooks_1986_-_No_Silver_Bullet.pdf). In [*The Mythical Man-Month*](https://www.oreilly.com/library/view/mythical-man-month-the/0201835959/), Anniversary edition, Addison-Wesley, 1995. ISBN: 9780201835953 
[^94]: Dan Luu. [Against essential and accidental complexity](https://danluu.com/essential-complexity/). *danluu.com*, December 2020. Archived at [perma.cc/H5ES-69KC](https://perma.cc/H5ES-69KC) 
[^95]: Erich Gamma, Richard Helm, Ralph Johnson, and John Vlissides. [*Design Patterns: Elements of Reusable Object-Oriented Software*](https://learning.oreilly.com/library/view/design-patterns-elements/0201633612/). Addison-Wesley Professional, October 1994. ISBN: 9780201633610 
[^96]: Eric Evans. [*Domain-Driven Design: Tackling Complexity in the Heart of Software*](https://learning.oreilly.com/library/view/domain-driven-design-tackling/0321125215/). Addison-Wesley Professional, August 2003. ISBN: 9780321125217 
[^97]: Hongyu Pei Breivold, Ivica Crnkovic, and Peter J. Eriksson. [Analyzing Software Evolvability](https://www.es.mdh.se/pdf_publications/1251.pdf). At *32nd Annual IEEE International Computer Software and Applications Conference* (COMPSAC), July 2008. [doi:10.1109/COMPSAC.2008.50](https://doi.org/10.1109/COMPSAC.2008.50) 
[^98]: Enrico Zaninotto. [From X programming to the X organisation](https://martinfowler.com/articles/zaninotto.pdf). At *XP Conference*, May 2002. Archived at [perma.cc/R9AR-QCKZ](https://perma.cc/R9AR-QCKZ)


================================================
FILE: content/en/ch3.md
================================================
---
title: "3. Data Models and Query Languages"
weight: 103
breadcrumbs: false
---

<a id="ch_datamodels"></a>

![](/map/ch02.png)

> *The limits of my language mean the limits of my world.*
>
> Ludwig Wittgenstein, *Tractatus Logico-Philosophicus* (1922)

Data models are perhaps the most important part of developing software, because they have such a
profound effect: not only on how the software is written, but also on how we *think about the problem*
that we are solving.

Most applications are built by layering one data model on top of another. For each layer, the key
question is: how is it *represented* in terms of the next-lower layer? For example:

1. As an application developer, you look at the real world (in which there are people,
 organizations, goods, actions, money flows, sensors, etc.) and model it in terms of objects or
 data structures, and APIs that manipulate those data structures. Those structures are often
 specific to your application.
2. When you want to store those data structures, you express them in terms of a general-purpose
 data model, such as JSON or XML documents, tables in a relational database, or vertices and
 edges in a graph. Those data models are the topic of this chapter.
3. The engineers who built your database software decided on a way of representing that
 document/relational/graph data in terms of bytes in memory, on disk, or on a network. The
 representation may allow the data to be queried, searched, manipulated, and processed in various
 ways. We will discuss these storage engine designs in [Chapter 4](/en/ch4#ch_storage).
4. On yet lower levels, hardware engineers have figured out how to represent bytes in terms of
 electrical currents, pulses of light, magnetic fields, and more.

In a complex application there may be more intermediary levels, such as APIs built upon APIs, but
the basic idea is still the same: each layer hides the complexity of the layers below it by
providing a clean data model. These abstractions allow different groups of people—for example,
the engineers at the database vendor and the application developers using their database—to work together effectively.

Several different data models are widely used in practice, often for different purposes. Some types
of data and some queries are easy to express in one model, and awkward in another. In this chapter
we will explore those trade-offs by comparing the relational model, the document model, graph-based
data models, event sourcing, and dataframes. We will also briefly look at query languages that allow
you to work with these models. This comparison will help you decide when to use which model.

--------

> [!TIP] TERMINOLOGY: DECLARATIVE QUERY LANGUAGES

Many of the query languages in this chapter (such as SQL, Cypher, SPARQL, or Datalog) are
*declarative*, which means that you specify the pattern of the data you want—what conditions the
results must meet, and how you want the data to be transformed (e.g., sorted, grouped, and
aggregated)—but not *how* to achieve that goal. The database system’s query optimizer can decide
which indexes and which join algorithms to use, and in which order to execute various parts of the query.

In contrast, with most programming languages you would have to write an *algorithm*—i.e., telling
the computer which operations to perform in which order. A declarative query language is attractive
because it is typically more concise and easier to write than an explicit algorithm. But more
importantly, it also hides implementation details of the query engine, which makes it possible for
the database system to introduce performance improvements without requiring any changes to queries [^1].

For example, a database might be able to execute a declarative query in parallel across multiple CPU
cores and machines, without you having to worry about how to implement that parallelism [^2].
In a hand-coded algorithm it would be a lot of work to implement such parallel execution yourself.

--------

## Relational Model versus Document Model {#sec_datamodels_history}

The best-known data model today is probably that of SQL, based on the relational model proposed by Edgar Codd in 1970 [^3]:
data is organized into *relations* (called *tables* in SQL), where each relation is an unordered collection of *tuples* (*rows* in SQL).

The relational model was originally a theoretical proposal, and many people at the time doubted whether it
could be implemented efficiently. However, by the mid-1980s, relational database management systems
(RDBMS) and SQL had become the tools of choice for most people who needed to store and query data
with some kind of regular structure. Many data management use cases are still dominated by
relational data decades later—for example, business analytics (see [“Stars and Snowflakes: Schemas for Analytics”](/en/ch3#sec_datamodels_analytics)).

Over the years, there have been many competing approaches to data storage and querying. In the 1970s
and early 1980s, the *network model* and the *hierarchical model* were the main alternatives, but
the relational model came to dominate them. Object databases came and went again in the late 1980s
and early 1990s. XML databases appeared in the early 2000s, but have only seen niche adoption. Each
competitor to the relational model generated a lot of hype in its time, but it never lasted [^4].
Instead, SQL has grown to incorporate other data types besides its relational core—for example,
adding support for XML, JSON, and graph data [^5].

In the 2010s, *NoSQL* was the latest buzzword that tried to overthrow the dominance of relational
databases. NoSQL refers not to a single technology, but a loose set of ideas around new data models,
schema flexibility, scalability, and a move towards open source licensing models. Some databases
branded themselves as *NewSQL*, as they aim to provide the scalability of NoSQL systems along with
the data model and transactional guarantees of traditional relational databases. The NoSQL and
NewSQL ideas have been very influential in the design of data systems, but as the principles have
become widely adopted, use of those terms has faded.

One lasting effect of the NoSQL movement is the popularity of the *document model*, which usually
represents data as JSON. This model was originally popularized by specialized document databases
such as MongoDB and Couchbase, although most relational databases have now also added JSON support.
Compared to relational tables, which are often seen as having a rigid and inflexible schema, JSON
documents are thought to be more flexible.

The pros and cons of document and relational data have been debated extensively; let’s examine some
of the key points of that debate.

### The Object-Relational Mismatch {#sec_datamodels_document}

Much application development today is done in object-oriented programming languages, which leads to
a common criticism of the SQL data model: if data is stored in relational tables, an awkward
translation layer is required between the objects in the application code and the database model of
tables, rows, and columns. The disconnect between the models is sometimes called an *impedance mismatch*.

--------

> [!NOTE]
> The term *impedance mismatch* is borrowed from electronics. Every electric circuit has a certain
> impedance (resistance to alternating current) on its inputs and outputs. When you connect one
> circuit’s output to another one’s input, the power transfer across the connection is maximized if
> the output and input impedances of the two circuits match. An impedance mismatch can lead to signal
> reflections and other troubles.

--------

#### Object-relational mapping (ORM) {#object-relational-mapping-orm}

Object-relational mapping (ORM) frameworks like ActiveRecord and Hibernate reduce the amount of
boilerplate code required for this translation layer, but they are often criticized [^6].
Some commonly cited problems are:

* ORMs are complex and can’t completely hide the differences between the two models, so developers
 still end up having to think about both the relational and the object representations of the data.
* ORMs are generally only used for OLTP app development (see [“Characterizing Transaction Processing and Analytics”](/en/ch1#sec_introduction_oltp)); data
 engineers making the data available for analytics purposes still need to work with the underlying
 relational representation, so the design of the relational schema still matters when using an ORM.
* Many ORMs work only with relational OLTP databases. Organizations with diverse data systems such
 as search engines, graph databases, and NoSQL systems might find ORM support lacking.
* Some ORMs generate relational schemas automatically, but these might be awkward for the users who
 are accessing the relational data directly, and they might be inefficient on the underlying
 database. Customizing the ORM’s schema and query generation can be complex and negate the benefit of using the ORM in the first place.
* ORMs make it easy to accidentally write inefficient queries, such as the *N+1 query problem* [^7].
 For example, say you want to display a list of user comments on a page, so you perform one query
 that returns *N* comments, each containing the ID of its author. To show the name of the comment
 author you need to look up the ID in the users table. In hand-written SQL you would probably
 perform this join in the query and return the author name along with each comment, but with an ORM
 you might end up making a separate query on the users table for each of the *N* comments to look
 up its author, resulting in *N*+1 database queries in total, which is slower than performing the
 join in the database. To avoid this problem, you may need to tell the ORM to fetch the author
 information at the same time as fetching the comments.

Nevertheless, ORMs also have advantages:

* For data that is well suited to a relational model, some kind of translation between the
 persistent relational and the in-memory object representation is inevitable, and ORMs reduce the
 amount of boilerplate code required for this translation. Complicated queries may still need to be
 handled outside of the ORM, but the ORM can help with the simple and repetitive cases.
* Some ORMs help with caching the results of database queries, which can help reduce the load on the database.
* ORMs can also help with managing schema migrations and other administrative activities.

#### The document data model for one-to-many relationships {#the-document-data-model-for-one-to-many-relationships}

Not all data lends itself well to a relational representation; let’s look at an example to explore a
limitation of the relational model. [Figure 3-1](/en/ch3#fig_obama_relational) illustrates how a résumé (a LinkedIn
profile) could be expressed in a relational schema. The profile as a whole can be identified by a
unique identifier, `user_id`. Fields like `first_name` and `last_name` appear exactly once per user,
so they can be modeled as columns on the `users` table.

Most people have had more than one job in their career (positions), and people may have varying
numbers of periods of education and any number of pieces of contact information. One way of
representing such *one-to-many relationships* is to put positions, education, and contact
information in separate tables, with a foreign key reference to the `users` table, as in
[Figure 3-1](/en/ch3#fig_obama_relational).

{{< figure src="/fig/ddia_0301.png" id="fig_obama_relational" caption="Figure 3-1. Representing a LinkedIn profile using a relational schema." class="w-full my-4" >}}

Another way of representing the same information, which is perhaps more natural and maps more
closely to an object structure in application code, is as a JSON document as shown in
[Example 3-1](/en/ch3#fig_obama_json).

{{< figure id="fig_obama_json" title="Example 3-1. Representing a LinkedIn profile as a JSON document" class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "headline": "Former President of the United States of America",
    "region_id": "us:91",
    "photo_url": "/p/7/000/253/05b/308dd6e.jpg",
    "positions": [
        {"job_title": "President", "organization": "United States of America"},
        {"job_title": "US Senator (D-IL)", "organization": "United States Senate"}
    ],
    "education": [
        {"school_name": "Harvard University", "start": 1988, "end": 1991},
        {"school_name": "Columbia University", "start": 1981, "end": 1983}
    ],
    "contact_info": {
        "website": "https://barackobama.com",
        "twitter": "https://twitter.com/barackobama"
    }
}
```

Some developers feel that the JSON model reduces the impedance mismatch between the application code
and the storage layer. However, as we shall see in [Chapter 5](/en/ch5#ch_encoding), there are also problems with
JSON as a data encoding format. The lack of a schema is often cited as an advantage; we will discuss
this in [“Schema flexibility in the document model”](/en/ch3#sec_datamodels_schema_flexibility).

The JSON representation has better *locality* than the multi-table schema in
[Figure 3-1](/en/ch3#fig_obama_relational) (see [“Data locality for reads and writes”](/en/ch3#sec_datamodels_document_locality)). If you want to fetch a profile
in the relational example, you need to either perform multiple queries (query each table by
`user_id`) or perform a messy multi-way join between the `users` table and its subordinate tables [^8].
In the JSON representation, all the relevant information is in one place, making the query both
faster and simpler.

The one-to-many relationships from the user profile to the user’s positions, educational history, and
contact information imply a tree structure in the data, and the JSON representation makes this tree
structure explicit (see [Figure 3-2](/en/ch3#fig_json_tree)).

{{< figure src="/fig/ddia_0302.png" id="fig_json_tree" caption="Figure 3-2. One-to-many relationships forming a tree structure." class="w-full my-4" >}}

--------

> [!NOTE]
> This type of relationship is sometimes called *one-to-few* rather than *one-to-many*, since a résumé typically has a small number of positions [^9] [^10].
> In situations where there may be a genuinely large number of related items—say, comments on a
> celebrity’s social media post, of which there could be many thousands—embedding them all in the same
> document may be too unwieldy, so the relational approach in [Figure 3-1](/en/ch3#fig_obama_relational) is preferable.

--------

### Normalization, Denormalization, and Joins {#sec_datamodels_normalization}

In [Example 3-1](/en/ch3#fig_obama_json) in the preceding section, `region_id` is given as an ID, not as the plain-text
string `"Washington, DC, United States"`. Why?

If the user interface has a free-text field for entering the region, it makes sense to store it as a
plain-text string. But there are advantages to having standardized lists of geographic regions, and
letting users choose from a drop-down list or autocompleter:

* Consistent style and spelling across profiles
* Avoiding ambiguity if there are several places with the same name (if the string were just
 “Washington”, would it refer to DC or to the state?)
* Ease of updating—the name is stored in only one place, so it is easy to update across the board if
 it ever needs to be changed (e.g., change of a city name due to political events)
* Localization support—when the site is translated into other languages, the standardized lists can
 be localized, so the region can be displayed in the viewer’s language
* Better search—e.g., a search for people on the US East Coast can match this profile, because the
 list of regions can encode the fact that Washington is located on the East Coast (which is not
 apparent from the string `"Washington, DC"`)

Whether you store an ID or a text string is a question of *normalization*. When you use an ID, your
data is more normalized: the information that is meaningful to humans (such as the text *Washington,
DC*) is stored in only one place, and everything that refers to it uses an ID (which only has
meaning within the database). When you store the text directly, you are duplicating the
human-meaningful information in every record that uses it; this representation is *denormalized*.

The advantage of using an ID is that because it has no meaning to humans, it never needs to change:
the ID can remain the same, even if the information it identifies changes. Anything that is
meaningful to humans may need to change sometime in the future—and if that information is
duplicated, all the redundant copies need to be updated. That requires more code, more write
operations, more disk space, and risks inconsistencies (where some copies of the information are
updated but others aren’t).

The downside of a normalized representation is that every time you want to display a record
containing an ID, you have to do an additional lookup to resolve the ID into something
human-readable. In a relational data model, this is done using a *join*, for example:

```sql
SELECT users.*, regions.region_name
    FROM users
    JOIN regions ON users.region_id = regions.id
    WHERE users.id = 251;
```

Document databases can store both normalized and denormalized data, but they are often associated
with denormalization—partly because the JSON data model makes it easy to store additional,
denormalized fields, and partly because the weak support for joins in many document databases makes
normalization inconvenient. Some document databases don’t support joins at all, so you have to
perform them in application code—that is, you first fetch a document containing an ID, and then
perform a second query to resolve that ID into another document. In MongoDB, it is also possible to
perform a join using the `$lookup` operator in an aggregation pipeline:

```mongodb-json
db.users.aggregate([
    { $match: { _id: 251 } },
    { $lookup: {
        from: "regions",
        localField: "region_id",
        foreignField: "_id",
        as: "region"
    } }
])
```

#### Trade-offs of normalization {#trade-offs-of-normalization}

In the résumé example, while the `region_id` field is a reference into a standardized set of
regions, the name of the `organization` (the company or government where the person worked) and
`school_name` (where they studied) are just strings. This representation is denormalized: many
people may have worked at the same company, but there is no ID linking them.

Perhaps the organization and school should be entities instead, and the profile should reference
their IDs instead of their names? The same arguments for referencing the ID of a region also apply
here. For example, say we wanted to include the logo of the school or company in addition to their
name:

* In a denormalized representation, we would include the image URL of the logo on every individual
 person’s profile; this makes the JSON document self-contained, but it creates a headache if we
 ever need to change the logo, because we now need to find all of the occurrences of the old URL
 and update them [^9].
* In a normalized representation, we would create an entity representing an organization or school,
 and store its name, logo URL, and perhaps other attributes (description, news feed, etc.) once on
 that entity. Every résumé that mentions the organization would then simply reference its ID, and
 updating the logo is easy.

As a general principle, normalized data is usually faster to write (since there is only one copy),
but slower to query (since it requires joins); denormalized data is usually faster to read (fewer
joins), but more expensive to write (more copies to update, more disk space used). You might find it
helpful to view denormalization as a form of derived data (see [“Systems of Record and Derived Data”](/en/ch1#sec_introduction_derived)), since you
need to set up a process for updating the redundant copies of the data.

Besides the cost of performing all these updates, you also need to consider the consistency of the
database if a process crashes halfway through making its updates. Databases that offer atomic
transactions (see [“Atomicity”](/en/ch8#sec_transactions_acid_atomicity)) make it easier to remain consistent, but not
all databases offer atomicity across multiple documents. It is also possible to ensure consistency
through stream processing, which we discuss in [“Keeping Systems in Sync”](/en/ch12#sec_stream_sync).

Normalization tends to be better for OLTP systems, where both reads and updates need to be fast;
analytics systems often fare better with denormalized data, since they perform updates in bulk, and
the performance of read-only queries is the dominant concern. Moreover, in systems of small to
moderate scale, a normalized data model is often best, because you don’t have to worry about keeping
multiple copies of the data consistent with each other, and the cost of performing joins is
acceptable. However, in very large-scale systems, the cost of joins can become problematic.

#### Denormalization in the social networking case study {#denormalization-in-the-social-networking-case-study}

In [“Case Study: Social Network Home Timelines”](/en/ch2#sec_introduction_twitter) we compared a normalized representation ([Figure 2-1](/en/ch2#fig_twitter_relational))
and a denormalized one (precomputed, materialized timelines): here, the join between `posts` and
`follows` was too expensive, and the materialized timeline is a cache of the result of that join.
The fan-out process that inserts a new post into followers’ timelines was our way of keeping the
denormalized representation consistent.

However, the implementation of materialized timelines at X (formerly Twitter) does not store the
actual text of each post: each entry actually only stores the post ID, the ID of the user who posted
it, and a little bit of extra information to identify reposts and replies [^11].
In other words, it is a precomputed result of (approximately) the following query:

```sql
SELECT posts.id, posts.sender_id 
    FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

This means that whenever the timeline is read, the service still needs to perform two joins: look up
the post ID to fetch the actual post content (as well as statistics such as the number of likes
and replies), and look up the sender’s profile by ID (to get their username, profile picture, and
other details). This process of looking up the human-readable information by ID is called
*hydrating* the IDs, and it is essentially a join performed in application code [^11].

The reason for storing only IDs in the precomputed timeline is that the data they refer to is
fast-changing: the number of likes and replies may change multiple times per second on a popular
post, and some users regularly change their username or profile photo. Since the timeline should
show the latest like count and profile picture when it is viewed, it would not make sense to
denormalize this information into the materialized timeline. Moreover, the storage cost would be
increased significantly by such denormalization.

This example shows that having to perform joins when reading data is not, as sometimes claimed, an
impediment to creating high-performance, scalable services. Hydrating post IDs and user IDs is
actually a fairly easy operation to scale, since it parallelizes well, and the cost doesn’t depend
on the number of accounts you are following or the number of followers you have.

If you need to decide whether to denormalize something in your application, the social network case
study shows that the choice is not immediately obvious: the most scalable approach may involve
denormalizing some things and leaving other things normalized. You will have to carefully consider
how often the information changes, and the cost of reads and writes (which might be dominated by
outliers, such as users with many follows/followers in the case of a typical social network).
Normalization and denormalization are not inherently good or bad—they are just a trade-off in terms
of performance of reads and writes, as well as the amount of effort to implement.

### Many-to-One and Many-to-Many Relationships {#sec_datamodels_many_to_many}

While `positions` and `education` in [Figure 3-1](/en/ch3#fig_obama_relational) are examples of one-to-many or
one-to-few relationships (one résumé has several positions, but each position belongs only to one
résumé), the `region_id` field is an example of a *many-to-one* relationship (many people live in
the same region, but we assume that each person lives in only one region at any one time).

If we introduce entities for organizations and schools, and reference them by ID from the résumé,
then we also have *many-to-many* relationships (one person has worked for several organizations, and
an organization has several past or present employees). In a relational model, such a relationship
is usually represented as an *associative table* or *join table*, as shown in
[Figure 3-3](/en/ch3#fig_datamodels_m2m_rel): each position associates one user ID with one organization ID.

{{< figure src="/fig/ddia_0303.png" id="fig_datamodels_m2m_rel" caption="Figure 3-3. Many-to-many relationships in the relational model." class="w-full my-4" >}}

Many-to-one and many-to-many relationships do not easily fit within one self-contained JSON
document; they lend themselves more to a normalized representation. In a document model, one
possible representation is given in [Example 3-2](/en/ch3#fig_datamodels_m2m_json) and illustrated in
[Figure 3-4](/en/ch3#fig_datamodels_many_to_many): the data within each dotted rectangle can be grouped into one
document, but the links to organizations and schools are best represented as references to other
documents.

{{< figure id="fig_datamodels_m2m_json" title="Example 3-2. A résumé that references organizations by ID." class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "positions": [
        {"start": 2009, "end": 2017, "job_title": "President", "org_id": 513},
        {"start": 2005, "end": 2008, "job_title": "US Senator (D-IL)", "org_id": 514}
    ],
    ...
}
```

{{< figure src="/fig/ddia_0304.png" id="fig_datamodels_many_to_many" caption="Figure 3-4. Many-to-many relationships in the document model: the data within each dotted box can be grouped into one document." class="w-full my-4" >}}

Many-to-many relationships often need to be queried in “both directions”: for example, finding all
of the organizations that a particular person has worked for, and finding all of the people who have
worked at a particular organization. One way of enabling such queries is to store ID references on
both sides, i.e., a résumé includes the ID of each organization where the person has worked, and the
organization document includes the IDs of the résumés that mention that organization. This
representation is denormalized, since the relationship is stored in two places, which could become
inconsistent with each other.

A normalized representation stores the relationship in only one place, and relies on *secondary
indexes* (which we discuss in [Chapter 4](/en/ch4#ch_storage)) to allow the relationship to be efficiently queried in
both directions. In the relational schema of [Figure 3-3](/en/ch3#fig_datamodels_m2m_rel), we would tell the database
to create indexes on both the `user_id` and the `org_id` columns of the `positions` table.

In the document model of [Example 3-2](/en/ch3#fig_datamodels_m2m_json), the database needs to index the `org_id` field
of objects inside the `positions` array. Many document databases and relational databases with JSON
support are able to create such indexes on values inside a document.

### Stars and Snowflakes: Schemas for Analytics {#sec_datamodels_analytics}

Data warehouses (see [“Data Warehousing”](/en/ch1#sec_introduction_dwh)) are usually relational, and there are a few
widely-used conventions for the structure of tables in a data warehouse: a *star schema*,
*snowflake schema*, *dimensional modeling* [^12],
and *one big table* (OBT). These structures are optimized for the needs of business analysts. ETL
processes translate data from operational systems into this schema.

[Figure 3-5](/en/ch3#fig_dwh_schema) shows an example of a star schema that might be found in the data warehouse of a grocery
retailer. At the center of the schema is a so-called *fact table* (in this example, it is called
`fact_sales`). Each row of the fact table represents an event that occurred at a particular time
(here, each row represents a customer’s purchase of a product). If we were analyzing website traffic
rather than retail sales, each row might represent a page view or a click by a user.

{{< figure src="/fig/ddia_0305.png" id="fig_dwh_schema" caption="Figure 3-5. Example of a star schema for use in a data warehouse." class="w-full my-4" >}}

Usually, facts are captured as individual events, because this allows maximum flexibility of
analysis later. However, this means that the fact table can become extremely large. A big enterprise
may have many petabytes of transaction history in its data warehouse, mostly represented as fact tables.

Some of the columns in the fact table are attributes, such as the price at which the product was
sold and the cost of buying it from the supplier (allowing the profit margin to be calculated).
Other columns in the fact table are foreign key references to other tables, called *dimension
tables*. As each row in the fact table represents an event, the dimensions represent the *who*,
*what*, *where*, *when*, *how*, and *why* of the event.

For example, in [Figure 3-5](/en/ch3#fig_dwh_schema), one of the dimensions is the product that was sold. Each row in
the `dim_product` table represents one type of product that is for sale, including its stock-keeping
unit (SKU), description, brand name, category, fat content, package size, etc. Each row in the
`fact_sales` table uses a foreign key to indicate which product was sold in that particular
transaction. Queries often involve multiple joins to multiple dimension tables.

Even date and time are often represented using dimension tables, because this allows additional
information about dates (such as public holidays) to be encoded, allowing queries to differentiate
between sales on holidays and non-holidays.

[Figure 3-5](/en/ch3#fig_dwh_schema) is an example of a star schema. The name comes from the fact that when the table
relationships are visualized, the fact table is in the middle, surrounded by its dimension tables;
the connections to these tables are like the rays of a star.

A variation of this template is known as the *snowflake schema*, where dimensions are further broken
down into subdimensions. For example, there could be separate tables for brands and
product categories, and each row in the `dim_product` table could reference the brand and category
as foreign keys, rather than storing them as strings in the `dim_product` table. Snowflake schemas
are more normalized than star schemas, but star schemas are often preferred because
they are simpler for analysts to work with [^12].

In a typical data warehouse, tables are often quite wide: fact tables often have over 100 columns,
sometimes several hundred. Dimension tables can also be wide, as they include all the metadata that
may be relevant for analysis—for example, the `dim_store` table may include details of which
services are offered at each store, whether it has an in-store bakery, the square footage, the date
when the store was first opened, when it was last remodeled, how far it is from the nearest highway, etc.

A star or snowflake schema consists mostly of many-to-one relationships (e.g., many sales occur for
one particular product, in one particular store), represented as the fact table having foreign keys
into dimension tables, or dimensions into subdimensions. In principle, other types of relationship
could exist, but they are often denormalized in order to simplify queries. For example, if a
customer buys several different products at once, that multi-item transaction is not represented
explicitly; instead, there is a separate row in the fact table for each product purchased, and those
facts all just happen to have the same customer ID, store ID, and timestamp.

Some data warehouse schemas take denormalization even further and leave out the dimension tables
entirely, folding the information in the dimensions into denormalized columns on the fact table
instead (essentially, precomputing the join between the fact table and the dimension tables). This
approach is known as *one big table* (OBT), and while it requires more storage space, it sometimes
enables faster queries [^13].

In the context of analytics, such denormalization is unproblematic, since the data typically
represents a log of historical data that is not going to change (except maybe for occasionally
correcting an error). The issues of data consistency and write overheads that occur with
denormalization in OLTP systems are not as pressing in analytics.

### When to Use Which Model {#sec_datamodels_document_summary}

The main arguments in favor of the document data model are schema flexibility, better performance
due to locality, and that for some applications it is closer to the object model used by the
application. The relational model counters by providing better support for joins, many-to-one, and
many-to-many relationships. Let’s examine these arguments in more detail.

If the data in your application has a document-like structure (i.e., a tree of one-to-many
relationships, where typically the entire tree is loaded at once), then it’s probably a good idea to
use a document model. The relational technique of *shredding*—splitting a document-like structure
into multiple tables (like `positions`, `education`, and `contact_info` in [Figure 3-1](/en/ch3#fig_obama_relational))—can
lead to cumbersome schemas and unnecessarily complicated application code.

The document model has limitations: for example, you cannot refer directly to a nested item within a
document, but instead you need to say something like “the second item in the list of positions for
user 251”. If you do need to reference nested items, a relational approach works better, since you
can refer to any item directly by its ID.

Some applications allow the user to choose the order of items: for example, imagine a to-do list or
issue tracker where the user can drag and drop tasks to reorder them. The document model supports
such applications well, because the items (or their IDs) can simply be stored in a JSON array to
determine their order. In relational databases there isn’t a standard way of representing such
reorderable lists, and various tricks are used: sorting by an integer column (requiring renumbering
when you insert into the middle), a linked list of IDs, or fractional indexing [^14] [^15] [^16].

#### Schema flexibility in the document model {#sec_datamodels_schema_flexibility}

Most document databases, and the JSON support in relational databases, do not enforce any schema on
the data in documents. XML support in relational databases usually comes with optional schema
validation. No schema means that arbitrary keys and values can be added to a document, and when
reading, clients have no guarantees as to what fields the documents may contain.

Document databases are sometimes called *schemaless*, but that’s misleading, as the code that reads
the data usually assumes some kind of structure—i.e., there is an implicit schema, but it is not
enforced by the database [^17].
A more accurate term is *schema-on-read* (the structure of the data is implicit, and only
interpreted when the data is read), in contrast with *schema-on-write* (the traditional approach of
relational databases, where the schema is explicit and the database ensures all data conforms to it
when the data is written) [^18].

Schema-on-read is similar to dynamic (runtime) type checking in programming languages, whereas
schema-on-write is similar to static (compile-time) type checking. Just as the advocates of static
and dynamic type checking have big debates about their relative merits [^19],
enforcement of schemas in databases is a contentious topic, and in general there’s no right or wrong
answer.

The difference between the approaches is particularly noticeable in situations where an application
wants to change the format of its data. For example, say you are currently storing each user’s full
name in one field, and you instead want to store the first name and last name separately [^20].
In a document database, you would just start writing new documents with the new fields and have
code in the application that handles the case when old documents are read. For example:

```mongodb-json
if (user && user.name && !user.first_name) {
    // Documents written before Dec 8, 2023 don't have first_name
    user.first_name = user.name.split(" ")[0];
}
```

The downside of this approach is that every part of your application that reads from the database
now needs to deal with documents in old formats that may have been written a long time in the past.
On the other hand, in a schema-on-write database, you would typically perform a *migration* along
the lines of:

```sql
ALTER TABLE users ADD COLUMN first_name text DEFAULT NULL;
UPDATE users SET first_name = split_part(name, ' ', 1); -- PostgreSQL
UPDATE users SET first_name = substring_index(name, ' ', 1); -- MySQL
```

In most relational databases, adding a column with a default value is fast and unproblematic, even
on large tables. However, running the `UPDATE` statement is likely to be slow on a large table,
since every row needs to be rewritten, and other schema operations (such as changing the data type
of a column) also typically require the entire table to be copied.

Various tools exist to allow this type of schema change to be performed in the background without downtime [^21] [^22] [^23] [^24],
but performing such migrations on large databases remains operationally challenging. Complicated
migrations can be avoided by only adding the `first_name` column with a default value of `NULL`
(which is fast), and filling it in at read time, like you would with a document database.

The schema-on-read approach is advantageous if the items in the collection don’t all have the same
structure for some reason (i.e., the data is heterogeneous)—for example, because:

* There are many different types of objects, and it is not practicable to put each type of object in its own table.
* The structure of the data is determined by external systems over which you have no control and which may change at any time.

In situations like these, a schema may hurt more than it helps, and schemaless documents can be a
much more natural data model. But in cases where all records are expected to have the same
structure, schemas are a useful mechanism for documenting and enforcing that structure. We will
discuss schemas and schema evolution in more detail in [Chapter 5](/en/ch5#ch_encoding).

#### Data locality for reads and writes {#sec_datamodels_document_locality}

A document is usually stored as a single continuous string, encoded as JSON, XML, or a binary variant
thereof (such as MongoDB’s BSON). If your application often needs to access the entire document
(for example, to render it on a web page), there is a performance advantage to this *storage
locality*. If data is split across multiple tables, like in [Figure 3-1](/en/ch3#fig_obama_relational), multiple
index lookups are required to retrieve it all, which may require more disk seeks and take more time.

The locality advantage only applies if you need large parts of the document at the same time. The
database typically needs to load the entire document, which can be wasteful if you only need to
access a small part of a large document. On updates to a document, the entire document usually needs
to be rewritten. For these reasons, it is generally recommended that you keep documents fairly small
and avoid frequent small updates to a document.

However, the idea of storing related data together for locality is not limited to the document
model. For example, Google’s Spanner database offers the same locality properties in a relational
data model, by allowing the schema to declare that a table’s rows should be interleaved (nested) within a parent table [^25].
Oracle allows the same, using a feature called *multi-table index cluster tables* [^26].
The *wide-column* data model popularized by Google’s Bigtable, and used e.g. in HBase and Accumulo,
has a concept of *column families*, which have a similar purpose of managing locality [^27].

#### Query languages for documents {#query-languages-for-documents}

Another difference between a relational and a document database is the language or API that you use
to query it. Most relational databases are queried using SQL, but document databases are more
varied. Some allow only key-value access by primary key, while others also offer secondary indexes
to query for values inside documents, and some provide rich query languages.

XML databases are often queried using XQuery and XPath, which are designed to allow complex queries,
including joins across multiple documents, and also format their results as XML [^28]. JSON Pointer [^29] and JSONPath [^30] provide an equivalent to XPath for JSON.

MongoDB’s aggregation pipeline, whose `$lookup` operator for joins we saw in
[“Normalization, Denormalization, and Joins”](/en/ch3#sec_datamodels_normalization), is an example of a query language for collections of JSON documents.

Let’s look at another example to get a feel for this language—this time an aggregation, which is
especially needed for analytics. Imagine you are a marine biologist, and you add an observation
record to your database every time you see animals in the ocean. Now you want to generate a report
saying how many sharks you have sighted per month. In PostgreSQL you might express that query like this:

```sql
SELECT date_trunc('month', observation_timestamp) AS observation_month, ❶ 
    sum(num_animals) AS total_animals
FROM observations
WHERE family = 'Sharks'
GROUP BY observation_month;
```

❶ : The `date_trunc('month', timestamp)` function determines the calendar month
 containing `timestamp`, and returns another timestamp representing the beginning of that month. In
 other words, it rounds a timestamp down to the nearest month.

This query first filters the observations to only show species in the `Sharks` family, then groups
the observations by the calendar month in which they occurred, and finally adds up the number of
animals seen in all observations in that month. The same query can be expressed using MongoDB’s
aggregation pipeline as follows:

```mongodb-json
db.observations.aggregate([
    { $match: { family: "Sharks" } },
    { $group: {
    _id: {
        year: { $year: "$observationTimestamp" },
        month: { $month: "$observationTimestamp" }
    },
    totalAnimals: { $sum: "$numAnimals" }
    } }
]);
```

The aggregation pipeline language is similar in expressiveness to a subset of SQL, but it uses a
JSON-based syntax rather than SQL’s English-sentence-style syntax; the difference is perhaps a
matter of taste.

#### Convergence of document and relational databases {#convergence-of-document-and-relational-databases}

Document databases and relational databases started out as very different approaches to data
management, but they have grown more similar over time [^31].
Relational databases added support for JSON types and query operators, and the ability to index
properties inside documents. Some document databases (such as MongoDB, Couchbase, and RethinkDB)
added support for joins, secondary indexes, and declarative query languages.

This convergence of the models is good news for application developers, because the relational model
and the document model work best when you can combine both in the same database. Many document
databases need relational-style references to other documents, and many relational databases have
sections where schema flexibility is beneficial. Relational-document hybrids are a powerful combination.

--------

> [!NOTE]
> Codd’s original description of the relational model [^3] actually allowed something similar to JSON
> within a relational schema. He called it *nonsimple domains*. The idea was that a value in a row
> doesn’t have to just be a primitive datatype like a number or a string, but it could also be a
> nested relation (table)—so you can have an arbitrarily nested tree structure as a value, much like
> the JSON or XML support that was added to SQL over 30 years later.

--------


## Graph-Like Data Models {#sec_datamodels_graph}

We saw earlier that the type of relationships is an important distinguishing feature between
different data models. If your application has mostly one-to-many relationships (tree-structured
data) and few other relationships between records, the document model is appropriate.

But what if many-to-many relationships are very common in your data? The relational model can handle
simple cases of many-to-many relationships, but as the connections within your data become more
complex, it becomes more natural to start modeling your data as a graph.

A graph consists of two kinds of objects: *vertices* (also known as *nodes* or *entities*) and
*edges* (also known as *relationships* or *arcs*). Many kinds of data can be modeled as a graph.
Typical examples include:

Social graphs
: Vertices are people, and edges indicate which people know each other.

The web graph
: Vertices are web pages, and edges indicate HTML links to other pages.

Road or rail networks
: Vertices are junctions, and edges represent the roads or railway lines between them.

Well-known algorithms can operate on these graphs: for example, map navigation apps search for
the shortest path between two points in a road network, and
PageRank can be used on the web graph to determine the
popularity of a web page and thus its ranking in search results [^32].

Graphs can be represented in several different ways. In the *adjacency list* model, each vertex
stores the IDs of its neighbor vertices that are one edge away. Alternatively, you can use an
*adjacency matrix*, a two-dimensional array where each row and each column corresponds to a vertex,
where the value is zero when there is no edge between the row vertex and the column vertex, and
where the value is one if there is an edge. The adjacency list is good for graph traversals, and the
matrix is good for machine learning (see [“Dataframes, Matrices, and Arrays”](/en/ch3#sec_datamodels_dataframes)).

In the examples just given, all the vertices in a graph represent the same kind of thing (people, web
pages, or road junctions, respectively). However, graphs are not limited to such *homogeneous* data:
an equally powerful use of graphs is to provide a consistent way of storing completely different
types of objects in a single database. For example:

* Facebook maintains a single graph with many different types of vertices and edges: vertices
 represent people, locations, events, checkins, and comments made by users; edges indicate which
 people are friends with each other, which checkin happened in which location, who commented on
 which post, who attended which event, and so on [^33].
* Knowledge graphs are used by search engines to record facts about entities that often occur in
 search queries, such as organizations, people, and places [^34].
 This information is obtained by crawling and analyzing the text on websites; some websites, such
 as Wikidata, also publish graph data in a structured form.

There are several different, but related, ways of structuring and querying data in graphs. In this
section we will discuss the *property graph* model (implemented by Neo4j, Memgraph, KùzuDB [^35], and others [^36])
and the *triple-store* model (implemented by Datomic, AllegroGraph, Blazegraph, and others). These
models are fairly similar in what they can express, and some graph databases (such as Amazon
Neptune) support both models.

We will also look at four query languages for graphs (Cypher, SPARQL, Datalog, and GraphQL), as well
as SQL support for querying graphs. Other graph query languages exist, such as Gremlin [^37],
but these will give us a representative overview.

To illustrate these different languages and models, this section uses the graph shown in
[Figure 3-6](/en/ch3#fig_datamodels_graph) as a running example. It could be taken from a social network or a
genealogical database: it shows two people, Lucy from Idaho and Alain from Saint-Lô, France. They
are married and living in London. Each person and each location is represented as a vertex, and the
relationships between them as edges. This example will help demonstrate some queries that are easy
in graph databases, but difficult in other models.

{{< figure src="/fig/ddia_0306.png" id="fig_datamodels_graph" caption="Figure 3-6. Example of graph-structured data (boxes represent vertices, arrows represent edges)." class="w-full my-4" >}}

### Property Graphs {#id56}

In the *property graph* (also known as *labeled property graph*) model, each vertex consists of:

* A unique identifier
* A label (string) to describe what type of object this vertex represents
* A set of outgoing edges
* A set of incoming edges
* A collection of properties (key-value pairs)

Each edge consists of:

* A unique identifier
* The vertex at which the edge starts (the *tail vertex*)
* The vertex at which the edge ends (the *head vertex*)
* A label to describe the kind of relationship between the two vertices
* A collection of properties (key-value pairs)

You can think of a graph store as consisting of two relational tables, one for vertices and one for
edges, as shown in [Example 3-3](/en/ch3#fig_graph_sql_schema) (this schema uses the PostgreSQL `jsonb` datatype to
store the properties of each vertex or edge). The head and tail vertex are stored for each edge; if
you want the set of incoming or outgoing edges for a vertex, you can query the `edges` table by
`head_vertex` or `tail_vertex`, respectively.

{{< figure id="fig_graph_sql_schema" title="Example 3-3. Representing a property graph using a relational schema" class="w-full my-4" >}}

```sql
CREATE TABLE vertices (
    vertex_id integer PRIMARY KEY,
    label text,
    properties jsonb
);

CREATE TABLE edges (
    edge_id integer PRIMARY KEY,
    tail_vertex integer REFERENCES vertices (vertex_id),
    head_vertex integer REFERENCES vertices (vertex_id),
    label text,
    properties jsonb
);

CREATE INDEX edges_tails ON edges (tail_vertex);
CREATE INDEX edges_heads ON edges (head_vertex);
```

Some important aspects of this model are:

1. Any vertex can have an edge connecting it with any other vertex. There is no schema that
 restricts which kinds of things can or cannot be associated.
2. Given any vertex, you can efficiently find both its incoming and its outgoing edges, and thus
 *traverse* the graph—i.e., follow a path through a chain of vertices—both forward and backward.
 (That’s why [Example 3-3](/en/ch3#fig_graph_sql_schema) has indexes on both the `tail_vertex` and `head_vertex`
 columns.)
3. By using different labels for different kinds of vertices and relationships, you can store
 several different kinds of information in a single graph, while still maintaining a clean data
 model.

The edges table is like the many-to-many associative table/join table we saw in
[“Many-to-One and Many-to-Many Relationships”](/en/ch3#sec_datamodels_many_to_many), generalized to allow many different types of relationship to be
stored in the same table. There may also be indexes on the labels and the properties, allowing
vertices or edges with certain properties to be found efficiently.

--------

> [!NOTE]
> A limitation of graph models is that an edge can only associate two vertices with each other,
> whereas a relational join table can represent three-way or even higher-degree relationships by
> having multiple foreign key references on a single row. Such relationships can be represented in a
> graph by creating an additional vertex corresponding to each row of the join table, and edges
> to/from that vertex, or by using a *hypergraph*.

--------

Those features give graphs a great deal of flexibility for data modeling, as illustrated in
[Figure 3-6](/en/ch3#fig_datamodels_graph). The figure shows a few things that would be difficult to express in a
traditional relational schema, such as different kinds of regional structures in different countries
(France has *départements* and *régions*, whereas the US has *counties* and *states*), quirks of
history such as a country within a country (ignoring for now the intricacies of sovereign states and
nations), and varying granularity of data (Lucy’s current residence is specified as a city, whereas
her place of birth is specified only at the level of a state).

You could imagine extending the graph to also include many other facts about Lucy and Alain, or
other people. For instance, you could use it to indicate any food allergies they have (by
introducing a vertex for each allergen, and an edge between a person and an allergen to indicate an
allergy), and link the allergens with a set of vertices that show which foods contain which
substances. Then you could write a query to find out what is safe for each person to eat.
Graphs are good for evolvability: as you add features to your application, a graph can easily be
extended to accommodate changes in your application’s data structures.

### The Cypher Query Language {#id57}

*Cypher* is a query language for property graphs, originally created for the Neo4j graph database,
and later developed into an open standard as *openCypher* [^38]. Besides Neo4j, Cypher is supported by Memgraph, KùzuDB [^35],
Amazon Neptune, Apache AGE (with storage in PostgreSQL), and others. It is named after a character
in the movie *The Matrix* and is not related to ciphers in cryptography [^39].

[Example 3-4](/en/ch3#fig_cypher_create) shows the Cypher query to insert the lefthand portion of
[Figure 3-6](/en/ch3#fig_datamodels_graph) into a graph database. The rest of the graph can be added similarly. Each
vertex is given a symbolic name like `usa` or `idaho`. That name is not stored in the database, but
only used internally within the query to create edges between the vertices, using an arrow notation:
`(idaho) -[:WITHIN]-> (usa)` creates an edge labeled `WITHIN`, with `idaho` as the tail node and
`usa` as the head node.

{{< figure id="fig_cypher_create" title="Example 3-4. A subset of the data in [Figure 3-6](/en/ch3#fig_datamodels_graph), represented as a Cypher query" class="w-full my-4" >}}

```
CREATE
    (namerica :Location {name:'North America', type:'continent'}),
    (usa :Location {name:'United States', type:'country' }),
    (idaho :Location {name:'Idaho', type:'state' }),
    (lucy :Person {name:'Lucy' }),
    (idaho) -[:WITHIN ]-> (usa) -[:WITHIN]-> (namerica),
    (lucy) -[:BORN_IN]-> (idaho)
```

When all the vertices and edges of [Figure 3-6](/en/ch3#fig_datamodels_graph) are added to the database, we can start
asking interesting questions: for example, *find the names of all the people who emigrated from the
United States to Europe*. That is, find all the vertices that have a `BORN_IN` edge to a location
within the US, and also a `LIVES_IN` edge to a location within Europe, and return the `name`
property of each of those vertices.

[Example 3-5](/en/ch3#fig_cypher_query) shows how to express that query in Cypher. The same arrow notation is used in a
`MATCH` clause to find patterns in the graph: `(person) -[:BORN_IN]-> ()` matches any two vertices
that are related by an edge labeled `BORN_IN`. The tail vertex of that edge is bound to the
variable `person`, and the head vertex is left unnamed.

{{< figure id="fig_cypher_query" title="Example 3-5. Cypher query to find people who emigrated from the US to Europe" class="w-full my-4" >}}

```
MATCH
    (person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (:Location {name:'United States'}),
    (person) -[:LIVES_IN]-> () -[:WITHIN*0..]-> (:Location {name:'Europe'})
RETURN person.name
```

The query can be read as follows:

> Find any vertex (call it `person`) that meets *both* of the following conditions:
>
> 1. `person` has an outgoing `BORN_IN` edge to some vertex. From that vertex, you can follow a chain
> of outgoing `WITHIN` edges until eventually you reach a vertex of type `Location`, whose `name`
> property is equal to `"United States"`.
> 2. That same `person` vertex also has an outgoing `LIVES_IN` edge. Following that edge, and then a
> chain of outgoing `WITHIN` edges, you eventually reach a vertex of type `Location`, whose `name`
> property is equal to `"Europe"`.
>
> For each such `person` vertex, return the `name` property.

There are several possible ways of executing the query. The description given here suggests that you
start by scanning all the people in the database, examine each person’s birthplace and residence,
and return only those people who meet the criteria.

But equivalently, you could start with the two `Location` vertices and work backward. If there is an
index on the `name` property, you can efficiently find the two vertices representing the US and
Europe. Then you can proceed to find all locations (states, regions, cities, etc.) in the US and
Europe respectively by following all incoming `WITHIN` edges. Finally, you can look for people who
can be found through an incoming `BORN_IN` or `LIVES_IN` edge at one of the location vertices.

### Graph Queries in SQL {#id58}

[Example 3-3](/en/ch3#fig_graph_sql_schema) suggested that graph data can be represented in a relational database. But
if we put graph data in a relational structure, can we also query it using SQL?

The answer is yes, but with some difficulty. Every edge that you traverse in a graph query is
effectively a join with the `edges` table. In a relational database, you usually know in advance
which joins you need in your query. On the other hand, in a graph query, you may need to traverse a
variable number of edges before you find the vertex you’re looking for—that is, the number of joins
is not fixed in advance.

In our example, that happens in the `() -[:WITHIN*0..]-> ()` pattern in the Cypher query. A person’s
`LIVES_IN` edge may point at any kind of location: a street, a city, a district, a region, a state,
etc. A city may be `WITHIN` a region, a region `WITHIN` a state, a state `WITHIN` a country, etc.
The `LIVES_IN` edge may point directly at the location vertex you’re looking for, or it may be
several levels away in the location hierarchy.

In Cypher, `:WITHIN*0..` expresses that fact very concisely: it means “follow a `WITHIN` edge, zero
or more times.” It is like the `*` operator in a regular expression.

Since SQL:1999, this idea of variable-length traversal paths in a query can be expressed using
something called *recursive common table expressions* (the `WITH RECURSIVE` syntax).
[Example 3-6](/en/ch3#fig_graph_sql_query) shows the same query—finding the names of people who emigrated from the US
to Europe—expressed in SQL using this technique. However, the syntax is very clumsy in comparison to
Cypher.

{{< figure id="fig_graph_sql_query" title="Example 3-6. The same query as [Example 3-5](/en/ch3#fig_cypher_query), written in SQL using recursive common table expressions" class="w-full my-4" >}}

```sql
WITH RECURSIVE

    -- in_usa is the set of vertex IDs of all locations within the United States
    in_usa(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'United States' ❶ 
      UNION
        SELECT edges.tail_vertex FROM edges ❷
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'within'
    ),
    
    -- in_europe is the set of vertex IDs of all locations within Europe
    in_europe(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'Europe' ❸
      UNION
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'within'
    ),
    
    -- born_in_usa is the set of vertex IDs of all people born in the US
    born_in_usa(vertex_id) AS ( ❹
        SELECT edges.tail_vertex FROM edges
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'born_in'
    ),
    
    -- lives_in_europe is the set of vertex IDs of all people living in Europe
    lives_in_europe(vertex_id) AS ( ❺
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'lives_in'
    )
    
    SELECT vertices.properties->>'name'
    FROM vertices
    -- join to find those people who were both born in the US *and* live in Europe
    JOIN born_in_usa ON vertices.vertex_id = born_in_usa.vertex_id ❻
    JOIN lives_in_europe ON vertices.vertex_id = lives_in_europe.vertex_id;
```

❶: First find the vertex whose `name` property has the value `"United States"`, and make it the first element of the set
 of vertices `in_usa`.

❷: Follow all incoming `within` edges from vertices in the set `in_usa`, and add them to the same
 set, until all incoming `within` edges have been visited.

❸: Do the same starting with the vertex whose `name` property has the value `"Europe"`, and build up
 the set of vertices `in_europe`.

❹: For each of the vertices in the set `in_usa`, follow incoming `born_in` edges to find people
 who were born in some place within the United States.

❺: Similarly, for each of the vertices in the set `in_europe`, follow incoming `lives_in` edges to find people who live in Europe.

❻: Finally, intersect the set of people born in the USA with the set of people living in Europe, by
 joining them.

The fact that a 4-line Cypher query requires 31 lines in SQL shows how much of a difference the
right choice of data model and query language can make. And this is just the beginning; there are
more details to consider, e.g., around handling cycles, and choosing between breadth-first or
depth-first traversal [^40].

Oracle has a different SQL extension for recursive queries, which it calls *hierarchical* [^41].

However, the situation may be improving: at the time of writing, there are plans to add a graph
query language called GQL to the SQL standard [^42] [^43], which will provide a syntax inspired by Cypher, GSQL [^44], and PGQL [^45].

### Triple-Stores and SPARQL {#id59}

The triple-store model is mostly equivalent to the property graph model, using different words to
describe the same ideas. It is nevertheless worth discussing, because there are various tools and
languages for triple-stores that can be valuable additions to your toolbox for building
applications.

In a triple-store, all information is stored in the form of very simple three-part statements:
(*subject*, *predicate*, *object*). For example, in the triple (*Jim*, *likes*, *bananas*), *Jim* is
the subject, *likes* is the predicate (verb), and *bananas* is the object.

The subject of a triple is equivalent to a vertex in a graph. The object is one of two things:

1. A value of a primitive datatype, such as a string or a number. In that case, the predicate and
 object of the triple are equivalent to the key and value of a property on the subject vertex.
 Using the example from [Figure 3-6](/en/ch3#fig_datamodels_graph), (*lucy*, *birthYear*, *1989*) is like a vertex
 `lucy` with properties `{"birthYear": 1989}`.
2. Another vertex in the graph. In that case, the predicate is an edge in the
 graph, the subject is the tail vertex, and the object is the head vertex. For example, in
 (*lucy*, *marriedTo*, *alain*) the subject and object *lucy* and *alain* are both vertices, and
 the predicate *marriedTo* is the label of the edge that connects them.

> [!NOTE]
> To be precise, databases that offer a triple-like data model often need to store some additional
> metadata on each tuple. For example, AWS Neptune uses quads (4-tuples) by adding a graph ID to each
> triple [^46];
> Datomic uses 5-tuples, extending each triple with a transaction ID and a boolean to indicate
> deletion [^47].
> Since these databases retain the basic *subject-predicate-object* structure explained above, this
> book nevertheless calls them triple-stores.

[Example 3-7](/en/ch3#fig_graph_n3_triples) shows the same data as in [Example 3-4](/en/ch3#fig_cypher_create), written as
triples in a format called *Turtle*, a subset of *Notation3* (*N3*) [^48].

{{< figure id="fig_graph_n3_triples" title="Example 3-7. A subset of the data in [Figure 3-6](/en/ch3#fig_datamodels_graph), represented as Turtle triples" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person.
_:lucy :name "Lucy".
_:lucy :bornIn _:idaho.
_:idaho a :Location.
_:idaho :name "Idaho".
_:idaho :type "state".
_:idaho :within _:usa.
_:usa a :Location.
_:usa :name "United States".
_:usa :type "country".
_:usa :within _:namerica.
_:namerica a :Location.
_:namerica :name "North America".
_:namerica :type "continent".
```

In this example, vertices of the graph are written as `_:someName`. The name doesn’t mean anything
outside of this file; it exists only because we otherwise wouldn’t know which triples refer to the
same vertex. When the predicate represents an edge, the object is a vertex, as in `_:idaho :within
_:usa`. When the predicate is a property, the object is a string literal, as in `_:usa :name "United States"`.

It’s quite repetitive to repeat the same subject over and over again, but fortunately you can use
semicolons to say multiple things about the same subject. This makes the Turtle format quite
readable: see [Example 3-8](/en/ch3#fig_graph_n3_shorthand).

{{< figure id="fig_graph_n3_shorthand" title="Example 3-8. A more concise way of writing the data in [Example 3-7](/en/ch3#fig_graph_n3_triples)" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person; :name "Lucy"; :bornIn _:idaho.
_:idaho a :Location; :name "Idaho"; :type "state"; :within _:usa.
_:usa a :Location; :name "United States"; :type "country"; :within _:namerica.
_:namerica a :Location; :name "North America"; :type "continent".
```

--------

> [!TIP] THE SEMANTIC WEB

Some of the research and development effort on triple stores was motivated by the *Semantic Web*, an
early-2000s effort to facilitate internet-wide data exchange by publishing data not only as
human-readable web pages, but also in a standardized, machine-readable format. Although the Semantic
Web as originally envisioned did not succeed [^49] [^50],
the legacy of the Semantic Web project lives on in a couple of specific technologies: *linked data*
standards such as JSON-LD [^51], *ontologies* used in biomedical science [^52], Facebook’s Open Graph protocol [^53]
(which is used for link unfurling [^54]), knowledge graphs such as Wikidata, and standardized vocabularies for structured data maintained by [`schema.org`](https://schema.org/).

Triple-stores are another Semantic Web technology that has found use outside of its original use
case: even if you have no interest in the Semantic Web, triples can be a good internal data model for applications.

--------

#### The RDF data model {#the-rdf-data-model}

The Turtle language we used in [Example 3-8](/en/ch3#fig_graph_n3_shorthand) is actually a way of encoding data in the
*Resource Description Framework* (RDF) [^55],
a data model that was designed for the Semantic Web. RDF data can also be encoded in other ways, for
example (more verbosely) in XML, as shown in [Example 3-9](/en/ch3#fig_graph_rdf_xml). Tools like Apache Jena can
automatically convert between different RDF encodings.

{{< figure id="fig_graph_rdf_xml" title="Example 3-9. The data of [Example 3-8](/en/ch3#fig_graph_n3_shorthand), expressed using RDF/XML syntax" class="w-full my-4" >}}

```xml
<rdf:RDF xmlns="urn:example:"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">

    <Location rdf:nodeID="idaho">
        <name>Idaho</name>
        <type>state</type>
        <within>
            <Location rdf:nodeID="usa">
                <name>United States</name>
                <type>country</type>
                <within>
                    <Location rdf:nodeID="namerica">
                        <name>North America</name>
                        <type>continent</type>
                    </Location>
                </within>
            </Location>
        </within>
    </Location>

    <Person rdf:nodeID="lucy">
        <name>Lucy</name>
        <bornIn rdf:nodeID="idaho"/>
    </Person>
</rdf:RDF>
```

RDF has a few quirks due to the fact that it is designed for internet-wide data exchange. The
subject, predicate, and object of a triple are often URIs. For example, a predicate might be a URI
such as `<http://my-company.com/namespace#within>` or `<http://my-company.com/namespace#lives_in>`,
rather than just `WITHIN` or `LIVES_IN`. The reasoning behind this design is that you should be able
to combine your data with someone else’s data, and if they attach a different meaning to the word
`within` or `lives_in`, you won’t get a conflict because their predicates are actually
`<http://other.org/foo#within>` and `<http://other.org/foo#lives_in>`.

The URL `<http://my-company.com/namespace>` doesn’t necessarily need to resolve to anything—from
RDF’s point of view, it is simply a namespace. To avoid potential confusion with `http://` URLs, the
examples in this section use non-resolvable URIs such as `urn:example:within`. Fortunately, you can
just specify this prefix once at the top of the file, and then forget about it.

#### The SPARQL query language {#the-sparql-query-language}

*SPARQL* is a query language for triple-stores using the RDF data model [^56].
(It is an acronym for *SPARQL Protocol and RDF Query Language*, pronounced “sparkle.”)
It predates Cypher, and since Cypher’s pattern matching is borrowed from SPARQL, they look quite
similar.

The same query as before—finding people who have moved from the US to Europe—is similarly concise in
SPARQL as it is in Cypher (see [Example 3-10](/en/ch3#fig_sparql_query)).

{{< figure id="fig_sparql_query" title="Example 3-10. The same query as [Example 3-5](/en/ch3#fig_cypher_query), expressed in SPARQL" class="w-full my-4" >}}

```
PREFIX : <urn:example:>

SELECT ?personName WHERE {
 ?person :name ?personName.
 ?person :bornIn / :within* / :name "United States".
 ?person :livesIn / :within* / :name "Europe".
}
```

The structure is very similar. The following two expressions are equivalent (variables start with a
question mark in SPARQL):

```
(person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (location) # Cypher

?person :bornIn / :within* ?location. # SPARQL
```

Because RDF doesn’t distinguish between properties and edges but just uses predicates for both, you
can use the same syntax for matching properties. In the following expression, the variable `usa` is
bound to any vertex that has a `name` property whose value is the string `"United States"`:

```
(usa {name:'United States'}) # Cypher

?usa :name "United States". # SPARQL
```

SPARQL is supported by Amazon Neptune, AllegroGraph, Blazegraph, OpenLink Virtuoso, Apache Jena, and
various other triple stores [^36].

### Datalog: Recursive Relational Queries {#id62}

Datalog is a much older language than SPARQL or Cypher: it arose from academic research in the 1980s [^57] [^58] [^59].
It is less well known among software engineers and not widely supported in mainstream databases, but
it ought to be better-known since it is a very expressive language that is particularly powerful for
complex queries. Several niche databases, including Datomic, LogicBlox, CozoDB, and LinkedIn’s
LIquid [^60] use Datalog as their query language.

Datalog is actually based on a relational data model, not a graph, but it appears in the graph
databases section of this book because recursive queries on graphs are a particular strength of
Datalog.

The contents of a Datalog database consist of *facts*, and each fact corresponds to a row in a
relational table. For example, say we have a table *location* containing locations, and it has three
columns: *ID*, *name*, and *type*. The fact that the US is a country could then be written as
`location(2, "United States", "country")`, where `2` is the ID of the US. In general, the statement
`table(val1, val2, …​)` means that `table` contains a row where the first column contains `val1`,
the second column contains `val2`, and so on.

[Example 3-11](/en/ch3#fig_datalog_triples) shows how to write the data from the left-hand side of
[Figure 3-6](/en/ch3#fig_datamodels_graph) in Datalog. The edges of the graph (`within`, `born_in`, and `lives_in`)
are represented as two-column join tables. For example, Lucy has the ID 100 and Idaho has the ID 3,
so the relationship “Lucy was born in Idaho” is represented as `born_in(100, 3)`.

{{< figure id="fig_datalog_triples" title="Example 3-11. A subset of the data in [Figure 3-6](/en/ch3#fig_datamodels_graph), represented as Datalog facts" class="w-full my-4" >}}

```
location(1, "North America", "continent").
location(2, "United States", "country").
location(3, "Idaho", "state").

within(2, 1). /* US is in North America */
within(3, 2). /* Idaho is in the US */

person(100, "Lucy").
born_in(100, 3). /* Lucy was born in Idaho */
```

Now that we have defined the data, we can write the same query as before, as shown in
[Example 3-12](/en/ch3#fig_datalog_query). It looks a bit different from the equivalent in Cypher or SPARQL, but don’t
let that put you off. Datalog is a subset of Prolog, a programming language that you might have seen
before if you’ve studied computer science.

{{< figure id="fig_datalog_query" title="Example 3-12. The same query as [Example 3-5](/en/ch3#fig_cypher_query), expressed in Datalog" class="w-full my-4" >}}

```sql
within_recursive(LocID, PlaceName) :- location(LocID, PlaceName, _). /* Rule 1 */

within_recursive(LocID, PlaceName) :- within(LocID, ViaID), /* Rule 2 */
 within_recursive(ViaID, PlaceName).

migrated(PName, BornIn, LivingIn) :- person(PersonID, PName), /* Rule 3 */
 born_in(PersonID, BornID),
 within_recursive(BornID, BornIn),
 lives_in(PersonID, LivingID),
 within_recursive(LivingID, LivingIn).

us_to_europe(Person) :- migrated(Person, "United States", "Europe"). /* Rule 4 */
/* us_to_europe contains the row "Lucy". */
```

Cypher and SPARQL jump in right away with `SELECT`, but Datalog takes a small step at a time. We
define *rules* that derive new virtual tables from the underlying facts. These derived tables are
like (virtual) SQL views: they are not stored in the database, but you can query them in the same
way as a table containing stored facts.

In [Example 3-12](/en/ch3#fig_datalog_query) we define three derived tables: `within_recursive`, `migrated`, and
`us_to_europe`. The name and columns of the virtual tables are defined by what appears before the
`:-` symbol of each rule. For example, `migrated(PName, BornIn, LivingIn)` is a virtual table with
three columns: the name of a person, the name of the place where they were born, and the name of the
place where they are living.

The content of a virtual table is defined by the part of the rule after the `:-` symbol, where we
try to find rows that match a certain pattern in the tables. For example, `person(PersonID, PName)`
matches the row `person(100, "Lucy")`, with the variable `PersonID` bound to the value `100` and the
variable `PName` bound to the value `"Lucy"`. A rule applies if the system can find a match for
*all* patterns on the right-hand side of the `:-` operator. When the rule applies, it’s as though the
left-hand side of the `:-` was added to the database (with variables replaced by the values they matched).

One possible way of applying the rules is thus (and as illustrated in [Figure 3-7](/en/ch3#fig_datalog_naive)):

1. `location(1, "North America", "continent")` exists in the database, so rule 1 applies. It generates `within_recursive(1, "North America")`.
2. `within(2, 1)` exists in the database and the previous step generated `within_recursive(1, "North America")`, so rule 2 applies. It generates `within_recursive(2, "North America")`.
3. `within(3, 2)` exists in the database and the previous step generated `within_recursive(2, "North America")`, so rule 2 applies. It generates `within_recursive(3, "North America")`.

By repeated application of rules 1 and 2, the `within_recursive` virtual table can tell us all the
locations in North America (or any other location) contained in our database.

{{< figure link="#fig_datalog_query" src="/fig/ddia_0307.png" id="fig_datalog_naive" title="Figure 3-7. Determining that Idaho is in North America, using the Datalog rules from Example 3-12." class="w-full my-4" >}}

> Figure 3-7. Determining that Idaho is in North America, using the Datalog rules from [Example 3-12](/en/ch3#fig_datalog_query).

Now rule 3 can find people who were born in some location `BornIn` and live in some location
`LivingIn`. Rule 4 invokes rule 3 with `BornIn = 'United States'` and
`LivingIn = 'Europe'`, and returns only the names of the people who match the
search. By querying the contents of the virtual `us_to_europe` table, the Datalog system finally
gets the same answer as in the earlier Cypher and SPARQL queries.

The Datalog approach requires a different kind of thinking compared to the other query languages
discussed in this chapter. It allows complex queries to be built up rule by rule, with one rule
referring to other rules, similarly to the way that you break down code into functions that call
each other. Just like functions can be recursive, Datalog rules can also invoke themselves, like
rule 2 in [Example 3-12](/en/ch3#fig_datalog_query), which enables graph traversals in Datalog queries.

### GraphQL {#id63}

GraphQL is a query language that, by design, is much more restrictive than the other query languages
we have seen in this chapter. The purpose of GraphQL is to allow client software running on a user’s
device (such as a mobile app or a JavaScript web app frontend) to request a JSON document with a
particular structure, containing the fields necessary for rendering its user interface. GraphQL
interfaces allow developers to rapidly change queries in client code without changing server-side APIs.

GraphQL’s flexibility comes at a cost. Organizations that adopt GraphQL often need tooling to
convert GraphQL queries into requests to internal services, which often use REST or gRPC (see
[Chapter 5](/en/ch5#ch_encoding)). Authorization, rate limiting, and performance challenges are additional concerns [^61].
GraphQL’s query language is also limited since GraphQL queries come from an untrusted source. The language
does not allow anything that could be expensive to execute, since otherwise users could perform
denial-of-service attacks on a server by running lots of expensive queries. In particular, GraphQL
does not allow recursive queries (unlike Cypher, SPARQL, SQL, or Datalog), and it does not allow
arbitrary search conditions such as “find people who were born in the US and are now living in
Europe” (unless the service owners specifically choose to offer such search functionality).

Nevertheless, GraphQL is useful. [Example 3-13](/en/ch3#fig_graphql_query) shows how you might implement a group chat
application such as Discord or Slack using GraphQL. The query requests all the channels that the
user has access to, including the channel name and the 50 most recent messages in each channel. For
each message it requests the timestamp, the message content, and the name and profile picture URL
for the sender of the message. Moreover, if a message is a reply to another message, the query also
requests the sender name and the content of the message it is replying to (which might be rendered
in a smaller font above the reply, in order to provide some context).

{{< figure id="fig_graphql_query" title="Example 3-13. Example GraphQL query for a group chat application" class="w-full my-4" >}}

```
query ChatApp {
    channels {
        name
        recentMessages(latest: 50) {
            timestamp
            content
            sender {
                fullName
                imageUrl
            }
            replyTo {
                content
                sender {
                    fullName
                }
            }
        }
    }
}
```

[Example 3-14](/en/ch3#fig_graphql_response) shows what a response to the query in [Example 3-13](/en/ch3#fig_graphql_query) might look
like. The response is a JSON document that mirrors the structure of the query: it contains exactly
those attributes that were requested, no more and no less. This approach has the advantage that the
server does not need to know which attributes the client requires in order to render the user
interface; instead, the client can simply request what it needs. For example, this query does not
request a profile picture URL for the sender of the `replyTo` message, but if the user interface
were changed to add that profile picture, it would be easy for the client to add the required
`imageUrl` attribute to the query without changing the server.

{{< figure id="fig_graphql_response" title="Example 3-14. A possible response to the query in [Example 3-13](/en/ch3#fig_graphql_query)" class="w-full my-4" >}}

```json
{
    "data": {
        "channels": [
            {
                "name": "#general",
                "recentMessages": [
                    {
                        "timestamp": 1693143014,
                        "content": "Hey! How are y'all doing?",
                        "sender": {"fullName": "Aaliyah", "imageUrl": "https://..."},
                        "replyTo": null
                    },
                    {
                        "timestamp": 1693143024,
                        "content": "Great! And you?",
                        "sender": {"fullName": "Caleb", "imageUrl": "https://..."},
                        "replyTo": {
                            "content": "Hey! How are y'all doing?",
                            "sender": {"fullName": "Aaliyah"}
                        }
                    },
                    ...
```

In [Example 3-14](/en/ch3#fig_graphql_response) the name and image URL of a message sender are embedded directly in the
message object. If the same user sends multiple messages, this information is repeated on each
message. In principle, it would be possible to reduce this duplication, but GraphQL makes the design
choice to accept a larger response size in order to make it simpler to render the user interface
based on the data.

The `replyTo` field is similar: in [Example 3-14](/en/ch3#fig_graphql_response), the second message is a reply to the
first, and the content (“Hey!…”) and sender Aaliyah are duplicated under `replyTo`. It would be
possible to instead return the ID of the message being replied to, but then the client would have to
make an additional request to the server if that ID is not among the 50 most recent messages
returned. Duplicating the content makes it much simpler to work with the data.

The server’s database can store the data in a more normalized form, and perform the necessary joins
to process a query. For example, the server might store a message along with the user ID of the
sender and the ID of the message it is replying to; when it receives a query like the one above, the
server would then resolve those IDs to find the records they refer to. However, the client can only
ask the server to perform joins that are explicitly offered in the GraphQL schema.

Even though the response to a GraphQL query looks similar to a response from a document database,
and even though it has “graph” in the name, GraphQL can be implemented on top of any type of
database—relational, document, or graph.


## Event Sourcing and CQRS {#sec_datamodels_events}

In all the data models we have discussed so far, the data is queried in the same form as it is
written—be it JSON documents, rows in tables, or vertices and edges in a graph. However, in complex
applications it can sometimes be difficult to find a single data representation that is able to
satisfy all the different ways that the data needs to be queried and presented. In such situations,
it can be beneficial to write data in one form, and then to derive from it several representations
that are optimized for different types of reads.

We previously saw this idea in [“Systems of Record and Derived Data”](/en/ch1#sec_introduction_derived), and ETL (see [“Data Warehousing”](/en/ch1#sec_introduction_dwh))
is one example of such a derivation process. Now we will take the idea further. If we are going to
derive one data representation from another anyway, we can choose different representations that are
optimized for writing and for reading, respectively. How would you model your data if you only
wanted to optimize it for writing, and if efficient queries were of no concern?

Perhaps the simplest, fastest, and most expressive way of writing data is an *event log*: every time
you want to write some data, you encode it as a self-contained string (perhaps as JSON), including a
timestamp, and then append it to a sequence of events. Events in this log are *immutable*: you never
change or delete them, you only ever append more events to the log (which may supersede earlier
events). An event can contain arbitrary properties.

[Figure 3-8](/en/ch3#fig_event_sourcing) shows an example that could be taken from a conference management system. A
conference can be a complex business domain: not only can individual attendees register and pay by
card, but companies can also order seats in bulk, pay by invoice, and then later assign the seats to
individual people. Some number of seats may be reserved for speakers, sponsors, volunteer helpers,
and so on. Reservations may also be cancelled, and meanwhile, the conference organizer might change
the capacity of the event by moving it to a different room. With all of this going on, simply
calculating the number of available seats becomes a challenging query.

{{< figure src="/fig/ddia_0308.png" id="fig_event_sourcing" title="Figure 3-8. Using a log of immutable events as source of truth, and deriving materialized views from it." class="w-full my-4" >}}

In [Figure 3-8](/en/ch3#fig_event_sourcing), every change to the state of the conference (such as the organizer
opening registrations, or attendees making and cancelling registrations) is first stored as an
event. Whenever an event is appended to the log, several *materialized views* (also known as
*projections* or *read models*) are also updated to reflect the effect of that event. In the
conference example, there might be one materialized view that collects all information related to
the status of each booking, another that computes charts for the conference organizer’s dashboard,
and a third that generates files for the printer that produces the attendees’ badges.

The idea of using events as the source of truth, and expressing every state change as an event, is
known as *event sourcing* [^62] [^63].
The principle of maintaining separate read-optimized representations and deriving them from the
write-optimized representation is called *command query responsibility segregation (CQRS)* [^64].
These terms originated in the domain-driven design (DDD) community, although similar ideas have been
around for a long time, for example in *state machine replication* (see [“Using shared logs”](/en/ch10#sec_consistency_smr)).

When a request from a user comes in, it is called a *command*, and it first needs to be validated.
Only once the command has been executed and it has been determined to be valid (e.g., there were
enough available seats for a requested reservation), it becomes a fact, and the corresponding event
is added to the log. Consequently, the event log should contain only valid events, and a consumer
of the event log that builds a materialized view is not allowed to reject an event.

When modelling your data in an event sourcing style, it is recommended that you name your events in
the past tense (e.g., “the seats were booked”), because an event is a record of the fact that
something has happened in the past. Even if the user later decides to change or cancel, the fact
remains true that they formerly held a booking, and the change or cancellation is a separate event
that is added later.

A similarity between event sourcing and a star schema fact table, as discussed in
[“Stars and Snowflakes: Schemas for Analytics”](/en/ch3#sec_datamodels_analytics), is that both are collections of events that happened in the past.
However, rows in a fact table all have the same set of columns, whereas in event sourcing there may
be many different event types, each with different properties. Moreover, a fact table is an
unordered collection, while in event sourcing the order of events is important: if a booking is
first made and then cancelled, processing those events in the wrong order would not make sense.

Event sourcing and CQRS have several advantages:

* For the people developing the system, events better communicate the intent of *why* something
 happened. For example, it’s easier to understand the event “the booking was cancelled” than “the
 `active` column on row 4001 of the `bookings` table was set to `false`, three rows associated with
 that booking were deleted from the `seat_assignments` table, and a row representing the refund was
 inserted into the `payments` table”. Those row modifications may still happen when a materialized
 view processes the cancellation event, but when they are driven by an event, the reason for the
 updates becomes much clearer.
* A key principle of event sourcing is that the materialized views are derived from the event log in
 a reproducible way: you should always be able to delete the materialized views and recompute them
 by processing the same events in the same order, using the same code. If there was a bug in the
 view maintenance code, you can just delete the view and recompute it with the new code. It’s also
 easier to find the bug because you can re-run the view maintenance code as often as you like and
 inspect its behavior.
* You can have multiple materialized views that are optimized for the particular queries that your
 application requires. They can be stored either in the same database as the events or a different
 one, depending on your needs. They can use any data model, and they can be denormalized for fast
 reads. You can even keep a view only in memory and avoid persisting it, as long as it’s okay to
 recompute the view from the event log whenever the service restarts.
* If you decide you want to present the existing information in a new way, it is easy to build a new
 materialized view from the existing event log. You can also evolve the system to support new
 features by adding new types of events, or new properties to existing event types (any older
 events remain unmodified). You can also chain new behaviors off existing events (for example, when
 a conference attendee cancels, their seat could be offered to the next person on the waiting
 list).
* If an event was written in error you can delete it again, and then you can rebuild the views
 without the deleted event. On the other hand, in a database where you update and delete data
 directly, a committed transaction is often difficult to reverse. Event sourcing can therefore
 reduce the number of irreversible actions in the system, making it easier to change
 (see [“Evolvability: Making Change Easy”](/en/ch2#sec_introduction_evolvability)).
* The event log can also serve as an audit log of everything that happened in the system, which is
 valuable in regulated industries that require such auditability.

However, event sourcing and CQRS also have downsides:

* You need to be careful if external information is involved. For example, say an event contains a
 price given in one currency, and for one of the views it needs to be converted into another
 currency. Since the exchange rate may fluctuate, it would be problematic to fetch the exchange
 rate from an external source when processing the event, since you would get a different result if
 you recompute the materialized view on another date. To make the event processing logic
 deterministic, you either need to include the exchange rate in the event itself, or have a way of
 querying the historical exchange rate at the timestamp indicated in the event, ensuring that this
 query always returns the same result for the same timestamp.
* The requirement that events are immutable creates problems if events contain personal data from
 users, since users may exercise their right (e.g., under the GDPR) to request deletion of their
 data. If the event log is on a per-user basis, you can just delete the whole log for that user,
 but that doesn’t work if your event log contains events relating to multiple users. You can try
 storing the personal data outside of the actual event, or encrypting it with a key that you can
 later choose to delete, but that also makes it harder to recompute derived state when needed.
* Reprocessing events requires care if there are externally visible side-effects—for example, you
 probably don’t want to resend confirmation emails every time you rebuild a materialized view.

You can implement event sourcing on top of any database, but there are also some systems that are
specifically designed to support this pattern, such as EventStoreDB, MartenDB (based on PostgreSQL),
and Axon Framework. You can also use message brokers such as Apache Kafka to store the event log,
and stream processors can keep the materialized views up-to-date; we will return to these topics in
[“Change data capture versus event sourcing”](/en/ch12#sec_stream_event_sourcing).

The only important requirement is that the event storage system must guarantee that all materialized
views process the events in exactly the same order as they appear in the log; as we shall see in
[Chapter 10](/en/ch10#ch_consistency), this is not always easy to achieve in a distributed system.


## Dataframes, Matrices, and Arrays {#sec_datamodels_dataframes}

The data models we have seen so far in this chapter are generally used for both transaction
processing and analytics purposes (see [“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics)). There are also some data
models that you are likely to encounter in an analytical or scientific context, but that rarely
feature in OLTP systems: dataframes and multidimensional arrays of numbers such as matrices.

Dataframes are a data model supported by the R language, the Pandas library for Python, Apache
Spark, ArcticDB, Dask, and other systems. They are a popular tool for data scientists preparing data
for training machine learning models, but they are also widely used for data exploration,
statistical data analysis, data visualization, and similar purposes.

At first glance, a dataframe is similar to a table in a relational database or a spreadsheet. It
supports relational-like operators that perform bulk operations on the contents of the dataframe:
for example, applying a function to all of the rows, filtering the rows based on some condition,
grouping rows by some columns and aggregating other columns, and joining the rows in one dataframe
with another dataframe based on some key (what a relational database calls *join* is typically
called *merge* on dataframes).

Instead of a declarative query such as SQL, a dataframe is typically manipulated through a series of
commands that modify its structure and content. This matches the typical workflow of data
scientists, who incrementally “wrangle” the data into a form that allows them to find answers to the
questions they are asking. These manipulations usually take place on the data scientist’s private
copy of the dataset, often on their local machine, although the end result may be shared with other
users.

Dataframe APIs also offer a wide variety of operations that go far beyond what relational databases
offer, and the data model is often used in ways that are very different from typical relational data modelling [^65].
For example, a common use of dataframes is to transform data from a relational-like representation
into a matrix or multidimensional array representation, which is the form that many machine learning
algorithms expect of their input.

A simple example of such a transformation is shown in [Figure 3-9](/en/ch3#fig_dataframe_to_matrix). On the left we
have a relational table of how different users have rated various movies (on a scale of 1 to 5), and
on the right the data has been transformed into a matrix where each column is a movie and each row
is a user (similarly to a *pivot table* in a spreadsheet). The matrix is *sparse*, which means there
is no data for many user-movie combinations, but this is fine. This matrix may have many thousands
of columns and would therefore not fit well in a relational database, but dataframes and libraries
that offer sparse arrays (such as NumPy for Python) can handle such data easily.

{{< figure src="/fig/ddia_0309.png" id="fig_dataframe_to_matrix" title="Figure 3-9. Transforming a relational database of movie ratings into a matrix representation." class="w-full my-4" >}}

A matrix can only contain numbers, and various techniques are used to transform non-numerical data
into numbers in the matrix. For example:

* Dates (which are omitted from the example matrix in [Figure 3-9](/en/ch3#fig_dataframe_to_matrix)) could be scaled
 to be floating-point numbers within some suitable range.
* For columns that can only take one of a small, fixed set of values (for example, the genre of a
 movie in a database of movies), a *one-hot encoding* is often used: we create a column for each
 possible value (one for “comedy”, one for “drama”, one for “horror”, etc.), and for each row
 representing a movie, we put a 1 in the column corresponding to the genre of that movie, and a 0
 in all the other columns. This representation also easily generalizes to movies that fit within
 several genres.

Once the data is in the form of a matrix of numbers, it is amenable to linear algebra operations,
which form the basis of many machine learning algorithms. For example, the data in
[Figure 3-9](/en/ch3#fig_dataframe_to_matrix) could be a part of a system for recommending movies that the user may
like. Dataframes are flexible enough to allow data to be gradually evolved from a relational form
into a matrix representation, while giving the data scientist control over the representation that
is most suitable for achieving the goals of the data analysis or model training process.

There are also databases such as TileDB [^66] that specialize in storing large multidimensional arrays of numbers; they are called *array
databases* and are most commonly used for scientific datasets such as geospatial measurements
(raster data on a regularly spaced grid), medical imaging, or observations from astronomical telescopes [^67].
Dataframes are also used in the financial industry for representing *time series data*, such as the
prices of assets and trades over time [^68].

## Summary {#summary}

Data models are a huge subject, and in this chapter we have taken a quick look at a broad variety of
different models. We didn’t have space to go into all the details of each model, but hopefully the
overview has been enough to whet your appetite to find out more about the model that best fits your
application’s requirements.

The *relational model*, despite being more than half a century old, remains an important data model
for many applications—especially in data warehousing and business analytics, where relational star
or snowflake schemas and SQL queries are ubiquitous. However, several alternatives to relational
data have also become popular in other domains:

* The *document model* targets use cases where data comes in self-contained JSON documents, and
 where relationships between one document and another are rare.
* *Graph data models* go in the opposite direction, targeting use cases where anything is potentially
 related to everything, and where queries potentially need to traverse multiple hops to find the
 data of interest (which can be expressed using recursive queries in Cypher, SPARQL, or Datalog).
* *Dataframes* generalize relational data to large numbers of columns, and thereby provide a bridge
 between databases and the multidimensional arrays that form the basis of much machine learning,
 statistical data analysis, and scientific computing.

To some degree, one model can be emulated in terms of another model—for example, graph data can be
represented in a relational database—but the result can be awkward, as we saw with the support for
recursive queries in SQL.

Various specialist databases have therefore been developed for each data model, providing query
languages and storage engines that are optimized for a particular model. However, there is also a
trend for databases to expand into neighboring niches by adding support for other data models: for
example, relational databases have added support for document data in the form of JSON columns,
document databases have added relational-like joins, and support for graph data within SQL is
gradually improving.

Another model we discussed is *event sourcing*, which represents data as an append-only log of
immutable events, and which can be advantageous for modeling activities in complex business domains.
An append-only log is good for writing data (as we shall see in [Chapter 4](/en/ch4#ch_storage)); in order to support
efficient queries, the event log is translated into read-optimized materialized views through CQRS.

One thing that non-relational data models have in common is that they typically don’t enforce a
schema for the data they store, which can make it easier to adapt applications to changing
requirements. However, your application most likely still assumes that data has a certain structure;
it’s just a question of whether the schema is explicit (enforced on write) or implicit (assumed on read).

Although we have covered a lot of ground, there are still data models left unmentioned. To give just
a few brief examples:

* Researchers working with genome data often need to perform *sequence-similarity searches*, which
 means taking one very long string (representing a DNA molecule) and matching it against a large
 database of strings that are similar, but not identical. None of the databases described here can
 handle this kind of usage, which is why researchers have written specialized genome database
 software like GenBank [^69].
* Many financial systems use *ledgers* with double-entry accounting as their data model. This type
 of data can be represented in relational databases, but there are also databases such as
 TigerBeetle that specialize in this data model. Cryptocurrencies and blockchains are typically
 based on distributed ledgers, which also have value transfer built into their data model.
* *Full-text search* is arguably a kind of data model that is frequently used alongside databases.
 Information retrieval is a large specialist subject that we won’t cover in great detail in this
 book, but we’ll touch on search indexes and vector search in [“Full-Text Search”](/en/ch4#sec_storage_full_text).

We have to leave it there for now. In the next chapter we will discuss some of the trade-offs that
come into play when *implementing* the data models described in this chapter.


### References

[^1]: Jamie Brandon. [Unexplanations: query optimization works because sql is declarative](https://www.scattered-thoughts.net/writing/unexplanations-sql-declarative/). *scattered-thoughts.net*, February 2024. Archived at [perma.cc/P6W2-WMFZ](https://perma.cc/P6W2-WMFZ) 
[^2]: Joseph M. Hellerstein. [The Declarative Imperative: Experiences and Conjectures in Distributed Logic](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-90.pdf). Tech report UCB/EECS-2010-90, Electrical Engineering and Computer Sciences, University of California at Berkeley, June 2010. Archived at [perma.cc/K56R-VVQM](https://perma.cc/K56R-VVQM) 
[^3]: Edgar F. Codd. [A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf). *Communications of the ACM*, volume 13, issue 6, pages 377–387, June 1970. [doi:10.1145/362384.362685](https://doi.org/10.1145/362384.362685) 
[^4]: Michael Stonebraker and Joseph M. Hellerstein. [What Goes Around Comes Around](http://mitpress2.mit.edu/books/chapters/0262693143chapm1.pdf). In *Readings in Database Systems*, 4th edition, MIT Press, pages 2–41, 2005. ISBN: 9780262693141 
[^5]: Markus Winand. [Modern SQL: Beyond Relational](https://modern-sql.com/). *modern-sql.com*, 2015. Archived at [perma.cc/D63V-WAPN](https://perma.cc/D63V-WAPN) 
[^6]: Martin Fowler. [OrmHate](https://martinfowler.com/bliki/OrmHate.html). *martinfowler.com*, May 2012. Archived at [perma.cc/VCM8-PKNG](https://perma.cc/VCM8-PKNG) 
[^7]: Vlad Mihalcea. [N+1 query problem with JPA and Hibernate](https://vladmihalcea.com/n-plus-1-query-problem/). *vladmihalcea.com*, January 2023. Archived at [perma.cc/79EV-TZKB](https://perma.cc/79EV-TZKB) 
[^8]: Jens Schauder. [This is the Beginning of the End of the N+1 Problem: Introducing Single Query Loading](https://spring.io/blog/2023/08/31/this-is-the-beginning-of-the-end-of-the-n-1-problem-introducing-single-query). *spring.io*, August 2023. Archived at [perma.cc/6V96-R333](https://perma.cc/6V96-R333) 
[^9]: William Zola. [6 Rules of Thumb for MongoDB Schema Design](https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design). *mongodb.com*, June 2014. Archived at [perma.cc/T2BZ-PPJB](https://perma.cc/T2BZ-PPJB) 
[^10]: Sidney Andrews and Christopher McClister. [Data modeling in Azure Cosmos DB](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data). *learn.microsoft.com*, February 2023. Archived at [archive.org](https://web.archive.org/web/20230207193233/https%3A//learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data) 
[^11]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK) 
[^12]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, July 2013. ISBN: 9781118530801 
[^13]: Michael Kaminsky. [Data warehouse modeling: Star schema vs. OBT](https://www.fivetran.com/blog/star-schema-vs-obt). *fivetran.com*, August 2022. Archived at [perma.cc/2PZK-BFFP](https://perma.cc/2PZK-BFFP) 
[^14]: Joe Nelson. [User-defined Order in SQL](https://begriffs.com/posts/2018-03-20-user-defined-order.html). *begriffs.com*, March 2018. Archived at [perma.cc/GS3W-F7AD](https://perma.cc/GS3W-F7AD) 
[^15]: Evan Wallace. [Realtime Editing of Ordered Sequences](https://www.figma.com/blog/realtime-editing-of-ordered-sequences/). *figma.com*, March 2017. Archived at [perma.cc/K6ER-CQZW](https://perma.cc/K6ER-CQZW) 
[^16]: David Greenspan. [Implementing Fractional Indexing](https://observablehq.com/%40dgreensp/implementing-fractional-indexing). *observablehq.com*, October 2020. Archived at [perma.cc/5N4R-MREN](https://perma.cc/5N4R-MREN) 
[^17]: Martin Fowler. [Schemaless Data Structures](https://martinfowler.com/articles/schemaless/). *martinfowler.com*, January 2013. 
[^18]: Amr Awadallah. [Schema-on-Read vs. Schema-on-Write](https://www.slideshare.net/awadallah/schemaonread-vs-schemaonwrite). At *Berkeley EECS RAD Lab Retreat*, Santa Cruz, CA, May 2009. Archived at [perma.cc/DTB2-JCFR](https://perma.cc/DTB2-JCFR) 
[^19]: Martin Odersky. [The Trouble with Types](https://www.infoq.com/presentations/data-types-issues/). At *Strange Loop*, September 2013. Archived at [perma.cc/85QE-PVEP](https://perma.cc/85QE-PVEP) 
[^20]: Conrad Irwin. [MongoDB—Confessions of a PostgreSQL Lover](https://speakerdeck.com/conradirwin/mongodb-confessions-of-a-postgresql-lover). At *HTML5DevConf*, October 2013. Archived at [perma.cc/C2J6-3AL5](https://perma.cc/C2J6-3AL5) 
[^21]: [Percona Toolkit Documentation: pt-online-schema-change](https://docs.percona.com/percona-toolkit/pt-online-schema-change.html). *docs.percona.com*, 2023. Archived at [perma.cc/9K8R-E5UH](https://perma.cc/9K8R-E5UH) 
[^22]: Shlomi Noach. [gh-ost: GitHub’s Online Schema Migration Tool for MySQL](https://github.blog/2016-08-01-gh-ost-github-s-online-migration-tool-for-mysql/). *github.blog*, August 2016. Archived at [perma.cc/7XAG-XB72](https://perma.cc/7XAG-XB72) 
[^23]: Shayon Mukherjee. [pg-osc: Zero downtime schema changes in PostgreSQL](https://www.shayon.dev/post/2022/47/pg-osc-zero-downtime-schema-changes-in-postgresql/). *shayon.dev*, February 2022. Archived at [perma.cc/35WN-7WMY](https://perma.cc/35WN-7WMY) 
[^24]: Carlos Pérez-Aradros Herce. [Introducing pgroll: zero-downtime, reversible, schema migrations for Postgres](https://xata.io/blog/pgroll-schema-migrations-postgres). *xata.io*, October 2023. Archived at [archive.org](https://web.archive.org/web/20231008161750/https%3A//xata.io/blog/pgroll-schema-migrations-postgres) 
[^25]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012. 
[^26]: Donald K. Burleson. [Reduce I/O with Oracle Cluster Tables](http://www.dba-oracle.com/oracle_tip_hash_index_cluster_table.htm). *dba-oracle.com*. Archived at [perma.cc/7LBJ-9X2C](https://perma.cc/7LBJ-9X2C) 
[^27]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006. 
[^28]: Priscilla Walmsley. [*XQuery, 2nd Edition*](https://learning.oreilly.com/library/view/xquery-2nd-edition/9781491915080/). O’Reilly Media, December 2015. ISBN: 9781491915080 
[^29]: Paul C. Bryan, Kris Zyp, and Mark Nottingham. [JavaScript Object Notation (JSON) Pointer](https://www.rfc-editor.org/rfc/rfc6901). RFC 6901, IETF, April 2013. 
[^30]: Stefan Gössner, Glyn Normington, and Carsten Bormann. [JSONPath: Query Expressions for JSON](https://www.rfc-editor.org/rfc/rfc9535.html). RFC 9535, IETF, February 2024. 
[^31]: Michael Stonebraker and Andrew Pavlo. [What Goes Around Comes Around… And Around…](https://db.cs.cmu.edu/papers/2024/whatgoesaround-sigmodrec2024.pdf). *ACM SIGMOD Record*, volume 53, issue 2, pages 21–37, June 2024. [doi:10.1145/3685980.3685984](https://doi.org/10.1145/3685980.3685984) 
[^32]: Lawrence Page, Sergey Brin, Rajeev Motwani, and Terry Winograd. [The PageRank Citation Ranking: Bringing Order to the Web](http://ilpubs.stanford.edu:8090/422/). Technical Report 1999-66, Stanford University InfoLab, November 1999. Archived at [perma.cc/UML9-UZHW](https://perma.cc/UML9-UZHW) 
[^33]: Nathan Bronson, Zach Amsden, George Cabrera, Prasad Chakka, Peter Dimov, Hui Ding, Jack Ferris, Anthony Giardullo, Sachin Kulkarni, Harry Li, Mark Marchukov, Dmitri Petrov, Lovro Puzar, Yee Jiun Song, and Venkat Venkataramani. [TAO: Facebook’s Distributed Data Store for the Social Graph](https://www.usenix.org/conference/atc13/technical-sessions/presentation/bronson). At *USENIX Annual Technical Conference* (ATC), June 2013. 
[^34]: Natasha Noy, Yuqing Gao, Anshu Jain, Anant Narayanan, Alan Patterson, and Jamie Taylor. [Industry-Scale Knowledge Graphs: Lessons and Challenges](https://cacm.acm.org/magazines/2019/8/238342-industry-scale-knowledge-graphs/fulltext). *Communications of the ACM*, volume 62, issue 8, pages 36–43, August 2019. [doi:10.1145/3331166](https://doi.org/10.1145/3331166) 
[^35]: Xiyang Feng, Guodong Jin, Ziyi Chen, Chang Liu, and Semih Salihoğlu. [KÙZU Graph Database Management System](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf). At *13th Annual Conference on Innovative Data Systems Research* (CIDR 2023), January 2023. 
[^36]: Maciej Besta, Emanuel Peter, Robert Gerstenberger, Marc Fischer, Michał Podstawski, Claude Barthels, Gustavo Alonso, and Torsten Hoefler. [Demystifying Graph Databases: Analysis and Taxonomy of Data Organization, System Designs, and Graph Queries](https://arxiv.org/pdf/1910.09017.pdf). *arxiv.org*, October 2019. 
[^37]: [Apache TinkerPop 3.6.3 Documentation](https://tinkerpop.apache.org/docs/3.6.3/reference/). *tinkerpop.apache.org*, May 2023. Archived at [perma.cc/KM7W-7PAT](https://perma.cc/KM7W-7PAT) 
[^38]: Nadime Francis, Alastair Green, Paolo Guagliardo, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Stefan Plantikow, Mats Rydberg, Petra Selmer, and Andrés Taylor. [Cypher: An Evolving Query Language for Property Graphs](https://core.ac.uk/download/pdf/158372754.pdf). At *International Conference on Management of Data* (SIGMOD), pages 1433–1445, May 2018. [doi:10.1145/3183713.3190657](https://doi.org/10.1145/3183713.3190657) 
[^39]: Emil Eifrem. [Twitter correspondence](https://twitter.com/emileifrem/status/419107961512804352), January 2014. Archived at [perma.cc/WM4S-BW64](https://perma.cc/WM4S-BW64) 
[^40]: Francesco Tisiot. [Explore the new SEARCH and CYCLE features in PostgreSQL® 14](https://aiven.io/blog/explore-the-new-search-and-cycle-features-in-postgresql-14). *aiven.io*, December 2021. Archived at [perma.cc/J6BT-83UZ](https://perma.cc/J6BT-83UZ) 
[^41]: Gaurav Goel. [Understanding Hierarchies in Oracle](https://towardsdatascience.com/understanding-hierarchies-in-oracle-43f85561f3d9). *towardsdatascience.com*, May 2020. Archived at [perma.cc/5ZLR-Q7EW](https://perma.cc/5ZLR-Q7EW) 
[^42]: Alin Deutsch, Nadime Francis, Alastair Green, Keith Hare, Bei Li, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Wim Martens, Jan Michels, Filip Murlak, Stefan Plantikow, Petra Selmer, Oskar van Rest, Hannes Voigt, Domagoj Vrgoč, Mingxi Wu, and Fred Zemke. [Graph Pattern Matching in GQL and SQL/PGQ](https://arxiv.org/abs/2112.06217). At *International Conference on Management of Data* (SIGMOD), pages 2246–2258, June 2022. [doi:10.1145/3514221.3526057](https://doi.org/10.1145/3514221.3526057) 
[^43]: Alastair Green. [SQL... and now GQL](https://opencypher.org/articles/2019/09/12/SQL-and-now-GQL/). *opencypher.org*, September 2019. Archived at [perma.cc/AFB2-3SY7](https://perma.cc/AFB2-3SY7) 
[^44]: Alin Deutsch, Yu Xu, and Mingxi Wu. [Seamless Syntactic and Semantic Integration of Query Primitives over Relational and Graph Data in GSQL](https://cdn2.hubspot.net/hubfs/4114546/IntegrationQuery%20PrimitivesGSQL.pdf). *tigergraph.com*, November 2018. Archived at [perma.cc/JG7J-Y35X](https://perma.cc/JG7J-Y35X) 
[^45]: Oskar van Rest, Sungpack Hong, Jinha Kim, Xuming Meng, and Hassan Chafi. [PGQL: a property graph query language](https://event.cwi.nl/grades/2016/07-VanRest.pdf). At *4th International Workshop on Graph Data Management Experiences and Systems* (GRADES), June 2016. [doi:10.1145/2960414.2960421](https://doi.org/10.1145/2960414.2960421) 
[^46]: Amazon Web Services. [Neptune Graph Data Model](https://docs.aws.amazon.com/neptune/latest/userguide/feature-overview-data-model.html). Amazon Neptune User Guide, *docs.aws.amazon.com*. Archived at [perma.cc/CX3T-EZU9](https://perma.cc/CX3T-EZU9) 
[^47]: Cognitect. [Datomic Data Model](https://docs.datomic.com/cloud/whatis/data-model.html). Datomic Cloud Documentation, *docs.datomic.com*. Archived at [perma.cc/LGM9-LEUT](https://perma.cc/LGM9-LEUT) 
[^48]: David Beckett and Tim Berners-Lee. [Turtle – Terse RDF Triple Language](https://www.w3.org/TeamSubmission/turtle/). W3C Team Submission, March 2011. 
[^49]: Sinclair Target. [Whatever Happened to the Semantic Web?](https://twobithistory.org/2018/05/27/semantic-web.html) *twobithistory.org*, May 2018. Archived at [perma.cc/M8GL-9KHS](https://perma.cc/M8GL-9KHS) 
[^50]: Gavin Mendel-Gleason. [The Semantic Web is Dead – Long Live the Semantic Web!](https://terminusdb.com/blog/the-semantic-web-is-dead/) *terminusdb.com*, August 2022. Archived at [perma.cc/G2MZ-DSS3](https://perma.cc/G2MZ-DSS3) 
[^51]: Manu Sporny. [JSON-LD and Why I Hate the Semantic Web](http://manu.sporny.org/2014/json-ld-origins-2/). *manu.sporny.org*, January 2014. Archived at [perma.cc/7PT4-PJKF](https://perma.cc/7PT4-PJKF) 
[^52]: University of Michigan Library. [Biomedical Ontologies and Controlled Vocabularies](https://guides.lib.umich.edu/ontology), *guides.lib.umich.edu/ontology*. Archived at [perma.cc/Q5GA-F2N8](https://perma.cc/Q5GA-F2N8) 
[^53]: Facebook. [The Open Graph protocol](https://ogp.me/), *ogp.me*. Archived at [perma.cc/C49A-GUSY](https://perma.cc/C49A-GUSY) 
[^54]: Matt Haughey. [Everything you ever wanted to know about unfurling but were afraid to ask /or/ How to make your site previews look amazing in Slack](https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254). *medium.com*, November 2015. Archived at [perma.cc/C7S8-4PZN](https://perma.cc/C7S8-4PZN) 
[^55]: W3C RDF Working Group. [Resource Description Framework (RDF)](https://www.w3.org/RDF/). *w3.org*, February 2004. 
[^56]: Steve Harris, Andy Seaborne, and Eric Prud’hommeaux. [SPARQL 1.1 Query Language](https://www.w3.org/TR/sparql11-query/). W3C Recommendation, March 2013. 
[^57]: Todd J. Green, Shan Shan Huang, Boon Thau Loo, and Wenchao Zhou. [Datalog and Recursive Query Processing](http://blogs.evergreen.edu/sosw/files/2014/04/Green-Vol5-DBS-017.pdf). *Foundations and Trends in Databases*, volume 5, issue 2, pages 105–195, November 2013. [doi:10.1561/1900000017](https://doi.org/10.1561/1900000017) 
[^58]: Stefano Ceri, Georg Gottlob, and Letizia Tanca. [What You Always Wanted to Know About Datalog (And Never Dared to Ask)](https://www.researchgate.net/profile/Letizia_Tanca/publication/3296132_What_you_always_wanted_to_know_about_Datalog_and_never_dared_to_ask/links/0fcfd50ca2d20473ca000000.pdf). *IEEE Transactions on Knowledge and Data Engineering*, volume 1, issue 1, pages 146–166, March 1989. [doi:10.1109/69.43410](https://doi.org/10.1109/69.43410) 
[^59]: Serge Abiteboul, Richard Hull, and Victor Vianu. [*Foundations of Databases*](http://webdam.inria.fr/Alice/). Addison-Wesley, 1995. ISBN: 9780201537710, available online at [*webdam.inria.fr/Alice*](http://webdam.inria.fr/Alice/) 
[^60]: Scott Meyer, Andrew Carter, and Andrew Rodriguez. [LIquid: The soul of a new graph database, Part 2](https://engineering.linkedin.com/blog/2020/liquid--the-soul-of-a-new-graph-database--part-2). *engineering.linkedin.com*, September 2020. Archived at [perma.cc/K9M4-PD6Q](https://perma.cc/K9M4-PD6Q) 
[^61]: Matt Bessey. [Why, after 6 years, I’m over GraphQL](https://bessey.dev/blog/2024/05/24/why-im-over-graphql/). *bessey.dev*, May 2024. Archived at [perma.cc/2PAU-JYRA](https://perma.cc/2PAU-JYRA) 
[^62]: Dominic Betts, Julián Domínguez, Grigori Melnik, Fernando Simonazzi, and Mani Subramanian. [*Exploring CQRS and Event Sourcing*](https://learn.microsoft.com/en-us/previous-versions/msp-n-p/jj554200%28v%3Dpandp.10%29). Microsoft Patterns & Practices, July 2012. ISBN: 1621140164, archived at [perma.cc/7A39-3NM8](https://perma.cc/7A39-3NM8) 
[^63]: Greg Young. [CQRS and Event Sourcing](https://www.youtube.com/watch?v=JHGkaShoyNs). At *Code on the Beach*, August 2014. 
[^64]: Greg Young. [CQRS Documents](https://cqrs.files.wordpress.com/2010/11/cqrs_documents.pdf). *cqrs.wordpress.com*, November 2010. Archived at [perma.cc/X5R6-R47F](https://perma.cc/X5R6-R47F) 
[^65]: Devin Petersohn, Stephen Macke, Doris Xin, William Ma, Doris Lee, Xiangxi Mo, Joseph E. Gonzalez, Joseph M. Hellerstein, Anthony D. Joseph, and Aditya Parameswaran. [Towards Scalable Dataframe Systems](https://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 11, pages 2033–2046, July 2020. [doi:10.14778/3407790.3407807](https://doi.org/10.14778/3407790.3407807) 
[^66]: Stavros Papadopoulos, Kushal Datta, Samuel Madden, and Timothy Mattson. [The TileDB Array Data Storage Manager](https://www.vldb.org/pvldb/vol10/p349-papadopoulos.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 4, pages 349–360, November 2016. [doi:10.14778/3025111.3025117](https://doi.org/10.14778/3025111.3025117) 
[^67]: Florin Rusu. [Multidimensional Array Data Management](https://faculty.ucmerced.edu/frusu/Papers/Report/2022-09-fntdb-arrays.pdf). *Foundations and Trends in Databases*, volume 12, numbers 2–3, pages 69–220, February 2023. [doi:10.1561/1900000069](https://doi.org/10.1561/1900000069) 
[^68]: Ed Targett. [Bloomberg, Man Group team up to develop open source “ArcticDB” database](https://www.thestack.technology/bloomberg-man-group-arcticdb-database-dataframe/). *thestack.technology*, March 2023. Archived at [perma.cc/M5YD-QQYV](https://perma.cc/M5YD-QQYV) 
[^69]: Dennis A. Benson, Ilene Karsch-Mizrachi, David J. Lipman, James Ostell, and David L. Wheeler. [GenBank](https://academic.oup.com/nar/article/36/suppl_1/D25/2507746). *Nucleic Acids Research*, volume 36, database issue, pages D25–D30, December 2007. [doi:10.1093/nar/gkm929](https://doi.org/10.1093/nar/gkm929)


================================================
FILE: content/en/ch4.md
================================================
---
title: "4. Storage and Retrieval"
weight: 104
breadcrumbs: false
---

<a id="ch_storage"></a>

![](/map/ch03.png)

> *One of the miseries of life is that everybody names things a little bit wrong. And so it makes
> everything a little harder to understand in the world than it would be if it were named
> differently. A computer does not primarily compute in the sense of doing arithmetic. […] They
> primarily are filing systems.*
>
> [Richard Feynman](https://www.youtube.com/watch?v=EKWGGDXe5MA&t=296s),
> *Idiosyncratic Thinking* seminar (1985)

On the most fundamental level, a database needs to do two things: when you give it some data, it
should store the data, and when you ask it again later, it should give the data back to you.

In [Chapter 3](/en/ch3#ch_datamodels) we discussed data models and query languages—i.e., the format in which you give
the database your data, and the interface through which you can ask for it again later. In this
chapter we discuss the same from the database’s point of view: how the database can store the data
that you give it, and how it can find the data again when you ask for it.

Why should you, as an application developer, care how the database handles storage and retrieval
internally? You’re probably not going to implement your own storage engine from scratch, but you
*do* need to select a storage engine that is appropriate for your application, from the many that
are available. In order to configure a storage engine to perform well on your kind of workload, you
need to have a rough idea of what the storage engine is doing under the hood.

In particular, there is a big difference between storage engines that are optimized for
transactional workloads (OLTP) and those that are optimized for analytics (we introduced this
distinction in [“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics)). This chapter starts by examining two families of
storage engines for OLTP: *log-structured* storage engines that write out immutable data files, and
storage engines such as *B-trees* that update data in place. These structures are used for both
key-value storage and secondary indexes.

Later in [“Data Storage for Analytics”](/en/ch4#sec_storage_analytics) we’ll discuss a family of storage engines that is optimized for
analytics, and in [“Multidimensional and Full-Text Indexes”](/en/ch4#sec_storage_multidimensional) we’ll briefly look at indexes for more advanced
queries, such as text retrieval.

## Storage and Indexing for OLTP {#sec_storage_oltp}

Consider the world’s simplest database, implemented as two Bash functions:

```bash
#!/bin/bash

db_set () {
  echo "$1,$2" >> database
}

db_get () {
  grep "^$1," database | sed -e "s/^$1,//" | tail -n 1
}
```

These two functions implement a key-value store. You can call `db_set key value`, which will store
`key` and `value` in the database. The key and value can be (almost) anything you like—for
example, the value could be a JSON document. You can then call `db_get key`, which looks up the most
recent value associated with that particular key and returns it.

And it works:

```bash
$ db_set 12 '{"name":"London","attractions":["Big Ben","London Eye"]}'

$ db_set 42 '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
```

The storage format is very simple: a text file where each line contains a key-value pair, separated
by a comma (roughly like a CSV file, ignoring escaping issues). Every call to `db_set` appends to
the end of the file. If you update a key several times, old versions of the value are not
overwritten—you need to look at the last occurrence of a key in a file to find the latest value
(hence the `tail -n 1` in `db_get`):

```bash
$ db_set 42 '{"name":"San Francisco","attractions":["Exploratorium"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Exploratorium"]}

$ cat database
12,{"name":"London","attractions":["Big Ben","London Eye"]}
42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
42,{"name":"San Francisco","attractions":["Exploratorium"]}

```

The `db_set` function actually has pretty good performance for something that is so simple, because
appending to a file is generally very efficient. Similarly to what `db_set` does, many databases
internally use a *log*, which is an append-only data file. Real databases have more issues to deal
with (such as handling concurrent writes, reclaiming disk space so that the log doesn’t grow
forever, and handling partially written records when recovering from a crash), but the basic
principle is the same. Logs are incredibly useful, and we will encounter them several times in this
book.

---------

> [!NOTE]
> The word *log* is often used to refer to application logs, where an application outputs text that
> describes what’s happening. In this book, *log* is used in the more general sense: an append-only
> sequence of records on disk. It doesn’t have to be human-readable; it might be binary and intended
> only for internal use by the database system.

--------


On the other hand, the `db_get` function has terrible performance if you have a large number of
records in your database. Every time you want to look up a key, `db_get` has to scan the entire
database file from beginning to end, looking for occurrences of the key. In algorithmic terms, the
cost of a lookup is *O*(*n*): if you double the number of records *n* in your database, a lookup
takes twice as long. That’s not good.

In order to efficiently find the value for a particular key in the database, we need a different
data structure: an *index*. In this chapter we will look at a range of indexing structures and see
how they compare; the general idea is to structure the data in a particular way (e.g., sorted by
some key) that makes it faster to locate the data you want. If you want to search the same data in
several different ways, you may need several different indexes on different parts of the data.

An index is an *additional* structure that is derived from the primary data. Many databases allow
you to add and remove indexes, and this doesn’t affect the contents of the database; it only affects
the performance of queries. Maintaining additional structures incurs overhead, especially on writes. For
writes, it’s hard to beat the performance of simply appending to a file, because that’s the simplest
possible write operation. Any kind of index usually slows down writes, because the index also needs
to be updated every time data is written.

This is an important trade-off in storage systems: well-chosen indexes speed up read queries, but
every index consumes additional disk space and slows down writes, sometimes substantially [^1].
For this reason, databases don’t usually index everything by default, but require you—the person
writing the application or administering the database—to choose indexes manually, using your
knowledge of the application’s typical query patterns. You can then choose the indexes that give
your application the greatest benefit, without introducing more overhead on writes than necessary.

### Log-Structured Storage {#sec_storage_log_structured}

To start, let’s assume that you want to continue storing data in the append-only file written by
`db_set`, and you just want to speed up reads. One way you could do this is by keeping a hash map in
memory, in which every key is mapped to the byte offset in the file at which the most recent value
for that key can be found, as illustrated in [Figure 4-1](/en/ch4#fig_storage_csv_hash_index).

{{< figure src="/fig/ddia_0401.png" id="fig_storage_csv_hash_index" caption="Figure 4-1. Storing a log of key-value pairs in a CSV-like format, indexed with an in-memory hash map." class="w-full my-4" >}}

Whenever you append a new key-value pair to the file, you also update the hash map to reflect the
offset of the data you just wrote. When you want to look up a value, you use the hash map to find
the offset in the log file, seek to that location, and read the value. If that part of the data file
is already in the filesystem cache, a read doesn’t require any disk I/O at all.

This approach is much faster, but it still suffers from several problems:

* You never free up disk space occupied by old log entries that have been overwritten; if you keep
 writing to the database you might run out of disk space.
* The hash map is not persisted, so you have to rebuild it when you restart the database—for
 example, by scanning the whole log file to find the latest byte offset for each key. This makes
 restarts slow if you have a lot of data.
* The hash table must fit in memory. In principle, you could maintain a hash table on disk, but
 unfortunately it is difficult to make an on-disk hash map perform well. It requires a lot of
 random access I/O, it is expensive to grow when it becomes full, and hash collisions require fiddly logic [^2].
* Range queries are not efficient. For example, you cannot easily scan over all keys between `10000`
 and `19999`—you’d have to look up each key individually in the hash map.

#### The SSTable file format {#the-sstable-file-format}

In practice, hash tables are not used very often for database indexes, and instead it is much more
common to keep data in a structure that is *sorted by key* [^3].
One example of such a structure is a *Sorted String Table*, or *SSTable* for short, as shown in
[Figure 4-2](/en/ch4#fig_storage_sstable_index). This file format also stores key-value pairs, but it ensures that
they are sorted by key, and each key only appears once in the file.

{{< figure src="/fig/ddia_0402.png" id="fig_storage_sstable_index" caption="Figure 4-2. An SSTable with a sparse index, allowing queries to jump to the right block." class="w-full my-4" >}}

Now you do not need to keep all the keys in memory: you can group the key-value pairs within an
SSTable into *blocks* of a few kilobytes, and then store the first key of each block in the index.
This kind of index, which stores only some of the keys, is called *sparse*. This index is stored in
a separate part of the SSTable, for example using an immutable B-tree, a trie, or another data
structure that allows queries to quickly look up a particular key [^4].

For example, in [Figure 4-2](/en/ch4#fig_storage_sstable_index), the first key of one block is `handbag`, and the
first key of the next block is `handsome`. Now say you’re looking for the key `handiwork`, which
doesn’t appear in the sparse index. Because of the sorting you know that `handiwork` must appear
between `handbag` and `handsome`. This means you can seek to the offset for `handbag` and scan the
file from there until you find `handiwork` (or not, if the key is not present in the file). A block
of a few kilobytes can be scanned very quickly.

Moreover, each block of records can be compressed (indicated by the shaded area in
[Figure 4-2](/en/ch4#fig_storage_sstable_index)). Besides saving disk space, compression also reduces the I/O
bandwidth use, at the cost of using a bit more CPU time.

#### Constructing and merging SSTables {#constructing-and-merging-sstables}

The SSTable file format is better for reading than an append-only log, but it makes writes more
difficult. We can’t simply append at the end, because then the file would no longer be sorted
(unless the keys happen to be written in ascending order). If we had to rewrite the whole SSTable
every time a key is inserted somewhere in the middle, writes would become far too expensive.

We can solve this problem with a *log-structured* approach, which is a hybrid between an append-only
log and a sorted file:

1. When a write comes in, add it to an in-memory ordered map data structure, such as a red-black
 tree, skip list [^5], or trie [^6].
 With these data structures, you can insert keys in any order, look them up efficiently, and read
 them back in sorted order. This in-memory data structure is called the *memtable*.
2. When the memtable gets bigger than some threshold—typically a few megabytes—write it out to
 disk in sorted order as an SSTable file. We call this new SSTable file the most recent *segment*
 of the database, and it is stored as a separate file alongside the older segments. Each segment
 has a separate index of its contents. While the new segment is being written out to disk, the
 database can continue writing to a new memtable instance, and the old memtable’s memory is freed
 when the writing of the SSTable is complete.
3. In order to read the value for some key, first try to find the key in the memtable and the most
 recent on-disk segment. If it’s not there, look in the next-older segment, etc. until you either
 find the key or reach the oldest segment. If the key does not appear in any of the segments, it
 does not exist in the database.
4. From time to time, run a merging and compaction process in the background to combine segment files
 and to discard overwritten or deleted values.

Merging segments works similarly to the *mergesort* algorithm [^5]. The process is illustrated in
[Figure 4-3](/en/ch4#fig_storage_sstable_merging): start reading the input files side by side, look at the first key
in each file, copy the lowest key (according to the sort order) to the output file, and repeat. If
the same key appears in more than one input file, keep only the more recent value. This produces a
new merged segment file, also sorted by key, with one value per key, and it uses minimal memory
because we can iterate over the SSTables one key at a time.

{{< figure src="/fig/ddia_0403.png" id="fig_storage_sstable_merging" caption="Figure 4-3. Merging several SSTable segments, retaining only the most recent value for each key." class="w-full my-4" >}}

To ensure that the data in the memtable is not lost if the database crashes, the storage engine
keeps a separate log on disk to which every write is immediately appended. This log is not sorted by
key, but that doesn’t matter, because its only purpose is to restore the memtable after a crash.
Every time the memtable has been written out to an SSTable, the corresponding part of the log can be
discarded.

If you want to delete a key and its associated value, you have to append a special deletion record
called a *tombstone* to the data file. When log segments are merged, the tombstone tells the merging
process to discard any previous values for the deleted key. Once the tombstone is merged into the
oldest segment, it can be dropped.

The algorithm described here is essentially what is used in RocksDB [^7], Cassandra, Scylla, and HBase [^8],
all of which were inspired by Google’s Bigtable paper [^9] (which introduced the terms *SSTable* and *memtable*).

The algorithm was originally published in 1996 under the name *Log-Structured Merge-Tree* or *LSM-Tree* [^10],
building on earlier work on log-structured filesystems [^11].
For this reason, storage engines that are based on the principle of merging and compacting sorted files are often called *LSM storage engines*.

In LSM storage engines, a segment file is written in one pass (either by writing out the memtable or
by merging some existing segments), and thereafter it is immutable. The merging and compaction of
segments can be done in a background thread, and while it is going on, we can still continue to
serve reads using the old segment files. When the merging process is complete, we switch read
requests to using the new merged segment instead of the old segments, and then the old segment files
can be deleted.

The segment files don’t necessarily have to be stored on local disk: they are also well suited for
writing to object storage. SlateDB and Delta Lake [^12] take this approach, for example.

Having immutable segment files also simplifies crash recovery: if a crash happens while writing out
the memtable or while merging segments, the database can just delete the unfinished SSTable and
start afresh. The log that persists writes to the memtable could contain incomplete records if there
was a crash halfway through writing a record, or if the disk was full; these are typically detected
by including checksums in the log, and discarding corrupted or incomplete log entries. We will talk
more about durability and crash recovery in [Chapter 8](/en/ch8#ch_transactions).

<a id="sec_storage_bloom_filter"></a>

#### Bloom filters {#bloom-filters}

With LSM storage it can be slow to read a key that was last updated a long time ago, or that does
not exist, since the storage engine needs to check several segment files. In order to speed up such
reads, LSM storage engines often include a *Bloom filter* [^13]
in each segment, which provides a fast but approximate way of checking whether a particular key
appears in a particular SSTable.

[Figure 4-4](/en/ch4#fig_storage_bloom) shows an example of a Bloom filter containing two keys and 16 bits (in
reality, it would contain more keys and more bits). For every key in the SSTable we compute a hash
function, producing a set of numbers that are then interpreted as indexes into the array of bits [^14].
We set the bits corresponding to those indexes to 1, and leave the rest as 0. For example, the key
`handbag` hashes to the numbers (2, 9, 4), so we set the 2nd, 9th, and 4th bits to 1. The bitmap
is then stored as part of the SSTable, along with the sparse index of keys. This takes a bit of
extra space, but the Bloom filter is generally small compared to the rest of the SSTable.

{{< figure src="/fig/ddia_0404.png" id="fig_storage_bloom" caption="Figure 4-4. A Bloom filter provides a fast, probabilistic check whether a particular key exists in a particular SSTable." class="w-full my-4" >}}

When we want to know whether a key appears in the SSTable, we compute the same hash of that key as
before, and check the bits at those indexes. For example, in [Figure 4-4](/en/ch4#fig_storage_bloom), we’re querying
the key `handheld`, which hashes to (6, 11, 2). One of those bits is 1 (namely, bit number 2),
while the other two are 0. These checks can be made extremely fast using the bitwise operations that
all CPUs support.

If at least one of the bits is 0, we know that the key definitely does not appear in the SSTable.
If the bits in the query are all 1, it’s likely that the key is in the SSTable, but it’s also
possible that by coincidence all of those bits were set to 1 by other keys. This case when it looks
as if a key is present, even though it isn’t, is called a *false positive*.

The probability of false positives depends on the number of keys, the number of bits set per key,
and the total number of bits in the Bloom filter. You can use an online calculator tool to work out
the right parameters for your application [^15].
As a rule of thumb, you need to allocate 10 bits of Bloom filter space for every key in the SSTable
to get a false positive probability of 1%, and the probability is reduced tenfold for every 5
additional bits you allocate per key.

In the context of LSM storage engines, false positives are no problem:

* If the Bloom filter says that a key *is not* present, we can safely skip that SSTable, since we
 can be sure that it doesn’t contain the key.
* If the Bloom filter says the key *is* present, we have to consult the sparse index and decode the
 block of key-value pairs to check whether the key really is there. If it was a false positive, we
 have done a bit of unnecessary work, but otherwise no harm is done—we just continue the search
 with the next-oldest segment.

#### Compaction strategies {#sec_storage_lsm_compaction}

An important detail is how the LSM storage chooses when to perform compaction, and which SSTables to
include in a compaction. Many LSM-based storage systems allow you to configure which compaction
strategy to use, and some of the common choices are [^16] [^17]:

Size-tiered compaction
: Newer and smaller SSTables are successively merged into older and larger SSTables. The SSTables
 containing older data can get very large, and merging them requires a lot of temporary disk space.
 The advantage of this strategy is that it can handle very high write throughput.

Leveled compaction
: The key range is split up into smaller SSTables and older data is moved into separate “levels,”
 which allows the compaction to proceed more incrementally and use less disk space than the
 size-tiered strategy. This strategy is more efficient for reads than size-tiered compaction
 because the storage engine needs to read fewer SSTables to check whether they contain the key.

As a rule of thumb, size-tiered compaction performs better if you have mostly writes and few reads,
whereas leveled compaction performs better if your workload is dominated by reads. If you write a
small number of keys frequently and a large number of keys rarely, then leveled compaction can also
be advantageous [^18].

Even though there are many subtleties, the basic idea of LSM-trees—keeping a cascade of SSTables
that are merged in the background—is simple and effective. We discuss their performance
characteristics in more detail in [“Comparing B-Trees and LSM-Trees”](/en/ch4#sec_storage_btree_lsm_comparison).

--------

<a id="sidebar_embedded"></a>

> [!TIP] EMBEDDED STORAGE ENGINES

Many databases run as a service that accepts queries over a network, but there are also *embedded*
databases that don’t expose a network API. Instead, they are libraries that run in the same process
as your application code, typically reading and writing files on the local disk, and you interact
with them through normal function calls. Examples of embedded storage engines include RocksDB,
SQLite, LMDB, DuckDB, and KùzuDB [^19].

Embedded databases are very commonly used in mobile apps to store the local user’s data. On the
backend, they can be an appropriate choice if the data is small enough to fit on a single machine,
and if there are not many concurrent transactions. For example, in a multitenant system in which
each tenant is small enough and completely separate from others (i.e., you do not need to run
queries that combine data from multiple tenants), you can potentially use a separate embedded
database instance per tenant [^20].

The storage and retrieval methods we discuss in this chapter are used in both embedded and in
client-server databases. In [Chapter 6](/en/ch6#ch_replication) and [Chapter 7](/en/ch7#ch_sharding) we will discuss techniques
for scaling a database across multiple machines.

--------

### B-Trees {#sec_storage_b_trees}

The log-structured approach is popular, but it is not the only form of key-value storage. The most
widely used structure for reading and writing database records by key is the *B-tree*.

Introduced in 1970 [^21] and called “ubiquitous” less than 10 years later [^22],
B-trees have stood the test of time very well. They remain the standard index implementation in
almost all relational databases, and many nonrelational databases use them too.

Like SSTables, B-trees keep key-value pairs sorted by key, which allows efficient key-value lookups
and range queries. But that’s where the similarity ends: B-trees have a very different design
philosophy.

The log-structured indexes we saw earlier break the database down into variable-size *segments*,
typically several megabytes or more in size, that are written once and are then immutable. By
contrast, B-trees break the database down into fixed-size *blocks* or *pages*, and may overwrite a
page in-place. A page is traditionally 4 KiB in size, but PostgreSQL now uses 8 KiB and
MySQL uses 16 KiB by default.

Each page can be identified using a page number, which allows one page to refer to another—similar
to a pointer, but on disk instead of in memory. If all the pages are stored in the same file,
multiplying the page number by the page size gives us the byte offset in the file where the page is
located. We can use these page references to construct a tree of pages, as illustrated in
[Figure 4-5](/en/ch4#fig_storage_b_tree).

{{< figure src="/fig/ddia_0405.png" id="fig_storage_b_tree" caption="Figure 4-5. Looking up the key 251 using a B-tree index. From the root page we first follow the reference to the page for keys 200–300, then the page for keys 250–270." class="w-full my-4" >}}

One page is designated as the *root* of the B-tree; whenever you want to look up a key in the index,
you start here. The page contains several keys and references to child pages.
Each child is responsible for a continuous range of keys, and the keys between the references indicate
where the boundaries between those ranges lie.
(This structure is sometimes called a B+ tree, but we don’t need to distinguish it
from other B-tree variants.)

In the example in [Figure 4-5](/en/ch4#fig_storage_b_tree), we are looking for the key 251, so we know that we need to
follow the page reference between the boundaries 200 and 300. That takes us to a similar-looking
page that further breaks down the 200–300 range into subranges. Eventually we get down to a
page containing individual keys (a *leaf page*), which either contains the value for each key
inline or contains references to the pages where the values can be found.

The number of references to child pages in one page of the B-tree is called the *branching factor*.
For example, in [Figure 4-5](/en/ch4#fig_storage_b_tree) the branching factor is six. In practice, the branching
factor depends on the amount of space required to store the page references and the range
boundaries, but typically it is several hundred.

If you want to update the value for an existing key in a B-tree, you search for the leaf page
containing that key, and overwrite that page on disk with a version that contains the new value.
If you want to add a new key, you need to find the page whose range encompasses the new key and add
it to that page. If there isn’t enough free space in the page to accommodate the new key, the page
is split into two half-full pages, and the parent page is updated to account for the new subdivision
of key ranges.

{{< figure src="/fig/ddia_0406.png" id="fig_storage_b_tree_split" caption="Figure 4-6. Growing a B-tree by splitting a page on the boundary key 337. The parent page is updated to reference both children." class="w-full my-4" >}}

In the example of [Figure 4-6](/en/ch4#fig_storage_b_tree_split), we want to insert the key 334, but the page for the
range 333–345 is already full. We therefore split it into a page for the range 333–337 (including
the new key), and a page for 337–345. We also have to update the parent page to have references to
both children, with a boundary value of 337 between them. If the parent page doesn’t have enough
space for the new reference, it may also need to be split, and the splits can continue all the way
to the root of the tree. When the root is split, we make a new root above it. Deleting keys (which
may require nodes to be merged) is more complex [^5].

This algorithm ensures that the tree remains *balanced*: a B-tree with *n* keys always has a depth
of *O*(log *n*). Most databases can fit into a B-tree that is three or four levels deep, so
you don’t need to follow many page references to find the page you are looking for. (A four-level
tree of 4 KiB pages with a branching factor of 500 can store up to 250 TB.)

#### Making B-trees reliable {#sec_storage_btree_wal}

The basic underlying write operation of a B-tree is to overwrite a page on disk with new data. It is
assumed that the overwrite does not change the location of the page; i.e., all references to that
page remain intact when the page is overwritten. This is in stark contrast to log-structured indexes
such as LSM-trees, which only append to files (and eventually delete obsolete files) but never
modify files in place.

Overwriting several pages at once, like in a page split, is a dangerous operation: if the database
crashes after only some of the pages have been written, you end up with a corrupted tree (e.g.,
there may be an *orphan* page that is not a child of any parent). If the hardware can’t atomically
write an entire page, you can also end up with a partially written page (this is known as a *torn page* [^23]).

In order to make the database resilient to crashes, it is common for B-tree implementations to
include an additional data structure on disk: a *write-ahead log* (WAL). This is an append-only file
to which every B-tree modification must be written before it can be applied to the pages of the tree
itself. When the database comes back up after a crash, this log is used to restore the B-tree back
to a consistent state [^2] [^24].
In filesystems, the equivalent mechanism is known as *journaling*.

To improve performance, B-tree implementations typically don’t immediately write every modified page
to disk, but buffer the B-tree pages in memory for a while first. The write-ahead log then also
ensures that data is not lost in the case of a crash: as long as data has been written to the WAL,
and flushed to disk using the `fsync()` system call, the data will be durable as the database will
be able to recover it after a crash [^25].

#### B-tree variants {#b-tree-variants}

As B-trees have been around for so long, many variants have been developed over the years. To
mention just a few:

* Instead of overwriting pages and maintaining a WAL for crash recovery, some databases (like LMDB)
 use a copy-on-write scheme [^26].
 A modified page is written to a different location, and a new version of the parent pages in the tree
 is created, pointing at the new location. This approach is also useful for concurrency control, as we shall
 see in [“Snapshot Isolation and Repeatable Read”](/en/ch8#sec_transactions_snapshot_isolation).
* We can save space in pages by not storing the entire key, but abbreviating it. Especially in pages
 on the interior of the tree, keys only need to provide enough information to act as boundaries
 between key ranges. Packing more keys into a page allows the tree to have a higher branching
 factor, and thus fewer levels.
* To speed up scans over the key range in sorted order, some B-tree implementations try to lay out
 the tree so that leaf pages appear in sequential order on disk, reducing the number of disk seeks.
 However, it’s difficult to maintain that order as the tree grows.
* Additional pointers have been added to the tree. For example, each leaf page may have references to
 its sibling pages to the left and right, which allows scanning keys in order without jumping back
 to parent pages.

### Comparing B-Trees and LSM-Trees {#sec_storage_btree_lsm_comparison}

As a rule of thumb, LSM-trees are better suited for write-heavy applications, whereas B-trees are faster for reads [^27] [^28].
However, benchmarks are often sensitive to details of the workload. You need to test systems with
your particular workload in order to make a valid comparison. Moreover, it’s not a strict either/or
choice between LSM and B-trees: storage engines sometimes blend characteristics of both approaches,
for example by having multiple B-trees and merging them LSM-style. In this section we will briefly
discuss a few things that are worth considering when measuring the performance of a storage engine.

#### Read performance {#read-performance}

In a B-tree, looking up a key involves reading one page at each level of the B-tree. Since the
number of levels is usually quite small, this means that reads from a B-tree are generally fast and
have predictable performance. In an LSM storage engine, reads often have to check several different
SSTables at different stages of compaction, but Bloom filters help reduce the number of actual disk
I/O operations required. Both approaches can perform well, and which is faster depends on the
details of the storage engine and the workload.

Range queries are simple and fast on B-trees, as they can use the sorted structure of the tree. On
LSM storage, range queries can also take advantage of the SSTable sorting, but they need to scan all
the segments in parallel and combine the results. Bloom filters don’t help for range queries (since
you would need to compute the hash of every possible key within the range, which is impractical),
making range queries more expensive than point queries in the LSM approach [^29].

High write throughput can cause latency spikes in a log-structured storage engine if the
memtable fills up. This happens if data can’t be written out to disk fast enough, perhaps because
the compaction process cannot keep up with incoming writes. Many storage engines, including RocksDB,
perform *backpressure* in this situation: they suspend all reads and writes until the memtable has
been written out to disk [^30] [^31].

Regarding read throughput, modern SSDs (and especially NVMe) can perform many independent read
requests in parallel. Both LSM-trees and B-trees are able to provide high read throughput, but
storage engines need to be carefully designed to take advantage of this parallelism [^32].

#### Sequential vs. random writes {#sidebar_sequential}

With a B-tree, if the application writes keys that are scattered all over the key space, the
resulting disk operations are also scattered randomly, since the pages that the storage engine needs
to overwrite could be located anywhere on disk. On the other hand, a log-structured storage engine
writes entire segment files at a time (either writing out the memtable or while compacting existing
segments), which are much bigger than a page in a B-tree.

The pattern of many small, scattered writes (as found in B-trees) is called *random writes*, while
the pattern of fewer large writes (as found in LSM-trees) is called *sequential writes*. Disks
generally have higher sequential write throughput than random write throughput, which means that a
log-structured storage engine can generally handle higher write throughput on the same hardware than
a B-tree. This difference is particularly big on spinning-disk hard drives (HDDs); on the solid
state drives (SSDs) that most databases use today, the difference is smaller, but still noticeable
(see [“Sequential vs. Random Writes on SSDs”](/en/ch4#sidebar_sequential)).

--------

> [!TIP] SEQUENTIAL VS. RANDOM WRITES ON SSDS

On spinning-disk hard drives (HDDs), sequential writes are much faster than random writes: a random
write has to mechanically move the disk head to a new position and wait for the right part of the
platter to pass underneath the disk head, which takes several milliseconds—an eternity in computing
timescales. However, SSDs (solid-state drives) including NVMe (Non-Volatile Memory Express, i.e.
flash memory attached to the PCI Express bus) have now overtaken HDDs for many use cases, and they
are not subject to such mechanical limitations.

Nevertheless, SSDs also have higher throughput for sequential writes than for random writes.
The reason is that flash memory can be read or written one page (typically 4 KiB) at a time,
but it can only be erased one block (typically 512 KiB) at a time. Some of the pages in a block
may contain valid data, whereas others may contain data that is no longer needed. Before erasing a
block, the controller must first move pages containing valid data into other blocks; this process is
called *garbage collection* (GC) [^33].

A sequential write workload writes larger chunks of data at a time, so it is likely that a whole
512 KiB block belongs to a single file; when that file is later deleted again, the whole block
can be erased without having to perform any GC. On the other hand, with a random write workload, it
is more likely that a block contains a mixture of pages with valid and invalid data, so the GC has
to perform more work before a block can be erased [^34] [^35] [^36].

The write bandwidth consumed by GC is then not available for the application. Moreover, the
additional writes performed by GC contribute to wear on the flash memory; therefore, random writes
wear out the drive faster than sequential writes.

--------

#### Write amplification {#write-amplification}

With any type of storage engine, one write request from the application turns into multiple I/O
operations on the underlying disk. With LSM-trees, a value is first written to the log for
durability, then again when the memtable is written to disk, and again every time the key-value pair
is part of a compaction. (If the values are significantly larger than the keys, this overhead can be
reduced by storing values separately from keys, and performing compaction only on SSTables
containing keys and references to values [^37].)

A B-tree index must write every piece of data at least twice: once to the write-ahead log, and once
to the tree page itself. In addition, it sometimes needs to write out an entire page, even if only
a few bytes in that page changed, to ensure the B-tree can be correctly recovered after a crash or
power failure [^38] [^39].

If you take the total number of bytes written to disk in some workload, and divide by the number of
bytes you would have to write if you simply wrote an append-only log with no index, you get the
*write amplification*. (Sometimes write amplification is defined in terms of I/O operations rather
than bytes.) In write-heavy applications, the bottleneck might be the rate at which the database can
write to disk. In this case, the higher the write amplification, the fewer writes per second it can
handle within the available disk bandwidth.

Write amplification is a problem in both LSM-trees and B-trees. Which one is better depends on
various factors, such as the length of your keys and values, and how often you overwrite existing
keys versus insert new ones. For typical workloads, LSM-trees tend to have lower write amplification
because they don’t have to write entire pages and they can compress chunks of the SSTable [^40].
This is another factor that makes LSM storage engines well suited for write-heavy workloads.

Besides affecting throughput, write amplification is also relevant for the wear on SSDs: a storage
engine with lower write amplification will wear out the SSD less quickly.

When measuring the write throughput of a storage engine, it is important to run the experiment for
long enough that the effects of write amplification become clear. When writing to an empty LSM-tree,
there are no compactions going on yet, so all of the disk bandwidth is available for new writes. As
the database grows, new writes need to share the disk bandwidth with compaction.

#### Disk space usage {#disk-space-usage}

B-trees can become *fragmented* over time: for example, if a large number of keys are deleted, the
database file may contain a lot of pages that are no longer used by the B-tree. Subsequent additions
to the B-tree can use those free pages, but they can’t easily be returned to the operating system
because they are in the middle of the file, so they still take up space on the filesystem. Databases
therefore need a background process that moves pages around to place them better, such as the vacuum
process in PostgreSQL [^25].

Fragmentation is less of a problem in LSM-trees, since the compaction process periodically rewrites
the data files anyway, and SSTables don’t have pages with unused space. Moreover, blocks of
key-value pairs can better be compressed in SSTables, and thus often produce smaller files on disk
than B-trees. Keys and values that have been overwritten continue to consume space until they are
removed by a compaction, but this overhead is quite low when using leveled compaction [^40] [^41].
Size-tiered compaction (see [“Compaction strategies”](/en/ch4#sec_storage_lsm_compaction)) uses more disk space, especially
temporarily during compaction.

Having multiple copies of some data on disk can also be a problem when you need to delete some data,
and be confident that it really has been deleted (perhaps to comply with data protection
regulations). For example, in most LSM storage engines a deleted record may still exist in the higher
levels until the tombstone representing the deletion has been propagated through all of the
compaction levels, which may take a long time. Specialist storage engine designs can propagate
deletions faster [^42].

On the other hand, the immutable nature of SSTable segment files is useful if you want to take a
snapshot of a database at some point in time (e.g. for a backup or to create a copy of the database
for testing): you can write out the memtable and record which segment files existed at that point in
time. As long as you don’t delete the files that are part of the snapshot, you don’t need to
actually copy them. In a B-tree whose pages are overwritten, taking such a snapshot efficiently is
more difficult.


### Multi-Column and Secondary Indexes {#sec_storage_index_multicolumn}

So far we have only discussed key-value indexes, which are like a *primary key* index in the
relational model. A primary key uniquely identifies one row in a relational table, or one document
in a document database, or one vertex in a graph database. Other records in the database can refer
to that row/document/vertex by its primary key (or ID), and the index is used to resolve such references.

It is also very common to have *secondary indexes*. In relational databases, you can create several
secondary indexes on the same table using the `CREATE INDEX` command, allowing you to search by
columns other than the primary key. For example, in [Figure 3-1](/en/ch3#fig_obama_relational) in [Chapter 3](/en/ch3#ch_datamodels)
you would most likely have a secondary index on the `user_id` columns so that you can find all the
rows belonging to the same user in each of the tables.

A secondary index can easily be constructed from a key-value index. The main difference is that
in a secondary index, the indexed values are not necessarily unique; that is,
there might be many rows (documents, vertices) under the same index entry. This can be
solved in two ways: either by making each value in the index a list of matching row identifiers (like a
postings list in a full-text index) or by making each entry unique by appending a row identifier to
it. Storage engines with in-place updates, like B-trees, and log-structured storage can both be used
to implement an index.

#### Storing values within the index {#sec_storage_index_heap}

The key in an index is the thing that queries search by, but the value can be one of several things:

* If the actual data (row, document, vertex) is stored directly within the index structure, it is
 called a *clustered index*. For example, in MySQL’s InnoDB storage engine, the primary key of a
 table is always a clustered index, and in SQL Server, you can specify one clustered index per table [^43].
* Alternatively, the value can be a reference to the actual data: either the primary key of the row
 in question (InnoDB does this for secondary indexes), or a direct reference to a location on disk.
 In the latter case, the place where rows are stored is known as a *heap file*, and it stores data
 in no particular order (it may be append-only, or it may keep track of deleted rows in order to
 overwrite them with new data later). For example, Postgres uses the heap file approach [^44].
* A middle ground between the two is a *covering index* or *index with included columns*, which
 stores *some* of a table’s columns within the index, in addition to storing the full row on the
 heap or in the primary key clustered index [^45].
 This allows some queries to be answered by using the index alone, without having to resolve the
 primary key or look in the heap file (in which case, the index is said to *cover* the query).
 This can make some queries faster, but the duplication of data means the index uses more disk space and slows down
 writes.

The indexes discussed so far only map a single key to a value. If you need to query multiple columns
of a table (or multiple fields in a document) simultaneously, see [“Multidimensional and Full-Text Indexes”](/en/ch4#sec_storage_multidimensional).

When updating a value without changing the key, the heap file approach can allow the record to be
overwritten in place, provided that the new value is not larger than the old value. The situation is
more complicated if the new value is larger, as it probably needs to be moved to a new location in
the heap where there is enough space. In that case, either all indexes need to be updated to point
at the new heap location of the record, or a forwarding pointer is left behind in the old heap location [^2].

### Keeping everything in memory {#sec_storage_inmemory}

The data structures discussed so far in this chapter have all been answers to the limitations of
disks. Compared to main memory, disks are awkward to deal with. With both magnetic disks and SSDs,
data on disk needs to be laid out carefully if you want good performance on reads and writes.
However, we tolerate this awkwardness because disks have two significant advantages: they are
durable (their contents are not lost if the power is turned off), and they have a lower cost per
gigabyte than RAM.

As RAM becomes cheaper, the cost-per-gigabyte argument is eroded. Many datasets are simply not that
big, so it’s quite feasible to keep them entirely in memory, potentially distributed across several
machines. This has led to the development of *in-memory databases*.

Some in-memory key-value stores, such as Memcached, are intended for caching use only, where it’s
acceptable for data to be lost if a machine is restarted. But other in-memory databases aim for
durability, which can be achieved with special hardware (such as battery-powered RAM), by writing a
log of changes to disk, by writing periodic snapshots to disk, or by replicating the in-memory state
to other machines.

When an in-memory database is restarted, it needs to reload its state, either from disk or over the
network from a replica (unless special hardware is used). Despite writing to disk, it’s still an
in-memory database, because the disk is merely used as an append-only log for durability, and reads
are served entirely from memory. Writing to disk also has operational advantages: files on disk can
easily be backed up, inspected, and analyzed by external utilities.

Products such as VoltDB, SingleStore, and Oracle TimesTen are in-memory databases with a relational model,
and the vendors claim that they can offer big performance improvements by removing all the overheads
associated with managing on-disk data structures [^46] [^47].
RAMCloud is an open source, in-memory key-value store with durability (using a log-structured
approach for the data in memory as well as the data on disk) [^48].

Redis and Couchbase provide weak durability by writing to disk asynchronously.

Counterintuitively, the performance advantage of in-memory databases is not due to the fact that
they don’t need to read from disk. Even a disk-based storage engine may never need to read from disk
if you have enough memory, because the operating system caches recently used disk blocks in memory
anyway. Rather, they can be faster because they can avoid the overheads of encoding in-memory data
structures in a form that can be written to disk [^49].

Besides performance, another interesting area for in-memory databases is providing data models that
are difficult to implement with disk-based indexes. For example, Redis offers a database-like
interface to various data structures such as priority queues and sets. Because it keeps all data in
memory, its implementation is comparatively simple.


## Data Storage for Analytics {#sec_storage_analytics}

The data model of a data warehouse is most commonly relational, because SQL is generally a good fit
for analytic queries. There are many graphical data analysis tools that generate SQL queries,
visualize the results, and allow analysts to explore the data (through operations such as
*drill-down* and *slicing and dicing*).

On the surface, a data warehouse and a relational OLTP database look similar, because they both have
a SQL query interface. However, the internals of the systems can look quite different, because they
are optimized for very different query patterns. Many database vendors now focus on supporting
either transaction processing or analytics workloads, but not both.

Some databases, such as Microsoft SQL Server, SAP HANA, and SingleStore, have support for
transaction processing and data warehousing in the same product. However, these hybrid transactional
and analytical processing (HTAP) databases (introduced in [“Data Warehousing”](/en/ch1#sec_introduction_dwh)) are increasingly
becoming two separate storage and query engines, which happen to be accessible through a common SQL
interface [^50] [^51] [^52] [^53].

### Cloud Data Warehouses {#sec_cloud_data_warehouses}

Data warehouse vendors such as Teradata, Vertica, and SAP HANA sell both on-premises warehouses
under commercial licenses and cloud-based solutions. But as many of their customers move to the
cloud, new cloud data warehouses such as Google Cloud BigQuery, Amazon Redshift, and Snowflake have
also become widely adopted. Unlike traditional data warehouses, cloud data warehouses take advantage
of scalable cloud infrastructure like object storage and serverless computation platforms.

Cloud data warehouses tend to integrate better with other cloud services and to be more elastic.
For example, many cloud warehouses support automatic log ingestion, and offer easy integration with
data processing frameworks such as Google Cloud’s Dataflow or Amazon Web Services’ Kinesis. These
warehouses are also more elastic because they decouple query computation from the storage layer [^54].
Data is persisted on object storage rather than local disks, which makes it easy to adjust storage
capacity and compute resources for queries independently, as we previously saw in
[“Cloud-Native System Architecture”](/en/ch1#sec_introduction_cloud_native).

Open source data warehouses such as Apache Hive, Trino, and Apache Spark have also evolved with the
cloud. As data storage for analytics has moved to data lakes on object storage, open source warehouses
have begun to break apart [^55]. The following
components, which were previously integrated in a single system such as Apache Hive, are now often
implemented as separate components:

Query engine
: Query engines such as Trino, Apache DataFusion, and Presto parse SQL queries, optimize them into
 execution plans, and execute them against the data. Execution usually requires parallel,
 distributed data processing tasks. Some query engines provide built-in task execution, while
 others choose to use third party execution frameworks such as Apache Spark or Apache Flink.

Storage format
: The storage format determines how the rows of a table are encoded as bytes in a file, which is
 then typically stored in object storage or a distributed filesystem [^12].
 This data can then be accessed by the query engine, but also by other applications using the data
 lake. Examples of such storage formats are Parquet, ORC, Lance, or Nimble, and we will see more
 about them in the next section.

Table format
: Files written in Apache Parquet and similar storage formats are typically immutable once written.
 To support row inserts and deletions, a table format such as Apache Iceberg or Databricks’s Delta
 format is used. Table formats specify a file format that defines which files constitute a table
 along with the table’s schema. Such formats also offer advanced features such as time travel (the
 ability to query a table as it was at a previous point in time), garbage collection, and even
 transactions.

Data catalog
: Much like a table format defines which files make up a table, a data catalog defines which tables
 comprise a database. Catalogs are used to create, rename, and drop tables. Unlike storage and table
 formats, data catalogs such as Snowflake’s Polaris and Databricks’s Unity Catalog usually run as a
 standalone service that can be queried using a REST interface. Apache Iceberg also offers a
 catalog, which can be run inside a client or as a separate process. Query engines use catalog
 information when reading and writing tables. Traditionally, catalogs and query engines have been
 integrated, but decoupling them has enabled data discovery and data governance systems
 (discussed in [“Data Systems, Law, and Society”](/en/ch1#sec_introduction_compliance)) to access a catalog’s metadata as well.

### Column-Oriented Storage {#sec_storage_column}

As discussed in [“Stars and Snowflakes: Schemas for Analytics”](/en/ch3#sec_datamodels_analytics), data warehouses by convention often use a relational
schema with a big fact table that contains foreign key references into dimension tables.
If you have trillions of rows and petabytes of data in your fact tables, storing and querying them
efficiently becomes a challenging problem. Dimension tables are usually much smaller (millions of
rows), so in this section we will focus on storage of facts.

Although fact tables are often over 100 columns wide, a typical data warehouse query only accesses 4
or 5 of them at one time (`"SELECT *"` queries are rarely needed for analytics) [^52]. Take the query in
[Example 4-1](/en/ch4#fig_storage_analytics_query): it accesses a large number of rows (every occurrence of someone
buying fruit or candy during the 2024 calendar year), but it only needs to access three columns of
the `fact_sales` table: `date_key`, `product_sk`,
and `quantity`. The query ignores all other columns.

{{< figure id="fig_storage_analytics_query" title="Example 4-1. Analyzing whether people are more inclined to buy fresh fruit or candy, depending on the day of the week" class="w-full my-4" >}}

```sql
SELECT
    dim_date.weekday, dim_product.category,
    SUM(fact_sales.quantity) AS quantity_sold
FROM fact_sales
    JOIN dim_date ON fact_sales.date_key = dim_date.date_key
    JOIN dim_product ON fact_sales.product_sk = dim_product.product_sk
WHERE
    dim_date.year = 2024 AND
    dim_product.category IN ('Fresh fruit', 'Candy')
GROUP BY
    dim_date.weekday, dim_product.category;
```

How can we execute this query efficiently?

In most OLTP databases, storage is laid out in a *row-oriented* fashion: all the values from one row
of a table are stored next to each other. Document databases are similar: an entire document is
typically stored as one contiguous sequence of bytes. You can see this in the CSV example of [Figure 4-1](/en/ch4#fig_storage_csv_hash_index).

In order to process a query like [Example 4-1](/en/ch4#fig_storage_analytics_query), you may have indexes on
`fact_sales.date_key` and/or `fact_sales.product_sk` that tell the storage engine where to find
all the sales for a particular date or for a particular product. But then, a row-oriented storage
engine still needs to load all of those rows (each consisting of over 100 attributes) from disk into
memory, parse them, and filter out those that don’t meet the required conditions. That can take a
long time.

The idea behind *column-oriented* (or *columnar*) storage is simple: don’t store all the values from
one row together, but store all the values from each *column* together instead [^56].
If each column is stored separately, a query only needs to read and parse those columns that are
used in that query, which can save a lot of work. [Figure 4-7](/en/ch4#fig_column_store) shows this principle using
an expanded version of the fact table from [Figure 3-5](/en/ch3#fig_dwh_schema).

--------

> [!NOTE]
> Column storage is easiest to understand in a relational data model, but it applies equally to
> nonrelational data. For example, Parquet [^57] is a columnar storage format that supports a document data model, based on Google’s Dremel [^58],
> using a technique known as *shredding* or *striping* [^59].

--------

{{< figure src="/fig/ddia_0407.png" id="fig_column_store" caption="Figure 4-7. Storing relational data by column, rather than by row." class="w-full my-4" >}}

The column-oriented storage layout relies on each column storing the rows in the same order.
Thus, if you need to reassemble an entire row, you can take the 23rd entry from each of the
individual columns and put them together to form the 23rd row of the table.

In fact, columnar storage engines don’t actually store an entire column (containing perhaps
trillions of rows) in one go. Instead, they break the table into blocks of thousands or millions of
rows, and within each block they store the values from each column separately [^60].
Since many queries are restricted to a particular date range, it is common to make each block
contain the rows for a particular timestamp range. A query then only needs to load the columns it
needs in those blocks that overlap with the required date range.

Columnar storage is used in almost all analytic databases nowadays [^60], ranging from large-scale cloud data warehouses such as Snowflake [^61]
to single-node embedded databases such as DuckDB [^62], and product analytics systems such as Pinot [^63] and Druid [^64].
It is used in storage formats such as Parquet, ORC [^65] [^66], Lance [^67], and Nimble [^68], and in-memory analytics formats like Apache Arrow
[^65] [^69] and Pandas/NumPy [^70]. Some time-series databases, such as InfluxDB IOx [^71] and TimescaleDB [^72], are also based on column-oriented storage.

#### Column Compression {#sec_storage_column_compression}

Besides only loading those columns from disk that are required for a query, we can further reduce
the demands on disk throughput and network bandwidth by compressing data. Fortunately,
column-oriented storage often lends itself very well to compression.

Take a look at the sequences of values for each column in [Figure 4-7](/en/ch4#fig_column_store): they often look quite
repetitive, which is a good sign for compression. Depending on the data in the column, different
compression techniques can be used. One technique that is particularly effective in data warehouses
is *bitmap encoding*, illustrated in [Figure 4-8](/en/ch4#fig_bitmap_index).

{{< figure src="/fig/ddia_0408.png" id="fig_bitmap_index" caption="Figure 4-8. Compressed, bitmap-indexed storage of a single column." class="w-full my-4" >}}

Often, the number of distinct values in a column is small compared to the number of rows (for
example, a retailer may have billions of sales transactions, but only 100,000 distinct products).
We can now take a column with *n* distinct values and turn it into *n* separate bitmaps: one bitmap
for each distinct value, with one bit for each row. The bit is 1 if the row has that value, and 0 if
not.

One option is to store those bitmaps using one bit per row. However, these bitmaps typically contain
a lot of zeros (we say that they are *sparse*). In that case, the bitmaps can additionally be
run-length encoded: counting the number of consecutive zeros or ones and storing that number, as
shown at the bottom of [Figure 4-8](/en/ch4#fig_bitmap_index). Techniques such as *roaring bitmaps* switch between the
two bitmap representations, using whichever is the most compact [^73].
This can make the encoding of a column remarkably efficient.

Bitmap indexes such as these are very well suited for the kinds of queries that are common in a data
warehouse. For example:

`WHERE product_sk IN (31, 68, 69):`
: Load the three bitmaps for `product_sk = 31`, `product_sk = 68`, and `product_sk = 69`, and
 calculate the bitwise *OR* of the three bitmaps, which can be done very efficiently.

`WHERE product_sk = 30 AND store_sk = 3:`
: Load the bitmaps for `product_sk = 30` and `store_sk = 3`, and calculate the bitwise *AND*. This
 works because the columns contain the rows in the same order, so the *k*th bit in one column’s
 bitmap corresponds to the same row as the *k*th bit in another column’s bitmap.

Bitmaps can also be used to answer graph queries, such as finding all users of a social network who are followed by user *X* and who also follow user *Y* [^74].
There are also various other compression schemes for columnar databases, which you can find in the references [^75].

--------

> [!NOTE]
> Don’t confuse column-oriented databases with the *wide-column* (also known as *column-family*) data
> model, in which a row can have thousands of columns, and there is no need for all the rows to have the same columns [^9]. 
> Despite the similarity in name, wide-column databases are row-oriented, since they store all values from a row together.
> Google’s Bigtable, Apache Accumulo, and HBase are examples of the wide-column model.

--------

#### Sort Order in Column Storage {#sort-order-in-column-storage}

In a column store, it doesn’t necessarily matter in which order the rows are stored. It’s easiest to
store them in the order in which they were inserted, since then inserting a new row just means
appending to each of the columns. However, we can choose to impose an order, like we did with
SSTables previously, and use that as an indexing mechanism.

Note that it wouldn’t make sense to sort each column independently, because then we would no longer
know which items in the columns belong to the same row. We can only reconstruct a row because we
know that the *k*th item in one column belongs to the same row as the *k*th item in another
column.

Rather, the data needs to be sorted an entire row at a time, even though it is stored by column.
The administrator of the database can choose the columns by which the table should be sorted, using
their knowledge of common queries. For example, if queries often target date ranges, such as the
last month, it might make sense to make `date_key` the first sort key. Then the query can
scan only the rows from the last month, which will be much faster than scanning all rows.

A second column can determine the sort order of any rows that have the same value in the first
column. For example, if `date_key` is the first sort key in [Figure 4-7](/en/ch4#fig_column_store), it might make
sense for `product_sk` to be the second sort key so that all sales for the same product on the same
day are grouped together in storage. That will help queries that need to group or filter sales by
product within a certain date range.

Another advantage of sorted order is that it can help with compression of columns. If the primary
sort column does not have many distinct values, then after sorting, it will have long sequences
where the same value is repeated many times in a row. A simple run-length encoding, like we used for
the bitmaps in [Figure 4-8](/en/ch4#fig_bitmap_index), could compress that column down to a few kilobytes—even if
the table has billions of rows.

That compression effect is strongest on the first sort key. The second and third sort keys will be
more jumbled up, and thus not have such long runs of repeated values. Columns further down the
sorting priority appear in essentially random order, so they probably won’t compress as well. But
having the first few columns sorted is still a win overall.

#### Writing to Column-Oriented Storage {#writing-to-column-oriented-storage}

We saw in [“Characterizing Transaction Processing and Analytics”](/en/ch1#sec_introduction_oltp) that reads in data warehouses tend to consist of aggregations
over a large number of rows; column-oriented storage, compression, and sorting all help to make
those read queries faster. Writes in a data warehouse tend to be a bulk import of data, often via an ETL process.

With columnar storage, writing an individual row somewhere in the middle of a sorted table would be
very inefficient, as you would have to rewrite all the compressed columns from the insertion
position onwards. However, a bulk write of many rows at once amortizes the cost of rewriting those
columns, making it efficient.

A log-structured approach is often used to perform writes in batches. All writes first go to a
row-oriented, sorted, in-memory store. When enough writes have accumulated, they are merged with the
column-encoded files on disk and written to new files in bulk. As old files remain immutable, and
new files are written in one go, object storage is well suited for storing these files.

Queries need to examine both the column data on disk and the recent writes in memory, and combine
the two. The query execution engine hides this distinction from the user. From an analyst’s point
of view, data that has been modified with inserts, updates, or deletes is immediately reflected in
subsequent queries. Snowflake, Vertica, Apache Pinot, Apache Druid, and many others do this [^61] [^63] [^64] [^76].


### Query Execution: Compilation and Vectorization {#sec_storage_vectorized}

A complex SQL query for analytics is broken down into a *query plan* consisting of multiple stages,
called *operators*, which may be distributed across multiple machines for parallel execution. Query
planners can perform a lot of optimizations by choosing which operators to use, in which order to
perform them, and where to run each operator.

Within each operator, the query engine needs to do various things with the values in a column, such
as finding all the rows where the value is among a particular set of values (perhaps as part of a
join), or checking whether the value is greater than 15. It also needs to look at several columns
for the same row, for example to find all sales transactions where the product is bananas and the
store is a particular store of interest.

For data warehouse queries that need to scan over millions of rows, we need to worry not only about
the amount of data they need to read off disk, but also the CPU time required to execute complex
operators. The simplest kind of operator is like an interpreter for a programming language: while
iterating over each row, it checks a data structure representing the query to find out which
comparisons or calculations it needs to perform on which columns. Unfortunately, this is too slow
for many analytics purposes. Two alternative approaches for efficient query execution have emerged [^77]:

Query compilation
: The query engine takes the SQL query and generates code for executing it. The code iterates over
 the rows one by one, looks at the values in the columns of interest, performs whatever comparisons
 or calculations are needed, and copies the necessary values to an output buffer if the required
 conditions are satisfied. The query engine compiles the generated code to machine code (often
 using an existing compiler such as LLVM), and then runs it on the column-encoded data that has
 been loaded into memory. This approach to code generation is similar to the just-in-time (JIT)
 compilation approach that is used in the Java Virtual Machine (JVM) and similar runtimes.

Vectorized processing
: The query is interpreted, not compiled, but it is made fast by processing many values from a
 column in a batch, instead of iterating over rows one by one. A fixed set of predefined operators
 are built into the database; we can pass arguments to them and get back a batch of results [^50] [^75].

 For example, we could pass the `product_sk` column and the ID of “bananas” to an equality operator,
 and get back a bitmap (one bit per value in the input column, which is 1 if it’s a banana); we could
 then pass the `store_sk` column and the ID of the store of interest to the same equality operator,
 and get back another bitmap; and then we could pass the two bitmaps to a “bitwise AND” operator, as
 shown in [Figure 4-9](/en/ch4#fig_bitmap_and). The result would be a bitmap containing a 1 for all sales of bananas in
 a particular store.

{{< figure src="/fig/ddia_0409.png" id="fig_bitmap_and" caption="Figure 4-9. A bitwise AND between two bitmaps lends itself to vectorization." class="w-full my-4" >}}

The two approaches are very different in terms of their implementation, but both are used in practice [^77]. Both can achieve very good
performance by taking advantage of the characteristics of modern CPUs:

* preferring sequential memory access over random access to reduce cache misses [^78],
* doing most of the work in tight inner loops (that is, with a small number of instructions and no
 function calls) to keep the CPU instruction processing pipeline busy and avoid branch
 mispredictions,
* making use of parallelism such as multiple threads and single-instruction-multiple-data (SIMD) instructions [^79] [^80], and
* operating directly on compressed data without decoding it into a separate in-memory
 representation, which saves memory allocation and copying costs.

### Materialized Views and Data Cubes {#sec_storage_materialized_views}

We previously encountered *materialized views* in [“Materializing and Updating Timelines”](/en/ch2#sec_introduction_materializing):
in a relational data model, they are table-like objects whose contents are the results of some
query. The difference is that a materialized view is an actual copy of the query results, written to
disk, whereas a virtual view is just a shortcut for writing queries. When you read from a virtual
view, the SQL engine expands it into the view’s underlying query on the fly and then processes the
expanded query.

When the underlying data changes, a materialized view needs to be updated accordingly.
Some databases can do that automatically, and there are also systems such as Materialize that specialize in materialized view maintenance [^81].
Performing such updates means more work on writes, but materialized views can improve read
performance in workloads that repeatedly need to perform the same queries.

*Materialized aggregates* are a type of materialized view that can be useful in data warehouses. As
discussed earlier, data warehouse queries often involve an aggregate function, such as `COUNT`, `SUM`,
`AVG`, `MIN`, or `MAX` in SQL. If the same aggregates are used by many different queries, it can be
wasteful to crunch through the raw data every time. Why not cache some of the counts or sums that
queries use most often? A *data cube* or *OLAP cube* does this by creating a grid of aggregates grouped by different dimensions [^82].
[Figure 4-10](/en/ch4#fig_data_cube) shows an example.

{{< figure src="/fig/ddia_0410.png" id="fig_data_cube" caption="Figure 4-10. Two dimensions of a data cube, aggregating data by summing." class="w-full my-4" >}}

Imagine for now that each fact has foreign keys to only two dimension tables—in [Figure 4-10](/en/ch4#fig_data_cube),
these are `date_key` and `product_sk`. You can now draw a two-dimensional table, with
dates along one axis and products along the other. Each cell contains the aggregate (e.g., `SUM`) of
an attribute (e.g., `net_price`) of all facts with that date-product combination. Then you can apply
the same aggregate along each row or column and get a summary that has been reduced by one
dimension (the sales by product regardless of date, or the sales by date regardless of product).

In general, facts often have more than two dimensions. In [Figure 3-5](/en/ch3#fig_dwh_schema) there are five
dimensions: date, product, store, promotion, and customer. It’s a lot harder to imagine what a
five-dimensional hypercube would look like, but the principle remains the same: each cell contains
the sales for a particular date-product-store-promotion-customer combination. These values can then
repeatedly be summarized along each of the dimensions.

The advantage of a materialized data cube is that certain queries become very fast because they
have effectively been precomputed. For example, if you want to know the total sales per store
yesterday, you just need to look at the totals along the appropriate dimension—no need to scan
millions of rows.

The disadvantage is that a data cube doesn’t have the same flexibility as querying the raw data. For example,
there is no way of calculating which proportion of sales comes from items that cost more than $100,
because the price isn’t one of the dimensions. Most data warehouses therefore try to keep as much
raw data as possible, and use aggregates such as data cubes only as a performance boost for certain queries.


## Multidimensional and Full-Text Indexes {#sec_storage_multidimensional}

The B-trees and LSM-trees we saw in the first half of this chapter allow range queries over a single
attribute: for example, if the key is a username, you can use them as an index to efficiently find
all names starting with an L. But sometimes, searching by a single attribute is not enough.

The most common type of multi-column index is called a *concatenated index*, which simply combines
several fields into one key by appending one column to another (the index definition specifies in
which order the fields are concatenated). This is like an old-fashioned paper phone book, which
provides an index from (*lastname*, *firstname*) to phone number. Due to the sort order, the index
can be used to find all the people with a particular last name, or all the people with a particular
*lastname-firstname* combination. However, the index is useless if you want to find all the people
with a particular first name.

On the other hand, *multi-dimensional indexes* allow you to query several columns at once.
One case where this is particularly important is geospatial data. For example, a restaurant-search
website may have a database containing the latitude and longitude of each restaurant. When a user is
looking at the restaurants on a map, the website needs to search for all the restaurants within the
rectangular map area that the user is currently viewing. This requires a two-dimensional range query
like the following:

```sql
SELECT * FROM restaurants WHERE latitude > 51.4946 AND latitude < 51.5079
    AND longitude > -0.1162 AND longitude < -0.1004;
```

A concatenated index over the latitude and longitude columns is not able to answer that kind of
query efficiently: it can give you either all the restaurants in a range of latitudes (but at any
longitude), or all the restaurants in a range of longitudes (but anywhere between the North and
South poles), but not both simultaneously.

One option is to translate a two-dimensional location into a single number using a space-filling
curve, and then to use a regular B-tree index [^83]. More commonly, specialized spatial indexes such as R-trees or Bkd-trees [^84]
are used; they divide up the space so that nearby data points tend to be grouped in the same subtree. For example, PostGIS implements geospatial indexes as R-trees using PostgreSQL’s
Generalized Search Tree indexing facility [^85]. It is also possible to use regularly spaced grids of triangles, squares, or hexagons [^86].

Multi-dimensional indexes are not just for geographic locations. For example, on an ecommerce
website you could use a three-dimensional index on the dimensions (*red*, *green*, *blue*) to search
for products in a certain range of colors, or in a database of weather observations you could have a
two-dimensional index on (*date*, *temperature*) in order to efficiently search for all the
observations during the year 2013 where the temperature was between 25 and 30℃. With a
one-dimensional index, you would have to either scan over all the records from 2013 (regardless of
temperature) and then filter them by temperature, or vice versa. A 2D index could narrow down by
timestamp and temperature simultaneously [^87].

### Full-Text Search {#sec_storage_full_text}

Full-text search allows you to search a collection of text documents (web pages, product
descriptions, etc.) by keywords that might appear anywhere in the text [^88].
Information retrieval is a big, specialist topic that often involves language-specific processing:
for example, several Asian languages are written without spaces or punctuation between words, and
therefore splitting text into words requires a model that indicates which character sequences
constitute a word. Full-text search also often involves matching words that are similar but not
identical (such as typos or different grammatical forms of words) and synonyms. Those problems go
beyond the scope of this book.

However, at its core, you can think of full-text search as another kind of multidimensional query:
in this case, each word that might appear in a text (a *term*) is a dimension. A document that
contains term *x* has a value of 1 in dimension *x*, and a document that doesn’t contain *x* has a
value of 0. Searching for documents mentioning “red apples” means a query that looks for a 1 in the
*red* dimension, and simultaneously a 1 in the *apples* dimension. The number of dimensions may thus be very large.

The data structure that many search engines use to answer such queries is called an *inverted
index*. This is a key-value structure where the key is a term, and the value is the list of IDs of
all the documents that contain the term (the *postings list*). If the document IDs are sequential
numbers, the postings list can also be represented as a sparse bitmap, like in [Figure 4-8](/en/ch4#fig_bitmap_index):
the *n*th bit in the bitmap for term *x* is a 1 if the document with ID *n* contains the term *x* [^89].

Finding all the documents that contain both terms *x* and *y* is now similar to a vectorized data
warehouse query that searches for rows matching two conditions ([Figure 4-9](/en/ch4#fig_bitmap_and)): load the two
bitmaps for terms *x* and *y* and compute their bitwise AND. Even if the bitmaps are run-length
encoded, this can be done very efficiently.

For example, Lucene, the full-text indexing engine used by Elasticsearch and Solr, works like this [^90].
It stores the mapping from term to postings list in SSTable-like sorted files, which are merged in
the background using the same log-structured approach we saw earlier in this chapter [^91].
PostgreSQL’s GIN index type also uses postings lists to support full-text search and indexing inside
JSON documents [^92] [^93].

Instead of breaking text into words, an alternative is to find all the substrings of length *n*,
which are called *n*-grams. For example, the trigrams (*n* = 3) of the string
`"hello"` are `"hel"`, `"ell"`, and `"llo"`. If we build an inverted index of all trigrams, we can
search the documents for arbitrary substrings that are at least three characters long. Trigram
indexes even allow regular expressions in search queries; the downside is that they are quite large [^94].

To cope with typos in documents or queries, Lucene is able to search text for words within a certain
edit distance (an edit distance of 1 means that one letter has been added, removed, or replaced) [^95].
It does this by storing the set of terms as a finite state automaton over the characters in the keys, similar to a *trie* [^96],
and transforming it into a *Levenshtein automaton*, which supports efficient search for words within a given edit distance [^97].


### Vector Embeddings {#id92}

Semantic search goes beyond synonyms and typos to try and understand document concepts
and user intentions. For example, if your help pages contain a page titled “cancelling your
subscription”, users should still be able to find that page when searching for “how to close my
account” or “terminate contract”, which are close in terms of meaning even though they use
completely different words.

To understand a document’s semantics—its meaning—semantic search indexes use embedding models to
translate a document into a vector of floating-point values, called a *vector embedding*. The vector
represents a point in a multi-dimensional space, and each floating-point value represents the document’s
location along one dimension’s axis. Embedding models generate vector embeddings that are near
each other (in this multi-dimensional space) when the embedding’s input documents are semantically
similar.

--------

> [!NOTE]
> We saw the term *vectorized processing* in [“Query Execution: Compilation and Vectorization”](/en/ch4#sec_storage_vectorized).
> Vectors in semantic search have a different meaning. In vectorized processing, the vector refers to
> a batch of bits that can be processed with specially optimized code. In embedding models, a vector is a list of
> floating-point numbers that represents a location in multi-dimensional space.

--------

For example, a three-dimensional vector embedding for a Wikipedia page about agriculture might be
`[0.1, 0.22, 0.11]`. A Wikipedia page about vegetables would be quite near, perhaps with an embedding
of `[0.13, 0.19, 0.24]`. A page about star schemas might have an embedding of `[0.82, 0.39, -0.74]`,
comparatively far away. We can tell by looking that the first two vectors are closer to each other than either is to the third.

Embedding models use much larger vectors (often over 1,000 numbers), but the principles are the
same. We don’t try to understand what the individual numbers mean;
they’re simply a way for embedding models to point to a location in an abstract multi-dimensional
space. Search engines use distance functions such as cosine similarity or Euclidean distance to
measure the distance between vectors. Cosine similarity measures the cosine of the angle between two
vectors to determine how close they are, while Euclidean distance measures the straight-line
distance between two points in space.

Many early embedding models such as Word2Vec [^98], BERT [^99], and GPT [^100]
worked with text data. Such models are usually implemented as neural networks. Researchers went on to
create embedding models for video, audio, and images as well. More recently, model
architecture has become *multimodal*: a single model can generate vector embeddings for multiple
modalities such as text and images.

Semantic search engines use an embedding model to generate a vector embedding when a user enters a
query. The user’s query and related context (such as a user’s location) are fed into the embedding
model. After the embedding model generates the query’s vector embedding, the search engine must find
documents with similar vector embeddings using a vector index.

Vector indexes store the vector embeddings of a collection of documents. To query the index, you
pass in the vector embedding of the query, and the index returns the documents whose vectors are
closest to the query vector. Since the R-trees we saw previously don’t work well for vectors with
many dimensions, specialized vector indexes are used, such as:

Flat indexes
: Vectors are stored in the index as they are. A query must read every vector and measure its
 distance to the query vector. Flat indexes are accurate, but measuring the distance between the
 query and each vector is slow.

Inverted file (IVF) indexes
: The vector space is clustered into partitions (called *centroids*) of vectors to reduce the number
 of vectors that must be compared. IVF indexes are faster than flat indexes, but can give only
 approximate results: the query and a document may fall into different partitions, even though they
 are close to each other. A query on an IVF index first defines *probes*, which are simply the number
 of partitions to check. Queries that use more probes will be more accurate, but will be slower, as
 more vectors must be compared.

Hierarchical Navigable Small World (HNSW)
: HNSW indexes maintain multiple layers of the vector space, as illustrated in [Figure 4-11](/en/ch4#fig_vector_hnsw).
 Each layer is represented as a graph, where nodes represent vectors, and edges represent proximity
 to nearby vectors. A query starts by locating the nearest vector in the topmost layer, which has a
 small number of nodes. The query then moves to the same node in the layer below and follows the
 edges in that layer, which is more densely connected, looking for a vector that is closer to the
 query vector. The process continues until the last layer is reached. As with IVF indexes, HNSW
 indexes are approximate.

{{< figure src="/fig/ddia_0411.png" id="fig_vector_hnsw" caption="Figure 4-11. Searching for the database entry that is closest to a given query vector in an HNSW index." class="w-full my-4" >}}


Many popular vector databases implement IVF and HNSW indexes. Facebook’s Faiss library has many variations of each [^101], and PostgreSQL’s pgvector supports both as well [^102].
The full details of the IVF and HNSW algorithms are beyond the scope of this book, but their papers are an excellent resource [^103] [^104].

## Summary {#summary}

In this chapter we tried to get to the bottom of how databases perform storage and retrieval. What
happens when you store data in a database, and what does the database do when you query for the
data again later?

[“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics) introduced the distinction between transaction processing (OLTP) and
analytics (OLAP). In this chapter we saw that storage engines optimized for OLTP look very different
from those optimized for analytics:

* OLTP systems are optimized for a high volume of requests, each of which reads and writes a small
 number of records, and which need fast responses. The records are typically accessed via a primary
 key or a secondary index, and these indexes are typically ordered mappings from key to record,
 which also support range queries.
* Data warehouses and similar analytic systems are optimized for complex read queries that scan over
 a large number of records. They generally use a column-oriented storage layout with compression
 that minimizes the amount of data that such a query needs to read off disk, and just-in-time
 compilation of queries or vectorization to minimize the amount of CPU time spent processing the
 data.

On the OLTP side, we saw storage engines from two main schools of thought:

* The log-structured approach, which only permits appending to files and deleting obsolete files,
 but never updates a file that has been written. SSTables, LSM-trees, RocksDB, Cassandra, HBase,
 Scylla, Lucene, and others belong to this group. In general, log-structured storage engines tend
 to provide high write throughput.
* The update-in-place approach, which treats the disk as a set of fixed-size pages that can be
 overwritten. B-trees, the biggest example of this philosophy, are used in all major relational
 OLTP databases and also many nonrelational ones. As a rule of thumb, B-trees tend to be better for
 reads, providing higher read throughput and lower response times than log-structured storage.

We then looked at indexes that can search for multiple conditions at the same time: multidimensional
indexes such as R-trees that can search for points on a map by latitude and longitude at the same
time, and full-text search indexes that can search for multiple keywords appearing in the same text.
Finally, vector databases are used for semantic search on text documents and other media; they use
vectors with a larger number of dimensions and find similar documents by comparing vector
similarity.

As an application developer, if you’re armed with this knowledge about the internals of storage
engines, you are in a much better position to know which tool is best suited for your particular
application. If you need to adjust a database’s tuning parameters, this understanding allows you to
imagine what effect a higher or a lower value may have.

Although this chapter couldn’t make you an expert in tuning any one particular storage engine, it
has hopefully equipped you with enough vocabulary and ideas that you can make sense of the
documentation for the database of your choice.


### References

[^1]: Nikolay Samokhvalov. [How partial, covering, and multicolumn indexes may slow down UPDATEs in PostgreSQL](https://postgres.ai/blog/20211029-how-partial-and-covering-indexes-affect-update-performance-in-postgresql). *postgres.ai*, October 2021. Archived at [perma.cc/PBK3-F4G9](https://perma.cc/PBK3-F4G9) 
[^2]: Goetz Graefe. [Modern B-Tree Techniques](https://w6113.github.io/files/papers/btreesurvey-graefe.pdf). *Foundations and Trends in Databases*, volume 3, issue 4, pages 203–402, August 2011. [doi:10.1561/1900000028](https://doi.org/10.1561/1900000028) 
[^3]: Evan Jones. [Why databases use ordered indexes but programming uses hash tables](https://www.evanjones.ca/ordered-vs-unordered-indexes.html). *evanjones.ca*, December 2019. Archived at [perma.cc/NJX8-3ZZD](https://perma.cc/NJX8-3ZZD) 
[^4]: Branimir Lambov. [CEP-25: Trie-indexed SSTable format](https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-25%3A%2BTrie-indexed%2BSSTable%2Bformat). *cwiki.apache.org*, November 2022. Archived at [perma.cc/HD7W-PW8U](https://perma.cc/HD7W-PW8U). Linked Google Doc archived at [perma.cc/UL6C-AAAE](https://perma.cc/UL6C-AAAE) 
[^5]: Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein. *Introduction to Algorithms*, 3rd edition. MIT Press, 2009. ISBN: 978-0-262-53305-8 
[^6]: Branimir Lambov. [Trie Memtables in Cassandra](https://www.vldb.org/pvldb/vol15/p3359-lambov.pdf). *Proceedings of the VLDB Endowment*, volume 15, issue 12, pages 3359–3371, August 2022. [doi:10.14778/3554821.3554828](https://doi.org/10.14778/3554821.3554828) 
[^7]: Dhruba Borthakur. [The History of RocksDB](https://rocksdb.blogspot.com/2013/11/the-history-of-rocksdb.html). *rocksdb.blogspot.com*, November 2013. Archived at [perma.cc/Z7C5-JPSP](https://perma.cc/Z7C5-JPSP) 
[^8]: Matteo Bertozzi. [Apache HBase I/O – HFile](https://blog.cloudera.com/apache-hbase-i-o-hfile/). *blog.cloudera.com*, June 2012. Archived at [perma.cc/U9XH-L2KL](https://perma.cc/U9XH-L2KL) 
[^9]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006. 
[^10]: Patrick O’Neil, Edward Cheng, Dieter Gawlick, and Elizabeth O’Neil. [The Log-Structured Merge-Tree (LSM-Tree)](https://www.cs.umb.edu/~poneil/lsmtree.pdf). *Acta Informatica*, volume 33, issue 4, pages 351–385, June 1996. [doi:10.1007/s002360050048](https://doi.org/10.1007/s002360050048) 
[^11]: Mendel Rosenblum and John K. Ousterhout. [The Design and Implementation of a Log-Structured File System](https://research.cs.wisc.edu/areas/os/Qual/papers/lfs.pdf). *ACM Transactions on Computer Systems*, volume 10, issue 1, pages 26–52, February 1992. [doi:10.1145/146941.146943](https://doi.org/10.1145/146941.146943) 
[^12]: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu, Mukul Murthy, Joseph Torres, Herman van Hovell, Adrian Ionescu, Alicja Łuszczak, Michał Świtakowski, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, and Matei Zaharia. [Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores](https://vldb.org/pvldb/vol13/p3411-armbrust.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3411–3424, August 2020. [doi:10.14778/3415478.3415560](https://doi.org/10.14778/3415478.3415560) 
[^13]: Burton H. Bloom. [Space/Time Trade-offs in Hash Coding with Allowable Errors](https://people.cs.umass.edu/~emery/classes/cmpsci691st/readings/Misc/p422-bloom.pdf). *Communications of the ACM*, volume 13, issue 7, pages 422–426, July 1970. [doi:10.1145/362686.362692](https://doi.org/10.1145/362686.362692) 
[^14]: Adam Kirsch and Michael Mitzenmacher. [Less Hashing, Same Performance: Building a Better Bloom Filter](https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf). *Random Structures & Algorithms*, volume 33, issue 2, pages 187–218, September 2008. [doi:10.1002/rsa.20208](https://doi.org/10.1002/rsa.20208) 
[^15]: Thomas Hurst. [Bloom Filter Calculator](https://hur.st/bloomfilter/). *hur.st*, September 2023. Archived at [perma.cc/L3AV-6VC2](https://perma.cc/L3AV-6VC2) 
[^16]: Chen Luo and Michael J. Carey. [LSM-based storage techniques: a survey](https://arxiv.org/abs/1812.07527). *The VLDB Journal*, volume 29, pages 393–418, July 2019. [doi:10.1007/s00778-019-00555-y](https://doi.org/10.1007/s00778-019-00555-y) 
[^17]: Subhadeep Sarkar and Manos Athanassoulis. [Dissecting, Designing, and Optimizing LSM-based Data Stores](https://www.youtube.com/watch?v=hkMkBZn2mGs). Tutorial at *ACM International Conference on Management of Data* (SIGMOD), June 2022. Slides archived at [perma.cc/93B3-E827](https://perma.cc/93B3-E827) 
[^18]: Mark Callaghan. [Name that compaction algorithm](https://smalldatum.blogspot.com/2018/08/name-that-compaction-algorithm.html). *smalldatum.blogspot.com*, August 2018. Archived at [perma.cc/CN4M-82DY](https://perma.cc/CN4M-82DY) 
[^19]: Prashanth Rao. [Embedded databases (1): The harmony of DuckDB, KùzuDB and LanceDB](https://thedataquarry.com/posts/embedded-db-1/). *thedataquarry.com*, August 2023. Archived at [perma.cc/PA28-2R35](https://perma.cc/PA28-2R35) 
[^20]: Hacker News discussion. [Bluesky migrates to single-tenant SQLite](https://news.ycombinator.com/item?id=38171322). *news.ycombinator.com*, October 2023. Archived at [perma.cc/69LM-5P6X](https://perma.cc/69LM-5P6X) 
[^21]: Rudolf Bayer and Edward M. McCreight. [Organization and Maintenance of Large Ordered Indices](https://dl.acm.org/doi/pdf/10.1145/1734663.1734671). Boeing Scientific Research Laboratories, Mathematical and Information Sciences Laboratory, report no. 20, July 1970. [doi:10.1145/1734663.1734671](https://doi.org/10.1145/1734663.1734671) 
[^22]: Douglas Comer. [The Ubiquitous B-Tree](https://web.archive.org/web/20170809145513id_/http%3A//sites.fas.harvard.edu/~cs165/papers/comer.pdf). *ACM Computing Surveys*, volume 11, issue 2, pages 121–137, June 1979. [doi:10.1145/356770.356776](https://doi.org/10.1145/356770.356776) 
[^23]: Alex Miller. [Torn Write Detection and Protection](https://transactional.blog/blog/2025-torn-writes). *transactional.blog*, April 2025. Archived at [perma.cc/G7EB-33EW](https://perma.cc/G7EB-33EW) 
[^24]: C. Mohan and Frank Levine. [ARIES/IM: An Efficient and High Concurrency Index Management Method Using Write-Ahead Logging](https://ics.uci.edu/~cs223/papers/p371-mohan.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 1992. [doi:10.1145/130283.130338](https://doi.org/10.1145/130283.130338) 
[^25]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017. 
[^26]: Howard Chu. [LDAP at Lightning Speed](https://buildstuff14.sched.com/event/08a1a368e272eb599a52e08b4c3c779d). At *Build Stuff ’14*, November 2014. Archived at [perma.cc/GB6Z-P8YH](https://perma.cc/GB6Z-P8YH) 
[^27]: Manos Athanassoulis, Michael S. Kester, Lukas M. Maas, Radu Stoica, Stratos Idreos, Anastasia Ailamaki, and Mark Callaghan. [Designing Access Methods: The RUM Conjecture](https://openproceedings.org/2016/conf/edbt/paper-12.pdf). At *19th International Conference on Extending Database Technology* (EDBT), March 2016. [doi:10.5441/002/edbt.2016.42](https://doi.org/10.5441/002/edbt.2016.42) 
[^28]: Ben Stopford. [Log Structured Merge Trees](http://www.benstopford.com/2015/02/14/log-structured-merge-trees/). *benstopford.com*, February 2015. Archived at [perma.cc/E5BV-KUJ6](https://perma.cc/E5BV-KUJ6) 
[^29]: Mark Callaghan. [The Advantages of an LSM vs a B-Tree](https://smalldatum.blogspot.com/2016/01/summary-of-advantages-of-lsm-vs-b-tree.html). *smalldatum.blogspot.com*, January 2016. Archived at [perma.cc/3TYZ-EFUD](https://perma.cc/3TYZ-EFUD) 
[^30]: Oana Balmau, Florin Dinu, Willy Zwaenepoel, Karan Gupta, Ravishankar Chandhiramoorthi, and Diego Didona. [SILK: Preventing Latency Spikes in Log-Structured Merge Key-Value Stores](https://www.usenix.org/conference/atc19/presentation/balmau). At *USENIX Annual Technical Conference*, July 2019. 
[^31]: Igor Canadi, Siying Dong, Mark Callaghan, et al. [RocksDB Tuning Guide](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide). *github.com*, 2023. Archived at [perma.cc/UNY4-MK6C](https://perma.cc/UNY4-MK6C) 
[^32]: Gabriel Haas and Viktor Leis. [What Modern NVMe Storage Can Do, and How to Exploit it: High-Performance I/O for High-Performance Storage Engines](https://www.vldb.org/pvldb/vol16/p2090-haas.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 9, pages 2090–2102. [doi:10.14778/3598581.3598584](https://doi.org/10.14778/3598581.3598584) 
[^33]: Emmanuel Goossaert. [Coding for SSDs](https://codecapsule.com/2014/02/12/coding-for-ssds-part-1-introduction-and-table-of-contents/). *codecapsule.com*, February 2014. 
[^34]: Jack Vanlightly. [Is sequential IO dead in the era of the NVMe drive?](https://jack-vanlightly.com/blog/2023/5/9/is-sequential-io-dead-in-the-era-of-the-nvme-drive) *jack-vanlightly.com*, May 2023. Archived at [perma.cc/7TMZ-TAPU](https://perma.cc/7TMZ-TAPU) 
[^35]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (2)](https://www.alibabacloud.com/blog/594376). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230510065132/https%3A//www.alibabacloud.com/blog/594376) 
[^36]: Xiao-Yu Hu and Robert Haas. [The Fundamental Limit of Flash Random Write Performance: Understanding, Analysis and Performance Modelling](https://dominoweb.draco.res.ibm.com/reports/rz3771.pdf). *dominoweb.draco.res.ibm.com*, March 2010. Archived at [perma.cc/8JUL-4ZDS](https://perma.cc/8JUL-4ZDS) 
[^37]: Lanyue Lu, Thanumalayan Sankaranarayana Pillai, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [WiscKey: Separating Keys from Values in SSD-conscious Storage](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf). At *4th USENIX Conference on File and Storage Technologies* (FAST), February 2016. 
[^38]: Peter Zaitsev. [Innodb Double Write](https://www.percona.com/blog/innodb-double-write/). *percona.com*, August 2006. Archived at [perma.cc/NT4S-DK7T](https://perma.cc/NT4S-DK7T) 
[^39]: Tomas Vondra. [On the Impact of Full-Page Writes](https://www.2ndquadrant.com/en/blog/on-the-impact-of-full-page-writes/). *2ndquadrant.com*, November 2016. Archived at [perma.cc/7N6B-CVL3](https://perma.cc/7N6B-CVL3) 
[^40]: Mark Callaghan. [Read, write & space amplification - B-Tree vs LSM](https://smalldatum.blogspot.com/2015/11/read-write-space-amplification-b-tree.html). *smalldatum.blogspot.com*, November 2015. Archived at [perma.cc/S487-WK5P](https://perma.cc/S487-WK5P) 
[^41]: Mark Callaghan. [Choosing Between Efficiency and Performance with RocksDB](https://codemesh.io/codemesh2016/mark-callaghan). At *Code Mesh*, November 2016. Video at [youtube.com/watch?v=tgzkgZVXKB4](https://www.youtube.com/watch?v=tgzkgZVXKB4) 
[^42]: Subhadeep Sarkar, Tarikul Islam Papon, Dimitris Staratzis, Zichen Zhu, and Manos Athanassoulis. [Enabling Timely and Persistent Deletion in LSM-Engines](https://subhadeep.net/assets/fulltext/Enabling_Timely_and_Persistent_Deletion_in_LSM-Engines.pdf). *ACM Transactions on Database Systems*, volume 48, issue 3, article no. 8, August 2023. [doi:10.1145/3599724](https://doi.org/10.1145/3599724) 
[^43]: Lukas Fittl. [Postgres vs. SQL Server: B-Tree Index Differences & the Benefit of Deduplication](https://pganalyze.com/blog/postgresql-vs-sql-server-btree-index-deduplication). *pganalyze.com*, April 2025. Archived at [perma.cc/XY6T-LTPX](https://perma.cc/XY6T-LTPX) 
[^44]: Drew Silcock. [How Postgres stores data on disk – this one’s a page turner](https://drew.silcock.dev/blog/how-postgres-stores-data-on-disk/). *drew.silcock.dev*, August 2024. Archived at [perma.cc/8K7K-7VJ2](https://perma.cc/8K7K-7VJ2) 
[^45]: Joe Webb. [Using Covering Indexes to Improve Query Performance](https://www.red-gate.com/simple-talk/databases/sql-server/learn/using-covering-indexes-to-improve-query-performance/). *simple-talk.com*, September 2008. Archived at [perma.cc/6MEZ-R5VR](https://perma.cc/6MEZ-R5VR) 
[^46]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007. 
[^47]: [VoltDB Technical Overview White Paper](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-white-paper-voltdb-technical-overview.pdf). VoltDB, 2017. Archived at [perma.cc/B9SF-SK5G](https://perma.cc/B9SF-SK5G) 
[^48]: Stephen M. Rumble, Ankita Kejriwal, and John K. Ousterhout. [Log-Structured Memory for DRAM-Based Storage](https://www.usenix.org/system/files/conference/fast14/fast14-paper_rumble.pdf). At *12th USENIX Conference on File and Storage Technologies* (FAST), February 2014. 
[^49]: Stavros Harizopoulos, Daniel J. Abadi, Samuel Madden, and Michael Stonebraker. [OLTP Through the Looking Glass, and What We Found There](https://hstore.cs.brown.edu/papers/hstore-lookingglass.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376713](https://doi.org/10.1145/1376616.1376713) 
[^50]: Per-Åke Larson, Cipri Clinciu, Campbell Fraser, Eric N. Hanson, Mostafa Mokhtar, Michal Nowakiewicz, Vassilis Papadimos, Susan L. Price, Srikumar Rangarajan, Remus Rusanu, and Mayukh Saubhasik. [Enhancements to SQL Server Column Stores](https://web.archive.org/web/20131203001153id_/http%3A//research.microsoft.com/pubs/193599/Apollo3%20-%20Sigmod%202013%20-%20final.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2463708](https://doi.org/10.1145/2463676.2463708) 
[^51]: Franz Färber, Norman May, Wolfgang Lehner, Philipp Große, Ingo Müller, Hannes Rauhe, and Jonathan Dees. [The SAP HANA Database – An Architecture Overview](https://web.archive.org/web/20220208081111id_/http%3A//sites.computer.org/debull/A12mar/hana.pdf). *IEEE Data Engineering Bulletin*, volume 35, issue 1, pages 28–33, March 2012. 
[^52]: Michael Stonebraker. [The Traditional RDBMS Wisdom Is (Almost Certainly) All Wrong](https://slideshot.epfl.ch/talks/166). Presentation at *EPFL*, May 2013. 
[^53]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/pdf/10.1145/3514221.3526055). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055) 
[^54]: Tino Tereshko and Jordan Tigani. [BigQuery under the hood](https://cloud.google.com/blog/products/bigquery/bigquery-under-the-hood). *cloud.google.com*, January 2016. Archived at [perma.cc/WP2Y-FUCF](https://perma.cc/WP2Y-FUCF) 
[^55]: Wes McKinney. [The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future](https://wesmckinney.com/blog/looking-back-15-years/). *wesmckinney.com*, September 2023. Archived at [perma.cc/6L2M-GTJX](https://perma.cc/6L2M-GTJX) 
[^56]: Michael Stonebraker, Daniel J. Abadi, Adam Batkin, Xuedong Chen, Mitch Cherniack, Miguel Ferreira, Edmond Lau, Amerson Lin, Sam Madden, Elizabeth O’Neil, Pat O’Neil, Alex Rasin, Nga Tran, and Stan Zdonik. [C-Store: A Column-oriented DBMS](https://www.vldb.org/archives/website/2005/program/paper/thu/p553-stonebraker.pdf). At *31st International Conference on Very Large Data Bases* (VLDB), pages 553–564, September 2005. 
[^57]: Julien Le Dem. [Dremel Made Simple with Parquet](https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html). *blog.twitter.com*, September 2013. 
[^58]: Sergey Melnik, Andrey Gubarev, Jing Jing Long, Geoffrey Romer, Shiva Shivakumar, Matt Tolton, and Theo Vassilakis. [Dremel: Interactive Analysis of Web-Scale Datasets](https://vldb.org/pvldb/vol3/R29.pdf). At *36th International Conference on Very Large Data Bases* (VLDB), pages 330–339, September 2010. [doi:10.14778/1920841.1920886](https://doi.org/10.14778/1920841.1920886) 
[^59]: Joe Kearney. [Understanding Record Shredding: storing nested data in columns](https://www.joekearney.co.uk/posts/understanding-record-shredding). *joekearney.co.uk*, December 2016. Archived at [perma.cc/ZD5N-AX5D](https://perma.cc/ZD5N-AX5D) 
[^60]: Jamie Brandon. [A shallow survey of OLAP and HTAP query engines](https://www.scattered-thoughts.net/writing/a-shallow-survey-of-olap-and-htap-query-engines). *scattered-thoughts.net*, September 2023. Archived at [perma.cc/L3KH-J4JF](https://perma.cc/L3KH-J4JF) 
[^61]: Benoit Dageville, Thierry Cruanes, Marcin Zukowski, Vadim Antonov, Artin Avanes, Jon Bock, Jonathan Claybaugh, Daniel Engovatov, Martin Hentschel, Jiansheng Huang, Allison W. Lee, Ashish Motivala, Abdul Q. Munir, Steven Pelley, Peter Povinec, Greg Rahn, Spyridon Triantafyllis, and Philipp Unterbrunner. [The Snowflake Elastic Data Warehouse](https://dl.acm.org/doi/pdf/10.1145/2882903.2903741). At *ACM International Conference on Management of Data* (SIGMOD), pages 215–226, June 2016. [doi:10.1145/2882903.2903741](https://doi.org/10.1145/2882903.2903741) 
[^62]: Mark Raasveldt and Hannes Mühleisen. [Data Management for Data Science Towards Embedded Analytics](https://duckdb.org/pdf/CIDR2020-raasveldt-muehleisen-duckdb.pdf). At *10th Conference on Innovative Data Systems Research* (CIDR), January 2020. 
[^63]: Jean-François Im, Kishore Gopalakrishna, Subbu Subramaniam, Mayank Shrivastava, Adwait Tumbde, Xiaotian Jiang, Jennifer Dai, Seunghyun Lee, Neha Pawar, Jialiang Li, and Ravi Aringunram. [Pinot: Realtime OLAP for 530 Million Users](https://cwiki.apache.org/confluence/download/attachments/103092375/Pinot.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 583–594, May 2018. [doi:10.1145/3183713.3190661](https://doi.org/10.1145/3183713.3190661) 
[^64]: Fangjin Yang, Eric Tschetter, Xavier Léauté, Nelson Ray, Gian Merlino, and Deep Ganguli. [Druid: A Real-time Analytical Data Store](https://static.druid.io/docs/druid.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2014. [doi:10.1145/2588555.2595631](https://doi.org/10.1145/2588555.2595631) 
[^65]: Chunwei Liu, Anna Pavlenko, Matteo Interlandi, and Brandon Haynes. [Deep Dive into Common Open Formats for Analytical DBMSs](https://www.vldb.org/pvldb/vol16/p3044-liu.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 11, pages 3044–3056, July 2023. [doi:10.14778/3611479.3611507](https://doi.org/10.14778/3611479.3611507) 
[^66]: Xinyu Zeng, Yulong Hui, Jiahong Shen, Andrew Pavlo, Wes McKinney, and Huanchen Zhang. [An Empirical Evaluation of Columnar Storage Formats](https://www.vldb.org/pvldb/vol17/p148-zeng.pdf). *Proceedings of the VLDB Endowment*, volume 17, issue 2, pages 148–161. [doi:10.14778/3626292.3626298](https://doi.org/10.14778/3626292.3626298) 
[^67]: Weston Pace. [Lance v2: A columnar container format for modern data](https://blog.lancedb.com/lance-v2/). *blog.lancedb.com*, April 2024. Archived at [perma.cc/ZK3Q-S9VJ](https://perma.cc/ZK3Q-S9VJ) 
[^68]: Yoav Helfman. [Nimble, A New Columnar File Format](https://www.youtube.com/watch?v=bISBNVtXZ6M). At *VeloxCon*, April 2024. 
[^69]: Wes McKinney. [Apache Arrow: High-Performance Columnar Data Framework](https://www.youtube.com/watch?v=YhF8YR0OEFk). At *CMU Database Group – Vaccination Database Tech Talks*, December 2021. 
[^70]: Wes McKinney. [Python for Data Analysis, 3rd Edition](https://learning.oreilly.com/library/view/python-for-data/9781098104023/). O’Reilly Media, August 2022. ISBN: 9781098104023 
[^71]: Paul Dix. [The Design of InfluxDB IOx: An In-Memory Columnar Database Written in Rust with Apache Arrow](https://www.youtube.com/watch?v=_zbwz-4RDXg). At *CMU Database Group – Vaccination Database Tech Talks*, May 2021. 
[^72]: Carlota Soto and Mike Freedman. [Building Columnar Compression for Large PostgreSQL Databases](https://www.timescale.com/blog/building-columnar-compression-in-a-row-oriented-database/). *timescale.com*, March 2024. Archived at [perma.cc/7KTF-V3EH](https://perma.cc/7KTF-V3EH) 
[^73]: Daniel Lemire, Gregory Ssi‐Yan‐Kai, and Owen Kaser. [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/pdf/1603.06549). *Software: Practice and Experience*, volume 46, issue 11, pages 1547–1569, November 2016. [doi:10.1002/spe.2402](https://doi.org/10.1002/spe.2402) 
[^74]: Jaz Volpert. [An entire Social Network in 1.6GB (GraphD Part 2)](https://jazco.dev/2024/04/20/roaring-bitmaps/). *jazco.dev*, April 2024. Archived at [perma.cc/L27Z-QVMG](https://perma.cc/L27Z-QVMG) 
[^75]: Daniel J. Abadi, Peter Boncz, Stavros Harizopoulos, Stratos Idreos, and Samuel Madden. [The Design and Implementation of Modern Column-Oriented Database Systems](https://www.cs.umd.edu/~abadi/papers/abadi-column-stores.pdf). *Foundations and Trends in Databases*, volume 5, issue 3, pages 197–280, December 2013. [doi:10.1561/1900000024](https://doi.org/10.1561/1900000024) 
[^76]: Andrew Lamb, Matt Fuller, Ramakrishna Varadarajan, Nga Tran, Ben Vandiver, Lyric Doshi, and Chuck Bear. [The Vertica Analytic Database: C-Store 7 Years Later](https://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 12, pages 1790–1801, August 2012. [doi:10.14778/2367502.2367518](https://doi.org/10.14778/2367502.2367518) 
[^77]: Timo Kersten, Viktor Leis, Alfons Kemper, Thomas Neumann, Andrew Pavlo, and Peter Boncz. [Everything You Always Wanted to Know About Compiled and Vectorized Queries But Were Afraid to Ask](https://www.vldb.org/pvldb/vol11/p2209-kersten.pdf). *Proceedings of the VLDB Endowment*, volume 11, issue 13, pages 2209–2222, September 2018. [doi:10.14778/3275366.3284966](https://doi.org/10.14778/3275366.3284966) 
[^78]: Forrest Smith. [Memory Bandwidth Napkin Math](https://www.forrestthewoods.com/blog/memory-bandwidth-napkin-math/). *forrestthewoods.com*, February 2020. Archived at [perma.cc/Y8U4-PS7N](https://perma.cc/Y8U4-PS7N) 
[^79]: Peter Boncz, Marcin Zukowski, and Niels Nes. [MonetDB/X100: Hyper-Pipelining Query Execution](https://www.cidrdb.org/cidr2005/papers/P19.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005. 
[^80]: Jingren Zhou and Kenneth A. Ross. [Implementing Database Operations Using SIMD Instructions](https://www1.cs.columbia.edu/~kar/pubsk/simd.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 145–156, June 2002. [doi:10.1145/564691.564709](https://doi.org/10.1145/564691.564709) 
[^81]: Kevin Bartley. [OLTP Queries: Transfer Expensive Workloads to Materialize](https://materialize.com/blog/oltp-queries/). *materialize.com*, August 2024. Archived at [perma.cc/4TYM-TYD8](https://perma.cc/4TYM-TYD8) 
[^82]: Jim Gray, Surajit Chaudhuri, Adam Bosworth, Andrew Layman, Don Reichart, Murali Venkatrao, Frank Pellow, and Hamid Pirahesh. [Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals](https://arxiv.org/pdf/cs/0701155). *Data Mining and Knowledge Discovery*, volume 1, issue 1, pages 29–53, March 1997. [doi:10.1023/A:1009726021843](https://doi.org/10.1023/A%3A1009726021843) 
[^83]: Frank Ramsak, Volker Markl, Robert Fenk, Martin Zirkel, Klaus Elhardt, and Rudolf Bayer. [Integrating the UB-Tree into a Database System Kernel](https://www.vldb.org/conf/2000/P263.pdf). At *26th International Conference on Very Large Data Bases* (VLDB), September 2000. 
[^84]: Octavian Procopiuc, Pankaj K. Agarwal, Lars Arge, and Jeffrey Scott Vitter. [Bkd-Tree: A Dynamic Scalable kd-Tree](https://users.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf). At *8th International Symposium on Spatial and Temporal Databases* (SSTD), pages 46–65, July 2003. [doi:10.1007/978-3-540-45072-6\_4](https://doi.org/10.1007/978-3-540-45072-6_4) 
[^85]: Joseph M. Hellerstein, Jeffrey F. Naughton, and Avi Pfeffer. [Generalized Search Trees for Database Systems](https://dsf.berkeley.edu/papers/vldb95-gist.pdf). At *21st International Conference on Very Large Data Bases* (VLDB), September 1995. 
[^86]: Isaac Brodsky. [H3: Uber’s Hexagonal Hierarchical Spatial Index](https://eng.uber.com/h3/). *eng.uber.com*, June 2018. Archived at [archive.org](https://web.archive.org/web/20240722003854/https%3A//www.uber.com/blog/h3/) 
[^87]: Robert Escriva, Bernard Wong, and Emin Gün Sirer. [HyperDex: A Distributed, Searchable Key-Value Store](https://www.cs.princeton.edu/courses/archive/fall13/cos518/papers/hyperdex.pdf). At *ACM SIGCOMM Conference*, August 2012. [doi:10.1145/2377677.2377681](https://doi.org/10.1145/2377677.2377681) 
[^88]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/) 
[^89]: Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, and Steven Swanson. [An Experimental Study of Bitmap Compression vs. Inverted List Compression](https://cseweb.ucsd.edu/~swanson/papers/SIGMOD2017-ListCompression.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 993–1008, May 2017. [doi:10.1145/3035918.3064007](https://doi.org/10.1145/3035918.3064007) 
[^90]: Adrien Grand. [What is in a Lucene Index?](https://speakerdeck.com/elasticsearch/what-is-in-a-lucene-index) At *Lucene/Solr Revolution*, November 2013. Archived at [perma.cc/Z7QN-GBYY](https://perma.cc/Z7QN-GBYY) 
[^91]: Michael McCandless. [Visualizing Lucene’s Segment Merges](https://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html). *blog.mikemccandless.com*, February 2011. Archived at [perma.cc/3ZV8-72W6](https://perma.cc/3ZV8-72W6) 
[^92]: Lukas Fittl. [Understanding Postgres GIN Indexes: The Good and the Bad](https://pganalyze.com/blog/gin-index). *pganalyze.com*, December 2021. Archived at [perma.cc/V3MW-26H6](https://perma.cc/V3MW-26H6) 
[^93]: Jimmy Angelakos. [The State of (Full) Text Search in PostgreSQL 12](https://www.youtube.com/watch?v=c8IrUHV70KQ). At *FOSDEM*, February 2020. Archived at [perma.cc/J6US-3WZS](https://perma.cc/J6US-3WZS) 
[^94]: Alexander Korotkov. [Index support for regular expression search](https://wiki.postgresql.org/images/6/6c/Index_support_for_regular_expression_search.pdf). At *PGConf.EU Prague*, October 2012. Archived at [perma.cc/5RFZ-ZKDQ](https://perma.cc/5RFZ-ZKDQ) 
[^95]: Michael McCandless. [Lucene’s FuzzyQuery Is 100 Times Faster in 4.0](https://blog.mikemccandless.com/2011/03/lucenes-fuzzyquery-is-100-times-faster.html). *blog.mikemccandless.com*, March 2011. Archived at [perma.cc/E2WC-GHTW](https://perma.cc/E2WC-GHTW) 
[^96]: Steffen Heinz, Justin Zobel, and Hugh E. Williams. [Burst Tries: A Fast, Efficient Data Structure for String Keys](https://web.archive.org/web/20130903070248id_/http%3A//ww2.cs.mu.oz.au%3A80/~jz/fulltext/acmtois02.pdf). *ACM Transactions on Information Systems*, volume 20, issue 2, pages 192–223, April 2002. [doi:10.1145/506309.506312](https://doi.org/10.1145/506309.506312) 
[^97]: Klaus U. Schulz and Stoyan Mihov. [Fast String Correction with Levenshtein Automata](https://dmice.ohsu.edu/bedricks/courses/cs655/pdf/readings/2002_Schulz.pdf). *International Journal on Document Analysis and Recognition*, volume 5, issue 1, pages 67–85, November 2002. [doi:10.1007/s10032-002-0082-8](https://doi.org/10.1007/s10032-002-0082-8) 
[^98]: Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781). At *International Conference on Learning Representations* (ICLR), May 2013. [doi:10.48550/arXiv.1301.3781](https://doi.org/10.48550/arXiv.1301.3781) 
[^99]: Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805). At *Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies*, volume 1, pages 4171–4186, June 2019. [doi:10.18653/v1/N19-1423](https://doi.org/10.18653/v1/N19-1423) 
[^100]: Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf). *openai.com*, June 2018. Archived at [perma.cc/5N3C-DJ4C](https://perma.cc/5N3C-DJ4C) 
[^101]: Matthijs Douze, Maria Lomeli, and Lucas Hosseini. [Faiss indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). *github.com*, August 2024. Archived at [perma.cc/2EWG-FPBS](https://perma.cc/2EWG-FPBS) 
[^102]: Varik Matevosyan. [Understanding pgvector’s HNSW Index Storage in Postgres](https://lantern.dev/blog/pgvector-storage). *lantern.dev*, August 2024. Archived at [perma.cc/B2YB-JB59](https://perma.cc/B2YB-JB59) 
[^103]: Dmitry Baranchuk, Artem Babenko, and Yury Malkov. [Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](https://arxiv.org/pdf/1802.02422). At *European Conference on Computer Vision* (ECCV), pages 202–216, September 2018. [doi:10.1007/978-3-030-01258-8\_13](https://doi.org/10.1007/978-3-030-01258-8_13) 
[^104]: Yury A. Malkov and Dmitry A. Yashunin. [Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs](https://arxiv.org/pdf/1603.09320). *IEEE Transactions on Pattern Analysis and Machine Intelligence*, volume 42, issue 4, pages 824–836, April 2020. [doi:10.1109/TPAMI.2018.2889473](https://doi.org/10.1109/TPAMI.2018.2889473) 


================================================
FILE: content/en/ch5.md
================================================
---
title: "5. Encoding and Evolution"
weight: 105
breadcrumbs: false
---

<a id="ch_encoding"></a>

![](/map/ch04.png)

> *Everything changes and nothing stands still.*
>
> Heraclitus of Ephesus, as quoted by Plato in *Cratylus* (360 BCE)

Applications inevitably change over time. Features are added or modified as new products are
launched, user requirements become better understood, or business circumstances change. In
[Chapter 2](/en/ch2#ch_nonfunctional) we introduced the idea of *evolvability*: we should aim to build systems that
make it easy to adapt to change (see [“Evolvability: Making Change Easy”](/en/ch2#sec_introduction_evolvability)).

In most cases, a change to an application’s features also requires a change to data that it stores:
perhaps a new field or record type needs to be captured, or perhaps existing data needs to be
presented in a new way.

The data models we discussed in [Chapter 3](/en/ch3#ch_datamodels) have different ways of coping with such change.
Relational databases generally assume that all data in the database conforms to one schema: although
that schema can be changed (through schema migrations; i.e., `ALTER` statements), there is exactly
one schema in force at any one point in time. By contrast, schema-on-read (“schemaless”) databases
don’t enforce a schema, so the database can contain a mixture of older and newer data formats
written at different times (see [“Schema flexibility in the document model”](/en/ch3#sec_datamodels_schema_flexibility)).

When a data format or schema changes, a corresponding change to application code often needs to
happen (for example, you add a new field to a record, and the application code starts reading
and writing that field). However, in a large application, code changes often cannot happen
instantaneously:

* With server-side applications you may want to perform a *rolling upgrade*
 (also known as a *staged rollout*), deploying the new version to a few nodes at a time, checking
 whether the new version is running smoothly, and gradually working your way through all the nodes.
 This allows new versions to be deployed without service downtime, and thus encourages more frequent releases and better evolvability.
* With client-side applications you’re at the mercy of the user, who may not install the update for some time.

This means that old and new versions of the code, and old and new data formats, may potentially all
coexist in the system at the same time. In order for the system to continue running smoothly, we
need to maintain compatibility in both directions:

Backward compatibility
: Newer code can read data that was written by older code.

Forward compatibility
: Older code can read data that was written by newer code.

Backward compatibility is normally not hard to achieve: as author of the newer code, you know the
format of data written by older code, and so you can explicitly handle it (if necessary by simply
keeping the old code to read the old data). Forward compatibility can be trickier, because it
requires older code to ignore additions made by a newer version of the code.

Another challenge with forward compatibility is illustrated in [Figure 5-1](/en/ch5#fig_encoding_preserve_field).
Say you add a field to a record schema, and the newer code creates a record containing that new
field and stores it in a database. Subsequently, an older version of the code (which doesn’t yet
know about the new field) reads the record, updates it, and writes it back. In this situation, the
desirable behavior is usually for the old code to keep the new field intact, even though it couldn’t
be interpreted. But if the record is decoded into a model object that does not explicitly
preserve unknown fields, data can be lost, like in [Figure 5-1](/en/ch5#fig_encoding_preserve_field).

{{< figure src="/fig/ddia_0501.png" id="fig_encoding_preserve_field" caption="Figure 5-1. When an older version of the application updates data previously written by a newer version of the application, data may be lost if you’re not careful." class="w-full my-4" >}}

In this chapter we will look at several formats for encoding data, including JSON, XML, Protocol
Buffers, and Avro. In particular, we will look at how they handle schema changes and how they
support systems where old and new data and code need to coexist. We will then discuss how those
formats are used for data storage and for communication: in databases, web services, REST APIs,
remote procedure calls (RPC), workflow engines, and event-driven systems such as actors and
message queues.

## Formats for Encoding Data {#sec_encoding_formats}

Programs usually work with data in (at least) two different representations:

1. In memory, data is kept in objects, structs, lists, arrays, hash tables, trees, and so on. These
 data structures are optimized for efficient access and manipulation by the CPU (typically using
 pointers).
2. When you want to write data to a file or send it over the network, you have to encode it as some
 kind of self-contained sequence of bytes (for example, a JSON document). Since a pointer wouldn’t
 make sense to any other process, this sequence-of-bytes representation often looks quite
 different from the data structures that are normally used in memory.

Thus, we need some kind of translation between the two representations. The translation from the
in-memory representation to a byte sequence is called *encoding* (also known as *serialization* or
*marshalling*), and the reverse is called *decoding* (*parsing*, *deserialization*,
*unmarshalling*).

--------

> [!TIP] TERMINOLOGY CLASH

*Serialization* is unfortunately also used in the context of transactions (see [Chapter 8](/en/ch8#ch_transactions)),
with a completely different meaning. To avoid overloading the word we’ll stick with *encoding* in
this book, even though *serialization* is perhaps a more common term.

--------

There are exceptions in which encoding/decoding is not needed—for example, when a database operates
directly on compressed data loaded from disk, as discussed in [“Query Execution: Compilation and Vectorization”](/en/ch4#sec_storage_vectorized). There are
also *zero-copy* data formats that are designed to be used both at runtime and on disk/on the
network, without an explicit conversion step, such as Cap’n Proto and FlatBuffers.

However, most systems need to convert between in-memory objects and flat byte sequences. As this is
such a common problem, there are myriad different libraries and encoding formats to choose from.
Let’s do a brief overview.

### Language-Specific Formats {#id96}

Many programming languages come with built-in support for encoding in-memory objects into byte
sequences. For example, Java has `java.io.Serializable`, Python has `pickle`, Ruby has `Marshal`,
and so on. Many third-party libraries also exist, such as Kryo for Java.

These encoding libraries are very convenient, because they allow in-memory objects to be saved and
restored with minimal additional code. However, they also have a number of deep problems:

* The encoding is often tied to a particular programming language, and reading the data in another
 language is very difficult. If you store or transmit data in such an encoding, you are committing
 yourself to your current programming language for potentially a very long time, and precluding
 integrating your systems with those of other organizations (which may use different languages).
* In order to restore data in the same object types, the decoding process needs to be able to
 instantiate arbitrary classes. This is frequently a source of security problems [^1]:
 if an attacker can get your application to decode an arbitrary byte sequence, they can instantiate
 arbitrary classes, which in turn often allows them to do terrible things such as remotely
 executing arbitrary code [^2] [^3].
* Versioning data is often an afterthought in these libraries: as they are intended for quick and
 easy encoding of data, they often neglect the inconvenient problems of forward and backward compatibility [^4].
* Efficiency (CPU time taken to encode or decode, and the size of the encoded structure) is also
 often an afterthought. For example, Java’s built-in serialization is notorious for its bad
 performance and bloated encoding [^5].

For these reasons it’s generally a bad idea to use your language’s built-in encoding for anything
other than very transient purposes.

### JSON, XML, and Binary Variants {#sec_encoding_json}

When moving to standardized encodings that can be written and read by many programming languages, JSON
and XML are the obvious contenders. They are widely known, widely supported, and almost as widely
disliked. XML is often criticized for being too verbose and unnecessarily complicated [^6].
JSON’s popularity is mainly due to its built-in support in web browsers and simplicity relative to
XML. CSV is another popular language-independent format, but it only supports tabular data without
nesting.

JSON, XML, and CSV are textual formats, and thus somewhat human-readable (although the syntax is a
popular topic of debate). Besides the superficial syntactic issues, they also have some subtle
problems:

* There is a lot of ambiguity around the encoding of numbers. In XML and CSV, you cannot distinguish
 between a number and a string that happens to consist of digits (except by referring to an external
 schema). JSON distinguishes strings and numbers, but it doesn’t distinguish integers and
 floating-point numbers, and it doesn’t specify a precision.

 This is a problem when dealing with large numbers; for example, integers greater than 2<sup>53</sup> cannot
 be exactly represented in an IEEE 754 double-precision floating-point number, so such numbers become
 inaccurate when parsed in a language that uses floating-point numbers, such as JavaScript [^7].
 An example of numbers larger than 2<sup>53</sup> occurs on X (formerly Twitter), which uses a 64-bit number to
 identify each post. The JSON returned by the API includes post IDs twice, once as a JSON number and
 once as a decimal string, to work around the fact that the numbers are not correctly parsed by
 JavaScript applications [^8].
* JSON and XML have good support for Unicode character strings (i.e., human-readable text), but they
 don’t support binary strings (sequences of bytes without a character encoding). Binary strings are a
 useful feature, so people get around this limitation by encoding the binary data as text using
 Base64. The schema is then used to indicate that the value should be interpreted as Base64-encoded.
 This works, but it’s somewhat hacky and increases the data size by 33%.
* XML Schema and JSON Schema are powerful, and thus quite
 complicated to learn and implement. Since the correct interpretation of data (such as numbers and
 binary strings) depends on information in the schema, applications that don’t use XML/JSON schemas
 need to potentially hard-code the appropriate encoding/decoding logic instead.
* CSV does not have any schema, so it is up to the application to define the meaning of each row and
 column. If an application change adds a new row or column, you have to handle that change manually.
 CSV is also a quite vague format (what happens if a value contains a comma or a newline character?).
 Although its escaping rules have been formally specified [^9],
 not all parsers implement them correctly.

Despite these flaws, JSON, XML, and CSV are good enough for many purposes. It’s likely that they will
remain popular, especially as data interchange formats (i.e., for sending data from one organization to
another). In these situations, as long as people agree on what the format is, it often doesn’t
matter how pretty or efficient the format is. The difficulty of getting different organizations to
agree on *anything* outweighs most other concerns.

#### JSON Schema {#json-schema}

JSON Schema has become widely adopted as a way to model data whenever it’s exchanged between systems
or written to storage. You’ll find JSON schemas in web services (see [“Web services”](/en/ch5#sec_web_services)) as part
of the OpenAPI web service specification, schema registries such as Confluent’s Schema Registry and
Red Hat’s Apicurio Registry, and in databases such as PostgreSQL’s pg\_jsonschema validator extension
and MongoDB’s `$jsonSchema` validator syntax.

The JSON Schema specification offers a number of features. Schemas include standard primitive types
including strings, numbers, integers, objects, arrays, booleans, or nulls. But JSON Schema also
offers a separate validation specification that allows developers to overlay constraints on fields.
For example, a `port` field might have a minimum of 1 and a maximum of 65535.

JSON Schemas can have either open or closed content models. An open content model permits any field
not defined in the schema to exist with any data type, whereas a closed content model only allows
fields that are explicitly defined. The open content model in JSON Schema is enabled when
`additionalProperties` is set to `true`, which is the default. Thus, JSON Schemas are usually a
definition of what *isn’t* permitted (namely, invalid values on any of the defined fields), rather
than what *is* permitted in a schema.

Open content models are powerful, but can be complex. For example, say you want to define a map from
integers (such as IDs) to strings. JSON does not have a map or dictionary type, only an “object”
type that can contain string keys, and values of any type. You can then constrain this type with
JSON Schema so that keys may only contain digits, and values can only be strings, using
`patternProperties` and `additionalProperties` as shown in [Example 5-1](/en/ch5#fig_encoding_json_schema).


{{< figure id="fig_encoding_json_schema" title="Example 5-1. Example JSON Schema with integer keys and string values. Integer keys are represented as strings containing only integers since JSON Schema requires all keys to be strings." class="w-full my-4" >}}

```json
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "patternProperties": {
        "^[0-9]+$": {
        "type": "string"
    }
    },
    "additionalProperties": false
}
```

In addition to open and closed content models and validators, JSON Schema supports conditional
if/else schema logic, named types, references to remote schemas, and much more. All of this makes
for a very powerful schema language. Such features also make for unwieldy definitions. It can be
challenging to resolve remote schemas, reason about conditional rules, or evolve schemas in a
forwards or backwards compatible way [^10]. Similar concerns apply to XML Schema [^11].

#### Binary encoding {#binary-encoding}

JSON is less verbose than XML, but both still use a lot of space compared to binary formats. This
observation led to the development of a profusion of binary encodings for JSON (MessagePack, CBOR,
BSON, BJSON, UBJSON, BISON, Hessian, and Smile, to name a few) and for XML (WBXML and Fast Infoset,
for example). These formats have been adopted in various niches, as they are more compact and
sometimes faster to parse, but none of them are as widely adopted as the textual versions of JSON and XML [^12].

Some of these formats extend the set of datatypes (e.g., distinguishing integers and floating-point numbers,
or adding support for binary strings), but otherwise they keep the JSON/XML data model unchanged. In
particular, since they don’t prescribe a schema, they need to include all the object field names within
the encoded data. That is, in a binary encoding of the JSON document in [Example 5-2](/en/ch5#fig_encoding_json), they
will need to include the strings `userName`, `favoriteNumber`, and `interests` somewhere.

{{< figure id="fig_encoding_json" title="Example 5-2. Example record which we will encode in several binary formats in this chapter" class="w-full my-4" >}}

```json
{
    "userName": "Martin",
    "favoriteNumber": 1337,
    "interests": ["daydreaming", "hacking"]
}
```

Let’s look at an example of MessagePack, a binary encoding for JSON. [Figure 5-2](/en/ch5#fig_encoding_messagepack)
shows the byte sequence that you get if you encode the JSON document in [Example 5-2](/en/ch5#fig_encoding_json) with
MessagePack. The first few bytes are as follows:

1. The first byte, `0x83`, indicates that what follows is an object (top four bits = `0x80`) with three
 fields (bottom four bits = `0x03`). (In case you’re wondering what happens if an object has more
 than 15 fields, so that the number of fields doesn’t fit in four bits, it then gets a different type
 indicator, and the number of fields is encoded in two or four bytes.)
2. The second byte, `0xa8`, indicates that what follows is a string (top four bits = `0xa0`) that is eight
 bytes long (bottom four bits = `0x08`).
3. The next eight bytes are the field name `userName` in ASCII. Since the length was indicated
 previously, there’s no need for any marker to tell us where the string ends (or any escaping).
4. The next seven bytes encode the six-letter string value `Martin` with a prefix `0xa6`, and so on.

The binary encoding is 66 bytes long, which is only a little less than the 81 bytes taken by the
textual JSON encoding (with whitespace removed). All the binary encodings of JSON are similar in
this regard. It’s not clear whether such a small space reduction (and perhaps a speedup in parsing)
is worth the loss of human-readability.

In the following sections we will see how we can do much better, and encode the same record in just 32 bytes.

{{< figure link="#fig_encoding_json" src="/fig/ddia_0502.png" id="fig_encoding_messagepack" caption="Figure 5-2. Example record (Example 5-2) encoded using MessagePack." class="w-full my-4" >}}


### Protocol Buffers {#sec_encoding_protobuf}

Protocol Buffers (protobuf) is a binary encoding library developed at Google.
It is similar to Apache Thrift, which was originally developed by Facebook [^13];
most of what this section says about Protocol Buffers applies also to Thrift.

Protocol Buffers requires a schema for any data that is encoded. To encode the data
in [Example 5-2](/en/ch5#fig_encoding_json) in Protocol Buffers, you would describe the schema in the Protocol Buffers
interface definition language (IDL) like this:

```protobuf
syntax = "proto3";

message Person {
    string user_name = 1;
    int64 favorite_number = 2;
    repeated string interests = 3;
}
```

Protocol Buffers comes with a code generation tool that takes a schema definition like the one
shown here, and produces classes that implement the schema in various programming languages. Your
application code can call this generated code to encode or decode records of the schema. The schema
language is very simple compared to JSON Schema: it only defines the fields of records and their
types, but it does not support other restrictions on the possible values of fields.

Encoding [Example 5-2](/en/ch5#fig_encoding_json) using a Protocol Buffers encoder requires 33 bytes, as shown in [Figure 5-3](/en/ch5#fig_encoding_protobuf) [^14].

{{< figure src="/fig/ddia_0503.png" id="fig_encoding_protobuf" caption="Figure 5-3. Example record encoded using Protocol Buffers." class="w-full my-4" >}}


Similarly to [Figure 5-2](/en/ch5#fig_encoding_messagepack), each field has a type annotation (to indicate whether it
is a string, integer, etc.) and, where required, a length indication (such as the length of a
string). The strings that appear in the data (“Martin”, “daydreaming”, “hacking”) are also encoded
as ASCII (to be precise, UTF-8), similar to before.

The big difference compared to [Figure 5-2](/en/ch5#fig_encoding_messagepack) is that there are no field names
(`userName`, `favoriteNumber`, `interests`). Instead, the encoded data contains *field tags*, which
are numbers (`1`, `2`, and `3`). Those are the numbers that appear in the schema definition. Field tags
are like aliases for fields—they are a compact way of saying what field we’re talking about,
without having to spell out the field name.

As you can see, Protocol Buffers saves even more space by packing the field type and tag number into
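a single byte. It uses variable-length integers: the number 1337 is encoded in two bytes, with the
top bit of each byte used to indicate whether there are still more bytes to come. This means that
for an `int64` field, numbers between 0 and 127 are encoded in one byte, numbers between 128 and
16383 are encoded in two bytes, etc. Bigger numbers use more bytes. (Negative `int64` values always
take ten bytes; the `sint64` type uses a ZigZag encoding instead, so that numbers between –64 and 63
fit in one byte and numbers between –8192 and 8191 fit in two.)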

Protocol Buffers doesn’t have an explicit list or array datatype. Instead, the `repeated` modifier
on the `interests` field indicates that the field contains a list of values, rather than a single
value. In the binary encoding, the list elements are represented simply as repeated occurrences of
the same field tag within the same record.

#### Field tags and schema evolution {#field-tags-and-schema-evolution}

We said previously that schemas inevitably need to change over time. We call this *schema
evolution*. How does Protocol Buffers handle schema changes while keeping backward and forward compatibility?

As you can see from the examples, an encoded record is just the concatenation of its encoded fields.
Each field is identified by its tag number (the numbers `1`, `2`, `3` in the sample schema) and
annotated with a datatype (e.g., string or integer). If a field value is not set, it is simply
omitted from the encoded record. From this you can see that field tags are critical to the meaning
of the encoded data. You can change the name of a field in the schema, since the encoded data never
refers to field names, but you cannot change a field’s tag, since that would make all existing
encoded data invalid.

You can add new fields to the schema, provided that you give each field a new tag number. If old
code (which doesn’t know about the new tag numbers you added) tries to read data written by new
code, including a new field with a tag number it doesn’t recognize, it can simply ignore that field.
The datatype annotation allows the parser to determine how many bytes it needs to skip, and preserve
the unknown fields to avoid the problem in [Figure 5-1](/en/ch5#fig_encoding_preserve_field). This maintains forward
compatibility: old code can read records that were written by new code.

What about backward compatibility? As long as each field has a unique tag number, new code can
always read old data, because the tag numbers still have the same meaning. If a field was added in
the new schema, and you read old data that does not yet contain that field, it is filled in with a
default value (for example, the empty string if the field type is string, or zero if it’s a number).

Removing a field is just like adding a field, with backward and forward compatibility concerns
reversed. You can never use the same tag number again, because you may still have data written
somewhere that includes the old tag number, and that field must be ignored by new code. Tag numbers
used in the past can be reserved in the schema definition to ensure they are not forgotten.

What about changing the datatype of a field? That is possible with some types—check the
documentation for details—but there is a risk that values will get truncated. For example, say you
change a 32-bit integer into a 64-bit integer. New code can easily read data written by old code,
because the parser can fill in any missing bits with zeros. However, if old code reads data written
by new code, the old code is still using a 32-bit variable to hold the value. If the decoded 64-bit
value won’t fit in 32 bits, it will be truncated.

### Avro {#sec_encoding_avro}

Apache Avro is another binary encoding format that is interestingly different from Protocol Buffers.
It was started in 2009 as a subproject of Hadoop, as a result of Protocol Buffers not being a good
fit for Hadoop’s use cases [^15].

Avro also uses a schema to specify the structure of the data being encoded. It has two schema
languages: one (Avro IDL) intended for human editing, and one (based on JSON) that is more easily
machine-readable. Like Protocol Buffers, this schema language specifies only fields and their types,
and not complex validation rules like in JSON Schema.

Our example schema, written in Avro IDL, might look like this:

```c
record Person {
    string                  userName;
    union { null, long }    favoriteNumber = null;
    array<string>           interests;
}
```

The equivalent JSON representation of that schema is as follows:

```json
{
    "type": "record",
    "name": "Person",
    "fields": [
        {"name": "userName",        "type": "string"},
        {"name": "favoriteNumber",  "type": ["null", "long"], "default": null},
        {"name": "interests",       "type": {"type": "array", "items": "string"}}
    ]
}
```

First of all, notice that there are no tag numbers in the schema. If we encode our example record
([Example 5-2](/en/ch5#fig_encoding_json)) using this schema, the Avro binary encoding is just 32 bytes long—the
most compact of all the encodings we have seen. The breakdown of the encoded byte sequence is shown
in [Figure 5-4](/en/ch5#fig_encoding_avro).

If you examine the byte sequence, you can see that there is nothing to identify fields or their
datatypes. The encoding simply consists of values concatenated together. A string is just a length
prefix followed by UTF-8 bytes, but there’s nothing in the encoded data that tells you that it is a
string. It could just as well be an integer, or something else entirely. An integer is encoded using
a variable-length encoding.

{{< figure src="/fig/ddia_0504.png" id="fig_encoding_avro" caption="Figure 5-4. Example record encoded using Avro." class="w-full my-4" >}}


To parse the binary data, you go through the fields in the order that they appear in the schema and
use the schema to tell you the datatype of each field. This means that the binary data can only be
decoded correctly if the code reading the data is using the *exact same schema* as the code that
wrote the data. Any mismatch in the schema between the reader and the writer would mean incorrectly
decoded data.

So, how does Avro support schema evolution?

#### The writer’s schema and the reader’s schema {#the-writers-schema-and-the-readers-schema}

When an application wants to encode some data (to write it to a file or database, to send it over
the network, etc.), it encodes the data using whatever version of the schema it knows about—for
example, that schema may be compiled into the application. This is known as the *writer’s schema*.

When an application wants to decode some data (read it from a file or database, receive it from the
network, etc.), it uses two schemas: the writer’s schema that is identical to the one used for
encoding, and the *reader’s schema*, which may be different. This is illustrated in
[Figure 5-5](/en/ch5#fig_encoding_avro_schemas). The reader’s schema defines the fields of each record that the
application code is expecting, and their types.

{{< figure src="/fig/ddia_0505.png" id="fig_encoding_avro_schemas" caption="Figure 5-5. In Protocol Buffers, encoding and decoding can use different versions of a schema. In Avro, decoding uses two schemas: the writer's schema must be identical to the one used for encoding, but the reader's schema can be an older or newer version." class="w-full my-4" >}}

If the reader’s and writer’s schemas are the same, decoding is easy. If they are different, Avro
resolves the differences by looking at the writer’s schema and the reader’s schema side by side and
translating the data from the writer’s schema into the reader’s schema. The Avro specification [^16] [^17]
defines exactly how this resolution works, and it is illustrated in [Figure 5-6](/en/ch5#fig_encoding_avro_resolution).

For example, it’s no problem if the writer’s schema and the reader’s schema have their fields in a
different order, because the schema resolution matches up the fields by field name. If the code
reading the data encounters a field that appears in the writer’s schema but not in the reader’s
schema, it is ignored. If the code reading the data expects some field, but the writer’s schema does
not contain a field of that name, it is filled in with a default value declared in the reader’s
schema.

{{< figure src="/fig/ddia_0506.png" id="fig_encoding_avro_resolution" caption="Figure 5-6. An Avro reader resolves differences between the writer's schema and the reader's schema." class="w-full my-4" >}}

#### Schema evolution rules {#schema-evolution-rules}

With Avro, forward compatibility means that you can have a new version of the schema as writer and
an old version of the schema as reader. Conversely, backward compatibility means that you can have a
new version of the schema as reader and an old version as writer.

To maintain compatibility, you may only add or remove a field that has a default value. (The field
`favoriteNumber` in our Avro schema has a default value of `null`.) For example, say you add a
field with a default value, so this new field exists in the new schema but not the old one. When a
reader using the new schema reads a record written with the old schema, the default value is filled
in for the missing field.

If you were to add a field that has no default value, new readers wouldn’t be able to read data
written by old writers, so you would break backward compatibility. If you were to remove a field
that has no default value, old readers wouldn’t be able to read data written by new writers, so you
would break forward compatibility.

In some programming languages, `null` is an acceptable default for any variable, but this is not the
case in Avro: if you want to allow a field to be null, you have to use a *union type*. For example,
`union { null, long, string } field;` indicates that `field` can be a number, or a string, or null.
You can only use `null` as a default value if it is the first branch of the union. This is a little
more verbose than having everything nullable by default, but it helps prevent bugs by being explicit
about what can and cannot be null [^18].

Changing the datatype of a field is possible, provided that Avro can convert the type. Changing the
name of a field is possible but a little tricky: the reader’s schema can contain aliases for field
names, so it can match an old writer’s schema field names against the aliases. This means that
changing a field name is backward compatible but not forward compatible. Similarly, adding a branch
to a union type is backward compatible but not forward compatible.

#### But what is the writer’s schema? {#but-what-is-the-writers-schema}

There is an important question that we’ve glossed over so far: how does the reader know the writer’s
schema with which a particular piece of data was encoded? We can’t just include the entire schema
with every record, because the schema would likely be much bigger than the encoded data, making all
the space savings from the binary encoding futile.

The answer depends on the context in which Avro is being used. To give a few examples:

Large file with lots of records
: A common use for Avro is for storing a large file containing millions of records, all encoded with
 the same schema. (We will discuss this kind of situation in [Chapter 11](/en/ch11#ch_batch).) In this case, the
 writer of that file can just include the writer’s schema once at the beginning of the file. Avro
 specifies a file format (object container files) to do this.

Database with individually written records
: In a database, different records may be written at different points in time using different
 writer’s schemas—you cannot assume that all the records will have the same schema. The simplest
 solution is to include a version number at the beginning of every encoded record, and to keep a
 list of schema versions in your database. A reader can fetch a record, extract the version number,
 and then fetch the writer’s schema for that version number from the database. Using that writer’s
 schema, it can decode the rest of the record.

 Confluent’s schema registry for Apache Kafka [^19] and LinkedIn’s Espresso [^20] work this way, for example.

Sending records over a network connection
: When two processes are communicating over a bidirectional network connection, they can negotiate
 the schema version on connection setup and then use that schema for the lifetime of the
 connection. The Avro RPC protocol (see [“Dataflow Through Services: REST and RPC”](/en/ch5#sec_encoding_dataflow_rpc)) works like this.

A database of schema versions is a useful thing to have in any case, since it acts as documentation
and gives you a chance to check schema compatibility [^21].
As the version number, you could use a simple incrementing integer, or you could use a hash of the schema.

#### Dynamically generated schemas {#dynamically-generated-schemas}

One advantage of Avro’s approach, compared to Protocol Buffers, is that the schema doesn’t contain
any tag numbers. But why is this important? What’s the problem with keeping a couple of numbers in
the schema?

The difference is that Avro is friendlier to *dynamically generated* schemas. For example, say
you have a relational database whose contents you want to dump to a file, and you want to use a
binary format to avoid the aforementioned problems with textual formats (JSON, CSV, XML). If you use
Avro, you can fairly easily generate an Avro schema (in the JSON representation we saw earlier) from the
relational schema and encode the database contents using that schema, dumping it all to an Avro
object container file [^22].
You can generate a record schema for each database table, and each column becomes a field in that
record. The column name in the database maps to the field name in Avro.

Now, if the database schema changes (for example, a table has one column added and one column
removed), you can just generate a new Avro schema from the updated database schema and export data in
the new Avro schema. The data export process does not need to pay any attention to the schema
change—it can simply do the schema conversion every time it runs. Anyone who reads the new data
files will see that the fields of the record have changed, but since the fields are identified by
name, the updated writer’s schema can still be matched up with the old reader’s schema.

By contrast, if you were using Protocol Buffers for this purpose, the field tags would likely have
to be assigned by hand: every time the database schema changes, an administrator would have to
manually update the mapping from database column names to field tags. (It might be possible to
automate this, but the schema generator would have to be very careful to not assign previously used
field tags.) This kind of dynamically generated schema simply wasn’t a design goal of Protocol
Buffers, whereas it was for Avro.

### The Merits of Schemas {#sec_encoding_schemas}

As we saw, Protocol Buffers and Avro both use a schema to describe a binary encoding format. Their
schema languages are much simpler than XML Schema or JSON Schema, which support much more detailed
validation rules (e.g., “the string value of this field must match this regular expression” or “the
integer value of this field must be between 0 and 100”). As Protocol Buffers and Avro are simpler to
implement and simpler to use, they have grown to support a fairly wide range of programming
languages.

The ideas on which these encodings are based are by no means new. For example, they have a lot in
common with ASN.1, a schema definition language that was first standardized in 1984 [^23] [^24].
It was used to define various network protocols, and its binary encoding (DER) is still used to encode
SSL certificates (X.509), for example [^25].
ASN.1 supports schema evolution using tag numbers, similar to Protocol Buffers [^26].
However, it’s also very complex and badly documented, so ASN.1 is probably not a good choice for new applications.

Many data systems also implement some kind of proprietary binary encoding for their data. For
example, most relational databases have a network protocol over which you can send queries to the
database and get back responses. Those protocols are generally specific to a particular database,
and the database vendor provides a driver (e.g., using the ODBC or JDBC APIs) that decodes responses
from the database’s network protocol into in-memory data structures.

So, we can see that although textual data formats such as JSON, XML, and CSV are widespread, binary
encodings based on schemas are also a viable option. They have a number of nice properties:

* They can be much more compact than the various “binary JSON” variants, since they can omit field
 names from the encoded data.
* The schema is a valuable form of documentation, and because the schema is required for decoding,
 you can be sure that it is up to date (whereas manually maintained documentation may easily
 diverge from reality).
* Keeping a database of schemas allows you to check forward and backward compatibility of schema
 changes, before anything is deployed.
* For users of statically typed programming languages, the ability to generate code from the schema
 is useful, since it enables type-checking at compile time.

In summary, schema evolution allows the same kind of flexibility as schemaless/schema-on-read JSON
databases provide (see [“Schema flexibility in the document model”](/en/ch3#sec_datamodels_schema_flexibility)), while also providing better
guarantees about your data and better tooling.

## Modes of Dataflow {#sec_encoding_dataflow}

At the beginning of this chapter we said that whenever you want to send some data to another process
with which you don’t share memory—for example, whenever you want to send data over the network or
write it to a file—you need to encode it as a sequence of bytes. We then discussed a variety of
different encodings for doing this.

We talked about forward and backward compatibility, which are important for evolvability (making
change easy by allowing you to upgrade different parts of your system independently, and not having
to change everything at once). Compatibility is a relationship between one process that encodes the
data, and another process that decodes it.

That’s a fairly abstract idea—there are many ways data can flow from one process to another.
Who encodes the data, and who decodes it? In the rest of this chapter we will explore some of the
most common ways in which data flows between processes:

* Via databases (see [“Dataflow Through Databases”](/en/ch5#sec_encoding_dataflow_db))
* Via service calls (see [“Dataflow Through Services: REST and RPC”](/en/ch5#sec_encoding_dataflow_rpc))
* Via workflow engines (see [“Durable Execution and Workflows”](/en/ch5#sec_encoding_dataflow_workflows))
* Via asynchronous messages (see [“Event-Driven Architectures”](/en/ch5#sec_encoding_dataflow_msg))

### Dataflow Through Databases {#sec_encoding_dataflow_db}

In a database, the process that writes to the database encodes the data, and the process that reads
from the database decodes it. There may just be a single process accessing the database, in which
case the reader is simply a later version of the same process—in that case you can think of
storing something in the database as *sending a message to your future self*.

Backward compatibility is clearly necessary here; otherwise your future self won’t be able to decode
what you previously wrote.

In general, it’s common for several different processes to be accessing a database at the same time.
Those processes might be several different applications or services, or they may simply be several
instances of the same service (running in parallel for scalability or fault tolerance). Either way,
in an environment where the application is changing, it is likely that some processes accessing the
database will be running newer code and some will be running older code—for example because a new
version is currently being deployed in a rolling upgrade, so some instances have been updated while
others haven’t yet.

This means that a value in the database may be written by a *newer* version of the code, and
subsequently read by an *older* version of the code that is still running. Thus, forward
compatibility is also often required for databases.

#### Different values written at different times {#different-values-written-at-different-times}

A database generally allows any value to be updated at any time. This means that within a single
database you may have some values that were written five milliseconds ago, and some values that were
written five years ago.

When you deploy a new version of your application (of a server-side application, at least), you may
entirely replace the old version with the new version within a few minutes. The same is not true of
database contents: the five-year-old data will still be there, in the original encoding, unless you
have explicitly rewritten it since then. This observation is sometimes summed up as *data outlives
code*.

Rewriting (*migrating*) data into a new schema is certainly possible, but it’s an expensive thing to
do on a large dataset, so most databases avoid it if possible. Most relational databases allow
simple schema changes, such as adding a new column with a `null` default value, without rewriting
existing data. When an old row is read, the database fills in `null`s for any columns that are
missing from the encoded data on disk.
Schema evolution thus allows the entire database to appear as if it was encoded with a single
schema, even though the underlying storage may contain records encoded with various historical
versions of the schema.

More complex schema changes—for example, changing a single-valued attribute to be multi-valued, or
moving some data into a separate table—still require data to be rewritten, often at the application level [^27].
Maintaining forward and backward compatibility across such migrations is still a research problem [^28].

#### Archival storage {#archival-storage}

Perhaps you take a snapshot of your database from time to time, say for backup purposes or for
loading into a data warehouse (see [“Data Warehousing”](/en/ch1#sec_introduction_dwh)). In this case, the data dump will typically
be encoded using the latest schema, even if the original encoding in the source database contained a
mixture of schema versions from different eras. Since you’re copying the data anyway, you might as
well encode the copy of the data consistently.

As the data dump is written in one go and is thereafter immutable, formats like Avro object
container files are a good fit. This is also a good opportunity to encode the data in an
analytics-friendly column-oriented format such as Parquet (see [“Column Compression”](/en/ch4#sec_storage_column_compression)).

In [Chapter 11](/en/ch11#ch_batch) we will talk more about using data in archival storage.

### Dataflow Through Services: REST and RPC {#sec_encoding_dataflow_rpc}

When you have processes that need to communicate over a network, there are a few different ways of
arranging that communication. The most common arrangement is to have two roles: *clients* and
*servers*. The servers expose an API over the network, and the clients can connect to the servers
to make requests to that API. The API exposed by the server is known as a *service*.

The web works this way: clients (web browsers) make requests to web servers, making `GET` requests
to download HTML, CSS, JavaScript, images, etc., and making `POST` requests to submit data to the
server. The API consists of a standardized set of protocols and data formats (HTTP, URLs, SSL/TLS,
HTML, etc.). Because web browsers, web servers, and website authors mostly agree on these standards,
you can use any web browser to access any website (at least in theory!).

Web browsers are not the only type of client. For example, native apps running on mobile devices and
desktop computers often talk to servers, and client-side JavaScript applications running inside web
browsers can also make HTTP requests.
In this case, the server’s response is typically not HTML for displaying to a human, but rather data
in an encoding that is convenient for further processing by the client-side application code (most
often JSON). Although HTTP may be used as the transport protocol, the API implemented on top is
application-specific, and the client and server need to agree on the details of that API.

In some ways, services are similar to databases: they typically allow clients to submit and query
data. However, while databases allow arbitrary queries using the query languages we discussed in
[Chapter 3](/en/ch3#ch_datamodels), services expose an application-specific API that only allows inputs and outputs
that are predetermined by the business logic (application code) of the service [^29]. This restriction provides a degree of encapsulation: services can impose
fine-grained restrictions on what clients can and cannot do.

A key design goal of a service-oriented/microservices architecture is to make the application easier
to change and maintain by making services independently deployable and evolvable. A common principle
is that each service should be owned by one team, and that team should be able to release new
versions of the service frequently, without having to coordinate with other teams. We should
therefore expect old and new versions of servers and clients to be running at the same time, and so
the data encoding used by servers and clients must be compatible across versions of the service API.

#### Web services {#sec_web_services}

When HTTP is used as the underlying protocol for talking to the service, it is called a *web
service*. Web services are commonly used when building a service-oriented or microservices
architecture (discussed earlier in [“Microservices and Serverless”](/en/ch1#sec_introduction_microservices)). The term “web service” is
perhaps a slight misnomer, because web services are not only used on the web, but in several
different contexts. For example:

1. A client application running on a user’s device (e.g., a native app on a mobile device, or a
 JavaScript web app in a browser) making requests to a service over HTTP. These requests typically go over the public internet.
2. One service making requests to another service owned by the same organization, often located
 within the same datacenter, as part of a service-oriented/microservices architecture.
3. One service making requests to a service owned by a different organization, usually via the
 internet. This is used for data exchange between different organizations’ backend systems. This
 category includes public APIs provided by online services, such as credit card processing
 systems, or OAuth for shared access to user data.

The most popular service design philosophy is REST, which builds upon the principles of HTTP [^30] [^31].
It emphasizes simple data formats, using URLs for identifying resources and using HTTP features for
cache control, authentication, and content type negotiation. An API designed according to the
principles of REST is called *RESTful*.

Code that needs to invoke a web service API must know which HTTP endpoint to query, and what data
format to send and expect in response. Even if a service adopts RESTful design principles, clients
need to somehow find out these details. Service developers often use an interface definition
language (IDL) to define and document their service’s API endpoints and data models, and to evolve
them over time. Other developers can then use the service definition to determine how to query the
service. The two most popular service IDLs are OpenAPI (also known as Swagger [^32])
and gRPC. OpenAPI is used for web services that send and receive JSON data, while gRPC services send
and receive Protocol Buffers.

Developers typically write OpenAPI service definitions in JSON or YAML; see [Example 5-3](/en/ch5#fig_open_api_def).
The service definition allows developers to define service endpoints, documentation, versions, data
models, and much more. gRPC definitions look similar, but are defined using Protocol Buffers service definitions.

{{< figure id="fig_open_api_def" title="Example 5-3. Example OpenAPI service definition in YAML" class="w-full my-4" >}}

```yaml
openapi: 3.0.0
info:
  title: Ping, Pong
  version: 1.0.0
servers:
  - url: http://localhost:8080
paths:
  /ping:
    get:
      summary: Given a ping, returns a pong message
      responses:
        '200':
          description: A pong
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    example: Pong!
```

Even if a design philosophy and IDL are adopted, developers must still write the code that
implements their service’s API calls. A service framework is often adopted to simplify this
effort. Service frameworks such as Spring Boot, FastAPI, and gRPC allow developers to write the
business logic for each API endpoint while the framework code handles routing, metrics, caching,
authentication, and so on. [Example 5-4](/en/ch5#fig_fastapi_def) shows an example Python implementation of the service
defined in [Example 5-3](/en/ch5#fig_open_api_def).

{{< figure id="fig_fastapi_def" title="Example 5-4. Example FastAPI service implementing the definition from [Example 5-3](/en/ch5#fig_open_api_def)" class="w-full my-4" >}}

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Ping, Pong", version="1.0.0")

class PongResponse(BaseModel):
    message: str = "Pong!"

@app.get("/ping", response_model=PongResponse,
         summary="Given a ping, returns a pong message")
async def ping():
    return PongResponse()
```

Many frameworks couple service definitions and server code together. In some cases, such as with the
popular Python FastAPI framework, servers are written in code and an IDL is generated automatically.
In other cases, such as with gRPC, the service definition is written first, and server code
scaffolding is generated. Both approaches allow developers to generate client libraries and SDKs
in a variety of languages from the service definition. In addition to code generation, IDL tools
such as Swagger’s can generate documentation, verify schema change compatibility, and provide
graphical user interfaces for developers to query and test services.

#### The problems with remote procedure calls (RPCs) {#sec_problems_with_rpc}

Web services are merely the latest incarnation of a long line of technologies for making API
requests over a network, many of which received a lot of hype but have serious problems. Enterprise
JavaBeans (EJB) and Java’s Remote Method Invocation (RMI) are limited to Java. The Distributed
Component Object Model (DCOM) is limited to Microsoft platforms. The Common Object Request Broker
Architecture (CORBA) is excessively complex, and does not provide backward or forward compatibility [^33].
SOAP and the WS-\* web services framework aim to provide interoperability across vendors, but are
also plagued by complexity and compatibility problems [^34] [^35] [^36].

All of these are based on the idea of a *remote procedure call* (RPC), which has been around since the 1970s [^37].
The RPC model tries to make a request to a remote network service look the same as calling a function or
method in your programming language, within the same process (this abstraction is called *location
transparency*). Although RPC seems convenient at first, the approach is fundamentally flawed [^38] [^39].
A network request is very different from a local function call:

* A local function call is predictable and either succeeds or fails, depending only on parameters
 that are under your control. A network request is unpredictable: the request or response may be
 lost due to a network problem, or the remote machine may be slow or unavailable, and such problems
 are entirely outside of your control. Network problems are common, so you have to anticipate them,
 for example by retrying a failed request.
* A local function call either returns a result, or throws an exception, or never returns (because
 it goes into an infinite loop or the process crashes). A network request has another possible
 outcome: it may return without a result, due to a *timeout*. In that case, you simply don’t know
 what happened: if you don’t get a response from the remote service, you have no way of knowing
 whether the request got through or not. (We discuss this issue in more detail in [Chapter 9](/en/ch9#ch_distributed).)
* If you retry a failed network request, it could happen that the previous request actually got
 through, and only the response was lost. In that case, retrying will cause the action to
 be performed multiple times, unless you build a mechanism for deduplication (*idempotence*) into the protocol [^40].
 Local function calls don’t have this problem. (We discuss idempotence in more detail in [“Idempotence”](/en/ch12#sec_stream_idempotence).)
* Every time you call a local function, it normally takes about the same time to execute. A network
 request is much slower than a function call, and its latency is also wildly variable: at good
 times it may complete in less than a millisecond, but when the network is congested or the remote
 service is overloaded it may take many seconds to do exactly the same thing.
* When you call a local function, you can efficiently pass it references (pointers) to objects in
 local memory. When you make a network request, all those parameters need to be encoded into a
 sequence of bytes that can be sent over the network. That’s okay if the parameters are immutable
 primitives like numbers or short strings, but it quickly becomes problematic with larger amounts
 of data and mutable objects.
* The client and the service may be implemented in different programming languages, so the RPC
 framework must translate datatypes from one language into another. This can end up ugly, since not
 all languages have the same types—recall JavaScript’s problems with numbers greater than $2^{53}$,
 for example (see [“JSON, XML, and Binary Variants”](/en/ch5#sec_encoding_json)).
 This problem doesn’t exist in a single process written in a single language.

All of these factors mean that there’s no point trying to make a remote service look too much like a
local object in your programming language, because it’s a fundamentally different thing. Part of the
appeal of REST is that it treats state transfer over a network as a process that is distinct from a
function call.

#### Load balancers, service discovery, and service meshes {#sec_encoding_service_discovery}

All services communicate over the network. For this reason, a client must know the address of the
service it’s connecting to—a problem known as *service discovery*. The simplest approach is to
configure a client to connect to the IP address and port where the service is running. This
configuration will work, but if the server goes offline, is transferred to a new machine, or becomes
overloaded, the client has to be manually reconfigured.

To provide higher availability and scalability, there are usually multiple instances of a service
running on different machines, any of which can handle an incoming request. Spreading requests
across these instances is called *load balancing* [^41].
There are many load balancing and service discovery solutions available:

* *Hardware load balancers* are specialized pieces of equipment that are installed in data centers.
 They allow clients to connect to a single host and port, and incoming connections are routed to
 one of the servers running the service. Such load balancers detect network failures when
 connecting to a downstream server and shift the traffic to other servers.
* *Software load balancers* behave in much the same way as hardware load balancers. But rather than
 requiring a special appliance, software load balancers such as Nginx and HAProxy are applications
 that can be installed on a standard machine.
* The *domain name service (DNS)* is how domain names are resolved on the Internet when you open a
 webpage. It supports load balancing by allowing multiple IP addresses to be associated with a
 single domain name. Clients can then be configured to connect to a service using a domain name
 rather than IP address, and the client’s network layer picks which IP address to use when making a
 connection. One drawback of this approach is that DNS is designed to propagate changes over longer
 periods of time, and to cache DNS entries. If servers are started, stopped, or moved frequently,
 clients might see stale IP addresses that no longer have a server running on them.
* *Service discovery systems* use a centralized registry rather than DNS to track which service
 endpoints are available. When a new service instance starts up, it registers itself with the
 service discovery system by declaring the host and port it’s listening on, along with relevant
 metadata such as shard ownership information (see [Chapter 7](/en/ch7#ch_sharding)), data center location,
 and more. The service then periodically sends a heartbeat signal to the discovery system to signal
 that the service is still available.

 When a client wishes to connect to a service, it first queries the discovery system to get a list of
 available endpoints, and then connects directly to the endpoint. Compared to DNS, service discovery
 supports a much more dynamic environment where service instances change frequently. Discovery
 systems also give clients more metadata about the service they’re connecting to, which enables
 clients to make smarter load balancing decisions.
* *Service meshes* are a sophisticated form of load balancing that combine software load balancers
 and service discovery. Unlike traditional software load balancers, which run on a separate
 machine, service mesh load balancers are typically deployed as an in-process client library or as
 a process or “sidecar” container on both the client and server. Client applications connect
 to their own local service load balancer, which connects to the server’s load balancer. From
 there, the connection is routed to the local server process.

 Though complicated, this topology offers a number of advantages. Because the clients and servers are
 routed entirely through local connections, connection encryption can be handled entirely at the load
 balancer level. This shields clients and servers from having to deal with the complexities of SSL
 certificates and TLS. Mesh systems also provide sophisticated observability. They can track which
 services are calling each other in real time, detect failures, track traffic load, and more.

Which solution is appropriate depends on an organization’s needs. Those running in a very dynamic
service environment with an orchestrator such as Kubernetes often choose to run a service mesh such
as Istio or Linkerd. Specialized infrastructure such as databases or messaging systems might require
their own purpose-built load balancer. Simpler deployments are best served with software load
balancers.

#### Data encoding and evolution for RPC {#data-encoding-and-evolution-for-rpc}

For evolvability, it is important that RPC clients and servers can be changed and deployed
independently. Compared to data flowing through databases (as described in the last section), we can make a
simplifying assumption in the case of dataflow through services: it is reasonable to assume that
all the servers will be updated first, and all the clients second. Thus, you only need backward
compatibility on requests, and forward compatibility on responses.

The backward and forward compatibility properties of an RPC scheme are inherited from whatever encoding it uses:

* gRPC (Protocol Buffers) and Avro RPC can be evolved according to the compatibility rules of the respective encoding format.
* RESTful APIs most commonly use JSON for responses, and JSON or URI-encoded/form-encoded request
 parameters for requests. Adding optional request parameters and adding new fields to response
 objects are usually considered changes that maintain compatibility.

Service compatibility is made harder by the fact that RPC is often used for communication across
organizational boundaries, so the provider of a service often has no control over its clients and
cannot force them to upgrade. Thus, compatibility needs to be maintained for a long time, perhaps
indefinitely. If a compatibility-breaking change is required, the service provider often ends up
maintaining multiple versions of the service API side by side.

There is no agreement on how API versioning should work (i.e., how a client can indicate which
version of the API it wants to use [^42]).
For RESTful APIs, common approaches are to use a version
number in the URL or in the HTTP `Accept` header. For services that use API keys to identify a
particular client, another option is to store a client’s requested API version on the server and to
allow this version selection to be updated through a separate administrative interface [^43].

### Durable Execution and Workflows {#sec_encoding_dataflow_workflows}

By definition, service-based architectures have multiple services that are all responsible for
different portions of an application. Consider a payment processing application that charges a
credit card and deposits the funds into a bank account. This system would likely have different
services responsible for fraud detection, credit card integration, bank integration, and so on.

Processing a single payment in our example requires many service calls. A payment processor service
might invoke the fraud detection service to check for fraud, call the credit card service to debit
the credit card, and call the banking service to deposit debited funds, as shown in
[Figure 5-7](/en/ch5#fig_encoding_workflow). We call this sequence of steps a *workflow*, and each step a *task*.
Workflows are typically defined as a graph of tasks. Workflow definitions may be written in a
general-purpose programming language, a domain-specific language (DSL), or a markup language such as
Business Process Execution Language (BPEL) [^44].

--------

> [!TIP] TASKS, ACTIVITIES, AND FUNCTIONS

Different workflow engines use different names for tasks. Temporal, for example, uses the term
*activity*. Others refer to tasks as *durable functions*. Though the names differ, the concepts are the same.

--------

{{< figure src="/fig/ddia_0507.png" id="fig_encoding_workflow" title="Figure 5-7. Example of a workflow expressed using Business Process Model and Notation (BPMN), a graphical notation." class="w-full my-4" >}}


Workflows are run, or executed, by a *workflow engine*. Workflow engines determine when to run each
task, on which machine a task must be run, what to do if a task fails (e.g., if the machine crashes
while the task is running), how many tasks are allowed to execute in parallel, and more.

Workflow engines are typically composed of an orchestrator and an executor. The orchestrator is
responsible for scheduling tasks to be executed and the executor is responsible for executing tasks.
Execution begins when a workflow is triggered. The orchestrator triggers the workflow itself if
users define a time-based schedule, such as hourly execution. External sources such as a web service
or even a human can also trigger workflow executions. Once triggered, executors are invoked to run
tasks.

There are many kinds of workflow engines that address a diverse set of use cases. Some, such as
Airflow, Dagster, and Prefect, integrate with data systems and orchestrate ETL tasks. Others, such
as Camunda and Orkes, provide a graphical notation for workflows (such as BPMN, used in
[Figure 5-7](/en/ch5#fig_encoding_workflow)) so that non-engineers can more easily define and execute workflows. Still
others, such as Temporal and Restate, provide *durable execution*.

#### Durable execution {#durable-execution}

Durable execution frameworks have become a popular way to build service-based architectures that
require transactionality. In our payment example, we would like to process each payment exactly
once. A failure while the workflow is executing could result in a credit card charge, but no
corresponding bank account deposit. In a service-based architecture, we can’t simply wrap the two
tasks in a database transaction. Moreover, we might be interacting with third-party payment gateways
that we have limited control over.

Durable execution frameworks are a way to provide *exactly-once semantics* for workflows. If a
task fails, the framework will re-execute the task, but will skip any RPC calls or state changes
that the task made successfully before failing. Instead, the framework will pretend to make the
call, but will return the results from the previous call. This is possible because durable
execution frameworks log all RPCs and state changes to durable storage like a write-ahead log [^45] [^46].
[Example 5-5](/en/ch5#fig_temporal_workflow) shows an example of a workflow definition that supports durable execution
using Temporal.

{{< figure id="fig_temporal_workflow" title="Example 5-5. A Temporal workflow definition fragment for the payment workflow in [Figure 5-7](/en/ch5#fig_encoding_workflow)." class="w-full my-4" >}}

```python
@workflow.defn
class PaymentWorkflow:
    @workflow.run
    async def run(self, payment: PaymentRequest) -> PaymentResult:
        is_fraud = await workflow.execute_activity(
            check_fraud,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        if is_fraud:
            return PaymentResultFraudulent
        credit_card_response = await workflow.execute_activity(
            debit_credit_card,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        # ...
```

Frameworks like Temporal are not without their challenges. External services, such as the
third-party payment gateway in our example, must still provide an idempotent API. Developers must
remember to use unique IDs for these APIs to prevent duplicate execution [^47].
And because durable execution frameworks log each RPC call in order, they expect a subsequent
execution to make the same RPC calls in the same order. This makes code changes brittle: you
might introduce undefined behavior simply by re-ordering function calls [^48].
Instead of modifying the code of an existing workflow, it is safer to deploy a new version of the
code separately, so that re-executions of existing workflow invocations continue to use the old
version, and only new invocations use the new code [^49].

Similarly, because durable execution frameworks expect to replay all code deterministically (the
same inputs produce the same outputs), nondeterministic code such as random number generators or system clocks is problematic [^48].
Frameworks often provide their own, deterministic implementations of such library functions, but
you have to remember to use them. In some cases, such as with Temporal’s workflowcheck tool,
frameworks provide static analysis tools to determine if nondeterministic behavior has been introduced.

--------

> [!NOTE]
> Making code deterministic is a powerful idea, but tricky to do robustly. In
> [“The Power of Determinism”](/en/ch9#sidebar_distributed_determinism) we will return to this topic.

--------

### Event-Driven Architectures {#sec_encoding_dataflow_msg}

In this final section, we will briefly look at *event-driven architectures*, which are another way
in which encoded data can flow from one process to another. A request is called an *event* or *message*;
unlike RPC, the sender usually does not wait for the recipient to process the event. Moreover,
events are typically not sent to the recipient via a direct network connection, but go via an
intermediary called a *message broker* (also called an *event broker*, *message queue*, or
*message-oriented middleware*), which stores the message temporarily [^50].

Using a message broker has several advantages compared to direct RPC:

* It can act as a buffer if the recipient is unavailable or overloaded, and thus improve system reliability.
* It can automatically redeliver messages to a process that has crashed, and thus prevent messages from being lost.
* It avoids the need for service discovery, since senders do not need to directly connect to the IP address of the recipient.
* It allows the same message to be sent to several recipients.
* It logically decouples the sender from the recipient (the sender just publishes messages and doesn’t care who consumes them).

The communication via a message broker is *asynchronous*: the sender doesn’t wait for the message to
be delivered, but simply sends it and then forgets about it. It’s possible to implement a
synchronous RPC-like model by having the sender wait for a response on a separate channel.

#### Message brokers {#message-brokers}

In the past, the landscape of message brokers was dominated by commercial enterprise software from
companies such as TIBCO, IBM WebSphere, and webMethods, before open source implementations such as
RabbitMQ, ActiveMQ, HornetQ, NATS, and Apache Kafka became popular. More recently, cloud services
such as Amazon Kinesis, Azure Service Bus, and Google Cloud Pub/Sub have gained adoption. We will
compare them in more detail in [“Messaging Systems”](/en/ch12#sec_stream_messaging).

The detailed delivery semantics vary by implementation and configuration, but in general, two
message distribution patterns are most often used:

* One process adds a message to a named *queue*, and the broker delivers that message to a
 *consumer* of that queue. If there are multiple consumers, one of them receives the message.
* One process publishes a message to a named *topic*, and the broker delivers that message to all
 *subscribers* of that topic. If there are multiple subscribers, they all receive the message.

Message brokers typically don’t enforce any particular data model—a message is just a sequence of
bytes with some metadata, so you can use any encoding format. A common approach is to use Protocol
Buffers, Avro, or JSON, and to deploy a schema registry alongside the message broker to store all
the valid schema versions and check their compatibility [^19] [^21].
AsyncAPI, a messaging-based equivalent of OpenAPI, can also be used to specify the schema of messages.

Message brokers differ in terms of how durable their messages are. Many write messages to disk, so
that they are not lost in case the message broker crashes or needs to be restarted. Unlike
databases, many message brokers automatically delete messages again after they have been consumed.
Some brokers can be configured to store messages indefinitely, which you would require if you want
to use event sourcing (see [“Event Sourcing and CQRS”](/en/ch3#sec_datamodels_events)).

If a consumer republishes messages to another topic, you may need to be careful to preserve unknown
fields, to prevent the issue described previously in the context of databases
([Figure 5-1](/en/ch5#fig_encoding_preserve_field)).

#### Distributed actor frameworks {#distributed-actor-frameworks}

The *actor model* is a programming model for concurrency in a single process. Rather than dealing
directly with threads (and the associated problems of race conditions, locking, and deadlock), logic
is encapsulated in *actors*. Each actor typically represents one client or entity, it may have some
local state (which is not shared with any other actor), and it communicates with other actors by
sending and receiving asynchronous messages. Message delivery is not guaranteed: in certain error
scenarios, messages will be lost. Since each actor processes only one message at a time, it doesn’t
need to worry about threads, and each actor can be scheduled independently by the framework.

In *distributed actor frameworks* such as Akka, Orleans [^51],
and Erlang/OTP, this programming model is used to scale an application across
multiple nodes. The same message-passing mechanism is used, no matter whether the sender and recipient
are on the same node or different nodes. If they are on different nodes, the message is
transparently encoded into a byte sequence, sent over the network, and decoded on the other side.

Location transparency works better in the actor model than in RPC, because the actor model already
assumes that messages may be lost, even within a single process. Although latency over the network
is likely higher than within the same process, there is less of a fundamental mismatch between local
and remote communication when using the actor model.

A distributed actor framework essentially integrates a message broker and the actor programming
model into a single framework. However, if you want to perform rolling upgrades of your actor-based
application, you still have to worry about forward and backward compatibility, as messages may be
sent from a node running the new version to a node running the old version, and vice versa. This can
be achieved by using one of the encodings discussed in this chapter.


## Summary {#summary}

In this chapter we looked at several ways of turning data structures into bytes on the network or
bytes on disk. We saw how the details of these encodings affect not only their efficiency, but more
importantly also the architecture of applications and your options for evolving them.

In particular, many services need to support rolling upgrades, where a new version of a service is
gradually deployed to a few nodes at a time, rather than deploying to all nodes simultaneously.
Rolling upgrades allow new versions of a service to be released without downtime (thus encouraging
frequent small releases over rare big releases) and make deployments less risky (allowing faulty
releases to be detected and rolled back before they affect a large number of users). These
properties are hugely beneficial for *evolvability*, the ease of making changes to an application.

During rolling upgrades, or for various other reasons, we must assume that different nodes are
running different versions of our application’s code. Thus, it is important that all data
flowing around the system is encoded in a way that provides backward compatibility (new code can
read old data) and forward compatibility (old code can read new data).

We discussed several data encoding formats and their compatibility properties:

* Programming language–specific encodings are restricted to a single programming language and often
 fail to provide forward and backward compatibility.
* Textual formats like JSON, XML, and CSV are widespread, and their compatibility depends on how you
 use them. They have optional schema languages, which are sometimes helpful and sometimes a
 hindrance. These formats are somewhat vague about datatypes, so you have to be careful with things
 like numbers and binary strings.
* Binary schema–driven formats like Protocol Buffers and Avro allow compact, efficient encoding with
 clearly defined forward and backward compatibility semantics. The schemas can be useful for
 documentation and code generation in statically typed languages. However, these formats have the
 downside that data needs to be decoded before it is human-readable.

We also discussed several modes of dataflow, illustrating different scenarios in which data
encodings are important:

* Databases, where the process writing to the database encodes the data and the process reading
 from the database decodes it
* RPC and REST APIs, where the client encodes a request, the server decodes the request and encodes
 a response, and the client finally decodes the response
* Event-driven architectures (using message brokers or actors), where nodes communicate by sending
 each other messages that are encoded by the sender and decoded by the recipient

We can conclude that with a bit of care, backward/forward compatibility and rolling upgrades are
quite achievable. May your application’s evolution be rapid and your deployments be frequent.


### References

[^1]: [CWE-502: Deserialization of Untrusted Data](https://cwe.mitre.org/data/definitions/502.html). Common Weakness Enumeration, *cwe.mitre.org*, July 2006. Archived at [perma.cc/26EU-UK9Y](https://perma.cc/26EU-UK9Y) 
[^2]: Steve Breen. [What Do WebLogic, WebSphere, JBoss, Jenkins, OpenNMS, and Your Application Have in Common? This Vulnerability](https://foxglovesecurity.com/2015/11/06/what-do-weblogic-websphere-jboss-jenkins-opennms-and-your-application-have-in-common-this-vulnerability/). *foxglovesecurity.com*, November 2015. Archived at [perma.cc/9U97-UVVD](https://perma.cc/9U97-UVVD) 
[^3]: Patrick McKenzie. [What the Rails Security Issue Means for Your Startup](https://www.kalzumeus.com/2013/01/31/what-the-rails-security-issue-means-for-your-startup/). *kalzumeus.com*, January 2013. Archived at [perma.cc/2MBJ-7PZ6](https://perma.cc/2MBJ-7PZ6) 
[^4]: Brian Goetz. [Towards Better Serialization](https://openjdk.org/projects/amber/design-notes/towards-better-serialization). *openjdk.org*, June 2019. Archived at [perma.cc/UK6U-GQDE](https://perma.cc/UK6U-GQDE) 
[^5]: Eishay Smith. [jvm-serializers wiki](https://github.com/eishay/jvm-serializers/wiki). *github.com*, October 2023. Archived at [perma.cc/PJP7-WCNG](https://perma.cc/PJP7-WCNG) 
[^6]: [XML Is a Poor Copy of S-Expressions](https://wiki.c2.com/?XmlIsaPoorCopyOfEssExpressions). *wiki.c2.com*, May 2013. Archived at [perma.cc/7FAN-YBKL](https://perma.cc/7FAN-YBKL) 
[^7]: Julia Evans. [Examples of floating point problems](https://jvns.ca/blog/2023/01/13/examples-of-floating-point-problems/). *jvns.ca*, January 2023. Archived at [perma.cc/M57L-QKKW](https://perma.cc/M57L-QKKW) 
[^8]: Matt Harris. [Snowflake: An Update and Some Very Important Information](https://groups.google.com/g/twitter-development-talk/c/ahbvo3VTIYI). Email to *Twitter Development Talk* mailing list, October 2010. Archived at [perma.cc/8UBV-MZ3D](https://perma.cc/8UBV-MZ3D) 
[^9]: Yakov Shafranovich. [RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://tools.ietf.org/html/rfc4180). IETF, October 2005. 
[^10]: Andy Coates. [Evolving JSON Schemas - Part I](https://www.creekservice.org/articles/2024/01/08/json-schema-evolution-part-1.html) and [Part II](https://www.creekservice.org/articles/2024/01/09/json-schema-evolution-part-2.html). *creekservice.org*, January 2024. Archived at [perma.cc/MZW3-UA54](https://perma.cc/MZW3-UA54) and [perma.cc/GT5H-WKZ5](https://perma.cc/GT5H-WKZ5) 
[^11]: Pierre Genevès, Nabil Layaïda, and Vincent Quint. [Ensuring Query Compatibility with Evolving XML Schemas](https://arxiv.org/abs/0811.4324). INRIA Technical Report 6711, November 2008. 
[^12]: Tim Bray. [Bits On the Wire](https://www.tbray.org/ongoing/When/201x/2019/11/17/Bits-On-the-Wire). *tbray.org*, November 2019. Archived at [perma.cc/3BT3-BQU3](https://perma.cc/3BT3-BQU3) 
[^13]: Mark Slee, Aditya Agarwal, and Marc Kwiatkowski. [Thrift: Scalable Cross-Language Services Implementation](https://thrift.apache.org/static/files/thrift-20070401.pdf). Facebook technical report, April 2007. Archived at [perma.cc/22BS-TUFB](https://perma.cc/22BS-TUFB) 
[^14]: Martin Kleppmann. [Schema Evolution in Avro, Protocol Buffers and Thrift](https://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html). *martin.kleppmann.com*, December 2012. Archived at [perma.cc/E4R2-9RJT](https://perma.cc/E4R2-9RJT) 
[^15]: Doug Cutting, Chad Walters, Jim Kellerman, et al. [[PROPOSAL] New Subproject: Avro](https://lists.apache.org/thread/z571w0r5jmfsjvnl0fq4fgg0vh28d3bk). Email thread on *hadoop-general* mailing list, *lists.apache.org*, April 2009. Archived at [perma.cc/4A79-BMEB](https://perma.cc/4A79-BMEB) 
[^16]: Apache Software Foundation. [Apache Avro 1.12.0 Specification](https://avro.apache.org/docs/1.12.0/specification/). *avro.apache.org*, August 2024. Archived at [perma.cc/C36P-5EBQ](https://perma.cc/C36P-5EBQ) 
[^17]: Apache Software Foundation. [Avro schemas as LL(1) CFG definitions](https://avro.apache.org/docs/1.12.0/api/java/org/apache/avro/io/parsing/doc-files/parsing.html). *avro.apache.org*, August 2024. Archived at [perma.cc/JB44-EM9Q](https://perma.cc/JB44-EM9Q) 
[^18]: Tony Hoare. [Null References: The Billion Dollar Mistake](https://www.infoq.com/presentations/Null-References-The-Billion-Dollar-Mistake-Tony-Hoare/). Talk at *QCon London*, March 2009. 
[^19]: Confluent, Inc. [Schema Registry Overview](https://docs.confluent.io/platform/current/schema-registry/index.html). *docs.confluent.io*, 2024. Archived at [perma.cc/92C3-A9JA](https://perma.cc/92C3-A9JA) 
[^20]: Aditya Auradkar and Tom Quiggle. [Introducing Espresso—LinkedIn’s Hot New Distributed Document Store](https://engineering.linkedin.com/espresso/introducing-espresso-linkedins-hot-new-distributed-document-store). *engineering.linkedin.com*, January 2015. Archived at [perma.cc/FX4P-VW9T](https://perma.cc/FX4P-VW9T) 
[^21]: Jay Kreps. [Putting Apache Kafka to Use: A Practical Guide to Building a Stream Data Platform (Part 2)](https://www.confluent.io/blog/event-streaming-platform-2/). *confluent.io*, February 2015. Archived at [perma.cc/8UA4-ZS5S](https://perma.cc/8UA4-ZS5S) 
[^22]: Gwen Shapira. [The Problem of Managing Schemas](https://www.oreilly.com/content/the-problem-of-managing-schemas/). *oreilly.com*, November 2014. Archived at [perma.cc/BY8Q-RYV3](https://perma.cc/BY8Q-RYV3) 
[^23]: John Larmouth. [*ASN.1 Complete*](https://www.oss.com/asn1/resources/books-whitepapers-pubs/larmouth-asn1-book.pdf). Morgan Kaufmann, 1999. ISBN: 978-0-122-33435-1. Archived at [perma.cc/GB7Y-XSXQ](https://perma.cc/GB7Y-XSXQ) 
[^24]: Burton S. Kaliski Jr. [A Layman’s Guide to a Subset of ASN.1, BER, and DER](https://luca.ntop.org/Teaching/Appunti/asn1.html). Technical Note, RSA Data Security, Inc., November 1993. Archived at [perma.cc/2LMN-W9U8](https://perma.cc/2LMN-W9U8) 
[^25]: Jacob Hoffman-Andrews. [A Warm Welcome to ASN.1 and DER](https://letsencrypt.org/docs/a-warm-welcome-to-asn1-and-der/). *letsencrypt.org*, April 2020. Archived at [perma.cc/CYT2-GPQ8](https://perma.cc/CYT2-GPQ8) 
[^26]: Lev Walkin. [Question: Extensibility and Dropping Fields](https://lionet.info/asn1c/blog/2010/09/21/question-extensibility-removing-fields/). *lionet.info*, September 2010. Archived at [perma.cc/VX8E-NLH3](https://perma.cc/VX8E-NLH3) 
[^27]: Jacqueline Xu. [Online migrations at scale](https://stripe.com/blog/online-migrations). *stripe.com*, February 2017. Archived at [perma.cc/X59W-DK7Y](https://perma.cc/X59W-DK7Y) 
[^28]: Geoffrey Litt, Peter van Hardenberg, and Orion Henry. [Project Cambria: Translate your data with lenses](https://www.inkandswitch.com/cambria/). Technical Report, *Ink & Switch*, October 2020. Archived at [perma.cc/WA4V-VKDB](https://perma.cc/WA4V-VKDB) 
[^29]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005. 
[^30]: Roy Thomas Fielding. [Architectural Styles and the Design of Network-Based Software Architectures](https://ics.uci.edu/~fielding/pubs/dissertation/fielding_dissertation.pdf). PhD Thesis, University of California, Irvine, 2000. Archived at [perma.cc/LWY9-7BPE](https://perma.cc/LWY9-7BPE) 
[^31]: Roy Thomas Fielding. [REST APIs must be hypertext-driven](https://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven). *roy.gbiv.com*, October 2008. Archived at [perma.cc/M2ZW-8ATG](https://perma.cc/M2ZW-8ATG) 
[^32]: [OpenAPI Specification Version 3.1.0](https://swagger.io/specification/). *swagger.io*, February 2021. Archived at [perma.cc/3S6S-K5M4](https://perma.cc/3S6S-K5M4) 
[^33]: Michi Henning. [The Rise and Fall of CORBA](https://cacm.acm.org/practice/the-rise-and-fall-of-corba/). *Communications of the ACM*, volume 51, issue 8, pages 52–57, August 2008. [doi:10.1145/1378704.1378718](https://doi.org/10.1145/1378704.1378718) 
[^34]: Pete Lacey. [The S Stands for Simple](https://harmful.cat-v.org/software/xml/soap/simple). *harmful.cat-v.org*, November 2006. Archived at [perma.cc/4PMK-Z9X7](https://perma.cc/4PMK-Z9X7) 
[^35]: Stefan Tilkov. [Interview: Pete Lacey Criticizes Web Services](https://www.infoq.com/articles/pete-lacey-ws-criticism/). *infoq.com*, December 2006. Archived at [perma.cc/JWF4-XY3P](https://perma.cc/JWF4-XY3P) 
[^36]: Tim Bray. [The Loyal WS-Opposition](https://www.tbray.org/ongoing/When/200x/2004/09/18/WS-Oppo). *tbray.org*, September 2004. Archived at [perma.cc/J5Q8-69Q2](https://perma.cc/J5Q8-69Q2) 
[^37]: Andrew D. Birrell and Bruce Jay Nelson. [Implementing Remote Procedure Calls](https://www.cs.princeton.edu/courses/archive/fall03/cs518/papers/rpc.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 2, issue 1, pages 39–59, February 1984. [doi:10.1145/2080.357392](https://doi.org/10.1145/2080.357392) 
[^38]: Jim Waldo, Geoff Wyant, Ann Wollrath, and Sam Kendall. [A Note on Distributed Computing](https://m.mirror.facebook.net/kde/devel/smli_tr-94-29.pdf). Sun Microsystems Laboratories, Inc., Technical Report TR-94-29, November 1994. Archived at [perma.cc/8LRZ-BSZR](https://perma.cc/8LRZ-BSZR) 
[^39]: Steve Vinoski. [Convenience over Correctness](https://steve.vinoski.net/pdf/IEEE-Convenience_Over_Correctness.pdf). *IEEE Internet Computing*, volume 12, issue 4, pages 89–92, July 2008. [doi:10.1109/MIC.2008.75](https://doi.org/10.1109/MIC.2008.75) 
[^40]: Brandur Leach. [Designing robust and predictable APIs with idempotency](https://stripe.com/blog/idempotency). *stripe.com*, February 2017. Archived at [perma.cc/JD22-XZQT](https://perma.cc/JD22-XZQT) 
[^41]: Sam Rose. [Load Balancing](https://samwho.dev/load-balancing/). *samwho.dev*, April 2023. Archived at [perma.cc/Q7BA-9AE2](https://perma.cc/Q7BA-9AE2) 
[^42]: Troy Hunt. [Your API versioning is wrong, which is why I decided to do it 3 different wrong ways](https://www.troyhunt.com/your-api-versioning-is-wrong-which-is/). *troyhunt.com*, February 2014. Archived at [perma.cc/9DSW-DGR5](https://perma.cc/9DSW-DGR5) 
[^43]: Brandur Leach. [APIs as infrastructure: future-proofing Stripe with versioning](https://stripe.com/blog/api-versioning). *stripe.com*, August 2017. Archived at [perma.cc/L63K-USFW](https://perma.cc/L63K-USFW) 
[^44]: Alexandre Alves, Assaf Arkin, Sid Askary, et al. [Web Services Business Process Execution Language Version 2.0](https://docs.oasis-open.org/wsbpel/2.0/wsbpel-v2.0.html). *docs.oasis-open.org*, April 2007. 
[^45]: [What is a Temporal Service?](https://docs.temporal.io/clusters) *docs.temporal.io*, 2024. Archived at [perma.cc/32P3-CJ9V](https://perma.cc/32P3-CJ9V) 
[^46]: Stephan Ewen. [Why we built Restate](https://restate.dev/blog/why-we-built-restate/). *restate.dev*, August 2023. Archived at [perma.cc/BJJ2-X75K](https://perma.cc/BJJ2-X75K) 
[^47]: Keith Tenzer and Joshua Smith. [Idempotency and Durable Execution](https://temporal.io/blog/idempotency-and-durable-execution). *temporal.io*, February 2024. Archived at [perma.cc/9LGW-PCLU](https://perma.cc/9LGW-PCLU) 
[^48]: [What is a Temporal Workflow?](https://docs.temporal.io/workflows) *docs.temporal.io*, 2024. Archived at [perma.cc/B5C5-Y396](https://perma.cc/B5C5-Y396) 
[^49]: Jack Kleeman. [Solving durable execution’s immutability problem](https://restate.dev/blog/solving-durable-executions-immutability-problem/). *restate.dev*, February 2024. Archived at [perma.cc/G55L-EYH5](https://perma.cc/G55L-EYH5) 
[^50]: Srinath Perera. [Exploring Event-Driven Architecture: A Beginner’s Guide for Cloud Native Developers](https://wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/). *wso2.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20240716204613/https%3A//wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/) 
[^51]: Philip A. Bernstein, Sergey Bykov, Alan Geller, Gabriel Kliot, and Jorgen Thelin. [Orleans: Distributed Virtual Actors for Programmability and Scalability](https://www.microsoft.com/en-us/research/publication/orleans-distributed-virtual-actors-for-programmability-and-scalability/). Microsoft Research Technical Report MSR-TR-2014-41, March 2014. Archived at [perma.cc/PD3U-WDMF](https://perma.cc/PD3U-WDMF) 


================================================
FILE: content/en/ch6.md
================================================
---
title: "6. Replication"
weight: 206
breadcrumbs: false
---

<a id="ch_replication"></a>

![](/map/ch05.png)

> *The major difference between a thing that might go wrong and a thing that cannot possibly go wrong
> is that when a thing that cannot possibly go wrong goes wrong it usually turns out to be impossible
> to get at or repair.*
>
> Douglas Adams, *Mostly Harmless* (1992)

*Replication* means keeping a copy of the same data on multiple machines that are connected via a
network. As discussed in [“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed), there are several reasons
why you might want to replicate data:

* To keep data geographically close to your users (and thus reduce access latency)
* To allow the system to continue working even if some of its parts have failed (and thus increase availability)
* To scale out the number of machines that can serve read queries (and thus increase read throughput)

In this chapter we will assume that your dataset is small enough that each machine can hold a copy of
the entire dataset. In [Chapter 7](/en/ch7#ch_sharding) we will relax that assumption and discuss *sharding*
(*partitioning*) of datasets that are too big for a single machine. In later chapters we will discuss
various kinds of faults that can occur in a replicated data system, and how to deal with them.

If the data that you’re replicating does not change over time, then replication is easy: you just
need to copy the data to every node once, and you’re done. All of the difficulty in replication lies
in handling *changes* to replicated data, and that’s what this chapter is about. We will discuss
three families of algorithms for replicating changes between nodes: *single-leader*, *multi-leader*,
and *leaderless* replication. Almost all distributed databases use one of these three approaches.
They all have various pros and cons, which we will examine in detail.

There are many trade-offs to consider with replication: for example, whether to use synchronous or
asynchronous replication, and how to handle failed replicas. Those are often configuration options
in databases, and although the details vary by database, the general principles are similar across
many different implementations. We will discuss the consequences of such choices in this chapter.

Replication of databases is an old topic—the principles haven’t changed much since they were
studied in the 1970s [^1], because the fundamental constraints of networks have remained the same. Despite being so old,
concepts such as *eventual consistency* still cause confusion. In [“Problems with Replication Lag”](/en/ch6#sec_replication_lag) we will
get more precise about eventual consistency and discuss things like the *read-your-writes* and
*monotonic reads* guarantees.

--------

> [!TIP] BACKUPS AND REPLICATION

You might be wondering whether you still need backups if you have replication. The answer is yes,
because they have different purposes: replicas quickly reflect writes from one node on other nodes,
but backups store old snapshots of the data so that you can go back in time. If you accidentally
delete some data, replication doesn’t help since the deletion will have also been propagated to the
replicas, so you need a backup if you want to restore the deleted data.

In fact, replication and backups are often complementary to each other. Backups are sometimes part
of the process of setting up replication, as we shall see in [“Setting Up New Followers”](/en/ch6#sec_replication_new_replica).
Conversely, archiving replication logs can be part of a backup process.

Some databases internally maintain immutable snapshots of past states, which serve as a kind of
internal backup. However, this means keeping old versions of the data on the same storage media as
the current state. If you have a large amount of data, it can be cheaper to keep the backups of old
data in an object store that is optimized for infrequently-accessed data, and to store only the
current state of the database in primary storage.

--------

## Single-Leader Replication {#sec_replication_leader}

Each node that stores a copy of the database is called a *replica*. With multiple replicas, a
question inevitably arises: how do we ensure that all the data ends up on all the replicas?

Every write to the database needs to be processed by every replica; otherwise, the replicas would no
longer contain the same data. The most common solution is called *leader-based replication*,
*primary-backup*, or *active/passive*. It works as follows (see [Figure 6-1](/en/ch6#fig_replication_leader_follower)):

1. One of the replicas is designated the *leader* (also known as *primary* or *source* [^2]).
   When clients want to write to the database, they must send their requests to the leader, which
   first writes the new data to its local storage.
2. The other replicas are known as *followers* (*read replicas*, *secondaries*, or *hot standbys*).
   Whenever the leader writes new data to its local storage, it also sends the data change to all of
   its followers as part of a *replication log* or *change stream*. Each follower takes the log
   from the leader and updates its local copy of the database accordingly, by applying all writes in
   the same order as they were processed on the leader.
3. When a client wants to read from the database, it can query either the leader or any of the
   followers. However, writes are only accepted on the leader (the followers are read-only from the
   client’s point of view).

{{< figure src="/fig/ddia_0601.png" id="fig_replication_leader_follower" caption="Figure 6-1. Single-leader replication directs all writes to a designated leader, which sends a stream of changes to the follower replicas." class="w-full my-4" >}}

If the database is sharded (see [Chapter 7](/en/ch7#ch_sharding)), each shard has one leader. Different shards may
have their leaders on different nodes, but each shard must nevertheless have one leader node. In
[“Multi-Leader Replication”](/en/ch6#sec_replication_multi_leader) we will discuss an alternative model in which a system may have
multiple leaders for the same shard at the same time.

Single-leader replication is very widely used. It’s a built-in feature of many relational databases,
such as PostgreSQL, MySQL, Oracle Data Guard [^3], and SQL Server’s Always On Availability Groups [^4].
It is also used in some document databases such as MongoDB and DynamoDB [^5],
message brokers such as Kafka, replicated block devices such as DRBD, and some network filesystems.
Many consensus algorithms such as Raft, which is used for replication in CockroachDB [^6], TiDB [^7],
etcd, and RabbitMQ quorum queues (among others), are also based on a single leader, and automatically 
elect a new leader if the old one fails (we will discuss consensus in more detail in [Chapter 10](/en/ch10#ch_consistency)).

--------

> [!NOTE]
> In older documents you may see the term *master–slave replication*. It means the same as
> leader-based replication, but the term should be avoided as it is widely considered offensive [^8].

--------

### Synchronous Versus Asynchronous Replication {#sec_replication_sync_async}

An important detail of a replicated system is whether the replication happens *synchronously* or
*asynchronously*. (In relational databases, this is often a configurable option; other systems are
often hardcoded to be either one or the other.)

Think about what happens in [Figure 6-1](/en/ch6#fig_replication_leader_follower), where the user of a website updates
their profile image. At some point in time, the client sends the update request to the leader;
shortly afterward, it is received by the leader. At some point, the leader forwards the data change
to the followers. Eventually, the leader notifies the client that the update was successful.
[Figure 6-2](/en/ch6#fig_replication_sync_replication) shows one possible way how the timings could work out.

{{< figure src="/fig/ddia_0602.png" id="fig_replication_sync_replication" caption="Figure 6-2. Leader-based replication with one synchronous and one asynchronous follower." class="w-full my-4" >}}

In the example of [Figure 6-2](/en/ch6#fig_replication_sync_replication), the replication to follower 1 is
*synchronous*: the leader waits until follower 1 has confirmed that it received the write before
reporting success to the user, and before making the write visible to other clients. The replication
to follower 2 is *asynchronous*: the leader sends the message, but doesn’t wait for a response from
the follower.

The diagram shows that there is a substantial delay before follower 2 processes the message.
Normally, replication is quite fast: most database systems apply changes to followers in less than a
second. However, there is no guarantee of how long it might take. There are circumstances when
followers might fall behind the leader by several minutes or more; for example, if a follower is
recovering from a failure, if the system is operating near maximum capacity, or if there are network
problems between the nodes.

The advantage of synchronous replication is that the follower is guaranteed to have an up-to-date
copy of the data that is consistent with the leader. If the leader suddenly fails, we can be sure
that the data is still available on the follower. The disadvantage is that if the synchronous
follower doesn’t respond (because it has crashed, or there is a network fault, or for any other
reason), the write cannot be processed. The leader must block all writes and wait until the
synchronous replica is available again.

For that reason, it is impracticable for all followers to be synchronous: any one node outage would
cause the whole system to grind to a halt. In practice, if a database offers synchronous
replication, it often means that *one* of the followers is synchronous, and the others are
asynchronous. If the synchronous follower becomes unavailable or slow, one of the asynchronous
followers is made synchronous. This guarantees that you have an up-to-date copy of the data on at
least two nodes: the leader and one synchronous follower. This configuration is sometimes also
called *semi-synchronous*.

In some systems, a *majority* (e.g., 3 out of 5 replicas, including the leader) of replicas is
updated synchronously, and the remaining minority is asynchronous. This is an example of a *quorum*,
which we will discuss further in [“Quorums for reading and writing”](/en/ch6#sec_replication_quorum_condition). Majority quorums are often
used in systems that use a consensus protocol for automatic leader election, which we will return to
in [Chapter 10](/en/ch10#ch_consistency).

Sometimes, leader-based replication is configured to be completely asynchronous. In this case, if the
leader fails and is not recoverable, any writes that have not yet been replicated to followers are
lost. This means that a write is not guaranteed to be durable, even if it has been confirmed to the
client. However, a fully asynchronous configuration has the advantage that the leader can continue
processing writes, even if all of its followers have fallen behind.

Weakening durability may sound like a bad trade-off, but asynchronous replication is nevertheless
widely used, especially if there are many followers or if they are geographically distributed [^9].
We will return to this issue in [“Problems with Replication Lag”](/en/ch6#sec_replication_lag).

### Setting Up New Followers {#sec_replication_new_replica}

From time to time, you need to set up new followers—perhaps to increase the number of replicas,
or to replace failed nodes. How do you ensure that the new follower has an accurate copy of the
leader’s data?

Simply copying data files from one node to another is typically not sufficient: clients are
constantly writing to the database, and the data is always in flux, so a standard file copy would
see different parts of the database at different points in time. The result might not make any
sense.

You could make the files on disk consistent by locking the database (making it unavailable for
writes), but that would go against our goal of high availability. Fortunately, setting up a
follower can usually be done without downtime. Conceptually, the process looks like this:

1. Take a consistent snapshot of the leader’s database at some point in time—if possible, without
   taking a lock on the entire database. Most databases have this feature, as it is also required
   for backups. In some cases, third-party tools are needed, such as Percona XtraBackup for MySQL.
2. Copy the snapshot to the new follower node.
3. The follower connects to the leader and requests all the data changes that have happened since
   the snapshot was taken. This requires that the snapshot is associated with an exact position in
   the leader’s replication log. That position has various names: for example, PostgreSQL calls it
   the *log sequence number*; MySQL has two mechanisms, *binlog coordinates* and *global transaction
   identifiers* (GTIDs).
4. When the follower has processed the backlog of data changes since the snapshot, we say it has
   *caught up*. It can now continue to process data changes from the leader as they happen.

The practical steps of setting up a follower vary significantly by database. In some systems the
process is fully automated, whereas in others it can be a somewhat arcane multi-step workflow that
needs to be manually performed by an administrator.

You can also archive the replication log to an object store; along with periodic snapshots of the
whole database in the object store, this is a good way of implementing database backups and disaster
recovery. You can also perform steps 1 and 2 of setting up a new follower by downloading those files
from the object store. For example, WAL-G does this for PostgreSQL, MySQL, and SQL Server, and
Litestream does the equivalent for SQLite.

--------

<a id="sec_replication_object_storage"></a>

> [!TIP] DATABASES BACKED BY OBJECT STORAGE

Object storage can be used for more than archiving data. Many databases are beginning to use object
stores such as Amazon Web Services S3, Google Cloud Storage, and Azure Blob Storage to serve data
for live queries. Storing database data in object storage has many benefits:

* Object storage is inexpensive compared to other cloud storage options, which allows cloud databases
  to store less-often queried data on cheaper, higher-latency storage while serving the working set
  from memory, SSDs, and NVMe.
* Object stores also provide multi-zone, dual-region, or multi-region replication with very high
  durability guarantees. This also allows databases to bypass inter-zone network fees.
* Databases can use an object store’s *conditional write* feature—essentially, a *compare-and-set*
  (CAS) operation—to implement transactions and leadership election [^10] [^11].
* Storing data from multiple databases in the same object store can simplify data integration,
  particularly when open formats such as Apache Parquet and Apache Iceberg are used.

These benefits dramatically simplify the database architecture by shifting the responsibility of
transactions, leadership election, and replication to object storage.

Systems that adopt object storage for replication must grapple with some trade-offs. Notably, object
stores have much higher read and write latencies than local disks or virtual block devices such as
EBS. Many cloud providers also charge a per-API call fee, which forces systems to batch reads and
writes to reduce cost. Such batching further increases latency. Moreover, many object stores do not
offer standard filesystem interfaces. This prevents systems that lack object storage integration
from leveraging object storage. Interfaces such as *filesystem in userspace* (FUSE) allow operators
to mount object store buckets as filesystems that applications can use without knowing their data is
stored on object storage. Still, many FUSE interfaces to object stores lack POSIX features such as
non-sequential writes or symlinks, which systems might depend on.

Different systems deal with these trade-offs in various ways. Some introduce a *tiered storage*
architecture that places less frequently accessed data on object storage while new or frequently
accessed data is kept on faster storage devices such as SSDs, NVMe, or even in memory. Other systems
use object storage as their primary storage tier, but use a separate low-latency storage system such
as Amazon’s EBS or Neon’s Safekeepers [^12] to store their WAL. Recently, some systems have gone even farther by adopting a
*zero-disk architecture* (ZDA). ZDA-based systems persist all data to object storage and use disks
and memory strictly for caching. This allows nodes to have no persistent state, which dramatically
simplifies operations. WarpStream, Confluent Freight, Buf’s Bufstream, and Redpanda Serverless are
all Kafka-compatible systems built using a zero-disk architecture. Nearly every modern cloud data
warehouse also adopts such an architecture, as does Turbopuffer (a vector search engine), and
SlateDB (a cloud-native LSM storage engine).

--------

### Handling Node Outages {#sec_replication_failover}

Any node in the system can go down, perhaps unexpectedly due to a fault, but just as likely due to
planned maintenance (for example, rebooting a machine to install a kernel security patch). Being
able to reboot individual nodes without downtime is a big advantage for operations and maintenance.
Thus, our goal is to keep the system as a whole running despite individual node failures, and to keep
the impact of a node outage as small as possible.

How do you achieve high availability with leader-based replication?

#### Follower failure: Catch-up recovery {#follower-failure-catch-up-recovery}

On its local disk, each follower keeps a log of the data changes it has received from the leader. If
a follower crashes and is restarted, or if the network between the leader and the follower is
temporarily interrupted, the follower can recover quite easily: from its log, it knows the last
transaction that was processed before the fault occurred. Thus, the follower can connect to the
leader and request all the data changes that occurred during the time when the follower was
disconnected. When it has applied these changes, it has caught up to the leader and can continue
receiving a stream of data changes as before.

Although follower recovery is conceptually simple, it can be challenging in terms of performance: if
the database has a high write throughput or if the follower has been offline for a long time, there
might be a lot of writes to catch up on. There will be high load on both the recovering follower and
the leader (which needs to send the backlog of writes to the follower) while this catch-up is ongoing.

The leader can delete its log of writes once all followers have confirmed that they have processed
it, but if a follower is unavailable for a long time, the leader faces a choice: either it retains
the log until the follower recovers and catches up (at the risk of running out of disk space on the
leader), or it deletes the log that the unavailable follower has not yet acknowledged (in which case
the follower won’t be able to recover from the log, and will have to be restored from a backup when
it comes back).

#### Leader failure: Failover {#leader-failure-failover}

Handling a failure of the leader is trickier: one of the followers needs to be promoted to be the
new leader, clients need to be reconfigured to send their writes to the new leader, and the other
followers need to start consuming data changes from the new leader. This process is called
*failover*.

Failover can happen manually (an administrator is notified that the leader has failed and takes the
necessary steps to make a new leader) or automatically. An automatic failover process usually
consists of the following steps:

1. *Determining that the leader has failed.* There are many things that could potentially go wrong:
   crashes, power outages, network issues, and more. There is no foolproof way of detecting what
   has gone wrong, so most systems simply use a timeout: nodes frequently bounce messages back and
   forth between each other, and if a node doesn’t respond for some period of time—say, 30
   seconds—it is assumed to be dead. (If the leader is deliberately taken down for planned
   maintenance, this doesn’t apply.)
2. *Choosing a new leader.* This could be done through an election process (where the leader is chosen by
   a majority of the remaining replicas), or a new leader could be appointed by a previously
   established *controller node* [^13].
   The best candidate for leadership is usually the replica with the most up-to-date data changes
   from the old leader (to minimize any data loss). Getting all the nodes to agree on a new leader
   is a consensus problem, discussed in detail in [Chapter 10](/en/ch10#ch_consistency).
3. *Reconfiguring the system to use the new leader.* Clients now need to send
   their write requests to the new leader (we discuss this
   in [“Request Routing”](/en/ch7#sec_sharding_routing)). If the old leader comes back, it might still believe that it is
   the leader, not realizing that the other replicas have
   forced it to step down. The system needs to ensure that the old leader becomes a follower and
   recognizes the new leader.

Failover is fraught with things that can go wrong:

* If asynchronous replication is used, the new leader may not have received all the writes from the old
  leader before it failed. If the former leader rejoins the cluster after a new leader has been
  chosen, what should happen to those writes? The new leader may have received conflicting writes
  in the meantime. The most common solution is for the old leader’s unreplicated writes to simply be
  discarded, which means that writes you believed to be committed actually weren’t durable after all.
* Discarding writes is especially dangerous if other storage systems outside of the database need to
  be coordinated with the database contents. For example, in one incident at GitHub [^14],
  an out-of-date MySQL follower
  was promoted to leader. The database used an autoincrementing counter to assign primary keys to
  new rows, but because the new leader’s counter lagged behind the old leader’s, it reused some
  primary keys that were previously assigned by the old leader. These primary keys were also used in
  a Redis store, so the reuse of primary keys resulted in inconsistency between MySQL and Redis,
  which caused some private data to be disclosed to the wrong users.
* In certain fault scenarios (see [Chapter 9](/en/ch9#ch_distributed)), it could happen that two nodes both believe
  that they are the leader. This situation is called *split brain*, and it is dangerous: if both
  leaders accept writes, and there is no process for resolving conflicts (see
  [“Multi-Leader Replication”](/en/ch6#sec_replication_multi_leader)), data is likely to be lost or corrupted. As a safety catch, some
  systems have a mechanism to shut down one node if two leaders are detected. However, if this
  mechanism is not carefully designed, you can end up with both nodes being shut down [^15].
  Moreover, there is a risk that by the time the split brain is detected and the old node is shut
  down, it is already too late and data has already been corrupted.
* What is the right timeout before the leader is declared dead? A longer timeout means a longer
  time to recovery in the case where the leader fails. However, if the timeout is too short, there
  could be unnecessary failovers. For example, a temporary load spike could cause a node’s response
  time to increase above the timeout, or a network glitch could cause delayed packets. If the system
  is already struggling with high load or network problems, an unnecessary failover is likely to
  make the situation worse, not better.

--------

> [!NOTE]
> Guarding against split brain by limiting or shutting down old leaders is known as *fencing* or, more
> emphatically, *Shoot The Other Node In The Head* (STONITH). We will discuss fencing in more detail
> in [“Distributed Locks and Leases”](/en/ch9#sec_distributed_lock_fencing).

--------

There are no easy solutions to these problems. For this reason, some operations teams prefer to
perform failovers manually, even if the software supports automatic failover.

The most important thing with failover is to pick an up-to-date follower as the new leader—if
synchronous or semi-synchronous replication is used, this would be the follower that the old leader
waited for before acknowledging writes. With asynchronous replication, you can pick the follower
with the greatest log sequence number. This minimizes the amount of data that is lost during
failover: losing a fraction of a second of writes may be tolerable, but picking a follower that is
behind by several days could be catastrophic.

These issues—node failures; unreliable networks; and trade-offs around replica consistency,
durability, availability, and latency—are in fact fundamental problems in distributed systems.
In [Chapter 9](/en/ch9#ch_distributed) and [Chapter 10](/en/ch10#ch_consistency) we will discuss them in greater depth.

### Implementation of Replication Logs {#sec_replication_implementation}

How does leader-based replication work under the hood? Several different replication methods are
used in practice, so let’s look at each one briefly.

#### Statement-based replication {#statement-based-replication}

In the simplest case, the leader logs every write request (*statement*) that it executes and sends
that statement log to its followers. For a relational database, this means that every `INSERT`,
`UPDATE`, or `DELETE` statement is forwarded to followers, and each follower parses and executes
that SQL statement as if it had been received from a client.

Although this may sound reasonable, there are various ways in which this approach to replication can
break down:

* Any statement that calls a nondeterministic function, such as `NOW()` to get the current date
  and time or `RAND()` to get a random number, is likely to generate a different value on each
  replica.
* If statements use an autoincrementing column, or if they depend on the existing data in the
  database (e.g., `UPDATE … WHERE <some condition>`), they must be executed in exactly the same
  order on each replica, or else they may have a different effect. This can be limiting when there
  are multiple concurrently executing transactions.
* Statements that have side effects (e.g., triggers, stored procedures, user-defined functions) may
  result in different side effects occurring on each replica, unless the side effects are absolutely
  deterministic.

It is possible to work around those issues—for example, the leader can replace any nondeterministic
function calls with a fixed return value when the statement is logged so that the followers all get
the same value. The idea of executing deterministic statements in a fixed order is similar to the
event sourcing model that we previously discussed in [“Event Sourcing and CQRS”](/en/ch3#sec_datamodels_events). This approach is
also known as *state machine replication*, and we will discuss the theory behind it in
[“Using shared logs”](/en/ch10#sec_consistency_smr).

Statement-based replication was used in MySQL before version 5.1. It is still sometimes used today,
as it is quite compact, but by default MySQL now switches to row-based replication (discussed shortly) if
there is any nondeterminism in a statement. VoltDB uses statement-based replication, and makes it
safe by requiring transactions to be deterministic [^16]. However, determinism can be hard to guarantee 
in practice, so many databases prefer other replication methods.

#### Write-ahead log (WAL) shipping {#write-ahead-log-wal-shipping}

In [Chapter 4](/en/ch4#ch_storage) we saw that a write-ahead log is needed to make B-tree storage engines robust:
every modification is first written to the WAL so that the tree can be restored to a consistent
state after a crash. Since the WAL contains all the information necessary to restore the indexes and
heap into a consistent state, we can use the exact same log to build a replica on another node:
besides writing the log to disk, the leader also sends it across the network to its followers. When
the follower processes this log, it builds a copy of the exact same files as found on the leader.

This method of replication is used in PostgreSQL and Oracle, among others [^17] [^18].
The main disadvantage is that the log describes the data on a very low level: a WAL contains details
of which bytes were changed in which disk blocks. This makes replication tightly coupled to the
storage engine. If the database changes its storage format from one version to another, it is
typically not possible to run different versions of the database software on the leader and the
followers.

That may seem like a minor implementation detail, but it can have a big operational impact. If the
replication protocol allows the follower to use a newer software version than the leader, you can
perform a zero-downtime upgrade of the database software by first upgrading the followers and then
performing a failover to make one of the upgraded nodes the new leader. If the replication protocol
does not allow this version mismatch, as is often the case with WAL shipping, such upgrades require
downtime.

<a id="sec_replication_logical"></a>

#### Logical (row-based) log replication {#logical-row-based-log-replication}

An alternative is to use different log formats for replication and for the storage engine, which
allows the replication log to be decoupled from the storage engine internals. This kind of
replication log is called a *logical log*, to distinguish it from the storage engine’s (*physical*)
data representation.

A logical log for a relational database is usually a sequence of records describing writes to
database tables at the granularity of a row:

* For an inserted row, the log contains the new values of all columns.
* For a deleted row, the log contains enough information to uniquely identify the row that was
  deleted. Typically this would be the primary key, but if there is no primary key on the table, the
  old values of all columns need to be logged.
* For an updated row, the log contains enough information to uniquely identify the updated row, and
  the new values of all columns (or at least the new values of all columns that changed).

A transaction that modifies several rows generates several such log records, followed by a record
indicating that the transaction was committed. MySQL keeps a separate logical replication log,
called the *binlog*, in addition to the WAL (when configured to use row-based replication).
PostgreSQL implements logical replication by decoding the physical WAL into row
insertion/update/delete events [^19].

Since a logical log is decoupled from the storage engine internals, it can more easily be kept
backward compatible, allowing the leader and the follower to run different versions of the database
software. This in turn enables upgrading to a new version with minimal downtime [^20].

A logical log format is also easier for external applications to parse. This aspect is useful if you want
to send the contents of a database to an external system, such as a data warehouse for offline
analysis, or for building custom indexes and caches [^21].
This technique is called *change data capture*, and we will return to it in [“Change Data Capture”](/en/ch12#sec_stream_cdc).


## Problems with Replication Lag {#sec_replication_lag}

Being able to tolerate node failures is just one reason for wanting replication. As mentioned
in [“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed), other reasons are scalability (processing more
requests than a single machine can handle) and latency (placing replicas geographically closer to users).

Leader-based replication requires all writes to go through a single node, but read-only queries can
go to any replica. For workloads that consist of mostly reads and only a small percentage of writes
(which is often the case with online services), there is an attractive option: create many followers, and distribute
the read requests across those followers. This removes load from the leader and allows read requests to be
served by nearby replicas.

In this *read-scaling* architecture, you can increase the capacity for serving read-only requests
simply by adding more followers. However, this approach only realistically works with asynchronous
replication—if you tried to synchronously replicate to all followers, a single node failure or
network outage would make the entire system unavailable for writing. And the more nodes you have,
the likelier it is that one will be down, so a fully synchronous configuration would be very unreliable.

Unfortunately, if an application reads from an *asynchronous* follower, it may see outdated
information if the follower has fallen behind. This leads to apparent inconsistencies in the
database: if you run the same query on the leader and a follower at the same time, you may get
different results, because not all writes have been reflected in the follower. This inconsistency is
just a temporary state—if you stop writing to the database and wait a while, the followers will
eventually catch up and become consistent with the leader. For that reason, this effect is known
as *eventual consistency* [^22].

--------

> [!NOTE]
> The term *eventual consistency* was coined by Douglas Terry et al. [^23], popularized by Werner Vogels [^24],
> and became the battle cry of many NoSQL projects. However, not only NoSQL databases are eventually
> consistent: followers in an asynchronously replicated relational database have the same characteristics.

--------

The term “eventually” is deliberately vague: in general, there is no limit to how far a replica can
fall behind. In normal operation, the delay between a write happening on the leader and being
reflected on a follower—the *replication lag*—may be only a fraction of a second, and not
noticeable in practice. However, if the system is operating near capacity or if there is a problem
in the network, the lag can easily increase to several seconds or even minutes.

When the lag is so large, the inconsistencies it introduces are not just a theoretical issue but a
real problem for applications. In this section we will highlight three examples of problems that are
likely to occur when there is replication lag. We’ll also outline some approaches to solving them.

### Reading Your Own Writes {#sec_replication_ryw}

Many applications let the user submit some data and then view what they have submitted. This might
be a record in a customer database, or a comment on a discussion thread, or something else of that sort.
When new data is submitted, it must be sent to the leader, but when the user views the data, it can
be read from a follower. This is especially appropriate if data is frequently viewed but only
occasionally written.

With asynchronous replication, there is a problem, illustrated in
[Figure 6-3](/en/ch6#fig_replication_read_your_writes): if the user views the data shortly after making a write, the
new data may not yet have reached the replica. To the user, it looks as though the data they
submitted was lost, so they will be understandably unhappy.

{{< figure src="/fig/ddia_0603.png" id="fig_replication_read_your_writes" caption="Figure 6-3. A user makes a write, followed by a read from a stale replica. To prevent this anomaly, we need read-after-write consistency." class="w-full my-4" >}}

In this situation, we need *read-after-write consistency*, also known as *read-your-writes consistency* [^23].
This is a guarantee that if the user reloads the page, they will always see any updates they
submitted themselves. It makes no promises about other users: other users’ updates may not be
visible until some later time. However, it reassures the user that their own input has been saved
correctly.

How can we implement read-after-write consistency in a system with leader-based replication? There
are various possible techniques. To mention a few:

* When reading something that the user may have modified, read it from the leader or a synchronously
  updated follower; otherwise, read it from an asynchronously updated follower.
  This requires that you have some way of knowing whether something might have been
  modified, without actually querying it. For example, user profile information on a social network
  is normally only editable by the owner of the profile, not by anybody else. Thus, a simple
  rule is: always read the user’s own profile from the leader, and any other users’ profiles from a
  follower.
* If most things in the application are potentially editable by the user, that approach won’t be
  effective, as most things would have to be read from the leader (negating the benefit of read
  scaling). In that case, other criteria may be used to decide whether to read from the leader. For
  example, you could track the time of the last update and, for one minute after the last update, make all
  reads from the leader [^25].
  You could also monitor the replication lag on followers and prevent queries on any follower that
  is more than one minute behind the leader.
* The client can remember the timestamp of its most recent write—then the system can ensure that the
  replica serving any reads for that user reflects updates at least until that timestamp. If a
  replica is not sufficiently up to date, either the read can be handled by another replica or the
  query can wait until the replica has caught up [^26].
  The timestamp could be a *logical timestamp* (something that indicates ordering of writes, such as
  the log sequence number) or the actual system clock (in which case clock synchronization becomes
  critical; see [“Unreliable Clocks”](/en/ch9#sec_distributed_clocks)).
* If your replicas are distributed across regions (for geographical proximity to users or for
  availability), there is additional complexity. Any request that needs to be served by the leader
  must be routed to the region that contains the leader.

Another complication arises when the same user is accessing your service from multiple devices, for
example a desktop web browser and a mobile app. In this case you may want to provide *cross-device*
read-after-write consistency: if the user enters some information on one device and then views it
on another device, they should see the information they just entered.

In this case, there are some additional issues to consider:

* Approaches that require remembering the timestamp of the user’s last update become more difficult,
  because the code running on one device doesn’t know what updates have happened on the other
  device. This metadata will need to be centralized.
* If your replicas are distributed across different regions, there is no guarantee that connections
  from different devices will be routed to the same region. (For example, if the user’s desktop
  computer uses the home broadband connection and their mobile device uses the cellular data network,
  the devices’ network routes may be completely different.) If your approach requires reading from the
  leader, you may first need to route requests from all of a user’s devices to the same region.

--------

> [!TIP] Regions and Availability Zones
> We use the term *region* to refer to one or more datacenters in a single geographic location. Cloud
> providers locate multiple datacenters in the same geographic region. Each datacenter is referred to
> as an *availability zone* or simply *zone*. Thus, a single cloud region is made up of multiple
> zones. Each zone is a separate datacenter located in a separate physical facility with its own
> power, cooling, and so on.
>
> Zones in the same region are connected by very high speed network connections. Latency is low enough
> that most distributed systems can run with nodes spread across multiple zones in the same region as
> though they were in a single zone. Multi-zone configurations allow distributed systems to survive
> zonal outages where one zone goes offline, but they do not protect against regional outages where
> all zones in a region are unavailable. To survive a regional outage, a distributed system must be
> deployed across multiple regions, which can result in higher latencies, lower throughput, and
> increased cloud networking bills. We will discuss these tradeoffs more in
> [“Multi-leader replication topologies”](/en/ch6#sec_replication_topologies). For now, just know that when we say region, we mean a collection of
> zones/datacenters in a single geographic location.

--------

### Monotonic Reads {#sec_replication_monotonic_reads}

Our second example of an anomaly that can occur when reading from asynchronous followers is that it’s
possible for a user to see things *moving backward in time*.

This can happen if a user makes several reads from different replicas. For example,
[Figure 6-4](/en/ch6#fig_replication_monotonic_reads) shows user 2345 making the same query twice, first to a follower
with little lag, then to a follower with greater lag. (This scenario is quite likely if the user
refreshes a web page, and each request is routed to a random server.) The first query returns a
comment that was recently added by user 1234, but the second query doesn’t return anything because
the lagging follower has not yet picked up that write. In effect, the second query observes the
system state at an earlier point in time than the first query. This wouldn’t be so bad if the first query
hadn’t returned anything, because user 2345 probably wouldn’t know that user 1234 had recently added
a comment. However, it’s very confusing for user 2345 if they first see user 1234’s comment appear,
and then see it disappear again.

{{< figure src="/fig/ddia_0604.png" id="fig_replication_monotonic_reads" caption="Figure 6-4. A user first reads from a fresh replica, then from a stale replica. Time appears to go backward. To prevent this anomaly, we need monotonic reads." class="w-full my-4" >}}

*Monotonic reads* [^22] is a guarantee that this
kind of anomaly does not happen. It’s a lesser guarantee than strong consistency, but a stronger
guarantee than eventual consistency. When you read data, you may see an old value; monotonic reads
only means that if one user makes several reads in sequence, they will not see time go
backward—i.e., they will not read older data after having previously read newer data.

One way of achieving monotonic reads is to make sure that each user always makes their reads from
the same replica (different users can read from different replicas). For example, the replica can be
chosen based on a hash of the user ID, rather than randomly. However, if that replica fails, the
user’s queries will need to be rerouted to another replica.

### Consistent Prefix Reads {#sec_replication_consistent_prefix}

Our third example of replication lag anomalies concerns violation of causality. Imagine the
following short dialog between Mr. Poons and Mrs. Cake:

Mr. Poons
:   How far into the future can you see, Mrs. Cake?

Mrs. Cake
:   About ten seconds usually, Mr. Poons.

There is a causal dependency between those two sentences: Mrs. Cake heard Mr. Poons’s question and
answered it.

Now, imagine a third person is listening to this conversation through followers. The things said by
Mrs. Cake go through a follower with little lag, but the things said by Mr. Poons have a longer
replication lag (see [Figure 6-5](/en/ch6#fig_replication_consistent_prefix)). This observer would hear the following:

Mrs. Cake
:   About ten seconds usually, Mr. Poons.

Mr. Poons
:   How far into the future can you see, Mrs. Cake?

To the observer it looks as though Mrs. Cake is answering the question before Mr. Poons has even asked
it. Such psychic powers are impressive, but very confusing [^27].

{{< figure src="/fig/ddia_0605.png" id="fig_replication_consistent_prefix" caption="Figure 6-5. If some shards are replicated slower than others, an observer may see the answer before they see the question." class="w-full my-4" >}}

Preventing this kind of anomaly requires another type of guarantee: *consistent prefix reads* [^22]. 
This guarantee says that if a sequence of writes happens in a certain order, 
then anyone reading those writes will see them appear in the same order.

This is a particular problem in sharded (partitioned) databases, which we will discuss in
[Chapter 7](/en/ch7#ch_sharding). If the database always applies writes in the same order, reads always see a
consistent prefix, so this anomaly cannot happen. However, in many distributed databases, different
shards operate independently, so there is no global ordering of writes: when a user reads from the
database, they may see some parts of the database in an older state and some in a newer state.

One solution is to make sure that any writes that are causally related to each other are written to
the same shard—but in some applications that cannot be done efficiently. There are also algorithms
that explicitly keep track of causal dependencies, a topic that we will return to in
[“The “happens-before” relation and concurrency”](/en/ch6#sec_replication_happens_before).

### Solutions for Replication Lag {#id131}

When working with an eventually consistent system, it is worth thinking about how the application
behaves if the replication lag increases to several minutes or even hours. If the answer is “no
problem,” that’s great. However, if the result is a bad experience for users, it’s important to
design the system to provide a stronger guarantee, such as read-after-write. Pretending that
replication is synchronous when in fact it is asynchronous is a recipe for problems down the line.

As discussed earlier, there are ways in which an application can provide a stronger guarantee than
the underlying database—for example, by performing certain kinds of reads on the leader or a
synchronously updated follower. However, dealing with these issues in application code is complex
and easy to get wrong.

The simplest programming model for application developers is to choose a database that provides a
strong consistency guarantee for replicas such as linearizability (see [Chapter 10](/en/ch10#ch_consistency)), and ACID
transactions (see [Chapter 8](/en/ch8#ch_transactions)). This allows you to mostly ignore the challenges that arise
from replication, and treat the database as if it had just a single node. In the early 2010s the
*NoSQL* movement promoted the view that these features limited scalability, and that large-scale
systems would have to embrace eventual consistency.

However, since then, a number of databases started providing strong consistency and transactions
while also offering the fault tolerance, high availability, and scalability advantages of a
distributed database. As mentioned in [“Relational Model versus Document Model”](/en/ch3#sec_datamodels_history), this trend is known as *NewSQL* to
contrast with NoSQL (although it’s less about SQL specifically, and more about new approaches to
scalable transaction management).

Even though scalable, strongly consistent distributed databases are now available, there are still
good reasons why some applications choose to use different forms of replication that offer weaker
consistency guarantees: they can offer stronger resilience in the face of network interruptions, and
have lower overheads compared to transactional systems. We will explore such approaches in the rest
of this chapter.


## Multi-Leader Replication {#sec_replication_multi_leader}

So far in this chapter we have only considered replication architectures using a single leader.
Although that is a common approach, there are interesting alternatives.

Single-leader replication has one major downside: all writes must go through the one leader. If you
can’t connect to the leader for any reason, for example due to a network interruption between you
and the leader, you can’t write to the database.

A natural extension of the single-leader replication model is to allow more than one node to accept
writes. Replication still happens in the same way: each node that processes a write must forward
that data change to all the other nodes. We call this a *multi-leader* configuration (also known as
*active/active* or *bidirectional* replication). In this setup, each leader simultaneously acts as a
follower to the other leaders.

As with single-leader replication, there is a choice between making it synchronous or asynchronous.
Let’s say you have two leaders, *A* and *B*, and you’re trying to write to *A*. If writes are
synchronously replicated from *A* to *B*, and the network between the two nodes is interrupted, you
can’t write to *A* until the network comes back. Synchronous multi-leader replication thus gives you
a model that is very similar to single-leader replication, i.e., it behaves as if you had made *B* the
leader and *A* simply forwarded any write requests to *B* to be executed.

For that reason, we won’t go further into synchronous multi-leader replication, and simply treat it
as equivalent to single-leader replication. The rest of this section focuses on asynchronous
multi-leader replication, in which any leader can process writes even when its connection to the
other leaders is interrupted.

### Geographically Distributed Operation {#sec_replication_multi_dc}

It rarely makes sense to use a multi-leader setup within a single region, because the benefits
rarely outweigh the added complexity. However, there are some situations in which this configuration
is reasonable.

Imagine you have a database with replicas in several different regions (perhaps so that you can
tolerate the failure of an entire region, or perhaps in order to be closer to your users). This is
known as a *geographically distributed*, *geo-distributed* or *geo-replicated* setup. With
single-leader replication, the leader has to be in *one* of the regions, and all writes must go
through that region.

In a multi-leader configuration, you can have a leader in *each* region.
[Figure 6-6](/en/ch6#fig_replication_multi_dc) shows what this architecture might look like. Within each region,
regular leader–follower replication is used (with followers maybe in a different availability zone
from the leader); between regions, each region’s leader replicates its changes to the leaders in
other regions.

{{< figure src="/fig/ddia_0606.png" id="fig_replication_multi_dc" caption="Figure 6-6. Multi-leader replication across multiple regions." class="w-full my-4" >}}

Let’s compare how the single-leader and multi-leader configurations fare in a multi-region deployment:

Performance
:   In a single-leader configuration, every write must go over the internet to the region with the
    leader. This can add significant latency to
    writes and might contravene the purpose of having multiple regions in the first place. In a
    multi-leader configuration, every write can be processed in the local region and is replicated
    asynchronously to the other regions. Thus, the inter-region network delay is hidden from
    users, which means the perceived performance may be better.

Tolerance of regional outages
:   In a single-leader configuration, if the region with the leader becomes unavailable, failover can
    promote a follower in another region to be leader. In a multi-leader configuration, each region
    can continue operating independently of the others, and replication catches up when the offline
    region comes back online.

Tolerance of network problems
:   Even with dedicated connections, traffic between regions
    can be less reliable than traffic between zones in the same region or within a single zone. A
    single-leader configuration is very sensitive to problems in this inter-region link, because when
    a client in one region wants to write to a leader in another region, it has to send its request
    over that link and wait for the response before it can complete.

    A multi-leader configuration with asynchronous replication can tolerate network problems better:
    during a temporary network interruption, each region’s leader can continue independently processing writes.

Consistency
:   A single-leader system can provide strong consistency guarantees, such as serializable
    transactions, which we will discuss in [Chapter 8](/en/ch8#ch_transactions). The biggest downside of multi-leader
    systems is that the consistency they can achieve is much weaker. For example, you can’t guarantee
    that a bank account won’t go negative or that a username is unique: it’s always possible for
    different leaders to process writes that are individually fine (paying out some of the money in an
    account, registering a particular username), but which violate the constraint when taken together
    with another write on another leader.

    This is simply a fundamental limitation of distributed systems [^28].
    If you need to enforce such constraints, you’re therefore better off with a single-leader system.
    However, as we will see in [“Dealing with Conflicting Writes”](/en/ch6#sec_replication_write_conflicts), multi-leader systems can still
    achieve consistency properties that are useful in a wide range of apps that don’t need such constraints.

Multi-leader replication is less common than single-leader replication, but it is still supported by
many databases, including MySQL, Oracle, SQL Server, and YugabyteDB. In some cases it is an external
add-on feature, for example in Redis Enterprise, EDB Postgres Distributed, and pglogical [^29].

As multi-leader replication is a somewhat retrofitted feature in many databases, there are often
subtle configuration pitfalls and surprising interactions with other database features. For example,
autoincrementing keys, triggers, and integrity constraints can be problematic. For this reason,
multi-leader replication is often considered dangerous territory that should be avoided if possible [^30].

#### Multi-leader replication topologies {#sec_replication_topologies}

A *replication topology* describes the communication paths along which writes are propagated from
one node to another. If you have two leaders, like in [Figure 6-9](/en/ch6#fig_replication_write_conflict), there is
only one plausible topology: leader 1 must send all of its writes to leader 2, and vice versa. With
more than two leaders, various different topologies are possible. Some examples are illustrated in
[Figure 6-7](/en/ch6#fig_replication_topologies).

{{< figure src="/fig/ddia_0607.png" id="fig_replication_topologies" caption="Figure 6-7. Three example topologies in which multi-leader replication can be set up." class="w-full my-4" >}}

The most general topology is *all-to-all*, shown in [Figure 6-7](/en/ch6#fig_replication_topologies)(c),
in which every leader sends its writes to every other leader. However, more restricted topologies
are also used: for example a *circular topology* in which each node receives writes from one node
and forwards those writes (plus any writes of its own) to one other node. Another popular topology
has the shape of a *star*: one designated root node forwards writes to all of the other nodes. The
star topology can be generalized to a tree.


--------

> [!NOTE]
> Don’t confuse a star-shaped network topology with a *star schema* (see
> [“Stars and Snowflakes: Schemas for Analytics”](/en/ch3#sec_datamodels_analytics)), which describes the structure of a data model.

--------

In circular and star topologies, a write may need to pass through several nodes before it reaches
all replicas. Therefore, nodes need to forward data changes they receive from other nodes. To
prevent infinite replication loops, each node is given a unique identifier, and in the replication
log, each write is tagged with the identifiers of all the nodes it has passed through [^31].
When a node receives a data change that is tagged with its own identifier, that data change is
ignored, because the node knows that it has already been processed.

#### Problems with different topologies {#problems-with-different-topologies}

A problem with circular and star topologies is that if just one node fails, it can interrupt the
flow of replication messages between other nodes, leaving them unable to communicate until the
node is fixed. The topology could be reconfigured to work around the failed node, but in most
deployments such reconfiguration would have to be done manually. The fault tolerance of a more
densely connected topology (such as all-to-all) is better because it allows messages to travel
along different paths, avoiding a single point of failure.

On the other hand, all-to-all topologies can have issues too. In particular, some network links may
be faster than others (e.g., due to network congestion), with the result that some replication
messages may “overtake” others, as illustrated in [Figure 6-8](/en/ch6#fig_replication_causality).

{{< figure src="/fig/ddia_0608.png" id="fig_replication_causality" caption="Figure 6-8. With multi-leader replication, writes may arrive in the wrong order at some replicas." class="w-full my-4" >}}

In [Figure 6-8](/en/ch6#fig_replication_causality), client A inserts a row into a table on leader 1, and client B
updates that row on leader 3. However, leader 2 may receive the writes in a different order: it may
first receive the update (which, from its point of view, is an update to a row that does not exist
in the database) and only later receive the corresponding insert (which should have preceded the
update).

This is a problem of causality, similar to the one we saw in [“Consistent Prefix Reads”](/en/ch6#sec_replication_consistent_prefix):
the update depends on the prior insert, so we need to make sure that all nodes process the insert
first, and then the update. Simply attaching a timestamp to every write is not sufficient, because
clocks cannot be trusted to be sufficiently in sync to correctly order these events at leader 2 (see
[Chapter 9](/en/ch9#ch_distributed)).

To order these events correctly, a technique called *version vectors* can be used, which we will
discuss later in this chapter (see [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent)). However, many multi-leader
replication systems don’t use good techniques for ordering updates, leaving them vulnerable to
issues like the one in [Figure 6-8](/en/ch6#fig_replication_causality). If you are using multi-leader replication, it
is worth being aware of these issues, carefully reading the documentation, and thoroughly testing
your database to ensure that it really does provide the guarantees you believe it to have.

### Sync Engines and Local-First Software {#sec_replication_offline_clients}

Another situation in which multi-leader replication is appropriate is if you have an application
that needs to continue to work while it is disconnected from the internet.

For example, consider the calendar apps on your mobile phone, your laptop, and other devices. You
need to be able to see your meetings (make read requests) and enter new meetings (make write
requests) at any time, regardless of whether your device currently has an internet connection. If
you make any changes while you are offline, they need to be synced with a server and your other
devices when the device is next online.

In this case, every device has a local database replica that acts as a leader (it accepts write
requests), and there is an asynchronous multi-leader replication process (sync) between the replicas
of your calendar on all of your devices. The replication lag may be hours or even days, depending on
when you have internet access available.

From an architectural point of view, this setup is very similar to multi-leader replication between
regions, taken to the extreme: each device is a “region,” and the network connection between them is
extremely unreliable.

#### Real-time collaboration, offline-first, and local-first apps {#real-time-collaboration-offline-first-and-local-first-apps}

Moreover, many modern web apps offer *real-time collaboration* features, such as Google Docs and
Sheets for text documents and spreadsheets, Figma for graphics, and Linear for project management.
What makes these apps so responsive is that user input is immediately reflected in the user
interface, without waiting for a network round-trip to the server, and edits by one user are shown
to their collaborators with low latency [^32] [^33] [^34].

This again results in a multi-leader architecture: each web browser tab that has opened the shared
file is a replica, and any updates that you make to the file are asynchronously replicated to the
devices of the other users who have opened the same file. Even if the app does not allow you to
continue editing a file while offline, the fact that multiple users can make edits without waiting
for a response from the server already makes it multi-leader.

Both offline editing and real-time collaboration require a similar replication infrastructure: the
application needs to capture any changes that the user makes to a file, and either send them to
collaborators immediately (if online), or store them locally for sending later (if offline).
Additionally, the application needs to receive changes from collaborators, merge them into the
user’s local copy of the file, and update the user interface to reflect the latest version. If
multiple users have changed the file concurrently, conflict resolution logic may be needed to merge
those changes.

A software library that supports this process is called a *sync engine*. Although the idea has
existed for a long time, the term has recently gained attention [^35] [^36] [^37].
An application that allows a user to continue editing a file while offline (which may be implemented
using a sync engine) is called *offline-first* [^38].
The term *local-first software* refers to collaborative apps that are not only offline-first, but
are also designed to continue working even if the developer who made the software shuts down all of
their online services [^39].
This can be achieved by using a sync engine with an open standard sync protocol for which multiple
service providers are available [^40].
For example, Git is a local-first collaboration system (albeit one that doesn’t support real-time
collaboration) since you can sync via GitHub, GitLab, or any other repository hosting service.

#### Pros and cons of sync engines {#pros-and-cons-of-sync-engines}

The dominant way of building web apps today is to keep very little persistent state on the client,
and to rely on making requests to a server whenever a new piece of data needs to be displayed or
some data needs to be updated. In contrast, when using a sync engine, you have persistent state on
the client, and communication with the server is moved into a background process. The sync engine
approach has a number of advantages:

* Having the data locally means the user interface can be much faster to respond than if it had to
  wait for a service call to fetch some data. Some apps aim to respond to user input in the *next
  frame* of the graphics system, which means rendering within 16 ms on a display with a
  60 Hz refresh rate.
* Allowing users to continue working while offline is valuable, especially on mobile devices with
  intermittent connectivity. With a sync engine, an app doesn’t need a separate offline mode: being
  offline is the same as having very large network delay.
* A sync engine simplifies the programming model for frontend apps, compared to performing explicit
  service calls in application code. Every service call requires error handling, as discussed in
  [“The problems with remote procedure calls (RPCs)”](/en/ch5#sec_problems_with_rpc): for example, if a request to update data on a server fails, the user
  interface needs to somehow reflect that error. A sync engine allows the app to perform reads and
  writes on local data, which almost never fails, leading to a more declarative programming style [^41].
* In order to display edits from other users in real-time, you need to receive notifications of
  those edits and efficiently update the user interface accordingly. A sync engine combined with a
  *reactive programming* model is a good way of implementing this [^42].

Sync engines work best when all the data that the user may need is downloaded in advance and stored
persistently on the client. This means that the data is available for offline access when needed,
but it also means that sync engines are not suitable if the user has access to a very large amount
of data. For example, downloading all the files that the user themselves created is probably fine
(one user generally doesn’t generate that much data), but downloading the entire catalog of an
e-commerce website probably doesn’t make sense.

The sync engine was pioneered by Lotus Notes in the 1980s [^43]
(without using that term), and sync for specific apps such as calendars has also existed for a long
time. Today there are a number of general-purpose sync engines, some of which use a proprietary
backend service (e.g., Google Firestore, Realm, or Ditto), and some have an open source backend,
making them suitable for creating local-first software (e.g., PouchDB/CouchDB, Automerge, or Yjs).

Multiplayer video games have a similar need to respond immediately to the user’s local actions, and
reconcile them with other players’ actions received asynchronously over the network. In game
development jargon the equivalent of a sync engine is called *netcode*. The techniques used in
netcode are quite specific to the requirements of games [^44], and don’t directly
carry over to other types of software, so we won’t consider them further in this book.


### Dealing with Conflicting Writes {#sec_replication_write_conflicts}

The biggest problem with multi-leader replication—both in a geo-distributed server-side database and
a local-first sync engine on end user devices—is that concurrent writes on different leaders can
lead to conflicts that need to be resolved.

For example, consider a wiki page that is simultaneously being edited by two users, as shown in
[Figure 6-9](/en/ch6#fig_replication_write_conflict). User 1 changes the title of the page from A to B, and user 2
independently changes the title from A to C. Each user’s change is successfully applied to their
local leader. However, when the changes are asynchronously replicated, a conflict is detected.
This problem does not occur in a single-leader database.

{{< figure src="/fig/ddia_0609.png" id="fig_replication_write_conflict" caption="Figure 6-9. A write conflict caused by two leaders concurrently updating the same record." class="w-full my-4" >}}

> [!NOTE]
> We say that the two writes in [Figure 6-9](/en/ch6#fig_replication_write_conflict) are *concurrent* because neither
> was “aware” of the other at the time the write was originally made. It doesn’t matter whether the
> writes literally happened at the same time; indeed, if the writes were made while offline, they
> might have actually happened some time apart. What matters is whether one write occurred in a state
> where the other write has already taken effect.

In [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent) we will tackle the question of how a database can determine
whether two writes are concurrent. For now we will assume that we can detect conflicts, and we want
to figure out the best way of resolving them.

#### Conflict avoidance {#conflict-avoidance}

One strategy for conflicts is to avoid them occurring in the first place. For example, if the
application can ensure that all writes for a particular record go through the same leader, then
conflicts cannot occur, even if the database as a whole is multi-leader. This approach is not
possible in the case of a sync engine client being updated offline, but it is sometimes possible in
geo-replicated server systems [^30].

For example, in an application where a user can only edit their own data, you can ensure that
requests from a particular user are always routed to the same region and use the leader in that
region for reading and writing. Different users may have different “home” regions (perhaps picked
based on geographic proximity to the user), but from any one user’s point of view the configuration
is essentially single-leader.

However, sometimes you might want to change the designated leader for a record—perhaps because
one region is unavailable and you need to reroute traffic to another region, or perhaps because
a user has moved to a different location and is now closer to a different region. There is now a
risk that the user performs a write while the change of designated leader is in progress, leading to
a conflict that would have to be resolved using one of the methods below. Thus, conflict avoidance
breaks down if you allow the leader to be changed.

Another example of conflict avoidance: imagine you want to insert new records and generate unique
IDs for them based on an auto-incrementing counter. If you have two leaders, you could set them up
so that one leader only generates odd numbers and the other only generates even numbers. That way
you can be sure that the two leaders won’t concurrently assign the same ID to different records.
We will discuss other ID assignment schemes in [“ID Generators and Logical Clocks”](/en/ch10#sec_consistency_logical).


#### Last write wins (discarding concurrent writes) {#sec_replication_lww}

If conflicts can’t be avoided, the simplest way of resolving them is to attach a timestamp to each
write, and to always use the value with the greatest timestamp. For example, in
[Figure 6-9](/en/ch6#fig_replication_write_conflict), let’s say that the timestamp of user 1’s write is greater than
the timestamp of user 2’s write. In that case, both leaders will determine that the new title of the
page should be B, and they discard the write that sets it to C. If the writes coincidentally have
the same timestamp, the winner can be chosen by comparing the values (e.g., in the case of strings,
taking the one that’s earlier in the alphabet).

This approach is called *last write wins* (LWW) because the write with the greatest timestamp can be
considered the “last” one. The term is misleading though, because when two writes are concurrent
like in [Figure 6-9](/en/ch6#fig_replication_write_conflict), which one is earlier and which is later is undefined, and
so the timestamp order of concurrent writes is essentially random.

Therefore the real meaning of LWW is: when the same record is concurrently written on different
leaders, one of those writes is randomly chosen to be the winner, and the other writes are silently
discarded, even though they were successfully processed at their respective leaders. This achieves
the goal that eventually all replicas end up in a consistent state, but at the cost of data loss.

If you can avoid conflicts—for example, by only inserting records with a unique key such as a UUID,
and never updating them—then LWW is no problem. But if you update existing
records, or if different leaders may insert records with the same key, then you have to decide
whether lost updates are a problem for your application. If lost updates are not acceptable, you
need to use one of the conflict resolution approaches described below.

Another problem with LWW is that if a real-time clock (e.g., a Unix timestamp) is used as the timestamp
for the writes, the system becomes very sensitive to clock synchronization. If one node has a clock
that is ahead of the others, and you try to overwrite a value written by that node, your write may
be ignored as it may have a lower timestamp, even though it clearly occurred later. This problem can
be solved by using a *logical clock*, which we will discuss in [“ID Generators and Logical Clocks”](/en/ch10#sec_consistency_logical).

#### Manual conflict resolution {#manual-conflict-resolution}

If randomly discarding some of your writes is not desirable, the next option is to resolve the
conflict manually. You may be familiar with manual conflict resolution from Git and other version
control systems: if commits on two different branches edit the same lines of the same file, and you
try to merge those branches, you will get a merge conflict that needs to be resolved before the
merge is complete.

In a database, it would be impractical for a conflict to stop the entire replication process until a
human has resolved it. Instead, databases typically store all the concurrently written values for a
given record—for example, both B and C in [Figure 6-9](/en/ch6#fig_replication_write_conflict). These values are
sometimes called *siblings*. The next time you query that record, the database returns *all* those
values, rather than just the latest one. You can then resolve those values in whatever way you want,
either automatically in application code (for example, you could concatenate B and C into “B/C”), or
by asking the user. You then write back a new value to the database to resolve the conflict.

This approach to conflict resolution is used in some systems, such as CouchDB. However, it also
suffers from a number of problems:

* The API of the database changes: for example, where previously the title of the wiki page was just
  a string, it now becomes a set of strings that usually contains one element, but may sometimes
  contain multiple elements if there is a conflict. This can make the data awkward to work with in
  application code.
* Asking the user to manually merge the siblings is a lot of work, both for the app developer (who
  needs to build the user interface for conflict resolution) and for the user (who may be confused
  about what they are being asked to do, and why). In many cases, it’s better to merge automatically
  than to bother the user.
* Merging siblings automatically can lead to surprising behavior if it is not done carefully. For
  example, the shopping cart on Amazon used to allow concurrent updates, which were then merged by
  keeping all the shopping cart items that appeared in any of the siblings (i.e., taking the set
  union of the carts). This meant that if the customer had removed an item from their cart in one
  sibling, but another sibling still contained that old item, the removed item would unexpectedly
  reappear in the customer’s cart [^45]. [Figure 6-10](/en/ch6#fig_replication_amazon_anomaly) shows an example where Device 1 removes Book from the shopping
  cart and concurrently Device 2 removes DVD, but after merging the conflict both items reappear.
* If multiple nodes observe the conflict and concurrently resolve it, the conflict resolution
  process can itself introduce a new conflict. Those resolutions could even be inconsistent: for
  example, one node may merge B and C into “B/C” and another may merge them into “C/B” if you are
  not careful to order them consistently. When the conflict between “B/C” and “C/B” is merged, it
  may result in “B/C/C/B” or something similarly surprising.

{{< figure src="/fig/ddia_0610.png" id="fig_replication_amazon_anomaly" caption="Figure 6-10. Example of Amazon's shopping cart anomaly: if conflicts on a shopping cart are merged by taking the union, deleted items may reappear." class="w-full my-4" >}}


<a id="sec_replication_automatic_resolution"></a>

#### Automatic conflict resolution {#automatic-conflict-resolution}

For many applications, the best way of handling conflicts is to use an algorithm that automatically
merges concurrent writes into a consistent state. Automatic conflict resolution ensures that all
replicas *converge* to the same state—i.e., all replicas that have processed the same set of writes
have the same state, regardless of the order in which the writes arrived.

LWW is a simple example of a conflict resolution algorithm. More sophisticated merge algorithms have
been developed for different types of data, with the goal of preserving the intended effect of all
updates as much as possible, and hence avoiding data loss:

* If the data is text (e.g., the title or body of a wiki page), we can detect which characters have
  been inserted or deleted from one version to the next. The merged result then preserves all the
  insertions and deletions made in any of the siblings. If users concurrently insert text at the
  same position, it can be ordered deterministically so that all nodes get the same merged outcome.
* If the data is a collection of items (ordered like a to-do list, or unordered like a shopping
  cart), we can merge it similarly to text by tracking insertions and deletions. To avoid the
  shopping cart issue in [Figure 6-10](/en/ch6#fig_replication_amazon_anomaly), the algorithms track the fact that Book
  and DVD were deleted, so the merged result is Cart = {Soap}.
* If the data is an integer representing a counter that can be incremented or decremented (e.g., the
  number of likes on a social media post), the merge algorithm can tell how many increments and
  decrements happened on each sibling, and add them together correctly so that the result does not
  double-count and does not drop updates.
* If the data is a key-value mapping, we can merge updates to the same key by applying one of the
  other conflict resolution algorithms to the values under that key. Updates to different keys can
  be handled independently from each other.

There are limits to what is possible with conflict resolution. For example, if you want to enforce
that a list contains no more than five items, and multiple users concurrently add items to the list
so that there are more than five in total, your only option is to drop some of the items.
Nevertheless, automatic conflict resolution is sufficient to build many useful apps. And if you
start from the requirement of wanting to build a collaborative offline-first or local-first app,
then conflict resolution is inevitable, and automating it is often the best approach.

### CRDTs and Operational Transformation {#sec_replication_crdts}

Two families of algorithms are commonly used to implement automatic conflict resolution:
*Conflict-free replicated datatypes* (CRDTs) [^46] and *Operational Transformation* (OT) [^47].
They have different design philosophies and performance characteristics, but both are able to
perform automatic merges for all the aforementioned types of data.

[Figure 6-11](/en/ch6#fig_replication_ot_crdt) shows an example of how OT and a CRDT merge concurrent updates to a
text. Assume you have two replicas that both start off with the text “ice”. One replica prepends the
letter “n” to make “nice”, while concurrently the other replica appends an exclamation mark to make “ice!”.

{{< figure src="/fig/ddia_0611.png" id="fig_replication_ot_crdt" caption="Figure 6-11. How two concurrent insertions into a string are merged by OT and a CRDT respectively." class="w-full my-4" >}}

The merged result “nice!” is achieved differently by the two types of algorithms:

OT
:   We record the index at which characters are inserted or deleted: “n” is inserted at index 0, and
    “!” at index 3. Next, the replicas exchange their operations. The insertion of “n” at 0 can be
    applied as-is, but if the insertion of “!” at 3 were applied to the state “nice” we would get
    “nic!e”, which is incorrect. We therefore need to transform the index of each operation to account
    for concurrent operations that have already been applied; in this case, the insertion of “!” is
    transformed to index 4 to account for the insertion of “n” at an earlier index.

CRDT
:   Most CRDTs give each character a unique, immutable ID and use those to determine the positions of
    insertions/deletions, instead of indexes. For example, in [Figure 6-11](/en/ch6#fig_replication_ot_crdt) we assign
    the ID 1A to “i”, the ID 2A to “c”, etc. When inserting the exclamation mark, we generate an
    operation containing the ID of the new character (4B) and the ID of the existing character after
    which we want to insert (3A). To insert at the beginning of the string we give “nil” as the
    preceding character ID. Concurrent insertions at the same position are ordered by the IDs of the
    characters. This ensures that replicas converge without performing any transformation.

There are many algorithms based on variations of these ideas. Lists/arrays can be supported
similarly, using list elements instead of characters, and other datatypes such as key-value maps can
be added quite easily. There are some performance and functionality trade-offs between OT and CRDTs,
but it’s possible to combine the advantages of CRDTs and OT in one algorithm [^48].

OT is most often used for real-time collaborative editing of text, e.g. in Google Docs [^32], whereas CRDTs can be found in
distributed databases such as Redis Enterprise, Riak, and Azure Cosmos DB [^49].
Sync engines for JSON data can be implemented both with CRDTs (e.g., Automerge or Yjs) and with OT (e.g., ShareDB).

#### What is a conflict? {#what-is-a-conflict}

Some kinds of conflict are obvious. In the example in [Figure 6-9](/en/ch6#fig_replication_write_conflict), two writes
concurrently modified the same field in the same record, setting it to two different values. There
is little doubt that this is a conflict.

Other kinds of conflict can be more subtle to detect. For example, consider a meeting room booking
system: it tracks which room is booked by which group of people at which time. This application
needs to ensure that each room is only booked by one group of people at any one time (i.e., there
must not be any overlapping bookings for the same room). In this case, a conflict may arise if two
different bookings are created for the same room at the same time. Even if the application checks
availability before allowing a user to make a booking, there can be a conflict if the two bookings
are made on two different leaders.

There isn’t a quick ready-made answer for handling such conflicts, but in the following chapters we
will trace a path toward a good understanding of this problem. We will see some more examples of conflicts in
[Chapter 8](/en/ch8#ch_transactions), and in [“Ordering events to capture causality”](/en/ch13#sec_future_capture_causality) we will discuss scalable approaches for detecting and
resolving conflicts in a replicated system.


## Leaderless Replication {#sec_replication_leaderless}

The replication approaches we have discussed so far in this chapter—single-leader and
multi-leader replication—are based on the idea that a client sends a write request to one node
(the leader), and the database system takes care of copying that write to the other replicas. A
leader determines the order in which writes should be processed, and followers apply the leader’s
writes in the same order.

Some data storage systems take a different approach, abandoning the concept of a leader and
allowing any replica to directly accept writes from clients. Some of the earliest replicated data
systems were leaderless [^1] [^50], but the idea was mostly forgotten during the era of dominance of relational databases. It once again became
a fashionable architecture for databases after Amazon used it for its in-house *Dynamo* system in
2007 [^45]. Riak, Cassandra, and ScyllaDB are open source datastores with leaderless replication models inspired
by Dynamo, so this kind of database is also known as *Dynamo-style*.

--------

> [!NOTE]
> The original *Dynamo* system was only described in a paper [^45], but never released outside of Amazon. 
> The similarly-named *DynamoDB* is a more recent cloud database from AWS, but it has a completely different architecture: 
> it uses single-leader replication based on the Multi-Paxos consensus algorithm [^5].

--------

In some leaderless implementations, the client directly sends its writes to several replicas, while
in others, a coordinator node does this on behalf of the client. However, unlike a leader database,
that coordinator does not enforce a particular ordering of writes. As we shall see, this difference in design has
profound consequences for the way the database is used.

### Writing to the Database When a Node Is Down {#id287}

Imagine you have a database with three replicas, and one of the replicas is currently
unavailable—perhaps it is being rebooted to install a system update. In a single-leader
configuration, if you want to continue processing writes, you may need to perform a failover (see
[“Handling Node Outages”](/en/ch6#sec_replication_failover)).

On the other hand, in a leaderless configuration, failover does not exist.
[Figure 6-12](/en/ch6#fig_replication_quorum_node_outage) shows what happens: the client (user 1234) sends the write to
all three replicas in parallel, and the two available replicas accept the write but the unavailable
replica misses it. Let’s say that it’s sufficient for two out of three replicas to
acknowledge the write: after user 1234 has received two *ok* responses, we consider the write to be
successful. The client simply ignores the fact that one of the replicas missed the write.

{{< figure src="/fig/ddia_0612.png" id="fig_replication_quorum_node_outage" caption="Figure 6-12. A quorum write, quorum read, and read repair after a node outage." class="w-full my-4" >}}


Now imagine that the unavailable node comes back online, and clients start reading from it. Any
writes that happened while the node was down are missing from that node. Thus, if you read from that
node, you may get *stale* (outdated) values as responses.

To solve that problem, when a client reads from the database, it doesn’t just send its request to
one replica: *read requests are also sent to several nodes in parallel*. The client may get
different responses from different nodes; for example, the up-to-date value from one node and a
stale value from another.

In order to tell which responses are up-to-date and which are outdated, every value that is written
needs to be tagged with a version number or timestamp, similarly to what we saw in
[“Last write wins (discarding concurrent writes)”](/en/ch6#sec_replication_lww). When a client receives multiple values in response to a read, it uses the
one with the greatest timestamp (even if that value was only returned by one replica, and several
other replicas returned older values). See [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent) for more details.

#### Catching up on missed writes {#sec_replication_read_repair}

The replication system should ensure that eventually all the data is copied to every replica. After
an unavailable node comes back online, how does it catch up on the writes that it missed? Several
mechanisms are used in Dynamo-style datastores:

Read repair
:   When a client makes a read from several nodes in parallel, it can detect any stale responses.
    For example, in [Figure 6-12](/en/ch6#fig_replication_quorum_node_outage), user 2345 gets a version 6 value from
    replica 3 and a version 7 value from replicas 1 and 2. The client sees that replica 3 has a stale
    value and writes the newer value back to that replica. This approach works well for values that are
    frequently read.

Hinted handoff
:   If one replica is unavailable, another replica may store writes on its behalf in the form of
    *hints*. When the replica that was supposed to receive those writes comes back, the replica
    storing the hints sends them to the recovered replica, and then deletes the hints. This *handoff*
    process helps bring replicas up-to-date even for values that are never read, and therefore not
    handled by read repair.

Anti-entropy
:   In addition, there is a background process that periodically looks for differences in
    the data between replicas and copies any missing data from one replica to another. Unlike the
    replication log in leader-based replication, this *anti-entropy process* does not copy writes in
    any particular order, and there may be a significant delay before data is copied.

#### Quorums for reading and writing {#sec_replication_quorum_condition}

In the example of [Figure 6-12](/en/ch6#fig_replication_quorum_node_outage), we considered the write to be successful
even though it was only processed on two out of three replicas. What if only one out of three
replicas accepted the write? How far can we push this?

If we know that every successful write is guaranteed to be present on at least two out of three
replicas, that means at most one replica can be stale. Thus, if we read from at least two replicas,
we can be sure that at least one of the two is up to date. If the third replica is down or slow to
respond, reads can nevertheless continue returning an up-to-date value.

More generally, if there are *n* replicas, every write must be confirmed by *w* nodes to be
considered successful, and we must query at least *r* nodes for each read. (In our example,
*n* = 3, *w* = 2, *r* = 2.) As long as *w* + *r* > *n*, 
we expect to get an up-to-date value when reading, because at least one of the *r* nodes we’re
reading from must be up to date. Reads and writes that obey these *r* and *w* values are called *quorum* reads and writes [^50].
You can think of *r* and *w* as the minimum number of votes required for the read or write to be valid.

In Dynamo-style databases, the parameters *n*, *w*, and *r* are typically configurable. A common
choice is to make *n* an odd number (typically 3 or 5) and to set *w* = *r* =
(*n* + 1) / 2 (rounded up). However, you can vary the numbers as you see fit.
For example, a workload with few writes and many reads may benefit from setting *w* = *n* and
*r* = 1. This makes reads faster, but has the disadvantage that just one failed node causes all
database writes to fail.

--------

> [!NOTE]
> There may be more than *n* nodes in the cluster, but any given value is stored only on *n*
> nodes. This allows the dataset to be sharded, supporting datasets that are larger than you can fit
> on one node. We will return to sharding in [Chapter 7](/en/ch7#ch_sharding).

--------

The quorum condition, *w* + *r* > *n*, allows the system to tolerate unavailable nodes
as follows:

* If *w* < *n*, we can still process writes if a node is unavailable.
* If *r* < *n*, we can still process reads if a node is unavailable.
* With *n* = 3, *w* = 2, *r* = 2 we can tolerate one unavailable
  node, like in [Figure 6-12](/en/ch6#fig_replication_quorum_node_outage).
* With *n* = 5, *w* = 3, *r* = 3 we can tolerate two unavailable nodes.
  This case is illustrated in [Figure 6-13](/en/ch6#fig_replication_quorum_overlap).

Normally, reads and writes are always sent to all *n* replicas in parallel. The parameters *w* and *r* 
determine how many nodes we wait for—i.e., how many of the *n* nodes need to report success
before we consider the read or write to be successful.

{{< figure src="/fig/ddia_0613.png" id="fig_replication_quorum_overlap" caption="Figure 6-13. If *w* + *r* > *n*, at least one of the *r* replicas you read from must have seen the most recent successful write." class="w-full my-4" >}}


If fewer than the required *w* or *r* nodes are available, writes or reads return an error. A node
could be unavailable for many reasons: because the node is down (crashed, powered down), due to an
error executing the operation (can’t write because the disk is full), due to a network interruption
between the client and the node, or for any number of other reasons. We only care whether the node
returned a successful response and don’t need to distinguish between different kinds of fault.

### Limitations of Quorum Consistency {#sec_replication_quorum_limitations}

If you have *n* replicas, and you choose *w* and *r* such that *w* + *r* > *n*, you can
generally expect every read to return the most recent value written for a key. This is the case because the
set of nodes to which you’ve written and the set of nodes from which you’ve read must overlap. That
is, among the nodes you read there must be at least one node with the latest value (illustrated in
[Figure 6-13](/en/ch6#fig_replication_quorum_overlap)).

Often, *r* and *w* are chosen to be a majority (more than *n*/2) of nodes, because that ensures
*w* + *r* > *n* while still tolerating up to *n*/2 (rounded down) node failures. But quorums are
not necessarily majorities—it only matters that the sets of nodes used by the read and write
operations overlap in at least one node. Other quorum assignments are possible, which allows some
flexibility in the design of distributed algorithms [^51].

You may also set *w* and *r* to smaller numbers, so that *w* + *r* ≤ *n* (i.e.,
the quorum condition is not satisfied). In this case, reads and writes will still be sent to *n*
nodes, but a smaller number of successful responses is required for the operation to succeed.

With a smaller *w* and *r* you are more likely to read stale values, because it’s more likely that
your read didn’t include the node with the latest value. On the upside, this configuration allows
lower latency and higher availability: if there is a network interruption and many replicas become
unreachable, there’s a higher chance that you can continue processing reads and writes. Only after
the number of reachable replicas falls below *w* or *r* does the database become unavailable for
writing or reading, respectively.

However, even with *w* + *r* > *n*, there are edge cases in which the consistency
properties can be confusing. Some scenarios include:

* If a node carrying a new value fails, and its data is restored from a replica carrying an old
  value, the number of replicas storing the new value may fall below *w*, breaking the quorum
  condition.
* While a rebalancing is in progress, where some data is moved from one node to another (see
  [Chapter 7](/en/ch7#ch_sharding)), nodes may have inconsistent views of which nodes should be holding the *n*
  replicas for a particular value. This can result in the read and write quorums no longer
  overlapping.
* If a read is concurrent with a write operation, the read may or may not see the concurrently
  written value. In particular, it’s possible for one read to see the new value, and a subsequent
  read to see the old value, as we shall see in [“Linearizability and quorums”](/en/ch10#sec_consistency_quorum_linearizable).
* If a write succeeded on some replicas but failed on others (for example because the disks on some
  nodes are full), and overall succeeded on fewer than *w* replicas, it is not rolled back on the
  replicas where it succeeded. This means that if a write was reported as failed, subsequent reads
  may or may not return the value from that write [^52].
* If the database uses timestamps from a real-time clock to determine which write is newer (as
  Cassandra and ScyllaDB do, for example), writes might be silently dropped if another node with a
  faster clock has written to the same key—an issue we previously saw in [“Last write wins (discarding concurrent writes)”](/en/ch6#sec_replication_lww).
  We will discuss this in more detail in [“Relying on Synchronized Clocks”](/en/ch9#sec_distributed_clocks_relying).
* If two writes occur concurrently, one of them might be processed first on one replica, and the
  other might be processed first on another replica. This leads to a conflict, similarly to what we
  saw for multi-leader replication (see [“Dealing with Conflicting Writes”](/en/ch6#sec_replication_write_conflicts)). We will return to this
  topic in [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent).

Thus, although quorums appear to guarantee that a read returns the latest written value, in practice
it is not so simple. Dynamo-style databases are generally optimized for use cases that can tolerate
eventual consistency. The parameters *w* and *r* allow you to adjust the probability of stale values
being read [^53], but it’s wise to not take them as absolute guarantees.

#### Monitoring staleness {#monitoring-staleness}

From an operational perspective, it’s important to monitor whether your databases are
returning up-to-date results. Even if your application can tolerate stale reads, you need to be
aware of the health of your replication. If it falls behind significantly, it should alert you so
that you can investigate the cause (for example, a problem in the network or an overloaded node).

For leader-based replication, the database typically exposes metrics for the replication lag, which
you can feed into a monitoring system. This is possible because writes are applied to the leader and
to followers in the same order, and each node has a position in the replication log (the number of
writes it has applied locally). By subtracting a follower’s current position from the leader’s
current position, you can measure the amount of replication lag.

However, in systems with leaderless replication, there is no fixed order in which writes are
applied, which makes monitoring more difficult. The number of hints that a replica stores for
handoff can be one measure of system health, but it’s difficult to interpret usefully [^54].
Eventual consistency is a deliberately vague guarantee, but for operability it’s important to be
able to quantify “eventual.”


### Single-Leader vs. Leaderless Replication Performance {#sec_replication_leaderless_perf}

A replication system based on a single leader can provide strong consistency guarantees that are
difficult or impossible to achieve in a leaderless system. However, as we have seen in
[“Problems with Replication Lag”](/en/ch6#sec_replication_lag), reads in a leader-based replicated system can also return stale values if
you make them on an asynchronously updated follower.

Reading from the leader ensures up-to-date responses, but it suffers from performance problems:

* Read throughput is limited by the leader’s capacity to handle requests (in contrast with read
  scaling, which distributes reads across asynchronously updated replicas that may return stale
  values).
* If the leader fails, you have to wait for the fault to be detected, and for the failover to
  complete before you can continue handling requests. Even if the failover process is very quick,
  users will notice it because of the temporarily increased response times; if failover takes a long
  time, the system is unavailable for its duration.
* The system is very sensitive to performance problems on the leader: if the leader is slow to
  respond, e.g. due to overload or some resource contention, the increased response times
  immediately affect users as well.

A big advantage of a leaderless architecture is that it is more resilient against such issues.
Because there is no failover, and requests go to multiple replicas in parallel anyway, one replica
becoming slow or unavailable has very little impact on response times: the client simply uses the
responses from the other replicas that are faster to respond. Using the fastest responses is called
*request hedging*, and it can significantly reduce tail latency [^55].

At its core, the resilience of a leaderless system comes from the fact that it doesn’t distinguish
between the normal case and the failure case. This is especially helpful when handling so-called
*gray failures*, in which a node isn’t completely down, but running in a degraded state where it is
unusually slow to handle requests [^56], or when a node is simply overloaded (for example, if a node has been offline for a while, recovery
via hinted handoff can cause a lot of additional load). A leader-based system has to decide whether
the situation is bad enough to warrant a failover (which can itself cause further disruption),
whereas in a leaderless system that question doesn’t even arise.

That said, leaderless systems can have performance problems as well:

* Even though the system doesn’t need to perform failover, one replica does need to detect when
  another replica is unavailable so that it can store hints about writes that the unavailable
  replica missed. When the unavailable replica comes back, the handoff process needs to send it
  those hints. This puts additional load on the replicas at a time when the system is already under strain [^54].
* The more replicas you have, the bigger the size of your quorums, and the more responses you have
  to wait for before a request can complete. Even if you wait only for the fastest *r* or *w*
  replicas to respond, and even if you make the requests in parallel, a bigger *r* or *w* increases
  the chance that you hit a slow replica, increasing the overall response time (see
  [“Use of Response Time Metrics”](/en/ch2#sec_introduction_slo_sla)).
* A large-scale network interruption that disconnects a client from a large number of replicas can
  make it impossible to form a quorum. Some leaderless databases offer a configuration option that
  allows any reachable replica to accept writes, even if it’s not one of the usual replicas for that
  key (Riak and Dynamo call this a *sloppy quorum* [^45];
  Cassandra and ScyllaDB call it *consistency level ANY*). There is no guarantee that subsequent
  reads will see the written value, but depending on the application it may still be better than
  having the write fail.

Multi-leader replication can offer even greater resilience against network interruptions than
leaderless replication, since reads and writes only require communication with one leader, which can
be co-located with the client. However, since a write on one leader is propagated asynchronously to
the others, reads can be arbitrarily out-of-date. Quorum reads and writes provide a compromise: good
fault tolerance while also having a high likelihood of reading up-to-date data.

#### Multi-region operation {#multi-region-operation}

We previously discussed cross-region replication as a use case for multi-leader replication (see
[“Multi-Leader Replication”](/en/ch6#sec_replication_multi_leader)). Leaderless replication is also suitable for
multi-region operation, since it is designed to tolerate conflicting concurrent writes, network
interruptions, and latency spikes.

Cassandra and ScyllaDB implement their multi-region support within the normal leaderless model: the
client sends its writes directly to the replicas in all regions, and you can choose from a variety
of consistency levels that determine how many responses are required for a request to be successful.
For example, you can request a quorum across the replicas in all the regions, a separate quorum in
each of the regions, or a quorum only in the client’s local region. A local quorum avoids having to
wait for slow requests to other regions, but it is also more likely to return stale results.

Riak keeps all communication between clients and database nodes local to one region, so *n*
describes the number of replicas within one region. Cross-region replication between
database clusters happens asynchronously in the background, in a style that is similar to
multi-leader replication.


### Detecting Concurrent Writes {#sec_replication_concurrent}

Like with multi-leader replication, leaderless databases allow concurrent writes to the same key,
resulting in conflicts that need to be resolved. Such conflicts may occur as the writes happen, but
not always: they could also be detected later during read repair, hinted handoff, or anti-entropy.

The problem is that events may arrive in a different order at different nodes, due to variable
network delays and partial failures. For example, [Figure 6-14](/en/ch6#fig_replication_concurrency) shows two clients,
A and B, simultaneously writing to a key *X* in a three-node datastore:

* Node 1 receives the write from A, but never receives the write from B due to a transient outage.
* Node 2 first receives the write from A, then the write from B.
* Node 3 first receives the write from B, then the write from A.

{{< figure src="/fig/ddia_0614.png" id="fig_replication_concurrency" caption="Figure 6-14. Concurrent writes in a Dynamo-style datastore: there is no well-defined ordering." class="w-full my-4" >}}


If each node simply overwrote the value for a key whenever it received a write request from a
client, the nodes would become permanently inconsistent, as shown by the final *get* request in
[Figure 6-14](/en/ch6#fig_replication_concurrency): node 2 thinks that the final value of *X* is B, whereas the other
nodes think that the value is A.

In order to become eventually consistent, the replicas should converge toward the same value. For
this, we can use any of the conflict resolution mechanisms we previously discussed in
[“Dealing with Conflicting Writes”](/en/ch6#sec_replication_write_conflicts), such as last-write-wins (used by Cassandra and ScyllaDB),
manual resolution, or CRDTs (described in [“CRDTs and Operational Transformation”](/en/ch6#sec_replication_crdts), and used by Riak).

Last-write-wins is easy to implement: each write is tagged with a timestamp, and a value with a
higher timestamp always overwrites a value with a lower timestamp. However, a timestamp doesn’t tell
you whether two values are actually conflicting (i.e., they were written concurrently) or not (they
were written one after another). If you want to resolve conflicts explicitly, the system needs to
take more care to detect concurrent writes.

#### The “happens-before” relation and concurrency {#sec_replication_happens_before}

How do we decide whether two operations are concurrent or not? To develop an intuition, let’s look
at some examples:

* In [Figure 6-8](/en/ch6#fig_replication_causality), the two writes are not concurrent: A’s insert *happens before*
  B’s increment, because the value incremented by B is the value inserted by A. In other words, B’s
  operation builds upon A’s operation, so B’s operation must have happened later.
  We also say that B is *causally dependent* on A.
* On the other hand, the two writes in [Figure 6-14](/en/ch6#fig_replication_concurrency) are concurrent: when each
  client starts the operation, it does not know that another client is also performing an operation
  on the same key. Thus, there is no causal dependency between the operations.

An operation A *happens before* another operation B if B knows about A, or depends on A, or builds
upon A in some way. Whether one operation happens before another operation is the key to defining
what concurrency means. In fact, we can simply say that two operations are *concurrent* if neither
happens before the other (i.e., neither knows about the other) [^57].

Thus, whenever you have two operations A and B, there are three possibilities: either A happened
before B, or B happened before A, or A and B are concurrent. What we need is an algorithm to tell us
whether two operations are concurrent or not. If one operation happened before another, the later
operation should overwrite the earlier operation, but if the operations are concurrent, we have a
conflict that needs to be resolved.

--------

> [!TIP] Concurrency, Time, and Relativity

It may seem that two operations should be called concurrent if they occur “at the same time”—but
in fact, it is not important whether they literally overlap in time. Because of problems with clocks
in distributed systems, it is actually quite difficult to tell whether two things happened
at exactly the same time—an issue we will discuss in more detail in [Chapter 9](/en/ch9#ch_distributed).

For defining concurrency, exact time doesn’t matter: we simply call two operations concurrent if
they are both unaware of each other, regardless of the physical time at which they occurred. People
sometimes make a connection between this principle and the special theory of relativity in physics
[^57], which introduced the idea that
information cannot travel faster than the speed of light. Consequently, two events that occur some
distance apart cannot possibly affect each other if the time between the events is shorter than the
time it takes light to travel the distance between them.

In computer systems, two operations might be concurrent even though the speed of light would in
principle have allowed one operation to affect the other. For example, if the network was slow or
interrupted at the time, two operations can occur some time apart and still be concurrent, because
the network problems prevented one operation from being able to know about the other.

--------

#### Capturing the happens-before relationship {#capturing-the-happens-before-relationship}

Let’s look at an algorithm that determines whether two operations are concurrent, or whether one
happened before another. To keep things simple, let’s start with a database that has only one
replica. Once we have worked out how to do this on a single replica, we can generalize the approach
to a leaderless database with multiple replicas.

[Figure 6-15](/en/ch6#fig_replication_causality_single) shows two clients concurrently adding items to the same
shopping cart. (If that example strikes you as too inane, imagine instead two air traffic
controllers concurrently adding aircraft to the sector they are tracking.) Initially, the cart is
empty. Between them, the clients make five writes to the database:

1. Client 1 adds `milk` to the cart. This is the first write to that key, so the server successfully
   stores it and assigns it version 1. The server also echoes the value back to the client, along
   with the version number.
2. Client 2 adds `eggs` to the cart, not knowing that client 1 concurrently added `milk` (client 2
   thought that its `eggs` were the only item in the cart). The server assigns version 2 to this
   write, and stores `eggs` and `milk` as two separate values (siblings). It then returns *both*
   values to the client, along with the version number of 2.
3. Client 1, oblivious to client 2’s write, wants to add `flour` to the cart, so it thinks the
   current cart contents should be `[milk, flour]`. It sends this value to the server, along with
   the version number 1 that the server gave client 1 previously. The server can tell from the
   version number that the write of `[milk, flour]` supersedes the prior value of `[milk]` but that
   it is concurrent with `[eggs]`. Thus, the server assigns version 3 to `[milk, flour]`, overwrites
   the version 1 value `[milk]`, but keeps the version 2 value `[eggs]` and returns both remaining
   values to the client.
4. Meanwhile, client 2 wants to add `ham` to the cart, unaware that client 1 just added `flour`.
   Client 2 received the two values `[milk]` and `[eggs]` from the server in the last response, so
   the client now merges those values and adds `ham` to form a new value, `[eggs, milk, ham]`. It
   sends that value to the server, along with the previous version number 2. The server detects that
   version 2 overwrites `[eggs]` but is concurrent with `[milk, flour]`, so the two remaining
   values are `[milk, flour]` with version 3, and `[eggs, milk, ham]` with version 4.
5. Finally, client 1 wants to add `bacon`. It previously received `[milk, flour]` and `[eggs]` from
   the server at version 3, so it merges those, adds `bacon`, and sends the final value
   `[milk, flour, eggs, bacon]` to the server, along with the version number 3. This overwrites
   `[milk, flour]` (note that `[eggs]` was already overwritten in the last step) but is concurrent
   with `[eggs, milk, ham]`, so the server keeps those two concurrent values.

{{< figure src="/fig/ddia_0615.png" id="fig_replication_causality_single" caption="Figure 6-15. Capturing causal dependencies between two clients concurrently editing a shopping cart." class="w-full my-4" >}}


The dataflow between the operations in [Figure 6-15](/en/ch6#fig_replication_causality_single) is illustrated
graphically in [Figure 6-16](/en/ch6#fig_replication_causal_dependencies). The arrows indicate which operation
*happened before* which other operation, in the sense that the later operation *knew about* or
*depended on* the earlier one. In this example, the clients are never fully up to date with the data
on the server, since there is always another operation going on concurrently. But old versions of
the value do get overwritten eventually, and no writes are lost.

{{< figure link="#fig_replication_causality_single" src="/fig/ddia_0616.png" id="fig_replication_causal_dependencies" caption="Figure 6-16. Graph of causal dependencies in Figure 6-15." class="w-full my-4" >}}


Note that the server can determine whether two operations are concurrent by looking at the version
numbers—it does not need to interpret the value itself (so the value could be any data
structure). The algorithm works as follows:

* The server maintains a version number for every key, increments the version number every time that
  key is written, and stores the new version number along with the value written.
* When a client reads a key, the server returns all siblings, i.e., all values that have not been
  overwritten, as well as the latest version number. A client must read a key before writing.
* When a client writes a key, it must include the version number from the prior read, and it must
  merge together all values that it received in the prior read, e.g. using a CRDT or by asking the
  user. The response from a write request is like a read, returning all siblings, which allows us to
  chain several writes like in the shopping cart example.
* When the server receives a write with a particular version number, it can overwrite all values
  with that version number or below (since it knows that they have been merged into the new value),
  but it must keep all values with a higher version number (because those values are concurrent with
  the incoming write).

When a write includes the version number from a prior read, that tells us which previous state the
write is based on. If you make a write without including a version number, it is concurrent with all
other writes, so it will not overwrite anything—it will just be returned as one of the values
on subsequent reads.

#### Version vectors {#version-vectors}

The example in [Figure 6-15](/en/ch6#fig_replication_causality_single) used only a single replica. How does the
algorithm change when there are multiple replicas, but no leader?

[Figure 6-15](/en/ch6#fig_replication_causality_single) uses a single version number to capture dependencies between
operations, but that is not sufficient when there are multiple replicas accepting writes
concurrently. Instead, we need to use a version number *per replica* as well as per key. Each
replica increments its own version number when processing a write, and also keeps track of the
version numbers it has seen from each of the other replicas. This information indicates which values
to overwrite and which values to keep as siblings.

The collection of version numbers from all the replicas is called a *version vector* [^58].
A few variants of this idea are in use, but the most interesting is probably the *dotted version vector* [^59] [^60],
which is used in Riak 2.0 [^61] [^62].
We won’t go into the details, but the way it works is quite similar to what we saw in our cart example.

Like the version numbers in [Figure 6-15](/en/ch6#fig_replication_causality_single), version vectors are sent from the
database replicas to clients when values are read, and need to be sent back to the database when a
value is subsequently written. (Riak encodes the version vector as a string that it calls *causal
context*.) The version vector allows the database to distinguish between overwrites and concurrent
writes.

The version vector also ensures that it is safe to read from one replica and subsequently write back
to another replica. Doing so may result in siblings being created, but no data is lost as long as
siblings are merged correctly.

--------

> [!TIP] VERSION VECTORS AND VECTOR CLOCKS

A *version vector* is sometimes also called a *vector clock*, even though they are not quite the
same. The difference is subtle—please see the references for details [^60] [^63] [^64]. In brief, when
comparing the state of replicas, version vectors are the right data structure to use.

--------

## Summary {#summary}

In this chapter we looked at the issue of replication. Replication can serve several purposes:

*High availability*
:   Keeping the system running, even when one machine (or several machines, a
    zone, or even an entire region) goes down

*Disconnected operation*
:   Allowing an application to continue working when there is a network
    interruption

*Latency*
:   Placing data geographically close to users, so that users can interact with it faster

*Scalability*
:   Being able to handle a higher volume of reads than a single machine could handle,
    by performing reads on replicas

Despite being a simple goal—keeping a copy of the same data on several machines—replication turns out
to be a remarkably tricky problem. It requires carefully thinking about concurrency and about all
the things that can go wrong, and dealing with the consequences of those faults. At a minimum, we
need to deal with unavailable nodes and network interruptions (and that’s not even considering the
more insidious kinds of fault, such as silent data corruption due to software bugs or hardware errors).

We discussed three main approaches to replication:

*Single-leader replication*
:   Clients send all writes to a single node (the leader), which sends a
    stream of data change events to the other replicas (followers). Reads can be performed on any
    replica, but reads from followers might be stale.

*Multi-leader replication*
:   Clients send each write to one of several leader nodes, any of which
    can accept writes. The leaders send streams of data change events to each other and to any
    follower nodes.

*Leaderless replication*
:   Clients send each write to several nodes, and read from several nodes
    in parallel in order to detect and correct nodes with stale data.

Each approach has advantages and disadvantages. Single-leader replication is popular because it is
fairly easy to understand and it offers strong consistency. Multi-leader and leaderless replication
can be more robust in the presence of faulty nodes, network interruptions, and latency spikes—at the
cost of requiring conflict resolution and providing weaker consistency guarantees.

Replication can be synchronous or asynchronous, which has a profound effect on the system behavior
when there is a fault. Although asynchronous replication can be fast when the system is running
smoothly, it’s important to figure out what happens when replication lag increases and servers fail.
If a leader fails and you promote an asynchronously updated follower to be the new leader, recently
committed data may be lost.

We looked at some strange effects that can be caused by replication lag, and we discussed a few
consistency models which are helpful for deciding how an application should behave under replication
lag:

*Read-after-write consistency*
:   Users should always see data that they submitted themselves.

*Monotonic reads*
:   After users have seen the data at one point in time, they shouldn’t later see
    the data from some earlier point in time.

*Consistent prefix reads*
:   Users should see the data in a state that makes causal sense:
    for example, seeing a question and its reply in the correct order.

Finally, we discussed how multi-leader and leaderless replication ensure that all replicas
eventually converge to a consistent state: by using a version vector or similar algorithm to detect
which writes are concurrent, and by using a conflict resolution algorithm such as a CRDT to merge
the concurrently written values. Last-write-wins and manual conflict resolution are also possible.

This chapter has assumed that every replica stores a full copy of the whole database, which is
unrealistic for large datasets. In the next chapter we will look at *sharding*, which allows each
machine to store only a subset of the data.


### References


[^1]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD)
[^2]: Kenny Gryp. [MySQL Terminology Updates](https://dev.mysql.com/blog-archive/mysql-terminology-updates/). *dev.mysql.com*, July 2020. Archived at [perma.cc/S62G-6RJ2](https://perma.cc/S62G-6RJ2)
[^3]: Oracle Corporation. [Oracle (Active) Data Guard 19c: Real-Time Data Protection and Availability](https://www.oracle.com/technetwork/database/availability/dg-adg-technical-overview-wp-5347548.pdf). White Paper, *oracle.com*, March 2019. Archived at [perma.cc/P5ST-RPKE](https://perma.cc/P5ST-RPKE)
[^4]: Microsoft. [What is an Always On availability group?](https://learn.microsoft.com/en-us/sql/database-engine/availability-groups/windows/overview-of-always-on-availability-groups-sql-server) *learn.microsoft.com*, September 2024. Archived at [perma.cc/ABH6-3MXF](https://perma.cc/ABH6-3MXF)
[^5]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022.
[^6]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/abs/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134)
[^7]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535)
[^8]: Mallory Knodel and Niels ten Oever. [Terminology, Power, and Inclusive Language in Internet-Drafts and RFCs](https://www.ietf.org/archive/id/draft-knodel-terminology-14.html). *IETF Internet-Draft*, August 2023. Archived at [perma.cc/5ZY9-725E](https://perma.cc/5ZY9-725E)
[^9]: Buck Hodges. [Postmortem: VSTS 4 September 2018](https://devblogs.microsoft.com/devopsservice/?p=17485). *devblogs.microsoft.com*, September 2018. Archived at [perma.cc/ZF5R-DYZS](https://perma.cc/ZF5R-DYZS)
[^10]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y)
[^11]: Vignesh Chandramohan, Rohan Desai, and Chris Riccomini. [SlateDB Manifest Design](https://github.com/slatedb/slatedb/blob/main/rfcs/0001-manifest.md). *github.com*, May 2024. Archived at [perma.cc/8EUY-P32Z](https://perma.cc/8EUY-P32Z)
[^12]: Stas Kelvich. [Why does Neon use Paxos instead of Raft, and what’s the difference?](https://neon.tech/blog/paxos) *neon.tech*, August 2022. Archived at [perma.cc/SEZ4-2GXU](https://perma.cc/SEZ4-2GXU)
[^13]: Dimitri Fontaine. [An introduction to the pg\_auto\_failover project](https://tapoueh.org/blog/2021/11/an-introduction-to-the-pg_auto_failover-project/). *tapoueh.org*, November 2021. Archived at [perma.cc/3WH5-6BAF](https://perma.cc/3WH5-6BAF)
[^14]: Jesse Newland. [GitHub availability this week](https://github.blog/news-insights/the-library/github-availability-this-week/). *github.blog*, September 2012. Archived at [perma.cc/3YRF-FTFJ](https://perma.cc/3YRF-FTFJ)
[^15]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ)
[^16]: John Hugg. [‘All In’ with Determinism for Performance and Testing in Distributed Systems](https://www.youtube.com/watch?v=gJRj3vJL4wE). At *Strange Loop*, September 2015.
[^17]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^18]: Amit Kapila. [WAL Internals of PostgreSQL](https://www.pgcon.org/2012/schedule/attachments/258_212_Internals%20Of%20PostgreSQL%20Wal.pdf). At *PostgreSQL Conference* (PGCon), May 2012. Archived at [perma.cc/6225-3SUX](https://perma.cc/6225-3SUX)
[^19]: Amit Kapila. [Evolution of Logical Replication](https://amitkapila16.blogspot.com/2023/09/evolution-of-logical-replication.html). *amitkapila16.blogspot.com*, September 2023. Archived at [perma.cc/F9VX-JLER](https://perma.cc/F9VX-JLER)
[^20]: Aru Petchimuthu. [Upgrade your Amazon RDS for PostgreSQL or Amazon Aurora PostgreSQL database, Part 2: Using the pglogical extension](https://aws.amazon.com/blogs/database/part-2-upgrade-your-amazon-rds-for-postgresql-database-using-the-pglogical-extension/). *aws.amazon.com*, August 2021. Archived at [perma.cc/RXT8-FS2T](https://perma.cc/RXT8-FS2T)
[^21]: Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, David Callies, Abhishek Choudhary, Laurent Demailly, Thomas Fersch, Liat Atsmon Guz, Andrzej Kotulski, Sachin Kulkarni, Sanjeev Kumar, Harry Li, Jun Li, Evgeniy Makeev, Kowshik Prakasam, Robbert van Renesse, Sabyasachi Roy, Pratyush Seth, Yee Jiun Song, Benjamin Wester, Kaushik Veeraraghavan, and Peter Xie. [Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
[^22]: Douglas B. Terry. [Replicated Data Consistency Explained Through Baseball](https://www.microsoft.com/en-us/research/publication/replicated-data-consistency-explained-through-baseball/). Microsoft Research, Technical Report MSR-TR-2011-137, October 2011. Archived at [perma.cc/F4KZ-AR38](https://perma.cc/F4KZ-AR38)
[^23]: Douglas B. Terry, Alan J. Demers, Karin Petersen, Mike J. Spreitzer, Marvin M. Theimer, and Brent B. Welch. [Session Guarantees for Weakly Consistent Replicated Data](https://csis.pace.edu/~marchese/CS865/Papers/SessionGuaranteesPDIS.pdf). At *3rd International Conference on Parallel and Distributed Information Systems* (PDIS), September 1994. [doi:10.1109/PDIS.1994.331722](https://doi.org/10.1109/PDIS.1994.331722)
[^24]: Werner Vogels. [Eventually Consistent](https://queue.acm.org/detail.cfm?id=1466448). *ACM Queue*, volume 6, issue 6, pages 14–19, October 2008. [doi:10.1145/1466443.1466448](https://doi.org/10.1145/1466443.1466448)
[^25]: Simon Willison. [Reply to: “My thoughts about Fly.io (so far) and other newish technology I’m getting into”](https://news.ycombinator.com/item?id=31434055). *news.ycombinator.com*, May 2022. Archived at [perma.cc/ZRV4-WWV8](https://perma.cc/ZRV4-WWV8)
[^26]: Nithin Tharakan. [Scaling Bitbucket’s Database](https://www.atlassian.com/blog/bitbucket/scaling-bitbuckets-database). *atlassian.com*, October 2020. Archived at [perma.cc/JAB7-9FGX](https://perma.cc/JAB7-9FGX)
[^27]: Terry Pratchett. *Reaper Man: A Discworld Novel*. Victor Gollancz, 1991. ISBN: 978-0-575-04979-6
[^28]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509)
[^29]: Yaser Raja and Peter Celentano. [PostgreSQL bi-directional replication using pglogical](https://aws.amazon.com/blogs/database/postgresql-bi-directional-replication-using-pglogical/). *aws.amazon.com*, January 2022. Archived at [perma.cc/BUQ2-5QWN](https://perma.cc/BUQ2-5QWN)
[^30]: Robert Hodges. [If You \*Must\* Deploy Multi-Master Replication, Read This First](https://scale-out-blog.blogspot.com/2012/04/if-you-must-deploy-multi-master.html). *scale-out-blog.blogspot.com*, April 2012. Archived at [perma.cc/C2JN-F6Y8](https://perma.cc/C2JN-F6Y8)
[^31]: Lars Hofhansl. [HBASE-7709: Infinite Loop Possible in Master/Master Replication](https://issues.apache.org/jira/browse/HBASE-7709). *issues.apache.org*, January 2013. Archived at [perma.cc/24G2-8NLC](https://perma.cc/24G2-8NLC)
[^32]: John Day-Richter. [What’s Different About the New Google Docs: Making Collaboration Fast](https://drive.googleblog.com/2010/09/whats-different-about-new-google-docs.html). *drive.googleblog.com*, September 2010. Archived at [perma.cc/5TL8-TSJ2](https://perma.cc/5TL8-TSJ2)
[^33]: Evan Wallace. [How Figma’s multiplayer technology works](https://www.figma.com/blog/how-figmas-multiplayer-technology-works/). *figma.com*, October 2019. Archived at [perma.cc/L49H-LY4D](https://perma.cc/L49H-LY4D)
[^34]: Tuomas Artman. [Scaling the Linear Sync Engine](https://linear.app/blog/scaling-the-linear-sync-engine). *linear.app*, June 2023.
[^35]: Amr Saafan. [Why Sync Engines Might Be the Future of Web Applications](https://www.nilebits.com/blog/2024/09/sync-engines-future-web-applications/). *nilebits.com*, September 2024. Archived at [perma.cc/5N73-5M3V](https://perma.cc/5N73-5M3V)
[^36]: Isaac Hagoel. [Are Sync Engines The Future of Web Applications?](https://dev.to/isaachagoel/are-sync-engines-the-future-of-web-applications-1bbi) *dev.to*, July 2024. Archived at [perma.cc/R9HF-BKKL](https://perma.cc/R9HF-BKKL)
[^37]: Sujay Jayakar. [A Map of Sync](https://stack.convex.dev/a-map-of-sync). *stack.convex.dev*, October 2024. Archived at [perma.cc/82R3-H42A](https://perma.cc/82R3-H42A)
[^38]: Alex Feyerke. [Designing Offline-First Web Apps](https://alistapart.com/article/offline-first/). *alistapart.com*, December 2013. Archived at [perma.cc/WH7R-S2DS](https://perma.cc/WH7R-S2DS)
[^39]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: You own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019, pages 154–178. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^40]: Martin Kleppmann. [The past, present, and future of local-first](https://martin.kleppmann.com/2024/05/30/local-first-conference.html). At *Local-First Conference*, May 2024.
[^41]: Conrad Hofmeyr. [API Calling is to Sync Engines as jQuery is to React](https://www.powersync.com/blog/api-calling-is-to-sync-engines-as-jquery-is-to-react). *powersync.com*, November 2024. Archived at [perma.cc/2FP9-7WJJ](https://perma.cc/2FP9-7WJJ)
[^42]: Peter van Hardenberg and Martin Kleppmann. [PushPin: Towards Production-Quality Peer-to-Peer Collaboration](https://martin.kleppmann.com/papers/pushpin-papoc20.pdf). At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393683](https://doi.org/10.1145/3380787.3393683)
[^43]: Leonard Kawell, Jr., Steven Beckhardt, Timothy Halvorsen, Raymond Ozzie, and Irene Greif. [Replicated document management in a group communication system](https://dl.acm.org/doi/pdf/10.1145/62266.1024798). At *ACM Conference on Computer-Supported Cooperative Work* (CSCW), September 1988. [doi:10.1145/62266.1024798](https://doi.org/10.1145/62266.1024798)
[^44]: Ricky Pusch. [Explaining how fighting games use delay-based and rollback netcode](https://words.infil.net/w02-netcode.html). *words.infil.net* and *arstechnica.com*, October 2019. Archived at [perma.cc/DE7W-RDJ8](https://perma.cc/DE7W-RDJ8)
[^45]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon’s Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1323293.1294281](https://doi.org/10.1145/1323293.1294281)
[^46]: Marc Shapiro, Nuno Preguiça, Carlos Baquero, and Marek Zawirski. [A Comprehensive Study of Convergent and Commutative Replicated Data Types](https://inria.hal.science/inria-00555588v1/document). INRIA Research Report no. 7506, January 2011.
[^47]: Chengzheng Sun and Clarence Ellis. [Operational Transformation in Real-Time Group Editors: Issues, Algorithms, and Achievements](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=aef660812c5a9c4d3f06775f9455eeb090a4ff0f). At *ACM Conference on Computer Supported Cooperative Work* (CSCW), November 1998. [doi:10.1145/289444.289469](https://doi.org/10.1145/289444.289469)
[^48]: Joseph Gentle and Martin Kleppmann. [Collaborative Text Editing with Eg-walker: Better, Faster, Smaller](https://arxiv.org/abs/2409.14252). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696076](https://doi.org/10.1145/3689031.3696076)
[^49]: Dharma Shukla. [Azure Cosmos DB: Pushing the frontier of globally distributed databases](https://azure.microsoft.com/en-us/blog/azure-cosmos-db-pushing-the-frontier-of-globally-distributed-databases/). *azure.microsoft.com*, September 2018. Archived at [perma.cc/UT3B-HH6R](https://perma.cc/UT3B-HH6R)
[^50]: David K. Gifford. [Weighted Voting for Replicated Data](https://www.cs.cmu.edu/~15-749/READINGS/required/availability/gifford79.pdf). At *7th ACM Symposium on Operating Systems Principles* (SOSP), December 1979. [doi:10.1145/800215.806583](https://doi.org/10.1145/800215.806583)
[^51]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.OPODIS.2016.25). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25)
[^52]: Joseph Blomstedt. [Bringing Consistency to Riak](https://vimeo.com/51973001). At *RICON West*, October 2012.
[^53]: Peter Bailis, Shivaram Venkataraman, Michael J. Franklin, Joseph M. Hellerstein, and Ion Stoica. [Quantifying eventual consistency with PBS](http://www.bailis.org/papers/pbs-vldbj2014.pdf). *The VLDB Journal*, volume 23, pages 279–302, April 2014. [doi:10.1007/s00778-013-0330-1](https://doi.org/10.1007/s00778-013-0330-1)
[^54]: Colin Breck. [Shared-Nothing Architectures for Server Replication and Synchronization](https://blog.colinbreck.com/shared-nothing-architectures-for-server-replication-and-synchronization/). *blog.colinbreck.com*, December 2019. Archived at [perma.cc/48P3-J6CJ](https://perma.cc/48P3-J6CJ)
[^55]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794)
[^56]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005)
[^57]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563)
[^58]: D. Stott Parker Jr., Gerald J. Popek, Gerard Rudisin, Allen Stoughton, Bruce J. Walker, Evelyn Walton, Johanna M. Chow, David Edwards, Stephen Kiser, and Charles Kline. [Detection of Mutual Inconsistency in Distributed Systems](https://pages.cs.wisc.edu/~remzi/Classes/739/Papers/parker83detection.pdf). *IEEE Transactions on Software Engineering*, volume SE-9, issue 3, pages 240–247, May 1983. [doi:10.1109/TSE.1983.236733](https://doi.org/10.1109/TSE.1983.236733)
[^59]: Nuno Preguiça, Carlos Baquero, Paulo Sérgio Almeida, Victor Fonte, and Ricardo Gonçalves. [Dotted Version Vectors: Logical Clocks for Optimistic Replication](https://arxiv.org/abs/1011.5808). arXiv:1011.5808, November 2010.
[^60]: Giridhar Manepalli. [Clocks and Causality - Ordering Events in Distributed Systems](https://www.exhypothesi.com/clocks-and-causality/). *exhypothesi.com*, November 2022. Archived at [perma.cc/8REU-KVLQ](https://perma.cc/8REU-KVLQ)
[^61]: Sean Cribbs. [A Brief History of Time in Riak](https://speakerdeck.com/seancribbs/a-brief-history-of-time-in-riak). At *RICON*, October 2014. Archived at [perma.cc/7U9P-6JFX](https://perma.cc/7U9P-6JFX)
[^62]: Russell Brown. [Vector Clocks Revisited Part 2: Dotted Version Vectors](https://riak.com/posts/technical/vector-clocks-revisited-part-2-dotted-version-vectors/). *riak.com*, November 2015. Archived at [perma.cc/96QP-W98R](https://perma.cc/96QP-W98R)
[^63]: Carlos Baquero. [Version Vectors Are Not Vector Clocks](https://haslab.wordpress.com/2011/07/08/version-vectors-are-not-vector-clocks/). *haslab.wordpress.com*, July 2011. Archived at [perma.cc/7PNU-4AMG](https://perma.cc/7PNU-4AMG)
[^64]: Reinhard Schwarz and Friedemann Mattern. [Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](https://disco.ethz.ch/courses/hs08/seminar/papers/mattern4.pdf). *Distributed Computing*, volume 7, issue 3, pages 149–174, March 1994. [doi:10.1007/BF02277859](https://doi.org/10.1007/BF02277859) 


================================================
FILE: content/en/ch7.md
================================================
---
title: "7. Sharding"
weight: 207
breadcrumbs: false
---

<a id="ch_sharding"></a>

![](/map/ch06.png)

> *Clearly, we must break away from the sequential and not limit the computers. We must state
> definitions and provide for priorities and descriptions of data. We must state relationships, not
> procedures.*
>
> Grace Murray Hopper, *Management and the Computer of the Future* (1962)

A distributed database typically distributes data across nodes in two ways:

1. Having a copy of the same data on multiple nodes: this is *replication*, which we discussed in [Chapter 6](/en/ch6#ch_replication).
2. If we don’t want every node to store all the data, we can split up a large amount of data into
 smaller *shards* or *partitions*, and store different shards on different nodes. We’ll discuss
 sharding in this chapter.

Normally, shards are defined in such a way that each piece of data (each record, row, or document)
belongs to exactly one shard. There are various ways of achieving this, which we discuss in depth in
this chapter. In effect, each shard is a small database of its own, although some database systems
support operations that touch multiple shards at the same time.

Sharding is usually combined with replication so that copies of each shard are stored on multiple
nodes. This means that, even though each record belongs to exactly one shard, it may still be stored
on several different nodes for fault tolerance.

A node may store more than one shard. If a single-leader replication model is used, the combination
of sharding and replication can look like [Figure 7-1](/en/ch7#fig_sharding_replicas), for example. Each shard’s
leader is assigned to one node, and its followers are assigned to other nodes. Each node may be the
leader for some shards and a follower for other shards, but each shard still only has one leader.

{{< figure src="/fig/ddia_0701.png" id="fig_sharding_replicas" caption="Figure 7-1. Combining replication and sharding: each node acts as leader for some shards and follower for other shards." class="w-full my-4" >}}

Everything we discussed in [Chapter 6](/en/ch6#ch_replication) about replication of databases applies equally to
replication of shards. Since the choice of sharding scheme is mostly independent of the choice of
replication scheme, we will ignore replication in this chapter for the sake of simplicity.

--------

> [!TIP] SHARDING AND PARTITIONING

What we call a *shard* in this chapter has many different names depending on which software you’re
using: it’s called a *partition* in Kafka, a *range* in CockroachDB, a *region* in HBase and TiDB, a
*tablet* in Bigtable and YugabyteDB, a *vnode* in Cassandra, ScyllaDB, and Riak, and a *vBucket* in
Couchbase, to name just a few.

Some databases treat partitions and shards as two distinct concepts. For example, in PostgreSQL,
partitioning is a way of splitting a large table into several files that are stored on the same
machine (which has several advantages, such as making it very fast to delete an entire partition),
whereas sharding splits a dataset across multiple machines [^1] [^2].
In many other systems, partitioning is just another word for sharding.

While *partitioning* is quite descriptive, the term *sharding* is perhaps surprising. According to
one theory, the term arose from the online role-play game *Ultima Online*, in which a magic crystal
was shattered into pieces, and each of those shards refracted a copy of the game world [^3].
The term *shard* thus came to mean one of a set of parallel game servers, and later was carried over
to databases. Another theory is that *shard* was originally an acronym of *System for Highly
Available Replicated Data*—reportedly a 1980s database, details of which are lost to history.

By the way, partitioning has nothing to do with *network partitions* (netsplits), a type of fault in
the network between nodes. We will discuss such faults in [Chapter 9](/en/ch9#ch_distributed).

--------

## Pros and Cons of Sharding {#sec_sharding_reasons}

The primary reason for sharding a database is *scalability*: it’s a solution if the volume of data
or the write throughput has become too great for a single node to handle, as it allows you to spread
that data and those writes across multiple nodes. (If read throughput is the problem, you don’t
necessarily need sharding—you can use *read scaling* as discussed in [Chapter 6](/en/ch6#ch_replication).)

In fact, sharding is one of the main tools we have for achieving *horizontal scaling* (a *scale-out*
architecture), as discussed in [“Shared-Memory, Shared-Disk, and Shared-Nothing Architecture”](/en/ch2#sec_introduction_shared_nothing): that is, allowing a system to
grow its capacity not by moving to a bigger machine, but by adding more (smaller) machines. If you
can divide the workload such that each shard handles a roughly equal share, you can then assign
those shards to different machines in order to process their data and queries in parallel.

While replication is useful at both small and large scale, because it enables fault tolerance and
offline operation, sharding is a heavyweight solution that is mostly relevant at large scale. If
your data volume and write throughput are such that you can process them on a single machine (and a
single machine can do a lot nowadays!), it’s often better to avoid sharding and stick with a
single-shard database.

The reason for this recommendation is that sharding often adds complexity: you typically have to
decide which records to put in which shard by choosing a *partition key*; all records with the
same partition key are placed in the same shard [^4].
This choice matters because accessing a record is fast if you know which shard it’s in, but if you
don’t know the shard you have to do an inefficient search across all shards, and the sharding scheme
is difficult to change.

Thus, sharding often works well for key-value data, where you can easily shard by key, but it’s
harder with relational data where you may want to search by a secondary index, or join records that
may be distributed across different shards. We will discuss this further in
[“Sharding and Secondary Indexes”](/en/ch7#sec_sharding_secondary_indexes).

Another problem with sharding is that a write may need to update related records in several
different shards. While transactions on a single node are quite common (see [Chapter 8](/en/ch8#ch_transactions)),
ensuring consistency across multiple shards requires a *distributed transaction*. As we shall see in
[Chapter 8](/en/ch8#ch_transactions), distributed transactions are available in some databases, but they are usually
much slower than single-node transactions, may become a bottleneck for the system as a whole, and
some systems don’t support them at all.

Some systems use sharding even on a single machine, typically running one single-threaded process
per CPU core to make use of the parallelism in the CPU, or to take advantage of a *nonuniform memory
access* (NUMA) architecture in which some banks of memory are closer to one CPU than to others [^5].
For example, Redis, VoltDB, and FoundationDB use one process per core, and rely on sharding to
spread load across CPU cores in the same machine [^6].

### Sharding for Multitenancy {#sec_sharding_multitenancy}

Software as a Service (SaaS) products and cloud services are often *multitenant*, where each tenant
is a customer. Multiple users may have logins on the same tenant, but each tenant has a
self-contained dataset that is separate from other tenants. For example, in an email marketing
service, each business that signs up is typically a separate tenant, since one business’s newsletter
signups, delivery data etc. are separate from those of other businesses.

Sometimes sharding is used to implement multitenant systems: either each tenant is given a separate
shard, or multiple small tenants may be grouped together into a larger shard. These shards might be
physically separate databases (which we previously touched on in [“Embedded storage engines”](/en/ch4#sidebar_embedded)), or
separately manageable portions of a larger logical database [^7].
Using sharding for multitenancy has several advantages:

Resource isolation
: If one tenant performs a computationally expensive operation, it is less likely that other
 tenants’ performance will be affected if they are running on different shards.

Permission isolation
: If there is a bug in your access control logic, it’s less likely that you will accidentally give
 one tenant access to another tenant’s data if those tenants’ datasets are stored physically
 separately from each other.

Cell-based architecture
: You can apply sharding not only at the data storage level, but also for the services running your
 application code. In a *cell-based architecture*, the services and storage for a particular set of
 tenants are grouped into a self-contained *cell*, and different cells are set up such that they
 can run largely independently from each other. This approach provides *fault isolation*: that is,
 a fault in one cell remains limited to that cell, and tenants in other cells are not affected [^8].

Per-tenant backup and restore
: Backing up each tenant’s shard separately makes it possible to restore a tenant’s state from a
 backup without affecting other tenants, which can be useful in case the tenant accidentally
 deletes or overwrites important data [^9].

Regulatory compliance
: Data privacy regulation such as the GDPR gives individuals the right to access and delete all data
 stored about them. If each person’s data is stored in a separate shard, this translates into
 simple data export and deletion operations on their shard [^10].

Data residency
: If a particular tenant’s data needs to be stored in a particular jurisdiction in order to comply
 with data residency laws, a region-aware database can allow you to assign that tenant’s shard to a particular region.

Gradual schema rollout
: Schema migrations (previously discussed in [“Schema flexibility in the document model”](/en/ch3#sec_datamodels_schema_flexibility)) can be rolled
 out gradually, one tenant at a time. This reduces risk, as you can detect problems before they
 affect all tenants, but it can be difficult to do transactionally [^11].

The main challenges around using sharding for multitenancy are:

* It assumes that each individual tenant is small enough to fit on a single node. If that is not the
 case, and you have a single tenant that’s too big for one machine, you would need to additionally
 perform sharding within a single tenant, which brings us back to the topic of sharding for
 scalability [^12].
* If you have many small tenants, then creating a separate shard for each one may incur too much
 overhead. You could group several small tenants together into a bigger shard, but then you have
 the problem of how you move tenants from one shard to another as they grow.
* If you ever need to support features that connect data across multiple tenants, these become
 harder to implement if you need to join data across multiple shards.


## Sharding of Key-Value Data {#sec_sharding_key_value}

Say you have a large amount of data, and you want to shard it. How do you decide which records to
store on which nodes?

Our goal with sharding is to spread the data and the query load evenly across nodes. If every node
takes a fair share, then—in theory—10 nodes should be able to handle 10 times as much data and 10
times the read and write throughput of a single node (ignoring replication). Moreover, if we add or
remove a node, we want to be able to *rebalance* the load so that it is evenly distributed across
the 11 (when adding) or the remaining 9 (when removing) nodes.

If the sharding is unfair, so that some shards have more data or queries than others, we call it
*skewed*. The presence of skew makes sharding much less effective. In an extreme case, all the load
could end up on one shard, so 9 out of 10 nodes are idle and your bottleneck is the single busy
node. A shard with disproportionately high load is called a *hot shard* or *hot spot*. If there’s
one key with a particularly high load (e.g., a celebrity in a social network), we call it a *hot key*.

Therefore we need an algorithm that takes as input the partition key of a record, and tells us which
shard that record is in. In a key-value store the partition key is usually the key, or the first
part of the key. In a relational model the partition key might be some column of a table (not
necessarily its primary key). That algorithm needs to be amenable to rebalancing in order to relieve
hot spots.


### Sharding by Key Range {#sec_sharding_key_range}

One way of sharding is to assign a contiguous range of partition keys (from some minimum to some
maximum) to each shard, like the volumes of a paper encyclopedia, as illustrated in
[Figure 7-2](/en/ch7#fig_sharding_encyclopedia). In this example, an entry’s partition key is its title. If you want
to look up the entry for a particular title, you can easily determine which shard contains that
entry by finding the volume whose key range contains the title you’re looking for, and thus pick the
correct book off the shelf.

{{< figure src="/fig/ddia_0702.png" id="fig_sharding_encyclopedia" caption="Figure 7-2. A print encyclopedia is sharded by key range." class="w-full my-4" >}}

The ranges of keys are not necessarily evenly spaced, because your data may not be evenly
distributed. For example, in [Figure 7-2](/en/ch7#fig_sharding_encyclopedia), volume 1 contains words starting with A
and B, but volume 12 contains words starting with T, U, V, W, X, Y, and Z. Simply having one volume
per two letters of the alphabet would lead to some volumes being much bigger than others. In order
to distribute the data evenly, the shard boundaries need to adapt to the data.

The shard boundaries might be chosen manually by an administrator, or the database can choose them
automatically. Manual key-range sharding is used by Vitess (a sharding layer for MySQL), for
example; the automatic variant is used by Bigtable, its open source equivalent HBase, the
range-based sharding option in MongoDB, CockroachDB, RethinkDB, and FoundationDB [^6]. YugabyteDB offers both manual and automatic
tablet splitting.

Within each shard, keys are stored in sorted order (e.g., in a B-tree or SSTables, as discussed in
[Chapter 4](/en/ch4#ch_storage)). This has the advantage that range scans are easy, and you can treat the key as a
concatenated index in order to fetch several related records in one query (see
[“Multidimensional and Full-Text Indexes”](/en/ch4#sec_storage_multidimensional)). For example, consider an application that stores data from a
network of sensors, where the key is the timestamp of the measurement. Range scans are very useful
in this case, because they let you easily fetch, say, all the readings from a particular month.

A downside of key range sharding is that you can easily get a hot shard if there are a
lot of writes to nearby keys. For example, if the key is a timestamp, then the shards correspond to
ranges of time—e.g., one shard per month. Unfortunately, if you write data from the sensors to the
database as the measurements happen, all the writes end up going to the same shard (the one for
this month), so that shard can be overloaded with writes while others sit idle [^13].

To avoid this problem in the sensor database, you need to use something other than the timestamp as
the first element of the key. For example, you could prefix each timestamp with the sensor ID so
that the key ordering is first by sensor ID and then by timestamp. Assuming you have many sensors
active at the same time, the write load will end up more evenly spread across the shards. The
downside is that when you want to fetch the values of multiple sensors within a time range, you now
need to perform a separate range query for each sensor.

#### Rebalancing key-range sharded data {#rebalancing-key-range-sharded-data}

When you first set up your database, there are no key ranges to split into shards. Some databases,
such as HBase and MongoDB, allow you to configure an initial set of shards on an empty database,
which is called *pre-splitting*. This requires that you already have some idea of what the key
distribution is going to look like, so that you can choose appropriate key range boundaries [^14].

Later on, as your data volume and write throughput grow, a system with key-range sharding grows by
splitting an existing shard into two or more smaller shards, each of which holds a contiguous
sub-range of the original shard’s key range. The resulting smaller shards can then be distributed
across multiple nodes. If large amounts of data are deleted, you may also need to merge several
adjacent shards that have become small into one bigger one.
This process is similar to what happens at the top level of a B-tree (see [“B-Trees”](/en/ch4#sec_storage_b_trees)).

With databases that manage shard boundaries automatically, a shard split is typically triggered by:

* the shard reaching a configured size (for example, on HBase, the default is 10 GB), or
* in some systems, the write throughput being persistently above some threshold. Thus, a hot shard
 may be split even if it is not storing a lot of data, so that its write load can be distributed more uniformly.

An advantage of key-range sharding is that the number of shards adapts to the data volume. If there
is only a small amount of data, a small number of shards is sufficient, so overheads are small; if
there is a huge amount of data, the size of each individual shard is limited to a configurable maximum [^15].

A downside of this approach is that splitting a shard is an expensive operation, since it requires
all of its data to be rewritten into new files, similarly to a compaction in a log-structured
storage engine. A shard that needs splitting is often also one that is under high load, and the cost
of splitting can exacerbate that load, risking it becoming overloaded.

### Sharding by Hash of Key {#sec_sharding_hash}

Key-range sharding is useful if you want records with nearby (but different) partition keys to be
grouped into the same shard; for example, this might be the case with timestamps. If you don’t care
whether partition keys are near each other (e.g., if they are tenant IDs in a multitenant
application), a common approach is to first hash the partition key before mapping it to a shard.

A good hash function takes skewed data and makes it uniformly distributed. Say you have a 32-bit
hash function that takes a string. Whenever you give it a new string, it returns a seemingly random
number between 0 and 2³² − 1. Even if the input strings are very similar, their hashes are evenly
distributed across that range of numbers (but the same input always produces the same output).

For sharding purposes, the hash function need not be cryptographically strong: for example, MongoDB
uses MD5, whereas Cassandra and ScyllaDB use Murmur3. Many programming languages have simple hash
functions built in (as they are used for hash tables), but they may not be suitable for sharding:
for example, in Java’s `Object.hashCode()` and Ruby’s `Object#hash`, the same key may have a
different hash value in different processes, making them unsuitable for sharding [^16].

#### Hash modulo number of nodes {#hash-modulo-number-of-nodes}

Once you have hashed the key, how do you choose which shard to store it in? Maybe your first thought
is to take the hash value *modulo* the number of nodes in the system (using the `%` operator in many
programming languages). For example, *hash*(*key*) % 10 would return a number between
0 and 9 (if we write the hash as a decimal number, the hash % 10 would be the last digit).
If we have 10 nodes, numbered 0 to 9, that seems like an easy way of assigning each key to a node.

The problem with the *mod N* approach is that if the number of nodes *N* changes, most of the keys
have to be moved from one node to another. [Figure 7-3](/en/ch7#fig_sharding_hash_mod_n) shows what happens when you
have three nodes and add a fourth. Before the rebalancing, node 0 stored the keys whose hashes are
0, 3, 6, 9, and so on. After adding the fourth node, the key with hash 3 has moved to node 3, the
key with hash 6 has moved to node 2, the key with hash 9 has moved to node 1, and so on.

{{< figure src="/fig/ddia_0703.png" id="fig_sharding_hash_mod_n" caption="Figure 7-3. Assigning keys to nodes by hashing the key and taking it modulo the number of nodes. Changing the number of nodes results in many keys moving from one node to another." class="w-full my-4" >}}

The *mod N* function is easy to compute, but it leads to very inefficient rebalancing because there
is a lot of unnecessary movement of records from one node to another. We need an approach that
doesn’t move data around more than necessary.

#### Fixed number of shards {#fixed-number-of-shards}

One simple but widely-used solution is to create many more shards than there are nodes, and to
assign several shards to each node. For example, a database running on a cluster of 10 nodes may be
split into 1,000 shards from the outset so that 100 shards are assigned to each node. A key is then
stored in shard number *hash*(*key*) % 1,000, and the system separately keeps track of
which shard is stored on which node.

Now, if a node is added to the cluster, the system can reassign some of the shards from existing
nodes to the new node until they are fairly distributed once again. This process is illustrated in
[Figure 7-4](/en/ch7#fig_sharding_rebalance_fixed). If a node is removed from the cluster, the same happens in reverse.

{{< figure src="/fig/ddia_0704.png" id="fig_sharding_rebalance_fixed" caption="Figure 7-4. Adding a new node to a database cluster with multiple shards per node." class="w-full my-4" >}}

In this model, only entire shards are moved between nodes, which is cheaper than splitting shards.
The number of shards does not change, nor does the assignment of keys to shards. The only thing that
changes is the assignment of shards to nodes. This change of assignment is not immediate—it takes
some time to transfer a large amount of data over the network—so the old assignment of shards is
used for any reads and writes that happen while the transfer is in progress.

It’s common to choose the number of shards to be a number that is divisible by many factors, so that
the dataset can be evenly split across various different numbers of nodes—not requiring the number
of nodes to be a power of 2, for example [^4].
You can even account for mismatched hardware in your cluster: by assigning more shards to nodes that
are more powerful, you can make those nodes take a greater share of the load.

This approach to sharding is used in Citus (a sharding layer for PostgreSQL), Riak, Elasticsearch,
and Couchbase, among others. It works well as long as you have a good estimate of how many shards
you will need when you first create the database. You can then add or remove nodes easily, subject
to the limitation that you can’t have more nodes than you have shards.

If you find the originally configured number of shards to be wrong—for example, if you have reached
a scale where you need more nodes than you have shards—then an expensive resharding operation is
required. It needs to split each shard and write it out to new files, using a lot of additional disk
space in the process. Some systems don’t allow resharding while concurrently writing to the
database, which makes it difficult to change the number of shards without downtime.

Choosing the right number of shards is difficult if the total size of the dataset is highly variable
(for example, if it starts small but may grow much larger over time). Since each shard contains a
fixed fraction of the total data, the size of each shard grows proportionally to the total amount of
data in the cluster. If shards are very large, rebalancing and recovery from node failures become
expensive. But if shards are too small, they incur too much overhead. The best performance is
achieved when the size of shards is “just right,” neither too big nor too small, which can be hard
to achieve if the number of shards is fixed but the dataset size varies.

#### Sharding by hash range {#sharding-by-hash-range}

If the required number of shards can’t be predicted in advance, it’s better to use a scheme in which
the number of shards can adapt easily to the workload. The aforementioned key-range sharding scheme
has this property, but it has a risk of hot spots when there are a lot of writes to nearby keys. One
solution is to combine key-range sharding with a hash function so that each shard contains a range
of *hash values* rather than a range of *keys*.

[Figure 7-5](/en/ch7#fig_sharding_hash_range) shows an example using a 16-bit hash function that returns a number
between 0 and 65,535 = 2¹⁶ − 1 (in reality, the hash is usually 32 bits or more).
Even if the input keys are very similar (e.g., consecutive timestamps), their hashes are uniformly
distributed across that range. We can then assign a range of hash values to each shard: for example,
values between 0 and 16,383 to shard 0, values between 16,384 and 32,767 to shard 1, and so on.

{{< figure src="/fig/ddia_0705.png" id="fig_sharding_hash_range" caption="Figure 7-5. Assigning a contiguous range of hash values to each shard." class="w-full my-4" >}}

Like with key-range sharding, a shard in hash-range sharding can be split when it becomes too big or
too heavily loaded. This is still an expensive operation, but it can happen as needed, so the number
of shards adapts to the volume of data rather than being fixed in advance.

The downside compared to key-range sharding is that range queries over the partition key are not
efficient, as keys in the range are now scattered across all the shards. However, if keys consist of
two or more columns, and the partition key is only the first of these columns, you can still perform
efficient range queries over the second and later columns: as long as all records in the range query
have the same partition key, they will be in the same shard.

--------

> [!TIP] PARTITIONING AND RANGE QUERIES IN DATA WAREHOUSES

Data warehouses such as BigQuery, Snowflake, and Delta Lake support a similar indexing approach,
though the terminology differs. In BigQuery, for example, the partition key determines which
partition a record resides in while “cluster columns” determine how records are sorted within the
partition. Snowflake assigns records to “micro-partitions” automatically, but allows users to define
cluster keys for a table. Delta Lake supports both manual and automatic partition assignment, and
supports cluster keys. Clustering data not only improves range scan performance, but can
improve compression and filtering performance as well.

--------

Hash-range sharding is used in YugabyteDB and DynamoDB [^17], and is an option in MongoDB.
Cassandra and ScyllaDB use a variant of this approach that is illustrated in
[Figure 7-6](/en/ch7#fig_sharding_cassandra): the space of hash values is split into a number of ranges proportional
to the number of nodes (3 ranges per node in [Figure 7-6](/en/ch7#fig_sharding_cassandra), but actual numbers are 8
per node in Cassandra by default, and 256 per node in ScyllaDB), with random boundaries between
those ranges. This means some ranges are bigger than others, but by having multiple ranges per node
those imbalances tend to even out [^15] [^18].

{{< figure src="/fig/ddia_0706.png" id="fig_sharding_cassandra" caption="Figure 7-6. Cassandra and ScyllaDB split the range of possible hash values (here 0–1023) into contiguous ranges with random boundaries, and assign several ranges to each node." class="w-full my-4" >}}

When nodes are added or removed, range boundaries are added and removed, and shards are split or
merged accordingly [^19].
In the example of [Figure 7-6](/en/ch7#fig_sharding_cassandra), when node 3 is added, node 1
transfers parts of two of its ranges to node 3, and node 2 transfers part of one of its ranges to
node 3. This has the effect of giving the new node an approximately fair share of the dataset,
without transferring more data than necessary from one node to another.

#### Consistent hashing {#sec_sharding_consistent_hashing}

A *consistent hashing* algorithm is a hash function that maps keys to a specified number of shards
in a way that satisfies two properties:

1. the number of keys mapped to each shard is roughly equal, and
2. when the number of shards changes, as few keys as possible are moved from one shard to another.

Note that *consistent* here has nothing to do with replica consistency (see [Chapter 6](/en/ch6#ch_replication)) or
ACID consistency (see [Chapter 8](/en/ch8#ch_transactions)), but rather describes the tendency of a key to stay in
the same shard as much as possible.

The sharding algorithm used by Cassandra and ScyllaDB is similar to the original definition of consistent hashing [^20],
but several other consistent hashing algorithms have also been proposed [^21], such as *highest random weight*, also known as *rendezvous hashing* [^22],
and *jump consistent hash* [^23].
With Cassandra’s algorithm, if one node is added, a small number of existing shards are split into
sub-ranges; on the other hand, with rendezvous and jump consistent hashes, the new node is assigned
individual keys that were previously scattered across all of the other nodes. Which one is
preferable depends on the application.

### Skewed Workloads and Relieving Hot Spots {#sec_sharding_skew}

Consistent hashing ensures that keys are uniformly distributed across nodes, but that doesn’t mean
that the actual load is uniformly distributed. If the workload is highly skewed—that is, the amount
of data under some partition keys is much greater than other keys, or if the rate of requests to
some keys is much higher than to others—you can still end up with some servers being overloaded
while others sit almost idle.

For example, on a social media site, a celebrity user with millions of followers may cause a storm
of activity when they do something [^24].
This event can result in a large volume of reads and writes to the same key (where the partition key
is perhaps the user ID of the celebrity, or the ID of the action that people are commenting on).

In such situations, a more flexible sharding policy is required [^25] [^26].
A system that defines shards based on ranges of keys (or ranges of hashes) makes it possible to put
an individual hot key in a shard of its own, and perhaps even to assign it a dedicated machine [^27].

It’s also possible to compensate for skew at the application level. For example, if one key is known
to be very hot, a simple technique is to add a random number to the beginning or end of the key.
Just a two-digit decimal random number would split the writes to the key evenly across 100 different
keys, allowing those keys to be distributed to different shards.

However, having split the writes across different keys, any reads now have to do additional work, as
they have to read the data from all 100 keys and combine it. The volume of reads to each shard of
the hot key is not reduced; only the write load is split. This technique also requires additional
bookkeeping: it only makes sense to append the random number for the small number of hot keys; for
the vast majority of keys with low write throughput this would be unnecessary overhead. Thus, you
also need some way of keeping track of which keys are being split, and a process for converting a
regular key into a specially-managed hot key.

The problem is further compounded by change of load over time: for example, a particular social
media post that has gone viral may experience high load for a couple of days, but thereafter it’s
likely to calm down again. Moreover, some keys may be hot for writes while others are hot for reads,
necessitating different strategies for handling them.

Some systems (especially cloud services designed for large scale) have automated approaches for
dealing with hot shards; for example, Amazon calls it *heat management* [^28] or *adaptive capacity* [^17].
The details of how these systems work go beyond the scope of this book.

### Operations: Automatic or Manual Rebalancing {#sec_sharding_operations}

There is one important question with regard to rebalancing that we have glossed over: does the
splitting of shards and rebalancing happen automatically or manually?

Some systems automatically decide when to split shards and when to move them from one node to
another, without any human interaction, while others leave sharding to be explicitly configured by
an administrator. There is also a middle ground: for example, Couchbase and Riak generate a
suggested shard assignment automatically, but require an administrator to commit it before it takes effect.

Fully automated rebalancing can be convenient, because there is less operational work to do for
normal maintenance, and such systems can even auto-scale to adapt to changes in workload. Cloud
databases such as DynamoDB are promoted as being able to automatically add and remove shards to
adapt to big increases or decreases of load within a matter of minutes [^17] [^29].

However, automatic shard management can also be unpredictable. Rebalancing is an expensive
operation, because it requires rerouting requests and moving a large amount of data from one node to
another. If it is not done carefully, this process can overload the network or the nodes, and it
might harm the performance of other requests. The system must continue processing writes while the
rebalancing is in progress; if a system is near its maximum write throughput, the shard-splitting
process might not even be able to keep up with the rate of incoming writes [^29].

Such automation can be dangerous in combination with automatic failure detection. For example, say
one node is overloaded and is temporarily slow to respond to requests. The other nodes conclude that
the overloaded node is dead, and automatically rebalance the cluster to move load away from it. This
puts additional load on other nodes and the network, making the situation worse. There is a risk of
causing a cascading failure where other nodes become overloaded and are also falsely suspected of being down.

For that reason, it can be a good thing to have a human in the loop for rebalancing. It’s slower
than a fully automatic process, but it can help prevent operational surprises.


## Request Routing {#sec_sharding_routing}

We have discussed how to shard a dataset across multiple nodes, and how to rebalance those shards as
nodes are added or removed. Now let’s move on to the question: if you want to read or write a
particular key, how do you know which node—i.e., which IP address and port number—you need to
connect to?

We call this problem *request routing*, and it’s very similar to *service discovery*, which we
previously discussed in [“Load balancers, service discovery, and service meshes”](/en/ch5#sec_encoding_service_discovery). The biggest difference between the two
is that with services running application code, each instance is usually stateless, and a load
balancer can send a request to any of the instances. With sharded databases, a request for a key can
only be handled by a node that is a replica for the shard containing that key.

This means that request routing has to be aware of the assignment from keys to shards, and from
shards to nodes. On a high level, there are a few different approaches to this problem 
(illustrated in [Figure 7-7](/en/ch7#fig_sharding_routing)):

1. Allow clients to contact any node (e.g., via a round-robin load balancer). If that node
 coincidentally owns the shard to which the request applies, it can handle the request directly;
 otherwise, it forwards the request to the appropriate node, receives the reply, and passes the
 reply along to the client.
2. Send all requests from clients to a routing tier first, which determines the node that should
 handle each request and forwards it accordingly. This routing tier does not itself handle any
 requests; it only acts as a shard-aware load balancer.
3. Require that clients be aware of the sharding and the assignment of shards to nodes. In this
 case, a client can connect directly to the appropriate node, without any intermediary.

{{< figure src="/fig/ddia_0707.png" id="fig_sharding_routing" caption="Figure 7-7. Three different ways of routing a request to the right node." class="w-full my-4" >}}

In all cases, there are some key problems:

* Who decides which shard should live on which node? It’s simplest to have a single coordinator
 making that decision, but in that case how do you make it fault-tolerant in case the node running
 the coordinator goes down? And if the coordinator role can fail over to another node, how do you
 prevent a split-brain situation (see [“Handling Node Outages”](/en/ch6#sec_replication_failover)) where two different
 coordinators make contradictory shard assignments?
* How does the component performing the routing (which may be one of the nodes, or the routing tier,
 or the client) learn about changes in the assignment of shards to nodes?
* While a shard is being moved from one node to another, there is a cutover period during which the
 new node has taken over, but requests to the old node may still be in flight. How do you handle
 those?

Many distributed data systems rely on a separate coordination service such as ZooKeeper or etcd to
keep track of shard assignments, as illustrated in [Figure 7-8](/en/ch7#fig_sharding_zookeeper). They use consensus
algorithms (see [Chapter 10](/en/ch10#ch_consistency)) to provide fault tolerance and protection against split-brain.
Each node registers itself in ZooKeeper, and ZooKeeper maintains the authoritative mapping of shards
to nodes. Other actors, such as the routing tier or the sharding-aware client, can subscribe to this
information in ZooKeeper. Whenever a shard changes ownership, or a node is added or removed,
ZooKeeper notifies the routing tier so that it can keep its routing information up to date.

{{< figure src="/fig/ddia_0708.png" id="fig_sharding_zookeeper" caption="Figure 7-8. Using ZooKeeper to keep track of assignment of shards to nodes." class="w-full my-4" >}}

For example, HBase and SolrCloud use ZooKeeper to manage shard assignment, and Kubernetes uses etcd
to keep track of which service instance is running where. MongoDB has a similar architecture, but it
relies on its own *config server* implementation and *mongos* daemons as the routing tier. Kafka,
YugabyteDB, and TiDB use built-in implementations of the Raft consensus protocol to perform this
coordination function.

Cassandra, ScyllaDB, and Riak take a different approach: they use a *gossip protocol* among the
nodes to disseminate any changes in cluster state. This provides much weaker consistency than a
consensus protocol; it is possible to have split brain, in which different parts of the cluster have
different node assignments for the same shard. Leaderless databases can tolerate this because they
generally make weak consistency guarantees anyway (see [“Limitations of Quorum Consistency”](/en/ch6#sec_replication_quorum_limitations)).

When using a routing tier or when sending requests to a random node, clients still need to find the
IP addresses to connect to. These are not as fast-changing as the assignment of shards to nodes,
so it is often sufficient to use DNS for this purpose.

This discussion of request routing has focused on finding the shard for an individual key, which is
most relevant for sharded OLTP databases. Analytic databases often use sharding as well, but they
typically have a very different kind of query execution: rather than executing in a single shard, a
query typically needs to aggregate and join data from many different shards in parallel. We will
discuss techniques for such parallel query execution in [“JOIN and GROUP BY”](/en/ch11#sec_batch_join).

## Sharding and Secondary Indexes {#sec_sharding_secondary_indexes}

The sharding schemes we have discussed so far rely on the client knowing the partition key for any
record it wants to access. This is most easily done in a key-value data model, where the partition
key is the first part of the primary key (or the entire primary key), and so we can use the
partition key to determine the shard, and thus route reads and writes to the node that is
responsible for that key.

The situation becomes more complicated if secondary indexes are involved (see also
[“Multi-Column and Secondary Indexes”](/en/ch4#sec_storage_index_multicolumn)). A secondary index usually doesn’t identify a record uniquely but
rather is a way of searching for occurrences of a particular value: find all actions by user `123`,
find all articles containing the word `hogwash`, find all cars whose color is `red`, and so on.

Key-value stores often don’t have secondary indexes, but they are the bread and butter of relational
databases, they are common in document databases too, and they are the *raison d’être* of full-text
search engines such as Solr and Elasticsearch. The problem with secondary indexes is that they don’t
map neatly to shards. There are two main approaches to sharding a database with secondary indexes:
local and global indexes.

### Local Secondary Indexes {#id166}

For example, imagine you are operating a website for selling used cars (illustrated in
[Figure 7-9](/en/ch7#fig_sharding_local_secondary)). Each listing has a unique ID, and you use that ID as partition
key for sharding (for example, IDs 0 to 499 in shard 0, IDs 500 to 999 in shard 1, etc.).

If you want to let users search for cars, allowing them to filter by color and by make, you need a
secondary index on `color` and `make` (in a document database these would be fields; in a relational
database they would be columns). If you have declared the index, the database can perform the
indexing automatically. For example, whenever a red car is added to the database, the database shard
automatically adds its ID to the list of IDs for the index entry `color:red`. As discussed in
[Chapter 4](/en/ch4#ch_storage), that list of IDs is also called a *postings list*.

{{< figure src="/fig/ddia_0709.png" id="fig_sharding_local_secondary" caption="Figure 7-9. Local secondary indexes: each shard indexes only the records within its own shard." class="w-full my-4" >}}

> [!WARNING] WARNING

If your database only supports a key-value model, you might be tempted to implement a secondary
index yourself by creating a mapping from values to IDs in application code. If you go down this
route, you need to take great care to ensure your indexes remain consistent with the underlying
data. Race conditions and intermittent write failures (where some changes were saved but others
weren’t) can very easily cause the data to go out of sync—see [“The need for multi-object transactions”](/en/ch8#sec_transactions_need).

--------

In this indexing approach, each shard is completely separate: each shard maintains its own secondary
indexes, covering only the records in that shard. It doesn’t care what data is stored in other
shards. Whenever you write to the database—to add, remove, or update a record—you only need to
deal with the shard that contains the record that you are writing. For that reason, this type of
secondary index is known as a *local index*. In an information retrieval context it is also known as
a *document-partitioned index* [^30].

When reading from a local secondary index, if you already know the partition key of the record
you’re looking for, you can just perform the search on the appropriate shard. Moreover, if you only
want *some* results, and you don’t need all, you can send the request to any shard.

However, if you want all the results and don’t know their partition key in advance, you need to send
the query to all shards, and combine the results you get back, because the matching records might be
scattered across all the shards. In [Figure 7-9](/en/ch7#fig_sharding_local_secondary), red cars appear in both shard
0 and shard 1.

This approach to querying a sharded database can make read queries on secondary indexes quite
expensive. Even if you query the shards in parallel, it is prone to tail latency amplification (see
[“Use of Response Time Metrics”](/en/ch2#sec_introduction_slo_sla)). It also limits the scalability of your application: adding more
shards lets you store more data, but it doesn’t increase your query throughput if every shard has to
process every query anyway.

Nevertheless, local secondary indexes are widely used [^31]: for example, MongoDB, Riak, Cassandra [^32], Elasticsearch [^33], 
SolrCloud, and VoltDB [^34] all use local secondary indexes.

### Global Secondary Indexes {#id167}

Rather than each shard having its own, local secondary index, we can construct a *global index* that
covers data in all shards. However, we can’t just store that index on one node, since it would
likely become a bottleneck and defeat the purpose of sharding. A global index must also be sharded,
but it can be sharded differently from the primary key index.

[Figure 7-10](/en/ch7#fig_sharding_global_secondary) illustrates what this could look like: the IDs of red cars from
all shards appear under `color:red` in the index, but the index is sharded so that colors starting
with the letters *a* to *r* appear in shard 0 and colors starting with *s* to *z* appear in shard 1.
The index on the make of car is partitioned similarly (with the shard boundary being between *f* and *h*).

{{< figure src="/fig/ddia_0710.png" id="fig_sharding_global_secondary" caption="Figure 7-10. A global secondary index reflects data from all shards, and is itself sharded by the indexed value." class="w-full my-4" >}}

This kind of index is also called *term-partitioned* [^30]:
recall from [“Full-Text Search”](/en/ch4#sec_storage_full_text) that in full-text search, a *term* is a keyword in a text that
you can search for. Here we generalize it to mean any value that you can search for in the secondary index.

The global index uses the term as partition key, so that when you’re looking for a particular term
or value, you can figure out which shard you need to query. As before, a shard can contain a
contiguous range of terms (as in [Figure 7-10](/en/ch7#fig_sharding_global_secondary)), or you can assign terms to
shards based on a hash of the term.

Global indexes have the advantage that a query with a single condition (such as *color = red*) only
needs to read from a single shard to fetch the postings list. However, if you want to fetch records
and not just IDs, you still have to read from all the shards that are responsible for those IDs.

If you have multiple search conditions or terms (e.g., searching for cars of a certain color and a
certain make, or searching for multiple words occurring in the same text), it’s likely that those
terms will be assigned to different shards. To compute the logical AND of the two conditions, the
system needs to find all the IDs that occur in both of the postings lists. That’s no problem if the
postings lists are short, but if they are long, it can be slow to send them over the network to
compute their intersection [^30].

Another challenge with global secondary indexes is that writes are more complicated than with local
indexes, because writing a single record might affect multiple shards of the index (every term in
the document might be on a different shard). This makes it harder to keep the secondary index in
sync with the underlying data. One option is to use a distributed transaction to atomically update
the shards storing the primary record and its secondary indexes (see [Chapter 8](/en/ch8#ch_transactions)).

Global secondary indexes are used by CockroachDB, TiDB, and YugabyteDB; DynamoDB supports both local
and global secondary indexes. In the case of DynamoDB, writes are asynchronously reflected in global
indexes, so reads from a global index may be stale (similarly to replication lag, as in [“Problems with Replication Lag”](/en/ch6#sec_replication_lag)).
Nevertheless, global indexes are useful if read throughput is higher than write throughput, and if
the postings lists are not too long.


## Summary {#summary}

In this chapter we explored different ways of sharding a large dataset into smaller subsets.
Sharding is necessary when you have so much data that storing and processing it on a single machine
is no longer feasible.

The goal of sharding is to spread the data and query load evenly across multiple machines, avoiding
hot spots (nodes with disproportionately high load). This requires choosing a sharding scheme that
is appropriate to your data, and rebalancing the shards when nodes are added to or removed from the cluster.

We discussed two main approaches to sharding:

* *Key range sharding*, where keys are sorted, and a shard owns all the keys from some minimum up to
 some maximum. Sorting has the advantage that efficient range queries are possible, but there is a
 risk of hot spots if the application often accesses keys that are close together in the sorted
 order.

 In this approach, shards are typically rebalanced by splitting the range into two subranges when a
 shard gets too big.
* *Hash sharding*, where a hash function is applied to each key, and a shard owns a range of hash
 values (or another consistent hashing algorithm may be used to map hashes to shards). This method
 destroys the ordering of keys, making range queries inefficient, but it may distribute load more
 evenly.

 When sharding by hash, it is common to create a fixed number of shards in advance, to assign several
 shards to each node, and to move entire shards from one node to another when nodes are added or
 removed. Splitting shards, like with key ranges, is also possible.

It is common to use the first part of the key as the partition key (i.e., to identify the shard),
and to sort records within that shard by the rest of the key. That way you can still have efficient
range queries among the records with the same partition key.

We also discussed the interaction between sharding and secondary indexes. A secondary index also
needs to be sharded, and there are two methods:

* *Local secondary indexes*, where the secondary indexes are stored
 in the same shard as the primary key and value. This means that only a single shard needs to be
 updated on write, but a lookup of the secondary index requires reading from all shards.
* *Global secondary indexes*, which are sharded separately based on
 the indexed values. An entry in the secondary index may refer to records from all shards of the
 primary key. When a record is written, several secondary index shards may need to be updated;
 however, a read of the postings list can be served from a single shard (fetching the actual
 records still requires reading from multiple shards).

Finally, we discussed techniques for routing queries to the appropriate shard, and how a
coordination service is often used to keep track of the assignment of shards to nodes.

By design, every shard operates mostly independently—that’s what allows a sharded database to scale
to multiple machines. However, operations that need to write to several shards can be problematic:
for example, what happens if the write to one shard succeeds, but another fails? We will address
that question in the following chapters.


### References

[^1]: Claire Giordano. [Understanding partitioning and sharding in Postgres and Citus](https://www.citusdata.com/blog/2023/08/04/understanding-partitioning-and-sharding-in-postgres-and-citus/). *citusdata.com*, August 2023. Archived at [perma.cc/8BTK-8959](https://perma.cc/8BTK-8959) 
[^2]: Brandur Leach. [Partitioning in Postgres, 2022 edition](https://brandur.org/fragments/postgres-partitioning-2022). *brandur.org*, October 2022. Archived at [perma.cc/Z5LE-6AKX](https://perma.cc/Z5LE-6AKX) 
[^3]: Raph Koster. [Database “sharding” came from UO?](https://www.raphkoster.com/2009/01/08/database-sharding-came-from-uo/) *raphkoster.com*, January 2009. Archived at [perma.cc/4N9U-5KYF](https://perma.cc/4N9U-5KYF) 
[^4]: Garrett Fidalgo. [Herding elephants: Lessons learned from sharding Postgres at Notion](https://www.notion.com/blog/sharding-postgres-at-notion). *notion.com*, October 2021. Archived at [perma.cc/5J5V-W2VX](https://perma.cc/5J5V-W2VX) 
[^5]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ) 
[^6]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559) 
[^7]: Marco Slot. [Citus 12: Schema-based sharding for PostgreSQL](https://www.citusdata.com/blog/2023/07/18/citus-12-schema-based-sharding-for-postgres/). *citusdata.com*, July 2023. Archived at [perma.cc/R874-EC9W](https://perma.cc/R874-EC9W) 
[^8]: Robisson Oliveira. [Reducing the Scope of Impact with Cell-Based Architecture](https://docs.aws.amazon.com/pdfs/wellarchitected/latest/reducing-scope-of-impact-with-cell-based-architecture/reducing-scope-of-impact-with-cell-based-architecture.pdf). AWS Well-Architected white paper, Amazon Web Services, September 2023. Archived at [perma.cc/4KWW-47NR](https://perma.cc/4KWW-47NR) 
[^9]: Gwen Shapira. [Things DBs Don’t Do - But Should](https://www.thenile.dev/blog/things-dbs-dont-do). *thenile.dev*, February 2023. Archived at [perma.cc/C3J4-JSFW](https://perma.cc/C3J4-JSFW) 
[^10]: Malte Schwarzkopf, Eddie Kohler, M. Frans Kaashoek, and Robert Morris. [Position: GDPR Compliance by Construction](https://cs.brown.edu/people/malte/pub/papers/2019-poly-gdpr.pdf). At *Towards Polystores that manage multiple Databases, Privacy, Security and/or Policy Issues for Heterogenous Data* (Poly), August 2019. [doi:10.1007/978-3-030-33752-0\_3](https://doi.org/10.1007/978-3-030-33752-0_3) 
[^11]: Gwen Shapira. [Introducing pg\_karnak: Transactional schema migration across tenant databases](https://www.thenile.dev/blog/distributed-ddl). *thenile.dev*, November 2024. Archived at [perma.cc/R5RD-8HR9](https://perma.cc/R5RD-8HR9) 
[^12]: Arka Ganguli, Guido Iaquinti, Maggie Zhou, and Rafael Chacón. [Scaling Datastores at Slack with Vitess](https://slack.engineering/scaling-datastores-at-slack-with-vitess/). *slack.engineering*, December 2020. Archived at [perma.cc/UW8F-ALJK](https://perma.cc/UW8F-ALJK) 
[^13]: Ikai Lan. [App Engine Datastore Tip: Monotonically Increasing Values Are Bad](https://ikaisays.com/2011/01/25/app-engine-datastore-tip-monotonically-increasing-values-are-bad/). *ikaisays.com*, January 2011. Archived at [perma.cc/BPX8-RPJB](https://perma.cc/BPX8-RPJB) 
[^14]: Enis Soztutar. [Apache HBase Region Splitting and Merging](https://www.cloudera.com/blog/technical/apache-hbase-region-splitting-and-merging.html). *cloudera.com*, February 2013. Archived at [perma.cc/S9HS-2X2C](https://perma.cc/S9HS-2X2C) 
[^15]: Eric Evans. [Rethinking Topology in Cassandra](https://www.youtube.com/watch?v=Qz6ElTdYjjU). At *Cassandra Summit*, June 2013. Archived at [perma.cc/2DKM-F438](https://perma.cc/2DKM-F438) 
[^16]: Martin Kleppmann. [Java’s hashCode Is Not Safe for Distributed Systems](https://martin.kleppmann.com/2012/06/18/java-hashcode-unsafe-for-distributed-systems.html). *martin.kleppmann.com*, June 2012. Archived at [perma.cc/LK5U-VZSN](https://perma.cc/LK5U-VZSN) 
[^17]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022. 
[^18]: Brandon Williams. [Virtual Nodes in Cassandra 1.2](https://www.datastax.com/blog/virtual-nodes-cassandra-12). *datastax.com*, December 2012. Archived at [perma.cc/N385-EQXV](https://perma.cc/N385-EQXV) 
[^19]: Branimir Lambov. [New Token Allocation Algorithm in Cassandra 3.0](https://www.datastax.com/blog/new-token-allocation-algorithm-cassandra-30). *datastax.com*, January 2016. Archived at [perma.cc/2BG7-LDWY](https://perma.cc/2BG7-LDWY) 
[^20]: David Karger, Eric Lehman, Tom Leighton, Rina Panigrahy, Matthew Levine, and Daniel Lewin. [Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web](https://people.csail.mit.edu/karger/Papers/web.pdf). At *29th Annual ACM Symposium on Theory of Computing* (STOC), May 1997. [doi:10.1145/258533.258660](https://doi.org/10.1145/258533.258660) 
[^21]: Damian Gryski. [Consistent Hashing: Algorithmic Tradeoffs](https://dgryski.medium.com/consistent-hashing-algorithmic-tradeoffs-ef6b8e2fcae8). *dgryski.medium.com*, April 2018. Archived at [perma.cc/B2WF-TYQ8](https://perma.cc/B2WF-TYQ8) 
[^22]: David G. Thaler and Chinya V. Ravishankar. [Using name-based mappings to increase hit rates](https://www.cs.kent.edu/~javed/DL/web/p1-thaler.pdf). *IEEE/ACM Transactions on Networking*, volume 6, issue 1, pages 1–14, February 1998. [doi:10.1109/90.663936](https://doi.org/10.1109/90.663936) 
[^23]: John Lamping and Eric Veach. [A Fast, Minimal Memory, Consistent Hash Algorithm](https://arxiv.org/abs/1406.2294). *arxiv.org*, June 2014. 
[^24]: Samuel Axon. [3% of Twitter’s Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX) 
[^25]: Gerald Guo and Thawan Kooburat. [Scaling services with Shard Manager](https://engineering.fb.com/2020/08/24/production-engineering/scaling-services-with-shard-manager/). *engineering.fb.com*, August 2020. Archived at [perma.cc/EFS3-XQYT](https://perma.cc/EFS3-XQYT) 
[^26]: Sangmin Lee, Zhenhua Guo, Omer Sunercan, Jun Ying, Thawan Kooburat, Suryadeep Biswal, Jun Chen, Kun Huang, Yatpang Cheung, Yiding Zhou, Kaushik Veeraraghavan, Biren Damani, Pol Mauri Ruiz, Vikas Mehta, and Chunqiang Tang. [Shard Manager: A Generic Shard Management Framework for Geo-distributed Applications](https://dl.acm.org/doi/pdf/10.1145/3477132.3483546). At *28th ACM SIGOPS Symposium on Operating Systems Principles* (SOSP), pages 553–569, October 2021. [doi:10.1145/3477132.3483546](https://doi.org/10.1145/3477132.3483546) 
[^27]: Scott Lystig Fritchie. [A Critique of Resizable Hash Tables: Riak Core & Random Slicing](https://www.infoq.com/articles/dynamo-riak-random-slicing/). *infoq.com*, August 2018. Archived at [perma.cc/RPX7-7BLN](https://perma.cc/RPX7-7BLN) 
[^28]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/6S7P-GLM4](https://perma.cc/6S7P-GLM4) 
[^29]: Rich Houlihan. [DynamoDB adaptive capacity: smooth performance for chaotic workloads (DAT327)](https://www.youtube.com/watch?v=kMY0_m29YzU). At *AWS re:Invent*, November 2017. 
[^30]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/) 
[^31]: Michael Busch, Krishna Gade, Brian Larson, Patrick Lok, Samuel Luckenbill, and Jimmy Lin. [Earlybird: Real-Time Search at Twitter](https://cs.uwaterloo.ca/~jimmylin/publications/Busch_etal_ICDE2012.pdf). At *28th IEEE International Conference on Data Engineering* (ICDE), April 2012. [doi:10.1109/ICDE.2012.149](https://doi.org/10.1109/ICDE.2012.149) 
[^32]: Nadav Har’El. [Indexing in Cassandra 3](https://github.com/scylladb/scylladb/wiki/Indexing-in-Cassandra-3). *github.com*, April 2017. Archived at [perma.cc/3ENV-8T9P](https://perma.cc/3ENV-8T9P) 
[^33]: Zachary Tong. [Customizing Your Document Routing](https://www.elastic.co/blog/customizing-your-document-routing/). *elastic.co*, June 2013. Archived at [perma.cc/97VM-MREN](https://perma.cc/97VM-MREN) 
[^34]: Andrew Pavlo. [H-Store Frequently Asked Questions](https://hstore.cs.brown.edu/documentation/faq/). *hstore.cs.brown.edu*, October 2013. Archived at [perma.cc/X3ZA-DW6Z](https://perma.cc/X3ZA-DW6Z) 


================================================
FILE: content/en/ch8.md
================================================
---
title: "8. Transactions"
weight: 208
breadcrumbs: false
---

<a id="ch_transactions"></a>

![](/map/ch07.png)

> *Some authors have claimed that general two-phase commit is too expensive to support, because of the
> performance or availability problems that it brings. We believe it is better to have application
> programmers deal with performance problems due to overuse of transactions as bottlenecks arise,
> rather than always coding around the lack of transactions.*
>
> James Corbett et al., *Spanner: Google’s Globally-Distributed Database* (2012)

In the harsh reality of data systems, many things can go wrong:

* The database software or hardware may fail at any time (including in the middle of a write
 operation).
* The application may crash at any time (including halfway through a series of operations).
* Interruptions in the network can unexpectedly cut off the application from the database, or one
 database node from another.
* Several clients may write to the database at the same time, overwriting each other’s changes.
* A client may read data that doesn’t make sense because it has only partially been updated.
* Race conditions between clients can cause surprising bugs.

In order to be reliable, a system has to deal with these faults and ensure that they don’t cause
catastrophic failure of the entire system. However, implementing fault-tolerance mechanisms is a lot
of work. It requires a lot of careful thinking about all the things that can go wrong, and a lot of
testing to ensure that the solution actually works.

For decades, *transactions* have been the mechanism of choice for simplifying these issues. A
transaction is a way for an application to group several reads and writes together into a logical
unit. Conceptually, all the reads and writes in a transaction are executed as one operation: either
the entire transaction succeeds (*commit*) or it fails (*abort*, *rollback*). If it fails, the
application can safely retry. With transactions, error handling becomes much simpler for an
application, because it doesn’t need to worry about partial failure—i.e., the case where some
operations succeed and some fail (for whatever reason).

If you have spent years working with transactions, they may seem obvious, but we shouldn’t take them
for granted. Transactions are not a law of nature; they were created with a purpose, namely to
*simplify the programming model* for applications accessing a database. By using transactions, the
application is free to ignore certain potential error scenarios and concurrency issues, because the
database takes care of them instead (we call these *safety guarantees*).

Not every application needs transactions, and sometimes there are advantages to weakening
transactional guarantees or abandoning them entirely (for example, to achieve higher performance or
higher availability). Some safety properties can be achieved without transactions. On the other
hand, transactions can prevent a lot of grief: for example, the technical cause behind the Post
Office Horizon scandal (see [“How Important Is Reliability?”](/en/ch2#sidebar_reliability_importance)) was probably a lack of ACID
transactions in the underlying accounting system [^1].

How do you figure out whether you need transactions? In order to answer that question, we first need
to understand exactly what safety guarantees transactions can provide, and what costs are associated
with them. Although transactions seem straightforward at first glance, there are actually many
subtle but important details that come into play.

In this chapter, we will examine many examples of things that can go wrong, and explore the
algorithms that databases use to guard against those issues. We will go especially deep in the area
of concurrency control, discussing various kinds of race conditions that can occur and how
databases implement isolation levels such as *read committed*, *snapshot isolation*, and
*serializability*.

Concurrency control is relevant for both single-node and distributed databases. Later in this
chapter, in [“Distributed Transactions”](/en/ch8#sec_transactions_distributed), we will examine the *two-phase commit* protocol and
the challenge of achieving atomicity in a distributed transaction.

## What Exactly Is a Transaction? {#sec_transactions_overview}

Almost all relational databases today, and some nonrelational databases, support transactions. Most
of them follow the style that was introduced in 1975 by IBM System R, the first SQL database [^2] [^3] [^4].
Although some implementation details have changed, the general idea has remained virtually the same
for 50 years: the transaction support in MySQL, PostgreSQL, Oracle, SQL Server, etc., is uncannily
similar to that of System R.

In the late 2000s, nonrelational (NoSQL) databases started gaining popularity. They aimed to
improve upon the relational status quo by offering a choice of new data models (see
[Chapter 3](/en/ch3#ch_datamodels)), and by including replication ([Chapter 6](/en/ch6#ch_replication)) and sharding
([Chapter 7](/en/ch7#ch_sharding)) by default. Transactions were the main casualty of this movement: many of this
generation of databases abandoned transactions entirely, or redefined the word to describe a
much weaker set of guarantees than had previously been understood.

The hype around NoSQL distributed databases led to a popular belief that transactions were
fundamentally unscalable, and that any large-scale system would have to abandon transactions in
order to maintain good performance and high availability. More recently, that belief has turned out
to be wrong. So-called “NewSQL” databases such as CockroachDB [^5], TiDB [^6], Spanner [^7], FoundationDB [^8],
and Yugabyte have shown that transactional systems can scale to large data volumes and high
throughput. These systems combine sharding with consensus protocols ([Chapter 10](/en/ch10#ch_consistency)) to provide
strong ACID guarantees at scale.

However, that doesn’t mean that every system must be transactional either: like every other
technical design choice, transactions have advantages and limitations. In order to understand those
trade-offs, let’s go into the details of the guarantees that transactions can provide—both in normal
operation and in various extreme (but realistic) circumstances.

### The Meaning of ACID {#sec_transactions_acid}

The safety guarantees provided by transactions are often described by the well-known acronym *ACID*,
which stands for *Atomicity*, *Consistency*, *Isolation*, and *Durability*. It was coined in 1983 by
Theo Härder and Andreas Reuter [^9] in an effort to establish precise terminology for fault-tolerance mechanisms in databases.

However, in practice, one database’s implementation of ACID does not equal another’s implementation.
For example, as we shall see, there is a lot of ambiguity around the meaning of *isolation* [^10].
The high-level idea is sound, but the devil is in the details. Today, when a system claims to be
“ACID compliant,” it’s unclear what guarantees you can actually expect. ACID has unfortunately
become mostly a marketing term.

(Systems that do not meet the ACID criteria are sometimes called *BASE*, which stands for
*Basically Available*, *Soft state*, and *Eventual consistency* [^11].
This is even more vague than the definition of ACID. It seems that the only sensible definition of
BASE is “not ACID”; i.e., it can mean almost anything you want.)

Let’s dig into the definitions of atomicity, consistency, isolation, and durability, as this will let
us refine our idea of transactions.

#### Atomicity {#sec_transactions_acid_atomicity}

In general, *atomic* refers to something that cannot be broken down into smaller parts. The word
means similar but subtly different things in different branches of computing. For example, in
multi-threaded programming, if one thread executes an atomic operation, that means there is no way
that another thread could see the half-finished result of the operation. The system can only be in
the state it was before the operation or after the operation, not something in between.

By contrast, in the context of ACID, atomicity is *not* about concurrency. It does not describe
what happens if several processes try to access the same data at the same time, because that is
covered under the letter *I*, for *isolation* (see [“Isolation”](/en/ch8#sec_transactions_acid_isolation)).

Rather, ACID atomicity describes what happens if a client wants to make several writes, but a fault
occurs after some of the writes have been processed—for example, a process crashes, a network
connection is interrupted, a disk becomes full, or some integrity constraint is violated.
If the writes are grouped together into an atomic transaction, and the transaction cannot be
completed (*committed*) due to a fault, then the transaction is *aborted* and the database must
discard or undo any writes it has made so far in that transaction.

Without atomicity, if an error occurs partway through making multiple changes, it’s difficult to
know which changes have taken effect and which haven’t. The application could try again, but that
risks making the same change twice, leading to duplicate or incorrect data. Atomicity simplifies
this problem: if a transaction was aborted, the application can be sure that it didn’t change
anything, so it can safely be retried.

The ability to abort a transaction on error and have all writes from that transaction discarded is
the defining feature of ACID atomicity. Perhaps *abortability* would have been a better term than
*atomicity*, but we will stick with *atomicity* since that’s the usual word.

#### Consistency {#sec_transactions_acid_consistency}

The word *consistency* is terribly overloaded:

* In [Chapter 6](/en/ch6#ch_replication) we discussed *replica consistency* and the issue of *eventual consistency*
 that arises in asynchronously replicated systems (see [“Problems with Replication Lag”](/en/ch6#sec_replication_lag)).
* A *consistent snapshot* of a database, e.g. for a backup, is a snapshot of the entire database as
 it existed at one moment in time. More precisely, it is consistent with the happens-before
 relation (see [“The “happens-before” relation and concurrency”](/en/ch6#sec_replication_happens_before)): that is, if the snapshot contains a value that
 was written at a particular time, then it also reflects all the writes that happened before that
 value was written.
* *Consistent hashing* is an approach to sharding that some systems use for rebalancing (see
 [“Consistent hashing”](/en/ch7#sec_sharding_consistent_hashing)).
* In the CAP theorem (see [Chapter 10](/en/ch10#ch_consistency)), the word *consistency* is used to mean
 *linearizability* (see [“Linearizability”](/en/ch10#sec_consistency_linearizability)).
* In the context of ACID, *consistency* refers to an application-specific notion of the database
 being in a “good state.”

It’s unfortunate that the same word is used with at least five different meanings.

The idea of ACID consistency is that you have certain statements about your data (*invariants*) that
must always be true—for example, in an accounting system, credits and debits across all accounts
must always be balanced. If a transaction starts with a database that is valid according to these
invariants, and any writes during the transaction preserve the validity, then you can be sure that
the invariants are always satisfied. (An invariant may be temporarily violated during transaction
execution, but it should be satisfied again at transaction commit.)

If you want the database to enforce your invariants, you need to declare them as *constraints* as
part of the schema. For example, foreign key constraints, uniqueness constraints, or check
constraints (which restrict the values that can appear in an individual row) are often used to
model specific types of invariants. More complex consistency requirements can sometimes be modeled
using triggers or materialized views [^12].

However, complex invariants can be difficult or impossible to model using the constraints that
databases usually provide. In that case, it’s the application’s responsibility to define its
transactions correctly so that they preserve consistency. If you write bad data that violates your
invariants, but you haven’t declared those invariants, the database can’t stop you. As such, the C
in ACID often depends on how the application uses the database, and it’s not a property of the
database alone.

#### Isolation {#sec_transactions_acid_isolation}

Most databases are accessed by several clients at the same time. That is no problem if they are
reading and writing different parts of the database, but if they are accessing the same database
records, you can run into concurrency problems (race conditions).

[Figure 8-1](/en/ch8#fig_transactions_increment) is a simple example of this kind of problem. Say you have two clients
simultaneously incrementing a counter that is stored in a database. Each client needs to read the
current value, add 1, and write the new value back (assuming there is no increment operation built
into the database). In [Figure 8-1](/en/ch8#fig_transactions_increment) the counter should have increased from 42 to
44, because two increments happened, but it actually only went to 43 because of the race condition.

{{< figure src="/fig/ddia_0801.png" id="fig_transactions_increment" caption="Figure 8-1. A race condition between two clients concurrently incrementing a counter." class="w-full my-4" >}}


*Isolation* in the sense of ACID means that concurrently executing transactions are isolated from
each other: they cannot step on each other’s toes. The classic database textbooks formalize
isolation as *serializability*, which means that each transaction can pretend that it is the only
transaction running on the entire database. The database ensures that when the transactions have
committed, the result is the same as if they had run *serially* (one after another), even though in
reality they may have run concurrently [^13].

However, serializability has a performance cost. In practice, many databases use forms of isolation
that are weaker than serializability: that is, they allow concurrent transactions to interfere with
each other in limited ways. Some popular databases, such as Oracle, don’t even implement it (Oracle
has an isolation level called “serializable,” but it actually implements *snapshot isolation*, which
is a weaker guarantee than serializability [^10] [^14]).
This means that some kinds of race conditions can still occur. We will explore snapshot isolation
and other forms of isolation in [“Weak Isolation Levels”](/en/ch8#sec_transactions_isolation_levels).

#### Durability {#durability}

The purpose of a database system is to provide a safe place where data can be stored without fear of
losing it. *Durability* is the promise that once a transaction has committed successfully, any data it
has written will not be forgotten, even if there is a hardware fault or the database crashes.

In a single-node database, durability typically means that the data has been written to nonvolatile
storage such as a hard drive or SSD. Regular file writes are usually buffered in memory before being
sent to the disk sometime later, which means they would be lost if there is a sudden power failure;
many databases therefore use the `fsync()` system call to ensure the data really has been written to
disk. Databases usually also have a write-ahead log or similar (see [“Making B-trees reliable”](/en/ch4#sec_storage_btree_wal)),
which allows them to recover in the event that a crash occurs partway through a write.

In a replicated database, durability may mean that the data has been successfully copied to some
number of nodes. In order to provide a durability guarantee, a database must wait until these writes
or replications are complete before reporting a transaction as successfully committed. However,
as discussed in [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability), perfect durability does not exist: if all your
hard disks and all your backups are destroyed at the same time, there’s obviously nothing your
database can do to save you.

--------

<a id="sidebar_transactions_durability"></a>

> [!TIP] REPLICATION AND DURABILITY

Historically, durability meant writing to an archive tape. Then it was understood as writing to a disk
or SSD. More recently, it has been adapted to mean replication. Which implementation is better?

The truth is, nothing is perfect:

* If you write to disk and the machine dies, even though your data isn’t lost, it is inaccessible
 until you either fix the machine or transfer the disk to another machine. Replicated systems can
 remain available.
* A correlated fault—a power outage or a bug that crashes every node on a particular input—can
 knock out all replicas at once (see [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability)), losing any data that is
 only in memory. Writing to disk is therefore still relevant for replicated databases.
* In an asynchronously replicated system, recent writes may be lost when the leader becomes
 unavailable (see [“Handling Node Outages”](/en/ch6#sec_replication_failover)).
* When the power is suddenly cut, SSDs in particular have been shown to sometimes violate the
 guarantees they are supposed to provide: even `fsync` isn’t guaranteed to work correctly [^15].
 Disk firmware can have bugs, just like any other kind of software [^16] [^17],
 e.g. causing drives to fail after exactly 32,768 hours of operation [^18].
 And `fsync` is hard to use; even PostgreSQL used it incorrectly for over 20 years [^19] [^20] [^21].
* Subtle interactions between the storage engine and the filesystem implementation can lead to bugs
 that are hard to track down, and may cause files on disk to be corrupted after a crash [^22] [^23].
 Filesystem errors on one replica can sometimes spread to other replicas as well [^24].
* Data on disk can gradually become corrupted without this being detected [^25].
 If data has been corrupted for some time, replicas and recent backups may also be corrupted. In
 this case, you will need to try to restore the data from a historical backup.
* One study of SSDs found that between 30% and 80% of drives develop at least one bad block during
 the first four years of operation, and only some of these can be corrected by the firmware [^26].
 Magnetic hard drives have a lower rate of bad sectors, but a higher rate of complete failure than SSDs.
* When a worn-out SSD (that has gone through many write/erase cycles) is disconnected from power,
 it can start losing data within a timescale of weeks to months, depending on the temperature [^27].
 This is less of a problem for drives with lower wear levels [^28].

In practice, there is no one technique that can provide absolute guarantees. There are only various
risk-reduction techniques, including writing to disk, replicating to remote machines, and
backups—and they can and should be used together. As always, it’s wise to take any theoretical
“guarantees” with a healthy grain of salt.

--------

### Single-Object and Multi-Object Operations {#sec_transactions_multi_object}

To recap, in ACID, atomicity and isolation describe what the database should do if a client makes
several writes within the same transaction:

Atomicity
: If an error occurs halfway through a sequence of writes, the transaction should be aborted, and
 the writes made up to that point should be discarded. In other words, the database saves you from
 having to worry about partial failure, by giving an all-or-nothing guarantee.

Isolation
: Concurrently running transactions shouldn’t interfere with each other. For example, if one
 transaction makes several writes, then another transaction should see either all or none of those
 writes, but not some subset.

These definitions assume that you want to modify several objects (rows, documents, records) at once.
Such *multi-object transactions* are often needed if several pieces of data need to be kept in sync.
[Figure 8-2](/en/ch8#fig_transactions_read_uncommitted) shows an example from an email application. To display the
number of unread messages for a user, you could query something like:

```
SELECT COUNT(*) FROM emails WHERE recipient_id = 2 AND unread_flag = true
```

{{< figure src="/fig/ddia_0802.png" id="fig_transactions_read_uncommitted" caption="Figure 8-2. Violating isolation: one transaction reads another transaction's uncommitted writes (a \"dirty read\")." class="w-full my-4" >}}


However, you might find this query to be too slow if there are many emails, and decide to store the
number of unread messages in a separate field (a kind of denormalization, which we discuss in
[“Normalization, Denormalization, and Joins”](/en/ch3#sec_datamodels_normalization)). Now, whenever a new message comes in, you have to increment the
unread counter as well, and whenever a message is marked as read, you also have to decrement the
unread counter.

In [Figure 8-2](/en/ch8#fig_transactions_read_uncommitted), user 2 experiences an anomaly: the mailbox listing shows
an unread message, but the counter shows zero unread messages because the counter increment has not
yet happened. (If an incorrect counter in an email application seems too insignificant, think of a
customer account balance instead of an unread counter, and a payment transaction instead of an
email.) Isolation would have prevented this issue by ensuring that user 2 sees either both the
inserted email and the updated counter, or neither, but not an inconsistent halfway point.

[Figure 8-3](/en/ch8#fig_transactions_atomicity) illustrates the need for atomicity: if an error occurs somewhere
over the course of the transaction, the contents of the mailbox and the unread counter might become out
of sync. In an atomic transaction, if the update to the counter fails, the transaction is aborted
and the inserted email is rolled back.

{{< figure src="/fig/ddia_0803.png" id="fig_transactions_atomicity" caption="Figure 8-3. Atomicity ensures that if an error occurs any prior writes from that transaction are undone, to avoid an inconsistent state." class="w-full my-4" >}}


Multi-object transactions require some way of determining which read and write operations belong to
the same transaction. In relational databases, that is typically done based on the client’s TCP
connection to the database server: on any particular connection, everything between a `BEGIN
TRANSACTION` and a `COMMIT` statement is considered to be part of the same transaction. If the TCP
connection is interrupted, the transaction must be aborted.

On the other hand, many nonrelational databases don’t have such a way of grouping operations
together. Even if there is a multi-object API (for example, a key-value store may have a *multi-put*
operation that updates several keys in one operation), that doesn’t necessarily mean it has
transaction semantics: the command may succeed for some keys and fail for others, leaving the
database in a partially updated state.

#### Single-object writes {#sec_transactions_single_object}

Atomicity and isolation also apply when a single object is being changed. For example, imagine you
are writing a 20 KB JSON document to a database:

* If the network connection is interrupted after the first 10 KB have been sent, does the
 database store that unparseable 10 KB fragment of JSON?
* If the power fails while the database is in the middle of overwriting the previous value on disk,
 do you end up with the old and new values spliced together?
* If another client reads that document while the write is in progress, will it see a partially
 updated value?

Those issues would be incredibly confusing, so storage engines almost universally aim to provide
atomicity and isolation on the level of a single object (such as a key-value pair) on one node.
Atomicity can be implemented using a log for crash recovery (see [“Making B-trees reliable”](/en/ch4#sec_storage_btree_wal)), and
isolation can be implemented using a lock on each object (allowing only one thread to access an
object at any one time).

Some databases also provide more complex atomic operations, such as an increment operation, which
removes the need for a read-modify-write cycle like that in [Figure 8-1](/en/ch8#fig_transactions_increment).
Similarly popular is a *conditional write* operation, which allows a write to happen only if the value
has not been concurrently changed by someone else (see [“Conditional writes (compare-and-set)”](/en/ch8#sec_transactions_compare_and_set)),
similar to a compare-and-set or compare-and-swap (CAS) operation in shared-memory concurrency.

--------

> [!NOTE]
> Strictly speaking, the term *atomic increment* uses the word *atomic* in the sense of multi-threaded
> programming. In the context of ACID, it should actually be called an *isolated* or *serializable*
> increment, but that’s not the usual term.

--------

These single-object operations are useful, as they can prevent lost updates when several clients try
to write to the same object concurrently (see [“Preventing Lost Updates”](/en/ch8#sec_transactions_lost_update)). However, they are
not transactions in the usual sense of the word. For example, the “lightweight transactions” feature
of Cassandra and ScyllaDB, and Aerospike’s “strong consistency” mode offer linearizable (see
[“Linearizability”](/en/ch10#sec_consistency_linearizability)) reads and conditional writes on a single object, but no
guarantees across multiple objects.

#### The need for multi-object transactions {#sec_transactions_need}

Do we need multi-object transactions at all? Would it be possible to implement any application with
only a key-value data model and single-object operations?

There are some use cases in which single-object inserts, updates, and deletes are sufficient.
However, in many other cases writes to several different objects need to be coordinated:

* In a relational data model, a row in one table often has a foreign key reference to a row in
 another table. Similarly, in a graph-like data model, a vertex has edges to other vertices.
 Multi-object transactions allow you to ensure that these references remain valid: when inserting
 several records that refer to one another, the foreign keys have to be correct and up to date,
 or the data becomes nonsensical.
* In a document data model, the fields that need to be updated together are often within the same
 document, which is treated as a single object—no multi-object transactions are needed when
 updating a single document. However, document databases lacking join functionality also encourage
 denormalization (see [“When to Use Which Model”](/en/ch3#sec_datamodels_document_summary)). When denormalized information needs to
 be updated, like in the example of [Figure 8-2](/en/ch8#fig_transactions_read_uncommitted), you need to update
 several documents in one go. Transactions are very useful in this situation to prevent
 denormalized data from going out of sync.
* In databases with secondary indexes (almost everything except pure key-value stores), the indexes
 also need to be updated every time you change a value. These indexes are different database
 objects from a transaction point of view: for example, without transaction isolation, it’s
 possible for a record to appear in one index but not another, because the update to the second
 index hasn’t happened yet (see [“Sharding and Secondary Indexes”](/en/ch7#sec_sharding_secondary_indexes)).

Such applications can still be implemented without transactions. However, error handling becomes
much more complicated without atomicity, and the lack of isolation can cause concurrency problems.
We will discuss those in [“Weak Isolation Levels”](/en/ch8#sec_transactions_isolation_levels), and explore alternative approaches
in [“Derived data versus distributed transactions”](/en/ch13#sec_future_derived_vs_transactions).

#### Handling errors and aborts {#handling-errors-and-aborts}

A key feature of a transaction is that it can be aborted and safely retried if an error occurred.
ACID databases are based on this philosophy: if the database is in danger of violating its guarantee
of atomicity, isolation, or durability, it would rather abandon the transaction entirely than allow
it to remain half-finished.

Not all systems follow that philosophy, though. In particular, datastores with leaderless
replication (see [“Leaderless Replication”](/en/ch6#sec_replication_leaderless)) work much more on a “best effort” basis, which
could be summarized as “the database will do as much as it can, and if it runs into an error, it
won’t undo something it has already done”—so it’s the application’s responsibility to recover from
errors.

Errors will inevitably happen, but many software developers prefer to think only about the happy
path rather than the intricacies of error handling. For example, popular object-relational mapping
(ORM) frameworks such as Rails’s ActiveRecord and Django don’t retry aborted transactions—the
error usually results in an exception bubbling up the stack, so any user input is thrown away and
the user gets an error message. This is a shame, because the whole point of aborts is to enable safe
retries.

Although retrying an aborted transaction is a simple and effective error handling mechanism, it
isn’t perfect:

* If the transaction actually succeeded, but the network was interrupted while the server tried to
 acknowledge the successful commit to the client (so it timed out from the client’s point of view),
 then retrying the transaction causes it to be performed twice—unless you have an additional
 application-level deduplication mechanism in place.
* If the error is due to overload or high contention between concurrent transactions, retrying the
 transaction will make the problem worse, not better. To avoid such feedback cycles, you can limit
 the number of retries, use exponential backoff, and handle overload-related errors differently
 from other errors (see [“When an overloaded system won’t recover”](/en/ch2#sidebar_metastable)).
* It is only worth retrying after transient errors (for example due to deadlock, isolation
 violation, temporary network interruptions, and failover); after a permanent error (e.g.,
 constraint violation) a retry would be pointless.
* If the transaction also has side effects outside of the database, those side effects may happen
 even if the transaction is aborted. For example, if you’re sending an email, you wouldn’t want to
 send the email again every time you retry the transaction. If you want to make sure that several
 different systems either commit or abort together, two-phase commit can help (we will discuss this
 in [“Two-Phase Commit (2PC)”](/en/ch8#sec_transactions_2pc)).
* If the client process crashes while retrying, any data it was trying to write to the database is lost.


## Weak Isolation Levels {#sec_transactions_isolation_levels}

If two transactions don’t access the same data, or if both are read-only, they can safely be run in
parallel, because neither depends on the other. Concurrency issues (race conditions) only come into
play when one transaction reads data that is concurrently modified by another transaction, or when
the two transactions try to modify the same data.

Concurrency bugs are hard to find by testing, because such bugs are only triggered when you get
unlucky with the timing. Such timing issues might occur very rarely, and are usually difficult to
reproduce. Concurrency is also very difficult to reason about, especially in a large application
where you don’t necessarily know which other pieces of code are accessing the database. Application
development is difficult enough if you just have one user at a time; having many concurrent users
makes it much harder still, because any piece of data could unexpectedly change at any time.

For that reason, databases have long tried to hide concurrency issues from application developers by
providing *transaction isolation*. In theory, isolation should make your life easier by letting you
pretend that no concurrency is happening: *serializable* isolation means that the database
guarantees that transactions have the same effect as if they ran *serially* (i.e., one at a time,
without any concurrency).

In practice, isolation is unfortunately not that simple. Serializable isolation has a performance
cost, and many databases don’t want to pay that price [^10]. It’s therefore common for systems to use
weaker levels of isolation, which protect against *some* concurrency issues, but not all. Those
levels of isolation are much harder to understand, and they can lead to subtle bugs, but they are
nevertheless used in practice [^29].

Concurrency bugs caused by weak transaction isolation are not just a theoretical problem. They have
caused substantial loss of money [^30] [^31] [^32], led to investigation by financial auditors [^33],
and caused customer data to be corrupted [^34]. A popular comment on revelations of such problems is “Use an ACID database if you’re handling
financial data!”—but that misses the point. Even many popular relational database systems (which
are usually considered “ACID”) use weak isolation, so they wouldn’t necessarily have prevented these
bugs from occurring.

--------

> [!NOTE]
> Incidentally, much of the banking system relies on text files that are exchanged via secure FTP [^35].
> In this context, having an audit trail and some human-level fraud prevention measures is actually
> more important than ACID properties.

--------

Those examples also highlight an important point: even if concurrency issues are rare in normal
operation, you have to consider the possibility that an attacker deliberately sends a burst of
highly concurrent requests to your API in an attempt to exploit concurrency bugs [^30]. Therefore, in order to build
applications that are reliable and secure, you have to ensure that such bugs are systematically
prevented.

In this section we will look at several weak (nonserializable) isolation levels that are used in
practice, and discuss in detail what kinds of race conditions can and cannot occur, so that you can
decide what level is appropriate to your application. Once we’ve done that, we will discuss
serializability in detail (see [“Serializability”](/en/ch8#sec_transactions_serializability)). Our discussion of isolation
levels will be informal, using examples. If you want rigorous definitions and analyses of their
properties, you can find them in the academic literature [^36] [^37] [^38] [^39].

### Read Committed {#sec_transactions_read_committed}

The most basic level of transaction isolation is *read committed*. It makes two guarantees:

1. When reading from the database, you will only see data that has been committed (no *dirty reads*).
2. When writing to the database, you will only overwrite data that has been committed (no *dirty writes*).

Some databases support an even weaker isolation level called *read uncommitted*. It prevents dirty
writes, but does not prevent dirty reads. Let’s discuss these two guarantees in more detail.

#### No dirty reads {#no-dirty-reads}

Imagine a transaction has written some data to the database, but the transaction has not yet committed or aborted.
Can another transaction see that uncommitted data? If yes, that is called a
*dirty read* [^3].

Transactions running at the read committed isolation level must prevent dirty reads. This means that
any writes by a transaction only become visible to others when that transaction commits (and then
all of its writes become visible at once). This is illustrated in [Figure 8-4](/en/ch8#fig_transactions_read_committed), where user 1 has set *x* = 3, but user 2’s *get x* still
returns the old value, 2, while user 1 has not yet committed.

{{< figure src="/fig/ddia_0804.png" id="fig_transactions_read_committed" caption="Figure 8-4. No dirty reads: user 2 sees the new value for x only after user 1's transaction has committed." class="w-full my-4" >}}

There are a few reasons why it’s useful to prevent dirty reads:

* If a transaction needs to update several rows, a dirty read means that another transaction may
 see some of the updates but not others. For example, in [Figure 8-2](/en/ch8#fig_transactions_read_uncommitted), the
 user sees the new unread email but not the updated counter. This is a dirty read of the email.
 Seeing the database in a partially updated state is confusing to users and may cause other
 transactions to take incorrect decisions.
* If a transaction aborts, any writes it has made need to be rolled back (like in
 [Figure 8-3](/en/ch8#fig_transactions_atomicity)). If the database allows dirty reads, that means a transaction may
 see data that is later rolled back—i.e., which is never actually committed to the database. Any
 transaction that read uncommitted data would also need to be aborted, leading to a problem called
 *cascading aborts*.

#### No dirty writes {#sec_transactions_dirty_write}

What happens if two transactions concurrently try to update the same row in a database? We don’t
know in which order the writes will happen, but we normally assume that the later write overwrites
the earlier write.

However, what happens if the earlier write is part of a transaction that has not yet committed, so
the later write overwrites an uncommitted value? This is called a *dirty write* [^36]. Transactions running at the read
committed isolation level must prevent dirty writes, usually by delaying the second write until the
first write’s transaction has committed or aborted.

By preventing dirty writes, this isolation level avoids some kinds of concurrency problems:

* If transactions update multiple rows, dirty writes can lead to a bad outcome. For example,
 consider [Figure 8-5](/en/ch8#fig_transactions_dirty_writes), which illustrates a used car sales website on which
 two people, Aaliyah and Bryce, are simultaneously trying to buy the same car. Buying a car requires
 two database writes: the listing on the website needs to be updated to reflect the buyer, and the
 sales invoice needs to be sent to the buyer. In the case of [Figure 8-5](/en/ch8#fig_transactions_dirty_writes), the
 sale is awarded to Bryce (because he performs the winning update to the `listings` table), but the
 invoice is sent to Aaliyah (because she performs the winning update to the `invoices` table). Read
 committed prevents such mishaps.
* However, read committed does *not* prevent the race condition between two counter increments in
 [Figure 8-1](/en/ch8#fig_transactions_increment). In this case, the second write happens after the first transaction
 has committed, so it’s not a dirty write. It’s still incorrect, but for a different reason—in
 [“Preventing Lost Updates”](/en/ch8#sec_transactions_lost_update) we will discuss how to make such counter increments safe.

{{< figure src="/fig/ddia_0805.png" id="fig_transactions_dirty_writes" caption="Figure 8-5. With dirty writes, conflicting writes from different transactions can be mixed up." class="w-full my-4" >}}


#### Implementing read committed {#sec_transactions_read_committed_impl}

Read committed is a very popular isolation level. It is the default setting in Oracle Database,
PostgreSQL, SQL Server, and many other databases [^10].

Most commonly, databases prevent dirty writes by using row-level locks: when a transaction wants to
modify a particular row (or document or some other object), it must first acquire a lock on that
row. It must then hold that lock until the transaction is committed or aborted. Only one transaction
can hold the lock for any given row; if another transaction wants to write to the same row, it must
wait until the first transaction is committed or aborted before it can acquire the lock and
continue. This locking is done automatically by databases in read committed mode (or stronger
isolation levels).

How do we prevent dirty reads? One option would be to use the same lock, and to require any
transaction that wants to read a row to briefly acquire the lock and then release it again
immediately after reading. This would ensure that a read couldn’t happen while a row has a
dirty, uncommitted value (because during that time the lock would be held by the transaction that
has made the write).

However, the approach of requiring read locks does not work well in practice, because one
long-running write transaction can force many other transactions to wait until the long-running
transaction has completed, even if the other transactions only read and do not write anything to the
database. This harms the response time of read-only transactions and is bad for
operability: a slowdown in one part of an application can have a knock-on effect in a completely
different part of the application, due to waiting for locks.

Nevertheless, locks are used to prevent dirty reads in some databases, such as IBM
Db2 and Microsoft SQL Server in the `read_committed_snapshot=off` setting [^29].

A more commonly used approach to preventing dirty reads is the one illustrated in [Figure 8-4](/en/ch8#fig_transactions_read_committed): for every
row that is written, the database remembers both the old committed value and the new value
set by the transaction that currently holds the write lock. While the transaction is ongoing, any
other transactions that read the row are simply given the old value. Only when the new value is
committed do transactions switch over to reading the new value (see
[“Multi-version concurrency control (MVCC)”](/en/ch8#sec_transactions_snapshot_impl) for more detail).

### Snapshot Isolation and Repeatable Read {#sec_transactions_snapshot_isolation}

If you look superficially at read committed isolation, you could be forgiven for thinking that it
does everything that a transaction needs to do: it allows aborts (required for atomicity), it
prevents reading the incomplete results of transactions, and it prevents concurrent writes from
getting intermingled. Indeed, those are useful features, and much stronger guarantees than you can
get from a system that has no transactions.

However, there are still plenty of ways in which you can have concurrency bugs when using this
isolation level. For example, [Figure 8-6](/en/ch8#fig_transactions_item_many_preceders) illustrates a problem that
can occur with read committed.

{{< figure src="/fig/ddia_0806.png" id="fig_transactions_item_many_preceders" caption="Figure 8-6. Read skew: Aaliyah observes the database in an inconsistent state." class="w-full my-4" >}}


Say Aaliyah has $1,000 of savings at a bank, split across two accounts with $500 each. Now a
transaction transfers $100 from one of her accounts to the other. If she is unlucky enough to look at her
list of account balances in the same moment as that transaction is being processed, she may see one
account balance at a time before the incoming payment has arrived (with a balance of $500), and the
other account after the outgoing transfer has been made (the new balance being $400). To Aaliyah it
now appears as though she only has a total of $900 in her accounts—it seems that $100 has
vanished into thin air.

This anomaly is called *read skew*, and it is an example of a *nonrepeatable read*:
if Aaliyah were to read the balance of account 1 again at the end of the transaction, she would see a different value ($600) than she saw
in her previous query. Read skew is considered acceptable under read committed isolation: the
account balances that Aaliyah saw were indeed committed at the time when she read them.

--------

> [!NOTE]
> The term *skew* is unfortunately overloaded: we previously used it in the sense of an *unbalanced
> workload with hot spots* (see [“Skewed Workloads and Relieving Hot Spots”](/en/ch7#sec_sharding_skew)), whereas here it means *timing anomaly*.

--------

In Aaliyah’s case, this is not a lasting problem, because she will most likely see consistent account
balances if she reloads the online banking website a few seconds later. However, some situations
cannot tolerate such temporary inconsistency:

Backups
: Taking a backup requires making a copy of the entire database, which may take hours on a large
 database. During the time that the backup process is running, writes will continue to be made to
 the database. Thus, you could end up with some parts of the backup containing an older version of
 the data, and other parts containing a newer version. If you need to restore from such a backup,
 the inconsistencies (such as disappearing money) become permanent.

Analytic queries and integrity checks
: Sometimes, you may want to run a query that scans over large parts of the database. Such queries
 are common in analytics (see [“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics)), or may be part of a periodic integrity
 check that everything is in order (monitoring for data corruption). These queries are likely to
 return nonsensical results if they observe parts of the database at different points in time.

*Snapshot isolation* [^36] is the most common
solution to this problem. The idea is that each transaction reads from a *consistent snapshot* of
the database—that is, the transaction sees all the data that was committed in the database at the
start of the transaction. Even if the data is subsequently changed by another transaction, each
transaction sees only the old data from that particular point in time.

Snapshot isolation is a boon for long-running, read-only queries such as backups and analytics. It
is very hard to reason about the meaning of a query if the data on which it operates is changing at
the same time as the query is executing. When a transaction can see a consistent snapshot of the
database, frozen at a particular point in time, it is much easier to understand.

Snapshot isolation is a popular feature: variants of it are supported by PostgreSQL, MySQL with the
InnoDB storage engine, Oracle, SQL Server, and others, although the detailed behavior varies from
one system to the next [^29] [^40] [^41].
Some databases, such as Oracle, TiDB, and Aurora DSQL, even choose snapshot isolation as their
highest isolation level.

#### Multi-version concurrency control (MVCC) {#sec_transactions_snapshot_impl}

Like read committed isolation, implementations of snapshot isolation typically use write locks to
prevent dirty writes (see [“Implementing read committed”](/en/ch8#sec_transactions_read_committed_impl)), which means that a transaction
that makes a write can block the progress of another transaction that writes to the same row.
However, reads do not require any locks. From a performance point of view, a key principle of
snapshot isolation is *readers never block writers, and writers never block readers*. This allows a
database to handle long-running read queries on a consistent snapshot at the same time as processing
writes normally, without any lock contention between the two.

To implement snapshot isolation, databases use a generalization of the mechanism we saw for
preventing dirty reads in [Figure 8-4](/en/ch8#fig_transactions_read_committed). Instead of two versions of each row
(the committed version and the overwritten-but-not-yet-committed version), the database must
potentially keep several different committed versions of a row, because various in-progress
transactions may need to see the state of the database at different points in time. Because it
maintains several versions of a row side by side, this technique is known as *multi-version
concurrency control* (MVCC).

[Figure 8-7](/en/ch8#fig_transactions_mvcc) illustrates how MVCC-based snapshot isolation is implemented in PostgreSQL
[^40] [^42] [^43] (other implementations are similar).
When a transaction is started, it is given a unique, always-increasing transaction ID (`txid`).
Whenever a transaction writes anything to the database, the data it writes is tagged with the
transaction ID of the writer. (To be precise, transaction IDs in PostgreSQL are 32-bit integers, so
they overflow after approximately 4 billion transactions. The vacuum process performs cleanup to
ensure that overflow does not affect the data.)

{{< figure src="/fig/ddia_0807.png" id="fig_transactions_mvcc" caption="Figure 8-7. Implementing snapshot isolation using multi-version concurrency control." class="w-full my-4" >}}


Each row in a table has an `inserted_by` field, containing the ID of the transaction that inserted
this row into the table. Moreover, each row has a `deleted_by` field, which is initially empty. If a
transaction deletes a row, the row isn’t actually removed from the database, but it is marked for
deletion by setting the `deleted_by` field to the ID of the transaction that requested the deletion.
At some later time, when it is certain that no transaction can any longer access the deleted data, a
garbage collection process in the database removes any rows marked for deletion and frees their
space.

An update is internally translated into a delete and an insert [^44].
For example, in [Figure 8-7](/en/ch8#fig_transactions_mvcc), transaction 13 deducts $100 from account 2, changing the
balance from $500 to $400. The `accounts` table now actually contains two rows for account 2: a row
with a balance of $500 which was marked as deleted by transaction 13, and a row with a balance of
$400 which was inserted by transaction 13.

All of the versions of a row are stored within the same database heap (see
[“Storing values within the index”](/en/ch4#sec_storage_index_heap)), regardless of whether the transactions that wrote them have committed
or not. The versions of the same row form a linked list, going either from newest version to oldest
version or the other way round, so that queries can internally iterate over all versions of a row [^45] [^46].

#### Visibility rules for observing a consistent snapshot {#sec_transactions_mvcc_visibility}

When a transaction reads from the database, transaction IDs are used to decide which row versions it
can see and which are invisible. By carefully defining visibility rules, the database can present a
consistent snapshot of the database to the application. This works roughly as follows [^43]:

1. At the start of each transaction, the database makes a list of all the other transactions that
 are in progress (not yet committed or aborted) at that time. Any writes that those
 transactions have made are ignored, even if the transactions subsequently commit. This ensures
 that we see a consistent snapshot that is not affected by another transaction committing.
2. Any writes made by transactions with a later transaction ID (i.e., which started after the current
 transaction started, and which are therefore not included in the list of in-progress
 transactions) are ignored, regardless of whether those transactions have committed.
3. Any writes made by aborted transactions are ignored, regardless of when that abort happened.
 This has the advantage that when a transaction aborts, we don’t need to immediately remove the
 rows it wrote from storage, since the visibility rule filters them out. The garbage collection
 process can remove them later.
4. All other writes are visible to the application’s queries.

These rules apply to both insertion and deletion of rows. In [Figure 8-7](/en/ch8#fig_transactions_mvcc), when
transaction 12 reads from account 2, it sees a balance of $500 because the deletion of the $500
balance was made by transaction 13 (according to rule 2, transaction 12 cannot see a deletion made
by transaction 13), and the insertion of the $400 balance is not yet visible (by the same rule).

Put another way, a row is visible if both of the following conditions are true:

* At the time when the reader’s transaction started, the transaction that inserted the row had
 already committed.
* The row is not marked for deletion, or if it is, the transaction that requested deletion had
 not yet committed at the time when the reader’s transaction started.

A long-running transaction may continue using a snapshot for a long time, continuing to read values
that (from other transactions’ point of view) have long been overwritten or deleted. By never
updating values in place but instead inserting a new version every time a value is changed, the
database can provide a consistent snapshot while incurring only a small overhead.

<a id="sec_transactions_snapshot_indexes"></a>

#### Indexes and snapshot isolation {#indexes-and-snapshot-isolation}

How do indexes work in a multi-version database? The most common approach is that each index entry
points at one of the versions of a row that matches the entry (either the oldest or the newest
version). Each row version may contain a reference to the next-oldest or next-newest version. A
query that uses the index must then iterate over the rows to find one that is visible, and where the
value matches what the query is looking for. When garbage collection removes old row versions that
are no longer visible to any transaction, the corresponding index entries can also be removed.

Many implementation details affect the performance of multi-version concurrency control [^45] [^46].
For example, PostgreSQL has optimizations for avoiding index updates if different versions of the
same row can fit on the same page [^40]. Some other databases avoid storing full copies of modified rows, 
and only store differences between versions to save space.

Another approach is used in CouchDB, Datomic, and LMDB. Although they also use B-trees (see
[“B-Trees”](/en/ch4#sec_storage_b_trees)), they use an *immutable* (copy-on-write) variant that does not overwrite
pages of the tree when they are updated, but instead creates a new copy of each modified page.
Parent pages, up to the root of the tree, are copied and updated to point to the new versions of
their child pages. Any pages that are not affected by a write do not need to be copied, and can be
shared with the new tree [^47].

With immutable B-trees, every write transaction (or batch of transactions) creates a new B-tree
root, and a particular root is a consistent snapshot of the database at the point in time when it
was created. There is no need to filter out rows based on transaction IDs because subsequent
writes cannot modify an existing B-tree; they can only create new tree roots. This approach also
requires a background process for compaction and garbage collection.

#### Snapshot isolation, repeatable read, and naming confusion {#snapshot-isolation-repeatable-read-and-naming-confusion}

MVCC is a commonly used implementation technique for databases, and often it is used to implement
snapshot isolation. However, different databases sometimes use different terms to refer to the same
thing: for example, snapshot isolation is called “repeatable read” in PostgreSQL, and “serializable”
in Oracle [^29]. Sometimes different systems
use the same term to mean different things: for example, while in PostgreSQL “repeatable read” means
snapshot isolation, in MySQL it means an implementation of MVCC with weaker consistency than
snapshot isolation [^41].

The reason for this naming confusion is that the SQL standard doesn’t have the concept of snapshot
isolation, because the standard is based on System R’s 1975 definition of isolation levels [^3] and snapshot isolation hadn’t yet been
invented then. Instead, it defines repeatable read, which looks superficially similar to snapshot
isolation. PostgreSQL calls its snapshot isolation level “repeatable read” because it meets the
requirements of the standard, and so they can claim standards compliance.

Unfortunately, the SQL standard’s definition of isolation levels is flawed—it is ambiguous,
imprecise, and not as implementation-independent as a standard should be [^36]. Even though several databases
implement repeatable read, there are big differences in the guarantees they actually provide,
despite being ostensibly standardized [^29]. There has been a formal definition of
repeatable read in the research literature [^37] [^38], but most implementations don’t satisfy that
formal definition. And to top it off, IBM Db2 uses “repeatable read” to refer to serializability [^10].

As a result, nobody really knows what repeatable read means.

### Preventing Lost Updates {#sec_transactions_lost_update}

The read committed and snapshot isolation levels we’ve discussed so far have been primarily about the guarantees
of what a read-only transaction can see in the presence of concurrent writes. We have mostly ignored
the issue of two transactions writing concurrently—we have only discussed dirty writes (see
[“No dirty writes”](/en/ch8#sec_transactions_dirty_write)), one particular type of write-write conflict that can occur.

There are several other interesting kinds of conflicts that can occur between concurrently writing
transactions. The best known of these is the *lost update* problem, illustrated in
[Figure 8-1](/en/ch8#fig_transactions_increment) with the example of two concurrent counter increments.

The lost update problem can occur if an application reads some value from the database, modifies it,
and writes back the modified value (a *read-modify-write cycle*). If two transactions do this
concurrently, one of the modifications can be lost, because the second write does not include the
first modification. (We sometimes say that the later write *clobbers* the earlier write.) This
pattern occurs in various different scenarios:

* Incrementing a counter or updating an account balance (requires reading the current value,
 calculating the new value, and writing back the updated value)
* Making a local change to a complex value, e.g., adding an element to a list within a JSON document
 (requires parsing the document, making the change, and writing back the modified document)
* Two users editing a wiki page at the same time, where each user saves their changes by sending the
 entire page contents to the server, overwriting whatever is currently in the database

Because this is such a common problem, a variety of solutions have been developed [^48].

#### Atomic write operations {#atomic-write-operations}

Many databases provide atomic update operations, which remove the need to implement
read-modify-write cycles in application code. They are usually the best solution if your code can be
expressed in terms of those operations. For example, the following instruction is concurrency-safe
in most relational databases:

```sql
UPDATE counters SET value = value + 1 WHERE key = 'foo';
```

Similarly, document databases such as MongoDB provide atomic operations for making local
modifications to a part of a JSON document, and Redis provides atomic operations for modifying data
structures such as priority queues. Not all writes can easily be expressed in terms of atomic
operations—for example, updates to a wiki page involve arbitrary text editing, which can be handled
using algorithms discussed in [“CRDTs and Operational Transformation”](/en/ch6#sec_replication_crdts)—but in situations where atomic operations
can be used, they are usually the best choice.

Atomic operations are usually implemented by taking an exclusive lock on the object when it is read
so that no other transaction can read it until the update has been applied.
Another option is to simply force all atomic operations to be executed on a single thread.

Unfortunately, object-relational mapping (ORM) frameworks make it easy to accidentally write code
that performs unsafe read-modify-write cycles instead of using atomic operations provided by the
database [^49] [^50] [^51].
This can be a source of subtle bugs that are difficult to find by testing.

#### Explicit locking {#explicit-locking}

Another option for preventing lost updates, if the database’s built-in atomic operations don’t
provide the necessary functionality, is for the application to explicitly lock objects that are
going to be updated. Then the application can perform a read-modify-write cycle, and if any other
transaction tries to concurrently update or lock the same object, it is forced to wait until the
first read-modify-write cycle has completed.

For example, consider a multiplayer game in which several players can move the same figure
concurrently. In this case, an atomic operation may not be sufficient, because the application also
needs to ensure that a player’s move abides by the rules of the game, which involves some logic that
you cannot sensibly implement as a database query. Instead, you may use a lock to prevent two
players from concurrently moving the same piece, as illustrated in [Example 8-1](/en/ch8#fig_transactions_select_for_update).

{{< figure id="fig_transactions_select_for_update" title="Example 8-1. Explicitly locking rows to prevent lost updates" class="w-full my-4" >}}

```sql
BEGIN TRANSACTION;

SELECT * FROM figures
    WHERE name = 'robot' AND game_id = 222
    FOR UPDATE; ❶

-- Check whether move is valid, then update the position
-- of the piece that was returned by the previous SELECT.
UPDATE figures SET position = 'c4' WHERE id = 1234;

COMMIT;
```

❶: The `FOR UPDATE` clause indicates that the database should take a lock on all rows returned by this query.

This works, but to get it right, you need to carefully think about your application logic. It’s easy
to forget to add a necessary lock somewhere in the code, and thus introduce a race condition.

Moreover, if you lock multiple objects there is a risk of deadlock, where two or more transactions
are waiting for each other to release their locks. Many databases automatically detect deadlocks,
and abort one of the involved transactions so that the system can make progress. You can handle this
situation at the application level by retrying the aborted transaction.

#### Automatically detecting lost updates {#automatically-detecting-lost-updates}

Atomic operations and locks are ways of preventing lost updates by forcing the read-modify-write
cycles to happen sequentially. An alternative is to allow them to execute in parallel and, if the
transaction manager detects a lost update, abort the transaction and force it to retry
its read-modify-write cycle.

An advantage of this approach is that databases can perform this check efficiently in conjunction
with snapshot isolation. Indeed, PostgreSQL’s repeatable read, Oracle’s serializable, and SQL
Server’s snapshot isolation levels automatically detect when a lost update has occurred and abort
the offending transaction. However, MySQL/InnoDB’s repeatable read does not detect lost updates [^29] [^41].
Some authors [^36] [^38] argue that a database must prevent lost
updates in order to qualify as providing snapshot isolation, so MySQL does not provide snapshot
isolation under this definition.

Lost update detection is a great feature, because it doesn’t require application code to use any
special database features—you may forget to use a lock or an atomic operation and thus introduce
a bug, but lost update detection happens automatically and is thus less error-prone. However, you
also have to retry aborted transactions at the application level.

#### Conditional writes (compare-and-set) {#sec_transactions_compare_and_set}

In databases that don’t provide transactions, you sometimes find a *conditional write* operation
that can prevent lost updates by allowing an update to happen only if the value has not changed
since you last read it (previously mentioned in [“Single-object writes”](/en/ch8#sec_transactions_single_object)). If the current
value does not match what you previously read, the update has no effect, and the read-modify-write
cycle must be retried. It is the database equivalent of an atomic *compare-and-set* or
*compare-and-swap* (CAS) instruction that is supported by many CPUs.

For example, to prevent two users from concurrently updating the same wiki page, you might try something
like this, expecting the update to occur only if the content of the page hasn’t changed since the
user started editing it:

```sql
-- This may or may not be safe, depending on the database implementation
UPDATE wiki_pages SET content = 'new content'
    WHERE id = 1234 AND content = 'old content';
```

If the content has changed and no longer matches `'old content'`, this update will have no effect,
so you need to check whether the update took effect and retry if necessary. Instead of comparing the
full content, you could also use a version number column that you increment on every update, and
apply the update only if the current version number hasn’t changed. This approach is sometimes
called *optimistic locking* [^52].

Note that if another transaction has concurrently modified `content`, the new content may not be
visible under the MVCC visibility rules (see [“Visibility rules for observing a consistent snapshot”](/en/ch8#sec_transactions_mvcc_visibility)). Many
implementations of MVCC have an exception to the visibility rules for this scenario, where values
written by other transactions are visible to the evaluation of the `WHERE` clause of `UPDATE` and
`DELETE` queries, even though those writes are not otherwise visible in the snapshot.

#### Conflict resolution and replication {#conflict-resolution-and-replication}

In replicated databases (see [Chapter 6](/en/ch6#ch_replication)), preventing lost updates takes on another
dimension: since they have copies of the data on multiple nodes, and the data can potentially be
modified concurrently on different nodes, some additional steps need to be taken to prevent lost
updates.

Locks and conditional write operations assume that there is a single up-to-date copy of the data.
However, databases with multi-leader or leaderless replication usually allow several writes to
happen concurrently and replicate them asynchronously, so they cannot guarantee that there is a
single up-to-date copy of the data. Thus, techniques based on locks or conditional writes do not apply
in this context. (We will revisit this issue in more detail in [“Linearizability”](/en/ch10#sec_consistency_linearizability).)

Instead, as discussed in [“Dealing with Conflicting Writes”](/en/ch6#sec_replication_write_conflicts), a common approach in such replicated
databases is to allow concurrent writes to create several conflicting versions of a value (also
known as *siblings*), and to use application code or special data structures to resolve and merge
these versions after the fact.

Merging conflicting values can prevent lost updates if the updates are commutative (i.e., you can
apply them in a different order on different replicas, and still get the same result). For example,
incrementing a counter or adding an element to a set are commutative operations. That is the idea
behind CRDTs, which we encountered in [“CRDTs and Operational Transformation”](/en/ch6#sec_replication_crdts). However, some operations such as
conditional writes cannot be made commutative.

On the other hand, the *last write wins* (LWW) conflict resolution method is prone to lost updates,
as discussed in [“Last write wins (discarding concurrent writes)”](/en/ch6#sec_replication_lww). 
Unfortunately, LWW is the default in many replicated databases.

### Write Skew and Phantoms {#sec_transactions_write_skew}

In the previous sections we saw *dirty writes* and *lost updates*, two kinds of race conditions that
can occur when different transactions concurrently try to write to the same objects. In order to
avoid data corruption, those race conditions need to be prevented—either automatically by the
database, or by manual safeguards such as using locks or atomic write operations.

However, that is not the end of the list of potential race conditions that can occur between
concurrent writes. In this section we will see some subtler examples of conflicts.

To begin, imagine this example: you are writing an application for doctors to manage their on-call
shifts at a hospital. The hospital usually tries to have several doctors on call at any one time,
but it absolutely must have at least one doctor on call. Doctors can give up their shifts (e.g., if
they are sick themselves), provided that at least one colleague remains on call in that shift [^53] [^54].

Now imagine that Aaliyah and Bryce are the two on-call doctors for a particular shift. Both are
feeling unwell, so they both decide to request leave. Unfortunately, they happen to click the button
to go off call at approximately the same time. What happens next is illustrated in
[Figure 8-8](/en/ch8#fig_transactions_write_skew).

{{< figure src="/fig/ddia_0808.png" id="fig_transactions_write_skew" caption="Figure 8-8. Example of write skew causing an application bug." class="w-full my-4" >}}


In each transaction, your application first checks that two or more doctors are currently on call;
if yes, it assumes it’s safe for one doctor to go off call. Since the database is using snapshot
isolation, both checks return `2`, so both transactions proceed to the next stage. Aaliyah updates her
own record to take herself off call, and Bryce updates his own record likewise. Both transactions
commit, and now no doctor is on call. Your requirement of having at least one doctor on call has been violated.

#### Characterizing write skew {#characterizing-write-skew}

This anomaly is called *write skew* [^36]. It
is neither a dirty write nor a lost update, because the two transactions are updating two different
objects (Aaliyah’s and Bryce’s on-call records, respectively). It is less obvious that a conflict occurred
here, but it’s definitely a race condition: if the two transactions had run one after another, the
second doctor would have been prevented from going off call. The anomalous behavior was only
possible because the transactions ran concurrently.

You can think of write skew as a generalization of the lost update problem. Write skew can occur if two
transactions read the same objects, and then update some of those objects (different transactions
may update different objects). In the special case where different transactions update the same
object, you get a dirty write or lost update anomaly (depending on the timing).

We saw that there are various different ways of preventing lost updates. With write skew, our
options are more restricted:

* Atomic single-object operations don’t help, as multiple objects are involved.
* The automatic detection of lost updates that you find in some implementations of snapshot
 isolation unfortunately doesn’t help either: write skew is not automatically detected in
 PostgreSQL’s repeatable read, MySQL/InnoDB’s repeatable read, Oracle’s serializable, or SQL
 Server’s snapshot isolation level [^29]. 
 Automatically preventing write skew requires true serializable isolation (see [“Serializability”](/en/ch8#sec_transactions_serializability)).
* Some databases allow you to configure constraints, which are then enforced by the database (e.g.,
 uniqueness, foreign key constraints, or restrictions on a particular value). However, in order to
 specify that at least one doctor must be on call, you would need a constraint that involves
 multiple objects. Most databases do not have built-in support for such constraints, but you may be
 able to implement them with triggers or materialized views, as discussed in
 [“Consistency”](/en/ch8#sec_transactions_acid_consistency) [^12].
* If you can’t use a serializable isolation level, the second-best option in this case is probably
 to explicitly lock the rows that the transaction depends on. In the doctors example, you could
 write something like the following:

 ```sql
 BEGIN TRANSACTION;

 SELECT * FROM doctors
     WHERE on_call = true
     AND shift_id = 1234 FOR UPDATE; ❶

 UPDATE doctors
    SET on_call = false
    WHERE name = 'Aaliyah'
    AND shift_id = 1234;

 COMMIT;
 ```

❶: As before, `FOR UPDATE` tells the database to lock all rows returned by this query.

#### More examples of write skew {#more-examples-of-write-skew}

Write skew may seem like an esoteric issue at first, but once you’re aware of it, you may notice
more situations in which it can occur. Here are some more examples:

Meeting room booking system
: Say you want to enforce that there cannot be two bookings for the same meeting room at the same time [^55].
    When someone wants to make a booking, you first check for any conflicting bookings (i.e.,
    bookings for the same room with an overlapping time range), and if none are found, you create the
    meeting (see [Example 8-2](/en/ch8#fig_transactions_meeting_rooms)).
    
    {{< figure id="fig_transactions_meeting_rooms" title="Example 8-2. A meeting room booking system tries to avoid double-booking (not safe under snapshot isolation)" class="w-full my-4" >}}
    
    ```sql
    BEGIN TRANSACTION;
    
    -- Check for any existing bookings that overlap with the period of noon-1pm
    SELECT COUNT(*) FROM bookings
    WHERE room_id = 123 AND
    end_time > '2025-01-01 12:00' AND start_time < '2025-01-01 13:00';
    
    -- If the previous query returned zero:
    INSERT INTO bookings (room_id, start_time, end_time, user_id)
    VALUES (123, '2025-01-01 12:00', '2025-01-01 13:00', 666);
    
    COMMIT;
    ```

     Unfortunately, snapshot isolation does not prevent another user from concurrently inserting a conflicting
     meeting. In order to guarantee you won’t get scheduling conflicts, you once again need serializable
     isolation.

Multiplayer game
: In [Example 8-1](/en/ch8#fig_transactions_select_for_update), we used a lock to prevent lost updates (that is, making
 sure that two players can’t move the same figure at the same time). However, the lock doesn’t
 prevent players from moving two different figures to the same position on the board or potentially
 making some other move that violates the rules of the game. Depending on the kind of rule you are
 enforcing, you might be able to use a unique constraint, but otherwise you’re vulnerable to write
 skew.

Claiming a username
: On a website where each user has a unique username, two users may try to create accounts with the
 same username at the same time. You may use a transaction to check whether a name is taken and, if
 not, create an account with that name. However, like in the previous examples, that is not safe
 under snapshot isolation. Fortunately, a unique constraint is a simple solution here (the second
 transaction that tries to register the username will be aborted due to violating the constraint).

Preventing double-spending
: A service that allows users to spend money or points needs to check that a user doesn’t spend more
 than they have. You might implement this by inserting a tentative spending item into a user’s
 account, listing all the items in the account, and checking that the sum is positive.
 With write skew, it could happen that two spending items are inserted concurrently that together
 cause the balance to go negative, but that neither transaction notices the other.

#### Phantoms causing write skew {#sec_transactions_phantom}

All of these examples follow a similar pattern:

1. A `SELECT` query checks whether some requirement is satisfied by searching for rows that
 match some search condition (there are at least two doctors on call, there are no existing
 bookings for that room at that time, the position on the board doesn’t already have another
 figure on it, the username isn’t already taken, there is still money in the account).
2. Depending on the result of the first query, the application code decides how to continue (perhaps
 to go ahead with the operation, or perhaps to report an error to the user and abort).
3. If the application decides to go ahead, it makes a write (`INSERT`, `UPDATE`, or `DELETE`) to the
 database and commits the transaction.

 The effect of this write changes the precondition of the decision of step 2. In other words, if you
 were to repeat the `SELECT` query from step 1 after committing the write, you would get a different
 result, because the write changed the set of rows matching the search condition (there is now one
 fewer doctor on call, the meeting room is now booked for that time, the position on the board is now
 taken by the figure that was moved, the username is now taken, there is now less money in the
 account).

The steps may occur in a different order. For example, you could first make the write, then the
`SELECT` query, and finally decide whether to abort or commit based on the result of the query.

In the case of the doctor on call example, the row being modified in step 3 was one of the rows
returned in step 1, so we could make the transaction safe and avoid write skew by locking the rows
in step 1 (`SELECT FOR UPDATE`). However, the other four examples are different: they check for the
*absence* of rows matching some search condition, and the write *adds* a row matching the same
condition. If the query in step 1 doesn’t return any rows, `SELECT FOR UPDATE` can’t attach locks to
anything [^56].

This effect, where a write in one transaction changes the result of a search query in another
transaction, is called a *phantom* [^4].
Snapshot isolation avoids phantoms in read-only queries, but in read-write transactions like the
examples we discussed, phantoms can lead to particularly tricky cases of write skew. The SQL
generated by ORMs is also prone to write skew [^50] [^51].

#### Materializing conflicts {#materializing-conflicts}

If the problem of phantoms is that there is no object to which we can attach the locks, perhaps we
can artificially introduce a lock object into the database?

For example, in the meeting room booking case you could imagine creating a table of time slots and
rooms. Each row in this table corresponds to a particular room for a particular time period (say, 15
minutes). You create rows for all possible combinations of rooms and time periods ahead of time,
e.g., for the next six months.

Now a transaction that wants to create a booking can lock (`SELECT FOR UPDATE`) the rows in the
table that correspond to the desired room and time period. After it has acquired the locks, it can
check for overlapping bookings and insert a new booking as before. Note that the additional table
isn’t used to store information about the booking—it’s purely a collection of locks which is used
to prevent bookings on the same room and time range from being modified concurrently.

This approach is called *materializing conflicts*, because it takes a phantom and turns it into a
lock conflict on a concrete set of rows that exist in the database [^14]. Unfortunately, it can be hard and
error-prone to figure out how to materialize conflicts, and it’s ugly to let a concurrency control
mechanism leak into the application data model. For those reasons, materializing conflicts should be
considered a last resort if no alternative is possible. A serializable isolation level is much
preferable in most cases.


## Serializability {#sec_transactions_serializability}

In this chapter we have seen several examples of transactions that are prone to race conditions.
Some race conditions are prevented by the read committed and snapshot isolation levels, but
others are not. We encountered some particularly tricky examples with write skew and phantoms. It’s
a sad situation:

* Isolation levels are hard to understand, and inconsistently implemented in different databases
 (e.g., the meaning of “repeatable read” varies significantly).
* If you look at your application code, it’s difficult to tell whether it is safe to run at a
 particular isolation level—especially in a large application, where you might not be aware of
 all the things that may be happening concurrently.
* There are no good tools to help us detect race conditions. In principle, static analysis may
 help [^33], but research techniques have not
 yet found their way into practical use. Testing for concurrency issues is hard, because they are
 usually nondeterministic—problems only occur if you get unlucky with the timing.

This is not a new problem—it has been like this since the 1970s, when weak isolation levels were
first introduced [^3]. All along, the answer
from researchers has been simple: use *serializable* isolation!

Serializable isolation is the strongest isolation level. It guarantees that even
though transactions may execute in parallel, the end result is the same as if they had executed one
at a time, *serially*, without any concurrency. Thus, the database guarantees that if the
transactions behave correctly when run individually, they continue to be correct when run
concurrently—in other words, the database prevents *all* possible race conditions.

But if serializable isolation is so much better than the mess of weak isolation levels, then why
isn’t everyone using it? To answer this question, we need to look at the options for implementing
serializability, and how they perform. Most databases that provide serializability today use one of
three techniques, which we will explore in the rest of this chapter:

* Literally executing transactions in a serial order (see [“Actual Serial Execution”](/en/ch8#sec_transactions_serial))
* Two-phase locking (see [“Two-Phase Locking (2PL)”](/en/ch8#sec_transactions_2pl)), which for several decades was the only viable option
* Optimistic concurrency control techniques such as serializable snapshot isolation (see
 [“Serializable Snapshot Isolation (SSI)”](/en/ch8#sec_transactions_ssi))

### Actual Serial Execution {#sec_transactions_serial}

The simplest way of avoiding concurrency problems is to remove the concurrency entirely: to
execute only one transaction at a time, in serial order, on a single thread. By doing so, we completely
sidestep the problem of detecting and preventing conflicts between transactions: the resulting
isolation is by definition serializable.

Even though this seems like an obvious idea, it was only in the 2000s that database designers
decided that a single-threaded loop for executing transactions was feasible [^57].
If multi-threaded concurrency was considered essential for getting good performance during the
previous 30 years, what changed to make single-threaded execution possible?

Two developments caused this rethink:

* RAM became cheap enough that for many use cases it is now feasible to keep the entire
 active dataset in memory (see [“Keeping everything in memory”](/en/ch4#sec_storage_inmemory)). When all data that a transaction needs to
 access is in memory, transactions can execute much faster than if they have to wait for data to be
 loaded from disk.
* Database designers realized that OLTP transactions are usually short and only make a small number
 of reads and writes (see [“Analytical versus Operational Systems”](/en/ch1#sec_introduction_analytics)). By contrast, long-running analytic queries
 are typically read-only, so they can be run on a consistent snapshot (using snapshot isolation)
 outside of the serial execution loop.

The approach of executing transactions serially is implemented in VoltDB/H-Store, Redis, and Datomic,
for example [^58] [^59] [^60].
A system designed for single-threaded execution can sometimes perform better than a system that
supports concurrency, because it can avoid the coordination overhead of locking. However, its
throughput is limited to that of a single CPU core. In order to make the most of that single thread,
transactions need to be structured differently from their traditional form.

#### Encapsulating transactions in stored procedures {#encapsulating-transactions-in-stored-procedures}

In the early days of databases, the intention was that a database transaction could encompass an
entire flow of user activity. For example, booking an airline ticket is a multi-stage process
(searching for routes, fares, and available seats; deciding on an itinerary; booking seats on
each of the flights of the itinerary; entering passenger details; making payment). Database
designers thought that it would be neat if that entire process was one transaction so that it could
be committed atomically.

Unfortunately, humans are very slow to make up their minds and respond. If a database transaction
needs to wait for input from a user, the database needs to support a potentially huge number of
concurrent transactions, most of them idle. Most databases cannot do that efficiently, and so almost
all OLTP applications keep transactions short by avoiding interactively waiting for a user within a
transaction. On the web, this means that a transaction is committed within the same HTTP request—a
transaction does not span multiple requests. A new HTTP request starts a new transaction.

Even though the human has been taken out of the critical path, transactions have continued to be
executed in an interactive client/server style, one statement at a time. An application makes a
query, reads the result, perhaps makes another query depending on the result of the first query, and
so on. The queries and results are sent back and forth between the application code (running on one
machine) and the database server (on another machine).

In this interactive style of transaction, a lot of time is spent in network communication between
the application and the database. If you were to disallow concurrency in the database and only
process one transaction at a time, the throughput would be dreadful because the database would
spend most of its time waiting for the application to issue the next query for the current
transaction. In this kind of database, it’s necessary to process multiple transactions concurrently
in order to get reasonable performance.

For this reason, systems with single-threaded serial transaction processing don’t allow interactive
multi-statement transactions. Instead, the application must either limit itself to transactions
containing a single statement, or submit the entire transaction code to the database ahead of time,
as a *stored procedure* [^61].

The difference between interactive transactions and stored procedures is illustrated in
[Figure 8-9](/en/ch8#fig_transactions_stored_proc). Provided that all data required by a transaction is in memory, the
stored procedure can execute very quickly, without waiting for any network or disk I/O.

{{< figure src="/fig/ddia_0809.png" id="fig_transactions_stored_proc" caption="Figure 8-9. The difference between an interactive transaction and a stored procedure (using the example transaction of [Figure 8-8](/en/ch8#fig_transactions_write_skew))." class="w-full my-4" >}}

#### Pros and cons of stored procedures {#sec_transactions_stored_proc_tradeoffs}

Stored procedures have existed for some time in relational databases, and they have been part of the
SQL standard (SQL/PSM) since 1999. They have gained a somewhat bad reputation, for various reasons:

* Traditionally, each database vendor had its own language for stored procedures (Oracle has PL/SQL, SQL Server
 has T-SQL, PostgreSQL has PL/pgSQL, etc.). These languages haven’t kept up with developments in
 general-purpose programming languages, so they look quite ugly and archaic from today’s point of
 view, and they lack the ecosystem of libraries that you find with most programming languages.
* Code running in a database is difficult to manage: compared to an application server, it’s harder
 to debug, more awkward to keep in version control and deploy, trickier to test, and difficult to
 integrate with a metrics collection system for monitoring.
* A database is often much more performance-sensitive than an application server, because a single
 database instance is often shared by many application servers. A badly written stored procedure
 (e.g., using a lot of memory or CPU time) in a database can cause much more trouble than equivalent
 badly written code in an application server.
* In a multitenant system that allows tenants to write their own stored procedures, it’s a security
 risk to execute untrusted code in the same process as the database kernel [^62].

However, those issues can be overcome. Modern implementations of stored procedures have abandoned
PL/SQL and use existing general-purpose programming languages instead: VoltDB uses Java or Groovy,
Datomic uses Java or Clojure, Redis uses Lua, and MongoDB uses JavaScript.

Stored procedures are also useful in cases where application logic can’t easily be embedded
elsewhere. Applications that use GraphQL, for example, might directly expose their database through
a GraphQL proxy. If the proxy doesn’t support complex validation logic, you can embed such logic
directly in the database using a stored procedure. If the database doesn’t support stored
procedures, you would have to deploy a validation service between the proxy and the database to do validation.

With stored procedures and in-memory data, executing all transactions on a single thread becomes
feasible. When stored procedures don’t need to wait for I/O and avoid the overhead of other
concurrency control mechanisms, they can achieve quite good throughput on a single thread.

VoltDB also uses stored procedures for replication: instead of copying a transaction’s writes from
one node to another, it executes the same stored procedure on each replica. VoltDB therefore
requires that stored procedures are *deterministic* (when run on different nodes, they must produce
the same result). If a transaction needs to use the current date and time, for example, it must do
so through special deterministic APIs (see [“Durable Execution and Workflows”](/en/ch5#sec_encoding_dataflow_workflows) for more details on
deterministic operations). This approach is called *state machine replication*, and we will return
to it in [Chapter 10](/en/ch10#ch_consistency).

#### Sharding {#sharding}

Executing all transactions serially makes concurrency control much simpler, but limits the
transaction throughput of the database to the speed of a single CPU core on a single machine.
Read-only transactions may execute elsewhere, using snapshot isolation, but for applications with
high write throughput, the single-threaded transaction processor can become a serious bottleneck.

In order to scale to multiple CPU cores, and multiple nodes, you can shard your data
(see [Chapter 7](/en/ch7#ch_sharding)), which is supported in VoltDB. If you can find a way of sharding your dataset
so that each transaction only needs to read and write data within a single shard, then each shard
can have its own transaction processing thread running independently from the others. In this case,
you can give each CPU core its own shard, which allows your transaction throughput to scale linearly
with the number of CPU cores [^59].

However, for any transaction that needs to access multiple shards, the database must coordinate the
transaction across all the shards that it touches. The stored procedure needs to be performed in
lock-step across all shards to ensure serializability across the whole system.

Since cross-shard transactions have additional coordination overhead, they are vastly slower than
single-shard transactions. VoltDB reports a throughput of about 1,000 cross-shard writes per second,
which is orders of magnitude below its single-shard throughput and cannot be increased by adding
more machines [^61]. More recent research
has explored ways of making multi-shard transactions more scalable [^63].

Whether transactions can be single-shard depends very much on the structure of the data used by the
application. Simple key-value data can often be sharded very easily, but data with multiple
secondary indexes is likely to require a lot of cross-shard coordination (see
[“Sharding and Secondary Indexes”](/en/ch7#sec_sharding_secondary_indexes)).

#### Summary of serial execution {#summary-of-serial-execution}

Serial execution of transactions has become a viable way of achieving serializable isolation within
certain constraints:

* Every transaction must be small and fast, because it takes only one slow transaction to stall all transaction processing.
* It is most appropriate in situations where the active dataset can fit in memory. Rarely accessed
 data could potentially be moved to disk, but if it needed to be accessed in a single-threaded
 transaction, the system would get very slow.
* Write throughput must be low enough to be handled on a single CPU core, or else transactions need
 to be sharded without requiring cross-shard coordination.
* Cross-shard transactions are possible, but their throughput is hard to scale.

### Two-Phase Locking (2PL) {#sec_transactions_2pl}

For around 30 years, there was only one widely used algorithm for serializability in databases:
*two-phase locking* (2PL), sometimes called *strong strict two-phase locking* (SS2PL) to distinguish
it from other variants of 2PL.


--------

> [!TIP] 2PL IS NOT 2PC

Two-phase *locking* (2PL) and two-phase *commit* (2PC) are two very different things. 2PL provides
serializable isolation, whereas 2PC provides atomic commit in a distributed database (see
[“Two-Phase Commit (2PC)”](/en/ch8#sec_transactions_2pc)). To avoid confusion, it’s best to think of them as entirely separate
concepts and to ignore the unfortunate similarity in the names.

--------

We saw previously that locks are often used to prevent dirty writes (see
[“No dirty writes”](/en/ch8#sec_transactions_dirty_write)): if two transactions concurrently try to write to the same object,
the lock ensures that the second writer must wait until the first one has finished its transaction
(aborted or committed) before it may continue.

Two-phase locking is similar, but makes the lock requirements much stronger. Several transactions
are allowed to concurrently read the same object as long as nobody is writing to it. But as soon as
anyone wants to write (modify or delete) an object, exclusive access is required:

* If transaction A has read an object and transaction B wants to write to that object, B must wait
 until A commits or aborts before it can continue. (This ensures that B can’t change the object
 unexpectedly behind A’s back.)
* If transaction A has written an object and transaction B wants to read that object, B must wait
 until A commits or aborts before it can continue. (Reading an old version of the object, like in
 [Figure 8-4](/en/ch8#fig_transactions_read_committed), is not acceptable under 2PL.)

In 2PL, writers don’t just block other writers; they also block readers and vice
versa. Snapshot isolation has the mantra *readers never block writers, and writers never block
readers* (see [“Multi-version concurrency control (MVCC)”](/en/ch8#sec_transactions_snapshot_impl)), which captures this key difference between
snapshot isolation and two-phase locking. On the other hand, because 2PL provides serializability,
it protects against all the race conditions discussed earlier, including lost updates and write skew.

#### Implementation of two-phase locking {#implementation-of-two-phase-locking}

2PL is used by the serializable isolation level in MySQL (InnoDB) and SQL Server, and the
repeatable read isolation level in Db2 [^29].

The blocking of readers and writers is implemented by having a lock on each object in the
database. The lock can either be in *shared mode* or in *exclusive mode* (also known as a
*multi-reader single-writer* lock). The lock is used as follows:

* If a transaction wants to read an object, it must first acquire the lock in shared mode. Several
 transactions are allowed to hold the lock in shared mode simultaneously, but if another
 transaction already has an exclusive lock on the object, these transactions must wait.
* If a transaction wants to write to an object, it must first acquire the lock in exclusive mode. No
 other transaction may hold the lock at the same time (either in shared or in exclusive mode), so
 if there is any existing lock on the object, the transaction must wait.
* If a transaction first reads and then writes an object, it may upgrade its shared lock to an
 exclusive lock. The upgrade works the same as getting an exclusive lock directly.
* After a transaction has acquired the lock, it must continue to hold the lock until the end of the
 transaction (commit or abort). This is where the name “two-phase” comes from: the first phase
 (while the transaction is executing) is when the locks are acquired, and the second phase (at the
 end of the transaction) is when all the locks are released.

Since so many locks are in use, it can happen quite easily that transaction A is stuck waiting for
transaction B to release its lock, and vice versa. This situation is called *deadlock*. The database
automatically detects deadlocks between transactions and aborts one of them so that the others can
make progress. The aborted transaction needs to be retried by the application.

#### Performance of two-phase locking {#performance-of-two-phase-locking}

The big downside of two-phase locking, and the reason why it hasn’t been used by everybody since the
1970s, is performance: transaction throughput and response times of queries are significantly worse
under two-phase locking than under weak isolation.

This is partly due to the overhead of acquiring and releasing all those locks, but more importantly
due to reduced concurrency. By design, if two concurrent transactions try to do anything that may
in any way result in a race condition, one has to wait for the other to complete.

For example, if you have a transaction that needs to read an entire table (e.g., a backup, analytics
query, or integrity check, as discussed in [“Snapshot Isolation and Repeatable Read”](/en/ch8#sec_transactions_snapshot_isolation)), that
transaction has to take a shared lock on the entire table. Therefore, the reading transaction first
has to wait until all in-progress transactions writing to that table have completed; then, while the
whole table is being read (which may take a long time on a large table), all other transactions that
want to write to that table are blocked until the big read-only transaction commits. In effect, the
database becomes unavailable for writes for an extended time.

For this reason, databases running 2PL can have quite unstable latencies, and they can be very slow at
high percentiles (see [“Describing Performance”](/en/ch2#sec_introduction_percentiles)) if there is contention in the workload. It
may take just one slow transaction, or one transaction that accesses a lot of data and acquires many
locks, to cause the rest of the system to grind to a halt.

Although deadlocks can happen with the lock-based read committed isolation level, they occur much
more frequently under 2PL serializable isolation (depending on the access patterns of your
transaction). This can be an additional performance problem: when a transaction is aborted due to
deadlock and is retried, it needs to do its work all over again. If deadlocks are frequent, this can
mean significant wasted effort.

#### Predicate locks {#predicate-locks}

In the preceding description of locks, we glossed over a subtle but important detail. In
[“Phantoms causing write skew”](/en/ch8#sec_transactions_phantom) we discussed the problem of *phantoms*—that is, one transaction
changing the results of another transaction’s search query. A database with serializable isolation
must prevent phantoms.

In the meeting room booking example this means that if one transaction has searched for existing
bookings for a room within a certain time window (see [Example 8-2](/en/ch8#fig_transactions_meeting_rooms)), another
transaction is not allowed to concurrently insert or update another booking for the same room and
time range. (It’s okay to concurrently insert bookings for other rooms, or for the same room at a
different time that doesn’t affect the proposed booking.)

How do we implement this? Conceptually, we need a *predicate lock* [^4]. It works similarly to the
shared/exclusive lock described earlier, but rather than belonging to a particular object (e.g., one
row in a table), it belongs to all objects that match some search condition, such as:

```
SELECT * FROM bookings
 WHERE room_id = 123 AND
 end_time > '2025-01-01 12:00' AND
 start_time < '2025-01-01 13:00';
```

A predicate lock restricts access as follows:

* If transaction A wants to read objects matching some condition, like in that `SELECT` query, it
 must acquire a shared-mode predicate lock on the conditions of the query. If another transaction B
 currently has an exclusive lock on any object matching those conditions, A must wait until B
 releases its lock before it is allowed to make its query.
* If transaction A wants to insert, update, or delete any object, it must first check whether either the old
 or the new value matches any existing predicate lock. If there is a matching predicate lock held by
 transaction B, then A must wait until B has committed or aborted before it can continue.

The key idea here is that a predicate lock applies even to objects that do not yet exist in the
database, but which might be added in the future (phantoms). If two-phase locking includes predicate locks,
the database prevents all forms of write skew and other race conditions, and so its isolation
becomes serializable.

#### Index-range locks {#sec_transactions_2pl_range}

Unfortunately, predicate locks do not perform well: if there are many locks by active transactions,
checking for matching locks becomes time-consuming. For that reason, most databases with 2PL
actually implement *index-range locking* (also known as *next-key locking*), which is a simplified
approximation of predicate locking [^54] [^64].

It’s safe to simplify a predicate by making it match a greater set of objects. For example, if you
have a predicate lock for bookings of room 123 between noon and 1 p.m., you can approximate it by
locking bookings for room 123 at any time, or you can approximate it by locking all rooms (not just
room 123) between noon and 1 p.m. This is safe because any write that matches the original predicate
will definitely also match the approximations.

In the room bookings database you would probably have an index on the `room_id` column, and/or
indexes on `start_time` and `end_time` (otherwise the preceding query would be very slow on a large database):

* Say your index is on `room_id`, and the database uses this index to find existing bookings for
 room 123. Now the database can simply attach a shared lock to this index entry, indicating that a
 transaction has searched for bookings of room 123.
* Alternatively, if the database uses a time-based index to find existing bookings, it can attach a
 shared lock to a range of values in that index, indicating that a transaction has searched for
 bookings that overlap with the time period of noon to 1 p.m. on January 1, 2025.

Either way, an approximation of the search condition is attached to one of the indexes. Now, if
another transaction wants to insert, update, or delete a booking for the same room and/or an
overlapping time period, it will have to update the same part of the index. In the process of doing
so, it will encounter the shared lock, and it will be forced to wait until the lock is released.

This provides effective protection against phantoms and write skew. Index-range locks are not as
precise as predicate locks would be (they may lock a bigger range of objects than is strictly
necessary to maintain serializability), but since they have much lower overheads, they are a good
compromise.

If there is no suitable index where a range lock can be attached, the database can fall back to a
shared lock on the entire table. This will not be good for performance, since it will stop all
other transactions writing to the table, but it’s a safe fallback position.

### Serializable Snapshot Isolation (SSI) {#sec_transactions_ssi}

This chapter has painted a bleak picture of concurrency control in databases. On the one hand, we
have implementations of serializability that don’t perform well (two-phase locking) or don’t scale
well (serial execution). On the other hand, we have weak isolation levels that have good
performance, but are prone to various race conditions (lost updates, write skew, phantoms, etc.). Are
serializable isolation and good performance fundamentally at odds with each other?

It seems not: an algorithm called *serializable snapshot isolation* (SSI) provides full
serializability with only a small performance penalty compared to snapshot isolation. SSI is
comparatively new: it was first described in 2008 [^53] [^65].

Today SSI and similar algorithms are used in single-node databases (the serializable isolation level
in PostgreSQL [^54], SQL Server’s In-Memory OLTP/Hekaton [^66], and HyPer [^67]), distributed databases (CockroachDB [^5] and
FoundationDB [^8]), and embedded storage engines such as BadgerDB.

#### Pessimistic versus optimistic concurrency control {#pessimistic-versus-optimistic-concurrency-control}

Two-phase locking is a so-called *pessimistic* concurrency control mechanism: it is based on the
principle that if anything might possibly go wrong (as indicated by a lock held by another
transaction), it’s better to wait until the situation is safe again before doing anything. It is
like *mutual exclusion*, which is used to protect data structures in multi-threaded programming.

Serial execution is, in a sense, pessimistic to the extreme: it is essentially equivalent to each
transaction having an exclusive lock on the entire database (or one shard of the database) for the
duration of the transaction. We compensate for the pessimism by making each transaction very fast to
execute, so it only needs to hold the “lock” for a short time.

By contrast, serializable snapshot isolation is an *optimistic* concurrency control technique.
Optimistic in this context means that instead of blocking if something potentially dangerous
happens, transactions continue anyway, in the hope that everything will turn out all right. When a
transaction wants to commit, the database checks whether anything bad happened (i.e., whether
isolation was violated); if so, the transaction is aborted and has to be retried. Only transactions
that executed serializably are allowed to commit.

Optimistic concurrency control is an old idea [^68], and its advantages and disadvantages have been debated for a long time [^69].
It performs badly if there is high contention (many transactions trying to access the same objects),
as this leads to a high proportion of transactions needing to abort. If the system is already close
to its maximum throughput, the additional transaction load from retried transactions can make
performance worse.

However, if there is enough spare capacity, and if contention between transactions is not too high,
optimistic concurrency control techniques tend to perform better than pessimistic ones. Contention
can be reduced with commutative atomic operations: for example, if several transactions concurrently
want to increment a counter, it doesn’t matter in which order the increments are applied (as long as
the counter isn’t read in the same transaction), so the concurrent increments can all be applied
without conflicting.

As the name suggests, SSI is based on snapshot isolation—that is, all reads within a transaction
are made from a consistent snapshot of the database (see [“Snapshot Isolation and Repeatable Read”](/en/ch8#sec_transactions_snapshot_isolation)).
On top of snapshot isolation, SSI adds an algorithm for detecting serialization conflicts among
reads and writes, and determining which transactions to abort.

#### Decisions based on an outdated premise {#decisions-based-on-an-outdated-premise}

When we previously discussed write skew in snapshot isolation (see [“Write Skew and Phantoms”](/en/ch8#sec_transactions_write_skew)),
we observed a recurring pattern: a transaction reads some data from the database, examines the
result of the query, and decides to take some action (write to the database) based on the result
that it saw. However, under snapshot isolation, the result from the original query may no longer be
up-to-date by the time the transaction commits, because the data may have been modified in the meantime.

Put another way, the transaction is taking an action based on a *premise* (a fact that was true at
the beginning of the transaction, e.g., “There are currently two doctors on call”). Later, when the
transaction wants to commit, the original data may have changed—the premise may no longer be
true.

When the application makes a query (e.g., “How many doctors are currently on call?”), the database
doesn’t know how the application logic uses the result of that query. To be safe, the database needs
to assume that any change in the query result (the premise) means that writes in that transaction
may be invalid. In other words, there may be a causal dependency between the queries and the writes
in the transaction. In order to provide serializable isolation, the database must detect situations
in which a transaction may have acted on an outdated premise and abort the transaction in that case.

How does the database know if a query result might have changed? There are two cases to consider:

* Detecting reads of a stale MVCC object version (uncommitted write occurred before the read)
* Detecting writes that affect prior reads (the write occurs after the read)

#### Detecting stale MVCC reads {#detecting-stale-mvcc-reads}

Recall that snapshot isolation is usually implemented by multi-version concurrency control (MVCC;
see [“Multi-version concurrency control (MVCC)”](/en/ch8#sec_transactions_snapshot_impl)). When a transaction reads from a consistent snapshot in an
MVCC database, it ignores writes that were made by any other transactions that hadn’t yet committed
at the time when the snapshot was taken.

In [Figure 8-10](/en/ch8#fig_transactions_detect_mvcc), transaction 43 sees
Aaliyah as having `on_call = true`, because transaction 42 (which modified Aaliyah’s on-call status) is
uncommitted. However, by the time transaction 43 wants to commit, transaction 42 has already
committed. This means that the write that was ignored when reading from the consistent snapshot has
now taken effect, and transaction 43’s premise is no longer true. Things get even more complicated
when a writer inserts data that didn’t exist before (see [“Phantoms causing write skew”](/en/ch8#sec_transactions_phantom)). We’ll
discuss detecting phantom writes for SSI in [“Detecting writes that affect prior reads”](/en/ch8#sec_detecting_writes_affect_reads).

{{< figure src="/fig/ddia_0810.png" id="fig_transactions_detect_mvcc" caption="Figure 8-10. Detecting when a transaction reads outdated values from an MVCC snapshot." class="w-full my-4" >}}


In order to prevent this anomaly, the database needs to track when a transaction ignores another
transaction’s writes due to MVCC visibility rules. When the transaction wants to commit, the
database checks whether any of the ignored writes have now been committed. If so, the transaction
must be aborted.

Why wait until committing? Why not abort transaction 43 immediately when the stale read is detected?
Well, if transaction 43 was a read-only transaction, it wouldn’t need to be aborted, because there
is no risk of write skew. At the time when transaction 43 makes its read, the database doesn’t yet
know whether that transaction is going to later perform a write. Moreover, transaction 42 may yet
abort or may still be uncommitted at the time when transaction 43 is committed, and so the read may
turn out not to have been stale after all. By avoiding unnecessary aborts, SSI preserves snapshot
isolation’s support for long-running reads from a consistent snapshot.

#### Detecting writes that affect prior reads {#sec_detecting_writes_affect_reads}

The second case to consider is when another transaction modifies data after it has been read. This
case is illustrated in [Figure 8-11](/en/ch8#fig_transactions_detect_index_range).

{{< figure src="/fig/ddia_0811.png" id="fig_transactions_detect_index_range" caption="Figure 8-11. In serializable snapshot isolation, detecting when one transaction modifies another transaction's reads." class="w-full my-4" >}}


In the context of two-phase locking we discussed index-range locks (see
[“Index-range locks”](/en/ch8#sec_transactions_2pl_range)), which allow the database to lock access to all rows matching some
search query, such as `WHERE shift_id = 1234`. We can use a similar technique here, except that SSI
locks don’t block other transactions.

In [Figure 8-11](/en/ch8#fig_transactions_detect_index_range), transactions 42 and 43 both search for on-call doctors
during shift `1234`. If there is an index on `shift_id`, the database can use the index entry 1234 to
record the fact that transactions 42 and 43 read this data. (If there is no index, this information
can be tracked at the table level.) This information only needs to be kept for a while: after a
transaction has finished (committed or aborted), and all concurrent transactions have finished, the
database can forget what data that transaction read.

When a transaction writes to the database, it must look in the indexes for any other transactions
that have recently read the affected data. This process is similar to acquiring a write lock on the affected
key range, but rather than blocking until the readers have committed, the lock acts as a tripwire:
it simply notifies the transactions that the data they read may no longer be up to date.

In [Figure 8-11](/en/ch8#fig_transactions_detect_index_range), transaction 43 notifies transaction 42 that its prior
read is outdated, and vice versa. Transaction 42 is first to commit, and it is successful: although
transaction 43’s write affected 42, 43 hasn’t yet committed, so the write has not yet taken effect.
However, when transaction 43 wants to commit, the conflicting write from 42 has already been
committed, so 43 must abort.

#### Performance of serializable snapshot isolation {#performance-of-serializable-snapshot-isolation}

As always, many engineering details affect how well an algorithm works in practice. For example, one
trade-off is the granularity at which transactions’ reads and writes are tracked. If the database
keeps track of each transaction’s activity in great detail, it can be precise about which
transactions need to abort, but the bookkeeping overhead can become significant. Less detailed
tracking is faster, but may lead to more transactions being aborted than strictly necessary.

In some cases, it’s okay for a transaction to read information that was overwritten by another
transaction: depending on what else happened, it’s sometimes possible to prove that the result of
the execution is nevertheless serializable. PostgreSQL uses this theory to reduce the number of
unnecessary aborts [^14] [^54].

Compared to two-phase locking, the big advantage of serializable snapshot isolation is that one
transaction doesn’t need to block waiting for locks held by another transaction. Like under snapshot
isolation, writers don’t block readers, and vice versa. This design principle makes query latency
much more predictable and less variable. In particular, read-only queries can run on a consistent
snapshot without requiring any locks, which is very appealing for read-heavy workloads.

Compared to serial execution, serializable snapshot isolation is not limited to the throughput of a
single CPU core: for example, FoundationDB distributes the detection of serialization conflicts across multiple
machines, allowing it to scale to very high throughput. Even though data may be sharded across
multiple machines, transactions can read and write data in multiple shards while ensuring
serializable isolation.

Compared to non-serializable snapshot isolation, the need to check for serializability violations
introduces some performance overheads. How significant these overheads are is a matter of debate:
some believe that serializability checking is not worth it [^70],
while others believe that the performance of serializability is now so good that there is no need to
use the weaker snapshot isolation any more [^67].

The rate of aborts significantly affects the overall performance of SSI. For example, a transaction
that reads and writes data over a long period of time is likely to run into conflicts and abort, so
SSI requires that read-write transactions be fairly short (long-running read-only transactions are
okay). However, SSI is less sensitive to slow transactions than two-phase locking or serial
execution.

## Distributed Transactions {#sec_transactions_distributed}

The last few sections have focused on concurrency control for isolation, the I in ACID. The
algorithms we have seen apply to both single-node and distributed databases: although there are
challenges in making concurrency control algorithms scalable (for example, performing distributed
serializability checking for SSI), the high-level ideas for distributed concurrency control are
similar to single-node concurrency control [^8].

Consistency and durability also don’t change much when we move to distributed transactions. However,
atomicity requires more care.

For transactions that execute at a single database node, atomicity is commonly implemented by the
storage engine. When the client asks the database node to commit the transaction, the database makes
the transaction’s writes durable (typically in a write-ahead log; see [“Making B-trees reliable”](/en/ch4#sec_storage_btree_wal)) and
then appends a commit record to the log on disk. If the database crashes in the middle of this
process, the transaction is recovered from the log when the node restarts: if the commit record was
successfully written to disk before the crash, the transaction is considered committed; if not, any
writes from that transaction are rolled back.

Thus, on a single node, transaction commitment crucially depends on the *order* in which data is
durably written to disk: first the data, then the commit record [^22].
The key deciding moment for whether the transaction commits or aborts is the moment at which the
disk finishes writing the commit record: before that moment, it is still possible to abort (due to a
crash), but after that moment, the transaction is committed (even if the database crashes). Thus, it
is a single device (the controller of one particular disk drive, attached to one particular node)
that makes the commit atomic.

However, what if multiple nodes are involved in a transaction? For example, perhaps you have a
multi-object transaction in a sharded database, or a global secondary index (in which the
index entry may be on a different node from the primary data; see
[“Sharding and Secondary Indexes”](/en/ch7#sec_sharding_secondary_indexes)). Most “NoSQL” distributed datastores do not support such
distributed transactions, but various distributed relational databases do.

In these cases, it is not sufficient to simply send a commit request to all of the nodes and
independently commit the transaction on each one. It could easily happen that the commit succeeds on
some nodes and fails on other nodes, as shown in [Figure 8-12](/en/ch8#fig_transactions_non_atomic):

* Some nodes may detect a constraint violation or conflict, making an abort necessary, while other
 nodes are successfully able to commit.
* Some of the commit requests might be lost in the network, causing the affected nodes to eventually
 abort due to a timeout, while other commit requests get through.
* Some nodes may crash before the commit record is fully written and roll back on recovery, while
 others successfully commit.

{{< figure src="/fig/ddia_0812.png" id="fig_transactions_non_atomic" caption="Figure 8-12. When a transaction involves multiple database nodes, it may commit on some and fail on others." class="w-full my-4" >}}


If some nodes commit the transaction but others abort it, the nodes become inconsistent with each
other. And once a transaction has been committed on one node, it cannot be retracted again if it
later turns out that it was aborted on another node. This is because once data has been committed,
it becomes visible to other transactions under *read committed* or stronger isolation. For example,
in [Figure 8-12](/en/ch8#fig_transactions_non_atomic), by the time user 1 notices that its commit failed on database 1,
user 2 has already read the data from the same transaction on database 2. If user 1’s transaction
was later aborted, user 2’s transaction would have to be reverted as well, since it was based on
data that was retroactively declared not to have existed.

A better approach is to ensure that the nodes involved in a transaction either all commit or all
abort, and to prevent a mixture of the two. Ensuring this is known as the *atomic commitment* problem.

### Two-Phase Commit (2PC) {#sec_transactions_2pc}

Two-phase commit is an algorithm for achieving atomic transaction commit across multiple nodes. It
is a classic algorithm in distributed databases [^13] [^71] [^72]. 2PC is used
internally in some databases and also made available to applications in the form of *XA transactions* [^73]
(which are supported by the Java Transaction API, for example) or via WS-AtomicTransaction for SOAP
web services [^74] [^75].

The basic flow of 2PC is illustrated in [Figure 8-13](/en/ch8#fig_transactions_two_phase_commit). Instead of a single
commit request, as with a single-node transaction, the commit/abort process in 2PC is split into two
phases (hence the name).

{{< figure src="/fig/ddia_0813.png" id="fig_transactions_two_phase_commit" title="Figure 8-13. A successful execution of two-phase commit (2PC)." class="w-full my-4" >}}


2PC uses a new component that does not normally appear in single-node transactions: a
*coordinator* (also known as *transaction manager*). The coordinator is often implemented as a
library within the same application process that is requesting the transaction (e.g., embedded in a
Java EE container), but it can also be a separate process or service. Examples of such coordinators
include Narayana, JOTM, BTM, or MSDTC.

When 2PC is used, a distributed
transaction begins with the application reading and writing data on multiple database nodes,
as normal. We call these database nodes *participants* in the transaction. When the application is
ready to commit, the coordinator begins phase 1: it sends a *prepare* request to each of the nodes,
asking them whether they are able to commit. The coordinator then tracks the responses from the
participants:

* If all participants reply “yes,” indicating they are ready to commit, then the coordinator sends
 out a *commit* request in phase 2, and the commit actually takes place.
* If any of the participants replies “no,” the coordinator sends an *abort* request to all nodes in phase 2.

This process is somewhat like the traditional marriage ceremony in Western cultures: the minister
asks the bride and groom individually whether each wants to marry the other, and typically receives
the answer “I do” from both. After receiving both acknowledgments, the minister pronounces the
couple husband and wife: the transaction is committed, and the happy fact is broadcast to all
attendees. If either bride or groom does not say “yes,” the ceremony is aborted [^76].

#### A system of promises {#a-system-of-promises}

From this short description it might not be clear why two-phase commit ensures atomicity, while
one-phase commit across several nodes does not. Surely the prepare and commit requests can just
as easily be lost in the two-phase case. What makes 2PC different?

To understand why it works, we have to break down the process in a bit more detail:

1. When the application wants to begin a distributed transaction, it requests a transaction ID from
 the coordinator. This transaction ID is globally unique.
2. The application begins a single-node transaction on each of the participants, and attaches the
 globally unique transaction ID to the single-node transaction. All reads and writes are done in
 one of these single-node transactions. If anything goes wrong at this stage (for example, a node
 crashes or a request times out), the coordinator or any of the participants can abort.
3. When the application is ready to commit, the coordinator sends a prepare request to all
 participants, tagged with the global transaction ID. If any of these requests fails or times out,
 the coordinator sends an abort request for that transaction ID to all participants.
4. When a participant receives the prepare request, it makes sure that it can definitely commit
 the transaction under all circumstances.

   This includes writing all transaction data to disk (a crash, a power failure, or running out of
   disk space is not an acceptable excuse for refusing to commit later), and checking for any
   conflicts or constraint violations. By replying “yes” to the coordinator, the node promises to
   commit the transaction without error if requested. In other words, the participant surrenders the
   right to abort the transaction, but without actually committing it.
5. When the coordinator has received responses to all prepare requests, it makes a definitive
 decision on whether to commit or abort the transaction (committing only if all participants voted
 “yes”). The coordinator must write that decision to its transaction log on disk so that it knows
 which way it decided in case it subsequently crashes. This is called the *commit point*.
6. Once the coordinator’s decision has been written to disk, the commit or abort request is sent
 to all participants. If this request fails or times out, the coordinator must retry forever until
 it succeeds. There is no more going back: if the decision was to commit, that decision must be
 enforced, no matter how many retries it takes. If a participant has crashed in the meantime, the
 transaction will be committed when it recovers—since the participant voted “yes,” it cannot
 refuse to commit when it recovers.

Thus, the protocol contains two crucial “points of no return”: when a participant votes “yes,” it
promises that it will definitely be able to commit later (although the coordinator may still choose to
abort); and once the coordinator decides, that decision is irrevocable. Those promises ensure the
atomicity of 2PC. (Single-node atomic commit lumps these two events into one: writing the commit
record to the transaction log.)

Returning to the marriage analogy, before saying “I do,” you and your bride/groom have the freedom
to abort the transaction by saying “No way!” (or something to that effect). However, after saying “I
do,” you cannot retract that statement. If you faint after saying “I do” and you don’t hear the
minister speak the words “You are now husband and wife,” that doesn’t change the fact that the
transaction was committed. When you recover consciousness later, you can find out whether you are
married or not by querying the minister for the status of your global transaction ID, or you can
wait for the minister’s next retry of the commit request (since the retries will have continued
throughout your period of unconsciousness).

#### Coordinator failure {#coordinator-failure}

We have discussed what happens if one of the participants or the network fails during 2PC: if any of
the prepare requests fails or times out, the coordinator aborts the transaction; if any of the
commit or abort requests fails, the coordinator retries them indefinitely. However, it is less
clear what happens if the coordinator crashes.

If the coordinator fails before sending the prepare requests, a participant can safely abort the
transaction. But once the participant has received a prepare request and voted “yes,” it can no
longer abort unilaterally—it must wait to hear back from the coordinator whether the transaction
was committed or aborted. If the coordinator crashes or the network fails at this point, the
participant can do nothing but wait. A participant’s transaction in this state is called *in doubt*
or *uncertain*.

The situation is illustrated in [Figure 8-14](/en/ch8#fig_transactions_2pc_crash). In this particular example, the
coordinator actually decided to commit, and database 2 received the commit request. However, the
coordinator crashed before it could send the commit request to database 1, and so database 1 does
not know whether to commit or abort. Even a timeout does not help here: if database 1 unilaterally
aborts after a timeout, it will end up inconsistent with database 2, which has committed. Similarly,
it is not safe to unilaterally commit, because another participant may have aborted.

{{< figure src="/fig/ddia_0814.png" id="fig_transactions_2pc_crash" title="Figure 8-14. The coordinator crashes after participants vote \"yes.\" Database 1 does not know whether to commit or abort." class="w-full my-4" >}}


Without hearing from the coordinator, the participant has no way of knowing whether to commit or
abort. In principle, the participants could communicate among themselves to find out how each
participant voted and come to some agreement, but that is not part of the 2PC protocol.

The only way 2PC can complete is by waiting for the coordinator to recover. This is why the
coordinator must write its commit or abort decision to a transaction log on disk before sending
commit or abort requests to participants: when the coordinator recovers, it determines the status of
all in-doubt transactions by reading its transaction log. Any transactions that don’t have a commit
record in the coordinator’s log are aborted. Thus, the commit point of 2PC comes down to a regular
single-node atomic commit on the coordinator.

#### Three-phase commit {#three-phase-commit}

Two-phase commit is called a *blocking* atomic commit protocol due to the fact that 2PC can become
stuck waiting for the coordinator to recover. It is possible to make an atomic commit protocol
*nonblocking*, so that it does not get stuck if a node fails. However, making this work in practice
is not so straightforward.

As an alternative to 2PC, an algorithm called *three-phase commit* (3PC) has been proposed [^13] [^77].
However, 3PC assumes a network with bounded delay and nodes with bounded response times; in most
practical systems with unbounded network delay and process pauses (see [Chapter 9](/en/ch9#ch_distributed)), it
cannot guarantee atomicity.

A better solution in practice is to replace the single-node coordinator with a fault-tolerant
consensus protocol. We will see how to do this in [Chapter 10](/en/ch10#ch_consistency).

### Distributed Transactions Across Different Systems {#sec_transactions_xa}

Distributed transactions and two-phase commit have a mixed reputation. On the one hand, they are
seen as providing an important safety guarantee that would be hard to achieve otherwise; on the
other hand, they are criticized for causing operational problems, killing performance, and promising
more than they can deliver [^78] [^79] [^80] [^81].
Many cloud services choose not to implement distributed transactions due to the operational problems they engender [^82].

Some implementations of distributed transactions carry a heavy performance penalty. Much of the
performance cost inherent in two-phase commit is due to the additional disk forcing (`fsync`) that
is required for crash recovery, and the additional network round-trips.

However, rather than dismissing distributed transactions outright, we should examine them in some
more detail, because there are important lessons to be learned from them. To begin, we should be
precise about what we mean by “distributed transactions.” Two quite different types of distributed
transactions are often conflated:

Database-internal distributed transactions
: Some distributed databases (i.e., databases that use replication and sharding in their standard
 configuration) support internal transactions among the nodes of that database. For example,
 YugabyteDB, TiDB, FoundationDB, Spanner, VoltDB, and MySQL Cluster’s NDB storage engine have such
 internal transaction support. In this case, all the nodes participating in the transaction are
 running the same database software.

Heterogeneous distributed transactions
: In a *heterogeneous* transaction, the participants are two or more different technologies: for
 example, two databases from different vendors, or even non-database systems such as message
 brokers. A distributed transaction across these systems must ensure atomic commit, even though
 the systems may be entirely different under the hood.

Database-internal transactions do not have to be compatible with any other system, so they can
use any protocol and apply optimizations specific to that particular technology. For that reason,
database-internal distributed transactions can often work quite well. On the other hand,
transactions spanning heterogeneous technologies are a lot more challenging.

#### Exactly-once message processing {#sec_transactions_exactly_once}

Heterogeneous distributed transactions allow diverse systems to be integrated in powerful ways. For
example, a message from a message queue can be acknowledged as processed if and only if the database
transaction for processing the message was successfully committed. This is implemented by atomically
committing the message acknowledgment and the database writes in a single transaction. With
distributed transaction support, this is possible, even if the message broker and the database are
two unrelated technologies running on different machines.

If either the message delivery or the database transaction fails, both are aborted, and so the
message broker may safely redeliver the message later. Thus, by atomically committing the message
and the side effects of its processing, we can ensure that the message is *effectively* processed
exactly once, even if it required a few retries before it succeeded. The abort discards any side
effects of the partially completed transaction. This is known as *exactly-once semantics*.

Such a distributed transaction is only possible if all systems affected by the transaction are able
to use the same atomic commit protocol, however. For example, say a side effect of processing a
message is to send an email, and the email server does not support two-phase commit: it could happen
that the email is sent two or more times if message processing fails and is retried. But if all side
effects of processing a message are rolled back on transaction abort, then the processing step can
safely be retried as if nothing had happened.

We will return to the topic of exactly-once semantics later in this chapter. Let’s look first at the
atomic commit protocol that allows such heterogeneous distributed transactions.

#### XA transactions {#xa-transactions}

*X/Open XA* (short for *eXtended Architecture*) is a standard for implementing two-phase commit
across heterogeneous technologies [^73]. It was introduced in 1991 and has been widely
implemented: XA is supported by many traditional relational databases (including PostgreSQL, MySQL,
Db2, SQL Server, and Oracle) and message brokers (including ActiveMQ, HornetQ, MSMQ, and IBM MQ).

XA is not a network protocol—it is merely a C API for interfacing with a transaction coordinator.
Bindings for this API exist in other languages; for example, in the world of Java EE applications,
XA transactions are implemented using the Java Transaction API (JTA), which in turn is supported by
many drivers for databases using Java Database Connectivity (JDBC) and drivers for message brokers
using the Java Message Service (JMS) APIs.

XA assumes that your application uses a network driver or client library to communicate with the
participant databases or messaging services. If the driver supports XA, that means it calls the XA
API to find out whether an operation should be part of a distributed transaction—and if so, it
sends the necessary information to the database server. The driver also exposes callbacks through
which the coordinator can ask the participant to prepare, commit, or abort.

The transaction coordinator implements the XA API. The standard does not specify how it should be
implemented, but in practice the coordinator is often simply a library that is loaded into the same
process as the application issuing the transaction (not a separate service). It keeps track of the
participants in a transaction, collects participants’ responses after asking them to prepare (via a
callback into the driver), and uses a log on the local disk to keep track of the commit/abort
decision for each transaction.

If the application process crashes, or the machine on which the application is running dies, the
coordinator goes with it. Any participants with prepared but uncommitted transactions are then stuck
in doubt. Since the coordinator’s log is on the application server’s local disk, that server must be
restarted, and the coordinator library must read the log to recover the commit/abort outcome of each
transaction. Only then can the coordinator use the database driver’s XA callbacks to ask
participants to commit or abort, as appropriate. The database server cannot contact the coordinator
directly, since all communication must go via its client library.

#### Holding locks while in doubt {#holding-locks-while-in-doubt}

Why do we care so much about a transaction being stuck in doubt? Can’t the rest of the system just
get on with its work, and ignore the in-doubt transaction that will be cleaned up eventually?

The problem is with *locking*. As discussed in [“Read Committed”](/en/ch8#sec_transactions_read_committed), database
transactions usually take a row-level exclusive lock on any rows they modify, to prevent dirty
writes. In addition, if you want serializable isolation, a database using two-phase locking would
also have to take a shared lock on any rows *read* by the transaction.

The database cannot release those locks until the transaction commits or aborts (illustrated as a
shaded area in [Figure 8-13](/en/ch8#fig_transactions_two_phase_commit)). Therefore, when using two-phase commit, a
transaction must hold onto the locks throughout the time it is in doubt. If the coordinator has
crashed and takes 20 minutes to start up again, those locks will be held for 20 minutes. If the
coordinator’s log is entirely lost for some reason, those locks will be held forever—or at least
until the situation is manually resolved by an administrator.

While those locks are held, no other transaction can modify those rows. Depending on the isolation
level, other transactions may even be blocked from reading those rows. Thus, other transactions
cannot simply continue with their business—if they want to access that same data, they will be
blocked. This can cause large parts of your application to become unavailable until the in-doubt
transaction is resolved.

#### Recovering from coordinator failure {#recovering-from-coordinator-failure}

In theory, if the coordinator crashes and is restarted, it should cleanly recover its state from the
log and resolve any in-doubt transactions. However, in practice, *orphaned* in-doubt transactions do occur [^83] [^84] — that is,
transactions for which the coordinator cannot decide the outcome for whatever reason (e.g., because
the transaction log has been lost or corrupted due to a software bug). These transactions cannot be
resolved automatically, so they sit forever in the database, holding locks and blocking other
transactions.

Even rebooting your database servers will not fix this problem, since a correct implementation of
2PC must preserve the locks of an in-doubt transaction even across restarts (otherwise it would risk
violating the atomicity guarantee). It’s a sticky situation.

The only way out is for an administrator to manually decide whether to commit or roll back the
transactions. The administrator must examine the participants of each in-doubt transaction,
determine whether any participant has committed or aborted already, and then apply the same outcome
to the other participants. Resolving the problem potentially requires a lot of manual effort, and
most likely needs to be done under high stress and time pressure during a serious production outage
(otherwise, why would the coordinator be in such a bad state?).

Many XA implementations have an emergency escape hatch called *heuristic decisions*: allowing a
participant to unilaterally decide to abort or commit an in-doubt transaction without a definitive
decision from the coordinator [^73]. To be clear,
*heuristic* here is a euphemism for *probably breaking atomicity*, since the heuristic decision
violates the system of promises in two-phase commit. Thus, heuristic decisions are intended only for
getting out of catastrophic situations, and not for regular use.

#### Problems with XA transactions {#problems-with-xa-transactions}

A single-node coordinator is a single point of failure for the entire system, and making it part of
the application server is also problematic because the coordinator’s logs on its local disk become a
crucial part of the durable system state—as important as the databases themselves.

In principle, the coordinator of an XA transaction could be highly available and replicated, just
like we would expect of any other important database. Unfortunately, this still doesn’t solve a
fundamental problem with XA, which is that it provides no way for the coordinator and the
participants of a transaction to communicate with each other directly. They can only communicate via
the application code that invoked the transaction, and the database drivers through which it calls
the participants.

Even if the coordinator were replicated, the application code would therefore be a single point of
failure. Solving this problem would require totally redesigning how application code is run to make
it replicated or restartable, which could perhaps look similar to durable execution (see
[“Durable Execution and Workflows”](/en/ch5#sec_encoding_dataflow_workflows)). However, there don’t seem to be any tools that actually take
this approach in practice.

Another problem is that since XA needs to be compatible with a wide range of data systems, it is
necessarily a lowest common denominator. For example, it cannot detect deadlocks across different
systems (since that would require a standardized protocol for systems to exchange information on the
locks that each transaction is waiting for), and it does not work with SSI (see
[“Serializable Snapshot Isolation (SSI)”](/en/ch8#sec_transactions_ssi)), since that would require a protocol for identifying conflicts across
different systems.

These problems are somewhat inherent in performing transactions across heterogeneous technologies.
However, keeping several heterogeneous data systems consistent with each other is still a real and
important problem, so we need to find a different solution to it. This can be done, as we will see
in the next section and in [“Derived data versus distributed transactions”](/en/ch13#sec_future_derived_vs_transactions).

### Database-internal Distributed Transactions {#sec_transactions_internal}

As explained previously, there is a big difference between distributed transactions that span
multiple heterogeneous storage technologies, and those that are internal to a system—i.e., where all
the participating nodes are shards of the same database running the same software. Such internal
distributed transactions are a defining feature of “NewSQL” databases such as
CockroachDB [^5], TiDB [^6], Spanner [^7], FoundationDB [^8], and YugabyteDB.
Some message brokers such as Kafka also support internal distributed transactions [^85].

Many of these systems use two-phase commit to ensure atomicity of transactions that write to multiple
shards, and yet they don’t suffer the same problems as XA transactions. The reason is that because
their distributed transactions don’t need to interface with any other technologies, they avoid the
lowest-common-denominator trap—the designers of these systems are free to use better protocols that
are more reliable and faster.

The biggest problems with XA can be fixed by:

* Replicating the coordinator, with automatic failover to another coordinator node if the primary one crashes;
* Allowing the coordinator and data shards to communicate directly without going via application code;
* Replicating the participating shards, so that the risk of having to abort a transaction because of a fault in one of the shards is reduced; and
* Coupling the atomic commitment protocol with a distributed concurrency control protocol that supports deadlock detection and consistent reads across shards.

Consensus algorithms are commonly used to replicate the coordinator and the database shards. We will
see in [Chapter 10](/en/ch10#ch_consistency) how atomic commitment for distributed transactions can be implemented
using a consensus algorithm. These algorithms tolerate faults by automatically failing over from one
node to another without any human intervention, while continuing to guarantee strong consistency
properties.

The isolation levels offered for distributed transactions depend on the system, but snapshot
isolation and serializable snapshot isolation are both possible across shards. The details of how
this works can be found in the papers referenced at the end of this chapter.

#### Exactly-once message processing revisited {#exactly-once-message-processing-revisited}

We saw in [“Exactly-once message processing”](/en/ch8#sec_transactions_exactly_once) that an important use case for distributed transactions
is to ensure that some operation takes effect exactly once, even if a crash occurs while it is being
processed and the processing needs to be retried. If you can atomically commit a transaction across
a message broker and a database, you can acknowledge the message to the broker if and only if it was
successfully processed and the database writes resulting from the process were committed.

However, you don’t actually need such distributed transactions to achieve exactly-once semantics. An
alternative approach, which only requires transactions within the database, is as follows:

1. Assume every message has a unique ID, and in the database you have a table of message IDs that
 have been processed. When you start processing a message from the broker, you begin a new
 transaction on the database, and check the message ID. If the same message ID is already present
 in the database, you know that it has already been processed, so you can acknowledge the message
 to the broker and drop it.
2. If the message ID is not already in the database, you add it to the table. You then process the
 message, which may result in additional writes to the database within the same transaction. When
 you finish processing the message, you commit the transaction on the database.
3. Once the database transaction is successfully committed, you can acknowledge the message to the
 broker.
4. Once the message has successfully been acknowledged to the broker, you know that it won’t try
 processing the same message again, so you can delete the message ID from the database (in a
 separate transaction).

If the message processor crashes before committing the database transaction, the transaction is
aborted and the message broker will retry processing. If it crashes after committing but before
acknowledging the message to the broker, it will also retry processing, but the retry will see the
message ID in the database and drop it. If it crashes after acknowledging the message but before
deleting the message ID from the database, you will have an old message ID lying around, which
doesn’t do any harm besides taking a little bit of storage space. If a retry happens before the
database transaction is aborted (which could happen if communication between the message processor
and the database is interrupted), a uniqueness constraint on the table of message IDs should prevent
the same message ID from being inserted by two concurrent transactions.

Thus, achieving exactly-once processing only requires transactions within the database—atomicity
across database and message broker is not necessary for this use case. Recording the message ID in
the database makes the message processing *idempotent*, so that message processing can be safely
retried without duplicating its side-effects. A similar approach is used in stream processing
frameworks such as Kafka Streams to achieve exactly-once semantics, as we shall see in [“Fault Tolerance”](/en/ch12#sec_stream_fault_tolerance).

However, internal distributed transactions within the database are still useful for the scalability
of patterns such as these: for example, they would allow the message IDs to be stored on one shard
and the main data updated by the message processing to be stored on other shards, while ensuring
atomicity of the transaction commit across those shards.


## Summary {#summary}

Transactions are an abstraction layer that allows an application to pretend that certain concurrency
problems and certain kinds of hardware and software faults don’t exist. A large class of errors is
reduced down to a simple *transaction abort*, and the application just needs to try again.

In this chapter we saw many examples of problems that transactions help prevent. Not all
applications are susceptible to all those problems: an application with very simple access patterns,
such as reading and writing only a single record, can probably manage without transactions. However,
for more complex access patterns, transactions can hugely reduce the number of potential error cases
you need to think about.

Without transactions, various error scenarios (processes crashing, network interruptions, power
outages, disk full, unexpected concurrency, etc.) mean that data can become inconsistent in various
ways. For example, denormalized data can easily go out of sync with the source data. Without
transactions, it becomes very difficult to reason about the effects that complex interacting accesses
can have on the database.

In this chapter, we went particularly deep into the topic of concurrency control. We discussed
several widely used isolation levels, in particular *read committed*, *snapshot isolation*
(sometimes called *repeatable read*), and *serializable*. We characterized those isolation levels by
discussing various examples of race conditions, summarized in [Table 8-1](/en/ch8#ch_transactions_isolation_levels):

{{< figure id="ch_transactions_isolation_levels" title="Table 8-1. Summary of anomalies that can occur at various isolation levels" class="w-full my-4" >}}

| Isolation level    | Dirty reads | Read skew   | Phantom reads | Lost updates | Write skew  |
|--------------------|-------------|-------------|---------------|--------------|-------------|
| Read uncommitted   | ✗ Possible  | ✗ Possible  | ✗ Possible    | ✗ Possible   | ✗ Possible  |
| Read committed     | ✓ Prevented | ✗ Possible  | ✗ Possible    | ✗ Possible   | ✗ Possible  |
| Snapshot isolation | ✓ Prevented | ✓ Prevented | ✓ Prevented   | ? Depends    | ✗ Possible  |
| Serializable       | ✓ Prevented | ✓ Prevented | ✓ Prevented   | ✓ Prevented  | ✓ Prevented |

Dirty reads
: One client reads another client’s writes before they have been committed. The read committed
 isolation level and stronger levels prevent dirty reads.

Dirty writes
: One client overwrites data that another client has written, but not yet committed. Almost all
 transaction implementations prevent dirty writes.

Read skew
: A client sees different parts of the database at different points in time. Some cases of read
 skew are also known as *nonrepeatable reads*. This issue is most commonly prevented with snapshot
 isolation, which allows a transaction to read from a consistent snapshot corresponding to one
 particular point in time. It is usually implemented with *multi-version concurrency control*
 (MVCC).

Lost updates
: Two clients concurrently perform a read-modify-write cycle. One overwrites the other’s write
 without incorporating its changes, so data is lost. Some implementations of snapshot isolation
 prevent this anomaly automatically, while others require a manual lock (`SELECT FOR UPDATE`).

Write skew
: A transaction reads something, makes a decision based on the value it saw, and writes the decision
 to the database. However, by the time the write is made, the premise of the decision is no longer
 true. Only serializable isolation prevents this anomaly.

Phantom reads
: A transaction reads objects that match some search condition. Another client makes a write that
 affects the results of that search. Snapshot isolation prevents straightforward phantom reads, but
 phantoms in the context of write skew require special treatment, such as index-range locks.

Weak isolation levels protect against some of those anomalies but leave you, the application
developer, to handle others manually (e.g., using explicit locking). Only serializable isolation
protects against all of these issues. We discussed three different approaches to implementing
serializable transactions:

Literally executing transactions in a serial order
: If you can make each transaction very fast to execute (typically by using stored procedures), and
 the transaction throughput is low enough to process on a single CPU core or can be sharded, this
 is a simple and effective option.

Two-phase locking
: For decades this has been the standard way of implementing serializability, but many applications
 avoid using it because of its poor performance.

Serializable snapshot isolation (SSI)
: A comparatively new algorithm that avoids most of the downsides of the previous approaches. It
 uses an optimistic approach, allowing transactions to proceed without blocking. When a transaction
 wants to commit, it is checked, and it is aborted if the execution was not serializable.

Finally, we examined how to achieve atomicity when a transaction is distributed across multiple
nodes, using two-phase commit. If those nodes are all running the same database software,
distributed transactions can work quite well, but across different storage technologies (using XA
transactions), 2PC is problematic: it is very sensitive to faults in the coordinator and the
application code driving the transaction, and it interacts poorly with concurrency control
mechanisms. Fortunately, idempotence can ensure exactly-once semantics without requiring atomic
commit across different storage technologies, and we will see more on this in later chapters.

The examples in this chapter used a relational data model. However, as discussed in
[“The need for multi-object transactions”](/en/ch8#sec_transactions_need), transactions are a valuable database feature, no matter which data model is used.


### References

[^1]: Steven J. Murdoch. [What went wrong with Horizon: learning from the Post Office Trial](https://www.benthamsgaze.org/2021/07/15/what-went-wrong-with-horizon-learning-from-the-post-office-trial/). *benthamsgaze.org*, July 2021. Archived at [perma.cc/CNM4-553F](https://perma.cc/CNM4-553F) 
[^2]: Donald D. Chamberlin, Morton M. Astrahan, Michael W. Blasgen, James N. Gray, W. Frank King, Bruce G. Lindsay, Raymond Lorie, James W. Mehl, Thomas G. Price, Franco Putzolu, Patricia Griffiths Selinger, Mario Schkolnick, Donald R. Slutz, Irving L. Traiger, Bradford W. Wade, and Robert A. Yost. [A History and Evaluation of System R](https://dsf.berkeley.edu/cs262/2005/SystemR.pdf). *Communications of the ACM*, volume 24, issue 10, pages 632–646, October 1981. [doi:10.1145/358769.358784](https://doi.org/10.1145/358769.358784) 
[^3]: Jim N. Gray, Raymond A. Lorie, Gianfranco R. Putzolu, and Irving L. Traiger. [Granularity of Locks and Degrees of Consistency in a Shared Data Base](https://citeseerx.ist.psu.edu/pdf/e127f0a6a912bb9150ecfe03c0ebf7fbc289a023). In *Modelling in Data Base Management Systems: Proceedings of the IFIP Working Conference on Modelling in Data Base Management Systems*, edited by G. M. Nijssen, pages 364–394, Elsevier/North Holland Publishing, 1976. Also in *Readings in Database Systems*, 4th edition, edited by Joseph M. Hellerstein and Michael Stonebraker, MIT Press, 2005. ISBN: 978-0-262-69314-1
[^4]: Kapali P. Eswaran, Jim N. Gray, Raymond A. Lorie, and Irving L. Traiger. [The Notions of Consistency and Predicate Locks in a Database System](https://jimgray.azurewebsites.net/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf?from=https://research.microsoft.com/en-us/um/people/gray/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf). *Communications of the ACM*, volume 19, issue 11, pages 624–633, November 1976. [doi:10.1145/360363.360369](https://doi.org/10.1145/360363.360369) 
[^5]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/pdf/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134) 
[^6]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535) 
[^7]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012. 
[^8]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559) 
[^9]: Theo Härder and Andreas Reuter. [Principles of Transaction-Oriented Database Recovery](https://citeseerx.ist.psu.edu/pdf/11ef7c142295aeb1a28a0e714c91fc8d610c3047). *ACM Computing Surveys*, volume 15, issue 4, pages 287–317, December 1983. [doi:10.1145/289.291](https://doi.org/10.1145/289.291) 
[^10]: Peter Bailis, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [HAT, not CAP: Towards Highly Available Transactions](https://www.usenix.org/system/files/conference/hotos13/hotos13-final80.pdf). At *14th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2013. 
[^11]: Armando Fox, Steven D. Gribble, Yatin Chawathe, Eric A. Brewer, and Paul Gauthier. [Cluster-Based Scalable Network Services](https://people.eecs.berkeley.edu/~brewer/cs262b/TACC.pdf). At *16th ACM Symposium on Operating Systems Principles* (SOSP), October 1997. [doi:10.1145/268998.266662](https://doi.org/10.1145/268998.266662) 
[^12]: Tony Andrews. [Enforcing Complex Constraints in Oracle](https://tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html). *tonyandrews.blogspot.co.uk*, October 2004. Archived at [archive.org](https://web.archive.org/web/20220201190625/https%3A//tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html) 
[^13]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/). 
[^14]: Alan Fekete, Dimitrios Liarokapis, Elizabeth O’Neil, Patrick O’Neil, and Dennis Shasha. [Making Snapshot Isolation Serializable](https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/2009/Papers/p492-fekete.pdf). *ACM Transactions on Database Systems*, volume 30, issue 2, pages 492–528, June 2005. [doi:10.1145/1071610.1071615](https://doi.org/10.1145/1071610.1071615) 
[^15]: Mai Zheng, Joseph Tucek, Feng Qin, and Mark Lillibridge. [Understanding the Robustness of SSDs Under Power Fault](https://www.usenix.org/system/files/conference/fast13/fast13-final80.pdf). At *11th USENIX Conference on File and Storage Technologies* (FAST), February 2013. 
[^16]: Laurie Denness. [SSDs: A Gift and a Curse](https://laur.ie/blog/2015/06/ssds-a-gift-and-a-curse/). *laur.ie*, June 2015. Archived at [perma.cc/6GLP-BX3T](https://perma.cc/6GLP-BX3T) 
[^17]: Adam Surak. [When Solid State Drives Are Not That Solid](https://www.algolia.com/blog/engineering/when-solid-state-drives-are-not-that-solid). *blog.algolia.com*, June 2015. Archived at [perma.cc/CBR9-QZEE](https://perma.cc/CBR9-QZEE) 
[^18]: Hewlett Packard Enterprise. [Bulletin: (Revision) HPE SAS Solid State Drives - Critical Firmware Upgrade Required for Certain HPE SAS Solid State Drive Models to Prevent Drive Failure at 32,768 Hours of Operation](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/CZR4-AQBS](https://perma.cc/CZR4-AQBS) 
[^19]: Craig Ringer et al. [PostgreSQL’s handling of fsync() errors is unsafe and risks data loss at least on XFS](https://www.postgresql.org/message-id/flat/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com). Email thread on pgsql-hackers mailing list, *postgresql.org*, March 2018. Archived at [perma.cc/5RKU-57FL](https://perma.cc/5RKU-57FL) 
[^20]: Anthony Rebello, Yuvraj Patel, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Can Applications Recover from fsync Failures?](https://www.usenix.org/conference/atc20/presentation/rebello) At *USENIX Annual Technical Conference* (ATC), July 2020. 
[^21]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Crash Consistency: Rethinking the Fundamental Abstractions of the File System](https://dl.acm.org/doi/pdf/10.1145/2800695.2801719). *ACM Queue*, volume 13, issue 7, pages 20–28, July 2015. [doi:10.1145/2800695.2801719](https://doi.org/10.1145/2800695.2801719) 
[^22]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014. 
[^23]: Chris Siebenmann. [Unix’s File Durability Problem](https://utcc.utoronto.ca/~cks/space/blog/unix/FileSyncProblem). *utcc.utoronto.ca*, April 2016. Archived at [perma.cc/VSS8-5MC4](https://perma.cc/VSS8-5MC4) 
[^24]: Aishwarya Ganesan, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Redundancy Does Not Imply Fault Tolerance: Analysis of Distributed Storage Reactions to Single Errors and Corruptions](https://www.usenix.org/conference/fast17/technical-sessions/presentation/ganesan). At *15th USENIX Conference on File and Storage Technologies* (FAST), February 2017. 
[^25]: Lakshmi N. Bairavasundaram, Garth R. Goodson, Bianca Schroeder, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [An Analysis of Data Corruption in the Storage Stack](https://www.usenix.org/legacy/event/fast08/tech/full_papers/bairavasundaram/bairavasundaram.pdf). At *6th USENIX Conference on File and Storage Technologies* (FAST), February 2008. 
[^26]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/conference/fast16/technical-sessions/presentation/schroeder). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016. 
[^27]: Don Allison. [SSD Storage – Ignorance of Technology Is No Excuse](https://blog.korelogic.com/blog/2015/03/24). *blog.korelogic.com*, March 2015. Archived at [perma.cc/9QN4-9SNJ](https://perma.cc/9QN4-9SNJ) 
[^28]: Gordon Mah Ung. [Debunked: Your SSD won’t lose data if left unplugged after all](https://www.pcworld.com/article/427602/debunked-your-ssd-wont-lose-data-if-left-unplugged-after-all.html). *pcworld.com*, May 2015. Archived at [perma.cc/S46H-JUDU](https://perma.cc/S46H-JUDU) 
[^29]: Martin Kleppmann. [Hermitage: Testing the ‘I’ in ACID](https://martin.kleppmann.com/2014/11/25/hermitage-testing-the-i-in-acid.html). *martin.kleppmann.com*, November 2014. Archived at [perma.cc/KP2Y-AQGK](https://perma.cc/KP2Y-AQGK) 
[^30]: Todd Warszawski and Peter Bailis. [ACIDRain: Concurrency-Related Attacks on Database-Backed Web Applications](http://www.bailis.org/papers/acidrain-sigmod2017.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3064037](https://doi.org/10.1145/3035918.3064037) 
[^31]: Tristan D’Agosta. [BTC Stolen from Poloniex](https://bitcointalk.org/index.php?topic=499580). *bitcointalk.org*, March 2014. Archived at [perma.cc/YHA6-4C5D](https://perma.cc/YHA6-4C5D) 
[^32]: bitcointhief2. [How I Stole Roughly 100 BTC from an Exchange and How I Could Have Stolen More!](https://www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/) *reddit.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20250118042610/https%3A//www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/) 
[^33]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan. [Automating the Detection of Snapshot Isolation Anomalies](https://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007. 
[^34]: Michael Melanson. [Transactions: The Limits of Isolation](https://www.michaelmelanson.net/posts/transactions-the-limits-of-isolation/). *michaelmelanson.net*, November 2014. Archived at [perma.cc/RG5R-KMYZ](https://perma.cc/RG5R-KMYZ) 
[^35]: Edward Kim. [How ACH works: A developer perspective — Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/7B2H-PU94](https://perma.cc/7B2H-PU94) 
[^36]: Hal Berenson, Philip A. Bernstein, Jim N. Gray, Jim Melton, Elizabeth O’Neil, and Patrick O’Neil. [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 1995. [doi:10.1145/568271.223785](https://doi.org/10.1145/568271.223785) 
[^37]: Atul Adya. [Weak Consistency: A Generalized Theory and Optimistic Implementations for Distributed Transactions](https://pmg.csail.mit.edu/papers/adya-phd.pdf). PhD Thesis, Massachusetts Institute of Technology, March 1999. Archived at [perma.cc/E97M-HW5Q](https://perma.cc/E97M-HW5Q) 
[^38]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). At *40th International Conference on Very Large Data Bases* (VLDB), September 2014. 
[^39]: Natacha Crooks, Youer Pu, Lorenzo Alvisi, and Allen Clement. [Seeing is Believing: A Client-Centric Specification of Database Isolation](https://www.cs.cornell.edu/lorenzo/papers/Crooks17Seeing.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), pages 73–82, July 2017. [doi:10.1145/3087801.3087802](https://doi.org/10.1145/3087801.3087802) 
[^40]: Bruce Momjian. [MVCC Unmasked](https://momjian.us/main/writings/pgsql/mvcc.pdf). *momjian.us*, July 2014. Archived at [perma.cc/KQ47-9GYB](https://perma.cc/KQ47-9GYB) 
[^41]: Peter Alvaro and Kyle Kingsbury. [MySQL 8.0.34](https://jepsen.io/analyses/mysql-8.0.34). *jepsen.io*, December 2023. Archived at [perma.cc/HGE2-Z878](https://perma.cc/HGE2-Z878) 
[^42]: Egor Rogov. [PostgreSQL 14 Internals](https://postgrespro.com/community/books/internals). *postgrespro.com*, April 2023. Archived at [perma.cc/FRK2-D7WB](https://perma.cc/FRK2-D7WB) 
[^43]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017. 
[^44]: Rohan Reddy Alleti. [Internals of MVCC in Postgres: Hidden costs of Updates vs Inserts](https://medium.com/%40rohanjnr44/internals-of-mvcc-in-postgres-hidden-costs-of-updates-vs-inserts-381eadd35844). *medium.com*, March 2025. Archived at [perma.cc/3ACX-DFXT](https://perma.cc/3ACX-DFXT) 
[^45]: Andy Pavlo and Bohan Zhang. [The Part of PostgreSQL We Hate the Most](https://www.cs.cmu.edu/~pavlo/blog/2023/04/the-part-of-postgresql-we-hate-the-most.html). *cs.cmu.edu*, April 2023. Archived at [perma.cc/XSP6-3JBN](https://perma.cc/XSP6-3JBN) 
[^46]: Yingjun Wu, Joy Arulraj, Jiexi Lin, Ran Xian, and Andrew Pavlo. [An empirical evaluation of in-memory multi-version concurrency control](https://vldb.org/pvldb/vol10/p781-Wu.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 7, pages 781–792, March 2017. [doi:10.14778/3067421.3067427](https://doi.org/10.14778/3067421.3067427) 
[^47]: Nikita Prokopov. [Unofficial Guide to Datomic Internals](https://tonsky.me/blog/unofficial-guide-to-datomic-internals/). *tonsky.me*, May 2014. 
[^48]: Daniil Svetlov. [A Practical Guide to Taming Postgres Isolation Anomalies](https://dansvetlov.me/postgres-anomalies/). *dansvetlov.me*, March 2025. Archived at [perma.cc/L7LE-TDLS](https://perma.cc/L7LE-TDLS) 
[^49]: Nate Wiger. [An Atomic Rant](https://nateware.com/2010/02/18/an-atomic-rant/). *nateware.com*, February 2010. Archived at [perma.cc/5ZYB-PE44](https://perma.cc/5ZYB-PE44) 
[^50]: James Coglan. [Reading and writing, part 3: web applications](https://blog.jcoglan.com/2020/10/12/reading-and-writing-part-3/). *blog.jcoglan.com*, October 2020. Archived at [perma.cc/A7EK-PJVS](https://perma.cc/A7EK-PJVS) 
[^51]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](https://doi.org/10.1145/2723372.2737784) 
[^52]: Jaana Dogan. [Things I Wished More Developers Knew About Databases](https://rakyll.medium.com/things-i-wished-more-developers-knew-about-databases-2d0178464f78). *rakyll.medium.com*, April 2020. Archived at [perma.cc/6EFK-P2TD](https://perma.cc/6EFK-P2TD) 
[^53]: Michael J. Cahill, Uwe Röhm, and Alan Fekete. [Serializable Isolation for Snapshot Databases](https://www.cs.cornell.edu/~sowell/dbpapers/serializable_isolation.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376690](https://doi.org/10.1145/1376616.1376690) 
[^54]: Dan R. K. Ports and Kevin Grittner. [Serializable Snapshot Isolation in PostgreSQL](https://drkp.net/papers/ssi-vldb12.pdf). At *38th International Conference on Very Large Data Bases* (VLDB), August 2012.
[^55]: Douglas B. Terry, Marvin M. Theimer, Karin Petersen, Alan J. Demers, Mike J. Spreitzer, and Carl H. Hauser. [Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](https://pdos.csail.mit.edu/6.824/papers/bayou-conflicts.pdf). At *15th ACM Symposium on Operating Systems Principles* (SOSP), December 1995. [doi:10.1145/224056.224070](https://doi.org/10.1145/224056.224070)
[^56]: Hans-Jürgen Schönig. [Constraints over multiple rows in PostgreSQL](https://www.cybertec-postgresql.com/en/postgresql-constraints-over-multiple-rows/). *cybertec-postgresql.com*, June 2021. Archived at [perma.cc/2TGH-XUPZ](https://perma.cc/2TGH-XUPZ) 
[^57]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007. 
[^58]: John Hugg. [H-Store/VoltDB Architecture vs. CEP Systems and Newer Streaming Architectures](https://www.youtube.com/watch?v=hD5M4a1UVz8). At *Data @Scale Boston*, November 2014. 
[^59]: Robert Kallman, Hideaki Kimura, Jonathan Natkins, Andrew Pavlo, Alexander Rasin, Stanley Zdonik, Evan P. C. Jones, Samuel Madden, Michael Stonebraker, Yang Zhang, John Hugg, and Daniel J. Abadi. [H-Store: A High-Performance, Distributed Main Memory Transaction Processing System](https://www.vldb.org/pvldb/vol1/1454211.pdf). *Proceedings of the VLDB Endowment*, volume 1, issue 2, pages 1496–1499, August 2008. 
[^60]: Rich Hickey. [The Architecture of Datomic](https://www.infoq.com/articles/Architecture-Datomic/). *infoq.com*, November 2012. Archived at [perma.cc/5YWU-8XJK](https://perma.cc/5YWU-8XJK) 
[^61]: John Hugg. [Debunking Myths About the VoltDB In-Memory Database](https://dzone.com/articles/debunking-myths-about-voltdb). *dzone.com*, May 2014. Archived at [perma.cc/2Z9N-HPKF](https://perma.cc/2Z9N-HPKF) 
[^62]: Xinjing Zhou, Viktor Leis, Xiangyao Yu, and Michael Stonebraker. [OLTP Through the Looking Glass 16 Years Later: Communication is the New Bottleneck](https://www.vldb.org/cidrdb/papers/2025/p17-zhou.pdf). At *15th Annual Conference on Innovative Data Systems Research* (CIDR), January 2025. 
[^63]: Xinjing Zhou, Xiangyao Yu, Goetz Graefe, and Michael Stonebraker. [Lotus: scalable multi-partition transactions on single-threaded partitioned databases](https://www.vldb.org/pvldb/vol15/p2939-zhou.pdf). *Proceedings of the VLDB Endowment* (PVLDB), volume 15, issue 11, pages 2939–2952, July 2022. [doi:10.14778/3551793.3551843](https://doi.org/10.14778/3551793.3551843) 
[^64]: Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton. [Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf). *Foundations and Trends in Databases*, volume 1, issue 2, pages 141–259, November 2007. [doi:10.1561/1900000002](https://doi.org/10.1561/1900000002) 
[^65]: Michael J. Cahill. [Serializable Isolation for Snapshot Databases](https://ses.library.usyd.edu.au/bitstream/handle/2123/5353/michael-cahill-2009-thesis.pdf). PhD Thesis, University of Sydney, July 2009. Archived at [perma.cc/727J-NTMP](https://perma.cc/727J-NTMP) 
[^66]: Cristian Diaconu, Craig Freedman, Erik Ismert, Per-Åke Larson, Pravin Mittal, Ryan Stonecipher, Nitin Verma, and Mike Zwilling. [Hekaton: SQL Server’s Memory-Optimized OLTP Engine](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1243–1254, June 2013. [doi:10.1145/2463676.2463710](https://doi.org/10.1145/2463676.2463710) 
[^67]: Thomas Neumann, Tobias Mühlbauer, and Alfons Kemper. [Fast Serializable Multi-Version Concurrency Control for Main-Memory Database Systems](https://db.in.tum.de/~muehlbau/papers/mvcc.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 677–689, May 2015. [doi:10.1145/2723372.2749436](https://doi.org/10.1145/2723372.2749436) 
[^68]: D. Z. Badal. [Correctness of Concurrency Control and Implications in Distributed Databases](https://ieeexplore.ieee.org/abstract/document/762563). At *3rd International IEEE Computer Software and Applications Conference* (COMPSAC), November 1979. [doi:10.1109/CMPSAC.1979.762563](https://doi.org/10.1109/CMPSAC.1979.762563) 
[^69]: Rakesh Agrawal, Michael J. Carey, and Miron Livny. [Concurrency Control Performance Modeling: Alternatives and Implications](https://people.eecs.berkeley.edu/~brewer/cs262/ConcControl.pdf). *ACM Transactions on Database Systems* (TODS), volume 12, issue 4, pages 609–654, December 1987. [doi:10.1145/32204.32220](https://doi.org/10.1145/32204.32220) 
[^70]: Marc Brooker. [Snapshot Isolation vs Serializability](https://brooker.co.za/blog/2024/12/17/occ-and-isolation.html). *brooker.co.za*, December 2024. Archived at [perma.cc/5TRC-CR5G](https://perma.cc/5TRC-CR5G) 
[^71]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD) 
[^72]: C. Mohan, Bruce G. Lindsay, and Ron Obermarck. [Transaction Management in the R\* Distributed Database Management System](https://cs.brown.edu/courses/csci2270/archives/2012/papers/dtxn/p378-mohan.pdf). *ACM Transactions on Database Systems*, volume 11, issue 4, pages 378–396, December 1986. [doi:10.1145/7239.7266](https://doi.org/10.1145/7239.7266) 
[^73]: X/Open Company Ltd. [Distributed Transaction Processing: The XA Specification](https://pubs.opengroup.org/onlinepubs/009680699/toc.pdf). Technical Standard XO/CAE/91/300, December 1991. ISBN: 978-1-872-63024-3, archived at [perma.cc/Z96H-29JB](https://perma.cc/Z96H-29JB) 
[^74]: Ivan Silva Neto and Francisco Reverbel. [Lessons Learned from Implementing WS-Coordination and WS-AtomicTransaction](https://www.ime.usp.br/~reverbel/papers/icis2008.pdf). At *7th IEEE/ACIS International Conference on Computer and Information Science* (ICIS), May 2008. [doi:10.1109/ICIS.2008.75](https://doi.org/10.1109/ICIS.2008.75) 
[^75]: James E. Johnson, David E. Langworthy, Leslie Lamport, and Friedrich H. Vogt. [Formal Specification of a Web Services Protocol](https://www.microsoft.com/en-us/research/publication/formal-specification-of-a-web-services-protocol/). At *1st International Workshop on Web Services and Formal Methods* (WS-FM), February 2004. [doi:10.1016/j.entcs.2004.02.022](https://doi.org/10.1016/j.entcs.2004.02.022) 
[^76]: Jim Gray. [The Transaction Concept: Virtues and Limitations](https://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf). At *7th International Conference on Very Large Data Bases* (VLDB), September 1981. 
[^77]: Dale Skeen. [Nonblocking Commit Protocols](https://www.cs.utexas.edu/~lorenzo/corsi/cs380d/papers/Ske81.pdf). At *ACM International Conference on Management of Data* (SIGMOD), April 1981. [doi:10.1145/582318.582339](https://doi.org/10.1145/582318.582339) 
[^78]: Gregor Hohpe. [Your Coffee Shop Doesn’t Use Two-Phase Commit](https://www.martinfowler.com/ieeeSoftware/coffeeShop.pdf). *IEEE Software*, volume 22, issue 2, pages 64–66, March 2005. [doi:10.1109/MS.2005.52](https://doi.org/10.1109/MS.2005.52) 
[^79]: Pat Helland. [Life Beyond Distributed Transactions: An Apostate’s Opinion](https://www.cidrdb.org/cidr2007/papers/cidr07p15.pdf). At *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007. 
[^80]: Jonathan Oliver. [My Beef with MSDTC and Two-Phase Commits](https://blog.jonathanoliver.com/my-beef-with-msdtc-and-two-phase-commits/). *blog.jonathanoliver.com*, April 2011. Archived at [perma.cc/K8HF-Z4EN](https://perma.cc/K8HF-Z4EN) 
[^81]: Oren Eini (Ayende Rahien). [The Fallacy of Distributed Transactions](https://ayende.com/blog/167362/the-fallacy-of-distributed-transactions). *ayende.com*, July 2014. Archived at [perma.cc/VB87-2JEF](https://perma.cc/VB87-2JEF) 
[^82]: Clemens Vasters. [Transactions in Windows Azure (with Service Bus) – An Email Discussion](https://learn.microsoft.com/en-gb/archive/blogs/clemensv/transactions-in-windows-azure-with-service-bus-an-email-discussion). *learn.microsoft.com*, July 2012. Archived at [perma.cc/4EZ9-5SKW](https://perma.cc/4EZ9-5SKW) 
[^83]: Ajmer Dhariwal. [Orphaned MSDTC Transactions (-2 spids)](https://www.eraofdata.com/posts/2008/orphaned-msdtc-transactions-2-spids/). *eraofdata.com*, December 2008. Archived at [perma.cc/YG6F-U34C](https://perma.cc/YG6F-U34C) 
[^84]: Paul Randal. [Real World Story of DBCC PAGE Saving the Day](https://www.sqlskills.com/blogs/paul/real-world-story-of-dbcc-page-saving-the-day/). *sqlskills.com*, June 2013. Archived at [perma.cc/2MJN-A5QH](https://perma.cc/2MJN-A5QH) 
[^85]: Guozhang Wang, Lei Chen, Ayusman Dikshit, Jason Gustafson, Boyang Chen, Matthias J. Sax, John Roesler, Sophie Blee-Goldman, Bruno Cadonna, Apurva Mehta, Varun Madan, and Jun Rao. [Consistency and Completeness: Rethinking Distributed Stream Processing in Apache Kafka](https://dl.acm.org/doi/pdf/10.1145/3448016.3457556). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457556](https://doi.org/10.1145/3448016.3457556)


================================================
FILE: content/en/ch9.md
================================================
---
title: "9. The Trouble with Distributed Systems"
weight: 209
breadcrumbs: false
---

<a id="ch_distributed"></a>

![](/map/ch08.png)

> *They’re funny things, Accidents. You never have them till you’re having them.*
>
> A.A. Milne, *The House at Pooh Corner* (1928)

As discussed in [“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability), making a system reliable means ensuring that the
system as a whole continues working, even when things go wrong (i.e., when there is a fault).
However, anticipating all the possible faults and handling them is not that easy. As a developer, it
is very tempting to focus mostly on the happy path (after all, most of the time things work fine!)
and to neglect faults, since they introduce a lot of edge cases.

If you want your system to be reliable in the presence of faults you have to radically change your
mindset, and focus on the things that could go wrong, even though they may be unlikely. It doesn’t
matter whether there is only a one-in-a-million chance of a thing going wrong: in a large enough
system, one-in-a-million events happen every day. Experienced systems operators will tell you that
anything that *can* go wrong *will* go wrong.

Moreover, working with distributed systems is fundamentally different from writing software on a
single computer—and the main difference is that there are lots of new and exciting ways for things
to go wrong [^1] [^2].
In this chapter, you will get a taste of the problems that arise in practice, and an understanding
of the things you can and cannot rely on.

To understand what challenges we are up against, we will now turn our pessimism to the maximum and
explore the things that may go wrong in a distributed system. We will look into problems with
networks ([“Unreliable Networks”](/en/ch9#sec_distributed_networks)) as well as clocks and timing issues
([“Unreliable Clocks”](/en/ch9#sec_distributed_clocks)). The consequences of all these issues are disorienting, so we’ll
explore how to think about the state of a distributed system and how to reason about things that
have happened ([“Knowledge, Truth, and Lies”](/en/ch9#sec_distributed_truth)). Later, in [Chapter 10](/en/ch10#ch_consistency), we will look at some
examples of how we can achieve fault tolerance in the face of those faults.

## Faults and Partial Failures {#sec_distributed_partial_failure}

When you are writing a program on a single computer, it normally behaves in a fairly predictable
way: either it works or it doesn’t. Buggy software may give the appearance that the computer is
sometimes “having a bad day” (a problem that is often fixed by a reboot), but that is mostly just
a consequence of badly written software.

There is no fundamental reason why software on a single computer should be flaky: when the hardware
is working correctly, the same operation always produces the same result (it is *deterministic*). If
there is a hardware problem (e.g., memory corruption or a loose connector), the consequence is usually a
total system failure (e.g., kernel panic, “blue screen of death,” failure to start up). An individual
computer with good software is usually either fully functional or entirely broken, but not something
in between.

This is a deliberate choice in the design of computers: if an internal fault occurs, we prefer a
computer to crash completely rather than returning a wrong result, because wrong results are difficult
and confusing to deal with. Thus, computers hide the fuzzy physical reality on which they are
implemented and present an idealized system model that operates with mathematical perfection. A CPU
instruction always does the same thing; if you write some data to memory or disk, that data remains
intact and doesn’t get randomly corrupted. As discussed in [“Hardware and Software Faults”](/en/ch2#sec_introduction_hardware_faults),
this is not actually true—in reality, data does get silently corrupted and CPUs do sometimes
silently return the wrong result—but it happens rarely enough that we can get away with ignoring it.

When you are writing software that runs on several computers, connected by a network, the situation
is fundamentally different. In distributed systems, faults occur much more frequently, and so we can
no longer ignore them—we have no choice but to confront the messy reality of the physical world. And
in the physical world, a remarkably wide range of things can go wrong, as illustrated by this
anecdote [^3]:

> In my limited experience I’ve dealt with long-lived network partitions in a single data center (DC),
> PDU [power distribution unit] failures, switch failures, accidental power cycles of whole racks,
> whole-DC backbone failures, whole-DC power failures, and a hypoglycemic driver smashing his Ford
> pickup truck into a DC’s HVAC [heating, ventilation, and air conditioning] system. And I’m not even
> an ops guy.
>
> —— Coda Hale

In a distributed system, there may well be some parts of the system that are broken in some
unpredictable way, even though other parts of the system are working fine. This is known as a
*partial failure*. The difficulty is that partial failures are *nondeterministic*: if you try to do
anything involving multiple nodes and the network, it may sometimes work and sometimes unpredictably
fail. As we shall see, you may not even *know* whether something succeeded or not!

This nondeterminism and possibility of partial failures is what makes distributed systems hard to work with [^4].
On the other hand, if a distributed system can tolerate partial failures, that opens up powerful
possibilities: for example, it allows you to perform a rolling upgrade, rebooting one node at a time
to install software updates while the system as a whole continues working uninterrupted all the
time. Fault tolerance therefore allows us to make distributed systems more reliable than single-node
systems: we can build a reliable system from unreliable components.

But before we can implement fault tolerance, we need to know more about the faults that we’re
supposed to tolerate. It is important to consider a wide range of possible faults—even fairly
unlikely ones—and to artificially create such situations in your testing environment to see what
happens. In distributed systems, suspicion, pessimism, and paranoia pay off.

## Unreliable Networks {#sec_distributed_networks}

As discussed in [“Shared-Memory, Shared-Disk, and Shared-Nothing Architecture”](/en/ch2#sec_introduction_shared_nothing), the distributed systems we focus on
in this book are mostly *shared-nothing systems*: i.e., a bunch of machines connected by a network.
The network is the only way those machines can communicate—we assume that each machine has its
own memory and disk, and one machine cannot access another machine’s memory or disk (except by
making requests to a service over the network). Even when storage is shared, such as with Amazon’s
S3, machines communicate with shared storage services over the network.

The internet and most internal networks in datacenters (often Ethernet) are *asynchronous packet
networks*. In this kind of network, one node can send a message (a packet) to another node, but the
network gives no guarantees as to when it will arrive, or whether it will arrive at all. If you send
a request and expect a response, many things could go wrong (some of which are illustrated in
[Figure 9-1](/en/ch9#fig_distributed_network)):

1. Your request may have been lost (perhaps someone unplugged a network cable).
2. Your request may be waiting in a queue and will be delivered later (perhaps the network or the
 recipient is overloaded).
3. The remote node may have failed (perhaps it crashed or it was powered down).
4. The remote node may have temporarily stopped responding (perhaps it is experiencing a long
 garbage collection pause; see [“Process Pauses”](/en/ch9#sec_distributed_clocks_pauses)), but it will start responding
 again later.
5. The remote node may have processed your request, but the response has been lost on the network
 (perhaps a network switch has been misconfigured).
6. The remote node may have processed your request, but the response has been delayed and will be
 delivered later (perhaps the network or your own machine is overloaded).

{{< figure src="/fig/ddia_0901.png" id="fig_distributed_network" caption="Figure 9-1. If you send a request and don't get a response, it's not possible to distinguish whether (a) the request was lost, (b) the remote node is down, or (c) the response was lost." class="w-full my-4" >}}


The sender can’t even tell whether the packet was delivered: the only option is for the recipient to
send a response message, which may in turn be lost or delayed. These issues are indistinguishable in
an asynchronous network: the only information you have is that you haven’t received a response yet.
If you send a request to another node and don’t receive a response, it is *impossible* to tell why.

The usual way of handling this issue is a *timeout*: after some time you give up waiting and assume that
the response is not going to arrive. However, when a timeout occurs, you still don’t know whether
the remote node got your request or not (and if the request is still queued somewhere, it may still
be delivered to the recipient, even if the sender has given up on it).

### The Limitations of TCP {#sec_distributed_tcp}

Network packets have a maximum size (generally a few kilobytes), but many applications need to send
messages (requests, responses) that are too big to fit in one packet. These applications most often
use TCP, the Transmission Control Protocol, to establish a *connection* that breaks up large data
streams into individual packets, and puts them back together again on the receiving side.

--------

> [!NOTE]
> Most of what we say about TCP applies also to its more recent alternative QUIC, as well as the
> Stream Control Transmission Protocol (SCTP) used in WebRTC, the BitTorrent uTP protocol, and
> other transport protocols. For a comparison to UDP, see [“TCP Versus UDP”](/en/ch9#sidebar_distributed_tcp_udp).

--------

TCP is often described as providing “reliable” delivery, in the sense that it detects and
retransmits dropped packets, it detects reordered packets and puts them back in the correct order,
and it detects packet corruption using a simple checksum. It also figures out how fast it can send
data so that it is transferred as quickly as possible, but without overloading the network or the
receiving node; this is known as *congestion control*, *flow control*, or *backpressure* [^5].

When you “send” some data by writing it to a socket, it actually doesn’t get sent immediately,
but it’s only placed in a buffer managed by your operating system. When the congestion control
algorithm decides that it has capacity to send a packet, it takes the next packet-worth of data from
that buffer and passes it to the network interface. The packet passes through several switches and
routers, and eventually the receiving node’s operating system places the packet’s data in a receive
buffer and sends an acknowledgment packet back to the sender. Only then does the receiving operating
system notify the application that some more data has arrived [^6].

So, if TCP provides “reliability”, does that mean we no longer need to worry about networks being
unreliable? Unfortunately not. TCP decides that a packet must have been lost if no acknowledgment
arrives within some timeout, but it also can’t tell whether it was the outbound packet or the
acknowledgment that was lost. Although TCP can resend the packet, it can’t guarantee that the new
packet will get through either. If the network cable is unplugged, TCP can’t plug it back in for
you. Eventually, after a configurable timeout, TCP gives up and signals an error to the application.

If a TCP connection is closed with an error—perhaps because the remote node crashed, or perhaps
because the network was interrupted—you unfortunately have no way of knowing how much data was
actually processed by the remote node [^6].
Even if TCP acknowledged that a packet was delivered, this only means that the operating system
kernel on the remote node received it, but the application may have crashed before it handled that
data. If you want to be sure that a request was successful, you need a positive response from the
application itself [^7].

Nevertheless, TCP is very useful, because it provides a convenient way of sending and receiving
messages that are too big to fit in one packet. Once a TCP connection is established, you can also
use it to send multiple requests and responses. This is usually done by first sending a header that
indicates the length of the following message in bytes, followed by the actual message. HTTP and
many RPC protocols (see [“Dataflow Through Services: REST and RPC”](/en/ch5#sec_encoding_dataflow_rpc)) work like this.

### Network Faults in Practice {#sec_distributed_network_faults}

We have been building computer networks for decades—one might hope that by now we would have figured
out how to make them reliable. Unfortunately, we have not yet succeeded. There are some systematic
studies, and plenty of anecdotal evidence, showing that network problems can be surprisingly common,
even in controlled environments like a datacenter operated by one company [^8]:

* One study in a medium-sized datacenter found about 12 network faults per month, of which half
 disconnected a single machine, and half disconnected an entire rack [^9].
* Another study measured the failure rates of components like top-of-rack switches, aggregation
 switches, and load balancers [^10].
 It found that adding redundant networking gear doesn’t reduce faults as much as you might hope,
 since it doesn’t guard against human error (e.g., misconfigured switches), which is a major cause
 of outages.
* Interruptions of wide-area fiber links have been blamed on cows [^11], beavers [^12], and sharks [^13]
 (though shark bites have become rarer due to better shielding of submarine cables [^14]).
 Humans are also at fault, be it due to accidental misconfiguration [^15], scavenging [^16], or sabotage [^17].
* Across different cloud regions, round-trip times of up to several *minutes* have been observed at
 high percentiles [^18].
 Even within a single datacenter, packet delay of more than a minute can occur during a network
 topology reconfiguration, triggered by a problem during a software upgrade for a switch [^19].
 Thus, we have to assume that messages might be delayed arbitrarily.
* Sometimes communications are partially interrupted, depending on who you’re talking to: for
 example, A and B can communicate, B and C can communicate, but A and C cannot [^20] [^21].
 Other surprising faults include a network interface that sometimes drops all inbound packets but
 sends outbound packets successfully [^22]:
 just because a network link works in one direction doesn’t guarantee it’s also working in the opposite direction.
* Even a brief network interruption can have repercussions that last for much longer than the
 original issue [^8] [^20] [^23].

--------

> [!TIP] NETWORK PARTITIONS

When one part of the network is cut off from the rest due to a network fault, that is sometimes
called a *network partition* or *netsplit*, but it is not fundamentally different from other kinds
of network interruption. Network partitions are not related to sharding of a storage system, which
is sometimes also called *partitioning* (see [Chapter 7](/en/ch7#ch_sharding)).

--------

Even if network faults are rare in your environment, the fact that faults *can* occur means that
your software needs to be able to handle them. Whenever any communication happens over a network, it
may fail—there is no way around it.

If the error handling of network faults is not defined and tested, arbitrarily bad things could
happen: for example, the cluster could become deadlocked and permanently unable to serve requests,
even when the network recovers [^24],
or it could even delete all of your data [^25].
If software is put in an unanticipated situation, it may do arbitrary unexpected things.

Handling network faults doesn’t necessarily mean *tolerating* them: if your network is normally
fairly reliable, a valid approach may be to simply show an error message to users while your network
is experiencing problems. However, you do need to know how your software reacts to network problems
and ensure that the system can recover from them.
It may make sense to deliberately trigger network problems and test the system’s response (this is
known as *fault injection*; see [“Fault injection”](/en/ch9#sec_fault_injection)).

### Detecting Faults {#id307}

Many systems need to automatically detect faulty nodes. For example:

* A load balancer needs to stop sending requests to a node that is dead (i.e., take it *out of rotation*).
* In a distributed database with single-leader replication, if the leader fails, one of the
 followers needs to be promoted to be the new leader (see [“Handling Node Outages”](/en/ch6#sec_replication_failover)).

Unfortunately, the uncertainty about the network makes it difficult to tell whether a node is
working or not. In some specific circumstances you might get some feedback to explicitly tell you
that something is not working:

* If you can reach the machine on which the node should be running, but no process is listening on
 the destination port (e.g., because the process crashed), the operating system will helpfully close
 or refuse TCP connections by sending a `RST` or `FIN` packet in reply.
* If a node process crashed (or was killed by an administrator) but the node’s operating system is
 still running, a script can notify other nodes about the crash so that another node can take over
 quickly without having to wait for a timeout to expire. For example, HBase does this [^26].
* If you have access to the management interface of the network switches in your datacenter, you can
 query them to detect link failures at a hardware level (e.g., if the remote machine is powered
 down). This option is ruled out if you’re connecting via the internet, or if you’re in a shared
 datacenter with no access to the switches themselves, or if you can’t reach the management
 interface due to a network problem.
* If a router is sure that the IP address you’re trying to connect to is unreachable, it may reply
 to you with an ICMP Destination Unreachable packet. However, the router doesn’t have a magic
 failure detection capability either—it is subject to the same limitations as other participants
 of the network.

Rapid feedback about a remote node being down is useful, but you can’t count on it. If something has
gone wrong, you may get an error response at some level of the stack, but in general you have to
assume that you will get no response at all. You can retry a few times, wait for a timeout to
elapse, and eventually declare the node dead if you don’t hear back within the timeout.

### Timeouts and Unbounded Delays {#sec_distributed_queueing}

If a timeout is the only sure way of detecting a fault, then how long should the timeout be? There
is unfortunately no simple answer.

A long timeout means a long wait until a node is declared dead (and during this time, users may have
to wait or see error messages). A short timeout detects faults faster, but carries a higher risk of
incorrectly declaring a node dead when in fact it has only suffered a temporary slowdown (e.g., due
to a load spike on the node or the network).

Prematurely declaring a node dead is problematic: if the node is actually alive and in the middle of
performing some action (for example, sending an email), and another node takes over, the action may
end up being performed twice. We will discuss this issue in more detail in
[“Knowledge, Truth, and Lies”](/en/ch9#sec_distributed_truth), [Chapter 10](/en/ch10#ch_consistency), and [“The End-to-End Argument for Databases”](/en/ch13#sec_future_end_to_end).

When a node is declared dead, its responsibilities need to be transferred to other nodes, which
places additional load on other nodes and the network. If the system is already struggling with high
load, declaring nodes dead prematurely can make the problem worse. In particular, it could happen
that the node actually wasn’t dead but only slow to respond due to overload; transferring its load
to other nodes can cause a cascading failure (in the extreme case, all nodes declare each other
dead, and everything stops working—see [“When an overloaded system won’t recover”](/en/ch2#sidebar_metastable)).

Imagine a fictitious system with a network that guaranteed a maximum delay for packets—every packet
is either delivered within some time *d*, or it is lost, but delivery never takes longer than *d*.
Furthermore, assume that you can guarantee that a non-failed node always handles a request within
some time *r*. In this case, you could guarantee that every successful request receives a response
within time 2*d* + *r*—and if you don’t receive a response within that time, you know
that either the network or the remote node is not working. If this were true,
2*d* + *r* would be a reasonable timeout to use.

Unfortunately, most systems we work with have neither of those guarantees: asynchronous networks
have *unbounded delays* (that is, they try to deliver packets as quickly as possible, but there is
no upper limit on the time it may take for a packet to arrive), and most server implementations
cannot guarantee that they can handle requests within some maximum time (see
[“Response time guarantees”](/en/ch9#sec_distributed_clocks_realtime)). For failure detection, it’s not sufficient for the system to
be fast most of the time: if your timeout is low, it only takes a transient spike in round-trip
times to throw the system off-balance.

<a id="sec_distributed_congestion"></a>

#### Network congestion and queueing {#network-congestion-and-queueing}

When you drive a car, travel times on road networks often vary most due to traffic congestion.
Similarly, the variability of packet delays on computer networks is most often due to queueing [^27]:

* If several different nodes simultaneously try to send packets to the same destination, the network
 switch must queue them up and feed them into the destination network link one by one (as illustrated
 in [Figure 9-2](/en/ch9#fig_distributed_switch_queueing)). On a busy network link, a packet may have to wait a while
 until it can get a slot (this is called *network congestion*). If there is so much incoming data
 that the switch queue fills up, the packet is dropped, so it needs to be resent—even though
 the network is functioning fine.
* When a packet reaches the destination machine, if all CPU cores are currently busy, the incoming
 request from the network is queued by the operating system until the application is ready to
 handle it. Depending on the load on the machine, this may take an arbitrary length of time [^28].
* In virtualized environments, a running operating system is often paused for tens of milliseconds
 while another virtual machine uses a CPU core. During this time, the VM cannot consume any data
 from the network, so the incoming data is queued (buffered) by the virtual machine monitor [^29],
 further increasing the variability of network delays.
* As mentioned earlier, in order to avoid overloading the network, TCP limits the rate at which it
 sends data. This means additional queueing at the sender before the data even enters the network.

{{< figure src="/fig/ddia_0902.png" id="fig_distributed_switch_queueing" caption="Figure 9-2. If several machines send network traffic to the same destination, its switch queue can fill up. Here, ports 1, 2, and 4 are all trying to send packets to port 3." class="w-full my-4" >}}

Moreover, when TCP detects and automatically retransmits a lost packet, although the application
does not see the packet loss directly, it does see the resulting delay (waiting for the timeout to
expire, and then waiting for the retransmitted packet to be acknowledged).

--------

<a id="sidebar_distributed_tcp_udp"></a>

> [!TIP] TCP VERSUS UDP

Some latency-sensitive applications, such as videoconferencing and Voice over IP (VoIP), use UDP
rather than TCP. It’s a trade-off between reliability and variability of delays: as UDP does not
perform flow control and does not retransmit lost packets, it avoids some of the reasons for
variable network delays (although it is still susceptible to switch queues and scheduling delays).

UDP is a good choice in situations where delayed data is worthless. For example, in a VoIP phone
call, there probably isn’t enough time to retransmit a lost packet before its data is due to be
played over the loudspeakers. In this case, there’s no point in retransmitting the packet—the
application must instead fill the missing packet’s time slot with silence (causing a brief
interruption in the sound) and move on in the stream. The retry happens at the human layer instead.
(“Could you repeat that please? The sound just cut out for a moment.”)

--------

All of these factors contribute to the variability of network delays. Queueing delays have an
especially wide range when a system is close to its maximum capacity: a system with plenty of spare
capacity can easily drain queues, whereas in a highly utilized system, long queues can build up very
quickly.

In public clouds and multitenant datacenters, resources are shared among many customers: the
network links and switches, and even each machine’s network interface and CPUs (when running on
virtual machines), are shared. Processing large amounts of data can use the entire capacity of
network links (*saturate* them). As you have no control over or insight into other customers’ usage of the shared
resources, network delays can be highly variable if someone near you (a *noisy neighbor*) is
using a lot of resources [^30] [^31].

In such environments, you can only choose timeouts experimentally: measure the distribution of
network round-trip times over an extended period, and over many machines, to determine the expected
variability of delays. Then, taking into account your application’s characteristics, you can
determine an appropriate trade-off between failure detection delay and risk of premature timeouts.

Even better, rather than using configured constant timeouts, systems can continually measure
response times and their variability (*jitter*), and automatically adjust timeouts according to the
observed response time distribution. The Phi Accrual failure detector [^32],
which is used, for example, in Akka and Cassandra [^33],
is one way of doing this. TCP retransmission timeouts also work similarly [^5].

### Synchronous Versus Asynchronous Networks {#sec_distributed_sync_networks}

Distributed systems would be a lot simpler if we could rely on the network to deliver packets with
some fixed maximum delay, and not to drop packets. Why can’t we solve this at the hardware level
and make the network reliable so that the software doesn’t need to worry about it?

To answer this question, it’s interesting to compare datacenter networks to the traditional fixed-line
telephone network (non-cellular, non-VoIP), which is extremely reliable: delayed audio
frames and dropped calls are very rare. A phone call requires a constantly low end-to-end latency
and enough bandwidth to transfer the audio samples of your voice. Wouldn’t it be nice to have
similar reliability and predictability in computer networks?

When you make a call over the telephone network, it establishes a *circuit*: a fixed, guaranteed
amount of bandwidth is allocated for the call, along the entire route between the two callers. This
circuit remains in place until the call ends [^34].
For example, an ISDN network runs at a fixed rate of 4,000 frames per second. When a call is
established, it is allocated 16 bits of space within each frame (in each direction). Thus, for the
duration of the call, each side is guaranteed to be able to send exactly 16 bits of audio data every
250 microseconds [^35].

This kind of network is *synchronous*: even as data passes through several routers, it does not
suffer from queueing, because the 16 bits of space for the call have already been reserved in the
next hop of the network. And because there is no queueing, the maximum end-to-end latency of the
network is fixed. We call this a *bounded delay*.

#### Can we not simply make network delays predictable? {#can-we-not-simply-make-network-delays-predictable}

Note that a circuit in a telephone network is very different from a TCP connection: a circuit is a
fixed amount of reserved bandwidth which nobody else can use while the circuit is established,
whereas the packets of a TCP connection opportunistically use whatever network bandwidth is
available. You can give TCP a variable-sized block of data (e.g., an email or a web page), and it
will try to transfer it in the shortest time possible. While a TCP connection is idle, it doesn’t
use any bandwidth (except perhaps for an occasional keepalive packet).

If datacenter networks and the internet were circuit-switched networks, it would be possible to
establish a guaranteed maximum round-trip time when a circuit was set up. However, they are not:
Ethernet and IP are packet-switched protocols, which suffer from queueing and thus unbounded delays
in the network. These protocols do not have the concept of a circuit.

Why do datacenter networks and the internet use packet switching? The answer is that they are
optimized for *bursty traffic*. A circuit is good for an audio or video call, which needs to
transfer a fairly constant number of bits per second for the duration of the call. On the other
hand, requesting a web page, sending an email, or transferring a file doesn’t have any particular
bandwidth requirement—we just want it to complete as quickly as possible.

If you wanted to transfer a file over a circuit, you would have to guess a bandwidth allocation. If
you guess too low, the transfer is unnecessarily slow, leaving network capacity unused. If you guess
too high, the circuit cannot be set up (because the network cannot allow a circuit to be created if
its bandwidth allocation cannot be guaranteed). Thus, using circuits for bursty data transfers
wastes network capacity and makes transfers unnecessarily slow. By contrast, TCP dynamically adapts
the rate of data transfer to the available network capacity.

There have been some attempts to build hybrid networks that support both circuit switching and
packet switching. *Asynchronous Transfer Mode* (ATM) was a competitor to Ethernet in the 1980s, but
it didn’t gain much adoption outside of telephone network core switches. InfiniBand has some similarities [^36]:
it implements end-to-end flow control at the link layer, which reduces the need for queueing in the
network, although it can still suffer from delays due to link congestion [^37].
With careful use of *quality of service* (QoS, prioritization and scheduling of packets) and *admission
control* (rate-limiting senders), it is possible to emulate circuit switching on packet networks, or
provide statistically bounded delay [^27] [^34]. New network algorithms like Low Latency, Low
Loss, and Scalable Throughput (L4S) attempt to mitigate some of the queueing and congestion control
problems both at the client and router level. Linux’s traffic controller (TC) also allows
applications to reprioritize packets for QoS purposes.

--------

<a id="sidebar_distributed_latency_utilization"></a>

> [!TIP] LATENCY AND RESOURCE UTILIZATION

More generally, you can think of variable delays as a consequence of dynamic resource partitioning.

Say you have a wire between two telephone switches that can carry up to 10,000 simultaneous calls.
Each circuit that is switched over this wire occupies one of those call slots. Thus, you can think of
the wire as a resource that can be shared by up to 10,000 simultaneous users. The resource is
divided up in a *static* way: even if you’re the only call on the wire right now, and all other
9,999 slots are unused, your circuit is still allocated the same fixed amount of bandwidth as when
the wire is fully utilized.

By contrast, the internet shares network bandwidth *dynamically*. Senders push and jostle with each
other to get their packets over the wire as quickly as possible, and the network switches decide
which packet to send (i.e., the bandwidth allocation) from one moment to the next. This approach has the
downside of queueing, but the advantage is that it maximizes utilization of the wire. The wire has a
fixed cost, so if you utilize it better, each byte you send over the wire is cheaper.

A similar situation arises with CPUs: if you share each CPU core dynamically between several
threads, one thread sometimes has to wait in the operating system’s run queue while another thread
is running, so a thread can be paused for varying lengths of time [^38].
However, this utilizes the hardware better than if you allocated a static number of CPU cycles to
each thread (see [“Response time guarantees”](/en/ch9#sec_distributed_clocks_realtime)). Better hardware utilization is also why cloud
platforms run several virtual machines from different customers on the same physical machine.

Latency guarantees are achievable in certain environments, if resources are statically partitioned
(e.g., dedicated hardware and exclusive bandwidth allocations). However, it comes at the cost of
reduced utilization—in other words, it is more expensive. On the other hand, multitenancy with
dynamic resource partitioning provides better utilization, so it is cheaper, but it has the downside of variable delays.

Variable delays in networks are not a law of nature, but simply the result of a cost/benefit trade-off.

--------

However, such quality of service is currently not enabled in multitenant datacenters and public clouds, or when communicating via the internet.
Currently deployed technology does not allow us to make any guarantees about delays or reliability
of the network: we have to assume that network congestion, queueing, and unbounded delays will
happen. Consequently, there’s no “correct” value for timeouts—they need to be determined experimentally.

Peering agreements between internet service providers and the establishment of routes through the
Border Gateway Protocol (BGP) bear closer resemblance to circuit switching than IP itself. At this
level, it is possible to buy dedicated bandwidth. However, internet routing operates at the level of
networks, not individual connections between hosts, and at a much longer timescale.


## Unreliable Clocks {#sec_distributed_clocks}

Clocks and time are important. Applications depend on clocks in various ways to answer questions
like the following:

1. Has this request timed out yet?
2. What’s the 99th percentile response time of this service?
3. How many queries per second did this service handle on average in the last five minutes?
4. How long did the user spend on our site?
5. When was this article published?
6. At what date and time should the reminder email be sent?
7. When does this cache entry expire?
8. What is the timestamp on this error message in the log file?

Examples 1–4 measure *durations* (e.g., the time interval between a request being sent and a
response being received), whereas examples 5–8 describe *points in time* (events that occur on a
particular date, at a particular time).

In a distributed system, time is a tricky business, because communication is not instantaneous: it
takes time for a message to travel across the network from one machine to another. The time when a
message is received is always later than the time when it is sent, but due to variable delays in the
network, we don’t know how much later. This fact sometimes makes it difficult to determine the order
in which things happened when multiple machines are involved.

Moreover, each machine on the network has its own clock, which is an actual hardware device: usually
a quartz crystal oscillator. These devices are not perfectly accurate, so each machine has its own
notion of time, which may be slightly faster or slower than on other machines. It is possible to
synchronize clocks to some degree: the most commonly used mechanism is the Network Time Protocol (NTP), which
allows the computer clock to be adjusted according to the time reported by a group of servers [^39].
The servers in turn get their time from a more accurate time source, such as a GPS receiver.

### Monotonic Versus Time-of-Day Clocks {#sec_distributed_monotonic_timeofday}

Modern computers have at least two different kinds of clocks: a *time-of-day clock* and a *monotonic
clock*. Although they both measure time, it is important to distinguish the two, since they serve
different purposes.

#### Time-of-day clocks {#time-of-day-clocks}

A time-of-day clock does what you intuitively expect of a clock: it returns the current date and
time according to some calendar (also known as *wall-clock time*). For example,
`clock_gettime(CLOCK_REALTIME)` on Linux and
`System.currentTimeMillis()` in Java return the number of seconds (or milliseconds) since the
*epoch*: midnight UTC on January 1, 1970, according to the Gregorian calendar, not counting leap
seconds. Some systems use other dates as their reference point.
(Although the Linux clock is called *real-time*, it has nothing to do with real-time operating
systems, as discussed in [“Response time guarantees”](/en/ch9#sec_distributed_clocks_realtime).)

Time-of-day clocks are usually synchronized with NTP, which means that a timestamp from one machine
(ideally) means the same as a timestamp on another machine. However, time-of-day clocks also have
various oddities, as described in the next section. In particular, if the local clock is too far
ahead of the NTP server, it may be forcibly reset and appear to jump back to a previous point in
time. These jumps, as well as similar jumps caused by leap seconds, make time-of-day clocks
unsuitable for measuring elapsed time [^40].

Time-of-day clocks can experience jumps due to the start and end of Daylight Saving Time (DST);
these can be avoided by always using UTC as the time zone, which does not have DST.
Time-of-day clocks have also historically had quite a coarse-grained resolution, e.g., moving forward
in steps of 10 ms on older Windows systems [^41].
On recent systems, this is less of a problem.

#### Monotonic clocks {#monotonic-clocks}

A monotonic clock is suitable for measuring a duration (time interval), such as a timeout or a
service’s response time: `clock_gettime(CLOCK_MONOTONIC)` or `clock_gettime(CLOCK_BOOTTIME)` on Linux [^42]
and `System.nanoTime()` in Java are monotonic clocks, for example. The name comes from the fact that
they are guaranteed to always move forward (whereas a time-of-day clock may jump back in time).

You can check the value of the monotonic clock at one point in time, do something, and then check
the clock again at a later time. The *difference* between the two values tells you how much time
elapsed between the two checks — more like a stopwatch than a wall clock. However, the *absolute*
value of the clock is meaningless: it might be the number of nanoseconds since the computer was
booted up, or something similarly arbitrary. In particular, it makes no sense to compare monotonic
clock values from two different computers, because they don’t mean the same thing.

On a server with multiple CPU sockets, there may be a separate timer per CPU, which is not
necessarily synchronized with other CPUs [^43].
Operating systems compensate for any discrepancy and try
to present a monotonic view of the clock to application threads, even as they are scheduled across
different CPUs. However, it is wise to take this guarantee of monotonicity with a pinch of salt [^44].

NTP may adjust the frequency at which the monotonic clock moves forward (this is known as *slewing*
the clock) if it detects that the computer’s local quartz is moving faster or slower than the NTP
server. By default, NTP allows the clock rate to be speeded up or slowed down by up to 0.05%, but
NTP cannot cause the monotonic clock to jump forward or backward. The resolution of monotonic
clocks is usually quite good: on most systems they can measure time intervals in microseconds or
less.

In a distributed system, using a monotonic clock for measuring elapsed time (e.g., timeouts) is
usually fine, because it doesn’t assume any synchronization between different nodes’ clocks and is
not sensitive to slight inaccuracies of measurement.

### Clock Synchronization and Accuracy {#sec_distributed_clock_accuracy}

Monotonic clocks don’t need synchronization, but time-of-day clocks need to be set according to an
NTP server or other external time source in order to be useful. Unfortunately, our methods for
getting a clock to tell the correct time aren’t nearly as reliable or accurate as you might
hope—hardware clocks and NTP can be fickle beasts. To give just a few examples:

* The quartz clock in a computer is not very accurate: it *drifts* (runs faster or slower than it
 should). Clock drift varies depending on the temperature of the machine. Google assumes a clock
 drift of up to 200 ppm (parts per million) for its servers [^45],
 which is equivalent to 6 ms drift for a clock that is resynchronized with a server every 30
 seconds, or 17 seconds drift for a clock that is resynchronized once a day. This drift limits the best
 possible accuracy you can achieve, even if everything is working correctly.
* If a computer’s clock differs too much from an NTP server, it may refuse to synchronize, or the
 local clock will be forcibly reset [^39]. Any applications observing the time before and after this reset may see time go backward or suddenly jump forward.
* If a node is accidentally firewalled off from NTP servers, the misconfiguration may go
 unnoticed for some time, during which the drift may add up to large discrepancies between
 different nodes’ clocks. Anecdotal evidence suggests that this does happen in practice.
* NTP synchronization can only be as good as the network delay, so there is a limit to its
 accuracy when you’re on a congested network with variable packet delays. One experiment showed
 that a minimum error of 35 ms is achievable when synchronizing over the internet [^46],
 though occasional spikes in network delay lead to errors of around a second. Depending on the
 configuration, large network delays can cause the NTP client to give up entirely.
* Some NTP servers are wrong or misconfigured, reporting time that is off by hours [^47] [^48].
 NTP clients mitigate such errors by querying several servers and ignoring outliers.
 Nevertheless, it’s somewhat worrying to bet the correctness of your systems on the time that you
 were told by a stranger on the internet.
* Leap seconds result in a minute that is 59 seconds or 61 seconds long, which messes up timing
 assumptions in systems that are not designed with leap seconds in mind [^49].
 The fact that leap seconds have crashed many large systems [^40] [^50]
 shows how easy it is for incorrect assumptions about clocks to sneak into a system. The best
 way of handling leap seconds may be to make NTP servers “lie,” by performing the leap second
 adjustment gradually over the course of a day (this is known as *smearing*) [^51] [^52],
 although actual NTP server behavior varies in practice [^53].
 Leap seconds will no longer be used from 2035 onwards, so this problem will fortunately go away.
* In virtual machines, the hardware clock is virtualized, which raises additional challenges for applications that need accurate timekeeping [^54].
 When a CPU core is shared between virtual machines, each VM is paused for tens of milliseconds
 while another VM is running. From an application’s point of view, this pause manifests itself as
 the clock suddenly jumping forward [^29].
 If a VM pauses for several seconds, the clock may then be several seconds behind the actual time,
 but NTP may continue to report that the clock is almost perfectly in sync [^55].
* If you run software on devices that you don’t fully control (e.g., mobile or embedded devices), you
 probably cannot trust the device’s hardware clock at all. Some users deliberately set their
 hardware clock to an incorrect date and time, for example to cheat in games [^56].
 As a result, the clock might be set to a time wildly in the past or the future.

It is possible to achieve very good clock accuracy if you care about it sufficiently to invest
significant resources. For example, the MiFID II European regulation for financial
institutions requires all high-frequency trading funds to synchronize their clocks to within 100
microseconds of UTC, in order to help debug market anomalies such as “flash crashes” and to help
detect market manipulation [^57].

Such accuracy can be achieved with some special hardware (GPS receivers and/or atomic clocks), the
Precision Time Protocol (PTP) and careful deployment and monitoring [^58] [^59].
Relying on GPS alone can be risky because GPS signals can easily be jammed. In some locations this
happens frequently, e.g., close to military facilities [^60].
Some cloud providers have begun offering high-accuracy clock synchronization for their virtual machines [^61].
However, clock synchronization still requires a lot of care. If your NTP daemon is misconfigured, or
a firewall is blocking NTP traffic, the clock error due to drift can quickly become large.

### Relying on Synchronized Clocks {#sec_distributed_clocks_relying}

The problem with clocks is that while they seem simple and easy to use, they have a surprising
number of pitfalls: a day may not have exactly 86,400 seconds, time-of-day clocks may move backward
in time, and the time according to one node’s clock may be quite different from another node’s clock.

Earlier in this chapter we discussed networks dropping and arbitrarily delaying packets. Even though
networks are well behaved most of the time, software must be designed on the assumption that the
network will occasionally be faulty, and the software must handle such faults gracefully. The same
is true with clocks: although they work quite well most of the time, robust software needs to be
prepared to deal with incorrect clocks.

Part of the problem is that incorrect clocks easily go unnoticed. If a machine’s CPU is defective or
its network is misconfigured, it most likely won’t work at all, so it will quickly be noticed and
fixed. On the other hand, if its quartz clock is defective or its NTP client is misconfigured, most
things will seem to work fine, even though its clock gradually drifts further and further away from
reality. If some piece of software is relying on an accurately synchronized clock, the result is
more likely to be silent and subtle data loss than a dramatic crash [^62] [^63].

Thus, if you use software that requires synchronized clocks, it is essential that you also carefully
monitor the clock offsets between all the machines. Any node whose clock drifts too far from the
others should be declared dead and removed from the cluster. Such monitoring ensures that you notice
the broken clocks before they can cause too much damage.

#### Timestamps for ordering events {#sec_distributed_lww}

Let’s consider one particular situation in which it is tempting, but dangerous, to rely on clocks:
ordering of events across multiple nodes [^64].
For example, if two clients write to a distributed database, who got there first? Which write is the
more recent one?

[Figure 9-3](/en/ch9#fig_distributed_timestamps) illustrates a dangerous use of time-of-day clocks in a database with
multi-leader replication (the example is similar to [Figure 6-8](/en/ch6#fig_replication_causality)). Client A writes
*x* = 1 on node 1; the write is replicated to node 3; client B increments *x* on node
3 (we now have *x* = 2); and finally, both writes are replicated to node 2.

{{< figure src="/fig/ddia_0903.png" id="fig_distributed_timestamps" caption="Figure 9-3. The write by client B is causally later than the write by client A, but B's write has an earlier timestamp." class="w-full my-4" >}}


In [Figure 9-3](/en/ch9#fig_distributed_timestamps), when a write is replicated to other nodes, it is tagged with a
timestamp according to the time-of-day clock on the node where the write originated. The clock
synchronization is very good in this example: the skew between node 1 and node 3 is less than
3 ms, which is probably better than you can expect in practice.

Since the increment builds upon the earlier write of *x* = 1, we might expect that the
write of *x* = 2 should have the greater timestamp of the two. Unfortunately, that is
not what happens in [Figure 9-3](/en/ch9#fig_distributed_timestamps): the write *x* = 1 has a timestamp of
42.004 seconds, but the write *x* = 2 has a timestamp of 42.003 seconds.

As discussed in [“Last write wins (discarding concurrent writes)”](/en/ch6#sec_replication_lww), one way of resolving conflicts between concurrently written
values on different nodes is *last write wins* (LWW), which means keeping the write with the
greatest timestamp for a given key and discarding all writes with older timestamps. In the example
of [Figure 9-3](/en/ch9#fig_distributed_timestamps), when node 2 receives these two events, it will incorrectly
conclude that *x* = 1 is the more recent value and drop the write *x* = 2,
so the increment is lost.

This problem can be prevented by ensuring that when a value is overwritten, the new value always has
a higher timestamp than the overwritten value, even if that timestamp is ahead of the writer’s local
clock. However, that incurs the cost of an additional read to find the greatest existing timestamp.
Some systems, including Cassandra and ScyllaDB, want to write to all replicas in a single round
trip, and therefore they simply use the client clock’s timestamp along with a last write wins
policy [^62]. This approach has some serious problems:

* Database writes can mysteriously disappear: a node with a lagging clock is unable to overwrite
 values previously written by a node with a fast clock until the clock skew between the nodes has elapsed [^63] [^65].
 This scenario can cause arbitrary amounts of data to be silently dropped without any error being
 reported to the application.
* LWW cannot distinguish between writes that occurred sequentially in quick succession (in
 [Figure 9-3](/en/ch9#fig_distributed_timestamps), client B’s increment definitely occurs *after* client A’s write)
 and writes that were truly concurrent (neither writer was aware of the other). Additional
 causality tracking mechanisms, such as version vectors, are needed in order to prevent violations
 of causality (see [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent)).
* It is possible for two nodes to independently generate writes with the same timestamp, especially
 when the clock only has millisecond resolution. An additional tiebreaker value (which can simply
 be a large random number) is required to resolve such conflicts, but this approach can also lead to
 violations of causality [^62].

Thus, even though it is tempting to resolve conflicts by keeping the most “recent” value and
discarding others, it’s important to be aware that the definition of “recent” depends on a local
time-of-day clock, which may well be incorrect. Even with tightly NTP-synchronized clocks, you could
send a packet at timestamp 100 ms (according to the sender’s clock) and have it arrive at
timestamp 99 ms (according to the recipient’s clock)—so it appears as though the packet
arrived before it was sent, which is impossible.

Could NTP synchronization be made accurate enough that such incorrect orderings cannot occur?
Probably not, because NTP’s synchronization accuracy is itself limited by the network round-trip
time, in addition to other sources of error such as quartz drift. To guarantee a correct ordering,
you would need the clock error to be significantly lower than the network delay, which is not possible.

So-called *logical clocks* [^66], which are based on incrementing counters rather than an oscillating quartz crystal, are a safer
alternative for ordering events (see [“Detecting Concurrent Writes”](/en/ch6#sec_replication_concurrent)). Logical clocks do not measure
the time of day or the number of seconds elapsed, only the relative ordering of events (whether one
event happened before or after another). In contrast, time-of-day and monotonic clocks, which
measure actual elapsed time, are also known as *physical clocks*. We’ll look at logical clocks in
more detail in [“ID Generators and Logical Clocks”](/en/ch10#sec_consistency_logical).

#### Clock readings with a confidence interval {#clock-readings-with-a-confidence-interval}

You may be able to read a machine’s time-of-day clock with microsecond or even nanosecond
resolution. But even if you can get such a fine-grained measurement, that doesn’t mean the value is
actually accurate to such precision. In fact, it most likely is not—as mentioned previously, the
drift in an imprecise quartz clock can easily be several milliseconds, even if you synchronize with
an NTP server on the local network every minute. With an NTP server on the public internet, the best
possible accuracy is probably to the tens of milliseconds, and the error may easily spike to over
100 ms when there is network congestion.

Thus, it doesn’t make sense to think of a clock reading as a point in time—it is more like a
range of times, within a confidence interval: for example, a system may be 95% confident that the
time now is between 10.3 and 10.5 seconds past the minute, but it doesn’t know any more precisely than that [^67].
If we only know the time ±100 ms, the microsecond digits in the timestamp are essentially meaningless.

The uncertainty bound can be calculated based on your time source. If you have a GPS receiver or
atomic clock directly attached to your computer, the expected error range is determined by
the device and, in the case of GPS, by the quality of the signal from the satellites. If you’re
getting the time from a server, the uncertainty is based on the expected quartz drift since your
last sync with the server, plus the NTP server’s uncertainty, plus the network round-trip time to
the server (to a first approximation, and assuming you trust the server).

Unfortunately, most systems don’t expose this uncertainty: for example, when you call
`clock_gettime()`, the return value doesn’t tell you the expected error of the timestamp, so you
don’t know if its confidence interval is five milliseconds or five years.

There are exceptions: the *TrueTime* API in Google’s Spanner [^45] and Amazon’s ClockBound explicitly report the
confidence interval on the local clock. When you ask either of them for the current time, you get back two
values: `[earliest, latest]`, which are the *earliest possible* and the *latest possible*
timestamp. Based on its uncertainty calculations, the clock knows that the actual current time is
somewhere within that interval. The width of the interval depends, among other things, on how long
it has been since the local quartz clock was last synchronized with a more accurate clock source.

#### Synchronized clocks for global snapshots {#sec_distributed_spanner}

In [“Snapshot Isolation and Repeatable Read”](/en/ch8#sec_transactions_snapshot_isolation) we discussed *multi-version concurrency control* (MVCC),
which is a very useful feature in databases that need to support both small, fast read-write
transactions and large, long-running read-only transactions (e.g., for backups or analytics). It
allows read-only transactions to see a *snapshot* of the database, a consistent state at a
particular point in time, without locking and interfering with read-write transactions.

Generally, MVCC requires a monotonically increasing transaction ID. If a write happened later than
the snapshot (i.e., the write has a greater transaction ID than the snapshot), that write is
invisible to the snapshot transaction. On a single-node database, a simple counter is sufficient for
generating transaction IDs.

However, when a database is distributed across many machines, potentially in multiple datacenters, a
global, monotonically increasing transaction ID (across all shards) is difficult to generate,
because it requires coordination. The transaction ID must reflect causality: if transaction B reads
or overwrites a value that was previously written by transaction A, then B must have a higher
transaction ID than A—otherwise, the snapshot would not be consistent. With lots of small, rapid
transactions, creating transaction IDs in a distributed system becomes an untenable
bottleneck. (We will discuss such ID generators in [“ID Generators and Logical Clocks”](/en/ch10#sec_consistency_logical).)

Can we use the timestamps from synchronized time-of-day clocks as transaction IDs? If we could get
the synchronization good enough, they would have the right properties: later transactions have a
higher timestamp. The problem, of course, is the uncertainty about clock accuracy.

Spanner implements snapshot isolation across datacenters in this way [^68] [^69].
It uses the clock’s confidence interval as reported by the TrueTime API, and is based on the
following observation: if you have two confidence intervals, each consisting of an earliest and
latest possible timestamp (*A* = [*A*<sub>earliest</sub>, *A*<sub>latest</sub>] and *B* = [*B*<sub>earliest</sub>, *B*<sub>latest</sub>]), and those two intervals do not overlap
(i.e., *A*<sub>earliest</sub> < *A*<sub>latest</sub> < *B*<sub>earliest</sub> < *B*<sub>latest</sub>), then B definitely happened after A—there
can be no doubt. Only if the intervals overlap are we unsure in which order A and B happened.

In order to ensure that transaction timestamps reflect causality, Spanner deliberately waits for the
length of the confidence interval before committing a read-write transaction. By doing so, it
ensures that any transaction that may read the data is at a sufficiently later time, so their
confidence intervals do not overlap. In order to keep the wait time as short as possible, Spanner
needs to keep the clock uncertainty as small as possible; for this purpose, Google deploys a GPS
receiver or atomic clock in each datacenter, allowing clocks to be synchronized to within about 7 ms [^45].

The atomic clocks and GPS receivers are not strictly necessary in Spanner: the important thing is to
have a confidence interval, and the accurate clock sources only help keep that interval small. Other
systems are beginning to adopt similar approaches: for example, YugabyteDB can leverage ClockBound
when running on AWS [^70], and several other systems now also rely on clock synchronization to various degrees [^71] [^72].

### Process Pauses {#sec_distributed_clocks_pauses}

Let’s consider another example of dangerous clock use in a distributed system. Say you have a
database with a single leader per shard. Only the leader is allowed to accept writes. How does a
node know that it is still leader (that it hasn’t been declared dead by the others), and that it may
safely accept writes?

One option is for the leader to obtain a *lease* from the other nodes, which is similar to a lock with a timeout [^73].
Only one node can hold the lease at any one time—thus, when a node obtains a lease, it knows that
it is the leader for some amount of time, until the lease expires. In order to remain leader, the
node must periodically renew the lease before it expires. If the node fails, it stops renewing the
lease, so another node can take over when it expires.

You can imagine the request-handling loop looking something like this:

```js
while (true) {
    request = getIncomingRequest();

    // Ensure that the lease always has at least 10 seconds remaining
    if (lease.expiryTimeMillis - System.currentTimeMillis() < 10000) {
        lease = lease.renew();
    }

    if (lease.isValid()) {
        process(request);
    }
}
```

What’s wrong with this code? Firstly, it’s relying on synchronized clocks: the expiry time on the
lease is set by a different machine (where the expiry may be calculated as the current time plus 30
seconds, for example), and it’s being compared to the local system clock. If the clocks are out of
sync by more than a few seconds, this code will start doing strange things.

Secondly, even if we change the protocol to only use the local monotonic clock, there is another
problem: the code assumes that very little time passes between the point that it checks the time
(`System.currentTimeMillis()`) and the time when the request is processed (`process(request)`).
Normally this code runs very quickly, so the 10 second buffer is more than enough to ensure that the
lease doesn’t expire in the middle of processing a request.

However, what if there is an unexpected pause in the execution of the program? For example, imagine
the thread stops for 15 seconds around the line `lease.isValid()` before finally continuing. In
that case, it’s likely that the lease will have expired by the time the request is processed, and
another node has already taken over as leader. However, there is nothing to tell this thread that it
was paused for so long, so this code won’t notice that the lease has expired until the next
iteration of the loop—by which time it may have already done something unsafe by processing the
request.

Is it reasonable to assume that a thread might be paused for so long? Unfortunately yes. There are
various reasons why this could happen:

* Contention among threads accessing a shared resource, such as a lock or queue, can cause threads
 to spend a lot of their time waiting. Moving to a machine with more CPU cores can make such
 problems worse, and contention problems can be difficult to diagnose [^74].
* Many programming language runtimes (such as the Java Virtual Machine) have a *garbage collector*
 (GC) that occasionally needs to stop all running threads. In the past, such *“stop-the-world” GC
 pauses* would sometimes last for several minutes [^75]!
 With modern GC algorithms this is less of a problem, but GC pauses can still be noticeable (see
 [“Limiting the impact of garbage collection”](/en/ch9#sec_distributed_gc_impact)).
* In virtualized environments, a virtual machine can be *suspended* (pausing the execution of all
 processes and saving the contents of memory to disk) and *resumed* (restoring the contents of
 memory and continuing execution). This pause can occur at any time in a process’s execution and can
 last for an arbitrary length of time. This feature is sometimes used for *live migration* of
 virtual machines from one host to another without a reboot, in which case the length of the pause
 depends on the rate at which processes are writing to memory [^76].
* On end-user devices such as laptops and phones, execution may also be suspended and resumed
 arbitrarily, e.g., when the user closes the lid of their laptop.
* When the operating system context-switches to another thread, or when the hypervisor switches to a
 different virtual machine (when running in a virtual machine), the currently running thread can be
 paused at any arbitrary point in the code. In the case of a virtual machine, the CPU time spent in
 other virtual machines is known as *steal time*. If the machine is under heavy load—i.e., if
 there is a long queue of threads waiting to run—it may take some time before the paused thread
 gets to run again.
* If the application performs synchronous disk access, a thread may be paused waiting for a slow
 disk I/O operation to complete [^77]. In many languages, disk access can happen
 surprisingly, even if the code doesn’t explicitly mention file access—for example, the Java
 classloader lazily loads class files when they are first used, which could happen at any time in
 the program execution. I/O pauses and GC pauses may even conspire to combine their delays [^78].
 If the disk is actually a network filesystem or network block device (such as Amazon’s EBS), the
 I/O latency is further subject to the variability of network delays [^31].
* If the operating system is configured to allow *swapping to disk* (*paging*), a simple memory
 access may result in a page fault that requires a page from disk to be loaded into memory. The
 thread is paused while this slow I/O operation takes place. If memory pressure is high, this may
 in turn require a different page to be swapped out to disk. In extreme circumstances, the
 operating system may spend most of its time swapping pages in and out of memory and getting little
 actual work done (this is known as *thrashing*). To avoid this problem, paging is often disabled
 on server machines (if you would rather kill a process to free up memory than risk thrashing).
* A Unix process can be paused by sending it the `SIGSTOP` signal, for example by pressing Ctrl-Z in
 a shell. This signal immediately stops the process from getting any more CPU cycles until it is
 resumed with `SIGCONT`, at which point it continues running where it left off. Even if your
 environment does not normally use `SIGSTOP`, it might be sent accidentally by an operations
 engineer.

All of these occurrences can *preempt* the running thread at any point and resume it at some later time,
without the thread even noticing. The problem is similar to making multi-threaded code on a single
machine thread-safe: you can’t assume anything about timing, because arbitrary context switches and
parallelism may occur.

When writing multi-threaded code on a single machine, we have fairly good tools for making it
thread-safe: mutexes, semaphores, atomic counters, lock-free data structures, blocking queues, and
so on. Unfortunately, these tools don’t directly translate to distributed systems, because a
distributed system has no shared memory—only messages sent over an unreliable network.

A node in a distributed system must assume that its execution can be paused for a significant length
of time at any point, even in the middle of a function. During the pause, the rest of the world
keeps moving and may even declare the paused node dead because it’s not responding. Eventually,
the paused node may continue running, without even noticing that it was asleep until it checks its
clock sometime later.

#### Response time guarantees {#sec_distributed_clocks_realtime}

In many programming languages and operating systems, threads and processes may pause for an
unbounded amount of time, as discussed. Those reasons for pausing *can* be eliminated if you try
hard enough.

Some software runs in environments where a failure to respond within a specified time can cause
serious damage: computers that control aircraft, rockets, robots, cars, and other physical objects
must respond quickly and predictably to their sensor inputs. In these systems, there is a specified
*deadline* by which the software must respond; if it doesn’t meet the deadline, that may cause a
failure of the entire system. These are so-called *hard real-time* systems.

--------

> [!NOTE]
> In embedded systems, *real-time* means that a system is carefully designed and tested to meet
> specified timing guarantees in all circumstances. This meaning is in contrast to the more vague use of the
> term *real-time* on the web, where it describes servers pushing data to clients and stream
> processing without hard response time constraints (see [Chapter 12](/en/ch12#ch_stream)).

--------

For example, if your car’s onboard sensors detect that you are currently experiencing a crash, you
wouldn’t want the release of the airbag to be delayed due to an inopportune GC pause in the airbag
release system.

Providing real-time guarantees in a system requires support from all levels of the software stack: a
*real-time operating system* (RTOS) that allows processes to be scheduled with a guaranteed
allocation of CPU time in specified intervals is needed; library functions must document their
worst-case execution times; dynamic memory allocation may be restricted or disallowed entirely
(real-time garbage collectors exist, but the application must still ensure that it doesn’t give the
GC too much work to do); and an enormous amount of testing and measurement must be done to ensure
that guarantees are being met.

All of this requires a large amount of additional work and severely restricts the range of
programming languages, libraries, and tools that can be used (since most languages and tools do not
provide real-time guarantees). For these reasons, developing real-time systems is very expensive,
and they are most commonly used in safety-critical embedded devices. Moreover, “real-time” is not the
same as “high-performance”—in fact, real-time systems may have lower throughput, since they have to
prioritize timely responses above all else (see also [“Latency and Resource Utilization”](/en/ch9#sidebar_distributed_latency_utilization)).

For most server-side data processing systems, real-time guarantees are simply not economical or
appropriate. Consequently, these systems must suffer the pauses and clock instability that come from
operating in a non-real-time environment.

#### Limiting the impact of garbage collection {#sec_distributed_gc_impact}

Garbage collection used to be one of the biggest reasons for process pauses [^79],
but fortunately GC algorithms have improved a lot: a properly tuned collector will now usually pause
for no more than a few milliseconds. The Java runtime offers collectors such as concurrent mark
sweep (CMS), garbage-first (G1), the Z garbage collector (ZGC), Epsilon, and Shenandoah. Each of
these is optimized for different memory profiles such as high-frequency object creation, large
heaps, and so on. By contrast, Go offers a simpler concurrent mark sweep garbage collector that
attempts to optimize itself.

If you need to avoid GC pauses entirely, one option is to use a language that doesn’t have a garbage
collector at all. For example, Swift uses automatic reference counting to determine when memory can
be freed; Rust and Mojo track lifetimes of objects using the type system so the compiler can
determine how long memory must be allocated for.

It’s also possible to use a garbage-collected language while mitigating the impact of pauses.
One approach is to treat GC pauses like brief planned outages of a node, and to let other nodes
handle requests from clients while one node is collecting its garbage. If the runtime can warn the
application that a node soon requires a GC pause, the application can stop sending new requests to
that node, wait for it to finish processing outstanding requests, and then perform the GC while no
requests are in progress. This trick hides GC pauses from clients and reduces the high percentiles
of the response time [^80] [^81].

A variant of this idea is to use the garbage collector only for short-lived objects (which are fast
to collect) and to restart processes periodically, before they accumulate enough long-lived objects
to require a full GC of long-lived objects [^79] [^82].
One node can be restarted at a time, and traffic can be shifted away from the node before the
planned restart, like in a rolling upgrade (see [Chapter 5](/en/ch5#ch_encoding)).

These measures cannot fully prevent garbage collection pauses, but they can usefully reduce their
impact on the application.


## Knowledge, Truth, and Lies {#sec_distributed_truth}

So far in this chapter we have explored the ways in which distributed systems are different from
programs running on a single computer: there is no shared memory, only message passing via an
unreliable network with variable delays, and the systems may suffer from partial failures, unreliable clocks,
and processing pauses.

The consequences of these issues are profoundly disorienting if you’re not used to distributed
systems. A node in the network cannot *know* anything for sure about other nodes—it can only make
guesses based on the messages it receives (or doesn’t receive). A node can only find out what state
another node is in (what data it has stored, whether it is correctly functioning, etc.) by
exchanging messages with it. If a remote node doesn’t respond, there is no way of knowing what state
it is in, because problems in the network cannot reliably be distinguished from problems at a node.

Discussions of these systems border on the philosophical: What do we know to be true or false in our
system? How sure can we be of that knowledge, if the mechanisms for perception and measurement are unreliable [^83]?
Should software systems obey the laws that we expect of the physical world, such as cause and effect?

Fortunately, we don’t need to go as far as figuring out the meaning of life. In a distributed
system, we can state the assumptions we are making about the behavior (the *system model*) and
design the actual system in such a way that it meets those assumptions. Algorithms can be proved to
function correctly within a certain system model. This means that reliable behavior is achievable,
even if the underlying system model provides very few guarantees.

However, although it is possible to make software well behaved in an unreliable system model, it
is not straightforward to do so. In the rest of this chapter we will further explore the notions of
knowledge and truth in distributed systems, which will help us think about the kinds of assumptions
we can make and the guarantees we may want to provide. In [Chapter 10](/en/ch10#ch_consistency) we will proceed to
look at some examples of distributed algorithms that provide particular guarantees under particular
assumptions.

### The Majority Rules {#sec_distributed_majority}

Imagine a network with an asymmetric fault: a node is able to receive all messages sent to it, but
any outgoing messages from that node are dropped or delayed [^22]. Even though that node is working
perfectly well, and is receiving requests from other nodes, the other nodes cannot hear its
responses. After some timeout, the other nodes declare it dead, because they haven’t heard from the
node. The situation unfolds like a nightmare: the semi-disconnected node is dragged to the
graveyard, kicking and screaming “I’m not dead!”—but since nobody can hear its screaming, the
funeral procession continues with stoic determination.

In a slightly less nightmarish scenario, the semi-disconnected node may notice that the messages it
is sending are not being acknowledged by other nodes, and so realize that there must be a fault
in the network. Nevertheless, the node is wrongly declared dead by the other nodes, and the
semi-disconnected node cannot do anything about it.

As a third scenario, imagine a node that pauses execution for one minute. During that time, no
requests are processed and no responses are sent. The other nodes wait, retry, grow impatient, and
eventually declare the node dead and load it onto the hearse. Finally, the pause finishes and the
node’s threads continue as if nothing had happened. The other nodes are surprised as the supposedly
dead node suddenly raises its head out of the coffin, in full health, and starts cheerfully chatting
with bystanders. At first, the paused node doesn’t even realize that an entire minute has passed and
that it was declared dead—from its perspective, hardly any time has passed since it was last talking
to the other nodes.

The moral of these stories is that a node cannot necessarily trust its own judgment of a situation.
A distributed system cannot exclusively rely on a single node, because a node may fail at any time,
potentially leaving the system stuck and unable to recover. Instead, many distributed algorithms
rely on a *quorum*, that is, voting among the nodes (see [“Quorums for reading and writing”](/en/ch6#sec_replication_quorum_condition)):
decisions require some minimum number of votes from several nodes in order to reduce the dependence
on any one particular node.

That includes decisions about declaring nodes dead. If a quorum of nodes declares another node
dead, then it must be considered dead, even if that node still very much feels alive. The individual
node must abide by the quorum decision and step down.

Most commonly, the quorum is an absolute majority of more than half the nodes (although other kinds
of quorums are possible). A majority quorum allows the system to continue working if a minority of nodes
are faulty (with three nodes, one faulty node can be tolerated; with five nodes, two faulty nodes can be
tolerated). However, it is still safe, because there can only be one majority in the
system—there cannot be two majorities with conflicting decisions at the same time. We will discuss
the use of quorums in more detail when we get to *consensus algorithms* in [Chapter 10](/en/ch10#ch_consistency).

### Distributed Locks and Leases {#sec_distributed_lock_fencing}

Locks and leases in distributed applications are prone to being misused, and are a common source of bugs [^84].
Let’s look at one particular case of how they can go wrong.

In [“Process Pauses”](/en/ch9#sec_distributed_clocks_pauses) we saw that a lease is a kind of lock that times out and can be
assigned to a new owner if the old owner stops responding (perhaps because it crashed, it paused for
too long, or it was disconnected from the network). You can use leases in situations where a system
requires there to be only one of some thing. For example:

* Only one node is allowed to be the leader for a database shard, to avoid split brain (see
 [“Handling Node Outages”](/en/ch6#sec_replication_failover)).
* Only one transaction or client is allowed to update a particular resource or object, to prevent
 it being corrupted by concurrent writes.
* Only one node should process a given input file to a big processing job, to avoid wasted effort
 due to multiple nodes redundantly doing the same work.

It is worth thinking carefully about what happens if several nodes simultaneously believe that they
hold the lease, perhaps due to a process pause. In the third example, the consequence is only some
wasted computational resources, which is not a big deal. But in the first two cases, the consequence
could be lost or corrupted data, which is much more serious.

For example, [Figure 9-4](/en/ch9#fig_distributed_lease_pause) shows a data corruption bug due to an incorrect
implementation of locking. (The bug is not theoretical: HBase used to have this problem [^85] [^86].)
Say you want to ensure that a file in a storage service can only be
accessed by one client at a time, because if multiple clients tried to write to it, the file would
become corrupted. You try to implement this by requiring a client to obtain a lease from a lock
service before accessing the file. Such a lock service is often implemented using a consensus
algorithm; we will discuss this further in [Chapter 10](/en/ch10#ch_consistency).

{{< figure src="/fig/ddia_0904.png" id="fig_distributed_lease_pause" caption="Figure 9-4. Incorrect implementation of a distributed lock: client 1 believes that it still has a valid lease, even though it has expired, and thus corrupts a file in storage." class="w-full my-4" >}}


The problem is an example of what we discussed in [“Process Pauses”](/en/ch9#sec_distributed_clocks_pauses): if the client
holding the lease is paused for too long, its lease expires. Another client can obtain a lease for
the same file, and start writing to the file. When the paused client comes back, it believes
(incorrectly) that it still has a valid lease and proceeds to also write to the file. We now have a
split brain situation: the clients’ writes clash and corrupt the file.

[Figure 9-5](/en/ch9#fig_distributed_lease_delay) shows a different problem that has similar consequences. In this
example there is no process pause, only a crash by client 1. Just before client 1 crashes it sends a
write request to the storage service, but this request is delayed for a long time in the network.
(Remember from [“Network Faults in Practice”](/en/ch9#sec_distributed_network_faults) that packets can sometimes be delayed by a minute
or more.) By the time the write request arrives at the storage service, the lease has already timed
out, allowing client 2 to acquire it and issue a write of its own. The result is corruption similar
to [Figure 9-4](/en/ch9#fig_distributed_lease_pause).

{{< figure src="/fig/ddia_0905.png" id="fig_distributed_lease_delay" caption="Figure 9-5. A message from a former leaseholder might be delayed for a long time, and arrive after another node has taken over the lease." class="w-full my-4" >}}


#### Fencing off zombies and delayed requests {#sec_distributed_fencing_tokens}

The term *zombie* is sometimes used to describe a former leaseholder who has not yet found out that
it lost the lease, and who is still acting as if it was the current leaseholder. Since we cannot
rule out zombies entirely, we have to instead ensure that they can’t do any damage in the form of
split brain. This is called *fencing off* the zombie.

Some systems attempt to fence off zombies by shutting them down, for example by disconnecting them
from the network [^9], shutting down the VM via
the cloud provider’s management interface, or even physically powering down the machine [^87].
This approach is known as *Shoot The Other Node In The Head* or STONITH. Unfortunately, it suffers
from some problems: it does not protect against large network delays like in
[Figure 9-5](/en/ch9#fig_distributed_lease_delay); it can happen that all of the nodes shut each other down [^19]; and by the time the zombie has been
detected and shut down, it may already be too late and data may already have been corrupted.

A more robust fencing solution, which protects against both zombies and delayed requests, is
illustrated in [Figure 9-6](/en/ch9#fig_distributed_fencing).

{{< figure src="/fig/ddia_0906.png" id="fig_distributed_fencing" caption="Figure 9-6. Making access to storage safe by allowing writes only in the order of increasing fencing tokens." class="w-full my-4" >}}


Let’s assume that every time the lock service grants a lock or lease, it also returns a *fencing
token*, which is a number that increases every time a lock is granted (e.g., incremented by the lock
service). We can then require that every time a client sends a write request to the storage service,
it must include its current fencing token.

--------

> [!NOTE]
> There are several alternative names for fencing tokens. In Chubby, Google’s lock service, they are
> called *sequencers* [^88], and in Kafka they are called *epoch numbers*.
> In consensus algorithms, which we will discuss in [Chapter 10](/en/ch10#ch_consistency), the *ballot number* (Paxos) or
> *term number* (Raft) serves a similar purpose.

--------

In [Figure 9-6](/en/ch9#fig_distributed_fencing), client 1 acquires the lease with a token of 33, but then
it goes into a long pause and the lease expires. Client 2 acquires the lease with a token of 34 (the
number always increases) and then sends its write request to the storage service, including the
token of 34. Later, client 1 comes back to life and sends its write to the storage service,
including its token value 33. However, the storage service remembers that it has already processed a
write with a higher token number (34), and so it rejects the request with token 33. A client that
has just acquired the lease must immediately make a write to the storage service, and once that
write has completed, any zombies are fenced off.

If ZooKeeper is your lock service, you can use the transaction ID `zxid` or the node version
`cversion` as a fencing token [^85].
With etcd, the revision number along with the lease ID serves a similar purpose [^89].
The FencedLock API in Hazelcast explicitly generates a fencing token [^90].

This mechanism requires that the storage service has some way of checking whether a write is based
on an outdated token. Alternatively, it’s sufficient for the service to support a write that
succeeds only if the object has not been written by another client since the current client last
read it, similarly to an atomic compare-and-set (CAS) operation. For example, object storage
services support such a check: Amazon S3 calls it *conditional writes*, Azure Blob Storage calls it
*conditional headers*, and Google Cloud Storage calls it *request preconditions*.

#### Fencing with multiple replicas {#fencing-with-multiple-replicas}

If your clients need to write only to one storage service that supports such conditional writes, the
lock service is somewhat redundant [^91] [^92], since the lease assignment could have been implemented directly based on that storage service [^93].
However, once you have a fencing token you can also use it with multiple services or replicas, and
ensure that the old leaseholder is fenced off on all of those services.

For example, imagine the storage service is a leaderless replicated key-value store with
last-write-wins conflict resolution (see [“Leaderless Replication”](/en/ch6#sec_replication_leaderless)). In such a system, the
client sends writes directly to each replica, and each replica independently decides whether to
accept a write based on a timestamp assigned by the client.

As illustrated in [Figure 9-7](/en/ch9#fig_distributed_fencing_leaderless), you can put the writer’s fencing token in
the most significant bits or digits of the timestamp. You can then be sure that any timestamp
generated by the new leaseholder will be greater than any timestamp from the old leaseholder, even
if the old leaseholder’s writes happened later.

{{< figure src="/fig/ddia_0907.png" id="fig_distributed_fencing_leaderless" caption="Figure 9-7. Using fencing tokens to protect writes to a leaderless replicated database." class="w-full my-4" >}}


In [Figure 9-7](/en/ch9#fig_distributed_fencing_leaderless), Client 2 has a fencing token of 34, so all of its
timestamps starting with 34…​ are greater than any timestamps starting with 33…​ that are
generated by Client 1. Client 2 writes to a quorum of replicas but it can’t reach Replica 3. This
means that when the zombie Client 1 later tries to write, its write may succeed at Replica 3 even
though it is ignored by replicas 1 and 2. This is not a problem, since a subsequent quorum read will
prefer the write from Client 2 with the greater timestamp, and read repair or anti-entropy will
eventually overwrite the value written by Client 1.

As you can see from these examples, it is not safe to assume that there is only one node holding a
lease at any one time. Fortunately, with a bit of care you can use fencing tokens to prevent zombies
and delayed requests from doing any damage.

### Byzantine Faults {#sec_distributed_byzantine}

Fencing tokens can detect and block a node that is *inadvertently* acting in error (e.g., because it
hasn’t yet found out that its lease has expired). However, if the node deliberately wanted to
subvert the system’s guarantees, it could easily do so by sending messages with a fake fencing
token.

In this book we assume that nodes are unreliable but honest: they may be slow or never respond (due
to a fault), and their state may be outdated (due to a GC pause or network delays), but we assume
that if a node *does* respond, it is telling the “truth”: to the best of its knowledge, it is
playing by the rules of the protocol.

Distributed systems problems become much harder if there is a risk that nodes may “lie” (send
arbitrary faulty or corrupted responses)—for example, a node might cast multiple contradictory votes in
the same election. Such behavior is known as a *Byzantine fault*, and the problem of reaching
consensus in this untrusting environment is known as the *Byzantine Generals Problem* [^94].

> [!TIP] THE BYZANTINE GENERALS PROBLEM
>
> The Byzantine Generals Problem is a generalization of the so-called *Two Generals Problem* [^95],
> which imagines a situation in which two army generals need to agree on a battle plan. As they
> have set up camp on two different sites, they can only communicate by messenger, and the messengers
> sometimes get delayed or lost (like packets in a network). We will discuss this problem of
> *consensus* in [Chapter 10](/en/ch10#ch_consistency).
>
> In the Byzantine version of the problem, there are *n* generals who need to agree, and their
> endeavor is hampered by the fact that there are some traitors in their midst. Most of the generals
> are loyal, and thus send truthful messages, but the traitors may try to deceive and confuse the
> others by sending fake or untrue messages. It is not known in advance who the traitors are.
>
> Byzantium was an ancient Greek city that later became Constantinople, in the place which is now
> Istanbul in Turkey. There isn’t any historic evidence that the generals of Byzantium were any more
> prone to intrigue and conspiracy than those elsewhere. Rather, the name is derived from *Byzantine*
> in the sense of *excessively complicated, bureaucratic, devious*, which was used in politics long
> before computers [^96].
> Lamport wanted to choose a nationality that would not offend any readers, and he was advised that
> calling it *The Albanian Generals Problem* was not such a good idea [^97].

--------

A system is *Byzantine fault-tolerant* if it continues to operate correctly even if some of the
nodes are malfunctioning and not obeying the protocol, or if malicious attackers are interfering
with the network. This concern is relevant in certain specific circumstances. For example:

* In aerospace environments, the data in a computer’s memory or CPU register could become corrupted
 by radiation, leading it to respond to other nodes in arbitrarily unpredictable ways. Since a
 system failure would be very expensive (e.g., an aircraft crashing and killing everyone on board,
 or a rocket colliding with the International Space Station), flight control systems must tolerate
 Byzantine faults [^98] [^99].
* In a system with multiple participating parties, some participants may attempt to cheat or
 defraud others. In such circumstances, it is not safe for a node to simply trust another node’s
 messages, since they may be sent with malicious intent. For example, cryptocurrencies like
 Bitcoin and other blockchains can be considered to be a way of getting mutually untrusting parties
 to agree whether a transaction happened or not, without relying on a central authority [^100].

However, in the kinds of systems we discuss in this book, we can usually safely assume that there
are no Byzantine faults. In a datacenter, all the nodes are controlled by your organization (so
they can hopefully be trusted) and radiation levels are low enough that memory corruption is not a
major problem (although datacenters in orbit are being considered [^101]).
Multitenant systems have mutually untrusting tenants, but they are isolated from each
other using firewalls, virtualization, and access control policies, not using Byzantine fault
tolerance. Protocols for making systems Byzantine fault-tolerant are quite expensive [^102],
and fault-tolerant embedded systems rely on support from the hardware level [^98]. In most server-side data systems, the
cost of deploying Byzantine fault-tolerant solutions makes them impracticable.

Web applications do need to expect arbitrary and malicious behavior of clients that are under
end-user control, such as web browsers. This is why input validation, sanitization, and output
escaping are so important: to prevent SQL injection and cross-site scripting, for example. However,
we typically don’t use Byzantine fault-tolerant protocols here, but simply make the server the
authority on deciding what client behavior is and isn’t allowed. In peer-to-peer networks, where
there is no such central authority, Byzantine fault tolerance is more relevant [^103] [^104].

A bug in the software could be regarded as a Byzantine fault, but if you deploy the same software to
all nodes, then a Byzantine fault-tolerant algorithm cannot save you. Most Byzantine fault-tolerant
algorithms require a supermajority of more than two-thirds of the nodes to be functioning correctly
(for example, if you have four nodes, at most one may malfunction). To use this approach against bugs, you
would have to have four independent implementations of the same software and hope that a bug only
appears in one of the four implementations.

Similarly, it would be appealing if a protocol could protect us from vulnerabilities, security
compromises, and malicious attacks. Unfortunately, this is not realistic either: in most systems, if
an attacker can compromise one node, they can probably compromise all of them, because they are
probably running the same software. Thus, traditional mechanisms (authentication, access control,
encryption, firewalls, and so on) continue to be the main protection against attackers.

<a id="sec_distributed_weak_lying"></a>

#### Weak forms of lying {#weak-forms-of-lying}

Although we assume that nodes are generally honest, it can be worth adding mechanisms to software
that guard against weak forms of “lying”—for example, invalid messages due to hardware issues,
software bugs, and misconfiguration. Such protection mechanisms are not full-blown Byzantine fault
tolerance, as they would not withstand a determined adversary, but they are nevertheless simple and
pragmatic steps toward better reliability. For example:

* Network packets do sometimes get corrupted due to hardware issues or bugs in operating systems,
 drivers, routers, etc. Usually, corrupted packets are caught by the checksums built into TCP and
 UDP, but sometimes they evade detection [^105] [^106] [^107].
 Simple measures are usually sufficient protection against such corruption, such as checksums in
 the application-level protocol. TLS-encrypted connections also offer protection against corruption.
* A publicly accessible application must carefully sanitize any inputs from users, for example
 checking that a value is within a reasonable range and limiting the size of strings to prevent
 denial of service through large memory allocations. An internal service behind a firewall may be
 able to get away with less strict checks on inputs, but basic checks in protocol parsers are still a good idea [^105].
* NTP clients can be configured with multiple server addresses. When synchronizing, the client
 contacts all of them, estimates their errors, and checks that a majority of servers agree on some
 time range. As long as most of the servers are okay, a misconfigured NTP server that is reporting an
 incorrect time is detected as an outlier and is excluded from synchronization [^39]. The use of multiple servers makes NTP
 more robust than if it only uses a single server.

### System Model and Reality {#sec_distributed_system_model}

Many algorithms have been designed to solve distributed systems problems—for example, we will
examine solutions for the consensus problem in [Chapter 10](/en/ch10#ch_consistency). In order to be useful, these
algorithms need to tolerate the various faults of distributed systems that we discussed in this
chapter.

Algorithms need to be written in a way that does not depend too heavily on the details of the
hardware and software configuration on which they are run. This in turn requires that we somehow
formalize the kinds of faults that we expect to happen in a system. We do this by defining a *system
model*, which is an abstraction that describes what things an algorithm may assume.

With regard to timing assumptions, three system models are in common use:

Synchronous model
: The synchronous model assumes bounded network delay, bounded process pauses, and bounded clock
 error. This does not imply exactly synchronized clocks or zero network delay; it just means you
 know that network delay, pauses, and clock drift will never exceed some fixed upper bound [^108].
 The synchronous model is not a realistic model of most practical
 systems, because (as discussed in this chapter) unbounded delays and pauses do occur.

Partially synchronous model
: Partial synchrony means that a system behaves like a synchronous system *most of the time*, but it
 sometimes exceeds the bounds for network delay, process pauses, and clock drift [^108]. This is a realistic model of many
 systems: most of the time, networks and processes are quite well behaved—otherwise we would never
 be able to get anything done—but we have to reckon with the fact that any timing assumptions
 may be shattered occasionally. When this happens, network delay, pauses, and clock error may become
 arbitrarily large.

Asynchronous model
: In this model, an algorithm is not allowed to make any timing assumptions—in fact, it does not
 even have a clock (so it cannot use timeouts). Some algorithms can be designed for the
 asynchronous model, but it is very restrictive.

Moreover, besides timing issues, we have to consider node failures. Some common system models for
nodes are:

Crash-stop faults
: In the *crash-stop* (or *fail-stop*) model, an algorithm may assume that a node can fail in only
 one way, namely by crashing [^109].
 This means that the node may suddenly stop responding at any moment, and thereafter that node is
 gone forever—it never comes back.

Crash-recovery faults
: We assume that nodes may crash at any moment, and perhaps start responding again after some
 unknown time. In the crash-recovery model, nodes are assumed to have stable storage (i.e.,
 nonvolatile disk storage) that is preserved across crashes, while the in-memory state is assumed
 to be lost.

Degraded performance and partial functionality
: In addition to crashing and restarting, nodes may go slow: they may still be able to respond to
 health check requests, while being too slow to get any real work done. For example, a Gigabit
 network interface could suddenly drop to 1 Kb/s throughput due to a driver bug [^110];
 a process that is under memory pressure may spend most of its time performing garbage collection [^111];
 worn-out SSDs can have erratic performance; and hardware can be affected by high temperature,
 loose connectors, mechanical vibration, power supply problems, firmware bugs, and more [^112].
 Such a situation is called a *limping node*, *gray failure*, or *fail-slow* [^113],
 and it can be even more difficult to deal with than a cleanly failed node. A related problem is
 when a process stops doing some of the things it is supposed to do while other aspects continue
 working, for example because a background thread has crashed or deadlocked [^114].

Byzantine (arbitrary) faults
: Nodes may do absolutely anything, including trying to trick and deceive other nodes, as described
 in the last section.

For modeling real systems, the partially synchronous model with crash-recovery faults is generally
the most useful model. It allows for unbounded network delay, process pauses, and slow nodes. But
how do distributed algorithms cope with that model?

#### Defining the correctness of an algorithm {#defining-the-correctness-of-an-algorithm}

To define what it means for an algorithm to be *correct*, we can describe its *properties*. For
example, the output of a sorting algorithm has the property that for any two distinct elements of
the output list, the element further to the left is smaller than the element further to the right.
That is simply a formal way of defining what it means for a list to be sorted.

Similarly, we can write down the properties we want of a distributed algorithm to define what it
means to be correct. For example, if we are generating fencing tokens for a lock (see
[“Fencing off zombies and delayed requests”](/en/ch9#sec_distributed_fencing_tokens)), we may require the algorithm to have the following properties:

Uniqueness
: No two requests for a fencing token return the same value.

Monotonic sequence
: If request *x* returned token *t*<sub>*x*</sub>, and request *y* returned token *t*<sub>*y*</sub>, and
 *x* completed before *y* began, then *t*<sub>*x*</sub> < *t*<sub>*y*</sub>.

Availability
: A node that requests a fencing token and does not crash eventually receives a response.

An algorithm is correct in some system model if it always satisfies its properties in all situations
that we assume may occur in that system model. However, if all nodes crash, or all network delays
suddenly become infinitely long, then no algorithm will be able to get anything done. How can we
still make useful guarantees even in a system model that allows complete failures?

#### Safety and liveness {#sec_distributed_safety_liveness}

To clarify the situation, it is worth distinguishing between two different kinds of properties:
*safety* and *liveness* properties. In the example just given, *uniqueness* and *monotonic sequence* are
safety properties, but *availability* is a liveness property.

What distinguishes the two kinds of properties? A giveaway is that liveness properties often include
the word “eventually” in their definition. (And yes, you guessed it—*eventual consistency* is a
liveness property [^115].)

Safety is often informally defined as *nothing bad happens*, and liveness as *something good
eventually happens*. However, it’s best to not read too much into those informal definitions,
because “good” and “bad” are value judgements that don’t apply well to algorithms. The actual
definitions of safety and liveness are more precise [^116]:

* If a safety property is violated, we can point at a particular point in time at which it was
 broken (for example, if the uniqueness property was violated, we can identify the particular
 operation in which a duplicate fencing token was returned). After a safety property has been
 violated, the violation cannot be undone—the damage is already done.
* A liveness property works the other way round: it may not hold at some point in time (for example,
 a node may have sent a request but not yet received a response), but there is always hope that it
 may be satisfied in the future (namely by receiving a response).

An advantage of distinguishing between safety and liveness properties is that it helps us deal with
difficult system models. For distributed algorithms, it is common to require that safety properties
*always* hold, in all possible situations of a system model [^108]. That is, even if all nodes crash, or
the entire network fails, the algorithm must nevertheless ensure that it does not return a wrong
result (i.e., that the safety properties remain satisfied).

However, with liveness properties we are allowed to make caveats: for example, we could say that a
request needs to receive a response only if a majority of nodes have not crashed, and only if the
network eventually recovers from an outage. The definition of the partially synchronous model
requires that eventually the system returns to a synchronous state—that is, any period of network
interruption lasts only for a finite duration and is then repaired.

#### Mapping system models to the real world {#mapping-system-models-to-the-real-world}

Safety and liveness properties and system models are very useful for reasoning about the correctness
of a distributed algorithm. However, when implementing an algorithm in practice, the messy facts of
reality come back to bite you again, and it becomes clear that the system model is a simplified
abstraction of reality.

For example, algorithms in the crash-recovery model generally assume that data in stable storage
survives crashes. However, what happens if the data on disk is corrupted, or the data is wiped out
due to hardware error or misconfiguration [^117]?
What happens if a server has a firmware bug and fails to recognize
its hard drives on reboot, even though the drives are correctly attached to the server [^118]?

Quorum algorithms (see [“Quorums for reading and writing”](/en/ch6#sec_replication_quorum_condition)) rely on a node remembering the data
that it claims to have stored. If a node may suffer from amnesia and forget previously stored data,
that breaks the quorum condition, and thus breaks the correctness of the algorithm. Perhaps a new
system model is needed, in which we assume that stable storage mostly survives crashes, but may
sometimes be lost. But that model then becomes harder to reason about.

The theoretical description of an algorithm can declare that certain things are simply assumed not
to happen—and in non-Byzantine systems, we do have to make some assumptions about faults that can
and cannot happen. However, a real implementation may still have to include code to handle the
case where something happens that was assumed to be impossible, even if that handling boils down to
`printf("Sucks to be you")` and `exit(666)`—i.e., letting a human operator clean up the mess [^119].
(This is one difference between computer science and software engineering.)

That is not to say that theoretical, abstract system models are worthless—quite the opposite.
They are incredibly helpful for distilling down the complexity of real systems to a manageable set
of faults that we can reason about, so that we can understand the problem and try to solve it
systematically.

### Formal Methods and Randomized Testing {#sec_distributed_formal}

How do we know that an algorithm satisfies the required properties? Due to concurrency, partial
failures, and network delays there are a huge number of potential states. We need to guarantee
that the properties hold in every possible state, and ensure that we haven’t forgotten about any
edge cases.

One approach is to formally verify an algorithm by describing it mathematically, and using proof
techniques to show that it satisfies the required properties in all situations that the system model
allows. Proving an algorithm correct does not mean its *implementation* on a real system will
necessarily always behave correctly. But it’s a very good first step, because the theoretical
analysis can uncover problems in an algorithm that might remain hidden for a long time in a real
system, and that only come to bite you when your assumptions (e.g., about timing) are defeated due
to unusual circumstances.

It is prudent to combine theoretical analysis with empirical testing to verify that implementations
behave as expected. Techniques such as property-based testing, fuzzing, and deterministic simulation
testing (DST) use randomization to test a system in a wide range of situations. Companies such as
Amazon Web Services have successfully used a combination of these techniques on many of their
products [^120] [^121].

#### Model checking and specification languages {#model-checking-and-specification-languages}

*Model checkers* are tools that help verify that an algorithm or system behaves as expected. An algorithm
specification is written in a purpose-built language such as TLA+, Gallina, or FizzBee. These
languages make it easier to focus on an algorithm’s behavior without worrying about code
implementation details. Model checkers then use these models to verify that invariants hold across
all of an algorithm’s states by systematically trying all the things that could happen.

Model checking can’t actually prove that an algorithm’s invariants hold for every possible state
since most real-world algorithms have an infinite state space. A true verification of all states
would require a formal proof, which can be done, but which is typically more difficult than running
a model checker. Instead, model checkers encourage you to reduce the algorithm’s model to an
approximation that can be fully verified, or to limit the execution to some upper bound (for
example, by setting a maximum number of messages that can be sent). Any bugs that only occur with
longer executions would then not be found.

Still, model checkers strike a nice balance between ease of use and the ability to find non-obvious
bugs. CockroachDB, TiDB, Kafka, and many other distributed systems use model specifications to find
and fix bugs [^122] [^123] [^124]. For example,
using TLA+, researchers were able to demonstrate the potential for data loss in viewstamped
replication (VR) caused by ambiguity in the prose description of the algorithm [^125].

By design, model checkers don’t run your actual code, but rather a simplified model that specifies
only the core ideas of your protocol. This makes it more tractable to systematically explore the
state space, but it risks that your specification and your implementation go out of sync with each other [^126].
It is possible to check whether the model and the real implementation have equivalent behavior, but
this requires instrumentation in the real implementation [^127].

#### Fault injection {#sec_fault_injection}

Many bugs are triggered when machine and network failures occur. Fault injection is an effective
(and sometimes scary) technique that verifies whether a system’s implementation works as expected when things
go wrong. The idea is simple: inject faults into a running system’s environment and see how it
behaves. Faults can be network failures, machine crashes, disk corruption, paused
processes—anything you can imagine going wrong with a computer.

Fault injection tests are typically run in an environment that closely resembles the production
environment where the system will run. Some even inject faults directly into their production
environment. Netflix popularized this approach with their Chaos Monkey tool [^128]. Production fault
injection is often referred to as *chaos engineering*, which we discussed in
[“Reliability and Fault Tolerance”](/en/ch2#sec_introduction_reliability).

To run fault injection tests, the system under test is first deployed along with fault injection
coordinators and scripts. Coordinators are responsible for deciding what faults to execute and when
to execute them. Local or remote scripts are responsible for injecting failures into individual
nodes or processes. Injection scripts use many different tools to trigger faults. A Linux process
can be paused or killed using Linux’s `kill` command, a disk can be unmounted with `umount`, and
network connections can be disrupted through firewall settings. You can inspect system behavior
during and after faults are injected to make sure things work as expected.

The myriad of tools required to trigger failures make fault injection tests cumbersome to write.
It’s common to adopt a fault injection framework like Jepsen to run fault injection tests to
simplify the process. Such frameworks come with integrations for various operating systems and many
pre-built fault injectors [^129].
Jepsen has been remarkably effective at finding critical bugs in many widely-used systems [^130] [^131].

#### Deterministic simulation testing {#deterministic-simulation-testing}

Deterministic simulation testing (DST) has also become a popular complement to model checking and
fault injection. It uses a similar state space exploration process as a model checker, but it tests
your actual code, not a model.

In DST, a simulation automatically runs through a large number of randomized executions of the
system. Network communication, I/O, and clock timing during the simulation are all replaced with
mocks that allow the simulator to control the exact order in which things happen, including various
timings and failure scenarios. This allows the simulator to explore many more situations than
hand-written tests or fault injection could. If a test fails, it can be re-run since the simulator
knows the exact order of operations that triggered the failure—in contrast to fault injection, which
does not have such fine-grained control over the system.

DST requires the simulator to be able to control all sources of nondeterminism, such as network
delays. One of three strategies is generally adopted to make code deterministic:

Application-level
: Some systems are built from the ground up to make it easy to execute code deterministically. For
 example, FoundationDB, one of the pioneers in the DST space, is built using an asynchronous
 communication library called Flow. Flow provides a point for developers to inject a deterministic
 network simulation into the system [^132].
 Similarly, TigerBeetle is an online transaction processing (OLTP) database with first-class DST
 support. The system’s state is modeled as a state machine, with all mutations occurring within a
 single event loop. When combined with mock deterministic primitives such as clocks, such an
 architecture is able to run deterministically [^133].

Runtime-level
: Languages with asynchronous runtimes and commonly used libraries provide an insertion point
 to introduce determinism. A single-threaded runtime is used to force all asynchronous code to run
 sequentially. FrostDB, for example, patches Go’s runtime to execute goroutines sequentially [^134].
 Rust’s madsim library works in a similar manner. Madsim provides deterministic implementations of
 Tokio’s asynchronous runtime API, AWS’s S3 library, Kafka’s Rust library, and many others.
 Applications can swap in deterministic libraries and runtimes to get deterministic test executions
 without changing their code.

Machine-level
: Rather than patching code at runtime, an entire machine can be made deterministic. This is a
 delicate process that requires a machine to respond to all normally nondeterministic calls with
 deterministic responses. Tools such as Antithesis do this by building a custom hypervisor that
 replaces normally nondeterministic operations with deterministic ones. Everything from clocks
 to network and storage needs to be accounted for. Once done, though, developers can run their
 entire distributed system in a collection of containers within the hypervisor and get a completely
 deterministic distributed system.

DST provides several advantages beyond replayability. Tools such as Antithesis attempt to explore
many different code paths in application code by branching a test execution into multiple
sub-executions when it discovers less common behavior. And because deterministic tests often use
mocked clocks and network calls, such tests can run faster than wall-clock time. For example,
TigerBeetle’s time abstraction allows simulations to simulate network latency and timeouts without
actually taking the full length of time to trigger the timeout. Such techniques allow the simulator
to explore more code paths faster.

#### The Power of Determinism {#sidebar_distributed_determinism}

Nondeterminism is at the core of all of the distributed systems challenges we discussed in this
chapter: concurrency, network delay, process pauses, clock jumps, and crashes all happen in
unpredictable ways that vary from one run of a system to the next. Conversely, if you can make a
system deterministic, that can hugely simplify things.

In fact, making things deterministic is a simple but powerful idea that arises again and again in
distributed system design. Besides deterministic simulation testing, we have seen several ways of
using determinism over the past chapters:

* A key advantage of event sourcing (see [“Event Sourcing and CQRS”](/en/ch3#sec_datamodels_events)) is that you can
 deterministically replay a log of events to reconstruct derived materialized views.
* Workflow engines (see [“Durable Execution and Workflows”](/en/ch5#sec_encoding_dataflow_workflows)) rely on workflow definitions being
 deterministic to provide durable execution semantics.
* *State machine replication*, which we will discuss in [“Using shared logs”](/en/ch10#sec_consistency_smr), replicates data by
 independently executing the same sequence of deterministic transactions on each replica. We have
 already seen two variants of that idea: statement-based replication (see
 [“Implementation of Replication Logs”](/en/ch6#sec_replication_implementation)) and serial transaction execution using stored procedures
 (see [“Pros and cons of stored procedures”](/en/ch8#sec_transactions_stored_proc_tradeoffs)).

However, making code fully deterministic requires care. Even once you have removed all concurrency
and replaced I/O, network communication, clocks, and random number generators with deterministic
simulations, elements of nondeterminism may remain. For example, in some programming languages, the
order in which you iterate over the elements of a hash table may be nondeterministic. Whether you
run into a resource limit (memory allocation failure, stack overflow) is also nondeterministic.

## Summary {#summary}

In this chapter we have discussed a wide range of problems that can occur in distributed systems,
including:

* Whenever you try to send a packet over the network, it may be lost or arbitrarily delayed.
 Likewise, the reply may be lost or delayed, so if you don’t get a reply, you have no idea whether
 the message got through.
* A node’s clock may be significantly out of sync with other nodes (despite your best efforts to set
 up NTP), it may suddenly jump forward or back in time, and relying on it is dangerous because you
 most likely don’t have a good measure of your clock’s confidence interval.
* A process may pause for a substantial amount of time at any point in its execution, be declared
 dead by other nodes, and then come back to life again without realizing that it was paused.

The fact that such *partial failures* can occur is the defining characteristic of distributed
systems. Whenever software tries to do anything involving other nodes, there is the possibility that
it may occasionally fail, or randomly go slow, or not respond at all (and eventually time out). In
distributed systems, we try to build tolerance of partial failures into software, so that the system
as a whole may continue functioning even when some of its constituent parts are broken.

To tolerate faults, the first step is to *detect* them, but even that is hard. Most systems
don’t have an accurate mechanism for detecting whether a node has failed, so most distributed
algorithms rely on timeouts to determine whether a remote node is still available. However, timeouts
can’t distinguish between network and node failures, and variable network delay sometimes causes a
node to be falsely suspected of crashing. Handling limping nodes, which are responding but are too
slow to do anything useful, is even harder.

Once a fault is detected, making a system tolerate it is not easy either: there is no global
variable, no shared memory, no common knowledge or any other kind of shared state between the machines [^83].
Nodes can’t even agree on what time it is, let alone on anything more profound. The only way
information can flow from one node to another is by sending it over the unreliable network. Major
decisions cannot be safely made by a single node, so we require protocols that enlist help from
other nodes and try to get a quorum to agree.

If you’re used to writing software in the idealized mathematical perfection of a single computer,
where the same operation always deterministically returns the same result, then moving to the messy
physical reality of distributed systems can be a bit of a shock. Conversely, distributed systems
engineers will often regard a problem as trivial if it can be solved on a single computer [^4],
and indeed a single computer can do a lot nowadays. If you can avoid opening Pandora’s box and
simply keep things on a single machine, for example by using an embedded storage engine (see [“Embedded storage engines”](/en/ch4#sidebar_embedded)), it is generally worth doing so.

However, as discussed in [“Distributed versus Single-Node Systems”](/en/ch1#sec_introduction_distributed), scalability is not the only reason for
wanting to use a distributed system. Fault tolerance and low latency (by placing data geographically
close to users) are equally important goals, and those things cannot be achieved with a single node.
The power of distributed systems is that in principle, they can run forever without being
interrupted at the service level, because all faults and maintenance can be handled at the node
level. (In practice, if a bad configuration change is rolled out to all nodes, that will still bring
a distributed system to its knees.)

In this chapter we also went on some tangents to explore whether the unreliability of networks,
clocks, and processes is an inevitable law of nature. We saw that it isn’t: it is possible to give
hard real-time response guarantees and bounded delays in networks, but doing so is very expensive and
results in lower utilization of hardware resources. Most non-safety-critical systems choose cheap
and unreliable over expensive and reliable.

This chapter has been all about problems, and has given us a bleak outlook. In the next chapter we
will move on to solutions, and discuss some algorithms that have been designed to cope with the
problems in distributed systems.


### References

[^1]: Mark Cavage. [There’s Just No Getting Around It: You’re Building a Distributed System](https://queue.acm.org/detail.cfm?id=2482856). *ACM Queue*, volume 11, issue 4, pages 80–89, April 2013. [doi:10.1145/2466486.2482856](https://doi.org/10.1145/2466486.2482856)
[^2]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW) 
[^3]: Coda Hale. [You Can’t Sacrifice Partition Tolerance](https://codahale.com/you-cant-sacrifice-partition-tolerance/). *codahale.com*, October 2010. Archived at [perma.cc/6GJU-X4G5](https://perma.cc/6GJU-X4G5)
[^4]: Jeff Hodges. [Notes on Distributed Systems for Young Bloods](https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/). *somethingsimilar.com*, January 2013. Archived at [perma.cc/B636-62CE](https://perma.cc/B636-62CE) 
[^5]: Van Jacobson. [Congestion Avoidance and Control](https://www.cs.usask.ca/ftp/pub/discus/seminars2002-2003/p314-jacobson.pdf). At *ACM Symposium on Communications Architectures and Protocols* (SIGCOMM), August 1988. [doi:10.1145/52324.52356](https://doi.org/10.1145/52324.52356) 
[^6]: Bert Hubert. [The Ultimate SO\_LINGER Page, or: Why Is My TCP Not Reliable](https://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable). *blog.netherlabs.nl*, January 2009. Archived at [perma.cc/6HDX-L2RR](https://perma.cc/6HDX-L2RR) 
[^7]: Jerome H. Saltzer, David P. Reed, and David D. Clark. [End-To-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf). *ACM Transactions on Computer Systems*, volume 2, issue 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](https://doi.org/10.1145/357401.357402) 
[^8]: Peter Bailis and Kyle Kingsbury. [The Network Is Reliable](https://queue.acm.org/detail.cfm?id=2655736). *ACM Queue*, volume 12, issue 7, pages 48–55, July 2014. [doi:10.1145/2639988.2639988](https://doi.org/10.1145/2639988.2639988)
[^9]: Joshua B. Leners, Trinabh Gupta, Marcos K. Aguilera, and Michael Walfish. [Taming Uncertainty in Distributed Systems with Help from the Network](https://cs.nyu.edu/~mwalfish/papers/albatross-eurosys15.pdf). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741976](https://doi.org/10.1145/2741948.2741976) 
[^10]: Phillipa Gill, Navendu Jain, and Nachiappan Nagappan. [Understanding Network Failures in Data Centers: Measurement, Analysis, and Implications](https://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p350.pdf). At *ACM SIGCOMM Conference*, August 2011. [doi:10.1145/2018436.2018477](https://doi.org/10.1145/2018436.2018477) 
[^11]: Urs Hölzle. [But recently a farmer had started grazing a herd of cows nearby. And whenever they stepped on the fiber link, they bent it enough to cause a blip](https://x.com/uhoelzle/status/1263333283107991558). *x.com*, May 2020. Archived at [perma.cc/WX8X-ZZA5](https://perma.cc/WX8X-ZZA5) 
[^12]: CBC News. [Hundreds lose internet service in northern B.C. after beaver chews through cable](https://www.cbc.ca/news/canada/british-columbia/beaver-internet-down-tumbler-ridge-1.6001594). *cbc.ca*, April 2021. Archived at [perma.cc/UW8C-H2MY](https://perma.cc/UW8C-H2MY) 
[^13]: Will Oremus. [The Global Internet Is Being Attacked by Sharks, Google Confirms](https://slate.com/technology/2014/08/shark-attacks-threaten-google-s-undersea-internet-cables-video.html). *slate.com*, August 2014. Archived at [perma.cc/P6F3-C6YG](https://perma.cc/P6F3-C6YG) 
[^14]: Jess Auerbach Jahajeeah. [Down to the wire: The ship fixing our internet](https://continent.substack.com/p/down-to-the-wire-the-ship-fixing). *continent.substack.com*, November 2023. Archived at [perma.cc/DP7B-EQ7S](https://perma.cc/DP7B-EQ7S) 
[^15]: Santosh Janardhan. [More details about the October 4 outage](https://engineering.fb.com/2021/10/05/networking-traffic/outage-details/). *engineering.fb.com*, October 2021. Archived at [perma.cc/WW89-VSXH](https://perma.cc/WW89-VSXH) 
[^16]: Tom Parfitt. [Georgian woman cuts off web access to whole of Armenia](https://www.theguardian.com/world/2011/apr/06/georgian-woman-cuts-web-access). *theguardian.com*, April 2011. Archived at [perma.cc/KMC3-N3NZ](https://perma.cc/KMC3-N3NZ) 
[^17]: Antonio Voce, Tural Ahmedzade and Ashley Kirk. [‘Shadow fleets’ and subaquatic sabotage: are Europe’s undersea internet cables under attack?](https://www.theguardian.com/world/ng-interactive/2025/mar/05/shadow-fleets-subaquatic-sabotage-europe-undersea-internet-cables-under-attack) *theguardian.com*, March 2025. Archived at [perma.cc/HA7S-ZDBV](https://perma.cc/HA7S-ZDBV) 
[^18]: Shengyun Liu, Paolo Viotti, Christian Cachin, Vivien Quéma, and Marko Vukolić. [XFT: Practical Fault Tolerance beyond Crashes](https://www.usenix.org/system/files/conference/osdi16/osdi16-liu.pdf). At *12th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2016. 
[^19]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ) 
[^20]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY) 
[^21]: Mohammed Alfatafta, Basil Alkhatib, Ahmed Alquraan, and Samer Al-Kiswany. [Toward a Generic Fault Tolerance Technique for Partial Network Partitioning](https://www.usenix.org/conference/osdi20/presentation/alfatafta). At *14th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2020. 
[^22]: Marc A. Donges. [Re: bnx2 cards Intermittantly Going Offline](https://www.spinics.net/lists/netdev/msg210485.html). Message to Linux *netdev* mailing list, *spinics.net*, September 2012. Archived at [perma.cc/TXP6-H8R3](https://perma.cc/TXP6-H8R3) 
[^23]: Troy Toman. [Inside a CODE RED: Network Edition](https://signalvnoise.com/svn3/inside-a-code-red-network-edition/). *signalvnoise.com*, September 2020. Archived at [perma.cc/BET6-FY25](https://perma.cc/BET6-FY25) 
[^24]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch](https://aphyr.com/posts/317-call-me-maybe-elasticsearch). *aphyr.com*, June 2014. Archived at [perma.cc/JK47-S89J](https://perma.cc/JK47-S89J)
[^25]: Salvatore Sanfilippo. [A Few Arguments About Redis Sentinel Properties and Fail Scenarios](https://antirez.com/news/80). *antirez.com*, October 2014. Archived at [perma.cc/8XEU-CLM8](https://perma.cc/8XEU-CLM8)
[^26]: Nicolas Liochon. [CAP: If All You Have Is a Timeout, Everything Looks Like a Partition](http://blog.thislongrun.com/2015/05/CAP-theorem-partition-timeout-zookeeper.html). *blog.thislongrun.com*, May 2015. Archived at [perma.cc/FS57-V2PZ](https://perma.cc/FS57-V2PZ) 
[^27]: Matthew P. Grosvenor, Malte Schwarzkopf, Ionel Gog, Robert N. M. Watson, Andrew W. Moore, Steven Hand, and Jon Crowcroft. [Queues Don’t Matter When You Can JUMP Them!](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-grosvenor_update.pdf) At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015. 
[^28]: Theo Julienne. [Debugging network stalls on Kubernetes](https://github.blog/engineering/debugging-network-stalls-on-kubernetes/). *github.blog*, November 2019. Archived at [perma.cc/K9M8-XVGL](https://perma.cc/K9M8-XVGL) 
[^29]: Guohui Wang and T. S. Eugene Ng. [The Impact of Virtualization on Network Performance of Amazon EC2 Data Center](https://www.cs.rice.edu/~eugeneng/papers/INFOCOM10-ec2.pdf). At *29th IEEE International Conference on Computer Communications* (INFOCOM), March 2010. [doi:10.1109/INFCOM.2010.5461931](https://doi.org/10.1109/INFCOM.2010.5461931) 
[^30]: Brandon Philips. [etcd: Distributed Locking and Service Discovery](https://www.youtube.com/watch?v=HJIjTTHWYnE). At *Strange Loop*, September 2014. 
[^31]: Steve Newman. [A Systematic Look at EC2 I/O](https://www.sentinelone.com/blog/a-systematic-look-at-ec2-i-o/). *blog.scalyr.com*, October 2012. Archived at [perma.cc/FL4R-H2VE](https://perma.cc/FL4R-H2VE) 
[^32]: Naohiro Hayashibara, Xavier Défago, Rami Yared, and Takuya Katayama. [The ϕ Accrual Failure Detector](https://hdl.handle.net/10119/4784). Japan Advanced Institute of Science and Technology, School of Information Science, Technical Report IS-RR-2004-010, May 2004. Archived at [perma.cc/NSM2-TRYA](https://perma.cc/NSM2-TRYA) 
[^33]: Jeffrey Wang. [Phi Accrual Failure Detector](https://ternarysearch.blogspot.com/2013/08/phi-accrual-failure-detector.html). *ternarysearch.blogspot.co.uk*, August 2013. Archived at [perma.cc/L452-AMLV](https://perma.cc/L452-AMLV)
[^34]: Srinivasan Keshav. *An Engineering Approach to Computer Networking: ATM Networks, the Internet, and the Telephone Network*. Addison-Wesley Professional, May 1997. ISBN: 978-0-201-63442-6 
[^35]: Othmar Kyas. *ATM Networks*. International Thomson Publishing, 1995. ISBN: 978-1-850-32128-6 
[^36]: Mellanox Technologies. [InfiniBand FAQ, Rev 1.3](https://network.nvidia.com/related-docs/whitepapers/InfiniBandFAQ_FQ_100.pdf). *network.nvidia.com*, December 2014. Archived at [perma.cc/LQJ4-QZVK](https://perma.cc/LQJ4-QZVK) 
[^37]: Jose Renato Santos, Yoshio Turner, and G. (John) Janakiraman. [End-to-End Congestion Control for InfiniBand](https://infocom2003.ieee-infocom.org/papers/28_01.PDF). At *22nd Annual Joint Conference of the IEEE Computer and Communications Societies* (INFOCOM), April 2003. Also published by HP Laboratories Palo Alto, Tech Report HPL-2002-359. [doi:10.1109/INFCOM.2003.1208949](https://doi.org/10.1109/INFCOM.2003.1208949) 
[^38]: Jialin Li, Naveen Kr. Sharma, Dan R. K. Ports, and Steven D. Gribble. [Tales of the Tail: Hardware, OS, and Application-level Sources of Tail Latency](https://syslab.cs.washington.edu/papers/latency-socc14.pdf). At *ACM Symposium on Cloud Computing* (SOCC), November 2014. [doi:10.1145/2670979.2670988](https://doi.org/10.1145/2670979.2670988) 
[^39]: Ulrich Windl, David Dalton, Marc Martinec, and Dale R. Worley. [The NTP FAQ and HOWTO](https://www.ntp.org/ntpfaq/). *ntp.org*, November 2006. 
[^40]: John Graham-Cumming. [How and why the leap second affected Cloudflare DNS](https://blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/). *blog.cloudflare.com*, January 2017. Archived at [archive.org](https://web.archive.org/web/20250202041444/https%3A//blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/) 
[^41]: David Holmes. [Inside the Hotspot VM: Clocks, Timers and Scheduling Events – Part I – Windows](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks). *blogs.oracle.com*, October 2006. Archived at [archive.org](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks) 
[^42]: Joran Dirk Greef. [Three Clocks are Better than One](https://tigerbeetle.com/blog/2021-08-30-three-clocks-are-better-than-one/). *tigerbeetle.com*, August 2021. Archived at [perma.cc/5RXG-EU6B](https://perma.cc/5RXG-EU6B) 
[^43]: Oliver Yang. [Pitfalls of TSC usage](https://oliveryang.net/2015/09/pitfalls-of-TSC-usage/). *oliveryang.net*, September 2015. Archived at [perma.cc/Z2QY-5FRA](https://perma.cc/Z2QY-5FRA) 
[^44]: Steve Loughran. [Time on Multi-Core, Multi-Socket Servers](https://steveloughran.blogspot.com/2015/09/time-on-multi-core-multi-socket-servers.html). *steveloughran.blogspot.co.uk*, September 2015. Archived at [perma.cc/7M4S-D4U6](https://perma.cc/7M4S-D4U6) 
[^45]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2012.
[^46]: M. Caporaloni and R. Ambrosini. [How Closely Can a Personal Computer Clock Track the UTC Timescale Via the Internet?](https://iopscience.iop.org/0143-0807/23/4/103/) *European Journal of Physics*, volume 23, issue 4, pages L17–L21, June 2002. [doi:10.1088/0143-0807/23/4/103](https://doi.org/10.1088/0143-0807/23/4/103)
[^47]: Nelson Minar. [A Survey of the NTP Network](https://alumni.media.mit.edu/~nelson/research/ntp-survey99/). *alumni.media.mit.edu*, December 1999. Archived at [perma.cc/EV76-7ZV3](https://perma.cc/EV76-7ZV3) 
[^48]: Viliam Holub. [Synchronizing Clocks in a Cassandra Cluster Pt. 1 – The Problem](https://blog.rapid7.com/2014/03/14/synchronizing-clocks-in-a-cassandra-cluster-pt-1-the-problem/). *blog.rapid7.com*, March 2014. Archived at [perma.cc/N3RV-5LNL](https://perma.cc/N3RV-5LNL) 
[^49]: Poul-Henning Kamp. [The One-Second War (What Time Will You Die?)](https://queue.acm.org/detail.cfm?id=1967009) *ACM Queue*, volume 9, issue 4, pages 44–48, April 2011. [doi:10.1145/1966989.1967009](https://doi.org/10.1145/1966989.1967009) 
[^50]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU) 
[^51]: Christopher Pascoe. [Time, Technology and Leaping Seconds](https://googleblog.blogspot.com/2011/09/time-technology-and-leaping-seconds.html). *googleblog.blogspot.co.uk*, September 2011. Archived at [perma.cc/U2JL-7E74](https://perma.cc/U2JL-7E74) 
[^52]: Mingxue Zhao and Jeff Barr. [Look Before You Leap – The Coming Leap Second and AWS](https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/). *aws.amazon.com*, May 2015. Archived at [perma.cc/KPE9-XMFM](https://perma.cc/KPE9-XMFM) 
[^53]: Darryl Veitch and Kanthaiah Vijayalayan. [Network Timing and the 2015 Leap Second](https://opus.lib.uts.edu.au/bitstream/10453/43923/1/LeapSecond_camera.pdf). At *17th International Conference on Passive and Active Measurement* (PAM), April 2016. [doi:10.1007/978-3-319-30505-9\_29](https://doi.org/10.1007/978-3-319-30505-9_29) 
[^54]: VMware, Inc. [Timekeeping in VMware Virtual Machines](https://www.vmware.com/docs/vmware_timekeeping). *vmware.com*, October 2008. Archived at [perma.cc/HM5R-T5NF](https://perma.cc/HM5R-T5NF) 
[^55]: Victor Yodaiken. [Clock Synchronization in Finance and Beyond](https://www.yodaiken.com/wp-content/uploads/2018/05/financeandbeyond.pdf). *yodaiken.com*, November 2017. Archived at [perma.cc/9XZD-8ZZN](https://perma.cc/9XZD-8ZZN) 
[^56]: Mustafa Emre Acer, Emily Stark, Adrienne Porter Felt, Sascha Fahl, Radhika Bhargava, Bhanu Dev, Matt Braithwaite, Ryan Sleevi, and Parisa Tabriz. [Where the Wild Warnings Are: Root Causes of Chrome HTTPS Certificate Errors](https://acmccs.github.io/papers/p1407-acerA.pdf). At *ACM SIGSAC Conference on Computer and Communications Security* (CCS), pages 1407–1420, October 2017. [doi:10.1145/3133956.3134007](https://doi.org/10.1145/3133956.3134007) 
[^57]: European Securities and Markets Authority. [MiFID II / MiFIR: Regulatory Technical and Implementing Standards – Annex I](https://www.esma.europa.eu/sites/default/files/library/2015/11/2015-esma-1464_annex_i_-_draft_rts_and_its_on_mifid_ii_and_mifir.pdf). *esma.europa.eu*, Report ESMA/2015/1464, September 2015. Archived at [perma.cc/ZLX9-FGQ3](https://perma.cc/ZLX9-FGQ3) 
[^58]: Luke Bigum. [Solving MiFID II Clock Synchronisation With Minimum Spend (Part 1)](https://catach.blogspot.com/2015/11/solving-mifid-ii-clock-synchronisation.html). *catach.blogspot.com*, November 2015. Archived at [perma.cc/4J5W-FNM4](https://perma.cc/4J5W-FNM4) 
[^59]: Oleg Obleukhov and Ahmad Byagowi. [How Precision Time Protocol is being deployed at Meta](https://engineering.fb.com/2022/11/21/production-engineering/precision-time-protocol-at-meta/). *engineering.fb.com*, November 2022. Archived at [perma.cc/29G6-UJNW](https://perma.cc/29G6-UJNW) 
[^60]: John Wiseman. [gpsjam.org](https://gpsjam.org/), July 2022. 
[^61]: Josh Levinson, Julien Ridoux, and Chris Munns. [It’s About Time: Microsecond-Accurate Clocks on Amazon EC2 Instances](https://aws.amazon.com/blogs/compute/its-about-time-microsecond-accurate-clocks-on-amazon-ec2-instances/). *aws.amazon.com*, November 2023. Archived at [perma.cc/56M6-5VMZ](https://perma.cc/56M6-5VMZ) 
[^62]: Kyle Kingsbury. [Call Me Maybe: Cassandra](https://aphyr.com/posts/294-call-me-maybe-cassandra/). *aphyr.com*, September 2013. Archived at [perma.cc/4MBR-J96V](https://perma.cc/4MBR-J96V) 
[^63]: John Daily. [Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/). *riak.com*, November 2013. Archived at [perma.cc/4XB5-UCXY](https://perma.cc/4XB5-UCXY) 
[^64]: Marc Brooker. [It’s About Time!](https://brooker.co.za/blog/2023/11/27/about-time.html) *brooker.co.za*, November 2023. Archived at [perma.cc/N6YK-DRPA](https://perma.cc/N6YK-DRPA) 
[^65]: Kyle Kingsbury. [The Trouble with Timestamps](https://aphyr.com/posts/299-the-trouble-with-timestamps). *aphyr.com*, October 2013. Archived at [perma.cc/W3AM-5VAV](https://perma.cc/W3AM-5VAV) 
[^66]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563) 
[^67]: Justin Sheehy. [There Is No Now: Problems With Simultaneity in Distributed Systems](https://queue.acm.org/detail.cfm?id=2745385). *ACM Queue*, volume 13, issue 3, pages 36–41, March 2015. [doi:10.1145/2733108](https://doi.org/10.1145/2733108) 
[^68]: Murat Demirbas. [Spanner: Google’s Globally-Distributed Database](https://muratbuffalo.blogspot.com/2013/07/spanner-googles-globally-distributed_4.html). *muratbuffalo.blogspot.co.uk*, July 2013. Archived at [perma.cc/6VWR-C9WB](https://perma.cc/6VWR-C9WB) 
[^69]: Dahlia Malkhi and Jean-Philippe Martin. [Spanner’s Concurrency Control](https://www.cs.cornell.edu/~ie53/publications/DC-col51-Sep13.pdf). *ACM SIGACT News*, volume 44, issue 3, pages 73–77, September 2013. [doi:10.1145/2527748.2527767](https://doi.org/10.1145/2527748.2527767) 
[^70]: Franck Pachot. [Achieving Precise Clock Synchronization on AWS](https://www.yugabyte.com/blog/aws-clock-synchronization/). *yugabyte.com*, December 2024. Archived at [perma.cc/UYM6-RNBS](https://perma.cc/UYM6-RNBS) 
[^71]: Spencer Kimball. [Living Without Atomic Clocks: Where CockroachDB and Spanner diverge](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/). *cockroachlabs.com*, January 2022. Archived at [perma.cc/AWZ7-RXFT](https://perma.cc/AWZ7-RXFT) 
[^72]: Murat Demirbas. [Use of Time in Distributed Databases (part 4): Synchronized clocks in production databases](https://muratbuffalo.blogspot.com/2025/01/use-of-time-in-distributed-databases.html). *muratbuffalo.blogspot.com*, January 2025. Archived at [perma.cc/9WNX-Q9U3](https://perma.cc/9WNX-Q9U3) 
[^73]: Cary G. Gray and David R. Cheriton. [Leases: An Efficient Fault-Tolerant Mechanism for Distributed File Cache Consistency](https://courses.cs.duke.edu/spring11/cps210/papers/p202-gray.pdf). At *12th ACM Symposium on Operating Systems Principles* (SOSP), December 1989. [doi:10.1145/74850.74870](https://doi.org/10.1145/74850.74870) 
[^74]: Daniel Sturman, Scott Delap, Max Ross, et al. [Roblox Return to Service](https://corp.roblox.com/newsroom/2022/01/roblox-return-to-service-10-28-10-31-2021). *corp.roblox.com*, January 2022. Archived at [perma.cc/8ALT-WAS4](https://perma.cc/8ALT-WAS4) 
[^75]: Todd Lipcon. [Avoiding Full GCs with MemStore-Local Allocation Buffers](https://www.slideshare.net/slideshow/hbase-hug-presentation/7038178). *slideshare.net*, February 2011. Archived at [perma.cc/CH62-2EWJ](https://perma.cc/CH62-2EWJ)
[^76]: Christopher Clark, Keir Fraser, Steven Hand, Jacob Gorm Hansen, Eric Jul, Christian Limpach, Ian Pratt, and Andrew Warfield. [Live Migration of Virtual Machines](https://www.usenix.org/legacy/publications/library/proceedings/nsdi05/tech/full_papers/clark/clark.pdf). At *2nd USENIX Symposium on Networked Systems Design & Implementation* (NSDI), May 2005.
[^77]: Mike Shaver. [fsyncers and Curveballs](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/). *shaver.off.net*, May 2008. Archived at [archive.org](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/) 
[^78]: Zhenyun Zhuang and Cuong Tran. [Eliminating Large JVM GC Pauses Caused by Background IO Traffic](https://engineering.linkedin.com/blog/2016/02/eliminating-large-jvm-gc-pauses-caused-by-background-io-traffic). *engineering.linkedin.com*, February 2016. Archived at [perma.cc/ML2M-X9XT](https://perma.cc/ML2M-X9XT) 
[^79]: Martin Thompson. [Java Garbage Collection Distilled](https://mechanical-sympathy.blogspot.com/2013/07/java-garbage-collection-distilled.html). *mechanical-sympathy.blogspot.co.uk*, July 2013. Archived at [perma.cc/DJT3-NQLQ](https://perma.cc/DJT3-NQLQ) 
[^80]: David Terei and Amit Levy. [Blade: A Data Center Garbage Collector](https://arxiv.org/pdf/1504.02578). arXiv:1504.02578, April 2015. 
[^81]: Martin Maas, Tim Harris, Krste Asanović, and John Kubiatowicz. [Trash Day: Coordinating Garbage Collection in Distributed Systems](https://timharris.uk/papers/2015-hotos.pdf). At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015. 
[^82]: Martin Fowler. [The LMAX Architecture](https://martinfowler.com/articles/lmax.html). *martinfowler.com*, July 2011. Archived at [perma.cc/5AV4-N6RJ](https://perma.cc/5AV4-N6RJ) 
[^83]: Joseph Y. Halpern and Yoram Moses. [Knowledge and common knowledge in a distributed environment](https://groups.csail.mit.edu/tds/papers/Halpern/JACM90.pdf). *Journal of the ACM* (JACM), volume 37, issue 3, pages 549–587, July 1990. [doi:10.1145/79147.79161](https://doi.org/10.1145/79147.79161) 
[^84]: Chuzhe Tang, Zhaoguo Wang, Xiaodong Zhang, Qianmian Yu, Binyu Zang, Haibing Guan, and Haibo Chen. [Ad Hoc Transactions in Web Applications: The Good, the Bad, and the Ugly](https://ipads.se.sjtu.edu.cn/_media/publications/concerto-sigmod22.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526120](https://doi.org/10.1145/3514221.3526120) 
[^85]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3 
[^86]: Enis Söztutar. [HBase and HDFS: Understanding Filesystem Usage in HBase](https://www.slideshare.net/slideshow/hbase-and-hdfs-understanding-filesystem-usage/22990858). At *HBaseCon*, June 2013. Archived at [perma.cc/4DXR-9P88](https://perma.cc/4DXR-9P88) 
[^87]: SUSE LLC. [SUSE Linux Enterprise High Availability 15 SP6 Administration Guide, Section 12: Fencing and STONITH](https://documentation.suse.com/sle-ha/15-SP6/html/SLE-HA-all/cha-ha-fencing.html). *documentation.suse.com*, March 2025. Archived at [perma.cc/8LAR-EL9D](https://perma.cc/8LAR-EL9D) 
[^88]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2006.
[^89]: Kyle Kingsbury. [etcd 3.4.3](https://jepsen.io/analyses/etcd-3.4.3). *jepsen.io*, January 2020. Archived at [perma.cc/2P3Y-MPWU](https://perma.cc/2P3Y-MPWU) 
[^90]: Ensar Basri Kahveci. [Distributed Locks are Dead; Long Live Distributed Locks!](https://hazelcast.com/blog/long-live-distributed-locks/) *hazelcast.com*, April 2019. Archived at [perma.cc/7FS5-LDXE](https://perma.cc/7FS5-LDXE) 
[^91]: Martin Kleppmann. [How to do distributed locking](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html). *martin.kleppmann.com*, February 2016. Archived at [perma.cc/Y24W-YQ5L](https://perma.cc/Y24W-YQ5L) 
[^92]: Salvatore Sanfilippo. [Is Redlock safe?](https://antirez.com/news/101) *antirez.com*, February 2016. Archived at [perma.cc/B6GA-9Q6A](https://perma.cc/B6GA-9Q6A) 
[^93]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y) 
[^94]: Leslie Lamport, Robert Shostak, and Marshall Pease. [The Byzantine Generals Problem](https://www.microsoft.com/en-us/research/publication/byzantine-generals-problem/). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 4, issue 3, pages 382–401, July 1982. [doi:10.1145/357172.357176](https://doi.org/10.1145/357172.357176) 
[^95]: Jim N. Gray. [Notes on Data Base Operating Systems](https://jimgray.azurewebsites.net/papers/dbos.pdf). In *Operating Systems: An Advanced Course*, Lecture Notes in Computer Science, volume 60, edited by R. Bayer, R. M. Graham, and G. Seegmüller, pages 393–481, Springer-Verlag, 1978. ISBN: 978-3-540-08755-7. Archived at [perma.cc/7S9M-2LZU](https://perma.cc/7S9M-2LZU)
[^96]: Brian Palmer. [How Complicated Was the Byzantine Empire?](https://slate.com/news-and-politics/2011/10/the-byzantine-tax-code-how-complicated-was-byzantium-anyway.html) *slate.com*, October 2011. Archived at [perma.cc/AN7X-FL3N](https://perma.cc/AN7X-FL3N) 
[^97]: Leslie Lamport. [My Writings](https://lamport.azurewebsites.net/pubs/pubs.html). *lamport.azurewebsites.net*, December 2014. Archived at [perma.cc/5NNM-SQGR](https://perma.cc/5NNM-SQGR) 
[^98]: John Rushby. [Bus Architectures for Safety-Critical Embedded Systems](https://www.csl.sri.com/papers/emsoft01/emsoft01.pdf). At *1st International Workshop on Embedded Software* (EMSOFT), October 2001. [doi:10.1007/3-540-45449-7\_22](https://doi.org/10.1007/3-540-45449-7_22) 
[^99]: Jake Edge. [ELC: SpaceX Lessons Learned](https://lwn.net/Articles/540368/). *lwn.net*, March 2013. Archived at [perma.cc/AYX8-QP5X](https://perma.cc/AYX8-QP5X) 
[^100]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458) 
[^101]: Ezra Feilden, Adi Oltean, and Philip Johnston. [Why we should train AI in space](https://www.starcloud.com/wp). White Paper, *starcloud.com*, September 2024. Archived at [perma.cc/7Y3S-8UB6](https://perma.cc/7Y3S-8UB6) 
[^102]: James Mickens. [The Saddest Moment](https://www.usenix.org/system/files/login-logout_1305_mickens.pdf). *USENIX ;login*, May 2013. Archived at [perma.cc/T7BZ-XCFR](https://perma.cc/T7BZ-XCFR) 
[^103]: Martin Kleppmann and Heidi Howard. [Byzantine Eventual Consistency and the Fundamental Limits of Peer-to-Peer Databases](https://arxiv.org/abs/2012.00472). *arxiv.org*, December 2020. [doi:10.48550/arXiv.2012.00472](https://doi.org/10.48550/arXiv.2012.00472) 
[^104]: Martin Kleppmann. [Making CRDTs Byzantine Fault Tolerant](https://martin.kleppmann.com/papers/bft-crdt-papoc22.pdf). At *9th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2022. [doi:10.1145/3517209.3524042](https://doi.org/10.1145/3517209.3524042) 
[^105]: Evan Gilman. [The Discovery of Apache ZooKeeper’s Poison Packet](https://www.pagerduty.com/blog/the-discovery-of-apache-zookeepers-poison-packet/). *pagerduty.com*, May 2015. Archived at [perma.cc/RV6L-Y5CQ](https://perma.cc/RV6L-Y5CQ) 
[^106]: Jonathan Stone and Craig Partridge. [When the CRC and TCP Checksum Disagree](https://conferences2.sigcomm.org/sigcomm/2000/conf/paper/sigcomm2000-9-1.pdf). At *ACM Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication* (SIGCOMM), August 2000. [doi:10.1145/347059.347561](https://doi.org/10.1145/347059.347561) 
[^107]: Evan Jones. [How Both TCP and Ethernet Checksums Fail](https://www.evanjones.ca/tcp-and-ethernet-checksums-fail.html). *evanjones.ca*, October 2015. Archived at [perma.cc/9T5V-B8X5](https://perma.cc/9T5V-B8X5) 
[^108]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283) 
[^109]: Richard D. Schlichting and Fred B. Schneider. [Fail-stop processors: an approach to designing fault-tolerant computing systems](https://www.cs.cornell.edu/fbs/publications/Fail_Stop.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 1, issue 3, pages 222–238, August 1983. [doi:10.1145/357369.357371](https://doi.org/10.1145/357369.357371) 
[^110]: Thanh Do, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, and Haryadi S. Gunawi. [Limplock: Understanding the Impact of Limpware on Scale-out Cloud Systems](https://ucare.cs.uchicago.edu/pdf/socc13-limplock.pdf). At *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523627](https://doi.org/10.1145/2523616.2523627) 
[^111]: Josh Snyder and Joseph Lynch. [Garbage collecting unhealthy JVMs, a proactive approach](https://netflixtechblog.medium.com/introducing-jvmquake-ec944c60ba70). Netflix Technology Blog, *netflixtechblog.medium.com*, November 2019. Archived at [perma.cc/8BTA-N3YB](https://perma.cc/8BTA-N3YB) 
[^112]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies* (FAST), February 2018.
[^113]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005) 
[^114]: Chang Lou, Peng Huang, and Scott Smith. [Understanding, Detecting and Localizing Partial Failures in Large System Software](https://www.usenix.org/conference/nsdi20/presentation/lou). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020. 
[^115]: Peter Bailis and Ali Ghodsi. [Eventual Consistency Today: Limitations, Extensions, and Beyond](https://queue.acm.org/detail.cfm?id=2462076). *ACM Queue*, volume 11, issue 3, pages 55–63, March 2013. [doi:10.1145/2460276.2462076](https://doi.org/10.1145/2460276.2462076)
[^116]: Bowen Alpern and Fred B. Schneider. [Defining Liveness](https://www.cs.cornell.edu/fbs/publications/DefLiveness.pdf). *Information Processing Letters*, volume 21, issue 4, pages 181–185, October 1985. [doi:10.1016/0020-0190(85)90056-0](https://doi.org/10.1016/0020-0190%2885%2990056-0) 
[^117]: Flavio P. Junqueira. [Dude, Where’s My Metadata?](https://fpj.me/2015/05/28/dude-wheres-my-metadata/) *fpj.me*, May 2015. Archived at [perma.cc/D2EU-Y9S5](https://perma.cc/D2EU-Y9S5) 
[^118]: Scott Sanders. [January 28th Incident Report](https://github.com/blog/2106-january-28th-incident-report). *github.com*, February 2016. Archived at [perma.cc/5GZR-88TV](https://perma.cc/5GZR-88TV) 
[^119]: Jay Kreps. [A Few Notes on Kafka and Jepsen](https://blog.empathybox.com/post/62279088548/a-few-notes-on-kafka-and-jepsen). *blog.empathybox.com*, September 2013. Archived at [perma.cc/XJ5C-F583](https://perma.cc/XJ5C-F583)
[^120]: Marc Brooker and Ankush Desai. [Systems Correctness Practices at AWS](https://dl.acm.org/doi/pdf/10.1145/3712057). *ACM Queue*, volume 22, issue 6, November/December 2024. [doi:10.1145/3712057](https://doi.org/10.1145/3712057)
[^121]: Andrey Satarin. [Testing Distributed Systems: Curated list of resources on testing distributed systems](https://asatarin.github.io/testing-distributed-systems/). *asatarin.github.io*. Archived at [perma.cc/U5V8-XP24](https://perma.cc/U5V8-XP24) 
[^122]: Jack Vanlightly. [Verifying Kafka transactions - Diary entry 2 - Writing an initial TLA+ spec](https://jack-vanlightly.com/analyses/2024/12/3/verifying-kafka-transactions-diary-entry-2-writing-an-initial-tla-spec). *jack-vanlightly.com*, December 2024. Archived at [perma.cc/NSQ8-MQ5N](https://perma.cc/NSQ8-MQ5N) 
[^123]: Siddon Tang. [From Chaos to Order — Tools and Techniques for Testing TiDB, A Distributed NewSQL Database](https://www.pingcap.com/blog/chaos-practice-in-tidb/). *pingcap.com*, April 2018. Archived at [perma.cc/5EJB-R29F](https://perma.cc/5EJB-R29F) 
[^124]: Nathan VanBenschoten. [Parallel Commits: An atomic commit protocol for globally distributed transactions](https://www.cockroachlabs.com/blog/parallel-commits/). *cockroachlabs.com*, November 2019. Archived at [perma.cc/5FZ7-QK6J](https://perma.cc/5FZ7-QK6J)
[^125]: Jack Vanlightly. [Paper: VR Revisited - State Transfer (part 3)](https://jack-vanlightly.com/analyses/2022/12/28/paper-vr-revisited-state-transfer-part-3). *jack-vanlightly.com*, December 2022. Archived at [perma.cc/KNK3-K6WS](https://perma.cc/KNK3-K6WS) 
[^126]: Hillel Wayne. [What if the spec doesn’t match the code?](https://buttondown.com/hillelwayne/archive/what-if-the-spec-doesnt-match-the-code/) *buttondown.com*, March 2024. Archived at [perma.cc/8HEZ-KHER](https://perma.cc/8HEZ-KHER) 
[^127]: Lingzhi Ouyang, Xudong Sun, Ruize Tang, Yu Huang, Madhav Jivrajani, Xiaoxing Ma, and Tianyin Xu. [Multi-Grained Specifications for Distributed System Model Checking and Verification](https://arxiv.org/abs/2409.14301). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696069](https://doi.org/10.1145/3689031.3696069)
[^128]: Yury Izrailevsky and Ariel Tseitlin. [The Netflix Simian Army](https://netflixtechblog.com/the-netflix-simian-army-16e57fbab116). *netflixtechblog.com*, July 2011. Archived at [perma.cc/M3NY-FJW6](https://perma.cc/M3NY-FJW6)
[^129]: Kyle Kingsbury. [Jepsen: On the perils of network partitions](https://aphyr.com/posts/281-jepsen-on-the-perils-of-network-partitions). *aphyr.com*, May 2013. Archived at [perma.cc/W98G-6HQP](https://perma.cc/W98G-6HQP)
[^130]: Kyle Kingsbury. [Jepsen Analyses](https://jepsen.io/analyses). *jepsen.io*, 2024. Archived at [perma.cc/8LDN-D2T8](https://perma.cc/8LDN-D2T8) 
[^131]: Rupak Majumdar and Filip Niksic. [Why is random testing effective for partition tolerance bugs?](https://dl.acm.org/doi/pdf/10.1145/3158134) *Proceedings of the ACM on Programming Languages* (PACMPL), volume 2, issue POPL, article no. 46, December 2017. [doi:10.1145/3158134](https://doi.org/10.1145/3158134) 
[^132]: FoundationDB project authors. [Simulation and Testing](https://apple.github.io/foundationdb/testing.html). *apple.github.io*. Archived at [perma.cc/NQ3L-PM4C](https://perma.cc/NQ3L-PM4C) 
[^133]: Alex Kladov. [Simulation Testing For Liveness](https://tigerbeetle.com/blog/2023-07-06-simulation-testing-for-liveness/). *tigerbeetle.com*, July 2023. Archived at [perma.cc/RKD4-HGCR](https://perma.cc/RKD4-HGCR) 
[^134]: Alfonso Subiotto Marqués. [(Mostly) Deterministic Simulation Testing in Go](https://www.polarsignals.com/blog/posts/2024/05/28/mostly-dst-in-go). *polarsignals.com*, May 2024. Archived at [perma.cc/ULD6-TSA4](https://perma.cc/ULD6-TSA4) 


================================================
FILE: content/en/colophon.md
================================================
---
title: Colophon
weight: 600
breadcrumbs: false
---

## About the Author

**Martin Kleppmann** is an Associate Professor at the University of Cambridge, UK, where he teaches on distributed systems and cryptographic protocols. 
The first edition of *Designing Data-Intensive Applications* in 2017 established him as an authority on data systems, 
and through his research on distributed systems he helped start the local-first software movement. 
Previously he was a software engineer and entrepreneur at internet companies including LinkedIn and Rapportive, 
where he worked on large-scale data infrastructure.

![DDIA poster](https://martin.kleppmann.com/2017/03/ddia-poster.jpg)

**Chris Riccomini** is a software engineer, startup investor, and author with 15+ years of experience at PayPal, 
LinkedIn, and WePay. He runs Materialized View Capital, where he invests in infrastructure startups. 
He is also the co-creator of Apache Samza and SlateDB, 
and co-author of *The Missing README: A Guide for the New Software Engineer*.


## Colophon

The animal on the cover of *Designing Data-Intensive Applications* is an Indian wild boar (*Sus scrofa cristatus*), a subspecies of wild boar found in India, Myanmar, Nepal, Sri Lanka, and Thailand. They are distinctive from European boars in that they have higher back bristles, no woolly undercoat, and a larger, straighter skull.

The Indian wild boar has a coat of gray or black hair, with stiff bristles running along the spine. Males have protruding canine teeth (called tushes) that are used to fight with rivals or fend off predators. Males are larger than females, but the species averages 33–35 inches tall at the shoulder and 200–300 pounds in weight. Their natural predators include bears, tigers, and various big cats.

These animals are nocturnal and omnivorous—they eat a wide variety of things, including roots, insects, carrion, nuts, berries, and small animals. Wild boars are also known to root through garbage and crop fields, causing a great deal of destruction and earning the enmity of farmers. They need to eat 4,000–4,500 calories a day. Boars have a well-developed sense of smell, which helps them forage for underground plant material and burrowing animals. However, their eyesight is poor.

Wild boars have long held significance in human culture. In Hindu lore, the boar is an avatar of the god Vishnu. In ancient Greek funerary monuments, it was a symbol of a gallant loser (in contrast to the victorious lion). Due to its aggression, it was depicted on the armor and weapons of Scandinavian, Germanic, and Anglo-Saxon warriors. In the Chinese zodiac, it symbolizes determination and impetuosity.

Many of the animals on O’Reilly covers are endangered; all of them are important to the world. To learn more about how you can help, go to *animals.oreilly.com*.

The cover image is from Shaw’s *Zoology*. The cover fonts are URW Typewriter and Guardian Sans. The text font is Adobe Minion Pro; the font in diagrams is Adobe Myriad Pro; the heading font is Adobe Myriad Condensed; and the code font is Dalton Maag’s Ubuntu Mono.


================================================
FILE: content/en/glossary.md
================================================
---
title: Glossary
weight: 500
breadcrumbs: false
---

> Please note that the definitions in this glossary are short and simple, intended to convey the core idea but not the full subtleties of a term. For more detail, please follow the references into the main text.

### asynchronous

Not waiting for something to complete (e.g., sending data over the network to another node), and not making any assumptions about how long it is going to take. See [“Synchronous Versus Asynchronous Replication”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_sync_async), [“Synchronous Versus Asynchronous Networks”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_sync_networks), and [“System Model and Reality”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_system_model).

### atomic

1.  In the context of concurrency: describing an operation that appears to take effect at a single point in time, so another concurrent process can never encounter the operation in a “half-finished” state. See also *isolation*.

2.  In the context of transactions: grouping together a set of writes that must either all be committed or all be rolled back, even if faults occur. See [“Atomicity”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_acid_atomicity) and [“Two-Phase Commit (2PC)”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_2pc).

### backpressure

Forcing the sender of some data to slow down when the recipient cannot keep up with it. Also known as *flow control*. See [“When an Overloaded System Won’t Recover”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch02.html#sidebar_metastable).

### batch process

A computation that takes some fixed (and usually large) set of data as input and produces some other data as output, without modifying the input. See [Chapter 11](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch11.html#ch_batch).

### bounded

Having some known upper limit or size. Used for example in the context of network delay (see [“Timeouts and Unbounded Delays”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_queueing)) and datasets (see the introduction to [Chapter 12](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch12.html#ch_stream)).

### Byzantine fault

A node that behaves incorrectly in some arbitrary way, for example by sending contradictory or malicious messages to other nodes. See [“Byzantine Faults”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_byzantine).

### cache

A component that remembers recently used data in order to speed up future reads of the same data. It is generally not complete: thus, if some data is missing from the cache, it has to be fetched from some underlying, slower data storage system that has a complete copy of the data.

### CAP theorem

A widely misunderstood theoretical result that is not useful in practice. See [“The CAP theorem”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch10.html#sec_consistency_cap).

### causality

The dependency between events that arises when one thing “happens before” another thing in a system. For example, a later event that is in response to an earlier event, or builds upon an earlier event, or should be understood in the light of an earlier event. See [“The “happens-before” relation and concurrency”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_happens_before).

### consensus

A fundamental problem in distributed computing, concerning getting several nodes to agree on something (for example, which node should be the leader for a database cluster). The problem is much harder than it seems at first glance. See [“Consensus”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch10.html#sec_consistency_consensus).

### data warehouse

A database in which data from several different OLTP systems has been combined and prepared to be used for analytics purposes. See [“Data Warehousing”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_dwh).

### declarative

Describing the properties that something should have, but not the exact steps for how to achieve it. In the context of database queries, a query optimizer takes a declarative query and decides how it should best be executed. See [“Terminology: Declarative Query Languages”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sidebar_declarative).

### denormalize

To introduce some amount of redundancy or duplication in a *normalized* dataset, typically in the form of a *cache* or *index*, in order to speed up reads. A denormalized value is a kind of precomputed query result, similar to a materialized view. See [“Normalization, Denormalization, and Joins”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_normalization).

### derived data

A dataset that is created from some other data through a repeatable process, which you could run again if necessary. Usually, derived data is needed to speed up a particular kind of read access to the data. Indexes, caches, and materialized views are examples of derived data. See [“Systems of Record and Derived Data”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_derived).

### deterministic

Describing a function that always produces the same output if you give it the same input. This means it cannot depend on random numbers, the time of day, network communication, or other unpredictable things. See [“The Power of Determinism”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sidebar_distributed_determinism).

### distributed

Running on several nodes connected by a network. Characterized by *partial failures*: some part of the system may be broken while other parts are still working, and it is often impossible for the software to know what exactly is broken. See [“Faults and Partial Failures”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_partial_failure).

### durable

Storing data in a way such that you believe it will not be lost, even if various faults occur. See [“Durability”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_acid_durability).

### ETL

Extract–Transform–Load. The process of extracting data from a source database, transforming it into a form that is more suitable for analytic queries, and loading it into a data warehouse or batch processing system. See [“Data Warehousing”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_dwh).

### failover

In systems that have a single leader, failover is the process of moving the leadership role from one node to another. See [“Handling Node Outages”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_failover).

### fault-tolerant

Able to recover automatically if something goes wrong (e.g., if a machine crashes or a network link fails). See [“Reliability and Fault Tolerance”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch02.html#sec_introduction_reliability).

### flow control

See *backpressure*.

### follower

A replica that does not directly accept any writes from clients, but only processes data changes that it receives from a leader. Also known as a *secondary*, *read replica*, or *hot standby*. See [“Single-Leader Replication”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_leader).

### full-text search

Searching text by arbitrary keywords, often with additional features such as matching similarly spelled words or synonyms. A full-text index is a kind of *secondary index* that supports such queries. See [“Full-Text Search”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch04.html#sec_storage_full_text).

### graph

A data structure consisting of *vertices* (things that you can refer to, also known as *nodes* or *entities*) and *edges* (connections from one vertex to another, also known as *relationships* or *arcs*). See [“Graph-Like Data Models”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_graph).

### hash

A function that turns an input into a random-looking number. The same input always returns the same number as output. Two different inputs are very likely to have two different numbers as output, although it is possible that two different inputs produce the same output (this is called a *collision*). See [“Sharding by Hash of Key”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch07.html#sec_sharding_hash).

### idempotent

Describing an operation that can be safely retried; if it is executed more than once, it has the same effect as if it was only executed once. See [“Idempotence”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch12.html#sec_stream_idempotence).

### index

A data structure that lets you efficiently search for all records that have a particular value in a particular field. See [“Storage and Indexing for OLTP”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch04.html#sec_storage_oltp).

### isolation

In the context of transactions, describing the degree to which concurrently executing transactions can interfere with each other. *Serializable* isolation provides the strongest guarantees, but weaker isolation levels are also used. See [“Isolation”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_acid_isolation).

### join

To bring together records that have something in common. Most commonly used in the case where one record has a reference to another (a foreign key, a document reference, an edge in a graph) and a query needs to get the record that the reference points to. See [“Normalization, Denormalization, and Joins”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_normalization) and [“JOIN and GROUP BY”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch11.html#sec_batch_join).

### leader

When data or a service is replicated across several nodes, the leader is the designated replica that is allowed to make changes. A leader may be elected through some protocol, or manually chosen by an administrator. Also known as the *primary* or *source*. See [“Single-Leader Replication”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_leader).

### linearizable

Behaving as if there was only a single copy of data in the system, which is updated by atomic operations. See [“Linearizability”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch10.html#sec_consistency_linearizability).

### locality

A performance optimization: putting several pieces of data in the same place if they are frequently needed at the same time. See [“Data locality for reads and writes”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_document_locality).

### lock

A mechanism to ensure that only one thread, node, or transaction can access something, and anyone else who wants to access the same thing must wait until the lock is released. See [“Two-Phase Locking (2PL)”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_2pl) and [“Distributed Locks and Leases”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_lock_fencing).

### log

An append-only file for storing data. A *write-ahead log* is used to make a storage engine resilient against crashes (see [“Making B-trees reliable”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch04.html#sec_storage_btree_wal)), a *log-structured* storage engine uses logs as its primary storage format (see [“Log-Structured Storage”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch04.html#sec_storage_log_structured)), a *replication log* is used to copy writes from a leader to followers (see [“Single-Leader Replication”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_leader)), and an *event log* can represent a data stream (see [“Log-based Message Brokers”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch12.html#sec_stream_log)).

### materialize

To perform a computation eagerly and write out its result, as opposed to calculating it on demand when requested. See [“Event Sourcing and CQRS”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_events).

### node

An instance of some software running on a computer, which communicates with other nodes via a network in order to accomplish some task.

### normalized

Structured in such a way that there is no redundancy or duplication. In a normalized database, when some piece of data changes, you only need to change it in one place, not many copies in many different places. See [“Normalization, Denormalization, and Joins”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_normalization).

### OLAP

Online analytic processing. Access pattern characterized by aggregating (e.g., count, sum, average) over a large number of records. See [“Operational Versus Analytical Systems”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_analytics).

### OLTP

Online transaction processing. Access pattern characterized by fast queries that read or write a small number of records, usually indexed by key. See [“Operational Versus Analytical Systems”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_analytics).

### sharding

Splitting up a large dataset or computation that is too big for a single machine into smaller parts and spreading them across several machines. Also known as *partitioning*. See [Chapter 7](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch07.html#ch_sharding).

### percentile

A way of measuring the distribution of values by counting how many values are above or below some threshold. For example, the 95th percentile response time during some period is the time *t* such that 95% of requests in that period complete in less than *t*, and 5% take longer than *t*. See [“Describing Performance”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch02.html#sec_introduction_percentiles).

### primary key

A value (typically a number or a string) that uniquely identifies a record. In many applications, primary keys are generated by the system when a record is created (e.g., sequentially or randomly); they are not usually set by users. See also *secondary index*.

### quorum

The minimum number of nodes that need to vote on an operation before it can be considered successful. See [“Quorums for reading and writing”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_quorum_condition).

### rebalance

To move data or services from one node to another in order to spread the load fairly. See [“Sharding of Key-Value Data”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch07.html#sec_sharding_key_value).

### replication

Keeping a copy of the same data on several nodes (*replicas*) so that it remains accessible if a node becomes unreachable. See [Chapter 6](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#ch_replication).

### schema

A description of the structure of some data, including its fields and datatypes. Whether some data conforms to a schema can be checked at various points in the data’s lifetime (see [“Schema flexibility in the document model”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch03.html#sec_datamodels_schema_flexibility)), and a schema can change over time (see [Chapter 5](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch05.html#ch_encoding)).

### secondary index

An additional data structure that is maintained alongside the primary data storage and which allows you to efficiently search for records that match a certain kind of condition. See [“Multi-Column and Secondary Indexes”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch04.html#sec_storage_index_multicolumn) and [“Sharding and Secondary Indexes”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch07.html#sec_sharding_secondary_indexes).

### serializable

An *isolation* guarantee that if several transactions execute concurrently, they behave the same as if they had executed one at a time, in some serial order. See [“Serializability”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_serializability).

### shared-nothing

An architecture in which independent nodes—each with their own CPUs, memory, and disks—are connected via a conventional network, in contrast to shared-memory or shared-disk architectures. See [“Shared-Memory, Shared-Disk, and Shared-Nothing Architecture”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch02.html#sec_introduction_shared_nothing).

### skew

1.  Imbalanced load across shards, such that some shards have lots of requests or data, and others have much less. Also known as *hot spots*. See [“Skewed Workloads and Relieving Hot Spots”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch07.html#sec_sharding_skew).

2.  A timing anomaly that causes events to appear in an unexpected, nonsequential order. See the discussions of *read skew* in [“Snapshot Isolation and Repeatable Read”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_snapshot_isolation), *write skew* in [“Write Skew and Phantoms”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_write_skew), and *clock skew* in [“Timestamps for ordering events”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_lww).

### split brain

A scenario in which two nodes simultaneously believe themselves to be the leader, and which may cause system guarantees to be violated. See [“Handling Node Outages”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch06.html#sec_replication_failover) and [“The Majority Rules”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_majority).

### stored procedure

A way of encoding the logic of a transaction such that it can be entirely executed on a database server, without communicating back and forth with a client during the transaction. See [“Actual Serial Execution”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_serial).

### stream process

A continually running computation that consumes a never-ending stream of events as input, and derives some output from it. See [Chapter 12](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch12.html#ch_stream).

### synchronous

The opposite of *asynchronous*.

### system of record

A system that holds the primary, authoritative version of some data, also known as the *source of truth*. Changes are first written here, and other datasets may be derived from the system of record. See [“Systems of Record and Derived Data”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html#sec_introduction_derived).

### timeout

One of the simplest ways of detecting a fault, namely by observing the lack of a response within some amount of time. However, it is impossible to know whether a timeout is due to a problem with the remote node, or an issue in the network. See [“Timeouts and Unbounded Delays”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch09.html#sec_distributed_queueing).

### total order

A way of comparing things (e.g., timestamps) that allows you to always say which one of two things is greater and which one is lesser. An ordering in which some things are incomparable (you cannot say which is greater or smaller) is called a *partial order*.

### transaction

Grouping together several reads and writes into a logical unit, in order to simplify error handling and concurrency issues. See [Chapter 8](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#ch_transactions).

### two-phase commit (2PC)

An algorithm to ensure that several database nodes either all *atomically* commit or all abort a transaction. See [“Two-Phase Commit (2PC)”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_2pc).

### two-phase locking (2PL)

An algorithm for achieving *serializable isolation* that works by a transaction acquiring a lock on all data it reads or writes, and holding the lock until the end of the transaction. See [“Two-Phase Locking (2PL)”](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch08.html#sec_transactions_2pl).

### unbounded

Not having any known upper limit or size. The opposite of *bounded*.


================================================
FILE: content/en/indexes.md
================================================
---
title: Indexes
weight: 550
breadcrumbs: false
---

### Symbols

- 3FS (distributed filesystem), [Distributed Filesystems](/en/ch11#sec_batch_dfs)

### A

- aborts (transactions), [Transactions](/en/ch8#ch_transactions), [Atomicity](/en/ch8#sec_transactions_acid_atomicity)
  - cascading, [No dirty reads](/en/ch8#no-dirty-reads)
  - in two-phase commit, [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
  - performance of optimistic concurrency control, [Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
  - retrying aborted transactions, [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- abstraction, [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Simplicity: Managing Complexity](/en/ch2#id38), [Data Models and Query Languages](/en/ch3#ch_datamodels), [Transactions](/en/ch8#ch_transactions), [Summary](/en/ch8#summary)
- accidental complexity, [Simplicity: Managing Complexity](/en/ch2#id38)
- accountability, [Responsibility and Accountability](/en/ch14#id371)
- accounting (financial data), [Summary](/en/ch3#summary), [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
- Accumulo (database)
  - wide-column data model, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality), [Column Compression](/en/ch4#sec_storage_column_compression)
- ACID properties (transactions), [The Meaning of ACID](/en/ch8#sec_transactions_acid)
  - atomicity, [Atomicity](/en/ch8#sec_transactions_acid_atomicity), [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
  - consistency, [Consistency](/en/ch8#sec_transactions_acid_consistency), [Maintaining integrity in the face of software bugs](/en/ch13#id455)
  - durability, [Making B-trees reliable](/en/ch4#sec_storage_btree_wal), [Durability](/en/ch8#durability)
  - isolation, [Isolation](/en/ch8#sec_transactions_acid_isolation), [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
- acknowledgements (messaging), [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
- active/active replication (see multi-leader replication)
- active/passive replication (see leader-based replication)
- ActiveMQ (messaging), [Message brokers](/en/ch5#message-brokers), [Message brokers compared to databases](/en/ch12#id297)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
- ActiveRecord (object-relational mapper), [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm), [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- activity (workflows) (see workflow engines)
- actor model, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
  - (see also event-driven architecture)
  - comparison to stream processing, [Event-Driven Architectures and RPC](/en/ch12#sec_stream_actors_drpc)
- adaptive capacity, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
- Advanced Message Queuing Protocol (see AMQP)
- aerospace systems, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
- Aerospike (database)
  - strong consistency mode, [Single-object writes](/en/ch8#sec_transactions_single_object)
- AGE (graph database), [The Cypher Query Language](/en/ch3#id57)
- aggregation
  - data cubes and materialized views, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
  - in batch processes, [Sorting Versus In-memory Aggregation](/en/ch11#id275)
  - in stream processes, [Stream analytics](/en/ch12#id318)
- aggregation pipeline (MongoDB), [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization), [Query languages for documents](/en/ch3#query-languages-for-documents)
- Agile, [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability)
  - minimizing irreversibility, [Batch Processing](/en/ch11#ch_batch), [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
  - moving faster with confidence, [The end-to-end argument again](/en/ch13#id456)
- agreement, [Single-value consensus](/en/ch10#single-value-consensus), [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
  - (see also consensus)
- AI (artificial intelligence) (see machine learning)
- AI Act (European Union), [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- AirByte, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- Airflow (workflow scheduler), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows), [Batch Processing](/en/ch11#ch_batch), [Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - cloud data warehouse integration, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - use for ETL, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- Akamai
  - response time study, [Average, Median, and Percentiles](/en/ch2#id24)
- algorithms
  - algorithm correctness, [Defining the correctness of an algorithm](/en/ch9#defining-the-correctness-of-an-algorithm)
  - B-trees, [B-Trees](/en/ch4#sec_storage_b_trees)-[B-tree variants](/en/ch4#b-tree-variants)
  - for distributed systems, [System Model and Reality](/en/ch9#sec_distributed_system_model)
  - mergesort, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Shuffling Data](/en/ch11#sec_shuffle)
  - scheduling, [Resource Allocation](/en/ch11#id279)
  - SSTables and LSM-trees, [The SSTable file format](/en/ch4#the-sstable-file-format)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
- all-to-all replication topologies, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)
- AllegroGraph (database), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - SPARQL query language, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- ALTER TABLE statement (SQL), [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility), [Encoding and Evolution](/en/ch5#ch_encoding)
- Amazon
  - Dynamo (see Dynamo (database))
  - response time study, [Average, Median, and Percentiles](/en/ch2#id24)
- Amazon Web Services (AWS)
  - Aurora (see Aurora (cloud database))
  - ClockBound (see ClockBound (time sync))
  - correctness testing, [Formal Methods and Randomized Testing](/en/ch9#sec_distributed_formal)
  - DynamoDB (see DynamoDB (database))
  - EBS (see EBS (virtual block device))
  - Kinesis (see Kinesis (messaging))
  - Neptune (see Neptune (graph database))
  - network reliability, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - S3 (see S3 (object storage))
- amplification
  - of bias, [Bias and Discrimination](/en/ch14#id370)
  - of failures, [Maintaining derived state](/en/ch13#id446)
  - of tail latency, [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla), [Local Secondary Indexes](/en/ch7#id166)
  - write amplification, [Write amplification](/en/ch4#write-amplification)
- AMQP (Advanced Message Queuing Protocol), [Message brokers compared to databases](/en/ch12#id297)
  - (see also messaging systems)
  - comparison to log-based messaging, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Replaying old messages](/en/ch12#sec_stream_replay)
  - message ordering, [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
- analytical systems, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
  - as derived data systems, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - ETL from operational systems, [Data Warehousing](/en/ch1#sec_introduction_dwh)
  - governance, [Beyond the data lake](/en/ch1#beyond-the-data-lake)
- analytics, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)-[Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - comparison to transaction processing, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
  - data normalization, [Trade-offs of normalization](/en/ch3#trade-offs-of-normalization)
  - data warehousing (see data warehousing)
  - predictive (see predictive analytics)
  - relation to batch processing, [Analytics](/en/ch11#sec_batch_olap)-[Analytics](/en/ch11#sec_batch_olap)
  - schemas for, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)-[Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
  - snapshot isolation for queries, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - stream analytics, [Stream analytics](/en/ch12#id318)
- analytics engineering, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
- anti-entropy, [Catching up on missed writes](/en/ch6#sec_replication_read_repair)
- Antithesis (deterministic simulation testing), [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- Apache Accumulo (see Accumulo)
- Apache ActiveMQ (see ActiveMQ)
- Apache AGE (see AGE)
- Apache Arrow (see Arrow (data format))
- Apache Avro (see Avro)
- Apache Beam (see Beam)
- Apache BookKeeper (see BookKeeper)
- Apache Cassandra (see Cassandra)
- Apache Curator (see Curator)
- Apache DataFusion (see DataFusion (query engine))
- Apache Druid (see Druid (database))
- Apache Flink (see Flink (processing framework))
- Apache HBase (see HBase)
- Apache Iceberg (see Iceberg (table format))
- Apache Jena (see Jena)
- Apache Kafka (see Kafka)
- Apache Lucene (see Lucene)
- Apache Oozie (see Oozie (workflow scheduler))
- Apache ORC (see ORC (data format))
- Apache Parquet (see Parquet (data format))
- Apache Pig (query language), [Query languages](/en/ch11#sec_batch_query_lanauges)
- Apache Pinot (see Pinot (database))
- Apache Pulsar (see Pulsar)
- Apache Qpid (see Qpid)
- Apache Samza (see Samza)
- Apache Solr (see Solr)
- Apache Spark (see Spark (processing framework))
- Apache Storm (see Storm)
- Apache Superset (see Superset (data visualization software))
- Apache Thrift (see Thrift)
- Apache ZooKeeper (see ZooKeeper)
- Apama (stream analytics), [Complex event processing](/en/ch12#id317)
- append-only files (see logs)
- Application Programming Interfaces (APIs), [Data Models and Query Languages](/en/ch3#ch_datamodels)
  - for change streams, [API support for change streams](/en/ch12#sec_stream_change_api)
  - for distributed transactions, [XA transactions](/en/ch8#xa-transactions)
  - for services, [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)-[Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
    - (see also services)
    - evolvability, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
    - RESTful, [Web services](/en/ch5#sec_web_services)
- application state (see state)
- approximate search (see similarity search)
- archival storage, data from databases, [Archival storage](/en/ch5#archival-storage)
- arcs (see edges)
- ArcticDB (database), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- arithmetic mean, [Average, Median, and Percentiles](/en/ch2#id24)
- arrays
  - array databases, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - multidimensional, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- Arrow (data format), [Column-Oriented Storage](/en/ch4#sec_storage_column), [DataFrames](/en/ch11#id287)
- artificial intelligence (see machine learning)
- ASCII text, [Protocol Buffers](/en/ch5#sec_encoding_protobuf)
- ASN.1 (schema language), [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- associative table, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many), [Property Graphs](/en/ch3#id56)
- asynchronous networks, [Unreliable Networks](/en/ch9#sec_distributed_networks), [Glossary](/en/glossary)
  - comparison to synchronous networks, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
  - system model, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- asynchronous replication, [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async), [Glossary](/en/glossary)
  - data loss on failover, [Leader failure: Failover](/en/ch6#leader-failure-failover)
  - reads from asynchronous follower, [Problems with Replication Lag](/en/ch6#sec_replication_lag)
  - with multiple leaders, [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)
- Asynchronous Transfer Mode (ATM), [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- atomic broadcast, [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs)
- atomic clocks, [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - (see also clocks)
- atomicity (concurrency), [Glossary](/en/glossary)
  - atomic increment, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - compare-and-set (CAS), [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set), [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
    - (see also compare-and-set (CAS))
  - denormalized data, [Trade-offs of normalization](/en/ch3#trade-offs-of-normalization)
  - fetch-and-add/increment, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical), [Consensus](/en/ch10#sec_consistency_consensus), [Fetch-and-add as consensus](/en/ch10#fetch-and-add-as-consensus)
  - write operations, [Atomic write operations](/en/ch8#atomic-write-operations)
- atomicity (transactions), [Atomicity](/en/ch8#sec_transactions_acid_atomicity), [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object), [Glossary](/en/glossary)
  - atomic commit
    - avoiding, [Multi-shard request processing](/en/ch13#id360), [Coordination-avoiding data systems](/en/ch13#id454)
    - blocking and nonblocking, [Three-phase commit](/en/ch8#three-phase-commit)
    - in stream processing, [Exactly-once message processing](/en/ch8#sec_transactions_exactly_once), [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited), [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)
    - maintaining derived data, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - distributed transactions, [Distributed Transactions](/en/ch8#sec_transactions_distributed)-[Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
  - for multi-object transactions, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
  - for single-object writes, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - relation to consensus, [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
- auditability, [Trust, but Verify](/en/ch13#sec_future_verification)-[Tools for auditable data systems](/en/ch13#id366)
  - designing for, [Designing for auditability](/en/ch13#id365)
  - self-auditing systems, [Don't just blindly trust what they promise](/en/ch13#id364)
  - through immutability, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
  - tools for auditable data systems, [Tools for auditable data systems](/en/ch13#id366)
- Aurora (cloud database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
- Aurora DSQL (database)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
- auto-scaling, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
- Automerge (CRDT library), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- availability, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
  - (see also fault tolerance)
  - in CAP theorem, [The CAP theorem](/en/ch10#the-cap-theorem)
  - in leader election, [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - in service level agreements (SLAs), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- availability zones, [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy), [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
- Avro (data format), [Avro](/en/ch5#sec_encoding_avro)-[Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
  - dynamically generated schemas, [Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
  - object container files, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema), [Archival storage](/en/ch5#archival-storage)
  - reader determining writer's schema, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
  - schema evolution, [The writer's schema and the reader's schema](/en/ch5#the-writers-schema-and-the-readers-schema)
  - use in batch processing, [MapReduce](/en/ch11#sec_batch_mapreduce)
- awk (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis), [Distributed Job Orchestration](/en/ch11#id278)
- Axon Framework, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- Azkaban (workflow scheduler), [Batch Processing](/en/ch11#ch_batch)
- Azure Blob Storage (object storage), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - conditional headers, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
- Azure managed disks, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- Azure SQL DB (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
- Azure Storage, [Object Stores](/en/ch11#id277)
- Azure Synapse Analytics (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
- Azure Virtual Machines
  - spot virtual machines, [Handling Faults](/en/ch11#id281)

### B

- B-trees (indexes), [B-Trees](/en/ch4#sec_storage_b_trees)-[B-tree variants](/en/ch4#b-tree-variants)
  - B+ trees, [B-tree variants](/en/ch4#b-tree-variants)
  - branching factor, [B-Trees](/en/ch4#sec_storage_b_trees)
  - comparison to LSM-trees, [Comparing B-Trees and LSM-Trees](/en/ch4#sec_storage_btree_lsm_comparison)-[Disk space usage](/en/ch4#disk-space-usage)
  - crash recovery, [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
  - growing by splitting a page, [B-Trees](/en/ch4#sec_storage_b_trees)
  - immutable variants, [B-tree variants](/en/ch4#b-tree-variants), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - similarity to shard splitting, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
  - variants, [B-tree variants](/en/ch4#b-tree-variants)
- B2 (object storage), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- Backblaze B2 (see B2 (object storage))
- backend, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- backoff, exponential, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- backpressure, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Read performance](/en/ch4#read-performance), [Messaging Systems](/en/ch12#sec_stream_messaging), [Glossary](/en/glossary)
  - in batch processing, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - in TCP, [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
- backups
  - database snapshot for replication, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - in multitenant systems, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
  - integrity of, [Don't just blindly trust what they promise](/en/ch13#id364)
  - snapshot isolation for, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - using object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - versus replication, [Replication](/en/ch6#ch_replication)
- backward compatibility, [Encoding and Evolution](/en/ch5#ch_encoding)
- BadgerDB (database)
  - serializable transactions, [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)
- BASE, contrast to ACID, [The Meaning of ACID](/en/ch8#sec_transactions_acid)
- bash shell (Unix), [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)
- batch processing, [Batch Processing](/en/ch11#ch_batch)-[Summary](/en/ch11#id292), [Glossary](/en/glossary)
  - and functional programming, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - benefits of, [Batch Processing](/en/ch11#ch_batch)
  - combining with stream processing, [Unifying batch and stream processing](/en/ch13#id338)
  - comparison to stream processing, [Processing Streams](/en/ch12#sec_stream_processing)
  - dataflow engines, [Dataflow Engines](/en/ch11#sec_batch_dataflow)-[Dataflow Engines](/en/ch11#sec_batch_dataflow)
  - fault tolerance, [Handling Faults](/en/ch11#id281), [Messaging Systems](/en/ch12#sec_stream_messaging)
  - for data integration, [Batch and Stream Processing](/en/ch13#sec_future_batch_streaming)-[Unifying batch and stream processing](/en/ch13#id338)
  - graphs and iterative processing, [Machine Learning](/en/ch11#id290)
  - high-level APIs and languages, [Query languages](/en/ch11#sec_batch_query_lanauges)-[Query languages](/en/ch11#sec_batch_query_lanauges)
  - in cloud data warehouses, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - in distributed systems, [Batch Processing in Distributed Systems](/en/ch11#sec_batch_distributed)
  - join and group by, [JOIN and GROUP BY](/en/ch11#sec_batch_join)-[JOIN and GROUP BY](/en/ch11#sec_batch_join)
  - limitations, [Batch Processing](/en/ch11#ch_batch)
  - log-based messaging and, [Replaying old messages](/en/ch12#sec_stream_replay)
  - maintaining derived state, [Maintaining derived state](/en/ch13#id446)
  - measuring performance, [Batch Processing](/en/ch11#ch_batch)
  - models of, [Batch Processing Models](/en/ch11#id431)
  - resource allocation, [Resource Allocation](/en/ch11#id279)-[Resource Allocation](/en/ch11#id279)
  - resource managers, [Distributed Job Orchestration](/en/ch11#id278)
  - schedulers, [Distributed Job Orchestration](/en/ch11#id278)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)-[Serving Derived Data](/en/ch11#sec_batch_serving_derived)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)-[Shuffling Data](/en/ch11#sec_shuffle)
  - task execution, [Distributed Job Orchestration](/en/ch11#id278)
  - use cases, [Batch Use Cases](/en/ch11#sec_batch_output)-[Serving Derived Data](/en/ch11#sec_batch_serving_derived)
  - using Unix tools (example), [Batch Processing with Unix Tools](/en/ch11#sec_batch_unix)-[Sorting Versus In-memory Aggregation](/en/ch11#id275)
- batch processing frameworks
  - comparison to operating systems, [Batch Processing in Distributed Systems](/en/ch11#sec_batch_distributed)
- Beam (dataflow library), [Unifying batch and stream processing](/en/ch13#id338)
- BERT (language model), [Vector Embeddings](/en/ch4#id92)
- bias, [Bias and Discrimination](/en/ch14#id370)
- bidirectional replication (see multi-leader replication)
- big ball of mud, [Simplicity: Managing Complexity](/en/ch2#id38)
- big data
  - versus data minimization, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
- BigQuery (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Batch Processing](/en/ch11#ch_batch)
  - DataFrames, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - sharding and clustering, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
- Bigtable (database)
  - sharding scheme, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - storage layout, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - tablets (sharding), [Sharding](/en/ch7#ch_sharding)
  - wide-column data model, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality), [Column Compression](/en/ch4#sec_storage_column_compression)
- binary data encodings, [Binary encoding](/en/ch5#binary-encoding)-[The Merits of Schemas](/en/ch5#sec_encoding_schemas)
  - Avro, [Avro](/en/ch5#sec_encoding_avro)-[Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
  - MessagePack, [Binary encoding](/en/ch5#binary-encoding)-[Binary encoding](/en/ch5#binary-encoding)
  - Protocol Buffers, [Protocol Buffers](/en/ch5#sec_encoding_protobuf)-[Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
- binary encoding
  - based on schemas, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
  - by network drivers, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- binary strings, lack of support in JSON and XML, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
- Bitcoin (cryptocurrency), [Tools for auditable data systems](/en/ch13#id366)
  - Byzantine fault tolerance, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - concurrency bugs in exchanges, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
- bitmap indexes, [Column Compression](/en/ch4#sec_storage_column_compression)
- BitTorrent uTP protocol, [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
- Bkd-trees (indexes), [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- blameless postmortems, [Humans and Reliability](/en/ch2#id31)
- Blazegraph (database), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - SPARQL query language, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- blob storage (see object storage)
- block (file system), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- block device (disk), [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- blockchains, [Summary](/en/ch3#summary)
  - Byzantine fault tolerance, [Byzantine Faults](/en/ch9#sec_distributed_byzantine), [Consensus](/en/ch10#sec_consistency_consensus), [Tools for auditable data systems](/en/ch13#id366)
- blocking atomic commit, [Three-phase commit](/en/ch8#three-phase-commit)
- Bloom filter (algorithm), [Bloom filters](/en/ch4#bloom-filters), [Read performance](/en/ch4#read-performance), [Stream analytics](/en/ch12#id318)
- BookKeeper (replicated log), [Allocating work to nodes](/en/ch10#allocating-work-to-nodes)
- bounded datasets, [Stream Processing](/en/ch12#ch_stream), [Glossary](/en/glossary)
  - (see also batch processing)
- bounded delays, [Glossary](/en/glossary)
  - in networks, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
  - process pauses, [Response time guarantees](/en/ch9#sec_distributed_clocks_realtime)
- broadcast
  - total order broadcast (see shared logs)
- brokerless messaging, [Direct messaging from producers to consumers](/en/ch12#id296)
- Brubeck (metrics aggregator), [Direct messaging from producers to consumers](/en/ch12#id296)
- BTM (transaction coordinator), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
- Buf
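  - Bufstream (messaging), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Disk space usage](/en/ch12#sec_stream_disk_usage)
- Bufstream (messaging), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Disk space usage](/en/ch12#sec_stream_disk_usage)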
- build or buy, [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)
- bursty network traffic patterns, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- business analyst, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- business data processing, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
- business intelligence, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)-[Data Warehousing](/en/ch1#sec_introduction_dwh)
- Business Process Execution Language (BPEL), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- Business Process Model and Notation (BPMN), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
  - example, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- byte sequence, encoding data in, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- Byzantine faults, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)-[Weak forms of lying](/en/ch9#weak-forms-of-lying), [System Model and Reality](/en/ch9#sec_distributed_system_model), [Glossary](/en/glossary)
  - Byzantine fault-tolerant systems, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - Byzantine Generals Problem, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - consensus algorithms and, [Consensus](/en/ch10#sec_consistency_consensus), [Tools for auditable data systems](/en/ch13#id366)

### C

- caches, [Keeping everything in memory](/en/ch4#sec_storage_inmemory), [Glossary](/en/glossary)
  - and materialized views, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
  - as derived data, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
  - in CPUs, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized), [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - invalidation and maintenance, [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - linearizability, [Linearizability](/en/ch10#sec_consistency_linearizability)
  - local disks in the cloud, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- calendar sync, [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- California Consumer Privacy Act (CCPA), [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- Camunda (workflow engine), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- canonical version (of data), [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
- CAP theorem, [The CAP theorem](/en/ch10#the-cap-theorem)-[The CAP theorem](/en/ch10#the-cap-theorem), [Glossary](/en/glossary)
- capacity planning, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- Cap'n Proto (data format), [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- carbon emissions, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
- cascading aborts, [No dirty reads](/en/ch8#no-dirty-reads)
- cascading failures, [Software faults](/en/ch2#software-faults), [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations), [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
- Cassandra (database)
  - change data capture, [Implementing change data capture](/en/ch12#id307), [API support for change streams](/en/ch12#sec_stream_change_api)
  - compaction strategy, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - consistency level ANY, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - hash-range sharding, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash), [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - last-write-wins conflict resolution, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
  - leaderless replication, [Leaderless Replication](/en/ch6#sec_replication_leaderless)
  - lightweight transactions, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - linearizability, lack of, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - log-structured storage, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - multi-region support, [Multi-region operation](/en/ch6#multi-region-operation)
  - secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - use of clocks, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations), [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - vnodes (sharding), [Sharding](/en/ch7#ch_sharding)
- cat (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis)
- catalog, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- causal context, [Version vectors](/en/ch6#version-vectors)
  - (see also causal dependencies)
- causal dependencies, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)-[Version vectors](/en/ch6#version-vectors)
  - capturing, [Version vectors](/en/ch6#version-vectors), [Ordering events to capture causality](/en/ch13#sec_future_capture_causality), [Reads are events too](/en/ch13#sec_future_read_events)
    - by total ordering, [The limits of total ordering](/en/ch13#id335)
  - in transactions, [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)
  - sending message to friends (example), [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
- causality, [Glossary](/en/glossary)
  - causal ordering
    - total order consistent with, [Logical Clocks](/en/ch10#sec_consistency_timestamps)
  - consistency with, [Logical Clocks](/en/ch10#sec_consistency_timestamps)-[Enforcing constraints using logical clocks](/en/ch10#enforcing-constraints-using-logical-clocks)
  - happens-before relation, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
  - in serializable transactions, [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)-[Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
  - mismatch with clocks, [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - ordering events to capture, [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - violations of, [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix), [Problems with different topologies](/en/ch6#problems-with-different-topologies), [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - with synchronized clocks, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
- cell-based architecture, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- CEP (see complex event processing)
- CephFS (distributed filesystem), [Batch Processing](/en/ch11#ch_batch), [Object Stores](/en/ch11#id277)
- certificate transparency, [Tools for auditable data systems](/en/ch13#id366)
- cgroups, [Distributed Job Orchestration](/en/ch11#id278)
- change data capture, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication), [Change Data Capture](/en/ch12#sec_stream_cdc)
  - API support for change streams, [API support for change streams](/en/ch12#sec_stream_change_api)
  - comparison to event sourcing, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
  - implementing, [Implementing change data capture](/en/ch12#id307)
  - initial snapshot, [Initial snapshot](/en/ch12#sec_stream_cdc_snapshot)
  - log compaction, [Log compaction](/en/ch12#sec_stream_log_compaction)
- changelogs, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)
  - change data capture, [Change Data Capture](/en/ch12#sec_stream_cdc)
  - for operator state, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - in stream joins, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
  - log compaction, [Log compaction](/en/ch12#sec_stream_log_compaction)
  - maintaining derived state, [Databases and Streams](/en/ch12#sec_stream_databases)
- chaos engineering, [Fault Tolerance](/en/ch2#id27), [Fault injection](/en/ch9#sec_fault_injection)
- checkpointing
  - in high-performance computing, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
  - in stream processors, [Microbatching and checkpointing](/en/ch12#id329)
- circuit breaker (limiting retries), [Describing Performance](/en/ch2#sec_introduction_percentiles)
- circuit-switched networks, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
- circular buffers, [Disk space usage](/en/ch12#sec_stream_disk_usage)
- circular replication topologies, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)
- Citus (database)
  - hash sharding, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
- ClickHouse (database), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - incremental view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- clickstream data, analysis of, [JOIN and GROUP BY](/en/ch11#sec_batch_join)
- clients
  - calling services, [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)
  - offline-capable, [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients), [Stateful, offline-capable clients](/en/ch13#id347)
  - pushing state changes to, [Pushing state changes to clients](/en/ch13#id348)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
- ClockBound (time sync), [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)
  - use in YugabyteDB, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
- clocks, [Unreliable Clocks](/en/ch9#sec_distributed_clocks)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
  - atomic clocks, [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - confidence interval, [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)-[Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - for global snapshots, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - hybrid logical clocks, [Hybrid logical clocks](/en/ch10#hybrid-logical-clocks)
  - logical (see logical clocks)
  - skew, [Last write wins (discarding concurrent writes)](/en/ch6#sec_replication_lww), [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations), [Relying on Synchronized Clocks](/en/ch9#sec_distributed_clocks_relying)-[Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - slewing, [Monotonic clocks](/en/ch9#monotonic-clocks)
  - synchronization and accuracy, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)-[Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
  - synchronization using GPS, [Unreliable Clocks](/en/ch9#sec_distributed_clocks), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy), [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - time-of-day versus monotonic clocks, [Monotonic Versus Time-of-Day Clocks](/en/ch9#sec_distributed_monotonic_timeofday)
  - timestamping events, [Whose clock are you using, anyway?](/en/ch12#id438)
- cloud services, [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)-[Cloud Computing Versus Supercomputing](/en/ch1#id17)
  - availability zones, [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy), [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - need for service discovery, [Service discovery](/en/ch10#service-discovery)
  - network glitches, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - pros and cons, [Pros and Cons of Cloud Services](/en/ch1#sec_introduction_cloud_tradeoffs)-[Pros and Cons of Cloud Services](/en/ch1#sec_introduction_cloud_tradeoffs)
  - quotas, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
  - regions (see regions (geographic distribution))
  - serverless, [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
  - shared resources, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - versus supercomputing, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- cloud-native, [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)-[Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- Cloudflare
  - R2 (see R2 (object storage))
- clustered indexes, [Storing values within the index](/en/ch4#sec_storage_index_heap)
- clustering (record ordering), [Sharding by hash range](/en/ch7#sharding-by-hash-range)
- CockroachDB (database)
  - consensus-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - consistency model, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - key-range sharding, [Sharding](/en/ch7#ch_sharding), [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - serializable transactions, [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)
  - sharded secondary indexes, [Global Secondary Indexes](/en/ch7#id167)
  - transactions, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
  - use of model-checking, [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- code generation
  - for query execution, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - with Protocol Buffers, [Protocol Buffers](/en/ch5#sec_encoding_protobuf)
- collaborative editing, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- column families (Bigtable), [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality), [Column Compression](/en/ch4#sec_storage_column_compression)
- column-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)-[Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - column compression, [Column Compression](/en/ch4#sec_storage_column_compression)
  - Parquet, [Column-Oriented Storage](/en/ch4#sec_storage_column), [Archival storage](/en/ch5#archival-storage)
  - sort order in, [Sort Order in Column Storage](/en/ch4#sort-order-in-column-storage)-[Sort Order in Column Storage](/en/ch4#sort-order-in-column-storage)
  - vectorized processing, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - versus wide-column model, [Column Compression](/en/ch4#sec_storage_column_compression)
  - writing to, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
- comma-separated values (see CSV)
- command query responsibility segregation (CQRS), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)-[Event Sourcing and CQRS](/en/ch3#sec_datamodels_events), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
- commands (event sourcing), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- commits (transactions), [Transactions](/en/ch8#ch_transactions)
  - atomic commit, [Distributed Transactions](/en/ch8#sec_transactions_distributed)-[Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
    - (see also atomicity; transactions)
  - read committed isolation, [Read Committed](/en/ch8#sec_transactions_read_committed)
  - three-phase commit (3PC), [Three-phase commit](/en/ch8#three-phase-commit)
  - two-phase commit (2PC), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)-[Coordinator failure](/en/ch8#coordinator-failure)
- commutative operations, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
- compaction
  - of changelogs, [Log compaction](/en/ch12#sec_stream_log_compaction)
    - (see also log compaction)
    - for stream operator state, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - of log-structured storage, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
    - issues with, [Read performance](/en/ch4#read-performance)
    - size-tiered and leveled approaches, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction), [Disk space usage](/en/ch4#disk-space-usage)
- compare-and-set (CAS), [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set), [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - implementing locks, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - implementing uniqueness constraints, [Constraints and uniqueness guarantees](/en/ch10#sec_consistency_uniqueness)
  - on object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - relation to consensus, [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable), [Consensus](/en/ch10#sec_consistency_consensus), [Compare-and-set as consensus](/en/ch10#compare-and-set-as-consensus)
  - relation to fencing tokens, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
  - relation to transactions, [Single-object writes](/en/ch8#sec_transactions_single_object)
- compatibility, [Encoding and Evolution](/en/ch5#ch_encoding), [Modes of Dataflow](/en/ch5#sec_encoding_dataflow)
  - calling services, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - properties of encoding formats, [Summary](/en/ch5#summary)
  - using databases, [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)-[Archival storage](/en/ch5#archival-storage)
- compensating transactions, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros), [Loosely interpreted constraints](/en/ch13#id362)
- compilation, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- complex event processing (CEP), [Complex event processing](/en/ch12#id317)
- complexity
  - distilling in theoretical models, [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
  - essential and accidental, [Simplicity: Managing Complexity](/en/ch2#id38)
  - hiding using abstraction, [Data Models and Query Languages](/en/ch3#ch_datamodels)
  - managing, [Simplicity: Managing Complexity](/en/ch2#id38)
- composing data systems (see unbundling databases)
- compression
  - in SSTables, [The SSTable file format](/en/ch4#the-sstable-file-format)
- compute-intensive applications, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- computer games, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- concatenated indexes, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
  - in hash-sharded systems, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
- concurrency
  - actor programming model, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks), [Event-Driven Architectures and RPC](/en/ch12#sec_stream_actors_drpc)
    - (see also event-driven architecture)
  - bugs from weak transaction isolation, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
  - conflict resolution, [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)-[Types of conflict](/en/ch6#sec_replication_write_conflicts)
  - definition, [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)
  - detecting concurrent writes, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)-[Version vectors](/en/ch6#version-vectors)
  - dual writes, problems with, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - happens-before relation, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
  - in replicated systems, [Problems with Replication Lag](/en/ch6#sec_replication_lag)-[Version vectors](/en/ch6#version-vectors), [Linearizability](/en/ch10#sec_consistency_linearizability)-[Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)
  - multi-version concurrency control (MVCC), [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - optimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
  - ordering of operations, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - reducing, through event logs, [Concurrency control](/en/ch12#sec_stream_concurrency), [Dataflow: Interplay between state changes and application code](/en/ch13#id450)
  - time and relativity, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
  - transaction isolation, [Isolation](/en/ch8#sec_transactions_acid_isolation)
  - write skew (transaction isolation), [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts)
- conditional write, [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set)
  - in transactions, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - on object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- conference management system (example), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- conflict-free replicated datatypes (CRDTs), [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
  - for leaderless replication, [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship)
  - preventing lost updates, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
- conflicts
  - avoidance, [Conflict avoidance](/en/ch6#conflict-avoidance)
  - causal dependencies, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
  - conflict detection
    - in distributed transactions, [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
    - in log-based systems, [Uniqueness constraints require consensus](/en/ch13#id452)
    - in serializable snapshot isolation (SSI), [Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
    - in two-phase commit, [A system of promises](/en/ch8#a-system-of-promises)
  - conflict resolution
    - by aborting transactions, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
    - by apologizing, [Loosely interpreted constraints](/en/ch13#id362)
    - last write wins (LWW), [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
    - using atomic operations, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
  - determining what is a conflict, [Types of conflict](/en/ch6#sec_replication_write_conflicts), [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
  - in leaderless replication, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
  - lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)-[Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
  - materializing, [Materializing conflicts](/en/ch8#materializing-conflicts)
  - resolution, [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)-[Types of conflict](/en/ch6#sec_replication_write_conflicts)
    - automatic, [Automatic conflict resolution](/en/ch6#automatic-conflict-resolution)
    - in leaderless systems, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
    - last write wins (LWW), [Last write wins (discarding concurrent writes)](/en/ch6#sec_replication_lww)
    - using custom logic, [Manual conflict resolution](/en/ch6#manual-conflict-resolution), [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship)
  - siblings, [Manual conflict resolution](/en/ch6#manual-conflict-resolution), [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship)
    - merging, [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship)
  - write skew (transaction isolation), [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts)
- Confluent
  - Freight (messaging), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Disk space usage](/en/ch12#sec_stream_disk_usage)
  - schema registry, [JSON Schema](/en/ch5#json-schema), [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
- congestion (networks)
  - avoidance, [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
  - limiting accuracy of clocks, [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)
  - queueing delays, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
- consensus, [Consensus](/en/ch10#sec_consistency_consensus)-[Summary](/en/ch10#summary), [Glossary](/en/glossary)
  - algorithms, [Consensus](/en/ch10#sec_consistency_consensus), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - consensus numbers, [Fetch-and-add as consensus](/en/ch10#fetch-and-add-as-consensus)
  - coordination services, [Coordination Services](/en/ch10#sec_consistency_coordination)-[Service discovery](/en/ch10#service-discovery)
  - cost of, [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
  - impossibility of, [Consensus](/en/ch10#sec_consistency_consensus)
  - preventing split brain, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - reconfiguration, [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - relation to atomic commitment, [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
  - relation to compare-and-set (CAS), [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable), [Compare-and-set as consensus](/en/ch10#compare-and-set-as-consensus)
  - relation to fetch-and-add, [Fetch-and-add as consensus](/en/ch10#fetch-and-add-as-consensus)
  - relation to replication, [Using shared logs](/en/ch10#sec_consistency_smr)
  - relation to shared logs, [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs)
  - relation to uniqueness constraints, [Uniqueness constraints require consensus](/en/ch13#id452)
  - safety and liveness properties, [Single-value consensus](/en/ch10#single-value-consensus)
  - single-value consensus, [Single-value consensus](/en/ch10#single-value-consensus)
- consent (GDPR), [Consent and Freedom of Choice](/en/ch14#id375)
- consistency, [Consistency](/en/ch8#sec_transactions_acid_consistency), [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - across different databases, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views), [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions)
  - causal, [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix), [Problems with different topologies](/en/ch6#problems-with-different-topologies), [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - consistent prefix reads, [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix)-[Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix)
  - consistent snapshots, [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)-[Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner), [Initial snapshot](/en/ch12#sec_stream_cdc_snapshot), [Creating an index](/en/ch13#id340)
    - (see also snapshots)
  - crash recovery, [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
  - enforcing constraints (see constraints)
  - eventual, [Problems with Replication Lag](/en/ch6#sec_replication_lag)
    - (see also eventual consistency)
  - in ACID transactions, [Consistency](/en/ch8#sec_transactions_acid_consistency), [Maintaining integrity in the face of software bugs](/en/ch13#id455)
  - in CAP theorem, [The CAP theorem](/en/ch10#the-cap-theorem)
  - in leader election, [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - in microservices, [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems)
  - linearizability, [Solutions for Replication Lag](/en/ch6#id131), [Linearizability](/en/ch10#sec_consistency_linearizability)-[Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - meanings of, [Consistency](/en/ch8#sec_transactions_acid_consistency)
  - monotonic reads, [Monotonic Reads](/en/ch6#sec_replication_monotonic_reads)-[Monotonic Reads](/en/ch6#sec_replication_monotonic_reads)
  - of secondary indexes, [The need for multi-object transactions](/en/ch8#sec_transactions_need), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation), [Reasoning about dataflows](/en/ch13#id443), [Creating an index](/en/ch13#id340)
  - read-after-write, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)-[Reading Your Own Writes](/en/ch6#sec_replication_ryw)
    - in derived data systems, [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions)
  - strong (see linearizability)
  - timeliness and integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - using quorums, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations), [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
- consistent hashing, [Consistent hashing](/en/ch7#sec_sharding_consistent_hashing)
- consistent prefix reads, [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix)
- constraints (databases), [Consistency](/en/ch8#sec_transactions_acid_consistency), [Characterizing write skew](/en/ch8#characterizing-write-skew)
  - asynchronously checked, [Loosely interpreted constraints](/en/ch13#id362)
  - coordination avoidance, [Coordination-avoiding data systems](/en/ch13#id454)
  - ensuring idempotence, [Uniquely identifying requests](/en/ch13#id355)
  - in log-based systems, [Enforcing Constraints](/en/ch13#sec_future_constraints)-[Multi-shard request processing](/en/ch13#id360)
    - across multiple shards, [Multi-shard request processing](/en/ch13#id360)
  - in two-phase commit, [Distributed Transactions](/en/ch8#sec_transactions_distributed), [A system of promises](/en/ch8#a-system-of-promises)
  - relation to consensus, [Uniqueness constraints require consensus](/en/ch13#id452)
  - requiring linearizability, [Constraints and uniqueness guarantees](/en/ch10#sec_consistency_uniqueness)
- Consul (coordination service), [Coordination Services](/en/ch10#sec_consistency_coordination)
  - use for service discovery, [Service discovery](/en/ch10#service-discovery)
- consumers (message streams), [Message brokers](/en/ch5#message-brokers), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
  - backpressure, [Messaging Systems](/en/ch12#sec_stream_messaging)
  - consumer groups, [Multiple consumers](/en/ch12#id298)
  - consumer offsets in logs, [Consumer offsets](/en/ch12#sec_stream_log_offsets)
  - failures, [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering), [Consumer offsets](/en/ch12#sec_stream_log_offsets)
  - fan-out, [Materializing and Updating Timelines](/en/ch2#sec_introduction_materializing), [Multiple consumers](/en/ch12#id298), [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging)
  - load balancing, [Multiple consumers](/en/ch12#id298), [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging)
  - not keeping up with producers, [Messaging Systems](/en/ch12#sec_stream_messaging), [Disk space usage](/en/ch12#sec_stream_disk_usage), [Making unbundling work](/en/ch13#sec_future_unbundling_favor)
- content models (JSON Schema), [JSON Schema](/en/ch5#json-schema)
- contention
  - between transactions, [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
  - blocking threads, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - performance of optimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
  - under two-phase locking, [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
- context switches, [Latency and Response Time](/en/ch2#id23), [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- convergence (conflict resolution), [Automatic conflict resolution](/en/ch6#automatic-conflict-resolution)-[CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
- coordination
  - avoidance, [Coordination-avoiding data systems](/en/ch13#id454)
  - cross-datacenter, [The limits of total ordering](/en/ch13#id335)
  - cross-region, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - cross-shard ordering, [Sharding](/en/ch8#sharding), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner), [Using shared logs](/en/ch10#sec_consistency_smr), [Multi-shard request processing](/en/ch13#id360)
  - routing requests to shards, [Request Routing](/en/ch7#sec_sharding_routing)
  - services, [Locking and leader election](/en/ch10#locking-and-leader-election), [Coordination Services](/en/ch10#sec_consistency_coordination)-[Service discovery](/en/ch10#service-discovery)
- coordinator (in 2PC), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
  - failure, [Coordinator failure](/en/ch8#coordinator-failure)
  - in XA transactions, [XA transactions](/en/ch8#xa-transactions)-[Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
  - recovery, [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
- copy-on-write (B-trees), [B-tree variants](/en/ch4#b-tree-variants), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
- CORBA (Common Object Request Broker Architecture), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
- coronal mass ejection (see solar storm)
- correctness
  - auditability, [Trust, but Verify](/en/ch13#sec_future_verification)-[Tools for auditable data systems](/en/ch13#id366)
  - Byzantine fault tolerance, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - dealing with partial failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - in log-based systems, [Enforcing Constraints](/en/ch13#sec_future_constraints)-[Multi-shard request processing](/en/ch13#id360)
  - of algorithm within system model, [Defining the correctness of an algorithm](/en/ch9#defining-the-correctness-of-an-algorithm)
  - of derived data, [Designing for auditability](/en/ch13#id365)
  - of immutable data, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
  - of personal data, [Responsibility and Accountability](/en/ch14#id371), [Privacy and Use of Data](/en/ch14#id457)
  - of time, [Problems with different topologies](/en/ch6#problems-with-different-topologies), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)-[Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - of transactions, [Consistency](/en/ch8#sec_transactions_acid_consistency), [Aiming for Correctness](/en/ch13#sec_future_correctness), [Maintaining integrity in the face of software bugs](/en/ch13#id455)
  - timeliness and integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)-[Coordination-avoiding data systems](/en/ch13#id454)
- corruption of data
  - detecting, [The end-to-end argument](/en/ch13#sec_future_e2e_argument), [Don't just blindly trust what they promise](/en/ch13#id364)-[Tools for auditable data systems](/en/ch13#id366)
  - due to pathological memory access, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - due to radiation, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - due to split brain, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)
  - due to weak transaction isolation, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
  - integrity as absence of, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - network packets, [Weak forms of lying](/en/ch9#weak-forms-of-lying)
  - on disks, [Durability](/en/ch8#durability)
  - preventing using write-ahead logs, [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
  - recovering from, [Batch Processing](/en/ch11#ch_batch), [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
- cosine similarity (semantic search), [Vector Embeddings](/en/ch4#id92)
- Couchbase (database)
  - document data model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
  - durability, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - hash sharding, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
  - join support, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - rebalancing, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
  - vBuckets (sharding), [Sharding](/en/ch7#ch_sharding)
- CouchDB (database)
  - as sync engine, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
  - B-tree storage, [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - conflict resolution, [Manual conflict resolution](/en/ch6#manual-conflict-resolution)
- coupling (loose and tight), [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability)
- covering indexes, [Storing values within the index](/en/ch4#sec_storage_index_heap)
- CozoDB (database), [Datalog: Recursive Relational Queries](/en/ch3#id62)
- CPUs
  - cache coherence and memory barriers, [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - caching and pipelining, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - computing the wrong result, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - SIMD instructions, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- crash-stop and crash-recovery faults, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- CRDTs (see conflict-free replicated datatypes)
- CREATE INDEX statement (SQL), [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn), [Creating an index](/en/ch13#id340)
- credit rating agencies, [Responsibility and Accountability](/en/ch14#id371)
- crypto-shredding, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events), [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
- cryptocurrencies, [Summary](/en/ch3#summary)
- cryptography
  - defense against attackers, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - end-to-end encryption and authentication, [The end-to-end argument](/en/ch13#sec_future_e2e_argument)
- CSV (comma-separated values), [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp), [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
- Curator (ZooKeeper recipes), [Locking and leader election](/en/ch10#locking-and-leader-election), [Allocating work to nodes](/en/ch10#allocating-work-to-nodes)
- Cypher (query language), [The Cypher Query Language](/en/ch3#id57)
  - comparison to SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)

### D

- Daft (processing framework)
  - DataFrames, [DataFrames](/en/ch11#id287)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
- Dagster (workflow scheduler), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows), [Batch Processing](/en/ch11#ch_batch), [Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - cloud data warehouse integration, [Query languages](/en/ch11#sec_batch_query_lanauges)
- dashboard (business intelligence), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
- Dask (processing framework), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- data catalog, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- data connectors, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- data contracts, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
  - change data capture, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
- data corruption (see corruption of data)
- data cubes, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
- data engineering, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
- data fabric, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- data formats (see encoding)
- data infrastructure, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- data integration, [Data Integration](/en/ch13#sec_future_integration)-[Unifying batch and stream processing](/en/ch13#id338), [Summary](/en/ch13#id367)
  - batch and stream processing, [Batch and Stream Processing](/en/ch13#sec_future_batch_streaming)-[Unifying batch and stream processing](/en/ch13#id338)
    - maintaining derived state, [Maintaining derived state](/en/ch13#id446)
    - reprocessing data, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
    - unifying, [Unifying batch and stream processing](/en/ch13#id338)
  - by unbundling databases, [Unbundling Databases](/en/ch13#sec_future_unbundling)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
    - comparison to federated databases, [The meta-database of everything](/en/ch13#id341)
  - combining tools by deriving data, [Combining Specialized Tools by Deriving Data](/en/ch13#id442)-[Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
    - derived data versus distributed transactions, [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions)
    - limits of total ordering, [The limits of total ordering](/en/ch13#id335)
    - ordering events to capture causality, [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
    - reasoning about dataflows, [Reasoning about dataflows](/en/ch13#id443)
  - need for, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - using batch processing, [Batch Processing](/en/ch11#ch_batch), [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- data lake, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
  - data lakehouse, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Analytics](/en/ch11#sec_batch_olap)
- data locality (see locality)
- data mesh, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- data minimization, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
- data models, [Data Models and Query Languages](/en/ch3#ch_datamodels)-[Summary](/en/ch3#summary)
  - DataFrames and arrays, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - graph-like models, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)-[GraphQL](/en/ch3#id63)
    - Datalog language, [Datalog: Recursive Relational Queries](/en/ch3#id62)
    - property graphs, [Property Graphs](/en/ch3#id56)
    - RDF and triple-stores, [Triple-Stores and SPARQL](/en/ch3#id59)-[The SPARQL query language](/en/ch3#the-sparql-query-language)
  - relational model versus document model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)-[Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - supporting multiple, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- data pipelines, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- data products, [Beyond the data lake](/en/ch1#beyond-the-data-lake)
- data protection regulations (see GDPR)
- data residency laws, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- data science, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- data silo, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- data systems
  - correctness, constraints, and integrity, [Aiming for Correctness](/en/ch13#sec_future_correctness)-[Tools for auditable data systems](/en/ch13#id366)
  - data integration, [Data Integration](/en/ch13#sec_future_integration)-[Unifying batch and stream processing](/en/ch13#id338)
  - goals for using, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
  - heterogeneous, keeping in sync, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - maintainability, [Maintainability](/en/ch2#sec_introduction_maintainability)-[Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability)
  - possible faults in, [Transactions](/en/ch8#ch_transactions)
  - reliability, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)-[Humans and Reliability](/en/ch2#id31)
    - hardware faults, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
    - human errors, [Humans and Reliability](/en/ch2#id31)
    - importance of, [Humans and Reliability](/en/ch2#id31)
    - software faults, [Software faults](/en/ch2#software-faults)
  - scalability, [Scalability](/en/ch2#sec_introduction_scalability)-[Principles for Scalability](/en/ch2#id35)
  - unbundling databases, [Unbundling Databases](/en/ch13#sec_future_unbundling)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
  - unreliable clocks, [Unreliable Clocks](/en/ch9#sec_distributed_clocks)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- data warehousing, [Data Warehousing](/en/ch1#sec_introduction_dwh), [Glossary](/en/glossary)
  - cloud-based solutions, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - ETL (extract-transform-load), [Data Warehousing](/en/ch1#sec_introduction_dwh), [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - for batch processing, [Batch Processing](/en/ch11#ch_batch)
  - keeping data systems in sync, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - schema design, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
  - sharding and clustering, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - slowly changing dimension (SCD), [Time-dependence of joins](/en/ch12#sec_stream_join_time)
- data-intensive applications, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- database administrator, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- database-internal distributed transactions, [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal), [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)
- databases
  - archival storage, [Archival storage](/en/ch5#archival-storage)
  - comparison of message brokers to, [Message brokers compared to databases](/en/ch12#id297)
  - dataflow through, [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)
  - end-to-end argument for, [The end-to-end argument](/en/ch13#sec_future_e2e_argument)-[Applying end-to-end thinking in data systems](/en/ch13#id357)
    - checking integrity, [The end-to-end argument again](/en/ch13#id456)
  - relation to event streams, [Databases and Streams](/en/ch12#sec_stream_databases)-[Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
    - (see also changelogs)
    - API support for change streams, [API support for change streams](/en/ch12#sec_stream_change_api), [Separation of application code and state](/en/ch13#id344)
    - change data capture, [Change Data Capture](/en/ch12#sec_stream_cdc)-[API support for change streams](/en/ch12#sec_stream_change_api)
    - event sourcing, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
    - keeping systems in sync, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
    - philosophy of immutable events, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)-[Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - unbundling, [Unbundling Databases](/en/ch13#sec_future_unbundling)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
    - composing data storage technologies, [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
    - designing applications around dataflow, [Designing Applications Around Dataflow](/en/ch13#sec_future_dataflow)-[Stream processors and services](/en/ch13#id345)
    - observing derived state, [Observing Derived State](/en/ch13#sec_future_observing)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
- datacenters
  - failures of, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - geographically distributed (see regions (geographic distribution))
  - multitenancy and shared resources, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - network architecture, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
  - network faults, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
- dataflow, [Modes of Dataflow](/en/ch5#sec_encoding_dataflow)-[Distributed actor frameworks](/en/ch5#distributed-actor-frameworks), [Designing Applications Around Dataflow](/en/ch13#sec_future_dataflow)-[Stream processors and services](/en/ch13#id345)
  - correctness of dataflow systems, [Correctness of dataflow systems](/en/ch13#id453)
  - dataflow engines, [Dataflow Engines](/en/ch11#sec_batch_dataflow)
    - comparison to stream processing, [Processing Streams](/en/ch12#sec_stream_processing)
    - DataFrames, [DataFrames](/en/ch11#id287)
    - support in batch processing frameworks, [Batch Processing](/en/ch11#ch_batch)
  - event-driven, [Event-Driven Architectures](/en/ch5#sec_encoding_dataflow_msg)-[Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
  - reasoning about, [Reasoning about dataflows](/en/ch13#id443)
  - through databases, [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)
  - through services, [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)-[Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - workflow engines (see workflow engines)
- DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - implementation, [DataFrames](/en/ch11#id287)
  - in batch processing, [DataFrames](/en/ch11#id287)
  - in notebooks, [Machine Learning](/en/ch11#id290)
  - support in batch processing frameworks, [Batch Processing](/en/ch11#ch_batch)
- DataFusion (query engine), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- Datalog (query language), [Datalog: Recursive Relational Queries](/en/ch3#id62)
- Datastream (change data capture), [API support for change streams](/en/ch12#sec_stream_change_api)
- datatypes
  - binary strings in XML and JSON, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
  - conflict-free, [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
  - in Avro encodings, [Avro](/en/ch5#sec_encoding_avro)
  - in Protocol Buffers, [Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
  - numbers in XML and JSON, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
- Datensparsamkeit, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- Datomic (database)
  - B-tree storage, [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - data model, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph), [Triple-Stores and SPARQL](/en/ch3#id59)
  - Datalog query language, [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - excision (deleting data), [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - languages for transactions, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - serial execution of transactions, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
- Daylight Saving Time (DST), [Time-of-day clocks](/en/ch9#time-of-day-clocks)
- Db2 (database)
  - change data capture, [Implementing change data capture](/en/ch12#id307)
- DBA (database administrator), [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- deadlocks, [Explicit locking](/en/ch8#explicit-locking)
  - detection, in distributed transactions, [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
  - in two-phase locking (2PL), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
- Debezium (change data capture), [Implementing change data capture](/en/ch12#id307)
  - Cassandra, [API support for change streams](/en/ch12#sec_stream_change_api)
  - for data integration, [Unbundled versus integrated systems](/en/ch13#id448)
- declarative languages, [Data Models and Query Languages](/en/ch3#ch_datamodels), [Glossary](/en/glossary)
  - and sync engines, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
  - Datalog, [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - in document databases, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - recursive SQL queries, [Graph Queries in SQL](/en/ch3#id58)
  - SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- DeepSeek
  - 3FS (see 3FS)
- delays
  - bounded network delays, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
  - bounded process pauses, [Response time guarantees](/en/ch9#sec_distributed_clocks_realtime)
  - unbounded network delays, [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
  - unbounded process pauses, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- deleting data, [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - in LSM storage, [Disk space usage](/en/ch4#disk-space-usage)
  - legal basis, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- Delta Lake (table format), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - sharding and clustering, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
- demilitarized zone (networking), [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- denormalization (data representation), [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)-[Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many), [Glossary](/en/glossary)
  - in derived data systems, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - in event sourcing/CQRS, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - in social network case study, [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
  - materialized views, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
  - updating derived data, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object), [The need for multi-object transactions](/en/ch8#sec_transactions_need), [Combining Specialized Tools by Deriving Data](/en/ch13#id442)
  - versus normalization, [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
- derived data, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Stream Processing](/en/ch12#ch_stream), [Glossary](/en/glossary)
  - batch processing, [Batch Processing](/en/ch11#ch_batch)
  - event sourcing and CQRS, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - from change data capture, [Implementing change data capture](/en/ch12#id307)
  - maintaining derived state through logs, [Databases and Streams](/en/ch12#sec_stream_databases)-[API support for change streams](/en/ch12#sec_stream_change_api), [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)-[Concurrency control](/en/ch12#sec_stream_concurrency)
  - observing, by subscribing to streams, [End-to-end event streams](/en/ch13#id349)
  - outputs of batch and stream processing, [Batch and Stream Processing](/en/ch13#sec_future_batch_streaming)
  - through application code, [Application code as a derivation function](/en/ch13#sec_future_dataflow_derivation)
  - versus distributed transactions, [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions)
- design patterns, [Simplicity: Managing Complexity](/en/ch2#id38)
- deterministic operations, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs), [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure), [Glossary](/en/glossary)
  - and idempotence, [Idempotence](/en/ch12#sec_stream_idempotence), [Reasoning about dataflows](/en/ch13#id443)
  - computing derived data, [Maintaining derived state](/en/ch13#id446), [Correctness of dataflow systems](/en/ch13#id453), [Designing for auditability](/en/ch13#id365)
  - in event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - in state machine replication, [Using shared logs](/en/ch10#sec_consistency_smr), [Databases and Streams](/en/ch12#sec_stream_databases)
  - in statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)
  - in testing, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - joins, [Time-dependence of joins](/en/ch12#sec_stream_join_time)
  - making code deterministic, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - overview, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- deterministic simulation testing (DST), [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- DevOps, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- dimension tables, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- dimensional modeling (see star schemas)
- directed acyclic graphs (DAG)
  - workflows, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
    - (see also workflow engines)
- dirty reads (transaction isolation), [No dirty reads](/en/ch8#no-dirty-reads)
- dirty writes (transaction isolation), [No dirty writes](/en/ch8#sec_transactions_dirty_write)
- disaggregation
  - of storage and compute, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- Discord (group chat)
  - GraphQL example, [GraphQL](/en/ch3#id63)
- discrimination, [Bias and Discrimination](/en/ch14#id370)
- disks (see hard disks)
- distributed actor frameworks, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
- distributed filesystems, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - comparison to object storage, [Object Stores](/en/ch11#id277)
  - use by Flink, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
- distributed ledgers, [Summary](/en/ch3#summary)
- distributed systems, [The Trouble with Distributed Systems](/en/ch9#ch_distributed)-[Summary](/en/ch9#summary), [Glossary](/en/glossary)
  - Byzantine faults, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)-[Weak forms of lying](/en/ch9#weak-forms-of-lying)
  - detecting network faults, [Detecting Faults](/en/ch9#id307)
  - faults and partial failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - formalization of consensus, [Single-value consensus](/en/ch10#single-value-consensus)
  - impossibility results, [The CAP theorem](/en/ch10#the-cap-theorem), [Consensus](/en/ch10#sec_consistency_consensus)
  - issues with failover, [Leader failure: Failover](/en/ch6#leader-failure-failover)
  - multi-region (see regions (geographic distribution))
  - network problems, [Unreliable Networks](/en/ch9#sec_distributed_networks)-[Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
  - problems with, [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems)
  - quorums, relying on, [The Majority Rules](/en/ch9#sec_distributed_majority)
  - reasons for using, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Replication](/en/ch6#ch_replication)
  - synchronized clocks, relying on, [Relying on Synchronized Clocks](/en/ch9#sec_distributed_clocks_relying)-[Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - system models, [System Model and Reality](/en/ch9#sec_distributed_system_model)-[Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - use of clocks and time, [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
- distributed transactions (see transactions)
- Django (web framework), [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- DMZ (demilitarized zone), [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- DNS (Domain Name System), [Request Routing](/en/ch7#sec_sharding_routing), [Service discovery](/en/ch10#service-discovery)
  - for load balancing, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
- Docker (container manager), [Separation of application code and state](/en/ch13#id344)
- document data model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)-[Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - comparison to relational model, [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)-[Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - multi-object transactions, need for, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
  - sharded secondary indexes, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)
  - versus relational model
    - convergence of models, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
    - data locality, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
- document-partitioned indexes (see local secondary indexes)
- domain-driven design (DDD), [Simplicity: Managing Complexity](/en/ch2#id38), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- dotted version vectors, [Version vectors](/en/ch6#version-vectors)
- double-entry bookkeeping, [Summary](/en/ch3#summary)
- DRBD (Distributed Replicated Block Device), [Single-Leader Replication](/en/ch6#sec_replication_leader)
- drift (clocks), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- Druid (database), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Column-Oriented Storage](/en/ch4#sec_storage_column), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
  - handling writes, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
  - pre-aggregation, [Analytics](/en/ch11#sec_batch_olap)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- Dryad (dataflow engine), [Dataflow Engines](/en/ch11#sec_batch_dataflow)
- dual writes, problems with, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
- DuckDB (database), [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems), [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - column-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - use for ETL, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- duplicates, suppression of, [Duplicate suppression](/en/ch13#id354)
  - (see also idempotence)
  - using a unique ID, [Uniquely identifying requests](/en/ch13#id355), [Multi-shard request processing](/en/ch13#id360)
- durability (transactions), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal), [Durability](/en/ch8#durability), [Glossary](/en/glossary)
- durable execution, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
  - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - Restate (see Restate (workflow engine))
  - Temporal (see Temporal (workflow engine))
- durable functions (see workflow engines)
- duration (time), [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
  - measurement with monotonic clocks, [Monotonic clocks](/en/ch9#monotonic-clocks)
- dynamically typed languages
  - analogy to schema-on-read, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
- Dynamo (database), [Leaderless Replication](/en/ch6#sec_replication_leaderless)
- Dynamo-style databases (see leaderless replication)
- DynamoDB (database)
  - auto-scaling, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
  - hash-range sharding, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - sharded secondary indexes, [Global Secondary Indexes](/en/ch7#id167)

### E

- EBS (virtual block device), [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
  - compared to object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- ECC (see error-correcting codes)
- EDB Postgres Distributed (database), [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
- edges (in graphs), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - property graph model, [Property Graphs](/en/ch3#id56)
- edit distance (full-text search), [Full-Text Search](/en/ch4#sec_storage_full_text)
- effectively-once semantics, [Fault Tolerance](/en/ch12#sec_stream_fault_tolerance), [Exactly-once execution of an operation](/en/ch13#id353)
  - (see also exactly-once semantics)
  - preservation of integrity, [Correctness of dataflow systems](/en/ch13#id453)
- Elastic Compute Cloud (EC2)
  - spot instances, [Handling Faults](/en/ch11#id281)
- elasticity, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
  - cloud data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Query languages](/en/ch11#sec_batch_query_lanauges)
- Elasticsearch (search server)
  - local secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - percolator (stream search), [Search on streams](/en/ch12#id320)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
  - shard rebalancing, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
  - use of Lucene, [Full-Text Search](/en/ch4#sec_storage_full_text)
- Elm (programming language), [End-to-end event streams](/en/ch13#id349)
- ELT (extract-load-transform), [Data Warehousing](/en/ch1#sec_introduction_dwh)
  - relation to batch processing, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
- embarrassingly parallel (algorithms)
  - ETL (see ETL (extract-transform-load))
  - MapReduce, [MapReduce](/en/ch11#sec_batch_mapreduce)
    - (see also MapReduce)
- embedded storage engines, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
- embedding (vector), [Vector Embeddings](/en/ch4#id92)
- encodings (data formats), [Encoding and Evolution](/en/ch5#ch_encoding)-[The Merits of Schemas](/en/ch5#sec_encoding_schemas)
  - Avro, [Avro](/en/ch5#sec_encoding_avro)-[Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
  - binary variants of JSON and XML, [Binary encoding](/en/ch5#binary-encoding)
  - compatibility, [Encoding and Evolution](/en/ch5#ch_encoding)
    - calling services, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
    - using databases, [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)-[Archival storage](/en/ch5#archival-storage)
  - defined, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
  - JSON, XML, and CSV, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
  - language-specific formats, [Language-Specific Formats](/en/ch5#id96)
  - merits of schemas, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
  - Protocol Buffers, [Protocol Buffers](/en/ch5#sec_encoding_protobuf)-[Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
  - representations of data, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- end-to-end argument, [The end-to-end argument](/en/ch13#sec_future_e2e_argument)-[Applying end-to-end thinking in data systems](/en/ch13#id357)
  - checking integrity, [The end-to-end argument again](/en/ch13#id456)
  - publish/subscribe streams, [End-to-end event streams](/en/ch13#id349)
- enrichment (stream), [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
- Enterprise JavaBeans (EJB), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
- enterprise software, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- entities (see vertices)
- ephemeral storage, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- epoch (consensus algorithms), [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
- epoch (Unix timestamps), [Time-of-day clocks](/en/ch9#time-of-day-clocks)
- erasure coding (error correction), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- error handling
  - for network faults, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - in transactions, [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- error-correcting codes, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- Esper (CEP engine), [Complex event processing](/en/ch12#id317)
- essential complexity, [Simplicity: Managing Complexity](/en/ch2#id38)
- etcd (coordination service), [Coordination Services](/en/ch10#sec_consistency_coordination)-[Service discovery](/en/ch10#service-discovery)
  - generating fencing tokens, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens), [Coordination Services](/en/ch10#sec_consistency_coordination)
  - linearizable operations, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable), [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - locks and leader election, [Locking and leader election](/en/ch10#locking-and-leader-election)
  - use for service discovery, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery), [Service discovery](/en/ch10#service-discovery)
  - use for shard assignment, [Request Routing](/en/ch7#sec_sharding_routing)
  - use of Raft algorithm, [Single-Leader Replication](/en/ch6#sec_replication_leader)
- Ethereum (blockchain), [Tools for auditable data systems](/en/ch13#id366)
- Ethernet (networks), [Cloud Computing Versus Supercomputing](/en/ch1#id17), [Unreliable Networks](/en/ch9#sec_distributed_networks), [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
  - packet checksums, [Weak forms of lying](/en/ch9#weak-forms-of-lying), [The end-to-end argument](/en/ch13#sec_future_e2e_argument)
- ethics, [Doing the Right Thing](/en/ch14)-[Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - code of ethics and professional practice, [Doing the Right Thing](/en/ch14)
  - legislation and self-regulation, [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - predictive analytics, [Predictive Analytics](/en/ch14#id369)-[Feedback Loops](/en/ch14#id372)
    - amplifying bias, [Bias and Discrimination](/en/ch14#id370)
    - feedback loops, [Feedback Loops](/en/ch14#id372)
  - privacy and tracking, [Privacy and Tracking](/en/ch14#id373)-[Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
    - consent and freedom of choice, [Consent and Freedom of Choice](/en/ch14#id375)
    - data as assets and power, [Data as Assets and Power](/en/ch14#id376)
    - meaning of privacy, [Privacy and Use of Data](/en/ch14#id457)
    - surveillance, [Surveillance](/en/ch14#id374)
  - respect, dignity, and agency, [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - unintended consequences, [Doing the Right Thing](/en/ch14), [Feedback Loops](/en/ch14#id372)
- ETL (extract-transform-load), [Data Warehousing](/en/ch1#sec_introduction_dwh), [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Glossary](/en/glossary)
  - relation to batch processing, [Extract--Transform--Load (ETL)](/en/ch11#sec_batch_etl_usage)
  - using batch processing, [Batch Processing](/en/ch11#ch_batch)
- Euclidean distance (semantic search), [Vector Embeddings](/en/ch4#id92)
- European Union
  - AI Act (see AI Act)
  - GDPR (see GDPR)
- event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - and change data capture, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
  - comparison to change data capture, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
  - immutability and auditability, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability), [Designing for auditability](/en/ch13#id365)
  - large, reliable data systems, [Uniquely identifying requests](/en/ch13#id355), [Correctness of dataflow systems](/en/ch13#id453)
  - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- event streams (see streams)
- event-driven architecture, [Event-Driven Architectures](/en/ch5#sec_encoding_dataflow_msg)-[Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
  - distributed actor frameworks, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
- events, [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
  - deciding on total order of, [The limits of total ordering](/en/ch13#id335)
  - deriving views from event log, [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
  - event time versus processing time, [Event time versus processing time](/en/ch12#id322), [Microbatching and checkpointing](/en/ch12#id329), [Unifying batch and stream processing](/en/ch13#id338)
  - immutable, advantages of, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros), [Designing for auditability](/en/ch13#id365)
  - ordering to capture causality, [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - reads as, [Reads are events too](/en/ch13#sec_future_read_events)
  - stragglers, [Handling straggler events](/en/ch12#id323)
  - timestamp of, in stream processing, [Whose clock are you using, anyway?](/en/ch12#id438)
- EventSource (browser API), [Pushing state changes to clients](/en/ch13#id348)
- EventStoreDB (database), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- eventual consistency, [Replication](/en/ch6#ch_replication), [Problems with Replication Lag](/en/ch6#sec_replication_lag), [Safety and liveness](/en/ch9#sec_distributed_safety_liveness)
  - (see also conflicts)
  - and perpetual inconsistency, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - strong eventual consistency, [Automatic conflict resolution](/en/ch6#automatic-conflict-resolution)
- evidence
  - data used as, [Humans and Reliability](/en/ch2#id31)
- evolvability, [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability), [Encoding and Evolution](/en/ch5#ch_encoding)
  - calling services, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - graph-structured data, [Property Graphs](/en/ch3#id56)
  - of databases, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility), [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)-[Archival storage](/en/ch5#archival-storage), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views), [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
  - reprocessing data, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing), [Unifying batch and stream processing](/en/ch13#id338)
  - schema evolution in Avro, [The writer's schema and the reader's schema](/en/ch5#the-writers-schema-and-the-readers-schema)
  - schema evolution in Protocol Buffers, [Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
  - schema-on-read, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility), [Encoding and Evolution](/en/ch5#ch_encoding), [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- exactly-once semantics, [Exactly-once message processing](/en/ch8#sec_transactions_exactly_once), [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited), [Fault Tolerance](/en/ch12#sec_stream_fault_tolerance), [Exactly-once execution of an operation](/en/ch13#id353)
  - parity with batch processors, [Unifying batch and stream processing](/en/ch13#id338)
  - preservation of integrity, [Correctness of dataflow systems](/en/ch13#id453)
  - using durable execution, [Durable execution](/en/ch5#durable-execution)
- exclusive mode (locks), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
- exponential backoff, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- ext4 (file system), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- eXtended Architecture transactions (see XA transactions)
- extract-transform-load (see ETL)

### F

- Facebook
  - Faiss (vector index), [Vector Embeddings](/en/ch4#id92)
  - React (user interface library), [End-to-end event streams](/en/ch13#id349)
  - social graphs, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- facts
  - fact table (star schema), [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
  - in Datalog, [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - in event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- fail-slow faults, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- fail-stop model, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- failover, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Glossary](/en/glossary)
  - (see also leader-based replication)
  - in leaderless replication, absence of, [Writing to the Database When a Node Is Down](/en/ch6#id287)
  - leader election, [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing), [Consensus](/en/ch10#sec_consistency_consensus), [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - potential problems, [Leader failure: Failover](/en/ch6#leader-failure-failover)
- failures
  - amplification by distributed transactions, [Maintaining derived state](/en/ch13#id446)
  - failure detection, [Detecting Faults](/en/ch9#id307)
    - automatic rebalancing causing cascading failures, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
    - timeouts and unbounded delays, [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing), [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
    - using a coordination service, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - faults versus, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
  - partial failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure), [Summary](/en/ch9#summary)
- Faiss (vector index), [Vector Embeddings](/en/ch4#id92)
- false positive (Bloom filters), [Bloom filters](/en/ch4#bloom-filters)
- fan-out (messaging systems), [Materializing and Updating Timelines](/en/ch2#sec_introduction_materializing), [Multiple consumers](/en/ch12#id298)
- fault injection, [Fault Tolerance](/en/ch2#id27), [Network Faults in Practice](/en/ch9#sec_distributed_network_faults), [Fault injection](/en/ch9#sec_fault_injection)
- fault isolation, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- fault tolerance, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)-[Humans and Reliability](/en/ch2#id31), [Glossary](/en/glossary)
  - formalization in consensus, [Single-value consensus](/en/ch10#single-value-consensus)
  - human fault tolerance, [Batch Processing](/en/ch11#ch_batch)
  - in batch processing, [Handling Faults](/en/ch11#id281)
  - in log-based systems, [Applying end-to-end thinking in data systems](/en/ch13#id357), [Timeliness and Integrity](/en/ch13#sec_future_integrity)-[Correctness of dataflow systems](/en/ch13#id453)
  - in stream processing, [Fault Tolerance](/en/ch12#sec_stream_fault_tolerance)-[Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
    - atomic commit, [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)
    - idempotence, [Idempotence](/en/ch12#sec_stream_idempotence)
    - maintaining derived state, [Maintaining derived state](/en/ch13#id446)
    - microbatching and checkpointing, [Microbatching and checkpointing](/en/ch12#id329)
    - rebuilding state after a failure, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - of distributed transactions, [XA transactions](/en/ch8#xa-transactions)-[Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
  - of leader-based and leaderless replication, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - transaction atomicity, [Atomicity](/en/ch8#sec_transactions_acid_atomicity), [Distributed Transactions](/en/ch8#sec_transactions_distributed)-[Exactly-once message processing](/en/ch8#sec_transactions_exactly_once)
- faults
  - Byzantine faults, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)-[Weak forms of lying](/en/ch9#weak-forms-of-lying)
  - failures versus, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
  - handled by transactions, [Transactions](/en/ch8#ch_transactions)
  - handling in supercomputers and cloud computing, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
  - hardware, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - in distributed systems, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - introducing deliberately (see fault injection)
  - network faults, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)-[Detecting Faults](/en/ch9#id307)
    - asymmetric faults, [The Majority Rules](/en/ch9#sec_distributed_majority)
    - detecting, [Detecting Faults](/en/ch9#id307)
    - tolerance of, in multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - software faults, [Software faults](/en/ch2#software-faults)
  - tolerating (see fault tolerance)
- feature engineering (machine learning), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- federated databases, [The meta-database of everything](/en/ch13#id341)
- Feldera (database)
  - incremental view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- fence (CPU instruction), [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
- fencing (preventing split brain), [Leader failure: Failover](/en/ch6#leader-failure-failover), [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)-[Fencing with multiple replicas](/en/ch9#fencing-with-multiple-replicas)
  - generating fencing tokens, [Using shared logs](/en/ch10#sec_consistency_smr), [Coordination Services](/en/ch10#sec_consistency_coordination)
  - properties of fencing tokens, [Defining the correctness of an algorithm](/en/ch9#defining-the-correctness-of-an-algorithm)
  - stream processors writing to databases, [Idempotence](/en/ch12#sec_stream_idempotence), [Exactly-once execution of an operation](/en/ch13#id353)
- fetch-and-add
  - relation to consensus, [Fetch-and-add as consensus](/en/ch10#fetch-and-add-as-consensus)
- Fibre Channel (networks), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- field tags (Protocol Buffers), [Protocol Buffers](/en/ch5#sec_encoding_protobuf)-[Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
- Figma (graphics software), [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- filesystem in userspace (FUSE), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - on object storage, [Object Stores](/en/ch11#id277)
- financial data
  - accounting ledgers, [Summary](/en/ch3#summary)
  - immutability, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
  - time series data, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- Fivetran, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- FizzBee (specification language), [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- flat index (vector index), [Vector Embeddings](/en/ch4#id92)
- FlatBuffers (data format), [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- Flink (processing framework), [Batch Processing](/en/ch11#ch_batch), [Dataflow Engines](/en/ch11#sec_batch_dataflow)
  - cost efficiency, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes), [DataFrames](/en/ch11#id287)
  - fault tolerance, [Handling Faults](/en/ch11#id281), [Microbatching and checkpointing](/en/ch12#id329), [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - FlinkML, [Machine Learning](/en/ch11#id290)
  - for data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - integration of batch and stream processing, [Unifying batch and stream processing](/en/ch13#id338)
  - query optimizer, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
  - stream processing, [Stream analytics](/en/ch12#id318)
  - streaming SQL support, [Complex event processing](/en/ch12#id317)
- flow control, [The Limitations of TCP](/en/ch9#sec_distributed_tcp), [Messaging Systems](/en/ch12#sec_stream_messaging), [Glossary](/en/glossary)
- FLP result (on consensus), [Consensus](/en/ch10#sec_consistency_consensus)
- Flyte (workflow scheduler), [Machine Learning](/en/ch11#id290)
- followers, [Single-Leader Replication](/en/ch6#sec_replication_leader), [Glossary](/en/glossary)
  - (see also leader-based replication)
- formal methods, [Formal Methods and Randomized Testing](/en/ch9#sec_distributed_formal)-[Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- forward compatibility, [Encoding and Evolution](/en/ch5#ch_encoding)
- forward decay (algorithm), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- Fossil (version control system), [Concurrency control](/en/ch12#sec_stream_concurrency)
  - shunning (deleting data), [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
- FoundationDB (database)
  - consistency model, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - deterministic simulation testing, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - key-range sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - process-per-core model, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
  - serializable transactions, [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi), [Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
  - transactions, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
- fractional indexing, [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)
- fragmentation (of B-trees), [Disk space usage](/en/ch4#disk-space-usage)
- frame (computer graphics), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- frontend (web development), [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- FrostDB (database)
  - deterministic simulation testing (DST), [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- fsync (system call), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal), [Durability](/en/ch8#durability)
- full-text search, [Full-Text Search](/en/ch4#sec_storage_full_text), [Glossary](/en/glossary)
  - and fuzzy indexes, [Full-Text Search](/en/ch4#sec_storage_full_text)
  - Lucene storage engine, [Full-Text Search](/en/ch4#sec_storage_full_text)
  - sharded indexes, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)
- Function as a Service (FaaS), [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
- functional programming
  - inspiration for MapReduce, [MapReduce](/en/ch11#sec_batch_mapreduce)
- functional requirements, [Defining Nonfunctional Requirements](/en/ch2#ch_nonfunctional)
- FUSE (see filesystem in userspace (FUSE))
- fuzzing, [Formal Methods and Randomized Testing](/en/ch9#sec_distributed_formal)
- fuzzy search (see similarity search)

### G

- Gallina (specification language), [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- game development, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- garbage collection
  - immutability and, [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - process pauses for, [Latency and Response Time](/en/ch2#id23), [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact), [The Majority Rules](/en/ch9#sec_distributed_majority)
    - (see also process pauses)
- gas stations, algorithmic pricing, [Feedback Loops](/en/ch14#id372)
- GDPR (regulation), [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - consent, [Consent and Freedom of Choice](/en/ch14#id375)
  - data minimization, [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - legitimate interest, [Consent and Freedom of Choice](/en/ch14#id375)
  - right of access, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
  - right to erasure, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Disk space usage](/en/ch4#disk-space-usage), [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- GenBank (genome database), [Summary](/en/ch3#summary)
- General Data Protection Regulation (see GDPR (regulation))
- genome analysis, [Summary](/en/ch3#summary)
- geographic distribution (see regions (geographic distribution))
- geospatial indexes, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- Git (version control system), [Concurrency control](/en/ch12#sec_stream_concurrency)
  - local-first software, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - merge conflicts, [Manual conflict resolution](/en/ch6#manual-conflict-resolution)
- GitHub, postmortems, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
- global secondary indexes, [Global Secondary Indexes](/en/ch7#id167), [Summary](/en/ch7#summary)
- globally unique identifiers (see UUIDs)
- GlusterFS (distributed filesystem), [Batch Processing](/en/ch11#ch_batch), [Distributed Filesystems](/en/ch11#sec_batch_dfs), [Object Stores](/en/ch11#id277)
- GNU Coreutils (Linux), [Sorting Versus In-memory Aggregation](/en/ch11#id275)
- Go (programming language)
  - garbage collection, [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- GoldenGate (change data capture), [Implementing change data capture](/en/ch12#id307)
  - (see also Oracle)
- Google
  - BigQuery (see BigQuery (database))
  - Bigtable (see Bigtable (database))
  - Chubby (lock service), [Coordination Services](/en/ch10#sec_consistency_coordination)
  - Cloud Storage (object storage), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Object Stores](/en/ch11#id277)
    - request preconditions, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
  - Compute Engine
    - preemptible instances, [Handling Faults](/en/ch11#id281)
  - Dataflow (stream processing)
    - data warehouse integration, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
    - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
  - Dataflow (stream processor), [Stream analytics](/en/ch12#id318), [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit), [Unifying batch and stream processing](/en/ch13#id338)
    - (see also Beam)
  - Datastream (change data capture), [API support for change streams](/en/ch12#sec_stream_change_api)
  - Docs (collaborative editor), [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
    - operational transformation, [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
  - Dremel (query engine), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - Firestore (database), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
  - MapReduce (batch processing), [Batch Processing](/en/ch11#ch_batch)
    - (see also MapReduce)
  - Percolator (transaction system), [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
  - persistent disks (cloud service), [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
  - Pub/Sub (messaging), [Message brokers](/en/ch5#message-brokers), [Message brokers compared to databases](/en/ch12#id297), [Using logs for message storage](/en/ch12#id300)
  - response time study, [Average, Median, and Percentiles](/en/ch2#id24)
  - Sheets (collaborative spreadsheet), [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
  - Spanner (see Spanner (database))
  - TrueTime (clock API), [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)
- gossip protocol, [Request Routing](/en/ch7#sec_sharding_routing)
- governance, [Beyond the data lake](/en/ch1#beyond-the-data-lake)
- government use of data, [Data as Assets and Power](/en/ch14#id376)
- GPS (Global Positioning System)
  - use for clock synchronization, [Unreliable Clocks](/en/ch9#sec_distributed_clocks), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy), [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
- GPT (language model), [Vector Embeddings](/en/ch4#id92)
- GPU (graphics processing unit), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
- gradual rollout (see rolling upgrades)
- GraphQL (query language), [GraphQL](/en/ch3#id63)
  - validation, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
- graphs, [Glossary](/en/glossary)
  - as data models, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)-[GraphQL](/en/ch3#id63)
    - property graphs, [Property Graphs](/en/ch3#id56)
    - RDF and triple-stores, [Triple-Stores and SPARQL](/en/ch3#id59)-[The SPARQL query language](/en/ch3#the-sparql-query-language)
  - DAGs (see directed acyclic graphs)
  - processing and analysis, [Machine Learning](/en/ch11#id290)
  - query languages
    - Cypher, [The Cypher Query Language](/en/ch3#id57)
    - Datalog, [Datalog: Recursive Relational Queries](/en/ch3#id62)
    - GraphQL, [GraphQL](/en/ch3#id63)
    - Gremlin, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
    - recursive SQL queries, [Graph Queries in SQL](/en/ch3#id58)
    - SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)
  - traversal, [Property Graphs](/en/ch3#id56)
- gray failures, [System Model and Reality](/en/ch9#sec_distributed_system_model)
  - in leaderless replication, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
- Gremlin (graph query language), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- grep (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis)
- gRPC (service calls), [Microservices and Serverless](/en/ch1#sec_introduction_microservices), [Web services](/en/ch5#sec_web_services)
  - forward and backward compatibility, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
- GUIDs (see UUIDs)

### H

- Hadoop (data infrastructure)
  - comparison to distributed databases, [Batch Processing](/en/ch11#ch_batch)
  - MapReduce (see MapReduce)
  - NodeManager, [Distributed Job Orchestration](/en/ch11#id278)
  - YARN (see YARN (job scheduler))
- HANA (see SAP HANA (database))
- happens-before relation, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
- hard disks
  - access patterns, [Sequential versus random writes](/en/ch4#sidebar_sequential)
  - detecting corruption, [The end-to-end argument](/en/ch13#sec_future_e2e_argument), [Don't just blindly trust what they promise](/en/ch13#id364)
  - faults in, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults), [Durability](/en/ch8#durability)
  - sequential vs. random writes, [Sequential versus random writes](/en/ch4#sidebar_sequential)
  - sequential write throughput, [Disk space usage](/en/ch12#sec_stream_disk_usage)
- hardware faults, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
- hash function
  - in Bloom filters, [Bloom filters](/en/ch4#bloom-filters)
- hash join
  - in stream processing, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
- hash sharding, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash)-[Consistent hashing](/en/ch7#sec_sharding_consistent_hashing), [Summary](/en/ch7#summary)
  - consistent hashing, [Consistent hashing](/en/ch7#sec_sharding_consistent_hashing)
  - problems with hash mod N, [Hash modulo number of nodes](/en/ch7#hash-modulo-number-of-nodes)
  - range queries, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - suitable hash functions, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash)
  - with fixed number of shards, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
- hash tables, [Log-Structured Storage](/en/ch4#sec_storage_log_structured)
- Hazelcast (in-memory data grid)
  - FencedLock, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
  - Flake ID Generator, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
- HBase (database)
  - bug due to lack of fencing, [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)
  - key-range sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - log-structured storage, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - regions (sharding), [Sharding](/en/ch7#ch_sharding)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - size-tiered compaction, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - wide-column data model, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality), [Column Compression](/en/ch4#sec_storage_column_compression)
- HDFS (Hadoop Distributed File System), [Batch Processing](/en/ch11#ch_batch), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - (see also distributed filesystems)
  - checking data integrity, [Don't just blindly trust what they promise](/en/ch13#id364)
  - DataNode, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - NameNode, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - use in MapReduce, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - workflow example, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
- HdrHistogram (numerical library), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- head (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis), [Distributed Job Orchestration](/en/ch11#id278)
- head vertex (property graphs), [Property Graphs](/en/ch3#id56)
- head-of-line blocking, [Latency and Response Time](/en/ch2#id23)
- heap files (databases), [Storing values within the index](/en/ch4#sec_storage_index_heap)
  - in multiversion concurrency control, [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl)
- heat management, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
- hedged requests, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
- heterogeneous distributed transactions, [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa), [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
- heuristic decisions (in 2PC), [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
- Hex (notebook), [Machine Learning](/en/ch11#id290)
- hexagons
  - for geospatial indexing, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- Hibernate (object-relational mapper), [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm)
- hierarchical model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
- hierarchical navigable small world (vector index), [Vector Embeddings](/en/ch4#id92)
- hierarchical queries (see recursive common table expressions)
- high availability (see fault tolerance)
- high-frequency trading, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- high-performance computing (HPC), [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- hinted handoff (leaderless replication), [Catching up on missed writes](/en/ch6#sec_replication_read_repair)
- histograms, [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- Hive (data warehouse), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - query optimizer, [Query languages](/en/ch11#sec_batch_query_lanauges)
- HNSW (vector index), [Vector Embeddings](/en/ch4#id92)
- hopping windows (stream processing), [Types of windows](/en/ch12#id324)
  - (see also windows)
- Hoptimator (query engine), [The meta-database of everything](/en/ch13#id341)
- Horizon scandal, [Humans and Reliability](/en/ch2#id31)
  - lack of transactions, [Transactions](/en/ch8#ch_transactions)
- horizontal scaling (see scaling out)
  - by sharding, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
- HornetQ (messaging), [Message brokers](/en/ch5#message-brokers), [Message brokers compared to databases](/en/ch12#id297)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
- hot keys, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
- hot spots, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
  - due to celebrities, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
  - for time-series data, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - relieving, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
- hot standbys (see leader-based replication)
- HTAP (see hybrid transactional/analytic processing)
- HTTP, use in APIs (see services)
- human errors, [Humans and Reliability](/en/ch2#id31), [Network Faults in Practice](/en/ch9#sec_distributed_network_faults), [Batch Processing](/en/ch11#ch_batch)
- hybrid logical clocks, [Hybrid logical clocks](/en/ch10#hybrid-logical-clocks)
- hybrid transactional/analytic processing, [Data Warehousing](/en/ch1#sec_introduction_dwh), [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
- hydrating IDs (join), [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
- hypergraph, [Property Graphs](/en/ch3#id56)
- HyperLogLog (algorithm), [Stream analytics](/en/ch12#id318)

### I

- I/O operations, waiting for, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- IaaS (see infrastructure as a service (IaaS))
- IBM
  - Db2 (database)
    - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
    - serializable isolation, [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - MQ (messaging), [Message brokers compared to databases](/en/ch12#id297)
    - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - System R (database), [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview)
  - WebSphere (messaging), [Message brokers](/en/ch5#message-brokers)
- Iceberg (table format), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - databases on object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - log-based message broker storage, [Disk space usage](/en/ch12#sec_stream_disk_usage)
- idempotence, [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc), [Idempotence](/en/ch12#sec_stream_idempotence), [Glossary](/en/glossary)
  - by giving operations unique IDs, [Multi-shard request processing](/en/ch13#id360)
  - by giving requests unique IDs, [Uniquely identifying requests](/en/ch13#id355)
  - for exactly-once semantics, [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
  - idempotent operations, [Exactly-once execution of an operation](/en/ch13#id353)
  - in workflow engines, [Durable execution](/en/ch5#durable-execution)
- immutability
  - advantages of, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros), [Designing for auditability](/en/ch13#id365)
  - and right to erasure, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Disk space usage](/en/ch4#disk-space-usage)
  - crypto-shredding for deletion, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events), [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - deriving state from event log, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)-[Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - for crash recovery, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - in B-trees, [B-tree variants](/en/ch4#b-tree-variants), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - in event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events), [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
  - limitations of, [Concurrency control](/en/ch12#sec_stream_concurrency)
- impedance mismatch, [The Object-Relational Mismatch](/en/ch3#sec_datamodels_document)
- in doubt (transaction status), [Coordinator failure](/en/ch8#coordinator-failure)
  - holding locks, [Holding locks while in doubt](/en/ch8#holding-locks-while-in-doubt)
  - orphaned transactions, [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
- in-memory databases, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - durability, [Durability](/en/ch8#durability)
  - serial transaction execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
- incidents
  - accounting software bugs leading to wrongful convictions, [Humans and Reliability](/en/ch2#id31)
  - blameless postmortems, [Humans and Reliability](/en/ch2#id31)
  - crashes due to leap seconds, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
  - data corruption and financial losses due to concurrency bugs, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
  - data corruption on hard disks, [Durability](/en/ch8#durability)
  - data loss due to last-write-wins, [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - data on disks unreadable, [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
  - disclosure of sensitive data due to primary key reuse, [Leader failure: Failover](/en/ch6#leader-failure-failover)
  - errors in transaction serializability, [Maintaining integrity in the face of software bugs](/en/ch13#id455)
  - gigabit network interface with 1 Kb/s throughput, [System Model and Reality](/en/ch9#sec_distributed_system_model)
  - leap second crash, [Software faults](/en/ch2#software-faults)
  - network faults, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - network interface dropping only inbound packets, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - network partitions and whole-datacenter failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - poor handling of network faults, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - sending message to ex-partner, [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - sharks biting undersea cables, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - split brain due to 1-minute packet delay, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - SSD failure after 32,768 hours, [Software faults](/en/ch2#software-faults)
  - thread contention bringing down a service, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - vibrations in server rack, [Latency and Response Time](/en/ch2#id23)
  - violation of uniqueness constraint, [Maintaining integrity in the face of software bugs](/en/ch13#id455)
- incremental view maintenance (IVM), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - for data integration, [Unbundled versus integrated systems](/en/ch13#id448)
- indexes, [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp), [Glossary](/en/glossary)
  - and snapshot isolation, [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - as derived data, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
  - B-trees, [B-Trees](/en/ch4#sec_storage_b_trees)-[B-tree variants](/en/ch4#b-tree-variants)
  - clustered, [Storing values within the index](/en/ch4#sec_storage_index_heap)
  - comparison of B-trees and LSM-trees, [Comparing B-Trees and LSM-Trees](/en/ch4#sec_storage_btree_lsm_comparison)-[Disk space usage](/en/ch4#disk-space-usage)
  - covering (with included columns), [Storing values within the index](/en/ch4#sec_storage_index_heap)
  - creating, [Creating an index](/en/ch13#id340)
  - full-text search, [Full-Text Search](/en/ch4#sec_storage_full_text)
  - geospatial, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
  - index-range locking, [Index-range locks](/en/ch8#sec_transactions_2pl_range)
  - multi-column (concatenated), [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
  - secondary, [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn)
    - (see also secondary indexes)
    - problems with dual writes, [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Reasoning about dataflows](/en/ch13#id443)
  - sharding and secondary indexes, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)-[Global Secondary Indexes](/en/ch7#id167), [Summary](/en/ch7#summary)
  - sparse, [The SSTable file format](/en/ch4#the-sstable-file-format)
  - SSTables and LSM-trees, [The SSTable file format](/en/ch4#the-sstable-file-format)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - updating when data changes, [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- Industrial Revolution, [Remembering the Industrial Revolution](/en/ch14#id377)
- InfiniBand (networks), [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- InfluxDB IOx (storage engine), [Column-Oriented Storage](/en/ch4#sec_storage_column)
- information retrieval (see full-text search)
- infrastructure as a service (IaaS), [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud), [Layering of cloud services](/en/ch1#layering-of-cloud-services)
- InnoDB (storage engine)
  - clustered index on primary key, [Storing values within the index](/en/ch4#sec_storage_index_heap)
  - not preventing lost updates, [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
  - preventing write skew, [Characterizing write skew](/en/ch8#characterizing-write-skew), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - serializable isolation, [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
- instance (cloud computing), [Layering of cloud services](/en/ch1#layering-of-cloud-services)
- integrating different data systems (see data integration)
- integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - coordination-avoiding data systems, [Coordination-avoiding data systems](/en/ch13#id454)
  - correctness of dataflow systems, [Correctness of dataflow systems](/en/ch13#id453)
  - in consensus formalization, [Single-value consensus](/en/ch10#single-value-consensus), [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
  - integrity checks, [Don't just blindly trust what they promise](/en/ch13#id364)
    - (see also auditing)
    - end-to-end, [The end-to-end argument](/en/ch13#sec_future_e2e_argument), [The end-to-end argument again](/en/ch13#id456)
    - use of snapshot isolation, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - maintaining despite software bugs, [Maintaining integrity in the face of software bugs](/en/ch13#id455)
- Interface Definition Language (IDL), [Protocol Buffers](/en/ch5#sec_encoding_protobuf), [Avro](/en/ch5#sec_encoding_avro), [Web services](/en/ch5#sec_web_services)
- invariants, [Consistency](/en/ch8#sec_transactions_acid_consistency)
  - (see also constraints)
- inverted file index (vector index), [Vector Embeddings](/en/ch4#id92)
- inverted index, [Full-Text Search](/en/ch4#sec_storage_full_text)
- irreversibility, minimizing, [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events), [Batch Processing](/en/ch11#ch_batch)
- ISDN (Integrated Services Digital Network), [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
- isolation (in operating systems)
  - cgroups (see cgroups)
- isolation (in transactions), [Isolation](/en/ch8#sec_transactions_acid_isolation), [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object), [Glossary](/en/glossary)
  - correctness and, [Aiming for Correctness](/en/ch13#sec_future_correctness)
  - for single-object writes, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - serializability, [Serializability](/en/ch8#sec_transactions_serializability)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - actual serial execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)-[Summary of serial execution](/en/ch8#summary-of-serial-execution)
    - serializable snapshot isolation (SSI), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - two-phase locking (2PL), [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)-[Index-range locks](/en/ch8#sec_transactions_2pl_range)
  - violating, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
  - weak isolation levels, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)-[Materializing conflicts](/en/ch8#materializing-conflicts)
    - preventing lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)-[Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
    - read committed, [Read Committed](/en/ch8#sec_transactions_read_committed)-[Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
    - snapshot isolation, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)-[Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- IVF (vector index), [Vector Embeddings](/en/ch4#id92)

### J

- Java Database Connectivity (JDBC)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - network drivers, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- Java Enterprise Edition (EE), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc), [XA transactions](/en/ch8#xa-transactions)
- Java Message Service (JMS), [Message brokers compared to databases](/en/ch12#id297)
  - (see also messaging systems)
  - comparison to log-based messaging, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Replaying old messages](/en/ch12#sec_stream_replay)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - message ordering, [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
- Java Transaction API (JTA), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc), [XA transactions](/en/ch8#xa-transactions)
- Java Virtual Machine (JVM)
  - garbage collection, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses), [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
  - JIT compilation, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - process reuse in batch processors, [Dataflow Engines](/en/ch11#sec_batch_dataflow)
- Jena (RDF framework), [The RDF data model](/en/ch3#the-rdf-data-model)
  - SPARQL query language, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- Jepsen (fault tolerance testing), [Fault injection](/en/ch9#sec_fault_injection), [Aiming for Correctness](/en/ch13#sec_future_correctness)
- jitter (network delay), [Average, Median, and Percentiles](/en/ch2#id24), [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
- JMESPath (query language), [Query languages](/en/ch11#sec_batch_query_lanauges)
- join table, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many), [Property Graphs](/en/ch3#id56)
- joins, [Glossary](/en/glossary)
  - expressing as relational operators, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - handling GraphQL query, [GraphQL](/en/ch3#id63)
  - in application code, [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization), [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
  - in DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - in relational and document databases, [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)
  - secondary indexes and, [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn)
  - sort-merge joins, [JOIN and GROUP BY](/en/ch11#sec_batch_join)
  - stream joins, [Stream Joins](/en/ch12#sec_stream_joins)-[Time-dependence of joins](/en/ch12#sec_stream_join_time)
    - stream-stream join, [Stream-stream join (window join)](/en/ch12#id440)
    - stream-table join, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
    - table-table join, [Table-table join (materialized view maintenance)](/en/ch12#id326)
    - time-dependence of, [Time-dependence of joins](/en/ch12#sec_stream_join_time)
  - support in document databases, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
- JOTM (transaction coordinator), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
- journaling (filesystems), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
- JSON
  - aggregation pipeline (query language), [Query languages for documents](/en/ch3#query-languages-for-documents)
  - Avro schema representation, [Avro](/en/ch5#sec_encoding_avro)
  - binary variants, [Binary encoding](/en/ch5#binary-encoding)
  - data locality, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - document data model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
  - for application data, issues with, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
  - GraphQL response, [GraphQL](/en/ch3#id63)
  - in relational databases, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
  - representing a résumé (example), [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
  - Schema, [JSON Schema](/en/ch5#json-schema)
- JSON-LD, [Triple-Stores and SPARQL](/en/ch3#id59)
- JsonPath (query language), [Query languages](/en/ch11#sec_batch_query_lanauges)
- JuiceFS (distributed filesystem), [Distributed Filesystems](/en/ch11#sec_batch_dfs), [Object Stores](/en/ch11#id277)
- Jupyter (notebook), [Machine Learning](/en/ch11#id290)
- just-in-time (JIT) compilation, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)

### K

- Kafka (messaging), [Message brokers](/en/ch5#message-brokers), [Using logs for message storage](/en/ch12#id300)
  - consumer groups, [Multiple consumers](/en/ch12#id298)
  - for data integration, [Unbundled versus integrated systems](/en/ch13#id448)
  - for event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - Kafka Connect (database integration), [Implementing change data capture](/en/ch12#id307), [API support for change streams](/en/ch12#sec_stream_change_api), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
  - Kafka Streams (stream processor), [Stream analytics](/en/ch12#id318), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
    - exactly-once semantics, [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
    - fault tolerance, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - ksqlDB (stream database), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - log compaction, [Log compaction](/en/ch12#sec_stream_log_compaction), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - message offsets, [Using logs for message storage](/en/ch12#id300), [Idempotence](/en/ch12#sec_stream_idempotence)
  - partitions (sharding), [Sharding](/en/ch7#ch_sharding)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - schema registry, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
  - tiered storage, [Disk space usage](/en/ch12#sec_stream_disk_usage)
  - transactions, [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal), [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)
  - unclean leader election, [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - use of model-checking, [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- kappa architecture, [Unifying batch and stream processing](/en/ch13#id338)
- key-value stores, [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)
  - comparison to object stores, [Object Stores](/en/ch11#id277)
  - in-memory, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - LSM storage, [Log-Structured Storage](/en/ch4#sec_storage_log_structured)-[Disk space usage](/en/ch4#disk-space-usage)
  - sharding, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)-[Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
    - by hash of key, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash), [Summary](/en/ch7#summary)
    - by key range, [Sharding by Key Range](/en/ch7#sec_sharding_key_range), [Summary](/en/ch7#summary)
    - skew and hot spots, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
- Kinesis (messaging), [Message brokers](/en/ch5#message-brokers), [Using logs for message storage](/en/ch12#id300)
  - data warehouse integration, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- Kryo (Java), [Language-Specific Formats](/en/ch5#id96)
- ksqlDB (stream database), [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- Kubernetes (cluster manager), [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud), [Microservices and Serverless](/en/ch1#sec_introduction_microservices), [Distributed Job Orchestration](/en/ch11#id278), [Separation of application code and state](/en/ch13#id344)
  - Kubeflow, [Machine Learning](/en/ch11#id290)
  - kubelet, [Distributed Job Orchestration](/en/ch11#id278)
  - operators, [Distributed Job Orchestration](/en/ch11#id278)
  - use of etcd, [Request Routing](/en/ch7#sec_sharding_routing), [Coordination Services](/en/ch10#sec_consistency_coordination)
- KùzuDB (database), [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - as embedded storage engine, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - Cypher query language, [The Cypher Query Language](/en/ch3#id57)

### L

- labeled property graphs (see property graphs)
- lambda architecture, [Unifying batch and stream processing](/en/ch13#id338)
- Lamport timestamps, [Lamport timestamps](/en/ch10#lamport-timestamps)
- Lance (data format), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - (see also column-oriented storage)
- large language models (LLMs)
  - pre-processing training data, [Machine Learning](/en/ch11#id290)
- last write wins (LWW), [Last write wins (discarding concurrent writes)](/en/ch6#sec_replication_lww), [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent), [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - problems with, [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - prone to lost updates, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
- latency, [Latency and Response Time](/en/ch2#id23)
  - (see also response time)
  - across regions, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
  - instability under two-phase locking, [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
  - network latency and resource utilization, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
  - reducing by request hedging, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - response time versus, [Latency and Response Time](/en/ch2#id23)
  - tail latency, [Average, Median, and Percentiles](/en/ch2#id24), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla), [Local Secondary Indexes](/en/ch7#id166)
- law (see legal matters)
- layering (of cloud services), [Layering of cloud services](/en/ch1#layering-of-cloud-services)
- leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)-[Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
  - (see also replication)
  - failover, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)
  - handling node outages, [Handling Node Outages](/en/ch6#sec_replication_failover)
  - implementation of replication logs
    - change data capture, [Change Data Capture](/en/ch12#sec_stream_cdc)-[API support for change streams](/en/ch12#sec_stream_change_api)
      - (see also changelogs)
    - statement-based, [Statement-based replication](/en/ch6#statement-based-replication)
    - write-ahead log (WAL) shipping, [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
  - linearizability of operations, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - locking and leader election, [Locking and leader election](/en/ch10#locking-and-leader-election)
  - log sequence number, [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Consumer offsets](/en/ch12#sec_stream_log_offsets)
  - read-scaling architecture, [Problems with Replication Lag](/en/ch6#sec_replication_lag), [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - relation to consensus, [Consensus](/en/ch10#sec_consistency_consensus), [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus), [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
  - setting up new followers, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - synchronous versus asynchronous, [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async)
- leaderless replication, [Leaderless Replication](/en/ch6#sec_replication_leaderless)-[Version vectors](/en/ch6#version-vectors)
  - (see also replication)
  - catching up on missed writes, [Catching up on missed writes](/en/ch6#sec_replication_read_repair)
  - detecting concurrent writes, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)-[Version vectors](/en/ch6#version-vectors)
    - version vectors, [Version vectors](/en/ch6#version-vectors)
  - multi-region, [Multi-region operation](/en/ch6#multi-region-operation)
  - quorums, [Quorums for reading and writing](/en/ch6#sec_replication_quorum_condition)-[Multi-region operation](/en/ch6#multi-region-operation)
    - consistency limitations, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations)-[Monitoring staleness](/en/ch6#monitoring-staleness), [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
- leap seconds, [Software faults](/en/ch2#software-faults), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
  - in time-of-day clocks, [Time-of-day clocks](/en/ch9#time-of-day-clocks)
- leases, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - implementation with coordination service, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - need for fencing, [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)
  - relation to consensus, [Single-value consensus](/en/ch10#single-value-consensus)
- ledgers (accounting), [Summary](/en/ch3#summary)
  - immutability, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
- legacy systems, maintenance of, [Maintainability](/en/ch2#sec_introduction_maintainability)
- legal matters, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
  - data deletion, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Disk space usage](/en/ch4#disk-space-usage)
  - data residence, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
  - privacy regulation, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
- legitimate interest (GDPR), [Consent and Freedom of Choice](/en/ch14#id375)
- leveled compaction, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction), [Disk space usage](/en/ch4#disk-space-usage)
- Levenshtein automata, [Full-Text Search](/en/ch4#sec_storage_full_text)
- limping (partial failure), [System Model and Reality](/en/ch9#sec_distributed_system_model)
- Linear (project management software), [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- linear algebra, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- linear scalability, [Describing Load](/en/ch2#id33)
- linearizability, [Solutions for Replication Lag](/en/ch6#id131), [Linearizability](/en/ch10#sec_consistency_linearizability)-[Linearizability and network delays](/en/ch10#linearizability-and-network-delays), [Glossary](/en/glossary)
  - and consensus, [Consensus](/en/ch10#sec_consistency_consensus)
  - cost of, [The Cost of Linearizability](/en/ch10#sec_linearizability_cost)-[Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
    - CAP theorem, [The CAP theorem](/en/ch10#the-cap-theorem)
    - memory on multi-core CPUs, [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - definition, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - ID generation, [Linearizable ID Generators](/en/ch10#sec_consistency_linearizable_id)
  - in coordination services, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - of derived data systems
    - avoiding coordination, [Coordination-avoiding data systems](/en/ch13#id454)
  - of different replication methods, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)-[Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
    - using quorums, [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
  - reads in consensus systems, [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
  - relying on, [Relying on Linearizability](/en/ch10#sec_consistency_linearizability_usage)-[Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
    - constraints and uniqueness, [Constraints and uniqueness guarantees](/en/ch10#sec_consistency_uniqueness)
    - cross-channel timing dependencies, [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
    - locking and leader election, [Locking and leader election](/en/ch10#locking-and-leader-election)
  - versus serializability, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
- linked data, [Triple-Stores and SPARQL](/en/ch3#id59)
- LinkedIn
  - Espresso (database), [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
  - LIquid (database), [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - profile (example), [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
- Linux, leap second bug, [Software faults](/en/ch2#software-faults), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- Litestream (backup tool), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- liveness properties, [Safety and liveness](/en/ch9#sec_distributed_safety_liveness)
- LLVM (compiler), [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- LMDB (storage engine), [Compaction strategies](/en/ch4#sec_storage_lsm_compaction), [B-tree variants](/en/ch4#b-tree-variants), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
- load
  - coping with, [Principles for Scalability](/en/ch2#id35)
  - describing, [Describing Load](/en/ch2#id33)
- load balancing, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
  - in hardware, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
  - in software, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
  - using message brokers, [Multiple consumers](/en/ch12#id298)
- load shedding, [Describing Performance](/en/ch2#sec_introduction_percentiles)
- local secondary indexes, [Local Secondary Indexes](/en/ch7#id166), [Summary](/en/ch7#summary)
- local-first software, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- locality (data access), [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships), [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality), [Glossary](/en/glossary)
  - in batch processing, [Dataflow Engines](/en/ch11#sec_batch_dataflow)
  - in stateful clients, [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients), [Stateful, offline-capable clients](/en/ch13#id347)
  - in stream processing, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins), [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance), [Stream processors and services](/en/ch13#id345), [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
- location transparency, [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
  - in the actor model, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
- lock-in, [Pros and Cons of Cloud Services](/en/ch1#sec_introduction_cloud_tradeoffs)
- locks, [Glossary](/en/glossary)
  - deadlock, [Explicit locking](/en/ch8#explicit-locking), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - distributed locking, [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)-[Fencing with multiple replicas](/en/ch9#fencing-with-multiple-replicas), [Locking and leader election](/en/ch10#locking-and-leader-election)
    - fencing tokens, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
    - implementation with coordination service, [Coordination Services](/en/ch10#sec_consistency_coordination)
    - relation to consensus, [Single-value consensus](/en/ch10#single-value-consensus)
  - for transaction isolation
    - in snapshot isolation, [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl)
    - in two-phase locking (2PL), [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)-[Index-range locks](/en/ch8#sec_transactions_2pl_range)
    - making operations atomic, [Atomic write operations](/en/ch8#atomic-write-operations)
    - performance, [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
    - preventing dirty writes, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
    - preventing phantoms with index-range locks, [Index-range locks](/en/ch8#sec_transactions_2pl_range), [Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
    - read locks (shared mode), [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
    - shared mode and exclusive mode, [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - in distributed transactions
    - deadlock detection, [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
    - in-doubt transactions holding locks, [Holding locks while in doubt](/en/ch8#holding-locks-while-in-doubt)
  - materializing conflicts with, [Materializing conflicts](/en/ch8#materializing-conflicts)
  - preventing lost updates by explicit locking, [Explicit locking](/en/ch8#explicit-locking)
- log sequence number, [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Consumer offsets](/en/ch12#sec_stream_log_offsets)
- logical clocks, [Timestamps for ordering events](/en/ch9#sec_distributed_lww), [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)-[Enforcing constraints using logical clocks](/en/ch10#enforcing-constraints-using-logical-clocks), [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - for last-write-wins, [Last write wins (discarding concurrent writes)](/en/ch6#sec_replication_lww)
  - for read-after-write consistency, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - hybrid logical clocks, [Hybrid logical clocks](/en/ch10#hybrid-logical-clocks)
  - insufficiency for enforcing constraints, [Enforcing constraints using logical clocks](/en/ch10#enforcing-constraints-using-logical-clocks)
  - Lamport timestamps, [Lamport timestamps](/en/ch10#lamport-timestamps)
- logical replication, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
  - for change data capture, [Implementing change data capture](/en/ch12#id307)
- LogicBlox (database), [Datalog: Recursive Relational Queries](/en/ch3#id62)
- logs (data structure), [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp), [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs), [Glossary](/en/glossary)
  - (see also shared logs)
  - advantages of immutability, [Advantages of immutable events](/en/ch12#sec_stream_immutability_pros)
  - and right to erasure, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Disk space usage](/en/ch4#disk-space-usage)
  - compaction, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Compaction strategies](/en/ch4#sec_storage_lsm_compaction), [Log compaction](/en/ch12#sec_stream_log_compaction), [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)
    - for stream operator state, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - implementing uniqueness constraints, [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
  - log-based messaging, [Log-based Message Brokers](/en/ch12#sec_stream_log)-[Replaying old messages](/en/ch12#sec_stream_replay)
    - comparison to traditional messaging, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Replaying old messages](/en/ch12#sec_stream_replay)
    - consumer offsets, [Consumer offsets](/en/ch12#sec_stream_log_offsets)
    - disk space usage, [Disk space usage](/en/ch12#sec_stream_disk_usage)
    - replaying old messages, [Replaying old messages](/en/ch12#sec_stream_replay), [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing), [Unifying batch and stream processing](/en/ch13#id338)
    - slow consumers, [When consumers cannot keep up with producers](/en/ch12#id459)
    - using logs for message storage, [Using logs for message storage](/en/ch12#id300)
  - log-structured storage, [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
    - log-structured merge tree (see LSM-trees)
  - relation to consensus, [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs)
  - replication, [Single-Leader Replication](/en/ch6#sec_replication_leader), [Implementation of Replication Logs](/en/ch6#sec_replication_implementation)-[Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
    - change data capture, [Change Data Capture](/en/ch12#sec_stream_cdc)-[API support for change streams](/en/ch12#sec_stream_change_api)
      - (see also changelogs)
    - coordination with snapshot, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
    - logical (row-based) replication, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
    - statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)
    - write-ahead log (WAL) shipping, [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
  - scalability limits, [The limits of total ordering](/en/ch13#id335)
- Looker (business intelligence software), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Analytics](/en/ch11#sec_batch_olap)
- loose coupling, [Making unbundling work](/en/ch13#sec_future_unbundling_favor)
- lost updates (see updates)
- Lotus Notes (sync engine), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- LSM-trees (indexes), [The SSTable file format](/en/ch4#the-sstable-file-format)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - comparison to B-trees, [Comparing B-Trees and LSM-Trees](/en/ch4#sec_storage_btree_lsm_comparison)-[Disk space usage](/en/ch4#disk-space-usage)
- Lucene (storage engine), [Full-Text Search](/en/ch4#sec_storage_full_text)
  - similarity search, [Full-Text Search](/en/ch4#sec_storage_full_text)
- LWW (see last write wins)

### M

- machine learning
  - batch inference, [Machine Learning](/en/ch11#id290)
  - data preparation with DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - deleting training data, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
  - deploying data products, [Beyond the data lake](/en/ch1#beyond-the-data-lake)
  - ethical considerations, [Predictive Analytics](/en/ch14#id369)
    - (see also ethics)
  - feature engineering, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [Machine Learning](/en/ch11#id290)
  - in analytics systems, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
  - iterative processing, [Machine Learning](/en/ch11#id290)
  - LLMs (see large language models (LLMs))
  - models derived from training data, [Application code as a derivation function](/en/ch13#sec_future_dataflow_derivation)
  - relation to batch processing, [Machine Learning](/en/ch11#id290)
  - using a data lake, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
  - using GPUs, [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
  - using matrices, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- madsim (deterministic simulation testing), [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- magic scaling sauce, [Principles for Scalability](/en/ch2#id35)
- maintainability, [Maintainability](/en/ch2#sec_introduction_maintainability)-[Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability), [A Philosophy of Streaming Systems](/en/ch13#ch_philosophy)
  - evolvability (see evolvability)
  - operability, [Operability: Making Life Easy for Operations](/en/ch2#id37)
  - simplicity and managing complexity, [Simplicity: Managing Complexity](/en/ch2#id38)
- many-to-many relationships, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many)
  - modeling as graphs, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- many-to-one relationships, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many)
  - in star schema, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- MapReduce (batch processing), [Batch Processing](/en/ch11#ch_batch), [MapReduce](/en/ch11#sec_batch_mapreduce)
  - analysis of user activity events (example), [JOIN and GROUP BY](/en/ch11#sec_batch_join)
  - comparison to stream processing, [Processing Streams](/en/ch12#sec_stream_processing)
  - disadvantages and limitations of, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - fault tolerance, [Handling Faults](/en/ch11#id281)
  - higher-level tools, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - mapper and reducer functions, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
  - sort-merge joins, [JOIN and GROUP BY](/en/ch11#sec_batch_join)
  - workflows, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
    - (see also workflow engines)
- marshalling (see encoding)
- MartenDB (database), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- master-slave replication (obsolete term), [Single-Leader Replication](/en/ch6#sec_replication_leader)
- materialization, [Glossary](/en/glossary)
  - aggregate values, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
  - conflicts, [Materializing conflicts](/en/ch8#materializing-conflicts)
  - materialized views, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
    - as derived data, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
    - in event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
    - incremental view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
      - (see also incremental view maintenance (IVM))
    - maintaining, using stream processing, [Maintaining materialized views](/en/ch12#sec_stream_mat_view), [Table-table join (materialized view maintenance)](/en/ch12#id326)
  - social network timeline example, [Materializing and Updating Timelines](/en/ch2#sec_introduction_materializing)
- Materialize (database), [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
  - incremental view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- matrices, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - sparse, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- Maxwell (change data capture), [Implementing change data capture](/en/ch12#id307)
- mean, [Average, Median, and Percentiles](/en/ch2#id24)
- media monitoring, [Search on streams](/en/ch12#id320)
- median, [Average, Median, and Percentiles](/en/ch2#id24)
- meeting room booking (example), [More examples of write skew](/en/ch8#more-examples-of-write-skew), [Predicate locks](/en/ch8#predicate-locks), [Enforcing Constraints](/en/ch13#sec_future_constraints)
- Memcached (caching server), [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
- Memgraph (database), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - Cypher query language, [The Cypher Query Language](/en/ch3#id57)
- memory
  - barrier (CPU instruction), [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - corruption, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - in-memory databases, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
    - durability, [Durability](/en/ch8#durability)
    - serial transaction execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
  - in-memory representation of data, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
  - memtable (in LSM-trees), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - random bit-flips in, [Trust, but Verify](/en/ch13#sec_future_verification)
  - use by indexes, [Log-Structured Storage](/en/ch4#sec_storage_log_structured)
- memtable (in LSM-trees), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
- Mercurial (version control system), [Concurrency control](/en/ch12#sec_stream_concurrency)
- merge (DataFrame operator), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- merging sorted files, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Shuffling Data](/en/ch11#sec_shuffle)
- Merkle trees, [Tools for auditable data systems](/en/ch13#id366)
- Mesos (cluster manager), [Separation of application code and state](/en/ch13#id344)
- message brokers (see messaging systems)
- message-passing (see event-driven architecture)
- MessagePack (encoding format), [Binary encoding](/en/ch5#binary-encoding)
- messaging systems, [Stream Processing](/en/ch12#ch_stream)-[Replaying old messages](/en/ch12#sec_stream_replay)
  - (see also streams)
  - backpressure, buffering, or dropping messages, [Messaging Systems](/en/ch12#sec_stream_messaging)
  - brokerless messaging, [Direct messaging from producers to consumers](/en/ch12#id296)
  - event logs, [Log-based Message Brokers](/en/ch12#sec_stream_log)-[Replaying old messages](/en/ch12#sec_stream_replay)
    - as data model, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
    - comparison to traditional messaging, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Replaying old messages](/en/ch12#sec_stream_replay)
    - consumer offsets, [Consumer offsets](/en/ch12#sec_stream_log_offsets)
    - replaying old messages, [Replaying old messages](/en/ch12#sec_stream_replay), [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing), [Unifying batch and stream processing](/en/ch13#id338)
    - slow consumers, [When consumers cannot keep up with producers](/en/ch12#id459)
  - exactly-once semantics, [Exactly-once message processing](/en/ch8#sec_transactions_exactly_once), [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited), [Fault Tolerance](/en/ch12#sec_stream_fault_tolerance)
  - message brokers, [Message brokers](/en/ch12#id433)-[Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
    - acknowledgments and redelivery, [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
    - comparison to event logs, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Replaying old messages](/en/ch12#sec_stream_replay)
    - multiple consumers of same topic, [Multiple consumers](/en/ch12#id298)
    - versus RPC, [Event-Driven Architectures](/en/ch5#sec_encoding_dataflow_msg)
  - message loss, [Messaging Systems](/en/ch12#sec_stream_messaging)
  - reliability, [Messaging Systems](/en/ch12#sec_stream_messaging)
  - uniqueness in log-based messaging, [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
- metastable failure, [Describing Performance](/en/ch2#sec_introduction_percentiles)
- metered billing
  - serverless, [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
  - storage, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- microbatching, [Microbatching and checkpointing](/en/ch12#id329)
- microservices, [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
  - (see also services)
  - causal dependencies across services, [The limits of total ordering](/en/ch13#id335)
  - loose coupling, [Making unbundling work](/en/ch13#sec_future_unbundling_favor)
  - relation to batch/stream processors, [Batch Processing](/en/ch11#ch_batch), [Stream processors and services](/en/ch13#id345)
- Microsoft
  - Azure Blob Storage (see Azure Blob Storage)
  - Azure managed disks, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
  - Azure Service Bus (messaging), [Message brokers](/en/ch5#message-brokers), [Message brokers compared to databases](/en/ch12#id297)
  - Azure SQL DB (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - Azure Storage, [Object Stores](/en/ch11#id277)
  - Azure Stream Analytics, [Stream analytics](/en/ch12#id318)
  - Azure Synapse Analytics (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - DCOM (Distributed Component Object Model), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
  - MSDTC (transaction coordinator), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
  - SQL Server (see SQL Server)
- Microsoft Power BI (see Power BI (business intelligence software))
- migrating (rewriting) data, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility), [Different values written at different times](/en/ch5#different-values-written-at-different-times), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views), [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
- MinIO (object storage), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- mobile apps, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
  - embedded databases, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
- model checking, [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- modulus operator (%), [Hash modulo number of nodes](/en/ch7#hash-modulo-number-of-nodes)
- Mojo (programming language)
  - memory management, [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- MongoDB (database)
  - aggregation pipeline, [Query languages for documents](/en/ch3#query-languages-for-documents)
  - atomic operations, [Atomic write operations](/en/ch8#atomic-write-operations)
  - BSON, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - document data model, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
  - hash-range sharding, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash), [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - in the cloud, [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - join support, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - joins (\$lookup operator), [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)
  - JSON Schema validation, [JSON Schema](/en/ch5#json-schema)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - ObjectIds, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
  - range-based sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - shard splitting, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
  - stored procedures, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
- monitoring, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations), [Humans and Reliability](/en/ch2#id31), [Operability: Making Life Easy for Operations](/en/ch2#id37)
- monotonic clocks, [Monotonic clocks](/en/ch9#monotonic-clocks)
- monotonic reads, [Monotonic Reads](/en/ch6#sec_replication_monotonic_reads)
- Morel (query language), [Query languages](/en/ch11#sec_batch_query_lanauges)
- MSMQ (messaging), [XA transactions](/en/ch8#xa-transactions)
- multi-column indexes, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- multi-leader replication, [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)-[Types of conflict](/en/ch6#sec_replication_write_conflicts)
  - (see also replication)
  - collaborative editing, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - conflict detection, [Types of conflict](/en/ch6#sec_replication_write_conflicts)
  - conflict resolution, [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)
  - for multi-region replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc), [The Cost of Linearizability](/en/ch10#sec_linearizability_cost)
  - linearizability, lack of, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - offline-capable clients, [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients)
  - replication topologies, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)-[Problems with different topologies](/en/ch6#problems-with-different-topologies)
- multi-object transactions, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
  - need for, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
- Multi-Paxos (consensus algorithm), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
- multi-reader single-writer lock, [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
- multi-table index cluster tables (Oracle), [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
- multi-version concurrency control (MVCC), [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl), [Summary](/en/ch8#summary)
  - detecting stale MVCC reads, [Detecting stale MVCC reads](/en/ch8#detecting-stale-mvcc-reads)
  - indexes and snapshot isolation, [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - using synchronized clocks, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
- multidimensional arrays, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- multitenancy, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute), [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - by sharding, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
  - using embedded databases, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - versus Byzantine fault tolerance, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
- mutual exclusion, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
  - (see also locks)
- MySQL (database)
  - archiving WAL to object stores, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - binlog coordinates, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - change data capture, [Implementing change data capture](/en/ch12#id307), [API support for change streams](/en/ch12#sec_stream_change_api)
  - circular replication topology, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)
  - consistent snapshots, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - global transaction identifiers (GTIDs), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - in the cloud, [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - InnoDB storage engine (see InnoDB)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - row-based replication, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
  - sharding (see Vitess (database))
  - snapshot isolation support, [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - (see also InnoDB)
  - statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)

### N

- N+1 query problem, [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm)
- nanomsg (messaging library), [Direct messaging from producers to consumers](/en/ch12#id296)
- Narayana (transaction coordinator), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
- NATS (messaging), [Message brokers](/en/ch5#message-brokers)
- natural language processing, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- Neo4j (database)
  - Cypher query language, [The Cypher Query Language](/en/ch3#id57)
  - graph data model, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- Neon (database), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- Nephele (dataflow engine), [Dataflow Engines](/en/ch11#sec_batch_dataflow)
- Neptune (graph database), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - Cypher query language, [The Cypher Query Language](/en/ch3#id57)
  - SPARQL query language, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- netcode (game development), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- Network Attached Storage (NAS), [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- network model (data representation), [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
- Network Time Protocol (see NTP)
- networks
  - congestion and queueing, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - datacenter network topologies, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
  - faults (see faults)
  - linearizability and network delays, [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - network partitions, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
    - in CAP theorem, [The Cost of Linearizability](/en/ch10#sec_linearizability_cost)
  - timeouts and unbounded delays, [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
- NewSQL, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history), [Solutions for Replication Lag](/en/ch6#id131)
  - transactions and, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
- next-key locking, [Index-range locks](/en/ch8#sec_transactions_2pl_range)
- NFS (network file system), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - on object storage, [Object Stores](/en/ch11#id277)
- Nimble (data format), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - (see also column-oriented storage)
- node (in graphs) (see vertices)
- nodes (processes), [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Glossary](/en/glossary)
  - handling outages in leader-based replication, [Handling Node Outages](/en/ch6#sec_replication_failover)
  - system models for failure, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- noisy neighbors, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
- nonblocking atomic commit, [Three-phase commit](/en/ch8#three-phase-commit)
- nondeterministic operations, [Statement-based replication](/en/ch6#statement-based-replication)
  - (see also deterministic operations)
  - in distributed systems, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - in workflow engines, [Durable execution](/en/ch5#durable-execution)
  - partial failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - sources of nondeterminism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- nonfunctional requirements, [Defining Nonfunctional Requirements](/en/ch2#ch_nonfunctional), [Summary](/en/ch2#summary)
- nonrepeatable reads, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - (see also read skew)
- normalization (data representation), [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)-[Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many), [Glossary](/en/glossary)
  - foreign key references, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
  - in social network case study, [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
  - in systems of record, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - versus denormalization, [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
- NoSQL, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history), [Solutions for Replication Lag](/en/ch6#id131), [Unbundling Databases](/en/ch13#sec_future_unbundling)
  - transactions and, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview)
- Notation3 (N3), [Triple-Stores and SPARQL](/en/ch3#id59)
- NTP (Network Time Protocol), [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
  - accuracy, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy), [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - adjustments to monotonic clocks, [Monotonic clocks](/en/ch9#monotonic-clocks)
  - multiple server addresses, [Weak forms of lying](/en/ch9#weak-forms-of-lying)
- numbers, in XML and JSON encodings, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
- NumPy (Python library), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes), [Column-Oriented Storage](/en/ch4#sec_storage_column)
- NVMe (Non-Volatile Memory Express) (see solid state drives (SSDs))

### O

- object databases, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
- object storage, [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Object Stores](/en/ch11#id277)
  - Azure Blob Storage (see Azure Blob Storage)
  - comparison to distributed filesystems, [Object Stores](/en/ch11#id277)
  - comparison to key-value stores, [Object Stores](/en/ch11#id277)
  - databases backed by, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - for backups, [Replication](/en/ch6#ch_replication)
  - for cloud data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
  - for database replication, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - Google Cloud Storage (see Google Cloud Storage)
  - object size, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
  - S3 (see S3 (object storage))
  - storing LSM segment files, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - support for fencing, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
  - use in data lakes, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- object-relational mapping (ORM) frameworks, [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm)
  - error handling and aborted transactions, [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
  - unsafe read-modify-write cycle code, [Atomic write operations](/en/ch8#atomic-write-operations)
- object-relational mismatch, [The Object-Relational Mismatch](/en/ch3#sec_datamodels_document)
- observability, [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems), [Humans and Reliability](/en/ch2#id31), [Operability: Making Life Easy for Operations](/en/ch2#id37)
- observer pattern, [Separation of application code and state](/en/ch13#id344)
- OBT (one big table), [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- offline systems, [Batch Processing](/en/ch11#ch_batch)
  - (see also batch processing)
- offline-first applications, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps), [Stateful, offline-capable clients](/en/ch13#id347)
- offsets
  - consumer offsets in sharded logs, [Consumer offsets](/en/ch12#sec_stream_log_offsets)
  - messages in sharded logs, [Using logs for message storage](/en/ch12#id300)
- OLAP (online analytic processing), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Glossary](/en/glossary)
  - data cubes, [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
- OLTP (online transaction processing), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Glossary](/en/glossary)
  - analytics queries versus, [Analytics](/en/ch11#sec_batch_olap)
  - data normalization, [Trade-offs of normalization](/en/ch3#trade-offs-of-normalization)
  - workload characteristics, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
- on-premises deployment, [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)
  - data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- one big table (data warehouse schema), [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- one-hot encoding, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- one-to-few relationships, [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
- one-to-many relationships, [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
  - JSON representation, [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
- online systems, [Batch Processing](/en/ch11#ch_batch)
  - (see also services)
  - versus scientific computing, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- ontologies, [Triple-Stores and SPARQL](/en/ch3#id59)
- Oozie (workflow scheduler), [Batch Processing](/en/ch11#ch_batch)
- OpenAPI (service definition format), [Microservices and Serverless](/en/ch1#sec_introduction_microservices), [Web services](/en/ch5#sec_web_services)
  - use of JSON Schema, [JSON Schema](/en/ch5#json-schema)
- openCypher (see Cypher (query language))
- OpenLink Virtuoso (see Virtuoso (database))
- OpenStack
  - Swift (object storage), [Object Stores](/en/ch11#id277)
- operability, [Operability: Making Life Easy for Operations](/en/ch2#id37)
- operating systems versus databases, [Unbundling Databases](/en/ch13#sec_future_unbundling)
- operational systems, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
  - (see also OLTP)
  - as systems of record, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
  - ETL into analytical systems, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- operational transformation, [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
- operations teams, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- operators (query execution), [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - in stream processing, [Processing Streams](/en/ch12#sec_stream_processing)
- optimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
- optimistic locking, [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set)
- Oracle (database)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - GoldenGate (change data capture), [Implementing change data capture](/en/ch12#id307)
  - hierarchical queries, [Graph Queries in SQL](/en/ch3#id58)
  - lack of serializability, [Isolation](/en/ch8#sec_transactions_acid_isolation)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - multi-table index cluster tables, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - not preventing write skew, [Characterizing write skew](/en/ch8#characterizing-write-skew)
  - PL/SQL language, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - preventing lost updates, [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
  - read committed isolation, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
  - Real Application Clusters (RAC), [Locking and leader election](/en/ch10#locking-and-leader-election)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation), [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - TimesTen (in-memory database), [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - WAL-based replication, [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
- ORC (data format), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - (see also column-oriented storage)
- orchestration (service deployment), [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud), [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
  - batch job execution, [Distributed Job Orchestration](/en/ch11#id278)
  - workflow engines, [Batch Processing](/en/ch11#ch_batch)
- ordering
  - event logs, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - limits of total ordering, [The limits of total ordering](/en/ch13#id335)
  - logical timestamps, [Logical Clocks](/en/ch10#sec_consistency_timestamps)
  - of auto-incrementing IDs, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
  - shared logs, [Consensus in Practice](/en/ch10#sec_consistency_total_order)-[Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
- Orkes (workflow engine), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- orphan pages (B-trees), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
- outbox pattern, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
- outliers (response time), [Average, Median, and Percentiles](/en/ch2#id24)
- outsourcing, [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)
- overload, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)

### P

- PACELC principle, [The CAP theorem](/en/ch10#the-cap-theorem)
- package managers, [Separation of application code and state](/en/ch13#id344)
- packet switching, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- packets
  - corruption of, [Weak forms of lying](/en/ch9#weak-forms-of-lying)
  - sending via UDP, [Direct messaging from producers to consumers](/en/ch12#id296)
- PageRank (algorithm), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph), [Query languages](/en/ch11#sec_batch_query_lanauges), [Machine Learning](/en/ch11#id290)
- paging (see virtual memory)
- pandas (Python library), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes), [Column-Oriented Storage](/en/ch4#sec_storage_column), [DataFrames](/en/ch11#id287)
- Parquet (data format), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Column-Oriented Storage](/en/ch4#sec_storage_column), [Archival storage](/en/ch5#archival-storage), [Query languages](/en/ch11#sec_batch_query_lanauges)
  - (see also column-oriented storage)
  - databases on object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - document data model, [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - use in batch processing, [MapReduce](/en/ch11#sec_batch_mapreduce)
- partial failures, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure), [Summary](/en/ch9#summary)
  - limping, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- partial synchrony (system model), [System Model and Reality](/en/ch9#sec_distributed_system_model)
- partition key, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons), [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
- partitioning (see sharding)
- Paxos (consensus algorithm), [Consensus](/en/ch10#sec_consistency_consensus), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - ballot number, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - Multi-Paxos, [Consensus in Practice](/en/ch10#sec_consistency_total_order)
- payment card industry (PCI), [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- PCI (payment card industry) compliance, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- percentiles, [Average, Median, and Percentiles](/en/ch2#id24), [Glossary](/en/glossary)
  - calculating efficiently, [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
  - importance of high percentiles, [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
  - use in service level agreements (SLAs), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- Percolator (Google), [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
- Percona XtraBackup (MySQL tool), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- performance
  - degradation as fault, [System Model and Reality](/en/ch9#sec_distributed_system_model)
  - describing, [Describing Performance](/en/ch2#sec_introduction_percentiles)
  - of distributed transactions, [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa)
  - of in-memory databases, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - of linearizability, [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - of multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
- permission isolation, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- perpetual inconsistency, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
- pessimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
- pglogical (PostgreSQL extension), [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
- pgvector (vector index), [Vector Embeddings](/en/ch4#id92)
- phantoms (transaction isolation), [Phantoms causing write skew](/en/ch8#sec_transactions_phantom)
  - materializing conflicts, [Materializing conflicts](/en/ch8#materializing-conflicts)
  - preventing, in serializability, [Predicate locks](/en/ch8#predicate-locks)
- physical clocks (see clocks)
- pickle (Python), [Language-Specific Formats](/en/ch5#id96)
- Pinot (database), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - handling writes, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
  - pre-aggregation, [Analytics](/en/ch11#sec_batch_olap)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- pipelined execution
  - in data warehouse queries, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- pivot table, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- point in time, [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
- point query, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
- Polaris (data catalog), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- polling, [Representing Users, Posts, and Follows](/en/ch2#id20)
- polystores, [The meta-database of everything](/en/ch13#id341)
- POSIX (portable operating system interface)
  - compliant filesystems, [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Distributed Filesystems](/en/ch11#sec_batch_dfs), [Object Stores](/en/ch11#id277)
- Post Office Horizon scandal, [Humans and Reliability](/en/ch2#id31)
  - lack of transactions, [Transactions](/en/ch8#ch_transactions)
- PostgreSQL (database)
  - archiving WAL to object stores, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - change data capture, [Implementing change data capture](/en/ch12#id307), [API support for change streams](/en/ch12#sec_stream_change_api)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - foreign data wrappers, [The meta-database of everything](/en/ch13#id341)
  - full text search support, [Combining Specialized Tools by Deriving Data](/en/ch13#id442)
  - in the cloud, [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - JSON Schema validation, [JSON Schema](/en/ch5#json-schema)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - log sequence number, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - logical decoding, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
  - materialized view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - MVCC implementation, [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl), [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
  - partitioning vs. sharding, [Sharding](/en/ch7#ch_sharding)
  - pgvector (vector index), [Vector Embeddings](/en/ch4#id92)
  - PL/pgSQL language, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - PostGIS geospatial indexes, [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
  - preventing lost updates, [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
  - preventing write skew, [Characterizing write skew](/en/ch8#characterizing-write-skew), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)
  - read committed isolation, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
  - representing graphs, [Property Graphs](/en/ch3#id56)
  - serializable snapshot isolation (SSI), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)
  - sharding (see Citus (database))
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation), [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - WAL-based replication, [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
- postings list, [Full-Text Search](/en/ch4#sec_storage_full_text)
  - in sharded indexes, [Local Secondary Indexes](/en/ch7#id166)
- postmortems, blameless, [Humans and Reliability](/en/ch2#id31)
- PouchDB (database), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- Power BI (business intelligence software), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Analytics](/en/ch11#sec_batch_olap)
- pre-aggregation, [Analytics](/en/ch11#sec_batch_olap)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- pre-splitting, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
- Precision Time Protocol (PTP), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- predicate locks, [Predicate locks](/en/ch8#predicate-locks)
- predictive analytics, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics), [Predictive Analytics](/en/ch14#id369)-[Feedback Loops](/en/ch14#id372)
  - amplifying bias, [Bias and Discrimination](/en/ch14#id370)
  - ethics of (see ethics)
  - feedback loops, [Feedback Loops](/en/ch14#id372)
- preemption, [Resource Allocation](/en/ch11#id279)
  - in distributed schedulers, [Handling Faults](/en/ch11#id281)
  - of threads, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- Prefect (workflow scheduler), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows), [Batch Processing](/en/ch11#ch_batch), [Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - cloud data warehouse integration, [Query languages](/en/ch11#sec_batch_query_lanauges)
- Presto (query engine), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- primary keys, [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn), [Glossary](/en/glossary)
  - auto-incrementing, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
  - versus partition key, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
- primary-backup replication (see leader-based replication)
- privacy, [Privacy and Tracking](/en/ch14#id373)-[Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - consent and freedom of choice, [Consent and Freedom of Choice](/en/ch14#id375)
  - data as assets and power, [Data as Assets and Power](/en/ch14#id376)
  - deleting data, [Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - ethical considerations (see ethics)
  - legislation and self-regulation, [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
  - meaning of, [Privacy and Use of Data](/en/ch14#id457)
  - regulation, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
  - surveillance, [Surveillance](/en/ch14#id374)
  - tracking behavioral data, [Privacy and Tracking](/en/ch14#id373)
- probabilistic algorithms, [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla), [Stream analytics](/en/ch12#id318)
- process pauses, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- processing time (of events), [Reasoning About Time](/en/ch12#sec_stream_time)
- producers (message streams), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- product analytics, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
  - column-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)
- programming languages
  - for stored procedures, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
- projections (event sourcing), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- Prolog (language), [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - (see also Datalog)
- property graphs, [Property Graphs](/en/ch3#id56)
  - Cypher query language, [The Cypher Query Language](/en/ch3#id57)
  - Property Graph Query Language (PGQL), [Graph Queries in SQL](/en/ch3#id58)
- property-based testing, [Humans and Reliability](/en/ch2#id31), [Formal Methods and Randomized Testing](/en/ch9#sec_distributed_formal)
- Protocol Buffers (data format), [Protocol Buffers](/en/ch5#sec_encoding_protobuf)-[Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
  - field tags and schema evolution, [Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
- provenance of data, [Designing for auditability](/en/ch13#id365)
- publish/subscribe model, [Messaging Systems](/en/ch12#sec_stream_messaging)
- publishers (message streams), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- Pulsar (streaming platform), [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
- PyTorch (machine learning library), [Machine Learning](/en/ch11#id290)

### Q

- Qpid (messaging), [Message brokers compared to databases](/en/ch12#id297)
- quality of service (QoS), [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- Quantcast File System (distributed filesystem), [Object Stores](/en/ch11#id277)
- query engines
  - compilation and vectorization, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - in cloud data warehouse, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - operators, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
  - optimizing declarative queries, [Data Models and Query Languages](/en/ch3#ch_datamodels)
- query languages
  - Cypher, [The Cypher Query Language](/en/ch3#id57)
  - Datalog, [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - GraphQL, [GraphQL](/en/ch3#id63)
  - MongoDB aggregation pipeline, [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization), [Query languages for documents](/en/ch3#query-languages-for-documents)
  - recursive SQL queries, [Graph Queries in SQL](/en/ch3#id58)
  - SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)
  - SQL, [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)
- query optimizers, [Query languages](/en/ch11#sec_batch_query_lanauges)
- query plans, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- queueing delays, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - head-of-line blocking, [Latency and Response Time](/en/ch2#id23)
  - latency and response time, [Latency and Response Time](/en/ch2#id23)
- queues (messaging), [Message brokers](/en/ch5#message-brokers)
- QUIC (protocol), [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
- quorums, [Quorums for reading and writing](/en/ch6#sec_replication_quorum_condition)-[Multi-region operation](/en/ch6#multi-region-operation), [Glossary](/en/glossary)
  - for leaderless replication, [Quorums for reading and writing](/en/ch6#sec_replication_quorum_condition)
  - in consensus algorithms, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - limitations of consistency, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations)-[Monitoring staleness](/en/ch6#monitoring-staleness), [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
  - making decisions in distributed systems, [The Majority Rules](/en/ch9#sec_distributed_majority)
  - monitoring staleness, [Monitoring staleness](/en/ch6#monitoring-staleness)
  - multi-region replication, [Multi-region operation](/en/ch6#multi-region-operation)
  - relying on durability, [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
- quotas, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)

### R

- R (language), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes), [DataFrames](/en/ch11#id287)
- R-trees (indexes), [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- R2 (object storage), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- RabbitMQ (messaging), [Message brokers](/en/ch5#message-brokers), [Message brokers compared to databases](/en/ch12#id297)
  - quorum queues (replication), [Single-Leader Replication](/en/ch6#sec_replication_leader)
- race conditions, [Isolation](/en/ch8#sec_transactions_acid_isolation)
  - (see also concurrency)
  - avoiding with linearizability, [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
  - caused by dual writes, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
  - causing loss of money, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
  - dirty writes, [No dirty writes](/en/ch8#sec_transactions_dirty_write)
  - in counter increments, [No dirty writes](/en/ch8#sec_transactions_dirty_write)
  - lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)-[Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
  - preventing with event logs, [Concurrency control](/en/ch12#sec_stream_concurrency), [Dataflow: Interplay between state changes and application code](/en/ch13#id450)
  - preventing with serializable isolation, [Serializability](/en/ch8#sec_transactions_serializability)
  - weak transaction isolation, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
  - write skew, [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts)
- Raft (consensus algorithm), [Consensus](/en/ch10#sec_consistency_consensus), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - sensitivity to network problems, [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
  - term number, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - use in etcd, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
- RAID (Redundant Array of Independent Disks), [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute), [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- railways, schema migration on, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
- RAM (see memory)
- RAMCloud (in-memory storage), [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
- random writes (access pattern), [Sequential versus random writes](/en/ch4#sidebar_sequential)
- range queries
  - in B-trees, [B-Trees](/en/ch4#sec_storage_b_trees), [Read performance](/en/ch4#read-performance)
  - in LSM-trees, [Read performance](/en/ch4#read-performance)
  - not efficient in hash maps, [Log-Structured Storage](/en/ch4#sec_storage_log_structured)
  - with hash sharding, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
- ranking algorithms, [Machine Learning](/en/ch11#id290)
- Ray (workflow scheduler), [Machine Learning](/en/ch11#id290)
- RDF (Resource Description Framework), [The RDF data model](/en/ch3#the-rdf-data-model)
  - querying with SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- RDMA (Remote Direct Memory Access), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- React (user interface library), [End-to-end event streams](/en/ch13#id349)
- reactive programming, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- read committed isolation level, [Read Committed](/en/ch8#sec_transactions_read_committed)-[Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
  - implementing, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
  - multi-version concurrency control (MVCC), [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl)
  - no dirty reads, [No dirty reads](/en/ch8#no-dirty-reads)
  - no dirty writes, [No dirty writes](/en/ch8#sec_transactions_dirty_write)
- read models (event sourcing), [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- read path (derived data), [Observing Derived State](/en/ch13#sec_future_observing)
- read repair (leaderless replication), [Catching up on missed writes](/en/ch6#sec_replication_read_repair)
  - for linearizability, [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
- read replicas (see leader-based replication)
- read skew (transaction isolation), [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation), [Summary](/en/ch8#summary)
- read uncommitted isolation level, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
- read-after-write consistency, [Reading Your Own Writes](/en/ch6#sec_replication_ryw), [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - cross-device, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - in derived data systems, [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions)
- read-modify-write cycle, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)
- read-scaling architecture, [Problems with Replication Lag](/en/ch6#sec_replication_lag), [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - versus sharding, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
- reads as events, [Reads are events too](/en/ch13#sec_future_read_events)
- real-time
  - analytics (see product analytics)
  - collaborative editing, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - publish/subscribe dataflow, [End-to-end event streams](/en/ch13#id349)
  - response time guarantees, [Response time guarantees](/en/ch9#sec_distributed_clocks_realtime)
  - time-of-day clocks, [Time-of-day clocks](/en/ch9#time-of-day-clocks)
- Realm (database), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- rebalancing shards, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)-[Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations), [Glossary](/en/glossary)
  - (see also sharding)
  - automatic or manual rebalancing, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
  - fixed number of shards, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
  - fixed number of shards per node, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - problems with hash mod N, [Hash modulo number of nodes](/en/ch7#hash-modulo-number-of-nodes)
- recency guarantee, [Linearizability](/en/ch10#sec_consistency_linearizability)
- recommendation engines, [Operational Versus Analytical Systems](/en/ch1#sec_introduction_analytics)
  - building using DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - iterative processing, [Machine Learning](/en/ch11#id290)
- reconfiguration (consensus), [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
- records, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - events in stream processing, [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- recursive queries
  - in Cypher, [The Cypher Query Language](/en/ch3#id57)
  - in Datalog, [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - in SPARQL, [The SPARQL query language](/en/ch3#the-sparql-query-language)
  - lack of, in GraphQL, [GraphQL](/en/ch3#id63)
  - SQL common table expressions, [Graph Queries in SQL](/en/ch3#id58)
- Red Hat
  - Apicurio Registry, [JSON Schema](/en/ch5#json-schema)
- red-black tree, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
- redelivery (messaging), [Acknowledgments and redelivery](/en/ch12#sec_stream_reordering)
- Redis (database)
  - atomic operations, [Atomic write operations](/en/ch8#atomic-write-operations)
  - CRDT support, [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
  - durability, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - Lua scripting, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - process-per-core model, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
  - single-threaded execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
- redo log (see write-ahead log)
- Redpanda (messaging), [Message brokers](/en/ch5#message-brokers), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - tiered storage, [Disk space usage](/en/ch12#sec_stream_disk_usage)
- Redshift (database), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- redundancy
  - hardware components, [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy)
  - of derived data, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
    - (see also derived data)
- Reed–Solomon codes (error correction), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- refactoring, [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability)
  - (see also evolvability)
- regions (geographic distribution), [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - (see also datacenters)
  - consensus across, [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
  - definition, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - latency, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
  - linearizable ID generation, [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
  - replication across, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)-[Problems with different topologies](/en/ch6#problems-with-different-topologies), [The Cost of Linearizability](/en/ch10#sec_linearizability_cost), [The limits of total ordering](/en/ch13#id335)
    - leaderless, [Multi-region operation](/en/ch6#multi-region-operation)
    - multi-leader, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
- regions (sharding), [Sharding](/en/ch7#ch_sharding)
- register (data structure), [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
- regulation (see legal matters)
- relational data model, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)-[Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - comparison to document model, [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)-[Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - graph queries in SQL, [Graph Queries in SQL](/en/ch3#id58)
  - in-memory databases with, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - many-to-one and many-to-many relationships, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many)
  - multi-object transactions, need for, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
  - object-relational mismatch, [The Object-Relational Mismatch](/en/ch3#sec_datamodels_document)
  - representing a reorderable list, [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)
  - versus document model
    - convergence of models, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
    - data locality, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
- relational databases
  - eventual consistency, [Problems with Replication Lag](/en/ch6#sec_replication_lag)
  - history, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - logical logs, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
  - philosophy compared to Unix, [Unbundling Databases](/en/ch13#sec_future_unbundling), [The meta-database of everything](/en/ch13#id341)
  - schema changes, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility), [Encoding and Evolution](/en/ch5#ch_encoding), [Different values written at different times](/en/ch5#different-values-written-at-different-times)
  - sharded secondary indexes, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)
  - statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)
  - use of B-tree indexes, [B-Trees](/en/ch4#sec_storage_b_trees)
- relationships (see edges)
- reliability, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)-[Humans and Reliability](/en/ch2#id31), [A Philosophy of Streaming Systems](/en/ch13#ch_philosophy)
  - building a reliable system from unreliable components, [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - hardware faults, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - human errors, [Humans and Reliability](/en/ch2#id31)
  - importance of, [Humans and Reliability](/en/ch2#id31)
  - of messaging systems, [Messaging Systems](/en/ch12#sec_stream_messaging)
  - software faults, [Software faults](/en/ch2#software-faults)
- Remote Method Invocation (Java RMI), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
- remote procedure calls (RPCs), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)-[Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - (see also services)
  - data encoding and evolution, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - issues with, [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
  - using Avro, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
  - versus message brokers, [Event-Driven Architectures](/en/ch5#sec_encoding_dataflow_msg)
- renewable energy, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
- repeatable reads (transaction isolation), [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- replicas, [Single-Leader Replication](/en/ch6#sec_replication_leader)
- replication, [Replication](/en/ch6#ch_replication)-[Summary](/en/ch6#summary), [Glossary](/en/glossary)
  - and durability, [Durability](/en/ch8#durability)
  - conflict resolution and, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
  - consistency properties, [Problems with Replication Lag](/en/ch6#sec_replication_lag)-[Solutions for Replication Lag](/en/ch6#id131)
    - consistent prefix reads, [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix)
    - monotonic reads, [Monotonic Reads](/en/ch6#sec_replication_monotonic_reads)
    - reading your own writes, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - in distributed filesystems, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - leaderless, [Leaderless Replication](/en/ch6#sec_replication_leaderless)-[Version vectors](/en/ch6#version-vectors)
    - detecting concurrent writes, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)-[Version vectors](/en/ch6#version-vectors)
    - limitations of quorum consistency, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations)-[Monitoring staleness](/en/ch6#monitoring-staleness), [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
  - monitoring staleness, [Monitoring staleness](/en/ch6#monitoring-staleness)
  - multi-leader, [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)-[Types of conflict](/en/ch6#sec_replication_write_conflicts)
    - across multiple regions, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc), [The Cost of Linearizability](/en/ch10#sec_linearizability_cost)
    - conflict resolution, [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)-[Types of conflict](/en/ch6#sec_replication_write_conflicts)
    - replication topologies, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)-[Problems with different topologies](/en/ch6#problems-with-different-topologies)
  - reasons for using, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Replication](/en/ch6#ch_replication)
  - sharding and, [Sharding](/en/ch7#ch_sharding)
  - single-leader, [Single-Leader Replication](/en/ch6#sec_replication_leader)-[Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
    - failover, [Leader failure: Failover](/en/ch6#leader-failure-failover)
    - implementation of replication logs, [Implementation of Replication Logs](/en/ch6#sec_replication_implementation)-[Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
    - relation to consensus, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus), [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
    - setting up new followers, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
    - synchronous versus asynchronous, [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async)
  - state machine replication, [Statement-based replication](/en/ch6#statement-based-replication), [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs), [Using shared logs](/en/ch10#sec_consistency_smr), [Databases and Streams](/en/ch12#sec_stream_databases)
    - event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
    - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - using consensus, [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
  - using erasure coding, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - using object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - versus backups, [Replication](/en/ch6#ch_replication)
  - with heterogeneous data systems, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
- replication logs (see logs)
- representations of data (see data models)
- reprocessing data, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing), [Unifying batch and stream processing](/en/ch13#id338)
  - (see also evolvability)
  - from log-based messaging, [Replaying old messages](/en/ch12#sec_stream_replay)
- request hedging, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
- request identifiers, [Uniquely identifying requests](/en/ch13#id355), [Multi-shard request processing](/en/ch13#id360)
- request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - approaches to, [Request Routing](/en/ch7#sec_sharding_routing)
- residence laws for data, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed), [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- resilient systems, [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
  - (see also fault tolerance)
- resource isolation, [Cloud Computing Versus Supercomputing](/en/ch1#id17), [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- resource limits, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- response time
  - as performance metric, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Batch Processing](/en/ch11#ch_batch)
  - guarantees on, [Response time guarantees](/en/ch9#sec_distributed_clocks_realtime)
  - impact on users, [Average, Median, and Percentiles](/en/ch2#id24)
  - in replicated systems, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - latency versus, [Latency and Response Time](/en/ch2#id23)
  - mean and percentiles, [Average, Median, and Percentiles](/en/ch2#id24)
  - user experience, [Average, Median, and Percentiles](/en/ch2#id24)
- responsibility and accountability, [Responsibility and Accountability](/en/ch14#id371)
- REST (Representational State Transfer), [Web services](/en/ch5#sec_web_services)
  - (see also services)
- Restate (workflow engine), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- RethinkDB (database)
  - join support, [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
  - key-range sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
- retry storm, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Software faults](/en/ch2#software-faults)
- reverse ETL, [Beyond the data lake](/en/ch1#beyond-the-data-lake)
- Riak (database)
  - CRDT support, [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts), [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
  - dotted version vectors, [Version vectors](/en/ch6#version-vectors)
  - gossip protocol, [Request Routing](/en/ch7#sec_sharding_routing)
  - hash sharding, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
  - leaderless replication, [Leaderless Replication](/en/ch6#sec_replication_leaderless)
  - linearizability, lack of, [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
  - multi-region support, [Multi-region operation](/en/ch6#multi-region-operation)
  - rebalancing, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
  - secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - sloppy quorums, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - vnodes (sharding), [Sharding](/en/ch7#ch_sharding)
- ring buffers, [Disk space usage](/en/ch12#sec_stream_disk_usage)
- RisingWave (database)
  - incremental view maintenance, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
- rockets, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
- RocksDB (storage engine), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - as embedded storage engine, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - leveled compaction, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- rollbacks (transactions), [Transactions](/en/ch8#ch_transactions)
- rolling upgrades, [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy), [Encoding and Evolution](/en/ch5#ch_encoding), [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
  - in a multitenant system, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- routing (see request routing)
- row-based replication, [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
- row-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)
- rowhammer (memory corruption), [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
- RPCs (see remote procedure calls)
- rules (Datalog), [Datalog: Recursive Relational Queries](/en/ch3#id62)
- Rust (programming language)
  - memory management, [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)

### S

- S3 (object storage), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Batch Processing](/en/ch11#ch_batch), [Distributed Filesystems](/en/ch11#sec_batch_dfs), [Object Stores](/en/ch11#id277)
  - checking data integrity, [Don't just blindly trust what they promise](/en/ch13#id364)
  - conditional writes, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
  - object size, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
  - S3 Express One Zone, [Object Stores](/en/ch11#id277)
  - use in MapReduce, [MapReduce](/en/ch11#sec_batch_mapreduce)
  - workflow example, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
- SaaS (see software as a service (SaaS))
- safety and liveness properties, [Safety and liveness](/en/ch9#sec_distributed_safety_liveness)
  - in consensus algorithms, [Single-value consensus](/en/ch10#single-value-consensus)
  - in transactions, [Transactions](/en/ch8#ch_transactions)
- sagas (see compensating transactions)
- Samza (stream processor), [Stream analytics](/en/ch12#id318)
- SAP HANA (database), [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
- scalability, [Scalability](/en/ch2#sec_introduction_scalability)-[Principles for Scalability](/en/ch2#id35), [A Philosophy of Streaming Systems](/en/ch13#ch_philosophy)
  - auto-scaling, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
  - by sharding, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
  - describing load, [Describing Load](/en/ch2#id33)
  - describing performance, [Describing Performance](/en/ch2#sec_introduction_percentiles)
  - linear, [Describing Load](/en/ch2#id33)
  - principles for, [Principles for Scalability](/en/ch2#id35)
  - replication and, [Problems with Replication Lag](/en/ch6#sec_replication_lag)
  - scaling up versus scaling out, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing)
- scaling out, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing)
  - (see also shared-nothing architecture)
  - by sharding, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
- scaling up, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing)
- SCD (slowly changing dimension), [Time-dependence of joins](/en/ch12#sec_stream_join_time)
- scheduling
  - algorithms, [Resource Allocation](/en/ch11#id279)
  - batch jobs, [Distributed Job Orchestration](/en/ch11#id278)-[Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - gang scheduling, [Resource Allocation](/en/ch11#id279)
- schema-on-read, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
  - comparison to evolvable schema, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- schema-on-write, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
- schemaless databases (see schema-on-read)
- schemas, [Glossary](/en/glossary)
  - Avro, [Avro](/en/ch5#sec_encoding_avro)-[Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
    - reader determining writer's schema, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
    - schema evolution, [The writer's schema and the reader's schema](/en/ch5#the-writers-schema-and-the-readers-schema)
  - dynamically generated, [Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
  - evolution of, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
    - affecting application code, [Encoding and Evolution](/en/ch5#ch_encoding)
    - compatibility checking, [But what is the writer's schema?](/en/ch5#but-what-is-the-writers-schema)
    - in databases, [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)-[Archival storage](/en/ch5#archival-storage)
    - in service calls, [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - flexibility in document model, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
  - for analytics, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
  - for JSON and XML, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json), [JSON Schema](/en/ch5#json-schema)
  - generation and migration using ORMs, [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm)
  - merits of, [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
  - migration, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
  - Protocol Buffers, [Protocol Buffers](/en/ch5#sec_encoding_protobuf)-[Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
    - schema evolution, [Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
  - schema migration on railways, [Reprocessing data for application evolution](/en/ch13#sec_future_reprocessing)
  - traditional approach to design, fallacy in, [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views)
- scientific computing, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- scikit-learn (Python library), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- ScyllaDB (database)
  - cluster metadata, [Request Routing](/en/ch7#sec_sharding_routing)
  - consistency level ANY, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
  - hash-range sharding, [Sharding by Hash of Key](/en/ch7#sec_sharding_hash), [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - last-write-wins conflict resolution, [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
  - leaderless replication, [Leaderless Replication](/en/ch6#sec_replication_leaderless)
  - lightweight transactions, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - linearizability, lack of, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - log-structured storage, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - multi-region support, [Multi-region operation](/en/ch6#multi-region-operation)
  - use of clocks, [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations), [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - vnodes (sharding), [Sharding](/en/ch7#ch_sharding)
- search engines (see full-text search)
- searching on streams, [Search on streams](/en/ch12#id320)
- secondaries (see leader-based replication)
- secondary indexes, [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn), [Glossary](/en/glossary)
  - for many-to-many relationships, [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many)
  - problems with dual writes, [Keeping Systems in Sync](/en/ch12#sec_stream_sync), [Reasoning about dataflows](/en/ch13#id443)
  - sharding, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)-[Global Secondary Indexes](/en/ch7#id167), [Summary](/en/ch7#summary)
    - global, [Global Secondary Indexes](/en/ch7#id167)
    - index maintenance, [Maintaining derived state](/en/ch13#id446)
    - local, [Local Secondary Indexes](/en/ch7#id166)
  - updating, transaction isolation and, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
- secondary sort (MapReduce), [JOIN and GROUP BY](/en/ch11#sec_batch_join)
- sed (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis)
- self-hosting, [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)
  - data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- self-joins, [Summary](/en/ch12#id332)
- self-validating systems, [Don't just blindly trust what they promise](/en/ch13#id364)
- semantic search, [Vector Embeddings](/en/ch4#id92)
- semantic similarity, [Vector Embeddings](/en/ch4#id92)
- semantic web, [Triple-Stores and SPARQL](/en/ch3#id59)
- semi-synchronous replication, [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async)
- sequential writes (access pattern), [Sequential versus random writes](/en/ch4#sidebar_sequential)
- serializability, [Isolation](/en/ch8#sec_transactions_acid_isolation), [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels), [Serializability](/en/ch8#sec_transactions_serializability)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation), [Glossary](/en/glossary)
  - linearizability versus, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - pessimistic versus optimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
  - serial execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)-[Summary of serial execution](/en/ch8#summary-of-serial-execution)
    - sharding, [Sharding](/en/ch8#sharding)
    - using stored procedures, [Encapsulating transactions in stored procedures](/en/ch8#encapsulating-transactions-in-stored-procedures), [Using shared logs](/en/ch10#sec_consistency_smr)
  - serializable snapshot isolation (SSI), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - detecting stale MVCC reads, [Detecting stale MVCC reads](/en/ch8#detecting-stale-mvcc-reads)
    - detecting writes that affect prior reads, [Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
    - distributed execution, [Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
    - performance of SSI, [Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - preventing write skew, [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)-[Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
  - strict serializability, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
    - timeliness vs. integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - two-phase locking (2PL), [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)-[Index-range locks](/en/ch8#sec_transactions_2pl_range)
    - index-range locks, [Index-range locks](/en/ch8#sec_transactions_2pl_range)
    - performance, [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
- Serializable (Java), [Language-Specific Formats](/en/ch5#id96)
- serialization, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
  - (see also encoding)
- serverless, [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
- service discovery, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery), [Request Routing](/en/ch7#sec_sharding_routing), [Service discovery](/en/ch10#service-discovery)
  - registration, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
  - using DNS, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery), [Request Routing](/en/ch7#sec_sharding_routing), [Service discovery](/en/ch10#service-discovery)
- service level agreements (SLAs), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla), [Describing Load](/en/ch2#id33)
- service mesh, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
- Service Organization Control (SOC), [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- service time, [Latency and Response Time](/en/ch2#id23)
- service-oriented architecture (SOA), [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
  - (see also services)
- services, [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)-[Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
  - microservices, [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
    - causal dependencies across services, [The limits of total ordering](/en/ch13#id335)
    - loose coupling, [Making unbundling work](/en/ch13#sec_future_unbundling_favor)
  - relation to batch/stream processors, [Batch Processing](/en/ch11#ch_batch), [Stream processors and services](/en/ch13#id345)
  - remote procedure calls (RPCs), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)-[Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
    - issues with, [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
  - similarity to databases, [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)
  - web services, [Web services](/en/ch5#sec_web_services)
- session windows (stream processing), [Types of windows](/en/ch12#id324)
  - (see also windows)
- sharding, [Sharding](/en/ch7#ch_sharding)-[Summary](/en/ch7#summary), [Glossary](/en/glossary)
  - and consensus, [Using shared logs](/en/ch10#sec_consistency_smr)
  - and replication, [Sharding](/en/ch7#ch_sharding)
  - distributed transactions across shards, [Distributed Transactions](/en/ch8#sec_transactions_distributed)
  - hot shards, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
  - in batch processing, [Batch Processing](/en/ch11#ch_batch)
  - key-range splitting, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
  - multi-shard operations, [Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
    - enforcing constraints, [Multi-shard request processing](/en/ch13#id360)
    - secondary index maintenance, [Maintaining derived state](/en/ch13#id446)
  - of key-value data, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)-[Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
    - by key range, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
    - skew and hot spots, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
  - origin of the term, [Sharding](/en/ch7#ch_sharding)
  - partition key, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons), [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
  - rebalancing
    - of key-range sharded data, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
  - rebalancing shards, [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)-[Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
    - automatic or manual rebalancing, [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
    - problems with hash mod N, [Hash modulo number of nodes](/en/ch7#hash-modulo-number-of-nodes)
    - using fixed number of shards, [Fixed number of shards](/en/ch7#fixed-number-of-shards)
    - using N shards per node, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - secondary indexes, [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)-[Global Secondary Indexes](/en/ch7#id167)
    - global, [Global Secondary Indexes](/en/ch7#id167)
    - local, [Local Secondary Indexes](/en/ch7#id166)
  - serial execution of transactions and, [Sharding](/en/ch8#sharding)
  - sorting sharded data, [Shuffling Data](/en/ch11#sec_shuffle)
- shared logs, [Consensus in Practice](/en/ch10#sec_consistency_total_order)-[Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus), [The limits of total ordering](/en/ch13#id335), [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
  - algorithms, [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - for event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - for messaging, [Log-based Message Brokers](/en/ch12#sec_stream_log)-[Replaying old messages](/en/ch12#sec_stream_replay)
  - relation to consensus, [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs)
  - using, [Using shared logs](/en/ch10#sec_consistency_smr)
- shared mode (locks), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
- shared-disk architecture, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- shared-memory architecture, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing)
- shared-nothing architecture, [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing), [Glossary](/en/glossary)
  - distributed filesystems, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
    - (see also distributed filesystems)
  - use of network, [Unreliable Networks](/en/ch9#sec_distributed_networks)
- sharks
  - biting undersea cables, [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
  - counting (example), [Query languages for documents](/en/ch3#query-languages-for-documents)
- shredding (deletion) (see crypto-shredding)
- shredding (in columnar encoding), [Column-Oriented Storage](/en/ch4#sec_storage_column)
- shredding (in relational model), [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)
- shuffle (batch processing), [Shuffling Data](/en/ch11#sec_shuffle)
- siblings (concurrent values), [Manual conflict resolution](/en/ch6#manual-conflict-resolution), [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship), [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
  - (see also conflicts)
- silo, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- similarity search
  - edit distance, [Full-Text Search](/en/ch4#sec_storage_full_text)
  - genome data, [Summary](/en/ch3#summary)
- simplicity, [Simplicity: Managing Complexity](/en/ch2#id38)
- Singer, [Data Warehousing](/en/ch1#sec_introduction_dwh)
- single-instruction-multi-data (SIMD) instructions, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- single-leader replication (see leader-based replication)
- single-threaded execution, [Atomic write operations](/en/ch8#atomic-write-operations), [Actual Serial Execution](/en/ch8#sec_transactions_serial)
  - in stream processing, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Concurrency control](/en/ch12#sec_stream_concurrency), [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
- SingleStore (database)
  - in-memory storage, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
- site reliability engineer, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- size-tiered compaction, [Compaction strategies](/en/ch4#sec_storage_lsm_compaction), [Disk space usage](/en/ch4#disk-space-usage)
- skew, [Glossary](/en/glossary)
  - clock skew, [Relying on Synchronized Clocks](/en/ch9#sec_distributed_clocks_relying)-[Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval), [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - in transaction isolation
    - read skew, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation), [Summary](/en/ch8#summary)
    - write skew, [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts), [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)-[Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
      - (see also write skew)
  - meanings of, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - unbalanced workload, [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
    - compensating for, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
    - due to celebrities, [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
    - for time-series data, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
- skip list, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
- SLA (see service level agreements)
- Slack (group chat)
  - GraphQL example, [GraphQL](/en/ch3#id63)
- SlateDB (database), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- sliding windows (stream processing), [Types of windows](/en/ch12#id324)
  - (see also windows)
- sloppy quorums, [Single-Leader Versus Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
- slowly changing dimension (data warehouses), [Time-dependence of joins](/en/ch12#sec_stream_join_time)
- smearing (leap seconds adjustments), [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- snapshots (databases)
  - as backups, [Replication](/en/ch6#ch_replication)
  - computing derived data, [Creating an index](/en/ch13#id340)
  - in change data capture, [Initial snapshot](/en/ch12#sec_stream_cdc_snapshot)
  - serializable snapshot isolation (SSI), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
  - setting up a new replica, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - snapshot isolation and repeatable read, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)-[Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - implementing with MVCC, [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl)
    - indexes and MVCC, [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
    - visibility rules, [Visibility rules for observing a consistent snapshot](/en/ch8#sec_transactions_mvcc_visibility)
  - synchronized clocks for global snapshots, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
- Snowflake (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native), [Layering of cloud services](/en/ch1#layering-of-cloud-services), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Batch Processing](/en/ch11#ch_batch)
  - column-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - handling writes, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
  - sharding and clustering, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - Snowpark, [Query languages](/en/ch11#sec_batch_query_lanauges)
- Snowflake (ID generator), [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
- snowflake schemas, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- SOAP (web services), [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
- SOC2 (see Service Organization Control (SOC))
- social graph, [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- society
  - responsibility towards, [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance), [Legislation and Self-Regulation](/en/ch14#sec_future_legislation)
- sociotechnical systems, [Humans and Reliability](/en/ch2#id31)
- software as a service (SaaS), [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs), [Cloud Versus Self-Hosting](/en/ch1#sec_introduction_cloud)
  - ETL from, [Data Warehousing](/en/ch1#sec_introduction_dwh)
  - multitenancy, [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- software bugs, [Software faults](/en/ch2#software-faults)
  - maintaining integrity, [Maintaining integrity in the face of software bugs](/en/ch13#id455)
- solar storm, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
- solid state drives (SSDs)
  - access patterns, [Sequential versus random writes](/en/ch4#sidebar_sequential)
  - compared to object storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - detecting corruption, [The end-to-end argument](/en/ch13#sec_future_e2e_argument), [Don't just blindly trust what they promise](/en/ch13#id364)
  - failure rate, [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
  - faults in, [Durability](/en/ch8#durability)
  - firmware bugs, [Software faults](/en/ch2#software-faults)
  - read throughput, [Read performance](/en/ch4#read-performance)
  - sequential vs. random writes, [Sequential versus random writes](/en/ch4#sidebar_sequential)
- Solr (search server)
  - local secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - use of Lucene, [Full-Text Search](/en/ch4#sec_storage_full_text)
- sort (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis), [Sorting Versus In-memory Aggregation](/en/ch11#id275), [Distributed Job Orchestration](/en/ch11#id278)
- sort-merge joins (MapReduce), [JOIN and GROUP BY](/en/ch11#sec_batch_join)
- Sorted String Tables (see SSTables)
- sorting
  - sort order in column storage, [Sort Order in Column Storage](/en/ch4#sort-order-in-column-storage)
- source of truth (see systems of record)
- Spanner (database)
  - consistency model, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - data locality, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - in the cloud, [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
  - snapshot isolation using clocks, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - transactions, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
  - TrueTime API, [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)
- Spark (processing framework), [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native), [Batch Processing](/en/ch11#ch_batch), [Dataflow Engines](/en/ch11#sec_batch_dataflow)
  - cost efficiency, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes), [DataFrames](/en/ch11#id287)
  - fault tolerance, [Handling Faults](/en/ch11#id281)
  - for data warehouses, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [Coordination Services](/en/ch10#sec_consistency_coordination)
  - MLlib, [Machine Learning](/en/ch11#id290)
  - query optimizer, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - shuffling data, [Shuffling Data](/en/ch11#sec_shuffle)
  - Spark Streaming, [Stream analytics](/en/ch12#id318)
    - microbatching, [Microbatching and checkpointing](/en/ch12#id329)
  - streaming SQL support, [Complex event processing](/en/ch12#id317)
  - use for ETL, [Extract–Transform–Load (ETL)](/en/ch11#sec_batch_etl_usage)
- SPARQL (query language), [The SPARQL query language](/en/ch3#the-sparql-query-language)
- sparse index, [The SSTable file format](/en/ch4#the-sstable-file-format)
- sparse matrices, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- split brain, [Leader failure: Failover](/en/ch6#leader-failure-failover), [Request Routing](/en/ch7#sec_sharding_routing), [Glossary](/en/glossary)
  - enforcing constraints, [Uniqueness constraints require consensus](/en/ch13#id452)
  - in consensus algorithms, [Consensus](/en/ch10#sec_consistency_consensus), [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
  - preventing, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - using fencing tokens to avoid, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)-[Fencing with multiple replicas](/en/ch9#fencing-with-multiple-replicas)
- spot instances, [Handling Faults](/en/ch11#id281)
- spreadsheets, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - dataflow programming, [Designing Applications Around Dataflow](/en/ch13#sec_future_dataflow)
  - pivot table, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- SQL (Structured Query Language), [Simplicity: Managing Complexity](/en/ch2#id38), [Relational Model versus Document Model](/en/ch3#sec_datamodels_history), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - for analytics, [Data Warehousing](/en/ch1#sec_introduction_dwh), [Column-Oriented Storage](/en/ch4#sec_storage_column)
  - graph queries in, [Graph Queries in SQL](/en/ch3#id58)
  - isolation levels standard, issues with, [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - joins, [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)
  - résumé (example), [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
  - social network home timelines (example), [Representing Users, Posts, and Follows](/en/ch2#id20)
  - SQL injection vulnerability, [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
  - statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)
  - stored procedures, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - support in batch processing frameworks, [Batch Processing](/en/ch11#ch_batch)
  - views, [Datalog: Recursive Relational Queries](/en/ch3#id62)
- SQL Server (database)
  - archiving WAL to object stores, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
  - change data capture, [Implementing change data capture](/en/ch12#id307)
  - data warehousing support, [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
  - distributed transaction support, [XA transactions](/en/ch8#xa-transactions)
  - leader-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - preventing lost updates, [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
  - preventing write skew, [Characterizing write skew](/en/ch8#characterizing-write-skew), [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - read committed isolation, [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
  - serializable isolation, [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - T-SQL language, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
- SQLite (database), [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems), [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - archiving WAL to object stores, [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- SRE (site reliability engineer), [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- SSDs (see solid state drives)
- SSTables (storage format), [The SSTable file format](/en/ch4#the-sstable-file-format)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
  - constructing and maintaining, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - making LSM-Tree from, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
- staged rollout (see rolling upgrades)
- staleness (old data), [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - cross-channel timing dependencies, [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
  - in leaderless databases, [Writing to the Database When a Node Is Down](/en/ch6#id287)
  - in multi-version concurrency control, [Detecting stale MVCC reads](/en/ch8#detecting-stale-mvcc-reads)
  - monitoring for, [Monitoring staleness](/en/ch6#monitoring-staleness)
  - of client state, [Pushing state changes to clients](/en/ch13#id348)
  - versus linearizability, [Linearizability](/en/ch10#sec_consistency_linearizability)
  - versus timeliness, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
- standbys (see leader-based replication)
- star replication topologies, [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)
- star schemas, [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
- Star Wars analogy (event time versus processing time), [Event time versus processing time](/en/ch12#id322)
- starvation (scheduling), [Resource Allocation](/en/ch11#id279)
- state
  - derived from log of immutable events, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)
  - interplay between state changes and application code, [Dataflow: Interplay between state changes and application code](/en/ch13#id450)
  - maintaining derived state, [Maintaining derived state](/en/ch13#id446)
  - maintenance by stream processor in stream-stream joins, [Stream-stream join (window join)](/en/ch12#id440)
  - observing derived state, [Observing Derived State](/en/ch13#sec_future_observing)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
  - rebuilding after stream processor failure, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - separation of application code and, [Separation of application code and state](/en/ch13#id344)
- state machine replication, [Statement-based replication](/en/ch6#statement-based-replication), [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs), [Using shared logs](/en/ch10#sec_consistency_smr), [Databases and Streams](/en/ch12#sec_stream_databases)
  - event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- stateless systems, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)
- statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication)
  - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- statically typed languages
  - analogy to schema-on-write, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
- statistical and numerical algorithms, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- StatsD (metrics aggregator), [Direct messaging from producers to consumers](/en/ch12#id296)
- stock market feeds, [Direct messaging from producers to consumers](/en/ch12#id296)
- STONITH (Shoot The Other Node In The Head), [Leader failure: Failover](/en/ch6#leader-failure-failover)
  - problems with, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
- stop-the-world (see garbage collection)
- storage
  - composing data storage technologies, [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
- Storage Area Network (SAN), [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- storage engines, [Storage and Retrieval](/en/ch4#ch_storage)-[Summary](/en/ch4#summary)
  - column-oriented, [Column-Oriented Storage](/en/ch4#sec_storage_column)-[Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
    - column compression, [Column Compression](/en/ch4#sec_storage_column_compression)
    - defined, [Column-Oriented Storage](/en/ch4#sec_storage_column)
    - Parquet, [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses), [Column-Oriented Storage](/en/ch4#sec_storage_column), [Archival storage](/en/ch5#archival-storage)
    - sort order in, [Sort Order in Column Storage](/en/ch4#sort-order-in-column-storage)
    - versus wide-column model, [Column Compression](/en/ch4#sec_storage_column_compression)
    - writing to, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
  - in-memory storage, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
    - durability, [Durability](/en/ch8#durability)
  - row-oriented, [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)-[Keeping everything in memory](/en/ch4#sec_storage_inmemory)
    - B-trees, [B-Trees](/en/ch4#sec_storage_b_trees)-[B-tree variants](/en/ch4#b-tree-variants)
    - comparing B-trees and LSM-trees, [Comparing B-Trees and LSM-Trees](/en/ch4#sec_storage_btree_lsm_comparison)-[Disk space usage](/en/ch4#disk-space-usage)
    - defined, [Column-Oriented Storage](/en/ch4#sec_storage_column)
    - log-structured, [Log-Structured Storage](/en/ch4#sec_storage_log_structured)-[Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
- stored procedures, [Encapsulating transactions in stored procedures](/en/ch8#encapsulating-transactions-in-stored-procedures)-[Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs), [Glossary](/en/glossary)
  - and shared logs, [Using shared logs](/en/ch10#sec_consistency_smr)
  - pros and cons of, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - similarity to stream processors, [Application code as a derivation function](/en/ch13#sec_future_dataflow_derivation)
- Storm (stream processor), [Stream analytics](/en/ch12#id318)
  - distributed RPC, [Event-Driven Architectures and RPC](/en/ch12#sec_stream_actors_drpc), [Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
  - Trident state handling, [Idempotence](/en/ch12#sec_stream_idempotence)
- straggler events, [Handling straggler events](/en/ch12#id323)
- Stream Control Transmission Protocol (SCTP), [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
- stream processing, [Processing Streams](/en/ch12#sec_stream_processing)-[Summary](/en/ch12#id332), [Glossary](/en/glossary)
  - accessing external services within job, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins), [Microbatching and checkpointing](/en/ch12#id329), [Idempotence](/en/ch12#sec_stream_idempotence), [Exactly-once execution of an operation](/en/ch13#id353)
  - combining with batch processing, [Unifying batch and stream processing](/en/ch13#id338)
  - comparison to batch processing, [Processing Streams](/en/ch12#sec_stream_processing)
  - complex event processing (CEP), [Complex event processing](/en/ch12#id317)
  - fault tolerance, [Fault Tolerance](/en/ch12#sec_stream_fault_tolerance)-[Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
    - atomic commit, [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)
    - idempotence, [Idempotence](/en/ch12#sec_stream_idempotence)
    - microbatching and checkpointing, [Microbatching and checkpointing](/en/ch12#id329)
    - rebuilding state after a failure, [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - for data integration, [Batch and Stream Processing](/en/ch13#sec_future_batch_streaming)-[Unifying batch and stream processing](/en/ch13#id338)
  - for event sourcing, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - maintaining derived state, [Maintaining derived state](/en/ch13#id446)
  - maintenance of materialized views, [Maintaining materialized views](/en/ch12#sec_stream_mat_view)
  - messaging systems (see messaging systems)
  - reasoning about time, [Reasoning About Time](/en/ch12#sec_stream_time)-[Types of windows](/en/ch12#id324)
    - event time versus processing time, [Event time versus processing time](/en/ch12#id322), [Microbatching and checkpointing](/en/ch12#id329), [Unifying batch and stream processing](/en/ch13#id338)
    - knowing when window is ready, [Handling straggler events](/en/ch12#id323)
    - types of windows, [Types of windows](/en/ch12#id324)
  - relation to databases (see streams)
  - relation to services, [Stream processors and services](/en/ch13#id345)
  - relationship to batch processing, [Batch Processing](/en/ch11#ch_batch)
  - search on streams, [Search on streams](/en/ch12#id320)
  - single-threaded execution, [Logs compared to traditional messaging](/en/ch12#sec_stream_logs_vs_messaging), [Concurrency control](/en/ch12#sec_stream_concurrency)
  - stream analytics, [Stream analytics](/en/ch12#id318)
  - stream joins, [Stream Joins](/en/ch12#sec_stream_joins)-[Time-dependence of joins](/en/ch12#sec_stream_join_time)
    - stream-stream join, [Stream-stream join (window join)](/en/ch12#id440)
    - stream-table join, [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
    - table-table join, [Table-table join (materialized view maintenance)](/en/ch12#id326)
    - time-dependence of, [Time-dependence of joins](/en/ch12#sec_stream_join_time)
- streams, [Stream Processing](/en/ch12#ch_stream)-[Replaying old messages](/en/ch12#sec_stream_replay)
  - end-to-end, pushing events to clients, [End-to-end event streams](/en/ch13#id349)
  - messaging systems (see messaging systems)
  - processing (see stream processing)
  - relation to databases, [Databases and Streams](/en/ch12#sec_stream_databases)-[Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
    - (see also changelogs)
    - API support for change streams, [API support for change streams](/en/ch12#sec_stream_change_api)
    - change data capture, [Change Data Capture](/en/ch12#sec_stream_cdc)-[API support for change streams](/en/ch12#sec_stream_change_api)
    - derivative of state by time, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)
    - event sourcing, [Change data capture versus event sourcing](/en/ch12#sec_stream_event_sourcing)
    - keeping systems in sync, [Keeping Systems in Sync](/en/ch12#sec_stream_sync)
    - philosophy of immutable events, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)-[Limitations of immutability](/en/ch12#sec_stream_immutability_limitations)
  - topics, [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- strict serializability, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - timeliness vs. integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
- striping (in columnar encoding), [Column-Oriented Storage](/en/ch4#sec_storage_column)
- strong consistency (see linearizability)
- strong eventual consistency, [Automatic conflict resolution](/en/ch6#automatic-conflict-resolution)
- strong one-copy serializability, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
- subjects, predicates, and objects (in triple-stores), [Triple-Stores and SPARQL](/en/ch3#id59)
- subscribers (message streams), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
  - (see also consumers)
- supercomputers, [Cloud Computing Versus Supercomputing](/en/ch1#id17)
- Superset (data visualization software), [Analytics](/en/ch11#sec_batch_olap)
- surveillance, [Surveillance](/en/ch14#id374)
  - (see also privacy)
- sushi principle, [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
- sustainability, [Distributed Versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
- Swagger (service definition format), [Web services](/en/ch5#sec_web_services)
- swapping to disk (see virtual memory)
- Swift (programming language)
  - memory management, [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- sync engines, [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients)-[Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
  - examples of, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
  - for local-first software, [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- synchronous networks, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks), [Glossary](/en/glossary)
  - comparison to asynchronous networks, [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
  - system model, [System Model and Reality](/en/ch9#sec_distributed_system_model)
- synchronous replication, [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async), [Glossary](/en/glossary)
  - with multiple leaders, [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)
- system administrator, [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- system models, [Knowledge, Truth, and Lies](/en/ch9#sec_distributed_truth), [System Model and Reality](/en/ch9#sec_distributed_system_model)-[Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - assumptions in, [Trust, but Verify](/en/ch13#sec_future_verification)
  - correctness of algorithms, [Defining the correctness of an algorithm](/en/ch9#defining-the-correctness-of-an-algorithm)
  - mapping to the real world, [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
  - safety and liveness, [Safety and liveness](/en/ch9#sec_distributed_safety_liveness)
- systems of record, [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived), [Glossary](/en/glossary)
  - change data capture, [Implementing change data capture](/en/ch12#id307), [Reasoning about dataflows](/en/ch13#id443)
  - event logs, [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
  - treating event log as, [State, Streams, and Immutability](/en/ch12#sec_stream_immutability)
- systems thinking, [Feedback Loops](/en/ch14#id372)

### T

- t-digest (algorithm), [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- table-table joins, [Table-table join (materialized view maintenance)](/en/ch12#id326)
- Tableau (data visualization software), [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp), [Analytics](/en/ch11#sec_batch_olap)
- tail (Unix tool), [Using logs for message storage](/en/ch12#id300)
- tail latency (see latency)
- tail vertex (property graphs), [Property Graphs](/en/ch3#id56)
- task (workflows) (see workflow engines)
- TCP (Transmission Control Protocol), [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
  - comparison to circuit switching, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
  - comparison to UDP, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - connection failures, [Detecting Faults](/en/ch9#id307)
  - flow control, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing), [Messaging Systems](/en/ch12#sec_stream_messaging)
  - packet checksums, [Weak forms of lying](/en/ch9#weak-forms-of-lying), [The end-to-end argument](/en/ch13#sec_future_e2e_argument), [Trust, but Verify](/en/ch13#sec_future_verification)
  - reliability and duplicate suppression, [Duplicate suppression](/en/ch13#id354)
  - retransmission timeouts, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - use for transaction sessions, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
- Temporal (workflow engine), [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
- TensorFlow (machine learning library), [Machine Learning](/en/ch11#id290)
- Teradata (database), [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- term-partitioned indexes (see global secondary indexes)
- termination (consensus), [Single-value consensus](/en/ch10#single-value-consensus), [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
- testing, [Humans and Reliability](/en/ch2#id31)
- thrashing (out of memory), [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- threads (concurrency)
  - actor model, [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks), [Event-Driven Architectures and RPC](/en/ch12#sec_stream_actors_drpc)
    - (see also event-driven architecture)
  - atomic operations, [Atomicity](/en/ch8#sec_transactions_acid_atomicity)
  - background threads, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
  - execution pauses, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable), [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - memory barriers, [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
  - preemption, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - single (see single-threaded execution)
- three-phase commit, [Three-phase commit](/en/ch8#three-phase-commit)
- three-way relationships, [Property Graphs](/en/ch3#id56)
- Thrift (data format), [Protocol Buffers](/en/ch5#sec_encoding_protobuf)
- throughput, [Describing Performance](/en/ch2#sec_introduction_percentiles), [Describing Load](/en/ch2#id33), [Batch Processing](/en/ch11#ch_batch)
- TIBCO, [Message brokers](/en/ch5#message-brokers)
  - Enterprise Message Service, [Message brokers compared to databases](/en/ch12#id297)
  - StreamBase (stream analytics), [Complex event processing](/en/ch12#id317)
- TiDB (database)
  - consensus-based replication, [Single-Leader Replication](/en/ch6#sec_replication_leader)
  - regions (sharding), [Sharding](/en/ch7#ch_sharding)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - serving derived data, [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
  - sharded secondary indexes, [Global Secondary Indexes](/en/ch7#id167)
  - snapshot isolation support, [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
  - timestamp oracle, [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
  - transactions, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
  - use of model-checking, [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- tiered storage, [Setting Up New Followers](/en/ch6#sec_replication_new_replica), [Disk space usage](/en/ch12#sec_stream_disk_usage)
- TigerBeetle (database), [Summary](/en/ch3#summary)
  - deterministic simulation testing, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- TigerGraph (database)
  - GSQL language, [Graph Queries in SQL](/en/ch3#id58)
- Tigris (object storage), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- TileDB (database), [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- time
  - concurrency and, [The "happens-before" relation and concurrency](/en/ch6#sec_replication_happens_before)
  - cross-channel timing dependencies, [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
  - in distributed systems, [Unreliable Clocks](/en/ch9#sec_distributed_clocks)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
    - (see also clocks)
    - clock synchronization and accuracy, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
    - relying on synchronized clocks, [Relying on Synchronized Clocks](/en/ch9#sec_distributed_clocks_relying)-[Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - process pauses, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)-[Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
  - reasoning about, in stream processors, [Reasoning About Time](/en/ch12#sec_stream_time)-[Types of windows](/en/ch12#id324)
    - event time versus processing time, [Event time versus processing time](/en/ch12#id322), [Microbatching and checkpointing](/en/ch12#id329), [Unifying batch and stream processing](/en/ch13#id338)
    - knowing when window is ready, [Handling straggler events](/en/ch12#id323)
    - timestamp of events, [Whose clock are you using, anyway?](/en/ch12#id438)
    - types of windows, [Types of windows](/en/ch12#id324)
  - system models for distributed systems, [System Model and Reality](/en/ch9#sec_distributed_system_model)
  - time-dependence in stream joins, [Time-dependence of joins](/en/ch12#sec_stream_join_time)
- time series data
  - as DataFrames, [DataFrames, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
  - column-oriented storage, [Column-Oriented Storage](/en/ch4#sec_storage_column)
- time-of-day clocks, [Time-of-day clocks](/en/ch9#time-of-day-clocks)
  - hybrid logical clocks, [Hybrid logical clocks](/en/ch10#hybrid-logical-clocks)
- timeliness, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - coordination-avoiding data systems, [Coordination-avoiding data systems](/en/ch13#id454)
  - correctness of dataflow systems, [Correctness of dataflow systems](/en/ch13#id453)
- timeouts, [Unreliable Networks](/en/ch9#sec_distributed_networks), [Glossary](/en/glossary)
  - dynamic configuration of, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - for failover, [Leader failure: Failover](/en/ch6#leader-failure-failover)
  - length of, [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
- TimescaleDB (database), [Column-Oriented Storage](/en/ch4#sec_storage_column)
- timestamps, [Logical Clocks](/en/ch10#sec_consistency_timestamps)
  - assigning to events in stream processing, [Whose clock are you using, anyway?](/en/ch12#id438)
  - for read-after-write consistency, [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
  - for transaction ordering, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
  - insufficiency for enforcing constraints, [Enforcing constraints using logical clocks](/en/ch10#enforcing-constraints-using-logical-clocks)
  - key range sharding by, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - Lamport, [Lamport timestamps](/en/ch10#lamport-timestamps)
  - logical, [Ordering events to capture causality](/en/ch13#sec_future_capture_causality)
  - ordering events, [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
  - timestamp oracle, [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
- TLA+ (specification language), [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
- token bucket (limiting retries), [Describing Performance](/en/ch2#sec_introduction_percentiles)
- tombstones, [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Disk space usage](/en/ch4#disk-space-usage), [Log compaction](/en/ch12#sec_stream_log_compaction)
- topics (messaging), [Message brokers](/en/ch5#message-brokers), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- torn pages (B-trees), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
- total order, [Glossary](/en/glossary)
  - broadcast (see shared logs)
  - limits of, [The limits of total ordering](/en/ch13#id335)
  - on logical timestamps, [Logical Clocks](/en/ch10#sec_consistency_timestamps)
- tracing, [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems)
- tracking behavioral data, [Privacy and Tracking](/en/ch14#id373)
  - (see also privacy)
- trade-offs, [Trade-offs in Data Systems Architecture](/en/ch1#ch_tradeoffs)-[Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- transaction coordinator (see coordinator)
- transaction manager (see coordinator)
- transaction processing, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
  - comparison to analytics, [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
  - comparison to data warehousing, [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
- transactions, [Transactions](/en/ch8#ch_transactions)-[Summary](/en/ch8#summary), [Glossary](/en/glossary)
  - ACID properties of, [The Meaning of ACID](/en/ch8#sec_transactions_acid)
    - atomicity, [Atomicity](/en/ch8#sec_transactions_acid_atomicity)
    - consistency, [Consistency](/en/ch8#sec_transactions_acid_consistency)
    - durability, [Making B-trees reliable](/en/ch4#sec_storage_btree_wal), [Durability](/en/ch8#durability)
    - isolation, [Isolation](/en/ch8#sec_transactions_acid_isolation)
  - and derived data integrity, [Timeliness and Integrity](/en/ch13#sec_future_integrity)
  - and replication, [Solutions for Replication Lag](/en/ch6#id131)
  - compensating (see compensating transactions)
  - concept of, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview)
  - distributed transactions, [Distributed Transactions](/en/ch8#sec_transactions_distributed)-[Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
    - avoiding, [Derived data versus distributed transactions](/en/ch13#sec_future_derived_vs_transactions), [Making unbundling work](/en/ch13#sec_future_unbundling_favor), [Enforcing Constraints](/en/ch13#sec_future_constraints)-[Coordination-avoiding data systems](/en/ch13#id454)
    - failure amplification, [Maintaining derived state](/en/ch13#id446)
    - for sharded systems, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
    - in doubt/uncertain status, [Coordinator failure](/en/ch8#coordinator-failure), [Holding locks while in doubt](/en/ch8#holding-locks-while-in-doubt)
    - two-phase commit, [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)-[Three-phase commit](/en/ch8#three-phase-commit)
    - use of, [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa)-[Exactly-once message processing](/en/ch8#sec_transactions_exactly_once)
    - XA transactions, [XA transactions](/en/ch8#xa-transactions)-[Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
  - OLTP versus analytics queries, [Analytics](/en/ch11#sec_batch_olap)
  - purpose of, [Transactions](/en/ch8#ch_transactions)
  - serializability, [Serializability](/en/ch8#sec_transactions_serializability)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - actual serial execution, [Actual Serial Execution](/en/ch8#sec_transactions_serial)-[Summary of serial execution](/en/ch8#summary-of-serial-execution)
    - pessimistic versus optimistic concurrency control, [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
    - serializable snapshot isolation (SSI), [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)-[Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
    - two-phase locking (2PL), [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)-[Index-range locks](/en/ch8#sec_transactions_2pl_range)
  - single-object and multi-object, [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)-[Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
    - handling errors and aborts, [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
    - need for multi-object transactions, [The need for multi-object transactions](/en/ch8#sec_transactions_need)
    - single-object writes, [Single-object writes](/en/ch8#sec_transactions_single_object)
  - snapshot isolation (see snapshots)
  - strict serializability, [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
  - weak isolation levels, [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)-[Materializing conflicts](/en/ch8#materializing-conflicts)
    - preventing lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)-[Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
    - read committed, [Read Committed](/en/ch8#sec_transactions_read_committed)-[Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
- traversal (graphs), [Property Graphs](/en/ch3#id56)
- trie (data structure), [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables), [Full-Text Search](/en/ch4#sec_storage_full_text)
  - as SSTable index, [The SSTable file format](/en/ch4#the-sstable-file-format)
- triggers (databases), [Transmitting Event Streams](/en/ch12#sec_stream_transmit)
- Trino (data warehouse), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - federated databases, [The meta-database of everything](/en/ch13#id341)
  - query optimizer, [Query languages](/en/ch11#sec_batch_query_lanauges)
  - use for ETL, [Extract–Transform–Load (ETL)](/en/ch11#sec_batch_etl_usage)
  - workflow example, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
- triple-stores, [Triple-Stores and SPARQL](/en/ch3#id59)-[The SPARQL query language](/en/ch3#the-sparql-query-language)
  - SPARQL query language, [The SPARQL query language](/en/ch3#the-sparql-query-language)
- tumbling windows (stream processing), [Types of windows](/en/ch12#id324)
  - (see also windows)
  - in microbatching, [Microbatching and checkpointing](/en/ch12#id329)
- Turbopuffer (vector search), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- Turtle (RDF data format), [Triple-Stores and SPARQL](/en/ch3#id59)
- Twitter (see X (social network))
- two-phase commit (2PC), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)-[Coordinator failure](/en/ch8#coordinator-failure), [Glossary](/en/glossary)
  - confusion with two-phase locking, [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)
  - coordinator failure, [Coordinator failure](/en/ch8#coordinator-failure)
  - coordinator recovery, [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
  - how it works, [A system of promises](/en/ch8#a-system-of-promises)
  - performance cost, [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa)
  - problems with XA transactions, [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
  - transactions holding locks, [Holding locks while in doubt](/en/ch8#holding-locks-while-in-doubt)
- two-phase locking (2PL), [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)-[Index-range locks](/en/ch8#sec_transactions_2pl_range), [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition), [Glossary](/en/glossary)
  - confusion with two-phase commit, [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)
  - growing and shrinking phases, [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
  - index-range locks, [Index-range locks](/en/ch8#sec_transactions_2pl_range)
  - performance of, [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
- type checking, dynamic versus static, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)

### U

- UDP (User Datagram Protocol)
  - comparison to TCP, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - multicast, [Direct messaging from producers to consumers](/en/ch12#id296)
- Ultima Online (game), [Sharding](/en/ch7#ch_sharding)
- unbounded datasets, [Stream Processing](/en/ch12#ch_stream), [Glossary](/en/glossary)
  - (see also streams)
- unbounded delays, [Glossary](/en/glossary)
  - in networks, [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
  - process pauses, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- unbundling databases, [Unbundling Databases](/en/ch13#sec_future_unbundling)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
  - composing data storage technologies, [Composing Data Storage Technologies](/en/ch13#id447)-[Unbundled versus integrated systems](/en/ch13#id448)
    - federation versus unbundling, [The meta-database of everything](/en/ch13#id341)
  - designing applications around dataflow, [Designing Applications Around Dataflow](/en/ch13#sec_future_dataflow)-[Stream processors and services](/en/ch13#id345)
  - observing derived state, [Observing Derived State](/en/ch13#sec_future_observing)-[Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
    - materialized views and caching, [Materialized views and caching](/en/ch13#id451)
    - multi-shard data processing, [Multi-shard data processing](/en/ch13#sec_future_unbundled_multi_shard)
    - pushing state changes to clients, [Pushing state changes to clients](/en/ch13#id348)
- uncertain (transaction status) (see in doubt)
- union type (in Avro), [Schema evolution rules](/en/ch5#schema-evolution-rules)
- uniq (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis), [Distributed Job Orchestration](/en/ch11#id278)
- uniqueness constraints
  - asynchronously checked, [Loosely interpreted constraints](/en/ch13#id362)
  - requiring consensus, [Uniqueness constraints require consensus](/en/ch13#id452)
  - requiring linearizability, [Constraints and uniqueness guarantees](/en/ch10#sec_consistency_uniqueness)
  - uniqueness in log-based messaging, [Uniqueness in log-based messaging](/en/ch13#sec_future_uniqueness_log)
- Unity (data catalog), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
- universally unique identifiers (see UUIDs)
- Unix philosophy
  - comparison to relational databases, [Unbundling Databases](/en/ch13#sec_future_unbundling), [The meta-database of everything](/en/ch13#id341)
  - comparison to stream processing, [Processing Streams](/en/ch12#sec_stream_processing)
- Unix pipes, [Simple Log Analysis](/en/ch11#sec_batch_log_analysis)
  - compared to distributed batch processing, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
- UPDATE statement (SQL), [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
- updates
  - preventing lost updates, [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)-[Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
    - atomic write operations, [Atomic write operations](/en/ch8#atomic-write-operations)
    - automatically detecting lost updates, [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
    - compare-and-set (CAS), [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set)
    - conflict resolution and replication, [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
    - using explicit locking, [Explicit locking](/en/ch8#explicit-locking)
  - preventing write skew, [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts)
- utilization
  - batch process scheduling, [Resource Allocation](/en/ch11#id279)
  - increasing through preemption, [Handling Faults](/en/ch11#id281)
  - trade-off with latency, [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- uTP protocol (BitTorrent), [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
- UUIDs, [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)

### V

- validity (consensus), [Single-value consensus](/en/ch10#single-value-consensus), [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
- vBuckets (sharding), [Sharding](/en/ch7#ch_sharding)
- vector clocks, [Version vectors](/en/ch6#version-vectors)
  - (see also version vectors)
  - and Lamport/hybrid logical clocks, [Lamport/hybrid logical clocks versus vector clocks](/en/ch10#lamporthybrid-logical-clocks-vs-vector-clocks)
  - and version vectors, [Version vectors](/en/ch6#version-vectors)
- vector embedding, [Vector Embeddings](/en/ch4#id92)
- vectorized processing, [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
- vendor lock-in, [Pros and Cons of Cloud Services](/en/ch1#sec_introduction_cloud_tradeoffs)
- Venice (database), [Serving Derived Data](/en/ch11#sec_batch_serving_derived)
- verification, [Trust, but Verify](/en/ch13#sec_future_verification)-[Tools for auditable data systems](/en/ch13#id366)
  - avoiding blind trust, [Don't just blindly trust what they promise](/en/ch13#id364)
  - designing for auditability, [Designing for auditability](/en/ch13#id365)
  - end-to-end integrity checks, [The end-to-end argument again](/en/ch13#id456)
  - tools for auditable data systems, [Tools for auditable data systems](/en/ch13#id366)
- version control systems
  - merge conflicts, [Manual conflict resolution](/en/ch6#manual-conflict-resolution)
  - reliance on immutable data, [Concurrency control](/en/ch12#sec_stream_concurrency)
- version vectors, [Problems with different topologies](/en/ch6#problems-with-different-topologies), [Version vectors](/en/ch6#version-vectors)
  - dotted, [Version vectors](/en/ch6#version-vectors)
  - versus vector clocks, [Version vectors](/en/ch6#version-vectors)
- Vertica (database), [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
  - handling writes, [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
- vertical scaling (see scaling up)
- vertices (in graphs), [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
  - property graph model, [Property Graphs](/en/ch3#id56)
- video games, [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- video transcoding (example), [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
- views (SQL queries), [Datalog: Recursive Relational Queries](/en/ch3#id62)
  - materialized views (see materialization)
- Viewstamped Replication (consensus algorithm), [Consensus](/en/ch10#sec_consistency_consensus), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - use of model-checking, [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
  - view number, [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
- virtual block device, [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
- virtual file system, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
  - comparison to distributed filesystems, [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- virtual machines, [Layering of cloud services](/en/ch1#layering-of-cloud-services)
  - context switches, [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
  - network performance, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - noisy neighbors, [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
  - virtualized clocks in, [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
- virtual memory
  - process pauses due to page faults, [Latency and Response Time](/en/ch2#id23), [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
- Virtuoso (database), [The SPARQL query language](/en/ch3#the-sparql-query-language)
- VisiCalc (spreadsheets), [Designing Applications Around Dataflow](/en/ch13#sec_future_dataflow)
- Vitess (database)
  - key-range sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
- vnodes (sharding), [Sharding](/en/ch7#ch_sharding)
- vocabularies, [Triple-Stores and SPARQL](/en/ch3#id59)
- Voice over IP (VoIP), [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
- VoltDB (database)
  - cross-shard serializability, [Sharding](/en/ch8#sharding)
  - deterministic stored procedures, [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
  - in-memory storage, [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
  - process-per-core model, [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
  - secondary indexes, [Local Secondary Indexes](/en/ch7#id166)
  - serial execution of transactions, [Actual Serial Execution](/en/ch8#sec_transactions_serial)
  - statement-based replication, [Statement-based replication](/en/ch6#statement-based-replication), [Rebuilding state after a failure](/en/ch12#sec_stream_state_fault_tolerance)
  - transactions in stream processing, [Atomic commit revisited](/en/ch12#sec_stream_atomic_commit)

### W

- WAL (write-ahead log), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
- WAL-G (backup tool), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- WarpStream (messaging), [Disk space usage](/en/ch12#sec_stream_disk_usage)
- web services (see services)
- webhooks, [Direct messaging from producers to consumers](/en/ch12#id296)
- webMethods (messaging), [Message brokers](/en/ch5#message-brokers)
- WebSocket (protocol), [Pushing state changes to clients](/en/ch13#id348)
- wide-column data model, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - versus column-oriented storage, [Column Compression](/en/ch4#sec_storage_column_compression)
- windows (stream processing), [Stream analytics](/en/ch12#id318), [Reasoning About Time](/en/ch12#sec_stream_time)-[Types of windows](/en/ch12#id324)
  - infinite windows for changelogs, [Maintaining materialized views](/en/ch12#sec_stream_mat_view), [Stream-table join (stream enrichment)](/en/ch12#sec_stream_table_joins)
  - knowing when all events have arrived, [Handling straggler events](/en/ch12#id323)
  - stream joins within a window, [Stream-stream join (window join)](/en/ch12#id440)
  - types of windows, [Types of windows](/en/ch12#id324)
- WITH RECURSIVE syntax (SQL), [Graph Queries in SQL](/en/ch3#id58)
- Word2Vec (language model), [Vector Embeddings](/en/ch4#id92)
- workflow engines, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
  - Airflow (see Airflow (workflow scheduler))
  - batch processing, [Scheduling Workflows](/en/ch11#sec_batch_workflows)
  - Camunda (see Camunda (workflow engine))
  - Dagster (see Dagster (workflow scheduler))
  - durable execution, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
  - ETL (see ETL (extract-transform-load))
  - executor, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
  - orchestrators, [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows), [Batch Processing](/en/ch11#ch_batch)
  - Orkes (see Orkes (workflow engine))
  - Prefect (see Prefect (workflow scheduler))
  - reliance on determinism, [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
  - Restate (see Restate (workflow engine))
  - Temporal (see Temporal (workflow engine))
- working set, [Sorting Versus In-memory Aggregation](/en/ch11#id275)
- write amplification, [Write amplification](/en/ch4#write-amplification)
- write path (derived data), [Observing Derived State](/en/ch13#sec_future_observing)
- write skew (transaction isolation), [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Materializing conflicts](/en/ch8#materializing-conflicts)
  - characterizing, [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)-[Phantoms causing write skew](/en/ch8#sec_transactions_phantom), [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)
  - examples of, [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew), [More examples of write skew](/en/ch8#more-examples-of-write-skew)
  - materializing conflicts, [Materializing conflicts](/en/ch8#materializing-conflicts)
  - occurrence in practice, [Maintaining integrity in the face of software bugs](/en/ch13#id455)
  - phantoms, [Phantoms causing write skew](/en/ch8#sec_transactions_phantom)
  - preventing
    - in snapshot isolation, [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)-[Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
    - in two-phase locking, [Predicate locks](/en/ch8#predicate-locks)-[Index-range locks](/en/ch8#sec_transactions_2pl_range)
    - options for, [Characterizing write skew](/en/ch8#characterizing-write-skew)
- write-ahead log (WAL), [Making B-trees reliable](/en/ch4#sec_storage_btree_wal), [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
  - in durable execution, [Durable execution](/en/ch5#durable-execution)
- writes (database)
  - atomic write operations, [Atomic write operations](/en/ch8#atomic-write-operations)
  - detecting writes affecting prior reads, [Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
  - preventing dirty writes with read committed, [No dirty writes](/en/ch8#sec_transactions_dirty_write)
- WS-\* framework, [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
- WS-AtomicTransaction (2PC), [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)

### X

- X (social network)
  - constructing home timelines (example), [Case Study: Social Network Home Timelines](/en/ch2#sec_introduction_twitter), [Deriving several views from the same event log](/en/ch12#sec_stream_deriving_views), [Table-table join (materialized view maintenance)](/en/ch12#id326), [Materialized views and caching](/en/ch13#id451)
    - cost of joins, [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
    - describing load, [Describing Load](/en/ch2#id33)
    - fault tolerance, [Fault Tolerance](/en/ch2#id27)
    - performance metrics, [Describing Performance](/en/ch2#sec_introduction_percentiles)
  - DistributedLog (event log), [Using logs for message storage](/en/ch12#id300)
  - Snowflake (ID generator), [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
- XA transactions, [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc), [XA transactions](/en/ch8#xa-transactions)-[Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
  - heuristic decisions, [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
  - problems with, [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
- xargs (Unix tool), [Simple Log Analysis](/en/ch11#sec_batch_log_analysis)
- XFS (file system), [Distributed Filesystems](/en/ch11#sec_batch_dfs)
- XGBoost (machine learning library), [Machine Learning](/en/ch11#id290)
- XML
  - binary variants, [Binary encoding](/en/ch5#binary-encoding)
  - data locality, [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
  - encoding RDF data, [The RDF data model](/en/ch3#the-rdf-data-model)
  - for application data, issues with, [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
  - in relational databases, [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
  - XML databases, [Relational Model versus Document Model](/en/ch3#sec_datamodels_history), [Query languages for documents](/en/ch3#query-languages-for-documents)
- Xorq (query engine), [The meta-database of everything](/en/ch13#id341)
- XPath, [Query languages for documents](/en/ch3#query-languages-for-documents)
- XQuery, [Query languages for documents](/en/ch3#query-languages-for-documents)

### Y

- Yahoo
  - response time study, [Average, Median, and Percentiles](/en/ch2#id24)
- YARN (job scheduler), [Distributed Job Orchestration](/en/ch11#id278), [Separation of application code and state](/en/ch13#id344)
  - ApplicationMaster, [Distributed Job Orchestration](/en/ch11#id278)
- Yjs (CRDT library), [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
- YugabyteDB (database)
  - hash-range sharding, [Sharding by hash range](/en/ch7#sharding-by-hash-range)
  - key-range sharding, [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
  - multi-leader replication, [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
  - request routing, [Request Routing](/en/ch7#sec_sharding_routing)
  - sharded secondary indexes, [Global Secondary Indexes](/en/ch7#id167)
  - tablets (sharding), [Sharding](/en/ch7#ch_sharding)
  - transactions, [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview), [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
  - use of clock synchronization, [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)

### Z

- Zab (consensus algorithm), [Consensus](/en/ch10#sec_consistency_consensus), [Consensus in Practice](/en/ch10#sec_consistency_total_order)
  - use in ZooKeeper, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
- zero-copy, [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- zero-disk architecture (ZDA), [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
- ZeroMQ (messaging library), [Direct messaging from producers to consumers](/en/ch12#id296)
- zombies (split brain), [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
- zones (cloud computing) (see availability zones)
- ZooKeeper (coordination service), [Coordination Services](/en/ch10#sec_consistency_coordination)-[Service discovery](/en/ch10#service-discovery)
  - generating fencing tokens, [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens), [Using shared logs](/en/ch10#sec_consistency_smr), [Coordination Services](/en/ch10#sec_consistency_coordination)
  - linearizable operations, [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
  - locks and leader election, [Locking and leader election](/en/ch10#locking-and-leader-election)
  - observers, [Service discovery](/en/ch10#service-discovery)
  - use for service discovery, [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery), [Service discovery](/en/ch10#service-discovery)
  - use for shard assignment, [Request Routing](/en/ch7#sec_sharding_routing)
  - use of Zab algorithm, [Consensus](/en/ch10#sec_consistency_consensus)


================================================
FILE: content/en/part-i.md
================================================
---
title: "PART I: Foundations of Data Systems"
weight: 100
breadcrumbs: false
---

{{< callout type="warning" >}}
This page is from the 1st edition; the 2nd edition is not available yet.
{{< /callout >}}

The first five chapters go through the fundamental ideas that apply to all data systems, whether running on a single machine or distributed across a cluster of machines:

1. [Chapter 1](/en/ch1) introduces the tradeoffs that data systems must make, such as the balance between consistency and availability, and how these tradeoffs affect system design.

2. [Chapter 2](/en/ch2) discusses the nonfunctional requirements of data systems, such as availability, consistency, and latency, and how we can try to achieve these goals.

3. [Chapter 3](/en/ch3) compares several different data models and query languages—the most visible distinguishing factor between databases from a developer’s point of view. We will see how different models are appropriate to different situations.

4. [Chapter 4](/en/ch4) turns to the internals of storage engines and looks at how databases lay out data on disk. Different storage engines are optimized for different workloads, and choosing the right one can have a huge effect on performance.

5. [Chapter 5](/en/ch5) compares various formats for data encoding (serialization) and especially examines how they fare in an environment where application requirements change and schemas need to adapt over time.

Later, [Part II](/en/part-ii) will turn to the particular issues of distributed data systems.


## [1. Trade-offs in Data Systems Architecture](/en/ch1)
- [Analytical versus Operational Systems](/en/ch1#sec_introduction_analytics)
- [Cloud versus Self-Hosting](/en/ch1#sec_introduction_cloud)
- [Distributed versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
- [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- [Summary](/en/ch1#summary)

## [2. Defining Nonfunctional Requirements](/en/ch2)
- [Case Study: Social Network Home Timelines](/en/ch2#sec_introduction_twitter)
- [Describing Performance](/en/ch2#sec_introduction_percentiles)
- [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
- [Scalability](/en/ch2#sec_introduction_scalability)
- [Maintainability](/en/ch2#sec_introduction_maintainability)
- [Summary](/en/ch2#summary)

## [3. Data Models and Query Languages](/en/ch3)
- [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
- [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
- [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- [Dataframes, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- [Summary](/en/ch3#summary)

## [4. Storage and Retrieval](/en/ch4)
- [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)
- [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
- [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
- [Summary](/en/ch4#summary)

## [5. Encoding and Evolution](/en/ch5)
- [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
- [Modes of Dataflow](/en/ch5#sec_encoding_dataflow)
- [Summary](/en/ch5#summary)


================================================
FILE: content/en/part-ii.md
================================================
---
title: "PART II: Distributed Data"
weight: 200
breadcrumbs: false
---

{{< callout type="warning" >}}
This page is from the 1st edition; the 2nd edition is not available yet.
{{< /callout >}}

> *For a successful technology, reality must take precedence over public relations, for nature cannot be fooled.*
>
> —Richard Feynman, *Rogers Commission Report* (1986)

-------

In [Part I](/en/part-i) of this book, we discussed aspects of data systems that apply when data is stored on a single machine. Now, in [Part II](/en/part-ii), 
we move up a level and ask: what happens if multiple machines are involved in storage and retrieval of data?

There are various reasons why you might want to distribute a database across multiple machines:

***Scalability***

If your data volume, read load, or write load grows bigger than a single machine can handle, you can potentially spread the load across multiple machines.

***Fault tolerance/high availability***

If your application needs to continue working even if one machine (or several machines, or the network, or an entire datacenter) goes down, 
you can use multiple machines to give you redundancy. When one fails, another one can take over.

***Latency***

If you have users around the world, you might want to have servers at various locations worldwide so that each user can be served from a datacenter that is geographically close to them. 
That avoids the users having to wait for network packets to travel halfway around the world.


## Scaling to Higher Load

If all you need is to scale to higher load, the simplest approach is to buy a more powerful machine (sometimes called *vertical scaling* or *scaling up*). Many CPUs, many RAM chips, and many disks can be joined together under one operating system, 
and a fast interconnect allows any CPU to access any part of the memory or disk. In this kind of *shared-memory architecture*, all the components can be treated as a single machine [^1].

> [!NOTE]
> In a large machine, although any CPU can access any part of memory, some banks of memory are closer to one CPU than to others (this is called nonuniform memory access, or NUMA [^1]). 
> To make efficient use of this architecture, the processing needs to be broken down so that each CPU mostly accesses memory that is nearby—which means that partitioning is still required, even when ostensibly running on one machine.

The problem with a shared-memory approach is that the cost grows faster than linearly: a machine with twice as many CPUs, twice as much RAM, and twice as much disk capacity as another typically costs significantly more than twice as much. 
And due to bottlenecks, a machine twice the size cannot necessarily handle twice the load.

A shared-memory architecture may offer limited fault tolerance—high-end machines have hot-swappable components (you can replace disks, memory modules, and even CPUs without shutting down the machines)—but it is definitely limited to a single geographic location.

Another approach is the *shared-disk architecture*, which uses several machines with independent CPUs and RAM, but stores data on an array of disks that is shared between the machines, which are connected via a fast network.  
This architecture is used for some data warehousing workloads, but contention and the overhead of locking limit the scalability of the shared-disk approach [^2].

> [!NOTE]
> The shared array of disks in a shared-disk architecture is typically provided by Network Attached Storage (NAS) or a Storage Area Network (SAN).


### Shared-Nothing Architectures

By contrast, *shared-nothing architectures* [^3] (sometimes called *horizontal scaling* or *scaling out*) have gained a lot of popularity. 
In this approach, each machine or virtual machine running the database software is called a *node*. 
Each node uses its CPUs, RAM, and disks independently. Any coordination between nodes is done at the software level, using a conventional network.

No special hardware is required by a shared-nothing system, so you can use whatever machines have the best price/performance ratio. 
You can potentially distribute data across multiple geographic regions, and thus reduce latency for users and potentially be able to survive the loss of an entire datacenter. 
With cloud deployments of virtual machines, you don’t need to be operating at Google scale: even for small companies, a multi-region distributed architecture is now feasible.

In this part of the book, we focus on shared-nothing architectures—not because they are necessarily the best choice for every use case, but rather because they require the most caution from you, the application developer. 
If your data is distributed across multiple nodes, you need to be aware of the constraints and trade-offs that occur in such a distributed system—the database cannot magically hide these from you.

While a distributed shared-nothing architecture has many advantages, it usually also incurs additional complexity for applications and sometimes limits the expressiveness of the data models you can use. 
In some cases, a simple single-threaded program can perform significantly better than a cluster with over 100 CPU cores [^4]. On the other hand, shared-nothing systems can be very powerful. 
The next few chapters go into details on the issues that arise when data is distributed.

### Replication Versus Partitioning

There are two common ways data is distributed across multiple nodes:

***Replication***

Keeping a copy of the same data on several different nodes, potentially in different locations. 
Replication provides redundancy: if some nodes are unavailable, the data can still be served from the remaining nodes. 
Replication can also help improve performance. We discuss replication in [Chapter 6](/en/ch6).

***Partitioning***

Splitting a big database into smaller subsets called *partitions* so that different partitions can be assigned to different nodes (also known as *sharding*).
We discuss partitioning in [Chapter 7](/en/ch7).

These are separate mechanisms, but they often go hand in hand, as illustrated in [Figure II-1](#fig_replication_partitioning).

{{< figure src="/v1/ddia_part-ii_01.png" id="fig_replication_partitioning" caption="Figure II-1. A database split into two partitions, with two replicas per partition." class="w-full my-4" >}}

With an understanding of those concepts, we can discuss the difficult trade-offs that you need to make in a distributed system. 
We’ll discuss *transactions* in [Chapter 8](/en/ch8), as that will help you understand all the many things that can go wrong in a data system, and what you can do about them. 
We’ll conclude this part of the book by discussing the fundamental limitations of distributed systems in [Chapters 9](/en/ch9) and [10](/en/ch10).

Later, in [Part III](/en/part-iii) of this book, we will discuss how you can take several (potentially distributed) datastores and integrate them into a larger system, 
satisfying the needs of a complex application. But first, let’s talk about distributed data.


## [6. Replication](/en/ch6)
- [Single-Leader Replication](/en/ch6#sec_replication_leader)
- [Problems with Replication Lag](/en/ch6#sec_replication_lag)
- [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)
- [Leaderless Replication](/en/ch6#sec_replication_leaderless)
- [Summary](/en/ch6#summary)

## [7. Sharding](/en/ch7)
- [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
- [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
- [Request Routing](/en/ch7#sec_sharding_routing)
- [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)
- [Summary](/en/ch7#summary)

## [8. Transactions](/en/ch8)
- [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview)
- [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
- [Serializability](/en/ch8#sec_transactions_serializability)
- [Distributed Transactions](/en/ch8#sec_transactions_distributed)
- [Summary](/en/ch8#summary)

## [9. The Trouble with Distributed Systems](/en/ch9)
- [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
- [Unreliable Networks](/en/ch9#sec_distributed_networks)
- [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
- [Knowledge, Truth, and Lies](/en/ch9#sec_distributed_truth)
- [Summary](/en/ch9#summary)

## [10. Consistency and Consensus](/en/ch10)
- [Linearizability](/en/ch10#sec_consistency_linearizability)
- [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
- [Consensus](/en/ch10#sec_consistency_consensus)
- [Summary](/en/ch10#summary)


### References

[^1]: Ulrich Drepper: “[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf),” akkadia.org, November 21, 2007.
[^2]: Ben Stopford: “[Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/),” benstopford.com, November 24, 2009.
[^3]: Michael Stonebraker: “[The Case for Shared Nothing](http://db.cs.berkeley.edu/papers/hpts85-nothing.pdf),” IEEE Database Engineering Bulletin, volume 9, number 1, pages 4–9, March 1986.
[^4]: Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at 15th USENIX Workshop on Hot Topics in Operating Systems (HotOS), May 2015.


================================================
FILE: content/en/part-iii.md
================================================
---
title: "PART III: Derived Data"
weight: 300
breadcrumbs: false
---

{{< callout type="warning" >}}
This page is from the 1st edition; the 2nd edition is not available yet.
{{< /callout >}}

In Parts [I](/en/part-i) and [II](/en/part-ii) of this book, we assembled from the ground up all the major considerations that go into a distributed database, 
from the layout of data on disk all the way to the limits of distributed consistency in the presence of faults. However, this discussion assumed that there was only one database in the application.

In reality, data systems are often more complex. In a large application you often need to be able to access and process data in many different ways, 
and there is no one database that can satisfy all those different needs simultaneously. Applications thus commonly use a combination of several different datastores, 
indexes, caches, analytics systems, etc. and implement mechanisms for moving data from one store to another.

In this final part of the book, we will examine the issues around integrating multiple different data systems, 
potentially with different data models and optimized for different access patterns, into one coherent application architecture. 
This aspect of system-building is often overlooked by vendors who claim that their product can satisfy all your needs. 
In reality, integrating disparate systems is one of the most important things that needs to be done in a nontrivial application.

## Systems of Record and Derived Data

On a high level, systems that store and process data can be grouped into two broad categories:


***Systems of record***

A system of record, also known as *source of truth*, holds the authoritative version of your data. 
When new data comes in, e.g., as user input, it is first written here. 
Each fact is represented exactly once (the representation is typically *normalized*). 
If there is any discrepancy between another system and the system of record, 
then the value in the system of record is (by definition) the correct one.

***Derived data systems***

Data in a derived system is the result of taking some existing data from another system and transforming or processing it in some way. 
If you lose derived data, you can recreate it from the original source. A classic example is a cache: data can be served from the cache if present, 
but if the cache doesn’t contain what you need, you can fall back to the underlying database. Denormalized values, indexes, 
and materialized views also fall into this category. In recommendation systems, predictive summary data is often derived from usage logs.

Technically speaking, derived data is *redundant*, in the sense that it duplicates existing information. 
However, it is often essential for getting good performance on read queries. It is commonly *denormalized*. 
You can derive several different datasets from a single source, enabling you to look at the data from different “points of view.”

Not all systems make a clear distinction between systems of record and derived data in their architecture, 
but it’s a very helpful distinction to make, because it clarifies the dataflow through your system: 
it makes explicit which parts of the system have which inputs and which outputs, and how they depend on each other.

Most databases, storage engines, and query languages are not inherently either a system of record or a derived system. 
A database is just a tool: how you use it is up to you. 
The distinction between system of record and derived data system depends not on the tool, but on how you use it in your application.

By being clear about which data is derived from which other data, you can bring clarity to an otherwise confusing system architecture. 
This point will be a running theme throughout this part of the book.


## Overview of Chapters

We will start in [Chapter 11](/en/ch11) by examining batch-oriented dataflow systems such as MapReduce, and see how they give us good tools and principles for building large-scale data systems. 
In [Chapter 12](/en/ch12) we will take those ideas and apply them to data streams, which allow us to do the same kinds of things with lower delays. 
In [Chapter 13](/en/ch13) we explore ideas about how we might use these tools to build reliable, scalable, and maintainable applications in the future.
[Chapter 14](/en/ch14) concludes the book with ethics, privacy, and the social impact of data systems.


## Index

- [11. Batch Processing](/en/ch11) (WIP)
- [12. Stream Processing](/en/ch12) (WIP)
- [13. A Philosophy of Streaming Systems](/en/ch13) (WIP)
- [14. Doing the Right Thing](/en/ch14) (WIP)


================================================
FILE: content/en/preface.md
================================================
---
title: Preface
weight: 50
breadcrumbs: false
---

{{< callout type="warning" >}}
This page is from the 1st edition; the 2nd edition is not available yet.
{{< /callout >}}

If you have worked in software engineering in recent years, especially in server-side and backend systems, you have probably been bombarded with a plethora of buzzwords relating to storage and processing of data. NoSQL! Big Data! Web-scale! Sharding! Eventual consistency! ACID! CAP theorem! Cloud services! MapReduce! Real-time!

In the last decade we have seen many interesting developments in databases, in distributed systems, and in the ways we build applications on top of them. There are various driving forces for these developments:

- Internet companies such as Google, Yahoo!, Amazon, Facebook, LinkedIn, Microsoft, and Twitter are handling huge volumes of data and traffic, forcing them to create new tools that enable them to efficiently handle such scale.
- Businesses need to be agile, test hypotheses cheaply, and respond quickly to new market insights by keeping development cycles short and data models flexible.
- Free and open source software has become very successful and is now preferred to commercial or bespoke in-house software in many environments.
- CPU clock speeds are barely increasing, but multi-core processors are standard, and networks are getting faster. This means parallelism is only going to increase.
- Even if you work on a small team, you can now build systems that are distributed across many machines and even multiple geographic regions, thanks to infrastructure as a service (IaaS) such as Amazon Web Services.
- Many services are now expected to be highly available; extended downtime due to outages or maintenance is becoming increasingly unacceptable.

*Data-intensive applications* are pushing the boundaries of what is possible by making use of these technological developments. We call an application *data-intensive* if data is its primary challenge—the quantity of data, the complexity of data, or the speed at which it is changing—as opposed to *compute-intensive*, where CPU cycles are the bottleneck.

The tools and technologies that help data-intensive applications store and process data have been rapidly adapting to these changes. New types of database systems (“NoSQL”) have been getting lots of attention, but message queues, caches, search indexes, frameworks for batch and stream processing, and related technologies are very important too. Many applications use some combination of these.

The buzzwords that fill this space are a sign of enthusiasm for the new possibilities, which is a great thing. However, as software engineers and architects, we also need to have a technically accurate and precise understanding of the various technologies and their trade-offs if we want to build good applications. For that understanding, we have to dig deeper than buzzwords.

Fortunately, behind the rapid changes in technology, there are enduring principles that remain true, no matter which version of a particular tool you are using. If you understand those principles, you’re in a position to see where each tool fits in, how to make good use of it, and how to avoid its pitfalls. That’s where this book comes in.

The goal of this book is to help you navigate the diverse and fast-changing landscape of technologies for processing and storing data. This book is not a tutorial for one particular tool, nor is it a textbook full of dry theory. Instead, we will look at examples of successful data systems: technologies that form the foundation of many popular applications and that have to meet scalability, performance, and reliability requirements in production every day.

We will dig into the internals of those systems, tease apart their key algorithms, discuss their principles and the trade-offs they have to make. On this journey, we will try to find useful ways of *thinking about* data systems—not just *how* they work, but also *why* they work that way, and what questions we need to ask.

After reading this book, you will be in a great position to decide which kind of technology is appropriate for which purpose, and understand how tools can be combined to form the foundation of a good application architecture. You won’t be ready to build your own database storage engine from scratch, but fortunately that is rarely necessary. You will, however, develop a good intuition for what your systems are doing under the hood so that you can reason about their behavior, make good design decisions, and track down any problems that may arise.


## Who Should Read This Book?

If you develop applications that have some kind of server/backend for storing or processing data, and your applications use the internet (e.g., web applications, mobile apps, or internet-connected sensors), then this book is for you.

This book is for software engineers, software architects, and technical managers who love to code. It is especially relevant if you need to make decisions about the architecture of the systems you work on—for example, if you need to choose tools for solving a given problem and figure out how best to apply them. But even if you have no choice over your tools, this book will help you better understand their strengths and weaknesses.

You should have some experience building web-based applications or network services, and you should be familiar with relational databases and SQL. Any non-relational databases and other data-related tools you know are a bonus, but not required. A general understanding of common network protocols like TCP and HTTP is helpful. Your choice of programming language or framework makes no difference for this book.

If any of the following are true for you, you’ll find this book valuable:

- You want to learn how to make data systems scalable, for example, to support web or mobile apps with millions of users.
- You need to make applications highly available (minimizing downtime) and operationally robust.
- You are looking for ways of making systems easier to maintain in the long run, even as they grow and as requirements and technologies change.
- You have a natural curiosity for the way things work and want to know what goes on inside major websites and online services. This book breaks down the internals of various databases and data processing systems, and it’s great fun to explore the bright thinking that went into their design.

Sometimes, when discussing scalable data systems, people make comments along the lines of, “You’re not Google or Amazon. Stop worrying about scale and just use a relational database.” There is truth in that statement: building for scale that you don’t need is wasted effort and may lock you into an inflexible design. In effect, it is a form of premature optimization. However, it’s also important to choose the right tool for the job, and different technologies each have their own strengths and weaknesses. As we shall see, relational databases are important but not the final word on dealing with data.


## Scope of This Book

This book does not attempt to give detailed instructions on how to install or use specific software packages or APIs, since there is already plenty of documentation for those things. Instead we discuss the various principles and trade-offs that are fundamental to data systems, and we explore the different design decisions taken by different products.

In the ebook editions we have included links to the full text of online resources. All links were verified at the time of publication, but unfortunately links tend to break frequently due to the nature of the web. If you come across a broken link, or if you are reading a print copy of this book, you can look up references using a search engine. For academic papers, you can search for the title in Google Scholar to find open-access PDF files. Alternatively, you can find all of the references at [*https://github.com/ept/ddia-references*](https://github.com/ept/ddia-references), where we maintain up-to-date links.

We look primarily at the *architecture* of data systems and the ways they are integrated into data-intensive applications. This book doesn’t have space to cover deployment, operations, security, management, and other areas—those are complex and important topics, and we wouldn’t do them justice by making them superficial side notes in this book. They deserve books of their own.

Many of the technologies described in this book fall within the realm of the *Big Data* buzzword. However, the term “Big Data” is so overused and underdefined that it is not useful in a serious engineering discussion. This book uses less ambiguous terms, such as single-node versus distributed systems, or online/interactive versus offline/batch processing systems.

This book has a bias toward free and open source software (FOSS), because reading, modifying, and executing source code is a great way to understand how something works in detail. Open platforms also reduce the risk of vendor lock-in. However, where appropriate, we also discuss proprietary software (closed-source software, software as a service, or companies’ in-house software that is only described in literature but not released publicly).


## Outline of This Book

This book is arranged into three parts:

1. In [Part I](/en/part-i), we discuss the fundamental ideas that underpin the design of data-intensive applications. We start in [Chapter 1](/en/ch1) by discussing what we’re actually trying to achieve: reliability, scalability, and maintainability; how we need to think about them; and how we can achieve them. In [Chapter 2](/en/ch2) we compare several different data models and query languages, and see how they are appropriate to different situations. In [Chapter 3](/en/ch3) we talk about storage engines: how databases arrange data on disk so that we can find it again efficiently. [Chapter 4](/en/ch4) turns to formats for data encoding (serialization) and evolution of schemas over time.
2. In [Part II](/en/part-ii), we move from data stored on one machine to data that is distributed across multiple machines. This is often necessary for scalability, but brings with it a variety of unique challenges. We first discuss replication ([Chapter 5](/en/ch5)), partitioning/sharding ([Chapter 6](/en/ch6)), and transactions ([Chapter 7](/en/ch7)). We then go into more detail on the problems with distributed systems ([Chapter 8](/en/ch8)) and what it means to achieve consistency and consensus in a distributed system ([Chapter 9](/en/ch9)).

3. In [Part III](/en/part-iii), we discuss systems that derive some datasets from other datasets. Derived data often occurs in heterogeneous systems: when there is no one database that can do everything well, applications need to integrate several different databases, caches, indexes, and so on. In [Chapter 10](/en/ch10) we start with a batch processing approach to derived data, and we build upon it with stream processing in [Chapter 11](/en/ch11). Finally, in [Chapter 12](/en/ch12) we put everything together and discuss approaches for building reliable, scalable, and maintainable applications in the future.


## References and Further Reading

Most of what we discuss in this book has already been said elsewhere in some form or another—in conference presentations, research papers, blog posts, code, bug trackers, mailing lists, and engineering folklore. This book summarizes the most important ideas from many different sources, and it includes pointers to the original literature throughout the text. The references at the end of each chapter are a great resource if you want to explore an area in more depth, and most of them are freely available online.


## O'Reilly Safari

[Safari](http://oreilly.com/safari) (formerly Safari Books Online) is a membership-based training and reference platform for enterprise, government, educators, and individuals.

Members have access to thousands of books, training videos, Learning Paths, interactive tutorials, and curated playlists from over 250 publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others.

For more information, please visit http://oreilly.com/safari.


## How to Contact Us

Please address comments and questions concerning this book to the publisher:

O’Reilly Media, Inc.
 1005 Gravenstein Highway North
 Sebastopol, CA 95472
 800-998-9938 (in the United States or Canada) 707-829-0515 (international or local) 707-829-0104 (fax)

We have a web page for this book, where we list errata, examples, and any additional information. You can access this page at *http://bit.ly/designing-data-intensive-apps*.

To comment or ask technical questions about this book, send email to *bookquestions@oreilly.com*.

For more information about our books, courses, conferences, and news, see our website at *http://www.oreilly.com*.

* Find us on Facebook: [http://facebook.com/oreilly](http://facebook.com/oreilly)
* Follow us on Twitter: [http://twitter.com/oreillymedia](http://twitter.com/oreillymedia)
* Watch us on YouTube: [http://www.youtube.com/oreillymedia](http://www.youtube.com/oreillymedia)


## Acknowledgments

Acknowledgments

This book is an amalgamation and systematization of a large number of other people’s ideas and knowledge, combining experience from both academic research and industrial practice. In computing we tend to be attracted to things that are new and shiny, but I think we have a huge amount to learn from things that have been done before. This book has over 800 references to articles, blog posts, talks, documentation, and more, and they have been an invaluable learning resource for me. I am very grateful to the authors of this material for sharing their knowledge.

I have also learned a lot from personal conversations, thanks to a large number of people who have taken the time to discuss ideas or patiently explain things to me. In particular, I would like to thank Joe Adler, Ross Anderson, Peter Bailis, Márton Balassi, Alastair Beresford, Mark Callaghan, Mat Clayton, Patrick Collison, Sean Cribbs, Shirshanka Das, Niklas Ekström, Stephan Ewen, Alan Fekete, Gyula Fóra, Camille Fournier, Andres Freund, John Garbutt, Seth Gilbert, Tom Haggett, Pat Helland, Joe Hellerstein, Jakob Homan, Heidi Howard, John Hugg, Julian Hyde, Conrad Irwin, Evan Jones, Flavio Junqueira, Jessica Kerr, Kyle Kingsbury, Jay Kreps, Carl Lerche, Nicolas Liochon, Steve Loughran, Lee Mallabone, Nathan Marz, Caitie McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede, Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann, Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, and Brett Wooldridge.

Several more people have been invaluable to the writing of this book by reviewing drafts and providing feedback. For these contributions I am particularly indebted to Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj, David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard, Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski, Phil Potter, Hamid Ramazani, Sam Stokes, and Ben Summers. Of course, I take all responsibility for any remaining errors or unpalatable opinions in this book.

For helping this book become real, and for their patience with my slow writing and unusual requests, I am grateful to my editors Marie Beaugureau, Mike Loukides, Ann Spencer, and all the team at O’Reilly. For helping find the right words, I thank Rachel Head. For giving me the time and freedom to write in spite of other work commitments, I thank Alastair Beresford, Susan Goodhue, Neha Narkhede, and Kevin Scott.

Very special thanks are due to Shabbir Diwan and Edie Freedman, who illustrated with great care the maps that accompany the chapters. It’s wonderful that they took on the unconventional idea of creating maps, and made them so beautiful and compelling.

Finally, my love goes to my family and friends, without whom I would not have been able to get through this writing process that has taken almost four years. You’re the best.

================================================
FILE: content/en/toc.md
================================================
---
title: "Table of Contents"
linkTitle: "Table of Contents"
weight: 10
breadcrumbs: false
---


![](/title.jpg)


## [Preface](/en/preface)
- [Who Should Read This Book?](/en/preface#who-should-read-this-book)
- [Scope of This Book](/en/preface#scope-of-this-book)
- [Outline of This Book](/en/preface#outline-of-this-book)
- [References and Further Reading](/en/preface#references-and-further-reading)
- [O'Reilly Safari](/en/preface#oreilly-safari)
- [How to Contact Us](/en/preface#how-to-contact-us)
- [Acknowledgments](/en/preface#acknowledgments)

## [1. Trade-offs in Data Systems Architecture](/en/ch1)
- [Analytical versus Operational Systems](/en/ch1#sec_introduction_analytics)
    - [Characterizing Transaction Processing and Analytics](/en/ch1#sec_introduction_oltp)
    - [Data Warehousing](/en/ch1#sec_introduction_dwh)
        - [From data warehouse to data lake](/en/ch1#from-data-warehouse-to-data-lake)
        - [Beyond the data lake](/en/ch1#beyond-the-data-lake)
    - [Systems of Record and Derived Data](/en/ch1#sec_introduction_derived)
- [Cloud versus Self-Hosting](/en/ch1#sec_introduction_cloud)
    - [Pros and Cons of Cloud Services](/en/ch1#sec_introduction_cloud_tradeoffs)
    - [Cloud-Native System Architecture](/en/ch1#sec_introduction_cloud_native)
        - [Layering of cloud services](/en/ch1#layering-of-cloud-services)
        - [Separation of storage and compute](/en/ch1#sec_introduction_storage_compute)
    - [Operations in the Cloud Era](/en/ch1#sec_introduction_operations)
- [Distributed versus Single-Node Systems](/en/ch1#sec_introduction_distributed)
    - [Problems with Distributed Systems](/en/ch1#sec_introduction_dist_sys_problems)
    - [Microservices and Serverless](/en/ch1#sec_introduction_microservices)
    - [Cloud Computing versus Supercomputing](/en/ch1#id17)
- [Data Systems, Law, and Society](/en/ch1#sec_introduction_compliance)
- [Summary](/en/ch1#summary)
    - [References](/en/ch1#references)

## [2. Defining Nonfunctional Requirements](/en/ch2)
- [Case Study: Social Network Home Timelines](/en/ch2#sec_introduction_twitter)
    - [Representing Users, Posts, and Follows](/en/ch2#id20)
    - [Materializing and Updating Timelines](/en/ch2#sec_introduction_materializing)
- [Describing Performance](/en/ch2#sec_introduction_percentiles)
    - [Latency and Response Time](/en/ch2#id23)
    - [Average, Median, and Percentiles](/en/ch2#id24)
    - [Use of Response Time Metrics](/en/ch2#sec_introduction_slo_sla)
- [Reliability and Fault Tolerance](/en/ch2#sec_introduction_reliability)
    - [Fault Tolerance](/en/ch2#id27)
    - [Hardware and Software Faults](/en/ch2#sec_introduction_hardware_faults)
        - [Tolerating hardware faults through redundancy](/en/ch2#tolerating-hardware-faults-through-redundancy)
        - [Software faults](/en/ch2#software-faults)
    - [Humans and Reliability](/en/ch2#id31)
- [Scalability](/en/ch2#sec_introduction_scalability)
    - [Describing Load](/en/ch2#id33)
    - [Shared-Memory, Shared-Disk, and Shared-Nothing Architecture](/en/ch2#sec_introduction_shared_nothing)
    - [Principles for Scalability](/en/ch2#id35)
- [Maintainability](/en/ch2#sec_introduction_maintainability)
    - [Operability: Making Life Easy for Operations](/en/ch2#id37)
    - [Simplicity: Managing Complexity](/en/ch2#id38)
    - [Evolvability: Making Change Easy](/en/ch2#sec_introduction_evolvability)
- [Summary](/en/ch2#summary)
    - [References](/en/ch2#references)

## [3. Data Models and Query Languages](/en/ch3)
- [Relational Model versus Document Model](/en/ch3#sec_datamodels_history)
    - [The Object-Relational Mismatch](/en/ch3#sec_datamodels_document)
        - [Object-relational mapping (ORM)](/en/ch3#object-relational-mapping-orm)
        - [The document data model for one-to-many relationships](/en/ch3#the-document-data-model-for-one-to-many-relationships)
    - [Normalization, Denormalization, and Joins](/en/ch3#sec_datamodels_normalization)
        - [Trade-offs of normalization](/en/ch3#trade-offs-of-normalization)
        - [Denormalization in the social networking case study](/en/ch3#denormalization-in-the-social-networking-case-study)
    - [Many-to-One and Many-to-Many Relationships](/en/ch3#sec_datamodels_many_to_many)
    - [Stars and Snowflakes: Schemas for Analytics](/en/ch3#sec_datamodels_analytics)
    - [When to Use Which Model](/en/ch3#sec_datamodels_document_summary)
        - [Schema flexibility in the document model](/en/ch3#sec_datamodels_schema_flexibility)
        - [Data locality for reads and writes](/en/ch3#sec_datamodels_document_locality)
        - [Query languages for documents](/en/ch3#query-languages-for-documents)
        - [Convergence of document and relational databases](/en/ch3#convergence-of-document-and-relational-databases)
- [Graph-Like Data Models](/en/ch3#sec_datamodels_graph)
    - [Property Graphs](/en/ch3#id56)
    - [The Cypher Query Language](/en/ch3#id57)
    - [Graph Queries in SQL](/en/ch3#id58)
    - [Triple-Stores and SPARQL](/en/ch3#id59)
        - [The RDF data model](/en/ch3#the-rdf-data-model)
        - [The SPARQL query language](/en/ch3#the-sparql-query-language)
    - [Datalog: Recursive Relational Queries](/en/ch3#id62)
    - [GraphQL](/en/ch3#id63)
- [Event Sourcing and CQRS](/en/ch3#sec_datamodels_events)
- [Dataframes, Matrices, and Arrays](/en/ch3#sec_datamodels_dataframes)
- [Summary](/en/ch3#summary)
    - [References](/en/ch3#references)

## [4. Storage and Retrieval](/en/ch4)
- [Storage and Indexing for OLTP](/en/ch4#sec_storage_oltp)
    - [Log-Structured Storage](/en/ch4#sec_storage_log_structured)
        - [The SSTable file format](/en/ch4#the-sstable-file-format)
        - [Constructing and merging SSTables](/en/ch4#constructing-and-merging-sstables)
        - [Bloom filters](/en/ch4#bloom-filters)
        - [Compaction strategies](/en/ch4#sec_storage_lsm_compaction)
    - [B-Trees](/en/ch4#sec_storage_b_trees)
        - [Making B-trees reliable](/en/ch4#sec_storage_btree_wal)
        - [B-tree variants](/en/ch4#b-tree-variants)
    - [Comparing B-Trees and LSM-Trees](/en/ch4#sec_storage_btree_lsm_comparison)
        - [Read performance](/en/ch4#read-performance)
        - [Sequential vs. random writes](/en/ch4#sidebar_sequential)
        - [Write amplification](/en/ch4#write-amplification)
        - [Disk space usage](/en/ch4#disk-space-usage)
    - [Multi-Column and Secondary Indexes](/en/ch4#sec_storage_index_multicolumn)
        - [Storing values within the index](/en/ch4#sec_storage_index_heap)
    - [Keeping everything in memory](/en/ch4#sec_storage_inmemory)
- [Data Storage for Analytics](/en/ch4#sec_storage_analytics)
    - [Cloud Data Warehouses](/en/ch4#sec_cloud_data_warehouses)
    - [Column-Oriented Storage](/en/ch4#sec_storage_column)
        - [Column Compression](/en/ch4#sec_storage_column_compression)
        - [Sort Order in Column Storage](/en/ch4#sort-order-in-column-storage)
        - [Writing to Column-Oriented Storage](/en/ch4#writing-to-column-oriented-storage)
    - [Query Execution: Compilation and Vectorization](/en/ch4#sec_storage_vectorized)
    - [Materialized Views and Data Cubes](/en/ch4#sec_storage_materialized_views)
- [Multidimensional and Full-Text Indexes](/en/ch4#sec_storage_multidimensional)
    - [Full-Text Search](/en/ch4#sec_storage_full_text)
    - [Vector Embeddings](/en/ch4#id92)
- [Summary](/en/ch4#summary)
    - [References](/en/ch4#references)

## [5. Encoding and Evolution](/en/ch5)
- [Formats for Encoding Data](/en/ch5#sec_encoding_formats)
    - [Language-Specific Formats](/en/ch5#id96)
    - [JSON, XML, and Binary Variants](/en/ch5#sec_encoding_json)
        - [JSON Schema](/en/ch5#json-schema)
        - [Binary encoding](/en/ch5#binary-encoding)
    - [Protocol Buffers](/en/ch5#sec_encoding_protobuf)
        - [Field tags and schema evolution](/en/ch5#field-tags-and-schema-evolution)
    - [Avro](/en/ch5#sec_encoding_avro)
        - [The writer’s schema and the reader’s schema](/en/ch5#the-writers-schema-and-the-readers-schema)
        - [Schema evolution rules](/en/ch5#schema-evolution-rules)
        - [But what is the writer’s schema?](/en/ch5#but-what-is-the-writers-schema)
        - [Dynamically generated schemas](/en/ch5#dynamically-generated-schemas)
    - [The Merits of Schemas](/en/ch5#sec_encoding_schemas)
- [Modes of Dataflow](/en/ch5#sec_encoding_dataflow)
    - [Dataflow Through Databases](/en/ch5#sec_encoding_dataflow_db)
        - [Different values written at different times](/en/ch5#different-values-written-at-different-times)
        - [Archival storage](/en/ch5#archival-storage)
    - [Dataflow Through Services: REST and RPC](/en/ch5#sec_encoding_dataflow_rpc)
        - [Web services](/en/ch5#sec_web_services)
        - [The problems with remote procedure calls (RPCs)](/en/ch5#sec_problems_with_rpc)
        - [Load balancers, service discovery, and service meshes](/en/ch5#sec_encoding_service_discovery)
        - [Data encoding and evolution for RPC](/en/ch5#data-encoding-and-evolution-for-rpc)
    - [Durable Execution and Workflows](/en/ch5#sec_encoding_dataflow_workflows)
        - [Durable execution](/en/ch5#durable-execution)
    - [Event-Driven Architectures](/en/ch5#sec_encoding_dataflow_msg)
        - [Message brokers](/en/ch5#message-brokers)
        - [Distributed actor frameworks](/en/ch5#distributed-actor-frameworks)
- [Summary](/en/ch5#summary)
    - [References](/en/ch5#references)

## [6. Replication](/en/ch6)
- [Single-Leader Replication](/en/ch6#sec_replication_leader)
    - [Synchronous Versus Asynchronous Replication](/en/ch6#sec_replication_sync_async)
    - [Setting Up New Followers](/en/ch6#sec_replication_new_replica)
    - [Handling Node Outages](/en/ch6#sec_replication_failover)
        - [Follower failure: Catch-up recovery](/en/ch6#follower-failure-catch-up-recovery)
        - [Leader failure: Failover](/en/ch6#leader-failure-failover)
    - [Implementation of Replication Logs](/en/ch6#sec_replication_implementation)
        - [Statement-based replication](/en/ch6#statement-based-replication)
        - [Write-ahead log (WAL) shipping](/en/ch6#write-ahead-log-wal-shipping)
        - [Logical (row-based) log replication](/en/ch6#logical-row-based-log-replication)
- [Problems with Replication Lag](/en/ch6#sec_replication_lag)
    - [Reading Your Own Writes](/en/ch6#sec_replication_ryw)
    - [Monotonic Reads](/en/ch6#sec_replication_monotonic_reads)
    - [Consistent Prefix Reads](/en/ch6#sec_replication_consistent_prefix)
    - [Solutions for Replication Lag](/en/ch6#id131)
- [Multi-Leader Replication](/en/ch6#sec_replication_multi_leader)
    - [Geographically Distributed Operation](/en/ch6#sec_replication_multi_dc)
        - [Multi-leader replication topologies](/en/ch6#sec_replication_topologies)
        - [Problems with different topologies](/en/ch6#problems-with-different-topologies)
    - [Sync Engines and Local-First Software](/en/ch6#sec_replication_offline_clients)
        - [Real-time collaboration, offline-first, and local-first apps](/en/ch6#real-time-collaboration-offline-first-and-local-first-apps)
        - [Pros and cons of sync engines](/en/ch6#pros-and-cons-of-sync-engines)
    - [Dealing with Conflicting Writes](/en/ch6#sec_replication_write_conflicts)
        - [Conflict avoidance](/en/ch6#conflict-avoidance)
        - [Last write wins (discarding concurrent writes)](/en/ch6#sec_replication_lww)
        - [Manual conflict resolution](/en/ch6#manual-conflict-resolution)
        - [Automatic conflict resolution](/en/ch6#automatic-conflict-resolution)
    - [CRDTs and Operational Transformation](/en/ch6#sec_replication_crdts)
        - [What is a conflict?](/en/ch6#what-is-a-conflict)
- [Leaderless Replication](/en/ch6#sec_replication_leaderless)
    - [Writing to the Database When a Node Is Down](/en/ch6#id287)
        - [Catching up on missed writes](/en/ch6#sec_replication_read_repair)
        - [Quorums for reading and writing](/en/ch6#sec_replication_quorum_condition)
    - [Limitations of Quorum Consistency](/en/ch6#sec_replication_quorum_limitations)
        - [Monitoring staleness](/en/ch6#monitoring-staleness)
    - [Single-Leader vs. Leaderless Replication Performance](/en/ch6#sec_replication_leaderless_perf)
        - [Multi-region operation](/en/ch6#multi-region-operation)
    - [Detecting Concurrent Writes](/en/ch6#sec_replication_concurrent)
        - [The “happens-before” relation and concurrency](/en/ch6#sec_replication_happens_before)
        - [Capturing the happens-before relationship](/en/ch6#capturing-the-happens-before-relationship)
        - [Version vectors](/en/ch6#version-vectors)
- [Summary](/en/ch6#summary)
    - [References](/en/ch6#references)

## [7. Sharding](/en/ch7)
- [Pros and Cons of Sharding](/en/ch7#sec_sharding_reasons)
    - [Sharding for Multitenancy](/en/ch7#sec_sharding_multitenancy)
- [Sharding of Key-Value Data](/en/ch7#sec_sharding_key_value)
    - [Sharding by Key Range](/en/ch7#sec_sharding_key_range)
        - [Rebalancing key-range sharded data](/en/ch7#rebalancing-key-range-sharded-data)
    - [Sharding by Hash of Key](/en/ch7#sec_sharding_hash)
        - [Hash modulo number of nodes](/en/ch7#hash-modulo-number-of-nodes)
        - [Fixed number of shards](/en/ch7#fixed-number-of-shards)
        - [Sharding by hash range](/en/ch7#sharding-by-hash-range)
        - [Consistent hashing](/en/ch7#sec_sharding_consistent_hashing)
    - [Skewed Workloads and Relieving Hot Spots](/en/ch7#sec_sharding_skew)
    - [Operations: Automatic or Manual Rebalancing](/en/ch7#sec_sharding_operations)
- [Request Routing](/en/ch7#sec_sharding_routing)
- [Sharding and Secondary Indexes](/en/ch7#sec_sharding_secondary_indexes)
    - [Local Secondary Indexes](/en/ch7#id166)
    - [Global Secondary Indexes](/en/ch7#id167)
- [Summary](/en/ch7#summary)
    - [References](/en/ch7#references)

## [8. Transactions](/en/ch8)
- [What Exactly Is a Transaction?](/en/ch8#sec_transactions_overview)
    - [The Meaning of ACID](/en/ch8#sec_transactions_acid)
        - [Atomicity](/en/ch8#sec_transactions_acid_atomicity)
        - [Consistency](/en/ch8#sec_transactions_acid_consistency)
        - [Isolation](/en/ch8#sec_transactions_acid_isolation)
        - [Durability](/en/ch8#durability)
    - [Single-Object and Multi-Object Operations](/en/ch8#sec_transactions_multi_object)
        - [Single-object writes](/en/ch8#sec_transactions_single_object)
        - [The need for multi-object transactions](/en/ch8#sec_transactions_need)
        - [Handling errors and aborts](/en/ch8#handling-errors-and-aborts)
- [Weak Isolation Levels](/en/ch8#sec_transactions_isolation_levels)
    - [Read Committed](/en/ch8#sec_transactions_read_committed)
        - [No dirty reads](/en/ch8#no-dirty-reads)
        - [No dirty writes](/en/ch8#sec_transactions_dirty_write)
        - [Implementing read committed](/en/ch8#sec_transactions_read_committed_impl)
    - [Snapshot Isolation and Repeatable Read](/en/ch8#sec_transactions_snapshot_isolation)
        - [Multi-version concurrency control (MVCC)](/en/ch8#sec_transactions_snapshot_impl)
        - [Visibility rules for observing a consistent snapshot](/en/ch8#sec_transactions_mvcc_visibility)
        - [Indexes and snapshot isolation](/en/ch8#indexes-and-snapshot-isolation)
        - [Snapshot isolation, repeatable read, and naming confusion](/en/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - [Preventing Lost Updates](/en/ch8#sec_transactions_lost_update)
        - [Atomic write operations](/en/ch8#atomic-write-operations)
        - [Explicit locking](/en/ch8#explicit-locking)
        - [Automatically detecting lost updates](/en/ch8#automatically-detecting-lost-updates)
        - [Conditional writes (compare-and-set)](/en/ch8#sec_transactions_compare_and_set)
        - [Conflict resolution and replication](/en/ch8#conflict-resolution-and-replication)
    - [Write Skew and Phantoms](/en/ch8#sec_transactions_write_skew)
        - [Characterizing write skew](/en/ch8#characterizing-write-skew)
        - [More examples of write skew](/en/ch8#more-examples-of-write-skew)
        - [Phantoms causing write skew](/en/ch8#sec_transactions_phantom)
        - [Materializing conflicts](/en/ch8#materializing-conflicts)
- [Serializability](/en/ch8#sec_transactions_serializability)
    - [Actual Serial Execution](/en/ch8#sec_transactions_serial)
        - [Encapsulating transactions in stored procedures](/en/ch8#encapsulating-transactions-in-stored-procedures)
        - [Pros and cons of stored procedures](/en/ch8#sec_transactions_stored_proc_tradeoffs)
        - [Sharding](/en/ch8#sharding)
        - [Summary of serial execution](/en/ch8#summary-of-serial-execution)
    - [Two-Phase Locking (2PL)](/en/ch8#sec_transactions_2pl)
        - [Implementation of two-phase locking](/en/ch8#implementation-of-two-phase-locking)
        - [Performance of two-phase locking](/en/ch8#performance-of-two-phase-locking)
        - [Predicate locks](/en/ch8#predicate-locks)
        - [Index-range locks](/en/ch8#sec_transactions_2pl_range)
    - [Serializable Snapshot Isolation (SSI)](/en/ch8#sec_transactions_ssi)
        - [Pessimistic versus optimistic concurrency control](/en/ch8#pessimistic-versus-optimistic-concurrency-control)
        - [Decisions based on an outdated premise](/en/ch8#decisions-based-on-an-outdated-premise)
        - [Detecting stale MVCC reads](/en/ch8#detecting-stale-mvcc-reads)
        - [Detecting writes that affect prior reads](/en/ch8#sec_detecting_writes_affect_reads)
        - [Performance of serializable snapshot isolation](/en/ch8#performance-of-serializable-snapshot-isolation)
- [Distributed Transactions](/en/ch8#sec_transactions_distributed)
    - [Two-Phase Commit (2PC)](/en/ch8#sec_transactions_2pc)
        - [A system of promises](/en/ch8#a-system-of-promises)
        - [Coordinator failure](/en/ch8#coordinator-failure)
        - [Three-phase commit](/en/ch8#three-phase-commit)
    - [Distributed Transactions Across Different Systems](/en/ch8#sec_transactions_xa)
        - [Exactly-once message processing](/en/ch8#sec_transactions_exactly_once)
        - [XA transactions](/en/ch8#xa-transactions)
        - [Holding locks while in doubt](/en/ch8#holding-locks-while-in-doubt)
        - [Recovering from coordinator failure](/en/ch8#recovering-from-coordinator-failure)
        - [Problems with XA transactions](/en/ch8#problems-with-xa-transactions)
    - [Database-internal Distributed Transactions](/en/ch8#sec_transactions_internal)
        - [Exactly-once message processing revisited](/en/ch8#exactly-once-message-processing-revisited)
- [Summary](/en/ch8#summary)
    - [References](/en/ch8#references)

## [9. The Trouble with Distributed Systems](/en/ch9)
- [Faults and Partial Failures](/en/ch9#sec_distributed_partial_failure)
- [Unreliable Networks](/en/ch9#sec_distributed_networks)
    - [The Limitations of TCP](/en/ch9#sec_distributed_tcp)
    - [Network Faults in Practice](/en/ch9#sec_distributed_network_faults)
    - [Detecting Faults](/en/ch9#id307)
    - [Timeouts and Unbounded Delays](/en/ch9#sec_distributed_queueing)
        - [Network congestion and queueing](/en/ch9#network-congestion-and-queueing)
    - [Synchronous Versus Asynchronous Networks](/en/ch9#sec_distributed_sync_networks)
        - [Can we not simply make network delays predictable?](/en/ch9#can-we-not-simply-make-network-delays-predictable)
- [Unreliable Clocks](/en/ch9#sec_distributed_clocks)
    - [Monotonic Versus Time-of-Day Clocks](/en/ch9#sec_distributed_monotonic_timeofday)
        - [Time-of-day clocks](/en/ch9#time-of-day-clocks)
        - [Monotonic clocks](/en/ch9#monotonic-clocks)
    - [Clock Synchronization and Accuracy](/en/ch9#sec_distributed_clock_accuracy)
    - [Relying on Synchronized Clocks](/en/ch9#sec_distributed_clocks_relying)
        - [Timestamps for ordering events](/en/ch9#sec_distributed_lww)
        - [Clock readings with a confidence interval](/en/ch9#clock-readings-with-a-confidence-interval)
        - [Synchronized clocks for global snapshots](/en/ch9#sec_distributed_spanner)
    - [Process Pauses](/en/ch9#sec_distributed_clocks_pauses)
        - [Response time guarantees](/en/ch9#sec_distributed_clocks_realtime)
        - [Limiting the impact of garbage collection](/en/ch9#sec_distributed_gc_impact)
- [Knowledge, Truth, and Lies](/en/ch9#sec_distributed_truth)
    - [The Majority Rules](/en/ch9#sec_distributed_majority)
    - [Distributed Locks and Leases](/en/ch9#sec_distributed_lock_fencing)
        - [Fencing off zombies and delayed requests](/en/ch9#sec_distributed_fencing_tokens)
        - [Fencing with multiple replicas](/en/ch9#fencing-with-multiple-replicas)
    - [Byzantine Faults](/en/ch9#sec_distributed_byzantine)
        - [Weak forms of lying](/en/ch9#weak-forms-of-lying)
    - [System Model and Reality](/en/ch9#sec_distributed_system_model)
        - [Defining the correctness of an algorithm](/en/ch9#defining-the-correctness-of-an-algorithm)
        - [Safety and liveness](/en/ch9#sec_distributed_safety_liveness)
        - [Mapping system models to the real world](/en/ch9#mapping-system-models-to-the-real-world)
    - [Formal Methods and Randomized Testing](/en/ch9#sec_distributed_formal)
        - [Model checking and specification languages](/en/ch9#model-checking-and-specification-languages)
        - [Fault injection](/en/ch9#sec_fault_injection)
        - [Deterministic simulation testing](/en/ch9#deterministic-simulation-testing)
- [Summary](/en/ch9#summary)
    - [References](/en/ch9#references)

## [10. Consistency and Consensus](/en/ch10)
- [Linearizability](/en/ch10#sec_consistency_linearizability)
    - [What Makes a System Linearizable?](/en/ch10#sec_consistency_lin_definition)
    - [Relying on Linearizability](/en/ch10#sec_consistency_linearizability_usage)
        - [Locking and leader election](/en/ch10#locking-and-leader-election)
        - [Constraints and uniqueness guarantees](/en/ch10#sec_consistency_uniqueness)
        - [Cross-channel timing dependencies](/en/ch10#cross-channel-timing-dependencies)
    - [Implementing Linearizable Systems](/en/ch10#sec_consistency_implementing_linearizable)
        - [Linearizability and quorums](/en/ch10#sec_consistency_quorum_linearizable)
    - [The Cost of Linearizability](/en/ch10#sec_linearizability_cost)
        - [The CAP theorem](/en/ch10#the-cap-theorem)
        - [Linearizability and network delays](/en/ch10#linearizability-and-network-delays)
- [ID Generators and Logical Clocks](/en/ch10#sec_consistency_logical)
    - [Logical Clocks](/en/ch10#sec_consistency_timestamps)
        - [Lamport timestamps](/en/ch10#lamport-timestamps)
        - [Hybrid logical clocks](/en/ch10#hybrid-logical-clocks)
        - [Lamport/hybrid logical clocks vs. vector clocks](/en/ch10#lamporthybrid-logical-clocks-vs-vector-clocks)
    - [Linearizable ID Generators](/en/ch10#sec_consistency_linearizable_id)
        - [Implementing a linearizable ID generator](/en/ch10#implementing-a-linearizable-id-generator)
        - [Enforcing constraints using logical clocks](/en/ch10#enforcing-constraints-using-logical-clocks)
- [Consensus](/en/ch10#sec_consistency_consensus)
    - [The Many Faces of Consensus](/en/ch10#sec_consistency_faces)
        - [Single-value consensus](/en/ch10#single-value-consensus)
        - [Compare-and-set as consensus](/en/ch10#compare-and-set-as-consensus)
        - [Shared logs as consensus](/en/ch10#sec_consistency_shared_logs)
        - [Fetch-and-add as consensus](/en/ch10#fetch-and-add-as-consensus)
        - [Atomic commitment as consensus](/en/ch10#atomic-commitment-as-consensus)
    - [Consensus in Practice](/en/ch10#sec_consistency_total_order)
        - [Using shared logs](/en/ch10#sec_consistency_smr)
        - [From single-leader replication to consensus](/en/ch10#from-single-leader-replication-to-consensus)
        - [Subtleties of consensus](/en/ch10#subtleties-of-consensus)
        - [Pros and cons of consensus](/en/ch10#pros-and-cons-of-consensus)
    - [Coordination Services](/en/ch10#sec_consistency_coordination)
        - [Allocating work to nodes](/en/ch10#allocating-work-to-nodes)
        - [Service discovery](/en/ch10#service-discovery)
- [Summary](/en/ch10#summary)
    - [References](/en/ch10#references)

## [11. Batch Processing](/en/ch11)
- [……](/en/ch11#)
- [Summary](/en/ch11#id292)
    - [References](/en/ch11#references)

## [12. Stream Processing](/en/ch12)
- [……](/en/ch12#)
- [Summary](/en/ch12#id332)
    - [References](/en/ch12#references)

## [13. A Philosophy of Streaming Systems](/en/ch13)
- [……](/en/ch13#)
- [Summary](/en/ch13#id367)
    - [References](/en/ch13#references)

## [14. Doing the Right Thing](/en/ch14)
- [……](/en/ch14#)
- [Summary](/en/ch14#id594)
    - [References](/en/ch14#references)

## [Glossary](/en/glossary)

## [Colophon](/en/colophon)
- [About the Author](/en/colophon#about-the-author)
- [Colophon](/en/colophon#colophon)


================================================
FILE: content/tw/_index.md
================================================
---
title: 設計資料密集型應用（第二版）
linkTitle: DDIA
cascade:
  type: docs
breadcrumbs: false
---


**作者**： [Martin Kleppmann](https://martin.kleppmann.com)，[《Designing Data-Intensive Applications 2nd Edition》](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html) ： 英國劍橋大學分散式系統研究員，演講者，博主和開源貢獻者，軟體工程師和企業家，曾在 LinkedIn 和 Rapportive 負責資料基礎架構。

**譯者**：[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
[**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 600](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，公眾號：《老馮雲數》，資料庫 KOL。

**校訂**： [@yingang](https://github.com/yingang)  ｜  [繁體中文](/tw) **版本維護** by  [@afunTW](https://github.com/afunTW) ｜ [完整貢獻者列表](/tw/contrib)

> [!NOTE]
> **DDIA 第二版** 正在翻譯中 ([`main`](https://github.com/Vonng/ddia/tree/main) 分支)，歡迎加入並提出您的寶貴意見！[點選此處閱覽第一版](/v1_tw)。


> [!TIP] 預覽版讀者須知
> 預覽版電子書允許你在作者寫作時就能獲得最原始、未經編輯的內容 —— 這樣你就能在這些技術正式釋出之前很久就用上它們。
> 如果你想積極參與審閱和評論這份草稿，請在 GitHub 上聯絡。本書的 GitHub 倉庫是 [ept/ddia2-feedback](https://github.com/ept/ddia2-feedback)，中文翻譯版的倉庫是 [Vonng/ddia](https://github.com/Vonng/ddia)。


## 譯序

> 不懂資料庫的全棧工程師不是好架構師 —— 馮若航 / Vonng

現今，尤其是在網際網路領域，大多數應用都屬於資料密集型應用。本書從底層資料結構到頂層架構設計，將資料系統設計中的精髓娓娓道來。其中的寶貴經驗無論是對架構師、DBA、還是後端工程師、甚至產品經理都會有幫助。

這是一本理論結合實踐的書，書中很多問題，譯者在實際場景中都曾遇到過，讀來讓人擊節扼腕。如果能早點讀到這本書，該少走多少彎路啊！

這也是一本深入淺出的書，講述概念的來龍去脈而不是賣弄定義，介紹事物發展演化歷程而不是事實堆砌，將複雜的概念講述得淺顯易懂，但又直擊本質不失深度。每章最後的引用質量非常好，是深入學習各個主題的絕佳索引。

本書為資料系統的設計、實現、與評價提供了很好的概念框架。讀完並理解本書內容後，讀者可以輕鬆看破大多數的技術忽悠，與技術磚家撕起來虎虎生風。

這是 2017 年譯者讀過最好的一本技術類書籍，這麼好的書沒有中文翻譯，實在是遺憾。某不才，願為先進技術文化的傳播貢獻一份力量。既可以深入學習有趣的技術主題，又可以鍛鍊中英文語言文字功底，何樂而不為？


## 前言

> 在我們的社會中，技術是一種強大的力量。資料、軟體、通訊可以用於壞的方面：不公平的階級固化，損害公民權利，保護既得利益集團。但也可以用於好的方面：讓底層人民發出自己的聲音，讓每個人都擁有機會，避免災難。本書獻給所有將技術用於善途的人們。


> 計算是一種流行文化，流行文化鄙視歷史。流行文化關乎個體身份和參與感，但與合作無關。流行文化活在當下，也與過去和未來無關。我認為大部分（為了錢）編寫程式碼的人就是這樣的，他們不知道自己的文化來自哪裡。
>
>  —— 阿蘭・凱接受 Dobb 博士的雜誌採訪時（2012 年）


## 目錄

### [序言](/tw/preface)

### [第一部分：資料系統基礎](/tw/part-i)

- [1. 資料系統架構中的權衡](/tw/ch1)
- [2. 定義非功能性需求](/tw/ch2)
- [3. 資料模型與查詢語言](/tw/ch3)
- [4. 儲存與檢索](/tw/ch4)
- [5. 編碼與演化](/tw/ch5)

### [第二部分：分散式資料](/tw/part-ii)

- [6. 複製](/tw/ch6)
- [7. 分片](/tw/ch7)
- [8. 事務](/tw/ch8)
- [9. 分散式系統的麻煩](/tw/ch9)
- [10. 一致性與共識](/tw/ch10)

### [第三部分：派生資料](/tw/part-iii)

- [11. 批處理](/tw/ch11)
- [12. 流處理](/tw/ch12)
- [13. 流式系統的哲學](/tw/ch13)
- [14. 將事情做正確](/tw/ch14)
- [術語表](/tw/glossary)
- [索引](/tw/indexes)
- [後記](/tw/colophon)


## 法律宣告

從原作者處得知，已經有簡體中文的翻譯計劃，將於 2018 年末完成。[購買地址](https://search.jd.com/Search?keyword=設計資料密集型應用)

譯者純粹出於 **學習目的** 與 **個人興趣** 翻譯本書，不追求任何經濟利益。

譯者保留對此版本譯文的署名權，其他權利以原作者和出版社的主張為準。

本譯文只供學習研究參考之用，不得公開傳播發行或用於商業用途。有能力閱讀英文書籍者請購買正版支援。


## 貢獻

0. 全文校訂 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章語法標點校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 與[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/tw/part-i)前言，[ch2](/tw/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [詞彙表](/tw/glossary)、[後記](/tw/colophon)關於野豬的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本與轉換指令碼 by [@afunTW](https://github.com/afunTW)
7. 多處翻譯修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)
8. [感謝所有作出貢獻，提出意見的朋友們](/tw/contrib)：

<details>
<summary><a href="https://github.com/Vonng/ddia/pulls">Pull Requests</a> & <a href="https://github.com/Vonng/ddia/issues">Issues</a></summary>

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [386](https://github.com/Vonng/ddia/pull/386)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch2: 最佳化一處翻譯                                                    |
| [384](https://github.com/Vonng/ddia/pull/384)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 最佳化中文文件的措辭和表達                                              |
| [383](https://github.com/Vonng/ddia/pull/383)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 修正 ch4 中的術語和表達錯誤                                          |
| [382](https://github.com/Vonng/ddia/pull/382)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch1: 最佳化一處翻譯                                                    |
| [381](https://github.com/Vonng/ddia/pull/381)   | [@Max-Tortoise](https://github.com/Max-Tortoise)           | ch4: 修正一處術語不完整問題                                               |
| [377](https://github.com/Vonng/ddia/pull/377)   | [@huang06](https://github.com/huang06)                     | 最佳化翻譯術語                                                        |
| [375](https://github.com/Vonng/ddia/issues/375) | [@z-soulx](https://github.com/z-soulx)                     | 對於是否100%全中文翻譯的必要性討論？個人-沒必要100%，特別是“名詞”，有原單詞更加適合it人員                 |
| [371](https://github.com/Vonng/ddia/pull/371)   | [@lewiszlw](https://github.com/lewiszlw)                   | CPU core -> CPU 核心                                          |
| [369](https://github.com/Vonng/ddia/pull/369)   | [@bbwang-gl](https://github.com/bbwang-gl)                 | ch7: 可序列化快照隔離檢測一個事務何時修改另一個事務的讀取                                 |
| [368](https://github.com/Vonng/ddia/pull/368)   | [@yhao3](https://github.com/yhao3)                         | 更新 zh-tw.py 與 zh-tw 內容                                       |
| [367](https://github.com/Vonng/ddia/pull/367)   | [@yhao3](https://github.com/yhao3)                         | 修正拼寫、格式和標點問題                                                  |
| [366](https://github.com/Vonng/ddia/pull/366)   | [@yangshangde](https://github.com/yangshangde)             | ch8: 將“電源失敗”改為“電源失效”                                           |
| [365](https://github.com/Vonng/ddia/pull/365)   | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化“儲存與計算分離”相關翻譯                                           |
| [364](https://github.com/Vonng/ddia/issues/364) | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化“儲存與計算分離”相關翻譯                                           |
| [363](https://github.com/Vonng/ddia/pull/363)   | [@xyohn](https://github.com/xyohn)                         | #362: 最佳化一處翻譯                                                 |
| [362](https://github.com/Vonng/ddia/issues/362) | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化一處翻譯                                                   |
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一處拼寫錯誤                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一處拼寫錯誤                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一處標點錯誤                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一處格式錯誤                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一處參考連結                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正兩處引用錯誤                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支援輸出為 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一處格式錯誤                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一處影像連結                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 最佳化一處翻譯                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 最佳化一處翻譯                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 最佳化兩處翻譯                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 最佳化多處翻譯                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 最佳化一處翻譯                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一處繁體中文錯誤                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一處繁體中文錯誤                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一處翻譯錯誤                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正幾處拼寫錯誤                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 最佳化一處翻譯                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一處翻譯錯誤                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一處翻譯遺漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 最佳化一處翻譯                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 最佳化一處翻譯                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 最佳化一處翻譯                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 最佳化一處翻譯                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正兩處錯誤                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一處列表錯誤                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一處錯別字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一處公式問題                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多處內部連結錯誤                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正內部連結錯誤                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 顯示的問題                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 發現了繁體中文版本中的錯誤翻譯                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 連結                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正錯別字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 統一了 write skew 的翻譯                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 統一了 rebalancing 的翻譯                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻譯                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正譯文中的重複單詞                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不準確的翻譯                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一處翻譯錯誤                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一處拼寫錯誤                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可序列化”相關內容的多處翻譯不當                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重複讀”相關內容的多處翻譯不當                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“讀已提交”相關內容的多處翻譯不當                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁體中文的轉譯錯誤                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁體中文的轉譯錯誤                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通順的翻譯                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通順的翻譯                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正確的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通順的翻譯                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻譯                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正錯誤的圖片連結                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁體中文的轉譯錯誤：複雜                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正導航欄中的章節名稱                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正線性一致的繁體中文翻譯                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正錯誤的翻譯                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 最佳化譯文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通順的翻譯                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不準確的翻譯                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻譯                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正錯別字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小標題跳轉的問題                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的網址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正錯別字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建議docsify的主題風格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻譯錯誤                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁體中文的轉譯錯誤                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支援 Github Pages 裡的公式顯示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 語義網相關翻譯更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不變式相關翻譯更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正確的中文用詞和標點符號                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻譯                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重複的譯文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通順的翻譯                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 發現錯誤的文獻索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正錯誤的標點符號                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正錯誤字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建議將 network model 翻譯為網狀模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正錯誤字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正縮圖的錯別字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修訂sibling相關的翻譯                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一處不準確的翻譯                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 識別了當前簡繁轉譯過程中處理不當的地方，暫透過轉換指令碼規避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻譯`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新殘留的機翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建議去除段首的製表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 發現一處錯誤格式的章節引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章節Summary中多處不通順的翻譯                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多處不通順的或錯誤的翻譯                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 最佳化多處不通順的或錯誤的翻譯                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 最佳化多處不通順的或錯誤的翻譯                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化多處錯誤的或不通順的翻譯                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化一處容易產生歧義的翻譯                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正兩處錯誤的翻譯                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正兩處強調文字和四處程式碼變數名稱                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一處錯誤的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一處錯誤的翻譯（功能 -> 函式）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 最佳化 how best 的翻譯（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 統一每章的標題格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重複詞語                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改語句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 讀已寫入資料                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死還是賴活著                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼殺 → 破壞                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改變" rathr than "蓋面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:鍾-->種，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否則 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->呼叫，顯著->顯著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改連結錯誤                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4幾處翻譯                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 幾個疏漏和格式錯誤                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 刪除一個多餘的右括號                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部連結錯誤                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假簡單"->"更加簡單"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修復 ch1 中的無序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 糾正多處的翻譯小錯誤                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修復多個連結錯誤 2.名詞最佳化修訂 3.錯誤修訂                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修復一些明顯錯誤                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修復連結錯誤                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改詞語順序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正錯別字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 糾正翻譯錯誤                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目錄和本章標題不符的情況                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修復語句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 詳細修改了後記中和印度野豬相關的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻譯                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01語法微調                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |

</details><br />


---------

## 許可證

本專案採用 [CC-BY 4.0](https://github.com/Vonng/ddia/blob/master/LICENSE) 許可證，您可以在這裡找到完整說明：

- [署名 4.0 協議國際版 CC BY 4.0 Deed](https://creativecommons.org/licenses/by/4.0/deed.zh-hans)
- [Attribution 4.0 International CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.en)

================================================
FILE: content/tw/ch1.md
================================================
---
title: "1. 資料系統架構中的權衡"
weight: 101
breadcrumbs: false
---

<a id="ch_tradeoffs"></a>

> *沒有完美的解決方案，只有權衡取捨。[…] 你能做的就是努力獲得最佳的權衡，這就是你所能期望的一切。*
>
> [Thomas Sowell](https://www.youtube.com/watch?v=2YUtKr8-_Fg)，接受 Fred Barnes 採訪（2005）

> [!TIP] 早期讀者注意事項
> 透過 Early Release 電子書，你可以在最早階段讀到作者寫作中的原始、未編輯內容，從而在正式版釋出前儘早使用這些技術。
>
> 這將是最終書籍的第 1 章。本書的 GitHub 倉庫是 <https://github.com/ept/ddia2-feedback>。
> 如果你希望積極參與本草稿的審閱與評論，請在 GitHub 上聯絡。

資料是當今應用開發的核心。隨著 Web 與移動應用、軟體即服務（SaaS）和雲服務普及，把許多不同使用者的資料存放在共享的伺服器端資料基礎設施中，已經成為常態。來自使用者行為、業務交易、裝置與感測器的資料，需要被儲存並可用於分析。使用者每次與應用互動，既會讀取已有資料，也會產生新資料。

當資料量較小、可在單機儲存和處理時，問題往往並不複雜。但隨著資料規模或查詢速率增長，資料必須分佈到多臺機器上，挑戰隨之而來。隨著需求變得更複雜，僅靠單一系統通常已不足夠，你可能需要組合多個具備不同能力的儲存與處理系統。

如果“管理資料”是開發過程中的主要挑戰之一，我們稱這樣的應用為 **資料密集型（data-intensive）** 應用 [^1]。與之對照，在 **計算密集型（compute-intensive）** 系統中，難點是並行化超大規模計算；而在資料密集型應用中，我們更常關心的是：如何儲存與處理海量資料、如何管理資料變化、如何在故障與併發下保持一致性，以及如何讓服務保持高可用。

這類應用通常由若干標準構件搭建而成，每個構件負責一種常見能力。例如，很多應用都需要：

* 儲存資料，以便它們或其他應用程式以後能再次找到（**資料庫**）
* 記住昂貴操作的結果，以加快讀取速度（**快取**）
* 允許使用者按關鍵字搜尋資料或以各種方式過濾資料（**搜尋索引**）
* 一旦事件和資料變更發生就立即處理（**流處理**）
* 定期處理累積的大量資料（**批處理**）

在構建應用時，我們通常會選擇若干軟體系統或服務（例如資料庫或 API），再用應用程式碼把它們拼接起來。如果你的需求恰好落在這些系統的設計邊界內，這並不困難。

但當應用目標更有野心時，問題就會出現。資料庫有很多種，各自特性不同、適用場景也不同，如何選型？快取有多種做法，搜尋索引也有多種構建方式，如何權衡？當單個工具無法獨立完成目標時，如何把多個工具可靠地組合起來？這些都並不簡單。

本書正是用來幫助你做這類決策：該用什麼技術、怎樣組合技術。你會看到，沒有哪種方案在根本上永遠優於另一種；每種方案都有得失。透過本書，你將學會提出正確問題來評估和比較資料系統，從而為你的具體應用找到更合適的方案。

我們將從今天組織內資料的典型使用方式開始。這些思想很多源自 **企業軟體**（即大型組織的軟體需求與工程實踐，例如大公司和政府機構），因為在歷史上，只有這類組織才有足夠大的資料規模，值得投入複雜技術方案。如果你的資料足夠小，電子表格都可能夠用；但近些年，小公司和初創團隊構建資料密集型系統也越來越常見。

資料系統的核心難點之一在於：不同的人需要用同一份資料做完全不同的事。在公司裡，你和你的團隊有自己的優先順序，另一個團隊即使使用同一資料集，目標也可能完全不同。更麻煩的是，這些目標往往並未被明確表達，容易引發誤解和分歧。

為了幫助你瞭解可以做出哪些選擇，本章比較了幾個對比概念，並探討了它們的權衡：

* 事務型系統和分析型系統之間的區別（["分析型與事務型系統"](#sec_introduction_analytics)）；
* 雲服務和自託管系統的利弊（["雲服務與自託管"](#sec_introduction_cloud)）；
* 何時從單節點系統轉向分散式系統（["分散式與單節點系統"](#sec_introduction_distributed)）；以及
* 平衡業務需求和使用者權利（["資料系統、法律與社會"](#sec_introduction_compliance)）。

此外，本章還會引入貫穿全書的關鍵術語。

> [!TIP] 術語：前端和後端
>
> 本書討論的大部分內容都與 **後端開發** 相關。對 Web 應用而言，執行在瀏覽器中的客戶端程式碼稱為 **前端**，處理使用者請求的伺服器端程式碼稱為 **後端**。移動應用也類似前端：它們提供使用者介面，通常經由網際網路與伺服器端後端通訊。前端有時會在裝置本地管理資料 [^2]，但更棘手的資料基礎設施問題通常發生在後端：前端只處理單個使用者的資料，而後端需要代表 **所有** 使用者管理資料。

後端服務通常透過 HTTP（有時是 WebSocket）提供訪問。其核心是應用程式碼：在一個或多個資料庫中讀寫資料，並按需接入快取、訊息佇列等其他系統（可統稱為 **資料基礎設施**）。應用程式碼往往是 **無狀態** 的：處理完一個 HTTP 請求後，不保留該請求上下文。因此，凡是需要跨請求持久化的資訊，都必須寫在客戶端，或寫入伺服器端資料基礎設施。


## 分析型與事務型系統 {#sec_introduction_analytics}

如果你在企業中從事資料系統工作，往往會遇到幾類不同的資料使用者。第一類是 **後端工程師**，他們構建服務來處理讀取與更新資料的請求；這些服務通常直接面向外部使用者，或透過其他服務間接提供能力（參見["微服務與無伺服器"](#sec_introduction_microservices)）。有時服務也只供組織內部使用。

除了管理後端服務的團隊外，通常還有兩類人需要訪問組織的資料：**業務分析師**，他們生成關於組織活動的報告，以幫助管理層做出更好的決策（**商業智慧** 或 **BI**）；以及 **資料科學家**，他們在資料中尋找新的見解，或建立由資料分析和機器學習（AI）支援的面向使用者的產品功能（例如，電子商務網站上的“購買了 X 的人也購買了 Y”推薦、風險評分或垃圾郵件過濾等預測分析，以及搜尋結果排名）。

儘管業務分析師和資料科學家傾向於使用不同的工具並以不同的方式操作，但他們有一些共同點：兩者都執行 **分析**，這意味著他們檢視使用者和後端服務生成的資料，但他們通常不修改這些資料（除了可能修復錯誤）。他們可能建立派生資料集，其中原始資料已經以某種方式處理過。這導致了兩種型別系統之間的分離——我們將在本書中使用這種區別：

* **事務型系統** 由後端服務和資料基礎設施組成，資料在這裡被建立，例如在為外部使用者提供服務的過程中產生。在這裡，應用程式程式碼會根據使用者執行的操作讀取和修改其資料庫中的資料。
* **分析型系統** 服務於業務分析師和資料科學家的需求。它們包含來自事務型系統的只讀資料副本，並針對分析所需的資料處理型別進行了最佳化。

正如我們將在下一節中看到的，事務型系統和分析型系統通常出於充分的理由而保持分離。隨著這些系統的成熟，出現了兩個新的專業角色：**資料工程師** 和 **分析工程師**。資料工程師是知道如何整合事務型系統和分析型系統的人，並更廣泛地負責組織的資料基礎設施 [^3]。分析工程師對資料進行建模和轉換，使其對組織中的業務分析師和資料科學家更有用 [^4]。

許多工程師只專注於事務型或分析型其中一側。然而，本書會同時覆蓋這兩類資料系統，因為它們都在組織內的資料生命週期中扮演關鍵角色。我們將深入討論向內外部使用者提供服務所需的資料基礎設施，幫助你更好地與“另一側”的同事協作。

### 事務處理與分析的特徵 {#sec_introduction_oltp}

在商業資料處理的早期，對資料庫的寫入通常對應於發生的 **商業交易（commercial transaction）**：進行銷售、向供應商下訂單、支付員工工資等。隨著資料庫擴展到不涉及金錢交換的領域，**事務（transaction）** 這個術語仍然保留了下來，指的是形成邏輯單元的一組讀取和寫入。

> [!NOTE]
> [第 8 章](/tw/ch8#ch_transactions) 詳細探討了我們所說的事務的含義。本章鬆散地使用該術語來指代低延遲的讀取和寫入。

儘管資料庫開始用於許多不同型別的資料——社交媒體上的帖子、遊戲中的移動、地址簿中的聯絡人等等——但是基本的訪問模式仍然類似於處理商業交易。事務型系統通常透過某個鍵查詢少量記錄（這稱為 **點查詢**），並根據使用者的輸入插入、更新或刪除記錄。因為這些應用程式是互動式的，這種訪問模式被稱為 **聯機事務處理**（OLTP）。

然而，資料庫也越來越多地用於分析，與 OLTP 相比，分析具有非常不同的訪問模式。通常，分析查詢會掃描大量記錄，並計算聚合統計資訊（如計數、求和或平均值），而不是將單個記錄返回給使用者。例如，連鎖超市的業務分析師可能想要回答以下分析查詢：

* 我們每家商店在一月份的總收入是多少？
* 在我們最近的促銷期間，我們比平時多賣出了多少香蕉？
* 哪個品牌的嬰兒食品最常與 X 品牌尿布一起購買？

這些型別的查詢產生的報告對商業智慧很重要，可以幫助管理層決定下一步做什麼。為了將這種使用資料庫的模式與事務處理區分開來，它被稱為 **聯機分析處理**（OLAP）[^5]。OLTP 和分析之間的區別並不總是很明確，但[表 1-1](#tab_oltp_vs_olap) 列出了一些典型特徵。

{{< figure id="tab_oltp_vs_olap" title="表 1-1. 事務型系統和分析型系統特徵比較" class="w-full my-4" >}}

| 屬性            | 事務型系統（OLTP）                      | 分析型系統（OLAP）                 |
|-----------------|----------------------------------------|-----------------------------------|
| 主要讀取模式    | 點查詢（透過鍵獲取單個記錄）            | 對大量記錄進行聚合                 |
| 主要寫入模式    | 建立、更新和刪除單個記錄                | 批次匯入（ETL）或事件流            |
| 人類使用者示例    | Web 或移動應用程式的終端使用者              | 內部分析師，用於決策支援           |
| 機器使用示例    | 檢查操作是否被授權                      | 檢測欺詐/濫用模式                  |
| 查詢型別        | 固定的查詢集，由應用程式預定義          | 分析師可以進行任意查詢             |
| 資料代表        | 資料的最新狀態（當前時間點）            | 隨時間發生的事件歷史               |
| 資料集大小      | GB 到 TB                                | TB 到 PB                           |

> [!NOTE]
> OLAP 中 **聯機（online）** 的含義不明確；它大概是指查詢不僅用於預定義的報告，分析師還會互動式地使用 OLAP 系統來進行探索性查詢。

在事務型系統中，通常不允許使用者構建自定義 SQL 查詢並在資料庫上執行它們，因為這可能會允許他們讀取或修改他們沒有許可權訪問的資料。此外，他們可能編寫執行成本高昂的查詢，從而影響其他使用者的資料庫效能。出於這些原因，OLTP 系統主要執行嵌入到應用程式程式碼中的固定查詢集，只偶爾使用一次性的自定義查詢來進行維護或故障排除。另一方面，分析資料庫通常讓使用者可以自由地手動編寫任意 SQL 查詢，或使用 Tableau、Looker 或 Microsoft Power BI 等資料視覺化或儀表板工具自動生成查詢。

還有一種型別的系統是為分析型的工作負載（對許多記錄進行聚合的查詢）設計的，但嵌入到面向使用者的產品中。這一類別被稱為 **產品分析** 或 **即時分析**，為這種用途設計的系統包括 Pinot、Druid 和 ClickHouse [^6]。

### 資料倉庫 {#sec_introduction_dwh}

起初，相同的資料庫既用於事務處理，也用於分析查詢。SQL 在這方面相當靈活：它對兩種型別的查詢都很有效。然而，在 20 世紀 80 年代末和 90 年代初，企業有停止使用其 OLTP 系統進行分析目的的趨勢，轉而在單獨的資料庫系統上執行分析。這個單獨的資料庫被稱為 **資料倉庫**。

一家大型企業可能有幾十個甚至上百個聯機事務處理系統：為面向客戶的網站提供動力的系統、控制實體店中的銷售點（收銀臺）系統、跟蹤倉庫中的庫存、規劃車輛路線、管理供應商、管理員工以及執行許多其他任務。這些系統中的每一個都很複雜，需要一個團隊來維護它，因此這些系統最終主要是相互獨立地執行。

出於幾個原因，業務分析師和資料科學家直接查詢這些 OLTP 系統通常是不可取的：

* 感興趣的資料可能分佈在多個事務型系統中，使得在單個查詢中組合這些資料集變得困難（稱為 **資料孤島** 的問題）；
* 適合 OLTP 的模式和資料佈局不太適合分析（參見["星型和雪花型：分析模式"](/tw/ch3#sec_datamodels_analytics)）；
* 分析查詢可能相當昂貴，在 OLTP 資料庫上執行它們會影響其他使用者的效能；以及
* 出於安全或合規原因，OLTP 系統可能位於不允許使用者直接訪問的單獨網路中。

相比之下，**資料倉庫** 是一個單獨的資料庫，分析師可以隨心所欲地查詢，而不會影響 OLTP 操作 [^7]。正如我們將在[第 4 章](/tw/ch4#ch_storage)中看到的，資料倉庫通常以與 OLTP 資料庫非常不同的方式儲存資料，以最佳化分析中常見的查詢型別。

資料倉庫包含公司中所有各種 OLTP 系統中資料的只讀副本。資料從 OLTP 資料庫中提取（使用定期資料轉儲或連續更新流），轉換為分析友好的模式，進行清理，然後載入到資料倉庫中。這種將資料匯入資料倉庫的過程稱為 **提取-轉換-載入**（ETL），如[圖 1-1](#fig_dwh_etl) 所示。有時 **轉換** 和 **載入** 步驟的順序會互換（即，先載入，再在資料倉庫中進行轉換），從而產生 **ELT**。

{{< figure src="/fig/ddia_0101.png" id="fig_dwh_etl" caption="圖 1-1. ETL 到資料倉庫的簡化概述。" class="w-full my-4" >}}

在某些情況下，ETL 過程的資料來源是外部 SaaS 產品，如客戶關係管理（CRM）、電子郵件營銷或信用卡處理系統。在這些情況下，你無法直接訪問原始資料庫，因為它只能透過軟體供應商的 API 訪問。將這些外部系統的資料匯入你自己的資料倉庫可以實現透過 SaaS API 無法實現的分析。SaaS API 的 ETL 通常由專門的資料聯結器服務（如 Fivetran、Singer 或 AirByte）實現。

一些資料庫系統提供 **混合事務/分析處理**（HTAP），目標是在單個系統中同時支援 OLTP 和分析，而無需從一個系統 ETL 到另一個系統 [^8] [^9]。然而，許多 HTAP 系統內部由一個 OLTP 系統與一個單獨的分析系統耦合組成，隱藏在公共介面後面——因此兩者之間的區別對於理解這些系統如何工作仍然很重要。

此外，儘管 HTAP 已出現，但由於目標和約束不同，事務型系統與分析型系統分離仍很常見。尤其是，讓每個事務型系統擁有自己的資料庫通常被視為良好實踐（參見["微服務與無伺服器"](#sec_introduction_microservices)），這會形成數百個相互獨立的事務型資料庫；與之對應，企業往往只有一個統一的資料倉庫，以便分析師能在單個查詢裡組合多個事務型系統的資料。

因此，HTAP 不會取代資料倉庫。相反，它在同一應用程式既需要執行掃描大量行的分析查詢，又需要以低延遲讀取和更新單個記錄的場景中很有用。例如，欺詐檢測可能涉及此類工作負載 [^10]。

事務型系統和分析型系統之間的分離是更廣泛趨勢的一部分：隨著工作負載變得更加苛刻，系統變得更加專業化並針對特定工作負載進行最佳化。通用系統可以舒適地處理小資料量，但規模越大，系統往往變得越專業化 [^11]。

#### 從資料倉庫到資料湖 {#from-data-warehouse-to-data-lake}

資料倉庫通常使用透過 SQL 進行查詢的 **關係** 資料模型（參見[第 3 章](/tw/ch3#ch_datamodels)），可能使用專門的商業智慧軟體。這個模型很適合業務分析師需要進行的查詢型別，但不太適合資料科學家的需求，他們可能需要執行以下任務：

* 將資料轉換為適合訓練機器學習模型的形式；這通常需要將資料庫表的行和列轉換為稱為 **特徵** 的數值向量或矩陣。以最大化訓練模型效能的方式執行這種轉換的過程稱為 **特徵工程**，它通常需要難以用 SQL 表達的自定義程式碼。
* 獲取文字資料（例如，產品評論）並使用自然語言處理技術嘗試從中提取結構化資訊（例如，作者的情感或他們提到的主題）。同樣，他們可能需要使用計算機視覺技術從照片中提取結構化資訊。

儘管已經有人在努力將機器學習運算元新增到 SQL 資料模型 [^12] 並在關係基礎上構建高效的機器學習系統 [^13]，但許多資料科學家不喜歡在資料倉庫等關係資料庫中工作。相反，許多人更喜歡使用 Python 資料分析庫（如 pandas 和 scikit-learn）、統計分析語言（如 R）和分散式分析框架（如 Spark）[^14]。我們將在["資料框、矩陣和陣列"](/tw/ch3#sec_datamodels_dataframes)中進一步討論這些。

因此，組織面臨著以適合資料科學家使用的形式提供資料的需求。答案是 **資料湖**：一個集中的資料儲存庫，儲存任何可能對分析有用的資料副本，透過 ETL 過程從事務型系統獲得。與資料倉庫的區別在於，資料湖只是包含檔案，而不強制任何特定的檔案格式或資料模型。資料湖中的檔案可能是資料庫記錄的集合，使用 Avro 或 Parquet 等檔案格式編碼（參見[第 5 章](/tw/ch5#ch_encoding)），但它們同樣可以包含文字、影像、影片、感測器讀數、稀疏矩陣、特徵向量、基因組序列或任何其他型別的資料 [^15]。除了更靈活之外，這通常也比關係資料儲存更便宜，因為資料湖可以使用商品化的檔案儲存，如物件儲存（參見["雲原生系統架構"](#sec_introduction_cloud_native)）。

ETL 過程已經泛化為 **資料管道**，在某些情況下，資料湖已成為從事務型系統到資料倉庫路徑上的中間站。資料湖包含事務型系統產生的“原始”形式的資料，沒有轉換為關係資料倉庫模式。這種方法的優勢在於，每個資料消費者都可以將原始資料轉換為最適合其需求的形式。它被稱為 **壽司原則**：“原始資料更好”[^16]。

除了從資料湖載入資料到單獨的資料倉庫之外，還可以直接在資料湖中的檔案上執行典型的資料倉庫工作負載（SQL 查詢和業務分析），以及資料科學和機器學習的工作負載。這種架構被稱為 **資料湖倉**，它需要一個查詢執行引擎和一個元資料（例如，模式管理）層來擴充套件資料湖的檔案儲存 [^17]。

Apache Hive、Spark SQL、Presto 和 Trino 是這種方法的例子。

#### 超越資料湖 {#beyond-the-data-lake}

隨著分析實踐的成熟，組織越來越重視分析系統與資料管道的管理和運維，這一點在 DataOps 宣言中已有體現 [^18]。其中一部分是治理、隱私以及對 GDPR、CCPA 等法規的遵從；我們會在["資料系統、法律與社會"](#sec_introduction_compliance)和["立法與行業自律"](/tw/ch14#sec_future_legislation)中討論。

此外，分析資料的提供形式也越來越多樣：不僅有檔案和關係表，也有事件流（見[第 12 章](/tw/ch12#ch_stream)）。基於檔案的分析通常透過週期性重跑（例如每天一次）來響應資料變化，而流處理能夠讓分析系統在秒級響應事件。對於時效性要求高的場景，這種方式很有價值，例如識別並阻斷潛在的欺詐或濫用行為。

在某些場景中，分析系統的輸出還會迴流到事務型系統（這一過程有時稱為 **反向 ETL** [^19]）。例如，在分析系統裡訓練出的機器學習模型會部署到生產環境，為終端使用者生成“買了 X 的人也買了 Y”這類推薦。此類分析系統的投產結果也稱為 **資料產品** [^20]。機器學習模型可藉助 TFX、Kubeflow、MLflow 等專用工具部署到事務型系統。

### 記錄系統與派生資料 {#sec_introduction_derived}

與事務型系統和分析型系統的區分相關，本書還區分 **記錄系統** 與 **派生資料系統**。這組術語有助於你理清資料在系統中的流向：

權威記錄系統
:   記錄系統，也稱 **真相來源（權威資料來源）**，儲存某類資料的權威（canonical）版本。新資料進入系統時（例如使用者輸入）首先寫入這裡。每個事實只表示一次（這種表示通常是 **正規化** 的；見["正規化、反正規化與連線"](/tw/ch3#sec_datamodels_normalization)）。如果其他系統與記錄系統不一致，則按定義以記錄系統為準。

派生資料系統
:   派生系統中的資料，是對其他系統中已有資料進行轉換或處理後的結果。如果派生資料丟失，可以從原始資料來源重新構建。經典例子是快取：命中時由快取返回，未命中時回退到底層資料庫。反正規化值、索引、物化檢視、變換後的資料表示，以及在資料集上訓練出的模型，都屬於這一類。

從技術上說，派生資料是 **冗餘** 的，因為它複製了已有資訊。但它往往是讀查詢高效能的關鍵。你可以從同一個源資料派生出多個資料集，以不同“視角”觀察同一份事實。

分析系統通常屬於派生資料系統，因為它消費的是別處產生的資料。事務型服務往往同時包含記錄系統和派生資料系統：前者是資料首先寫入的主資料庫，後者則是用於加速常見讀取操作的索引與快取，尤其針對記錄系統難以高效回答的查詢。

大多數資料庫、儲存引擎和查詢語言本身並不天然屬於“記錄系統”或“派生系統”。資料庫只是工具，關鍵在於你如何使用它。兩者的區別不在工具本身，而在應用中的職責劃分。只要明確“哪些資料由哪些資料派生而來”，原本混亂的系統架構就會清晰很多。

當一個系統的資料由另一個系統的資料派生而來時，你需要在記錄系統原始資料變化時同步更新派生資料。不幸的是，很多資料庫預設假設應用只依賴單一資料庫，並不擅長在多系統之間傳播這類更新。在["資料整合"](/tw/ch13#sec_future_integration)中，我們會討論如何組合多個資料系統，實現單一系統難以獨立完成的能力。

至此，我們結束了對分析與事務處理的比較。下一節將討論另一組常被反覆爭論的權衡。


## 雲服務與自託管 {#sec_introduction_cloud}

對於組織需要做的任何事情，首要問題之一是：應該在內部完成，還是應該外包？應該自建還是購買？

歸根結底，這是一個關於業務優先順序的問題。公認的管理智慧是，作為組織核心競爭力或競爭優勢的事物應該在內部完成，而非核心、例行或常見的事物應該留給供應商 [^21]。
舉一個極端的例子，大多數公司不會自己發電（除非他們是能源公司，而且不考慮緊急備用電源），因為從電網購買電力更便宜。

對於軟體，需要做出的兩個重要決定是誰構建軟體和誰部署它。有一系列可能性，每個決定都在不同程度上外包，如[圖 1-2](#fig_cloud_spectrum) 所示。
一個極端是你自己編寫並在內部執行的定製軟體；另一個極端是廣泛使用的雲服務或軟體即服務（SaaS）產品，由外部供應商實施和運營，你只能透過 Web 介面或 API 訪問。

{{< figure src="/fig/ddia_0102.png" id="fig_cloud_spectrum" caption="圖 1-2. 軟體型別及其運維的範圍。" class="w-full my-4" >}}

中間地帶是你 **自託管** 的現成軟體（開源或商業），即自己部署——例如，如果你下載 MySQL 並將其安裝在你控制的伺服器上。
這可能在你自己的硬體上（通常稱為 **本地部署**，即使伺服器實際上在租用的資料中心機架中而不是字面上在你自己的場所），
或者在雲中的虛擬機器上（**基礎設施即服務** 或 IaaS）。沿著這個範圍還有更多的點，例如，採用開源軟體並執行其修改版本。

與這個範圍分開的還有 **如何** 部署服務的問題，無論是在雲中還是在本地——例如，是否使用 Kubernetes 等編排框架。
然而，部署工具的選擇超出了本書的範圍，因為其他因素對資料系統的架構有更大的影響。

### 雲服務的利弊 {#sec_introduction_cloud_tradeoffs}

使用雲服務而不是自己執行對應的軟體，本質上是將該軟體的運維外包給雲提供商。
使用雲服務有充分的支援和反對理由。雲提供商聲稱，使用他們的服務可以節省你的時間和金錢，並相比自建基礎設施讓你更敏捷。

雲服務實際上是否比自託管更便宜、更容易，很大程度上取決於你的技能和系統的工作負載。
如果你已經有設定和運維所需系統的經驗，並且你的負載相當可預測（即，你需要的機器數量不會劇烈波動），
那麼購買自己的機器並自己在上面執行軟體通常更便宜 [^22] [^23]。

另一方面，如果你需要一個你還不知道如何部署和運維的系統，那麼採用雲服務通常比學習自己管理系統更容易、更快。
如果你必須專門僱用和培訓員工來維護和運營系統，那可能會變得非常昂貴。
使用雲時你仍然需要一個運維團隊（參見["雲時代的運維"](#sec_introduction_operations)），但外包基本的系統管理可以讓你的團隊專注於更高層次的問題。

當你將系統的運維外包給專門運維該服務的公司時，可能會帶來更好的服務，因為供應商在向許多客戶提供服務中獲得了專業運維知識。
另一方面，如果你自己運維服務，你可以配置和調整它，以專門針對你特定的工作負載進行最佳化，而雲服務不太可能願意替你進行此類定製。

如果你的系統負載隨時間變化很大，雲服務特別有價值。如果你配置機器以能夠處理峰值負載，但這些計算資源大部分時間都處於空閒狀態，系統就變得不太具有成本效益。
在這種情況下，雲服務的優勢在於它們可以更容易地根據需求變化向上或向下擴充套件你的計算資源。

例如，分析系統通常具有極其可變的負載：快速執行大型分析查詢需要並行使用大量計算資源，但一旦查詢完成，這些資源就會處於空閒狀態，直到使用者進行下一個查詢。
預定義的查詢（例如，每日報告）可以排隊和排程以平滑負載，但對於互動式查詢，你越希望它們完成得快，工作負載就變得越可變。
如果你的資料集如此之大，以至於快速查詢需要大量的計算資源，使用雲可以節省資金，因為你可以將未使用的資源返回給供應商，而不是讓它們閒置。對於較小的資料集，這種差異不太顯著。

雲服務的最大缺點是你無法控制它：

* 如果它缺少你需要的功能，你所能做的就是禮貌地詢問供應商是否會新增它；你通常無法自己實現它。
* 如果服務宕機，你所能做的就是等它恢復。
* 如果你以觸發錯誤或導致效能問題的方式使用服務，你將很難診斷問題。對於你自己執行的軟體，你可以從作業系統獲取效能指標和除錯資訊來幫助你理解其行為，你可以檢視伺服器日誌，但對於供應商託管的服務，你通常無法訪問這些內部資訊。
* 此外，如果服務關閉或變得無法接受的昂貴，或者如果供應商決定以你不喜歡的方式更改他們的產品，你就受制於他們 —— 繼續執行舊版本的軟體通常不是一個可行選項，所以你將被迫遷移到替代服務 [^24]。
  如果有暴露相容 API 的替代服務，這種風險會得到緩解，但對於許多雲服務，沒有標準 API，這增加了切換成本，使供應商鎖定成為一個問題。
* 雲供應商需要被信任以保持資料安全，這可能會使遵守隱私和安全法規的過程複雜化。

儘管有所有這些風險，組織在雲服務之上構建新應用程式或採用混合方法（在系統的某些部分使用雲服務）變得越來越流行。然而，雲服務不會取代所有內部資料系統：許多較舊的系統早於雲，對於任何具有現有雲服務無法滿足的專業要求的服務，內部系統仍然是必要的。例如，對延遲非常敏感的應用程式（如高頻交易）需要對硬體的完全控制。

### 雲原生系統架構 {#sec_introduction_cloud_native}

除了具有不同的經濟模型（訂閱服務而不是購買硬體和許可軟體在其上執行）之外，雲的興起也對資料系統在技術層面的實現產生了深遠的影響。
術語 **雲原生** 用於描述旨在利用雲服務的架構。

原則上，幾乎任何可自託管的軟體都可以做成雲服務；事實上，許多主流資料系統都已有託管版本。
不過，從零設計為雲原生的系統已經展示出若干優勢：同等硬體下效能更好、故障恢復更快、能更快按負載擴縮計算資源，並支援更大資料集 [^25] [^26] [^27]。[表 1-2](#tab_cloud_native_dbs) 給出兩類系統的一些示例。

{{< figure id="tab_cloud_native_dbs" title="表 1-2. 自託管與雲原生資料庫系統示例" class="w-full my-4" >}}

| 類別              | 自託管系統                  | 雲原生系統                                                            |
|------------------|----------------------------|----------------------------------------------------------------------|
| 事務型/OLTP      | MySQL、PostgreSQL、MongoDB  | AWS Aurora [^25]、Azure SQL DB Hyperscale [^26]、Google Cloud Spanner |
| 分析型/OLAP      | Teradata、ClickHouse、Spark | Snowflake [^27]、Google BigQuery、Azure Synapse Analytics             |

#### 雲服務的分層 {#layering-of-cloud-services}

許多自託管資料系統的系統要求非常簡單：它們在傳統作業系統（如 Linux 或 Windows）上執行，將資料儲存為檔案系統上的檔案，並透過 TCP/IP 等標準網路協議進行通訊。
少數系統依賴於特殊硬體，如 GPU（用於機器學習）或 RDMA 網路介面，但總的來說，自託管軟體傾向於使用非常通用的計算資源：CPU、RAM、檔案系統和 IP 網路。

在雲中，這種型別的軟體可以在基礎設施即服務（IaaS）環境中執行，使用一個或多個虛擬機器（或 **例項**），分配一定的 CPU、記憶體、磁碟和網路頻寬。
與物理機器相比，雲例項可以更快地配置，並且有更多種類的大小，但除此之外，它們與傳統計算機類似：你可以在上面執行任何你喜歡的軟體，但你負責自己管理它。

相比之下，雲原生服務的關鍵思想是不僅使用由作業系統管理的計算資源，還基於較低級別的雲服務構建更高級別的服務。例如：

* 使用 **物件儲存** 服務（如 Amazon S3、Azure Blob Storage 和 Cloudflare R2）儲存大檔案。它們提供比典型檔案系統更有限的 API（基本檔案讀寫），但它們的優勢在於隱藏了底層物理機器：服務自動將資料分佈在許多機器上，因此你不必擔心任何一臺機器上的磁碟空間用完。即使某些機器或其磁碟完全故障，也不會丟失資料。
* 在物件儲存和其他雲服務之上建立更多的服務：例如，Snowflake 是一個基於雲的分析資料庫（資料倉庫），依賴於 S3 進行資料儲存 [^27]，而一些其他服務反過來建立在 Snowflake 之上。

與計算中的抽象一樣，沒有一個正確的答案告訴你應該使用什麼。作為一般規則，更高級別的抽象往往更面向特定的用例。如果你的需求與為其設計更高級別系統的情況相匹配，使用現有的高級別系統可能會比自己從較低級別系統構建更輕鬆，且更能滿足你的需求。另一方面，如果沒有滿足你需求的高階系統，那麼從較低級別的元件自己構建它是唯一的選擇。

#### 儲存與計算的分離 {#sec_introduction_storage_compute}

在傳統計算中，磁碟儲存被認為是持久的（我們假設一旦某些東西被寫入磁碟，它就不會丟失）。為了容忍單個硬碟的故障，通常使用 RAID（獨立磁碟冗餘陣列）在連線到同一臺機器的幾個磁碟上維護資料副本。RAID 可以在硬體中執行，也可以由作業系統在軟體中執行，它對訪問檔案系統的應用程式是透明的。

在雲中，計算例項（虛擬機器）也可能有本地磁碟連線，但雲原生系統通常將這些磁碟更多地視為臨時快取，而不是長期儲存。這是因為如果關聯的例項出現故障，或者為了適應負載變化而將例項替換為更大或更小的例項（在不同的物理機器上），本地磁碟就會變得不可訪問。

作為本地磁碟的替代方案，雲服務還提供可以從一個例項分離並附加到另一個例項的虛擬磁碟儲存（Amazon EBS、Azure 託管磁碟和 Google Cloud 中的持久磁碟）。這種虛擬磁碟實際上不是物理磁碟，而是由一組單獨的機器提供的雲服務，它模擬磁碟的行為（**塊裝置**，其中每個塊通常為 4 KiB 大小）。這項技術使得在雲中執行傳統的基於磁碟的軟體成為可能，但塊裝置模擬所引入的開銷在一開始就為雲設計的系統中是可以避免的 [^25]。它還使應用程式對網路故障非常敏感，因為虛擬塊裝置上的每個 I/O 實際上都是網路呼叫 [^28]。

為了解決這個問題，雲原生服務通常避免使用虛擬磁碟，而是建立在針對特定工作負載最佳化的專用儲存服務之上。物件儲存服務（如 S3）設計用於長期儲存相當大的檔案，大小從數百 KB 到幾 GB 不等。資料庫中儲存的單個行或值通常比這小得多；因此，雲資料庫通常在單獨的服務中管理較小的值，並將較大的資料塊（包含許多單個值）儲存在物件儲存中 [^26] [^29]。我們將在[第 4 章](/tw/ch4#ch_storage)中看到這樣做的方法。

在傳統的系統架構中，同一臺計算機負責儲存（磁碟）和計算（CPU 和 RAM），但在雲原生系統中，這兩個職責已經在某種程度上分離或 **解耦** [^9] [^27] [^30] [^31]：例如，S3 只儲存檔案，如果你想分析該資料，你必須在 S3 之外的某個地方執行分析程式碼。這意味著透過網路傳輸資料，我們將在["分散式與單節點系統"](#sec_introduction_distributed)中進一步討論。

此外，雲原生系統通常是 **多租戶** 的，這意味著不是每個客戶都有一臺單獨的機器，而是來自幾個不同客戶的資料和計算由同一服務在同一共享硬體上處理 [^32]。

多租戶可以實現更好的硬體利用率、更容易的可伸縮性和雲提供商更容易的管理，但它也需要仔細的工程設計，以確保一個客戶的活動不會影響其他客戶的系統的效能或安全性 [^33]。

### 雲時代的運維 {#sec_introduction_operations}

傳統上，管理組織伺服器端資料基礎設施的人員被稱為 **資料庫管理員**（DBA）或 **系統管理員**（sysadmins）。最近，許多組織已經嘗試將軟體開發和運維的角色整合到團隊中，共同負責後端服務和資料基礎設施；**DevOps** 理念引導了這一趨勢。**站點可靠性工程師**（SRE）是 Google 對這個想法的實現 [^34]。

運維的作用是確保服務可靠地交付給使用者（包括配置基礎設施和部署應用程式），並確保穩定的生產環境（包括監控和診斷可能影響可靠性的任何問題）。對於自託管系統，運維傳統上涉及大量在單個機器級別的工作，例如容量規劃（例如，監控可用磁碟空間並在空間用完之前新增更多磁碟）、配置新機器、將服務從一臺機器移動到另一臺機器，以及安裝作業系統補丁。

許多雲服務提供了 API 來隱藏實際實現服務的單個機器。例如，雲端儲存用 **計量計費** 替換固定大小的磁碟，你可以儲存資料而無需提前規劃容量需求，然後根據實際使用的空間收費。此外，即使在單個機器發生故障時，許多雲服務仍能保持高可用性（參見["可靠性與容錯"](/tw/ch2#sec_introduction_reliability)）。

從單個機器到服務的重點轉移伴隨著運維角色的變化。提供可靠服務的高階目標保持不變，但流程和工具已經發展。DevOps/SRE 理念更加強調：

* 自動化——優先考慮可重複的流程而不是手動的一次性工作，
* 優先考慮短暫的虛擬機器和服務而不是長期執行的伺服器，
* 啟用頻繁的應用程式更新，
* 從事故中學習，以及
* 保留組織關於系統的知識，即使組織裡的人員在不斷流動 [^35]。

隨著雲服務的興起，角色出現了分叉：基礎設施公司的運維團隊專門研究向大量客戶提供可靠服務的細節，而服務的客戶在基礎設施上花費盡可能少的時間和精力 [^36]。

雲服務的客戶仍然需要運維，但他們專注於不同的方面，例如為給定任務選擇最合適的服務、將不同服務相互整合，以及從一個服務遷移到另一個服務。即使計量計費消除了傳統意義上的容量規劃需求，瞭解你為哪個目的使用哪些資源仍然很重要，這樣你就不會在不需要的雲資源上浪費金錢：容量規劃變成了財務規劃，效能最佳化變成了成本最佳化 [^37]。

此外，雲服務確實有資源限制或 **配額**（例如你可以同時執行的最大程序數），你需要在遇到它們之前瞭解並規劃這些 [^38]。

採用雲服務可能比執行自己的基礎設施更容易、更快，儘管學習如何使用它也有成本，也許還要解決其限制。隨著越來越多的供應商提供針對不同用例的更廣泛的雲服務，不同服務之間的整合成為一個特別的挑戰 [^39] [^40]。

ETL（參見["資料倉庫"](#sec_introduction_dwh)）只是故事的一部分；面向事務處理的雲服務之間也需要相互整合。目前，缺乏能促進這類整合的標準，因此往往仍要投入大量手工工作。

無法完全外包給雲服務的其他運維方面包括維護應用程式及其使用的庫的安全性、管理你自己的服務之間的互動、監控服務的負載，以及追蹤問題的原因，例如效能下降或中斷。雖然雲正在改變運維的角色，但對運維的需求比以往任何時候都大。


## 分散式與單節點系統 {#sec_introduction_distributed}

涉及多臺機器透過網路通訊的系統稱為 **分散式系統**。參與分散式系統的每個程序稱為 **節點**。你希望採用分散式系統的原因可能有多種：

固有的分散式系統
:   如果應用程式涉及兩個或多個互動使用者，每個使用者使用自己的裝置，那麼系統不可避免地是分散式的：裝置之間的通訊必須透過網路進行。

雲服務之間的請求
:   如果資料儲存在一個服務中但在另一個服務中處理，則必須透過網路從一個服務傳輸到另一個服務。

容錯/高可用性
:   如果你的應用程式需要在一臺機器（或幾臺機器、網路或整個資料中心）發生故障時繼續工作，你可以使用多臺機器為你提供冗餘。當一臺故障時，另一臺可以接管。參見["可靠性與容錯"](/tw/ch2#sec_introduction_reliability)和[第 6 章](/tw/ch6#ch_replication)關於複製的內容。

可伸縮性
:   如果你的資料量或計算需求增長超過單臺機器的處理能力，你可以潛在地將負載分散到多臺機器上。參見["可伸縮性"](/tw/ch2#sec_introduction_scalability)。

延遲
:   如果你在世界各地都有使用者，你可能希望在全球各個地區都有伺服器，以便每個使用者都可以從地理位置接近他們的伺服器獲得服務。這避免了使用者必須等待網路資料包繞地球半圈才能回答他們的請求。參見["描述效能"](/tw/ch2#sec_introduction_percentiles)。

彈性
:   如果你的應用程式在某些時候很忙，在其他時候很空閒，雲部署可以根據需求向上或向下伸縮，因此你只需為實際使用的資源付費。這在單臺機器上更困難，它需要按處理最大負載的情況進行配置，即使在幾乎不使用的時候也是如此。

使用專用硬體
:   系統的不同部分可以利用不同型別的硬體來匹配其工作負載。例如，物件儲存可能使用具有許多磁碟但很少 CPU 的機器，而資料分析系統可能使用具有大量 CPU 和記憶體但沒有磁碟的機器，機器學習系統可能使用具有 GPU 的機器（GPU 在訓練深度神經網路和其他機器學習任務方面比 CPU 效率高得多）。

法律合規
:   一些國家有資料駐留法律，要求其管轄範圍內的人員資料必須在該國地理範圍內儲存和處理 [^41]。這些規則的範圍各不相同——例如，在某些情況下，它僅適用於醫療或金融資料，而其他情況則更廣泛。因此，在幾個這樣的管轄區域中擁有使用者的服務不得不將他們的資料分佈在幾個位置的伺服器上。

可持續性
:   如果你能靈活把控作業執行的地點和時間，你可能能夠在可再生電力充足的時間和地點執行它們，並避免在電網緊張時執行它們。這可以減少你的碳排放，並允許你利用到廉價的電力 [^42] [^43]。

這些原因既適用於你自己編寫的服務（應用程式程式碼），也適用於由現成軟體（如資料庫）組成的服務。

### 分散式系統的問題 {#sec_introduction_dist_sys_problems}

分散式系統也有缺點。透過網路進行的每個請求和 API 呼叫都需要處理失敗的可能性：網路可能中斷，或者服務可能過載或崩潰，因此任何請求都可能超時而沒有收到響應。在這種情況下，我們不知道服務是否收到了請求，簡單地重試它可能不安全。我們將在[第 9 章](/tw/ch9#ch_distributed)中詳細討論這些問題。

儘管資料中心網路很快，但呼叫另一個服務仍然比在同一程序中呼叫函式慢得多 [^44]。

在處理大量資料時，與其將資料從其儲存處傳輸到處理它的單獨機器，將計算帶到已經擁有資料的機器上可能更快 [^45]。

更多的節點並不總是更快：在某些情況下，一個簡單的單執行緒程式在單臺計算機上執行的效能可以比在具有 100 多個 CPU 核心的叢集上更好 [^46]。

對分散式系統進行故障排除通常很困難：如果系統響應緩慢，你如何找出問題所在？**可觀測性** [^47] [^48] 技術可以用來對分散式系統中的問題進行診斷，這涉及到系統執行資料的收集，並提供查詢方式來支援對高層級的指標或單個的事件的分析。**追蹤** 工具（如 OpenTelemetry、Zipkin 和 Jaeger）允許你跟蹤哪個客戶端為哪個操作呼叫了哪個伺服器，以及每次呼叫花費了多長時間 [^49]。

資料庫提供了各種機制來確保資料一致性，正如我們將在[第 6 章](/tw/ch6#ch_replication)和[第 8 章](/tw/ch8#ch_transactions)中看到的。然而，當每個服務都有自己的資料庫時，維護這些不同服務之間的資料一致性就成了應用程式的問題。分散式事務（我們在[第 8 章](/tw/ch8#ch_transactions)中探討）是確保一致性的一種可能技術，但它們在微服務上下文中很少使用，因為它們違背了使服務彼此獨立的目標，而且許多資料庫不支援它們 [^50]。

出於所有這些原因，如果你可以在單臺機器上做某件事情，與搭建分散式系統相比通常要簡單得多，成本也更低 [^23] [^46] [^51]。CPU、記憶體和磁碟已經變得更大、更快、更可靠。當與 DuckDB、SQLite 和 KùzuDB 等單節點資料庫結合使用時，許多工作負載現在可以在單個節點上執行。我們將在[第 4 章](/tw/ch4#ch_storage)中進一步探討這個主題。

### 微服務與無伺服器 {#sec_introduction_microservices}

在多臺機器上分佈系統的最常見方式是將它們分為客戶端和伺服器，並讓客戶端向伺服器發出請求。最常見的是使用 HTTP 進行此通訊，正如我們將在["流經服務的資料流：REST 和 RPC"](/tw/ch5#sec_encoding_dataflow_rpc)中討論的。同一程序可能既是伺服器（處理傳入請求）又是客戶端（向其他服務發出出站請求）。

這種構建應用程式的方式傳統上被稱為 **面向服務的體系結構**（SOA）；最近，這個想法已經被細化為 **微服務** 架構 [^52] [^53]。在這種架構中，服務有一個明確定義的目的（例如，對於 S3 來說，這個目的是檔案儲存）；每個服務公開一個可以由客戶端透過網路呼叫的 API，每個服務有一個負責其維護的團隊。因此，複雜的應用程式可以分解為多個互動服務，每個服務由單獨的團隊管理。

將複雜的軟體分解為多個服務有幾個優點：每個服務可以獨立更新，減少團隊之間的協調工作；每個服務可以分配它需要的硬體資源；透過將實現細節隱藏在 API 後面，服務所有者可以自由地更改實現而不影響客戶端。在資料儲存方面，每個服務通常有自己的資料庫，而不在服務之間共享資料庫：共享資料庫實際上會使整個資料庫結構成為服務 API 的一部分，然後該結構將很難更改。共享資料庫還可能導致一個服務的查詢對其他服務的效能產生負面影響。

另一方面，擁有許多服務本身可能會帶來複雜性：每個服務都需要用於部署新版本、調整分配的硬體資源以匹配負載、收集日誌、監控服務健康狀況以及在出現問題時向值班工程師發出警報的基礎設施。**編排** 框架（如 Kubernetes）已成為部署服務的流行方式，因為它們為這種基礎設施提供了基礎。在開發期間測試服務可能很複雜，因為你還需要執行它所依賴的所有其他服務。

微服務 API 的演進可能具有挑戰性。呼叫 API 的客戶端期望 API 具有某些欄位。開發人員可能希望根據業務需求的變化向 API 新增或刪除欄位，但這樣做可能會導致客戶端失敗。更糟糕的是，這種失敗通常直到開發週期的後期才被發現，當更新的服務 API 部署到預生產或生產環境時。API 描述標準（如 OpenAPI 和 gRPC）有助於管理客戶端和伺服器 API 之間的關係；我們將在[第 5 章](/tw/ch5#ch_encoding)中進一步討論這些。

微服務主要是人員問題的技術解決方案：允許不同的團隊獨立取得進展，而無需相互協調。這在大公司中很有價值，但在沒有很多團隊的小公司中，使用微服務可能是不必要的開銷，最好以最簡單的方式實現應用程式 [^52]。

**無伺服器（Serverless）**，或 **函式即服務**（FaaS），是另一種部署方式：基礎設施管理進一步外包給雲廠商 [^33]。使用虛擬機器時，你需要顯式決定何時啟動、何時關閉例項；而在無伺服器模型中，雲廠商會根據進入服務的請求自動分配和回收計算資源 [^54]。這種部署方式把更多運維負擔轉移給雲廠商，並支援按使用量計費，而不是按例項計費。為實現這些優勢，許多無伺服器平臺會限制函式執行時長、限制執行時環境，並在函式首次呼叫時出現較慢冷啟動。術語“無伺服器”本身也容易誤導：每次函式執行依然執行在某臺伺服器上，只是後續執行未必在同一臺機器上。此外，BigQuery 及多種 Kafka 產品也採用“Serverless”術語，強調其服務可自動擴縮容且按使用量計費。

就像雲端儲存以計量計費取代了傳統容量規劃（預先決定買多少磁碟）一樣，無伺服器模式把同樣的計費邏輯帶到了程式碼執行層：你只為程式碼實際執行的時間付費，而不必預先準備固定資源。

### 雲計算與超級計算 {#id17}

雲計算不是構建大規模計算系統的唯一方式；另一種選擇是 **高效能計算**（HPC），也稱為 **超級計算**。儘管有重疊，但與雲計算和企業資料中心系統相比，HPC 通常有不同的設計考量並使用不同的技術。其中一些差異是：

* 超級計算機通常用於計算密集型科學計算任務，例如天氣預報、氣候建模、分子動力學（模擬原子和分子的運動）、複雜的最佳化問題和求解偏微分方程。另一方面，雲計算往往用於線上服務、業務資料系統和需要以高可用性為使用者請求提供服務的類似系統。
* 超級計算機通常執行大型批處理作業，定期將其計算狀態檢查點儲存到磁碟。如果節點發生故障，常見的解決方案是簡單地停止整個叢集工作負載，修復故障節點，然後從最後一個檢查點重新啟動計算 [^55] [^56]。對於雲服務，通常不希望停止整個叢集，因為服務需要以最小的中斷持續為使用者提供服務。
* 超級計算機節點通常透過共享記憶體和遠端直接記憶體訪問（RDMA）進行通訊，這支援高頻寬和低延遲，但假設系統使用者之間有高度的信任 [^57]。在雲計算中，網路和機器通常由相互不信任的組織共享，需要更強的安全機制，如資源隔離（例如虛擬機器）、加密和身份驗證。
* 雲資料中心網路通常基於 IP 和乙太網，以 Clos 拓撲排列以提供高對分頻寬——這是網路整體效能的常用度量 [^55] [^58]。超級計算機通常使用專門的網路拓撲，例如多維網格和環面 [^59]，這能讓具有已知通訊模式的 HPC 工作負載產生更好的效能。
* 雲計算允許節點分佈在多個地理區域，而超級計算機通常假設它們的所有節點都靠近在一起。

大規模分析系統有時與超級計算共享一些特徵，如果你在這個領域工作，瞭解這些技術可能是值得的。然而，本書主要關注需要持續可用的服務，如["可靠性與容錯"](/tw/ch2#sec_introduction_reliability)中所討論的。

## 資料系統、法律與社會 {#sec_introduction_compliance}

到目前為止，你已經在本章中看到，資料系統的架構不僅受到技術目標和要求的影響，還受到它們所支援的組織的人力需求的影響。越來越多的資料系統工程師認識到，僅服務於自己企業的需求是不夠的：我們還對整個社會負有責任。

一個特別的關注點是儲存有關人員及其行為資料的系統。自 2018 年以來，**通用資料保護條例**（GDPR）賦予了許多歐洲國家居民對其個人資料更大的控制權和法律權利，類似的隱私法規已在世界各地的各個國家和州採用，例如加州消費者隱私法（CCPA）。關於 AI 的法規，例如 **歐盟 AI 法案**，對個人資料的使用方式施加了進一步的限制。

此外，即使在不直接受法規約束的領域，人們也越來越認識到計算機系統對人和社會的影響。社交媒體改變了個人消費新聞的方式，這影響了他們的政治觀點，因此可能影響選舉結果。自動化系統越來越多地做出對個人產生深遠影響的決策，例如決定誰應該獲得貸款或保險覆蓋，誰應該被邀請參加工作面試，或者誰應該被懷疑犯罪 [^60]。

每個從事此類系統工作的人都有責任考慮道德影響並確保他們遵守相關法律。沒有必要讓每個人都成為法律和道德專家，但對法律和道德原則的基本認識與分散式系統中的一些基礎知識同樣重要。

法律考慮正在影響資料系統設計的基礎 [^61]。例如，GDPR 授予個人在請求時刪除其資料的權利（有時稱為 **被遺忘權**）。然而，正如我們將在本書中看到的，許多資料系統依賴不可變構造（如僅追加日誌）作為其設計的一部分；我們如何確保刪除應該不可變的檔案中間的某些資料？我們如何處理已被納入派生資料集（參見["記錄系統與派生資料"](#sec_introduction_derived)）的資料刪除，例如機器學習模型的訓練資料？回答這些問題會帶來新的工程挑戰。

目前，我們對於哪些特定技術或系統架構應被視為“符合 GDPR”沒有明確的指導方針。法規故意不強制要求特定技術，因為隨著技術的進步，這些技術可能會迅速變化。相反，法律文字規定了需要解釋的高層級原則。這意味著如何遵守隱私法規的問題沒有簡單的答案，但我們將透過這個視角來看待本書中的一些技術。

一般來說，我們儲存資料是因為我們認為其價值大於儲存它的成本。然而，值得記住的是，儲存成本不僅僅是你為 Amazon S3 或其他服務支付的賬單：成本效益計算還應該考慮到如果資料被洩露或被對手入侵的責任和聲譽損害風險，以及如果資料的儲存和處理被發現不符合法律的法律成本和罰款風險 [^51]。

政府或警察部隊也可能迫使公司交出資料。當存在資料可能暴露犯罪行為的風險時（例如，在幾個中東和非洲國家的同性戀，或在幾個美國州尋求墮胎），儲存該資料會為使用者創造真正的安全風險。例如，去墮胎診所的行程很容易被位置資料洩露，甚至可能透過使用者 IP 地址隨時間的日誌（表示大致位置）洩露。

一旦考慮到所有風險，可能合理地決定某些資料根本不值得儲存，因此應該刪除。這個 **資料最小化** 原則（有時以德語術語 **Datensparsamkeit** 為人所知）與“大資料”哲學相反，後者是投機性地儲存大量資料，以防將來有用 [^62]。但它符合 GDPR，該法規要求個人資料只能為指定的、明確的目的收集，這些資料以後不得用於任何其他目的，並且資料不得保留超過收集目的所需的時間 [^63]。

企業也注意到了隱私和安全問題。信用卡公司要求處理支付的企業遵守嚴格的支付卡行業（PCI）標準。處理商需要經常接受獨立審計師的評估，以驗證持續的合規性。軟體供應商也受到了更多的審查。現在許多買家要求他們的供應商遵守服務組織控制（SOC）型別 2 標準。與 PCI 合規性一樣，供應商需要接受第三方審計以驗證遵守情況。

總的來說，關鍵在於平衡業務目標與被收集、被處理資料的人們的權益。這個主題還有很多內容；在[第 14 章](/tw/ch14#ch_right_thing)中，我們會進一步討論倫理與法律合規，以及偏見與歧視等問題。


## 總結 {#summary}

本章的主線是理解“權衡”。對許多問題而言，並不存在唯一正確答案，而是有多種路徑，各有利弊。我們討論了影響資料系統架構的幾個關鍵選擇，並引入了後續章節會反覆使用的術語。

我們首先區分了事務型（事務處理，OLTP）和分析型（OLAP）系統。它們不僅面對不同訪問模式與資料型別，也服務於不同人群。我們還看到資料倉庫與資料湖這兩類體系，它們透過 ETL 接收來自事務型系統的資料。在[第 4 章](/tw/ch4#ch_storage)中，我們會看到由於查詢型別不同，事務型與分析型系統常常採用截然不同的內部資料佈局。

隨後，我們把相對較新的雲服務模式與長期主導資料系統架構的自託管正規化做了比較。哪種方式更具成本效益高度依賴具體情境，但不可否認，雲原生架構正在深刻改變資料系統的構建方式，例如儲存與計算的分離。

雲系統天然是分散式系統，我們也簡要討論了它與單機方案之間的權衡。有些場景無法避免分散式，但如果單機可行，不必急於把系統分散式化。在[第 9 章](/tw/ch9#ch_distributed)中，我們會更深入地討論分散式系統的挑戰。

最後，資料系統架構不僅由企業自身需求決定，也受保護資料主體權利的隱私法規所塑造，而這一點常被工程實踐忽略。如何把法律要求轉化為技術實現，目前仍無標準答案；但在閱讀本書後續內容時，始終帶著這個問題會很重要。

### 參考文獻

[^1]: Richard T. Kouzes, Gordon A. Anderson, Stephen T. Elbert, Ian Gorton, and Deborah K. Gracio. [The Changing Paradigm of Data-Intensive Computing](http://www2.ic.uff.br/~boeres/slides_AP/papers/TheChanginParadigmDataIntensiveComputing_2009.pdf). *IEEE Computer*, volume 42, issue 1, January 2009. [doi:10.1109/MC.2009.26](https://doi.org/10.1109/MC.2009.26)
[^2]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: you own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *2019 ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^3]: Joe Reis and Matt Housley. [*Fundamentals of Data Engineering*](https://www.oreilly.com/library/view/fundamentals-of-data/9781098108298/). O’Reilly Media, 2022. ISBN: 9781098108304
[^4]: Rui Pedro Machado and Helder Russa. [*Analytics Engineering with SQL and dbt*](https://www.oreilly.com/library/view/analytics-engineering-with/9781098142377/). O’Reilly Media, 2023. ISBN: 9781098142384
[^5]: Edgar F. Codd, S. B. Codd, and C. T. Salley. [Providing OLAP to User-Analysts: An IT Mandate](https://www.estgv.ipv.pt/PaginasPessoais/jloureiro/ESI_AID2007_2008/fichas/codd.pdf). E. F. Codd Associates, 1993. Archived at [perma.cc/RKX8-2GEE](https://perma.cc/RKX8-2GEE)
[^6]: Chinmay Soman and Neha Pawar. [Comparing Three Real-Time OLAP Databases: Apache Pinot, Apache Druid, and ClickHouse](https://startree.ai/blog/a-tale-of-three-real-time-olap-databases). *startree.ai*, April 2023. Archived at [perma.cc/8BZP-VWPA](https://perma.cc/8BZP-VWPA)
[^7]: Surajit Chaudhuri and Umeshwar Dayal. [An Overview of Data Warehousing and OLAP Technology](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/sigrecord.pdf). *ACM SIGMOD Record*, volume 26, issue 1, pages 65–74, March 1997. [doi:10.1145/248603.248616](https://doi.org/10.1145/248603.248616)
[^8]: Fatma Özcan, Yuanyuan Tian, and Pinar Tözün. [Hybrid Transactional/Analytical Processing: A Survey](https://humming80.github.io/papers/sigmod-htaptut.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3054784](https://doi.org/10.1145/3035918.3054784)
[^9]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/abs/10.1145/3514221.3526055). At *International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055)
[^10]: Chao Zhang, Guoliang Li, Jintao Zhang, Xinning Zhang, and Jianhua Feng. [HTAP Databases: A Survey](https://arxiv.org/pdf/2404.15670). *IEEE Transactions on Knowledge and Data Engineering*, April 2024. [doi:10.1109/TKDE.2024.3389693](https://doi.org/10.1109/TKDE.2024.3389693)
[^11]: Michael Stonebraker and Uğur Çetintemel. [‘One Size Fits All’: An Idea Whose Time Has Come and Gone](https://pages.cs.wisc.edu/~shivaram/cs744-readings/fits_all.pdf). At *21st International Conference on Data Engineering* (ICDE), April 2005. [doi:10.1109/ICDE.2005.1](https://doi.org/10.1109/ICDE.2005.1)
[^12]: Jeffrey Cohen, Brian Dolan, Mark Dunlap, Joseph M. Hellerstein, and Caleb Welton. [MAD Skills: New Analysis Practices for Big Data](https://www.vldb.org/pvldb/vol2/vldb09-219.pdf). *Proceedings of the VLDB Endowment*, volume 2, issue 2, pages 1481–1492, August 2009. [doi:10.14778/1687553.1687576](https://doi.org/10.14778/1687553.1687576)
[^13]: Dan Olteanu. [The Relational Data Borg is Learning](https://www.vldb.org/pvldb/vol13/p3502-olteanu.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, August 2020. [doi:10.14778/3415478.3415572](https://doi.org/10.14778/3415478.3415572)
[^14]: Matt Bornstein, Martin Casado, and Jennifer Li. [Emerging Architectures for Modern Data Infrastructure: 2020](https://future.a16z.com/emerging-architectures-for-modern-data-infrastructure-2020/). *future.a16z.com*, October 2020. Archived at [perma.cc/LF8W-KDCC](https://perma.cc/LF8W-KDCC)
[^15]: Martin Fowler. [DataLake](https://www.martinfowler.com/bliki/DataLake.html). *martinfowler.com*, February 2015. Archived at [perma.cc/4WKN-CZUK](https://perma.cc/4WKN-CZUK)
[^16]: Bobby Johnson and Joseph Adler. [The Sushi Principle: Raw Data Is Better](https://learning.oreilly.com/videos/strata-hadoop/9781491924143/9781491924143-video210840/). At *Strata+Hadoop World*, February 2015.
[^17]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^18]: DataKitchen, Inc. [The DataOps Manifesto](https://dataopsmanifesto.org/en/). *dataopsmanifesto.org*, 2017. Archived at [perma.cc/3F5N-FUQ4](https://perma.cc/3F5N-FUQ4)
[^19]: Tejas Manohar. [What is Reverse ETL: A Definition & Why It’s Taking Off](https://hightouch.io/blog/reverse-etl/). *hightouch.io*, November 2021. Archived at [perma.cc/A7TN-GLYJ](https://perma.cc/A7TN-GLYJ)
[^20]: Simon O’Regan. [Designing Data Products](https://towardsdatascience.com/designing-data-products-b6b93edf3d23). *towardsdatascience.com*, August 2018. Archived at [perma.cc/HU67-3RV8](https://perma.cc/HU67-3RV8)
[^21]: Camille Fournier. [Why is it so hard to decide to buy?](https://skamille.medium.com/why-is-it-so-hard-to-decide-to-buy-d86fee98e88e) *skamille.medium.com*, July 2021. Archived at [perma.cc/6VSG-HQ5X](https://perma.cc/6VSG-HQ5X)
[^22]: David Heinemeier Hansson. [Why we’re leaving the cloud](https://world.hey.com/dhh/why-we-re-leaving-the-cloud-654b47e0). *world.hey.com*, October 2022. Archived at [perma.cc/82E6-UJ65](https://perma.cc/82E6-UJ65)
[^23]: Nima Badizadegan. [Use One Big Server](https://specbranch.com/posts/one-big-server/). *specbranch.com*, August 2022. Archived at [perma.cc/M8NB-95UK](https://perma.cc/M8NB-95UK)
[^24]: Steve Yegge. [Dear Google Cloud: Your Deprecation Policy is Killing You](https://steve-yegge.medium.com/dear-google-cloud-your-deprecation-policy-is-killing-you-ee7525dc05dc). *steve-yegge.medium.com*, August 2020. Archived at [perma.cc/KQP9-SPGU](https://perma.cc/KQP9-SPGU)
[^25]: Alexandre Verbitski, Anurag Gupta, Debanjan Saha, Murali Brahmadesam, Kamal Gupta, Raman Mittal, Sailesh Krishnamurthy, Sandor Maurice, Tengiz Kharatishvili, and Xiaofeng Bao. [Amazon Aurora: Design Considerations for High Throughput Cloud-Native Relational Databases](https://media.amazonwebservices.com/blog/2017/aurora-design-considerations-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1041–1052, May 2017. [doi:10.1145/3035918.3056101](https://doi.org/10.1145/3035918.3056101)
[^26]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047)
[^27]: Midhul Vuppalapati, Justin Miron, Rachit Agarwal, Dan Truong, Ashish Motivala, and Thierry Cruanes. [Building An Elastic Query Engine on Disaggregated Storage](https://www.usenix.org/system/files/nsdi20-paper-vuppalapati.pdf). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020.
[^28]: Nick Van Wiggeren. [The Real Failure Rate of EBS](https://planetscale.com/blog/the-real-fail-rate-of-ebs). *planetscale.com*, March 2025. Archived at [perma.cc/43CR-SAH5](https://perma.cc/43CR-SAH5)
[^29]: Colin Breck. [Predicting the Future of Distributed Systems](https://blog.colinbreck.com/predicting-the-future-of-distributed-systems/). *blog.colinbreck.com*, August 2024. Archived at [perma.cc/K5FC-4XX2](https://perma.cc/K5FC-4XX2)
[^30]: Gwen Shapira. [Compute-Storage Separation Explained](https://www.thenile.dev/blog/storage-compute). *thenile.dev*, January 2023. Archived at [perma.cc/QCV3-XJNZ](https://perma.cc/QCV3-XJNZ)
[^31]: Ravi Murthy and Gurmeet Goindi. [AlloyDB for PostgreSQL under the hood: Intelligent, database-aware storage](https://cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage). *cloud.google.com*, May 2022. Archived at [archive.org](https://web.archive.org/web/20220514021120/https%3A//cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage)
[^32]: Jack Vanlightly. [The Architecture of Serverless Data Systems](https://jack-vanlightly.com/blog/2023/11/14/the-architecture-of-serverless-data-systems). *jack-vanlightly.com*, November 2023. Archived at [perma.cc/UDV4-TNJ5](https://perma.cc/UDV4-TNJ5)
[^33]: Eric Jonas, Johann Schleier-Smith, Vikram Sreekanti, Chia-Che Tsai, Anurag Khandelwal, Qifan Pu, Vaishaal Shankar, Joao Carreira, Karl Krauth, Neeraja Yadwadkar, Joseph E. Gonzalez, Raluca Ada Popa, Ion Stoica, David A. Patterson. [Cloud Programming Simplified: A Berkeley View on Serverless Computing](https://arxiv.org/abs/1902.03383). *arxiv.org*, February 2019.
[^34]: Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy. [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O’Reilly Media, 2016. ISBN: 9781491929124
[^35]: Thomas Limoncelli. [The Time I Stole $10,000 from Bell Labs](https://queue.acm.org/detail.cfm?id=3434773). *ACM Queue*, volume 18, issue 5, November 2020. [doi:10.1145/3434571.3434773](https://doi.org/10.1145/3434571.3434773)
[^36]: Charity Majors. [The Future of Ops Jobs](https://acloudguru.com/blog/engineering/the-future-of-ops-jobs). *acloudguru.com*, August 2020. Archived at [perma.cc/GRU2-CZG3](https://perma.cc/GRU2-CZG3)
[^37]: Boris Cherkasky. [(Over)Pay As You Go for Your Datastore](https://medium.com/riskified-technology/over-pay-as-you-go-for-your-datastore-11a29ae49a8b). *medium.com*, September 2021. Archived at [perma.cc/Q8TV-2AM2](https://perma.cc/Q8TV-2AM2)
[^38]: Shlomi Kushchi. [Serverless Doesn’t Mean DevOpsLess or NoOps](https://thenewstack.io/serverless-doesnt-mean-devopsless-or-noops/). *thenewstack.io*, February 2023. Archived at [perma.cc/3NJR-AYYU](https://perma.cc/3NJR-AYYU)
[^39]: Erik Bernhardsson. [Storm in the stratosphere: how the cloud will be reshuffled](https://erikbern.com/2021/11/30/storm-in-the-stratosphere-how-the-cloud-will-be-reshuffled.html). *erikbern.com*, November 2021. Archived at [perma.cc/SYB2-99P3](https://perma.cc/SYB2-99P3)
[^40]: Benn Stancil. [The data OS](https://benn.substack.com/p/the-data-os). *benn.substack.com*, September 2021. Archived at [perma.cc/WQ43-FHS6](https://perma.cc/WQ43-FHS6)
[^41]: Maria Korolov. [Data residency laws pushing companies toward residency as a service](https://www.csoonline.com/article/3647761/data-residency-laws-pushing-companies-toward-residency-as-a-service.html). *csoonline.com*, January 2022. Archived at [perma.cc/CHE4-XZZ2](https://perma.cc/CHE4-XZZ2)
[^42]: Severin Borenstein. [Can Data Centers Flex Their Power Demand?](https://energyathaas.wordpress.com/2025/04/14/can-data-centers-flex-their-power-demand/) *energyathaas.wordpress.com*, April 2025. Archived at [perma.cc/MUD3-A6FF](https://perma.cc/MUD3-A6FF)
[^43]: Bilge Acun, Benjamin Lee, Fiodar Kazhamiaka, Aditya Sundarrajan, Kiwan Maeng, Manoj Chakkaravarthy, David Brooks, and Carole-Jean Wu. [Carbon Dependencies in Datacenter Design and Management](https://hotcarbon.org/assets/2022/pdf/hotcarbon22-acun.pdf). *ACM SIGENERGY Energy Informatics Review*, volume 3, issue 3, pages 21–26. [doi:10.1145/3630614.3630619](https://doi.org/10.1145/3630614.3630619)
[^44]: Kousik Nath. [These are the numbers every computer engineer should know](https://www.freecodecamp.org/news/must-know-numbers-for-every-computer-engineer/). *freecodecamp.org*, September 2019. Archived at [perma.cc/RW73-36RL](https://perma.cc/RW73-36RL)
[^45]: Joseph M. Hellerstein, Jose Faleiro, Joseph E. Gonzalez, Johann Schleier-Smith, Vikram Sreekanti, Alexey Tumanov, and Chenggang Wu. [Serverless Computing: One Step Forward, Two Steps Back](https://arxiv.org/abs/1812.03651). At *Conference on Innovative Data Systems Research* (CIDR), January 2019.
[^46]: Frank McSherry, Michael Isard, and Derek G. Murray. [Scalability! But at What COST?](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-mcsherry.pdf) At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^47]: Cindy Sridharan. *[Distributed Systems Observability: A Guide to Building Robust Systems](https://unlimited.humio.com/rs/756-LMY-106/images/Distributed-Systems-Observability-eBook.pdf)*. Report, O’Reilly Media, May 2018. Archived at [perma.cc/M6JL-XKCM](https://perma.cc/M6JL-XKCM)
[^48]: Charity Majors. [Observability — A 3-Year Retrospective](https://thenewstack.io/observability-a-3-year-retrospective/). *thenewstack.io*, August 2019. Archived at [perma.cc/CG62-TJWL](https://perma.cc/CG62-TJWL)
[^49]: Benjamin H. Sigelman, Luiz André Barroso, Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag. [Dapper, a Large-Scale Distributed Systems Tracing Infrastructure](https://research.google/pubs/pub36356/). Google Technical Report dapper-2010-1, April 2010. Archived at [perma.cc/K7KU-2TMH](https://perma.cc/K7KU-2TMH)
[^50]: Rodrigo Laigner, Yongluan Zhou, Marcos Antonio Vaz Salles, Yijian Liu, and Marcos Kalinowski. [Data management in microservices: State of the practice, challenges, and research directions](https://www.vldb.org/pvldb/vol14/p3348-laigner.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 13, pages 3348–3361, September 2021. [doi:10.14778/3484224.3484232](https://doi.org/10.14778/3484224.3484232)
[^51]: Jordan Tigani. [Big Data is Dead](https://motherduck.com/blog/big-data-is-dead/). *motherduck.com*, February 2023. Archived at [perma.cc/HT4Q-K77U](https://perma.cc/HT4Q-K77U)
[^52]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O’Reilly Media, 2021. ISBN: 9781492034025
[^53]: Chris Richardson. [Microservices: Decomposing Applications for Deployability and Scalability](https://www.infoq.com/articles/microservices-intro/). *infoq.com*, May 2014. Archived at [perma.cc/CKN4-YEQ2](https://perma.cc/CKN4-YEQ2)
[^54]: Mohammad Shahrad, Rodrigo Fonseca, Íñigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, Ricardo Bianchini. [Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider](https://www.usenix.org/system/files/atc20-shahrad.pdf). At *USENIX Annual Technical Conference* (ATC), July 2020.
[^55]: Luiz André Barroso, Urs Hölzle, and Parthasarathy Ranganathan. [The Datacenter as a Computer: Designing Warehouse-Scale Machines](https://www.morganclaypool.com/doi/10.2200/S00874ED3V01Y201809CAC046), third edition. Morgan & Claypool Synthesis Lectures on Computer Architecture, October 2018. [doi:10.2200/S00874ED3V01Y201809CAC046](https://doi.org/10.2200/S00874ED3V01Y201809CAC046)
[^56]: David Fiala, Frank Mueller, Christian Engelmann, Rolf Riesen, Kurt Ferreira, and Ron Brightwell. [Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing](https://arcb.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/sc12.pdf). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2012. [doi:10.1109/SC.2012.49](https://doi.org/10.1109/SC.2012.49)
[^57]: Anna Kornfeld Simpson, Adriana Szekeres, Jacob Nelson, and Irene Zhang. [Securing RDMA for High-Performance Datacenter Storage Systems](https://www.usenix.org/conference/hotcloud20/presentation/kornfeld-simpson). At *12th USENIX Workshop on Hot Topics in Cloud Computing* (HotCloud), July 2020.
[^58]: Arjun Singh, Joon Ong, Amit Agarwal, Glen Anderson, Ashby Armistead, Roy Bannon, Seb Boving, Gaurav Desai, Bob Felderman, Paulie Germano, Anand Kanagala, Jeff Provost, Jason Simmons, Eiichi Tanda, Jim Wanderer, Urs Hölzle, Stephen Stuart, and Amin Vahdat. [Jupiter Rising: A Decade of Clos Topologies and Centralized Control in Google’s Datacenter Network](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p183.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2785956.2787508](https://doi.org/10.1145/2785956.2787508)
[^59]: Glenn K. Lockwood. [Hadoop’s Uncomfortable Fit in HPC](https://blog.glennklockwood.com/2014/05/hadoops-uncomfortable-fit-in-hpc.html). *glennklockwood.blogspot.co.uk*, May 2014. Archived at [perma.cc/S8XX-Y67B](https://perma.cc/S8XX-Y67B)
[^60]: Cathy O’Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 9780553418811
[^61]: Supreeth Shastri, Vinay Banakar, Melissa Wasserman, Arun Kumar, and Vijay Chidambaram. [Understanding and Benchmarking the Impact of GDPR on Database Systems](https://www.vldb.org/pvldb/vol13/p1064-shastri.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 7, pages 1064–1077, March 2020. [doi:10.14778/3384345.3384354](https://doi.org/10.14778/3384345.3384354)
[^62]: Martin Fowler. [Datensparsamkeit](https://www.martinfowler.com/bliki/Datensparsamkeit.html). *martinfowler.com*, December 2013. Archived at [perma.cc/R9QX-CME6](https://perma.cc/R9QX-CME6)
[^63]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 (General Data Protection Regulation)](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&from=EN). *Official Journal of the European Union* L 119/1, May 2016.

================================================
FILE: content/tw/ch10.md
================================================
---
title: "10. 一致性與共識"
weight: 210
breadcrumbs: false
---

<a id="ch_consistency"></a>

![](/map/ch09.png)

> *一句古老的格言告誡說："千萬不要帶著兩塊計時器出海；要麼帶一塊，要麼帶三塊。"*
>
> 弗雷德里克·P·布魯克斯，《人月神話：軟體工程隨筆》（1995）

正如在 [第九章](/tw/ch9) 中討論的，分散式系統中會出現許多問題。如果我們希望服務在出現這些問題時仍能正確工作，就需要找到容錯的方法。

我們擁有的最佳容錯工具之一是 *複製*。然而，正如我們在 [第六章](/tw/ch6) 中看到的，在多個副本上擁有多份資料副本會帶來不一致的風險。讀取可能由一個非最新的副本處理，從而產生過時的結果。如果多個副本可以接受寫入，我們必須處理在不同副本上併發寫入的值之間的衝突。從高層次來看，處理這些問題有兩種相互競爭的理念：

最終一致性
: 在這種理念中，系統被複製這一事實對應用程式是可見的，作為應用程式開發者，你需要處理可能出現的不一致和衝突。這種方法通常用於多主複製（見 ["多主複製"](/tw/ch6#sec_replication_multi_leader)）和無主複製（見 ["無主複製"](/tw/ch6#sec_replication_leaderless)）的系統中。

強一致性
: 這種理念認為應用程式不應該擔心複製的內部細節，系統應該表現得就像單節點一樣。這種方法的優點是對你（應用程式開發者）來說更簡單。缺點是更強的一致性會帶來效能成本，並且某些最終一致系統能夠容忍的故障會導致強一致系統出現中斷。

一如既往，哪種方法更好取決於你的應用程式。如果你有一個應用程式，使用者可以在離線狀態下對資料進行更改，那麼最終一致性是不可避免的，如 ["同步引擎與本地優先軟體"](/tw/ch6#sec_replication_offline_clients) 中所討論的。然而，最終一致性對應用程式來說也可能很難處理。如果你的副本位於具有快速、可靠通訊的資料中心，那麼強一致性通常是合適的，因為其成本是可以接受的。

在本章中，我們將深入探討強一致性方法，關注三個領域：

1. 一個挑戰是"強一致性"相當模糊，因此我們將制定一個更精確的定義，明確我們想要實現什麼：*線性一致性*。
2. 我們將研究生成 ID 和時間戳的問題。這可能聽起來與一致性無關，但實際上密切相關。
3. 我們將探討分散式系統如何在保持容錯的同時實現線性一致性；答案是 *共識* 演算法。

在此過程中，我們將看到分散式系統中什麼是可能的，什麼是不可能的，存在一些基本限制。

本章的主題以難以正確實現而著稱；構建在沒有故障時表現良好，但在面對設計者沒有考慮到的不幸故障組合時完全崩潰的系統非常容易。已經發展了大量理論來幫助我們思考這些邊界情況，這使我們能夠構建可以穩健地容忍故障的系統。

本章只會觸及表面：我們將堅持非正式的直覺，避免演算法細節、形式化模型和證明。如果你想在共識系統和類似基礎設施上進行認真的工作，你需要更深入地研究理論，才有機會讓你的系統穩健。與往常一樣，本章中的文獻參考提供了一些初步的指引。


## 線性一致性 {#sec_consistency_linearizability}

如果你希望複製的資料庫儘可能簡單易用，你應該讓它表現得就像根本沒有複製一樣。然後使用者就不必擔心複製延遲、衝突和其他不一致性。這將給我們帶來容錯的優勢，但不會因為必須考慮多個副本而帶來複雜性。

這就是 *線性一致性* [^1] 背後的想法（也稱為 *原子一致性* [^2]、*強一致性*、*即時一致性* 或 *外部一致性* [^3]）。線性一致性的確切定義相當微妙，我們將在本節的其餘部分探討它。但基本思想是讓系統看起來好像只有一份資料副本，並且對它的所有操作都是原子的。有了這個保證，即使實際上可能有多個副本，應用程式也不需要擔心它們。

在線性一致系統中，一旦一個客戶端成功完成寫入，所有從資料庫讀取的客戶端都必須能夠看到剛剛寫入的值。維護單一資料副本的假象，意味著要保證讀取到的是最新值，而不是來自過時的快取或副本。換句話說，線性一致性是一種 *新鮮度保證*。為了闡明這個想法，讓我們看一個非線性一致系統的例子。

{{< figure src="/fig/ddia_1001.png" id="fig_consistency_linearizability_0" caption="圖 10-1. 這個系統不是線性一致的：Bryce 在聽到 Aaliyah 宣佈比分之後查詢，卻得到了過時的結果。" class="w-full my-4" >}}

[圖 10-1](#fig_consistency_linearizability_0) 顯示了一個非線性一致的體育網站示例 [^4]。Aaliyah 和 Bryce 坐在同一個房間裡，都在檢視手機，想要了解他們最喜歡的球隊比賽的結果。就在最終比分宣佈後，Aaliyah 重新整理了頁面，看到了獲勝者的公告，並興奮地告訴了 Bryce。Bryce 懷疑地在自己的手機上點選了 *重新整理*，但他的請求傳送到了一個滯後的資料庫副本，因此他的手機顯示比賽仍在進行中。

如果 Aaliyah 和 Bryce 同時點選重新整理，他們得到兩個不同的查詢結果就不會那麼令人驚訝了，因為他們不知道他們各自的請求在伺服器上被處理的確切時間。然而，Bryce 知道他是在聽到 Aaliyah 宣佈最終比分 *之後* 點選重新整理按鈕（發起查詢）的，因此他期望他的查詢結果至少與 Aaliyah 的一樣新。他的查詢返回過時結果這一事實違反了線性一致性。

### 什麼使系統具有線性一致性？ {#sec_consistency_lin_definition}

為了更好地理解線性一致性，讓我們再看一些例子。[圖 10-2](#fig_consistency_linearizability_1) 顯示了三個客戶端在線性一致資料庫中併發讀取和寫入同一個物件 *x*。在分散式系統理論中，*x* 被稱為 *暫存器*——在實踐中，它可能是（例如）鍵值儲存中的一個鍵、關係資料庫中的一行，或者文件資料庫中的一個文件。

{{< figure src="/fig/ddia_1002.png" id="fig_consistency_linearizability_1" caption="圖 10-2. 如果讀取請求與寫入請求併發，它可能返回舊值，也可能返回新值。" class="w-full my-4" >}}


為簡單起見，[圖 10-2](#fig_consistency_linearizability_1) 僅顯示了從客戶端角度看的請求，而不是資料庫的內部。每個條形代表客戶端發出的請求，條形的開始是傳送請求的時間，條形的結束是客戶端收到響應的時間。由於網路延遲可變，客戶端不知道資料庫確切何時處理了它的請求——它只知道必須在客戶端傳送請求和接收響應之間的某個時間發生。

在這個例子中，暫存器有兩種型別的操作：

* *read*(*x*) ⇒ *v* 表示客戶端請求讀取暫存器 *x* 的值，資料庫返回值 *v*。
* *write*(*x*, *v*) ⇒ *r* 表示客戶端請求將暫存器 *x* 設定為值 *v*，資料庫返回響應 *r*（可能是 *ok* 或 *error*）。

在 [圖 10-2](#fig_consistency_linearizability_1) 中，*x* 的值最初為 0，客戶端 C 執行寫入請求將其設定為 1。在此期間，客戶端 A 和 B 反覆輪詢資料庫以讀取最新值。A 和 B 的讀取請求可能得到什麼響應？

* 客戶端 A 的第一個讀取操作在寫入開始之前完成，因此它必須明確返回舊值 0。
* 客戶端 A 的最後一次讀取在寫入完成後開始，因此如果資料庫是線性一致的，它必須明確返回新值 1，因為讀取必須在寫入之後被處理。
* 與寫入操作在時間上重疊的任何讀取操作可能返回 0 或 1，因為我們不知道在讀取操作被處理時寫入是否已經生效。這些操作與寫入是 *併發* 的。

然而，這還不足以完全描述線性一致性：如果與寫入併發的讀取可以返回舊值或新值，那麼讀者可能會在寫入進行時多次看到值在舊值和新值之間來回翻轉。這不是我們對模擬"單一資料副本"的系統所期望的。

為了使系統線性一致，我們需要新增另一個約束，如 [圖 10-3](#fig_consistency_linearizability_2) 所示。

{{< figure src="/fig/ddia_1003.png" id="fig_consistency_linearizability_2" caption="圖 10-3. 一旦任何一次讀取返回了新值，所有後續讀取（無論來自同一客戶端還是其他客戶端）也必須返回新值。" class="w-full my-4" >}}


在線性一致系統中，我們想象必須有某個時間點（在寫入操作的開始和結束之間），*x* 的值從 0 原子地翻轉到 1。因此，如果一個客戶端的讀取返回新值 1，所有後續讀取也必須返回新值，即使寫入操作尚未完成。

這種時序依賴關係在 [圖 10-3](#fig_consistency_linearizability_2) 中用箭頭表示。客戶端 A 是第一個讀取新值 1 的。就在 A 的讀取返回後，B 開始新的讀取。由於 B 的讀取嚴格發生在 A 的讀取之後，它也必須返回 1，即使 C 的寫入仍在進行中。（這與 [圖 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情況相同：在 Aaliyah 讀取新值後，Bryce 也期望讀取新值。）

我們可以進一步細化這個時序圖，以視覺化每個操作在某個時間點原子地生效 [^5]，就像 [圖 10-4](#fig_consistency_linearizability_3) 中顯示的更複雜的例子。在這個例子中，除了 *read* 和 *write* 之外，我們添加了第三種操作型別：

* *cas*(*x*, *v*old, *v*new) ⇒ *r* 表示客戶端請求一個原子 *比較並設定* 操作（見 ["條件寫入（比較並設定）"](/tw/ch8#sec_transactions_compare_and_set)）。如果暫存器 *x* 的當前值等於 *v*old，它應該原子地設定為 *v*new。如果 *x* 的值與 *v*old 不同，則操作應該保持暫存器不變並返回錯誤。*r* 是資料庫的響應（*ok* 或 *error*）。

[圖 10-4](#fig_consistency_linearizability_3) 中的每個操作都用一條垂直線（在每個操作的條形內）標記，表示我們認為操作執行的時間。這些標記按順序連線起來，結果必須是暫存器的有效讀寫序列（每次讀取必須返回最近寫入設定的值）。

線性一致性的要求是連線操作標記的線始終向前移動（從左到右），永不後退。這個要求確保了我們之前討論的新鮮度保證：一旦寫入或讀取了新值，所有後續讀取都會看到寫入的值，直到它再次被覆蓋。

{{< figure src="/fig/ddia_1004.png" id="fig_consistency_linearizability_3" caption="圖 10-4. 視覺化讀取和寫入看起來生效的時間點。B 的最後一次讀取不是線性一致的。" class="w-full my-4" >}}


[圖 10-4](#fig_consistency_linearizability_3) 中有一些有趣的細節需要指出：

* 首先客戶端 B 傳送了讀取 *x* 的請求，然後客戶端 D 傳送了將 *x* 設定為 0 的請求，然後客戶端 A 傳送了將 *x* 設定為 1 的請求。然而，返回給 B 的讀取值是 1（A 寫入的值）。這是可以的：這意味著資料庫首先處理了 D 的寫入，然後是 A 的寫入，最後是 B 的讀取。雖然這不是傳送請求的順序，但這是一個可接受的順序，因為這三個請求是併發的。也許 B 的讀取請求在網路中稍有延遲，因此它在兩次寫入之後才到達資料庫。
* 客戶端 B 的讀取在客戶端 A 收到資料庫的響應之前返回了 1，表示值 1 的寫入成功。這也是可以的：這只是意味著從資料庫到客戶端 A 的 *ok* 響應在網路中稍有延遲。
* 這個模型不假設任何事務隔離：另一個客戶端可以隨時更改值。例如，C 首先讀取 1，然後讀取 2，因為該值在兩次讀取之間被 B 更改了。原子比較並設定（*cas*）操作可用於檢查值是否未被另一個客戶端併發更改：B 和 C 的 *cas* 請求成功，但 D 的 *cas* 請求失敗（到資料庫處理它時，*x* 的值不再是 0）。
* 客戶端 B 的最後一次讀取（在陰影條中）不是線性一致的。該操作與 C 的 *cas* 寫入併發，後者將 *x* 從 2 更新到 4。在沒有其他請求的情況下，B 的讀取返回 2 是可以的。然而，客戶端 A 在 B 的讀取開始之前已經讀取了新值 4，因此 B 不允許讀取比 A 更舊的值。同樣，這與 [圖 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情況相同。

這就是線性一致性背後的直覺；形式化定義 [^1] 更精確地描述了它。可以（儘管計算成本高昂）透過記錄所有請求和響應的時序，並檢查它們是否可以排列成有效的順序序列來測試系統的行為是否線性一致 [^6] [^7]。

就像除了可序列化之外還有各種弱隔離級別用於事務（見 ["弱隔離級別"](/tw/ch8#sec_transactions_isolation_levels)），除了線性一致性之外，複製系統也有各種較弱的一致性模型 [^8]。實際上，我們在 ["複製延遲問題"](/tw/ch6#sec_replication_lag) 中看到的 *寫後讀*、*單調讀* 和 *一致性字首讀* 屬性就是這種較弱一致性模型的例子。線性一致性保證所有這些較弱的屬性，以及更多。在本章中，我們將重點關注線性一致性，它是最常用的最強一致性模型。


--------

> [!TIP] 線性一致性與可序列化

線性一致性很容易與可序列化混淆（見 ["可序列化"](/tw/ch8#sec_transactions_serializability)），因為這兩個詞似乎都意味著類似"可以按順序排列"的東西。然而，它們是完全不同的保證，區分它們很重要：

可序列化
: 可序列化是事務的隔離屬性，其中每個事務可能讀取和寫入 *多個物件*（行、文件、記錄）。它保證事務的行為與它們按 *某種* 序列順序執行時相同：也就是說，就好像你首先執行一個事務的所有操作，然後執行另一個事務的所有操作，依此類推，而不交錯它們。該序列順序可以與事務實際執行的順序不同 [^9]。

線性一致性
: 線性一致性是對暫存器（*單個物件*）的讀寫保證。它不將操作分組到事務中，因此它不能防止涉及多個物件的問題，如寫偏差（見 ["寫偏差和幻讀"](/tw/ch8#sec_transactions_write_skew)）。然而，線性一致性是一個 *新鮮度* 保證：它要求如果一個操作在另一個操作開始之前完成，那麼後一個操作必須觀察到至少與前一個操作一樣新的狀態。可序列化沒有這個要求：例如，可序列化允許過時讀取 [^10]。

（*順序一致性* 又是另外一回事 [^8]，但我們不會在這裡討論它。）

資料庫可能同時提供可序列化和線性一致性，這種組合稱為 *嚴格可序列化* 或 *強單副本可序列化*（*strong-1SR*）[^11] [^12]。單節點資料庫通常是線性一致的。對於使用樂觀方法（如可序列化快照隔離）的分散式資料庫（見 ["可序列化快照隔離（SSI）"](/tw/ch8#sec_transactions_ssi)），情況更加複雜：例如，CockroachDB 提供可序列化和對讀取的一些新鮮度保證，但不是嚴格可序列化 [^13]，因為這需要事務之間進行昂貴的協調 [^14]。

也可以將較弱的隔離級別與線性一致性結合，或將較弱的一致性模型與可序列化結合；實際上，一致性模型和隔離級別可以在很大程度上相互獨立地選擇 [^15] [^16]。

--------

### 依賴線性一致性 {#sec_consistency_linearizability_usage}

在什麼情況下線性一致性有用？檢視體育比賽的最終比分也許是一個無關緊要的例子：過時幾秒鐘的結果在這種情況下不太可能造成任何實際傷害。然而，有幾個領域中線性一致性是使系統正確工作的重要要求。

#### 鎖定與領導者選舉 {#locking-and-leader-election}

使用單主複製的系統需要確保確實只有一個主節點，而不是多個（腦裂）。選舉領導者的一種方法是使用租約：每個啟動的節點都嘗試獲取租約，成功的節點成為領導者 [^17]。無論這種機制如何實現，它都必須是線性一致的：兩個不同的節點不應該能夠同時獲取租約。

像 Apache ZooKeeper [^18] 和 etcd 這樣的協調服務通常用於實現分散式租約和領導者選舉。它們使用共識演算法以容錯的方式實現線性一致的操作（我們將在本章後面討論這些演算法）。要正確實現租約和領導者選舉，仍然有許多微妙的細節需要處理（例如，參見 ["分散式鎖和租約"](/tw/ch9#sec_distributed_lock_fencing) 中的柵欄問題），像 Apache Curator 這樣的庫透過在 ZooKeeper 之上提供更高級別的配方來提供幫助。然而，線性一致的儲存服務是這些協調任務的基礎。

--------

> [!NOTE]
> 嚴格來說，ZooKeeper 提供線性一致的寫入，但讀取可能是過時的，因為不能保證它們由當前領導者提供 [^18]。etcd 從版本 3 開始預設提供線性一致的讀取。

--------


分散式鎖也在一些分散式資料庫中以更細粒度的級別使用，例如 Oracle Real Application Clusters (RAC) [^19]。RAC 對每個磁碟頁使用一個鎖，多個節點共享對同一磁碟儲存系統的訪問。由於這些線性一致的鎖位於事務執行的關鍵路徑上，RAC 部署通常具有專用的叢集互連網路用於資料庫節點之間的通訊。

#### 約束與唯一性保證 {#sec_consistency_uniqueness}

唯一性約束在資料庫中很常見：例如，使用者名稱或電子郵件地址必須唯一標識一個使用者，在檔案儲存服務中不能有兩個具有相同路徑和檔名的檔案。如果你想在資料寫入時強制執行此約束（這樣如果兩個人同時嘗試建立具有相同名稱的使用者或檔案，其中一個將返回錯誤），你需要線性一致性。

這種情況實際上類似於鎖：當用戶註冊你的服務時，你可以認為他們獲取了所選使用者名稱的"鎖"。該操作也非常類似於原子比較並設定，將使用者名稱設定為宣告它的使用者的 ID，前提是使用者名稱尚未被佔用。

如果你想確保銀行賬戶餘額永遠不會變為負數，或者你不會銷售超過倉庫庫存的物品，或者兩個人不會同時預訂同一航班或劇院的同一座位，也會出現類似的問題。這些約束都要求有一個所有節點都同意的單一最新值（賬戶餘額、庫存水平、座位佔用情況）。

在實際應用中，有時可以接受寬鬆地對待這些約束（例如，如果航班超售，你可以將客戶轉移到其他航班，併為不便提供補償）。在這種情況下，可能不需要線性一致性，我們將在 ["時效性與完整性"](/tw/ch13#sec_future_integrity) 中討論這種寬鬆解釋的約束。

然而，硬唯一性約束，例如你通常在關係資料庫中找到的約束，需要線性一致性。其他型別的約束，例如外部索引鍵或屬性約束，可以在沒有線性一致性的情況下實現 [^20]。

#### 跨通道時序依賴 {#cross-channel-timing-dependencies}

注意 [圖 10-1](#fig_consistency_linearizability_0) 中的一個細節：如果 Aaliyah 沒有大聲說出比分，Bryce 就不會知道他的查詢結果是過時的。他只會在幾秒鐘後再次重新整理頁面，最終看到最終比分。線性一致性違規之所以被注意到，只是因為系統中有一個額外的通訊通道（Aaliyah 的聲音到 Bryce 的耳朵）。

類似的情況可能出現在計算機系統中。例如，假設你有一個網站，使用者可以上傳影片，後臺程序將影片轉碼為較低質量，以便在慢速網際網路連線上流式傳輸。該系統的架構和資料流如 [圖 10-5](#fig_consistency_transcoder) 所示。

影片轉碼器需要明確指示執行轉碼作業，此指令透過訊息佇列從 Web 伺服器傳送到轉碼器（見 ["訊息傳遞系統"](/tw/ch12#sec_stream_messaging)）。Web 伺服器不會將整個影片放在佇列中，因為大多數訊息代理都是為小訊息設計的，而影片可能有許多兆位元組大小。相反，影片首先寫入檔案儲存服務，寫入完成後，轉碼指令被放入佇列。

{{< figure src="/fig/ddia_1005.png" id="fig_consistency_transcoder" caption="圖 10-5. 一個非線性一致的系統：Web 伺服器和影片轉碼器同時透過檔案儲存和訊息佇列通訊，因此可能出現競態條件。" class="w-full my-4" >}}


如果檔案儲存服務是線性一致的，那麼這個系統應該工作正常。如果它不是線性一致的，就存在競態條件的風險：訊息佇列（[圖 10-5](#fig_consistency_transcoder) 中的步驟 3 和 4）可能比儲存服務內部的複製更快。在這種情況下，當轉碼器獲取原始影片（步驟 5）時，它可能會看到檔案的舊版本，或者根本看不到任何內容。如果它處理影片的舊版本，檔案儲存中的原始影片和轉碼影片將永久不一致。

這個問題的出現是因為 Web 伺服器和轉碼器之間有兩個不同的通訊通道：檔案儲存和訊息佇列。如果沒有線性一致性的新鮮度保證，這兩個通道之間可能存在競態條件。這種情況類似於 [圖 10-1](#fig_consistency_linearizability_0)，其中也存在兩個通訊通道之間的競態條件：資料庫複製和 Aaliyah 嘴巴到 Bryce 耳朵之間的現實音訊通道。

如果你有一個可以接收推送通知的移動應用程式，並且應用程式在收到推送通知時從伺服器獲取一些資料，就可能發生類似的競態條件。如果獲取資料的請求可能被傳送到滯後的副本，就可能出現這樣的情況：推送通知很快送達，但隨後的資料獲取卻看不到推送通知所涉及的資料。

線性一致性不是避免這種競態條件的唯一方法，但它是最容易理解的。如果你控制額外的通訊通道（如訊息佇列的情況，但不是 Aaliyah 和 Bryce 的情況），你可以使用類似於我們在 ["讀己之寫"](/tw/ch6#sec_replication_ryw) 中討論的替代方法，但代價是額外的複雜性。


### 實現線性一致性系統 {#sec_consistency_implementing_linearizable}

現在我們已經看了線性一致性有用的幾個例子，讓我們思考如何實現一個提供線性一致語義的系統。

由於線性一致性本質上意味著"表現得好像只有一份資料副本，並且對它的所有操作都是原子的"，最簡單的答案是真的只使用一份資料副本。然而，這種方法將無法容忍故障：如果持有該副本的節點失敗，資料將丟失，或者至少在節點重新啟動之前無法訪問。

讓我們重新審視 [第六章](/tw/ch6) 中的複製方法，並比較它們是否可以實現線性一致：

單主複製（可能線性一致）
: 在單主複製系統中，主節點擁有用於寫入的資料主副本，備庫在其他節點上維護資料副本。只要你在主節點上執行所有讀寫操作，它們很可能是線性一致的。然而，這假設你確定知道誰是主節點。如 ["分散式鎖和租約"](/tw/ch9#sec_distributed_lock_fencing) 中所討論的，一個節點很可能認為自己是主節點，而實際上並不是。如果這個“妄想中的主節點”繼續處理請求，很可能會違反線性一致性 [^21]。使用非同步複製時，故障切換甚至可能丟失已提交的寫入，這違反了永續性和線性一致性。

    對單主資料庫進行分片，每個分片有一個單獨的主節點，不會影響線性一致性，因為它只是單物件保證。跨分片事務是另一回事（見 ["分散式事務"](/tw/ch8#sec_transactions_distributed)）。

共識演算法（可能線性一致）
: 一些共識演算法本質上是帶有自動領導者選舉和故障切換的單主複製。它們經過精心設計以防止腦裂，使它們能夠安全地實現線性一致的儲存。例如，ZooKeeper 使用 Zab 共識演算法 [^22]，etcd 使用 Raft [^23]。然而，僅僅因為系統使用共識並不能保證其上的所有操作都是線性一致的：如果它允許在不檢查節點是否仍然是領導者的情況下在節點上讀取，那麼在剛剛選出新領導者時，讀取的結果可能是過時的。

多主複製（非線性一致）
: 具有多主複製的系統通常不是線性一致的，因為它們在多個節點上併發處理寫入，並將它們非同步複製到其他節點。因此，它們可能產生需要解決的衝突寫入（見 ["處理衝突寫入"](/tw/ch6#sec_replication_write_conflicts)）。

無主複製（可能非線性一致）
: 對於具有無主複製的系統（Dynamo 風格；見 ["無主複製"](/tw/ch6#sec_replication_leaderless)），人們有時聲稱可以透過要求仲裁讀寫（*w* + *r* > *n*）來獲得"強一致性"。根據確切的演算法，以及你如何定義強一致性，這並不完全正確。

    基於日曆時鐘的"最後寫入獲勝"衝突解決方法（例如，在 Cassandra 和 ScyllaDB 中）幾乎肯定是非線性一致的，因為時鐘時間戳由於時鐘偏差而無法保證與實際事件順序一致（見 ["依賴同步時鐘"](/tw/ch9#sec_distributed_clocks_relying)）。即使使用仲裁，也可能出現非線性一致的行為，如下一節所示。

#### 線性一致性與仲裁 {#sec_consistency_quorum_linearizable}

直觀地說，在 Dynamo 風格的模型中，仲裁讀寫似乎應該是線性一致的。然而，當我們有可變的網路延遲時，可能會出現競態條件，如 [圖 10-6](#fig_consistency_leaderless) 所示。

{{< figure src="/fig/ddia_1006.png" id="fig_consistency_leaderless" caption="圖 10-6. 如果網路延遲是可變的，仲裁不足以確保線性一致性。" class="w-full my-4" >}}


在 [圖 10-6](#fig_consistency_leaderless) 中，*x* 的初始值為 0，寫入客戶端透過向所有三個副本傳送寫入（*n* = 3，*w* = 3）將 *x* 更新為 1。同時，客戶端 A 從兩個節點的仲裁（*r* = 2）讀取，並在其中一個節點上看到新值 1。同樣與寫入併發，客戶端 B 從另外兩個節點組成的仲裁讀取，並從兩者都獲得舊值 0。

仲裁條件得到滿足（*w* + *r* > *n*），但這種執行仍然不是線性一致的：B 的請求在 A 的請求完成後開始，但 B 返回舊值而 A 返回新值。（這又是 [圖 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情況。）

可以使 Dynamo 風格的仲裁線性一致，但代價是降低效能：讀者必須同步執行讀修復（見 ["追趕錯過的寫入"](/tw/ch6#sec_replication_read_repair)），然後才能將結果返回給應用程式 [^24]。此外，在寫入之前，寫入者必須讀取節點仲裁的最新狀態以獲取任何先前寫入的最新時間戳，並確保新寫入具有更大的時間戳 [^25] [^26]。然而，Riak 由於效能損失而不執行同步讀修復。Cassandra 確實等待仲裁讀取時的讀修復完成 [^27]，但由於它使用日曆時鐘作為時間戳而失去了線性一致性。

此外，只有線性一致的讀寫操作可以以這種方式實現；線性一致的比較並設定操作不能，因為它需要共識演算法 [^28]。

總之，最安全的假設是，具有 Dynamo 風格複製的無主系統不提供線性一致性，即使使用仲裁讀寫。

### 線性一致性的代價 {#sec_linearizability_cost}

由於某些複製方法可以提供線性一致性而其他方法不能，因此更深入地探討線性一致性的利弊是很有趣的。

我們已經在 [第六章](/tw/ch6) 中討論了不同複製方法的一些用例；例如，我們看到多主複製通常是多區域複製的良好選擇（見 ["地理分散式操作"](/tw/ch6#sec_replication_multi_dc)）。[圖 10-7](#fig_consistency_cap_availability) 展示了這種部署的示例。

{{< figure src="/fig/ddia_1007.png" id="fig_consistency_cap_availability" caption="圖 10-7. 如果客戶端由於網路分割槽而無法聯絡足夠的副本，它們就無法處理寫入。" class="w-full my-4" >}}


考慮如果兩個區域之間出現網路中斷會發生什麼。讓我們假設每個區域內的網路正常工作，客戶端可以到達其本地區域，但這些區域之間無法相互連線。這被稱為 *網路分割槽*。

使用多主資料庫，每個區域可以繼續正常執行：由於來自一個區域的寫入被非同步複製到另一個區域，寫入只是排隊並在網路連線恢復時交換。

另一方面，如果使用單主複製，那麼主節點必須在其中一個區域。任何寫入和任何線性一致的讀取都必須傳送到主節點。因此，對於連線到備庫所在區域的任何客戶端，這些讀寫請求都必須透過網路同步傳送到主節點區域。

如果在單主設定中區域之間的網路中斷，連線到備庫區域的客戶端無法聯絡主節點，因此它們既不能對資料庫進行任何寫入，也不能進行任何線性一致的讀取。它們仍然可以從備庫讀取，但這些讀取可能是過時的（非線性一致）。如果應用程式需要線性一致的讀寫，網路中斷會導致應用程式在無法聯絡主節點的區域中變得不可用。

如果客戶端可以直接連線到主節點區域，這不是問題，因為應用程式在那裡繼續正常工作。但只能訪問備庫區域的客戶端將在網路鏈路修復之前遇到中斷。

#### CAP 定理 {#the-cap-theorem}

這個問題不僅僅是單主和多主複製的結果：任何線性一致的資料庫都有這個問題，無論它如何實現。這個問題也不特定於多區域部署，而是可以發生在任何不可靠的網路上，即使在一個區域內。權衡如下：

* 如果你的應用程式 *需要* 線性一致性，並且某些副本由於網路問題與其他副本斷開連線，那麼某些副本在斷開連線時無法處理請求：它們必須等待網路問題修復，或者返回錯誤（無論哪種方式，它們都變得 *不可用*）。這種選擇有時被稱為 *CP*（在網路分割槽下一致）。
* 如果你的應用程式 *不需要* 線性一致性，那麼它可以以一種方式編寫，使每個副本可以獨立處理請求，即使它與其他副本斷開連線（例如，多主）。在這種情況下，應用程式可以在面對網路問題時保持 *可用*，但其行為不是線性一致的。這種選擇被稱為 *AP*（在網路分割槽下可用）。

因此，不需要線性一致性的應用程式可以更好地容忍網路問題。這種見解通常被稱為 *CAP 定理* [^29] [^30] [^31] [^32]，由 Eric Brewer 在 2000 年命名，儘管這種權衡自 1970 年代以來就為分散式資料庫設計者所知 [^33] [^34] [^35]。

CAP 最初是作為經驗法則提出的，沒有精確的定義，目的是開始關於資料庫中權衡的討論。當時，許多分散式資料庫專注於在具有共享儲存的機器叢集上提供線性一致語義 [^19]，CAP 鼓勵資料庫工程師探索更廣泛的分散式無共享系統設計空間，這些系統更適合實現大規模 Web 服務 [^36]。CAP 在這種文化轉變方面值得稱讚——它幫助觸發了 NoSQL 運動，這是 2000 年代中期左右的一系列新資料庫技術。

> [!TIP] 無用的 CAP 定理

CAP 有時被表述為 *一致性、可用性、分割槽容錯性：從 3 個中選擇 2 個*。不幸的是，這樣表述是誤導性的 [^32]，因為網路分割槽是一種故障，所以它們不是你可以選擇的：無論你喜歡與否，它們都會發生。

當網路正常工作時，系統可以同時提供一致性（線性一致性）和完全可用性。當發生網路故障時，你必須在線性一致性或完全可用性之間進行選擇。因此，CAP 的更好表述方式是 *分割槽時要麼一致要麼可用* [^37]。更可靠的網路需要更少地做出這種選擇，但在某個時候這種選擇是不可避免的。

CP/AP 分類方案還有幾個進一步的缺陷 [^4]。*一致性* 被形式化為線性一致性（定理沒有說任何關於較弱一致性模型的內容），*可用性* 的形式化 [^30] 與該術語的通常含義不匹配 [^38]。許多高可用（容錯）系統實際上不符合 CAP 對可用性的特殊定義。此外，一些系統設計者選擇（有充分理由）既不提供線性一致性也不提供 CAP 定理假設的可用性形式，因此這些系統既不是 CP 也不是 AP [^39] [^40]。

總的來說，關於 CAP 有很多誤解和混淆，它並不能幫助我們更好地理解系統，因此最好避免使用 CAP。

正式定義的 CAP 定理 [^30] 範圍非常狹窄：它只考慮一種一致性模型（即線性一致性）和一種故障（網路分割槽，根據 Google 的資料，這是不到 8% 事件的原因 [^41]）。它沒有說任何關於網路延遲、死節點或其他權衡的內容。因此，儘管 CAP 在歷史上具有影響力，但對於設計系統幾乎沒有實際價值 [^4] [^38]。

已經有努力推廣 CAP。例如，*PACELC 原則* 觀察到系統設計者也可能選擇在網路正常工作時削弱一致性以減少延遲 [^39] [^40] [^42]。因此，在網路分割槽（P）期間，我們需要在可用性（A）和一致性（C）之間進行選擇；否則（E），當沒有分割槽時，我們可能在低延遲（L）和一致性（C）之間進行選擇。然而，這個定義繼承了 CAP 的幾個問題，例如一致性和可用性的反直覺定義。

分散式系統中有許多更有趣的不可能性結果 [^43]，CAP 現在已被更精確的結果所取代 [^44] [^45]，因此它今天主要具有歷史意義。

#### 線性一致性與網路延遲 {#linearizability-and-network-delays}

儘管線性一致性是一個有用的保證，但令人驚訝的是，實際上很少有系統是線性一致的。例如，即使現代多核 CPU 上的 RAM 也不是線性一致的 [^46]：如果在一個 CPU 核心上執行的執行緒寫入記憶體地址，而另一個 CPU 核心上的執行緒隨後讀取相同的地址，不能保證讀取第一個執行緒寫入的值（除非使用 *記憶體屏障* 或 *柵欄* [^47]）。

這種行為的原因是每個 CPU 核心都有自己的記憶體快取和儲存緩衝區。預設情況下，記憶體訪問首先進入快取，任何更改都非同步寫出到主記憶體。由於訪問快取中的資料比訪問主記憶體快得多 [^48]，這個特性對於現代 CPU 的良好效能至關重要。然而，現在有多份資料副本（一份在主記憶體中，可能還有幾份在各種快取中），這些副本是非同步更新的，因此線性一致性丟失了。

為什麼要做出這種權衡？使用 CAP 定理來證明多核記憶體一致性模型是沒有意義的：在一臺計算機內，我們通常假設可靠的通訊，我們不期望一個 CPU 核心在與計算機其餘部分斷開連線的情況下能夠繼續正常執行。放棄線性一致性的原因是 *效能*，而不是容錯 [^39]。

許多選擇不提供線性一致保證的分散式資料庫也是如此：它們這樣做主要是為了提高效能，而不是為了容錯 [^42]。線性一致性很慢——這在任何時候都是真的，不僅在網路故障期間。

我們能否找到更高效的線性一致儲存實現？答案似乎是否定的：Attiya 和 Welch [^49] 證明，如果你想要線性一致性，讀寫請求的響應時間至少與網路中延遲的不確定性成正比。在具有高度可變延遲的網路中，例如大多數計算機網路（見 ["超時和無界延遲"](/tw/ch9#sec_distributed_queueing)），線性一致讀寫的響應時間不可避免地會很高。更快的線性一致性演算法不存在，但較弱的一致性模型可能會快得多，因此這種權衡對於延遲敏感的系統很重要。在 ["時效性與完整性"](/tw/ch13#sec_future_integrity) 中，我們將討論一些在不犧牲正確性的情況下避免線性一致性的方法。


## ID 生成器和邏輯時鐘 {#sec_consistency_logical}

在許多應用程式中，你需要在建立資料庫記錄時為它們分配某種唯一的 ID，這給了你一個可以引用這些記錄的主鍵。在單節點資料庫中，通常使用自增整數，它的優點是只需要 64 位（如果你確定永遠不會有超過 40 億條記錄，甚至可以使用 32 位，但這是有風險的）來儲存。

這種自增 ID 的另一個優點是，ID 的順序告訴你記錄建立的順序。例如，[圖 10-8](#fig_consistency_id_generator) 顯示了一個聊天應用程式，它在釋出聊天訊息時為其分配自增 ID。然後，你可以按 ID 遞增的順序顯示訊息，生成的聊天討論串將有意義：Aaliyah 釋出了一個被分配 ID 1 的問題，而 Bryce 對該問題的回答被分配了一個更大的 ID，即 3。

{{< figure src="/fig/ddia_1008.png" id="fig_consistency_id_generator" caption="圖 10-8. 使用自增 ID 為聊天應用程式中的訊息排序。" class="w-full my-4" >}}


這個單節點 ID 生成器是線性一致系統的另一個例子。每個獲取 ID 的請求都是一個原子地遞增計數器並返回舊計數器值的操作（*獲取並增加* 操作）；線性一致性確保如果 Aaliyah 的訊息釋出在 Bryce 的釋出開始之前完成，那麼 Bryce 的 ID 必須大於 Aaliyah 的。[圖 10-8](#fig_consistency_id_generator) 中 Aaliyah 和 Caleb 的訊息是併發的，因此線性一致性不指定它們的 ID 必須如何排序，只要它們是唯一的。

記憶體中的單節點 ID 生成器很容易實現：你可以使用 CPU 提供的原子遞增指令，它允許多個執行緒安全地遞增同一個計數器。使計數器持久化需要更多的努力，這樣節點就可以崩潰並重新啟動而不重置計數器值，這將導致重複的 ID。但真正的問題是：

* 單節點 ID 生成器不具容錯性，因為該節點是單點故障。
* 如果你想在另一個區域建立記錄，速度會很慢，因為你可能必須往返地球的另一端才能獲得 ID。
* 如果你有高寫入吞吐量，該單個節點可能成為瓶頸。

你可以考慮各種 ID 生成器的替代選項：

分片 ID 分配
: 你可以有多個分配 ID 的節點——例如，一個只生成偶數，一個只生成奇數。一般來說，你可以在 ID 中保留一些位來包含分片編號。這些 ID 仍然緊湊，但你失去了排序屬性：例如，如果你有 ID 為 16 和 17 的聊天訊息，你不知道訊息 16 是否實際上是先發送的，因為 ID 是由不同的節點分配的，其中一個節點可能領先於另一個。

預分配 ID 塊
: 單節點 ID 生成器可以不逐個分發 ID，而是分發 ID 塊。例如，節點 A 可能宣告從 1 到 1,000 的 ID 塊，節點 B 可能宣告從 1,001 到 2,000 的塊。然後每個節點可以獨立地從其塊中分發 ID，並在其序列號供應開始不足時從單節點 ID 生成器請求新塊。但是，這種方案也不能確保正確的排序：如果 ID 是由不同的節點分配的，可能會發生這樣的情況，一條訊息被分配了 1,001 到 2,000 範圍內的 ID，而後來的訊息被分配了 1 到 1,000 範圍內的 ID。

隨機 UUID
: 你可以使用 *通用唯一識別符號*（UUID），也稱為 *全域性唯一識別符號*（GUID）。它們的一大優點是可以在任何節點上本地生成，無需通訊，但它們需要更多空間（128 位）。有幾種不同版本的 UUID；最簡單的是版本 4，它本質上是一個如此長的隨機數，以至於兩個節點選擇相同的可能性非常小。不幸的是，這些 ID 的順序也是隨機的，因此比較兩個 ID 不會告訴你哪個更新。

時鐘時間戳使其唯一
: 如果你的節點的日曆時鐘使用 NTP 保持大致正確，你可以透過將該時鐘的時間戳放在最高有效位中，並用額外資訊填充剩餘位來生成 ID；這些額外資訊確保即使時間戳不唯一，ID 也是唯一的——例如，分片編號和每分片遞增序列號，或長隨機值。這種方法用於版本 7 UUID [^50]、Twitter 的 Snowflake [^51]、ULID [^52]、Hazelcast 的 Flake ID 生成器、MongoDB ObjectID 和許多類似方案 [^50]。你可以在應用程式程式碼或資料庫中實現這些 ID 生成器 [^53]。

所有這些方案都生成唯一的 ID（至少有足夠高的機率，使衝突極其罕見），但它們對 ID 的排序保證比單節點自增方案弱得多。

如 ["為事件排序的時間戳"](/tw/ch9#sec_distributed_lww) 中所討論的，時鐘時間戳最多只能提供近似排序：如果較早的寫入從稍快的時鐘獲得時間戳，而較晚寫入的時間戳來自稍慢的時鐘，則時間戳順序可能與事件實際發生的順序不一致。由於使用非單調時鐘而導致的時鐘跳躍，即使單個節點生成的時間戳也可能排序錯誤。因此，基於時鐘時間的 ID 生成器不太可能是線性一致的。

你可以透過依賴高精度時鐘同步，使用原子鐘或 GPS 接收器來減少這種排序不一致。但如果能夠在不依賴特殊硬體的情況下生成唯一且正確排序的 ID 也會很好。這就是 *邏輯時鐘* 的用途。

### 邏輯時鐘 {#sec_consistency_timestamps}

在 ["不可靠的時鐘"](/tw/ch9#sec_distributed_clocks) 中，我們討論了日曆時鐘和單調時鐘。這兩種都是 *物理時鐘*：它們測量經過的秒數（或毫秒、微秒等）。

在分散式系統中，通常還使用另一種時鐘，稱為 *邏輯時鐘*。物理時鐘是計算已經過的秒數的硬體裝置，而邏輯時鐘是計算已發生事件的演算法。來自邏輯時鐘的時間戳因此不會告訴你現在幾點，但你 *可以* 比較來自邏輯時鐘的兩個時間戳，以判斷哪個更早，哪個更晚。

邏輯時鐘的要求通常是：

* 其時間戳緊湊（大小為幾個位元組）且唯一；
* 你可以比較任意兩個時間戳（即它們是 *全序* 的）；並且
* 時間戳的順序與因果關係 *一致*：如果操作 A 發生在 B 之前，那麼 A 的時間戳小於 B 的時間戳。（我們之前在 ["“先發生”關係與併發"](/tw/ch6#sec_replication_happens_before) 中討論了因果關係。）

單節點 ID 生成器滿足這些要求，但我們剛剛討論的分散式 ID 生成器不滿足因果排序要求。

#### Lamport 時間戳 {#lamport-timestamps}

幸運的是，有一種生成邏輯時間戳的簡單方法，它與因果關係 *一致*，你可以將其用作分散式 ID 生成器。它被稱為 *Lamport 時鐘*，由 Leslie Lamport 在 1978 年提出 [^54]，現在是分散式系統領域被引用最多的論文之一。

[圖 10-9](#fig_consistency_lamport_ts) 顯示了 Lamport 時鐘如何在 [圖 10-8](#fig_consistency_id_generator) 的聊天示例中工作。每個節點都有一個唯一識別符號，在 [圖 10-9](#fig_consistency_lamport_ts) 中是名稱"Aaliyah"、"Bryce"或"Caleb"，但在實踐中可能是隨機 UUID 或類似的東西。此外，每個節點都保留一個計數器，記錄它已處理的操作數量。Lamport 時間戳就是一對（*計數器*，*節點 ID*）。兩個節點有時可能具有相同的計數器值，但透過在時間戳中包含節點 ID，每個時間戳都是唯一的。

{{< figure src="/fig/ddia_1009.png" id="fig_consistency_lamport_ts" caption="圖 10-9. Lamport 時間戳提供與因果關係一致的全序。" class="w-full my-4" >}}


每次節點生成時間戳時，它都會遞增其計數器值並使用新值。此外，每次節點看到來自另一個節點的時間戳時，如果該時間戳中的計數器值大於其本地計數器值，它會將其本地計數器增加到與時間戳中的值匹配。

在 [圖 10-9](#fig_consistency_lamport_ts) 中，Aaliyah 在釋出自己的訊息時還沒有看到 Caleb 的訊息，反之亦然。假設兩個使用者都以初始計數器值 0 開始，因此都遞增其本地計數器並將新計數器值 1 附加到其訊息。當 Bryce 收到這些訊息時，他將本地計數器值增加到 1。最後，Bryce 向 Aaliyah 的訊息傳送回覆，為此他遞增本地計數器並將新值 2 附加到訊息。

要比較兩個 Lamport 時間戳，我們首先比較它們的計數器值：例如，(2, "Bryce") 大於 (1, "Aaliyah")，也大於 (1, "Caleb")。如果兩個時間戳具有相同的計數器，我們改為比較它們的節點 ID，使用通常的字典序字串比較。因此，此示例中的時間戳順序是 (1, "Aaliyah") < (1, "Caleb") < (2, "Bryce")。

#### 混合邏輯時鐘 {#hybrid-logical-clocks}

Lamport 時間戳擅長捕獲事物發生的順序，但它們有一些限制：

* 由於它們與物理時間沒有直接關係，你不能使用它們來查詢，比如說，在特定日期釋出的所有訊息——你需要單獨儲存物理時間。
* 如果兩個節點從不通訊，一個節點的計數器遞增將永遠不會反映在另一個節點的計數器中。因此，可能會發生這樣的情況，即在不同節點上大約同一時間生成的事件具有極不相同的計數器值。

*混合邏輯時鐘* 結合了物理日曆時鐘的優勢和 Lamport 時鐘的排序保證 [^55]。像物理時鐘一樣，它計算秒或微秒。像 Lamport 時鐘一樣，當一個節點看到來自另一個節點的時間戳大於其本地時鐘值時，它將自己的本地值向前移動以匹配另一個節點的時間戳。因此，如果一個節點的時鐘執行得很快，其他節點在通訊時也會類似地向前移動它們的時鐘。

每次生成混合邏輯時鐘的時間戳時，它也會遞增，這確保時鐘單調向前移動，即使底層物理時鐘由於 NTP 調整而向後跳躍。因此，混合邏輯時鐘可能略微領先於底層物理時鐘。演算法的細節確保這種差異儘可能小。

因此，你可以將混合邏輯時鐘的時間戳幾乎像傳統日曆時鐘的時間戳一樣對待，具有其排序與先發生關係一致的附加屬性。它不依賴於任何特殊硬體，只需要大致同步的時鐘。例如，CockroachDB 使用混合邏輯時鐘。

#### Lamport/混合邏輯時鐘 vs. 向量時鐘 {#lamporthybrid-logical-clocks-vs-vector-clocks}

在 ["多版本併發控制（MVCC）"](/tw/ch8#sec_transactions_snapshot_impl) 中，我們討論了快照隔離通常是如何實現的：本質上，透過給每個事務一個事務 ID，並允許每個事務看到由 ID 較低的事務進行的寫入，但使 ID 較高的事務的寫入不可見。Lamport 時鐘和混合邏輯時鐘是生成這些事務 ID 的好方法，因為它們確保快照與因果關係一致 [^56]。

當併發生成多個時間戳時，這些演算法會任意排序它們。這意味著當你檢視兩個時間戳時，你通常無法判斷它們是併發生成的還是一個發生在另一個之前。（在 [圖 10-9](#fig_consistency_lamport_ts) 的示例中，你實際上可以判斷 Aaliyah 和 Caleb 的訊息必須是併發的，因為它們具有相同的計數器值，但當計數器值不同時，你無法判斷它們是否併發。）

如果你想能夠確定記錄何時併發建立，你需要不同的演算法，例如 *向量時鐘*。缺點是向量時鐘的時間戳要大得多——可能是系統中每個節點一個整數。有關檢測併發的更多詳細資訊，請參見 ["檢測併發寫入"](/tw/ch6#sec_replication_concurrent)。

### 線性一致的 ID 生成器 {#sec_consistency_linearizable_id}

儘管 Lamport 時鐘和混合邏輯時鐘提供了有用的排序保證，但該排序仍然弱於我們之前討論的線性一致單節點 ID 生成器。回想一下，線性一致性要求如果請求 A 在請求 B 開始之前完成，那麼 B 必須具有更高的 ID，即使 A 和 B 從未相互通訊。另一方面，Lamport 時鐘只能確保節點生成的時間戳大於該節點看到的任何其他時間戳，但它不能對它沒有看到的時間戳說任何話。

[圖 10-10](#fig_consistency_permissions) 顯示了非線性一致 ID 生成器如何導致問題。想象一個社交媒體網站，使用者 A 想要與朋友私下分享一張尷尬的照片。A 的賬戶最初是公開的，但使用他們的筆記型電腦，A 首先將他們的賬戶設定更改為私密。然後 A 使用他們的手機上傳照片。由於 A 按順序執行了這些更新，他們可能合理地期望照片上傳受到新的、受限的賬戶許可權的約束。

{{< figure src="/fig/ddia_1010.png" id="fig_consistency_permissions" caption="圖 10-10. 使用 Lamport 時間戳的許可權系統示例。" class="w-full my-4" >}}


賬戶許可權和照片儲存在兩個單獨的資料庫（或同一資料庫的單獨分片）中，讓我們假設它們使用 Lamport 時鐘或混合邏輯時鐘為每次寫入分配時間戳。由於照片資料庫沒有從賬戶資料庫讀取，照片資料庫中的本地計數器可能稍微落後，因此照片上傳被分配了比賬戶設定更新更低的時間戳。

接下來，假設一個檢視者（不是 A 的朋友）正在檢視 A 的個人資料，他們的讀取使用快照隔離的 MVCC 實現。可能會發生這樣的情況，檢視者的讀取具有大於照片上傳的時間戳，但小於賬戶設定更新的時間戳。因此，系統將確定在讀取時賬戶仍然是公開的，因此向檢視者顯示他們不應該看到的尷尬照片。

你可以想象幾種可能的方法來解決這個問題。也許照片資料庫應該在執行寫入之前讀取使用者的賬戶狀態，但很容易忘記這樣的檢查。如果 A 的操作是在同一裝置上執行的，也許該裝置上的應用程式可以跟蹤該使用者寫入的最新時間戳——但如果使用者使用筆記型電腦和手機，如示例中所示，那就不那麼容易了。

在這種情況下，最簡單的解決方案是使用線性一致的 ID 生成器，這將確保照片上傳被分配比賬戶許可權更改更大的 ID。

#### 實現線性一致的 ID 生成器 {#implementing-a-linearizable-id-generator}

確保 ID 分配線性一致的最簡單方法實際上是為此目的使用單個節點。該節點只需要原子地遞增計數器並在請求時返回其值，持久化計數器值（以便在節點崩潰並重新啟動時不會生成重複的 ID），並使用單主複製進行容錯複製。這種方法在實踐中使用：例如，TiDB/TiKV 稱之為 *時間戳預言機*，受 Google 的 Percolator [^57] 啟發。

作為最佳化，你可以避免在每個請求上執行磁碟寫入和複製。相反，ID 生成器可以寫入描述一批 ID 的記錄；一旦該記錄被持久化並完成複製，節點就可以開始按順序向客戶端分發這些 ID。在它用完該批次中的 ID 之前，它可以為下一批持久化並複製記錄。這樣，如果節點崩潰並重啟，或故障切換到備庫，某些 ID 會被跳過，但不會發出任何重複或亂序的 ID。

你不能輕易地對 ID 生成器進行分片，因為如果你有多個分片獨立分發 ID，你就無法再保證它們的順序是線性一致的。你也不能輕易地將 ID 生成器分佈在多個區域；因此，在地理分散式資料庫中，所有 ID 請求都必須轉到單個區域的節點。從好的方面來說，ID 生成器的工作非常簡單，因此單個節點可以處理大量請求吞吐量。

如果你不想使用單節點 ID 生成器，可以使用替代方案：你可以做 Google 的 Spanner 所做的，如 ["全域性快照的同步時鐘"](/tw/ch9#sec_distributed_spanner) 中所討論的。它依賴於物理時鐘，該時鐘不僅返回單個時間戳，還返回表示時鐘讀數不確定性的時間戳範圍。然後它等待該不確定性間隔的持續時間過去後再返回。

假設不確定性間隔是正確的（即真實的當前物理時間始終位於該間隔內），此過程還確保如果一個請求在另一個請求開始之前完成，後一個請求將具有更大的時間戳。這種方法確保了這種線性一致的 ID 分配，而無需任何通訊：即使不同區域的請求也將被正確排序，無需等待跨區域請求。缺點是你需要硬體和軟體支援，以使時鐘緊密同步並計算必要的不確定性間隔。

#### 使用邏輯時鐘強制約束 {#enforcing-constraints-using-logical-clocks}

在 ["約束與唯一性保證"](#sec_consistency_uniqueness) 中，我們看到線性一致的比較並設定操作可用於在分散式系統中實現鎖、唯一性約束和類似構造。這提出了一個問題：邏輯時鐘或線性一致的 ID 生成器是否也足以實現這些東西？

答案是：不完全。當你有幾個節點都試圖獲取同一個鎖或註冊同一個使用者名稱時，你可以使用邏輯時鐘為這些請求分配時間戳，並選擇具有最低時間戳的請求作為獲勝者。如果時鐘是線性一致的，你知道任何未來的請求都將始終生成更大的時間戳，因此你可以確定沒有未來的請求會收到比獲勝者更低的時間戳。

不幸的是，問題的一部分仍未解決：節點如何知道自己的時間戳是否最低？要確定這一點，它需要收到可能生成時間戳的 *每個* 其他節點的訊息 [^54]。如果其他節點之一在此期間失敗，或者由於網路問題無法訪問，該系統將停止執行，因為我們無法確定該節點是否可能具有最低的時間戳。這不是我們需要的那種容錯系統。

要以容錯方式實現鎖、租約和類似構造，我們需要比邏輯時鐘或 ID 生成器更強大的東西：我們需要共識。


## 共識 {#sec_consistency_consensus}

在本章中，我們已經看到了幾個只有單個節點時很容易，但如果你想要容錯就會變得困難得多的例子：

* 如果你只有一個主節點，並且在該主節點上進行所有讀寫，資料庫可以是線性一致的。但是，如果該主節點失敗，如何進行故障切換，同時避免腦裂？如何確保一個認為自己是主節點的節點實際上沒有被投票罷免？
* 單節點上的線性一致 ID 生成器只是一個帶有原子獲取並增加指令的計數器，但如果它崩潰了怎麼辦？
* 原子比較並設定（CAS）操作對許多事情都很有用，例如當多個程序競相獲取它時決定誰獲得鎖或租約，或確保具有給定名稱的檔案或使用者的唯一性。在單個節點上，CAS 可能就像一條 CPU 指令一樣簡單，但如何使其容錯？

事實證明，所有這些都是同一個基本分散式系統問題的例項：*共識*。共識是分散式計算中最重要和最基本的問題之一；它也是出了名的難以正確實現 [^58] [^59]，許多系統在過去都出錯了。現在我們已經討論了複製（[第六章](/tw/ch6)）、事務（[第八章](/tw/ch8)）、系統模型（[第九章](/tw/ch9)）和線性一致性（本章），我們終於準備好解決共識問題了。

最著名的共識演算法是 Viewstamped Replication [^60] [^61]、Paxos [^58] [^62] [^63] [^64]、Raft [^23] [^65] [^66] 和 Zab [^18] [^22] [^67]。這些演算法之間有相當多的相似之處，但它們並不相同 [^68] [^69]。這些演算法在非拜占庭系統模型中工作：也就是說，網路通訊可能會被任意延遲或丟棄，節點可能會崩潰、重啟和斷開連線，但演算法假設節點在其他方面正確遵循協議，不會惡意行為。

也有可以容忍某些拜占庭節點的共識演算法，即不正確遵循協議的節點（例如，向其他節點發送矛盾訊息）。一個常見的假設是少於三分之一的節點是拜占庭故障的 [^26] [^70]。這種 *拜占庭容錯*（BFT）共識演算法用於區塊鏈 [^71]。然而，如 ["拜占庭故障"](/tw/ch9#sec_distributed_byzantine) 中所解釋的，BFT 演算法超出了本書的範圍。

--------

> [!TIP] 共識的不可能性

你可能聽說過 FLP 結果 [^72]——以作者 Fischer、Lynch 和 Paterson 的名字命名——它證明如果存在節點可能崩潰的風險，就沒有演算法總是能夠達成共識。在分散式系統中，我們必須假設節點可能會崩潰，因此可靠的共識是不可能的。然而，在這裡我們正在討論實現共識的演算法。這是怎麼回事？

首先，FLP 並不是說我們永遠無法達成共識——它只是說我們不能保證共識演算法 *總是* 終止。此外，FLP 結果是在非同步系統模型中假設確定性演算法的情況下證明的（見 ["系統模型與現實"](/tw/ch9#sec_distributed_system_model)），這意味著演算法不能使用任何時鐘或超時。如果它可以使用超時來懷疑另一個節點可能已經崩潰（即使懷疑有時是錯誤的），那麼共識就變得可解 [^73]。即使只是允許演算法使用隨機數也足以繞過不可能性結果 [^74]。

因此，儘管 FLP 關於共識不可能性的結果具有重要的理論意義，但分散式系統通常可以在實踐中實現共識。

--------

### 共識的多面性 {#sec_consistency_faces}

共識可以用幾種不同的方式表達：

* *單值共識* 非常類似於原子 *比較並設定* 操作，它可用於實現鎖、租約和唯一性約束。
* 構建 *僅追加日誌* 也需要共識；它通常形式化為 *全序廣播*。有了日誌，你可以構建 *狀態機複製*、基於主節點的複製、事件溯源和其他有用的東西。
* 多資料庫或多分片事務的 *原子提交* 要求所有參與者就是否提交或中止事務達成一致。

我們很快就會探討所有這些。事實上，這些問題都是相互等價的：如果你有解決其中一個問題的演算法，你可以將其轉換為任何其他問題的解決方案。這是一個相當深刻且也許令人驚訝的見解！這就是為什麼我們可以將所有這些東西歸入"共識"之下，即使它們表面上看起來完全不同。

#### 單值共識 {#single-value-consensus}

共識的標準表述涉及讓多個節點就單個值達成一致。例如：

* 當具有單主複製的資料庫首次啟動時，或者當現有主節點失敗時，多個節點可能會同時嘗試成為主節點。同樣，多個節點可能競相獲取鎖或租約。共識允許它們決定哪一個獲勝。
* 如果幾個人同時嘗試預訂飛機上的最後一個座位，或劇院中的同一個座位，或嘗試使用相同的使用者名稱註冊賬戶，那麼共識演算法可以確定哪一個應該成功。

更一般地說，一個或多個節點可能 *提議* 值，共識演算法 *決定* 其中一個值。在上述示例中，每個節點可以提議自己的 ID，演算法決定哪個節點 ID 應該成為新的主節點、租約的持有者或飛機/劇院座位的購買者。在這種形式主義中，共識演算法必須滿足以下屬性 [^26]：

一致同意
: 沒有兩個節點決定不同。

完整性
: 一旦節點決定了一個值，它就不能透過決定另一個值來改變主意。

有效性
: 如果節點決定值 *v*，那麼 *v* 是由某個節點提議的。

終止
: 每個未崩潰的節點最終都會決定某個值。

如果你想決定多個值，你可以為每個值執行共識演算法的單獨例項。例如，你可以為劇院中的每個可預訂座位進行單獨的共識執行，這樣你就可以為每個座位獲得一個決定（一個買家）。

一致同意和完整性屬性定義了共識的核心思想：每個人都決定相同的結果，一旦你決定了，你就不能改變主意。有效性屬性排除了瑣碎的解決方案：例如，你可以有一個總是決定 `null` 的演算法，無論提議什麼；這個演算法將滿足同意和完整性屬性，但不滿足有效性屬性。

如果你不關心容錯，那麼滿足前三個屬性很容易：你可以硬編碼一個節點作為"獨裁者"，讓該節點做出所有決定。然而，如果那個節點失敗，那麼系統就無法再做出任何決定——就像沒有故障切換的單主複製一樣。所有的困難都來自對容錯的需求。

終止屬性形式化了容錯的想法。它本質上是說共識演算法不能簡單地坐著什麼都不做——換句話說，它必須取得進展。即使某些節點失敗，其他節點仍必須達成決定。（終止是活性屬性，而其他三個是安全屬性——見 ["安全性和活性"](/tw/ch9#sec_distributed_safety_liveness)。）

如果崩潰的節點可能恢復，你可以等待它回來。然而，共識必須確保即使崩潰的節點突然消失並且永遠不會回來，它也會做出決定。（不要想象軟體崩潰，而是想象有地震，包含你的節點的資料中心被山體滑坡摧毀。你必須假設你的節點被埋在 30 英尺的泥土下，永遠不會重新上線。）

當然，如果 *所有* 節點都崩潰了，並且沒有一個在執行，那麼任何演算法都不可能決定任何事情。演算法可以容忍的故障數量是有限的：事實上，可以證明任何共識演算法都需要至少大多數節點正常執行才能確保終止 [^73]。該多數可以安全地形成仲裁（見 ["讀寫仲裁"](/tw/ch6#sec_replication_quorum_condition)）。

因此，終止屬性受到少於一半節點崩潰或不可達的假設的約束。然而，大多數共識演算法確保安全屬性——同意、完整性和有效性——始終得到滿足，即使大多數節點失敗或存在嚴重的網路問題 [^75]。因此，大規模中斷可能會阻止系統處理請求，但它不能透過導致做出不一致的決定來破壞共識系統。

#### 比較並設定作為共識 {#compare-and-set-as-consensus}

比較並設定（CAS）操作檢查某個物件的當前值是否等於某個期望值；如果是，它原子地將物件更新為某個新值；如果不是，它保持物件不變並返回錯誤。

如果你有容錯、線性一致的 CAS 操作，很容易解決共識問題：最初將物件設定為空值；每個想要提議值的節點都使用期望值為空、新值為它想要提議的值（假設它是非空的）呼叫 CAS。然後決定的值就是物件設定的任何值。

同樣，如果你有共識的解決方案，你可以實現 CAS：每當一個或多個節點想要使用相同的期望值執行 CAS 時，你使用共識協議提議 CAS 呼叫中的新值，然後將物件設定為共識決定的任何值。任何新值未被決定的 CAS 呼叫都返回錯誤。具有不同期望值的 CAS 呼叫使用共識協議的單獨執行。

這表明 CAS 和共識彼此等價 [^28] [^73]。同樣，兩者在單個節點上都很簡單，但要使其容錯則具有挑戰性。作為分散式環境中 CAS 的示例，我們在 ["由物件儲存支援的資料庫"](/tw/ch6#sec_replication_object_storage) 中看到了物件儲存的條件寫入操作，它允許寫入僅在自當前客戶端上次讀取以來具有相同名稱的物件未被另一個客戶端建立或修改時發生。

然而，線性一致的讀寫暫存器不足以解決共識。FLP 結果告訴我們，共識不能由非同步崩潰停止模型中的確定性演算法解決 [^72]，但我們在 ["線性一致性與仲裁"](#sec_consistency_quorum_linearizable) 中看到，線性一致的暫存器可以使用此模型中的仲裁讀/寫來實現 [^24] [^25] [^26]。由此可見，線性一致的暫存器無法解決共識。

#### 共享日誌作為共識 {#sec_consistency_shared_logs}

我們已經看到了幾個日誌的例子，例如複製日誌、事務日誌和預寫日誌。日誌儲存一系列 *日誌條目*，任何讀取它的人都會看到相同順序的相同條目。有時日誌有一個允許追加新條目的單個寫入者，但 *共享日誌* 是多個節點可以請求追加條目的日誌。單主複製就是一個例子：任何客戶端都可以要求主節點進行寫入，主節點將其追加到複製日誌，然後所有備庫按照與主節點相同的順序應用寫入。

更正式地說，共享日誌支援兩種操作：你可以請求將值新增到日誌中，並且可以讀取日誌中的條目。它必須滿足以下屬性：

最終追加
: 如果節點請求將某個值新增到日誌中，並且節點不會崩潰，那麼該節點最終必須在日誌條目中讀取該值。

可靠交付
: 沒有日誌條目丟失：如果一個節點讀取某個日誌條目，那麼最終每個未崩潰的節點也必須讀取該日誌條目。

僅追加
: 一旦節點讀取了某個日誌條目，它就是不可變的，新的日誌條目只能在它之後新增，而不能在之前。節點可能會重新讀取日誌，在這種情況下，它會以與最初讀取它們時相同的順序看到相同的日誌條目（即使節點崩潰並重新啟動）。

一致性
: 如果兩個節點都讀取某個日誌條目 *e*，那麼在 *e* 之前，它們必須以相同的順序讀取完全相同的日誌條目序列。

有效性
: 如果節點讀取包含某個值的日誌條目，那麼某個節點先前請求將該值新增到日誌中。

--------

> [!NOTE]
> 共享日誌在形式上被稱為 *全序廣播*、*原子廣播* 或 *全序組播* 協議 [^26] [^76] [^77]。這是用不同的詞描述的同一件事：請求將值新增到日誌中然後稱為"廣播"它，讀取日誌條目稱為"交付"它。

--------

如果你有共享日誌的實現，很容易解決共識問題：每個想要提議值的節點都請求將其新增到日誌中，第一個日誌條目中讀回的任何值就是決定的值。由於所有節點以相同的順序讀取日誌條目，它們保證就首先交付哪個值達成一致 [^28]。

相反，如果你有共識的解決方案，你可以實現共享日誌。細節有點複雜，但基本思想是這樣的 [^73]：

1. 你為每個未來的日誌條目在日誌中都有一個槽，並且你為每個這樣的槽執行共識演算法的單獨例項，以決定該條目中應該包含什麼值。
2. 當節點想要向日誌新增值時，它為尚未決定的槽之一提議該值。
3. 當共識演算法為其中一個槽做出決定，並且所有先前的槽都已經決定時，則決定的值作為新的日誌條目追加，並且已經決定的任何連續槽也將其決定的值追加到日誌中。
4. 如果提議的值未被某個槽選擇，想要新增它的節點會透過為稍後的槽提議它來重試。

這表明共識等價於全序廣播和共享日誌。沒有故障切換的單主複製不滿足活性要求，因為如果主節點崩潰，它將停止傳遞訊息。像往常一樣，挑戰在於安全地自動執行故障切換。

#### 獲取並增加作為共識 {#fetch-and-add-as-consensus}

我們在 ["線性一致的 ID 生成器"](#sec_consistency_linearizable_id) 中看到的線性一致 ID 生成器接近解決共識，但略有不足。我們可以使用獲取並增加操作實現這樣的 ID 生成器，該操作原子地遞增計數器並返回舊的計數器值。

如果你有 CAS 操作，很容易實現獲取並增加：首先讀取計數器值，然後執行 CAS，其中期望值是你讀取的值，新值是該值加一。如果 CAS 失敗，你將重試整個過程，直到 CAS 成功。當存在爭用時，這比本機獲取並增加操作效率低，但在功能上是等效的。由於你可以使用共識實現 CAS，你也可以使用共識實現獲取並增加。

相反，如果你有容錯的獲取並增加操作，你能解決共識問題嗎？假設你將計數器初始化為零，每個想要提議值的節點都呼叫獲取並增加操作來遞增計數器。由於獲取並增加操作是原子的，其中一個節點將讀取初始值零，其他節點都將讀取至少遞增過一次的值。

現在假設讀取零的節點是獲勝者，它的值被決定。這對於讀取零的節點有效，但其他節點有問題：它們知道自己不是獲勝者，但它們不知道其他節點中哪一個獲勝了。獲勝者可以向其他節點發送訊息，讓它們知道它已經獲勝，但如果獲勝者在有機會發送此訊息之前崩潰了怎麼辦？在這種情況下，其他節點將被掛起，無法決定任何值，因此共識不會終止。其他節點不能回退到另一個節點，因為讀取零的節點可能會回來並正確地決定它提議的值。

一個例外是，如果我們確定不超過兩個節點將提議值。在這種情況下，節點可以相互發送它們想要提議的值，然後每個都執行獲取並增加操作。讀取零的節點決定自己的值，讀取一的節點決定另一個節點的值。這解決了兩個節點之間的共識問題，這就是為什麼我們可以說獲取並增加的 *共識數* 為二 [^28]。相比之下，CAS 和共享日誌解決了任意數量節點可能提議值的共識，因此它們的共識數為 ∞（無窮大）。

#### 原子提交作為共識 {#atomic-commitment-as-consensus}

在 ["分散式事務"](/tw/ch8#sec_transactions_distributed) 中，我們看到了 *原子提交* 問題，即確保參與分散式事務的資料庫或分片都提交或中止事務。我們還看到了 *兩階段提交* 演算法，它依賴於作為單點故障的協調器。

共識和原子提交之間有什麼關係？乍一看，它們似乎非常相似——兩者都需要節點達成某種形式的一致。然而，有一個重要的區別：對於共識，可以決定提議的任何值，而對於原子提交，如果 *任何* 參與者投票中止，演算法 *必須* 中止。更準確地說，原子提交需要以下屬性 [^78]：

一致同意
: 沒有兩個節點決定不同的結果。

完整性
: 一旦節點決定了一個結果，它就不能透過決定另一個結果來改變主意。

有效性
: 如果節點決定提交，那麼所有節點必須先前投票提交。如果任何節點投票中止，節點必須中止。

非平凡性
: 如果所有節點都投票提交，並且沒有發生通訊超時，那麼所有節點必須決定提交。

終止
: 每個未崩潰的節點最終都會決定提交或中止。

有效性屬性確保事務只有在所有節點都同意時才能提交；非平凡性屬性確保演算法不能簡單地總是中止（但如果任何節點之間的通訊超時，它允許中止）。其他三個屬性基本上與共識相同。

如果你有共識的解決方案，有多種方法可以解決原子提交 [^78] [^79]。一種方法是這樣的：當你想要提交事務時，每個節點將其提交或中止的投票傳送給每個其他節點。從自己和每個其他節點收到提交投票的節點使用共識演算法提議"提交"；收到中止投票或經歷超時的節點使用共識演算法提議"中止"。當節點發現共識演算法決定了什麼時，它會相應地提交或中止。

在這個演算法中，只有當所有節點都投票提交時，才會提議"提交"。如果任何節點投票中止，所有共識演算法中的提議都將是"中止"。如果所有節點都投票提交但某些通訊超時，可能會發生某些節點提議"中止"而其他節點提議"提交"；在這種情況下，節點是提交還是中止並不重要，只要它們都做同樣的事。

如果你有容錯的原子提交協議，你也可以解決共識。每個想要提議值的節點都在節點仲裁上啟動事務，並在每個節點上執行單節點 CAS，如果其值尚未被另一個事務設定，則將暫存器設定為提議的值。如果 CAS 成功，節點投票提交，否則投票中止。如果原子提交協議決定提交事務，其值將被決定用於共識；如果原子提交中止，提議節點將使用新事務重試。

這表明原子提交和共識也是彼此等價的。

### 共識的實踐 {#sec_consistency_total_order}

我們已經看到，單值共識、CAS、共享日誌和原子提交都彼此等價：你可以將其中一個的解決方案轉換為任何其他的解決方案。這是一個有價值的理論見解，但它沒有回答這個問題：在實踐中，這些許多共識表述中哪一個最有用？

答案是大多數共識系統提供共享日誌，也稱為全序廣播。Raft、Viewstamped Replication 和 Zab 直接提供共享日誌。Paxos 提供單值共識，但在實踐中，大多數使用 Paxos 的系統實際上使用稱為 Multi-Paxos 的擴充套件，它也提供共享日誌。

#### 使用共享日誌 {#sec_consistency_smr}

共享日誌非常適合資料庫複製：如果每個日誌條目代表對資料庫的寫入，並且每個副本使用確定性邏輯以相同的順序處理相同的寫入，那麼副本將全部處於一致狀態。這個想法被稱為 *狀態機複製* [^80]，它是事件溯源背後的原則，我們在 ["事件溯源和 CQRS"](/tw/ch3#sec_datamodels_events) 中看到了。共享日誌對於流處理也很有用，我們將在 [第十二章](/tw/ch12#ch_stream) 中看到。

同樣，共享日誌可用於實現可序列化事務：如 ["實際序列執行"](/tw/ch8#sec_transactions_serial) 中所討論的，如果每個日誌條目代表要作為儲存過程執行的確定性事務，並且如果每個節點以相同的順序執行這些事務，那麼事務將是可序列化的 [^81] [^82]。

---------

> [!NOTE]
> 具有強一致性模型的分片資料庫通常為每個分片維護一個單獨的日誌，這提高了可伸縮性，但限制了它們可以跨分片提供的一致性保證（例如，一致快照、外部索引鍵引用）。跨分片的可序列化事務是可能的，但需要額外的協調 [^83]。

--------

共享日誌也很強大，因為它可以很容易地適應其他形式的共識：

* 我們之前看到了如何使用它來實現單值共識和 CAS：只需決定日誌中首先出現的值。
* 如果你想要許多單值共識例項（例如，幾個人試圖預訂的劇院中每個座位一個），請在日誌條目中包含座位編號，並決定包含給定座位編號的第一個日誌條目。
* 如果你想要原子獲取並增加，請將要新增到計數器的數字放入日誌條目中，當前計數器值是到目前為止所有日誌條目的總和。日誌條目上的簡單計數器可用於生成柵欄令牌（見 ["柵欄化殭屍和延遲請求"](/tw/ch9#sec_distributed_fencing_tokens)）；例如，在 ZooKeeper 中，此序列號稱為 `zxid` [^18]。

#### 從單主複製到共識 {#from-single-leader-replication-to-consensus}

我們之前看到，如果你有一個單一的"獨裁者"節點做出決定，單值共識很容易，同樣，如果單個主節點是唯一允許向其追加條目的節點，共享日誌也很容易。問題是如果該節點失敗如何提供容錯。

傳統上，具有單主複製的資料庫沒有解決這個問題：它們將主節點故障切換作為人類管理員必須手動執行的操作。不幸的是，這意味著大量的停機時間，因為人類反應的速度是有限的，並且它不滿足共識的終止屬性。對於共識，我們要求演算法可以自動選擇新的主節點。（並非所有共識演算法都有主節點，但常用的演算法有 [^84]。）

然而，有一個問題。我們之前討論過腦裂的問題，並說所有節點都需要就誰是主節點達成一致——否則兩個不同的節點可能各自認為自己是主節點，從而做出不一致的決定。因此，似乎我們需要共識來選舉主節點，而我們需要主節點來解決共識。我們如何擺脫這個難題？

事實上，共識演算法不要求在任何時候只有一個主節點。相反，它們做出了較弱的保證：它們定義了一個 *紀元編號*（在 Paxos 中稱為 *投票編號*，在 Viewstamped Replication 中稱為 *檢視編號*，在 Raft 中稱為 *任期編號*）並保證在每個紀元內，主節點是唯一的。

當節點因為在某個超時時間內沒有收到主節點的訊息而認為當前主節點已死時，它可能會開始投票選舉新的主節點。這次選舉被賦予一個大於任何先前紀元的新紀元編號。如果兩個不同紀元中的兩個不同主節點之間存在衝突（也許是因為先前的主節點實際上並沒有死），那麼具有更高紀元編號的主節點獲勝。

在主節點被允許將下一個條目追加到共享日誌之前，它必須首先檢查是否有其他具有更高紀元編號的主節點可能追加不同的條目。它可以透過從一個節點仲裁收集投票來做到這一點，通常（但並非總是）是多數節點 [^85]。只有在節點不知道任何其他具有更高紀元的主節點時，節點才會投贊成票。

因此，我們有兩輪投票：一次選擇主節點，第二次對主節點提議的下一個要追加到日誌的條目進行投票。這兩次投票的仲裁必須重疊：如果對提議的投票成功，投票支援它的節點中至少有一個也必須參與了最近成功的主節點選舉 [^85]。因此，如果對提議的投票通過而沒有透露任何更高編號的紀元，當前主節點可以得出結論，沒有選出具有更高紀元編號的主節點，因此它可以安全地將提議的條目追加到日誌中 [^26] [^86]。

這兩輪投票表面上看起來類似於兩階段提交，但它們是非常不同的協議。在共識演算法中，任何節點都可以開始選舉，它只需要節點仲裁的響應；在 2PC 中，只有協調器可以請求投票，它需要 *每個* 參與者的"是"投票才能提交。

#### 共識的微妙之處 {#subtleties-of-consensus}

這個基本結構是 Raft、Multi-Paxos、Zab 和 Viewstamped Replication 所共有的：節點仲裁的投票選舉主節點，然後主節點想要追加到日誌的每個條目都需要另一個仲裁投票 [^68] [^69]。每個新的日誌條目在確認給請求寫入的客戶端之前都會同步複製到節點仲裁。這確保如果當前主節點失敗，日誌條目不會丟失。

然而，魔鬼在細節中，這也是這些演算法採用不同方法的地方。例如，當舊主節點失敗並選出新主節點時，演算法需要確保新主節點遵守舊主節點在失敗之前已經追加的任何日誌條目。Raft 透過只允許其日誌至少與其大多數追隨者一樣最新的節點成為新主節點來做到這一點 [^69]。相比之下，Paxos 允許任何節點成為新主節點，但要求它在開始追加自己的新條目之前使其日誌與其他節點保持最新。


--------

> [!TIP] 主節點選舉中的一致性與可用性

如果你希望共識演算法嚴格保證 ["共享日誌作為共識"](#sec_consistency_shared_logs) 中列出的屬性，那麼新主節點在處理任何寫入或線性一致讀取之前必須瞭解任何已確認的日誌條目，這一點至關重要。如果具有過時資料的節點成為新主節點，它可能會將新值寫入已經由舊主節點寫入的日誌條目，從而違反共享日誌的僅追加屬性。

在某些情況下，你可能選擇削弱共識屬性，以便更快地從主節點故障中恢復。例如，Kafka 提供了啟用 *不乾淨的主節點選舉* 的選項，它允許任何副本成為主節點，即使它不是最新的。此外，在採用非同步複製的資料庫中，當主節點失敗時，你無法保證任何備庫是最新的。

如果你放棄新主節點必須是最新的要求，你可能會提高效能和可用性，但你是在薄冰上，因為共識理論不再適用。雖然只要沒有故障，事情就會正常工作，但 [第九章](/tw/ch9) 中討論的問題很容易導致大量資料丟失或損壞。

--------

另一個微妙之處是，演算法如何處理舊主節點在失敗之前已經提議、但追加到日誌的投票尚未完成的日誌條目。你可以在本章的參考文獻中找到這些細節的討論 [^23] [^69] [^86]。

對於使用共識演算法進行複製的資料庫，不僅寫入需要轉換為日誌條目並複製到仲裁。如果你想保證線性一致的讀取，它們也必須像寫入一樣通過仲裁投票，以確認認為自己是主節點的節點確實仍然是最新的。例如，etcd 中的線性一致讀取就是這樣工作的。

在其標準形式中，大多數共識演算法假設一組固定的節點——也就是說，節點可能會宕機並重新啟動，但允許投票的節點集在建立叢集時是固定的。在實踐中，通常需要在系統配置中新增新節點或刪除舊節點。共識演算法已經擴充了 *重新配置* 功能，使這成為可能。這在向系統新增新區域或從一個位置遷移到另一個位置（透過首先新增新節點，然後刪除舊節點）時特別有用。

#### 共識的利弊 {#pros-and-cons-of-consensus}

儘管它們複雜而微妙，但共識演算法是分散式系統的巨大突破。共識本質上是"正確完成的單主複製"，在主節點故障時自動故障切換，確保沒有已提交的資料丟失，也不可能出現腦裂，即使面對我們在 [第九章](/tw/ch9) 中討論的所有問題。

由於單主複製與自動故障切換本質上是共識的定義之一，任何提供自動故障切換但不使用經過驗證的共識演算法的系統都可能是不安全的 [^87]。使用經過驗證的共識演算法並不能保證整個系統的正確性——仍然有很多其他地方可能潛伏著錯誤——但這是一個好的開始。

然而，共識並不是到處都使用，因為好處是有代價的。共識系統總是需要嚴格的多數才能執行——容忍一個故障需要三個節點，或者容忍兩個故障需要五個節點。每個操作都需要與仲裁通訊，因此你不能透過新增更多節點來增加吞吐量（事實上，你新增的每個節點都會使演算法變慢）。如果網路分割槽將某些節點與其餘節點隔離，只有網路的多數部分可以取得進展，其餘部分被阻塞。

共識系統通常依賴超時來檢測失敗的節點。在具有高度可變網路延遲的環境中，特別是跨多個地理區域分佈的系統，調整這些超時可能很困難：如果它們太大，從故障中恢復需要很長時間；如果它們太小，可能會有很多不必要的主節點選舉，導致糟糕的效能，因為系統最終花費更多時間選擇主節點而不是做有用的工作。

有時，共識演算法對網路問題特別敏感。例如，Raft 已被證明具有不愉快的邊緣情況 [^88] [^89]：如果除了一個始終不可靠的特定網路連結之外，整個網路都正常工作，Raft 可能會進入主節點身份在兩個節點之間不斷跳躍的情況，或者當前主節點不斷被迫辭職，因此系統實際上從未取得進展。設計對不可靠網路更穩健的演算法仍然是一個開放的研究問題。

對於想要高可用但不想接受共識成本的系統，唯一真正的選擇是使用較弱的一致性模型，例如 [第六章](/tw/ch6) 中討論的無主或多主複製提供的模型。這些方法通常不提供線性一致性，但對於不需要它的應用程式來說已經足夠。


### 協調服務 {#sec_consistency_coordination}

共識演算法對於任何希望提供線性一致操作的分散式資料庫都很有價值，許多現代分散式資料庫也都用共識來做複製。但有一類系統是共識演算法的重度使用者：*協調服務*，例如 ZooKeeper、etcd 和 Consul。雖然它們表面上看起來像普通鍵值儲存，但它們並不是為通用資料儲存而設計的。

相反，它們的目標是協調另一個分散式系統中的多個節點。例如，Kubernetes 依賴 etcd；Spark 和 Flink 在高可用模式下會在後臺依賴 ZooKeeper。協調服務通常只儲存小規模資料，這些資料可以完全放入記憶體（同時仍會寫盤以保證永續性），並透過容錯共識演算法在多個節點間複製。

協調服務的設計思路來自 Google 的 Chubby 鎖服務 [^17] [^58]。它把共識演算法與一些在分散式系統裡尤其有用的能力結合在一起：

鎖與租約
: 我們前面看到，共識系統可以實現具備容錯能力的原子比較並設定（CAS）操作。協調服務正是基於這一點來實現鎖和租約：若多個節點併發嘗試獲取同一個租約，最終只會有一個成功。

支援柵欄
: 如 ["分散式鎖和租約"](/tw/ch9#sec_distributed_lock_fencing) 所述，當某個資源受租約保護時，需要 *柵欄* 機制來防止程序暫停或網路大延遲時的相互干擾。共識系統可透過為每個日誌條目分配單調遞增 ID 來生成柵欄令牌（ZooKeeper 中的 `zxid` 和 `cversion`，etcd 中的 revision）。

故障檢測
: 客戶端會在協調服務上維持長連線會話，並透過週期性心跳檢查對端是否存活。即使連線臨時中斷或某臺服務端故障，客戶端持有的租約仍可保持有效；但如果超過租約超時時間仍未收到心跳，協調服務就會認為客戶端已失效並釋放租約（ZooKeeper 將其稱為 *臨時節點*）。

變更通知
: 客戶端可以請求：當某些鍵發生變化時由協調服務主動通知。這樣客戶端就能知道另一個節點何時加入叢集（基於其寫入的值），或者何時失效（會話超時、臨時節點消失）。這類通知避免了客戶端頻繁輪詢。

故障檢測和變更通知本身不需要共識，但與需要共識的原子操作、柵欄機制結合後，它們對分散式協調非常有用。

--------

> [!TIP] 用協調服務管理配置

應用與基礎設施通常都有配置引數，例如超時時間、執行緒池大小等。有時會把這類配置資料以鍵值對形式存放在協調服務中。程序啟動時載入最新配置，並訂閱後續變更通知。配置更新後，程序可以立即應用新值，或重啟後生效。

配置管理本身不需要協調服務裡的共識能力；但如果系統本來就已經運行了協調服務，那麼直接複用它的通知機制會很方便。另一種做法是程序週期性地從檔案或 URL 拉取配置更新，以避免依賴專門的協調服務。

--------

#### 將工作分配給節點 {#allocating-work-to-nodes}

當你有某個程序或服務的多個例項，且其中一個需要被選為主節點時，協調服務很有用。如果主節點失效，其他節點之一應當接管。這不僅適用於單主資料庫，也適用於作業排程器等有狀態系統。

另一個場景是：你有某種分片資源（資料庫、訊息流、檔案儲存、分散式 Actor 系統等），需要決定每個分片由哪個節點負責。隨著新節點加入叢集，需要把部分分片從舊節點遷移到新節點以實現再平衡；當節點被移除或失效時，其他節點需要接手其工作。

這類任務可以透過協調服務中的原子操作、臨時節點和通知機制配合完成。若實現得當，應用可以在無人值守的情況下自動從故障中恢復。即使有 Apache Curator 這類在 ZooKeeper 客戶端 API 上封裝的高階庫，這件事仍不容易；但它仍遠好於從零實現共識演算法，後者極易引入缺陷。

專用協調服務還有一個優勢：無論被協調系統有多少節點，協調服務本身通常都只需執行在一組固定節點上（常見是 3 個或 5 個）。例如，一個擁有數千分片的儲存系統若在數千節點上直接跑共識會非常低效；把共識“外包”給少量協調服務節點通常更合理。

通常，協調服務管理的資料變化頻率不高：例如“IP 為 10.1.1.23 的節點當前是分片 7 的主節點”這類資訊，更新週期往往是分鐘級或小時級。協調服務不適合儲存每秒變化數千次的資料。對於高頻變化資料，應該使用常規資料庫；或者使用 Apache BookKeeper [^90] [^91] 這類工具複製服務內部的快速變化狀態。

#### 服務發現 {#service-discovery}

ZooKeeper、etcd 和 Consul 也常用於 *服務發現*：即確定連線某個服務所需的 IP 地址（見 ["負載均衡、服務發現和服務網格"](/tw/ch5#sec_encoding_service_discovery)）。在雲環境下，虛擬機器常常頻繁上下線，因此你通常無法預先知道服務地址。常見做法是讓服務啟動時把自身網路端點註冊到服務登錄檔，再供其他服務查詢。

用協調服務做服務發現很方便，因為它的故障檢測和變更通知能讓客戶端及時跟蹤服務例項的增減。而且如果你本來就用協調服務做租約、鎖或主節點選舉，那麼繼續複用它做服務發現通常也很自然，因為它已經知道哪個節點應該接收請求。

不過，對服務發現使用共識往往有些“殺雞用牛刀”：這個場景通常不要求線性一致性，更重要的是高可用和低延遲，因為沒有服務發現，整個系統都會停滯。因此通常更傾向於快取服務發現結果，並接受其可能略有陳舊。比如基於 DNS 的服務發現，就是透過多層快取來獲得良好的效能與可用性。

為支援這類需求，ZooKeeper 提供了 *observer*（觀察者）節點：它接收日誌並維護一份 ZooKeeper 資料副本，但不參與共識投票。來自 observer 的讀取不具備線性一致性（可能陳舊），但即使網路中斷仍然可用，並且能透過快取提高系統可支援的讀吞吐量。

## 總結 {#summary}

在本章中，我們研究了容錯系統中強一致性的主題：它是什麼，以及如何實現它。我們深入研究了線性一致性，這是強一致性的一種流行形式化：它意味著複製的資料看起來好像只有一個副本，所有操作都以原子方式作用於它。我們看到，當你需要在讀取時某些資料是最新的，或者需要解決競爭條件（例如，如果多個節點併發地嘗試做同樣的事情，比如建立具有相同名稱的檔案）時，線性一致性是有用的。

雖然線性一致性很有吸引力，因為它易於理解——它使資料庫的行為像單執行緒程式中的變數一樣——但它的缺點是速度慢，特別是在網路延遲較大的環境中。許多複製演算法不能保證線性一致性，即使表面上看起來它們可能提供強一致性。

接下來，我們在 ID 生成器的背景下應用了線性一致性的概念。單節點自增計數器是線性一致的，但不是容錯的。許多分散式 ID 生成方案不能保證 ID 的順序與事件實際發生的順序一致。像 Lamport 時鐘和混合邏輯時鐘這樣的邏輯時鐘提供了與因果關係一致的順序，但沒有線性一致性。

這引導我們進入了共識的概念。我們看到，達成共識意味著以一種所有節點都認同所決定內容的方式決定某事，並且它們不能改變主意。廣泛的問題實際上可以歸約為共識，並且彼此等價（即，如果你有一個問題的解決方案，你可以將其轉換為所有其他問題的解決方案）。這些等價的問題包括：

線性一致的比較並設定操作
: 暫存器需要根據其當前值是否等於操作中給定的引數，原子地 **決定** 是否設定其值。

鎖和租約
: 當多個客戶端併發地嘗試獲取鎖或租約時，鎖 **決定** 哪一個成功獲取它。

唯一性約束
: 當多個事務併發地嘗試建立具有相同鍵的衝突記錄時，約束必須 **決定** 允許哪一個，哪一個應該因約束違反而失敗。

共享日誌
: 當多個節點併發地想要向日誌追加條目時，日誌 **決定** 它們被追加的順序。全序廣播也是等價的。

原子事務提交
: 參與分散式事務的資料庫節點必須都以相同的方式 **決定** 是提交還是中止事務。

線性一致的獲取並增加操作
: 這個操作可以用來實現 ID 生成器。多個節點可以併發呼叫該操作，它 **決定** 它們遞增計數器的順序。這種情況實際上只解決了兩個節點之間的共識，而其他情況適用於任意數量的節點。

如果你只有一個節點，或者願意把決策能力交給單個節點，所有這些都很簡單。這就是單主資料庫中發生的事情：所有決策權都授予主節點，這也是這類資料庫能夠提供線性一致操作、唯一性約束和複製日誌等能力的原因。

然而，如果這個單一主節點失效，或者網路中斷使其不可達，這樣的系統就無法繼續推進，直到人工完成手動故障切換。Raft 和 Paxos 等廣泛使用的共識演算法，本質上就是內建自動主節點選舉與故障切換的“單主複製”。

共識演算法經過精心設計，以確保在故障切換期間不會丟失任何已提交的寫入，並且系統不會進入腦裂狀態（多個節點接受寫入）。這要求每個寫入和每個線性一致的讀取都由節點的仲裁（通常是多數）確認。這可能是昂貴的，特別是跨地理區域，但如果你想要共識提供的強一致性和容錯性，這是不可避免的。

像 ZooKeeper 和 etcd 這樣的協調服務也是建立在共識演算法之上的。它們提供鎖、租約、故障檢測和變更通知功能，這些功能對於管理分散式應用程式的狀態很有用。如果你發現自己想要做那些可以歸約為共識的事情之一，並且你希望它是容錯的，建議使用協調服務。它不會保證你做對，但它可能會有所幫助。

共識演算法複雜而微妙，但其背後有自 1980 年代以來形成的豐富理論體系支援。正是這些理論，使我們能夠構建出能夠容忍 [第九章](/tw/ch9#ch_distributed) 所述故障、同時仍保證資料不被破壞的系統。這是分散式系統工程中的重要成就，本章末尾參考文獻展示了其中一些關鍵工作。

然而，共識並不總是正確的工具：在某些系統中，不需要它提供的強一致性屬性，使用較弱一致性來換取更高可用性和更好效能反而更合適。在這些場景下，通常會使用無主或多主複製，這也是我們之前在 [第六章](/tw/ch6#ch_replication) 討論過的內容。我們在本章討論的邏輯時鐘在那類場景中也很有幫助。

### 參考文獻

[^1]: Maurice P. Herlihy and Jeannette M. Wing. [Linearizability: A Correctness Condition for Concurrent Objects](https://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 12, issue 3, pages 463–492, July 1990. [doi:10.1145/78969.78972](https://doi.org/10.1145/78969.78972)
[^2]: Leslie Lamport. [On interprocess communication](https://www.microsoft.com/en-us/research/publication/interprocess-communication-part-basic-formalism-part-ii-algorithms/). *Distributed Computing*, volume 1, issue 2, pages 77–101, June 1986. [doi:10.1007/BF01786228](https://doi.org/10.1007/BF01786228)
[^3]: David K. Gifford. [Information Storage in a Decentralized Computer System](https://bitsavers.org/pdf/xerox/parc/techReports/CSL-81-8_Information_Storage_in_a_Decentralized_Computer_System.pdf). Xerox Palo Alto Research Centers, CSL-81-8, June 1981. Archived at [perma.cc/2XXP-3JPB](https://perma.cc/2XXP-3JPB)
[^4]: Martin Kleppmann. [Please Stop Calling Databases CP or AP](https://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html). *martin.kleppmann.com*, May 2015. Archived at [perma.cc/MJ5G-75GL](https://perma.cc/MJ5G-75GL)
[^5]: Kyle Kingsbury. [Call Me Maybe: MongoDB Stale Reads](https://aphyr.com/posts/322-call-me-maybe-mongodb-stale-reads). *aphyr.com*, April 2015. Archived at [perma.cc/DXB4-J4JC](https://perma.cc/DXB4-J4JC)
[^6]: Kyle Kingsbury. [Computational Techniques in Knossos](https://aphyr.com/posts/314-computational-techniques-in-knossos). *aphyr.com*, May 2014. Archived at [perma.cc/2X5M-EHTU](https://perma.cc/2X5M-EHTU)
[^7]: Kyle Kingsbury and Peter Alvaro. [Elle: Inferring Isolation Anomalies from Experimental Observations](https://www.vldb.org/pvldb/vol14/p268-alvaro.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 3, pages 268–280, November 2020. [doi:10.14778/3430915.3430918](https://doi.org/10.14778/3430915.3430918)
[^8]: Paolo Viotti and Marko Vukolić. [Consistency in Non-Transactional Distributed Storage Systems](https://arxiv.org/abs/1512.00168). *ACM Computing Surveys* (CSUR), volume 49, issue 1, article no. 19, June 2016. [doi:10.1145/2926965](https://doi.org/10.1145/2926965)
[^9]: Peter Bailis. [Linearizability Versus Serializability](http://www.bailis.org/blog/linearizability-versus-serializability/). *bailis.org*, September 2014. Archived at [perma.cc/386B-KAC3](https://perma.cc/386B-KAC3)
[^10]: Daniel Abadi. [Correctness Anomalies Under Serializable Isolation](https://dbmsmusings.blogspot.com/2019/06/correctness-anomalies-under.html). *dbmsmusings.blogspot.com*, June 2019. Archived at [perma.cc/JGS7-BZFY](https://perma.cc/JGS7-BZFY)
[^11]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). *Proceedings of the VLDB Endowment*, volume 7, issue 3, pages 181–192, November 2013. [doi:10.14778/2732232.2732237](https://doi.org/10.14778/2732232.2732237), extended version published as [arXiv:1302.0309](https://arxiv.org/abs/1302.0309)
[^12]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/).
[^13]: Andrei Matei. [CockroachDB’s consistency model](https://www.cockroachlabs.com/blog/consistency-model/). *cockroachlabs.com*, February 2021. Archived at [perma.cc/MR38-883B](https://perma.cc/MR38-883B)
[^14]: Murat Demirbas. [Strict-serializability, but at what cost, for what purpose?](https://muratbuffalo.blogspot.com/2022/08/strict-serializability-but-at-what-cost.html) *muratbuffalo.blogspot.com*, August 2022. Archived at [perma.cc/T8AY-N3U9](https://perma.cc/T8AY-N3U9)
[^15]: Ben Darnell. [How to talk about consistency and isolation in distributed DBs](https://www.cockroachlabs.com/blog/db-consistency-isolation-terminology/). *cockroachlabs.com*, February 2022. Archived at [perma.cc/53SV-JBGK](https://perma.cc/53SV-JBGK)
[^16]: Daniel Abadi. [An explanation of the difference between Isolation levels vs. Consistency levels](https://dbmsmusings.blogspot.com/2019/08/an-explanation-of-difference-between.html). *dbmsmusings.blogspot.com*, August 2019. Archived at [perma.cc/QSF2-CD4P](https://perma.cc/QSF2-CD4P)
[^17]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^18]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3
[^19]: Murali Vallath. [*Oracle 10g RAC Grid, Services & Clustering*](https://www.oreilly.com/library/view/oracle-10g-rac/9781555583217/). Elsevier Digital Press, 2006. ISBN: 978-1-555-58321-7
[^20]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509)
[^21]: Kyle Kingsbury. [Call Me Maybe: etcd and Consul](https://aphyr.com/posts/316-call-me-maybe-etcd-and-consul). *aphyr.com*, June 2014. Archived at [perma.cc/XL7U-378K](https://perma.cc/XL7U-378K)
[^22]: Flavio P. Junqueira, Benjamin C. Reed, and Marco Serafini. [Zab: High-Performance Broadcast for Primary-Backup Systems](https://marcoserafini.github.io/assets/pdf/zab.pdf). At *41st IEEE International Conference on Dependable Systems and Networks* (DSN), June 2011. [doi:10.1109/DSN.2011.5958223](https://doi.org/10.1109/DSN.2011.5958223)
[^23]: Diego Ongaro and John K. Ousterhout. [In Search of an Understandable Consensus Algorithm](https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf). At *USENIX Annual Technical Conference* (ATC), June 2014.
[^24]: Hagit Attiya, Amotz Bar-Noy, and Danny Dolev. [Sharing Memory Robustly in Message-Passing Systems](https://www.cs.huji.ac.il/course/2004/dist/p124-attiya.pdf). *Journal of the ACM*, volume 42, issue 1, pages 124–142, January 1995. [doi:10.1145/200836.200869](https://doi.org/10.1145/200836.200869)
[^25]: Nancy Lynch and Alex Shvartsman. [Robust Emulation of Shared Memory Using Dynamic Quorum-Acknowledged Broadcasts](https://groups.csail.mit.edu/tds/papers/Lynch/FTCS97.pdf). At *27th Annual International Symposium on Fault-Tolerant Computing* (FTCS), June 1997. [doi:10.1109/FTCS.1997.614100](https://doi.org/10.1109/FTCS.1997.614100)
[^26]: Christian Cachin, Rachid Guerraoui, and Luís Rodrigues. [*Introduction to Reliable and Secure Distributed Programming*](https://www.distributedprogramming.net/), 2nd edition. Springer, 2011. ISBN: 978-3-642-15259-7, [doi:10.1007/978-3-642-15260-3](https://doi.org/10.1007/978-3-642-15260-3)
[^27]: Niklas Ekström, Mikhail Panchenko, and Jonathan Ellis. [Possible Issue with Read Repair?](https://lists.apache.org/thread/wwsjnnc93mdlpw8nb0d5gn4q1bmpzbon) Email thread on *cassandra-dev* mailing list, October 2012.
[^28]: Maurice P. Herlihy. [Wait-Free Synchronization](https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 13, issue 1, pages 124–149, January 1991. [doi:10.1145/114005.102808](https://doi.org/10.1145/114005.102808)
[^29]: Armando Fox and Eric A. Brewer. [Harvest, Yield, and Scalable Tolerant Systems](https://radlab.cs.berkeley.edu/people/fox/static/pubs/pdf/c18.pdf). At *7th Workshop on Hot Topics in Operating Systems* (HotOS), March 1999. [doi:10.1109/HOTOS.1999.798396](https://doi.org/10.1109/HOTOS.1999.798396)
[^30]: Seth Gilbert and Nancy Lynch. [Brewer’s Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](https://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf). *ACM SIGACT News*, volume 33, issue 2, pages 51–59, June 2002. [doi:10.1145/564585.564601](https://doi.org/10.1145/564585.564601)
[^31]: Seth Gilbert and Nancy Lynch. [Perspectives on the CAP Theorem](https://groups.csail.mit.edu/tds/papers/Gilbert/Brewer2.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 30–36, February 2012. [doi:10.1109/MC.2011.389](https://doi.org/10.1109/MC.2011.389)
[^32]: Eric A. Brewer. [CAP Twelve Years Later: How the ‘Rules’ Have Changed](https://sites.cs.ucsb.edu/~rich/class/cs293-cloud/papers/brewer-cap.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 23–29, February 2012. [doi:10.1109/MC.2012.37](https://doi.org/10.1109/MC.2012.37)
[^33]: Susan B. Davidson, Hector Garcia-Molina, and Dale Skeen. [Consistency in Partitioned Networks](https://www.cs.rice.edu/~alc/old/comp520/papers/DGS85.pdf). *ACM Computing Surveys*, volume 17, issue 3, pages 341–370, September 1985. [doi:10.1145/5505.5508](https://doi.org/10.1145/5505.5508)
[^34]: Paul R. Johnson and Robert H. Thomas. [RFC 677: The Maintenance of Duplicate Databases](https://tools.ietf.org/html/rfc677). Network Working Group, January 1975.
[^35]: Michael J. Fischer and Alan Michael. [Sacrificing Serializability to Attain High Availability of Data in an Unreliable Network](https://sites.cs.ucsb.edu/~agrawal/spring2011/ugrad/p70-fischer.pdf). At *1st ACM Symposium on Principles of Database Systems* (PODS), March 1982. [doi:10.1145/588111.588124](https://doi.org/10.1145/588111.588124)
[^36]: Eric A. Brewer. [NoSQL: Past, Present, Future](https://www.infoq.com/presentations/NoSQL-History/). At *QCon San Francisco*, November 2012.
[^37]: Adrian Cockcroft. [Migrating to Microservices](https://www.infoq.com/presentations/migration-cloud-native/). At *QCon London*, March 2014.
[^38]: Martin Kleppmann. [A Critique of the CAP Theorem](https://arxiv.org/abs/1509.05393). arXiv:1509.05393, September 2015.
[^39]: Daniel Abadi. [Problems with CAP, and Yahoo’s little known NoSQL system](https://dbmsmusings.blogspot.com/2010/04/problems-with-cap-and-yahoos-little.html). *dbmsmusings.blogspot.com*, April 2010. Archived at [perma.cc/4NTZ-CLM9](https://perma.cc/4NTZ-CLM9)
[^40]: Daniel Abadi. [Hazelcast and the Mythical PA/EC System](https://dbmsmusings.blogspot.com/2017/10/hazelcast-and-mythical-paec-system.html). *dbmsmusings.blogspot.com*, October 2017. Archived at [perma.cc/J5XM-U5C2](https://perma.cc/J5XM-U5C2)
[^41]: Eric Brewer. [Spanner, TrueTime & The CAP Theorem](https://research.google.com/pubs/archive/45855.pdf). *research.google.com*, February 2017. Archived at [perma.cc/59UW-RH7N](https://perma.cc/59UW-RH7N)
[^42]: Daniel J. Abadi. [Consistency Tradeoffs in Modern Distributed Database System Design](https://www.cs.umd.edu/~abadi/papers/abadi-pacelc.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 37–42, February 2012. [doi:10.1109/MC.2012.33](https://doi.org/10.1109/MC.2012.33)
[^43]: Nancy A. Lynch. [A Hundred Impossibility Proofs for Distributed Computing](https://groups.csail.mit.edu/tds/papers/Lynch/podc89.pdf). At *8th ACM Symposium on Principles of Distributed Computing* (PODC), August 1989. [doi:10.1145/72981.72982](https://doi.org/10.1145/72981.72982)
[^44]: Prince Mahajan, Lorenzo Alvisi, and Mike Dahlin. [Consistency, Availability, and Convergence](https://apps.cs.utexas.edu/tech_reports/reports/tr/TR-2036.pdf). University of Texas at Austin, Department of Computer Science, Tech Report UTCS TR-11-22, May 2011. Archived at [perma.cc/SAV8-9JAJ](https://perma.cc/SAV8-9JAJ)
[^45]: Hagit Attiya, Faith Ellen, and Adam Morrison. [Limitations of Highly-Available Eventually-Consistent Data Stores](https://www.cs.tau.ac.il/~mad/publications/podc2015-replds.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), July 2015. [doi:10.1145/2767386.2767419](https://doi.org/10.1145/2767386.2767419)
[^46]: Peter Sewell, Susmit Sarkar, Scott Owens, Francesco Zappa Nardelli, and Magnus O. Myreen. [x86-TSO: A Rigorous and Usable Programmer’s Model for x86 Multiprocessors](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf). *Communications of the ACM*, volume 53, issue 7, pages 89–97, July 2010. [doi:10.1145/1785414.1785443](https://doi.org/10.1145/1785414.1785443)
[^47]: Martin Thompson. [Memory Barriers/Fences](https://mechanical-sympathy.blogspot.com/2011/07/memory-barriersfences.html). *mechanical-sympathy.blogspot.co.uk*, July 2011. Archived at [perma.cc/7NXM-GC5U](https://perma.cc/7NXM-GC5U)
[^48]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ)
[^49]: Hagit Attiya and Jennifer L. Welch. [Sequential Consistency Versus Linearizability](https://courses.csail.mit.edu/6.852/01/papers/p91-attiya.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 12, issue 2, pages 91–122, May 1994. [doi:10.1145/176575.176576](https://doi.org/10.1145/176575.176576)
[^50]: Kyzer R. Davis, Brad G. Peabody, and Paul J. Leach. [Universally Unique IDentifiers (UUIDs)](https://www.rfc-editor.org/rfc/rfc9562). RFC 9562, IETF, May 2024.
[^51]: Ryan King. [Announcing Snowflake](https://blog.x.com/engineering/en_us/a/2010/announcing-snowflake). *blog.x.com*, June 2010. Archived at [archive.org](https://web.archive.org/web/20241128214604/https%3A//blog.x.com/engineering/en_us/a/2010/announcing-snowflake)
[^52]: Alizain Feerasta. [Universally Unique Lexicographically Sortable Identifier](https://github.com/ulid/spec). *github.com*, 2016. Archived at [perma.cc/NV2Y-ZP8U](https://perma.cc/NV2Y-ZP8U)
[^53]: Rob Conery. [A Better ID Generator for PostgreSQL](https://bigmachine.io/2014/05/29/a-better-id-generator-for-postgresql/). *bigmachine.io*, May 2014. Archived at [perma.cc/K7QV-3KFC](https://perma.cc/K7QV-3KFC)
[^54]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563)
[^55]: Sandeep S. Kulkarni, Murat Demirbas, Deepak Madeppa, Bharadwaj Avva, and Marcelo Leone. [Logical Physical Clocks](https://cse.buffalo.edu/~demirbas/publications/hlc.pdf). *18th International Conference on Principles of Distributed Systems* (OPODIS), December 2014. [doi:10.1007/978-3-319-14472-6\_2](https://doi.org/10.1007/978-3-319-14472-6_2)
[^56]: Manuel Bravo, Nuno Diegues, Jingna Zeng, Paolo Romano, and Luís Rodrigues. [On the use of Clocks to Enforce Consistency in the Cloud](http://sites.computer.org/debull/A15mar/p18.pdf). *IEEE Data Engineering Bulletin*, volume 38, issue 1, pages 18–31, March 2015. Archived at [perma.cc/68ZU-45SH](https://perma.cc/68ZU-45SH)
[^57]: Daniel Peng and Frank Dabek. [Large-Scale Incremental Processing Using Distributed Transactions and Notifications](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Peng.pdf). At *9th USENIX Conference on Operating Systems Design and Implementation* (OSDI), October 2010.
[^58]: Tushar Deepak Chandra, Robert Griesemer, and Joshua Redstone. [Paxos Made Live – An Engineering Perspective](https://www.read.seas.harvard.edu/~kohler/class/08w-dsi/chandra07paxos.pdf). At *26th ACM Symposium on Principles of Distributed Computing* (PODC), June 2007. [doi:10.1145/1281100.1281103](https://doi.org/10.1145/1281100.1281103)
[^59]: Will Portnoy. [Lessons Learned from Implementing Paxos](https://blog.willportnoy.com/2012/06/lessons-learned-from-paxos.html). *blog.willportnoy.com*, June 2012. Archived at [perma.cc/QHD9-FDD2](https://perma.cc/QHD9-FDD2)
[^60]: Brian M. Oki and Barbara H. Liskov. [Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems](https://pmg.csail.mit.edu/papers/vr.pdf). At *7th ACM Symposium on Principles of Distributed Computing* (PODC), August 1988. [doi:10.1145/62546.62549](https://doi.org/10.1145/62546.62549)
[^61]: Barbara H. Liskov and James Cowling. [Viewstamped Replication Revisited](https://pmg.csail.mit.edu/papers/vr-revisited.pdf). Massachusetts Institute of Technology, Tech Report MIT-CSAIL-TR-2012-021, July 2012. Archived at [perma.cc/56SJ-WENQ](https://perma.cc/56SJ-WENQ)
[^62]: Leslie Lamport. [The Part-Time Parliament](https://www.microsoft.com/en-us/research/publication/part-time-parliament/). *ACM Transactions on Computer Systems*, volume 16, issue 2, pages 133–169, May 1998. [doi:10.1145/279227.279229](https://doi.org/10.1145/279227.279229)
[^63]: Leslie Lamport. [Paxos Made Simple](https://www.microsoft.com/en-us/research/publication/paxos-made-simple/). *ACM SIGACT News*, volume 32, issue 4, pages 51–58, December 2001. Archived at [perma.cc/82HP-MNKE](https://perma.cc/82HP-MNKE)
[^64]: Robbert van Renesse and Deniz Altinbuken. [Paxos Made Moderately Complex](https://people.cs.umass.edu/~arun/590CC/papers/paxos-moderately-complex.pdf). *ACM Computing Surveys* (CSUR), volume 47, issue 3, article no. 42, February 2015. [doi:10.1145/2673577](https://doi.org/10.1145/2673577)
[^65]: Diego Ongaro. [Consensus: Bridging Theory and Practice](https://github.com/ongardie/dissertation). PhD Thesis, Stanford University, August 2014. Archived at [perma.cc/5VTZ-2ADH](https://perma.cc/5VTZ-2ADH)
[^66]: Heidi Howard, Malte Schwarzkopf, Anil Madhavapeddy, and Jon Crowcroft. [Raft Refloated: Do We Have Consensus?](https://www.cl.cam.ac.uk/research/srg/netos/papers/2015-raftrefloated-osr.pdf) *ACM SIGOPS Operating Systems Review*, volume 49, issue 1, pages 12–21, January 2015. [doi:10.1145/2723872.2723876](https://doi.org/10.1145/2723872.2723876)
[^67]: André Medeiros. [ZooKeeper’s Atomic Broadcast Protocol: Theory and Practice](http://www.tcs.hut.fi/Studies/T-79.5001/reports/2012-deSouzaMedeiros.pdf). Aalto University School of Science, March 2012. Archived at [perma.cc/FVL4-JMVA](https://perma.cc/FVL4-JMVA)
[^68]: Robbert van Renesse, Nicolas Schiper, and Fred B. Schneider. [Vive La Différence: Paxos vs. Viewstamped Replication vs. Zab](https://arxiv.org/abs/1309.5671). *IEEE Transactions on Dependable and Secure Computing*, volume 12, issue 4, pages 472–484, September 2014. [doi:10.1109/TDSC.2014.2355848](https://doi.org/10.1109/TDSC.2014.2355848)
[^69]: Heidi Howard and Richard Mortier. [Paxos vs Raft: Have we reached consensus on distributed consensus?](https://arxiv.org/abs/2004.05074) At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393681](https://doi.org/10.1145/3380787.3393681)
[^70]: Miguel Castro and Barbara H. Liskov. [Practical Byzantine Fault Tolerance and Proactive Recovery](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/01/p398-castro-bft-tocs.pdf). *ACM Transactions on Computer Systems*, volume 20, issue 4, pages 396–461, November 2002. [doi:10.1145/571637.571640](https://doi.org/10.1145/571637.571640)
[^71]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458)
[^72]: Michael J. Fischer, Nancy Lynch, and Michael S. Paterson. [Impossibility of Distributed Consensus with One Faulty Process](https://groups.csail.mit.edu/tds/papers/Lynch/jacm85.pdf). *Journal of the ACM*, volume 32, issue 2, pages 374–382, April 1985. [doi:10.1145/3149.214121](https://doi.org/10.1145/3149.214121)
[^73]: Tushar Deepak Chandra and Sam Toueg. [Unreliable Failure Detectors for Reliable Distributed Systems](https://courses.csail.mit.edu/6.852/08/papers/CT96-JACM.pdf). *Journal of the ACM*, volume 43, issue 2, pages 225–267, March 1996. [doi:10.1145/226643.226647](https://doi.org/10.1145/226643.226647)
[^74]: Michael Ben-Or. [Another Advantage of Free Choice: Completely Asynchronous Agreement Protocols](https://homepage.cs.uiowa.edu/~ghosh/BenOr.pdf). At *2nd ACM Symposium on Principles of Distributed Computing* (PODC), August 1983. [doi:10.1145/800221.806707](https://doi.org/10.1145/800221.806707)
[^75]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283)
[^76]: Xavier Défago, André Schiper, and Péter Urbán. [Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey](https://dspace.jaist.ac.jp/dspace/bitstream/10119/4883/1/defago_et_al.pdf). *ACM Computing Surveys*, volume 36, issue 4, pages 372–421, December 2004. [doi:10.1145/1041680.1041682](https://doi.org/10.1145/1041680.1041682)
[^77]: Hagit Attiya and Jennifer Welch. *Distributed Computing: Fundamentals, Simulations and Advanced Topics*, 2nd edition. John Wiley & Sons, 2004. ISBN: 978-0-471-45324-6, [doi:10.1002/0471478210](https://doi.org/10.1002/0471478210)
[^78]: Rachid Guerraoui. [Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus](https://citeseerx.ist.psu.edu/pdf/5d06489503b6f791aa56d2d7942359c2592e44b0). At *9th International Workshop on Distributed Algorithms* (WDAG), September 1995. [doi:10.1007/BFb0022140](https://doi.org/10.1007/BFb0022140)
[^79]: Jim N. Gray and Leslie Lamport. [Consensus on Transaction Commit](https://dsf.berkeley.edu/cs286/papers/paxoscommit-tods2006.pdf). *ACM Transactions on Database Systems* (TODS), volume 31, issue 1, pages 133–160, March 2006. [doi:10.1145/1132863.1132867](https://doi.org/10.1145/1132863.1132867)
[^80]: Fred B. Schneider. [Implementing Fault-Tolerant Services Using the State Machine Approach: A Tutorial](https://www.cs.cornell.edu/fbs/publications/SMSurvey.pdf). *ACM Computing Surveys*, volume 22, issue 4, pages 299–319, December 1990. [doi:10.1145/98163.98167](https://doi.org/10.1145/98163.98167)
[^81]: Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, Kun Ren, Philip Shao, and Daniel J. Abadi. [Calvin: Fast Distributed Transactions for Partitioned Database Systems](https://cs.yale.edu/homes/thomson/publications/calvin-sigmod12.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2012. [doi:10.1145/2213836.2213838](https://doi.org/10.1145/2213836.2213838)
[^82]: Mahesh Balakrishnan, Dahlia Malkhi, Ted Wobber, Ming Wu, Vijayan Prabhakaran, Michael Wei, John D. Davis, Sriram Rao, Tao Zou, and Aviad Zuck. [Tango: Distributed Data Structures over a Shared Log](https://www.microsoft.com/en-us/research/publication/tango-distributed-data-structures-over-a-shared-log/). At *24th ACM Symposium on Operating Systems Principles* (SOSP), November 2013. [doi:10.1145/2517349.2522732](https://doi.org/10.1145/2517349.2522732)
[^83]: Mahesh Balakrishnan, Dahlia Malkhi, Vijayan Prabhakaran, Ted Wobber, Michael Wei, and John D. Davis. [CORFU: A Shared Log Design for Flash Clusters](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final30.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
[^84]: Vasilis Gavrielatos, Antonios Katsarakis, and Vijay Nagarajan. [Odyssey: the impact of modern hardware on strongly-consistent replication protocols](https://vasigavr1.github.io/files/Odyssey_Eurosys_2021.pdf). At *16th European Conference on Computer Systems* (EuroSys), April 2021. [doi:10.1145/3447786.3456240](https://doi.org/10.1145/3447786.3456240)
[^85]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/opus/volltexte/2017/7094/pdf/LIPIcs-OPODIS-2016-25.pdf). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25)
[^86]: Martin Kleppmann. [Distributed Systems lecture notes](https://www.cl.cam.ac.uk/teaching/2425/ConcDisSys/dist-sys-notes.pdf). *University of Cambridge*, October 2024. Archived at [perma.cc/SS3Q-FNS5](https://perma.cc/SS3Q-FNS5)
[^87]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch 1.5.0](https://aphyr.com/posts/323-call-me-maybe-elasticsearch-1-5-0). *aphyr.com*, April 2015. Archived at [perma.cc/37MZ-JT7H](https://perma.cc/37MZ-JT7H)
[^88]: Heidi Howard and Jon Crowcroft. [Coracle: Evaluating Consensus at the Internet Edge](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p85.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2829988.2790010](https://doi.org/10.1145/2829988.2790010)
[^89]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY)
[^90]: Ivan Kelly. [BookKeeper Tutorial](https://github.com/ivankelly/bookkeeper-tutorial). *github.com*, October 2014. Archived at [perma.cc/37Y6-VZWU](https://perma.cc/37Y6-VZWU)
[^91]: Jack Vanlightly. [Apache BookKeeper Insights Part 1 — External Consensus and Dynamic Membership](https://medium.com/splunk-maas/apache-bookkeeper-insights-part-1-external-consensus-and-dynamic-membership-c259f388da21). *medium.com*, November 2021. Archived at [perma.cc/3MDB-8GFB](https://perma.cc/3MDB-8GFB)

================================================
FILE: content/tw/ch11.md
================================================
---
title: "第十一章：批處理"
linkTitle: "11. 批處理"
weight: 311
breadcrumbs: false
---

<a id="ch_batch"></a>

![](/map/ch10.png)

> *帶有太強個人色彩的系統無法成功。當最初的設計完成並且相對穩健時，真正的考驗才剛開始：此後會有許多持不同觀點的人做出各自的實驗。*
>
> 高德納

到目前為止，本書大部分內容都圍繞著 *請求（request）* 與 *查詢（query）* 以及對應的 *響應（response）* 或 *結果（result）* 展開。現代很多資料系統都預設採用這種處理方式：你發出請求或指令，系統儘快給出答案。

網頁瀏覽器請求頁面、服務呼叫遠端 API、資料庫、快取、搜尋索引，以及很多其他系統都如此運作。我們稱這類系統為 *線上系統（online systems）*。它們通常以響應時間作為主要效能指標，並且往往需要良好的容錯能力來保證高可用。

但有時候，你需要執行的計算比一次互動式請求大得多，或者要處理的資料量遠超單次請求能承載的範圍。例如訓練 AI 模型、把海量資料從一種形式轉換成另一種形式、或者在超大資料集上做分析計算。我們把這類任務稱為 *批處理（batch processing）* 作業，有時也稱為 *離線系統（offline systems）*。

批處理作業讀取一批輸入資料（只讀），並生成一批輸出資料（每次執行都從頭生成）。它通常不會像讀寫事務那樣原地修改資料。因此，輸出是由輸入推導出的 *派生資料（derived data）*（見[“記錄系統與派生資料”](/tw/ch1#sec_introduction_derived)）：如果不滿意輸出，你可以直接刪除它，修改作業邏輯，再跑一遍即可。把輸入視為不可變並儘量避免副作用（例如直接寫外部資料庫），不僅有助於效能，也帶來其他好處：

- 如果你在程式碼中引入了 bug 導致輸出錯誤或損壞，可以直接回滾程式碼並重跑作業，輸出就會恢復正確。更簡單的做法是把舊輸出保留在另一個目錄，直接切回舊版本。多數物件儲存與開放表格式（見[“雲資料倉庫”](/tw/ch4#sec_cloud_data_warehouses)）都支援這種能力，通常稱為 *時間旅行（time travel）*。大多數支援讀寫事務的資料庫不具備這種特性：如果錯誤程式碼把壞資料寫進資料庫，僅回滾程式碼並不能修復已寫入的資料。能夠從錯誤程式碼中恢復的能力被稱為 *容忍人為失誤* [^1]。

- 因為回滾容易，功能開發能比“犯錯會造成不可逆損害”的環境更快推進。這個 *最小化不可逆性* 的原則對敏捷開發非常有益 [^2]。

- 同一組檔案可以作為多種作業的輸入，包括監控類作業：例如計算指標、驗證輸出是否符合預期（如與上一次結果比較並度量偏差）。

- 批處理框架能更高效地利用計算資源。雖然也可以用 OLTP 資料庫和應用伺服器等線上系統做批處理，但資源成本通常顯著更高。

批處理也有挑戰。多數框架中，作業只有在整體完成後，其輸出才能被下游進一步處理。批處理也可能低效：輸入哪怕只變動一個位元組，也可能需要重算整個輸入資料集。儘管如此，批處理在大量場景中依然非常有用，我們會在[“批處理用例”](#sec_batch_output)中回到這個話題。

批處理作業可能執行很久：幾分鐘、幾小時甚至幾天。很多作業是週期排程的（例如每天一次）。它的核心效能指標通常是吞吐量：單位時間能處理多少資料。有些批處理系統透過“中止並整體重啟”應對故障，也有些具備更細粒度容錯能力，可以在部分節點崩潰時仍讓作業完成。

> [!NOTE]
> 批處理的另一種替代形態是 *流處理（stream processing）*：作業不會在“處理完輸入後結束”，而是持續監聽輸入，並在變化發生後很快處理。我們將在[第十二章](/tw/ch12#ch_stream)討論流處理。

線上處理與批處理的邊界並不總是清晰：一個執行很久的資料庫查詢，看起來也很像批處理過程。但批處理有一些獨特特性，使其成為構建可靠、可伸縮、可維護應用的重要積木。例如，它常在 *資料整合（data integration）* 中發揮作用，即把多個資料系統組合起來完成單一系統做不到的事。ETL（見[“資料倉庫”](/tw/ch1#sec_introduction_dwh)）就是典型例子。

現代批處理深受 MapReduce 影響。Google 在 2004 年發表了這一批處理演算法 [^3]，隨後 Hadoop、CouchDB、MongoDB 等開源系統都實現了它。MapReduce 是相對底層的程式設計模型，其能力不如資料倉庫中的並行查詢執行引擎成熟 [^4] [^5]。它在誕生時確實讓商用硬體上的處理規模躍升一大步，但今天已大體過時，Google 內部也不再使用 [^6] [^7]。

如今批處理更常透過 Spark、Flink 或資料倉庫查詢引擎完成。它們與 MapReduce 一樣高度依賴分片（見[第七章](/tw/ch7#ch_sharding)）和並行執行，但快取與執行策略更成熟。隨著這些系統走向成熟，運維問題已大幅緩解，重點轉向可用性：資料流 API、查詢語言、DataFrame API 得到廣泛支援；任務與工作流編排也顯著進化。以 Hadoop 為中心的 Oozie、Azkaban 等排程器，正被 Airflow、Dagster、Prefect 這類更通用方案替代，它們可協調多種批處理框架與雲資料倉庫。

雲計算已無處不在。批處理儲存層也正在從 HDFS、GlusterFS、CephFS 這類分散式檔案系統（DFS）向 S3 等物件儲存遷移。BigQuery、Snowflake 這類可伸縮雲資料倉庫，正在模糊“資料倉庫”和“批處理系統”之間的邊界。

為了建立直覺，本章先從單機 Unix 工具示例出發，再擴充套件到分散式多機處理。你會看到，分散式批處理框架在很多方面很像作業系統：它也有排程器和檔案系統。隨後我們會討論編寫批處理作業的幾種處理模型，最後給出常見應用場景。

## 使用 Unix 工具的批處理 {#sec_batch_unix}

假設你有一臺 Web 伺服器，每處理一個請求就在日誌檔案末尾追加一行。例如，使用 nginx 預設訪問日誌格式，一行可能像這樣：

    216.58.210.78 - - [27/Jun/2025:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
    200 3377 "https://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
    10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

（實際上這是一行，這裡為了閱讀方便換了行。）這一行包含了很多資訊。要正確解釋它，你需要日誌格式定義：

    $remote_addr - $remote_user [$time_local] "$request"
    $status $body_bytes_sent "$http_referer" "$http_user_agent"

這表示：UTC 時間 2025 年 6 月 27 日 17:55:11，伺服器收到來自客戶端 IP `216.58.210.78` 對 `/css/typography.css` 的請求。使用者未認證，因此 `$remote_user` 是連字元（`-`）。響應狀態碼是 200（成功），響應體大小 3,377 位元組。瀏覽器是 Chrome 137，該檔案是從頁面 [*https://martin.kleppmann.com/*](https://martin.kleppmann.com/) 引用而來。

看起來“解析日誌”有點樸素，但它在現代科技公司裡是核心能力之一，從廣告流水線到支付處理都大量依賴。事實上，這也是 MapReduce 與“大資料”浪潮快速興起的重要推動力。

### 簡單日誌分析 {#sec_batch_log_analysis}

很多工具都能從日誌生成漂亮的網站流量報告。這裡為了練手，我們只用基礎 Unix 工具自己做一個。比如你想找出網站最受歡迎的五個頁面，可以在 shell 中這樣做：

```bash
cat /var/log/nginx/access.log | #1
  awk '{print $7}' | #2
  sort             | #3
  uniq -c          | #4
  sort -r -n       | #5
  head -n 5          #6
```

1. 讀取日誌檔案。（嚴格說這裡不需要 `cat`，可直接把檔案作為 `awk` 引數；但這樣寫更直觀看出線性管道。）
2. 以空白字元切分每行，只輸出第 7 個欄位，也就是請求 URL。上面的樣例中是 `/css/typography.css`。
3. 按字典序對 URL 排序。某個 URL 若出現 *n* 次，排序後會連續出現 *n* 行。
4. `uniq` 透過比較相鄰兩行是否相同來去重。`-c` 讓它輸出計數：每個不同 URL 出現了多少次。
5. 第二次 `sort` 按每行開頭的數字（`-n`）排序，並用 `-r` 逆序，出現次數最多的排在最前。
6. `head` 只保留前 5 行（`-n 5`），丟棄其餘。

輸出大致如下：

```
    4189 /favicon.ico
    3631 /2016/02/08/how-to-do-distributed-locking.html
    2124 /2020/11/18/distributed-systems-and-elliptic-curves.html
    1369 /
     915 /css/typography.css
```

如果你不熟悉 Unix 工具，這條命令看起來可能有點晦澀，但它威力很強。它能在幾秒內處理 GB 級日誌，而且修改分析邏輯也非常方便：例如要排除 CSS 檔案，可把 `awk` 引數改成 `'$7 !~ /\.css$/ {print $7}'`；若要統計訪問最多的客戶端 IP，把 `awk` 引數改成 `'{print $1}'` 即可。

本書篇幅有限，無法展開講 Unix 工具，但它們非常值得學。令人驚訝的是，僅靠 `awk`、`sed`、`grep`、`sort`、`uniq`、`xargs` 的組合，就能在幾分鐘內做出很多資料分析，並且效能相當好 [^8]。

### 命令鏈與自定義程式 {#sec_batch_custom_program}

你也可以不用 Unix 管道，而寫個小程式完成同樣的事。比如用 Python：

```python
from collections import defaultdict

counts = defaultdict(int) #1

with open('/var/log/nginx/access.log', 'r') as file:
    for line in file:
        url = line.split()[6] #2
        counts[url] += 1 #3

top5 = sorted(((count, url) for url, count in counts.items()), reverse=True)[:5] #4

for count, url in top5:  #5
    print(f"{count} {url}")
```

1. `counts` 是散列表，記錄每個 URL 出現次數，預設值為 0。
2. 每行按空白字元切分，取第 7 個欄位作為 URL（Python 陣列從 0 開始，所以索引是 6）。
3. 當前行對應 URL 的計數器加一。
4. 按計數降序排序，取前五項。
5. 列印前五項。

這個程式不如 Unix 管道簡潔，但可讀性也不錯，偏好取決於習慣。不過兩者除了語法差異，執行流程也很不一樣；在大檔案上執行時，這種差異會很明顯。

### 排序與記憶體聚合 {#id275}

Python 指令碼在記憶體裡維護了一個“URL -> 出現次數”的散列表。Unix 管道示例沒有這種散列表，而是透過排序把同一 URL 的多次出現排到一起。

哪種方法更好？取決於不同 URL 的數量。對多數中小網站而言，通常可以把所有不同 URL 及其計數器放進（比如）1GB 記憶體。這個作業的 *工作集（working set）*（需要隨機訪問的記憶體規模）只取決於不同 URL 的個數：即便一百萬條日誌都指向同一 URL，散列表也只存一個 URL 和一個計數器。工作集足夠小時，記憶體散列表很好用，筆記本都能跑。

但如果工作集大於可用記憶體，排序法就有優勢：它能高效使用磁碟。這與[“日誌結構儲存”](/tw/ch4#sec_storage_log_structured)中的原理一樣：先在記憶體對資料塊排序並寫成段檔案，再把多個有序段合併成更大的有序檔案。歸併排序的順序訪問模式對磁碟很友好（見[“SSD 上的順序寫與隨機寫”](/tw/ch4#sidebar_sequential)）。

GNU Coreutils（Linux）中的 `sort` 能自動把超記憶體資料溢寫到磁碟，並自動利用多核並行排序 [^9]。這意味著前面的 Unix 命令鏈可以自然擴充套件到大資料集而不耗盡記憶體，瓶頸通常變成磁碟讀取輸入檔案的速率。

Unix 工具的一個侷限是它們只在單機執行。當資料大到單機記憶體或本地磁碟都放不下時，就需要分散式批處理框架。

## 分散式系統中的批處理 {#sec_batch_distributed}

在前面的 Unix 示例中，單機有幾個協同元件在處理日誌：

- 透過作業系統檔案系統介面訪問的儲存裝置。
- 決定程序何時執行、如何分配 CPU 資源的排程器。
- 一串透過管道把 `stdin`/`stdout` 連線起來的 Unix 程式。

分散式批處理框架也有對應元件。某種意義上，你可以把分散式處理框架看成“分散式作業系統”：它有檔案系統、有任務排程器，還有透過檔案系統或其他通道互相傳遞資料的程式。

### 分散式檔案系統 {#sec_batch_dfs}

作業系統提供的檔案系統由多層組成：

- 最底層是塊裝置驅動，直接與磁碟互動，向上層提供原始塊讀寫。
- 塊層之上是頁快取，快取最近訪問塊以提升讀取速度。
- 塊 API 之上是檔案系統層，負責把大檔案切塊，並維護 inode、目錄、檔案等元資料。Linux 常見實現如 ext4、XFS。
- 最上層，作業系統透過統一 API（虛擬檔案系統，VFS）向應用暴露不同檔案系統，讓應用以統一方式讀寫底層不同實現。

分散式檔案系統（DFS）工作方式很類似：檔案被切成塊並分散到多臺機器。DFS 的塊通常比本地檔案系統大得多：HDFS 預設 128MB，JuiceFS 和許多物件儲存常用 4MB，而 ext4 預設塊通常是 4096 位元組。塊越大，需要維護的元資料越少，這對 PB 級資料非常關鍵；同時尋道開銷佔比也更低。

大多數物理儲存裝置不能做“部分塊寫入”，即使資料不足一個塊也得寫滿塊。DFS 的塊更大且通常構建在作業系統檔案系統之上，因此一般沒有這個約束。比如一個 900MB 檔案在 128MB 分塊下，會有 7 個 128MB 塊和 1 個 4MB 塊。

讀取 DFS 塊需要透過網路請求到持有該塊的叢集節點。每臺機器都執行守護程序，對外提供 API，使遠端程序能把本地檔案系統中的塊當作檔案讀寫。HDFS 把這些守護程序叫 DataNode，GlusterFS 叫 glusterfsd。後文統稱 *資料節點（data node）*。

DFS 也實現了“分散式版本”的頁快取。因為 DFS 塊作為檔案存放在資料節點本地，讀寫會經過資料節點作業系統，自帶記憶體頁快取，熱門塊會被快取在記憶體中。某些 DFS 還提供更多快取層，例如 JuiceFS 的客戶端快取和本地磁碟快取。

像 ext4/XFS 這樣的檔案系統會維護空閒空間、塊位置、目錄結構、許可權等元資料。DFS 同樣需要記錄“檔案塊分佈在哪些機器”“許可權如何”等資訊。Hadoop 使用 NameNode 維護叢集元資料；DeepSeek 的 3FS 使用元資料服務並把元資料持久化到 FoundationDB 之類鍵值儲存。

在檔案系統之上是 VFS。批處理系統裡最接近它的是 DFS 協議：批處理框架需要透過協議/介面來讀寫儲存。只要實現協議，就能作為可插拔儲存接入。例如 S3 API 已被 MinIO、Cloudflare R2、Tigris、Backblaze B2 等大量系統相容支援。具備 S3 支援的批處理系統通常可直接使用這些儲存。

有些 DFS 還提供 POSIX 相容檔案系統，讓作業系統 VFS 把它當普通檔案系統。常見整合方式是 FUSE 或 NFS 協議。NFS 可能是最知名的分散式檔案系統協議，最初用於讓多個客戶端讀寫單個伺服器上的資料。後來 AWS EFS、Archil 等提供了更可伸縮的 NFS 相容實現。NFS 客戶端雖仍連到一個端點，但底層會與分散式元資料服務和資料節點互動完成讀寫。

> [!TIP] 分散式檔案系統與網路儲存
> 分散式檔案系統基於 *無共享（shared-nothing）* 原則（見[“共享記憶體、共享磁碟與無共享架構”](/tw/ch2#sec_introduction_shared_nothing)），與 NAS（網路附加儲存）和 SAN（儲存區域網路）等 *共享磁碟* 方案形成對照。共享磁碟通常依賴集中式儲存裝置、定製硬體和專用網路（如光纖通道）；無共享方案不要求專用硬體，只需普通資料中心網路互聯的機器。

很多 DFS 構建在商用硬體上，成本更低但故障率高於企業級專用硬體。為容忍機器和磁碟故障，檔案塊通常複製到多臺機器。這也讓排程器更容易均衡負載：任務可在任一持有輸入副本的節點執行。複製可以是多副本（見[第六章](/tw/ch6#ch_replication)），也可以是 Reed-Solomon 等 *糾刪碼* 方案，以更低儲存開銷恢復丟失資料 [^10] [^11] [^12]。這與 RAID 思想類似，只是 RAID 面向同一機器上的多塊磁碟，而 DFS 是透過普通資料中心網路跨機器做訪問和複製。

### 物件儲存 {#id277}

Amazon S3、Google Cloud Storage、Azure Blob Storage、OpenStack Swift 等物件儲存，已成為批處理場景中對 DFS 的主流替代。實際上兩者邊界越來越模糊：正如前一節和[“由物件儲存支撐的資料庫”](/tw/ch6#sec_replication_object_storage)所述，FUSE 可以把 S3 這類物件儲存“掛載成檔案系統”；JuiceFS、Ceph 等系統也同時提供物件 API 與檔案系統 API。但這些介面、效能、以及一致性保證差異很大，即便 API 看似相容，也需要仔細驗證行為是否符合預期。

物件儲存中的每個物件有一個 URL，例如 `s3://my-photo-bucket/2025/04/01/birthday.png`。其中主機部分（`my-photo-bucket`）是 bucket 名，後半部分是物件 *鍵（key）*（示例裡是 `/2025/04/01/birthday.png`）。bucket 名全域性唯一；物件鍵在 bucket 內必須唯一。

物件讀取用 `get`，寫入用 `put`。與檔案系統檔案不同，物件寫入後通常不可變；更新物件需要透過 `put` 全量重寫，類似鍵值儲存。Azure Blob Storage 和 S3 Express One Zone 支援追加，但多數物件儲存不支援。它也沒有 `fopen`、`fseek` 這類檔案控制代碼 API。

物件看起來像按目錄組織，這很容易讓人誤解：物件儲存並沒有真正目錄概念。所謂路徑只是約定，斜槓也是 key 的一部分。這個約定允許你按字首列出物件，類似“目錄列表”，但與檔案系統目錄列舉有兩點不同：

- 字首 `list` 行為更像 Unix 的遞迴 `ls -R`：會返回所有以該字首開頭的物件，包括“子路徑”下的物件。
- 不存在“空目錄”。如果你刪除了 `s3://my-photo-bucket/2025/04/01` 下所有物件，再列 `s3://my-photo-bucket/2025/04` 時就看不到 `01`。常見做法是建立 0 位元組物件表示空目錄（如建立空物件 `s3://my-photo-bucket/2025/04/01` 以保留目錄佔位）。

DFS 常支援硬連結、符號連結、檔案鎖、原子重新命名等檔案系統操作，而物件儲存通常缺失這些能力：連結和鎖大多不支援；重新命名也非原子，通常是“複製到新 key，再刪除舊 key”。若要“重新命名目錄”，因為目錄名是 key 的一部分，實際上要逐個物件重新命名。

[第四章](/tw/ch4#ch_storage)討論的鍵值儲存通常面向小值（通常 KB 級）和高頻低延遲讀寫。相比之下，DFS 和物件儲存通常最佳化的是大物件（MB 到 GB）和低頻大塊讀寫。不過近年物件儲存也在增強小物件高頻訪問能力，例如 S3 Express One Zone 已提供單毫秒級延遲，計費模型也更接近鍵值儲存。

DFS 與物件儲存另一個區別是：HDFS 等 DFS 可把計算任務排程到持有檔案副本的機器上，讓任務本地讀檔案，減少網路傳輸（當任務程式碼遠小於待讀檔案時尤其划算）。物件儲存通常把儲存和計算解耦，雖然可能用更多頻寬，但現代資料中心網路很快，通常可接受。同時這種解耦讓 CPU/記憶體與儲存容量可以獨立擴充套件。

### 分散式作業編排 {#id278}

前面的“作業系統類比”同樣適用於作業編排。在單機上跑 Unix 批處理任務時，總得有東西真正去執行 `awk`、`sort`、`uniq`、`head` 程序；需要把一個程序輸出送到另一個程序輸入；要給每個程序分配記憶體；公平排程 CPU 指令；隔離記憶體與 I/O 邊界，等等。單機裡這由作業系統核心負責；分散式環境裡，這就是作業編排器（orchestrator）的職責。

批處理框架會向編排器的排程器發起“執行作業”請求。請求通常包含如下元資料：

- 需要執行的任務數量；
- 每個任務所需記憶體、CPU、磁碟；
- 作業識別符號；
- 訪問憑據；
- 輸入輸出等作業引數；
- 所需硬體資訊（如 GPU、磁碟型別）；
- 作業可執行程式碼的位置。

Kubernetes、Hadoop YARN（Yet Another Resource Negotiator）[^13] 等編排器會結合這些請求與叢集狀態，依靠以下元件執行任務：

任務執行器（Task executors）

:   每個節點上執行執行器守護程序，例如 YARN 的 *NodeManager* 或 Kubernetes 的 *kubelet*。執行器負責拉起任務、透過心跳上報存活狀態、跟蹤節點上的任務狀態與資源佔用。收到“啟動任務”請求後，執行器會獲取作業程式碼並執行啟動命令；隨後持續監控程序直至結束或失敗，並更新對應狀態元資料。

    很多執行器還配合作業系統實現安全與效能隔離，例如 YARN 和 Kubernetes 都會使用 Linux *cgroups*。這樣可防止任務越權訪問資料，或因資源濫用影響同機其他任務。

資源管理器（Resource Manager）

:   資源管理器維護各節點元資料：可用硬體（CPU、GPU、記憶體、磁碟等）、任務狀態、網路位置、節點健康狀態等，從而形成全域性檢視。其中心化特性可能成為可用性和可伸縮性瓶頸。YARN 藉助 ZooKeeper，Kubernetes 藉助 etcd 儲存叢集狀態（見[“協調服務”](/tw/ch10#sec_consistency_coordination)）。

排程器（Scheduler）

:   編排器通常包含中心化排程子系統，接收啟動/停止作業與狀態查詢請求。例如收到“啟動 10 個任務，使用指定 Docker 映象，且必須執行在某類 GPU 節點上”的請求後，排程器會基於請求和資源管理器狀態決定“哪些任務跑在哪些節點”，再通知執行器執行。

不同編排器命名各異，但幾乎都具備這些核心元件。

> [!NOTE]
> 有些排程決策需要“應用特定排程器”參與，才能考慮更具體的業務約束，例如當查詢量達到閾值時自動擴容只讀副本。中心排程器與應用排程器協同決定如何執行任務。YARN 把這類子排程器稱為 *ApplicationMaster*，Kubernetes 通常稱為 *operator*。

#### 資源分配 {#id279}

排程器在編排系統中最具挑戰的職責之一，就是在資源有限且作業需求衝突時，做出合理分配。它本質上是在公平與效率之間做平衡。

假設一個小叢集有 5 個節點，共 160 個 CPU 核。排程器收到兩個作業請求，每個都想要 100 核。怎麼排最好？

- 可以給每個作業先分 80 個任務，剩餘 20 個等前面的任務結束後再啟動。
- 也可以先跑完其中一個作業，再等 100 核都空出來後跑另一個。這叫 *gang scheduling*（成組排程）。
- 如果一個請求先到，排程器還要決定是立即把 100 核都給它，還是為未來請求預留一部分資源。

這是很簡化的例子，但已經能看到艱難權衡。以成組排程為例，如果排程器為了湊齊 100 核而長期預留資源，節點會閒置，資源利用率下降，若其他作業也在搶佔式預留，還可能死鎖。

反過來，如果只是被動等 100 核“自然可用”，中間可能被別的作業拿走，導致長時間湊不齊，從而產生 *飢餓（starvation）*。排程器也可以 *搶佔（preempt）* 一部分先到作業任務，把它們殺掉給後到作業騰資源；但被殺任務之後還要重跑，整體效率同樣下降。

把這個問題放大到數百甚至數百萬個請求，想求全域性最優幾乎不可行。事實上這是 *NP-hard* 問題：除了很小規模，很難在可接受時間內算出最優解 [^14] [^15]。

因此工程上排程器通常採用啟發式方法，在非最優前提下做“足夠好”的決策。常見演算法包括 FIFO、主導資源公平（DRF）、優先順序佇列、容量/配額排程、各種裝箱演算法等。細節超出本書範圍，但這是非常有趣的研究領域。

#### 工作流排程 {#sec_batch_workflows}

本章開頭的 Unix 示例是多個命令串聯。分散式批處理中同樣常見：一個作業輸出要成為一個或多個後續作業輸入，而每個作業又可能依賴多個上游輸入。這個依賴結構稱為 *工作流（workflow）* 或 *有向無環圖（DAG）*。

> [!NOTE]
> 我們在[“持久化執行與工作流”](/tw/ch5#sec_encoding_dataflow_workflows)中討論過“按步驟執行 RPC”的工作流引擎；在批處理語境裡，“工作流”指的是一串批處理過程：每一步讀輸入、產輸出，通常不直接對外做 RPC。持久化執行引擎通常單次請求處理的資料量小於批處理系統，但兩者邊界並非絕對。

需要多作業工作流常見有以下原因：

- 一個作業輸出可能被多個團隊維護的下游作業消費。此時先把輸出寫到公共位置更合理，下游可按“資料更新觸發”或定時方式執行。
- 你可能要在多個處理工具間傳遞資料。比如 Spark 作業寫 HDFS，再由 Python 觸發 Trino SQL 查詢（見[“雲資料倉庫”](/tw/ch4#sec_cloud_data_warehouses)）繼續處理並寫入 S3。
- 有些流水線內部天然需要多階段。例如第一階段按某鍵分片，下一階段按另一鍵分片，那麼第一階段需要先產出符合第二階段要求的資料佈局。

在 Unix 裡，管道用很小的記憶體緩衝連線前後命令，不落盤。若緩衝區滿，上游必須等待下游消費，這是一種 *背壓（backpressure）*。Spark、Flink 等批處理執行引擎也支援類似模式：一個任務輸出直接傳給下一任務（跨機時經網路傳輸）。

但在工作流中，更常見仍是“上游作業寫 DFS/物件儲存，下游再讀”，這樣可讓作業在時間上解耦。若一個作業有多個輸入，工作流排程器通常會等待所有上游輸入生產成功後再啟動它。

YARN ResourceManager 或 Spark 內建排程器主要做“作業內排程”，不負責整條工作流。為管理跨作業依賴，出現了 Airflow、Dagster、Prefect 等工作流排程器。它們在維護大量批作業時非常關鍵：包含 50~100 個作業的工作流並不罕見；大型組織內很多團隊會跨系統互相消費輸出。沒有工具支撐，很難管理這種複雜資料流。

#### 故障處理 {#id281}

批處理作業往往執行時間長。長時間執行且並行任務多的作業，在執行過程中遇到至少一次任務失敗幾乎是常態。正如[“硬體與軟體故障”](/tw/ch2#sec_introduction_hardware_faults)和[“不可靠網路”](/tw/ch9#sec_distributed_networks)所述，原因可能是硬體故障（商用硬體尤甚）、網路中斷等。

任務無法完成的另一原因是被排程器主動搶佔（kill）。當系統有多優先順序佇列時，這很常見：低優先順序任務便宜、高優先順序任務昂貴。低優先順序任務可用空閒算力跑，但高優先順序任務一到就可能把它們搶佔掉。雲廠商的對應產品名分別是：AWS 的 *spot instances*、Azure 的 *spot virtual machines*、GCP 的 *preemptible instances* [^16]。

批處理很多時候對即時性要求不高，因此很適合利用低優先順序資源/搶佔式例項降成本：本質上它在“吃”否則會閒置的算力，提高叢集利用率。但代價是更高的被殺機率：實際裡搶佔往往比硬體故障更常見 [^17]。

由於批處理每次都從頭生成輸出，任務失敗比線上系統更容易處理：刪掉失敗任務的部分輸出，把任務重新排程到別的機器重跑即可。若只因一個任務失敗就重跑整個作業會非常浪費，因此 MapReduce 及其後繼系統都儘量讓並行任務彼此獨立，從而把重試粒度降到單個任務 [^3]。

當一個任務輸出成為另一任務輸入（即在工作流內傳遞）時，容錯更複雜。MapReduce 的做法是：中間資料總是寫回 DFS，且只有寫入任務成功後才允許下游讀取。這個方案在頻繁搶佔環境中也能工作，但會帶來大量 DFS 寫入，效率不高。

Spark 更傾向把中間資料放記憶體或溢寫本地磁碟，只把最終結果寫 DFS；它還記錄中間資料的計算血緣，丟失時可重算 [^18]。Flink 則採用定期檢查點快照機制 [^19]。我們會在[“資料流引擎”](#sec_batch_dataflow)繼續討論。

## 批處理模型 {#id431}

前面我們討論了分散式環境中批作業如何排程。現在轉向“批處理框架如何處理資料”。最常見的兩類模型是 MapReduce 與資料流引擎。儘管實踐中資料流引擎已大面積替代 MapReduce，但理解 MapReduce 仍然重要，因為它深刻影響了現代批處理框架。

MapReduce 與資料流引擎都發展出多種程式設計介面：低層 API、關係查詢語言、DataFrame API。它們讓應用工程師、資料分析工程師、業務分析師乃至非技術人員都能參與資料處理。我們將在[“批處理用例”](#sec_batch_output)中討論這些用途。

### MapReduce {#sec_batch_mapreduce}

MapReduce 的處理模式與[“簡單日誌分析”](#sec_batch_log_analysis)幾乎同構：

1. 讀取輸入檔案並切分為 *記錄（records）*。在日誌例子裡，每條記錄就是一行（`\n` 為記錄分隔符）。在 Hadoop MapReduce 中，輸入通常存放在 HDFS 這類分散式檔案系統，或 S3 等物件儲存中，檔案格式可能是 Parquet（列式，見[“面向列儲存”](/tw/ch4#sec_storage_column)）或 Avro（行式，見[“Avro”](/tw/ch5#sec_encoding_avro)）。
2. 呼叫 mapper，從每條輸入記錄中提取鍵和值。Unix 示例中 mapper 相當於 `awk '{print $7}'`：URL（`$7`）是鍵，值可留空。
3. 按鍵排序所有鍵值對。日誌示例中這一步對應第一次 `sort`。
4. 呼叫 reducer 遍歷排序後的鍵值對。同鍵記錄會相鄰，因此可以在很小記憶體狀態下合併。Unix 示例中 reducer 等價於 `uniq -c`，統計相鄰同鍵記錄數。

這四步就是一個 MapReduce 作業。第 2 步（map）與第 4 步（reduce）是你寫業務邏輯的地方；第 1 步（檔案切記錄）由輸入格式解析器完成；第 3 步排序在 MapReduce 中是隱式內建的，你無需手寫。這一步是批處理的基礎演算法，我們會在[“混洗資料”](#sec_shuffle)再討論。

要建立 MapReduce 作業，你需實現兩個回撥：mapper 與 reducer，其行為如下。

Mapper

:   對每條輸入記錄呼叫一次。它從輸入記錄中提取鍵和值，並可為每條輸入產生任意數量鍵值對（包括 0 條）。它不保留跨記錄狀態，每條記錄獨立處理。

Reducer

:   框架收集 mapper 產生的鍵值對，把同鍵值集合交給 reducer（以迭代器形式）。reducer 可輸出結果記錄（如同一 URL 的出現次數）。

在日誌示例裡，第 5 步還有一次 `sort` 用於按請求次數排名 URL。MapReduce 若要第二輪排序，通常要再寫一個作業：前一個輸出作為後一個輸入。換個角度看，mapper 的作用是把資料整理成適合排序的形態；reducer 的作用是處理已排序資料。

> [!TIP] MapReduce 與函數語言程式設計
> MapReduce 雖用於批處理，但其程式設計模型來自函數語言程式設計。Lisp 把 *map* 與 *reduce/fold* 作為列表上的高階函式引入，後來進入 Python、Rust、Java 等主流語言。包括 SQL 在內的大量資料處理操作都可在 MapReduce 之上表達。Map 和 reduce 以及函數語言程式設計的一些特性恰好契合 MapReduce：可組合、天然適合資料處理鏈；map 還是典型“令人尷尬地並行”（每條輸入獨立處理）；reduce 則可按不同鍵並行。

但用原始 MapReduce API 寫複雜處理其實很費力，例如各種連線演算法都要自己實現 [^20]。MapReduce 相比現代批處理引擎也偏慢，一個重要原因是其“以檔案為中心”的 I/O 讓作業流水化困難：上游不結束，下游很難提前處理輸出。

### 資料流引擎 {#sec_batch_dataflow}

為解決 MapReduce 的侷限，出現了多種分散式批處理執行引擎，最著名的是 Spark [^18] [^21] 和 Flink [^19]。它們設計細節各異，但有一個共同點：把整條工作流當成一個作業處理，而不是拆成互相獨立的小作業。

因為它們顯式建模了跨多個處理階段的資料流動，所以稱為 *資料流引擎（dataflow engines）*。與 MapReduce 一樣，它們提供低層 API（反覆呼叫使用者函式逐條處理記錄），也提供更高層運算元（如 *join*、*group by*）。它們透過分片並行輸入，並透過網路把一個任務輸出傳給另一個任務輸入。與 MapReduce 不同，運算元不必嚴格在 map/reduce 兩類角色間交替，而可以更靈活組合。

這些 API 通常以關係風格構件表達計算：按欄位值連線資料集、按鍵分組、按條件過濾、按計數或求和等函式聚合。內部實現依賴的正是下一節要講的混洗演算法。

這種處理引擎風格可追溯到 Dryad [^22]、Nephele [^23] 等研究系統。相比 MapReduce，它有幾個優勢：

- 像排序這類昂貴操作只在“確實需要”的地方執行，而不是每個 map 與 reduce 階段之間都預設做。
- 連續多個不改變分片方式的運算元（如 map/filter）可融合成一個任務，減少資料複製開銷。
- 由於工作流裡的連線與資料依賴都顯式宣告，排程器能全域性最佳化資料區域性。比如把“消費某資料”的任務放到“生產該資料”的同機上，用共享記憶體緩衝交換，而非走網路複製。
- 運算元間中間狀態通常放記憶體或本地磁碟即可，比寫 DFS/物件儲存 I/O 更低（後者要多副本並落到多機磁碟）。MapReduce 僅對 mapper 輸出做了這類最佳化，資料流引擎把它推廣到所有中間狀態。
- 輸入一就緒就能啟動下游運算元，無需等待整個上游階段全部完成。
- 可複用已有程序執行新運算元，減少啟動開銷；MapReduce 往往為每個任務起一個新 JVM。

因此，資料流引擎能實現與 MapReduce 工作流同樣的計算，但通常速度明顯更快。

### 混洗資料 {#sec_shuffle}

本章開頭的 Unix 工具示例和 MapReduce 都建立在排序之上。批處理系統要能排序 PB 級資料，單機放不下，因此必須使用“輸入與輸出都分片”的分散式排序演算法，這就是 *混洗（shuffle）*。

> [!NOTE] 混洗不是隨機
> “shuffle” 容易引發誤解。洗牌會得到隨機順序；而這裡的 shuffle 產出的是排序後的確定順序，不含隨機性。

混洗是批處理系統的基礎演算法，連線與聚合都依賴它。MapReduce、Spark、Flink、Daft、Dataflow、BigQuery [^24] 都實現了高可伸縮且高效能的混洗機制以處理大資料集。這裡用 Hadoop MapReduce 的混洗實現做說明 [^25]，但核心思想在其他系統同樣適用。

[圖 11-1](#fig_batch_mapreduce) 展示了一個 MapReduce 作業的資料流。假設輸入已分片，標記為 *m1*、*m2*、*m3*。例如每個分片可以是 HDFS 中一個檔案，或物件儲存中的一個物件；同一資料集的所有分片可以放在同一 HDFS 目錄，或使用同一物件字首。

{{< figure src="/fig/ddia_1101.png" id="fig_batch_mapreduce" caption="圖 11-1. 一個包含三個 mapper 和三個 reducer 的 MapReduce 作業。" class="w-full my-4" >}}

框架會為每個輸入分片啟動一個 map 任務。任務讀取分配到的檔案，並逐條記錄呼叫 mapper 回撥。reduce 側也會分片。map 任務數由輸入分片數決定；reduce 任務數由作業作者配置（可與 map 數不同）。

mapper 輸出是鍵值對。框架需要保證：若不同 mapper 輸出了同一個鍵，這些鍵值對最終必須由同一個 reducer 處理。為此，每個 mapper 會在本地磁碟為每個 reducer 維護一個輸出檔案（例如[圖 11-1](#fig_batch_mapreduce)中的 *m1,r2*：由 mapper1 生成，目標是 reducer2）。mapper 每輸出一條鍵值對，通常會按鍵的雜湊決定寫入哪個 reducer 檔案（類似[“按鍵雜湊分片”](/tw/ch7#sec_sharding_hash)）。

mapper 寫這些檔案的同時，也會在每個檔案內部按鍵排序。可用的正是[“日誌結構儲存”](/tw/ch4#sec_storage_log_structured)中的技術：先在記憶體有序結構裡積累一批鍵值對，寫成有序段檔案，再把小段逐步合併成大段。

每個 mapper 完成後，reducer 會連線到 mapper，把屬於自己的有序檔案複製到本地磁碟。reducer 拿到所有 mapper 的對應分片後，再用歸併排序方式合併它們並保持有序。同鍵記錄即便來自不同 mapper，也會在合併後相鄰。隨後 reducer 以“每個鍵一次呼叫”的方式執行，每次拿到一個迭代器，遍歷該鍵所有值。

reducer 輸出記錄會順序寫入檔案，每個 reduce 任務一個檔案。[圖 11-1](#fig_batch_mapreduce)中的 *r1*、*r2*、*r3* 就是輸出資料集的分片，最終寫回 DFS 或物件儲存。

MapReduce 在 map 與 reduce 之間執行混洗；現代資料流引擎和雲資料倉庫則更複雜。BigQuery 等系統已最佳化混洗，使資料儘量留在記憶體，並寫入外部排序服務 [^24]，以提升速度並透過複製增強韌性。

#### JOIN 與 GROUP BY {#sec_batch_join}

下面看“有序資料”如何簡化分散式連線與聚合。為便於說明仍以 MapReduce 為例，但概念適用於大多數批處理系統。

批處理裡常見連線場景見[圖 11-2](#fig_batch_join_example)。左邊是使用者活動日誌（*activity events* 或 *clickstream data*），右邊是使用者資料庫。它可以看作星型模型的一部分（見[“星型與雪花型：分析模式”](/tw/ch3#sec_datamodels_analytics)）：活動日誌是事實表，使用者庫是維度表之一。

{{< figure src="/fig/ddia_1102.png" id="fig_batch_join_example" caption="圖 11-2. 使用者活動日誌與使用者畫像資料庫的連線。" class="w-full my-4" >}}

如果你要做“結合使用者庫資訊的活動分析”（例如利用使用者出生日期欄位，判斷哪些頁面更受年輕或年長使用者歡迎），就需要連線這兩張表。若兩邊都大到必須分片，怎麼做？

可利用 MapReduce 的關鍵特性：混洗會把同鍵鍵值對匯聚到同一個 reducer，無論它們最初在哪個分片。這裡使用者 ID 就可以作為鍵。因此可寫一個 mapper 掃活動日誌，輸出“按使用者 ID 鍵控的頁面訪問 URL”（見[圖 11-3](#fig_batch_join_reduce)）；再寫一個 mapper 按行掃描使用者表，提取“使用者 ID 作為鍵、出生日期作為值”。

{{< figure src="/fig/ddia_1103.png" id="fig_batch_join_reduce" caption="圖 11-3. 基於使用者 ID 的排序合併連線。若輸入資料集由多個檔案分片組成，可並行啟動多個 mapper 處理。" class="w-full my-4" >}}

混洗保證 reducer 能同時拿到某使用者的出生日期和該使用者全部頁面訪問事件。MapReduce 甚至可以把記錄進一步排成 reducer 先看到使用者記錄、再按時間戳看到活動事件，這稱為 *二次排序（secondary sort）* [^25]。

於是 reducer 很容易實現連線邏輯：先拿到出生日期並存入區域性變數，再遍歷同一使用者 ID 的活動事件，輸出“被訪問 URL + 訪問者出生日期”。因為 reducer 一次處理一個使用者的全部記錄，所以記憶體裡只要保留一條使用者記錄，也無需發任何網路請求。這個演算法稱為 *排序合併連線（sort-merge join）*：mapper 輸出先按鍵排序，reducer 再把連線兩側有序記錄合併。

工作流中的下一個 MapReduce 作業就可以繼續計算“每個 URL 的訪問者年齡分佈”：先按 URL 做一次混洗，再在 reducer 中遍歷同 URL 的所有訪問記錄（含出生日期），按年齡段維護計數並逐條累加，從而實現 *group by* 與聚合。

### 查詢語言 {#sec_batch_query_lanauges}

這些年分散式批處理執行引擎不斷成熟。如今在上萬臺機器的叢集上儲存並處理數 PB 資料，基礎設施已足夠穩健。隨著“如何在這規模下把系統跑起來”基本被解決，重點開始轉向程式設計模型的可用性。

MapReduce、資料流引擎、雲資料倉庫都把 SQL 作為批處理“通用語”。這很自然：傳統資料倉庫本就用 SQL，資料分析/ETL 工具都支援 SQL，幾乎所有開發者和分析師也都熟悉 SQL。

相比手寫 MapReduce，查詢語言介面不僅程式碼更少，還支援互動式使用：可在終端或 GUI 裡寫分析 SQL 並直接執行。這種互動式查詢對於業務分析、產品、銷售、財務等角色探索資料非常高效。雖然它不完全是“經典批處理”形態，但 SQL 讓探索式查詢也能在分散式批處理系統中高效完成。

高階查詢語言不只提升人的生產力，也提高機器執行效率。正如[“雲資料倉庫”](/tw/ch4#sec_cloud_data_warehouses)所述，查詢引擎要把 SQL 轉成在叢集裡執行的批處理作業。這個從查詢到語法樹再到物理運算元的轉換過程，讓引擎有機會做最佳化。Hive、Trino、Spark、Flink 等查詢引擎都具備代價最佳化器：它們可分析連線輸入特徵，自動選擇更合適的連線演算法，甚至重排連線順序以減少中間狀態 [^19] [^26] [^27] [^28]。

SQL 是最流行的通用批處理語言，但在一些細分場景中仍有其他語言。Apache Pig 提供了基於關係運算元的逐步式資料流水線描述方式，而非“一個超大 SQL 查詢”。DataFrame（下一節）有相似特徵，Morel 則是受 Pig 影響的更現代語言。還有使用者採用 jq、JMESPath、JsonPath 等 JSON 查詢語言。

在[“圖狀資料模型”](/tw/ch3#sec_datamodels_graph)中，我們討論了圖建模與圖查詢語言如何遍歷邊和頂點。許多圖處理框架也支援透過查詢語言做批計算，例如 Apache TinkerPop 的 Gremlin。我們會在[“批處理用例”](#sec_batch_output)繼續看圖處理場景。

> [!TIP] 批處理與雲資料倉庫正在收斂
> 歷史上，資料倉庫執行在專用硬體裝置上，主要提供關係資料的 SQL 分析查詢；而 MapReduce 等批處理框架強調更高可伸縮性與更高靈活性，允許使用通用程式語言寫處理邏輯，並讀寫任意資料格式。
>
> 隨著發展，兩者越來越像。現代批處理框架已經支援 SQL，並藉助 Parquet 等列式格式和最佳化執行引擎（見[“查詢執行：編譯與向量化”](/tw/ch4#sec_storage_vectorized)）在關係查詢上獲得良好效能。與此同時，資料倉庫透過雲化（見[“雲資料倉庫”](/tw/ch4#sec_cloud_data_warehouses)）獲得更強可伸縮能力，並實現了許多與分散式批處理框架相同的排程、容錯和混洗技術，很多也使用分散式檔案系統。
>
> 正如批處理系統採納 SQL，雲倉庫也在採納 DataFrame 等替代處理模型（下一節）。例如 BigQuery 提供 BigQuery DataFrames，Snowflake 的 Snowpark 能與 Pandas 整合。Airflow、Prefect、Dagster 等批處理工作流編排器也已廣泛整合雲倉庫。
>
> 當然，並非所有批任務都容易用 SQL 表達。PageRank 等迭代圖演算法、複雜機器學習任務都很難用 SQL 寫。涉及影像、影片、音訊等非關係多模態資料的 AI 處理同樣如此。
>
> 此外，雲資料倉庫在某些負載上並不理想。行級逐條計算與列式儲存不匹配，效率較低，此時更適合使用倉庫的其他 API 或批處理系統。雲倉庫通常也比其他批處理系統更貴，某些大作業放到 Spark/Flink 等系統可能更具成本優勢。
>
> 因此，“用批處理系統還是資料倉庫”最終要看成本、便利性、實現複雜度、可用性等綜合因素。大型企業往往並存多套系統以保留選擇空間；小公司通常一套系統也能跑起來。

### DataFrames {#id287}

隨著資料科學家和統計學家開始用分散式批處理框架做機器學習，他們發現原有處理模型不夠順手，因為他們更習慣 R 與 Pandas 裡的 DataFrame 資料模型（見[“DataFrame、矩陣與陣列”](/tw/ch3#sec_datamodels_dataframes)）。DataFrame 與關係庫裡的表很像：由多行組成，同一列值型別一致。它不是寫一個超大 SQL，而是透過呼叫對應關係運算元的函式來做過濾、連線、排序、分組等操作。

早期 DataFrame 操作大多在本地記憶體執行，因此只能處理單機裝得下的資料集。資料科學家希望在批處理環境中，仍用熟悉的 DataFrame API 處理大資料。Spark、Flink、Daft 等分散式框架都因此提供了 DataFrame API。需要注意的是，本地 DataFrame 通常帶索引且有順序，而分散式 DataFrame 往往沒有 [^29]，遷移時可能出現效能“意外”。

DataFrame API 看起來和資料流 API 相似，但實現方式差別不小。Pandas 呼叫方法後通常立刻執行；Spark 則會先把 DataFrame API 呼叫翻譯為查詢計劃，做查詢最佳化後，再在分散式資料流引擎上執行，從而獲得更好效能。

Daft 等框架甚至同時支援客戶端與服務端計算：小規模記憶體操作在客戶端執行，大資料與重計算在服務端執行。Apache Arrow 等列式格式提供統一資料模型，可被兩側執行引擎共享。

## 批處理用例 {#sec_batch_output}

瞭解了批處理如何工作後，我們來看它在不同應用中的落地。批處理非常適合“海量資料的批次計算”，但不適合低延遲場景。因此，只要資料多且新鮮度要求不高，幾乎都能看到批處理的身影。這聽起來像限制，但現實裡大量工作都符合這個模型：

- 會計對賬與庫存核對：企業定期驗證交易、銀行賬戶與庫存是否一致，常由批處理完成 [^30]。
- 製造業需求預測：通常以週期性批任務計算 [^31]。
- 電商、媒體、社交平臺推薦模型訓練：大量依賴批處理 [^32] [^33]。
- 許多金融系統也是批處理驅動。例如美國的銀行網路幾乎完全基於批任務執行 [^34]。

下面分別討論幾個幾乎所有行業都常見的批處理用例。

### 提取-轉換-載入（ETL） {#sec_batch_etl_usage}

[“資料倉庫”](/tw/ch1#sec_introduction_dwh)介紹了 ETL/ELT：從生產資料庫抽取資料、進行轉換，再載入到下游系統。本節用“ETL”統稱這兩類負載。尤其當下游是資料倉庫時，ETL 常由批處理作業承載。

批處理天然並行，非常適合資料轉換。很多轉換任務都是“令人尷尬地並行”：過濾、欄位投影及大量常見倉庫轉換都可並行完成。

批處理環境通常自帶成熟工作流排程器，便於安排、編排和除錯 ETL 流水線。發生故障時，排程器常會自動重試以覆蓋瞬時問題；若持續失敗，則明確標記失敗，便於工程師快速定位流水線中斷點。像 Airflow 還內建大量 source/sink/query 運算元，可直接對接 MySQL、PostgreSQL、Snowflake、Spark、Flink 等數十種系統。排程器與資料處理系統的緊密整合顯著簡化了資料整合。

我們也看到，批處理在“出錯後排障與修復”方面很友好，這對除錯資料流水線極其關鍵。失敗檔案可直接檢查，ETL 作業可修復後重跑。比如輸入檔案不再包含某個轉換邏輯依賴欄位，資料工程師就能據此更新轉換邏輯或修復上游生產作業。

過去資料流水線往往由單一資料工程團隊集中維護，因為讓產品團隊自行編寫和維護複雜批流水線不太現實。近年隨著處理模型和元資料管理改進，組織內更多團隊都能參與並維護自己的流水線。*data mesh* [^35] [^36]、*data contract* [^37]、*data fabric* [^38] 等實踐，正透過規範和工具幫助團隊安全釋出可被全組織消費的資料。

如今資料流水線與分析查詢不僅共享處理模型，也常共享執行引擎。很多 ETL 作業與消費其輸出的分析查詢都執行在同一系統裡：例如同樣以 SparkSQL、Trino 或 DuckDB 查詢執行。這樣的架構進一步模糊了應用工程、資料工程、分析工程與業務分析之間的界限。

### 分析（Analytics） {#sec_batch_olap}

在[“操作型系統與分析型系統”](/tw/ch1#sec_introduction_analytics)中我們看到，分析查詢（OLAP）通常要掃描大量記錄並做分組聚合。這類負載可以與其他批任務一起執行在批處理系統中。分析人員寫 SQL，經查詢引擎執行，讀寫底層 DFS 或物件儲存。表到檔案對映、名稱、型別等表元資料通常由 Apache Iceberg 等表格式與 Unity 等 catalog 管理（見[“雲資料倉庫”](/tw/ch4#sec_cloud_data_warehouses)）。這種架構稱為 *資料湖倉（data lakehouse）* [^39]。

與 ETL 類似，SQL 介面改進讓很多組織用 Spark 等批框架直接承載分析。常見模式有兩類：

- 預聚合查詢：先把資料滾動聚合為 OLAP 立方體或資料集市，以提升查詢速度（見[“物化檢視與資料立方”](/tw/ch4#sec_storage_materialized_views)）。預聚合結果可在倉庫查詢，或推送到 Apache Druid、Apache Pinot 這類即時 OLAP 系統。預聚合通常按固定週期執行，通常由[“工作流排程”](#sec_batch_workflows)中提到的排程器管理。
- 臨時查詢（ad hoc）：使用者為回答具體業務問題、分析使用者行為、排查執行問題等隨時發起。該場景非常看重響應時間，分析師通常會根據每次結果繼續迭代提問。執行快的批處理查詢引擎可顯著縮短等待。

SQL 支援還讓批處理系統更易接入電子表格與視覺化工具，如 Tableau、Power BI、Looker、Apache Superset。比如 Tableau 有 SparkSQL、Presto 聯結器；Superset 支援 Trino、Hive、Spark SQL、Presto 等大量最終會觸發批任務的資料系統。

### 機器學習 {#id290}

機器學習（ML）高度依賴批處理。資料科學家、ML 工程師、AI 工程師會用批處理框架探索資料模式、做資料轉換、訓練模型。常見用途包括：

- 特徵工程：把原始資料過濾並轉換為可訓練資料。預測模型往往要求數值特徵，因此文字或離散值等資料需要先轉成目標格式。
- 模型訓練：訓練資料是批過程輸入，訓練後模型權重是輸出。
- 批次推理：當資料集很大且不要求即時結果時，可對整批資料做預測，也包括在測試集上評估模型預測效果。

很多框架為這些場景提供了專用工具。例如 Spark 的 MLlib、Flink 的 FlinkML 都內建豐富的特徵工程工具、統計函式與分類器。

推薦系統和排序系統等 ML 應用也大量使用圖處理（見[“圖狀資料模型”](/tw/ch3#sec_datamodels_graph)）。許多圖演算法表達為“沿邊逐步傳播資訊並反覆迭代”：把一個頂點與相鄰頂點連線，傳遞某些資訊，重複直到滿足停止條件，例如無邊可繼續，或某個指標收斂。

*批同步並行（bulk synchronous parallel, BSP）* 計算模型 [^40] 已成為批圖計算常用模型。Apache Giraph [^20]、Spark GraphX、Flink Gelly [^41] 等都實現了它。它也常被稱為 *Pregel* 模型，因為 Google 的 Pregel 論文讓這一方法廣為人知 [^42]。

批處理同樣是大語言模型（LLM）資料準備與訓練的重要組成部分。網頁等原始文字通常存放在 DFS 或物件儲存中，必須先預處理才能用於訓練。適合批處理框架的預處理步驟包括：

- 從 HTML 中提取純文字，並修復損壞文字；
- 檢測並清理低質量、無關或重複文件；
- 對文字做分詞並轉換為嵌入向量（詞或片段的數值表示）。

Kubeflow、Flyte、Ray 等框架就專為這類負載構建。以 OpenAI 為例，ChatGPT 訓練流程中就使用了 Ray [^43]。這些框架通常內建與 PyTorch、TensorFlow、XGBoost 等 LLM/AI 庫的整合，並支援特徵工程、模型訓練、批次推理、微調等能力。

最後，資料科學家常在 Jupyter、Hex 等互動式 Notebook 中實驗資料。Notebook 由多個 *cell* 組成，每個 cell 是一小段 Markdown、Python 或 SQL；按順序執行可得到表格、圖表或資料結果。很多 Notebook 背後透過 DataFrame API 或 SQL 呼叫批處理系統。

### 對外提供派生資料 {#sec_batch_serving_derived}

批處理常用於構建預計算/派生資料集，如商品推薦、面向使用者的報表、機器學習特徵等。這些資料通常由生產資料庫、鍵值儲存或搜尋引擎對外服務。不論目標系統是什麼，都需要把批處理環境中的 DFS/物件儲存輸出，回灌到線上服務資料庫。

最直觀的做法是：在批作業裡直接使用資料庫客戶端庫，一條條寫生產資料庫（假設防火牆允許）。這雖然能工作，但通常不是好主意，原因有三：

- 每條記錄一次網路請求，比批任務正常吞吐低幾個數量級。即便客戶端支援批寫，效能通常也不理想。
- 批處理框架常並行跑很多任務。若所有任務同時以批處理速率寫同一資料庫，很容易把資料庫壓垮，進而影響其線上查詢效能，引發系統其他部分故障 [^44]。
- 批作業通常提供清晰的“全有或全無”輸出語義：作業成功時，結果等價於每個任務恰好執行一次；作業失敗時，無有效輸出。但如果在作業內直接寫外部系統，就產生了外部可見副作用，難以隱藏：部分完成結果可能被其他系統看到，任務失敗重啟還可能造成重複寫。

更好的方案是把預計算結果先推送到 Kafka 這類流系統（我們會在[第十二章](/tw/ch12#ch_stream)深入討論）。Elasticsearch、Apache Pinot、Apache Druid、Venice 這類派生資料儲存 [^45]，以及 ClickHouse 等雲資料倉庫，都支援從 Kafka 攝入資料。透過流系統過渡可以改善前述問題：

- 流系統針對順序寫最佳化，更適合批作業的大吞吐寫入模式；
- 流系統可在批作業與生產庫間充當緩衝層，下游可按自身能力限速讀取，避免影響線上流量；
- 一個批作業輸出可被多個下游系統同時消費；
- 流系統還可作為批處理網路與生產網路之間的安全邊界（可部署在 DMZ）。

但“經由流”並不會自動解決“全有或全無”語義。要實現這一點，批作業需要在完成後向下游發出“作業完成，可對外可見”的通知。流消費者需要像 *讀已提交（read committed）* 事務那樣，在收到完成通知前讓新資料對查詢不可見（見[“讀已提交”](/tw/ch8#sec_transactions_read_committed)）。

另一種在資料庫冷啟動（bootstrap）時更常見的模式，是在批作業內直接構建一個全新資料庫，再把檔案從 DFS、物件儲存或本地檔案系統批次匯入目標資料庫。很多系統都提供這類批次匯入工具，如 TiDB Lightning、Apache Pinot/Apache Druid 的 Hadoop 匯入作業，RocksDB 也提供從批作業批次匯入 SST 的 API。

“批構建 + 批匯入”速度非常快，也更容易在不同資料版本間做原子切換。但對於需要持續增量更新的場景，這種“每次構建全新庫”的方式會更難。很多系統採用混合策略，同時支援冷啟動與增量載入。比如 Venice 就支援混合儲存，可同時做基於行的批更新和全量資料集切換。

## 本章小結 {#id292}

本章討論了批處理系統的設計與實現。我們先從經典 Unix 工具鏈（awk、sort、uniq 等）出發，說明了批處理的基礎原語，例如排序和計數。

然後我們把視角擴充套件到分散式批處理系統。批處理以“不可變、有限（bounded）的輸入資料集”為物件，生成輸出資料，這使得重跑和除錯可以不引入副作用。圍繞這一模式，批處理框架通常包含三層核心能力：決定作業何時何地執行的編排層，負責持久化資料的儲存層，以及執行實際計算的計算層。

我們看了分散式檔案系統和物件儲存如何透過分塊複製、快取和元資料服務管理大檔案，也討論了現代批處理框架如何透過可插拔 API 與這些儲存互動。我們還討論了編排器在大叢集中如何排程任務、分配資源和處理故障，以及“按作業排程”的編排器與“按依賴圖管理整組作業生命週期”的工作流編排器之間的區別。

在處理模型方面，我們回顧了 MapReduce 及其經典 map/reduce 函式，又介紹了 Spark、Flink 等更易用且效能更好的資料流引擎。為了理解批作業如何擴充套件到大規模，我們重點講了混洗（shuffle）演算法，它是實現分組、連線、聚合的基礎操作。

隨著批處理系統成熟，焦點轉向可用性。高階查詢語言（尤其 SQL）和 DataFrame API 讓批處理作業更易編寫，也更容易被最佳化器最佳化。查詢最佳化器把宣告式查詢轉換為高效執行計劃。

最後我們回顧了批處理常見用例：

- ETL 流水線：透過定時工作流在不同系統間提取、轉換、載入資料；
- 分析：既支援預聚合報表，也支援臨時探索查詢；
- 機器學習：用於準備與處理大規模訓練資料；
- 把批處理輸出灌入面向生產流量的系統：常透過流系統或批次匯入工具，把派生資料提供給使用者。

下一章我們將轉向流處理。與批處理不同，流處理輸入是 *無界（unbounded）* 的：作業仍在，但輸入是持續不斷的資料流，因此作業不會“完成”。我們會看到，流處理與批處理在一些方面很相似，但“輸入無界”這一前提也會顯著改變系統設計。


### 參考文獻 {#references}

[^1]: Nathan Marz. [How to Beat the CAP Theorem](http://nathanmarz.com/blog/how-to-beat-the-cap-theorem.html). *nathanmarz.com*, October 2011. Archived at [perma.cc/4BS9-R9A4](https://perma.cc/4BS9-R9A4)
[^2]: Molly Bartlett Dishman and Martin Fowler. [Agile Architecture](https://www.youtube.com/watch?v=VjKYO6DP3fo&list=PL055Epbe6d5aFJdvWNtTeg_UEHZEHdInE). At *O'Reilly Software Architecture Conference*, March 2015.
[^3]: Jeffrey Dean and Sanjay Ghemawat. [MapReduce: Simplified Data Processing on Large Clusters](https://www.usenix.org/legacy/publications/library/proceedings/osdi04/tech/full_papers/dean/dean.pdf). At *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
[^4]: Shivnath Babu and Herodotos Herodotou. [Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf). *Foundations and Trends in Databases*, volume 5, issue 1, pages 1–104, November 2013. [doi:10.1561/1900000036](https://doi.org/10.1561/1900000036)
[^5]: David J. DeWitt and Michael Stonebraker. [MapReduce: A Major Step Backwards](https://homes.cs.washington.edu/~billhowe/mapreduce_a_major_step_backwards.html). Originally published at *databasecolumn.vertica.com*, January 2008. Archived at [perma.cc/U8PA-K48V](https://perma.cc/U8PA-K48V)
[^6]: Henry Robinson. [The Elephant Was a Trojan Horse: On the Death of Map-Reduce at Google](https://www.the-paper-trail.org/post/2014-06-25-the-elephant-was-a-trojan-horse-on-the-death-of-map-reduce-at-google/). *the-paper-trail.org*, June 2014. Archived at [perma.cc/9FEM-X787](https://perma.cc/9FEM-X787)
[^7]: Urs Hölzle. [R.I.P. MapReduce. After having served us well since 2003, today we removed the remaining internal codebase for good](https://twitter.com/uhoelzle/status/1177360023976067077). *twitter.com*, September 2019. Archived at [perma.cc/B34T-LLY7](https://perma.cc/B34T-LLY7)
[^8]: Adam Drake. [Command-Line Tools Can Be 235x Faster than Your Hadoop Cluster](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html). *aadrake.com*, January 2014. Archived at [perma.cc/87SP-ZMCY](https://perma.cc/87SP-ZMCY)
[^9]: [`sort`: Sort text files](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html). GNU Coreutils 9.7 Documentation, Free Software Foundation, Inc., 2025.
[^10]: Michael Ovsiannikov, Silvius Rus, Damian Reeves, Paul Sutter, Sriram Rao, and Jim Kelly. [The Quantcast File System](https://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p808-ovsiannikov.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1092–1101, August 2013. [doi:10.14778/2536222.2536234](https://doi.org/10.14778/2536222.2536234)
[^11]: Andrew Wang, Zhe Zhang, Kai Zheng, Uma Maheswara G., and Vinayakumar B. [Introduction to HDFS Erasure Coding in Apache Hadoop](https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html). *blog.cloudera.com*, September 2015. Archived at [archive.org](https://web.archive.org/web/20250731115546/https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html)
[^12]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V)
[^13]: Vinod Kumar Vavilapalli, Arun C. Murthy, Chris Douglas, Sharad Agarwal, Mahadev Konar, Robert Evans, Thomas Graves, Jason Lowe, Hitesh Shah, Siddharth Seth, Bikas Saha, Carlo Curino, Owen O'Malley, Sanjay Radia, Benjamin Reed, and Eric Baldeschwieler. [Apache Hadoop YARN: Yet Another Resource Negotiator](https://opencourse.inf.ed.ac.uk/sites/default/files/2023-10/yarn-socc13.pdf). At *4th Annual Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523633](https://doi.org/10.1145/2523616.2523633)
[^14]: Richard M. Karp. [Reducibility Among Combinatorial Problems](https://www.cs.purdue.edu/homes/hosking/197/canon/karp.pdf). *Complexity of Computer Computations. The IBM Research Symposia Series*. Springer, 1972. [doi:10.1007/978-1-4684-2001-2_9](https://doi.org/10.1007/978-1-4684-2001-2_9)
[^15]: J. D. Ullman. [NP-Complete Scheduling Problems](https://www.cs.montana.edu/bhz/classes/fall-2018/csci460/paper4.pdf). *Journal of Computer and System Sciences*, volume 10, issue 3, June 1975. [doi:10.1016/S0022-0000(75)80008-0](https://doi.org/10.1016/S0022-0000(75)80008-0)
[^16]: Gilad David Maayan. [The complete guide to spot instances on AWS, Azure and GCP](https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/). *datacenterdynamics.com*, March 2021. Archived at [archive.org](https://web.archive.org/web/20250722114617/https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/)
[^17]: Abhishek Verma, Luis Pedrosa, Madhukar Korupolu, David Oppenheimer, Eric Tune, and John Wilkes. [Large-Scale Cluster Management at Google with Borg](https://dl.acm.org/doi/pdf/10.1145/2741948.2741964). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741964](https://doi.org/10.1145/2741948.2741964)
[^18]: Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauley, Michael J. Franklin, Scott Shenker, and Ion Stoica. [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
[^19]: Paris Carbone, Stephan Ewen, Seif Haridi, Asterios Katsifodimos, Volker Markl, and Kostas Tzoumas. [Apache Flink™: Stream and Batch Processing in a Single Engine](http://sites.computer.org/debull/A15dec/p28.pdf). *Bulletin of the IEEE Computer Society Technical Committee on Data Engineering*, volume 38, issue 4, December 2015. Archived at [perma.cc/G3N3-BKX5](https://perma.cc/G3N3-BKX5)
[^20]: Mark Grover, Ted Malaska, Jonathan Seidman, and Gwen Shapira. *[Hadoop Application Architectures](https://learning.oreilly.com/library/view/hadoop-application-architectures/9781491910313/)*. O'Reilly Media, 2015. ISBN: 978-1-491-90004-8
[^21]: Jules S. Damji, Brooke Wenig, Tathagata Das, and Denny Lee. *[Learning Spark, 2nd Edition](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/)*. O'Reilly Media, 2020. ISBN: 978-1492050049
[^22]: Michael Isard, Mihai Budiu, Yuan Yu, Andrew Birrell, and Dennis Fetterly. [Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks](https://www.microsoft.com/en-us/research/publication/dryad-distributed-data-parallel-programs-from-sequential-building-blocks/). At *2nd European Conference on Computer Systems* (EuroSys), March 2007. [doi:10.1145/1272996.1273005](https://doi.org/10.1145/1272996.1273005)
[^23]: Daniel Warneke and Odej Kao. [Nephele: Efficient Parallel Data Processing in the Cloud](https://stratosphere2.dima.tu-berlin.de/assets/papers/Nephele_09.pdf). At *2nd Workshop on Many-Task Computing on Grids and Supercomputers* (MTAGS), November 2009. [doi:10.1145/1646468.1646476](https://doi.org/10.1145/1646468.1646476)
[^24]: Hossein Ahmadi. [In-memory query execution in Google BigQuery](https://cloud.google.com/blog/products/bigquery/in-memory-query-execution-in-google-bigquery). *cloud.google.com*, August 2016. Archived at [perma.cc/DGG2-FL9W](https://perma.cc/DGG2-FL9W)
[^25]: Tom White. *[Hadoop: The Definitive Guide](https://learning.oreilly.com/library/view/hadoop-the-definitive/9781491901687/)*, 4th edition. O'Reilly Media, 2015. ISBN: 978-1-491-90163-2
[^26]: Fabian Hüske. [Peeking into Apache Flink's Engine Room](https://flink.apache.org/2015/03/13/peeking-into-apache-flinks-engine-room/). *flink.apache.org*, March 2015. Archived at [perma.cc/44BW-ALJX](https://perma.cc/44BW-ALJX)
[^27]: Mostafa Mokhtar. [Hive 0.14 Cost Based Optimizer (CBO) Technical Overview](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/). *hortonworks.com*, March 2015. Archived on [archive.org](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/)
[^28]: Michael Armbrust, Reynold S. Xin, Cheng Lian, Yin Huai, Davies Liu, Joseph K. Bradley, Xiangrui Meng, Tomer Kaftan, Michael J. Franklin, Ali Ghodsi, and Matei Zaharia. [Spark SQL: Relational Data Processing in Spark](https://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742797](https://doi.org/10.1145/2723372.2742797)
[^29]: Kaya Kupferschmidt. [Spark vs Pandas, part 2 -- Spark](https://towardsdatascience.com/spark-vs-pandas-part-2-spark-c57f8ea3a781/). *towardsdatascience.com*, October 2020. Archived at [perma.cc/5BRK-G4N5](https://perma.cc/5BRK-G4N5)
[^30]: Ammar Chalifah. [Tracking payments at scale](https://bolt.eu/en/blog/tracking-payments-at-scale). *bolt.eu*, June 2025. Archived at [perma.cc/Q4KX-8K3J](https://perma.cc/Q4KX-8K3J)
[^31]: Nafi Ahmet Turgut, Hamza Akyıldız, Hasan Burak Yel, Mehmet İkbal Özmen, Mutlu Polatcan, Pinar Baki, and Esra Kayabali. [Demand forecasting at Getir built with Amazon Forecast](https://aws.amazon.com/blogs/machine-learning/demand-forecasting-at-getir-built-with-amazon-forecast). *aws.amazon.com*, May 2023. Archived at [perma.cc/H3H6-GNL7](https://perma.cc/H3H6-GNL7)
[^32]: Jason (Siyu) Zhu. [Enhancing homepage feed relevance by harnessing the power of large corpus sparse ID embeddings](https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar). *linkedin.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20250225094424/https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar)
[^33]: Avery Ching, Sital Kedia, and Shuojie Wang. [Apache Spark \@Scale: A 60 TB+ production use case](https://engineering.fb.com/2016/08/31/core-infra/apache-spark-scale-a-60-tb-production-use-case/). *engineering.fb.com*, August 2016. Archived at [perma.cc/F7R5-YFAV](https://perma.cc/F7R5-YFAV)
[^34]: Edward Kim. [How ACH works: A developer perspective --- Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/F67P-VBLK](https://perma.cc/F67P-VBLK)
[^35]: Zhamak Dehghani. [How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh](https://martinfowler.com/articles/data-monolith-to-mesh.html). *martinfowler.com*, May 2019. Archived at [perma.cc/LN2L-L4VC](https://perma.cc/LN2L-L4VC)
[^36]: Chris Riccomini. [What the Heck is a Data Mesh?!](https://cnr.sh/essays/what-the-heck-data-mesh) *cnr.sh*, June 2021. Archived at [perma.cc/NEJ2-BAX3](https://perma.cc/NEJ2-BAX3)
[^37]: Chad Sanderson, Mark Freeman, B. E. Schmidt. [*Data Contracts*](https://www.oreilly.com/library/view/data-contracts/9781098157623/). O'Reilly Media, 2025. ISBN: 9781098157623
[^38]: Daniel Abadi. [Data Fabric vs. Data Mesh: What's the Difference?](https://www.starburst.io/blog/data-fabric-vs-data-mesh-whats-the-difference/) *starburst.io*, November 2021. Archived at [perma.cc/RSK3-HXDK](https://perma.cc/RSK3-HXDK)
[^39]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^40]: Leslie G. Valiant. [A Bridging Model for Parallel Computation](https://dl.acm.org/doi/pdf/10.1145/79173.79181). *Communications of the ACM*, volume 33, issue 8, pages 103--111, August 1990. [doi:10.1145/79173.79181](https://doi.org/10.1145/79173.79181)
[^41]: Stephan Ewen, Kostas Tzoumas, Moritz Kaufmann, and Volker Markl. [Spinning Fast Iterative Data Flows](https://vldb.org/pvldb/vol5/p1268_stephanewen_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 11, pages 1268--1279, July 2012. [doi:10.14778/2350229.2350245](https://doi.org/10.14778/2350229.2350245)
[^42]: Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, James C. Dehnert, Ilan Horn, Naty Leiser, and Grzegorz Czajkowski. [Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2010. [doi:10.1145/1807167.1807184](https://doi.org/10.1145/1807167.1807184)
[^43]: Richard MacManus. [OpenAI Chats about Scaling LLMs at Anyscale's Ray Summit](https://thenewstack.io/openai-chats-about-scaling-llms-at-anyscales-ray-summit/). *thenewstack.io*, September 2023. Archived at [perma.cc/YJD6-KUXU](https://perma.cc/YJD6-KUXU)
[^44]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^45]: Félix GV. [Open Sourcing Venice -- LinkedIn's Derived Data Platform](https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform). *linkedin.com*, September 2022. Archived at [archive.org](https://web.archive.org/web/20250226160927/https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform)

================================================
FILE: content/tw/ch12.md
================================================
---
title: "第十二章：流處理"
linkTitle: "12. 流處理"
weight: 312
math: true
breadcrumbs: false
---

<a id="ch_stream"></a>

![](/map/ch11.png)

> 有效的複雜系統總是從簡單的系統演化而來。反之亦然：從零設計的複雜系統沒一個能有效工作的。
>
> —— 約翰・加爾，Systemantics（1975）

在 [第十一章](/tw/ch11) 中，我們討論了批處理技術，它讀取一組檔案作為輸入，並生成一組新的檔案作為輸出。輸出是 **派生資料（derived data）** 的一種形式；也就是說，如果需要，可以透過再次執行批處理過程來重新建立資料集。我們看到了如何使用這個簡單而強大的想法來建立搜尋索引、推薦系統、做分析等等。

然而，在 [第十一章](/tw/ch11) 中仍然有一個很大的假設：即輸入是有界的，即已知和有限的大小，所以批處理知道它何時完成輸入的讀取。例如，MapReduce 核心的排序操作必須讀取其全部輸入，然後才能開始生成輸出，因為可能發生這種情況：最後一條輸入記錄具有最小的鍵，因此需要第一個被輸出，所以提早開始輸出是不可行的。

實際上，很多資料是 **無界限** 的，因為它隨著時間的推移而逐漸到達：你的使用者在昨天和今天產生了資料，明天他們將繼續產生更多的資料。除非你停業，否則這個過程永遠都不會結束，所以資料集從來就不會以任何有意義的方式 “完成”[^1]。因此，批處理程式必須將資料人為地分成固定時間段的資料塊，例如，在每天結束時處理一天的資料，或者在每小時結束時處理一小時的資料。

日常批處理中的問題是，輸入的變更只會在一天之後的輸出中反映出來，這對於許多急躁的使用者來說太慢了。為了減少延遲，我們可以更頻繁地執行處理 —— 比如說，在每秒鐘的末尾 —— 或者甚至更連續一些，完全拋開固定的時間切片，當事件發生時就立即進行處理，這就是 **流處理（stream processing）** 背後的想法。

一般來說，“流” 是指隨著時間的推移逐漸可用的資料。這個概念出現在很多地方：Unix 的 stdin 和 stdout、程式語言（惰性列表）[^2]、檔案系統 API（如 Java 的 `FileInputStream`）、TCP 連線、透過網際網路傳送音訊和影片等等。

在本章中，我們將把 **事件流（event stream）** 視為一種資料管理機制：無界限，增量處理，與上一章中的批次資料相對應。我們將首先討論怎樣表示、儲存、透過網路傳輸流。在 “[資料庫與流](#sec_stream_databases)” 中，我們將研究流和資料庫之間的關係。最後在 “[流處理](#sec_stream_processing)” 中，我們將研究連續處理這些流的方法和工具，以及它們用於應用構建的方式。


## 傳遞事件流 {#sec_stream_transmit}

在批處理領域，作業的輸入和輸出是檔案（也許在分散式檔案系統上）。流處理領域中的等價物看上去是什麼樣子的？

當輸入是一個檔案（一個位元組序列），第一個處理步驟通常是將其解析為一系列記錄。在流處理的上下文中，記錄通常被叫做 **事件（event）** ，但它本質上是一樣的：一個小的、自包含的、不可變的物件，包含某個時間點發生的某件事情的細節。一個事件通常包含一個來自日曆時鐘的時間戳，以指明事件發生的時間（請參閱 “[單調鐘與日曆時鐘](/tw/ch9#sec_distributed_monotonic_timeofday)”）。

例如，發生的事件可能是使用者採取的行動，例如檢視頁面或進行購買。它也可能來源於機器，例如對溫度感測器或 CPU 利用率的週期性測量。在 “[使用 Unix 工具的批處理](/tw/ch11#sec_batch_unix)” 的示例中，Web 伺服器日誌的每一行都是一個事件。

事件可能被編碼為文字字串或 JSON，或者某種二進位制編碼，如 [第五章](/tw/ch5) 所述。這種編碼允許你儲存一個事件，例如將其追加到一個檔案，將其插入關係表，或將其寫入文件資料庫。它還允許你透過網路將事件傳送到另一個節點以進行處理。

在批處理中，檔案被寫入一次，然後可能被多個作業讀取。類似地，在流處理術語中，一個事件由 **生產者（producer）** （也稱為 **釋出者（publisher）** 或 **傳送者（sender）** ）生成一次，然後可能由多個 **消費者（consumer）** （ **訂閱者（subscribers）** 或 **接收者（recipients）** ）進行處理[^3]。在檔案系統中，檔名標識一組相關記錄；在流式系統中，相關的事件通常被聚合為一個 **主題（topic）** 或 **流（stream）** 。

原則上講，檔案或資料庫就足以連線生產者和消費者：生產者將其生成的每個事件寫入資料儲存，且每個消費者定期輪詢資料儲存，檢查自上次執行以來新出現的事件。這實際上正是批處理在每天結束時處理當天資料時所做的事情。

但當我們想要進行低延遲的連續處理時，如果資料儲存不是為這種用途專門設計的，那麼輪詢開銷就會很大。輪詢的越頻繁，能返回新事件的請求比例就越低，而額外開銷也就越高。相比之下，最好能在新事件出現時直接通知消費者。

資料庫在傳統上對這種通知機制支援的並不好，關係型資料庫通常有 **觸發器（trigger）** ，它們可以對變化（如，插入表中的一行）作出反應，但是它們的功能非常有限，並且在資料庫設計中多少屬於事後才補上的功能[^4]。相應的是，已經開發了專門的工具來提供事件通知。


### 訊息傳遞系統 {#sec_stream_messaging}

向消費者通知新事件的常用方式是使用 **訊息傳遞系統（messaging system）**：生產者傳送包含事件的訊息，然後將訊息推送給消費者。我們之前在 “[訊息傳遞中的資料流](/tw/ch5#sec_encoding_dataflow_msg)” 中談到了這些系統，但現在我們將詳細介紹這些系統。

像生產者和消費者之間的 Unix 管道或 TCP 連線這樣的直接通道，是實現訊息傳遞系統的簡單方法。但是，大多數訊息傳遞系統都在這一基本模型上進行了擴充套件。特別的是，Unix 管道和 TCP 將恰好一個傳送者與恰好一個接收者連線，而一個訊息傳遞系統允許多個生產者節點將訊息傳送到同一個主題，並允許多個消費者節點接收主題中的訊息。

在這個 **釋出 / 訂閱** 模式中，不同的系統採取各種各樣的方法，並沒有針對所有目的的通用答案。為了區分這些系統，問一下這兩個問題會特別有幫助：

1. **如果生產者傳送訊息的速度比消費者能夠處理的速度快會發生什麼？** 一般來說，有三種選擇：系統可以丟掉訊息，將訊息放入緩衝佇列，或使用 **背壓**（backpressure，也稱為 **流量控制**，即 flow control：阻塞生產者，以免其傳送更多的訊息）。例如 Unix 管道和 TCP 就使用了背壓：它們有一個固定大小的小緩衝區，如果填滿，傳送者會被阻塞，直到接收者從緩衝區中取出資料（請參閱 “[網路擁塞和排隊](/tw/ch9#sec_distributed_congestion)”）。

   如果訊息被快取在佇列中，那麼理解佇列增長會發生什麼是很重要的。當佇列裝不進記憶體時系統會崩潰嗎？還是將訊息寫入磁碟？如果是這樣，磁碟訪問又會如何影響訊息傳遞系統的效能[^5]，磁碟寫滿又會發生什麼[^6]？

2. **如果節點崩潰或暫時離線，會發生什麼情況？ —— 是否會有訊息丟失？** 與資料庫一樣，永續性可能需要寫入磁碟和 / 或複製的某種組合（請參閱 “[複製與永續性](/tw/ch8#sidebar_transactions_durability)”），這是有代價的。如果你能接受有時訊息會丟失，則可能在同一硬體上獲得更高的吞吐量和更低的延遲。

是否可以接受訊息丟失取決於應用。例如，對於週期傳輸的感測器讀數和指標，偶爾丟失的資料點可能並不重要，因為更新的值會在短時間內發出。但要注意，如果大量的訊息被丟棄，可能無法立刻意識到指標已經不正確了[^7]。如果你正在對事件計數，那麼它們能夠可靠送達是更重要的，因為每個丟失的訊息都意味著使計數器的錯誤擴大。

我們在 [第十一章](/tw/ch11) 中探討的批處理系統的一個很好的特性是，它們提供了強大的可靠性保證：失敗的任務會自動重試，失敗任務的部分輸出會自動丟棄。這意味著輸出與沒有發生故障一樣，這有助於簡化程式設計模型。在本章的後面，我們將研究如何在流處理的上下文中提供類似的保證。

#### 直接從生產者傳遞給消費者 {#id296}

許多訊息傳遞系統使用生產者和消費者之間的直接網路通訊，而不透過中間節點：

* UDP 組播廣泛應用於金融行業，例如股票市場，其中低時延非常重要[^8]。雖然 UDP 本身是不可靠的，但應用層的協議可以恢復丟失的資料包（生產者必須記住它傳送的資料包，以便能按需重新發送資料包）。
* 無代理的訊息庫，如 ZeroMQ 和 nanomsg 採取類似的方法，透過 TCP 或 IP 多播實現釋出 / 訂閱訊息傳遞。
* 一些指標採集代理（例如 StatsD [^9]）使用不可靠的 UDP 訊息傳遞來收集網路中所有機器的指標並進行監控。（在 StatsD 協議中，計數器指標只有在所有訊息都被接收時才是準確的；使用 UDP 使得這些指標至多是近似值[^10]。另請參閱 “[TCP 與 UDP](/tw/ch9#sidebar_distributed_tcp_udp)”）。
* 如果消費者在網路上公開了服務，生產者可以直接傳送 HTTP 或 RPC 請求（請參閱 “[服務中的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)”）將訊息推送給消費者。這就是 webhooks 背後的想法[^11]：把一個服務的回撥 URL 註冊到另一個服務中，當事件發生時向該 URL 發起請求。

儘管這些直接訊息傳遞系統在設計它們的環境中執行良好，但是它們通常要求應用程式碼意識到訊息丟失的可能性。它們的容錯程度極為有限：即使協議檢測到並重傳在網路中丟失的資料包，它們通常也只是假設生產者和消費者始終線上。

如果消費者處於離線狀態，則可能會丟失其不可達時傳送的訊息。一些協議允許生產者重試失敗的訊息傳遞，但當生產者崩潰時，它可能會丟失訊息緩衝區及其本應傳送的訊息，這種方法可能就沒用了。

#### 訊息代理 {#id433}

一種廣泛使用的替代方法是透過 **訊息代理**（message broker，也稱為 **訊息佇列**，即 message queue）傳送訊息，訊息代理實質上是一種針對處理訊息流而最佳化的資料庫[^12]。它作為伺服器執行，生產者和消費者作為客戶端連線到伺服器。生產者將訊息寫入代理，消費者透過從代理那裡讀取來接收訊息。

透過將資料集中在代理上，這些系統可以更容易地容忍來來去去的客戶端（連線，斷開連線和崩潰），而永續性問題則轉移到代理的身上。一些訊息代理只將訊息儲存在記憶體中，而另一些訊息代理（取決於配置）將其寫入磁碟，以便在代理崩潰的情況下不會丟失。針對緩慢的消費者，它們通常會允許無上限的排隊（而不是丟棄訊息或背壓），儘管這種選擇也可能取決於配置。

排隊的結果是，消費者通常是 **非同步（asynchronous）** 的：當生產者傳送訊息時，通常只會等待代理確認訊息已經被快取，而不等待訊息被消費者處理。向消費者遞送訊息將發生在未來某個未定的時間點 —— 通常在幾分之一秒之內，但有時當訊息堆積時會顯著延遲。

#### 訊息代理與資料庫的對比 {#id297}

有些訊息代理甚至可以使用 XA 或 JTA 參與兩階段提交協議（請參閱 “[實踐中的分散式事務](/tw/ch8#sec_transactions_xa)”）。這個功能與資料庫在本質上非常相似，儘管訊息代理和資料庫之間仍存在實踐上很重要的差異：

* 資料庫通常保留資料直至顯式刪除，而大多數訊息代理在訊息成功遞送給消費者時會自動刪除訊息。這樣的訊息代理不適合長期的資料儲存。
* 由於它們很快就能刪除訊息，大多數訊息代理都認為它們的工作集相當小 —— 即佇列很短。如果代理需要緩衝很多訊息，比如因為消費者速度較慢（如果記憶體裝不下訊息，可能會溢位到磁碟），每個訊息需要更長的處理時間，整體吞吐量可能會惡化[^5]。
* 資料庫通常支援次級索引和各種搜尋資料的方式，而訊息代理通常支援按照某種模式匹配主題，訂閱其子集。雖然機制並不一樣，但對於客戶端選擇想要了解的資料的一部分，都是基本的方式。
* 查詢資料庫時，結果通常基於某個時間點的資料快照；如果另一個客戶端隨後向資料庫寫入一些改變了查詢結果的內容，則第一個客戶端不會發現其先前結果現已過期（除非它重複查詢或輪詢變更）。相比之下，訊息代理不支援任意查詢，但是當資料發生變化時（即新訊息可用時），它們會通知客戶端。

這是關於訊息代理的傳統觀點，它被封裝在諸如 JMS [^13] 和 AMQP [^14] 的標準中，並且被諸如 RabbitMQ、ActiveMQ、HornetQ、Qpid、TIBCO 企業訊息服務、IBM MQ、Azure Service Bus 和 Google Cloud Pub/Sub 所實現[^15]。儘管可以把資料庫當作佇列來用，但要調優到理想效能並不容易[^16]。

#### 多個消費者 {#id298}

當多個消費者從同一主題中讀取訊息時，有兩種主要的訊息傳遞模式，如 [圖 12-1](#fig_stream_broker_patterns) 所示：

負載均衡（load balancing）
: 每條訊息都被傳遞給消費者 **之一**，所以處理該主題下訊息的工作能被多個消費者共享。代理可以為消費者任意分配訊息。當處理訊息的代價高昂，希望能並行處理訊息時，此模式非常有用（在 AMQP 中，可以透過讓多個客戶端從同一個佇列中消費來實現負載均衡，而在 JMS 中則稱之為 **共享訂閱**，即 shared subscription）。

扇出（fan-out）
: 每條訊息都被傳遞給 **所有** 消費者。扇出允許幾個獨立的消費者各自 “收聽” 相同的訊息廣播，而不會相互影響 —— 這個流處理中的概念對應批處理中多個不同批處理作業讀取同一份輸入檔案（JMS 中的主題訂閱與 AMQP 中的交換器繫結（exchange binding）提供了這一功能）。

{{< figure src="/fig/ddia_1201.png" id="fig_stream_broker_patterns" caption="圖 12-1. （a）負載均衡：在消費者間共享消費主題；（b）扇出：將每條訊息傳遞給多個消費者。" class="w-full my-4" >}}

兩種模式可以組合使用：例如，兩個獨立的消費者組可以每組各訂閱同一個主題，每一組都共同收到所有訊息，但在每一組內部，每條訊息僅由單個節點處理。

#### 確認與重新傳遞 {#sec_stream_reordering}

消費者隨時可能會崩潰，所以有一種可能的情況是：代理向消費者遞送訊息，但消費者沒有處理，或者在消費者崩潰之前只進行了部分處理。為了確保訊息不會丟失，訊息代理使用 **確認（acknowledgments）**：客戶端必須顯式告知代理訊息處理完畢的時間，以便代理能將訊息從佇列中移除。

如果與客戶端的連線關閉，或者代理超出一段時間未收到確認，代理則認為訊息沒有被處理，因此它將訊息再遞送給另一個消費者。（請注意可能發生這樣的情況，訊息 **實際上是** 處理完畢的，但 **確認** 在網路中丟失了。需要一種原子提交協議才能處理這種情況，正如在 “[實踐中的分散式事務](/tw/ch8#sec_transactions_xa)” 中所討論的那樣）

當與負載均衡相結合時，這種重傳行為對訊息的順序有種有趣的影響。在 [圖 12-2](#fig_stream_redelivery_reordering) 中，消費者通常按照生產者傳送的順序處理訊息。然而消費者 2 在處理訊息 m3 時崩潰，與此同時消費者 1 正在處理訊息 m4。未確認的訊息 m3 隨後被重新發送給消費者 1，結果消費者 1 按照 m4，m3，m5 的順序處理訊息。因此 m3 和 m4 的交付順序與生產者 1 的傳送順序不同。

{{< figure src="/fig/ddia_1202.png" id="fig_stream_redelivery_reordering" caption="圖 12-2. 在處理 m3 時消費者 2 崩潰，因此稍後重傳至消費者 1。" class="w-full my-4" >}}

即使訊息代理試圖保留訊息的順序（如 JMS 和 AMQP 標準所要求的），負載均衡與重傳的組合也不可避免地導致訊息被重新排序。為避免此問題，你可以讓每個消費者使用單獨的佇列（即不使用負載均衡功能）。如果訊息是完全獨立的，則訊息順序重排並不是一個問題。但正如我們將在本章後續部分所述，如果訊息之間存在因果依賴關係，這就是一個很重要的問題。

重傳還可能導致資源浪費、資源飢餓，甚至使流永久阻塞。一個常見場景是生產者錯誤地序列化訊息，例如 JSON 物件缺少必填鍵。任何讀取到該訊息的消費者都會因為缺鍵而失敗，無法傳送確認，於是代理會不斷重傳，導致其他消費者也不斷失敗。如果代理提供強順序保證，後續訊息可能被徹底卡住；即便允許重排，也會持續浪費資源在永遠無法確認的壞訊息上。

這類問題通常透過 **死信佇列（dead letter queue, DLQ）** 處理：不再無限重試，而是把問題訊息移到另一條佇列中，從而解堵主消費鏈路[^17] [^18]。運維通常會對死信佇列設定告警 —— 只要有訊息進入，就代表出現了錯誤。收到告警後，操作員可以決定永久丟棄該訊息、人工修復後重新投遞，或修復消費者程式碼以正確處理該訊息。除了傳統佇列系統，基於日誌的訊息系統和流處理系統也開始支援 DLQ[^19]。

### 基於日誌的訊息代理 {#sec_stream_log}

透過網路傳送資料包或向網路服務傳送請求通常是短暫的操作，不會留下永久的痕跡。儘管可以永久記錄（透過抓包與日誌），但我們通常不這麼做。即使是將訊息持久地寫入磁碟的訊息代理，在送達給消費者之後也會很快刪除訊息，因為它們建立在短暫訊息傳遞的思維方式上。

資料庫和檔案系統採用截然相反的方法論：至少在某人顯式刪除前，通常寫入資料庫或檔案的所有內容都要被永久記錄下來。

這種思維方式上的差異對建立派生資料的方式有巨大影響。如 [第十一章](/tw/ch11) 所述，批處理過程的一個關鍵特性是，你可以反覆執行它們，試驗處理步驟，不用擔心損壞輸入（因為輸入是只讀的）。而 AMQP/JMS 風格的訊息傳遞並非如此：收到訊息是具有破壞性的，因為確認可能導致訊息從代理中被刪除，因此你不能期望再次運行同一個消費者能得到相同的結果。

如果你將新的消費者新增到訊息傳遞系統，通常只能接收到消費者註冊之後開始傳送的訊息。先前的任何訊息都隨風而逝，一去不復返。作為對比，你可以隨時為檔案和資料庫新增新的客戶端，且能讀取任意久遠的資料（只要應用沒有顯式覆蓋或刪除這些資料）。

為什麼我們不能把它倆雜交一下，既有資料庫的持久儲存方式，又有訊息傳遞的低延遲通知？這就是 **基於日誌的訊息代理（log-based message brokers）** 背後的想法。

#### 使用日誌進行訊息儲存 {#id300}

日誌只是磁碟上簡單的僅追加記錄序列。我們先前在 [第四章](/tw/ch4) 中日誌結構儲存引擎和預寫式日誌的上下文中討論了日誌，在 [第六章](/tw/ch6) 複製的上下文裡也討論了它。

同樣的結構可以用於實現訊息代理：生產者透過將訊息追加到日誌末尾來發送訊息，而消費者透過依次讀取日誌來接收訊息。如果消費者讀到日誌末尾，則會等待新訊息追加的通知。Unix 工具 `tail -f` 能監視檔案被追加寫入的資料，基本上就是這樣工作的。

為了伸縮超出單個磁碟所能提供的更高吞吐量，可以對日誌進行 **分割槽**（按 [第七章](/tw/ch7) 的定義）。不同的分割槽可以託管在不同的機器上，使得每個分割槽都有一份能獨立於其他分割槽進行讀寫的日誌。一個主題可以定義為一組攜帶相同型別訊息的分割槽。這種方法如 [圖 12-3](#fig_stream_log_partitions) 所示。

在每個分割槽內，代理為每個訊息分配一個單調遞增的序列號或 **偏移量**（offset，在 [圖 12-3](#fig_stream_log_partitions) 中，框中的數字是訊息偏移量）。這種序列號是有意義的，因為分割槽是僅追加寫入的，所以分割槽內的訊息是完全有序的。沒有跨不同分割槽的順序保證。

{{< figure src="/fig/ddia_1203.png" id="fig_stream_log_partitions" caption="圖 12-3. 生產者透過將訊息追加寫入主題分割槽檔案來發送訊息，消費者依次讀取這些檔案。" class="w-full my-4" >}}

Apache Kafka [^20] 和 Amazon Kinesis Streams 都是按這種方式工作的基於日誌的訊息代理。Google Cloud Pub/Sub 在架構上類似，但對外暴露的是 JMS 風格的 API，而不是日誌抽象[^15]。儘管這些訊息代理將所有訊息寫入磁碟，但透過跨多臺機器分割槽，依然能夠達到每秒數百萬條訊息的吞吐量，並透過複製訊息實現容錯[^21] [^22]。

#### 日誌與傳統的訊息傳遞相比 {#sec_stream_logs_vs_messaging}

基於日誌的方法天然支援扇出式訊息傳遞，因為多個消費者可以獨立讀取日誌，而不會相互影響 —— 讀取訊息不會將其從日誌中刪除。為了在一組消費者之間實現負載平衡，代理可以將整個分割槽分配給消費者組中的節點，而不是將單條訊息分配給消費者客戶端。

然後每個客戶端將消費被指派分割槽中的 **所有** 訊息。通常情況下，當一個消費者被指派了一個日誌分割槽時，它會以簡單的單執行緒方式順序地讀取分割槽中的訊息。這種粗粒度的負載均衡方法有一些缺點：

* 共享消費主題工作的節點數，最多為該主題中的日誌分割槽數，因為同一個分割槽內的所有訊息被遞送到同一個節點。
* 如果某條訊息處理緩慢，則它會阻塞該分割槽中後續訊息的處理（一種頭部阻塞的形式；請參閱 “[描述效能](/tw/ch2#sec_introduction_percentiles)”）。

因此在訊息處理代價高昂，希望逐條並行處理，以及訊息的順序並沒有那麼重要的情況下，JMS/AMQP 風格的訊息代理是可取的。另一方面，在訊息吞吐量很高，處理迅速，順序很重要的情況下，基於日誌的方法表現得非常好[^23] [^24]。不過，基於日誌與傳統訊息系統的邊界並不絕對：例如，一個主題分割槽通常一次只分配給一個消費者[^25] [^26]。

#### 消費者偏移量 {#sec_stream_log_offsets}

順序消費一個分割槽使得判斷訊息是否已經被處理變得相當容易：所有偏移量小於消費者的當前偏移量的訊息已經被處理，而具有更大偏移量的訊息還沒有被看到。因此，代理不需要跟蹤確認每條訊息，只需要定期記錄消費者的偏移即可。這種方法減少了額外簿記開銷，並且帶來了批次化（batching）與流水線化（pipelining）的機會，有助於提高基於日誌的系統的吞吐量。

實際上，這種偏移量與單領導者資料庫複製中常見的日誌序列號非常相似，我們在 “[設定新從庫](/tw/ch6#sec_replication_new_replica)” 中討論了這種情況。在資料庫複製中，日誌序列號允許跟隨者斷開連線後，重新連線到領導者，並在不跳過任何寫入的情況下恢復複製。這裡原理完全相同：訊息代理表現得像一個主庫，而消費者就像一個從庫。

如果消費者節點失效，則失效消費者的分割槽將指派給其他節點，並從最後記錄的偏移量開始消費訊息。如果消費者已經處理了後續的訊息，但還沒有記錄它們的偏移量，那麼重啟後這些訊息將被處理兩次。我們將在本章後面討論這個問題的處理方法。

#### 磁碟空間使用 {#sec_stream_disk_usage}

如果只追加寫入日誌，則磁碟空間終究會耗盡。為了回收磁碟空間，日誌實際上被分割成段，並不時地將舊段刪除或移動到歸檔儲存。（我們將在後面討論一種更為複雜的磁碟空間釋放方式）

這就意味著如果一個慢消費者跟不上訊息產生的速率而落後得太多，它的消費偏移量指向了刪除的段，那麼它就會錯過一些訊息。實際上，日誌實現了一個有限大小的緩衝區，當緩衝區填滿時會丟棄舊訊息，它也被稱為 **迴圈緩衝區（circular buffer）** 或 **環形緩衝區（ring buffer）**。不過由於緩衝區在磁碟上，因此緩衝區可能相當的大。

讓我們做個粗略估算。在撰寫本文時，典型的大容量硬碟約為 20 TB，順序寫入吞吐量約為 250 MB/s。如果持續以最高速率寫入訊息，磁碟大約 22 小時就會寫滿並開始刪除最舊訊息。這意味著，即使在滿速寫入下，磁碟日誌也至少可以緩衝約 22 小時的資料。實踐中部署很少持續打滿磁碟頻寬，因此通常可以保留數天甚至數週的訊息緩衝區。

許多基於日誌的訊息代理現在也將訊息分層儲存到物件儲存中，以進一步提升容量，方式與我們在第六章中討論“物件儲存支撐資料庫”時類似。像 Apache Kafka 和 Redpanda 可以把較舊訊息放在物件儲存中按需讀取；還有一些系統直接將全部訊息儲存在物件儲存中。除了成本優勢外，這種架構也有資料整合優勢：如果物件儲存中的訊息以 Iceberg 表形式組織，批處理和資料倉庫作業可以直接在這些資料上執行，而無需再複製一份資料。

#### 當消費者跟不上生產者時 {#id459}

在 “[訊息傳遞系統](#sec_stream_messaging)” 中，如果消費者無法跟上生產者傳送資訊的速度時，我們討論了三種選擇：丟棄資訊，進行緩衝或施加背壓。在這種分類法裡，基於日誌的方法是緩衝的一種形式，具有很大但大小固定的緩衝區（受可用磁碟空間的限制）。

如果消費者遠遠落後，而所要求的資訊比保留在磁碟上的資訊還要舊，那麼它將不能讀取這些資訊，所以代理實際上丟棄了比緩衝區容量更大的舊資訊。你可以監控消費者落後日誌頭部的距離，如果落後太多就發出報警。由於緩衝區很大，因而有足夠的時間讓運維人員來修復慢消費者，並在訊息開始丟失之前讓其趕上。

即使消費者真的落後太多開始丟失訊息，也只有那個消費者受到影響；它不會中斷其他消費者的服務。這是一個巨大的運維優勢：你可以實驗性地消費生產日誌，以進行開發，測試或除錯，而不必擔心會中斷生產服務。當消費者關閉或崩潰時，會停止消耗資源，唯一剩下的只有消費者偏移量。

這種行為也與傳統的訊息代理形成了鮮明對比，在那種情況下，你需要小心地刪除那些消費者已經關閉的佇列 —— 否則那些佇列就會累積不必要的訊息，從其他仍活躍的消費者那裡佔走記憶體。

#### 重播舊訊息 {#sec_stream_replay}

我們之前提到，使用 AMQP 和 JMS 風格的訊息代理，處理和確認訊息是一個破壞性的操作，因為它會導致訊息在代理上被刪除。另一方面，在基於日誌的訊息代理中，使用訊息更像是從檔案中讀取資料：這是只讀操作，不會更改日誌。

除了消費者的任何輸出之外，處理的唯一副作用是消費者偏移量的前進。但偏移量是在消費者的控制之下的，所以如果需要的話可以很容易地操縱：例如你可以用昨天的偏移量跑一個消費者副本，並將輸出寫到不同的位置，以便重新處理最近一天的訊息。你可以使用各種不同的處理程式碼重複任意次。

這一方面使得基於日誌的訊息傳遞更像上一章的批處理，其中派生資料透過可重複的轉換過程與輸入資料顯式分離。它允許進行更多的實驗，更容易從錯誤和漏洞中恢復，使其成為在組織內整合資料流的良好工具[^27]。


## 資料庫與流 {#sec_stream_databases}

我們已經在訊息代理和資料庫之間進行了一些比較。儘管傳統上它們被視為單獨的工具類別，但是我們看到基於日誌的訊息代理已經成功地從資料庫中獲取靈感並將其應用於訊息傳遞。我們也可以反過來：從訊息傳遞和流中獲取靈感，並將它們應用於資料庫。

我們之前曾經說過，事件是某個時刻發生的事情的記錄。發生的事情可能是使用者操作（例如鍵入搜尋查詢）或讀取感測器，但也可能是 **寫入資料庫**。某些東西被寫入資料庫的事實是可以被捕獲、儲存和處理的事件。這一觀察結果表明，資料庫和資料流之間的聯絡不僅僅是磁碟日誌的物理儲存 —— 而是更深層的聯絡。

事實上，複製日誌（請參閱 “[複製日誌的實現](/tw/ch6#sec_replication_implementation)”）是一個由資料庫寫入事件組成的流，由主庫在處理事務時生成。從庫將寫入流應用到它們自己的資料庫副本，從而最終得到相同資料的精確副本。複製日誌中的事件描述發生的資料更改。

我們還在 “[使用共享日誌](/tw/ch10#sec_consistency_smr)” 中遇到了狀態機複製原理，其中指出：如果每個事件代表對資料庫的寫入，並且每個副本按相同的順序處理相同的事件，則副本將達到相同的最終狀態（假設事件處理是一個確定性的操作）。這是事件流的又一種場景！

在本節中，我們將首先看看異構資料系統中出現的一個問題，然後探討如何透過將事件流的想法帶入資料庫來解決這個問題。

### 保持系統同步 {#sec_stream_sync}

正如我們在本書中所看到的，沒有一個系統能夠滿足所有的資料儲存、查詢和處理需求。在實踐中，大多數重要應用都需要組合使用幾種不同的技術來滿足所有的需求：例如，使用 OLTP 資料庫來為使用者請求提供服務，使用快取來加速常見請求，使用全文索引來處理搜尋查詢，使用資料倉庫用於分析。每一種技術都有自己的資料副本，並根據自己的目的進行儲存方式的最佳化。

由於相同或相關的資料出現在了不同的地方，因此相互間需要保持同步：如果某個專案在資料庫中被更新，它也應當在快取、搜尋索引和資料倉庫中被更新。對於資料倉庫，這種同步通常由 ETL 程序執行（請參閱 “[資料倉庫](/tw/ch1#sec_introduction_dwh)”），通常是先取得資料庫的完整副本，然後執行轉換，並批次載入到資料倉庫中 —— 換句話說，批處理。我們在 “[批處理工作流的輸出](/tw/ch11#sec_batch_output)” 中同樣看到了如何使用批處理建立搜尋索引、推薦系統和其他派生資料系統。

如果週期性的完整資料庫轉儲過於緩慢，有時會使用的替代方法是 **雙寫（dual write）**，其中應用程式碼在資料變更時明確寫入每個系統：例如，首先寫入資料庫，然後更新搜尋索引，然後使快取項失效（甚至同時執行這些寫入）。

但是，雙寫有一些嚴重的問題，其中一個是競爭條件，如 [圖 12-4](#fig_stream_dual_write_race) 所示。在這個例子中，兩個客戶端同時想要更新一個專案 X：客戶端 1 想要將值設定為 A，客戶端 2 想要將其設定為 B。兩個客戶端首先將新值寫入資料庫，然後將其寫入到搜尋索引。因為運氣不好，這些請求的時序是交錯的：資料庫首先看到來自客戶端 1 的寫入將值設定為 A，然後來自客戶端 2 的寫入將值設定為 B，因此資料庫中的最終值為 B。搜尋索引首先看到來自客戶端 2 的寫入，然後是客戶端 1 的寫入，所以搜尋索引中的最終值是 A。即使沒發生錯誤，這兩個系統現在也永久地不一致了。

{{< figure src="/fig/ddia_1204.png" id="fig_stream_dual_write_race" caption="圖 12-4. 在資料庫中 X 首先被設定為 A，然後被設定為 B，而在搜尋索引處，寫入以相反的順序到達。" class="w-full my-4" >}}

除非有一些額外的併發檢測機制，例如我們在 “[檢測併發寫入](/tw/ch6#sec_replication_concurrent)” 中討論的版本向量，否則你甚至不會意識到發生了併發寫入 —— 一個值將簡單地以無提示方式覆蓋另一個值。

雙重寫入的另一個問題是，其中一個寫入可能會失敗，而另一個成功。這是一個容錯問題，而不是一個併發問題，但也會造成兩個系統互相不一致的結果。確保它們要麼都成功要麼都失敗，是原子提交問題的一個例子，解決這個問題的代價是昂貴的（請參閱 “[原子提交與兩階段提交](/tw/ch8#sec_transactions_2pc)”）。

如果你只有一個單領導者複製的資料庫，那麼這個領導者決定了寫入順序，而狀態機複製方法可以在資料庫副本上工作。然而，在 [圖 12-4](#fig_stream_dual_write_race) 中，沒有單個主庫：資料庫可能有一個領導者，搜尋索引也可能有一個領導者，但是兩者都不追隨對方，所以可能會發生衝突（請參閱 “[多主複製](/tw/ch6#sec_replication_multi_leader)”）。

如果實際上只有一個領導者 —— 例如，資料庫 —— 而且我們能讓搜尋索引成為資料庫的追隨者，情況要好得多。但這在實踐中可能嗎？

### 資料變更捕獲 {#sec_stream_cdc}

大多數資料庫的複製日誌的問題在於，它們一直被當做資料庫的內部實現細節，而不是公開的 API。客戶端應該透過其資料模型和查詢語言來查詢資料庫，而不是解析複製日誌並嘗試從中提取資料。

數十年來，許多資料庫根本沒有記錄在檔的獲取變更日誌的方式。由於這個原因，捕獲資料庫中所有的變更，然後將其複製到其他儲存技術（搜尋索引、快取或資料倉庫）中是相當困難的。

最近，人們對 **資料變更捕獲（change data capture, CDC）** 越來越感興趣，這是一種觀察寫入資料庫的所有資料變更，並將其提取並轉換為可以複製到其他系統中的形式的過程。CDC 是非常有意思的，尤其是當變更能在被寫入後立刻用於流時[^28]。

例如，你可以捕獲資料庫中的變更，並不斷將相同的變更應用至搜尋索引。如果變更日誌以相同的順序應用，則可以預期搜尋索引中的資料與資料庫中的資料是匹配的。搜尋索引和任何其他派生資料系統只是變更流的消費者，如 [圖 12-5](#fig_stream_cdc_flow) 所示。

{{< figure src="/fig/ddia_1205.png" id="fig_stream_cdc_flow" caption="圖 12-5. 將資料按順序寫入一個資料庫，然後按照相同的順序將這些更改應用到其他系統。" class="w-full my-4" >}}

#### 資料變更捕獲的實現 {#id307}

我們可以將日誌消費者叫做 **派生資料系統**，正如在 [第一章](/tw/ch1#sec_introduction_derived) 討論“記錄系統與派生資料”時所述：儲存在搜尋索引和資料倉庫中的資料，只是 **記錄系統** 資料的額外檢視。資料變更捕獲是一種機制，可確保對記錄系統所做的所有更改都反映在派生資料系統中，以便派生系統具有資料的準確副本。

從本質上說，資料變更捕獲使得一個資料庫成為領導者（被捕獲變化的資料庫），並將其他元件變為追隨者。基於日誌的訊息代理非常適合從源資料庫傳輸變更事件，因為它保留了訊息的順序（避免了 [圖 12-2](#fig_stream_redelivery_reordering) 的重新排序問題）。

資料庫觸發器可用來實現資料變更捕獲（請參閱 “[基於觸發器的複製](/tw/ch6#sec_replication_logical)”），透過註冊觀察所有變更的觸發器，並將相應的變更項寫入變更日誌表中。但是它們往往是脆弱的，而且有顯著的效能開銷。解析複製日誌可能是一種更穩健的方法，但它也很有挑戰，例如如何應對模式變更。

邏輯複製日誌可以用於實現 CDC（請參閱 “[邏輯（基於行）的日誌複製](/tw/ch6#sec_replication_logical)”），但會帶來不少挑戰，例如模式變更和更新建模。Debezium 開源專案專門解決這些問題，提供了面向 MySQL、PostgreSQL、Oracle、SQL Server、Db2、Cassandra 等資料庫的源聯結器。Kafka Connect 也為多種資料庫提供了 CDC 聯結器；Maxwell 透過解析 binlog 為 MySQL 提供類似能力[^29]，GoldenGate 為 Oracle 提供類似能力，pgcapture 為 PostgreSQL 提供類似能力。

類似於訊息代理，資料變更捕獲通常是非同步的：記錄資料庫系統在提交變更之前不會等待消費者應用變更。這種設計具有的運維優勢是，新增緩慢的消費者不會過度影響記錄系統。不過，所有複製延遲可能有的問題在這裡都可能出現（請參閱 “[複製延遲問題](/tw/ch6#sec_replication_lag)”）。

#### 初始快照 {#sec_stream_cdc_snapshot}

如果你擁有 **所有** 對資料庫進行變更的日誌，則可以透過重播該日誌，來重建資料庫的完整狀態。但是在許多情況下，永遠保留所有更改會耗費太多磁碟空間，且重播過於費時，因此日誌需要被截斷。

例如，構建新的全文索引需要整個資料庫的完整副本 —— 僅僅應用最近變更的日誌是不夠的，因為這樣會丟失最近未曾更新的專案。因此，如果你沒有完整的歷史日誌，則需要從一個一致的快照開始，如先前的 “[設定新從庫](/tw/ch6#sec_replication_new_replica)” 中所述。

資料庫的快照必須與變更日誌中的已知位置或偏移量相對應，以便在處理完快照後知道從哪裡開始應用變更。一些 CDC 工具集成了這種快照功能，而其他工具則把它留給你手動執行。Debezium 使用 Netflix 的 DBLog 水位線演算法提供增量快照能力[^30] [^31]。

#### 日誌壓縮 {#sec_stream_log_compaction}

如果你只能保留有限的歷史日誌，則每次要新增新的派生資料系統時，都需要做一次快照。但 **日誌壓縮（log compaction）** 提供了一個很好的備選方案。

我們之前在 “[日誌結構儲存](/tw/ch4#sec_storage_log_structured)” 的上下文中討論過日誌壓縮（可參閱 [圖 4-3](/tw/ch4#fig_storage_sstable_merging) 的示例）。原理很簡單：儲存引擎定期在日誌中查詢具有相同鍵的記錄，丟掉所有重複的內容，並只保留每個鍵的最新更新。這個壓縮與合併過程在後臺執行，如 [圖 12-6](#fig_stream_compaction) 所示。

{{< figure src="/fig/ddia_1206.png" id="fig_stream_compaction" caption="圖 12-6. 一個鍵值對日誌，其中鍵是貓影片的 ID（mew、purr、scratch、yawn），值是播放次數。日誌壓縮只保留每個鍵的最新值。" class="w-full my-4" >}}

在日誌結構儲存引擎中，具有特殊值 NULL（**墓碑**，即 tombstone）的更新表示該鍵被刪除，並會在日誌壓縮過程中被移除。但只要鍵不被覆蓋或刪除，它就會永遠留在日誌中。這種壓縮日誌所需的磁碟空間僅取決於資料庫的當前內容，而不取決於資料庫中曾經發生的寫入次數。如果相同的鍵經常被覆蓋寫入，則先前的值最終將被垃圾回收，只有最新的值會保留下來。

在基於日誌的訊息代理與資料變更捕獲的上下文中也適用相同的想法。如果 CDC 系統被配置為，每個變更都包含一個主鍵，且每個鍵的更新都替換了該鍵以前的值，那麼只需要保留對鍵的最新寫入就足夠了。

現在，無論何時需要重建派生資料系統（如搜尋索引），你可以從壓縮日誌主題的零偏移量處啟動新的消費者，然後依次掃描日誌中的所有訊息。日誌能保證包含資料庫中每個鍵的最新值（也可能是一些較舊的值）—— 換句話說，你可以使用它來獲取資料庫內容的完整副本，而無需從 CDC 源資料庫取一個快照。

Apache Kafka 支援這種日誌壓縮功能。正如我們將在本章後面看到的，它允許訊息代理被當成永續性儲存使用，而不僅僅是用於臨時訊息。

#### 變更流的 API 支援 {#sec_stream_change_api}

如今許多主流資料庫都把變更流作為一等介面提供，而不再像過去那樣主要依賴“事後補丁式”或逆向工程式的 CDC。MySQL、PostgreSQL 等關係資料庫通常透過與自身複製相同的日誌通道輸出變更；各大雲廠商也提供了對應的 CDC 服務，例如 Google Cloud 的 Datastream 可向關係資料庫與資料倉庫提供流式資料訪問。

即使是 Cassandra 這類最終一致、基於法定票數的資料庫，也開始支援資料變更捕獲。正如我們在第十章關於線性一致與法定票數中看到的，寫入是否“可見”取決於讀寫一致性設定，這使得其 CDC 的統一抽象更困難。Cassandra 的做法通常是公開各節點原始日誌段，而不是提供單一統一的變更流；消費方需要自己讀取併合並各節點日誌，生成業務可用的單一事件流[^32]。

Kafka Connect[^33]提供了大量資料庫系統與 Kafka 的 CDC 整合能力。變更事件一旦進入 Kafka，就可以用於更新搜尋索引等派生系統，也可以繼續送入後續流處理鏈路。

#### 資料變更捕獲與事件溯源 {#sec_stream_event_sourcing}

資料變更捕獲與事件溯源都把狀態變化表示成事件日誌，但二者抽象層級不同：

* 在資料變更捕獲中，應用仍以可變方式使用資料庫，任意更新/刪除記錄；變更日誌從資料庫底層抽取（如複製日誌），因此能保證抽取順序與真實寫入順序一致，避免 [圖 12-4](#fig_stream_dual_write_race) 這類競態問題。
* 在事件溯源中，應用邏輯從一開始就構建在不可變事件之上，事件儲存通常是僅追加寫入，更新和刪除被限制或禁止。事件語義是應用層行為，而非底層狀態差異。

二者孰優取決於場景。對未採用事件溯源的系統而言，引入它通常是一次較大架構變更；而資料變更捕獲通常可在現有資料庫上以較小改動接入，應用層甚至可以感知不到 CDC 的存在。

> [!TIP] 資料變更捕獲與資料庫模式
> 資料變更捕獲看上去比事件溯源更容易落地，但它也有自己的工程挑戰。
>
> 在微服務架構中，資料庫通常只由所屬服務直接訪問；其他服務透過該服務 API 互動，因此資料庫模式本應是服務內部實現細節，可隨服務演化。
>
> 但 CDC 往往直接複用上游資料庫模式做複製，這會把原本“內部模式”變成“外部契約”。刪除某個列可能會直接破壞下游消費者[^34]。
>
> 一種常見解法是 **Outbox 模式**：專門維護對外發布的 outbox 表，讓 CDC 讀取 outbox，而不是直接讀取內部領域模型表。這樣可以在儘量不影響外部消費者的前提下演化內部模式[^35] [^36]。它看起來像雙寫，實際上也是雙寫；但它把兩次寫入留在同一個資料庫系統內，因此可放進同一事務，規避跨系統雙寫的一致性問題。

和資料變更捕獲一樣，重放事件日誌也能重建當前狀態，但日誌壓縮策略不同：

* 對於 CDC，更新事件通常攜帶記錄的完整新版本，因此同一主鍵的最新事件就足以決定當前值，舊事件可被壓縮。
* 對於事件溯源，事件通常描述使用者意圖而非狀態覆蓋，後續事件一般不會“覆蓋”先前事件，因此重建狀態通常需要完整歷史，不能按 CDC 的方式壓縮。

採用事件溯源的系統通常會儲存由事件日誌匯出的狀態快照，以降低讀取與恢復成本；但快照本質上是效能最佳化。其核心假設仍是：原始事件可長期儲存，並在需要時可完整重放。我們將在“不變性的侷限性”中討論這一假設的邊界。

### 狀態、流和不變性 {#sec_stream_immutability}

我們在 [第十一章](/tw/ch11) 中看到，批處理因其輸入檔案不變性而受益良多，你可以在現有輸入檔案上執行實驗性處理作業，而不用擔心損壞它們。這種不變性原則也是使得事件溯源與資料變更捕獲如此強大的原因。

我們通常將資料庫視為應用程式當前狀態的儲存 —— 這種表示針對讀取進行了最佳化，而且通常對於服務查詢而言是最為方便的表示。狀態的本質是，它會變化，所以資料庫才會支援資料的增刪改。這又該如何匹配不變性呢？

只要你的狀態發生了變化，那麼這個狀態就是這段時間中事件修改的結果。例如，當前可用的座位列表是你已處理的預訂所產生的結果，當前帳戶餘額是帳戶中的借與貸的結果，而 Web 伺服器的響應時間圖，是所有已發生 Web 請求的獨立響應時間的聚合結果。

無論狀態如何變化，總是有一系列事件導致了這些變化。即使事情已經執行與回滾，這些事件出現是始終成立的。關鍵的想法是：可變的狀態與不可變事件的僅追加日誌相互之間並不矛盾：它們是一體兩面，互為陰陽的。所有變化的日誌 —— **變化日誌（changelog）**，表示了隨時間演變的狀態。

如果你傾向於數學表示，那麼你可能會說，應用狀態是事件流對時間求積分得到的結果，而變更流是狀態對時間求微分的結果，如 [圖 12-7](#fig_stream_state_derivative) 所示[^37] [^38]。這個比喻有一些侷限性（例如，狀態的二階導似乎沒有意義），但這是考慮資料的一個實用出發點。

$$
\begin{aligned}
state(now) &= \int_{t=0}^{now} stream(t)\,dt \\
stream(t) &= \frac{d\,state(t)}{dt}
\end{aligned}
$$

{{< figure src="/fig/ddia_1207.png" id="fig_stream_state_derivative" caption="圖 12-7. 應用當前狀態與事件流之間的關係。" class="w-full my-4" >}}

如果你持久儲存了變更日誌，那麼重現狀態就非常簡單。如果你將事件日誌視為記錄系統，而把可變狀態視為其派生結果，那麼系統中的資料流就更容易推理。正如 Jim Gray 和 Andreas Reuter 在 1992 年所說[^39]：

> 從原理上講，資料庫並非必需；日誌已經包含了全部資訊。之所以要保留資料庫（即日誌末端的當前狀態），只是為了提高讀取效能。

日誌壓縮（如 “[日誌壓縮](#sec_stream_log_compaction)” 中所述）是連線日誌與資料庫狀態之間的橋樑：它只保留每條記錄的最新版本，並丟棄被覆蓋的版本。

#### 不可變事件的優點 {#sec_stream_immutability_pros}

資料庫中的不變性是一個古老的概念。例如，會計在幾個世紀以來一直在財務記賬中應用不變性。一筆交易發生時，它被記錄在一個僅追加寫入的分類帳中，實質上是描述貨幣、商品或服務轉手的事件日誌。賬目，比如利潤、虧損、資產負債表，是從分類賬中的交易求和派生而來[^40]。

如果發生錯誤，會計師不會刪除或更改分類帳中的錯誤交易 —— 而是新增另一筆交易以補償錯誤，例如退還一筆不正確的費用。不正確的交易將永遠保留在分類帳中，對於審計而言可能非常重要。如果從不正確的分類賬派生出的錯誤數字已經公佈，那麼下一個會計週期的數字就會包括一個更正。這個過程在會計事務中是很常見的[^41]。

儘管這種可審計性在金融系統中尤其重要，但對於不受這種嚴格監管的許多其他系統，也是很有幫助的。如 “[批處理用例](/tw/ch11#sec_batch_output)” 中所討論的，如果你意外地部署了將錯誤資料寫入資料庫的錯誤程式碼，當程式碼會破壞性地覆寫資料時，恢復要困難得多。使用不可變事件的僅追加日誌，診斷問題與故障恢復就要容易得多。

不可變的事件也包含了比當前狀態更多的資訊。例如在購物網站上，顧客可以將物品新增到他們的購物車，然後再將其移除。雖然從履行訂單的角度，第二個事件取消了第一個事件，但對分析目的而言，知道客戶考慮過某個特定項而之後又反悔，可能是很有用的。也許他們會選擇在未來購買，或者他們已經找到了替代品。這個資訊被記錄在事件日誌中，但對於移出購物車就刪除記錄的資料庫而言，這個資訊在移出購物車時可能就丟失了。

#### 從同一事件日誌中派生多個檢視 {#sec_stream_deriving_views}

此外，透過從不變的事件日誌中分離出可變的狀態，你可以針對不同的讀取方式，從相同的事件日誌中派生出幾種不同的表現形式。效果就像一個流的多個消費者一樣（[圖 12-5](#fig_stream_cdc_flow)）：例如，Kafka Connect 能將來自 Kafka 的資料匯出到各種不同的資料庫與索引[^33]。這對於許多其他儲存和索引系統（如搜尋伺服器）來說也是有意義的，當系統要從分散式日誌中獲取輸入時尤其如此（請參閱 “[保持系統同步](#sec_stream_sync)”）。

新增從事件日誌到資料庫的顯式轉換，能夠使應用更容易地隨時間演進：如果你想要引入一個新功能，以新的方式表示現有資料，則可以使用事件日誌來構建一個單獨的、針對新功能的讀取最佳化檢視，無需修改現有系統而與之共存。並行執行新舊系統通常比在現有系統中執行複雜的模式遷移更容易。一旦不再需要舊的系統，你可以簡單地關閉它並回收其資源[^42] [^43]。

如果你不需要擔心如何查詢與訪問資料，那麼儲存資料通常是非常簡單的。模式設計、索引和儲存引擎的許多複雜性，都是希望支援某些特定查詢和訪問模式的結果（請參閱 [第三章](/tw/ch3)）。出於這個原因，透過將資料寫入的形式與讀取形式相分離，並允許幾個不同的讀取檢視，你能獲得很大的靈活性。這個想法有時被稱為 **命令查詢責任分離（command query responsibility segregation, CQRS）**[^44]。

資料庫和模式設計的傳統方法是基於這樣一種謬論，資料必須以與查詢相同的形式寫入。如果可以將資料從針對寫入最佳化的事件日誌轉換為針對讀取最佳化的應用狀態，那麼有關正規化和反正規化的爭論就變得無關緊要了（請參閱 “[多對一和多對多的關係](/tw/ch3#sec_datamodels_normalization)”）：在針對讀取最佳化的檢視中對資料進行反正規化是完全合理的，因為翻譯過程提供了使其與事件日誌保持一致的機制。

在 “[描述負載](/tw/ch2#sec_introduction_twitter)” 中，我們討論了推特主頁時間線，它是特定使用者關注的人群所發推特的快取（類似郵箱）。這是 **針對讀取最佳化的狀態** 的又一個例子：主頁時間線是高度反正規化的，因為你的推文與你所有粉絲的時間線都構成了重複。然而，扇出服務保持了這種重複狀態與新推特以及新關注關係的同步，從而保證了重複的可管理性。

#### 併發控制 {#sec_stream_concurrency}

事件溯源和資料變更捕獲的最大缺點是，事件日誌的消費者通常是非同步的，所以可能會出現這樣的情況：使用者會寫入日誌，然後從日誌派生檢視中讀取，結果發現他的寫入還沒有反映在讀取檢視中。我們之前在 “[讀己之寫](/tw/ch6#sec_replication_ryw)” 中討論了這個問題以及可能的解決方案。

一種解決方案是將事件追加到日誌時同步執行讀取檢視的更新。而將這些寫入操作合併為一個原子單元需要 **事務**，所以要麼將事件日誌和讀取檢視儲存在同一個儲存系統中，要麼就需要跨不同系統進行分散式事務。或者，你也可以使用在 “[使用共享日誌](/tw/ch10#sec_consistency_smr)” 中討論的方法。

另一方面，從事件日誌匯出當前狀態也簡化了併發控制的某些部分。許多對於多物件事務的需求（請參閱 “[單物件和多物件操作](/tw/ch8#sec_transactions_multi_object)”）源於單個使用者操作需要在多個不同的位置更改資料。透過事件溯源，你可以設計一個自包含的事件以表示一個使用者操作。然後使用者操作就只需要在一個地方進行單次寫入操作 —— 即將事件附加到日誌中 —— 這個還是很容易使原子化的。

如果事件日誌與應用狀態以相同的方式分割槽（例如，處理分割槽 3 中的客戶事件只需要更新分割槽 3 中的應用狀態），那麼直接使用單執行緒日誌消費者就不需要寫入併發控制了。它從設計上一次只處理一個事件（請參閱 “[真的序列執行](/tw/ch8#sec_transactions_serial)”）。日誌透過在分割槽中定義事件的序列順序，消除了併發性的不確定性[^27]。如果一個事件觸及多個狀態分割槽，那麼需要做更多的工作，我們將在 [第十三章](/tw/ch13) 討論。

#### 不變性的侷限性 {#sec_stream_immutability_limitations}

許多不使用事件溯源模型的系統也還是依賴不可變性：各種資料庫在內部使用不可變的資料結構或多版本資料來支援時間點快照（請參閱 “[索引和快照隔離](/tw/ch8#sec_transactions_snapshot_indexes)” ）。Git、Mercurial 和 Fossil 等版本控制系統也依靠不可變的資料來儲存檔案的版本歷史記錄。

永遠保持所有變更的不變歷史，在多大程度上是可行的？答案取決於資料集的流失率。一些工作負載主要是新增資料，很少更新或刪除；它們很容易保持不變。其他工作負載在相對較小的資料集上有較高的更新 / 刪除率；在這些情況下，不可變的歷史可能增至難以接受的巨大，碎片化可能成為一個問題，壓縮與垃圾收集的表現對於運維的穩健性變得至關重要[^45] [^46]。

除了效能方面的原因外，也可能有出於管理方面的原因需要刪除資料的情況，儘管這些資料都是不可變的。例如，隱私條例可能要求在使用者關閉帳戶後刪除他們的個人資訊，資料保護立法可能要求刪除錯誤的資訊，或者可能需要阻止敏感資訊的意外洩露。

在這種情況下，僅僅在日誌中新增另一個事件來指明先前的資料應該被視為刪除是不夠的 —— 你實際上是想改寫歷史，並假裝資料從一開始就沒有寫入。例如，Datomic 管這個特性叫 **切除（excision）**[^47]，而 Fossil 版本控制系統有一個類似的概念叫 **避免（shunning）**[^48]。

真正刪除資料是非常非常困難的[^49]，因為副本可能存在於很多地方：例如，儲存引擎、檔案系統和 SSD 通常會向新位置寫入，而不是原地覆蓋舊資料[^41]；而備份往往刻意設計為不可變，以防誤刪或損壞。

一種支援刪除不可變資料的方法是 **加密粉碎（crypto-shredding）**[^50]：將未來可能需要刪除的資料以加密形式儲存，刪除時僅銷燬金鑰。這樣，密文仍在，但不可再被使用。從某種意義上說，這只是把可變性從“資料本身”轉移到“金鑰管理”上。

此外，你需要預先決定哪些資料共享同一金鑰、哪些資料使用不同金鑰，因為後續你能“粉碎”的粒度通常是“該金鑰加密的全部資料”或“都不刪”，很難只刪其中一部分。若為每條記錄單獨存金鑰，金鑰儲存規模又會變得不可控。像 puncturable encryption 這樣的高階方案[^51]可以提供更細粒度的撤銷能力，但尚未廣泛落地。

總的來說，刪除更多是在“讓資料更難被取回”，而非“讓資料絕對不可恢復”。儘管如此，在某些場景下仍必須盡力而為，正如我們在 “[立法與自律](/tw/ch14#sec_future_legislation)” 中會看到的。


## 流處理 {#sec_stream_processing}

到目前為止，本章中我們已經討論了流的來源（使用者活動事件，感測器和寫入資料庫），我們討論了流如何傳輸（直接透過訊息傳送，透過訊息代理，透過事件日誌）。

剩下的就是討論一下你可以用流做什麼 —— 也就是說，你可以處理它。一般來說，有三種選項：

1. 你可以將事件中的資料寫入資料庫、快取、搜尋索引或類似的儲存系統，然後能被其他客戶端查詢。如 [圖 12-5](#fig_stream_cdc_flow) 所示，這是資料庫與系統其他部分所發生的變更保持同步的好方法 —— 特別是當流消費者是寫入資料庫的唯一客戶端時。如 “[批處理工作流的輸出](/tw/ch11#sec_batch_output)” 中所討論的，它是寫入儲存系統的流等價物。
2. 你能以某種方式將事件推送給使用者，例如傳送報警郵件或推送通知，或將事件流式傳輸到可即時顯示的儀表板上。在這種情況下，人是流的最終消費者。
3. 你可以處理一個或多個輸入流，並產生一個或多個輸出流。流可能會經過由幾個這樣的處理階段組成的流水線，最後再輸出（選項 1 或 2）。

在本章的剩餘部分中，我們將討論選項 3：處理流以產生其他派生流。處理這樣的流的程式碼片段，被稱為 **運算元（operator）** 或 **作業（job）**。它與我們在 [第十一章](/tw/ch11) 中討論過的 Unix 程序和 MapReduce 作業密切相關，資料流的模式是相似的：一個流處理器以只讀的方式使用輸入流，並將其輸出以僅追加的方式寫入一個不同的位置。

流處理中的分割槽和並行化模式也非常類似於 [第十一章](/tw/ch11) 中介紹的 MapReduce 和資料流引擎，因此我們不再重複這些主題。基本的 Map 操作（如轉換和過濾記錄）也是一樣的。

與批次作業相比的一個關鍵區別是，流不會結束。這種差異會帶來很多隱含的結果。正如本章開始部分所討論的，排序對無界資料集沒有意義，因此無法使用 **排序合併連線**（請參閱 “[Reduce 側連線與分組](/tw/ch11#sec_batch_join)”）。容錯機制也必須改變：對於已經運行了幾分鐘的批處理作業，可以簡單地從頭開始重啟失敗任務，但是對於已經執行數年的流作業，重啟後從頭開始跑可能並不是一個可行的選項。

### 流處理的應用 {#sec_stream_uses}

長期以來，流處理一直用於監控目的，如果某個事件發生，組織希望能得到警報。例如：

* 欺詐檢測系統需要確定信用卡的使用模式是否有意外地變化，如果信用卡可能已被盜刷，則鎖卡。
* 交易系統需要檢查金融市場的價格變化，並根據指定的規則進行交易。
* 製造系統需要監控工廠中機器的狀態，如果出現故障，可以快速定位問題。
* 軍事和情報系統需要跟蹤潛在侵略者的活動，並在出現襲擊徵兆時發出警報。

這些型別的應用需要非常精密複雜的模式匹配與相關檢測。然而隨著時代的進步，流處理的其他用途也開始出現。在本節中，我們將簡要比較一下這些應用。

#### 複合事件處理 {#id317}

**複合事件處理（complex event processing, CEP）** 是 20 世紀 90 年代為分析事件流而開發出的一種方法，尤其適用於需要搜尋某些事件模式的應用[^52]。與正則表示式允許你在字串中搜索特定字元模式的方式類似，CEP 允許你指定規則以在流中搜索某些事件模式。

CEP 系統通常使用高層次的宣告式查詢語言，比如 SQL，或者圖形使用者介面，來描述應該檢測到的事件模式。這些查詢被提交給處理引擎，該引擎消費輸入流，並在內部維護一個執行所需匹配的狀態機。當發現匹配時，引擎發出一個 **複合事件**（即 complex event，CEP 因此得名），並附有檢測到的事件模式詳情[^53]。

在這些系統中，查詢和資料之間的關係與普通資料庫相比是顛倒的。通常情況下，資料庫會持久儲存資料，並將查詢視為臨時的：當查詢進入時，資料庫搜尋與查詢匹配的資料，然後在查詢完成時丟掉查詢。CEP 引擎反轉了角色：查詢是長期儲存的，來自輸入流的事件不斷流過它們，搜尋匹配事件模式的查詢[^54]。

CEP 的實現包括 Esper、Apama 和 TIBCO StreamBase。像 Flink 和 Spark Streaming 這樣的分散式流處理框架，也支援在流上使用 SQL 進行宣告式查詢。

#### 流分析 {#id318}

使用流處理的另一個領域是對流進行分析。CEP 與流分析之間的邊界是模糊的，但一般來說，分析往往對找出特定事件序列並不關心，而更關注大量事件上的聚合與統計指標 —— 例如：

* 測量某種型別事件的速率（每個時間間隔內發生的頻率）
* 滾動計算一段時間視窗內某個值的平均值
* 將當前的統計值與先前的時間區間的值對比（例如，檢測趨勢，當指標與上週同比異常偏高或偏低時報警）

這些統計值通常是在固定時間區間內進行計算的，例如，你可能想知道在過去 5 分鐘內服務每秒查詢次數的均值，以及此時間段內響應時間的第 99 百分位點。在幾分鐘內取平均，能抹平秒和秒之間的無關波動，且仍然能向你展示流量模式的時間圖景。聚合的時間間隔稱為 **視窗（window）**，我們將在 “[時間推理](#sec_stream_time)” 中更詳細地討論視窗。

流分析系統有時會使用機率演算法，例如 Bloom filter（我們在 “[效能最佳化](/tw/ch4#sec_storage_bloom_filter)” 中遇到過）來管理成員資格，HyperLogLog[^55]用於基數估計以及各種百分位點估計演算法（請參閱 “[實踐中的百分位點](/tw/ch2#sec_introduction_percentiles)”）。機率演算法產出近似的結果，但比起精確演算法的優點是記憶體使用要少得多。使用近似演算法有時讓人們覺得流處理系統總是有損的和不精確的，但這是錯誤看法：流處理並沒有任何內在的近似性，而機率演算法只是一種最佳化[^56]。

許多開源分散式流處理框架都是針對分析設計的：例如 Apache Storm、Spark Streaming、Flink、Samza、Apache Beam 和 Kafka Streams[^57]。託管服務包括 Google Cloud Dataflow 和 Azure Stream Analytics。

#### 維護物化檢視 {#sec_stream_mat_view}

我們在 “[資料庫與流](#sec_stream_databases)” 中看到，資料庫的變更流可以用於維護派生資料系統（如快取、搜尋索引和資料倉庫），並使其與源資料庫保持最新。我們可以將這些示例視作維護 **物化檢視（materialized view）** 的一種具體場景：在某個資料集上派生出一個替代檢視以便高效查詢，並在底層資料變更時更新檢視[^37]。

同樣，在事件溯源中，應用程式的狀態是透過應用事件日誌來維護的；這裡的應用程式狀態也是一種物化檢視。與流分析場景不同的是，僅考慮某個時間視窗內的事件通常是不夠的：構建物化檢視可能需要任意時間段內的 **所有** 事件，除了那些可能由日誌壓縮丟棄的過時事件（請參閱 “[日誌壓縮](#sec_stream_log_compaction)”）。實際上，你需要一個可以一直延伸到時間開端的視窗。

原則上講，任何流處理元件都可以用於維護物化檢視，儘管 “永遠執行” 與一些面向分析的框架假設的 “主要在有限時間段視窗上執行” 背道而馳。Kafka Streams 和 Confluent 的 ksqlDB 支援這種用法，建立在 Kafka 對日誌壓縮的支援上[^58]。

> [!TIP] 增量檢視維護
> 資料庫看起來很適合做物化檢視維護：它們本來就擅長儲存完整資料副本，也常常支援物化檢視。
>
> 但很多資料庫重新整理物化檢視仍依賴批處理或按需觸發（例如 PostgreSQL 的 `REFRESH MATERIALIZED VIEW`），而不是在源資料變化時做增量維護。這會帶來兩個問題：
>
> 1. 效率低：每次重新整理都重算全量資料，而不是隻處理變化部分[^38] [^59] [^60]。
> 2. 不夠即時：重新整理間隔內的變化不會立刻反映在檢視裡。
>
> Materialize、RisingWave、ClickHouse、Feldera 等系統都在探索更即時的增量維護路徑[^61]。

#### 在流上搜索 {#id320}

除了允許搜尋由多個事件構成模式的 CEP 外，有時也存在基於複雜標準（例如全文檢索查詢）來搜尋單個事件的需求。

例如，媒體監測服務可以訂閱新聞文章 Feed 與來自媒體的播客，搜尋任何關於公司、產品或感興趣的話題的新聞。這是透過預先構建一個搜尋查詢來完成的，然後不斷地將新聞項的流與該查詢進行匹配。在一些網站上也有類似的功能：例如，當市場上出現符合其搜尋條件的新房產時，房地產網站的使用者可以要求網站通知他們。Elasticsearch 的 percolator 功能，是實現這種流搜尋的一種選擇[^62]。

傳統的搜尋引擎首先索引文件，然後在索引上跑查詢。相比之下，搜尋一個資料流則反了過來：查詢被儲存下來，文件從查詢中流過，就像在 CEP 中一樣。最簡單的情況就是，你可以為每個文件測試每個查詢。但是如果你有大量查詢，這可能會變慢。為了最佳化這個過程，可以像對文件一樣，為查詢建立索引，從而收窄可能匹配的查詢集合[^63]。

#### 事件驅動架構與 RPC {#sec_stream_actors_drpc}

在 “[訊息傳遞中的資料流](/tw/ch5#sec_encoding_dataflow_msg)” 中我們討論過，訊息傳遞系統可以作為 RPC 的替代方案，即作為一種服務間通訊的機制，比如在 Actor 模型中所使用的那樣。儘管這些系統也是基於訊息和事件，但我們通常不會將其視作流處理元件：

* Actor 框架主要是管理模組通訊的併發和分散式執行的一種機制，而流處理主要是一種資料管理技術。
* Actor 之間的交流往往是短暫的、一對一的；而事件日誌則是持久的、多訂閱者的。
* Actor 可以以任意方式進行通訊（包括迴圈的請求 / 響應模式），但流處理通常配置在無環流水線中，其中每個流都是一個特定作業的輸出，由良好定義的輸入流中派生而來。

也就是說，RPC 類系統與流處理之間有一些交叉領域。例如，Apache Storm 有一個稱為 **分散式 RPC** 的功能，它允許將使用者查詢分散到一系列也處理事件流的節點上；然後這些查詢與來自輸入流的事件交織，而結果可以被彙總並發回給使用者（另請參閱 “[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)”）。

也可以使用 Actor 框架來處理流。但是，很多這樣的框架在崩潰時不能保證訊息的傳遞，除非你實現了額外的重試邏輯，否則這種處理不是容錯的。

### 時間推理 {#sec_stream_time}

流處理通常需要與時間打交道，尤其是用於分析目的時候，會頻繁使用時間視窗，例如 “過去五分鐘的平均值”。“過去五分鐘” 的含義看上去似乎是清晰而無歧義的，但不幸的是，這個概念非常棘手。

在批處理過程中，大量的歷史事件被快速地處理。如果需要按時間來分析，批處理器需要檢查每個事件中嵌入的時間戳。讀取執行批處理機器的系統時鐘沒有任何意義，因為處理執行的時間與事件實際發生的時間無關。

批處理可以在幾分鐘內讀取一年的歷史事件；在大多數情況下，感興趣的時間線是歷史中的一年，而不是處理中的幾分鐘。而且使用事件中的時間戳，使得處理是 **確定性** 的：在相同的輸入上再次執行相同的處理過程會得到相同的結果。

另一方面，許多流處理框架使用處理機器上的本地系統時鐘（**處理時間**，即 processing time）來確定 **視窗（windowing）**[^64]。這種方法的優點是簡單，如果事件建立與事件處理之間的延遲可以忽略不計，那也是合理的。然而，如果存在任何顯著的處理延遲 —— 即，事件處理顯著地晚於事件實際發生的時間，這種處理方式就失效了。

#### 事件時間與處理時間 {#id322}

很多原因都可能導致處理延遲：排隊，網路故障（請參閱 “[不可靠的網路](/tw/ch9#sec_distributed_networks)”），效能問題導致訊息代理 / 訊息處理器出現爭用，流消費者重啟，從故障中恢復時重新處理過去的事件（請參閱 “[重播舊訊息](#sec_stream_replay)”），或者在修復程式碼 BUG 之後。

而且，訊息延遲還可能導致無法預測訊息順序。例如，假設使用者首先發出一個 Web 請求（由 Web 伺服器 A 處理），然後發出第二個請求（由伺服器 B 處理）。A 和 B 發出描述它們所處理請求的事件，但是 B 的事件先於 A 的事件到達訊息代理。現在，流處理器將首先看到 B 事件，然後看到 A 事件，即使它們實際上是以相反的順序發生的。

有一個類比也許能幫助理解，“星球大戰” 電影：第四集於 1977 年發行，第五集於 1980 年，第六集於 1983 年，緊隨其後的是 1999 年的第一集、2002 年的第二集、2005 年的第三集，以及 2015 年、2017 年和 2019 年的第七至第九集[^65]。如果你按照它們上映的順序觀看電影，你處理電影的順序與它們敘事的順序就是不一致的。（集數編號就像事件時間戳，而你觀看電影的日期就是處理時間）作為人類，我們能夠應對這種不連續性，但是流處理演算法需要專門編寫，以適應這種時序與順序的問題。

將事件時間和處理時間搞混會導致錯誤的資料。例如，假設你有一個流處理器用於測量請求速率（計算每秒請求數）。如果你重新部署流處理器，它可能會停止一分鐘，並在恢復之後處理積壓的事件。如果你按處理時間來衡量速率，那麼在處理積壓日誌時，請求速率看上去就像有一個異常的突發尖峰，而實際上請求速率是穩定的（[圖 12-8](#fig_stream_processing_time_skew)）。

{{< figure src="/fig/ddia_1208.png" id="fig_stream_processing_time_skew" caption="圖 12-8. 按處理時間分窗，會因為處理速率的變動引入人為因素。" class="w-full my-4" >}}

#### 處理滯留事件 {#id323}

用事件時間來定義視窗的一個棘手的問題是，你永遠也無法確定是不是已經收到了特定視窗的所有事件，還是說還有一些事件正在來的路上。

例如，假設你將事件分組為一分鐘的視窗，以便統計每分鐘的請求數。你已經計數了一些帶有本小時內第 37 分鐘時間戳的事件，時間流逝，現在進入的主要都是本小時內第 38 和第 39 分鐘的事件。什麼時候才能宣佈你已經完成了第 37 分鐘的視窗計數，並輸出其計數器值？

在一段時間沒有看到任何新的事件之後，你可以超時並宣佈一個視窗已經就緒，但仍然可能發生這種情況：某些事件被緩衝在另一臺機器上，由於網路中斷而延遲。你需要能夠處理這種在視窗宣告完成之後到達的 **滯留（straggler）** 事件。大體上，你有兩種選擇[^1]：

1. 忽略這些滯留事件，因為在正常情況下它們可能只是事件中的一小部分。你可以將丟棄事件的數量作為一個監控指標，並在出現大量丟訊息的情況時報警。
2. 釋出一個 **更正（correction）**，一個包括滯留事件的更新視窗值。你可能還需要收回以前的輸出。

在某些情況下，可以使用特殊的訊息來指示 “從現在開始，不會有比 t 更早時間戳的訊息了”，消費者可以使用它來觸發視窗[^66]。但是，如果不同機器上的多個生產者都在生成事件，每個生產者都有自己的最小時間戳閾值，則消費者需要分別跟蹤每個生產者。在這種情況下，新增和刪除生產者都是比較棘手的。

#### 你用的是誰的時鐘？ {#id438}

當事件可能在系統內多個地方進行緩衝時，為事件分配時間戳更加困難了。例如，考慮一個移動應用向伺服器上報關於用量的事件。該應用可能會在裝置處於離線狀態時被使用，在這種情況下，它將在裝置本地緩衝事件，並在下一次網際網路連線可用時向伺服器上報這些事件（可能是幾小時甚至幾天）。對於這個流的任意消費者而言，它們就如延遲極大的滯留事件一樣。

在這種情況下，事件上的時間戳實際上應當是使用者互動發生的時間，取決於移動裝置的本地時鐘。然而使用者控制的裝置上的時鐘通常是不可信的，因為它可能會被無意或故意設定成錯誤的時間（請參閱 “[時鐘同步與準確性](/tw/ch9#sec_distributed_clock_accuracy)”）。伺服器收到事件的時間（取決於伺服器的時鐘）可能是更準確的，因為伺服器在你的控制之下，但在描述使用者互動方面意義不大。

要校正不正確的裝置時鐘，一種方法是記錄三個時間戳[^67]：

* 事件發生的時間，取決於裝置時鐘
* 事件傳送往伺服器的時間，取決於裝置時鐘
* 事件被伺服器接收的時間，取決於伺服器時鐘

透過從第三個時間戳中減去第二個時間戳，可以估算裝置時鐘和伺服器時鐘之間的偏移（假設網路延遲與所需的時間戳精度相比可忽略不計）。然後可以將該偏移應用於事件時間戳，從而估計事件實際發生的真實時間（假設裝置時鐘偏移在事件發生時與送往伺服器之間沒有變化）。

這並不是流處理獨有的問題，批處理有著完全一樣的時間推理問題。只是在流處理的上下文中，我們更容易意識到時間的流逝。

#### 視窗的型別 {#id324}

當你知道如何確定一個事件的時間戳後，下一步就是如何定義時間段的視窗。然後視窗就可以用於聚合，例如事件計數，或計算視窗內值的平均值。有幾種視窗很常用[^64] [^68]：

滾動視窗（Tumbling Window）
: 滾動視窗有著固定的長度，每個事件都僅能屬於一個視窗。例如，假設你有一個 1 分鐘的滾動視窗，則所有時間戳在 `10:03:00` 和 `10:03:59` 之間的事件會被分組到一個視窗中，`10:04:00` 和 `10:04:59` 之間的事件被分組到下一個視窗，依此類推。透過將每個事件時間戳向下取整到所在的分鐘來確定它所屬的視窗，可以實現 1 分鐘的滾動視窗。

跳動視窗（Hopping Window）
: 跳動視窗也有著固定的長度，但允許視窗重疊以提供一些平滑。例如，一個帶有 1 分鐘跳躍步長的 5 分鐘視窗將包含 `10:03:00` 至 `10:07:59` 之間的事件，而下一個視窗將覆蓋 `10:04:00` 至 `10:08:59` 之間的事件，等等。透過首先計算 1 分鐘的滾動視窗（tumbling window），然後在幾個相鄰視窗上進行聚合，可以實現這種跳動視窗。

滑動視窗（Sliding Window）
: 滑動視窗包含了彼此間距在特定時長內的所有事件。例如，一個 5 分鐘的滑動視窗應當覆蓋 `10:03:39` 和 `10:08:12` 的事件，因為它們相距不超過 5 分鐘（注意 5 分鐘的滾動視窗與跳動視窗可能不會把這兩個事件分組到同一個視窗中，因為它們使用固定的邊界）。透過維護一個按時間排序的事件緩衝區，並不斷從視窗中移除過期的舊事件，可以實現滑動視窗。

會話視窗（Session window）
: 與其他視窗型別不同，會話視窗沒有固定的持續時間，而定義為：將同一使用者出現時間相近的所有事件分組在一起，而當用戶一段時間沒有活動時（例如，如果 30 分鐘內沒有事件）視窗結束。會話切分是網站分析的常見需求（請參閱 “[JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)”）。

### 流連線 {#sec_stream_joins}

在 [第十一章](/tw/ch11) 中，我們討論了批處理作業如何透過鍵來連線資料集，以及這種連線是如何成為資料管道的重要組成部分的。由於流處理將資料管道泛化為對無限資料集進行增量處理，因此對流進行連線的需求也是完全相同的。

然而，新事件隨時可能出現在一個流中，這使得流連線要比批處理連線更具挑戰性。為了更好地理解情況，讓我們先來區分三種不同型別的連線：**流 - 流** 連線，**流 - 表** 連線，與 **表 - 表** 連線。我們將在下面的章節中透過例子來說明。

#### 流流連線（視窗連線） {#id440}

假設你的網站上有搜尋功能，而你想要找出搜尋 URL 的近期趨勢。每當有人鍵入搜尋查詢時，都會記錄下一個包含查詢與其返回結果的事件。每當有人點選其中一個搜尋結果時，就會記錄另一個點選事件。為了計算搜尋結果中每個 URL 的點選率，你需要將搜尋動作與點選動作的事件連在一起，這些事件透過相同的會話 ID 進行連線。廣告系統中需要類似的分析[^69]。

如果使用者丟棄了搜尋結果，點選可能永遠不會發生，即使它出現了，搜尋與點選之間的時間可能是高度可變的：在很多情況下，它可能是幾秒鐘，但也可能長達幾天或幾周（如果使用者執行搜尋，忘掉了這個瀏覽器頁面，過了一段時間後重新回到這個瀏覽器頁面上，並點選了一個結果）。由於可變的網路延遲，點選事件甚至可能先於搜尋事件到達。你可以選擇合適的連線視窗 —— 例如，如果點選與搜尋之間的時間間隔在一小時內，你可能會選擇連線兩者。

請注意，在點選事件中嵌入搜尋詳情與事件連線並不一樣：這樣做的話，只有當用戶點選了一個搜尋結果時你才能知道，而那些沒有點選的搜尋就無能為力了。為了衡量搜尋質量，你需要準確的點選率，為此搜尋事件和點選事件兩者都是必要的。

為了實現這種型別的連線，流處理器需要維護 **狀態**：例如，按會話 ID 索引最近一小時內發生的所有事件。無論何時發生搜尋事件或點選事件，都會被新增到合適的索引中，而流處理器也會檢查另一個索引是否有具有相同會話 ID 的事件到達。如果有匹配事件就會發出一個表示搜尋結果被點選的事件；如果搜尋事件直到過期都沒看見有匹配的點選事件，就會發出一個表示搜尋結果未被點選的事件。

#### 流表連線（流擴充） {#sec_stream_table_joins}

在 “[示例：使用者活動事件分析](/tw/ch11#sec_batch_join)”（[圖 11-2](/tw/ch11#fig_batch_join_example)）中，我們看到了連線兩個資料集的批處理作業示例：一組使用者活動事件和一個使用者檔案資料庫。將使用者活動事件視為流，並在流處理器中連續執行相同的連線是很自然的想法：輸入是包含使用者 ID 的活動事件流，而輸出還是活動事件流，但其中使用者 ID 已經被擴充套件為使用者的檔案資訊。這個過程有時被稱為使用資料庫的資訊來 **擴充（enriching）** 活動事件。

要執行此連線，流處理器需要一次處理一個活動事件，在資料庫中查詢事件的使用者 ID，並將檔案資訊新增到活動事件中。資料庫查詢可以透過查詢遠端資料庫來實現。但正如在 “[示例：使用者活動事件分析](/tw/ch11#sec_batch_join)” 一節中討論的，此類遠端查詢可能會很慢，並且有可能導致資料庫過載[^58]。

另一種方法是將資料庫副本載入到流處理器中，以便在本地進行查詢而無需網路往返。這種技術與我們在 “[JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)” 中討論的雜湊連線非常相似：如果資料庫的本地副本足夠小，則可以是記憶體中的散列表，比較大的話也可以是本地磁碟上的索引。

與批處理作業的區別在於，批處理作業使用資料庫的時間點快照作為輸入，而流處理器是長時間執行的，且資料庫的內容可能隨時間而改變，所以流處理器資料庫的本地副本需要保持更新。這個問題可以透過資料變更捕獲來解決：流處理器可以訂閱使用者檔案資料庫的更新日誌，如同活動事件流一樣。當增添或修改檔案時，流處理器會更新其本地副本。因此，我們有了兩個流之間的連線：活動事件和檔案更新。

流表連線實際上非常類似於流流連線；最大的區別在於對於表的變更日誌流，連線使用了一個可以回溯到 “時間起點” 的視窗（概念上是無限的視窗），新版本的記錄會覆蓋更早的版本。對於輸入的流，連線可能壓根兒就沒有維護任何視窗。

#### 表表連線（維護物化檢視） {#id326}

我們在 “[描述負載](/tw/ch2#sec_introduction_twitter)” 中討論推特時間線的例子時說過，當使用者想要檢視他們的主頁時間線時，迭代使用者所關注人群的推文並合併它們是一個開銷巨大的操作。

相反，我們需要一個時間線快取：一種每個使用者的 “收件箱”，在傳送推文的時候寫入這些資訊，因而讀取時間線時只需要簡單地查詢即可。物化與維護這個快取需要處理以下事件：

* 當用戶 u 傳送新的推文時，它將被新增到每個關注使用者 u 的時間線上。
* 使用者刪除推文時，推文將從所有使用者的時間線中刪除。
* 當用戶 *u*~1~ 開始關注使用者 *u*~2~ 時，*u*~2~ 最近的推文將被新增到 *u*~1~ 的時間線上。
* 當用戶 *u*~1~ 取消關注使用者 *u*~2~ 時，*u*~2~ 的推文將從 *u*~1~ 的時間線中移除。

要在流處理器中實現這種快取維護，你需要推文事件流（傳送與刪除）和關注關係事件流（關注與取消關注）。流處理器需要維護一個資料庫，包含每個使用者的粉絲集合，以便知道當一條新推文到達時，需要更新哪些時間線。

觀察這個流處理過程的另一種視角是：它維護了一個連線了兩個表（推文與關注）的物化檢視，如下所示：

```sql
SELECT follows.follower_id AS timeline_id,
    array_agg(tweets.* ORDER BY tweets.timestamp DESC)
FROM tweets
JOIN follows ON follows.followee_id = tweets.sender_id
GROUP BY follows.follower_id
```

流連線直接對應於這個查詢中的表連線。時間線實際上是這個查詢結果的快取，每當底層的表發生變化時都會更新。

> [!NOTE]
> 如果你將流視作表的導數（如 [圖 12-7](#fig_stream_state_derivative) 所示），並把連線看作兩個表 *u·v* 的乘積，那麼會出現一個有趣現象：物化連線的變化流遵循乘積法則 \( (u \cdot v)' = u'v + uv' \)。換句話說，任何推文變化都要和當前關注關係連線，任何關注關係變化都要和當前推文連線[^37]。

#### 連線的時間依賴性 {#sec_stream_join_time}

這裡描述的三種連線（流流，流表，表表）有很多共通之處：它們都需要流處理器維護連線一側的一些狀態（搜尋與點選事件，使用者檔案，關注列表），然後當連線另一側的訊息到達時查詢該狀態。

用於維護狀態的事件順序是很重要的（先關注然後取消關注，或者其他類似操作）。在分割槽日誌中，單個分割槽內的事件順序是保留下來的。但典型情況下是沒有跨流或跨分割槽的順序保證的。

這就產生了一個問題：如果不同流中的事件發生在近似的時間範圍內，則應該按照什麼樣的順序進行處理？在流表連線的例子中，如果使用者更新了他們的檔案，哪些活動事件與舊檔案連線（在檔案更新前處理），哪些又與新檔案連線（在檔案更新之後處理）？換句話說：你需要對一些狀態做連線，如果狀態會隨著時間推移而變化，那應當使用什麼時間點來連線呢？

這種時序依賴可能出現在很多地方。例如銷售東西需要對發票應用適當的稅率，這取決於所處的國家 / 州，產品型別，銷售日期（因為稅率時不時會變化）。當連線銷售額與稅率表時，你可能期望的是使用銷售時的稅率參與連線。如果你正在重新處理歷史資料，銷售時的稅率可能和現在的稅率有所不同。

如果跨越流的事件順序是未定的，則連線會變為不確定性的[^70]，這意味著你在同樣輸入上重跑相同的作業未必會得到相同的結果：當你重跑任務時，輸入流上的事件可能會以不同的方式交織。

在資料倉庫中，這個問題被稱為 **緩慢變化的維度（slowly changing dimension, SCD）**，通常透過對特定版本的記錄使用唯一的識別符號來解決：例如，每當稅率改變時都會獲得一個新的識別符號，而發票在銷售時會帶有稅率的識別符號[^71] [^72]。這種變化使連線變為確定性的，但也會導致日誌壓縮無法進行：表中所有的記錄版本都需要保留。

### 容錯 {#sec_stream_fault_tolerance}

在本章的最後一節中，讓我們看一看流處理是如何容錯的。我們在 [第十一章](/tw/ch11) 中看到，批處理框架可以很容易地容錯：如果 MapReduce 作業中的任務失敗，可以簡單地在另一臺機器上再次啟動，並且丟棄失敗任務的輸出。這種透明的重試是可能的，因為輸入檔案是不可變的，每個任務都將其輸出寫入到 HDFS 上的獨立檔案中，而輸出僅當任務成功完成後可見。

特別是，批處理容錯方法可確保批處理作業的輸出與沒有出錯的情況相同，即使實際上某些任務失敗了。看起來好像每條輸入記錄都被處理了恰好一次 —— 沒有記錄被跳過，而且沒有記錄被處理兩次。儘管重啟任務意味著實際上可能會多次處理記錄，但輸出中的可見效果看上去就像只處理過一次。這個原則被稱為 **恰好一次語義（exactly-once semantics）**，儘管 **等效一次（effectively-once）** 可能會是一個更寫實的術語[^73]。

在流處理中也出現了同樣的容錯問題，但是處理起來沒有那麼直觀：等待某個任務完成之後再使其輸出可見並不是一個可行選項，因為你永遠無法處理完一個無限的流。

#### 微批次與存檔點 {#id329}

一個解決方案是將流分解成小塊，並像微型批處理一樣處理每個塊。這種方法被稱為 **微批次（microbatching）**，它被用於 Spark Streaming[^74]。批次的大小通常約為 1 秒，這是對效能妥協的結果：較小的批次會導致更大的排程與協調開銷，而較大的批次意味著流處理器結果可見之前的延遲要更長。

微批次也隱式提供了一個與批次大小相等的滾動視窗（按處理時間而不是事件時間戳分窗）。任何需要更大視窗的作業都需要顯式地將狀態從一個微批次轉移到下一個微批次。

Apache Flink 則使用不同的方法，它會定期生成狀態的滾動存檔點並將其寫入持久儲存[^75] [^76]。如果流運算元崩潰，它可以從最近的存檔點重啟，並丟棄從最近存檔點到崩潰之間的所有輸出。存檔點會由訊息流中的 **壁障（barrier）** 觸發，類似於微批次之間的邊界，但不會強制一個特定的視窗大小。

在流處理框架的範圍內，微批次與存檔點方法提供了與批處理一樣的 **恰好一次語義**。但是，只要輸出離開流處理器（例如，寫入資料庫，向外部訊息代理傳送訊息，或傳送電子郵件），框架就無法拋棄失敗批次的輸出了。在這種情況下，重啟失敗任務會導致外部副作用發生兩次，只有微批次或存檔點不足以阻止這一問題。

#### 原子提交再現 {#sec_stream_atomic_commit}

為了在出現故障時表現出恰好處理一次的樣子，我們需要確保事件處理的所有輸出和副作用 **當且僅當** 處理成功時才會生效。這些影響包括傳送給下游運算元或外部訊息傳遞系統（包括電子郵件或推送通知）的任何訊息，任何資料庫寫入，對運算元狀態的任何變更，以及對輸入訊息的任何確認（包括在基於日誌的訊息代理中將消費者偏移量前移）。

這些事情要麼都原子地發生，要麼都不發生，但是它們不應當失去同步。如果這種方法聽起來很熟悉，那是因為我們在分散式事務和兩階段提交的上下文中討論過它（請參閱 “[恰好一次的訊息處理](/tw/ch8#sec_transactions_exactly_once)”）。

在 [第十章](/tw/ch10) 中，我們討論了分散式事務傳統實現中的問題（如 XA）。然而在限制更為嚴苛的環境中，也是有可能高效實現這種原子提交機制的。Google Cloud Dataflow[^66] [^75]、VoltDB[^77] 和 Apache Kafka[^78] [^79] 中都使用了這種方法。與 XA 不同，這些實現不會嘗試跨異構技術提供事務，而是透過在流處理框架中同時管理狀態變更與訊息傳遞來內化事務。事務協議的開銷可以透過在單個事務中處理多個輸入訊息來分攤。

#### 冪等性 {#sec_stream_idempotence}

我們的目標是丟棄任何失敗任務的部分輸出，以便能安全地重試，而不會生效兩次。分散式事務是實現這個目標的一種方式，而另一種方式是依賴 **冪等性（idempotence）**[^80]。

冪等操作是多次重複執行與單次執行效果相同的操作。例如，將鍵值儲存中的某個鍵設定為某個特定值是冪等的（再次寫入該值，只是用同樣的值替代），而遞增一個計數器不是冪等的（再次執行遞增意味著該值遞增兩次）。

即使一個操作不是天生冪等的，往往可以透過一些額外的元資料做成冪等的。例如，在使用來自 Kafka 的訊息時，每條訊息都有一個持久的、單調遞增的偏移量。將值寫入外部資料庫時可以將這個偏移量帶上，這樣你就可以判斷一條更新是不是已經執行過了，因而避免重複執行。

Storm 的 Trident 基於類似的想法來處理狀態。依賴冪等性意味著隱含了一些假設：重啟一個失敗的任務必須以相同的順序重播相同的訊息（基於日誌的訊息代理能做這些事），處理必須是確定性的，沒有其他節點能同時更新相同的值[^81] [^82]。

當從一個處理節點故障切換到另一個節點時，可能需要進行 **防護**（fencing，請參閱 “[領導者和鎖](/tw/ch9#sec_distributed_lock_fencing)”），以防止被假死節點干擾。儘管有這麼多注意事項，冪等操作是一種實現 **恰好一次語義** 的有效方式，僅需很小的額外開銷。

#### 失敗後重建狀態 {#sec_stream_state_fault_tolerance}

任何需要狀態的流處理 —— 例如，任何視窗聚合（例如計數器，平均值和直方圖）以及任何用於連線的表和索引，都必須確保在失敗之後能恢復其狀態。

一種選擇是將狀態儲存在遠端資料儲存中，並進行復制，然而正如在 “[流表連線（流擴充）](#sec_stream_table_joins)” 中所述，每個訊息都要查詢遠端資料庫可能會很慢。另一種方法是在流處理器本地儲存狀態，並定期複製。然後當流處理器從故障中恢復時，新任務可以讀取狀態副本，恢復處理而不丟失資料。

例如，Flink 定期捕獲運算元狀態的快照，並將它們寫入 HDFS 等持久儲存中[^75] [^76]。Kafka Streams 透過將狀態變更傳送到具有日誌壓縮功能的專用 Kafka 主題來複制狀態變更，這與資料變更捕獲類似[^83]。VoltDB 透過在多個節點上對每個輸入訊息進行冗餘處理來複制狀態（請參閱 “[真的序列執行](/tw/ch8#sec_transactions_serial)”）。

在某些情況下，甚至可能都不需要複製狀態，因為它可以從輸入流重建。例如，如果狀態是從相當短的視窗中聚合而成，則簡單地重播該視窗中的輸入事件可能是足夠快的。如果狀態是透過資料變更捕獲來維護的資料庫的本地副本，那麼也可以從日誌壓縮的變更流中重建資料庫（請參閱 “[日誌壓縮](#sec_stream_log_compaction)”）。

然而，所有這些權衡取決於底層基礎架構的效能特徵：在某些系統中，網路延遲可能低於磁碟訪問延遲，網路頻寬也可能與磁碟頻寬相當。沒有針對所有情況的普適理想權衡，隨著儲存和網路技術的發展，本地狀態與遠端狀態的優點也可能會互換。


## 本章小結 {#id332}

在本章中，我們討論了事件流，它們所服務的目的，以及如何處理它們。在某些方面，流處理非常類似於在 [第十一章](/tw/ch11) 中討論的批處理，不過是在無限的（永無止境的）流而不是固定大小的輸入上持續進行[^84]。從這個角度來看，訊息代理和事件日誌可以視作檔案系統的流式等價物。

我們花了一些時間比較兩種訊息代理：

AMQP/JMS 風格的訊息代理
: 代理將單條訊息分配給消費者，消費者在成功處理單條訊息後確認訊息。訊息被確認後從代理中刪除。這種方法適合作為一種非同步形式的 RPC（另請參閱 “[事件驅動的架構](/tw/ch5#sec_encoding_dataflow_msg)”），例如在任務佇列中，訊息處理的確切順序並不重要，而且訊息在處理完之後，不需要回頭重新讀取舊訊息。

基於日誌的訊息代理
: 代理將一個分割槽中的所有訊息分配給同一個消費者節點，並始終以相同的順序傳遞訊息。並行是透過分割槽實現的，消費者透過存檔最近處理訊息的偏移量來跟蹤工作進度。訊息代理將訊息保留在磁碟上，因此如有必要的話，可以回跳並重新讀取舊訊息。

基於日誌的方法與資料庫中的複製日誌（請參閱 [第六章](/tw/ch6)）和日誌結構儲存引擎（請參閱 [第四章](/tw/ch4)）有相似之處。我們看到，這種方法對於消費輸入流，並產生派生狀態或派生輸出資料流的系統而言特別適用。

就流的來源而言，我們討論了幾種可能性：使用者活動事件，定期讀數的感測器，和 Feed 資料（例如，金融中的市場資料）能夠自然地表示為流。我們發現將資料庫寫入視作流也是很有用的：我們可以捕獲變更日誌 —— 即對資料庫所做的所有變更的歷史記錄 —— 隱式地透過資料變更捕獲，或顯式地透過事件溯源。日誌壓縮允許流也能保有資料庫內容的完整副本。

將資料庫表示為流為系統整合帶來了很多強大機遇。透過消費變更日誌並將其應用至派生系統，你能使諸如搜尋索引、快取以及分析系統這類派生資料系統不斷保持更新。你甚至能從頭開始，透過讀取從創世至今的所有變更日誌，為現有資料建立全新的檢視。

像流一樣維護狀態以及訊息重播的基礎設施，是在各種流處理框架中實現流連線和容錯的基礎。我們討論了流處理的幾種目的，包括搜尋事件模式（複合事件處理），計算分窗聚合（流分析），以及保證派生資料系統處於最新狀態（物化檢視）。

然後我們討論了在流處理中對時間進行推理的困難，包括處理時間與事件時間戳之間的區別，以及當你認為視窗已經完事之後，如何處理到達的滯留事件的問題。

我們區分了流處理中可能出現的三種連線型別：

流流連線
: 兩個輸入流都由活動事件組成，而連線運算元在某個時間視窗內搜尋相關的事件。例如，它可能會將同一個使用者 30 分鐘內進行的兩個活動聯絡在一起。如果你想要找出一個流內的相關事件，連線的兩側輸入可能實際上都是同一個流（**自連線**，即 self-join）。

流表連線
: 一個輸入流由活動事件組成，另一個輸入流是資料庫變更日誌。變更日誌保證了資料庫的本地副本是最新的。對於每個活動事件，連線運算元將查詢資料庫，並輸出一個擴充套件的活動事件。

表表連線
: 兩個輸入流都是資料庫變更日誌。在這種情況下，一側的每一個變化都與另一側的最新狀態相連線。結果是兩表連線所得物化檢視的變更流。

最後，我們討論了在流處理中實現容錯和恰好一次語義的技術。與批處理一樣，我們需要放棄任何失敗任務的部分輸出。然而由於流處理長時間執行並持續產生輸出，所以不能簡單地丟棄所有的輸出。相反，可以使用更細粒度的恢復機制，基於微批次、存檔點、事務或冪等寫入。


### 參考文獻 {#references}

[^1]: Tyler Akidau, Robert Bradshaw, Craig Chambers, Slava Chernyak, Rafael J. Fernández-Moctezuma, Reuven Lax, Sam McVeety, Daniel Mills, Frances Perry, Eric Schmidt, and Sam Whittle. [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf). *Proceedings of the VLDB Endowment*, volume 8, issue 12, pages 1792--1803, August 2015. [doi:10.14778/2824032.2824076](https://doi.org/10.14778/2824032.2824076)
[^2]: Harold Abelson, Gerald Jay Sussman, and Julie Sussman. [*Structure and Interpretation of Computer Programs*](https://web.mit.edu/6.001/6.037/sicp.pdf), 2nd edition. MIT Press, 1996. ISBN: 978-0-262-51087-5, archived at [archive.org/details/sicp_20211010](https://archive.org/details/sicp_20211010)
[^3]: Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec. [The Many Faces of Publish/Subscribe](https://www.cs.ru.nl/~pieter/oss/manyfaces.pdf). *ACM Computing Surveys*, volume 35, issue 2, pages 114--131, June 2003. [doi:10.1145/857076.857078](https://doi.org/10.1145/857076.857078)
[^4]: Don Carney, Uğur Çetintemel, Mitch Cherniack, Christian Convey, Sangdon Lee, Greg Seidman, Michael Stonebraker, Nesime Tatbul, and Stan Zdonik. [Monitoring Streams -- A New Class of Data Management Applications](https://www.vldb.org/conf/2002/S07P02.pdf). At *28th International Conference on Very Large Data Bases* (VLDB), August 2002. [doi:10.1016/B978-155860869-6/50027-5](https://doi.org/10.1016/B978-155860869-6/50027-5)
[^5]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY)
[^6]: Thomas Figg (tef). [how (not) to write a pipeline](https://web.archive.org/web/20250107135013/https://cohost.org/tef/post/1764930-how-not-to-write-a). *cohost.org*, June 2023. Archived at [perma.cc/A3V8-NYCM](https://perma.cc/A3V8-NYCM)
[^7]: Vicent Martí. [Brubeck, a statsd-Compatible Metrics Aggregator](https://github.blog/news-insights/the-library/brubeck/). *github.blog*, June 2015. Archived at [perma.cc/TP3Q-DJYM](https://perma.cc/TP3Q-DJYM)
[^8]: Seth Lowenberger. [MoldUDP64 Protocol Specification V 1.00](https://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf). *nasdaqtrader.com*, July 2009. Archived at [perma.cc/7CRQ-QBD7](https://perma.cc/7CRQ-QBD7)
[^9]: Ian Malpass. [Measure Anything, Measure Everything](https://codeascraft.com/2011/02/15/measure-anything-measure-everything/). *codeascraft.com*, February 2011. Archived at [archive.org](https://web.archive.org/web/20250820034209/https://www.etsy.com/codeascraft/measure-anything-measure-everything/)
[^10]: Dieter Plaetinck. [25 Graphite, Grafana and statsd Gotchas](https://grafana.com/blog/2016/03/03/25-graphite-grafana-and-statsd-gotchas/). *grafana.com*, March 2016. Archived at [perma.cc/3NP3-67U7](https://perma.cc/3NP3-67U7)
[^11]: Jeff Lindsay. [Web Hooks to Revolutionize the Web](https://progrium.github.io/blog/2007/05/03/web-hooks-to-revolutionize-the-web/). *progrium.com*, May 2007. Archived at [perma.cc/BF9U-XNX4](https://perma.cc/BF9U-XNX4)
[^12]: Jim N. Gray. [Queues Are Databases](https://arxiv.org/pdf/cs/0701158.pdf). Microsoft Research Technical Report MSR-TR-95-56, December 1995. Archived at [arxiv.org](https://arxiv.org/pdf/cs/0701158)
[^13]: Mark Hapner, Rich Burridge, Rahul Sharma, Joseph Fialli, Kate Stout, and Nigel Deakin. [JSR-343 Java Message Service (JMS) 2.0 Specification](https://jcp.org/en/jsr/detail?id=343). *jms-spec.java.net*, March 2013. Archived at [perma.cc/E4YG-46TA](https://perma.cc/E4YG-46TA)
[^14]: Sanjay Aiyagari, Matthew Arrott, Mark Atwell, Jason Brome, Alan Conway, Robert Godfrey, Robert Greig, Pieter Hintjens, John O'Hara, Matthias Radestock, Alexis Richardson, Martin Ritchie, Shahrokh Sadjadi, Rafael Schloming, Steven Shaw, Martin Sustrik, Carl Trieloff, Kim van der Riet, and Steve Vinoski. [AMQP: Advanced Message Queuing Protocol Specification](https://www.rabbitmq.com/resources/specs/amqp0-9-1.pdf). Version 0-9-1, November 2008. Archived at [perma.cc/6YJJ-GM9X](https://perma.cc/6YJJ-GM9X)
[^15]: [Architectural overview of Pub/Sub](https://cloud.google.com/pubsub/architecture). *cloud.google.com*, 2025. Archived at [perma.cc/VWF5-ABP4](https://perma.cc/VWF5-ABP4)
[^16]: Aris Tzoumas. [Lessons from scaling PostgreSQL queues to 100k events per second](https://www.rudderstack.com/blog/scaling-postgres-queue/). *rudderstack.com*, July 2025. Archived at [perma.cc/QD8C-VA4Y](https://perma.cc/QD8C-VA4Y)
[^17]: Robin Moffatt. [Kafka Connect Deep Dive -- Error Handling and Dead Letter Queues](https://www.confluent.io/blog/kafka-connect-deep-dive-error-handling-dead-letter-queues/). *confluent.io*, March 2019. Archived at [perma.cc/KQ5A-AB28](https://perma.cc/KQ5A-AB28)
[^18]: Dunith Danushka. [Message reprocessing: How to implement the dead letter queue](https://redpanda.com/blog/reliable-message-processing-with-dead-letter-queue). *redpanda.com*. Archived at [perma.cc/R7UB-WEWF](https://perma.cc/R7UB-WEWF)
[^19]: Damien Gasparina, Loic Greffier, and Sebastien Viale. [KIP-1034: Dead letter queue in Kafka Streams](https://cwiki.apache.org/confluence/display/KAFKA/KIP-1034%3A+Dead+letter+queue+in+Kafka+Streams). *cwiki.apache.org*, April 2024. Archived at [perma.cc/3VXV-QXAN](https://perma.cc/3VXV-QXAN)
[^20]: Jay Kreps, Neha Narkhede, and Jun Rao. [Kafka: A Distributed Messaging System for Log Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/09/Kafka.pdf). At *6th International Workshop on Networking Meets Databases* (NetDB), June 2011. Archived at [perma.cc/CSW7-TCQ5](https://perma.cc/CSW7-TCQ5)
[^21]: Jay Kreps. [Benchmarking Apache Kafka: 2 Million Writes Per Second (On Three Cheap Machines)](https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines). *engineering.linkedin.com*, April 2014. Archived at [archive.org](https://web.archive.org/web/20140921000742/https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines)
[^22]: Kartik Paramasivam. [How We're Improving and Advancing Kafka at LinkedIn](https://engineering.linkedin.com/apache-kafka/how-we_re-improving-and-advancing-kafka-linkedin). *engineering.linkedin.com*, September 2015. Archived at [perma.cc/3S3V-JCYJ](https://perma.cc/3S3V-JCYJ)
[^23]: Philippe Dobbelaere and Kyumars Sheykh Esmaili. [Kafka versus RabbitMQ: A comparative study of two industry reference publish/subscribe implementations](https://arxiv.org/abs/1709.00333). At *11th ACM International Conference on Distributed and Event-based Systems* (DEBS), June 2017. [doi:10.1145/3093742.3093908](https://doi.org/10.1145/3093742.3093908)
[^24]: Kate Holterhoff. [Why Message Queues Endure: A History](https://redmonk.com/kholterhoff/2024/12/12/why-message-queues-endure-a-history/). *redmonk.com*, December 2024. Archived at [perma.cc/6DX8-XK4W](https://perma.cc/6DX8-XK4W)
[^25]: Andrew Schofield. [KIP-932: Queues for Kafka](https://cwiki.apache.org/confluence/display/KAFKA/KIP-932%3A+Queues+for+Kafka). *cwiki.apache.org*, May 2023. Archived at [perma.cc/LBE4-BEMK](https://perma.cc/LBE4-BEMK)
[^26]: Jack Vanlightly. [The advantages of queues on logs](https://jack-vanlightly.com/blog/2023/10/2/the-advantages-of-queues-on-logs). *jack-vanlightly.com*, October 2023. Archived at [perma.cc/WJ7V-287K](https://perma.cc/WJ7V-287K)
[^27]: Jay Kreps. [The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying). *engineering.linkedin.com*, December 2013. Archived at [perma.cc/2JHR-FR64](https://perma.cc/2JHR-FR64)
[^28]: Andy Hattemer. [Change Data Capture is having a moment. Why?](https://materialize.com/blog/change-data-capture-is-having-a-moment-why/) *materialize.com*, September 2021. Archived at [perma.cc/AL37-P53C](https://perma.cc/AL37-P53C)
[^29]: Prem Santosh Udaya Shankar. [Streaming MySQL Tables in Real-Time to Kafka](https://engineeringblog.yelp.com/2016/08/streaming-mysql-tables-in-real-time-to-kafka.html). *engineeringblog.yelp.com*, August 2016. Archived at [perma.cc/5ZR3-2GVV](https://perma.cc/5ZR3-2GVV)
[^30]: Andreas Andreakis, Ioannis Papapanagiotou. [DBLog: A Watermark Based Change-Data-Capture Framework](https://arxiv.org/pdf/2010.12597). October 2020. Archived at [arxiv.org](https://arxiv.org/pdf/2010.12597)
[^31]: Jiri Pechanec. [Incremental Snapshots in Debezium](https://debezium.io/blog/2021/10/07/incremental-snapshots/). *debezium.io*, October 2021. Archived at [perma.cc/EQ8E-W6KQ](https://perma.cc/EQ8E-W6KQ)
[^32]: Debezium maintainers. [Debezium Connector for Cassandra](https://debezium.io/documentation/reference/stable/connectors/cassandra.html). *debezium.io*. Archived at [perma.cc/WR6K-EKMD](https://perma.cc/WR6K-EKMD)
[^33]: Neha Narkhede. [Announcing Kafka Connect: Building Large-Scale Low-Latency Data Pipelines](https://www.confluent.io/blog/announcing-kafka-connect-building-large-scale-low-latency-data-pipelines/). *confluent.io*, February 2016. Archived at [perma.cc/8WXJ-L6GF](https://perma.cc/8WXJ-L6GF)
[^34]: Chris Riccomini. [Kafka change data capture breaks database encapsulation](https://cnr.sh/posts/2018-11-05-kafka-change-data-capture-breaks-database-encapsulation/). *cnr.sh*, November 2018. Archived at [perma.cc/P572-9MKF](https://perma.cc/P572-9MKF)
[^35]: Gunnar Morling. ["Change Data Capture Breaks Encapsulation". Does it, though?](https://www.decodable.co/blog/change-data-capture-breaks-encapsulation-does-it-though) *decodable.co*, November 2023. Archived at [perma.cc/YX2P-WNWR](https://perma.cc/YX2P-WNWR)
[^36]: Gunnar Morling. [Revisiting the Outbox Pattern](https://www.decodable.co/blog/revisiting-the-outbox-pattern). *decodable.co*, October 2024. Archived at [perma.cc/M5ZL-RPS9](https://perma.cc/M5ZL-RPS9)
[^37]: Ashish Gupta and Inderpal Singh Mumick. [Maintenance of Materialized Views: Problems, Techniques, and Applications](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5). *IEEE Data Engineering Bulletin*, volume 18, issue 2, pages 3--18, June 1995. Archived at [archive.org](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5)
[^38]: Mihai Budiu, Tej Chajed, Frank McSherry, Leonid Ryzhyk, Val Tannen. [DBSP: Incremental Computation on Streams and Its Applications to Databases](https://sigmodrecord.org/publications/sigmodRecord/2403/pdfs/20_dbsp-budiu.pdf). *SIGMOD Record*, volume 53, issue 1, pages 87--95, March 2024. [doi:10.1145/3665252.3665271](https://doi.org/10.1145/3665252.3665271)
[^39]: Jim Gray and Andreas Reuter. [*Transaction Processing: Concepts and Techniques*](https://learning.oreilly.com/library/view/transaction-processing/9780080519555/). Morgan Kaufmann, 1992. ISBN: 9781558601901
[^40]: Martin Kleppmann. [Accounting for Computer Scientists](https://martin.kleppmann.com/2011/03/07/accounting-for-computer-scientists.html). *martin.kleppmann.com*, March 2011. Archived at [perma.cc/9EGX-P38N](https://perma.cc/9EGX-P38N)
[^41]: Pat Helland. [Immutability Changes Everything](https://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper16.pdf). At *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^42]: Martin Kleppmann. [*Making Sense of Stream Processing*](https://martin.kleppmann.com/papers/stream-processing.pdf). Report, O'Reilly Media, May 2016. Archived at [perma.cc/RAY4-JDVX](https://perma.cc/RAY4-JDVX)
[^43]: Kartik Paramasivam. [Stream Processing Hard Problems -- Part 1: Killing Lambda](https://engineering.linkedin.com/blog/2016/06/stream-processing-hard-problems-part-1-killing-lambda). *engineering.linkedin.com*, June 2016. Archived at [archive.org](https://web.archive.org/web/20240621211312/https://www.linkedin.com/blog/engineering/data-streaming-processing/stream-processing-hard-problems-part-1-killing-lambda)
[^44]: Stéphane Derosiaux. [CQRS: What? Why? How?](https://sderosiaux.medium.com/cqrs-what-why-how-945543482313) *sderosiaux.medium.com*, September 2019. Archived at [perma.cc/FZ3U-HVJ4](https://perma.cc/FZ3U-HVJ4)
[^45]: Baron Schwartz. [Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/). *xaprb.com*, December 2013. Archived at [archive.org](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/)
[^46]: Daniel Eloff, Slava Akhmechet, Jay Kreps, et al. [Re: Turning the Database Inside-out with Apache Samza](https://news.ycombinator.com/item?id=9145197). Hacker News discussion, *news.ycombinator.com*, March 2015. Archived at [perma.cc/ML9E-JC83](https://perma.cc/ML9E-JC83)
[^47]: [Datomic Documentation: Excision](https://docs.datomic.com/operation/excision.html). Cognitect, Inc., *docs.datomic.com*. Archived at [perma.cc/J5QQ-SH32](https://perma.cc/J5QQ-SH32)
[^48]: [Fossil Documentation: Deleting Content from Fossil](https://fossil-scm.org/home/doc/trunk/www/shunning.wiki). *fossil-scm.org*, 2025. Archived at [perma.cc/DS23-GTNG](https://perma.cc/DS23-GTNG)
[^49]: Jay Kreps. [The irony of distributed systems is that data loss is really easy but deleting data is surprisingly hard.](https://x.com/jaykreps/status/582580836425330688) *x.com*, March 2015. Archived at [perma.cc/7RRZ-V7B7](https://perma.cc/7RRZ-V7B7)
[^50]: Brent Robinson. [Crypto shredding: How it can solve modern data retention challenges](https://medium.com/@brentrobinson5/crypto-shredding-how-it-can-solve-modern-data-retention-challenges-da874b01745b). *medium.com*, January 2019. Archived at [perma.cc/4LFK-S6XE](https://perma.cc/4LFK-S6XE)
[^51]: Matthew D. Green and Ian Miers. [Forward Secure Asynchronous Messaging from Puncturable Encryption](https://isi.jhu.edu/~mgreen/forward_sec.pdf). At *IEEE Symposium on Security and Privacy*, May 2015. [doi:10.1109/SP.2015.26](https://doi.org/10.1109/SP.2015.26)
[^52]: David C. Luckham. [What's the Difference Between ESP and CEP?](https://complexevents.com/2020/06/15/whats-the-difference-between-esp-and-cep-2/) *complexevents.com*, June 2019. Archived at [perma.cc/E7PZ-FDEF](https://perma.cc/E7PZ-FDEF)
[^53]: Arvind Arasu, Shivnath Babu, and Jennifer Widom. [The CQL Continuous Query Language: Semantic Foundations and Query Execution](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cql.pdf). *The VLDB Journal*, volume 15, issue 2, pages 121--142, June 2006. [doi:10.1007/s00778-004-0147-z](https://doi.org/10.1007/s00778-004-0147-z)
[^54]: Julian Hyde. [Data in Flight: How Streaming SQL Technology Can Help Solve the Web 2.0 Data Crunch](https://queue.acm.org/detail.cfm?id=1667562). *ACM Queue*, volume 7, issue 11, December 2009. [doi:10.1145/1661785.1667562](https://doi.org/10.1145/1661785.1667562)
[^55]: Philippe Flajolet, Éric Fusy, Olivier Gandouet, and Frédéric Meunier. [HyperLogLog: The Analysis of a Near-Optimal Cardinality Estimation Algorithm](https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf). At *Conference on Analysis of Algorithms* (AofA), June 2007. [doi:10.46298/dmtcs.3545](https://doi.org/10.46298/dmtcs.3545)
[^56]: Jay Kreps. [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture). *oreilly.com*, July 2014. Archived at [perma.cc/2WY5-HC8Y](https://perma.cc/2WY5-HC8Y)
[^57]: Ian Reppel. [An Overview of Apache Streaming Technologies](https://ianreppel.org/an-overview-of-apache-streaming-technologies/). *ianreppel.org*, March 2016. Archived at [perma.cc/BB3E-QJLW](https://perma.cc/BB3E-QJLW)
[^58]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^59]: RisingWave Labs. [Deep Dive Into the RisingWave Stream Processing Engine - Part 2: Computational Model](https://risingwave.com/blog/deep-dive-into-the-risingwave-stream-processing-engine-part-2-computational-model/). *risingwave.com*, November 2023. Archived at [perma.cc/LM74-XDEL](https://perma.cc/LM74-XDEL)
[^60]: Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard. [Differential dataflow](https://www.cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf). At *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
[^61]: Andy Hattemer. [Incremental Computation in the Database](https://materialize.com/guides/incremental-computation/). *materialize.com*, March 2020. Archived at [perma.cc/AL94-YVRN](https://perma.cc/AL94-YVRN)
[^62]: Shay Banon. [Percolator](https://www.elastic.co/blog/percolator). *elastic.co*, February 2011. Archived at [perma.cc/LS5R-4FQX](https://perma.cc/LS5R-4FQX)
[^63]: Alan Woodward and Martin Kleppmann. [Real-Time Full-Text Search with Luwak and Samza](https://martin.kleppmann.com/2015/04/13/real-time-full-text-search-luwak-samza.html). *martin.kleppmann.com*, April 2015. Archived at [perma.cc/2U92-Q7R4](https://perma.cc/2U92-Q7R4)
[^64]: Tyler Akidau. [The World Beyond Batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102). *oreilly.com*, January 2016. Archived at [perma.cc/4XF9-8M2K](https://perma.cc/4XF9-8M2K)
[^65]: Stephan Ewen. [Streaming Analytics with Apache Flink](https://www.slideshare.net/slideshow/advanced-streaming-analytics-with-apache-flink-and-apache-kafka-stephan-ewen/61920008). At *Kafka Summit*, April 2016. Archived at [perma.cc/QBQ4-F9MR](https://perma.cc/QBQ4-F9MR)
[^66]: Tyler Akidau, Alex Balikov, Kaya Bekiroğlu, Slava Chernyak, Josh Haberman, Reuven Lax, Sam McVeety, Daniel Mills, Paul Nordstrom, and Sam Whittle. [MillWheel: Fault-Tolerant Stream Processing at Internet Scale](https://www.vldb.org/pvldb/vol6/p1033-akidau.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1033--1044, August 2013. [doi:10.14778/2536222.2536229](https://doi.org/10.14778/2536222.2536229)
[^67]: Alex Dean. [Improving Snowplow's Understanding of Time](https://snowplow.io/blog/improving-snowplows-understanding-of-time). *snowplow.io*, September 2015. Archived at [perma.cc/6CT9-Z3Q2](https://perma.cc/6CT9-Z3Q2)
[^68]: [Azure Stream Analytics: Windowing functions](https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics). Microsoft Azure Reference, *learn.microsoft.com*, July 2025. Archived at [archive.org](https://web.archive.org/web/20250901140013/https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics)
[^69]: Rajagopal Ananthanarayanan, Venkatesh Basker, Sumit Das, Ashish Gupta, Haifeng Jiang, Tianhao Qiu, Alexey Reznichenko, Deomid Ryabkov, Manpreet Singh, and Shivakumar Venkataraman. [Photon: Fault-Tolerant and Scalable Joining of Continuous Data Streams](https://research.google.com/pubs/archive/41529.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2465272](https://doi.org/10.1145/2463676.2465272)
[^70]: Ben Kirwin. [Doing the Impossible: Exactly-Once Messaging Patterns in Kafka](https://ben.kirw.in/2014/11/28/kafka-patterns/). *ben.kirw.in*, November 2014. Archived at [perma.cc/A5QL-QRX7](https://perma.cc/A5QL-QRX7)
[^71]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^72]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, 2013. ISBN: 978-1-118-53080-1
[^73]: Viktor Klang. [I'm coining the phrase 'effectively-once' for message processing with at-least-once + idempotent operations](https://x.com/viktorklang/status/789036133434978304). *x.com*, October 2016. Archived at [perma.cc/7DT9-TDG2](https://perma.cc/7DT9-TDG2)
[^74]: Matei Zaharia, Tathagata Das, Haoyuan Li, Scott Shenker, and Ion Stoica. [Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](https://www.usenix.org/system/files/conference/hotcloud12/hotcloud12-final28.pdf). At *4th USENIX Conference in Hot Topics in Cloud Computing* (HotCloud), June 2012.
[^75]: Kostas Tzoumas, Stephan Ewen, and Robert Metzger. [High-Throughput, Low-Latency, and Exactly-Once Stream Processing with Apache Flink](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink). *ververica.com*, August 2015. Archived at [archive.org](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink)
[^76]: Paris Carbone, Gyula Fóra, Stephan Ewen, Seif Haridi, and Kostas Tzoumas. [Lightweight Asynchronous Snapshots for Distributed Dataflows](https://arxiv.org/abs/1506.08603). arXiv:1506.08603 [cs.DC], June 2015.
[^77]: Ryan Betts and John Hugg. [*Fast Data: Smart and at Scale*](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-ebook-fast-data-smart-and-at-scale.pdf). Report, O'Reilly Media, October 2015. Archived at [perma.cc/VQ6S-XQQY](https://perma.cc/VQ6S-XQQY)
[^78]: Neha Narkhede and Guozhang Wang. [Exactly-Once Semantics Are Possible: Here's How Kafka Does It](https://www.confluent.io/blog/exactly-once-semantics-are-possible-heres-how-apache-kafka-does-it/). *confluent.io*, June 2019. Archived at [perma.cc/Q2AU-Q2ED](https://perma.cc/Q2AU-Q2ED)
[^79]: Jason Gustafson, Flavio Junqueira, Apurva Mehta, Sriram Subramanian, and Guozhang Wang. [KIP-98 -- Exactly Once Delivery and Transactional Messaging](https://cwiki.apache.org/confluence/display/KAFKA/KIP-98+-+Exactly+Once+Delivery+and+Transactional+Messaging). *cwiki.apache.org*, November 2016. Archived at [perma.cc/95PT-RCTG](https://perma.cc/95PT-RCTG)
[^80]: Pat Helland. [Idempotence Is Not a Medical Condition](https://dl.acm.org/doi/pdf/10.1145/2160718.2160734). *Communications of the ACM*, volume 55, issue 5, page 56, May 2012. [doi:10.1145/2160718.2160734](https://doi.org/10.1145/2160718.2160734)
[^81]: Jay Kreps. [Re: Trying to Achieve Deterministic Behavior on Recovery/Rewind](https://lists.apache.org/thread/n0sz6zld72nvjtnytv09pxc57mdcf9ft). Email to *samza-dev* mailing list, September 2014. Archived at [perma.cc/7DPD-GJNL](https://perma.cc/7DPD-GJNL)
[^82]: E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson. [A Survey of Rollback-Recovery Protocols in Message-Passing Systems](https://www.cs.utexas.edu/~lorenzo/papers/SurveyFinal.pdf). *ACM Computing Surveys*, volume 34, issue 3, pages 375--408, September 2002. [doi:10.1145/568522.568525](https://doi.org/10.1145/568522.568525)
[^83]: Adam Warski. [Kafka Streams -- How Does It Fit the Stream Processing Landscape?](https://softwaremill.com/kafka-streams-how-does-it-fit-stream-landscape/) *softwaremill.com*, June 2016. Archived at [perma.cc/WQ5Q-H2J2](https://perma.cc/WQ5Q-H2J2)
[^84]: Stephan Ewen, Fabian Hueske, and Xiaowei Jiang. [Batch as a Special Case of Streaming and Alibaba's contribution of Blink](https://flink.apache.org/2019/02/13/batch-as-a-special-case-of-streaming-and-alibabas-contribution-of-blink/). *flink.apache.org*, February 2019. Archived at [perma.cc/A529-SKA9](https://perma.cc/A529-SKA9)

================================================
FILE: content/tw/ch13.md
================================================
---
title: "第十三章：流式系統的哲學"
linkTitle: "13. 流式系統的哲學"
weight: 313
breadcrumbs: false
---

<a id="ch_philosophy"></a>
<a id="ch13"></a>

![](/map/ch12.png)

> 如果船長的終極目標是保護船隻，他應該永遠待在港口。
>
> —— 聖托馬斯・阿奎那《神學大全》（1265-1274）

[第二章](/tw/ch2) 討論了構建 **可靠**、**可伸縮**、**可維護** 應用與系統的目標。這些主題貫穿了全書：例如，我們討論了提升可靠性的多種容錯演算法、提升可伸縮性的分割槽方法，以及提升可維護性的演化與抽象機制。

在本章中，我們將把這些想法整合起來，並特別基於 [第十二章](/tw/ch12) 的流式/事件驅動架構思路，提出一套滿足這些目標的應用開發哲學。與前幾章相比，本章立場更鮮明：不是並列比較多種方案，而是深入展開一種特定的設計哲學。

## 資料整合 {#sec_future_integration}

本書中反覆出現的主題是，對於任何給定的問題都會有好幾種解決方案，所有這些解決方案都有不同的優缺點與利弊權衡。例如在 [第四章](/tw/ch4) 討論儲存引擎時，我們看到了日誌結構儲存、B 樹以及列式儲存。在 [第六章](/tw/ch6) 討論複製時，我們看到了單領導者、多領導者和無領導者的方法。

如果你有一個類似於 “我想儲存一些資料並稍後再查詢” 的問題，那麼並沒有一種正確的解決方案。但對於不同的具體環境，總會有不同的合適方法。軟體實現通常必須選擇一種特定的方法。使單條程式碼路徑能做到穩定健壯且表現良好已經是一件非常困難的事情了 —— 嘗試在單個軟體中完成所有事情，幾乎可以保證，實現效果會很差。

因此軟體工具的最佳選擇也取決於情況。每一種軟體，甚至所謂的 “通用” 資料庫，都是針對特定的使用模式設計的。

面對讓人眼花繚亂的諸多替代品，第一個挑戰就是弄清軟體與其適用環境的對映關係。供應商不願告訴你他們軟體不適用的工作負載，這是可以理解的。但是希望先前的章節能給你提供一些問題，讓你讀出字裡行間的言外之意，並更好地理解這些權衡。

但是，即使你已經完全理解各種工具與其適用環境間的關係，還有一個挑戰：在複雜的應用中，資料的用法通常花樣百出。不太可能存在適用於 **所有** 不同資料應用場景的軟體，因此你不可避免地需要拼湊幾個不同的軟體，以提供應用所需的功能。

### 組合使用派生資料的工具 {#id442}

例如，為了處理任意關鍵詞的搜尋查詢，將 OLTP 資料庫與全文檢索索引整合在一起是很常見的需求。儘管一些資料庫（例如 PostgreSQL）包含了全文索引功能，對於簡單的應用完全夠了[^1]，但更複雜的搜尋能力就需要專業的資訊檢索工具了。相反的是，搜尋索引通常不適合作為持久的記錄系統，因此許多應用需要組合這兩種不同的工具以滿足所有需求。

我們在 “[保持系統同步](/tw/ch12#sec_stream_sync)” 中接觸過整合資料系統的問題。隨著資料不同表示形式的增加，整合問題變得越來越困難。除了資料庫和搜尋索引之外，也許你需要在分析系統（資料倉庫，或批處理和流處理系統）中維護資料副本；維護從原始資料中派生的快取，或反正規化的資料版本；將資料灌入機器學習、分類、排名或推薦系統中；或者基於資料變更傳送通知。

#### 理解資料流 {#id443}

當需要在多個儲存系統中維護相同資料的副本以滿足不同的訪問模式時，你要對輸入和輸出瞭如指掌：哪些資料先寫入，哪些資料表示派生自哪些來源？如何以正確的格式，將所有資料匯入正確的地方？

例如，你可能會首先將資料寫入 **記錄系統** 資料庫，捕獲對該資料庫所做的變更（請參閱 “[變更資料捕獲](/tw/ch12#sec_stream_cdc)”），然後將變更以相同的順序應用於搜尋索引。如果變更資料捕獲（CDC）是更新索引的唯一方式，則可以確定該索引完全派生自記錄系統，因此與其保持一致（除軟體錯誤外）。寫入資料庫是向該系統提供新輸入的唯一方式。

允許應用程式直接寫入搜尋索引和資料庫引入了如 [圖 12-4](/tw/ch12#fig_stream_dual_write_race) 所示的問題，其中兩個客戶端同時傳送衝突的寫入，且兩個儲存系統按不同順序處理它們。在這種情況下，既不是資料庫說了算，也不是搜尋索引說了算，所以它們做出了相反的決定，進入彼此間永續性的不一致狀態。

如果你可以透過單個系統來提供所有使用者輸入，從而決定所有寫入的排序，則透過按相同順序處理寫入，可以更容易地派生出其他資料表示。這是狀態機複製方法的一個應用，我們在 “[全序廣播](/tw/ch10#sec_consistency_total_order)” 中看到過。至於使用變更資料捕獲還是事件溯源日誌，遠不如 “確定一個全序” 這一簡單原則重要。

基於事件日誌來更新派生資料的系統，通常可以做到 **確定性** 與 **冪等性**（請參閱 “[冪等性](/tw/ch12#sec_stream_idempotence)”），使得從故障中恢復相當容易。

#### 派生資料與分散式事務 {#sec_future_derived_vs_transactions}

保持不同資料系統彼此一致的經典方法涉及分散式事務，如 “[原子提交與兩階段提交](/tw/ch8#sec_transactions_2pc)” 中所述。與分散式事務相比，使用派生資料系統的方法如何？

在抽象層面，它們透過不同的方式達到類似的目標。分散式事務透過 **鎖** 進行互斥來決定寫入的順序（請參閱 “[兩階段鎖定](/tw/ch8#sec_transactions_2pl)”），而 CDC 和事件溯源使用日誌進行排序。分散式事務使用原子提交來確保變更只生效一次，而基於日誌的系統通常基於 **確定性重試** 和 **冪等性**。

最大的不同之處在於事務系統通常提供 [線性一致性](/tw/ch10#sec_consistency_linearizability)，這包含著有用的保證，例如 [讀己之寫](/tw/ch6#sec_replication_ryw)。另一方面，派生資料系統通常是非同步更新的，因此它們預設不會提供相同的時序保證。

在願意為分散式事務付出代價的有限場景中，它們已被成功應用。但是，我認為 XA 的容錯能力和效能很差勁（請參閱 “[實踐中的分散式事務](/tw/ch8#sec_transactions_xa)”），這嚴重限制了它的實用性。我相信為分散式事務設計一種更好的協議是可行的。但使這樣一種協議被現有工具廣泛接受是很有挑戰的，且不是立竿見影的事。

在沒有廣泛支援的良好分散式事務協議的情況下，我認為基於日誌的派生資料是整合不同資料系統的最有前途的方法。然而，諸如讀己之寫的保證是有用的，我認為告訴所有人 “最終一致性是不可避免的 —— 忍一忍並學會和它打交道” 是沒有什麼建設性的（至少在缺乏 **如何** 應對的良好指導時）。

在本章後文中，我們將討論一些在非同步派生系統之上實現更強保障的方法，並邁向分散式事務和基於日誌的非同步系統之間的中間地帶。

#### 全序的限制 {#id335}

對於足夠小的系統，構建一個完全有序的事件日誌是完全可行的（正如單主複製資料庫的流行所證明的那樣，它正好建立了這樣一種日誌）。但是，隨著系統向更大更複雜的工作負載伸縮，限制開始出現：

* 在大多數情況下，構建完全有序的日誌，需要所有事件彙集於決定順序的 **單個領導者節點**。如果事件吞吐量大於單臺計算機的處理能力，則需要將其分割槽到多臺計算機上（請參閱 “[分割槽日誌](/tw/ch12#sec_stream_log)”）。然後兩個不同分割槽中的事件順序關係就不明確了。
* 如果伺服器分佈在多個 **地理位置分散** 的資料中心上，例如為了容忍整個資料中心掉線，你通常在每個資料中心都有單獨的主庫，因為網路延遲會導致同步的跨資料中心協調效率低下（請參閱 “[多主複製](/tw/ch6#sec_replication_multi_leader)”）。這意味著源自兩個不同資料中心的事件順序未定義。
* 將應用程式部署為微服務時（請參閱 “[服務中的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)”），常見的設計選擇是將每個服務及其持久狀態作為獨立單元進行部署，服務之間不共享持久狀態。當兩個事件來自不同的服務時，這些事件間的順序未定義。
* 某些應用程式在客戶端儲存狀態，該狀態在使用者輸入時立即更新（無需等待伺服器確認），甚至可以繼續離線工作（請參閱 “[需要離線操作的客戶端](/tw/ch6#sec_replication_offline_clients)”）。對於這樣的應用程式，客戶端和伺服器很可能以不同的順序看到事件。

在形式上，決定事件的全域性順序稱為 **全序廣播**，相當於 **共識**（請參閱 “[共識演算法和全序廣播](/tw/ch10#sec_consistency_faces)”）。大多數共識演算法都是針對單個節點的吞吐量足以處理整個事件流的情況而設計的，並且這些演算法不提供多個節點共享事件排序工作的機制。設計可以伸縮至單個節點的吞吐量之上，且在地理位置分散環境中仍能良好工作的共識演算法仍然是一個開放研究問題。

#### 排序事件以捕獲因果關係 {#sec_future_capture_causality}

在事件之間不存在因果關係的情況下，全序的缺乏並不是一個大問題，因為併發事件可以任意排序。其他一些情況很容易處理：例如，當同一物件有多個更新時，它們可以透過將特定物件 ID 的所有更新路由到相同的日誌分割槽來完全排序。然而，因果關係有時會以更微妙的方式出現（請參閱 “[順序與因果關係](/tw/ch10#sec_consistency_logical)”）。

例如，考慮一個社交網路服務，以及一對曾處於戀愛關係但剛分手的使用者。其中一個使用者將另一個使用者從好友中移除，然後向剩餘的好友傳送訊息，抱怨他們的前任。使用者的心思是他們的前任不應該看到這些粗魯的訊息，因為訊息是在好友狀態解除後傳送的。

但是如果好友關係狀態與訊息儲存在不同的地方，在這樣一個系統中，可能會出現 **解除好友** 事件與 **傳送訊息** 事件之間的因果依賴丟失的情況。如果因果依賴關係沒有被捕捉到，則傳送有關新訊息的通知的服務可能會在 **解除好友** 事件之前處理 **傳送訊息** 事件，從而錯誤地向前任傳送通知。

在本例中，通知實際上是訊息和好友列表之間的連線，使得它與我們先前討論的連線的時序問題有關（請參閱 “[連線的時間依賴性](/tw/ch12#sec_stream_join_time)”）。不幸的是，這個問題似乎並沒有一個簡單的答案[^2] [^3]。起點包括：

* 邏輯時間戳可以提供無需協調的全域性順序（請參閱 “[序列號順序](/tw/ch10#sec_consistency_logical)”），因此它們可能有助於全序廣播不可行的情況。但是，它們仍然要求收件人處理不按順序傳送的事件，並且需要傳遞其他元資料。
* 如果你可以記錄一個事件來記錄使用者在做出決定之前所看到的系統狀態，並給該事件一個唯一的識別符號，那麼後面的任何事件都可以引用該事件識別符號來記錄因果關係[^4]。我們將在 “[讀也是事件](#sec_future_read_events)” 中回到這個想法。
* 衝突解決演算法（請參閱 “[自動衝突解決](/tw/ch6#automatic-conflict-resolution)”）有助於處理以意外順序傳遞的事件。它們對於維護狀態很有用，但如果行為有外部副作用（例如，給使用者傳送通知），就沒什麼幫助了。

也許，隨著時間的推移，應用開發模式將出現，使得能夠有效地捕獲因果依賴關係，並且保持正確的派生狀態，而不會迫使所有事件經歷全序廣播的瓶頸。

### 批處理與流處理 {#sec_future_batch_streaming}

我會說資料整合的目標是，確保資料最終能在所有正確的地方表現出正確的形式。這樣做需要消費輸入、轉換、連線、過濾、聚合、訓練模型、評估、以及最終寫出適當的輸出。批處理和流處理是實現這一目標的工具。

批處理和流處理的輸出是派生資料集，例如搜尋索引、物化檢視、向用戶顯示的建議、聚合指標等（請參閱 “[批處理工作流的輸出](/tw/ch11#sec_batch_output)” 和 “[流處理的應用](/tw/ch12#sec_stream_uses)”）。

正如我們在 [第十一章](/tw/ch11) 和 [第十二章](/tw/ch12) 中看到的，批處理和流處理有許多共同的原則，主要的根本區別在於流處理器在無限資料集上執行，而批處理輸入是已知的有限大小。

#### 維護派生狀態 {#id446}

批處理有著很強的函式式風格（即使其程式碼不是用函式式語言編寫的）：它鼓勵確定性的純函式，其輸出僅依賴於輸入，除了顯式輸出外沒有副作用，將輸入視作不可變的，且輸出是僅追加的。流處理與之類似，但它擴充套件了運算元以允許受管理的、容錯的狀態（請參閱 “[失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)”）。

具有良好定義的輸入和輸出的確定性函式的原理不僅有利於容錯（請參閱 “[冪等性](/tw/ch12#sec_stream_idempotence)”），也簡化了有關組織中資料流的推理[^7]。無論派生資料是搜尋索引、統計模型還是快取，採用這種觀點思考都是很有幫助的：將其視為從一個東西派生出另一個的資料管道，透過函式式應用程式碼推送一個系統的狀態變更，並將其效果應用至派生系統中。

原則上，派生資料系統可以同步地維護，就像關係資料庫在與索引表寫入操作相同的事務中同步更新次級索引一樣。然而，非同步是使基於事件日誌的系統穩健的原因：它允許系統的一部分故障被抑制在本地。而如果任何一個參與者失敗，分散式事務將中止，因此它們傾向於透過將故障傳播到系統的其餘部分來放大故障（請參閱 “[分散式事務的限制](/tw/ch8#sec_transactions_xa)”）。

我們在 “[分割槽與次級索引](/tw/ch7#sec_sharding_secondary_indexes)” 中看到，次級索引經常跨越分割槽邊界。具有次級索引的分割槽系統需要將寫入傳送到多個分割槽（如果索引按關鍵詞分割槽的話）或將讀取傳送到所有分割槽（如果索引是按文件分割槽的話）。如果索引是非同步維護的，這種跨分割槽通訊也是最可靠和最可伸縮的[^8]（另請參閱 “[多分割槽資料處理](#sec_future_unbundled_multi_shard)”）。

#### 應用演化後重新處理資料 {#sec_future_reprocessing}

在維護派生資料時，批處理和流處理都是有用的。流處理允許將輸入中的變化以低延遲反映在派生檢視中，而批處理允許重新處理大量累積的歷史資料以便將新檢視匯出到現有資料集上。

特別是，重新處理現有資料為維護系統、演化並支援新功能和需求變更提供了一個良好的機制（請參閱 [第四章](/tw/ch4)）。沒有重新進行處理，模式演化將僅限於簡單的變化，例如向記錄中新增新的可選欄位或新增新型別的記錄。無論是在寫時模式還是在讀時模式中都是如此（請參閱 “[文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)”）。另一方面，透過重新處理，可以將資料集重組為一個完全不同的模型，以便更好地滿足新的要求。

> ### 鐵路上的模式遷移
>
> 大規模的 “模式遷移” 也發生在非計算機系統中。例如，在 19 世紀英國鐵路建設初期，軌距（兩軌之間的距離）就有了各種各樣的競爭標準。為一種軌距而建的列車不能在另一種軌距的軌道上執行，這限制了火車網路中可能的相互連線[^9]。
>
> 在 1846 年最終確定了一個標準軌距之後，其他軌距的軌道必須轉換 —— 但是如何在不停運火車線路的情況下進行數月甚至數年的遷移？解決的辦法是首先透過新增第三條軌道將軌道轉換為 **雙軌距（dual gauge）** 或 **混合軌距**。這種轉換可以逐漸完成，當完成時，兩種軌距的列車都可以在線路上跑，使用三條軌道中的兩條。最終，一旦所有的列車都轉換成標準軌距，那麼可以移除提供非標準軌距的軌道。
>
> 以這種方式 “再加工” 現有的軌道，讓新舊版本並存，可以在幾年的時間內逐漸改變軌距。然而，這是一項昂貴的事業，這就是今天非標準軌距仍然存在的原因。例如，舊金山灣區的 BART 系統使用了與美國大部分地區不同的軌距。

派生檢視允許 **漸進演化（gradual evolution）**。如果你想重新構建資料集，不需要執行突然切換式的遷移。取而代之的是，你可以將舊架構和新架構並排維護為相同基礎資料上的兩個獨立派生檢視。然後可以開始將少量使用者轉移到新檢視，以測試其效能並發現任何錯誤，而大多數使用者仍然會被路由到舊檢視。你可以逐漸地增加訪問新檢視的使用者比例，最終可以刪除舊檢視[^10]。

這種逐漸遷移的美妙之處在於，如果出現問題，每個階段的過程都很容易逆轉：你始終有一個可以回滾的可用系統。透過降低不可逆損害的風險，你能對繼續前進更有信心，從而更快地改善系統[^11]。

#### 統一批處理和流處理 {#id338}

早期統一批處理與流處理的提案是 **Lambda 架構**[^12]，但它有不少問題，並且已經逐漸淡出主流。更新的系統允許在同一個系統中同時實現批計算（重處理歷史資料）和流計算（事件到達即處理）[^15]。

在一個系統中統一批處理和流處理需要以下功能，這些功能也正在越來越廣泛地被提供：

* 透過處理最近事件流的相同處理引擎來重播歷史事件的能力。例如，基於日誌的訊息代理可以重播訊息（請參閱 “[重播舊訊息](/tw/ch12#sec_stream_replay)”），某些流處理器可以從 HDFS 等分散式檔案系統讀取輸入。
* 對於流處理器來說，恰好一次語義 —— 即確保輸出與未發生故障的輸出相同，即使事實上發生故障（請參閱 “[容錯](/tw/ch12#sec_stream_fault_tolerance)”）。與批處理一樣，這需要丟棄任何失敗任務的部分輸出。
* 按事件時間進行視窗化的工具，而不是按處理時間進行視窗化，因為處理歷史事件時，處理時間毫無意義（請參閱 “[時間推理](/tw/ch12#sec_stream_time)”）。例如，Apache Beam 提供了用於表達這種計算的 API，可以在 Apache Flink 或 Google Cloud Dataflow 使用。


## 分拆資料庫 {#sec_future_unbundling}

在最抽象的層面上，資料庫、批/流處理器和作業系統都在做相似的事情：儲存資料，並允許你處理和查詢這些資料[^16]。資料庫將資料儲存為某種資料模型下的記錄（例如表中的行、文件、圖頂點等），而作業系統的檔案系統將資料存為檔案；但它們本質上都可視作 “資訊管理” 系統[^17]。正如我們在 [第十一章](/tw/ch11) 中看到的，批處理系統在很多方面像是 Unix 的分散式版本。

當然，有很多實際的差異。例如，許多檔案系統都不能很好地處理包含 1000 萬個小檔案的目錄，而包含 1000 萬個小記錄的資料庫完全是尋常而不起眼的。無論如何，作業系統和資料庫之間的相似之處和差異值得探討。

Unix 和關係資料庫以非常不同的哲學來處理資訊管理問題。Unix 認為它的目的是為程式設計師提供一種相當低層次的硬體的邏輯抽象，而關係資料庫則希望為應用程式設計師提供一種高層次的抽象，以隱藏磁碟上資料結構的複雜性、併發性、崩潰恢復等等。Unix 發展出的管道和檔案只是位元組序列，而資料庫則發展出了 SQL 和事務。

哪種方法更好？當然這取決於你想要的是什麼。Unix 是 “簡單的”，因為它是對硬體資源相當薄的包裝；關係資料庫是 “更簡單” 的，因為一個簡短的宣告性查詢可以利用很多強大的基礎設施（查詢最佳化、索引、連線方法、併發控制、複製等），而不需要查詢的作者理解其實現細節。

這些哲學之間的矛盾已經持續了幾十年（Unix 和關係模型都出現在 70 年代初），仍然沒有解決。例如，我將 NoSQL 運動解釋為，希望將類 Unix 的低級別抽象方法應用於分散式 OLTP 資料儲存的領域。

在這一部分我將試圖調和這兩個哲學，希望我們能各取其美。

### 組合使用資料儲存技術 {#id447}

在本書的過程中，我們討論了資料庫提供的各種功能及其工作原理，其中包括：

* 次級索引，使你可以根據欄位的值有效地搜尋記錄（請參閱 “[其他索引結構](/tw/ch4#sec_storage_index_multicolumn)”）
* 物化檢視，這是一種預計算的查詢結果快取（請參閱 “[聚合：資料立方體和物化檢視](/tw/ch4#sec_storage_materialized_views)”）
* 複製日誌，保持其他節點上資料的副本最新（請參閱 “[複製日誌的實現](/tw/ch6#sec_replication_implementation)”）
* 全文檢索索引，允許在文字中進行關鍵字搜尋（請參閱 “[全文檢索與模糊索引](/tw/ch4#sec_storage_full_text)”），也內置於某些關係資料庫[^1]

在 [第十一章](/tw/ch11) 和 [第十二章](/tw/ch12) 中，出現了類似的主題。我們討論了如何構建全文檢索索引（請參閱 “[批處理工作流的輸出](/tw/ch11#sec_batch_output)”），瞭解了如何維護物化檢視（請參閱 “[維護物化檢視](/tw/ch12#sec_stream_mat_view)”）以及如何將變更從資料庫複製到派生資料系統（請參閱 “[變更資料捕獲](/tw/ch12#sec_stream_cdc)”）。

資料庫中內建的功能與人們用批處理和流處理器構建的派生資料系統似乎有相似之處。

#### 建立索引 {#id340}

想想當你執行 `CREATE INDEX` 在關係資料庫中建立一個新的索引時會發生什麼。資料庫必須掃描表的一致性快照，挑選出所有被索引的欄位值，對它們進行排序，然後寫出索引。然後它必須處理自一致快照以來所做的寫入操作（假設表在建立索引時未被鎖定，所以寫操作可能會繼續）。一旦完成，只要事務寫入表中，資料庫就必須繼續保持索引最新。

此過程非常類似於設定新的從庫副本（請參閱 “[設定新從庫](/tw/ch6#sec_replication_new_replica)”），也非常類似於流處理系統中的 **引導（bootstrap）** 變更資料捕獲（請參閱 “[初始快照](/tw/ch12#sec_stream_cdc_snapshot)”）。

無論何時執行 `CREATE INDEX`，資料庫都會重新處理現有資料集（如 “[應用演化後重新處理資料](#sec_future_reprocessing)” 中所述），並將該索引作為新檢視匯出到現有資料上。現有資料可能是狀態的快照，而不是所有發生變化的日誌，但兩者密切相關（請參閱 “[狀態、流和不變性](/tw/ch12#sec_stream_immutability)”）。

#### 一切的元資料庫 {#id341}

有鑑於此，我認為整個組織的資料流開始像一個巨大的資料庫[^7]。每當批處理、流處理或 ETL 過程將資料從一個地方傳輸並轉換到另一個地方時，它都像資料庫子系統在維護索引或物化檢視。

從這種角度來看，批處理和流處理器就像精心實現的觸發器、儲存過程和物化檢視維護例程。它們維護的派生資料系統就像不同的索引型別。例如，關係資料庫可能支援 B 樹索引、雜湊索引、空間索引（請參閱 “[多列索引](/tw/ch4#sec_storage_index_multicolumn)”）以及其他型別的索引。在新興的派生資料系統架構中，不是將這些設施作為單個整合資料庫產品的功能實現，而是由各種不同的軟體提供，執行在不同的機器上，由不同的團隊管理。

這些發展在未來將會把我們帶到哪裡？如果我們從沒有適合所有訪問模式的單一資料模型或儲存格式的前提出發，我推測有兩種途徑可以將不同的儲存和處理工具組合成一個有凝聚力的系統：

**聯合資料庫：統一讀取**

可以為各種各樣的底層儲存引擎和處理方法提供一個統一的查詢介面 —— 一種稱為 **聯合資料庫（federated database）** 或 **多型儲存（polystore）** 的方法[^18] [^19]。例如，PostgreSQL 的 **外部資料包裝器（foreign data wrapper）** 功能符合這種模式[^20]。需要專用資料模型或查詢介面的應用程式仍然可以直接訪問底層儲存引擎，而想要組合來自不同位置的資料的使用者可以透過聯合介面輕鬆完成操作。

聯合查詢介面遵循著單一整合系統的關係型傳統，帶有高階查詢語言和優雅的語義，但實現起來非常複雜。

**分拆資料庫：統一寫入**

雖然聯合能解決跨多個不同系統的只讀查詢問題，但它並沒有很好地解決跨系統 **同步** 寫入的問題。我們說過，在單個資料庫中，建立一致的索引是一項內建功能。當我們構建多個儲存系統時，我們同樣需要確保所有資料變更都會在所有正確的位置結束，即使在出現故障時也是如此。想要更容易地將儲存系統可靠地插接在一起（例如，透過變更資料捕獲和事件日誌），就像將資料庫的索引維護功能以可以跨不同技術同步寫入的方式分開[^7] [^21]。

分拆方法遵循 Unix 的傳統：小型工具各自做好一件事[^22]，透過統一的低層級 API（管道）進行通訊，並且可以使用更高層級的語言（shell）進行組合[^16]。

#### 開展分拆工作 {#sec_future_unbundling_favor}

聯合和分拆是一個硬幣的兩面：用不同的元件構成可靠、可伸縮和可維護的系統。聯合只讀查詢需要將一個資料模型對映到另一個資料模型，這需要一些思考，但最終還是一個可解決的問題。而我認為同步寫入到幾個儲存系統是更困難的工程問題，所以我將重點關注它。

傳統的同步寫入方法需要跨異構儲存系統的分散式事務[^18]，我認為這是錯誤的解決方案（請參閱 “[派生資料與分散式事務](#sec_future_derived_vs_transactions)”）。單個儲存或流處理系統內的事務是可行的，但是當資料跨越不同技術之間的邊界時，我認為具有冪等寫入的非同步事件日誌是一種更加健壯和實用的方法。

例如，分散式事務在某些流處理元件內部使用，以匹配 **恰好一次（exactly-once）** 語義（請參閱 “[原子提交再現](/tw/ch12#sec_stream_atomic_commit)”），這可以很好地工作。然而，當事務需要涉及由不同人群編寫的系統時（例如，當資料從流處理元件寫入分散式鍵值儲存或搜尋索引時），缺乏標準化的事務協議會使整合更難。有冪等消費者的有序事件日誌（請參閱 “[冪等性](/tw/ch12#sec_stream_idempotence)”）是一種更簡單的抽象，因此在異構系統中實現更加可行[^7]。

基於日誌的整合的一大優勢是各個元件之間的 **鬆散耦合（loose coupling）**，這體現在兩個方面：

1. 在系統級別，非同步事件流使整個系統在個別元件的中斷或效能下降時更加穩健。如果消費者執行緩慢或失敗，那麼事件日誌可以緩衝訊息（請參閱 “[磁碟空間使用](/tw/ch12#sec_stream_disk_usage)”），以便生產者和任何其他消費者可以繼續不受影響地執行。有問題的消費者可以在問題修復後趕上，因此不會錯過任何資料，故障也被抑制在本地。相比之下，分散式事務的同步互動往往會將本地故障升級為大規模故障（請參閱 “[分散式事務的限制](/tw/ch8#sec_transactions_xa)”）。
2. 在人力方面，分拆資料系統允許不同的團隊獨立開發，改進和維護不同的軟體元件和服務。專業化使得每個團隊都可以專注於做好一件事，並與其他團隊的系統以明確的介面互動。事件日誌提供了一個足夠強大的介面，以捕獲相當強的一致性屬性（由於永續性和事件的順序），但也足夠普適於幾乎任何型別的資料。

#### 分拆系統與整合系統 {#id448}

如果分拆確實成為未來的方式，它也不會取代目前形式的資料庫 —— 它們仍然會像以往一樣被需要。為了維護流處理元件中的狀態，資料庫仍然是需要的，並且為批處理和流處理器的輸出提供查詢服務（請參閱 “[批處理工作流的輸出](/tw/ch11#sec_batch_output)” 與 “[流處理](/tw/ch12#sec_stream_processing)”）。專用查詢引擎對於特定的工作負載仍然非常重要：例如，MPP 資料倉庫中的查詢引擎針對探索性分析查詢進行了最佳化，並且能夠很好地處理這種型別的工作負載（請參閱 “[Hadoop 與分散式資料庫的對比](/tw/ch11#sec_batch_distributed)”）。

執行幾種不同基礎設施的複雜性可能是一個問題：每種軟體都有一個學習曲線，配置問題和操作怪癖，因此部署儘可能少的移動部件是很有必要的。比起使用應用程式碼拼接多個工具而成的系統，單一整合軟體產品也可以在其設計應對的工作負載型別上實現更好、更可預測的效能[^23]。正如在前言中所說的那樣，為了不需要的規模而構建系統是白費精力，而且可能會將你鎖死在一個不靈活的設計中。實際上，這是一種過早最佳化的形式。

分拆的目標不是要針對個別資料庫與特定工作負載的效能進行競爭；我們的目標是允許你結合多個不同的資料庫，以便在比單個軟體可能實現的更廣泛的工作負載範圍內實現更好的效能。這是關於廣度，而不是深度 —— 與我們在 “[Hadoop 與分散式資料庫的對比](/tw/ch11#sec_batch_distributed)” 中討論的儲存和處理模型的多樣性一樣。

因此，如果有一項技術可以滿足你的所有需求，那麼最好使用該產品，而不是試圖用更低層級的元件重新實現它。只有當沒有單一軟體滿足你的所有需求時，才會出現分拆和聯合的優勢。

### 圍繞資料流設計應用 {#sec_future_dataflow}

當底層資料發生變化時去更新派生資料，這個思路並不新鮮。比如電子表格就有很強的資料流程式設計能力[^33]：你可以在一個單元格寫公式（例如對另一列求和），只要輸入變化，結果就會自動重算。這正是我們希望資料系統具備的能力：資料庫記錄一旦變化，相關索引、快取檢視和聚合結果都應自動重新整理，而不需要應用開發者關心重新整理細節。

從這個意義上說，今天很多資料系統仍可以向 VisiCalc 在 1979 年就具備的特性學習[^34]。與電子表格不同的是，現代資料系統還必須同時滿足容錯、可伸縮、持久化儲存、跨團隊異構技術整合等要求，也必須能夠複用已有庫與服務。指望所有軟體都在一種語言、框架或工具上統一實現並不現實。

#### 應用程式碼作為派生函式 {#sec_future_dataflow_derivation}

當一個資料集派生自另一個資料集時，它會經歷某種轉換函式。例如：

* 次級索引是由一種直白的轉換函式生成的派生資料集：對於基礎表中的每行或每個文件，它挑選被索引的列或欄位中的值，並按這些值排序（假設使用 B 樹或 SSTable 索引，按鍵排序，如 [第四章](/tw/ch4) 所述）。
* 全文檢索索引是透過應用各種自然語言處理函式而建立的，諸如語言檢測、分詞、詞幹或詞彙化、拼寫糾正和同義詞識別，然後構建用於高效查詢的資料結構（例如倒排索引）。
* 在機器學習系統中，我們可以將模型視作從訓練資料透過應用各種特徵提取、統計分析函式派生的資料，當模型應用於新的輸入資料時，模型的輸出是從輸入和模型（因此間接地從訓練資料）中派生的。
* 快取通常包含將以使用者介面（UI）顯示的形式的資料聚合。因此填充快取需要知道 UI 中引用的欄位；UI 中的變更可能需要更新快取填充方式的定義，並重建快取。

用於次級索引的派生函式是如此常用的需求，以致於它作為核心功能被內建至許多資料庫中，你可以簡單地透過 `CREATE INDEX` 來呼叫它。對於全文索引，常見語言的基本語言特徵可能內建到資料庫中，但更複雜的特徵通常需要領域特定的調整。在機器學習中，特徵工程是眾所周知的特定於應用的特徵，通常需要包含很多關於使用者互動與應用部署的詳細知識[^35]。

當建立派生資料集的函式不是像建立次級索引那樣的標準搬磚函式時，需要自定義程式碼來處理特定於應用的東西。而這個自定義程式碼是讓許多資料庫掙扎的地方，雖然關係資料庫通常支援觸發器、儲存過程和使用者定義的函式，可以用它們來在資料庫中執行應用程式碼，但它們有點像資料庫設計裡的事後反思（請參閱 “[傳遞事件流](/tw/ch12#sec_stream_transmit)”）。

#### 應用程式碼和狀態的分離 {#id344}

理論上，資料庫可以是任意應用程式碼的部署環境，就如同作業系統一樣。然而實踐中它們對這一目標適配的很差。它們不滿足現代應用開發的要求，例如依賴和軟體包管理、版本控制、滾動升級、可演化性、監控、指標、對網路服務的呼叫以及與外部系統的整合。

另一方面，Mesos、YARN、Docker、Kubernetes 等部署和叢集管理工具專為執行應用程式碼而設計。透過專注於做好一件事情，它們能夠做得比那些把執行使用者定義函式當作眾多功能之一的資料庫要好得多。

我認為讓系統的某些部分專門用於持久資料儲存並讓其他部分專門執行應用程式程式碼是有意義的。這兩者可以在保持獨立的同時互動。

現在大多數 Web 應用程式都是作為無狀態服務部署的，其中任何使用者請求都可以路由到任何應用程式伺服器，並且伺服器在傳送響應後會忘記所有請求。這種部署方式很方便，因為可以隨意新增或刪除伺服器，但狀態必須到某個地方：通常是資料庫。趨勢是將無狀態應用程式邏輯與狀態管理（資料庫）分開：不將應用程式邏輯放入資料庫中，也不將持久狀態置於應用程式中[^36]。正如函數語言程式設計社群喜歡開玩笑說的那樣，“我們相信 **教會（Church）** 與 **國家（state）** 的分離”[^37]。

在這個典型的 Web 應用模型中，資料庫充當一種可以透過網路同步訪問的可變共享變數。應用程式可以讀取和更新變數，而資料庫負責維持它的永續性，提供一些諸如併發控制和容錯的功能。

但是，在大多數程式語言中，你無法訂閱可變變數中的變更 —— 你只能定期讀取它。與電子表格不同，如果變數的值發生變化，變數的讀者不會收到通知（你可以在自己的程式碼中實現這樣的通知 —— 這被稱為 **觀察者模式** —— 但大多數語言沒有將這種模式作為內建功能）。

資料庫繼承了這種可變資料的被動方法：如果你想知道資料庫的內容是否發生了變化，通常你唯一的選擇就是輪詢（即定期重複你的查詢）。訂閱變更只是剛剛開始出現的功能（請參閱 “[變更流的 API 支援](/tw/ch12#sec_stream_change_api)”）。

#### 資料流：應用程式碼與狀態變化的互動 {#id450}

從資料流的角度思考應用程式，意味著重新協調應用程式碼和狀態管理之間的關係。我們不再將資料庫視作被應用操縱的被動變數，取而代之的是更多地考慮狀態，狀態變更和處理它們的程式碼之間的相互作用與協同關係。應用程式碼透過在另一個地方觸發狀態變更來響應狀態變更。

我們在 “[資料庫與流](/tw/ch12#sec_stream_databases)” 中看到了這一思路，我們討論了將資料庫的變更日誌視為一種我們可以訂閱的事件流。諸如 Actor 的訊息傳遞系統（請參閱 “[訊息傳遞中的資料流](/tw/ch5#sec_encoding_dataflow_msg)”）也具有響應事件的概念。早在 20 世紀 80 年代，**元組空間（tuple space）** 模型就已經探索了表達分散式計算的方式：觀察狀態變更並作出反應的過程[^38] [^39]。

如前所述，當觸發器由於資料變更而被觸發時，或次級索引更新以反映索引表中的變更時，資料庫內部也發生著類似的情況。分拆資料庫意味著將這個想法應用於在主資料庫之外，用於建立派生資料集：快取、全文檢索索引、機器學習或分析系統。我們可以為此使用流處理和訊息傳遞系統。

需要記住的重要一點是，維護派生資料不同於執行非同步任務。傳統的訊息傳遞系統通常是為執行非同步任務設計的（請參閱 “[日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging)”）：

* 在維護派生資料時，狀態變更的順序通常很重要（如果多個檢視是從事件日誌派生的，則需要按照相同的順序處理事件，以便它們之間保持一致）。如 “[確認與重新傳遞](/tw/ch12#sec_stream_reordering)” 中所述，許多訊息代理在重傳未確認訊息時沒有此屬性，雙寫也被排除在外（請參閱 “[保持系統同步](/tw/ch12#sec_stream_sync)”）。
* 容錯是派生資料的關鍵：僅僅丟失單個訊息就會導致派生資料集永遠與其資料來源失去同步。訊息傳遞和派生狀態更新都必須可靠。例如，許多 Actor 系統預設在記憶體中維護 Actor 的狀態和訊息，所以如果執行 Actor 的機器崩潰，狀態和訊息就會丟失。

穩定的訊息排序和容錯訊息處理是相當嚴格的要求，但與分散式事務相比，它們開銷更小，執行更穩定。現代流處理元件可以提供這些排序和可靠性保證，並允許應用程式碼以流運算元的形式執行。

這些應用程式碼可以執行任意處理，包括資料庫內建派生函式通常不提供的功能。就像透過管道連結的 Unix 工具一樣，流運算元可以圍繞著資料流構建大型系統。每個運算元接受狀態變更的流作為輸入，併產生其他狀態變化的流作為輸出。

#### 流處理器和服務 {#id345}

當今流行的應用開發風格涉及將功能分解為一組透過同步網路請求（如 REST API）進行通訊的 **服務**（service，請參閱 “[服務中的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)”）。這種面向服務的架構相比單一龐大應用的優勢，主要在於透過鬆散耦合來提供組織上的可伸縮性：不同的團隊可以專職於不同的服務上，從而減少團隊之間的協調工作（因為服務可以獨立部署和更新）。

在資料流中組裝流運算元與微服務方法有很多相似之處[^40]。但底層通訊機制有很大區別：資料流採用單向非同步訊息流，而不是同步的請求 / 響應式互動。

除了在 “[訊息傳遞中的資料流](/tw/ch5#sec_encoding_dataflow_msg)” 中列出的優點（如更好的容錯性），資料流系統還能實現更好的效能。例如，假設客戶正在購買以一種貨幣定價，但以另一種貨幣支付的商品。為了執行貨幣換算，你需要知道當前的匯率。這個操作可以透過兩種方式實現[^40] [^41]：

1. 在微服務方法中，處理購買的程式碼可能會查詢匯率服務或資料庫，以獲取特定貨幣的當前匯率。
2. 在資料流方法中，處理訂單的程式碼會提前訂閱匯率變更流，並在匯率發生變動時將當前匯率儲存在本地資料庫中。處理訂單時只需查詢本地資料庫即可。

第二種方法能將對另一服務的同步網路請求替換為對本地資料庫的查詢（可能在同一臺機器甚至同一個程序中）。資料流方法不僅更快，而且當其他服務失效時也更穩健。最快且最可靠的網路請求就是壓根沒有網路請求！我們現在不再使用 RPC，而是在購買事件和匯率更新事件之間建立流聯接（請參閱 “[流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)”）。

連線是時間相關的：如果購買事件在稍後的時間點被重新處理，匯率可能已經改變。如果要重建原始輸出，則需要獲取原始購買時的歷史匯率。無論是查詢服務還是訂閱匯率更新流，你都需要處理這種時間相關性（請參閱 “[連線的時間依賴性](/tw/ch12#sec_stream_join_time)”）。

訂閱變更流，而不是在需要時查詢當前狀態，使我們更接近類似電子表格的計算模型：當某些資料發生變更時，依賴於此的所有派生資料都可以快速更新。還有很多未解決的問題，例如關於時間相關連線等問題，但我認為圍繞資料流構建應用的想法是一個非常有希望的方向。

### 觀察派生資料狀態 {#sec_future_observing}

在抽象層面，上一節討論的資料流系統給出了建立並維護派生資料集（如搜尋索引、物化檢視、預測模型）的過程。我們把這稱為 **寫路徑（write path）**：當資訊寫入系統後，它可能經過多個批處理與流處理階段，最終所有相關派生資料集都會被更新。[圖 13-1](#fig_future_write_read_paths) 展示了搜尋索引更新的例子。

{{< figure src="/fig/ddia_1301.png" id="fig_future_write_read_paths" caption="圖 13-1 在搜尋索引中，寫入（文件更新）與讀取（查詢）相遇。" class="w-full my-4" >}}

但你為什麼一開始就要建立派生資料集？很可能是因為你想在以後再次查詢它。這就是 **讀路徑（read path）**：當服務使用者請求時，你需要從派生資料集中讀取，也許還要對結果進行一些額外處理，然後構建給使用者的響應。

總而言之，寫路徑和讀路徑涵蓋了資料的整個旅程，從收集資料開始，到使用資料結束（可能是由另一個人）。寫路徑是預計算過程的一部分 —— 即，一旦資料進入，即刻完成，無論是否有人需要看它。讀路徑是這個過程中只有當有人請求時才會發生的部分。如果你熟悉函數語言程式設計語言，則可能會注意到寫路徑類似於立即求值，讀路徑類似於惰性求值。

如 [圖 13-1](#fig_future_write_read_paths) 所示，派生資料集是寫路徑和讀路徑相遇的地方。它代表了寫入時工作量與讀取時工作量之間的權衡。

#### 物化檢視和快取 {#id451}

全文檢索索引就是一個很好的例子：寫路徑更新索引，讀路徑在索引中搜尋關鍵字。讀寫都需要做一些工作。寫入需要更新文件中出現的所有關鍵詞的索引條目。讀取需要搜尋查詢中的每個單詞，並應用布林邏輯，找出包含查詢中所有單詞的文件（AND 運算子），或包含每個單詞任一同義詞的文件（OR 運算子）。

如果沒有索引，搜尋查詢將不得不掃描所有文件（如 grep），如果有著大量文件，這樣做的開銷巨大。沒有索引意味著寫入路徑上的工作量較少（沒有要更新的索引），但是在讀取路徑上需要更多工作。

另一方面，可以想象為所有可能的查詢預先計算搜尋結果。在這種情況下，讀路徑上的工作量會減少：不需要布林邏輯，只需查詢查詢結果並返回即可。但寫路徑會更加昂貴：可能的搜尋查詢集合是無限大的，因此預先計算所有可能的搜尋結果將需要無限的時間和儲存空間，這在實踐中不可行。

另一種選擇是預先計算一組固定的最常見查詢的搜尋結果，以便可以快速提供它們而無需轉到索引。不常見的查詢仍然可以透過索引來提供服務。這通常被稱為常見查詢的 **快取（cache）**，儘管我們也可以稱之為 **物化檢視（materialized view）**，因為當新文件出現，且需要被包含在這些常見查詢的搜尋結果之中時，這些索引就需要更新。

從這個例子中我們可以看到，索引不是寫路徑和讀路徑之間唯一可能的邊界；快取常見搜尋結果也是可行的；而在少量文件上使用沒有索引的類 grep 掃描也是可行的。由此來看，快取，索引和物化檢視的作用很簡單：它們改變了讀路徑與寫路徑之間的邊界。透過預先計算結果，從而允許我們在寫路徑上做更多的工作，以節省讀路徑上的工作量。

在寫路徑上完成的工作和讀路徑之間的界限，實際上是本書開始處在 “[描述負載](/tw/ch2#sec_introduction_twitter)” 中推特例子裡談到的主題。在該例中，我們還看到了與普通使用者相比，名人的寫路徑和讀路徑可能有所不同。在 500 頁之後，我們已經繞回了起點！

#### 有狀態、可離線的客戶端 {#id347}

我發現寫路徑和讀路徑之間的邊界很有趣，因為我們可以試著改變這個邊界，並探討這種改變的實際意義。我們來看看不同上下文中的這一想法。

過去二十年來，Web 應用的火熱讓我們對應用開發作出了一些很容易視作理所當然的假設。具體來說就是，客戶端 / 伺服器模型 —— 客戶端大多是無狀態的，而伺服器擁有資料的權威 —— 已經普遍到我們幾乎忘掉了還有其他任何模型的存在。但是技術在不斷地發展，我認為不時地質疑現狀非常重要。

傳統上，網路瀏覽器是無狀態的客戶端，只有當連線到網際網路時才能做一些有用的事情（能離線執行的唯一事情基本上就是上下滾動之前線上時載入好的頁面）。然而，最近的 “單頁面” JavaScript Web 應用已經獲得了很多有狀態的功能，包括客戶端使用者介面互動，以及 Web 瀏覽器中的持久化本地儲存。移動應用可以類似地在裝置上儲存大量狀態，而且大多數使用者互動都不需要與伺服器往返互動。

這些不斷變化的功能重新引發了對 **離線優先（offline-first）** 應用的興趣，這些應用儘可能地在同一裝置上使用本地資料庫，無需連線網際網路，並在後臺網路連線可用時與遠端伺服器同步[^42]。由於移動裝置通常具有緩慢且不可靠的蜂窩網路連線，因此，如果使用者介面不必等待同步網路請求，且應用主要是離線工作的，則這是一個巨大優勢（請參閱 “[需要離線操作的客戶端](/tw/ch6#sec_replication_offline_clients)”）。

當我們擺脫無狀態客戶端與中央資料庫互動的假設，並轉向在終端使用者裝置上維護狀態時，這就開啟了新世界的大門。特別是，我們可以將裝置上的狀態視為 **伺服器狀態的快取**。螢幕上的畫素是客戶端應用中模型物件的物化檢視；模型物件是遠端資料中心的本地狀態副本[^27]。

#### 將狀態變更推送給客戶端 {#id348}

在典型的網頁中，如果你在 Web 瀏覽器中載入頁面，並且隨後伺服器上的資料發生變更，則瀏覽器在重新載入頁面之前對此一無所知。瀏覽器只能在一個時間點讀取資料，假設它是靜態的 —— 它不會訂閱來自伺服器的更新。因此裝置上的狀態是陳舊的快取，除非你顯式輪詢變更否則不會更新。（像 RSS 這樣基於 HTTP 的 Feed 訂閱協議實際上只是一種基本的輪詢形式）

最近的協議已經超越了 HTTP 的基本請求 / 響應模式：服務端傳送的事件（EventSource API）和 WebSockets 提供了通訊通道，透過這些通道，Web 瀏覽器可以與伺服器保持開啟的 TCP 連線，只要瀏覽器仍然連線著，伺服器就能主動向瀏覽器推送資訊。這為伺服器提供了主動通知終端使用者客戶端的機會，伺服器能告知客戶端其本地儲存狀態的任何變化，從而減少客戶端狀態的陳舊程度。

用我們的寫路徑與讀路徑模型來講，主動將狀態變更推送到客戶端裝置，意味著將寫路徑一直延伸到終端使用者。當客戶端首次初始化時，它仍然需要使用讀路徑來獲取其初始狀態，但此後它就能夠依賴伺服器傳送的狀態變更流了。我們在流處理和訊息傳遞部分討論的想法並不侷限於資料中心中：我們可以進一步採納這些想法，並將它們一直延伸到終端使用者裝置[^43]。

這些裝置有時會離線，並在此期間無法收到伺服器狀態變更的任何通知。但是我們已經解決了這個問題：在 “[消費者偏移量](/tw/ch12#sec_stream_log_offsets)” 中，我們討論了基於日誌的訊息代理的消費者能在失敗或斷開連線後重連，並確保它不會錯過掉線期間任何到達的訊息。同樣的技術適用於單個使用者，每個裝置都是一個小事件流的小小訂閱者。

#### 端到端的事件流 {#id349}

最近用於開發有狀態的客戶端與使用者介面的工具，例如 Elm 語言[^30]和 Facebook 的 React、Flux 和 Redux 工具鏈，已經透過訂閱表示使用者輸入或伺服器響應的事件流來管理客戶端的內部狀態，其結構與事件溯源相似（請參閱 “[事件溯源](/tw/ch12#sec_stream_event_sourcing)”）。

將這種程式設計模型擴充套件為：允許伺服器將狀態變更事件推送到客戶端的事件管道中，是非常自然的。因此，狀態變化可以透過 **端到端（end-to-end）** 的寫路徑流動：從一個裝置上的互動觸發狀態變更開始，經由事件日誌，並穿過幾個派生資料系統與流處理器，一直到另一臺裝置上的使用者介面，而有人正在觀察使用者介面上的狀態變化。這些狀態變化能以相當低的延遲傳播 —— 比如說，在一秒內從一端到另一端。

一些應用（如即時訊息傳遞與線上遊戲）已經具有這種 “即時” 架構（在低延遲互動的意義上，不是在 “[響應時間保證](/tw/ch9#sec_distributed_clocks_realtime)” 中的意義上）。但我們為什麼不用這種方式構建所有的應用？

挑戰在於，關於無狀態客戶端和請求 / 響應互動的假設已經根深蒂固地植入在我們的資料庫、庫、框架以及協議之中。許多資料儲存支援讀取與寫入操作，為請求返回一個響應，但只有極少數提供訂閱變更的能力 —— 請求返回一個隨時間推移的響應流（請參閱 “[變更流的 API 支援](/tw/ch12#sec_stream_change_api)” ）。

為了將寫路徑延伸至終端使用者，我們需要從根本上重新思考我們構建這些系統的方式：從請求 / 響應互動轉向釋出 / 訂閱資料流[^27]。更具響應性的使用者介面與更好的離線支援，我認為這些優勢值得我們付出努力。如果你正在設計資料系統，我希望你對訂閱變更的選項留有印象，而不只是查詢當前狀態。

#### 讀也是事件 {#sec_future_read_events}

我們討論過，當流處理器將派生資料寫入儲存（資料庫，快取或索引）時，以及當用戶請求查詢該儲存時，儲存將充當寫路徑和讀路徑之間的邊界。該儲存應當允許對資料進行隨機訪問的讀取查詢，否則這些查詢將需要掃描整個事件日誌。

在很多情況下，資料儲存與流處理系統是分開的。但回想一下，流處理器還是需要維護狀態以執行聚合和連線的（請參閱 “[流連線](/tw/ch12#sec_stream_joins)”）。這種狀態通常隱藏在流處理器內部，但一些框架也允許這些狀態被外部客戶端查詢[^45]，將流處理器本身變成一種簡單的資料庫。

我願意進一步思考這個想法。正如到目前為止所討論的那樣，對儲存的寫入是透過事件日誌進行的，而讀取是臨時的網路請求，直接流向儲存著待查資料的節點。這是一個合理的設計，但不是唯一可行的設計。也可以將讀取請求表示為事件流，並同時將讀事件與寫事件送往流處理器；流處理器透過將讀取結果傳送到輸出流來響應讀取事件[^46]。

當寫入和讀取都被表示為事件，並且被路由到同一個流運算元以便處理時，我們實際上是在讀取查詢流和資料庫之間執行流表連線。讀取事件需要被送往儲存資料的資料庫分割槽（請參閱 “[請求路由](/tw/ch7#sec_sharding_routing)”），就像批處理和流處理器在連線時需要在同一個鍵上對輸入分割槽一樣（請參閱 “[Reduce 側連線與分組](/tw/ch11#sec_batch_join)”）。

服務請求與執行連線之間的這種相似之處是非常關鍵的[^47]。一次性讀取請求只是將請求傳過連線運算元，然後請求馬上就被忘掉了；而一個訂閱請求，則是與連線另一側過去與未來事件的持久化連線。

記錄讀取事件的日誌可能對於追蹤整個系統中的因果關係與資料來源也有好處：它可以讓你重現出當用戶做出特定決策之前看見了什麼。例如在網商中，向客戶顯示的預測送達日期與庫存狀態，可能會影響他們是否選擇購買一件商品[^4]。要分析這種聯絡，則需要記錄使用者查詢運輸與庫存狀態的結果。

將讀取事件寫入持久儲存可以更好地跟蹤因果關係（請參閱 “[排序事件以捕獲因果關係](#sec_future_capture_causality)”），但會產生額外的儲存與 I/O 成本。最佳化這些系統以減少開銷仍然是一個開放的研究問題[^2]。但如果你已經出於運維目的留下了讀取請求日誌，將其作為請求處理的副作用，那麼將這份日誌作為請求事件源並不是什麼特別大的變更。

#### 多分割槽資料處理 {#sec_future_unbundled_multi_shard}

對於只涉及單個分割槽的查詢，透過流來發送查詢與收集響應可能是殺雞用牛刀了。然而，這個想法開啟了分散式執行複雜查詢的可能性，這需要合併來自多個分割槽的資料，利用了流處理器已經提供的訊息路由、分割槽和連線的基礎設施。

Storm 的分散式 RPC 功能支援這種使用模式（請參閱 “[訊息傳遞和 RPC](/tw/ch12#sec_stream_actors_drpc)”）。例如，它已經被用來計算瀏覽過某個推特 URL 的人數 —— 即，發推包含該 URL 的所有人的粉絲集合的並集[^48]。由於推特的使用者是分割槽的，因此這種計算需要合併來自多個分割槽的結果。

這種模式的另一個例子是欺詐預防：為了評估特定購買事件是否具有欺詐風險，你可以檢查該使用者 IP 地址，電子郵件地址，帳單地址，送貨地址的信用分。這些信用資料庫中的每一個都是有分割槽的，因此為特定購買事件採集分數需要連線一系列不同的分割槽資料集[^49]。

MPP 資料庫的內部查詢執行圖有著類似的特徵（請參閱 “[Hadoop 與分散式資料庫的對比](/tw/ch11#sec_batch_distributed)”）。如果需要執行這種多分割槽連線，則直接使用提供此功能的資料庫，可能要比使用流處理器實現它要更簡單。然而將查詢視為流提供了一種選項，可以用於實現超出傳統現成解決方案的大規模應用。


## 追求正確性 {#sec_future_correctness}

對於只讀取資料的無狀態服務，出問題也沒什麼大不了的：你可以修復該錯誤並重啟服務，而一切都恢復正常。像資料庫這樣的有狀態系統就沒那麼簡單了：它們被設計為永遠記住事物（或多或少），所以如果出現問題，這種（錯誤的）效果也將潛在地永遠持續下去，這意味著它們需要更仔細的思考[^50]。

我們希望構建可靠且 **正確** 的應用（即使面對各種故障，程式的語義也能被很好地定義與理解）。約四十年來，原子性、隔離性和永續性（[第八章](/tw/ch8)）等事務特性一直是構建正確應用的首選工具。然而這些地基沒有看上去那麼牢固：例如弱隔離級別帶來的困惑可以佐證（請參閱 “[弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)”）。

事務在某些領域被完全拋棄，並被提供更好效能與可伸縮性的模型取代，但後者有更複雜的語義（例如，請參閱 “[無主複製](/tw/ch6#sec_replication_leaderless)”）。**一致性（Consistency）** 經常被談起，但其定義並不明確（請參閱 “[一致性](/tw/ch8#sec_transactions_acid_consistency)” 和 [第十章](/tw/ch10)）。有些人斷言我們應當為了高可用而 “擁抱弱一致性”，但卻對這些概念實際上意味著什麼缺乏清晰的認識。

對於如此重要的話題，我們的理解，以及我們的工程方法卻是驚人地薄弱。例如，確定在特定事務隔離等級或複製配置下執行特定應用是否安全是非常困難的[^51] [^52]。通常簡單的解決方案似乎在低併發性的情況下工作正常，並且沒有錯誤，但在要求更高的情況下卻會出現許多微妙的錯誤。

例如，Kyle Kingsbury 的 Jepsen 實驗[^53]標出了一些產品聲稱的安全保證與其在網路問題與崩潰時的實際行為之間的明顯差異。即使像資料庫這樣的基礎設施產品沒有問題，應用程式碼仍然需要正確使用它們提供的功能才行，如果配置很難理解，這是很容易出錯的（在這種情況下指的是弱隔離級別，法定人數配置等）。

如果你的應用可以容忍偶爾的崩潰，以及以不可預料的方式損壞或丟失資料，那生活就要簡單得多，而你可能只要雙手合十念阿彌陀佛，期望佛祖能保佑最好的結果。另一方面，如果你需要更強的正確性保證，那麼可序列化與原子提交就是久經考驗的方法，但它們是有代價的：它們通常只在單個數據中心中工作（這就排除了地理位置分散的架構），並限制了系統能夠實現的規模與容錯特性。

雖然傳統的事務方法並不會就此消失，但我也相信，在使應用正確且能靈活應對故障方面，事務並不是最終的定論。在本節中，我將提出一些在資料流架構中考量正確性的方式。

### 資料庫的端到端原則 {#sec_future_end_to_end}

僅僅因為一個應用程式使用了具有相對較強安全屬性的資料系統（例如可序列化的事務），並不意味著就可以保證沒有資料丟失或損壞。例如，如果某個應用有個 Bug，導致它寫入不正確的資料，或者從資料庫中刪除資料，那麼可序列化的事務也救不了你。

這個例子可能看起來很無聊，但值得認真對待：應用會出 Bug，而人也會犯錯誤。我在 “[狀態、流和不變性](/tw/ch12#sec_stream_immutability)” 中使用了這個例子來支援不可變和僅追加的資料，閹割掉錯誤程式碼摧毀良好資料的能力，能讓從錯誤中恢復更為容易。

雖然不變性很有用，但它本身並非萬靈藥。讓我們來看一個可能發生的、非常微妙的資料損壞案例。

#### 恰好執行一次操作 {#id353}

在 “[容錯](/tw/ch12#sec_stream_fault_tolerance)” 中，我們見到了 **恰好一次**（或 **等效一次**）語義的概念。如果在處理訊息時出現問題，你可以選擇放棄（丟棄訊息 —— 導致資料丟失）或重試。如果重試，就會有這種風險：第一次實際上成功了，只不過你沒有發現。結果這個訊息就被處理了兩次。

處理兩次是資料損壞的一種形式：為同樣的服務向客戶收費兩次（收費太多）或增長計數器兩次（誇大指標）都不是我們想要的。在這種情況下，恰好一次意味著安排計算，使得最終效果與沒有發生錯誤的情況一樣，即使操作實際上因為某種錯誤而重試。我們先前討論過實現這一目標的幾種方法。

最有效的方法之一是使操作 **冪等**（idempotent，請參閱 “[冪等性](/tw/ch12#sec_stream_idempotence)”）：即確保它無論是執行一次還是執行多次都具有相同的效果。但是，將不是天生冪等的操作變為冪等的操作需要一些額外的努力與關注：你可能需要維護一些額外的元資料（例如更新了值的操作 ID 集合），並在從一個節點故障切換至另一個節點時做好防護（請參閱 “[領導者和鎖](/tw/ch9#sec_distributed_lock_fencing)”）。

#### 抑制重複 {#id354}

除了流處理之外，其他許多地方也需要抑制重複的模式。例如，TCP 使用了資料包上的序列號，以便接收方可以將它們正確排序，並確定網路上是否有資料包丟失或重複。在將資料交付應用前，TCP 協議棧會重新傳輸任何丟失的資料包，也會移除任何重複的資料包。

但是，這種重複抑制僅適用於單條 TCP 連線的場景中。假設 TCP 連線是一個客戶端與資料庫的連線，並且它正在執行 [例 13-1](#fig_future_non_idempotent) 中的事務。在許多資料庫中，事務是繫結在客戶端連線上的（如果客戶端傳送了多個查詢，資料庫就知道它們屬於同一個事務，因為它們是在同一個 TCP 連線上傳送的）。如果客戶端在傳送 `COMMIT` 之後並在從資料庫伺服器收到響應之前遇到網路中斷與連線超時，客戶端是不知道事務是否已經被提交的（[圖 9-1](/tw/ch9#fig_distributed_network)）。

<a id="fig_future_non_idempotent"></a>

##### 例 13-1 資金從一個賬戶到另一個賬戶的非冪等轉移

```sql
BEGIN TRANSACTION;
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

客戶端可以重連到資料庫並重試事務，但現在已經處於 TCP 重複抑制的範圍之外了。因為 [例 13-1](#fig_future_non_idempotent) 中的事務不是冪等的，可能會發生轉了 \$22 而不是期望的 \$11。因此，儘管 [例 13-1](#fig_future_non_idempotent) 是一個事務原子性的標準樣例，但它實際上並不正確，而真正的銀行並不會這樣辦事[^3]。

兩階段提交（請參閱 “[原子提交與兩階段提交](/tw/ch8#sec_transactions_2pc)”）協議會破壞 TCP 連線與事務之間的 1:1 對映，因為它們必須在故障後允許事務協調器重連到資料庫，告訴資料庫將存疑事務提交還是中止。這足以確保事務只被恰好執行一次嗎？不幸的是，並不能。

即使我們可以抑制資料庫客戶端與伺服器之間的重複事務，我們仍然需要擔心終端使用者裝置與應用伺服器之間的網路。例如，如果終端使用者的客戶端是 Web 瀏覽器，則它可能會使用 HTTP POST 請求向伺服器提交指令。也許使用者正處於一個訊號微弱的蜂窩資料網路連線中，它們成功地傳送了 POST，但卻在能夠從伺服器接收響應之前沒了訊號。

在這種情況下，可能會向使用者顯示錯誤訊息，而他們可能會手動重試。Web 瀏覽器警告說，“你確定要再次提交這個表單嗎？” —— 使用者選 “是”，因為他們希望操作發生（Post/Redirect/Get 模式[^54]可以避免在正常操作中出現此警告訊息，但 POST 請求超時就沒辦法了）。從 Web 伺服器的角度來看，重試是一個獨立的請求；從資料庫的角度來看，這是一個獨立的事務。通常的除重機制無濟於事。

#### 操作識別符號 {#id355}

要在通過幾跳的網路通訊上使操作具有冪等性，僅僅依賴資料庫提供的事務機制是不夠的，你需要考慮 **端到端（end-to-end）** 的請求流。
例如，你可以為操作生成一個唯一識別符號（例如 UUID），並將其作為隱藏表單欄位包含在客戶端應用中，或透過計算所有相關表單欄位的雜湊來生成操作 ID[^3]。如果瀏覽器提交了兩次 POST，請求會攜帶相同操作 ID。你就可以把這個 ID 貫穿傳遞到資料庫，並確保同一個 ID 最多隻執行一次，如 [例 13-2](#fig_future_request_id) 所示。

<a id="fig_future_request_id"></a>

##### 例 13-2 使用唯一 ID 抑制重複請求

```sql
ALTER TABLE requests ADD UNIQUE (request_id);

BEGIN TRANSACTION;
    INSERT INTO requests
        (request_id, from_account, to_account, amount)
        VALUES('0286FDB8-D7E1-423F-B40B-792B3608036C', 4321, 1234, 11.00);
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

[例 13-2](#fig_future_request_id) 依賴於 `request_id` 列上的唯一約束。如果事務嘗試插入已存在的 ID，`INSERT` 會失敗並中止事務，從而避免重複生效。即使在較弱隔離級別下，關係資料庫通常也能正確維護唯一性約束（而應用層的 “先檢查再插入” 在不可序列化隔離下可能失敗，見 “[寫入偏差與幻讀](/tw/ch8#sec_transactions_write_skew)”）。

除了抑制重複請求，[例 13-2](#fig_future_request_id) 中的 `requests` 表本身也像一份事件日誌，可用於事件溯源或變更資料捕獲。賬戶餘額更新並不一定要與事件插入放在同一事務中，因為餘額是可由下游消費者從請求事件派生出的冗餘狀態；只要請求事件被恰好處理一次（同樣可透過請求 ID 保證），即可保持正確性。

#### 端到端原則 {#sec_future_e2e_argument}

抑制重複事務的這種情況只是一個更普遍的原則的一個例子，這個原則被稱為 **端到端原則（end-to-end argument）**，它在 1984 年由 Saltzer、Reed 和 Clark 闡述[^55]：

> 只有在通訊系統兩端應用的知識與幫助下，所討論的功能才能完全地正確地實現。因而將這種被質疑的功能作為通訊系統本身的功能是不可能的（有時，通訊系統可以提供這種功能的不完備版本，可能有助於提高效能）。
>

在我們的例子中 **所討論的功能** 是重複抑制。我們看到 TCP 在 TCP 連線層次抑制了重複的資料包，一些流處理器在訊息處理層次提供了所謂的恰好一次語義，但這些都無法阻止當一個請求超時時，使用者親自提交重複的請求。TCP，資料庫事務，以及流處理器本身並不能完全排除這些重複。解決這個問題需要一個端到端的解決方案：從終端使用者的客戶端一路傳遞到資料庫的事務識別符號。

端到端原則也適用於檢查資料的完整性：乙太網，TCP 和 TLS 中內建的校驗和可以檢測網路中資料包的損壞情況，但是它們無法檢測到由連線兩端傳送 / 接收軟體中 Bug 導致的損壞，或資料儲存所在磁碟上的損壞。如果你想捕獲資料所有可能的損壞來源，你也需要端到端的校驗和。

類似的原則也適用於加密[^55]：家庭 WiFi 網路上的密碼可以防止人們竊聽你的 WiFi 流量，但無法阻止網際網路上其他地方攻擊者的窺探；客戶端與伺服器之間的 TLS/SSL 可以阻擋網路攻擊者，但無法阻止惡意伺服器。只有端到端的加密和認證可以防止所有這些事情。

儘管低層級的功能（TCP 重複抑制、乙太網校驗和、WiFi 加密）無法單獨提供所需的端到端功能，但它們仍然很有用，因為它們能降低較高層級出現問題的可能性。例如，如果我們沒有 TCP 來將資料包排成正確的順序，那麼 HTTP 請求通常就會被攪爛。我們只需要記住，低級別的可靠性功能本身並不足以確保端到端的正確性。

#### 在資料系統中應用端到端思考 {#id357}

這將我帶回最初的論點：僅僅因為應用使用了提供相對較強安全屬性的資料系統，例如可序列化的事務，並不意味著應用的資料就不會丟失或損壞了。應用本身也需要採取端到端的措施，例如除重。

這實在是一個遺憾，因為容錯機制很難弄好。低層級的可靠機制（比如 TCP 中的那些）執行得相當好，因而剩下的高層級錯誤基本很少出現。如果能將這些剩下的高層級容錯機制打包成抽象，而應用不需要再去操心，那該多好呀 —— 但恐怕我們還沒有找到這一正確的抽象。

長期以來，事務被認為是一個很好的抽象，我相信它們確實是很有用的。正如 [第八章](/tw/ch8) 中所討論的，它們將各種可能的問題（併發寫入、違背約束、崩潰、網路中斷、磁碟故障）合併為兩種可能結果：提交或中止。這是對程式設計模型而言的一種巨大簡化，但這還不夠。

事務是代價高昂的，當涉及異構儲存技術時尤為甚（請參閱 “[實踐中的分散式事務](/tw/ch8#sec_transactions_xa)”）。我們拒絕使用分散式事務是因為它開銷太大，結果我們最後不得不在應用程式碼中重新實現容錯機制。正如本書中大量的例子所示，對併發性與部分失敗的推理是困難且違反直覺的，所以我懷疑大多數應用級別的機制都不能正確工作，最終結果是資料丟失或損壞。

出於這些原因，我認為探索對容錯的抽象是很有價值的。它使提供應用特定的端到端的正確性屬性變得更簡單，而且還能在大規模分散式環境中提供良好的效能與運維特性。

### 強制約束 {#sec_future_constraints}

讓我們思考一下在 [分拆資料庫](#sec_future_unbundling) 上下文中的 **正確性（correctness）**。我們看到端到端的除重可以透過從客戶端一路透傳到資料庫的請求 ID 實現。那麼其他型別的約束呢？

我們先來特別關注一下 **唯一性約束** —— 例如我們在 [例 13-2](#fig_future_request_id) 中所依賴的約束。在 “[約束和唯一性保證](/tw/ch10#sec_consistency_uniqueness)” 中，我們看到了幾個其他需要強制實施唯一性的應用功能例子：使用者名稱或電子郵件地址必須唯一標識使用者，檔案儲存服務不能包含多個重名檔案，兩個人不能在航班或劇院預訂同一個座位。

其他型別的約束也非常類似：例如，確保帳戶餘額永遠不會變為負數，確保不會超賣庫存，或者會議室沒有重複的預訂。執行唯一性約束的技術通常也可以用於這些約束。

#### 唯一性約束需要達成共識 {#id452}

在 [第十章](/tw/ch10) 中我們看到，在分散式環境中，強制執行唯一性約束需要共識：如果存在多個具有相同值的併發請求，則系統需要決定衝突操作中的哪一個被接受，並拒絕其他違背約束的操作。

達成這一共識的最常見方式是使單個節點作為領導，並使其負責所有決策。只要你不介意所有請求都擠過單個節點（即使客戶端位於世界的另一端），只要該節點沒有失效，系統就能正常工作。如果你需要容忍領導者失效，那麼就又回到了共識問題（請參閱 “[單主複製與共識](/tw/ch10#from-single-leader-replication-to-consensus)”）。

唯一性檢查可以透過對唯一性欄位分割槽做橫向伸縮。例如，如果需要透過請求 ID 確保唯一性（如 [例 13-2](#fig_future_request_id) 所示），你可以確保所有具有相同請求 ID 的請求都被路由到同一分割槽（請參閱 [第七章](/tw/ch7)）。如果你需要讓使用者名稱是唯一的，則可以按使用者名稱的雜湊值做分割槽。

但非同步多主複製排除在外，因為可能會發生不同主庫同時接受衝突寫操作的情況，因而這些值不再是唯一的（請參閱 “[實現線性一致的系統](/tw/ch10#sec_consistency_implementing_linearizable)”）。如果你想立刻拒絕任何違背約束的寫入，同步協調是無法避免的[^56]。

#### 基於日誌訊息傳遞中的唯一性 {#sec_future_uniqueness_log}

日誌確保所有消費者以相同順序看到訊息，這在形式上稱為 **全序廣播（total order broadcast）**，並且等價於共識（請參閱 “[全序廣播](/tw/ch10#sec_consistency_total_order)”）。在基於日誌訊息傳遞的分拆資料庫方案中，我們可以用同樣的思路來實施唯一性約束。

流處理器在單個執行緒上依次消費單個日誌分割槽中的所有訊息（請參閱 “[日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging)”）。因此，如果日誌是按需要確保唯一的值做的分割槽，則流處理器可以無歧義地、確定性地決定幾個衝突操作中的哪一個先到達。例如，在多個使用者嘗試宣告相同使用者名稱的情況下[^57]：

1. 每個對使用者名稱的請求都被編碼為一條訊息，並追加到按使用者名稱雜湊值確定的分割槽。
2. 流處理器依序讀取日誌中的請求，並使用本地資料庫來追蹤哪些使用者名稱已經被佔用了。對於所有申請可用使用者名稱的請求，它都會記錄該使用者名稱，並向輸出流傳送一條成功訊息。對於所有申請已佔用使用者名稱的請求，它都會向輸出流傳送一條拒絕訊息。
3. 請求使用者名稱的客戶端監視輸出流，等待與其請求相對應的成功或拒絕訊息。

該演算法基本上與 “[使用全序廣播實現線性一致的儲存](/tw/ch10#sec_consistency_total_order)” 中的演算法相同。它可以簡單地透過增加分割槽數伸縮至較大的請求吞吐量，因為每個分割槽都可以被獨立處理。

該方法不僅適用於唯一性約束，而且適用於許多其他型別的約束。其基本原理是，任何可能衝突的寫入都會路由到相同的分割槽並按順序處理。正如 “[什麼是衝突？](/tw/ch6#what-is-a-conflict)” 與 “[寫入偏差與幻讀](/tw/ch8#sec_transactions_write_skew)” 中所述，衝突的定義可能取決於應用，但流處理器可以使用任意邏輯來驗證請求。這個想法與 Bayou 在 90 年代開創的方法類似[^58]。

#### 多分割槽請求處理 {#id360}

當請求涉及多個分割槽時，如何在滿足約束的同時保證原子效果，會更有挑戰性。在 [例 13-2](#fig_future_request_id) 中，至少可能涉及三個分割槽：請求 ID 所在分割槽、收款賬戶所在分割槽、付款賬戶所在分割槽。它們彼此獨立，並不必然位於同一分割槽。

在傳統資料庫方案裡，這類事務通常需要跨分割槽原子提交；這會把事務強行納入跨分割槽全序，從而引入同步協調開銷並影響吞吐量。
但使用分割槽日誌與流處理器，也可以在不使用跨分割槽原子提交的情況下達到等價正確性。

{{< figure src="/fig/ddia_1302.png" id="fig_future_multi_shard" caption="圖 13-2 使用事件日誌與流處理器，檢查源賬戶是否有足夠餘額，並將資金原子地劃轉到目標賬戶與手續費賬戶。" class="w-full my-4" >}}

1.  客戶端為轉賬請求生成全域性唯一請求 ID，並將請求按源賬戶 ID 路由到相應日誌分割槽。
2.  一個流處理器消費該請求日誌，並維護源賬戶本地狀態及已處理請求 ID 集。遇到新請求 ID 時，先檢查餘額是否充足；若充足，則在本地狀態中預留金額，併發出多個後續事件：源賬戶的出賬事件、目標賬戶的入賬事件、手續費賬戶的入賬事件。所有事件都攜帶同一請求 ID。
3.  源賬戶處理器稍後會再次收到出賬事件。它根據請求 ID 識別出這是先前預留過的支付，執行真正扣款並更新本地狀態；若重複到達則忽略。
4.  目標賬戶與手續費賬戶各自由獨立處理任務消費。收到入賬事件後更新本地狀態，並基於請求 ID 去重。

圖 13-2 雖然畫成三個賬戶落在三個分割槽中，但即使在同一分割槽也同樣成立。關鍵條件是：同一賬戶的事件必須按日誌順序處理，且訊息投遞具備至少一次語義，處理邏輯保持確定性。

如果源賬戶處理器在處理中崩潰，恢復後會重放相同請求並做出相同決策，發出相同請求 ID 的後續事件。下游消費者會基於請求 ID 去重，因此不會重複生效。

這個系統的原子性不來自分散式事務，而來自初始請求事件寫入源賬戶日誌這一原子動作。只要這個起點事件寫入成功，後續事件最終都會出現：它們可能因故障恢復而延遲，也可能短暫重複，但最終可達。

透過把多分割槽事務拆成多個按不同鍵分割槽的階段，並貫穿端到端請求 ID，我們在故障場景下依然能保證“每個請求對付款方與收款方都恰好生效一次”，同時避免使用原子提交協議。

### 及時性與完整性 {#sec_future_integrity}

事務的一個便利屬性是，它們通常是線性一致的（請參閱 “[線性一致性](/tw/ch10#sec_consistency_linearizability)”），也就是說，寫入者會等到事務提交，而之後其寫入立刻對所有讀取者可見。

當我們把一個操作拆分為跨越多個階段的流處理器時，卻並非如此：日誌的消費者在設計上就是非同步的，因此傳送者不會等其訊息被消費者處理完。但是，客戶端等待輸出流中的特定訊息是可能的。這正是我們在 “[基於日誌訊息傳遞中的唯一性](#sec_future_uniqueness_log)” 一節中檢查唯一性約束時所做的事情。

在這個例子中，唯一性檢查的正確性不取決於訊息傳送者是否等待結果。等待的目的僅僅是同步通知傳送者唯一性檢查是否成功。但該通知可以與訊息處理的結果相解耦。

更一般地來講，我認為 **一致性（consistency）** 這個術語混淆了兩個值得分別考慮的需求：

* 及時性（Timeliness）

  及時性意味著確保使用者觀察到系統的最新狀態。我們之前看到，如果使用者從陳舊的資料副本中讀取資料，他們可能會觀察到系統處於不一致的狀態（請參閱 “[複製延遲問題](/tw/ch6#sec_replication_lag)”）。但這種不一致是暫時的，而最終會透過等待與重試簡單地得到解決。

  CAP 定理（請參閱 “[線性一致性的代價](/tw/ch10#sec_linearizability_cost)”）使用 **線性一致性（linearizability）** 意義上的一致性，這是實現及時性的強有力方法。像 **寫後讀** 這樣及時性更弱的一致性也很有用（請參閱 “[讀己之寫](/tw/ch6#sec_replication_ryw)”）。

* 完整性（Integrity）

  完整性意味著沒有損壞；即沒有資料丟失，並且沒有矛盾或錯誤的資料。尤其是如果某些派生資料集是作為底層資料之上的檢視而維護的（請參閱 “[從事件日誌中派生出當前狀態](/tw/ch12#sec_stream_deriving_views)”），這種派生必須是正確的。例如，資料庫索引必須正確地反映資料庫的內容 —— 缺失某些記錄的索引並不是很有用。

  如果完整性被違背，這種不一致是永久的：在大多數情況下，等待與重試並不能修復資料庫損壞。相反的是，需要顯式地檢查與修復。在 ACID 事務的上下文中（請參閱 “[ACID 的含義](/tw/ch8#sec_transactions_acid)”），一致性通常被理解為某種特定於應用的完整性概念。原子性和永續性是保持完整性的重要工具。


口號形式：違反及時性，“最終一致性”；違反完整性，“永無一致性”。

我斷言在大多數應用中，完整性比及時性重要得多。違反及時性可能令人困惑與討厭，但違反完整性的結果可能是災難性的。

例如在你的信用卡對賬單上，如果某一筆過去 24 小時內完成的交易尚未出現並不令人奇怪 —— 這些系統有一定的滯後是正常的。我們知道銀行是非同步核算與敲定交易的，這裡的及時性並不是非常重要[^3]。但如果當期對賬單餘額與上期對賬單餘額加交易總額對不上（求和錯誤），或者出現一筆向你收費但未向商家付款的交易（消失的錢），那就實在是太糟糕了，這樣的問題就違背了系統的完整性。

#### 資料流系統的正確性 {#id453}

ACID 事務通常既提供及時性（例如線性一致性）也提供完整性保證（例如原子提交）。因此如果你從 ACID 事務的角度來看待應用的正確性，那麼及時性與完整性的區別是無關緊要的。

另一方面，對於在本章中討論的基於事件的資料流系統而言，它們的一個有趣特性就是將及時性與完整性分開。在非同步處理事件流時不能保證及時性，除非你顯式構建一個在返回之前明確等待特定訊息到達的消費者。但完整性實際上才是流處理系統的核心。

**恰好一次** 或 **等效一次** 語義（請參閱 “[容錯](/tw/ch12#sec_stream_fault_tolerance)”）是一種保持完整性的機制。如果事件丟失或者生效兩次，就有可能違背資料系統的完整性。因此在出現故障時，容錯訊息傳遞與重複抑制（例如，冪等操作）對於維護資料系統的完整性是很重要的。

正如我們在上一節看到的那樣，可靠的流處理系統可以在無需分散式事務與原子提交協議的情況下保持完整性，這意味著它們有潛力達到與後者相當的正確性，同時還具備好得多的效能與運維穩健性。為了達成這種正確性，我們組合使用了多種機制：

* 將寫入操作的內容表示為單條訊息，從而可以輕鬆地被原子寫入 —— 與事件溯源搭配效果拔群（請參閱 “[事件溯源](/tw/ch12#sec_stream_event_sourcing)”）。
* 使用與儲存過程類似的確定性派生函式，從這一訊息中派生出所有其他的狀態變更（請參閱 “[真的序列執行](/tw/ch8#sec_transactions_serial)” 和 “[應用程式碼作為派生函式](#sec_future_dataflow_derivation)”）
* 將客戶端生成的請求 ID 傳遞透過所有的處理層次，從而允許端到端的除重，帶來冪等性。
* 使訊息不可變，並允許派生資料能隨時被重新處理，這使從錯誤中恢復更加容易（請參閱 “[不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)”）

這種機制組合在我看來，是未來構建容錯應用的一個非常有前景的方向。

#### 寬鬆地解釋約束 {#id362}

如前所述，執行唯一性約束需要共識，通常透過在單個節點中彙集特定分割槽中的所有事件來實現。如果我們想要傳統的唯一性約束形式，這種限制是不可避免的，流處理也不例外。

然而另一個需要了解的事實是，許多真實世界的應用實際上可以擺脫這種形式，接受弱得多的唯一性：

* 如果兩個人同時註冊了相同的使用者名稱或預訂了相同的座位，你可以給其中一個人發訊息道歉，並要求他們換一個不同的使用者名稱或座位。這種糾正錯誤的變化被稱為 **補償性事務（compensating transaction）**[^59] [^60]。
* 如果客戶訂購的物品多於倉庫中的物品，你可以下單補倉，併為延誤向客戶道歉，向他們提供折扣。實際上，這麼說吧，如果叉車在倉庫中軋過了你的貨物，剩下的貨物比你想象的要少，那麼你也是得這麼做[^61]。因此，既然道歉工作流無論如何已經成為你商業過程中的一部分了，那麼對庫存物品數目新增線性一致的約束可能就沒必要了。
* 與之類似，許多航空公司都會超賣機票，打著一些旅客可能會錯過航班的算盤；許多旅館也會超賣客房，抱著部分客人可能會取消預訂的期望。在這些情況下，出於商業原因而故意違反了 “一人一座” 的約束；當需求超過供給的情況出現時，就會進入補償流程（退款、升級艙位 / 房型、提供隔壁酒店的免費的房間）。即使沒有超賣，為了應對由惡劣天氣或員工罷工導致的航班取消，你還是需要道歉與補償流程 —— 從這些問題中恢復僅僅是商業活動的正常組成部分。
* 如果有人從賬戶超額取款，銀行可以向他們收取透支費用，並要求他們償還欠款。透過限制每天的提款總額，銀行的風險是有限的。

在許多商業場景中，臨時違背約束並稍後透過道歉來修復，實際上是可以接受的。道歉的成本各不相同，但通常很低（以金錢或名聲來算）：你無法撤回已傳送的電子郵件，但可以傳送一封后續電子郵件進行更正。如果你不小心向信用卡收取了兩次費用，則可以將其中一項收費退款，而代價僅僅是手續費，也許還有客戶的投訴。儘管一旦 ATM 吐了錢，你無法直接取回，但原則上如果賬戶透支而客戶拒不支付，你可以派催收員收回欠款。

道歉的成本是否能接受是一個商業決策。如果可以接受的話，在寫入資料之前檢查所有約束的傳統模型反而會帶來不必要的限制，而線性一致性的約束也不是必須的。樂觀寫入，事後檢查可能是一種合理的選擇。你仍然可以在做一些挽回成本高昂的事情前確保有相關的驗證，但這並不意味著寫入資料之前必須先進行驗證。

這些應用 **確實** 需要完整性：你不會希望丟失預訂資訊，或者由於借方貸方不匹配導致資金消失。但是它們在執行約束時 **並不需要** 及時性：如果你銷售的貨物多於倉庫中的庫存，可以在事後道歉並彌補問題。這種做法與我們在 “[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)” 中討論的衝突解決方法類似。

#### 無協調資料系統 {#id454}

我們現在已經做了兩個有趣的觀察：

1. 資料流系統可以維持派生資料的完整性保證，而無需原子提交、線性一致性或者同步的跨分割槽協調。
2. 雖然嚴格的唯一性約束要求及時性和協調，但許多應用實際上可以接受寬鬆的約束：只要整個過程保持完整性，這些約束可能會被臨時違反並在稍後被修復。

總之這些觀察意味著，資料流系統可以為許多應用提供無需協調的資料管理服務，且仍能給出很強的完整性保證。這種 **無協調（coordination-avoiding）** 的資料系統有著很大的吸引力：比起需要執行同步協調的系統，它們能達到更好的效能與更強的容錯能力[^56]。

例如，這種系統可以使用多領導者配置運維，跨越多個數據中心，在區域間非同步複製。任何一個數據中心都可以持續獨立執行，因為不需要同步的跨區域協調。這樣的系統的及時性保證會很弱 —— 如果不引入協調，它就不可能是線性一致的 —— 但它仍然可以提供有力的完整性保證。

在這種情況下，可序列化事務作為維護派生狀態的一部分仍然是有用的，但它們只能在小範圍內執行，在那裡它們工作得很好[^8]。異構分散式事務（如 XA 事務，請參閱 “[實踐中的分散式事務](/tw/ch8#sec_transactions_xa)”）不是必需的。同步協調仍然可以在需要的地方引入（例如在無法恢復的操作之前強制執行嚴格的約束），但是如果只是應用的一小部分地方需要它，沒必要讓所有操作都付出協調的代價[^43]。

另一種審視協調與約束的角度是：它們減少了由於不一致而必須做出的道歉數量，但也可能會降低系統的效能和可用性，從而可能增加由於宕機中斷而需要做出的道歉數量。你不可能將道歉數量減少到零，但可以根據自己的需求尋找最佳平衡點 —— 既不存在太多不一致性，又不存在太多可用性問題。

### 信任但驗證 {#sec_future_verification}

我們所有關於正確性，完整性和容錯的討論都基於一些假設，假設某些事情可能會出錯，但其他事情不會。我們將這些假設稱為我們的 **系統模型**（system model，請參閱 “[將系統模型對映到現實世界](/tw/ch9#sec_distributed_system_model)”）：例如，我們應該假設程序可能會崩潰，機器可能突然斷電，網路可能會任意延遲或丟棄訊息。但是我們也可能假設寫入磁碟的資料在執行 `fsync` 後不會丟失，記憶體中的資料沒有損壞，而 CPU 的乘法指令總是能返回正確的結果。

這些假設是相當合理的，因為大多數時候它們都是成立的，如果我們不得不經常擔心計算機出錯，那麼基本上寸步難行。在傳統上，系統模型採用二元方法處理故障：我們假設有些事情可能會發生，而其他事情 **永遠** 不會發生。實際上，這更像是一個機率問題：有些事情更有可能，其他事情不太可能。問題在於違反我們假設的情況是否經常發生，以至於我們可能在實踐中遇到它們。

我們已經看到，資料可能會在記憶體中、磁碟上、以及網路傳輸過程中出現損壞。也許這件事值得我們投入更多關注：當系統規模足夠大時，哪怕機率再低的問題也會在現實中發生。

#### 維護完整性，儘管軟體有 Bug {#id455}

除了這些硬體問題之外，總是存在軟體 Bug 的風險，這些錯誤不會被較低層次的網路、記憶體或檔案系統校驗和所捕獲。即使廣泛使用的資料庫軟體也有 Bug：即使像 MySQL 與 PostgreSQL 這樣穩健、口碑良好、多年來被許多人充分測試過的軟體，就我個人所見也有 Bug，比如 MySQL 未能正確維護唯一約束[^65]，以及 PostgreSQL 的可序列化隔離等級存在特定的寫入偏差異常[^66]。對於不那麼成熟的軟體來說，情況可能要糟糕得多。

儘管在仔細設計，測試，以及審查上做出很多努力，但 Bug 仍然會在不知不覺中產生。儘管它們很少，而且最終會被發現並被修復，但總會有那麼一段時間，這些 Bug 可能會損壞資料。

而對於應用程式碼，我們不得不假設會有更多的錯誤，因為絕大多數應用的程式碼經受的評審與測試遠遠無法與資料庫的程式碼相比。許多應用甚至沒有正確使用資料庫提供的用於維持完整性的功能，例如外部索引鍵或唯一性約束[^36]。

ACID 意義下的一致性（請參閱 “[一致性](/tw/ch8#sec_transactions_acid_consistency)”）基於這樣一種想法：資料庫以一致的狀態啟動，而事務將其從一個一致狀態轉換至另一個一致的狀態。因此，我們期望資料庫始終處於一致狀態。然而，只有當你假設事務沒有 Bug 時，這種想法才有意義。如果應用以某種錯誤的方式使用資料庫，例如，不安全地使用弱隔離等級，資料庫的完整性就無法得到保證。

#### 不要盲目信任承諾 {#id364}

由於硬體和軟體並不總是符合我們的理想，所以資料損壞似乎早晚不可避免。因此，我們至少應該有辦法查明資料是否已經損壞，以便我們能夠修復它，並嘗試追查錯誤的來源。檢查資料完整性稱為 **審計（auditing）**。

如 “[不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)” 一節中所述，審計不僅僅適用於財務應用程式。不過，可審計性在財務中是非常非常重要的，因為每個人都知道錯誤總會發生，我們也都認為能夠檢測和解決問題是合理的需求。

成熟的系統同樣傾向於考慮不太可能的事情出錯的可能性，並管理這種風險。例如，HDFS 和 Amazon S3 等大規模儲存系統並不完全信任磁碟：它們執行後臺程序持續回讀檔案，並將其與其他副本進行比較，並將檔案從一個磁碟移動到另一個，以便降低靜默損壞的風險[^67]。

如果你想確保你的資料仍然存在，你必須真正讀取它並進行檢查。大多數時候它們仍然會在那裡，但如果不是這樣，你一定想盡早知道答案，而不是更晚。按照同樣的原則，不時地嘗試從備份中恢復是非常重要的 —— 否則當你發現備份損壞時，你可能已經遇到了資料丟失，那時候就真的太晚了。不要盲目地相信它們全都管用。

#### 為可審計性而設計 {#id365}

如果一個事務在一個數據庫中改變了多個物件，在這一事實發生後，很難說清這個事務到底意味著什麼。即使你捕獲了事務日誌（請參閱 “[變更資料捕獲](/tw/ch12#sec_stream_cdc)”），各種表中的插入、更新和刪除操作並不一定能清楚地表明 **為什麼** 要執行這些變更。決定這些變更的是應用邏輯中的呼叫，而這一應用邏輯稍縱即逝，無法重現。

相比之下，基於事件的系統可以提供更好的可審計性。在事件溯源方法中，系統的使用者輸入被表示為一個單一不可變事件，而任何其導致的狀態變更都派生自該事件。派生可以實現為具有確定性與可重複性，因而相同的事件日誌透過相同版本的派生程式碼時，會導致相同的狀態變更。

顯式處理資料流（請參閱 “[批處理輸出的哲學](/tw/ch11#sec_batch_output)”）可以使資料的 **來龍去脈（provenance）** 更加清晰，從而使完整性檢查更具可行性。對於事件日誌，我們可以使用雜湊來檢查事件儲存沒有被破壞。對於任何派生狀態，我們可以重新執行從事件日誌中派生它的批處理器與流處理器，以檢查是否獲得相同的結果，或者，甚至並行執行冗餘的派生流程。

具有確定性且定義良好的資料流，也使除錯與跟蹤系統的執行變得容易，以便確定它 **為什麼** 做了某些事情[^4] [^69]。如果出現意想之外的事情，那麼重現導致意外事件的確切事故現場的診斷能力 —— 一種時間旅行除錯功能 —— 是非常有價值的。

#### 端到端原則重現 {#id456}

如果我們不能完全相信系統的每個元件都不會損壞 —— 每一個硬體都沒缺陷，每一個軟體都沒有 Bug —— 那我們至少必須定期檢查資料的完整性。如果我們不檢查，我們就不能發現損壞，直到無可挽回地導致對下游的破壞時，那時候再去追蹤問題就要難得多，且代價也要高得多。

檢查資料系統的完整性，最好是以端到端的方式進行（請參閱 “[資料庫的端到端原則](#sec_future_end_to_end)”）：我們能在完整性檢查中涵蓋的系統越多，某些處理階段中出現不被察覺損壞的機率就越小。如果我們能檢查整個派生資料管道端到端的正確性，那麼沿著這一路徑的任何磁碟、網路、服務以及演算法的正確性檢查都隱含在其中了。

持續的端到端完整性檢查可以不斷提高你對系統正確性的信心，從而使你能更快地進步[^70]。與自動化測試一樣，審計提高了快速發現錯誤的可能性，從而降低了系統變更或新儲存技術可能導致損失的風險。如果你不害怕進行變更，就可以更好地充分演化一個應用，使其滿足不斷變化的需求。

#### 用於可審計資料系統的工具 {#id366}

目前，把可審計性作為一級目標的資料系統還不多。一些應用會實現自己的審計機制（例如把變更寫入獨立審計表），但要同時保證審計日誌與主資料庫狀態都不可篡改仍然很難。

像 Bitcoin、Ethereum 這樣的區塊鏈，本質上是帶密碼學一致性校驗的共享僅追加日誌；交易可視作事件，智慧合約可視作流處理器。它們透過共識協議讓所有節點同意同一事件序列。與本書 [第十章](/tw/ch10) 的共識協議相比，區塊鏈的一個差異是強調拜占庭容錯：參與節點會持續相互校驗完整性[^71] [^72] [^73]。

對多數應用而言，區塊鏈整體開銷仍偏高；但其中一些密碼學工具可在更輕量的場景複用。比如 **默克爾樹（Merkle tree）**[^74]可高效證明某條記錄屬於某資料集。**證書透明性（certificate transparency）** 使用可驗證的僅追加日誌與默克爾樹來校驗 TLS/SSL 證書有效性[^75] [^76]。

未來，這類完整性校驗與審計演算法可能會在通用資料系統中更廣泛應用。要把它們做到與無密碼學審計系統同等級別的可伸縮性，同時把效能開銷壓到足夠低，仍需要工程改進，但方向值得重視。


## 本章小結 {#id367}

在本章中，我們討論了設計資料系統的新方式，而且也包括了我的個人觀點，以及對未來的猜測。我們從這樣一種觀察開始：沒有單種工具能高效服務所有可能的用例，因此應用必須組合使用幾種不同的軟體才能實現其目標。我們討論了如何使用批處理與事件流來解決這一 **資料整合（data integration）** 問題，以便讓資料變更在不同系統之間流動。

在這種方法中，某些系統被指定為記錄系統，而其他資料則透過轉換派生自記錄系統。透過這種方式，我們可以維護索引、物化檢視、機器學習模型、統計摘要等等。透過使這些派生和轉換操作非同步且鬆散耦合，能夠防止一個區域中的問題擴散到系統中不相關部分，從而增加整個系統的穩健性與容錯性。

將資料流表示為從一個數據集到另一個數據集的轉換也有助於演化應用程式：如果你想變更其中一個處理步驟，例如變更索引或快取的結構，則可以在整個輸入資料集上重新執行新的轉換程式碼，以便重新派生輸出。同樣，出現問題時，你也可以修復程式碼並重新處理資料以便恢復。

這些過程與資料庫內部已經完成的過程非常類似，因此我們將資料流應用的概念重新改寫為，**分拆（unbundling）** 資料庫元件，並透過組合這些鬆散耦合的元件來構建應用程式。

派生狀態可以透過觀察底層資料的變更來更新。此外，派生狀態本身可以進一步被下游消費者觀察。我們甚至可以將這種資料流一路傳送至顯示資料的終端使用者裝置，從而構建可動態更新以反映資料變更，並在離線時能繼續工作的使用者介面。

接下來，我們討論了如何確保所有這些處理在出現故障時保持正確。我們看到可伸縮的強完整性保證可以透過非同步事件處理來實現，透過使用端到端操作識別符號使操作冪等，以及透過非同步檢查約束。客戶端可以等到檢查透過，或者不等待繼續前進，但是可能會冒有違反約束需要道歉的風險。這種方法比使用分散式事務的傳統方法更具可伸縮性與可靠性，並且在實踐中適用於很多業務流程。

透過圍繞資料流構建應用，並非同步檢查約束，我們可以避免絕大多數協調，構建在地理分佈和故障場景下依然保持完整性且效能良好的系統。隨後我們還討論了如何透過審計驗證完整性、發現損壞，並指出區塊鏈/分散式賬本所使用的一些機制與事件驅動系統在思想上也存在共通之處。


##### Footnotes

### References {#references}

[^1]: Rachid Belaid: “[Postgres Full-Text Search is Good Enough!](http://rachbelaid.com/postgres-full-text-search-is-good-enough/),” *rachbelaid.com*, July 13, 2015.
[^2]: Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^3]: Pat Helland and Dave Campbell: “[Building on Quicksand](https://web.archive.org/web/20220606172817/https://database.cs.wisc.edu/cidr/cidr2009/Paper_133.pdf),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^4]: Jessica Kerr: “[Provenance and Causality in Distributed Systems](https://web.archive.org/web/20190425150540/http://blog.jessitron.com/2016/09/provenance-and-causality-in-distributed.html),” *blog.jessitron.com*, September 25, 2016.
[^5]: Kostas Tzoumas: “[Batch Is a Special Case of Streaming](http://data-artisans.com/blog/batch-is-a-special-case-of-streaming/),” *data-artisans.com*, September 15, 2015.
[^6]: Shinji Kim and Robert Blafford: “[Stream Windowing Performance Analysis: Concord and Spark Streaming](https://web.archive.org/web/20180125074821/http://concord.io/posts/windowing_performance_analysis_w_spark_streaming),” *concord.io*, July 6, 2016.
[^7]: Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
[^8]: Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20200730171311/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
[^9]: “[Great Western Railway (1835–1948)](https://web.archive.org/web/20160122155425/https://www.networkrail.co.uk/VirtualArchive/great-western/),” Network Rail Virtual Archive, *networkrail.co.uk*.
[^10]: Jacqueline Xu: “[Online Migrations at Scale](https://stripe.com/blog/online-migrations),” *stripe.com*, February 2, 2017.
[^11]: Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
[^12]: Nathan Marz and James Warren: [*Big Data: Principles and Best Practices of Scalable Real-Time Data Systems*](https://www.manning.com/books/big-data). Manning, 2015. ISBN: 978-1-617-29034-3
[^13]: Oscar Boykin, Sam Ritchie, Ian O'Connell, and Jimmy Lin: “[Summingbird: A Framework for Integrating Batch and Online MapReduce Computations](http://www.vldb.org/pvldb/vol7/p1441-boykin.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
[^14]: Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
[^15]: Raul Castro Fernandez, Peter Pietzuch, Jay Kreps, et al.: “[Liquid: Unifying Nearline and Offline Big Data Integration](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper25u.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^16]: Dennis M. Ritchie and Ken Thompson: “[The UNIX Time-Sharing System](http://web.eecs.utk.edu/~qcao1/cs560/papers/paper-unix.pdf),” *Communications of the ACM*, volume 17, number 7, pages 365–375, July 1974. [doi:10.1145/361011.361061](http://dx.doi.org/10.1145/361011.361061)
[^17]: Eric A. Brewer and Joseph M. Hellerstein: “[CS262a: Advanced Topics in Computer Systems](http://people.eecs.berkeley.edu/~brewer/cs262/systemr.html),” lecture notes, University of California, Berkeley, *cs.berkeley.edu*, August 2011.
[^18]: Michael Stonebraker: “[The Case for Polystores](http://wp.sigmod.org/?p=1629),” *wp.sigmod.org*, July 13, 2015.
[^19]: Jennie Duggan, Aaron J. Elmore, Michael Stonebraker, et al.: “[The BigDAWG Polystore System](https://dspace.mit.edu/handle/1721.1/100936),” *ACM SIGMOD Record*, volume 44, number 2, pages 11–16, June 2015. [doi:10.1145/2814710.2814713](http://dx.doi.org/10.1145/2814710.2814713)
[^20]: Patrycja Dybka: “[Foreign Data Wrappers for PostgreSQL](https://web.archive.org/web/20221003115732/https://www.vertabelo.com/blog/foreign-data-wrappers-for-postgresql/),” *vertabelo.com*, March 24, 2015.
[^21]: David B. Lomet, Alan Fekete, Gerhard Weikum, and Mike Zwilling: “[Unbundling Transaction Services in the Cloud](https://www.microsoft.com/en-us/research/publication/unbundling-transaction-services-in-the-cloud/),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^22]: Martin Kleppmann and Jay Kreps: “[Kafka, Samza and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/papers/kafka-debull15.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 4, pages 4–14, December 2015.
[^23]: John Hugg: “[Winning Now and in the Future: Where VoltDB Shines](https://voltdb.com/blog/winning-now-and-future-where-voltdb-shines),” *voltdb.com*, March 23, 2016.
[^24]: Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard: “[Differential Dataflow](http://cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf),” at *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
[^25]: Derek G Murray, Frank McSherry, Rebecca Isaacs, et al.: “[Naiad: A Timely Dataflow System](http://sigops.org/s/conferences/sosp/2013/papers/p439-murray.pdf),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), pages 439–455, November 2013. [doi:10.1145/2517349.2522738](http://dx.doi.org/10.1145/2517349.2522738)
[^26]: Gwen Shapira: “[We have a bunch of customers who are implementing ‘database inside-out’ concept and they all ask ‘is anyone else doing it? are we crazy?’](https://twitter.com/gwenshap/status/758800071110430720)” *twitter.com*, July 28, 2016.
[^27]: Martin Kleppmann: “[Turning the Database Inside-out with Apache Samza,](http://martin.kleppmann.com/2015/03/04/turning-the-database-inside-out.html)” at *Strange Loop*, September 2014.
[^28]: Peter Van Roy and Seif Haridi: [*Concepts, Techniques, and Models of Computer Programming*](https://www.info.ucl.ac.be/~pvr/book.html). MIT Press, 2004. ISBN: 978-0-262-22069-9
[^29]: “[Juttle Documentation](http://juttle.github.io/juttle/),” *juttle.github.io*, 2016.
[^30]: Evan Czaplicki and Stephen Chong: “[Asynchronous Functional Reactive Programming for GUIs](http://people.seas.harvard.edu/~chong/pubs/pldi13-elm.pdf),” at *34th ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2013. [doi:10.1145/2491956.2462161](http://dx.doi.org/10.1145/2491956.2462161)
[^31]: Engineer Bainomugisha, Andoni Lombide Carreton, Tom van Cutsem, Stijn Mostinckx, and Wolfgang de Meuter: “[A Survey on Reactive Programming](http://soft.vub.ac.be/Publications/2012/vub-soft-tr-12-13.pdf),” *ACM Computing Surveys*, volume 45, number 4, pages 1–34, August 2013. [doi:10.1145/2501654.2501666](http://dx.doi.org/10.1145/2501654.2501666)
[^32]: Peter Alvaro, Neil Conway, Joseph M. Hellerstein, and William R. Marczak: “[Consistency Analysis in Bloom: A CALM and Collected Approach](https://dsf.berkeley.edu/cs286/papers/calm-cidr2011.pdf),” at *5th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2011.
[^33]: Felienne Hermans: “[Spreadsheets Are Code](https://vimeo.com/145492419),” at *Code Mesh*, November 2015.
[^34]: Dan Bricklin and Bob Frankston: “[VisiCalc: Information from Its Creators](http://danbricklin.com/visicalc.htm),” *danbricklin.com*.
[^35]: D. Sculley, Gary Holt, Daniel Golovin, et al.: “[Machine Learning: The High-Interest Credit Card of Technical Debt](http://research.google.com/pubs/pub43146.html),” at *NIPS Workshop on Software Engineering for Machine Learning* (SE4ML), December 2014.
[^36]: Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](http://dx.doi.org/10.1145/2723372.2737784)
[^37]: Guy Steele: “[Re: Need for Macros (Was Re: Icon)](https://people.csail.mit.edu/gregs/ll1-discuss-archive-html/msg01134.html),” email to *ll1-discuss* mailing list, *people.csail.mit.edu*, December 24, 2001.
[^38]: David Gelernter: “[Generative Communication in Linda](http://cseweb.ucsd.edu/groups/csag/html/teaching/cse291s03/Readings/p80-gelernter.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 7, number 1, pages 80–112, January 1985. [doi:10.1145/2363.2433](http://dx.doi.org/10.1145/2363.2433)
[^39]: Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
[^40]: Ben Stopford: “[Microservices in a Streaming World](https://www.infoq.com/presentations/microservices-streaming),” at *QCon London*, March 2016.
[^41]: Christian Posta: “[Why Microservices Should Be Event Driven: Autonomy vs Authority](http://blog.christianposta.com/microservices/why-microservices-should-be-event-driven-autonomy-vs-authority/),” *blog.christianposta.com*, May 27, 2016.
[^42]: Alex Feyerke: “[Say Hello to Offline First](https://web.archive.org/web/20210420014747/http://hood.ie/blog/say-hello-to-offline-first.html),” *hood.ie*, November 5, 2013.
[^43]: Sebastian Burckhardt, Daan Leijen, Jonathan Protzenko, and Manuel Fähndrich: “[Global Sequence Protocol: A Robust Abstraction for Replicated Shared State](http://drops.dagstuhl.de/opus/volltexte/2015/5238/),” at *29th European Conference on Object-Oriented Programming* (ECOOP), July 2015. [doi:10.4230/LIPIcs.ECOOP.2015.568](http://dx.doi.org/10.4230/LIPIcs.ECOOP.2015.568)
[^44]: Mark Soper: “[Clearing Up React Data Management Confusion with Flux, Redux, and Relay](https://medium.com/@marksoper/clearing-up-react-data-management-confusion-with-flux-redux-and-relay-aad504e63cae),” *medium.com*, December 3, 2015.
[^45]: Eno Thereska, Damian Guy, Michael Noll, and Neha Narkhede: “[Unifying Stream Processing and Interactive Queries in Apache Kafka](http://www.confluent.io/blog/unifying-stream-processing-and-interactive-queries-in-apache-kafka/),” *confluent.io*, October 26, 2016.
[^46]: Frank McSherry: “[Dataflow as Database](https://github.com/frankmcsherry/blog/blob/master/posts/2016-07-17.md),” *github.com*, July 17, 2016.
[^47]: Peter Alvaro: “[I See What You Mean](https://www.youtube.com/watch?v=R2Aa4PivG0g),” at *Strange Loop*, September 2015.
[^48]: Nathan Marz: “[Trident: A High-Level Abstraction for Realtime Computation](https://blog.twitter.com/2012/trident-a-high-level-abstraction-for-realtime-computation),” *blog.twitter.com*, August 2, 2012.
[^49]: Edi Bice: “[Low Latency Web Scale Fraud Prevention with Apache Samza, Kafka and Friends](http://www.slideshare.net/edibice/extremely-low-latency-web-scale-fraud-prevention-with-apache-samza-kafka-and-friends),” at *Merchant Risk Council MRC Vegas Conference*, March 2016.
[^50]: Charity Majors: “[The Accidental DBA](https://charity.wtf/2016/10/02/the-accidental-dba/),” *charity.wtf*, October 2, 2016.
[^51]: Arthur J. Bernstein, Philip M. Lewis, and Shiyong Lu: “[Semantic Conditions for Correctness at Different Isolation Levels](http://db.cs.berkeley.edu/cs286/papers/isolation-icde2000.pdf),” at *16th International Conference on Data Engineering* (ICDE), February 2000. [doi:10.1109/ICDE.2000.839387](http://dx.doi.org/10.1109/ICDE.2000.839387)
[^52]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^53]: Kyle Kingsbury: [Jepsen blog post series](https://aphyr.com/tags/jepsen), *aphyr.com*, 2013–2016.
[^54]: Michael Jouravlev: “[Redirect After Post](http://www.theserverside.com/news/1365146/Redirect-After-Post),” *theserverside.com*, August 1, 2004.
[^55]: Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-to-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
[^56]: Peter Bailis, Alan Fekete, Michael J. Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
[^57]: Alex Yarmula: “[Strong Consistency in Manhattan](https://blog.twitter.com/2016/strong-consistency-in-manhattan),” *blog.twitter.com*, March 17, 2016.
[^58]: Douglas B Terry, Marvin M Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](http://css.csail.mit.edu/6.824/2014/papers/bayou-conflicts.pdf),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), pages 172–182, December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
[^59]: Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
[^60]: Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
[^61]: Pat Helland: “[Memories, Guesses, and Apologies](https://web.archive.org/web/20160304020907/http://blogs.msdn.com/b/pathelland/archive/2007/05/15/memories-guesses-and-apologies.aspx),” *blogs.msdn.com*, May 15, 2007.
[^62]: Yoongu Kim, Ross Daly, Jeremie Kim, et al.: “[Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf),” at *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.1145/2678373.2665726](http://dx.doi.org/10.1145/2678373.2665726)
[^63]: Mark Seaborn and Thomas Dullien: “[Exploiting the DRAM Rowhammer Bug to Gain Kernel Privileges](https://googleprojectzero.blogspot.co.uk/2015/03/exploiting-dram-rowhammer-bug-to-gain.html),” *googleprojectzero.blogspot.co.uk*, March 9, 2015.
[^64]: Jim N. Gray and Catharine van Ingen: “[Empirical Measurements of Disk Failure Rates and Error Rates](https://www.microsoft.com/en-us/research/publication/empirical-measurements-of-disk-failure-rates-and-error-rates/),” Microsoft Research, MSR-TR-2005-166, December 2005.
[^65]: Annamalai Gurusami and Daniel Price: “[Bug #73170: Duplicates in Unique Secondary Index Because of Fix of Bug#68021](http://bugs.mysql.com/bug.php?id=73170),” *bugs.mysql.com*, July 2014.
[^66]: Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
[^67]: Xiao Chen: “[HDFS DataNode Scanners and Disk Checker Explained](http://blog.cloudera.com/blog/2016/12/hdfs-datanode-scanners-and-disk-checker-explained/),” *blog.cloudera.com*, December 20, 2016.
[^68]: Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
[^69]: Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
[^70]: Sam Stokes: “[Move Fast with Confidence](http://blog.samstokes.co.uk/blog/2016/07/11/move-fast-with-confidence/),” *blog.samstokes.co.uk*, July 11, 2016.
[^71]: “[Hyperledger Sawtooth documentation](https://web.archive.org/web/20220120211548/https://sawtooth.hyperledger.org/docs/core/releases/latest/introduction.html),” Intel Corporation, *sawtooth.hyperledger.org*, 2017.
[^72]: Richard Gendal Brown: “[Introducing R3 Corda™: A Distributed Ledger Designed for Financial Services](https://gendal.me/2016/04/05/introducing-r3-corda-a-distributed-ledger-designed-for-financial-services/),” *gendal.me*, April 5, 2016.
[^73]: Trent McConaghy, Rodolphe Marques, Andreas Müller, et al.: “[BigchainDB: A Scalable Blockchain Database](https://www.bigchaindb.com/whitepaper/bigchaindb-whitepaper.pdf),” *bigchaindb.com*, June 8, 2016.
[^74]: Ralph C. Merkle: “[A Digital Signature Based on a Conventional Encryption Function](https://people.eecs.berkeley.edu/~raluca/cs261-f15/readings/merkle.pdf),” at *CRYPTO '87*, August 1987. [doi:10.1007/3-540-48184-2_32](http://dx.doi.org/10.1007/3-540-48184-2_32)
[^75]: Ben Laurie: “[Certificate Transparency](http://queue.acm.org/detail.cfm?id=2668154),” *ACM Queue*, volume 12, number 8, pages 10-19, August 2014. [doi:10.1145/2668152.2668154](http://dx.doi.org/10.1145/2668152.2668154)
[^76]: Mark D. Ryan: “[Enhanced Certificate Transparency and End-to-End Encrypted Mail](https://www.ndss-symposium.org/wp-content/uploads/2017/09/12_2_1.pdf),” at *Network and Distributed System Security Symposium* (NDSS), February 2014. [doi:10.14722/ndss.2014.23379](http://dx.doi.org/10.14722/ndss.2014.23379)

================================================
FILE: content/tw/ch14.md
================================================
---
title: "14. 將事情做正確"
weight: 314
breadcrumbs: false
---

<a id="ch_right_thing"></a>

![](/map/ch12.png)

> *將世界的美好、醜陋與殘酷一起餵給 AI，卻期待它只反映美好的一面，這是一種幻想。*
>
> Vinay Uday Prabhu 與 Abeba Birhane，《Large Datasets: A Pyrrhic Win for Computer Vision?》（2020）

在本書最後一章，讓我們退一步看問題。整本書裡，我們考察了各種資料系統架構，評估了它們的利弊，也探討了如何構建可靠、可伸縮、可維護的應用。然而，我們一直略去了討論中一個重要而基礎的部分，現在該補上了。

每個系統都是為了某種目的而建；我們做的每個動作，都有預期後果，也有非預期後果。目的可能只是賺錢，但對世界產生的影響可能遠遠超出這個初始目的。構建這些系統的工程師，有責任認真思考這些後果，並且有意識地決定我們希望生活在怎樣的世界中。

我們常把資料當成抽象事物來談論，但請記住，許多資料集都是關於人的：他們的行為、興趣、身份。我們必須以人性與尊重來對待這樣的資料。使用者也是人，而人的尊嚴至高無上 [^1]。

軟體開發越來越涉及重要的倫理抉擇。確實有一些指南幫助軟體工程師應對這些問題，比如 ACM《倫理與職業行為準則》 [^2]，但在實踐中，它們很少被討論、應用與執行。因此，工程師和產品經理有時會對隱私以及產品可能帶來的負面後果抱持一種輕率態度 [^3], [^4]。

技術本身並無善惡，關鍵在於它如何被使用，以及它如何影響人。這一點對搜尋引擎這樣的軟體系統成立，對槍支這樣的武器同樣成立。軟體工程師若只專注技術本身而忽視其後果，是不夠的：倫理責任同樣由我們承擔。倫理推理很難，但它又重要到不能迴避。

不過，什麼算“好”或“壞”並沒有清晰定義，而計算領域的大多數人甚至不討論這個問題 [^5]。與計算領域中的很多概念不同，倫理的核心概念並沒有嚴格且確定的單一含義，它們需要解釋，而解釋可能具有主觀性 [^6]。倫理並不是走一遍檢查清單、確認你“合規”就完事；它是一種參與式、迭代式的反思過程，要與相關人群對話，並對結果負責 [^7]。

## 預測分析 {#id369}

例如，預測分析是人們對大資料和 AI 感到興奮的重要原因之一。用資料分析來預測天氣或疾病傳播是一回事 [^8]；預測一個罪犯是否可能再犯、一個貸款申請者是否可能違約，或一個保險客戶是否可能提出高額理賠，又是另一回事 [^9]。後者會直接影響個人的生活。

支付網路當然想防止欺詐交易，銀行想避免壞賬，航空公司想避免劫機，公司想避免僱到低效或不可信的人。從它們的角度看，錯過一筆業務機會的成本較低，而壞賬或問題員工的成本更高，因此機構傾向於謹慎行事完全可以理解。拿不準時，說“不”更穩妥。

然而，隨著演算法決策越來越普遍，一個被某個演算法標記為“高風險”的人（不管標記準確與否），可能會不斷遭遇這種“不”。如果一個人系統性地被排除在工作、航空出行、保險保障、房屋租賃、金融服務以及社會其他關鍵領域之外，這對個體自由構成的約束之大，以至於有人稱之為“演算法監獄” [^10]。在尊重人權的國家，刑事司法講究“未經證明有罪即推定無罪”；但自動化系統卻可能在沒有罪證、幾乎無申訴機會的情況下，系統性且任意地把一個人排除在社會參與之外。

### 偏見與歧視 {#id370}

演算法作出的決策並不必然比人更好，也不必然更差。每個人都可能有偏見，即使他們主動嘗試糾偏也是如此；歧視性做法也可能被文化性地制度化。人們期待基於資料、而非基於人的主觀直覺評估來作決定，可能更公平，也能讓傳統系統中常被忽視的人獲得更好機會 [^11]。

當我們開發預測分析和 AI 系統時，我們並不只是把人的決策“自動化”——即用軟體寫明何時說“是”或“否”的規則；我們甚至把規則本身也交給資料去推斷。然而，這些系統學到的模式往往是不透明的：即使資料中存在某種相關性，我們也未必知道為什麼。如果演算法輸入中存在系統性偏差，系統很可能會在輸出中學習並放大這種偏差 [^12]。

在許多國家，反歧視法禁止依據族裔、年齡、性別、性取向、殘障或信仰等受保護特徵而區別對待他人。一個人的其他資料特徵也許可以分析，但如果這些特徵與受保護特徵相關怎麼辦？例如，在按種族隔離的社群裡，一個人的郵編，甚至其 IP 地址，都可能是種族的強預測因子。這樣一看，認為演算法能把帶偏見的資料作為輸入，卻產出公平中立的結果，幾乎是荒謬的 [^13], [^14]。然而，資料驅動決策的支持者常隱含這種信念，這種態度甚至被諷刺為“機器學習就像給偏見洗錢” [^15]。

預測分析系統只是在外推過去；如果過去有歧視，它們就會把歧視編碼並放大 [^16]。如果我們希望未來比過去更好，就需要道德想象力，而這只能由人提供 [^17]。資料和模型應當是我們的工具，而不是我們的主宰。

### 責任與問責 {#id371}

自動化決策把責任與問責問題擺到了臺前 [^17]。如果人犯了錯，可以追責，受影響者也可以申訴。演算法同樣會出錯，但如果演算法出了問題，誰來負責 [^18]？自動駕駛汽車造成事故，誰應承擔責任？自動化信用評分演算法如果系統性歧視某一族裔或宗教的人，受害者是否有救濟途徑？如果你的機器學習系統的決策受到司法審查，你能向法官解釋演算法是如何作出該決策的嗎？人不應透過“怪演算法”來逃避自己的責任。

信用評級機構是一個較早的先例：透過收集資料來對人作決策。糟糕的信用評分會讓生活變難，但至少信用分通常基於與借貸歷史直接相關的事實記錄，記錄中的錯誤也可以更正（儘管機構往往不會讓這件事變得容易）。相比之下，基於機器學習的評分演算法通常使用更廣泛的輸入且更不透明，使人更難理解某個具體決策是如何得出的，也更難判斷某人是否受到了不公平或歧視性對待 [^19]。

信用分回答的是“你過去行為如何？”；而預測分析通常基於“誰和你相似，以及像你這樣的人過去行為如何？”。把某人和“相似人群”類比，本質上就是在給人貼群體標籤，比如按居住地（這往往是種族和社會經濟階層的近似代理）來推斷。那被分錯桶的人怎麼辦？此外，如果決策因錯誤資料而出錯，幾乎不可能得到救濟 [^17]。

許多資料本質上是統計性的，這意味著即便總體機率分佈正確，具體個案也可能是錯的。比如，某國平均預期壽命是 80 歲，並不意味著你會在 80 歲生日那天去世。僅憑平均值和機率分佈，我們很難判斷某個具體個體會活到多少歲。同樣，預測系統的輸出是機率性的，在具體個案上完全可能出錯。

盲目相信資料在決策中的至高地位，不僅是錯覺，更是危險。隨著資料驅動決策越來越普遍，我們必須找到辦法讓演算法可問責、可透明，避免強化既有偏見，並在它們不可避免地犯錯時加以糾正。

我們還需要想辦法防止資料被用來傷害人，並實現其積極潛力。比如，分析可以揭示一個人財務和社會生活上的特徵。一方面，這種能力可以用於把援助精準地送到最需要的人手中。另一方面，它有時被掠奪性企業用來識別脆弱人群，並向其兜售高成本貸款、含金量極低的學歷專案等高風險產品 [^17], [^20]。

### 反饋迴路 {#id372}

即便在對人影響沒那麼立竿見影的預測應用中，比如推薦系統，我們也必須直面棘手問題。當服務越來越擅長預測使用者想看什麼內容時，它可能最終只向人們展示他們本就認同的觀點，形成迴音室，讓刻板印象、錯誤資訊和社會極化不斷滋生。我們已經看到社交媒體迴音室對選舉活動的影響。

當預測分析影響人的生活時，自我強化的反饋迴路會帶來尤其惡性的後果。比如，設想僱主用信用分來評估候選人。你原本是一個工作能力不錯、信用也不錯的人，但因某個無法控制的不幸事件突然陷入財務困境。賬單逾期後，你的信用分下降，找到工作的可能性也隨之下降。失業把你推向貧困，反過來讓你的評分更差，進一步降低就業機會 [^17]。這就是一種下行螺旋：有毒假設披著數學嚴謹與資料客觀的偽裝。

反饋迴路還有另一個例子：經濟學家發現，德國加油站引入演算法定價後，競爭反而減弱，消費者價格上升，因為演算法學會了“合謀” [^21]。

我們並不總能預測這些反饋迴路何時出現。不過，很多後果可以透過思考“整個系統”來預見（不僅是計算機化部分，還包括與系統互動的人）——這種方法稱為 **系統思維** [^22]。我們可以嘗試理解資料分析系統對不同行為、結構與特徵的響應。系統是在強化和放大人與人之間既有差異（例如讓富者更富、窮者更窮），還是在努力對抗不公？而且，即便出發點再好，我們也必須警惕非預期後果。

## 隱私與追蹤 {#id373}

除了預測分析的問題——也就是用資料自動化地對人作決策——資料收集本身也有倫理問題。收集資料的組織，與資料被收集的人之間，到底是什麼關係？

當系統只儲存使用者明確輸入的資料，因為使用者希望系統以某種方式儲存和處理它時，系統是在為使用者提供服務：使用者是客戶。但當使用者的活動是在做其他事情時被“順帶”追蹤並記錄下來，這種關係就不那麼清晰了。服務不再只是執行使用者指令，而開始擁有自己的利益，而這種利益可能與使用者利益衝突。

行為資料追蹤已成為許多線上服務面向使用者功能的重要組成部分：追蹤搜尋結果點選有助於改進搜尋排序；推薦“喜歡 X 的人也喜歡 Y”可幫助使用者發現有趣且有用的內容；A/B 測試與使用者流程分析可幫助改進使用者介面。這些功能都需要一定程度的使用者行為追蹤，使用者也能從中受益。

然而，取決於公司的商業模式，追蹤往往不會止步於此。如果服務靠廣告資助，那麼廣告主才是真正客戶，使用者利益就會退居次位。追蹤資料會變得更細、分析會更深入、資料會被長期保留，以便為營銷目的構建每個人的精細畫像。

這時，公司與被收集資料的使用者之間的關係，就開始顯著改變了。使用者得到“免費”服務，並被引導儘可能多地參與。對使用者的追蹤，主要服務的並不是這個個體，而是資助服務的廣告主需求。這樣的關係，用一個語義更陰暗的詞來描述更貼切：**監視**。

### 監視 {#id374}

做個思想實驗：把 *data* 一詞替換為 *surveillance*（監視），看看常見說法是否還那麼“好聽” [^23]。例如：“在我們這個監視驅動的組織中，我們收集即時監視流並存入監視倉庫。我們的監視科學家使用先進的分析與監視處理來產出新洞見。”

這個思想實驗對本書來說少見地帶有一點論戰色彩，彷彿書名成了《設計監視密集型應用》（*Designing Surveillance-Intensive Applications*）。但為了強調這一點，我們需要更尖銳的詞。在我們試圖讓軟體“吞噬世界” [^24] 的過程中，我們構建了人類有史以來規模最大的群體監視基礎設施。我們正快速接近這樣一個世界：幾乎每個有人居住的空間都至少有一個聯網麥克風，存在於智慧手機、智慧電視、語音助手裝置、嬰兒監視器，甚至使用雲語音識別的兒童玩具中。許多這類裝置的安全記錄都非常糟糕 [^25]。

與過去相比，新變化在於：數字化讓大規模收集人的資料變得很容易。對我們位置與行動軌跡、社交關係與通訊、購買與支付、健康資訊的監視，幾乎已不可避免。一個監視型組織最終掌握的個人資訊，甚至可能比當事人自己知道的還多——例如，在當事人意識到之前就識別出其疾病或經濟困境。

即便是過去最極權、最壓迫的政權，也只能夢想把麥克風裝進每個房間，並迫使每個人隨身攜帶可追蹤其位置與行動的裝置。可是，由於數字技術帶來的好處太大，我們如今卻自願接受這個全面監視的世界。區別只在於：資料由企業收集以向我們提供服務，而不是由政府機構為控制目的而收集 [^26]。

並非所有資料收集都一定構成監視，但把它放在“監視”的框架下審視，有助於我們理解自己與資料收集者的關係。為什麼我們似乎樂於接受企業監視？也許你覺得自己“沒什麼可隱瞞”——換句話說，你與既有權力結構完全一致，不是邊緣少數群體，也無需擔心被迫害 [^27]。但不是每個人都這麼幸運。又或者，你覺得目的似乎是善意的——不是公開的強制和馴化，而只是更好的推薦與更個性化的營銷。然而，結合上一節對預測分析的討論，這種區分就沒那麼清楚了。

我們已經看到，汽車在未經駕駛員同意的情況下追蹤其駕駛行為，並影響保險費率 [^28]；也看到了與佩戴健身追蹤裝置繫結的健康保險保障。當監視被用於決定對生活關鍵方面有重大影響的事項（如保險保障或就業）時，它看起來就不再“無害”。而且，資料分析還能揭示極具侵入性的內容：例如，智慧手錶或健身手環裡的運動感測器可以以相當高的準確率推斷你在輸入什麼（包括密碼） [^29]。感測器精度和分析演算法只會越來越強。

### 同意與選擇自由 {#id375}

我們或許會主張，使用者是自願選擇使用會追蹤其活動的服務，並且他們同意了服務條款和隱私政策，因此他們已同意資料收集。我們甚至可能聲稱，使用者正以其提供的資料換取有價值的服務，而追蹤是提供該服務所必需的。毫無疑問，社交網路、搜尋引擎和各種其他免費線上服務確實對使用者有價值——但這個論證有問題。

首先，我們應當問：追蹤在哪種意義上是“必要的”？有些追蹤形式確實直接用於改進使用者功能：例如，追蹤搜尋結果點選率可提升搜尋排序與相關性；追蹤客戶常一起購買哪些商品，可幫助網店推薦關聯商品。然而，當追蹤使用者互動是為了內容推薦，或為了廣告構建使用者畫像時，這是否真正符合使用者利益就不那麼清楚了——還是說，它之所以“必要”，僅僅是因為廣告在為服務買單？

其次，使用者對自己向我們的資料庫“餵入”了哪些資料、這些資料如何被保留與處理，幾乎沒有認知——而多數隱私政策更多是在遮蔽而非闡明。使用者若不瞭解其資料會發生什麼，就無法給出有意義的同意。並且，某個使用者的資料往往也會揭示並非該服務使用者、也未同意任何條款的其他人的資訊。我們在本書這部分討論過的那些派生資料集——其中可能把全體使用者資料與行為追蹤及外部資料來源結合——正是使用者不可能形成有意義理解的資料型別。

此外，資料從使用者身上被抽取是單向過程，不是具有真實互惠的關係，也不是公平的價值交換。這裡沒有對話，沒有讓使用者協商“提供多少資料、換取什麼服務”的空間：服務與使用者之間的關係高度不對稱、單向度。規則由服務制定，而非使用者 [^30], [^31]。

在歐盟，《通用資料保護條例》（GDPR）要求同意必須是 “freely given, specific, informed, and unambiguous”，並且使用者必須能夠 “refuse or withdraw consent without detriment”——否則不被視為 “freely given”。任何徵求同意的請求都必須以 “an intelligible and easily accessible form, using clear and plain language” 撰寫。此外，“silence, pre-ticked boxes or inactivity \[do not\] constitute consent” [^32]。除同意外，個人資料處理還可基於其他合法基礎，例如 *legitimate interest*，它允許某些資料用途，如防欺詐 [^33]。

你可能會說，不同意被監視的使用者可以選擇不用這項服務。但這種選擇同樣不自由：如果某項服務流行到“被大多數人視為基本社會參與所必需” [^30]，那就不能合理期待人們退出——使用它在事實上成了強制（*de facto* mandatory）。例如，在多數西方社群中，攜帶智慧手機、透過社交網路社交、使用 Google 獲取資訊，已經成為常態。尤其當服務具有網路效應時，選擇 *不* 使用它會付出社會成本。

因為追蹤政策而拒絕使用某服務，說起來容易做起來難。這些平臺本來就是為吸引使用者而設計的。許多平臺使用遊戲機制和賭博常見策略來讓使用者反覆回來 [^34]。即便使用者能克服這一點，拒絕參與也往往只是少數特權人群的選項：他們有時間和知識去理解隱私政策，也有能力承擔潛在代價——比如錯過本可透過該服務獲得的社會參與或職業機會。對於處境更不利的人來說，並不存在真正意義上的選擇自由：監視變得無可逃避。

### 隱私與資料使用 {#id457}

有時有人聲稱“隱私已死”，理由是某些使用者願意在社交媒體上釋出各種生活內容，有些瑣碎，有些極度私密。但這個說法是錯誤的，它建立在對 *privacy* 一詞的誤解之上。

擁有隱私並不意味著把一切都保密；它意味著擁有選擇自由：哪些內容向誰披露、哪些公開、哪些保密。隱私權是一種決策權：它讓每個人在每種情境中，決定自己在“保密”與“透明”光譜上的位置 [^30]。這是個體自由與自主性的重要組成部分。

例如，一個患有罕見疾病的人，可能非常願意把其私密醫療資料提供給研究者，只要這有助於開發治療方法。但關鍵在於，這個人應當有權選擇誰可以訪問這些資料，以及出於什麼目的。如果其病情資訊可能損害其醫療保險、就業或其他重要權益，這個人很可能會更謹慎地共享資料。

當資料透過監視基礎設施從人們身上被抽取時，隱私權未必是被侵蝕了，而可能是被轉移了：轉移給資料收集者。獲取資料的公司本質上是在說“相信我們會正確使用你的資料”，這意味著決定“披露什麼、保密什麼”的權利，從個人轉移到了公司。

這些公司反過來會把監視結果中的很大一部分保密，因為一旦公開，會讓人感到毛骨悚然，並傷害其商業模式（該模式依賴於“比其他公司更瞭解你”）。關於使用者的私密資訊通常只以間接方式被暴露，例如透過向特定人群（如患有某種疾病的人）定向投放廣告的工具表現出來。

即便特定使用者無法從某條廣告所面向的人群桶中被個人重識別，他們仍失去了對某些私密資訊披露的主導權。決定“向誰披露什麼”不再基於使用者自己的偏好，而是公司在行使這種隱私權，目標是利潤最大化。

許多公司追求的目標是“不被 *感知* 為令人不適”，迴避“資料收集到底有多侵入”這一問題，轉而專注於管理使用者感知。而且就連這種感知管理也常常做得不好：例如，某些內容也許在事實層面是正確的，但若會觸發痛苦記憶，使用者可能並不想被提醒 [^35]。面對任何資料，我們都應預期它可能出錯、不可取或在某些情況下不合適，並且需要構建機制來處理這些失效。至於什麼算“不可取”或“不合適”，當然屬於人的判斷；演算法除非被我們顯式程式設計去尊重人的需要，否則對這些概念是無感的。作為這些系統的工程師，我們必須保持謙遜，接受並預先規劃這些失效。

線上服務裡的隱私設定，允許使用者控制其資料的哪些方面可被其他使用者看到，這是把部分控制權還給使用者的起點。然而，不管設定如何，服務本身仍可不受限制地訪問這些資料，並可在隱私政策允許範圍內任意使用。即使服務承諾不把資料出售給第三方，通常也會賦予自己在內部處理和分析資料的廣泛權利，而這種處理常常遠遠超出使用者可見範圍。

這種把隱私權從個人大規模轉移到企業的現象，在歷史上前所未有 [^30]。監視並非從未存在，但過去它昂貴且依賴人工，不具備自動化與可伸縮性。信任關係也一直存在，比如病人與醫生、被告與律師之間——但這些關係中的資料使用長期受倫理、法律與監管約束。網際網路服務則讓“在缺乏有意義同意的情況下聚合海量敏感資訊，並在使用者不知情時以大規模方式使用”變得容易得多。

### 資料作為資產與權力 {#id376}

由於行為資料是使用者與服務互動的副產物，它有時被稱為 “data exhaust”（資料尾氣），暗示這些資料是無價值的廢料。照這個角度看，行為分析與預測分析像一種“回收”，從原本會被丟棄的資料中提煉價值。

更準確的看法可能正相反：從經濟學角度看，如果定向廣告在為服務買單，那麼生成行為資料的使用者活動就可被視作一種勞動 [^36]。甚至可以更進一步主張：使用者互動的應用本身，只是引誘使用者不斷向監視基礎設施輸入更多個人資訊的手段 [^30]。線上服務中常見的人類創造力與社會關係，被資料抽取機器以冷酷方式利用。

個人資料是有價值資產，這從資料經紀商行業的存在即可見一斑：這是一個在隱秘中運作、頗為灰暗的行業，購買、聚合、分析、推斷並轉售關於個人的侵入性資料，多數用於營銷 [^20]。初創公司的估值常以使用者數、以“眼球”為基礎——也就是以其監視能力為基礎。

因為資料有價值，很多人都想要它。公司當然想要——這本就是它們收集資料的原因。政府也想拿到：透過秘密交易、脅迫、法律強制，或者直接竊取 [^37]。當公司破產時，其收集的個人資料會作為資產被出售。並且，資料很難徹底保護，洩露事件頻發得令人不安。

這些觀察促使批評者說，資料不只是資產，還是“有毒資產”（*toxic asset*） [^37]，或者至少是“危險材料”（*hazardous material*） [^38]。也許資料不是“新黃金”、不是“新石油”，而是“新鈾” [^39]。即使我們認為自己有能力防止資料濫用，每次收集資料時也必須權衡收益與其落入錯誤之手的風險：計算機系統可能被犯罪分子或敵對外國情報機構攻破，資料可能被內部人員洩露，公司可能落入與我們價值觀不一致的管理層手中，或國家可能被一個毫無顧忌、會強迫我們交出資料的政權接管。

收集資料時，我們不僅要考慮今天的政治環境，還要考慮未來所有可能的政府。無法保證未來每一屆政府都會尊重人權與公民自由，因此，“安裝那些未來可能助長警察國家的技術，是糟糕的公民衛生習慣” [^40]。

正如古老格言所說，“知識就是力量”。而且，“審視他人而避免自身被審視，是最重要的權力形式之一” [^41]。這正是極權政府追求監視的原因：它賦予其控制人口的力量。今天的科技公司雖未公開追求政治權力，但它們積累的資料與知識依然賦予其對我們生活的巨大影響力，其中很多是隱蔽的，處在公共監督之外 [^42]。

### 回顧工業革命 {#id377}

資料是資訊時代的決定性特徵。網際網路、資料儲存與處理、軟體驅動自動化，正在深刻影響全球經濟和人類社會。我們的日常生活與社會組織已被資訊科技改變，並且在未來幾十年很可能繼續發生劇烈變化，這很容易讓人聯想到工業革命 [^17], [^26]。

工業革命建立在重大技術與農業進步之上，長期看帶來了持續經濟增長和生活水平顯著改善。但它也伴隨嚴重問題：空氣汙染（煙塵與化工過程）和水汙染（工業與生活廢棄物）都觸目驚心。工廠主生活奢華，城市工人卻常住在惡劣住房裡、長時間在嚴苛條件下勞動。童工普遍存在，包括礦井中危險且低薪的工作。

社會花了很長時間才建立起各種防護措施：環境保護法規、工作場所安全規程、取締童工、食品衛生檢查。毫無疑問，當工廠不再被允許把廢棄物排進河裡、售賣汙染食品、剝削工人時，做生意的成本上升了。但整個社會從這些規制中獲益巨大，今天幾乎沒人願意回到那之前 [^17]。

正如工業革命有其需要被管理的黑暗面一樣，我們向資訊時代的過渡也有重大問題，必須正視並解決 [^43], [^44]。資料的收集與使用就是其中之一。借用 Bruce Schneier 的話 [^26]：

> 資料是資訊時代的汙染問題，而保護隱私是環境挑戰。幾乎所有計算機都會產生資訊。它會長期滯留、不斷發酵。我們如何處理它——如何圍堵它、如何處置它——對資訊經濟的健康至關重要。正如今天我們回望工業時代的早期幾十年，會疑惑我們的祖先為何在建設工業世界的狂熱中忽視了汙染問題；我們的後代也將回望資訊時代的這些早期幾十年，並以我們如何應對資料收集與濫用的挑戰來評判我們。
>
> 我們應努力讓他們感到驕傲。

### 立法與自律 {#sec_future_legislation}

資料保護法也許能夠幫助維護個體權利。例如，歐盟 GDPR 規定，個人資料必須“為特定、明確且合法的目的而收集，不得以與這些目的不相容的方式進一步處理”；並且資料必須“就處理目的而言充分、相關且限於必要範圍” [^32]。

然而，這一 **資料最小化** 原則與大資料哲學正面衝突。大資料強調最大化資料收集，把資料與其他資料集合併，持續實驗與探索，以產生新洞見。探索意味著為預見之外的目的使用資料，這與“特定且明確目的”正相反。儘管 GDPR 對線上廣告行業產生了一些影響 [^45]，監管執行總體仍偏弱 [^46]，也似乎沒有在更廣泛的科技行業內真正帶來文化與實踐層面的顯著轉變。

那些收集大量個人資料的公司把監管視為負擔和創新阻礙。這種反對在某種程度上也有其合理性。比如共享醫療資料時，隱私風險確實明確存在，但也有潛在機會：如果資料分析能幫助我們實現更好的診斷或找到更好的治療方案，能減少多少死亡 [^47]？過度監管可能會阻礙這類突破。如何平衡機會與風險並不容易 [^41]。

歸根結底，科技行業需要在個人資料問題上完成一次文化轉向。我們應停止把使用者當作可最佳化指標，記住他們是應被尊重、擁有尊嚴與主體性的人。我們應透過自律來約束資料收集與處理實踐，以建立並維繫依賴我們軟體的人們的信任 [^48]。並且，我們應主動教育終端使用者其資料如何被使用，而不是把他們矇在鼓裡。

我們應允許每個個體保有其隱私——也就是對自身資料的控制——而不是透過監視把這種控制偷走。個體對自身資料的控制權，就像國家公園中的自然環境：如果我們不明確保護並照料它，它就會被破壞。這會成為“公地悲劇”，最終所有人都更糟。無處不在的監視並非命中註定——我們仍有機會阻止它。

第一步是不要無限期保留資料，而應在不再需要時儘快清除，並在源頭最小化收集 [^48], [^49]。只要你的資料不存在，它就不會被洩露、被盜，或被政府強制交出。總的來說，這需要文化與態度的改變。作為技術從業者，如果我們不考慮自己工作的社會影響，那就是沒有盡到本職 [^50]。

## 總結 {#id594}

至此，本書接近尾聲。我們已經走過了很長一段路：

- 在 [第 1 章](/tw/ch1#ch_tradeoffs) 中，我們對比了分析型系統與事務型系統，比較了雲與自託管，權衡了分散式與單節點系統，並討論了如何平衡業務需求與使用者需求。

- 在 [第 2 章](/tw/ch2#ch_nonfunctional) 中，我們看到了如何定義非功能性需求，例如效能、可靠性、可伸縮性與可維護性。

- 在 [第 3 章](/tw/ch3#ch_datamodels) 中，我們考察了從關係模型、文件模型到圖模型的一系列資料模型，也討論了事件溯源與 DataFrame。我們還看了多種查詢語言示例，包括 SQL、Cypher、SPARQL、Datalog 與 GraphQL。

- 在 [第 4 章](/tw/ch4#ch_storage) 中，我們討論了面向 OLTP 的儲存引擎（LSM 樹與 B 樹）、面向分析的儲存（列式儲存），以及面向資訊檢索的索引（全文檢索與向量檢索）。

- 在 [第 5 章](/tw/ch5#ch_encoding) 中，我們考察了將資料物件編碼為位元組的不同方式，以及如何在需求變化時支援演化。我們還比較了程序間資料流動的幾種方式：經由資料庫、服務呼叫、工作流引擎或事件驅動架構。

- 在 [第 6 章](/tw/ch6#ch_replication) 中，我們研究了單領導者、多領導者與無主（無領導者）複製之間的權衡，也討論了寫後讀一致性等一致性模型，以及可讓客戶端離線工作的同步引擎。

- 在 [第 7 章](/tw/ch7#ch_sharding) 中，我們深入討論了分片，包括再平衡策略、請求路由與次級索引。

- 在 [第 8 章](/tw/ch8#ch_transactions) 中，我們覆蓋了事務：永續性、各種隔離級別（讀已提交、快照隔離、可序列化）的實現方式，以及如何在分散式事務中保證原子性。

- 在 [第 9 章](/tw/ch9#ch_distributed) 中，我們梳理了分散式系統中的基礎問題（網路失效與延遲、時鐘誤差、程序暫停、崩潰），並看到這些問題如何讓“實現一個看似簡單的鎖”都變得困難。

- 在 [第 10 章](/tw/ch10#ch_consistency) 中，我們深入分析了各種共識形式，以及它所支援的一致性模型（線性一致性）。

- 在 [第 11 章](/tw/ch11#ch_batch) 中，我們深入批處理，從簡單的 Unix 工具鏈一直講到基於分散式檔案系統或物件儲存的大規模分散式批處理系統。

- 在 [第 12 章](/tw/ch12#ch_stream) 中，我們把批處理推廣到流處理，討論了底層訊息代理、資料變更捕獲、容錯機制，以及流連線等處理模式。

- 在 [第 13 章](/tw/ch13#ch_philosophy) 中，我們探討了流式系統的一種哲學，它使異構資料系統更易於整合、系統更易於演化、應用更易於擴充套件。

最後，在本章中，我們後退一步，審視了構建資料密集型應用的一些倫理面向。我們看到，資料雖可為善，也可能造成嚴重傷害：作出深刻影響個人生活卻難以申訴的決策，導致歧視與剝削，使監視常態化，並暴露私密資訊。我們還面臨資料洩露風險，也可能發現某些出於善意的資料使用產生了非預期後果。

隨著軟體與資料對世界產生如此巨大的影響，我們作為工程師必須記住：我們有責任朝著我們希望生活其中的世界努力——一個以人性與尊重對待人的世界。讓我們共同朝這個目標前進。

### 參考文獻 {#references}

[^1]: David Schmudde. [What If Data Is a Bad Idea?](https://schmud.de/posts/2024-08-18-data-is-a-bad-idea.html). *schmud.de*, August 2024. Archived at [perma.cc/ZXU5-XMCT](https://perma.cc/ZXU5-XMCT)
[^2]: [ACM Code of Ethics and Professional Conduct](https://www.acm.org/code-of-ethics). Association for Computing Machinery, *acm.org*, 2018. Archived at [perma.cc/SEA8-CMB8](https://perma.cc/SEA8-CMB8)
[^3]: Igor Perisic. [Making Hard Choices: The Quest for Ethics in Machine Learning](https://www.linkedin.com/blog/engineering/archive/making-hard-choices-the-quest-for-ethics-in-machine-learning). *linkedin.com*, November 2016. Archived at [perma.cc/DGF8-KNT7](https://perma.cc/DGF8-KNT7)
[^4]: John Naughton. [Algorithm Writers Need a Code of Conduct](https://www.theguardian.com/commentisfree/2015/dec/06/algorithm-writers-should-have-code-of-conduct). *theguardian.com*, December 2015. Archived at [perma.cc/TBG2-3NG6](https://perma.cc/TBG2-3NG6)
[^5]: Ben Green. ["Good" isn't good enough](https://www.benzevgreen.com/wp-content/uploads/2019/11/19-ai4sg.pdf). At *NeurIPS Joint Workshop on AI for Social Good*, December 2019. Archived at [perma.cc/H4LN-7VY3](https://perma.cc/H4LN-7VY3)
[^6]: Deborah G. Johnson and Mario Verdicchio. [Ethical AI is Not about AI](https://cacm.acm.org/opinion/ethical-ai-is-not-about-ai/). *Communications of the ACM*, volume 66, issue 2, pages 32--34, January 2023. [doi:10.1145/3576932](https://doi.org/10.1145/3576932)
[^7]: Marc Steen. [Ethics as a Participatory and Iterative Process](https://cacm.acm.org/opinion/ethics-as-a-participatory-and-iterative-process/). *Communications of the ACM*, volume 66, issue 5, pages 27--29, April 2023. [doi:10.1145/3550069](https://doi.org/10.1145/3550069)
[^8]: Logan Kugler. [What Happens When Big Data Blunders?](https://cacm.acm.org/news/what-happens-when-big-data-blunders/) *Communications of the ACM*, volume 59, issue 6, pages 15--16, June 2016. [doi:10.1145/2911975](https://doi.org/10.1145/2911975)
[^9]: Miri Zilka. [Algorithms and the criminal justice system: promises and challenges in deployment and research](https://www.cl.cam.ac.uk/research/security/seminars/archive/video/2023-03-07-t196231.html). At *University of Cambridge Security Seminar Series*, March 2023.
[^10]: Bill Davidow. [Welcome to Algorithmic Prison](https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/). *theatlantic.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20171019201812/https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/)
[^11]: Don Peck. [They're Watching You at Work](https://www.theatlantic.com/magazine/archive/2013/12/theyre-watching-you-at-work/354681/). *theatlantic.com*, December 2013. Archived at [perma.cc/YR9T-6M38](https://perma.cc/YR9T-6M38)
[^12]: Leigh Alexander. [Is an Algorithm Any Less Racist Than a Human?](https://www.theguardian.com/technology/2016/aug/03/algorithm-racist-human-employers-work) *theguardian.com*, August 2016. Archived at [perma.cc/XP93-DSVX](https://perma.cc/XP93-DSVX)
[^13]: Jesse Emspak. [How a Machine Learns Prejudice](https://www.scientificamerican.com/article/how-a-machine-learns-prejudice/). *scientificamerican.com*, December 2016. Archived at [perma.cc/R3L5-55E6](https://perma.cc/R3L5-55E6)
[^14]: Rohit Chopra, Kristen Clarke, Charlotte A. Burrows, and Lina M. Khan. [Joint Statement on Enforcement Efforts Against Discrimination and Bias in Automated Systems](https://www.ftc.gov/system/files/ftc_gov/pdf/EEOC-CRT-FTC-CFPB-AI-Joint-Statement%28final%29.pdf). *ftc.gov*, April 2023. Archived at [perma.cc/YY4Y-RCCA](https://perma.cc/YY4Y-RCCA)
[^15]: Maciej Cegłowski. [The Moral Economy of Tech](https://idlewords.com/talks/sase_panel.htm). *idlewords.com*, June 2016. Archived at [perma.cc/L8XV-BKTD](https://perma.cc/L8XV-BKTD)
[^16]: Greg Nichols. [Artificial Intelligence in healthcare is racist](https://www.zdnet.com/article/artificial-intelligence-in-healthcare-is-racist/). *zdnet.com*, November 2020. Archived at [perma.cc/3MKW-YKRS](https://perma.cc/3MKW-YKRS)
[^17]: Cathy O'Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 978-0-553-41881-1
[^18]: Julia Angwin. [Make Algorithms Accountable](https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html). *nytimes.com*, August 2016. Archived at [archive.org](https://web.archive.org/web/20230819055242/https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html)
[^19]: Bryce Goodman and Seth Flaxman. [European Union Regulations on Algorithmic Decision-Making and a 'Right to Explanation'](https://arxiv.org/abs/1606.08813). At *ICML Workshop on Human Interpretability in Machine Learning*, June 2016. Archived at [arxiv.org/abs/1606.08813](https://arxiv.org/abs/1606.08813)
[^20]: [A Review of the Data Broker Industry: Collection, Use, and Sale of Consumer Data for Marketing Purposes](https://www.commerce.senate.gov/services/files/0d2b3642-6221-4888-a631-08f2f255b577). Staff Report, *United States Senate Committee on Commerce, Science, and Transportation*, *commerce.senate.gov*, December 2013. Archived at [perma.cc/32NV-YWLQ](https://perma.cc/32NV-YWLQ)
[^21]: Stephanie Assad, Robert Clark, Daniel Ershov, and Lei Xu. [Algorithmic Pricing and Competition: Empirical Evidence from the German Retail Gasoline Market](https://economics.yale.edu/sites/default/files/clark_acex_jan_2021.pdf). *Journal of Political Economy*, volume 132, issue 3, pages 723-771, March 2024. [doi:10.1086/726906](https://doi.org/10.1086/726906)
[^22]: Donella H. Meadows and Diana Wright. *Thinking in Systems: A Primer*. Chelsea Green Publishing, 2008. ISBN: 978-1-603-58055-7
[^23]: Daniel J. Bernstein. [Listening to a "big data"/"data science" talk. Mentally translating "data" to "surveillance": "\...everything starts with surveillance\..."](https://x.com/hashbreaker/status/598076230437568512) *x.com*, May 2015. Archived at [perma.cc/EY3D-WBBJ](https://perma.cc/EY3D-WBBJ)
[^24]: Marc Andreessen. [Why Software Is Eating the World](https://a16z.com/why-software-is-eating-the-world/). *a16z.com*, August 2011. Archived at [perma.cc/3DCC-W3G6](https://perma.cc/3DCC-W3G6)
[^25]: J. M. Porup. ['Internet of Things' Security Is Hilariously Broken and Getting Worse](https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/). *arstechnica.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250823001716/https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/)
[^26]: Bruce Schneier. [*Data and Goliath: The Hidden Battles to Collect Your Data and Control Your World*](https://www.schneier.com/books/data_and_goliath/). W. W. Norton, 2015. ISBN: 978-0-393-35217-7
[^27]: The Grugq. [Nothing to Hide](https://grugq.tumblr.com/post/142799983558/nothing-to-hide). *grugq.tumblr.com*, April 2016. Archived at [perma.cc/BL95-8W5M](https://perma.cc/BL95-8W5M)
[^28]: Federal Trade Commission. [FTC Takes Action Against General Motors for Sharing Drivers' Precise Location and Driving Behavior Data Without Consent](https://www.ftc.gov/news-events/news/press-releases/2025/01/ftc-takes-action-against-general-motors-sharing-drivers-precise-location-driving-behavior-data). *ftc.gov*, January 2025. Archived at [perma.cc/3XGV-3HRD](https://perma.cc/3XGV-3HRD)
[^29]: Tony Beltramelli. [Deep-Spying: Spying Using Smartwatch and Deep Learning](https://arxiv.org/abs/1512.05616). Masters Thesis, IT University of Copenhagen, December 2015. Archived at *arxiv.org/abs/1512.05616*
[^30]: Shoshana Zuboff. [Big Other: Surveillance Capitalism and the Prospects of an Information Civilization](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2594754). *Journal of Information Technology*, volume 30, issue 1, pages 75--89, April 2015. [doi:10.1057/jit.2015.5](https://doi.org/10.1057/jit.2015.5)
[^31]: Michiel Rhoen. [Beyond Consent: Improving Data Protection Through Consumer Protection Law](https://policyreview.info/articles/analysis/beyond-consent-improving-data-protection-through-consumer-protection-law). *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.404](https://doi.org/10.14763/2016.1.404)
[^32]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016](https://eur-lex.europa.eu/eli/reg/2016/679/oj/eng). *Official Journal of the European Union*, L 119/1, May 2016.
[^33]: UK Information Commissioner's Office. [What is the 'legitimate interests' basis?](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/lawful-basis/legitimate-interests/what-is-the-legitimate-interests-basis/) *ico.org.uk*. Archived at [perma.cc/W8XR-F7ML](https://perma.cc/W8XR-F7ML)
[^34]: Tristan Harris. [How a handful of tech companies control billions of minds every day](https://www.ted.com/talks/tristan_harris_how_a_handful_of_tech_companies_control_billions_of_minds_every_day). At *TED2017*, April 2017.
[^35]: Carina C. Zona. [Consequences of an Insightful Algorithm](https://www.youtube.com/watch?v=YRI40A4tyWU). At *GOTO Berlin*, November 2016.
[^36]: Imanol Arrieta Ibarra, Leonard Goff, Diego Jiménez Hernández, Jaron Lanier, and E. Glen Weyl. [Should We Treat Data as Labor? Moving Beyond 'Free'](https://www.aeaweb.org/conference/2018/preliminary/paper/2Y7N88na). *American Economic Association Papers and Proceedings*, volume 1, issue 1, December 2017.
[^37]: Bruce Schneier. [Data Is a Toxic Asset, So Why Not Throw It Out?](https://www.schneier.com/essays/archives/2016/03/data_is_a_toxic_asse.html) *schneier.com*, March 2016. Archived at [perma.cc/4GZH-WR3D](https://perma.cc/4GZH-WR3D)
[^38]: Cory Scott. [Data is not toxic - which implies no benefit - but rather hazardous material, where we must balance need vs. want](https://x.com/cory_scott/status/706586399483437056). *x.com*, March 2016. Archived at [perma.cc/CLV7-JF2E](https://perma.cc/CLV7-JF2E)
[^39]: Mark Pesce. [Data is the new uranium -- incredibly powerful and amazingly dangerous](https://www.theregister.com/2024/11/20/data_is_the_new_uranium/). *theregister.com*, November 2024. Archived at [perma.cc/NV8B-GYGV](https://perma.cc/NV8B-GYGV)
[^40]: Bruce Schneier. [Mission Creep: When Everything Is Terrorism](https://www.schneier.com/essays/archives/2013/07/mission_creep_when_e.html). *schneier.com*, July 2013. Archived at [perma.cc/QB2C-5RCE](https://perma.cc/QB2C-5RCE)
[^41]: Lena Ulbricht and Maximilian von Grafenstein. [Big Data: Big Power Shifts?](https://policyreview.info/articles/analysis/big-data-big-power-shifts) *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.406](https://doi.org/10.14763/2016.1.406)
[^42]: Ellen P. Goodman and Julia Powles. [Facebook and Google: Most Powerful and Secretive Empires We've Ever Known](https://www.theguardian.com/technology/2016/sep/28/google-facebook-powerful-secretive-empire-transparency). *theguardian.com*, September 2016. Archived at [perma.cc/8UJA-43G6](https://perma.cc/8UJA-43G6)
[^43]: Judy Estrin and Sam Gill. [The World Is Choking on Digital Pollution](https://washingtonmonthly.com/2019/01/13/the-world-is-choking-on-digital-pollution/). *washingtonmonthly.com*, January 2019. Archived at [perma.cc/3VHF-C6UC](https://perma.cc/3VHF-C6UC)
[^44]: A. Michael Froomkin. [Regulating Mass Surveillance as Privacy Pollution: Learning from Environmental Impact Statements](https://repository.law.miami.edu/cgi/viewcontent.cgi?article=1062&context=fac_articles). *University of Illinois Law Review*, volume 2015, issue 5, August 2015. Archived at [perma.cc/24ZL-VK2T](https://perma.cc/24ZL-VK2T)
[^45]: Pengyuan Wang, Li Jiang, and Jian Yang. [The Early Impact of GDPR Compliance on Display Advertising: The Case of an Ad Publisher](https://openreview.net/pdf?id=TUnLHNo19S). *Journal of Marketing Research*, volume 61, issue 1, April 2023. [doi:10.1177/00222437231171848](https://doi.org/10.1177/00222437231171848)
[^46]: Johnny Ryan. [Don't be fooled by Meta's fine for data breaches](https://www.economist.com/by-invitation/2023/05/24/dont-be-fooled-by-metas-fine-for-data-breaches-says-johnny-ryan). *The Economist*, May 2023. Archived at [perma.cc/VCR6-55HR](https://perma.cc/VCR6-55HR)
[^47]: Jessica Leber. [Your Data Footprint Is Affecting Your Life in Ways You Can't Even Imagine](https://www.fastcompany.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine). *fastcompany.com*, March 2016. Archived at [archive.org](https://web.archive.org/web/20161128133016/https://www.fastcoexist.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine)
[^48]: Maciej Cegłowski. [Haunted by Data](https://idlewords.com/talks/haunted_by_data.htm). *idlewords.com*, October 2015. Archived at [archive.org](https://web.archive.org/web/20161130143932/https://idlewords.com/talks/haunted_by_data.htm)
[^49]: Sam Thielman. [You Are Not What You Read: Librarians Purge User Data to Protect Privacy](https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy). *theguardian.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250828224851/https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy)
[^50]: Jez Humble. [It's a cliché that people get into tech to "change the world". So then, you have to actually consider what the impact of your work is on the world. The idea that you can or should exclude societal and political discussions in tech is idiotic. It means you're not doing your job](https://x.com/jezhumble/status/1386758340894597122). *x.com*, April 2021. Archived at [perma.cc/3NYS-MHLC](https://perma.cc/3NYS-MHLC)

================================================
FILE: content/tw/ch2.md
================================================
---
title: "2. 定義非功能性需求"
weight: 102
breadcrumbs: false
---

<a id="ch_nonfunctional"></a>

![](/map/ch01.png)

> *網際網路做得太好了，以至於大多數人把它看成像太平洋那樣的自然資源，而不是人造產物。上一次出現這種規模且幾乎無差錯的技術是什麼時候？*
>
> [艾倫・凱](https://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442)，
> 在接受 *Dr Dobb's Journal* 採訪時（2012 年）

構建一個應用時，你通常會從一張需求清單開始。清單最上面的，往往是應用必須提供的功能：需要哪些頁面和按鈕，每個操作應該完成什麼行為，才能實現軟體的目標。這些就是 ***功能性需求***。

此外，你通常還會有一些 ***非功能性需求***：例如，應用應當足夠快、足夠可靠、足夠安全、符合法規，而且易於維護。這些需求可能並沒有明確寫下來，因為它們看起來像是“常識”，但它們與功能需求同樣重要。一個慢得無法忍受、或頻繁出錯的應用，幾乎等於不存在。

許多非功能性需求（比如安全）超出了本書範圍。但本章會討論其中幾項核心要求，並幫助你用更清晰的方式描述自己的系統：

* 如何定義並衡量系統的 **效能**（參見 ["描述效能"](#sec_introduction_percentiles)）；
* 服務 **可靠** 到底意味著什麼：也就是在出錯時仍能持續正確工作（參見 ["可靠性與容錯"](#sec_introduction_reliability)）；
* 如何透過高效增加計算資源，讓系統在負載增長時保持 **可伸縮性**（參見 ["可伸縮性"](#sec_introduction_scalability)）；以及
* 如何讓系統在長期演進中保持 **可維護性**（參見 ["可維護性"](#sec_introduction_maintainability)）。

本章引入的術語，在後續章節深入實現細節時也會反覆用到。不過純定義往往比較抽象。為了把概念落到實處，本章先從一個案例研究開始：看看社交網路服務可能如何實現，並藉此討論效能與可伸縮性問題。


## 案例研究：社交網路首頁時間線 {#sec_introduction_twitter}

假設你要實現一個類似 X（原 Twitter）的社交網路：使用者可以發帖，並追隨其他使用者。這是對真實系統實現方式的極大簡化 [^1] [^2] [^3]，但足以說明大規模系統會遇到的一些關鍵問題。

我們假設：使用者每天發帖 5 億條，平均每秒約 5,700 條；在特殊事件期間，峰值可能衝到每秒 150,000 條 [^4]。再假設平均每位使用者追隨 200 人，並有 200 名追隨者（實際分佈非常不均勻：大多數人只有少量追隨者，少數名人如巴拉克・奧巴馬則有上億追隨者）。

### 表示使用者、帖子與關注關係 {#id20}

假設我們將所有資料儲存在關係資料庫中，如 [圖 2-1](#fig_twitter_relational) 所示。我們有一個使用者表、一個帖子表和一個關注關係表。

{{< figure src="/fig/ddia_0201.png" id="fig_twitter_relational" caption="圖 2-1. 社交網路的簡單關係模式，使用者可以相互關注。" class="w-full my-4" >}}

假設該社交網路最重要的讀操作是 *首頁時間線*：展示你所追隨的人最近釋出的帖子（為簡化起見，我們忽略廣告、未追隨使用者的推薦帖，以及其他擴充功能）。獲取某個使用者首頁時間線的 SQL 可能如下：

```sql
SELECT posts.*, users.* FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    JOIN users ON posts.sender_id = users.id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

要執行此查詢，資料庫將使用 `follows` 表找到 `current_user` 關注的所有人，查詢這些使用者最近的帖子，並按時間戳排序以獲取被關注使用者的最新 1,000 條帖子。

帖子具有時效性。我們假設：某人發帖後，追隨者應在 5 秒內看到。一個做法是客戶端每 5 秒重複執行一次上述查詢（即 *輪詢*）。如果同時線上登入使用者有 1000 萬，就意味著每秒要執行 200 萬次查詢。即使把輪詢間隔調大，這個量也很可觀。

此外，這個查詢本身也很昂貴。若你追隨 200 人，系統就要分別抓取這 200 人的近期帖子列表，再把它們歸併。每秒 200 萬次時間線查詢，等價於資料庫每秒要執行約 4 億次“按傳送者查最近帖子”。這還只是平均情況。少數使用者會追隨數萬賬戶，這個查詢對他們尤其昂貴，也更難做快。

### 時間線的物化與更新 {#sec_introduction_materializing}

要如何最佳化？第一，與其輪詢，不如由伺服器主動向線上追隨者推送新帖。第二，我們應該預先計算上述查詢結果，讓首頁時間線請求可以直接從快取返回。

設想我們為每個使用者維護一個資料結構，儲存其首頁時間線，也就是其所追隨者的近期帖子。每當使用者發帖，我們就找出其所有追隨者，把這條帖子插入每個追隨者的首頁時間線中，就像往郵箱裡投遞信件。這樣使用者登入時，可以直接讀取預先算好的時間線。若要接收新帖提醒，客戶端只需訂閱“寫入該時間線”的帖子流即可。

這種方法的缺點是：每次發帖時都要做更多工作，因為首頁時間線屬於需要持續更新的派生資料。這個過程見 [圖 2-2](#fig_twitter_timelines)。當一個初始請求觸發多個下游請求時，我們用 *扇出* 描述請求數量被放大的倍數。

{{< figure src="/fig/ddia_0202.png" id="fig_twitter_timelines" caption="圖 2-2. 扇出：將新帖子傳遞給釋出帖子的使用者的每個追隨者。" class="w-full my-4" >}}

按每秒 5,700 條帖子計算，若平均每條帖到達 200 名追隨者（扇出因子 200），則每秒需要略高於 100 萬次首頁時間線寫入。這已經很多，但相比原先每秒 4 億次“按傳送者查帖”，仍是顯著最佳化。

如果遇到特殊事件導致發帖速率激增，我們不必立刻完成時間線投遞。可以先入隊，接受“帖子出現在追隨者時間線中”會暫時變慢。即便在這種峰值期，時間線載入仍然很快，因為讀取仍來自快取。

這種預先計算並持續更新查詢結果的過程稱為 *物化*。時間線快取就是一種 *物化檢視*（這個概念見 [“維護物化檢視”](/tw/ch12#sec_stream_mat_view)）。物化檢視能加速讀取，但代價是寫入側工作量增加。對大多數使用者而言，這個寫入成本仍可接受，但社交網路還要處理一些極端情況：

* 如果某使用者追隨了大量賬戶，且這些賬戶發帖頻繁，那麼該使用者的物化時間線寫入率會很高。但在這種場景下，使用者通常也看不完全部帖子，因此可以丟棄部分時間線寫入，只展示其追隨賬戶帖子的一部分樣本 [^5]。
* 如果一個擁有海量追隨者的名人賬號發帖，我們需要把這條帖子寫入其數百萬追隨者的首頁時間線，工作量極大。此時不能隨意丟寫。常見做法是把名人帖子與普通帖子分開處理：名人帖單獨儲存，讀取時間線時再與物化時間線合併，從而省去寫入數百萬條時間線的成本。即便如此，服務名人賬號仍需大量基礎設施 [^6]。

## 描述效能 {#sec_introduction_percentiles}

軟體效能通常圍繞兩類指標展開：

響應時間
: 從使用者發出請求到收到響應所經歷的時間。單位是秒（或毫秒、微秒）。

吞吐量
: 系統每秒可處理的請求數或資料量。對於給定硬體資源，系統存在一個可處理的 *最大吞吐量*。單位是“每秒某種工作量”。

在社交網路案例中，“每秒帖子數”和“每秒時間線寫入數”屬於吞吐量指標；“載入首頁時間線所需時間”或“帖子送達追隨者所需時間”屬於響應時間指標。

吞吐量和響應時間之間通常相關。線上服務的典型關係如 [圖 2-3](#fig_throughput)：低吞吐量時響應時間較低，負載升高後響應時間上升。原因是 *排隊*。請求到達高負載系統時，CPU 往往已在處理前一個請求，新請求只能等待；當吞吐量逼近硬體上限，排隊延遲會急劇上升。

{{< figure src="/fig/ddia_0203.png" id="fig_throughput" caption="圖 2-3. 隨著服務的吞吐量接近其容量，由於排隊，響應時間急劇增加。" class="w-full my-4" >}}

--------

<a id="sidebar_metastable"></a>

> [!TIP] 當過載系統無法恢復時

如果系統已接近過載、吞吐量逼近極限，有時會進入惡性迴圈：效率下降，進而更加過載。例如，請求佇列很長時，響應時間可能高到讓客戶端超時並重發請求，導致請求速率進一步上升，問題持續惡化，形成 *重試風暴*。即使負載後來回落，系統也可能仍卡在過載狀態，直到重啟或重置。這種現象叫 *亞穩態故障*（Metastable Failure），可能引發嚴重生產故障 [^7] [^8]。

為了避免重試把服務拖垮，可以在客戶端拉大並隨機化重試間隔（*指數退避* [^9] [^10]），並臨時停止向近期報錯或超時的服務發請求（例如 *熔斷器* [^11] [^12] 或 *令牌桶* [^13]）。服務端也可在接近過載時主動拒絕請求（*負載卸除* [^14]），並透過響應要求客戶端降速（*背壓* [^1] [^15]）。此外，排隊與負載均衡演算法的選擇也會影響結果 [^16]。

--------

從效能指標角度看，使用者通常最關心響應時間；而吞吐量決定了所需計算資源（例如伺服器數量），從而決定承載特定工作負載的成本。如果吞吐量增長可能超過當前硬體上限，就必須擴容；若系統可透過增加計算資源顯著提升最大吞吐量，就稱其 *可伸縮*。

本節主要討論響應時間；吞吐量與可伸縮性會在 ["可伸縮性"](#sec_introduction_scalability) 一節再展開。

### 延遲與響應時間 {#id23}

“延遲”和“響應時間”有時會混用，但本書對它們有明確區分（見 [圖 2-4](#fig_response_time)）：

* *響應時間* 是客戶端看到的總時間，包含鏈路上各處產生的全部延遲。
* *服務時間* 是服務主動處理該請求的時間。
* *排隊延遲* 可發生在流程中的多個位置。例如請求到達後，可能要等 CPU 空出來才能處理；同機其他任務若佔滿出站網路卡，響應包也可能先在緩衝區等待發送。
* *延遲* 是對“請求未被主動處理這段時間”的統稱，也就是請求處於 *潛伏（latent）* 狀態的時間。尤其是 *網路延遲*（或網路時延）指請求與響應在網路中傳播所花的時間。

{{< figure src="/fig/ddia_0204.png" id="fig_response_time" caption="圖 2-4. 響應時間、服務時間、網路延遲和排隊延遲。" class="w-full my-4" >}}

在 [圖 2-4](#fig_response_time) 中，時間從左向右流動。每個通訊節點畫成一條水平線，請求/響應訊息畫成節點間的粗斜箭頭。本書後文會頻繁使用這種圖示風格。

即便反覆傳送同一個請求，響應時間也可能顯著波動。許多因素都會引入隨機延遲：例如切換到後臺程序、網路丟包與 TCP 重傳、垃圾回收暫停、缺頁導致的磁碟讀取、伺服器機架機械振動 [^17] 等。我們會在 ["超時與無界延遲"](/tw/ch9#sec_distributed_queueing) 進一步討論這個問題。

排隊延遲常常是響應時間波動的主要來源。伺服器並行處理能力有限（例如受 CPU 核數約束），少量慢請求就可能堵住後續請求，這就是 *頭部阻塞*。即便後續請求本身服務時間很短，客戶端仍會因為等待前序請求而看到較慢的總體響應。排隊延遲不屬於服務時間，因此必須在客戶端側測量響應時間。

### 平均值、中位數與百分位點 {#id24}

由於響應時間會隨請求變化，我們應將其看作一個可測量的 *分佈*，而非單一數字。在 [圖 2-5](#fig_lognormal) 中，每個灰色柱表示一次請求，柱高是該請求耗時。大多數請求較快，但會有少量更慢的 *異常值*。網路時延波動也常稱為 *抖動*。

{{< figure src="/fig/ddia_0205.png" id="fig_lognormal" caption="圖 2-5. 說明平均值和百分位點：100 個服務請求的響應時間樣本。" class="w-full my-4" >}}

報告服務 *平均* 響應時間很常見（嚴格說是 *算術平均值*：總響應時間除以請求數）。平均值對估算吞吐量上限有幫助 [^18]。但若你想知道“典型”響應時間，平均值並不理想，因為它不能反映到底有多少使用者經歷了這種延遲。

通常，*百分位點* 更有意義。把響應時間從快到慢排序，*中位數* 位於中間。例如中位響應時間為 200 毫秒，表示一半請求在 200 毫秒內返回，另一半更慢。因此中位數適合衡量使用者“通常要等多久”。中位數也稱 *第 50 百分位*，常記為 *p50*。

為了看清異常值有多糟，需要觀察更高百分位點：常見的是 *p95*、*p99*、*p999*。它們表示 95%、99%、99.9% 的請求都快於該閾值。例如 p95 為 1.5 秒，表示 100 個請求裡有 95 個小於 1.5 秒，另外 5 個不小於 1.5 秒。[圖 2-5](#fig_lognormal) 展示了這一點。

響應時間的高百分位點（也叫 *尾部延遲*）非常重要，因為它直接影響使用者體驗。例如亞馬遜內部服務常以第 99.9 百分位設定響應要求，儘管它只影響 1/1000 的請求。原因是最慢請求往往來自“賬戶資料最多”的客戶，他們通常也是最有價值客戶 [^19]。讓這批使用者也能獲得快速響應，對業務很關鍵。

另一方面，繼續最佳化到第 99.99 百分位（最慢的萬分之一請求）通常成本過高、收益有限。越到高百分位，越容易受不可控隨機因素影響，也更符合邊際收益遞減規律。

--------

> [!TIP] 響應時間對使用者的影響

直覺上，快服務當然比慢服務更好 [^20]。但真正要拿到“延遲如何影響使用者行為”的可靠量化資料，其實非常困難。

一些被頻繁引用的統計並不可靠。2006 年，Google 曾報告：搜尋結果從 400 毫秒變慢到 900 毫秒，與流量和收入下降 20% 相關 [^21]。但 2009 年 Google 另一項研究又稱，延遲增加 400 毫秒僅導致日搜尋量下降 0.6% [^22]；同年 Bing 發現，載入時間增加 2 秒會讓廣告收入下降 4.3% [^23]。這些公司的更新資料似乎並未公開。

Akamai 的一項較新研究 [^24] 聲稱：響應時間增加 100 毫秒會讓電商網站轉化率最多下降 7%。但細看可知，同一研究也顯示“載入極快”的頁面同樣和較低轉化率相關。這個看似矛盾的結果，很可能是因為載入最快的頁面往往是“無有效內容”的頁面（如 404）。而該研究並未把“頁面內容影響”和“載入時間影響”區分開，因此結論可能並不可靠。

Yahoo 的一項研究 [^25] 在控制搜尋結果質量後，比對了快慢載入對點選率的影響。結果顯示：當快慢響應差異達到 1.25 秒或以上時，快速搜尋的點選量會高出 20%–30%。

--------

### 響應時間指標的應用 {#sec_introduction_slo_sla}

對於“一個終端請求會觸發多次後端呼叫”的服務，高百分位點尤其關鍵。即使並行呼叫，終端請求仍要等待最慢的那個返回。正如 [圖 2-6](#fig_tail_amplification) 所示，只要一個呼叫慢，就能拖慢整個終端請求。即便慢呼叫比例很小，只要後端呼叫次數變多，撞上慢呼叫的機率就會上升，於是更大比例的終端請求會變慢（稱為 *尾部延遲放大* [^26]）。

{{< figure src="/fig/ddia_0206.png" id="fig_tail_amplification" caption="圖 2-6. 當需要幾個後端呼叫來服務請求時，只需要一個慢的後端請求就可以減慢整個終端使用者請求。" class="w-full my-4" >}}

百分位點也常用於定義 *服務級別目標*（SLO）和 *服務級別協議*（SLA）[^27]。例如，一個 SLO 可能要求：中位響應時間低於 200 毫秒、p99 低於 1 秒，並且至少 99.9% 的有效請求返回非錯誤響應。SLA 則是“未達成 SLO 時如何處理”的合同條款（例如客戶可獲賠償）。這是基本思路；但在實踐中，為 SLO/SLA 設計合理可用性指標並不容易 [^28] [^29]。

--------

> [!TIP] 計算百分位點

如果你想在監控面板中展示響應時間百分位點，就需要持續且高效地計算它們。例如，維護“最近 10 分鐘請求響應時間”的滾動視窗，每分鐘計算一次該視窗內的中位數與各百分位點，並繪圖展示。

最簡單的實現是儲存視窗內全部請求的響應時間，並每分鐘排序一次。若效率不夠，可以用一些低 CPU/記憶體開銷的演算法來近似計算百分位點。常見開源庫包括 HdrHistogram、t-digest [^30] [^31]、OpenHistogram [^32] 和 DDSketch [^33]。

要注意，“對百分位點再取平均”（例如降低時間解析度，或合併多機器資料）在數學上沒有意義。聚合響應時間資料的正確方式是聚合直方圖 [^34]。

--------

## 可靠性與容錯 {#sec_introduction_reliability}

每個人對“可靠”與“不可靠”都有直覺。對軟體而言，典型期望包括：

* 應用能完成使用者預期的功能。
* 能容忍使用者犯錯，或以意料之外的方式使用軟體。
* 在預期負載與資料規模下，效能足以支撐目標用例。
* 能防止未授權訪問與濫用。

如果把這些合起來稱為“正確工作”，那麼 *可靠性* 可以粗略理解為：即使出現問題，系統仍能持續正確工作。為了更精確地描述“出問題”，我們區分 *故障* 與 *失效* [^35] [^36] [^37]：

故障
: 指系統某個 *區域性元件* 停止正常工作：例如單個硬碟損壞、單臺機器宕機，或系統依賴的外部服務中斷。

失效
: 指 *整個系統* 無法繼續向用戶提供所需服務；換言之，系統未滿足服務級別目標（SLO）。

“故障”與“失效”的區別容易混淆，因為它們本質上是同一件事在不同層級上的表述。比如一個硬碟壞了，對“硬碟這個系統”來說是失效；但對“由許多硬碟組成的更大系統”來說，它只是一個故障。更大系統若在其他硬碟上有副本，就可能容忍該故障。

### 容錯 {#id27}

如果系統在發生某些故障時仍繼續向用戶提供所需的服務，我們稱系統為 *容錯的*。如果系統不能容忍某個部分變得有故障，我們稱該部分為 *單點故障*（SPOF），因為該部分的故障會升級導致整個系統的失效。

例如在社交網路案例中，扇出流程裡可能有機器崩潰或不可用，導致物化時間線更新中斷。若要讓該流程具備容錯性，就必須保證有其他機器可接管任務，同時既不漏投帖子，也不重複投遞。（這個思想稱為 *恰好一次語義*，我們會在 [“資料庫的端到端論證”](/tw/ch13#sec_future_end_to_end) 中詳細討論。）

容錯能力總是“有邊界”的：它只針對某些型別、某個數量以內的故障。例如系統可能最多容忍 2 塊硬碟同時故障，或 3 個節點裡壞 1 個。若全部節點都崩潰，就無計可施，因此“容忍任意數量故障”並無意義。要是地球和上面的伺服器都被黑洞吞噬，那就只能去太空託管了，預算審批祝你好運。

反直覺的是，在這類系統裡，故意 *提高* 故障發生率反而有意義，例如無預警隨機殺死某個程序。這叫 *故障注入*。許多關鍵故障本質上是錯誤處理做得不夠好 [^38]。透過主動注入故障，可以持續演練並驗證容錯機制，提升對“真實故障發生時系統仍能正確處理”的信心。*混沌工程* 就是圍繞這類實驗建立起來的方法論 [^39]。

儘管我們通常更傾向於“容忍故障”，而非“阻止故障”，但也有“預防優於補救”的場景（例如根本無法補救）。安全問題就是如此：若攻擊者已攻破系統並獲取敏感資料，事件本身無法撤銷。不過，本書主要討論的是可恢復的故障型別。

### 硬體與軟體故障 {#sec_introduction_hardware_faults}

當我們想到系統失效的原因時，硬體故障很快就會浮現在腦海中：

* 機械硬碟每年故障率約為 2%–5% [^40] [^41]；在 10,000 盤位的儲存叢集中，平均每天約有 1 塊盤故障。近期資料表明磁碟可靠性在提升，但故障率仍不可忽視 [^42]。
* SSD 每年故障率約為 0.5%–1% [^43]。少量位元錯誤可自動糾正 [^44]，但不可糾正錯誤大約每盤每年一次，即使是磨損較輕的新盤也會出現；該錯誤率高於機械硬碟 [^45] [^46]。
* 其他硬體元件，如電源、RAID 控制器和記憶體模組也會發生故障，儘管頻率低於硬碟驅動器 [^47] [^48]。
* 大約每 1000 臺機器裡就有 1 臺存在“偶發算錯結果”的 CPU 核心，可能由製造缺陷導致 [^49] [^50] [^51]。有時錯誤計算會直接導致崩潰；有時則只是悄悄返回錯誤結果。
* RAM 資料也可能損壞：要麼來自宇宙射線等隨機事件，要麼來自永久性物理缺陷。即便使用 ECC 記憶體，任意一年內仍有超過 1% 的機器會遇到不可糾正錯誤，通常表現為機器崩潰並需要更換受影響記憶體條 [^52]。此外，某些病態訪問模式還可能以較高機率觸發位元翻轉 [^53]。
* 整個資料中心也可能不可用（如停電、網路配置錯誤），甚至被永久摧毀（如火災、洪水、地震 [^54]）。太陽風暴會在長距離導線中感應大電流，可能損壞電網和海底通訊電纜 [^55]。這類大規模故障雖罕見，但若服務無法容忍資料中心丟失，後果將極其嚴重 [^56]。

這類事件在小系統裡足夠罕見，通常不必過度擔心，只要能方便地更換故障硬體即可。但在大規模系統裡，硬體故障足夠頻繁，已經是“正常執行”的一部分。

#### 透過冗餘容忍硬體故障 {#tolerating-hardware-faults-through-redundancy}

我們對不可靠硬體的第一反應通常是向各個硬體元件新增冗餘，以降低系統的故障率。磁碟可以設定為 RAID 配置（將資料分佈在同一臺機器的多個磁碟上，以便故障磁碟不會導致資料丟失），伺服器可能有雙電源和可熱插拔的 CPU，資料中心可能有電池和柴油發電機作為備用電源。這種冗餘通常可以使機器不間斷執行多年。

當元件故障獨立時，冗餘最有效，即一個故障的發生不會改變另一個故障發生的可能性。然而，經驗表明，元件故障之間通常存在顯著的相關性 [^41] [^57] [^58]；整個伺服器機架或整個資料中心的不可用仍然比我們預期的更頻繁地發生。

硬體冗餘確實能提升單機可用時間；但正如 ["分散式與單節點系統"](/tw/ch1#sec_introduction_distributed) 所述，分散式系統還具備額外優勢，例如可容忍整個資料中心中斷。因此雲系統通常不再過分追求“單機極致可靠”，而是透過軟體層容忍節點故障來實現高可用。雲廠商使用 *可用區* 標識資源是否物理共址；同一可用區內資源比跨地域資源更容易同時失效。

我們在本書中討論的容錯技術旨在容忍整個機器、機架或可用區的丟失。它們通常透過允許一個資料中心的機器在另一個資料中心的機器發生故障或變得不可達時接管來工作。我們將在 [第 6 章](/tw/ch6)、[第 10 章](/tw/ch10) 以及本書的其他各個地方討論這種容錯技術。

能夠容忍整個機器丟失的系統也具有運營優勢：如果你需要重新啟動機器（例如，應用作業系統安全補丁），單伺服器系統需要計劃停機時間，而多節點容錯系統可以一次修補一個節點，而不影響使用者的服務。這稱為 *滾動升級*，我們將在 [第 5 章](/tw/ch5) 中進一步討論它。

#### 軟體故障 {#software-faults}

儘管硬體故障可能存在弱相關，但整體上仍相對獨立：例如一塊盤壞了，同機其他盤往往還能再正常工作一段時間。相比之下，軟體故障常常高度相關，因為許多節點運行同一套軟體，也就共享同一批 bug [^59] [^60]。這類故障更難預判，也往往比“相互獨立的硬體故障”造成更多系統失效 [^47]。例如：

* 在特定情況下導致每個節點同時失效的軟體錯誤。例如，2012 年 6 月 30 日，閏秒導致許多 Java 應用程式由於 Linux 核心中的錯誤而同時掛起 [^61]。由於韌體錯誤，某些型號的所有 SSD 在精確執行 32,768 小時（不到 4 年）後突然失效，使其上的資料無法恢復 [^62]。
* 使用某些共享、有限資源（如 CPU 時間、記憶體、磁碟空間、網路頻寬或執行緒）的失控程序 [^63]。例如，處理大請求時消耗過多記憶體的程序可能會被作業系統殺死。客戶端庫中的錯誤可能導致比預期更高的請求量 [^64]。
* 系統所依賴的服務變慢、無響應或開始返回損壞的響應。
* 不同系統互動後出現“單系統隔離測試中看不到”的湧現行為 [^65]。
* 級聯故障，其中一個元件中的問題導致另一個元件過載和減速，這反過來又導致另一個元件崩潰 [^66] [^67]。

導致這類軟體故障的 bug 往往潛伏很久，直到一組不尋常條件把它觸發出來。這時才暴露出：軟體其實對執行環境做了某些假設，平時大多成立，但終有一天會因某種原因失效 [^68] [^69]。

軟體系統性故障沒有“速效藥”。但許多小措施都有效：認真審視系統假設與互動、充分測試、程序隔離、允許程序崩潰並重啟、避免反饋環路（如重試風暴，參見 ["當過載系統無法恢復時"](#sidebar_metastable)），以及在生產環境持續度量、監控和分析系統行為。

### 人類與可靠性 {#id31}

軟體系統由人設計、構建和運維。與機器不同，人不會只按規則執行；人的優勢在於創造性和適應性。但這也帶來不可預測性，即使本意是好的，也會犯導致失效的錯誤。例如，一項針對大型網際網路服務的研究發現：運維配置變更是中斷首因，而硬體故障（伺服器或網路）僅佔 10%–25% [^70]。

遇到這類問題，人們很容易歸咎於“人為錯誤”，並試圖透過更嚴格流程和更強規則約束來控制人。但“責怪個人”通常適得其反。所謂“人為錯誤”往往不是事故根因，而是社會技術系統本身存在問題的徵兆 [^71]。複雜系統裡，元件意外互動產生的湧現行為也常導致故障 [^72]。

有多種技術手段可降低人為失誤的影響：充分測試（含手寫測試與大量隨機輸入的 *屬性測試*）[^38]、可快速回滾配置變更的機制、新程式碼漸進發布、清晰細緻的監控、用於排查生產問題的可觀測性工具（參見 ["分散式系統的問題"](/tw/ch1#sec_introduction_dist_sys_problems)），以及鼓勵“正確操作”並抑制“錯誤操作”的良好介面設計。

但這些措施都需要時間和預算。在日常業務壓力下，組織往往優先投入“直接創收”活動，而非提升抗錯韌性的建設。若在“更多功能”和“更多測試”之間二選一，很多組織會自然選擇前者。既然如此，當可預防錯誤最終發生時，責怪個人並無意義，問題本質在於組織的優先順序選擇。

越來越多組織在實踐 *無責備事後分析*：事故發生後，鼓勵參與者在不擔心懲罰的前提下完整覆盤細節，讓組織其他人也能學習如何避免類似問題 [^73]。這個過程常會揭示出：業務優先順序需要調整、某些長期被忽視的領域需要補投入、相關激勵機制需要改，或其他應由管理層關注的系統性問題。

一般來說，調查事故時應警惕“過於簡單”的答案。“鮑勃部署時應更小心”沒有建設性，“我們必須用 Haskell 重寫後端”同樣不是。更可行的做法是：管理層藉機從一線人員視角理解社會技術系統的真實執行方式，並據此推動改進 [^71]。

--------

<a id="sidebar_reliability_importance"></a>

> [!TIP] 可靠性有多重要？

可靠性不只適用於核電站或空管系統，普通應用同樣需要可靠。企業軟體中的 bug 會造成生產力損失（若報表錯誤還會帶來法律風險）；電商網站故障則會帶來直接收入損失和品牌傷害。

在許多應用裡，幾分鐘乃至幾小時的短暫中斷尚可容忍 [^74]；但永久性資料丟失或損壞往往是災難性的。想象一位家長把孩子的全部照片和影片都存在你的相簿應用裡 [^75]。若資料庫突然損壞，他們會怎樣？又是否知道如何從備份恢復？

另一個“軟體不可靠傷害現實人群”的例子，是英國郵局 Horizon 醜聞。1999 到 2019 年間，數百名郵局網點負責人因會計系統顯示“賬目短缺”被判盜竊或欺詐。後來事實證明，許多“短缺”來自軟體缺陷，且大量判決已被推翻 [^76]。造成這場可能是英國史上最大司法不公的一個關鍵前提，是英國法律預設計算機正常執行（因此其證據可靠），除非有相反證據 [^77]。軟體工程師或許會覺得“軟體無 bug”很荒謬，但這對那些因此被錯判入獄、破產乃至自殺的人來說毫無安慰。

在某些場景下，我們也許會有意犧牲部分可靠性來降低開發成本（例如做未驗證市場的原型產品）。但應明確知道自己在何處“走捷徑”，並充分評估其後果。

--------

## 可伸縮性 {#sec_introduction_scalability}

即便系統今天執行可靠，也不代表將來一定如此。效能退化的常見原因之一是負載增長：比如併發使用者從 1 萬漲到 10 萬，或從 100 萬漲到 1000 萬；也可能是處理的資料規模遠大於從前。

*可伸縮性* 用來描述系統應對負載增長的能力。討論這個話題時，常有人說：“你又不是 Google/Amazon，別擔心規模，直接上關係資料庫。”這句話是否成立，取決於你在做什麼型別的應用。

如果你在做一個目前使用者很少的新產品（例如創業早期），首要工程目標通常是“儘可能簡單、儘可能靈活”，以便隨著對使用者需求理解加深而快速調整產品功能 [^78]。在這種環境下，過早擔心“未來也許會有”的規模往往適得其反：最好情況是白費功夫、過早最佳化；最壞情況是把自己鎖進僵化設計，反而阻礙演進。

原因在於，可伸縮性不是一維標籤；“X 可伸縮”或“Y 不可伸縮”這種說法本身意義不大。更有意義的問題是：

* “如果系統按某種方式增長，我們有哪些應對選項？”
* “我們如何增加計算資源來承載額外負載？”
* “按當前增長趨勢，現有架構何時會觸頂？”

當你的產品真的做起來、負載持續上升時，你自然會看到瓶頸在哪裡，也就知道該沿哪些維度擴展。那時再系統性投入可伸縮性技術，通常更合適。

### 描述負載 {#id33}

首先要簡明描述系統當前負載，之後才能討論“增長會怎樣”（例如負載翻倍會發生什麼）。最常見的是吞吐量指標：每秒請求數、每天新增資料量（GB）、每小時購物車結賬次數等。有時你關心的是峰值變數，比如 ["案例研究：社交網路首頁時間線"](#sec_introduction_twitter) 裡的“同時線上使用者數”。

此外還可能有其他統計特徵會影響訪問模式，進而影響可伸縮性要求。例如資料庫讀寫比、快取命中率、每使用者資料項數量（如社交網路裡的追隨者數）。有時平均情況最關鍵，有時瓶頸由少數極端情況主導，具體取決於你的應用細節。

當負載被清楚描述後，就可以分析“負載增加時系統會怎樣”。可從兩個角度看：

* 以某種方式增大負載、但保持資源（CPU、記憶體、網路頻寬等）不變時，效能如何變化？
* 若負載按某種方式增長、但你希望效能不變，需要增加多少資源？

通常目標是：在儘量降低執行成本的同時，讓效能維持在 SLA 要求內（參見 ["響應時間指標的應用"](#sec_introduction_slo_sla)）。所需計算資源越多，成本越高。不同硬體的價效比不同，而且會隨著新硬體出現而變化。

如果資源翻倍後能承載兩倍負載且效能不變，這稱為 *線性可伸縮性*，通常是理想狀態。偶爾，藉助規模效應或峰值負載更均勻分佈，甚至可用不足兩倍資源處理兩倍負載 [^79] [^80]。但更常見的是成本增長快於線性，低效原因也很多。比如資料量增大後，即使請求大小相同，處理一次寫請求也可能比資料量小時更耗資源。

### 共享記憶體、共享磁碟與無共享架構 {#sec_introduction_shared_nothing}

增加服務硬體資源的最簡單方式，是遷移到更強的機器。雖然單核 CPU 不再明顯提速，但你仍可購買（或租用）擁有更多 CPU 核心、更多 RAM、更多磁碟的例項。這叫 *縱向伸縮*（scaling up）。

在單機上，你可以透過多程序/多執行緒獲得並行性。同一程序內執行緒共享同一塊 RAM，因此這也叫 *共享記憶體架構*。問題是它的成本常常“超線性增長”：硬體資源翻倍的高階機器，價格往往遠超兩倍；且受限於瓶頸，效能提升通常又達不到兩倍。

另一種方案是 *共享磁碟架構*：多臺機器各有獨立 CPU 和 RAM，但共享同一組磁碟陣列，透過高速網路連線（NAS 或 SAN）。該架構傳統上用於本地資料倉庫場景，但爭用與鎖開銷限制了其可伸縮性 [^81]。

相比之下，*無共享架構* [^82]（即 *橫向伸縮*、scaling out）已廣泛流行。這種方案使用多節點分散式系統，每個節點擁有自己的 CPU、RAM 和磁碟；節點間協作透過常規網路在軟體層完成。

無共享的優勢在於：具備線性伸縮潛力、可靈活選用高性價比硬體（尤其在雲上）、更容易隨負載增減調整資源，並可透過跨多個資料中心/地域部署提升容錯。代價是：需要顯式分片（見 [第 7 章](/tw/ch7)），並承擔分散式系統的全部複雜性（見 [第 9 章](/tw/ch9)）。

一些雲原生資料庫把“儲存”和“事務執行”拆成獨立服務（參見 ["儲存與計算分離"](/tw/ch1#sec_introduction_storage_compute)），由多個計算節點共享同一儲存服務。這種模式與共享磁碟有相似性，但規避了老系統的可伸縮瓶頸：它不暴露 NAS/SAN 那種檔案系統或塊裝置抽象，而是提供面向資料庫場景定製的儲存 API [^83]。

### 可伸縮性原則 {#id35}

能夠大規模執行的系統架構，通常高度依賴具體應用，不存在通用“一招鮮”的可伸縮架構（俗稱 *萬金油*）。例如：面向“每秒 10 萬次請求、每次 1 kB”的系統，與面向“每分鐘 3 次請求、每次 2 GB”的系統，形態會完全不同，儘管二者資料吞吐量都約為 100 MB/s。

此外，適合某一級負載的架構，通常難以直接承受 10 倍負載。若你在做高速增長服務，幾乎每跨一個數量級都要重新審視架構。考慮到業務需求本身也會變化，提前規劃超過一個數量級的未來伸縮需求，往往不划算。

可伸縮性的一個通用原則，是把系統拆分成儘量可獨立執行的小元件。這也是微服務（參見 ["微服務與無伺服器"](/tw/ch1#sec_introduction_microservices)）、分片（[第 7 章](/tw/ch7)）、流處理（[第 12 章](/tw/ch12#ch_stream)）和無共享架構的共同基礎。難點在於：哪裡該拆，哪裡該合。微服務設計可參考其他書籍 [^84]；無共享系統的分片問題我們會在 [第 7 章](/tw/ch7) 討論。

另一個好原則是：不要把系統做得比必要更複雜。若單機資料庫足夠，就往往優於複雜分散式方案。自動伸縮（按需求自動加減資源）很吸引人，但若負載相對可預測，手動伸縮可能帶來更少運維意外（參見 ["操作：自動或手動再平衡"](/tw/ch7#sec_sharding_operations)）。5 個服務的系統通常比 50 個服務更簡單。好架構往往是多種方案的務實組合。

## 可維護性 {#sec_introduction_maintainability}

軟體不會像機械裝置那樣磨損或材料疲勞，但應用需求會變化，軟體所處環境（依賴項、底層平臺）也會變化，程式碼中還會持續暴露需要修復的缺陷。

業界普遍認同：軟體成本的大頭不在初始開發，而在後續維護，包括修 bug、保障系統穩定執行、排查故障、適配新平臺、支援新場景、償還技術債，以及持續交付新功能 [^85] [^86]。

然而維護並不容易。一個長期執行成功的系統，可能仍依賴今天少有人熟悉的舊技術（如大型機和 COBOL）；隨著人員流動，系統為何如此設計的組織記憶也可能丟失；維護者往往還要修復前人留下的問題。更重要的是，計算機系統通常與其支撐的組織流程深度耦合，這使得 *遺留* 系統維護既是技術問題，也是人員與組織問題 [^87]。

如果今天構建的系統足夠有價值並長期存活，它終有一天會變成遺留系統。為減少後繼維護者的痛苦，我們應在設計階段就考慮維護性。雖然難以準確預判哪些決策會在未來埋雷，但本書會強調幾條廣泛適用的原則：

可運維性（Operability）
: 讓組織能夠更容易地保持系統平穩執行。

簡單性（Simplicity）
: 採用易理解且一致的模式與結構，避免不必要複雜性，讓新工程師也能快速理解系統。

可演化性（Evolvability）
: 讓工程師在未來能更容易修改系統，使其隨著需求變化而持續適配並擴展到未預料場景。

### 可運維性：讓運維更輕鬆 {#id37}

我們在 ["雲時代的運維"](/tw/ch1#sec_introduction_operations) 已討論過運維角色：可靠執行不僅依賴工具，人類流程同樣關鍵。甚至有人指出：“好的運維常能繞過糟糕（或不完整）軟體的侷限；但再好的軟體，碰上糟糕運維也難以可靠執行” [^60]。

在由成千上萬臺機器組成的大規模系統中，純手工維護成本不可接受，自動化必不可少。但自動化也是雙刃劍：總會有邊緣場景（如罕見故障）需要運維團隊人工介入。並且“自動化處理不了”的往往恰恰最複雜，因此自動化越深，越需要 **更** 高水平的運維團隊來兜底 [^88]。

另外，一旦自動化系統本身出錯，往往比“部分依賴人工操作”的系統更難排查。因此自動化並非越多越好。合理自動化程度取決於你所在應用與組織的具體條件。

良好的可運維性意味著把日常任務做簡單，讓運維團隊把精力投入到高價值工作。資料系統可以透過多種方式達成這一點 [^89]：

* 讓監控工具能獲取關鍵指標，並支援可觀測性工具（參見 ["分散式系統的問題"](/tw/ch1#sec_introduction_dist_sys_problems)）以洞察執行時行為。相關商業/開源工具都很多 [^90]。
* 避免依賴單機（系統整體不停機的前提下允許下線機器維護）。
* 提供完善文件和易理解的操作模型（“我做 X，會發生 Y”）。
* 提供良好預設值，同時允許管理員在需要時覆蓋預設行為。
* 適當支援自愈，同時在必要時保留管理員對系統狀態的手動控制權。
* 行為可預測，儘量減少“驚喜”。

### 簡單性：管理複雜度 {#id38}

小型專案往往能保持簡潔、優雅、富有表達力；但專案變大後，程式碼常會迅速變複雜且難理解。這種複雜性會拖慢所有參與者效率，進一步抬高維護成本。陷入這種狀態的軟體專案常被稱為 *大泥團* [^91]。

當複雜性讓維護變難時，預算和進度常常失控。在複雜軟體裡，變更時引入缺陷的風險也更高：系統越難理解和推理，隱藏假設、非預期後果和意外互動就越容易被忽略 [^69]。反過來，降低複雜性能顯著提升可維護性，因此“追求簡單”應是系統設計核心目標之一。

簡單系統更容易理解，因此我們應儘可能用最簡單方式解決問題。但“簡單”知易行難。什麼叫簡單，往往帶有主觀判斷，因為不存在絕對客觀的簡單性標準 [^92]。例如，一個系統可能“介面簡單但實現複雜”，另一個可能“實現簡單但暴露更多內部細節”，到底誰更簡單，並不總有標準答案。

一種常見分析方法是把複雜性分成兩類：**本質複雜性** 與 **偶然複雜性** [^93]。前者源於業務問題本身，後者源於工具與實現限制。但這種劃分也並不完美，因為隨著工具演進，“本質”和“偶然”的邊界會移動 [^94]。

管理複雜度最重要的工具之一是 **抽象**。好的抽象能在清晰外觀後隱藏大量實現細節，也能被多種場景複用。這種複用不僅比反覆重寫更高效，也能提升質量，因為抽象元件一旦改進，所有依賴它的應用都會受益。

例如，高階語言是對機器碼、CPU 暫存器和系統呼叫的抽象。SQL 則抽象了磁碟/記憶體中的複雜資料結構、來自其他客戶端的併發請求，以及崩潰後的不一致狀態。用高階語言程式設計時，我們仍然在“使用機器碼”，但不再 *直接* 面對它，因為語言抽象替我們遮蔽了細節。

應用程式碼層面的抽象，常藉助 *設計模式* [^95]、*領域驅動設計*（DDD）[^96] 等方法來構建。本書重點不在這類應用專用抽象，而在你可以拿來構建應用的通用抽象，例如資料庫事務、索引、事件日誌等。若你想採用 DDD 等方法，也可以建立在本書介紹的基礎能力之上。

### 可演化性：讓變化更容易 {#sec_introduction_evolvability}

系統需求永遠不變的機率極低。更常見的是持續變化：你會發現新事實，出現此前未預期用例，業務優先順序會調整，使用者會提出新功能，新平臺會替換舊平臺，法律與監管會變化，系統增長也會倒逼架構調整。

在組織層面，*敏捷* 方法為適應變化提供了框架；敏捷社群也發展出多種適用於高變化環境的技術與流程，如測試驅動開發（TDD）和重構。本書關注的是：如何在“由多個不同應用/服務組成的系統層級”提升這種敏捷能力。

資料系統對變化的適應難易度，與其簡單性和抽象質量高度相關：鬆耦合、簡單系統通常比緊耦合、複雜系統更容易修改。由於這一點極其重要，我們把“資料系統層面的敏捷性”單獨稱為 *可演化性* [^97]。

大型系統中讓變更困難的一個關鍵因素，是某些操作不可逆，因此執行時必須極其謹慎 [^98]。例如從一個資料庫遷移到另一個：若新庫出問題後無法回切，風險就遠高於可隨時回退。儘量減少不可逆操作，能顯著提升系統靈活性。

## 總結 {#summary}

本章討論了幾類核心非功能性需求：效能、可靠性、可伸縮性與可維護性。圍繞這些主題，我們也建立了貫穿全書的一組概念與術語。章節從“社交網路首頁時間線”案例切入，直觀展示了系統在規模增長時會遇到的現實挑戰。

我們討論了如何衡量效能（例如響應時間百分位點）、如何描述系統負載（例如吞吐量指標），以及這些指標如何進入 SLA。與之緊密相關的是可伸縮性：當負載增長時，如何保持效能不退化。我們也給出了若干通用原則，例如將任務拆解為可獨立執行的小元件。後續章節會深入展開相關技術細節。

為實現可靠性，可以使用容錯機制，使系統在部分元件（如磁碟、機器或外部服務）故障時仍能持續提供服務。我們區分了硬體故障與軟體故障，並指出軟體故障常更難處理，因為它們往往高度相關。可靠性的另一面是“對人為失誤的韌性”，其中 *無責備事後分析* 是重要學習機制。

最後，我們討論了可維護性的多個維度：支援運維工作、管理複雜度、提升系統可演化性。實現這些目標沒有銀彈，但一個普遍有效的做法是：用清晰、可理解、具備良好抽象的構件來搭建系統。接下來全書會介紹一系列在實踐中證明有效的構件。

### 參考文獻

[^1]: Mike Cvet. [How We Learned to Stop Worrying and Love Fan-In at Twitter](https://www.youtube.com/watch?v=WEgCjwyXvwc). At *QCon San Francisco*, December 2016.
[^2]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK)
[^3]: Twitter. [Twitter's Recommendation Algorithm](https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm). *blog.twitter.com*, March 2023. Archived at [perma.cc/L5GT-229T](https://perma.cc/L5GT-229T)
[^4]: Raffi Krikorian. [New Tweets per second record, and how!](https://blog.twitter.com/engineering/en_us/a/2013/new-tweets-per-second-record-and-how) *blog.twitter.com*, August 2013. Archived at [perma.cc/6JZN-XJYN](https://perma.cc/6JZN-XJYN)
[^5]: Jaz Volpert. [When Imperfect Systems are Good, Actually: Bluesky's Lossy Timelines](https://jazco.dev/2025/02/19/imperfection/). *jazco.dev*, February 2025. Archived at [perma.cc/2PVE-L2MX](https://perma.cc/2PVE-L2MX)
[^6]: Samuel Axon. [3% of Twitter's Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX)
[^7]: Nathan Bronson, Abutalib Aghayev, Aleksey Charapko, and Timothy Zhu. [Metastable Failures in Distributed Systems](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s11-bronson.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), May 2021. [doi:10.1145/3458336.3465286](https://doi.org/10.1145/3458336.3465286)
[^8]: Marc Brooker. [Metastability and Distributed Systems](https://brooker.co.za/blog/2021/05/24/metastable.html). *brooker.co.za*, May 2021. Archived at [perma.cc/7FGJ-7XRK](https://perma.cc/7FGJ-7XRK)
[^9]: Marc Brooker. [Exponential Backoff And Jitter](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/). *aws.amazon.com*, March 2015. Archived at [perma.cc/R6MS-AZKH](https://perma.cc/R6MS-AZKH)
[^10]: Marc Brooker. [What is Backoff For?](https://brooker.co.za/blog/2022/08/11/backoff.html) *brooker.co.za*, August 2022. Archived at [perma.cc/PW9N-55Q5](https://perma.cc/PW9N-55Q5)
[^11]: Michael T. Nygard. [*Release It!*](https://learning.oreilly.com/library/view/release-it-2nd/9781680504552/), 2nd Edition. Pragmatic Bookshelf, January 2018. ISBN: 9781680502398
[^12]: Frank Chen. [Slowing Down to Speed Up – Circuit Breakers for Slack's CI/CD](https://slack.engineering/circuit-breakers/). *slack.engineering*, August 2022. Archived at [perma.cc/5FGS-ZPH3](https://perma.cc/5FGS-ZPH3)
[^13]: Marc Brooker. [Fixing retries with token buckets and circuit breakers](https://brooker.co.za/blog/2022/02/28/retries.html). *brooker.co.za*, February 2022. Archived at [perma.cc/MD6N-GW26](https://perma.cc/MD6N-GW26)
[^14]: David Yanacek. [Using load shedding to avoid overload](https://aws.amazon.com/builders-library/using-load-shedding-to-avoid-overload/). Amazon Builders' Library, *aws.amazon.com*. Archived at [perma.cc/9SAW-68MP](https://perma.cc/9SAW-68MP)
[^15]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY)
[^16]: Dmitry Kopytkov and Patrick Lee. [Meet Bandaid, the Dropbox service proxy](https://dropbox.tech/infrastructure/meet-bandaid-the-dropbox-service-proxy). *dropbox.tech*, March 2018. Archived at [perma.cc/KUU6-YG4S](https://perma.cc/KUU6-YG4S)
[^17]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies*, February 2018.
[^18]: Marc Brooker. [Is the Mean Really Useless?](https://brooker.co.za/blog/2017/12/28/mean.html) *brooker.co.za*, December 2017. Archived at [perma.cc/U5AE-CVEM](https://perma.cc/U5AE-CVEM)
[^19]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon's Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1294261.1294281](https://doi.org/10.1145/1294261.1294281)
[^20]: Kathryn Whitenton. [The Need for Speed, 23 Years Later](https://www.nngroup.com/articles/the-need-for-speed/). *nngroup.com*, May 2020. Archived at [perma.cc/C4ER-LZYA](https://perma.cc/C4ER-LZYA)
[^21]: Greg Linden. [Marissa Mayer at Web 2.0](https://glinden.blogspot.com/2006/11/marissa-mayer-at-web-20.html). *glinden.blogspot.com*, November 2006. Archived at [perma.cc/V7EA-3VXB](https://perma.cc/V7EA-3VXB)
[^22]: Jake Brutlag. [Speed Matters for Google Web Search](https://services.google.com/fh/files/blogs/google_delayexp.pdf). *services.google.com*, June 2009. Archived at [perma.cc/BK7R-X7M2](https://perma.cc/BK7R-X7M2)
[^23]: Eric Schurman and Jake Brutlag. [Performance Related Changes and their User Impact](https://www.youtube.com/watch?v=bQSE51-gr2s). Talk at *Velocity 2009*.
[^24]: Akamai Technologies, Inc. [The State of Online Retail Performance](https://web.archive.org/web/20210729180749/https%3A//www.akamai.com/us/en/multimedia/documents/report/akamai-state-of-online-retail-performance-spring-2017.pdf). *akamai.com*, April 2017. Archived at [perma.cc/UEK2-HYCS](https://perma.cc/UEK2-HYCS)
[^25]: Xiao Bai, Ioannis Arapakis, B. Barla Cambazoglu, and Ana Freire. [Understanding and Leveraging the Impact of Response Latency on User Behaviour in Web Search](https://iarapakis.github.io/papers/TOIS17.pdf). *ACM Transactions on Information Systems*, volume 36, issue 2, article 21, April 2018. [doi:10.1145/3106372](https://doi.org/10.1145/3106372)
[^26]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794)
[^27]: Alex Hidalgo. [*Implementing Service Level Objectives: A Practical Guide to SLIs, SLOs, and Error Budgets*](https://www.oreilly.com/library/view/implementing-service-level/9781492076803/). O'Reilly Media, September 2020. ISBN: 1492076813
[^28]: Jeffrey C. Mogul and John Wilkes. [Nines are Not Enough: Meaningful Metrics for Clouds](https://research.google/pubs/pub48033/). At *17th Workshop on Hot Topics in Operating Systems* (HotOS), May 2019. [doi:10.1145/3317550.3321432](https://doi.org/10.1145/3317550.3321432)
[^29]: Tamás Hauer, Philipp Hoffmann, John Lunney, Dan Ardelean, and Amer Diwan. [Meaningful Availability](https://www.usenix.org/conference/nsdi20/presentation/hauer). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020.
[^30]: Ted Dunning. [The t-digest: Efficient estimates of distributions](https://www.sciencedirect.com/science/article/pii/S2665963820300403). *Software Impacts*, volume 7, article 100049, February 2021. [doi:10.1016/j.simpa.2020.100049](https://doi.org/10.1016/j.simpa.2020.100049)
[^31]: David Kohn. [How percentile approximation works (and why it's more useful than averages)](https://www.timescale.com/blog/how-percentile-approximation-works-and-why-its-more-useful-than-averages/). *timescale.com*, September 2021. Archived at [perma.cc/3PDP-NR8B](https://perma.cc/3PDP-NR8B)
[^32]: Heinrich Hartmann and Theo Schlossnagle. [Circllhist — A Log-Linear Histogram Data Structure for IT Infrastructure Monitoring](https://arxiv.org/pdf/2001.06561.pdf). *arxiv.org*, January 2020.
[^33]: Charles Masson, Jee E. Rim, and Homin K. Lee. [DDSketch: A Fast and Fully-Mergeable Quantile Sketch with Relative-Error Guarantees](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf). *Proceedings of the VLDB Endowment*, volume 12, issue 12, pages 2195–2205, August 2019. [doi:10.14778/3352063.3352135](https://doi.org/10.14778/3352063.3352135)
[^34]: Baron Schwartz. [Why Percentiles Don't Work the Way You Think](https://orangematter.solarwinds.com/2016/11/18/why-percentiles-dont-work-the-way-you-think/). *solarwinds.com*, November 2016. Archived at [perma.cc/469T-6UGB](https://perma.cc/469T-6UGB)
[^35]: Walter L. Heimerdinger and Charles B. Weinstock. [A Conceptual Framework for System Fault Tolerance](https://resources.sei.cmu.edu/asset_files/TechnicalReport/1992_005_001_16112.pdf). Technical Report CMU/SEI-92-TR-033, Software Engineering Institute, Carnegie Mellon University, October 1992. Archived at [perma.cc/GD2V-DMJW](https://perma.cc/GD2V-DMJW)
[^36]: Felix C. Gärtner. [Fundamentals of fault-tolerant distributed computing in asynchronous environments](https://dl.acm.org/doi/pdf/10.1145/311531.311532). *ACM Computing Surveys*, volume 31, issue 1, pages 1–26, March 1999. [doi:10.1145/311531.311532](https://doi.org/10.1145/311531.311532)
[^37]: Algirdas Avižienis, Jean-Claude Laprie, Brian Randell, and Carl Landwehr. [Basic Concepts and Taxonomy of Dependable and Secure Computing](https://hdl.handle.net/1903/6459). *IEEE Transactions on Dependable and Secure Computing*, volume 1, issue 1, January 2004. [doi:10.1109/TDSC.2004.2](https://doi.org/10.1109/TDSC.2004.2)
[^38]: Ding Yuan, Yu Luo, Xin Zhuang, Guilherme Renna Rodrigues, Xu Zhao, Yongle Zhang, Pranay U. Jain, and Michael Stumm. [Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-yuan.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
[^39]: Casey Rosenthal and Nora Jones. [*Chaos Engineering*](https://learning.oreilly.com/library/view/chaos-engineering/9781492043850/). O'Reilly Media, April 2020. ISBN: 9781492043867
[^40]: Eduardo Pinheiro, Wolf-Dietrich Weber, and Luiz Andre Barroso. [Failure Trends in a Large Disk Drive Population](https://www.usenix.org/legacy/events/fast07/tech/full_papers/pinheiro/pinheiro_old.pdf). At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007.
[^41]: Bianca Schroeder and Garth A. Gibson. [Disk failures in the real world: What does an MTTF of 1,000,000 hours mean to you?](https://www.usenix.org/legacy/events/fast07/tech/schroeder/schroeder.pdf) At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007.
[^42]: Andy Klein. [Backblaze Drive Stats for Q2 2021](https://www.backblaze.com/blog/backblaze-drive-stats-for-q2-2021/). *backblaze.com*, August 2021. Archived at [perma.cc/2943-UD5E](https://perma.cc/2943-UD5E)
[^43]: Iyswarya Narayanan, Di Wang, Myeongjae Jeon, Bikash Sharma, Laura Caulfield, Anand Sivasubramaniam, Ben Cutler, Jie Liu, Badriddine Khessib, and Kushagra Vaid. [SSD Failures in Datacenters: What? When? and Why?](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/08/a7-narayanan.pdf) At *9th ACM International on Systems and Storage Conference* (SYSTOR), June 2016. [doi:10.1145/2928275.2928278](https://doi.org/10.1145/2928275.2928278)
[^44]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (1)](https://www.alibabacloud.com/blog/594375). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230522005034/https%3A//www.alibabacloud.com/blog/594375)
[^45]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
[^46]: Jacob Alter, Ji Xue, Alma Dimnaku, and Evgenia Smirni. [SSD failures in the field: symptoms, causes, and prediction models](https://dl.acm.org/doi/pdf/10.1145/3295500.3356172). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2019. [doi:10.1145/3295500.3356172](https://doi.org/10.1145/3295500.3356172)
[^47]: Daniel Ford, François Labelle, Florentina I. Popovici, Murray Stokely, Van-Anh Truong, Luiz Barroso, Carrie Grimes, and Sean Quinlan. [Availability in Globally Distributed Storage Systems](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Ford.pdf). At *9th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2010.
[^48]: Kashi Venkatesh Vishwanath and Nachiappan Nagappan. [Characterizing Cloud Computing Hardware Reliability](https://www.microsoft.com/en-us/research/wp-content/uploads/2010/06/socc088-vishwanath.pdf). At *1st ACM Symposium on Cloud Computing* (SoCC), June 2010. [doi:10.1145/1807128.1807161](https://doi.org/10.1145/1807128.1807161)
[^49]: Peter H. Hochschild, Paul Turner, Jeffrey C. Mogul, Rama Govindaraju, Parthasarathy Ranganathan, David E. Culler, and Amin Vahdat. [Cores that don't count](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s01-hochschild.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), June 2021. [doi:10.1145/3458336.3465297](https://doi.org/10.1145/3458336.3465297)
[^50]: Harish Dattatraya Dixit, Sneha Pendharkar, Matt Beadon, Chris Mason, Tejasvi Chakravarthy, Bharath Muthiah, and Sriram Sankar. [Silent Data Corruptions at Scale](https://arxiv.org/abs/2102.11245). *arXiv:2102.11245*, February 2021.
[^51]: Diogo Behrens, Marco Serafini, Sergei Arnautov, Flavio P. Junqueira, and Christof Fetzer. [Scalable Error Isolation for Distributed Systems](https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/behrens). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
[^52]: Bianca Schroeder, Eduardo Pinheiro, and Wolf-Dietrich Weber. [DRAM Errors in the Wild: A Large-Scale Field Study](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35162.pdf). At *11th International Joint Conference on Measurement and Modeling of Computer Systems* (SIGMETRICS), June 2009. [doi:10.1145/1555349.1555372](https://doi.org/10.1145/1555349.1555372)
[^53]: Yoongu Kim, Ross Daly, Jeremie Kim, Chris Fallin, Ji Hye Lee, Donghyuk Lee, Chris Wilkerson, Konrad Lai, and Onur Mutlu. [Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf). At *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.5555/2665671.2665726](https://doi.org/10.5555/2665671.2665726)
[^54]: Tim Bray. [Worst Case](https://www.tbray.org/ongoing/When/202x/2021/10/08/The-WOrst-Case). *tbray.org*, October 2021. Archived at [perma.cc/4QQM-RTHN](https://perma.cc/4QQM-RTHN)
[^55]: Sangeetha Abdu Jyothi. [Solar Superstorms: Planning for an Internet Apocalypse](https://ics.uci.edu/~sabdujyo/papers/sigcomm21-cme.pdf). At *ACM SIGCOMM Conference*, August 2021. [doi:10.1145/3452296.3472916](https://doi.org/10.1145/3452296.3472916)
[^56]: Adrian Cockcroft. [Failure Modes and Continuous Resilience](https://adrianco.medium.com/failure-modes-and-continuous-resilience-6553078caad5). *adrianco.medium.com*, November 2019. Archived at [perma.cc/7SYS-BVJP](https://perma.cc/7SYS-BVJP)
[^57]: Shujie Han, Patrick P. C. Lee, Fan Xu, Yi Liu, Cheng He, and Jiongzhou Liu. [An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers](https://www.usenix.org/conference/fast21/presentation/han). At *19th USENIX Conference on File and Storage Technologies* (FAST), February 2021.
[^58]: Edmund B. Nightingale, John R. Douceur, and Vince Orgovan. [Cycles, Cells and Platters: An Empirical Analysis of Hardware Failures on a Million Consumer PCs](https://eurosys2011.cs.uni-salzburg.at/pdf/eurosys2011-nightingale.pdf). At *6th European Conference on Computer Systems* (EuroSys), April 2011. [doi:10.1145/1966445.1966477](https://doi.org/10.1145/1966445.1966477)
[^59]: Haryadi S. Gunawi, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, Thanh Do, Jeffry Adityatama, Kurnia J. Eliazar, Agung Laksono, Jeffrey F. Lukman, Vincentius Martin, and Anang D. Satria. [What Bugs Live in the Cloud?](https://ucare.cs.uchicago.edu/pdf/socc14-cbs.pdf) At *5th ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670986](https://doi.org/10.1145/2670979.2670986)
[^60]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW)
[^61]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU)
[^62]: Hewlett Packard Enterprise. [Support Alerts – Customer Bulletin a00092491en\_us](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/S5F6-7ZAC](https://perma.cc/S5F6-7ZAC)
[^63]: Lorin Hochstein. [awesome limits](https://github.com/lorin/awesome-limits). *github.com*, November 2020. Archived at [perma.cc/3R5M-E5Q4](https://perma.cc/3R5M-E5Q4)
[^64]: Caitie McCaffrey. [Clients Are Jerks: AKA How Halo 4 DoSed the Services at Launch & How We Survived](https://www.caitiem.com/2015/06/23/clients-are-jerks-aka-how-halo-4-dosed-the-services-at-launch-how-we-survived/). *caitiem.com*, June 2015. Archived at [perma.cc/MXX4-W373](https://perma.cc/MXX4-W373)
[^65]: Lilia Tang, Chaitanya Bhandari, Yongle Zhang, Anna Karanika, Shuyang Ji, Indranil Gupta, and Tianyin Xu. [Fail through the Cracks: Cross-System Interaction Failures in Modern Cloud Systems](https://tianyin.github.io/pub/csi-failures.pdf). At *18th European Conference on Computer Systems* (EuroSys), May 2023. [doi:10.1145/3552326.3587448](https://doi.org/10.1145/3552326.3587448)
[^66]: Mike Ulrich. [Addressing Cascading Failures](https://sre.google/sre-book/addressing-cascading-failures/). In Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy (ed). [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O'Reilly Media, 2016. ISBN: 9781491929124
[^67]: Harri Faßbender. [Cascading failures in large-scale distributed systems](https://blog.mi.hdm-stuttgart.de/index.php/2022/03/03/cascading-failures-in-large-scale-distributed-systems/). *blog.mi.hdm-stuttgart.de*, March 2022. Archived at [perma.cc/K7VY-YJRX](https://perma.cc/K7VY-YJRX)
[^68]: Richard I. Cook. [How Complex Systems Fail](https://www.adaptivecapacitylabs.com/HowComplexSystemsFail.pdf). Cognitive Technologies Laboratory, April 2000. Archived at [perma.cc/RDS6-2YVA](https://perma.cc/RDS6-2YVA)
[^69]: David D. Woods. [STELLA: Report from the SNAFUcatchers Workshop on Coping With Complexity](https://snafucatchers.github.io/). *snafucatchers.github.io*, March 2017. Archived at [archive.org](https://web.archive.org/web/20230306130131/https%3A//snafucatchers.github.io/)
[^70]: David Oppenheimer, Archana Ganapathi, and David A. Patterson. [Why Do Internet Services Fail, and What Can Be Done About It?](https://static.usenix.org/events/usits03/tech/full_papers/oppenheimer/oppenheimer.pdf) At *4th USENIX Symposium on Internet Technologies and Systems* (USITS), March 2003.
[^71]: Sidney Dekker. [*The Field Guide to Understanding 'Human Error', 3rd Edition*](https://learning.oreilly.com/library/view/the-field-guide/9781317031833/). CRC Press, November 2017. ISBN: 9781472439055
[^72]: Sidney Dekker. [*Drift into Failure: From Hunting Broken Components to Understanding Complex Systems*](https://www.taylorfrancis.com/books/mono/10.1201/9781315257396/drift-failure-sidney-dekker). CRC Press, 2011. ISBN: 9781315257396
[^73]: John Allspaw. [Blameless PostMortems and a Just Culture](https://www.etsy.com/codeascraft/blameless-postmortems/). *etsy.com*, May 2012. Archived at [perma.cc/YMJ7-NTAP](https://perma.cc/YMJ7-NTAP)
[^74]: Itzy Sabo. [Uptime Guarantees — A Pragmatic Perspective](https://world.hey.com/itzy/uptime-guarantees-a-pragmatic-perspective-736d7ea4). *world.hey.com*, March 2023. Archived at [perma.cc/F7TU-78JB](https://perma.cc/F7TU-78JB)
[^75]: Michael Jurewitz. [The Human Impact of Bugs](http://jury.me/blog/2013/3/14/the-human-impact-of-bugs). *jury.me*, March 2013. Archived at [perma.cc/5KQ4-VDYL](https://perma.cc/5KQ4-VDYL)
[^76]: Mark Halper. [How Software Bugs led to 'One of the Greatest Miscarriages of Justice' in British History](https://cacm.acm.org/news/how-software-bugs-led-to-one-of-the-greatest-miscarriages-of-justice-in-british-history/). *Communications of the ACM*, January 2025. [doi:10.1145/3703779](https://doi.org/10.1145/3703779)
[^77]: Nicholas Bohm, James Christie, Peter Bernard Ladkin, Bev Littlewood, Paul Marshall, Stephen Mason, Martin Newby, Steven J. Murdoch, Harold Thimbleby, and Martyn Thomas. [The legal rule that computers are presumed to be operating correctly – unforeseen and unjust consequences](https://www.benthamsgaze.org/wp-content/uploads/2022/06/briefing-presumption-that-computers-are-reliable.pdf). Briefing note, *benthamsgaze.org*, June 2022. Archived at [perma.cc/WQ6X-TMW4](https://perma.cc/WQ6X-TMW4)
[^78]: Dan McKinley. [Choose Boring Technology](https://mcfunley.com/choose-boring-technology). *mcfunley.com*, March 2015. Archived at [perma.cc/7QW7-J4YP](https://perma.cc/7QW7-J4YP)
[^79]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V)
[^80]: Marc Brooker. [Surprising Scalability of Multitenancy](https://brooker.co.za/blog/2023/03/23/economics.html). *brooker.co.za*, March 2023. Archived at [perma.cc/ZZD9-VV8T](https://perma.cc/ZZD9-VV8T)
[^81]: Ben Stopford. [Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/). *benstopford.com*, November 2009. Archived at [perma.cc/7BXH-EDUR](https://perma.cc/7BXH-EDUR)
[^82]: Michael Stonebraker. [The Case for Shared Nothing](https://dsf.berkeley.edu/papers/hpts85-nothing.pdf). *IEEE Database Engineering Bulletin*, volume 9, issue 1, pages 4–9, March 1986.
[^83]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047)
[^84]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O'Reilly Media, 2021. ISBN: 9781492034025
[^85]: Nathan Ensmenger. [When Good Software Goes Bad: The Surprising Durability of an Ephemeral Technology](https://themaintainers.wpengine.com/wp-content/uploads/2021/04/ensmenger-maintainers-v2.pdf). At *The Maintainers Conference*, April 2016. Archived at [perma.cc/ZXT4-HGZB](https://perma.cc/ZXT4-HGZB)
[^86]: Robert L. Glass. [*Facts and Fallacies of Software Engineering*](https://learning.oreilly.com/library/view/facts-and-fallacies/0321117425/). Addison-Wesley Professional, October 2002. ISBN: 9780321117427
[^87]: Marianne Bellotti. [*Kill It with Fire*](https://learning.oreilly.com/library/view/kill-it-with/9781098128883/). No Starch Press, April 2021. ISBN: 9781718501188
[^88]: Lisanne Bainbridge. [Ironies of automation](https://www.adaptivecapacitylabs.com/IroniesOfAutomation-Bainbridge83.pdf). *Automatica*, volume 19, issue 6, pages 775–779, November 1983. [doi:10.1016/0005-1098(83)90046-8](https://doi.org/10.1016/0005-1098%2883%2990046-8)
[^89]: James Hamilton. [On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton.pdf). At *21st Large Installation System Administration Conference* (LISA), November 2007.
[^90]: Dotan Horovits. [Open Source for Better Observability](https://horovits.medium.com/open-source-for-better-observability-8c65b5630561). *horovits.medium.com*, October 2021. Archived at [perma.cc/R2HD-U2ZT](https://perma.cc/R2HD-U2ZT)
[^91]: Brian Foote and Joseph Yoder. [Big Ball of Mud](http://www.laputan.org/pub/foote/mud.pdf). At *4th Conference on Pattern Languages of Programs* (PLoP), September 1997. Archived at [perma.cc/4GUP-2PBV](https://perma.cc/4GUP-2PBV)
[^92]: Marc Brooker. [What is a simple system?](https://brooker.co.za/blog/2022/05/03/simplicity.html) *brooker.co.za*, May 2022. Archived at [perma.cc/U72T-BFVE](https://perma.cc/U72T-BFVE)
[^93]: Frederick P. Brooks. [No Silver Bullet – Essence and Accident in Software Engineering](https://worrydream.com/refs/Brooks_1986_-_No_Silver_Bullet.pdf). In [*The Mythical Man-Month*](https://www.oreilly.com/library/view/mythical-man-month-the/0201835959/), Anniversary edition, Addison-Wesley, 1995. ISBN: 9780201835953
[^94]: Dan Luu. [Against essential and accidental complexity](https://danluu.com/essential-complexity/). *danluu.com*, December 2020. Archived at [perma.cc/H5ES-69KC](https://perma.cc/H5ES-69KC)
[^95]: Erich Gamma, Richard Helm, Ralph Johnson, and John Vlissides. [*Design Patterns: Elements of Reusable Object-Oriented Software*](https://learning.oreilly.com/library/view/design-patterns-elements/0201633612/). Addison-Wesley Professional, October 1994. ISBN: 9780201633610
[^96]: Eric Evans. [*Domain-Driven Design: Tackling Complexity in the Heart of Software*](https://learning.oreilly.com/library/view/domain-driven-design-tackling/0321125215/). Addison-Wesley Professional, August 2003. ISBN: 9780321125217
[^97]: Hongyu Pei Breivold, Ivica Crnkovic, and Peter J. Eriksson. [Analyzing Software Evolvability](https://www.es.mdh.se/pdf_publications/1251.pdf). At *32nd Annual IEEE International Computer Software and Applications Conference* (COMPSAC), July 2008. [doi:10.1109/COMPSAC.2008.50](https://doi.org/10.1109/COMPSAC.2008.50)
[^98]: Enrico Zaninotto. [From X programming to the X organisation](https://martinfowler.com/articles/zaninotto.pdf). At *XP Conference*, May 2002. Archived at [perma.cc/R9AR-QCKZ](https://perma.cc/R9AR-QCKZ)

================================================
FILE: content/tw/ch3.md
================================================
---
title: "3. 資料模型與查詢語言"
weight: 103
breadcrumbs: false
---

<a id="ch_datamodels"></a>

![](/map/ch02.png)

> *語言的邊界就是世界的邊界。*
>
> 路德維希・維特根斯坦，《邏輯哲學論》（1922）

資料模型或許是開發軟體最重要的部分，因為它們有著深遠的影響：不僅影響軟體的編寫方式，還影響我們 **思考問題** 的方式。

大多數應用程式都是透過層層疊加的資料模型來構建的。每一層的關鍵問題是：如何用更低層次的資料模型來 **表示** 它？例如：

1. 作為應用程式開發者，你觀察現實世界（其中有人員、組織、貨物、行為、資金流動、感測器等），並用物件或資料結構，以及操作這些資料結構的 API 來建模。這些結構通常是特定於應用程式的。
2. 當你想要儲存這些資料結構時，你用通用的資料模型來表達它們，例如 JSON 或 XML 文件、關係資料庫中的表，或者圖中的頂點和邊。這些資料模型是本章的主題。
3. 構建你的資料庫軟體的工程師決定了如何用記憶體、磁碟或網路上的位元組來表示文件/關係/圖資料。這種表示可能允許以各種方式查詢、搜尋、操作和處理資料。我們將在 [第 4 章](/tw/ch4#ch_storage) 中討論這些儲存引擎的設計。
4. 在更低的層次上，硬體工程師已經想出了如何用電流、光脈衝、磁場等來表示位元組的方法。

在複雜的應用程式中可能有更多的中間層，例如基於 API 之上的 API，但基本思想仍然相同：每一層透過提供一個簡潔的資料模型來隱藏下層的複雜性。這些抽象允許不同的人群 —— 例如，資料庫供應商的工程師和使用他們資料庫的應用程式開發者 —— 有效地合作。

在實踐中廣泛使用著幾種不同的資料模型，通常用於不同的目的。某些型別的資料和某些查詢在一種模型中很容易表達，而在另一種模型中則很困難。在本章中，我們將透過比較關係模型、文件模型、基於圖的資料模型、事件溯源和資料框來探討這些權衡。我們還將簡要介紹允許你使用這些模型的查詢語言。這種比較將幫助你決定何時使用哪種模型。

--------

> [!TIP] 術語：宣告式查詢語言
>
> 本章中的許多查詢語言（如 SQL、Cypher、SPARQL 或 Datalog）都是 **宣告式** 的，這意味著你指定所需資料的模式 ——
> 結果必須滿足什麼條件，以及你希望如何轉換資料（例如，排序、分組和聚合）—— 但不指定 **如何** 實現該目標。
> 資料庫系統的查詢最佳化器可以決定使用哪些索引和哪些連線演算法，以及以什麼順序執行查詢的各個部分。
>
> 相比之下，使用大多數程式語言，你必須編寫一個 **演算法** —— 即告訴計算機以什麼順序執行哪些操作。
> 宣告式查詢語言很有吸引力，因為它通常更簡潔，比顯式演算法更容易編寫。
> 但更重要的是，它還隱藏了查詢引擎的實現細節，這使得資料庫系統可以在不需要更改任何查詢的情況下引入效能改進 [^1]。
>
> 例如，資料庫可能能夠跨多個 CPU 核心和機器並行執行宣告式查詢，而你無需擔心如何實現該並行性 [^2]。
> 如果用手寫演算法，實現這種並行執行將需要大量工作。

--------

## 關係模型與文件模型 {#sec_datamodels_history}

今天最廣為人知的資料模型可能是 SQL，它基於 Edgar Codd 在 1970 年提出的關係模型 [^3]：
資料被組織成 **關係**（在 SQL 中稱為 **表**），其中每個關係是 **元組**（在 SQL 中稱為 **行**）的無序集合。

關係模型最初是一個理論提議，當時許多人懷疑它是否能夠高效實現。
然而，到 20 世紀 80 年代中期，關係資料庫管理系統（RDBMS）和 SQL 已成為大多數需要儲存和查詢具有某種規則結構的資料的人的首選工具。
許多資料管理用例在幾十年後仍然由關係資料主導 —— 例如，商業分析（參見 ["星型與雪花型：分析模式"](#sec_datamodels_analytics)）。

多年來，出現了許多與資料儲存和查詢相關的競爭方法。在 20 世紀 70 年代和 80 年代初，**網狀模型** 和 **層次模型** 是主要的替代方案，但關係模型最終戰勝了它們。
物件資料庫在 20 世紀 80 年代末和 90 年代初出現又消失。XML 資料庫在 21 世紀初出現，但只獲得了小眾的採用。
每個關係模型的競爭者在其時代都產生了大量的炒作，但都沒有持續下去 [^4]。
相反，SQL 已經發展到在其關係核心之外納入其他資料型別 —— 例如，增加了對 XML、JSON 和圖資料的支援 [^5]。

在 2010 年代，**NoSQL** 是試圖推翻關係資料庫主導地位的最新流行詞。
NoSQL 指的不是單一技術，而是圍繞新資料模型、模式靈活性、可伸縮性以及向開源許可模式轉變的一系列鬆散的想法。
一些資料庫將自己標榜為 *NewSQL*，因為它們旨在提供 NoSQL 系統的可伸縮性以及傳統關係資料庫的資料模型和事務保證。
NoSQL 和 NewSQL 的想法在資料系統設計中產生了很大的影響，但隨著這些原則被廣泛採用，這些術語的使用已經減少。

NoSQL 運動的一個持久影響是 **文件模型** 的流行，它通常將資料表示為 JSON。
這個模型最初由專門的文件資料庫（如 MongoDB 和 Couchbase）推廣，儘管大多數關係資料庫現在也增加了 JSON 支援。
與通常被視為具有嚴格和不靈活模式的關係表相比，JSON 文件被認為更加靈活。

文件和關係資料的優缺點已經被廣泛討論；讓我們來看看該辯論的一些關鍵點。

### 物件關係不匹配 {#sec_datamodels_document}

如今，大部分應用程式開發都是使用物件導向的程式語言完成的，這導致了對 SQL 資料模型的常見批評：如果資料儲存在關係表中，則需要在應用程式程式碼中的物件和資料庫的表、行、列模型之間建立一個笨拙的轉換層。這種模型之間的脫節有時被稱為 *阻抗不匹配*。

--------

> [!NOTE]
> 術語 *阻抗不匹配* 借自電子學。每個電路的輸入和輸出都有一定的阻抗（對交流電的阻力）。當你將一個電路的輸出連線到另一個電路的輸入時，如果兩個電路的輸出和輸入阻抗匹配，則透過連線的功率傳輸將最大化。阻抗不匹配可能導致訊號反射和其他問題。

--------

#### 物件關係對映（ORM） {#object-relational-mapping-orm}

物件關係對映（ORM）框架（如 ActiveRecord 和 Hibernate）減少了這個轉換層所需的樣板程式碼量，但它們經常受到批評 [^6]。一些常見的問題包括：

* ORM 很複雜，無法完全隱藏兩種模型之間的差異，因此開發人員仍然需要考慮資料的關係和物件表示。
* ORM 通常僅用於 OLTP 應用程式開發（參見 ["表徵事務處理和分析"](/tw/ch1#sec_introduction_oltp)）；為分析目的提供資料的資料工程師仍然需要使用底層的關係表示，因此在使用 ORM 時，關係模式的設計仍然很重要。
* 許多 ORM 僅適用於關係型 OLTP 資料庫。擁有多樣化資料系統（如搜尋引擎、圖資料庫和 NoSQL 系統）的組織可能會發現 ORM 支援不足。
* 一些 ORM 會自動生成關係模式，但這些模式對於直接訪問關係資料的使用者來說可能很尷尬，並且在底層資料庫上可能效率低下。自定義 ORM 的模式和查詢生成可能很複雜，並否定了首先使用 ORM 的好處。
* ORM 使得意外編寫低效查詢變得容易，例如 *N+1 查詢問題* [^7]。例如，假設你想在頁面上顯示使用者評論列表，因此你執行一個返回 *N* 條評論的查詢，每條評論都包含其作者的 ID。要顯示評論作者的姓名，你需要在使用者表中查詢 ID。在手寫 SQL 中，你可能會在查詢中執行此連線並返回每個評論的作者姓名，但使用 ORM 時，你可能最終會為 *N* 條評論中的每一條在使用者表上進行單獨的查詢以查詢其作者，總共產生 *N*+1 個資料庫查詢，這比在資料庫中執行連線要慢。為了避免這個問題，你可能需要告訴 ORM 在獲取評論的同時獲取作者資訊。

然而，ORM 也有優勢：

* 對於非常適合關係模型的資料，持久關係和記憶體物件表示之間的某種轉換是不可避免的，ORM 減少了這種轉換所需的樣板程式碼量。複雜的查詢可能仍然需要在 ORM 之外處理，但 ORM 可以幫助處理簡單和重複的情況。
* 一些 ORM 有助於快取資料庫查詢的結果，這可以幫助減少資料庫的負載。
* ORM 還可以幫助管理模式遷移和其他管理活動。

#### 用於一對多關係的文件資料模型 {#the-document-data-model-for-one-to-many-relationships}

並非所有資料都很適合關係表示；讓我們透過一個例子來探討關係模型的侷限性。[圖 3-1](#fig_obama_relational) 說明了如何在關係模式中表達簡歷（LinkedIn 個人資料）。整個個人資料可以透過唯一識別符號 `user_id` 來識別。像 `first_name` 和 `last_name` 這樣的欄位每個使用者只出現一次，因此它們可以建模為 `users` 表上的列。

大多數人在職業生涯中有多份工作（職位），人們可能有不同數量的教育經歷和任意數量的聯絡資訊。表示這種 *一對多關係* 的一種方法是將職位、教育和聯絡資訊放在單獨的表中，並使用外部索引鍵引用 `users` 表，如 [圖 3-1](#fig_obama_relational) 所示。

{{< figure src="/fig/ddia_0301.png" id="fig_obama_relational" caption="圖 3-1. 使用關係模式表示 LinkedIn 個人資料。" class="w-full my-4" >}}

另一種表示相同資訊的方式，可能更自然並且更接近應用程式程式碼中的物件結構，是作為 JSON 文件，如 [示例 3-1](#fig_obama_json) 所示。

{{< figure id="fig_obama_json" title="示例 3-1. 將 LinkedIn 個人資料表示為 JSON 文件" class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "headline": "Former President of the United States of America",
    "region_id": "us:91",
    "photo_url": "/p/7/000/253/05b/308dd6e.jpg",
    "positions": [
        {"job_title": "President", "organization": "United States of America"},
        {"job_title": "US Senator (D-IL)", "organization": "United States Senate"}
    ],
    "education": [
        {"school_name": "Harvard University", "start": 1988, "end": 1991},
        {"school_name": "Columbia University", "start": 1981, "end": 1983}
    ],
    "contact_info": {
        "website": "https://barackobama.com",
        "twitter": "https://twitter.com/barackobama"
    }
}
```

一些開發人員認為 JSON 模型減少了應用程式程式碼和儲存層之間的阻抗不匹配。然而，正如我們將在 [第 5 章](/tw/ch5#ch_encoding) 中看到的，JSON 作為資料編碼格式也存在問題。缺乏模式通常被認為是一個優勢；我們將在 ["文件模型中的模式靈活性"](#sec_datamodels_schema_flexibility) 中討論這個問題。

與 [圖 3-1](#fig_obama_relational) 中的多表模式相比，JSON 表示具有更好的 *區域性*（參見 ["讀寫的資料區域性"](#sec_datamodels_document_locality)）。如果你想在關係示例中獲取個人資料，你需要執行多個查詢（透過 `user_id` 查詢每個表）或在 `users` 表與其從屬表之間執行複雜的多表連線 [^8]。在 JSON 表示中，所有相關資訊都在一個地方，使查詢既更快又更簡單。

從使用者個人資料到使用者職位、教育歷史和聯絡資訊的一對多關係暗示了資料中的樹形結構，而 JSON 表示使這種樹形結構變得明確（見 [圖 3-2](#fig_json_tree)）。

{{< figure src="/fig/ddia_0302.png" id="fig_json_tree" caption="圖 3-2. 一對多關係形成樹狀結構。" class="w-full my-4" >}}

--------

> [!NOTE]
> 這種型別的關係有時被稱為 *一對少* 而不是 *一對多*，因為簡歷通常有少量的職位 [^9] [^10]。在可能存在真正大量相關專案的情況下 —— 比如名人社交媒體帖子上的評論，可能有成千上萬條 —— 將它們全部嵌入同一個文件中可能太笨拙了，因此 [圖 3-1](#fig_obama_relational) 中的關係方法更可取。

--------

### 正規化、反正規化與連線 {#sec_datamodels_normalization}

在前一節的 [示例 3-1](#fig_obama_json) 中，`region_id` 被給出為 ID，而不是純文字字串 `"Washington, DC, United States"`。為什麼？

如果使用者介面有一個用於輸入地區的自由文字欄位，將其儲存為純文字字串是有意義的。但是，擁有標準化的地理區域列表並讓使用者從下拉列表或自動補全中選擇也有其優勢：

* 不同個人資料之間的風格和拼寫保持一致
* 避免歧義：可能存在多個同名的地方（如果字串只是 "Washington"，它是指 DC 還是華盛頓州？）
* 易於更新 —— 名稱只儲存在一個地方，因此如果需要更改（例如，由於政治事件而更改城市名稱），可以輕鬆地全面更新
* 本地化支援 —— 當網站被翻譯成其他語言時，標準化列表可以被本地化，因此區域可以用檢視者的語言顯示
* 更好的搜尋 —— 例如，搜尋美國東海岸的人可以匹配此個人資料，因為區域列表可以編碼華盛頓位於東海岸的事實（這從字串 `"Washington, DC"` 中並不明顯）

究竟是儲存 ID 還是文字字串，是一個 *正規化* 的問題。當你使用 ID 時，你的資料更加正規化：對人類有意義的資訊（如文字 *Washington, DC*）只儲存在一個地方，所有引用它的地方都使用 ID（它只在資料庫中有意義）。當你直接儲存文字時，你在使用它的每條記錄中都複製了對人類有意義的資訊；這種表示是 *反正規化* 的。

使用 ID 的優勢在於，因為它對人類沒有意義，所以永遠不需要更改：即使它標識的資訊發生變化，ID 也可以保持不變。任何對人類有意義的東西將來某個時候可能需要更改 —— 如果該資訊被複制，所有冗餘副本都需要更新。這需要更多的程式碼、更多的寫操作、更多的磁碟空間，並且存在不一致的風險（其中一些資訊副本被更新但其他的沒有）。

正規化表示的缺點是，每次要顯示包含 ID 的記錄時，都必須進行額外的查詢以將 ID 解析為人類可讀的內容。在關係資料模型中，這是使用 *連線* 完成的，例如：

```sql
SELECT users.*, regions.region_name
    FROM users
    JOIN regions ON users.region_id = regions.id
    WHERE users.id = 251;
```

文件資料庫可以儲存正規化和反正規化的資料，但它們通常與反正規化相關聯 —— 部分是因為 JSON 資料模型使得儲存額外的反正規化欄位變得容易，部分是因為許多文件資料庫中對連線的弱支援使得正規化不方便。一些文件資料庫根本不支援連線，因此你必須在應用程式程式碼中執行它們 —— 也就是說，你首先獲取包含 ID 的文件，然後執行第二個查詢將該 ID 解析為另一個文件。在 MongoDB 中，也可以使用聚合管道中的 `$lookup` 運算元執行連線：

```mongodb-json
db.users.aggregate([
    { $match: { _id: 251 } },
    { $lookup: {
        from: "regions",
        localField: "region_id",
        foreignField: "_id",
        as: "region"
    } }
])
```

#### 正規化的權衡 {#trade-offs-of-normalization}

在簡歷示例中，雖然 `region_id` 欄位是對標準化區域集的引用，但 `organization`（人工作的公司或政府）和 `school_name`（他們學習的地方）的名稱只是字串。這種表示是反正規化的：許多人可能在同一家公司工作過，但沒有 ID 將他們聯絡起來。

也許組織和學校應該是實體，個人資料應該引用它們的 ID 而不是它們的名稱？引用區域 ID 的相同論點也適用於此。例如，假設我們想在他們的名字之外包括學校或公司的標誌：

* 在反正規化表示中，我們會在每個人的個人資料中包含標誌的影像 URL；這使得 JSON 文件自包含，但如果我們需要更改標誌，就會產生麻煩，因為我們現在需要找到舊 URL 的所有出現並更新它們 [^9]。
* 在正規化表示中，我們將建立一個代表組織或學校的實體，並在該實體上儲存其名稱、標誌 URL 以及可能的其他屬性（描述、新聞提要等）一次。然後，每個提到該組織的簡歷都會簡單地引用其 ID，更新標誌很容易。

作為一般原則，正規化資料通常寫入更快（因為只有一個副本），但查詢更慢（因為它需要連線）；反正規化資料通常讀取更快（連線更少），但寫入更昂貴（更多副本要更新，使用更多磁碟空間）。你可能會發現將反正規化視為派生資料的一種形式很有幫助（["記錄系統與派生資料"](/tw/ch1#sec_introduction_derived)），因為你需要設定一個過程來更新資料的冗餘副本。

除了執行所有這些更新的成本之外，如果程序在進行更新的過程中崩潰，你還需要考慮資料庫的一致性。提供原子事務的資料庫（參見 ["原子性"](/tw/ch8#sec_transactions_acid_atomicity)）使保持一致性變得更容易，但並非所有資料庫都在多個文件之間提供原子性。透過流處理確保一致性也是可能的，我們將在 ["保持系統同步"](/tw/ch12#sec_stream_sync) 中討論。

正規化往往更適合 OLTP 系統，其中讀取和更新都需要快速；分析系統通常使用反正規化資料表現更好，因為它們批次執行更新，只讀查詢的效能是主要關注點。此外，在中小規模的系統中，正規化資料模型通常是最好的，因為你不必擔心保持資料的多個副本相互一致，執行連線的成本是可以接受的。然而，在非常大規模的系統中，連線的成本可能會成為問題。

#### 社交網路案例研究中的反正規化 {#denormalization-in-the-social-networking-case-study}

在 ["案例研究：社交網路首頁時間線"](/tw/ch2#sec_introduction_twitter) 中，我們比較了正規化表示（[圖 2-1](/tw/ch2#fig_twitter_relational)）和反正規化表示（預計算的物化時間線）：這裡，`posts` 和 `follows` 之間的連線太昂貴了，物化時間線是該連線結果的快取。將新帖子插入關注者時間線的扇出過程是我們保持反正規化表示一致的方式。

然而，X（前 Twitter）的物化時間線實現實際上並不儲存每個帖子的實際文字：每個條目實際上只儲存帖子 ID、釋出者的使用者 ID，以及一些額外的資訊來識別轉發和回覆 [^11]。換句話說，它大致是以下查詢的預計算結果：

```sql
SELECT posts.id, posts.sender_id
    FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

這意味著每當讀取時間線時，服務仍然需要執行兩個連線：透過 ID 查詢帖子以獲取實際的帖子內容（以及點贊數和回覆數等統計資訊），並透過 ID 查詢傳送者的個人資料（以獲取他們的使用者名稱、個人資料圖片和其他詳細資訊）。這個將 ID 補全為人類可讀資訊的過程稱為 *hydrating* ID，本質上是在應用程式程式碼中執行的連線 [^11]。

在預計算時間線中僅儲存 ID 的原因是它們引用的資料變化很快：熱門帖子的點贊數和回覆數可能每秒變化多次，一些使用者定期更改他們的使用者名稱或個人資料照片。由於時間線在檢視時應該顯示最新的點贊數和個人資料圖片，因此將此資訊反正規化到物化時間線中是沒有意義的。此外，這種反正規化會顯著增加儲存成本。

這個例子表明，在讀取資料時必須執行連線並不像有時聲稱的那樣，是建立高效能、可擴充套件服務的障礙。*hydrating* 帖子 ID 和使用者 ID 實際上是一個相當容易擴充套件的操作，因為它可以很好地並行化，並且成本不取決於你關注的賬戶數量或你擁有的關注者數量。

如果你需要決定是否在應用程式中反正規化某些內容，社交網路案例研究表明選擇並不是立即顯而易見的：最可擴充套件的方法可能涉及反正規化某些內容並保持其他內容正規化。你必須仔細考慮資訊更改的頻率以及讀寫成本（這可能由異常值主導，例如在典型社交網路的情況下擁有許多關注/關注者的使用者）。正規化和反正規化本質上並不好或壞 —— 它們只是在讀寫效能以及實施工作量方面的權衡。

### 多對一與多對多關係 {#sec_datamodels_many_to_many}

雖然 [圖 3-1](#fig_obama_relational) 中的 `positions` 和 `education` 是一對多或一對少關係的例子（一份簡歷有多個職位，但每個職位只屬於一份簡歷），但 `region_id` 欄位是 *多對一* 關係的例子（許多人住在同一個地區，但我們假設每個人在任何時候只住在一個地區）。

如果我們為組織和學校引入實體，並透過 ID 從簡歷中引用它們，那麼我們也有 *多對多* 關係（一個人曾為多個組織工作，一個組織有多個過去或現在的員工）。在關係模型中，這種關係通常表示為 *關聯表* 或 *連線表*，如 [圖 3-3](#fig_datamodels_m2m_rel) 所示：每個職位將一個使用者 ID 與一個組織 ID 關聯起來。

{{< figure src="/fig/ddia_0303.png" id="fig_datamodels_m2m_rel" caption="圖 3-3. 關係模型中的多對多關係。" class="w-full my-4" >}}

多對一和多對多關係不容易適應一個自包含的 JSON 文件；它們更適合正規化表示。在文件模型中，一種可能的表示如 [示例 3-2](#fig_datamodels_m2m_json) 所示，並在 [圖 3-4](#fig_datamodels_many_to_many) 中說明：每個虛線矩形內的資料可以分組到一個文件中，但到組織和學校的連結最好表示為對其他文件的引用。

{{< figure id="fig_datamodels_m2m_json" title="示例 3-2. 透過 ID 引用組織的簡歷。" class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "positions": [
        {"start": 2009, "end": 2017, "job_title": "President", "org_id": 513},
        {"start": 2005, "end": 2008, "job_title": "US Senator (D-IL)", "org_id": 514}
    ],
    ...
}
```

{{< figure src="/fig/ddia_0304.png" id="fig_datamodels_many_to_many" caption="圖 3-4. 文件模型中的多對多關係：每個虛線框內的資料可以分組到一個文件中。" class="w-full my-4" >}}

多對多關係通常需要"雙向"查詢：例如，找到特定人員工作過的所有組織，以及找到在特定組織工作過的所有人員。啟用此類查詢的一種方法是在兩邊都儲存 ID 引用，即簡歷包含該人工作過的每個組織的 ID，組織文件包含提到該組織的簡歷的 ID。這種表示是反正規化的，因為關係儲存在兩個地方，可能會相互不一致。

正規化表示僅在一個地方儲存關係，並依賴 *二級索引*（我們將在 [第 4 章](/tw/ch4#ch_storage) 中討論）來允許有效地雙向查詢關係。在 [圖 3-3](#fig_datamodels_m2m_rel) 的關係模式中，我們會告訴資料庫在 `positions` 表的 `user_id` 和 `org_id` 列上建立索引。

在 [示例 3-2](#fig_datamodels_m2m_json) 的文件模型中，資料庫需要索引 `positions` 陣列內物件的 `org_id` 欄位。許多文件資料庫和具有 JSON 支援的關係資料庫能夠在文件內的值上建立此類索引。

### 星型與雪花型：分析模式 {#sec_datamodels_analytics}

資料倉庫（參見 ["資料倉庫"](/tw/ch1#sec_introduction_dwh)）通常是關係型的，並且資料倉庫中表結構有一些廣泛使用的約定：*星型模式*、*雪花模式*、*維度建模* [^12]，以及 *一張大表*（OBT）。這些結構針對業務分析師的需求進行了最佳化。ETL 過程將來自運營系統的資料轉換為此模式。

[圖 3-5](#fig_dwh_schema) 顯示了一個可能在雜貨零售商的資料倉庫中找到的星型模式示例。模式的中心是所謂的 *事實表*（在此示例中，它稱為 `fact_sales`）。事實表的每一行代表在特定時間發生的事件（這裡，每一行代表客戶購買產品）。如果我們分析的是網站流量而不是零售銷售，每一行可能代表使用者的頁面檢視或點選。

{{< figure src="/fig/ddia_0305.png" id="fig_dwh_schema" caption="圖 3-5. 用於資料倉庫的星型模式示例。" class="w-full my-4" >}}

通常，事實被捕獲為單個事件，因為這允許以後最大的分析靈活性。然而，這意味著事實表可能變得非常大。一個大型企業可能在其資料倉庫中有許多 PB 的交易歷史，主要表示為事實表。

事實表中的一些列是屬性，例如產品售出的價格和從供應商那裡購買它的成本（允許計算利潤率）。事實表中的其他列是對其他表的外部索引鍵引用，稱為 *維度表*。由於事實表中的每一行代表一個事件，維度代表事件的 *誰*、*什麼*、*哪裡*、*何時*、*如何* 和 *為什麼*。

例如，在 [圖 3-5](#fig_dwh_schema) 中，其中一個維度是售出的產品。`dim_product` 表中的每一行代表一種待售產品型別，包括其庫存單位（SKU）、描述、品牌名稱、類別、脂肪含量、包裝尺寸等。`fact_sales` 表中的每一行使用外部索引鍵來指示在該特定交易中售出了哪種產品。查詢通常涉及對多個維度表的多個連線。

即使日期和時間也經常使用維度表表示，因為這允許編碼有關日期的附加資訊（例如公共假期），允許查詢區分假期和非假期的銷售。

[圖 3-5](#fig_dwh_schema) 是星型模式的一個例子。該名稱來自這樣一個事實：當表關係被視覺化時，事實表位於中間，被其維度表包圍；到這些表的連線就像星星的光芒。

這個模板的一個變體被稱為 *雪花模式*，其中維度被進一步分解為子維度。例如，品牌和產品類別可能有單獨的表，`dim_product` 表中的每一行都可以將品牌和類別作為外部索引鍵引用，而不是將它們作為字串儲存在 `dim_product` 表中。雪花模式比星型模式更正規化，但星型模式通常更受歡迎，因為它們對分析師來說更簡單 [^12]。

在典型的資料倉庫中，表通常非常寬：事實表通常有超過 100 列，有時有幾百列。維度表也可能很寬，因為它們包括所有可能與分析相關的元資料 —— 例如，`dim_store` 表可能包括每個商店提供哪些服務的詳細資訊、是否有店內麵包房、店鋪面積（平方英尺）、商店首次開業的日期、最後一次改造的時間、距離最近的高速公路有多遠等。

星型或雪花模式主要由多對一關係組成（例如，許多銷售發生在一個特定產品，在一個特定商店），表示為事實表對維度表的外部索引鍵，或維度對子維度的外部索引鍵。原則上，其他型別的關係可能存在，但它們通常被反正規化以簡化查詢。例如，如果客戶一次購買多種不同的產品，則該多項交易不會被明確表示；相反，事實表中為每個購買的產品都有一個單獨的行，這些事實都恰好具有相同的客戶 ID、商店 ID 和時間戳。

一些資料倉庫模式進一步進行反正規化，完全省略維度表，將維度中的資訊摺疊到事實表上的反正規化列中（本質上是預計算事實表和維度表之間的連線）。這種方法被稱為 *一張大表*（OBT），雖然它需要更多的儲存空間，但有時可以實現更快的查詢 [^13]。

在分析的背景下，這種反正規化是沒有問題的，因為資料通常代表不會改變的歷史資料日誌（除了偶爾糾正錯誤）。OLTP 系統中反正規化出現的資料一致性和寫入開銷問題在分析中並不那麼緊迫。

### 何時使用哪種模型 {#sec_datamodels_document_summary}

文件資料模型的主要論點是模式靈活性、由於區域性而獲得更好的效能，以及對於某些應用程式來說，它更接近應用程式使用的物件模型。關係模型透過為連線、多對一和多對多關係提供更好的支援來反擊。讓我們更詳細地研究這些論點。

如果你的應用程式中的資料具有類似文件的結構（即一對多關係的樹，通常一次載入整個樹），那麼使用文件模型可能是個好主意。將類似文件的結構 *切碎*（shredding）為多個表的關係技術（如 [圖 3-1](#fig_obama_relational) 中的 `positions`、`education` 和 `contact_info`）可能導致繁瑣的模式和不必要複雜的應用程式程式碼。

文件模型有侷限性：例如，你不能直接引用文件中的巢狀項，而是需要說類似"使用者 251 的職位列表中的第二項"之類的話。如果你確實需要引用巢狀項，關係方法效果更好，因為你可以透過其 ID 直接引用任何項。

一些應用程式允許使用者選擇專案的順序：例如，想象一個待辦事項列表或問題跟蹤器，使用者可以拖放任務來重新排序它們。文件模型很好地支援此類應用程式，因為專案（或它們的 ID）可以簡單地儲存在 JSON 陣列中以確定它們的順序。在關係資料庫中，沒有表示此類可重新排序列表的標準方法，並且使用各種技巧：按整數列排序（在插入中間時需要重新編號）、ID 的連結串列或分數索引 [^14] [^15] [^16]。

#### 文件模型中的模式靈活性 {#sec_datamodels_schema_flexibility}

大多數文件資料庫以及關係資料庫中的 JSON 支援不會對文件中的資料強制執行任何模式。關係資料庫中的 XML 支援通常帶有可選的模式驗證。沒有模式意味著可以將任意鍵和值新增到文件中，並且在讀取時，客戶端無法得到文件會包含哪些欄位的任何保證。

文件資料庫有時被稱為 *無模式*，但這是誤導性的，因為讀取資料的程式碼通常假設某種結構 —— 即存在隱式模式，但資料庫不強制執行 [^17]。更準確的術語是 *讀時模式*（資料的結構是隱式的，只有在讀取資料時才解釋），與 *寫時模式*（關係資料庫的傳統方法，其中模式是顯式的，資料庫確保所有資料在寫入時都符合它）形成對比 [^18]。

讀時模式類似於程式語言中的動態（執行時）型別檢查，而寫時模式類似於靜態（編譯時）型別檢查。正如靜態和動態型別檢查的倡導者對它們的相對優點有很大的爭論 [^19]，資料庫中模式的強制執行是一個有爭議的話題，通常沒有正確或錯誤的答案。

當應用程式想要更改其資料格式時，這些方法之間的差異特別明顯。例如，假設你當前在一個欄位中儲存每個使用者的全名，而你想要分別儲存名字和姓氏 [^20]。在文件資料庫中，你只需開始編寫具有新欄位的新文件，並在應用程式中編寫處理讀取舊文件時的程式碼。例如：

```mongodb-json
if (user && user.name && !user.first_name) {
    // 2023 年 12 月 8 日之前寫入的文件沒有 first_name
    user.first_name = user.name.split(" ")[0];
}
```

這種方法的缺點是，從資料庫讀取的應用程式的每個部分現在都需要處理可能很久以前寫入的舊格式的文件。另一方面，在寫時模式資料庫中，你通常會執行 *遷移*，如下所示：

```sql
ALTER TABLE users ADD COLUMN first_name text DEFAULT NULL;
UPDATE users SET first_name = split_part(name, ' ', 1); -- PostgreSQL
UPDATE users SET first_name = substring_index(name, ' ', 1); -- MySQL
```

在大多數關係資料庫中，新增具有預設值的列即使在大表上也是快速且無問題的。然而，在大表上執行 `UPDATE` 語句可能會很慢，因為每一行都需要重寫，其他模式操作（例如更改列的資料型別）通常也需要複製整個表。

存在各種工具允許在後臺執行此類模式更改而無需停機 [^21] [^22] [^23] [^24]，但在大型資料庫上執行此類遷移在操作上仍然具有挑戰性。透過僅新增預設值為 `NULL` 的 `first_name` 列（這很快）並在讀取時填充它，可以避免複雜的遷移，就像你在文件資料庫中所做的那樣。

如果集合中的專案由於某種原因並非都具有相同的結構（即資料是異構的），則讀時模式方法是有利的 —— 例如，因為：

* 有許多不同型別的物件，將每種型別的物件放在自己的表中是不切實際的。
* 資料的結構由你無法控制且可能隨時更改的外部系統決定。

在這樣的情況下，模式可能弊大於利，無模式文件可能是更自然的資料模型。但在所有記錄都應具有相同結構的情況下，模式是記錄和強制該結構的有用機制。我們將在 [第 5 章](/tw/ch5#ch_encoding) 中更詳細地討論模式和模式演化。

#### 讀寫的資料區域性 {#sec_datamodels_document_locality}

文件通常儲存為單個連續字串，編碼為 JSON、XML 或二進位制變體（如 MongoDB 的 BSON）。如果你的應用程式經常需要訪問整個文件（例如，在網頁上渲染它），則這種 *儲存區域性* 具有效能優勢。如果資料分佈在多個表中，如 [圖 3-1](#fig_obama_relational) 所示，則需要多次索引查詢才能檢索所有資料，這可能需要更多的磁碟尋道並花費更多時間。

區域性優勢僅在你同時需要文件的大部分時才適用。資料庫通常需要載入整個文件，如果你只需要訪問大文件的一小部分，這可能會浪費。在文件更新時，通常需要重寫整個文件。由於這些原因，通常建議你保持文件相當小，並避免頻繁對文件進行小的更新。

然而，將相關資料儲存在一起以獲得區域性的想法並不限於文件模型。例如，Google 的 Spanner 資料庫在關係資料模型中提供相同的區域性屬性，允許模式宣告表的行應該交錯（巢狀）在父表中 [^25]。Oracle 也支援同樣的做法，使用的是稱為 *多表索引叢集表* 的功能 [^26]。由 Google 的 Bigtable 推廣並在 HBase 和 Accumulo 等中使用的 *寬列* 資料模型具有 *列族* 的概念，其目的同樣是管理區域性 [^27]。

#### 文件的查詢語言 {#query-languages-for-documents}

關係資料庫和文件資料庫之間的另一個區別是你用來查詢它的語言或 API。大多數關係資料庫使用 SQL 查詢，但文件資料庫更加多樣化。一些只允許透過主鍵進行鍵值訪問，而另一些還提供二級索引來查詢文件內的值，有些提供豐富的查詢語言。

XML 資料庫通常使用 XQuery 和 XPath 查詢，它們旨在允許複雜的查詢，包括跨多個文件的連線，並將其結果格式化為 XML [^28]。JSON Pointer [^29] 和 JSONPath [^30] 為 JSON 提供了等效於 XPath 的功能。

MongoDB 的聚合管道，我們在 ["正規化、反正規化與連線"](#sec_datamodels_normalization) 中看到了其用於連線的 `$lookup` 運算元，是 JSON 文件集合查詢語言的一個例子。

讓我們看另一個例子來感受這種語言 —— 這次是聚合，這對分析特別需要。想象你是一名海洋生物學家，每次你在海洋中看到動物時，你都會向資料庫新增一條觀察記錄。現在你想生成一份報告，說明你每個月看到了多少條鯊魚。在 PostgreSQL 中，你可能會這樣表達該查詢：

```sql
SELECT date_trunc('month', observation_timestamp) AS observation_month, ❶
    sum(num_animals) AS total_animals
FROM observations
WHERE family = 'Sharks'
GROUP BY observation_month;
```

❶ : `date_trunc('month', timestamp)` 函式確定包含 `timestamp` 的日曆月，並返回表示該月開始的另一個時間戳。換句話說，它將時間戳向下舍入到最近的月份。

此查詢首先過濾觀察結果以僅顯示 `Sharks` 家族中的物種，然後按它們發生的日曆月對觀察結果進行分組，最後將該月所有觀察中看到的動物數量相加。可以使用 MongoDB 的聚合管道表達相同的查詢，如下所示：

```mongodb-json
db.observations.aggregate([
    { $match: { family: "Sharks" } },
    { $group: {
    _id: {
        year: { $year: "$observationTimestamp" },
        month: { $month: "$observationTimestamp" }
    },
    totalAnimals: { $sum: "$numAnimals" }
    } }
]);
```

聚合管道語言在表達能力上類似於 SQL 的子集，但它使用基於 JSON 的語法而不是 SQL 的英語句子風格語法；差異可能是品味問題。

#### 文件和關係資料庫的融合 {#convergence-of-document-and-relational-databases}

文件資料庫和關係資料庫最初是非常不同的資料管理方法，但隨著時間的推移，它們變得更加相似 [^31]。關係資料庫增加了對 JSON 型別和查詢運算元的支援，以及索引文件內屬性的能力。一些文件資料庫（如 MongoDB、Couchbase 和 RethinkDB）增加了對連線、二級索引和宣告式查詢語言的支援。

模型的這種融合對應用程式開發人員來說是個好訊息，因為當你可以在同一個資料庫中組合兩者時，關係模型和文件模型效果最好。許多文件資料庫需要對其他文件進行關係式引用，許多關係資料庫也有一些場景更適合模式靈活性。關係-文件混合是一個強大的組合。

--------

> [!NOTE]
> Codd 對關係模型的原始描述 [^3] 實際上允許在關係模式中存在類似於 JSON 的東西。他稱之為 *非簡單域*。這個想法是，行中的值不必只是原始資料型別（如數字或字串），但它也可以是巢狀關係（表）—— 所以你可以有一個任意巢狀的樹結構作為值，很像 30 多年後新增到 SQL 的 JSON 或 XML 支援。

--------


## 圖資料模型 {#sec_datamodels_graph}

我們之前看到，關係型別是不同資料模型之間的重要區別特徵。如果你的應用程式主要具有一對多關係（樹形結構資料）並且記錄之間很少有其他關係，則文件模型是合適的。

但是，如果你的資料中多對多關係非常常見呢？關係模型可以處理多對多關係的簡單情況，但隨著資料內部連線變得更加複雜，開始將資料建模為圖變得更加自然。

圖由兩種物件組成：*頂點*（也稱為 *節點* 或 *實體*）和 *邊*（也稱為 *關係* 或 *弧*）。許多型別的資料可以建模為圖。典型的例子包括：

社交圖
: 頂點是人，邊表示哪些人相互認識。

網頁圖
: 頂點是網頁，邊表示指向其他頁面的 HTML 連結。

道路或鐵路網路
: 頂點是交叉點，邊表示它們之間的道路或鐵路線。

眾所周知的演算法可以在這些圖上執行：例如，地圖導航應用程式搜尋道路網路中兩點之間的最短路徑，PageRank 可用於網頁圖以確定網頁的受歡迎程度，從而確定其在搜尋結果中的排名 [^32]。

圖可以用幾種不同的方式表示。在 *鄰接表* 模型中，每個頂點儲存其相距一條邊的鄰居頂點的 ID。或者，你可以使用 *鄰接矩陣*，這是一個二維陣列，其中每一行和每一列對應一個頂點，當行頂點和列頂點之間沒有邊時值為零，如果有邊則值為一。鄰接表適合圖遍歷，矩陣適合機器學習（參見 ["資料框、矩陣與陣列"](#sec_datamodels_dataframes)）。

在剛才給出的示例中，圖中的所有頂點都表示相同型別的事物（分別是人、網頁或道路交叉點）。然而，圖不限於這種 *同質* 資料：圖的一個同樣強大的用途是提供一種一致的方式在單個資料庫中儲存完全不同型別的物件。例如：

* Facebook 維護一個包含許多不同型別頂點和邊的單一圖：頂點表示人員、位置、事件、簽到和使用者發表的評論；邊表示哪些人彼此是朋友、哪個簽到發生在哪個位置、誰評論了哪個帖子、誰參加了哪個事件等等 [^33]。
* 知識圖被搜尋引擎用來記錄搜尋查詢中經常出現的實體（如組織、人員和地點）的事實 [^34]。這些資訊透過爬取和分析網站上的文字獲得；一些網站（如 Wikidata）也以結構化形式釋出圖資料。

在圖中構建和查詢資料有幾種不同但相關的方式。在本節中，我們將討論 *屬性圖* 模型（由 Neo4j、Memgraph、KùzuDB [^35] 和其他 [^36] 實現）和 *三元組儲存* 模型（由 Datomic、AllegroGraph、Blazegraph 和其他實現）。這些模型在它們可以表達的內容方面相當相似，一些圖資料庫（如 Amazon Neptune）支援兩種模型。

我們還將檢視圖的四種查詢語言（Cypher、SPARQL、Datalog 和 GraphQL），以及用於查詢圖的 SQL 支援。還存在其他圖查詢語言，如 Gremlin [^37]，但這些將為我們提供代表性的概述。

為了說明這些不同的語言和模型，本節使用 [圖 3-6](#fig_datamodels_graph) 中顯示的圖作為貫穿全節的示例。它可能取自社交網路或家譜資料庫：它顯示了兩個人，來自愛達荷州的 Lucy 和來自法國聖洛的 Alain。他們已婚並住在倫敦。每個人和每個位置都表示為頂點，它們之間的關係表示為邊。此示例將幫助演示一些在圖資料庫中很容易但在其他模型中很困難的查詢。

{{< figure src="/fig/ddia_0306.png" id="fig_datamodels_graph" caption="圖 3-6. 圖結構資料示例（框表示頂點，箭頭表示邊）。" class="w-full my-4" >}}

### 屬性圖 {#id56}

在 *屬性圖*（也稱為 *標記屬性圖*）模型中，每個頂點包含：

* 唯一識別符號
* 標籤（字串），描述此頂點表示的物件型別
* 一組出邊
* 一組入邊
* 屬性集合（鍵值對）

每條邊包含：

* 唯一識別符號
* 邊開始的頂點（*尾頂點*）
* 邊結束的頂點（*頭頂點*）
* 描述兩個頂點之間關係型別的標籤
* 屬性集合（鍵值對）

你可以將圖儲存視為由兩個關係表組成，一個用於頂點，一個用於邊，如 [示例 3-3](#fig_graph_sql_schema) 所示（此模式使用 PostgreSQL `jsonb` 資料型別來儲存每個頂點或邊的屬性）。每條邊都儲存頭頂點和尾頂點；如果你想要頂點的入邊或出邊集，可以分別透過 `head_vertex` 或 `tail_vertex` 查詢 `edges` 表。

{{< figure id="fig_graph_sql_schema" title="示例 3-3. 使用關係模式表示屬性圖" class="w-full my-4" >}}

```sql
CREATE TABLE vertices (
    vertex_id integer PRIMARY KEY,
    label text,
    properties jsonb
);

CREATE TABLE edges (
    edge_id integer PRIMARY KEY,
    tail_vertex integer REFERENCES vertices (vertex_id),
    head_vertex integer REFERENCES vertices (vertex_id),
    label text,
    properties jsonb
);

CREATE INDEX edges_tails ON edges (tail_vertex);
CREATE INDEX edges_heads ON edges (head_vertex);
```

此模型的一些重要方面是：

1. 任何頂點都可以有一條邊將其與任何其他頂點連線。沒有限制哪些型別的事物可以或不能關聯的模式。
2. 給定任何頂點，你可以有效地找到其入邊和出邊，從而 *遍歷* 圖 —— 即透過頂點鏈跟隨路徑 —— 向前和向後。（這就是為什麼 [示例 3-3](#fig_graph_sql_schema) 在 `tail_vertex` 和 `head_vertex` 列上都有索引。）
3. 透過對不同型別的頂點和關係使用不同的標籤，你可以在單個圖中儲存幾種不同型別的資訊，同時仍保持簡潔的資料模型。

邊表就像我們在 ["多對一與多對多關係"](#sec_datamodels_many_to_many) 中看到的多對多關聯表/連線表，泛化為允許在同一表中儲存許多不同型別的關係。標籤和屬性上也可能有索引，允許有效地找到具有某些屬性的頂點或邊。

--------

> [!NOTE]
> 圖模型的一個限制是邊只能將兩個頂點相互關聯，而關係連線表可以透過在單行上具有多個外部索引鍵引用來表示三元或甚至更高階的關係。此類關係可以透過為連線表的每一行建立一個額外的頂點，以及到/從該頂點的邊，或者使用 *超圖* 在圖中表示。

--------

這些功能為資料建模提供了極大的靈活性，如 [圖 3-6](#fig_datamodels_graph) 所示。該圖顯示了一些在傳統關係模式中難以表達的內容，例如不同國家的不同區域結構（法國有 *省* 和 *大區*，而美國有 *縣* 和 *州*）、歷史的怪癖（如國中之國）（暫時忽略主權國家和民族的複雜性），以及不同粒度的資料（Lucy 的當前居住地指定為城市，而她的出生地僅在州級別指定）。

你可以想象擴充套件圖以包括有關 Lucy 和 Alain 或其他人的許多其他事實。例如，你可以使用它來指示他們有哪些食物過敏（透過為每個過敏原引入一個頂點，並在人和過敏原之間設定邊以指示過敏），並將過敏原與顯示哪些食物含有哪些物質的一組頂點連結。然後你可以編寫查詢來找出每個人可以安全食用的食物。圖適合可演化性：隨著你向應用程式新增功能，圖可以輕鬆擴充套件以適應應用程式資料結構的變化。

### Cypher 查詢語言 {#id57}

*Cypher* 是用於屬性圖的查詢語言，最初為 Neo4j 圖資料庫建立，後來作為 *openCypher* 發展為開放標準 [^38]。除了 Neo4j，Cypher 還得到 Memgraph、KùzuDB [^35]、Amazon Neptune、Apache AGE（在 PostgreSQL 中儲存）等的支援。它以電影《駭客帝國》中的角色命名，與密碼學中的密碼無關 [^39]。

[示例 3-4](#fig_cypher_create) 顯示了將 [圖 3-6](#fig_datamodels_graph) 的左側部分插入圖資料庫的 Cypher 查詢。圖的其餘部分可以類似地新增。每個頂點都被賦予一個符號名稱，如 `usa` 或 `idaho`。該名稱不儲存在資料庫中，僅在查詢內部使用以在頂點之間建立邊，使用箭頭符號：`(idaho) -[:WITHIN]-> (usa)` 建立一條標記為 `WITHIN` 的邊，其中 `idaho` 作為尾節點，`usa` 作為頭節點。

{{< figure link="#fig_datamodels_graph" id="fig_cypher_create" title="示例 3-4. 圖 3-6 中資料的子集，表示為 Cypher 查詢" class="w-full my-4" >}}

```
CREATE
    (namerica :Location {name:'North America', type:'continent'}),
    (usa :Location {name:'United States', type:'country' }),
    (idaho :Location {name:'Idaho', type:'state' }),
    (lucy :Person {name:'Lucy' }),
    (idaho) -[:WITHIN ]-> (usa) -[:WITHIN]-> (namerica),
    (lucy) -[:BORN_IN]-> (idaho)
```

當 [圖 3-6](#fig_datamodels_graph) 的所有頂點和邊都新增到資料庫後，我們可以開始提出有趣的問題：例如，*查詢所有從美國移民到歐洲的人的姓名*。也就是說，找到所有具有指向美國境內位置的 `BORN_IN` 邊，以及指向歐洲境內位置的 `LIVES_IN` 邊的頂點，並返回每個頂點的 `name` 屬性。

[示例 3-5](#fig_cypher_query) 顯示了如何在 Cypher 中表達該查詢。相同的箭頭符號用於 `MATCH` 子句中以在圖中查詢模式：`(person) -[:BORN_IN]-> ()` 匹配由標記為 `BORN_IN` 的邊相關的任意兩個頂點。該邊的尾頂點繫結到變數 `person`，頭頂點未命名。

{{< figure id="fig_cypher_query" title="示例 3-5. Cypher 查詢查詢從美國移民到歐洲的人" class="w-full my-4" >}}

```
MATCH
    (person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (:Location {name:'United States'}),
    (person) -[:LIVES_IN]-> () -[:WITHIN*0..]-> (:Location {name:'Europe'})
RETURN person.name
```

查詢可以這樣理解：

> 找到滿足以下 *兩個* 條件的任何頂點（稱為 `person`）：
>
> 1. `person` 有一條出邊 `BORN_IN` 指向某個頂點。從那個頂點，你可以跟隨一條出邊 `WITHIN` 鏈，直到最終到達一個型別為 `Location` 的頂點，其 `name` 屬性等於 `"United States"`。
> 2. 同一個 `person` 頂點也有一條出邊 `LIVES_IN`。跟隨該邊，然後是一條出邊 `WITHIN` 鏈，你最終到達一個型別為 `Location` 的頂點，其 `name` 屬性等於 `"Europe"`。
>
> 對於每個這樣的 `person` 頂點，返回 `name` 屬性。

有幾種可能的執行查詢的方法。這裡給出的描述建議你從掃描資料庫中的所有人開始，檢查每個人的出生地和居住地，並僅返回符合條件的人。

但等效地，你可以從兩個 `Location` 頂點開始並向後工作。如果 `name` 屬性上有索引，你可以有效地找到表示美國和歐洲的兩個頂點。然後你可以透過跟隨所有傳入的 `WITHIN` 邊來查詢美國和歐洲各自的所有位置（州、地區、城市等）。最後，你可以尋找可以透過位置頂點之一的傳入 `BORN_IN` 或 `LIVES_IN` 邊找到的人。

### SQL 中的圖查詢 {#id58}

[示例 3-3](#fig_graph_sql_schema) 建議圖資料可以在關係資料庫中表示。但如果我們將圖資料放入關係結構中，我們還能使用 SQL 查詢它嗎？

答案是肯定的，但有一些困難。你在圖查詢中遍歷的每條邊實際上都是與 `edges` 表的連線。在關係資料庫中，你通常事先知道查詢中需要哪些連線。另一方面，在圖查詢中，你可能需要遍歷可變數量的邊才能找到你要查詢的頂點 —— 也就是說，連線的數量不是預先固定的。

在我們的示例中，這發生在 Cypher 查詢中的 `() -[:WITHIN*0..]-> ()` 模式中。一個人的 `LIVES_IN` 邊可能指向任何型別的位置：街道、城市、區（district）、地區（region）、州等。一個城市可能在（`WITHIN`）某個地區，該地區在（`WITHIN`）某個州，該州在（`WITHIN`）某個國家，等等。`LIVES_IN` 邊可能直接指向你要查詢的位置頂點，或者它可能在位置層次結構中相距幾個級別。

在 Cypher 中，`:WITHIN*0..` 非常簡潔地表達了這個事實：它意味著"跟隨 `WITHIN` 邊，零次或多次"。它就像正則表示式中的 `*` 運算子。

自 SQL:1999 以來，查詢中可變長度遍歷路徑的想法可以使用稱為 *遞迴公用表表達式*（`WITH RECURSIVE` 語法）的東西來表達。[示例 3-6](#fig_graph_sql_query) 顯示了相同的查詢 —— 查詢從美國移民到歐洲的人的姓名 —— 使用此技術在 SQL 中表達。然而，與 Cypher 相比，語法非常笨拙。

{{< figure link="#fig_cypher_query" id="fig_graph_sql_query" title="示例 3-6. 與 示例 3-5 相同的查詢，使用遞迴公用表表達式在 SQL 中編寫" class="w-full my-4" >}}

```sql
WITH RECURSIVE

    -- in_usa 是美國境內所有位置的頂點 ID 集合
    in_usa(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'United States' ❶
      UNION
        SELECT edges.tail_vertex FROM edges ❷
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'within'
    ),

    -- in_europe 是歐洲境內所有位置的頂點 ID 集合
    in_europe(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'Europe' ❸
      UNION
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'within'
    ),

    -- born_in_usa 是所有在美國出生的人的頂點 ID 集合
    born_in_usa(vertex_id) AS ( ❹
        SELECT edges.tail_vertex FROM edges
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'born_in'
    ),

    -- lives_in_europe 是所有居住在歐洲的人的頂點 ID 集合
    lives_in_europe(vertex_id) AS ( ❺
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'lives_in'
    )

    SELECT vertices.properties->>'name'
    FROM vertices
    -- 連線以找到那些既在美國出生 *又* 居住在歐洲的人
    JOIN born_in_usa ON vertices.vertex_id = born_in_usa.vertex_id ❻
    JOIN lives_in_europe ON vertices.vertex_id = lives_in_europe.vertex_id;
```

❶: 首先找到 `name` 屬性值為 `"United States"` 的頂點，並使其成為頂點集 `in_usa` 的第一個元素。

❷: 從集合 `in_usa` 中的頂點跟隨所有傳入的 `within` 邊，並將它們新增到同一集合中，直到訪問了所有傳入的 `within` 邊。

❸: 從 `name` 屬性值為 `"Europe"` 的頂點開始執行相同操作，並構建頂點集 `in_europe`。

❹: 對於集合 `in_usa` 中的每個頂點，跟隨傳入的 `born_in` 邊以查詢在美國某個地方出生的人。

❺: 類似地，對於集合 `in_europe` 中的每個頂點，跟隨傳入的 `lives_in` 邊以查詢居住在歐洲的人。

❻: 最後，透過連線它們來將在美國出生的人的集合與居住在歐洲的人的集合相交。

4 行 Cypher 查詢需要 31 行 SQL 的事實表明，正確選擇資料模型和查詢語言可以產生多大的差異。這只是開始；還有更多細節需要考慮，例如，處理迴圈，以及在廣度優先或深度優先遍歷之間進行選擇 [^40]。

Oracle 對遞迴查詢有不同的 SQL 擴充套件，它稱之為 *層次查詢* [^41]。

然而，情況可能正在改善：在撰寫本文時，有計劃向 SQL 標準新增一種名為 GQL 的圖查詢語言 [^42] [^43]，它將提供受 Cypher、GSQL [^44] 和 PGQL [^45] 啟發的語法。

### 三元組儲存與 SPARQL {#id59}

三元組儲存模型大多等同於屬性圖模型，使用不同的詞來描述相同的想法。儘管如此，它仍值得討論，因為有各種三元組儲存的工具和語言，它們可以成為構建應用程式工具箱的寶貴補充。

在三元組儲存中，所有資訊都以非常簡單的三部分語句的形式儲存：（*主語*、*謂語*、*賓語*）。例如，在三元組（*Jim*、*likes*、*bananas*）中，*Jim* 是主語，*likes* 是謂語（動詞），*bananas* 是賓語。

三元組的主語等同於圖中的頂點。賓語是兩種東西之一：

1. 原始資料型別的值，如字串或數字。在這種情況下，三元組的謂語和賓語等同於主語頂點上屬性的鍵和值。使用 [圖 3-6](#fig_datamodels_graph) 中的示例，（*lucy*、*birthYear*、*1989*）就像一個頂點 `lucy`，其屬性為 `{"birthYear": 1989}`。
2. 圖中的另一個頂點。在這種情況下，謂語是圖中的邊，主語是尾頂點，賓語是頭頂點。例如，在（*lucy*、*marriedTo*、*alain*）中，主語和賓語 *lucy* 和 *alain* 都是頂點，謂語 *marriedTo* 是連線它們的邊的標籤。

> [!NOTE]
> 準確地說，提供類似三元組資料模型的資料庫通常需要在每個元組上儲存一些額外的元資料。例如，AWS Neptune 使用四元組（4-tuples），透過向每個三元組新增圖 ID [^46]；Datomic 使用 5 元組，用事務 ID 和一個表示刪除的布林值擴充套件每個三元組 [^47]。由於這些資料庫保留了上面解釋的基本 *主語-謂語-賓語* 結構，本書仍然稱它們為三元組儲存。

[示例 3-7](#fig_graph_n3_triples) 顯示了與 [示例 3-4](#fig_cypher_create) 中相同的資料，以稱為 *Turtle* 的格式編寫為三元組，它是 *Notation3*（*N3*）的子集 [^48]。

{{< figure link="#fig_datamodels_graph" id="fig_graph_n3_triples" title="示例 3-7. 圖 3-6 中資料的子集，表示為 Turtle 三元組" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person.
_:lucy :name "Lucy".
_:lucy :bornIn _:idaho.
_:idaho a :Location.
_:idaho :name "Idaho".
_:idaho :type "state".
_:idaho :within _:usa.
_:usa a :Location.
_:usa :name "United States".
_:usa :type "country".
_:usa :within _:namerica.
_:namerica a :Location.
_:namerica :name "North America".
_:namerica :type "continent".
```

在此示例中，圖的頂點寫為 `_:someName`。該名稱在此檔案之外沒有任何意義；它的存在只是因為否則我們不知道哪些三元組引用同一個頂點。當謂語表示邊時，賓語是頂點，如 `_:idaho :within _:usa`。當謂語是屬性時，賓語是字串字面量，如 `_:usa :name "United States"`。

一遍又一遍地重複相同的主語相當重複，但幸運的是，你可以使用分號來表達關於同一主語的多個內容。這使得 Turtle 格式非常易讀：見 [示例 3-8](#fig_graph_n3_shorthand)。

{{< figure link="#fig_graph_n3_triples" id="fig_graph_n3_shorthand" title="示例 3-8. 編寫 示例 3-7 中資料的更簡潔方式" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person; :name "Lucy"; :bornIn _:idaho.
_:idaho a :Location; :name "Idaho"; :type "state"; :within _:usa.
_:usa a :Location; :name "United States"; :type "country"; :within _:namerica.
_:namerica a :Location; :name "North America"; :type "continent".
```

--------

> [!TIP] 語義網

一些三元組儲存的研究和開發工作是由 *語義網* 推動的，這是 2000 年代初的一項努力，旨在透過不僅以人類可讀的網頁形式釋出資料，還以標準化的機器可讀格式釋出資料來促進網際網路範圍的資料交換。儘管最初設想的語義網沒有成功 [^49] [^50]，但語義網專案的遺產在幾項特定技術中繼續存在：*連結資料* 標準（如 JSON-LD [^51]）、生物醫學科學中使用的 *本體* [^52]、Facebook 的開放圖協議 [^53]（用於連結展開 [^54]）、知識圖（如 Wikidata）以及由 [`schema.org`](https://schema.org/) 維護的結構化資料的標準化詞彙表。

三元組儲存是另一種在其原始用例之外找到用途的語義網技術：即使你對語義網沒有興趣，三元組也可以成為應用程式的良好內部資料模型。

--------

#### RDF 資料模型 {#the-rdf-data-model}

我們在 [示例 3-8](#fig_graph_n3_shorthand) 中使用的 Turtle 語言實際上是在 *資源描述框架*（RDF）[^55] 中編碼資料的一種方式，這是為語義網設計的資料模型。RDF 資料也可以用其他方式編碼，例如（更冗長地）用 XML，如 [示例 3-9](#fig_graph_rdf_xml) 所示。像 Apache Jena 這樣的工具可以在不同的 RDF 編碼之間自動轉換。

{{< figure link="#fig_graph_n3_shorthand" id="fig_graph_rdf_xml" title="示例 3-9. 示例 3-8 的資料，使用 RDF/XML 語法表示" class="w-full my-4" >}}

```xml
<rdf:RDF xmlns="urn:example:"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">

    <Location rdf:nodeID="idaho">
        <name>Idaho</name>
        <type>state</type>
        <within>
            <Location rdf:nodeID="usa">
                <name>United States</name>
                <type>country</type>
                <within>
                    <Location rdf:nodeID="namerica">
                        <name>North America</name>
                        <type>continent</type>
                    </Location>
                </within>
            </Location>
        </within>
    </Location>

    <Person rdf:nodeID="lucy">
        <name>Lucy</name>
        <bornIn rdf:nodeID="idaho"/>
    </Person>
</rdf:RDF>
```

RDF 有一些怪癖，因為它是為網際網路範圍的資料交換而設計的。三元組的主語、謂語和賓語通常是 URI。例如，謂語可能是一個 URI，如 `<http://my-company.com/namespace#within>` 或 `<http://my-company.com/namespace#lives_in>`，而不僅僅是 `WITHIN` 或 `LIVES_IN`。這種設計背後的原因是，你應該能夠將你的資料與其他人的資料結合起來，如果他們給單詞 `within` 或 `lives_in` 附加了不同的含義，你不會發生衝突，因為他們的謂語實際上是 `<http://other.org/foo#within>` 和 `<http://other.org/foo#lives_in>`。

URL `<http://my-company.com/namespace>` 不一定需要解析為任何內容 —— 從 RDF 的角度來看，它只是一個名稱空間。為了避免與 `http://` URL 的潛在混淆，本節中的示例使用不可解析的 URI，如 `urn:example:within`。幸運的是，你只需在檔案頂部指定一次此字首，然後就可以忘記它。

#### SPARQL 查詢語言 {#the-sparql-query-language}

*SPARQL* 是使用 RDF 資料模型的三元組儲存的查詢語言 [^56]。（它是 *SPARQL Protocol and RDF Query Language* 的首字母縮略詞，發音為 "sparkle"。）它早於 Cypher，由於 Cypher 的模式匹配是從 SPARQL 借用的，它們看起來非常相似。

與之前相同的查詢 —— 查詢從美國搬到歐洲的人 —— 在 SPARQL 中與在 Cypher 中一樣簡潔（見 [示例 3-10](#fig_sparql_query)）。

{{< figure link="#fig_cypher_query" id="fig_sparql_query" title="示例 3-10. 與 示例 3-5 相同的查詢，用 SPARQL 表示" class="w-full my-4" >}}

```
PREFIX : <urn:example:>

SELECT ?personName WHERE {
 ?person :name ?personName.
 ?person :bornIn / :within* / :name "United States".
 ?person :livesIn / :within* / :name "Europe".
}
```

結構非常相似。以下兩個表示式是等效的（變數在 SPARQL 中以問號開頭）：

```
(person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (location) # Cypher

?person :bornIn / :within* ?location. # SPARQL
```

因為 RDF 不區分屬性和邊，而只是對兩者都使用謂語，所以你可以使用相同的語法來匹配屬性。在以下表達式中，變數 `usa` 繫結到任何具有 `name` 屬性且其值為字串 `"United States"` 的頂點：

```
(usa {name:'United States'}) # Cypher

?usa :name "United States". # SPARQL
```

SPARQL 得到 Amazon Neptune、AllegroGraph、Blazegraph、OpenLink Virtuoso、Apache Jena 和各種其他三元組儲存的支援 [^36]。

### Datalog：遞迴關係查詢 {#id62}

Datalog 是一種比 SPARQL 或 Cypher 更古老的語言：它源於 20 世紀 80 年代的學術研究 [^57] [^58] [^59]。它在軟體工程師中不太為人所知，並且在主流資料庫中沒有得到廣泛支援，但它應該更為人所知，因為它是一種非常有表現力的語言，對於複雜查詢特別強大。幾個小眾資料庫，包括 Datomic、LogicBlox、CozoDB 和 LinkedIn 的 LIquid [^60] 使用 Datalog 作為它們的查詢語言。

Datalog 實際上基於關係資料模型，而不是圖，但它出現在本書的圖資料庫部分，因為圖上的遞迴查詢是 Datalog 的特殊優勢。

Datalog 資料庫的內容由 *事實* 組成，每個事實對應於關係表中的一行。例如，假設我們有一個包含位置的表 *location*，它有三列：*ID*、*name* 和 *type*。美國是一個國家的事實可以寫成 `location(2, "United States", "country")`，其中 `2` 是美國的 ID。一般來說，語句 `table(val1, val2, …​)` 意味著 `table` 包含一行，其中第一列包含 `val1`，第二列包含 `val2`，依此類推。

[示例 3-11](#fig_datalog_triples) 顯示了如何在 Datalog 中編寫 [圖 3-6](#fig_datamodels_graph) 左側的資料。圖的邊（`within`、`born_in` 和 `lives_in`）表示為兩列連線表。例如，Lucy 的 ID 是 100，愛達荷州的 ID 是 3，所以關係"Lucy 出生在愛達荷州"表示為 `born_in(100, 3)`。

{{< figure link="#fig_datamodels_graph" id="fig_datalog_triples" title="示例 3-11. 圖 3-6 中資料的子集，表示為 Datalog 事實" class="w-full my-4" >}}

```
location(1, "North America", "continent").
location(2, "United States", "country").
location(3, "Idaho", "state").

within(2, 1). /* 美國在北美 */
within(3, 2). /* 愛達荷州在美國 */

person(100, "Lucy").
born_in(100, 3). /* Lucy 出生在愛達荷州 */
```

現在我們已經定義了資料，我們可以編寫與之前相同的查詢，如 [示例 3-12](#fig_datalog_query) 所示。它看起來與 Cypher 或 SPARQL 中的等效查詢有點不同，但不要讓這嚇倒你。Datalog 是 Prolog 的子集，這是一種程式語言，如果你學過計算機科學，你可能見過它。

{{< figure link="#fig_cypher_query" id="fig_datalog_query" title="示例 3-12. 與 示例 3-5 相同的查詢，用 Datalog 表示" class="w-full my-4" >}}

```sql
within_recursive(LocID, PlaceName) :- location(LocID, PlaceName, _). /* 規則 1 */

within_recursive(LocID, PlaceName) :- within(LocID, ViaID), /* 規則 2 */
 within_recursive(ViaID, PlaceName).

migrated(PName, BornIn, LivingIn) :- person(PersonID, PName), /* 規則 3 */
 born_in(PersonID, BornID),
 within_recursive(BornID, BornIn),
 lives_in(PersonID, LivingID),
 within_recursive(LivingID, LivingIn).

us_to_europe(Person) :- migrated(Person, "United States", "Europe"). /* 規則 4 */
/* us_to_europe 包含行 "Lucy"。 */
```

Cypher 和 SPARQL 直接用 `SELECT` 開始，但 Datalog 一次只邁出一小步。我們定義 *規則* 從底層事實派生新的虛擬表。這些派生表就像（虛擬）SQL 檢視：它們不儲存在資料庫中，但你可以像查詢包含儲存事實的表一樣查詢它們。

在 [示例 3-12](#fig_datalog_query) 中，我們定義了三個派生表：`within_recursive`、`migrated` 和 `us_to_europe`。虛擬表的名稱和列由每個規則的 `:-` 符號之前出現的內容定義。例如，`migrated(PName, BornIn, LivingIn)` 是一個具有三列的虛擬表：一個人的姓名、他們出生地的名稱和他們居住地的名稱。

虛擬表的內容由規則的 `:-` 符號之後的部分定義，我們在其中嘗試查詢表中匹配某種模式的行。例如，`person(PersonID, PName)` 匹配行 `person(100, "Lucy")`，變數 `PersonID` 繫結到值 `100`，變數 `PName` 繫結到值 `"Lucy"`。如果系統可以為 `:-` 運算子右側的 *所有* 模式找到匹配項，則規則適用。當規則適用時，就好像 `:-` 的左側被新增到資料庫中（變數被它們匹配的值替換）。

因此，應用規則的一種可能方式是（如 [圖 3-7](#fig_datalog_naive) 所示）：

1. `location(1, "North America", "continent")` 存在於資料庫中，因此規則 1 適用。它生成 `within_recursive(1, "North America")`。
2. `within(2, 1)` 存在於資料庫中，前一步生成了 `within_recursive(1, "North America")`，因此規則 2 適用。它生成 `within_recursive(2, "North America")`。
3. `within(3, 2)` 存在於資料庫中，前一步生成了 `within_recursive(2, "North America")`，因此規則 2 適用。它生成 `within_recursive(3, "North America")`。

透過重複應用規則 1 和 2，`within_recursive` 虛擬表可以告訴我們資料庫中包含的北美（或任何其他位置）的所有位置。

{{< figure link="#fig_datalog_query" src="/fig/ddia_0307.png" id="fig_datalog_naive" title="圖 3-7. 使用示例 3-12 中的 Datalog 規則確定愛達荷州在北美。" class="w-full my-4" >}}

> 圖 3-7. 使用 [示例 3-12](#fig_datalog_query) 中的 Datalog 規則確定愛達荷州在北美。

現在規則 3 可以找到出生在某個位置 `BornIn` 並居住在某個位置 `LivingIn` 的人。規則 4 使用 `BornIn = 'United States'` 和 `LivingIn = 'Europe'` 呼叫規則 3，並僅返回匹配搜尋的人的姓名。透過查詢虛擬 `us_to_europe` 表的內容，Datalog 系統最終得到與早期 Cypher 和 SPARQL 查詢相同的答案。

與本章討論的其他查詢語言相比，Datalog 方法需要不同型別的思維。它允許逐條規則地構建複雜查詢，一個規則引用其他規則，類似於你將程式碼分解為相互呼叫的函式的方式。就像函式可以遞迴一樣，Datalog 規則也可以呼叫自己，如 [示例 3-12](#fig_datalog_query) 中的規則 2，這使得 Datalog 查詢中的圖遍歷成為可能。

### GraphQL {#id63}

GraphQL 是一種查詢語言，從設計上講，它比我們在本章中看到的其他查詢語言限制性更強。GraphQL 的目的是允許在使用者裝置上執行的客戶端軟體（如移動應用程式或 JavaScript Web 應用程式前端）請求具有特定結構的 JSON 文件，其中包含渲染其使用者介面所需的欄位。GraphQL 介面允許開發人員快速更改客戶端程式碼中的查詢，而無需更改伺服器端 API。

GraphQL 的靈活性是有代價的。採用 GraphQL 的組織通常需要工具將 GraphQL 查詢轉換為對內部服務的請求，這些服務通常使用 REST 或 gRPC（參見 [第 5 章](/tw/ch5#ch_encoding)）。授權、速率限制和效能挑戰是額外的關注點 [^61]。GraphQL 的查詢語言也受到限制，因為 GraphQL 查詢來自不受信任的來源。該語言不允許任何可能執行成本高昂的操作，否則使用者可能透過執行大量昂貴的查詢對伺服器執行拒絕服務攻擊。特別是，GraphQL 不允許遞迴查詢（與 Cypher、SPARQL、SQL 或 Datalog 不同），並且不允許任意搜尋條件，如"查詢在美國出生並現在居住在歐洲的人"（除非服務所有者特別選擇提供此類搜尋功能）。

儘管如此，GraphQL 還是很有用的。[示例 3-13](#fig_graphql_query) 顯示了如何使用 GraphQL 實現 Discord 或 Slack 等群聊應用程式。查詢請求使用者有權訪問的所有頻道，包括頻道名稱和每個頻道中的 50 條最新訊息。對於每條訊息，它請求時間戳、訊息內容以及訊息傳送者的姓名和個人資料圖片 URL。此外，如果訊息是對另一條訊息的回覆，查詢還會請求傳送者姓名和它所回覆的訊息內容（可能以較小的字型呈現在回覆上方，以提供一些上下文）。

{{< figure id="fig_graphql_query" title="示例 3-13. 群聊應用程式的示例 GraphQL 查詢" class="w-full my-4" >}}

```
query ChatApp {
    channels {
        name
        recentMessages(latest: 50) {
            timestamp
            content
            sender {
                fullName
                imageUrl
            }
            replyTo {
                content
                sender {
                    fullName
                }
            }
        }
    }
}
```

[示例 3-14](#fig_graphql_response) 顯示了對 [示例 3-13](#fig_graphql_query) 中查詢的響應可能是什麼樣子。響應是一個反映查詢結構的 JSON 文件：它正好包含請求的那些屬性，不多也不少。這種方法的優點是伺服器不需要知道客戶端需要哪些屬性來渲染使用者介面；相反，客戶端可以簡單地請求它需要的內容。例如，此查詢不會為 `replyTo` 訊息的傳送者請求個人資料圖片 URL，但如果使用者介面更改為新增該個人資料圖片，客戶端可以很容易地將所需的 `imageUrl` 屬性新增到查詢中，而無需更改伺服器。

{{< figure link="#fig_graphql_query" id="fig_graphql_response" title="示例 3-14. 對 示例 3-13 中查詢的可能響應" class="w-full my-4" >}}

```json
{
    "data": {
        "channels": [
            {
                "name": "#general",
                "recentMessages": [
                    {
                        "timestamp": 1693143014,
                        "content": "Hey! How are y'all doing?",
                        "sender": {"fullName": "Aaliyah", "imageUrl": "https://..."},
                        "replyTo": null
                    },
                    {
                        "timestamp": 1693143024,
                        "content": "Great! And you?",
                        "sender": {"fullName": "Caleb", "imageUrl": "https://..."},
                        "replyTo": {
                            "content": "Hey! How are y'all doing?",
                            "sender": {"fullName": "Aaliyah"}
                        }
                    },
                    ...
```

在 [示例 3-14](#fig_graphql_response) 中，訊息傳送者的姓名和影像 URL 直接嵌入在訊息物件中。如果同一使用者傳送多條訊息，此資訊會在每條訊息上重複。原則上，可以減少這種重複，但 GraphQL 做出了接受更大響應大小的設計選擇，以便更簡單地基於資料渲染使用者介面。

`replyTo` 欄位類似：在 [示例 3-14](#fig_graphql_response) 中，第二條訊息是對第一條訊息的回覆，內容（"Hey!…"）和傳送者 Aaliyah 在 `replyTo` 下重複。可以改為返回被回覆訊息的 ID，但如果該 ID 不在返回的 50 條最新訊息中，客戶端就必須向伺服器發出額外的請求。重複內容使得處理資料變得更加簡單。

伺服器的資料庫可以以更正規化的形式儲存資料，並執行必要的連線來處理查詢。例如，伺服器可能儲存訊息以及傳送者的使用者 ID 和它所回覆的訊息的 ID；當它收到如上所示的查詢時，伺服器將解析這些 ID 以查詢它們引用的記錄。但是，客戶端只能要求伺服器執行 GraphQL 模式中明確提供的連線。

即使對 GraphQL 查詢的響應看起來類似於文件資料庫的響應，即使它的名稱中有"graph"，GraphQL 也可以在任何型別的資料庫之上實現 —— 關係型、文件型或圖型。


## 事件溯源與 CQRS {#sec_datamodels_events}

在我們迄今為止討論的所有資料模型中，資料以與寫入相同的形式被查詢 —— 無論是 JSON 文件、表中的行，還是圖中的頂點和邊。然而，在複雜的應用程式中，有時很難找到一種能夠滿足所有不同查詢和呈現資料方式的單一資料表示。在這種情況下，以一種形式寫入資料，然後從中派生出針對不同型別讀取最佳化的多種表示形式可能是有益的。

我們之前在 ["記錄系統與派生資料"](/tw/ch1#sec_introduction_derived) 中看到了這個想法，ETL（參見 ["資料倉庫"](/tw/ch1#sec_introduction_dwh)）就是這種派生過程的一個例子。現在我們將進一步深入這個想法。如果我們無論如何都要從一種資料表示派生出另一種，我們可以選擇分別針對寫入和讀取最佳化的不同表示。如果你只想為寫入最佳化資料建模，而不關心高效查詢，你會如何建模？

也許寫入資料的最簡單、最快速和最具表現力的方式是 *事件日誌*：每次你想寫入一些資料時，你將其編碼為自包含的字串（可能是 JSON），包括時間戳，然後將其追加到事件序列中。此日誌中的事件是 *不可變的*：你永遠不會更改或刪除它們，你只會向日誌追加更多事件（這可能會取代早期事件）。事件可以包含任意屬性。

[圖 3-8](#fig_event_sourcing) 顯示了一個可能來自會議管理系統的示例。會議可能是一個複雜的業務領域：不僅個人參與者可以註冊並用信用卡付款，公司也可以批次訂購座位，透過發票付款，然後再將座位分配給個人。一些座位可能為演講者、贊助商、志願者助手等保留。預訂也可能被取消，與此同時，會議組織者可能透過將其移至不同的房間來更改活動的容量。在所有這些情況發生時，簡單地計算可用座位數量就成為一個具有挑戰性的查詢。

{{< figure src="/fig/ddia_0308.png" id="fig_event_sourcing" title="圖 3-8. 使用不可變事件日誌作為真相來源（權威資料來源），並從中派生物化檢視。" class="w-full my-4" >}}

在 [圖 3-8](#fig_event_sourcing) 中，會議狀態的每個變化（例如組織者開放註冊，或參與者進行和取消註冊）首先被儲存為事件。每當事件追加到日誌時，幾個 *物化檢視*（也稱為 *投影* 或 *讀模型*）也會更新以反映該事件的影響。在會議示例中，可能有一個物化檢視收集與每個預訂狀態相關的所有資訊，另一個為會議組織者的儀表板計算圖表，第三個為列印參與者徽章的印表機生成檔案。

使用事件作為真相來源（權威資料來源），並將每個狀態變化表達為事件的想法被稱為 *事件溯源* [^62] [^63]。維護單獨的讀最佳化表示並從寫最佳化表示派生它們的原則稱為 *命令查詢責任分離（CQRS）* [^64]。這些術語起源於領域驅動設計（DDD）社群，儘管類似的想法已經存在很長時間了，例如 *狀態機複製*（參見 ["使用共享日誌"](/tw/ch10#sec_consistency_smr)）。

當使用者的請求進來時，它被稱為 *命令*，首先需要驗證。只有在命令已執行並確定有效（例如，請求的預訂有足夠的可用座位）後，它才成為事實，相應的事件被新增到日誌中。因此，事件日誌應該只包含有效事件，構建物化檢視的事件日誌消費者不允許拒絕事件。

在以事件溯源風格建模資料時，建議你使用過去時態命名事件（例如，"座位已預訂"），因為事件是記錄過去發生的事情的記錄。即使使用者後來決定更改或取消，他們以前持有預訂的事實仍然是真實的，更改或取消是稍後新增的單獨事件。

事件溯源與星型模式事實表之間的相似之處（如 ["星型與雪花型：分析模式"](#sec_datamodels_analytics) 中所討論的）是兩者都是過去發生的事件的集合。然而，事實表中的行都具有相同的列集，而在事件溯源中可能有許多不同的事件型別，每種都有不同的屬性。此外，事實表是無序集合，而在事件溯源中事件的順序很重要：如果先進行預訂然後取消，以錯誤的順序處理這些事件將沒有意義。

事件溯源和 CQRS 有幾個優點：

* 對於開發系統的人來說，事件更好地傳達了 *為什麼* 發生某事的意圖。例如，理解事件"預訂已取消"比理解"`bookings` 表第 4001 行的 `active` 列被設定為 `false`，與該預訂相關的三行從 `seat_assignments` 表中刪除，並且在 `payments` 表中插入了一行代表退款"更容易。當物化檢視處理取消事件時，這些行修改仍可能發生，但當它們由事件驅動時，更新的原因變得更加清晰。
* 事件溯源的關鍵原則是物化檢視以可重現的方式從事件日誌派生：你應該始終能夠刪除物化檢視並透過以相同順序處理相同事件，使用相同程式碼來重新計算它們。如果檢視維護程式碼中有錯誤，你可以刪除檢視並使用新程式碼重新計算它。查詢錯誤也更容易，因為你可以隨意重新執行檢視維護程式碼並檢查其行為。
* 你可以有多個物化檢視，針對應用程式所需的特定查詢進行最佳化。它們可以儲存在與事件相同的資料庫中，也可以儲存在不同的資料庫中，具體取決於你的需求。它們可以使用任何資料模型，並且可以為快速讀取而反正規化。你甚至可以只在記憶體中保留檢視並避免持久化它，只要可以在服務重新啟動時從事件日誌重新計算檢視即可。
* 如果你決定以新方式呈現現有資訊，很容易從現有事件日誌構建新的物化檢視。你還可以透過新增新型別的事件或向現有事件型別新增新屬性（任何舊事件保持未修改）來發展系統以支援新功能。你還可以將新行為連結到現有事件（例如，當會議參與者取消時，他們的座位可以提供給等候名單上的下一個人）。
* 如果某個事件被錯誤寫入，你可以再把它刪掉，這樣你就能重建出一個沒有這個被刪除事件的檢視。另一方面，在直接更新和刪除資料的資料庫中，已提交的事務通常很難撤銷。因此，事件溯源可以減少系統中不可逆操作的數量，使其更容易更改（參見 ["可演化性：讓變更變得容易"](/tw/ch2#sec_introduction_evolvability)）。
* 事件日誌還可以作為系統中發生的所有事情的審計日誌，這在需要此類可審計性的受監管行業中很有價值。

然而，事件溯源和 CQRS 也有缺點：

* 如果涉及外部資訊，你需要小心。例如，假設一個事件包含以一種貨幣給出的價格，對於其中一個檢視，它需要轉換為另一種貨幣。由於匯率可能會波動，在處理事件時從外部源獲取匯率會有問題，因為如果你在另一個日期重新計算物化檢視，你會得到不同的結果。為了使事件處理邏輯具有確定性，你要麼需要在事件本身中包含匯率，要麼有一種方法來查詢事件中指示的時間戳處的歷史匯率，確保此查詢始終為相同的時間戳返回相同的結果。
* 事件不可變的要求會在事件包含使用者的個人資料時產生問題，因為使用者可能行使他們的權利（例如，根據 GDPR）請求刪除他們的資料。如果事件日誌是基於每個使用者的，你可以刪除該使用者的整個日誌，但如果你的事件日誌包含與多個使用者相關的事件，這就不起作用了。你可以嘗試將個人資料儲存在實際事件之外，或者使用金鑰對其進行加密，你可以稍後選擇刪除該金鑰，但這也使得在需要時更難重新計算派生狀態。
* 如果存在外部可見的副作用，重新處理事件需要小心 —— 例如，你可能不希望每次重建物化檢視時都重新發送確認電子郵件。

你可以在任何資料庫之上實現事件溯源，但也有一些專門設計來支援這種模式的系統，例如 EventStoreDB、MartenDB（基於 PostgreSQL）和 Axon Framework。你還可以使用訊息代理（如 Apache Kafka）來儲存事件日誌，流處理器可以使物化檢視保持最新；我們將在 ["資料變更捕獲與事件溯源"](/tw/ch12#sec_stream_event_sourcing) 中回到這些主題。

唯一重要的要求是事件儲存系統必須保證所有物化檢視以與它們在日誌中出現的完全相同的順序處理事件；正如我們將在 [第 10 章](/tw/ch10#ch_consistency) 中看到的，這在分散式系統中並不總是容易實現。


## 資料框、矩陣與陣列 {#sec_datamodels_dataframes}

到目前為止，我們在本章中看到的資料模型通常用於事務處理和分析目的（參見 ["分析與運營系統"](/tw/ch1#sec_introduction_analytics)）。還有一些資料模型你可能會在分析或科學環境中遇到，但很少出現在 OLTP 系統中：資料框和多維數字陣列（如矩陣）。

資料框是 R 語言、Python 的 Pandas 庫、Apache Spark、ArcticDB、Dask 和其他系統支援的資料模型。它們是資料科學家為訓練機器學習模型準備資料的流行工具，但它們也廣泛用於資料探索、統計資料分析、資料視覺化和類似目的。

乍一看，資料框類似於關係資料庫中的表或電子表格。它支援對資料框內容執行批次操作的類關係運算子：例如，將函式應用於所有行、基於某些條件過濾行、按某些列對行進行分組並聚合其他列，以及基於某個鍵將一個資料框中的行與另一個資料框連線（關係資料庫稱為 *連線* 的操作在資料框上通常稱為 *合併*）。

資料框通常不是透過宣告式查詢（如 SQL）而是透過一系列修改其結構和內容的命令來操作的。這符合資料科學家的典型工作流程，他們逐步"整理"資料，使其成為能夠找到他們所提問題答案的形式。這些操作通常在資料科學家的資料集私有副本上進行，通常在他們的本地機器上，儘管最終結果可能與其他使用者共享。

資料框 API 還提供了遠遠超出關係資料庫提供的各種操作，資料模型的使用方式通常與典型的關係資料建模非常不同 [^65]。例如，資料框的常見用途是將資料從類似關係的表示轉換為矩陣或多維陣列表示，這是許多機器學習演算法期望的輸入形式。

[圖 3-9](#fig_dataframe_to_matrix) 顯示了這種轉換的簡單示例。左側是不同使用者如何評價各種電影的關係表（評分為 1 到 5），右側資料已轉換為矩陣，其中每列是一部電影，每行是一個使用者（類似於電子表格中的 *資料透視表*）。矩陣是 *稀疏* 的，這意味著許多使用者-電影組合沒有資料，但這沒關係。這個矩陣可能有數千列，因此不太適合關係資料庫，但資料框和提供稀疏陣列的庫（如 Python 的 NumPy）可以輕鬆處理此類資料。

{{< figure src="/fig/ddia_0309.png" id="fig_dataframe_to_matrix" title="圖 3-9. 將電影評分的關係資料庫轉換為矩陣表示。" class="w-full my-4" >}}

矩陣只能包含數字，各種技術用於將非數字資料轉換為矩陣中的數字。例如：

* 日期（在 [圖 3-9](#fig_dataframe_to_matrix) 的示例矩陣中省略了）可以縮放為某個合適範圍內的浮點數。
* 對於只能取一小組固定值之一的列（例如，電影資料庫中電影的型別），通常使用 *獨熱編碼*：我們為每個可能的值建立一列（一個用於"喜劇"，一個用於"劇情"，一個用於"恐怖"等），對於代表電影的每一行，我們在對應於該電影型別的列中放置 1，在所有其他列中放置 0。這種表示也很容易推廣到適合多種型別的電影。

一旦資料以數字矩陣的形式存在，它就適合線性代數運算，這構成了許多機器學習演算法的基礎。例如，[圖 3-9](#fig_dataframe_to_matrix) 中的資料可能是推薦使用者可能喜歡的電影系統的一部分。資料框足夠靈活，允許資料從關係形式逐漸演變為矩陣表示，同時讓資料科學家控制最適合實現資料分析或模型訓練過程目標的表示。

還有像 TileDB [^66] 這樣專門儲存大型多維數字陣列的資料庫；它們被稱為 *陣列資料庫*，最常用於科學資料集，如地理空間測量（規則間隔網格上的柵格資料）、醫學成像或天文望遠鏡的觀測 [^67]。資料框在金融行業也用於表示 *時間序列資料*，如資產價格和隨時間變化的交易 [^68]。

## 總結 {#summary}

資料模型是一個巨大的主題，在本章中，我們快速瀏覽了各種不同的模型。我們沒有空間深入每個模型的所有細節，但希望這個概述足以激發你的興趣，找出最適合你的應用需求的模型。

*關係模型* 儘管已有半個多世紀的歷史，但對許多應用來說仍然是一個重要的資料模型——特別是在資料倉庫和商業分析中，關係星型或雪花模式和 SQL 查詢無處不在。然而，關係資料的幾種替代方案也在其他領域變得流行：

* *文件模型* 針對資料以獨立的 JSON 文件形式出現的用例，以及一個文件與另一個文件之間的關係很少的情況。
* *圖資料模型* 走向相反的方向，針對任何東西都可能與一切相關的用例，以及查詢可能需要遍歷多個跳躍才能找到感興趣的資料（可以使用 Cypher、SPARQL 或 Datalog 中的遞迴查詢來表達）。
* *資料框* 將關係資料推廣到大量列，從而在資料庫和構成大量機器學習、統計資料分析和科學計算基礎的多維陣列之間提供橋樑。

在某種程度上，一個模型可以用另一個模型來模擬——例如，圖資料可以在關係資料庫中表示——但結果可能很彆扭，正如我們在 SQL 中對遞迴查詢的支援中看到的那樣。

因此，為每個資料模型開發了各種專業資料庫，提供針對特定模型最佳化的查詢語言和儲存引擎。然而，資料庫也有透過新增對其他資料模型的支援來擴充套件到相鄰領域的趨勢：例如，關係資料庫以 JSON 列的形式添加了對文件資料的支援，文件資料庫添加了類似關係的連線，SQL 中對圖資料的支援也在逐步改進。

我們討論的另一個模型是 *事件溯源*，它將資料表示為不可變事件的僅追加日誌，這對於建模複雜業務領域中的活動可能是有利的。僅追加日誌有利於寫入資料（正如我們將在 [第 4 章](/tw/ch4#ch_storage) 中看到的）；為了支援高效查詢，事件日誌透過 CQRS 轉換為讀最佳化的物化檢視。

非關係資料模型的一個共同點是，它們通常不會對儲存的資料強制執行模式，這可以使應用更容易適應不斷變化的需求。然而，你的應用很可能仍然假設資料具有某種結構；這只是模式是顯式的（在寫入時強制執行）還是隱式的（在讀取時假設）的問題。

儘管我們涵蓋了很多內容，但仍有資料模型未被提及。僅舉幾個簡短的例子：

* 研究基因組資料的研究人員通常需要執行 *序列相似性搜尋*，這意味著獲取一個非常長的字串（代表 DNA 分子）並將其與相似但不相同的大量字串資料庫進行匹配。這裡描述的資料庫都無法處理這種用法，這就是研究人員編寫了像 GenBank [^69] 這樣的專門基因組資料庫軟體的原因。
* 許多金融系統使用具有複式記賬的 *賬本* 作為其資料模型。這種型別的資料可以在關係資料庫中表示，但也有像 TigerBeetle 這樣專門研究這種資料模型的資料庫。加密貨幣和區塊鏈通常基於分散式賬本，它們的資料模型中也內建了價值轉移。
* *全文檢索* 可以說是一種經常與資料庫一起使用的資料模型。資訊檢索是一個大型的專業主題，我們不會在本書中詳細介紹，但我們將在 ["全文檢索"](/tw/ch4#sec_storage_full_text) 中涉及搜尋索引和向量搜尋。

我們現在必須到此為止了。在下一章中，我們將討論在 *實現* 本章中描述的資料模型時出現的一些權衡。


### 參考文獻

[^1]: Jamie Brandon. [Unexplanations: query optimization works because sql is declarative](https://www.scattered-thoughts.net/writing/unexplanations-sql-declarative/). *scattered-thoughts.net*, February 2024. Archived at [perma.cc/P6W2-WMFZ](https://perma.cc/P6W2-WMFZ)
[^2]: Joseph M. Hellerstein. [The Declarative Imperative: Experiences and Conjectures in Distributed Logic](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-90.pdf). Tech report UCB/EECS-2010-90, Electrical Engineering and Computer Sciences, University of California at Berkeley, June 2010. Archived at [perma.cc/K56R-VVQM](https://perma.cc/K56R-VVQM)
[^3]: Edgar F. Codd. [A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf). *Communications of the ACM*, volume 13, issue 6, pages 377–387, June 1970. [doi:10.1145/362384.362685](https://doi.org/10.1145/362384.362685)
[^4]: Michael Stonebraker and Joseph M. Hellerstein. [What Goes Around Comes Around](http://mitpress2.mit.edu/books/chapters/0262693143chapm1.pdf). In *Readings in Database Systems*, 4th edition, MIT Press, pages 2–41, 2005. ISBN: 9780262693141
[^5]: Markus Winand. [Modern SQL: Beyond Relational](https://modern-sql.com/). *modern-sql.com*, 2015. Archived at [perma.cc/D63V-WAPN](https://perma.cc/D63V-WAPN)
[^6]: Martin Fowler. [OrmHate](https://martinfowler.com/bliki/OrmHate.html). *martinfowler.com*, May 2012. Archived at [perma.cc/VCM8-PKNG](https://perma.cc/VCM8-PKNG)
[^7]: Vlad Mihalcea. [N+1 query problem with JPA and Hibernate](https://vladmihalcea.com/n-plus-1-query-problem/). *vladmihalcea.com*, January 2023. Archived at [perma.cc/79EV-TZKB](https://perma.cc/79EV-TZKB)
[^8]: Jens Schauder. [This is the Beginning of the End of the N+1 Problem: Introducing Single Query Loading](https://spring.io/blog/2023/08/31/this-is-the-beginning-of-the-end-of-the-n-1-problem-introducing-single-query). *spring.io*, August 2023. Archived at [perma.cc/6V96-R333](https://perma.cc/6V96-R333)
[^9]: William Zola. [6 Rules of Thumb for MongoDB Schema Design](https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design). *mongodb.com*, June 2014. Archived at [perma.cc/T2BZ-PPJB](https://perma.cc/T2BZ-PPJB)
[^10]: Sidney Andrews and Christopher McClister. [Data modeling in Azure Cosmos DB](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data). *learn.microsoft.com*, February 2023. Archived at [archive.org](https://web.archive.org/web/20230207193233/https%3A//learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data)
[^11]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK)
[^12]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, July 2013. ISBN: 9781118530801
[^13]: Michael Kaminsky. [Data warehouse modeling: Star schema vs. OBT](https://www.fivetran.com/blog/star-schema-vs-obt). *fivetran.com*, August 2022. Archived at [perma.cc/2PZK-BFFP](https://perma.cc/2PZK-BFFP)
[^14]: Joe Nelson. [User-defined Order in SQL](https://begriffs.com/posts/2018-03-20-user-defined-order.html). *begriffs.com*, March 2018. Archived at [perma.cc/GS3W-F7AD](https://perma.cc/GS3W-F7AD)
[^15]: Evan Wallace. [Realtime Editing of Ordered Sequences](https://www.figma.com/blog/realtime-editing-of-ordered-sequences/). *figma.com*, March 2017. Archived at [perma.cc/K6ER-CQZW](https://perma.cc/K6ER-CQZW)
[^16]: David Greenspan. [Implementing Fractional Indexing](https://observablehq.com/%40dgreensp/implementing-fractional-indexing). *observablehq.com*, October 2020. Archived at [perma.cc/5N4R-MREN](https://perma.cc/5N4R-MREN)
[^17]: Martin Fowler. [Schemaless Data Structures](https://martinfowler.com/articles/schemaless/). *martinfowler.com*, January 2013.
[^18]: Amr Awadallah. [Schema-on-Read vs. Schema-on-Write](https://www.slideshare.net/awadallah/schemaonread-vs-schemaonwrite). At *Berkeley EECS RAD Lab Retreat*, Santa Cruz, CA, May 2009. Archived at [perma.cc/DTB2-JCFR](https://perma.cc/DTB2-JCFR)
[^19]: Martin Odersky. [The Trouble with Types](https://www.infoq.com/presentations/data-types-issues/). At *Strange Loop*, September 2013. Archived at [perma.cc/85QE-PVEP](https://perma.cc/85QE-PVEP)
[^20]: Conrad Irwin. [MongoDB—Confessions of a PostgreSQL Lover](https://speakerdeck.com/conradirwin/mongodb-confessions-of-a-postgresql-lover). At *HTML5DevConf*, October 2013. Archived at [perma.cc/C2J6-3AL5](https://perma.cc/C2J6-3AL5)
[^21]: [Percona Toolkit Documentation: pt-online-schema-change](https://docs.percona.com/percona-toolkit/pt-online-schema-change.html). *docs.percona.com*, 2023. Archived at [perma.cc/9K8R-E5UH](https://perma.cc/9K8R-E5UH)
[^22]: Shlomi Noach. [gh-ost: GitHub’s Online Schema Migration Tool for MySQL](https://github.blog/2016-08-01-gh-ost-github-s-online-migration-tool-for-mysql/). *github.blog*, August 2016. Archived at [perma.cc/7XAG-XB72](https://perma.cc/7XAG-XB72)
[^23]: Shayon Mukherjee. [pg-osc: Zero downtime schema changes in PostgreSQL](https://www.shayon.dev/post/2022/47/pg-osc-zero-downtime-schema-changes-in-postgresql/). *shayon.dev*, February 2022. Archived at [perma.cc/35WN-7WMY](https://perma.cc/35WN-7WMY)
[^24]: Carlos Pérez-Aradros Herce. [Introducing pgroll: zero-downtime, reversible, schema migrations for Postgres](https://xata.io/blog/pgroll-schema-migrations-postgres). *xata.io*, October 2023. Archived at [archive.org](https://web.archive.org/web/20231008161750/https%3A//xata.io/blog/pgroll-schema-migrations-postgres)
[^25]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
[^26]: Donald K. Burleson. [Reduce I/O with Oracle Cluster Tables](http://www.dba-oracle.com/oracle_tip_hash_index_cluster_table.htm). *dba-oracle.com*. Archived at [perma.cc/7LBJ-9X2C](https://perma.cc/7LBJ-9X2C)
[^27]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^28]: Priscilla Walmsley. [*XQuery, 2nd Edition*](https://learning.oreilly.com/library/view/xquery-2nd-edition/9781491915080/). O’Reilly Media, December 2015. ISBN: 9781491915080
[^29]: Paul C. Bryan, Kris Zyp, and Mark Nottingham. [JavaScript Object Notation (JSON) Pointer](https://www.rfc-editor.org/rfc/rfc6901). RFC 6901, IETF, April 2013.
[^30]: Stefan Gössner, Glyn Normington, and Carsten Bormann. [JSONPath: Query Expressions for JSON](https://www.rfc-editor.org/rfc/rfc9535.html). RFC 9535, IETF, February 2024.
[^31]: Michael Stonebraker and Andrew Pavlo. [What Goes Around Comes Around… And Around…](https://db.cs.cmu.edu/papers/2024/whatgoesaround-sigmodrec2024.pdf). *ACM SIGMOD Record*, volume 53, issue 2, pages 21–37. [doi:10.1145/3685980.3685984](https://doi.org/10.1145/3685980.3685984)
[^32]: Lawrence Page, Sergey Brin, Rajeev Motwani, and Terry Winograd. [The PageRank Citation Ranking: Bringing Order to the Web](http://ilpubs.stanford.edu:8090/422/). Technical Report 1999-66, Stanford University InfoLab, November 1999. Archived at [perma.cc/UML9-UZHW](https://perma.cc/UML9-UZHW)
[^33]: Nathan Bronson, Zach Amsden, George Cabrera, Prasad Chakka, Peter Dimov, Hui Ding, Jack Ferris, Anthony Giardullo, Sachin Kulkarni, Harry Li, Mark Marchukov, Dmitri Petrov, Lovro Puzar, Yee Jiun Song, and Venkat Venkataramani. [TAO: Facebook’s Distributed Data Store for the Social Graph](https://www.usenix.org/conference/atc13/technical-sessions/presentation/bronson). At *USENIX Annual Technical Conference* (ATC), June 2013.
[^34]: Natasha Noy, Yuqing Gao, Anshu Jain, Anant Narayanan, Alan Patterson, and Jamie Taylor. [Industry-Scale Knowledge Graphs: Lessons and Challenges](https://cacm.acm.org/magazines/2019/8/238342-industry-scale-knowledge-graphs/fulltext). *Communications of the ACM*, volume 62, issue 8, pages 36–43, August 2019. [doi:10.1145/3331166](https://doi.org/10.1145/3331166)
[^35]: Xiyang Feng, Guodong Jin, Ziyi Chen, Chang Liu, and Semih Salihoğlu. [KÙZU Graph Database Management System](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf). At *13th Annual Conference on Innovative Data Systems Research* (CIDR 2023), January 2023.
[^36]: Maciej Besta, Emanuel Peter, Robert Gerstenberger, Marc Fischer, Michał Podstawski, Claude Barthels, Gustavo Alonso, and Torsten Hoefler. [Demystifying Graph Databases: Analysis and Taxonomy of Data Organization, System Designs, and Graph Queries](https://arxiv.org/pdf/1910.09017.pdf). *arxiv.org*, October 2019.
[^37]: [Apache TinkerPop 3.6.3 Documentation](https://tinkerpop.apache.org/docs/3.6.3/reference/). *tinkerpop.apache.org*, May 2023. Archived at [perma.cc/KM7W-7PAT](https://perma.cc/KM7W-7PAT)
[^38]: Nadime Francis, Alastair Green, Paolo Guagliardo, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Stefan Plantikow, Mats Rydberg, Petra Selmer, and Andrés Taylor. [Cypher: An Evolving Query Language for Property Graphs](https://core.ac.uk/download/pdf/158372754.pdf). At *International Conference on Management of Data* (SIGMOD), pages 1433–1445, May 2018. [doi:10.1145/3183713.3190657](https://doi.org/10.1145/3183713.3190657)
[^39]: Emil Eifrem. [Twitter correspondence](https://twitter.com/emileifrem/status/419107961512804352), January 2014. Archived at [perma.cc/WM4S-BW64](https://perma.cc/WM4S-BW64)
[^40]: Francesco Tisiot. [Explore the new SEARCH and CYCLE features in PostgreSQL® 14](https://aiven.io/blog/explore-the-new-search-and-cycle-features-in-postgresql-14). *aiven.io*, December 2021. Archived at [perma.cc/J6BT-83UZ](https://perma.cc/J6BT-83UZ)
[^41]: Gaurav Goel. [Understanding Hierarchies in Oracle](https://towardsdatascience.com/understanding-hierarchies-in-oracle-43f85561f3d9). *towardsdatascience.com*, May 2020. Archived at [perma.cc/5ZLR-Q7EW](https://perma.cc/5ZLR-Q7EW)
[^42]: Alin Deutsch, Nadime Francis, Alastair Green, Keith Hare, Bei Li, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Wim Martens, Jan Michels, Filip Murlak, Stefan Plantikow, Petra Selmer, Oskar van Rest, Hannes Voigt, Domagoj Vrgoč, Mingxi Wu, and Fred Zemke. [Graph Pattern Matching in GQL and SQL/PGQ](https://arxiv.org/abs/2112.06217). At *International Conference on Management of Data* (SIGMOD), pages 2246–2258, June 2022. [doi:10.1145/3514221.3526057](https://doi.org/10.1145/3514221.3526057)
[^43]: Alastair Green. [SQL... and now GQL](https://opencypher.org/articles/2019/09/12/SQL-and-now-GQL/). *opencypher.org*, September 2019. Archived at [perma.cc/AFB2-3SY7](https://perma.cc/AFB2-3SY7)
[^44]: Alin Deutsch, Yu Xu, and Mingxi Wu. [Seamless Syntactic and Semantic Integration of Query Primitives over Relational and Graph Data in GSQL](https://cdn2.hubspot.net/hubfs/4114546/IntegrationQuery%20PrimitivesGSQL.pdf). *tigergraph.com*, November 2018. Archived at [perma.cc/JG7J-Y35X](https://perma.cc/JG7J-Y35X)
[^45]: Oskar van Rest, Sungpack Hong, Jinha Kim, Xuming Meng, and Hassan Chafi. [PGQL: a property graph query language](https://event.cwi.nl/grades/2016/07-VanRest.pdf). At *4th International Workshop on Graph Data Management Experiences and Systems* (GRADES), June 2016. [doi:10.1145/2960414.2960421](https://doi.org/10.1145/2960414.2960421)
[^46]: Amazon Web Services. [Neptune Graph Data Model](https://docs.aws.amazon.com/neptune/latest/userguide/feature-overview-data-model.html). Amazon Neptune User Guide, *docs.aws.amazon.com*. Archived at [perma.cc/CX3T-EZU9](https://perma.cc/CX3T-EZU9)
[^47]: Cognitect. [Datomic Data Model](https://docs.datomic.com/cloud/whatis/data-model.html). Datomic Cloud Documentation, *docs.datomic.com*. Archived at [perma.cc/LGM9-LEUT](https://perma.cc/LGM9-LEUT)
[^48]: David Beckett and Tim Berners-Lee. [Turtle – Terse RDF Triple Language](https://www.w3.org/TeamSubmission/turtle/). W3C Team Submission, March 2011.
[^49]: Sinclair Target. [Whatever Happened to the Semantic Web?](https://twobithistory.org/2018/05/27/semantic-web.html) *twobithistory.org*, May 2018. Archived at [perma.cc/M8GL-9KHS](https://perma.cc/M8GL-9KHS)
[^50]: Gavin Mendel-Gleason. [The Semantic Web is Dead – Long Live the Semantic Web!](https://terminusdb.com/blog/the-semantic-web-is-dead/) *terminusdb.com*, August 2022. Archived at [perma.cc/G2MZ-DSS3](https://perma.cc/G2MZ-DSS3)
[^51]: Manu Sporny. [JSON-LD and Why I Hate the Semantic Web](http://manu.sporny.org/2014/json-ld-origins-2/). *manu.sporny.org*, January 2014. Archived at [perma.cc/7PT4-PJKF](https://perma.cc/7PT4-PJKF)
[^52]: University of Michigan Library. [Biomedical Ontologies and Controlled Vocabularies](https://guides.lib.umich.edu/ontology), *guides.lib.umich.edu/ontology*. Archived at [perma.cc/Q5GA-F2N8](https://perma.cc/Q5GA-F2N8)
[^53]: Facebook. [The Open Graph protocol](https://ogp.me/), *ogp.me*. Archived at [perma.cc/C49A-GUSY](https://perma.cc/C49A-GUSY)
[^54]: Matt Haughey. [Everything you ever wanted to know about unfurling but were afraid to ask /or/ How to make your site previews look amazing in Slack](https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254). *medium.com*, November 2015. Archived at [perma.cc/C7S8-4PZN](https://perma.cc/C7S8-4PZN)
[^55]: W3C RDF Working Group. [Resource Description Framework (RDF)](https://www.w3.org/RDF/). *w3.org*, February 2004.
[^56]: Steve Harris, Andy Seaborne, and Eric Prud’hommeaux. [SPARQL 1.1 Query Language](https://www.w3.org/TR/sparql11-query/). W3C Recommendation, March 2013.
[^57]: Todd J. Green, Shan Shan Huang, Boon Thau Loo, and Wenchao Zhou. [Datalog and Recursive Query Processing](http://blogs.evergreen.edu/sosw/files/2014/04/Green-Vol5-DBS-017.pdf). *Foundations and Trends in Databases*, volume 5, issue 2, pages 105–195, November 2013. [doi:10.1561/1900000017](https://doi.org/10.1561/1900000017)
[^58]: Stefano Ceri, Georg Gottlob, and Letizia Tanca. [What You Always Wanted to Know About Datalog (And Never Dared to Ask)](https://www.researchgate.net/profile/Letizia_Tanca/publication/3296132_What_you_always_wanted_to_know_about_Datalog_and_never_dared_to_ask/links/0fcfd50ca2d20473ca000000.pdf). *IEEE Transactions on Knowledge and Data Engineering*, volume 1, issue 1, pages 146–166, March 1989. [doi:10.1109/69.43410](https://doi.org/10.1109/69.43410)
[^59]: Serge Abiteboul, Richard Hull, and Victor Vianu. [*Foundations of Databases*](http://webdam.inria.fr/Alice/). Addison-Wesley, 1995. ISBN: 9780201537710, available online at [*webdam.inria.fr/Alice*](http://webdam.inria.fr/Alice/)
[^60]: Scott Meyer, Andrew Carter, and Andrew Rodriguez. [LIquid: The soul of a new graph database, Part 2](https://engineering.linkedin.com/blog/2020/liquid--the-soul-of-a-new-graph-database--part-2). *engineering.linkedin.com*, September 2020. Archived at [perma.cc/K9M4-PD6Q](https://perma.cc/K9M4-PD6Q)
[^61]: Matt Bessey. [Why, after 6 years, I’m over GraphQL](https://bessey.dev/blog/2024/05/24/why-im-over-graphql/). *bessey.dev*, May 2024. Archived at [perma.cc/2PAU-JYRA](https://perma.cc/2PAU-JYRA)
[^62]: Dominic Betts, Julián Domínguez, Grigori Melnik, Fernando Simonazzi, and Mani Subramanian. [*Exploring CQRS and Event Sourcing*](https://learn.microsoft.com/en-us/previous-versions/msp-n-p/jj554200%28v%3Dpandp.10%29). Microsoft Patterns & Practices, July 2012. ISBN: 1621140164, archived at [perma.cc/7A39-3NM8](https://perma.cc/7A39-3NM8)
[^63]: Greg Young. [CQRS and Event Sourcing](https://www.youtube.com/watch?v=JHGkaShoyNs). At *Code on the Beach*, August 2014.
[^64]: Greg Young. [CQRS Documents](https://cqrs.files.wordpress.com/2010/11/cqrs_documents.pdf). *cqrs.wordpress.com*, November 2010. Archived at [perma.cc/X5R6-R47F](https://perma.cc/X5R6-R47F)
[^65]: Devin Petersohn, Stephen Macke, Doris Xin, William Ma, Doris Lee, Xiangxi Mo, Joseph E. Gonzalez, Joseph M. Hellerstein, Anthony D. Joseph, and Aditya Parameswaran. [Towards Scalable Dataframe Systems](https://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 11, pages 2033–2046. [doi:10.14778/3407790.3407807](https://doi.org/10.14778/3407790.3407807)
[^66]: Stavros Papadopoulos, Kushal Datta, Samuel Madden, and Timothy Mattson. [The TileDB Array Data Storage Manager](https://www.vldb.org/pvldb/vol10/p349-papadopoulos.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 4, pages 349–360, November 2016. [doi:10.14778/3025111.3025117](https://doi.org/10.14778/3025111.3025117)
[^67]: Florin Rusu. [Multidimensional Array Data Management](https://faculty.ucmerced.edu/frusu/Papers/Report/2022-09-fntdb-arrays.pdf). *Foundations and Trends in Databases*, volume 12, numbers 2–3, pages 69–220, February 2023. [doi:10.1561/1900000069](https://doi.org/10.1561/1900000069)
[^68]: Ed Targett. [Bloomberg, Man Group team up to develop open source “ArcticDB” database](https://www.thestack.technology/bloomberg-man-group-arcticdb-database-dataframe/). *thestack.technology*, March 2023. Archived at [perma.cc/M5YD-QQYV](https://perma.cc/M5YD-QQYV)
[^69]: Dennis A. Benson, Ilene Karsch-Mizrachi, David J. Lipman, James Ostell, and David L. Wheeler. [GenBank](https://academic.oup.com/nar/article/36/suppl_1/D25/2507746). *Nucleic Acids Research*, volume 36, database issue, pages D25–D30, December 2007. [doi:10.1093/nar/gkm929](https://doi.org/10.1093/nar/gkm929)

================================================
FILE: content/tw/ch4.md
================================================
---
title: "4. 儲存與檢索"
weight: 104
breadcrumbs: false
---

<a id="ch_storage"></a>

![](/map/ch03.png)

> *生活的苦惱之一是，每個人對事物的命名都有些偏差。這讓我們理解世界變得比本該有的樣子困難一些，要是命名方式不同就好了。計算機的主要功能並不是傳統意義上的計算，比如算術運算。[……] 它們主要是歸檔系統。*
>
> [理查德·費曼](https://www.youtube.com/watch?v=EKWGGDXe5MA&t=296s)，
> *特立獨行的思考* 研討會（1985）

在最基礎的層面上，資料庫需要做兩件事：當你給它一些資料時，它應該儲存這些資料；當你之後再詢問時，它應該把資料返回給你。

在 [第 3 章](/tw/ch3#ch_datamodels) 中，我們討論了資料模型和查詢語言 —— 即你向資料庫提供資料的格式，以及之後再次請求資料的介面。在本章中，我們從資料庫的角度討論同樣的問題：資料庫如何儲存你提供的資料，以及當你請求時如何再次找到這些資料。

作為應用開發者，你為什麼要關心資料庫內部如何處理儲存和檢索？你可能不會從頭開始實現自己的儲存引擎，但你 *確實* 需要從眾多可用的儲存引擎中選擇一個適合你應用的。為了讓儲存引擎在你的工作負載型別上表現良好，你需要對儲存引擎在底層做了什麼有個大致的瞭解。

特別是，針對事務型工作負載（OLTP）最佳化的儲存引擎和針對分析型工作負載最佳化的儲存引擎之間存在巨大差異（我們在 ["分析型與事務型系統"](/tw/ch1#sec_introduction_analytics) 中介紹了這種區別）。本章首先研究兩種用於 OLTP 的儲存引擎家族：寫入不可變資料檔案的 *日誌結構* 儲存引擎，以及像 *B 樹* 這樣就地更新資料的儲存引擎。這些結構既用於鍵值儲存，也用於二級索引。

隨後在 ["分析型資料儲存"](#sec_storage_analytics) 中，我們將討論一系列針對分析最佳化的儲存引擎；在 ["多維索引與全文索引"](#sec_storage_multidimensional) 中，我們將簡要介紹用於更高階查詢（如文字檢索）的索引。

## OLTP 系統的儲存與索引 {#sec_storage_oltp}

考慮世界上最簡單的資料庫，用兩個 Bash 函式實現：

```bash
#!/bin/bash

db_set () {
  echo "$1,$2" >> database
}

db_get () {
  grep "^$1," database | sed -e "s/^$1,//" | tail -n 1
}
```

這兩個函式實現了一個鍵值儲存。你可以呼叫 `db_set key value`，它將在資料庫中儲存 `key` 和 `value`。鍵和值可以是（幾乎）任何你喜歡的內容 —— 例如，值可以是一個 JSON 文件。然後你可以呼叫 `db_get key`，它會查詢與該特定鍵關聯的最新值並返回它。

麻雀雖小，五臟俱全：

```bash
$ db_set 12 '{"name":"London","attractions":["Big Ben","London Eye"]}'

$ db_set 42 '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
```

儲存格式非常簡單：一個文字檔案，每行包含一個鍵值對，用逗號分隔（大致類似 CSV 檔案，忽略轉義問題）。每次呼叫 `db_set` 都會追加到檔案末尾。如果你多次更新一個鍵，舊版本的值不會被覆蓋 —— 你需要檢視檔案中鍵的最後一次出現來找到最新值（因此 `db_get` 中使用了 `tail -n 1`）：

```bash
$ db_set 42 '{"name":"San Francisco","attractions":["Exploratorium"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Exploratorium"]}

$ cat database
12,{"name":"London","attractions":["Big Ben","London Eye"]}
42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
42,{"name":"San Francisco","attractions":["Exploratorium"]}

```

對於如此簡單的實現，`db_set` 函式實際上有相當好的效能，因為追加到檔案通常非常高效。與 `db_set` 所做的類似，許多資料庫內部使用 *日誌*，這是一個僅追加的資料檔案。真正的資料庫有更多問題要處理（如處理併發寫入、回收磁碟空間以防日誌無限增長，以及從崩潰中恢復時處理部分寫入的記錄），但基本原理是相同的。日誌非常有用，我們將在本書中多次遇到它們。

---------

> [!NOTE]
> *日誌* 這個詞通常用於指應用程式日誌，應用程式輸出描述正在發生什麼的文字。在本書中，*日誌* 用於更一般的含義：磁碟上僅追加的記錄序列。它不一定是人類可讀的；它可能是二進位制的，僅供資料庫系統內部使用。

--------


另一方面，如果你的資料庫中有大量記錄，`db_get` 函式的效能會很糟糕。每次你想查詢一個鍵時，`db_get` 必須從頭到尾掃描整個資料庫檔案，尋找該鍵的出現。用演算法術語來說，查詢的成本是 *O*(*n*)：如果你的資料庫中的記錄數 *n* 翻倍，查詢時間也會翻倍。這並不好。

為了高效地找到資料庫中特定鍵的值，我們需要一個不同的資料結構：*索引*。在本章中，我們將研究一系列索引結構並瞭解它們的比較；一般思想是以特定方式（例如，按某個鍵排序）構建資料，使定位所需資料更快。如果你想以幾種不同的方式搜尋相同的資料，你可能需要在資料的不同部分上建立幾個不同的索引。

索引是從主資料派生出的 *額外* 結構。許多資料庫允許你新增和刪除索引，這不會影響資料庫的內容；它只影響查詢的效能。維護額外的結構會產生開銷，特別是在寫入時。對於寫入，很難超越簡單地追加到檔案的效能，因為這是最簡單的寫入操作。任何型別的索引通常都會減慢寫入速度，因為每次寫入資料時也需要更新索引。

這是儲存系統中的一個重要權衡：精心選擇的索引加快了讀查詢速度，但每個索引都會消耗額外的磁碟空間並減慢寫入速度，有時會大幅減慢 [^1]。因此，資料庫通常不會預設為所有內容建立索引，而是要求你 —— 編寫應用程式或管理資料庫的人 —— 使用你對應用程式典型查詢模式的瞭解來手動選擇索引。然後你可以選擇為你的應用程式帶來最大收益的索引，而不會引入超過必要的寫入開銷。

### 日誌結構儲存 {#sec_storage_log_structured}

首先，讓我們假設你想繼續將資料儲存在 `db_set` 寫入的僅追加檔案中，你只是想加快讀取速度。一種方法是在記憶體中保留一個雜湊對映，其中每個鍵都對映到檔案中可以找到該鍵最新值的位元組偏移量，如 [圖 4-1](#fig_storage_csv_hash_index) 所示。

{{< figure src="/fig/ddia_0401.png" id="fig_storage_csv_hash_index" caption="圖 4-1. 以類似 CSV 格式儲存鍵值對日誌，使用記憶體雜湊對映建立索引。" class="w-full my-4" >}}

每當你向檔案追加新的鍵值對時，你也會更新雜湊對映以反映剛剛寫入資料的偏移量。當你想查詢一個值時，你使用雜湊對映找到日誌檔案中的偏移量，尋找到該位置，然後讀取值。如果資料檔案的那部分已經在檔案系統快取中，讀取根本不需要任何磁碟 I/O。

這種方法速度更快，但仍然存在幾個問題：

* 你永遠不會釋放被覆蓋的舊日誌條目佔用的磁碟空間；如果你不斷寫入資料庫，可能會耗盡磁碟空間。
* 雜湊對映不是持久化的，所以當你重啟資料庫時必須重建它 —— 例如，透過掃描整個日誌檔案來找到每個鍵的最新位元組偏移量。如果你有大量資料，這會使重啟變慢。
* 雜湊表必須適合記憶體。原則上，你可以在磁碟上維護雜湊表，但不幸的是，很難讓磁碟上的雜湊對映表現良好。它需要大量的隨機訪問 I/O，當它變滿時擴充套件成本高昂，雜湊衝突需要複雜的邏輯 [^2]。
* 範圍查詢效率不高。例如，你不能輕鬆掃描 `10000` 和 `19999` 之間的所有鍵 —— 你必須在雜湊對映中單獨查詢每個鍵。

#### SSTable 檔案格式 {#the-sstable-file-format}

實際上，雜湊表很少用於資料庫索引，相反，保持資料 *按鍵排序* 的結構更為常見 [^3]。這種結構的一個例子是 *排序字串表*（*Sorted String Table*），簡稱 *SSTable*，如 [圖 4-2](#fig_storage_sstable_index) 所示。這種檔案格式也儲存鍵值對，但它確保它們按鍵排序，每個鍵在檔案中只出現一次。

{{< figure src="/fig/ddia_0402.png" id="fig_storage_sstable_index" caption="圖 4-2. 帶有稀疏索引的 SSTable，允許查詢跳轉到正確的塊。" class="w-full my-4" >}}

現在你不需要在記憶體中保留所有鍵：你可以將 SSTable 中的鍵值對分組為幾千位元組的 *塊*，然後在索引中儲存每個塊的第一個鍵。這種只儲存部分鍵的索引稱為 *稀疏* 索引。這個索引儲存在 SSTable 的單獨部分，例如使用不可變 B 樹、字典樹或其他允許查詢快速查詢特定鍵的資料結構 [^4]。

例如，在 [圖 4-2](#fig_storage_sstable_index) 中，一個塊的第一個鍵是 `handbag`，下一個塊的第一個鍵是 `handsome`。現在假設你要查詢鍵 `handiwork`，它沒有出現在稀疏索引中。由於排序，你知道 `handiwork` 必須出現在 `handbag` 和 `handsome` 之間。這意味著你可以尋找到 `handbag` 的偏移量，然後從那裡掃描檔案，直到找到 `handiwork`（或沒有，如果該鍵不在檔案中）。幾千位元組的塊可以非常快速地掃描。

此外，每個記錄塊都可以壓縮（在 [圖 4-2](#fig_storage_sstable_index) 中用陰影區域表示）。除了節省磁碟空間外，壓縮還減少了 I/O 頻寬使用，代價是使用更多一點的 CPU 時間。

#### 構建和合並 SSTable {#constructing-and-merging-sstables}

SSTable 檔案格式在讀取方面比僅追加日誌更好，但它使寫入更加困難。我們不能簡單地追加到末尾，因為那樣檔案就不再有序了（除非鍵恰好按升序寫入）。如果我們每次在中間某處插入鍵時都必須重寫整個 SSTable，寫入將變得太昂貴。

我們可以用 *日誌結構* 方法解決這個問題，這是僅追加日誌和排序檔案之間的混合：

1. 當寫入操作到來時，將其新增到記憶體中的有序對映資料結構中，例如紅黑樹、跳錶 [^5] 或字典樹 [^6]。使用這些資料結構，你可以按任意順序插入鍵，高效地查詢它們，並按排序順序讀回它們。這個記憶體資料結構稱為 *記憶體表*（*memtable*）。
2. 當記憶體表變得大於某個閾值（通常是幾兆位元組）時，將其按排序順序作為 SSTable 檔案寫入磁碟。我們將這個新的 SSTable 檔案稱為資料庫的最新 *段*，它與舊段一起作為單獨的檔案儲存。每個段都有自己內容的單獨索引。當新段被寫入磁碟時，資料庫可以繼續寫入新的記憶體表例項，當 SSTable 寫入完成時，舊記憶體表的記憶體被釋放。
3. 為了讀取某個鍵的值，首先嘗試在記憶體表和最新的磁碟段中找到該鍵。如果沒有找到，就在下一個較舊的段中查詢，依此類推，直到找到鍵或到達最舊的段。如果鍵沒有出現在任何段中，則它不存在於資料庫中。
4. 不時地在後臺執行合併和壓實過程，以合併段檔案並丟棄被覆蓋或刪除的值。

合併段的工作方式類似於 *歸併排序* 演算法 [^5]。該過程如 [圖 4-3](#fig_storage_sstable_merging) 所示：並排開始讀取輸入檔案，檢視每個檔案中的第一個鍵，將最低的鍵（根據排序順序）複製到輸出檔案，然後重複。如果同一個鍵出現在多個輸入檔案中，只保留較新的值。這會產生一個新的合併段檔案，也按鍵排序，每個鍵只有一個值，並且它使用最少的記憶體，因為我們可以一次遍歷一個鍵的 SSTable。

{{< figure src="/fig/ddia_0403.png" id="fig_storage_sstable_merging" caption="圖 4-3. 合併多個 SSTable 段，僅保留每個鍵的最新值。" class="w-full my-4" >}}

為了確保資料庫崩潰時記憶體表中的資料不會丟失，儲存引擎在磁碟上保留一個單獨的日誌，每次寫入都會立即追加到該日誌中。此日誌不按鍵排序，但這無關緊要，因為它的唯一目的是在崩潰後恢復記憶體表。每次記憶體表被寫出到 SSTable 後，日誌的相應部分就可以丟棄。

如果你想刪除一個鍵及其關聯的值，你必須向資料檔案追加一個稱為 *墓碑*（*tombstone*）的特殊刪除記錄。當日誌段合併時，墓碑告訴合併過程丟棄已刪除鍵的任何先前值。一旦墓碑合併到最舊的段中，它就可以被丟棄。

這裡描述的演算法本質上就是 RocksDB [^7]、Cassandra、Scylla 和 HBase [^8] 中使用的演算法，它們都受到 Google 的 Bigtable 論文 [^9] 的啟發（該論文引入了 *SSTable* 和 *memtable* 這兩個術語）。

該演算法最初於 1996 年以 *日誌結構合併樹*（*Log-Structured Merge-Tree*）或 *LSM 樹*（*LSM-Tree*）[^10] 的名稱釋出，建立在早期日誌結構檔案系統工作的基礎上 [^11]。因此，基於合併和壓實排序檔案原理的儲存引擎通常被稱為 *LSM 儲存引擎*。

在 LSM 儲存引擎中，段檔案是一次性寫入的（透過寫出記憶體表或合併一些現有段），此後它是不可變的。段的合併和壓實可以在後臺執行緒中完成，當它進行時，我們仍然可以使用舊的段檔案繼續提供讀取服務。當合並過程完成時，我們將讀取請求切換到使用新的合併段而不是舊段，然後可以刪除舊的段檔案。

段檔案不一定必須儲存在本地磁碟上：它們也非常適合寫入物件儲存。例如，SlateDB 和 Delta Lake [^12] 採用了這種方法。

具有不可變段檔案也簡化了崩潰恢復：如果在寫出記憶體表或合併段時發生崩潰，資料庫可以刪除未完成的 SSTable 並重新開始。將寫入持久化到記憶體表的日誌如果在寫入記錄的過程中發生崩潰，或者磁碟已滿，可能包含不完整的記錄；這些通常透過在日誌中包含校驗和來檢測，並丟棄損壞或不完整的日誌條目。我們將在 [第 8 章](/tw/ch8#ch_transactions) 中更多地討論永續性和崩潰恢復。

<a id="sec_storage_bloom_filter"></a>

#### 布隆過濾器 {#bloom-filters}

使用 LSM 儲存，讀取很久以前更新的鍵或不存在的鍵可能會很慢，因為儲存引擎需要檢查多個段檔案。為了加快此類讀取，LSM 儲存引擎通常在每個段中包含一個 *布隆過濾器*（*Bloom filter*）[^13]，它提供了一種快速但近似的方法來檢查特定鍵是否出現在特定 SSTable 中。

[圖 4-4](#fig_storage_bloom) 顯示了一個包含兩個鍵和 16 位的布隆過濾器示例（實際上，它會包含更多的鍵和更多的位）。對於 SSTable 中的每個鍵，我們計算一個雜湊函式，產生一組數字，然後將其解釋為位陣列的索引 [^14]。我們將對應於這些索引的位設定為 1，其餘保持為 0。例如，鍵 `handbag` 雜湊為數字 (2, 9, 4)，所以我們將第 2、9 和 4 位設定為 1。然後將點陣圖與鍵的稀疏索引一起儲存為 SSTable 的一部分。這需要一點額外的空間，但與 SSTable 的其餘部分相比，布隆過濾器通常很小。

{{< figure src="/fig/ddia_0404.png" id="fig_storage_bloom" caption="圖 4-4. 布隆過濾器提供了一種快速的機率檢查，用於判斷特定鍵是否存在於特定 SSTable 中。" class="w-full my-4" >}}

當我們想知道一個鍵是否出現在 SSTable 中時，我們像以前一樣計算該鍵的相同雜湊，並檢查這些索引處的位。例如，在 [圖 4-4](#fig_storage_bloom) 中，我們查詢鍵 `handheld`，它雜湊為 (6, 11, 2)。其中一個位是 1（即第 2 位），而另外兩個是 0。這些檢查可以使用所有 CPU 都支援的位運算非常快速地進行。

如果至少有一個位是 0，我們知道該鍵肯定不在 SSTable 中。如果查詢中的位都是 1，那麼該鍵很可能在 SSTable 中，但也有可能是巧合，所有這些位都被其他鍵設定為 1。這種看起來鍵存在但實際上不存在的情況稱為 *假陽性*（*false positive*）。

假陽性的機率取決於鍵的數量、每個鍵設定的位數和布隆過濾器中的總位數。你可以使用線上計算器工具為你的應用計算出正確的引數 [^15]。作為經驗法則，你需要為 SSTable 中的每個鍵分配 10 位布隆過濾器空間以獲得 1% 的假陽性機率，每為每個鍵分配額外的 5 位，機率就會降低十倍。

在 LSM 儲存引擎的上下文中，假陽性沒有問題：

* 如果布隆過濾器說鍵 *不* 存在，我們可以安全地跳過該 SSTable，因為我們可以確定它不包含該鍵。
* 如果布隆過濾器說鍵 *存在*，我們必須查詢稀疏索引並解碼鍵值對塊以檢查鍵是否真的在那裡。如果是假陽性，我們做了一些不必要的工作，但除此之外沒有害處 —— 我們只是繼續使用下一個最舊的段進行搜尋。

#### 壓實策略 {#sec_storage_lsm_compaction}

一個重要的細節是 LSM 儲存如何選擇何時執行壓實，以及在壓實中包括哪些 SSTable。許多基於 LSM 的儲存系統允許你配置使用哪種壓實策略，一些常見的選擇是 [^16] [^17]：

分層壓實（Size-tiered compaction）
: 較新和較小的 SSTable 依次合併到較舊和較大的 SSTable 中。包含較舊資料的 SSTable 可能變得非常大，合併它們需要大量的臨時磁碟空間。這種策略的優點是它可以處理非常高的寫入吞吐量。

分級壓實（Leveled compaction）
: 鍵範圍被分成較小的 SSTable，較舊的資料被移動到單獨的"級別"中，這允許壓實更增量地進行，並且比分層策略使用更少的磁碟空間。這種策略對於讀取比分層壓實更有效，因為儲存引擎需要讀取更少的 SSTable 來檢查它們是否包含該鍵。

作為經驗法則，如果你主要有寫入而讀取很少，分層壓實表現更好，而如果你的工作負載以讀取為主，分級壓實表現更好。如果你頻繁寫入少量鍵，而很少寫入大量鍵，那麼分級壓實也可能有優勢 [^18]。

儘管有許多細微之處，但 LSM 樹的基本思想 —— 保持在後臺合併的 SSTable 級聯 —— 簡單而有效。我們將在 ["比較 B 樹與 LSM 樹"](#sec_storage_btree_lsm_comparison) 中更詳細地討論它們的效能特徵。

--------

<a id="sidebar_embedded"></a>

> [!TIP] 嵌入式儲存引擎

許多資料庫作為接受網路查詢的服務執行，但也有 *嵌入式* 資料庫不公開網路 API。相反，它們是在與應用程式程式碼相同的程序中執行的庫，通常讀取和寫入本地磁碟上的檔案，你透過正常的函式呼叫與它們互動。嵌入式儲存引擎的例子包括 RocksDB、SQLite、LMDB、DuckDB 和 KùzuDB [^19]。

嵌入式資料庫在移動應用中非常常用，用於儲存本地使用者的資料。在後端，如果資料足夠小以適合單臺機器，並且沒有太多併發事務，它們可能是一個合適的選擇。例如，在多租戶系統中，如果每個租戶足夠小且完全與其他租戶分離（即，你不需要執行合併多個租戶資料的查詢），你可能可以為每個租戶使用單獨的嵌入式資料庫例項 [^20]。

我們在本章討論的儲存和檢索方法既用於嵌入式資料庫，也用於客戶端-伺服器資料庫。在 [第 6 章](/tw/ch6#ch_replication) 和 [第 7 章](/tw/ch7#ch_sharding) 中，我們將討論跨多臺機器擴充套件資料庫的技術。

--------

### B 樹 {#sec_storage_b_trees}

日誌結構方法很流行，但它不是鍵值儲存的唯一形式。按鍵讀取和寫入資料庫記錄最廣泛使用的結構是 *B 樹*。

B 樹於 1970 年引入 [^21]，不到 10 年後就被稱為"無處不在"[^22]，它們經受住了時間的考驗。它們仍然是幾乎所有關係資料庫中的標準索引實現，許多非關係資料庫也使用它們。

像 SSTable 一樣，B 樹按鍵保持鍵值對排序，這允許高效的鍵值查詢和範圍查詢。但相似之處到此為止：B 樹有著非常不同的設計理念。

我們之前看到的日誌結構索引將資料庫分解為可變大小的 *段*，通常為幾兆位元組或更大，寫入一次後就不可變。相比之下，B 樹將資料庫分解為固定大小的 *塊* 或 *頁*，並可能就地覆蓋頁。頁傳統上大小為 4 KiB，但 PostgreSQL 現在預設使用 8 KiB，MySQL 預設使用 16 KiB。

每個頁都可以使用頁號來標識，這允許一個頁引用另一個頁 —— 類似於指標，但在磁碟上而不是在記憶體中。如果所有頁都儲存在同一個檔案中，將頁號乘以頁大小就給我們檔案中頁所在位置的位元組偏移量。我們可以使用這些頁引用來構建頁樹，如 [圖 4-5](#fig_storage_b_tree) 所示。

{{< figure src="/fig/ddia_0405.png" id="fig_storage_b_tree" caption="圖 4-5. 使用 B 樹索引查詢鍵 251。從根頁開始，我們首先跟隨引用到鍵 200–300 的頁，然後是鍵 250–270 的頁。" class="w-full my-4" >}}

一個頁被指定為 B 樹的 *根*；每當你想在索引中查詢一個鍵時，你就從這裡開始。該頁包含幾個鍵和對子頁的引用。每個子頁負責一個連續的鍵範圍，引用之間的鍵指示這些範圍之間的邊界在哪裡。（這種結構有時稱為 B+ 樹，但我們不需要將其與其他 B 樹變體區分開來。）

在 [圖 4-5](#fig_storage_b_tree) 的例子中，我們正在查詢鍵 251，所以我們知道我們需要跟隨邊界 200 和 300 之間的頁引用。這將我們帶到一個看起來相似的頁，該頁進一步將 200–300 範圍分解為子範圍。最終我們到達包含單個鍵的頁（*葉頁*），該頁要麼內聯包含每個鍵的值，要麼包含對可以找到值的頁的引用。

B 樹的一個頁中對子頁的引用數稱為 *分支因子*。例如，在 [圖 4-5](#fig_storage_b_tree) 中，分支因子為六。實際上，分支因子取決於儲存頁引用和範圍邊界所需的空間量，但通常為幾百。

如果你想更新 B 樹中現有鍵的值，你搜索包含該鍵的葉頁，並用包含新值的版本覆蓋磁碟上的該頁。如果你想新增一個新鍵，你需要找到其範圍包含新鍵的頁並將其新增到該頁。如果頁中沒有足夠的空閒空間來容納新鍵，則頁被分成兩個半滿的頁，並更新父頁以說明鍵範圍的新細分。

{{< figure src="/fig/ddia_0406.png" id="fig_storage_b_tree_split" caption="圖 4-6. 透過在邊界鍵 337 上分割頁來增長 B 樹。父頁被更新以引用兩個子頁。" class="w-full my-4" >}}

在 [圖 4-6](#fig_storage_b_tree_split) 的例子中，我們想插入鍵 334，但範圍 333–345 的頁已經滿了。因此，我們將其分成範圍 333–337（包括新鍵）的頁和 337–344 的頁。我們還必須更新父頁以引用兩個子頁，它們之間的邊界值為 337。如果父頁沒有足夠的空間容納新引用，它也可能需要被分割，分割可以一直持續到樹的根。當根被分割時，我們在它上面建立一個新根。刪除鍵（可能需要合併節點）更複雜 [^5]。

這個演算法確保樹保持 *平衡*：具有 *n* 個鍵的 B 樹始終具有 *O*(log *n*) 的深度。大多數資料庫可以適合三或四層深的 B 樹，所以你不需要跟隨許多頁引用來找到你要查詢的頁。（具有 500 分支因子的 4 KiB 頁的四層樹可以儲存多達 250 TB。）

#### 使 B 樹可靠 {#sec_storage_btree_wal}

B 樹的基本底層寫操作是用新資料覆蓋磁碟上的頁。假設覆蓋不會改變頁的位置；即，當頁被覆蓋時，對該頁的所有引用保持不變。這與日誌結構索引（如 LSM 樹）形成鮮明對比，後者只追加到檔案（並最終刪除過時的檔案），但從不就地修改檔案。

一次覆蓋多個頁，如在頁分割中，是一個危險的操作：如果資料庫在只寫入了部分頁後崩潰，你最終會得到一個損壞的樹（例如，可能有一個 *孤立* 頁，它不是任何父頁的子頁）。如果硬體不能原子地寫入整個頁，你也可能最終得到部分寫入的頁（這稱為 *撕裂頁*（*torn page*）[^23]）。

為了使資料庫對崩潰具有彈性，B 樹實現通常包括磁碟上的額外資料結構：*預寫日誌*（*write-ahead log*，WAL）。這是一個僅追加檔案，每個 B 樹修改必須在應用於樹本身的頁之前寫入其中。當資料庫在崩潰後恢復時，此日誌用於將 B 樹恢復到一致狀態 [^2] [^24]。在檔案系統中，等效機制稱為 *日誌記錄*（*journaling*）。

為了提高效能，B 樹實現通常不會立即將每個修改的頁寫入磁碟，而是首先將 B 樹頁緩衝在記憶體中一段時間。預寫日誌還確保在崩潰的情況下資料不會丟失：只要資料已寫入 WAL，並使用 `fsync()` 系統呼叫重新整理到磁碟，資料就是持久的，因為資料庫將能夠在崩潰後恢復它 [^25]。

#### B 樹變體 {#b-tree-variants}

由於 B 樹已經存在了很長時間，多年來已經開發了許多變體。僅舉幾個例子：

* 一些資料庫（如 LMDB）使用寫時複製方案 [^26]，而不是覆蓋頁並維護 WAL 以進行崩潰恢復。修改的頁被寫入不同的位置，並建立樹中父頁的新版本，指向新位置。這種方法對於併發控制也很有用，我們將在 ["快照隔離和可重複讀"](/tw/ch8#sec_transactions_snapshot_isolation) 中看到。
* 我們可以透過不儲存整個鍵而是縮寫它來節省頁中的空間。特別是在樹內部的頁中，鍵只需要提供足夠的資訊來充當鍵範圍之間的邊界。在頁中打包更多鍵允許樹具有更高的分支因子，從而減少層數。
* 為了加快按排序順序掃描鍵範圍，一些 B 樹實現嘗試佈局樹，使葉頁按順序出現在磁碟上，減少磁碟尋道次數。然而，隨著樹的增長，很難維持這種順序。
* 已向樹添加了其他指標。例如，每個葉頁可能有對其左右兄弟頁的引用，這允許按順序掃描鍵而無需跳回父頁。

### 比較 B 樹與 LSM 樹 {#sec_storage_btree_lsm_comparison}

作為經驗法則，LSM 樹更適合寫入密集型應用，而 B 樹對讀取更快 [^27] [^28]。然而，基準測試通常對工作負載的細節很敏感。你需要使用特定的工作負載測試系統，以便進行有效的比較。此外，這不是 LSM 和 B 樹之間的嚴格二選一選擇：儲存引擎有時會混合兩種方法的特徵，例如具有多個 B 樹並以 LSM 風格合併它們。在本節中，我們將簡要討論在衡量儲存引擎效能時值得考慮的幾件事。

#### 讀取效能 {#read-performance}

在 B 樹中，查詢鍵涉及在 B 樹的每個層級讀取一個頁。由於層級數通常很小，這意味著從 B 樹讀取通常很快並且具有可預測的效能。在 LSM 儲存引擎中，讀取通常必須檢查處於不同壓實階段的幾個不同 SSTable，但布隆過濾器有助於減少所需的實際磁碟 I/O 操作次數。兩種方法都可以表現良好，哪個更快取決於儲存引擎的細節和工作負載。

範圍查詢在 B 樹上簡單而快速，因為它們可以使用樹的排序結構。在 LSM 儲存上，範圍查詢也可以利用 SSTable 排序，但它們需要並行掃描所有段並組合結果。布隆過濾器對範圍查詢沒有幫助（因為你需要計算範圍內每個可能鍵的雜湊，這是不切實際的），使得範圍查詢在 LSM 方法中比點查詢更昂貴 [^29]。

如果記憶體表填滿，高寫入吞吐量可能會導致日誌結構儲存引擎中的延遲峰值。如果資料無法足夠快地寫入磁碟，可能是因為壓實過程無法跟上傳入的寫入，就會發生這種情況。許多儲存引擎，包括 RocksDB，在這種情況下執行 *背壓*：它們暫停所有讀取和寫入，直到記憶體表被寫入磁碟 [^30] [^31]。

關於讀取吞吐量，現代 SSD（特別是 NVMe）可以並行執行許多獨立的讀請求。LSM 樹和 B 樹都能夠提供高讀取吞吐量，但儲存引擎需要仔細設計以利用這種並行性 [^32]。

#### 順序與隨機寫入 {#sidebar_sequential}

使用 B 樹時，如果應用程式寫入的鍵分散在整個鍵空間中，生成的磁碟操作也會隨機分散，因為儲存引擎需要覆蓋的頁可能位於磁碟的任何位置。另一方面，日誌結構儲存引擎一次寫入整個段檔案（無論是寫出記憶體表還是壓實現有段），這比 B 樹中的頁大得多。

許多小的、分散的寫入模式（如 B 樹中的）稱為 *隨機寫入*，而較少的大寫入模式（如 LSM 樹中的）稱為 *順序寫入*。磁碟通常具有比隨機寫入更高的順序寫入吞吐量，這意味著日誌結構儲存引擎通常可以在相同硬體上處理比 B 樹更高的寫入吞吐量。這種差異在旋轉磁碟硬碟（HDD）上特別大；在今天大多數資料庫使用的固態硬碟（SSD）上，差異較小，但仍然明顯（參見 ["SSD 上的順序與隨機寫入"](#sidebar_sequential)）。

--------

> [!TIP] SSD 上的順序與隨機寫入

在旋轉磁碟硬碟（HDD）上，順序寫入比隨機寫入快得多：隨機寫入必須機械地將磁頭移動到新位置，並等待碟片的正確部分經過磁頭下方，這需要幾毫秒 —— 在計算時間尺度上是永恆的。然而，SSD（固態硬碟）包括 NVMe（Non-Volatile Memory Express，即連線到 PCI Express 匯流排的快閃記憶體）現在已經在許多場景中超越了 HDD，它們不受這種機械限制。

儘管如此，SSD 對順序寫入的吞吐量也高於隨機寫入。原因是快閃記憶體可以一次讀取或寫入一頁（通常為 4 KiB），但只能一次擦除一個塊（通常為 512 KiB）。塊中的某些頁可能包含有效資料，而其他頁可能包含不再需要的資料。在擦除塊之前，控制器必須首先將包含有效資料的頁移動到其他塊中；這個過程稱為 *垃圾回收*（GC）[^33]。

順序寫入工作負載一次寫入更大的資料塊，因此整個 512 KiB 塊很可能屬於單個檔案；當該檔案稍後再次被刪除時，整個塊可以被擦除而無需執行任何 GC。另一方面，對於隨機寫入工作負載，塊更可能包含有效和無效資料頁的混合，因此 GC 必須在塊可以擦除之前執行更多工作 [^34] [^35] [^36]。

GC 消耗的寫入頻寬就不能用於應用程式。此外，GC 執行的額外寫入會導致快閃記憶體磨損；因此，隨機寫入比順序寫入更快地磨損驅動器。

--------

#### 寫放大 {#write-amplification}

對於任何型別的儲存引擎，來自應用程式的一次寫請求都會轉換為底層磁碟上的多個 I/O 操作。對於 LSM 樹，一個值首先被寫入日誌以保證永續性，然後在記憶體表寫入磁碟時再次寫入，並且每次鍵值對參與壓實時再次寫入。（如果值明顯大於鍵，可以透過將值與鍵分開儲存，並僅對包含鍵和值引用的 SSTable 執行壓實來減少這種開銷 [^37]。）

B 樹索引必須至少寫入每條資料兩次：一次寫入預寫日誌，一次寫入樹頁本身。此外，它們有時需要寫出整個頁，即使該頁中只有幾個位元組發生了變化，以確保 B 樹在崩潰或斷電後可以正確恢復 [^38] [^39]。

如果你獲取在某個工作負載中寫入磁碟的總位元組數，然後除以如果你只是寫入沒有索引的僅追加日誌需要寫入的位元組數，你就得到了 *寫放大*。（有時寫放大是根據 I/O 操作而不是位元組來定義的。）在寫入密集型應用程式中，瓶頸可能是資料庫可以寫入磁碟的速率。在這種情況下，寫放大越高，它在可用磁碟頻寬內可以處理的每秒寫入次數就越少。

寫放大是 LSM 樹和 B 樹中的問題。哪個更好取決於各種因素，例如鍵和值的長度，以及你覆蓋現有鍵與插入新鍵的頻率。對於典型的工作負載，LSM 樹往往具有較低的寫放大，因為它們不必寫入整個頁，並且可以壓縮 SSTable 的塊 [^40]。這是使 LSM 儲存引擎非常適合寫入密集型工作負載的另一個因素。

除了影響吞吐量，寫放大也與 SSD 的磨損有關：寫放大較低的儲存引擎將更慢地磨損 SSD。

在測量儲存引擎的寫入吞吐量時，重要的是要執行足夠長的實驗，以便寫放大的影響變得清晰。當寫入空的 LSM 樹時，還沒有進行壓實，因此所有磁碟頻寬都可用於新寫入。隨著資料庫的增長，新寫入需要與壓實共享磁碟頻寬。

#### 磁碟空間使用 {#disk-space-usage}

B 樹可能會隨著時間的推移變得 *碎片化*：例如，如果刪除了大量鍵，資料庫檔案可能包含許多 B 樹不再使用的頁。對 B 樹的後續新增可以使用這些空閒頁，但它們不能輕易地返回給作業系統，因為它們在檔案的中間，所以它們仍然佔用檔案系統上的空間。因此，資料庫需要一個後臺過程來移動頁以更好地放置它們，例如 PostgreSQL 中的真空過程 [^25]。

碎片化在 LSM 樹中不太成問題，因為壓實過程無論如何都會定期重寫資料檔案，而且 SSTable 沒有未使用空間的頁。此外，SSTable 中的鍵值對塊可以更好地壓縮，因此通常比 B 樹在磁碟上產生更小的檔案。被覆蓋的鍵和值繼續消耗空間，直到它們被壓實刪除，但使用分級壓實時，這種開銷相當低 [^40] [^41]。分層壓實（參見 ["壓實策略"](#sec_storage_lsm_compaction)）使用更多的磁碟空間，特別是在壓實期間臨時使用。

在磁碟上有一些資料的多個副本也可能是一個問題，當你需要刪除一些資料，並確信它真的已被刪除（也許是為了遵守資料保護法規）。例如，在大多數 LSM 儲存引擎中，已刪除的記錄可能仍然存在於較高級別中，直到代表刪除的墓碑透過所有壓實級別傳播，這可能需要很長時間。專門的儲存引擎設計可以更快地傳播刪除 [^42]。

另一方面，SSTable 段檔案的不可變性質在你想在某個時間點對資料庫進行快照時很有用（例如，用於備份或建立資料庫副本以進行測試）：你可以寫出記憶體表並記錄該時間點存在的段檔案。只要你不刪除快照的一部分的檔案，你就不需要實際複製它們。在其頁被覆蓋的 B 樹中，有效地進行這樣的快照更困難。


### 多列索引與二級索引 {#sec_storage_index_multicolumn}

到目前為止，我們只討論了鍵值索引，它們就像關係模型中的 *主鍵* 索引。主鍵唯一標識關係表中的一行，或文件資料庫中的一個文件，或圖資料庫中的一個頂點。資料庫中的其他記錄可以透過其主鍵（或 ID）引用該行/文件/頂點，索引用於解析此類引用。

擁有 *二級索引* 也非常常見。在關係資料庫中，你可以使用 `CREATE INDEX` 命令在同一個表上建立多個二級索引，允許你按主鍵以外的列進行搜尋。例如，在 [第 3 章](/tw/ch3#ch_datamodels) 的 [圖 3-1](/tw/ch3#fig_obama_relational) 中，你很可能在 `user_id` 列上有一個二級索引，以便你可以在每個表中找到屬於同一使用者的所有行。

二級索引可以很容易地從鍵值索引構建。主要區別在於，在二級索引中，索引值不一定是唯一的；也就是說，同一索引條目下可能有許多行（文件、頂點）。這可以透過兩種方式解決：要麼使索引中的每個值成為匹配行識別符號的列表（如全文索引中的倒排列表），要麼透過向其追加行識別符號使每個條目唯一。具有就地更新的儲存引擎（如 B 樹）和日誌結構儲存都可用於實現索引。

#### 在索引中儲存值 {#sec_storage_index_heap}

索引中的鍵是查詢搜尋的內容，但值可以是幾種東西之一：

* 如果實際資料（行、文件、頂點）直接儲存在索引結構中，則稱為 *聚簇索引*。例如，在 MySQL 的 InnoDB 儲存引擎中，表的主鍵始終是聚簇索引，在 SQL Server 中，你可以為每個表指定一個聚簇索引 [^43]。
* 或者，值可以是對實際資料的引用：要麼是相關行的主鍵（InnoDB 對二級索引這樣做），要麼是對磁碟上位置的直接引用。在後一種情況下，儲存行的地方稱為 *堆檔案*，它以無特定順序儲存資料（它可能是僅追加的，或者它可能跟蹤已刪除的行以便稍後用新資料覆蓋它們）。例如，Postgres 使用堆檔案方法 [^44]。
* 兩者之間的折中是 *覆蓋索引* 或 *包含列的索引*，它在索引中儲存表的 *某些* 列，除了在堆上或主鍵聚簇索引中儲存完整行 [^45]。這允許僅使用索引來回答某些查詢，而無需解析主鍵或檢視堆檔案（在這種情況下，索引被稱為 *覆蓋* 查詢）。這可以使某些查詢更快，但資料的重複意味著索引使用更多的磁碟空間並減慢寫入速度。

到目前為止討論的索引只將單個鍵對映到值。如果你需要同時查詢表的多個列（或文件中的多個欄位），請參見 ["多維索引與全文索引"](#sec_storage_multidimensional)。

當更新值而不更改鍵時，堆檔案方法可以允許記錄就地覆蓋，前提是新值不大於舊值。如果新值更大，情況會更複雜，因為它可能需要移動到堆中有足夠空間的新位置。在這種情況下，要麼所有索引都需要更新以指向記錄的新堆位置，要麼在舊堆位置留下轉發指標 [^2]。

### 全記憶體儲存 {#sec_storage_inmemory}

本章到目前為止討論的資料結構都是對磁碟限制的回應。與主記憶體相比，磁碟很難處理。對於磁碟和 SSD，如果你想在讀取和寫入上獲得良好的效能，磁碟上的資料需要仔細佈局。然而，我們容忍這種尷尬，因為磁碟有兩個顯著的優勢：它們是持久的（如果斷電，其內容不會丟失），並且它們每千兆位元組的成本比 RAM 低。

隨著 RAM 變得更便宜，按每 GB 計價的成本優勢正在減弱。許多資料集根本沒有那麼大，因此將它們完全保留在記憶體中是完全可行的，甚至可以分佈在幾臺機器上。這導致了 *記憶體資料庫* 的發展。

一些記憶體鍵值儲存，例如 Memcached，僅用於快取，如果機器重新啟動，資料丟失是可以接受的。但其他記憶體資料庫旨在實現永續性，這可以透過特殊硬體（例如電池供電的 RAM）、將更改日誌寫入磁碟、將定期快照寫入磁碟或將記憶體狀態複製到其他機器來實現。

當記憶體資料庫重新啟動時，它需要重新載入其狀態，要麼從磁碟，要麼透過網路從副本（除非使用特殊硬體）。儘管寫入磁碟，它仍然是一個記憶體資料庫，因為磁碟僅用作永續性的僅追加日誌，讀取完全從記憶體提供。寫入磁碟還具有操作優勢：磁碟上的檔案可以輕鬆備份、檢查和由外部實用程式分析。

VoltDB、SingleStore 和 Oracle TimesTen 等產品是具有關係模型的記憶體資料庫，供應商聲稱，透過消除管理磁碟資料結構相關的所有開銷，它們可以提供巨大的效能改進 [^46] [^47]。RAMCloud 是一個開源的記憶體鍵值儲存，具有永續性（對記憶體中的資料以及磁碟上的資料使用日誌結構方法）[^48]。

Redis 和 Couchbase 透過非同步寫入磁碟提供弱永續性。

反直覺的是，記憶體資料庫的效能優勢不是因為它們不需要從磁碟讀取。即使是基於磁碟的儲存引擎，如果你有足夠的記憶體，也可能永遠不需要從磁碟讀取，因為作業系統無論如何都會在記憶體中快取最近使用的磁碟塊。相反，它們可以更快，因為它們可以避免將記憶體資料結構編碼為可以寫入磁碟的形式的開銷 [^49]。

除了效能，記憶體資料庫的另一個有趣領域是提供了基於磁碟的索引難以實現的資料模型。例如，Redis 為各種資料結構（例如優先佇列和集合）提供類似資料庫的介面。因為它將所有資料保留在記憶體中，其實現相對簡單。


## 分析型資料儲存 {#sec_storage_analytics}

資料倉庫的資料模型最常見的是關係型，因為 SQL 通常非常適合分析查詢。有許多圖形化資料分析工具可以生成 SQL 查詢、視覺化結果，並允許分析師探索資料（透過 *下鑽* 和 *切片切塊* 等操作）。

表面上，資料倉庫和關係型 OLTP 資料庫看起來很相似，因為它們都有 SQL 查詢介面。然而，系統的內部可能看起來完全不同，因為它們針對非常不同的查詢模式進行了最佳化。許多資料庫供應商現在專注於支援事務處理或分析工作負載，但不是兩者兼而有之。

一些資料庫，如 Microsoft SQL Server、SAP HANA 和 SingleStore，在同一產品中支援事務處理和資料倉庫。然而，這些混合事務和分析處理（HTAP）資料庫（在 ["資料倉庫"](/tw/ch1#sec_introduction_dwh) 中介紹）越來越多地成為兩個獨立的儲存和查詢引擎，它們恰好可以透過通用的 SQL 介面訪問 [^50] [^51] [^52] [^53]。

### 雲資料倉庫 {#sec_cloud_data_warehouses}

Teradata、Vertica 和 SAP HANA 等資料倉庫供應商既銷售商業許可下的本地倉庫，也銷售基於雲的解決方案。但隨著他們的許多客戶轉向雲，新的雲資料倉庫（如 Google Cloud BigQuery、Amazon Redshift 和 Snowflake）也變得廣泛採用。與傳統資料倉庫不同，雲資料倉庫利用可擴充套件的雲基礎設施，如物件儲存和無伺服器計算平臺。

雲資料倉庫往往與其他雲服務更好地整合，並且更具彈性。例如，許多雲倉庫支援自動日誌攝取，並提供與資料處理框架（如 Google Cloud 的 Dataflow 或 Amazon Web Services 的 Kinesis）的輕鬆整合。這些倉庫也更具彈性，因為它們將查詢計算與儲存層解耦 [^54]。資料持久儲存在物件儲存而不是本地磁碟上，這使得可以獨立調整儲存容量和查詢的計算資源，正如我們之前在 ["雲原生系統架構"](/tw/ch1#sec_introduction_cloud_native) 中看到的。

Apache Hive、Trino 和 Apache Spark 等開源資料倉庫也隨著雲的發展而發展。隨著分析資料儲存轉移到物件儲存上的資料湖，開源倉庫也開始解耦拆分 [^55]。以下元件以前整合在單個系統（如 Apache Hive）中，現在通常作為單獨的元件實現：

查詢引擎
: Trino、Apache DataFusion 和 Presto 等查詢引擎解析 SQL 查詢，將其最佳化為執行計劃，並在資料上執行這些計劃。執行通常需要並行、分散式的資料處理任務。一些查詢引擎提供內建任務執行，而有些則選擇使用第三方執行框架，如 Apache Spark 或 Apache Flink。

儲存格式
: 儲存格式確定表的行如何編碼為檔案中的位元組，然後通常儲存在物件儲存或分散式檔案系統中 [^12]。然後查詢引擎可以訪問這些資料，但使用資料湖的其他應用程式也可以訪問。此類儲存格式的示例包括 Parquet、ORC、Lance 或 Nimble，我們將在下一節中看到更多關於它們的內容。

表格式
: 以 Apache Parquet 和類似儲存格式編寫的檔案一旦寫入通常就是不可變的。為了支援行插入和刪除，通常會使用 Apache Iceberg 或 Databricks Delta 等表格式。表格式規定了哪些檔案構成一張表，以及表模式的定義格式。此類格式還提供高階功能，例如時間旅行（查詢表在過去某個時間點狀態的能力）、垃圾回收，甚至事務。

資料目錄
: 就像表格式定義哪些檔案構成表一樣，資料目錄定義哪些表組成資料庫。目錄用於建立、重新命名和刪除表。與儲存和表格式不同，Snowflake 的 Polaris 和 Databricks 的 Unity Catalog 等資料目錄通常作為可以使用 REST 介面查詢的獨立服務執行。Apache Iceberg 也提供目錄，可以在客戶端內執行或作為單獨的程序執行。查詢引擎在讀取和寫入表時使用目錄資訊。傳統上，目錄和查詢引擎已經整合，但將它們解耦使資料發現和資料治理系統（在 ["資料系統、法律和社會"](/tw/ch1#sec_introduction_compliance) 中討論）也能夠訪問目錄的元資料。

### 列式儲存 {#sec_storage_column}

如 ["星型和雪花型：分析模式"](/tw/ch3#sec_datamodels_analytics) 中所討論的，資料倉庫按照慣例通常使用帶有大型事實表的關係模式，該表包含對維度表的外部索引鍵引用。如果你的事實表中有數萬億行和數 PB 的資料，有效地儲存和查詢它們就成為一個具有挑戰性的問題。維度表通常要小得多（數百萬行），因此在本節中我們將重點關注事實的儲存。

儘管事實表通常有超過 100 列，但典型的資料倉庫查詢一次只訪問其中的 4 或 5 列（分析很少需要 `"SELECT *"` 查詢）[^52]。以 [示例 4-1](#fig_storage_analytics_query) 中的查詢為例：它訪問大量行（2024 日曆年期間每次有人購買水果或糖果的情況），但它只需要訪問 `fact_sales` 表的三列：`date_key`、`product_sk` 和 `quantity`。查詢忽略所有其他列。

{{< figure id="fig_storage_analytics_query" title="示例 4-1. 分析人們是否更傾向於購買新鮮水果或糖果，取決於星期幾" class="w-full my-4" >}}

```sql
SELECT
    dim_date.weekday, dim_product.category,
    SUM(fact_sales.quantity) AS quantity_sold
FROM fact_sales
    JOIN dim_date ON fact_sales.date_key = dim_date.date_key
    JOIN dim_product ON fact_sales.product_sk = dim_product.product_sk
WHERE
    dim_date.year = 2024 AND
    dim_product.category IN ('Fresh fruit', 'Candy')
GROUP BY
    dim_date.weekday, dim_product.category;
```

我們如何高效地執行這個查詢？

在大多數 OLTP 資料庫中，儲存是以 *面向行* 的方式佈局的：表中一行的所有值彼此相鄰儲存。文件資料庫類似：整個文件通常作為一個連續的位元組序列儲存。你可以在 [圖 4-1](#fig_storage_csv_hash_index) 的 CSV 示例中看到這一點。

為了處理像 [示例 4-1](#fig_storage_analytics_query) 這樣的查詢，你可能在 `fact_sales.date_key` 和/或 `fact_sales.product_sk` 上有索引，告訴儲存引擎在哪裡找到特定日期或特定產品的所有銷售。但是，面向行的儲存引擎仍然需要將所有這些行（每行包含超過 100 個屬性）從磁碟載入到記憶體中，解析它們，並過濾掉不符合所需條件的行。這可能需要很長時間。

*面向列*（或 *列式*）儲存背後的想法很簡單：不要將一行中的所有值儲存在一起，而是將每 *列* 中的所有值儲存在一起 [^56]。如果每列單獨儲存，查詢只需要讀取和解析該查詢中使用的那些列，這可以節省大量工作。[圖 4-7](#fig_column_store) 使用 [圖 3-5](/tw/ch3#fig_dwh_schema) 中事實表的擴充套件版本展示了這一原理。

--------

> [!NOTE]
> 列儲存在關係資料模型中最容易理解，但它同樣適用於非關係資料。例如，Parquet [^57] 是一種列式儲存格式，它支援基於 Google 的 Dremel [^58] 的文件資料模型，使用一種稱為 *分解*（*shredding*）或 *條帶化*（*striping*）的技術 [^59]。

--------

{{< figure src="/fig/ddia_0407.png" id="fig_column_store" caption="圖 4-7. 按列而不是按行儲存關係資料。" class="w-full my-4" >}}

面向列的儲存佈局依賴於每列以相同順序儲存行。因此，如果你需要重新組裝整行，你可以從每個單獨的列中取出第 23 個條目，並將它們組合在一起形成表的第 23 行。

實際上，列式儲存引擎並不真的一次儲存整個列（可能包含數萬億行）。相反，它們將表分解為數千或數百萬行的塊，並且在每個塊內，它們分別儲存每列的值 [^60]。由於許多查詢都限制在特定的日期範圍內，因此通常使每個塊包含特定時間戳範圍的行。然後查詢只需要在與所需日期範圍重疊的那些塊中載入它需要的列。

列式儲存如今幾乎用於所有分析資料庫 [^60]，從大規模雲資料倉庫（如 Snowflake [^61]）到單節點嵌入式資料庫（如 DuckDB [^62]），以及產品分析系統（如 Pinot [^63] 和 Druid [^64]）。它用於儲存格式，如 Parquet、ORC [^65] [^66]、Lance [^67] 和 Nimble [^68]，以及記憶體分析格式，如 Apache Arrow [^65] [^69] 和 Pandas/NumPy [^70]。一些時間序列資料庫，如 InfluxDB IOx [^71] 和 TimescaleDB [^72]，也基於面向列的儲存。

#### 列壓縮 {#sec_storage_column_compression}

除了只從磁碟載入查詢所需的那些列之外，我們還可以透過壓縮資料進一步減少對磁碟吞吐量和網路頻寬的需求。幸運的是，面向列的儲存通常非常適合壓縮。

看看 [圖 4-7](#fig_column_store) 中每列的值序列：它們看起來經常重複，這是壓縮的良好跡象。根據列中的資料，可以使用不同的壓縮技術。在資料倉庫中特別有效的一種技術是 *點陣圖編碼*，如 [圖 4-8](#fig_bitmap_index) 所示。

{{< figure src="/fig/ddia_0408.png" id="fig_bitmap_index" caption="圖 4-8. 單列的壓縮、點陣圖索引儲存。" class="w-full my-4" >}}

通常，列中不同值的數量與行數相比很小（例如，零售商可能有數十億條銷售交易，但只有 100,000 種不同的產品）。我們現在可以將具有 *n* 個不同值的列轉換為 *n* 個單獨的點陣圖：每個不同值一個點陣圖，每行一位。如果該行具有該值，則該位為 1，否則為 0。

一種選擇是使用每行一位來儲存這些點陣圖。然而，這些點陣圖通常包含大量零（我們說它們是 *稀疏* 的）。在這種情況下，點陣圖可以另外進行遊程編碼：計算連續零或一的數量並儲存該數字，如 [圖 4-8](#fig_bitmap_index) 底部所示。諸如 *咆哮點陣圖*（*roaring bitmaps*）之類的技術在兩種點陣圖表示之間切換，使用最緊湊的表示 [^73]。這可以使列的編碼非常高效。

像這樣的點陣圖索引非常適合資料倉庫中常見的查詢型別。例如：

`WHERE product_sk IN (31, 68, 69)`：
: 載入 `product_sk = 31`、`product_sk = 68` 和 `product_sk = 69` 的三個點陣圖，並計算三個點陣圖的按位 *OR*，這可以非常高效地完成。

`WHERE product_sk = 30 AND store_sk = 3`：
: 載入 `product_sk = 30` 和 `store_sk = 3` 的點陣圖，並計算按位 *AND*。這之所以可行，是因為各列以相同的順序儲存行，所以一列點陣圖中的第 *k* 位與另一列點陣圖中的第 *k* 位對應同一行。

點陣圖也可用於回答圖查詢，例如查詢社交網路中被使用者 *X* 關注並且也關注使用者 *Y* 的所有使用者 [^74]。列式資料庫還有各種其他壓縮方案，你可以在參考文獻中找到 [^75]。

--------

> [!NOTE]
> 不要將面向列的資料庫與 *寬列*（也稱為 *列族*）資料模型混淆，在該模型中，一行可以有數千列，並且不需要所有行都有相同的列 [^9]。儘管名稱相似，寬列資料庫是面向行的，因為它們將一行中的所有值儲存在一起。Google 的 Bigtable、Apache Accumulo 和 HBase 是寬列模型的例子。

--------

#### 列儲存中的排序順序 {#sort-order-in-column-storage}

在列儲存中，行的儲存順序並不一定重要。最簡單的是按插入順序儲存它們，因為這樣插入新行只需追加到每列。但是，我們可以選擇強制執行順序，就像我們之前對 SSTable 所做的那樣，並將其用作索引機制。

請注意，獨立排序每列是沒有意義的，因為那樣我們就不再知道列中的哪些項屬於同一行。我們之所以能夠重建一行，只是因為我們知道一列中的第 *k* 個項與另一列中的第 *k* 個項屬於同一行。

相反，資料需要一次排序整行，即使它是按列儲存的。資料庫管理員可以使用他們對常見查詢的瞭解來選擇表應按哪些列排序。例如，如果查詢經常針對日期範圍（例如上個月），則將 `date_key` 作為第一個排序鍵可能是有意義的。然後查詢可以只掃描上個月的行，這將比掃描所有行快得多。

第二列可以確定在第一列中具有相同值的任何行的排序順序。例如，如果 `date_key` 是 [圖 4-7](#fig_column_store) 中的第一個排序鍵，那麼 `product_sk` 作為第二個排序鍵可能是有意義的，這樣同一天同一產品的所有銷售都在儲存中分組在一起。這將有助於需要在某個日期範圍內按產品分組或過濾銷售的查詢。

排序順序的另一個優點是它可以幫助壓縮列。如果主排序列沒有許多不同的值，那麼排序後，它將有很長的序列，其中相同的值連續重複多次。簡單的遊程編碼，就像我們在 [圖 4-8](#fig_bitmap_index) 中用於點陣圖的那樣，可以將該列壓縮到幾千位元組 —— 即使表有數十億行。

該壓縮效果在第一個排序鍵上最強。第二和第三個排序鍵將更加混亂，因此不會有如此長的連續重複值。排序優先順序較低的列基本上以隨機順序出現，因此它們可能不會壓縮得那麼好。但是，讓前幾列排序仍然是整體上的勝利。

#### 寫入列式儲存 {#writing-to-column-oriented-storage}

我們在 ["事務處理和分析的特徵"](/tw/ch1#sec_introduction_oltp) 中看到，資料倉庫中的讀取往往包括大量行的聚合；列式儲存、壓縮和排序都有助於使這些讀取查詢更快。資料倉庫中的寫入往往是資料的批次匯入，通常透過 ETL 過程。

使用列式儲存，在排序表的中間某處寫入單個行將非常低效，因為你必須從插入位置開始重寫所有壓縮列。但是，一次批次寫入許多行會分攤重寫這些列的成本，使其高效。

通常使用日誌結構方法以批次執行寫入。所有寫入首先進入面向行的、排序的記憶體儲存。當積累了足夠的寫入時，它們將與磁碟上的列編碼檔案合併，並批次寫入新檔案。由於舊檔案保持不可變，新檔案一次寫入，物件儲存非常適合儲存這些檔案。

查詢需要檢查磁碟上的列資料和記憶體中的最近寫入，並將兩者結合起來。查詢執行引擎對使用者隱藏了這種區別。從分析師的角度來看，已透過插入、更新或刪除修改的資料會立即反映在後續查詢中。Snowflake、Vertica、Apache Pinot、Apache Druid 和許多其他系統都這樣做 [^61] [^63] [^64] [^76]。


### 查詢執行：編譯與向量化 {#sec_storage_vectorized}

用於分析的複雜 SQL 查詢被分解為由多個階段組成的 *查詢計劃*，稱為 *運算元*，這些運算元可能分佈在多臺機器上以並行執行。查詢規劃器可以透過選擇使用哪些運算元、以何種順序執行它們以及在哪裡執行每個運算元來執行大量最佳化。

在每個運算元內，查詢引擎需要對列中的值執行各種操作，例如查詢值在特定值集中的所有行（可能作為連線的一部分），或檢查值是否大於 15。它還需要檢視同一行的幾列，例如查詢產品是香蕉且門店是某個特定目標門店的所有銷售交易。

對於需要掃描數百萬行的資料倉庫查詢，我們不僅需要擔心它們需要從磁碟讀取的資料量，還需要擔心執行複雜運算元所需的 CPU 時間。最簡單的運算元型別就像程式語言的直譯器：在遍歷每一行時，它檢查表示查詢的資料結構，以找出需要對哪些列執行哪些比較或計算。不幸的是，這對許多分析目的來說太慢了。高效查詢執行的兩種替代方法已經出現 [^77]：

查詢編譯
: 查詢引擎獲取 SQL 查詢並生成用於執行它的程式碼。程式碼逐行迭代，檢視感興趣列中的值，執行所需的任何比較或計算，如果滿足所需條件，則將必要的值複製到輸出緩衝區。查詢引擎將生成的程式碼編譯為機器程式碼（通常使用現有編譯器，如 LLVM），然後在已載入到記憶體中的列編碼資料上執行它。這種程式碼生成方法類似於 Java 虛擬機器（JVM）和類似執行時中使用的即時（JIT）編譯方法。

向量化處理
: 查詢被解釋，而不是編譯，但透過批次處理列中的許多值而不是逐行迭代來提高速度。一組固定的預定義運算元內建在資料庫中；我們可以向它們傳遞引數並獲得一批結果 [^50] [^75]。

例如，我們可以將 `product_sk` 列和"香蕉"的 ID 傳遞給相等運算元，並獲得一個點陣圖（輸入列中每個值一位，如果是香蕉則為 1）；然後我們可以將 `store_sk` 列和感興趣商店的 ID 傳遞給相同的相等運算元，並獲得另一個點陣圖；然後我們可以將兩個點陣圖傳遞給"按位 AND"運算元，如 [圖 4-9](#fig_bitmap_and) 所示。結果將是一個點陣圖，其中特定商店的每筆香蕉銷售所對應的位為 1。

{{< figure src="/fig/ddia_0409.png" id="fig_bitmap_and" caption="圖 4-9. 兩個點陣圖之間的按位 AND 適合向量化。" class="w-full my-4" >}}

這兩種方法在實現方面非常不同，但兩者都在實踐中使用 [^77]。兩者都可以透過利用現代 CPU 的特性來實現非常好的效能：

* 優先選擇順序記憶體訪問而不是隨機訪問以減少快取未命中 [^78]，
* 在緊密的內部迴圈中完成大部分工作（即，具有少量指令且沒有函式呼叫）以保持 CPU 指令處理管道繁忙並避免分支預測錯誤，
* 利用並行性，例如多執行緒和單指令多資料（SIMD）指令 [^79] [^80]，以及
* 直接對壓縮資料進行操作，而無需將其解碼為單獨的記憶體表示，這可以節省記憶體分配和複製成本。

### 物化檢視與資料立方體 {#sec_storage_materialized_views}

我們之前在 ["物化和更新時間線"](/tw/ch2#sec_introduction_materializing) 中遇到了 *物化檢視*：在關係資料模型中，它們是表狀物件，其內容是某些查詢的結果。區別在於物化檢視是查詢結果的實際副本，寫入磁碟，而虛擬檢視只是編寫查詢的快捷方式。當你從虛擬檢視讀取時，SQL 引擎會即時將其擴充套件為檢視的基礎查詢，然後處理擴充套件的查詢。

當基礎資料更改時，物化檢視需要相應更新。一些資料庫可以自動執行此操作，還有像 Materialize 這樣專門從事物化檢視維護的系統 [^81]。執行此類更新意味著寫入時需要更多工作，但物化檢視可以改善在重複需要執行相同查詢的工作負載中的讀取效能。

*物化聚合* 是一種可以在資料倉庫中有用的物化檢視型別。如前所述，資料倉庫查詢通常涉及聚合函式，例如 SQL 中的 `COUNT`、`SUM`、`AVG`、`MIN` 或 `MAX`。如果許多不同的查詢使用相同的聚合，每次都處理原始資料可能會很浪費。為什麼不快取查詢最常使用的一些計數或總和？*資料立方體*（*OLAP 立方體*）透過建立按不同維度分組的聚合網格來做到這一點 [^82]。[圖 4-10](#fig_data_cube) 顯示了一個示例。

{{< figure src="/fig/ddia_0410.png" id="fig_data_cube" caption="圖 4-10. 資料立方體的兩個維度，透過求和聚合資料。" class="w-full my-4" >}}

現在假設每個事實只有兩個維度表的外部索引鍵 —— 在 [圖 4-10](#fig_data_cube) 中，這些是 `date_key` 和 `product_sk`。你現在可以繪製一個二維表，日期沿著一個軸，產品沿著另一個軸。每個單元格包含具有該日期-產品組合的所有事實的屬性（例如 `net_price`）的聚合（例如 `SUM`）。然後，你可以沿著每行或列應用相同的聚合，並獲得已減少一個維度的摘要（不管日期的產品銷售，或不管產品的日期銷售）。

一般來說，事實通常有兩個以上的維度。在 [圖 3-5](/tw/ch3#fig_dwh_schema) 中有五個維度：日期、產品、商店、促銷和客戶。很難想象五維超立方體會是什麼樣子，但原理保持不變：每個單元格包含特定日期-產品-商店-促銷-客戶組合的銷售。然後可以沿著每個維度重複彙總這些值。

物化資料立方體的優點是某些查詢會變得非常快，因為結果已經被預先計算好了。例如，如果你想知道昨天每個商店的總銷售額，你只需要檢視相應維度上的彙總值 —— 不需要掃描數百萬行。

缺點是資料立方體不像直接查詢原始資料那樣靈活。例如，沒有辦法計算售價超過 100 美元的商品銷售佔比，因為價格並不是其中一個維度。因此，大多數資料倉庫都會盡可能保留原始資料，只把這類聚合（如資料立方體）當作特定查詢的效能加速手段。


## 多維索引與全文索引 {#sec_storage_multidimensional}

我們在本章前半部分看到的 B 樹和 LSM 樹允許對單個屬性進行範圍查詢：例如，如果鍵是使用者名稱，你可以使用它們作為索引來高效查詢所有以 L 開頭的名稱。但有時，按單個屬性搜尋是不夠的。

最常見的多列索引型別稱為 *聯合索引*，它透過將一列追加到另一列來將幾個欄位組合成一個鍵（索引定義指定欄位以何種順序連線）。這就像老式的紙質電話簿，它提供從（*姓氏*、*名字*）到電話號碼的索引。由於排序順序，索引可用於查詢具有特定姓氏的所有人，或具有特定 *姓氏-名字* 組合的所有人。但是，如果你想查詢具有特定名字的所有人，索引是無用的。

另一方面，*多維索引* 允許你一次查詢多個列。在地理空間資料中這尤其重要。例如，餐廳搜尋網站可能有一個包含每個餐廳的緯度和經度的資料庫。當使用者在地圖上檢視餐廳時，網站需要搜尋使用者當前檢視的矩形地圖區域內的所有餐廳。這需要像以下這樣的二維範圍查詢：

```sql
SELECT * FROM restaurants WHERE latitude > 51.4946 AND latitude < 51.5079
    AND longitude > -0.1162 AND longitude < -0.1004;
```

緯度和經度列上的聯合索引無法有效地回答這種查詢：它可以為你提供緯度範圍內的所有餐廳（但在任何經度），或經度範圍內的所有餐廳（但在北極和南極之間的任何地方），但不能同時提供兩者。

一種選擇是使用空間填充曲線將二維位置轉換為單個數字，然後使用常規 B 樹索引 [^83]。更常見的是，使用專門的空間索引，如 R 樹或 Bkd 樹 [^84]；它們劃分空間，使附近的資料點傾向於分組在同一子樹中。例如，PostGIS 使用 PostgreSQL 的通用搜尋樹索引設施將地理空間索引實現為 R 樹 [^85]。也可以使用規則間隔的三角形、正方形或六邊形網格 [^86]。

多維索引不僅用於地理位置。例如，在電子商務網站上，你可以在維度（*紅色*、*綠色*、*藍色*）上使用三維索引來搜尋某個顏色範圍內的產品，或者在天氣觀測資料庫中，你可以在（*日期*、*溫度*）上有一個二維索引，以便有效地搜尋 2013 年期間溫度在 25 到 30°C 之間的所有觀測。使用一維索引，你必須掃描 2013 年的所有記錄（不管溫度），然後按溫度過濾它們，反之亦然。二維索引可以同時按時間戳和溫度縮小範圍 [^87]。

### 全文檢索 {#sec_storage_full_text}

全文檢索允許你透過可能出現在文字中任何位置的關鍵字搜尋文字文件集合（網頁、產品描述等）[^88]。資訊檢索是一個大的專業主題，通常涉及特定於語言的處理：例如，幾種亞洲語言在單詞之間沒有空格或標點符號，因此將文字分割成單詞需要一個指示哪些字元序列構成單詞的模型。全文檢索還經常涉及匹配相似但不相同的單詞（例如拼寫錯誤或單詞的不同語法形式）和同義詞。這些問題超出了本書的範圍。

然而，在其核心，你可以將全文檢索視為另一種多維查詢：在這種情況下，可能出現在文字中的每個單詞（*詞項*）是一個維度。包含詞項 *x* 的文件在維度 *x* 中的值為 1，不包含 *x* 的文件的值為 0。搜尋提到“紅蘋果”的文件意味著查詢在 *紅* 維度中查詢 1，同時在 *蘋果* 維度中查詢 1。維度數量可能因此非常大。

許多搜尋引擎用來回答此類查詢的資料結構稱為 *倒排索引*。這是一個鍵值結構，其中鍵是詞項，值是包含該詞項的所有文件的 ID 列表（*倒排列表*）。如果文件 ID 是順序數字，倒排列表也可以表示為稀疏點陣圖，如 [圖 4-8](#fig_bitmap_index)：如果 ID 為 *n* 的文件包含詞項 *x*，則詞項 *x* 的點陣圖中的第 *n* 位為 1 [^89]。

查詢包含詞項 *x* 和 *y* 的所有文件現在類似於搜尋匹配兩個條件的行的向量化資料倉庫查詢（[圖 4-9](#fig_bitmap_and)）：載入詞項 *x* 和 *y* 的兩個點陣圖並計算它們的按位 AND。即使點陣圖是遊程編碼的，這也可以非常高效地完成。

例如，Elasticsearch 和 Solr 使用的全文索引引擎 Lucene 就是這樣工作的 [^90]。它將詞項到倒排列表的對映儲存在類似 SSTable 的排序檔案中，這些檔案使用我們在本章前面看到的相同日誌結構方法在後臺合併 [^91]。PostgreSQL 的 GIN 索引型別也使用倒排列表來支援全文檢索和 JSON 文件內的索引 [^92] [^93]。

除了將文字分解為單詞，另一種選擇是查詢長度為 *n* 的所有子字串，稱為 *n-gram*（*n 元語法*）。例如，字串 `"hello"` 的三元語法（*n* = 3）是 `"hel"`、`"ell"` 和 `"llo"`。如果我們為所有三元語法構建倒排索引，我們就可以搜尋任意至少三個字元長的子字串。三元語法索引甚至允許在搜尋查詢中使用正則表示式；缺點是它們相當大 [^94]。

為了處理文件或查詢中的拼寫錯誤，Lucene 能夠在一定編輯距離內搜尋文字中的單詞（編輯距離為 1 意味著已新增、刪除或替換了一個字母）[^95]。它透過將詞項集儲存為字元上的有限狀態自動機（類似於 *字典樹* [^96]）並將其轉換為 *萊文斯坦自動機* 來實現，該自動機支援在給定編輯距離內高效搜尋單詞 [^97]。


### 向量嵌入 {#id92}

語義搜尋超越了同義詞和拼寫錯誤，試圖理解文件概念和使用者意圖。例如，如果你的幫助頁面中有一個標題為“取消訂閱”的頁面，使用者在搜尋“如何關閉我的賬戶”或“終止合同”時，仍應能找到這個頁面，即使查詢詞完全不同，但語義非常接近。

為了理解文件的語義 —— 它的含義 —— 語義搜尋索引使用嵌入模型將文件轉換為浮點值向量，稱為 *向量嵌入*。向量表示多維空間中的一個點，每個浮點值表示文件沿著一個維度軸的位置。當輸入文件在語義上相似時，嵌入模型為它們生成的向量嵌入（在這個多維空間中）也彼此接近。

--------

> [!NOTE]
> 我們在 ["查詢執行：編譯與向量化"](#sec_storage_vectorized) 中看到了術語 *向量化處理*。語義搜尋中的向量有不同的含義。在向量化處理中，向量指的是可以用特別最佳化的程式碼處理的一批位。在嵌入模型中，向量是表示多維空間中位置的浮點數列表。

--------

例如，關於農業的維基百科頁面的三維向量嵌入可能是 `[0.1, 0.22, 0.11]`。關於蔬菜的維基百科頁面會非常接近，可能嵌入為 `[0.13, 0.19, 0.24]`。關於星型模式的頁面可能有 `[0.82, 0.39, -0.74]` 的嵌入，相對較遠。我們可以透過觀察看出前兩個向量比第三個更接近。

嵌入模型使用更大的向量（通常超過 1,000 個數字），但原理是相同的。我們不試圖理解各個數字的含義；它們只是嵌入模型指向抽象多維空間中位置的一種方式。搜尋引擎使用距離函式（如餘弦相似度或歐幾里得距離）來測量向量之間的距離。餘弦相似度測量兩個向量角度的餘弦以確定它們的接近程度，而歐幾里得距離測量空間中兩點之間的直線距離。

許多早期的嵌入模型，如 Word2Vec [^98]、BERT [^99] 和 GPT [^100] 都處理文字資料。這些模型通常實現為神經網路。研究人員繼續為影片、音訊和影像建立嵌入模型。最近，模型架構已經變成 *多模態* 的：單個模型可以為多種模態（如文字和影像）生成向量嵌入。

語義搜尋引擎在使用者輸入查詢時使用嵌入模型生成向量嵌入。使用者的查詢和相關上下文（例如使用者的位置）被輸入到嵌入模型中。嵌入模型生成查詢的向量嵌入後，搜尋引擎必須使用向量索引找到具有相似向量嵌入的文件。

向量索引儲存文件集合的向量嵌入。要查詢索引，你傳入查詢的向量嵌入，索引返回其向量最接近查詢向量的文件。由於我們之前看到的 R 樹不太適用於維度很多的向量，因此使用專門的向量索引，例如：

平面索引（Flat indexes）
: 向量按原樣儲存在索引中。查詢必須讀取每個向量並測量其與查詢向量的距離。平面索引是準確的，但測量查詢與每個向量之間的距離很慢。

倒排檔案（IVF）索引
: 向量空間被聚類為向量的分割槽（稱為 *質心*），以減少必須比較的向量數量。IVF 索引比平面索引更快，但只能給出近似結果：即使查詢和文件彼此接近，它們也可能落入不同的分割槽。對 IVF 索引的查詢首先定義 *探針*，這只是要檢查的分割槽數。使用更多探針的查詢將更準確，但會更慢，因為必須比較更多向量。

分層可導航小世界（HNSW）
: HNSW 索引維護向量空間的多個層，如 [圖 4-11](#fig_vector_hnsw) 所示。每一層都表示為一個圖，其中節點表示向量，邊表示與附近向量的接近度。查詢首先在最頂層定位最近的向量，該層具有少量節點。然後查詢移動到下面一層的同一節點，並跟隨該層中的邊，該層連線更密集，尋找更接近查詢向量的向量。該過程繼續直到到達最後一層。與 IVF 索引一樣，HNSW 索引是近似的。

{{< figure src="/fig/ddia_0411.png" id="fig_vector_hnsw" caption="圖 4-11. 在 HNSW 索引中搜尋最接近給定查詢向量的資料庫條目。" class="w-full my-4" >}}


許多流行的向量資料庫實現了 IVF 和 HNSW 索引。Facebook 的 Faiss 庫有每種的許多變體 [^101]，PostgreSQL 的 pgvector 也支援兩者 [^102]。IVF 和 HNSW 演算法的完整細節超出了本書的範圍，但它們的論文是極好的資源 [^103] [^104]。

## 總結 {#summary}

在本章中，我們試圖深入瞭解資料庫如何執行儲存和檢索。當你在資料庫中儲存資料時會發生什麼，當你稍後再次查詢資料時資料庫會做什麼？

["分析型與事務型系統"](/tw/ch1#sec_introduction_analytics) 介紹了事務處理（OLTP）和分析（OLAP）之間的區別。在本章中，我們看到為 OLTP 最佳化的儲存引擎與為分析最佳化的儲存引擎看起來非常不同：

* OLTP 系統針對大量請求進行了最佳化，每個請求讀取和寫入少量記錄，並且需要快速響應。記錄通常透過主鍵或二級索引訪問，這些索引通常是從鍵到記錄的有序對映，也支援範圍查詢。
* 資料倉庫和類似的分析系統針對掃描大量記錄的複雜讀取查詢進行了最佳化。它們通常使用帶有壓縮的列式儲存佈局，以最小化此類查詢需要從磁碟讀取的資料量，並使用查詢的即時編譯或向量化來最小化處理資料所花費的 CPU 時間。

在 OLTP 方面，我們看到了兩個主要思想流派的儲存引擎：

* 日誌結構方法，只允許追加到檔案和刪除過時檔案，但從不更新已寫入的檔案。SSTable、LSM 樹、RocksDB、Cassandra、HBase、Scylla、Lucene 等屬於這一組。一般來說，日誌結構儲存引擎往往提供高寫入吞吐量。
* 就地更新方法，將磁碟視為一組可以覆蓋的固定大小頁。B 樹是這種理念的最大例子，用於所有主要的關係型 OLTP 資料庫以及許多非關係型資料庫。作為經驗法則，B 樹往往更適合讀取，提供比日誌結構儲存更高的讀取吞吐量和更低的響應時間。

然後我們查看了可以同時搜尋多個條件的索引：多維索引（如 R 樹）可以同時按緯度和經度搜尋地圖上的點，全文檢索索引可以搜尋出現在同一文字中的多個關鍵字。最後，向量資料庫用於文字文件和其他媒體的語義搜尋；它們使用具有大量維度的向量，並透過比較向量相似性來查詢相似文件。

作為應用開發者，如果你掌握了這些關於儲存引擎內部機制的知識，就能更好地判斷哪種工具最適合你的具體應用。如果你需要調整資料庫的調優引數，這種理解也能幫助你預判引數調高或調低可能帶來的影響。

儘管本章不能讓你成為調優某個特定儲存引擎的專家，但希望它已經為你提供了足夠的術語和思路，使你能夠讀懂所選資料庫的文件。


### 參考


[^1]: Nikolay Samokhvalov. [How partial, covering, and multicolumn indexes may slow down UPDATEs in PostgreSQL](https://postgres.ai/blog/20211029-how-partial-and-covering-indexes-affect-update-performance-in-postgresql). *postgres.ai*, October 2021. Archived at [perma.cc/PBK3-F4G9](https://perma.cc/PBK3-F4G9)
[^2]: Goetz Graefe. [Modern B-Tree Techniques](https://w6113.github.io/files/papers/btreesurvey-graefe.pdf). *Foundations and Trends in Databases*, volume 3, issue 4, pages 203–402, August 2011. [doi:10.1561/1900000028](https://doi.org/10.1561/1900000028)
[^3]: Evan Jones. [Why databases use ordered indexes but programming uses hash tables](https://www.evanjones.ca/ordered-vs-unordered-indexes.html). *evanjones.ca*, December 2019. Archived at [perma.cc/NJX8-3ZZD](https://perma.cc/NJX8-3ZZD)
[^4]: Branimir Lambov. [CEP-25: Trie-indexed SSTable format](https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-25%3A%2BTrie-indexed%2BSSTable%2Bformat). *cwiki.apache.org*, November 2022. Archived at [perma.cc/HD7W-PW8U](https://perma.cc/HD7W-PW8U). Linked Google Doc archived at [perma.cc/UL6C-AAAE](https://perma.cc/UL6C-AAAE)
[^5]: Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein: *Introduction to Algorithms*, 3rd edition. MIT Press, 2009. ISBN: 978-0-262-53305-8
[^6]: Branimir Lambov. [Trie Memtables in Cassandra](https://www.vldb.org/pvldb/vol15/p3359-lambov.pdf). *Proceedings of the VLDB Endowment*, volume 15, issue 12, pages 3359–3371, August 2022. [doi:10.14778/3554821.3554828](https://doi.org/10.14778/3554821.3554828)
[^7]: Dhruba Borthakur. [The History of RocksDB](https://rocksdb.blogspot.com/2013/11/the-history-of-rocksdb.html). *rocksdb.blogspot.com*, November 2013. Archived at [perma.cc/Z7C5-JPSP](https://perma.cc/Z7C5-JPSP)
[^8]: Matteo Bertozzi. [Apache HBase I/O – HFile](https://blog.cloudera.com/apache-hbase-i-o-hfile/). *blog.cloudera.com*, June 2012. Archived at [perma.cc/U9XH-L2KL](https://perma.cc/U9XH-L2KL)
[^9]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^10]: Patrick O’Neil, Edward Cheng, Dieter Gawlick, and Elizabeth O’Neil. [The Log-Structured Merge-Tree (LSM-Tree)](https://www.cs.umb.edu/~poneil/lsmtree.pdf). *Acta Informatica*, volume 33, issue 4, pages 351–385, June 1996. [doi:10.1007/s002360050048](https://doi.org/10.1007/s002360050048)
[^11]: Mendel Rosenblum and John K. Ousterhout. [The Design and Implementation of a Log-Structured File System](https://research.cs.wisc.edu/areas/os/Qual/papers/lfs.pdf). *ACM Transactions on Computer Systems*, volume 10, issue 1, pages 26–52, February 1992. [doi:10.1145/146941.146943](https://doi.org/10.1145/146941.146943)
[^12]: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu, Mukul Murthy, Joseph Torres, Herman van Hovell, Adrian Ionescu, Alicja Łuszczak, Michał Świtakowski, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, and Matei Zaharia. [Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores](https://vldb.org/pvldb/vol13/p3411-armbrust.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3411–3424, August 2020. [doi:10.14778/3415478.3415560](https://doi.org/10.14778/3415478.3415560)
[^13]: Burton H. Bloom. [Space/Time Trade-offs in Hash Coding with Allowable Errors](https://people.cs.umass.edu/~emery/classes/cmpsci691st/readings/Misc/p422-bloom.pdf). *Communications of the ACM*, volume 13, issue 7, pages 422–426, July 1970. [doi:10.1145/362686.362692](https://doi.org/10.1145/362686.362692)
[^14]: Adam Kirsch and Michael Mitzenmacher. [Less Hashing, Same Performance: Building a Better Bloom Filter](https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf). *Random Structures & Algorithms*, volume 33, issue 2, pages 187–218, September 2008. [doi:10.1002/rsa.20208](https://doi.org/10.1002/rsa.20208)
[^15]: Thomas Hurst. [Bloom Filter Calculator](https://hur.st/bloomfilter/). *hur.st*, September 2023. Archived at [perma.cc/L3AV-6VC2](https://perma.cc/L3AV-6VC2)
[^16]: Chen Luo and Michael J. Carey. [LSM-based storage techniques: a survey](https://arxiv.org/abs/1812.07527). *The VLDB Journal*, volume 29, pages 393–418, July 2019. [doi:10.1007/s00778-019-00555-y](https://doi.org/10.1007/s00778-019-00555-y)
[^17]: Subhadeep Sarkar and Manos Athanassoulis. [Dissecting, Designing, and Optimizing LSM-based Data Stores](https://www.youtube.com/watch?v=hkMkBZn2mGs). Tutorial at *ACM International Conference on Management of Data* (SIGMOD), June 2022. Slides archived at [perma.cc/93B3-E827](https://perma.cc/93B3-E827)
[^18]: Mark Callaghan. [Name that compaction algorithm](https://smalldatum.blogspot.com/2018/08/name-that-compaction-algorithm.html). *smalldatum.blogspot.com*, August 2018. Archived at [perma.cc/CN4M-82DY](https://perma.cc/CN4M-82DY)
[^19]: Prashanth Rao. [Embedded databases (1): The harmony of DuckDB, KùzuDB and LanceDB](https://thedataquarry.com/posts/embedded-db-1/). *thedataquarry.com*, August 2023. Archived at [perma.cc/PA28-2R35](https://perma.cc/PA28-2R35)
[^20]: Hacker News discussion. [Bluesky migrates to single-tenant SQLite](https://news.ycombinator.com/item?id=38171322). *news.ycombinator.com*, October 2023. Archived at [perma.cc/69LM-5P6X](https://perma.cc/69LM-5P6X)
[^21]: Rudolf Bayer and Edward M. McCreight. [Organization and Maintenance of Large Ordered Indices](https://dl.acm.org/doi/pdf/10.1145/1734663.1734671). Boeing Scientific Research Laboratories, Mathematical and Information Sciences Laboratory, report no. 20, July 1970. [doi:10.1145/1734663.1734671](https://doi.org/10.1145/1734663.1734671)
[^22]: Douglas Comer. [The Ubiquitous B-Tree](https://web.archive.org/web/20170809145513id_/http%3A//sites.fas.harvard.edu/~cs165/papers/comer.pdf). *ACM Computing Surveys*, volume 11, issue 2, pages 121–137, June 1979. [doi:10.1145/356770.356776](https://doi.org/10.1145/356770.356776)
[^23]: Alex Miller. [Torn Write Detection and Protection](https://transactional.blog/blog/2025-torn-writes). *transactional.blog*, April 2025. Archived at [perma.cc/G7EB-33EW](https://perma.cc/G7EB-33EW)
[^24]: C. Mohan and Frank Levine. [ARIES/IM: An Efficient and High Concurrency Index Management Method Using Write-Ahead Logging](https://ics.uci.edu/~cs223/papers/p371-mohan.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 1992. [doi:10.1145/130283.130338](https://doi.org/10.1145/130283.130338)
[^25]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^26]: Howard Chu. [LDAP at Lightning Speed](https://buildstuff14.sched.com/event/08a1a368e272eb599a52e08b4c3c779d). At *Build Stuff ’14*, November 2014. Archived at [perma.cc/GB6Z-P8YH](https://perma.cc/GB6Z-P8YH)
[^27]: Manos Athanassoulis, Michael S. Kester, Lukas M. Maas, Radu Stoica, Stratos Idreos, Anastasia Ailamaki, and Mark Callaghan. [Designing Access Methods: The RUM Conjecture](https://openproceedings.org/2016/conf/edbt/paper-12.pdf). At *19th International Conference on Extending Database Technology* (EDBT), March 2016. [doi:10.5441/002/edbt.2016.42](https://doi.org/10.5441/002/edbt.2016.42)
[^28]: Ben Stopford. [Log Structured Merge Trees](http://www.benstopford.com/2015/02/14/log-structured-merge-trees/). *benstopford.com*, February 2015. Archived at [perma.cc/E5BV-KUJ6](https://perma.cc/E5BV-KUJ6)
[^29]: Mark Callaghan. [The Advantages of an LSM vs a B-Tree](https://smalldatum.blogspot.com/2016/01/summary-of-advantages-of-lsm-vs-b-tree.html). *smalldatum.blogspot.co.uk*, January 2016. Archived at [perma.cc/3TYZ-EFUD](https://perma.cc/3TYZ-EFUD)
[^30]: Oana Balmau, Florin Dinu, Willy Zwaenepoel, Karan Gupta, Ravishankar Chandhiramoorthi, and Diego Didona. [SILK: Preventing Latency Spikes in Log-Structured Merge Key-Value Stores](https://www.usenix.org/conference/atc19/presentation/balmau). At *USENIX Annual Technical Conference*, July 2019.
[^31]: Igor Canadi, Siying Dong, Mark Callaghan, et al. [RocksDB Tuning Guide](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide). *github.com*, 2023. Archived at [perma.cc/UNY4-MK6C](https://perma.cc/UNY4-MK6C)
[^32]: Gabriel Haas and Viktor Leis. [What Modern NVMe Storage Can Do, and How to Exploit it: High-Performance I/O for High-Performance Storage Engines](https://www.vldb.org/pvldb/vol16/p2090-haas.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 9, pages 2090-2102. [doi:10.14778/3598581.3598584](https://doi.org/10.14778/3598581.3598584)
[^33]: Emmanuel Goossaert. [Coding for SSDs](https://codecapsule.com/2014/02/12/coding-for-ssds-part-1-introduction-and-table-of-contents/). *codecapsule.com*, February 2014.
[^34]: Jack Vanlightly. [Is sequential IO dead in the era of the NVMe drive?](https://jack-vanlightly.com/blog/2023/5/9/is-sequential-io-dead-in-the-era-of-the-nvme-drive) *jack-vanlightly.com*, May 2023. Archived at [perma.cc/7TMZ-TAPU](https://perma.cc/7TMZ-TAPU)
[^35]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (2)](https://www.alibabacloud.com/blog/594376). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230510065132/https%3A//www.alibabacloud.com/blog/594376)
[^36]: Xiao-Yu Hu and Robert Haas. [The Fundamental Limit of Flash Random Write Performance: Understanding, Analysis and Performance Modelling](https://dominoweb.draco.res.ibm.com/reports/rz3771.pdf). *dominoweb.draco.res.ibm.com*, March 2010. Archived at [perma.cc/8JUL-4ZDS](https://perma.cc/8JUL-4ZDS)
[^37]: Lanyue Lu, Thanumalayan Sankaranarayana Pillai, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [WiscKey: Separating Keys from Values in SSD-conscious Storage](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf). At *4th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
[^38]: Peter Zaitsev. [Innodb Double Write](https://www.percona.com/blog/innodb-double-write/). *percona.com*, August 2006. Archived at [perma.cc/NT4S-DK7T](https://perma.cc/NT4S-DK7T)
[^39]: Tomas Vondra. [On the Impact of Full-Page Writes](https://www.2ndquadrant.com/en/blog/on-the-impact-of-full-page-writes/). *2ndquadrant.com*, November 2016. Archived at [perma.cc/7N6B-CVL3](https://perma.cc/7N6B-CVL3)
[^40]: Mark Callaghan. [Read, write & space amplification - B-Tree vs LSM](https://smalldatum.blogspot.com/2015/11/read-write-space-amplification-b-tree.html). *smalldatum.blogspot.com*, November 2015. Archived at [perma.cc/S487-WK5P](https://perma.cc/S487-WK5P)
[^41]: Mark Callaghan. [Choosing Between Efficiency and Performance with RocksDB](https://codemesh.io/codemesh2016/mark-callaghan). At *Code Mesh*, November 2016. Video at [youtube.com/watch?v=tgzkgZVXKB4](https://www.youtube.com/watch?v=tgzkgZVXKB4)
[^42]: Subhadeep Sarkar, Tarikul Islam Papon, Dimitris Staratzis, Zichen Zhu, and Manos Athanassoulis. [Enabling Timely and Persistent Deletion in LSM-Engines](https://subhadeep.net/assets/fulltext/Enabling_Timely_and_Persistent_Deletion_in_LSM-Engines.pdf). *ACM Transactions on Database Systems*, volume 48, issue 3, article no. 8, August 2023. [doi:10.1145/3599724](https://doi.org/10.1145/3599724)
[^43]: Lukas Fittl. [Postgres vs. SQL Server: B-Tree Index Differences & the Benefit of Deduplication](https://pganalyze.com/blog/postgresql-vs-sql-server-btree-index-deduplication). *pganalyze.com*, April 2025. Archived at [perma.cc/XY6T-LTPX](https://perma.cc/XY6T-LTPX)
[^44]: Drew Silcock. [How Postgres stores data on disk – this one’s a page turner](https://drew.silcock.dev/blog/how-postgres-stores-data-on-disk/). *drew.silcock.dev*, August 2024. Archived at [perma.cc/8K7K-7VJ2](https://perma.cc/8K7K-7VJ2)
[^45]: Joe Webb. [Using Covering Indexes to Improve Query Performance](https://www.red-gate.com/simple-talk/databases/sql-server/learn/using-covering-indexes-to-improve-query-performance/). *simple-talk.com*, September 2008. Archived at [perma.cc/6MEZ-R5VR](https://perma.cc/6MEZ-R5VR)
[^46]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^47]: [VoltDB Technical Overview White Paper](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-white-paper-voltdb-technical-overview.pdf). VoltDB, 2017. Archived at [perma.cc/B9SF-SK5G](https://perma.cc/B9SF-SK5G)
[^48]: Stephen M. Rumble, Ankita Kejriwal, and John K. Ousterhout. [Log-Structured Memory for DRAM-Based Storage](https://www.usenix.org/system/files/conference/fast14/fast14-paper_rumble.pdf). At *12th USENIX Conference on File and Storage Technologies* (FAST), February 2014.
[^49]: Stavros Harizopoulos, Daniel J. Abadi, Samuel Madden, and Michael Stonebraker. [OLTP Through the Looking Glass, and What We Found There](https://hstore.cs.brown.edu/papers/hstore-lookingglass.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376713](https://doi.org/10.1145/1376616.1376713)
[^50]: Per-Åke Larson, Cipri Clinciu, Campbell Fraser, Eric N. Hanson, Mostafa Mokhtar, Michal Nowakiewicz, Vassilis Papadimos, Susan L. Price, Srikumar Rangarajan, Remus Rusanu, and Mayukh Saubhasik. [Enhancements to SQL Server Column Stores](https://web.archive.org/web/20131203001153id_/http%3A//research.microsoft.com/pubs/193599/Apollo3%20-%20Sigmod%202013%20-%20final.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2463708](https://doi.org/10.1145/2463676.2463708)
[^51]: Franz Färber, Norman May, Wolfgang Lehner, Philipp Große, Ingo Müller, Hannes Rauhe, and Jonathan Dees. [The SAP HANA Database – An Architecture Overview](https://web.archive.org/web/20220208081111id_/http%3A//sites.computer.org/debull/A12mar/hana.pdf). *IEEE Data Engineering Bulletin*, volume 35, issue 1, pages 28–33, March 2012.
[^52]: Michael Stonebraker. [The Traditional RDBMS Wisdom Is (Almost Certainly) All Wrong](https://slideshot.epfl.ch/talks/166). Presentation at *EPFL*, May 2013.
[^53]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/pdf/10.1145/3514221.3526055). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055)
[^54]: Tino Tereshko and Jordan Tigani. [BigQuery under the hood](https://cloud.google.com/blog/products/bigquery/bigquery-under-the-hood). *cloud.google.com*, January 2016. Archived at [perma.cc/WP2Y-FUCF](https://perma.cc/WP2Y-FUCF)
[^55]: Wes McKinney. [The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future](https://wesmckinney.com/blog/looking-back-15-years/). *wesmckinney.com*, September 2023. Archived at [perma.cc/6L2M-GTJX](https://perma.cc/6L2M-GTJX)
[^56]: Michael Stonebraker, Daniel J. Abadi, Adam Batkin, Xuedong Chen, Mitch Cherniack, Miguel Ferreira, Edmond Lau, Amerson Lin, Sam Madden, Elizabeth O’Neil, Pat O’Neil, Alex Rasin, Nga Tran, and Stan Zdonik. [C-Store: A Column-oriented DBMS](https://www.vldb.org/archives/website/2005/program/paper/thu/p553-stonebraker.pdf). At *31st International Conference on Very Large Data Bases* (VLDB), pages 553–564, September 2005.
[^57]: Julien Le Dem. [Dremel Made Simple with Parquet](https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html). *blog.twitter.com*, September 2013.
[^58]: Sergey Melnik, Andrey Gubarev, Jing Jing Long, Geoffrey Romer, Shiva Shivakumar, Matt Tolton, and Theo Vassilakis. [Dremel: Interactive Analysis of Web-Scale Datasets](https://vldb.org/pvldb/vol3/R29.pdf). At *36th International Conference on Very Large Data Bases* (VLDB), pages 330–339, September 2010. [doi:10.14778/1920841.1920886](https://doi.org/10.14778/1920841.1920886)
[^59]: Joe Kearney. [Understanding Record Shredding: storing nested data in columns](https://www.joekearney.co.uk/posts/understanding-record-shredding). *joekearney.co.uk*, December 2016. Archived at [perma.cc/ZD5N-AX5D](https://perma.cc/ZD5N-AX5D)
[^60]: Jamie Brandon. [A shallow survey of OLAP and HTAP query engines](https://www.scattered-thoughts.net/writing/a-shallow-survey-of-olap-and-htap-query-engines). *scattered-thoughts.net*, September 2023. Archived at [perma.cc/L3KH-J4JF](https://perma.cc/L3KH-J4JF)
[^61]: Benoit Dageville, Thierry Cruanes, Marcin Zukowski, Vadim Antonov, Artin Avanes, Jon Bock, Jonathan Claybaugh, Daniel Engovatov, Martin Hentschel, Jiansheng Huang, Allison W. Lee, Ashish Motivala, Abdul Q. Munir, Steven Pelley, Peter Povinec, Greg Rahn, Spyridon Triantafyllis, and Philipp Unterbrunner. [The Snowflake Elastic Data Warehouse](https://dl.acm.org/doi/pdf/10.1145/2882903.2903741). At *ACM International Conference on Management of Data* (SIGMOD), pages 215–226, June 2016. [doi:10.1145/2882903.2903741](https://doi.org/10.1145/2882903.2903741)
[^62]: Mark Raasveldt and Hannes Mühleisen. [Data Management for Data Science Towards Embedded Analytics](https://duckdb.org/pdf/CIDR2020-raasveldt-muehleisen-duckdb.pdf). At *10th Conference on Innovative Data Systems Research* (CIDR), January 2020.
[^63]: Jean-François Im, Kishore Gopalakrishna, Subbu Subramaniam, Mayank Shrivastava, Adwait Tumbde, Xiaotian Jiang, Jennifer Dai, Seunghyun Lee, Neha Pawar, Jialiang Li, and Ravi Aringunram. [Pinot: Realtime OLAP for 530 Million Users](https://cwiki.apache.org/confluence/download/attachments/103092375/Pinot.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 583–594, May 2018. [doi:10.1145/3183713.3190661](https://doi.org/10.1145/3183713.3190661)
[^64]: Fangjin Yang, Eric Tschetter, Xavier Léauté, Nelson Ray, Gian Merlino, and Deep Ganguli. [Druid: A Real-time Analytical Data Store](https://static.druid.io/docs/druid.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2014. [doi:10.1145/2588555.2595631](https://doi.org/10.1145/2588555.2595631)
[^65]: Chunwei Liu, Anna Pavlenko, Matteo Interlandi, and Brandon Haynes. [Deep Dive into Common Open Formats for Analytical DBMSs](https://www.vldb.org/pvldb/vol16/p3044-liu.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 11, pages 3044–3056, July 2023. [doi:10.14778/3611479.3611507](https://doi.org/10.14778/3611479.3611507)
[^66]: Xinyu Zeng, Yulong Hui, Jiahong Shen, Andrew Pavlo, Wes McKinney, and Huanchen Zhang. [An Empirical Evaluation of Columnar Storage Formats](https://www.vldb.org/pvldb/vol17/p148-zeng.pdf). *Proceedings of the VLDB Endowment*, volume 17, issue 2, pages 148–161. [doi:10.14778/3626292.3626298](https://doi.org/10.14778/3626292.3626298)
[^67]: Weston Pace. [Lance v2: A columnar container format for modern data](https://blog.lancedb.com/lance-v2/). *blog.lancedb.com*, April 2024. Archived at [perma.cc/ZK3Q-S9VJ](https://perma.cc/ZK3Q-S9VJ)
[^68]: Yoav Helfman. [Nimble, A New Columnar File Format](https://www.youtube.com/watch?v=bISBNVtXZ6M). At *VeloxCon*, April 2024.
[^69]: Wes McKinney. [Apache Arrow: High-Performance Columnar Data Framework](https://www.youtube.com/watch?v=YhF8YR0OEFk). At *CMU Database Group – Vaccination Database Tech Talks*, December 2021.
[^70]: Wes McKinney. [Python for Data Analysis, 3rd Edition](https://learning.oreilly.com/library/view/python-for-data/9781098104023/). O’Reilly Media, August 2022. ISBN: 9781098104023
[^71]: Paul Dix. [The Design of InfluxDB IOx: An In-Memory Columnar Database Written in Rust with Apache Arrow](https://www.youtube.com/watch?v=_zbwz-4RDXg). At *CMU Database Group – Vaccination Database Tech Talks*, May 2021.
[^72]: Carlota Soto and Mike Freedman. [Building Columnar Compression for Large PostgreSQL Databases](https://www.timescale.com/blog/building-columnar-compression-in-a-row-oriented-database/). *timescale.com*, March 2024. Archived at [perma.cc/7KTF-V3EH](https://perma.cc/7KTF-V3EH)
[^73]: Daniel Lemire, Gregory Ssi‐Yan‐Kai, and Owen Kaser. [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/pdf/1603.06549). *Software: Practice and Experience*, volume 46, issue 11, pages 1547–1569, November 2016. [doi:10.1002/spe.2402](https://doi.org/10.1002/spe.2402)
[^74]: Jaz Volpert. [An entire Social Network in 1.6GB (GraphD Part 2)](https://jazco.dev/2024/04/20/roaring-bitmaps/). *jazco.dev*, April 2024. Archived at [perma.cc/L27Z-QVMG](https://perma.cc/L27Z-QVMG)
[^75]: Daniel J. Abadi, Peter Boncz, Stavros Harizopoulos, Stratos Idreos, and Samuel Madden. [The Design and Implementation of Modern Column-Oriented Database Systems](https://www.cs.umd.edu/~abadi/papers/abadi-column-stores.pdf). *Foundations and Trends in Databases*, volume 5, issue 3, pages 197–280, December 2013. [doi:10.1561/1900000024](https://doi.org/10.1561/1900000024)
[^76]: Andrew Lamb, Matt Fuller, Ramakrishna Varadarajan, Nga Tran, Ben Vandiver, Lyric Doshi, and Chuck Bear. [The Vertica Analytic Database: C-Store 7 Years Later](https://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 12, pages 1790–1801, August 2012. [doi:10.14778/2367502.2367518](https://doi.org/10.14778/2367502.2367518)
[^77]: Timo Kersten, Viktor Leis, Alfons Kemper, Thomas Neumann, Andrew Pavlo, and Peter Boncz. [Everything You Always Wanted to Know About Compiled and Vectorized Queries But Were Afraid to Ask](https://www.vldb.org/pvldb/vol11/p2209-kersten.pdf). *Proceedings of the VLDB Endowment*, volume 11, issue 13, pages 2209–2222, September 2018. [doi:10.14778/3275366.3284966](https://doi.org/10.14778/3275366.3284966)
[^78]: Forrest Smith. [Memory Bandwidth Napkin Math](https://www.forrestthewoods.com/blog/memory-bandwidth-napkin-math/). *forrestthewoods.com*, February 2020. Archived at [perma.cc/Y8U4-PS7N](https://perma.cc/Y8U4-PS7N)
[^79]: Peter Boncz, Marcin Zukowski, and Niels Nes. [MonetDB/X100: Hyper-Pipelining Query Execution](https://www.cidrdb.org/cidr2005/papers/P19.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^80]: Jingren Zhou and Kenneth A. Ross. [Implementing Database Operations Using SIMD Instructions](https://www1.cs.columbia.edu/~kar/pubsk/simd.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 145–156, June 2002. [doi:10.1145/564691.564709](https://doi.org/10.1145/564691.564709)
[^81]: Kevin Bartley. [OLTP Queries: Transfer Expensive Workloads to Materialize](https://materialize.com/blog/oltp-queries/). *materialize.com*, August 2024. Archived at [perma.cc/4TYM-TYD8](https://perma.cc/4TYM-TYD8)
[^82]: Jim Gray, Surajit Chaudhuri, Adam Bosworth, Andrew Layman, Don Reichart, Murali Venkatrao, Frank Pellow, and Hamid Pirahesh. [Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals](https://arxiv.org/pdf/cs/0701155). *Data Mining and Knowledge Discovery*, volume 1, issue 1, pages 29–53, March 1997. [doi:10.1023/A:1009726021843](https://doi.org/10.1023/A%3A1009726021843)
[^83]: Frank Ramsak, Volker Markl, Robert Fenk, Martin Zirkel, Klaus Elhardt, and Rudolf Bayer. [Integrating the UB-Tree into a Database System Kernel](https://www.vldb.org/conf/2000/P263.pdf). At *26th International Conference on Very Large Data Bases* (VLDB), September 2000.
[^84]: Octavian Procopiuc, Pankaj K. Agarwal, Lars Arge, and Jeffrey Scott Vitter. [Bkd-Tree: A Dynamic Scalable kd-Tree](https://users.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf). At *8th International Symposium on Spatial and Temporal Databases* (SSTD), pages 46–65, July 2003. [doi:10.1007/978-3-540-45072-6\_4](https://doi.org/10.1007/978-3-540-45072-6_4)
[^85]: Joseph M. Hellerstein, Jeffrey F. Naughton, and Avi Pfeffer. [Generalized Search Trees for Database Systems](https://dsf.berkeley.edu/papers/vldb95-gist.pdf). At *21st International Conference on Very Large Data Bases* (VLDB), September 1995.
[^86]: Isaac Brodsky. [H3: Uber’s Hexagonal Hierarchical Spatial Index](https://eng.uber.com/h3/). *eng.uber.com*, June 2018. Archived at [archive.org](https://web.archive.org/web/20240722003854/https%3A//www.uber.com/blog/h3/)
[^87]: Robert Escriva, Bernard Wong, and Emin Gün Sirer. [HyperDex: A Distributed, Searchable Key-Value Store](https://www.cs.princeton.edu/courses/archive/fall13/cos518/papers/hyperdex.pdf). At *ACM SIGCOMM Conference*, August 2012. [doi:10.1145/2377677.2377681](https://doi.org/10.1145/2377677.2377681)
[^88]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/)
[^89]: Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, and Steven Swanson. [An Experimental Study of Bitmap Compression vs. Inverted List Compression](https://cseweb.ucsd.edu/~swanson/papers/SIGMOD2017-ListCompression.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 993–1008, May 2017. [doi:10.1145/3035918.3064007](https://doi.org/10.1145/3035918.3064007)
[^90]: Adrien Grand. [What is in a Lucene Index?](https://speakerdeck.com/elasticsearch/what-is-in-a-lucene-index) At *Lucene/Solr Revolution*, November 2013. Archived at [perma.cc/Z7QN-GBYY](https://perma.cc/Z7QN-GBYY)
[^91]: Michael McCandless. [Visualizing Lucene’s Segment Merges](https://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html). *blog.mikemccandless.com*, February 2011. Archived at [perma.cc/3ZV8-72W6](https://perma.cc/3ZV8-72W6)
[^92]: Lukas Fittl. [Understanding Postgres GIN Indexes: The Good and the Bad](https://pganalyze.com/blog/gin-index). *pganalyze.com*, December 2021. Archived at [perma.cc/V3MW-26H6](https://perma.cc/V3MW-26H6)
[^93]: Jimmy Angelakos. [The State of (Full) Text Search in PostgreSQL 12](https://www.youtube.com/watch?v=c8IrUHV70KQ). At *FOSDEM*, February 2020. Archived at [perma.cc/J6US-3WZS](https://perma.cc/J6US-3WZS)
[^94]: Alexander Korotkov. [Index support for regular expression search](https://wiki.postgresql.org/images/6/6c/Index_support_for_regular_expression_search.pdf). At *PGConf.EU Prague*, October 2012. Archived at [perma.cc/5RFZ-ZKDQ](https://perma.cc/5RFZ-ZKDQ)
[^95]: Michael McCandless. [Lucene’s FuzzyQuery Is 100 Times Faster in 4.0](https://blog.mikemccandless.com/2011/03/lucenes-fuzzyquery-is-100-times-faster.html). *blog.mikemccandless.com*, March 2011. Archived at [perma.cc/E2WC-GHTW](https://perma.cc/E2WC-GHTW)
[^96]: Steffen Heinz, Justin Zobel, and Hugh E. Williams. [Burst Tries: A Fast, Efficient Data Structure for String Keys](https://web.archive.org/web/20130903070248id_/http%3A//ww2.cs.mu.oz.au%3A80/~jz/fulltext/acmtois02.pdf). *ACM Transactions on Information Systems*, volume 20, issue 2, pages 192–223, April 2002. [doi:10.1145/506309.506312](https://doi.org/10.1145/506309.506312)
[^97]: Klaus U. Schulz and Stoyan Mihov. [Fast String Correction with Levenshtein Automata](https://dmice.ohsu.edu/bedricks/courses/cs655/pdf/readings/2002_Schulz.pdf). *International Journal on Document Analysis and Recognition*, volume 5, issue 1, pages 67–85, November 2002. [doi:10.1007/s10032-002-0082-8](https://doi.org/10.1007/s10032-002-0082-8)
[^98]: Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781). At *International Conference on Learning Representations* (ICLR), May 2013. [doi:10.48550/arXiv.1301.3781](https://doi.org/10.48550/arXiv.1301.3781)
[^99]: Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805). At *Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies*, volume 1, pages 4171–4186, June 2019. [doi:10.18653/v1/N19-1423](https://doi.org/10.18653/v1/N19-1423)
[^100]: Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf). *openai.com*, June 2018. Archived at [perma.cc/5N3C-DJ4C](https://perma.cc/5N3C-DJ4C)
[^101]: Matthijs Douze, Maria Lomeli, and Lucas Hosseini. [Faiss indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). *github.com*, August 2024. Archived at [perma.cc/2EWG-FPBS](https://perma.cc/2EWG-FPBS)
[^102]: Varik Matevosyan. [Understanding pgvector’s HNSW Index Storage in Postgres](https://lantern.dev/blog/pgvector-storage). *lantern.dev*, August 2024. Archived at [perma.cc/B2YB-JB59](https://perma.cc/B2YB-JB59)
[^103]: Dmitry Baranchuk, Artem Babenko, and Yury Malkov. [Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](https://arxiv.org/pdf/1802.02422). At *European Conference on Computer Vision* (ECCV), pages 202–216, September 2018. [doi:10.1007/978-3-030-01258-8\_13](https://doi.org/10.1007/978-3-030-01258-8_13)
[^104]: Yury A. Malkov and Dmitry A. Yashunin. [Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs](https://arxiv.org/pdf/1603.09320). *IEEE Transactions on Pattern Analysis and Machine Intelligence*, volume 42, issue 4, pages 824–836, April 2020. [doi:10.1109/TPAMI.2018.2889473](https://doi.org/10.1109/TPAMI.2018.2889473)

================================================
FILE: content/tw/ch5.md
================================================
---
title: "5. 編碼與演化"
weight: 105
math: true
breadcrumbs: false
---

<a id="ch_encoding"></a>

![](/map/ch04.png)

> *萬物流轉，無物常駐。*
>
> 赫拉克利特，引自柏拉圖《克拉提魯斯》（公元前 360 年）

應用程式不可避免地會隨時間而變化。隨著新產品的推出、使用者需求被更深入地理解，或者業務環境發生變化，功能會被新增或修改。在 [第 2 章](/tw/ch2#ch_nonfunctional) 中，我們介紹了 *可演化性* 的概念：我們應該致力於構建易於適應變化的系統（參見 ["可演化性：讓變更更容易"](/tw/ch2#sec_introduction_evolvability)）。

在大多數情況下，應用程式功能的變更也需要其儲存資料的變更：可能需要捕獲新的欄位或記錄型別，或者現有資料需要以新的方式呈現。

我們在 [第 3 章](/tw/ch3#ch_datamodels) 中討論的資料模型有不同的方式來應對這種變化。關係資料庫通常假定資料庫中的所有資料都遵循一個模式：儘管該模式可以更改（透過模式遷移；即 `ALTER` 語句），但在任何一個時間點只有一個模式生效。相比之下，讀時模式（"無模式"）資料庫不強制執行模式，因此資料庫可以包含在不同時間寫入的新舊資料格式的混合（參見 ["文件模型中的模式靈活性"](/tw/ch3#sec_datamodels_schema_flexibility)）。

當資料格式或模式發生變化時，通常需要對應用程式程式碼進行相應的更改（例如，你向記錄添加了一個新欄位，應用程式程式碼開始讀寫該欄位）。然而，在大型應用程式中，程式碼更改通常無法立即完成：

* 對於服務端應用程式，你可能希望執行 *滾動升級*（也稱為 *階段釋出*），每次將新版本部署到幾個節點，檢查新版本是否執行順利，然後逐步在所有節點上部署。這允許在不中斷服務的情況下部署新版本，從而鼓勵更頻繁的釋出和更好的可演化性。
* 對於客戶端應用程式，你要看使用者的意願，他們可能很長時間都不安裝更新。

這意味著新舊版本的程式碼，以及新舊資料格式，可能會同時在系統中共存。為了使系統繼續平穩執行，我們需要在兩個方向上保持相容性：

向後相容性
: 較新的程式碼可以讀取由較舊程式碼寫入的資料。

向前相容性
: 較舊的程式碼可以讀取由較新程式碼寫入的資料。

向後相容性通常不難實現：作為新程式碼的作者，你知道舊程式碼寫入的資料格式，因此可以顯式地處理它（如有必要，只需保留舊程式碼來讀取舊資料）。向前相容性可能更棘手，因為它需要舊程式碼忽略新版本程式碼新增的部分。

向前相容性的另一個挑戰如 [圖 5-1](#fig_encoding_preserve_field) 所示。假設你向記錄模式添加了一個欄位，新程式碼建立了包含該新欄位的記錄並將其儲存在資料庫中。隨後，舊版本的程式碼（尚不知道新欄位）讀取記錄，更新它，然後寫回。在這種情況下，理想的行為通常是舊程式碼保持新欄位不變，即使它無法解釋。但是，如果記錄被解碼為不顯式保留未知欄位的模型物件，資料可能會丟失，如 [圖 5-1](#fig_encoding_preserve_field) 所示。

{{< figure src="/fig/ddia_0501.png" id="fig_encoding_preserve_field" caption="圖 5-1. 當舊版本的應用程式更新之前由新版本應用程式寫入的資料時，如果不小心，資料可能會丟失。" class="w-full my-4" >}}

在本章中，我們將研究幾種編碼資料的格式，包括 JSON、XML、Protocol Buffers 和 Avro。特別是，我們將研究它們如何處理模式變化，以及它們如何支援新舊資料和程式碼需要共存的系統。然後我們將討論這些格式如何用於資料儲存和通訊：在資料庫、Web 服務、REST API、遠端過程呼叫（RPC）、工作流引擎以及事件驅動系統（如 actor 和訊息佇列）中。

## 編碼資料的格式 {#sec_encoding_formats}

程式通常以（至少）兩種不同的表示形式處理資料：

1. 在記憶體中，資料儲存在物件、結構體、列表、陣列、雜湊表、樹等中。這些資料結構針對 CPU 的高效訪問和操作進行了最佳化（通常使用指標）。
2. 當你想要將資料寫入檔案或透過網路傳送時，必須將其編碼為某種自包含的位元組序列（例如，JSON 文件）。由於指標對任何其他程序都沒有意義，因此這種位元組序列表示通常與記憶體中常用的資料結構看起來截然不同。

因此，我們需要在兩種表示之間進行某種轉換。從記憶體表示到位元組序列的轉換稱為 *編碼*（也稱為 *序列化* 或 *編組*），反向過程稱為 *解碼*（*解析*、*反序列化*、*反編組*）。

--------

> [!TIP] 術語衝突
>
> *序列化* 這個術語不幸地也用於事務的上下文中（參見 [第 8 章](/tw/ch8#ch_transactions)），具有完全不同的含義。為了避免詞義過載，本書中我們將堅持使用 *編碼*，儘管 *序列化* 可能是更常見的術語。

--------

也有例外情況不需要編碼/解碼——例如，當資料庫直接對從磁碟載入的壓縮資料進行操作時，如 ["查詢執行：編譯與向量化"](/tw/ch4#sec_storage_vectorized) 中所討論的。還有一些 *零複製* 資料格式，旨在在執行時和磁碟/網路上都可以使用，無需顯式轉換步驟，例如 Cap'n Proto 和 FlatBuffers。

然而，大多數系統需要在記憶體物件和平面位元組序列之間進行轉換。由於這是一個如此常見的問題，有無數不同的庫和編碼格式可供選擇。讓我們簡要概述一下。

### 特定語言的格式 {#id96}

許多程式語言都內建了將記憶體物件編碼為位元組序列的支援。例如，Java 有 `java.io.Serializable`，Python 有 `pickle`，Ruby 有 `Marshal`，等等。許多第三方庫也存在，例如 Java 的 Kryo。

這些編碼庫非常方便，因為它們允許用最少的額外程式碼儲存和恢復記憶體物件。然而，它們也有許多深層次的問題：

* 編碼通常與特定程式語言繫結，在另一種語言中讀取會非常困難。如果你以這種編碼儲存或傳輸資料，就等於在相當長時間內把自己繫結在當前程式語言上，也排除了與其他組織（可能使用不同語言）的系統整合。
* 為了以相同的物件型別恢復資料，解碼過程需要能夠例項化任意類。這經常是安全問題的來源 [^1]：如果攻擊者可以讓你的應用程式解碼任意位元組序列，他們可以例項化任意類，這反過來通常允許他們做可怕的事情，例如遠端執行任意程式碼 [^2] [^3]。
* 在這些庫中，資料版本控制通常是事後考慮的：由於它們旨在快速輕鬆地編碼資料，因此它們經常忽略向前和向後相容性的不便問題 [^4]。
* 效率（編碼或解碼所需的 CPU 時間以及編碼結構的大小）通常也是事後考慮的。例如，Java 的內建序列化因其糟糕的效能和臃腫的編碼而臭名昭著 [^5]。

由於這些原因，除了非常臨時的目的外，使用語言的內建編碼通常是個壞主意。

### JSON、XML 及其二進位制變體 {#sec_encoding_json}

當轉向可以由許多程式語言編寫和讀取的標準化編碼時，JSON 和 XML 是顯而易見的競爭者。它們廣為人知，廣受支援，也幾乎同樣廣受詬病。XML 經常因過於冗長和不必要的複雜而受到批評 [^6]。JSON 的流行主要是由於它在 Web 瀏覽器中的內建支援以及相對於 XML 的簡單性。CSV 是另一種流行的與語言無關的格式，但它只支援表格資料而不支援巢狀。

JSON、XML 和 CSV 是文字格式，因此在某種程度上是人類可讀的（儘管語法是一個熱門的爭論話題）。除了表面的語法問題之外，它們還有一些微妙的問題：

* 數字的編碼有很多歧義。在 XML 和 CSV 中，你無法區分數字和恰好由數字組成的字串（除非引用外部模式）。JSON 區分字串和數字，但它不區分整數和浮點數，也不指定精度。

  這在處理大數字時是一個問題；例如，大於 2⁵³ 的整數無法在 IEEE 754 雙精度浮點數中精確表示，因此在使用浮點數的語言（如 JavaScript）中解析時，此類數字會變得不準確 [^7]。大於 2⁵³ 的數字的一個例子出現在 X（前身為 Twitter）上，它使用 64 位數字來識別每個帖子。API 返回的 JSON 包括帖子 ID 兩次，一次作為 JSON 數字，一次作為十進位制字串，以繞過 JavaScript 應用程式無法正確解析此類數字的問題 [^8]。
* JSON 和 XML 對 Unicode 字串（即人類可讀文字）有很好的支援，但它們不支援二進位制字串（沒有字元編碼的位元組序列）。二進位制字串是一個有用的功能，因此人們透過使用 Base64 將二進位制資料編碼為文字來繞過這個限制。然後模式用於指示該值應被解釋為 Base64 編碼。這雖然有效，但有點取巧，並且會將資料大小增加 33%。
* XML 模式和 JSON 模式功能強大，因此學習和實現起來相當複雜。由於資料的正確解釋（如數字和二進位制字串）取決於模式中的資訊，不使用 XML/JSON 模式的應用程式需要潛在地硬編碼適當的編碼/解碼邏輯。
* CSV 沒有任何模式，因此應用程式需要定義每行和每列的含義。如果應用程式更改添加了新行或列，你必須手動處理該更改。CSV 也是一種相當模糊的格式（如果值包含逗號或換行符會發生什麼？）。儘管其轉義規則已被正式指定 [^9]，但並非所有解析器都正確實現它們。

儘管存在這些缺陷，JSON、XML 和 CSV 對許多目的來說已經足夠好了。它們可能會繼續流行，特別是作為資料交換格式（即從一個組織向另一個組織傳送資料）。在這些情況下，只要人們就格式達成一致，格式有多漂亮或高效通常並不重要。讓不同組織就 *任何事情* 達成一致的困難超過了大多數其他問題。

#### JSON 模式 {#json-schema}

JSON 模式已被廣泛採用，作為系統間交換或寫入儲存時對資料建模的一種方式。你會在 Web 服務中找到 JSON 模式（參見 ["Web 服務"](#sec_web_services)）作為 OpenAPI Web 服務規範的一部分，在模式登錄檔中如 Confluent 的 Schema Registry 和 Red Hat 的 Apicurio Registry，以及在資料庫中如 PostgreSQL 的 pg_jsonschema 驗證器擴充套件和 MongoDB 的 `$jsonSchema` 驗證器語法。

JSON 模式規範提供了許多功能。模式包括標準原始型別，包括字串、數字、整數、物件、陣列、布林值或空值。但 JSON 模式還提供了一個單獨的驗證規範，允許開發人員在欄位上疊加約束。例如，`port` 欄位可能具有最小值 1 和最大值 65535。

JSON 模式可以具有開放或封閉的內容模型。開放內容模型允許模式中未定義的任何欄位以任何資料型別存在，而封閉內容模型只允許顯式定義的欄位。JSON 模式中的開放內容模型在 `additionalProperties` 設定為 `true` 時啟用，這是預設值。因此，JSON 模式通常是對 *不允許* 內容的定義（即，任何已定義欄位上的無效值），而不是對模式中 *允許* 內容的定義。

開放內容模型功能強大，但可能很複雜。例如，假設你想定義一個從整數（如 ID）到字串的對映。JSON 沒有對映或字典型別，只有一個可以包含字串鍵和任何型別值的"物件"型別。然後，你可以使用 JSON 模式約束此型別，使鍵只能包含數字，值只能是字串，使用 `patternProperties` 和 `additionalProperties`，如 [示例 5-1](#fig_encoding_json_schema) 所示。


{{< figure id="fig_encoding_json_schema" title="示例 5-1. 具有整數鍵和字串值的示例 JSON 模式。整數鍵表示為僅包含整數的字串，因為 JSON 模式要求所有鍵都是字串。" class="w-full my-4" >}}

```json
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "patternProperties": {
        "^[0-9]+$": {
            "type": "string"
        }
    },
    "additionalProperties": false
}
```

除了開放和封閉內容模型以及驗證器之外，JSON 模式還支援條件 if/else 模式邏輯、命名型別、對遠端模式的引用等等。所有這些都構成了一種非常強大的模式語言。這些功能也使定義變得笨重。解析遠端模式、推理條件規則或以向前或向後相容的方式演化模式可能具有挑戰性 [^10]。類似的問題也適用於 XML 模式 [^11]。

#### 二進位制編碼 {#binary-encoding}

JSON 比 XML 更簡潔，但與二進位制格式相比，兩者仍然使用大量空間。這一觀察導致了大量 JSON 二進位制編碼（MessagePack、CBOR、BSON、BJSON、UBJSON、BISON、Hessian 和 Smile 等等）和 XML 二進位制編碼（例如 WBXML 和 Fast Infoset）的發展。這些格式已在各種利基市場中被採用，因為它們更緊湊，有時解析速度更快，但它們都沒有像 JSON 和 XML 的文字版本那樣被廣泛採用 [^12]。

其中一些格式擴充套件了資料型別集（例如，區分整數和浮點數，或新增對二進位制字串的支援），但除此之外，它們保持 JSON/XML 資料模型不變。特別是，由於它們不規定模式，因此需要在編碼資料中包含所有物件欄位名稱。也就是說，在 [示例 5-2](#fig_encoding_json) 中的 JSON 文件的二進位制編碼中，它們需要在某處包含字串 `userName`、`favoriteNumber` 和 `interests`。

{{< figure id="fig_encoding_json" title="示例 5-2. 本章中我們將以幾種二進位制格式編碼的示例記錄" class="w-full my-4" >}}

```json
{
    "userName": "Martin",
    "favoriteNumber": 1337,
    "interests": ["daydreaming", "hacking"]
}
```

讓我們看一個 MessagePack 的例子，它是 JSON 的二進位制編碼。[圖 5-2](#fig_encoding_messagepack) 顯示了如果你使用 MessagePack 編碼 [示例 5-2](#fig_encoding_json) 中的 JSON 文件所得到的位元組序列。前幾個位元組如下：

1. 第一個位元組 `0x83` 表示接下來是一個物件（前四位 = `0x80`），有三個欄位（後四位 = `0x03`）。（你可能想知道，如果物件有超過 15 個欄位、以至於欄位數無法裝入四位時會發生什麼：這時它會獲得不同的型別指示符，欄位數會以兩個或四個位元組編碼。）
2. 第二個位元組 `0xa8` 表示接下來是一個字串（前四位 = `0xa0`），長度為八個位元組（後四位 = `0x08`）。
3. 接下來的八個位元組是 ASCII 格式的欄位名 `userName`。由於之前已經指示了長度，因此不需要任何標記來告訴我們字串在哪裡結束（或任何轉義）。
4. 接下來的七個位元組使用字首 `0xa6` 編碼六個字母的字串值 `Martin`，依此類推。

二進位制編碼長度為 66 位元組，僅比文字 JSON 編碼（去除空格後）佔用的 81 位元組少一點。所有 JSON 的二進位制編碼在這方面都是相似的。目前尚不清楚這種小的空間減少（以及可能的解析速度提升）是否值得失去人類可讀性。

在接下來的部分中，我們將看到如何做得更好，將相同的記錄編碼為僅 32 位元組。

{{< figure link="#fig_encoding_json" src="/fig/ddia_0502.png" id="fig_encoding_messagepack" caption="圖 5-2. 使用 MessagePack 編碼的示例記錄（示例 5-2）。" class="w-full my-4" >}}


### Protocol Buffers {#sec_encoding_protobuf}

Protocol Buffers (protobuf) 是 Google 開發的二進位制編碼庫。它類似於 Apache Thrift，後者最初由 Facebook 開發 [^13]；本節關於 Protocol Buffers 的大部分內容也適用於 Thrift。

Protocol Buffers 需要為任何編碼的資料提供模式。要在 Protocol Buffers 中編碼 [示例 5-2](#fig_encoding_json) 中的資料，你需要像這樣在 Protocol Buffers 介面定義語言（IDL）中描述模式：

```protobuf
syntax = "proto3";

message Person {
    string user_name = 1;
    int64 favorite_number = 2;
    repeated string interests = 3;
}
```

Protocol Buffers 附帶了一個程式碼生成工具，它接受像這裡顯示的模式定義，並生成以各種程式語言實現該模式的類。你的應用程式程式碼可以呼叫此生成的程式碼來編碼或解碼模式的記錄。使用 Protocol Buffers 編碼器編碼 [示例 5-2](#fig_encoding_json) 需要 33 位元組，如 [圖 5-3](#fig_encoding_protobuf) 所示 [^14]。

{{< figure src="/fig/ddia_0503.png" id="fig_encoding_protobuf" caption="圖 5-3. 使用 Protocol Buffers 編碼的示例記錄。" class="w-full my-4" >}}


與 [圖 5-2](#fig_encoding_messagepack) 類似，每個欄位都有一個型別註釋（指示它是字串、整數等）以及必要時的長度指示（例如字串的長度）。資料中出現的字串（"Martin"、"daydreaming"、"hacking"）也編碼為 ASCII（準確地說是 UTF-8），與之前類似。

與 [圖 5-2](#fig_encoding_messagepack) 相比的最大區別是沒有欄位名（`userName`、`favoriteNumber`、`interests`）。相反，編碼資料包含 *欄位標籤*，即數字（`1`、`2` 和 `3`）。這些是模式定義中出現的數字。欄位標籤就像欄位的別名——它們是說明我們正在談論哪個欄位的緊湊方式，而無需拼寫欄位名。

如你所見，Protocol Buffers 透過將欄位型別和標籤號打包到單個位元組中來節省更多空間。它使用可變長度整數：數字 1337 編碼為兩個位元組，每個位元組的最高位用於指示是否還有更多位元組要來。這意味著 -64 到 63 之間的數字以一個位元組編碼，-8192 到 8191 之間的數字以兩個位元組編碼，等等。更大的數字使用更多位元組。

Protocol Buffers 沒有顯式的列表或陣列資料型別。相反，`interests` 欄位上的 `repeated` 修飾符表示該欄位包含值列表，而不是單個值。在二進位制編碼中，列表元素只是簡單地表示為同一記錄中相同欄位標籤的重複出現。

#### 欄位標籤與模式演化 {#field-tags-and-schema-evolution}

我們之前說過，模式不可避免地需要隨時間而變化。我們稱之為 *模式演化*。Protocol Buffers 如何在保持向後和向前相容性的同時處理模式更改？

從示例中可以看出，編碼記錄只是其編碼欄位的串聯。每個欄位由其標籤號（示例模式中的數字 `1`、`2`、`3`）標識，並帶有資料型別註釋（例如字串或整數）。如果未設定欄位值，則它會從編碼記錄中省略。由此可以看出，欄位標籤對編碼資料的含義至關重要。你可以更改模式中欄位的名稱，因為編碼資料從不引用欄位名，但你不能更改欄位的標籤，因為這會使所有現有的編碼資料無效。

你可以向模式新增新欄位，前提是你為每個欄位提供新的標籤號。如果舊程式碼（不知道你新增的新標籤號）嘗試讀取由新程式碼寫入的資料（包括具有它不識別的標籤號的新欄位），它可以簡單地忽略該欄位。資料型別註釋允許解析器確定需要跳過多少位元組，並保留未知欄位以避免 [圖 5-1](#fig_encoding_preserve_field) 中的問題。這保持了向前相容性：舊程式碼可以讀取由新程式碼編寫的記錄。

向後相容性呢？只要每個欄位都有唯一的標籤號，新程式碼總是可以讀取舊資料，因為標籤號仍然具有相同的含義。如果在新模式中添加了欄位，而你讀取尚未包含該欄位的舊資料，則它將填充預設值（例如，如果欄位型別為字串，則為空字串；如果是數字，則為零）。

刪除欄位就像新增欄位一樣，只是向後和向前相容性方面的問題正好互換。你永遠不能再次使用相同的標籤號，因為某處可能仍然存有包含舊標籤號的資料，而新程式碼必須忽略該欄位。可以在模式定義中保留過去使用過的標籤號，以確保它們不會被遺忘。

更改欄位的資料型別呢？這在某些型別上是可能的——請檢視文件瞭解詳細資訊——但存在值被截斷的風險。例如，假設你將 32 位整數更改為 64 位整數。新程式碼可以輕鬆讀取舊程式碼寫入的資料，因為解析器可以用零填充任何缺失的位。但是，如果舊程式碼讀取新程式碼寫入的資料，則舊程式碼仍然使用 32 位變數來儲存該值。如果解碼的 64 位值無法裝入 32 位，它將被截斷。

### Avro {#sec_encoding_avro}

Apache Avro 是另一種二進位制編碼格式，與 Protocol Buffers 有著有趣的不同。它於 2009 年作為 Hadoop 的子專案啟動，因為 Protocol Buffers 不太適合 Hadoop 的用例 [^15]。

Avro 也使用模式來指定正在編碼的資料的結構。它有兩種模式語言：一種（Avro IDL）用於人工編輯，另一種（基於 JSON）更容易被機器讀取。與 Protocol Buffers 一樣，此模式語言僅指定欄位及其型別，而不像 JSON 模式那樣指定複雜的驗證規則。

我們的示例模式，用 Avro IDL 編寫，可能如下所示：

```c
record Person {
    string                  userName;
    union { null, long }    favoriteNumber = null;
    array<string>           interests;
}
```

該模式的等效 JSON 表示如下：

```json
{
    "type": "record",
    "name": "Person",
    "fields": [
        {"name": "userName",        "type": "string"},
        {"name": "favoriteNumber",  "type": ["null", "long"], "default": null},
        {"name": "interests",       "type": {"type": "array", "items": "string"}}
    ]
}
```

首先，請注意模式中沒有標籤號。如果我們使用此模式編碼示例記錄（[示例 5-2](#fig_encoding_json)），Avro 二進位制編碼只有 32 位元組長——是我們看到的所有編碼中最緊湊的。編碼位元組序列的分解如 [圖 5-4](#fig_encoding_avro) 所示。

如果你檢查位元組序列，你會發現沒有任何東西來標識欄位或其資料型別。編碼只是由串聯在一起的值組成。字串只是一個長度字首，後跟 UTF-8 位元組，但編碼資料中沒有任何內容告訴你它是字串。它也可能是整數，或完全是其他東西。整數使用可變長度編碼進行編碼。

{{< figure src="/fig/ddia_0504.png" id="fig_encoding_avro" caption="圖 5-4. 使用 Avro 編碼的示例記錄。" class="w-full my-4" >}}


要解析二進位制資料，你需要按照模式中出現的欄位順序進行遍歷，並使用模式告訴你每個欄位的資料型別。這意味著只有當讀取資料的程式碼使用與寫入資料的程式碼 *完全相同的模式* 時，二進位制資料才能被正確解碼。讀取者和寫入者之間的任何模式不匹配都意味著資料會被錯誤解碼。

那麼，Avro 如何支援模式演化？

#### 寫入者模式與讀取者模式 {#the-writers-schema-and-the-readers-schema}

當應用程式想要編碼一些資料（將其寫入檔案或資料庫，透過網路傳送等）時，它使用它知道的任何版本的模式對資料進行編碼——例如，該模式可能被編譯到應用程式中。這被稱為 *寫入者模式*。

當應用程式想要解碼一些資料（從檔案或資料庫讀取，從網路接收等）時，它使用兩個模式：與用於編碼相同的寫入者模式，以及 *讀取者模式*，後者可能不同。這在 [圖 5-5](#fig_encoding_avro_schemas) 中說明。讀取者模式定義了應用程式程式碼期望的每條記錄的欄位及其型別。

{{< figure src="/fig/ddia_0505.png" id="fig_encoding_avro_schemas" caption="圖 5-5. 在 Protocol Buffers 中，編碼和解碼可以使用不同版本的模式。在 Avro 中，解碼使用兩個模式：寫入者模式必須與用於編碼的模式相同，但讀取者模式可以是較舊或較新的版本。" class="w-full my-4" >}}

如果讀取者模式和寫入者模式相同，解碼很容易。如果它們不同，Avro 透過並排檢視寫入者模式和讀取者模式並將資料從寫入者模式轉換為讀取者模式來解決差異。Avro 規範 [^16] [^17] 準確定義了此解析的工作方式，並在 [圖 5-6](#fig_encoding_avro_resolution) 中進行了說明。

例如，如果寫入者模式和讀取者模式的欄位順序不同，這沒有問題，因為模式解析透過欄位名匹配欄位。如果讀取資料的程式碼遇到出現在寫入者模式中但不在讀取者模式中的欄位，它將被忽略。如果讀取資料的程式碼期望某個欄位，但寫入者模式不包含該名稱的欄位，則使用讀取者模式中宣告的預設值填充它。

{{< figure src="/fig/ddia_0506.png" id="fig_encoding_avro_resolution" caption="圖 5-6. Avro 讀取器解決寫入者模式和讀取者模式之間的差異。" class="w-full my-4" >}}

#### 模式演化規則 {#schema-evolution-rules}

使用 Avro，向前相容性意味著你可以將新版本的模式作為寫入者，將舊版本的模式作為讀取者。相反，向後相容性意味著你可以將新版本的模式作為讀取者，將舊版本作為寫入者。

為了保持相容性，你只能新增或刪除具有預設值的欄位。（我們的 Avro 模式中的 `favoriteNumber` 欄位的預設值為 `null`。）例如，假設你添加了一個具有預設值的欄位，因此這個新欄位存在於新模式中但不在舊模式中。當使用新模式的讀取者讀取使用舊模式編寫的記錄時，將為缺失的欄位填充預設值。

如果你要新增一個沒有預設值的欄位，新讀取者將無法讀取舊寫入者寫入的資料，因此你會破壞向後相容性。如果你要刪除一個沒有預設值的欄位，舊讀取者將無法讀取新寫入者寫入的資料，因此你會破壞向前相容性。

在某些程式語言中，`null` 是任何變數的可接受預設值，但在 Avro 中不是這樣：如果你想允許欄位為 null，你必須使用 *聯合型別*。例如，`union { null, long, string } field;` 表示 `field` 可以是數字、字串或 null。只有當 `null` 是聯合的第一個分支時，你才能將其用作預設值。這比預設情況下一切都可為空更冗長一些，但它透過明確什麼可以和不能為 null 來幫助防止錯誤 [^18]。

更改欄位的資料型別是可能的，前提是 Avro 可以轉換該型別。更改欄位的名稱是可能的，但有點棘手：讀取者模式可以包含欄位名的別名，因此它可以將舊寫入者的模式欄位名與別名匹配。這意味著更改欄位名是向後相容的，但不是向前相容的。同樣，向聯合型別新增分支是向後相容的，但不是向前相容的。

#### 但什麼是寫入者模式？ {#but-what-is-the-writers-schema}

到目前為止，我們忽略了一個重要問題：讀取者如何知道特定資料是用哪個寫入者模式編碼的？我們不能只在每條記錄中包含整個模式，因為模式可能比編碼資料大得多，使二進位制編碼節省的所有空間都白費了。

答案取決於 Avro 的使用環境。舉幾個例子：

包含大量記錄的大檔案
: Avro 的一個常見用途是儲存包含數百萬條記錄的大檔案，所有記錄都使用相同的模式編碼。（我們將在 [第 11 章](/tw/ch11#ch_batch) 討論這種情況。）在這種情況下，該檔案的寫入者可以在檔案開頭只包含一次寫入者模式。Avro 指定了一種檔案格式（物件容器檔案）來執行此操作。

具有單獨寫入記錄的資料庫
: 在資料庫中，不同的記錄可能在不同的時間點使用不同的寫入者模式編寫——你不能假定所有記錄都具有相同的模式。最簡單的解決方案是在每個編碼記錄的開頭包含一個版本號，並在資料庫中保留模式版本列表。讀取者可以獲取記錄，提取版本號，然後從資料庫中獲取該版本號的寫入者模式。使用該寫入者模式，它可以解碼記錄的其餘部分。

  例如，Apache Kafka 的 Confluent 模式登錄檔 [^19] 和 LinkedIn 的 Espresso [^20] 就是這樣工作的。

透過網路連線傳送記錄
: 當兩個程序透過雙向網路連線進行通訊時，它們可以在連線設定時協商模式版本，然後在連線的生命週期內使用該模式。Avro RPC 協議（參見 ["流經服務的資料流：REST 與 RPC"](#sec_encoding_dataflow_rpc)）就是這樣工作的。

無論如何，模式版本資料庫都是有用的，因為它充當文件並讓你有機會檢查模式相容性 [^21]。作為版本號，你可以使用簡單的遞增整數，或者可以使用模式的雜湊值。

#### 動態生成的模式 {#dynamically-generated-schemas}

與 Protocol Buffers 相比，Avro 方法的一個優點是模式不包含任何標籤號。但為什麼這很重要？在模式中保留幾個數字有什麼問題？

區別在於 Avro 對 *動態生成* 的模式更友好。例如，假設你有一個關係資料庫，其內容你想要轉儲到檔案中，並且你想要使用二進位制格式來避免前面提到的文字格式（JSON、CSV、XML）的問題。如果你使用 Avro，你可以相當容易地從關係模式生成 Avro 模式（我們之前看到的 JSON 表示），並使用該模式對資料庫內容進行編碼，將其全部轉儲到 Avro 物件容器檔案中 [^22]。你可以為每個資料庫表生成記錄模式，每列成為該記錄中的一個欄位。資料庫中的列名對映到 Avro 中的欄位名。

現在，如果資料庫模式發生變化（例如，表添加了一列並刪除了一列），你可以從更新的資料庫模式生成新的 Avro 模式，並以新的 Avro 模式匯出資料。資料匯出過程不需要關注模式更改——它可以在每次執行時簡單地進行模式轉換。讀取新資料檔案的任何人都會看到記錄的欄位已更改，但由於欄位是按名稱標識的，因此更新的寫入者模式仍然可以與舊的讀取者模式匹配。

相比之下，如果你為此目的使用 Protocol Buffers，欄位標籤可能必須手動分配：每次資料庫模式更改時，管理員都必須手動更新從資料庫列名到欄位標籤的對映。（這可能是可以自動化的，但模式生成器必須非常小心，不要分配以前使用過的欄位標籤。）這種動態生成的模式根本不是 Protocol Buffers 的設計目標，而 Avro 則是。

### 模式的優點 {#sec_encoding_schemas}

正如我們所見，Protocol Buffers 和 Avro 都使用模式來描述二進位制編碼格式。它們的模式語言比 XML 模式或 JSON 模式簡單得多，後者支援更詳細的驗證規則（例如，"此欄位的字串值必須與此正則表示式匹配"或"此欄位的整數值必須在 0 到 100 之間"）。由於 Protocol Buffers 和 Avro 在實現和使用上都更簡單，它們已經發展到支援相當廣泛的程式語言。

這些編碼所基於的想法絕不是新的。例如，它們與 ASN.1 有很多共同之處，ASN.1 是 1984 年首次標準化的模式定義語言 [^23] [^24]。它用於定義各種網路協議，例如其二進位制編碼（DER）至今仍用於編碼 SSL 證書（X.509）[^25]。ASN.1 支援使用標籤號的模式演化，類似於 Protocol Buffers [^26]。然而，它也非常複雜且文件記錄不佳，因此 ASN.1 可能不是新應用程式的好選擇。

許多資料系統也為其資料實現某種專有二進位制編碼。例如，大多數關係資料庫都有一個網路協議，你可以透過它向資料庫傳送查詢並獲取響應。這些協議通常特定於特定資料庫，資料庫供應商提供驅動程式（例如，使用 ODBC 或 JDBC API），將資料庫網路協議的響應解碼為記憶體資料結構。

因此，我們可以看到，儘管文字資料格式（如 JSON、XML 和 CSV）廣泛存在，但基於模式的二進位制編碼也是一個可行的選擇。它們具有許多良好的屬性：

* 它們可以比各種"二進位制 JSON"變體緊湊得多，因為它們可以從編碼資料中省略欄位名。
* 模式是一種有價值的文件形式，並且由於解碼需要模式，因此你可以確保它是最新的（而手動維護的文件很容易與現實脫節）。
* 保留模式資料庫允許你在部署任何內容之前檢查模式更改的向前和向後相容性。
* 對於靜態型別程式語言的使用者，從模式生成程式碼的能力很有用，因為它可以在編譯時進行型別檢查。

總之，模式演化允許與無模式/讀時模式 JSON 資料庫相同的靈活性（參見 ["文件模型中的模式靈活性"](/tw/ch3#sec_datamodels_schema_flexibility)），同時還提供更好的資料保證和更好的工具。

## 資料流的模式 {#sec_encoding_dataflow}

在本章開頭，我們說過，當你想要將一些資料傳送到與你不共享記憶體的另一個程序時——例如，當你想要透過網路傳送資料或將其寫入檔案時——你需要將其編碼為位元組序列。然後，我們討論了用於執行此操作的各種不同編碼。

我們討論了向前和向後相容性，這對可演化性很重要（透過允許你獨立升級系統的不同部分，而不必一次更改所有內容，使更改變得容易）。相容性是編碼資料的一個程序與解碼資料的另一個程序之間的關係。

這是一個相當抽象的想法——資料可以透過許多方式從一個程序流向另一個程序。誰編碼資料，誰解碼資料？在本章的其餘部分，我們將探討資料在程序之間流動的一些最常見方式：

* 透過資料庫（參見 ["流經資料庫的資料流"](#sec_encoding_dataflow_db)）
* 透過服務呼叫（參見 ["流經服務的資料流：REST 與 RPC"](#sec_encoding_dataflow_rpc)）
* 透過工作流引擎（參見 ["持久化執行與工作流"](#sec_encoding_dataflow_workflows)）
* 透過非同步訊息（參見 ["事件驅動的架構"](#sec_encoding_dataflow_msg)）

### 流經資料庫的資料流 {#sec_encoding_dataflow_db}

在資料庫中，寫入資料庫的程序對資料進行編碼，從資料庫讀取的程序對其進行解碼。可能只有一個程序訪問資料庫，此時讀取者只是同一程序的後續版本——在這種情況下，你可以將在資料庫中儲存某些內容視為 *向未來的自己傳送訊息*。

向後相容性在這裡顯然是必要的；否則你未來的自己將無法解碼你之前寫的內容。

一般來說，幾個不同的程序同時訪問資料庫是很常見的。這些程序可能是幾個不同的應用程式或服務，或者它們可能只是同一服務的幾個例項（為了可伸縮性或容錯而並行執行）。無論哪種方式，在應用程式正在更改的環境中，某些訪問資料庫的程序可能正在執行較新的程式碼，而某些程序正在執行較舊的程式碼——例如，因為新版本當前正在滾動升級中部署，因此某些例項已更新，而其他例項尚未更新。

這意味著資料庫中的值可能由 *較新* 版本的程式碼寫入，隨後由仍在執行的 *較舊* 版本的程式碼讀取。因此，資料庫通常也需要向前相容性。

#### 不同時間寫入的不同值 {#different-values-written-at-different-times}

資料庫通常允許在任何時間更新任何值。這意味著在單個數據庫中，你可能有一些五毫秒前寫入的值，以及一些五年前寫入的值。

當你部署應用程式的新版本時（至少是服務端應用程式），你可能會在幾分鐘內用新版本完全替換舊版本。資料庫內容並非如此：五年前的資料仍然存在，採用原始編碼，除非你自那時以來明確重寫了它。這種觀察有時被總結為 *資料比程式碼更長壽*。

將資料重寫（*遷移*）為新模式當然是可能的，但在大型資料集上這是一件昂貴的事情，因此大多數資料庫儘可能避免它。大多數關係資料庫允許簡單的模式更改，例如新增具有 `null` 預設值的新列，而無需重寫現有資料。讀取舊行時，對於磁碟上編碼資料中缺少的任何列，資料庫會為其填充 `null`。因此，模式演化允許整個資料庫看起來好像是用單個模式編碼的，即使底層儲存可能包含用各種歷史版本的模式編碼的記錄。

更複雜的模式更改——例如，將單值屬性更改為多值，或將某些資料移動到單獨的表中——仍然需要重寫資料，通常在應用程式級別 [^27]。在此類遷移中保持向前和向後相容性仍然是一個研究問題 [^28]。

#### 歸檔儲存 {#archival-storage}

也許你會不時對資料庫進行快照，例如用於備份目的或載入到資料倉庫中（參見 ["資料倉庫"](/tw/ch1#sec_introduction_dwh)）。在這種情況下，資料轉儲通常將使用最新模式進行編碼，即使源資料庫中的原始編碼包含來自不同時代的模式版本的混合。由於你無論如何都在複製資料，因此你不妨一致地對資料副本進行編碼。

由於資料轉儲是一次性寫入的，此後是不可變的，因此像 Avro 物件容器檔案這樣的格式非常適合。這也是將資料編碼為分析友好的列式格式（如 Parquet）的好機會（參見 ["列壓縮"](/tw/ch4#sec_storage_column_compression)）。

在 [第 11 章](/tw/ch11#ch_batch) 中，我們將更多地討論如何使用歸檔儲存中的資料。

### 流經服務的資料流：REST 與 RPC {#sec_encoding_dataflow_rpc}

當你有需要透過網路進行通訊的程序時，有幾種不同的方式來安排這種通訊。最常見的安排是有兩個角色：*客戶端* 和 *伺服器*。伺服器透過網路公開 API，客戶端可以連線到伺服器以向該 API 發出請求。伺服器公開的 API 稱為 *服務*。

Web 就是這樣工作的：客戶端（Web 瀏覽器）向 Web 伺服器發出請求，發出 `GET` 請求以下載 HTML、CSS、JavaScript、影像等，並發出 `POST` 請求以向伺服器提交資料。API 由一組標準化的協議和資料格式（HTTP、URL、SSL/TLS、HTML 等）組成。由於 Web 瀏覽器、Web 伺服器和網站作者大多同意這些標準，因此你可以使用任何 Web 瀏覽器訪問任何網站（至少在理論上！）。

Web 瀏覽器不是唯一型別的客戶端。例如，在移動裝置和桌面計算機上執行的原生應用程式通常也與伺服器通訊，在 Web 瀏覽器內執行的客戶端 JavaScript 應用程式也可以發出 HTTP 請求。在這種情況下，伺服器的響應通常不是用於向人顯示的 HTML，而是以便於客戶端應用程式程式碼進一步處理的編碼資料（最常見的是 JSON）。儘管 HTTP 可能用作傳輸協議，但在其之上實現的 API 是特定於應用程式的，客戶端和伺服器需要就該 API 的詳細資訊達成一致。

在某些方面，服務類似於資料庫：它們通常允許客戶端提交和查詢資料。但是，雖然資料庫允許使用我們在 [第 3 章](/tw/ch3#ch_datamodels) 中討論的查詢語言進行任意查詢，但服務公開了一個特定於應用程式的 API，該 API 僅允許由服務的業務邏輯（應用程式程式碼）預先確定的輸入和輸出 [^29]。這種限制提供了一定程度的封裝：服務可以對客戶端可以做什麼和不能做什麼施加細粒度的限制。

面向服務/微服務架構的一個關鍵設計目標是透過使服務可獨立部署和演化來使應用程式更容易更改和維護。一個常見的原則是每個服務應該由一個團隊擁有，該團隊應該能夠頻繁釋出服務的新版本，而無需與其他團隊協調。因此，我們應該期望伺服器和客戶端的新舊版本同時執行，因此伺服器和客戶端使用的資料編碼必須在服務 API 的各個版本之間相容。

#### Web 服務 {#sec_web_services}

當 HTTP 用作與服務通訊的底層協議時，它被稱為 *Web 服務*。Web 服務通常用於構建面向服務或微服務架構（在 ["微服務與 Serverless"](/tw/ch1#sec_introduction_microservices) 中討論過）。術語"Web 服務"可能有點用詞不當，因為 Web 服務不僅用於 Web，還用於幾種不同的上下文。例如：

1. 在使用者裝置上執行的客戶端應用程式（例如，移動裝置上的原生應用程式，或瀏覽器中的 JavaScript Web 應用程式）向服務發出 HTTP 請求。這些請求通常透過公共網際網路進行。
2. 一個服務向同一組織擁有的另一個服務發出請求，通常位於同一資料中心內，作為面向服務/微服務架構的一部分。
3. 一個服務向不同組織擁有的服務發出請求，通常透過網際網路。這用於不同組織後端系統之間的資料交換。此類別包括線上服務提供的公共 API，例如信用卡處理系統或用於共享訪問使用者資料的 OAuth。

最流行的服務設計理念是 REST，它建立在 HTTP 的原則之上 [^30] [^31]。它強調簡單的資料格式，使用 URL 來標識資源，並使用 HTTP 功能進行快取控制、身份驗證和內容型別協商。根據 REST 原則設計的 API 稱為 *RESTful*。

需要呼叫 Web 服務 API 的程式碼必須知道要查詢哪個 HTTP 端點，以及傳送什麼資料格式以及預期的響應。即使服務採用 RESTful 設計原則，客戶端也需要以某種方式找出這些詳細資訊。服務開發人員通常使用介面定義語言（IDL）來定義和記錄其服務的 API 端點和資料模型，並隨著時間的推移演化它們。然後，其他開發人員可以使用服務定義來確定如何查詢服務。兩種最流行的服務 IDL 是 OpenAPI（也稱為 Swagger [^32]）和 gRPC。OpenAPI 用於傳送和接收 JSON 資料的 Web 服務，而 gRPC 服務傳送和接收 Protocol Buffers。

開發人員通常用 JSON 或 YAML 編寫 OpenAPI 服務定義；參見 [示例 5-3](#fig_open_api_def)。服務定義允許開發人員定義服務端點、文件、版本、資料模型等。gRPC 定義看起來類似，但使用 Protocol Buffers 服務定義進行定義。

{{< figure id="fig_open_api_def" title="示例 5-3. YAML 中的示例 OpenAPI 服務定義" class="w-full my-4" >}}

```yaml
openapi: 3.0.0
info:
  title: Ping, Pong
  version: 1.0.0
servers:
  - url: http://localhost:8080
paths:
  /ping:
    get:
      summary: Given a ping, returns a pong message
      responses:
        '200':
          description: A pong
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    example: Pong!
```

即使採用了設計理念和 IDL，開發人員仍必須編寫實現其服務 API 呼叫的程式碼。通常採用服務框架來簡化這項工作。Spring Boot、FastAPI 和 gRPC 等服務框架允許開發人員為每個 API 端點編寫業務邏輯，而框架程式碼處理路由、指標、快取、身份驗證等。[示例 5-4](#fig_fastapi_def) 顯示了 [示例 5-3](#fig_open_api_def) 中定義的服務的示例 Python 實現。

{{< figure id="fig_fastapi_def" title="示例 5-4. 實現 [示例 5-3](#fig_open_api_def) 中定義的示例 FastAPI 服務" class="w-full my-4" >}}

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Ping, Pong", version="1.0.0")

class PongResponse(BaseModel):
    message: str = "Pong!"

# Registers a handler for GET /ping whose response model is PongResponse.
# The handler returns a PongResponse built from its defaults (message "Pong!").
@app.get("/ping", response_model=PongResponse,
         summary="Given a ping, returns a pong message")
async def ping():
    return PongResponse()
```

許多框架將服務定義和伺服器程式碼耦合在一起。在某些情況下，例如流行的 Python FastAPI 框架，伺服器是用程式碼編寫的，IDL 會自動生成。在其他情況下，例如 gRPC，首先編寫服務定義，然後生成伺服器程式碼腳手架。兩種方法都允許開發人員從服務定義生成各種語言的客戶端庫和 SDK。除了程式碼生成之外，Swagger 等 IDL 工具還可以生成文件、驗證模式更改相容性，並為開發人員提供查詢和測試服務的圖形使用者介面。

#### 遠端過程呼叫（RPC）的問題 {#sec_problems_with_rpc}

Web 服務只是透過網路進行 API 請求的一長串技術的最新化身，其中許多技術獲得了大量炒作但存在嚴重問題。Enterprise JavaBeans (EJB) 和 Java 的遠端方法呼叫 (RMI) 僅限於 Java。分散式元件物件模型 (DCOM) 僅限於 Microsoft 平臺。公共物件請求代理架構 (CORBA) 過於複雜，並且不提供向後或向前相容性 [^33]。SOAP 和 WS-\* Web 服務框架旨在提供跨供應商的互操作性，但也受到複雜性和相容性問題的困擾 [^34] [^35] [^36]。

所有這些都基於 *遠端過程呼叫* (RPC) 的想法，這個想法自 1970 年代以來就存在了 [^37]。RPC 模型試圖使向遠端網路服務發出的請求，看起來與在同一程序內呼叫程式語言中的函式或方法相同（這種抽象稱為 *位置透明性*）。儘管 RPC 起初似乎很方便，但這種方法從根本上是有缺陷的 [^38] [^39]。網路請求與本地函式呼叫非常不同：

* 本地函式呼叫是可預測的，要麼成功要麼失敗，僅取決於你控制的引數。網路請求是不可預測的：由於網路問題，請求或響應可能會丟失，或者遠端機器可能速度慢或不可用，而這些問題完全超出了你的控制。網路問題很常見，因此你必須預料到它們，例如透過重試失敗的請求。
* 本地函式呼叫要麼返回結果，要麼丟擲異常，要麼永不返回（因為它進入無限迴圈或程序崩潰）。網路請求有另一種可能的結果：它可能由於 *超時* 而沒有返回結果。在這種情況下，你根本不知道發生了什麼：如果你沒有從遠端服務獲得響應，你無法知道請求是否已送達。（我們在 [第 9 章](/tw/ch9#ch_distributed) 中更詳細地討論了這個問題。）
* 如果你重試失敗的網路請求，可能會發生前一個請求實際上已經成功，只是響應丟失了。在這種情況下，重試將導致操作執行多次，除非你在協議中構建去重機制（*冪等性*）[^40]。本地函式呼叫沒有這個問題。（我們在 [“冪等性”](/tw/ch12#sec_stream_idempotence) 中更詳細地討論冪等性。）
* 每次呼叫本地函式時，通常需要大約相同的時間來執行。網路請求比函式呼叫慢得多，其延遲也變化很大：在良好的時候，它可能在不到一毫秒內完成，但當網路擁塞或遠端服務過載時，執行完全相同的操作可能需要許多秒。
* 當你呼叫本地函式時，你可以有效地將引用（指標）傳遞給本地記憶體中的物件。當你發出網路請求時，所有這些引數都需要編碼為可以透過網路傳送的位元組序列。如果引數是不可變的原語，如數字或短字串，那沒問題，但對於更大量的資料和可變物件，它很快就會出現問題。
* 客戶端和服務可能以不同的程式語言實現，因此 RPC 框架必須將資料型別從一種語言轉換為另一種語言。這可能會變得很醜陋，因為並非所有語言都具有相同的型別——例如，回想一下 JavaScript 處理大於 2⁵³ 的數字的問題（參見 ["JSON、XML 及其二進位制變體"](#sec_encoding_json)）。單一語言編寫的單個程序中不存在此問題。

所有這些因素意味著，試圖讓遠端服務看起來太像程式語言中的本地物件是沒有意義的，因為它是根本不同的東西。REST 的部分吸引力在於它將網路上的狀態傳輸視為與函式呼叫不同的過程。

#### 負載均衡器、服務發現和服務網格 {#sec_encoding_service_discovery}

所有服務都透過網路進行通訊。因此，客戶端必須知道它正在連線的服務的地址——這個問題稱為 *服務發現*。最簡單的方法是配置客戶端連線到執行服務的 IP 地址和埠。此配置可以工作，但如果伺服器離線、轉移到新機器或變得過載，則必須手動重新配置客戶端。

為了提供更高的可用性和可伸縮性，通常在不同的機器上執行服務的多個例項，其中任何一個都可以處理傳入的請求。將請求分散到這些例項上稱為 *負載均衡* [^41]。有許多負載均衡和服務發現解決方案可用：

* *硬體負載均衡器* 是安裝在資料中心的專用裝置。它們允許客戶端連線到單個主機和埠，傳入連線被路由到執行服務的伺服器之一。此類負載均衡器在連線到下游伺服器時檢測網路故障，並將流量轉移到其他伺服器。
* *軟體負載均衡器* 的行為方式與硬體負載均衡器大致相同。但是，軟體負載均衡器（如 Nginx 和 HAProxy）不需要特殊裝置，而是可以安裝在標準機器上的應用程式。
* *域名服務 (DNS)* 是當你開啟網頁時在網際網路上解析域名的方式。它透過允許多個 IP 地址與單個域名關聯來支援負載均衡。然後，客戶端可以配置為使用域名而不是 IP 地址連線到服務，並且客戶端的網路層在建立連線時選擇要使用的 IP 地址。這種方法的一個缺點是 DNS 旨在在較長時間內傳播更改並快取 DNS 條目。如果伺服器頻繁啟動、停止或移動，客戶端可能會看到不再有伺服器執行的陳舊 IP 地址。
* *服務發現系統* 使用集中式登錄檔而不是 DNS 來跟蹤哪些服務端點可用。當新服務例項啟動時，它透過宣告它正在偵聽的主機和埠以及相關元資料（如分片所有權資訊（參見 [第 7 章](/tw/ch7#ch_sharding)）、資料中心位置等）向服務發現系統註冊自己。然後，服務定期向發現系統傳送心跳訊號，以表明服務仍然可用。

  當客戶端希望連線到服務時，它首先查詢發現系統以獲取可用端點列表，然後直接連線到端點。與 DNS 相比，服務發現支援服務例項頻繁更改的更動態環境。發現系統還為客戶端提供有關它們正在連線的服務的更多元資料，這使客戶端能夠做出更智慧的負載均衡決策。
* *服務網格* 是一種複雜的負載均衡形式，它結合了軟體負載均衡器和服務發現。與在單獨機器上執行的傳統軟體負載均衡器不同，服務網格負載均衡器通常作為程序內客戶端庫或作為客戶端和伺服器上的程序或"邊車"容器部署。客戶端應用程式連線到它們自己的本地服務負載均衡器，該負載均衡器連線到伺服器的負載均衡器。從那裡，連線被路由到本地伺服器程序。

  雖然複雜，但這種拓撲提供了許多優勢。由於客戶端和伺服器完全透過本地連線路由，因此連線加密可以完全在負載均衡器級別處理。這使客戶端和伺服器免於處理 SSL 證書和 TLS 的複雜性。網格系統還提供複雜的可觀測性。它們可以即時跟蹤哪些服務正在相互呼叫，檢測故障，跟蹤流量負載等。

哪種解決方案合適取決於組織的需求。在使用 Kubernetes 等編排器的非常動態的服務環境中執行的組織通常選擇執行 Istio 或 Linkerd 等服務網格。專門的基礎設施（如資料庫或訊息傳遞系統）可能需要自己專門構建的負載均衡器。更簡單的部署最適合使用軟體負載均衡器。

#### RPC 的資料編碼與演化 {#data-encoding-and-evolution-for-rpc}

對於可演化性，RPC 客戶端和伺服器可以獨立更改和部署非常重要。與透過資料庫流動的資料（如上一節所述）相比，我們可以在透過服務的資料流的情況下做出簡化假設：假設所有伺服器都先更新，然後所有客戶端都更新是合理的。因此，你只需要在請求上向後相容，在響應上向前相容。

RPC 方案的向後和向前相容性屬性繼承自它使用的任何編碼：

* gRPC（Protocol Buffers）和 Avro RPC 可以根據各自編碼格式的相容性規則進行演化。
* RESTful API 最常使用 JSON 作為響應，以及 JSON 或 URI 編碼/表單編碼的請求引數作為請求。新增可選請求引數和向響應物件新增新欄位通常被認為是保持相容性的更改。

服務相容性變得更加困難，因為 RPC 通常用於跨組織邊界的通訊，因此服務提供者通常無法控制其客戶端，也無法強制它們升級。因此，相容性需要保持很長時間，也許是無限期的。如果需要破壞相容性的更改，服務提供者通常最終會並行維護服務 API 的多個版本。

關於 API 版本控制應該如何工作（即客戶端如何指示它想要使用哪個版本的 API）沒有達成一致 [^42]。對於 RESTful API，常見的方法是在 URL 中使用版本號或在 HTTP `Accept` 標頭中使用。對於使用 API 金鑰識別特定客戶端的服務，另一個選項是在伺服器上儲存客戶端請求的 API 版本，並允許透過單獨的管理介面更新此版本選擇 [^43]。

### 持久化執行與工作流 {#sec_encoding_dataflow_workflows}

根據定義，基於服務的架構具有多個服務，這些服務都負責應用程式的不同部分。考慮一個處理信用卡並將資金存入銀行賬戶的支付處理應用程式。該系統可能有不同的服務負責欺詐檢測、信用卡整合、銀行整合等。

在我們的示例中，處理單個付款需要許多服務呼叫。支付處理器服務可能會呼叫欺詐檢測服務以檢查欺詐，呼叫信用卡服務以扣除信用卡費用，並呼叫銀行服務以存入扣除的資金，如 [圖 5-7](#fig_encoding_workflow) 所示。我們將這一系列步驟稱為 *工作流*，每個步驟稱為 *任務*。工作流通常定義為任務圖。工作流定義可以用通用程式語言、領域特定語言 (DSL) 或標記語言（如業務流程執行語言 (BPEL)）[^44] 編寫。

--------

> [!TIP] 任務、活動和函式
>
> 不同的工作流引擎對任務使用不同的名稱。例如，Temporal 使用術語 *活動*。其他引擎將任務稱為 *持久函式*。雖然名稱不同，但概念是相同的。

--------

{{< figure src="/fig/ddia_0507.png" id="fig_encoding_workflow" title="圖 5-7. 使用業務流程模型和標記法 (BPMN) 表示的工作流示例，這是一種圖形標記法。" class="w-full my-4" >}}


工作流由 *工作流引擎* 執行。工作流引擎確定何時執行每個任務、任務必須在哪臺機器上執行、如果任務失敗該怎麼辦（例如，如果機器在任務執行時崩潰）、允許並行執行多少任務等。

工作流引擎通常由編排器和執行器組成。編排器負責排程要執行的任務，執行器負責執行任務。當工作流被觸發時，執行開始。如果使用者定義了基於時間的排程（例如每小時執行），則編排器會自行觸發工作流。外部源（如 Web 服務）甚至人類也可以觸發工作流執行。一旦觸發，就會呼叫執行器來執行任務。

有許多型別的工作流引擎可以滿足各種各樣的用例。有些，如 Airflow、Dagster 和 Prefect，與資料系統整合並編排 ETL 任務。其他的，如 Camunda 和 Orkes，為工作流提供圖形標記法（如 [圖 5-7](#fig_encoding_workflow) 中使用的 BPMN），以便非工程師可以更輕鬆地定義和執行工作流。還有一些，如 Temporal 和 Restate，提供 *持久化執行*。

#### 持久化執行 {#durable-execution}

持久化執行框架已成為構建需要事務性的基於服務的架構的流行方式。在我們的支付示例中，我們希望每筆付款都恰好處理一次。工作流執行期間的故障可能導致信用卡扣費，但沒有相應的銀行賬戶存款。在基於服務的架構中，我們不能簡單地將兩個任務包裝在資料庫事務中。此外，我們可能正在與我們控制有限的第三方支付閘道器進行互動。

持久化執行框架是為工作流提供 *恰好一次語義* 的一種方式。如果任務失敗，框架將重新執行該任務，但會跳過任務在失敗之前成功完成的任何 RPC 呼叫或狀態更改。相反，框架將假裝進行呼叫，但實際上將返回先前呼叫的結果。這是可能的，因為持久化執行框架將所有 RPC 和狀態更改記錄到持久儲存（如預寫日誌）[^45] [^46]。[示例 5-5](#fig_temporal_workflow) 顯示了使用 Temporal 支援持久化執行的工作流定義示例。

{{< figure id="fig_temporal_workflow" title="示例 5-5. [圖 5-7](#fig_encoding_workflow) 中支付工作流的 Temporal 工作流定義片段。" class="w-full my-4" >}}

```python
# Workflow definition for the payment process; each step is run as an activity
# through workflow.execute_activity.
@workflow.defn
class PaymentWorkflow:
    # Method decorated with @workflow.run: takes the payment request and
    # returns the payment result.
    @workflow.run
    async def run(self, payment: PaymentRequest) -> PaymentResult:
        # Step 1: run the check_fraud activity on the payment, with a
        # 15-second start-to-close timeout.
        is_fraud = await workflow.execute_activity(
            check_fraud,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        # Return early, before the card is debited, if the payment was flagged.
        if is_fraud:
            return PaymentResultFraudulent
        # Step 2: run the debit_credit_card activity, again with a 15-second
        # start-to-close timeout.
        credit_card_response = await workflow.execute_activity(
            debit_credit_card,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        # ...
```

像 Temporal 這樣的框架並非沒有挑戰。外部服務（例如我們示例中的第三方支付閘道器）仍必須提供冪等 API。開發人員必須記住為這些 API 使用唯一 ID 以防止重複執行 [^47]。由於持久化執行框架按順序記錄每個 RPC 呼叫，因此它期望後續執行以相同的順序進行相同的 RPC 呼叫。這使得程式碼更改變得脆弱：你可能僅透過重新排序函式呼叫就引入未定義的行為 [^48]。與其修改現有工作流的程式碼，不如單獨部署新版本的程式碼更安全，以便現有工作流呼叫的重新執行繼續使用舊版本，只有新呼叫使用新程式碼 [^49]。

同樣，由於持久化執行框架期望以確定性方式重放所有程式碼（相同的輸入產生相同的輸出），因此隨機數生成器或系統時鐘等非確定性程式碼會產生問題 [^48]。框架通常會為這類庫函式提供自己的確定性實現，但你必須記得使用它們。在某些情況下，例如 Temporal 的 workflowcheck 工具，框架還會提供靜態分析工具來判斷是否引入了非確定性行為。

--------

> [!NOTE]
> 使程式碼具有確定性是一個強大的想法，但要穩健地做到這一點很棘手。在 ["確定性的力量"](/tw/ch9#sidebar_distributed_determinism) 中，我們將回到這個話題。

--------

### 事件驅動的架構 {#sec_encoding_dataflow_msg}

在這最後一節中，我們將簡要介紹 *事件驅動架構*，這是編碼資料從一個程序流向另一個程序的另一種方式。請求稱為 *事件* 或 *訊息*；與 RPC 不同，傳送者通常不會等待接收者處理事件。此外，事件通常不是透過直接網路連線傳送給接收者，而是透過稱為 *訊息代理*（也稱為 *事件代理*、*訊息佇列* 或 *面向訊息的中介軟體*）的中介，它臨時儲存訊息 [^50]。

使用訊息代理與直接 RPC 相比有幾個優點：

* 如果接收者不可用或過載，它可以充當緩衝區，從而提高系統可靠性。
* 它可以自動將訊息重新傳遞給已崩潰的程序，從而防止訊息丟失。
* 它避免了服務發現的需要，因為傳送者不需要直接連線到接收者的 IP 地址。
* 它允許將相同的訊息傳送給多個接收者。
* 它在邏輯上將傳送者與接收者解耦（傳送者只是釋出訊息，不關心誰使用它們）。

透過訊息代理的通訊是 *非同步的*：傳送者不會等待訊息被傳遞，而是簡單地傳送它然後忘記它。可以透過讓傳送者在單獨的通道上等待響應來實現類似同步 RPC 的模型。

#### 訊息代理 {#message-brokers}

過去，訊息代理的格局由 TIBCO、IBM WebSphere 和 webMethods 等公司的商業企業軟體主導，然後開源實現（如 RabbitMQ、ActiveMQ、HornetQ、NATS 和 Apache Kafka）變得流行。最近，雲服務（如 Amazon Kinesis、Azure Service Bus 和 Google Cloud Pub/Sub）也獲得了採用。我們將在 [“訊息系統”](/tw/ch12#sec_stream_messaging) 中更詳細地比較它們。

詳細的傳遞語義因實現和配置而異，但通常，最常使用兩種訊息分發模式：

* 一個程序將訊息新增到命名 *佇列*，代理將該訊息傳遞給該佇列的 *消費者*。如果有多個消費者，其中一個會收到訊息。
* 一個程序將訊息釋出到命名 *主題*，代理將該訊息傳遞給該主題的所有 *訂閱者*。如果有多個訂閱者，他們都會收到訊息。

訊息代理通常不強制執行任何特定的資料模型——訊息只是帶有一些元資料的位元組序列，因此你可以使用任何編碼格式。常見的方法是使用 Protocol Buffers、Avro 或 JSON，並在訊息代理旁邊部署模式登錄檔來儲存所有有效的模式版本並檢查其相容性 [^19] [^21]。AsyncAPI（OpenAPI 的基於訊息傳遞的等效物）也可用於指定訊息的模式。

訊息代理在訊息的永續性方面有所不同。許多將訊息寫入磁碟，以便在訊息代理崩潰或需要重新啟動時不會丟失。與資料庫不同，許多訊息代理在訊息被消費後會自動再次刪除訊息。某些代理可以配置為無限期地儲存訊息，如果你想使用事件溯源，這是必需的（參見 ["事件溯源與 CQRS"](/tw/ch3#sec_datamodels_events)）。

如果消費者將訊息重新發布到另一個主題，你可能需要小心保留未知欄位，以防止前面在資料庫上下文中描述的問題（[圖 5-1](#fig_encoding_preserve_field)）。

#### 分散式 actor 框架 {#distributed-actor-frameworks}

*Actor 模型* 是單個程序中併發的程式設計模型。它不直接處理執行緒（以及相關的競態條件、鎖定和死鎖問題），而是將邏輯封裝在 *actor* 中。每個 actor 通常代表一個客戶端或實體，它可能有一些本地狀態（不與任何其他 actor 共享），並透過傳送和接收非同步訊息與其他 actor 通訊。訊息傳遞不能保證：在某些錯誤場景中，訊息將丟失。由於每個 actor 一次只處理一條訊息，因此它不需要擔心執行緒，並且每個 actor 可以由框架獨立排程。

在 *分散式 actor 框架* 中，如 Akka、Orleans [^51] 和 Erlang/OTP，此程式設計模型用於跨多個節點擴充套件應用程式。無論傳送者和接收者是在同一節點還是不同節點上，都使用相同的訊息傳遞機制。如果它們在不同的節點上，訊息將透明地編碼為位元組序列，透過網路傳送，並在另一端解碼。

位置透明性在 actor 模型中比在 RPC 中效果更好，因為 actor 模型已經假定訊息可能會丟失，即使在單個程序內也是如此。儘管網路上的延遲可能比同一程序內的延遲更高，但在使用 actor 模型時，本地和遠端通訊之間的根本不匹配較少。

分散式 actor 框架本質上將訊息代理和 actor 程式設計模型整合到單個框架中。但是，如果你想對基於 actor 的應用程式執行滾動升級，你仍然必須擔心向前和向後相容性，因為訊息可能從執行新版本的節點發送到執行舊版本的節點，反之亦然。這可以透過使用本章中討論的編碼之一來實現。


## 總結 {#summary}

在本章中，我們研究了將資料結構轉換為網路上的位元組或磁碟上的位元組的幾種方法。我們看到了這些編碼的細節不僅影響其效率，更重要的是還影響應用程式的架構和演化選項。

特別是，許多服務需要支援滾動升級，其中服務的新版本逐步部署到少數節點，而不是同時部署到所有節點。滾動升級允許在不停機的情況下發布服務的新版本（從而鼓勵頻繁的小版本釋出而不是罕見的大版本釋出），並使部署風險更低（允許在影響大量使用者之前檢測和回滾有故障的版本）。這些屬性對 *可演化性* 非常有益，即輕鬆進行應用程式更改。

在滾動升級期間，或出於其他各種原因，我們必須假設不同的節點正在執行我們應用程式程式碼的不同版本。因此，重要的是系統中流動的所有資料都以提供向後相容性（新程式碼可以讀取舊資料）和向前相容性（舊程式碼可以讀取新資料）的方式進行編碼。

我們討論了幾種資料編碼格式及其相容性屬性：

* 特定於程式語言的編碼僅限於單一程式語言，並且通常無法提供向前和向後相容性。
* 文字格式（如 JSON、XML 和 CSV）廣泛存在，其相容性取決於你如何使用它們。它們有可選的模式語言，有時有幫助，有時是障礙。這些格式在資料型別方面有些模糊，因此你必須小心處理數字和二進位制字串等內容。
* 二進位制模式驅動的格式（如 Protocol Buffers 和 Avro）允許使用明確定義的向前和向後相容性語義進行緊湊、高效的編碼。模式可用於文件和程式碼生成，適用於靜態型別語言。但是，這些格式的缺點是資料需要在人類可讀之前進行解碼。

我們還討論了幾種資料流模式，說明了資料編碼很重要的不同場景：

* 資料庫，其中寫入資料庫的程序對資料進行編碼，從資料庫讀取的程序對其進行解碼
* RPC 和 REST API，其中客戶端對請求進行編碼，伺服器對請求進行解碼並對響應進行編碼，客戶端最終對響應進行解碼
* 事件驅動架構（使用訊息代理或 actor），其中節點透過相互發送訊息進行通訊，這些訊息由傳送者編碼並由接收者解碼

我們可以得出結論，透過一點小心，向後/向前相容性和滾動升級是完全可以實現的。願你的應用程式演化迅速，部署頻繁。


### 參考

[^1]: [CWE-502: Deserialization of Untrusted Data](https://cwe.mitre.org/data/definitions/502.html). Common Weakness Enumeration, *cwe.mitre.org*, July 2006. Archived at [perma.cc/26EU-UK9Y](https://perma.cc/26EU-UK9Y)
[^2]: Steve Breen. [What Do WebLogic, WebSphere, JBoss, Jenkins, OpenNMS, and Your Application Have in Common? This Vulnerability](https://foxglovesecurity.com/2015/11/06/what-do-weblogic-websphere-jboss-jenkins-opennms-and-your-application-have-in-common-this-vulnerability/). *foxglovesecurity.com*, November 2015. Archived at [perma.cc/9U97-UVVD](https://perma.cc/9U97-UVVD)
[^3]: Patrick McKenzie. [What the Rails Security Issue Means for Your Startup](https://www.kalzumeus.com/2013/01/31/what-the-rails-security-issue-means-for-your-startup/). *kalzumeus.com*, January 2013. Archived at [perma.cc/2MBJ-7PZ6](https://perma.cc/2MBJ-7PZ6)
[^4]: Brian Goetz. [Towards Better Serialization](https://openjdk.org/projects/amber/design-notes/towards-better-serialization). *openjdk.org*, June 2019. Archived at [perma.cc/UK6U-GQDE](https://perma.cc/UK6U-GQDE)
[^5]: Eishay Smith. [jvm-serializers wiki](https://github.com/eishay/jvm-serializers/wiki). *github.com*, October 2023. Archived at [perma.cc/PJP7-WCNG](https://perma.cc/PJP7-WCNG)
[^6]: [XML Is a Poor Copy of S-Expressions](https://wiki.c2.com/?XmlIsaPoorCopyOfEssExpressions). *wiki.c2.com*, May 2013. Archived at [perma.cc/7FAN-YBKL](https://perma.cc/7FAN-YBKL)
[^7]: Julia Evans. [Examples of floating point problems](https://jvns.ca/blog/2023/01/13/examples-of-floating-point-problems/). *jvns.ca*, January 2023. Archived at [perma.cc/M57L-QKKW](https://perma.cc/M57L-QKKW)
[^8]: Matt Harris. [Snowflake: An Update and Some Very Important Information](https://groups.google.com/g/twitter-development-talk/c/ahbvo3VTIYI). Email to *Twitter Development Talk* mailing list, October 2010. Archived at [perma.cc/8UBV-MZ3D](https://perma.cc/8UBV-MZ3D)
[^9]: Yakov Shafranovich. [RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://tools.ietf.org/html/rfc4180). IETF, October 2005.
[^10]: Andy Coates. [Evolving JSON Schemas - Part I](https://www.creekservice.org/articles/2024/01/08/json-schema-evolution-part-1.html) and [Part II](https://www.creekservice.org/articles/2024/01/09/json-schema-evolution-part-2.html). *creekservice.org*, January 2024. Archived at [perma.cc/MZW3-UA54](https://perma.cc/MZW3-UA54) and [perma.cc/GT5H-WKZ5](https://perma.cc/GT5H-WKZ5)
[^11]: Pierre Genevès, Nabil Layaïda, and Vincent Quint. [Ensuring Query Compatibility with Evolving XML Schemas](https://arxiv.org/abs/0811.4324). INRIA Technical Report 6711, November 2008.
[^12]: Tim Bray. [Bits On the Wire](https://www.tbray.org/ongoing/When/201x/2019/11/17/Bits-On-the-Wire). *tbray.org*, November 2019. Archived at [perma.cc/3BT3-BQU3](https://perma.cc/3BT3-BQU3)
[^13]: Mark Slee, Aditya Agarwal, and Marc Kwiatkowski. [Thrift: Scalable Cross-Language Services Implementation](https://thrift.apache.org/static/files/thrift-20070401.pdf). Facebook technical report, April 2007. Archived at [perma.cc/22BS-TUFB](https://perma.cc/22BS-TUFB)
[^14]: Martin Kleppmann. [Schema Evolution in Avro, Protocol Buffers and Thrift](https://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html). *martin.kleppmann.com*, December 2012. Archived at [perma.cc/E4R2-9RJT](https://perma.cc/E4R2-9RJT)
[^15]: Doug Cutting, Chad Walters, Jim Kellerman, et al. [[PROPOSAL] New Subproject: Avro](https://lists.apache.org/thread/z571w0r5jmfsjvnl0fq4fgg0vh28d3bk). Email thread on *hadoop-general* mailing list, *lists.apache.org*, April 2009. Archived at [perma.cc/4A79-BMEB](https://perma.cc/4A79-BMEB)
[^16]: Apache Software Foundation. [Apache Avro 1.12.0 Specification](https://avro.apache.org/docs/1.12.0/specification/). *avro.apache.org*, August 2024. Archived at [perma.cc/C36P-5EBQ](https://perma.cc/C36P-5EBQ)
[^17]: Apache Software Foundation. [Avro schemas as LL(1) CFG definitions](https://avro.apache.org/docs/1.12.0/api/java/org/apache/avro/io/parsing/doc-files/parsing.html). *avro.apache.org*, August 2024. Archived at [perma.cc/JB44-EM9Q](https://perma.cc/JB44-EM9Q)
[^18]: Tony Hoare. [Null References: The Billion Dollar Mistake](https://www.infoq.com/presentations/Null-References-The-Billion-Dollar-Mistake-Tony-Hoare/). Talk at *QCon London*, March 2009.
[^19]: Confluent, Inc. [Schema Registry Overview](https://docs.confluent.io/platform/current/schema-registry/index.html). *docs.confluent.io*, 2024. Archived at [perma.cc/92C3-A9JA](https://perma.cc/92C3-A9JA)
[^20]: Aditya Auradkar and Tom Quiggle. [Introducing Espresso—LinkedIn’s Hot New Distributed Document Store](https://engineering.linkedin.com/espresso/introducing-espresso-linkedins-hot-new-distributed-document-store). *engineering.linkedin.com*, January 2015. Archived at [perma.cc/FX4P-VW9T](https://perma.cc/FX4P-VW9T)
[^21]: Jay Kreps. [Putting Apache Kafka to Use: A Practical Guide to Building a Stream Data Platform (Part 2)](https://www.confluent.io/blog/event-streaming-platform-2/). *confluent.io*, February 2015. Archived at [perma.cc/8UA4-ZS5S](https://perma.cc/8UA4-ZS5S)
[^22]: Gwen Shapira. [The Problem of Managing Schemas](https://www.oreilly.com/content/the-problem-of-managing-schemas/). *oreilly.com*, November 2014. Archived at [perma.cc/BY8Q-RYV3](https://perma.cc/BY8Q-RYV3)
[^23]: John Larmouth. [*ASN.1 Complete*](https://www.oss.com/asn1/resources/books-whitepapers-pubs/larmouth-asn1-book.pdf). Morgan Kaufmann, 1999. ISBN: 978-0-122-33435-1. Archived at [perma.cc/GB7Y-XSXQ](https://perma.cc/GB7Y-XSXQ)
[^24]: Burton S. Kaliski Jr. [A Layman’s Guide to a Subset of ASN.1, BER, and DER](https://luca.ntop.org/Teaching/Appunti/asn1.html). Technical Note, RSA Data Security, Inc., November 1993. Archived at [perma.cc/2LMN-W9U8](https://perma.cc/2LMN-W9U8)
[^25]: Jacob Hoffman-Andrews. [A Warm Welcome to ASN.1 and DER](https://letsencrypt.org/docs/a-warm-welcome-to-asn1-and-der/). *letsencrypt.org*, April 2020. Archived at [perma.cc/CYT2-GPQ8](https://perma.cc/CYT2-GPQ8)
[^26]: Lev Walkin. [Question: Extensibility and Dropping Fields](https://lionet.info/asn1c/blog/2010/09/21/question-extensibility-removing-fields/). *lionet.info*, September 2010. Archived at [perma.cc/VX8E-NLH3](https://perma.cc/VX8E-NLH3)
[^27]: Jacqueline Xu. [Online migrations at scale](https://stripe.com/blog/online-migrations). *stripe.com*, February 2017. Archived at [perma.cc/X59W-DK7Y](https://perma.cc/X59W-DK7Y)
[^28]: Geoffrey Litt, Peter van Hardenberg, and Orion Henry. [Project Cambria: Translate your data with lenses](https://www.inkandswitch.com/cambria/). Technical Report, *Ink & Switch*, October 2020. Archived at [perma.cc/WA4V-VKDB](https://perma.cc/WA4V-VKDB)
[^29]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^30]: Roy Thomas Fielding. [Architectural Styles and the Design of Network-Based Software Architectures](https://ics.uci.edu/~fielding/pubs/dissertation/fielding_dissertation.pdf). PhD Thesis, University of California, Irvine, 2000. Archived at [perma.cc/LWY9-7BPE](https://perma.cc/LWY9-7BPE)
[^31]: Roy Thomas Fielding. [REST APIs must be hypertext-driven](https://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven). *roy.gbiv.com*, October 2008. Archived at [perma.cc/M2ZW-8ATG](https://perma.cc/M2ZW-8ATG)
[^32]: [OpenAPI Specification Version 3.1.0](https://swagger.io/specification/). *swagger.io*, February 2021. Archived at [perma.cc/3S6S-K5M4](https://perma.cc/3S6S-K5M4)
[^33]: Michi Henning. [The Rise and Fall of CORBA](https://cacm.acm.org/practice/the-rise-and-fall-of-corba/). *Communications of the ACM*, volume 51, issue 8, pages 52–57, August 2008. [doi:10.1145/1378704.1378718](https://doi.org/10.1145/1378704.1378718)
[^34]: Pete Lacey. [The S Stands for Simple](https://harmful.cat-v.org/software/xml/soap/simple). *harmful.cat-v.org*, November 2006. Archived at [perma.cc/4PMK-Z9X7](https://perma.cc/4PMK-Z9X7)
[^35]: Stefan Tilkov. [Interview: Pete Lacey Criticizes Web Services](https://www.infoq.com/articles/pete-lacey-ws-criticism/). *infoq.com*, December 2006. Archived at [perma.cc/JWF4-XY3P](https://perma.cc/JWF4-XY3P)
[^36]: Tim Bray. [The Loyal WS-Opposition](https://www.tbray.org/ongoing/When/200x/2004/09/18/WS-Oppo). *tbray.org*, September 2004. Archived at [perma.cc/J5Q8-69Q2](https://perma.cc/J5Q8-69Q2)
[^37]: Andrew D. Birrell and Bruce Jay Nelson. [Implementing Remote Procedure Calls](https://www.cs.princeton.edu/courses/archive/fall03/cs518/papers/rpc.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 2, issue 1, pages 39–59, February 1984. [doi:10.1145/2080.357392](https://doi.org/10.1145/2080.357392)
[^38]: Jim Waldo, Geoff Wyant, Ann Wollrath, and Sam Kendall. [A Note on Distributed Computing](https://m.mirror.facebook.net/kde/devel/smli_tr-94-29.pdf). Sun Microsystems Laboratories, Inc., Technical Report TR-94-29, November 1994. Archived at [perma.cc/8LRZ-BSZR](https://perma.cc/8LRZ-BSZR)
[^39]: Steve Vinoski. [Convenience over Correctness](https://steve.vinoski.net/pdf/IEEE-Convenience_Over_Correctness.pdf). *IEEE Internet Computing*, volume 12, issue 4, pages 89–92, July 2008. [doi:10.1109/MIC.2008.75](https://doi.org/10.1109/MIC.2008.75)
[^40]: Brandur Leach. [Designing robust and predictable APIs with idempotency](https://stripe.com/blog/idempotency). *stripe.com*, February 2017. Archived at [perma.cc/JD22-XZQT](https://perma.cc/JD22-XZQT)
[^41]: Sam Rose. [Load Balancing](https://samwho.dev/load-balancing/). *samwho.dev*, April 2023. Archived at [perma.cc/Q7BA-9AE2](https://perma.cc/Q7BA-9AE2)
[^42]: Troy Hunt. [Your API versioning is wrong, which is why I decided to do it 3 different wrong ways](https://www.troyhunt.com/your-api-versioning-is-wrong-which-is/). *troyhunt.com*, February 2014. Archived at [perma.cc/9DSW-DGR5](https://perma.cc/9DSW-DGR5)
[^43]: Brandur Leach. [APIs as infrastructure: future-proofing Stripe with versioning](https://stripe.com/blog/api-versioning). *stripe.com*, August 2017. Archived at [perma.cc/L63K-USFW](https://perma.cc/L63K-USFW)
[^44]: Alexandre Alves, Assaf Arkin, Sid Askary, et al. [Web Services Business Process Execution Language Version 2.0](https://docs.oasis-open.org/wsbpel/2.0/wsbpel-v2.0.html). *docs.oasis-open.org*, April 2007.
[^45]: [What is a Temporal Service?](https://docs.temporal.io/clusters) *docs.temporal.io*, 2024. Archived at [perma.cc/32P3-CJ9V](https://perma.cc/32P3-CJ9V)
[^46]: Stephan Ewen. [Why we built Restate](https://restate.dev/blog/why-we-built-restate/). *restate.dev*, August 2023. Archived at [perma.cc/BJJ2-X75K](https://perma.cc/BJJ2-X75K)
[^47]: Keith Tenzer and Joshua Smith. [Idempotency and Durable Execution](https://temporal.io/blog/idempotency-and-durable-execution). *temporal.io*, February 2024. Archived at [perma.cc/9LGW-PCLU](https://perma.cc/9LGW-PCLU)
[^48]: [What is a Temporal Workflow?](https://docs.temporal.io/workflows) *docs.temporal.io*, 2024. Archived at [perma.cc/B5C5-Y396](https://perma.cc/B5C5-Y396)
[^49]: Jack Kleeman. [Solving durable execution’s immutability problem](https://restate.dev/blog/solving-durable-executions-immutability-problem/). *restate.dev*, February 2024. Archived at [perma.cc/G55L-EYH5](https://perma.cc/G55L-EYH5)
[^50]: Srinath Perera. [Exploring Event-Driven Architecture: A Beginner’s Guide for Cloud Native Developers](https://wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/). *wso2.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20240716204613/https%3A//wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/)
[^51]: Philip A. Bernstein, Sergey Bykov, Alan Geller, Gabriel Kliot, and Jorgen Thelin. [Orleans: Distributed Virtual Actors for Programmability and Scalability](https://www.microsoft.com/en-us/research/publication/orleans-distributed-virtual-actors-for-programmability-and-scalability/). Microsoft Research Technical Report MSR-TR-2014-41, March 2014. Archived at [perma.cc/PD3U-WDMF](https://perma.cc/PD3U-WDMF)

================================================
FILE: content/tw/ch6.md
================================================
---
title: "6. 複製"
weight: 206
breadcrumbs: false
---

<a id="ch_replication"></a>

![](/map/ch05.png)

> *可能出錯的東西和“不可能”出錯的東西之間，最大的區別在於：後者一旦出錯，往往幾乎無從下手，也難以修復。*
>
> 道格拉斯·亞當斯，《基本無害》（1992）

**複製** 指的是在透過網路連線的多臺機器上保留相同資料的副本。如 ["分散式與單節點系統"](/tw/ch1#sec_introduction_distributed) 中所討論的，你可能出於以下幾個原因希望複製資料：

* 使資料在地理上更接近使用者（從而減少訪問延遲）
* 即使系統的部分元件出現故障，也能讓系統繼續工作（從而提高可用性）
* 擴充套件能夠處理讀查詢的機器數量（從而提高讀吞吐量）

本章假設你的資料集足夠小，每臺機器都可以儲存整個資料集的副本。在 [第 7 章](/tw/ch7#ch_sharding) 中，我們將放寬這一假設，討論單臺機器無法容納的、過大資料集的 **分片**（**分割槽**）。在後續章節中，我們將討論複製資料系統中可能發生的各種故障，以及如何處理它們。

如果需要複製的資料不會隨時間變化，那麼複製就很簡單：只需要將資料複製到每個節點一次就大功告成。處理複製的所有困難都在於處理複製資料的 **變更**，這也是本章的主題。我們將討論三類在節點之間複製變更的演算法：**單主**、**多主** 和 **無主** 複製。幾乎所有分散式資料庫都使用這三種方法之一。它們各有利弊，我們將詳細研究。

複製需要考慮許多權衡：例如，是使用同步還是非同步複製，以及如何處理失敗的副本。這些通常是資料庫中的配置選項，儘管不同資料庫的細節有所不同，但許多不同實現的通用原則是相似的。我們將在本章中討論這些選擇的後果。

資料庫複製是一個古老的話題——自 20 世紀 70 年代研究以來，原理並沒有太大變化 [^1]，因為網路的基本約束保持不變。儘管如此古老，像 **最終一致性** 這樣的概念仍然會引起困惑。在 ["複製延遲的問題"](#sec_replication_lag) 中，我們將更準確地瞭解最終一致性，並討論諸如 **讀己之寫** 和 **單調讀** 等保證。

--------

> [!TIP] 備份與複製
>
> 你可能會想，如果有了複製，是否還需要備份。答案是肯定的，因為它們有不同的目的：副本會快速將一個節點的寫入反映到其他節點上，但備份儲存資料的舊快照，以便你可以回到過去的時間點。如果你不小心刪除了一些資料，複製並不能幫助你，因為刪除操作也會傳播到副本，所以如果你想恢復被刪除的資料，就需要備份。
>
> 事實上，複製和備份通常是相互補充的。備份有時是設定複製過程的一部分，正如我們將在 ["設定新的副本"](#sec_replication_new_replica) 中看到的。反過來，歸檔複製日誌可以成為備份過程的一部分。
>
> 一些資料庫在內部維護過去狀態的不可變快照，作為一種內部備份。然而，這意味著在與當前狀態相同的儲存介質上保留資料的舊版本。如果你有大量資料，將舊資料的備份儲存在針對不常訪問資料最佳化的物件儲存中可能會更便宜，而只在主儲存中儲存資料庫的當前狀態。

--------

## 單主複製 {#sec_replication_leader}

儲存資料庫副本的每個節點稱為 **副本**。有了多個副本，不可避免地會出現一個問題：我們如何確保所有資料最終都出現在所有副本上？

每次寫入資料庫都需要由每個副本處理；否則，副本將不再包含相同的資料。最常見的解決方案稱為 **基於領導者的複製**、**主備複製** 或 **主動/被動複製**。它的工作原理如下（見 [圖 6-1](#fig_replication_leader_follower)）：

1. 其中一個副本被指定為 **領導者**（也稱為 **主庫** 或 **源** [^2]）。當客戶端想要寫入資料庫時，他們必須將請求傳送給領導者，領導者首先將新資料寫入其本地儲存。
2. 其他副本稱為 **追隨者**（**只讀副本**、**從庫** 或 **熱備**）。每當領導者將新資料寫入其本地儲存時，它也會將資料變更作為 **複製日誌** 或 **變更流** 的一部分發送給所有追隨者。每個追隨者從領導者獲取日誌，並透過按照與領導者處理相同的順序應用所有寫入來相應地更新其本地資料庫副本。
3. 當客戶端想要從資料庫讀取時，它可以查詢領導者或任何追隨者。然而，只有領導者接受寫入（從客戶端的角度來看，追隨者是隻讀的）。

{{< figure src="/fig/ddia_0601.png" id="fig_replication_leader_follower" caption="圖 6-1. 單主複製將所有寫入定向到指定的領導者，該領導者向追隨者傳送變更流。" class="w-full my-4" >}}

如果資料庫是分片的（見 [第 7 章](/tw/ch7#ch_sharding)），每個分片都有一個領導者。不同的分片可能在不同的節點上有其領導者，但每個分片仍必須有一個領導者。在 ["多主複製"](#sec_replication_multi_leader) 中，我們將討論一種替代模型，其中系統可能同時為同一分片擁有多個領導者。

單主複製被廣泛使用。它是許多關係資料庫的內建功能，如 PostgreSQL、MySQL、Oracle Data Guard [^3] 和 SQL Server 的 Always On 可用性組 [^4]。它也用於一些文件資料庫，如 MongoDB 和 DynamoDB [^5]，訊息代理如 Kafka，複製塊裝置如 DRBD，以及一些網路檔案系統。許多共識演算法（如 Raft）也基於單個領導者，用於 CockroachDB [^6]、TiDB [^7]、etcd 和 RabbitMQ 仲裁佇列（以及其他）中的複製，並在舊領導者失敗時自動選舉新領導者（我們將在 [第 10 章](/tw/ch10#ch_consistency) 中更詳細地討論共識）。

--------

> [!NOTE]
> 在較舊的文件中，你可能會看到術語 **主從複製**。它與基於領導者的複製含義相同，但應該避免使用該術語，因為它被廣泛認為是冒犯性的 [^8]。

--------

### 同步複製與非同步複製 {#sec_replication_sync_async}

複製系統的一個重要細節是複製是 **同步** 發生還是 **非同步** 發生。（在關係資料庫中，這通常是一個可配置選項；其他系統通常硬編碼為其中之一。）

想想 [圖 6-1](#fig_replication_leader_follower) 中發生的情況，一個網站使用者更新他們的個人資料圖片。在某個時間點，客戶端向領導者傳送更新請求；不久之後，領導者收到了它。在某個時間點，領導者將資料變更轉發給追隨者。最終，領導者通知客戶端更新成功。[圖 6-2](#fig_replication_sync_replication) 顯示了時序可能的工作方式。

{{< figure src="/fig/ddia_0602.png" id="fig_replication_sync_replication" caption="圖 6-2. 基於領導者的複製，帶有一個同步和一個非同步追隨者。" class="w-full my-4" >}}

在 [圖 6-2](#fig_replication_sync_replication) 的示例中，對追隨者 1 的複製是 **同步的**：領導者等待追隨者 1 確認它已收到寫入，然後才向用戶報告成功，並使寫入對其他客戶端可見。對追隨者 2 的複製是 **非同步的**：領導者傳送訊息，但不等待追隨者的響應。

圖中顯示，追隨者 2 處理訊息之前有相當大的延遲。通常，複製相當快：大多數資料庫系統在不到一秒的時間內將變更應用到追隨者。然而，不能保證需要多長時間。在某些情況下，追隨者可能落後領導者幾分鐘或更長時間；例如，如果追隨者正在從故障中恢復，如果系統正在接近最大容量執行，或者如果節點之間存在網路問題。

同步複製的優點是追隨者保證擁有與領導者一致的最新資料副本。如果領導者突然失敗，我們可以確信資料仍然在追隨者上可用。缺點是，如果同步追隨者沒有響應（因為它已崩潰，或存在網路故障，或任何其他原因），寫入就無法處理。領導者必須阻塞所有寫入並等待同步副本再次可用。

因此，將所有追隨者都設為同步是不切實際的：任何一個節點的中斷都會導致整個系統停止。實際上，如果資料庫提供同步複製，通常意味著 **一個** 追隨者是同步的，其他的是非同步的。如果同步追隨者變得不可用或緩慢，非同步追隨者之一將變為同步。這保證了你至少在兩個節點上擁有最新的資料副本：領導者和一個同步追隨者。這種配置有時也稱為 **半同步**。

在某些系統中，**多數**（例如，包括領導者在內的 5 個副本中的 3 個）副本被同步更新，其餘少數是非同步的。這是 **仲裁** 的一個例子，我們將在 ["讀寫仲裁"](#sec_replication_quorum_condition) 中進一步討論。多數仲裁通常用於使用共識協議進行自動領導者選舉的系統中，我們將在 [第 10 章](/tw/ch10#ch_consistency) 中回到這個話題。

有時，基於領導者的複製被配置為完全非同步。在這種情況下，如果領導者失敗且無法恢復，任何尚未複製到追隨者的寫入都會丟失。這意味著即使已向客戶端確認，寫入也不能保證持久。然而，完全非同步配置的優點是領導者可以繼續處理寫入，即使所有追隨者都已落後。

弱化永續性可能聽起來像是一個糟糕的權衡，但非同步複製仍然被廣泛使用，特別是如果有許多追隨者或者它們在地理上分佈廣泛 [^9]。我們將在 ["複製延遲的問題"](#sec_replication_lag) 中回到這個問題。

### 設定新的副本 {#sec_replication_new_replica}

不時地，你需要設定新的追隨者——也許是為了增加副本的數量，或者替換失敗的節點。如何確保新的追隨者擁有領導者資料的準確副本？

簡單地將資料檔案從一個節點複製到另一個節點通常是不夠的：客戶端不斷向資料庫寫入，資料總是在變化，所以標準檔案複製會在不同的時間點看到資料庫的不同部分。結果可能沒有任何意義。

你可以透過鎖定資料庫（使其不可用於寫入）來使磁碟上的檔案保持一致，但這將違背我們的高可用性目標。幸運的是，設定追隨者通常可以在不停機的情況下完成。從概念上講，過程如下所示：

1. 在某個時間點獲取領導者資料庫的一致快照——如果可能，不鎖定整個資料庫。大多數資料庫都有此功能，因為備份也需要它。在某些情況下，需要第三方工具，例如用於 MySQL 的 Percona XtraBackup。
2. 將快照複製到新的追隨者。
3. 追隨者連線到領導者並請求自快照拍攝以來發生的所有資料變更。這要求快照與領導者複製日誌中的確切位置相關聯。該位置有各種名稱：例如，PostgreSQL 稱之為 **日誌序列號**；MySQL 有兩種機制，**binlog 位點** 和 **全域性事務識別符號**（GTID）。
4. 當追隨者處理了自快照以來的資料變更積壓後，我們說它已經 **追上進度**。它現在可以繼續處理領導者發生的資料變更。

設定追隨者的實際步驟因資料庫而異。在某些系統中，該過程是完全自動化的，而在其他系統中，它可能是需要管理員手動執行的有些神秘的多步驟工作流程。

你也可以將複製日誌歸檔到物件儲存；連同物件儲存中整個資料庫的定期快照，這是實現資料庫備份和災難恢復的好方法。你還可以透過從物件儲存下載這些檔案來執行設定新追隨者的步驟 1 和 2。例如，WAL-G 為 PostgreSQL、MySQL 和 SQL Server 執行此操作，Litestream 為 SQLite 執行等效操作。

--------

<a id="sec_replication_object_storage"></a>

> [!TIP] 由物件儲存支援的資料庫
>
> 物件儲存可用於存檔資料之外的更多用途。許多資料庫開始使用物件儲存（如 Amazon Web Services S3、Google Cloud Storage 和 Azure Blob Storage）來為即時查詢提供資料。在物件儲存中儲存資料庫資料有許多好處：
>
> * 與其他雲端儲存選項相比，物件儲存價格便宜，這使得雲資料庫可以將較少查詢的資料儲存在更便宜、更高延遲的儲存上，同時從記憶體、SSD 和 NVMe 中提供工作集。
> * 物件儲存還提供具有非常高永續性保證的多可用區、雙區域或多區域複製。這也允許資料庫繞過跨區域網路費用。
> * 資料庫可以使用物件儲存的 **條件寫入** 功能——本質上是 **比較並設定**（CAS）操作——來實現事務和領導者選舉 [^10] [^11]。
> * 將來自多個數據庫的資料儲存在同一物件儲存中可以簡化資料整合，特別是在使用 Apache Parquet 和 Apache Iceberg 等開放格式時。
>
> 這些好處透過將事務、領導者選舉和複製的責任轉移到物件儲存，大大簡化了資料庫架構。
>
> 採用物件儲存進行複製的系統必須應對一些權衡。值得注意的是，物件儲存的讀寫延遲比本地磁碟或 EBS 等虛擬塊裝置要高得多。許多雲提供商還收取每個 API 呼叫費用，這迫使系統批次讀寫以降低成本。這種批處理進一步增加了延遲。此外，許多物件儲存不提供標準檔案系統介面。這阻止了缺乏物件儲存整合的系統利用物件儲存。像 **使用者空間檔案系統**（FUSE）這樣的介面允許操作員將物件儲存桶掛載為檔案系統，應用程式可以在不知道其資料儲存在物件儲存上的情況下使用。儘管如此，許多物件儲存的 FUSE 介面缺乏系統可能依賴的 POSIX 功能，如非順序寫入或符號連結。
>
> 不同的系統以各種方式處理這些權衡。一些引入了 **分層儲存** 架構，將較少訪問的資料放在物件儲存上，而新的或頻繁訪問的資料儲存在更快的儲存裝置上，如 SSD、NVMe，甚至記憶體中。其他系統使用物件儲存作為其主要儲存層，但使用單獨的低延遲儲存系統（如 Amazon 的 EBS 或 Neon 的 Safekeepers [^12]）來儲存其 WAL。最近，一些系統更進一步，採用了 **零磁碟架構**（ZDA）。基於 ZDA 的系統將所有資料持久化到物件儲存，並嚴格將磁碟和記憶體用於快取。這允許節點沒有持久狀態，這大大簡化了運維。WarpStream、Confluent Freight、Buf 的 Bufstream 和 Redpanda Serverless 都是使用零磁碟架構構建的相容 Kafka 的系統。幾乎每個現代雲資料倉庫也採用這種架構，Turbopuffer（向量搜尋引擎）和 SlateDB（雲原生 LSM 儲存引擎）也是如此。

--------

### 處理節點故障 {#sec_replication_failover}

系統中的任何節點都可能發生故障，可能是由於故障意外發生，但同樣可能是由於計劃維護（例如，重新啟動機器以安裝核心安全補丁）。能夠在不停機的情況下重新啟動單個節點對於操作和維護來說是一個很大的優勢。因此，我們的目標是儘管單個節點發生故障，但保持整個系統執行，並儘可能減小節點中斷的影響。

如何透過基於領導者的複製實現高可用性？

#### 追隨者故障：追趕恢復 {#follower-failure-catch-up-recovery}

在其本地磁碟上，每個追隨者保留從領導者接收的資料變更日誌。如果追隨者崩潰並重新啟動，或者如果領導者和追隨者之間的網路暫時中斷，追隨者可以很容易地恢復：從其日誌中，它知道在故障發生之前處理的最後一個事務。因此，追隨者可以連線到領導者並請求在追隨者斷開連線期間發生的所有資料變更。當它應用了這些變更後，它就趕上了領導者，可以像以前一樣繼續接收資料變更流。

儘管追隨者恢復在概念上很簡單，但在效能方面可能具有挑戰性：如果資料庫具有高寫入吞吐量，或者如果追隨者已離線很長時間，可能有很多寫入需要趕上。在進行這種追趕時，恢復的追隨者和領導者（需要將寫入積壓傳送到追隨者）都會有高負載。

一旦所有追隨者都確認已處理了日誌，領導者就可以刪除其寫入日誌，但如果追隨者長時間不可用，領導者面臨選擇：要麼保留日誌直到追隨者恢復並趕上（冒著領導者磁碟空間耗盡的風險），要麼刪除不可用追隨者尚未確認的日誌（在這種情況下，追隨者無法從日誌中恢復，並且在它回來時必須從備份中恢復）。

#### 領導者故障：故障轉移 {#leader-failure-failover}

處理領導者故障更加棘手：其中一個追隨者需要被提升為新的領導者，客戶端需要重新配置以將其寫入傳送到新的領導者，其他追隨者需要開始從新的領導者消費資料變更。這個過程稱為 **故障轉移**。

故障轉移可以手動發生（管理員收到領導者失敗的通知並採取必要步驟來建立新的領導者）或自動發生。自動故障轉移過程通常包括以下步驟：

1. **確定領導者已失效。** 可能會出現許多問題：崩潰、停電、網路故障等。沒有萬無一失的方法能準確判斷發生了什麼，所以大多數系統只是依賴超時：節點之間會頻繁來回傳送訊息，如果某個節點在一段時間內（例如 30 秒）沒有響應，就認為它已經失效。（如果是計劃維護而主動下線領導者，則不適用。）
2. **選擇新的領導者。** 這可以透過選舉過程完成（由剩餘副本中的多數選出領導者），也可以由預先設定的 **控制器節點** 任命 [^13]。最適合擔任領導者的通常是那個擁有舊領導者最新資料變更的副本（以儘量減少資料丟失）。讓所有節點就新領導者達成一致是一個共識問題，我們會在 [第 10 章](/tw/ch10#ch_consistency) 詳細討論。
3. **將系統重新配置為使用新的領導者。** 客戶端現在需要把寫請求傳送到新領導者（我們在 ["請求路由"](/tw/ch7#sec_sharding_routing) 中討論這個問題）。如果舊領導者恢復，它可能仍然以為自己是領導者，並不知道其他副本已經讓它下臺。系統需要確保舊領導者降級為追隨者，並識別新的領導者。

故障轉移充滿了可能出錯的事情：

* 如果使用非同步複製，新的領導者可能在失敗之前沒有收到來自舊領導者的所有寫入。如果前領導者在選擇了新領導者後重新加入叢集，那些寫入應該怎麼辦？新的領導者可能同時收到了衝突的寫入。最常見的解決方案是簡單地丟棄舊領導者未複製的寫入，這意味著你認為已提交的寫入實際上並不持久。
* 如果資料庫之外的其他儲存系統需要與資料庫內容協調，丟棄寫入尤其危險。例如，在 GitHub 的一次事故中 [^14]，一個過時的 MySQL 追隨者被提升為領導者。資料庫使用自增計數器為新行分配主鍵，但由於新領導者的計數器落後於舊領導者，它重用了舊領導者先前分配的一些主鍵。這些主鍵也在 Redis 儲存中使用，因此主鍵的重用導致 MySQL 和 Redis 之間的不一致，這導致一些私人資料被錯誤地披露給錯誤的使用者。
* 在某些故障場景中（見 [第 9 章](/tw/ch9#ch_distributed)），可能會發生兩個節點都認為自己是領導者的情況。這種情況稱為 **腦裂**，這是危險的：如果兩個領導者都接受寫入，並且沒有解決衝突的過程（見 ["多主複製"](#sec_replication_multi_leader)），資料很可能會丟失或損壞。作為安全措施，一些系統在檢測到兩個領導者時有一種機制來關閉一個節點。然而，如果這種機制設計不當，你最終可能會關閉兩個節點 [^15]。此外，當檢測到腦裂並關閉舊節點時，可能為時已晚，資料已經損壞。
* 在宣佈領導者死亡之前，正確的超時是什麼？更長的超時意味著在領導者失敗的情況下恢復時間更長。然而，如果超時太短，可能會有不必要的故障轉移。例如，臨時負載峰值可能導致節點的響應時間增加到超時以上，或者網路故障可能導致資料包延遲。如果系統已經在高負載或網路問題上掙扎，不必要的故障轉移可能會使情況變得更糟，而不是更好。

--------

> [!NOTE]
> 透過限制或關閉舊領導者來防止腦裂，被稱為 **柵欄機制**（fencing），或者更直白地說，**爆彼之頭**（STONITH）。我們將在 ["分散式鎖和租約"](/tw/ch9#sec_distributed_lock_fencing) 中更詳細地討論柵欄機制。

--------

這些問題沒有簡單的解決方案。因此，一些運維團隊更喜歡手動執行故障轉移，即使軟體支援自動故障轉移。

故障轉移最重要的是選擇一個最新的追隨者作為新的領導者——如果使用同步或半同步複製，這將是舊領導者在確認寫入之前等待的追隨者。使用非同步複製，你可以選擇具有最大日誌序列號的追隨者。這最小化了故障轉移期間丟失的資料量：丟失幾分之一秒的寫入可能是可以容忍的，但選擇落後幾天的追隨者可能是災難性的。

這些問題——節點故障；不可靠的網路；以及圍繞副本一致性、永續性、可用性和延遲的權衡——實際上是分散式系統中的基本問題。在 [第 9 章](/tw/ch9#ch_distributed) 和 [第 10 章](/tw/ch10#ch_consistency) 中，我們將更深入地討論它們。

### 複製日誌的實現 {#sec_replication_implementation}

基於領導者的複製在底層是如何工作的？讓我們簡要地看看實踐中使用的幾種不同的複製方法。

#### 基於語句的複製 {#statement-based-replication}

在最簡單的情況下，領導者記錄它執行的每個寫入請求（**語句**）並將該語句日誌傳送給其追隨者。對於關係資料庫，這意味著每個 `INSERT`、`UPDATE` 或 `DELETE` 語句都被轉發到追隨者，每個追隨者解析並執行該 SQL 語句，就像它是從客戶端接收的一樣。

雖然這聽起來合理，但這種複製方法可能會出現各種問題：

* 任何呼叫非確定性函式的語句，例如 `NOW()` 獲取當前日期和時間或 `RAND()` 獲取隨機數，可能會在每個副本上生成不同的值。
* 如果語句使用自增列，或者如果它們依賴於資料庫中的現有資料（例如，`UPDATE … WHERE <某條件>`），它們必須在每個副本上以完全相同的順序執行，否則它們可能會產生不同的效果。當有多個併發執行的事務時，這可能會成為一種限制。
* 具有副作用的語句（例如，觸發器、儲存過程、使用者定義的函式）可能會導致每個副本上發生不同的副作用，除非副作用是絕對確定的。

可以解決這些問題——例如，領導者可以在記錄語句時用固定的返回值替換任何非確定性函式呼叫，以便追隨者都獲得相同的值。以固定順序執行確定性語句的想法類似於我們之前在 ["事件溯源與 CQRS"](/tw/ch3#sec_datamodels_events) 中討論的事件溯源模型。這種方法也稱為 **狀態機複製**，我們將在 ["使用共享日誌"](/tw/ch10#sec_consistency_smr) 中討論其背後的理論。

基於語句的複製在 MySQL 5.1 版本之前使用。它今天有時仍在使用，因為它相當緊湊，但預設情況下，如果語句中有任何非確定性，MySQL 現在會切換到基於行的複製（稍後討論）。VoltDB 使用基於語句的複製，並透過要求事務是確定性的來使其安全 [^16]。然而，確定性在實踐中很難保證，因此許多資料庫更喜歡其他複製方法。

#### 預寫日誌（WAL）傳輸 {#write-ahead-log-wal-shipping}

在 [第 4 章](/tw/ch4#ch_storage) 中，我們看到預寫日誌是使 B 樹儲存引擎健壯所必需的：每個修改首先寫入 WAL，以便在崩潰後可以將樹恢復到一致狀態。由於 WAL 包含將索引和堆恢復到一致狀態所需的所有資訊，我們可以使用完全相同的日誌在另一個節點上構建副本：除了將日誌寫入磁碟外，領導者還透過網路將其傳送給其追隨者。當追隨者處理此日誌時，它構建了與領導者上找到的完全相同的檔案副本。

此複製方法在 PostgreSQL 和 Oracle 等中使用 [^17] [^18]。主要缺點是日誌在非常低的級別描述資料：WAL 包含哪些位元組在哪些磁碟塊中被更改的詳細資訊。這使得複製與儲存引擎緊密耦合。如果資料庫從一個版本更改其儲存格式到另一個版本，通常不可能在領導者和追隨者上執行不同版本的資料庫軟體。

這可能看起來像是一個小的實現細節，但它可能會產生很大的操作影響。如果複製協議允許追隨者使用比領導者更新的軟體版本，你可以透過首先升級追隨者然後執行故障轉移以使其中一個升級的節點成為新的領導者來執行資料庫軟體的零停機升級。如果複製協議不允許此版本不匹配（如 WAL 傳輸的情況），此類升級需要停機。

<a id="sec_replication_logical"></a>

#### 邏輯（基於行）日誌複製 {#logical-row-based-log-replication}

另一種選擇是為複製和儲存引擎使用不同的日誌格式，這允許複製日誌與儲存引擎內部解耦。這種複製日誌稱為 **邏輯日誌**，以區別於儲存引擎的（**物理**）資料表示。

關係資料庫的邏輯日誌通常是描述以行粒度對資料庫表的寫入的記錄序列：

* 對於插入的行，日誌包含所有列的新值。
* 對於刪除的行，日誌包含足夠的資訊來唯一標識被刪除的行。通常這將是主鍵，但如果表上沒有主鍵，則需要記錄所有列的舊值。
* 對於更新的行，日誌包含足夠的資訊來唯一標識更新的行，以及所有列的新值（或至少所有已更改的列的新值）。

修改多行的事務會生成多個這樣的日誌記錄，後跟指示事務已提交的記錄。MySQL 除了 WAL 之外還保留一個單獨的邏輯複製日誌，稱為 **binlog**（當配置為使用基於行的複製時）。PostgreSQL 透過將物理 WAL 解碼為行插入/更新/刪除事件來實現邏輯複製 [^19]。

由於邏輯日誌與儲存引擎內部解耦，因此可以更容易地保持向後相容，允許領導者和追隨者執行不同版本的資料庫軟體。這反過來又可以以最少的停機時間升級到新版本 [^20]。

邏輯日誌格式也更容易被外部應用解析。如果你想把資料庫內容傳送到外部系統（例如用於離線分析的資料倉庫），或者構建自定義索引和快取 [^21]，這一點會很有用。這種技術稱為 **資料變更捕獲**，我們將在 ["資料變更捕獲"](/tw/ch12#sec_stream_cdc) 一節再回到它。


## 複製延遲的問題 {#sec_replication_lag}

能夠容忍節點故障只是想要複製的一個原因。如 ["分散式與單節點系統"](/tw/ch1#sec_introduction_distributed) 中所述，其他原因是可伸縮性（處理比單臺機器能夠處理的更多請求）和延遲（將副本在地理上放置得更接近使用者）。

基於領導者的複製要求所有寫入都透過單個節點，但只讀查詢可以轉到任何副本。對於主要由讀取和只有少量寫入組成的工作負載（這通常是線上服務的情況），有一個有吸引力的選擇：建立許多追隨者，並將讀取請求分佈在這些追隨者上。這減輕了領導者的負載，並允許附近的副本提供讀取請求。

在這種 **讀擴充套件** 架構中，你可以透過新增更多追隨者來簡單地增加服務只讀請求的容量。然而，這種方法只有在使用非同步複製時才現實可行——如果你試圖同步複製到所有追隨者，單個節點故障或網路中斷將使整個系統無法寫入。而且你擁有的節點越多，其中一個節點宕機的可能性就越大，因此完全同步的配置將非常不可靠。

不幸的是，如果應用程式從 **非同步** 追隨者讀取，如果追隨者已落後，它可能會看到過時的資訊。這導致資料庫中出現明顯的不一致：如果你同時在領導者和追隨者上執行相同的查詢，你可能會得到不同的結果，因為並非所有寫入都已反映在追隨者中。這種不一致只是一種臨時狀態——如果你停止向資料庫寫入並等待一段時間，追隨者最終將趕上並與領導者保持一致。因此，這種效果被稱為 **最終一致性** [^22]。

--------

> [!NOTE]
> 術語 **最終一致性** 由 Douglas Terry 等人創造 [^23]，由 Werner Vogels 推廣 [^24]，並成為許多 NoSQL 專案的戰鬥口號。然而，不僅 NoSQL 資料庫是最終一致的：非同步複製的關係資料庫中的追隨者具有相同的特徵。

--------

術語"最終"是故意模糊的：一般來說，副本可以落後多遠沒有限制。在正常操作中，寫入發生在領導者上並反映在追隨者上之間的延遲——**複製延遲**——可能只是幾分之一秒，在實踐中不會被注意到。然而，如果系統在接近容量執行或網路中存在問題，延遲可以輕易增加到幾秒甚至幾分鐘。

當延遲如此之大時，它引入的不一致不僅僅是一個理論問題，而是應用程式的真正問題。在本節中，我們將重點介紹複製延遲時可能發生的三個問題示例。我們還將概述解決它們的一些方法。

### 讀己之寫 {#sec_replication_ryw}

許多應用程式讓使用者提交一些資料，然後檢視他們提交的內容。這可能是客戶資料庫中的記錄，或討論串上的評論，或其他類似的東西。提交新資料時，必須將其傳送到領導者，但當使用者檢視資料時，可以從追隨者讀取。如果資料經常被檢視但只是偶爾被寫入，這尤其合適。

使用非同步複製，存在一個問題，如 [圖 6-3](#fig_replication_read_your_writes) 所示：如果使用者在寫入後不久檢視資料，新資料可能尚未到達副本。對使用者來說，看起來他們提交的資料丟失了，所以他們自然會不高興。

{{< figure src="/fig/ddia_0603.png" id="fig_replication_read_your_writes" caption="圖 6-3. 使用者進行寫入，然後從陳舊副本讀取。為了防止這種異常，我們需要寫後讀一致性。" class="w-full my-4" >}}

在這種情況下，我們需要 **寫後讀一致性**，也稱為 **讀己之寫一致性** [^23]。這是一種保證，如果使用者重新載入頁面，他們將始終看到他們自己提交的任何更新。它不對其他使用者做出承諾：其他使用者的更新可能直到稍後才可見。然而，它向用戶保證他們自己的輸入已正確儲存。

我們如何在基於領導者的複製系統中實現寫後讀一致性？有各種可能的技術。下面舉幾個例子：

* 當讀取使用者可能已修改的內容時，從領導者或同步更新的追隨者讀取；否則，從非同步更新的追隨者讀取。這要求你有某種方法知道某物是否可能已被修改，而無需實際查詢它。例如，社交網路上的使用者個人資料資訊通常只能由個人資料的所有者編輯，而不能由其他任何人編輯。因此，一個簡單的規則是：始終從領導者讀取使用者自己的個人資料，從追隨者讀取任何其他使用者的個人資料。
* 如果應用程式中的大多數東西都可能被使用者編輯，那種方法將不會有效，因為大多數東西都必須從領導者讀取（否定了讀擴充套件的好處）。在這種情況下，可以使用其他標準來決定是否從領導者讀取。例如，你可以跟蹤上次更新的時間，並在上次更新後的一分鐘內，使所有讀取都來自領導者 [^25]。你還可以監控追隨者上的複製延遲，並防止在落後領導者超過一分鐘的任何追隨者上進行查詢。
* 客戶端可以記住其最近寫入的時間戳——然後系統可以確保為該使用者提供任何讀取的副本至少反映該時間戳之前的更新。如果副本不夠最新，則可以由另一個副本處理讀取，或者查詢可以等待直到副本趕上 [^26]。時間戳可以是 **邏輯時間戳**（指示寫入順序的東西，例如日誌序列號）或實際系統時鐘（在這種情況下，時鐘同步變得至關重要；見 ["不可靠的時鐘"](/tw/ch9#sec_distributed_clocks)）。
* 如果你的副本分佈在各個地區（為了地理上接近使用者或為了可用性），還有額外的複雜性。任何需要由領導者提供的請求都必須路由到包含領導者的地區。

當同一使用者從多個裝置訪問你的服務時，會出現另一個複雜情況，例如桌面網路瀏覽器和移動應用程式。在這種情況下，你可能希望提供 **跨裝置** 寫後讀一致性：如果使用者在一個裝置上輸入一些資訊，然後在另一個裝置上檢視它，他們應該看到他們剛剛輸入的資訊。

在這種情況下，需要考慮一些額外的問題：

* 需要記住使用者上次更新的時間戳的方法變得更加困難，因為在一個裝置上執行的程式碼不知道在另一個裝置上發生了什麼更新。此元資料將需要集中化。
* 如果你的副本分佈在不同的地區，則無法保證來自不同裝置的連線將路由到同一地區。（例如，如果使用者的臺式計算機使用家庭寬頻連線，而他們的移動裝置使用蜂窩資料網路，則裝置的網路路由可能完全不同。）如果你的方法需要從領導者讀取，你可能首先需要將來自使用者所有裝置的請求路由到同一地區。

--------

> [!TIP] 地區和可用區
>
> 我們用 **地區**（region）來指代一個地理位置中的一組資料中心。雲服務提供商通常會在同一地區部署多個數據中心，每個資料中心稱為 **可用區**（availability zone，簡稱 AZ）。因此，一個地區由多個可用區組成；每個可用區都是獨立的物理設施，具有自己的供電、製冷等基礎設施。
>
> 同一地區內各可用區通常透過高速網路互聯，延遲足夠低，因此大多數分散式系統可以把同一地區內的多個可用區近似看作一個機房。多可用區部署可以抵禦單個可用區故障，但無法抵禦整個地區不可用。要應對地區級中斷，系統必須跨多個地區部署，這通常會帶來更高延遲、更低吞吐和更高的雲網絡費用。我們將在 ["多主複製拓撲"](#sec_replication_topologies) 中進一步討論這些權衡。這裡你只需記住：本書所說的“地區”，是同一地理位置內多個可用區（資料中心）的集合。

--------

### 單調讀 {#sec_replication_monotonic_reads}

從非同步追隨者讀取時可能發生的第二個異常示例是，使用者可能會看到事物 **在時間上倒退**。

如果使用者從不同的副本進行多次讀取，就可能發生這種情況。例如，[圖 6-4](#fig_replication_monotonic_reads) 顯示使用者 2345 進行相同的查詢兩次，首先到延遲很小的追隨者，然後到延遲更大的追隨者。（如果使用者重新整理網頁，並且每個請求都路由到隨機伺服器，這種情況很可能發生。）第一個查詢返回使用者 1234 最近新增的評論，但第二個查詢沒有返回任何內容，因為滯後的追隨者尚未獲取該寫入。實際上，第二個查詢觀察到的是比第一個查詢更早時間點的系統狀態。如果第一個查詢沒有返回任何內容，這不會那麼糟糕，因為使用者 2345 可能不知道使用者 1234 最近添加了評論。然而，如果使用者 2345 首先看到使用者 1234 的評論出現，然後又看到它消失，這對使用者 2345 來說非常令人困惑。

{{< figure src="/fig/ddia_0604.png" id="fig_replication_monotonic_reads" caption="圖 6-4. 使用者首先從新鮮副本讀取，然後從陳舊副本讀取。時間似乎倒退了。為了防止這種異常，我們需要單調讀。" class="w-full my-4" >}}

**單調讀** [^22] 是一種保證這類異常不會發生的會話保證。它比強一致性弱，但比最終一致性強。當你讀取資料時，仍可能看到舊值；單調讀只保證同一使用者按順序進行多次讀取時，不會出現“時間倒退”——也就是先讀到新值，後又讀到更舊的值。

實現單調讀的一種方法是確保每個使用者始終從同一副本進行讀取（不同的使用者可以從不同的副本讀取）。例如，可以基於使用者 ID 的雜湊選擇副本，而不是隨機選擇。然而，如果該副本失敗，使用者的查詢將需要重新路由到另一個副本。

### 一致字首讀 {#sec_replication_consistent_prefix}

我們的第三個複製延遲異常示例涉及違反因果關係。想象一下 Poons 先生和 Cake 夫人之間的以下簡短對話：

Poons 先生
:   你能看到多遠的未來，Cake 夫人？

Cake 夫人
:   通常大約十秒鐘，Poons 先生。

這兩個句子之間存在因果依賴關係：Cake 夫人聽到了 Poons 先生的問題並回答了它。

現在，想象第三個人透過追隨者聽這個對話。Cake 夫人說的話透過延遲很小的追隨者，但 Poons 先生說的話有更長的複製延遲（見 [圖 6-5](#fig_replication_consistent_prefix)）。這個觀察者會聽到以下內容：

Cake 夫人
:   通常大約十秒鐘，Poons 先生。

Poons 先生
:   你能看到多遠的未來，Cake 夫人？

對觀察者來說，看起來 Cake 夫人在 Poons 先生甚至提出問題之前就回答了問題。這種通靈能力令人印象深刻，但非常令人困惑 [^27]。

{{< figure src="/fig/ddia_0605.png" id="fig_replication_consistent_prefix" caption="圖 6-5. 如果某些分片的複製比其他分片慢，觀察者可能會在看到問題之前看到答案。" class="w-full my-4" >}}

防止這種異常需要另一種型別的保證：**一致字首讀** [^22]。這種保證說，如果一系列寫入以某個順序發生，那麼任何讀取這些寫入的人都會看到它們以相同的順序出現。

這是分片（分割槽）資料庫中的一個特殊問題，我們將在 [第 7 章](/tw/ch7#ch_sharding) 中討論。如果資料庫始終以相同的順序應用寫入，讀取始終會看到一致的字首，因此這種異常不會發生。然而，在許多分散式資料庫中，不同的分片獨立執行，因此沒有全域性的寫入順序：當用戶從資料庫讀取時，他們可能會看到資料庫的某些部分處於較舊狀態，而某些部分處於較新狀態。

一種解決方案是確保任何因果相關的寫入都寫入同一分片——但在某些應用程式中，這無法有效完成。還有一些演算法明確跟蹤因果依賴關係，這是我們將在 ["先發生關係與併發"](#sec_replication_happens_before) 中回到的主題。

### 複製延遲的解決方案 {#id131}

在使用最終一致系統時，值得思考：如果複製延遲上升到幾分鐘甚至幾小時，應用程式會如何表現。如果答案是“沒問題”，那很好；但如果這會造成糟糕的使用者體驗，就應當設計系統提供更強的保證（如寫後讀一致性）。把非同步複製當作同步複製來假設，往往會在系統承壓時暴露問題。

如前所述，應用程式可以提供比底層資料庫更強的保證——例如，透過在領導者或同步更新的追隨者上執行某些型別的讀取。然而，在應用程式程式碼中處理這些問題很複雜且容易出錯。

對於應用程式開發人員來說，最簡單的程式設計模型是選擇一個為副本提供強一致性保證的資料庫，例如線性一致性（見 [第 10 章](/tw/ch10#ch_consistency)）和 ACID 事務（見 [第 8 章](/tw/ch8#ch_transactions)）。這允許你大部分忽略複製帶來的挑戰，並將資料庫視為只有一個節點。在 2010 年代初期，**NoSQL** 運動推廣了這樣的觀點，即這些功能限制了可伸縮性，大規模系統必須接受最終一致性。

然而，從那時起，許多資料庫開始提供強一致性和事務，同時還提供分散式資料庫的容錯、高可用性和可伸縮性優勢。如 ["關係模型與文件模型"](/tw/ch3#sec_datamodels_history) 中所述，這種趨勢被稱為 **NewSQL**，以與 NoSQL 形成對比（儘管它不太關於 SQL 本身，而更多關於可伸縮事務管理的新方法）。

儘管現在可以使用可伸縮、強一致的分散式資料庫，但某些應用程式選擇使用提供較弱一致性保證的不同形式的複製仍然有充分的理由：它們可以在面對網路中斷時提供更強的韌性，並且與事務系統相比具有較低的開銷。我們將在本章的其餘部分探討這些方法。


## 多主複製 {#sec_replication_multi_leader}

到目前為止，本章中我們只考慮了使用單個領導者的複製架構。儘管這是一種常見的方法，但還有一些有趣的替代方案。

單主複製有一個主要缺點：所有寫入都必須透過一個領導者。如果由於任何原因無法連線到領導者，例如你和領導者之間的網路中斷，你就無法寫入資料庫。

單主複製模型的自然延伸是允許多個節點接受寫入。複製仍然以相同的方式進行：每個處理寫入的節點必須將該資料變更轉發給所有其他節點。我們稱之為 **多主** 配置（也稱為 **主動/主動** 或 **雙向** 複製）。在這種設定中，每個領導者同時充當其他領導者的追隨者。

與單主複製一樣，可以選擇使其同步或非同步。假設你有兩個領導者，*A* 和 *B*，你正在嘗試寫入 *A*。如果寫入從 *A* 同步複製到 *B*，並且兩個節點之間的網路中斷，你就無法寫入 *A* 直到網路恢復。同步多主複製因此給你一個非常類似於單主複製的模型，即如果你讓 *B* 成為領導者，*A* 只是將任何寫入請求轉發給 *B* 執行。

因此，我們不會進一步討論同步多主複製，而只是將其視為等同於單主複製。本節的其餘部分專注於非同步多主複製，其中任何領導者都可以處理寫入，即使其與其他領導者的連線中斷。

### 跨地域執行 {#sec_replication_multi_dc}

在單個地區內使用多主設定很少有意義，因為好處很少超過增加的複雜性。然而，在某些情況下，這種配置是合理的。

想象你有一個數據庫，在幾個不同的地區有副本（也許是為了能夠容忍整個地區的故障，或者是為了更接近你的使用者）。這被稱為 **地理分散式**、**地域分散式** 或 **地域複製** 設定。使用單主複製，領導者必須在 **一個** 地區，所有寫入都必須透過該地區。

在多主配置中，你可以在 **每個** 地區都部署一個領導者。[圖 6-6](#fig_replication_multi_dc) 展示了這種架構：在每個地區內使用常規單主複製（追隨者可能位於與領導者不同的可用區）；在地區之間，每個地區的領導者把變更複製給其他地區的領導者。

{{< figure src="/fig/ddia_0606.png" id="fig_replication_multi_dc" caption="圖 6-6. 跨多個地區的多主複製。" class="w-full my-4" >}}

讓我們比較單主和多主配置在多地區部署中的表現：

效能
:   在單主配置中，每次寫入都必須透過網際網路到擁有領導者的地區。這可能會給寫入增加顯著的延遲，並可能違背首先擁有多個地區的目的。在多主配置中，每次寫入都可以在本地地區處理，並非同步複製到其他地區。因此，跨地區網路延遲對使用者是隱藏的，這意味著感知效能可能更好。

地區故障容忍
:   在單主配置中，如果擁有領導者的地區變得不可用，故障轉移可以將另一個地區的追隨者提升為領導者。在多主配置中，每個地區可以獨立於其他地區繼續執行，並在離線地區恢復上線時趕上複製。

網路問題容忍
:   即使有專用連線，地區之間的流量也可能比同一地區內或單個區域內的流量更不可靠。單主配置對這種跨地區鏈路中的問題非常敏感，因為當一個地區的客戶端想要寫入另一個地區的領導者時，它必須透過該鏈路傳送其請求並等待響應才能完成。

    具有非同步複製的多主配置可以更好地容忍網路問題：在臨時網路中斷期間，每個地區的領導者可以繼續獨立處理寫入。

一致性
:   單主系統可以提供強一致性保證，例如可序列化事務，我們將在 [第 8 章](/tw/ch8#ch_transactions) 中討論。多主系統的最大缺點是它們能夠實現的一致性要弱得多。例如，你不能保證銀行賬戶不會變成負數或使用者名稱是唯一的：不同的領導者總是可能處理單獨沒問題的寫入（從賬戶中支付一些錢，註冊特定使用者名稱），但當與另一個領導者上的另一個寫入結合時違反了約束。

    這只是分散式系統的基本限制 [^28]。如果你必須強制執行這類約束，通常應選擇單主系統。不過，正如我們將在 ["處理寫入衝突"](#sec_replication_write_conflicts) 中看到的，多主系統在不需要這類約束的廣泛應用裡，仍然可以提供有用的一致性屬性。

多主複製不如單主複製常見，但許多資料庫仍然支援它，包括 MySQL、Oracle、SQL Server 和 YugabyteDB。在某些情況下，它是一個外部附加功能，例如在 Redis Enterprise、EDB Postgres Distributed 和 pglogical 中 [^29]。

由於多主複製在許多資料庫中是一個有點改裝的功能，因此通常存在微妙的配置陷阱和與其他資料庫功能的令人驚訝的互動。例如，自增鍵、觸發器和完整性約束可能會有問題。因此，多主複製通常被認為是應該儘可能避免的危險領域 [^30]。

#### 多主複製拓撲 {#sec_replication_topologies}

**複製拓撲** 描述了寫入從一個節點傳播到另一個節點的通訊路徑。如果你有兩個領導者，如 [圖 6-9](#fig_replication_write_conflict) 中，只有一種合理的拓撲：領導者 1 必須將其所有寫入傳送到領導者 2，反之亦然。有了兩個以上的領導者，各種不同的拓撲是可能的。[圖 6-7](#fig_replication_topologies) 中說明了一些示例。

{{< figure src="/fig/ddia_0607.png" id="fig_replication_topologies" caption="圖 6-7. 可以設定多主複製的三個示例拓撲。" class="w-full my-4" >}}

最通用的拓撲是 **全對全**，如 [圖 6-7](#fig_replication_topologies)(c) 所示，其中每個領導者將其寫入傳送到每個其他領導者。然而，也使用更受限制的拓撲：例如 **環形拓撲**，其中每個節點從一個節點接收寫入並將這些寫入（加上其自己的任何寫入）轉發到另一個節點。另一種流行的拓撲具有 **星形** 形狀：一個指定的根節點將寫入轉發到所有其他節點。星形拓撲可以推廣到樹形。

--------

> [!NOTE]
> 不要將星形網路拓撲與 **星型模式** 混淆（見 ["星型與雪花型：分析模式"](/tw/ch3#sec_datamodels_analytics)），後者描述了資料模型的結構。

--------

在環形和星形拓撲中，寫入可能需要通過幾個節點才能到達所有副本。因此，節點需要轉發它們從其他節點接收的資料變更。為了防止無限複製迴圈，每個節點都被賦予一個唯一識別符號，並且在複製日誌中，每個寫入都用它經過的所有節點的識別符號標記 [^31]。當節點接收到用其自己的識別符號標記的資料變更時，該資料變更將被忽略，因為節點知道它已經被處理過了。

#### 不同拓撲的問題 {#problems-with-different-topologies}

環形和星形拓撲的一個問題是，如果只有一個節點發生故障，它可能會中斷其他節點之間的複製訊息流，使它們無法通訊，直到節點被修復。可以重新配置拓撲以繞過故障節點，但在大多數部署中，這種重新配置必須手動完成。更密集連線的拓撲（如全對全）的容錯性更好，因為它允許訊息沿著不同的路徑傳播，避免單點故障。

另一方面，全對全拓撲也可能有問題。特別是，一些網路鏈路可能比其他鏈路更快（例如，由於網路擁塞），結果是一些複製訊息可能會"超越"其他訊息，如 [圖 6-8](#fig_replication_causality) 所示。

{{< figure src="/fig/ddia_0608.png" id="fig_replication_causality" caption="圖 6-8. 使用多主複製，寫入可能以錯誤的順序到達某些副本。" class="w-full my-4" >}}

在 [圖 6-8](#fig_replication_causality) 中，客戶端 A 在領導者 1 上向表中插入一行，客戶端 B 在領導者 3 上更新該行。然而，領導者 2 可能以不同的順序接收寫入：它可能首先接收更新（從其角度來看，這是對資料庫中不存在的行的更新），然後才接收相應的插入（應該在更新之前）。

這是一個因果關係問題，類似於我們在 ["一致字首讀"](#sec_replication_consistent_prefix) 中看到的問題：更新依賴於先前的插入，因此我們需要確保所有節點首先處理插入，然後處理更新。簡單地為每個寫入附加時間戳是不夠的，因為時鐘不能被信任足夠同步以在領導者 2 上正確排序這些事件（見 [第 9 章](/tw/ch9#ch_distributed)）。

為了正確排序這些事件，可以使用一種稱為 **版本向量** 的技術，我們將在本章後面討論（見 ["檢測併發寫入"](#sec_replication_concurrent)）。然而，許多多主複製系統不使用良好的技術來排序更新，使它們容易受到像 [圖 6-8](#fig_replication_causality) 中的問題的影響。如果你使用多主複製，值得了解這些問題，仔細閱讀文件，並徹底測試你的資料庫，以確保它真正提供你認為它具有的保證。

### 同步引擎與本地優先軟體 {#sec_replication_offline_clients}

另一種適合多主複製的情況是，如果你有一個需要在與網際網路斷開連線時繼續工作的應用程式。

例如，考慮你的手機、筆記型電腦和其他裝置上的日曆應用程式。你需要能夠隨時檢視你的會議（進行讀取請求）並輸入新會議（進行寫入請求），無論你的裝置當前是否有網際網路連線。如果你在離線時進行任何更改，它們需要在裝置下次上線時與伺服器和你的其他裝置同步。

在這種情況下，每個裝置都擁有一個充當領導者的本地資料庫副本（可接受寫入），並在你所有裝置上的日曆副本之間執行非同步多主複製流程（即同步過程）。複製延遲可能是幾小時甚至幾天，具體取決於你何時能連上網際網路。

從架構的角度來看，這種設定與地區之間的多主複製非常相似，只是被推向了極端：每個裝置是一個"地區"，它們之間的網路連線極其不可靠。

#### 即時協作、離線優先和本地優先應用 {#real-time-collaboration-offline-first-and-local-first-apps}

此外，許多現代 Web 應用程式提供 **即時協作** 功能，例如用於文字文件和電子表格的 Google Docs 和 Sheets，用於圖形的 Figma，以及用於專案管理的 Linear。這些應用程式之所以響應如此迅速，是因為使用者輸入立即反映在使用者介面中，無需等待到伺服器的網路往返，並且一個使用者的編輯以低延遲顯示給他們的協作者 [^32] [^33] [^34]。

這再次導致多主架構：每個開啟共享檔案的 Web 瀏覽器選項卡都是一個副本，你對檔案進行的任何更新都會非同步複製到開啟同一檔案的其他使用者的裝置。即使應用程式不允許你在離線時繼續編輯檔案，多個使用者可以進行編輯而無需等待伺服器的響應這一事實已經使其成為多主。

離線編輯和即時協作都需要類似的複製基礎設施：應用程式需要捕獲使用者對檔案所做的任何更改，並立即將它們傳送給協作者（如果線上），或本地儲存它們以供稍後傳送（如果離線）。此外，應用程式需要接收來自協作者的更改，將它們合併到使用者的檔案本地副本中，並更新使用者介面以反映最新版本。如果多個使用者同時更改了檔案，可能需要衝突解決邏輯來合併這些更改。

支援此過程的軟體庫稱為 **同步引擎**。儘管這個想法已經存在很長時間了，但這個術語最近才受到關注 [^35] [^36] [^37]。允許使用者在離線時繼續編輯檔案的應用程式（可能使用同步引擎實現）稱為 **離線優先** [^38]。術語 **本地優先軟體** 指的是這樣的協作應用程式：它們不僅是離線優先的，而且被設計為即使製作該軟體的開發人員關閉了所有線上服務，也能繼續工作 [^39]。這可以透過使用具有開放標準同步協議的同步引擎來實現，該協議有多個服務提供商可用 [^40]。例如，Git 是一個本地優先的協作系統（儘管不支援即時協作），因為你可以透過 GitHub、GitLab 或任何其他儲存庫託管服務進行同步。

#### 同步引擎的利弊 {#pros-and-cons-of-sync-engines}

今天構建 Web 應用程式的主導方式是在客戶端保留很少的持久狀態，並在需要顯示新資料或需要更新某些資料時依賴向伺服器發出請求。相比之下，當使用同步引擎時，你在客戶端有持久狀態，與伺服器的通訊被移到後臺程序中。同步引擎方法有許多優點：

* 在本地擁有資料意味著使用者介面的響應速度可以比必須等待服務呼叫獲取某些資料時快得多。一些應用程式的目標是在圖形系統的 **下一幀** 響應使用者輸入，這意味著在 60 Hz 重新整理率的顯示器上在 16 毫秒內渲染。
* 允許使用者在離線時繼續工作是有價值的，特別是在具有間歇性連線的移動裝置上。使用同步引擎，應用程式不需要單獨的離線模式：離線與具有非常大的網路延遲相同。
* 與在應用程式程式碼中執行顯式服務呼叫相比，同步引擎簡化了前端應用程式的程式設計模型。每個服務呼叫都需要錯誤處理，如 ["遠端過程呼叫（RPC）的問題"](/tw/ch5#sec_problems_with_rpc) 中所討論的：例如，如果更新伺服器上的資料的請求失敗，使用者介面需要以某種方式反映該錯誤。同步引擎允許應用程式對本地資料執行讀寫，這幾乎從不失敗，導致更具宣告性的程式設計風格 [^41]。
* 為了即時顯示其他使用者的編輯，你需要接收這些編輯的通知並相應地有效更新使用者介面。同步引擎與 **響應式程式設計** 模型相結合是實現此目的的好方法 [^42]。

當用戶可能需要的所有資料都提前下載並持久儲存在客戶端時，同步引擎效果最佳。這意味著資料可用於離線訪問，但這也意味著如果使用者可以訪問非常大量的資料，同步引擎就不適合。例如，下載使用者自己建立的所有檔案可能很好（一個使用者通常不會生成那麼多資料），但下載電子商務網站的整個目錄可能沒有意義。

同步引擎由 Lotus Notes 在 20 世紀 80 年代開創 [^43]（沒有使用該術語），特定應用程式（如日曆）的同步也已經存在很長時間了。今天有許多通用同步引擎，其中一些使用專有後端服務（例如，Google Firestore、Realm 或 Ditto），有些具有開源後端，使它們適合建立本地優先軟體（例如，PouchDB/CouchDB、Automerge 或 Yjs）。

多人電子遊戲有類似的需求，需要立即響應使用者的本地操作，並將它們與透過網路非同步接收的其他玩家的操作協調。在遊戲開發術語中，同步引擎的等效物稱為 **網路程式碼**。網路程式碼中使用的技術非常特定於遊戲的要求 [^44]，並且不能直接應用於其他型別的軟體，因此我們不會在本書中進一步考慮它們。


### 處理寫入衝突 {#sec_replication_write_conflicts}

多主複製的最大問題——無論是在地域分散式伺服器端資料庫中還是在終端使用者裝置上的本地優先同步引擎中——是不同領導者上的併發寫入可能導致需要解決的衝突。

例如，考慮一個維基頁面同時被兩個使用者編輯，如 [圖 6-9](#fig_replication_write_conflict) 所示。使用者 1 將頁面標題從 A 更改為 B，使用者 2 獨立地將標題從 A 更改為 C。每個使用者的更改成功應用於其本地領導者。然而，當更改非同步複製時，檢測到衝突。這個問題在單主資料庫中不會發生。

{{< figure src="/fig/ddia_0609.png" id="fig_replication_write_conflict" caption="圖 6-9. 兩個領導者併發更新同一記錄導致的寫入衝突。" class="w-full my-4" >}}

> [!NOTE]
> 我們說 [圖 6-9](#fig_replication_write_conflict) 中的兩個寫入是 **併發的**，因為在最初進行寫入時，兩者都不“知道”對方。寫入是否真的在同一時刻發生並不重要；實際上，如果寫入發生在離線狀態，它們在物理時間上可能相隔很久。關鍵在於：一個寫入是否發生在另一個寫入已經生效的狀態之上。

在 ["檢測併發寫入"](#sec_replication_concurrent) 中，我們將解決資料庫如何確定兩個寫入是否併發的問題。現在我們假設我們可以檢測衝突，並且我們想找出解決它們的最佳方法。

#### 衝突避免 {#conflict-avoidance}

處理衝突的一種策略是從一開始就避免它們發生。例如，如果應用程式可以確保特定記錄的所有寫入都透過同一領導者，那麼即使整個資料庫是多主的，也不會發生衝突。這種方法在同步引擎客戶端離線更新的情況下是不可能的，但在地域複製的伺服器系統中有時是可能的 [^30]。

例如，在一個使用者只能編輯自己資料的應用程式中，你可以確保來自特定使用者的請求始終路由到同一地區，並使用該地區的領導者進行讀寫。不同的使用者可能有不同的"主"地區（可能基於與使用者的地理接近程度選擇），但從任何一個使用者的角度來看，配置本質上是單主的。

然而，有時你可能想要更改記錄的指定領導者——也許是因為一個地區不可用，你需要將流量重新路由到另一個地區，或者也許是因為使用者已經移動到不同的位置，現在更接近不同的地區。現在存在風險，即使用者在指定領導者更改正在進行時執行寫入，導致必須使用下面的方法之一解決的衝突。因此，如果你允許更改領導者，衝突避免就會失效。

衝突避免的另一個例子：想象你想要插入新記錄並基於自增計數器為它們生成唯一 ID。如果你有兩個領導者，你可以設定它們，使得一個領導者只生成奇數，另一個只生成偶數。這樣你可以確保兩個領導者不會同時為不同的記錄分配相同的 ID。我們將在 ["ID 生成器和邏輯時鐘"](/tw/ch10#sec_consistency_logical) 中討論其他 ID 分配方案。


#### 最後寫入勝利（丟棄併發寫入） {#sec_replication_lww}

如果無法避免衝突，解決它們的最簡單方法是為每個寫入附加時間戳，並始終使用具有最大時間戳的值。例如，在 [圖 6-9](#fig_replication_write_conflict) 中，假設使用者 1 的寫入時間戳大於使用者 2 的寫入時間戳。在這種情況下，兩個領導者都將確定頁面的新標題應該是 B，並丟棄將其設定為 C 的寫入。如果寫入巧合地具有相同的時間戳，可以透過比較值來選擇獲勝者（例如，在字串的情況下，取字母表中較早的那個）。

這種方法稱為 **最後寫入勝利**（LWW），因為具有最大時間戳的寫入可以被認為是"最後"的。然而，這個術語是誤導性的，因為當兩個寫入像 [圖 6-9](#fig_replication_write_conflict) 中那樣併發時，哪個更舊，哪個更新是未定義的，因此併發寫入的時間戳順序本質上是隨機的。

因此，LWW 的真正含義是：當同一記錄在不同的領導者上併發寫入時，其中一個寫入被隨機選擇為獲勝者，其他寫入被靜默丟棄，即使它們在各自的領導者上成功處理。這實現了最終所有副本都處於一致狀態的目標，但代價是資料丟失。

如果你可以避免衝突——例如，透過只插入具有唯一鍵（如 UUID）的記錄，而從不更新它們——那麼 LWW 沒有問題。但是，如果你更新現有記錄，或者如果不同的領導者可能插入具有相同鍵的記錄，那麼你必須決定丟失的更新對你的應用程式是否是個問題。如果丟失的更新是不可接受的，你需要使用下面描述的衝突解決方法之一。

LWW 的另一個問題是，如果使用即時時鐘（例如 Unix 時間戳）作為寫入的時間戳，系統對時鐘同步變得非常敏感。如果一個節點的時鐘領先於其他節點，並且你嘗試覆蓋該節點寫入的值，你的寫入可能會被忽略，因為它可能具有較低的時間戳，即使它明顯發生得更晚。這個問題可以透過使用 **邏輯時鐘** 來解決，我們將在 ["ID 生成器和邏輯時鐘"](/tw/ch10#sec_consistency_logical) 中討論。

#### 手動衝突解決 {#manual-conflict-resolution}

如果隨機丟棄你的一些寫入是不可取的，下一個選擇是手動解決衝突。你可能熟悉 Git 和其他版本控制系統中的手動衝突解決：如果兩個不同分支上的提交編輯同一檔案的相同行，並且你嘗試合併這些分支，你將得到一個需要在合併完成之前解決的合併衝突。

在資料庫裡，讓衝突阻塞整個複製流程、直到人工處理，通常並不現實。更常見的是，資料庫會保留某條記錄的所有併發寫入值——例如 [圖 6-9](#fig_replication_write_conflict) 中的 B 和 C。這些值有時稱為 **兄弟**。下次查詢該記錄時，資料庫會返回 **所有** 這些值，而不只是最新值。隨後你可以按需要解決這些值：要麼在應用程式碼裡自動處理（例如把 B 和 C 合併成 "B/C"），要麼讓使用者參與處理；最後再把新值寫回資料庫以消解衝突。

這種衝突解決方法在某些系統中使用，例如 CouchDB。然而，它也存在許多問題：

* 資料庫的 API 發生變化：例如，以前維基頁面的標題只是一個字串，現在它變成了一組字串，通常包含一個元素，但如果有衝突，有時可能包含多個元素。這可能使應用程式程式碼中的資料難以處理。
* 要求使用者手動合併兄弟，會帶來很大負擔：開發者需要構建衝突解決介面，使用者也可能不明白自己為何要做這件事。在很多場景下，自動合併比打擾使用者更合適。
* 如果不夠謹慎，自動合併兄弟也可能產生反直覺行為。例如，亞馬遜購物車曾允許併發更新，並用“並集”策略合併（保留出現在任一兄弟中的所有商品）。這意味著：若使用者在一個兄弟裡刪除了某商品，但另一個兄弟仍保留它，該商品會“復活”回購物車 [^45]。[圖 6-10](#fig_replication_amazon_anomaly) 就是一個例子：裝置 1 刪除 Book，裝置 2 併發刪除 DVD，衝突合併後兩個商品都回來了。
* 如果多個節點觀察到衝突並併發解決它，衝突解決過程本身可能會引入新的衝突。這些解決方案甚至可能不一致：例如，如果你沒有注意以一致的順序排列它們，一個節點可能將 B 和 C 合併為"B/C"，另一個可能將它們合併為"C/B"。當"B/C"和"C/B"之間的衝突被合併時，它可能導致"B/C/C/B"或類似令人驚訝的東西。

{{< figure src="/fig/ddia_0610.png" id="fig_replication_amazon_anomaly" caption="圖 6-10. 亞馬遜購物車異常的示例：如果購物車上的衝突透過取並集來合併，刪除的專案可能會重新出現。" class="w-full my-4" >}}


#### 自動衝突解決 {#automatic-conflict-resolution}

對於許多應用程式，處理衝突的最佳方法是使用自動將併發寫入合併為一致狀態的演算法。自動衝突解決確保所有副本 **收斂** 到相同的狀態——即，處理了相同寫入集的所有副本都具有相同的狀態，無論寫入到達的順序如何。

LWW 是衝突解決演算法的一個簡單示例。已經為不同型別的資料開發了更複雜的合併演算法，目標是儘可能保留所有更新的預期效果，從而避免資料丟失：

* 如果資料是文字（例如維基頁面標題或正文），我們可以檢測每次版本演進中的字元插入和刪除。合併結果會保留任一兄弟中的所有插入和刪除。如果多個使用者併發在同一位置插入文字，還可以用確定性順序來排序，以確保所有節點得到同樣的合併結果。
* 如果資料是專案集合（像待辦事項列表那樣有序，或像購物車那樣無序），我們可以像處理文字那樣，透過跟蹤插入和刪除來合併它。為了避免 [圖 6-10](#fig_replication_amazon_anomaly) 中的購物車問題，演算法跟蹤 Book 和 DVD 被刪除的事實，因此合併的結果是 Cart = {Soap}。
* 如果資料是可增可減的整數計數器（例如社交媒體帖子的點贊數），合併演算法可以統計每個兄弟上的遞增和遞減次數，並正確求和，既不重複計數，也不丟更新。
* 如果資料是鍵值對映，我們可以透過將其他衝突解決演算法之一應用於該鍵下的值來合併對同一鍵的更新。對不同鍵的更新可以相互獨立處理。

衝突解決的可能性是有限的。例如，如果你想強制一個列表不包含超過五個專案，並且多個使用者併發地向列表新增專案，使得總共有五個以上，你唯一的選擇是丟棄一些專案。儘管如此，自動衝突解決足以構建許多有用的應用程式。如果你從想要構建協作離線優先或本地優先應用程式的要求開始，那麼衝突解決是不可避免的，自動化它通常是最好的方法。

### CRDT 與操作變換 {#sec_replication_crdts}

兩個演算法族通常用於實現自動衝突解決：**無衝突複製資料型別**（CRDT）[^46] 和 **操作變換**（OT）[^47]。它們具有不同的設計理念和效能特徵，但都能夠為前面提到的所有型別的資料執行自動合併。

[圖 6-11](#fig_replication_ot_crdt) 顯示了 OT 和 CRDT 如何合併對文字的併發更新的示例。假設你有兩個副本，都從文字"ice"開始。一個副本在前面新增字母"n"以製作"nice"，而另一個副本併發地附加感嘆號以製作"ice!"。

{{< figure src="/fig/ddia_0611.png" id="fig_replication_ot_crdt" caption="圖 6-11. OT 和 CRDT 如何分別合併對字串的兩個併發插入。" class="w-full my-4" >}}

合併的結果"nice!"由兩種型別的演算法以不同的方式實現：

OT
:   我們記錄插入或刪除字元的索引："n"插入在索引 0，"!"插入在索引 3。接下來，副本交換它們的操作。在 0 處插入"n"可以按原樣應用，但如果在 3 處插入"!"應用於狀態"nice"，我們將得到"nic!e"，這是不正確的。因此，我們需要轉換每個操作的索引以考慮已經應用的併發操作；在這種情況下，"!"的插入被轉換為索引 4 以考慮在較早索引處插入"n"。

CRDT
:   大多數 CRDT 為每個字元提供唯一的、不可變的 ID，並使用這些 ID 來確定插入/刪除的位置，而不是索引。例如，在 [圖 6-11](#fig_replication_ot_crdt) 中，我們將 ID 1A 分配給"i"，ID 2A 分配給"c"等。插入感嘆號時，我們生成一個包含新字元的 ID（4B）和我們想要在其後插入的現有字元的 ID（3A）的操作。要在字串的開頭插入，我們將"nil"作為前面的字元 ID。在同一位置的併發插入按字元的 ID 排序。這確保副本收斂而不執行任何轉換。

有許多基於這些想法變體的演算法。列表/陣列可以類似地支援，使用列表元素而不是字元，其他資料型別（如鍵值對映）可以很容易地新增。OT 和 CRDT 之間存在一些效能和功能權衡，但可以在一個演算法中結合 CRDT 和 OT 的優點 [^48]。

OT 最常用於文字的即時協作編輯，例如在 Google Docs 中 [^32]，而 CRDT 可以在分散式資料庫中找到，例如 Redis Enterprise、Riak 和 Azure Cosmos DB [^49]。JSON 資料的同步引擎可以使用 CRDT（例如，Automerge 或 Yjs）和 OT（例如，ShareDB）實現。

#### 什麼是衝突？ {#what-is-a-conflict}

某些型別的衝突是顯而易見的。在 [圖 6-9](#fig_replication_write_conflict) 的示例中，兩個寫入併發修改了同一記錄中的同一欄位，將其設定為兩個不同的值。毫無疑問，這是一個衝突。

其他型別的衝突可能更難以檢測。例如，考慮一個會議室預訂系統：它跟蹤哪個房間由哪組人在什麼時間預訂。此應用程式需要確保每個房間在任何時間只由一組人預訂（即，同一房間不得有任何重疊的預訂）。在這種情況下，如果為同一房間同時建立兩個不同的預訂，可能會出現衝突。即使應用程式在允許使用者進行預訂之前檢查可用性，如果兩個預訂是在兩個不同的領導者上進行的，也可能會發生衝突。

沒有現成的快速答案，不過在後續章節中，我們會逐步建立對這個問題的理解。我們將在 [第 8 章](/tw/ch8#ch_transactions) 看到更多衝突案例，並在 ["透過事件順序捕獲因果關係"](/tw/ch13#sec_future_capture_causality) 中討論在複製系統裡可伸縮地檢測和解決衝突的方法。


## 無主複製 {#sec_replication_leaderless}

到目前為止，我們在本章中討論的複製方法——單主和多主複製——都基於這樣的想法：客戶端向一個節點（領導者）傳送寫入請求，資料庫系統負責將該寫入複製到其他副本。領導者確定寫入應該處理的順序，追隨者以相同的順序應用領導者的寫入。

一些資料儲存系統採用不同的方法，放棄領導者的概念，並允許任何副本直接接受來自客戶端的寫入。一些最早的複製資料系統是無主的 [^1] [^50]，但在關係資料庫主導的時代，這個想法基本上被遺忘了。在亞馬遜於 2007 年將其用於其內部 **Dynamo** 系統後，它再次成為資料庫的時尚架構 [^45]。Riak、Cassandra 和 ScyllaDB 是受 Dynamo 啟發的具有無主複製模型的開源資料儲存，因此這種資料庫也被稱為 **Dynamo 風格**。

--------

> [!NOTE]
> 原始的 **Dynamo** 系統僅在論文中描述 [^45]，但從未在亞馬遜之外發布。AWS 的名稱相似的 **DynamoDB** 是一個更新的雲資料庫，但它具有完全不同的架構：它使用基於 Multi-Paxos 共識演算法的單主複製 [^5]。

--------

在某些無主實現中，客戶端直接將其寫入傳送到多個副本，而在其他實現中，協調器節點代表客戶端執行此操作。然而，與領導者資料庫不同，該協調器不強制執行特定的寫入順序。正如我們將看到的，這種設計差異對資料庫的使用方式產生了深遠的影響。

### 當節點故障時寫入資料庫 {#id287}

想象你有一個具有三個副本的資料庫，其中一個副本當前不可用——也許它正在重新啟動以安裝系統更新。在單主配置中，如果你想繼續處理寫入，你可能需要執行故障轉移（見 ["處理節點故障"](#sec_replication_failover)）。

另一方面，在無主配置中，故障轉移不存在。[圖 6-12](#fig_replication_quorum_node_outage) 顯示了發生的情況：客戶端（使用者 1234）將寫入並行傳送到所有三個副本，兩個可用副本接受寫入，但不可用副本錯過了它。假設三個副本中有兩個確認寫入就足夠了：在使用者 1234 收到兩個 **ok** 響應後，我們認為寫入成功。客戶端只是忽略了其中一個副本錯過寫入的事實。

{{< figure src="/fig/ddia_0612.png" id="fig_replication_quorum_node_outage" caption="圖 6-12. 節點中斷後的仲裁寫入、仲裁讀取和讀修復。" class="w-full my-4" >}}


現在想象不可用節點恢復上線，客戶端開始從它讀取。在節點宕機期間發生的任何寫入都從該節點丟失。因此，如果你從該節點讀取，你可能會得到 **陳舊**（過時）值作為響應。

為了解決這個問題，當客戶端從資料庫讀取時，它不只是將其請求傳送到一個副本：**讀取請求也並行傳送到多個節點**。客戶端可能會從不同的節點獲得不同的響應；例如，從一個節點獲得最新值，從另一個節點獲得陳舊值。

為了區分哪些響應是最新的，哪些是過時的，寫入的每個值都需要用版本號或時間戳標記，類似於我們在 ["最後寫入勝利（丟棄併發寫入）"](#sec_replication_lww) 中看到的。當客戶端收到對讀取的多個值響應時，它使用具有最大時間戳的值（即使該值僅由一個副本返回，而其他幾個副本返回較舊的值）。有關更多詳細資訊，請參見 ["檢測併發寫入"](#sec_replication_concurrent)。

#### 追趕錯過的寫入 {#sec_replication_read_repair}

複製系統應確保最終所有資料都複製到每個副本。在不可用節點恢復上線後，它如何趕上它錯過的寫入？在 Dynamo 風格的資料儲存中使用了幾種機制：

讀修復
:   當客戶端並行從多個節點讀取時，它可以檢測任何陳舊響應。例如，在 [圖 6-12](#fig_replication_quorum_node_outage) 中，使用者 2345 從副本 3 獲得版本 6 的值，從副本 1 和 2 獲得版本 7 的值。客戶端發現副本 3 陳舊後，會把較新的值寫回該副本。這種方法適用於經常被讀取的值。

提示移交
:   如果一個副本不可用，另一個副本可能會以 **提示** 的形式代表其儲存寫入。當應該接收這些寫入的副本恢復時，儲存提示的副本將它們傳送到恢復的副本，然後刪除提示。這個 **移交** 過程有助於使副本保持最新，即使對於從未讀取的值也是如此，因此不由讀修復處理。

反熵
:   此外，還有一個後臺程序定期查詢副本之間資料的差異，並將任何缺失的資料從一個副本複製到另一個。與基於領導者的複製中的複製日誌不同，這個 **反熵程序** 不以任何特定順序複製寫入，並且在複製資料之前可能會有顯著的延遲。

#### 讀寫仲裁 {#sec_replication_quorum_condition}

在 [圖 6-12](#fig_replication_quorum_node_outage) 的例子中，即使寫入僅在三個副本中的兩個上處理，我們也認為寫入成功。如果三個副本中只有一個接受了寫入呢？我們能推多遠？

如果我們知道每次成功的寫入都保證至少存在於三個副本中的兩個上，這意味著最多一個副本可能是陳舊的。因此，如果我們從至少兩個副本讀取，我們可以確信兩個中至少有一個是最新的。如果第三個副本宕機或響應緩慢，讀取仍然可以繼續返回最新值。

更一般地說，如果有 *n* 個副本，每次寫入必須由 *w* 個節點確認才能被認為成功，並且我們必須為每次讀取查詢至少 *r* 個節點。（在我們的例子中，*n* = 3，*w* = 2，*r* = 2。）只要 *w* + *r* > *n*，我們在讀取時期望獲得最新值，因為我們讀取的 *r* 個節點中至少有一個必須是最新的。遵守這些 *r* 和 *w* 值的讀取和寫入稱為 **仲裁** 讀取和寫入 [^50]。你可以將 *r* 和 *w* 視為讀取或寫入有效所需的最小投票數。

在 Dynamo 風格的資料庫中，引數 *n*、*w* 和 *r* 通常是可配置的。常見的選擇是使 *n* 為奇數（通常為 3 或 5），並設定 *w* = *r* = (*n* + 1) / 2（向上舍入）。然而，你可以根據需要更改數字。例如，寫入很少而讀取很多的工作負載可能受益於設定 *w* = *n* 和 *r* = 1。這使讀取更快，但缺點是僅一個失敗的節點就會導致所有資料庫寫入失敗。

--------

> [!NOTE]
> 叢集中可能有超過 *n* 個節點，但任何給定值僅儲存在 *n* 個節點上。這允許資料集進行分片，支援比單個節點能容納的更大的資料集。我們將在 [第 7 章](/tw/ch7#ch_sharding) 中回到分片。

--------

仲裁條件 *w* + *r* > *n* 允許系統容忍不可用節點，如下所示：

* 如果 *w* < *n*，如果節點不可用，我們仍然可以處理寫入。
* 如果 *r* < *n*，如果節點不可用，我們仍然可以處理讀取。
* 使用 *n* = 3，*w* = 2，*r* = 2，我們可以容忍一個不可用節點，如 [圖 6-12](#fig_replication_quorum_node_outage) 中所示。
* 使用 *n* = 5，*w* = 3，*r* = 3，我們可以容忍兩個不可用節點。這種情況在 [圖 6-13](#fig_replication_quorum_overlap) 中說明。

通常，讀取和寫入總是並行傳送到所有 *n* 個副本。引數 *w* 和 *r* 確定我們等待多少個節點——即，在我們認為讀取或寫入成功之前，*n* 個節點中有多少個需要報告成功。

{{< figure src="/fig/ddia_0613.png" id="fig_replication_quorum_overlap" caption="圖 6-13. 如果 *w* + *r* > *n*，你讀取的 *r* 個副本中至少有一個必須看到最近的成功寫入。" class="w-full my-4" >}}


如果少於所需的 *w* 或 *r* 個節點可用，寫入或讀取將返回錯誤。節點可能因許多原因不可用：因為節點宕機（崩潰、斷電）、由於執行操作時出錯（無法寫入因為磁碟已滿）、由於客戶端和節點之間的網路中斷，或任何其他原因。我們只關心節點是否返回了成功響應，不需要區分不同型別的故障。

### 仲裁一致性的侷限 {#sec_replication_quorum_limitations}

如果你有 *n* 個副本，並且你選擇 *w* 和 *r* 使得 *w* + *r* > *n*，你通常可以期望每次讀取都返回為鍵寫入的最新值。這是因為你寫入的節點集和你讀取的節點集必須重疊。也就是說，在你讀取的節點中，必須至少有一個具有最新值的節點（如 [圖 6-13](#fig_replication_quorum_overlap) 所示）。

通常，*r* 和 *w* 被選擇為多數（超過 *n*/2）節點，因為這確保了 *w* + *r* > *n*，同時仍然容忍最多 *n*/2（向下舍入）個節點故障。但仲裁不一定是多數——重要的是讀取和寫入操作使用的節點集至少在一個節點中重疊。其他仲裁分配是可能的，這允許分散式演算法設計中的一些靈活性 [^51]。

你也可以將 *w* 和 *r* 設定為較小的數字，使得 *w* + *r* ≤ *n*（即，不滿足仲裁條件）。在這種情況下，讀取和寫入仍將傳送到 *n* 個節點，但需要較少的成功響應數才能使操作成功。

使用較小的 *w* 和 *r*，你更有可能讀取陳舊值，因為你的讀取更可能沒有包含具有最新值的節點。從好的方面來說，這種配置允許更低的延遲和更高的可用性：如果存在網路中斷並且許多副本變得無法訪問，你繼續處理讀取和寫入的機會更高。只有在可訪問副本的數量低於 *w* 或 *r* 之後，資料庫才分別變得無法寫入或讀取。

然而，即使使用 *w* + *r* > *n*，在某些邊緣情況下，一致性屬性可能會令人困惑。一些場景包括：

* 如果攜帶新值的節點失敗，並且其資料從攜帶舊值的副本恢復，儲存新值的副本數量可能低於 *w*，破壞仲裁條件。
* 在重新平衡正在進行時，其中一些資料從一個節點移動到另一個節點（見 [第 7 章](/tw/ch7#ch_sharding)），節點可能對哪些節點應該持有特定值的 *n* 個副本有不一致的檢視。這可能導致讀取和寫入仲裁不再重疊。
* 如果讀取與寫入操作併發，讀取可能會或可能不會看到併發寫入的值。特別是，一次讀取可能看到新值，而後續讀取看到舊值，正如我們將在 ["線性一致性與仲裁"](/tw/ch10#sec_consistency_quorum_linearizable) 中看到的。
* 如果寫入在某些副本上成功但在其他副本上失敗（例如，因為某些節點上的磁碟已滿），並且總體上在少於 *w* 個副本上成功，它不會在成功的副本上回滾。這意味著如果寫入被報告為失敗，後續讀取可能會或可能不會返回該寫入的值 [^52]。
* 如果資料庫使用即時時鐘的時間戳來確定哪個寫入更新（如 Cassandra 和 ScyllaDB 所做的），如果另一個具有更快時鐘的節點已寫入同一鍵，寫入可能會被靜默丟棄——我們之前在 ["最後寫入勝利（丟棄併發寫入）"](#sec_replication_lww) 中看到的問題。我們將在 ["依賴同步時鐘"](/tw/ch9#sec_distributed_clocks_relying) 中更詳細地討論這一點。
* 如果兩個寫入併發發生，其中一個可能首先在一個副本上處理，另一個可能首先在另一個副本上處理。這導致衝突，類似於我們在多主複製中看到的（見 ["處理寫入衝突"](#sec_replication_write_conflicts)）。我們將在 ["檢測併發寫入"](#sec_replication_concurrent) 中回到這個主題。

因此，儘管仲裁似乎保證讀取返回最新寫入的值，但實際上並不那麼簡單。Dynamo 風格的資料庫通常針對可以容忍最終一致性的用例進行了最佳化。引數 *w* 和 *r* 允許你調整讀取陳舊值的機率 [^53]，但明智的做法是不要將它們視為絕對保證。

#### 監控陳舊性 {#monitoring-staleness}

從操作角度來看，監控你的資料庫是否返回最新結果很重要。即使你的應用程式可以容忍陳舊讀取，你也需要了解複製的健康狀況。如果它明顯落後，它應該提醒你，以便你可以調查原因（例如，網路中的問題或過載的節點）。

對於基於領導者的複製，資料庫通常公開複製延遲的指標，你可以將其輸入到監控系統。這是可能的，因為寫入以相同的順序應用於領導者和追隨者，每個節點在複製日誌中都有一個位置（它在本地應用的寫入數）。透過從領導者的當前位置減去追隨者的當前位置，你可以測量複製延遲的量。

然而，在具有無主複製的系統中，沒有固定的寫入應用順序，這使得監控更加困難。副本為移交儲存的提示數量可以是系統健康的一個度量，但很難有用地解釋 [^54]。最終一致性是一個故意模糊的保證，但為了可操作性，能夠量化"最終"很重要。


### 單主與無主複製的效能 {#sec_replication_leaderless_perf}

基於單個領導者的複製系統可以提供在無主系統中難以或不可能實現的強一致性保證。然而，正如我們在 ["複製延遲的問題"](#sec_replication_lag) 中看到的，如果你在非同步更新的追隨者上進行讀取，基於領導者的複製系統中的讀取也可能返回陳舊值。

從領導者讀取確保最新響應，但它存在效能問題：

* 讀取吞吐量受領導者處理請求能力的限制（與讀擴充套件相反，讀擴充套件將讀取分佈在可能返回陳舊值的非同步更新副本上）。
* 如果領導者失敗，你必須等待檢測到故障，並在繼續處理請求之前完成故障轉移。即使故障轉移過程非常快，使用者也會因為臨時增加的響應時間而注意到它；如果故障轉移需要很長時間，系統在其持續時間內不可用。
* 系統對領導者上的效能問題非常敏感：如果領導者響應緩慢，例如由於過載或某些資源爭用，增加的響應時間也會立即影響使用者。

無主架構的一大優勢是它對此類問題更有彈性。因為沒有故障轉移，而且請求本來就是並行發往多個副本，所以某個副本變慢或不可用對響應時間影響較小：客戶端只需採用更快副本的響應即可。利用最快響應的做法稱為 **請求對沖**，它可以顯著降低尾部延遲 [^55]。

從根本上說，無主系統的彈性來自於它不區分正常情況和故障情況的事實。這在處理所謂的 **灰色故障** 時特別有用，其中節點沒有完全宕機，但以降級狀態執行，處理請求異常緩慢 [^56]，或者當節點只是過載時（例如，如果節點已離線一段時間，透過提示移交恢復可能會導致大量額外負載）。基於領導者的系統必須決定情況是否已經糟糕到需要進行故障轉移（這本身可能會導致進一步的中斷），而在無主系統中，這個問題甚至不會出現。

也就是說，無主系統也可能有效能問題：

* 即使系統不需要執行故障轉移，一個副本確實需要檢測另一個副本何時不可用，以便它可以儲存有關不可用副本錯過的寫入的提示。當不可用副本恢復時，移交過程需要向其傳送這些提示。這在系統已經處於壓力下時給副本帶來了額外的負載 [^54]。
* 你擁有的副本越多，你的仲裁就越大，在請求完成之前你必須等待的響應就越多。即使你只等待最快的 *r* 或 *w* 個副本響應，即使你並行發出請求，更大的 *r* 或 *w* 增加了你遇到慢副本的機會，增加了總體響應時間（見 ["響應時間指標的應用"](/tw/ch2#sec_introduction_slo_sla)）。
* 大規模網路中斷使客戶端與大量副本斷開連線，可能使形成仲裁變得不可能。一些無主資料庫提供了一個配置選項，允許任何可訪問的副本接受寫入，即使它不是該鍵的通常副本之一（Riak 和 Dynamo 稱之為 **寬鬆仲裁** [^45]；Cassandra 和 ScyllaDB 稱之為 **一致性級別 ANY**）。不能保證後續讀取會看到寫入的值，但根據應用程式，它可能仍然比寫入失敗更好。

多主複製可以提供比無主複製更大的網路中斷彈性，因為讀取和寫入只需要與一個領導者通訊，該領導者可以與客戶端位於同一位置。然而，由於一個領導者上的寫入非同步傳播到其他領導者，讀取可能任意過時。仲裁讀取和寫入提供了一種折衷：良好的容錯性，同時也有很高的可能性讀取最新資料。

#### 多地區操作 {#multi-region-operation}

我們之前討論了跨地區複製作為多主複製的用例（見 ["多主複製"](#sec_replication_multi_leader)）。無主複製也適合多地區操作，因為它被設計為容忍衝突的併發寫入、網路中斷和延遲峰值。

Cassandra 和 ScyllaDB 在正常的無主模型中實現了它們的多地區支援：客戶端直接將其寫入傳送到所有地區的副本，你可以從各種一致性級別中進行選擇，這些級別確定請求成功所需的響應數。例如，你可以請求所有地區中副本的仲裁、每個地區中的單獨仲裁，或僅客戶端本地地區的仲裁。本地仲裁避免了必須等待到其他地區的緩慢請求，但它也更可能返回陳舊結果。

Riak 將客戶端和資料庫節點之間的所有通訊保持在一個地區本地，因此 *n* 描述了一個地區內的副本數。資料庫叢集之間的跨地區複製在後臺非同步發生，其風格類似於多主複製。


### 檢測併發寫入 {#sec_replication_concurrent}

與多主複製一樣，無主資料庫允許對同一鍵進行併發寫入，導致需要解決的衝突。此類衝突可能在寫入發生時發生，但並非總是如此：它們也可能在讀修復、提示移交或反熵期間稍後檢測到。

問題在於，由於可變的網路延遲和部分故障，事件可能以不同的順序到達不同的節點。例如，[圖 6-14](#fig_replication_concurrency) 顯示了兩個客戶端 A 和 B 同時寫入三節點資料儲存中的鍵 *X*：

* 節點 1 接收來自 A 的寫入，但由於瞬時中斷從未接收來自 B 的寫入。
* 節點 2 首先接收來自 A 的寫入，然後接收來自 B 的寫入。
* 節點 3 首先接收來自 B 的寫入，然後接收來自 A 的寫入。

{{< figure src="/fig/ddia_0614.png" id="fig_replication_concurrency" caption="圖 6-14. Dynamo 風格資料儲存中的併發寫入：沒有明確定義的順序。" class="w-full my-4" >}}

如果每個節點在接收到來自客戶端的寫入請求時只是覆蓋鍵的值，節點將變得永久不一致，如 [圖 6-14](#fig_replication_concurrency) 中的最終 *get* 請求所示：節點 2 認為 *X* 的最終值是 B，而其他節點認為值是 A。

為了最終保持一致，副本應該收斂到相同的值。為此，我們可以使用我們之前在 ["處理寫入衝突"](#sec_replication_write_conflicts) 中討論的任何衝突解決機制，例如最後寫入勝利（由 Cassandra 和 ScyllaDB 使用）、手動解決或 CRDT（在 ["CRDT 與操作變換"](#sec_replication_crdts) 中描述，並由 Riak 使用）。

最後寫入勝利很容易實現：每個寫入都標有時間戳，具有更高時間戳的值總是覆蓋具有較低時間戳的值。然而，時間戳不會告訴你兩個值是否實際上衝突（即，它們是併發寫入的）或不衝突（它們是一個接一個寫入的）。如果你想顯式解決衝突，系統需要更加小心地檢測併發寫入。

#### "先發生"關係與併發 {#sec_replication_happens_before}

我們如何決定兩個操作是否併發？為了培養直覺，讓我們看一些例子：

* 在 [圖 6-8](#fig_replication_causality) 中，兩個寫入不是併發的：A 的插入 **先發生於** B 的遞增，因為 B 遞增的值是 A 插入的值。換句話說，B 的操作建立在 A 的操作之上，所以 B 的操作必須稍後發生。我們也說 B **因果依賴** 於 A。
* 另一方面，[圖 6-14](#fig_replication_concurrency) 中的兩個寫入是併發的：當每個客戶端開始操作時，它不知道另一個客戶端也在對同一鍵執行操作。因此，操作之間沒有因果依賴關係。

如果操作 B 知道 A，或依賴於 A，或以某種方式建立在 A 之上，則操作 A **先發生於** 另一個操作 B。一個操作是否先發生於另一個操作是定義併發含義的關鍵。事實上，我們可以簡單地說，如果兩個操作都不先發生於另一個（即，兩者都不知道另一個），則它們是 **併發的** [^57]。

因此，每當你有兩個操作 A 和 B 時，有三種可能性：要麼 A 先發生於 B，要麼 B 先發生於 A，要麼 A 和 B 是併發的。我們需要的是一個演算法來告訴我們兩個操作是否併發。如果一個操作先發生於另一個，後面的操作應該覆蓋前面的操作，但如果操作是併發的，我們有一個需要解決的衝突。

--------

> [!TIP] 併發、時間和相對論
>
> 似乎兩個操作如果"同時"發生，應該稱為併發——但實際上，它們是否真的在時間上重疊並不重要。由於分散式系統中的時鐘問題，實際上很難判斷兩件事是否恰好在同一時間發生——我們將在 [第 9 章](/tw/ch9#ch_distributed) 中更詳細地討論這個問題。
>
> 為了定義併發，確切的時間並不重要：我們只是稱兩個操作併發，如果它們都不知道對方，無論它們發生的物理時間如何。人們有時將這一原則與物理學中的狹義相對論聯絡起來 [^57]，它引入了資訊不能比光速傳播更快的想法。因此，如果兩個事件之間的時間短於光在它們之間傳播的時間，那麼相隔一定距離發生的兩個事件不可能相互影響。
>
> 在計算機系統中，即使光速原則上允許一個操作影響另一個，兩個操作也可能是併發的。例如，如果網路在當時很慢或中斷，兩個操作可以相隔一段時間發生，仍然是併發的，因為網路問題阻止了一個操作能夠知道另一個。

--------

#### 捕獲先發生關係 {#capturing-the-happens-before-relationship}

讓我們看一個確定兩個操作是否併發或一個先發生於另一個的演算法。為了簡單起見，讓我們從只有一個副本的資料庫開始。一旦我們弄清楚如何在單個副本上執行此操作，我們就可以將該方法推廣到具有多個副本的無主資料庫。

[圖 6-15](#fig_replication_causality_single) 顯示了兩個客戶端併發地向同一購物車新增專案。（如果這個例子讓你覺得太無聊，想象一下兩個空中交通管制員併發地向他們正在跟蹤的扇區新增飛機。）最初，購物車是空的。兩個客戶端總共向資料庫發起了五次寫入：

1. 客戶端 1 將 `milk` 新增到購物車。這是對該鍵的第一次寫入，因此伺服器成功儲存它並為其分配版本 1。伺服器還將值連同版本號一起回顯給客戶端。
2. 客戶端 2 將 `eggs` 新增到購物車，不知道客戶端 1 併發地添加了 `milk`（客戶端 2 認為它的 `eggs` 是購物車中的唯一專案）。伺服器為此寫入分配版本 2，並將 `eggs` 和 `milk` 儲存為兩個單獨的值（兄弟）。然後，它將 **兩個** 值連同版本號 2 一起返回給客戶端。
3. 客戶端 1，不知道客戶端 2 的寫入，想要將 `flour` 新增到購物車，因此它認為當前購物車內容應該是 `[milk, flour]`。它將此值連同伺服器之前給客戶端 1 的版本號 1 一起傳送到伺服器。伺服器可以從版本號判斷 `[milk, flour]` 的寫入取代了 `[milk]` 的先前值，但它與 `[eggs]` 併發。因此，伺服器將版本 3 分配給 `[milk, flour]`，覆蓋版本 1 值 `[milk]`，但保留版本 2 值 `[eggs]` 並將兩個剩餘值返回給客戶端。
4. 同時，客戶端 2 想要將 `ham` 新增到購物車，不知道客戶端 1 剛剛添加了 `flour`。客戶端 2 在上次響應中從伺服器接收了兩個值 `[milk]` 和 `[eggs]`，因此客戶端現在合併這些值並新增 `ham` 以形成新值 `[eggs, milk, ham]`。它將該值連同先前的版本號 2 一起傳送到伺服器。伺服器檢測到版本 2 覆蓋 `[eggs]` 但與 `[milk, flour]` 併發，因此兩個剩餘值是版本 3 的 `[milk, flour]` 和版本 4 的 `[eggs, milk, ham]`。
5. 最後，客戶端 1 想要新增 `bacon`。它之前從伺服器接收了版本 3 的 `[milk, flour]` 和 `[eggs]`，因此它合併這些，新增 `bacon`，並將最終值 `[milk, flour, eggs, bacon]` 連同版本號 3 一起傳送到伺服器。這覆蓋了 `[milk, flour]`（注意 `[eggs]` 已經在上一步中被覆蓋）但與 `[eggs, milk, ham]` 併發，因此伺服器保留這兩個併發值。

{{< figure src="/fig/ddia_0615.png" id="fig_replication_causality_single" caption="圖 6-15. 捕獲兩個客戶端併發編輯購物車之間的因果依賴關係。" class="w-full my-4" >}}


[圖 6-15](#fig_replication_causality_single) 中操作之間的資料流在 [圖 6-16](#fig_replication_causal_dependencies) 中以圖形方式說明。箭頭指示哪個操作 **先發生於** 哪個其他操作，即後面的操作 **知道** 或 **依賴於** 前面的操作。在這個例子中，客戶端從未完全瞭解伺服器上的資料，因為總是有另一個併發進行的操作。但是值的舊版本最終會被覆蓋，並且不會丟失任何寫入。

{{< figure link="#fig_replication_causality_single" src="/fig/ddia_0616.png" id="fig_replication_causal_dependencies" caption="圖 6-16. 圖 6-15 中因果依賴關係的圖。" class="w-full my-4" >}}


請注意，伺服器可以透過檢視版本號來確定兩個操作是否併發——它不需要解釋值本身（因此值可以是任何資料結構）。演算法的工作原理如下：

* 伺服器為每個鍵維護一個版本號，每次寫入該鍵時遞增版本號，並將新版本號與寫入的值一起儲存。
* 當客戶端讀取鍵時，伺服器返回所有兄弟，即所有未被覆蓋的值，以及最新的版本號。客戶端必須在寫入之前讀取鍵。
* 當客戶端寫入鍵時，它必須包含來自先前讀取的版本號，並且必須合併它在先前讀取中收到的所有值，例如使用 CRDT 或透過詢問使用者。寫入請求的響應就像讀取一樣，返回所有兄弟，這允許我們像購物車示例中那樣連結多個寫入。
* 當伺服器接收到具有特定版本號的寫入時，它可以覆蓋具有該版本號或更低版本號的所有值（因為它知道它們已合併到新值中），但它必須保留具有更高版本號的所有值（因為這些值與傳入寫入併發）。

當寫入包含來自先前讀取的版本號時，這告訴我們寫入基於哪個先前狀態。如果你在不包含版本號的情況下進行寫入，它與所有其他寫入併發，因此它不會覆蓋任何內容——它只會作為後續讀取的值之一返回。

#### 版本向量 {#version-vectors}

[圖 6-15](#fig_replication_causality_single) 中的示例只使用了單個副本。當存在多個副本、且沒有領導者時，演算法如何變化？

[圖 6-15](#fig_replication_causality_single) 使用單個版本號來捕獲操作間依賴關係，但當多個副本併發接受寫入時，這還不夠。我們需要為 **每個副本**、每個鍵分別維護版本號。每個副本在處理寫入時遞增自己的版本號，並追蹤從其他副本看到的版本號。這些資訊決定了哪些值該被覆蓋，哪些值要作為兄弟保留。

來自所有副本的版本號集合稱為 **版本向量** [^58]。這一思想有若干變體，其中較有代表性的是 **點版本向量** [^59] [^60]，Riak 2.0 使用了它 [^61] [^62]。這裡不展開細節，它的工作方式與前面的購物車示例非常相似。

和 [圖 6-15](#fig_replication_causality_single) 裡的版本號一樣，版本向量會在讀取時由資料庫副本返回給客戶端，並在後續寫入時再由客戶端帶回資料庫。（Riak 把版本向量編碼成一個字串，稱為 **因果上下文**。）版本向量讓資料庫能夠區分“覆蓋寫入”和“併發寫入”。

版本向量還保證了“從一個副本讀取，再寫回另一個副本”是安全的。這樣做可能會產生兄弟，但只要正確合併兄弟，就不會丟失資料。

--------

> [!TIP] 版本向量和向量時鐘
>
> **版本向量** 有時也稱為 **向量時鐘**，儘管它們不完全相同。差異很微妙——請參閱參考資料以獲取詳細資訊 [^60] [^63] [^64]。簡而言之，在比較副本狀態時，版本向量是要使用的正確資料結構。

--------

## 總結 {#summary}

在本章中，我們研究了複製問題。複製可以服務於多種目的：

**高可用性**
:   即使一臺機器（或幾臺機器、一個區域，甚至整個地區）宕機，也能保持系統執行

**斷開操作**
:   允許應用程式在網路中斷時繼續工作

**延遲**
:   將資料在地理上放置在靠近使用者的位置，以便使用者可以更快地與其互動

**可伸縮性**
:   透過在副本上執行讀取，能夠處理比單臺機器能夠處理的更高的讀取量

儘管目標很簡單——在幾臺機器上保留相同資料的副本——複製卻是一個非常棘手的問題。它需要仔細考慮併發性以及所有可能出錯的事情，並處理這些故障的後果。至少，我們需要處理不可用的節點和網路中斷（這甚至還沒有考慮更隱蔽的故障型別，例如由於軟體錯誤或硬體錯誤導致的靜默資料損壞）。

我們討論了三種主要的複製方法：

**單主複製**
:   客戶端將所有寫入傳送到單個節點（領導者），該節點將資料變更事件流傳送到其他副本（追隨者）。讀取可以在任何副本上執行，但從追隨者讀取可能是陳舊的。

**多主複製**
:   客戶端將每個寫入傳送到幾個領導者之一，任何領導者都可以接受寫入。領導者相互傳送資料變更事件流，並傳送到任何追隨者。

**無主複製**
:   客戶端將每個寫入傳送到多個節點，並行從多個節點讀取，以檢測和糾正具有陳舊資料的節點。

每種方法都有優缺點。單主複製很受歡迎，因為它相當容易理解，並且提供強一致性。多主和無主複製在存在故障節點、網路中斷和延遲峰值時可以更加健壯——代價是需要衝突解決並提供較弱的一致性保證。

複製可以是同步的或非同步的，這對系統在出現故障時的行為有深遠的影響。儘管非同步複製在系統平穩執行時可能很快，但重要的是要弄清楚當複製延遲增加和伺服器失敗時會發生什麼。如果領導者失敗並且你將非同步更新的追隨者提升為新的領導者，最近提交的資料可能會丟失。

我們研究了複製延遲可能導致的一些奇怪效果，並討論了一些有助於決定應用程式在複製延遲下應如何表現的一致性模型：

**寫後讀一致性**
:   使用者應該始終看到他們自己提交的資料。

**單調讀**
:   使用者在某個時間點看到資料之後，不應該再看到更早時間點的資料。

**一致字首讀**
:   使用者看到的資料應處於符合因果關係的狀態：例如，按正確順序看到問題及其回覆。

最後，我們討論了多主和無主複製如何確保所有副本最終收斂到一致狀態：透過使用版本向量或類似演算法來檢測哪些寫入是併發的，並透過使用衝突解決演算法（如 CRDT）來合併併發寫入的值。最後寫入勝利和手動衝突解決也是可能的。

本章假設每個副本都儲存整個資料庫的完整副本，這對於大型資料集是不現實的。在下一章中，我們將研究 **分片**，它允許每臺機器只儲存資料的子集。


### 參考

[^1]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD)
[^2]: Kenny Gryp. [MySQL Terminology Updates](https://dev.mysql.com/blog-archive/mysql-terminology-updates/). *dev.mysql.com*, July 2020. Archived at [perma.cc/S62G-6RJ2](https://perma.cc/S62G-6RJ2)
[^3]: Oracle Corporation. [Oracle (Active) Data Guard 19c: Real-Time Data Protection and Availability](https://www.oracle.com/technetwork/database/availability/dg-adg-technical-overview-wp-5347548.pdf). White Paper, *oracle.com*, March 2019. Archived at [perma.cc/P5ST-RPKE](https://perma.cc/P5ST-RPKE)
[^4]: Microsoft. [What is an Always On availability group?](https://learn.microsoft.com/en-us/sql/database-engine/availability-groups/windows/overview-of-always-on-availability-groups-sql-server) *learn.microsoft.com*, September 2024. Archived at [perma.cc/ABH6-3MXF](https://perma.cc/ABH6-3MXF)
[^5]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022.
[^6]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/abs/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134)
[^7]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535)
[^8]: Mallory Knodel and Niels ten Oever. [Terminology, Power, and Inclusive Language in Internet-Drafts and RFCs](https://www.ietf.org/archive/id/draft-knodel-terminology-14.html). *IETF Internet-Draft*, August 2023. Archived at [perma.cc/5ZY9-725E](https://perma.cc/5ZY9-725E)
[^9]: Buck Hodges. [Postmortem: VSTS 4 September 2018](https://devblogs.microsoft.com/devopsservice/?p=17485). *devblogs.microsoft.com*, September 2018. Archived at [perma.cc/ZF5R-DYZS](https://perma.cc/ZF5R-DYZS)
[^10]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y)
[^11]: Vignesh Chandramohan, Rohan Desai, and Chris Riccomini. [SlateDB Manifest Design](https://github.com/slatedb/slatedb/blob/main/rfcs/0001-manifest.md). *github.com*, May 2024. Archived at [perma.cc/8EUY-P32Z](https://perma.cc/8EUY-P32Z)
[^12]: Stas Kelvich. [Why does Neon use Paxos instead of Raft, and what’s the difference?](https://neon.tech/blog/paxos) *neon.tech*, August 2022. Archived at [perma.cc/SEZ4-2GXU](https://perma.cc/SEZ4-2GXU)
[^13]: Dimitri Fontaine. [An introduction to the pg\_auto\_failover project](https://tapoueh.org/blog/2021/11/an-introduction-to-the-pg_auto_failover-project/). *tapoueh.org*, November 2021. Archived at [perma.cc/3WH5-6BAF](https://perma.cc/3WH5-6BAF)
[^14]: Jesse Newland. [GitHub availability this week](https://github.blog/news-insights/the-library/github-availability-this-week/). *github.blog*, September 2012. Archived at [perma.cc/3YRF-FTFJ](https://perma.cc/3YRF-FTFJ)
[^15]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ)
[^16]: John Hugg. [‘All In’ with Determinism for Performance and Testing in Distributed Systems](https://www.youtube.com/watch?v=gJRj3vJL4wE). At *Strange Loop*, September 2015.
[^17]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^18]: Amit Kapila. [WAL Internals of PostgreSQL](https://www.pgcon.org/2012/schedule/attachments/258_212_Internals%20Of%20PostgreSQL%20Wal.pdf). At *PostgreSQL Conference* (PGCon), May 2012. Archived at [perma.cc/6225-3SUX](https://perma.cc/6225-3SUX)
[^19]: Amit Kapila. [Evolution of Logical Replication](https://amitkapila16.blogspot.com/2023/09/evolution-of-logical-replication.html). *amitkapila16.blogspot.com*, September 2023. Archived at [perma.cc/F9VX-JLER](https://perma.cc/F9VX-JLER)
[^20]: Aru Petchimuthu. [Upgrade your Amazon RDS for PostgreSQL or Amazon Aurora PostgreSQL database, Part 2: Using the pglogical extension](https://aws.amazon.com/blogs/database/part-2-upgrade-your-amazon-rds-for-postgresql-database-using-the-pglogical-extension/). *aws.amazon.com*, August 2021. Archived at [perma.cc/RXT8-FS2T](https://perma.cc/RXT8-FS2T)
[^21]: Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, David Callies, Abhishek Choudhary, Laurent Demailly, Thomas Fersch, Liat Atsmon Guz, Andrzej Kotulski, Sachin Kulkarni, Sanjeev Kumar, Harry Li, Jun Li, Evgeniy Makeev, Kowshik Prakasam, Robbert van Renesse, Sabyasachi Roy, Pratyush Seth, Yee Jiun Song, Benjamin Wester, Kaushik Veeraraghavan, and Peter Xie. [Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
[^22]: Douglas B. Terry. [Replicated Data Consistency Explained Through Baseball](https://www.microsoft.com/en-us/research/publication/replicated-data-consistency-explained-through-baseball/). Microsoft Research, Technical Report MSR-TR-2011-137, October 2011. Archived at [perma.cc/F4KZ-AR38](https://perma.cc/F4KZ-AR38)
[^23]: Douglas B. Terry, Alan J. Demers, Karin Petersen, Mike J. Spreitzer, Marvin M. Theimer, and Brent B. Welch. [Session Guarantees for Weakly Consistent Replicated Data](https://csis.pace.edu/~marchese/CS865/Papers/SessionGuaranteesPDIS.pdf). At *3rd International Conference on Parallel and Distributed Information Systems* (PDIS), September 1994. [doi:10.1109/PDIS.1994.331722](https://doi.org/10.1109/PDIS.1994.331722)
[^24]: Werner Vogels. [Eventually Consistent](https://queue.acm.org/detail.cfm?id=1466448). *ACM Queue*, volume 6, issue 6, pages 14–19, October 2008. [doi:10.1145/1466443.1466448](https://doi.org/10.1145/1466443.1466448)
[^25]: Simon Willison. [Reply to: “My thoughts about Fly.io (so far) and other newish technology I’m getting into”](https://news.ycombinator.com/item?id=31434055). *news.ycombinator.com*, May 2022. Archived at [perma.cc/ZRV4-WWV8](https://perma.cc/ZRV4-WWV8)
[^26]: Nithin Tharakan. [Scaling Bitbucket’s Database](https://www.atlassian.com/blog/bitbucket/scaling-bitbuckets-database). *atlassian.com*, October 2020. Archived at [perma.cc/JAB7-9FGX](https://perma.cc/JAB7-9FGX)
[^27]: Terry Pratchett. *Reaper Man: A Discworld Novel*. Victor Gollancz, 1991. ISBN: 978-0-575-04979-6
[^28]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509)
[^29]: Yaser Raja and Peter Celentano. [PostgreSQL bi-directional replication using pglogical](https://aws.amazon.com/blogs/database/postgresql-bi-directional-replication-using-pglogical/). *aws.amazon.com*, January 2022. Archived at [perma.cc/BUQ2-5QWN](https://perma.cc/BUQ2-5QWN)
[^30]: Robert Hodges. [If You \*Must\* Deploy Multi-Master Replication, Read This First](https://scale-out-blog.blogspot.com/2012/04/if-you-must-deploy-multi-master.html). *scale-out-blog.blogspot.com*, April 2012. Archived at [perma.cc/C2JN-F6Y8](https://perma.cc/C2JN-F6Y8)
[^31]: Lars Hofhansl. [HBASE-7709: Infinite Loop Possible in Master/Master Replication](https://issues.apache.org/jira/browse/HBASE-7709). *issues.apache.org*, January 2013. Archived at [perma.cc/24G2-8NLC](https://perma.cc/24G2-8NLC)
[^32]: John Day-Richter. [What’s Different About the New Google Docs: Making Collaboration Fast](https://drive.googleblog.com/2010/09/whats-different-about-new-google-docs.html). *drive.googleblog.com*, September 2010. Archived at [perma.cc/5TL8-TSJ2](https://perma.cc/5TL8-TSJ2)
[^33]: Evan Wallace. [How Figma’s multiplayer technology works](https://www.figma.com/blog/how-figmas-multiplayer-technology-works/). *figma.com*, October 2019. Archived at [perma.cc/L49H-LY4D](https://perma.cc/L49H-LY4D)
[^34]: Tuomas Artman. [Scaling the Linear Sync Engine](https://linear.app/blog/scaling-the-linear-sync-engine). *linear.app*, June 2023.
[^35]: Amr Saafan. [Why Sync Engines Might Be the Future of Web Applications](https://www.nilebits.com/blog/2024/09/sync-engines-future-web-applications/). *nilebits.com*, September 2024. Archived at [perma.cc/5N73-5M3V](https://perma.cc/5N73-5M3V)
[^36]: Isaac Hagoel. [Are Sync Engines The Future of Web Applications?](https://dev.to/isaachagoel/are-sync-engines-the-future-of-web-applications-1bbi) *dev.to*, July 2024. Archived at [perma.cc/R9HF-BKKL](https://perma.cc/R9HF-BKKL)
[^37]: Sujay Jayakar. [A Map of Sync](https://stack.convex.dev/a-map-of-sync). *stack.convex.dev*, October 2024. Archived at [perma.cc/82R3-H42A](https://perma.cc/82R3-H42A)
[^38]: Alex Feyerke. [Designing Offline-First Web Apps](https://alistapart.com/article/offline-first/). *alistapart.com*, December 2013. Archived at [perma.cc/WH7R-S2DS](https://perma.cc/WH7R-S2DS)
[^39]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: You own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019, pages 154–178. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^40]: Martin Kleppmann. [The past, present, and future of local-first](https://martin.kleppmann.com/2024/05/30/local-first-conference.html). At *Local-First Conference*, May 2024.
[^41]: Conrad Hofmeyr. [API Calling is to Sync Engines as jQuery is to React](https://www.powersync.com/blog/api-calling-is-to-sync-engines-as-jquery-is-to-react). *powersync.com*, November 2024. Archived at [perma.cc/2FP9-7WJJ](https://perma.cc/2FP9-7WJJ)
[^42]: Peter van Hardenberg and Martin Kleppmann. [PushPin: Towards Production-Quality Peer-to-Peer Collaboration](https://martin.kleppmann.com/papers/pushpin-papoc20.pdf). At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393683](https://doi.org/10.1145/3380787.3393683)
[^43]: Leonard Kawell, Jr., Steven Beckhardt, Timothy Halvorsen, Raymond Ozzie, and Irene Greif. [Replicated document management in a group communication system](https://dl.acm.org/doi/pdf/10.1145/62266.1024798). At *ACM Conference on Computer-Supported Cooperative Work* (CSCW), September 1988. [doi:10.1145/62266.1024798](https://doi.org/10.1145/62266.1024798)
[^44]: Ricky Pusch. [Explaining how fighting games use delay-based and rollback netcode](https://words.infil.net/w02-netcode.html). *words.infil.net* and *arstechnica.com*, October 2019. Archived at [perma.cc/DE7W-RDJ8](https://perma.cc/DE7W-RDJ8)
[^45]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon’s Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1323293.1294281](https://doi.org/10.1145/1323293.1294281)
[^46]: Marc Shapiro, Nuno Preguiça, Carlos Baquero, and Marek Zawirski. [A Comprehensive Study of Convergent and Commutative Replicated Data Types](https://inria.hal.science/inria-00555588v1/document). INRIA Research Report no. 7506, January 2011.
[^47]: Chengzheng Sun and Clarence Ellis. [Operational Transformation in Real-Time Group Editors: Issues, Algorithms, and Achievements](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=aef660812c5a9c4d3f06775f9455eeb090a4ff0f). At *ACM Conference on Computer Supported Cooperative Work* (CSCW), November 1998. [doi:10.1145/289444.289469](https://doi.org/10.1145/289444.289469)
[^48]: Joseph Gentle and Martin Kleppmann. [Collaborative Text Editing with Eg-walker: Better, Faster, Smaller](https://arxiv.org/abs/2409.14252). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696076](https://doi.org/10.1145/3689031.3696076)
[^49]: Dharma Shukla. [Azure Cosmos DB: Pushing the frontier of globally distributed databases](https://azure.microsoft.com/en-us/blog/azure-cosmos-db-pushing-the-frontier-of-globally-distributed-databases/). *azure.microsoft.com*, September 2018. Archived at [perma.cc/UT3B-HH6R](https://perma.cc/UT3B-HH6R)
[^50]: David K. Gifford. [Weighted Voting for Replicated Data](https://www.cs.cmu.edu/~15-749/READINGS/required/availability/gifford79.pdf). At *7th ACM Symposium on Operating Systems Principles* (SOSP), December 1979. [doi:10.1145/800215.806583](https://doi.org/10.1145/800215.806583)
[^51]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.OPODIS.2016.25). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25)
[^52]: Joseph Blomstedt. [Bringing Consistency to Riak](https://vimeo.com/51973001). At *RICON West*, October 2012.
[^53]: Peter Bailis, Shivaram Venkataraman, Michael J. Franklin, Joseph M. Hellerstein, and Ion Stoica. [Quantifying eventual consistency with PBS](http://www.bailis.org/papers/pbs-vldbj2014.pdf). *The VLDB Journal*, volume 23, pages 279–302, April 2014. [doi:10.1007/s00778-013-0330-1](https://doi.org/10.1007/s00778-013-0330-1)
[^54]: Colin Breck. [Shared-Nothing Architectures for Server Replication and Synchronization](https://blog.colinbreck.com/shared-nothing-architectures-for-server-replication-and-synchronization/). *blog.colinbreck.com*, December 2019. Archived at [perma.cc/48P3-J6CJ](https://perma.cc/48P3-J6CJ)
[^55]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794)
[^56]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005)
[^57]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563)
[^58]: D. Stott Parker Jr., Gerald J. Popek, Gerard Rudisin, Allen Stoughton, Bruce J. Walker, Evelyn Walton, Johanna M. Chow, David Edwards, Stephen Kiser, and Charles Kline. [Detection of Mutual Inconsistency in Distributed Systems](https://pages.cs.wisc.edu/~remzi/Classes/739/Papers/parker83detection.pdf). *IEEE Transactions on Software Engineering*, volume SE-9, issue 3, pages 240–247, May 1983. [doi:10.1109/TSE.1983.236733](https://doi.org/10.1109/TSE.1983.236733)
[^59]: Nuno Preguiça, Carlos Baquero, Paulo Sérgio Almeida, Victor Fonte, and Ricardo Gonçalves. [Dotted Version Vectors: Logical Clocks for Optimistic Replication](https://arxiv.org/abs/1011.5808). arXiv:1011.5808, November 2010.
[^60]: Giridhar Manepalli. [Clocks and Causality - Ordering Events in Distributed Systems](https://www.exhypothesi.com/clocks-and-causality/). *exhypothesi.com*, November 2022. Archived at [perma.cc/8REU-KVLQ](https://perma.cc/8REU-KVLQ)
[^61]: Sean Cribbs. [A Brief History of Time in Riak](https://speakerdeck.com/seancribbs/a-brief-history-of-time-in-riak). At *RICON*, October 2014. Archived at [perma.cc/7U9P-6JFX](https://perma.cc/7U9P-6JFX)
[^62]: Russell Brown. [Vector Clocks Revisited Part 2: Dotted Version Vectors](https://riak.com/posts/technical/vector-clocks-revisited-part-2-dotted-version-vectors/). *riak.com*, November 2015. Archived at [perma.cc/96QP-W98R](https://perma.cc/96QP-W98R)
[^63]: Carlos Baquero. [Version Vectors Are Not Vector Clocks](https://haslab.wordpress.com/2011/07/08/version-vectors-are-not-vector-clocks/). *haslab.wordpress.com*, July 2011. Archived at [perma.cc/7PNU-4AMG](https://perma.cc/7PNU-4AMG)
[^64]: Reinhard Schwarz and Friedemann Mattern. [Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](https://disco.ethz.ch/courses/hs08/seminar/papers/mattern4.pdf). *Distributed Computing*, volume 7, issue 3, pages 149–174, March 1994. [doi:10.1007/BF02277859](https://doi.org/10.1007/BF02277859)

================================================
FILE: content/tw/ch7.md
================================================
---
title: "7. 分片"
weight: 207
breadcrumbs: false
---

<a id="ch_sharding"></a>

![](/map/ch06.png)

> *顯然，我們必須跳出順序計算機指令的窠臼。我們必須敘述定義、提供優先順序和資料描述。我們必須敘述關係，而不是過程。*
>
> Grace Murray Hopper，《未來的計算機及其管理》（1962）

分散式資料庫通常透過兩種方式在節點間分佈資料：

1. 在多個節點上儲存相同資料的副本：這是 *複製*，我們在 [第 6 章](/tw/ch6#ch_replication) 中討論過。
2. 如果我們不想讓每個節點都儲存所有資料，我們可以將大量資料分割成更小的 *分片（shards）* 或 *分割槽（partitions）*，並將不同的分片儲存在不同的節點上。我們將在本章討論分片。

通常，分片的定義方式使得每條資料（每條記錄、行或文件）恰好屬於一個分片。有多種方法可以實現這一點，我們將在本章深入討論。實際上，每個分片本身就是一個小型資料庫，儘管某些資料庫系統支援同時涉及多個分片的操作。

分片通常與複製結合使用，以便每個分片的副本儲存在多個節點上。這意味著，即使每條記錄屬於恰好一個分片，它仍然可以儲存在多個不同的節點上以提供容錯能力。

一個節點可能儲存多個分片。例如，如果使用單領導者複製模型，分片與複製的組合可能如 [圖 7-1](#fig_sharding_replicas) 所示。每個分片的領導者被分配到一個節點，追隨者被分配到其他節點。每個節點可能是某些分片的領導者，同時又是其他分片的追隨者，但每個分片仍然只有一個領導者。

{{< figure src="/fig/ddia_0701.png" id="fig_sharding_replicas" caption="圖 7-1. 複製與分片結合使用：每個節點對某些分片充當領導者，對另一些分片充當追隨者。" class="w-full my-4" >}}

我們在 [第 6 章](/tw/ch6#ch_replication) 中討論的關於資料庫複製的所有內容同樣適用於分片的複製。由於分片方案的選擇大部分獨立於複製方案的選擇，為了簡單起見，我們將在本章中忽略複製。

--------

> [!TIP] 分片和分割槽
>
> 在本章中我們稱之為 *分片* 的東西，根據你使用的軟體不同有許多不同的名稱：在 Kafka 中稱為 *分割槽（partition）*，在 CockroachDB 中稱為 *範圍（range）*，在 HBase 和 TiDB 中稱為 *區域（region）*，在 Bigtable 和 YugabyteDB 中稱為 *表塊（tablet）*，在 Cassandra、ScyllaDB 和 Riak 中稱為 *虛節點（vnode）*，在 Couchbase 中稱為 *虛桶（vBucket）*，僅舉幾例。
>
> 一些資料庫將分割槽和分片視為兩個不同的概念。例如，在 PostgreSQL 中，分割槽是將大表拆分為儲存在同一臺機器上的多個檔案的方法（這有幾個優點，例如可以非常快速地刪除整個分割槽），而分片則是將資料集拆分到多臺機器上 [^1] [^2]。在許多其他系統中，分割槽只是分片的另一個詞。
>
> 雖然 *分割槽* 相當具有描述性，但 *分片* 這個術語可能令人驚訝。根據一種理論，該術語源於線上角色扮演遊戲《網路創世紀》（Ultima Online），其中一塊魔法水晶被打碎成碎片，每個碎片都折射出遊戲世界的副本 [^3]。*分片* 一詞因此用來指一組並行遊戲伺服器中的一個，後來被引入資料庫。另一種理論是 *分片* 最初是 *高可用複製資料系統*（System for Highly Available Replicated Data）的縮寫——據說是 1980 年代的一個資料庫，其細節已經失傳。
>
> 順便說一下，分割槽與 *網路分割槽*（netsplits）無關，後者是節點之間網路中的一種故障。我們將在 [第 9 章](/tw/ch9#ch_distributed) 中討論此類故障。

--------

## 分片的利與弊 {#sec_sharding_reasons}

對資料庫進行分片的主要原因是 *可伸縮性*：如果資料量或寫吞吐量已經超出單個節點的處理能力，這是一個解決方案，它允許你將資料和寫入分散到多個節點上。（如果讀吞吐量是問題，你不一定需要分片——你可以使用 [第 6 章](/tw/ch6#ch_replication) 中討論的 *讀擴充套件*。）

事實上，分片是我們實現 *水平擴充套件*（*橫向擴充套件* 架構）的主要工具之一，如 ["共享記憶體、共享磁碟和無共享架構"](/tw/ch2#sec_introduction_shared_nothing) 中所討論的：即，允許系統透過新增更多（較小的）機器而不是轉移到更大的機器來增長其容量。如果你可以劃分工作負載，使每個分片處理大致相等的份額，那麼你可以將這些分片分配給不同的機器，以便並行處理它們的資料和查詢。

雖然複製在小規模和大規模上都很有用，因為它支援容錯和離線操作，但分片是一個重量級解決方案，主要在大規模場景下才有意義。如果你的資料量和寫吞吐量可以在單臺機器上處理（而單臺機器現在可以做很多事情！），通常最好避免分片並堅持使用單分片資料庫。

推薦這樣做的原因是分片通常會增加複雜性：你通常必須透過選擇 *分割槽鍵* 來決定將哪些記錄放在哪個分片中；具有相同分割槽鍵的所有記錄都放在同一個分片中 [^4]。這個選擇很重要，因為如果你知道記錄在哪個分片中，訪問記錄會很快，但如果你不知道分片，你必須在所有分片中進行低效的搜尋，而且分片方案很難更改。

因此，分片通常適用於鍵值資料，你可以輕鬆地按鍵進行分片，但對於關係資料則較難，因為你可能想要透過二級索引搜尋，或連線可能分佈在不同分片中的記錄。我們將在 ["分片與二級索引"](#sec_sharding_secondary_indexes) 中進一步討論這個問題。

分片的另一個問題是寫入可能需要更新多個不同分片中的相關記錄。雖然單節點上的事務相當常見（見 [第 8 章](/tw/ch8#ch_transactions)），但確保跨多個分片的一致性需要 *分散式事務*。正如我們將在 [第 8 章](/tw/ch8#ch_transactions) 中看到的，分散式事務在某些資料庫中可用，但它們通常比單節點事務慢得多，可能成為整個系統的瓶頸，有些系統根本不支援它們。

一些系統即使在單臺機器上也使用分片，通常每個 CPU 核心執行一個單執行緒程序，以利用 CPU 的並行性，或者利用 *非統一記憶體訪問*（NUMA）架構：某些記憶體分割槽比其他分割槽更靠近某個 CPU [^5]。例如，Redis、VoltDB 和 FoundationDB 每個核心使用一個程序，並依靠分片在同一臺機器的 CPU 核心之間分散負載 [^6]。

### 面向多租戶的分片 {#sec_sharding_multitenancy}

軟體即服務（SaaS）產品和雲服務通常是 *多租戶* 的，其中每個租戶是一個客戶。多個使用者可能在同一租戶上擁有登入帳戶，但每個租戶都有一個獨立的資料集，與其他租戶分開。例如，在電子郵件營銷服務中，每個註冊的企業通常是一個單獨的租戶，因為一個企業的通訊訂閱、投遞資料等與其他企業的資料是分開的。

有時分片用於實現多租戶系統：要麼每個租戶被分配一個單獨的分片，要麼多個小租戶可能被分組到一個更大的分片中。這些分片可能是物理上分離的資料庫（我們之前在 ["嵌入式儲存引擎"](/tw/ch4#sidebar_embedded) 中提到過），或者是更大邏輯資料庫的可單獨管理部分 [^7]。使用分片實現多租戶有幾個優點：

資源隔離
: 如果某個租戶執行計算密集型操作，而它與其他租戶執行在不同分片上，那麼其他租戶效能受影響的可能性更小。

許可權隔離
: 如果訪問控制邏輯有漏洞，而租戶資料集又是彼此物理隔離儲存的，那麼誤將一個租戶的資料暴露給另一個租戶的機率會更低。

基於單元的架構
: 你不僅可以在資料儲存級別應用分片，還可以為執行應用程式程式碼的服務應用分片。在 *基於單元的架構* 中，特定租戶集的服務和儲存被分組到一個自包含的 *單元* 中，不同的單元被設定為可以在很大程度上彼此獨立執行。這種方法提供了 *故障隔離*：即，一個單元中的故障僅限於該單元，其他單元中的租戶不受影響 [^8]。

按租戶備份和恢復
: 單獨備份每個租戶的分片使得可以從備份中恢復租戶的狀態而不影響其他租戶，這在租戶意外刪除或覆蓋重要資料的情況下很有用 [^9]。

法規合規性
: 資料隱私法規（如 GDPR）賦予個人訪問和刪除儲存的所有關於他們的資料的權利。如果每個人的資料儲存在單獨的分片中，這就轉化為對其分片的簡單資料匯出和刪除操作 [^10]。

資料駐留
: 如果特定租戶的資料需要儲存在特定司法管轄區以符合資料駐留法律，具有區域感知的資料庫可以允許你將該租戶的分片分配給特定區域。

漸進式模式推出
: 模式遷移（之前在 ["文件模型中的模式靈活性"](/tw/ch3#sec_datamodels_schema_flexibility) 中討論過）可以逐步推出，一次一個租戶。這降低了風險，因為你可以在影響所有租戶之前檢測到問題，但很難以事務方式執行 [^11]。

使用分片實現多租戶的主要挑戰是：

* 它假設每個單獨的租戶都足夠小，可以適應單個節點。如果情況並非如此，並且你有一個對於一臺機器來說太大的租戶，你將需要在單個租戶內額外執行分片，這將我們帶回到為可伸縮性進行分片的主題 [^12]。
* 如果你有許多小租戶，那麼為每個租戶建立單獨的分片可能會產生太多開銷。你可以將幾個小租戶組合到一個更大的分片中，但隨後你會遇到如何在租戶增長時將其從一個分片移動到另一個分片的問題。
* 如果你需要支援跨多個租戶關聯資料的功能，那麼在必須跨多個分片做連線時，實現難度會顯著增加。


## 鍵值資料的分片 {#sec_sharding_key_value}

假設你有大量資料，並且想要對其進行分片。如何決定將哪些記錄儲存在哪些節點上？

我們進行分片的目標是將資料和查詢負載均勻地分佈在各節點上。如果每個節點承擔公平的份額，那麼理論上——10 個節點應該能夠處理 10 倍的資料量和 10 倍單個節點的讀寫吞吐量（忽略複製）。此外，如果我們新增或刪除節點，我們希望能夠 *再平衡* 負載，使其在新增時均勻分佈在 11 個節點上（或刪除時在剩餘的 9 個節點上）。

如果分片不公平，使得某些分片比其他分片承載更多資料或查詢，我們稱之為 *偏斜*。偏斜會顯著削弱分片效果。在極端情況下，所有負載都可能集中在一個分片上，導致 10 個節點中有 9 個處於空閒狀態，而瓶頸落在那一個繁忙節點上。負載明顯高於其他分片的分片稱為 *熱分片* 或 *熱點*。如果某個鍵的負載特別高（例如社交網路中的名人），我們稱之為 *熱鍵*。

因此，我們需要一種演算法，它以記錄的分割槽鍵作為輸入，並告訴我們該記錄在哪個分片中。在鍵值儲存中，分割槽鍵通常是鍵，或鍵的第一部分。在關係模型中，分割槽鍵可能是表的某一列（不一定是其主鍵）。該演算法需要能夠進行再平衡以緩解熱點。


### 按鍵的範圍分片 {#sec_sharding_key_range}

一種分片方法是為每個分片分配一個連續的分割槽鍵範圍（從某個最小值到某個最大值），就像紙質百科全書的卷一樣，如 [圖 7-2](#fig_sharding_encyclopedia) 所示。在這個例子中，條目的分割槽鍵是其標題。如果你想查詢特定標題的條目，你可以透過找到鍵範圍包含你要查詢標題的卷來輕鬆確定哪個分片包含該條目，從而從書架上挑選正確的書。

{{< figure src="/fig/ddia_0702.png" id="fig_sharding_encyclopedia" caption="圖 7-2. 印刷版百科全書按鍵範圍分片。" class="w-full my-4" >}}

鍵的範圍不一定是均勻分佈的，因為你的資料可能不是均勻分佈的。例如，在 [圖 7-2](#fig_sharding_encyclopedia) 中，第 1 卷包含以 A 和 B 開頭的單詞，但第 12 卷包含以 T、U、V、W、X、Y 和 Z 開頭的單詞。簡單地為字母表的每兩個字母分配一卷會導致某些卷比其他卷大得多。為了均勻分佈資料，分片邊界需要適應資料。

分片邊界可能由管理員手動選擇，或者資料庫可以自動選擇它們。手動鍵範圍分片例如被 Vitess（MySQL 的分片層）使用；自動變體被 Bigtable、其開源等價物 HBase、MongoDB 中基於範圍的分片選項、CockroachDB、RethinkDB 和 FoundationDB 使用 [^6]。YugabyteDB 提供手動和自動表塊分割兩種選項。

在每個分片內，鍵以排序順序儲存（例如，在 B 樹或 SSTable 中，如 [第 4 章](/tw/ch4#ch_storage) 中所討論的）。這樣做的優點是範圍掃描很容易，你可以將鍵視為連線索引，以便在一個查詢中獲取多個相關記錄（參見 ["多維和全文索引"](/tw/ch4#sec_storage_multidimensional)）。例如，考慮一個儲存感測器網路資料的應用程式，其中鍵是測量的時間戳。範圍掃描在這種情況下非常有用，因為它們讓你可以輕鬆獲取，比如說，特定月份的所有讀數。

鍵範圍分片的一個缺點是，如果有大量對相鄰鍵的寫入，你很容易得到一個熱分片。例如，如果鍵是時間戳，那麼分片對應於時間範圍——例如，每個月一個分片。不幸的是，如果你在測量發生時將感測器資料寫入資料庫，所有寫入最終都會進入同一個分片（本月的分片），因此該分片可能會因寫入而過載，而其他分片則處於空閒狀態 [^13]。

為了避免感測器資料庫中的這個問題，你需要使用時間戳以外的東西作為鍵的第一個元素。例如，你可以在每個時間戳前加上感測器 ID，使鍵排序首先按感測器 ID，然後按時間戳。假設你有許多感測器同時活動，寫入負載最終會更均勻地分佈在各個分片上。缺點是當你想要在一個時間範圍內獲取多個感測器的值時，你現在需要為每個感測器執行單獨的範圍查詢。

#### 重新平衡鍵範圍分片資料 {#rebalancing-key-range-sharded-data}

當你首次設定資料庫時，沒有鍵範圍可以分割成分片。一些資料庫，如 HBase 和 MongoDB，允許你在空資料庫上配置一組初始分片，這稱為 *預分割*。這要求你已經對鍵分佈將會是什麼樣子有所瞭解，以便你可以選擇適當的鍵範圍邊界 [^14]。

後來，隨著你的資料量和寫吞吐量增長，具有鍵範圍分片的系統透過將現有分片分割成兩個或更多較小的分片來增長，每個分片都儲存原始分片鍵範圍的連續子範圍。然後可以將生成的較小分片分佈在多個節點上。如果刪除了大量資料，你可能還需要將幾個相鄰的已變小的分片合併為一個更大的分片。這個過程類似於 B 樹頂層發生的事情（參見 ["B 樹"](/tw/ch4#sec_storage_b_trees)）。

對於自動管理分片邊界的資料庫，分片分割通常由以下觸發：

* 分片達到配置的大小（例如，在 HBase 上，預設值為 10 GB），或
* 在某些系統中，寫吞吐量持續高於某個閾值。因此，即使熱分片沒有儲存大量資料，也可能被分割，以便其寫入負載可以更均勻地分佈。

鍵範圍分片的一個優點是分片數量適應資料量。如果只有少量資料，少量分片就足夠了，因此開銷很小；如果有大量資料，每個單獨分片的大小被限制在可配置的最大值 [^15]。

這種方法的一個缺點是分割分片是一項昂貴的操作，因為它需要將其所有資料重寫到新檔案中，類似於日誌結構儲存引擎中的壓實。需要分割的分片通常也是處於高負載下的分片，分割的成本可能會加劇該負載，有使其過載的風險。

### 按鍵的雜湊分片 {#sec_sharding_hash}

鍵範圍分片在你希望具有相鄰（但不同）分割槽鍵的記錄被分組到同一個分片中時很有用；例如，如果是時間戳，這可能就是這種情況。如果你不關心分割槽鍵是否彼此接近（例如，如果它們是多租戶應用程式中的租戶 ID），一種常見方法是先對分割槽鍵進行雜湊，然後將其對映到分片。

一個好的雜湊函式可以把偏斜的資料變得更均勻。假設你有一個 32 位雜湊函式，輸入是字串。每當給它一個新字串，它都會返回一個看似隨機、介於 0 和 2³² − 1 之間的數字。即使輸入字串非常相似，它們的雜湊值也會在這個範圍內均勻分佈（但相同輸入總是產生相同輸出）。

出於分片目的，雜湊函式不需要是密碼學強度的：例如，MongoDB 使用 MD5，而 Cassandra 和 ScyllaDB 使用 Murmur3。許多程式語言都內建了簡單的雜湊函式（因為它們用於雜湊表），但它們可能不適合分片：例如，在 Java 的 `Object.hashCode()` 和 Ruby 的 `Object#hash` 中，相同的鍵在不同的程序中可能有不同的雜湊值，使它們不適合分片 [^16]。

#### 雜湊取模節點數 {#hash-modulo-number-of-nodes}

一旦你對鍵進行了雜湊，如何選擇將其儲存在哪個分片中？也許你的第一個想法是取雜湊值 *模* 系統中的節點數（在許多程式語言中使用 `%` 運算子）。例如，*hash*(*key*) % 10 將返回 0 到 9 之間的數字（如果我們將雜湊寫為十進位制數，hash % 10 將是最後一位數字）。如果我們有 10 個節點，編號從 0 到 9，這似乎是將每個鍵分配給節點的簡單方法。

*mod N* 方法的問題是，如果節點數 *N* 發生變化，大多數鍵必須從一個節點移動到另一個節點。[圖 7-3](#fig_sharding_hash_mod_n) 顯示了當你有三個節點並新增第四個節點時會發生什麼。在再平衡之前，節點 0 儲存雜湊值為 0、3、6、9 等的鍵。新增第四個節點後，雜湊值為 3 的鍵已移動到節點 3，雜湊值為 6 的鍵已移動到節點 2，雜湊值為 9 的鍵已移動到節點 1，依此類推。

{{< figure src="/fig/ddia_0703.png" id="fig_sharding_hash_mod_n" caption="圖 7-3. 透過對鍵進行雜湊並取模節點數來將鍵分配給節點。更改節點數會導致許多鍵從一個節點移動到另一個節點。" class="w-full my-4" >}}

*mod N* 函式易於計算，但它導致非常低效的再平衡，因為存在大量不必要的記錄從一個節點移動到另一個節點。我們需要一種不會移動超過必要資料的方法。

#### 固定數量的分片 {#fixed-number-of-shards}

一個簡單但廣泛使用的解決方案是建立比節點多得多的分片，並為每個節點分配多個分片。例如，在 10 個節點的叢集上執行的資料庫可能從一開始就被分成 1,000 個分片，以便每個節點分配 100 個分片。然後將鍵儲存在分片號 *hash*(*key*) % 1,000 中，系統單獨跟蹤哪個分片儲存在哪個節點上。

現在，如果向叢集新增一個節點，系統可以從現有節點重新分配一些分片到新節點，直到它們再次公平分佈。這個過程在 [圖 7-4](#fig_sharding_rebalance_fixed) 中說明。如果從叢集中刪除節點，則反向發生相同的事情。

{{< figure src="/fig/ddia_0704.png" id="fig_sharding_rebalance_fixed" caption="圖 7-4. 向每個節點有多個分片的資料庫叢集新增新節點。" class="w-full my-4" >}}

在這個模型中，只有整個分片在節點之間移動，這比分割分片更便宜。分片的數量不會改變，也不會改變鍵到分片的分配。唯一改變的是分片到節點的分配。這種分配的變化不是立即的——透過網路傳輸大量資料需要一些時間——因此在傳輸進行時，舊的分片分配用於任何發生的讀寫。

選擇分片數量為可被許多因子整除的數字是很常見的，這樣資料集可以在各種不同數量的節點之間均勻分割——例如，不要求節點數必須是 2 的冪 [^4]。你甚至可以考慮叢集中不匹配的硬體：透過為更強大的節點分配更多分片，你可以讓這些節點承擔更大份額的負載。

這種分片方法被 Citus（PostgreSQL 的分片層）、Riak、Elasticsearch 和 Couchbase 等使用。只要你對首次建立資料庫時需要多少分片有很好的估計，它就很有效。然後你可以輕鬆新增或刪除節點，但受限於你不能擁有比分片更多的節點。

如果你發現最初配置的分片數量是錯誤的——例如，如果你已經達到需要比分片更多節點的規模——那麼需要進行昂貴的重新分片操作。它需要分割每個分片並將其寫入新檔案，在此過程中使用大量額外的磁碟空間。一些系統不允許在併發寫入資料庫時進行重新分片，這使得在沒有停機時間的情況下更改分片數量變得困難。

如果資料集總大小高度可變（例如起初很小，但會隨時間顯著增長），選擇合適的分片數量就很困難。由於每個分片包含總資料中的固定比例，每個分片的大小會隨叢集總資料量按比例增長。如果分片很大，再平衡和節點故障恢復都會很昂貴；但如果分片太小，又會產生過多管理開銷。最佳效能通常出現在分片大小“恰到好處”時，但在分片數量固定、資料規模又持續變化的情況下，這很難做到。

#### 按雜湊範圍分片 {#sharding-by-hash-range}

如果無法提前預測所需的分片數量，最好使用一種方案，其中分片數量可以輕鬆適應工作負載。前面提到的鍵範圍分片方案具有這個屬性，但當有大量對相鄰鍵的寫入時，它有熱點的風險。一種解決方案是將鍵範圍分片與雜湊函式結合，使每個分片包含 *雜湊值* 的範圍而不是 *鍵* 的範圍。

[圖 7-5](#fig_sharding_hash_range) 顯示了使用 16 位雜湊函式的示例，該函式返回 0 到 65,535 = 2¹⁶ − 1 之間的數字（實際上，雜湊通常是 32 位或更多）。即使輸入鍵非常相似（例如，連續的時間戳），它們的雜湊值也會在該範圍內均勻分佈。然後我們可以為每個分片分配一個雜湊值範圍：例如，值 0 到 16,383 分配給分片 0，值 16,384 到 32,767 分配給分片 1，依此類推。

{{< figure src="/fig/ddia_0705.png" id="fig_sharding_hash_range" caption="圖 7-5. 為每個分片分配連續的雜湊值範圍。" class="w-full my-4" >}}

與鍵範圍分片一樣，雜湊範圍分片中的分片在變得太大或負載太重時可以被分割。這仍然是一個昂貴的操作，但它可以根據需要發生，因此分片數量適應資料量而不是預先固定。

與鍵範圍分片相比的缺點是，對分割槽鍵的範圍查詢效率不高，因為範圍內的鍵現在分散在所有分片中。但是，如果鍵由兩列或更多列組成，並且分割槽鍵只是這些列中的第一列，你仍然可以對第二列和後續列執行高效的範圍查詢：只要範圍查詢中的所有記錄具有相同的分割槽鍵，它們就會在同一個分片中。

--------

> [!TIP] 資料倉庫中的分割槽和範圍查詢
>
> 資料倉庫如 BigQuery、Snowflake 和 Delta Lake 支援類似的索引方法，儘管術語不同。例如，在 BigQuery 中，分割槽鍵決定記錄駐留在哪個分割槽中，而"叢集列"決定記錄在分割槽內如何排序。Snowflake 自動將記錄分配給"微分割槽"，但允許使用者為表定義叢集鍵。Delta Lake 支援手動和自動分割槽分配，並支援叢集鍵。聚集資料不僅可以提高範圍掃描效能，還可以提高壓縮和過濾效能。

--------

雜湊範圍分片被 YugabyteDB 和 DynamoDB 使用 [^17]，並且是 MongoDB 中的一個選項。Cassandra 和 ScyllaDB 使用這種方法的一個變體，如 [圖 7-6](#fig_sharding_cassandra) 所示：雜湊值空間被分割成與節點數成比例的範圍數（[圖 7-6](#fig_sharding_cassandra) 中每個節點 3 個範圍，但實際數字在 Cassandra 中預設為每個節點 8 個，在 ScyllaDB 中為每個節點 256 個），這些範圍之間有隨機邊界。這意味著某些範圍比其他範圍大，但透過每個節點有多個範圍，這些不平衡傾向於平均化 [^15] [^18]。

{{< figure src="/fig/ddia_0706.png" id="fig_sharding_cassandra" caption="圖 7-6. Cassandra 和 ScyllaDB 將可能的雜湊值範圍（這裡是 0-1023）分割成具有隨機邊界的連續範圍，並為每個節點分配多個範圍。" class="w-full my-4" >}}

當新增或刪除節點時，會新增和刪除範圍邊界，並相應地分割或合併分片 [^19]。在 [圖 7-6](#fig_sharding_cassandra) 的示例中，當新增節點 3 時，節點 1 將其兩個範圍的部分轉移到節點 3，節點 2 將其一個範圍的部分轉移到節點 3。這樣做的效果是給新節點一個大致公平的資料集份額，而不會在節點之間傳輸超過必要的資料。

#### 一致性雜湊 {#sec_sharding_consistent_hashing}

*一致性雜湊* 演算法是一種雜湊函式，它以滿足兩個屬性的方式將鍵對映到指定數量的分片：

1. 對映到每個分片的鍵數大致相等，並且
2. 當分片數量變化時，儘可能少的鍵從一個分片移動到另一個分片。

注意這裡的 *一致性* 與副本一致性（見 [第 6 章](/tw/ch6#ch_replication)）或 ACID 一致性（見 [第 8 章](/tw/ch8#ch_transactions)）無關，而是描述了鍵儘可能保持在同一個分片中的傾向。

Cassandra 和 ScyllaDB 使用的分片演算法類似於一致性雜湊的原始定義 [^20]，但也提出了其他幾種一致性雜湊演算法 [^21]，如 *最高隨機權重*，也稱為 *會合雜湊* [^22]，以及 *跳躍一致性雜湊* [^23]。使用 Cassandra 的演算法，如果新增一個節點，少量現有分片會被分割成子範圍；另一方面，使用會合和跳躍一致性雜湊，新節點被分配之前分散在所有其他節點中的單個鍵。哪種更可取取決於應用程式。

### 偏斜的工作負載與緩解熱點 {#sec_sharding_skew}

一致性雜湊保證鍵在節點間大致均勻分佈，但這並不等於實際負載也均勻分佈。如果工作負載高度偏斜，即某些分割槽鍵下的資料量遠大於其他鍵，或某些鍵的請求速率遠高於其他鍵，那麼你仍可能出現部分伺服器過載、其他伺服器幾乎空閒的情況。

例如，在社交媒體網站上，擁有數百萬粉絲的名人使用者在做某事時可能會引起活動風暴 [^24]。這個事件可能導致對同一個鍵的大量讀寫（其中分割槽鍵可能是名人的使用者 ID，或者人們正在評論的動作的 ID）。

在這種情況下，需要更靈活的分片策略 [^25] [^26]。基於鍵範圍（或雜湊範圍）定義分片的系統使得可以將單個熱鍵放在自己的分片中，甚至可能為其分配專用機器 [^27]。

也可以在應用層補償偏斜。例如，如果已知某個鍵非常熱，一個簡單方法是在鍵的開頭或結尾附加一個隨機數。僅用兩位十進位制隨機數，就可以把對該鍵的寫入均勻打散到 100 個不同鍵上，從而將它們分佈到不同分片。

然而，將寫入分散到不同的鍵之後，任何讀取現在都必須做額外的工作，因為它們必須從所有 100 個鍵讀取資料並將其組合。對熱鍵每個分片的讀取量沒有減少；只有寫入負載被分割。這種技術還需要額外的記賬：只對少數熱鍵附加隨機數是有意義的；對於寫入吞吐量低的絕大多數鍵，這將是不必要的開銷。因此，你還需要某種方法來跟蹤哪些鍵正在被分割，以及將常規鍵轉換為特殊管理的熱鍵的過程。

問題因負載隨時間變化而進一步複雜化：例如，一個已經病毒式傳播的特定社交媒體帖子可能會在幾天內經歷高負載，但之後可能會再次平靜下來。此外，某些鍵可能對寫入很熱，而其他鍵對讀取很熱，需要不同的策略來處理它們。

一些系統（特別是為大規模設計的雲服務）有自動處理熱分片的方法；例如，Amazon 稱之為 *熱管理* [^28] 或 *自適應容量* [^17]。這些系統如何工作的細節超出了本書的範圍。

### 運維：自動/手動再平衡 {#sec_sharding_operations}

關於再平衡有一個我們已經忽略的重要問題：分片的分割和再平衡是自動發生還是手動發生？

一些系統自動決定何時分割分片以及何時將它們從一個節點移動到另一個節點，無需任何人工互動，而其他系統則讓分片由管理員明確配置。還有一個中間地帶：例如，Couchbase 和 Riak 自動生成建議的分片分配，但需要管理員提交才能生效。

完全自動的再平衡可能很方便，因為正常維護的操作工作較少，這樣的系統甚至可以自動擴充套件以適應工作負載的變化。雲資料庫如 DynamoDB 被宣傳為能夠在幾分鐘內自動新增和刪除分片以適應負載的大幅增加或減少 [^17] [^29]。

然而，自動分片管理也可能是不可預測的。再平衡是一項昂貴的操作，因為它需要重新路由請求並將大量資料從一個節點移動到另一個節點。如果操作不當，這個過程可能會使網路或節點過載，並可能損害其他請求的效能。系統必須在再平衡進行時繼續處理寫入；如果系統接近其最大寫入吞吐量，分片分割過程甚至可能無法跟上傳入寫入的速率 [^29]。

這種自動化與自動故障檢測結合可能很危險。例如，假設一個節點過載並暫時響應請求緩慢。其他節點得出結論，過載的節點已死，並自動重新平衡叢集以將負載從它移開。這會對其他節點和網路施加額外負載，使情況變得更糟。存在導致級聯故障的風險，其中其他節點變得過載並也被錯誤地懷疑已關閉。

出於這個原因，在再平衡過程中有人參與可能是件好事。它比完全自動的過程慢，但它可以幫助防止操作意外。


## 請求路由 {#sec_sharding_routing}

我們已經討論了如何將資料集分片到多個節點上，以及如何在新增或刪除節點時重新平衡這些分片。現在讓我們繼續討論這個問題：如果你想讀取或寫入特定的鍵，你如何知道需要連線到哪個節點——即哪個 IP 地址和埠號？

我們稱這個問題為 *請求路由*，它與 *服務發現* 非常相似，我們之前在 ["負載均衡器、服務發現和服務網格"](/tw/ch5#sec_encoding_service_discovery) 中討論過。兩者之間最大的區別是，對於執行應用程式程式碼的服務，每個例項通常是無狀態的，負載均衡器可以將請求傳送到任何例項。對於分片資料庫，對鍵的請求只能由包含該鍵的分片的副本節點處理。

這意味著請求路由必須知道鍵到分片的分配，以及分片到節點的分配。在高層次上，這個問題有幾種不同的方法（在 [圖 7-7](#fig_sharding_routing) 中說明）：

1. 允許客戶端連線任何節點（例如，透過迴圈負載均衡器）。如果該節點恰好擁有請求適用的分片，它可以直接處理請求；否則，它將請求轉發到適當的節點，接收回復，並將回覆傳遞給客戶端。
2. 首先將客戶端的所有請求傳送到路由層，該層確定應該處理每個請求的節點並相應地轉發它。這個路由層本身不處理任何請求；它只充當分片感知的負載均衡器。
3. 要求客戶端知道分片和分片到節點的分配。在這種情況下，客戶端可以直接連線到適當的節點，而無需任何中介。

{{< figure src="/fig/ddia_0707.png" id="fig_sharding_routing" caption="圖 7-7. 將請求路由到正確節點的三種不同方式。" class="w-full my-4" >}}

在所有情況下，都有一些關鍵問題：

* 誰決定哪個分片應該存在於哪個節點上？最簡單的是有一個單一的協調器做出該決定，但在這種情況下，如果執行協調器的節點出現故障，如何使其容錯？如果協調器角色可以故障轉移到另一個節點，如何防止腦裂情況（見 ["處理節點中斷"](/tw/ch6#sec_replication_failover)），其中兩個不同的協調器做出相互矛盾的分片分配？
* 執行路由的元件（可能是節點之一、路由層或客戶端）如何瞭解分片到節點分配的變化？
* 當分片從一個節點移動到另一個節點時，有一個切換期，在此期間新節點已接管，但對舊節點的請求可能仍在傳輸中。如何處理這些？

許多分散式資料系統依賴於單獨的協調服務（如 ZooKeeper 或 etcd）來跟蹤分片分配，如 [圖 7-8](#fig_sharding_zookeeper) 所示。它們使用共識演算法（見 [第 10 章](/tw/ch10#ch_consistency)）來提供容錯和防止腦裂。每個節點在 ZooKeeper 中註冊自己，ZooKeeper 維護分片到節點的權威對映。其他參與者，如路由層或分片感知客戶端，可以在 ZooKeeper 中訂閱此資訊。每當分片所有權發生變化，或者新增或刪除節點時，ZooKeeper 都會通知路由層，以便它可以保持其路由資訊最新。

{{< figure src="/fig/ddia_0708.png" id="fig_sharding_zookeeper" caption="圖 7-8. 使用 ZooKeeper 跟蹤分片到節點的分配。" class="w-full my-4" >}}

例如，HBase 和 SolrCloud 使用 ZooKeeper 管理分片分配，Kubernetes 使用 etcd 跟蹤哪個服務例項在哪裡執行。MongoDB 有類似的架構，但它依賴於自己的 *配置伺服器* 實現和 *mongos* 守護程序作為路由層。Kafka、YugabyteDB 和 TiDB 使用內建的 Raft 共識協議實現來執行此協調功能。

Cassandra、ScyllaDB 和 Riak 採用不同的方法：它們在節點之間使用 *流言協議* 來傳播叢集狀態的任何變化。這提供了比共識協議弱得多的一致性；可能會出現腦裂，其中叢集的不同部分對同一分片有不同的節點分配。無主資料庫可以容忍這一點，因為它們通常提供弱一致性保證（見 ["仲裁一致性的限制"](/tw/ch6#sec_replication_quorum_limitations)）。

當使用路由層或向隨機節點發送請求時，客戶端仍然需要找到要連線的 IP 地址。這些不像分片到節點的分配那樣快速變化，因此通常使用 DNS 就足夠了。

上面對請求路由的討論，主要關注如何為單個鍵找到對應分片，這對分片 OLTP 資料庫最相關。分析型資料庫通常也使用分片，但其查詢執行模型很不一樣：查詢往往需要並行聚合並連線來自多個分片的資料，而不是在單個分片內執行。我們將在 ["JOIN 和 GROUP BY"](/tw/ch11#sec_batch_join) 中討論這類並行查詢執行技術。

## 分片與二級索引 {#sec_sharding_secondary_indexes}

到目前為止，我們討論的分片方案依賴於客戶端知道它想要訪問的任何記錄的分割槽鍵。這在鍵值資料模型中最容易做到，其中分割槽鍵是主鍵的第一部分（或整個主鍵），因此我們可以使用分割槽鍵來確定分片，從而將讀寫路由到負責該鍵的節點。

如果涉及二級索引，情況會變得更加複雜（另見 ["多列和二級索引"](/tw/ch4#sec_storage_index_multicolumn)）。二級索引通常不唯一地標識記錄，而是一種搜尋特定值出現的方法：查詢使用者 `123` 的所有操作、查詢包含單詞 `hogwash` 的所有文章、查詢顏色為 `red` 的所有汽車等。

鍵值儲存通常沒有二級索引；但在關係資料庫中，二級索引是基礎能力，在文件資料庫中也很常見，而且它們正是 Solr、Elasticsearch 等全文檢索引擎的 *立身之本*。二級索引的難點在於，它們不能整齊地對映到分片。帶二級索引的分片資料庫主要有兩種做法：本地索引與全域性索引。

### 本地二級索引 {#id166}

例如，假設你正在運營一個出售二手車的網站（如 [圖 7-9](#fig_sharding_local_secondary) 所示）。每個列表都有一個唯一的 ID——稱之為文件 ID——你使用該 ID 作為分割槽鍵對資料庫進行分片（例如，ID 0 到 499 在分片 0 中，ID 500 到 999 在分片 1 中，等等）。

如果你想讓使用者搜尋汽車，允許他們按顏色和製造商過濾，你需要在 `color` 和 `make` 上建立二級索引（在文件資料庫中這些是欄位；在關係資料庫中這些是列）。如果你已宣告索引，資料庫就可以自動維護索引。例如，每當一輛紅色汽車被寫入資料庫，所在分片會自動將其 ID 加入索引條目 `color:red` 對應的文件 ID 列表。正如 [第 4 章](/tw/ch4#ch_storage) 所述，這個 ID 列表也稱為 *倒排列表*。

{{< figure src="/fig/ddia_0709.png" id="fig_sharding_local_secondary" caption="圖 7-9. 本地二級索引：每個分片只索引其自己分片內的記錄。" class="w-full my-4" >}}

> [!WARNING] 警告

如果你的資料庫只支援鍵值模型，你可能會嘗試透過在應用程式程式碼中建立從值到文件 ID 的對映來自己實現二級索引。如果你走這條路，你需要格外小心，確保你的索引與底層資料保持一致。競態條件和間歇性寫入失敗（其中某些更改已儲存但其他更改未儲存）很容易導致資料不同步——見 ["多物件事務的需求"](/tw/ch8#sec_transactions_need)。

--------

在這種索引方法中，每個分片是完全獨立的：每個分片維護自己的二級索引，僅覆蓋該分片中的文件。它不關心儲存在其他分片中的資料。每當你需要寫入資料庫——新增、刪除或更新記錄——你只需要處理包含你正在寫入的文件 ID 的分片。出於這個原因，這種型別的二級索引被稱為 *本地索引*。在資訊檢索上下文中，它也被稱為 *文件分割槽索引* [^30]。

當從本地二級索引讀取時，如果你已經知道你正在查詢的記錄的分割槽鍵，你可以只在適當的分片上執行搜尋。此外，如果你只想要 *一些* 結果，而不需要全部，你可以將請求傳送到任何分片。

但是，如果你想要所有結果並且事先不知道它們的分割槽鍵，你需要將查詢傳送到所有分片，並組合你收到的結果，因為匹配的記錄可能分散在所有分片中。在 [圖 7-9](#fig_sharding_local_secondary) 中，紅色汽車出現在分片 0 和分片 1 中。

這種查詢分片資料庫的方法有時稱為 *分散/收集*（scatter/gather），它可能使二級索引讀取變得相當昂貴。即使並行查詢各分片，分散/收集也容易導致尾部延遲放大（見 ["響應時間指標的使用"](/tw/ch2#sec_introduction_slo_sla)）。它還會限制應用的可伸縮性：增加分片可以提升可儲存資料量，但若每個查詢仍需所有分片參與，查詢吞吐量並不會隨分片數增加而提升。

儘管如此，本地二級索引被廣泛使用 [^31]：例如，MongoDB、Riak、Cassandra [^32]、Elasticsearch [^33]、SolrCloud 和 VoltDB [^34] 都使用本地二級索引。

### 全域性二級索引 {#id167}

我們可以構建一個覆蓋所有分片資料的 *全域性索引*，而不是每個分片有自己的本地二級索引。但是，我們不能只將該索引儲存在一個節點上，因為它可能會成為瓶頸並違背分片的目的。全域性索引也必須進行分片，但它可以以不同於主鍵索引的方式進行分片。

[圖 7-10](#fig_sharding_global_secondary) 說明了這可能是什麼樣子：來自所有分片的紅色汽車的 ID 出現在索引的 `color:red` 下，但索引是分片的，以便以字母 *a* 到 *r* 開頭的顏色出現在分片 0 中，以 *s* 到 *z* 開頭的顏色出現在分片 1 中。汽車製造商的索引也類似地分割槽（分片邊界在 *f* 和 *h* 之間）。

{{< figure src="/fig/ddia_0710.png" id="fig_sharding_global_secondary" caption="圖 7-10. 全域性二級索引反映來自所有分片的資料，並且本身按索引值進行分片。" class="w-full my-4" >}}

這種索引也稱為 *基於詞項分割槽* [^30]：回憶一下 ["全文檢索"](/tw/ch4#sec_storage_full_text)，在全文檢索中，*詞項* 是你可以搜尋的文字中的關鍵字。這裡我們將其推廣為指二級索引中你可以搜尋的任何值。

全域性索引使用詞項作為分割槽鍵，因此當你查詢特定詞項或值時，你可以找出需要查詢哪個分片。和以前一樣，分片可以包含連續的詞項範圍（如 [圖 7-10](#fig_sharding_global_secondary)），或者你可以基於詞項的雜湊將詞項分配給分片。

全域性索引的優點是，只有一個查詢條件時（如 *color = red*），只需從一個分片讀取即可獲得倒排列表。但如果你不僅要 ID，還要取回完整記錄，仍然必須去負責這些 ID 的各個分片讀取。

如果你有多個搜尋條件或詞項（例如搜尋某種顏色且某個製造商的汽車，或搜尋同一文字中出現的多個單詞），這些詞項很可能會落在不同分片。要計算兩個條件的邏輯 AND，系統需要找出同時出現在兩個倒排列表中的 ID。若倒排列表較短，這沒問題；但若很長，把它們透過網路傳送後再算交集就可能很慢 [^30]。

全域性二級索引的另一個挑戰是寫入比本地索引更複雜，因為寫入單個記錄可能會影響索引的多個分片（文件中的每個詞項可能在不同的分片或不同的節點上）。這使得二級索引與底層資料保持同步更加困難。一種選擇是使用分散式事務來原子地更新儲存主記錄的分片及其二級索引（見 [第 8 章](/tw/ch8#ch_transactions)）。

全域性二級索引被 CockroachDB、TiDB 和 YugabyteDB 使用；DynamoDB 同時支援本地與全域性二級索引。在 DynamoDB 中，寫入會非同步反映到全域性索引，因此從全域性索引讀取到的結果可能是陳舊的（類似複製延遲，見 ["複製延遲的問題"](/tw/ch6#sec_replication_lag)）。儘管如此，在讀吞吐量高於寫吞吐量且倒排列表不太長的場景下，全域性索引仍然很有價值。


## 總結 {#summary}

在本章中，我們探討了將大型資料集分片為更小子集的不同方法。當你有如此多的資料以至於在單臺機器上儲存和處理它不再可行時，分片是必要的。

分片的目標是在多臺機器上均勻分佈資料和查詢負載，避免熱點（負載不成比例高的節點）。這需要選擇適合你的資料的分片方案，並在節點新增到叢集或從叢集中刪除時重新平衡分片。

我們討論了兩種主要的分片方法：

**鍵範圍分片**
: 其中鍵是有序的，分片擁有從某個最小值到某個最大值的所有鍵。排序的優點是可以進行高效的範圍查詢，但如果應用程式經常訪問排序順序中彼此接近的鍵，則存在熱點風險。

  在這種方法中，當分片變得太大時，通常透過將範圍分成兩個子範圍來動態重新平衡分片。

**雜湊分片**
: 其中對每個鍵應用雜湊函式，分片擁有一個雜湊值範圍（或者可以使用另一種一致性雜湊演算法將雜湊對映到分片）。這種方法破壞了鍵的順序，使範圍查詢效率低下，但可能更均勻地分佈負載。

  當按雜湊分片時，通常預先建立固定數量的分片，為每個節點分配多個分片，並在新增或刪除節點時將整個分片從一個節點移動到另一個節點。像鍵範圍一樣分割分片也是可能的。

通常使用鍵的第一部分作為分割槽鍵（即，識別分片），並在該分片內按鍵的其餘部分對記錄進行排序。這樣，你仍然可以在具有相同分割槽鍵的記錄之間進行高效的範圍查詢。

我們還討論了分片和二級索引之間的互動。二級索引也需要進行分片，有兩種方法：

**本地二級索引**
: 其中二級索引與主鍵和值儲存在同一個分片中。這意味著寫入時只需要更新一個分片，但二級索引的查詢需要從所有分片讀取。

**全域性二級索引**
: 它們基於索引值單獨分片。二級索引中的條目可能引用來自主鍵所有分片的記錄。寫入記錄時，可能需要更新多個二級索引分片；但讀取倒排列表時，可以由單個分片提供（獲取實際記錄仍需從多個分片讀取）。

最後，我們討論了將查詢路由到正確分片的技術，以及如何藉助協調服務維護分片到節點的分配資訊。

按設計，每個分片大體獨立執行，這正是分片資料庫能夠擴充套件到多臺機器的原因。然而，凡是需要同時寫多個分片的操作都會變得棘手：例如，一個分片寫入成功、另一個分片寫入失敗時會發生什麼？這個問題將在後續章節中討論。


### 參考

[^1]: Claire Giordano. [Understanding partitioning and sharding in Postgres and Citus](https://www.citusdata.com/blog/2023/08/04/understanding-partitioning-and-sharding-in-postgres-and-citus/). *citusdata.com*, August 2023. Archived at [perma.cc/8BTK-8959](https://perma.cc/8BTK-8959)
[^2]: Brandur Leach. [Partitioning in Postgres, 2022 edition](https://brandur.org/fragments/postgres-partitioning-2022). *brandur.org*, October 2022. Archived at [perma.cc/Z5LE-6AKX](https://perma.cc/Z5LE-6AKX)
[^3]: Raph Koster. [Database “sharding” came from UO?](https://www.raphkoster.com/2009/01/08/database-sharding-came-from-uo/) *raphkoster.com*, January 2009. Archived at [perma.cc/4N9U-5KYF](https://perma.cc/4N9U-5KYF)
[^4]: Garrett Fidalgo. [Herding elephants: Lessons learned from sharding Postgres at Notion](https://www.notion.com/blog/sharding-postgres-at-notion). *notion.com*, October 2021. Archived at [perma.cc/5J5V-W2VX](https://perma.cc/5J5V-W2VX)
[^5]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ)
[^6]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559)
[^7]: Marco Slot. [Citus 12: Schema-based sharding for PostgreSQL](https://www.citusdata.com/blog/2023/07/18/citus-12-schema-based-sharding-for-postgres/). *citusdata.com*, July 2023. Archived at [perma.cc/R874-EC9W](https://perma.cc/R874-EC9W)
[^8]: Robisson Oliveira. [Reducing the Scope of Impact with Cell-Based Architecture](https://docs.aws.amazon.com/pdfs/wellarchitected/latest/reducing-scope-of-impact-with-cell-based-architecture/reducing-scope-of-impact-with-cell-based-architecture.pdf). AWS Well-Architected white paper, Amazon Web Services, September 2023. Archived at [perma.cc/4KWW-47NR](https://perma.cc/4KWW-47NR)
[^9]: Gwen Shapira. [Things DBs Don’t Do - But Should](https://www.thenile.dev/blog/things-dbs-dont-do). *thenile.dev*, February 2023. Archived at [perma.cc/C3J4-JSFW](https://perma.cc/C3J4-JSFW)
[^10]: Malte Schwarzkopf, Eddie Kohler, M. Frans Kaashoek, and Robert Morris. [Position: GDPR Compliance by Construction](https://cs.brown.edu/people/malte/pub/papers/2019-poly-gdpr.pdf). At *Towards Polystores that manage multiple Databases, Privacy, Security and/or Policy Issues for Heterogenous Data* (Poly), August 2019. [doi:10.1007/978-3-030-33752-0\_3](https://doi.org/10.1007/978-3-030-33752-0_3)
[^11]: Gwen Shapira. [Introducing pg\_karnak: Transactional schema migration across tenant databases](https://www.thenile.dev/blog/distributed-ddl). *thenile.dev*, November 2024. Archived at [perma.cc/R5RD-8HR9](https://perma.cc/R5RD-8HR9)
[^12]: Arka Ganguli, Guido Iaquinti, Maggie Zhou, and Rafael Chacón. [Scaling Datastores at Slack with Vitess](https://slack.engineering/scaling-datastores-at-slack-with-vitess/). *slack.engineering*, December 2020. Archived at [perma.cc/UW8F-ALJK](https://perma.cc/UW8F-ALJK)
[^13]: Ikai Lan. [App Engine Datastore Tip: Monotonically Increasing Values Are Bad](https://ikaisays.com/2011/01/25/app-engine-datastore-tip-monotonically-increasing-values-are-bad/). *ikaisays.com*, January 2011. Archived at [perma.cc/BPX8-RPJB](https://perma.cc/BPX8-RPJB)
[^14]: Enis Soztutar. [Apache HBase Region Splitting and Merging](https://www.cloudera.com/blog/technical/apache-hbase-region-splitting-and-merging.html). *cloudera.com*, February 2013. Archived at [perma.cc/S9HS-2X2C](https://perma.cc/S9HS-2X2C)
[^15]: Eric Evans. [Rethinking Topology in Cassandra](https://www.youtube.com/watch?v=Qz6ElTdYjjU). At *Cassandra Summit*, June 2013. Archived at [perma.cc/2DKM-F438](https://perma.cc/2DKM-F438)
[^16]: Martin Kleppmann. [Java’s hashCode Is Not Safe for Distributed Systems](https://martin.kleppmann.com/2012/06/18/java-hashcode-unsafe-for-distributed-systems.html). *martin.kleppmann.com*, June 2012. Archived at [perma.cc/LK5U-VZSN](https://perma.cc/LK5U-VZSN)
[^17]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022.
[^18]: Brandon Williams. [Virtual Nodes in Cassandra 1.2](https://www.datastax.com/blog/virtual-nodes-cassandra-12). *datastax.com*, December 2012. Archived at [perma.cc/N385-EQXV](https://perma.cc/N385-EQXV)
[^19]: Branimir Lambov. [New Token Allocation Algorithm in Cassandra 3.0](https://www.datastax.com/blog/new-token-allocation-algorithm-cassandra-30). *datastax.com*, January 2016. Archived at [perma.cc/2BG7-LDWY](https://perma.cc/2BG7-LDWY)
[^20]: David Karger, Eric Lehman, Tom Leighton, Rina Panigrahy, Matthew Levine, and Daniel Lewin. [Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web](https://people.csail.mit.edu/karger/Papers/web.pdf). At *29th Annual ACM Symposium on Theory of Computing* (STOC), May 1997. [doi:10.1145/258533.258660](https://doi.org/10.1145/258533.258660)
[^21]: Damian Gryski. [Consistent Hashing: Algorithmic Tradeoffs](https://dgryski.medium.com/consistent-hashing-algorithmic-tradeoffs-ef6b8e2fcae8). *dgryski.medium.com*, April 2018. Archived at [perma.cc/B2WF-TYQ8](https://perma.cc/B2WF-TYQ8)
[^22]: David G. Thaler and Chinya V. Ravishankar. [Using name-based mappings to increase hit rates](https://www.cs.kent.edu/~javed/DL/web/p1-thaler.pdf). *IEEE/ACM Transactions on Networking*, volume 6, issue 1, pages 1–14, February 1998. [doi:10.1109/90.663936](https://doi.org/10.1109/90.663936)
[^23]: John Lamping and Eric Veach. [A Fast, Minimal Memory, Consistent Hash Algorithm](https://arxiv.org/abs/1406.2294). *arxiv.org*, June 2014.
[^24]: Samuel Axon. [3% of Twitter’s Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX)
[^25]: Gerald Guo and Thawan Kooburat. [Scaling services with Shard Manager](https://engineering.fb.com/2020/08/24/production-engineering/scaling-services-with-shard-manager/). *engineering.fb.com*, August 2020. Archived at [perma.cc/EFS3-XQYT](https://perma.cc/EFS3-XQYT)
[^26]: Sangmin Lee, Zhenhua Guo, Omer Sunercan, Jun Ying, Thawan Kooburat, Suryadeep Biswal, Jun Chen, Kun Huang, Yatpang Cheung, Yiding Zhou, Kaushik Veeraraghavan, Biren Damani, Pol Mauri Ruiz, Vikas Mehta, and Chunqiang Tang. [Shard Manager: A Generic Shard Management Framework for Geo-distributed Applications](https://dl.acm.org/doi/pdf/10.1145/3477132.3483546). *28th ACM SIGOPS Symposium on Operating Systems Principles* (SOSP), pages 553–569, October 2021. [doi:10.1145/3477132.3483546](https://doi.org/10.1145/3477132.3483546)
[^27]: Scott Lystig Fritchie. [A Critique of Resizable Hash Tables: Riak Core & Random Slicing](https://www.infoq.com/articles/dynamo-riak-random-slicing/). *infoq.com*, August 2018. Archived at [perma.cc/RPX7-7BLN](https://perma.cc/RPX7-7BLN)
[^28]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/6S7P-GLM4](https://perma.cc/6S7P-GLM4)
[^29]: Rich Houlihan. [DynamoDB adaptive capacity: smooth performance for chaotic workloads (DAT327)](https://www.youtube.com/watch?v=kMY0_m29YzU). At *AWS re:Invent*, November 2017.
[^30]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/)
[^31]: Michael Busch, Krishna Gade, Brian Larson, Patrick Lok, Samuel Luckenbill, and Jimmy Lin. [Earlybird: Real-Time Search at Twitter](https://cs.uwaterloo.ca/~jimmylin/publications/Busch_etal_ICDE2012.pdf). At *28th IEEE International Conference on Data Engineering* (ICDE), April 2012. [doi:10.1109/ICDE.2012.149](https://doi.org/10.1109/ICDE.2012.149)
[^32]: Nadav Har’El. [Indexing in Cassandra 3](https://github.com/scylladb/scylladb/wiki/Indexing-in-Cassandra-3). *github.com*, April 2017. Archived at [perma.cc/3ENV-8T9P](https://perma.cc/3ENV-8T9P)
[^33]: Zachary Tong. [Customizing Your Document Routing](https://www.elastic.co/blog/customizing-your-document-routing/). *elastic.co*, June 2013. Archived at [perma.cc/97VM-MREN](https://perma.cc/97VM-MREN)
[^34]: Andrew Pavlo. [H-Store Frequently Asked Questions](https://hstore.cs.brown.edu/documentation/faq/). *hstore.cs.brown.edu*, October 2013. Archived at [perma.cc/X3ZA-DW6Z](https://perma.cc/X3ZA-DW6Z)

================================================
FILE: content/tw/ch8.md
================================================
---
title: "8. 事務"
weight: 208
math: true
breadcrumbs: false
---

<a id="ch_transactions"></a>

![](/map/ch07.png)

> *有些作者聲稱，支援通用的兩階段提交代價太大，會帶來效能與可用性的問題。我們認為，讓程式設計師來處理過度使用事務導致的效能問題，總比讓他們始終圍繞著缺少事務的問題來程式設計好得多。*
>
> James Corbett 等人，*Spanner：Google 的全球分散式資料庫*（2012）

在資料系統的殘酷現實中，很多事情都可能出錯：

* 資料庫軟體或硬體可能在任意時刻發生故障（包括寫操作進行到一半時）。
* 應用程式可能在任意時刻崩潰（包括一系列操作的中間）。
* 網路中斷可能會意外切斷應用程式與資料庫的連線，或資料庫節點之間的連線。
* 多個客戶端可能會同時寫入資料庫，覆蓋彼此的更改。
* 客戶端可能讀取到無意義的資料，因為資料只更新了一部分。
* 客戶端之間的競態條件可能導致令人驚訝的錯誤。

為了實現可靠性，系統必須處理這些故障，確保它們不會導致整個系統的災難性故障。然而，實現容錯機制需要大量工作。它需要仔細考慮所有可能出錯的事情，並進行大量測試，以確保解決方案真正有效。

數十年來，*事務*一直是簡化這些問題的首選機制。事務是應用程式將多個讀寫操作組合成一個邏輯單元的一種方式。從概念上講，事務中的所有讀寫操作被視作單個操作來執行：整個事務要麼成功（*提交*），要麼失敗（*中止*、*回滾*）。如果失敗，應用程式可以安全地重試。有了事務，應用程式的錯誤處理就變得簡單多了，因為它不用再擔心部分失敗——即某些操作成功，某些失敗（無論出於何種原因）。

如果你與事務打交道多年，它們可能看起來顯而易見，但我們不應該將其視為理所當然。事務不是自然法則；它們是有目的地建立的，即為了*簡化應用程式的程式設計模型*。透過使用事務，應用程式可以自由地忽略某些潛在的錯誤場景和併發問題，因為資料庫會替應用處理好這些（我們稱之為*安全保證*）。

並非所有應用程式都需要事務，有時弱化事務保證或完全放棄事務也有好處（例如，為了獲得更高的效能或更高的可用性）。某些安全屬性可以在沒有事務的情況下實現。另一方面，事務可以防止很多麻煩：例如，郵局 Horizon 醜聞（參見["可靠性有多重要？"](/tw/ch2#sidebar_reliability_importance)）背後的技術原因可能是底層會計系統缺乏 ACID 事務[^1]。

你如何確定是否需要事務？為了回答這個問題，我們首先需要準確理解事務可以提供哪些安全保證，以及相關的成本。儘管事務乍看起來很簡單，但實際上有許多細微但重要的細節在起作用。

在本章中，我們將研究許多可能出錯的案例，並探索資料庫用於防範這些問題的演算法。我們將特別深入併發控制領域，討論可能發生的各種競態條件，以及資料庫如何實現*讀已提交*、*快照隔離*和*可序列化*等隔離級別。

併發控制對單節點和分散式資料庫都很重要。在本章後面的["分散式事務"](#sec_transactions_distributed)部分，我們將研究*兩階段提交*協議和在分散式事務中實現原子性的挑戰。

## 事務到底是什麼？ {#sec_transactions_overview}

今天，幾乎所有的關係型資料庫和一些非關係資料庫都支援事務。它們大多遵循 1975 年由 IBM System R（第一個 SQL 資料庫）引入的風格[^2] [^3] [^4]。儘管一些實現細節發生了變化，但總體思路在 50 年裡幾乎保持不變：MySQL、PostgreSQL、Oracle、SQL Server 等的事務支援與 System R 驚人地相似。

在 2000 年代後期，非關係（NoSQL）資料庫開始流行起來。它們旨在透過提供新的資料模型選擇（參見[第 3 章](/tw/ch3#ch_datamodels)），以及預設包含複製（[第 6 章](/tw/ch6#ch_replication)）和分片（[第 7 章](/tw/ch7#ch_sharding)）來改進關係型資料庫的現狀。事務是這一運動的主要犧牲品：許多這一代資料庫完全放棄了事務，或者重新定義了這個詞，用來描述比以前理解的更弱的保證集。

圍繞 NoSQL 分散式資料庫的炒作導致了一種流行的信念，即事務從根本上不可伸縮，任何大規模系統都必須放棄事務以保持良好的效能和高可用性。最近，這種信念被證明是錯誤的。所謂 "NewSQL" 資料庫，如 CockroachDB[^5]、TiDB[^6]、Spanner[^7]、FoundationDB[^8] 和 YugabyteDB 已經證明，事務系統同樣可以具備很強的可伸縮性，並支援大資料量與高吞吐量。這些系統將分片與共識協議（[第 10 章](/tw/ch10#ch_consistency)）結合，在大規模下提供強 ACID 保證。

然而，這並不意味著每個系統都必須是事務型的：與任何其他技術設計選擇一樣，事務有優點也有侷限性。為了理解這些權衡，讓我們深入瞭解事務可以提供的保證的細節——無論是在正常操作中還是在各種極端（但現實）的情況下。

### ACID 的含義 {#sec_transactions_acid}

事務提供的安全保證通常由眾所周知的首字母縮略詞 *ACID* 來描述，它代表*原子性*（Atomicity）、*一致性*（Consistency）、*隔離性*（Isolation）和*永續性*（Durability）。它由 Theo Härder 和 Andreas Reuter 於 1983 年提出[^9]，旨在為資料庫中的容錯機制建立精確的術語。

然而，在實踐中，一個資料庫的 ACID 實現並不等同於另一個資料庫的實現。例如，正如我們將看到的，*隔離性*的含義有很多歧義[^10]。高層次的想法是合理的，但魔鬼在細節中。今天，當一個系統聲稱自己"符合 ACID"時，實際上你能期待什麼保證並不清楚。不幸的是，ACID 基本上已經成為了一個營銷術語。

（不符合 ACID 標準的系統有時被稱為 *BASE*，它代表*基本可用*（Basically Available）、*軟狀態*（Soft state）和*最終一致性*（Eventual consistency）[^11]。這比 ACID 的定義更加模糊。似乎 BASE 唯一合理的定義是"非 ACID"；即，它幾乎可以代表任何你想要的東西。）

讓我們深入瞭解原子性、一致性、隔離性和永續性的定義，這將讓我們提煉出事務的思想。

#### 原子性 {#sec_transactions_acid_atomicity}

一般來說，*原子*是指不能分解成更小部分的東西。這個詞在計算機的不同分支中意味著相似但又微妙不同的東西。例如，在多執行緒程式設計中，如果一個執行緒執行原子操作，這意味著另一個執行緒無法看到該操作的半完成結果。系統只能處於操作之前或操作之後的狀態，而不是介於兩者之間。

相比之下，在 ACID 的上下文中，原子性*不是*關於併發的。它不描述如果幾個程序試圖同時訪問相同的資料會發生什麼，因為這包含在字母 *I*（*隔離性*）中（參見["隔離性"](#sec_transactions_acid_isolation)）。

相反，ACID 原子性描述了當客戶端想要進行多次寫入，但在某些寫入被處理後發生故障時會發生什麼——例如，程序崩潰、網路連線中斷、磁碟變滿或違反了某些完整性約束。如果這些寫入被分組到一個原子事務中，並且由於故障無法完成（*提交*）事務，則事務被*中止*，資料庫必須丟棄或撤消該事務中迄今為止所做的任何寫入。

如果沒有原子性，如果在進行多處更改的中途發生錯誤，很難知道哪些更改已經生效，哪些沒有。應用程式可以重試，但這有進行兩次相同更改的風險，導致資料重複或錯誤。原子性簡化了這個問題：如果事務被中止，應用程式可以確定它沒有改變任何東西，因此可以安全地重試。

在錯誤時中止事務並丟棄該事務的所有寫入的能力是 ACID 原子性的定義特徵。也許*可中止性*比*原子性*更好，但我們將堅持使用*原子性*，因為這是常用詞。

#### 一致性 {#sec_transactions_acid_consistency}

*一致性*這個詞被嚴重濫用：

* 在[第 6 章](/tw/ch6#ch_replication)中，我們討論了*副本一致性*和非同步複製系統中出現的*最終一致性*問題（參見["複製延遲的問題"](/tw/ch6#sec_replication_lag)）。
* 資料庫的*一致快照*（例如，用於備份）是整個資料庫在某一時刻存在的快照。更準確地說，它與先發生關係（happens-before relation）一致（參見["“先發生”關係和併發"](/tw/ch6#sec_replication_happens_before)）：也就是說，如果快照包含在特定時間寫入的值，那麼它也反映了在該值寫入之前發生的所有寫入。
* *一致性雜湊*是某些系統用於再平衡的分片方法（參見["一致性雜湊"](/tw/ch7#sec_sharding_consistent_hashing)）。
* 在 CAP 定理中（參見[第 10 章](/tw/ch10#ch_consistency)），*一致性*一詞用於表示*線性一致性*（參見["線性一致性"](/tw/ch10#sec_consistency_linearizability)）。
* 在 ACID 的上下文中，*一致性*是指應用程式特定的資料庫處於"良好狀態"的概念。

不幸的是，同一個詞至少有五種不同的含義。

ACID 一致性的思想是，你對資料有某些陳述（*不變式*）必須始終為真——例如，在會計系統中，所有賬戶的貸方和借方必須始終平衡。如果事務從滿足這些不變式的有效資料庫開始，並且事務期間的任何寫入都保持有效性，那麼你可以確定不變式始終得到滿足。（不變式可能在事務執行期間暫時違反，但在事務提交時應該再次滿足。）

如果你希望資料庫強制執行你的不變式，你需要將它們宣告為模式的一部分的*約束*。例如，外部索引鍵約束、唯一性約束或檢查約束（限制單個行中可以出現的值）通常用於對特定型別的不變式建模。更複雜的一致性要求有時可以使用觸發器或物化檢視建模[^12]。

然而，複雜的不變式可能很難或不可能使用資料庫通常提供的約束來建模。在這種情況下，應用程式有責任正確定義其事務，以便它們保持一致性。如果你寫入違反不變式的錯誤資料，但你沒有宣告這些不變式，資料庫無法阻止你。因此，ACID 中的 C 通常取決於應用程式如何使用資料庫，而不僅僅是資料庫的屬性。

#### 隔離性 {#sec_transactions_acid_isolation}

大多數資料庫都會同時被多個客戶端訪問。如果它們讀寫資料庫的不同部分，這沒有問題，但如果它們訪問相同的資料庫記錄，你可能會遇到併發問題（競態條件）。

[圖 8-1](#fig_transactions_increment) 是這種問題的一個簡單例子。假設你有兩個客戶端同時遞增儲存在資料庫中的計數器。每個客戶端需要讀取當前值，加 1，然後寫回新值（假設資料庫中沒有內建的遞增操作）。在[圖 8-1](#fig_transactions_increment) 中，計數器應該從 42 增加到 44，因為發生了兩次遞增，但實際上由於競態條件只增加到 43。

{{< figure src="/fig/ddia_0801.png" id="fig_transactions_increment" caption="圖 8-1. 兩個客戶端併發遞增計數器之間的競態條件。" class="w-full my-4" >}}


ACID 意義上的*隔離性*意味著同時執行的事務彼此隔離：它們不能相互干擾。經典的資料庫教科書將隔離性形式化為*可序列化*，這意味著每個事務可以假裝它是唯一在整個資料庫上執行的事務。資料庫確保當事務已經提交時，結果與它們*序列*執行（一個接一個）相同，即使實際上它們可能是併發執行的[^13]。

然而，可序列化有效能成本。在實踐中，許多資料庫使用比可序列化更弱的隔離形式：也就是說，它們允許併發事務以有限的方式相互干擾。一些流行的資料庫，如 Oracle，甚至沒有實現它（Oracle 有一個稱為"可序列化"的隔離級別，但它實際上實現了*快照隔離*，這是比可序列化更弱的保證[^10] [^14]）。這意味著某些型別的競態條件仍然可能發生。我們將在["弱隔離級別"](#sec_transactions_isolation_levels)中探討快照隔離和其他形式的隔離。

#### 永續性 {#durability}

資料庫系統的目的是提供一個安全的地方來儲存資料，而不用擔心丟失它。*永續性*是一個承諾，即一旦事務成功提交，它寫入的任何資料都不會被遺忘，即使發生硬體故障或資料庫崩潰。

在單節點資料庫中，永續性通常意味著資料已經寫入非易失性儲存，如硬碟或 SSD。普通的檔案寫入通常在傳送到磁碟之前會先在記憶體中緩衝，這意味著如果突然斷電它們將丟失；因此，許多資料庫使用 `fsync()` 系統呼叫來確保資料真正寫入磁碟。資料庫通常還有預寫日誌或類似的機制（參見["使 B 樹可靠"](/tw/ch4#sec_storage_btree_wal)），這允許它們在寫入過程中發生崩潰時恢復。

在複製資料庫中，永續性可能意味著資料已成功複製到某些節點。為了提供永續性保證，資料庫必須等到這些寫入或複製完成，然後才報告事務成功提交。然而，如["可靠性和容錯"](/tw/ch2#sec_introduction_reliability)中所討論的，完美的永續性不存在：如果所有硬碟和所有備份同時被銷燬，顯然你的資料庫無法挽救你。

--------

<a id="sidebar_transactions_durability"></a>

> [!TIP] 複製與永續性

歷史上，永續性意味著寫入歸檔磁帶。然後它被理解為寫入磁碟或 SSD。最近，它的含義又演變為複製。哪種實現更好？

事實是，沒有什麼是完美的：

* 如果你寫入磁碟而機器宕機，即使你的資料沒有丟失，在你修復機器或將磁碟轉移到另一臺機器之前，它也是不可訪問的。複製系統可以保持可用。
* 相關故障——停電或導致每個節點在特定輸入上崩潰的錯誤——可以一次性摧毀所有副本（參見["可靠性和容錯"](/tw/ch2#sec_introduction_reliability)），失去任何僅在記憶體中的資料。因此，寫入磁碟對於複製資料庫仍然相關。
* 在非同步複製系統中，當領導者變得不可用時，最近的寫入可能會丟失（參見["處理節點故障"](/tw/ch6#sec_replication_failover)）。
* 當電源突然切斷時，已有研究表明 SSD 尤其可能違反它們本應提供的保證：即使 `fsync` 也不能保證正常工作[^15]。磁碟韌體可能有錯誤，就像任何其他型別的軟體一樣[^16] [^17]，例如，導致驅動器在正好執行 32,768 小時後失效[^18]。而且 `fsync` 很難正確使用；就連 PostgreSQL 也錯誤地使用了它超過 20 年[^19] [^20] [^21]。
* 儲存引擎和檔案系統實現之間的微妙互動可能導致難以追蹤的錯誤，並可能導致磁碟上的檔案在崩潰後損壞[^22] [^23]。一個副本上的檔案系統錯誤有時也會傳播到其他副本[^24]。
* 磁碟上的資料可能在未被檢測到的情況下逐漸損壞[^25]。如果資料已經損壞了一段時間，副本和最近的備份也可能損壞。在這種情況下，你需要嘗試從歷史備份中恢復資料。
* 一項關於 SSD 的研究發現，在前四年的執行中，30% 到 80% 的驅動器會出現至少一個壞塊，其中只有一些可以透過韌體糾正[^26]。磁碟驅動器的壞扇區率較低，但完全故障率高於 SSD。
* 當磨損的 SSD（經歷了許多寫/擦除週期）斷電時，它可能在幾周到幾個月的時間尺度上開始丟失資料，具體取決於溫度[^27]。對於磨損水平較低的驅動器，這不是問題[^28]。

在實踐中，沒有一種技術可以提供絕對保證。只有各種降低風險的技術，包括寫入磁碟、複製到遠端機器和備份——它們可以而且應該一起使用。一如既往，明智的做法是對任何理論上的"保證"持健康的懷疑態度。

--------

### 單物件與多物件操作 {#sec_transactions_multi_object}

回顧一下，在 ACID 中，原子性和隔離性描述了如果客戶端在同一事務中進行多次寫入，資料庫應該做什麼：

原子性
: 如果在寫入序列的中途發生錯誤，事務應該被中止，並且到該點為止所做的寫入應該被丟棄。換句話說，資料庫讓你免於擔心部分失敗，透過提供全有或全無的保證。

隔離性
: 併發執行的事務不應該相互干擾。例如，如果一個事務進行多次寫入，那麼另一個事務應該要麼看到全部這些寫入，要麼一個都看不到，而不是只看到其中一部分。

這些定義假設你想要同時修改多個物件（行、文件、記錄）。如果需要保持多塊資料同步，通常就需要這種*多物件事務*。[圖 8-2](#fig_transactions_read_uncommitted) 顯示了一個來自電子郵件應用程式的示例。要顯示使用者的未讀訊息數，你可以查詢類似這樣的內容：

```
SELECT COUNT(*) FROM emails WHERE recipient_id = 2 AND unread_flag = true
```

{{< figure src="/fig/ddia_0802.png" id="fig_transactions_read_uncommitted" caption="圖 8-2. 違反隔離性：一個事務讀取另一個事務的未提交寫入（“髒讀”）。" class="w-full my-4" >}}


然而，如果有很多電子郵件，你可能會發現這個查詢太慢，並決定將未讀訊息的數量儲存在一個單獨的欄位中（一種反正規化，我們在["正規化、反正規化和連線"](/tw/ch3#sec_datamodels_normalization)中討論）。現在，每當有新訊息進來時，你必須增加未讀計數器，每當訊息被標記為已讀時，你也必須減少未讀計數器。

在[圖 8-2](#fig_transactions_read_uncommitted) 中，使用者 2 遇到了異常：郵箱列表顯示有未讀訊息，但計數器顯示零未讀訊息，因為計數器增量尚未發生。（如果電子郵件應用程式中的錯誤計數器看起來太微不足道，請考慮客戶賬戶餘額而不是未讀計數器，以及支付事務而不是電子郵件。）隔離本可以透過確保使用者 2 看到插入的電子郵件和更新的計數器，或者兩者都不看到，但不是不一致的中間點，來防止這個問題。

[圖 8-3](#fig_transactions_atomicity) 說明了對原子性的需求：如果在事務過程中某處發生錯誤，郵箱的內容和未讀計數器可能會失去同步。在原子事務中，如果對計數器的更新失敗，事務將被中止，插入的電子郵件將被回滾。

{{< figure src="/fig/ddia_0803.png" id="fig_transactions_atomicity" caption="圖 8-3. 原子性確保如果發生錯誤，該事務的任何先前寫入都會被撤消，以避免不一致的狀態。" class="w-full my-4" >}}


多物件事務需要某種方式來確定哪些讀寫操作屬於同一事務。在關係資料庫中，這通常基於客戶端與資料庫伺服器的 TCP 連線：在任何特定連線上，`BEGIN TRANSACTION` 和 `COMMIT` 語句之間的所有內容都被認為是同一事務的一部分。如果 TCP 連線中斷，事務必須被中止。

另一方面，許多非關係資料庫沒有這樣的方式來將操作組合在一起。即使有多物件 API（例如，鍵值儲存可能有一個*多重放置*操作，在一個操作中更新多個鍵），這並不一定意味著它具有事務語義：該命令可能在某些鍵上成功而在其他鍵上失敗，使資料庫處於部分更新狀態。

#### 單物件寫入 {#sec_transactions_single_object}

當單個物件被更改時，原子性和隔離性也適用。例如，假設你正在向資料庫寫入 20 KB 的 JSON 文件：

* 如果在傳送了前 10 KB 後網路連線中斷，資料庫是否儲存了無法解析的 10 KB JSON 片段？
* 如果資料庫正在覆蓋磁碟上的先前值的過程中電源失效，你是否最終會將新舊值拼接在一起？
* 如果另一個客戶端在寫入過程中讀取該文件，它會看到部分更新的值嗎？

這些問題會令人非常困惑，因此儲存引擎幾乎普遍的目標是在一個節點上的單個物件（如鍵值對）上提供原子性和隔離性。原子性可以使用日誌實現崩潰恢復（參見["使 B 樹可靠"](/tw/ch4#sec_storage_btree_wal)），隔離性可以使用每個物件上的鎖來實現（一次只允許一個執行緒訪問物件）。

某些資料庫還提供更複雜的原子操作，例如遞增操作，它消除了像[圖 8-1](#fig_transactions_increment) 中那樣的讀-修改-寫迴圈的需求。類似流行的是*條件寫入*操作，它允許僅在值未被其他人併發更改時才進行寫入（參見["條件寫入（比較並設定）"](#sec_transactions_compare_and_set)），類似於共享記憶體併發中的比較並設定或比較並交換（CAS）操作。

--------

> [!NOTE]
> 嚴格來說，術語*原子遞增*在多執行緒程式設計的意義上使用了*原子*這個詞。在 ACID 的上下文中，它實際上應該被稱為*隔離*或*可序列化*遞增，但這不是通常的術語。

--------

這些單物件操作很有用，因為它們可以防止多個客戶端嘗試同時寫入同一物件時的丟失更新（參見["防止丟失更新"](#sec_transactions_lost_update)）。然而，它們不是通常意義上的事務。例如，Cassandra 和 ScyllaDB 的"輕量級事務"功能以及 Aerospike 的"強一致性"模式在單個物件上提供線性一致（參見["線性一致性"](/tw/ch10#sec_consistency_linearizability)）讀取和條件寫入，但不保證跨多個物件。

#### 多物件事務的需求 {#sec_transactions_need}

我們是否需要多物件事務？是否可能僅使用鍵值資料模型和單物件操作來實現任何應用程式？

在某些用例中，單物件插入、更新和刪除就足夠了。然而，在許多其他情況下，需要協調對多個不同物件的寫入：

* 在關係資料模型中，一個表中的行通常具有對另一個表中行的外部索引鍵引用。類似地，在類似圖的資料模型中，頂點具有指向其他頂點的邊。多物件事務允許你確保這些引用保持有效：插入引用彼此的多個記錄時，外部索引鍵必須正確且最新，否則資料變得毫無意義。
* 在文件資料模型中，需要一起更新的欄位通常在同一文件內，它被視為單個物件——更新單個文件時不需要多物件事務。然而，缺乏連線功能的文件資料庫也鼓勵反正規化（參見["何時使用哪種模型"](/tw/ch3#sec_datamodels_document_summary)）。當需要更新反正規化資訊時，如[圖 8-2](#fig_transactions_read_uncommitted) 的示例，你需要一次更新多個文件。事務在這種情況下非常有用，可以防止反正規化資料失去同步。
* 在具有二級索引的資料庫中（幾乎除了純鍵值儲存之外的所有資料庫），每次更改值時都需要更新索引。從事務的角度來看，這些索引是不同的資料庫物件：例如，如果沒有事務隔離，記錄可能出現在一個索引中但不在另一個索引中，因為對第二個索引的更新尚未發生（參見["分片與二級索引"](/tw/ch7#sec_sharding_secondary_indexes)）。

這些應用程式仍然可以在沒有事務的情況下實現。然而，沒有原子性的錯誤處理變得更加複雜，缺乏隔離性可能導致併發問題。我們將在["弱隔離級別"](#sec_transactions_isolation_levels)中討論這些問題，並在["派生資料與分散式事務"](/tw/ch13#sec_future_derived_vs_transactions)中探索替代方法。

#### 處理錯誤和中止 {#handling-errors-and-aborts}

事務的一個關鍵特性是，如果發生錯誤，它可以被中止並安全地重試。ACID 資料庫基於這樣的哲學：如果資料庫有違反其原子性、隔離性或永續性保證的危險，它寧願完全放棄事務，也不允許它保持半完成狀態。

然而，並非所有系統都遵循這種哲學。特別是，具有無主（無領導者）複製的資料儲存（參見["無主（無領導者）複製"](/tw/ch6#sec_replication_leaderless)）更多地以"盡力而為"的方式工作，可以總結為"資料庫將盡其所能，如果遇到錯誤，它不會撤消已經完成的操作"——因此，從錯誤中恢復是應用程式的責任。

錯誤不可避免地會發生，但許多軟體開發人員更願意只考慮快樂路徑，而不是錯誤處理的複雜性。例如，流行的物件關係對映（ORM）框架，如 Rails 的 ActiveRecord 和 Django，不會重試中止的事務——錯誤通常導致異常冒泡到堆疊中，因此任何使用者輸入都被丟棄，使用者收到錯誤訊息。這是一種遺憾，因為中止的全部意義是啟用安全重試。

儘管重試中止的事務是一種簡單有效的錯誤處理機制，但它並不完美：

* 如果事務實際上成功了，但在伺服器嘗試向客戶端確認成功提交時網路中斷（因此從客戶端的角度來看超時），那麼重試事務會導致它被執行兩次——除非你有額外的應用程式級去重機制。
* 如果錯誤是由於過載或併發事務之間的高爭用，重試事務會使問題變得更糟，而不是更好。為了避免這種反饋迴圈，你可以限制重試次數，使用指數退避，並以不同的方式處理與過載相關的錯誤與其他錯誤（參見["當過載系統無法恢復時"](/tw/ch2#sidebar_metastable)）。
* 僅在瞬態錯誤後重試才值得（例如，由於死鎖、隔離違規、臨時網路中斷和故障轉移）；在永久錯誤後（例如，約束違規）重試將毫無意義。
* 如果事務在資料庫之外也有副作用，即使事務被中止，這些副作用也可能發生。例如，如果你正在傳送電子郵件，你不會希望每次重試事務時都再次傳送電子郵件。如果你想確保幾個不同的系統一起提交或中止，兩階段提交可以提供幫助（我們將在["兩階段提交（2PC）"](#sec_transactions_2pc)中討論這個問題）。
* 如果客戶端程序在重試時崩潰，它試圖寫入資料庫的任何資料都會丟失。


## 弱隔離級別 {#sec_transactions_isolation_levels}

如果兩個事務不訪問相同的資料，或者都是隻讀的，它們可以安全地並行執行，因為它們互不依賴。僅當一個事務讀取另一個事務併發修改的資料時，或者當兩個事務嘗試同時修改相同的資料時，才會出現併發問題（競態條件）。

併發錯誤很難透過測試發現，因為這些錯誤只有在時機不巧時才會觸發。這種時機問題可能非常罕見，通常難以重現。併發也很難推理，特別是在大型應用程式中，你不一定知道程式碼的其他部分正在訪問資料庫。如果只有一個使用者，應用程式開發就已經夠困難了；有許多併發使用者會讓情況變得更加困難，因為任何資料都可能在任何時候意外地發生變化。

出於這個原因，資料庫長期以來一直試圖透過提供*事務隔離*來向應用程式開發人員隱藏併發問題。理論上，隔離應該讓你的生活更輕鬆，讓你假裝沒有併發發生：*可序列化*隔離意味著資料庫保證事務具有與*序列*執行（即一次一個，沒有任何併發）相同的效果。

在實踐中，隔離不幸並不那麼簡單。可序列化隔離有效能成本，許多資料庫不願意支付這個代價[^10]。因此，系統通常使用較弱的隔離級別，這些級別可以防止*某些*併發問題，但不是全部。這些隔離級別更難理解，它們可能導致微妙的錯誤，但它們在實踐中仍然被使用[^29]。

由弱事務隔離引起的併發錯誤不僅僅是理論問題。它們已經導致了鉅額資金損失[^30] [^31] [^32]，引發了金融審計師的調查[^33]，並導致客戶資料損壞[^34]。對此類問題披露的一個流行評論是"如果你正在處理金融資料，請使用 ACID 資料庫！"——但這沒有抓住重點。就連許多流行的關係資料庫系統（通常被認為是"ACID"）也使用弱隔離，因此它們不一定能防止這些錯誤發生。

--------

> [!NOTE]
> 順便說一句，銀行系統的大部分依賴於透過安全 FTP 交換的文字檔案[^35]。在這種情況下，擁有審計跟蹤和一些人為級別的欺詐預防措施實際上比 ACID 屬性更重要。

--------

這些例子還強調了一個重要觀點：即使併發問題在正常操作中很少見，你也必須考慮攻擊者故意向你的 API 傳送大量高度併發請求以故意利用併發錯誤的可能性[^30]。因此，為了構建可靠和安全的應用程式，你必須確保系統地防止此類錯誤。

在本節中，我們將研究實踐中使用的幾種弱（非可序列化）隔離級別，並詳細討論哪些競態條件可以發生和不能發生，以便你可以決定哪個級別適合你的應用程式。完成後，我們將詳細討論可序列化（參見["可序列化"](#sec_transactions_serializability)）。我們對隔離級別的討論將是非正式的，使用示例。如果你想要嚴格的定義和對其屬性的分析，你可以在學術文獻中找到它們[^36] [^37] [^38] [^39]。

### 讀已提交 {#sec_transactions_read_committed}

最基本的事務隔離級別是*讀已提交*。它提供兩個保證：

1. 從資料庫讀取時，你只會看到已經提交的資料（沒有*髒讀*）。
2. 寫入資料庫時，你只會覆蓋已經提交的資料（沒有*髒寫*）。

某些資料庫支援更弱的隔離級別，稱為*讀未提交*。它防止髒寫，但不防止髒讀。讓我們更詳細地討論這兩個保證。

#### 沒有髒讀 {#no-dirty-reads}

想象一個事務已經向資料庫寫入了一些資料，但事務尚未提交或中止。另一個事務能看到那個未提交的資料嗎？如果能，這稱為*髒讀*[^3]。

在讀已提交隔離級別下執行的事務必須防止髒讀。這意味著事務的任何寫入只有在該事務提交時才對其他人可見（然後它的所有寫入立即變得可見）。這在[圖 8-4](#fig_transactions_read_committed) 中說明，其中使用者 1 已設定 *x* = 3，但使用者 2 的 *get x* 仍返回舊值 2，因為使用者 1 尚未提交。

{{< figure src="/fig/ddia_0804.png" id="fig_transactions_read_committed" caption="圖 8-4. 沒有髒讀：使用者 2 只有在使用者 1 的事務提交後才能看到 x 的新值。" class="w-full my-4" >}}

有幾個原因說明為什麼防止髒讀是有用的：

* 如果事務需要更新多行，髒讀意味著另一個事務可能看到某些更新但不是其他更新。例如，在[圖 8-2](#fig_transactions_read_uncommitted) 中，使用者看到新的未讀電子郵件但沒有看到更新的計數器。這是電子郵件的髒讀。看到資料庫處於部分更新狀態會讓使用者感到困惑，並可能導致其他事務做出錯誤的決定。
* 如果事務中止，它所做的任何寫入都需要回滾（如[圖 8-3](#fig_transactions_atomicity)）。如果資料庫允許髒讀，這意味著事務可能看到後來被回滾的資料——即從未實際提交到資料庫的資料。任何讀取未提交資料的事務也需要被中止，導致稱為*級聯中止*的問題。

#### 沒有髒寫 {#sec_transactions_dirty_write}

如果兩個事務併發嘗試更新資料庫中的同一行會發生什麼？我們不知道寫入將以什麼順序發生，但我們通常假設後面的寫入會覆蓋前面的寫入。

然而，如果前面的寫入是尚未提交的事務的一部分，因此後面的寫入覆蓋了一個未提交的值，會發生什麼？這稱為*髒寫*[^36]。在讀已提交隔離級別下執行的事務必須防止髒寫，通常透過延遲第二個寫入直到第一個寫入的事務已提交或中止。

透過防止髒寫，這個隔離級別避免了某些型別的併發問題：

* 如果事務更新多行，髒寫可能導致糟糕的結果。例如，考慮[圖 8-5](#fig_transactions_dirty_writes)，它說明了一個二手車銷售網站，兩個人 Aaliyah 和 Bryce 同時嘗試購買同一輛車。購買汽車需要兩次資料庫寫入：網站上的列表需要更新以反映買家，銷售發票需要傳送給買家。在[圖 8-5](#fig_transactions_dirty_writes) 的情況下，銷售被授予 Bryce（因為他對 `listings` 表執行了獲勝的更新），但發票被傳送給 Aaliyah（因為她對 `invoices` 表執行了獲勝的更新）。讀已提交防止了這種事故。
* 然而，讀已提交*不*防止[圖 8-1](#fig_transactions_increment) 中兩個計數器遞增之間的競態條件。在這種情況下，第二個寫入發生在第一個事務提交之後，所以它不是髒寫。它仍然是不正確的，但原因不同——在["防止丟失更新"](#sec_transactions_lost_update)中，我們將討論如何使此類計數器遞增安全。

{{< figure src="/fig/ddia_0805.png" id="fig_transactions_dirty_writes" caption="圖 8-5. 有了髒寫，來自不同事務的衝突寫入可能會混在一起。" class="w-full my-4" >}}


#### 實現讀已提交 {#sec_transactions_read_committed_impl}

讀已提交是一個非常流行的隔離級別。它是 Oracle Database、PostgreSQL、SQL Server 和許多其他資料庫中的預設設定[^10]。

最常見的是，資料庫透過使用行級鎖來防止髒寫：當事務想要修改特定行（或文件或其他物件）時，它必須首先獲取該行的鎖。然後它必須持有該鎖直到事務提交或中止。任何給定行只能有一個事務持有鎖；如果另一個事務想要寫入同一行，它必須等到第一個事務提交或中止後才能獲取鎖並繼續。這種鎖定由資料庫在讀已提交模式（或更強的隔離級別）下自動完成。

我們如何防止髒讀？一種選擇是使用相同的鎖，並要求任何想要讀取行的事務短暫地獲取鎖，然後在讀取後立即再次釋放它。這將確保在行具有髒的、未提交的值時無法進行讀取（因為在此期間鎖將由進行寫入的事務持有）。

然而，要求讀鎖的方法在實踐中效果不佳，因為一個長時間執行的寫事務可以強制許多其他事務等待，直到長時間執行的事務完成，即使其他事務只讀取並且不向資料庫寫入任何內容。這會損害只讀事務的響應時間，並且對可操作性不利：應用程式一個部分的減速可能會由於等待鎖而在應用程式的完全不同部分產生連鎖效應。

儘管如此，在某些資料庫中使用鎖來防止髒讀，例如 IBM Db2 和 Microsoft SQL Server 在 `read_committed_snapshot=off` 設定中[^29]。

防止髒讀的更常用方法是[圖 8-4](#fig_transactions_read_committed) 中說明的方法：對於每個被寫入的行，資料庫記住舊的已提交值和當前持有寫鎖的事務設定的新值。當事務正在進行時，任何其他讀取該行的事務都只是被給予舊值。只有當新值被提交時，事務才會切換到讀取新值（有關更多詳細資訊，請參見["多版本併發控制（MVCC）"](#sec_transactions_snapshot_impl)）。

### 快照隔離與可重複讀 {#sec_transactions_snapshot_isolation}

如果只是粗略地看待讀已提交隔離，你很容易以為它做了事務需要做的一切：它允許中止（原子性所需），它防止讀取事務的不完整結果，並且它防止併發寫入混淆。確實，這些是有用的功能，比沒有事務的系統能獲得的保證要強得多。

然而，使用這個隔離級別時，仍然有很多方式可能出現併發錯誤。例如，[圖 8-6](#fig_transactions_item_many_preceders) 說明了讀已提交可能發生的問題。

{{< figure src="/fig/ddia_0806.png" id="fig_transactions_item_many_preceders" caption="圖 8-6. 讀取偏差：Aaliyah 觀察到資料庫處於不一致狀態。" class="w-full my-4" >}}


假設 Aaliyah 在銀行有 1,000 美元的儲蓄，分成兩個賬戶，每個 500 美元。現在一筆事務從她的一個賬戶轉賬 100 美元到另一個賬戶。如果她不幸在該事務處理的同時檢視她的賬戶餘額列表，她可能會看到一個賬戶餘額在收款到達之前（餘額為 500 美元），另一個賬戶在轉出之後（新餘額為 400 美元）。對 Aaliyah 來說，現在她的賬戶總共只有 900 美元——似乎 100 美元憑空消失了。

這種異常稱為*讀取偏差*，它是*不可重複讀*的一個例子：如果 Aaliyah 在事務結束時再次讀取賬戶 1 的餘額，她會看到與之前查詢中看到的不同的值（600 美元）。讀取偏差在讀已提交隔離下被認為是可接受的：Aaliyah 看到的賬戶餘額確實是在她讀取它們時已提交的。

--------

> [!NOTE]
> 術語*偏斜*（skew）不幸地被過載了：我們之前在*具有熱點的不平衡工作負載*的意義上使用它（參見["偏斜的工作負載與緩解熱點"](/tw/ch7#sec_sharding_skew)），而這裡的*讀取偏差*（read skew）指的是*時序異常*。

--------

在 Aaliyah 的情況下，這不是一個持久的問題，因為如果她幾秒鐘後重新載入線上銀行網站，她很可能會看到一致的賬戶餘額。然而，某些情況不能容忍這種臨時的不一致性：

備份
: 進行備份需要複製整個資料庫，對於大型資料庫可能需要幾個小時。在備份過程執行期間，寫入將繼續對資料庫進行。因此，你最終可能會得到備份的某些部分包含較舊版本的資料，而其他部分包含較新版本。如果你需要從這樣的備份恢復，不一致性（如消失的錢）將變成永久性的。

分析查詢和完整性檢查
: 有時，你可能想要執行掃描資料庫大部分的查詢。此類查詢在分析中很常見（參見["分析與運營系統"](/tw/ch1#sec_introduction_analytics)），或者可能是定期完整性檢查的一部分，以確保一切正常（監控資料損壞）。如果這些查詢在不同時間點觀察資料庫的不同部分，它們很可能返回無意義的結果。

*快照隔離*[^36] 是解決這個問題的最常見方法。其思想是每個事務從資料庫的*一致快照*讀取——也就是說，事務看到事務開始時資料庫中已提交的所有資料。即使資料隨後被另一個事務更改，每個事務也只能看到該特定時間點的舊資料。

快照隔離對於長時間執行的只讀查詢（如備份和分析）來說是一個福音。如果查詢操作的資料在查詢執行的同時發生變化，很難推理查詢的含義。當事務可以看到資料庫的一致快照（凍結在特定時間點）時，理解起來就容易得多。

快照隔離是一個流行的功能：它的變體受到 PostgreSQL、使用 InnoDB 儲存引擎的 MySQL、Oracle、SQL Server 等的支援，儘管詳細行為因系統而異[^29] [^40] [^41]。某些資料庫，如 Oracle、TiDB 和 Aurora DSQL，甚至選擇快照隔離作為它們的最高隔離級別。

#### 多版本併發控制（MVCC） {#sec_transactions_snapshot_impl}

與讀已提交隔離一樣，快照隔離的實現通常使用寫鎖來防止髒寫（參見["實現讀已提交"](#sec_transactions_read_committed_impl)），這意味著進行寫入的事務可以阻止寫入同一行的另一個事務的進度。但是，讀取不需要任何鎖。從效能的角度來看，快照隔離的一個關鍵原則是*讀者永遠不會阻塞寫者，寫者永遠不會阻塞讀者*。這允許資料庫在一致快照上處理長時間執行的讀查詢，同時正常處理寫入，兩者之間沒有任何鎖爭用。

為了實現快照隔離，資料庫使用了我們在[圖 8-4](#fig_transactions_read_committed) 中看到的防止髒讀機制的泛化。資料庫必須潛在地保留每行的幾個不同的已提交版本，而不是每行的兩個版本（已提交版本和被覆蓋但尚未提交的版本），因為各種正在進行的事務可能需要在不同時間點看到資料庫的狀態。因為它並排維護一行的多個版本，所以這種技術被稱為*多版本併發控制*（MVCC）。

[圖 8-7](#fig_transactions_mvcc) 說明了 PostgreSQL 中如何實現基於 MVCC 的快照隔離[^40] [^42] [^43]（其他實現類似）。當事務啟動時，它被賦予一個唯一的、始終遞增的事務 ID（`txid`）。每當事務向資料庫寫入任何內容時，它寫入的資料都用寫入者的事務 ID 標記。（準確地說，PostgreSQL 中的事務 ID 是 32 位整數，因此它們在大約 40 億個事務後溢位。清理過程執行清理以確保溢位不會影響資料。）

{{< figure src="/fig/ddia_0807.png" id="fig_transactions_mvcc" caption="圖 8-7. 使用多版本併發控制實現快照隔離。" class="w-full my-4" >}}


表中的每一行都有一個 `inserted_by` 欄位，包含將此行插入表中的事務的 ID。此外，每行都有一個 `deleted_by` 欄位，最初為空。如果事務刪除一行，該行實際上不會從資料庫中刪除，而是透過將 `deleted_by` 欄位設定為請求刪除的事務的 ID 來標記為刪除。在稍後的某個時間，當確定沒有事務可以再訪問已刪除的資料時，資料庫中的垃圾收集過程會刪除任何標記為刪除的行並釋放它們的空間。

更新在內部被轉換為刪除和插入[^44]。例如，在[圖 8-7](#fig_transactions_mvcc) 中，事務 13 從賬戶 2 中扣除 100 美元，將餘額從 500 美元更改為 400 美元。`accounts` 表現在實際上包含賬戶 2 的兩行：餘額為 500 美元的行被事務 13 標記為已刪除，餘額為 400 美元的行由事務 13 插入。

行的所有版本都儲存在同一個資料庫堆中（參見["在索引中儲存值"](/tw/ch4#sec_storage_index_heap)），無論寫入它們的事務是否已提交。同一行的版本形成一個連結串列，從最新版本到最舊版本或相反，以便查詢可以在內部迭代行的所有版本[^45] [^46]。

#### 觀察一致快照的可見性規則 {#sec_transactions_mvcc_visibility}

當事務從資料庫讀取時，事務 ID 用於決定它可以看到哪些行版本以及哪些是不可見的。透過仔細定義可見性規則，資料庫可以向應用程式呈現資料庫的一致快照。這大致如下工作[^43]：

1. 在每個事務開始時，資料庫列出當時正在進行（尚未提交或中止）的所有其他事務。這些事務所做的任何寫入都被忽略，即使事務隨後提交。這確保我們看到一個不受另一個事務提交影響的一致快照。
2. 具有較晚事務 ID（即在當前事務開始後開始，因此不包括在正在進行的事務列表中）的事務所做的任何寫入都被忽略，無論這些事務是否已提交。
3. 中止事務所做的任何寫入都被忽略，無論該中止何時發生。這樣做的好處是，當事務中止時，我們不需要立即從儲存中刪除它寫入的行，因為可見性規則會將它們過濾掉。垃圾收集過程可以稍後刪除它們。
4. 所有其他寫入對應用程式的查詢可見。

這些規則適用於行的插入和刪除。在[圖 8-7](#fig_transactions_mvcc) 中，當事務 12 從賬戶 2 讀取時，它看到 500 美元的餘額，因為 500 美元餘額的刪除是由事務 13 進行的（根據規則 2，事務 12 無法看到事務 13 進行的刪除），而 400 美元餘額的插入尚不可見（根據相同的規則）。

換句話說，如果以下兩個條件都為真，則行是可見的：

* 在讀者事務開始時，插入該行的事務已經提交。
* 該行未標記為刪除，或者如果是，請求刪除的事務在讀者事務開始時尚未提交。

長時間執行的事務可能會長時間繼續使用快照，繼續讀取（從其他事務的角度來看）早已被覆蓋或刪除的值。透過永遠不更新原地的值，而是在每次更改值時插入新版本，資料庫可以提供一致的快照，同時只產生很小的開銷。

<a id="sec_transactions_snapshot_indexes"></a>

#### 索引與快照隔離 {#indexes-and-snapshot-isolation}

索引如何在多版本資料庫中工作？最常見的方法是每個索引條目指向與該條目匹配的行的一個版本（最舊或最新版本）。每個行版本可能包含對下一個最舊或下一個最新版本的引用。使用索引的查詢必須迭代各個行版本，以找到既可見、其值又與查詢條件匹配的版本。當垃圾收集刪除不再對任何事務可見的舊行版本時，相應的索引條目也可以被刪除。

許多實現細節影響多版本併發控制的效能[^45] [^46]。例如，如果同一行的不同版本可以適合同一頁面，PostgreSQL 有避免索引更新的最佳化[^40]。其他一些資料庫避免儲存修改行的完整副本，而只儲存版本之間的差異以節省空間。

CouchDB、Datomic 和 LMDB 使用另一種方法。儘管它們也使用 B 樹（參見["B 樹"](/tw/ch4#sec_storage_b_trees)），但它們使用*不可變*（寫時複製）變體，在更新時不會覆蓋樹的頁面，而是建立每個修改頁面的新副本。父頁面，直到樹的根，被複制並更新以指向其子頁面的新版本。任何不受寫入影響的頁面都不需要複製，並且可以與新樹共享[^47]。

使用不可變 B 樹，每個寫事務（或事務批次）都會建立一個新的 B 樹根，特定的根是建立時資料庫的一致快照。不需要基於事務 ID 過濾行，因為後續寫入無法修改現有的 B 樹；它們只能建立新的樹根。這種方法還需要後臺程序進行壓縮和垃圾收集。

#### 快照隔離、可重複讀和命名混淆 {#snapshot-isolation-repeatable-read-and-naming-confusion}

MVCC 是資料庫常用的實現技術，通常用於實現快照隔離。然而，不同的資料庫有時使用不同的術語來指代同一件事：例如，快照隔離在 PostgreSQL 中稱為"可重複讀"，在 Oracle 中稱為"可序列化"[^29]。有時不同的系統使用相同的術語來表示不同的東西：例如，雖然在 PostgreSQL 中"可重複讀"意味著快照隔離，但在 MySQL 中它意味著比快照隔離更弱一致性的 MVCC 實現[^41]。

這種命名混淆的原因是 SQL 標準沒有快照隔離的概念，因為該標準基於 System R 1975 年的隔離級別定義[^3]，而快照隔離當時還沒有被發明。相反，它定義了可重複讀，表面上看起來類似於快照隔離。PostgreSQL 將其快照隔離級別稱為"可重複讀"，因為它符合標準的要求，因此他們可以聲稱符合標準。

不幸的是，SQL 標準對隔離級別的定義是有缺陷的——它是模糊的、不精確的，並且不像標準應該的那樣獨立於實現[^36]。即使幾個資料庫實現了可重複讀，它們實際提供的保證也有很大差異，儘管表面上是標準化的[^29]。研究文獻中有可重複讀的正式定義[^37] [^38]，但大多數實現不滿足該正式定義。更糟的是，IBM Db2 使用"可重複讀"來指代可序列化[^10]。

因此，沒有人真正知道可重複讀意味著什麼。

### 防止丟失更新 {#sec_transactions_lost_update}

到目前為止，我們討論的讀已提交和快照隔離級別主要是關於只讀事務在併發寫入存在的情況下可以看到什麼的保證。我們大多忽略了兩個事務併發寫入的問題——我們只討論了髒寫（參見["沒有髒寫"](#sec_transactions_dirty_write)），這是可能發生的一種特定型別的寫-寫衝突。

併發寫入事務之間還可能發生其他幾種有趣的衝突。其中最著名的是*丟失更新*問題，在[圖 8-1](#fig_transactions_increment) 中以兩個併發計數器遞增的例子說明。

如果應用程式從資料庫讀取某個值，修改它，然後寫回修改後的值（*讀-修改-寫迴圈*），就會出現丟失更新問題。如果兩個事務併發執行此操作，其中一個修改可能會丟失，因為第二個寫入不包括第一個修改。（我們有時說後面的寫入*覆蓋*了前面的寫入。）這種模式出現在各種不同的場景中：

* 遞增計數器或更新賬戶餘額（需要讀取當前值，計算新值，並寫回更新的值）
* 對複雜值進行本地更改，例如，向 JSON 文件中的列表新增元素（需要解析文件，進行更改，並寫回修改後的文件）
* 兩個使用者同時編輯 wiki 頁面，每個使用者透過將整個頁面內容傳送到伺服器來儲存他們的更改，覆蓋資料庫中當前的任何內容

因為這是一個如此常見的問題，已經開發了各種解決方案[^48]。

#### 原子寫操作 {#atomic-write-operations}

許多資料庫提供原子更新操作，消除了在應用程式程式碼中實現讀-修改-寫迴圈的需要。如果你的程式碼可以用這些操作來表達，它們通常是最好的解決方案。例如，以下指令在大多數關係資料庫中是併發安全的：

```sql
UPDATE counters SET value = value + 1 WHERE key = 'foo';
```

類似地，文件資料庫（如 MongoDB）提供原子操作來對 JSON 文件的一部分進行本地修改，Redis 提供原子操作來修改資料結構（如優先順序佇列）。並非所有寫入都可以輕鬆地用原子操作來表達——例如，對 wiki 頁面的更新涉及任意文字編輯，可以使用["CRDT 和操作轉換"](/tw/ch6#sec_replication_crdts)中討論的演算法來處理——但在可以使用原子操作的情況下，它們通常是最佳選擇。

原子操作通常透過在讀取物件時對其進行獨佔鎖來實現，以便在應用更新之前沒有其他事務可以讀取它。另一種選擇是簡單地強制所有原子操作在單個執行緒上執行。

不幸的是，使用物件關係對映（ORM）框架時，很容易意外地編寫出執行不安全的讀-修改-寫迴圈的程式碼，而不是使用資料庫提供的原子操作[^49] [^50] [^51]。這可能是難以透過測試發現的微妙錯誤的來源。

#### 顯式鎖定 {#explicit-locking}

如果資料庫的內建原子操作不提供必要的功能，另一個防止丟失更新的選項是應用程式顯式鎖定要更新的物件。然後應用程式可以執行讀-修改-寫迴圈，如果任何其他事務嘗試併發更新或鎖定同一物件，它將被迫等到第一個讀-修改-寫迴圈完成。

例如，考慮一個多人遊戲，其中幾個玩家可以同時移動同一個棋子。在這種情況下，原子操作可能不夠，因為應用程式還需要確保玩家的移動遵守遊戲規則，這涉及一些你無法合理地作為資料庫查詢實現的邏輯。相反，你可以使用鎖來防止兩個玩家同時移動同一個棋子，如[例 8-1](#fig_transactions_select_for_update) 所示。

{{< figure id="fig_transactions_select_for_update" title="例 8-1. 顯式鎖定行以防止丟失更新" class="w-full my-4" >}}

```sql
BEGIN TRANSACTION;

SELECT * FROM figures
    WHERE name = 'robot' AND game_id = 222
    FOR UPDATE; ❶

-- 檢查移動是否有效，然後更新
-- 前一個 SELECT 返回的棋子的位置。
UPDATE figures SET position = 'c4' WHERE id = 1234;

COMMIT;
```

❶：`FOR UPDATE` 子句表示資料庫應該對此查詢返回的所有行進行鎖定。

這是有效的，但要正確執行，你需要仔細考慮你的應用程式邏輯。很容易忘記在程式碼中的某個地方新增必要的鎖，從而引入競態條件。

此外，如果你鎖定多個物件，則存在死鎖的風險，其中兩個或多個事務正在等待彼此釋放鎖。許多資料庫會自動檢測死鎖，並中止涉及的事務之一，以便系統可以取得進展。你可以在應用程式級別透過重試中止的事務來處理這種情況。

#### 自動檢測丟失的更新 {#automatically-detecting-lost-updates}

原子操作和鎖是透過強制讀-修改-寫迴圈按順序發生來防止丟失更新的方法。另一種選擇是允許它們並行執行，如果事務管理器檢測到丟失的更新，則中止事務並強制它重試其讀-修改-寫迴圈。

這種方法的一個優點是資料庫可以與快照隔離一起有效地執行此檢查。實際上，PostgreSQL 的可重複讀、Oracle 的可序列化和 SQL Server 的快照隔離級別會自動檢測何時發生丟失的更新並中止有問題的事務。然而，MySQL/InnoDB 的可重複讀不檢測丟失的更新[^29] [^41]。一些作者[^36] [^38] 認為資料庫必須防止丟失的更新才能提供快照隔離，因此根據這個定義，MySQL 不提供快照隔離。

丟失更新檢測是一個很好的功能，因為它不需要應用程式程式碼使用任何特殊的資料庫功能——你可能忘記使用鎖或原子操作從而引入錯誤，但丟失更新檢測會自動發生，因此不太容易出錯。但是，你還必須在應用程式級別重試中止的事務。

#### 條件寫入（比較並設定） {#sec_transactions_compare_and_set}

在不提供事務的資料庫中，你有時會發現一個*條件寫入*操作，它可以透過僅在值自你上次讀取以來未更改時才允許更新來防止丟失的更新（之前在["單物件寫入"](#sec_transactions_single_object)中提到）。如果當前值與你之前讀取的不匹配，則更新無效，必須重試讀-修改-寫迴圈。它是許多 CPU 支援的原子*比較並設定*或*比較並交換*（CAS）指令的資料庫等價物。

例如，為了防止兩個使用者同時更新同一個 wiki 頁面，你可以嘗試類似這樣的操作，期望僅當頁面內容自使用者開始編輯以來沒有更改時才進行更新：

```sql
-- 這可能安全也可能不安全，取決於資料庫實現
UPDATE wiki_pages SET content = 'new content'
    WHERE id = 1234 AND content = 'old content';
```

如果內容已更改並且不再匹配 `'old content'`，則此更新將無效，因此你需要檢查更新是否生效並在必要時重試。你也可以使用在每次更新時遞增的版本號列，並且僅在當前版本號未更改時才應用更新，而不是比較完整內容。這種方法有時稱為*樂觀鎖定*[^52]。

請注意，如果另一個事務併發修改了 `content`，則根據 MVCC 可見性規則，新內容可能不可見（參見["觀察一致快照的可見性規則"](#sec_transactions_mvcc_visibility)）。MVCC 的許多實現對此場景有可見性規則的例外，其中其他事務寫入的值對 `UPDATE` 和 `DELETE` 查詢的 `WHERE` 子句的評估可見，即使這些寫入在快照中不可見。

#### 衝突解決與複製 {#conflict-resolution-and-replication}

在複製資料庫中（參見[第 6 章](/tw/ch6#ch_replication)），防止丟失的更新具有另一個維度：由於它們在多個節點上有資料副本，並且資料可能在不同節點上併發修改，因此需要採取一些額外的步驟來防止丟失的更新。

鎖和條件寫入操作假設有一個最新的資料副本。然而，具有多領導者或無主（無領導者）複製的資料庫通常允許多個寫入併發發生並非同步複製它們，因此它們不能保證有一個最新的資料副本。因此，基於鎖或條件寫入的技術在此上下文中不適用。（我們將在["線性一致性"](/tw/ch10#sec_consistency_linearizability)中更詳細地重新討論這個問題。）

相反，如["處理衝突寫入"](/tw/ch6#sec_replication_write_conflicts)中所討論的，此類複製資料庫中的常見方法是允許併發寫入建立值的多個衝突版本（也稱為*兄弟節點*），並使用應用程式程式碼或特殊資料結構在事後解決和合並這些版本。

如果更新是可交換的（即，你可以在不同副本上以不同順序應用它們，仍然得到相同的結果），合併衝突值可以防止丟失的更新。例如，遞增計數器或向集合新增元素是可交換操作。這就是 CRDT 背後的想法，我們在["CRDT 和操作轉換"](/tw/ch6#sec_replication_crdts)中遇到過。然而，某些操作（如條件寫入）不能成為可交換的。

另一方面，*最後寫入勝利*（LWW）衝突解決方法容易丟失更新，如["最後寫入勝利（丟棄併發寫入）"](/tw/ch6#sec_replication_lww)中所討論的。不幸的是，LWW 是許多複製資料庫中的預設值。

### 寫偏差與幻讀 {#sec_transactions_write_skew}

在前面的部分中，我們看到了*髒寫*和*丟失更新*，這是當不同事務併發嘗試寫入相同物件時可能發生的兩種競態條件。為了避免資料損壞，需要防止這些競態條件——要麼由資料庫自動防止，要麼透過使用鎖或原子寫操作等手動保護措施。

然而，這並不是併發寫入之間可能發生的潛在競態條件列表的結尾。在本節中，我們將看到一些更微妙的衝突示例。

首先，想象這個例子：你正在為醫生編寫一個應用程式來管理他們在醫院的值班班次。醫院通常試圖在任何時候都有幾位醫生值班，但絕對必須至少有一位醫生值班。醫生可以放棄他們的班次（例如，如果他們自己生病了），前提是該班次中至少有一位同事留在值班[^53] [^54]。

現在想象 Aaliyah 和 Bryce 是特定班次的兩位值班醫生。兩人都感覺不舒服，所以他們都決定請假。不幸的是，他們碰巧大約在同一時間點選了下班的按鈕。接下來發生的事情如[圖 8-8](#fig_transactions_write_skew) 所示。

{{< figure src="/fig/ddia_0808.png" id="fig_transactions_write_skew" caption="圖 8-8. 寫偏差導致應用程式錯誤的示例。" class="w-full my-4" >}}


在每個事務中，你的應用程式首先檢查當前是否有兩個或更多醫生在值班；如果是，它假設一個醫生下班是安全的。由於資料庫使用快照隔離，兩個檢查都返回 `2`，因此兩個事務都繼續到下一階段。Aaliyah 更新她自己的記錄讓自己下班，Bryce 同樣更新他自己的記錄。兩個事務都提交，現在沒有醫生值班。你至少有一個醫生值班的要求被違反了。

#### 寫偏差的特徵 {#characterizing-write-skew}

這種異常稱為*寫偏差*[^36]。它既不是髒寫也不是丟失的更新，因為兩個事務正在更新兩個不同的物件（分別是 Aaliyah 和 Bryce 的值班記錄）。這裡發生衝突不太明顯，但這絕對是一個競態條件：如果兩個事務一個接一個地執行，第二個醫生將被阻止下班。異常行為只有在事務併發執行時才可能。

你可以將寫偏差視為丟失更新問題的概括。如果兩個事務讀取相同的物件，然後更新其中一些物件（不同的事務可能更新不同的物件），就會發生寫偏差。在不同事務更新同一物件的特殊情況下，你會得到髒寫或丟失更新異常（取決於時機）。

我們看到有各種不同的方法可以防止丟失的更新。對於寫偏差，我們的選擇更受限制：

* 原子單物件操作沒有幫助，因為涉及多個物件。
* 不幸的是，你在某些快照隔離實現中發現的丟失更新的自動檢測也沒有幫助：寫偏差在 PostgreSQL 的可重複讀、MySQL/InnoDB 的可重複讀、Oracle 的可序列化或 SQL Server 的快照隔離級別中不會自動檢測到[^29]。自動防止寫偏差需要真正的可序列化隔離（參見["可序列化"](#sec_transactions_serializability)）。
* 某些資料庫允許你配置約束，然後由資料庫強制執行（例如，唯一性、外部索引鍵約束或對特定值的限制）。但是，為了指定至少有一個醫生必須值班，你需要一個涉及多個物件的約束。大多數資料庫沒有對此類約束的內建支援，但你可能能夠使用觸發器或物化檢視實現它們，如["一致性"](#sec_transactions_acid_consistency)中所討論的[^12]。
* 如果你不能使用可序列化隔離級別，在這種情況下，第二好的選擇可能是顯式鎖定事務所依賴的行。在醫生示例中，你可以編寫如下內容：

    ```sql
    BEGIN TRANSACTION;

    SELECT * FROM doctors
        WHERE on_call = true
        AND shift_id = 1234 FOR UPDATE; ❶

    UPDATE doctors
       SET on_call = false
       WHERE name = 'Aaliyah'
       AND shift_id = 1234;

    COMMIT;
    ```

❶：和以前一樣，`FOR UPDATE` 告訴資料庫鎖定此查詢返回的所有行。

#### 寫偏差的更多例子 {#more-examples-of-write-skew}

寫偏差起初可能看起來是一個深奧的問題，但一旦你意識到它，你可能會注意到更多可能發生的情況。以下是更多示例：

會議室預訂系統
: 假設你想強制同一會議室在同一時間不能有兩個預訂[^55]。當有人想要預訂時，你首先檢查是否有任何衝突的預訂（即，具有重疊時間範圍的同一房間的預訂），如果沒有找到，你就建立會議（參見[例 8-2](#fig_transactions_meeting_rooms)）。

    {{< figure id="fig_transactions_meeting_rooms" title="例 8-2. 會議室預訂系統試圖避免重複預訂（在快照隔離下不安全）" class="w-full my-4" >}}

    ```sql
    BEGIN TRANSACTION;

    -- 檢查是否有任何現有預訂與中午 12 點到 1 點的時間段重疊
    SELECT COUNT(*) FROM bookings
    WHERE room_id = 123 AND
    end_time > '2025-01-01 12:00' AND start_time < '2025-01-01 13:00';

    -- 如果前一個查詢返回零：
    INSERT INTO bookings (room_id, start_time, end_time, user_id)
    VALUES (123, '2025-01-01 12:00', '2025-01-01 13:00', 666);

    COMMIT;
    ```

     不幸的是，快照隔離不會阻止另一個使用者併發插入衝突的會議。為了保證你不會出現排程衝突，你再次需要可序列化隔離。

多人遊戲
: 在[例 8-1](#fig_transactions_select_for_update) 中，我們使用鎖來防止丟失的更新（即，確保兩個玩家不能同時移動同一個棋子）。但是，鎖不會阻止玩家將兩個不同的棋子移動到棋盤上的同一位置，或者可能做出違反遊戲規則的其他移動。根據你要執行的規則型別，你可能能夠使用唯一約束，但否則你很容易受到寫偏差的影響。

宣告使用者名稱
: 在每個使用者都有唯一使用者名稱的網站上，兩個使用者可能同時嘗試使用相同的使用者名稱建立賬戶。你可以使用事務來檢查名稱是否被佔用，如果沒有，使用該名稱建立賬戶。但是，就像前面的例子一樣，這在快照隔離下是不安全的。幸運的是，唯一約束在這裡是一個簡單的解決方案（嘗試註冊使用者名稱的第二個事務將由於違反約束而被中止）。

防止重複消費
: 允許使用者花錢或積分的服務需要檢查使用者不會花費超過他們擁有的。你可以透過在使用者賬戶中插入暫定支出專案，列出賬戶中的所有專案，並檢查總和是否為正來實現這一點。有了寫偏差，可能會發生兩個支出專案併發插入，它們一起導致餘額變為負數，但沒有任何事務注意到另一個。

#### 導致寫偏差的幻讀 {#sec_transactions_phantom}

所有這些例子都遵循類似的模式：

1. `SELECT` 查詢透過搜尋匹配某些搜尋條件的行來檢查是否滿足某些要求（至少有兩個醫生值班，該房間在該時間沒有現有預訂，棋盤上的位置還沒有另一個棋子，使用者名稱尚未被佔用，賬戶中仍有錢）。
2. 根據第一個查詢的結果，應用程式程式碼決定如何繼續（也許繼續操作，或者向使用者報告錯誤並中止）。
3. 如果應用程式決定繼續，它會向資料庫進行寫入（`INSERT`、`UPDATE` 或 `DELETE`）並提交事務。

   此寫入的效果改變了步驟 2 決策的前提條件。換句話說，如果你在提交寫入後重複步驟 1 的 `SELECT` 查詢，你會得到不同的結果，因為寫入改變了匹配搜尋條件的行集（現在少了一個醫生值班，會議室現在已為該時間預訂，棋盤上的位置現在被移動的棋子佔據，使用者名稱現在被佔用，賬戶中的錢現在更少）。

步驟可能以不同的順序發生。例如，你可以先進行寫入，然後進行 `SELECT` 查詢，最後根據查詢結果決定是中止還是提交。

在醫生值班示例的情況下，步驟 3 中被修改的行是步驟 1 中返回的行之一，因此我們可以透過鎖定步驟 1 中的行（`SELECT FOR UPDATE`）來使事務安全並避免寫偏差。但是，其他四個示例是不同的：它們檢查*不存在*匹配某些搜尋條件的行，而寫入*新增*了匹配相同條件的行。如果步驟 1 中的查詢不返回任何行，`SELECT FOR UPDATE` 就無法附加鎖[^56]。

這種效果，其中一個事務中的寫入改變另一個事務中搜索查詢的結果，稱為*幻讀*[^4]。快照隔離避免了只讀查詢中的幻讀，但在我們討論的讀寫事務中，幻讀可能導致特別棘手的寫偏差情況。ORM 生成的 SQL 也容易出現寫偏差[^50] [^51]。

#### 物化衝突 {#materializing-conflicts}

如果幻讀的問題是沒有物件可以附加鎖，也許我們可以在資料庫中人為地引入一個鎖物件？

例如，在會議室預訂情況下，你可以想象建立一個時間段和房間的表。此表中的每一行對應於特定時間段（例如，15 分鐘）的特定房間。你提前為所有可能的房間和時間段組合建立行，例如，接下來的六個月。

現在，想要建立預訂的事務可以鎖定（`SELECT FOR UPDATE`）表中對應於所需房間和時間段的行。獲取鎖後，它可以像以前一樣檢查重疊的預訂並插入新的預訂。請注意，附加表不用於儲存有關預訂的資訊——它純粹是一組鎖，用於防止同一房間和時間範圍的預訂被併發修改。

這種方法稱為*物化衝突*，因為它採用了幻讀並將其轉化為存在於資料庫中的具體行集上的鎖衝突[^14]。不幸的是，弄清楚如何物化衝突可能很困難且容易出錯，並且讓併發控制機制洩漏到應用程式資料模型中是醜陋的。出於這些原因，物化衝突應被視為沒有其他選擇時的最後手段。在大多數情況下，可序列化隔離級別要好得多。


## 可序列化 {#sec_transactions_serializability}

在本章中，我們已經看到了幾個容易出現競態條件的事務示例。某些競態條件被讀已提交和快照隔離級別所防止，但其他的則沒有。我們遇到了一些特別棘手的寫偏差和幻讀示例。這是一個令人沮喪的情況：

* 隔離級別很難理解，並且在不同資料庫中的實現不一致（例如，"可重複讀"的含義差異很大）。
* 如果你檢視你的應用程式程式碼，很難判斷在特定隔離級別下執行是否安全——特別是在大型應用程式中，你可能不知道所有可能併發發生的事情。
* 沒有好的工具來幫助我們檢測競態條件。原則上，靜態分析可能有所幫助[^33]，但研究技術尚未進入實際使用。測試併發問題很困難，因為它們通常是非確定性的——只有在時機不巧時才會出現問題。

這不是一個新問題——自 1970 年代引入弱隔離級別以來一直如此[^3]。一直以來，研究人員的答案都很簡單：使用*可序列化*隔離！

可序列化隔離是最強的隔離級別。它保證即使事務可能並行執行，最終結果與它們*序列*執行（一次一個，沒有任何併發）相同。因此，資料庫保證如果事務在單獨執行時行為正確，那麼在併發執行時它們繼續保持正確——換句話說，資料庫防止了*所有*可能的競態條件。

但如果可序列化隔離比弱隔離級別的混亂要好得多，那為什麼不是每個人都在使用它？要回答這個問題，我們需要檢視實現可序列化的選項，以及它們的效能如何。今天提供可序列化的大多數資料庫使用以下三種技術之一，我們將在本章的其餘部分探討：

* 字面上序列執行事務（參見["實際序列執行"](#sec_transactions_serial)）
* 兩階段鎖定（參見["兩階段鎖定（2PL）"](#sec_transactions_2pl)），幾十年來這是唯一可行的選擇
* 樂觀併發控制技術，如可序列化快照隔離（參見["可序列化快照隔離（SSI）"](#sec_transactions_ssi)）

### 實際序列執行 {#sec_transactions_serial}

避免併發問題的最簡單方法是完全消除併發：在單個執行緒上按序列順序一次執行一個事務。透過這樣做，我們完全迴避了檢測和防止事務之間衝突的問題：所產生的隔離根據定義是可序列化的。

儘管這似乎是一個顯而易見的想法，但直到 2000 年代，資料庫設計者才決定執行事務的單執行緒迴圈是可行的[^57]。如果在過去 30 年中多執行緒併發被認為是獲得良好效能的必要條件，那是什麼改變使得單執行緒執行成為可能？

兩個發展導致了這種重新思考：

* RAM 變得足夠便宜，對於許多用例，現在可以將整個活動資料集儲存在記憶體中（參見["將所有內容儲存在記憶體中"](/tw/ch4#sec_storage_inmemory)）。當事務需要訪問的所有資料都在記憶體中時，事務的執行速度比必須等待從磁碟載入資料要快得多。
* 資料庫設計者意識到 OLTP 事務通常很短，只進行少量讀寫（參見["分析與運營系統"](/tw/ch1#sec_introduction_analytics)）。相比之下，長時間執行的分析查詢通常是隻讀的，因此它們可以在序列執行迴圈之外的一致快照上執行（使用快照隔離）。

序列執行事務的方法在 VoltDB/H-Store、Redis 和 Datomic 等中實現[^58] [^59] [^60]。為單執行緒執行設計的系統有時可以比支援併發的系統效能更好，因為它可以避免鎖定的協調開銷。但是，其吞吐量限於單個 CPU 核心。為了充分利用該單執行緒，事務需要以不同於傳統形式的方式構建。

#### 將事務封裝在儲存過程中 {#encapsulating-transactions-in-stored-procedures}

在資料庫的早期，意圖是資料庫事務可以包含整個使用者活動流程。例如，預訂機票是一個多階段過程（搜尋路線、票價和可用座位；決定行程；預訂行程中每個航班的座位；輸入乘客詳細資訊；付款）。資料庫設計者認為，如果整個過程是一個事務，以便可以原子地提交，那將是很好的。

不幸的是，人類做決定和響應的速度非常慢。如果資料庫事務需要等待使用者的輸入，資料庫需要支援潛在的大量併發事務，其中大多數是空閒的。大多數資料庫無法有效地做到這一點，因此幾乎所有 OLTP 應用程式都透過避免在事務中互動式地等待使用者來保持事務簡短。在 Web 上，這意味著事務在同一 HTTP 請求中提交——事務不跨越多個請求。新的 HTTP 請求開始新的事務。

即使人類已經從關鍵路徑中移除，事務仍然以互動式客戶端/伺服器風格執行，一次一個語句。應用程式進行查詢，讀取結果，可能根據第一個查詢的結果進行另一個查詢，依此類推。查詢和結果在應用程式程式碼（在一臺機器上執行）和資料庫伺服器（在另一臺機器上）之間來回傳送。

在這種互動式事務風格中，大量時間花在應用程式和資料庫之間的網路通訊上。如果你要在資料庫中禁止併發並一次只處理一個事務，吞吐量將是可怕的，因為資料庫將大部分時間都在等待應用程式為當前事務發出下一個查詢。在這種資料庫中，為了獲得合理的效能，必須併發處理多個事務。

因此，具有單執行緒序列事務處理的系統不允許互動式多語句事務。相反，應用程式必須將自己限制為包含單個語句的事務，或者提前將整個事務程式碼作為*儲存過程*提交給資料庫[^61]。

互動式事務和儲存過程之間的差異如[圖 8-9](#fig_transactions_stored_proc) 所示。前提是事務所需的所有資料都在記憶體中，儲存過程可以非常快速地執行，而無需等待任何網路或磁碟 I/O。

{{< figure src="/fig/ddia_0809.png" id="fig_transactions_stored_proc" caption="圖 8-9. 互動式事務和儲存過程之間的差異（使用[圖 8-8](#fig_transactions_write_skew)的示例事務）。" class="w-full my-4" >}}

#### 儲存過程的利弊 {#sec_transactions_stored_proc_tradeoffs}

儲存過程在關係資料庫中已經存在了一段時間，自 1999 年以來一直是 SQL 標準（SQL/PSM）的一部分。它們因各種原因獲得了一些不好的聲譽：

* 傳統上，每個資料庫供應商都有自己的儲存過程語言（Oracle 有 PL/SQL，SQL Server 有 T-SQL，PostgreSQL 有 PL/pgSQL 等）。這些語言沒有跟上通用程式語言的發展，因此從今天的角度來看，它們看起來相當醜陋和過時，並且缺乏大多數程式語言中的庫生態系統。
* 在資料庫中執行的程式碼很難管理：與應用程式伺服器相比，除錯更困難，版本控制和部署更尷尬，測試更棘手，並且難以與監控的指標收集系統整合。
* 資料庫通常比應用程式伺服器對效能更敏感，因為單個數據庫例項通常由許多應用程式伺服器共享。資料庫中編寫不當的儲存過程（例如，使用大量記憶體或 CPU 時間）可能比應用程式伺服器中等效的編寫不當的程式碼造成更多麻煩。
* 在允許租戶編寫自己的儲存過程的多租戶系統中，在與資料庫核心相同的程序中執行不受信任的程式碼是一個安全風險[^62]。

然而，這些問題可以克服。儲存過程的現代實現已經放棄了 PL/SQL，而是使用現有的通用程式語言：VoltDB 使用 Java 或 Groovy，Datomic 使用 Java 或 Clojure，Redis 使用 Lua，MongoDB 使用 JavaScript。

儲存過程在應用程式邏輯無法輕鬆嵌入其他地方的情況下也很有用。例如，使用 GraphQL 的應用程式可能透過 GraphQL 代理直接公開其資料庫。如果代理不支援複雜的驗證邏輯，你可以使用儲存過程將此類邏輯直接嵌入資料庫中。如果資料庫不支援儲存過程，你必須在代理和資料庫之間部署驗證服務來進行驗證。

使用儲存過程和記憶體資料，在單個執行緒上執行所有事務變得可行。當儲存過程不需要等待 I/O 並避免其他併發控制機制的開銷時，它們可以在單個執行緒上實現相當好的吞吐量。

VoltDB 還使用儲存過程進行復制：它不是將事務的寫入從一個節點複製到另一個節點，而是在每個副本上執行相同的儲存過程。因此，VoltDB 要求儲存過程是*確定性的*（在不同節點上執行時，它們必須產生相同的結果）。例如，如果事務需要使用當前日期和時間，它必須透過特殊的確定性 API 來實現（有關確定性操作的更多詳細資訊，請參見["持久執行和工作流"](/tw/ch5#sec_encoding_dataflow_workflows)）。這種方法稱為*狀態機複製*，我們將在[第 10 章](/tw/ch10#ch_consistency)中回到它。

#### 分片 {#sharding}

序列執行所有事務使併發控制變得簡單得多，但將資料庫的事務吞吐量限制為單臺機器上單個 CPU 核心的速度。只讀事務可以使用快照隔離在其他地方執行，但對於具有高寫入吞吐量的應用程式，單執行緒事務處理器可能成為嚴重的瓶頸。

為了擴充套件到多個 CPU 核心和多個節點，你可以對資料進行分片（參見[第 7 章](/tw/ch7#ch_sharding)），VoltDB 支援這一點。如果你可以找到一種對資料集進行分片的方法，使每個事務只需要讀取和寫入單個分片內的資料，那麼每個分片可以有自己的事務處理執行緒，獨立於其他分片執行。在這種情況下，你可以給每個 CPU 核心分配自己的分片，這允許你的事務吞吐量與 CPU 核心數量線性擴充套件[^59]。

但是，對於需要訪問多個分片的任何事務，資料庫必須協調它所涉及的所有分片之間的事務。儲存過程需要在所有分片上同步執行，以確保整個系統的可序列化。

由於跨分片事務具有額外的協調開銷，因此它們比單分片事務慢得多。VoltDB 報告的跨分片寫入吞吐量約為每秒 1,000 次，這比其單分片吞吐量低幾個數量級，並且無法透過新增更多機器來增加[^61]。最近的研究探索了使多分片事務更具可伸縮性的方法[^63]。

事務是否可以是單分片的很大程度上取決於應用程式使用的資料結構。簡單的鍵值資料通常可以很容易地分片，但具有多個二級索引的資料可能需要大量的跨分片協調（參見["分片和二級索引"](/tw/ch7#sec_sharding_secondary_indexes)）。

#### 序列執行總結 {#summary-of-serial-execution}

序列執行事務已成為在某些約束條件下實現可序列化隔離的可行方法：

* 每個事務必須小而快，因為只需要一個緩慢的事務就可以阻止所有事務處理。
* 它最適合活動資料集可以適合記憶體的情況。很少訪問的資料可能會移到磁碟，但如果需要在單執行緒事務中訪問，系統會變得非常慢。
* 寫入吞吐量必須足夠低，可以在單個 CPU 核心上處理，否則事務需要分片而不需要跨分片協調。
* 跨分片事務是可能的，但它們的吞吐量很難擴充套件。

### 兩階段鎖定（2PL） {#sec_transactions_2pl}

大約 30 年來，資料庫中只有一種廣泛使用的可序列化演算法：*兩階段鎖定*（2PL），有時稱為*強嚴格兩階段鎖定*（SS2PL），以區別於 2PL 的其他變體。


--------

> [!TIP] 2PL 不是 2PC
>
> 兩階段*鎖定*（2PL）和兩階段*提交*（2PC）是兩個非常不同的東西。2PL 提供可序列化隔離，而 2PC 在分散式資料庫中提供原子提交（參見["兩階段提交（2PC）"](#sec_transactions_2pc)）。為避免混淆，最好將它們視為完全獨立的概念，並忽略名稱中不幸的相似性。

--------

我們之前看到鎖通常用於防止髒寫（參見["沒有髒寫"](#sec_transactions_dirty_write)）：如果兩個事務併發嘗試寫入同一物件，鎖確保第二個寫入者必須等到第一個完成其事務（中止或提交）後才能繼續。

兩階段鎖定類似，但使鎖要求更強。只要沒有人寫入，多個事務就可以併發讀取同一物件。但是一旦有人想要寫入（修改或刪除）物件，就需要獨佔訪問：

* 如果事務 A 已讀取物件而事務 B 想要寫入該物件，B 必須等到 A 提交或中止後才能繼續。（這確保 B 不能在 A 背後意外地更改物件。）
* 如果事務 A 已寫入物件而事務 B 想要讀取該物件，B 必須等到 A 提交或中止後才能繼續。（像[圖 8-4](#fig_transactions_read_committed) 中那樣讀取物件的舊版本在 2PL 下是不可接受的。）

在 2PL 中，寫入者不僅阻塞其他寫入者；它們還阻塞讀者，反之亦然。快照隔離有這樣的口號：*讀者永遠不會阻塞寫者，寫者永遠不會阻塞讀者*（參見["多版本併發控制（MVCC）"](#sec_transactions_snapshot_impl)），這捕捉了快照隔離和兩階段鎖定之間的關鍵區別。另一方面，因為 2PL 提供可序列化，它可以防止早期討論的所有競態條件，包括丟失的更新和寫偏差。

#### 兩階段鎖定的實現 {#implementation-of-two-phase-locking}

2PL 由 MySQL（InnoDB）和 SQL Server 中的可序列化隔離級別以及 Db2 中的可重複讀隔離級別使用[^29]。

讀者和寫者的阻塞是透過在資料庫中的每個物件上有一個鎖來實現的。鎖可以處於*共享模式*或*獨佔模式*（也稱為*多讀者單寫者*鎖）。鎖的使用如下：

* 如果事務想要讀取物件，它必須首先以共享模式獲取鎖。多個事務可以同時以共享模式持有鎖，但如果另一個事務已經對該物件具有獨佔鎖，則這些事務必須等待。
* 如果事務想要寫入物件，它必須首先以獨佔模式獲取鎖。沒有其他事務可以同時持有鎖（無論是共享模式還是獨佔模式），因此如果物件上有任何現有鎖，事務必須等待。
* 如果事務首先讀取然後寫入物件，它可以將其共享鎖升級為獨佔鎖。升級的工作方式與直接獲取獨佔鎖相同。
* 獲取鎖後，事務必須繼續持有鎖直到事務結束（提交或中止）。這就是"兩階段"名稱的來源：第一階段（事務執行時）是獲取鎖，第二階段（事務結束時）是釋放所有鎖。

由於使用了如此多的鎖，很容易發生事務 A 等待事務 B 釋放其鎖，反之亦然的情況。這種情況稱為*死鎖*。資料庫自動檢測事務之間的死鎖並中止其中一個，以便其他事務可以取得進展。中止的事務需要由應用程式重試。

#### 兩階段鎖定的效能 {#performance-of-two-phase-locking}

兩階段鎖定的主要缺點，以及自 1970 年代以來並非每個人都使用它的原因，是效能：在兩階段鎖定下，事務吞吐量和查詢響應時間明顯比弱隔離下差。

這部分是由於獲取和釋放所有這些鎖的開銷，但更重要的是由於併發性降低。按設計，如果兩個併發事務嘗試執行任何可能以任何方式導致競態條件的操作，其中一個必須等待另一個完成。

例如，如果你有一個需要讀取整個表的事務（例如，備份、分析查詢或完整性檢查，如["快照隔離與可重複讀"](#sec_transactions_snapshot_isolation)中所討論的），該事務必須對整個表進行共享鎖。因此，讀取事務首先必須等到所有正在寫入該表的進行中事務完成；然後，在讀取整個表時（對於大表可能需要很長時間），所有想要寫入該表的其他事務都被阻塞，直到大型只讀事務提交。實際上，資料庫在很長一段時間內無法進行寫入。

因此，執行 2PL 的資料庫可能具有相當不穩定的延遲，如果工作負載中存在爭用，它們在高百分位數可能非常慢（參見["描述效能"](/tw/ch2#sec_introduction_percentiles)）。可能只需要一個緩慢的事務，或者一個訪問大量資料並獲取許多鎖的事務，就會導致系統的其餘部分停滯不前。

儘管死鎖可能發生在基於鎖的讀已提交隔離級別下，但在 2PL 可序列化隔離下（取決於事務的訪問模式）它們發生得更頻繁。這可能是一個額外的效能問題：當事務由於死鎖而被中止並重試時，它需要重新完成所有工作。如果死鎖頻繁，這可能意味著大量的浪費努力。

#### 謂詞鎖 {#predicate-locks}

在前面的鎖描述中，我們掩蓋了一個微妙但重要的細節。在["導致寫偏差的幻讀"](#sec_transactions_phantom)中，我們討論了*幻讀*的問題——即一個事務改變另一個事務的搜尋查詢結果。具有可序列化隔離的資料庫必須防止幻讀。

在會議室預訂示例中，這意味著如果一個事務已經搜尋了某個時間視窗內某個房間的現有預訂（參見[例 8-2](#fig_transactions_meeting_rooms)），另一個事務不允許併發插入或更新同一房間和時間範圍的另一個預訂。（併發插入其他房間的預訂，或同一房間不影響擬議預訂的不同時間的預訂是可以的。）

我們如何實現這一點？從概念上講，我們需要一個*謂詞鎖*[^4]。它的工作方式類似於前面描述的共享/獨佔鎖，但它不屬於特定物件（例如，表中的一行），而是屬於匹配某些搜尋條件的所有物件，例如：

```sql
SELECT * FROM bookings
 WHERE room_id = 123 AND
 end_time > '2025-01-01 12:00' AND
 start_time < '2025-01-01 13:00';
```

謂詞鎖限制訪問如下：

* 如果事務 A 想要讀取匹配某些條件的物件，就像在該 `SELECT` 查詢中一樣，它必須在查詢條件上獲取共享模式謂詞鎖。如果另一個事務 B 當前對匹配這些條件的任何物件具有獨佔鎖，A 必須等到 B 釋放其鎖後才允許進行查詢。
* 如果事務 A 想要插入、更新或刪除任何物件，它必須首先檢查舊值或新值是否匹配任何現有的謂詞鎖。如果存在事務 B 持有的匹配謂詞鎖，則 A 必須等到 B 提交或中止後才能繼續。

這裡的關鍵思想是，謂詞鎖甚至適用於資料庫中尚不存在但將來可能新增的物件（幻讀）。如果兩階段鎖定包括謂詞鎖，資料庫將防止所有形式的寫偏差和其他競態條件，因此其隔離變為可序列化。

#### 索引範圍鎖 {#sec_transactions_2pl_range}

不幸的是，謂詞鎖的效能不佳：如果活動事務有許多鎖，檢查匹配鎖變得耗時。因此，大多數具有 2PL 的資料庫實際上實現了*索引範圍鎖定*（也稱為*間隙鎖*），這是謂詞鎖定的簡化近似[^54] [^64]。

透過使謂詞匹配更大的物件集來簡化謂詞是安全的。例如，如果你對中午到下午 1 點之間房間 123 的預訂有謂詞鎖，你可以透過鎖定房間 123 在任何時間的預訂來近似它，或者你可以透過鎖定中午到下午 1 點之間的所有房間（不僅僅是房間 123）來近似它。這是安全的，因為匹配原始謂詞的任何寫入肯定也會匹配近似。

在房間預訂資料庫中，你可能在 `room_id` 列上有索引，和/或在 `start_time` 和 `end_time` 上有索引（否則前面的查詢在大型資料庫上會非常慢）：

* 假設你的索引在 `room_id` 上，資料庫使用此索引查詢房間 123 的現有預訂。現在資料庫可以簡單地將共享鎖附加到此索引條目，表示事務已搜尋房間 123 的預訂。
* 或者，如果資料庫使用基於時間的索引查詢現有預訂，它可以將共享鎖附加到該索引中的值範圍，表示事務已搜尋與 2025 年 1 月 1 日中午到下午 1 點的時間段重疊的預訂。

無論哪種方式，搜尋條件的近似都附加到其中一個索引。現在，如果另一個事務想要插入、更新或刪除同一房間和/或重疊時間段的預訂，它將必須更新索引的相同部分。在這樣做的過程中，它將遇到共享鎖，並被迫等到鎖被釋放。

這提供了對幻讀和寫偏差的有效保護。索引範圍鎖不如謂詞鎖精確（它們可能鎖定比嚴格維護可序列化所需的更大範圍的物件），但由於它們的開銷要低得多，它們是一個很好的折衷。

如果沒有合適的索引可以附加範圍鎖，資料庫可以退回到整個表的共享鎖。這對效能不利，因為它將阻止所有其他事務寫入表，但這是一個安全的後備位置。

### 可序列化快照隔離（SSI） {#sec_transactions_ssi}

本章描繪了資料庫併發控制的黯淡畫面。一方面，我們有效能不佳（兩階段鎖定）或可伸縮性不佳（序列執行）的可序列化實現。另一方面，我們有效能良好但容易出現各種競態條件（丟失的更新、寫偏差、幻讀等）的弱隔離級別。可序列化隔離和良好效能從根本上是對立的嗎？

似乎不是：一種稱為*可序列化快照隔離*（SSI）的演算法提供完全可序列化，與快照隔離相比只有很小的效能損失。SSI 相對較新：它於 2008 年首次描述[^53] [^65]。

今天，SSI 和類似演算法用於單節點資料庫（PostgreSQL 中的可序列化隔離級別[^54]、SQL Server 的記憶體 OLTP/Hekaton[^66] 和 HyPer[^67]）、分散式資料庫（CockroachDB[^5] 和 FoundationDB[^8]）以及嵌入式儲存引擎（如 BadgerDB）。

#### 悲觀併發控制與樂觀併發控制 {#pessimistic-versus-optimistic-concurrency-control}

兩階段鎖定是所謂的*悲觀*併發控制機制：它基於這樣的原則，即如果任何事情可能出錯（如另一個事務持有的鎖所示），最好等到情況再次安全後再做任何事情。它就像*互斥*，用於保護多執行緒程式設計中的資料結構。

序列執行在某種意義上是悲觀到極端：它本質上相當於每個事務在事務期間對整個資料庫（或資料庫的一個分片）具有獨佔鎖。我們透過使每個事務執行得非常快來補償悲觀主義，因此它只需要短時間持有"鎖"。

相比之下，可序列化快照隔離是一種*樂觀*併發控制技術。在這種情況下，樂觀意味著，如果發生潛在危險的事情，事務不會阻塞，而是繼續進行，希望一切都會好起來。當事務想要提交時，資料庫會檢查是否發生了任何不好的事情（即，是否違反了隔離）；如果是，事務將被中止並必須重試。只有以可序列化方式執行的事務才被允許提交。

樂觀併發控制是一個老想法[^68]，其優缺點已經爭論了很長時間[^69]。如果存在高爭用（許多事務嘗試訪問相同的物件），它的效能很差，因為這會導致大部分事務需要中止。如果系統已經接近其最大吞吐量，重試事務的額外事務負載可能會使效能變差。

但是，如果有足夠的備用容量，並且事務之間的爭用不太高，樂觀併發控制技術往往比悲觀技術效能更好。可交換原子操作可以減少爭用：例如，如果幾個事務併發想要遞增計數器，應用遞增的順序無關緊要（只要計數器在同一事務中沒有被讀取），因此併發遞增都可以應用而不會發生衝突。

顧名思義，SSI 基於快照隔離——也就是說，事務中的所有讀取都從資料庫的一致快照進行（參見["快照隔離與可重複讀"](#sec_transactions_snapshot_isolation)）。在快照隔離的基礎上，SSI 添加了一種演算法來檢測讀寫之間的序列化衝突，並確定要中止哪些事務。

#### 基於過時前提的決策 {#decisions-based-on-an-outdated-premise}

當我們之前討論快照隔離中的寫偏差時（參見["寫偏差與幻讀"](#sec_transactions_write_skew)），我們觀察到一個反覆出現的模式：事務從資料庫讀取一些資料，檢查查詢結果，並根據它看到的結果決定採取某些行動（寫入資料庫）。但是，在快照隔離下，原始查詢的結果在事務提交時可能不再是最新的，因為資料可能在此期間被修改。

換句話說，事務基於*前提*（事務開始時為真的事實，例如，"當前有兩名醫生值班"）採取行動。後來，當事務想要提交時，原始資料可能已更改——前提可能不再為真。

當應用程式進行查詢（例如，"當前有多少醫生值班？"）時，資料庫不知道應用程式邏輯如何使用該查詢的結果。為了安全起見，資料庫需要假設查詢結果（前提）中的任何更改都意味著該事務中的寫入可能無效。換句話說，事務中的查詢和寫入之間可能存在因果依賴關係。為了提供可序列化隔離，資料庫必須檢測事務可能基於過時前提採取行動的情況，並在這種情況下中止事務。

資料庫如何知道查詢結果是否可能已更改？有兩種情況需要考慮：

* 檢測陳舊的 MVCC 物件版本的讀取（未提交的寫入發生在讀取之前）
* 檢測影響先前讀取的寫入（寫入發生在讀取之後）

#### 檢測陳舊的 MVCC 讀取 {#detecting-stale-mvcc-reads}

回想一下，快照隔離通常由多版本併發控制（MVCC；參見["多版本併發控制（MVCC）"](#sec_transactions_snapshot_impl)）實現。當事務從 MVCC 資料庫中的一致快照讀取時，它會忽略在拍攝快照時尚未提交的任何其他事務所做的寫入。

在[圖 8-10](#fig_transactions_detect_mvcc) 中，事務 43 看到 Aaliyah 的 `on_call = true`，因為事務 42（修改了 Aaliyah 的值班狀態）未提交。但是，當事務 43 想要提交時，事務 42 已經提交。這意味著從一致快照讀取時被忽略的寫入現在已生效，事務 43 的前提不再為真。當寫入者插入以前不存在的資料時，事情變得更加複雜（參見["導致寫偏差的幻讀"](#sec_transactions_phantom)）。我們將在["檢測影響先前讀取的寫入"](#sec_detecting_writes_affect_reads)中討論為 SSI 檢測幻寫。

{{< figure src="/fig/ddia_0810.png" id="fig_transactions_detect_mvcc" caption="圖 8-10. 檢測事務何時從 MVCC 快照讀取過時值。" class="w-full my-4" >}}


為了防止這種異常，資料庫需要跟蹤事務由於 MVCC 可見性規則而忽略另一個事務的寫入的時間。當事務想要提交時，資料庫會檢查是否有任何被忽略的寫入現在已經提交。如果是，事務必須被中止。

為什麼要等到提交？為什麼不在檢測到陳舊讀取時立即中止事務 43？好吧，如果事務 43 是隻讀事務，它就不需要被中止，因為沒有寫偏差的風險。在事務 43 進行讀取時，資料庫還不知道該事務是否稍後會執行寫入。此外，事務 42 可能還會中止，或者在事務 43 提交時可能仍未提交，因此讀取可能最終不是陳舊的。透過避免不必要的中止，SSI 保留了快照隔離對從一致快照進行長時間執行讀取的支援。

#### 檢測影響先前讀取的寫入 {#sec_detecting_writes_affect_reads}

要考慮的第二種情況是另一個事務在資料被讀取後修改資料。這種情況如[圖 8-11](#fig_transactions_detect_index_range) 所示。

{{< figure src="/fig/ddia_0811.png" id="fig_transactions_detect_index_range" caption="圖 8-11. 在可序列化快照隔離中，檢測一個事務何時修改另一個事務的讀取。" class="w-full my-4" >}}


在兩階段鎖定的上下文中，我們討論了索引範圍鎖（參見["索引範圍鎖"](#sec_transactions_2pl_range)），它允許資料庫鎖定對匹配某些搜尋查詢的所有行的訪問，例如 `WHERE shift_id = 1234`。我們可以在這裡使用類似的技術，除了 SSI 鎖不會阻塞其他事務。

在[圖 8-11](#fig_transactions_detect_index_range) 中，事務 42 和 43 都在班次 `1234` 期間搜尋值班醫生。如果 `shift_id` 上有索引，資料庫可以使用索引條目 1234 來記錄事務 42 和 43 讀取此資料的事實。（如果沒有索引，可以在表級別跟蹤此資訊。）此資訊只需要保留一段時間：在事務完成（提交或中止）並且所有併發事務完成後，資料庫可以忘記它讀取的資料。

當事務寫入資料庫時，它必須在索引中查詢最近讀取受影響資料的任何其他事務。此過程類似於獲取受影響鍵範圍的寫鎖，但它不是阻塞直到讀者提交，而是充當絆線：它只是通知事務它們讀取的資料可能不再是最新的。

在[圖 8-11](#fig_transactions_detect_index_range) 中，事務 43 通知事務 42 其先前的讀取已過時，反之亦然。事務 42 首先提交，並且成功：儘管事務 43 的寫入影響了 42，但 43 尚未提交，因此寫入尚未生效。但是，當事務 43 想要提交時，來自 42 的衝突寫入已經提交，因此 43 必須中止。

#### 可序列化快照隔離的效能 {#performance-of-serializable-snapshot-isolation}

與往常一樣，許多工程細節會影響演算法在實踐中的工作效果。例如，一個權衡是跟蹤事務讀寫的粒度。如果資料庫詳細跟蹤每個事務的活動，它可以精確地確定哪些事務需要中止，但簿記開銷可能變得很大。不太詳細的跟蹤速度更快，但可能導致比嚴格必要更多的事務被中止。

在某些情況下，事務讀取被另一個事務覆蓋的資訊是可以的：根據發生的其他情況，有時可以證明執行結果仍然是可序列化的。PostgreSQL 使用這一理論來減少不必要中止的數量[^14] [^54]。

與兩階段鎖定相比，可序列化快照隔離的主要優點是一個事務不需要阻塞等待另一個事務持有的鎖。與快照隔離一樣，寫入者不會阻塞讀者，反之亦然。這種設計原則使查詢延遲更可預測且變化更少。特別是，只讀查詢可以在一致快照上執行而無需任何鎖，這對於讀取密集型工作負載非常有吸引力。

與序列執行相比，可序列化快照隔離不限於單個 CPU 核心的吞吐量：例如，FoundationDB 將序列化衝突的檢測分佈在多臺機器上，允許它擴充套件到非常高的吞吐量。即使資料可能分片在多臺機器上，事務也可以在多個分片中讀取和寫入資料，同時確保可序列化隔離。

與非可序列化快照隔離相比，檢查可序列化違規的需要引入了一些效能開銷。這些開銷有多大是一個爭論的問題：有些人認為可序列化檢查不值得[^70]，而其他人認為可序列化的效能現在已經很好，不再需要使用較弱的快照隔離[^67]。

中止率顯著影響 SSI 的整體效能。例如，長時間讀取和寫入資料的事務可能會遇到衝突並中止，因此 SSI 要求讀寫事務相當短（長時間執行的只讀事務是可以的）。但是，SSI 對慢事務的敏感性低於兩階段鎖定或序列執行。

## 分散式事務 {#sec_transactions_distributed}

前幾節重點討論了隔離的併發控制，即 ACID 中的 I。我們看到的演算法適用於單節點和分散式資料庫：儘管在使併發控制演算法可擴充套件方面存在挑戰（例如，為 SSI 執行分散式可序列化檢查），但分散式併發控制的高層思想與單節點併發控制相似[^8]。

一致性和永續性在轉向分散式事務時也沒有太大變化。但是，原子性需要更多關注。

對於在單個數據庫節點執行的事務，原子性通常由儲存引擎實現。當客戶端要求資料庫節點提交事務時，資料庫使事務的寫入持久化（通常在預寫日誌中；參見["使 B 樹可靠"](/tw/ch4#sec_storage_btree_wal)），然後將提交記錄附加到磁碟上的日誌。如果資料庫在此過程中崩潰，事務將在節點重新啟動時從日誌中恢復：如果提交記錄在崩潰前成功寫入磁碟，則事務被認為已提交；如果沒有，該事務的任何寫入都將回滾。

因此，在單個節點上，事務提交關鍵取決於資料持久寫入磁碟的*順序*：首先是資料，然後是提交記錄[^22]。事務提交或中止的關鍵決定時刻是磁碟完成寫入提交記錄的時刻：在那一刻之前，仍然可能中止（由於崩潰），但在那一刻之後，事務已提交（即使資料庫崩潰）。因此，是單個裝置（連線到特定節點的特定磁碟驅動器的控制器）使提交成為原子的。

但是，如果多個節點參與事務會怎樣？例如，也許你在分片資料庫中有多物件事務，或者有全域性二級索引（其中索引條目可能與主資料在不同的節點上；參見["分片和二級索引"](/tw/ch7#sec_sharding_secondary_indexes)）。大多數"NoSQL"分散式資料儲存不支援此類分散式事務，但各種分散式關係資料庫支援。

在這些情況下，僅向所有節點發送提交請求並在每個節點上獨立提交事務是不夠的。如[圖 8-12](#fig_transactions_non_atomic) 所示，提交可能在某些節點上成功，在其他節點上失敗：

* 某些節點可能檢測到約束違規或衝突，需要中止，而其他節點能夠成功提交。
* 某些提交請求可能在網路中丟失，最終由於超時而中止，而其他提交請求透過。
* 某些節點可能在提交記錄完全寫入之前崩潰並在恢復時回滾，而其他節點成功提交。

{{< figure src="/fig/ddia_0812.png" id="fig_transactions_non_atomic" caption="圖 8-12. 當事務涉及多個數據庫節點時，它可能在某些節點上提交，在其他節點上失敗。" class="w-full my-4" >}}


如果某些節點提交事務而其他節點中止它，節點之間就會變得不一致。一旦事務在一個節點上提交，如果後來發現它在另一個節點上被中止，就不能撤回了。這是因為一旦資料被提交，它在*讀已提交*或更強的隔離下對其他事務可見。例如，在[圖 8-12](#fig_transactions_non_atomic) 中，當使用者 1 注意到其在資料庫 1 上的提交失敗時，使用者 2 已經從資料庫 2 上的同一事務讀取了資料。如果使用者 1 的事務後來被中止，使用者 2 的事務也必須被還原，因為它基於被追溯宣告不存在的資料。

更好的方法是確保參與事務的節點要麼全部提交，要麼全部中止，並防止兩者的混合。確保這一點被稱為*原子提交*問題。

### 兩階段提交（2PC） {#sec_transactions_2pc}

兩階段提交是一種跨多個節點實現原子事務提交的演算法。它是分散式資料庫中的經典演算法[^13] [^71] [^72]。2PC 在某些資料庫內部使用，也以 *XA 事務*[^73] 的形式提供給應用程式（例如，Java 事務 API 支援），或透過 WS-AtomicTransaction 用於 SOAP Web 服務[^74] [^75]。

2PC 的基本流程如[圖 8-13](#fig_transactions_two_phase_commit) 所示。與單節點事務的單個提交請求不同，2PC 中的提交/中止過程分為兩個階段（因此得名）。

{{< figure src="/fig/ddia_0813.png" id="fig_transactions_two_phase_commit" caption="圖 8-13. 兩階段提交（2PC）的成功執行。" class="w-full my-4" >}}


2PC 使用一個通常不會出現在單節點事務中的新元件：*協調器*（也稱為*事務管理器*）。協調器通常作為請求事務的同一應用程式程序中的庫實現（例如，嵌入在 Java EE 容器中），但它也可以是單獨的程序或服務。此類協調器的示例包括 Narayana、JOTM、BTM 或 MSDTC。

使用 2PC 時，分散式事務從應用程式在多個數據庫節點上正常讀寫資料開始。我們稱這些資料庫節點為事務中的*參與者*。當應用程式準備提交時，協調器開始第 1 階段：它向每個節點發送*準備*請求，詢問它們是否能夠提交。然後協調器跟蹤參與者的響應：

* 如果所有參與者回覆"是"，表示他們準備提交，那麼協調器在第 2 階段發出*提交*請求，提交實際發生。
* 如果任何參與者回覆"否"，協調器在第 2 階段向所有節點發送*中止*請求。

這個過程有點像西方文化中的傳統婚禮儀式：牧師分別詢問新娘和新郎是否願意嫁給對方，通常從兩人那裡得到"我願意"的答案。在收到兩個確認後，牧師宣佈這對夫婦為夫妻：事務已提交，這個快樂的事實向所有參加者廣播。如果新娘或新郎沒有說"是"，儀式就被中止了[^76]。

#### 系統性的承諾 {#a-system-of-promises}

從這個簡短的描述中，可能不清楚為什麼兩階段提交確保原子性，而跨多個節點的單階段提交卻不能。準備和提交請求在兩階段情況下同樣容易丟失。是什麼讓 2PC 不同？

要理解它為什麼有效，我們必須更詳細地分解這個過程：

1. 當應用程式想要開始分散式事務時，它從協調器請求事務 ID。此事務 ID 是全域性唯一的。
2. 應用程式在每個參與者上開始單節點事務，並將全域性唯一的事務 ID 附加到單節點事務。所有讀寫都在這些單節點事務之一中完成。如果在此階段出現任何問題（例如，節點崩潰或請求超時），協調器或任何參與者都可以中止。
3. 當應用程式準備提交時，協調器向所有參與者傳送準備請求，標記有全域性事務 ID。如果這些請求中的任何一個失敗或超時，協調器向所有參與者傳送該事務 ID 的中止請求。
4. 當參與者收到準備請求時，它確保它可以在任何情況下明確提交事務。

   這包括將所有事務資料寫入磁碟（崩潰、電源故障或磁碟空間不足不是稍後拒絕提交的可接受藉口），並檢查任何衝突或約束違規。透過向協調器回覆"是"，節點承諾在請求時無錯誤地提交事務。換句話說，參與者放棄了中止事務的權利，但沒有實際提交它。
5. 當協調器收到所有準備請求的響應時，它對是否提交或中止事務做出明確決定（僅當所有參與者投票"是"時才提交）。協調器必須將該決定寫入其磁碟上的事務日誌，以便在隨後崩潰時知道它是如何決定的。這稱為*提交點*。
6. 一旦協調器的決定被寫入磁碟，提交或中止請求就會發送給所有參與者。如果此請求失敗或超時，協調器必須永遠重試，直到成功。沒有回頭路：如果決定是提交，那麼必須執行該決定，無論需要多少次重試。如果參與者在此期間崩潰，事務將在恢復時提交——因為參與者投票"是"，它在恢復時不能拒絕提交。

因此，該協議包含兩個關鍵的"不歸路"：當參與者投票"是"時，它承諾它肯定能夠稍後提交（儘管協調器仍可能選擇中止）；一旦協調器決定，該決定是不可撤銷的。這些承諾確保了 2PC 的原子性。（單節點原子提交將這兩個事件合併為一個：將提交記錄寫入事務日誌。）

回到婚姻比喻，在說"我願意"之前，你和你的新娘/新郎有自由透過說"不行！"（或類似的話）來中止事務。但是，在說"我願意"之後，你不能撤回該宣告。如果你在說"我願意"後暈倒，沒有聽到牧師說"你們現在是夫妻"，這並不改變事務已提交的事實。當你稍後恢復意識時，你可以透過向牧師查詢你的全域性事務 ID 的狀態來了解你是否已婚，或者你可以等待牧師下一次重試提交請求（因為重試將在你失去意識期間繼續）。

#### 協調器故障 {#coordinator-failure}

我們已經討論了如果參與者之一或網路在 2PC 期間失敗會發生什麼：如果任何準備請求失敗或超時，協調器將中止事務；如果任何提交或中止請求失敗，協調器將無限期地重試它們。但是，如果協調器崩潰會發生什麼就不太清楚了。

如果協調器在傳送準備請求之前失敗，參與者可以安全地中止事務。但是一旦參與者收到準備請求並投票"是"，它就不能再單方面中止——它必須等待協調器回覆事務是提交還是中止。如果協調器此時崩潰或網路失敗，參與者除了等待別無他法。參與者在此狀態下的事務稱為*存疑*或*不確定*。

這種情況如[圖 8-14](#fig_transactions_2pc_crash) 所示。在這個特定的例子中，協調器實際上決定提交，資料庫 2 收到了提交請求。但是，協調器在向資料庫 1 傳送提交請求之前崩潰了，因此資料庫 1 不知道是提交還是中止。即使超時在這裡也沒有幫助：如果資料庫 1 在超時後單方面中止，它將與已提交的資料庫 2 不一致。同樣，單方面提交也不安全，因為另一個參與者可能已中止。

{{< figure src="/fig/ddia_0814.png" id="fig_transactions_2pc_crash" caption="圖 8-14. 協調器在參與者投票“是”後崩潰。資料庫 1 不知道是提交還是中止。" class="w-full my-4" >}}


沒有協調器的訊息，參與者無法知道是提交還是中止。原則上，參與者可以相互通訊，瞭解每個參與者如何投票並達成某種協議，但這不是 2PC 協議的一部分。

2PC 完成的唯一方法是等待協調器恢復。這就是為什麼協調器必須在向參與者傳送提交或中止請求之前將其提交或中止決定寫入磁碟上的事務日誌：當協調器恢復時，它透過讀取其事務日誌來確定所有存疑事務的狀態。協調器日誌中沒有提交記錄的任何事務都將中止。因此，2PC 的提交點歸結為協調器上的常規單節點原子提交。

#### 三階段提交 {#three-phase-commit}

由於 2PC 可能會卡住等待協調器恢復，因此兩階段提交被稱為*阻塞*原子提交協議。可以使原子提交協議*非阻塞*，以便在節點失敗時不會卡住。但是，在實踐中使其工作並不那麼簡單。

作為 2PC 的替代方案，已經提出了一種稱為*三階段提交*（3PC）的演算法[^13] [^77]。但是，3PC 假設具有有界延遲的網路和具有有界響應時間的節點；在大多數具有無界網路延遲和程序暫停的實際系統中（參見[第 9 章](/tw/ch9#ch_distributed)），它無法保證原子性。

實踐中更好的解決方案是用容錯共識協議替換單節點協調器。我們將在[第 10 章](/tw/ch10#ch_consistency)中看到如何做到這一點。

### 跨不同系統的分散式事務 {#sec_transactions_xa}

分散式事務和兩階段提交的聲譽參差不齊。一方面，它們被認為提供了一個重要的安全保證，否則很難實現；另一方面，它們因導致操作問題、扼殺效能並承諾超過它們可以提供的東西而受到批評[^78] [^79] [^80] [^81]。許多雲服務由於它們引起的操作問題而選擇不實現分散式事務[^82]。

某些分散式事務的實現會帶來沉重的效能損失。兩階段提交固有的大部分效能成本是由於崩潰恢復所需的額外磁碟強制（`fsync`）和額外的網路往返。

但是，與其直接否定分散式事務，我們應該更詳細地研究它們，因為從中可以學到重要的教訓。首先，我們應該準確說明"分散式事務"的含義。兩種完全不同型別的分散式事務經常被混淆：

資料庫內部分散式事務
: 某些分散式資料庫（即，在其標準配置中使用複製和分片的資料庫）支援該資料庫節點之間的內部事務。例如，YugabyteDB、TiDB、FoundationDB、Spanner、VoltDB 和 MySQL Cluster 的 NDB 儲存引擎都有這樣的內部事務支援。在這種情況下，參與事務的所有節點都執行相同的資料庫軟體。

異構分散式事務
: 在*異構*事務中，參與者是兩個或多個不同的技術：例如，來自不同供應商的兩個資料庫，甚至是非資料庫系統（如訊息代理）。跨這些系統的分散式事務必須確保原子提交，即使系統在底層可能完全不同。

資料庫內部事務不必與任何其他系統相容，因此它們可以使用任何協議並應用特定於該特定技術的最佳化。因此，資料庫內部分散式事務通常可以很好地工作。另一方面，跨異構技術的事務更具挑戰性。

#### 恰好一次訊息處理 {#sec_transactions_exactly_once}

異構分散式事務允許以強大的方式整合各種系統。例如，當且僅當處理訊息的資料庫事務成功提交時，來自訊息佇列的訊息才能被確認為已處理。這是透過在單個事務中原子地提交訊息確認和資料庫寫入來實現的。有了分散式事務支援，即使訊息代理和資料庫是在不同機器上執行的兩種不相關的技術，這也是可能的。

如果訊息傳遞或資料庫事務失敗，兩者都會中止，因此訊息代理可以稍後安全地重新傳遞訊息。因此，透過原子地提交訊息及其處理的副作用，我們可以確保訊息在效果上*恰好*處理一次，即使在成功之前需要幾次重試。中止會丟棄部分完成事務的任何副作用。這被稱為*恰好一次語義*。

但是，只有當受事務影響的所有系統都能夠使用相同的原子提交協議時，這種分散式事務才有可能。例如，假設處理訊息的副作用是傳送電子郵件，而電子郵件伺服器不支援兩階段提交：如果訊息處理失敗並重試，可能會發生電子郵件被傳送兩次或更多次。但是，如果處理訊息的所有副作用在事務中止時都會回滾，那麼處理步驟可以安全地重試，就好像什麼都沒有發生一樣。

我們將在本章後面回到恰好一次語義的主題。讓我們首先看看允許此類異構分散式事務的原子提交協議。

#### XA 事務 {#xa-transactions}

*X/Open XA*（*eXtended Architecture* 的縮寫）是跨異構技術實現兩階段提交的標準[^73]。它於 1991 年推出並得到廣泛實現：XA 受到許多傳統關係資料庫（包括 PostgreSQL、MySQL、Db2、SQL Server 和 Oracle）和訊息代理（包括 ActiveMQ、HornetQ、MSMQ 和 IBM MQ）的支援。

XA 不是網路協議——它只是用於與事務協調器介面的 C API。此 API 的繫結存在於其他語言中；例如，在 Java EE 應用程式的世界中，XA 事務使用 Java 事務 API（JTA）實現，而 JTA 又由許多使用 Java 資料庫連線（JDBC）的資料庫驅動程式和使用 Java 訊息服務（JMS）API 的訊息代理驅動程式支援。

XA 假設你的應用程式使用網路驅動程式或客戶端庫與參與者資料庫或訊息服務進行通訊。如果驅動程式支援 XA，這意味著它呼叫 XA API 來確定操作是否應該是分散式事務的一部分——如果是，它將必要的資訊傳送到資料庫伺服器。驅動程式還公開回調，協調器可以透過回撥要求參與者準備、提交或中止。

事務協調器實現 XA API。該標準沒有指定應該如何實現它，但在實踐中，協調器通常只是載入到發出事務的應用程式的同一程序中的庫（而不是單獨的服務）。它跟蹤事務中的參與者，在要求他們準備後收集參與者的響應（透過驅動程式的回撥），並使用本地磁碟上的日誌來跟蹤每個事務的提交/中止決定。

如果應用程式程序崩潰，或者執行應用程式的機器宕機，協調器也隨之消失。任何持有已準備但尚未提交事務的參與者都會陷入存疑狀態。由於協調器的日誌在應用程式伺服器的本地磁碟上，該伺服器必須重新啟動，協調器庫必須讀取日誌以恢復每個事務的提交/中止結果。然後，協調器才能使用資料庫驅動程式的 XA 回撥來要求參與者提交或中止（視情況而定）。資料庫伺服器無法直接聯絡協調器，因為所有通訊都必須透過其客戶端庫。

#### 存疑時持有鎖 {#holding-locks-while-in-doubt}

為什麼我們如此關心事務陷入存疑？系統的其餘部分不能繼續工作，忽略最終會被清理的存疑事務嗎？

問題在於*鎖定*。如["讀已提交"](#sec_transactions_read_committed)中所討論的，資料庫事務通常對它們修改的任何行進行行級獨佔鎖，以防止髒寫。此外，如果你想要可序列化隔離，使用兩階段鎖定的資料庫還必須對事務*讀取*的任何行進行共享鎖。

資料庫在事務提交或中止之前不能釋放這些鎖（如[圖 8-13](#fig_transactions_two_phase_commit) 中的陰影區域所示）。因此，使用兩階段提交時，事務必須在存疑期間保持鎖。如果協調器崩潰並需要 20 分鐘才能重新啟動，這些鎖將保持 20 分鐘。如果協調器的日誌由於某種原因完全丟失，這些鎖將永遠保持——或者至少直到管理員手動解決情況。

當這些鎖被持有時，沒有其他事務可以修改這些行。根據隔離級別，其他事務甚至可能被阻止讀取這些行。因此，其他事務不能若無其事地繼續自己的工作——如果它們想要訪問相同的資料，就會被阻塞。這可能導致你的應用程式的大部分變得不可用，直到存疑事務得到解決。

#### 從協調器故障中恢復 {#recovering-from-coordinator-failure}

理論上，如果協調器崩潰並重新啟動，它應該從日誌中乾淨地恢復其狀態並解決任何存疑事務。但是，在實踐中，*孤立的*存疑事務確實會發生[^83] [^84]——也就是說，協調器由於某種原因（例如，由於軟體錯誤導致事務日誌丟失或損壞）無法決定結果的事務。這些事務無法自動解決，因此它們永遠留在資料庫中，持有鎖並阻塞其他事務。

即使重新啟動資料庫伺服器也無法解決此問題，因為 2PC 的正確實現必須即使在重新啟動時也保留存疑事務的鎖（否則它將冒著違反原子性保證的風險）。這是一個棘手的情況。

唯一的出路是管理員手動決定是提交還是回滾事務。管理員必須檢查每個存疑事務的參與者，確定是否有任何參與者已經提交或中止，然後將相同的結果應用於其他參與者。解決問題可能需要大量的手動工作，並且很可能需要在嚴重的生產中斷期間在高壓力和時間壓力下完成（否則，為什麼協調器會處於如此糟糕的狀態？）。

許多 XA 實現都有一個名為*啟發式決策*的緊急逃生艙口：允許參與者在沒有協調器明確決定的情況下單方面決定中止或提交存疑事務[^73]。明確地說，這裡的*啟發式*是*可能破壞原子性*的委婉說法，因為啟發式決策違反了兩階段提交中的承諾系統。因此，啟發式決策僅用於擺脫災難性情況，而不用於常規使用。

#### XA 事務的問題 {#problems-with-xa-transactions}

單節點協調器是整個系統的單點故障，使其成為應用程式伺服器的一部分也是有問題的，因為協調器在其本地磁碟上的日誌成為持久系統狀態的關鍵部分——與資料庫本身一樣重要。

原則上，XA 事務的協調器可以是高可用和複製的，就像我們對任何其他重要資料庫的期望一樣。不幸的是，這仍然不能解決 XA 的一個根本問題，即它沒有為事務的協調器和參與者提供直接相互通訊的方式。它們只能透過呼叫事務的應用程式程式碼以及呼叫參與者的資料庫驅動程式進行通訊。

即使協調器被複制，應用程式程式碼也將是單點故障。解決這個問題需要完全重新設計應用程式程式碼的執行方式，使其複製或可重啟，這可能看起來類似於持久執行（參見["持久執行和工作流"](/tw/ch5#sec_encoding_dataflow_workflows)）。但是，實踐中似乎沒有任何工具實際採用這種方法。

另一個問題是，由於 XA 需要與各種資料系統相容，它必然是最低公分母。例如，它無法檢測跨不同系統的死鎖（因為這需要系統交換有關每個事務正在等待的鎖的資訊的標準化協議），並且它不適用於 SSI（參見["可序列化快照隔離（SSI）"](#sec_transactions_ssi)），因為這需要跨不同系統識別衝突的協議。

這些問題在某種程度上是跨異構技術執行事務所固有的。但是，保持幾個異構資料系統彼此一致仍然是一個真實而重要的問題，因此我們需要為其找到不同的解決方案。這可以做到，我們將在下一節和["派生資料與分散式事務"](/tw/ch13#sec_future_derived_vs_transactions)中看到。

### 資料庫內部的分散式事務 {#sec_transactions_internal}

如前所述，跨多個異構儲存技術的分散式事務與系統內部的分散式事務之間存在很大差異——即，參與節點都是執行相同軟體的同一資料庫的分片。此類內部分散式事務是"NewSQL"資料庫的定義特徵，例如 CockroachDB[^5]、TiDB[^6]、Spanner[^7]、FoundationDB[^8] 和 YugabyteDB。某些訊息代理（如 Kafka）也支援內部分散式事務[^85]。

這些系統中有許多使用兩階段提交來確保寫入多個分片的事務的原子性，但它們不會遇到與 XA 事務相同的問題。原因是，由於它們的分散式事務不需要與任何其他技術對接，它們避免了最低公分母陷阱——這些系統的設計者可以自由使用更可靠、更快的協議。

XA 的最大問題可以透過以下方式解決：

* 複製協調器，如果主協調器崩潰，自動故障轉移到另一個協調器節點；
* 允許協調器和資料分片直接通訊，而不透過應用程式程式碼；
* 複製參與分片，以減少由於分片中的故障而必須中止事務的風險；以及
* 將原子提交協議與支援跨分片死鎖檢測和一致讀取的分散式併發控制協議耦合。

共識演算法通常用於複製協調器和資料庫分片。我們將在[第 10 章](/tw/ch10#ch_consistency)中看到如何使用共識演算法實現分散式事務的原子提交。這些演算法透過自動從一個節點故障轉移到另一個節點來容忍故障，無需任何人工干預，同時繼續保證強一致性屬性。

為分散式事務提供的隔離級別取決於系統，但跨分片的快照隔離和可序列化快照隔離都是可能的。有關其工作原理的詳細資訊，請參見本章末尾引用的論文。

#### 再談恰好一次訊息處理 {#exactly-once-message-processing-revisited}

我們在["恰好一次訊息處理"](#sec_transactions_exactly_once)中看到，分散式事務的一個重要用例是確保某些操作恰好生效一次，即使在處理過程中發生崩潰並且需要重試處理。如果你可以跨訊息代理和資料庫原子地提交事務，則當且僅當成功處理訊息並且從處理過程產生的資料庫寫入被提交時，你可以向代理確認訊息。

但是，你實際上不需要這樣的分散式事務來實現恰好一次語義。另一種方法如下，它只需要資料庫中的事務：

1. 假設每條訊息都有唯一的 ID，並且在資料庫中有一個已處理訊息 ID 的表。當你開始從代理處理訊息時，你在資料庫上開始一個新事務，並檢查訊息 ID。如果資料庫中已經存在相同的訊息 ID，你知道它已經被處理，因此你可以向代理確認訊息並丟棄它。
2. 如果訊息 ID 尚未在資料庫中，你將其新增到表中。然後你處理訊息，這可能會導致在同一事務中對資料庫進行額外的寫入。完成處理訊息後，你提交資料庫上的事務。
3. 一旦資料庫事務成功提交，你就可以向代理確認訊息。
4. 一旦訊息成功確認給代理，你知道它不會再次嘗試處理相同的訊息，因此你可以從資料庫中刪除訊息 ID（在單獨的事務中）。

如果訊息處理器在提交資料庫事務之前崩潰，事務將被中止，訊息代理將重試處理。如果它在提交後但在向代理確認訊息之前崩潰，它也將重試處理，但重試將在資料庫中看到訊息 ID 並丟棄它。如果它在確認訊息後但在從資料庫中刪除訊息 ID 之前崩潰，你將有一個舊的訊息 ID 留下，除了佔用一點儲存空間外不會造成任何傷害。如果在資料庫事務中止之前發生重試（如果訊息處理器和資料庫之間的通訊中斷，這可能會發生），訊息 ID 表上的唯一性約束應該防止兩個併發事務插入相同的訊息 ID。

因此，實現恰好一次處理只需要資料庫中的事務——跨資料庫和訊息代理的原子性對於此用例不是必需的。在資料庫中記錄訊息 ID 使訊息處理具備*冪等性*，因此可以安全地重試訊息處理而不會重複其副作用。流處理框架（如 Kafka Streams）中使用類似的方法來實現恰好一次語義，我們將在["容錯"](/tw/ch12#sec_stream_fault_tolerance)中看到。

但是，資料庫內的內部分散式事務對於此類模式的可伸縮性仍然有用：例如，它們將允許訊息 ID 儲存在一個分片上，而訊息處理更新的主資料儲存在其他分片上，並確保跨這些分片的事務提交的原子性。


## 總結 {#summary}

事務是一個抽象層，允許應用程式假裝某些併發問題和某些型別的硬體和軟體故障不存在。大量錯誤被簡化為簡單的*事務中止*，應用程式只需要重試。

在本章中，我們看到了許多事務有助於防止的問題示例。並非所有應用程式都容易受到所有這些問題的影響：具有非常簡單的訪問模式的應用程式（例如，僅讀取和寫入單個記錄）或許不需要事務也能應付。但是，對於更複雜的訪問模式，事務可以大大減少你需要考慮的潛在錯誤情況的數量。

沒有事務，各種錯誤場景（程序崩潰、網路中斷、停電、磁碟已滿、意外併發等）意味著資料可能以各種方式變得不一致。例如，反正規化資料很容易與源資料失去同步。沒有事務，很難推理複雜的互動訪問對資料庫可能產生的影響。

在本章中，我們特別深入地探討了併發控制的主題。我們討論了幾種廣泛使用的隔離級別，特別是*讀已提交*、*快照隔離*（有時稱為*可重複讀*）和*可序列化*。我們透過討論各種競態條件的示例來描述這些隔離級別，總結在 [表 8-1](#tab_transactions_isolation_levels) 中：

{{< figure id="tab_transactions_isolation_levels" title="表 8-1. 各種隔離級別可能發生的異常總結" class="w-full my-4" >}}

| 隔離級別 | 髒讀   | 讀取偏差  | 幻讀   | 丟失更新  | 寫偏差  |
|------|------|------|------|-------|------|
| 讀未提交 | ✗ 可能 | ✗ 可能 | ✗ 可能 | ✗ 可能  | ✗ 可能 |
| 讀已提交 | ✓ 防止 | ✗ 可能 | ✗ 可能 | ✗ 可能  | ✗ 可能 |
| 快照隔離 | ✓ 防止 | ✓ 防止 | ✓ 防止 | ? 視情況 | ✗ 可能 |
| 可序列化 | ✓ 防止 | ✓ 防止 | ✓ 防止 | ✓ 防止  | ✓ 防止 |

髒讀
: 一個客戶端在另一個客戶端的寫入提交之前讀取它們。讀已提交隔離級別和更強的級別防止髒讀。

髒寫
: 一個客戶端覆蓋另一個客戶端已寫入但尚未提交的資料。幾乎所有事務實現都防止髒寫。

讀取偏差
: 客戶端在不同時間點看到資料庫的不同部分。某些讀取偏差的情況也稱為*不可重複讀*。這個問題最常透過快照隔離來防止，它允許事務從對應於特定時間點的一致快照讀取。它通常使用*多版本併發控制*（MVCC）實現。

丟失更新
: 兩個客戶端併發執行讀-修改-寫迴圈。一個覆蓋另一個的寫入而不合並其更改，因此資料丟失。某些快照隔離的實現會自動防止此異常，而其他實現需要手動鎖（`SELECT FOR UPDATE`）。

寫偏差
: 事務讀取某些內容，根據它看到的值做出決定，並將決定寫入資料庫。但是，在進行寫入時，決策的前提不再為真。只有可序列化隔離才能防止此異常。

幻讀
: 事務讀取匹配某些搜尋條件的物件。另一個客戶端進行影響該搜尋結果的寫入。快照隔離防止直接的幻讀，但寫偏差上下文中的幻讀需要特殊處理，例如索引範圍鎖。

弱隔離級別可以防止某些異常，但讓你（應用程式開發人員）手動處理其他異常（例如，使用顯式鎖定）。只有可序列化隔離可以防止所有這些問題。我們討論了實現可序列化事務的三種不同方法：

字面上序列執行事務
: 如果你可以使每個事務執行得非常快（通常透過使用儲存過程），並且事務吞吐量足夠低，可以在單個 CPU 核心上處理或可以分片，這是一個簡單有效的選擇。

兩階段鎖定
: 幾十年來，這一直是實現可序列化的標準方法，但許多應用程式由於其效能不佳而避免使用它。

可序列化快照隔離（SSI）
: 一種相對較新的演算法，避免了前面方法的大部分缺點。它使用樂觀方法，允許事務在不阻塞的情況下進行。當事務想要提交時，它會被檢查，如果執行不可序列化，它將被中止。

最後，我們研究了當事務分佈在多個節點上時如何實現原子性，使用兩階段提交。如果這些節點都執行相同的資料庫軟體，分散式事務可以很好地工作，但跨不同儲存技術（使用 XA 事務），2PC 是有問題的：它對協調器和驅動事務的應用程式程式碼中的故障非常敏感，並且與併發控制機制的互動很差。幸運的是，冪等性可以確保恰好一次語義，而無需跨不同儲存技術的原子提交，我們將在後面的章節中看到更多相關內容。

本章中的示例使用了關係資料模型。但是，如["多物件事務的需求"](#sec_transactions_need)中所討論的，無論使用哪種資料模型，事務都是有價值的資料庫功能。


### 參考


[^1]: Steven J. Murdoch. [What went wrong with Horizon: learning from the Post Office Trial](https://www.benthamsgaze.org/2021/07/15/what-went-wrong-with-horizon-learning-from-the-post-office-trial/). *benthamsgaze.org*, July 2021. Archived at [perma.cc/CNM4-553F](https://perma.cc/CNM4-553F)
[^2]: Donald D. Chamberlin, Morton M. Astrahan, Michael W. Blasgen, James N. Gray, W. Frank King, Bruce G. Lindsay, Raymond Lorie, James W. Mehl, Thomas G. Price, Franco Putzolu, Patricia Griffiths Selinger, Mario Schkolnick, Donald R. Slutz, Irving L. Traiger, Bradford W. Wade, and Robert A. Yost. [A History and Evaluation of System R](https://dsf.berkeley.edu/cs262/2005/SystemR.pdf). *Communications of the ACM*, volume 24, issue 10, pages 632–646, October 1981. [doi:10.1145/358769.358784](https://doi.org/10.1145/358769.358784)
[^3]: Jim N. Gray, Raymond A. Lorie, Gianfranco R. Putzolu, and Irving L. Traiger. [Granularity of Locks and Degrees of Consistency in a Shared Data Base](https://citeseerx.ist.psu.edu/pdf/e127f0a6a912bb9150ecfe03c0ebf7fbc289a023). in *Modelling in Data Base Management Systems: Proceedings of the IFIP Working Conference on Modelling in Data Base Management Systems*, edited by G. M. Nijssen, pages 364–394, Elsevier/North Holland Publishing, 1976. Also in *Readings in Database Systems*, 4th edition, edited by Joseph M. Hellerstein and Michael Stonebraker, MIT Press, 2005. ISBN: 978-0-262-69314-1
[^4]: Kapali P. Eswaran, Jim N. Gray, Raymond A. Lorie, and Irving L. Traiger. [The Notions of Consistency and Predicate Locks in a Database System](https://jimgray.azurewebsites.net/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf?from=https://research.microsoft.com/en-us/um/people/gray/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf). *Communications of the ACM*, volume 19, issue 11, pages 624–633, November 1976. [doi:10.1145/360363.360369](https://doi.org/10.1145/360363.360369)
[^5]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/pdf/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134)
[^6]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535)
[^7]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2012.
[^8]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559)
[^9]: Theo Härder and Andreas Reuter. [Principles of Transaction-Oriented Database Recovery](https://citeseerx.ist.psu.edu/pdf/11ef7c142295aeb1a28a0e714c91fc8d610c3047). *ACM Computing Surveys*, volume 15, issue 4, pages 287–317, December 1983. [doi:10.1145/289.291](https://doi.org/10.1145/289.291)
[^10]: Peter Bailis, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [HAT, not CAP: Towards Highly Available Transactions](https://www.usenix.org/system/files/conference/hotos13/hotos13-final80.pdf). At *14th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2013.
[^11]: Armando Fox, Steven D. Gribble, Yatin Chawathe, Eric A. Brewer, and Paul Gauthier. [Cluster-Based Scalable Network Services](https://people.eecs.berkeley.edu/~brewer/cs262b/TACC.pdf). At *16th ACM Symposium on Operating Systems Principles* (SOSP), October 1997. [doi:10.1145/268998.266662](https://doi.org/10.1145/268998.266662)
[^12]: Tony Andrews. [Enforcing Complex Constraints in Oracle](https://tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html). *tonyandrews.blogspot.co.uk*, October 2004. Archived at [archive.org](https://web.archive.org/web/20220201190625/https%3A//tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html)
[^13]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/).
[^14]: Alan Fekete, Dimitrios Liarokapis, Elizabeth O’Neil, Patrick O’Neil, and Dennis Shasha. [Making Snapshot Isolation Serializable](https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/2009/Papers/p492-fekete.pdf). *ACM Transactions on Database Systems*, volume 30, issue 2, pages 492–528, June 2005. [doi:10.1145/1071610.1071615](https://doi.org/10.1145/1071610.1071615)
[^15]: Mai Zheng, Joseph Tucek, Feng Qin, and Mark Lillibridge. [Understanding the Robustness of SSDs Under Power Fault](https://www.usenix.org/system/files/conference/fast13/fast13-final80.pdf). At *11th USENIX Conference on File and Storage Technologies* (FAST), February 2013.
[^16]: Laurie Denness. [SSDs: A Gift and a Curse](https://laur.ie/blog/2015/06/ssds-a-gift-and-a-curse/). *laur.ie*, June 2015. Archived at [perma.cc/6GLP-BX3T](https://perma.cc/6GLP-BX3T)
[^17]: Adam Surak. [When Solid State Drives Are Not That Solid](https://www.algolia.com/blog/engineering/when-solid-state-drives-are-not-that-solid). *blog.algolia.com*, June 2015. Archived at [perma.cc/CBR9-QZEE](https://perma.cc/CBR9-QZEE)
[^18]: Hewlett Packard Enterprise. [Bulletin: (Revision) HPE SAS Solid State Drives - Critical Firmware Upgrade Required for Certain HPE SAS Solid State Drive Models to Prevent Drive Failure at 32,768 Hours of Operation](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/CZR4-AQBS](https://perma.cc/CZR4-AQBS)
[^19]: Craig Ringer et al. [PostgreSQL’s handling of fsync() errors is unsafe and risks data loss at least on XFS](https://www.postgresql.org/message-id/flat/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com). Email thread on pgsql-hackers mailing list, *postgresql.org*, March 2018. Archived at [perma.cc/5RKU-57FL](https://perma.cc/5RKU-57FL)
[^20]: Anthony Rebello, Yuvraj Patel, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Can Applications Recover from fsync Failures?](https://www.usenix.org/conference/atc20/presentation/rebello) At *USENIX Annual Technical Conference* (ATC), July 2020.
[^21]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Crash Consistency: Rethinking the Fundamental Abstractions of the File System](https://dl.acm.org/doi/pdf/10.1145/2800695.2801719). *ACM Queue*, volume 13, issue 7, pages 20–28, July 2015. [doi:10.1145/2800695.2801719](https://doi.org/10.1145/2800695.2801719)
[^22]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
[^23]: Chris Siebenmann. [Unix’s File Durability Problem](https://utcc.utoronto.ca/~cks/space/blog/unix/FileSyncProblem). *utcc.utoronto.ca*, April 2016. Archived at [perma.cc/VSS8-5MC4](https://perma.cc/VSS8-5MC4)
[^24]: Aishwarya Ganesan, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Redundancy Does Not Imply Fault Tolerance: Analysis of Distributed Storage Reactions to Single Errors and Corruptions](https://www.usenix.org/conference/fast17/technical-sessions/presentation/ganesan). At *15th USENIX Conference on File and Storage Technologies* (FAST), February 2017.
[^25]: Lakshmi N. Bairavasundaram, Garth R. Goodson, Bianca Schroeder, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [An Analysis of Data Corruption in the Storage Stack](https://www.usenix.org/legacy/event/fast08/tech/full_papers/bairavasundaram/bairavasundaram.pdf). At *6th USENIX Conference on File and Storage Technologies* (FAST), February 2008.
[^26]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/conference/fast16/technical-sessions/presentation/schroeder). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
[^27]: Don Allison. [SSD Storage – Ignorance of Technology Is No Excuse](https://blog.korelogic.com/blog/2015/03/24). *blog.korelogic.com*, March 2015. Archived at [perma.cc/9QN4-9SNJ](https://perma.cc/9QN4-9SNJ)
[^28]: Gordon Mah Ung. [Debunked: Your SSD won’t lose data if left unplugged after all](https://www.pcworld.com/article/427602/debunked-your-ssd-wont-lose-data-if-left-unplugged-after-all.html). *pcworld.com*, May 2015. Archived at [perma.cc/S46H-JUDU](https://perma.cc/S46H-JUDU)
[^29]: Martin Kleppmann. [Hermitage: Testing the ‘I’ in ACID](https://martin.kleppmann.com/2014/11/25/hermitage-testing-the-i-in-acid.html). *martin.kleppmann.com*, November 2014. Archived at [perma.cc/KP2Y-AQGK](https://perma.cc/KP2Y-AQGK)
[^30]: Todd Warszawski and Peter Bailis. [ACIDRain: Concurrency-Related Attacks on Database-Backed Web Applications](http://www.bailis.org/papers/acidrain-sigmod2017.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3064037](https://doi.org/10.1145/3035918.3064037)
[^31]: Tristan D’Agosta. [BTC Stolen from Poloniex](https://bitcointalk.org/index.php?topic=499580). *bitcointalk.org*, March 2014. Archived at [perma.cc/YHA6-4C5D](https://perma.cc/YHA6-4C5D)
[^32]: bitcointhief2. [How I Stole Roughly 100 BTC from an Exchange and How I Could Have Stolen More!](https://www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/) *reddit.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20250118042610/https%3A//www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/)
[^33]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan. [Automating the Detection of Snapshot Isolation Anomalies](https://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^34]: Michael Melanson. [Transactions: The Limits of Isolation](https://www.michaelmelanson.net/posts/transactions-the-limits-of-isolation/). *michaelmelanson.net*, November 2014. Archived at [perma.cc/RG5R-KMYZ](https://perma.cc/RG5R-KMYZ)
[^35]: Edward Kim. [How ACH works: A developer perspective — Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/7B2H-PU94](https://perma.cc/7B2H-PU94)
[^36]: Hal Berenson, Philip A. Bernstein, Jim N. Gray, Jim Melton, Elizabeth O’Neil, and Patrick O’Neil. [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 1995. [doi:10.1145/568271.223785](https://doi.org/10.1145/568271.223785)
[^37]: Atul Adya. [Weak Consistency: A Generalized Theory and Optimistic Implementations for Distributed Transactions](https://pmg.csail.mit.edu/papers/adya-phd.pdf). PhD Thesis, Massachusetts Institute of Technology, March 1999. Archived at [perma.cc/E97M-HW5Q](https://perma.cc/E97M-HW5Q)
[^38]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). At *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
[^39]: Natacha Crooks, Youer Pu, Lorenzo Alvisi, and Allen Clement. [Seeing is Believing: A Client-Centric Specification of Database Isolation](https://www.cs.cornell.edu/lorenzo/papers/Crooks17Seeing.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), pages 73–82, July 2017. [doi:10.1145/3087801.3087802](https://doi.org/10.1145/3087801.3087802)
[^40]: Bruce Momjian. [MVCC Unmasked](https://momjian.us/main/writings/pgsql/mvcc.pdf). *momjian.us*, July 2014. Archived at [perma.cc/KQ47-9GYB](https://perma.cc/KQ47-9GYB)
[^41]: Peter Alvaro and Kyle Kingsbury. [MySQL 8.0.34](https://jepsen.io/analyses/mysql-8.0.34). *jepsen.io*, December 2023. Archived at [perma.cc/HGE2-Z878](https://perma.cc/HGE2-Z878)
[^42]: Egor Rogov. [PostgreSQL 14 Internals](https://postgrespro.com/community/books/internals). *postgrespro.com*, April 2023. Archived at [perma.cc/FRK2-D7WB](https://perma.cc/FRK2-D7WB)
[^43]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^44]: Rohan Reddy Alleti. [Internals of MVCC in Postgres: Hidden costs of Updates vs Inserts](https://medium.com/%40rohanjnr44/internals-of-mvcc-in-postgres-hidden-costs-of-updates-vs-inserts-381eadd35844). *medium.com*, March 2025. Archived at [perma.cc/3ACX-DFXT](https://perma.cc/3ACX-DFXT)
[^45]: Andy Pavlo and Bohan Zhang. [The Part of PostgreSQL We Hate the Most](https://www.cs.cmu.edu/~pavlo/blog/2023/04/the-part-of-postgresql-we-hate-the-most.html). *cs.cmu.edu*, April 2023. Archived at [perma.cc/XSP6-3JBN](https://perma.cc/XSP6-3JBN)
[^46]: Yingjun Wu, Joy Arulraj, Jiexi Lin, Ran Xian, and Andrew Pavlo. [An empirical evaluation of in-memory multi-version concurrency control](https://vldb.org/pvldb/vol10/p781-Wu.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 7, pages 781–792, March 2017. [doi:10.14778/3067421.3067427](https://doi.org/10.14778/3067421.3067427)
[^47]: Nikita Prokopov. [Unofficial Guide to Datomic Internals](https://tonsky.me/blog/unofficial-guide-to-datomic-internals/). *tonsky.me*, May 2014.
[^48]: Daniil Svetlov. [A Practical Guide to Taming Postgres Isolation Anomalies](https://dansvetlov.me/postgres-anomalies/). *dansvetlov.me*, March 2025. Archived at [perma.cc/L7LE-TDLS](https://perma.cc/L7LE-TDLS)
[^49]: Nate Wiger. [An Atomic Rant](https://nateware.com/2010/02/18/an-atomic-rant/). *nateware.com*, February 2010. Archived at [perma.cc/5ZYB-PE44](https://perma.cc/5ZYB-PE44)
[^50]: James Coglan. [Reading and writing, part 3: web applications](https://blog.jcoglan.com/2020/10/12/reading-and-writing-part-3/). *blog.jcoglan.com*, October 2020. Archived at [perma.cc/A7EK-PJVS](https://perma.cc/A7EK-PJVS)
[^51]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](https://doi.org/10.1145/2723372.2737784)
[^52]: Jaana Dogan. [Things I Wished More Developers Knew About Databases](https://rakyll.medium.com/things-i-wished-more-developers-knew-about-databases-2d0178464f78). *rakyll.medium.com*, April 2020. Archived at [perma.cc/6EFK-P2TD](https://perma.cc/6EFK-P2TD)
[^53]: Michael J. Cahill, Uwe Röhm, and Alan Fekete. [Serializable Isolation for Snapshot Databases](https://www.cs.cornell.edu/~sowell/dbpapers/serializable_isolation.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376690](https://doi.org/10.1145/1376616.1376690)
[^54]: Dan R. K. Ports and Kevin Grittner. [Serializable Snapshot Isolation in PostgreSQL](https://drkp.net/papers/ssi-vldb12.pdf). At *38th International Conference on Very Large Databases* (VLDB), August 2012.
[^55]: Douglas B. Terry, Marvin M. Theimer, Karin Petersen, Alan J. Demers, Mike J. Spreitzer and Carl H. Hauser. [Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](https://pdos.csail.mit.edu/6.824/papers/bayou-conflicts.pdf). At *15th ACM Symposium on Operating Systems Principles* (SOSP), December 1995. [doi:10.1145/224056.224070](https://doi.org/10.1145/224056.224070)
[^56]: Hans-Jürgen Schönig. [Constraints over multiple rows in PostgreSQL](https://www.cybertec-postgresql.com/en/postgresql-constraints-over-multiple-rows/). *cybertec-postgresql.com*, June 2021. Archived at [perma.cc/2TGH-XUPZ](https://perma.cc/2TGH-XUPZ)
[^57]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^58]: John Hugg. [H-Store/VoltDB Architecture vs. CEP Systems and Newer Streaming Architectures](https://www.youtube.com/watch?v=hD5M4a1UVz8). At *Data @Scale Boston*, November 2014.
[^59]: Robert Kallman, Hideaki Kimura, Jonathan Natkins, Andrew Pavlo, Alexander Rasin, Stanley Zdonik, Evan P. C. Jones, Samuel Madden, Michael Stonebraker, Yang Zhang, John Hugg, and Daniel J. Abadi. [H-Store: A High-Performance, Distributed Main Memory Transaction Processing System](https://www.vldb.org/pvldb/vol1/1454211.pdf). *Proceedings of the VLDB Endowment*, volume 1, issue 2, pages 1496–1499, August 2008.
[^60]: Rich Hickey. [The Architecture of Datomic](https://www.infoq.com/articles/Architecture-Datomic/). *infoq.com*, November 2012. Archived at [perma.cc/5YWU-8XJK](https://perma.cc/5YWU-8XJK)
[^61]: John Hugg. [Debunking Myths About the VoltDB In-Memory Database](https://dzone.com/articles/debunking-myths-about-voltdb). *dzone.com*, May 2014. Archived at [perma.cc/2Z9N-HPKF](https://perma.cc/2Z9N-HPKF)
[^62]: Xinjing Zhou, Viktor Leis, Xiangyao Yu, and Michael Stonebraker. [OLTP Through the Looking Glass 16 Years Later: Communication is the New Bottleneck](https://www.vldb.org/cidrdb/papers/2025/p17-zhou.pdf). At *15th Annual Conference on Innovative Data Systems Research* (CIDR), January 2025.
[^63]: Xinjing Zhou, Xiangyao Yu, Goetz Graefe, and Michael Stonebraker. [Lotus: scalable multi-partition transactions on single-threaded partitioned databases](https://www.vldb.org/pvldb/vol15/p2939-zhou.pdf). *Proceedings of the VLDB Endowment* (PVLDB), volume 15, issue 11, pages 2939–2952, July 2022. [doi:10.14778/3551793.3551843](https://doi.org/10.14778/3551793.3551843)
[^64]: Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton. [Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf). *Foundations and Trends in Databases*, volume 1, issue 2, pages 141–259, November 2007. [doi:10.1561/1900000002](https://doi.org/10.1561/1900000002)
[^65]: Michael J. Cahill. [Serializable Isolation for Snapshot Databases](https://ses.library.usyd.edu.au/bitstream/handle/2123/5353/michael-cahill-2009-thesis.pdf). PhD Thesis, University of Sydney, July 2009. Archived at [perma.cc/727J-NTMP](https://perma.cc/727J-NTMP)
[^66]: Cristian Diaconu, Craig Freedman, Erik Ismert, Per-Åke Larson, Pravin Mittal, Ryan Stonecipher, Nitin Verma, and Mike Zwilling. [Hekaton: SQL Server’s Memory-Optimized OLTP Engine](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1243–1254, June 2013. [doi:10.1145/2463676.2463710](https://doi.org/10.1145/2463676.2463710)
[^67]: Thomas Neumann, Tobias Mühlbauer, and Alfons Kemper. [Fast Serializable Multi-Version Concurrency Control for Main-Memory Database Systems](https://db.in.tum.de/~muehlbau/papers/mvcc.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 677–689, May 2015. [doi:10.1145/2723372.2749436](https://doi.org/10.1145/2723372.2749436)
[^68]: D. Z. Badal. [Correctness of Concurrency Control and Implications in Distributed Databases](https://ieeexplore.ieee.org/abstract/document/762563). At *3rd International IEEE Computer Software and Applications Conference* (COMPSAC), November 1979. [doi:10.1109/CMPSAC.1979.762563](https://doi.org/10.1109/CMPSAC.1979.762563)
[^69]: Rakesh Agrawal, Michael J. Carey, and Miron Livny. [Concurrency Control Performance Modeling: Alternatives and Implications](https://people.eecs.berkeley.edu/~brewer/cs262/ConcControl.pdf). *ACM Transactions on Database Systems* (TODS), volume 12, issue 4, pages 609–654, December 1987. [doi:10.1145/32204.32220](https://doi.org/10.1145/32204.32220)
[^70]: Marc Brooker. [Snapshot Isolation vs Serializability](https://brooker.co.za/blog/2024/12/17/occ-and-isolation.html). *brooker.co.za*, December 2024. Archived at [perma.cc/5TRC-CR5G](https://perma.cc/5TRC-CR5G)
[^71]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD)
[^72]: C. Mohan, Bruce G. Lindsay, and Ron Obermarck. [Transaction Management in the R\* Distributed Database Management System](https://cs.brown.edu/courses/csci2270/archives/2012/papers/dtxn/p378-mohan.pdf). *ACM Transactions on Database Systems*, volume 11, issue 4, pages 378–396, December 1986. [doi:10.1145/7239.7266](https://doi.org/10.1145/7239.7266)
[^73]: X/Open Company Ltd. [Distributed Transaction Processing: The XA Specification](https://pubs.opengroup.org/onlinepubs/009680699/toc.pdf). Technical Standard XO/CAE/91/300, December 1991. ISBN: 978-1-872-63024-3, archived at [perma.cc/Z96H-29JB](https://perma.cc/Z96H-29JB)
[^74]: Ivan Silva Neto and Francisco Reverbel. [Lessons Learned from Implementing WS-Coordination and WS-AtomicTransaction](https://www.ime.usp.br/~reverbel/papers/icis2008.pdf). At *7th IEEE/ACIS International Conference on Computer and Information Science* (ICIS), May 2008. [doi:10.1109/ICIS.2008.75](https://doi.org/10.1109/ICIS.2008.75)
[^75]: James E. Johnson, David E. Langworthy, Leslie Lamport, and Friedrich H. Vogt. [Formal Specification of a Web Services Protocol](https://www.microsoft.com/en-us/research/publication/formal-specification-of-a-web-services-protocol/). At *1st International Workshop on Web Services and Formal Methods* (WS-FM), February 2004. [doi:10.1016/j.entcs.2004.02.022](https://doi.org/10.1016/j.entcs.2004.02.022)
[^76]: Jim Gray. [The Transaction Concept: Virtues and Limitations](https://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf). At *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
[^77]: Dale Skeen. [Nonblocking Commit Protocols](https://www.cs.utexas.edu/~lorenzo/corsi/cs380d/papers/Ske81.pdf). At *ACM International Conference on Management of Data* (SIGMOD), April 1981. [doi:10.1145/582318.582339](https://doi.org/10.1145/582318.582339)
[^78]: Gregor Hohpe. [Your Coffee Shop Doesn’t Use Two-Phase Commit](https://www.martinfowler.com/ieeeSoftware/coffeeShop.pdf). *IEEE Software*, volume 22, issue 2, pages 64–66, March 2005. [doi:10.1109/MS.2005.52](https://doi.org/10.1109/MS.2005.52)
[^79]: Pat Helland. [Life Beyond Distributed Transactions: An Apostate’s Opinion](https://www.cidrdb.org/cidr2007/papers/cidr07p15.pdf). At *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
[^80]: Jonathan Oliver. [My Beef with MSDTC and Two-Phase Commits](https://blog.jonathanoliver.com/my-beef-with-msdtc-and-two-phase-commits/). *blog.jonathanoliver.com*, April 2011. Archived at [perma.cc/K8HF-Z4EN](https://perma.cc/K8HF-Z4EN)
[^81]: Oren Eini (Ayende Rahien). [The Fallacy of Distributed Transactions](https://ayende.com/blog/167362/the-fallacy-of-distributed-transactions). *ayende.com*, July 2014. Archived at [perma.cc/VB87-2JEF](https://perma.cc/VB87-2JEF)
[^82]: Clemens Vasters. [Transactions in Windows Azure (with Service Bus) – An Email Discussion](https://learn.microsoft.com/en-gb/archive/blogs/clemensv/transactions-in-windows-azure-with-service-bus-an-email-discussion). *learn.microsoft.com*, July 2012. Archived at [perma.cc/4EZ9-5SKW](https://perma.cc/4EZ9-5SKW)
[^83]: Ajmer Dhariwal. [Orphaned MSDTC Transactions (-2 spids)](https://www.eraofdata.com/posts/2008/orphaned-msdtc-transactions-2-spids/). *eraofdata.com*, December 2008. Archived at [perma.cc/YG6F-U34C](https://perma.cc/YG6F-U34C)
[^84]: Paul Randal. [Real World Story of DBCC PAGE Saving the Day](https://www.sqlskills.com/blogs/paul/real-world-story-of-dbcc-page-saving-the-day/). *sqlskills.com*, June 2013. Archived at [perma.cc/2MJN-A5QH](https://perma.cc/2MJN-A5QH)
[^85]: Guozhang Wang, Lei Chen, Ayusman Dikshit, Jason Gustafson, Boyang Chen, Matthias J. Sax, John Roesler, Sophie Blee-Goldman, Bruno Cadonna, Apurva Mehta, Varun Madan, and Jun Rao. [Consistency and Completeness: Rethinking Distributed Stream Processing in Apache Kafka](https://dl.acm.org/doi/pdf/10.1145/3448016.3457556). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457556](https://doi.org/10.1145/3448016.3457556)

================================================
FILE: content/tw/ch9.md
================================================
---
title: "9. 分散式系統的麻煩"
weight: 209
breadcrumbs: false
---

<a id="ch_distributed"></a>

![](/map/ch08.png)

> *意外這東西挺有意思：你沒碰上之前，它就從來不會發生。*
>
> A.A. 米爾恩，《小熊維尼和老灰驢的家》（1928）

正如 ["可靠性與容錯"](/tw/ch2#sec_introduction_reliability) 中所討論的，讓系統可靠意味著確保系統作為一個整體繼續工作，即使出了問題（即出現故障）。然而，預料所有可能的故障並處理它們並不是那麼容易。作為開發者，我們很容易主要關注正常路徑（畢竟，大多數時候事情都執行良好！）而忽略故障，因為故障會引入大量邊界情況。

如果你希望系統在故障存在的情況下仍然可靠，你必須從根本上改變你的思維方式，並專注於可能出錯的事情，即使它們可能性很低。一件事情出錯的機率是否只有百萬分之一並不重要：在一個足夠大的系統中，百萬分之一的事件每天都在發生。經驗豐富的系統操作員會告訴你，任何 *可能* 出錯的事情 *都會* 出錯。

此外，使用分散式系統與在單臺計算機上編寫軟體有著根本的不同 —— 主要區別在於有許多新的、令人興奮的出錯方式 [^1] [^2]。在本章中，你將體驗實踐中出現的問題，並理解你可以依賴和不能依賴的事物。

為了理解我們面臨的挑戰，我們現在將把悲觀情緒發揮到極致，探索分散式系統中可能出錯的事情。我們將研究網路問題（["不可靠的網路"](#sec_distributed_networks)）以及時鐘和時序問題（["不可靠的時鐘"](#sec_distributed_clocks)）。所有這些問題的後果令人迷惑，因此我們將探索如何思考分散式系統的狀態以及如何推理已經發生的事情（["知識、真相與謊言"](#sec_distributed_truth)）。稍後，在 [第 10 章](/tw/ch10#ch_consistency) 中，我們將看一些面對這些故障時如何實現容錯的例子。

## 故障與部分失效 {#sec_distributed_partial_failure}

當你在單臺計算機上編寫程式時，它通常以相當可預測的方式執行：要麼工作，要麼不工作。有缺陷的軟體可能會給人一種計算機有時 "狀態不佳" 的印象（這個問題通常透過重啟來解決），但這主要只是編寫不良的軟體的後果。

軟體在單臺計算機上不應該是不穩定的，這沒有根本原因：當硬體正常工作時，相同的操作總是產生相同的結果（它是 *確定性的*）。如果存在硬體問題（例如，記憶體損壞或聯結器鬆動），後果通常是整個系統故障（例如，核心恐慌、"藍色畫面宕機"、無法啟動）。一臺執行良好軟體的單獨計算機通常要麼完全正常執行，要麼完全故障，而不是介於兩者之間。

這是計算機設計中的一個刻意選擇：如果發生內部故障，我們寧願計算機完全崩潰而不是返回錯誤的結果，因為錯誤的結果很難處理且令人困惑。因此，計算機隱藏了它們所實現的模糊物理現實，並呈現一個以數學完美執行的理想化系統模型。CPU 指令總是做同樣的事情；如果你將一些資料寫入記憶體或磁碟，該資料保持完整，不會被隨機損壞。正如 ["硬體與軟體故障"](/tw/ch2#sec_introduction_hardware_faults) 中所討論的，這實際上並不是真的 —— 實際上，資料確實會被靜默損壞，CPU 有時會靜默返回錯誤的結果 —— 但這種情況發生得足夠少，以至於我們可以忽略它。

當你編寫在多臺計算機上執行的軟體，透過網路連線時，情況就根本不同了。在分散式系統中，故障發生得更加頻繁，因此我們不能再忽略它們 —— 我們別無選擇，只能直面物理世界的混亂現實。在物理世界中，可能出錯的事情範圍非常廣泛，正如這個軼事所說明的 [^3]：

> 在我有限的經驗中，我處理過單個數據中心（DC）中的長期網路分割槽、PDU [配電單元] 故障、交換機故障、整個機架的意外斷電、整個 DC 骨幹網故障、整個 DC 電源故障，以及一個低血糖的司機將他的福特皮卡撞進 DC 的 HVAC [供暖、通風和空調] 系統。而我甚至不是運維人員。
>
> —— Coda Hale

在分散式系統中，系統的某些部分可能以某種不可預測的方式出現故障，即使系統的其他部分工作正常。這被稱為 *部分失效*。困難在於部分失效是 *非確定性的*：如果你嘗試做任何涉及多個節點和網路的事情，它有時可能工作，有時可能不可預測地失敗。正如我們將看到的，你甚至可能不 *知道* 某事是否成功！

這種非確定性和部分失效的可能性使分散式系統難以使用 [^4]。另一方面，如果分散式系統可以容忍部分失效，這將開啟強大的可能性：例如，它允許你執行滾動升級，一次重啟一個節點以安裝軟體更新，而系統作為一個整體繼續不間斷地工作。因此，容錯使我們能夠從不可靠的元件構建比單節點系統更可靠的分散式系統。

但在我們實現容錯之前，我們需要更多地瞭解我們應該容忍的故障。重要的是要考慮各種可能的故障 —— 即使是相當不太可能的故障 —— 並在你的測試環境中人為地建立這種情況以檢視會發生什麼。在分散式系統中，懷疑、悲觀和偏執是有回報的。

## 不可靠的網路 {#sec_distributed_networks}

正如 ["共享記憶體、共享磁碟和無共享架構"](/tw/ch2#sec_introduction_shared_nothing) 中所討論的，我們在本書中關注的分散式系統主要是 *無共享系統*：即透過網路連線的一組機器。網路是這些機器進行通訊的唯一方式 —— 我們假設每臺機器都有自己的記憶體和磁碟，一臺機器不能訪問另一臺機器的記憶體或磁碟（除非透過網路向服務發出請求）。即使儲存是共享的，例如亞馬遜的 S3，機器也是透過網路與共享儲存服務通訊。

網際網路和資料中心中的大多數內部網路（通常是乙太網）都是 *非同步分組網路*。在這種網路中，一個節點可以向另一個節點發送訊息（資料包），但網路不保證它何時到達，或者是否會到達。如果你傳送請求並期望響應，許多事情可能會出錯（其中一些如 [圖 9-1](#fig_distributed_network) 所示）：

1. 你的請求可能已經丟失（也許有人拔掉了網線）。
2. 你的請求可能在佇列中等待，稍後將被交付（也許網路或接收方過載）。
3. 遠端節點可能已經失效（也許它崩潰了或被關閉了）。
4. 遠端節點可能暫時停止響應（也許它正在經歷長時間的垃圾回收暫停；見 ["程序暫停"](#sec_distributed_clocks_pauses)），但稍後會再次開始響應。
5. 遠端節點可能已經處理了你的請求，但響應在網路上丟失了（也許網路交換機配置錯誤）。
6. 遠端節點可能已經處理了你的請求，但響應被延遲了，稍後將被交付（也許網路或你自己的機器過載）。

{{< figure src="/fig/ddia_0901.png" id="fig_distributed_network" caption="圖 9-1. 如果你傳送請求但沒有收到響應，無法區分是 (a) 請求丟失了，(b) 遠端節點宕機了，還是 (c) 響應丟失了。" class="w-full my-4" >}}


傳送方甚至無法判斷資料包是否已交付：唯一的選擇是讓接收方傳送響應訊息，而響應訊息本身也可能丟失或延遲。在非同步網路中，這些問題是無法區分的：你擁有的唯一資訊是你還沒有收到響應。如果你向另一個節點發送請求但沒有收到響應，*不可能* 判斷原因。

處理這個問題的常用方法是 *超時*：在一段時間後，你放棄等待並假設響應不會到達。然而，當超時發生時，你仍然不知道遠端節點是否收到了你的請求（如果請求仍在某處排隊，即使傳送方已經放棄了它，它仍可能被交付給接收方）。

### TCP 的侷限性 {#sec_distributed_tcp}

網路資料包有最大大小（通常為幾千位元組），但許多應用程式需要傳送太大而無法裝入一個資料包的訊息（請求、響應）。這些應用程式最常使用 TCP（傳輸控制協議）來建立一個 *連線*，將大型資料流分解為單個資料包，並在接收端將它們重新組合起來。

--------

> [!NOTE]
> 我們關於 TCP 的大部分內容也適用於其更新的替代方案 QUIC，以及 WebRTC 中使用的流控制傳輸協議（SCTP）、BitTorrent uTP 協議和其他傳輸協議。有關與 UDP 的比較，請參見 ["TCP 與 UDP"](#sidebar_distributed_tcp_udp)。

--------

TCP 通常被描述為提供 "可靠" 的交付，從某種意義上說，它檢測並重傳丟棄的資料包，檢測重新排序的資料包並將它們恢復到正確的順序，並使用簡單的校驗和檢測資料包損壞。它還計算出可以傳送資料的速度，以便儘快傳輸資料，但不會使網路或接收節點過載；這被稱為 *擁塞控制*、*流量控制* 或 *背壓* [^5]。

當你透過將資料寫入套接字來 "傳送" 一些資料時，它實際上不會立即傳送，而只是放置在由作業系統管理的緩衝區中。當擁塞控制演算法決定它有能力傳送資料包時，它會從該緩衝區中獲取下一個資料包的資料並將其傳遞給網路介面。資料包通過幾個交換機和路由器，最終接收節點的作業系統將資料包的資料放置在接收緩衝區中並向傳送方傳送確認資料包。只有這樣，接收作業系統才會通知應用程式有更多資料到達 [^6]。

那麼，如果 TCP 提供 "可靠性"，這是否意味著我們不再需要擔心網路不可靠？不幸的是不是。如果在某個超時時間內沒有收到確認，它會認為資料包一定已經丟失，但 TCP 也無法判斷是出站資料包還是確認丟失了。儘管 TCP 可以重新發送資料包，但它不能保證新資料包也會通過。如果網線被拔掉，TCP 不能為你重新插上它。最終，在可配置的超時後，TCP 放棄並向應用程式發出錯誤訊號。

如果 TCP 連線因錯誤而關閉 —— 也許是因為遠端節點崩潰了，或者是因為網路被中斷了 —— 你不幸地無法知道遠端節點實際處理了多少資料 [^6]。即使 TCP 確認資料包已交付，這也僅意味著遠端節點上的作業系統核心收到了它，但應用程式可能在處理該資料之前就崩潰了。如果你想確保請求成功，你需要應用層返回明確的成功響應 [^7]。

儘管如此，TCP 非常有用，因為它提供了一種方便的方式來發送和接收太大而無法裝入一個資料包的訊息。一旦建立了 TCP 連線，你還可以使用它來發送多個請求和響應。這通常是透過首先發送一個標頭來完成的，該標頭以位元組為單位指示後續訊息的長度，然後是實際訊息。HTTP 和許多 RPC 協議（見 ["透過服務的資料流：REST 和 RPC"](/tw/ch5#sec_encoding_dataflow_rpc)）就是這樣工作的。

### 實踐中的網路故障 {#sec_distributed_network_faults}

我們已經建立計算機網路幾十年了 —— 人們可能希望到現在我們已經弄清楚如何使它們可靠。不幸的是，我們還沒有成功。有一些系統研究和大量軼事證據表明，網路問題可能出人意料地常見，即使在由一家公司運營的受控環境（如資料中心）中也是如此 [^8]：

* 一項在中型資料中心的研究發現，每月約有 12 次網路故障，其中一半斷開了單臺機器，一半斷開了整個機架 [^9]。
* 另一項研究測量了元件（如機架頂部交換機、匯聚交換機和負載均衡器）的故障率 [^10]。它發現，新增冗餘網路裝置並不能像你希望的那樣減少故障，因為它不能防範人為錯誤（例如，配置錯誤的交換機），這是停機的主要原因。
* 廣域光纖鏈路的中斷被歸咎於奶牛 [^11]、海狸 [^12] 和鯊魚 [^13]（儘管由於海底電纜遮蔽更好，鯊魚咬傷已經變得更加罕見 [^14]）。人類也有過錯，無論是由於意外配置錯誤 [^15]、拾荒 [^16] 還是破壞 [^17]。
* 在不同的雲區域之間，已經觀察到高百分位數下長達幾 *分鐘* 的往返時間 [^18]。即使在單個數據中心內，在網路拓撲重新配置期間（由交換機軟體升級期間的問題觸發），也可能發生超過一分鐘的資料包延遲 [^19]。因此，我們必須假設訊息可能被任意延遲。
* 有時通訊部分中斷，這取決於你在和誰交談：例如，A 和 B 可以通訊，B 和 C 可以通訊，但 A 和 C 不能 [^20] [^21]。其他令人驚訝的故障包括網路介面有時會丟棄所有入站資料包但成功傳送出站資料包 [^22]：僅僅因為網路鏈路在一個方向上工作並不能保證它在相反方向上也工作。
* 即使是短暫的網路中斷也可能產生比原始問題持續時間更長的影響 [^8] [^20] [^23]。

--------

> [!TIP] 網路分割槽
>
> 當網路的一部分由於網路故障而與其餘部分隔離時，有時稱為 *網路分割槽* 或 *網路分裂*，但它與其他型別的網路中斷沒有根本區別。網路分割槽與儲存系統的分片無關，後者有時也稱為 *分割槽*（見 [第 7 章](/tw/ch7#ch_sharding)）。

--------

即使網路故障在你的環境中很少見，故障 *可能* 發生的事實意味著你的軟體需要能夠處理它們。每當透過網路進行任何通訊時，它都可能失敗 —— 這是無法避免的。

如果網路故障的錯誤處理沒有定義和測試，可能會發生任意糟糕的事情：例如，叢集可能會陷入死鎖並永久無法提供請求，即使網路恢復 [^24]，或者它甚至可能刪除你的所有資料 [^25]。如果軟體處於意料之外的情況，它可能會做任意意外的事情。

處理網路故障不一定意味著 *容忍* 它們：如果你的網路通常相當可靠，一個有效的方法可能是在網路出現問題時簡單地向使用者顯示錯誤訊息。但是，你確實需要知道你的軟體如何對網路問題做出反應，並確保系統可以從中恢復。故意觸發網路問題並測試系統的響應可能是有意義的（這被稱為 *故障注入*；見 ["故障注入"](#sec_fault_injection)）。

### 檢測故障 {#id307}

許多系統需要自動檢測故障節點。例如：

* 負載均衡器需要停止向已死亡的節點發送請求（即，將其 *從輪詢池中摘除*）。
* 在具有單主複製的分散式資料庫中，如果主節點失效，其中一個從節點需要被提升為新的主節點（見 ["處理節點中斷"](/tw/ch6#sec_replication_failover)）。

不幸的是，網路的不確定性使得很難判斷節點是否正常工作。在某些特定情況下，你可能會得到一些明確告訴你某事不工作的反饋：

* 如果你可以訪問節點應該執行的機器，但沒有程序監聽目標埠（例如，因為程序崩潰了），作業系統將透過傳送 `RST` 或 `FIN` 資料包來幫助關閉或拒絕 TCP 連線。
* 如果節點程序崩潰（或被管理員殺死）但節點的作業系統仍在執行，指令碼可以通知其他節點有關崩潰的資訊，以便另一個節點可以快速接管而無需等待超時到期。例如，HBase 就是這樣做的 [^26]。
* 如果你可以訪問資料中心中網路交換機的管理介面，你可以查詢它們以在硬體級別檢測鏈路故障（例如，如果遠端機器已關閉電源）。如果你透過網際網路連線，或者你在共享資料中心中無法訪問交換機本身，或者由於網路問題無法訪問管理介面，則此選項被排除。
* 如果路由器確定你嘗試連線的 IP 地址不可達，它可能會向你回覆 ICMP 目標不可達資料包。然而，路由器也沒有神奇的故障檢測能力 —— 它受到與網路其他參與者相同的限制。

關於遠端節點宕機的快速反饋很有用，但你不能指望它。如果出了問題，你可能會在堆疊的某個級別收到錯誤響應，但通常你必須假設你根本不會收到任何響應。你可以重試幾次，等待超時過去，如果在超時內沒有收到回覆，最終宣佈節點死亡。

### 超時和無界延遲 {#sec_distributed_queueing}

如果超時是檢測故障的唯一可靠方法，那麼超時應該多長？不幸的是，沒有簡單的答案。

長超時意味著在節點被宣佈死亡之前需要長時間等待（在此期間，使用者可能不得不等待或看到錯誤訊息）。短超時可以更快地檢測故障，但當節點實際上只是遭受暫時的減速（例如，由於節點或網路上的負載峰值）時，錯誤地宣佈節點死亡的風險更高。

過早地宣佈節點死亡是有問題的：如果節點實際上是活著的並且正在執行某些操作（例如，傳送電子郵件），而另一個節點接管，該操作可能最終被執行兩次。我們將在 ["知識、真相與謊言"](#sec_distributed_truth) 以及第 10 章和後續章節中更詳細地討論這個問題。

當節點被宣佈死亡時，其職責需要轉移到其他節點，這會給其他節點和網路帶來額外的負載。如果系統已經在高負載下掙扎，過早地宣佈節點死亡可能會使問題變得更糟。特別是，可能發生的情況是，節點實際上並沒有死亡，只是由於過載而響應緩慢；將其負載轉移到其他節點可能會導致級聯故障（在極端情況下，所有節點互相宣佈對方死亡，一切都停止工作 —— 見 ["當過載系統無法恢復時"](/tw/ch2#sidebar_metastable)）。

想象一個虛構的系統，其網路保證資料包的最大延遲 —— 每個資料包要麼在某個時間 *d* 內交付，要麼丟失，但交付從不會超過 *d*。此外，假設你可以保證未失效的節點總是在某個時間 *r* 內處理請求。在這種情況下，你可以保證每個成功的請求在時間 2*d* + *r* 內收到響應 —— 如果你在該時間內沒有收到響應，你就知道網路或遠端節點不工作。如果這是真的，2*d* + *r* 將是一個合理的超時時間。

不幸的是，我們使用的大多數系統都沒有這些保證：非同步網路具有 *無界延遲*（即，它們嘗試儘快交付資料包，但資料包到達所需的時間沒有上限），大多數伺服器實現無法保證它們可以在某個最大時間內處理請求（見 ["響應時間保證"](#sec_distributed_clocks_realtime)）。對於故障檢測，系統大部分時間快速執行是不夠的：如果你的超時很低，往返時間的瞬時峰值就足以使系統失去平衡。

<a id="sec_distributed_congestion"></a>

#### 網路擁塞和排隊 {#network-congestion-and-queueing}

開車時，道路網路上的行駛時間通常因交通擁堵而變化最大。同樣，計算機網路上資料包延遲的可變性最常是由於排隊 [^27]：

* 如果幾個不同的節點同時嘗試向同一目的地傳送資料包，網路交換機必須將它們排隊並逐個送入目標網路鏈路（如 [圖 9-2](#fig_distributed_switch_queueing) 所示）。在繁忙的網路鏈路上，資料包可能需要等待一段時間才能獲得一個插槽（這稱為 *網路擁塞*）。如果有太多的傳入資料以至於交換機佇列滿了，資料包將被丟棄，因此需要重新發送 —— 即使網路執行正常。
* 當資料包到達目標機器時，如果所有 CPU 核心當前都很忙，來自網路的傳入請求會被作業系統排隊，直到應用程式準備處理它。根據機器上的負載，這可能需要任意長的時間 [^28]。
* 在虛擬化環境中，正在執行的作業系統經常會暫停幾十毫秒，而另一個虛擬機器使用 CPU 核心。在此期間，VM 無法消耗來自網路的任何資料，因此傳入資料由虛擬機器監視器排隊（緩衝）[^29]，進一步增加了網路延遲的可變性。
* 如前所述，為了避免網路過載，TCP 限制傳送資料的速率。這意味著在資料甚至進入網路之前，傳送方就有額外的排隊。

{{< figure src="/fig/ddia_0902.png" id="fig_distributed_switch_queueing" caption="圖 9-2. 如果幾臺機器向同一目的地傳送網路流量，其交換機佇列可能會滿。這裡，埠 1、2 和 4 都試圖向埠 3 傳送資料包。" class="w-full my-4" >}}

此外，當 TCP 檢測到並自動重傳丟失的資料包時，儘管應用程式不會直接看到資料包丟失，但它確實會看到由此產生的延遲（等待超時到期，然後等待重傳的資料包被確認）。

--------

<a id="sidebar_distributed_tcp_udp"></a>

> [!TIP] TCP 與 UDP
>
> 一些對延遲敏感的應用程式，如視訊會議和 IP 語音（VoIP），使用 UDP 而不是 TCP。這是可靠性和延遲可變性之間的權衡：由於 UDP 不執行流量控制並且不重傳丟失的資料包，它避免了網路延遲可變的一些原因（儘管它仍然容易受到交換機佇列和排程延遲的影響）。
>
> UDP 是延遲資料無價值的情況下的好選擇。例如，在 VoIP 電話通話中，在資料應該透過揚聲器播放之前，可能沒有足夠的時間重傳丟失的資料包。在這種情況下，重傳資料包沒有意義 —— 應用程式必須用靜音填充缺失資料包的時間槽（導致聲音短暫中斷）並繼續流。重試發生在人類層面。（"你能重複一下嗎？聲音剛剛中斷了一會兒。"）

--------

所有這些因素都導致了網路延遲的可變性。當系統接近其最大容量時，排隊延遲的範圍特別大：具有充足備用容量的系統可以輕鬆排空佇列，而在高度利用的系統中，長佇列可以很快建立起來。

在公共雲和多租戶資料中心中，資源在許多客戶之間共享：網路鏈路和交換機，甚至每臺機器的網路介面和 CPU（在虛擬機器上執行時）都是共享的。處理大量資料可以使用網路鏈路的全部容量（*飽和* 它們）。由於你無法控制或瞭解其他客戶對共享資源的使用情況，如果你附近的某人（*吵鬧的鄰居*）正在使用大量資源，網路延遲可能會高度可變 [^30] [^31]。

在這種環境中，你只能透過實驗選擇超時：在較長時間內和許多機器上測量網路往返時間的分佈，以確定延遲的預期可變性。然後，考慮到你的應用程式的特徵，你可以在故障檢測延遲和過早超時風險之間確定適當的權衡。

更好的是，系統可以持續測量響應時間及其可變性（*抖動*），並根據觀察到的響應時間分佈自動調整超時，而不是使用配置的常量超時。Phi 累積故障檢測器 [^32]（例如在 Akka 和 Cassandra 中使用 [^33]）就是這樣做的一種方法。TCP 重傳超時也以類似的方式工作 [^5]。

### 同步與非同步網路 {#sec_distributed_sync_networks}

如果我們可以依靠網路以某個固定的最大延遲交付資料包，並且不丟棄資料包，分散式系統將會簡單得多。為什麼我們不能在硬體級別解決這個問題，使網路可靠，這樣軟體就不需要擔心它了？

要回答這個問題，比較資料中心網路與傳統的固定電話網路（非蜂窩、非 VoIP）很有趣，後者極其可靠：延遲的音訊幀和掉線非常罕見。電話通話需要持續的低端到端延遲和足夠的頻寬來傳輸你聲音的音訊樣本。在計算機網路中擁有類似的可靠性和可預測性不是很好嗎？

當你透過電話網路撥打電話時，它會建立一個 *電路*：在兩個呼叫者之間的整個路線上分配固定、有保證的頻寬量。該電路一直保持到通話結束 [^34]。例如，ISDN 網路以每秒 4,000 幀的固定速率執行。建立呼叫時，它在每幀內（在每個方向上）分配 16 位空間。因此，在通話期間，每一方都保證能夠每 250 微秒準確傳送 16 位音訊資料 [^35]。

這種網路是 *同步的*：即使資料通過幾個路由器，它也不會遭受排隊，因為呼叫的 16 位空間已經在網路的下一跳中預留了。由於沒有排隊，網路的最大端到端延遲是固定的。我們稱之為 *有界延遲*。

#### 我們不能簡單地使網路延遲可預測嗎？ {#can-we-not-simply-make-network-delays-predictable}

請注意，電話網路中的電路與 TCP 連線非常不同：電路是固定數量的預留頻寬，在電路建立期間其他人無法使用，而 TCP 連線的資料包則機會主義地使用任何可用的網路頻寬。你可以給 TCP 一個可變大小的資料塊（例如，電子郵件或網頁），它會嘗試在儘可能短的時間內傳輸它。當 TCP 連線空閒時，它不使用任何頻寬（除了偶爾的保活資料包）。

如果資料中心網路和網際網路是電路交換網路，那麼在建立電路時就可以建立有保證的最大往返時間。然而，它們不是：乙太網和 IP 是分組交換協議，會遭受排隊，因此在網路中有無界延遲。這些協議沒有電路的概念。

為什麼資料中心網路和網際網路使用分組交換？答案是它們針對 *突發流量* 進行了最佳化。電路適合音訊或視訊通話，需要在通話期間傳輸相當恆定的每秒位數。另一方面，請求網頁、傳送電子郵件或傳輸檔案沒有任何特定的頻寬要求 —— 我們只希望它儘快完成。

如果你想透過電路傳輸檔案，你必須猜測頻寬分配。如果你猜得太低，傳輸會不必要地慢，使網路容量未被使用。如果你猜得太高，電路無法建立（因為如果無法保證其頻寬分配，網路無法允許建立電路）。因此，使用電路進行突發資料傳輸會浪費網路容量並使傳輸不必要地緩慢。相比之下，TCP 動態調整資料傳輸速率以適應可用的網路容量。

曾經有一些嘗試構建既支援電路交換又支援分組交換的混合網路。*非同步傳輸模式*（ATM）在 1980 年代是乙太網的競爭對手，但除了電話網路核心交換機外，它沒有獲得太多採用。InfiniBand 有一些相似之處 [^36]：它在鏈路層實現端到端流量控制，減少了網路中排隊的需要，儘管它仍然可能因鏈路擁塞而遭受延遲 [^37]。透過仔細使用 *服務質量*（QoS，資料包的優先順序和排程）和 *准入控制*（對傳送者的速率限制），可以在分組網路上模擬電路交換，或提供統計上有界的延遲 [^27] [^34]。新的網路演算法，如低延遲、低損耗和可擴充套件吞吐量（L4S）試圖在客戶端和路由器級別緩解一些排隊和擁塞控制問題。Linux 的流量控制器（TC）也允許應用程式為 QoS 目的重新優先排序資料包。

--------

<a id="sidebar_distributed_latency_utilization"></a>

> [!TIP] 延遲和資源利用率
>
> 更一般地說，你可以將可變延遲視為動態資源分割槽的結果。
>
> 假設你在兩個電話交換機之間有一條可以承載多達 10,000 個同時呼叫的線路。透過此線路交換的每個電路都佔用其中一個呼叫插槽。因此，你可以將該線路視為最多可由 10,000 個同時使用者共享的資源。資源以 *靜態* 方式劃分：即使你現在是線路上唯一的呼叫，並且所有其他 9,999 個插槽都未使用，你的電路仍然分配與線路完全利用時相同的固定頻寬量。
>
> 相比之下，網際網路 *動態* 共享網路頻寬。傳送者互相推擠，儘可能快地透過線路傳送資料包，網路交換機決定在每個時刻傳送哪個資料包（即頻寬分配）。這種方法的缺點是排隊，但優點是它最大化了線路的利用率。線路有固定成本，所以如果你更好地利用它，你透過線路傳送的每個位元組都更便宜。
>
> CPU 也會出現類似的情況：如果你在幾個執行緒之間動態共享每個 CPU 核心，一個執行緒有時必須在作業系統的執行佇列中等待，而另一個執行緒正在執行，因此執行緒可能會暫停不同的時間長度 [^38]。然而，這比為每個執行緒分配靜態數量的 CPU 週期更好地利用硬體（見 ["響應時間保證"](#sec_distributed_clocks_realtime)）。更好的硬體利用率也是雲平臺在同一物理機器上執行來自不同客戶的多個虛擬機器的原因。
>
> 如果資源是靜態分割槽的（例如，專用硬體和獨佔頻寬分配），則在某些環境中可以實現延遲保證。然而，這是以降低利用率為代價的 —— 換句話說，它更昂貴。另一方面，具有動態資源分割槽的多租戶提供了更好的利用率，因此更便宜，但它有可變延遲的缺點。
>
> 網路中的可變延遲不是自然法則，而只是成本/收益權衡的結果。

--------

然而，這種服務質量目前在多租戶資料中心和公共雲中未啟用，或者在透過網際網路通訊時未啟用。當前部署的技術不允許我們對網路的延遲或可靠性做出任何保證：我們必須假設網路擁塞、排隊和無界延遲會發生。因此，超時沒有 "正確" 的值 —— 它們需要透過實驗確定。

網際網路服務提供商之間的對等協議和透過邊界閘道器協議（BGP）建立路由，比 IP 本身更接近電路交換。在這個級別，可以購買專用頻寬。然而，網際網路路由在網路級別而不是主機之間的單個連線上執行，並且時間尺度要長得多。


## 不可靠的時鐘 {#sec_distributed_clocks}

時鐘和時間很重要。應用程式以各種方式依賴時鐘來回答如下問題：

1. 這個請求超時了嗎？
2. 這項服務的第 99 百分位響應時間是多少？
3. 這項服務在過去五分鐘內平均每秒處理了多少查詢？
4. 使用者在我們的網站上花了多長時間？
5. 這篇文章是什麼時候發表的？
6. 提醒郵件應該在什麼日期和時間傳送？
7. 這個快取條目何時過期？
8. 日誌檔案中此錯誤訊息的時間戳是什麼？

示例 1-4 測量 *持續時間*（例如，傳送請求和接收響應之間的時間間隔），而示例 5-8 描述 *時間點*（在特定日期、特定時間發生的事件）。

在分散式系統中，時間是一件棘手的事情，因為通訊不是瞬時的：訊息從一臺機器透過網路傳輸到另一臺機器需要時間。接收訊息的時間總是晚於傳送訊息的時間，但由於網路中的可變延遲，我們不知道晚了多少。當涉及多臺機器時，這個事實有時會使確定事情發生的順序變得困難。

此外，網路上的每臺機器都有自己的時鐘，這是一個實際的硬體裝置：通常是石英晶體振盪器。這些裝置並不完全準確，因此每臺機器都有自己的時間概念，可能比其他機器稍快或稍慢。可以在某種程度上同步時鐘：最常用的機制是網路時間協議（NTP），它允許根據一組伺服器報告的時間調整計算機時鐘 [^39]。伺服器反過來從更準確的時間源（如 GPS 接收器）獲取時間。

### 單調時鐘與日曆時鐘 {#sec_distributed_monotonic_timeofday}

現代計算機至少有兩種不同型別的時鐘：*日曆時鐘* 和 *單調時鐘*。儘管它們都測量時間，但區分兩者很重要，因為它們服務於不同的目的。

#### 日曆時鐘 {#time-of-day-clocks}

日曆時鐘做你直觀期望時鐘做的事情：它根據某個日曆返回當前日期和時間（也稱為 *牆上時鐘時間*）。例如，Linux 上的 `clock_gettime(CLOCK_REALTIME)` 和 Java 中的 `System.currentTimeMillis()` 返回自 *紀元* 以來的秒數（或毫秒數）：根據格里高利曆，1970 年 1 月 1 日午夜 UTC，不計算閏秒。一些系統使用其他日期作為參考點。（儘管 Linux 時鐘被稱為 *即時*，但它與即時作業系統無關，如 ["響應時間保證"](#sec_distributed_clocks_realtime) 中所討論的。）

日曆時鐘通常與 NTP 同步，這意味著來自一臺機器的時間戳（理想情況下）與另一臺機器上的時間戳意思相同。然而，日曆時鐘也有各種奇怪之處，如下一節所述。特別是，如果本地時鐘遠遠超前於 NTP 伺服器，它可能會被強制重置並顯示跳回到以前的時間點。這些跳躍，以及閏秒引起的類似跳躍，使日曆時鐘不適合測量經過的時間 [^40]。

日曆時鐘可能會因夏令時（DST）的開始和結束而經歷跳躍；這些可以透過始終使用 UTC 作為時區來避免，UTC 沒有 DST。日曆時鐘在歷史上也具有相當粗粒度的解析度，例如，在較舊的 Windows 系統上以 10 毫秒的步長前進 [^41]。在最近的系統上，這不再是一個問題。

#### 單調時鐘 {#monotonic-clocks}

單調時鐘適用於測量持續時間（時間間隔），例如超時或服務的響應時間：例如，Linux 上的 `clock_gettime(CLOCK_MONOTONIC)` 或 `clock_gettime(CLOCK_BOOTTIME)` [^42] 和 Java 中的 `System.nanoTime()` 是單調時鐘。這個名字來源於它們保證始終向前移動的事實（而日曆時鐘可能會在時間上向後跳躍）。

你可以在某個時間點檢查單調時鐘的值，做一些事情，然後在稍後的時間再次檢查時鐘。兩個值之間的 *差值* 告訴你兩次檢查之間經過了多少時間 —— 更像秒錶而不是掛鐘。然而，時鐘的 *絕對* 值是沒有意義的：它可能是自計算機啟動以來的納秒數，或類似的任意值。特別是，比較來自兩臺不同計算機的單調時鐘值是沒有意義的，因為它們不代表同樣的東西。

在具有多個 CPU 插槽的伺服器上，每個 CPU 可能有一個單獨的計時器，它不一定與其他 CPU 同步 [^43]。作業系統會補償任何差異，並嘗試向應用程式執行緒呈現時鐘的單調檢視，即使它們被排程到不同的 CPU 上。然而，明智的做法是對這種單調性保證持保留態度 [^44]。

如果 NTP 檢測到計算機的本地石英晶體比 NTP 伺服器執行得更快或更慢，它可能會調整單調時鐘前進的頻率（這被稱為 *調整* 時鐘）。預設情況下，NTP 允許時鐘速率加速或減速高達 0.05%，但 NTP 不能導致單調時鐘向前或向後跳躍。單調時鐘的解析度通常相當好：在大多數系統上，它們可以測量微秒或更短的時間間隔。

在分散式系統中，使用單調時鐘測量經過的時間（例如，超時）通常是可以的，因為它不假設不同節點的時鐘之間有任何同步，並且對測量的輕微不準確不敏感。

### 時鐘同步和準確性 {#sec_distributed_clock_accuracy}

單調時鐘不需要同步，但日曆時鐘需要根據 NTP 伺服器或其他外部時間源設定才能有用。不幸的是，我們讓時鐘顯示正確時間的方法遠不如你希望的那樣可靠或準確 —— 硬體時鐘和 NTP 可能是反覆無常的野獸。僅舉幾個例子：

* 計算機中的石英時鐘不是很準確：它會 *漂移*（比應該的執行得更快或更慢）。時鐘漂移因機器的溫度而異。Google 假設其伺服器的時鐘漂移高達 200 ppm（百萬分之一）[^45]，這相當於每 30 秒與伺服器重新同步的時鐘有 6 毫秒漂移，或每天重新同步一次的時鐘有 17 秒漂移。即使一切正常工作，這種漂移也限制了你可以達到的最佳精度。
* 如果計算機的時鐘與 NTP 伺服器相差太多，它可能會拒絕同步，或者本地時鐘將被強制重置 [^39]。任何在重置前後觀察時間的應用程式都可能看到時間倒退或突然向前跳躍。
* 如果節點意外地被防火牆與 NTP 伺服器隔離，配置錯誤可能會在一段時間內未被注意到，在此期間漂移可能會累積成不同節點時鐘之間的巨大差異。軼事證據表明，這在實踐中確實會發生。
* NTP 同步只能與網路延遲一樣好，因此當你在具有可變資料包延遲的擁塞網路上時，其準確性有限。一項實驗表明，透過網際網路同步時可以達到 35 毫秒的最小誤差 [^46]，儘管網路延遲的偶爾峰值會導致大約一秒的誤差。根據配置，大的網路延遲可能導致 NTP 客戶端完全放棄。
* 一些 NTP 伺服器是錯誤的或配置錯誤的，報告的時間相差數小時 [^47] [^48]。NTP 客戶端透過查詢多個伺服器並忽略異常值來減輕此類錯誤。儘管如此，將系統的正確性押注在網際網路上陌生人告訴你的時間上還是有些令人擔憂的。
* 閏秒導致一分鐘有 59 秒或 61 秒長，這會搞亂在設計時沒有考慮閏秒的系統中的時序假設 [^49]。閏秒已經導致許多大型系統崩潰的事實 [^40] [^50] 表明，關於時鐘的錯誤假設是多麼容易潛入系統。處理閏秒的最佳方法可能是讓 NTP 伺服器 "撒謊"，透過在一天的過程中逐漸執行閏秒調整（這被稱為 *平滑*）[^51] [^52]，儘管實際的 NTP 伺服器行為在實踐中有所不同 [^53]。從 2035 年起將不再使用閏秒，所以這個問題幸運地將會消失。
* 在虛擬機器中，硬體時鐘是虛擬化的，這為需要準確計時的應用程式帶來了額外的挑戰 [^54]。當 CPU 核心在虛擬機器之間共享時，每個 VM 在另一個 VM 執行時會暫停數十毫秒。從應用程式的角度來看，這種暫停表現為時鐘突然向前跳躍 [^29]。如果 VM 暫停幾秒鐘，時鐘可能會比實際時間落後幾秒鐘，但 NTP 可能會繼續報告時鐘幾乎完全同步 [^55]。
* 如果你在不完全控制的裝置上執行軟體（例如，移動或嵌入式裝置），你可能根本無法信任裝置的硬體時鐘。一些使用者故意將他們的硬體時鐘設定為不正確的日期和時間，例如在遊戲中作弊 [^56]。因此，時鐘可能被設定為遙遠的過去或未來的時間。

如果你足夠關心時鐘精度並願意投入大量資源，就可以實現非常好的時鐘精度。例如，歐洲金融機構的 MiFID II 法規要求所有高頻交易基金將其時鐘同步到 UTC 的 100 微秒以內，以幫助除錯市場異常（如 "閃崩"）並幫助檢測市場操縱 [^57]。

這種精度可以透過一些特殊硬體（GPS 接收器和/或原子鐘）、精確時間協議（PTP）以及仔細的部署和監控來實現 [^58] [^59]。僅依賴 GPS 可能有風險，因為 GPS 訊號很容易被干擾。在某些地方，這種情況經常發生，例如靠近軍事設施 [^60]。一些雲提供商已經開始為其虛擬機器提供高精度時鐘同步 [^61]。然而，時鐘同步仍然需要很多注意。如果你的 NTP 守護程序配置錯誤，或者防火牆阻止了 NTP 流量，由於漂移導致的時鐘誤差可能會迅速變大。

### 對同步時鐘的依賴 {#sec_distributed_clocks_relying}

時鐘的問題在於，雖然它們看起來簡單易用，但它們有驚人數量的陷阱：一天可能沒有正好 86,400 秒，日曆時鐘可能會在時間上向後移動，根據一個節點的時鐘的時間可能與另一個節點的時鐘相差很大。

本章前面我們討論了網路丟棄和任意延遲資料包。即使網路大部分時間表現良好，軟體也必須設計成假設網路偶爾會出現故障，軟體必須優雅地處理此類故障。時鐘也是如此：儘管它們大部分時間工作得很好，但強健的軟體需要準備好處理不正確的時鐘。

問題的一部分是不正確的時鐘很容易被忽視。如果機器的 CPU 有缺陷或其網路配置錯誤，它很可能根本無法工作，因此會很快被注意到並修復。另一方面，如果它的石英時鐘有缺陷或其 NTP 客戶端配置錯誤，大多數事情看起來會正常工作，即使它的時鐘逐漸偏離現實越來越遠。如果某些軟體依賴於準確同步的時鐘，結果更可能是靜默和微妙的資料丟失，而不是戲劇性的崩潰 [^62] [^63]。

因此，如果你使用需要同步時鐘的軟體，你還必須仔細監控所有機器之間的時鐘偏移。任何時鐘偏離其他節點太遠的節點都應該被宣佈死亡並從叢集中移除。這種監控確保你在損壞的時鐘造成太多損害之前注意到它們。

#### 用於事件排序的時間戳 {#sec_distributed_lww}

讓我們考慮一個特定的情況，其中依賴時鐘是誘人但危險的：跨多個節點的事件排序 [^64]。例如，如果兩個客戶端寫入分散式資料庫，誰先到達？哪個寫入是更新的？

[圖 9-3](#fig_distributed_timestamps) 說明了在具有多主複製的資料庫中日曆時鐘的危險使用（該示例類似於 [圖 6-8](/tw/ch6#fig_replication_causality)）。客戶端 A 在節點 1 上寫入 *x* = 1；寫入被複制到節點 3；客戶端 B 在節點 3 上遞增 *x*（我們現在有 *x* = 2）；最後，兩個寫入都被複制到節點 2。

{{< figure src="/fig/ddia_0903.png" id="fig_distributed_timestamps" caption="圖 9-3. 客戶端 B 的寫入在因果關係上晚於客戶端 A 的寫入，但 B 的寫入具有更早的時間戳。" class="w-full my-4" >}}


在 [圖 9-3](#fig_distributed_timestamps) 中，當寫入被複制到其他節點時，它會根據寫入起源節點上的日曆時鐘標記時間戳。此示例中的時鐘同步非常好：節點 1 和節點 3 之間的偏差小於 3 毫秒，這可能比你在實踐中可以期望的要好。

由於遞增建立在 *x* = 1 的早期寫入之上，我們可能期望 *x* = 2 的寫入應該具有兩者中更大的時間戳。不幸的是，[圖 9-3](#fig_distributed_timestamps) 中發生的並非如此：寫入 *x* = 1 的時間戳為 42.004 秒，但寫入 *x* = 2 的時間戳為 42.003 秒。

如 ["最後寫入勝利（丟棄併發寫入）"](/tw/ch6#sec_replication_lww) 中所討論的，解決不同節點上併發寫入值之間衝突的一種方法是 *最後寫入勝利*（LWW），這意味著保留給定鍵的具有最大時間戳的寫入，並丟棄所有具有較舊時間戳的寫入。在 [圖 9-3](#fig_distributed_timestamps) 的示例中，當節點 2 接收這兩個事件時，它將錯誤地得出結論，認為 *x* = 1 是更新的值並丟棄寫入 *x* = 2，因此遞增丟失了。

可以透過確保當值被覆蓋時，新值總是具有比被覆蓋值更高的時間戳來防止這個問題，即使該時間戳超前於寫入者的本地時鐘。然而，這會產生額外的讀取成本來查詢最大的現有時間戳。一些系統，包括 Cassandra 和 ScyllaDB，希望在單次往返中寫入所有副本，因此它們只是使用客戶端時鐘的時間戳以及最後寫入勝利策略 [^62]。這種方法有一些嚴重的問題：

* 資料庫寫入可能會神秘地消失：具有滯後時鐘的節點無法覆蓋先前由具有快速時鐘的節點寫入的值，直到節點之間的時鐘偏差時間過去 [^63] [^65]。這種情況可能導致任意數量的資料被靜默丟棄，而不會向應用程式報告任何錯誤。
* LWW 無法區分快速連續發生的順序寫入（在 [圖 9-3](#fig_distributed_timestamps) 中，客戶端 B 的遞增肯定發生在客戶端 A 的寫入 *之後*）和真正併發的寫入（兩個寫入者都不知道對方）。需要額外的因果關係跟蹤機制，如版本向量，以防止違反因果關係（見 ["檢測併發寫入"](/tw/ch6#sec_replication_concurrent)）。
* 兩個節點可能獨立生成具有相同時間戳的寫入，特別是當時鐘只有毫秒解析度時。需要額外的決勝值（可以簡單地是一個大的隨機數）來解決此類衝突，但這種方法也可能導致違反因果關係 [^62]。

因此，即使透過保留最 "新" 的值並丟棄其他值來解決衝突很誘人，但重要的是要意識到 "新" 的定義取決於本地日曆時鐘，它很可能是不正確的。即使使用緊密 NTP 同步的時鐘，你也可能在時間戳 100 毫秒（根據傳送者的時鐘）傳送資料包，並讓它在時間戳 99 毫秒（根據接收者的時鐘）到達 —— 因此看起來資料包在傳送之前就到達了，這是不可能的。

NTP 同步能否足夠準確以至於不會發生此類錯誤排序？可能不行，因為除了石英漂移等其他誤差源之外，NTP 的同步精度本身受到網路往返時間的限制。要保證正確的排序，你需要時鐘誤差顯著低於網路延遲，這是不可能的。

所謂的 *邏輯時鐘* [^66]，基於遞增計數器而不是振盪石英晶體，是排序事件的更安全替代方案（見 ["檢測併發寫入"](/tw/ch6#sec_replication_concurrent)）。邏輯時鐘不測量一天中的時間或經過的秒數，只測量事件的相對順序（一個事件是在另一個事件之前還是之後發生）。相比之下，日曆時鐘和單調時鐘測量實際經過的時間，也稱為 *物理時鐘*。我們將在 ["ID 生成器和邏輯時鐘"](/tw/ch10#sec_consistency_logical) 中更詳細地研究邏輯時鐘。

#### 帶置信區間的時鐘讀數 {#clock-readings-with-a-confidence-interval}

你可能能夠以微秒甚至納秒解析度讀取機器的日曆時鐘。但即使你能獲得如此細粒度的測量，也不意味著該值實際上精確到如此精度。事實上，它很可能不是 —— 如前所述，即使你每分鐘與本地網路上的 NTP 伺服器同步，不精確的石英時鐘的漂移也很容易達到幾毫秒。使用公共網際網路上的 NTP 伺服器，最佳可能精度可能是幾十毫秒，當存在網路擁塞時，誤差很容易超過 100 毫秒。

因此，將時鐘讀數視為時間點是沒有意義的 —— 它更像是一個帶有置信區間的時間範圍：例如，系統可能有 95% 的信心認為現在的時間在當前這一分鐘的第 10.3 秒到第 10.5 秒之間，但它不知道比這更精確的時間 [^67]。如果我們只知道時間 +/- 100 毫秒，時間戳中的微秒數字基本上是沒有意義的。

不確定性邊界可以根據你的時間源計算。如果你有直接連線到計算機的 GPS 接收器或原子鐘，預期誤差範圍由裝置決定，對於 GPS，由來自衛星的訊號質量決定。如果你從伺服器獲取時間，不確定性基於自上次與伺服器同步以來的預期石英漂移，加上 NTP 伺服器的不確定性，加上到伺服器的網路往返時間（作為第一近似，並假設你信任伺服器）。

不幸的是，大多數系統不暴露這種不確定性：例如，當你呼叫 `clock_gettime()` 時，返回值不會告訴你時間戳的預期誤差，所以你不知道它的置信區間是五毫秒還是五年。

有例外：Google Spanner 中的 *TrueTime* API [^45] 和亞馬遜的 ClockBound 明確報告本地時鐘的置信區間。當你詢問當前時間時，你會得到兩個值：`[earliest, latest]`，它們是 *最早可能* 和 *最晚可能* 的時間戳。基於其不確定性計算，時鐘知道實際當前時間在該區間內的某處。區間的寬度取決於多種因素，包括本地石英時鐘上次與更準確的時鐘源同步以來已經過去了多長時間。

#### 用於全域性快照的同步時鐘 {#sec_distributed_spanner}

在 ["快照隔離和可重複讀"](/tw/ch8#sec_transactions_snapshot_isolation) 中，我們討論了 *多版本併發控制*（MVCC），這是資料庫中非常有用的功能，需要支援小型、快速的讀寫事務和大型、長時間執行的只讀事務（例如，用於備份或分析）。它允許只讀事務看到資料庫的 *快照*，即特定時間點的一致狀態，而不會鎖定和干擾讀寫事務。

通常，MVCC 需要單調遞增的事務 ID。如果寫入發生在快照之後（即，寫入的事務 ID 大於快照），則該寫入對快照事務不可見。在單節點資料庫上，簡單的計數器就足以生成事務 ID。

然而，當資料庫分佈在許多機器上，可能在多個數據中心時，全域性單調遞增的事務 ID（跨所有分片）很難生成，因為它需要協調。事務 ID 必須反映因果關係：如果事務 B 讀取或覆蓋先前由事務 A 寫入的值，則 B 必須具有比 A 更高的事務 ID —— 否則，快照將不一致。對於大量小型、快速的事務，在分散式系統中建立事務 ID 成為難以承受的瓶頸。（我們將在 ["ID 生成器和邏輯時鐘"](/tw/ch10#sec_consistency_logical) 中討論此類 ID 生成器。）

我們能否使用同步日曆時鐘的時間戳作為事務 ID？如果我們能夠獲得足夠好的同步，它們將具有正確的屬性：較晚的事務具有更高的時間戳。當然，問題是時鐘精度的不確定性。

Spanner 以這種方式跨資料中心實現快照隔離 [^68] [^69]。它使用 TrueTime API 報告的時鐘置信區間，並基於以下觀察：如果你有兩個置信區間，每個都由最早和最晚可能的時間戳組成（*A* = [*A最早*, *A最晚*] 和 *B* = [*B最早*, *B最晚*]），並且這兩個區間不重疊（即，*A最早* < *A最晚* < *B最早* < *B最晚*），那麼 B 肯定發生在 A 之後 —— 毫無疑問。只有當區間重疊時，我們才不確定 A 和 B 發生的順序。

為了確保事務時間戳反映因果關係，Spanner 在提交讀寫事務之前故意等待置信區間的長度。透過這樣做，它確保任何可能讀取資料的事務都在足夠晚的時間，因此它們的置信區間不會重疊。為了使等待時間儘可能短，Spanner 需要使時鐘不確定性儘可能小；為此，Google 在每個資料中心部署 GPS 接收器或原子鐘，使時鐘能夠同步到大約 7 毫秒以內 [^45]。

原子鐘和 GPS 接收器在 Spanner 中並不是嚴格必要的：重要的是要有一個置信區間，準確的時鐘源只是幫助保持該區間較小。其他系統開始採用類似的方法：例如，YugabyteDB 在 AWS 上執行時可以利用 ClockBound [^70]，其他幾個系統現在也在不同程度上依賴時鐘同步 [^71] [^72]。

### 程序暫停 {#sec_distributed_clocks_pauses}

讓我們考慮分散式系統中危險使用時鐘的另一個例子。假設你有一個每個分片都有單個主節點的資料庫。只有主節點被允許接受寫入。節點如何知道它仍然是主節點（它沒有被其他節點宣佈死亡），並且它可以安全地接受寫入？

一種選擇是讓主節點從其他節點獲取 *租約*，這類似於帶有超時的鎖 [^73]。任何時候只有一個節點可以持有租約 —— 因此，當節點獲得租約時，它知道在租約到期之前的一段時間內它是主節點。為了保持主節點身份，節點必須在租約到期之前定期續訂租約。如果節點失效，它會停止續訂租約，因此另一個節點可以在租約到期時接管。

你可以想象請求處理迴圈看起來像這樣：

```js
while (true) {
    request = getIncomingRequest();

    // 確保租約始終至少有 10 秒的剩餘時間
    if (lease.expiryTimeMillis - System.currentTimeMillis() < 10000) {
        lease = lease.renew();
    }

    if (lease.isValid()) {
        process(request);
    }
}
```

這段程式碼有什麼問題？首先，它依賴於同步時鐘：租約的到期時間由不同的機器設定（到期時間可能計算為當前時間加 30 秒，例如），並且它與本地系統時鐘進行比較。如果時鐘相差超過幾秒鐘，這段程式碼將開始做奇怪的事情。

其次，即使我們更改協議以僅使用本地單調時鐘，還有另一個問題：程式碼假設在檢查時間（`System.currentTimeMillis()`）和處理請求（`process(request)`）之間經過的時間非常少。通常這段程式碼執行得非常快，所以 10 秒的緩衝時間足以確保租約不會在處理請求的過程中到期。

然而，如果程式執行中出現意外暫停會怎樣？例如，想象執行緒在 `lease.isValid()` 行周圍停止了 15 秒，然後才最終繼續。在這種情況下，處理請求時租約很可能已經到期，另一個節點已經接管了主節點身份。然而，沒有任何東西告訴這個執行緒它暫停了這麼長時間，所以這段程式碼不會注意到租約已經到期，直到迴圈的下一次迭代 —— 到那時它可能已經透過處理請求做了一些不安全的事情。

假設執行緒可能暫停這麼長時間是合理的嗎？不幸的是，是的。有各種原因可能導致這種情況發生：

* 執行緒訪問共享資源（如鎖或佇列）時的爭用可能導致執行緒花費大量時間等待。轉移到具有更多 CPU 核心的機器可能會使此類問題變得更糟，並且爭用問題可能難以診斷 [^74]。
* 許多程式語言執行時（如 Java 虛擬機器）有 *垃圾回收器*（GC），偶爾需要停止所有正在執行的執行緒。過去，這種 *"全域性暫停" GC 暫停* 有時會持續幾分鐘 [^75]！使用現代 GC 演算法，這不再是一個大問題，但 GC 暫停仍然可能很明顯（見 ["限制垃圾回收的影響"](#sec_distributed_gc_impact)）。
* 在虛擬化環境中，虛擬機器可以被 *掛起*（暫停所有程序的執行並將記憶體內容儲存到磁碟）和 *恢復*（恢復記憶體內容並繼續執行）。這種暫停可能發生在程序執行的任何時間，並且可能持續任意長的時間。這個功能有時用於虛擬機器從一臺主機到另一臺主機的 *即時遷移*，無需重啟，在這種情況下，暫停的長度取決於程序寫入記憶體的速率 [^76]。
* 在筆記型電腦和手機等終端使用者裝置上，執行也可能被任意掛起和恢復，例如，當使用者合上筆記型電腦蓋時。
* 當作業系統上下文切換到另一個執行緒時，或者當虛擬機器管理程式切換到不同的虛擬機器時（在虛擬機器中執行時），當前執行的執行緒可能在程式碼的任何任意點暫停。在虛擬機器的情況下，在其他虛擬機器中花費的 CPU 時間稱為 *竊取時間*。如果機器負載很重 —— 即，如果有長佇列的執行緒等待執行 —— 暫停的執行緒可能需要一些時間才能再次執行。
* 如果應用程式執行同步磁碟訪問，執行緒可能會暫停等待緩慢的磁碟 I/O 操作完成 [^77]。在許多語言中，磁碟訪問可能會令人驚訝地發生，即使程式碼沒有明確提到檔案訪問 —— 例如，Java 類載入器在首次使用時會延遲載入類檔案，這可能發生在程式執行的任何時間。I/O 暫停和 GC 暫停甚至可能共謀結合它們的延遲 [^78]。如果磁碟實際上是網路檔案系統或網路塊裝置（如亞馬遜的 EBS），I/O 延遲還會受到網路延遲可變性的影響 [^31]。
* 如果作業系統配置為允許 *交換到磁碟*（*分頁*），簡單的記憶體訪問可能會導致頁面錯誤，需要從磁碟載入頁面到記憶體。執行緒在此緩慢的 I/O 操作進行時暫停。如果記憶體壓力很高，這可能反過來需要將不同的頁面交換到磁碟。在極端情況下，作業系統可能會花費大部分時間在記憶體中交換頁面進出，而實際完成的工作很少（這被稱為 *抖動*）。為了避免這個問題，伺服器機器上通常停用分頁（如果你寧願殺死程序以釋放記憶體而不是冒抖動的風險）。
* Unix 程序可以透過向其傳送 `SIGSTOP` 訊號來暫停，例如透過在 shell 中按 Ctrl-Z。此訊號立即停止程序獲取更多 CPU 週期，直到使用 `SIGCONT` 恢復它，此時它從停止的地方繼續執行。即使你的環境通常不使用 `SIGSTOP`，它也可能被運維工程師意外發送。

所有這些情況都可以在任何時候 *搶佔* 正在執行的執行緒，並在稍後的某個時間恢復它，而執行緒甚至沒有注意到。這個問題類似於在單臺機器上使多執行緒程式碼執行緒安全：你不能對時序做任何假設，因為可能會發生任意的上下文切換和並行性。

在單臺機器上編寫多執行緒程式碼時，我們有相當好的工具來使其執行緒安全：互斥鎖、訊號量、原子計數器、無鎖資料結構、阻塞佇列等。不幸的是，這些工具不能直接轉換到分散式系統，因為分散式系統沒有共享記憶體 —— 只有透過不可靠網路傳送的訊息。

分散式系統中的節點必須假設其執行可以在任何時候暫停相當長的時間，即使在函式的中間。在暫停期間，世界的其餘部分繼續執行，甚至可能因為暫停的節點沒有響應而宣佈它死亡。最終，暫停的節點可能會繼續執行，甚至沒有注意到它在睡覺，直到它稍後某個時候檢查其時鐘。

#### 響應時間保證 {#sec_distributed_clocks_realtime}

在許多程式語言和作業系統中，如所討論的，執行緒和程序可能會暫停無限長的時間。如果你足夠努力，這些暫停的原因 *可以* 被消除。

某些軟體在環境中執行，如果未能在指定時間內響應可能會造成嚴重損害：控制飛機、火箭、機器人、汽車和其他物理物件的計算機必須快速且可預測地響應其感測器輸入。在這些系統中，有一個指定的 *截止時間*，軟體必須在此之前響應；如果它沒有達到截止時間，可能會導致整個系統的故障。這些被稱為 *硬即時* 系統。

--------

> [!NOTE]
> 在嵌入式系統中，*即時* 意味著系統經過精心設計和測試，以在所有情況下滿足指定的時序保證。這個含義與網路上更模糊的 *即時* 術語使用形成對比，後者描述伺服器向客戶端推送資料和流處理，沒有硬響應時間約束（見後續章節）。

--------

例如，如果你的汽車的車載感測器檢測到你當前正在經歷碰撞，你不希望安全氣囊的釋放因為安全氣囊釋放系統中不合時宜的 GC 暫停而延遲。

在系統中提供即時保證需要軟體棧所有級別的支援：需要 *即時作業系統*（RTOS），它允許程序在指定的時間間隔內以有保證的 CPU 時間分配進行排程；庫函式必須記錄其最壞情況執行時間；動態記憶體分配可能受到限制或完全禁止（即時垃圾回收器存在，但應用程式仍必須確保它不會給 GC 太多工作）；必須進行大量的測試和測量以確保滿足保證。

所有這些都需要大量的額外工作，並嚴重限制了可以使用的程式語言、庫和工具的範圍（因為大多數語言和工具不提供即時保證）。由於這些原因，開發即時系統非常昂貴，它們最常用於安全關鍵的嵌入式裝置。此外，"即時" 不同於 "高效能" —— 事實上，即時系統可能具有較低的吞吐量，因為它們必須優先考慮及時響應高於一切（另見 ["延遲和資源利用率"](#sidebar_distributed_latency_utilization)）。

對於大多數伺服器端資料處理系統，即時保證根本不經濟或不合適。因此，這些系統必須承受在非即時環境中執行帶來的暫停和時鐘不穩定性。

#### 限制垃圾回收的影響 {#sec_distributed_gc_impact}

垃圾回收曾經是程序暫停的最大原因之一 [^79]，但幸運的是 GC 演算法已經改進了很多：經過適當調整的回收器現在通常只會暫停幾毫秒。Java 執行時提供了併發標記清除（CMS）、G1、Z 垃圾回收器（ZGC）、Epsilon 和 Shenandoah 等回收器。每個都針對不同的記憶體配置檔案進行了最佳化，如高頻物件建立、大堆等。相比之下，Go 提供了一個更簡單的併發標記清除垃圾回收器，試圖自我最佳化。

如果你需要完全避免 GC 暫停，一個選擇是使用根本沒有垃圾回收器的語言。例如，Swift 使用自動引用計數來確定何時可以釋放記憶體；Rust 和 Mojo 使用型別系統跟蹤物件的生命週期，以便編譯器可以確定必須分配記憶體多長時間。

也可以使用垃圾回收語言，同時減輕暫停的影響。一種方法是將 GC 暫停視為節點的短暫計劃中斷，並讓其他節點在一個節點收集垃圾時處理來自客戶端的請求。如果執行時可以警告應用程式節點很快需要 GC 暫停，應用程式可以停止向該節點發送新請求，等待它完成處理未完成的請求，然後在沒有請求進行時執行 GC。這個技巧從客戶端隱藏了 GC 暫停，並減少了響應時間的高百分位數 [^80] [^81]。

這個想法的一個變體是僅對短期物件使用垃圾回收器（快速收集），並定期重啟程序，在它們積累足夠的長期物件需要長期物件的完整 GC 之前 [^79] [^82]。可以一次重啟一個節點，並且可以在計劃重啟之前將流量從節點轉移，就像滾動升級一樣（見 [第 5 章](/tw/ch5#ch_encoding)）。

這些措施不能完全防止垃圾回收暫停，但它們可以有效地減少對應用程式的影響。


## 知識、真相與謊言 {#sec_distributed_truth}

到目前為止，在本章中，我們已經探討了分散式系統與在單臺計算機上執行的程式的不同之處：沒有共享記憶體，只有透過不可靠的網路進行訊息傳遞，具有可變延遲，系統可能會遭受部分失效、不可靠的時鐘和處理暫停。

如果你不習慣分散式系統，這些問題的後果會令人深感迷惑。網路中的節點不能 *確切地知道* 關於其他節點的任何事情 —— 它只能根據它接收（或未接收）的訊息進行猜測。節點只能透過與另一個節點交換訊息來了解它處於什麼狀態（它儲存了什麼資料，它是否正常執行等）。如果遠端節點沒有響應，就無法知道它處於什麼狀態，因為網路中的問題無法與節點的問題可靠地區分開來。

這些系統的討論接近哲學：在我們的系統中，我們知道什麼是真或假？如果感知和測量的機制不可靠，我們對這些知識有多確定 [^83]？軟體系統是否應該遵守我們對物理世界的期望法則，如因果關係？

幸運的是，我們不需要走到弄清生命意義的程度。在分散式系統中，我們可以陳述我們對行為（*系統模型*）的假設，並以這樣的方式設計實際系統，使其滿足這些假設。演算法可以被證明在某個系統模型內正確執行。這意味著即使底層系統模型提供的保證很少，也可以實現可靠的行為。

然而，儘管可以在不可靠的系統模型中使軟體表現良好，但這樣做並不簡單。在本章的其餘部分，我們將進一步探討分散式系統中知識和真相的概念，這將幫助我們思考我們可以做出的假設型別和我們可能希望提供的保證。在 [第 10 章](/tw/ch10#ch_consistency) 中，我們將繼續檢視在特定假設下提供特定保證的分散式演算法的一些示例。

### 多數派原則 {#sec_distributed_majority}

想象一個具有不對稱故障的網路：一個節點能夠接收發送給它的所有訊息，但該節點的任何傳出訊息都被丟棄或延遲 [^22]。即使該節點執行得非常好，並且正在接收來自其他節點的請求，其他節點也無法聽到它的響應。在一些超時之後，其他節點宣佈它死亡，因為它們沒有收到該節點的訊息。情況展開就像一場噩夢：半斷開的節點被拖到墓地，踢腿尖叫著 "我沒死！" —— 但由於沒人能聽到它的尖叫，葬禮隊伍以堅忍的決心繼續前進。

在稍微不那麼可怕的情況下，半斷開的節點可能會注意到它傳送的訊息沒有被其他節點確認，因此意識到網路中一定有故障。儘管如此，該節點被其他節點錯誤地宣佈死亡，半斷開的節點對此無能為力。

作為第三種情況，想象一個節點暫停執行一分鐘。在此期間，沒有請求被處理，也沒有響應被傳送。其他節點等待、重試、變得不耐煩，最終宣佈該節點死亡並將其裝上靈車。最後，暫停結束，節點的執行緒繼續執行，就好像什麼都沒發生過。其他節點驚訝地看到據稱已死的節點突然從棺材裡抬起頭來，健康狀況良好，開始愉快地與旁觀者聊天。起初，暫停的節點甚至沒有意識到整整一分鐘已經過去，它被宣佈死亡 —— 從它的角度來看，自從它上次與其他節點交談以來，幾乎沒有時間過去。

這些故事的寓意是，節點不一定能信任自己對情況的判斷。分散式系統不能完全依賴單個節點，因為節點可能隨時失效，可能使系統陷入困境並無法恢復。相反，許多分散式演算法依賴於 *仲裁*，即節點之間的投票（見 ["讀寫仲裁"](/tw/ch6#sec_replication_quorum_condition)）：決策需要來自幾個節點的最少票數，以減少對任何一個特定節點的依賴。

這包括關於宣佈節點死亡的決定。如果節點的仲裁宣佈另一個節點死亡，那麼它必須被認為是死亡的，即使該節點仍然感覺自己非常活著。個別節點必須遵守仲裁決定並退出。

最常見的是，仲裁是超過半數節點的絕對多數（儘管其他型別的仲裁也是可能的）。多數仲裁允許系統在少數節點故障時繼續工作（三個節點可以容忍一個故障節點；五個節點可以容忍兩個故障節點）。然而，它仍然是安全的，因為系統中只能有一個多數 —— 不能同時有兩個具有衝突決策的多數。當我們在 [第 10 章](/tw/ch10#ch_consistency) 討論 *共識演算法* 時，我們將更詳細地討論仲裁的使用。

### 分散式鎖和租約 {#sec_distributed_lock_fencing}

分散式應用程式中的鎖和租約容易被誤用，並且是錯誤的常見來源 [^84]。讓我們看看它們如何出錯的一個特定案例。

在 ["程序暫停"](#sec_distributed_clocks_pauses) 中，我們看到租約是一種超時的鎖，如果舊所有者停止響應（可能是因為它崩潰了、暫停太久或與網路斷開連線），可以分配給新所有者。你可以在系統需要只有一個某種東西的情況下使用租約。例如：

* 只允許一個節點成為資料庫分片的主節點，以避免腦裂（見 ["處理節點中斷"](/tw/ch6#sec_replication_failover)）。
* 只允許一個事務或客戶端更新特定資源或物件，以防止併發寫入損壞它。
* 只有一個節點應該處理大型處理作業的給定輸入檔案，以避免由於多個節點冗餘地執行相同工作而浪費精力。

值得仔細思考如果幾個節點同時認為它們持有租約會發生什麼，可能是由於程序暫停。在第三個例子中，後果只是一些浪費的計算資源，這不是什麼大問題。但在前兩種情況下，後果可能是資料丟失或損壞，這要嚴重得多。

例如，[圖 9-4](#fig_distributed_lease_pause) 顯示了由於鎖的錯誤實現導致的資料損壞錯誤。（該錯誤不是理論上的：HBase 曾經有這個問題 [^85] [^86]。）假設你想確保儲存服務中的檔案一次只能由一個客戶端訪問，因為如果多個客戶端試圖寫入它，檔案將被損壞。你嘗試透過要求客戶端在訪問檔案之前從鎖服務獲取租約來實現這一點。這種鎖服務通常使用共識演算法實現；我們將在 [第 10 章](/tw/ch10#ch_consistency) 中進一步討論這一點。

{{< figure src="/fig/ddia_0904.png" id="fig_distributed_lease_pause" caption="圖 9-4. 分散式鎖的錯誤實現：客戶端 1 認為它仍然有有效的租約，即使它已經過期，因此損壞了儲存中的檔案。" class="w-full my-4" >}}


問題是我們在 ["程序暫停"](#sec_distributed_clocks_pauses) 中討論的一個例子：如果持有租約的客戶端暫停太久，其租約就會過期。另一個客戶端可以獲得同一檔案的租約，並開始寫入檔案。當暫停的客戶端回來時，它（錯誤地）認為它仍然有有效的租約，並繼續寫入檔案。我們現在有了腦裂情況：客戶端的寫入衝突並損壞了檔案。

[圖 9-5](#fig_distributed_lease_delay) 顯示了具有類似後果的另一個問題。在這個例子中沒有程序暫停，只有客戶端 1 的崩潰。就在客戶端 1 崩潰之前，它向儲存服務傳送了一個寫請求，但這個請求在網路中被延遲了很長時間。（請記住 ["實踐中的網路故障"](#sec_distributed_network_faults)，資料包有時可能會延遲一分鐘或更長時間。）當寫請求到達儲存服務時，租約已經超時，允許客戶端 2 獲取它併發出自己的寫入。結果是類似於 [圖 9-4](#fig_distributed_lease_pause) 的損壞。

{{< figure src="/fig/ddia_0905.png" id="fig_distributed_lease_delay" caption="圖 9-5. 來自前租約持有者的訊息可能會延遲很長時間，並在另一個節點接管租約後到達。" class="w-full my-4" >}}


#### 隔離殭屍程序和延遲請求 {#sec_distributed_fencing_tokens}

術語 *殭屍* 有時用於描述尚未發現失去租約的前租約持有者，並且仍在充當當前租約持有者。由於我們不能完全排除殭屍，我們必須確保它們不能以腦裂的形式造成任何損害。這被稱為 *隔離* 殭屍。

一些系統試圖透過關閉殭屍來隔離它們，例如透過斷開它們與網路的連線 [^9]、透過雲提供商的管理介面關閉 VM，甚至物理關閉機器 [^87]。這種方法被稱為 *對端節點爆頭*（STONITH）。不幸的是，它存在一些問題：它不能防範像 [圖 9-5](#fig_distributed_lease_delay) 中那樣的大網路延遲；可能會發生所有節點相互關閉的情況 [^19]；到檢測到殭屍並關閉它時，可能已經太晚了，資料可能已經被損壞。

一個更強大的隔離解決方案，可以防範殭屍和延遲請求，如 [圖 9-6](#fig_distributed_fencing) 所示。

{{< figure src="/fig/ddia_0906.png" id="fig_distributed_fencing" caption="圖 9-6. 透過只允許按遞增隔離令牌順序寫入來使儲存訪問安全。" class="w-full my-4" >}}


假設每次鎖服務授予鎖或租約時，它還返回一個 *隔離令牌*，這是一個每次授予鎖時都會增加的數字（例如，由鎖服務遞增）。然後我們可以要求客戶端每次向儲存服務傳送寫請求時，都必須包含其當前的隔離令牌。

--------

> [!NOTE]
> 隔離令牌有幾個替代名稱。在 Google 的鎖服務 Chubby 中，它們被稱為 *序列器* [^88]，在 Kafka 中它們被稱為 *紀元編號*。在共識演算法中，我們將在 [第 10 章](/tw/ch10#ch_consistency) 中討論，*投票編號*（Paxos）或 *任期編號*（Raft）起著類似的作用。

--------

在 [圖 9-6](#fig_distributed_fencing) 中，客戶端 1 獲得帶有令牌 33 的租約，但隨後進入長時間暫停，租約過期。客戶端 2 獲得帶有令牌 34 的租約（數字總是增加），然後將其寫請求傳送到儲存服務，包括令牌 34。稍後，客戶端 1 恢復執行並將其寫入傳送到儲存服務，包括其令牌值 33。然而，儲存服務記得它已經處理了具有更高令牌編號（34）的寫入，因此它拒絕帶有令牌 33 的請求。剛剛獲得租約的客戶端必須立即向儲存服務進行寫入，一旦該寫入完成，任何殭屍都被隔離了。

如果 ZooKeeper 是你的鎖服務，你可以使用事務 ID `zxid` 或節點版本 `cversion` 作為隔離令牌 [^85]。使用 etcd，修訂號與租約 ID 一起起著類似的作用 [^89]。Hazelcast 中的 FencedLock API 明確生成隔離令牌 [^90]。

這種機制要求儲存服務有某種方法來檢查寫入是否基於過時的令牌。或者，服務支援僅在物件自當前客戶端上次讀取以來未被另一個客戶端寫入時才成功的寫入就足夠了，類似於原子比較並設定（CAS）操作。例如，物件儲存服務支援這種檢查：Amazon S3 稱之為 *條件寫入*，Azure Blob Storage 稱之為 *條件標頭*，Google Cloud Storage 稱之為 *請求前提條件*。

#### 多副本隔離 {#fencing-with-multiple-replicas}

如果你的客戶端只需要寫入一個支援此類條件寫入的儲存服務，鎖服務在某種程度上是多餘的 [^91] [^92]，因為租約分配本可以直接基於該儲存服務實現 [^93]。然而，一旦你有了隔離令牌，你也可以將其用於多個服務或副本，並確保舊的租約持有者在所有這些服務上都被隔離。

例如，想象儲存服務是一個具有最後寫入勝利衝突解決的無主複製鍵值儲存（見 ["無主複製"](/tw/ch6#sec_replication_leaderless)）。在這樣的系統中，客戶端直接向每個副本傳送寫入，每個副本根據客戶端分配的時間戳獨立決定是否接受寫入。

如 [圖 9-7](#fig_distributed_fencing_leaderless) 所示，你可以將寫入者的隔離令牌放在時間戳的最高有效位或數字中。然後你可以確保新租約持有者生成的任何時間戳都將大於舊租約持有者的任何時間戳，即使舊租約持有者的寫入發生得更晚。

{{< figure src="/fig/ddia_0907.png" id="fig_distributed_fencing_leaderless" caption="圖 9-7. 使用隔離令牌保護對無主複製資料庫的寫入。" class="w-full my-4" >}}


在 [圖 9-7](#fig_distributed_fencing_leaderless) 中，客戶端 2 有隔離令牌 34，因此它所有以 34… 開頭的時間戳都大於客戶端 1 生成的任何以 33… 開頭的時間戳。客戶端 2 寫入副本的仲裁，但它無法到達副本 3。這意味著當殭屍客戶端 1 稍後嘗試寫入時，它的寫入可能在副本 3 上成功，即使它被副本 1 和 2 忽略。這不是問題，因為後續的仲裁讀取將更喜歡具有更大時間戳的客戶端 2 的寫入，讀修復或反熵最終將覆蓋客戶端 1 寫入的值。

從這些例子可以看出，假設任何時候只有一個節點持有租約是不安全的。幸運的是，透過一點小心，你可以使用隔離令牌來防止殭屍和延遲請求造成任何損害。

### 拜占庭故障 {#sec_distributed_byzantine}

隔離令牌可以檢測並阻止 *無意中* 出錯的節點（例如，因為它尚未發現其租約已過期）。然而，如果節點故意想要破壞系統的保證，它可以透過傳送帶有虛假隔離令牌的訊息輕鬆做到。

在本書中，我們假設節點是不可靠但誠實的：它們可能很慢或從不響應（由於故障），它們的狀態可能已過時（由於 GC 暫停或網路延遲），但我們假設如果節點 *確實* 響應，它就是在說 "真話"：據它所知，它正在按協議規則行事。

如果節點可能 "撒謊"（傳送任意錯誤或損壞的響應）的風險存在，分散式系統問題會變得更加困難 —— 例如，它可能在同一次選舉中投出多個相互矛盾的票。這種行為被稱為 *拜占庭故障*，在這種不信任環境中達成共識的問題被稱為 *拜占庭將軍問題* [^94]。

--------

> [!TIP] 拜占庭將軍問題
>
> 拜占庭將軍問題是所謂 *兩將軍問題* [^95] 的推廣，它想象了兩個軍隊將軍需要就戰鬥計劃達成一致的情況。由於他們在兩個不同的地點扎營，他們只能透過信使進行通訊，信使有時會延遲或丟失（就像網路中的資料包）。我們將在 [第 10 章](/tw/ch10#ch_consistency) 中討論這個 *共識* 問題。
>
> 在問題的拜占庭版本中，有 *n* 個需要達成一致的將軍，他們的努力受到他們中間有一些叛徒的阻礙。大多數將軍是忠誠的，因此傳送真實的訊息，但叛徒可能試圖透過傳送虛假或不真實的訊息來欺騙和混淆其他人。事先不知道誰是叛徒。
>
> 拜占庭是一個古希臘城市，後來成為君士坦丁堡，位於現在土耳其的伊斯坦堡。沒有任何歷史證據表明拜占庭的將軍比其他地方的將軍更容易搞陰謀和密謀。相反，這個名字源自 *拜占庭* 一詞在 *過於複雜、官僚、狡猾* 的意義上的使用，這個詞在計算機出現之前很久就在政治中使用了 [^96]。Lamport 想選擇一個不會冒犯任何讀者的國籍，而有人勸告他，把這個問題稱為 *阿爾巴尼亞將軍問題* 不是個好主意 [^97]。

--------

如果即使某些節點發生故障並且不遵守協議，或者惡意攻擊者干擾網路，系統仍能繼續正確執行，則該系統是 *拜占庭容錯* 的。這種擔憂在某些特定情況下是相關的。例如：

* 在航空航天環境中，計算機記憶體或 CPU 暫存器中的資料可能因輻射而損壞，導致它以任意不可預測的方式響應其他節點。由於系統故障的成本非常高昂（例如，飛機墜毀並殺死機上所有人，或火箭與國際空間站相撞），飛行控制系統必須容忍拜占庭故障 [^98] [^99]。
* 在有多個參與方的系統中，一些參與者可能試圖欺騙或欺詐其他人。在這種情況下，節點簡單地信任另一個節點的訊息是不安全的，因為它們可能是惡意傳送的。例如，比特幣等加密貨幣和其他區塊鏈可以被認為是讓相互不信任的各方就交易是否發生達成一致的一種方式，而無需依賴中央權威 [^100]。

然而，在我們在本書中討論的系統型別中，我們通常可以安全地假設沒有拜占庭故障。在資料中心中，所有節點都由你的組織控制（因此它們有望被信任），輻射水平足夠低，記憶體損壞不是主要問題（儘管正在考慮軌道資料中心 [^101]）。多租戶系統有相互不信任的租戶，但它們使用防火牆、虛擬化和訪問控制策略相互隔離，而不是使用拜占庭容錯。使系統拜占庭容錯的協議相當昂貴 [^102]，容錯嵌入式系統依賴於硬體級別的支援 [^98]。在大多數伺服器端資料系統中，部署拜占庭容錯解決方案的成本使它們不切實際。

Web 應用程式確實需要預期客戶端在終端使用者控制下的任意和惡意行為，例如 Web 瀏覽器。這就是輸入驗證、清理和輸出轉義如此重要的原因：例如，防止 SQL 注入和跨站指令碼攻擊。然而，我們通常不在這裡使用拜占庭容錯協議，而只是讓伺服器成為決定什麼客戶端行為被允許和不被允許的權威。在沒有這種中央權威的點對點網路中，拜占庭容錯更相關 [^103] [^104]。

軟體中的錯誤可以被視為拜占庭故障，但如果你將相同的軟體部署到所有節點，那麼拜占庭容錯演算法無法拯救你。大多數拜占庭容錯演算法需要超過三分之二的節點的絕對多數才能正常執行（例如，如果你有四個節點，最多一個可能發生故障）。要使用這種方法對付錯誤，你必須有四個相同軟體的獨立實現，並希望錯誤只出現在四個實現中的一個。

同樣，如果協議可以保護我們免受漏洞、安全妥協和惡意攻擊，那將是很有吸引力的。不幸的是，這也不現實：在大多數系統中，如果攻擊者可以破壞一個節點，他們可能可以破壞所有節點，因為它們可能執行相同的軟體。因此，傳統機制（身份驗證、訪問控制、加密、防火牆等）仍然是防範攻擊者的主要保護。

<a id="sec_distributed_weak_lying"></a>

#### 弱形式的謊言 {#weak-forms-of-lying}

儘管我們假設節點通常是誠實的，但向軟體新增防範弱形式 "謊言" 的機制可能是值得的 —— 例如，由於硬體問題、軟體錯誤和配置錯誤導致的無效訊息。這種保護機制不是完全的拜占庭容錯，因為它們無法抵禦堅定的對手，但它們仍然是朝著更好可靠性邁出的簡單而務實的步驟。例如：

* 由於硬體問題或作業系統、驅動程式、路由器等中的錯誤，網路資料包有時確實會損壞。通常，損壞的資料包會被內置於 TCP 和 UDP 中的校驗和捕獲，但有時它們會逃避檢測 [^105] [^106] [^107]。簡單的措施通常足以防範此類損壞，例如應用程式級協議中的校驗和。TLS 加密連線也提供防損壞保護。
* 公開可訪問的應用程式必須仔細清理來自使用者的任何輸入，例如檢查值是否在合理範圍內，並限制字串的大小以防止透過大記憶體分配進行拒絕服務。防火牆後面的內部服務可能能夠在輸入上進行較少嚴格的檢查，但協議解析器中的基本檢查仍然是個好主意 [^105]。
* NTP 客戶端可以配置多個伺服器地址。同步時，客戶端聯絡所有伺服器，估計它們的錯誤，並檢查大多數伺服器是否在某個時間範圍內達成一致。只要大多數伺服器都正常，報告不正確時間的配置錯誤的 NTP 伺服器就會被檢測為異常值並從同步中排除 [^39]。使用多個伺服器使 NTP 比僅使用單個伺服器更強大。

### 系統模型與現實 {#sec_distributed_system_model}

許多演算法被設計來解決分散式系統問題 —— 例如，我們將在 [第 10 章](/tw/ch10#ch_consistency) 中研究共識問題的解決方案。為了有用，這些演算法需要容忍我們在本章中討論的分散式系統的各種故障。

演算法需要以不過度依賴於它們執行的硬體和軟體配置細節的方式編寫。這反過來又要求我們以某種方式形式化我們期望在系統中發生的故障型別。我們透過定義 *系統模型* 來做到這一點，這是一個描述演算法可能假設什麼事情的抽象。

關於時序假設，三種系統模型常用：

同步模型
: 同步模型假設有界的網路延遲、有界的程序暫停和有界的時鐘誤差。這並不意味著精確同步的時鐘或零網路延遲；它只是意味著你知道網路延遲、暫停和時鐘漂移永遠不會超過某個固定的上限 [^108]。同步模型不是大多數實際系統的現實模型，因為（如本章所討論的）無界延遲和暫停確實會發生。

部分同步模型
: 部分同步意味著系統 *大部分時間* 表現得像同步系統，但有時會超過網路延遲、程序暫停和時鐘漂移的界限 [^108]。這是許多系統的現實模型：大部分時間，網路和程序表現相當良好 —— 否則我們永遠無法完成任何事情 —— 但我們必須考慮到任何時序假設偶爾可能會被打破的事實。發生這種情況時，網路延遲、暫停和時鐘誤差可能會變得任意大。

非同步模型
: 在這個模型中，演算法不允許做出任何時序假設 —— 事實上，它甚至沒有時鐘（因此它不能使用超時）。一些演算法可以為非同步模型設計，但它非常有限。

此外，除了時序問題，我們還必須考慮節點故障。節點的一些常見系統模型是：

崩潰停止故障
: 在 *崩潰停止*（或 *故障停止*）模型中，演算法可以假設節點只能以一種方式失效，即崩潰 [^109]。這意味著節點可能在任何時刻突然停止響應，此後該節點永遠消失 —— 它永遠不會回來。

崩潰恢復故障
: 我們假設節點可能在任何時刻崩潰，並且可能在某個未知時間後再次開始響應。在崩潰恢復模型中，假設節點具有跨崩潰保留的穩定儲存（即非易失性磁碟儲存），而記憶體中的狀態假設丟失。

效能下降和部分功能
: 除了崩潰和重啟之外，節點可能變慢：它們可能仍然能夠響應健康檢查請求，但速度太慢而無法完成任何實際工作。例如，千兆網路介面可能由於驅動程式錯誤突然降至 1 Kb/s 吞吐量 [^110]；處於記憶體壓力下的程序可能會花費大部分時間執行垃圾回收 [^111]；磨損的 SSD 可能具有不穩定的效能；硬體可能受到高溫、鬆動的聯結器、機械振動、電源問題、韌體錯誤等的影響 [^112]。這種情況被稱為 *跛行節點*、*灰色故障* 或 *慢速故障* [^113]，它可能比乾淨失效的節點更難處理。一個相關的問題是當程序停止執行它應該做的某些事情，而其他方面繼續工作時，例如因為後臺執行緒崩潰或死鎖 [^114]。

拜占庭（任意）故障
: 節點可能做任何事情，包括試圖欺騙和矇騙其他節點，如上一節所述。

對於建模真實系統，具有崩潰恢復故障的部分同步模型通常是最有用的模型。它允許無界的網路延遲、程序暫停和慢節點。但是分散式演算法如何應對該模型？

#### 定義演算法的正確性 {#defining-the-correctness-of-an-algorithm}

為了定義演算法 *正確* 的含義，我們可以描述它的 *屬性*。例如，排序演算法的輸出具有這樣的屬性：對於輸出列表的任何兩個不同元素，左邊的元素小於右邊的元素。這只是定義列表排序含義的正式方式。

同樣，我們可以寫下我們希望分散式演算法具有的屬性，以定義正確的含義。例如，如果我們為鎖生成隔離令牌（見 ["隔離殭屍程序和延遲請求"](#sec_distributed_fencing_tokens)），我們可能要求演算法具有以下屬性：

唯一性
: 沒有兩個隔離令牌請求返回相同的值。

單調序列
: 如果請求 *x* 返回令牌 *t*<sub>*x*</sub>，請求 *y* 返回令牌 *t*<sub>*y*</sub>，並且 *x* 在 *y* 開始之前完成，則 *t*<sub>*x*</sub> < *t*<sub>*y*</sub>。

可用性
: 請求隔離令牌且不崩潰的節點最終會收到響應。

如果演算法在我們假設該系統模型中可能發生的所有情況下始終滿足其屬性，則該演算法在某個系統模型中是正確的。然而，如果所有節點崩潰，或者所有網路延遲突然變得無限長，那麼沒有演算法能夠完成任何事情。即使在允許完全失效的系統模型中，我們如何仍然做出有用的保證？

#### 安全性與活性 {#sec_distributed_safety_liveness}

為了澄清情況，值得區分兩種不同型別的屬性：*安全性* 和 *活性* 屬性。在剛才給出的例子中，*唯一性* 和 *單調序列* 是安全屬性，但 *可用性* 是活性屬性。

什麼區分這兩種屬性？一個跡象是活性屬性通常在其定義中包含 "最終" 一詞。（是的，你猜對了 —— *最終一致性* 是一個活性屬性 [^115]。）

安全性通常被非正式地定義為 *沒有壞事發生*，活性被定義為 *好事最終會發生*。然而，最好不要過多地解讀這些非正式定義，因為 "好" 和 "壞" 是價值判斷，不能很好地應用於演算法。安全性和活性的實際定義更精確 [^116]：

* 如果違反了安全屬性，我們可以指出它被破壞的特定時間點（例如，如果違反了唯一性屬性，我們可以識別返回重複隔離令牌的特定操作）。在違反安全屬性之後，違規無法撤消 —— 損害已經造成。
* 活性屬性以相反的方式工作：它可能在某個時間點不成立（例如，節點可能已傳送請求但尚未收到響應），但總有希望它將來可能得到滿足（即透過接收響應）。

區分安全性和活性屬性的一個優點是它有助於我們處理困難的系統模型。對於分散式演算法，通常要求安全屬性在系統模型的所有可能情況下 *始終* 成立 [^108]。也就是說，即使所有節點崩潰，或整個網路失效，演算法也必須確保它不會返回錯誤的結果（即，安全屬性保持滿足）。

然而，對於活性屬性，我們可以附加一些限制條件：例如，我們可以說請求只有在大多數節點沒有崩潰時才需要收到響應，並且只有在網路最終從中斷中恢復時才需要響應。部分同步模型的定義要求系統最終返回到同步狀態 —— 也就是說，任何網路中斷期只持續有限的時間，然後被修復。

#### 將系統模型對映到現實世界 {#mapping-system-models-to-the-real-world}

安全性和活性屬性以及系統模型對於推理分散式演算法的正確性非常有用。然而，在實踐中實現演算法時，現實的混亂事實又會回來咬你一口，很明顯系統模型是現實的簡化抽象。

例如，崩潰恢復模型中的演算法通常假設穩定儲存中的資料在崩潰後倖存。然而，如果磁碟上的資料損壞了，或者由於硬體錯誤或配置錯誤而擦除了資料，會發生什麼 [^117]？如果伺服器有韌體錯誤並且在重啟時無法識別其硬碟驅動器，即使驅動器正確連線到伺服器，會發生什麼 [^118]？

仲裁演算法（見 ["讀寫仲裁"](/tw/ch6#sec_replication_quorum_condition)）依賴於節點記住它聲稱已儲存的資料。如果節點可能患有健忘症並忘記先前儲存的資料，那會破壞仲裁條件，從而破壞演算法的正確性。也許需要一個新的系統模型，其中我們假設穩定儲存大多在崩潰後倖存，但有時可能會丟失。但該模型隨後變得更難推理。

演算法的理論描述可以宣告某些事情被簡單地假設不會發生 —— 在非拜占庭系統中，我們確實必須對可能和不可能發生的故障做出一些假設。然而，真正的實現可能仍然必須包含程式碼來處理被假設為不可能的事情發生的情況，即使該處理歸結為 `printf("Sucks to be you")` 和 `exit(666)` —— 即，讓人類操作員清理爛攤子 [^119]。（這是計算機科學和軟體工程之間的一個區別。）

這並不是說理論上的、抽象的系統模型是無用的 —— 恰恰相反。它們非常有助於將真實系統的複雜性提煉為我們可以推理的可管理的故障集，以便我們可以理解問題並嘗試系統地解決它。

### 形式化方法和隨機測試 {#sec_distributed_formal}

我們如何知道演算法滿足所需的屬性？由於併發性、部分失效和網路延遲，存在大量潛在狀態。我們需要保證屬性在每個可能的狀態下都成立，並確保我們沒有忘記任何邊界情況。

一種方法是透過數學描述演算法來形式驗證它，並使用證明技術來表明它在系統模型允許的所有情況下都滿足所需的屬性。證明演算法正確並不意味著它在真實系統上的 *實現* 必然總是正確執行。但這是一個非常好的第一步，因為理論分析可以發現演算法中的問題，這些問題可能在真實系統中長時間隱藏，並且只有當你的假設（例如，關於時序）由於不尋常的情況而失敗時才會咬你一口。

將理論分析與經驗測試相結合以驗證實現按預期執行是明智的。基於屬性的測試、模糊測試和確定性模擬測試（DST）等技術使用隨機化來在各種情況下測試系統。亞馬遜網路服務等公司已成功地在其許多產品上使用了這些技術的組合 [^120] [^121]。

#### 模型檢查與規範語言 {#model-checking-and-specification-languages}

*模型檢查器* 是幫助驗證演算法或系統按預期執行的工具。演算法規範是用專門構建的語言編寫的，如 TLA+、Gallina 或 FizzBee。這些語言使得更容易專注於演算法的行為，而不必擔心程式碼實現細節。然後，模型檢查器使用這些模型透過系統地嘗試所有可能發生的事情來驗證不變數在演算法的所有狀態中都成立。

模型檢查實際上不能證明演算法的不變數對每個可能的狀態都成立，因為大多數現實世界的演算法都有無限的狀態空間。對所有狀態的真正驗證需要形式證明，這是可以做到的，但通常比執行模型檢查器更困難。相反，模型檢查器鼓勵你將演算法的模型減少到可以完全驗證的近似值，或者將執行限制到某個上限（例如，透過設定可以傳送的最大訊息數）。任何只在更長執行時發生的錯誤將不會被發現。

儘管如此，模型檢查器在易用性和查詢非顯而易見錯誤的能力之間取得了很好的平衡。CockroachDB、TiDB、Kafka 和許多其他分散式系統使用模型規範來查詢和修復錯誤 [^122] [^123] [^124]。例如，使用 TLA+，研究人員能夠證明由演算法的散文描述中的歧義引起的檢視戳複製（VR）中資料丟失的可能性 [^125]。

按設計，模型檢查器不執行你的實際程式碼，而是執行一個簡化的模型，該模型僅指定你的協議的核心思想。這使得系統地探索狀態空間更易處理，但有風險是你的規範和你的實現彼此不同步 [^126]。可以檢查模型和真實實現是否具有等效行為，但這需要在真實實現中進行儀器化 [^127]。

#### 故障注入 {#sec_fault_injection}

許多錯誤是在機器和網路故障發生時觸發的。故障注入是一種有效（有時令人恐懼）的技術，用於驗證系統的實現在出錯時是否按預期工作。這個想法很簡單：將故障注入到正在執行的系統環境中，看看它如何表現。故障可以是網路故障、機器崩潰、磁碟損壞、暫停的程序 —— 你能想象到的計算機出錯的任何事情。

故障注入測試通常在與系統將執行的生產環境非常相似的環境中執行。有些甚至直接將故障注入到他們的生產環境中。Netflix 透過他們的 Chaos Monkey 工具推廣了這種方法 [^128]。生產故障注入通常被稱為 *混沌工程*，我們在 ["可靠性與容錯"](/tw/ch2#sec_introduction_reliability) 中討論過。

要執行故障注入測試，首先部署被測系統以及故障注入協調器和指令碼。協調器負責決定執行什麼故障以及何時執行它們。本地或遠端指令碼負責將故障注入到單個節點或程序中。注入指令碼使用許多不同的工具來觸發故障。可以使用 Linux 的 `kill` 命令暫停或殺死 Linux 程序，可以使用 `umount` 卸載磁碟，可以透過防火牆設定中斷網路連線。你可以在注入故障期間和之後檢查系統行為，以確保事情按預期工作。

觸發故障所需的無數工具使故障注入測試編寫起來很麻煩。採用像 Jepsen 這樣的故障注入框架來執行故障注入測試以簡化過程是常見的。這些框架帶有各種作業系統的整合和許多預構建的故障注入器 [^129]。Jepsen 在許多廣泛使用的系統中發現關鍵錯誤方面非常有效 [^130] [^131]。

#### 確定性模擬測試 {#deterministic-simulation-testing}

確定性模擬測試（DST）也已成為模型檢查和故障注入的流行補充。它使用與模型檢查器類似的狀態空間探索過程，但它測試你的實際程式碼，而不是模型。

在 DST 中，模擬自動執行系統的大量隨機執行。模擬期間的網路通訊、I/O 和時鐘時序都被模擬替換，允許模擬器控制事情發生的確切順序，包括各種時序和故障場景。這允許模擬器探索比手寫測試或故障注入更多的情況。如果測試失敗，它可以重新執行，因為模擬器知道觸發故障的確切操作順序 —— 與故障注入相比，後者對系統沒有如此細粒度的控制。

DST 要求模擬器能夠控制所有非確定性來源，例如網路延遲。通常採用三種策略之一來使程式碼確定性：

應用程式級
: 一些系統從頭開始構建，以便於確定性地執行程式碼。例如，DST 領域的先驅之一 FoundationDB 是使用稱為 Flow 的非同步通訊庫構建的。Flow 為開發人員提供了將確定性網路模擬注入系統的點 [^132]。類似地，TigerBeetle 是一個具有一流 DST 支援的線上事務處理（OLTP）資料庫。系統的狀態被建模為狀態機，所有突變都發生在單個事件迴圈中。當與模擬確定性原語（如時鐘）結合時，這種架構能夠確定性地執行 [^133]。

執行時級
: 具有非同步執行時和常用庫的語言提供了引入確定性的插入點。使用單執行緒執行時強制所有非同步程式碼按順序執行。例如，FrostDB 修補 Go 的執行時以按順序執行 goroutine [^134]。Rust 的 madsim 庫以類似的方式工作。Madsim 提供了 Tokio 的非同步執行時 API、AWS 的 S3 庫、Kafka 的 Rust 庫等的確定性實現。應用程式可以交換確定性庫和執行時以獲得確定性測試執行，而無需更改其程式碼。

機器級
: 與其在執行時修補程式碼，不如使整個機器確定性。這是一個微妙的過程，需要機器對所有通常非確定性的呼叫響應確定性響應。Antithesis 等工具透過構建自定義虛擬機器管理程式來做到這一點，該虛擬機器管理程式用確定性操作替換通常的非確定性操作。從時鐘到網路和儲存的一切都需要考慮。不過，一旦完成，開發人員可以在虛擬機器管理程式內的容器集合中執行其整個分散式系統，並獲得完全確定性的分散式系統。

DST 提供了超越可重放性的幾個優勢。Antithesis 等工具試圖透過在發現不太常見的行為時將測試執行分支為多個子執行來探索應用程式程式碼中的許多不同程式碼路徑。由於確定性測試通常使用模擬時鐘和網路呼叫，因此此類測試可以比掛鐘時間執行得更快。例如，TigerBeetle 的時間抽象允許模擬器模擬網路延遲和超時，而實際上不需要觸發超時的全部時間長度。這些技術允許模擬器更快地探索更多程式碼路徑。

#### 確定性的力量 {#sidebar_distributed_determinism}

非確定性是我們在本章中討論的所有分散式系統挑戰的核心：併發性、網路延遲、程序暫停、時鐘跳躍和崩潰都以不可預測的方式發生，從系統的一次執行到下一次執行都不同。相反，如果你能使系統確定性，那可以極大地簡化事情。

事實上，使事物確定性是一個簡單但強大的想法，在分散式系統設計中一再出現。除了確定性模擬測試，我們在過去的章節中已經看到了幾種使用確定性的方法：

* 事件溯源的一個關鍵優勢（見 ["事件溯源和 CQRS"](/tw/ch3#sec_datamodels_events)）是你可以確定性地重放事件日誌以重建派生的物化檢視。
* 工作流引擎（見 ["持久執行和工作流"](/tw/ch5#sec_encoding_dataflow_workflows)）依賴於工作流定義是確定性的，以提供持久執行語義。
* *狀態機複製*，我們將在 ["使用共享日誌"](/tw/ch10#sec_consistency_smr) 中討論，透過在每個副本上獨立執行相同的確定性事務序列來複制資料。我們已經看到了這個想法的兩個變體：基於語句的複製（見 ["複製日誌的實現"](/tw/ch6#sec_replication_implementation)）和使用儲存過程的序列事務執行（見 ["儲存過程的利弊"](/tw/ch8#sec_transactions_stored_proc_tradeoffs)）。

然而，使程式碼完全確定性需要小心。即使你已經刪除了所有併發性並用確定性模擬替換了 I/O、網路通訊、時鐘和隨機數生成器，非確定性元素可能仍然存在。例如，在某些程式語言中，迭代雜湊表元素的順序可能是非確定性的。是否遇到資源限制（記憶體分配失敗、堆疊溢位）也是非確定性的。

## 總結 {#summary}

在本章中，我們討論了分散式系統中可能發生的各種問題，包括：

* 每當你嘗試透過網路傳送資料包時，它可能會丟失或任意延遲。同樣，回覆可能會丟失或延遲，所以如果你沒有得到回覆，你不知道訊息是否送達。
* 節點的時鐘可能與其他節點嚴重不同步（儘管你盡最大努力設定了 NTP），它可能會突然向前或向後跳躍，而依賴它是危險的，因為你很可能沒有一個好的時鐘置信區間度量。
* 程序可能在其執行的任何時刻暫停相當長的時間，被其他節點宣告死亡，然後再次恢復活動而沒有意識到它曾暫停。

這種 *部分失效* 可能發生的事實是分散式系統的決定性特徵。每當軟體嘗試做任何涉及其他節點的事情時，都有可能偶爾失敗、隨機變慢或根本沒有響應（並最終超時）。在分散式系統中，我們嘗試將對部分失效的容忍構建到軟體中，這樣即使某些組成部分出現故障，整個系統也可以繼續執行。

要容忍故障，第一步是 *檢測* 它們，但即使這樣也很困難。大多數系統沒有準確的機制來檢測節點是否已失敗，因此大多數分散式演算法依賴超時來確定遠端節點是否仍然可用。然而，超時無法區分網路和節點故障，可變的網路延遲有時會導致節點被錯誤地懷疑崩潰。處理跛行節點（limping nodes）更加困難，這些節點正在響應但速度太慢而無法做任何有用的事情。

一旦檢測到故障，讓系統容忍它也不容易：沒有全域性變數、沒有共享記憶體、沒有公共知識或機器之間任何其他型別的共享狀態 [^83]。節點甚至無法就現在是什麼時間達成一致，更不用說任何更深刻的事情了。資訊從一個節點流向另一個節點的唯一方式是透過不可靠的網路傳送。單個節點無法安全地做出重大決策，因此我們需要協議來徵求其他節點的幫助並嘗試獲得法定人數的同意。

如果你習慣於在單臺計算機的理想數學完美環境中編寫軟體，其中相同的操作總是確定性地返回相同的結果，那麼轉向分散式系統混亂的物理現實可能會有點震驚。相反，分散式系統工程師通常會認為如果一個問題可以在單臺計算機上解決，那它就是微不足道的 [^4]，而且單臺計算機現在確實可以做很多事情。如果你可以避免開啟潘多拉的盒子，只需將事情保持在單臺機器上，例如使用嵌入式儲存引擎（見 ["嵌入式儲存引擎"](/tw/ch4#sidebar_embedded)），通常值得這樣做。

然而，正如在 ["分散式系統與單節點系統"](/tw/ch1#sec_introduction_distributed) 中討論的，可伸縮性並不是使用分散式系統的唯一原因。容錯和低延遲（透過將資料在地理上放置在靠近使用者的位置）是同樣重要的目標，而這些事情無法透過單個節點實現。分散式系統的力量在於，原則上它們可以在服務層面永遠執行而不被中斷，因為所有故障和維護都可以在節點層面處理。（實際上，如果錯誤的配置更改被推送到所有節點，仍然會讓分散式系統崩潰。）

在本章中，我們還探討了網路、時鐘和程序的不可靠性是否是不可避免的自然法則。我們看到它不是：可以在網路中提供硬即時響應保證和有界延遲，但這樣做非常昂貴，並導致硬體資源利用率降低。大多數非安全關鍵系統選擇便宜和不可靠而不是昂貴和可靠。

本章一直在討論問題，給了我們一個暗淡的前景。在下一章中，我們將轉向解決方案，並討論一些為應對分散式系統中的問題而設計的演算法。


### 參考

[^1]: Mark Cavage. [There’s Just No Getting Around It: You’re Building a Distributed System](https://queue.acm.org/detail.cfm?id=2482856). *ACM Queue*, volume 11, issue 4, pages 80-89, April 2013. [doi:10.1145/2466486.2482856](https://doi.org/10.1145/2466486.2482856)
[^2]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW)
[^3]: Coda Hale. [You Can’t Sacrifice Partition Tolerance](https://codahale.com/you-cant-sacrifice-partition-tolerance/). *codahale.com*, October 2010. Archived at [perma.cc/6GJU-X4G5](https://perma.cc/6GJU-X4G5)
[^4]: Jeff Hodges. [Notes on Distributed Systems for Young Bloods](https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/). *somethingsimilar.com*, January 2013. Archived at [perma.cc/B636-62CE](https://perma.cc/B636-62CE)
[^5]: Van Jacobson. [Congestion Avoidance and Control](https://www.cs.usask.ca/ftp/pub/discus/seminars2002-2003/p314-jacobson.pdf). At *ACM Symposium on Communications Architectures and Protocols* (SIGCOMM), August 1988. [doi:10.1145/52324.52356](https://doi.org/10.1145/52324.52356)
[^6]: Bert Hubert. [The Ultimate SO\_LINGER Page, or: Why Is My TCP Not Reliable](https://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable). *blog.netherlabs.nl*, January 2009. Archived at [perma.cc/6HDX-L2RR](https://perma.cc/6HDX-L2RR)
[^7]: Jerome H. Saltzer, David P. Reed, and David D. Clark. [End-To-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf). *ACM Transactions on Computer Systems*, volume 2, issue 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](https://doi.org/10.1145/357401.357402)
[^8]: Peter Bailis and Kyle Kingsbury. [The Network Is Reliable](https://queue.acm.org/detail.cfm?id=2655736). *ACM Queue*, volume 12, issue 7, pages 48-55, July 2014. [doi:10.1145/2639988.2655736](https://doi.org/10.1145/2639988.2655736)
[^9]: Joshua B. Leners, Trinabh Gupta, Marcos K. Aguilera, and Michael Walfish. [Taming Uncertainty in Distributed Systems with Help from the Network](https://cs.nyu.edu/~mwalfish/papers/albatross-eurosys15.pdf). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741976](https://doi.org/10.1145/2741948.2741976)
[^10]: Phillipa Gill, Navendu Jain, and Nachiappan Nagappan. [Understanding Network Failures in Data Centers: Measurement, Analysis, and Implications](https://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p350.pdf). At *ACM SIGCOMM Conference*, August 2011. [doi:10.1145/2018436.2018477](https://doi.org/10.1145/2018436.2018477)
[^11]: Urs Hölzle. [But recently a farmer had started grazing a herd of cows nearby. And whenever they stepped on the fiber link, they bent it enough to cause a blip](https://x.com/uhoelzle/status/1263333283107991558). *x.com*, May 2020. Archived at [perma.cc/WX8X-ZZA5](https://perma.cc/WX8X-ZZA5)
[^12]: CBC News. [Hundreds lose internet service in northern B.C. after beaver chews through cable](https://www.cbc.ca/news/canada/british-columbia/beaver-internet-down-tumbler-ridge-1.6001594). *cbc.ca*, April 2021. Archived at [perma.cc/UW8C-H2MY](https://perma.cc/UW8C-H2MY)
[^13]: Will Oremus. [The Global Internet Is Being Attacked by Sharks, Google Confirms](https://slate.com/technology/2014/08/shark-attacks-threaten-google-s-undersea-internet-cables-video.html). *slate.com*, August 2014. Archived at [perma.cc/P6F3-C6YG](https://perma.cc/P6F3-C6YG)
[^14]: Jess Auerbach Jahajeeah. [Down to the wire: The ship fixing our internet](https://continent.substack.com/p/down-to-the-wire-the-ship-fixing). *continent.substack.com*, November 2023. Archived at [perma.cc/DP7B-EQ7S](https://perma.cc/DP7B-EQ7S)
[^15]: Santosh Janardhan. [More details about the October 4 outage](https://engineering.fb.com/2021/10/05/networking-traffic/outage-details/). *engineering.fb.com*, October 2021. Archived at [perma.cc/WW89-VSXH](https://perma.cc/WW89-VSXH)
[^16]: Tom Parfitt. [Georgian woman cuts off web access to whole of Armenia](https://www.theguardian.com/world/2011/apr/06/georgian-woman-cuts-web-access). *theguardian.com*, April 2011. Archived at [perma.cc/KMC3-N3NZ](https://perma.cc/KMC3-N3NZ)
[^17]: Antonio Voce, Tural Ahmedzade and Ashley Kirk. [‘Shadow fleets’ and subaquatic sabotage: are Europe’s undersea internet cables under attack?](https://www.theguardian.com/world/ng-interactive/2025/mar/05/shadow-fleets-subaquatic-sabotage-europe-undersea-internet-cables-under-attack) *theguardian.com*, March 2025. Archived at [perma.cc/HA7S-ZDBV](https://perma.cc/HA7S-ZDBV)
[^18]: Shengyun Liu, Paolo Viotti, Christian Cachin, Vivien Quéma, and Marko Vukolić. [XFT: Practical Fault Tolerance beyond Crashes](https://www.usenix.org/system/files/conference/osdi16/osdi16-liu.pdf). At *12th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2016.
[^19]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ)
[^20]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY)
[^21]: Mohammed Alfatafta, Basil Alkhatib, Ahmed Alquraan, and Samer Al-Kiswany. [Toward a Generic Fault Tolerance Technique for Partial Network Partitioning](https://www.usenix.org/conference/osdi20/presentation/alfatafta). At *14th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2020.
[^22]: Marc A. Donges. [Re: bnx2 cards Intermittantly Going Offline](https://www.spinics.net/lists/netdev/msg210485.html). Message to Linux *netdev* mailing list, *spinics.net*, September 2012. Archived at [perma.cc/TXP6-H8R3](https://perma.cc/TXP6-H8R3)
[^23]: Troy Toman. [Inside a CODE RED: Network Edition](https://signalvnoise.com/svn3/inside-a-code-red-network-edition/). *signalvnoise.com*, September 2020. Archived at [perma.cc/BET6-FY25](https://perma.cc/BET6-FY25)
[^24]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch](https://aphyr.com/posts/317-call-me-maybe-elasticsearch). *aphyr.com*, June 2014. Archived at [perma.cc/JK47-S89J](https://perma.cc/JK47-S89J)
[^25]: Salvatore Sanfilippo. [A Few Arguments About Redis Sentinel Properties and Fail Scenarios](https://antirez.com/news/80). *antirez.com*, October 2014. Archived at [perma.cc/8XEU-CLM8](https://perma.cc/8XEU-CLM8)
[^26]: Nicolas Liochon. [CAP: If All You Have Is a Timeout, Everything Looks Like a Partition](http://blog.thislongrun.com/2015/05/CAP-theorem-partition-timeout-zookeeper.html). *blog.thislongrun.com*, May 2015. Archived at [perma.cc/FS57-V2PZ](https://perma.cc/FS57-V2PZ)
[^27]: Matthew P. Grosvenor, Malte Schwarzkopf, Ionel Gog, Robert N. M. Watson, Andrew W. Moore, Steven Hand, and Jon Crowcroft. [Queues Don’t Matter When You Can JUMP Them!](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-grosvenor_update.pdf) At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
[^28]: Theo Julienne. [Debugging network stalls on Kubernetes](https://github.blog/engineering/debugging-network-stalls-on-kubernetes/). *github.blog*, November 2019. Archived at [perma.cc/K9M8-XVGL](https://perma.cc/K9M8-XVGL)
[^29]: Guohui Wang and T. S. Eugene Ng. [The Impact of Virtualization on Network Performance of Amazon EC2 Data Center](https://www.cs.rice.edu/~eugeneng/papers/INFOCOM10-ec2.pdf). At *29th IEEE International Conference on Computer Communications* (INFOCOM), March 2010. [doi:10.1109/INFCOM.2010.5461931](https://doi.org/10.1109/INFCOM.2010.5461931)
[^30]: Brandon Philips. [etcd: Distributed Locking and Service Discovery](https://www.youtube.com/watch?v=HJIjTTHWYnE). At *Strange Loop*, September 2014.
[^31]: Steve Newman. [A Systematic Look at EC2 I/O](https://www.sentinelone.com/blog/a-systematic-look-at-ec2-i-o/). *blog.scalyr.com*, October 2012. Archived at [perma.cc/FL4R-H2VE](https://perma.cc/FL4R-H2VE)
[^32]: Naohiro Hayashibara, Xavier Défago, Rami Yared, and Takuya Katayama. [The ϕ Accrual Failure Detector](https://hdl.handle.net/10119/4784). Japan Advanced Institute of Science and Technology, School of Information Science, Technical Report IS-RR-2004-010, May 2004. Archived at [perma.cc/NSM2-TRYA](https://perma.cc/NSM2-TRYA)
[^33]: Jeffrey Wang. [Phi Accrual Failure Detector](https://ternarysearch.blogspot.com/2013/08/phi-accrual-failure-detector.html). *ternarysearch.blogspot.co.uk*, August 2013. Archived at [perma.cc/L452-AMLV](https://perma.cc/L452-AMLV)
[^34]: Srinivasan Keshav. *An Engineering Approach to Computer Networking: ATM Networks, the Internet, and the Telephone Network*. Addison-Wesley Professional, May 1997. ISBN: 978-0-201-63442-6
[^35]: Othmar Kyas. *ATM Networks*. International Thomson Publishing, 1995. ISBN: 978-1-850-32128-6
[^36]: Mellanox Technologies. [InfiniBand FAQ, Rev 1.3](https://network.nvidia.com/related-docs/whitepapers/InfiniBandFAQ_FQ_100.pdf). *network.nvidia.com*, December 2014. Archived at [perma.cc/LQJ4-QZVK](https://perma.cc/LQJ4-QZVK)
[^37]: Jose Renato Santos, Yoshio Turner, and G. (John) Janakiraman. [End-to-End Congestion Control for InfiniBand](https://infocom2003.ieee-infocom.org/papers/28_01.PDF). At *22nd Annual Joint Conference of the IEEE Computer and Communications Societies* (INFOCOM), April 2003. Also published by HP Laboratories Palo Alto, Tech Report HPL-2002-359. [doi:10.1109/INFCOM.2003.1208949](https://doi.org/10.1109/INFCOM.2003.1208949)
[^38]: Jialin Li, Naveen Kr. Sharma, Dan R. K. Ports, and Steven D. Gribble. [Tales of the Tail: Hardware, OS, and Application-level Sources of Tail Latency](https://syslab.cs.washington.edu/papers/latency-socc14.pdf). At *ACM Symposium on Cloud Computing* (SOCC), November 2014. [doi:10.1145/2670979.2670988](https://doi.org/10.1145/2670979.2670988)
[^39]: Ulrich Windl, David Dalton, Marc Martinec, and Dale R. Worley. [The NTP FAQ and HOWTO](https://www.ntp.org/ntpfaq/). *ntp.org*, November 2006.
[^40]: John Graham-Cumming. [How and why the leap second affected Cloudflare DNS](https://blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/). *blog.cloudflare.com*, January 2017. Archived at [archive.org](https://web.archive.org/web/20250202041444/https%3A//blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/)
[^41]: David Holmes. [Inside the Hotspot VM: Clocks, Timers and Scheduling Events – Part I – Windows](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks). *blogs.oracle.com*, October 2006. Archived at [archive.org](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks)
[^42]: Joran Dirk Greef. [Three Clocks are Better than One](https://tigerbeetle.com/blog/2021-08-30-three-clocks-are-better-than-one/). *tigerbeetle.com*, August 2021. Archived at [perma.cc/5RXG-EU6B](https://perma.cc/5RXG-EU6B)
[^43]: Oliver Yang. [Pitfalls of TSC usage](https://oliveryang.net/2015/09/pitfalls-of-TSC-usage/). *oliveryang.net*, September 2015. Archived at [perma.cc/Z2QY-5FRA](https://perma.cc/Z2QY-5FRA)
[^44]: Steve Loughran. [Time on Multi-Core, Multi-Socket Servers](https://steveloughran.blogspot.com/2015/09/time-on-multi-core-multi-socket-servers.html). *steveloughran.blogspot.co.uk*, September 2015. Archived at [perma.cc/7M4S-D4U6](https://perma.cc/7M4S-D4U6)
[^45]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
[^46]: M. Caporaloni and R. Ambrosini. [How Closely Can a Personal Computer Clock Track the UTC Timescale Via the Internet?](https://iopscience.iop.org/0143-0807/23/4/103/) *European Journal of Physics*, volume 23, issue 4, pages L17–L21, June 2002. [doi:10.1088/0143-0807/23/4/103](https://doi.org/10.1088/0143-0807/23/4/103)
[^47]: Nelson Minar. [A Survey of the NTP Network](https://alumni.media.mit.edu/~nelson/research/ntp-survey99/). *alumni.media.mit.edu*, December 1999. Archived at [perma.cc/EV76-7ZV3](https://perma.cc/EV76-7ZV3)
[^48]: Viliam Holub. [Synchronizing Clocks in a Cassandra Cluster Pt. 1 – The Problem](https://blog.rapid7.com/2014/03/14/synchronizing-clocks-in-a-cassandra-cluster-pt-1-the-problem/). *blog.rapid7.com*, March 2014. Archived at [perma.cc/N3RV-5LNL](https://perma.cc/N3RV-5LNL)
[^49]: Poul-Henning Kamp. [The One-Second War (What Time Will You Die?)](https://queue.acm.org/detail.cfm?id=1967009) *ACM Queue*, volume 9, issue 4, pages 44–48, April 2011. [doi:10.1145/1966989.1967009](https://doi.org/10.1145/1966989.1967009)
[^50]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU)
[^51]: Christopher Pascoe. [Time, Technology and Leaping Seconds](https://googleblog.blogspot.com/2011/09/time-technology-and-leaping-seconds.html). *googleblog.blogspot.co.uk*, September 2011. Archived at [perma.cc/U2JL-7E74](https://perma.cc/U2JL-7E74)
[^52]: Mingxue Zhao and Jeff Barr. [Look Before You Leap – The Coming Leap Second and AWS](https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/). *aws.amazon.com*, May 2015. Archived at [perma.cc/KPE9-XMFM](https://perma.cc/KPE9-XMFM)
[^53]: Darryl Veitch and Kanthaiah Vijayalayan. [Network Timing and the 2015 Leap Second](https://opus.lib.uts.edu.au/bitstream/10453/43923/1/LeapSecond_camera.pdf). At *17th International Conference on Passive and Active Measurement* (PAM), April 2016. [doi:10.1007/978-3-319-30505-9\_29](https://doi.org/10.1007/978-3-319-30505-9_29)
[^54]: VMware, Inc. [Timekeeping in VMware Virtual Machines](https://www.vmware.com/docs/vmware_timekeeping). *vmware.com*, October 2008. Archived at [perma.cc/HM5R-T5NF](https://perma.cc/HM5R-T5NF)
[^55]: Victor Yodaiken. [Clock Synchronization in Finance and Beyond](https://www.yodaiken.com/wp-content/uploads/2018/05/financeandbeyond.pdf). *yodaiken.com*, November 2017. Archived at [perma.cc/9XZD-8ZZN](https://perma.cc/9XZD-8ZZN)
[^56]: Mustafa Emre Acer, Emily Stark, Adrienne Porter Felt, Sascha Fahl, Radhika Bhargava, Bhanu Dev, Matt Braithwaite, Ryan Sleevi, and Parisa Tabriz. [Where the Wild Warnings Are: Root Causes of Chrome HTTPS Certificate Errors](https://acmccs.github.io/papers/p1407-acerA.pdf). At *ACM SIGSAC Conference on Computer and Communications Security* (CCS), pages 1407–1420, October 2017. [doi:10.1145/3133956.3134007](https://doi.org/10.1145/3133956.3134007)
[^57]: European Securities and Markets Authority. [MiFID II / MiFIR: Regulatory Technical and Implementing Standards – Annex I](https://www.esma.europa.eu/sites/default/files/library/2015/11/2015-esma-1464_annex_i_-_draft_rts_and_its_on_mifid_ii_and_mifir.pdf). *esma.europa.eu*, Report ESMA/2015/1464, September 2015. Archived at [perma.cc/ZLX9-FGQ3](https://perma.cc/ZLX9-FGQ3)
[^58]: Luke Bigum. [Solving MiFID II Clock Synchronisation With Minimum Spend (Part 1)](https://catach.blogspot.com/2015/11/solving-mifid-ii-clock-synchronisation.html). *catach.blogspot.com*, November 2015. Archived at [perma.cc/4J5W-FNM4](https://perma.cc/4J5W-FNM4)
[^59]: Oleg Obleukhov and Ahmad Byagowi. [How Precision Time Protocol is being deployed at Meta](https://engineering.fb.com/2022/11/21/production-engineering/precision-time-protocol-at-meta/). *engineering.fb.com*, November 2022. Archived at [perma.cc/29G6-UJNW](https://perma.cc/29G6-UJNW)
[^60]: John Wiseman. [gpsjam.org](https://gpsjam.org/), July 2022.
[^61]: Josh Levinson, Julien Ridoux, and Chris Munns. [It’s About Time: Microsecond-Accurate Clocks on Amazon EC2 Instances](https://aws.amazon.com/blogs/compute/its-about-time-microsecond-accurate-clocks-on-amazon-ec2-instances/). *aws.amazon.com*, November 2023. Archived at [perma.cc/56M6-5VMZ](https://perma.cc/56M6-5VMZ)
[^62]: Kyle Kingsbury. [Call Me Maybe: Cassandra](https://aphyr.com/posts/294-call-me-maybe-cassandra/). *aphyr.com*, September 2013. Archived at [perma.cc/4MBR-J96V](https://perma.cc/4MBR-J96V)
[^63]: John Daily. [Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/). *riak.com*, November 2013. Archived at [perma.cc/4XB5-UCXY](https://perma.cc/4XB5-UCXY)
[^64]: Marc Brooker. [It’s About Time!](https://brooker.co.za/blog/2023/11/27/about-time.html) *brooker.co.za*, November 2023. Archived at [perma.cc/N6YK-DRPA](https://perma.cc/N6YK-DRPA)
[^65]: Kyle Kingsbury. [The Trouble with Timestamps](https://aphyr.com/posts/299-the-trouble-with-timestamps). *aphyr.com*, October 2013. Archived at [perma.cc/W3AM-5VAV](https://perma.cc/W3AM-5VAV)
[^66]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563)
[^67]: Justin Sheehy. [There Is No Now: Problems With Simultaneity in Distributed Systems](https://queue.acm.org/detail.cfm?id=2745385). *ACM Queue*, volume 13, issue 3, pages 36–41, March 2015. [doi:10.1145/2733108](https://doi.org/10.1145/2733108)
[^68]: Murat Demirbas. [Spanner: Google’s Globally-Distributed Database](https://muratbuffalo.blogspot.com/2013/07/spanner-googles-globally-distributed_4.html). *muratbuffalo.blogspot.co.uk*, July 2013. Archived at [perma.cc/6VWR-C9WB](https://perma.cc/6VWR-C9WB)
[^69]: Dahlia Malkhi and Jean-Philippe Martin. [Spanner’s Concurrency Control](https://www.cs.cornell.edu/~ie53/publications/DC-col51-Sep13.pdf). *ACM SIGACT News*, volume 44, issue 3, pages 73–77, September 2013. [doi:10.1145/2527748.2527767](https://doi.org/10.1145/2527748.2527767)
[^70]: Franck Pachot. [Achieving Precise Clock Synchronization on AWS](https://www.yugabyte.com/blog/aws-clock-synchronization/). *yugabyte.com*, December 2024. Archived at [perma.cc/UYM6-RNBS](https://perma.cc/UYM6-RNBS)
[^71]: Spencer Kimball. [Living Without Atomic Clocks: Where CockroachDB and Spanner diverge](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/). *cockroachlabs.com*, January 2022. Archived at [perma.cc/AWZ7-RXFT](https://perma.cc/AWZ7-RXFT)
[^72]: Murat Demirbas. [Use of Time in Distributed Databases (part 4): Synchronized clocks in production databases](https://muratbuffalo.blogspot.com/2025/01/use-of-time-in-distributed-databases.html). *muratbuffalo.blogspot.com*, January 2025. Archived at [perma.cc/9WNX-Q9U3](https://perma.cc/9WNX-Q9U3)
[^73]: Cary G. Gray and David R. Cheriton. [Leases: An Efficient Fault-Tolerant Mechanism for Distributed File Cache Consistency](https://courses.cs.duke.edu/spring11/cps210/papers/p202-gray.pdf). At *12th ACM Symposium on Operating Systems Principles* (SOSP), December 1989. [doi:10.1145/74850.74870](https://doi.org/10.1145/74850.74870)
[^74]: Daniel Sturman, Scott Delap, Max Ross, et al. [Roblox Return to Service](https://corp.roblox.com/newsroom/2022/01/roblox-return-to-service-10-28-10-31-2021). *corp.roblox.com*, January 2022. Archived at [perma.cc/8ALT-WAS4](https://perma.cc/8ALT-WAS4)
[^75]: Todd Lipcon. [Avoiding Full GCs with MemStore-Local Allocation Buffers](https://www.slideshare.net/slideshow/hbase-hug-presentation/7038178). *slideshare.net*, February 2011. Archived at [perma.cc/CH62-2EWJ](https://perma.cc/CH62-2EWJ)
[^76]: Christopher Clark, Keir Fraser, Steven Hand, Jacob Gorm Hansen, Eric Jul, Christian Limpach, Ian Pratt, and Andrew Warfield. [Live Migration of Virtual Machines](https://www.usenix.org/legacy/publications/library/proceedings/nsdi05/tech/full_papers/clark/clark.pdf). At *2nd USENIX Symposium on Networked Systems Design & Implementation* (NSDI), May 2005.
[^77]: Mike Shaver. [fsyncers and Curveballs](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/). *shaver.off.net*, May 2008. Archived at [archive.org](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/)
[^78]: Zhenyun Zhuang and Cuong Tran. [Eliminating Large JVM GC Pauses Caused by Background IO Traffic](https://engineering.linkedin.com/blog/2016/02/eliminating-large-jvm-gc-pauses-caused-by-background-io-traffic). *engineering.linkedin.com*, February 2016. Archived at [perma.cc/ML2M-X9XT](https://perma.cc/ML2M-X9XT)
[^79]: Martin Thompson. [Java Garbage Collection Distilled](https://mechanical-sympathy.blogspot.com/2013/07/java-garbage-collection-distilled.html). *mechanical-sympathy.blogspot.co.uk*, July 2013. Archived at [perma.cc/DJT3-NQLQ](https://perma.cc/DJT3-NQLQ)
[^80]: David Terei and Amit Levy. [Blade: A Data Center Garbage Collector](https://arxiv.org/pdf/1504.02578). arXiv:1504.02578, April 2015.
[^81]: Martin Maas, Tim Harris, Krste Asanović, and John Kubiatowicz. [Trash Day: Coordinating Garbage Collection in Distributed Systems](https://timharris.uk/papers/2015-hotos.pdf). At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^82]: Martin Fowler. [The LMAX Architecture](https://martinfowler.com/articles/lmax.html). *martinfowler.com*, July 2011. Archived at [perma.cc/5AV4-N6RJ](https://perma.cc/5AV4-N6RJ)
[^83]: Joseph Y. Halpern and Yoram Moses. [Knowledge and common knowledge in a distributed environment](https://groups.csail.mit.edu/tds/papers/Halpern/JACM90.pdf). *Journal of the ACM* (JACM), volume 37, issue 3, pages 549–587, July 1990. [doi:10.1145/79147.79161](https://doi.org/10.1145/79147.79161)
[^84]: Chuzhe Tang, Zhaoguo Wang, Xiaodong Zhang, Qianmian Yu, Binyu Zang, Haibing Guan, and Haibo Chen. [Ad Hoc Transactions in Web Applications: The Good, the Bad, and the Ugly](https://ipads.se.sjtu.edu.cn/_media/publications/concerto-sigmod22.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526120](https://doi.org/10.1145/3514221.3526120)
[^85]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3
[^86]: Enis Söztutar. [HBase and HDFS: Understanding Filesystem Usage in HBase](https://www.slideshare.net/slideshow/hbase-and-hdfs-understanding-filesystem-usage/22990858). At *HBaseCon*, June 2013. Archived at [perma.cc/4DXR-9P88](https://perma.cc/4DXR-9P88)
[^87]: SUSE LLC. [SUSE Linux Enterprise High Availability 15 SP6 Administration Guide, Section 12: Fencing and STONITH](https://documentation.suse.com/sle-ha/15-SP6/html/SLE-HA-all/cha-ha-fencing.html). *documentation.suse.com*, March 2025. Archived at [perma.cc/8LAR-EL9D](https://perma.cc/8LAR-EL9D)
[^88]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^89]: Kyle Kingsbury. [etcd 3.4.3](https://jepsen.io/analyses/etcd-3.4.3). *jepsen.io*, January 2020. Archived at [perma.cc/2P3Y-MPWU](https://perma.cc/2P3Y-MPWU)
[^90]: Ensar Basri Kahveci. [Distributed Locks are Dead; Long Live Distributed Locks!](https://hazelcast.com/blog/long-live-distributed-locks/) *hazelcast.com*, April 2019. Archived at [perma.cc/7FS5-LDXE](https://perma.cc/7FS5-LDXE)
[^91]: Martin Kleppmann. [How to do distributed locking](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html). *martin.kleppmann.com*, February 2016. Archived at [perma.cc/Y24W-YQ5L](https://perma.cc/Y24W-YQ5L)
[^92]: Salvatore Sanfilippo. [Is Redlock safe?](https://antirez.com/news/101) *antirez.com*, February 2016. Archived at [perma.cc/B6GA-9Q6A](https://perma.cc/B6GA-9Q6A)
[^93]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y)
[^94]: Leslie Lamport, Robert Shostak, and Marshall Pease. [The Byzantine Generals Problem](https://www.microsoft.com/en-us/research/publication/byzantine-generals-problem/). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 4, issue 3, pages 382–401, July 1982. [doi:10.1145/357172.357176](https://doi.org/10.1145/357172.357176)
[^95]: Jim N. Gray. [Notes on Data Base Operating Systems](https://jimgray.azurewebsites.net/papers/dbos.pdf). In *Operating Systems: An Advanced Course*, Lecture Notes in Computer Science, volume 60, edited by R. Bayer, R. M. Graham, and G. Seegmüller, pages 393–481, Springer-Verlag, 1978. ISBN: 978-3-540-08755-7. Archived at [perma.cc/7S9M-2LZU](https://perma.cc/7S9M-2LZU)
[^96]: Brian Palmer. [How Complicated Was the Byzantine Empire?](https://slate.com/news-and-politics/2011/10/the-byzantine-tax-code-how-complicated-was-byzantium-anyway.html) *slate.com*, October 2011. Archived at [perma.cc/AN7X-FL3N](https://perma.cc/AN7X-FL3N)
[^97]: Leslie Lamport. [My Writings](https://lamport.azurewebsites.net/pubs/pubs.html). *lamport.azurewebsites.net*, December 2014. Archived at [perma.cc/5NNM-SQGR](https://perma.cc/5NNM-SQGR)
[^98]: John Rushby. [Bus Architectures for Safety-Critical Embedded Systems](https://www.csl.sri.com/papers/emsoft01/emsoft01.pdf). At *1st International Workshop on Embedded Software* (EMSOFT), October 2001. [doi:10.1007/3-540-45449-7\_22](https://doi.org/10.1007/3-540-45449-7_22)
[^99]: Jake Edge. [ELC: SpaceX Lessons Learned](https://lwn.net/Articles/540368/). *lwn.net*, March 2013. Archived at [perma.cc/AYX8-QP5X](https://perma.cc/AYX8-QP5X)
[^100]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458)
[^101]: Ezra Feilden, Adi Oltean, and Philip Johnston. [Why we should train AI in space](https://www.starcloud.com/wp). White Paper, *starcloud.com*, September 2024. Archived at [perma.cc/7Y3S-8UB6](https://perma.cc/7Y3S-8UB6)
[^102]: James Mickens. [The Saddest Moment](https://www.usenix.org/system/files/login-logout_1305_mickens.pdf). *USENIX ;login*, May 2013. Archived at [perma.cc/T7BZ-XCFR](https://perma.cc/T7BZ-XCFR)
[^103]: Martin Kleppmann and Heidi Howard. [Byzantine Eventual Consistency and the Fundamental Limits of Peer-to-Peer Databases](https://arxiv.org/abs/2012.00472). *arxiv.org*, December 2020. [doi:10.48550/arXiv.2012.00472](https://doi.org/10.48550/arXiv.2012.00472)
[^104]: Martin Kleppmann. [Making CRDTs Byzantine Fault Tolerant](https://martin.kleppmann.com/papers/bft-crdt-papoc22.pdf). At *9th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2022. [doi:10.1145/3517209.3524042](https://doi.org/10.1145/3517209.3524042)
[^105]: Evan Gilman. [The Discovery of Apache ZooKeeper’s Poison Packet](https://www.pagerduty.com/blog/the-discovery-of-apache-zookeepers-poison-packet/). *pagerduty.com*, May 2015. Archived at [perma.cc/RV6L-Y5CQ](https://perma.cc/RV6L-Y5CQ)
[^106]: Jonathan Stone and Craig Partridge. [When the CRC and TCP Checksum Disagree](https://conferences2.sigcomm.org/sigcomm/2000/conf/paper/sigcomm2000-9-1.pdf). At *ACM Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication* (SIGCOMM), August 2000. [doi:10.1145/347059.347561](https://doi.org/10.1145/347059.347561)
[^107]: Evan Jones. [How Both TCP and Ethernet Checksums Fail](https://www.evanjones.ca/tcp-and-ethernet-checksums-fail.html). *evanjones.ca*, October 2015. Archived at [perma.cc/9T5V-B8X5](https://perma.cc/9T5V-B8X5)
[^108]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283)
[^109]: Richard D. Schlichting and Fred B. Schneider. [Fail-stop processors: an approach to designing fault-tolerant computing systems](https://www.cs.cornell.edu/fbs/publications/Fail_Stop.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 1, issue 3, pages 222–238, August 1983. [doi:10.1145/357369.357371](https://doi.org/10.1145/357369.357371)
[^110]: Thanh Do, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, and Haryadi S. Gunawi. [Limplock: Understanding the Impact of Limpware on Scale-out Cloud Systems](https://ucare.cs.uchicago.edu/pdf/socc13-limplock.pdf). At *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523627](https://doi.org/10.1145/2523616.2523627)
[^111]: Josh Snyder and Joseph Lynch. [Garbage collecting unhealthy JVMs, a proactive approach](https://netflixtechblog.medium.com/introducing-jvmquake-ec944c60ba70). Netflix Technology Blog, *netflixtechblog.medium.com*, November 2019. Archived at [perma.cc/8BTA-N3YB](https://perma.cc/8BTA-N3YB)
[^112]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies*, February 2018.
[^113]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005)
[^114]: Chang Lou, Peng Huang, and Scott Smith. [Understanding, Detecting and Localizing Partial Failures in Large System Software](https://www.usenix.org/conference/nsdi20/presentation/lou). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020.
[^115]: Peter Bailis and Ali Ghodsi. [Eventual Consistency Today: Limitations, Extensions, and Beyond](https://queue.acm.org/detail.cfm?id=2462076). *ACM Queue*, volume 11, issue 3, pages 55–63, March 2013. [doi:10.1145/2460276.2462076](https://doi.org/10.1145/2460276.2462076)
[^116]: Bowen Alpern and Fred B. Schneider. [Defining Liveness](https://www.cs.cornell.edu/fbs/publications/DefLiveness.pdf). *Information Processing Letters*, volume 21, issue 4, pages 181–185, October 1985. [doi:10.1016/0020-0190(85)90056-0](https://doi.org/10.1016/0020-0190%2885%2990056-0)
[^117]: Flavio P. Junqueira. [Dude, Where’s My Metadata?](https://fpj.me/2015/05/28/dude-wheres-my-metadata/) *fpj.me*, May 2015. Archived at [perma.cc/D2EU-Y9S5](https://perma.cc/D2EU-Y9S5)
[^118]: Scott Sanders. [January 28th Incident Report](https://github.com/blog/2106-january-28th-incident-report). *github.com*, February 2016. Archived at [perma.cc/5GZR-88TV](https://perma.cc/5GZR-88TV)
[^119]: Jay Kreps. [A Few Notes on Kafka and Jepsen](https://blog.empathybox.com/post/62279088548/a-few-notes-on-kafka-and-jepsen). *blog.empathybox.com*, September 2013. Archived at [perma.cc/XJ5C-F583](https://perma.cc/XJ5C-F583)
[^120]: Marc Brooker and Ankush Desai. [Systems Correctness Practices at AWS](https://dl.acm.org/doi/pdf/10.1145/3712057). *ACM Queue*, volume 22, issue 6, November/December 2024. [doi:10.1145/3712057](https://doi.org/10.1145/3712057)
[^121]: Andrey Satarin. [Testing Distributed Systems: Curated list of resources on testing distributed systems](https://asatarin.github.io/testing-distributed-systems/). *asatarin.github.io*. Archived at [perma.cc/U5V8-XP24](https://perma.cc/U5V8-XP24)
[^122]: Jack Vanlightly. [Verifying Kafka transactions - Diary entry 2 - Writing an initial TLA+ spec](https://jack-vanlightly.com/analyses/2024/12/3/verifying-kafka-transactions-diary-entry-2-writing-an-initial-tla-spec). *jack-vanlightly.com*, December 2024. Archived at [perma.cc/NSQ8-MQ5N](https://perma.cc/NSQ8-MQ5N)
[^123]: Siddon Tang. [From Chaos to Order — Tools and Techniques for Testing TiDB, A Distributed NewSQL Database](https://www.pingcap.com/blog/chaos-practice-in-tidb/). *pingcap.com*, April 2018. Archived at [perma.cc/5EJB-R29F](https://perma.cc/5EJB-R29F)
[^124]: Nathan VanBenschoten. [Parallel Commits: An atomic commit protocol for globally distributed transactions](https://www.cockroachlabs.com/blog/parallel-commits/). *cockroachlabs.com*, November 2019. Archived at [perma.cc/5FZ7-QK6J](https://perma.cc/5FZ7-QK6J)
[^125]: Jack Vanlightly. [Paper: VR Revisited - State Transfer (part 3)](https://jack-vanlightly.com/analyses/2022/12/28/paper-vr-revisited-state-transfer-part-3). *jack-vanlightly.com*, December 2022. Archived at [perma.cc/KNK3-K6WS](https://perma.cc/KNK3-K6WS)
[^126]: Hillel Wayne. [What if the spec doesn’t match the code?](https://buttondown.com/hillelwayne/archive/what-if-the-spec-doesnt-match-the-code/) *buttondown.com*, March 2024. Archived at [perma.cc/8HEZ-KHER](https://perma.cc/8HEZ-KHER)
[^127]: Lingzhi Ouyang, Xudong Sun, Ruize Tang, Yu Huang, Madhav Jivrajani, Xiaoxing Ma, and Tianyin Xu. [Multi-Grained Specifications for Distributed System Model Checking and Verification](https://arxiv.org/abs/2409.14301). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696069](https://doi.org/10.1145/3689031.3696069)
[^128]: Yury Izrailevsky and Ariel Tseitlin. [The Netflix Simian Army](https://netflixtechblog.com/the-netflix-simian-army-16e57fbab116). *netflixtechblog.com*, July 2011. Archived at [perma.cc/M3NY-FJW6](https://perma.cc/M3NY-FJW6)
[^129]: Kyle Kingsbury. [Jepsen: On the perils of network partitions](https://aphyr.com/posts/281-jepsen-on-the-perils-of-network-partitions). *aphyr.com*, May 2013. Archived at [perma.cc/W98G-6HQP](https://perma.cc/W98G-6HQP)
[^130]: Kyle Kingsbury. [Jepsen Analyses](https://jepsen.io/analyses). *jepsen.io*, 2024. Archived at [perma.cc/8LDN-D2T8](https://perma.cc/8LDN-D2T8)
[^131]: Rupak Majumdar and Filip Niksic. [Why is random testing effective for partition tolerance bugs?](https://dl.acm.org/doi/pdf/10.1145/3158134) *Proceedings of the ACM on Programming Languages* (PACMPL), volume 2, issue POPL, article no. 46, December 2017. [doi:10.1145/3158134](https://doi.org/10.1145/3158134)
[^132]: FoundationDB project authors. [Simulation and Testing](https://apple.github.io/foundationdb/testing.html). *apple.github.io*. Archived at [perma.cc/NQ3L-PM4C](https://perma.cc/NQ3L-PM4C)
[^133]: Alex Kladov. [Simulation Testing For Liveness](https://tigerbeetle.com/blog/2023-07-06-simulation-testing-for-liveness/). *tigerbeetle.com*, July 2023. Archived at [perma.cc/RKD4-HGCR](https://perma.cc/RKD4-HGCR)
[^134]: Alfonso Subiotto Marqués. [(Mostly) Deterministic Simulation Testing in Go](https://www.polarsignals.com/blog/posts/2024/05/28/mostly-dst-in-go). *polarsignals.com*, May 2024. Archived at [perma.cc/ULD6-TSA4](https://perma.cc/ULD6-TSA4)

================================================
FILE: content/tw/colophon.md
================================================
---
title: 後記
weight: 600
breadcrumbs: false
---

{{< callout type="warning" >}}
當前頁面來自本書第一版，第二版尚不可用
{{< /callout >}}

## 關於作者

**Martin Kleppmann** 是英國劍橋大學副教授，教授分散式系統與密碼學協議。2017 年出版的《設計資料密集型應用》第一版確立了他在資料系統領域的權威地位；他在分散式系統方面的研究也推動了 local-first 軟體運動。此前他曾在 LinkedIn、Rapportive 等網際網路公司擔任軟體工程師和創業者，負責大規模資料基礎設施。

**Chris Riccomini** 是軟體工程師、創業投資人和作者，擁有 15 年以上在 PayPal、LinkedIn、WePay 的工作經驗。他運營 Materialized View Capital，專注於基礎設施初創企業投資；同時也是 Apache Samza 與 SlateDB 的共同創造者，並合著了 *The Missing README: A Guide for the New Software Engineer*。

![](https://martin.kleppmann.com/2017/03/ddia-poster.jpg)

## 關於譯者

[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
PostgreSQL 發行版 [**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，資料庫/雲計算 KOL。


## 後記

《設計資料密集型應用》封面上的動物是 **印度野豬（Sus scrofa cristatus）**，它是在印度、緬甸、尼泊爾、斯里蘭卡和泰國發現的一種野豬的亞種。與歐洲野豬不同，它們有更高的背部鬃毛，沒有體表絨毛，以及更大更直的頭骨。

印度野豬的毛髮呈灰色或黑色，脊背上有短而硬的毛。雄性有突出的犬齒（稱為獠牙，tushes），用來與對手戰鬥或抵禦掠食者。雄性比雌性大，這些物種平均肩高 33-35 英寸，體重 200-300 磅。它們的天敵包括熊、老虎和各種大型貓科動物。

這些動物夜行且雜食 —— 它們吃各種各樣的東西，包括根、昆蟲、腐肉、堅果、漿果和小動物。野豬經常因為破壞農作物的根被人們所熟知，他們造成大量的破壞，並被農民所敵視。他們每天需要攝入 4,000 ~ 4,500 卡路里的能量。野豬有發達的嗅覺，這有助於尋找地下植物和挖掘動物。然而，它們的視力很差。

野豬在人類文化中一直具有重要意義。在印度教傳說中，野豬是毗溼奴神的化身。在古希臘的喪葬紀念碑中，它是一個勇敢失敗者的象徵（與勝利的獅子相反）。由於它的侵略，它被描繪在斯堪的納維亞、日耳曼和盎格魯撒克遜戰士的盔甲和武器上。在中國十二生肖中，它象徵著決心和急躁。

O'Reilly 封面上的許多動物都受到威脅，這些動物對世界都很重要。要了解有關如何提供幫助的更多資訊，請訪問 animals.oreilly.com。

封面圖片來自 Shaw's Zoology。封面字型是 URW Typewriter 和 Guardian Sans。文字字型是 Adobe Minion Pro；圖中的字型是 Adobe Myriad Pro；標題字型是 Adobe Myriad Condensed；程式碼字型是 Dalton Maag 的 Ubuntu Mono。

================================================
FILE: content/tw/contrib.md
================================================
---
title: 貢獻者
weight: 800
breadcrumbs: false
---

## 譯者

[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
[**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，公眾號：《老馮雲數》，資料庫 KOL。

## 校訂與維護

YinGang [@yingang](https://github.com/yingang) 對本書進行了全文校訂，並持續維護。

## 繁體中文版本

[繁體中文](/tw) **版本維護** by  [@afunTW](https://github.com/afunTW)

## 貢獻列表

[GitHub 貢獻者列表](https://github.com/Vonng/ddia/graphs/contributors)

0. 全文校訂 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章語法標點校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 與[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/tw/part-i)前言，[ch2](/tw/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [詞彙表](/tw/glossary)、[後記](/tw/colophon)關於野豬的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本與轉換指令碼 by [@afunTW](https://github.com/afunTW)
7. 多處翻譯修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)


感謝所有提出意見，作出貢獻的朋友們，您可以在這裡找到所有貢獻的 [Issue 列表](https://github.com/Vonng/ddia/issues) 與 [PR 列表](https://github.com/Vonng/ddia/pulls)：

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一處拼寫錯誤                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一處拼寫錯誤                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一處標點錯誤                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一處格式錯誤                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一處參考連結                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正兩處引用錯誤                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支援輸出為 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一處格式錯誤                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一處影像連結                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 最佳化一處翻譯                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 最佳化一處翻譯                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 最佳化兩處翻譯                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 最佳化多處翻譯                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 最佳化一處翻譯                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一處繁體中文錯誤                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一處繁體中文錯誤                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一處翻譯錯誤                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正幾處拼寫錯誤                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 最佳化一處翻譯                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一處翻譯錯誤                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一處翻譯遺漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 最佳化一處翻譯                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 最佳化一處翻譯                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 最佳化一處翻譯                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 最佳化一處翻譯                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正兩處錯誤                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一處列表錯誤                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一處錯別字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一處公式問題                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多處內部連結錯誤                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正內部連結錯誤                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 顯示的問題                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 發現了繁體中文版本中的錯誤翻譯                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 連結                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正錯別字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 統一了 write skew 的翻譯                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 統一了 rebalancing 的翻譯                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻譯                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正譯文中的重複單詞                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不準確的翻譯                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一處翻譯錯誤                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一處拼寫錯誤                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可序列化”相關內容的多處翻譯不當                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重複讀”相關內容的多處翻譯不當                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“讀已提交”相關內容的多處翻譯不當                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁體中文的轉譯錯誤                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁體中文的轉譯錯誤                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通順的翻譯                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通順的翻譯                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正確的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通順的翻譯                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻譯                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正錯誤的圖片連結                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁體中文的轉譯錯誤：複雜                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正導航欄中的章節名稱                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正線性一致的繁體中文翻譯                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正錯誤的翻譯                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 最佳化譯文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通順的翻譯                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不準確的翻譯                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻譯                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正錯別字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小標題跳轉的問題                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的網址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正錯別字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建議docsify的主題風格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻譯錯誤                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁體中文的轉譯錯誤                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支援 GitHub Pages 裡的公式顯示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 語義網相關翻譯更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不變式相關翻譯更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正確的中文用詞和標點符號                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻譯                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重複的譯文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通順的翻譯                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 發現錯誤的文獻索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正錯誤的標點符號                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正錯誤字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建議將 network model 翻譯為網狀模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正錯誤字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正縮圖的錯別字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修訂sibling相關的翻譯                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一處不準確的翻譯                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 識別了當前簡繁轉譯過程中處理不當的地方，暫透過轉換指令碼規避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻譯`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新殘留的機翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建議去除段首的製表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 發現一處錯誤格式的章節引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章節Summary中多處不通順的翻譯                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多處不通順的或錯誤的翻譯                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 最佳化多處不通順的或錯誤的翻譯                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 最佳化多處不通順的或錯誤的翻譯                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化多處錯誤的或不通順的翻譯                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化一處容易產生歧義的翻譯                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正兩處錯誤的翻譯                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正兩處強調文字和四處程式碼變數名稱                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一處錯誤的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一處錯誤的翻譯（功能 -> 函式）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 最佳化 how best 的翻譯（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 統一每章的標題格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重複詞語                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改語句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 讀已寫入資料                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死還是賴活著                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼殺 → 破壞                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改變" rather than "蓋面"                   |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:鍾-->種，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否則 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->呼叫，顯著->顯著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改連結錯誤                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4幾處翻譯                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 幾個疏漏和格式錯誤                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 刪除一個多餘的右括號                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部連結錯誤                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假簡單"->"更加簡單"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修復 ch1 中的無序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 糾正多處的翻譯小錯誤                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修復多個連結錯誤 2.名詞最佳化修訂 3.錯誤修訂                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修復一些明顯錯誤                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修復連結錯誤                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改詞語順序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正錯別字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 糾正翻譯錯誤                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目錄和本章標題不符的情況                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修復語句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 詳細修改了後記中和印度野豬相關的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻譯                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01語法微調                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |


================================================
FILE: content/tw/glossary.md
================================================
---
title: 術語表
weight: 500
breadcrumbs: false
---

> 請注意：本術語表的定義刻意保持簡短，旨在傳達核心概念，而非覆蓋術語的全部細節。更多內容請參閱正文對應章節。

### 非同步（asynchronous）

不等待某件事完成（例如透過網路把資料傳送到另一個節點），且不假設它會在多長時間內完成。參見“[同步與非同步複製](/tw/ch6#sec_replication_sync_async)”、“[同步網路與非同步網路](/tw/ch9#sec_distributed_sync_networks)”和“[系統模型與現實](/tw/ch9#sec_distributed_system_model)”。

### 原子（atomic）

1. 在併發語境下：指一個操作看起來在某個單一時刻生效，其他併發程序不會看到它處於“半完成”狀態。另見 *isolation*。
2. 在事務語境下：指一組寫入要麼全部提交、要麼全部回滾，即使發生故障也不例外。參見“[原子性](/tw/ch8#sec_transactions_acid_atomicity)”和“[兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)”。

### 背壓（backpressure）

當接收方跟不上時，強制傳送方降速。也稱為 *flow control*。參見“[系統過載後無法恢復時會發生什麼](/tw/ch2#sidebar_metastable)”。

### 批處理（batch process）

以一個固定（通常較大）資料集為輸入、產出另一份資料且不修改輸入的計算。參見[第 11 章](/tw/ch11#ch_batch)。

### 有界（bounded）

具有已知上限或大小。例如可用於描述網路延遲（參見“[超時與無界延遲](/tw/ch9#sec_distributed_queueing)”）和資料集（參見[第 12 章](/tw/ch12#ch_stream)導言）。

### 拜占庭故障（Byzantine fault）

節點以任意錯誤方式行為，例如向不同節點發送相互矛盾或惡意訊息。參見“[拜占庭故障](/tw/ch9#sec_distributed_byzantine)”。

### 快取（cache）

透過記住近期訪問資料來加速後續讀取的元件。快取通常不完整：若未命中，需要回源到更慢但完整的底層資料儲存。

### CAP 定理（CAP theorem）

一個在實踐中經常被誤解、且不太有直接指導價值的理論結果。參見“[CAP 定理](/tw/ch10#the-cap-theorem)”。

### 因果關係（causality）

當一件事“先於”另一件事發生時產生的事件依賴關係。例如後續事件對先前事件的響應、建立在先前事件之上，或必須結合先前事件理解。參見“[happens-before 關係與併發](/tw/ch6#sec_replication_happens_before)”。

### 共識（consensus）

分散式計算中的基本問題：讓多個節點就某件事達成一致（例如誰是主節點）。這比直覺上要困難得多。參見“[共識](/tw/ch10#sec_consistency_consensus)”。

### 資料倉庫（data warehouse）

將多個 OLTP 系統的資料彙總並整理後，用於分析場景的資料庫。參見“[資料倉庫](/tw/ch1#sec_introduction_dwh)”。

### 宣告式（declarative）

描述“想要什麼性質”，而非“如何一步步實現”。在資料庫查詢中，最佳化器接收宣告式查詢並決定最佳執行方式。參見“[術語：宣告式查詢語言](/tw/ch3)”。

### 反正規化（denormalize）

在已正規化資料集中引入一定冗餘（常見形式為快取或索引）以換取更快讀取。反正規化值可看作預計算結果，類似物化檢視。參見“[正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)”。

### 派生資料（derived data）

透過可重複流程由其他資料生成的資料集，必要時可重新計算。通常用於加速某類讀取。索引、快取、物化檢視都屬於派生資料。參見“[記錄系統與派生資料](/tw/ch1#sec_introduction_derived)”。

### 確定性（deterministic）

一個函式在相同輸入下總產生相同輸出，不依賴隨機數、當前時間、網路互動等不可預測因素。參見“[確定性的力量](/tw/ch9#sidebar_distributed_determinism)”。

### 分散式（distributed）

系統在多個透過網路連線的節點上執行。其典型特徵是 *部分失效*：一部分壞了，另一部分仍在工作，而軟體往往難以精確知道哪裡壞了。參見“[故障與部分失效](/tw/ch9#sec_distributed_partial_failure)”。

### 永續性（durable）

以你相信不會丟失的方式儲存資料，即使發生各種故障。參見“[永續性](/tw/ch8#durability)”。

### ETL

Extract-Transform-Load（提取-轉換-載入）：從源資料庫抽取資料，轉成更適合分析查詢的形式，再載入到資料倉庫或批處理系統。參見“[資料倉庫](/tw/ch1#sec_introduction_dwh)”。

### 故障切換（failover）

在單主系統中，將主角色從一個節點切到另一個節點的過程。參見“[處理節點故障](/tw/ch6#sec_replication_failover)”。

### 容錯（fault-tolerant）

出現故障（如機器崩潰、鏈路故障）後仍可自動恢復。參見“[可靠性與容錯](/tw/ch2#sec_introduction_reliability)”。

### 流量控制（flow control）

見 *backpressure*。

### 追隨者（follower）

不直接接收客戶端寫入、僅應用來自主節點變更的副本。也稱 *secondary*、*read replica* 或 *hot standby*。參見“[單主複製](/tw/ch6#sec_replication_leader)”。

### 全文檢索（full-text search）

按任意關鍵詞搜尋文字，通常支援近似拼寫、同義詞等能力。全文索引是支援此類查詢的一種 *secondary index*。參見“[全文檢索](/tw/ch4#sec_storage_full_text)”。

### 圖（graph）

由 *vertices*（可引用物件，也稱 *nodes* 或 *entities*）和 *edges*（頂點間連線，也稱 *relationships* 或 *arcs*）組成的資料結構。參見“[圖狀資料模型](/tw/ch3#sec_datamodels_graph)”。

### 雜湊（hash）

把輸入對映成看似隨機數字的函式。相同輸入總得相同輸出；不同輸入通常輸出不同，但也可能碰撞（*collision*）。參見“[按鍵的雜湊分片](/tw/ch7#sec_sharding_hash)”。

### 冪等（idempotent）

可安全重試的操作：執行多次與執行一次效果相同。參見“[冪等性](/tw/ch12#sec_stream_idempotence)”。

### 索引（index）

一種可高效檢索“某欄位取某值”的記錄的資料結構。參見“[OLTP 的儲存與索引](/tw/ch4#sec_storage_oltp)”。

### 隔離性（isolation）

在事務語境下，併發事務相互干擾的程度。*Serializable* 最強，也常用更弱隔離級別。參見“[隔離性](/tw/ch8#sec_transactions_acid_isolation)”。

### 連線（join）

把具有關聯關係的記錄拼在一起。常見於一個記錄引用另一個記錄（外部索引鍵、文件引用、圖邊）時，查詢需要取到被引用物件。參見“[正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)”和“[JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)”。

### 領導者（leader）

當資料或服務跨多個節點複製時，被指定為可接受寫入的副本。可透過協議選舉或管理員指定。也稱 *primary* 或 *source*。參見“[單主複製](/tw/ch6#sec_replication_leader)”。

### 線性一致（linearizable）

表現得像系統裡只有一份資料副本，且由原子操作更新。參見“[線性一致性](/tw/ch10#sec_consistency_linearizability)”。

### 區域性（locality）

一種效能最佳化：把經常被一起訪問的資料放在一起。參見“[讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)”。

### 鎖（lock）

保證同一時刻只有一個執行緒/節點/事務訪問某資源的機制；其他訪問者需等待鎖釋放。參見“[兩階段鎖（2PL）](/tw/ch8#sec_transactions_2pl)”和“[分散式鎖與租約](/tw/ch9#sec_distributed_lock_fencing)”。

### 日誌（log）

只追加寫入的資料檔案。*WAL* 用於崩潰恢復（參見“[讓 B 樹可靠](/tw/ch4#sec_storage_btree_wal)”）；*log-structured* 儲存把日誌作為主儲存格式（參見“[日誌結構儲存](/tw/ch4#sec_storage_log_structured)”）；*replication log* 用於主從複製（參見“[單主複製](/tw/ch6#sec_replication_leader)”）；*event log* 可表示資料流（參見“[基於日誌的訊息代理](/tw/ch12#sec_stream_log)”）。

### 物化（materialize）

把計算結果提前算出並寫下來，而不是按需即時計算。參見“[事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)”。

### 節點（node）

執行在某臺計算機上的軟體例項，透過網路與其他節點協作完成任務。

### 正規化（normalized）

資料結構中儘量避免冗餘與重複。正規化資料庫裡某資料變化時通常只改一處，不需多處同步。參見“[正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)”。

### OLAP

Online Analytic Processing（線上分析處理）：典型訪問模式是對大量記錄做聚合（如 count/sum/avg）。參見“[事務系統與分析系統](/tw/ch1#sec_introduction_analytics)”。

### OLTP

Online Transaction Processing（線上事務處理）：典型訪問模式是快速讀寫少量記錄，通常按鍵索引。參見“[事務系統與分析系統](/tw/ch1#sec_introduction_analytics)”。

### 分片（sharding）

把單機裝不下的大資料集或計算拆成更小部分並分散到多臺機器上。也稱 *partitioning*。參見[第 7 章](/tw/ch7#ch_sharding)。

### 百分位（percentile）

透過統計多少值高於/低於某閾值來描述分佈。例如某時段 95 分位響應時間為 *t*，表示 95% 請求耗時小於 *t*，5% 更長。參見“[描述效能](/tw/ch2#sec_introduction_percentiles)”。

### 主鍵（primary key）

唯一標識一條記錄的值（通常為數字或字串）。在很多應用中由系統在建立時生成（順序或隨機），而非使用者手工指定。另見 *secondary index*。

### 法定票數（quorum）

一個操作被判定成功前所需的最少投票節點數。參見“[讀寫法定票數](/tw/ch6#sec_replication_quorum_condition)”。

### 再平衡（rebalance）

為均衡負載，把資料或服務從一個節點遷移到另一個節點。參見“[鍵值資料的分片](/tw/ch7#sec_sharding_key_value)”。

### 複製（replication）

在多個節點（*replicas*）上儲存同一份資料，以便部分節點不可達時仍可訪問。參見[第 6 章](/tw/ch6#ch_replication)。

### 模式（schema）

對資料結構（欄位、型別等）的描述。資料是否符合模式可在生命週期不同階段檢查（參見“[文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)”），模式也可隨時間演進（參見[第 5 章](/tw/ch5#ch_encoding)）。

### 二級索引（secondary index）

與主儲存並行維護的附加結構，用於高效檢索滿足某類條件的記錄。參見“[多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn)”和“[分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)”。

### 可序列化（serializable）

一種 *isolation* 保證：多個事務併發執行時，行為等價於某個序列順序逐個執行。參見“[可序列化](/tw/ch8#sec_transactions_serializability)”。

### 無共享（shared-nothing）

一種架構：獨立節點（各自 CPU、記憶體、磁碟）透過普通網路連線；相對的是共享記憶體或共享磁碟架構。參見“[共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing)”。

### 偏斜（skew）

1. 分片負載不均：某些分片請求/資料很多，另一些很少。也稱 *hot spots*。參見“[負載偏斜與熱點消除](/tw/ch7#sec_sharding_skew)”。
2. 一種時序異常，導致事件呈現為非預期的非順序。參見“[快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)”中的讀偏斜、“[寫偏斜與幻讀](/tw/ch8#sec_transactions_write_skew)”中的寫偏斜、以及“[用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)”中的時鐘偏斜。

### 腦裂（split brain）

兩個節點同時認為自己是領導者，可能破壞系統保證。參見“[處理節點故障](/tw/ch6#sec_replication_failover)”和“[少數服從多數](/tw/ch9#sec_distributed_majority)”。

### 儲存過程（stored procedure）

把事務邏輯編碼到資料庫伺服器端執行，使事務過程中無需與客戶端來回通訊。參見“[實際序列執行](/tw/ch8#sec_transactions_serial)”。

### 流處理（stream process）

持續執行的計算：消費無窮事件流並產出結果。參見[第 12 章](/tw/ch12#ch_stream)。

### 同步（synchronous）

*asynchronous* 的反義詞。

### 記錄系統（system of record）

持有某類資料主權威版本的系統，也稱 *source of truth*。資料變更首先寫入這裡，其他資料集可由其派生。參見“[記錄系統與派生資料](/tw/ch1#sec_introduction_derived)”。

### 超時（timeout）

最簡單的故障檢測方式之一：在一定時間內未收到響應即判定超時。但無法確定是遠端節點故障還是網路問題導致。參見“[超時與無界延遲](/tw/ch9#sec_distributed_queueing)”。

### 全序（total order）

一種可比較關係（如時間戳），任意兩者都能判定大小。若存在不可比較元素，則稱 *partial order*（偏序）。

### 事務（transaction）

把多次讀寫封裝為一個邏輯單元，以簡化錯誤處理與併發問題。參見[第 8 章](/tw/ch8#ch_transactions)。

### 兩階段提交（two-phase commit, 2PC）

保證多個資料庫節點對同一事務要麼都 *atomically* 提交、要麼都中止的演算法。參見“[兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)”。

### 兩階段鎖（two-phase locking, 2PL）

實現 *serializable isolation* 的演算法：事務對讀寫資料加鎖並持有到事務結束。參見“[兩階段鎖（2PL）](/tw/ch8#sec_transactions_2pl)”。

### 無界（unbounded）

沒有已知上限或大小。與 *bounded* 相反。

================================================
FILE: content/tw/indexes.md
================================================
---
title: 索引
weight: 550
breadcrumbs: false
---

### 符號

- 3FS（分散式檔案系統）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)

### A

- 中止（事務）, [事務](/tw/ch8#ch_transactions), [原子性](/tw/ch8#sec_transactions_acid_atomicity)
  - 級聯, [沒有髒讀](/tw/ch8#no-dirty-reads)
  - 在兩階段提交中, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
  - 樂觀併發控制的效能, [可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
  - 重試已中止的事務, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- 抽象, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [簡單性：管理複雜度](/tw/ch2#id38), [資料模型與查詢語言](/tw/ch3#ch_datamodels), [事務](/tw/ch8#ch_transactions), [總結](/tw/ch8#summary)
- 意外複雜性, [簡單性：管理複雜度](/tw/ch2#id38)
- 問責制, [責任與問責](/tw/ch14#id371)
- 會計（財務資料）, [總結](/tw/ch3#summary), [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
- Accumulo（資料庫）
  - 寬列資料模型, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality), [列壓縮](/tw/ch4#sec_storage_column_compression)
- ACID 屬性（事務）, [ACID 的含義](/tw/ch8#sec_transactions_acid)
  - 原子性, [原子性](/tw/ch8#sec_transactions_acid_atomicity), [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
  - 一致性, [一致性](/tw/ch8#sec_transactions_acid_consistency), [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
  - 永續性, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal), [永續性](/tw/ch8#durability)
  - 隔離性, [隔離性](/tw/ch8#sec_transactions_acid_isolation), [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
- 確認（訊息）, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
- active/active replication（見 multi-leader replication）
- active/passive replication（見 基於領導者的複製）
- ActiveMQ（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [訊息代理與資料庫的對比](/tw/ch12#id297)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
- ActiveRecord（物件關係對映器）, [物件關係對映（ORM）](/tw/ch3#object-relational-mapping-orm), [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- activity (workflows)（見 workflow engines）
- Actor 模型, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
  - （另見 event-driven architecture）
  - 與流處理的比較, [事件驅動架構與 RPC](/tw/ch12#sec_stream_actors_drpc)
- 自適應容量, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
- Advanced Message Queuing Protocol（見 AMQP）
- 航空航天系統, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
- Aerospike（資料庫）
  - 強一致性模式, [單物件寫入](/tw/ch8#sec_transactions_single_object)
- AGE（圖資料庫）, [Cypher 查詢語言](/tw/ch3#id57)
- 聚合
  - 資料立方體和物化檢視, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
  - 在批處理中, [排序與記憶體聚合](/tw/ch11#id275)
  - 在流處理中, [流分析](/tw/ch12#id318)
- 聚合管道（MongoDB）, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization), [文件的查詢語言](/tw/ch3#query-languages-for-documents)
- 敏捷, [可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability)
  - 最小化不可逆性, [批處理](/tw/ch11#ch_batch), [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
  - 充滿自信地快速前進, [端到端原則重現](/tw/ch13#id456)
- 一致意見, [單值共識](/tw/ch10#single-value-consensus), [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
  - （另見 共識）
- AI (artificial intelligence)（見 machine learning）
- AI Act (European Union), [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- Airbyte, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- Airflow（工作流排程器）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows), [批處理](/tw/ch11#ch_batch), [工作流排程](/tw/ch11#sec_batch_workflows)
  - 雲資料倉庫整合, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 用於 ETL, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 阿卡邁
  - 響應時間研究, [平均值、中位數與百分位點](/tw/ch2#id24)
- 演算法
  - 演算法正確性, [定義演算法的正確性](/tw/ch9#defining-the-correctness-of-an-algorithm)
  - B樹, [B 樹](/tw/ch4#sec_storage_b_trees)-[B 樹變體](/tw/ch4#b-tree-variants)
  - 分散式系統, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
  - 歸併排序, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [混洗資料](/tw/ch11#sec_shuffle)
  - 排程, [資源分配](/tw/ch11#id279)
  - SSTable 與 LSM 樹, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
- 全互聯複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)
- AllegroGraph（資料庫）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - SPARQL 查詢語言, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- ALTER TABLE 語句（SQL）, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility), [編碼與演化](/tw/ch5#ch_encoding)
- 亞馬遜
  - Dynamo（見 Dynamo（資料庫））
  - 響應時間研究, [平均值、中位數與百分位點](/tw/ch2#id24)
- Amazon Web Services (AWS)
  - Aurora（見 Aurora（雲資料庫））
  - ClockBound（見 ClockBound（時間同步））
  - 正確性測試, [形式化方法和隨機測試](/tw/ch9#sec_distributed_formal)
  - DynamoDB（見 DynamoDB（資料庫））
  - EBS（見 EBS（虛擬塊裝置））
  - Kinesis（見 Kinesis（訊息系統））
  - Neptune（見 Neptune（圖資料庫））
  - 網路可靠性, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - S3（見 S3（物件儲存））
- 放大
  - 偏見, [偏見與歧視](/tw/ch14#id370)
  - 故障, [維護派生狀態](/tw/ch13#id446)
  - 尾延遲, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla), [本地二級索引](/tw/ch7#id166)
  - 寫入放大, [寫放大](/tw/ch4#write-amplification)
- AMQP（高階訊息佇列協議）, [訊息代理與資料庫的對比](/tw/ch12#id297)
  - （另見 messaging systems）
  - 與基於日誌的訊息傳遞比較, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [重播舊訊息](/tw/ch12#sec_stream_replay)
  - 訊息順序, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
- 分析系統, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
  - 作為衍生資料系統, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - 來自運營系統的 ETL, [資料倉庫](/tw/ch1#sec_introduction_dwh)
  - 治理, [超越資料湖](/tw/ch1#beyond-the-data-lake)
- 分析, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)-[記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - 與事務處理的比較, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
  - 資料正常化, [正規化的權衡](/tw/ch3#trade-offs-of-normalization)
  - data warehousing（見 data warehousing）
  - predictive（見 predictive analytics）
  - 與批次處理的關係, [分析（Analytics）](/tw/ch11#sec_batch_olap)-[分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 計劃, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)-[星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
  - 查詢的快照隔離, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - 流式分析, [流分析](/tw/ch12#id318)
- 分析工程, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
- 反熵, [追趕錯過的寫入](/tw/ch6#sec_replication_read_repair)
- Antithesis（確定性模擬測試）, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- Apache Accumulo（見 Accumulo）
- Apache ActiveMQ（見 ActiveMQ）
- Apache AGE（見 AGE）
- Apache Arrow（見 Arrow（資料格式））
- Apache Avro（見 Avro）
- Apache Beam（見 Beam）
- Apache BookKeeper（見 BookKeeper）
- Apache Cassandra（見 Cassandra）
- Apache Curator（見 Curator）
- Apache DataFusion（見 DataFusion（查詢引擎））
- Apache Druid（見 Druid（資料庫））
- Apache Flink（見 Flink（處理框架））
- Apache HBase（見 HBase）
- Apache Iceberg（見 Iceberg（表格式））
- Apache Jena（見 Jena）
- Apache Kafka（見 Kafka）
- Apache Lucene（見 Lucene）
- Apache Oozie（見 Oozie（工作流排程器））
- Apache ORC（見 ORC（資料格式））
- Apache Parquet（見 Parquet（資料格式））
- Apache Pig（查詢語言）, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- Apache Pinot（見 Pinot（資料庫））
- Apache Pulsar（見 Pulsar）
- Apache Qpid（見 Qpid）
- Apache Samza（見 Samza）
- Apache Solr（見 Solr）
- Apache Spark（見 Spark；見 Spark（處理框架））
- Apache Storm（見 Storm）
- Apache Superset（見 Superset（資料視覺化軟體））
- Apache Thrift（見 Thrift）
- Apache ZooKeeper（見 ZooKeeper）
- Apama （流式分析）, [複合事件處理](/tw/ch12#id317)
- append-only files（見 logs）
- Application Programming Interfaces (APIs), [資料模型與查詢語言](/tw/ch3#ch_datamodels)
  - 用於改變流, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 分散式事務, [XA 事務](/tw/ch8#xa-transactions)
  - 用於服務, [流經服務的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)-[RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
    - （另見 services）
    - 可演化性, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
    - RESTful, [Web 服務](/tw/ch5#sec_web_services)
- application state（見 state）
- approximate search（見 similarity search）
- 歸檔儲存，來自資料庫的資料, [歸檔儲存](/tw/ch5#archival-storage)
- arcs（見 edges）
- ArcticDB（資料庫）, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 算術平均值, [平均值、中位數與百分位點](/tw/ch2#id24)
- 陣列
  - 陣列資料庫, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 多層面, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- Arrow（資料格式）, [列式儲存](/tw/ch4#sec_storage_column), [DataFrames](/tw/ch11#id287)
- artificial intelligence（見 machine learning）
- ASCII text, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)
- ASN.1 (schema language), [模式的優點](/tw/ch5#sec_encoding_schemas)
- 關聯表格, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many), [屬性圖](/tw/ch3#id56)
- 非同步網路, [不可靠的網路](/tw/ch9#sec_distributed_networks), [術語表](/tw/glossary)
  - 比較同步網路, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
  - 系統模型, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 非同步複製, [同步複製與非同步複製](/tw/ch6#sec_replication_sync_async), [術語表](/tw/glossary)
  - 故障轉移時的資料丟失, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
  - 從非同步追隨者讀取, [複製延遲的問題](/tw/ch6#sec_replication_lag)
  - 有多個領導, [多主複製](/tw/ch6#sec_replication_multi_leader)
- 非同步傳輸模式, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- 原子廣播, [共享日誌作為共識](/tw/ch10#sec_consistency_shared_logs)
- 原子鐘, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - （另見 clocks）
- 原子性, [術語表](/tw/glossary)
  - 原子自增, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 比較和設定, [條件寫入（比較並設定）](/tw/ch8#sec_transactions_compare_and_set), [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
    - （另見 比較和設定）
  - 異常資料, [正規化的權衡](/tw/ch3#trade-offs-of-normalization)
  - 獲取和新增/遞增, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical), [共識](/tw/ch10#sec_consistency_consensus), [獲取並增加作為共識](/tw/ch10#fetch-and-add-as-consensus)
  - 寫入操作, [原子寫操作](/tw/ch8#atomic-write-operations)
- 原子性, [原子性](/tw/ch8#sec_transactions_acid_atomicity), [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object), [術語表](/tw/glossary)
  - 原子提交
    - 避開, [多分割槽請求處理](/tw/ch13#id360), [無協調資料系統](/tw/ch13#id454)
    - 阻塞與非阻塞, [三階段提交](/tw/ch8#three-phase-commit)
    - 在流處理中, [恰好一次訊息處理](/tw/ch8#sec_transactions_exactly_once), [再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited), [原子提交再現](/tw/ch12#sec_stream_atomic_commit)
    - 維護衍生資料, [保持系統同步](/tw/ch12#sec_stream_sync)
  - 分散式事務, [分散式事務](/tw/ch8#sec_transactions_distributed)-[再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
  - 用於多物件事務, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
  - 用於單物件寫入, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 與共識的關係, [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
- 可審計性, [信任但驗證](/tw/ch13#sec_future_verification)-[用於可審計資料系統的工具](/tw/ch13#id366)
  - 設計, [為可審計性而設計](/tw/ch13#id365)
  - 自動審計系統, [不要盲目信任承諾](/tw/ch13#id364)
  - 透過不可改變性, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
  - 可審計資料系統工具, [用於可審計資料系統的工具](/tw/ch13#id366)
- Aurora（雲資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
- Aurora DSQL（資料庫）
  - 快照隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
- 自動縮放, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
- Automerge (CRDT library), [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 可用性, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
  - （另見 fault tolerance）
  - 在 CAP 定理中, [CAP 定理](/tw/ch10#the-cap-theorem)
  - 領袖選舉, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 在服務級別協議（SLA）中, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- 可用區, [透過冗餘容忍硬體故障](/tw/ch2#tolerating-hardware-faults-through-redundancy), [讀己之寫](/tw/ch6#sec_replication_ryw)
- Avro（資料格式）, [Avro](/tw/ch5#sec_encoding_avro)-[動態生成的模式](/tw/ch5#dynamically-generated-schemas)
  - 動態生成的模式, [動態生成的模式](/tw/ch5#dynamically-generated-schemas)
  - 物件容器檔案, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema), [歸檔儲存](/tw/ch5#archival-storage)
  - 讀取者確定寫入者模式, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
  - 模式演化, [寫入者模式與讀取者模式](/tw/ch5#the-writers-schema-and-the-readers-schema)
  - 批次處理中的用途, [MapReduce](/tw/ch11#sec_batch_mapreduce)
- awk（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [分散式作業編排](/tw/ch11#id278)
- Axon Framework, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- Azkaban（工作流排程器）, [批處理](/tw/ch11#ch_batch)
- Azure Blob Storage（物件儲存）, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 條件標頭, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
- Azure managed disks, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- Azure SQL DB（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
- Azure Storage, [物件儲存](/tw/ch11#id277)
- Azure Synapse Analytics（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
- Azure Virtual Machines
  - 競價（spot）虛擬機器, [故障處理](/tw/ch11#id281)

### B

- B 樹（索引）, [B 樹](/tw/ch4#sec_storage_b_trees)-[B 樹變體](/tw/ch4#b-tree-variants)
  - B+ trees, [B 樹變體](/tw/ch4#b-tree-variants)
  - 分支因子, [B 樹](/tw/ch4#sec_storage_b_trees)
  - comparison to LSM-trees, [比較 B 樹與 LSM 樹](/tw/ch4#sec_storage_btree_lsm_comparison)-[磁碟空間使用](/tw/ch4#disk-space-usage)
  - 崩潰恢復, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
  - 透過分割頁面增長, [B 樹](/tw/ch4#sec_storage_b_trees)
  - 不可變變種, [B 樹變體](/tw/ch4#b-tree-variants), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 與分片拆分的相似性, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)
  - 變體, [B 樹變體](/tw/ch4#b-tree-variants)
- B2（物件儲存）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- Backblaze B2（見 B2（物件儲存））
- 後端, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- 指數退避, [描述效能](/tw/ch2#sec_introduction_percentiles), [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- 背壓, [描述效能](/tw/ch2#sec_introduction_percentiles), [讀取效能](/tw/ch4#read-performance), [訊息傳遞系統](/tw/ch12#sec_stream_messaging), [術語表](/tw/glossary)
  - 分批處理, [工作流排程](/tw/ch11#sec_batch_workflows)
  - in TCP, [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
- 備份
  - 用於複製的資料庫快照, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 在多使用者系統中, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
  - 完整性, [不要盲目信任承諾](/tw/ch13#id364)
  - 快照隔離, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - 使用物件儲存, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 相對複製, [複製](/tw/ch6#ch_replication)
- 向後相容, [編碼與演化](/tw/ch5#ch_encoding)
- BadgerDB（資料庫）
  - 可序列事務, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)
- BASE, contrast to ACID, [ACID 的含義](/tw/ch8#sec_transactions_acid)
- Bash shell（Unix）, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)
- 批處理, [批處理](/tw/ch11#ch_batch)-[本章小結](/tw/ch11#id292), [術語表](/tw/glossary)
  - 與函數式程式設計, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 優點, [批處理](/tw/ch11#ch_batch)
  - 結合流處理, [統一批處理和流處理](/tw/ch13#id338)
  - 與流處理的比較, [流處理](/tw/ch12#sec_stream_processing)
  - 資料流引擎, [資料流引擎](/tw/ch11#sec_batch_dataflow)-[資料流引擎](/tw/ch11#sec_batch_dataflow)
  - 容錯, [故障處理](/tw/ch11#id281), [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 資料整合, [批處理與流處理](/tw/ch13#sec_future_batch_streaming)-[統一批處理和流處理](/tw/ch13#id338)
  - 圖表和迭代處理, [機器學習](/tw/ch11#id290)
  - high-level APIs and languages, [查詢語言](/tw/ch11#sec_batch_query_lanauges)-[查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 雲資料倉庫中, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 在分散式系統中, [分散式系統中的批處理](/tw/ch11#sec_batch_distributed)
  - 加入和分組, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)-[JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
  - 限制, [批處理](/tw/ch11#ch_batch)
  - 基於日誌的資訊和, [重播舊訊息](/tw/ch12#sec_stream_replay)
  - 保持衍生狀態, [維護派生狀態](/tw/ch13#id446)
  - 衡量業績, [批處理](/tw/ch11#ch_batch)
  - 模式, [批處理模型](/tw/ch11#id431)
  - 資源分配, [資源分配](/tw/ch11#id279)-[資源分配](/tw/ch11#id279)
  - 資源管理員, [分散式作業編排](/tw/ch11#id278)
  - 排程器, [分散式作業編排](/tw/ch11#id278)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)-[對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
  - 移動資料, [混洗資料](/tw/ch11#sec_shuffle)-[混洗資料](/tw/ch11#sec_shuffle)
  - 任務執行, [分散式作業編排](/tw/ch11#id278)
  - 用例, [批處理用例](/tw/ch11#sec_batch_output)-[對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
  - 使用 Unix 工具（例如）, [使用 Unix 工具的批處理](/tw/ch11#sec_batch_unix)-[排序與記憶體聚合](/tw/ch11#id275)
- 批處理框架
  - 與作業系統的比較, [分散式系統中的批處理](/tw/ch11#sec_batch_distributed)
- Beam （資料流庫）, [統一批處理和流處理](/tw/ch13#id338)
- BERT (language model), [向量嵌入](/tw/ch4#id92)
- 偏見, [偏見與歧視](/tw/ch14#id370)
- bidirectional replication（見 multi-leader replication）
- 大泥球, [簡單性：管理複雜度](/tw/ch2#id38)
- 大資料
  - 與資料最小化的對比, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [立法與自律](/tw/ch14#sec_future_legislation)
- BigQuery（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native), [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [批處理](/tw/ch11#ch_batch)
  - DataFrames, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 分片和叢集, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 移動資料, [混洗資料](/tw/ch11#sec_shuffle)
  - 快照隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
- Bigtable（資料庫）
  - 分片方案, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 儲存佈局, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - tablet（分片）, [分片](/tw/ch7#ch_sharding)
  - 寬列資料模型, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality), [列壓縮](/tw/ch4#sec_storage_column_compression)
- 二進位制資料編碼, [二進位制編碼](/tw/ch5#binary-encoding)-[模式的優點](/tw/ch5#sec_encoding_schemas)
  - Avro, [Avro](/tw/ch5#sec_encoding_avro)-[動態生成的模式](/tw/ch5#dynamically-generated-schemas)
  - MessagePack, [二進位制編碼](/tw/ch5#binary-encoding)-[二進位制編碼](/tw/ch5#binary-encoding)
  - Protocol Buffers, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)-[欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
- 二進位制編碼
  - 根據計劃, [模式的優點](/tw/ch5#sec_encoding_schemas)
  - 按網路驅動程式, [模式的優點](/tw/ch5#sec_encoding_schemas)
- binary strings, lack of support in JSON and XML, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
- 比特幣（加密貨幣）, [用於可審計資料系統的工具](/tw/ch13#id366)
  - 拜占庭容錯, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 交易所中的併發錯誤, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
- 點陣圖索引, [列壓縮](/tw/ch4#sec_storage_column_compression)
- BitTorrent uTP protocol, [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
- Bkd 樹（索引）, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- 無責備的事後分析, [人類與可靠性](/tw/ch2#id31)
- Blazegraph（資料庫）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - SPARQL 查詢語言, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- blob storage（見 object storage）
- 塊, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 塊裝置（磁碟）, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- 區塊鏈, [總結](/tw/ch3#summary)
  - 拜占庭容錯, [拜占庭故障](/tw/ch9#sec_distributed_byzantine), [共識](/tw/ch10#sec_consistency_consensus), [用於可審計資料系統的工具](/tw/ch13#id366)
- 阻塞式原子提交, [三階段提交](/tw/ch8#three-phase-commit)
- Bloom 過濾器（演算法）, [布隆過濾器](/tw/ch4#bloom-filters), [讀取效能](/tw/ch4#read-performance), [流分析](/tw/ch12#id318)
- BookKeeper (replicated log), [將工作分配給節點](/tw/ch10#allocating-work-to-nodes)
- 有界資料集, [流處理](/tw/ch12#ch_stream), [術語表](/tw/glossary)
  - （另見 batch processing）
- 受限延遲, [術語表](/tw/glossary)
  - 在網路中, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
  - 程序暫停, [響應時間保證](/tw/ch9#sec_distributed_clocks_realtime)
- 廣播
  - 全序廣播（見 shared logs）
- 無中介訊息, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- Brubeck（指標聚合器）, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- BTM (transaction coordinator), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
- Buf
  - Bufstream（訊息系統）, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- Bufstream（訊息系統）, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- 新建或購買, [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
- 突發性網路流量模式, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- 商業分析員, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics), [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- 商業資料處理, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
- 商業情報, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)-[資料倉庫](/tw/ch1#sec_introduction_dwh)
- Business Process Execution Language (BPEL), [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- Business Process Model and Notation (BPMN), [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
  - 例項, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- 位元組序列,編碼資料, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- 拜占庭斷層, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)-[弱形式的謊言](/tw/ch9#weak-forms-of-lying), [系統模型與現實](/tw/ch9#sec_distributed_system_model), [術語表](/tw/glossary)
  - 拜占庭容錯系統, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - Byzantine Generals Problem, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 協商一致演算法和, [共識](/tw/ch10#sec_consistency_consensus), [用於可審計資料系統的工具](/tw/ch13#id366)

### C

- 快取, [全記憶體儲存](/tw/ch4#sec_storage_inmemory), [術語表](/tw/glossary)
  - 與物化檢視, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
  - 作為衍生資料, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
  - in CPUs, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized), [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 失效與維護, [保持系統同步](/tw/ch12#sec_stream_sync), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - 線性一致性, [線性一致性](/tw/ch10#sec_consistency_linearizability)
  - 雲中的本地磁碟, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- 日曆同步, [同步引擎與本地優先軟體](/tw/ch6#sec_replication_offline_clients), [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- California Consumer Privacy Act (CCPA), [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- Camunda（工作流程引擎）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- 規範版本（資料）, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
- CAP定理, [CAP 定理](/tw/ch10#the-cap-theorem)-[CAP 定理](/tw/ch10#the-cap-theorem), [術語表](/tw/glossary)
- 容量規劃, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- Cap'n Proto（資料格式）, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- 碳排放, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- 級聯中止, [沒有髒讀](/tw/ch8#no-dirty-reads)
- 連鎖失敗, [軟體故障](/tw/ch2#software-faults), [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations), [超時和無界延遲](/tw/ch9#sec_distributed_queueing)
- Cassandra（資料庫）
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307), [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 壓實策略, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - consistency level ANY, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 雜湊分片, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash), [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 最後寫入勝利的衝突解決, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)
  - 無主複製, [無主複製](/tw/ch6#sec_replication_leaderless)
  - 輕量事務, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 線性一致性，缺乏, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 日誌結構儲存, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 多區域支援, [多地區操作](/tw/ch6#multi-region-operation)
  - 二級索引, [本地二級索引](/tw/ch7#id166)
  - 使用時鐘, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations), [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - vnodes（分片）, [分片](/tw/ch7#ch_sharding)
- cat（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis)
- 目錄, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 因果上下文, [版本向量](/tw/ch6#version-vectors)
  - （另見 causal dependencies）
- 因果依賴, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)-[版本向量](/tw/ch6#version-vectors)
  - 捕獲, [版本向量](/tw/ch6#version-vectors), [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality), [讀也是事件](/tw/ch13#sec_future_read_events)
    - 透過全序, [全序的限制](/tw/ch13#id335)
  - 事務中, [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)
  - 向朋友傳送訊息（例如）, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
- 因果關係, [術語表](/tw/glossary)
  - 因果順序
    - 與, [邏輯時鐘](/tw/ch10#sec_consistency_timestamps)
  - 與, [邏輯時鐘](/tw/ch10#sec_consistency_timestamps)-[使用邏輯時鐘強制約束](/tw/ch10#enforcing-constraints-using-logical-clocks)
  - 先發生關係, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
  - 在可序列事務中, [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)-[檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
  - 與時鐘不符, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 排序事件以捕獲, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 違反, [一致字首讀](/tw/ch6#sec_replication_consistent_prefix), [不同拓撲的問題](/tw/ch6#problems-with-different-topologies), [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 帶有同步時鐘, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
- 基於單元格的架構, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- CEP（見 複合事件處理）
- CephFS（分散式檔案系統）, [批處理](/tw/ch11#ch_batch), [物件儲存](/tw/ch11#id277)
- 證書透明性, [用於可審計資料系統的工具](/tw/ch13#id366)
- cgroups, [分散式作業編排](/tw/ch11#id278)
- 資料變更捕獲, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication), [資料變更捕獲](/tw/ch12#sec_stream_cdc)
  - 變更流的 API 支援, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 比較事件來源, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
  - 執行, [資料變更捕獲的實現](/tw/ch12#id307)
  - 初始快照, [初始快照](/tw/ch12#sec_stream_cdc_snapshot)
  - 日誌壓縮, [日誌壓縮](/tw/ch12#sec_stream_log_compaction)
- 更改日誌, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)
  - 資料變更捕獲, [資料變更捕獲](/tw/ch12#sec_stream_cdc)
  - 操作狀態, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 在溪流中連線, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
  - 日誌壓縮, [日誌壓縮](/tw/ch12#sec_stream_log_compaction)
  - 保持衍生狀態, [資料庫與流](/tw/ch12#sec_stream_databases)
- 混亂工程, [容錯](/tw/ch2#id27), [故障注入](/tw/ch9#sec_fault_injection)
- 檢查點
  - 在高效能計算中, [雲計算與超級計算](/tw/ch1#id17)
  - 在流處理器中, [微批次與存檔點](/tw/ch12#id329)
- 斷路器（限制重試）, [描述效能](/tw/ch2#sec_introduction_percentiles)
- 電路交換網路, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
- 環形緩衝區, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- 環形複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)
- Citus（資料庫）
  - 雜湊分片, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
- ClickHouse（資料庫）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - 增量檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- 點選流資料,分析, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
- 客戶端
  - 呼叫服務, [流經服務的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)
  - 離線, [同步引擎與本地優先軟體](/tw/ch6#sec_replication_offline_clients), [有狀態、可離線的客戶端](/tw/ch13#id347)
  - 推動狀態更改到, [將狀態變更推送給客戶端](/tw/ch13#id348)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
- ClockBound（時間同步）, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval)
  - use in YugabyteDB, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
- 時鐘, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
  - 原子鐘, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 置信區間, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval)-[用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 全球快照, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 混合邏輯時鐘, [混合邏輯時鐘](/tw/ch10#hybrid-logical-clocks)
  - logical（見 logical clocks）
  - 偏斜, [最後寫入勝利（丟棄併發寫入）](/tw/ch6#sec_replication_lww), [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations), [對同步時鐘的依賴](/tw/ch9#sec_distributed_clocks_relying)-[帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 時鐘調速（slewing）, [單調時鐘](/tw/ch9#monotonic-clocks)
  - 同步和準確性, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)-[時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
  - synchronization using GPS, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy), [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 時間與單調時鐘, [單調時鐘與日曆時鐘](/tw/ch9#sec_distributed_monotonic_timeofday)
  - 時間標記事件, [你用的是誰的時鐘？](/tw/ch12#id438)
- 雲服務, [雲服務與自託管](/tw/ch1#sec_introduction_cloud)-[雲計算與超級計算](/tw/ch1#id17)
  - 可用區, [透過冗餘容忍硬體故障](/tw/ch2#tolerating-hardware-faults-through-redundancy), [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 需要發現服務, [服務發現](/tw/ch10#service-discovery)
  - 網路故障, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 利弊關係, [雲服務的利弊](/tw/ch1#sec_introduction_cloud_tradeoffs)-[雲服務的利弊](/tw/ch1#sec_introduction_cloud_tradeoffs)
  - 配額, [雲時代的運維](/tw/ch1#sec_introduction_operations)
  - regions（見 regions (geographic distribution)）
  - 無伺服器, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
  - 共享資源, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 對超級計算, [雲計算與超級計算](/tw/ch1#id17)
- 雲原生, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)-[雲時代的運維](/tw/ch1#sec_introduction_operations)
- Cloudflare
  - R2（見 R2（物件儲存））
- 聚集索引, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
- 聚類（記錄排序）, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
- CockroachDB（資料庫）
  - 基於共識的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 一致性模式, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 鍵範圍分片, [分片](/tw/ch7#ch_sharding), [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 可序列化事務, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)
  - 分片的二級索引, [全域性二級索引](/tw/ch7#id167)
  - 事務, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
  - 使用模型檢查, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 程式碼生成
  - 用於查詢執行, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 帶有協議緩衝, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)
- 協作編輯, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 列族（Bigtable）, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality), [列壓縮](/tw/ch4#sec_storage_column_compression)
- 面向列的儲存, [列式儲存](/tw/ch4#sec_storage_column)-[查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 列壓縮, [列壓縮](/tw/ch4#sec_storage_column_compression)
  - Parquet, [列式儲存](/tw/ch4#sec_storage_column), [歸檔儲存](/tw/ch5#archival-storage)
  - 排序在, [列儲存中的排序順序](/tw/ch4#sort-order-in-column-storage)-[列儲存中的排序順序](/tw/ch4#sort-order-in-column-storage)
  - 向量處理, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 與寬列模型的對比, [列壓縮](/tw/ch4#sec_storage_column_compression)
  - 寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
- comma-separated values（見 CSV）
- 命令查詢責任分離, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)-[事件溯源與 CQRS](/tw/ch3#sec_datamodels_events), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
- 命令（事件溯源）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 提交（事務）, [事務](/tw/ch8#ch_transactions)
  - 原子提交, [分散式事務](/tw/ch8#sec_transactions_distributed)-[再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
    - （另見 原子性）
  - 讀已提交隔離, [讀已提交](/tw/ch8#sec_transactions_read_committed)
  - three-phase commit (3PC), [三階段提交](/tw/ch8#three-phase-commit)
  - 兩階段提交, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)-[協調器故障](/tw/ch8#coordinator-failure)
- 可交換操作, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
- 壓實（Compaction）
  - 更改日誌, [日誌壓縮](/tw/ch12#sec_stream_log_compaction)
    - （另見 日誌壓縮）
    - 流運算子狀態, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 日誌結構儲存, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
    - 問題, [讀取效能](/tw/ch4#read-performance)
    - 規模分級和分級辦法, [壓實策略](/tw/ch4#sec_storage_lsm_compaction), [磁碟空間使用](/tw/ch4#disk-space-usage)
- 比較和設定, [條件寫入（比較並設定）](/tw/ch8#sec_transactions_compare_and_set), [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 執行鎖定, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 執行獨特性限制, [約束與唯一性保證](/tw/ch10#sec_consistency_uniqueness)
  - 在物件儲存中, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 與協商一致的關係, [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable), [共識](/tw/ch10#sec_consistency_consensus), [比較並設定作為共識](/tw/ch10#compare-and-set-as-consensus)
  - 與柵欄標誌的關係, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
  - 與事務的關係, [單物件寫入](/tw/ch8#sec_transactions_single_object)
- 相容性, [編碼與演化](/tw/ch5#ch_encoding), [資料流的模式](/tw/ch5#sec_encoding_dataflow)
  - 呼叫服務, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - 編碼格式的屬性, [總結](/tw/ch5#summary)
  - 使用資料庫, [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)-[歸檔儲存](/tw/ch5#archival-storage)
- 補償事務, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros), [寬鬆地解釋約束](/tw/ch13#id362)
- 編譯, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- 複合事件處理, [複合事件處理](/tw/ch12#id317)
- 複雜度
  - 在理論模型中提煉, [將系統模型對映到現實世界](/tw/ch9#mapping-system-models-to-the-real-world)
  - 本質複雜度與偶然複雜度, [簡單性：管理複雜度](/tw/ch2#id38)
  - 使用抽象來隱藏, [資料模型與查詢語言](/tw/ch3#ch_datamodels)
  - 管理, [簡單性：管理複雜度](/tw/ch2#id38)
- composing data systems（見 unbundling databases）
- 壓縮
  - in SSTables, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)
- 計算密集型應用程式, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- 電腦遊戲, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 聯合索引, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
  - 在雜湊分片系統中, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
- 併發
  - Actor 程式設計模型, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks), [事件驅動架構與 RPC](/tw/ch12#sec_stream_actors_drpc)
    - （另見 event-driven architecture）
  - 事務隔離薄弱時出現的錯誤, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
  - 解決衝突, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)-[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
  - 定義, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
  - 檢測併發寫入, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)-[版本向量](/tw/ch6#version-vectors)
  - 雙寫的問題, [保持系統同步](/tw/ch12#sec_stream_sync)
  - 先發生關係, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
  - 在複製系統中, [複製延遲的問題](/tw/ch6#sec_replication_lag)-[版本向量](/tw/ch6#version-vectors), [線性一致性](/tw/ch10#sec_consistency_linearizability)-[線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)
  - 多版本併發控制, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 樂觀併發控制, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 操作的順序, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 透過事件日誌減少, [併發控制](/tw/ch12#sec_stream_concurrency), [資料流：應用程式碼與狀態變化的互動](/tw/ch13#id450)
  - 時間和相對性, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
  - 事務隔離, [隔離性](/tw/ch8#sec_transactions_acid_isolation)
  - 寫偏差, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts)
- 有條件寫入, [條件寫入（比較並設定）](/tw/ch8#sec_transactions_compare_and_set)
  - 事務中, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 在物件儲存中, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- 會議管理系統（例如）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- conflict-free replicated datatypes (CRDTs), [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
  - 用於無主複製, [捕獲先發生關係](/tw/ch6#capturing-the-happens-before-relationship)
  - 防止丟失更新, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
- 衝突
  - 避免, [衝突避免](/tw/ch6#conflict-avoidance)
  - 因果關係, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
  - 衝突檢測
    - 分散式事務, [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
    - 在基於日誌的系統中, [唯一性約束需要達成共識](/tw/ch13#id452)
    - in serializable snapshot isolation (SSI), [檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
    - 在兩階段提交中, [系統性的承諾](/tw/ch8#a-system-of-promises)
  - 解決衝突
    - 透過中止事務, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
    - 透過道歉, [寬鬆地解釋約束](/tw/ch13#id362)
    - 最後寫入勝利, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
    - 使用原子操作, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
  - 確定什麼是衝突, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts), [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
  - 無領導複製, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)
  - 丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)-[衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
  - 實現, [物化衝突](/tw/ch8#materializing-conflicts)
  - 決議, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)-[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
    - 自動, [自動衝突解決](/tw/ch6#automatic-conflict-resolution)
    - 無頭系統, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)
    - 最後寫入勝利, [最後寫入勝利（丟棄併發寫入）](/tw/ch6#sec_replication_lww)
    - 使用自定義邏輯, [手動衝突解決](/tw/ch6#manual-conflict-resolution), [捕獲先發生關係](/tw/ch6#capturing-the-happens-before-relationship)
  - 兄弟, [手動衝突解決](/tw/ch6#manual-conflict-resolution), [捕獲先發生關係](/tw/ch6#capturing-the-happens-before-relationship)
    - 合併, [捕獲先發生關係](/tw/ch6#capturing-the-happens-before-relationship)
  - 寫偏差, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts)
- Confluent
  - Freight（訊息系統）, [設定新的副本](/tw/ch6#sec_replication_new_replica), [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
  - 模式註冊中心, [JSON 模式](/tw/ch5#json-schema), [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
- 擁堵（網路）
  - 避免, [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
  - 限制時鐘的準確性, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval)
  - 排隊延遲, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
- 共識, [共識](/tw/ch10#sec_consistency_consensus)-[總結](/tw/ch10#summary), [術語表](/tw/glossary)
  - 演算法, [共識](/tw/ch10#sec_consistency_consensus), [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - 共識數, [獲取並增加作為共識](/tw/ch10#fetch-and-add-as-consensus)
  - 協調事務, [協調服務](/tw/ch10#sec_consistency_coordination)-[服務發現](/tw/ch10#service-discovery)
  - 費用, [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
  - 無法實現, [共識](/tw/ch10#sec_consistency_consensus)
  - 防止腦分裂, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - 重新配置, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 與原子提交的關係, [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
  - relation to compare-and-set (CAS), [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable), [比較並設定作為共識](/tw/ch10#compare-and-set-as-consensus)
  - 與獲取和新增的關係, [獲取並增加作為共識](/tw/ch10#fetch-and-add-as-consensus)
  - 與複製有關, [使用共享日誌](/tw/ch10#sec_consistency_smr)
  - 與共享日誌的關係, [共享日誌作為共識](/tw/ch10#sec_consistency_shared_logs)
  - 與唯一性約束的關係, [唯一性約束需要達成共識](/tw/ch13#id452)
  - 安全性與活性屬性, [單值共識](/tw/ch10#single-value-consensus)
  - 單值共識, [單值共識](/tw/ch10#single-value-consensus)
- 同意（GDPR）, [同意與選擇自由](/tw/ch14#id375)
- 一致性, [一致性](/tw/ch8#sec_transactions_acid_consistency), [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 跨越不同資料庫, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [保持系統同步](/tw/ch12#sec_stream_sync), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views), [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions)
  - 因果關係, [一致字首讀](/tw/ch6#sec_replication_consistent_prefix), [不同拓撲的問題](/tw/ch6#problems-with-different-topologies), [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 一致字首讀, [一致字首讀](/tw/ch6#sec_replication_consistent_prefix)-[一致字首讀](/tw/ch6#sec_replication_consistent_prefix)
  - 一致的快照, [設定新的副本](/tw/ch6#sec_replication_new_replica), [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)-[快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner), [初始快照](/tw/ch12#sec_stream_cdc_snapshot), [建立索引](/tw/ch13#id340)
    - （另見 snapshots）
  - 崩潰恢復, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
  - enforcing constraints（見 constraints）
  - 最終, [複製延遲的問題](/tw/ch6#sec_replication_lag)
    - （另見 最終一致性）
  - in ACID transactions, [一致性](/tw/ch8#sec_transactions_acid_consistency), [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
  - 在 CAP 定理中, [CAP 定理](/tw/ch10#the-cap-theorem)
  - 領袖選舉, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 微服務, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems)
  - 線性一致性, [複製延遲的解決方案](/tw/ch6#id131), [線性一致性](/tw/ch10#sec_consistency_linearizability)-[線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 含義, [一致性](/tw/ch8#sec_transactions_acid_consistency)
  - 單調讀, [單調讀](/tw/ch6#sec_replication_monotonic_reads)-[單調讀](/tw/ch6#sec_replication_monotonic_reads)
  - 二級索引, [多物件事務的需求](/tw/ch8#sec_transactions_need), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation), [理解資料流](/tw/ch13#id443), [建立索引](/tw/ch13#id340)
  - 寫後讀, [讀己之寫](/tw/ch6#sec_replication_ryw)-[讀己之寫](/tw/ch6#sec_replication_ryw)
    - 在衍生資料系統中, [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions)
  - strong（見 線性一致性）
  - 及時性和完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 使用仲裁, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations), [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
- 一致性雜湊, [一致性雜湊](/tw/ch7#sec_sharding_consistent_hashing)
- 一致字首讀, [一致字首讀](/tw/ch6#sec_replication_consistent_prefix)
- 約束（資料庫）, [一致性](/tw/ch8#sec_transactions_acid_consistency), [寫偏差的特徵](/tw/ch8#characterizing-write-skew)
  - 非同步檢查, [寬鬆地解釋約束](/tw/ch13#id362)
  - 避免協調, [無協調資料系統](/tw/ch13#id454)
  - 確保冪等性, [操作識別符號](/tw/ch13#id355)
  - 在基於日誌的系統中, [強制約束](/tw/ch13#sec_future_constraints)-[多分割槽請求處理](/tw/ch13#id360)
    - 跨多個分片, [多分割槽請求處理](/tw/ch13#id360)
  - 在兩階段提交中, [分散式事務](/tw/ch8#sec_transactions_distributed), [系統性的承諾](/tw/ch8#a-system-of-promises)
  - 與共識的關係, [唯一性約束需要達成共識](/tw/ch13#id452)
  - 需要線性一致性, [約束與唯一性保證](/tw/ch10#sec_consistency_uniqueness)
- Consul（協調服務）, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 用於服務發現, [服務發現](/tw/ch10#service-discovery)
- 消費者（訊息流）, [訊息代理](/tw/ch5#message-brokers), [傳遞事件流](/tw/ch12#sec_stream_transmit)
  - 背壓, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 消費者群體, [多個消費者](/tw/ch12#id298)
  - 日誌中的消費者偏移量, [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
  - 失敗, [確認與重新傳遞](/tw/ch12#sec_stream_reordering), [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
  - 扇出, [時間線的物化與更新](/tw/ch2#sec_introduction_materializing), [多個消費者](/tw/ch12#id298), [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging)
  - 負載平衡, [多個消費者](/tw/ch12#id298), [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging)
  - 未與生產者保持同步, [訊息傳遞系統](/tw/ch12#sec_stream_messaging), [磁碟空間使用](/tw/ch12#sec_stream_disk_usage), [開展分拆工作](/tw/ch13#sec_future_unbundling_favor)
- content models (JSON Schema), [JSON 模式](/tw/ch5#json-schema)
- 爭用
  - 事務之間, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
  - 阻塞執行緒, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - 樂觀併發控制的效能, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 兩階段鎖定, [兩階段鎖定的效能](/tw/ch8#performance-of-two-phase-locking)
- 上下文切換, [延遲與響應時間](/tw/ch2#id23), [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- 收斂, [自動衝突解決](/tw/ch6#automatic-conflict-resolution)-[CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
- 協調
  - 避免, [無協調資料系統](/tw/ch13#id454)
  - 跨資料中心, [全序的限制](/tw/ch13#id335)
  - 跨區域, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 跨分片排序, [分片](/tw/ch8#sharding), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner), [使用共享日誌](/tw/ch10#sec_consistency_smr), [多分割槽請求處理](/tw/ch13#id360)
  - 將請求路由到分片, [請求路由](/tw/ch7#sec_sharding_routing)
  - 服務, [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election), [協調服務](/tw/ch10#sec_consistency_coordination)-[服務發現](/tw/ch10#service-discovery)
- 協調者, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
  - 失效, [協調器故障](/tw/ch8#coordinator-failure)
  - in XA transactions, [XA 事務](/tw/ch8#xa-transactions)-[XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
  - 恢復, [從協調器故障中恢復](/tw/ch8#recovering-from-coordinator-failure)
- 寫時複製（B 樹）, [B 樹變體](/tw/ch4#b-tree-variants), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
- 公共物件請求代理體系結構, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
- coronal mass ejection（見 solar storm）
- 正確性
  - 可審計性, [信任但驗證](/tw/ch13#sec_future_verification)-[用於可審計資料系統的工具](/tw/ch13#id366)
  - 拜占庭容錯, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 處理部分失敗, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 在基於日誌的系統中, [強制約束](/tw/ch13#sec_future_constraints)-[多分割槽請求處理](/tw/ch13#id360)
  - 系統模型中的演算法, [定義演算法的正確性](/tw/ch9#defining-the-correctness-of-an-algorithm)
  - 派生資料, [為可審計性而設計](/tw/ch13#id365)
  - 不可變資料, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
  - 個人資料, [責任與問責](/tw/ch14#id371), [隱私與資料使用](/tw/ch14#id457)
  - 時間, [不同拓撲的問題](/tw/ch6#problems-with-different-topologies), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)-[用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 事務次數, [一致性](/tw/ch8#sec_transactions_acid_consistency), [追求正確性](/tw/ch13#sec_future_correctness), [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
  - 及時性和完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)-[無協調資料系統](/tw/ch13#id454)
- 資料損壞
  - 檢測, [端到端原則](/tw/ch13#sec_future_e2e_argument), [不要盲目信任承諾](/tw/ch13#id364)-[用於可審計資料系統的工具](/tw/ch13#id366)
  - 由於病態記憶體訪問, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - 輻射所致, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 由於腦裂, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing)
  - 由於事務隔離薄弱, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
  - 完整性作為不存在, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 網路包, [弱形式的謊言](/tw/ch9#weak-forms-of-lying)
  - 磁碟, [永續性](/tw/ch8#durability)
  - 使用預寫日誌防止, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
  - 從中恢復, [批處理](/tw/ch11#ch_batch), [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
- 餘弦相似性（語義搜尋）, [向量嵌入](/tw/ch4#id92)
- Couchbase（資料庫）
  - 文件資料模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
  - 永續性, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - 雜湊分片, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
  - 連線支援, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 再平衡, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
  - vBuckets（分片）, [分片](/tw/ch7#ch_sharding)
- CouchDB（資料庫）
  - 作為同步引擎, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
  - B 樹儲存, [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 解決衝突, [手動衝突解決](/tw/ch6#manual-conflict-resolution)
- 耦合（鬆緊）, [可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability)
- 覆蓋索引, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
- CozoDB（資料庫）, [Datalog：遞迴關係查詢](/tw/ch3#id62)
- CPUs
  - 快取一致性和記憶體障礙, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 緩衝和管道, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 計算錯誤的結果, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - SIMD instructions, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- 崩潰-停止與崩潰-恢復故障, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- CRDTs（見 conflict-free replicated datatypes）
- CREATE INDEX statement (SQL), [多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn), [建立索引](/tw/ch13#id340)
- 信用評級機構, [責任與問責](/tw/ch14#id371)
- 加密粉碎（crypto-shredding）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events), [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
- 加密貨幣, [總結](/tw/ch3#summary)
- 密碼學
  - 防禦攻擊者, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 端到端加密和認證, [端到端原則](/tw/ch13#sec_future_e2e_argument)
- CSV (comma-separated values), [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp), [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
- Curator (ZooKeeper recipes), [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election), [將工作分配給節點](/tw/ch10#allocating-work-to-nodes)
- Cypher（查詢語言）, [Cypher 查詢語言](/tw/ch3#id57)
  - comparison to SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)

### D

- Daft（處理框架）
  - DataFrames, [DataFrames](/tw/ch11#id287)
  - 混洗資料, [混洗資料](/tw/ch11#sec_shuffle)
- Dagster（工作流排程器）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows), [批處理](/tw/ch11#ch_batch), [工作流排程](/tw/ch11#sec_batch_workflows)
  - 雲資料倉庫整合, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- 儀表板（商業智慧）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
- Dask（處理框架）, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 資料目錄, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 資料聯結器, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- 資料合同, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
  - 資料變更捕獲, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
- data corruption（見 corruption of data）
- 資料立方體, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
- 資料工程, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
- 資料結構, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- data formats（見 編碼）
- 資料基礎設施, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- 資料整合, [資料整合](/tw/ch13#sec_future_integration)-[統一批處理和流處理](/tw/ch13#id338), [本章小結](/tw/ch13#id367)
  - 批次和流處理, [批處理與流處理](/tw/ch13#sec_future_batch_streaming)-[統一批處理和流處理](/tw/ch13#id338)
    - 保持衍生狀態, [維護派生狀態](/tw/ch13#id446)
    - 後處理資料, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
    - 統一, [統一批處理和流處理](/tw/ch13#id338)
  - 透過解開資料庫, [分拆資料庫](/tw/ch13#sec_future_unbundling)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
    - 與聯邦資料庫的比較, [一切的元資料庫](/tw/ch13#id341)
  - 透過生成資料合併工具, [組合使用派生資料的工具](/tw/ch13#id442)-[排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
    - 衍生資料與分散式事務, [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions)
    - 總訂單的限制, [全序的限制](/tw/ch13#id335)
    - 命令事件捕獲因果關係, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
    - 關於資料流的推理, [理解資料流](/tw/ch13#id443)
  - 需求, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - 使用批次處理, [批處理](/tw/ch11#ch_batch), [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 資料湖, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
  - 湖倉一體（data lakehouse）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [分析（Analytics）](/tw/ch11#sec_batch_olap)
- data locality（見 區域性）
- 資料網格, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 資料最小化, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [立法與自律](/tw/ch14#sec_future_legislation)
- 資料模型, [資料模型與查詢語言](/tw/ch3#ch_datamodels)-[總結](/tw/ch3#summary)
  - DataFrames and arrays, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 類似圖表的模型, [圖資料模型](/tw/ch3#sec_datamodels_graph)-[GraphQL](/tw/ch3#id63)
    - Datalog 語言, [Datalog：遞迴關係查詢](/tw/ch3#id62)-[Datalog：遞迴關係查詢](/tw/ch3#id62)
    - 屬性圖, [屬性圖](/tw/ch3#id56)
    - RDF and triple-stores, [三元組儲存與 SPARQL](/tw/ch3#id59)-[SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - 關係模型對文件模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)-[文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 支援多個, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 資料管道, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 資料產品, [超越資料湖](/tw/ch1#beyond-the-data-lake)
- data protection regulations（見 GDPR）
- 資料居住法, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 資料科學, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics), [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- 資料孤島, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- 資料系統
  - 正確性、制約因素和完整性, [追求正確性](/tw/ch13#sec_future_correctness)-[用於可審計資料系統的工具](/tw/ch13#id366)
  - 資料整合, [資料整合](/tw/ch13#sec_future_integration)-[統一批處理和流處理](/tw/ch13#id338)
  - 使用目標, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
  - 異構系統，保持同步, [保持系統同步](/tw/ch12#sec_stream_sync)
  - 可維護性, [可運維性](/tw/ch2#sec_introduction_maintainability)-[可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability)
  - 可能的錯誤, [事務](/tw/ch8#ch_transactions)
  - 可靠性, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)-[人類與可靠性](/tw/ch2#id31)
    - 硬體故障, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
    - 人類錯誤, [人類與可靠性](/tw/ch2#id31)
    - 重要性, [人類與可靠性](/tw/ch2#id31)
    - 軟體故障, [軟體故障](/tw/ch2#software-faults)
  - 可伸縮性, [可伸縮性](/tw/ch2#sec_introduction_scalability)-[可伸縮性原則](/tw/ch2#id35)
  - 分拆資料庫, [分拆資料庫](/tw/ch13#sec_future_unbundling)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
  - 不可靠的時鐘, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
- 資料倉庫, [資料倉庫](/tw/ch1#sec_introduction_dwh), [術語表](/tw/glossary)
  - 基於雲的解決辦法, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - ETL, [資料倉庫](/tw/ch1#sec_introduction_dwh), [保持系統同步](/tw/ch12#sec_stream_sync)
  - 用於批處理, [批處理](/tw/ch11#ch_batch)
  - 保持資料系統的同步, [保持系統同步](/tw/ch12#sec_stream_sync)
  - 設計, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
  - 分片和聚簇, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 緩慢變化的維度, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
- 資料密集型應用, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- 資料庫管理員, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 資料庫內部的分散式事務, [跨不同系統的分散式事務](/tw/ch8#sec_transactions_xa), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal), [原子提交再現](/tw/ch12#sec_stream_atomic_commit)
- 資料庫
  - 歸檔儲存, [歸檔儲存](/tw/ch5#archival-storage)
  - 與訊息代理的比較, [訊息代理與資料庫的對比](/tw/ch12#id297)
  - 資料流, [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)
  - 端到端原則, [端到端原則](/tw/ch13#sec_future_e2e_argument)-[在資料系統中應用端到端思考](/tw/ch13#id357)
    - 檢查完整性, [端到端原則重現](/tw/ch13#id456)
  - 與事件流的關係, [資料庫與流](/tw/ch12#sec_stream_databases)-[不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
    - （另見 changelogs）
    - 變更流的 API 支援, [變更流的 API 支援](/tw/ch12#sec_stream_change_api), [應用程式碼和狀態的分離](/tw/ch13#id344)
    - 資料變更捕獲, [資料變更捕獲](/tw/ch12#sec_stream_cdc)-[變更流的 API 支援](/tw/ch12#sec_stream_change_api)
    - 事件溯源, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
    - 保持系統同步, [保持系統同步](/tw/ch12#sec_stream_sync)-[保持系統同步](/tw/ch12#sec_stream_sync)
    - 不可改變事件哲學, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)-[不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 分拆, [分拆資料庫](/tw/ch13#sec_future_unbundling)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
    - 構建資料儲存技術, [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
    - 圍繞資料流設計應用程式, [圍繞資料流設計應用](/tw/ch13#sec_future_dataflow)-[流處理器和服務](/tw/ch13#id345)
    - 觀察派生狀態, [觀察派生資料狀態](/tw/ch13#sec_future_observing)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
- 資料中心
  - 失敗, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - geographically distributed（見 regions (geographic distribution)）
  - 多種使用和共享資源, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 網路架構, [雲計算與超級計算](/tw/ch1#id17)
  - 網路故障, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
- 資料流動, [資料流的模式](/tw/ch5#sec_encoding_dataflow)-[分散式 actor 框架](/tw/ch5#distributed-actor-frameworks), [圍繞資料流設計應用](/tw/ch13#sec_future_dataflow)-[流處理器和服務](/tw/ch13#id345)
  - 資料流系統的正確性, [資料流系統的正確性](/tw/ch13#id453)
  - 資料流引擎, [資料流引擎](/tw/ch11#sec_batch_dataflow)
    - 與流處理的比較, [流處理](/tw/ch12#sec_stream_processing)
    - DataFrames, [DataFrames](/tw/ch11#id287)
    - 批次處理框架中的支援, [批處理](/tw/ch11#ch_batch)
  - 事件驅動, [事件驅動的架構](/tw/ch5#sec_encoding_dataflow_msg)-[分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
  - 推理, [理解資料流](/tw/ch13#id443)
  - 透過資料庫, [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)
  - 透過服務, [流經服務的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)-[RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - workflow engines（見 workflow engines）
- DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 執行, [DataFrames](/tw/ch11#id287)
  - 分批處理, [DataFrames](/tw/ch11#id287)
  - 在筆記本中, [機器學習](/tw/ch11#id290)
  - 批次處理框架中的支援, [批處理](/tw/ch11#ch_batch)
- DataFusion（查詢引擎）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- Datalog（查詢語言）, [Datalog：遞迴關係查詢](/tw/ch3#id62)-[Datalog：遞迴關係查詢](/tw/ch3#id62)
- Datastream（資料變更捕獲）, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
- 資料型別
  - binary strings in XML and JSON, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
  - 無衝突, [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
  - 在 Avro 編碼中, [Avro](/tw/ch5#sec_encoding_avro)
  - 在 Protocol Buffers 中, [欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
  - numbers in XML and JSON, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
- 日期和日期, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- Datomic（資料庫）
  - B 樹儲存, [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 資料模型, [圖資料模型](/tw/ch3#sec_datamodels_graph), [三元組儲存與 SPARQL](/tw/ch3#id59)
  - Datalog 查詢語言, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - 切除, [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 事務語言, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 事務的序列執行, [實際序列執行](/tw/ch8#sec_transactions_serial)
- Daylight Saving Time (DST), [日曆時鐘](/tw/ch9#time-of-day-clocks)
- Db2（資料庫）
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307)
- DBA (database administrator), [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 死鎖, [顯式鎖定](/tw/ch8#explicit-locking)
  - 檢測，在分散式事務中, [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
  - in two-phase locking (2PL), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
- Debezium（資料變更捕獲）, [資料變更捕獲的實現](/tw/ch12#id307)
  - 用於 Cassandra, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 資料整合, [分拆系統與整合系統](/tw/ch13#id448)
- 宣告式語言, [資料模型與查詢語言](/tw/ch3#ch_datamodels), [術語表](/tw/glossary)
  - 與同步引擎, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
  - Datalog, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - 文件資料庫中, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - recursive SQL queries, [SQL 中的圖查詢](/tw/ch3#id58)
  - SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- DeepSeek
  - 3FS（見 3FS）
- 延遲
  - 有界的網路延遲, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
  - 有界的程序暫停, [響應時間保證](/tw/ch9#sec_distributed_clocks_realtime)
  - 無界的網路延遲, [超時和無界延遲](/tw/ch9#sec_distributed_queueing)
  - 無界的程序暫停, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- 刪除資料, [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - in LSM storage, [磁碟空間使用](/tw/ch4#disk-space-usage)
  - 法律依據, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- Delta Lake（表格式）, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 分片和聚簇, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
- 非軍事區（聯網）, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- 反正規化（資料表示）, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)-[多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many), [術語表](/tw/glossary)
  - 在衍生資料系統中, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - in event sourcing/CQRS, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 社會網路案例研究, [社交網路案例研究中的反正規化](/tw/ch3#denormalization-in-the-social-networking-case-study)
  - 物化檢視, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
  - 更新衍生資料, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object), [多物件事務的需求](/tw/ch8#sec_transactions_need), [組合使用派生資料的工具](/tw/ch13#id442)
  - 與正規化相比, [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
- 衍生資料, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [流處理](/tw/ch12#ch_stream), [術語表](/tw/glossary)
  - 批處理, [批處理](/tw/ch11#ch_batch)
  - 事件溯源與 CQRS, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 從變化資料抓取, [資料變更捕獲的實現](/tw/ch12#id307)
  - 透過日誌維護匯出狀態, [資料庫與流](/tw/ch12#sec_stream_databases)-[變更流的 API 支援](/tw/ch12#sec_stream_change_api), [狀態、流和不變性](/tw/ch12#sec_stream_immutability)-[併發控制](/tw/ch12#sec_stream_concurrency)
  - 透過對流的訂閱來觀察, [端到端的事件流](/tw/ch13#id349)
  - 批次和流處理的產出, [批處理與流處理](/tw/ch13#sec_future_batch_streaming)
  - 透過應用程式程式碼, [應用程式碼作為派生函式](/tw/ch13#sec_future_dataflow_derivation)
  - 與分散式事務相比, [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions)
- 設計模式, [簡單性：管理複雜度](/tw/ch2#id38)
- 確定性操作, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs), [故障與部分失效](/tw/ch9#sec_distributed_partial_failure), [術語表](/tw/glossary)
  - 與冪等性, [冪等性](/tw/ch12#sec_stream_idempotence), [理解資料流](/tw/ch13#id443)
  - 計算衍生資料, [維護派生狀態](/tw/ch13#id446), [資料流系統的正確性](/tw/ch13#id453), [為可審計性而設計](/tw/ch13#id365)
  - 在事件溯源中, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 狀態機複製, [使用共享日誌](/tw/ch10#sec_consistency_smr), [資料庫與流](/tw/ch12#sec_stream_databases)
  - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)
  - 測試中, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 連線, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
  - 使程式碼確定性, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 概覽, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 確定性模擬測試（DST）, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- DevOps, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 維度表, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- dimensional modeling（見 star schemas）
- directed acyclic graphs (DAG)
  - 工作流程, [工作流排程](/tw/ch11#sec_batch_workflows)
    - （另見 workflow engines）
- 髒讀, [沒有髒讀](/tw/ch8#no-dirty-reads)
- 髒寫（事務隔離）, [沒有髒寫](/tw/ch8#sec_transactions_dirty_write)
- 分離（disaggregation）
  - 儲存和計算, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- Discord（群組聊天）
  - GraphQL example, [GraphQL](/tw/ch3#id63)
- 歧視, [偏見與歧視](/tw/ch14#id370)
- disks（見 hard disks）
- 分散式 actor 框架, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
- 分散式檔案系統, [分散式檔案系統](/tw/ch11#sec_batch_dfs)-[分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 比較物件儲存, [物件儲存](/tw/ch11#id277)
  - 由 Flink 使用, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
- 分散式賬本, [總結](/tw/ch3#summary)
- 分散式系統, [分散式系統的麻煩](/tw/ch9#ch_distributed)-[總結](/tw/ch9#summary), [術語表](/tw/glossary)
  - 拜占庭故障, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)-[弱形式的謊言](/tw/ch9#weak-forms-of-lying)
  - 檢測網路故障, [檢測故障](/tw/ch9#id307)
  - 故障與部分失效, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 共識的形式化, [單值共識](/tw/ch10#single-value-consensus)
  - 無法取得的結果, [CAP 定理](/tw/ch10#the-cap-theorem), [共識](/tw/ch10#sec_consistency_consensus)
  - 出現故障的問題, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
  - multi-region（見 regions (geographic distribution)）
  - 網路問題, [不可靠的網路](/tw/ch9#sec_distributed_networks)-[我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
  - 問題, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems)
  - 仲裁，依賴, [多數派原則](/tw/ch9#sec_distributed_majority)
  - 使用原因, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [複製](/tw/ch6#ch_replication)
  - 同步時鐘, 依賴, [對同步時鐘的依賴](/tw/ch9#sec_distributed_clocks_relying)-[用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 系統模型, [系統模型與現實](/tw/ch9#sec_distributed_system_model)-[確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 使用時鐘和時間, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
- distributed transactions（見 transactions）
- Django（網路框架）, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- DMZ (demilitarized zone), [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- DNS (Domain Name System), [請求路由](/tw/ch7#sec_sharding_routing), [服務發現](/tw/ch10#service-discovery)
  - 用於負載平衡, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
- Docker（容器管理器）, [應用程式碼和狀態的分離](/tw/ch13#id344)
- 文件資料模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)-[文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 比較關係模式, [何時使用哪種模型](/tw/ch3#sec_datamodels_document_summary)-[文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 多物件事務, 需要, [多物件事務的需求](/tw/ch8#sec_transactions_need)
  - 分片二級索引, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)
  - 相對關係模式
    - 模式的趨同, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
    - 資料位置, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
- document-partitioned indexes（見 local secondary indexes）
- 領域驅動設計, [簡單性：管理複雜度](/tw/ch2#id38), [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 帶點版本向量（dotted version vectors）, [版本向量](/tw/ch6#version-vectors)
- 複式記賬, [總結](/tw/ch3#summary)
- DRBD (Distributed Replicated Block Device), [單主複製](/tw/ch6#sec_replication_leader)
- 漂移（時鐘）, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- Druid（資料庫）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [列式儲存](/tw/ch4#sec_storage_column), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
  - 處理寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
  - 預彙總, [分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- Dryad（資料流引擎）, [資料流引擎](/tw/ch11#sec_batch_dataflow)
- 雙寫的問題, [保持系統同步](/tw/ch12#sec_stream_sync)
- DuckDB（資料庫）, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems), [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 面向列的儲存, [列式儲存](/tw/ch4#sec_storage_column)
  - 用於 ETL, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 重複的抑制, [抑制重複](/tw/ch13#id354)
  - （另見 冪等性）
  - using a unique ID, [操作識別符號](/tw/ch13#id355), [多分割槽請求處理](/tw/ch13#id360)
- 永續性, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal), [永續性](/tw/ch8#durability), [術語表](/tw/glossary)
- 持久執行, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
  - 依賴決定性因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - Restate（見 Restate (workflow engine)）
  - Temporal（見 Temporal (workflow engine)）
- durable functions（見 workflow engines）
- 時長（時間）, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
  - 用單調時鐘測量, [單調時鐘](/tw/ch9#monotonic-clocks)
- 動態型別語言
  - 與讀時模式的類比, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
- Dynamo（資料庫）, [無主複製](/tw/ch6#sec_replication_leaderless)
- Dynamo-style databases（見 leaderless replication）
- DynamoDB（資料庫）
  - 自動縮放, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
  - 雜湊分片, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 分片二級索引, [全域性二級索引](/tw/ch7#id167)

### E

- EBS（虛擬塊裝置）, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
  - 比較物件儲存, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- ECC（見 error-correcting codes）
- EDB Postgres Distributed（資料庫）, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
- 邊緣（圖）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - 屬性圖模型, [屬性圖](/tw/ch3#id56)
- 編輯距離（全文搜尋）, [全文檢索](/tw/ch4#sec_storage_full_text)
- 有效一次語義, [容錯](/tw/ch12#sec_stream_fault_tolerance), [恰好執行一次操作](/tw/ch13#id353)
  - （另見 恰好一次語義）
  - 維護完整性, [資料流系統的正確性](/tw/ch13#id453)
- Elastic Compute Cloud (EC2)
  - 競價例項（spot instance）, [故障處理](/tw/ch11#id281)
- 彈性, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
  - 雲資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- Elasticsearch（搜尋伺服器）
  - 本地二級索引, [本地二級索引](/tw/ch7#id166)
  - percolator（流搜尋）, [在流上搜索](/tw/ch12#id320)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
  - 分片再平衡, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
  - 使用 Lucene, [全文檢索](/tw/ch4#sec_storage_full_text)
- Elm（程式語言）, [端到端的事件流](/tw/ch13#id349)
- ELT (extract-load-transform), [資料倉庫](/tw/ch1#sec_introduction_dwh)
  - 與批次處理的關係, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- 易並行（演算法）
  - 提取-轉換-載入（ETL）（見 ETL）
  - MapReduce, [MapReduce](/tw/ch11#sec_batch_mapreduce)
    - （另見 MapReduce）
- 嵌入式儲存引擎, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
- 嵌入（向量）, [向量嵌入](/tw/ch4#id92)
- 編碼（資料格式）, [編碼與演化](/tw/ch5#ch_encoding)-[模式的優點](/tw/ch5#sec_encoding_schemas)
  - Avro, [Avro](/tw/ch5#sec_encoding_avro)-[動態生成的模式](/tw/ch5#dynamically-generated-schemas)
  - binary variants of JSON and XML, [二進位制編碼](/tw/ch5#binary-encoding)
  - 相容性, [編碼與演化](/tw/ch5#ch_encoding)
    - 呼叫服務, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
    - 使用資料庫, [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)-[歸檔儲存](/tw/ch5#archival-storage)
  - 定義, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
  - JSON, XML, and CSV, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
  - 語言特定格式, [特定語言的格式](/tw/ch5#id96)
  - 模式的優點, [模式的優點](/tw/ch5#sec_encoding_schemas)
  - Protocol Buffers, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)-[欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
  - 資料的表示, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- 端到端原則, [端到端原則](/tw/ch13#sec_future_e2e_argument)-[在資料系統中應用端到端思考](/tw/ch13#id357)
  - 檢查完整性, [端到端原則重現](/tw/ch13#id456)
  - 釋出/訂閱流, [端到端的事件流](/tw/ch13#id349)
- 擴充（流）, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
- Enterprise JavaBeans (EJB), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
- 企業軟體, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- entities（見 vertices）
- 臨時儲存（ephemeral storage）, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- 紀元（共識演算法）, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
- 紀元（Unix 時間戳）, [日曆時鐘](/tw/ch9#time-of-day-clocks)
- 糾刪碼（錯誤校正）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 錯誤處理
  - 網路故障, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 事務中, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- 糾錯碼, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- Esper (CEP engine), [複合事件處理](/tw/ch12#id317)
- 本質複雜性, [簡單性：管理複雜度](/tw/ch2#id38)
- etcd（協調服務）, [協調服務](/tw/ch10#sec_consistency_coordination)-[服務發現](/tw/ch10#service-discovery)
  - 生成隔離令牌, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens), [協調服務](/tw/ch10#sec_consistency_coordination)
  - 線性一致性操作, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable), [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 鎖定與領導者選舉, [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
  - 用於服務發現, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery), [服務發現](/tw/ch10#service-discovery)
  - 用於分片分配, [請求路由](/tw/ch7#sec_sharding_routing)
  - 使用 Raft 演算法, [單主複製](/tw/ch6#sec_replication_leader)
- Ethereum（區塊鏈）, [用於可審計資料系統的工具](/tw/ch13#id366)
- 乙太網（網路）, [雲計算與超級計算](/tw/ch1#id17), [不可靠的網路](/tw/ch9#sec_distributed_networks), [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
  - 資料包校驗和, [弱形式的謊言](/tw/ch9#weak-forms-of-lying), [端到端原則](/tw/ch13#sec_future_e2e_argument)
- 倫理, [將事情做正確](/tw/ch14)-[立法與自律](/tw/ch14#sec_future_legislation)
  - 道德守則和專業實務, [將事情做正確](/tw/ch14)
  - 立法和自律, [立法與自律](/tw/ch14#sec_future_legislation)
  - 預測分析, [預測分析](/tw/ch14#id369)-[反饋迴路](/tw/ch14#id372)
    - 放大偏見, [偏見與歧視](/tw/ch14#id370)
    - 反饋迴路, [反饋迴路](/tw/ch14#id372)
  - 隱私和跟蹤, [隱私與追蹤](/tw/ch14#id373)-[立法與自律](/tw/ch14#sec_future_legislation)
    - 同意和選擇自由, [同意與選擇自由](/tw/ch14#id375)
    - 資料作為資產和權力, [資料作為資產與權力](/tw/ch14#id376)
    - 隱私的含義, [隱私與資料使用](/tw/ch14#id457)
    - 監視, [監視](/tw/ch14#id374)
  - 尊重、尊嚴與能動性, [立法與自律](/tw/ch14#sec_future_legislation)
  - 意外後果, [將事情做正確](/tw/ch14), [反饋迴路](/tw/ch14#id372)
- ETL, [資料倉庫](/tw/ch1#sec_introduction_dwh), [保持系統同步](/tw/ch12#sec_stream_sync), [術語表](/tw/glossary)
  - 與批次處理的關係, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)-[提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
  - 使用批次處理, [批處理](/tw/ch11#ch_batch)
- 歐幾里得距離（語義搜尋）, [向量嵌入](/tw/ch4#id92)
- European Union
  - AI Act（見 AI Act）
  - GDPR（見 GDPR）
- 事件溯源, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)-[事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 與資料變更捕獲, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
  - 與變化資料捕獲的比較, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
  - 不可更改性和可審計性, [狀態、流和不變性](/tw/ch12#sec_stream_immutability), [為可審計性而設計](/tw/ch13#id365)
  - 大型可靠資料系統, [操作識別符號](/tw/ch13#id355), [資料流系統的正確性](/tw/ch13#id453)
  - 依賴決定性因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- event streams（見 streams）
- 事件驅動的架構, [事件驅動的架構](/tw/ch5#sec_encoding_dataflow_msg)-[分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
  - 分散式 actor 框架, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
- 事件, [傳遞事件流](/tw/ch12#sec_stream_transmit)
  - 決定總順序, [全序的限制](/tw/ch13#id335)
  - 從事件日誌中得出看法, [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
  - 事件時間與處理時間, [事件時間與處理時間](/tw/ch12#id322), [微批次與存檔點](/tw/ch12#id329), [統一批處理和流處理](/tw/ch13#id338)
  - 不可改變的優點, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros), [為可審計性而設計](/tw/ch13#id365)
  - 排序以捕獲因果關係, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 讀取作為事件, [讀也是事件](/tw/ch13#sec_future_read_events)
  - 滯留事件, [處理滯留事件](/tw/ch12#id323)
  - 流處理中的時間戳, [你用的是誰的時鐘？](/tw/ch12#id438)
- EventSource (browser API), [將狀態變更推送給客戶端](/tw/ch13#id348)
- EventStoreDB（資料庫）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 最終一致性, [複製](/tw/ch6#ch_replication), [複製延遲的問題](/tw/ch6#sec_replication_lag), [安全性與活性](/tw/ch9#sec_distributed_safety_liveness)
  - （另見 conflicts）
  - 和長期不一致, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 最終的一致性, [自動衝突解決](/tw/ch6#automatic-conflict-resolution)
- 證據
  - 資料用作, [人類與可靠性](/tw/ch2#id31)
- 可演化性, [可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability), [編碼與演化](/tw/ch5#ch_encoding)
  - 服務呼叫, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - 事件溯源, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 圖結構資料, [屬性圖](/tw/ch3#id56)
  - 資料庫, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility), [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)-[歸檔儲存](/tw/ch5#archival-storage), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views), [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
  - 重新處理資料, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing), [統一批處理和流處理](/tw/ch13#id338)
  - Avro 的模式演化, [寫入者模式與讀取者模式](/tw/ch5#the-writers-schema-and-the-readers-schema)
  - Protocol Buffers 的模式演化, [欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
  - 讀時模式, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility), [編碼與演化](/tw/ch5#ch_encoding), [模式的優點](/tw/ch5#sec_encoding_schemas)
- 恰好一次語義, [恰好一次訊息處理](/tw/ch8#sec_transactions_exactly_once), [再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited), [容錯](/tw/ch12#sec_stream_fault_tolerance), [恰好執行一次操作](/tw/ch13#id353)
  - 與批次處理器對等, [統一批處理和流處理](/tw/ch13#id338)
  - 維護完整性, [資料流系統的正確性](/tw/ch13#id453)
  - 使用持久執行, [持久化執行](/tw/ch5#durable-execution)
- 獨佔模式, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
- 指數退避, [描述效能](/tw/ch2#sec_introduction_percentiles), [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
- ext4 (file system), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- eXtended Architecture transactions（見 XA 事務）
- ETL（見 提取-轉換-載入（ETL））

### F

- 臉書
  - Faiss（向量索引）, [向量嵌入](/tw/ch4#id92)
  - React（使用者介面庫）, [端到端的事件流](/tw/ch13#id349)
  - 社交圖譜, [圖資料模型](/tw/ch3#sec_datamodels_graph)
- 事實
  - 事實表（星型模式）, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
  - 在 Datalog 中, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - 在事件溯源中, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 慢故障, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 失敗停止模式, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 故障切換, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [術語表](/tw/glossary)
  - （另見 基於領導者的複製）
  - 在無主複製中不存在, [當節點故障時寫入資料庫](/tw/ch6#id287)
  - 領導者選舉, [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing), [共識](/tw/ch10#sec_consistency_consensus), [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - 潛在問題, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
- 失敗
  - 被分散式事務放大, [維護派生狀態](/tw/ch13#id446)
  - 檢測失敗, [檢測故障](/tw/ch9#id307)
    - 自動再平衡導致連鎖故障, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
    - 超時和無限制延誤, [超時和無界延遲](/tw/ch9#sec_distributed_queueing), [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
    - 使用協調服務, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 與故障的區別, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
  - 部分失敗, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure), [總結](/tw/ch9#summary)
- Faiss（向量索引）, [向量嵌入](/tw/ch4#id92)
- 假陽性（Bloom 過濾器）, [布隆過濾器](/tw/ch4#bloom-filters)
- 扇出, [時間線的物化與更新](/tw/ch2#sec_introduction_materializing), [多個消費者](/tw/ch12#id298)
- 故障注入, [容錯](/tw/ch2#id27), [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults), [故障注入](/tw/ch9#sec_fault_injection)
- 故障隔離, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 容錯, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)-[人類與可靠性](/tw/ch2#id31), [術語表](/tw/glossary)
  - 協商一致的形式化, [單值共識](/tw/ch10#single-value-consensus)
  - 容忍人為失誤, [批處理](/tw/ch11#ch_batch)
  - 在批處理中, [故障處理](/tw/ch11#id281)
  - 在基於日誌的系統中, [在資料系統中應用端到端思考](/tw/ch13#id357), [及時性與完整性](/tw/ch13#sec_future_integrity)-[資料流系統的正確性](/tw/ch13#id453)
  - 在流處理中, [容錯](/tw/ch12#sec_stream_fault_tolerance)-[失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
    - 原子提交, [原子提交再現](/tw/ch12#sec_stream_atomic_commit)
    - 冪等性, [冪等性](/tw/ch12#sec_stream_idempotence)
    - 保持衍生狀態, [維護派生狀態](/tw/ch13#id446)
    - 微批次與存檔點, [微批次與存檔點](/tw/ch12#id329)
    - 失敗後重建狀態, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 分散式事務, [XA 事務](/tw/ch8#xa-transactions)-[再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
  - 基於領導和無領導者的複製, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 事務原子性, [原子性](/tw/ch8#sec_transactions_acid_atomicity), [分散式事務](/tw/ch8#sec_transactions_distributed)-[恰好一次訊息處理](/tw/ch8#sec_transactions_exactly_once)
- 錯誤
  - 拜占庭故障, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)-[弱形式的謊言](/tw/ch9#weak-forms-of-lying)
  - 失敗與, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
  - 事務處理, [事務](/tw/ch8#ch_transactions)
  - 超級計算機和雲計算處理, [雲計算與超級計算](/tw/ch1#id17)
  - 硬體, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - 在分散式系統中, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - introducing deliberately（見 fault injection）
  - 網路故障, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)-[檢測故障](/tw/ch9#id307)
    - 非對稱故障, [多數派原則](/tw/ch9#sec_distributed_majority)
    - 檢測, [檢測故障](/tw/ch9#id307)
    - 在多主複製中容忍, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 軟體故障, [軟體故障](/tw/ch2#software-faults)
  - tolerating（見 fault tolerance）
- 特徵工程（機器學習）, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- 聯邦資料庫, [一切的元資料庫](/tw/ch13#id341)
- Feldera（資料庫）
  - 增量檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- 圍欄, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
- 屏障, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)-[多副本隔離](/tw/ch9#fencing-with-multiple-replicas)
  - 生成柵欄標誌, [使用共享日誌](/tw/ch10#sec_consistency_smr), [協調服務](/tw/ch10#sec_consistency_coordination)
  - 柵欄標誌的屬性, [定義演算法的正確性](/tw/ch9#defining-the-correctness-of-an-algorithm)
  - 流處理器寫入資料庫, [冪等性](/tw/ch12#sec_stream_idempotence), [恰好執行一次操作](/tw/ch13#id353)
- 獲取和新增
  - 與協商一致的關係, [獲取並增加作為共識](/tw/ch10#fetch-and-add-as-consensus)
- 光纖通道（網路）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 欄位標記（協議緩衝）, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)-[欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
- Figma （圖形軟體）, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- filesystem in userspace (FUSE), [設定新的副本](/tw/ch6#sec_replication_new_replica), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 在物件儲存中, [物件儲存](/tw/ch11#id277)
- 財務資料
  - 會計分類賬, [總結](/tw/ch3#summary)
  - 不可改變性, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
  - 時間序列資料, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- Fivetran, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- FizzBee (specification language), [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 平面索引（向量索引）, [向量嵌入](/tw/ch4#id92)
- FlatBuffers（資料格式）, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- Flink（處理框架）, [批處理](/tw/ch11#ch_batch), [資料流引擎](/tw/ch11#sec_batch_dataflow)
  - 成本效率, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes), [DataFrames](/tw/ch11#id287)
  - 容錯, [故障處理](/tw/ch11#id281), [微批次與存檔點](/tw/ch12#id329), [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - FlinkML, [機器學習](/tw/ch11#id290)
  - 資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 整合批次和流處理, [統一批處理和流處理](/tw/ch13#id338)
  - 查詢最佳化器, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 混洗資料, [混洗資料](/tw/ch11#sec_shuffle)
  - 流處理, [流分析](/tw/ch12#id318)
  - streaming SQL support, [複合事件處理](/tw/ch12#id317)
- 流量控制, [TCP 的侷限性](/tw/ch9#sec_distributed_tcp), [訊息傳遞系統](/tw/ch12#sec_stream_messaging), [術語表](/tw/glossary)
- FLP result (on consensus), [共識](/tw/ch10#sec_consistency_consensus)
- Flyte（工作流排程器）, [機器學習](/tw/ch11#id290)
- 追隨者, [單主複製](/tw/ch6#sec_replication_leader), [術語表](/tw/glossary)
  - （另見 基於領導者的複製）
- 正式方法, [形式化方法和隨機測試](/tw/ch9#sec_distributed_formal)-[確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 前向相容性, [編碼與演化](/tw/ch5#ch_encoding)
- 前向衰減（演算法）, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- Fossil（版本控制系統）, [併發控制](/tw/ch12#sec_stream_concurrency)
  - 避免, [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
- FoundationDB（資料庫）
  - 一致性模式, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 確定性模擬測試, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 鍵範圍分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 程序/核心模式, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
  - 可序列化事務, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi), [可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
  - 事務, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
- 分數索引, [何時使用哪種模型](/tw/ch3#sec_datamodels_document_summary)
- 碎片化（B 樹）, [磁碟空間使用](/tw/ch4#disk-space-usage)
- 幀（計算機圖形）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 前端 （網頁開發）, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- FrostDB（資料庫）
  - 確定性模擬測試（DST）, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- fsync （系統呼叫）, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal), [永續性](/tw/ch8#durability)
- 全文檢索, [全文檢索](/tw/ch4#sec_storage_full_text), [術語表](/tw/glossary)
  - 與模糊索引, [全文檢索](/tw/ch4#sec_storage_full_text)
  - Lucene 儲存引擎, [全文檢索](/tw/ch4#sec_storage_full_text)
  - 分片索引, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)
- Function as a Service (FaaS), [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
- 函數語言程式設計
  - inspiration for MapReduce, [MapReduce](/tw/ch11#sec_batch_mapreduce)
- 功能性需求, [定義非功能性需求](/tw/ch2#ch_nonfunctional)
- FUSE（見 filesystem in userspace (FUSE)）
- 模糊測試, [形式化方法和隨機測試](/tw/ch9#sec_distributed_formal)
- fuzzy search（見 similarity search）

### G

- Gallina（規範語言）, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 遊戲開發, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 垃圾收集
  - 不可改變性和, [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 程序暫停, [延遲與響應時間](/tw/ch2#id23), [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact), [多數派原則](/tw/ch9#sec_distributed_majority)
    - （另見 process pauses）
- 加油站演算法定價, [反饋迴路](/ch14#id372)
- GDPR (regulation), [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 同意書, [同意與選擇自由](/ch14#id375)
  - 資料最小化, [立法與自律](/ch14#sec_future_legislation)
  - 合法權益, [同意與選擇自由](/ch14#id375)
  - 使用權, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
  - 清除的權利, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [磁碟空間使用](/tw/ch4#disk-space-usage), [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- GenBank (genome database), [總結](/tw/ch3#summary)
- General Data Protection Regulation（見 GDPR (regulation)）
- 基因組分析, [總結](/tw/ch3#summary)
- geographic distribution（見 regions (geographic distribution)）
- 地理空間指數, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- Git（版本控制系統）, [併發控制](/tw/ch12#sec_stream_concurrency)
  - 本地第一軟體, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 合併衝突, [手動衝突解決](/tw/ch6#manual-conflict-resolution)
- GitHub, postmortems, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [將系統模型對映到現實世界](/tw/ch9#mapping-system-models-to-the-real-world)
- 全域性二級索引, [全域性二級索引](/tw/ch7#id167), [總結](/tw/ch7#summary)
- globally unique identifiers（見 UUIDs）
- GlusterFS（分散式檔案系統）, [批處理](/tw/ch11#ch_batch), [分散式檔案系統](/tw/ch11#sec_batch_dfs), [物件儲存](/tw/ch11#id277)
- GNU Coreutils (Linux), [排序與記憶體聚合](/tw/ch11#id275)
- Go（程式語言）
  - 垃圾收集, [限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
- GoldenGate (change data capture), [資料變更捕獲的實現](/tw/ch12#id307)
  - （另見 Oracle）
- 谷歌
  - BigQuery（見 BigQuery（資料庫））
  - Bigtable（見 Bigtable（資料庫））
  - Chubby（鎖服務）, [協調服務](/tw/ch10#sec_consistency_coordination)
  - Cloud Storage（物件儲存）, [設定新的副本](/tw/ch6#sec_replication_new_replica), [物件儲存](/tw/ch11#id277)
    - 請求先決條件, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
  - Compute Engine
    - 可搶佔例項, [故障處理](/tw/ch11#id281)
  - 資料流（流程處理）
    - 資料倉整合, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
    - 移動資料, [混洗資料](/tw/ch11#sec_shuffle)
  - 資料流（流處理器）, [流分析](/tw/ch12#id318), [原子提交再現](/tw/ch12#sec_stream_atomic_commit), [統一批處理和流處理](/tw/ch13#id338)
    - （另見 Beam）
  - 資料流（變化資料捕獲）, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - Docs（協作編輯）, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
    - 操作轉換, [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
  - Dremel（查詢引擎）, [列式儲存](/tw/ch4#sec_storage_column)
  - Firestore（資料庫）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
  - MapReduce (batch processing), [批處理](/tw/ch11#ch_batch)
    - （另見 MapReduce）
  - Percolator（事務系統）, [實現線性一致的 ID 生成器](/tw/ch10#implementing-a-linearizable-id-generator)
  - 永續性磁碟（雲服務）, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
  - Pub/Sub（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [訊息代理與資料庫的對比](/tw/ch12#id297), [使用日誌進行訊息儲存](/tw/ch12#id300)
  - 響應時間研究, [平均值、中位數與百分位點](/tw/ch2#id24)
  - 工作表（協作電子表格）, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
  - Spanner（見 Spanner（資料庫））
  - TrueTime (clock API), [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval)
- 流言協議, [請求路由](/tw/ch7#sec_sharding_routing)
- 治理, [超越資料湖](/tw/ch1#beyond-the-data-lake)
- 政府對資料的使用, [資料作為資產與權力](/ch14#id376)
- GPS (Global Positioning System)
  - 用於時鐘同步, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy), [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
- GPT (language model), [向量嵌入](/tw/ch4#id92)
- GPU (graphics processing unit), [雲服務的分層](/tw/ch1#layering-of-cloud-services), [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- gradual rollout（見 rolling upgrades）
- GraphQL（查詢語言）, [GraphQL](/tw/ch3#id63)
  - 驗證, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
- 圖, [術語表](/tw/glossary)
  - 作為資料模型, [圖資料模型](/tw/ch3#sec_datamodels_graph)-[GraphQL](/tw/ch3#id63)
    - 屬性圖, [屬性圖](/tw/ch3#id56)
    - RDF and triple-stores, [三元組儲存與 SPARQL](/tw/ch3#id59)-[SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - DAGs（見 directed acyclic graphs）
  - 處理和分析, [機器學習](/tw/ch11#id290)
  - 查詢語言
    - Cypher, [Cypher 查詢語言](/tw/ch3#id57)
    - Datalog, [Datalog：遞迴關係查詢](/tw/ch3#id62)-[Datalog：遞迴關係查詢](/tw/ch3#id62)
    - GraphQL, [GraphQL](/tw/ch3#id63)
    - Gremlin, [圖資料模型](/tw/ch3#sec_datamodels_graph)
    - recursive SQL queries, [SQL 中的圖查詢](/tw/ch3#id58)
    - SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)-[SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - 遍歷, [屬性圖](/tw/ch3#id56)
- 灰色失敗, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
  - 無領導複製, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
- Gremlin（圖查詢語言）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
- grep（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis)
- gRPC (service calls), [微服務與無伺服器](/tw/ch1#sec_introduction_microservices), [Web 服務](/tw/ch5#sec_web_services)
  - 前向和後向相容性, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
- GUIDs（見 UUIDs）

### H

- Hadoop（資料基礎設施）
  - 比較分散式資料庫, [批處理](/tw/ch11#ch_batch)
  - MapReduce（見 MapReduce）
  - NodeManager, [分散式作業編排](/tw/ch11#id278)
  - YARN（見 YARN (job scheduler)）
- HANA（見 SAP HANA（資料庫））
- 先發生關係, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
- 硬碟
  - 訪問模式, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
  - 檢測資料損壞, [端到端原則](/tw/ch13#sec_future_e2e_argument), [不要盲目信任承諾](/tw/ch13#id364)
  - 故障, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults), [永續性](/tw/ch8#durability)
  - 順序對隨機寫入, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
  - 連續寫入吞吐量, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- 硬體故障, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
- 雜湊函式
  - 在 Bloom 過濾器中, [布隆過濾器](/tw/ch4#bloom-filters)
- 雜湊連線
  - 在流處理中, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
- 雜湊分片, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash)-[一致性雜湊](/tw/ch7#sec_sharding_consistent_hashing), [總結](/tw/ch7#summary)
  - 一致性雜湊, [一致性雜湊](/tw/ch7#sec_sharding_consistent_hashing)
  - Hash mod N的問題, [雜湊取模節點數](/tw/ch7#hash-modulo-number-of-nodes)
  - 範圍查詢, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 合適的雜湊函式, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash)
  - 固定數量的分片, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
- 雜湊表, [日誌結構儲存](/tw/ch4#sec_storage_log_structured)
- Hazelcast（記憶體資料網格）
  - FencedLock, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
  - Flake ID Generator, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
- HBase（資料庫）
  - 由於缺乏圍欄而出現錯誤, [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing)
  - 鍵範圍分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 日誌結構儲存, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 區域（分片）, [分片](/tw/ch7#ch_sharding)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 大小分級壓實, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 寬列資料模型, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality), [列壓縮](/tw/ch4#sec_storage_column_compression)
- HDFS (Hadoop Distributed File System), [批處理](/tw/ch11#ch_batch), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - （另見 distributed filesystems）
  - 檢查資料完整性, [不要盲目信任承諾](/tw/ch13#id364)
  - DataNode, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - NameNode, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - use in MapReduce, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 工作流程示例, [工作流排程](/tw/ch11#sec_batch_workflows)
- HdrHistogram (numerical library), [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- head（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [分散式作業編排](/tw/ch11#id278)
- 頭頂點（屬性圖）, [屬性圖](/tw/ch3#id56)
- 頭部阻塞, [延遲與響應時間](/tw/ch2#id23)
- 堆檔案（資料庫）, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
  - 多版本併發控制, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl)
- 熱量管理, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
- 對沖請求, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
- 異構分散式事務, [跨不同系統的分散式事務](/tw/ch8#sec_transactions_xa), [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
- 啟發式決策, [從協調器故障中恢復](/tw/ch8#recovering-from-coordinator-failure)
- Hex（筆記本）, [機器學習](/tw/ch11#id290)
- 六邊形
  - 地理空間索引, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- Hibernate（物件關係對映器）, [物件關係對映（ORM）](/tw/ch3#object-relational-mapping-orm)
- 層次模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
- 分層可導航小世界（向量索引）, [向量嵌入](/tw/ch4#id92)
- hierarchical queries（見 recursive common table expressions）
- high availability（見 fault tolerance）
- 高頻事務, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- high-performance computing (HPC), [雲計算與超級計算](/tw/ch1#id17)
- 提示移交, [追趕錯過的寫入](/tw/ch6#sec_replication_read_repair)
- 直方圖, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- Hive（資料倉庫）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 查詢最佳化器, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- HNSW (vector index), [向量嵌入](/tw/ch4#id92)
- 跳躍視窗（流處理）, [視窗的型別](/tw/ch12#id324)
  - （另見 windows）
- Hoptimator（查詢引擎）, [一切的元資料庫](/tw/ch13#id341)
- 地平線醜聞, [人類與可靠性](/tw/ch2#id31)
  - 缺乏事務, [事務](/tw/ch8#ch_transactions)
- horizontal scaling（見 scaling out）
  - 透過分片, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
- HornetQ（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [訊息代理與資料庫的對比](/tw/ch12#id297)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
- 熱鍵, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
- 熱點, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
  - 由於名人, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
  - 時間序列資料, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 緩解, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
- hot standbys（見 基於領導者的複製）
- HTAP（見 hybrid transactional/analytic processing）
- HTTP, use in APIs（見 services）
- 人類錯誤, [人類與可靠性](/tw/ch2#id31), [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults), [批處理](/tw/ch11#ch_batch)
- 混合邏輯時鐘, [混合邏輯時鐘](/tw/ch10#hybrid-logical-clocks)
- 混合事務/分析處理, [資料倉庫](/tw/ch1#sec_introduction_dwh), [分析型資料儲存](/tw/ch4#sec_storage_analytics)
- hydrating IDs (join), [社交網路案例研究中的反正規化](/tw/ch3#denormalization-in-the-social-networking-case-study)
- 超圖, [屬性圖](/tw/ch3#id56)
- HyperLogLog (algorithm), [流分析](/tw/ch12#id318)

### I

- I/O operations, waiting for, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- IaaS（見 infrastructure as a service (IaaS)）
- IBM
  - Db2（資料庫）
    - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
    - 可序列隔離, [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - MQ（訊息系統）, [訊息代理與資料庫的對比](/tw/ch12#id297)
    - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - System R（資料庫）, [事務到底是什麼？](/tw/ch8#sec_transactions_overview)
  - WebSphere（訊息系統）, [訊息代理](/tw/ch5#message-brokers)
- Iceberg（表格式）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 物件儲存的資料庫, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 基於日誌的資訊代理儲存, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- 冪等性, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc), [冪等性](/tw/ch12#sec_stream_idempotence), [術語表](/tw/glossary)
  - by giving operations unique IDs, [多分割槽請求處理](/tw/ch13#id360)
  - by giving requests unique IDs, [操作識別符號](/tw/ch13#id355)
  - 用於恰好一次語義, [再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
  - 冪等操作, [恰好執行一次操作](/tw/ch13#id353)
  - 工作流程引擎中, [持久化執行](/tw/ch5#durable-execution)
- 不可改變性
  - 好處, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros), [為可審計性而設計](/tw/ch13#id365)
  - 和清除的權利, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [磁碟空間使用](/tw/ch4#disk-space-usage)
  - 刪除加密, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events), [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 從事件日誌中獲取狀態, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)-[不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 崩潰恢復, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 在B樹上, [B 樹變體](/tw/ch4#b-tree-variants), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 在事件溯源中, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events), [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
  - 限制, [併發控制](/tw/ch12#sec_stream_concurrency)
- 阻抗不匹配, [物件關係不匹配](/tw/ch3#sec_datamodels_document)
- 存疑, [協調器故障](/tw/ch8#coordinator-failure)
  - 鎖定, [存疑時持有鎖](/tw/ch8#holding-locks-while-in-doubt)
  - 孤兒事務, [從協調器故障中恢復](/tw/ch8#recovering-from-coordinator-failure)
- 記憶體資料庫, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - 永續性, [永續性](/tw/ch8#durability)
  - 序列事務執行, [實際序列執行](/tw/ch8#sec_transactions_serial)
- 事件
  - 導致錯誤定罪的會計軟體錯誤, [人類與可靠性](/tw/ch2#id31)
  - 無責備事後分析, [人類與可靠性](/tw/ch2#id31)
  - 閏秒導致崩潰, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
  - 資料腐敗和貨幣錯誤造成的經濟損失, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
  - 硬碟上的資料腐敗, [永續性](/tw/ch8#durability)
  - 因最後寫入勝利導致的資料丟失, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 磁碟上無法讀取的資料, [將系統模型對映到現實世界](/tw/ch9#mapping-system-models-to-the-real-world)
  - 由於重用主鍵而洩露敏感資料, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
  - 事務序列性中的錯誤, [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
  - gigabit network interface with 1 Kb/s throughput, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
  - 閏秒崩潰, [軟體故障](/tw/ch2#software-faults)
  - 網路故障, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 網路介面只丟棄入站資料包, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 網路分割槽和全資料中心故障, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 網路故障處理不當, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 向前任伴侶傳送訊息, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 咬海底電纜的鯊魚, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - split brain due to 1-minute packet delay, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - SSD failure after 32,768 hours, [軟體故障](/tw/ch2#software-faults)
  - 執行緒爭用導致服務中斷, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - 伺服器架中的振動, [延遲與響應時間](/tw/ch2#id23)
  - 違反唯一性約束, [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
- incremental view maintenance (IVM), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - 資料整合, [分拆系統與整合系統](/tw/ch13#id448)
- 索引, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp), [術語表](/tw/glossary)
  - 並快照隔離, [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 作為衍生資料, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
  - B樹, [B 樹](/tw/ch4#sec_storage_b_trees)-[B 樹變體](/tw/ch4#b-tree-variants)
  - 分組, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
  - comparison of B-trees and LSM-trees, [比較 B 樹與 LSM 樹](/tw/ch4#sec_storage_btree_lsm_comparison)-[磁碟空間使用](/tw/ch4#disk-space-usage)
  - 覆蓋（包括各欄）, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
  - 建立, [建立索引](/tw/ch13#id340)
  - 全文檢索, [全文檢索](/tw/ch4#sec_storage_full_text)
  - 地理空間, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
  - 索引範圍鎖定, [索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
  - 多列（壓縮）, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
  - 二級, [多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn)
    - （另見 secondary indexes）
    - 雙寫問題, [保持系統同步](/tw/ch12#sec_stream_sync), [理解資料流](/tw/ch13#id443)
  - 分片與二級索引, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)-[全域性二級索引](/tw/ch7#id167), [總結](/tw/ch7#summary)
  - 稀疏, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)
  - SSTable 與 LSM 樹, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 資料變化時更新, [保持系統同步](/tw/ch12#sec_stream_sync), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- Industrial Revolution, [回顧工業革命](/ch14#id377)
- InfiniBand (networks), [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- InfluxDB IOx (storage engine), [列式儲存](/tw/ch4#sec_storage_column)
- information retrieval（見 全文檢索）
- infrastructure as a service (IaaS), [雲服務與自託管](/tw/ch1#sec_introduction_cloud), [雲服務的分層](/tw/ch1#layering-of-cloud-services)
- InnoDB (storage engine)
  - 主鍵的聚集索引, [在索引中儲存值](/tw/ch4#sec_storage_index_heap)
  - 不防止丟失的更新, [自動檢測丟失的更新](/tw/ch8#automatically-detecting-lost-updates)
  - 防止寫偏差, [寫偏差的特徵](/tw/ch8#characterizing-write-skew), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 可序列化隔離, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 快照隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
- 例項（雲計算）, [雲服務的分層](/tw/ch1#layering-of-cloud-services)
- integrating different data systems（見 資料整合）
- 完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 無協調資料系統, [無協調資料系統](/tw/ch13#id454)
  - 資料流系統的正確性, [資料流系統的正確性](/tw/ch13#id453)
  - 協商一致形式化, [單值共識](/tw/ch10#single-value-consensus), [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
  - 完整性檢查, [不要盲目信任承諾](/tw/ch13#id364)
    - （另見 審計）
    - 端到端, [端到端原則](/tw/ch13#sec_future_e2e_argument), [端到端原則重現](/tw/ch13#id456)
    - 使用快照隔離, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - 儘管軟體錯誤仍然維護, [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
- Interface Definition Language (IDL), [Protocol Buffers](/tw/ch5#sec_encoding_protobuf), [Avro](/tw/ch5#sec_encoding_avro), [Web 服務](/tw/ch5#sec_web_services)
- 不變式, [一致性](/tw/ch8#sec_transactions_acid_consistency)
  - （另見 constraints）
- 倒排檔案索引（向量索引）, [向量嵌入](/tw/ch4#id92)
- 倒排索引, [全文檢索](/tw/ch4#sec_storage_full_text)
- 不可逆轉,儘量減少, [可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability), [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events), [批處理](/tw/ch11#ch_batch)
- ISDN (Integrated Services Digital Network), [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
- 隔離性
  - cgroups（見 cgroups）
- 隔離性, [隔離性](/tw/ch8#sec_transactions_acid_isolation), [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object), [術語表](/tw/glossary)
  - 正確性和, [追求正確性](/tw/ch13#sec_future_correctness)
  - 用於單物件寫入, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 可序列化, [可序列化](/tw/ch8#sec_transactions_serializability)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - 實際序列執行, [實際序列執行](/tw/ch8#sec_transactions_serial)-[序列執行總結](/tw/ch8#summary-of-serial-execution)
    - 可序列化快照隔離, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - 兩階段鎖定, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
  - 違反, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
  - 弱隔離級別, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)-[物化衝突](/tw/ch8#materializing-conflicts)
    - 防止丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)-[衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
    - 讀已提交, [讀已提交](/tw/ch8#sec_transactions_read_committed)-[實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
    - 快照隔離, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)-[快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- IVF (vector index), [向量嵌入](/tw/ch4#id92)

### J

- 資料庫連線
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - 網路驅動程式, [模式的優點](/tw/ch5#sec_encoding_schemas)
- Java Enterprise Edition (EE), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc), [XA 事務](/tw/ch8#xa-transactions)
- Java Message Service (JMS), [訊息代理與資料庫的對比](/tw/ch12#id297)
  - （另見 messaging systems）
  - 與基於日誌的訊息傳遞比較, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [重播舊訊息](/tw/ch12#sec_stream_replay)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - 訊息順序, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
- Java Transaction API (JTA), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc), [XA 事務](/tw/ch8#xa-transactions)
- Java Virtual Machine (JVM)
  - 垃圾收集, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses), [限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
  - JIT compilation, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 批處理器中的程序重用, [資料流引擎](/tw/ch11#sec_batch_dataflow)
- Jena (RDF framework), [RDF 資料模型](/tw/ch3#the-rdf-data-model)
  - SPARQL 查詢語言, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- Jepsen（容錯測試）, [故障注入](/tw/ch9#sec_fault_injection), [追求正確性](/tw/ch13#sec_future_correctness)
- jitter （網路延遲）, [平均值、中位數與百分位點](/tw/ch2#id24), [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
- JMESPath（查詢語言）, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- 連線表, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many), [屬性圖](/tw/ch3#id56)
- 連線, [術語表](/tw/glossary)
  - 作為關係運算符表示, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - handling GraphQL query, [GraphQL](/tw/ch3#id63)
  - 應用程式程式碼, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization), [社交網路案例研究中的反正規化](/tw/ch3#denormalization-in-the-social-networking-case-study)
  - in DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 關係資料庫和文件資料庫, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)
  - 二級指數和, [多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn)
  - 排序合併, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
  - 流連線, [流連線](/tw/ch12#sec_stream_joins)-[連線的時間依賴性](/tw/ch12#sec_stream_join_time)
    - 流流連線, [流流連線（視窗連線）](/tw/ch12#id440)
    - 流表連線, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
    - 表表連線, [表表連線（維護物化檢視）](/tw/ch12#id326)
    - 時間的依賴性, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
  - 文件資料庫中的支援, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
- JOTM (transaction coordinator), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
- 日誌（檔案系統）, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
- JSON
  - 聚合管道（查詢語言）, [文件的查詢語言](/tw/ch3#query-languages-for-documents)
  - Avro 模式表示, [Avro](/tw/ch5#sec_encoding_avro)
  - 二進位制變體, [二進位制編碼](/tw/ch5#binary-encoding)
  - 資料位置, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - 文件資料模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
  - 應用資料的問題, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
  - GraphQL response, [GraphQL](/tw/ch3#id63)
  - 關係資料庫, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
  - 代表簡歷（例）, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
  - 模式, [JSON 模式](/tw/ch5#json-schema)
- JSON-LD, [三元組儲存與 SPARQL](/tw/ch3#id59)
- JsonPath（查詢語言）, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- JuiceFS（分散式檔案系統）, [分散式檔案系統](/tw/ch11#sec_batch_dfs), [物件儲存](/tw/ch11#id277)
- Jupyter（筆記本）, [機器學習](/tw/ch11#id290)
- just-in-time (JIT) compilation, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)

### K

- Kafka（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [使用日誌進行訊息儲存](/tw/ch12#id300)
  - 消費者群體, [多個消費者](/tw/ch12#id298)
  - 資料整合, [分拆系統與整合系統](/tw/ch13#id448)
  - 用於事件溯源, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - Kafka Connect（資料庫整合）, [資料變更捕獲的實現](/tw/ch12#id307), [變更流的 API 支援](/tw/ch12#sec_stream_change_api), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
  - Kafka Streams（流處理器）, [流分析](/tw/ch12#id318), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
    - 恰好一次語義, [再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
    - 容錯, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - ksqlDB (stream database), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 日誌壓縮, [日誌壓縮](/tw/ch12#sec_stream_log_compaction), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - 偏移量, [使用日誌進行訊息儲存](/tw/ch12#id300), [冪等性](/tw/ch12#sec_stream_idempotence)
  - 分割槽, [分片](/tw/ch7#ch_sharding)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 計劃登記, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
  - 分層儲存, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
  - 事務, [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal), [原子提交再現](/tw/ch12#sec_stream_atomic_commit)
  - 不乾淨的領導者選舉, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 使用模型檢查, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- kappa 架構, [統一批處理和流處理](/tw/ch13#id338)
- 鍵值儲存, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)
  - 與物件儲存的比較, [物件儲存](/tw/ch11#id277)
  - 記憶體, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - LSM storage, [日誌結構儲存](/tw/ch4#sec_storage_log_structured)-[磁碟空間使用](/tw/ch4#disk-space-usage)
  - 分片, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)-[偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
    - 鍵的雜湊, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash), [總結](/tw/ch7#summary)
    - 按鍵的範圍, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range), [總結](/tw/ch7#summary)
    - 偏斜和熱點, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
- Kinesis（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [使用日誌進行訊息儲存](/tw/ch12#id300)
  - 資料倉整合, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- Kryo (Java), [特定語言的格式](/tw/ch5#id96)
- ksqlDB (stream database), [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- Kubernetes（叢集管理器）, [雲服務與自託管](/tw/ch1#sec_introduction_cloud), [微服務與無伺服器](/tw/ch1#sec_introduction_microservices), [分散式作業編排](/tw/ch11#id278), [應用程式碼和狀態的分離](/tw/ch13#id344)
  - Kubeflow, [機器學習](/tw/ch11#id290)
  - kubelet, [分散式作業編排](/tw/ch11#id278)
  - Operator, [分散式作業編排](/tw/ch11#id278)
  - 使用 etcd, [請求路由](/tw/ch7#sec_sharding_routing), [協調服務](/tw/ch10#sec_consistency_coordination)
- KùzuDB (database), [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems), [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - 作為嵌入式儲存引擎, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - Cypher 查詢語言, [Cypher 查詢語言](/tw/ch3#id57)

### L

- labeled property graphs（見 property graphs）
- Lambda 架構, [統一批處理和流處理](/tw/ch13#id338)
- Lamport 時間戳, [Lamport 時間戳](/tw/ch10#lamport-timestamps)
- Lance（資料格式）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [列式儲存](/tw/ch4#sec_storage_column)
  - （另見 column-oriented storage）
- large language models (LLMs)
  - 預處理培訓資料, [機器學習](/tw/ch11#id290)
- 最後寫入勝利, [最後寫入勝利（丟棄併發寫入）](/tw/ch6#sec_replication_lww), [檢測併發寫入](/tw/ch6#sec_replication_concurrent), [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 問題, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 容易丟失更新, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
- 延遲, [延遲與響應時間](/tw/ch2#id23)
  - （另見 響應時間）
  - 跨區域, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
  - 在兩階段鎖定下的不穩定, [兩階段鎖定的效能](/tw/ch8#performance-of-two-phase-locking)
  - 網路延遲和資源利用, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
  - 透過對沖請求降低, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 響應時間對比, [延遲與響應時間](/tw/ch2#id23)
  - 尾延遲, [平均值、中位數與百分位點](/tw/ch2#id24), [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla), [本地二級索引](/tw/ch7#id166)
- law（見 legal matters）
- 分層（雲服務）, [雲服務的分層](/tw/ch1#layering-of-cloud-services)
- 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)-[邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
  - （另見 複製）
  - 故障切換, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing)
  - 處理節點斷電, [處理節點故障](/tw/ch6#sec_replication_failover)
  - 實施複製日誌
    - 資料變更捕獲, [資料變更捕獲](/tw/ch12#sec_stream_cdc)-[變更流的 API 支援](/tw/ch12#sec_stream_change_api)
      - （另見 changelogs）
    - 基於語句的, [基於語句的複製](/tw/ch6#statement-based-replication)
    - 預寫日誌（WAL）傳輸, [預寫日誌（WAL）傳輸](/tw/ch6#write-ahead-log-wal-shipping)
  - 操作的可線性, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 鎖定和領導者選舉, [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
  - 日誌序列號, [設定新的副本](/tw/ch6#sec_replication_new_replica), [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
  - 讀縮放架構, [複製延遲的問題](/tw/ch6#sec_replication_lag), [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 與共識的關係, [共識](/tw/ch10#sec_consistency_consensus), [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus), [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
  - 設立新的追隨者, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 同步與非同步, [同步複製與非同步複製](/tw/ch6#sec_replication_sync_async)-[同步複製與非同步複製](/tw/ch6#sec_replication_sync_async)
- 無領導複製, [無主複製](/tw/ch6#sec_replication_leaderless)-[版本向量](/tw/ch6#version-vectors)
  - （另見 複製）
  - 追趕丟失的寫入, [追趕錯過的寫入](/tw/ch6#sec_replication_read_repair)
  - 檢測並行寫作, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)-[版本向量](/tw/ch6#version-vectors)
    - 版本向量, [版本向量](/tw/ch6#version-vectors)
  - 多區域, [多地區操作](/tw/ch6#multi-region-operation)
  - 法定人數, [讀寫仲裁](/tw/ch6#sec_replication_quorum_condition)-[多地區操作](/tw/ch6#multi-region-operation)
    - 一致性限制, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations)-[監控陳舊性](/tw/ch6#monitoring-staleness), [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
- 閏秒, [軟體故障](/tw/ch2#software-faults), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
  - 在日曆時鐘中, [日曆時鐘](/tw/ch9#time-of-day-clocks)
- 租約, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - 使用協調服務實現, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 需要隔離（fencing）, [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing)
  - 與共識的關係, [單值共識](/tw/ch10#single-value-consensus)
- 分類賬（會計）, [總結](/tw/ch3#summary)
  - 不可改變性, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
- 遺留系統,維護, [可運維性](/tw/ch2#sec_introduction_maintainability)
- 法律事項, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)-[資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
  - 資料刪除, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [磁碟空間使用](/tw/ch4#disk-space-usage)
  - 資料儲存, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
  - 隱私監管, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [立法與自律](/tw/ch14#sec_future_legislation)
- legitimate interest (GDPR), [同意與選擇自由](/tw/ch14#id375)
- 平面壓縮, [壓實策略](/tw/ch4#sec_storage_lsm_compaction), [磁碟空間使用](/tw/ch4#disk-space-usage)
- Levenshtein 自動機, [全文檢索](/tw/ch4#sec_storage_full_text)
- 跛腳（部分失敗）, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- Linear（專案管理軟體）, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 線性代數, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 線性可縮放性, [描述負載](/tw/ch2#id33)
- 線性一致性, [複製延遲的解決方案](/tw/ch6#id131), [線性一致性](/tw/ch10#sec_consistency_linearizability)-[線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays), [術語表](/tw/glossary)
  - 和共識, [共識](/tw/ch10#sec_consistency_consensus)
  - 費用, [線性一致性的代價](/tw/ch10#sec_linearizability_cost)-[線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
    - CAP定理, [CAP 定理](/tw/ch10#the-cap-theorem)
    - memory on multi-core CPUs, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 定義, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)-[什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - ID generation, [線性一致的 ID 生成器](/tw/ch10#sec_consistency_linearizable_id)
  - 協調事務, [協調服務](/tw/ch10#sec_consistency_coordination)
  - 資料系統
    - 避免協調, [無協調資料系統](/tw/ch13#id454)
  - 不同複製方法, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)-[線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
    - 使用仲裁, [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
  - 在共識系統中讀取, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
  - 依賴, [依賴線性一致性](/tw/ch10#sec_consistency_linearizability_usage)-[跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
    - 限制和獨特性, [約束與唯一性保證](/tw/ch10#sec_consistency_uniqueness)
    - 跨渠道時間依賴性, [跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
    - 鎖定和領導者選舉, [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
  - 可序列性, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
- 連結資料, [三元組儲存與 SPARQL](/tw/ch3#id59)
- LinkedIn
  - Espresso（資料庫）, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
  - LIquid（資料庫）, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - 配置檔案（例）, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
- Linux 閏秒錯誤, [軟體故障](/tw/ch2#software-faults), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- Litestream （備份工具）, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- 活性屬性, [安全性與活性](/tw/ch9#sec_distributed_safety_liveness)
- LLVM (compiler), [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- LMDB (storage engine), [壓實策略](/tw/ch4#sec_storage_lsm_compaction), [B 樹變體](/tw/ch4#b-tree-variants), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
- 負載
  - 應付, [可伸縮性原則](/tw/ch2#id35)
  - 描述, [描述負載](/tw/ch2#id33)
- 負載平衡, [描述效能](/tw/ch2#sec_introduction_percentiles), [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
  - 硬體, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
  - 軟體, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
  - 使用訊息代理, [多個消費者](/tw/ch12#id298)
- 減載（load shedding）, [描述效能](/tw/ch2#sec_introduction_percentiles)
- 本地二級索引, [本地二級索引](/tw/ch7#id166), [總結](/tw/ch7#summary)
- 本地第一軟體, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 區域性, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships), [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality), [術語表](/tw/glossary)
  - 分批處理, [資料流引擎](/tw/ch11#sec_batch_dataflow)
  - 在狀態客戶端, [同步引擎與本地優先軟體](/tw/ch6#sec_replication_offline_clients), [有狀態、可離線的客戶端](/tw/ch13#id347)
  - 在溪流處理中, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins), [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance), [流處理器和服務](/tw/ch13#id345), [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
- 位置透明性, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
  - 在 actor 模型中, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks)
- 供應商鎖定（lock-in）, [雲服務的利弊](/tw/ch1#sec_introduction_cloud_tradeoffs)
- 鎖, [術語表](/tw/glossary)
  - 死鎖, [顯式鎖定](/tw/ch8#explicit-locking), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 分散式鎖定, [分散式鎖和租約](/tw/ch9#sec_distributed_lock_fencing)-[多副本隔離](/tw/ch9#fencing-with-multiple-replicas), [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
    - 隔離令牌（fencing token）, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
    - 使用協調服務實現, [協調服務](/tw/ch10#sec_consistency_coordination)
    - 與共識的關係, [單值共識](/tw/ch10#single-value-consensus)
  - 用於事務隔離
    - 在快照隔離中, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl)
    - in two-phase locking (2PL), [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
    - 使操作原子化, [原子寫操作](/tw/ch8#atomic-write-operations)
    - 效能, [兩階段鎖定的效能](/tw/ch8#performance-of-two-phase-locking)
    - 防止髒寫, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
    - 使用索引範圍鎖防止幻讀, [索引範圍鎖](/tw/ch8#sec_transactions_2pl_range), [檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
    - 讀取鎖（共享模式）, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
    - 共享模式和專屬模式, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 分散式事務
    - 死鎖檢測, [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
    - 持有鎖的存疑事務, [存疑時持有鎖](/tw/ch8#holding-locks-while-in-doubt)
  - 物化衝突, [物化衝突](/tw/ch8#materializing-conflicts)
  - 透過明確鎖定防止丟失更新, [顯式鎖定](/tw/ch8#explicit-locking)
- 日誌序列號, [設定新的副本](/tw/ch6#sec_replication_new_replica), [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
- 邏輯時鐘, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww), [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)-[使用邏輯時鐘強制約束](/tw/ch10#enforcing-constraints-using-logical-clocks), [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 用於最後寫入勝利, [最後寫入勝利（丟棄併發寫入）](/tw/ch6#sec_replication_lww)
  - 讀後寫入一致性, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 混合邏輯時鐘, [混合邏輯時鐘](/tw/ch10#hybrid-logical-clocks)
  - 不足以強制約束, [使用邏輯時鐘強制約束](/tw/ch10#enforcing-constraints-using-logical-clocks)
  - Lamport 時間戳, [Lamport 時間戳](/tw/ch10#lamport-timestamps)
- 邏輯複製, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
  - 用於獲取變化資料, [資料變更捕獲的實現](/tw/ch12#id307)
- LogicBlox（資料庫）, [Datalog：遞迴關係查詢](/tw/ch3#id62)
- 日誌（資料結構）, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp), [共享日誌作為共識](/tw/ch10#sec_consistency_shared_logs), [術語表](/tw/glossary)
  - （另見 shared logs）
  - 不可改變性的好處, [不可變事件的優點](/tw/ch12#sec_stream_immutability_pros)
  - 和清除的權利, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [磁碟空間使用](/tw/ch4#disk-space-usage)
  - 壓實（Compaction）, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [壓實策略](/tw/ch4#sec_storage_lsm_compaction), [日誌壓縮](/tw/ch12#sec_stream_log_compaction), [狀態、流和不變性](/tw/ch12#sec_stream_immutability)
    - 流運算子狀態, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 執行獨特性限制, [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
  - 基於日誌的資訊, [基於日誌的訊息代理](/tw/ch12#sec_stream_log)-[重播舊訊息](/tw/ch12#sec_stream_replay)
    - 比較傳統訊息, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [重播舊訊息](/tw/ch12#sec_stream_replay)
    - 偏移量, [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
    - 磁碟空間使用情況, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
    - 重播舊訊息, [重播舊訊息](/tw/ch12#sec_stream_replay), [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing), [統一批處理和流處理](/tw/ch13#id338)
    - 緩慢的消費者, [當消費者跟不上生產者時](/tw/ch12#id459)
    - 使用日誌儲存訊息, [使用日誌進行訊息儲存](/tw/ch12#id300)
  - 日誌結構儲存, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
    - log-structured merge tree（見 LSM-trees）
  - 與共識的關係, [共享日誌作為共識](/tw/ch10#sec_consistency_shared_logs)
  - 複製, [單主複製](/tw/ch6#sec_replication_leader), [複製日誌的實現](/tw/ch6#sec_replication_implementation)-[邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
    - 資料變更捕獲, [資料變更捕獲](/tw/ch12#sec_stream_cdc)-[變更流的 API 支援](/tw/ch12#sec_stream_change_api)
      - （另見 changelogs）
    - 與快照協調, [設定新的副本](/tw/ch6#sec_replication_new_replica)
    - 邏輯（基於行）複製, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
    - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)
    - 預寫日誌（WAL）傳輸, [預寫日誌（WAL）傳輸](/tw/ch6#write-ahead-log-wal-shipping)
  - 伸縮性限制, [全序的限制](/tw/ch13#id335)
- Looker（商業智慧軟體）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [分析（Analytics）](/tw/ch11#sec_batch_olap)
- 松耦合, [開展分拆工作](/tw/ch13#sec_future_unbundling_favor)
- lost updates（見 updates）
- Lotus Notes（同步引擎）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- LSM-trees (indexes), [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 與B樹的比較, [比較 B 樹與 LSM 樹](/tw/ch4#sec_storage_btree_lsm_comparison)-[磁碟空間使用](/tw/ch4#disk-space-usage)
- Lucene（儲存引擎）, [全文檢索](/tw/ch4#sec_storage_full_text)
  - 相似性搜尋, [全文檢索](/tw/ch4#sec_storage_full_text)
- LWW（見 最後寫入勝利）

### M

- 機器學習
  - 批次推論, [機器學習](/tw/ch11#id290)
  - data preparation with DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 刪除培訓資料, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
  - 部署資料產品, [超越資料湖](/tw/ch1#beyond-the-data-lake)
  - 道德考慮, [預測分析](/tw/ch14#id369)
    - （另見 ethics）
  - 特性工程, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [機器學習](/tw/ch11#id290)
  - 分析系統, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
  - 迭代處理, [機器學習](/tw/ch11#id290)
  - LLMs（見 large language models (LLMs)）
  - 培訓資料產生的模型, [應用程式碼作為派生函式](/tw/ch13#sec_future_dataflow_derivation)
  - 與批次處理的關係, [機器學習](/tw/ch11#id290)-[機器學習](/tw/ch11#id290)
  - 使用資料湖, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
  - using GPUs, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
  - 使用矩陣, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- MadSim（確定性模擬測試）, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 萬金油, [可伸縮性原則](/tw/ch2#id35)
- 可維護性, [可運維性](/tw/ch2#sec_introduction_maintainability)-[可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability), [流式系統的哲學](/tw/ch13#ch_philosophy)
  - 可演化性（見 可演化性）
  - 可操作性, [可運維性：讓運維更輕鬆](/tw/ch2#id37)
  - 簡化和管理複雜性, [簡單性：管理複雜度](/tw/ch2#id38)
- 多對多關係, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many)
  - 建模為圖, [圖資料模型](/tw/ch3#sec_datamodels_graph)
- 多對一關係, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many)
  - 在星型模式中, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- MapReduce (batch processing), [批處理](/tw/ch11#ch_batch), [MapReduce](/tw/ch11#sec_batch_mapreduce)-[MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 使用者活動活動分析（例項）, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
  - 與流處理的比較, [流處理](/tw/ch12#sec_stream_processing)
  - 不利條件和限制, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 容錯, [故障處理](/tw/ch11#id281)
  - 高階工具, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - mapper 和 reducer 函式, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 混洗資料, [混洗資料](/tw/ch11#sec_shuffle)
  - 排序合併, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
  - 工作流程, [工作流排程](/tw/ch11#sec_batch_workflows)
    - （另見 workflow engines）
- 編組（見 編碼）
- MartenDB（資料庫）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 主從複製（過時術語）, [單主複製](/tw/ch6#sec_replication_leader)
- 物化, [術語表](/tw/glossary)
  - 聚合值, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
  - 衝突, [物化衝突](/tw/ch8#materializing-conflicts)
  - 物化檢視, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
    - 作為派生資料, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
    - 在事件溯源中, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
    - 增量檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
      - （另見 incremental view maintenance (IVM)）
    - 維護,使用流處理, [維護物化檢視](/tw/ch12#sec_stream_mat_view), [表表連線（維護物化檢視）](/tw/ch12#id326)
  - 社會網路時間表例項, [時間線的物化與更新](/tw/ch2#sec_introduction_materializing)
- Materialize（資料庫）, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
  - 增量檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- 矩陣, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 稀疏, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- Maxwell（變化資料捕獲）, [資料變更捕獲的實現](/tw/ch12#id307)
- 平均值, [平均值、中位數與百分位點](/tw/ch2#id24)
- 媒體監測, [在流上搜索](/tw/ch12#id320)
- 中位數, [平均值、中位數與百分位點](/tw/ch2#id24)
- 會議室預訂（例）, [寫偏差的更多例子](/tw/ch8#more-examples-of-write-skew), [謂詞鎖](/tw/ch8#predicate-locks), [強制約束](/tw/ch13#sec_future_constraints)
- Memcached（快取伺服器）, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
- Memgraph（資料庫）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - Cypher 查詢語言, [Cypher 查詢語言](/tw/ch3#id57)
- 記憶體
  - 屏障, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 損壞, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - 記憶體資料庫, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
    - 永續性, [永續性](/tw/ch8#durability)
    - 序列事務執行, [實際序列執行](/tw/ch8#sec_transactions_serial)
  - 資料的記憶體表示, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
  - 記憶體表, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 隨機位元翻轉, [信任但驗證](/tw/ch13#sec_future_verification)
  - 索引的使用, [日誌結構儲存](/tw/ch4#sec_storage_log_structured)
- 記憶體表, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
- Mercurial（版本控制系統）, [併發控制](/tw/ch12#sec_stream_concurrency)
- 合併, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 合併排序的檔案, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [混洗資料](/tw/ch11#sec_shuffle)
- 默克爾樹, [用於可審計資料系統的工具](/tw/ch13#id366)
- Mesos（叢集管理器）, [應用程式碼和狀態的分離](/tw/ch13#id344)
- message brokers（見 messaging systems）
- message-passing（見 event-driven architecture）
- MessagePack (encoding format), [二進位制編碼](/tw/ch5#binary-encoding)
- 訊息系統, [流處理](/tw/ch12#ch_stream)-[重播舊訊息](/tw/ch12#sec_stream_replay)
  - （另見 streams）
  - 背壓、緩衝或丟棄訊息, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 無中介訊息, [直接從生產者傳遞給消費者](/tw/ch12#id296)
  - 事件日誌, [基於日誌的訊息代理](/tw/ch12#sec_stream_log)-[重播舊訊息](/tw/ch12#sec_stream_replay)
    - 作為資料模型, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
    - 比較傳統訊息, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [重播舊訊息](/tw/ch12#sec_stream_replay)
    - 偏移量, [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
    - 重播舊訊息, [重播舊訊息](/tw/ch12#sec_stream_replay), [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing), [統一批處理和流處理](/tw/ch13#id338)
    - 緩慢的消費者, [當消費者跟不上生產者時](/tw/ch12#id459)
  - 恰好一次語義, [恰好一次訊息處理](/tw/ch8#sec_transactions_exactly_once), [再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited), [容錯](/tw/ch12#sec_stream_fault_tolerance)
  - 訊息代理, [訊息代理](/tw/ch12#id433)-[確認與重新傳遞](/tw/ch12#sec_stream_reordering)
    - 確認與重新傳遞, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
    - 比較事件日誌, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [重播舊訊息](/tw/ch12#sec_stream_replay)
    - 同一主題的多個消費者, [多個消費者](/tw/ch12#id298)
    - versus RPC, [事件驅動的架構](/tw/ch5#sec_encoding_dataflow_msg)
  - 訊息丟失, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 可靠性, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 基於日誌訊息傳遞中的唯一性, [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
- 亞穩態故障, [描述效能](/tw/ch2#sec_introduction_percentiles)
- 按量計費
  - 無伺服器, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
  - 儲存, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 微批次, [微批次與存檔點](/tw/ch12#id329)
- 微服務, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
  - （另見 services）
  - 各種服務的因果關係, [全序的限制](/tw/ch13#id335)
  - 松耦合, [開展分拆工作](/tw/ch13#sec_future_unbundling_favor)
  - 與批次/流程處理器的關係, [批處理](/tw/ch11#ch_batch), [流處理器和服務](/tw/ch13#id345)
- 微軟
  - Azure Blob Storage（見 Azure Blob Storage）
  - Azure managed disks, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
  - Azure Service Bus（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [訊息代理與資料庫的對比](/tw/ch12#id297)
  - Azure SQL DB（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - Azure Storage, [物件儲存](/tw/ch11#id277)
  - Azure Stream Analytics, [流分析](/tw/ch12#id318)
  - Azure Synapse Analytics（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - 分散式元件物件模型, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
  - MSDTC (transaction coordinator), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
  - SQL Server（見 SQL Server）
- Microsoft Power BI（見 Power BI (business intelligence software)）
- 遷移（重寫）資料, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility), [不同時間寫入的不同值](/tw/ch5#different-values-written-at-different-times), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views), [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
- MinIO（物件儲存）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 移動應用程式, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
  - 嵌入式資料庫, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
- 模型檢查, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 取模運算子（%）, [雜湊取模節點數](/tw/ch7#hash-modulo-number-of-nodes)
- Mojo（程式語言）
  - 記憶體管理, [限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
- MongoDB（資料庫）
  - 聚合管道, [文件的查詢語言](/tw/ch3#query-languages-for-documents)
  - 原子操作, [原子寫操作](/tw/ch8#atomic-write-operations)
  - BSON, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - 文件資料模型, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
  - 雜湊分片, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash), [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 在雲中, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - 連線支援, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 連線（\$lookup 運算子）, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)
  - JSON Schema validation, [JSON 模式](/tw/ch5#json-schema)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - ObjectIds, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
  - 基於範圍的分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 二級索引, [本地二級索引](/tw/ch7#id166)
  - 分片分裂, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)
  - 儲存過程, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
- 監測, [雲時代的運維](/tw/ch1#sec_introduction_operations), [人類與可靠性](/tw/ch2#id31), [可運維性：讓運維更輕鬆](/tw/ch2#id37)
- 單調時鐘, [單調時鐘](/tw/ch9#monotonic-clocks)
- 單調讀, [單調讀](/tw/ch6#sec_replication_monotonic_reads)
- Morel（查詢語言）, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- MSMQ（訊息系統）, [XA 事務](/tw/ch8#xa-transactions)
- 多列索引, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- 多領導複製, [多主複製](/tw/ch6#sec_replication_multi_leader)-[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
  - （另見 複製）
  - 協作編輯, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 衝突檢測, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
  - 解決衝突, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
  - 供多區域複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc), [線性一致性的代價](/tw/ch10#sec_linearizability_cost)
  - 線性一致性，缺乏, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 可離線客戶端, [同步引擎與本地優先軟體](/tw/ch6#sec_replication_offline_clients)
  - 複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)-[不同拓撲的問題](/tw/ch6#problems-with-different-topologies)
- 多物件事務, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
  - 需求, [多物件事務的需求](/tw/ch8#sec_transactions_need)
- Multi-Paxos (consensus algorithm), [共識的實踐](/tw/ch10#sec_consistency_total_order)
- 多讀單寫鎖定, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
- 多表索引叢集表, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
- 多版本併發控制, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl), [總結](/tw/ch8#summary)
  - detecting stale MVCC reads, [檢測陳舊的 MVCC 讀取](/tw/ch8#detecting-stale-mvcc-reads)
  - 索引和快照隔離, [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 使用同步時鐘, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
- 多維陣列, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 多租戶, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute), [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 透過分片, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
  - 使用嵌入式資料庫, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 與拜占庭容錯的對比, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
- 互斥, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
  - （另見 locks）
- MySQL（資料庫）
  - archiving WAL to object stores, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 二進位制日誌座標, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307), [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 環形複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)
  - 一致的快照, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - global transaction identifiers (GTIDs), [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 在雲中, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - InnoDB storage engine（見 InnoDB）
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 多領導複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 基於行的複製, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
  - 分片（見 Vitess（資料庫））
  - 快照隔離支援, [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - （另見 InnoDB）
  - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)

### N

- N+1 query problem, [物件關係對映（ORM）](/tw/ch3#object-relational-mapping-orm)
- nanomsg（訊息庫）, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- Narayana（事務協調員）, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)
- NATS（訊息系統）, [訊息代理](/tw/ch5#message-brokers)
- 自然語言處理, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- Neo4j（資料庫）
  - Cypher 查詢語言, [Cypher 查詢語言](/tw/ch3#id57)
  - 圖表資料模型, [圖資料模型](/tw/ch3#sec_datamodels_graph)
- Neon（資料庫）, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- Nephele（資料流引擎）, [資料流引擎](/tw/ch11#sec_batch_dataflow)
- Neptune（圖資料庫）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - Cypher 查詢語言, [Cypher 查詢語言](/tw/ch3#id57)
  - SPARQL 查詢語言, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- 網碼（遊戲開發）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- Network Attached Storage (NAS), [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 網路模型（資料表示）, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
- Network Time Protocol（見 網路時間協議）
- 網路
  - 擁堵和排隊, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 資料中心網路拓撲, [雲計算與超級計算](/tw/ch1#id17)
  - faults（見 faults）
  - 線性化和網路延遲, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 網路分割槽, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
    - 在 CAP 定理中, [線性一致性的代價](/tw/ch10#sec_linearizability_cost)
  - 超時和無限制延誤, [超時和無界延遲](/tw/ch9#sec_distributed_queueing)
- NewSQL, [關係模型與文件模型](/tw/ch3#sec_datamodels_history), [複製延遲的解決方案](/tw/ch6#id131)
  - 事務和, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
- 下鍵鎖定, [索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
- NFS (network file system), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 在物件儲存中, [物件儲存](/tw/ch11#id277)
- Nimble（資料格式）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [列式儲存](/tw/ch4#sec_storage_column)
  - （另見 column-oriented storage）
- node (in graphs)（見 vertices）
- 節點（程序）, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [術語表](/tw/glossary)
  - 在基於領導器的複製中處理斷電, [處理節點故障](/tw/ch6#sec_replication_failover)
  - 失敗的系統模型, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 吵鬧的鄰居, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
- 非阻塞原子提交, [三階段提交](/tw/ch8#three-phase-commit)
- 非決定性操作, [基於語句的複製](/tw/ch6#statement-based-replication)
  - （另見 deterministic operations）
  - 在分散式系統中, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 工作流程引擎中, [持久化執行](/tw/ch5#durable-execution)
  - 部分失敗, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 非決定因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 非功能性需求, [定義非功能性需求](/tw/ch2#ch_nonfunctional), [總結](/tw/ch2#summary)
- 不可重複讀, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - （另見 讀取偏差）
- 正規化, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)-[多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many), [術語表](/tw/glossary)
  - 外來鍵引用, [多物件事務的需求](/tw/ch8#sec_transactions_need)
  - 社會網路案例研究, [社交網路案例研究中的反正規化](/tw/ch3#denormalization-in-the-social-networking-case-study)
  - 在記錄系統中, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - 與反正規化的對比, [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
- NoSQL, [關係模型與文件模型](/tw/ch3#sec_datamodels_history), [複製延遲的解決方案](/tw/ch6#id131), [分拆資料庫](/tw/ch13#sec_future_unbundling)
  - 事務和, [事務到底是什麼？](/tw/ch8#sec_transactions_overview)
- Notation3 (N3), [三元組儲存與 SPARQL](/tw/ch3#id59)
- 網路時間協議, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
  - 準確性, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy), [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 對單調時鐘的調整, [單調時鐘](/tw/ch9#monotonic-clocks)
  - 多個伺服器地址, [弱形式的謊言](/tw/ch9#weak-forms-of-lying)
- XML 與 JSON 編碼中的數字, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
- NumPy (Python library), [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes), [列式儲存](/tw/ch4#sec_storage_column)
- NVMe (Non-Volatile Memory Express)（見 solid state drives (SSDs)）

### O

- 物件資料庫, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
- 物件儲存, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [物件儲存](/tw/ch11#id277)-[物件儲存](/tw/ch11#id277)
  - Azure Blob Storage（見 Azure Blob Storage）
  - 比較分散式檔案系統, [物件儲存](/tw/ch11#id277)
  - 與鍵值儲存的比較, [物件儲存](/tw/ch11#id277)
  - 資料庫由, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 備份, [複製](/tw/ch6#ch_replication)
  - 用於雲資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
  - 資料庫複製, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - Google Cloud Storage（見 Google Cloud Storage）
  - 物件大小, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
  - S3（見 S3（物件儲存））
  - storing LSM segment files, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 支援隔離（fencing）, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
  - 資料湖中的使用, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- 物件關係對映（ORM）框架, [物件關係對映（ORM）](/tw/ch3#object-relational-mapping-orm)
  - 處理錯誤和中止事務, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
  - 不安全的讀寫週期程式碼, [原子寫操作](/tw/ch8#atomic-write-operations)
- 物件關係不匹配, [物件關係不匹配](/tw/ch3#sec_datamodels_document)
- 可觀察性, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems), [人類與可靠性](/tw/ch2#id31), [可運維性：讓運維更輕鬆](/tw/ch2#id37)
- 觀察者模式, [應用程式碼和狀態的分離](/tw/ch13#id344)
- OBT (one big table), [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics), [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- 離線系統, [批處理](/tw/ch11#ch_batch)
  - （另見 batch processing）
- 離線第一應用程式, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps), [有狀態、可離線的客戶端](/tw/ch13#id347)
- 偏移量
  - 分割槽日誌中的消費者偏移量, [消費者偏移量](/tw/ch12#sec_stream_log_offsets)
  - 分片日誌中的訊息, [使用日誌進行訊息儲存](/tw/ch12#id300)
- OLAP, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [術語表](/tw/glossary)
  - 資料立方體, [物化檢視與資料立方體](/tw/ch4#sec_storage_materialized_views)
- OLTP, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [術語表](/tw/glossary)
  - 分析查詢與, [分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 資料正規化, [正規化的權衡](/tw/ch3#trade-offs-of-normalization)
  - 工作量特點, [實際序列執行](/tw/ch8#sec_transactions_serial)
- 現場部署, [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
  - 資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 一張大表（資料倉庫模式）, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics), [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- 獨熱編碼, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 一對少關係, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
- 一對多關係, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
  - JSON representation, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
- 線上系統, [批處理](/tw/ch11#ch_batch)
  - （另見 services）
  - 相對於科學計算, [雲計算與超級計算](/tw/ch1#id17)
- 本體（ontologies）, [三元組儲存與 SPARQL](/tw/ch3#id59)
- Oozie（工作流排程器）, [批處理](/tw/ch11#ch_batch)
- OpenAPI (service definition format), [微服務與無伺服器](/tw/ch1#sec_introduction_microservices), [Web 服務](/tw/ch5#sec_web_services), [Web 服務](/tw/ch5#sec_web_services)
  - use of JSON Schema, [JSON 模式](/tw/ch5#json-schema)
- openCypher（見 Cypher（查詢語言））
- OpenLink Virtuoso（見 Virtuoso（資料庫））
- OpenStack
  - Swift（物件儲存）, [物件儲存](/tw/ch11#id277)
- 可運維性, [可運維性：讓運維更輕鬆](/tw/ch2#id37)
- 作業系統與資料庫, [分拆資料庫](/tw/ch13#sec_future_unbundling)
- 業務系統, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
  - （另見 線上事務處理）
  - 作為記錄系統, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
  - ETL into analytical systems, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- 操作變換, [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
- 運維團隊, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 運算元, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 在流處理中, [流處理](/tw/ch12#sec_stream_processing)
- 樂觀併發控制, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
- 樂觀鎖定, [條件寫入（比較並設定）](/tw/ch8#sec_transactions_compare_and_set)
- Oracle（資料庫）
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - GoldenGate (change data capture), [資料變更捕獲的實現](/tw/ch12#id307)
  - 層次查詢, [SQL 中的圖查詢](/tw/ch3#id58), [SQL 中的圖查詢](/tw/ch3#id58)
  - 缺乏可序列化, [隔離性](/tw/ch8#sec_transactions_acid_isolation)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 多主複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 多表索引叢集表, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - 無法防止寫偏差, [寫偏差的特徵](/tw/ch8#characterizing-write-skew)
  - PL/SQL language, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 防止丟失更新, [自動檢測丟失的更新](/tw/ch8#automatically-detecting-lost-updates)
  - 讀已提交隔離, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
  - Real Application Clusters (RAC), [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
  - 快照隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation), [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - TimesTen (in-memory database), [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - WAL-based replication, [預寫日誌（WAL）傳輸](/tw/ch6#write-ahead-log-wal-shipping)
- ORC（資料格式）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [列式儲存](/tw/ch4#sec_storage_column)
  - （另見 column-oriented storage）
- 編排（服務部署）, [雲服務與自託管](/tw/ch1#sec_introduction_cloud), [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
  - 批次任務執行, [分散式作業編排](/tw/ch11#id278)-[分散式作業編排](/tw/ch11#id278)
  - 工作流程引擎, [批處理](/tw/ch11#ch_batch)
- 順序
  - 事件日誌, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 全序的限制, [全序的限制](/tw/ch13#id335)
  - 邏輯時間戳, [邏輯時鐘](/tw/ch10#sec_consistency_timestamps)
  - of auto-incrementing IDs, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
  - 共享日誌, [共識的實踐](/tw/ch10#sec_consistency_total_order)-[共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
- Orkes（工作流程引擎）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- 孤兒頁面（B- 樹）, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
- 發件箱模式, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
- 異常值（響應時間）, [平均值、中位數與百分位點](/tw/ch2#id24)
- 外包, [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
- 超載, [描述效能](/tw/ch2#sec_introduction_percentiles), [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)

### P

- PACELC principle, [CAP 定理](/tw/ch10#the-cap-theorem)
- 軟體包管理器, [應用程式碼和狀態的分離](/tw/ch13#id344)
- 分組交換, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- 資料包
  - 損壞, [弱形式的謊言](/tw/ch9#weak-forms-of-lying)
  - sending via UDP, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- PageRank (algorithm), [圖資料模型](/tw/ch3#sec_datamodels_graph), [查詢語言](/tw/ch11#sec_batch_query_lanauges), [機器學習](/tw/ch11#id290)
- paging（見 virtual memory）
- pandas（Python 庫）, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes), [列式儲存](/tw/ch4#sec_storage_column), [DataFrames](/tw/ch11#id287)
- Parquet（資料格式）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [列式儲存](/tw/ch4#sec_storage_column), [歸檔儲存](/tw/ch5#archival-storage), [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - （另見 column-oriented storage）
  - 物件儲存的資料庫, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 文件資料模型, [列式儲存](/tw/ch4#sec_storage_column)
  - 批次處理中的用途, [MapReduce](/tw/ch11#sec_batch_mapreduce)
- 部分失敗, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure), [總結](/tw/ch9#summary)
  - 跛腳, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 部分同步（系統模型）, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 分割槽鍵, [分片的利與弊](/tw/ch7#sec_sharding_reasons), [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
- 分割槽（見 分片）
- Paxos（共識演算法）, [共識](/tw/ch10#sec_consistency_consensus), [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - 選票編號（ballot number）, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - Multi-Paxos, [共識的實踐](/tw/ch10#sec_consistency_total_order)
- payment card industry (PCI), [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- PCI (payment card industry) compliance, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- 百分位點, [平均值、中位數與百分位點](/tw/ch2#id24), [術語表](/tw/glossary)
  - 高效計算, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
  - 高百分數的重要性, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
  - use in service level agreements (SLAs), [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- Percolator (Google), [實現線性一致的 ID 生成器](/tw/ch10#implementing-a-linearizable-id-generator)
- Percona XtraBackup (MySQL tool), [設定新的副本](/tw/ch6#sec_replication_new_replica)
- 效能
  - 效能下降作為故障, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
  - 描述, [描述效能](/tw/ch2#sec_introduction_percentiles)
  - 分散式事務, [跨不同系統的分散式事務](/tw/ch8#sec_transactions_xa)
  - 記憶體資料庫, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - 線性一致性, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 多領導者複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
- 許可權隔離, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 永久不一致, [及時性與完整性](/tw/ch13#sec_future_integrity)
- 悲觀併發控制, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
- pglogical (PostgreSQL extension), [跨地域執行](/tw/ch6#sec_replication_multi_dc)
- pgvector（向量索引）, [向量嵌入](/tw/ch4#id92)
- 幻讀, [導致寫偏差的幻讀](/tw/ch8#sec_transactions_phantom)
  - 物化衝突, [物化衝突](/tw/ch8#materializing-conflicts)
  - 在可序列化中防止, [謂詞鎖](/tw/ch8#predicate-locks)
- physical clocks（見 clocks）
- pickle（Python）, [特定語言的格式](/tw/ch5#id96)
- Pinot（資料庫）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [列式儲存](/tw/ch4#sec_storage_column)
  - 處理寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
  - 預彙總, [分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived), [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- 流水線執行
  - 資料倉庫查詢, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- 資料透視表, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 時間點, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
- 點查詢, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
- Polaris（資料目錄）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 輪詢, [表示使用者、帖子與關注關係](/tw/ch2#id20)
- 多儲存系統（polystore）, [一切的元資料庫](/tw/ch13#id341)
- POSIX (portable operating system interface)
  - 相容 POSIX 的檔案系統, [設定新的副本](/tw/ch6#sec_replication_new_replica), [分散式檔案系統](/tw/ch11#sec_batch_dfs), [物件儲存](/tw/ch11#id277)
- 英國郵局 Horizon 醜聞, [人類與可靠性](/tw/ch2#id31)
  - 缺乏事務, [事務](/tw/ch8#ch_transactions)
- PostgreSQL（資料庫）
  - archiving WAL to object stores, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307), [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - 外部資料包裝器（foreign data wrappers）, [一切的元資料庫](/tw/ch13#id341)
  - 全文搜尋支援, [組合使用派生資料的工具](/tw/ch13#id442)
  - 在雲中, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - JSON Schema validation, [JSON 模式](/tw/ch5#json-schema)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 日誌序列號, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 邏輯解碼, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
  - 物化檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - 多主複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - MVCC implementation, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl), [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
  - 分割槽與分片的對比, [分片](/tw/ch7#ch_sharding)
  - pgvector（向量索引）, [向量嵌入](/tw/ch4#id92)
  - PL/pgSQL language, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - PostGIS geospatial indexes, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
  - 防止丟失更新, [自動檢測丟失的更新](/tw/ch8#automatically-detecting-lost-updates)
  - 防止寫偏差, [寫偏差的特徵](/tw/ch8#characterizing-write-skew), [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)
  - 讀已提交隔離, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
  - 表示圖表, [屬性圖](/tw/ch3#id56)
  - 可序列化快照隔離, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)
  - 分片（見 Citus（資料庫））
  - 快照隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation), [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - WAL-based replication, [預寫日誌（WAL）傳輸](/tw/ch6#write-ahead-log-wal-shipping)
- 倒排列表, [全文檢索](/tw/ch4#sec_storage_full_text)
  - 在分片索引中, [本地二級索引](/tw/ch7#id166)
- 無責事後分析（blameless postmortem）, [人類與可靠性](/tw/ch2#id31)
- PouchDB（資料庫）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- Power BI (business intelligence software), [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [分析（Analytics）](/tw/ch11#sec_batch_olap)
- 預彙總, [分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- 預分割（pre-splitting）, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)
- Precision Time Protocol (PTP), [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- 謂詞鎖, [謂詞鎖](/tw/ch8#predicate-locks)
- 預測分析, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics), [預測分析](/tw/ch14#id369)-[反饋迴路](/tw/ch14#id372)
  - 放大偏見, [偏見與歧視](/tw/ch14#id370)
  - ethics of（見 ethics）
  - 反饋迴路, [反饋迴路](/tw/ch14#id372)
- 搶佔, [資源分配](/tw/ch11#id279)
  - 在分散式排程器中, [故障處理](/tw/ch11#id281)
  - 執行緒, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- Prefect（工作流排程器）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows), [批處理](/tw/ch11#ch_batch), [工作流排程](/tw/ch11#sec_batch_workflows)
  - 雲資料倉整合, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- Presto（查詢引擎）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 主鍵, [多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn), [術語表](/tw/glossary)
  - 自動遞增, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
  - 與分割槽鍵的對比, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
- primary-backup replication（見 基於領導者的複製）
- 隱私, [隱私與追蹤](/tw/ch14#id373)-[立法與自律](/tw/ch14#sec_future_legislation)
  - 同意和選擇自由, [同意與選擇自由](/tw/ch14#id375)
  - 資料作為資產和權力, [資料作為資產與權力](/tw/ch14#id376)
  - 刪除資料, [不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - ethical considerations（見 ethics）
  - 立法和自律, [立法與自律](/tw/ch14#sec_future_legislation)
  - 含義, [隱私與資料使用](/tw/ch14#id457)
  - 法規, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
  - 監視, [監視](/tw/ch14#id374)
  - 跟蹤行為資料, [隱私與追蹤](/tw/ch14#id373)
- 機率演算法, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla), [流分析](/tw/ch12#id318)
- 程序暫停, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
- 處理時間（事件）, [時間推理](/tw/ch12#sec_stream_time)
- 生產者（訊息流）, [傳遞事件流](/tw/ch12#sec_stream_transmit)
- 產品分析, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
  - 面向列的儲存, [列式儲存](/tw/ch4#sec_storage_column)
- 程式語言
  - 用於儲存程式, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
- 投影（事件溯源）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- Prolog（語言）, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - （另見 Datalog）
- 屬性圖, [屬性圖](/tw/ch3#id56)
  - Cypher 查詢語言, [Cypher 查詢語言](/tw/ch3#id57)
  - Property Graph Query Language (PGQL), [SQL 中的圖查詢](/tw/ch3#id58)
- 基於屬性的測試, [人類與可靠性](/tw/ch2#id31), [形式化方法和隨機測試](/tw/ch9#sec_distributed_formal)
- Protocol Buffers（資料格式）, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)-[欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution), [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)
  - 欄位標籤與模式演化, [欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
- 資料來源, [為可審計性而設計](/tw/ch13#id365)
- 釋出/訂閱模式, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
- 釋出者（訊息流）, [傳遞事件流](/tw/ch12#sec_stream_transmit)
- Pulsar（流處理平臺）, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
- PyTorch (machine learning library), [機器學習](/tw/ch11#id290)

### Q

- Qpid（訊息系統）, [訊息代理與資料庫的對比](/tw/ch12#id297)
- quality of service (QoS), [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- Quantcast File System（分散式檔案系統）, [物件儲存](/tw/ch11#id277)
- 查詢引擎
  - 編譯與向量化, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 在雲資料倉庫中, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 運算元, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
  - 最佳化宣告式查詢, [資料模型與查詢語言](/tw/ch3#ch_datamodels)
- 查詢語言
  - Cypher, [Cypher 查詢語言](/tw/ch3#id57)
  - Datalog, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - GraphQL, [GraphQL](/tw/ch3#id63)
  - MongoDB aggregation pipeline, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization), [文件的查詢語言](/tw/ch3#query-languages-for-documents)
  - recursive SQL queries, [SQL 中的圖查詢](/tw/ch3#id58)
  - SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - SQL, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)
- 查詢最佳化器, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- 查詢計劃, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- 排隊延遲, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 頭部阻塞, [延遲與響應時間](/tw/ch2#id23)
  - 延遲和反應時間, [延遲與響應時間](/tw/ch2#id23)
- 佇列（訊息）, [訊息代理](/tw/ch5#message-brokers)
- QUIC (protocol), [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
- 法定人數, [讀寫仲裁](/tw/ch6#sec_replication_quorum_condition)-[多地區操作](/tw/ch6#multi-region-operation), [術語表](/tw/glossary)
  - 用於無主複製, [讀寫仲裁](/tw/ch6#sec_replication_quorum_condition)
  - 在共識演算法中, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - 一致性的限制, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations)-[監控陳舊性](/tw/ch6#monitoring-staleness), [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
  - 在分散式系統中作出決定, [多數派原則](/tw/ch9#sec_distributed_majority)
  - 監控陳舊性, [監控陳舊性](/tw/ch6#monitoring-staleness)
  - 多區域複製, [多地區操作](/tw/ch6#multi-region-operation)
  - 依賴耐久性, [將系統模型對映到現實世界](/tw/ch9#mapping-system-models-to-the-real-world)
- 配額, [雲時代的運維](/tw/ch1#sec_introduction_operations)

### R

- R（語言）, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes), [DataFrames](/tw/ch11#id287)
- R 樹（索引）, [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- R2（物件儲存）, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- RabbitMQ（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [訊息代理與資料庫的對比](/tw/ch12#id297)
  - 法定人數佇列（複製）, [單主複製](/tw/ch6#sec_replication_leader)
- 競態條件, [隔離性](/tw/ch8#sec_transactions_acid_isolation)
  - （另見 併發）
  - 以線性一致性避免, [跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
  - 由雙寫引起, [保持系統同步](/tw/ch12#sec_stream_sync)
  - 造成資金損失, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
  - 髒寫, [沒有髒寫](/tw/ch8#sec_transactions_dirty_write)
  - 計數器遞增, [沒有髒寫](/tw/ch8#sec_transactions_dirty_write)
  - 丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)-[衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
  - 以事件日誌防止, [併發控制](/tw/ch12#sec_stream_concurrency), [資料流：應用程式碼與狀態變化的互動](/tw/ch13#id450)
  - 以可序列隔離的方式防止, [可序列化](/tw/ch8#sec_transactions_serializability)
  - 事務隔離薄弱, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
  - 寫偏差, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts)
- Raft（共識演算法）, [共識](/tw/ch10#sec_consistency_consensus), [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 對網路問題的敏感性, [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
  - 任期, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - 用於 etcd, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
- RAID (Redundant Array of Independent Disks), [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute), [透過冗餘容忍硬體故障](/tw/ch2#tolerating-hardware-faults-through-redundancy), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 鐵路上的模式遷移, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
- RAM（見 memory）
- RAMCloud (in-memory storage), [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
- 隨機寫入（訪問模式）, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
- 範圍查詢
  - 在 B 樹上, [B 樹](/tw/ch4#sec_storage_b_trees), [讀取效能](/tw/ch4#read-performance)
  - in LSM-trees, [讀取效能](/tw/ch4#read-performance)
  - 在雜湊對映中效率低下, [日誌結構儲存](/tw/ch4#sec_storage_log_structured)
  - 與雜湊分片, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
- 排名演算法, [機器學習](/tw/ch11#id290)
- Ray（工作流排程器）, [機器學習](/tw/ch11#id290)
- RDF (Resource Description Framework), [RDF 資料模型](/tw/ch3#the-rdf-data-model)
  - querying with SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- 遠端直接記憶體訪問, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [雲計算與超級計算](/tw/ch1#id17)
- React（使用者介面庫）, [端到端的事件流](/tw/ch13#id349)
- 響應式程式設計, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 讀已提交隔離級別, [讀已提交](/tw/ch8#sec_transactions_read_committed)-[實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
  - 執行, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
  - 多版本併發控制, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl)
  - 沒有髒讀, [沒有髒讀](/tw/ch8#no-dirty-reads)
  - 沒有髒寫, [沒有髒寫](/tw/ch8#sec_transactions_dirty_write)
- 讀模型（事件溯源）, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- 讀路徑, [觀察派生資料狀態](/tw/ch13#sec_future_observing)
- 讀修復（無主複製）, [追趕錯過的寫入](/tw/ch6#sec_replication_read_repair)
  - 與線性一致性, [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
- 只讀副本（見 基於領導者的複製）
- 讀取偏差, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation), [總結](/tw/ch8#summary)
- 讀未提交隔離級別, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
- 寫後讀一致性, [讀己之寫](/tw/ch6#sec_replication_ryw), [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 跨裝置, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 在衍生資料系統中, [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions)
- 讀 - 修改 - 寫入週期, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)
- 讀擴展架構, [複製延遲的問題](/tw/ch6#sec_replication_lag), [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 與分片的對比, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
- 讀作事件, [讀也是事件](/tw/ch13#sec_future_read_events)
- 即時
  - analytics（見 product analytics）
  - 協作編輯, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 釋出/訂閱資料流, [端到端的事件流](/tw/ch13#id349)
  - 響應時間保障, [響應時間保證](/tw/ch9#sec_distributed_clocks_realtime)
  - 日曆時鐘, [日曆時鐘](/tw/ch9#time-of-day-clocks)
- Realm（資料庫）, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 分片再平衡, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)-[運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations), [術語表](/tw/glossary)
  - （另見 分片）
  - 自動或人工重新平衡, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
  - 固定數量的分片, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
  - 每個節點固定數量的分片, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - Hash mod N的問題, [雜湊取模節點數](/tw/ch7#hash-modulo-number-of-nodes)
- 新鮮度保證, [線性一致性](/tw/ch10#sec_consistency_linearizability)
- 推薦引擎, [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
  - building using DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 迭代處理, [機器學習](/tw/ch11#id290)
- 重新配置（共識）, [共識的微妙之處](/tw/ch10#subtleties-of-consensus)
- 記錄, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 流處理中的事件, [傳遞事件流](/tw/ch12#sec_stream_transmit)
- 遞迴查詢
  - 在 Cypher 中, [Cypher 查詢語言](/tw/ch3#id57)
  - 在 Datalog 中, [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - in SPARQL, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - lack of, in GraphQL, [GraphQL](/tw/ch3#id63)
  - SQL common table expressions, [SQL 中的圖查詢](/tw/ch3#id58)
- Red Hat
  - Apicurio Registry, [JSON 模式](/tw/ch5#json-schema)
- 紅黑樹, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
- 重新傳遞（訊息）, [確認與重新傳遞](/tw/ch12#sec_stream_reordering)
- Redis（資料庫）
  - 原子操作, [原子寫操作](/tw/ch8#atomic-write-operations)
  - CRDT support, [CRDT 與操作變換](/tw/ch6#sec_replication_crdts)
  - 永續性, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - Lua 指令碼, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 多領導複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 每核一程序模型, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
  - 單執行緒執行, [實際序列執行](/tw/ch8#sec_transactions_serial)
- redo log（見 write-ahead log）
- Redpanda（訊息系統）, [訊息代理](/tw/ch5#message-brokers), [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 分層儲存, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- Redshift（資料庫）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 冗餘
  - 硬體元件, [透過冗餘容忍硬體故障](/tw/ch2#tolerating-hardware-faults-through-redundancy)
  - 生成資料, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived)
    - （另見 衍生資料）
- Reed--Solomon codes (error correction), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 重構, [可演化性：讓變化更容易](/tw/ch2#sec_introduction_evolvability)
  - （另見 可演化性）
- 地域（地理分佈）, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - （另見 datacenters）
  - 跨地域共識, [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
  - 定義, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 延遲, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
  - linearizable ID generation, [實現線性一致的 ID 生成器](/tw/ch10#implementing-a-linearizable-id-generator)
  - 在整個區域複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)-[不同拓撲的問題](/tw/ch6#problems-with-different-topologies), [線性一致性的代價](/tw/ch10#sec_linearizability_cost), [全序的限制](/tw/ch13#id335)
    - 無主（無領導者）, [多地區操作](/tw/ch6#multi-region-operation)
    - 多領導者, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
- 區域（分片）, [分片](/tw/ch7#ch_sharding)
- 暫存器, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
- regulation（見 legal matters）
- 關係資料模型, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [關係模型與文件模型](/tw/ch3#sec_datamodels_history)-[文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 與文件模型的比較, [何時使用哪種模型](/tw/ch3#sec_datamodels_document_summary)-[文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - graph queries in SQL, [SQL 中的圖查詢](/tw/ch3#id58)
  - 記憶體資料庫, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - 多對一與多對多關係, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many)
  - 多物件事務, 需要, [多物件事務的需求](/tw/ch8#sec_transactions_need)
  - 物件關係不匹配, [物件關係不匹配](/tw/ch3#sec_datamodels_document)
  - 代表可重排列表, [何時使用哪種模型](/tw/ch3#sec_datamodels_document_summary)
  - 與文件模型的對比
    - 模型的融合, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
    - 資料區域性, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
- 關係資料庫
  - 最終一致性, [複製延遲的問題](/tw/ch6#sec_replication_lag)
  - 歷史, [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 邏輯日誌, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
  - 與 Unix 哲學的比較, [分拆資料庫](/tw/ch13#sec_future_unbundling), [一切的元資料庫](/tw/ch13#id341)
  - 模式變更, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility), [編碼與演化](/tw/ch5#ch_encoding), [不同時間寫入的不同值](/tw/ch5#different-values-written-at-different-times)
  - 分片二級索引, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)
  - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)
  - B 樹索引的使用, [B 樹](/tw/ch4#sec_storage_b_trees)
- relationships（見 edges）
- 可靠性, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)-[人類與可靠性](/tw/ch2#id31), [流式系統的哲學](/tw/ch13#ch_philosophy)
  - 從不可靠的元件建立可靠的系統, [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 硬體故障, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - 人類錯誤, [人類與可靠性](/tw/ch2#id31)
  - 重要性, [人類與可靠性](/tw/ch2#id31)
  - 訊息傳遞系統, [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 軟體故障, [軟體故障](/tw/ch2#software-faults)
- Remote Method Invocation (Java RMI), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
- remote procedure calls (RPCs), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)-[RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - （另見 services）
  - 資料編碼和演化, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - 問題, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
  - 使用 Avro, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
  - 與訊息代理的對比, [事件驅動的架構](/tw/ch5#sec_encoding_dataflow_msg)
- 可再生能源, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- 可重複讀（事務隔離）, [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- 副本, [單主複製](/tw/ch6#sec_replication_leader)
- 複製, [複製](/tw/ch6#ch_replication)-[總結](/tw/ch6#summary), [術語表](/tw/glossary)
  - 永續性, [永續性](/tw/ch8#durability)
  - 解決衝突, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
  - 一致性屬性, [複製延遲的問題](/tw/ch6#sec_replication_lag)-[複製延遲的解決方案](/tw/ch6#id131)
    - 一致字首讀, [一致字首讀](/tw/ch6#sec_replication_consistent_prefix)
    - 單調讀, [單調讀](/tw/ch6#sec_replication_monotonic_reads)
    - 讀己之寫, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 在分散式檔案系統中, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 無主（無領導者）, [無主複製](/tw/ch6#sec_replication_leaderless)-[版本向量](/tw/ch6#version-vectors)
    - 檢測併發寫入, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)-[版本向量](/tw/ch6#version-vectors)
    - 法定人數一致性的限制, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations)-[監控陳舊性](/tw/ch6#monitoring-staleness), [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
  - 監測停滯情況, [監控陳舊性](/tw/ch6#monitoring-staleness)
  - 多領導者, [多主複製](/tw/ch6#sec_replication_multi_leader)-[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
    - 跨多個區域, [跨地域執行](/tw/ch6#sec_replication_multi_dc), [線性一致性的代價](/tw/ch10#sec_linearizability_cost)
    - 解決衝突, [處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)-[處理寫入衝突](/tw/ch6#sec_replication_write_conflicts)
    - 複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)-[不同拓撲的問題](/tw/ch6#problems-with-different-topologies)
  - 使用原因, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [複製](/tw/ch6#ch_replication)
  - 與分片, [分片](/tw/ch7#ch_sharding)
  - 單主, [單主複製](/tw/ch6#sec_replication_leader)-[邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
    - 故障切換, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
    - 實施複製日誌, [複製日誌的實現](/tw/ch6#sec_replication_implementation)-[邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
    - 與協商一致的關係, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus), [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
    - 設立新的追隨者, [設定新的副本](/tw/ch6#sec_replication_new_replica)
    - 同步與非同步, [同步複製與非同步複製](/tw/ch6#sec_replication_sync_async)-[同步複製與非同步複製](/tw/ch6#sec_replication_sync_async)
  - 狀態機複製, [基於語句的複製](/tw/ch6#statement-based-replication), [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs), [使用共享日誌](/tw/ch10#sec_consistency_smr), [資料庫與流](/tw/ch12#sec_stream_databases)
    - 事件溯源, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
    - 依賴確定性, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 利用協商一致, [共識的利弊](/tw/ch10#pros-and-cons-of-consensus)
  - 使用擦除編碼, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 使用物件儲存, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 與備份的對比, [複製](/tw/ch6#ch_replication)
  - 具有多樣化資料系統, [保持系統同步](/tw/ch12#sec_stream_sync)
- replication logs（見 logs）
- representations of data（見 data models）
- 重新處理資料, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing), [統一批處理和流處理](/tw/ch13#id338)
  - （另見 可演化性）
  - 來自基於日誌的訊息傳遞, [重播舊訊息](/tw/ch12#sec_stream_replay)
- 請求對沖, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
- 請求識別符號, [操作識別符號](/tw/ch13#id355), [多分割槽請求處理](/tw/ch13#id360)
- 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)-[請求路由](/tw/ch7#sec_sharding_routing)
  - 方法, [請求路由](/tw/ch7#sec_sharding_routing)
- 資料居住法, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed), [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 彈性系統, [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
  - （另見 fault tolerance）
- 資源隔離, [雲計算與超級計算](/tw/ch1#id17), [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 資源限制, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 響應時間
  - 作為效能指標, [描述效能](/tw/ch2#sec_introduction_percentiles), [批處理](/tw/ch11#ch_batch)
  - 保證, [響應時間保證](/tw/ch9#sec_distributed_clocks_realtime)
  - 對使用者的影響, [平均值、中位數與百分位點](/tw/ch2#id24)
  - 在複製系統中, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 與延遲的對比, [延遲與響應時間](/tw/ch2#id23)
  - 平均值和百分位數, [平均值、中位數與百分位點](/tw/ch2#id24)
  - 使用者體驗, [平均值、中位數與百分位點](/tw/ch2#id24)
- 責任和問責制, [責任與問責](/tw/ch14#id371)
- 表述性狀態傳遞, [Web 服務](/tw/ch5#sec_web_services)
  - （另見 services）
- Restate（工作流程引擎）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- RethinkDB（資料庫）
  - 連線（join）支援, [文件和關係資料庫的融合](/tw/ch3#convergence-of-document-and-relational-databases)
  - 鍵範圍分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
- 重試風暴, [描述效能](/tw/ch2#sec_introduction_percentiles), [軟體故障](/tw/ch2#software-faults)
- reverse ETL, [超越資料湖](/tw/ch1#beyond-the-data-lake)
- Riak（資料庫）
  - CRDT support, [CRDT 與操作變換](/tw/ch6#sec_replication_crdts), [檢測併發寫入](/tw/ch6#sec_replication_concurrent)
  - 帶點版本向量, [版本向量](/tw/ch6#version-vectors)
  - 流言協議, [請求路由](/tw/ch7#sec_sharding_routing)
  - 雜湊分片, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
  - 無主複製, [無主複製](/tw/ch6#sec_replication_leaderless)
  - 線性一致性，缺乏, [線性一致性與仲裁](/tw/ch10#sec_consistency_quorum_linearizable)
  - 多地區支援, [多地區操作](/tw/ch6#multi-region-operation)
  - 再平衡, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
  - 二級索引, [本地二級索引](/tw/ch7#id166)
  - 寬鬆仲裁（sloppy quorum）, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 虛擬節點（分片）, [分片](/tw/ch7#ch_sharding)
- 環形緩衝區, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- RisingWave（資料庫）
  - 增量檢視維護, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
- 火箭, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
- RocksDB (storage engine), [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 作為嵌入式儲存引擎, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 分層壓實, [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- 回滾（事務）, [事務](/tw/ch8#ch_transactions)
- 滾動升級, [透過冗餘容忍硬體故障](/tw/ch2#tolerating-hardware-faults-through-redundancy), [編碼與演化](/tw/ch5#ch_encoding), [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
  - 在多種租戶系統中, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- routing（見 request routing）
- 基於行的複製, [邏輯（基於行）日誌複製](/tw/ch6#logical-row-based-log-replication)
- 面向行儲存, [列式儲存](/tw/ch4#sec_storage_column)
- Rowhammer（記憶體損壞）, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
- RPCs（見 remote procedure calls）
- 規則（Datalog）, [Datalog：遞迴關係查詢](/tw/ch3#id62)
- Rust（程式語言）
  - 記憶體管理, [限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)

### S

- S3（物件儲存）, [雲服務的分層](/tw/ch1#layering-of-cloud-services), [設定新的副本](/tw/ch6#sec_replication_new_replica), [批處理](/tw/ch11#ch_batch), [分散式檔案系統](/tw/ch11#sec_batch_dfs), [物件儲存](/tw/ch11#id277)
  - 檢查資料完整性, [不要盲目信任承諾](/tw/ch13#id364)
  - 有條件寫入, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
  - 物件大小, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
  - S3 Express One Zone, [物件儲存](/tw/ch11#id277), [物件儲存](/tw/ch11#id277)
  - use in MapReduce, [MapReduce](/tw/ch11#sec_batch_mapreduce)
  - 工作流程示例, [工作流排程](/tw/ch11#sec_batch_workflows)
- SaaS（見 軟體即服務（SaaS））
- 安全性與活性, [安全性與活性](/tw/ch9#sec_distributed_safety_liveness)
  - 在共識演算法中, [單值共識](/tw/ch10#single-value-consensus)
  - 事務中, [事務](/tw/ch8#ch_transactions)
- sagas（見 compensating transactions）
- Samza （流處理器）, [流分析](/tw/ch12#id318)
- SAP HANA（資料庫）, [分析型資料儲存](/tw/ch4#sec_storage_analytics)
- 可伸縮性, [可伸縮性](/tw/ch2#sec_introduction_scalability)-[可伸縮性原則](/tw/ch2#id35), [流式系統的哲學](/tw/ch13#ch_philosophy)
  - 自動縮放, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
  - 透過分片, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
  - 描述負載, [描述負載](/tw/ch2#id33)
  - 描述效能, [描述效能](/tw/ch2#sec_introduction_percentiles)
  - 線性, [描述負載](/tw/ch2#id33)
  - 原則, [可伸縮性原則](/tw/ch2#id35)
  - 複製和, [複製延遲的問題](/tw/ch6#sec_replication_lag)
  - 縱向擴展與橫向擴展, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing)
- 橫向擴展, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing)
  - （另見 shared-nothing architecture）
  - 透過分片, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
- 縱向擴展, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing)
- 緩慢變化的維度, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
- 排程
  - 演算法, [資源分配](/tw/ch11#id279)
  - 批次任務, [分散式作業編排](/tw/ch11#id278)-[工作流排程](/tw/ch11#sec_batch_workflows)
  - 幫派列表, [資源分配](/tw/ch11#id279)
- 閱讀時的圖謀, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
  - 與可變方案比較, [模式的優點](/tw/ch5#sec_encoding_schemas)
- 拼寫圖, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
- schemaless databases（見 schema-on-read）
- 模式, [術語表](/tw/glossary)
  - Avro, [Avro](/tw/ch5#sec_encoding_avro)-[動態生成的模式](/tw/ch5#dynamically-generated-schemas)
    - 讀取者確定寫入者模式, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
    - 模式演化, [寫入者模式與讀取者模式](/tw/ch5#the-writers-schema-and-the-readers-schema)
  - 動態生成, [動態生成的模式](/tw/ch5#dynamically-generated-schemas)
  - 變化, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
    - 影響應用程式程式碼, [編碼與演化](/tw/ch5#ch_encoding)
    - 相容性檢查, [但什麼是寫入者模式？](/tw/ch5#but-what-is-the-writers-schema)
    - 資料庫中, [流經資料庫的資料流](/tw/ch5#sec_encoding_dataflow_db)-[歸檔儲存](/tw/ch5#archival-storage)
    - 服務電話, [RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - 檔案模式的靈活性, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
  - 用於分析, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)-[星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
  - for JSON and XML, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json), [JSON 模式](/tw/ch5#json-schema)
  - generation and migration using ORMs, [物件關係對映（ORM）](/tw/ch3#object-relational-mapping-orm)
  - 優點, [模式的優點](/tw/ch5#sec_encoding_schemas)
  - 遷移, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
  - Protocol Buffers, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)-[欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
    - 模式演化, [欄位標籤與模式演化](/tw/ch5#field-tags-and-schema-evolution)
  - 鐵路上的模式遷移, [應用演化後重新處理資料](/tw/ch13#sec_future_reprocessing)
  - 傳統的設計方法,謬誤, [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views)
- 科學計算, [雲計算與超級計算](/tw/ch1#id17)
- scikit-learn （Python 圖書館）, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- ScyllaDB（資料庫）
  - 叢集元資料, [請求路由](/tw/ch7#sec_sharding_routing)
  - consistency level ANY, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
  - 雜湊分片, [按鍵的雜湊分片](/tw/ch7#sec_sharding_hash), [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 最後寫成的解決衝突, [檢測併發寫入](/tw/ch6#sec_replication_concurrent)
  - 無領導複製, [無主複製](/tw/ch6#sec_replication_leaderless)
  - 輕量事務, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 線性,缺少, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 日誌結構儲存, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 多區域支助, [多地區操作](/tw/ch6#multi-region-operation)
  - 使用時鐘, [仲裁一致性的侷限](/tw/ch6#sec_replication_quorum_limitations), [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 節點（硬化）, [分片](/tw/ch7#ch_sharding)
- search engines（見 全文檢索）
- 搜尋流, [在流上搜索](/tw/ch12#id320)
- 備庫（見 基於領導者的複製）
- 二級索引, [多列索引與二級索引](/tw/ch4#sec_storage_index_multicolumn), [術語表](/tw/glossary)
  - 多對多關係, [多對一與多對多關係](/tw/ch3#sec_datamodels_many_to_many)
  - 雙寫問題, [保持系統同步](/tw/ch12#sec_stream_sync), [理解資料流](/tw/ch13#id443)
  - 分片, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)-[全域性二級索引](/tw/ch7#id167), [總結](/tw/ch7#summary)
    - 全球, [全域性二級索引](/tw/ch7#id167)
    - 指數維護, [維護派生狀態](/tw/ch13#id446)
    - 當地, [本地二級索引](/tw/ch7#id166)
  - 更新、事務隔離和, [多物件事務的需求](/tw/ch8#sec_transactions_need)
- 二次排序, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
- sed （Unix 工具） （英語）., [簡單日誌分析](/tw/ch11#sec_batch_log_analysis)
- 自我託管, [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
  - 資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- 自連線, [本章小結](/tw/ch12#id332)
- 自動驗證系統, [不要盲目信任承諾](/tw/ch13#id364)
- 語義搜尋, [向量嵌入](/tw/ch4#id92)
- 語義相似性, [向量嵌入](/tw/ch4#id92)
- 語義網, [三元組儲存與 SPARQL](/tw/ch3#id59)
- 半同步複製, [同步複製與非同步複製](/tw/ch6#sec_replication_sync_async)
- 順序寫（訪問模式）, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
- 可序列化, [隔離性](/tw/ch8#sec_transactions_acid_isolation), [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels), [可序列化](/tw/ch8#sec_transactions_serializability)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation), [術語表](/tw/glossary)
  - 線性比對, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 悲觀與樂觀的併發控制, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 序列執行, [實際序列執行](/tw/ch8#sec_transactions_serial)-[序列執行總結](/tw/ch8#summary-of-serial-execution)
    - 分片, [分片](/tw/ch8#sharding)
    - 使用儲存程式, [將事務封裝在儲存過程中](/tw/ch8#encapsulating-transactions-in-stored-procedures), [使用共享日誌](/tw/ch10#sec_consistency_smr)
  - 可序列化快照隔離, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - detecting stale MVCC reads, [檢測陳舊的 MVCC 讀取](/tw/ch8#detecting-stale-mvcc-reads)
    - 檢測影響先前讀取的寫入, [檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
    - 分散式執行, [可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
    - performance of SSI, [可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - 防止寫入skew, [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)-[檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
  - 嚴格的序列性, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
    - 及時性與完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 兩階段鎖定, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
    - 索引範圍鎖定, [索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
    - 效能, [兩階段鎖定的效能](/tw/ch8#performance-of-two-phase-locking)
- 可序列化, [特定語言的格式](/tw/ch5#id96)
- 序列化, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
  - （另見 編碼）
- 無伺服器, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
- 服務發現, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery), [請求路由](/tw/ch7#sec_sharding_routing), [服務發現](/tw/ch10#service-discovery)
  - 登記, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
  - using DNS, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery), [請求路由](/tw/ch7#sec_sharding_routing), [服務發現](/tw/ch10#service-discovery)
- 服務級別協議（SLA）, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla), [描述負載](/tw/ch2#id33)
- 服務網格, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery)
- Service Organization Control (SOC), [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- 服務時間, [延遲與響應時間](/tw/ch2#id23)
- 面向服務的體系結構, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
  - （另見 services）
- 服務, [流經服務的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)-[RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
  - 微服務, [微服務與無伺服器](/tw/ch1#sec_introduction_microservices)
    - 各種服務的因果關係, [全序的限制](/tw/ch13#id335)
    - 松耦合, [開展分拆工作](/tw/ch13#sec_future_unbundling_favor)
  - 與批次/流程處理器的關係, [批處理](/tw/ch11#ch_batch), [流處理器和服務](/tw/ch13#id345)
  - remote procedure calls (RPCs), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)-[RPC 的資料編碼與演化](/tw/ch5#data-encoding-and-evolution-for-rpc)
    - 問題, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
  - 與資料庫相似, [流經服務的資料流：REST 與 RPC](/tw/ch5#sec_encoding_dataflow_rpc)
  - 網路服務, [Web 服務](/tw/ch5#sec_web_services)
- 會話視窗（流處理）, [視窗的型別](/tw/ch12#id324)
  - （另見 windows）
- 分片, [分片](/tw/ch7#ch_sharding)-[總結](/tw/ch7#summary), [術語表](/tw/glossary)
  - 和共識, [使用共享日誌](/tw/ch10#sec_consistency_smr)
  - 複製, [分片](/tw/ch7#ch_sharding)
  - 分散事務, [分散式事務](/tw/ch8#sec_transactions_distributed)
  - 熱分片, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
  - 分批處理, [批處理](/tw/ch11#ch_batch)
  - 鍵程分割, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)
  - 多硬性操作, [多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
    - 執行限制, [多分割槽請求處理](/tw/ch13#id360)
    - 二級指數維護, [維護派生狀態](/tw/ch13#id446)
  - 關鍵值資料, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)-[偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
    - 按金鑰範圍, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
    - 搖擺和熱點, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
  - 詞源, [分片](/tw/ch7#ch_sharding)
  - 分割槽鍵, [分片的利與弊](/tw/ch7#sec_sharding_reasons), [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
  - 再平衡
    - 金鑰範圍壓縮資料, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)
  - 重新平衡困難, [重新平衡鍵範圍分片資料](/tw/ch7#rebalancing-key-range-sharded-data)-[運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
    - 自動或人工重新平衡, [運維：自動/手動再平衡](/tw/ch7#sec_sharding_operations)
    - Hash mod N的問題, [雜湊取模節點數](/tw/ch7#hash-modulo-number-of-nodes)
    - 使用固定的碎片數, [固定數量的分片](/tw/ch7#fixed-number-of-shards)
    - 使用 N 個節點, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)-[請求路由](/tw/ch7#sec_sharding_routing)
  - 二級索引, [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)-[全域性二級索引](/tw/ch7#id167)
    - 全球, [全域性二級索引](/tw/ch7#id167)
    - 當地, [本地二級索引](/tw/ch7#id166)
  - 連續執行事務和, [分片](/tw/ch8#sharding)
  - 正在排序硬化資料, [混洗資料](/tw/ch11#sec_shuffle)
- 共享日誌, [共識的實踐](/tw/ch10#sec_consistency_total_order)-[共識的利弊](/tw/ch10#pros-and-cons-of-consensus), [全序的限制](/tw/ch13#id335), [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
  - 演算法, [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - 用於事件原始碼, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 用於通訊, [基於日誌的訊息代理](/tw/ch12#sec_stream_log)-[重播舊訊息](/tw/ch12#sec_stream_replay)
  - 與協商一致的關係, [共享日誌作為共識](/tw/ch10#sec_consistency_shared_logs)
  - 使用, [使用共享日誌](/tw/ch10#sec_consistency_smr)
- 共享模式, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
- 共享磁碟架構, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 共享記憶體架構, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing)
- 無共享架構, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing), [術語表](/tw/glossary)
  - 分散式檔案系統, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
    - （另見 distributed filesystems）
  - 網路的使用, [不可靠的網路](/tw/ch9#sec_distributed_networks)
- 鯊魚
  - 咬海底電纜, [實踐中的網路故障](/tw/ch9#sec_distributed_network_faults)
  - 計數（例）, [文件的查詢語言](/tw/ch3#query-languages-for-documents)
- shredding (deletion)（見 crypto-shredding）
- 粉碎（專欄編碼）, [列式儲存](/tw/ch4#sec_storage_column)
- 粉碎（相關模型）, [何時使用哪種模型](/tw/ch3#sec_datamodels_document_summary)
- 混洗, [混洗資料](/tw/ch11#sec_shuffle)-[混洗資料](/tw/ch11#sec_shuffle)
- 兄弟, [手動衝突解決](/tw/ch6#manual-conflict-resolution), [捕獲先發生關係](/tw/ch6#capturing-the-happens-before-relationship), [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
  - （另見 conflicts）
- 倉, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- 相似性搜尋
  - 編輯距離, [全文檢索](/tw/ch4#sec_storage_full_text)
  - 基因組資料, [總結](/tw/ch3#summary)
- 簡單, [簡單性：管理複雜度](/tw/ch2#id38)
- Singer, [資料倉庫](/tw/ch1#sec_introduction_dwh)
- single-instruction-multi-data (SIMD) instructions, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- single-leader replication（見 基於領導者的複製）
- 單執行緒執行, [原子寫操作](/tw/ch8#atomic-write-operations), [實際序列執行](/tw/ch8#sec_transactions_serial)
  - 在流處理中, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [併發控制](/tw/ch12#sec_stream_concurrency), [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
- SingleStore（資料庫）
  - 記憶體儲, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
- 站點可靠性工程師, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 大小級緊湊, [壓實策略](/tw/ch4#sec_storage_lsm_compaction), [磁碟空間使用](/tw/ch4#disk-space-usage)
- 偏斜, [術語表](/tw/glossary)
  - 時鐘搖擺, [對同步時鐘的依賴](/tw/ch9#sec_distributed_clocks_relying)-[帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval), [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 事務隔離
    - 讀取偏差, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation), [總結](/tw/ch8#summary)
    - 寫偏差, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts), [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)-[檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
      - （另見 寫偏差）
  - 含義, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - 不平衡的工作量, [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
    - 補償, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
    - 由於名人, [偏斜的工作負載與緩解熱點](/tw/ch7#sec_sharding_skew)
    - 時間序列資料, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
- 跳過列表, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
- SLA（見 服務級別協議（SLA））
- Slack（分組聊天）
  - GraphQL example, [GraphQL](/tw/ch3#id63)
- SlateDB（資料庫）, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [設定新的副本](/tw/ch6#sec_replication_new_replica)
- 滑動視窗（流處理）, [視窗的型別](/tw/ch12#id324)
  - （另見 windows）
- 草率法定人數, [單主與無主複製的效能](/tw/ch6#sec_replication_leaderless_perf)
- 緩慢變化的維度, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
- 塗抹（傾斜秒調整）, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- 快照（資料庫）
  - 作為備份, [複製](/tw/ch6#ch_replication)
  - 計算衍生資料, [建立索引](/tw/ch13#id340)
  - 變化資料捕獲中, [初始快照](/tw/ch12#sec_stream_cdc_snapshot)
  - 可序列化快照隔離, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
  - 新建複製品, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 快速隔離和可重複讀取, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)-[快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - implementing with MVCC, [多版本併發控制（MVCC）](/tw/ch8#sec_transactions_snapshot_impl)
    - indexes and MVCC, [索引與快照隔離](/tw/ch8#indexes-and-snapshot-isolation)
    - 可見度規則, [觀察一致快照的可見性規則](/tw/ch8#sec_transactions_mvcc_visibility)
  - 全球快照同步時鐘, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
- Snowflake（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native), [雲服務的分層](/tw/ch1#layering-of-cloud-services), [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [批處理](/tw/ch11#ch_batch)
  - 面向列的儲存, [列式儲存](/tw/ch4#sec_storage_column)
  - 處理寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
  - 硬化和叢集, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - Snowpark, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
- Snowflake (ID generator), [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
- 雪花模式, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- SOAP (web services), [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
- SOC2（見 Service Organization Control (SOC)）
- 社會圖表, [圖資料模型](/tw/ch3#sec_datamodels_graph)
- 社會
  - 的責任, [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance), [立法與自律](/tw/ch14#sec_future_legislation)
- 社會技術系統, [人類與可靠性](/tw/ch2#id31)
- 軟體即服務（SaaS）, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs), [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
  - ETL from, [資料倉庫](/tw/ch1#sec_introduction_dwh)
  - 多重租賃, [面向多租戶的分片](/tw/ch7#sec_sharding_multitenancy)
- 軟體錯誤, [軟體故障](/tw/ch2#software-faults)
  - 維護誠信, [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
- 太陽風暴, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
- solid state drives (SSDs)
  - 訪問模式, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
  - 比較物件儲存, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 偵查腐敗, [端到端原則](/tw/ch13#sec_future_e2e_argument), [不要盲目信任承諾](/tw/ch13#id364)
  - 失敗率, [硬體與軟體故障](/tw/ch2#sec_introduction_hardware_faults)
  - 錯誤在, [永續性](/tw/ch8#durability)
  - 韌體錯誤, [軟體故障](/tw/ch2#software-faults)
  - 讀取吞吐量, [讀取效能](/tw/ch4#read-performance)
  - 順序對隨機寫入, [順序與隨機寫入](/tw/ch4#sidebar_sequential)
- Solr （搜尋伺服器）
  - 本地二級指數, [本地二級索引](/tw/ch7#id166)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 使用 Lucene, [全文檢索](/tw/ch4#sec_storage_full_text)
- 排序（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [排序與記憶體聚合](/tw/ch11#id275), [分散式作業編排](/tw/ch11#id278)
- 排序歸併連線（MapReduce）, [JOIN 與 GROUP BY](/tw/ch11#sec_batch_join)
- Sorted String Tables（見 SSTables）
- 排序
  - 列儲存中的排序順序, [列儲存中的排序順序](/tw/ch4#sort-order-in-column-storage)
- 真相來源（權威資料來源）（見 systems of record）
- Spanner（資料庫）
  - 一致性模式, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 資料位置, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - 在雲層中, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native)
  - 使用時鐘快照隔離, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 事務, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
  - TrueTime API, [帶置信區間的時鐘讀數](/tw/ch9#clock-readings-with-a-confidence-interval)
- Spark（處理框架）, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake), [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native), [批處理](/tw/ch11#ch_batch), [資料流引擎](/tw/ch11#sec_batch_dataflow)
  - 成本效率, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes), [DataFrames](/tw/ch11#id287)
  - 過失容忍, [故障處理](/tw/ch11#id281)
  - 資料倉庫, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [協調服務](/tw/ch10#sec_consistency_coordination)
  - MLlib, [機器學習](/tw/ch11#id290)
  - 查詢最佳化器, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 移動資料, [混洗資料](/tw/ch11#sec_shuffle)
  - Spark Streaming, [流分析](/tw/ch12#id318)
    - 微批次, [微批次與存檔點](/tw/ch12#id329)
  - streaming SQL support, [複合事件處理](/tw/ch12#id317)
  - 用於 ETL, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
- SPARQL（查詢語言）, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- 稀疏索引, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)
- 稀疏矩陣, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 腦裂, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover), [請求路由](/tw/ch7#sec_sharding_routing), [術語表](/tw/glossary)
  - 執行限制, [唯一性約束需要達成共識](/tw/ch13#id452)
  - 在共識演算法中, [共識](/tw/ch10#sec_consistency_consensus), [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
  - 預防, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 使用柵欄標誌來避免, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)-[多副本隔離](/tw/ch9#fencing-with-multiple-replicas)
- 現場例項, [故障處理](/tw/ch11#id281)
- 電子表格, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs), [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 資料流程式設計, [圍繞資料流設計應用](/tw/ch13#sec_future_dataflow)
  - 樞軸表, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- SQL (Structured Query Language), [簡單性：管理複雜度](/tw/ch2#id38), [關係模型與文件模型](/tw/ch3#sec_datamodels_history), [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 用於分析, [資料倉庫](/tw/ch1#sec_introduction_dwh), [列式儲存](/tw/ch4#sec_storage_column)
  - 圖表查詢, [SQL 中的圖查詢](/tw/ch3#id58)
  - 隔離級別標準,問題, [快照隔離、可重複讀和命名混淆](/tw/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - 加入, [正規化、反正規化與連線](/tw/ch3#sec_datamodels_normalization)
  - 簡歷（例）, [用於一對多關係的文件資料模型](/tw/ch3#the-document-data-model-for-one-to-many-relationships)
  - 社會網路家庭時間表（例）, [表示使用者、帖子與關注關係](/tw/ch2#id20)
  - SQL injection vulnerability, [拜占庭故障](/tw/ch9#sec_distributed_byzantine)
  - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)
  - 儲存程式, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 批次處理框架中的支援, [批處理](/tw/ch11#ch_batch)
  - 檢視, [Datalog：遞迴關係查詢](/tw/ch3#id62)
- SQL Server（資料庫）
  - archiving WAL to object stores, [設定新的副本](/tw/ch6#sec_replication_new_replica)
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307)
  - 資料儲存支援, [分析型資料儲存](/tw/ch4#sec_storage_analytics)
  - 分散式事務支援, [XA 事務](/tw/ch8#xa-transactions)
  - 基於領導者的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 多領導複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 防止丟失更新, [自動檢測丟失的更新](/tw/ch8#automatically-detecting-lost-updates)
  - 防止寫入skew, [寫偏差的特徵](/tw/ch8#characterizing-write-skew), [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 讀作承諾隔離, [實現讀已提交](/tw/ch8#sec_transactions_read_committed_impl)
  - 可序列隔離, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 快速隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - T-SQL language, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
- SQLite（資料庫）, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems), [壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - archiving WAL to object stores, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- SRE (site reliability engineer), [雲時代的運維](/tw/ch1#sec_introduction_operations)
- SSDs（見 solid state drives）
- SSTables (storage format), [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
  - 建造和維護, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - making LSM-Tree from, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
- 階段釋出（見 rolling upgrades）
- 停滯（舊資料）, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 跨渠道時間依賴性, [跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
  - 無頭資料庫中, [當節點故障時寫入資料庫](/tw/ch6#id287)
  - 多轉換併發控制, [檢測陳舊的 MVCC 讀取](/tw/ch8#detecting-stale-mvcc-reads)
  - 監測, [監控陳舊性](/tw/ch6#monitoring-staleness)
  - 客戶端狀態, [將狀態變更推送給客戶端](/tw/ch13#id348)
  - 相對線性, [線性一致性](/tw/ch10#sec_consistency_linearizability)
  - 相對於及時性, [及時性與完整性](/tw/ch13#sec_future_integrity)
- standbys（見 基於領導者的複製）
- 星型複製拓撲, [多主複製拓撲](/tw/ch6#sec_replication_topologies)
- 星型模式, [星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)-[星型與雪花型：分析模式](/tw/ch3#sec_datamodels_analytics)
- 星球大戰類比（事件時間與處理時間）, [事件時間與處理時間](/tw/ch12#id322)
- 飢餓（時間安排）, [資源分配](/tw/ch11#id279)
- 狀態
  - 從不可改變事件日誌中得出, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)
  - 狀態變化與應用程式程式碼之間的相互作用, [資料流：應用程式碼與狀態變化的互動](/tw/ch13#id450)
  - 保持衍生狀態, [維護派生狀態](/tw/ch13#id446)
  - 由流處理器在流-流連線中維護, [流流連線（視窗連線）](/tw/ch12#id440)
  - 觀察匯出狀態, [觀察派生資料狀態](/tw/ch13#sec_future_observing)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
  - 流處理器失敗後重建, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 應用程式碼和, [應用程式碼和狀態的分離](/tw/ch13#id344)
- 狀態機複製, [基於語句的複製](/tw/ch6#statement-based-replication), [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs), [使用共享日誌](/tw/ch10#sec_consistency_smr), [資料庫與流](/tw/ch12#sec_stream_databases)
  - 事件溯源, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 依賴決定性因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 無狀態系統, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)
- 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication)
  - 依賴決定性因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- 靜態輸入語言
  - 類比於圖案, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
- 統計和數字演算法, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- StatsD (metrics aggregator), [直接從生產者傳遞給消費者](/tw/ch12#id296)
- 股票市場飼料, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- 爆彼之頭, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
  - 問題, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
- 停止所有處理（見 garbage collection）
- 儲存
  - 構建資料儲存技術, [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
- 儲存區網路, [共享記憶體、共享磁碟與無共享架構](/tw/ch2#sec_introduction_shared_nothing), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 儲存引擎, [儲存與檢索](/tw/ch4#ch_storage)-[總結](/tw/ch4#summary)
  - 面向列, [列式儲存](/tw/ch4#sec_storage_column)-[查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
    - 列壓縮, [列壓縮](/tw/ch4#sec_storage_column_compression)-[列壓縮](/tw/ch4#sec_storage_column_compression)
    - 定義, [列式儲存](/tw/ch4#sec_storage_column)
    - Parquet, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses), [列式儲存](/tw/ch4#sec_storage_column), [歸檔儲存](/tw/ch5#archival-storage)
    - 排序在, [列儲存中的排序順序](/tw/ch4#sort-order-in-column-storage)-[列儲存中的排序順序](/tw/ch4#sort-order-in-column-storage)
    - 寬柱型, [列壓縮](/tw/ch4#sec_storage_column_compression)
    - 寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
  - 記憶體儲, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
    - 永續性, [永續性](/tw/ch8#durability)
  - 面向行, [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)-[全記憶體儲存](/tw/ch4#sec_storage_inmemory)
    - B樹, [B 樹](/tw/ch4#sec_storage_b_trees)-[B 樹變體](/tw/ch4#b-tree-variants)
    - comparing B-trees and LSM-trees, [比較 B 樹與 LSM 樹](/tw/ch4#sec_storage_btree_lsm_comparison)-[磁碟空間使用](/tw/ch4#disk-space-usage)
    - 定義, [列式儲存](/tw/ch4#sec_storage_column)
    - 日誌結構, [日誌結構儲存](/tw/ch4#sec_storage_log_structured)-[壓實策略](/tw/ch4#sec_storage_lsm_compaction)
- 儲存程式, [將事務封裝在儲存過程中](/tw/ch8#encapsulating-transactions-in-stored-procedures)-[儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs), [術語表](/tw/glossary)
  - 和共享日誌, [使用共享日誌](/tw/ch10#sec_consistency_smr)
  - 利弊因素, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 類似於流處理器, [應用程式碼作為派生函式](/tw/ch13#sec_future_dataflow_derivation)
- Storm（流處理器）, [流分析](/tw/ch12#id318)
  - distributed RPC, [事件驅動架構與 RPC](/tw/ch12#sec_stream_actors_drpc), [多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
  - Trident 狀態處理, [冪等性](/tw/ch12#sec_stream_idempotence)
- 滯留事件, [處理滯留事件](/tw/ch12#id323)
- Stream Control Transmission Protocol (SCTP), [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
- 流處理, [流處理](/tw/ch12#sec_stream_processing)-[本章小結](/tw/ch12#id332), [術語表](/tw/glossary)
  - 在工作範圍內獲得外部服務, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins), [微批次與存檔點](/tw/ch12#id329), [冪等性](/tw/ch12#sec_stream_idempotence), [恰好執行一次操作](/tw/ch13#id353)
  - 與批次處理相結合, [統一批處理和流處理](/tw/ch13#id338)
  - 與批次處理的比較, [流處理](/tw/ch12#sec_stream_processing)
  - 複合事件處理, [複合事件處理](/tw/ch12#id317)
  - 過失容忍, [容錯](/tw/ch12#sec_stream_fault_tolerance)-[失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
    - 原子提交, [原子提交再現](/tw/ch12#sec_stream_atomic_commit)
    - 冪等性, [冪等性](/tw/ch12#sec_stream_idempotence)
    - 微打鬥和檢查站, [微批次與存檔點](/tw/ch12#id329)
    - 失敗後重建狀態, [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 資料整合, [批處理與流處理](/tw/ch13#sec_future_batch_streaming)-[統一批處理和流處理](/tw/ch13#id338)
  - 用於事件原始碼, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 保持衍生狀態, [維護派生狀態](/tw/ch13#id446)
  - 維持實際意見, [維護物化檢視](/tw/ch12#sec_stream_mat_view)
  - messaging systems（見 messaging systems）
  - 關於時間的推理, [時間推理](/tw/ch12#sec_stream_time)-[視窗的型別](/tw/ch12#id324)
    - 事件時間與處理時間, [事件時間與處理時間](/tw/ch12#id322), [微批次與存檔點](/tw/ch12#id329), [統一批處理和流處理](/tw/ch13#id338)
    - 知道視窗何時準備好, [處理滯留事件](/tw/ch12#id323)
    - 視窗型別, [視窗的型別](/tw/ch12#id324)
  - relation to databases（見 streams）
  - 與服務的關係, [流處理器和服務](/tw/ch13#id345)
  - 與批次處理的關係, [批處理](/tw/ch11#ch_batch)
  - 在流中搜索, [在流上搜索](/tw/ch12#id320)
  - 單條執行, [日誌與傳統的訊息傳遞相比](/tw/ch12#sec_stream_logs_vs_messaging), [併發控制](/tw/ch12#sec_stream_concurrency)
  - 流式分析, [流分析](/tw/ch12#id318)
  - 串流連線, [流連線](/tw/ch12#sec_stream_joins)-[連線的時間依賴性](/tw/ch12#sec_stream_join_time)
    - 串流流連線, [流流連線（視窗連線）](/tw/ch12#id440)
    - 序列表連線, [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
    - 表格連線, [表表連線（維護物化檢視）](/tw/ch12#id326)
    - 時間的依賴性, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
- 流程, [流處理](/tw/ch12#ch_stream)-[重播舊訊息](/tw/ch12#sec_stream_replay)
  - 端對端,向客戶推進事件, [端到端的事件流](/tw/ch13#id349)
  - messaging systems（見 messaging systems）
  - processing（見 流處理）
  - 與資料庫的關係, [資料庫與流](/tw/ch12#sec_stream_databases)-[不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
    - （另見 changelogs）
    - 變更流的 API 支援, [變更流的 API 支援](/tw/ch12#sec_stream_change_api)
    - 資料變更捕獲, [資料變更捕獲](/tw/ch12#sec_stream_cdc)-[變更流的 API 支援](/tw/ch12#sec_stream_change_api)
    - 按時間分列的狀態衍生物, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)
    - 事件溯源, [資料變更捕獲與事件溯源](/tw/ch12#sec_stream_event_sourcing)
    - 保持系統同步, [保持系統同步](/tw/ch12#sec_stream_sync)-[保持系統同步](/tw/ch12#sec_stream_sync)
    - 不可改變事件哲學, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)-[不變性的侷限性](/tw/ch12#sec_stream_immutability_limitations)
  - 專題, [傳遞事件流](/tw/ch12#sec_stream_transmit)
- 嚴格的序列性, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 及時性與完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)
- 條紋（列編碼）, [列式儲存](/tw/ch4#sec_storage_column)
- 強一致性（見 線性一致性）
- 強最終一致性, [自動衝突解決](/tw/ch6#automatic-conflict-resolution)
- 強單副本可序列化, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
- 主語、謂語和賓語（三元組）, [三元組儲存與 SPARQL](/tw/ch3#id59)
- 訂閱者, [傳遞事件流](/tw/ch12#sec_stream_transmit)
  - （另見 consumers）
- 超級計算機, [雲計算與超級計算](/tw/ch1#id17)
- Superset（資料視覺化軟體）, [分析（Analytics）](/tw/ch11#sec_batch_olap)
- 監視, [監視](/tw/ch14#id374)
  - （另見 隱私）
- 壽司原則, [從資料倉庫到資料湖](/tw/ch1#from-data-warehouse-to-data-lake)
- 可持續性, [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- Swagger（服務定義格式）, [Web 服務](/tw/ch5#sec_web_services)
- swapping to disk（見 virtual memory）
- Swift（程式語言）
  - 記憶體管理, [限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
- 同步引擎, [同步引擎與本地優先軟體](/tw/ch6#sec_replication_offline_clients)-[同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
  - 例項, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
  - 用於本地第一軟體, [即時協作、離線優先和本地優先應用](/tw/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 同步網路, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks), [術語表](/tw/glossary)
  - 與非同步網路的比較, [同步與非同步網路](/tw/ch9#sec_distributed_sync_networks)
  - 系統模型, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
- 同步複製, [同步複製與非同步複製](/tw/ch6#sec_replication_sync_async), [術語表](/tw/glossary)
  - 有多個領導, [多主複製](/tw/ch6#sec_replication_multi_leader)
- 系統管理員, [雲時代的運維](/tw/ch1#sec_introduction_operations)
- 系統模型, [知識、真相和謊言](/tw/ch9#sec_distributed_truth), [系統模型與現實](/tw/ch9#sec_distributed_system_model)-[確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - 假設, [信任但驗證](/tw/ch13#sec_future_verification)
  - 演算法的正確性, [定義演算法的正確性](/tw/ch9#defining-the-correctness-of-an-algorithm)
  - 繪製真實世界的地圖, [將系統模型對映到現實世界](/tw/ch9#mapping-system-models-to-the-real-world)
  - 安全性與活性, [安全性與活性](/tw/ch9#sec_distributed_safety_liveness)
- 記錄系統, [記錄系統與派生資料](/tw/ch1#sec_introduction_derived), [術語表](/tw/glossary)
  - 資料變更捕獲, [資料變更捕獲的實現](/tw/ch12#id307), [理解資料流](/tw/ch13#id443)
  - 事件日誌, [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
  - 事件日誌處理為, [狀態、流和不變性](/tw/ch12#sec_stream_immutability)
- 系統思維, [反饋迴路](/tw/ch14#id372)

### T

- t- digest（演算法）, [響應時間指標的應用](/tw/ch2#sec_introduction_slo_sla)
- 表格連線, [表表連線（維護物化檢視）](/tw/ch12#id326)
- Tableau（資料視覺化軟體）, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp), [分析（Analytics）](/tw/ch11#sec_batch_olap)
- tail（Unix 工具）, [使用日誌進行訊息儲存](/tw/ch12#id300)
- tail latency（見 延遲）
- 尾頂點（屬性圖）, [屬性圖](/tw/ch3#id56)
- task (workflows)（見 workflow engines）
- TCP (Transmission Control Protocol), [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
  - 比較電路切換, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
  - comparison to UDP, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 連線失敗, [檢測故障](/tw/ch9#id307)
  - 流量控制, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing), [訊息傳遞系統](/tw/ch12#sec_stream_messaging)
  - 包檢查和, [弱形式的謊言](/tw/ch9#weak-forms-of-lying), [端到端原則](/tw/ch13#sec_future_e2e_argument), [信任但驗證](/tw/ch13#sec_future_verification)
  - 可靠性和重複壓制, [抑制重複](/tw/ch13#id354)
  - 轉發超時, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 用於事務會話, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)
- Temporal（工作流引擎）, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
- Tensorflow （機器學習圖書館）, [機器學習](/tw/ch11#id290)
- Teradata（資料庫）, [雲原生系統架構](/tw/ch1#sec_introduction_cloud_native), [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- term-partitioned indexes（見 global secondary indexes）
- 終止（協商一致）, [單值共識](/tw/ch10#single-value-consensus), [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
- 測試, [人類與可靠性](/tw/ch2#id31)
- 擊打（記憶體斷）, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- 執行緒（併發）
  - Actor 模型, [分散式 actor 框架](/tw/ch5#distributed-actor-frameworks), [事件驅動架構與 RPC](/tw/ch12#sec_stream_actors_drpc)
    - （另見 event-driven architecture）
  - 原子操作, [原子性](/tw/ch8#sec_transactions_acid_atomicity)
  - 背景執行緒, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables)
  - 執行暫停, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable), [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)-[程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - 記憶體障礙, [線性一致性與網路延遲](/tw/ch10#linearizability-and-network-delays)
  - 搶佔, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - single（見 single-threaded execution）
- 三階段提交, [三階段提交](/tw/ch8#three-phase-commit)
- 三方關係, [屬性圖](/tw/ch3#id56)
- Thrift（資料格式）, [Protocol Buffers](/tw/ch5#sec_encoding_protobuf)
- 吞吐量, [描述效能](/tw/ch2#sec_introduction_percentiles), [描述負載](/tw/ch2#id33), [批處理](/tw/ch11#ch_batch)
- TIBCO, [訊息代理](/tw/ch5#message-brokers)
  - Enterprise Message Service, [訊息代理與資料庫的對比](/tw/ch12#id297)
  - StreamBase (stream analytics), [複合事件處理](/tw/ch12#id317)
- TiDB（資料庫）
  - 基於共識的複製, [單主複製](/tw/ch6#sec_replication_leader)
  - 區域（分片）, [分片](/tw/ch7#ch_sharding)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 服務衍生資料, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
  - 分片二級索引, [全域性二級索引](/tw/ch7#id167)
  - 快速隔離支援, [快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
  - 時間戳, [實現線性一致的 ID 生成器](/tw/ch10#implementing-a-linearizable-id-generator)
  - 事務, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
  - 使用模型檢查, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 分層儲存, [設定新的副本](/tw/ch6#sec_replication_new_replica), [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- TigerBeetle（資料庫）, [總結](/tw/ch3#summary)
  - 確定性模擬測試, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
- TigerGraph（資料庫）
  - GSQL language, [SQL 中的圖查詢](/tw/ch3#id58)
- Tigris（物件儲存）, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- TileDB（資料庫）, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- 時間
  - 併發與, ["先發生"關係與併發](/tw/ch6#sec_replication_happens_before)
  - 跨渠道時間依賴性, [跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
  - 在分散式系統中, [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
    - （另見 clocks）
    - 時鐘同步和準確性, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
    - 依賴同步時鐘, [對同步時鐘的依賴](/tw/ch9#sec_distributed_clocks_relying)-[用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 程序暫停, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影響](/tw/ch9#sec_distributed_gc_impact)
  - 流程處理器中的推理, [時間推理](/tw/ch12#sec_stream_time)-[視窗的型別](/tw/ch12#id324)
    - 事件時間與處理時間, [事件時間與處理時間](/tw/ch12#id322), [微批次與存檔點](/tw/ch12#id329), [統一批處理和流處理](/tw/ch13#id338)
    - 知道視窗何時準備好, [處理滯留事件](/tw/ch12#id323)
    - 事件的時間戳, [你用的是誰的時鐘？](/tw/ch12#id438)
    - 視窗型別, [視窗的型別](/tw/ch12#id324)
  - 分散式系統的系統模型, [系統模型與現實](/tw/ch9#sec_distributed_system_model)
  - 串流中的時間依賴, [連線的時間依賴性](/tw/ch12#sec_stream_join_time)
- 時間序列資料
  - as DataFrames, [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
  - 面向列的儲存, [列式儲存](/tw/ch4#sec_storage_column)
- 每日時鐘, [日曆時鐘](/tw/ch9#time-of-day-clocks)
  - 混合邏輯時鐘, [混合邏輯時鐘](/tw/ch10#hybrid-logical-clocks)
- 及時性, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 協調-避免資料系統, [無協調資料系統](/tw/ch13#id454)
  - 資料流系統的正確性, [資料流系統的正確性](/tw/ch13#id453)
- 超時, [不可靠的網路](/tw/ch9#sec_distributed_networks), [術語表](/tw/glossary)
  - 動態配置, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 失敗, [領導者故障：故障轉移](/tw/ch6#leader-failure-failover)
  - 長度, [超時和無界延遲](/tw/ch9#sec_distributed_queueing)
- TimescaleDB（資料庫）, [列式儲存](/tw/ch4#sec_storage_column)
- 時間戳, [邏輯時鐘](/tw/ch10#sec_consistency_timestamps)
  - 指定流處理中的事件, [你用的是誰的時鐘？](/tw/ch12#id438)
  - 讀後寫入一致性, [讀己之寫](/tw/ch6#sec_replication_ryw)
  - 用於事務命令, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)
  - 執行制約因素不足, [使用邏輯時鐘強制約束](/tw/ch10#enforcing-constraints-using-logical-clocks)
  - 金鑰範圍, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 蘭波特, [Lamport 時間戳](/tw/ch10#lamport-timestamps)
  - 邏輯, [排序事件以捕獲因果關係](/tw/ch13#sec_future_capture_causality)
  - 命令事件, [用於事件排序的時間戳](/tw/ch9#sec_distributed_lww)
  - 時間戳, [實現線性一致的 ID 生成器](/tw/ch10#implementing-a-linearizable-id-generator)
- TLA+ (specification language), [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
- 令牌桶（限制重試）, [描述效能](/tw/ch2#sec_introduction_percentiles)
- 墓碑, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [磁碟空間使用](/tw/ch4#disk-space-usage), [日誌壓縮](/tw/ch12#sec_stream_log_compaction)
- 主題（訊息傳遞）, [訊息代理](/tw/ch5#message-brokers), [傳遞事件流](/tw/ch12#sec_stream_transmit)
- 撕裂頁（B 樹）, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
- 全序, [術語表](/tw/glossary)
  - broadcast（見 shared logs）
  - 限制, [全序的限制](/tw/ch13#id335)
  - 在邏輯時間戳上, [邏輯時鐘](/tw/ch10#sec_consistency_timestamps)
- 追蹤, [分散式系統的問題](/tw/ch1#sec_introduction_dist_sys_problems)
- 跟蹤行為資料, [隱私與追蹤](/tw/ch14#id373)
  - （另見 隱私）
- 權衡, [資料系統架構中的權衡](/tw/ch1#ch_tradeoffs)-[資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- transaction coordinator（見 協調者）
- transaction manager（見 協調者）
- 事務處理, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)-[事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
  - 與分析的比較, [事務處理與分析的特徵](/tw/ch1#sec_introduction_oltp)
  - 與資料儲存的比較, [分析型資料儲存](/tw/ch4#sec_storage_analytics)
- 事務, [事務](/tw/ch8#ch_transactions)-[總結](/tw/ch8#summary), [術語表](/tw/glossary)
  - ACID properties of, [ACID 的含義](/tw/ch8#sec_transactions_acid)
    - 原子性, [原子性](/tw/ch8#sec_transactions_acid_atomicity)
    - 一致性, [一致性](/tw/ch8#sec_transactions_acid_consistency)
    - 永續性, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal), [永續性](/tw/ch8#durability)
    - 隔離性, [隔離性](/tw/ch8#sec_transactions_acid_isolation)
  - 資料完整性, [及時性與完整性](/tw/ch13#sec_future_integrity)
  - 複製, [複製延遲的解決方案](/tw/ch6#id131)
  - compensating（見 compensating transactions）
  - 概念, [事務到底是什麼？](/tw/ch8#sec_transactions_overview)
  - 分散式事務, [分散式事務](/tw/ch8#sec_transactions_distributed)-[再談恰好一次訊息處理](/tw/ch8#exactly-once-message-processing-revisited)
    - 避開, [派生資料與分散式事務](/tw/ch13#sec_future_derived_vs_transactions), [開展分拆工作](/tw/ch13#sec_future_unbundling_favor), [強制約束](/tw/ch13#sec_future_constraints)-[無協調資料系統](/tw/ch13#id454)
    - 失敗放大, [維護派生狀態](/tw/ch13#id446)
    - 用於分片系統, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
    - 可疑/不確定狀況, [協調器故障](/tw/ch8#coordinator-failure), [存疑時持有鎖](/tw/ch8#holding-locks-while-in-doubt)
    - 兩階段提交, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)-[三階段提交](/tw/ch8#three-phase-commit)
    - 使用, [跨不同系統的分散式事務](/tw/ch8#sec_transactions_xa)-[恰好一次訊息處理](/tw/ch8#sec_transactions_exactly_once)
    - XA 事務, [XA 事務](/tw/ch8#xa-transactions)-[XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
  - OLTP versus analytics queries, [分析（Analytics）](/tw/ch11#sec_batch_olap)
  - 目標, [事務](/tw/ch8#ch_transactions)
  - 可序列化, [可序列化](/tw/ch8#sec_transactions_serializability)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - 實際執行, [實際序列執行](/tw/ch8#sec_transactions_serial)-[序列執行總結](/tw/ch8#summary-of-serial-execution)
    - 悲觀與樂觀的併發控制, [悲觀併發控制與樂觀併發控制](/tw/ch8#pessimistic-versus-optimistic-concurrency-control)
    - 可序列化快照隔離, [可序列化快照隔離（SSI）](/tw/ch8#sec_transactions_ssi)-[可序列化快照隔離的效能](/tw/ch8#performance-of-serializable-snapshot-isolation)
    - 兩階段鎖定, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
  - 單物件和多物件, [單物件與多物件操作](/tw/ch8#sec_transactions_multi_object)-[處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
    - 處理錯誤和中止, [處理錯誤和中止](/tw/ch8#handling-errors-and-aborts)
    - 多物件事務的需要, [多物件事務的需求](/tw/ch8#sec_transactions_need)
    - 單物件寫入, [單物件寫入](/tw/ch8#sec_transactions_single_object)
  - 快照隔離（見 snapshots）
  - 嚴格的序列性, [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition)
  - 薄弱的隔離水平, [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)-[物化衝突](/tw/ch8#materializing-conflicts)
    - 防止丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)-[衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
    - 讀已提交, [讀已提交](/tw/ch8#sec_transactions_read_committed)-[快照隔離與可重複讀](/tw/ch8#sec_transactions_snapshot_isolation)
- 曲線（圖）, [屬性圖](/tw/ch3#id56)
- Trie（資料結構）, [構建和合並 SSTable](/tw/ch4#constructing-and-merging-sstables), [全文檢索](/tw/ch4#sec_storage_full_text)
  - as SSTable index, [SSTable 檔案格式](/tw/ch4#the-sstable-file-format)
- 觸發器（資料庫）, [傳遞事件流](/tw/ch12#sec_stream_transmit)
- Trino（資料倉庫）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 聯邦資料庫, [一切的元資料庫](/tw/ch13#id341)
  - 查詢最佳化器, [查詢語言](/tw/ch11#sec_batch_query_lanauges)
  - 用於 ETL, [提取-轉換-載入（ETL）](/tw/ch11#sec_batch_etl_usage)
  - 工作流程示例, [工作流排程](/tw/ch11#sec_batch_workflows)
- 三元組儲存, [三元組儲存與 SPARQL](/tw/ch3#id59)-[SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
  - SPARQL 查詢語言, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- 滾動視窗（流處理）, [視窗的型別](/tw/ch12#id324)
  - （另見 windows）
  - 在微批次中, [微批次與存檔點](/tw/ch12#id329)
- Turbopuffer（向量搜尋）, [設定新的副本](/tw/ch6#sec_replication_new_replica)
- Turtle (RDF data format), [三元組儲存與 SPARQL](/tw/ch3#id59)
- Twitter（見 X (social network)）
- 兩階段提交, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)-[協調器故障](/tw/ch8#coordinator-failure), [術語表](/tw/glossary)
  - 與兩階段鎖定混淆, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)
  - 協調器故障, [協調器故障](/tw/ch8#coordinator-failure)
  - 協調器恢復, [從協調器故障中恢復](/tw/ch8#recovering-from-coordinator-failure)
  - 如何運作, [系統性的承諾](/tw/ch8#a-system-of-promises)
  - 效能成本, [跨不同系統的分散式事務](/tw/ch8#sec_transactions_xa)
  - problems with XA transactions, [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
  - 持有鎖定的事務, [存疑時持有鎖](/tw/ch8#holding-locks-while-in-doubt)
- 兩階段鎖定, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range), [什麼使系統具有線性一致性？](/tw/ch10#sec_consistency_lin_definition), [術語表](/tw/glossary)
  - 與兩階段提交混淆, [兩階段鎖定（2PL）](/tw/ch8#sec_transactions_2pl)
  - 增長和縮小階段, [兩階段鎖定的實現](/tw/ch8#implementation-of-two-phase-locking)
  - 索引範圍鎖定, [索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
  - 效能, [兩階段鎖定的效能](/tw/ch8#performance-of-two-phase-locking)
- 型別檢查，動態與靜態, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)

### U

- UDP (User Datagram Protocol)
  - comparison to TCP, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 多播, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- Ultima Online（遊戲）, [分片](/tw/ch7#ch_sharding)
- 無界資料集, [流處理](/tw/ch12#ch_stream), [術語表](/tw/glossary)
  - （另見 streams）
- 無界延遲, [術語表](/tw/glossary)
  - 在網路中, [超時和無界延遲](/tw/ch9#sec_distributed_queueing)
  - 程序暫停, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- 分拆資料庫, [分拆資料庫](/tw/ch13#sec_future_unbundling)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
  - 構建資料儲存技術, [組合使用資料儲存技術](/tw/ch13#id447)-[分拆系統與整合系統](/tw/ch13#id448)
    - 聯邦制與拆分制, [一切的元資料庫](/tw/ch13#id341)
  - 圍繞資料流設計應用程式, [圍繞資料流設計應用](/tw/ch13#sec_future_dataflow)-[流處理器和服務](/tw/ch13#id345)
  - 觀察匯出狀態, [觀察派生資料狀態](/tw/ch13#sec_future_observing)-[多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
    - 實現檢視和快取, [物化檢視和快取](/tw/ch13#id451)
    - 多分片資料處理, [多分割槽資料處理](/tw/ch13#sec_future_unbundled_multi_shard)
    - 推動客戶端更改狀態, [將狀態變更推送給客戶端](/tw/ch13#id348)
- uncertain (transaction status)（見 存疑）
- 聯合型別（Avro 中）, [模式演化規則](/tw/ch5#schema-evolution-rules)
- uniq（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [簡單日誌分析](/tw/ch11#sec_batch_log_analysis), [分散式作業編排](/tw/ch11#id278)
- 唯一性約束
  - 非同步檢查, [寬鬆地解釋約束](/tw/ch13#id362)
  - 需要共識, [唯一性約束需要達成共識](/tw/ch13#id452)
  - 需要線性一致性, [約束與唯一性保證](/tw/ch10#sec_consistency_uniqueness)
  - 基於日誌訊息傳遞中的唯一性, [基於日誌訊息傳遞中的唯一性](/tw/ch13#sec_future_uniqueness_log)
- Unity（資料目錄）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
- universally unique identifiers（見 UUIDs）
- unix 哲學
  - 比較關係資料庫, [分拆資料庫](/tw/ch13#sec_future_unbundling), [一切的元資料庫](/tw/ch13#id341)
  - 與流處理的比較, [流處理](/tw/ch12#sec_stream_processing)
- unix 管道, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis)
  - 與分散式批次處理相比, [工作流排程](/tw/ch11#sec_batch_workflows)
- UPDATE statement (SQL), [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
- 更新
  - 防止丟失更新, [防止丟失更新](/tw/ch8#sec_transactions_lost_update)-[衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
    - 原子寫入操作, [原子寫操作](/tw/ch8#atomic-write-operations)
    - 自動檢測丟失的更新, [自動檢測丟失的更新](/tw/ch8#automatically-detecting-lost-updates)
    - 比較和設定, [條件寫入（比較並設定）](/tw/ch8#sec_transactions_compare_and_set)
    - 衝突解決和推廣, [衝突解決與複製](/tw/ch8#conflict-resolution-and-replication)
    - 使用明確的鎖定, [顯式鎖定](/tw/ch8#explicit-locking)
  - 防止寫入skew, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts)
- 使用量
  - 批次過程排程, [資源分配](/tw/ch11#id279)
  - 透過預設增加, [故障處理](/tw/ch11#id281)
  - 與暫時取捨, [我們不能簡單地使網路延遲可預測嗎？](/tw/ch9#can-we-not-simply-make-network-delays-predictable)
- uTP protocol (BitTorrent), [TCP 的侷限性](/tw/ch9#sec_distributed_tcp)
- UUIDs, [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)

### V

- 有效性（共識）, [單值共識](/tw/ch10#single-value-consensus), [原子提交作為共識](/tw/ch10#atomic-commitment-as-consensus)
- vBuckets（分片）, [分片](/tw/ch7#ch_sharding)
- 向量時鐘, [版本向量](/tw/ch6#version-vectors)
  - （另見 版本向量）
  - 和 Lamport/hybrid 邏輯鍾, [Lamport/混合邏輯時鐘 vs. 向量時鐘](/tw/ch10#lamporthybrid-logical-clocks-vs-vector-clocks)
  - 和版本向量, [版本向量](/tw/ch6#version-vectors)
- 向量嵌入, [向量嵌入](/tw/ch4#id92)
- 向量處理, [查詢執行：編譯與向量化](/tw/ch4#sec_storage_vectorized)
- 供應商鎖定, [雲服務的利弊](/tw/ch1#sec_introduction_cloud_tradeoffs)
- Venice（資料庫）, [對外提供派生資料](/tw/ch11#sec_batch_serving_derived)
- 核查, [信任但驗證](/tw/ch13#sec_future_verification)-[用於可審計資料系統的工具](/tw/ch13#id366)
  - 避免盲目信任, [不要盲目信任承諾](/tw/ch13#id364)
  - 設計可審計性, [為可審計性而設計](/tw/ch13#id365)
  - 端對端完整性檢查, [端到端原則重現](/tw/ch13#id456)
  - 可審計資料系統工具, [用於可審計資料系統的工具](/tw/ch13#id366)
- 版本控制系統
  - 合併衝突, [手動衝突解決](/tw/ch6#manual-conflict-resolution)
  - 依賴不可改變的資料, [併發控制](/tw/ch12#sec_stream_concurrency)
- 版本向量, [不同拓撲的問題](/tw/ch6#problems-with-different-topologies), [版本向量](/tw/ch6#version-vectors)
  - 點數, [版本向量](/tw/ch6#version-vectors)
  - 對向量時鐘, [版本向量](/tw/ch6#version-vectors)
- Vertica（資料庫）, [雲資料倉庫](/tw/ch4#sec_cloud_data_warehouses)
  - 處理寫入, [寫入列式儲存](/tw/ch4#writing-to-column-oriented-storage)
- vertical scaling（見 scaling up）
- 頂點（圖）, [圖資料模型](/tw/ch3#sec_datamodels_graph)
  - 屬性圖模型, [屬性圖](/tw/ch3#id56)
- 電子遊戲, [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- 影片轉碼（例如）, [跨通道時序依賴](/tw/ch10#cross-channel-timing-dependencies)
- views (SQL queries), [Datalog：遞迴關係查詢](/tw/ch3#id62)
  - materialized views（見 物化）
- 檢視戳複製, [共識](/tw/ch10#sec_consistency_consensus), [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - 使用模型檢查, [模型檢查與規範語言](/tw/ch9#model-checking-and-specification-languages)
  - 檢視編號, [從單主複製到共識](/tw/ch10#from-single-leader-replication-to-consensus)
- 虛擬塊裝置, [儲存與計算的分離](/tw/ch1#sec_introduction_storage_compute)
- 虛擬檔案系統, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
  - 比較分散式檔案系統, [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- 虛擬機器, [雲服務的分層](/tw/ch1#layering-of-cloud-services)
  - 上下文開關, [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
  - 網路效能, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 吵鬧的鄰居, [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
  - 虛擬時鐘在, [時鐘同步和準確性](/tw/ch9#sec_distributed_clock_accuracy)
- 虛擬記憶體
  - 因頁面錯誤造成的程序暫停, [延遲與響應時間](/tw/ch2#id23), [程序暫停](/tw/ch9#sec_distributed_clocks_pauses)
- Virtuoso（資料庫）, [SPARQL 查詢語言](/tw/ch3#the-sparql-query-language)
- VisiCalc (spreadsheets), [圍繞資料流設計應用](/tw/ch13#sec_future_dataflow)
- Vitess（資料庫）
  - 鍵範圍分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
- vnodes（分片）, [分片](/tw/ch7#ch_sharding)
- 詞彙, [三元組儲存與 SPARQL](/tw/ch3#id59)
- Voice over IP (VoIP), [網路擁塞和排隊](/tw/ch9#network-congestion-and-queueing)
- VoltDB（資料庫）
  - 跨分片可序列化, [分片](/tw/ch8#sharding)
  - 確定性儲存過程, [儲存過程的利弊](/tw/ch8#sec_transactions_stored_proc_tradeoffs)
  - 記憶體儲存, [全記憶體儲存](/tw/ch4#sec_storage_inmemory)
  - 每核一個程序的模型, [分片的利與弊](/tw/ch7#sec_sharding_reasons)
  - 二級索引, [本地二級索引](/tw/ch7#id166)
  - 事務的序列執行, [實際序列執行](/tw/ch8#sec_transactions_serial)
  - 基於語句的複製, [基於語句的複製](/tw/ch6#statement-based-replication), [失敗後重建狀態](/tw/ch12#sec_stream_state_fault_tolerance)
  - 流程處理中的事務, [原子提交再現](/tw/ch12#sec_stream_atomic_commit)

### W

- 預寫式日誌, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal)
- WAL-G (backup tool), [設定新的副本](/tw/ch6#sec_replication_new_replica)
- WarpStream（訊息系統）, [磁碟空間使用](/tw/ch12#sec_stream_disk_usage)
- web services（見 services）
- 網路使用者, [直接從生產者傳遞給消費者](/tw/ch12#id296)
- 網路方法（通訊）, [訊息代理](/tw/ch5#message-brokers)
- WebSocket (protocol), [將狀態變更推送給客戶端](/tw/ch13#id348)
- 寬柱資料模型, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - 相對於面向列的儲存, [列壓縮](/tw/ch4#sec_storage_column_compression)
- 視窗（流程處理）, [流分析](/tw/ch12#id318), [時間推理](/tw/ch12#sec_stream_time)-[視窗的型別](/tw/ch12#id324)
  - 更改日誌的無限視窗, [維護物化檢視](/tw/ch12#sec_stream_mat_view), [流表連線（流擴充）](/tw/ch12#sec_stream_table_joins)
  - 知道所有事件何時到來, [處理滯留事件](/tw/ch12#id323)
  - 串流在視窗內連線, [流流連線（視窗連線）](/tw/ch12#id440)
  - 視窗型別, [視窗的型別](/tw/ch12#id324)
- WITH RECURSIVE syntax (SQL), [SQL 中的圖查詢](/tw/ch3#id58)
- Word2Vec (language model), [向量嵌入](/tw/ch4#id92)
- 工作流程引擎, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
  - Airflow（見 Airflow（工作流排程器））
  - 批處理, [工作流排程](/tw/ch11#sec_batch_workflows)
  - Camunda（見 Camunda (workflow engine)）
  - Dagster（見 Dagster（工作流排程器））
  - 持久執行, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
  - 提取-轉換-載入（ETL）（見 ETL）
  - 執行器, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows)
  - 樂團, [持久化執行與工作流](/tw/ch5#sec_encoding_dataflow_workflows), [批處理](/tw/ch11#ch_batch)
  - Orkes（見 Orkes (workflow engine)）
  - Prefect（見 Prefect（工作流排程器））
  - 依賴決定性因素, [確定性模擬測試](/tw/ch9#deterministic-simulation-testing)
  - Restate（見 Restate (workflow engine)）
  - Temporal（見 Temporal (workflow engine)）
- 工作設定, [排序與記憶體聚合](/tw/ch11#id275)
- 寫入放大, [寫放大](/tw/ch4#write-amplification)
- 寫路徑, [觀察派生資料狀態](/tw/ch13#sec_future_observing)
- 寫偏差, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[物化衝突](/tw/ch8#materializing-conflicts)
  - 特性, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew)-[導致寫偏差的幻讀](/tw/ch8#sec_transactions_phantom), [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)
  - 例項, [寫偏差與幻讀](/tw/ch8#sec_transactions_write_skew), [寫偏差的更多例子](/tw/ch8#more-examples-of-write-skew)
  - 物化衝突, [物化衝突](/tw/ch8#materializing-conflicts)
  - 實際發生情況, [維護完整性，儘管軟體有Bug](/tw/ch13#id455)
  - 幻讀, [導致寫偏差的幻讀](/tw/ch8#sec_transactions_phantom)
  - 預防
    - 在快照隔離中, [基於過時前提的決策](/tw/ch8#decisions-based-on-an-outdated-premise)-[檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
    - 在兩階段鎖定中, [謂詞鎖](/tw/ch8#predicate-locks)-[索引範圍鎖](/tw/ch8#sec_transactions_2pl_range)
    - 選項, [寫偏差的特徵](/tw/ch8#characterizing-write-skew)
- 預寫式日誌, [使 B 樹可靠](/tw/ch4#sec_storage_btree_wal), [預寫日誌（WAL）傳輸](/tw/ch6#write-ahead-log-wal-shipping)
  - 持久執行, [持久化執行](/tw/ch5#durable-execution)
- 寫入（資料庫）
  - 原子寫入操作, [原子寫操作](/tw/ch8#atomic-write-operations)
  - 檢測影響前讀的寫入, [檢測影響先前讀取的寫入](/tw/ch8#sec_detecting_writes_affect_reads)
  - 防止髒寫, [沒有髒寫](/tw/ch8#sec_transactions_dirty_write)
- WS-\* framework, [遠端過程呼叫（RPC）的問題](/tw/ch5#sec_problems_with_rpc)
- WS-AtomicTransaction (2PC), [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc)

### X

- X（社交網路）
  - 構建首頁時間線（示例）, [案例研究：社交網路首頁時間線](/tw/ch2#sec_introduction_twitter), [從同一事件日誌中派生多個檢視](/tw/ch12#sec_stream_deriving_views), [表表連線（維護物化檢視）](/tw/ch12#id326), [物化檢視和快取](/tw/ch13#id451)
    - 連線的開銷, [社交網路案例研究中的反正規化](/tw/ch3#denormalization-in-the-social-networking-case-study)
    - 描述負載, [描述負載](/tw/ch2#id33)
    - 容錯, [容錯](/tw/ch2#id27)
    - 效能度量, [描述效能](/tw/ch2#sec_introduction_percentiles)
  - DistributedLog (event log), [使用日誌進行訊息儲存](/tw/ch12#id300)
  - Snowflake (ID generator), [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
- XA 事務, [兩階段提交（2PC）](/tw/ch8#sec_transactions_2pc), [XA 事務](/tw/ch8#xa-transactions)-[XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
  - 啟發式決策, [從協調器故障中恢復](/tw/ch8#recovering-from-coordinator-failure)
  - 問題, [XA 事務的問題](/tw/ch8#problems-with-xa-transactions)
- xargs（Unix 工具）, [簡單日誌分析](/tw/ch11#sec_batch_log_analysis)
- XFS (file system), [分散式檔案系統](/tw/ch11#sec_batch_dfs)
- XGBoost (machine learning library), [機器學習](/tw/ch11#id290)
- XML
  - 二進位制變體, [二進位制編碼](/tw/ch5#binary-encoding)
  - 資料位置, [讀寫的資料區域性](/tw/ch3#sec_datamodels_document_locality)
  - encoding RDF data, [RDF 資料模型](/tw/ch3#the-rdf-data-model)
  - 應用資料的問題, [JSON、XML 及其二進位制變體](/tw/ch5#sec_encoding_json)
  - 關係資料庫, [文件模型中的模式靈活性](/tw/ch3#sec_datamodels_schema_flexibility)
  - XML databases, [關係模型與文件模型](/tw/ch3#sec_datamodels_history), [文件的查詢語言](/tw/ch3#query-languages-for-documents)
- Xorq（查詢引擎）, [一切的元資料庫](/tw/ch13#id341)
- XPath, [文件的查詢語言](/tw/ch3#query-languages-for-documents)
- XQuery, [文件的查詢語言](/tw/ch3#query-languages-for-documents)

### Y

- 雅虎
  - 響應時間研究, [平均值、中位數與百分位點](/tw/ch2#id24)
- YARN (job scheduler), [分散式作業編排](/tw/ch11#id278), [應用程式碼和狀態的分離](/tw/ch13#id344)
  - ApplicationMaster, [分散式作業編排](/tw/ch11#id278)
- Yjs (CRDT library), [同步引擎的利弊](/tw/ch6#pros-and-cons-of-sync-engines)
- YugabyteDB（資料庫）
  - 雜湊分片, [按雜湊範圍分片](/tw/ch7#sharding-by-hash-range)
  - 鍵範圍分片, [按鍵的範圍分片](/tw/ch7#sec_sharding_key_range)
  - 多主複製, [跨地域執行](/tw/ch6#sec_replication_multi_dc)
  - 請求路由, [請求路由](/tw/ch7#sec_sharding_routing)
  - 分片二級索引, [全域性二級索引](/tw/ch7#id167)
  - tablet（分片）, [分片](/tw/ch7#ch_sharding)
  - 事務, [事務到底是什麼？](/tw/ch8#sec_transactions_overview), [資料庫內部的分散式事務](/tw/ch8#sec_transactions_internal)
  - 使用時鐘同步, [用於全域性快照的同步時鐘](/tw/ch9#sec_distributed_spanner)

### Z

- Zab（共識演算法）, [共識](/tw/ch10#sec_consistency_consensus), [共識的實踐](/tw/ch10#sec_consistency_total_order)
  - use in ZooKeeper, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
- 零複製, [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- zero-disk architecture (ZDA), [設定新的副本](/tw/ch6#sec_replication_new_replica)
- ZeroMQ (messaging library), [直接從生產者傳遞給消費者](/tw/ch12#id296)
- 殭屍（腦裂）, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens)
- zones (cloud computing)（見 availability zones）
- ZooKeeper (coordination service), [協調服務](/tw/ch10#sec_consistency_coordination)-[服務發現](/tw/ch10#service-discovery)
  - 生成柵欄標誌, [隔離殭屍程序和延遲請求](/tw/ch9#sec_distributed_fencing_tokens), [使用共享日誌](/tw/ch10#sec_consistency_smr), [協調服務](/tw/ch10#sec_consistency_coordination)
  - 線性一致操作, [實現線性一致性系統](/tw/ch10#sec_consistency_implementing_linearizable)
  - 鎖與領導者選舉, [鎖定與領導者選舉](/tw/ch10#locking-and-leader-election)
  - 觀察員, [服務發現](/tw/ch10#service-discovery)
  - 用於服務發現, [負載均衡器、服務發現和服務網格](/tw/ch5#sec_encoding_service_discovery), [服務發現](/tw/ch10#service-discovery)
  - 用於分片分配, [請求路由](/tw/ch7#sec_sharding_routing)
  - 使用 Zab 演算法, [共識](/tw/ch10#sec_consistency_consensus)

================================================
FILE: content/tw/part-i.md
================================================
---
title: 第一部分：資料系統基礎
weight: 100
breadcrumbs: false
---

{{< callout type="warning" >}}
當前頁面來自本書第一版，第二版尚不可用
{{< /callout >}}

本書前五章介紹了資料系統底層的基礎概念，無論是在單臺機器上執行的單點資料系統，還是分佈在多臺機器上的分散式資料系統都適用。

1. [第一章](/tw/ch1) 將介紹 **資料系統架構中的利弊權衡**。我們將討論不同型別的資料系統（例如，分析型與事務型），以及它們在雲環境中的執行方式。
2. [第二章](/tw/ch2) 將介紹非功能性需求的定義。**可靠性，可伸縮性和可維護性** ，這些詞彙到底意味著什麼？如何實現這些目標？
3. [第三章](/tw/ch3) 將對幾種不同的 **資料模型和查詢語言** 進行比較。從程式設計師的角度看，這是資料庫之間最明顯的區別。不同的資料模型適用於不同的應用場景。
4. [第四章](/tw/ch4) 將深入 **儲存引擎** 內部，研究資料庫如何在磁碟上擺放資料。不同的儲存引擎針對不同的負載進行最佳化，選擇合適的儲存引擎對系統效能有巨大影響。
5. [第五章](/tw/ch5) 將對幾種不同的 **資料編碼** 進行比較。特別研究了這些格式在應用需求經常變化、模式需要隨時間演變的環境中表現如何。

[第二部分](/tw/part-ii) 將專門討論在 **分散式資料系統** 中特有的問題。


## [1. 資料系統架構中的權衡](/tw/ch1)
- [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
- [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
- [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- [總結](/tw/ch1#summary)

## [2. 定義非功能性需求](/tw/ch2)
- [案例研究：社交網路首頁時間線](/tw/ch2#sec_introduction_twitter)
- [描述效能](/tw/ch2#sec_introduction_percentiles)
- [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
- [可伸縮性](/tw/ch2#sec_introduction_scalability)
- [可運維性](/tw/ch2#sec_introduction_maintainability)
- [總結](/tw/ch2#summary)

## [3. 資料模型與查詢語言](/tw/ch3)
- [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
- [圖資料模型](/tw/ch3#sec_datamodels_graph)
- [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- [總結](/tw/ch3#summary)

## [4. 儲存與檢索](/tw/ch4)
- [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)
- [分析型資料儲存](/tw/ch4#sec_storage_analytics)
- [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- [總結](/tw/ch4#summary)

## [5. 編碼與演化](/tw/ch5)
- [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- [資料流的模式](/tw/ch5#sec_encoding_dataflow)
- [總結](/tw/ch5#summary)

================================================
FILE: content/tw/part-ii.md
================================================
---
title: 第二部分：分散式資料
weight: 200
breadcrumbs: false
---

{{< callout type="warning" >}}
當前頁面來自本書第一版，第二版尚不可用
{{< /callout >}}

> 一個成功的技術，現實的優先順序必須高於公關，你可以糊弄別人，但糊弄不了自然規律。
>
> —— 羅傑斯委員會報告（1986）
>

-------

在本書的 [第一部分](/tw/part-i) 中，我們討論了資料系統的各個方面，但僅限於資料儲存在單臺機器上的情況。
現在我們到了 [第二部分](/tw/part-ii)，進入更高的層次，並提出一個問題：如果 **多臺機器** 參與資料的儲存和檢索，會發生什麼？

你可能會出於各種各樣的原因，希望將資料庫分佈到多臺機器上：

可伸縮性
: 如果你的資料量、讀取負載、寫入負載超出單臺機器的處理能力，可以將負載分散到多臺計算機上。

容錯 / 高可用性
: 如果你的應用需要在單臺機器（或多臺機器，網路或整個資料中心）出現故障的情況下仍然能繼續工作，則可使用多臺機器，以提供冗餘。一臺故障時，另一臺可以接管。

延遲
: 如果在世界各地都有使用者，你也許會考慮在全球範圍部署多個伺服器，從而每個使用者可以從地理上最近的資料中心獲取服務，避免了等待網路資料包穿越半個世界。

## 伸縮至更高的負載

如果你需要的只是伸縮至更高的 **負載（load）**，最簡單的方法就是購買更強大的機器（有時稱為 **垂直伸縮**，即 vertical scaling，或 **向上伸縮**，即 scale up）。許多處理器，記憶體和磁碟可以在同一個作業系統下相互連線，快速的相互連線允許任意處理器訪問記憶體或磁碟的任意部分。在這種 **共享記憶體架構（shared-memory architecture）** 中，所有的元件都可以看作一臺單獨的機器。

> [!NOTE]
> 在大型機中，儘管任意處理器都可以訪問記憶體的任意部分，但總有一些記憶體區域與一些處理器更接近（稱為 **非均勻記憶體訪問（nonuniform memory access, NUMA）** [^1]）。為了有效利用這種架構特性，需要對處理進行細分，以便每個處理器主要訪問臨近的記憶體，這意味著即使表面上看起來只有一臺機器在執行，**分割槽（partitioning）** 仍然是必要的。

共享記憶體方法的問題在於，成本增長速度快於線性增長：一臺有著雙倍處理器數量，雙倍記憶體大小，雙倍磁碟容量的機器，通常成本會遠遠超過原來的兩倍。而且可能因為存在瓶頸，並不足以處理雙倍的載荷。

共享記憶體架構可以提供有限的容錯能力，高階機器可以使用熱插拔的元件（不關機更換磁碟，記憶體模組，甚至處理器）—— 但它必然囿於單個地理位置的桎梏。

另一種方法是 **共享磁碟架構（shared-disk architecture）**，它使用多臺具有獨立處理器和記憶體的機器，但將資料儲存在機器之間共享的磁碟陣列上，這些磁碟透過快速網路連線。這種架構用於某些資料倉庫，但競爭和鎖定的開銷限制了共享磁碟方法的可伸縮性 [^2]。

> [!NOTE]
> 網路附屬儲存（Network Attached Storage, NAS），或 **儲存區網路（Storage Area Network, SAN）**

### 無共享架構

相比之下，**無共享架構** [^3]（shared-nothing architecture，有時被稱為 **水平伸縮**，即 horizontal scaling，或 **向外伸縮**，即 scaling out）已經相當普及。
在這種架構中，執行資料庫軟體的每臺機器 / 虛擬機器都稱為 **節點（node）**。每個節點只使用各自的處理器，記憶體和磁碟。節點之間的任何協調，都是在軟體層面使用傳統網路實現的。

無共享系統不需要使用特殊的硬體，所以你可以用任意機器 —— 比如價效比最好的機器。你也許可以跨多個地理區域分佈資料從而減少使用者延遲，或者在損失一整個資料中心的情況下倖免於難。
隨著雲端虛擬機器部署的出現，即使是小公司，現在無需 Google 級別的運維，也可以實現異地分散式架構。

在這一部分裡，我們將重點放在無共享架構上。它不見得是所有場景的最佳選擇，但它是最需要你謹慎從事的架構。
如果你的資料分佈在多個節點上，你需要意識到這樣一個分散式系統中約束和權衡 —— 資料庫並不能魔術般地把這些東西隱藏起來。

雖然分散式無共享架構有許多優點，但它通常也會給應用帶來額外的複雜度，有時也會限制你可用資料模型的表達力。
在某些情況下，一個簡單的單執行緒程式可以比一個擁有超過 100 個 CPU 核的叢集表現得更好 [^4]。另一方面，無共享系統可以非常強大。接下來的幾章，將詳細討論分散式資料會帶來的問題。


### 複製 vs 分割槽

資料分佈在多個節點上有兩種常見的方式：

複製（Replication）
: 在幾個不同的節點上儲存資料的相同副本，可能放在不同的位置。複製提供了冗餘：如果一些節點不可用，剩餘的節點仍然可以提供資料服務。複製也有助於改善效能。[第六章](/tw/ch6) 將討論複製。

分割槽 (Partitioning)
: 將一個大型資料庫拆分成較小的子集（稱為 **分割槽**，即 partitions），從而不同的分割槽可以指派給不同的 **節點**（nodes，亦稱 **分片**，即 sharding）。[第七章](/tw/ch7) 將討論分割槽。

複製和分割槽是不同的機制，但它們經常同時使用。如 [圖 II-1](#fig_replication_partitioning) 所示。

{{< figure src="/v1/ddia_part-ii_01.png" id="fig_replication_partitioning" caption="圖 II-1 一個資料庫切分為兩個分割槽，每個分割槽都有兩個副本" class="w-full my-4" >}}


理解了這些概念，就可以開始討論在分散式系統中需要做出的困難抉擇。[第八章](/tw/ch8) 將討論 **事務（Transaction）**，這對於瞭解資料系統中可能出現的各種問題，以及我們可以做些什麼很有幫助。
[第九章](/tw/ch9) 和 [第十章](/tw/ch10) 將討論分散式系統的根本侷限性。

在本書的 [第三部分](/tw/part-iii) 中，將討論如何將多個（可能是分散式的）資料儲存整合為一個更大的系統，以滿足複雜的應用需求。但首先，我們來聊聊分散式的資料。


## [6. 複製](/tw/ch6)
- [單主複製](/tw/ch6#sec_replication_leader)
- [複製延遲的問題](/tw/ch6#sec_replication_lag)
- [多主複製](/tw/ch6#sec_replication_multi_leader)
- [無主複製](/tw/ch6#sec_replication_leaderless)
- [總結](/tw/ch6#summary)

## [7. 分片](/tw/ch7)
- [分片的利與弊](/tw/ch7#sec_sharding_reasons)
- [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
- [請求路由](/tw/ch7#sec_sharding_routing)
- [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)
- [總結](/tw/ch7#summary)

## [8. 事務](/tw/ch8)
- [事務到底是什麼？](/tw/ch8#sec_transactions_overview)
- [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
- [可序列化](/tw/ch8#sec_transactions_serializability)
- [分散式事務](/tw/ch8#sec_transactions_distributed)
- [總結](/tw/ch8#summary)
- [參考](/tw/ch8#參考)

## [9. 分散式系統的麻煩](/tw/ch9)
- [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
- [不可靠的網路](/tw/ch9#sec_distributed_networks)
- [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
- [知識、真相和謊言](/tw/ch9#sec_distributed_truth)
- [總結](/tw/ch9#summary)

## [10. 一致性與共識](/tw/ch10)
- [線性一致性](/tw/ch10#sec_consistency_linearizability)
- [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
- [共識](/tw/ch10#sec_consistency_consensus)
- [總結](/tw/ch10#summary)


### 參考

[^1]: Ulrich Drepper: “[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf),” akkadia.org, November 21, 2007.
[^2]: Ben Stopford: “[Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/),” benstopford.com, November 24, 2009.
[^3]: Michael Stonebraker: “[The Case for Shared Nothing](http://db.cs.berkeley.edu/papers/hpts85-nothing.pdf),” IEEE Database Engineering Bulletin, volume 9, number 1, pages 4–9, March 1986.
[^4]: Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at 15th USENIX Workshop on Hot Topics in Operating Systems (HotOS), May 2015.

================================================
FILE: content/tw/part-iii.md
================================================
---
title: 第三部分：派生資料
weight: 300
breadcrumbs: false
---

{{< callout type="warning" >}}
當前頁面來自本書第一版，第二版尚不可用
{{< /callout >}}

在本書的 [第一部分](/tw/part-i) 和 [第二部分](/tw/part-ii) 中，我們自底向上地把所有關於分散式資料庫的主要考量都過了一遍。從資料在磁碟上的佈局，一直到出現故障時分散式系統一致性的侷限。但所有的討論都假定了應用中只用了一種資料庫。

現實世界中的資料系統往往更為複雜。大型應用程式經常需要以多種方式訪問和處理資料，沒有一個資料庫可以同時滿足所有這些不同的需求。因此應用程式通常組合使用多種元件：資料儲存、索引、快取、分析系統等等，並實現在這些元件中移動資料的機制。

本書的最後一部分，會研究將多個不同資料系統（可能有著不同資料模型，並針對不同的訪問模式進行最佳化）整合為一個協調一致的應用架構時，會遇到的問題。軟體供應商經常會忽略這一方面的生態建設，並聲稱他們的產品能夠滿足你的所有需求。在現實世界中，整合不同的系統是實際應用中最重要的事情之一。

## 記錄系統和派生資料系統

從高層次上看，儲存和處理資料的系統可以分為兩大類：

權威記錄系統（System of record）
: **記錄系統**，也被稱為 **真相源（source of truth）**，持有資料的權威版本。當新的資料進入時（例如，使用者輸入）首先會記錄在這裡。
 每個事實正正好好表示一次（表示通常是 **正規化的**，即 normalized）。如果其他系統和 **記錄系統** 之間存在任何差異，那麼記錄系統中的值是正確的（根據定義）。

派生資料系統（Derived data systems）
: **派生系統** 中的資料，通常是另一個系統中的現有資料以某種方式進行轉換或處理的結果。如果丟失派生資料，可以從原始來源重新建立。
 典型的例子是 **快取（cache）**：如果資料在快取中，就可以由快取提供服務；如果快取不包含所需資料，則降級由底層資料庫提供。反正規化的值，索引和物化檢視亦屬此類。在推薦系統中，預測彙總資料通常派生自使用者日誌。

從技術上講，派生資料是 **冗餘的（redundant）**，因為它重複了已有的資訊。但是派生資料對於獲得良好的只讀查詢效能通常是至關重要的。它通常是反正規化的。可以從單個源頭派生出多個不同的資料集，使你能從不同的 “視角” 洞察資料。

並不是所有的系統都在其架構中明確區分 **記錄系統** 和 **派生資料系統**，但是這是一種有用的區分方式，因為它明確了系統中的資料流：系統的哪一部分具有哪些輸入和哪些輸出，以及它們如何相互依賴。

大多數資料庫，儲存引擎和查詢語言，本質上既不是記錄系統也不是派生系統。資料庫只是一個工具：如何使用它取決於你自己。**記錄系統和派生資料系統之間的區別不在於工具，而在於應用程式中的使用方式。**

透過梳理資料的派生關係，可以清楚地理解一個令人困惑的系統架構。這將貫穿本書的這一部分。

## 章節概述

我們將從 [第十一章](/tw/ch11) 開始，研究例如 MapReduce 這樣 **面向批處理（batch-oriented）** 的資料流系統。對於建設大規模資料系統，我們將看到，它們提供了優秀的工具和思想。
[第十二章](/tw/ch12) 將把這些思想應用到 **流式資料（data streams）** 中，使我們能用更低的延遲完成同樣的任務。[第十三章](/tw/ch13) 將探討如何使用這些工具來構建可靠、可伸縮和可維護的應用。[第十四章](/tw/ch14) 將以倫理、隱私與社會影響為主題，為全書收束。


## 索引

## [11. 批處理](/tw/ch11)
- [使用 Unix 工具的批處理](/tw/ch11#sec_batch_unix)
- [分散式系統中的批處理](/tw/ch11#sec_batch_distributed)
- [批處理模型](/tw/ch11#id431)
- [批處理用例](/tw/ch11#sec_batch_output)
- [本章小結](/tw/ch11#id292)
- [參考文獻](/tw/ch11#references)

## [12. 流處理](/tw/ch12)
- [傳遞事件流](/tw/ch12#sec_stream_transmit)
- [資料庫與流](/tw/ch12#sec_stream_databases)
- [流處理](/tw/ch12#sec_stream_processing)
- [本章小結](/tw/ch12#id332)
- [參考文獻](/tw/ch12#references)

## [13. 流式系統的哲學](/tw/ch13)
- [資料整合](/tw/ch13#sec_future_integration)
- [分拆資料庫](/tw/ch13#sec_future_unbundling)
- [追求正確性](/tw/ch13#sec_future_correctness)
- [本章小結](/tw/ch13#id367)
- [參考文獻](/tw/ch13#references)

## [14. 將事情做正確](/tw/ch14)
- [預測分析](/tw/ch14#id369)
- [隱私與追蹤](/tw/ch14#id373)
- [總結](/tw/ch14#id594)
- [參考文獻](/tw/ch14#references)

================================================
FILE: content/tw/preface.md
================================================
---
title: 序言
weight: 50
breadcrumbs: false
---

{{< callout type="warning" >}}
當前頁面來自本書第一版，第二版尚不可用
{{< /callout >}}

如果近幾年從業於軟體工程，特別是伺服器端和後端系統開發，那麼你很有可能已經被大量關於資料儲存和處理的時髦詞彙轟炸過了： NoSQL！大資料！Web-Scale！分片！最終一致性！ACID！CAP 定理！雲服務！MapReduce！即時！

在最近十年中，我們看到了很多有趣的進展，關於資料庫，分散式系統，以及在此基礎上構建應用程式的方式。這些進展有著各種各樣的驅動力：

* 谷歌、雅虎、亞馬遜、臉書、領英、微軟和推特等網際網路公司正在和巨大的流量 / 資料打交道，這迫使他們去創造能有效應對如此規模的新工具。
* 企業需要變得敏捷，需要低成本地檢驗假設，需要透過縮短開發週期和保持資料模型的靈活性，快速地響應新的市場洞察。
* 免費和開源軟體變得非常成功，在許多環境中比商業軟體和定製軟體更受歡迎。
* 處理器主頻幾乎沒有增長，但是多核處理器已經成為標配，網路也越來越快。這意味著並行化程度只增不減。
* 即使你在一個小團隊中工作，現在也可以構建分佈在多臺計算機甚至多個地理區域的系統，這要歸功於譬如亞馬遜網路服務（AWS）等基礎設施即服務（IaaS）概念的踐行者。
* 許多服務都要求高可用，因停電或維護導致的服務不可用，變得越來越難以接受。

**資料密集型應用（data-intensive applications）** 正在透過使用這些技術進步來推動可能性的邊界。一個應用被稱為 **資料密集型** 的，如果 **資料是其主要挑戰**（資料量，資料複雜度或資料變化速度）—— 與之相對的是 **計算密集型**，即處理器速度是其瓶頸。

幫助資料密集型應用儲存和處理資料的工具與技術，正迅速地適應這些變化。新型資料庫系統（“NoSQL”）已經備受關注，而訊息佇列，快取，搜尋索引，批處理和流處理框架以及相關技術也非常重要。很多應用組合使用這些工具與技術。

這些生意盎然的時髦詞彙體現出人們對新的可能性的熱情，這是一件好事。但是作為軟體工程師和架構師，如果要開發優秀的應用，我們還需要對各種層出不窮的技術及其利弊權衡有精準的技術理解。為了獲得這種洞察，我們需要深挖時髦詞彙背後的內容。

幸運的是，在技術迅速變化的背後總是存在一些持續成立的原則，無論你使用了特定工具的哪個版本。如果你理解了這些原則，就可以領會這些工具的適用場景，如何充分利用它們，以及如何避免其中的陷阱。這正是本書的初衷。

本書的目標是幫助你在飛速變化的資料處理和資料儲存技術大觀園中找到方向。本書並不是某個特定工具的教程，也不是一本充滿枯燥理論的教科書。相反，我們將看到一些成功資料系統的樣例：許多流行應用每天都要在生產中滿足可伸縮性、效能、以及可靠性的要求，而這些技術構成了這些應用的基礎。

我們將深入這些系統的內部，理清它們的關鍵演算法，討論背後的原則和它們必須做出的權衡。在這個過程中，我們將嘗試尋找 **思考** 資料系統的有效方式 —— 不僅關於它們 **如何** 工作，還包括它們 **為什麼** 以這種方式工作，以及哪些問題是我們需要問的。

閱讀本書後，你能很好地決定哪種技術適合哪種用途，並瞭解如何將工具組合起來，為一個良好應用架構奠定基礎。本書並不足以使你從頭開始構建自己的資料庫儲存引擎，不過幸運的是這基本上很少有必要。你將獲得對系統底層發生事情的敏銳直覺，這樣你就有能力推理它們的行為，做出優秀的設計決策，並追蹤任何可能出現的問題。


## 本書的目標讀者

如果你開發的應用具有用於儲存或處理資料的某種伺服器 / 後端系統，而且使用網路（例如，Web 應用、移動應用或連線到網際網路的感測器），那麼本書就是為你準備的。

本書是為軟體工程師，軟體架構師，以及喜歡寫程式碼的技術經理準備的。如果你需要對所從事系統的架構做出決策 —— 例如你需要選擇解決某個特定問題的工具，並找出如何最好地使用這些工具，那麼這本書對你尤有價值。但即使你無法選擇你的工具，本書仍將幫助你更好地瞭解所使用工具的長處和短處。

你應當具有一些開發 Web 應用或網路服務的經驗，且應當熟悉關係型資料庫和 SQL。任何你瞭解的非關係型資料庫和其他與資料相關工具都會有所幫助，但不是必需的。對常見網路協議如 TCP 和 HTTP 的大概理解是有幫助的。程式語言或框架的選擇對閱讀本書沒有任何不同影響。

如果以下任意一條對你為真，你會發現這本書很有價值：

* 你想了解如何使資料系統可伸縮，例如，支援擁有數百萬使用者的 Web 或移動應用。
* 你需要提高應用程式的可用性（最大限度地減少停機時間），保持穩定執行。
* 你正在尋找使系統在長期執行過程易於維護的方法，即使系統規模增長，需求與技術也發生變化。
* 你對事物的運作方式有著天然的好奇心，並且希望知道一些主流網站和線上服務背後發生的事情。這本書剖析了各種資料庫和資料處理系統的內部原理，探索這些系統設計中的智慧是非常有趣的。

有時在討論可伸縮的資料系統時，人們會說：“你又不在谷歌或亞馬遜，別操心可伸縮性了，直接上關係型資料庫”。這個陳述有一定的道理：為了不必要的伸縮性而設計程式，不僅會浪費不必要的精力，並且可能會把你鎖死在一個不靈活的設計中。實際上這是一種 “過早最佳化” 的形式。不過，選擇合適的工具確實很重要，而不同的技術各有優缺點。我們將看到，關係資料庫雖然很重要，但絕不是資料處理的終章。


## 本書涉及的領域

本書並不會嘗試告訴讀者如何安裝或使用特定的軟體包或 API，因為已經有大量文件給出了詳細的使用說明。相反，我們會討論資料系統的基礎 —— 各種原則與利弊權衡，並探討了不同產品所做出的不同設計決策。

在電子書中包含了線上資源全文的連結。所有連結在出版時都進行了驗證，但不幸的是，由於網路的自然規律，連結往往會頻繁地破損。如果你遇到連結斷開的情況，或者正在閱讀本書的列印副本，可以使用搜尋引擎查詢參考文獻。對於學術論文，你可以在 Google 學術中搜尋標題，查詢可以公開獲取的 PDF 檔案。或者，你也可以在 https://github.com/ept/ddia-references 中找到所有的參考資料，我們在那兒維護最新的連結。

我們主要關注的是資料系統的 **架構（architecture）**，以及它們被整合到資料密集型應用中的方式。本書沒有足夠的空間覆蓋部署、運維、安全、管理等領域 —— 這些都是複雜而重要的主題，僅僅在本書中用粗略的註解討論這些對它們很不公平。每個領域都值得用單獨的書去講。

本書中描述的許多技術都被涵蓋在 **大資料（Big Data）** 這個時髦詞的範疇中。然而 “大資料” 這個術語被濫用，缺乏明確定義，以至於在嚴肅的工程討論中沒有用處。這本書使用歧義更小的術語，如 “單節點” 之於 “分散式系統”，或 “線上 / 互動式系統” 之於 “離線 / 批處理系統”。

本書對 **自由和開源軟體（FOSS）** 有一定偏好，因為閱讀、修改和執行原始碼是瞭解某事物詳細工作原理的好方法。開放的平臺也可以降低供應商壟斷的風險。然而在適當的情況下，我們也會討論專利軟體（閉源軟體，軟體即服務 SaaS，或一些在文獻中描述過但未公開發行的公司內部軟體）。

## 本書綱要

本書分為三部分：

1. 在 [第一部分](/tw/part-i) 中，我們會討論設計資料密集型應用所賴的基本思想。我們從 [第一章](/tw/ch1) 開始，討論我們實際要達到的目標：可靠性、可伸縮性和可維護性；我們該如何思考這些概念；以及如何實現它們。在 [第二章](/tw/ch2) 中，我們比較了幾種不同的資料模型和查詢語言，看看它們如何適用於不同的場景。在 [第三章](/tw/ch3) 中將討論儲存引擎：資料庫如何在磁碟上擺放資料，以便能高效地再次找到它。[第四章](/tw/ch4) 轉向資料編碼（序列化），以及隨時間演化的模式。

2. 在 [第二部分](/tw/part-ii) 中，我們從討論儲存在一臺機器上的資料轉向討論分佈在多臺機器上的資料。這對於可伸縮性通常是必需的，但帶來了各種獨特的挑戰。我們首先討論複製（[第五章](/tw/ch5)）、分割槽 / 分片（[第六章](/tw/ch6)）和事務（[第七章](/tw/ch7)）。然後我們將探索關於分散式系統問題的更多細節（[第八章](/tw/ch8)），以及在分散式系統中實現一致性與共識意味著什麼（[第九章](/tw/ch9)）。

3. 在 [第三部分](/tw/part-iii) 中，我們討論那些從其他資料集派生出一些資料集的系統。派生資料經常出現在異構系統中：當沒有單個資料庫可以把所有事情都做得很好時，應用需要整合幾種不同的資料庫、快取、索引等。在 [第十章](/tw/ch10) 中我們將從一種派生資料的批處理方法開始，然後在此基礎上建立在 [第十一章](/tw/ch11) 中討論的流處理。最後，在 [第十二章](/tw/ch12) 中，我們將所有內容彙總，討論在將來構建可靠、可伸縮和可維護的應用程式的方法。


## 參考文獻與延伸閱讀

本書中討論的大部分內容已經在其它地方以某種形式出現過了 —— 會議簡報、研究論文、部落格文章、程式碼、BUG 跟蹤器、郵件列表以及工程習慣中。本書總結了不同來源資料中最重要的想法，並在文字中包含了指向原始文獻的連結。如果你想更深入地探索一個領域，那麼每章末尾的參考文獻都是很好的資源，其中大部分可以免費線上獲取。


## O’Reilly Safari

[Safari](http://oreilly.com/safari) (formerly Safari Books Online) is a membership-based training and reference platform for enterprise, government, educators, and individuals.

Members have access to thousands of books, training videos, Learning Paths, interactive tutorials, and curated playlists from over 250 publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others.

For more information, please visit http://oreilly.com/safari.


## 聯絡我們

有關本書的評論和問題，請聯絡出版社：

O’Reilly Media, Inc.
1005 Gravenstein Highway North
Sebastopol, CA 95472
800-998-9938（美國或加拿大）
707-829-0515（國際或本地）
707-829-0104（傳真）

我們為本書提供了網頁，會在上面列出勘誤、示例以及任何補充資訊。你可以訪問：*http://bit.ly/designing-data-intensive-apps*。

如需發表評論或提出技術問題，請傳送郵件至：*bookquestions@oreilly.com*。

有關 O’Reilly 圖書、課程、會議和新聞的更多資訊，請訪問：*http://www.oreilly.com*。

* Facebook: [http://facebook.com/oreilly](http://facebook.com/oreilly)
* Twitter: [http://twitter.com/oreillymedia](http://twitter.com/oreillymedia)
* YouTube: [http://www.youtube.com/oreillymedia](http://www.youtube.com/oreillymedia)


## 致謝

本書融合了學術研究和工業實踐的經驗，融合並系統化了大量其他人的想法與知識。在計算領域，我們往往會被各種新鮮花樣所吸引，但我認為前人完成的工作中，有太多值得我們學習的地方了。本書有 800 多處引用：文章、部落格、講座、文件等，對我來說這些都是寶貴的學習資源。我非常感謝這些材料的作者分享他們的知識。

我也從與人交流中學到了很多東西，很多人花費了寶貴的時間與我討論想法並耐心解釋。特別感謝 Joe Adler, Ross Anderson, Peter Bailis, Márton Balassi, Alastair Beresford, Mark Callaghan, Mat Clayton, Patrick Collison, Sean Cribbs, Shirshanka Das, Niklas Ekström, Stephan Ewen, Alan Fekete, Gyula Fóra, Camille Fournier, Andres Freund, John Garbutt, Seth Gilbert, Tom Haggett, Pat Helland, Joe Hellerstein, Jakob Homan, Heidi Howard, John Hugg, Julian Hyde, Conrad Irwin, Evan Jones, Flavio Junqueira, Jessica Kerr, Kyle Kingsbury, Jay Kreps, Carl Lerche, Nicolas Liochon, Steve Loughran, Lee Mallabone, Nathan Marz, Caitie McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede, Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann, Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, 以及 Brett Wooldridge。

更多人透過審閱草稿並提供反饋意見在本書的創作過程中做出了無價的貢獻。我要特別感謝 Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj, David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard, Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski, Phil Potter, Hamid Ramazani, Sam Stokes, 以及 Ben Summers。當然對於本書中的任何遺留錯誤或難以接受的見解，我都承擔全部責任。

為了幫助這本書落地，並且耐心地處理我緩慢的寫作和不尋常的要求，我要對編輯 Marie Beaugureau，Mike Loukides，Ann Spencer 和 O’Reilly 的所有團隊表示感謝。我要感謝 Rachel Head 幫我找到了合適的術語。我要感謝 Alastair Beresford，Susan Goodhue，Neha Narkhede 和 Kevin Scott，在其他工作事務之外給了我充分的創作時間和自由。

特別感謝 Shabbir Diwan 和 Edie Freedman，他們非常用心地為各章配了地圖。他們提出了不落俗套的靈感，創作了這些地圖，美麗而引人入勝，真是太棒了。

最後我要表達對家人和朋友們的愛，沒有他們，我將無法走完這個將近四年的寫作歷程。你們是最棒的。

================================================
FILE: content/tw/toc.md
================================================
---
title: "目錄"
linkTitle: "目錄"
weight: 10
breadcrumbs: false
---


![](/title.jpg)


## [序言](/tw/preface)
- [本書的目標讀者](/tw/preface#本書的目標讀者)
- [本書涉及的領域](/tw/preface#本書涉及的領域)
- [本書綱要](/tw/preface#本書綱要)
- [參考文獻與延伸閱讀](/tw/preface#參考文獻與延伸閱讀)
- [O’Reilly Safari](/tw/preface#oreilly-safari)
- [致謝](/tw/preface#致謝)

## [1. 資料系統架構中的權衡](/tw/ch1)
- [分析型與事務型系統](/tw/ch1#sec_introduction_analytics)
- [雲服務與自託管](/tw/ch1#sec_introduction_cloud)
- [分散式與單節點系統](/tw/ch1#sec_introduction_distributed)
- [資料系統、法律與社會](/tw/ch1#sec_introduction_compliance)
- [總結](/tw/ch1#summary)

## [2. 定義非功能性需求](/tw/ch2)
- [案例研究：社交網路首頁時間線](/tw/ch2#sec_introduction_twitter)
- [描述效能](/tw/ch2#sec_introduction_percentiles)
- [可靠性與容錯](/tw/ch2#sec_introduction_reliability)
- [可伸縮性](/tw/ch2#sec_introduction_scalability)
- [可運維性](/tw/ch2#sec_introduction_maintainability)
- [總結](/tw/ch2#summary)

## [3. 資料模型與查詢語言](/tw/ch3)
- [關係模型與文件模型](/tw/ch3#sec_datamodels_history)
- [圖資料模型](/tw/ch3#sec_datamodels_graph)
- [事件溯源與 CQRS](/tw/ch3#sec_datamodels_events)
- [資料框、矩陣與陣列](/tw/ch3#sec_datamodels_dataframes)
- [總結](/tw/ch3#summary)

## [4. 儲存與檢索](/tw/ch4)
- [OLTP 系統的儲存與索引](/tw/ch4#sec_storage_oltp)
- [分析型資料儲存](/tw/ch4#sec_storage_analytics)
- [多維索引與全文索引](/tw/ch4#sec_storage_multidimensional)
- [總結](/tw/ch4#summary)

## [5. 編碼與演化](/tw/ch5)
- [編碼資料的格式](/tw/ch5#sec_encoding_formats)
- [資料流的模式](/tw/ch5#sec_encoding_dataflow)
- [總結](/tw/ch5#summary)

## [6. 複製](/tw/ch6)
- [單主複製](/tw/ch6#sec_replication_leader)
- [複製延遲的問題](/tw/ch6#sec_replication_lag)
- [多主複製](/tw/ch6#sec_replication_multi_leader)
- [無主複製](/tw/ch6#sec_replication_leaderless)
- [總結](/tw/ch6#summary)

## [7. 分片](/tw/ch7)
- [分片的利與弊](/tw/ch7#sec_sharding_reasons)
- [鍵值資料的分片](/tw/ch7#sec_sharding_key_value)
- [請求路由](/tw/ch7#sec_sharding_routing)
- [分片與二級索引](/tw/ch7#sec_sharding_secondary_indexes)
- [總結](/tw/ch7#summary)

## [8. 事務](/tw/ch8)
- [事務到底是什麼？](/tw/ch8#sec_transactions_overview)
- [弱隔離級別](/tw/ch8#sec_transactions_isolation_levels)
- [可序列化](/tw/ch8#sec_transactions_serializability)
- [分散式事務](/tw/ch8#sec_transactions_distributed)
- [總結](/tw/ch8#summary)
- [參考](/tw/ch8#參考)

## [9. 分散式系統的麻煩](/tw/ch9)
- [故障與部分失效](/tw/ch9#sec_distributed_partial_failure)
- [不可靠的網路](/tw/ch9#sec_distributed_networks)
- [不可靠的時鐘](/tw/ch9#sec_distributed_clocks)
- [知識、真相和謊言](/tw/ch9#sec_distributed_truth)
- [總結](/tw/ch9#summary)

## [10. 一致性與共識](/tw/ch10)
- [線性一致性](/tw/ch10#sec_consistency_linearizability)
- [ID 生成器和邏輯時鐘](/tw/ch10#sec_consistency_logical)
- [共識](/tw/ch10#sec_consistency_consensus)
- [總結](/tw/ch10#summary)

## [11. 批處理](/tw/ch11)
- [使用 Unix 工具的批處理](/tw/ch11#sec_batch_unix)
- [分散式系統中的批處理](/tw/ch11#sec_batch_distributed)
- [批處理模型](/tw/ch11#id431)
- [批處理用例](/tw/ch11#sec_batch_output)
- [本章小結](/tw/ch11#id292)
- [參考文獻](/tw/ch11#references)

## [12. 流處理](/tw/ch12)
- [傳遞事件流](/tw/ch12#sec_stream_transmit)
- [資料庫與流](/tw/ch12#sec_stream_databases)
- [流處理](/tw/ch12#sec_stream_processing)
- [本章小結](/tw/ch12#id332)
- [參考文獻](/tw/ch12#references)

## [13. 流式系統的哲學](/tw/ch13)
- [資料整合](/tw/ch13#sec_future_integration)
- [分拆資料庫](/tw/ch13#sec_future_unbundling)
- [追求正確性](/tw/ch13#sec_future_correctness)
- [本章小結](/tw/ch13#id367)
- [參考文獻](/tw/ch13#references)

## [14. 將事情做正確](/tw/ch14)
- [預測分析](/tw/ch14#id369)
- [隱私與追蹤](/tw/ch14#id373)
- [總結](/tw/ch14#id594)
- [參考文獻](/tw/ch14#references)

## [術語表](/tw/glossary)

## [後記](/tw/colophon)
- [關於作者](/tw/colophon#關於作者)
- [關於譯者](/tw/colophon#關於譯者)
- [後記](/tw/colophon#後記)

================================================
FILE: content/v1/_index.md
================================================
---
title: 设计数据密集型应用（第一版）
linkTitle: DDIA
cascade:
  type: docs
breadcrumbs: false
---


**作者**： [Martin Kleppmann](https://martin.kleppmann.com)，[《Designing Data-Intensive Applications 2nd Edition》](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html) ： 英国剑桥大学分布式系统研究员，演讲者，博主和开源贡献者，软件工程师和企业家，曾在 LinkedIn 和 Rapportive 负责数据基础架构。

**译者**：[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
[**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，公众号：《老冯云数》，数据库 KOL。

**校订**： [@yingang](https://github.com/yingang)  ｜  [繁體中文](/tw) **版本维护** by  [@afunTW](https://github.com/afunTW) ｜ [完整贡献者列表](/contrib)

> [!NOTE]
> DDIA [**第二版**](/zh) 正在翻译中 ([`content/v2`](https://github.com/Vonng/ddia/tree/main) 目录)，欢迎加入并提出您的宝贵意见！[点击此处阅览第二版](/zh)。


## 译序

> 不懂数据库的全栈工程师不是好架构师 —— 冯若航 / Vonng

现今，尤其是在互联网领域，大多数应用都属于数据密集型应用。本书从底层数据结构到顶层架构设计，将数据系统设计中的精髓娓娓道来。其中的宝贵经验无论是对架构师、DBA、还是后端工程师、甚至产品经理都会有帮助。

这是一本理论结合实践的书，书中很多问题，译者在实际场景中都曾遇到过，读来让人击节扼腕。如果能早点读到这本书，该少走多少弯路啊！

这也是一本深入浅出的书，讲述概念的来龙去脉而不是卖弄定义，介绍事物发展演化历程而不是事实堆砌，将复杂的概念讲述得浅显易懂，但又直击本质不失深度。每章最后的引用质量非常好，是深入学习各个主题的绝佳索引。

本书为数据系统的设计、实现、与评价提供了很好的概念框架。读完并理解本书内容后，读者可以轻松看破大多数的技术忽悠，与技术砖家撕起来虎虎生风。

这是 2017 年译者读过最好的一本技术类书籍，这么好的书没有中文翻译，实在是遗憾。某不才，愿为先进技术文化的传播贡献一份力量。既可以深入学习有趣的技术主题，又可以锻炼中英文语言文字功底，何乐而不为？


## 前言

> 在我们的社会中，技术是一种强大的力量。数据、软件、通信可以用于坏的方面：不公平的阶级固化，损害公民权利，保护既得利益集团。但也可以用于好的方面：让底层人民发出自己的声音，让每个人都拥有机会，避免灾难。本书献给所有将技术用于善途的人们。


> 计算是一种流行文化，流行文化鄙视历史。流行文化关乎个体身份和参与感，但与合作无关。流行文化活在当下，也与过去和未来无关。我认为大部分（为了钱）编写代码的人就是这样的，他们不知道自己的文化来自哪里。
>
>  —— 阿兰・凯接受 Dobb 博士的杂志采访时（2012 年）


## 目录

### [序言](/v1/preface)

### [第一部分：数据系统基础](/v1/part-i)

* [第一章：可靠性、可伸缩性和可维护性](/v1/ch1)
* [第二章：数据模型与查询语言](/v1/ch2)
* [第三章：存储与检索](/v1/ch3)
* [第四章：编码与演化](/v1/ch4)

### [第二部分：分布式数据](/v1/part-ii)

* [第五章：复制](/v1/ch5)
* [第六章：分区](/v1/ch6)
* [第七章：事务](/v1/ch7)
* [第八章：分布式系统的麻烦](/v1/ch8)
* [第九章：一致性与共识](/v1/ch9)

### [第三部分：衍生数据](/v1/part-iii)

* [第十章：批处理](/v1/ch10)
* [第十一章：流处理](/v1/ch11)
* [第十二章：数据系统的未来](/v1/ch12)

### [术语表](/v1/glossary)

### [后记](/v1/colophon)

<br>

---------

## 法律声明

从原作者处得知，已经有简体中文的翻译计划，将于 2018 年末完成。[购买地址](https://search.jd.com/Search?keyword=设计数据密集型应用)

译者纯粹出于 **学习目的** 与 **个人兴趣** 翻译本书，不追求任何经济利益。

译者保留对此版本译文的署名权，其他权利以原作者和出版社的主张为准。

本译文只供学习研究参考之用，不得公开传播发行或用于商业用途。有能力阅读英文书籍者请购买正版支持。


---------

## 贡献

0. 全文校订 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章语法标点校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 与[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. 第一部分前言，ch2 校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. 词汇表、后记关于野猪的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. 繁體中文版本与转换脚本 by [@afunTW](https://github.com/afunTW)
7. 多处翻译修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)
8. [感谢所有作出贡献，提出意见的朋友们](/contrib)：

<details>
<summary><a href="https://github.com/Vonng/ddia/pulls">Pull Requests</a> & <a href="https://github.com/Vonng/ddia/issues">Issues</a></summary>

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [386](https://github.com/Vonng/ddia/pull/386)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch2: 优化一处翻译                                                    |
| [384](https://github.com/Vonng/ddia/pull/384)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 优化中文文档的措辞和表达                                              |
| [383](https://github.com/Vonng/ddia/pull/383)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 修正 ch4 中的术语和表达错误                                          |
| [382](https://github.com/Vonng/ddia/pull/382)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch1: 优化一处翻译                                                    |
| [381](https://github.com/Vonng/ddia/pull/381)   | [@Max-Tortoise](https://github.com/Max-Tortoise)           | ch4: 修正一处术语不完整问题                                               |
| [377](https://github.com/Vonng/ddia/pull/377)   | [@huang06](https://github.com/huang06)                     | 优化翻译术语                                                        |
| [375](https://github.com/Vonng/ddia/issues/375) | [@z-soulx](https://github.com/z-soulx)                     | 对于是否100%全中文翻译的必要性讨论？个人-没必要100%，特别是“名词”，有原单词更加适合it人员                 |
| [371](https://github.com/Vonng/ddia/pull/371)   | [@lewiszlw](https://github.com/lewiszlw)                   | CPU core -> CPU 核心                                          |
| [369](https://github.com/Vonng/ddia/pull/369)   | [@bbwang-gl](https://github.com/bbwang-gl)                 | ch7: 可串行化快照隔离检测一个事务何时修改另一个事务的读取                                 |
| [368](https://github.com/Vonng/ddia/pull/368)   | [@yhao3](https://github.com/yhao3)                         | 更新 zh-tw.py 与 zh-tw 内容                                       |
| [367](https://github.com/Vonng/ddia/pull/367)   | [@yhao3](https://github.com/yhao3)                         | 修正拼写、格式和标点问题                                                  |
| [366](https://github.com/Vonng/ddia/pull/366)   | [@yangshangde](https://github.com/yangshangde)             | ch8: 将“电源失败”改为“电源失效”                                           |
| [365](https://github.com/Vonng/ddia/pull/365)   | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [364](https://github.com/Vonng/ddia/issues/364) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [363](https://github.com/Vonng/ddia/pull/363)   | [@xyohn](https://github.com/xyohn)                         | #362: 优化一处翻译                                                 |
| [362](https://github.com/Vonng/ddia/issues/362) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化一处翻译                                                   |
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一处拼写错误                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一处拼写错误                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一处标点错误                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一处格式错误                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一处参考链接                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正两处引用错误                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支持输出为 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一处格式错误                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一处图像链接                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 优化一处翻译                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 优化一处翻译                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 优化两处翻译                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 优化多处翻译                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 优化一处翻译                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一处繁体中文错误                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一处繁体中文错误                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一处翻译错误                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正几处拼写错误                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 优化一处翻译                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一处翻译错误                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一处翻译遗漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 优化一处翻译                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 优化一处翻译                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 优化一处翻译                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 优化一处翻译                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正两处错误                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一处列表错误                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一处错别字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一处公式问题                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多处内部链接错误                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正内部链接错误                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 显示的问题                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 发现了繁体中文版本中的错误翻译                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 链接                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正错别字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 统一了 write skew 的翻译                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 统一了 rebalancing 的翻译                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻译                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正译文中的重复单词                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不准确的翻译                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一处翻译错误                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一处拼写错误                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可串行化”相关内容的多处翻译不当                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重复读”相关内容的多处翻译不当                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“读已提交”相关内容的多处翻译不当                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁体中文的转译错误                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁体中文的转译错误                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通顺的翻译                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通顺的翻译                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正确的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通顺的翻译                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻译                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正错误的图片链接                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁体中文的转译错误：复杂                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正导航栏中的章节名称                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正线性一致的繁体中文翻译                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正错误的翻译                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 优化译文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通顺的翻译                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不准确的翻译                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻译                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正错别字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小标题跳转的问题                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的网址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正错别字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建议docsify的主题风格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻译错误                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁体中文的转译错误                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支持 Github Pages 里的公式显示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 语义网相关翻译更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不变式相关翻译更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正确的中文用词和标点符号                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻译                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重复的译文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通顺的翻译                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 发现错误的文献索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正错误的标点符号                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正错误字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建议将 network model 翻译为网状模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正错误字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正缩略图的错别字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修订sibling相关的翻译                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一处不准确的翻译                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 识别了当前简繁转译过程中处理不当的地方，暂通过转换脚本规避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻译`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新残留的机翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建议去除段首的制表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 发现一处错误格式的章节引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章节Summary中多处不通顺的翻译                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多处不通顺的或错误的翻译                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 优化多处不通顺的或错误的翻译                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 优化多处不通顺的或错误的翻译                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化多处错误的或不通顺的翻译                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化一处容易产生歧义的翻译                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正两处错误的翻译                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正两处强调文本和四处代码变量名称                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一处错误的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一处错误的翻译（功能 -> 函数）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 优化 how best 的翻译（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 统一每章的标题格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重复词语                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改语句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 读已写入数据                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死还是赖活着                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼杀 → 破坏                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改变" rathr than "盖面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:钟-->种，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否则 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->调用，显着->显著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改链接错误                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4几处翻译                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 几个疏漏和格式错误                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 删除一个多余的右括号                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部链接错误                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假简单"->"更加简单"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修复 ch1 中的无序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 纠正多处的翻译小错误                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修复多个链接错误 2.名词优化修订 3.错误修订                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修复一些明显错误                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修复链接错误                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改词语顺序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正错别字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 纠正翻译错误                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目录和本章标题不符的情况                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修复语句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 详细修改了后记中和印度野猪相关的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻译                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01语法微调                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |

</details><br />


---------

## 许可证

本项目采用 [CC-BY 4.0](https://github.com/Vonng/ddia/blob/master/LICENSE) 许可证，您可以在这里找到完整说明：

- [署名 4.0 协议国际版 CC BY 4.0 Deed](https://creativecommons.org/licenses/by/4.0/deed.zh-hans)
- [Attribution 4.0 International CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.en)


================================================
FILE: content/v1/ch1.md
================================================
---
title: "第一章：可靠性、可伸缩性和可维护性"
linkTitle: "1. 可靠性、可伸缩性和可维护性"
weight: 101
breadcrumbs: false
---


![](/map/ch01.png)

> 互联网做得太棒了，以至于大多数人将它看作像太平洋这样的自然资源，而不是什么人工产物。上一次出现这种大规模且无差错的技术，你还记得是什么时候吗？
>
> —— [艾伦・凯](http://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442) 在接受 Dobb 博士杂志采访时说（2012 年）

现今很多应用程序都是 **数据密集型（data-intensive）** 的，而非 **计算密集型（compute-intensive）** 的。因此 CPU 很少成为这类应用的瓶颈，更大的问题通常来自数据量、数据复杂性、以及数据的变更速度。

数据密集型应用通常由标准组件构建而成，标准组件提供了很多通用的功能；例如，许多应用程序都需要：

 - 存储数据，以便自己或其他应用程序之后能再次找到 （*数据库，即 databases*）
 - 记住开销昂贵操作的结果，加快读取速度（*缓存，即 caches*）
 - 允许用户按关键字搜索数据，或以各种方式对数据进行过滤（*搜索索引，即 search indexes*）
 - 向其他进程发送消息，进行异步处理（*流处理，即 stream processing*）
 - 定期处理累积的大批量数据（*批处理，即 batch processing*）

如果这些功能听上去平淡无奇，那是因为这些 **数据系统（data system）** 是非常成功的抽象：我们一直不假思索地使用它们并习以为常。绝大多数工程师不会幻想从零开始编写存储引擎，因为在开发应用时，数据库已经是足够完美的工具了。

但现实没有这么简单。不同的应用有着不同的需求，因而数据库系统也是百花齐放，有着各式各样的特性。实现缓存有很多种手段，创建搜索索引也有好几种方法，诸如此类。因此在开发应用前，我们依然有必要先弄清楚最适合手头工作的工具和方法。而且当单个工具解决不了你的问题时，组合使用这些工具可能还是有些难度的。

本书将是一趟关于数据系统原理、实践与应用的旅程，并讲述了设计数据密集型应用的方法。我们将探索不同工具之间的共性与特性，以及各自的实现原理。

本章将从我们所要实现的基础目标开始：可靠、可伸缩、可维护的数据系统。我们将澄清这些词语的含义，概述考量这些目标的方法。并回顾一些后续章节所需的基础知识。在接下来的章节中我们将抽丝剥茧，研究设计数据密集型应用时可能遇到的设计决策。


## 关于数据系统的思考

我们通常认为，数据库、消息队列、缓存等工具分属于几个差异显著的类别。虽然数据库和消息队列表面上有一些相似性 —— 它们都会存储一段时间的数据 —— 但它们有迥然不同的访问模式，这意味着迥异的性能特征和实现手段。

那我们为什么要把这些东西放在 **数据系统（data system）** 的总称之下混为一谈呢？

近些年来，出现了许多新的数据存储工具与数据处理工具。它们针对不同应用场景进行优化，因此不再适合生硬地归入传统类别【1】。类别之间的界限变得越来越模糊，例如：数据存储可以被当成消息队列用（Redis），消息队列则带有类似数据库的持久保证（Apache Kafka）。

其次，越来越多的应用程序有着各种严格而广泛的要求，单个工具不足以满足所有的数据处理和存储需求。取而代之的是，总体工作被拆分成一系列能被单个工具高效完成的任务，并通过应用代码将它们缝合起来。

例如，如果将缓存（应用管理的缓存层，Memcached 或同类产品）和全文搜索（全文搜索服务器，例如 Elasticsearch 或 Solr）功能从主数据库剥离出来，那么使缓存 / 索引与主数据库保持同步通常是应用代码的责任。[图 1-1](/v1/ddia_0101.png) 给出了这种架构可能的样子（细节将在后面的章节中详细介绍）。

![](/v1/ddia_0101.png)

**图 1-1 一个可能的组合使用多个组件的数据系统架构**

当你将多个工具组合在一起提供服务时，服务的接口或 **应用程序编程接口（API, Application Programming Interface）** 通常向客户端隐藏这些实现细节。现在，你基本上已经使用较小的通用组件创建了一个全新的、专用的数据系统。这个新的复合数据系统可能会提供特定的保证，例如：缓存在写入时会作废或更新，以便外部客户端获取一致的结果。现在你不仅是应用程序开发人员，还是数据系统设计人员了。

设计数据系统或服务时可能会遇到很多棘手的问题，例如：当系统出问题时，如何确保数据的正确性和完整性？当部分系统退化降级时，如何为客户提供始终如一的良好性能？当负载增加时，如何扩容应对？什么样的 API 才是好的 API？

影响数据系统设计的因素很多，包括参与人员的技能和经验、历史遗留问题、系统路径依赖、交付时限、公司的风险容忍度、监管约束等，这些因素都需要具体问题具体分析。

本书着重讨论三个在大多数软件系统中都很重要的问题：

可靠性（Reliability）
: 系统在 **困境**（adversity，比如硬件故障、软件故障、人为错误）中仍可正常工作（正确完成功能，并能达到期望的性能水准）。请参阅 “[可靠性](#可靠性)”。

可伸缩性（Scalability）
: 有合理的办法应对系统的增长（数据量、流量、复杂性）。请参阅 “[可伸缩性](#可伸缩性)”。

可维护性（Maintainability）
: 许多不同的人（工程师、运维）在不同的生命周期，都能高效地在系统上工作（使系统保持现有行为，并适应新的应用场景）。请参阅 “[可维护性](#可维护性)”。

人们经常追求这些词汇，却没有清楚理解它们到底意味着什么。为了工程的严谨性，本章的剩余部分将探讨可靠性、可伸缩性和可维护性的含义。为实现这些目标而使用的各种技术，架构和算法将在后续的章节中研究。


## 可靠性

人们对于一个东西是否可靠，都有一个直观的想法。人们对可靠软件的典型期望包括：

* 应用程序表现出用户所期望的功能。
* 允许用户犯错，允许用户以出乎意料的方式使用软件。
* 在预期的负载和数据量下，性能满足要求。
* 系统能防止未经授权的访问和滥用。

如果所有这些在一起意味着 “正确工作”，那么可以把可靠性粗略理解为 “即使出现问题，也能继续正确工作”。

造成错误的原因叫做 **故障（fault）**，能预料并应对故障的系统特性可称为 **容错（fault-tolerant）** 或 **回弹性（resilient）**。“**容错**” 一词可能会产生误导，因为它暗示着系统可以容忍所有可能的错误，但在实际中这是不可能的。比方说，如果整个地球（及其上的所有服务器）都被黑洞吞噬了，想要容忍这种错误，需要把网络托管到太空中 —— 这种预算能不能批准就祝你好运了。所以在讨论容错时，只有谈论特定类型的错误才有意义。

注意 **故障（fault）** 不同于 **失效（failure）**【2】。**故障** 通常定义为系统的一部分状态偏离其标准，而 **失效** 则是系统作为一个整体停止向用户提供服务。故障的概率不可能降到零，因此最好设计容错机制以防因 **故障** 而导致 **失效**。本书中我们将介绍几种用不可靠的部件构建可靠系统的技术。

反直觉的是，在这类容错系统中，通过故意触发来 **提高** 故障率是有意义的，例如：在没有警告的情况下随机地杀死单个进程。许多高危漏洞实际上是由糟糕的错误处理导致的【3】，因此我们可以通过故意引发故障来确保容错机制不断运行并接受考验，从而提高故障自然发生时系统能正确处理的信心。Netflix 公司的 *Chaos Monkey*【4】就是这种方法的一个例子。

尽管比起 **阻止错误（prevent error）**，我们通常更倾向于 **容忍错误**，但也有 **预防胜于治疗** 的情况（比如不存在治疗方法时）。安全问题就属于这种情况。例如，如果攻击者破坏了系统，并获取了敏感数据，这种事是撤销不了的。但本书主要讨论的是可以恢复的故障种类，正如下面几节所述。

### 硬件故障

当想到系统失效的原因时，**硬件故障（hardware faults）** 总会第一个进入脑海。硬盘崩溃、内存出错、机房断电、有人拔错网线…… 任何与大型数据中心打过交道的人都会告诉你：一旦你拥有很多机器，这些事情 **总** 会发生！

据报道称，硬盘的 **平均无故障时间（MTTF, mean time to failure）** 约为 10 到 50 年【5】【6】。因此从数学期望上讲，在拥有 10000 个磁盘的存储集群上，平均每天会有 1 个磁盘出故障。

为了减少系统的故障率，第一反应通常都是增加单个硬件的冗余度，例如：磁盘可以组建 RAID，服务器可能有双路电源和热插拔 CPU，数据中心可能有电池和柴油发电机作为后备电源，某个组件挂掉时冗余组件可以立刻接管。这种方法虽然不能完全防止由硬件问题导致的系统失效，但它简单易懂，通常也足以让机器不间断运行很多年。

直到最近，硬件冗余对于大多数应用来说已经足够了，它使单台机器完全失效变得相当罕见。只要你能快速地把备份恢复到新机器上，故障停机时间对大多数应用而言都算不上灾难性的。只有少量高可用性至关重要的应用才会要求有多套硬件冗余。

但是随着数据量和应用计算需求的增加，越来越多的应用开始大量使用机器，这会相应地增加硬件故障率。此外，在类似亚马逊 AWS（Amazon Web Services）的一些云服务平台上，虚拟机实例不可用却没有任何警告也是很常见的【7】，因为云平台的设计就是优先考虑 **灵活性（flexibility）** 和 **弹性（elasticity）**[^i]，而不是单机可靠性。

如果在硬件冗余的基础上进一步引入软件容错机制，那么系统在容忍整个（单台）机器故障的道路上就更进一步了。这样的系统也有运维上的便利，例如：如果需要重启机器（例如应用操作系统安全补丁），单服务器系统就需要计划停机。而允许机器失效的系统则可以一次修复一个节点，无需整个系统停机。

[^i]: 在 [应对负载的方法](#应对负载的方法) 一节定义

### 软件错误

我们通常认为硬件故障是随机的、相互独立的：一台机器的磁盘失效并不意味着另一台机器的磁盘也会失效。虽然大量硬件组件之间可能存在微弱的相关性（例如服务器机架的温度等共同的原因），但同时发生故障也是极为罕见的。

另一类错误是内部的 **系统性错误（systematic error）**【8】。这类错误难以预料，而且因为是跨节点相关的，所以比起不相关的硬件故障往往可能造成更多的 **系统失效**【5】。例子包括：

* 接受特定的错误输入，便导致所有应用服务器实例崩溃的 BUG。例如 2012 年 6 月 30 日的闰秒，由于 Linux 内核中的一个错误【9】，许多应用同时挂掉了。
* 失控进程会用尽一些共享资源，包括 CPU 时间、内存、磁盘空间或网络带宽。
* 系统依赖的服务变慢，没有响应，或者开始返回错误的响应。
* 级联故障，一个组件中的小故障触发另一个组件中的故障，进而触发更多的故障【10】。

导致这类软件故障的 BUG 通常会潜伏很长时间，直到被异常情况触发为止。这种情况意味着软件对其环境做出了某种假设 —— 虽然这种假设通常来说是正确的，但由于某种原因最后不再成立了【11】。

虽然软件中的系统性故障没有速效药，但我们还是有很多小办法，例如：仔细考虑系统中的假设和交互；彻底的测试；进程隔离；允许进程崩溃并重启；测量、监控并分析生产环境中的系统行为。如果系统能够提供一些保证（例如在一个消息队列中，进入与发出的消息数量相等），那么系统就可以在运行时不断自检，并在出现 **差异（discrepancy）** 时报警【12】。

### 人为错误

设计并构建了软件系统的工程师是人类，维持系统运行的运维也是人类。即使他们怀有最大的善意，人类也是不可靠的。举个例子，一项关于大型互联网服务的研究发现，运维配置错误是导致服务中断的首要原因，而硬件故障（服务器或网络）仅导致了 10-25% 的服务中断【13】。

尽管人类不可靠，但怎么做才能让系统变得可靠？最好的系统会组合使用以下几种办法：

* 以最小化犯错机会的方式设计系统。例如，精心设计的抽象、API 和管理后台使做对事情更容易，搞砸事情更困难。但如果接口限制太多，人们就会忽略它们的好处而想办法绕开。很难正确把握这种微妙的平衡。
* 将人们最容易犯错的地方与可能导致失效的地方 **解耦（decouple）**。特别是提供一个功能齐全的非生产环境 **沙箱（sandbox）**，使人们可以在不影响真实用户的情况下，使用真实数据安全地探索和实验。
* 在各个层次进行彻底的测试【3】，从单元测试、全系统集成测试到手动测试。自动化测试易于理解，已经被广泛使用，特别适合用来覆盖正常情况中少见的 **边缘场景（corner case）**。
* 允许从人为错误中简单快速地恢复，以最大限度地减少失效情况带来的影响。例如，快速回滚配置变更，分批发布新代码（以便任何意外错误只影响一小部分用户），并提供数据重算工具（以备旧的计算出错）。
* 配置详细和明确的监控，比如性能指标和错误率。在其他工程学科中这指的是 **遥测（telemetry）**（一旦火箭离开了地面，遥测技术对于跟踪发生的事情和理解失败是至关重要的）。监控可以向我们发出预警信号，并允许我们检查是否有任何地方违反了假设和约束。当出现问题时，指标数据对于问题诊断是非常宝贵的。
* 良好的管理实践与充分的培训 —— 一个复杂而重要的方面，但超出了本书的范围。

### 可靠性有多重要？

可靠性不仅仅是针对核电站和空中交通管制软件而言，我们也期望更多平凡的应用能可靠地运行。商务应用中的错误会导致生产力损失（也许数据报告不完整还会有法律风险），而电商网站的中断则可能会导致收入和声誉的巨大损失。

即使在 “非关键” 应用中，我们也对用户负有责任。试想一位家长把所有的照片和孩子的视频储存在你的照片应用里【15】。如果数据库突然损坏，他们会感觉如何？他们可能会知道如何从备份恢复吗？

在某些情况下，我们可能会选择牺牲可靠性来降低开发成本（例如为未经证实的市场开发产品原型）或运营成本（例如利润率极低的服务），但我们偷工减料时，应该清楚意识到自己在做什么。


## 可伸缩性

系统今天能可靠运行，并不意味未来也能可靠运行。服务 **降级（degradation）** 的一个常见原因是负载增加，例如：系统负载已经从一万个并发用户增长到十万个并发用户，或者从一百万增长到一千万。也许现在处理的数据量级要比过去大得多。

**可伸缩性（Scalability）** 是用来描述系统应对负载增长能力的术语。但是请注意，这不是贴在系统上的一维标签：说 “X 可伸缩” 或 “Y 不可伸缩” 是没有任何意义的。相反，讨论可伸缩性意味着考虑诸如 “如果系统以特定方式增长，有什么选项可以应对增长？” 和 “如何增加计算资源来处理额外的负载？” 等问题。

### 描述负载

在讨论增长问题（如果负载加倍会发生什么？）前，首先要能简要描述系统的当前负载。负载可以用一些称为 **负载参数（load parameters）** 的数字来描述。参数的最佳选择取决于系统架构，它可能是每秒向 Web 服务器发出的请求、数据库中的读写比率、聊天室中同时活跃的用户数量、缓存命中率或其他东西。除此之外，也许平均情况对你很重要，也许你的瓶颈是少数极端场景。

为了使这个概念更加具体，我们以推特在 2012 年 11 月发布的数据【16】为例。推特的两个主要业务是：

发布推文
: 用户可以向其粉丝发布新消息（平均 4.6k 请求 / 秒，峰值超过 12k 请求 / 秒）。

主页时间线
: 用户可以查阅他们关注的人发布的推文（300k 请求 / 秒）。

处理每秒 12,000 次写入（发推文的速率峰值）还是很简单的。然而推特的伸缩性挑战并不是主要来自推文量，而是来自 **扇出（fan-out）**[^ii]—— 每个用户关注了很多人，也被很多人关注。

[^ii]: 扇出：从电子工程学中借用的术语，它描述了连接到某个逻辑门输出端的其他逻辑门输入的数量。输出需要提供足够的电流来驱动所有连接的输入。在事务处理系统中，我们使用它来描述为了服务一个传入请求而需要向其他服务发出的请求数量。

大体上讲，这一对操作有两种实现方式。

1. 发布推文时，只需将新推文插入全局推文集合即可。当一个用户请求自己的主页时间线时，首先查找他关注的所有人，查询这些被关注用户发布的推文并按时间顺序合并。在如 [图 1-2](/v1/ddia_0102.png) 所示的关系型数据库中，可以编写这样的查询：

    ```sql
    SELECT tweets.*, users.*
      FROM tweets
      JOIN users   ON tweets.sender_id = users.id
      JOIN follows ON follows.followee_id = users.id
      WHERE follows.follower_id = current_user
    ```

    ![](/v1/ddia_0102.png)

    **图 1-2 推特主页时间线的关系型模式简单实现**

2. 为每个用户的主页时间线维护一个缓存，就像每个用户的推文收件箱（[图 1-3](/v1/ddia_0103.png)）。当一个用户发布推文时，查找所有关注该用户的人，并将新的推文插入到每个主页时间线缓存中。因此读取主页时间线的请求开销很小，因为结果已经提前计算好了。

    ![](/v1/ddia_0103.png)

    **图 1-3 用于分发推文至关注者的数据流水线，2012 年 11 月的负载参数【16】**

推特的第一个版本使用了方法 1，但系统很难跟上主页时间线查询的负载。所以公司转向了方法 2，方法 2 的效果更好，因为发推频率比查询主页时间线的频率几乎低了两个数量级，所以在这种情况下，最好在写入时做更多的工作，而在读取时做更少的工作。

然而方法 2 的缺点是，发推现在需要大量的额外工作。平均来说，一条推文会发往约 75 个关注者，所以每秒 4.6k 的发推写入，变成了对主页时间线缓存每秒 345k 的写入。但这个平均值隐藏了用户粉丝数差异巨大这一现实，一些用户有超过 3000 万的粉丝，这意味着一条推文就可能会导致主页时间线缓存的 3000 万次写入！及时完成这种操作是一个巨大的挑战 —— 推特尝试在 5 秒内向粉丝发送推文。

在推特的例子中，每个用户粉丝数的分布（可能按这些用户的发推频率来加权）是探讨可伸缩性的一个关键负载参数，因为它决定了扇出负载。你的应用程序可能具有非常不同的特征，但可以采用相似的原则来考虑它的负载。

推特轶事的最终转折：现在已经稳健地实现了方法 2，推特逐步转向了两种方法的混合。大多数用户发的推文会被扇出写入其粉丝主页时间线缓存中。但是少数拥有海量粉丝的用户（即名流）会被排除在外。当用户读取主页时间线时，分别地获取出该用户所关注的每位名流的推文，再与用户的主页时间线缓存合并，如方法 1 所示。这种混合方法能始终如一地提供良好性能。在覆盖更多技术层面之后，我们将在 [第十二章](/v1/ch12) 中重新讨论这个例子。

### 描述性能

一旦系统的负载被描述好，就可以研究当负载增加会发生什么。我们可以从两种角度来看：

* 增加负载参数并保持系统资源（CPU、内存、网络带宽等）不变时，系统性能将受到什么影响？
* 增加负载参数并希望保持性能不变时，需要增加多少系统资源？

这两个问题都需要性能数据，所以让我们简单地看一下如何描述系统性能。

对于 Hadoop 这样的批处理系统，通常关心的是 **吞吐量（throughput）**，即每秒可以处理的记录数量，或者在特定规模数据集上运行作业的总时间 [^iii]。对于在线系统，通常更重要的是服务的 **响应时间（response time）**，即客户端发送请求到接收响应之间的时间。

[^iii]: 理想情况下，批量作业的运行时间是数据集的大小除以吞吐量。在实践中由于数据倾斜（数据不是均匀分布在每个工作进程中），需要等待最慢的任务完成，所以运行时间往往更长。

> #### 延迟和响应时间
>
> **延迟（latency）** 和 **响应时间（response time）** 经常用作同义词，但实际上它们并不一样。响应时间是客户所看到的，除了实际处理请求的时间（ **服务时间（service time）** ）之外，还包括网络延迟和排队延迟。延迟是某个请求等待处理的 **持续时长**，在此期间它处于 **休眠（latent）** 状态，并等待服务【17】。

即使不断重复发送同样的请求，每次得到的响应时间也都会略有不同。现实世界的系统会处理各式各样的请求，响应时间可能会有很大差异。因此我们需要将响应时间视为一个可以测量的数值 **分布（distribution）**，而不是单个数值。

在 [图 1-4](/v1/ddia_0104.png) 中，每个灰条代表一次对服务的请求，其高度表示请求花费了多长时间。大多数请求是相当快的，但偶尔会出现需要更长的时间的异常值。这也许是因为缓慢的请求实质上开销更大，例如它们可能会处理更多的数据。但即使（你认为）所有请求都花费相同时间的情况下，随机的附加延迟也会导致结果变化，例如：上下文切换到后台进程，网络数据包丢失与 TCP 重传，垃圾收集暂停，强制从磁盘读取的页面错误，服务器机架中的震动【18】，还有很多其他原因。

![](/v1/ddia_0104.png)

**图 1-4 展示了一个服务 100 次请求响应时间的均值与百分位数**

通常报表都会展示服务的平均响应时间。（严格来讲 “平均” 一词并不指代任何特定公式，但实际上它通常被理解为 **算术平均值（arithmetic mean）**：给定 n 个值，加起来除以 n ）。然而如果你想知道 “**典型（typical）**” 响应时间，那么平均值并不是一个非常好的指标，因为它不能告诉你有多少用户实际上经历了这个延迟。

通常使用 **百分位点（percentiles）** 会更好。如果将响应时间列表按最快到最慢排序，那么 **中位数（median）** 就在正中间：举个例子，如果你的响应时间中位数是 200 毫秒，这意味着一半请求的返回时间少于 200 毫秒，另一半比这个要长。

如果想知道典型场景下用户需要等待多长时间，那么中位数是一个好的度量标准：一半用户请求的响应时间少于响应时间的中位数，另一半的响应时间比中位数长。中位数也被称为第 50 百分位点，有时缩写为 p50。注意中位数是关于单个请求的；如果用户同时发出几个请求（在一个会话过程中，或者由于一个页面中包含了多个资源），则至少一个请求比中位数慢的概率远大于 50%。

为了弄清异常值有多糟糕，可以看看更高的百分位点，例如第 95、99 和 99.9 百分位点（缩写为 p95，p99 和 p999）。它们意味着 95%、99% 或 99.9% 的请求响应时间要比该阈值快，例如：如果第 95 百分位点响应时间是 1.5 秒，则意味着 100 个请求中的 95 个响应时间快于 1.5 秒，而 100 个请求中的 5 个响应时间超过 1.5 秒。如 [图 1-4](/v1/ddia_0104.png) 所示。

响应时间的高百分位点（也称为 **尾部延迟**，即 **tail latencies**）非常重要，因为它们直接影响用户的服务体验。例如亚马逊在描述内部服务的响应时间要求时是以 99.9 百分位点为准，即使它只影响一千个请求中的一个。这是因为请求响应最慢的客户往往也是数据最多的客户，也可以说是最有价值的客户 —— 因为他们掏钱了【19】。保证网站响应迅速对于保持客户的满意度非常重要，亚马逊观察到：响应时间增加 100 毫秒，销售量就减少 1%【20】；而另一些报告说：慢 1 秒钟会让客户满意度指标减少 16%【21，22】。

另一方面，优化第 99.99 百分位点（一万个请求中最慢的一个）被认为太昂贵了，不能为亚马逊的目标带来足够好处。减小高百分位点处的响应时间相当困难，因为它很容易受到随机事件的影响，这超出了控制范围，而且效益也很小。

百分位点通常用于 **服务级别目标（SLO, service level objectives）** 和 **服务级别协议（SLA, service level agreements）**，即定义服务预期性能和可用性的合同。SLA 可能会声明，如果服务响应时间的中位数小于 200 毫秒，且 99.9 百分位点低于 1 秒，则认为服务工作正常（如果响应时间更长，就认为服务不达标）。这些指标为客户设定了期望值，并允许客户在 SLA 未达标的情况下要求退款。

**排队延迟（queueing delay）** 通常占了高百分位点处响应时间的很大一部分。由于服务器只能并行处理少量的事务（如受其 CPU 核数的限制），所以只要有少量缓慢的请求就能阻碍后续请求的处理，这种效应有时被称为 **头部阻塞（head-of-line blocking）** 。即使后续请求在服务器上处理得非常迅速，由于需要等待先前请求完成，客户端最终看到的是缓慢的总体响应时间。因为存在这种效应，测量客户端的响应时间非常重要。

为测试系统的可伸缩性而人为产生负载时，产生负载的客户端要独立于响应时间不断发送请求。如果客户端在发送下一个请求之前等待先前的请求完成，这种行为会产生人为排队的效果，使得测试时的队列比现实情况更短，使测量结果产生偏差【23】。

> #### 实践中的百分位点
>
> 在多重调用的后端服务里，高百分位数变得特别重要。即使并行调用，最终用户请求仍然需要等待最慢的并行调用完成。如 [图 1-5](/v1/ddia_0105.png) 所示，只需要一个缓慢的调用就可以使整个最终用户请求变慢。即使只有一小部分后端调用速度较慢，如果最终用户请求需要多个后端调用，则获得较慢调用的机会也会增加，因此较高比例的最终用户请求速度会变慢（该效果称为尾部延迟放大，即 tail latency amplification【24】）。
>
> 如果你想将响应时间百分位点添加到你的服务的监视仪表板，则需要持续有效地计算它们。例如，你可以使用滑动窗口来跟踪连续 10 分钟内的请求响应时间。每一分钟，你都会计算出该窗口中的响应时间中位数和各种百分位点，并将这些度量值绘制在图上。
>
> 简单的实现是在时间窗口内保存所有请求的响应时间列表，并且每分钟对列表进行排序。如果对你来说效率太低，那么有一些算法（如前向衰减【25】、t-digest【26】或 HdrHistogram 【27】）能够以最小的 CPU 和内存成本来计算百分位数的近似值。请注意，对百分位数取平均（例如，为了降低时间分辨率或合并来自多台机器的数据）在数学上没有意义 —— 聚合响应时间数据的正确方法是将直方图相加【28】。

![](/v1/ddia_0105.png)

**图 1-5 当一个请求需要多个后端请求时，单个后端慢请求就会拖慢整个终端用户的请求**

### 应对负载的方法

现在我们已经讨论了用于描述负载的参数和用于衡量性能的指标。可以开始认真讨论可伸缩性了：当负载参数增加时，如何保持良好的性能？

适应某个级别负载的架构不太可能应付 10 倍于此的负载。如果你正在开发一个快速增长的服务，那么每次负载发生数量级的增长时，你可能都需要重新考虑架构 —— 或者更频繁。

人们经常讨论 **纵向伸缩**（scaling up，也称为垂直伸缩，即 vertical scaling，转向更强大的机器）和 **横向伸缩**（scaling out，也称为水平伸缩，即 horizontal scaling，将负载分布到多台小机器上）之间的对立。跨多台机器分配负载也称为 “**无共享（shared-nothing）**” 架构。可以在单台机器上运行的系统通常更简单，但高端机器可能非常贵，所以非常密集的负载通常无法避免地需要横向伸缩。现实世界中的优秀架构需要将这两种方法务实地结合，因为使用几台足够强大的机器可能比使用大量的小型虚拟机更简单也更便宜。

有些系统是 **弹性（elastic）** 的，这意味着可以在检测到负载增加时自动增加计算资源，而其他系统则是手动伸缩（人工分析容量并决定向系统添加更多的机器）。如果负载 **极难预测（highly unpredictable）**，则弹性系统可能很有用，但手动伸缩系统更简单，并且意外操作可能会更少（请参阅 “[分区再平衡](/v1/ch6#分区再平衡)”）。

跨多台机器部署 **无状态服务（stateless services）** 非常简单，但将带状态的数据系统从单节点变为分布式配置则可能引入许多额外复杂度。出于这个原因，常识告诉我们应该将数据库放在单个节点上（纵向伸缩），直到伸缩成本或可用性需求迫使其改为分布式。

随着分布式系统的工具和抽象越来越好，至少对于某些类型的应用而言，这种常识可能会改变。可以预见分布式数据系统将成为未来的默认设置，即使对不处理大量数据或流量的场景也如此。本书的其余部分将介绍多种分布式数据系统，不仅讨论它们在可伸缩性方面的表现，还包括易用性和可维护性。

大规模的系统架构通常是应用特定的 —— 没有一招鲜吃遍天的通用可伸缩架构（不正式的叫法：**万金油（magic scaling sauce）** ）。应用的问题可能是读取量、写入量、要存储的数据量、数据的复杂度、响应时间要求、访问模式或者所有问题的大杂烩。

举个例子，用于处理每秒十万个请求（每个大小为 1 kB）的系统与用于处理每分钟 3 个请求（每个大小为 2GB）的系统看上去会非常不一样，尽管两个系统有同样的数据吞吐量。

一个良好适配应用的可伸缩架构，是围绕着 **假设（assumption）** 建立的：哪些操作是常见的？哪些操作是罕见的？这就是所谓负载参数。如果假设最终是错误的，那么为伸缩所做的工程投入就白费了，最糟糕的是适得其反。在早期创业公司或非正式产品中，通常支持产品快速迭代的能力，要比可伸缩至未来的假想负载重要得多。

尽管这些架构是应用程序特定的，但可伸缩的架构通常也是从通用的积木块搭建而成的，并以常见的模式排列。在本书中，我们将讨论这些构件和模式。


## 可维护性

众所周知，软件的大部分开销并不在最初的开发阶段，而是在持续的维护阶段，包括修复漏洞、保持系统正常运行、调查失效、适配新的平台、为新的场景进行修改、偿还技术债和添加新的功能。

不幸的是，许多从事软件系统行业的人不喜欢维护所谓的 **遗留（legacy）** 系统 —— 也许因为涉及修复其他人的错误、和过时的平台打交道，或者系统被迫用于一些分外的工作。每一个遗留系统都以自己的方式让人不爽，所以很难给出一个通用的建议来和它们打交道。

但是我们可以，也应该以这样一种方式来设计软件：在设计之初就考虑如何尽可能减少维护期间的痛苦，从而避免自己的软件系统变成遗留系统。为此，我们将特别关注软件系统的三个设计原则：

可操作性（Operability）
: 便于运维团队保持系统平稳运行。

简单性（Simplicity）
: 从系统中消除尽可能多的 **复杂度（complexity）**，使新工程师也能轻松理解系统（注意这和用户接口的简单性不一样）。

可演化性（Evolvability）
: 使工程师在未来能轻松地对系统进行更改，当需求变化时为新应用场景做适配。也称为 **可扩展性（extensibility）**、**可修改性（modifiability）** 或 **可塑性（plasticity）**。

和之前提到的可靠性、可伸缩性一样，实现这些目标也没有简单的解决方案。不过我们会试着想象具有可操作性，简单性和可演化性的系统会是什么样子。

### 可操作性：人生苦短，关爱运维

有人认为，“良好的运维经常可以绕开垃圾（或不完整）软件的局限性，而再好的软件摊上垃圾运维也没法可靠运行”。尽管运维的某些方面可以，而且应该是自动化的，但在最初建立正确运作的自动化机制仍然取决于人。

运维团队对于保持软件系统顺利运行至关重要。一个优秀运维团队的典型职责如下（或者更多）【29】：

* 监控系统的运行状况，并在服务状态不佳时快速恢复服务。
* 跟踪问题的原因，例如系统故障或性能下降。
* 及时更新软件和平台，比如安全补丁。
* 了解系统间的相互作用，以便在异常变更造成损失前进行规避。
* 预测未来的问题，并在问题出现之前加以解决（例如，容量规划）。
* 建立部署、配置、管理方面的良好实践，编写相应工具。
* 执行复杂的维护任务，例如将应用程序从一个平台迁移到另一个平台。
* 当配置变更时，维持系统的安全性。
* 定义工作流程，使运维操作可预测，并保持生产环境稳定。
* 铁打的营盘流水的兵，维持组织对系统的了解。

良好的可操作性意味着更轻松的日常工作，进而运维团队能专注于高价值的事情。数据系统可以通过各种方式使日常任务更轻松：

* 通过良好的监控，提供对系统内部状态和运行时行为的 **可见性（visibility）**。
* 为自动化提供良好支持，将系统与标准化工具相集成。
* 避免依赖单台机器（在整个系统继续不间断运行的情况下允许机器停机维护）。
* 提供良好的文档和易于理解的操作模型（“如果做 X，会发生 Y”）。
* 提供良好的默认行为，但需要时也允许管理员自由覆盖默认值。
* 有条件时进行自我修复，但需要时也允许管理员手动控制系统状态。
* 行为可预测，最大限度减少意外。


### 简单性：管理复杂度

小型软件项目可以使用简单讨喜的、富表现力的代码，但随着项目越来越大，代码往往变得非常复杂，难以理解。这种复杂度拖慢了所有系统相关人员，进一步增加了维护成本。一个陷入复杂泥潭的软件项目有时被描述为 **烂泥潭（a big ball of mud）** 【30】。

**复杂度（complexity）** 有各种可能的症状，例如：状态空间激增、模块间紧密耦合、纠结的依赖关系、不一致的命名和术语、解决性能问题的 Hack、需要绕开的特例等等，现在已经有很多关于这个话题的讨论【31,32,33】。

因为复杂度导致维护困难时，预算和时间安排通常会超支。在复杂的软件中进行变更，引入错误的风险也更大：当开发人员难以理解系统时，隐藏的假设、无意的后果和意外的交互就更容易被忽略。相反，降低复杂度能极大地提高软件的可维护性，因此简单性应该是构建系统的一个关键目标。

简化系统并不一定意味着减少功能；它也可以意味着消除 **额外的（accidental）** 复杂度。Moseley 和 Marks【32】把 **额外复杂度** 定义为：由具体实现中涌现，而非（从用户视角看，系统所解决的）问题本身固有的复杂度。

用于消除 **额外复杂度** 的最好工具之一是 **抽象（abstraction）**。一个好的抽象可以将大量实现细节隐藏在一个干净，简单易懂的外观下面。一个好的抽象也可以广泛用于各类不同应用。比起重复造很多轮子，重用抽象不仅更有效率，而且有助于开发高质量的软件。抽象组件的质量改进将使所有使用它的应用受益。

例如，高级编程语言是一种抽象，隐藏了机器码、CPU 寄存器和系统调用。SQL 也是一种抽象，隐藏了复杂的磁盘 / 内存数据结构、来自其他客户端的并发请求、崩溃后的不一致性。当然在用高级语言编程时，我们仍然用到了机器码；只不过没有 **直接（directly）** 使用罢了，正是因为编程语言的抽象，我们才不必去考虑这些实现细节。

抽象可以帮助我们将系统的复杂度控制在可管理的水平，不过，找到好的抽象是非常困难的。在分布式系统领域虽然有许多好的算法，但我们并不清楚它们应该打包成什么样的抽象。

本书将紧盯那些允许我们将大型系统的部分提取为定义明确的、可重用的组件的优秀抽象。

### 可演化性：拥抱变化

系统的需求永远不变，基本是不可能的。更可能的情况是，它们处于常态的变化中，例如：你了解了新的事实、出现意想不到的应用场景、业务优先级发生变化、用户要求新功能、新平台取代旧平台、法律或监管要求发生变化、系统增长迫使架构变化等。

在组织流程方面，**敏捷（agile）** 工作模式为适应变化提供了一个框架。敏捷社区还开发了对在频繁变化的环境中开发软件很有帮助的技术工具和模式，如 **测试驱动开发（TDD, test-driven development）** 和 **重构（refactoring）** 。

这些敏捷技术的大部分讨论都集中在相当小的规模（同一个应用中的几个代码文件）。本书将探索在更大数据系统层面上提高敏捷性的方法，可能由几个不同的应用或服务组成。例如，为了将装配主页时间线的方法从方法 1 变为方法 2，你会如何 “重构” 推特的架构 ？

修改数据系统并使其适应不断变化需求的容易程度，是与 **简单性** 和 **抽象性** 密切相关的：简单易懂的系统通常比复杂系统更容易修改。但由于这是一个非常重要的概念，我们将用一个不同的词来指代数据系统层面的敏捷性： **可演化性（evolvability）** 【34】。


## 本章小结

本章探讨了一些关于数据密集型应用的基本思考方式。这些原则将指导我们阅读本书的其余部分，那里将会深入技术细节。

一个应用必须满足各种需求才称得上有用。有一些 **功能需求**（functional requirements，即它应该做什么，比如允许以各种方式存储，检索，搜索和处理数据）以及一些 **非功能性需求**（nonfunctional，即通用属性，例如安全性、可靠性、合规性、可伸缩性、兼容性和可维护性）。在本章详细讨论了可靠性，可伸缩性和可维护性。

**可靠性（Reliability）** 意味着即使发生故障，系统也能正常工作。故障可能发生在硬件（通常是随机的和不相关的）、软件（通常是系统性的 Bug，很难处理）和人类（不可避免地时不时出错）。**容错技术** 可以对终端用户隐藏某些类型的故障。

**可伸缩性（Scalability）** 意味着即使在负载增加的情况下也有保持性能的策略。为了讨论可伸缩性，我们首先需要定量描述负载和性能的方法。我们简要了解了推特主页时间线的例子，介绍描述负载的方法，并将响应时间百分位点作为衡量性能的一种方式。在可伸缩的系统中可以添加 **处理容量（processing capacity）** 以在高负载下保持可靠。

**可维护性（Maintainability）** 有许多方面，但实质上是关于工程师和运维团队的生活质量的。良好的抽象可以帮助降低复杂度，并使系统易于修改和适应新的应用场景。良好的可操作性意味着对系统的健康状态具有良好的可见性，并拥有有效的管理手段。

不幸的是，使应用可靠、可伸缩或可维护并不容易。但是某些模式和技术会不断重新出现在不同的应用中。在接下来的几章中，我们将看到一些数据系统的例子，并分析它们如何实现这些目标。

在本书后面的 [第三部分](/v1/part-iii) 中，我们将看到一种模式：几个组件协同工作以构成一个完整的系统（如 [图 1-1](/v1/ddia_0101.png) 中的例子）。


## 参考文献

1. Michael Stonebraker and Uğur Çetintemel: “['One Size Fits All': An Idea Whose Time Has Come and Gone](https://cs.brown.edu/~ugur/fits_all.pdf),” at *21st International Conference on Data Engineering* (ICDE), April 2005.
1. Walter L. Heimerdinger and Charles B. Weinstock: “[A Conceptual Framework for System Fault Tolerance](https://resources.sei.cmu.edu/asset_files/TechnicalReport/1992_005_001_16112.pdf),” Technical Report CMU/SEI-92-TR-033, Software Engineering Institute, Carnegie Mellon University, October 1992.
1. Ding Yuan, Yu Luo, Xin Zhuang, et al.: “[Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-yuan.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Yury Izrailevsky and Ariel Tseitlin: “[The Netflix Simian Army](https://netflixtechblog.com/the-netflix-simian-army-16e57fbab116),” *netflixtechblog.com*, July 19, 2011.
1. Daniel Ford, François Labelle, Florentina I. Popovici, et al.: “[Availability in Globally Distributed Storage Systems](http://research.google.com/pubs/archive/36737.pdf),” at *9th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2010.
1. Brian Beach: “[Hard Drive Reliability Update – Sep 2014](https://www.backblaze.com/blog/hard-drive-reliability-update-september-2014/),” *backblaze.com*, September 23, 2014.
1. Laurie Voss: “[AWS: The Good, the Bad and the Ugly](https://web.archive.org/web/20160429075023/http://blog.awe.sm/2012/12/18/aws-the-good-the-bad-and-the-ugly/),” *blog.awe.sm*, December 18, 2012.
1. Haryadi S. Gunawi, Mingzhe Hao, Tanakorn Leesatapornwongsa, et al.: “[What Bugs Live in the Cloud?](http://ucare.cs.uchicago.edu/pdf/socc14-cbs.pdf),” at *5th ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670986](http://dx.doi.org/10.1145/2670979.2670986)
1. Nelson Minar: “[Leap Second Crashes Half the Internet](http://www.somebits.com/weblog/tech/bad/leap-second-2012.html),” *somebits.com*, July 3, 2012.
1. Amazon Web Services: “[Summary of the Amazon EC2 and Amazon RDS Service Disruption in the US East Region](http://aws.amazon.com/message/65648/),” *aws.amazon.com*, April 29, 2011.
1. Richard I. Cook: “[How Complex Systems Fail](https://www.adaptivecapacitylabs.com/HowComplexSystemsFail.pdf),” Cognitive Technologies Laboratory, April 2000.
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. David Oppenheimer, Archana Ganapathi, and David A. Patterson: “[Why Do Internet Services Fail, and What Can Be Done About It?](http://static.usenix.org/legacy/events/usits03/tech/full_papers/oppenheimer/oppenheimer.pdf),” at *4th USENIX Symposium on Internet Technologies and Systems* (USITS), March 2003.
1. Nathan Marz: “[Principles of Software Engineering, Part 1](http://nathanmarz.com/blog/principles-of-software-engineering-part-1.html),” *nathanmarz.com*, April 2, 2013.
1. Michael Jurewitz: “[The Human Impact of Bugs](http://jury.me/blog/2013/3/14/the-human-impact-of-bugs),” *jury.me*, March 15, 2013.
1. Raffi Krikorian: “[Timelines at Scale](http://www.infoq.com/presentations/Twitter-Timeline-Scalability),” at *QCon San Francisco*, November 2012.
1. Martin Fowler: *Patterns of Enterprise Application Architecture*. Addison Wesley, 2002. ISBN: 978-0-321-12742-6
1. Kelly Sommers: “[After all that run around, what caused 500ms disk latency even when we replaced physical server?](https://twitter.com/kellabyte/status/532930540777635840)” *twitter.com*, November 13, 2014.
1. Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, et al.: “[Dynamo: Amazon's Highly Available Key-Value Store](http://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf),” at *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007.
1. Greg Linden: “[Make Data Useful](http://glinden.blogspot.co.uk/2006/12/slides-from-my-talk-at-stanford.html),” slides from presentation at Stanford University Data Mining class (CS345), December 2006.
1. Tammy Everts: “[The Real Cost of Slow Time vs Downtime](https://www.slideshare.net/Radware/radware-cmg2014-tammyevertsslowtimevsdowntime),” *slideshare.net*, November 5, 2014.
1. Jake Brutlag: “[Speed Matters](https://ai.googleblog.com/2009/06/speed-matters.html),” *ai.googleblog.com*, June 23, 2009.
1. Tyler Treat: “[Everything You Know About Latency Is Wrong](http://bravenewgeek.com/everything-you-know-about-latency-is-wrong/),” *bravenewgeek.com*, December 12, 2015.
1. Jeffrey Dean and Luiz André Barroso: “[The Tail at Scale](http://cacm.acm.org/magazines/2013/2/160173-the-tail-at-scale/fulltext),” *Communications of the ACM*, volume 56, number 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](http://dx.doi.org/10.1145/2408776.2408794)
1. Graham Cormode, Vladislav Shkapenyuk, Divesh Srivastava, and Bojian Xu: “[Forward Decay: A Practical Time Decay Model for Streaming Systems](http://dimacs.rutgers.edu/~graham/pubs/papers/fwddecay.pdf),” at *25th IEEE International Conference on Data Engineering* (ICDE), March 2009.
1. Ted Dunning and Otmar Ertl: “[Computing Extremely Accurate Quantiles Using t-Digests](https://github.com/tdunning/t-digest),” *github.com*, March 2014.
1. Gil Tene: “[HdrHistogram](http://www.hdrhistogram.org/),” *hdrhistogram.org*.
1. Baron Schwartz: “[Why Percentiles Don’t Work the Way You Think](https://orangematter.solarwinds.com/2016/11/18/why-percentiles-dont-work-the-way-you-think/),” *solarwinds.com*, November 18, 2016.
1. James Hamilton: “[On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton.pdf),” at *21st Large Installation System Administration Conference* (LISA), November 2007.
1. Brian Foote and Joseph Yoder: “[Big Ball of Mud](http://www.laputan.org/pub/foote/mud.pdf),” at *4th Conference on Pattern Languages of Programs* (PLoP), September 1997.
1. Frederick P Brooks: “No Silver Bullet – Essence and Accident in Software Engineering,” in *The Mythical Man-Month*, Anniversary edition, Addison-Wesley, 1995. ISBN: 978-0-201-83595-3
1. Ben Moseley and Peter Marks: “[Out of the Tar Pit](https://curtclifton.net/papers/MoseleyMarks06a.pdf),” at *BCS Software Practice Advancement* (SPA), 2006.
1. Rich Hickey: “[Simple Made Easy](http://www.infoq.com/presentations/Simple-Made-Easy),” at *Strange Loop*, September 2011.
1. Hongyu Pei Breivold, Ivica Crnkovic, and Peter J. Eriksson: “[Analyzing Software Evolvability](http://www.es.mdh.se/pdf_publications/1251.pdf),” at *32nd Annual IEEE International Computer Software and Applications Conference* (COMPSAC), July 2008. [doi:10.1109/COMPSAC.2008.50](http://dx.doi.org/10.1109/COMPSAC.2008.50)

================================================
FILE: content/v1/ch10.md
================================================
---
title: "第十章：批处理"
linkTitle: "10. 批处理"
weight: 310
breadcrumbs: false
---

![](/map/ch10.png)

> 带有太强个人色彩的系统无法成功。当最初的设计完成并且相对稳定时，不同的人们以自己的方式进行测试，真正的考验才开始。
>
> —— 高德纳

在本书的前两部分中，我们讨论了很多关于 **请求** 和 **查询** 以及相应的 **响应** 或 **结果**。许多现有数据系统中都采用这种数据处理方式：你发送请求指令，一段时间后（我们期望）系统会给出一个结果。数据库、缓存、搜索索引、Web 服务器以及其他一些系统都以这种方式工作。

像这样的 **在线（online）** 系统，无论是浏览器请求页面还是调用远程 API 的服务，我们通常认为请求是由人类用户触发的，并且正在等待响应。他们不应该等太久，所以我们非常关注系统的响应时间（请参阅 “[描述性能](/v1/ch1#描述性能)”）。

Web 和越来越多的基于 HTTP/REST 的 API 使交互的请求 / 响应风格变得如此普遍，以至于很容易将其视为理所当然。但我们应该记住，这不是构建系统的唯一方式，其他方法也有其优点。我们来看看三种不同类型的系统：

服务（在线系统）
: 服务等待客户的请求或指令到达。每收到一个，服务会试图尽快处理它，并发回一个响应。响应时间通常是服务性能的主要衡量指标，可用性通常非常重要（如果客户端无法访问服务，用户可能会收到错误消息）。

批处理系统（离线系统）
: 一个批处理系统有大量的输入数据，跑一个 **作业（job）** 来处理它，并生成一些输出数据，这往往需要一段时间（从几分钟到几天），所以通常不会有用户等待作业完成。相反，批量作业通常会定期运行（例如，每天一次）。批处理作业的主要性能衡量标准通常是吞吐量（处理特定大小的输入所需的时间）。本章中讨论的就是批处理。

流处理系统（准实时系统）
: 流处理介于在线和离线（批处理）之间，所以有时候被称为 **准实时（near-real-time）** 或 **准在线（nearline）** 处理。像批处理系统一样，流处理消费输入并产生输出（并不需要响应请求）。但是，流式作业在事件发生后不久就会对事件进行操作，而批处理作业则需等待固定的一组输入数据。这种差异使流处理系统比起批处理系统具有更低的延迟。由于流处理基于批处理，我们将在 [第十一章](/v1/ch11) 讨论它。

正如我们将在本章中看到的那样，批处理是构建可靠、可伸缩和可维护应用程序的重要组成部分。例如，2004 年发布的批处理算法 Map-Reduce（可能被过分热情地）被称为 “造就 Google 大规模可伸缩性的算法”【2】。随后在各种开源数据系统中得到应用，包括 Hadoop、CouchDB 和 MongoDB。

与多年前为数据仓库开发的并行处理系统【3,4】相比，MapReduce 是一个相当低级别的编程模型，但它使得在商用硬件上能进行的处理规模迈上一个新的台阶。虽然 MapReduce 的重要性正在下降【5】，但它仍然值得去理解，因为它描绘了一幅关于批处理为什么有用，以及如何做到有用的清晰图景。

实际上，批处理是一种非常古老的计算方式。早在可编程数字计算机诞生之前，打孔卡制表机（例如 1890 年美国人口普查【6】中使用的霍尔里斯机）实现了半机械化的批处理形式，从大量输入中汇总计算。Map-Reduce 与 1940 年代和 1950 年代广泛用于商业数据处理的机电 IBM 卡片分类机器有着惊人的相似之处【7】。正如我们所说，历史总是在不断重复自己。

在本章中，我们将了解 MapReduce 和其他一些批处理算法和框架，并探索它们在现代数据系统中的作用。但首先我们将看看使用标准 Unix 工具的数据处理。即使你已经熟悉了它们，Unix 的哲学也值得一读，Unix 的思想和经验教训可以迁移到大规模、异构的分布式数据系统中。


## 使用Unix工具的批处理

我们从一个简单的例子开始。假设你有一台 Web 服务器，每次处理请求时都会在日志文件中附加一行。例如，使用 nginx 默认的访问日志格式，日志的一行可能如下所示：

```bash
216.58.210.78 - - [27/Feb/2015:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
200 3377 "http://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36"
```

（实际上这只是一行，分成多行只是为了便于阅读。）这一行中有很多信息。为了解释它，你需要了解日志格式的定义，如下所示：

```bash
 $remote_addr - $remote_user [$time_local] "$request"
 $status $body_bytes_sent "$http_referer" "$http_user_agent"
```

日志的这一行表明在 UTC 时间的 2015 年 2 月 27 日 17 点 55 分 11 秒，服务器从客户端 IP 地址 `216.58.210.78` 接收到对文件 `/css/typography.css` 的请求。用户没有认证，所以 `$remote_user` 被设置为连字符（`-`）。响应状态是 200（即请求成功），响应的大小是 3377 字节。网页浏览器是 Chrome 40，它加载了这个文件是因为该文件在网址为 `http://martin.kleppmann.com/` 的页面中被引用到了。


### 简单日志分析

很多工具可以从这些日志文件生成关于网站流量的漂亮的报告，但为了练手，让我们使用基本的 Unix 功能创建自己的工具。例如，假设你想在你的网站上找到五个最受欢迎的网页。则可以在 Unix shell 中这样做：[^i]

[^i]: 有些人认为 `cat` 这里并没有必要，因为输入文件可以直接作为 awk 的参数。但这种写法让线性管道更为显眼。

```bash
cat /var/log/nginx/access.log | #1
  awk '{print $7}' | #2
  sort             | #3
  uniq -c          | #4
  sort -r -n       | #5
  head -n 5          #6
```

1. 读取日志文件
2. 将每一行按空格分割成不同的字段，每行只输出第七个字段，恰好是请求的 URL。在我们的例子中是 `/css/typography.css`。
3. 按字母顺序排列请求的 URL 列表。如果某个 URL 被请求过 n 次，那么排序后，文件将包含连续重复出现 n 次的该 URL。
4. `uniq` 命令通过检查两个相邻的行是否相同来过滤掉输入中的重复行。`-c` 则表示还要输出一个计数器：对于每个不同的 URL，它会报告输入中出现该 URL 的次数。
5. 第二个 `sort` 按每行起始处的数字（`-n`）排序，这是 URL 的请求次数。然后逆序（`-r`）返回结果，大的数字在前。
6. 最后，只输出前五行（`-n 5`），并丢弃其余的。该系列命令的输出如下所示：

```bash
    4189 /favicon.ico
    3631 /2013/05/24/improving-security-of-ssh-private-keys.html
    2124 /2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html
    1369 /
     915 /css/typography.css
```

如果你不熟悉 Unix 工具，上面的命令行可能看起来有点吃力，但是它非常强大。它能在几秒钟内处理几 GB 的日志文件，并且你可以根据需要轻松修改命令。例如，如果要从报告中省略 CSS 文件，可以将 awk 参数更改为 `'$7 !~ /\.css$/ {print $7}'`；如果想统计最多的客户端 IP 地址，可以把 awk 参数改为 `'{print $1}'`，等等。

我们不会在这里详细探索 Unix 工具，但是它非常值得学习。令人惊讶的是，使用 awk、sed、grep、sort、uniq 和 xargs 的组合，可以在几分钟内完成许多数据分析，并且它们的性能相当的好【8】。

#### 命令链与自定义程序

除了 Unix 命令链，你还可以写一个简单的程序来做同样的事情。例如在 Ruby 中，它可能看起来像这样：

```ruby
counts = Hash.new(0)         # 1
File.open('/var/log/nginx/access.log') do |file|
    file.each do |line|
        url = line.split[6]  # 2
        counts[url] += 1     # 3
    end
end

top5 = counts.map{|url, count| [count, url] }.sort.reverse[0...5] # 4
top5.each{|count, url| puts "#{count} #{url}" }                   # 5
```

1. `counts` 是一个存储计数器的哈希表，保存了每个 URL 被浏览的次数，默认为 0。
2. 逐行读取日志，抽取每行第七个被空格分隔的字段为 URL（这里的数组索引是 6，因为 Ruby 的数组索引从 0 开始计数）
3. 将日志当前行中 URL 对应的计数器值加一。
4. 按计数器值（降序）对哈希表内容进行排序，并取前五位。
5. 打印出前五个条目。

这个程序并不像 Unix 管道那样简洁，但是它的可读性很强，喜欢哪一种属于口味的问题。但两者除了表面上的差异之外，执行流程也有很大差异，如果你在大文件上运行此分析，则会变得明显。

#### 排序 VS 内存中的聚合

Ruby 脚本在内存中保存了一个 URL 的哈希表，将每个 URL 映射到它出现的次数。Unix 管道没有这样的哈希表，而是依赖于对 URL 列表的排序，在这个 URL 列表中，同一个 URL 只是简单地重复出现。

哪种方法更好？这取决于你有多少个不同的 URL。对于大多数中小型网站，你可能可以为所有不同网址提供一个计数器（假设我们使用 1GB 内存）。在此例中，作业的 **工作集**（working set，即作业需要随机访问的内存大小）仅取决于不同 URL 的数量：如果日志中只有单个 URL，重复出现一百万次，则散列表所需的空间就只有一个 URL 加上一个计数器的大小。当工作集足够小时，内存散列表表现良好，甚至在性能较差的笔记本电脑上也可以正常工作。

另一方面，如果作业的工作集大于可用内存，则排序方法的优点是可以高效地使用磁盘。这与我们在 “[SSTables 和 LSM 树](/v1/ch3#SSTables和LSM树)” 中讨论过的原理是一样的：数据块可以在内存中排序并作为段文件写入磁盘，然后多个排序好的段可以合并为一个更大的排序文件。归并排序具有在磁盘上运行良好的顺序访问模式。（请记住，针对顺序 I/O 进行优化是 [第三章](/v1/ch3) 中反复出现的主题，相同的模式在此重现）

GNU Coreutils（Linux）中的 `sort` 程序通过溢出至磁盘的方式来自动应对大于内存的数据集，并能同时使用多个 CPU 核进行并行排序【9】。这意味着我们之前看到的简单的 Unix 命令链很容易伸缩至大数据集，且不会耗尽内存。瓶颈可能是从磁盘读取输入文件的速度。


### Unix哲学

我们可以非常容易地使用前一个例子中的一系列命令来分析日志文件，这并非巧合：事实上，这是 Unix 的关键设计思想之一，而且它直至今天也仍然令人讶异地重要。让我们更深入地研究一下，以便从 Unix 中借鉴一些想法【10】。

Unix 管道的发明者道格・麦克罗伊（Doug McIlroy）在 1964 年首先描述了这种情况【11】：“我们需要一种类似园艺胶管的方式来拼接程序 —— 当我们需要将消息从一个程序传递到另一个程序时，直接接上去就行。I/O 应该也按照这种方式进行”。水管的类比仍然在生效，通过管道连接程序的想法成为了现在被称为 **Unix 哲学** 的一部分 —— 这一组设计原则在 Unix 用户与开发者之间流行起来，该哲学在 1978 年表述如下【12,13】：

1. 让每个程序都做好一件事。要做一件新的工作，写一个新程序，而不是通过添加 “功能” 让老程序复杂化。
2. 期待每个程序的输出成为另一个程序的输入。不要将无关信息混入输出。避免使用严格的列数据或二进制输入格式。不要坚持交互式输入。
3. 设计和构建软件时，即使是操作系统，也让它们能够尽早地被试用，最好在几周内完成。不要犹豫，扔掉笨拙的部分，重建它们。
4. 优先使用工具来减轻编程任务，即使必须绕道去编写工具，且在用完后很可能要扔掉大部分。

这种方法 —— 自动化，快速原型设计，增量式迭代，对实验友好，将大型项目分解成可管理的块 —— 听起来非常像今天的敏捷开发和 DevOps 运动。奇怪的是，四十年来变化不大。

`sort` 工具是一个很好的例子。可以说它比大多数编程语言标准库中的实现（它们不会利用磁盘或使用多线程，即使这样做有很大好处）要更好。然而，单独使用 `sort` 几乎没什么用。只有与其他 Unix 工具（如 `uniq`）结合使用时，它才能发挥出强大的威力。

像 `bash` 这样的 Unix shell 可以让我们轻松地将这些小程序组合成令人讶异的强大数据处理任务。尽管这些程序中有很多是由不同人群编写的，但它们可以灵活地结合在一起。Unix 如何实现这种可组合性？

#### 统一的接口

如果你希望一个程序的输出成为另一个程序的输入，那意味着这些程序必须使用相同的数据格式 —— 换句话说，一个兼容的接口。如果你希望能够将任何程序的输出连接到任何程序的输入，那意味着所有程序必须使用相同的 I/O 接口。

在 Unix 中，这种接口是一个 **文件**（file，更准确地说，是一个文件描述符）。一个文件只是一串有序的字节序列。因为这是一个非常简单的接口，所以可以使用相同的接口来表示许多不同的东西：文件系统上的真实文件，到另一个进程（Unix 套接字，stdin，stdout）的通信通道，设备驱动程序（比如 `/dev/audio` 或 `/dev/lp0`），表示 TCP 连接的套接字，等等。很容易将这些设计视为理所当然的，但实际上能让这些差异巨大的东西共享一个统一的接口是非常厉害的，这使得它们可以很容易地连接在一起 [^ii]。

[^ii]: 统一接口的另一个例子是 URL 和 HTTP，这是 Web 的基石。一个 URL 标识一个网站上的一个特定的东西（资源），你可以链接到任何其他网站的任何网址。具有网络浏览器的用户因此可以通过跟随链接在网站之间无缝跳转，即使服务器可能由完全不相关的组织维护。这个原则现在似乎非常明显，但它却是网络能取得今天成就的关键。之前的系统并不是那么统一：例如，在公告板系统（BBS）时代，每个系统都有自己的电话号码和波特率配置。从一个 BBS 到另一个 BBS 的引用必须以电话号码和调制解调器设置的形式；用户将不得不挂断，拨打其他 BBS，然后手动找到他们正在寻找的信息。直接链接到另一个 BBS 内的一些内容当时是不可能的。

按照惯例，许多（但不是全部）Unix 程序将这个字节序列视为 ASCII 文本。我们的日志分析示例使用了这个事实：`awk`、`sort`、`uniq` 和 `head` 都将它们的输入文件视为由 `\n`（换行符，ASCII `0x0A`）字符分隔的记录列表。`\n` 的选择是任意的 —— 可以说，ASCII 记录分隔符 `0x1E` 本来就是一个更好的选择，因为它是为了这个目的而设计的【14】，但是无论如何，所有这些程序都使用相同的记录分隔符允许它们互操作。

每条记录（即一行输入）的解析则更加模糊。Unix 工具通常通过空白或制表符将行分割成字段，但也使用 CSV（逗号分隔），管道分隔和其他编码。即使像 `xargs` 这样一个相当简单的工具也有六个命令行选项，用于指定如何解析输入。

ASCII 文本的统一接口大多数时候都能工作，但它不是很优雅：我们的日志分析示例使用 `{print $7}` 来提取网址，这样可读性不是很好。在理想的世界中可能是 `{print $request_url}` 或类似的东西。我们稍后会回顾这个想法。

尽管几十年后还不够完美，但统一的 Unix 接口仍然是非常出色的设计。没有多少软件能像 Unix 工具一样交互组合得这么好：你不能通过自定义分析工具轻松地将电子邮件帐户的内容和在线购物历史记录以管道传送至电子表格中，并将结果发布到社交网络或维基。今天，像 Unix 工具一样流畅地运行程序是一种例外，而不是规范。

即使是具有 **相同数据模型** 的数据库，将数据从一种数据库导出再导入到另一种数据库也并不容易。缺乏整合导致了数据的 **巴尔干化**[^译注i]。

[^译注i]: **巴尔干化（Balkanization）** 是一个常带有贬义的地缘政治学术语，其定义为：一个国家或政区分裂成多个互相敌对的国家或政区的过程。


#### 逻辑与布线相分离

Unix 工具的另一个特点是使用标准输入（`stdin`）和标准输出（`stdout`）。如果你运行一个程序，而不指定任何其他的东西，标准输入来自键盘，标准输出指向屏幕。但是，你也可以从文件输入和 / 或将输出重定向到文件。管道允许你将一个进程的标准输出附加到另一个进程的标准输入（有个小内存缓冲区，而不需要将整个中间数据流写入磁盘）。

如果需要，程序仍然可以直接读取和写入文件，但 Unix 方法在程序不关心特定的文件路径、只使用标准输入和标准输出时效果最好。这允许 shell 用户以任何他们想要的方式连接输入和输出；该程序不知道或不关心输入来自哪里以及输出到哪里。（人们可以说这是一种 **松耦合（loose coupling）**，**晚期绑定（late binding）**【15】或 **控制反转（inversion of control）**【16】）。将输入 / 输出布线与程序逻辑分开，可以将小工具组合成更大的系统。

你甚至可以编写自己的程序，并将它们与操作系统提供的工具组合在一起。你的程序只需要从标准输入读取输入，并将输出写入标准输出，它就可以加入数据处理的管道中。在日志分析示例中，你可以编写一个将 User-Agent 字符串转换为更合理的浏览器标识符，或者将 IP 地址转换为国家代码的工具，并将其插入管道。`sort` 程序并不关心它是否与操作系统的另一部分或者你写的程序通信。

但是，使用 `stdin` 和 `stdout` 能做的事情是有限的。需要多个输入或输出的程序虽然可能，却非常棘手。你没法将程序的输出管道连接至网络连接中【17,18】[^iii] 。如果程序直接打开文件进行读取和写入，或者将另一个程序作为子进程启动，或者打开网络连接，那么 I/O 的布线就取决于程序本身了。它仍然可以被配置（例如通过命令行选项），但在 Shell 中对输入和输出进行布线的灵活性就少了。

[^iii]: 除了使用一个单独的工具，如 `netcat` 或 `curl`。Unix 起初试图将所有东西都表示为文件，但是 BSD 套接字 API 偏离了这个惯例【17】。研究用操作系统 Plan 9 和 Inferno 在使用文件方面更加一致：它们将 TCP 连接表示为 `/net/tcp` 中的文件【18】。


#### 透明度和实验

使 Unix 工具如此成功的部分原因是，它们使查看正在发生的事情变得非常容易：

- Unix 命令的输入文件通常被视为不可变的。这意味着你可以随意运行命令，尝试各种命令行选项，而不会损坏输入文件。
- 你可以在任何时候结束管道，将管道输出到 `less`，然后查看它是否具有预期的形式。这种检查能力对调试非常有用。
- 你可以将一个流水线阶段的输出写入文件，并将该文件用作下一阶段的输入。这使你可以重新启动后面的阶段，而无需重新运行整个管道。

因此，与关系数据库的查询优化器相比，即使 Unix 工具非常简单，但仍然非常有用，特别是对于实验而言。

然而，Unix 工具的最大局限在于它们只能在一台机器上运行 —— 而 Hadoop 这样的工具即应运而生。


## MapReduce和分布式文件系统

MapReduce 有点像 Unix 工具，但分布在数千台机器上。像 Unix 工具一样，它相当简单粗暴，但令人惊异地管用。一个 MapReduce 作业可以和一个 Unix 进程相类比：它接受一个或多个输入，并产生一个或多个输出。

和大多数 Unix 工具一样，运行 MapReduce 作业通常不会修改输入，除了生成输出外没有任何副作用。输出文件以连续的方式一次性写入（一旦写入文件，不会修改任何现有的文件部分）。

虽然 Unix 工具使用 `stdin` 和 `stdout` 作为输入和输出，但 MapReduce 作业在分布式文件系统上读写文件。在 Hadoop 的 MapReduce 实现中，该文件系统被称为 **HDFS（Hadoop 分布式文件系统）**，一个 Google 文件系统（GFS）的开源实现【19】。

除 HDFS 外，还有各种其他分布式文件系统，如 GlusterFS 和 Quantcast File System（QFS）【20】。诸如 Amazon S3、Azure Blob 存储和 OpenStack Swift【21】等对象存储服务在很多方面都是相似的 [^iv]。在本章中，我们将主要使用 HDFS 作为示例，但是这些原则适用于任何分布式文件系统。

[^iv]: 一个不同之处在于，对于 HDFS，可以将计算任务安排在存储特定文件副本的计算机上运行，而对象存储通常将存储和计算分开。如果网络带宽是一个瓶颈，从本地磁盘读取有性能优势。但是请注意，如果使用纠删码（Erasure Coding），则会丢失局部性，因为来自多台机器的数据必须进行合并以重建原始文件【20】。

与网络连接存储（NAS）和存储区域网络（SAN）架构的共享磁盘方法相比，HDFS 基于 **无共享** 原则（请参阅 [第二部分](/v1/part-ii) 的介绍）。共享磁盘存储由集中式存储设备实现，通常使用定制硬件和专用网络基础设施（如光纤通道）。而另一方面，无共享方法不需要特殊的硬件，只需要通过传统数据中心网络连接的计算机。

HDFS 在每台机器上运行了一个守护进程，它对外暴露网络服务，允许其他节点访问存储在该机器上的文件（假设数据中心中的每台通用计算机都挂载着一些磁盘）。名为 **NameNode** 的中央服务器会跟踪哪个文件块存储在哪台机器上。因此，HDFS 在概念上创建了一个大型文件系统，可以使用所有运行有守护进程的机器的磁盘。

为了容忍机器和磁盘故障，文件块被复制到多台机器上。复制可能意味着多个机器上的相同数据的多个副本，如 [第五章](/v1/ch5) 中所述，或者诸如 Reed-Solomon 码这样的纠删码方案，它能以比完全复制更低的存储开销来支持恢复丢失的数据【20,22】。这些技术与 RAID 相似，后者可以在连接到同一台机器的多个磁盘上提供冗余；区别在于在分布式文件系统中，文件访问和复制是在传统的数据中心网络上完成的，没有特殊的硬件。

HDFS 的可伸缩性已经很不错了：在撰写本书时，最大的 HDFS 部署运行在上万台机器上，总存储容量达数百 PB【23】。如此大的规模已经变得可行，因为使用商品硬件和开源软件的 HDFS 上的数据存储和访问成本远低于在专用存储设备上支持同等容量的成本【24】。

### MapReduce作业执行

MapReduce 是一个编程框架，你可以使用它编写代码来处理 HDFS 等分布式文件系统中的大型数据集。理解它的最简单方法是参考 “[简单日志分析](#简单日志分析)” 中的 Web 服务器日志分析示例。MapReduce 中的数据处理模式与此示例非常相似：

1. 读取一组输入文件，并将其分解成 **记录（records）**。在 Web 服务器日志示例中，每条记录都是日志中的一行（即 `\n` 是记录分隔符）。
2. 调用 Mapper 函数，从每条输入记录中提取一对键值。在前面的例子中，Mapper 函数是 `awk '{print $7}'`：它提取 URL（`$7`）作为键，并将值留空。
3. 按键排序所有的键值对。在日志的例子中，这由第一个 `sort` 命令完成。
4. 调用 Reducer 函数遍历排序后的键值对。如果同一个键出现多次，排序使它们在列表中相邻，所以很容易组合这些值而不必在内存中保留很多状态。在前面的例子中，Reducer 是由 `uniq -c` 命令实现的，该命令使用相同的键来统计相邻记录的数量。

这四个步骤可以作为一个 MapReduce 作业执行。步骤 2（Map）和 4（Reduce）是你编写自定义数据处理代码的地方。步骤 1（将文件分解成记录）由输入格式解析器处理。步骤 3 中的排序步骤隐含在 MapReduce 中 —— 你不必编写它，因为 Mapper 的输出始终在送往 Reducer 之前进行排序。

要创建 MapReduce 作业，你需要实现两个回调函数，Mapper 和 Reducer，其行为如下（请参阅 “[MapReduce 查询](/v1/ch2#MapReduce查询)”）：

Mapper
: Mapper 会在每条输入记录上调用一次，其工作是从输入记录中提取键值。对于每个输入，它可以生成任意数量的键值对（包括零个）。它不会保留从一个输入记录到下一个记录的任何状态，因此每个记录都是独立处理的。

Reducer
: MapReduce 框架拉取由 Mapper 生成的键值对，收集属于同一个键的所有值，并在这组值上迭代调用 Reducer。Reducer 可以产生输出记录（例如相同 URL 的出现次数）。

在 Web 服务器日志的例子中，我们在第 5 步中有第二个 `sort` 命令，它按请求数对 URL 进行排序。在 MapReduce 中，如果你需要第二个排序阶段，则可以通过编写第二个 MapReduce 作业并将第一个作业的输出用作第二个作业的输入来实现它。这样看来，Mapper 的作用是将数据整理成适合排序的形式，而 Reducer 的作用是处理已排序的数据。

#### 分布式执行MapReduce

MapReduce 与 Unix 命令管道的主要区别在于，MapReduce 可以在多台机器上并行执行计算，而无需编写代码来显式处理并行问题。Mapper 和 Reducer 一次只能处理一条记录；它们不需要知道它们的输入来自哪里，或者输出去往什么地方，所以框架可以处理在机器之间移动数据的复杂性。

在分布式计算中可以使用标准的 Unix 工具作为 Mapper 和 Reducer【25】，但更常见的是，它们被实现为传统编程语言的函数。在 Hadoop MapReduce 中，Mapper 和 Reducer 都是实现特定接口的 Java 类。在 MongoDB 和 CouchDB 中，Mapper 和 Reducer 都是 JavaScript 函数（请参阅 “[MapReduce 查询](/v1/ch2#MapReduce查询)”）。

[图 10-1](/v1/ddia_1001.png) 显示了 Hadoop MapReduce 作业中的数据流。其并行化基于分区（请参阅 [第六章](/v1/ch6)）：作业的输入通常是 HDFS 中的一个目录，输入目录中的每个文件或文件块都被认为是一个单独的分区，可以单独处理 map 任务（[图 10-1](/v1/ddia_1001.png) 中的 m1，m2 和 m3 标记）。

每个输入文件的大小通常是数百兆字节。MapReduce 调度器（图中未显示）试图在其中一台存储输入文件副本的机器上运行每个 Mapper，只要该机器有足够的备用 RAM 和 CPU 资源来运行 Mapper 任务【26】。这个原则被称为 **将计算放在数据附近**【27】：它节省了通过网络复制输入文件的开销，减少网络负载并增加局部性。

![](/v1/ddia_1001.png)

**图 10-1 具有三个 Mapper 和三个 Reducer 的 MapReduce 任务**

在大多数情况下，应该在 Mapper 任务中运行的应用代码在将要运行它的机器上还不存在，所以 MapReduce 框架首先将代码（例如 Java 程序中的 JAR 文件）复制到适当的机器。然后启动 Map 任务并开始读取输入文件，一次将一条记录传入 Mapper 回调函数。Mapper 的输出由键值对组成。

计算的 Reduce 端也被分区。虽然 Map 任务的数量由输入文件块的数量决定，但 Reducer 的任务的数量是由作业作者配置的（它可以不同于 Map 任务的数量）。为了确保具有相同键的所有键值对最终落在相同的 Reducer 处，框架使用键的散列值来确定哪个 Reduce 任务应该接收到特定的键值对（请参阅 “[根据键的散列分区](/v1/ch6#根据键的散列分区)”）。

键值对必须进行排序，但数据集可能太大，无法在单台机器上使用常规排序算法进行排序。相反，排序是分阶段进行的。首先每个 Map 任务都按照 Reducer 对输出进行分区。每个分区都被写入 Mapper 程序的本地磁盘，使用的技术与我们在 “[SSTables 和 LSM 树](/v1/ch3#SSTables和LSM树)” 中讨论的类似。

只要当 Mapper 读取完输入文件，并写完排序后的输出文件，MapReduce 调度器就会通知 Reducer 可以从该 Mapper 开始获取输出文件。Reducer 连接到每个 Mapper，并下载自己相应分区的有序键值对文件。按 Reducer 分区，排序，从 Mapper 向 Reducer 复制分区数据，这一整个过程被称为 **混洗（shuffle）**【26】（一个容易混淆的术语  —— 不像洗牌，在 MapReduce 中的混洗没有随机性）。

Reduce 任务从 Mapper 获取文件，并将它们合并在一起，并保留有序特性。因此，如果不同的 Mapper 生成了键相同的记录，则在 Reducer 的输入中，这些记录将会相邻。

Reducer 调用时会收到一个键，和一个迭代器作为参数，迭代器会顺序地扫过所有具有该键的记录（因为在某些情况可能无法完全放入内存中）。Reducer 可以使用任意逻辑来处理这些记录，并且可以生成任意数量的输出记录。这些输出记录会写入分布式文件系统上的文件中（通常是在跑 Reducer 的机器本地磁盘上留一份，并在其他机器上留几份副本）。

#### MapReduce工作流

单个 MapReduce 作业可以解决的问题范围很有限。以日志分析为例，单个 MapReduce 作业可以确定每个 URL 的页面浏览次数，但无法确定最常见的 URL，因为这需要第二轮排序。

因此将 MapReduce 作业链接成为 **工作流（workflow）** 是极为常见的，例如，一个作业的输出成为下一个作业的输入。Hadoop MapReduce 框架对工作流没有特殊支持，所以这个链是通过目录名隐式实现的：第一个作业必须将其输出配置为 HDFS 中的指定目录，第二个作业必须将其输入配置为同一个目录。从 MapReduce 框架的角度来看，这是两个独立的作业。

因此，被链接的 MapReduce 作业并没有那么像 Unix 命令管道（它直接将一个进程的输出作为另一个进程的输入，仅用一个很小的内存缓冲区）。它更像是一系列命令，其中每个命令的输出写入临时文件，下一个命令从临时文件中读取。这种设计有利也有弊，我们将在 “[物化中间状态](#物化中间状态)” 中讨论。

只有当作业成功完成后，批处理作业的输出才会被视为有效的（MapReduce 会丢弃失败作业的部分输出）。因此，工作流中的一项作业只有在先前的作业 —— 即生产其输入的作业 —— 成功完成后才能开始。为了处理这些作业之间的依赖，有很多针对 Hadoop 的工作流调度器被开发出来，包括 Oozie、Azkaban、Luigi、Airflow 和 Pinball 【28】。

这些调度程序还具有管理功能，在维护大量批处理作业时非常有用。在构建推荐系统时，由 50 到 100 个 MapReduce 作业组成的工作流是常见的【29】。而在大型组织中，许多不同的团队可能运行不同的作业来读取彼此的输出。工具支持对于管理这样复杂的数据流而言非常重要。

Hadoop 的各种高级工具（如 Pig 【30】、Hive 【31】、Cascading 【32】、Crunch 【33】和 FlumeJava 【34】）也能自动布线组装多个 MapReduce 阶段，生成合适的工作流。

### Reduce侧连接与分组

我们在 [第二章](/v1/ch2) 中讨论了数据模型和查询语言的连接，但是我们还没有深入探讨连接是如何实现的。现在是我们再次捡起这条线索的时候了。

在许多数据集中，一条记录与另一条记录存在关联是很常见的：关系模型中的 **外键**，文档模型中的 **文档引用** 或图模型中的 **边**。当你需要同时访问这一关联的两侧（持有引用的记录与被引用的记录）时，连接就是必须的。正如 [第二章](/v1/ch2) 所讨论的，非规范化可以减少对连接的需求，但通常无法将其完全移除 [^v]。

[^v]: 我们在本书中讨论的连接通常是等值连接，即最常见的连接类型，其中记录通过与其他记录在特定字段（例如 ID）中具有 **相同值** 相关联。有些数据库支持更通用的连接类型，例如使用小于运算符而不是等号运算符，但是我们没有地方来讲这些东西。

在数据库中，如果执行只涉及少量记录的查询，数据库通常会使用 **索引** 来快速定位感兴趣的记录（请参阅 [第三章](/v1/ch3)）。如果查询涉及到连接，则可能涉及到查找多个索引。然而 MapReduce 没有索引的概念 —— 至少在通常意义上没有。

当 MapReduce 作业被赋予一组文件作为输入时，它读取所有这些文件的全部内容；数据库会将这种操作称为 **全表扫描**。如果你只想读取少量的记录，则全表扫描与索引查询相比，代价非常高昂。但是在分析查询中（请参阅 “[事务处理还是分析？](/v1/ch3#事务处理还是分析？)”），通常需要计算大量记录的聚合。在这种情况下，特别是如果能在多台机器上并行处理时，扫描整个输入可能是相当合理的事情。

当我们在批处理的语境中讨论连接时，我们指的是在数据集中解析某种关联的全量存在。例如我们假设一个作业是同时处理所有用户的数据，而非仅仅是为某个特定用户查找数据（而这能通过索引更高效地完成）。

#### 示例：用户活动事件分析

[图 10-2](/v1/ddia_1002.png) 给出了一个批处理作业中连接的典型例子。左侧是事件日志，描述登录用户在网站上做的事情（称为 **活动事件**，即 activity events，或 **点击流数据**，即 clickstream data），右侧是用户数据库。你可以将此示例看作是星型模式的一部分（请参阅 “[星型和雪花型：分析的模式](/v1/ch3#星型和雪花型：分析的模式)”）：事件日志是事实表，用户数据库是其中的一个维度。

![](/v1/ddia_1002.png)

**图 10-2 用户行为日志与用户档案的连接**

分析任务可能需要将用户活动与用户档案信息相关联：例如，如果档案包含用户的年龄或出生日期，系统就可以确定哪些页面更受哪些年龄段的用户欢迎。然而活动事件仅包含用户 ID，而没有包含完整的用户档案信息。在每个活动事件中嵌入这些档案信息很可能会非常浪费。因此，活动事件需要与用户档案数据库相连接。

实现这一连接的最简单方法是，逐个遍历活动事件，并为每个遇到的用户 ID 查询用户数据库（在远程服务器上）。这是可能的，但是它的性能可能会非常差：处理吞吐量将受限于数据库服务器的往返时间，本地缓存的有效性很大程度上取决于数据的分布，并行运行大量查询可能会轻易压垮数据库【35】。

为了在批处理过程中实现良好的吞吐量，计算必须（尽可能）限于单台机器上进行。为待处理的每条记录发起随机访问的网络请求实在是太慢了。而且，查询远程数据库意味着批处理作业变为 **非确定的（nondeterministic）**，因为远程数据库中的数据可能会改变。

因此，更好的方法是获取用户数据库的副本（例如，使用 ETL 进程从数据库备份中提取数据，请参阅 “[数据仓库](/v1/ch3#数据仓库)”），并将它和用户行为日志放入同一个分布式文件系统中。然后你可以将用户数据库存储在 HDFS 中的一组文件中，而用户活动记录存储在另一组文件中，并能用 MapReduce 将所有相关记录集中到同一个地方进行高效处理。

#### 排序合并连接

回想一下，Mapper 的目的是从每个输入记录中提取一对键值。在 [图 10-2](/v1/ddia_1002.png) 的情况下，这个键就是用户 ID：一组 Mapper 会扫过活动事件（提取用户 ID 作为键，活动事件作为值），而另一组 Mapper 将会扫过用户数据库（提取用户 ID 作为键，用户的出生日期作为值）。这个过程如 [图 10-3](/v1/ddia_1003.png) 所示。

![](/v1/ddia_1003.png)

**图 10-3 在用户 ID 上进行的 Reduce 端连接。如果输入数据集分区为多个文件，则每个分区都会被多个 Mapper 并行处理**

当 MapReduce 框架通过键对 Mapper 输出进行分区，然后对键值对进行排序时，效果是具有相同 ID 的所有活动事件和用户记录在 Reducer 输入中彼此相邻。Map-Reduce 作业甚至可以对这些记录进一步排序，使 Reducer 总能先看到来自用户数据库的记录，紧接着是按时间戳顺序排序的活动事件 —— 这种技术被称为 **二次排序（secondary sort）**【26】。

然后 Reducer 可以容易地执行实际的连接逻辑：每个用户 ID 都会被调用一次 Reducer 函数，且因为二次排序，第一个值应该是来自用户数据库的出生日期记录。Reducer 将出生日期存储在局部变量中，然后使用相同的用户 ID 遍历活动事件，输出 **已观看网址** 和 **观看者年龄** 的结果对。随后的 Map-Reduce 作业可以计算每个 URL 的查看者年龄分布，并按年龄段进行聚集。

由于 Reducer 一次处理一个特定用户 ID 的所有记录，因此一次只需要将一条用户记录保存在内存中，而不需要通过网络发出任何请求。这个算法被称为 **排序合并连接（sort-merge join）**，因为 Mapper 的输出是按键排序的，然后 Reducer 将来自连接两侧的有序记录列表合并在一起。

#### 把相关数据放在一起

在排序合并连接中，Mapper 和排序过程确保了所有对特定用户 ID 执行连接操作的必须数据都被放在同一个地方：单次调用 Reducer 的地方。预先排好了所有需要的数据，Reducer 可以是相当简单的单线程代码，能够以高吞吐量和低内存开销扫过这些记录。

这种架构可以看做，Mapper 将 “消息” 发送给 Reducer。当一个 Mapper 发出一个键值对时，这个键的作用就像值应该传递到的目标地址。即使键只是一个任意的字符串（不是像 IP 地址和端口号那样的实际的网络地址），它表现的就像一个地址：所有具有相同键的键值对将被传递到相同的目标（一次 Reducer 的调用）。

使用 MapReduce 编程模型，能将计算的物理网络通信层面（从正确的机器获取数据）从应用逻辑中剥离出来（获取数据后执行处理）。这种分离与数据库的典型用法形成了鲜明对比，从数据库中获取数据的请求经常出现在应用代码内部【36】。由于 MapReduce 处理了所有的网络通信，因此它也避免了让应用代码去担心部分故障，例如另一个节点的崩溃：MapReduce 在不影响应用逻辑的情况下能透明地重试失败的任务。

#### 分组

除了连接之外，“把相关数据放在一起” 的另一种常见模式是，按某个键对记录分组（如 SQL 中的 GROUP BY 子句）。所有带有相同键的记录构成一个组，而下一步往往是在每个组内进行某种聚合操作，例如：

- 统计每个组中记录的数量（例如在统计 PV 的例子中，在 SQL 中表示为 `COUNT(*)` 聚合）
- 对某个特定字段求和（SQL 中的 `SUM(fieldname)`）
- 按某种分级函数取出排名前 k 条记录。

使用 MapReduce 实现这种分组操作的最简单方法是设置 Mapper，以便它们生成的键值对使用所需的分组键。然后分区和排序过程将所有具有相同分区键的记录导向同一个 Reducer。因此在 MapReduce 之上实现分组和连接看上去非常相似。

分组的另一个常见用途是整理特定用户会话的所有活动事件，以找出用户进行的一系列操作（称为 **会话化（sessionization）**【37】）。例如，可以使用这种分析来确定显示新版网站的用户是否比那些显示旧版本的用户更有购买欲（A/B 测试），或者计算某个营销活动是否值得。

如果你有多个 Web 服务器处理用户请求，则特定用户的活动事件很可能分散在各个不同的服务器的日志文件中。你可以通过使用会话 cookie，用户 ID 或类似的标识符作为分组键，以将特定用户的所有活动事件放在一起来实现会话化，与此同时，不同用户的事件仍然散布在不同的分区中。

#### 处理偏斜

如果存在与单个键关联的大量数据，则 “将具有相同键的所有记录放到相同的位置” 这种模式就被破坏了。例如在社交网络中，大多数用户可能会与几百人有连接，但少数名人可能有数百万的追随者。这种不成比例的活动数据库记录被称为 **关键对象（linchpin object）**【38】或 **热键（hot key）**。

在单个 Reducer 中收集与某个名人相关的所有活动（例如他们发布内容的回复）可能导致严重的 **偏斜**（也称为 **热点**，即 hot spot）—— 也就是说，一个 Reducer 必须比其他 Reducer 处理更多的记录（请参阅 “[负载偏斜与热点消除](/v1/ch6#负载偏斜与热点消除)”）。由于 MapReduce 作业只有在所有 Mapper 和 Reducer 都完成时才完成，所有后续作业必须等待最慢的 Reducer 才能启动。

如果连接的输入存在热键，可以使用一些算法进行补偿。例如，Pig 中的 **偏斜连接（skewed join）** 方法首先运行一个抽样作业（Sampling Job）来确定哪些键是热键【39】。连接实际执行时，Mapper 会将热键的关联记录 **随机**（相对于传统 MapReduce 基于键散列的确定性方法）发送到几个 Reducer 之一。对于另外一侧的连接输入，与热键相关的记录需要被复制到 **所有** 处理该键的 Reducer 上【40】。

这种技术将处理热键的工作分散到多个 Reducer 上，这样可以使其更好地并行化，代价是需要将连接另一侧的输入记录复制到多个 Reducer 上。Crunch 中的 **分片连接（sharded join）** 方法与之类似，但需要显式指定热键而不是使用抽样作业。这种技术也非常类似于我们在 “[负载偏斜与热点消除](/v1/ch6#负载偏斜与热点消除)” 中讨论的技术，使用随机化来缓解分区数据库中的热点。

Hive 的偏斜连接优化采取了另一种方法。它需要在表格元数据中显式指定热键，并将与这些键相关的记录单独存放，与其它文件分开。当在该表上执行连接时，对于热键，它会使用 Map 端连接（请参阅下一节）。

当按照热键进行分组并聚合时，可以将分组分两个阶段进行。第一个 MapReduce 阶段将记录发送到随机 Reducer，以便每个 Reducer 只对热键的子集执行分组，为每个键输出一个更紧凑的中间聚合结果。然后第二个 MapReduce 作业将所有来自第一阶段 Reducer 的中间聚合结果合并为每个键一个值。


### Map侧连接

上一节描述的连接算法在 Reducer 中执行实际的连接逻辑，因此被称为 Reduce 侧连接。Mapper 扮演着预处理输入数据的角色：从每个输入记录中提取键值，将键值对分配给 Reducer 分区，并按键排序。

Reduce 侧方法的优点是不需要对输入数据做任何假设：无论其属性和结构如何，Mapper 都可以对其预处理以备连接。然而不利的一面是，排序，复制至 Reducer，以及合并 Reducer 输入，所有这些操作可能开销巨大。当数据通过 MapReduce 阶段时，数据可能需要落盘好几次，取决于可用的内存缓冲区【37】。

另一方面，如果你 **能** 对输入数据作出某些假设，则通过使用所谓的 Map 侧连接来加快连接速度是可行的。这种方法使用了一个裁减掉 Reducer 与排序的 MapReduce 作业，每个 Mapper 只是简单地从分布式文件系统中读取一个输入文件块，然后将输出文件写入文件系统，仅此而已。

#### 广播散列连接

适用于执行 Map 端连接的最简单场景是大数据集与小数据集连接的情况。要点在于小数据集需要足够小，以便可以将其全部加载到每个 Mapper 的内存中。

例如，假设在 [图 10-2](/v1/ddia_1002.png) 的情况下，用户数据库小到足以放进内存中。在这种情况下，当 Mapper 启动时，它可以首先将用户数据库从分布式文件系统读取到内存中的散列表中。完成此操作后，Mapper 可以扫描用户活动事件，并简单地在散列表中查找每个事件的用户 ID [^vi]。

[^vi]: 这个例子假定散列表中的每个键只有一个条目，这对用户数据库（用户 ID 唯一标识一个用户）可能是正确的。通常，哈希表可能需要包含具有相同键的多个条目，而连接运算符将对每个键输出所有的匹配。

参与连接的较大输入的每个文件块各有一个 Mapper（在 [图 10-2](/v1/ddia_1002.png) 的例子中活动事件是较大的输入）。每个 Mapper 都会将较小输入整个加载到内存中。

这种简单有效的算法被称为 **广播散列连接（broadcast hash join）**：**广播** 一词反映了这样一个事实，每个连接较大输入端分区的 Mapper 都会将较小输入端数据集整个读入内存中（所以较小输入实际上 “广播” 到较大数据的所有分区上），**散列** 一词反映了它使用一个散列表。Pig（名为 “**复制连接（replicated join）**”），Hive（“**MapJoin**”），Cascading 和 Crunch 支持这种连接。它也被诸如 Impala 的数据仓库查询引擎使用【41】。

除了将较小的连接输入加载到内存散列表中，另一种方法是将较小输入存储在本地磁盘上的只读索引中【42】。索引中经常使用的部分将保留在操作系统的页面缓存中，因而这种方法可以提供与内存散列表几乎一样快的随机查找性能，但实际上并不需要数据集能放入内存中。

#### 分区散列连接

如果 Map 侧连接的输入以相同的方式进行分区，则散列连接方法可以独立应用于每个分区。在 [图 10-2](/v1/ddia_1002.png) 的情况中，你可以根据用户 ID 的最后一位十进制数字来对活动事件和用户数据库进行分区（因此连接两侧各有 10 个分区）。例如，Mapper3 首先将所有具有以 3 结尾的 ID 的用户加载到散列表中，然后扫描 ID 以 3 结尾的每个用户的所有活动事件。

如果分区正确无误，可以确定的是，所有你可能需要连接的记录都落在同一个编号的分区中。因此每个 Mapper 只需要从输入两端各读取一个分区就足够了。好处是每个 Mapper 都可以在内存散列表中少放点数据。

这种方法只有当连接两端输入有相同的分区数，且两侧的记录都是使用相同的键与相同的哈希函数做分区时才适用。如果输入是由之前执行过这种分组的 MapReduce 作业生成的，那么这可能是一个合理的假设。

分区散列连接在 Hive 中称为 **Map 侧桶连接（bucketed map joins）**【37】。

#### Map侧合并连接

如果输入数据集不仅以相同的方式进行分区，而且还基于相同的键进行 **排序**，则可适用另一种 Map 侧连接的变体。在这种情况下，输入是否小到能放入内存并不重要，因为这时候 Mapper 同样可以执行通常由 Reducer 执行的归并操作：按键递增的顺序依次读取两个输入文件，将具有相同键的记录配对。

如果能进行 Map 侧合并连接，这通常意味着前一个 MapReduce 作业可能一开始就已经把输入数据做了分区并进行了排序。原则上这个连接就可以在前一个作业的 Reduce 阶段进行。但使用独立的仅 Map 作业有时也是合适的，例如，分好区且排好序的中间数据集可能还会用于其他目的。

#### MapReduce工作流与Map侧连接

当下游作业使用 MapReduce 连接的输出时，选择 Map 侧连接或 Reduce 侧连接会影响输出的结构。Reduce 侧连接的输出是按照 **连接键** 进行分区和排序的，而 Map 端连接的输出则按照与较大输入相同的方式进行分区和排序（因为无论是使用分区连接还是广播连接，连接较大输入端的每个文件块都会启动一个 Map 任务）。

如前所述，Map 侧连接也对输入数据集的大小，有序性和分区方式做出了更多假设。在优化连接策略时，了解分布式文件系统中数据集的物理布局变得非常重要：仅仅知道编码格式和数据存储目录的名称是不够的；你还必须知道数据是按哪些键做的分区和排序，以及分区的数量。

在 Hadoop 生态系统中，这种关于数据集分区的元数据通常在 HCatalog 和 Hive Metastore 中维护【37】。


### 批处理工作流的输出

我们已经说了很多用于实现 MapReduce 工作流的算法，但却忽略了一个重要的问题：这些处理完成之后的最终结果是什么？我们最开始为什么要跑这些作业？

在数据库查询的场景中，我们将事务处理（OLTP）与分析两种目的区分开来（请参阅 “[事务处理还是分析？](/v1/ch3#事务处理还是分析？)”）。我们看到，OLTP 查询通常根据键查找少量记录，使用索引，并将其呈现给用户（比如在网页上）。另一方面，分析查询通常会扫描大量记录，执行分组与聚合，输出通常有着报告的形式：显示某个指标随时间变化的图表，或按照某种排位取前 10 项，或将一些数字细化为子类。这种报告的消费者通常是需要做出商业决策的分析师或经理。

批处理放哪里合适？它不属于事务处理，也不是分析。它和分析比较接近，因为批处理通常会扫过输入数据集的绝大部分。然而 MapReduce 作业工作流与用于分析目的的 SQL 查询是不同的（请参阅 “[Hadoop 与分布式数据库的对比](#Hadoop与分布式数据库的对比)”）。批处理过程的输出通常不是报表，而是一些其他类型的结构。

#### 建立搜索索引

Google 最初使用 MapReduce 是为其搜索引擎建立索引，其实现为由 5 到 10 个 MapReduce 作业组成的工作流【1】。虽然 Google 后来不再为这个目的而使用 MapReduce 【43】，但如果从构建搜索索引的角度来看，更能帮助理解 MapReduce。（直至今日，Hadoop MapReduce 仍然是为 Lucene/Solr 构建索引的好方法【44】）

我们在 “[全文搜索和模糊索引](/v1/ch3#全文搜索和模糊索引)” 中简要地了解了 Lucene 这样的全文搜索索引是如何工作的：它是一个文件（关键词字典），你可以在其中高效地查找特定关键字，并找到包含该关键字的所有文档 ID 列表（倒排列表，即 postings list）。这是一种非常简化的看法 —— 实际上，搜索索引需要各种额外数据，以便根据相关性对搜索结果进行排名、纠正拼写错误、解析同义词等等 —— 但这个原则是成立的。

如果需要对一组固定文档执行全文搜索，则批处理是一种构建索引的高效方法：Mapper 根据需要对文档集合进行分区，每个 Reducer 构建该分区的索引，并将索引文件写入分布式文件系统。构建这样的文档分区索引（请参阅 “[分区与次级索引](/v1/ch6#分区与次级索引)”）并行处理效果拔群。

由于按关键字查询搜索索引是只读操作，因而这些索引文件一旦创建就是不可变的。

如果索引的文档集合发生更改，一种选择是定期重跑整个索引工作流，并在完成后用新的索引文件批量替换以前的索引文件。如果只有少量的文档发生了变化，这种方法的计算成本可能会很高。但它的优点是索引过程很容易理解：文档进，索引出。

另一个选择是，可以增量建立索引。如 [第三章](/v1/ch3) 中讨论的，如果要在索引中添加，删除或更新文档，Lucene 会写新的段文件，并在后台异步合并压缩段文件。我们将在 [第十一章](/v1/ch11) 中看到更多这种增量处理。

#### 键值存储作为批处理输出

搜索索引只是批处理工作流可能输出的一个例子。批处理的另一个常见用途是构建机器学习系统，例如分类器（比如垃圾邮件过滤器，异常检测，图像识别）与推荐系统（例如，你可能认识的人，你可能感兴趣的产品或相关的搜索【29】）。

这些批处理作业的输出通常是某种数据库：例如，可以通过给定用户 ID 查询该用户推荐好友的数据库，或者可以通过产品 ID 查询相关产品的数据库【45】。

这些数据库需要被处理用户请求的 Web 应用所查询，而它们通常是独立于 Hadoop 基础设施的。那么批处理过程的输出如何回到 Web 应用可以查询的数据库中呢？

最直接的选择可能是，直接在 Mapper 或 Reducer 中使用你最爱的数据库的客户端库，并从批处理作业直接写入数据库服务器，一次写入一条记录。它能工作（假设你的防火墙规则允许从你的 Hadoop 环境直接访问你的生产数据库），但这并不是一个好主意，出于以下几个原因：

- 正如前面在连接的上下文中讨论的那样，为每条记录发起一个网络请求，要比批处理任务的正常吞吐量慢几个数量级。即使客户端库支持批处理，性能也可能很差。
- MapReduce 作业经常并行运行许多任务。如果所有 Mapper 或 Reducer 都同时写入相同的输出数据库，并以批处理的预期速率工作，那么该数据库很可能被轻易压垮，其查询性能可能变差。这可能会导致系统其他部分的运行问题【35】。
- 通常情况下，MapReduce 为作业输出提供了一个干净利落的 “全有或全无” 保证：如果作业成功，则结果就是每个任务恰好执行一次所产生的输出，即使某些任务失败且必须一路重试。如果整个作业失败，则不会生成输出。然而从作业内部写入外部系统，会产生外部可见的副作用，这种副作用是不能以这种方式被隐藏的。因此，你不得不去操心对其他系统可见的部分完成的作业结果，并需要理解 Hadoop 任务尝试与预测执行的复杂性。

更好的解决方案是在批处理作业 **内** 创建一个全新的数据库，并将其作为文件写入分布式文件系统中作业的输出目录，就像上节中的搜索索引一样。这些数据文件一旦写入就是不可变的，可以批量加载到处理只读查询的服务器中。不少键值存储都支持在 MapReduce 作业中构建数据库文件，包括 Voldemort 【46】、Terrapin 【47】、ElephantDB 【48】和 HBase 批量加载【49】。

构建这些数据库文件是 MapReduce 的一种好用法：使用 Mapper 提取出键并按该键排序，已经完成了构建索引所必需的大量工作。由于这些键值存储大多都是只读的（文件只能由批处理作业一次性写入，然后就不可变），所以数据结构非常简单。比如它们就不需要预写式日志（WAL，请参阅 “[让 B 树更可靠](/v1/ch3#让B树更可靠)”）。

将数据加载到 Voldemort 时，服务器将继续用旧数据文件服务请求，同时将新数据文件从分布式文件系统复制到服务器的本地磁盘。一旦复制完成，服务器会自动将查询切换到新文件。如果在这个过程中出现任何问题，它可以轻易回滚至旧文件，因为它们仍然存在而且不可变【46】。

#### 批处理输出的哲学

本章前面讨论过的 Unix 哲学（“[Unix 哲学](#Unix哲学)”）鼓励以显式指明数据流的方式进行实验：程序读取输入并写入输出。在这一过程中，输入保持不变，任何先前的输出都被新输出完全替换，且没有其他副作用。这意味着你可以随心所欲地重新运行一个命令，略做改动或进行调试，而不会搅乱系统的状态。

MapReduce 作业的输出处理遵循同样的原理。通过将输入视为不可变且避免副作用（如写入外部数据库），批处理作业不仅实现了良好的性能，而且更容易维护：

- 如果在代码中引入了一个错误，而输出错误或损坏了，则可以简单地回滚到代码的先前版本，然后重新运行该作业，输出将重新被纠正。或者，甚至更简单，你可以将旧的输出保存在不同的目录中，然后切换回原来的目录。具有读写事务的数据库没有这个属性：如果你部署了错误的代码，将错误的数据写入数据库，那么回滚代码将无法修复数据库中的数据。（能够从错误代码中恢复的概念被称为 **人类容错（human fault tolerance）**【50】）
- 由于回滚很容易，比起在错误意味着不可挽回的伤害的环境，功能开发进展能快很多。这种 **最小化不可逆性（minimizing irreversibility）** 的原则有利于敏捷软件开发【51】。
- 如果 Map 或 Reduce 任务失败，MapReduce 框架将自动重新调度，并在同样的输入上再次运行它。如果失败是由代码中的错误造成的，那么它会不断崩溃，并最终导致作业在几次尝试之后失败。但是如果故障是由于临时问题导致的，那么故障就会被容忍。因为输入不可变，这种自动重试是安全的，而失败任务的输出会被 MapReduce 框架丢弃。
- 同一组文件可用作各种不同作业的输入，包括监控作业：它们计算指标，并评估作业的输出是否具有预期的性质（例如，将其与前一次运行的输出进行比较并测量差异）。
- 与 Unix 工具类似，MapReduce 作业将逻辑与布线（配置输入和输出目录）分离，这使得关注点分离，可以重用代码：一个团队可以专注实现一个做好一件事的作业；而其他团队可以决定何时何地运行这项作业。

在这些领域，在 Unix 上表现良好的设计原则似乎也适用于 Hadoop，但 Unix 和 Hadoop 在某些方面也有所不同。例如，因为大多数 Unix 工具都假设输入输出是无类型文本文件，所以它们必须做大量的输入解析工作（本章开头的日志分析示例使用 `{print $7}` 来提取 URL）。在 Hadoop 上可以通过使用更结构化的文件格式消除一些低价值的语法转换：比如 Avro（请参阅 “[Avro](/v1/ch4#Avro)”）和 Parquet（请参阅 “[列式存储](/v1/ch3#列式存储)”）经常使用，因为它们提供了基于模式的高效编码，并允许模式随时间推移而演进（见 [第四章](/v1/ch4)）。

### Hadoop与分布式数据库的对比

正如我们所看到的，Hadoop 有点像 Unix 的分布式版本，其中 HDFS 是文件系统，而 MapReduce 是 Unix 进程的怪异实现（总是在 Map 阶段和 Reduce 阶段之间运行 `sort` 工具）。我们了解了如何在这些原语的基础上实现各种连接和分组操作。

当 MapReduce 论文发表时【1】，它从某种意义上来说 —— 并不新鲜。我们在前几节中讨论的所有处理和并行连接算法已经在十多年前所谓的 **大规模并行处理（MPP，massively parallel processing）** 数据库中实现了【3,40】。比如 Gamma database machine、Teradata 和 Tandem NonStop SQL 就是这方面的先驱【52】。

最大的区别是，MPP 数据库专注于在一组机器上并行执行分析 SQL 查询，而 MapReduce 和分布式文件系统【19】的组合则更像是一个可以运行任意程序的通用操作系统。

#### 存储多样性

数据库要求你根据特定的模型（例如关系或文档）来构造数据，而分布式文件系统中的文件只是字节序列，可以使用任何数据模型和编码来编写。它们可能是数据库记录的集合，但同样可以是文本、图像、视频、传感器读数、稀疏矩阵、特征向量、基因组序列或任何其他类型的数据。

说白了，Hadoop 开放了将数据不加区分地转储到 HDFS 的可能性，允许后续再研究如何进一步处理【53】。相比之下，在将数据导入数据库专有存储格式之前，MPP 数据库通常需要对数据和查询模式进行仔细的前期建模。

在纯粹主义者看来，这种仔细的建模和导入似乎是可取的，因为这意味着数据库的用户有更高质量的数据来处理。然而实践经验表明，简单地使数据快速可用 —— 即使它很古怪，难以使用，使用原始格式 —— 也通常要比事先决定理想数据模型要更有价值【54】。

这个想法与数据仓库类似（请参阅 “[数据仓库](/v1/ch3#数据仓库)”）：将大型组织的各个部分的数据集中在一起是很有价值的，因为它可以跨越以前相互分离的数据集进行连接。MPP 数据库所要求的谨慎模式设计拖慢了集中式数据收集速度；以原始形式收集数据，稍后再操心模式的设计，能使数据收集速度加快（有时被称为 “**数据湖（data lake）**” 或 “**企业数据中心（enterprise data hub）**”【55】）。

不加区分的数据转储转移了解释数据的负担：数据集的生产者不再需要强制将其转化为标准格式，数据的解释成为消费者的问题（**读时模式** 方法【56】；请参阅 “[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)”）。如果生产者和消费者是不同优先级的不同团队，这可能是一种优势。甚至可能不存在一个理想的数据模型，对于不同目的有不同的合适视角。以原始形式简单地转储数据，可以允许多种这样的转换。这种方法被称为 **寿司原则（sushi principle）**：“原始数据更好”【57】。

因此，Hadoop 经常被用于实现 ETL 过程（请参阅 “[数据仓库](/v1/ch3#数据仓库)”）：事务处理系统中的数据以某种原始形式转储到分布式文件系统中，然后编写 MapReduce 作业来清理数据，将其转换为关系形式，并将其导入 MPP 数据仓库以进行分析。数据建模仍然在进行，但它在一个单独的步骤中进行，与数据收集相解耦。这种解耦是可行的，因为分布式文件系统支持以任何格式编码的数据。

#### 处理模型的多样性

MPP 数据库是单体的，紧密集成的软件，负责磁盘上的存储布局，查询计划，调度和执行。由于这些组件都可以针对数据库的特定需求进行调整和优化，因此整个系统可以在其设计针对的查询类型上取得非常好的性能。而且，SQL 查询语言允许以优雅的语法表达查询，而无需编写代码，可以在业务分析师使用的可视化工具（例如 Tableau）中访问到。

另一方面，并非所有类型的处理都可以合理地表达为 SQL 查询。例如，如果要构建机器学习和推荐系统，或者使用相关性排名模型的全文搜索索引，或者执行图像分析，则很可能需要更一般的数据处理模型。这些类型的处理通常是特别针对特定应用的（例如机器学习的特征工程，机器翻译的自然语言模型，欺诈预测的风险评估函数），因此它们不可避免地需要编写代码，而不仅仅是查询。

MapReduce 使工程师能够轻松地在大型数据集上运行自己的代码。如果你有 HDFS 和 MapReduce，那么你 **可以** 在它之上建立一个 SQL 查询执行引擎，事实上这正是 Hive 项目所做的【31】。但是，你也可以编写许多其他形式的批处理，这些批处理不必非要用 SQL 查询表示。

随后，人们发现 MapReduce 对于某些类型的处理而言局限性很大，表现很差，因此在 Hadoop 之上其他各种处理模型也被开发出来（我们将在 “[MapReduce 之后](#MapReduce之后)” 中看到其中一些）。只有两种处理模型，SQL 和 MapReduce，还不够，需要更多不同的模型！而且由于 Hadoop 平台的开放性，实施一整套方法是可行的，而这在单体 MPP 数据库的范畴内是不可能的【58】。

至关重要的是，这些不同的处理模型都可以在共享的单个机器集群上运行，所有这些机器都可以访问分布式文件系统上的相同文件。在 Hadoop 方式中，不需要将数据导入到几个不同的专用系统中进行不同类型的处理：系统足够灵活，可以支持同一个集群内不同的工作负载。不需要移动数据，使得从数据中挖掘价值变得容易得多，也使采用新的处理模型容易的多。

Hadoop 生态系统包括随机访问的 OLTP 数据库，如 HBase（请参阅 “[SSTables 和 LSM 树](/v1/ch3#SSTables和LSM树)”）和 MPP 风格的分析型数据库，如 Impala 【41】。HBase 与 Impala 都不使用 MapReduce，但都使用 HDFS 进行存储。它们是迥异的数据访问与处理方法，但是它们可以共存，并被集成到同一个系统中。

#### 针对频繁故障设计

当比较 MapReduce 和 MPP 数据库时，还有两个设计思路上的差异很突出：处理故障的方式，以及使用内存与磁盘的方式。与在线系统相比，批处理对故障不太敏感，因为就算失败也不会立即影响到用户，而且它们总是能再次运行。

如果一个节点在执行查询时崩溃，大多数 MPP 数据库会中止整个查询，并让用户重新提交查询或自动重新运行它【3】。由于查询通常最多运行几秒钟或几分钟，所以这种错误处理的方法是可以接受的，因为重试的代价不是太大。MPP 数据库还倾向于在内存中保留尽可能多的数据（例如，使用散列连接）以避免从磁盘读取的开销。

另一方面，MapReduce 可以容忍单个 Map 或 Reduce 任务的失败，而不会影响作业的整体，通过以单个任务的粒度重试工作。它也会非常急切地将数据写入磁盘，一方面是为了容错，另一方面是因为假设数据集太大而不能适应内存。

MapReduce 方式更适用于较大的作业：要处理如此之多的数据并运行很长时间的作业，以至于在此过程中很可能至少遇到一个任务故障。在这种情况下，由于单个任务失败而重新运行整个作业将是非常浪费的。即使以单个任务的粒度进行恢复引入了使得无故障处理更慢的开销，但如果任务失败率足够高，这仍然是一种合理的权衡。

但是这些假设有多么现实呢？在大多数集群中，机器故障确实会发生，但是它们不是很频繁 —— 可能少到绝大多数作业都不会经历机器故障。为了容错，真的值得带来这么大的额外开销吗？

要了解 MapReduce 节约使用内存和在任务的层次进行恢复的原因，了解最初设计 MapReduce 的环境是很有帮助的。Google 有着混用的数据中心，在线生产服务和离线批处理作业在同样机器上运行。每个任务都有一个通过容器强制执行的资源配给（CPU 核心、RAM、磁盘空间等）。每个任务也具有优先级，如果优先级较高的任务需要更多的资源，则可以终止（抢占）同一台机器上较低优先级的任务以释放资源。优先级还决定了计算资源的定价：团队必须为他们使用的资源付费，而优先级更高的进程花费更多【59】。

这种架构允许非生产（低优先级）计算资源被 **过量使用（overcommitted）**，因为系统知道必要时它可以回收资源。与分离生产和非生产任务的系统相比，过量使用资源可以更好地利用机器并提高效率。但由于 MapReduce 作业以低优先级运行，它们随时都有被抢占的风险，因为优先级较高的进程可能需要其资源。在高优先级进程拿走所需资源后，批量作业能有效地 “捡面包屑”，利用剩下的任何计算资源。

在谷歌，运行一个小时的 MapReduce 任务大约有 5% 的风险被终止，以便给更高优先级的进程挪地方。这一概率比硬件问题、机器重启或其他原因的概率高了一个数量级【59】。按照这种抢占率，如果一个作业有 100 个任务，每个任务运行 10 分钟，那么至少有一个任务在完成之前被终止的风险大于 50%。

这就是 MapReduce 被设计为容忍频繁意外任务终止的原因：不是因为硬件很不可靠，而是因为任意终止进程的自由有利于提高计算集群中的资源利用率。

在开源的集群调度器中，抢占的使用较少。YARN 的 CapacityScheduler 支持抢占，以平衡不同队列的资源分配【58】，但在编写本文时，YARN，Mesos 或 Kubernetes 不支持通用的优先级抢占【60】。在任务不经常被终止的环境中，MapReduce 的这一设计决策就没有多少意义了。在下一节中，我们将研究一些与 MapReduce 设计决策相异的替代方案。


## MapReduce之后

虽然 MapReduce 在 2000 年代后期变得非常流行，并受到大量的炒作，但它只是分布式系统的许多可能的编程模型之一。对于不同的数据量，数据结构和处理类型，其他工具可能更适合表示计算。


不管如何，我们在这一章花了大把时间来讨论 MapReduce，因为它是一种有用的学习工具，它是建立在分布式文件系统之上的一种相当简单明晰的抽象。在这里，**简单** 意味着我们能理解它在做什么，而不是意味着使用它很简单。恰恰相反：使用原始的 MapReduce API 来实现复杂的处理工作实际上是非常困难和费力的 —— 例如，任意一种连接算法都需要你从头开始实现【37】。

针对直接使用 MapReduce 的困难，在 MapReduce 上有很多高级编程模型（Pig、Hive、Cascading、Crunch）被创造出来，作为建立在 MapReduce 之上的抽象。如果你了解 MapReduce 的原理，那么它们学起来相当简单。而且它们的高级结构能显著简化许多常见批处理任务的实现。

但是，MapReduce 执行模型本身也存在一些问题，这些问题并没有通过增加另一个抽象层次而解决，而对于某些类型的处理，它表现得非常差劲。一方面，MapReduce 非常稳健：你可以使用它在任务会频繁终止的多租户系统上处理几乎任意大量级的数据，并且仍然可以完成工作（虽然速度很慢）。另一方面，对于某些类型的处理而言，其他工具有时会快上几个数量级。

在本章的其余部分中，我们将介绍一些批处理方法。在 [第十一章](/v1/ch11) 我们将转向流处理，它可以看作是加速批处理的另一种方法。

### 物化中间状态

如前所述，每个 MapReduce 作业都独立于其他任何作业。作业与世界其他地方的主要连接点是分布式文件系统上的输入和输出目录。如果希望一个作业的输出成为第二个作业的输入，则需要将第二个作业的输入目录配置为第一个作业输出目录，且外部工作流调度程序必须在第一个作业完成后再启动第二个。

如果第一个作业的输出是要在组织内广泛发布的数据集，则这种配置是合理的。在这种情况下，你需要通过名称引用它，并将其重用为多个不同作业的输入（包括由其他团队开发的作业）。将数据发布到分布式文件系统中众所周知的位置能够带来 **松耦合**，这样作业就不需要知道是谁在提供输入或谁在消费输出（请参阅 “[逻辑与布线相分离](#逻辑与布线相分离)”）。

但在很多情况下，你知道一个作业的输出只能用作另一个作业的输入，这些作业由同一个团队维护。在这种情况下，分布式文件系统上的文件只是简单的 **中间状态（intermediate state）**：一种将数据从一个作业传递到下一个作业的方式。在一个用于构建推荐系统的，由 50 或 100 个 MapReduce 作业组成的复杂工作流中，存在着很多这样的中间状态【29】。

将这个中间状态写入文件的过程称为 **物化（materialization）**。（在 “[聚合：数据立方体和物化视图](/v1/ch3#聚合：数据立方体和物化视图)” 中已经在物化视图的背景中遇到过这个术语。它意味着对某个操作的结果立即求值并写出来，而不是在请求时按需计算）

作为对照，本章开头的日志分析示例使用 Unix 管道将一个命令的输出与另一个命令的输入连接起来。管道并没有完全物化中间状态，而是只使用一个小的内存缓冲区，将输出增量地 **流（stream）** 向输入。

与 Unix 管道相比，MapReduce 完全物化中间状态的方法存在不足之处：

- MapReduce 作业只有在前驱作业（生成其输入）中的所有任务都完成时才能启动，而由 Unix 管道连接的进程会同时启动，输出一旦生成就会被消费。不同机器上的数据偏斜或负载不均意味着一个作业往往会有一些掉队的任务，比其他任务要慢得多才能完成。必须等待至前驱作业的所有任务完成，拖慢了整个工作流程的执行。
- Mapper 通常是多余的：它们仅仅是读取刚刚由 Reducer 写入的同样文件，为下一个阶段的分区和排序做准备。在许多情况下，Mapper 代码可能是前驱 Reducer 的一部分：如果 Reducer 和 Mapper 的输出有着相同的分区与排序方式，那么 Reducer 就可以直接串在一起，而不用与 Mapper 相互交织。
- 将中间状态存储在分布式文件系统中意味着这些文件被复制到多个节点，对这些临时数据这么搞就比较过分了。

#### 数据流引擎

为了解决 MapReduce 的这些问题，几种用于分布式批处理的新执行引擎被开发出来，其中最著名的是 Spark 【61,62】，Tez 【63,64】和 Flink 【65,66】。它们的设计方式有很多区别，但有一个共同点：把整个工作流作为单个作业来处理，而不是把它分解为独立的子作业。

由于它们将工作流显式建模为数据从几个处理阶段穿过，所以这些系统被称为 **数据流引擎（dataflow engines）**。像 MapReduce 一样，它们在单个线程上通过反复调用用户定义的函数来一次处理一条记录，它们通过输入分区来并行化载荷，它们通过网络将一个函数的输出复制到另一个函数的输入。

与 MapReduce 不同，这些函数不需要严格扮演交织的 Map 与 Reduce 的角色，而是可以以更灵活的方式进行组合。我们称这些函数为 **算子（operators）**，数据流引擎提供了几种不同的选项来将一个算子的输出连接到另一个算子的输入：

- 一种选项是对记录按键重新分区并排序，就像在 MapReduce 的混洗阶段一样（请参阅 “[分布式执行 MapReduce](#分布式执行MapReduce)”）。这种功能可以用于实现排序合并连接和分组，就像在 MapReduce 中一样。
- 另一种可能是接受多个输入，并以相同的方式进行分区，但跳过排序。当记录的分区重要但顺序无关紧要时，这省去了分区散列连接的工作，因为构建散列表还是会把顺序随机打乱。
- 对于广播散列连接，可以将一个算子的输出，发送到连接算子的所有分区。

这种类型的处理引擎是基于像 Dryad【67】和 Nephele【68】这样的研究系统，与 MapReduce 模型相比，它有几个优点：

- 排序等昂贵的工作只需要在实际需要的地方执行，而不是默认地在每个 Map 和 Reduce 阶段之间出现。
- 没有不必要的 Map 任务，因为 Mapper 所做的工作通常可以合并到前面的 Reduce 算子中（因为 Mapper 不会更改数据集的分区）。
- 由于工作流中的所有连接和数据依赖都是显式声明的，因此调度程序能够总览全局，知道哪里需要哪些数据，因而能够利用局部性进行优化。例如，它可以尝试将消费某些数据的任务放在与生成这些数据的任务相同的机器上，从而数据可以通过共享内存缓冲区传输，而不必通过网络复制。
- 通常，算子间的中间状态足以保存在内存中或写入本地磁盘，这比写入 HDFS 需要更少的 I/O（必须将其复制到多台机器，并将每个副本写入磁盘）。MapReduce 已经对 Mapper 的输出做了这种优化，但数据流引擎将这种思想推广至所有的中间状态。
- 算子可以在输入就绪后立即开始执行；后续阶段无需等待前驱阶段整个完成后再开始。
- 与 MapReduce（为每个任务启动一个新的 JVM）相比，现有 Java 虚拟机（JVM）进程可以重用来运行新算子，从而减少启动开销。

你可以使用数据流引擎执行与 MapReduce 工作流同样的计算，而且由于此处所述的优化，通常执行速度要明显快得多。既然算子是 Map 和 Reduce 的泛化，那么相同的处理代码就可以在任一执行引擎上运行：Pig，Hive 或 Cascading 中实现的工作流无需修改代码，只需修改配置，就可以简单地从 MapReduce 切换到 Tez 或 Spark【64】。

Tez 是一个相当薄的库，它依赖于 YARN shuffle 服务来实现节点间数据的实际复制【58】，而 Spark 和 Flink 则是包含了独立网络通信层，调度器，及面向用户的 API 的大型框架。我们稍后将简要讨论这些高级 API。

#### 容错

完全物化中间状态至分布式文件系统的一个优点是，它具有持久性，这使得 MapReduce 中的容错相当容易：如果一个任务失败，它可以在另一台机器上重新启动，并从文件系统重新读取相同的输入。

Spark、Flink 和 Tez 避免将中间状态写入 HDFS，因此它们采取了不同的方法来容错：如果一台机器发生故障，并且该机器上的中间状态丢失，则它会从其他仍然可用的数据重新计算（在可行的情况下是先前的中间状态，要么就只能是原始输入数据，通常在 HDFS 上）。

为了实现这种重新计算，框架必须跟踪一个给定的数据是如何计算的 —— 使用了哪些输入分区？应用了哪些算子？ Spark 使用 **弹性分布式数据集（RDD，Resilient Distributed Dataset）** 的抽象来跟踪数据的谱系【61】，而 Flink 对算子状态存档，允许恢复运行在执行过程中遇到错误的算子【66】。

在重新计算数据时，重要的是要知道计算是否是 **确定性的**：也就是说，给定相同的输入数据，算子是否始终产生相同的输出？如果一些丢失的数据已经发送给下游算子，这个问题就很重要。如果算子重新启动，重新计算的数据与原有的丢失数据不一致，下游算子很难解决新旧数据之间的矛盾。对于不确定性算子来说，解决方案通常是杀死下游算子，然后再重跑新数据。

为了避免这种级联故障，最好让算子具有确定性。但需要注意的是，非确定性行为很容易悄悄溜进来：例如，许多编程语言在迭代哈希表的元素时不能对顺序作出保证，许多概率和统计算法显式依赖于使用随机数，以及用到系统时钟或外部数据源，这些都是不确定性的行为。为了能可靠地从故障中恢复，需要消除这种不确定性因素，例如使用固定的种子生成伪随机数。

通过重算数据来从故障中恢复并不总是正确的答案：如果中间状态数据要比源数据小得多，或者如果计算量非常大，那么将中间数据物化为文件可能要比重新计算廉价的多。

#### 关于物化的讨论

回到 Unix 的类比，我们看到，MapReduce 就像是将每个命令的输出写入临时文件，而数据流引擎看起来更像是 Unix 管道。尤其是 Flink 是基于管道执行的思想而建立的：也就是说，将算子的输出增量地传递给其他算子，不待输入完成便开始处理。

排序算子不可避免地需要消费全部的输入后才能生成任何输出，因为输入中最后一条输入记录可能具有最小的键，因此需要作为第一条记录输出。因此，任何需要排序的算子都需要至少暂时地累积状态。但是工作流的许多其他部分可以以流水线方式执行。

当作业完成时，它的输出需要持久化到某个地方，以便用户可以找到并使用它 —— 很可能它会再次写入分布式文件系统。因此，在使用数据流引擎时，HDFS 上的物化数据集通常仍是作业的输入和最终输出。和 MapReduce 一样，输入是不可变的，输出被完全替换。比起 MapReduce 的改进是，你不用再自己去将中间状态写入文件系统了。

### 图与迭代处理

在 “[图数据模型](/v1/ch2#图数据模型)” 中，我们讨论了使用图来建模数据，并使用图查询语言来遍历图中的边与点。[第二章](/v1/ch2) 的讨论集中在 OLTP 风格的应用场景：快速执行查询来查找少量符合特定条件的顶点。

批处理上下文中的图也很有趣，其目标是在整个图上执行某种离线处理或分析。这种需求经常出现在机器学习应用（如推荐引擎）或排序系统中。例如，最著名的图分析算法之一是 PageRank 【69】，它试图根据链接到某个网页的其他网页来估计该网页的流行度。它作为公式的一部分，用于确定网络搜索引擎呈现结果的顺序。

> 像 Spark、Flink 和 Tez 这样的数据流引擎（请参阅 “[物化中间状态](#物化中间状态)”）通常将算子作为 **有向无环图（DAG）** 的一部分安排在作业中。这与图处理不一样：在数据流引擎中，**从一个算子到另一个算子的数据流** 被构造成一个图，而数据本身通常由关系型元组构成。在图处理中，数据本身具有图的形式。又一个不幸的命名混乱！

许多图算法是通过一次遍历一条边来表示的，将一个顶点与近邻的顶点连接起来，以传播一些信息，并不断重复，直到满足一些条件为止 —— 例如，直到没有更多的边要跟进，或直到一些指标收敛。我们在 [图 2-6](/v1/ddia_0206.png) 中看到一个例子，它通过重复跟进标明地点归属关系的边，生成了数据库中北美包含的所有地点列表（这种算法被称为 **传递闭包**，即 transitive closure）。

可以在分布式文件系统中存储图（包含顶点和边的列表的文件），但是这种 “重复至完成” 的想法不能用普通的 MapReduce 来表示，因为它只扫过一趟数据。这种算法因此经常以 **迭代** 的风格实现：

1. 外部调度程序运行批处理来计算算法的一个步骤。
2. 当批处理过程完成时，调度器检查它是否完成（基于完成条件 —— 例如，没有更多的边要跟进，或者与上次迭代相比的变化低于某个阈值）。
3. 如果尚未完成，则调度程序返回到步骤 1 并运行另一轮批处理。

这种方法是有效的，但是用 MapReduce 实现它往往非常低效，因为 MapReduce 没有考虑算法的迭代性质：它总是读取整个输入数据集并产生一个全新的输出数据集，即使与上次迭代相比，改变的仅仅是图中的一小部分。

#### Pregel处理模型

针对图批处理的优化 —— **批量同步并行（BSP，Bulk Synchronous Parallel）** 计算模型【70】已经开始流行起来。其中，Apache Giraph 【37】，Spark 的 GraphX API 和 Flink 的 Gelly API 【71】实现了它。它也被称为 **Pregel** 模型，因为 Google 的 Pregel 论文推广了这种处理图的方法【72】。

回想一下在 MapReduce 中，Mapper 在概念上向 Reducer 的特定调用 “发送消息”，因为框架将所有具有相同键的 Mapper 输出集中在一起。Pregel 背后有一个类似的想法：一个顶点可以向另一个顶点 “发送消息”，通常这些消息是沿着图的边发送的。

在每次迭代中，为每个顶点调用一个函数，将所有发送给它的消息传递给它 —— 就像调用 Reducer 一样。与 MapReduce 的不同之处在于，在 Pregel 模型中，顶点在一次迭代到下一次迭代的过程中会记住它的状态，所以这个函数只需要处理新的传入消息。如果图的某个部分没有被发送消息，那里就不需要做任何工作。

这与 Actor 模型有些相似（请参阅 “[分布式的 Actor 框架](/v1/ch4#分布式的Actor框架)”），除了顶点状态和顶点之间的消息具有容错性和持久性，且通信以固定的回合进行：在每次迭代中，框架递送上次迭代中发送的所有消息。Actor 通常没有这样的时序保证。

#### 容错

顶点只能通过消息传递进行通信（而不是直接相互查询）的事实有助于提高 Pregel 作业的性能，因为消息可以成批处理，且等待通信的次数也减少了。唯一的等待是在迭代之间：由于 Pregel 模型保证所有在一轮迭代中发送的消息都在下轮迭代中送达，所以在下一轮迭代开始前，先前的迭代必须完全完成，而所有的消息必须在网络上完成复制。

即使底层网络可能丢失、重复或任意延迟消息（请参阅 “[不可靠的网络](/v1/ch8#不可靠的网络)”），Pregel 的实现能保证在后续迭代中消息在其目标顶点恰好处理一次。像 MapReduce 一样，框架能从故障中透明地恢复，以简化在 Pregel 上实现算法的编程模型。

这种容错是通过在迭代结束时，定期存档所有顶点的状态来实现的，即将其全部状态写入持久化存储。如果某个节点发生故障并且其内存中的状态丢失，则最简单的解决方法是将整个图计算回滚到上一个存档点，然后重启计算。如果算法是确定性的，且消息记录在日志中，那么也可以选择性地只恢复丢失的分区（就像之前讨论过的数据流引擎）【72】。

#### 并行执行

顶点不需要知道它在哪台物理机器上执行；当它向其他顶点发送消息时，它只是简单地将消息发往某个顶点 ID。图的分区取决于框架 —— 即，确定哪个顶点运行在哪台机器上，以及如何通过网络路由消息，以便它们到达正确的地方。

由于编程模型一次仅处理一个顶点（有时称为 “像顶点一样思考”），所以框架可以以任意方式对图分区。理想情况下如果顶点需要进行大量的通信，那么它们最好能被分区到同一台机器上。然而找到这样一种优化的分区方法是很困难的 —— 在实践中，图经常按照任意分配的顶点 ID 分区，而不会尝试将相关的顶点分组在一起。

因此，图算法通常会有很多跨机器通信的额外开销，而中间状态（节点之间发送的消息）往往比原始图大。通过网络发送消息的开销会显著拖慢分布式图算法的速度。

出于这个原因，如果你的图可以放入一台计算机的内存中，那么单机（甚至可能是单线程）算法很可能会超越分布式批处理【73,74】。图比内存大也没关系，只要能放入单台计算机的磁盘，使用 GraphChi 等框架进行单机处理就是一个可行的选择【75】。如果图太大，不适合单机处理，那么像 Pregel 这样的分布式方法是不可避免的。高效的并行图算法是一个进行中的研究领域【76】。


### 高级API和语言

自 MapReduce 开始流行的这几年以来，分布式批处理的执行引擎已经很成熟了。到目前为止，基础设施已经足够强大，能够存储和处理超过 10,000 台机器集群上的数 PB 的数据。由于在这种规模下物理执行批处理的问题已经被认为或多或少解决了，所以关注点已经转向其他领域：改进编程模型，提高处理效率，扩大这些技术可以解决的问题集。

如前所述，Hive、Pig、Cascading 和 Crunch 等高级语言和 API 变得越来越流行，因为手写 MapReduce 作业实在是个苦力活。随着 Tez 的出现，这些高级语言还有一个额外好处，可以迁移到新的数据流执行引擎，而无需重写作业代码。Spark 和 Flink 也有它们自己的高级数据流 API，通常是从 FlumeJava 中获取的灵感【34】。

这些数据流 API 通常使用关系型构建块来表达一个计算：按某个字段连接数据集；按键对元组做分组；按某些条件过滤；并通过计数求和或其他函数来聚合元组。在内部，这些操作是使用本章前面讨论过的各种连接和分组算法来实现的。

除了少写代码的明显优势之外，这些高级接口还支持交互式用法，在这种交互式使用中，你可以在 Shell 中增量式编写分析代码，频繁运行来观察它做了什么。这种开发风格在探索数据集和试验处理方法时非常有用。这也让人联想到 Unix 哲学，我们在 “[Unix 哲学](#Unix哲学)” 中讨论过这个问题。

此外，这些高级接口不仅提高了人类的工作效率，也提高了机器层面的作业执行效率。

#### 向声明式查询语言的转变

与硬写执行连接的代码相比，指定连接关系算子的优点是，框架可以分析连接输入的属性，并自动决定哪种上述连接算法最适合当前任务。Hive、Spark 和 Flink 都有基于代价的查询优化器可以做到这一点，甚至可以改变连接顺序，最小化中间状态的数量【66,77,78,79】。

连接算法的选择可以对批处理作业的性能产生巨大影响，而如果无需理解和记住本章中讨论的各种连接算法就能做出选择，那就再好不过了。如果连接是以 **声明式（declarative）** 的方式指定的，那这就是可行的：应用只是简单地说明哪些连接是必需的，查询优化器决定如何最好地执行连接。我们以前在 “[数据查询语言](/v1/ch2#数据查询语言)” 中见过这个想法。

但 MapReduce 及其数据流后继者在其他方面，与 SQL 的完全声明式查询模型有很大区别。MapReduce 是围绕着回调函数的概念建立的：对于每条记录或者一组记录，调用一个用户定义的函数（Mapper 或 Reducer），并且该函数可以自由地调用任意代码来决定输出什么。这种方法的优点是可以基于大量已有库的生态系统创作：解析、自然语言分析、图像分析以及运行数值或统计算法等。

自由运行任意代码，长期以来都是传统 MapReduce 批处理系统与 MPP 数据库的区别所在（请参阅 “[Hadoop 与分布式数据库的对比](#Hadoop与分布式数据库的对比)” 一节）。虽然数据库具有编写用户定义函数的功能，但是它们通常使用起来很麻烦，而且与大多数编程语言中广泛使用的程序包管理器和依赖管理系统兼容不佳（例如 Java 的 Maven、JavaScript 的 npm 以及 Ruby 的 RubyGems）。

然而数据流引擎已经发现，支持除连接之外的更多 **声明式特性** 还有其他的优势。例如，如果一个回调函数只包含一个简单的过滤条件，或者只是从一条记录中选择了一些字段，那么在为每条记录调用函数时会有相当大的额外 CPU 开销。如果以声明方式表示这些简单的过滤和映射操作，那么查询优化器可以利用列式存储布局（请参阅 “[列式存储](/v1/ch3#列式存储)”），只从磁盘读取所需的列。Hive、Spark DataFrames 和 Impala 还使用了向量化执行（请参阅 “[内存带宽和矢量化处理](/v1/ch3#内存带宽和矢量化处理)”）：在对 CPU 缓存友好的内部循环中迭代数据，避免函数调用。Spark 生成 JVM 字节码【79】，Impala 使用 LLVM 为这些内部循环生成本机代码【41】。

通过在高级 API 中引入声明式的部分，并使查询优化器可以在执行期间利用这些来做优化，批处理框架看起来越来越像 MPP 数据库了（并且能实现可与之媲美的性能）。同时，通过拥有运行任意代码和以任意格式读取数据的可扩展性，它们保持了灵活性的优势。

#### 专业化的不同领域

尽管能够运行任意代码的可扩展性是很有用的，但是也有很多常见的例子，不断重复着标准的处理模式。因而这些模式值得拥有自己的可重用通用构建模块实现。传统上，MPP 数据库满足了商业智能分析和业务报表的需求，但这只是许多使用批处理的领域之一。

另一个越来越重要的领域是统计和数值算法，它们是机器学习应用所需要的（例如分类器和推荐系统）。可重用的实现正在出现：例如，Mahout 在 MapReduce、Spark 和 Flink 之上实现了用于机器学习的各种算法，而 MADlib 在关系型 MPP 数据库（Apache HAWQ）中实现了类似的功能【54】。

空间算法也是有用的，例如 **k 近邻搜索（k-nearest neighbors, kNN）**【80】，它在一些多维空间中搜索与给定项最近的项目 —— 这是一种相似性搜索。近似搜索对于基因组分析算法也很重要，它们需要找到相似但不相同的字符串【81】。

批处理引擎正被用于分布式执行日益广泛的各领域算法。随着批处理系统获得各种内置功能以及高级声明式算子，且随着 MPP 数据库变得更加灵活和易于编程，两者开始看起来相似了：最终，它们都只是存储和处理数据的系统。


## 本章小结

在本章中，我们探索了批处理的主题。我们首先看到了诸如 awk、grep 和 sort 之类的 Unix 工具，然后我们看到了这些工具的设计理念是如何应用到 MapReduce 和更近的数据流引擎中的。一些设计原则包括：输入是不可变的，输出是为了作为另一个（仍未知的）程序的输入，而复杂的问题是通过编写 “做好一件事” 的小工具来解决的。

在 Unix 世界中，允许程序与程序组合的统一接口是文件与管道；在 MapReduce 中，该接口是一个分布式文件系统。我们看到数据流引擎添加了自己的管道式数据传输机制，以避免将中间状态物化至分布式文件系统，但作业的初始输入和最终输出通常仍是 HDFS。

分布式批处理框架需要解决的两个主要问题是：

分区
: 在 MapReduce 中，Mapper 根据输入文件块进行分区。Mapper 的输出被重新分区、排序并合并到可配置数量的 Reducer 分区中。这一过程的目的是把所有的 **相关** 数据（例如带有相同键的所有记录）都放在同一个地方。
  后 MapReduce 时代的数据流引擎若非必要会尽量避免排序，但它们也采取了大致类似的分区方法。

容错
: MapReduce 经常写入磁盘，这使得从单个失败的任务恢复很轻松，无需重新启动整个作业，但在无故障的情况下减慢了执行速度。数据流引擎更多地将中间状态保存在内存中，更少地物化中间状态，这意味着如果节点发生故障，则需要重算更多的数据。确定性算子减少了需要重算的数据量。


我们讨论了几种 MapReduce 的连接算法，其中大多数也在 MPP 数据库和数据流引擎内部使用。它们也很好地演示了分区算法是如何工作的：

排序合并连接
: 每个参与连接的输入都通过一个提取连接键的 Mapper。通过分区、排序和合并，具有相同键的所有记录最终都会进入相同的 Reducer 调用。这个函数能输出连接好的记录。

广播散列连接
: 两个连接输入之一很小，所以它并没有分区，而且能被完全加载进一个哈希表中。因此，你可以为连接输入大端的每个分区启动一个 Mapper，将输入小端的散列表加载到每个 Mapper 中，然后扫描大端，一次一条记录，并为每条记录查询散列表。

分区散列连接
: 如果两个连接输入以相同的方式分区（使用相同的键，相同的散列函数和相同数量的分区），则可以独立地对每个分区应用散列表方法。

分布式批处理引擎有一个刻意限制的编程模型：回调函数（比如 Mapper 和 Reducer）被假定是无状态的，而且除了指定的输出外，必须没有任何外部可见的副作用。这一限制允许框架在其抽象下隐藏一些困难的分布式系统问题：当遇到崩溃和网络问题时，任务可以安全地重试，任何失败任务的输出都被丢弃。如果某个分区的多个任务成功，则其中只有一个能使其输出实际可见。

得益于这个框架，你在批处理作业中的代码无需操心实现容错机制：框架可以保证作业的最终输出与没有发生错误的情况相同，虽然实际上也许不得不重试各种任务。比起在线服务一边处理用户请求一边将写入数据库作为处理请求的副作用，批处理提供的这种可靠性语义要强得多。

批处理作业的显著特点是，它读取一些输入数据并产生一些输出数据，但不修改输入 —— 换句话说，输出是从输入衍生出的。最关键的是，输入数据是 **有界的（bounded）**：它有一个已知的，固定的大小（例如，它由某个时间点的一组日志文件或数据库内容的快照组成）。因为它是有界的，一个作业知道自己什么时候完成了整个输入的读取，所以作业在处理完输入后，最终总是会结束的。

在下一章中，我们将转向流处理，其中的输入是 **无界的（unbounded）** —— 也就是说，你仍然有一个作业，但它的输入是永无止境的数据流。在这种情况下，作业永无完成之日，因为在任何时候都可能有更多的工作涌入。我们将看到，在某些方面上，流处理和批处理是相似的。但是关于无尽数据流的假设也对我们构建系统的方式产生了很多改变。


## 参考文献

1. Jeffrey Dean and Sanjay Ghemawat: “[MapReduce: Simplified Data Processing on Large Clusters](https://research.google/pubs/pub62/),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Joel Spolsky: “[The Perils of JavaSchools](https://www.joelonsoftware.com/2005/12/29/the-perils-of-javaschools-2/),” *joelonsoftware.com*, December 29, 2005.
1. Shivnath Babu and Herodotos Herodotou: “[Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf),” *Foundations and Trends in Databases*, volume 5, number 1, pages 1–104, November 2013. [doi:10.1561/1900000036](http://dx.doi.org/10.1561/1900000036)
1. David J. DeWitt and Michael Stonebraker: “[MapReduce: A Major Step Backwards](https://homes.cs.washington.edu/~billhowe/mapreduce_a_major_step_backwards.html),” originally published at *databasecolumn.vertica.com*, January 17, 2008.
1. Henry Robinson: “[The Elephant Was a Trojan Horse: On the Death of Map-Reduce at Google](https://www.the-paper-trail.org/post/2014-06-25-the-elephant-was-a-trojan-horse-on-the-death-of-map-reduce-at-google/),” *the-paper-trail.org*, June 25, 2014.
1. “[The Hollerith Machine](https://www.census.gov/history/www/innovations/technology/the_hollerith_tabulator.html),” United States Census Bureau, *census.gov*.
1. “[IBM 82, 83, and 84 Sorters Reference Manual](https://bitsavers.org/pdf/ibm/punchedCard/Sorter/A24-1034-1_82-83-84_sorters.pdf),” Edition A24-1034-1, International Business Machines Corporation, July 1962.
1. Adam Drake: “[Command-Line Tools Can Be 235x Faster than Your Hadoop Cluster](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html),” *aadrake.com*, January 25, 2014.
1. “[GNU Coreutils 8.23 Documentation](http://www.gnu.org/software/coreutils/manual/html_node/index.html),” Free Software Foundation, Inc., 2014.
1. Martin Kleppmann: “[Kafka, Samza, and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/2015/08/05/kafka-samza-unix-philosophy-distributed-data.html),” *martin.kleppmann.com*, August 5, 2015.
1. Doug McIlroy: [Internal Bell Labs memo](https://swtch.com/~rsc/thread/mdmpipe.pdf), October 1964. Cited in: Dennis M. Ritchie: “[Advice from Doug McIlroy](https://www.bell-labs.com/usr/dmr/www/mdmpipe.html),” *bell-labs.com*.
1. M. D. McIlroy, E. N. Pinson, and B. A. Tague: “[UNIX Time-Sharing System: Foreword](https://archive.org/details/bstj57-6-1899),” *The Bell System Technical Journal*, volume 57, number 6, pages 1899–1904, July 1978.
1. Eric S. Raymond: [*The Art of UNIX Programming*](http://www.catb.org/~esr/writings/taoup/html/). Addison-Wesley, 2003. ISBN: 978-0-13-142901-7
1. Ronald Duncan: “[Text File Formats – ASCII Delimited Text – Not CSV or TAB Delimited Text](https://ronaldduncan.wordpress.com/2009/10/31/text-file-formats-ascii-delimited-text-not-csv-or-tab-delimited-text/),” *ronaldduncan.wordpress.com*, October 31, 2009.
1. Alan Kay: “[Is 'Software Engineering' an Oxymoron?](http://tinlizzie.org/~takashi/IsSoftwareEngineeringAnOxymoron.pdf),” *tinlizzie.org*.
1. Martin Fowler: “[InversionOfControl](http://martinfowler.com/bliki/InversionOfControl.html),” *martinfowler.com*, June 26, 2005.
1. Daniel J. Bernstein: “[Two File Descriptors for Sockets](http://cr.yp.to/tcpip/twofd.html),” *cr.yp.to*.
1. Rob Pike and Dennis M. Ritchie: “[The Styx Architecture for Distributed Systems](http://doc.cat-v.org/inferno/4th_edition/styx),” *Bell Labs Technical Journal*, volume 4, number 2, pages 146–152, April 1999.
1. Sanjay Ghemawat, Howard Gobioff, and Shun-Tak Leung: “[The Google File System](http://research.google.com/archive/gfs-sosp2003.pdf),” at *19th ACM Symposium on Operating Systems Principles* (SOSP), October 2003. [doi:10.1145/945445.945450](http://dx.doi.org/10.1145/945445.945450)
1. Michael Ovsiannikov, Silvius Rus, Damian Reeves, et al.: “[The Quantcast File System](http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p808-ovsiannikov.pdf),” *Proceedings of the VLDB Endowment*, volume 6, number 11, pages 1092–1101, August 2013. [doi:10.14778/2536222.2536234](http://dx.doi.org/10.14778/2536222.2536234)
1. “[OpenStack Swift 2.6.1 Developer Documentation](http://docs.openstack.org/developer/swift/),” OpenStack Foundation, *docs.openstack.org*, March 2016.
1. Zhe Zhang, Andrew Wang, Kai Zheng, et al.: “[Introduction to HDFS Erasure Coding in Apache Hadoop](https://blog.cloudera.com/introduction-to-hdfs-erasure-coding-in-apache-hadoop/),” *blog.cloudera.com*, September 23, 2015.
1. Peter Cnudde: “[Hadoop Turns 10](https://web.archive.org/web/20190119112713/https://yahoohadoop.tumblr.com/post/138739227316/hadoop-turns-10),” *yahoohadoop.tumblr.com*, February 5, 2016.
1. Eric Baldeschwieler: “[Thinking About the HDFS vs. Other Storage Technologies](https://web.archive.org/web/20190529215115/http://hortonworks.com/blog/thinking-about-the-hdfs-vs-other-storage-technologies/),” *hortonworks.com*, July 25, 2012.
1. Brendan Gregg: “[Manta: Unix Meets Map Reduce](https://web.archive.org/web/20220125052545/http://dtrace.org/blogs/brendan/2013/06/25/manta-unix-meets-map-reduce/),” *dtrace.org*, June 25, 2013.
1. Tom White: *Hadoop: The Definitive Guide*, 4th edition. O'Reilly Media, 2015. ISBN: 978-1-491-90163-2
1. Jim N. Gray: “[Distributed Computing Economics](http://arxiv.org/pdf/cs/0403019.pdf),” Microsoft Research Tech Report MSR-TR-2003-24, March 2003.
1. Márton Trencséni: “[Luigi vs Airflow vs Pinball](http://bytepawn.com/luigi-airflow-pinball.html),” *bytepawn.com*, February 6, 2016.
1. Roshan Sumbaly, Jay Kreps, and Sam Shah: “[The 'Big Data' Ecosystem at LinkedIn](http://www.slideshare.net/s_shah/the-big-data-ecosystem-at-linkedin-23512853),” at *ACM International Conference on Management of Data* (SIGMOD), July 2013. [doi:10.1145/2463676.2463707](http://dx.doi.org/10.1145/2463676.2463707)
1. Alan F. Gates, Olga Natkovich, Shubham Chopra, et al.: “[Building a High-Level Dataflow System on Top of Map-Reduce: The Pig Experience](http://www.vldb.org/pvldb/vol2/vldb09-1074.pdf),” at *35th International Conference on Very Large Data Bases* (VLDB), August 2009.
1. Ashish Thusoo, Joydeep Sen Sarma, Namit Jain, et al.: “[Hive – A Petabyte Scale Data Warehouse Using Hadoop](http://i.stanford.edu/~ragho/hive-icde2010.pdf),” at *26th IEEE International Conference on Data Engineering* (ICDE), March 2010. [doi:10.1109/ICDE.2010.5447738](http://dx.doi.org/10.1109/ICDE.2010.5447738)
1. “[Cascading 3.0 User Guide](https://web.archive.org/web/20231206195311/http://docs.cascading.org/cascading/3.0/userguide/),” Concurrent, Inc., *docs.cascading.org*, January 2016.
1. “[Apache Crunch User Guide](https://crunch.apache.org/user-guide.html),” Apache Software Foundation, *crunch.apache.org*.
1. Craig Chambers, Ashish Raniwala, Frances Perry, et al.: “[FlumeJava: Easy, Efficient Data-Parallel Pipelines](https://research.google.com/pubs/archive/35650.pdf),” at *31st ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2010. [doi:10.1145/1806596.1806638](http://dx.doi.org/10.1145/1806596.1806638)
1. Jay Kreps: “[Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing),” *oreilly.com*, July 31, 2014.
1. Martin Kleppmann: “[Rethinking Caching in Web Apps](http://martin.kleppmann.com/2012/10/01/rethinking-caching-in-web-apps.html),” *martin.kleppmann.com*, October 1, 2012.
1. Mark Grover, Ted Malaska, Jonathan Seidman, and Gwen Shapira: *[Hadoop Application Architectures](http://shop.oreilly.com/product/0636920033196.do)*. O'Reilly Media, 2015. ISBN: 978-1-491-90004-8
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Sriranjan Manjunath: “[Skewed Join](https://web.archive.org/web/20151228114742/https://wiki.apache.org/pig/PigSkewedJoinSpec),” *wiki.apache.org*, 2009.
1. David J. DeWitt, Jeffrey F. Naughton, Donovan A. Schneider, and S. Seshadri: “[Practical Skew Handling in Parallel Joins](http://www.vldb.org/conf/1992/P027.PDF),” at *18th International Conference on Very Large Data Bases* (VLDB), August 1992.
1. Marcel Kornacker, Alexander Behm, Victor Bittorf, et al.: “[Impala: A Modern, Open-Source SQL Engine for Hadoop](http://pandis.net/resources/cidr15impala.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Matthieu Monsch: “[Open-Sourcing PalDB, a Lightweight Companion for Storing Side Data](https://engineering.linkedin.com/blog/2015/10/open-sourcing-paldb--a-lightweight-companion-for-storing-side-da),” *engineering.linkedin.com*, October 26, 2015.
1. Daniel Peng and Frank Dabek: “[Large-Scale Incremental Processing Using Distributed Transactions and Notifications](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Peng.pdf),” at *9th USENIX conference on Operating Systems Design and Implementation* (OSDI), October 2010.
1. “[Cloudera Search User Guide](http://www.cloudera.com/documentation/cdh/5-1-x/Search/Cloudera-Search-User-Guide/Cloudera-Search-User-Guide.html),” Cloudera, Inc., September 2015.
1. Lili Wu, Sam Shah, Sean Choi, et al.: “[The Browsemaps: Collaborative Filtering at LinkedIn](http://ceur-ws.org/Vol-1271/Paper3.pdf),” at *6th Workshop on Recommender Systems and the Social Web* (RSWeb), October 2014.
1. Roshan Sumbaly, Jay Kreps, Lei Gao, et al.: “[Serving Large-Scale Batch Computed Data with Project Voldemort](http://static.usenix.org/events/fast12/tech/full_papers/Sumbaly.pdf),” at *10th USENIX Conference on File and Storage Technologies* (FAST), February 2012.
1. Varun Sharma: “[Open-Sourcing Terrapin: A Serving System for Batch Generated Data](https://web.archive.org/web/20170215032514/https://engineering.pinterest.com/blog/open-sourcing-terrapin-serving-system-batch-generated-data-0),” *engineering.pinterest.com*, September 14, 2015.
1. Nathan Marz: “[ElephantDB](http://www.slideshare.net/nathanmarz/elephantdb),” *slideshare.net*, May 30, 2011.
1. Jean-Daniel (JD) Cryans: “[How-to: Use HBase Bulk Loading, and Why](https://blog.cloudera.com/how-to-use-hbase-bulk-loading-and-why/),” *blog.cloudera.com*, September 27, 2013.
1. Nathan Marz: “[How to Beat the CAP Theorem](http://nathanmarz.com/blog/how-to-beat-the-cap-theorem.html),” *nathanmarz.com*, October 13, 2011.
1. Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
1. David J. DeWitt and Jim N. Gray: “[Parallel Database Systems: The Future of High Performance Database Systems](http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/dewittgray92.pdf),” *Communications of the ACM*, volume 35, number 6, pages 85–98, June 1992. [doi:10.1145/129888.129894](http://dx.doi.org/10.1145/129888.129894)
1. Jay Kreps: “[But the multi-tenancy thing is actually really really hard](https://twitter.com/jaykreps/status/528235702480142336),” tweetstorm, *twitter.com*, October 31, 2014.
1. Jeffrey Cohen, Brian Dolan, Mark Dunlap, et al.: “[MAD Skills: New Analysis Practices for Big Data](http://www.vldb.org/pvldb/vol2/vldb09-219.pdf),” *Proceedings of the VLDB Endowment*, volume 2, number 2, pages 1481–1492, August 2009. [doi:10.14778/1687553.1687576](http://dx.doi.org/10.14778/1687553.1687576)
1. Ignacio Terrizzano, Peter Schwarz, Mary Roth, and John E. Colino: “[Data Wrangling: The Challenging Journey from the Wild to the Lake](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper2.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Paige Roberts: “[To Schema on Read or to Schema on Write, That Is the Hadoop Data Lake Question](https://web.archive.org/web/20171105001306/http://adaptivesystemsinc.com/blog/to-schema-on-read-or-to-schema-on-write-that-is-the-hadoop-data-lake-question/),” *adaptivesystemsinc.com*, July 2, 2015.
1. Bobby Johnson and Joseph Adler: “[The Sushi Principle: Raw Data Is Better](https://web.archive.org/web/20161126104941/https://conferences.oreilly.com/strata/big-data-conference-ca-2015/public/schedule/detail/38737),” at *Strata+Hadoop World*, February 2015.
1. Vinod Kumar Vavilapalli, Arun C. Murthy, Chris Douglas, et al.: “[Apache Hadoop YARN: Yet Another Resource Negotiator](https://www.cs.cmu.edu/~garth/15719/papers/yarn.pdf),” at *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523633](http://dx.doi.org/10.1145/2523616.2523633)
1. Abhishek Verma, Luis Pedrosa, Madhukar Korupolu, et al.: “[Large-Scale Cluster Management at Google with Borg](http://research.google.com/pubs/pub43438.html),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741964](http://dx.doi.org/10.1145/2741948.2741964)
1. Malte Schwarzkopf: “[The Evolution of Cluster Scheduler Architectures](https://web.archive.org/web/20201109052657/http://www.firmament.io/blog/scheduler-architectures.html),” *firmament.io*, March 9, 2016.
1. Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, et al.: “[Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf),” at *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
1. Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia: *Learning Spark*. O'Reilly Media, 2015. ISBN: 978-1-449-35904-1
1. Bikas Saha and Hitesh Shah: “[Apache Tez: Accelerating Hadoop Query Processing](http://www.slideshare.net/Hadoop_Summit/w-1205phall1saha),” at *Hadoop Summit*, June 2014.
1. Bikas Saha, Hitesh Shah, Siddharth Seth, et al.: “[Apache Tez: A Unifying Framework for Modeling and Building Data Processing Applications](http://home.cse.ust.hk/~weiwa/teaching/Fall15-COMP6611B/reading_list/Tez.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742790](http://dx.doi.org/10.1145/2723372.2742790)
1. Kostas Tzoumas: “[Apache Flink: API, Runtime, and Project Roadmap](http://www.slideshare.net/KostasTzoumas/apache-flink-api-runtime-and-project-roadmap),” *slideshare.net*, January 14, 2015.
1. Alexander Alexandrov, Rico Bergmann, Stephan Ewen, et al.: “[The Stratosphere Platform for Big Data Analytics](https://ssc.io/pdf/2014-VLDBJ_Stratosphere_Overview.pdf),” *The VLDB Journal*, volume 23, number 6, pages 939–964, May 2014. [doi:10.1007/s00778-014-0357-y](http://dx.doi.org/10.1007/s00778-014-0357-y)
1. Michael Isard, Mihai Budiu, Yuan Yu, et al.: “[Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks](https://www.microsoft.com/en-us/research/publication/dryad-distributed-data-parallel-programs-from-sequential-building-blocks/),” at *European Conference on Computer Systems* (EuroSys), March 2007. [doi:10.1145/1272996.1273005](http://dx.doi.org/10.1145/1272996.1273005)
1. Daniel Warneke and Odej Kao: “[Nephele: Efficient Parallel Data Processing in the Cloud](https://stratosphere2.dima.tu-berlin.de/assets/papers/Nephele_09.pdf),” at *2nd Workshop on Many-Task Computing on Grids and Supercomputers* (MTAGS), November 2009. [doi:10.1145/1646468.1646476](http://dx.doi.org/10.1145/1646468.1646476)
1. Lawrence Page, Sergey Brin, Rajeev Motwani, and Terry Winograd: “[The PageRank Citation Ranking: Bringing Order to the Web](https://web.archive.org/web/20230219170930/http://ilpubs.stanford.edu:8090/422/),” Stanford InfoLab Technical Report 422, 1999.
1. Leslie G. Valiant: “[A Bridging Model for Parallel Computation](http://dl.acm.org/citation.cfm?id=79181),” *Communications of the ACM*, volume 33, number 8, pages 103–111, August 1990. [doi:10.1145/79173.79181](http://dx.doi.org/10.1145/79173.79181)
1. Stephan Ewen, Kostas Tzoumas, Moritz Kaufmann, and Volker Markl: “[Spinning Fast Iterative Data Flows](http://vldb.org/pvldb/vol5/p1268_stephanewen_vldb2012.pdf),” *Proceedings of the VLDB Endowment*, volume 5, number 11, pages 1268-1279, July 2012. [doi:10.14778/2350229.2350245](http://dx.doi.org/10.14778/2350229.2350245)
1. Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, et al.: “[Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2010. [doi:10.1145/1807167.1807184](http://dx.doi.org/10.1145/1807167.1807184)
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Ionel Gog, Malte Schwarzkopf, Natacha Crooks, et al.: “[Musketeer: All for One, One for All in Data Processing Systems](http://www.cl.cam.ac.uk/research/srg/netos/camsas/pubs/eurosys15-musketeer.pdf),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741968](http://dx.doi.org/10.1145/2741948.2741968)
1. Aapo Kyrola, Guy Blelloch, and Carlos Guestrin: “[GraphChi: Large-Scale Graph Computation on Just a PC](https://www.usenix.org/system/files/conference/osdi12/osdi12-final-126.pdf),” at *10th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2012.
1. Andrew Lenharth, Donald Nguyen, and Keshav Pingali: “[Parallel Graph Analytics](http://cacm.acm.org/magazines/2016/5/201591-parallel-graph-analytics/fulltext),” *Communications of the ACM*, volume 59, number 5, pages 78–87, May 2016. [doi:10.1145/2901919](http://dx.doi.org/10.1145/2901919)
1. Fabian Hüske: “[Peeking into Apache Flink's Engine Room](http://flink.apache.org/news/2015/03/13/peeking-into-Apache-Flinks-Engine-Room.html),” *flink.apache.org*, March 13, 2015.
1. Mostafa Mokhtar: “[Hive 0.14 Cost Based Optimizer (CBO) Technical Overview](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/),” *hortonworks.com*, March 2, 2015.
1. Michael Armbrust, Reynold S Xin, Cheng Lian, et al.: “[Spark SQL: Relational Data Processing in Spark](http://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742797](http://dx.doi.org/10.1145/2723372.2742797)
1. Daniel Blazevski: “[Planting Quadtrees for Apache Flink](https://blog.insightdatascience.com/planting-quadtrees-for-apache-flink-b396ebc80d35),” *insightdataengineering.com*, March 25, 2016.
1. Tom White: “[Genome Analysis Toolkit: Now Using Apache Spark for Data Processing](https://web.archive.org/web/20190215132904/http://blog.cloudera.com/blog/2016/04/genome-analysis-toolkit-now-using-apache-spark-for-data-processing/),” *blog.cloudera.com*, April 6, 2016.


================================================
FILE: content/v1/ch11.md
================================================
---
title: "第十一章：流处理"
linkTitle: "11. 流处理"
weight: 311
math: true
breadcrumbs: false
---


![](/map/ch11.png)

> 有效的复杂系统总是从简单的系统演化而来。反之亦然：从零设计的复杂系统没一个能有效工作的。
>
> —— 约翰・加尔，Systemantics（1975）


在 [第十章](/v1/ch10) 中，我们讨论了批处理技术，它读取一组文件作为输入，并生成一组新的文件作为输出。输出是 **衍生数据（derived data）** 的一种形式；也就是说，如果需要，可以通过再次运行批处理过程来重新创建数据集。我们看到了如何使用这个简单而强大的想法来建立搜索索引、推荐系统、做分析等等。

然而，在 [第十章](/v1/ch10) 中仍然有一个很大的假设：即输入是有界的，即已知和有限的大小，所以批处理知道它何时完成输入的读取。例如，MapReduce 核心的排序操作必须读取其全部输入，然后才能开始生成输出：有可能最后一条输入记录恰好具有最小的键，需要第一个被输出，所以提早开始输出是不可行的。

实际上，很多数据是 **无界限** 的，因为它随着时间的推移而逐渐到达：你的用户在昨天和今天产生了数据，明天他们将继续产生更多的数据。除非你停业，否则这个过程永远都不会结束，所以数据集从来就不会以任何有意义的方式 “完成”【1】。因此，批处理程序必须将数据人为地分成固定时间段的数据块，例如，在每天结束时处理一天的数据，或者在每小时结束时处理一小时的数据。

日常批处理中的问题是，输入的变更只会在一天之后的输出中反映出来，这对于许多急躁的用户来说太慢了。为了减少延迟，我们可以更频繁地运行处理 ——  比如说，在每秒钟的末尾 —— 或者甚至更连续一些，完全抛开固定的时间切片，当事件发生时就立即进行处理，这就是 **流处理（stream processing）** 背后的想法。

一般来说，“流” 是指随着时间的推移逐渐可用的数据。这个概念出现在很多地方：Unix 的 stdin 和 stdout、编程语言（惰性列表）【2】、文件系统 API（如 Java 的 `FileInputStream`）、TCP 连接、通过互联网传送音频和视频等等。

在本章中，我们将把 **事件流（event stream）** 视为一种数据管理机制：无界限，增量处理，与上一章中的批量数据相对应。我们将首先讨论怎样表示、存储、通过网络传输流。在 “[数据库与流](#数据库与流)” 中，我们将研究流和数据库之间的关系。最后在 “[流处理](#流处理)” 中，我们将研究连续处理这些流的方法和工具，以及它们用于应用构建的方式。


## 传递事件流

在批处理领域，作业的输入和输出是文件（也许在分布式文件系统上）。流处理领域中的等价物看上去是什么样子的？

当输入是一个文件（一个字节序列），第一个处理步骤通常是将其解析为一系列记录。在流处理的上下文中，记录通常被叫做 **事件（event）** ，但它本质上是一样的：一个小的、自包含的、不可变的对象，包含某个时间点发生的某件事情的细节。一个事件通常包含一个来自日历时钟的时间戳，以指明事件发生的时间（请参阅 “[单调钟与日历时钟](/v1/ch8#单调钟与日历时钟)”）。

例如，发生的事件可能是用户采取的行动，例如查看页面或进行购买。它也可能来源于机器，例如对温度传感器或 CPU 利用率的周期性测量。在 “[使用 Unix 工具的批处理](/v1/ch10#使用Unix工具的批处理)” 的示例中，Web 服务器日志的每一行都是一个事件。

事件可能被编码为文本字符串或 JSON，或者某种二进制编码，如 [第四章](/v1/ch4) 所述。这种编码允许你存储一个事件，例如将其追加到一个文件，将其插入关系表，或将其写入文档数据库。它还允许你通过网络将事件发送到另一个节点以进行处理。

在批处理中，文件被写入一次，然后可能被多个作业读取。类似地，在流处理术语中，一个事件由 **生产者（producer）** （也称为 **发布者（publisher）** 或 **发送者（sender）** ）生成一次，然后可能由多个 **消费者（consumer）** （ **订阅者（subscribers）** 或 **接收者（recipients）** ）进行处理【3】。在文件系统中，文件名标识一组相关记录；在流式系统中，相关的事件通常被聚合为一个 **主题（topic）** 或 **流（stream）** 。

原则上讲，文件或数据库就足以连接生产者和消费者：生产者将其生成的每个事件写入数据存储，且每个消费者定期轮询数据存储，检查自上次运行以来新出现的事件。这实际上正是批处理在每天结束时处理当天数据时所做的事情。

但当我们想要进行低延迟的连续处理时，如果数据存储不是为这种用途专门设计的，那么轮询开销就会很大。轮询的越频繁，能返回新事件的请求比例就越低，而额外开销也就越高。相比之下，最好能在新事件出现时直接通知消费者。

数据库在传统上对这种通知机制支持的并不好，关系型数据库通常有 **触发器（trigger）** ，它们可以对变化（如，插入表中的一行）作出反应，但是它们的功能非常有限，并且在数据库设计中有些后顾之忧【4,5】。相应的是，已经开发了专门的工具来提供事件通知。


### 消息传递系统

向消费者通知新事件的常用方式是使用 **消息传递系统（messaging system）**：生产者发送包含事件的消息，然后将消息推送给消费者。我们之前在 “[消息传递中的数据流](/v1/ch4#消息传递中的数据流)” 中谈到了这些系统，但现在我们将详细介绍这些系统。

像生产者和消费者之间的 Unix 管道或 TCP 连接这样的直接信道，是实现消息传递系统的简单方法。但是，大多数消息传递系统都在这一基本模型上进行了扩展。特别的是，Unix 管道和 TCP 将恰好一个发送者与恰好一个接收者连接，而一个消息传递系统允许多个生产者节点将消息发送到同一个主题，并允许多个消费者节点接收主题中的消息。

在这个 **发布 / 订阅** 模式中，不同的系统采取各种各样的方法，并没有针对所有目的的通用答案。为了区分这些系统，问一下这两个问题会特别有帮助：

1. **如果生产者发送消息的速度比消费者能够处理的速度快会发生什么？** 一般来说，有三种选择：系统可以丢掉消息，将消息放入缓冲队列，或使用 **背压**（backpressure，也称为 **流量控制**，即 flow control：阻塞生产者，以免其发送更多的消息）。例如 Unix 管道和 TCP 就使用了背压：它们有一个固定大小的小缓冲区，如果填满，发送者会被阻塞，直到接收者从缓冲区中取出数据（请参阅 “[网络拥塞和排队](/v1/ch8#网络拥塞和排队)”）。

   如果消息被缓存在队列中，那么理解队列增长会发生什么是很重要的。当队列装不进内存时系统会崩溃吗？还是将消息写入磁盘？如果是这样，磁盘访问又会如何影响消息传递系统的性能【6】？

2. **如果节点崩溃或暂时脱机，会发生什么情况？ —— 是否会有消息丢失？** 与数据库一样，持久性可能需要写入磁盘和 / 或复制的某种组合（请参阅 “[复制与持久性](/v1/ch7#复制与持久性)”），这是有代价的。如果你能接受有时消息会丢失，则可能在同一硬件上获得更高的吞吐量和更低的延迟。

是否可以接受消息丢失取决于应用。例如，对于周期传输的传感器读数和指标，偶尔丢失的数据点可能并不重要，因为更新的值会在短时间内发出。但要注意，如果大量的消息被丢弃，可能无法立刻意识到指标已经不正确了【7】。如果你正在对事件计数，那么它们能够可靠送达是更重要的，因为每个丢失的消息都意味着使计数器的错误扩大。

我们在 [第十章](/v1/ch10) 中探讨的批处理系统的一个很好的特性是，它们提供了强大的可靠性保证：失败的任务会自动重试，失败任务的部分输出会自动丢弃。这意味着输出与没有发生故障一样，这有助于简化编程模型。在本章的后面，我们将研究如何在流处理的上下文中提供类似的保证。

#### 直接从生产者传递给消费者

许多消息传递系统使用生产者和消费者之间的直接网络通信，而不通过中间节点：

* UDP 组播广泛应用于金融行业，例如股票市场，其中低时延非常重要【8】。虽然 UDP 本身是不可靠的，但应用层的协议可以恢复丢失的数据包（生产者必须记住它发送的数据包，以便能按需重新发送数据包）。
* 无代理的消息库，如 ZeroMQ 【9】和 nanomsg 采取类似的方法，通过 TCP 或 IP 多播实现发布 / 订阅消息传递。
* StatsD 【10】和 Brubeck 【7】使用不可靠的 UDP 消息传递来收集网络中所有机器的指标并对其进行监控。（在 StatsD 协议中，只有接收到所有消息，才认为计数器指标是正确的；使用 UDP 将使得指标处在一种最佳近似状态【11】。另请参阅 “[TCP 与 UDP](/v1/ch8#TCP与UDP)”）
* 如果消费者在网络上公开了服务，生产者可以直接发送 HTTP 或 RPC 请求（请参阅 “[服务中的数据流：REST 与 RPC](/v1/ch4#服务中的数据流：REST与RPC)”）将消息推送给使用者。这就是 webhooks 背后的想法【12】，一种服务的回调 URL 被注册到另一个服务中，并且每当事件发生时都会向该 URL 发出请求。

尽管这些直接消息传递系统在设计它们的环境中运行良好，但是它们通常要求应用代码意识到消息丢失的可能性。它们的容错程度极为有限：即使协议检测到并重传在网络中丢失的数据包，它们通常也只是假设生产者和消费者始终在线。

如果消费者处于脱机状态，则可能会丢失其不可达时发送的消息。一些协议允许生产者重试失败的消息传递，但当生产者崩溃时，它可能会丢失消息缓冲区及其本应发送的消息，这种方法可能就没用了。

#### 消息代理

一种广泛使用的替代方法是通过 **消息代理**（message broker，也称为 **消息队列**，即 message queue）发送消息，消息代理实质上是一种针对处理消息流而优化的数据库。它作为服务器运行，生产者和消费者作为客户端连接到服务器。生产者将消息写入代理，消费者通过从代理那里读取来接收消息。

通过将数据集中在代理上，这些系统可以更容易地容忍来来去去的客户端（连接，断开连接和崩溃），而持久性问题则转移到代理的身上。一些消息代理只将消息保存在内存中，而另一些消息代理（取决于配置）将其写入磁盘，以便在代理崩溃的情况下不会丢失。针对缓慢的消费者，它们通常会允许无上限的排队（而不是丢弃消息或背压），尽管这种选择也可能取决于配置。

排队的结果是，消费者通常是 **异步（asynchronous）** 的：当生产者发送消息时，通常只会等待代理确认消息已经被缓存，而不等待消息被消费者处理。向消费者递送消息将发生在未来某个未定的时间点 —— 通常在几分之一秒之内，但有时当消息堆积时会显著延迟。

#### 消息代理与数据库的对比

有些消息代理甚至可以使用 XA 或 JTA 参与两阶段提交协议（请参阅 “[实践中的分布式事务](/v1/ch9#实践中的分布式事务)”）。这个功能与数据库在本质上非常相似，尽管消息代理和数据库之间仍存在实践上很重要的差异：

* 数据库通常保留数据直至显式删除，而大多数消息代理在消息成功递送给消费者时会自动删除消息。这样的消息代理不适合长期的数据存储。
* 由于它们很快就能删除消息，大多数消息代理都认为它们的工作集相当小 —— 即队列很短。如果代理需要缓冲很多消息，比如因为消费者速度较慢（如果内存装不下消息，可能会溢出到磁盘），每个消息需要更长的处理时间，整体吞吐量可能会恶化【6】。
* 数据库通常支持次级索引和各种搜索数据的方式，而消息代理通常支持按照某种模式匹配主题，订阅其子集。虽然机制并不一样，但对于客户端选择想要了解的数据的一部分，都是基本的方式。
* 查询数据库时，结果通常基于某个时间点的数据快照；如果另一个客户端随后向数据库写入一些改变了查询结果的内容，则第一个客户端不会发现其先前结果现已过期（除非它重复查询或轮询变更）。相比之下，消息代理不支持任意查询，但是当数据发生变化时（即新消息可用时），它们会通知客户端。

这是关于消息代理的传统观点，它被封装在诸如 JMS 【14】和 AMQP 【15】的标准中，并且被诸如 RabbitMQ、ActiveMQ、HornetQ、Qpid、TIBCO 企业消息服务、IBM MQ、Azure Service Bus 和 Google Cloud Pub/Sub 所实现 【16】。

#### 多个消费者

当多个消费者从同一主题中读取消息时，有两种主要的消息传递模式，如 [图 11-1](/v1/ddia_1101.png) 所示：

负载均衡（load balancing）
: 每条消息都被传递给消费者 **之一**，所以处理该主题下消息的工作能被多个消费者共享。代理可以为消费者任意分配消息。当处理消息的代价高昂，希望能并行处理消息时，此模式非常有用（在 AMQP 中，可以通过让多个客户端从同一个队列中消费来实现负载均衡，而在 JMS 中则称之为 **共享订阅**，即 shared subscription）。

扇出（fan-out）
: 每条消息都被传递给 **所有** 消费者。扇出允许几个独立的消费者各自 “收听” 相同的消息广播，而不会相互影响 —— 这个流处理中的概念对应批处理中多个不同批处理作业读取同一份输入文件（JMS 中的主题订阅与 AMQP 中的交换器绑定（exchange binding）提供了这一功能）。

![](/v1/ddia_1101.png)

**图 11-1 （a）负载平衡：在消费者间共享消费主题；（b）扇出：将每条消息传递给多个消费者。**

两种模式可以组合使用：例如，两个独立的消费者组可以每组各订阅同一个主题，每一组都共同收到所有消息，但在每一组内部，每条消息仅由单个节点处理。

#### 确认与重新传递

消费者随时可能会崩溃，所以有一种可能的情况是：代理向消费者递送消息，但消费者没有处理，或者在消费者崩溃之前只进行了部分处理。为了确保消息不会丢失，消息代理使用 **确认（acknowledgments）**：客户端必须显式告知代理消息处理完毕的时间，以便代理能将消息从队列中移除。

如果与客户端的连接关闭，或者代理超出一段时间未收到确认，代理则认为消息没有被处理，因此它将消息再递送给另一个消费者。（请注意可能发生这样的情况，消息 **实际上是** 处理完毕的，但 **确认** 在网络中丢失了。需要一种原子提交协议才能处理这种情况，正如在 “[实践中的分布式事务](/v1/ch9#实践中的分布式事务)” 中所讨论的那样）

当与负载均衡相结合时，这种重传行为对消息的顺序有种有趣的影响。在 [图 11-2](/v1/ddia_1102.png) 中，消费者通常按照生产者发送的顺序处理消息。然而消费者 2 在处理消息 m3 时崩溃，与此同时消费者 1 正在处理消息 m4。未确认的消息 m3 随后被重新发送给消费者 1，结果消费者 1 按照 m4，m3，m5 的顺序处理消息。因此 m3 和 m4 的交付顺序与生产者 1 的发送顺序不同。

![](/v1/ddia_1102.png)

**图 11-2 在处理 m3 时消费者 2 崩溃，因此稍后重传至消费者 1**

即使消息代理试图保留消息的顺序（如 JMS 和 AMQP 标准所要求的），负载均衡与重传的组合也不可避免地导致消息被重新排序。为避免此问题，你可以让每个消费者使用单独的队列（即不使用负载均衡功能）。如果消息是完全独立的，则消息顺序重排并不是一个问题。但正如我们将在本章后续部分所述，如果消息之间存在因果依赖关系，这就是一个很重要的问题。

### 分区日志

通过网络发送数据包或向网络服务发送请求通常是短暂的操作，不会留下永久的痕迹。尽管可以永久记录（通过抓包与日志），但我们通常不这么做。即使是将消息持久地写入磁盘的消息代理，在送达给消费者之后也会很快删除消息，因为它们建立在短暂消息传递的思维方式上。

数据库和文件系统采用截然相反的方法论：至少在某人显式删除前，通常写入数据库或文件的所有内容都要被永久记录下来。

这种思维方式上的差异对创建衍生数据的方式有巨大影响。如 [第十章](/v1/ch10) 所述，批处理过程的一个关键特性是，你可以反复运行它们，试验处理步骤，不用担心损坏输入（因为输入是只读的）。而 AMQP/JMS 风格的消息传递并非如此：收到消息是具有破坏性的，因为确认可能导致消息从代理中被删除，因此你不能期望再次运行同一个消费者能得到相同的结果。

如果你将新的消费者添加到消息传递系统，通常只能接收到消费者注册之后开始发送的消息。先前的任何消息都随风而逝，一去不复返。作为对比，你可以随时为文件和数据库添加新的客户端，且能读取任意久远的数据（只要应用没有显式覆盖或删除这些数据）。

为什么我们不能把它俩杂交一下，既有数据库的持久存储方式，又有消息传递的低延迟通知？这就是 **基于日志的消息代理（log-based message brokers）** 背后的想法。

#### 使用日志进行消息存储

日志只是磁盘上简单的仅追加记录序列。我们先前在 [第三章](/v1/ch3) 中日志结构存储引擎和预写式日志的上下文中讨论了日志，在 [第五章](/v1/ch5) 复制的上下文里也讨论了它。

同样的结构可以用于实现消息代理：生产者通过将消息追加到日志末尾来发送消息，而消费者通过依次读取日志来接收消息。如果消费者读到日志末尾，则会等待新消息追加的通知。Unix 工具 `tail -f` 能监视文件被追加写入的数据，基本上就是这样工作的。

为了伸缩超出单个磁盘所能提供的更高吞吐量，可以对日志进行 **分区**（按 [第六章](/v1/ch6) 的定义）。不同的分区可以托管在不同的机器上，使得每个分区都有一份能独立于其他分区进行读写的日志。一个主题可以定义为一组携带相同类型消息的分区。这种方法如 [图 11-3](/v1/ddia_1103.png) 所示。

在每个分区内，代理为每个消息分配一个单调递增的序列号或 **偏移量**（offset，在 [图 11-3](/v1/ddia_1103.png) 中，框中的数字是消息偏移量）。这种序列号是有意义的，因为分区是仅追加写入的，所以分区内的消息是完全有序的。没有跨不同分区的顺序保证。

![](/v1/ddia_1103.png)

**图 11-3 生产者通过将消息追加写入主题分区文件来发送消息，消费者依次读取这些文件**

Apache Kafka 【17,18】、Amazon Kinesis Streams 【19】和 Twitter 的 DistributedLog 【20,21】都是基于日志的消息代理。Google Cloud Pub/Sub 在架构上类似，但对外暴露的是 JMS 风格的 API，而不是日志抽象【16】。尽管这些消息代理将所有消息写入磁盘，但通过跨多台机器分区，每秒能够实现数百万条消息的吞吐量，并通过复制消息来实现容错性【22,23】。

#### 日志与传统的消息传递相比

基于日志的方法天然支持扇出式消息传递，因为多个消费者可以独立读取日志，而不会相互影响 —— 读取消息不会将其从日志中删除。为了在一组消费者之间实现负载平衡，代理可以将整个分区分配给消费者组中的节点，而不是将单条消息分配给消费者客户端。

然后每个客户端将消费被指派分区中的 **所有** 消息。通常情况下，当一个消费者被指派了一个日志分区时，它会以简单的单线程方式顺序地读取分区中的消息。这种粗粒度的负载均衡方法有一些缺点：

* 共享消费主题工作的节点数，最多为该主题中的日志分区数，因为同一个分区内的所有消息被递送到同一个节点 [^i]。
* 如果某条消息处理缓慢，则它会阻塞该分区中后续消息的处理（一种行首阻塞的形式；请参阅 “[描述性能](/v1/ch1#描述性能)”）。

因此在消息处理代价高昂，希望逐条并行处理，以及消息的顺序并没有那么重要的情况下，JMS/AMQP 风格的消息代理是可取的。另一方面，在消息吞吐量很高，处理迅速，顺序很重要的情况下，基于日志的方法表现得非常好。

[^i]: 要设计一种负载均衡方案也是有可能的，在这种方案中，两个消费者通过读取全部消息来共享分区处理的工作，但是其中一个只考虑具有偶数偏移量的消息，而另一个消费者只处理奇数编号的偏移量。或者你可以将消息摊到一个线程池中来处理，但这种方法会使消费者偏移量管理变得复杂。一般来说，单线程处理单分区是合适的，可以通过增加更多分区来提高并行度。

#### 消费者偏移量

顺序消费一个分区使得判断消息是否已经被处理变得相当容易：所有偏移量小于消费者的当前偏移量的消息已经被处理，而具有更大偏移量的消息还没有被看到。因此，代理不需要跟踪确认每条消息，只需要定期记录消费者的偏移即可。这种方法减少了额外簿记开销，而且为批量处理（batching）和流水线化（pipelining）提供了机会，有助于提高基于日志的系统的吞吐量。

实际上，这种偏移量与单领导者数据库复制中常见的日志序列号非常相似，我们在 “[设置新从库](/v1/ch5#设置新从库)” 中讨论了这种情况。在数据库复制中，日志序列号允许跟随者断开连接后，重新连接到领导者，并在不跳过任何写入的情况下恢复复制。这里原理完全相同：消息代理表现得像一个主库，而消费者就像一个从库。

如果消费者节点失效，则失效消费者的分区将指派给其他节点，并从最后记录的偏移量开始消费消息。如果消费者已经处理了后续的消息，但还没有记录它们的偏移量，那么重启后这些消息将被处理两次。我们将在本章后面讨论这个问题的处理方法。

#### 磁盘空间使用

如果只追加写入日志，则磁盘空间终究会耗尽。为了回收磁盘空间，日志实际上被分割成段，并不时地将旧段删除或移动到归档存储。（我们将在后面讨论一种更为复杂的磁盘空间释放方式）

这就意味着如果一个慢消费者跟不上消息产生的速率而落后得太多，它的消费偏移量指向了删除的段，那么它就会错过一些消息。实际上，日志实现了一个有限大小的缓冲区，当缓冲区填满时会丢弃旧消息，它也被称为 **循环缓冲区（circular buffer）** 或 **环形缓冲区（ring buffer）**。不过由于缓冲区在磁盘上，因此缓冲区可能相当的大。

让我们做个简单计算。在撰写本文时，典型的大型硬盘容量为 6TB，顺序写入吞吐量为 150MB/s。如果以最快的速度写消息，则需要大约 11 个小时才能填满磁盘。因而磁盘可以缓冲 11 个小时的消息，之后它将开始覆盖旧的消息。即使使用多个磁盘和机器，这个比率也是一样的。实践中的部署很少能用满磁盘的写入带宽，所以通常可以保存一个几天甚至几周的日志缓冲区。

不管保留多长时间的消息，日志的吞吐量或多或少保持不变，因为无论如何，每个消息都会被写入磁盘【18】。这种行为与默认将消息保存在内存中，仅当队列太长时才写入磁盘的消息传递系统形成鲜明对比。当队列很短时，这些系统非常快；而当这些系统开始写入磁盘时，就要慢的多，所以吞吐量取决于保留的历史数量。

#### 当消费者跟不上生产者时

在 “[消息传递系统](#消息传递系统)” 中，如果消费者无法跟上生产者发送信息的速度时，我们讨论了三种选择：丢弃信息，进行缓冲或施加背压。在这种分类法里，基于日志的方法是缓冲的一种形式，具有很大但大小固定的缓冲区（受可用磁盘空间的限制）。

如果消费者远远落后，而所要求的消息比保留在磁盘上的消息还要旧，那么它将不能读取这些消息，所以代理实际上丢弃了缓冲区容量所无法容纳的旧消息。你可以监控消费者落后日志头部的距离，如果落后太多就发出报警。由于缓冲区很大，因而有足够的时间让运维人员来修复慢消费者，并在消息开始丢失之前让其赶上。

即使消费者真的落后太多开始丢失消息，也只有那个消费者受到影响；它不会中断其他消费者的服务。这是一个巨大的运维优势：你可以实验性地消费生产日志，以进行开发，测试或调试，而不必担心会中断生产服务。当消费者关闭或崩溃时，会停止消耗资源，唯一剩下的只有消费者偏移量。

这种行为也与传统的消息代理形成了鲜明对比，在那种情况下，你需要小心地删除那些消费者已经关闭的队列 —— 否则那些队列就会累积不必要的消息，从其他仍活跃的消费者那里占走内存。

#### 重播旧消息

我们之前提到，使用 AMQP 和 JMS 风格的消息代理，处理和确认消息是一个破坏性的操作，因为它会导致消息在代理上被删除。另一方面，在基于日志的消息代理中，使用消息更像是从文件中读取数据：这是只读操作，不会更改日志。

除了消费者的任何输出之外，处理的唯一副作用是消费者偏移量的前进。但偏移量是在消费者的控制之下的，所以如果需要的话可以很容易地操纵：例如你可以用昨天的偏移量跑一个消费者副本，并将输出写到不同的位置，以便重新处理最近一天的消息。你可以使用各种不同的处理代码重复任意次。

这一方面使得基于日志的消息传递更像上一章的批处理，其中衍生数据通过可重复的转换过程与输入数据显式分离。它允许进行更多的实验，更容易从错误和漏洞中恢复，使其成为在组织内集成数据流的良好工具【24】。


## 数据库与流

我们已经在消息代理和数据库之间进行了一些比较。尽管传统上它们被视为单独的工具类别，但是我们看到基于日志的消息代理已经成功地从数据库中获取灵感并将其应用于消息传递。我们也可以反过来：从消息传递和流中获取灵感，并将它们应用于数据库。

我们之前曾经说过，事件是某个时刻发生的事情的记录。发生的事情可能是用户操作（例如键入搜索查询）或读取传感器，但也可能是 **写入数据库**。某些东西被写入数据库的事实是可以被捕获、存储和处理的事件。这一观察结果表明，数据库和数据流之间的联系不仅仅是磁盘日志的物理存储 —— 而是更深层的联系。

事实上，复制日志（请参阅 “[复制日志的实现](/v1/ch5#复制日志的实现)”）是一个由数据库写入事件组成的流，由主库在处理事务时生成。从库将写入流应用到它们自己的数据库副本，从而最终得到相同数据的精确副本。复制日志中的事件描述发生的数据更改。

我们还在 “[全序广播](/v1/ch9#全序广播)” 中遇到了状态机复制原理，其中指出：如果每个事件代表对数据库的写入，并且每个副本按相同的顺序处理相同的事件，则副本将达到相同的最终状态 （假设事件处理是一个确定性的操作）。这是事件流的又一种场景！

在本节中，我们将首先看看异构数据系统中出现的一个问题，然后探讨如何通过将事件流的想法带入数据库来解决这个问题。

### 保持系统同步

正如我们在本书中所看到的，没有一个系统能够满足所有的数据存储、查询和处理需求。在实践中，大多数重要应用都需要组合使用几种不同的技术来满足所有的需求：例如，使用 OLTP 数据库来为用户请求提供服务，使用缓存来加速常见请求，使用全文索引来处理搜索查询，使用数据仓库用于分析。每一种技术都有自己的数据副本，并根据自己的目的进行存储方式的优化。

由于相同或相关的数据出现在了不同的地方，因此相互间需要保持同步：如果某个项目在数据库中被更新，它也应当在缓存、搜索索引和数据仓库中被更新。对于数据仓库，这种同步通常由 ETL 进程执行（请参阅 “[数据仓库](/v1/ch3#数据仓库)”），通常是先取得数据库的完整副本，然后执行转换，并批量加载到数据仓库中 —— 换句话说，批处理。我们在 “[批处理工作流的输出](/v1/ch10#批处理工作流的输出)” 中同样看到了如何使用批处理创建搜索索引、推荐系统和其他衍生数据系统。

如果周期性的完整数据库转储过于缓慢，有时会使用的替代方法是 **双写（dual write）**，其中应用代码在数据变更时明确写入每个系统：例如，首先写入数据库，然后更新搜索索引，然后使缓存项失效（甚至同时执行这些写入）。

但是，双写有一些严重的问题，其中一个是竞争条件，如 [图 11-4](/v1/ddia_1104.png) 所示。在这个例子中，两个客户端同时想要更新一个项目 X：客户端 1 想要将值设置为 A，客户端 2 想要将其设置为 B。两个客户端首先将新值写入数据库，然后将其写入到搜索索引。因为运气不好，这些请求的时序是交错的：数据库首先看到来自客户端 1 的写入将值设置为 A，然后来自客户端 2 的写入将值设置为 B，因此数据库中的最终值为 B。搜索索引首先看到来自客户端 2 的写入，然后是客户端 1 的写入，所以搜索索引中的最终值是 A。即使没发生错误，这两个系统现在也永久地不一致了。

![](/v1/ddia_1104.png)

**图 11-4 在数据库中 X 首先被设置为 A，然后被设置为 B，而在搜索索引处，写入以相反的顺序到达**

除非有一些额外的并发检测机制，例如我们在 “[检测并发写入](/v1/ch5#检测并发写入)” 中讨论的版本向量，否则你甚至不会意识到发生了并发写入 —— 一个值将简单地以无提示方式覆盖另一个值。

双重写入的另一个问题是，其中一个写入可能会失败，而另一个成功。这是一个容错问题，而不是一个并发问题，但也会造成两个系统互相不一致的结果。确保它们要么都成功要么都失败，是原子提交问题的一个例子，解决这个问题的代价是昂贵的（请参阅 “[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)”）。

如果你只有一个单领导者复制的数据库，那么这个领导者决定了写入顺序，而状态机复制方法可以在数据库副本上工作。然而，在 [图 11-4](/v1/ddia_1104.png) 中，没有单个主库：数据库可能有一个领导者，搜索索引也可能有一个领导者，但是两者都不追随对方，所以可能会发生冲突（请参阅 “[多主复制](/v1/ch5#多主复制)”）。

如果实际上只有一个领导者 —— 例如，数据库 —— 而且我们能让搜索索引成为数据库的追随者，情况要好得多。但这在实践中可能吗？

### 变更数据捕获

大多数数据库的复制日志的问题在于，它们一直被当做数据库的内部实现细节，而不是公开的 API。客户端应该通过其数据模型和查询语言来查询数据库，而不是解析复制日志并尝试从中提取数据。

数十年来，许多数据库根本没有记录在档的获取变更日志的方式。由于这个原因，捕获数据库中所有的变更，然后将其复制到其他存储技术（搜索索引、缓存或数据仓库）中是相当困难的。

最近，人们对 **变更数据捕获（change data capture, CDC）** 越来越感兴趣，这是一种观察写入数据库的所有数据变更，并将其提取并转换为可以复制到其他系统中的形式的过程。CDC 是非常有意思的，尤其是当变更能在被写入后立刻用于流时。

例如，你可以捕获数据库中的变更，并不断将相同的变更应用至搜索索引。如果变更日志以相同的顺序应用，则可以预期搜索索引中的数据与数据库中的数据是匹配的。搜索索引和任何其他衍生数据系统只是变更流的消费者，如 [图 11-5](/v1/ddia_1105.png) 所示。

![](/v1/ddia_1105.png)

**图 11-5 将数据按顺序写入一个数据库，然后按照相同的顺序将这些更改应用到其他系统**

#### 变更数据捕获的实现

我们可以将日志消费者叫做 **衍生数据系统**，正如在 [第三部分](/v1/part-iii) 的介绍中所讨论的：存储在搜索索引和数据仓库中的数据，只是 **记录系统** 数据的额外视图。变更数据捕获是一种机制，可确保对记录系统所做的所有更改都反映在衍生数据系统中，以便衍生系统具有数据的准确副本。

从本质上说，变更数据捕获使得一个数据库成为领导者（被捕获变化的数据库），并将其他组件变为追随者。基于日志的消息代理非常适合从源数据库传输变更事件，因为它保留了消息的顺序（避免了 [图 11-2](/v1/ddia_1102.png) 的重新排序问题）。

数据库触发器可用来实现变更数据捕获（请参阅 “[基于触发器的复制](/v1/ch5#基于触发器的复制)”），通过注册观察所有变更的触发器，并将相应的变更项写入变更日志表中。但是它们往往是脆弱的，而且有显著的性能开销。解析复制日志可能是一种更稳健的方法，但它也很有挑战，例如如何应对模式变更。

LinkedIn 的 Databus【25】，Facebook 的 Wormhole【26】和 Yahoo! 的 Sherpa【27】大规模地应用这个思路。Bottled Water 使用解码 WAL 的 API 实现了 PostgreSQL 的 CDC【28】，Maxwell 和 Debezium 通过解析 binlog 对 MySQL 做了类似的事情【29,30,31】，Mongoriver 读取 MongoDB oplog【32,33】，而 GoldenGate 为 Oracle 提供类似的功能【34,35】。

类似于消息代理，变更数据捕获通常是异步的：记录数据库系统在提交变更之前不会等待消费者应用变更。这种设计具有的运维优势是，添加缓慢的消费者不会过度影响记录系统。不过，所有复制延迟可能有的问题在这里都可能出现（请参阅 “[复制延迟问题](/v1/ch5#复制延迟问题)”）。

#### 初始快照

如果你拥有 **所有** 对数据库进行变更的日志，则可以通过重播该日志，来重建数据库的完整状态。但是在许多情况下，永远保留所有更改会耗费太多磁盘空间，且重播过于费时，因此日志需要被截断。

例如，构建新的全文索引需要整个数据库的完整副本 —— 仅仅应用最近变更的日志是不够的，因为这样会丢失最近未曾更新的项目。因此，如果你没有完整的历史日志，则需要从一个一致的快照开始，如先前的 “[设置新从库](/v1/ch5#设置新从库)” 中所述。

数据库的快照必须与变更日志中的已知位置或偏移量相对应，以便在处理完快照后知道从哪里开始应用变更。一些 CDC 工具集成了这种快照功能，而其他工具则把它留给你手动执行。

#### 日志压缩

如果你只能保留有限的历史日志，则每次要添加新的衍生数据系统时，都需要做一次快照。但 **日志压缩（log compaction）** 提供了一个很好的备选方案。

我们之前在 “[散列索引](/v1/ch3#散列索引)” 中关于日志结构存储引擎的上下文中讨论了日志压缩（请参阅 [图 3-2](/v1/ddia_0302.png) 的示例）。原理很简单：存储引擎定期在日志中查找具有相同键的记录，丢掉所有重复的内容，并只保留每个键的最新更新。这个压缩与合并过程在后台运行。

在日志结构存储引擎中，具有特殊值 NULL（**墓碑**，即 tombstone）的更新表示该键被删除，并会在日志压缩过程中被移除。但只要键不被覆盖或删除，它就会永远留在日志中。这种压缩日志所需的磁盘空间仅取决于数据库的当前内容，而不取决于数据库中曾经发生的写入次数。如果相同的键经常被覆盖写入，则先前的值最终将被垃圾回收，只有最新的值会保留下来。

在基于日志的消息代理与变更数据捕获的上下文中也适用相同的想法。如果 CDC 系统被配置为，每个变更都包含一个主键，且每个键的更新都替换了该键以前的值，那么只需要保留对键的最新写入就足够了。

现在，无论何时需要重建衍生数据系统（如搜索索引），你可以从压缩日志主题的零偏移量处启动新的消费者，然后依次扫描日志中的所有消息。日志能保证包含数据库中每个键的最新值（也可能是一些较旧的值）—— 换句话说，你可以使用它来获取数据库内容的完整副本，而无需从 CDC 源数据库取一个快照。

Apache Kafka 支持这种日志压缩功能。正如我们将在本章后面看到的，它允许消息代理被当成持久性存储使用，而不仅仅是用于临时消息。

#### 变更流的API支持

越来越多的数据库开始将变更流作为第一等的接口，而不像传统上要去做加装改造，或者费工夫逆向工程一个 CDC。例如，RethinkDB 允许查询订阅通知，当查询结果变更时获得通知【36】，Firebase 【37】和 CouchDB 【38】基于变更流进行同步，该变更流同样可用于应用。而 Meteor 使用 MongoDB oplog 订阅数据变更，并据此更新用户界面【39】。

VoltDB 允许事务以流的形式连续地从数据库中导出数据【40】。数据库将关系数据模型中的输出流表示为一个表，事务可以向其中插入元组，但不能查询。已提交事务按照提交顺序写入这个特殊表，而流则由该表中的元组日志构成。外部消费者可以异步消费该日志，并使用它来更新衍生数据系统。

Kafka Connect【41】致力于将广泛的数据库系统的变更数据捕获工具与 Kafka 集成。一旦变更事件进入 Kafka 中，它就可以用于更新衍生数据系统，比如搜索索引，也可以用于本章稍后讨论的流处理系统。

### 事件溯源

我们在这里讨论的想法和 **事件溯源（Event Sourcing）** 之间有一些相似之处，这是一个在 **领域驱动设计（domain-driven design, DDD）** 社区中折腾出来的技术。我们将简要讨论事件溯源，因为它包含了一些关于流处理系统的有用想法。

与变更数据捕获类似，事件溯源涉及到 **将所有对应用状态的变更** 存储为变更事件日志。最大的区别是事件溯源将这一想法应用到了一个不同的抽象层次上：

* 在变更数据捕获中，应用以 **可变方式（mutable way）** 使用数据库，可以任意更新和删除记录。变更日志是从数据库的底层提取的（例如，通过解析复制日志），从而确保从数据库中提取的写入顺序与实际写入的顺序相匹配，从而避免 [图 11-4](/v1/ddia_1104.png) 中的竞态条件。写入数据库的应用不需要知道 CDC 的存在。
* 在事件溯源中，应用逻辑显式构建在写入事件日志的不可变事件之上。在这种情况下，事件存储是仅追加写入的，更新与删除是不鼓励的或禁止的。事件被设计为反映应用层面发生的事情，而不是底层的状态变更。

事件溯源是一种强大的数据建模技术：从应用的角度来看，将用户的行为记录为不可变的事件更有意义，而不是在可变数据库中记录这些行为的影响。事件溯源使得应用随时间演化更为容易，通过更容易理解事情发生的原因来帮助调试的进行，并有利于防止应用 Bug（请参阅 “[不可变事件的优点](#不可变事件的优点)”）。

例如，存储 “学生取消选课” 事件以中性的方式清楚地表达了单个行为的意图，而其副作用 “从登记表中删除了一个条目，而一条取消原因的记录被添加到学生反馈表” 则嵌入了很多有关稍后对数据的使用方式的假设。如果引入一个新的应用功能，例如 “将位置留给等待列表中的下一个人” —— 事件溯源方法允许将新的副作用轻松地挂接到现有事件之上。

事件溯源类似于 **编年史（chronicle）** 数据模型【45】，事件日志与星型模式中的事实表之间也存在相似之处（请参阅 “[星型和雪花型：分析的模式](/v1/ch3#星型和雪花型：分析的模式)”） 。

诸如 Event Store【46】这样的专业数据库已经被开发出来，供使用事件溯源的应用使用，但总的来说，这种方法独立于任何特定的工具。传统的数据库或基于日志的消息代理也可以用来构建这种风格的应用。

#### 从事件日志中派生出当前状态

事件日志本身并不是很有用，因为用户通常期望看到的是系统的当前状态，而不是变更历史。例如，在购物网站上，用户期望能看到他们购物车里的当前内容，而不是他们购物车所有变更的一个仅追加列表。

因此，使用事件溯源的应用需要拉取事件日志（表示 **写入** 系统的数据），并将其转换为适合向用户显示的应用状态（从系统 **读取** 数据的方式【47】）。这种转换可以使用任意逻辑，但它应当是确定性的，以便能再次运行，并从事件日志中衍生出相同的应用状态。

与变更数据捕获一样，重播事件日志允许让你重新构建系统的当前状态。不过，日志压缩需要采用不同的方式处理：

* 用于记录更新的 CDC 事件通常包含记录的 **完整新版本**，因此主键的当前值完全由该主键的最近事件确定，而日志压缩可以丢弃相同主键的先前事件。
* 另一方面，事件溯源在更高层次进行建模：事件通常表示用户操作的意图，而不是因为操作而发生的状态更新机制。在这种情况下，后面的事件通常不会覆盖先前的事件，所以你需要完整的历史事件来重新构建最终状态。这里进行同样的日志压缩是不可能的。

使用事件溯源的应用通常有一些机制，用于存储从事件日志中导出的当前状态快照，因此它们不需要重复处理完整的日志。然而这只是一种性能优化，用来加速读取，提高从崩溃中恢复的速度；真正的目的是系统能够永久存储所有原始事件，并在需要时重新处理完整的事件日志。我们将在 “[不变性的局限性](#不变性的局限性)” 中讨论这个假设。

#### 命令和事件

事件溯源的哲学是仔细区分 **事件（event）** 和 **命令（command）**【48】。当来自用户的请求刚到达时，它一开始是一个命令：在这个时间点上它仍然可能失败，比如，因为违反了一些完整性条件。应用必须首先验证它是否可以执行该命令。如果验证成功并且命令被接受，则它变为一个持久化且不可变的事件。

例如，如果用户试图注册特定用户名，或预定飞机或剧院的座位，则应用需要检查用户名或座位是否已被占用。（先前在 “[容错共识](/v1/ch9#容错共识)” 中讨论过这个例子）当检查成功时，应用可以生成一个事件，指示特定的用户名是由特定的用户 ID 注册的，或者座位已经预留给特定的顾客。

在事件生成的时刻，它就成为了 **事实（fact）**。即使客户稍后决定更改或取消预订，他们之前曾预定了某个特定座位的事实仍然成立，而更改或取消是之后添加的单独的事件。

事件流的消费者不允许拒绝事件：当消费者看到事件时，它已经成为日志中不可变的一部分，并且可能已经被其他消费者看到了。因此任何对命令的验证，都需要在它成为事件之前同步完成。例如，可以使用一个能原子性地验证命令并发布事件的可串行事务。

或者，预订座位的用户请求可以拆分为两个事件：第一个是暂时预约，第二个是验证预约后的独立的确认事件（如 “[使用全序广播实现线性一致的存储](/v1/ch9#使用全序广播实现线性一致的存储)” 中所述） 。这种分割方式允许验证发生在一个异步的过程中。

### 状态、流和不变性

我们在 [第十章](/v1/ch10) 中看到，批处理因其输入文件不变性而受益良多，你可以在现有输入文件上运行实验性处理作业，而不用担心损坏它们。这种不变性原则也是使得事件溯源与变更数据捕获如此强大的原因。

我们通常将数据库视为应用程序当前状态的存储 —— 这种表示针对读取进行了优化，而且通常对于服务查询而言是最为方便的表示。状态的本质是，它会变化，所以数据库才会支持数据的增删改。这又该如何匹配不变性呢？

只要你的状态发生了变化，那么这个状态就是这段时间中事件修改的结果。例如，当前可用的座位列表是你已处理的预订所产生的结果，当前帐户余额是帐户中的借与贷的结果，而 Web 服务器的响应时间图，是所有已发生 Web 请求的独立响应时间的聚合结果。

无论状态如何变化，总是有一系列事件导致了这些变化。即使事情已经执行与回滚，这些事件出现是始终成立的。关键的想法是：可变的状态与不可变事件的仅追加日志相互之间并不矛盾：它们是一体两面，互为阴阳的。所有变化的日志 —— **变化日志（changelog）**，表示了随时间演变的状态。

如果你倾向于数学表示，那么你可能会说，应用状态是事件流对时间求积分得到的结果，而变更流是状态对时间求微分的结果，如 [图 11-6](/v1/ddia_1106.png) 所示【49,50,51】。这个比喻有一些局限性（例如，状态的二阶导似乎没有意义），但这是考虑数据的一个实用出发点。
$$
state(now) = \int_{t=0}^{now}{stream(t) \ dt} \\
stream(t) = \frac{d\ state(t)}{dt}
$$

![](/v1/ddia_1106.png)

**图 11-6 应用当前状态与事件流之间的关系**

如果你持久存储了变更日志，那么重现状态就非常简单。如果你认为事件日志是你的记录系统，而所有的衍生状态都从它派生而来，那么系统中的数据流动就容易理解的多。正如帕特・赫兰（Pat Helland）所说的【52】：

> 事务日志记录了数据库的所有变更。高速追加是更改日志的唯一方法。从这个角度来看，数据库的内容其实是日志中记录最新值的缓存。日志才是真相，数据库是日志子集的缓存，这一缓存子集恰好来自日志中每条记录与索引值的最新值。

日志压缩（如 “[日志压缩](#日志压缩)” 中所述）是连接日志与数据库状态之间的桥梁：它只保留每条记录的最新版本，并丢弃被覆盖的版本。

#### 不可变事件的优点

数据库中的不变性是一个古老的概念。例如，会计在几个世纪以来一直在财务记账中应用不变性。一笔交易发生时，它被记录在一个仅追加写入的分类帐中，实质上是描述货币、商品或服务转手的事件日志。账目，比如利润、亏损、资产负债表，是从分类账中的交易求和衍生而来【53】。

如果发生错误，会计师不会删除或更改分类帐中的错误交易 —— 而是添加另一笔交易以补偿错误，例如退还一笔不正确的费用。不正确的交易将永远保留在分类帐中，对于审计而言可能非常重要。如果从不正确的分类账衍生出的错误数字已经公布，那么下一个会计周期的数字就会包括一个更正。这个过程在会计事务中是很常见的【54】。

尽管这种可审计性在金融系统中尤其重要，但对于不受这种严格监管的许多其他系统，也是很有帮助的。如 “[批处理输出的哲学](/v1/ch10#批处理输出的哲学)” 中所讨论的，如果你意外地部署了将错误数据写入数据库的错误代码，当代码会破坏性地覆写数据时，恢复要困难得多。使用不可变事件的仅追加日志，诊断问题与故障恢复就要容易的多。

不可变的事件也包含了比当前状态更多的信息。例如在购物网站上，顾客可以将物品添加到他们的购物车，然后再将其移除。虽然从履行订单的角度，第二个事件取消了第一个事件，但对分析目的而言，知道客户考虑过某个特定项而之后又反悔，可能是很有用的。也许他们会选择在未来购买，或者他们已经找到了替代品。这个信息被记录在事件日志中，但对于移出购物车就删除记录的数据库而言，这个信息在移出购物车时可能就丢失了【42】。

#### 从同一事件日志中派生多个视图

此外，通过从不变的事件日志中分离出可变的状态，你可以针对不同的读取方式，从相同的事件日志中衍生出几种不同的表现形式。效果就像一个流的多个消费者一样（[图 11-5](/v1/ddia_1105.png)）：例如，分析型数据库 Druid 使用这种方式直接从 Kafka 摄取数据【55】，Pistachio 是一个分布式的键值存储，使用 Kafka 作为提交日志【56】，Kafka Connect 能将来自 Kafka 的数据导出到各种不同的数据库与索引【41】。这对于许多其他存储和索引系统（如搜索服务器）来说是很有意义的，当系统要从分布式日志中获取输入时亦然（请参阅 “[保持系统同步](#保持系统同步)”）。

添加从事件日志到数据库的显式转换，能够使应用更容易地随时间演进：如果你想要引入一个新功能，以新的方式表示现有数据，则可以使用事件日志来构建一个单独的、针对新功能的读取优化视图，无需修改现有系统而与之共存。并行运行新旧系统通常比在现有系统中执行复杂的模式迁移更容易。一旦不再需要旧的系统，你可以简单地关闭它并回收其资源【47,57】。

如果你不需要担心如何查询与访问数据，那么存储数据通常是非常简单的。模式设计、索引和存储引擎的许多复杂性，都是希望支持某些特定查询和访问模式的结果（请参阅 [第三章](/v1/ch3)）。出于这个原因，通过将数据写入的形式与读取形式相分离，并允许几个不同的读取视图，你能获得很大的灵活性。这个想法有时被称为 **命令查询责任分离（command query responsibility segregation, CQRS）**【42,58,59】。

数据库和模式设计的传统方法是基于这样一种谬论，数据必须以与查询相同的形式写入。如果可以将数据从针对写入优化的事件日志转换为针对读取优化的应用状态，那么有关规范化和非规范化的争论就变得无关紧要了（请参阅 “[多对一和多对多的关系](/v1/ch2#多对一和多对多的关系)”）：在针对读取优化的视图中对数据进行非规范化是完全合理的，因为翻译过程提供了使其与事件日志保持一致的机制。

在 “[描述负载](/v1/ch1#描述负载)” 中，我们讨论了推特主页时间线，它是特定用户关注的人群所发推特的缓存（类似邮箱）。这是 **针对读取优化的状态** 的又一个例子：主页时间线是高度非规范化的，因为你的推文与你所有粉丝的时间线都构成了重复。然而，扇出服务保持了这种重复状态与新推特以及新关注关系的同步，从而保证了重复的可管理性。

#### 并发控制

事件溯源和变更数据捕获的最大缺点是，事件日志的消费者通常是异步的，所以可能会出现这样的情况：用户会写入日志，然后从日志衍生视图中读取，结果发现他的写入还没有反映在读取视图中。我们之前在 “[读己之写](/v1/ch5#读己之写)” 中讨论了这个问题以及可能的解决方案。

一种解决方案是将事件追加到日志时同步执行读取视图的更新。而将这些写入操作合并为一个原子单元需要 **事务**，所以要么将事件日志和读取视图保存在同一个存储系统中，要么就需要跨不同系统进行分布式事务。或者，你也可以使用在 “[使用全序广播实现线性一致的存储](/v1/ch9#使用全序广播实现线性一致的存储)” 中讨论的方法。

另一方面，从事件日志导出当前状态也简化了并发控制的某些部分。许多对于多对象事务的需求（请参阅 “[单对象和多对象操作](/v1/ch7#单对象和多对象操作)”）源于单个用户操作需要在多个不同的位置更改数据。通过事件溯源，你可以设计一个自包含的事件以表示一个用户操作。然后用户操作就只需要在一个地方进行单次写入操作 —— 即将事件附加到日志中 —— 这很容易做到原子化。

如果事件日志与应用状态以相同的方式分区（例如，处理分区 3 中的客户事件只需要更新分区 3 中的应用状态），那么直接使用单线程日志消费者就不需要写入并发控制了。它从设计上一次只处理一个事件（请参阅 “[真的串行执行](/v1/ch7#真的串行执行)”）。日志通过在分区中定义事件的序列顺序，消除了并发性的不确定性【24】。如果一个事件触及多个状态分区，那么需要做更多的工作，我们将在 [第十二章](/v1/ch12) 讨论。

#### 不变性的局限性

许多不使用事件溯源模型的系统也还是依赖不可变性：各种数据库在内部使用不可变的数据结构或多版本数据来支持时间点快照（请参阅 “[索引和快照隔离](/v1/ch7#索引和快照隔离)” ）。Git、Mercurial 和 Fossil 等版本控制系统也依靠不可变的数据来保存文件的版本历史记录。

永远保持所有变更的不变历史，在多大程度上是可行的？答案取决于数据集的流失率。一些工作负载主要是添加数据，很少更新或删除；它们很容易保持不变。其他工作负载在相对较小的数据集上有较高的更新 / 删除率；在这些情况下，不可变的历史可能增至难以接受的巨大，碎片化可能成为一个问题，压缩与垃圾收集的表现对于运维的稳健性变得至关重要【60,61】。

除了性能方面的原因外，也可能有出于管理方面的原因需要删除数据的情况，尽管这些数据都是不可变的。例如，隐私条例可能要求在用户关闭帐户后删除他们的个人信息，数据保护立法可能要求删除错误的信息，或者可能需要阻止敏感信息的意外泄露。

在这种情况下，仅仅在日志中添加另一个事件来指明先前的数据应该被视为删除是不够的 —— 你实际上是想改写历史，并假装数据从一开始就没有写入。例如，Datomic 管这个特性叫 **切除（excision）** 【62】，而 Fossil 版本控制系统有一个类似的概念叫 **避免（shunning）** 【63】。

真正删除数据是非常非常困难的【64】，因为副本可能存在于很多地方：例如，存储引擎，文件系统和 SSD 通常会向一个新位置写入，而不是原地覆盖旧数据【52】，而备份通常是特意做成不可变的，防止意外删除或损坏。删除操作更多的是指 “使取回数据更困难”，而不是指 “使取回数据不可能”。无论如何，有时你必须得尝试，正如我们在 “[立法与自律](/v1/ch12#立法与自律)” 中所看到的。


## 流处理

到目前为止，本章中我们已经讨论了流的来源（用户活动事件，传感器和写入数据库），我们讨论了流如何传输（直接通过消息传送，通过消息代理，通过事件日志）。

剩下的就是讨论一下你可以用流做什么 —— 也就是说，你可以处理它。一般来说，有三种选项：

1. 你可以将事件中的数据写入数据库、缓存、搜索索引或类似的存储系统，然后能被其他客户端查询。如 [图 11-5](/v1/ddia_1105.png) 所示，这是数据库与系统其他部分所发生的变更保持同步的好方法 —— 特别是当流消费者是写入数据库的唯一客户端时。如 “[批处理工作流的输出](/v1/ch10#批处理工作流的输出)” 中所讨论的，它是写入存储系统的流等价物。
2. 你能以某种方式将事件推送给用户，例如发送报警邮件或推送通知，或将事件流式传输到可实时显示的仪表板上。在这种情况下，人是流的最终消费者。
3. 你可以处理一个或多个输入流，并产生一个或多个输出流。流可能会经过由几个这样的处理阶段组成的流水线，最后再输出（选项 1 或 2）。

在本章的剩余部分中，我们将讨论选项 3：处理流以产生其他衍生流。处理这样的流的代码片段，被称为 **算子（operator）** 或 **作业（job）**。它与我们在 [第十章](/v1/ch10) 中讨论过的 Unix 进程和 MapReduce 作业密切相关，数据流的模式是相似的：一个流处理器以只读的方式使用输入流，并将其输出以仅追加的方式写入一个不同的位置。

流处理中的分区和并行化模式也非常类似于 [第十章](/v1/ch10) 中介绍的 MapReduce 和数据流引擎，因此我们不再重复这些主题。基本的 Map 操作（如转换和过滤记录）也是一样的。

与批量作业相比的一个关键区别是，流不会结束。这种差异会带来很多隐含的结果。正如本章开始部分所讨论的，排序对无界数据集没有意义，因此无法使用 **排序合并连接**（请参阅 “[Reduce 侧连接与分组](/v1/ch10#Reduce侧连接与分组)”）。容错机制也必须改变：对于已经运行了几分钟的批处理作业，可以简单地从头开始重启失败任务，但是对于已经运行数年的流作业，重启后从头开始跑可能并不是一个可行的选项。

### 流处理的应用

长期以来，流处理一直用于监控目的，如果某个事件发生，组织希望能得到警报。例如：

* 欺诈检测系统需要确定信用卡的使用模式是否有意外地变化，如果信用卡可能已被盗刷，则锁卡。
* 交易系统需要检查金融市场的价格变化，并根据指定的规则进行交易。
* 制造系统需要监控工厂中机器的状态，如果出现故障，可以快速定位问题。
* 军事和情报系统需要跟踪潜在侵略者的活动，并在出现袭击征兆时发出警报。

这些类型的应用需要非常精密复杂的模式匹配与相关检测。然而随着时代的进步，流处理的其他用途也开始出现。在本节中，我们将简要比较一下这些应用。

#### 复合事件处理

**复合事件处理（complex event processing, CEP）** 是 20 世纪 90 年代为分析事件流而开发出的一种方法，尤其适用于需要搜索某些事件模式的应用【65,66】。与正则表达式允许你在字符串中搜索特定字符模式的方式类似，CEP 允许你指定规则以在流中搜索某些事件模式。

CEP 系统通常使用高层次的声明式查询语言，比如 SQL，或者图形用户界面，来描述应该检测到的事件模式。这些查询被提交给处理引擎，该引擎消费输入流，并在内部维护一个执行所需匹配的状态机。当发现匹配时，引擎发出一个 **复合事件**（即 complex event，CEP 因此得名），并附有检测到的事件模式详情【67】。

在这些系统中，查询和数据之间的关系与普通数据库相比是颠倒的。通常情况下，数据库会持久存储数据，并将查询视为临时的：当查询进入时，数据库搜索与查询匹配的数据，然后在查询完成时丢掉查询。CEP 引擎反转了角色：查询是长期存储的，来自输入流的事件不断流过它们，搜索匹配事件模式的查询【68】。

CEP 的实现包括 Esper【69】、IBM InfoSphere Streams【70】、Apama、TIBCO StreamBase 和 SQLstream。像 Samza 这样的分布式流处理组件，支持使用 SQL 在流上进行声明式查询【71】。

#### 流分析

使用流处理的另一个领域是对流进行分析。CEP 与流分析之间的边界是模糊的，但一般来说，分析往往对找出特定事件序列并不关心，而更关注大量事件上的聚合与统计指标 —— 例如：

* 测量某种类型事件的速率（每个时间间隔内发生的频率）
* 滚动计算一段时间窗口内某个值的平均值
* 将当前的统计值与先前的时间区间的值对比（例如，检测趋势，当指标与上周同比异常偏高或偏低时报警）

这些统计值通常是在固定时间区间内进行计算的，例如，你可能想知道在过去 5 分钟内服务每秒查询次数的均值，以及此时间段内响应时间的第 99 百分位点。在几分钟内取平均，能抹平秒和秒之间的无关波动，且仍然能向你展示流量模式的时间图景。聚合的时间间隔称为 **窗口（window）**，我们将在 “[时间推理](#时间推理)” 中更详细地讨论窗口。

流分析系统有时会使用概率算法，例如 Bloom filter（我们在 “[性能优化](/v1/ch3#性能优化)” 中遇到过）来管理成员资格，HyperLogLog【72】用于基数估计以及各种百分位点估计算法（请参阅 “[实践中的百分位点](/v1/ch1#实践中的百分位点)”）。概率算法产出近似的结果，但比起精确算法的优点是内存使用要少得多。使用近似算法有时让人们觉得流处理系统总是有损的和不精确的，但这是错误看法：流处理并没有任何内在的近似性，而概率算法只是一种优化【73】。

许多开源分布式流处理框架的设计都是针对分析设计的：例如 Apache Storm、Spark Streaming、Flink、Concord、Samza 和 Kafka Streams 【74】。托管服务包括 Google Cloud Dataflow 和 Azure Stream Analytics。

#### 维护物化视图

我们在 “[数据库与流](#数据库与流)” 中看到，数据库的变更流可以用于维护衍生数据系统（如缓存、搜索索引和数据仓库），并使其与源数据库保持最新。我们可以将这些示例视作维护 **物化视图（materialized view）** 的一种具体场景（请参阅 “[聚合：数据立方体和物化视图](/v1/ch3#聚合：数据立方体和物化视图)”）：在某个数据集上衍生出一个替代视图以便高效查询，并在底层数据变更时更新视图【50】。

同样，在事件溯源中，应用程序的状态是通过应用事件日志来维护的；这里的应用程序状态也是一种物化视图。与流分析场景不同的是，仅考虑某个时间窗口内的事件通常是不够的：构建物化视图可能需要任意时间段内的 **所有** 事件，除了那些可能由日志压缩丢弃的过时事件（请参阅 “[日志压缩](#日志压缩)”）。实际上，你需要一个可以一直延伸到时间开端的窗口。

原则上讲，任何流处理组件都可以用于维护物化视图，尽管 “永远运行” 与一些面向分析的框架假设的 “主要在有限时间段窗口上运行” 背道而驰，Samza 和 Kafka Streams 支持这种用法，建立在 Kafka 对日志压缩的支持上【75】。

#### 在流上搜索

除了允许搜索由多个事件构成模式的 CEP 外，有时也存在基于复杂标准（例如全文搜索查询）来搜索单个事件的需求。

例如，媒体监测服务可以订阅新闻文章 Feed 与来自媒体的播客，搜索任何关于公司、产品或感兴趣的话题的新闻。这是通过预先构建一个搜索查询来完成的，然后不断地将新闻项的流与该查询进行匹配。在一些网站上也有类似的功能：例如，当市场上出现符合其搜索条件的新房产时，房地产网站的用户可以要求网站通知他们。Elasticsearch 的这种过滤器功能，是实现这种流搜索的一种选择【76】。

传统的搜索引擎首先索引文件，然后在索引上跑查询。相比之下，搜索一个数据流则反了过来：查询被存储下来，文档从查询中流过，就像在 CEP 中一样。最简单的情况就是，你可以为每个文档测试每个查询。但是如果你有大量查询，这可能会变慢。为了优化这个过程，可以像对文档一样，为查询建立索引，从而收窄可能匹配的查询集合【77】。

#### 消息传递和RPC

在 “[消息传递中的数据流](/v1/ch4#消息传递中的数据流)” 中我们讨论过，消息传递系统可以作为 RPC 的替代方案，即作为一种服务间通信的机制，比如在 Actor 模型中所使用的那样。尽管这些系统也是基于消息和事件，但我们通常不会将其视作流处理组件：

* Actor 框架主要是管理模块通信的并发和分布式执行的一种机制，而流处理主要是一种数据管理技术。
* Actor 之间的交流往往是短暂的、一对一的；而事件日志则是持久的、多订阅者的。
* Actor 可以以任意方式进行通信（包括循环的请求 / 响应模式），但流处理通常配置在无环流水线中，其中每个流都是一个特定作业的输出，由良好定义的输入流中派生而来。

也就是说，RPC 类系统与流处理之间有一些交叉领域。例如，Apache Storm 有一个称为 **分布式 RPC** 的功能，它允许将用户查询分散到一系列也处理事件流的节点上；然后这些查询与来自输入流的事件交织，而结果可以被汇总并发回给用户【78】（另请参阅 “[多分区数据处理](/v1/ch12#多分区数据处理)”）。

也可以使用 Actor 框架来处理流。但是，很多这样的框架在崩溃时不能保证消息的传递，除非你实现了额外的重试逻辑，否则这种处理不是容错的。

### 时间推理

流处理通常需要与时间打交道，尤其是用于分析目的时候，会频繁使用时间窗口，例如 “过去五分钟的平均值”。“过去五分钟” 的含义看上去似乎是清晰而无歧义的，但不幸的是，这个概念非常棘手。

在批处理过程中，大量的历史事件被快速地处理。如果需要按时间来分析，批处理器需要检查每个事件中嵌入的时间戳。读取运行批处理机器的系统时钟没有任何意义，因为处理运行的时间与事件实际发生的时间无关。

批处理可以在几分钟内读取一年的历史事件；在大多数情况下，感兴趣的时间线是历史中的一年，而不是处理中的几分钟。而且使用事件中的时间戳，使得处理是 **确定性** 的：在相同的输入上再次运行相同的处理过程会得到相同的结果（请参阅 “[容错](/v1/ch10#容错)”）。

另一方面，许多流处理框架使用处理机器上的本地系统时钟（**处理时间**，即 processing time）来确定 **窗口（windowing）**【79】。这种方法的优点是简单，如果事件创建与事件处理之间的延迟可以忽略不计，那也是合理的。然而，如果存在任何显著的处理延迟 —— 即，事件处理显著地晚于事件实际发生的时间，这种处理方式就失效了。

#### 事件时间与处理时间

很多原因都可能导致处理延迟：排队，网络故障（请参阅 “[不可靠的网络](/v1/ch8#不可靠的网络)”），性能问题导致消息代理 / 消息处理器出现争用，流消费者重启，从故障中恢复时重新处理过去的事件（请参阅 “[重播旧消息](#重播旧消息)”），或者在修复代码 BUG 之后。

而且，消息延迟还可能导致无法预测消息顺序。例如，假设用户首先发出一个 Web 请求（由 Web 服务器 A 处理），然后发出第二个请求（由服务器 B 处理）。A 和 B 发出描述它们所处理请求的事件，但是 B 的事件先于 A 的事件到达消息代理。现在，流处理器将首先看到 B 事件，然后看到 A 事件，即使它们实际上是以相反的顺序发生的。

有一个类比也许能帮助理解，“星球大战” 电影：第四集于 1977 年发行，第五集于 1980 年，第六集于 1983 年，紧随其后的是 1999 年的第一集，2002 年的第二集，和 2005 年的第三集，以及 2015 年的第七集【80】[^ii]。如果你按照它们上映的顺序观看电影，你处理电影的顺序与它们叙事的顺序就是不一致的。（集数编号就像事件时间戳，而你观看电影的日期就是处理时间）作为人类，我们能够应对这种不连续性，但是流处理算法需要专门编写，以适应这种时序与顺序的问题。

[^ii]: 感谢 Flink 社区的 Kostas Kloudas 提出这个比喻。

将事件时间和处理时间搞混会导致错误的数据。例如，假设你有一个流处理器用于测量请求速率（计算每秒请求数）。如果你重新部署流处理器，它可能会停止一分钟，并在恢复之后处理积压的事件。如果你按处理时间来衡量速率，那么在处理积压日志时，请求速率看上去就像有一个异常的突发尖峰，而实际上请求速率是稳定的（[图 11-7](/v1/ddia_1107.png)）。

![](/v1/ddia_1107.png)

**图 11-7 按处理时间分窗，会因为处理速率的变动引入人为因素**

#### 知道什么时候准备好了

用事件时间来定义窗口的一个棘手的问题是，你永远也无法确定是不是已经收到了特定窗口的所有事件，还是说还有一些事件正在来的路上。

例如，假设你将事件分组为一分钟的窗口，以便统计每分钟的请求数。你已经计数了一些带有本小时内第 37 分钟时间戳的事件，时间流逝，现在进入的主要都是本小时内第 38 和第 39 分钟的事件。什么时候才能宣布你已经完成了第 37 分钟的窗口计数，并输出其计数器值？

在一段时间没有看到任何新的事件之后，你可以超时并宣布一个窗口已经就绪，但仍然可能发生这种情况：某些事件被缓冲在另一台机器上，由于网络中断而延迟。你需要能够处理这种在窗口宣告完成之后到达的 **滞留（straggler）** 事件。大体上，你有两种选择【1】：

1. 忽略这些滞留事件，因为在正常情况下它们可能只是事件中的一小部分。你可以将丢弃事件的数量作为一个监控指标，并在出现大量丢消息的情况时报警。
2. 发布一个 **更正（correction）**，一个包括滞留事件的更新窗口值。你可能还需要收回以前的输出。

在某些情况下，可以使用特殊的消息来指示 “从现在开始，不会有比 t 更早时间戳的消息了”，消费者可以使用它来触发窗口【81】。但是，如果不同机器上的多个生产者都在生成事件，每个生产者都有自己的最小时间戳阈值，则消费者需要分别跟踪每个生产者。在这种情况下，添加和删除生产者都是比较棘手的。

#### 你用的是谁的时钟？

当事件可能在系统内多个地方进行缓冲时，为事件分配时间戳更加困难了。例如，考虑一个移动应用向服务器上报关于用量的事件。该应用可能会在设备处于脱机状态时被使用，在这种情况下，它将在设备本地缓冲事件，并在下一次互联网连接可用时向服务器上报这些事件（可能是几小时甚至几天）。对于这个流的任意消费者而言，它们就如延迟极大的滞留事件一样。

在这种情况下，事件上的时间戳实际上应当是用户交互发生的时间，取决于移动设备的本地时钟。然而用户控制的设备上的时钟通常是不可信的，因为它可能会被无意或故意设置成错误的时间（请参阅 “[时钟同步与准确性](/v1/ch8#时钟同步与准确性)”）。服务器收到事件的时间（取决于服务器的时钟）可能是更准确的，因为服务器在你的控制之下，但在描述用户交互方面意义不大。

要校正不正确的设备时钟，一种方法是记录三个时间戳【82】：

* 事件发生的时间，取决于设备时钟
* 事件发送往服务器的时间，取决于设备时钟
* 事件被服务器接收的时间，取决于服务器时钟

通过从第三个时间戳中减去第二个时间戳，可以估算设备时钟和服务器时钟之间的偏移（假设网络延迟与所需的时间戳精度相比可忽略不计）。然后可以将该偏移应用于事件时间戳，从而估计事件实际发生的真实时间（假设设备时钟偏移在事件发生时与送往服务器之间没有变化）。

这并不是流处理独有的问题，批处理有着完全一样的时间推理问题。只是在流处理的上下文中，我们更容易意识到时间的流逝。

#### 窗口的类型

当你知道如何确定一个事件的时间戳后，下一步就是如何定义时间段的窗口。然后窗口就可以用于聚合，例如事件计数，或计算窗口内值的平均值。有几种窗口很常用【79,83】：

滚动窗口（Tumbling Window）
: 滚动窗口有着固定的长度，每个事件都仅能属于一个窗口。例如，假设你有一个 1 分钟的滚动窗口，则所有时间戳在 `10:03:00` 和 `10:03:59` 之间的事件会被分组到一个窗口中，`10:04:00` 和 `10:04:59` 之间的事件被分组到下一个窗口，依此类推。通过将每个事件时间戳向下舍入至最近的整分钟来确定它所属的窗口，可以实现 1 分钟的滚动窗口。

跳动窗口（Hopping Window）
: 跳动窗口也有着固定的长度，但允许窗口重叠以提供一些平滑。例如，一个带有 1 分钟跳跃步长的 5 分钟窗口将包含 `10:03:00` 至 `10:07:59` 之间的事件，而下一个窗口将覆盖 `10:04:00` 至 `10:08:59` 之间的事件，等等。通过首先计算 1 分钟的滚动窗口（tumbling window），然后在几个相邻窗口上进行聚合，可以实现这种跳动窗口。

滑动窗口（Sliding Window）
: 滑动窗口包含了彼此间距在特定时长内的所有事件。例如，一个 5 分钟的滑动窗口应当覆盖 `10:03:39` 和 `10:08:12` 的事件，因为它们相距不超过 5 分钟（注意 5 分钟的滚动窗口与跳动窗口可能不会把这两个事件分组到同一个窗口中，因为它们使用固定的边界）。通过维护一个按时间排序的事件缓冲区，并不断从窗口中移除过期的旧事件，可以实现滑动窗口。

会话窗口（Session Window）
: 与其他窗口类型不同，会话窗口没有固定的持续时间，而定义为：将同一用户出现时间相近的所有事件分组在一起，而当用户一段时间没有活动时（例如，如果 30 分钟内没有事件）窗口结束。会话切分是网站分析的常见需求（请参阅 “[分组](/v1/ch10#分组)”）。

### 流连接

在 [第十章](/v1/ch10) 中，我们讨论了批处理作业如何通过键来连接数据集，以及这种连接是如何成为数据管道的重要组成部分的。由于流处理将数据管道泛化为对无限数据集进行增量处理，因此对流进行连接的需求也是完全相同的。

然而，新事件随时可能出现在一个流中，这使得流连接要比批处理连接更具挑战性。为了更好地理解情况，让我们先来区分三种不同类型的连接：**流 - 流** 连接，**流 - 表** 连接，与 **表 - 表** 连接【84】。我们将在下面的章节中通过例子来说明。

#### 流流连接（窗口连接）

假设你的网站上有搜索功能，而你想要找出搜索 URL 的近期趋势。每当有人键入搜索查询时，都会记录下一个包含查询与其返回结果的事件。每当有人点击其中一个搜索结果时，就会另外记录一个点击事件。为了计算搜索结果中每个 URL 的点击率，你需要将搜索动作与点击动作的事件连在一起，这些事件通过相同的会话 ID 进行连接。广告系统中需要类似的分析【85】。

如果用户丢弃了搜索结果，点击可能永远不会发生，即使它出现了，搜索与点击之间的时间可能是高度可变的：在很多情况下，它可能是几秒钟，但也可能长达几天或几周（如果用户执行搜索，忘掉了这个浏览器页面，过了一段时间后重新回到这个浏览器页面上，并点击了一个结果）。由于可变的网络延迟，点击事件甚至可能先于搜索事件到达。你可以选择合适的连接窗口 —— 例如，如果点击与搜索之间的时间间隔在一小时内，你可能会选择连接两者。

请注意，在点击事件中嵌入搜索详情与事件连接并不一样：这样做的话，只有当用户点击了一个搜索结果时你才能知道，而那些没有点击的搜索就无能为力了。为了衡量搜索质量，你需要准确的点击率，为此搜索事件和点击事件两者都是必要的。

为了实现这种类型的连接，流处理器需要维护 **状态**：例如，按会话 ID 索引最近一小时内发生的所有事件。无论何时发生搜索事件或点击事件，都会被添加到合适的索引中，而流处理器也会检查另一个索引是否有具有相同会话 ID 的事件到达。如果有匹配事件就会发出一个表示搜索结果被点击的事件；如果搜索事件直到过期都没看见有匹配的点击事件，就会发出一个表示搜索结果未被点击的事件。

#### 流表连接（流扩充）

在 “[示例：用户活动事件分析](/v1/ch10#示例：用户活动事件分析)”（[图 10-2](/v1/ddia_1002.png)）中，我们看到了连接两个数据集的批处理作业示例：一组用户活动事件和一个用户档案数据库。将用户活动事件视为流，并在流处理器中连续执行相同的连接是很自然的想法：输入是包含用户 ID 的活动事件流，而输出还是活动事件流，但其中用户 ID 已经被扩展为用户的档案信息。这个过程有时被称为使用数据库的信息来 **扩充（enriching）** 活动事件。

要执行此连接，流处理器需要一次处理一个活动事件，在数据库中查找事件的用户 ID，并将档案信息添加到活动事件中。数据库查询可以通过查询远程数据库来实现。但正如在 “[示例：用户活动事件分析](/v1/ch10#示例：用户活动事件分析)” 一节中讨论的，此类远程查询可能会很慢，并且有可能导致数据库过载【75】。

另一种方法是将数据库副本加载到流处理器中，以便在本地进行查询而无需网络往返。这种技术与我们在 “[Map 侧连接](/v1/ch10#Map侧连接)” 中讨论的散列连接非常相似：如果数据库的本地副本足够小，则可以是内存中的散列表，比较大的话也可以是本地磁盘上的索引。

与批处理作业的区别在于，批处理作业使用数据库的时间点快照作为输入，而流处理器是长时间运行的，且数据库的内容可能随时间而改变，所以流处理器数据库的本地副本需要保持更新。这个问题可以通过变更数据捕获来解决：流处理器可以订阅用户档案数据库的更新日志，如同活动事件流一样。当增添或修改档案时，流处理器会更新其本地副本。因此，我们有了两个流之间的连接：活动事件和档案更新。

流表连接实际上非常类似于流流连接；最大的区别在于对于表的变更日志流，连接使用了一个可以回溯到 “时间起点” 的窗口（概念上是无限的窗口），新版本的记录会覆盖更早的版本。对于输入的流，连接可能压根儿就没有维护任何窗口。

#### 表表连接（维护物化视图）

我们在 “[描述负载](/v1/ch1#描述负载)” 中讨论的推特时间线例子时说过，当用户想要查看他们的主页时间线时，迭代用户所关注人群的推文并合并它们是一个开销巨大的操作。

相反，我们需要一个时间线缓存：一种每个用户的 “收件箱”，在发送推文的时候写入这些信息，因而读取时间线时只需要简单地查询即可。物化与维护这个缓存需要处理以下事件：

* 当用户 u 发送新的推文时，它将被添加到每个关注用户 u 的时间线上。
* 用户删除推文时，推文将从所有用户的时间线中删除。
* 当用户 $u_1$ 开始关注用户 $u_2$ 时，$u_2$ 最近的推文将被添加到 $u_1$ 的时间线上。
* 当用户 $u_1$ 取消关注用户 $u_2$ 时，$u_2$ 的推文将从 $u_1$ 的时间线中移除。

要在流处理器中实现这种缓存维护，你需要推文事件流（发送与删除）和关注关系事件流（关注与取消关注）。流处理器需要维护一个数据库，包含每个用户的粉丝集合，以便知道当一条新推文到达时，需要更新哪些时间线【86】。

观察这个流处理过程的另一种视角是：它维护了一个连接了两个表（推文与关注）的物化视图，如下所示：

```sql
SELECT follows.follower_id AS timeline_id,
    array_agg(tweets.* ORDER BY tweets.timestamp DESC)
FROM tweets
JOIN follows ON follows.followee_id = tweets.sender_id
GROUP BY follows.follower_id
```

流连接直接对应于这个查询中的表连接。时间线实际上是这个查询结果的缓存，每当底层的表发生变化时都会更新 [^iii]。

[^iii]: 如果你将流视作表的导数（derivative），如 [图 11-6](/v1/ddia_1106.png) 所示，而把一个连接看作是两个表的乘积 u·v，那么会发生一些有趣的事情：物化连接的变化流遵循乘积法则：(u·v)' = u'v + uv'。换句话说，任何推文的变化量都与当前的关注相连接，任何关注的变化量都与当前的推文相连接【49,50】。

#### 连接的时间依赖性

这里描述的三种连接（流流，流表，表表）有很多共通之处：它们都需要流处理器维护连接一侧的一些状态（搜索与点击事件，用户档案，关注列表），然后当连接另一侧的消息到达时查询该状态。

用于维护状态的事件顺序是很重要的（先关注然后取消关注，或者其他类似操作）。在分区日志中，单个分区内的事件顺序是保留下来的。但典型情况下是没有跨流或跨分区的顺序保证的。

这就产生了一个问题：如果不同流中的事件发生在近似的时间范围内，则应该按照什么样的顺序进行处理？在流表连接的例子中，如果用户更新了它们的档案，哪些活动事件与旧档案连接（在档案更新前处理），哪些又与新档案连接（在档案更新之后处理）？换句话说：你需要对一些状态做连接，如果状态会随着时间推移而变化，那应当使用什么时间点来连接呢【45】？

这种时序依赖可能出现在很多地方。例如销售东西需要对发票应用适当的税率，这取决于所处的国家 / 州，产品类型，销售日期（因为税率时不时会变化）。当连接销售额与税率表时，你可能期望的是使用销售时的税率参与连接。如果你正在重新处理历史数据，销售时的税率可能和现在的税率有所不同。

如果跨越流的事件顺序是未定的，则连接会变为不确定性的【87】，这意味着你在同样输入上重跑相同的作业未必会得到相同的结果：当你重跑任务时，输入流上的事件可能会以不同的方式交织。

在数据仓库中，这个问题被称为 **缓慢变化的维度（slowly changing dimension, SCD）**，通常通过对特定版本的记录使用唯一的标识符来解决：例如，每当税率改变时都会获得一个新的标识符，而发票在销售时会带有税率的标识符【88,89】。这种变化使连接变为确定性的，但也会导致日志压缩无法进行：表中所有的记录版本都需要保留。

### 容错

在本章的最后一节中，让我们看一看流处理是如何容错的。我们在 [第十章](/v1/ch10) 中看到，批处理框架可以很容易地容错：如果 MapReduce 作业中的任务失败，可以简单地在另一台机器上再次启动，并且丢弃失败任务的输出。这种透明的重试是可能的，因为输入文件是不可变的，每个任务都将其输出写入到 HDFS 上的独立文件中，而输出仅当任务成功完成后可见。

特别是，批处理容错方法可确保批处理作业的输出与没有出错的情况相同，即使实际上某些任务失败了。看起来好像每条输入记录都被处理了恰好一次 —— 没有记录被跳过，而且没有记录被处理两次。尽管重启任务意味着实际上可能会多次处理记录，但输出中的可见效果看上去就像只处理过一次。这个原则被称为 **恰好一次语义（exactly-once semantics）**，尽管 **等效一次（effectively-once）** 可能会是一个更写实的术语【90】。

在流处理中也出现了同样的容错问题，但是处理起来没有那么直观：等待某个任务完成之后再使其输出可见并不是一个可行选项，因为你永远无法处理完一个无限的流。

#### 微批量与存档点

一个解决方案是将流分解成小块，并像微型批处理一样处理每个块。这种方法被称为 **微批次（microbatching）**，它被用于 Spark Streaming 【91】。批次的大小通常约为 1 秒，这是对性能妥协的结果：较小的批次会导致更大的调度与协调开销，而较大的批次意味着流处理器结果可见之前的延迟要更长。

微批次也隐式提供了一个与批次大小相等的滚动窗口（按处理时间而不是事件时间戳分窗）。任何需要更大窗口的作业都需要显式地将状态从一个微批次转移到下一个微批次。

Apache Flink 则使用不同的方法，它会定期生成状态的滚动存档点并将其写入持久存储【92,93】。如果流算子崩溃，它可以从最近的存档点重启，并丢弃从最近存档点到崩溃之间的所有输出。存档点会由消息流中的 **壁障（barrier）** 触发，类似于微批次之间的边界，但不会强制一个特定的窗口大小。

在流处理框架的范围内，微批次与存档点方法提供了与批处理一样的 **恰好一次语义**。但是，只要输出离开流处理器（例如，写入数据库，向外部消息代理发送消息，或发送电子邮件），框架就无法抛弃失败批次的输出了。在这种情况下，重启失败任务会导致外部副作用发生两次，只有微批次或存档点不足以阻止这一问题。

#### 原子提交再现

为了在出现故障时表现出恰好处理一次的样子，我们需要确保事件处理的所有输出和副作用 **当且仅当** 处理成功时才会生效。这些影响包括发送给下游算子或外部消息传递系统（包括电子邮件或推送通知）的任何消息，任何数据库写入，对算子状态的任何变更，以及对输入消息的任何确认（包括在基于日志的消息代理中将消费者偏移量前移）。

这些事情要么都原子地发生，要么都不发生，但是它们不应当失去同步。如果这种方法听起来很熟悉，那是因为我们在分布式事务和两阶段提交的上下文中讨论过它（请参阅 “[恰好一次的消息处理](/v1/ch9#恰好一次的消息处理)”）。

在 [第九章](/v1/ch9) 中，我们讨论了分布式事务传统实现中的问题（如 XA）。然而在限制更为严苛的环境中，也是有可能高效实现这种原子提交机制的。Google Cloud Dataflow【81,92】和 VoltDB 【94】中使用了这种方法，Apache Kafka 有计划加入类似的功能【95,96】。与 XA 不同，这些实现不会尝试跨异构技术提供事务，而是通过在流处理框架中同时管理状态变更与消息传递来内化事务。事务协议的开销可以通过在单个事务中处理多个输入消息来分摊。

#### 幂等性

我们的目标是丢弃任何失败任务的部分输出，以便能安全地重试，而不会生效两次。分布式事务是实现这个目标的一种方式，而另一种方式是依赖 **幂等性（idempotence）**【97】。

幂等操作是多次重复执行与单次执行效果相同的操作。例如，将键值存储中的某个键设置为某个特定值是幂等的（再次写入该值，只是用同样的值替代），而递增一个计数器不是幂等的（再次执行递增意味着该值递增两次）。

即使一个操作不是天生幂等的，往往可以通过一些额外的元数据做成幂等的。例如，在使用来自 Kafka 的消息时，每条消息都有一个持久的、单调递增的偏移量。将值写入外部数据库时可以将这个偏移量带上，这样你就可以判断一条更新是不是已经执行过了，因而避免重复执行。

Storm 的 Trident 基于类似的想法来处理状态【78】。依赖幂等性意味着隐含了一些假设：重启一个失败的任务必须以相同的顺序重播相同的消息（基于日志的消息代理能做这些事），处理必须是确定性的，没有其他节点能同时更新相同的值【98,99】。

当从一个处理节点故障切换到另一个节点时，可能需要进行 **防护**（fencing，请参阅 “[领导者和锁](/v1/ch8#领导者和锁)”），以防止被假死节点干扰。尽管有这么多注意事项，幂等操作是一种实现 **恰好一次语义** 的有效方式，仅需很小的额外开销。

#### 失败后重建状态

任何需要状态的流处理 —— 例如，任何窗口聚合（例如计数器，平均值和直方图）以及任何用于连接的表和索引，都必须确保在失败之后能恢复其状态。

一种选择是将状态保存在远程数据存储中，并进行复制，然而正如在 “[流表连接（流扩充）](#流表连接（流扩充）)” 中所述，每个消息都要查询远程数据库可能会很慢。另一种方法是在流处理器本地保存状态，并定期复制。然后当流处理器从故障中恢复时，新任务可以读取状态副本，恢复处理而不丢失数据。

例如，Flink 定期捕获算子状态的快照，并将它们写入 HDFS 等持久存储中【92,93】。Samza 和 Kafka Streams 通过将状态变更发送到具有日志压缩功能的专用 Kafka 主题来复制状态变更，这与变更数据捕获类似【84,100】。VoltDB 通过在多个节点上对每个输入消息进行冗余处理来复制状态（请参阅 “[真的串行执行](/v1/ch7#真的串行执行)”）。

在某些情况下，甚至可能都不需要复制状态，因为它可以从输入流重建。例如，如果状态是从相当短的窗口中聚合而成，则简单地重播该窗口中的输入事件可能是足够快的。如果状态是通过变更数据捕获来维护的数据库的本地副本，那么也可以从日志压缩的变更流中重建数据库（请参阅 “[日志压缩](#日志压缩)”）。

然而，所有这些权衡取决于底层基础架构的性能特征：在某些系统中，网络延迟可能低于磁盘访问延迟，网络带宽也可能与磁盘带宽相当。没有针对所有情况的普适理想权衡，随着存储和网络技术的发展，本地状态与远程状态的优点也可能会互换。


## 本章小结

在本章中，我们讨论了事件流，它们所服务的目的，以及如何处理它们。在某些方面，流处理非常类似于在 [第十章](/v1/ch10) 中讨论的批处理，不过是在无限的（永无止境的）流而不是固定大小的输入上持续进行。从这个角度来看，消息代理和事件日志可以视作文件系统的流式等价物。

我们花了一些时间比较两种消息代理：

AMQP/JMS 风格的消息代理
: 代理将单条消息分配给消费者，消费者在成功处理单条消息后确认消息。消息被确认后从代理中删除。这种方法适合作为一种异步形式的 RPC（另请参阅 “[消息传递中的数据流](/v1/ch4#消息传递中的数据流)”），例如在任务队列中，消息处理的确切顺序并不重要，而且消息在处理完之后，不需要回头重新读取旧消息。

基于日志的消息代理
: 代理将一个分区中的所有消息分配给同一个消费者节点，并始终以相同的顺序传递消息。并行是通过分区实现的，消费者通过存档最近处理消息的偏移量来跟踪工作进度。消息代理将消息保留在磁盘上，因此如有必要的话，可以回跳并重新读取旧消息。

基于日志的方法与数据库中的复制日志（请参阅 [第五章](/v1/ch5)）和日志结构存储引擎（请参阅 [第三章](/v1/ch3)）有相似之处。我们看到，这种方法对于消费输入流，并产生衍生状态或衍生输出数据流的系统而言特别适用。

就流的来源而言，我们讨论了几种可能性：用户活动事件，定期读数的传感器，和 Feed 数据（例如，金融中的市场数据）能够自然地表示为流。我们发现将数据库写入视作流也是很有用的：我们可以捕获变更日志 —— 即对数据库所做的所有变更的历史记录 —— 隐式地通过变更数据捕获，或显式地通过事件溯源。日志压缩允许流也能保有数据库内容的完整副本。

将数据库表示为流为系统集成带来了很多强大机遇。通过消费变更日志并将其应用至衍生系统，你能使诸如搜索索引、缓存以及分析系统这类衍生数据系统不断保持更新。你甚至能从头开始，通过读取从创世至今的所有变更日志，为现有数据创建全新的视图。

像流一样维护状态以及消息重播的基础设施，是在各种流处理框架中实现流连接和容错的基础。我们讨论了流处理的几种目的，包括搜索事件模式（复杂事件处理），计算分窗聚合（流分析），以及保证衍生数据系统处于最新状态（物化视图）。

然后我们讨论了在流处理中对时间进行推理的困难，包括处理时间与事件时间戳之间的区别，以及当你认为窗口已经完事之后，如何处理到达的掉队事件的问题。

我们区分了流处理中可能出现的三种连接类型：

流流连接
: 两个输入流都由活动事件组成，而连接算子在某个时间窗口内搜索相关的事件。例如，它可能会将同一个用户 30 分钟内进行的两个活动联系在一起。如果你想要找出一个流内的相关事件，连接的两侧输入可能实际上都是同一个流（**自连接**，即 self-join）。

流表连接
: 一个输入流由活动事件组成，另一个输入流是数据库变更日志。变更日志保证了数据库的本地副本是最新的。对于每个活动事件，连接算子将查询数据库，并输出一个扩展的活动事件。

表表连接
: 两个输入流都是数据库变更日志。在这种情况下，一侧的每一个变化都与另一侧的最新状态相连接。结果是两表连接所得物化视图的变更流。

最后，我们讨论了在流处理中实现容错和恰好一次语义的技术。与批处理一样，我们需要放弃任何失败任务的部分输出。然而由于流处理长时间运行并持续产生输出，所以不能简单地丢弃所有的输出。相反，可以使用更细粒度的恢复机制，基于微批次、存档点、事务或幂等写入。

## 参考文献

1. Tyler Akidau, Robert Bradshaw, Craig Chambers, et al.: “[The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 12, pages 1792–1803, August 2015. [doi:10.14778/2824032.2824076](http://dx.doi.org/10.14778/2824032.2824076)
1. Harold Abelson, Gerald Jay Sussman, and Julie Sussman: [*Structure and Interpretation of Computer Programs*](https://web.archive.org/web/20220807043536/https://mitpress.mit.edu/sites/default/files/sicp/index.html), 2nd edition. MIT Press, 1996. ISBN: 978-0-262-51087-5, available online at *mitpress.mit.edu*
1. Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
1. Joseph M. Hellerstein and Michael Stonebraker: [*Readings in Database Systems*](http://redbook.cs.berkeley.edu/), 4th edition. MIT Press, 2005. ISBN: 978-0-262-69314-1, available online at *redbook.cs.berkeley.edu*
1. Don Carney, Uğur Çetintemel, Mitch Cherniack, et al.: “[Monitoring Streams – A New Class of Data Management Applications](http://www.vldb.org/conf/2002/S07P02.pdf),” at *28th International Conference on Very Large Data Bases* (VLDB), August 2002.
1. Matthew Sackman: “[Pushing Back](https://wellquite.org/posts/lshift/pushing_back/),” *lshift.net*, May 5, 2016.
1. Vicent Martí: “[Brubeck, a statsd-Compatible Metrics Aggregator](http://githubengineering.com/brubeck/),” *githubengineering.com*, June 15, 2015.
1. Seth Lowenberger: “[MoldUDP64 Protocol Specification V 1.00](http://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf),” *nasdaqtrader.com*, July 2009.
1. Pieter Hintjens: [*ZeroMQ – The Guide*](http://zguide.zeromq.org/page:all). O'Reilly Media, 2013. ISBN: 978-1-449-33404-8
1. Ian Malpass: “[Measure Anything, Measure Everything](https://codeascraft.com/2011/02/15/measure-anything-measure-everything/),” *codeascraft.com*, February 15, 2011.
1. Dieter Plaetinck: “[25 Graphite, Grafana and statsd Gotchas](https://grafana.com/blog/2016/03/03/25-graphite-grafana-and-statsd-gotchas/),” *grafana.com*, March 3, 2016.
1. Jeff Lindsay: “[Web Hooks to Revolutionize the Web](https://web.archive.org/web/20180928201955/http://progrium.com/blog/2007/05/03/web-hooks-to-revolutionize-the-web/),” *progrium.com*, May 3, 2007.
1. Jim N. Gray: “[Queues Are Databases](https://arxiv.org/pdf/cs/0701158.pdf),” Microsoft Research Technical Report MSR-TR-95-56, December 1995.
1. Mark Hapner, Rich Burridge, Rahul Sharma, et al.: “[JSR-343 Java Message Service (JMS) 2.0 Specification](https://jcp.org/en/jsr/detail?id=343),” *jms-spec.java.net*, March 2013.
1. Sanjay Aiyagari, Matthew Arrott, Mark Atwell, et al.: “[AMQP: Advanced Message Queuing Protocol Specification](http://www.rabbitmq.com/resources/specs/amqp0-9-1.pdf),” Version 0-9-1, November 2008.
1. “[Google Cloud Pub/Sub: A Google-Scale Messaging Service](https://cloud.google.com/pubsub/architecture),” *cloud.google.com*, 2016.
1. “[Apache Kafka 0.9 Documentation](http://kafka.apache.org/documentation.html),” *kafka.apache.org*, November 2015.
1. Jay Kreps, Neha Narkhede, and Jun Rao: “[Kafka: A Distributed Messaging System for Log Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/09/Kafka.pdf),” at *6th International Workshop on Networking Meets Databases* (NetDB), June 2011.
1. “[Amazon Kinesis Streams Developer Guide](http://docs.aws.amazon.com/streams/latest/dev/introduction.html),” *docs.aws.amazon.com*, April 2016.
1. Leigh Stewart and Sijie Guo: “[Building DistributedLog: Twitter’s High-Performance Replicated Log Service](https://blog.twitter.com/2015/building-distributedlog-twitter-s-high-performance-replicated-log-service),” *blog.twitter.com*, September 16, 2015.
1. “[DistributedLog Documentation](https://web.archive.org/web/20210517201308/https://bookkeeper.apache.org/distributedlog/docs/latest/),” Apache Software Foundation, *distributedlog.io*.
1. Jay Kreps: “[Benchmarking Apache Kafka: 2 Million Writes Per Second (On Three Cheap Machines)](https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines),” *engineering.linkedin.com*, April 27, 2014.
1. Kartik Paramasivam: “[How We’re Improving and Advancing Kafka at LinkedIn](https://engineering.linkedin.com/apache-kafka/how-we_re-improving-and-advancing-kafka-linkedin),” *engineering.linkedin.com*, September 2, 2015.
1. Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
1. Shirshanka Das, Chavdar Botev, Kapil Surlaker, et al.: “[All Aboard the Databus!](http://www.socc2012.org/s18-das.pdf),” at *3rd ACM Symposium on Cloud Computing* (SoCC), October 2012.
1. Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, et al.: “[Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. P. P. S. Narayan: “[Sherpa Update](http://web.archive.org/web/20160801221400/https://developer.yahoo.com/blogs/ydn/sherpa-7992.html),” *developer.yahoo.com*, June 8, 2010.
1. Martin Kleppmann: “[Bottled Water: Real-Time Integration of PostgreSQL and Kafka](http://martin.kleppmann.com/2015/04/23/bottled-water-real-time-postgresql-kafka.html),” *martin.kleppmann.com*, April 23, 2015.
1. Ben Osheroff: “[Introducing Maxwell, a mysql-to-kafka Binlog Processor](https://web.archive.org/web/20170208100334/https://developer.zendesk.com/blog/introducing-maxwell-a-mysql-to-kafka-binlog-processor),” *developer.zendesk.com*, August 20, 2015.
1. Randall Hauch: “[Debezium 0.2.1 Released](https://debezium.io/blog/2016/06/10/Debezium-0.2.1-Released/),” *debezium.io*, June 10, 2016.
1. Prem Santosh Udaya Shankar: “[Streaming MySQL Tables in Real-Time to Kafka](https://engineeringblog.yelp.com/2016/08/streaming-mysql-tables-in-real-time-to-kafka.html),” *engineeringblog.yelp.com*, August 1, 2016.
1. “[Mongoriver](https://github.com/stripe/mongoriver),” Stripe, Inc., *github.com*, September 2014.
1. Dan Harvey: “[Change Data Capture with Mongo + Kafka](http://www.slideshare.net/danharvey/change-data-capture-with-mongodb-and-kafka),” at *Hadoop Users Group UK*, August 2015.
1. “[Oracle GoldenGate 12c: Real-Time Access to Real-Time Information](https://web.archive.org/web/20160923105841/http://www.oracle.com/us/products/middleware/data-integration/oracle-goldengate-realtime-access-2031152.pdf),” Oracle White Paper, March 2015.
1. “[Oracle GoldenGate Fundamentals: How Oracle GoldenGate Works](https://www.youtube.com/watch?v=6H9NibIiPQE),” Oracle Corporation, *youtube.com*, November 2012.
1. Slava Akhmechet: “[Advancing the Realtime Web](http://rethinkdb.com/blog/realtime-web/),” *rethinkdb.com*, January 27, 2015.
1. “[Firebase Realtime Database Documentation](https://firebase.google.com/docs/database/),” Google, Inc., *firebase.google.com*, May 2016.
1. “[Apache CouchDB 1.6 Documentation](http://docs.couchdb.org/en/latest/),” *docs.couchdb.org*, 2014.
1. Matt DeBergalis: “[Meteor 0.7.0: Scalable Database Queries Using MongoDB Oplog Instead of Poll-and-Diff](https://web.archive.org/web/20160324055429/http://info.meteor.com/blog/meteor-070-scalable-database-queries-using-mongodb-oplog-instead-of-poll-and-diff),” *info.meteor.com*, December 17, 2013.
1. “[Chapter 15. Importing and Exporting Live Data](https://docs.voltdb.com/UsingVoltDB/ChapExport.php),” VoltDB 6.4 User Manual, *docs.voltdb.com*, June 2016.
1. Neha Narkhede: “[Announcing Kafka Connect: Building Large-Scale Low-Latency Data Pipelines](http://www.confluent.io/blog/announcing-kafka-connect-building-large-scale-low-latency-data-pipelines),” *confluent.io*, February 18, 2016.
1. Greg Young: “[CQRS and Event Sourcing](https://www.youtube.com/watch?v=JHGkaShoyNs),” at *Code on the Beach*, August 2014.
1. Martin Fowler: “[Event Sourcing](http://martinfowler.com/eaaDev/EventSourcing.html),” *martinfowler.com*, December 12, 2005.
1. Vaughn Vernon: [*Implementing Domain-Driven Design*](https://www.informit.com/store/implementing-domain-driven-design-9780321834577). Addison-Wesley Professional, 2013. ISBN: 978-0-321-83457-7
1. H. V. Jagadish, Inderpal Singh Mumick, and Abraham Silberschatz: “[View Maintenance Issues for the Chronicle Data Model](https://dl.acm.org/doi/10.1145/212433.220201),” at *14th ACM SIGACT-SIGMOD-SIGART Symposium on Principles of Database Systems* (PODS), May 1995. [doi:10.1145/212433.220201](http://dx.doi.org/10.1145/212433.220201)
1. “[Event Store 3.5.0 Documentation](http://docs.geteventstore.com/),” Event Store LLP, *docs.geteventstore.com*, February 2016.
1. Martin Kleppmann: [*Making Sense of Stream Processing*](http://www.oreilly.com/data/free/stream-processing.csp). Report, O'Reilly Media, May 2016.
1. Sander Mak: “[Event-Sourced Architectures with Akka](http://www.slideshare.net/SanderMak/eventsourced-architectures-with-akka),” at *JavaOne*, September 2014.
1. Julian Hyde: [personal communication](https://twitter.com/julianhyde/status/743374145006641153), June 2016.
1. Ashish Gupta and Inderpal Singh Mumick: *Materialized Views: Techniques, Implementations, and Applications*. MIT Press, 1999. ISBN: 978-0-262-57122-7
1. Timothy Griffin and Leonid Libkin: “[Incremental Maintenance of Views with Duplicates](http://homepages.inf.ed.ac.uk/libkin/papers/sigmod95.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1995. [doi:10.1145/223784.223849](http://dx.doi.org/10.1145/223784.223849)
1. Pat Helland: “[Immutability Changes Everything](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper16.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Martin Kleppmann: “[Accounting for Computer Scientists](http://martin.kleppmann.com/2011/03/07/accounting-for-computer-scientists.html),” *martin.kleppmann.com*, March 7, 2011.
1. Pat Helland: “[Accountants Don't Use Erasers](https://web.archive.org/web/20200220161036/https://blogs.msdn.microsoft.com/pathelland/2007/06/14/accountants-dont-use-erasers/),” *blogs.msdn.com*, June 14, 2007.
1. Fangjin Yang: “[Dogfooding with Druid, Samza, and Kafka: Metametrics at Metamarkets](https://metamarkets.com/2015/dogfooding-with-druid-samza-and-kafka-metametrics-at-metamarkets/),” *metamarkets.com*, June 3, 2015.
1. Gavin Li, Jianqiu Lv, and Hang Qi: “[Pistachio: Co-Locate the Data and Compute for Fastest Cloud Compute](https://web.archive.org/web/20181214032620/https://yahoohadoop.tumblr.com/post/116365275781/pistachio-co-locate-the-data-and-compute-for),” *yahoohadoop.tumblr.com*, April 13, 2015.
1. Kartik Paramasivam: “[Stream Processing Hard Problems – Part 1: Killing Lambda](https://engineering.linkedin.com/blog/2016/06/stream-processing-hard-problems-part-1-killing-lambda),” *engineering.linkedin.com*, June 27, 2016.
1. Martin Fowler: “[CQRS](http://martinfowler.com/bliki/CQRS.html),” *martinfowler.com*, July 14, 2011.
1. Greg Young: “[CQRS Documents](https://cqrs.files.wordpress.com/2010/11/cqrs_documents.pdf),” *cqrs.files.wordpress.com*, November 2010.
1. Baron Schwartz: “[Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20161110094746/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/),” *xaprb.com*, December 28, 2013.
1. Daniel Eloff, Slava Akhmechet, Jay Kreps, et al.: “[Re: Turning the Database Inside-out with Apache Samza](https://news.ycombinator.com/item?id=9145197),” Hacker News discussion, *news.ycombinator.com*, March 4, 2015.
1. “[Datomic Development Resources: Excision](http://docs.datomic.com/excision.html),” Cognitect, Inc., *docs.datomic.com*.
1. “[Fossil Documentation: Deleting Content from Fossil](http://fossil-scm.org/index.html/doc/trunk/www/shunning.wiki),” *fossil-scm.org*, 2016.
1. Jay Kreps: “[The irony of distributed systems is that data loss is really easy but deleting data is surprisingly hard,](https://twitter.com/jaykreps/status/582580836425330688)” *twitter.com*, March 30, 2015.
1. David C. Luckham: “[What’s the Difference Between ESP and CEP?](http://www.complexevents.com/2006/08/01/what%E2%80%99s-the-difference-between-esp-and-cep/),” *complexevents.com*, August 1, 2006.
1. Srinath Perera: “[How Is Stream Processing and Complex Event Processing (CEP) Different?](https://www.quora.com/How-is-stream-processing-and-complex-event-processing-CEP-different),” *quora.com*, December 3, 2015.
1. Arvind Arasu, Shivnath Babu, and Jennifer Widom: “[The CQL Continuous Query Language: Semantic Foundations and Query Execution](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cql.pdf),” *The VLDB Journal*, volume 15, number 2, pages 121–142, June 2006. [doi:10.1007/s00778-004-0147-z](http://dx.doi.org/10.1007/s00778-004-0147-z)
1. Julian Hyde: “[Data in Flight: How Streaming SQL Technology Can Help Solve the Web 2.0 Data Crunch](http://queue.acm.org/detail.cfm?id=1667562),” *ACM Queue*, volume 7, number 11, December 2009. [doi:10.1145/1661785.1667562](http://dx.doi.org/10.1145/1661785.1667562)
1. “[Esper Reference, Version 5.4.0](http://esper.espertech.com/release-5.4.0/esper-reference/html_single/index.html),” EsperTech, Inc., *espertech.com*, April 2016.
1. Zubair Nabi, Eric Bouillet, Andrew Bainbridge, and Chris Thomas: “[Of Streams and Storms](https://web.archive.org/web/20170711081434/https://developer.ibm.com/streamsdev/wp-content/uploads/sites/15/2014/04/Streams-and-Storm-April-2014-Final.pdf),” IBM technical report, *developer.ibm.com*, April 2014.
1. Milinda Pathirage, Julian Hyde, Yi Pan, and Beth Plale: “[SamzaSQL: Scalable Fast Data Management with Streaming SQL](https://github.com/milinda/samzasql-hpbdc2016/blob/master/samzasql-hpbdc2016.pdf),” at *IEEE International Workshop on High-Performance Big Data Computing* (HPBDC), May 2016. [doi:10.1109/IPDPSW.2016.141](http://dx.doi.org/10.1109/IPDPSW.2016.141)
1. Philippe Flajolet, Éric Fusy, Olivier Gandouet, and Frédéric Meunier: “[HyperLogLog: The Analysis of a Near-Optimal Cardinality Estimation Algorithm](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf),” at *Conference on Analysis of Algorithms* (AofA), June 2007.
1. Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
1. Ian Hellström: “[An Overview of Apache Streaming Technologies](https://databaseline.bitbucket.io/an-overview-of-apache-streaming-technologies/),” *databaseline.bitbucket.io*, March 12, 2016.
1. Jay Kreps: “[Why Local State Is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing),” *oreilly.com*, July 31, 2014.
1. Shay Banon: “[Percolator](https://www.elastic.co/blog/percolator),” *elastic.co*, February 8, 2011.
1. Alan Woodward and Martin Kleppmann: “[Real-Time Full-Text Search with Luwak and Samza](http://martin.kleppmann.com/2015/04/13/real-time-full-text-search-luwak-samza.html),” *martin.kleppmann.com*, April 13, 2015.
1. “[Apache Storm 2.1.0 Documentation](https://storm.apache.org/releases/2.1.0/index.html),” *storm.apache.org*, October 2019.
1. Tyler Akidau: “[The World Beyond Batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102),” *oreilly.com*, January 20, 2016.
1. Stephan Ewen: “[Streaming Analytics with Apache Flink](https://www.confluent.io/resources/kafka-summit-2016/advanced-streaming-analytics-apache-flink-apache-kafka/),” at *Kafka Summit*, April 2016.
1. Tyler Akidau, Alex Balikov, Kaya Bekiroğlu, et al.: “[MillWheel: Fault-Tolerant Stream Processing at Internet Scale](http://research.google.com/pubs/pub41378.html),” at *39th International Conference on Very Large Data Bases* (VLDB), August 2013.
1. Alex Dean: “[Improving Snowplow's Understanding of Time](https://snowplow.io/blog/improving-snowplows-understanding-of-time/),” *snowplowanalytics.com*, September 15, 2015.
1. “[Windowing (Azure Stream Analytics)](https://msdn.microsoft.com/en-us/library/azure/dn835019.aspx),” Microsoft Azure Reference, *msdn.microsoft.com*, April 2016.
1. “[State Management](http://samza.apache.org/learn/documentation/0.10/container/state-management.html),” Apache Samza 0.10 Documentation, *samza.apache.org*, December 2015.
1. Rajagopal Ananthanarayanan, Venkatesh Basker, Sumit Das, et al.: “[Photon: Fault-Tolerant and Scalable Joining of Continuous Data Streams](http://research.google.com/pubs/pub41318.html),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2465272](http://dx.doi.org/10.1145/2463676.2465272)
1. Martin Kleppmann: “[Samza Newsfeed Demo](https://github.com/ept/newsfeed),” *github.com*, September 2014.
1. Ben Kirwin: “[Doing the Impossible: Exactly-Once Messaging Patterns in Kafka](http://ben.kirw.in/2014/11/28/kafka-patterns/),” *ben.kirw.in*, November 28, 2014.
1. Pat Helland: “[Data on the Outside Versus Data on the Inside](http://cidrdb.org/cidr2005/papers/P12.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Ralph Kimball and Margy Ross: *The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*, 3rd edition. John Wiley & Sons, 2013. ISBN: 978-1-118-53080-1
1. Viktor Klang: “[I'm coining the phrase 'effectively-once' for message processing with at-least-once + idempotent operations](https://twitter.com/viktorklang/status/789036133434978304),” *twitter.com*, October 20, 2016.
1. Matei Zaharia, Tathagata Das, Haoyuan Li, et al.: “[Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](https://www.usenix.org/system/files/conference/hotcloud12/hotcloud12-final28.pdf),” at *4th USENIX Conference in Hot Topics in Cloud Computing* (HotCloud), June 2012.
1. Kostas Tzoumas, Stephan Ewen, and Robert Metzger: “[High-Throughput, Low-Latency, and Exactly-Once Stream Processing with Apache Flink](https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink),” *ververica.com*, August 5, 2015.
1. Paris Carbone, Gyula Fóra, Stephan Ewen, et al.: “[Lightweight Asynchronous Snapshots for Distributed Dataflows](http://arxiv.org/abs/1506.08603),” arXiv:1506.08603 &#91;cs.DC&#93;, June 29, 2015.
1. Ryan Betts and John Hugg: [*Fast Data: Smart and at Scale*](http://www.oreilly.com/data/free/fast-data-smart-and-at-scale.csp). Report, O'Reilly Media, October 2015.
1. Flavio Junqueira: “[Making Sense of Exactly-Once Semantics](https://web.archive.org/web/20160812172900/http://conferences.oreilly.com/strata/hadoop-big-data-eu/public/schedule/detail/49690),” at *Strata+Hadoop World London*, June 2016.
1. Jason Gustafson, Flavio Junqueira, Apurva Mehta, Sriram Subramanian, and Guozhang Wang: “[KIP-98 – Exactly Once Delivery and Transactional Messaging](https://cwiki.apache.org/confluence/display/KAFKA/KIP-98+-+Exactly+Once+Delivery+and+Transactional+Messaging),” *cwiki.apache.org*, November 2016.
1. Pat Helland: “[Idempotence Is Not a Medical Condition](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=4b6dda7fe75b51e1c543a87ca7b3b322fbf55614),” *Communications of the ACM*, volume 55, number 5, page 56, May 2012. [doi:10.1145/2160718.2160734](http://dx.doi.org/10.1145/2160718.2160734)
1. Jay Kreps: “[Re: Trying to Achieve Deterministic Behavior on Recovery/Rewind](http://mail-archives.apache.org/mod_mbox/samza-dev/201409.mbox/%3CCAOeJiJg%2Bc7Ei%3DgzCuOz30DD3G5Hm9yFY%3DUJ6SafdNUFbvRgorg%40mail.gmail.com%3E),” email to *samza-dev* mailing list, September 9, 2014.
1. E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson: “[A Survey of Rollback-Recovery Protocols in Message-Passing Systems](http://www.cs.utexas.edu/~lorenzo/papers/SurveyFinal.pdf),” *ACM Computing Surveys*, volume 34, number 3, pages 375–408, September 2002. [doi:10.1145/568522.568525](http://dx.doi.org/10.1145/568522.568525)
1. Adam Warski: “[Kafka Streams – How Does It Fit the Stream Processing Landscape?](https://softwaremill.com/kafka-streams-how-does-it-fit-stream-landscape/),” *softwaremill.com*, June 1, 2016.


================================================
FILE: content/v1/ch12.md
================================================
---
title: "第十二章：数据系统的未来"
linkTitle: "12. 数据系统的未来"
weight: 312
breadcrumbs: false
---


![](/map/ch12.png)

> 如果船长的终极目标是保护船只，他应该永远待在港口。
>
> —— 圣托马斯・阿奎那《神学大全》（1265-1274）

到目前为止，本书主要描述的是 **现状**。在这最后一章中，我们将放眼 **未来**，讨论事情应该是怎么样的：我将提出一些想法与方法，我相信它们能从根本上改进我们设计与构建应用的方式。

对未来的看法与推测当然具有很大的主观性。所以在撰写本章时，当提及我个人的观点时会使用第一人称。你完全可以不同意这些观点并提出自己的看法，但我希望本章中的概念，至少能成为富有成效的讨论出发点，并澄清一些经常被混淆的概念。

[第一章](/v1/ch1) 概述了本书的目标：探索如何创建 **可靠**、**可伸缩** 和 **可维护** 的应用与系统。这一主题贯穿了所有的章节：例如，我们讨论了许多有助于提高可靠性的容错算法，有助于提高可伸缩性的分区，以及有助于提高可维护性的演化与抽象机制。在本章中，我们将把所有这些想法结合在一起，并在它们的基础上展望未来。我们的目标是，发现如何设计出比现有应用更好的应用 —— 健壮、正确、可演化、且最终对人类有益。

## 数据集成

本书中反复出现的主题是，对于任何给定的问题都会有好几种解决方案，所有这些解决方案都有不同的优缺点与利弊权衡。例如在 [第三章](/v1/ch3) 讨论存储引擎时，我们看到了日志结构存储、B 树以及列式存储。在 [第五章](/v1/ch5) 讨论复制时，我们看到了单领导者、多领导者和无领导者的方法。

如果你有一个类似于 “我想存储一些数据并稍后再查询” 的问题，那么并没有一种正确的解决方案。但对于不同的具体环境，总会有不同的合适方法。软件实现通常必须选择一种特定的方法。使单条代码路径能做到稳定健壮且表现良好已经是一件非常困难的事情了 —— 尝试在单个软件中完成所有事情，几乎可以保证，实现效果会很差。

因此软件工具的最佳选择也取决于情况。每一种软件，甚至所谓的 “通用” 数据库，都是针对特定的使用模式设计的。

面对让人眼花缭乱的诸多替代品，第一个挑战就是弄清软件与其适用环境的映射关系。供应商不愿告诉你他们软件不适用的工作负载，这是可以理解的。但是希望先前的章节能给你提供一些问题，让你读出字里行间的言外之意，并更好地理解这些权衡。

但是，即使你已经完全理解各种工具与其适用环境间的关系，还有一个挑战：在复杂的应用中，数据的用法通常花样百出。不太可能存在适用于 **所有** 不同数据应用场景的软件，因此你不可避免地需要拼凑几个不同的软件，以提供应用所需的功能。

### 组合使用衍生数据的工具

例如，为了处理任意关键词的搜索查询，将 OLTP 数据库与全文搜索索引集成在一起是很常见的需求。尽管一些数据库（例如 PostgreSQL）包含了全文索引功能，对于简单的应用完全够了【1】，但更复杂的搜索能力就需要专业的信息检索工具了。相反的是，搜索索引通常不适合作为持久的记录系统，因此许多应用需要组合这两种不同的工具以满足所有需求。

我们在 “[保持系统同步](/v1/ch11#保持系统同步)” 中接触过集成数据系统的问题。随着数据不同表示形式的增加，集成问题变得越来越困难。除了数据库和搜索索引之外，也许你需要在分析系统（数据仓库，或批处理和流处理系统）中维护数据副本；维护从原始数据中衍生的缓存，或反规范化的数据版本；将数据灌入机器学习、分类、排名或推荐系统中；或者基于数据变更发送通知。

令人惊讶的是，我经常看到软件工程师做出这样的陈述：“根据我的经验，99% 的人只需要 X” 或者 “...... 不需要 X”（对于各种各样的 X）。我认为这种陈述更像是发言人自己的经验，而不是技术实际上的实用性。可能对数据执行的操作，其范围极其宽广。某人认为鸡肋而毫无意义的功能可能是别人的核心需求。当你拉高视角，并考虑跨越整个组织范围的数据流时，数据集成的需求往往就会变得明显起来。

#### 理解数据流

当需要在多个存储系统中维护相同数据的副本以满足不同的访问模式时，你要对输入和输出了如指掌：哪些数据先写入，哪些数据表示衍生自哪些来源？如何以正确的格式，将所有数据导入正确的地方？

例如，你可能会首先将数据写入 **记录系统** 数据库，捕获对该数据库所做的变更（请参阅 “[变更数据捕获](/v1/ch11#变更数据捕获)”），然后将变更以相同的顺序应用于搜索索引。如果变更数据捕获（CDC）是更新索引的唯一方式，则可以确定该索引完全派生自记录系统，因此与其保持一致（除软件错误外）。写入数据库是向该系统提供新输入的唯一方式。

允许应用程序直接写入搜索索引和数据库引入了如 [图 11-4](/v1/ddia_1104.png) 所示的问题，其中两个客户端同时发送冲突的写入，且两个存储系统按不同顺序处理它们。在这种情况下，既不是数据库说了算，也不是搜索索引说了算，所以它们做出了相反的决定，进入彼此间持久性的不一致状态。

如果你可以通过单个系统来提供所有用户输入，从而决定所有写入的排序，则通过按相同顺序处理写入，可以更容易地衍生出其他数据表示。这是状态机复制方法的一个应用，我们在 “[全序广播](/v1/ch9#全序广播)” 中看到。无论你使用变更数据捕获还是事件溯源日志，都不如简单的基于全序的决策原则更重要。

基于事件日志来更新衍生数据的系统，通常可以做到 **确定性** 与 **幂等性**（请参阅 “[幂等性](/v1/ch11#幂等性)”），使得从故障中恢复相当容易。

#### 衍生数据与分布式事务

保持不同数据系统彼此一致的经典方法涉及分布式事务，如 “[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)” 中所述。与分布式事务相比，使用衍生数据系统的方法如何？

在抽象层面，它们通过不同的方式达到类似的目标。分布式事务通过 **锁** 进行互斥来决定写入的顺序（请参阅 “[两阶段锁定](/v1/ch7#两阶段锁定)”），而 CDC 和事件溯源使用日志进行排序。分布式事务使用原子提交来确保变更只生效一次，而基于日志的系统通常基于 **确定性重试** 和 **幂等性**。

最大的不同之处在于事务系统通常提供 [线性一致性](/v1/ch9#线性一致性)，这包含着有用的保证，例如 [读己之写](/v1/ch5#读己之写)。另一方面，衍生数据系统通常是异步更新的，因此它们默认不会提供相同的时序保证。

在愿意为分布式事务付出代价的有限场景中，它们已被成功应用。但是，我认为 XA 的容错能力和性能很差劲（请参阅 “[实践中的分布式事务](/v1/ch9#实践中的分布式事务)”），这严重限制了它的实用性。我相信为分布式事务设计一种更好的协议是可行的。但使这样一种协议被现有工具广泛接受是很有挑战的，且不是立竿见影的事。

在没有广泛支持的良好分布式事务协议的情况下，我认为基于日志的衍生数据是集成不同数据系统的最有前途的方法。然而，诸如读己之写的保证是有用的，我认为告诉所有人 “最终一致性是不可避免的 —— 忍一忍并学会和它打交道” 是没有什么建设性的（至少在缺乏 **如何** 应对的良好指导时）。

在 “[将事情做正确](#将事情做正确)” 中，我们将讨论一些在异步衍生系统之上实现更强保障的方法，并迈向分布式事务和基于日志的异步系统之间的中间地带。

#### 全序的限制

对于足够小的系统，构建一个完全有序的事件日志是完全可行的（正如单主复制数据库的流行所证明的那样，它正好建立了这样一种日志）。但是，随着系统向更大更复杂的工作负载伸缩，限制开始出现：

* 在大多数情况下，构建完全有序的日志，需要所有事件汇集于决定顺序的 **单个领导者节点**。如果事件吞吐量大于单台计算机的处理能力，则需要将其分区到多台计算机上（请参阅 “[分区日志](/v1/ch11#分区日志)”）。然后两个不同分区中的事件顺序关系就不明确了。
* 如果服务器分布在多个 **地理位置分散** 的数据中心上，例如为了容忍整个数据中心掉线，你通常在每个数据中心都有单独的主库，因为网络延迟会导致同步的跨数据中心协调效率低下（请参阅 “[多主复制](/v1/ch5#多主复制)”）。这意味着源自两个不同数据中心的事件顺序未定义。
* 将应用程序部署为微服务时（请参阅 “[服务中的数据流：REST 与 RPC](/v1/ch4#服务中的数据流：REST与RPC)”），常见的设计选择是将每个服务及其持久状态作为独立单元进行部署，服务之间不共享持久状态。当两个事件来自不同的服务时，这些事件间的顺序未定义。
* 某些应用程序在客户端保存状态，该状态在用户输入时立即更新（无需等待服务器确认），甚至可以继续脱机工作（请参阅 “[需要离线操作的客户端](/v1/ch5#需要离线操作的客户端)”）。对于这样的应用程序，客户端和服务器很可能以不同的顺序看到事件。

在形式上，决定事件的全局顺序称为 **全序广播**，相当于 **共识**（请参阅 “[共识算法和全序广播](/v1/ch9#共识算法和全序广播)”）。大多数共识算法都是针对单个节点的吞吐量足以处理整个事件流的情况而设计的，并且这些算法不提供多个节点共享事件排序工作的机制。设计可以伸缩至单个节点的吞吐量之上，且在地理位置分散的环境中仍然工作良好的共识算法，仍然是一个开放的研究问题。

#### 排序事件以捕获因果关系

在事件之间不存在因果关系的情况下，全序的缺乏并不是一个大问题，因为并发事件可以任意排序。其他一些情况很容易处理：例如，当同一对象有多个更新时，它们可以通过将特定对象 ID 的所有更新路由到相同的日志分区来完全排序。然而，因果关系有时会以更微妙的方式出现（请参阅 “[顺序与因果关系](/v1/ch9#顺序与因果关系)”）。

例如，考虑一个社交网络服务，以及一对曾处于恋爱关系但刚分手的用户。其中一个用户将另一个用户从好友中移除，然后向剩余的好友发送消息，抱怨他们的前任。用户的心思是他们的前任不应该看到这些粗鲁的消息，因为消息是在好友状态解除后发送的。

但是如果好友关系状态与消息存储在不同的地方，在这样一个系统中，可能会出现 **解除好友** 事件与 **发送消息** 事件之间的因果依赖丢失的情况。如果因果依赖关系没有被捕捉到，则发送有关新消息的通知的服务可能会在 **解除好友** 事件之前处理 **发送消息** 事件，从而错误地向前任发送通知。

在本例中，通知实际上是消息和好友列表之间的连接，使得它与我们先前讨论的连接的时序问题有关（请参阅 “[连接的时间依赖性](/v1/ch11#连接的时间依赖性)”）。不幸的是，这个问题似乎并没有一个简单的答案【2,3】。起点包括：

* 逻辑时间戳可以提供无需协调的全局顺序（请参阅 “[序列号顺序](/v1/ch9#序列号顺序)”），因此它们可能有助于全序广播不可行的情况。但是，它们仍然要求接收方处理乱序送达的事件，并且需要传递其他元数据。
* 如果你可以记录一个事件来记录用户在做出决定之前所看到的系统状态，并给该事件一个唯一的标识符，那么后面的任何事件都可以引用该事件标识符来记录因果关系【4】。我们将在 “[读也是事件](#读也是事件)” 中回到这个想法。
* 冲突解决算法（请参阅 “[自动冲突解决](/v1/ch5#自动冲突解决)”）有助于处理以意外顺序传递的事件。它们对于维护状态很有用，但如果行为有外部副作用（例如，给用户发送通知），就没什么帮助了。

也许，随着时间的推移，会出现新的应用开发模式，使得能够有效地捕获因果依赖关系，并且保持正确的衍生状态，而不会迫使所有事件经历全序广播的瓶颈。

### 批处理与流处理

我会说数据集成的目标是，确保数据最终能在所有正确的地方表现出正确的形式。这样做需要消费输入、转换、连接、过滤、聚合、训练模型、评估、以及最终写出适当的输出。批处理和流处理是实现这一目标的工具。

批处理和流处理的输出是衍生数据集，例如搜索索引、物化视图、向用户显示的建议、聚合指标等（请参阅 “[批处理工作流的输出](/v1/ch10#批处理工作流的输出)” 和 “[流处理的应用](/v1/ch11#流处理的应用)”）。

正如我们在 [第十章](/v1/ch10) 和 [第十一章](/v1/ch11) 中看到的，批处理和流处理有许多共同的原则，主要的根本区别在于流处理器在无限数据集上运行，而批处理输入是已知的有限大小。处理引擎的实现方式也有很多细节上的差异，但是这些区别已经开始模糊。

Spark 在批处理引擎上执行流处理，将流分解为 **微批次（microbatches）**，而 Apache Flink 则在流处理引擎上执行批处理【5】。原则上，一种类型的处理可以用另一种类型来模拟，但是性能特征会有所不同：例如，在跳跃或滑动窗口上，微批次可能表现不佳【6】。

#### 维护衍生状态

批处理有着很强的函数式风格（即使其代码不是用函数式语言编写的）：它鼓励确定性的纯函数，其输出仅依赖于输入，除了显式输出外没有副作用，将输入视作不可变的，且输出是仅追加的。流处理与之类似，但它扩展了算子以允许受管理的、容错的状态（请参阅 “[失败后重建状态](/v1/ch11#失败后重建状态)”）。

具有良好定义的输入和输出的确定性函数的原理不仅有利于容错（请参阅 “[幂等性](/v1/ch11#幂等性)”），也简化了有关组织中数据流的推理【7】。无论衍生数据是搜索索引、统计模型还是缓存，采用这种观点思考都是很有帮助的：将其视为从一个东西衍生出另一个的数据管道，通过函数式应用代码推送一个系统的状态变更，并将其效果应用至衍生系统中。

原则上，衍生数据系统可以同步地维护，就像关系数据库在与索引表写入操作相同的事务中同步更新次级索引一样。然而，异步是使基于事件日志的系统稳健的原因：它允许系统的一部分故障被抑制在本地。而如果任何一个参与者失败，分布式事务将中止，因此它们倾向于通过将故障传播到系统的其余部分来放大故障（请参阅 “[分布式事务的限制](/v1/ch9#分布式事务的限制)”）。

我们在 “[分区与次级索引](/v1/ch6#分区与次级索引)” 中看到，次级索引经常跨越分区边界。具有次级索引的分区系统需要将写入发送到多个分区（如果索引按关键词分区的话）或将读取发送到所有分区（如果索引是按文档分区的话）。如果索引是异步维护的，这种跨分区通信也是最可靠和最可伸缩的【8】（另请参阅 “[多分区数据处理](#多分区数据处理)”）。

#### 应用演化后重新处理数据

在维护衍生数据时，批处理和流处理都是有用的。流处理允许将输入中的变化以低延迟反映在衍生视图中，而批处理允许重新处理大量累积的历史数据以便将新视图导出到现有数据集上。

特别是，重新处理现有数据为维护系统、演化并支持新功能和需求变更提供了一个良好的机制（请参阅 [第四章](/v1/ch4)）。没有重新进行处理，模式演化将仅限于简单的变化，例如向记录中添加新的可选字段或添加新类型的记录。无论是在写时模式还是在读时模式中都是如此（请参阅 “[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)”）。另一方面，通过重新处理，可以将数据集重组为一个完全不同的模型，以便更好地满足新的要求。

> ### 铁路上的模式迁移
>
> 大规模的 “模式迁移” 也发生在非计算机系统中。例如，在 19 世纪英国铁路建设初期，轨距（两轨之间的距离）就有了各种各样的竞争标准。为一种轨距而建的列车不能在另一种轨距的轨道上运行，这限制了火车网络中可能的相互连接【9】。
>
> 在 1846 年最终确定了一个标准轨距之后，其他轨距的轨道必须转换 —— 但是如何在不停运火车线路的情况下进行数月甚至数年的迁移？解决的办法是首先通过添加第三条轨道将轨道转换为 **双轨距（dual gauge）** 或 **混合轨距**。这种转换可以逐渐完成，当完成时，两种轨距的列车都可以在线路上跑，使用三条轨道中的两条。最终，一旦所有的列车都转换成标准轨距，就可以移除提供非标准轨距的那条轨道。
>
> 以这种方式 “再加工” 现有的轨道，让新旧版本并存，可以在几年的时间内逐渐改变轨距。然而，这是一项昂贵的事业，这就是今天非标准轨距仍然存在的原因。例如，旧金山湾区的 BART 系统使用了与美国大部分地区不同的轨距。

衍生视图允许 **渐进演化（gradual evolution）**。如果你想重新构建数据集，不需要执行突然切换式的迁移。取而代之的是，你可以将旧架构和新架构并排维护为相同基础数据上的两个独立衍生视图。然后可以开始将少量用户转移到新视图，以测试其性能并发现任何错误，而大多数用户仍然会被路由到旧视图。你可以逐渐地增加访问新视图的用户比例，最终可以删除旧视图【10】。

这种逐渐迁移的美妙之处在于，如果出现问题，每个阶段的过程都很容易逆转：你始终有一个可以回滚的可用系统。通过降低不可逆损害的风险，你能对继续前进更有信心，从而更快地改善系统【11】。

#### Lambda架构

如果批处理用于重新处理历史数据，而流处理用于处理最近的更新，那么如何将这两者结合起来？Lambda 架构【12】是这方面的一个建议，引起了很多关注。

Lambda 架构的核心思想是通过将不可变事件附加到不断增长的数据集来记录传入数据，这类似于事件溯源（请参阅 “[事件溯源](/v1/ch11#事件溯源)”）。为了从这些事件中衍生出读取优化的视图，Lambda 架构建议并行运行两个不同的系统：批处理系统（如 Hadoop MapReduce）和独立的流处理系统（如 Storm）。

在 Lambda 方法中，流处理器消耗事件并快速生成对视图的近似更新；批处理器稍后将使用同一组事件并生成衍生视图的更正版本。这个设计背后的原因是批处理更简单，因此不易出错，而流处理器被认为是不太可靠和难以容错的（请参阅 “[容错](/v1/ch11#容错)”）。而且，流处理可以使用快速近似算法，而批处理使用较慢的精确算法。

Lambda 架构是一种有影响力的想法，它将数据系统的设计变得更好，尤其是通过推广这样的原则：在不可变事件流上建立衍生视图，并在需要时重新处理事件。但是我也认为它有一些实际问题：

* 在批处理和流处理框架中维护相同的逻辑是很显著的额外工作。虽然像 Summingbird【13】这样的库提供了一种可以在批处理和流处理的上下文中运行的计算抽象，但调试、调整和维护两个不同系统的操作复杂性依然存在【14】。
* 由于流管道和批处理管道产生独立的输出，因此需要合并它们以响应用户请求。如果计算是基于滚动窗口的简单聚合，则合并相当容易，但如果视图基于更复杂的操作（例如连接和会话化）而导出，或者输出不是时间序列，则会变得非常困难。
* 尽管有能力重新处理整个历史数据集是很好的，但在大型数据集上这样做经常会开销巨大。因此，批处理流水线通常需要设置为处理增量批处理（例如，在每小时结束时处理一小时的数据），而不是重新处理所有内容。这引发了 “[时间推理](/v1/ch11#时间推理)” 中讨论的问题，例如处理滞留事件和处理跨批次边界的窗口。增量化批处理计算会增加复杂性，使其更类似于流式传输层，这与保持批处理层尽可能简单的目标背道而驰。

#### 统一批处理和流处理

最近的工作使得 Lambda 架构的优点在没有其缺点的情况下得以实现，允许批处理计算（重新处理历史数据）和流计算（在事件到达时即处理）在同一个系统中实现【15】。

在一个系统中统一批处理和流处理需要以下功能，这些功能也正在越来越广泛地被提供：

* 通过处理最近事件流的相同处理引擎来重播历史事件的能力。例如，基于日志的消息代理可以重播消息（请参阅 “[重播旧消息](/v1/ch11#重播旧消息)”），某些流处理器可以从 HDFS 等分布式文件系统读取输入。
* 对于流处理器来说，恰好一次语义 —— 即确保输出与未发生故障的输出相同，即使事实上发生故障（请参阅 “[容错](/v1/ch11#容错)”）。与批处理一样，这需要丢弃任何失败任务的部分输出。
* 按事件时间进行窗口化的工具，而不是按处理时间进行窗口化，因为处理历史事件时，处理时间毫无意义（请参阅 “[时间推理](/v1/ch11#时间推理)”）。例如，Apache Beam 提供了用于表达这种计算的 API，可以在 Apache Flink 或 Google Cloud Dataflow 使用。


## 分拆数据库

在最抽象的层面上，数据库，Hadoop 和操作系统都发挥相同的功能：它们存储一些数据，并允许你处理和查询这些数据【16】。数据库将数据存储为特定数据模型的记录（表中的行、文档、图中的顶点等），而操作系统的文件系统则将数据存储在文件中 —— 但其核心都是 “信息管理” 系统【17】。正如我们在 [第十章](/v1/ch10) 中看到的，Hadoop 生态系统有点像 Unix 的分布式版本。

当然，有很多实际的差异。例如，许多文件系统都不能很好地处理包含 1000 万个小文件的目录，而包含 1000 万个小记录的数据库完全是寻常而不起眼的。无论如何，操作系统和数据库之间的相似之处和差异值得探讨。

Unix 和关系数据库以非常不同的哲学来处理信息管理问题。Unix 认为它的目的是为程序员提供一种相当低层次的硬件的逻辑抽象，而关系数据库则希望为应用程序员提供一种高层次的抽象，以隐藏磁盘上数据结构的复杂性、并发性、崩溃恢复等等。Unix 发展出的管道和文件只是字节序列，而数据库则发展出了 SQL 和事务。

哪种方法更好？当然这取决于你想要的是什么。Unix 是 “简单的”，因为它是对硬件资源相当薄的包装；关系数据库是 “更简单” 的，因为一个简短的声明性查询可以利用很多强大的基础设施（查询优化、索引、连接方法、并发控制、复制等），而不需要查询的作者理解其实现细节。

这些哲学之间的矛盾已经持续了几十年（Unix 和关系模型都出现在 70 年代初），仍然没有解决。例如，我将 NoSQL 运动解释为，希望将类 Unix 的低级别抽象方法应用于分布式 OLTP 数据存储的领域。

在这一部分我将试图调和这两个哲学，希望我们能各取其美。

### 组合使用数据存储技术

在本书的过程中，我们讨论了数据库提供的各种功能及其工作原理，其中包括：

* 次级索引，使你可以根据字段的值有效地搜索记录（请参阅 “[其他索引结构](/v1/ch3#其他索引结构)”）
* 物化视图，这是一种预计算的查询结果缓存（请参阅 “[聚合：数据立方体和物化视图](/v1/ch3#聚合：数据立方体和物化视图)”）
* 复制日志，保持其他节点上数据的副本最新（请参阅 “[复制日志的实现](/v1/ch5#复制日志的实现)”）
* 全文搜索索引，允许在文本中进行关键字搜索（请参阅 “[全文搜索和模糊索引](/v1/ch3#全文搜索和模糊索引)”），也内置于某些关系数据库【1】

在 [第十章](/v1/ch10) 和 [第十一章](/v1/ch11) 中，出现了类似的主题。我们讨论了如何构建全文搜索索引（请参阅 “[批处理工作流的输出](/v1/ch10#批处理工作流的输出)”），了解了如何维护物化视图（请参阅 “[维护物化视图](/v1/ch11#维护物化视图)”）以及如何将变更从数据库复制到衍生数据系统（请参阅 “[变更数据捕获](/v1/ch11#变更数据捕获)”）。

数据库中内置的功能与人们用批处理和流处理器构建的衍生数据系统似乎有相似之处。

#### 创建索引

想想当你运行 `CREATE INDEX` 在关系数据库中创建一个新的索引时会发生什么。数据库必须扫描表的一致性快照，挑选出所有被索引的字段值，对它们进行排序，然后写出索引。然后它必须处理自一致快照以来所做的写入操作（假设表在创建索引时未被锁定，所以写操作可能会继续）。一旦完成，只要事务写入表中，数据库就必须继续保持索引最新。

此过程非常类似于设置新的从库副本（请参阅 “[设置新从库](/v1/ch5#设置新从库)”），也非常类似于流处理系统中的 **引导（bootstrap）** 变更数据捕获（请参阅 “[初始快照](/v1/ch11#初始快照)”）。

无论何时运行 `CREATE INDEX`，数据库都会重新处理现有数据集（如 “[应用演化后重新处理数据](#应用演化后重新处理数据)” 中所述），并将该索引作为新视图导出到现有数据上。现有数据可能是状态的快照，而不是所有发生变化的日志，但两者密切相关（请参阅 “[状态、流和不变性](/v1/ch11#状态、流和不变性)”）。

#### 一切的元数据库

有鉴于此，我认为整个组织的数据流开始像一个巨大的数据库【7】。每当批处理、流或 ETL 过程将数据从一个地方传输到另一个地方并组装时，它表现得就像数据库子系统一样，使索引或物化视图保持最新。

从这种角度来看，批处理和流处理器就像精心实现的触发器、存储过程和物化视图维护例程。它们维护的衍生数据系统就像不同的索引类型。例如，关系数据库可能支持 B 树索引、散列索引、空间索引（请参阅 “[多列索引](/v1/ch3#多列索引)”）以及其他类型的索引。在新兴的衍生数据系统架构中，不是将这些设施作为单个集成数据库产品的功能实现，而是由各种不同的软件提供，运行在不同的机器上，由不同的团队管理。

这些发展在未来将会把我们带到哪里？如果我们从没有适合所有访问模式的单一数据模型或存储格式的前提出发，我推测有两种途径可以将不同的存储和处理工具组合成一个有凝聚力的系统：

**联合数据库：统一读取**

可以为各种各样的底层存储引擎和处理方法提供一个统一的查询接口 —— 一种称为 **联合数据库（federated database）** 或 **多态存储（polystore）** 的方法【18,19】。例如，PostgreSQL 的 **外部数据包装器（foreign data wrapper）** 功能符合这种模式【20】。需要专用数据模型或查询接口的应用程序仍然可以直接访问底层存储引擎，而想要组合来自不同位置的数据的用户可以通过联合接口轻松完成操作。

联合查询接口遵循着单一集成系统的关系型传统，带有高级查询语言和优雅的语义，但实现起来非常复杂。

**分拆数据库：统一写入**

虽然联合能解决跨多个不同系统的只读查询问题，但它并没有很好的解决跨系统 **同步** 写入的问题。我们说过，在单个数据库中，创建一致的索引是一项内置功能。当我们构建多个存储系统时，我们同样需要确保所有数据变更都会在所有正确的位置结束，即使在出现故障时也是如此。想要更容易地将存储系统可靠地插接在一起（例如，通过变更数据捕获和事件日志），就像将数据库的索引维护功能以可以跨不同技术同步写入的方式分开【7,21】。

分拆方法遵循 Unix 传统的小型工具，它可以很好地完成一件事【22】，通过统一的低层级 API（管道）进行通信，并且可以使用更高层级的语言进行组合（shell）【16】 。

#### 开展分拆工作

联合和分拆是一个硬币的两面：用不同的组件构成可靠、可伸缩和可维护的系统。联合只读查询需要将一个数据模型映射到另一个数据模型，这需要一些思考，但最终还是一个可解决的问题。而我认为同步写入到几个存储系统是更困难的工程问题，所以我将重点关注它。

传统的同步写入方法需要跨异构存储系统的分布式事务【18】，我认为这是错误的解决方案（请参阅 “[衍生数据与分布式事务](#衍生数据与分布式事务)”）。单个存储或流处理系统内的事务是可行的，但是当数据跨越不同技术之间的边界时，我认为具有幂等写入的异步事件日志是一种更加健壮和实用的方法。

例如，分布式事务在某些流处理组件内部使用，以匹配 **恰好一次（exactly-once）** 语义（请参阅 “[原子提交再现](/v1/ch11#原子提交再现)”），这可以很好地工作。然而，当事务需要涉及由不同人群编写的系统时（例如，当数据从流处理组件写入分布式键值存储或搜索索引时），缺乏标准化的事务协议会使集成更难。有幂等消费者的有序事件日志（请参阅 “[幂等性](/v1/ch11#幂等性)”）是一种更简单的抽象，因此在异构系统中实现更加可行【7】。

基于日志的集成的一大优势是各个组件之间的 **松散耦合（loose coupling）**，这体现在两个方面：

1. 在系统级别，异步事件流使整个系统在个别组件的中断或性能下降时更加稳健。如果消费者运行缓慢或失败，那么事件日志可以缓冲消息（请参阅 “[磁盘空间使用](/v1/ch11#磁盘空间使用)”），以便生产者和任何其他消费者可以继续不受影响地运行。有问题的消费者可以在问题修复后赶上，因此不会错过任何数据，并且故障被控制在局部。相比之下，分布式事务的同步交互往往会将本地故障升级为大规模故障（请参阅 “[分布式事务的限制](/v1/ch9#分布式事务的限制)”）。
2. 在人力方面，分拆数据系统允许不同的团队独立开发，改进和维护不同的软件组件和服务。专业化使得每个团队都可以专注于做好一件事，并与其他团队的系统以明确的接口交互。事件日志提供了一个足够强大的接口，以捕获相当强的一致性属性（由于持久性和事件的顺序），但也足够普适于几乎任何类型的数据。

#### 分拆系统vs集成系统

如果分拆确实成为未来的方式，它也不会取代目前形式的数据库 —— 它们仍然会像以往一样被需要。为了维护流处理组件中的状态，数据库仍然是需要的，并且为批处理和流处理器的输出提供查询服务（请参阅 “[批处理工作流的输出](/v1/ch10#批处理工作流的输出)” 与 “[流处理](/v1/ch11#流处理)”）。专用查询引擎对于特定的工作负载仍然非常重要：例如，MPP 数据仓库中的查询引擎针对探索性分析查询进行了优化，并且能够很好地处理这种类型的工作负载（请参阅 “[Hadoop 与分布式数据库的对比](/v1/ch10#Hadoop与分布式数据库的对比)”）。

运行几种不同基础设施的复杂性可能是一个问题：每种软件都有一个学习曲线，配置问题和操作怪癖，因此部署尽可能少的移动部件是很有必要的。比起使用应用代码拼接多个工具而成的系统，单一集成软件产品也可以在其设计应对的工作负载类型上实现更好、更可预测的性能【23】。正如在前言中所说的那样，为了不需要的规模而构建系统是白费精力，而且可能会将你锁死在一个不灵活的设计中。实际上，这是一种过早优化的形式。

分拆的目标不是要针对个别数据库与特定工作负载的性能进行竞争；我们的目标是允许你结合多个不同的数据库，以便在比单个软件可能实现的更广泛的工作负载范围内实现更好的性能。这是关于广度，而不是深度 —— 与我们在 “[Hadoop 与分布式数据库的对比](/v1/ch10#Hadoop与分布式数据库的对比)” 中讨论的存储和处理模型的多样性一样。

因此，如果有一项技术可以满足你的所有需求，那么最好使用该产品，而不是试图用更低层级的组件重新实现它。只有当没有单一软件满足你的所有需求时，才会出现拆分和联合的优势。

#### 少了什么？

用于组成数据系统的工具正在变得越来越好，但我认为还缺少一个主要的东西：我们还没有与 Unix shell 类似的分拆数据库等价物（即，一种声明式的、简单的、用于组装存储和处理系统的高级语言）。

例如，如果我们可以像使用 Unix 管道【22】那样简单地声明 `mysql | elasticsearch`，作为 `CREATE INDEX` 的分拆等价物，那就太好了：它将读取 MySQL 数据库中的所有文档并将其索引到 Elasticsearch 集群中。然后它会不断捕获对数据库所做的所有变更，并自动将它们应用于搜索索引，而无需编写自定义应用代码。这种集成应当支持几乎任何类型的存储或索引系统。

同样，能够更容易地预先计算和更新缓存将是一件好事。回想一下，物化视图本质上是一个预先计算的缓存，所以你可以通过声明式地为复杂查询指定物化视图来创建缓存，包括图上的递归查询（请参阅 “[图数据模型](/v1/ch2#图数据模型)”）和应用逻辑。在这方面有一些有趣的早期研究，如 **差分数据流（differential dataflow）**【24,25】，我希望这些想法能够最终进入生产系统。

### 围绕数据流设计应用

使用应用代码组合专用存储与处理系统来分拆数据库的方法，也被称为 “**数据库由内而外（database inside-out）**” 方法【26】，该名称来源于我在 2014 年的一次会议演讲标题【27】。然而称它为 “新架构” 过于夸大，我仅将其看作是一种设计模式，一个讨论的起点，我们只是简单地给它起一个名字，以便我们能更好地讨论它。

这些想法不是我的；它们是很多人的思想的融合，这些思想非常值得我们学习。尤其是，以 Oz【28】和 Juttle【29】为代表的数据流语言，以 Elm【30,31】为代表的 **函数式响应式编程（functional reactive programming, FRP）**，以 Bloom【32】为代表的逻辑编程语言。在这一语境中的术语 **分拆（unbundling）** 是由 Jay Kreps 提出的【7】。

即使是 **电子表格** 也在数据流编程能力上甩开大多数主流编程语言几条街【33】。在电子表格中，可以将公式放入一个单元格中（例如，对另一列中的单元格求和），并且只要公式的任何输入发生变更，公式的结果都会自动重新计算。这正是我们在数据系统层次所需要的：当数据库中的记录发生变更时，我们希望自动更新该记录的任何索引，并且自动刷新依赖于记录的任何缓存视图或聚合。你不必担心这种刷新如何发生的技术细节，但能够简单地相信它可以正常工作。

因此，我认为绝大多数数据系统仍然可以从 VisiCalc 在 1979 年已经具备的功能中学习【34】。与电子表格的不同之处在于，今天的数据系统需要具有容错性，可伸缩性以及持久存储数据。它们还需要能够整合不同人群编写的不同技术，并重用现有的库和服务：期望使用某一种特定的语言、框架或工具来开发所有软件是不切实际的。

在本节中，我将详细介绍这些想法，并探讨一些围绕分拆数据库和数据流的想法构建应用的方法。

#### 应用代码作为衍生函数

当一个数据集衍生自另一个数据集时，它会经历某种转换函数。例如：

* 次级索引是由一种直白的转换函数生成的衍生数据集：对于基础表中的每行或每个文档，它挑选被索引的列或字段中的值，并按这些值排序（假设使用 B 树或 SSTable 索引，按键排序，如 [第三章](/v1/ch3) 所述）。
* 全文搜索索引是通过应用各种自然语言处理函数而创建的，诸如语言检测、分词、词干或词汇化、拼写纠正和同义词识别，然后构建用于高效查找的数据结构（例如倒排索引）。
* 在机器学习系统中，我们可以将模型视作从训练数据通过应用各种特征提取、统计分析函数衍生的数据，当模型应用于新的输入数据时，模型的输出是从输入和模型（因此间接地从训练数据）中衍生的。
* 缓存通常包含将以用户界面（UI）显示的形式的数据聚合。因此填充缓存需要知道 UI 中引用的字段；UI 中的变更可能需要更新缓存填充方式的定义，并重建缓存。

用于次级索引的衍生函数是如此常用的需求，以致于它作为核心功能被内建至许多数据库中，你可以简单地通过 `CREATE INDEX` 来调用它。对于全文索引，常见语言的基本语言特征可能内置到数据库中，但更复杂的特征通常需要领域特定的调整。在机器学习中，特征工程众所周知是高度特定于应用的，通常需要包含很多关于用户交互与应用部署的详细知识【35】。

当创建衍生数据集的函数不是像创建次级索引那样的标准搬砖函数时，需要自定义代码来处理特定于应用的东西。而这个自定义代码是让许多数据库挣扎的地方。虽然关系数据库通常支持触发器、存储过程和用户定义的函数，可以用它们来在数据库中执行应用代码，但它们有点像数据库设计里的事后反思（请参阅 “[传递事件流](/v1/ch11#传递事件流)”）。

#### 应用代码和状态的分离

理论上，数据库可以是任意应用代码的部署环境，就如同操作系统一样。然而实践中它们对这一目标适配得很差。它们不满足现代应用开发的要求，例如依赖和软件包管理、版本控制、滚动升级、可演化性、监控、指标、对网络服务的调用以及与外部系统的集成。

另一方面，Mesos、YARN、Docker、Kubernetes 等部署和集群管理工具专为运行应用代码而设计。通过专注于做好一件事情，它们能够做得比那些将执行用户定义函数作为众多功能之一的数据库好得多。

我认为让系统的某些部分专门用于持久数据存储并让其他部分专门运行应用程序代码是有意义的。这两者可以在保持独立的同时互动。

现在大多数 Web 应用程序都是作为无状态服务部署的，其中任何用户请求都可以路由到任何应用程序服务器，并且服务器在发送响应后会忘记所有请求。这种部署方式很方便，因为可以随意添加或删除服务器，但状态必须到某个地方：通常是数据库。趋势是将无状态应用程序逻辑与状态管理（数据库）分开：不将应用程序逻辑放入数据库中，也不将持久状态置于应用程序中【36】。正如函数式编程社区喜欢开玩笑说的那样，“我们相信 **教会（Church）** 与 **国家（state）** 的分离”【37】 [^i]

[^i]: 解释笑话很少会让人感觉更好，但我不想让任何人感到被遗漏。在这里，Church 指代的是数学家的阿隆佐・邱奇，他创立了 lambda 演算，这是计算的早期形式，是大多数函数式编程语言的基础。lambda 演算不具有可变状态（即没有变量可以被覆盖），所以可以说可变状态与 Church 的工作是分离的。

在这个典型的 Web 应用模型中，数据库充当一种可以通过网络同步访问的可变共享变量。应用程序可以读取和更新变量，而数据库负责维持它的持久性，提供一些诸如并发控制和容错的功能。

但是，在大多数编程语言中，你无法订阅可变变量中的变更 —— 你只能定期读取它。与电子表格不同，如果变量的值发生变化，变量的读者不会收到通知（你可以在自己的代码中实现这样的通知 —— 这被称为 **观察者模式** —— 但大多数语言没有将这种模式作为内置功能）。

数据库继承了这种可变数据的被动方法：如果你想知道数据库的内容是否发生了变化，通常你唯一的选择就是轮询（即定期重复你的查询）。订阅变更只是刚刚开始出现的功能（请参阅 “[变更流的 API 支持](/v1/ch11#变更流的API支持)”）。

#### 数据流：应用代码与状态变化的交互

从数据流的角度思考应用程序，意味着重新协调应用代码和状态管理之间的关系。我们不再将数据库视作被应用操纵的被动变量，取而代之的是更多地考虑状态，状态变更和处理它们的代码之间的相互作用与协同关系。应用代码通过在另一个地方触发状态变更来响应状态变更。

我们在 “[数据库与流](/v1/ch11#数据库与流)” 中看到了这一思路，我们讨论了将数据库的变更日志视为一种我们可以订阅的事件流。诸如 Actor 的消息传递系统（请参阅 “[消息传递中的数据流](/v1/ch4#消息传递中的数据流)”）也具有响应事件的概念。早在 20 世纪 80 年代，**元组空间（tuple space）** 模型就已经探索了表达分布式计算的方式：观察状态变更并作出反应的过程【38,39】。

如前所述，当触发器由于数据变更而被触发时，或次级索引更新以反映索引表中的变更时，数据库内部也发生着类似的情况。分拆数据库意味着将这个想法应用于在主数据库之外，用于创建衍生数据集：缓存、全文搜索索引、机器学习或分析系统。我们可以为此使用流处理和消息传递系统。

需要记住的重要一点是，维护衍生数据不同于执行异步任务。传统的消息传递系统通常是为执行异步任务设计的（请参阅 “[日志与传统的消息传递相比](/v1/ch11#日志与传统的消息传递相比)”）：

* 在维护衍生数据时，状态变更的顺序通常很重要（如果多个视图是从事件日志衍生的，则需要按照相同的顺序处理事件，以便它们之间保持一致）。如 “[确认与重新传递](/v1/ch11#确认与重新传递)” 中所述，许多消息代理在重传未确认消息时没有此属性，双写也被排除在外（请参阅 “[保持系统同步](/v1/ch11#保持系统同步)”）。
* 容错是衍生数据的关键：仅仅丢失单个消息就会导致衍生数据集永远与其数据源失去同步。消息传递和衍生状态更新都必须可靠。例如，许多 Actor 系统默认在内存中维护 Actor 的状态和消息，所以如果运行 Actor 的机器崩溃，状态和消息就会丢失。

稳定的消息排序和容错消息处理是相当严格的要求，但与分布式事务相比，它们开销更小，运行更稳定。现代流处理组件可以提供这些排序和可靠性保证，并允许应用代码以流算子的形式运行。

这些应用代码可以执行任意处理，包括数据库内置衍生函数通常不提供的功能。就像通过管道链接的 Unix 工具一样，流算子可以围绕着数据流构建大型系统。每个算子接受状态变更的流作为输入，并产生其他状态变化的流作为输出。

#### 流处理器和服务

当今流行的应用开发风格涉及将功能分解为一组通过同步网络请求（如 REST API）进行通信的 **服务**（service，请参阅 “[服务中的数据流：REST 与 RPC](/v1/ch4#服务中的数据流：REST与RPC)”）。这种面向服务的架构优于单一庞大应用的优势主要在于：通过松散耦合来提供组织上的可伸缩性：不同的团队可以专职于不同的服务上，从而减少团队之间的协调工作（因为服务可以独立部署和更新）。

在数据流中组装流算子与微服务方法有很多相似之处【40】。但底层通信机制有很大区别：数据流采用单向异步消息流，而不是同步的请求 / 响应式交互。

除了在 “[消息传递中的数据流](/v1/ch4#消息传递中的数据流)” 中列出的优点（如更好的容错性），数据流系统还能实现更好的性能。例如，假设客户正在购买以一种货币定价，但以另一种货币支付的商品。为了执行货币换算，你需要知道当前的汇率。这个操作可以通过两种方式实现【40,41】：

1. 在微服务方法中，处理购买的代码可能会查询汇率服务或数据库，以获取特定货币的当前汇率。
2. 在数据流方法中，处理订单的代码会提前订阅汇率变更流，并在汇率发生变动时将当前汇率存储在本地数据库中。处理订单时只需查询本地数据库即可。

第二种方法能将对另一服务的同步网络请求替换为对本地数据库的查询（可能在同一台机器甚至同一个进程中）[^ii]。数据流方法不仅更快，而且当其他服务失效时也更稳健。最快且最可靠的网络请求就是压根没有网络请求！我们现在不再使用 RPC，而是在购买事件和汇率更新事件之间建立流联接（请参阅 “[流表连接（流扩充）](/v1/ch11#流表连接（流扩充）)”）。

[^ii]: 在微服务方法中，你也可以通过在处理购买的服务中本地缓存汇率来避免同步网络请求。但是为了保证缓存的新鲜度，你需要定期轮询汇率以获取其更新，或订阅变更流 —— 这恰好是数据流方法中发生的事情。

连接是时间相关的：如果购买事件在稍后的时间点被重新处理，汇率可能已经改变。如果要重建原始输出，则需要获取原始购买时的历史汇率。无论是查询服务还是订阅汇率更新流，你都需要处理这种时间相关性（请参阅 “[连接的时间依赖性](/v1/ch11#连接的时间依赖性)”）。

订阅变更流，而不是在需要时查询当前状态，使我们更接近类似电子表格的计算模型：当某些数据发生变更时，依赖于此的所有衍生数据都可以快速更新。还有很多未解决的问题，例如关于时间相关连接等问题，但我认为围绕数据流构建应用的想法是一个非常有希望的方向。

### 观察衍生数据状态

在抽象层面，上一节讨论的数据流系统提供了创建衍生数据集（例如搜索索引、物化视图和预测模型）并使其保持更新的过程。我们将这个过程称为 **写路径（write path）**：只要某些信息被写入系统，它可能会经历批处理与流处理的多个阶段，而最终每个衍生数据集都会被更新，以适配写入的数据。[图 12-1](/v1/ddia_1201.png) 显示了一个更新搜索索引的例子。

![](/v1/ddia_1201.png)

**图 12-1 在搜索索引中，写（文档更新）遇上读（查询）**

但你为什么一开始就要创建衍生数据集？很可能是因为你想在以后再次查询它。这就是 **读路径（read path）**：当服务用户请求时，你需要从衍生数据集中读取，也许还要对结果进行一些额外处理，然后构建给用户的响应。

总而言之，写路径和读路径涵盖了数据的整个旅程，从收集数据开始，到使用数据结束（可能是由另一个人）。写路径是预计算过程的一部分 —— 即，一旦数据进入，即刻完成，无论是否有人需要看它。读路径是这个过程中只有当有人请求时才会发生的部分。如果你熟悉函数式编程语言，则可能会注意到写路径类似于立即求值，读路径类似于惰性求值。

如 [图 12-1](/v1/ddia_1201.png) 所示，衍生数据集是写路径和读路径相遇的地方。它代表了在写入时需要完成的工作量与在读取时需要完成的工作量之间的权衡。

#### 物化视图和缓存

全文搜索索引就是一个很好的例子：写路径更新索引，读路径在索引中搜索关键字。读写都需要做一些工作。写入需要更新文档中出现的所有关键词的索引条目。读取需要搜索查询中的每个单词，并应用布尔逻辑来查找包含查询中所有单词（AND 运算符）的文档，或者每个单词（OR 运算符）的任何同义词。

如果没有索引，搜索查询将不得不扫描所有文档（如 grep），如果有着大量文档，这样做的开销巨大。没有索引意味着写入路径上的工作量较少（没有要更新的索引），但是在读取路径上需要更多工作。

另一方面，可以想象为所有可能的查询预先计算搜索结果。在这种情况下，读路径上的工作量会减少：不需要布尔逻辑，只需查找查询结果并返回即可。但写路径会更加昂贵：可能的搜索查询集合是无限大的，因此预先计算所有可能的搜索结果将需要无限的时间和存储空间。那肯定没戏 [^iii]。

[^iii]: 假设一个有限的语料库，那么返回非空搜索结果的搜索查询集合是有限的。然而，它是与语料库中的术语数量呈指数关系，这仍是一个坏消息。

另一种选择是预先计算一组固定的最常见查询的搜索结果，以便可以快速提供它们而无需转到索引。不常见的查询仍然可以通过索引来提供服务。这通常被称为常见查询的 **缓存（cache）**，尽管我们也可以称之为 **物化视图（materialized view）**，因为当新文档出现，且需要被包含在这些常见查询的搜索结果之中时，这些索引就需要更新。

从这个例子中我们可以看到，索引不是写路径和读路径之间唯一可能的边界；缓存常见搜索结果也是可行的；而在少量文档上使用没有索引的类 grep 扫描也是可行的。由此来看，缓存，索引和物化视图的作用很简单：它们改变了读路径与写路径之间的边界。通过预先计算结果，从而允许我们在写路径上做更多的工作，以节省读路径上的工作量。

在写路径上完成的工作和读路径之间的界限，实际上是本书开始处在 “[描述负载](/v1/ch1#描述负载)” 中推特例子里谈到的主题。在该例中，我们还看到了与普通用户相比，名人的写路径和读路径可能有所不同。在 500 页之后，我们已经绕回了起点！

#### 有状态、可离线的客户端

我发现写路径和读路径之间的边界很有趣，因为我们可以试着改变这个边界，并探讨这种改变的实际意义。我们来看看不同上下文中的这一想法。

过去二十年来，Web 应用的火热让我们对应用开发作出了一些很容易视作理所当然的假设。具体来说就是，客户端 / 服务器模型 —— 客户端大多是无状态的，而服务器拥有数据的权威 —— 已经普遍到我们几乎忘掉了还有其他任何模型的存在。但是技术在不断地发展，我认为不时地质疑现状非常重要。

传统上，网络浏览器是无状态的客户端，只有当连接到互联网时才能做一些有用的事情（能离线执行的唯一事情基本上就是上下滚动之前在线时加载好的页面）。然而，最近的 “单页面” JavaScript Web 应用已经获得了很多有状态的功能，包括客户端用户界面交互，以及 Web 浏览器中的持久化本地存储。移动应用可以类似地在设备上存储大量状态，而且大多数用户交互都不需要与服务器往返交互。

这些不断变化的功能重新引发了对 **离线优先（offline-first）** 应用的兴趣，这些应用尽可能地在同一设备上使用本地数据库，无需连接互联网，并在后台网络连接可用时与远程服务器同步【42】。由于移动设备通常具有缓慢且不可靠的蜂窝网络连接，因此，如果用户的用户界面不必等待同步网络请求，且应用主要是离线工作的，则这是一个巨大优势（请参阅 “[需要离线操作的客户端](/v1/ch5#需要离线操作的客户端)”）。

当我们摆脱无状态客户端与中央数据库交互的假设，并转向在终端用户设备上维护状态时，这就开启了新世界的大门。特别是，我们可以将设备上的状态视为 **服务器状态的缓存**。屏幕上的像素是客户端应用中模型对象的物化视图；模型对象是远程数据中心的本地状态副本【27】。

#### 将状态变更推送给客户端

在典型的网页中，如果你在 Web 浏览器中加载页面，并且随后服务器上的数据发生变更，则浏览器在重新加载页面之前对此一无所知。浏览器只能在一个时间点读取数据，假设它是静态的 —— 它不会订阅来自服务器的更新。因此设备上的状态是陈旧的缓存，除非你显式轮询变更否则不会更新。（像 RSS 这样基于 HTTP 的 Feed 订阅协议实际上只是一种基本的轮询形式）

最近的协议已经超越了 HTTP 的基本请求 / 响应模式：服务端发送的事件（EventSource API）和 WebSockets 提供了通信信道，通过这些信道，Web 浏览器可以与服务器保持打开的 TCP 连接，只要浏览器仍然连接着，服务器就能主动向浏览器推送信息。这为服务器提供了主动通知终端用户客户端的机会，服务器能告知客户端其本地存储状态的任何变化，从而减少客户端状态的陈旧程度。

用我们的写路径与读路径模型来讲，主动将状态变更推至到客户端设备，意味着将写路径一直延伸到终端用户。当客户端首次初始化时，它仍然需要使用读路径来获取其初始状态，但此后它就能够依赖服务器发送的状态变更流了。我们在流处理和消息传递部分讨论的想法并不局限于数据中心中：我们可以进一步采纳这些想法，并将它们一直延伸到终端用户设备【43】。

这些设备有时会离线，并在此期间无法收到服务器状态变更的任何通知。但是我们已经解决了这个问题：在 “[消费者偏移量](/v1/ch11#消费者偏移量)” 中，我们讨论了基于日志的消息代理的消费者能在失败或断开连接后重连，并确保它不会错过掉线期间任何到达的消息。同样的技术适用于单个用户，每个设备都是一个小事件流的小小订阅者。

#### 端到端的事件流

最近用于开发有状态的客户端与用户界面的工具，例如 Elm 语言【30】和 Facebook 的 React、Flux 和 Redux 工具链，已经通过订阅表示用户输入或服务器响应的事件流来管理客户端的内部状态，其结构与事件溯源相似（请参阅 “[事件溯源](/v1/ch11#事件溯源)”）。

将这种编程模型扩展为：允许服务器将状态变更事件推送到客户端的事件管道中，是非常自然的。因此，状态变化可以通过 **端到端（end-to-end）** 的写路径流动：从一个设备上的交互触发状态变更开始，经由事件日志，并穿过几个衍生数据系统与流处理器，一直到另一台设备上的用户界面，而有人正在观察用户界面上的状态变化。这些状态变化能以相当低的延迟传播 —— 比如说，在一秒内从一端到另一端。

一些应用（如即时消息传递与在线游戏）已经具有这种 “实时” 架构（在低延迟交互的意义上，不是在 “[响应时间保证](/v1/ch8#响应时间保证)” 中的意义上）。但我们为什么不用这种方式构建所有的应用？

挑战在于，关于无状态客户端和请求 / 响应交互的假设已经根深蒂固地植入在我们的数据库、库、框架以及协议之中。许多数据存储支持读取与写入操作，为请求返回一个响应，但只有极少数提供订阅变更的能力 —— 请求返回一个随时间推移的响应流（请参阅 “[变更流的 API 支持](/v1/ch11#变更流的API支持)” ）。

为了将写路径延伸至终端用户，我们需要从根本上重新思考我们构建这些系统的方式：从请求 / 响应交互转向发布 / 订阅数据流【27】。更具响应性的用户界面与更好的离线支持，我认为这些优势值得我们付出努力。如果你正在设计数据系统，我希望你对订阅变更的选项留有印象，而不只是查询当前状态。

#### 读也是事件

我们讨论过，当流处理器将衍生数据写入存储（数据库，缓存或索引）时，以及当用户请求查询该存储时，存储将充当写路径和读路径之间的边界。该存储应当允许对数据进行随机访问的读取查询，否则这些查询将需要扫描整个事件日志。

在很多情况下，数据存储与流处理系统是分开的。但回想一下，流处理器还是需要维护状态以执行聚合和连接的（请参阅 “[流连接](/v1/ch11#流连接)”）。这种状态通常隐藏在流处理器内部，但一些框架也允许这些状态被外部客户端查询【45】，将流处理器本身变成一种简单的数据库。

我愿意进一步思考这个想法。正如到目前为止所讨论的那样，对存储的写入是通过事件日志进行的，而读取是临时的网络请求，直接流向存储着待查数据的节点。这是一个合理的设计，但不是唯一可行的设计。也可以将读取请求表示为事件流，并同时将读事件与写事件送往流处理器；流处理器通过将读取结果发送到输出流来响应读取事件【46】。

当写入和读取都被表示为事件，并且被路由到同一个流算子以便处理时，我们实际上是在读取查询流和数据库之间执行流表连接。读取事件需要被送往保存数据的数据库分区（请参阅 “[请求路由](/v1/ch6#请求路由)”），就像批处理和流处理器在连接时需要在同一个键上对输入分区一样（请参阅 “[Reduce 侧连接与分组](/v1/ch10#Reduce侧连接与分组)”）。

服务请求与执行连接之间的这种相似之处是非常关键的【47】。一次性读取请求只是将请求传过连接算子，然后请求马上就被忘掉了；而一个订阅请求，则是与连接另一侧过去与未来事件的持久化连接。

记录读取事件的日志可能对于追踪整个系统中的因果关系与数据来源也有好处：它可以让你重现出当用户做出特定决策之前看见了什么。例如在网商中，向客户显示的预测送达日期与库存状态，可能会影响他们是否选择购买一件商品【4】。要分析这种联系，则需要记录用户查询运输与库存状态的结果。

将读取事件写入持久存储可以更好地跟踪因果关系（请参阅 “[排序事件以捕获因果关系](#排序事件以捕获因果关系)”），但会产生额外的存储与 I/O 成本。优化这些系统以减少开销仍然是一个开放的研究问题【2】。但如果你已经出于运维目的留下了读取请求日志，将其作为请求处理的副作用，那么将这份日志作为请求事件源并不是什么特别大的变更。

#### 多分区数据处理

对于只涉及单个分区的查询，通过流来发送查询与收集响应可能是杀鸡用牛刀了。然而，这个想法开启了分布式执行复杂查询的可能性，这需要合并来自多个分区的数据，利用了流处理器已经提供的消息路由、分区和连接的基础设施。

Storm 的分布式 RPC 功能支持这种使用模式（请参阅 “[消息传递和 RPC](/v1/ch11#消息传递和RPC)”）。例如，它已经被用来计算浏览过某个推特 URL 的人数 —— 即，发推包含该 URL 的所有人的粉丝集合的并集【48】。由于推特的用户是分区的，因此这种计算需要合并来自多个分区的结果。

这种模式的另一个例子是欺诈预防：为了评估特定购买事件是否具有欺诈风险，你可以检查该用户 IP 地址，电子邮件地址，帐单地址，送货地址的信用分。这些信用数据库中的每一个都是有分区的，因此为特定购买事件采集分数需要连接一系列不同的分区数据集【49】。

MPP 数据库的内部查询执行图有着类似的特征（请参阅 “[Hadoop 与分布式数据库的对比](/v1/ch10#Hadoop与分布式数据库的对比)”）。如果需要执行这种多分区连接，则直接使用提供此功能的数据库，可能要比使用流处理器实现它要更简单。然而将查询视为流提供了一种选项，可以用于实现超出传统现成解决方案的大规模应用。


## 将事情做正确

对于只读取数据的无状态服务，出问题也没什么大不了的：你可以修复该错误并重启服务，而一切都恢复正常。像数据库这样的有状态系统就没那么简单了：它们被设计为永远记住事物（或多或少），所以如果出现问题，这种（错误的）效果也将潜在地永远持续下去，这意味着它们需要更仔细的思考【50】。

我们希望构建可靠且 **正确** 的应用（即使面对各种故障，程序的语义也能被很好地定义与理解）。约四十年来，原子性、隔离性和持久性（[第七章](/v1/ch7)）等事务特性一直是构建正确应用的首选工具。然而这些地基没有看上去那么牢固：例如弱隔离级别带来的困惑可以佐证（请参阅 “[弱隔离级别](/v1/ch7#弱隔离级别)”）。

事务在某些领域被完全抛弃，并被提供更好性能与可伸缩性的模型取代，但后者有更复杂的语义（例如，请参阅 “[无主复制](/v1/ch5#无主复制)”）。**一致性（Consistency）** 经常被谈起，但其定义并不明确（请参阅 “[一致性](/v1/ch7#一致性)” 和 [第九章](/v1/ch9)）。有些人断言我们应当为了高可用而 “拥抱弱一致性”，但却对这些概念实际上意味着什么缺乏清晰的认识。

对于如此重要的话题，我们的理解，以及我们的工程方法却是惊人地薄弱。例如，确定在特定事务隔离等级或复制配置下运行特定应用是否安全是非常困难的【51,52】。通常简单的解决方案似乎在低并发性的情况下工作正常，并且没有错误，但在要求更高的情况下却会出现许多微妙的错误。

例如，Kyle Kingsbury 的 Jepsen 实验【53】标出了一些产品声称的安全保证与其在网络问题与崩溃时的实际行为之间的明显差异。即使像数据库这样的基础设施产品没有问题，应用代码仍然需要正确使用它们提供的功能才行，如果配置很难理解，这是很容易出错的（在这种情况下指的是弱隔离级别，法定人数配置等）。

如果你的应用可以容忍偶尔的崩溃，以及以不可预料的方式损坏或丢失数据，那生活就要简单得多，而你可能只要双手合十念阿弥陀佛，期望佛祖能保佑最好的结果。另一方面，如果你需要更强的正确性保证，那么可串行化与原子提交就是久经考验的方法，但它们是有代价的：它们通常只在单个数据中心中工作（这就排除了地理位置分散的架构），并限制了系统能够实现的规模与容错特性。

虽然传统的事务方法不会消失，但我也相信，在使应用正确且能灵活应对故障方面，事务并不是最终的定论。在本节中，我将提出一些在数据流架构中考量正确性的方式。

### 数据库的端到端原则

仅仅因为一个应用程序使用了具有相对较强安全属性的数据系统（例如可串行化的事务），并不意味着就可以保证没有数据丢失或损坏。例如，如果某个应用有个 Bug，导致它写入不正确的数据，或者从数据库中删除数据，那么可串行化的事务也救不了你。

这个例子可能看起来很无聊，但值得认真对待：应用会出 Bug，而人也会犯错误。我在 “[状态、流和不变性](/v1/ch11#状态、流和不变性)” 中使用了这个例子来支持不可变和仅追加的数据，阉割掉错误代码摧毁良好数据的能力，能让从错误中恢复更为容易。

虽然不变性很有用，但它本身并非万灵药。让我们来看一个可能发生的、非常微妙的数据损坏案例。

#### 正好执行一次操作

在 “[容错](/v1/ch11#容错)” 中，我们见到了 **恰好一次**（或 **等效一次**）语义的概念。如果在处理消息时出现问题，你可以选择放弃（丢弃消息 —— 导致数据丢失）或重试。如果重试，就会有这种风险：第一次实际上成功了，只不过你没有发现。结果这个消息就被处理了两次。

处理两次是数据损坏的一种形式：为同样的服务向客户收费两次（收费太多）或增长计数器两次（夸大指标）都不是我们想要的。在这种情况下，恰好一次意味着安排计算，使得最终效果与没有发生错误的情况一样，即使操作实际上因为某种错误而重试。我们先前讨论过实现这一目标的几种方法。

最有效的方法之一是使操作 **幂等**（idempotent，请参阅 “[幂等性](/v1/ch11#幂等性)”）：即确保它无论是执行一次还是执行多次都具有相同的效果。但是，将不是天生幂等的操作变为幂等的操作需要一些额外的努力与关注：你可能需要维护一些额外的元数据（例如更新了值的操作 ID 集合），并在从一个节点故障切换至另一个节点时做好防护（请参阅 “[领导者和锁](/v1/ch8#领导者和锁)”）。

#### 抑制重复

除了流处理之外，其他许多地方也需要抑制重复的模式。例如，TCP 使用了数据包上的序列号，以便接收方可以将它们正确排序，并确定网络上是否有数据包丢失或重复。在将数据交付应用前，TCP 协议栈会重新传输任何丢失的数据包，也会移除任何重复的数据包。

但是，这种重复抑制仅适用于单条 TCP 连接的场景中。假设 TCP 连接是一个客户端与数据库的连接，并且它正在执行 [例 12-1]() 中的事务。在许多数据库中，事务是绑定在客户端连接上的（如果客户端发送了多个查询，数据库就知道它们属于同一个事务，因为它们是在同一个 TCP 连接上发送的）。如果客户端在发送 `COMMIT` 之后并在从数据库服务器收到响应之前遇到网络中断与连接超时，客户端是不知道事务是否已经被提交的（[图 8-1](/v1/ddia_0801.png)）。

**例 12-1 资金从一个账户到另一个账户的非幂等转移**

```sql
BEGIN TRANSACTION;
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

客户端可以重连到数据库并重试事务，但现在已经处于 TCP 重复抑制的范围之外了。因为 [例 12-1]() 中的事务不是幂等的，可能会发生转了 \$22 而不是期望的 \$11。因此，尽管 [例 12-1]() 是一个事务原子性的标准样例，但它实际上并不正确，而真正的银行并不会这样办事【3】。

两阶段提交（请参阅 “[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)”）协议会破坏 TCP 连接与事务之间的 1:1 映射，因为它们必须在故障后允许事务协调器重连到数据库，告诉数据库将存疑事务提交还是中止。这足以确保事务只被恰好执行一次吗？不幸的是，并不能。

即使我们可以抑制数据库客户端与服务器之间的重复事务，我们仍然需要担心终端用户设备与应用服务器之间的网络。例如，如果终端用户的客户端是 Web 浏览器，则它可能会使用 HTTP POST 请求向服务器提交指令。也许用户正处于一个信号微弱的蜂窝数据网络连接中，它们成功地发送了 POST，但却在能够从服务器接收响应之前没了信号。

在这种情况下，可能会向用户显示错误消息，而他们可能会手动重试。Web 浏览器警告说，“你确定要再次提交这个表单吗？”  —— 用户选 “是”，因为他们希望操作发生（Post/Redirect/Get 模式【54】可以避免在正常操作中出现此警告消息，但 POST 请求超时就没办法了）。从 Web 服务器的角度来看，重试是一个独立的请求；从数据库的角度来看，这是一个独立的事务。通常的除重机制无济于事。

#### 操作标识符

要在通过几跳的网络通信上使操作具有幂等性，仅仅依赖数据库提供的事务机制是不够的 —— 你需要考虑 **端到端（end-to-end）** 的请求流。
例如，你可以为操作生成一个唯一的标识符（例如 UUID），并将其作为隐藏表单字段包含在客户端应用中，或通过计算所有表单相关字段的散列来生成操作 ID 【3】。如果 Web 浏览器提交了两次 POST 请求，这两个请求将具有相同的操作 ID。然后，你可以将该操作 ID 一路传递到数据库，并检查你是否曾经使用给定的 ID 执行过一个操作，如 [例 12-2]() 中所示。

**例 12-2 使用唯一 ID 来抑制重复请求**

```sql
ALTER TABLE requests ADD UNIQUE (request_id);

BEGIN TRANSACTION;
    INSERT INTO requests
        (request_id, from_account, to_account, amount)
        VALUES('0286FDB8-D7E1-423F-B40B-792B3608036C', 4321, 1234, 11.00);
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

[例 12-2]() 依赖于 `request_id` 列上的唯一约束。如果一个事务尝试插入一个已经存在的 ID，那么 `INSERT` 失败，事务被中止，使其无法生效两次。即使在较弱的隔离级别下，关系数据库也能正确地维护唯一性约束（而在 “[写入偏差与幻读](/v1/ch7#写入偏差与幻读)” 中讨论过，应用级别的 **检查 - 然后 - 插入** 可能会在不可串行化的隔离下失败）。

除了抑制重复的请求之外，[例 12-2]() 中的请求表表现得就像一种事件日志，暗示着事件溯源的想法（请参阅 “[事件溯源](/v1/ch11#事件溯源)”）。更新账户余额事实上不必与插入事件发生在同一个事务中，因为它们是冗余的，而能由下游消费者从请求事件中衍生出来 —— 只要该事件被恰好处理一次，这又一次可以使用请求 ID 来强制执行。

#### 端到端原则

抑制重复事务的这种情况只是一个更普遍的原则的一个例子，这个原则被称为 **端到端原则（end-to-end argument）**，它在 1984 年由 Saltzer、Reed 和 Clark 阐述【55】：

> 只有在通信系统两端应用的知识与帮助下，所讨论的功能才能完全地正确地实现。因而将这种被质疑的功能作为通信系统本身的功能是不可能的（有时，通信系统可以提供这种功能的不完备版本，可能有助于提高性能）。
>

在我们的例子中 **所讨论的功能** 是重复抑制。我们看到 TCP 在 TCP 连接层次抑制了重复的数据包，一些流处理器在消息处理层次提供了所谓的恰好一次语义，但这些都无法阻止当一个请求超时时，用户亲自提交重复的请求。TCP，数据库事务，以及流处理器本身并不能完全排除这些重复。解决这个问题需要一个端到端的解决方案：从终端用户的客户端一路传递到数据库的事务标识符。

端到端原则也适用于检查数据的完整性：以太网，TCP 和 TLS 中内置的校验和可以检测网络中数据包的损坏情况，但是它们无法检测到由连接两端发送 / 接收软件中 Bug 导致的损坏，或数据存储所在磁盘上的损坏。如果你想捕获数据所有可能的损坏来源，你也需要端到端的校验和。

类似的原则也适用于加密【55】：家庭 WiFi 网络上的密码可以防止人们窃听你的 WiFi 流量，但无法阻止互联网上其他地方攻击者的窥探；客户端与服务器之间的 TLS/SSL 可以阻挡网络攻击者，但无法阻止恶意服务器。只有端到端的加密和认证可以防止所有这些事情。

尽管低层级的功能（TCP 重复抑制、以太网校验和、WiFi 加密）无法单独提供所需的端到端功能，但它们仍然很有用，因为它们能降低较高层级出现问题的可能性。例如，如果我们没有 TCP 来将数据包排成正确的顺序，那么 HTTP 请求通常就会被搅烂。我们只需要记住，低级别的可靠性功能本身并不足以确保端到端的正确性。

#### 在数据系统中应用端到端思考

这将我带回最初的论点：仅仅因为应用使用了提供相对较强安全属性的数据系统，例如可串行化的事务，并不意味着应用的数据就不会丢失或损坏了。应用本身也需要采取端到端的措施，例如除重。

这实在是一个遗憾，因为容错机制很难弄好。低层级的可靠机制（比如 TCP 中的那些）运行得相当好，因而剩下的高层级错误基本很少出现。如果能将这些剩下的高层级容错机制打包成抽象，而应用不需要再去操心，那该多好呀 —— 但恐怕我们还没有找到这一正确的抽象。

长期以来，事务被认为是一个很好的抽象，我相信它们确实是很有用的。正如 [第七章](/v1/ch7) 导言中所讨论的，它们将各种可能的问题（并发写入、违背约束、崩溃、网络中断、磁盘故障）合并为两种可能结果：提交或中止。这对编程模型而言是一种巨大的简化，但恐怕这还不够。

事务是代价高昂的，当涉及异构存储技术时尤为甚（请参阅 “[实践中的分布式事务](/v1/ch9#实践中的分布式事务)”）。我们拒绝使用分布式事务是因为它开销太大，结果我们最后不得不在应用代码中重新实现容错机制。正如本书中大量的例子所示，对并发性与部分失败的推理是困难且违反直觉的，所以我怀疑大多数应用级别的机制都不能正确工作，最终结果是数据丢失或损坏。

出于这些原因，我认为探索对容错的抽象是很有价值的。它使提供应用特定的端到端的正确性属性变得更简单，而且还能在大规模分布式环境中提供良好的性能与运维特性。

### 强制约束

让我们思考一下在 [分拆数据库](#分拆数据库) 上下文中的 **正确性（correctness）**。我们看到端到端的除重可以通过从客户端一路透传到数据库的请求 ID 实现。那么其他类型的约束呢？

我们先来特别关注一下 **唯一性约束** —— 例如我们在 [例 12-2]() 中所依赖的约束。在 “[约束和唯一性保证](/v1/ch9#约束和唯一性保证)” 中，我们看到了几个其他需要强制实施唯一性的应用功能例子：用户名或电子邮件地址必须唯一标识用户，文件存储服务不能包含多个重名文件，两个人不能在航班或剧院预订同一个座位。

其他类型的约束也非常类似：例如，确保帐户余额永远不会变为负数，确保不会超卖库存，或者会议室没有重复的预订。执行唯一性约束的技术通常也可以用于这些约束。

#### 唯一性约束需要达成共识

在 [第九章](/v1/ch9) 中我们看到，在分布式环境中，强制执行唯一性约束需要共识：如果存在多个具有相同值的并发请求，则系统需要决定冲突操作中的哪一个被接受，并拒绝其他违背约束的操作。

达成这一共识的最常见方式是使单个节点作为领导，并使其负责所有决策。只要你不介意所有请求都挤过单个节点（即使客户端位于世界的另一端），只要该节点没有失效，系统就能正常工作。如果你需要容忍领导者失效，那么就又回到了共识问题（请参阅 “[单主复制与共识](/v1/ch9#单主复制与共识)”）。

唯一性检查可以通过对唯一性字段分区做横向伸缩。例如，如果需要通过请求 ID 确保唯一性（如 [例 12-2]() 所示），你可以确保所有具有相同请求 ID 的请求都被路由到同一分区（请参阅 [第六章](/v1/ch6)）。如果你需要让用户名是唯一的，则可以按用户名的散列值做分区。

但异步多主复制排除在外，因为可能会发生不同主库同时接受冲突写操作的情况，因而这些值不再是唯一的（请参阅 “[实现线性一致的系统](/v1/ch9#实现线性一致的系统)”）。如果你想立刻拒绝任何违背约束的写入，同步协调是无法避免的【56】。

#### 基于日志消息传递中的唯一性

日志确保所有消费者以相同的顺序看见消息 —— 这种保证在形式上被称为 **全序广播（total order broadcast）**，并且等价于共识（请参阅 “[全序广播](/v1/ch9#全序广播)”）。在使用基于日志的消息传递的分拆数据库方法中，我们可以使用非常类似的方法来执行唯一性约束。

流处理器在单个线程上依次消费单个日志分区中的所有消息（请参阅 “[日志与传统的消息传递相比](/v1/ch11#日志与传统的消息传递相比)”）。因此，如果日志是按需要确保唯一的值做的分区，则流处理器可以无歧义地、确定性地决定几个冲突操作中的哪一个先到达。例如，在多个用户尝试宣告相同用户名的情况下【57】：

1. 每个对用户名的请求都被编码为一条消息，并追加到按用户名散列值确定的分区。
2. 流处理器依序读取日志中的请求，并使用本地数据库来追踪哪些用户名已经被占用了。对于所有申请可用用户名的请求，它都会记录该用户名，并向输出流发送一条成功消息。对于所有申请已占用用户名的请求，它都会向输出流发送一条拒绝消息。
3. 请求用户名的客户端监视输出流，等待与其请求相对应的成功或拒绝消息。

该算法基本上与 “[使用全序广播实现线性一致的存储](/v1/ch9#使用全序广播实现线性一致的存储)” 中的算法相同。它可以简单地通过增加分区数伸缩至较大的请求吞吐量，因为每个分区都可以被独立处理。

该方法不仅适用于唯一性约束，而且适用于许多其他类型的约束。其基本原理是，任何可能冲突的写入都会路由到相同的分区并按顺序处理。正如 “[什么是冲突？](/v1/ch5#什么是冲突？)” 与 “[写入偏差与幻读](/v1/ch7#写入偏差与幻读)” 中所述，冲突的定义可能取决于应用，但流处理器可以使用任意逻辑来验证请求。这个想法与 Bayou 在 90 年代开创的方法类似【58】。

#### 多分区请求处理

当涉及多个分区时，确保操作以原子方式执行且同时满足约束就变得很有趣了。在 [例 12-2]() 中，可能有三个分区：一个包含请求 ID，一个包含收款人账户，另一个包含付款人账户。没有理由把这三种东西放入同一个分区，因为它们都是相互独立的。

在数据库的传统方法中，执行此事务需要跨全部三个分区进行原子提交，就这些分区上的所有其他事务而言，这实质上是将该事务嵌入一个全序。而这样就要求跨分区协调，不同的分区无法再独立地进行处理，因此吞吐量很可能会受到影响。

但事实证明，使用分区日志可以达到等价的正确性而无需原子提交：

1. 从账户 A 向账户 B 转账的请求由客户端提供一个唯一的请求 ID，并按请求 ID 追加写入相应日志分区。
2. 流处理器读取请求日志。对于每个请求消息，它向输出流发出两条消息：付款人账户 A 的借记指令（按 A 分区），收款人 B 的贷记指令（按 B 分区）。被发出的消息中会带有原始的请求 ID。
3. 后续处理器消费借记 / 贷记指令流，按照请求 ID 除重，并将变更应用至账户余额。

步骤 1 和步骤 2 是必要的，因为如果客户直接发送贷记与借记指令，则需要在这两个分区之间进行原子提交，以确保两者要么都发生，要么都不发生。为了避免对分布式事务的需要，我们首先将请求持久化记录为单条消息，然后从这第一条消息中衍生出贷记指令与借记指令。几乎在所有数据系统中，单对象写入都是原子性的（请参阅 “[单对象写入](/v1/ch7#单对象写入)”），因此请求要么出现在日志中，要么就不出现，无需多分区原子提交。

如果流处理器在步骤 2 中崩溃，则它会从上一个存档点恢复处理。这样做时，它不会跳过任何请求消息，但可能会多次处理请求并产生重复的贷记与借记指令。但由于它是确定性的，因此它只是再次生成相同的指令，而步骤 3 中的处理器可以使用端到端请求 ID 轻松地对其除重。

如果你想确保付款人的帐户不会因此次转账而透支，则可以使用一个额外的流处理器来维护账户余额并校验事务（按付款人账户分区），只有有效的事务会被记录在步骤 1 中的请求日志中。

通过将多分区事务分解为两个不同分区方式的阶段，并使用端到端的请求 ID，我们实现了同样的正确性属性（每个请求对付款人与收款人都恰好生效一次），即使在出现故障，且没有使用原子提交协议的情况下依然如此。使用多个不同分区阶段的想法与我们在 “[多分区数据处理](#多分区数据处理)” 中讨论的想法类似（也请参阅 “[并发控制](/v1/ch11#并发控制)”）。

### 及时性与完整性

事务的一个便利属性是，它们通常是线性一致的（请参阅 “[线性一致性](/v1/ch9#线性一致性)”），也就是说，写入者会等到事务提交，而之后其写入立刻对所有读取者可见。

当我们把一个操作拆分为跨越多个阶段的流处理器时，却并非如此：日志的消费者在设计上就是异步的，因此发送者不会等其消息被消费者处理完。但是，客户端等待输出流中的特定消息是可能的。这正是我们在 “[基于日志消息传递中的唯一性](#基于日志消息传递中的唯一性)” 一节中检查唯一性约束时所做的事情。

在这个例子中，唯一性检查的正确性不取决于消息发送者是否等待结果。等待的目的仅仅是同步通知发送者唯一性检查是否成功。但该通知可以与消息处理的结果相解耦。

更一般地来讲，我认为 **一致性（consistency）** 这个术语混淆了两个值得分别考虑的需求：

* 及时性（Timeliness）

  及时性意味着确保用户观察到系统的最新状态。我们之前看到，如果用户从陈旧的数据副本中读取数据，它们可能会观察到系统处于不一致的状态（请参阅 “[复制延迟问题](/v1/ch5#复制延迟问题)”）。但这种不一致是暂时的，而最终会通过等待与重试简单地得到解决。

  CAP 定理（请参阅 “[线性一致性的代价](/v1/ch9#线性一致性的代价)”）使用 **线性一致性（linearizability）** 意义上的一致性，这是实现及时性的强有力方法。像 **写后读** 这样及时性更弱的一致性也很有用（请参阅 “[读己之写](/v1/ch5#读己之写)”）。

* 完整性（Integrity）

  完整性意味着没有损坏；即没有数据丢失，并且没有矛盾或错误的数据。尤其是如果某些衍生数据集是作为底层数据之上的视图而维护的（请参阅 “[从事件日志中派生出当前状态](/v1/ch11#从事件日志中派生出当前状态)”），这种衍生必须是正确的。例如，数据库索引必须正确地反映数据库的内容 —— 缺失某些记录的索引并不是很有用。

  如果完整性被违背，这种不一致是永久的：在大多数情况下，等待与重试并不能修复数据库损坏。相反的是，需要显式地检查与修复。在 ACID 事务的上下文中（请参阅 “[ACID 的含义](/v1/ch7#ACID的含义)”），一致性通常被理解为某种特定于应用的完整性概念。原子性和持久性是保持完整性的重要工具。


口号形式：违反及时性，“最终一致性”；违反完整性，“永无一致性”。

我断言在大多数应用中，完整性比及时性重要得多。违反及时性可能令人困惑与讨厌，但违反完整性的结果可能是灾难性的。

例如在你的信用卡对账单上，如果某一笔过去 24 小时内完成的交易尚未出现并不令人奇怪 —— 这些系统有一定的滞后是正常的。我们知道银行是异步核算与敲定交易的，这里的及时性并不是非常重要【3】。但如果当期对账单余额与上期对账单余额加交易总额对不上（求和错误），或者出现一笔向你收费但未向商家付款的交易（消失的钱），那就实在是太糟糕了，这样的问题就违背了系统的完整性。

#### 数据流系统的正确性

ACID 事务通常既提供及时性（例如线性一致性）也提供完整性保证（例如原子提交）。因此如果你从 ACID 事务的角度来看待应用的正确性，那么及时性与完整性的区别是无关紧要的。

另一方面，对于在本章中讨论的基于事件的数据流系统而言，它们的一个有趣特性就是将及时性与完整性分开。在异步处理事件流时不能保证及时性，除非你显式构建一个在返回之前明确等待特定消息到达的消费者。但完整性实际上才是流处理系统的核心。

**恰好一次** 或 **等效一次** 语义（请参阅 “[容错](/v1/ch11#容错)”）是一种保持完整性的机制。如果事件丢失或者生效两次，就有可能违背数据系统的完整性。因此在出现故障时，容错消息传递与重复抑制（例如，幂等操作）对于维护数据系统的完整性是很重要的。

正如我们在上一节看到的那样，可靠的流处理系统可以在无需分布式事务与原子提交协议的情况下保持完整性，这意味着它们有潜力达到与后者相当的正确性，同时还具备好得多的性能与运维稳健性。为了达成这种正确性，我们组合使用了多种机制：

* 将写入操作的内容表示为单条消息，从而可以轻松地被原子写入 —— 与事件溯源搭配效果拔群（请参阅 “[事件溯源](/v1/ch11#事件溯源)”）。
* 使用与存储过程类似的确定性衍生函数，从这一消息中衍生出所有其他的状态变更（请参阅 “[真的串行执行](/v1/ch7#真的串行执行)” 和 “[应用代码作为衍生函数](/v1/ch12#应用代码作为衍生函数)”）
* 将客户端生成的请求 ID 传递通过所有的处理层次，从而允许端到端的除重，带来幂等性。
* 使消息不可变，并允许衍生数据能随时被重新处理，这使从错误中恢复更加容易（请参阅 “[不可变事件的优点](/v1/ch11#不可变事件的优点)”）

这种机制组合在我看来，是未来构建容错应用的一个非常有前景的方向。

#### 宽松地解释约束

如前所述，执行唯一性约束需要共识，通常通过在单个节点中汇集特定分区中的所有事件来实现。如果我们想要传统的唯一性约束形式，这种限制是不可避免的，流处理也不例外。

然而另一个需要了解的事实是，许多真实世界的应用实际上可以摆脱这种形式，接受弱得多的唯一性：

* 如果两个人同时注册了相同的用户名或预订了相同的座位，你可以给其中一个人发消息道歉，并要求他们换一个不同的用户名或座位。这种纠正错误的变化被称为 **补偿性事务（compensating transaction）**【59,60】。
* 如果客户订购的物品多于仓库中的物品，你可以下单补仓，并为延误向客户道歉，向他们提供折扣。实际上，这么说吧，如果叉车在仓库中轧过了你的货物，剩下的货物比你想象的要少，那么你也是得这么做【61】。因此，既然道歉工作流无论如何已经成为你商业过程中的一部分了，那么对库存物品数目添加线性一致的约束可能就没必要了。
* 与之类似，许多航空公司都会超卖机票，打着一些旅客可能会错过航班的算盘；许多旅馆也会超卖客房，抱着部分客人可能会取消预订的期望。在这些情况下，出于商业原因而故意违反了 “一人一座” 的约束；当需求超过供给的情况出现时，就会进入补偿流程（退款、升级舱位 / 房型、提供隔壁酒店的免费的房间）。即使没有超卖，为了应对由恶劣天气或员工罢工导致的航班取消，你还是需要道歉与补偿流程 —— 从这些问题中恢复仅仅是商业活动的正常组成部分。
* 如果有人从账户超额取款，银行可以向他们收取透支费用，并要求他们偿还欠款。通过限制每天的提款总额，银行的风险是有限的。

在许多商业场景中，临时违背约束并稍后通过道歉来修复，实际上是可以接受的。道歉的成本各不相同，但通常很低（以金钱或名声来算）：你无法撤回已发送的电子邮件，但可以发送一封后续电子邮件进行更正。如果你不小心向信用卡收取了两次费用，则可以将其中一项收费退款，而代价仅仅是手续费，也许还有客户的投诉。尽管一旦 ATM 吐了钱，你无法直接取回，但原则上如果账户透支而客户拒不支付，你可以派催收员收回欠款。

道歉的成本是否能接受是一个商业决策。如果可以接受的话，在写入数据之前检查所有约束的传统模型反而会带来不必要的限制，而线性一致性的约束也不是必须的。乐观写入，事后检查可能是一种合理的选择。你仍然可以在做一些挽回成本高昂的事情前确保有相关的验证，但这并不意味着写入数据之前必须先进行验证。

这些应用 **确实** 需要完整性：你不会希望丢失预订信息，或者由于借方贷方不匹配导致资金消失。但是它们在执行约束时 **并不需要** 及时性：如果你销售的货物多于仓库中的库存，可以在事后道歉并弥补问题。这种做法与我们在 “[处理写入冲突](/v1/ch5#处理写入冲突)” 中讨论的冲突解决方法类似。

#### 无协调数据系统

我们现在已经做了两个有趣的观察：

1. 数据流系统可以维持衍生数据的完整性保证，而无需原子提交、线性一致性或者同步的跨分区协调。
2. 虽然严格的唯一性约束要求及时性和协调，但许多应用实际上可以接受宽松的约束：只要整个过程保持完整性，这些约束可能会被临时违反并在稍后被修复。

总之这些观察意味着，数据流系统可以为许多应用提供无需协调的数据管理服务，且仍能给出很强的完整性保证。这种 **无协调（coordination-avoiding）** 的数据系统有着很大的吸引力：比起需要执行同步协调的系统，它们能达到更好的性能与更强的容错能力【56】。

例如，这种系统可以使用多领导者配置运维，跨越多个数据中心，在区域间异步复制。任何一个数据中心都可以持续独立运行，因为不需要同步的跨区域协调。这样的系统的及时性保证会很弱 —— 如果不引入协调，它就不可能是线性一致的 —— 但它仍然可以提供有力的完整性保证。

在这种情况下，可串行化事务作为维护衍生状态的一部分仍然是有用的，但它们只能在小范围内运行，在那里它们工作得很好【8】。异构分布式事务（如 XA 事务，请参阅 “[实践中的分布式事务](/v1/ch9#实践中的分布式事务)”）不是必需的。同步协调仍然可以在需要的地方引入（例如在无法恢复的操作之前强制执行严格的约束），但是如果只是应用的一小部分地方需要它，没必要让所有操作都付出协调的代价【43】。

另一种审视协调与约束的角度是：它们减少了由于不一致而必须做出的道歉数量，但也可能会降低系统的性能和可用性，从而可能增加由于宕机中断而需要做出的道歉数量。你不可能将道歉数量减少到零，但可以根据自己的需求寻找最佳平衡点 —— 既不存在太多不一致性，又不存在太多可用性问题。

### 信任但验证

我们所有关于正确性，完整性和容错的讨论都基于一些假设，假设某些事情可能会出错，但其他事情不会。我们将这些假设称为我们的 **系统模型**（system model，请参阅 “[将系统模型映射到现实世界](/v1/ch8#将系统模型映射到现实世界)”）：例如，我们应该假设进程可能会崩溃，机器可能突然断电，网络可能会任意延迟或丢弃消息。但是我们也可能假设写入磁盘的数据在执行 `fsync` 后不会丢失，内存中的数据没有损坏，而 CPU 的乘法指令总是能返回正确的结果。

这些假设是相当合理的，因为大多数时候它们都是成立的，如果我们不得不经常担心计算机出错，那么基本上寸步难行。在传统上，系统模型采用二元方法处理故障：我们假设有些事情可能会发生，而其他事情 **永远** 不会发生。实际上，这更像是一个概率问题：有些事情更有可能，其他事情不太可能。问题在于违反我们假设的情况是否经常发生，以至于我们可能在实践中遇到它们。

我们已经看到，数据可能会在尚未落盘时损坏（请参阅 “[复制与持久性](/v1/ch7#复制与持久性)”），而网络上的数据损坏有时可能规避了 TCP 校验和（请参阅 “[弱谎言形式](/v1/ch8#弱谎言形式)” ）。也许我们应当更关注这些事情？

我过去所从事的一个应用收集了来自客户端的崩溃报告，我们收到的一些报告，只有在这些设备内存中出现了随机位翻转才解释得通。这看起来不太可能，但是如果有足够多的设备运行你的软件，那么即使再不可能发生的事也确实会发生。除了由于硬件故障或辐射导致的随机存储器损坏之外，一些病态的存储器访问模式甚至可以在没有故障的存储器中翻转位【62】 —— 一种可用于破坏操作系统安全机制的效应【63】（这种技术被称为 **Rowhammer**）。一旦你仔细观察，硬件并不是看上去那样完美的抽象。

要澄清的是，随机位翻转在现代硬件上仍是非常罕见的【64】。我只想指出，它们并没有超越可能性的范畴，所以值得一些关注。

#### 维护完整性，尽管软件有Bug

除了这些硬件问题之外，总是存在软件 Bug 的风险，这些错误不会被较低层次的网络、内存或文件系统校验和所捕获。即使广泛使用的数据库软件也有 Bug：即使像 MySQL 与 PostgreSQL 这样稳健、口碑良好、多年来被许多人充分测试过的软件，就我个人所见也有 Bug，比如 MySQL 未能正确维护唯一约束【65】，以及 PostgreSQL 的可串行化隔离等级存在特定的写入偏差异常【66】。对于不那么成熟的软件来说，情况可能要糟糕得多。

尽管在仔细设计，测试，以及审查上做出很多努力，但 Bug 仍然会在不知不觉中产生。尽管它们很少，而且最终会被发现并被修复，但总会有那么一段时间，这些 Bug 可能会损坏数据。

而对于应用代码，我们不得不假设会有更多的错误，因为绝大多数应用的代码经受的评审与测试远远无法与数据库的代码相比。许多应用甚至没有正确使用数据库提供的用于维持完整性的功能，例如外键或唯一性约束【36】。

ACID 意义下的一致性（请参阅 “[一致性](/v1/ch7#一致性)”）基于这样一种想法：数据库以一致的状态启动，而事务将其从一个一致状态转换至另一个一致的状态。因此，我们期望数据库始终处于一致状态。然而，只有当你假设事务没有 Bug 时，这种想法才有意义。如果应用以某种错误的方式使用数据库，例如，不安全地使用弱隔离等级，数据库的完整性就无法得到保证。

#### 不要盲目信任承诺

由于硬件和软件并不总是符合我们的理想，所以数据损坏似乎早晚不可避免。因此，我们至少应该有办法查明数据是否已经损坏，以便我们能够修复它，并尝试追查错误的来源。检查数据完整性称为 **审计（auditing）**。

如 “[不可变事件的优点](/v1/ch11#不可变事件的优点)” 一节中所述，审计不仅仅适用于财务应用程序。不过，可审计性在财务中是非常非常重要的，因为每个人都知道错误总会发生，我们也都认为能够检测和解决问题是合理的需求。

成熟的系统同样倾向于考虑不太可能的事情出错的可能性，并管理这种风险。例如，HDFS 和 Amazon S3 等大规模存储系统并不完全信任磁盘：它们运行后台进程持续回读文件，并将其与其他副本进行比较，并将文件从一个磁盘移动到另一个，以便降低静默损坏的风险【67】。

如果你想确保你的数据仍然存在，你必须真正读取它并进行检查。大多数时候它们仍然会在那里，但如果不是这样，你一定想尽早知道答案，而不是更晚。按照同样的原则，不时地尝试从备份中恢复是非常重要的 —— 否则当你发现备份损坏时，你可能已经遇到了数据丢失，那时候就真的太晚了。不要盲目地相信它们全都管用。

#### 验证的文化

像 HDFS 和 S3 这样的系统仍然需要假设磁盘大部分时间都能正常工作 —— 这是一个合理的假设，但与它们 **始终** 能正常工作的假设并不相同。然而目前还没有多少系统采用这种 “信任但是验证” 的方式来持续审计自己。许多人认为正确性保证是绝对的，并且没有为罕见的数据损坏的可能性做过准备。我希望未来能看到更多的 **自我验证（self-validating）** 或 **自我审计（self-auditing）** 系统，不断检查自己的完整性，而不是依赖盲目的信任【68】。

我担心 ACID 数据库的文化导致我们在盲目信任技术（如事务机制）的基础上开发应用，而忽视了这种过程中的任何可审计性。由于我们所信任的技术在大多数情况下工作得很好，通常会认为审计机制并不值得投资。

但随之而来的是，数据库的格局发生了变化：在 NoSQL 的旗帜下，更弱的一致性保证成为常态，更不成熟的存储技术越来越被广泛使用。但是由于审计机制还没有被开发出来，尽管这种方式越来越危险，我们仍不断在盲目信任的基础上构建应用。让我们想一想如何针对可审计性而设计吧。

#### 为可审计性而设计

如果一个事务在一个数据库中改变了多个对象，在这一事实发生后，很难说清这个事务到底意味着什么。即使你捕获了事务日志（请参阅 “[变更数据捕获](/v1/ch11#变更数据捕获)”），各种表中的插入、更新和删除操作并不一定能清楚地表明 **为什么** 要执行这些变更。决定这些变更的是应用逻辑中的调用，而这一应用逻辑稍纵即逝，无法重现。

相比之下，基于事件的系统可以提供更好的可审计性。在事件溯源方法中，系统的用户输入被表示为一个单一不可变事件，而任何其导致的状态变更都衍生自该事件。衍生可以实现为具有确定性与可重复性，因而相同的事件日志通过相同版本的衍生代码时，会导致相同的状态变更。

显式处理数据流（请参阅 “[批处理输出的哲学](/v1/ch10#批处理输出的哲学)”）可以使数据的 **来龙去脉（provenance）** 更加清晰，从而使完整性检查更具可行性。对于事件日志，我们可以使用散列来检查事件存储没有被破坏。对于任何衍生状态，我们可以重新运行从事件日志中衍生它的批处理器与流处理器，以检查是否获得相同的结果，或者，甚至并行运行冗余的衍生流程。

具有确定性且定义良好的数据流，也使调试与跟踪系统的执行变得容易，以便确定它 **为什么** 做了某些事情【4,69】。如果出现意想之外的事情，那么重现导致意外事件的确切事故现场的诊断能力 —— 一种时间旅行调试功能 —— 是非常有价值的。

#### 端到端原则重现

如果我们不能完全相信系统的每个组件都不会损坏 —— 每一个硬件都没缺陷，每一个软件都没有 Bug —— 那我们至少必须定期检查数据的完整性。如果我们不检查，我们就不能发现损坏，直到无可挽回地导致对下游的破坏时，那时候再去追踪问题就要难得多，且代价也要高得多。

检查数据系统的完整性，最好是以端到端的方式进行（请参阅 “[数据库的端到端原则](#数据库的端到端原则)”）：我们能在完整性检查中涵盖的系统越多，某些处理阶段中出现不被察觉损坏的几率就越小。如果我们能检查整个衍生数据管道端到端的正确性，那么沿着这一路径的任何磁盘、网络、服务以及算法的正确性检查都隐含在其中了。

持续的端到端完整性检查可以不断提高你对系统正确性的信心，从而使你能更快地进步【70】。与自动化测试一样，审计提高了快速发现错误的可能性，从而降低了系统变更或新存储技术可能导致损失的风险。如果你不害怕进行变更，就可以更好地充分演化一个应用，使其满足不断变化的需求。

#### 用于可审计数据系统的工具

目前，将可审计性作为顶层关注点的数据系统并不多。一些应用实现了自己的审计机制，例如将所有变更记录到单独的审计表中，但是确保审计日志与数据库状态的完整性仍然是很困难的。可以定期使用硬件安全模块对事务日志进行签名来防止篡改，但这无法保证正确的事务一开始就能进入到日志中。

使用密码学工具来证明系统的完整性是十分有趣的，这种方式对于宽泛的硬件与软件问题，甚至是潜在的恶意行为都很稳健有效。加密货币、区块链、以及诸如比特币、以太坊、Ripple、Stellar 的分布式账本技术已经迅速出现在这一领域【71,72,73】。

我没有资格评论这些技术用于货币，或者合同商定机制的价值。但从数据系统的角度来看，它们包含了一些有趣的想法。实质上，它们是分布式数据库，具有数据模型与事务机制，而不同副本可以由互不信任的组织托管。副本不断检查其他副本的完整性，并使用共识协议对应当执行的事务达成一致。

我对这些技术的拜占庭容错方面有些怀疑（请参阅 “[拜占庭故障](/v1/ch8#拜占庭故障)”），而且我发现 **工作证明（proof of work）** 技术非常浪费（比如，比特币挖矿）。比特币的交易吞吐量相当低，尽管更多是出于政治与经济原因而非技术上的原因。不过，完整性检查的方面是很有趣的。

密码学审计与完整性检查通常依赖 **默克尔树（Merkle tree）**【74】，这是一棵散列值的树，能够用于高效地证明一条记录出现在一个数据集中（以及其他一些特性）。除了炒作的沸沸扬扬的加密货币之外，**证书透明性（certificate transparency）** 也是一种依赖 Merkle 树的安全技术，用来检查 TLS/SSL 证书的有效性【75,76】。

我可以想象，那些在证书透明性与分布式账本中使用的完整性检查和审计算法，将会在通用数据系统中得到越来越广泛的应用。要使这些算法达到与没有密码学审计的系统同等的可伸缩性，并尽可能降低性能损失，还需要一些工作。但我认为这是一个值得关注的有趣领域。


## 做正确的事情

在本书的最后部分，我想退后一步。在本书中，我们考察了各种不同的数据系统架构，评价了它们的优点与缺点，并探讨了构建可靠，可伸缩，可维护应用的技术。但是，我们忽略了讨论中一个重要而基础的部分，现在我想补充一下。

每个系统都服务于一个目的；我们采取的每个举措都会同时产生期望的后果与意外的后果。这个目的可能只是简单地赚钱，但其对世界的影响，可能会远远超出最初的目的。我们，建立这些系统的工程师，有责任去仔细考虑这些后果，并有意识地决定，我们希望生活在怎样的世界中。

我们将数据当成一种抽象的东西来讨论，但请记住，许多数据集都是关于人的：他们的行为，他们的兴趣，他们的身份。对待这些数据，我们必须怀着人性与尊重。用户也是人类，人类的尊严是至关重要的。

软件开发越来越多地涉及重要的道德抉择。有一些指导原则可以帮助软件工程师解决这些问题，例如 ACM 的软件工程道德规范与专业实践【77】，但实践中很少会讨论这些，更不用说应用与强制执行了。因此，工程师和产品经理有时会对隐私与产品潜在的负面后果抱有非常傲慢的态度【78,79,80】。

技术本身并无好坏之分 —— 关键在于它被如何使用，以及它如何影响人们。这对枪械这样的武器是成立的，而搜索引擎这样的软件系统与之类似。我认为，软件工程师仅仅专注于技术而忽视其后果是不够的：道德责任也是我们的责任。对道德进行推理很困难，但它太重要了，我们无法忽视。

### 预测性分析

举个例子，预测性分析是 “大数据” 炒作的主要内容之一。使用数据分析预测天气或疾病传播是一码事【81】；而预测一个罪犯是否可能再犯，一个贷款申请人是否有可能违约，或者一个保险客户是否可能进行昂贵的索赔，则是另外一码事。后者会直接影响到个人的生活。

当然，支付网络希望防止欺诈交易，银行希望避免不良贷款，航空公司希望避免劫机，公司希望避免雇佣效率低下或不值得信任的人。从它们的角度来看，失去商机的成本很低，而不良贷款或问题员工的成本则要高得多，因而组织希望保持谨慎也是自然而然的事情。所以如果存疑，它们通常会 Say No。

然而，随着算法决策变得越来越普遍，被某种算法（准确地或错误地）标记为有风险的某人可能会遭受大量这种 “No” 的决定。系统性地被排除在工作，航旅，保险，租赁，金融服务，以及其他社会关键领域之外，是一种对个体自由的极大约束，因此被称为 “算法监狱”【82】。在尊重人权的国家，刑事司法系统会做无罪推定（默认清白，直到被证明有罪）。另一方面，自动化系统可以系统地，任意地将一个人排除在社会参与之外，不需要任何有罪的证明，而且几乎没有申诉的机会。

#### 偏见与歧视

算法做出的决定不一定比人类更好或更差。每个人都可能有偏见，即使他们主动抗拒这一点；而歧视性做法也可能已经在文化上被制度化了。人们希望根据数据做出决定，而不是通过人的主观评价与直觉，希望这样能更加公平，并给予传统体制中经常被忽视的人更好的机会【83】。

当我们开发预测性分析系统时，不是仅仅用软件通过一系列 IF ELSE 规则将人类的决策过程自动化，那些规则本身甚至都是从数据中推断出来的。但这些系统学到的模式是个黑盒：即使数据中存在一些相关性，我们可能也压根不知道为什么。如果算法的输入中存在系统性的偏见，则系统很有可能会在输出中学习并放大这种偏见【84】。

在许多国家，反歧视法律禁止按种族、年龄、性别、性取向、残疾或信仰等受保护的特征区分对待不同的人。其他的个人特征可能是允许用于分析的，但是如果这些特征与受保护的特征存在关联，又会发生什么？例如在种族隔离地区中，一个人的邮政编码，甚至是他们的 IP 地址，都是很强的种族指示物。这样的话，相信一种算法可以以某种方式将有偏见的数据作为输入，并产生公平和公正的输出【85】似乎是很荒谬的。然而这种观点似乎常常潜伏在数据驱动型决策的支持者中，这种态度被讽刺为 “在处理偏差上，机器学习与洗钱类似”（machine learning is like money laundering for bias）【86】。

预测性分析系统只是基于过去进行推断；如果过去是歧视性的，它们就会将这种歧视归纳为规律。如果我们希望未来比过去更好，那么就需要道德想象力，而这是只有人类才能提供的东西【87】。数据与模型应该是我们的工具，而不是我们的主人。

#### 责任与问责

自动决策引发了关于责任与问责的问题【87】。如果一个人犯了错误，他可以被追责，受决定影响的人可以申诉。算法也会犯错误，但是如果它们出错，谁来负责【88】？当一辆自动驾驶汽车引发事故时，谁来负责？如果自动信用评分算法系统性地歧视特定种族或宗教的人，这些人是否有任何追索权？如果机器学习系统的决定要受到司法审查，你能向法官解释算法是如何做出决定的吗？

收集关于人的数据并进行决策，信用评级机构是一个很经典的例子。不良的信用评分会使生活变得更艰难，但至少信用分通常是基于个人 **实际的** 借款历史记录，而记录中的任何错误都能被纠正（尽管机构通常会设置门槛）。然而，基于机器学习的评分算法通常会使用更宽泛的输入，并且更不透明；因而很难理解特定决策是怎样作出的，以及是否有人被不公正地，歧视性地对待【89】。

信用分总结了 “你过去的表现如何？”，而预测性分析通常是基于 “谁与你类似，以及与你类似的人过去表现的如何？”。与他人的行为画上等号意味着刻板印象，例如，根据他们居住的地方（与种族和阶级关系密切的特征）。那么那些放错位置的人怎么办？而且，如果是因为错误数据导致的错误决定，追索几乎是不可能的【87】。

很多数据本质上是统计性的，这意味着即使概率分布在总体上是正确的，对于个例也可能是错误的。例如，如果贵国的平均寿命是 80 岁，这并不意味着你在 80 岁生日时就会死掉。很难从平均值与概率分布中对某个特定个体的寿命作出什么判断，同样，预测系统的输出是概率性的，对于个例可能是错误的。

盲目相信数据决策至高无上，这不仅仅是一种妄想，而是有切实危险的。随着数据驱动的决策变得越来越普遍，我们需要弄清楚，如何使算法更负责任且更加透明，如何避免加强现有的偏见，以及如何在它们不可避免地出错时加以修复。

我们还需要想清楚，如何避免数据被用于害人，如何认识数据的积极潜力。例如，分析可以揭示人们生活的财务特点与社会特点。一方面，这种权力可以用来将援助与支持集中在帮助那些最需要援助的人身上。另一方面，它有时会被掠夺性企业用于识别弱势群体，并向其兜售高风险产品，比如高利贷和没有价值的大学文凭【87,90】。

#### 反馈循环

即使是那些对人直接影响比较小的预测性应用，比如推荐系统，也有一些必须正视的难题。当服务变得善于预测用户想要看到什么内容时，它最终可能只会向人们展示他们已经同意的观点，将人们带入滋生刻板印象，误导信息，与极端思想的 **回音室**。我们已经看到过社交媒体回音室对竞选的影响了【91】。

当预测性分析影响人们的生活时，自我强化的反馈循环会导致非常有害的问题。例如，考虑雇主使用信用分来评估候选人的例子。你可能是一个信用分不错的好员工，但因不可抗力的意外而陷入财务困境。由于不能按期付账单，你的信用分会受到影响，进而导致找到工作更为困难。失业使你陷入贫困，这进一步恶化了你的分数，使你更难找到工作【87】。在数据与数学严谨性的伪装背后，隐藏的是由恶毒假设导致的恶性循环。

我们无法预测这种反馈循环何时发生。然而通过对整个系统（不仅仅是计算机化的部分，而且还有与之互动的人）进行整体思考，许多后果是可以预测的 —— 一种称为 **系统思维（systems thinking）** 的方法【92】。我们可以尝试理解数据分析系统如何响应不同的行为，结构或特性。该系统是否加强和增大了人们之间现有的差异（例如，损不足以奉有余，富者愈富，贫者愈贫），还是试图与不公作斗争？而且即使有着最好的动机，我们也必须当心意想不到的后果。

### 隐私和追踪

除了预测性分析 —— 使用数据来做出关于人的自动决策 —— 数据收集本身也存在道德问题。收集数据的组织，与被收集数据的人之间，到底属于什么关系？

当系统只存储用户明确输入的数据时，是因为用户希望系统以特定方式存储和处理这些数据，**系统是在为用户提供服务**：用户就是客户。但是，当用户的活动被跟踪并记录，作为他们正在做的其他事情的副作用时，这种关系就没有那么清晰了。该服务不再仅仅完成用户想要它要做的事情，而是服务于它自己的利益，而这可能与用户的利益相冲突。

追踪用户行为数据对于许多面向用户的在线服务而言，变得越来越重要：追踪用户点击了哪些搜索结果有助于改善搜索结果的排名；推荐 “喜欢 X 的人也喜欢 Y”，可以帮助用户发现实用有趣的东西；A/B 测试和用户流量分析有助于改善用户界面。这些功能需要一定量的用户行为跟踪，而用户也可以从中受益。

但不同公司有着不同的商业模式，追踪并未止步于此。如果服务是通过广告盈利的，那么广告主才是真正的客户，而用户的利益则屈居其次。跟踪的数据会变得更详细，分析变得更深入，数据会保留很长时间，以便为每个人建立详细画像，用于营销。

现在，公司与被收集数据的用户之间的关系，看上去就不太一样了。公司会免费服务用户，并引诱用户尽可能多地使用服务。对用户的追踪，主要不是服务于该用户个体，而是服务于掏钱资助该服务的广告商。我认为这种关系可以用一个更具险恶意味的词来恰当地描述：**监视（surveillance）**。

#### 监视

让我们做一个思想实验，尝试用 **监视（surveillance）** 一词替换 **数据（data）**，再看看常见的短语是不是听起来还那么漂亮【93】。比如：“在我们的监视驱动的组织中，我们收集实时监视流并将它们存储在我们的监视仓库中。我们的监视科学家使用高级分析和监视处理来获得新的见解。”

对于本书《设计监视密集型应用》而言，这个思想实验是罕见的争议性内容，但我认为需要激烈的言辞来强调这一点。在我们尝试制造软件 “吞噬世界” 的过程中【94】，我们已经建立了世界上迄今为止所见过的最伟大的大规模监视基础设施。我们正朝着万物互联迈进，我们正在迅速走近这样一个世界：每个有人居住的空间至少包含一个带互联网连接的麦克风，以智能手机、智能电视、语音控制助理设备、婴儿监视器甚至儿童玩具的形式存在，并使用基于云的语音识别。这些设备中的很多都有着可怕的安全记录【95】。

即使是最为极权与专制的政权，可能也只会想着在每个房间装一个麦克风，并强迫每个人始终携带能够追踪其位置与动向的设备。然而，我们显然是自愿地，甚至热情地投身于这个全域监视的世界。不同之处在于，数据是由公司，而不是由政府机构收集的【96】。

并不是所有的数据收集都称得上监视，但检视这一点有助于理解我们与数据收集者之间的关系。为什么我们似乎很乐意接受企业的监视呢？也许你觉得自己没有什么好隐瞒的 —— 换句话说，你与当权阶级穿一条裤子，你不是被边缘化的少数派，也不必害怕受到迫害【97】。不是每个人都如此幸运。或者，也许这是因为目的似乎是温和的 —— 这不是公然胁迫，也不是强制性的，而只是更好的推荐与更个性化的营销。但是，结合上一节中对预测性分析的讨论，这种区别似乎并不是很清晰。

我们已经看到与汽车追踪设备挂钩的汽车保险费，以及取决于人们是否佩戴健身追踪设备的健康保险范围。当监视被用于决定生活的重要方面时，例如保险或就业，它就开始变得不那么温和了。此外，数据分析可以揭示出令人惊讶的私密事物：例如，智能手表或健身追踪器中的运动传感器能以相当好的精度计算出你正在输入的内容（比如密码）【98】。而分析算法只会变得越来越精确。

#### 同意与选择的自由

我们可能会断言用户是自愿选择使用了会跟踪其活动的服务，而且他们已经同意了服务条款与隐私政策，因此他们同意数据收集。我们甚至可以声称，用户在用所提供的数据来 **换取** 有价值的服务，并且为了提供服务，追踪是必要的。毫无疑问，社交网络、搜索引擎以及各种其他免费的在线服务对于用户来说都是有价值的，但是这个说法却存在问题。

用户几乎不知道他们提供给我们的是什么数据，哪些数据被放进了数据库，数据又是怎样被保留与处理的 —— 大多数隐私政策都是模棱两可的，忽悠用户而不敢打开天窗说亮话。如果用户不了解他们的数据会发生什么，就无法给出任何有意义的同意。有时来自一个用户的数据还会提到一些关于其他人的事，而其他那些人既不是该服务的用户，也没有同意任何条款。我们在本书这一部分中讨论的衍生数据集 —— 来自整个用户群的数据，加上行为追踪与外部数据源 —— 就恰好是用户无法（在真正意义上）理解的数据类型。

而且从用户身上挖掘数据是一个单向过程，而不是真正的互惠关系，也不是公平的价值交换。用户对能用多少数据换来什么样的服务，既没有发言权也没有选择权：服务与用户之间的关系是非常不对称与单边的。这些条款是由服务提出的，而不是由用户提出的【99】。

对于不同意监视的用户，唯一真正管用的备选项，就是简单地不使用服务。但这个选择也不是真正自由的：如果一项服务如此受欢迎，以至于 “被大多数人认为是基本社会参与的必要条件”【99】，那么指望人们选择退出这项服务是不合理的 —— 使用它 **事实上（de facto）** 是强制性的。例如，在大多数西方社会群体中，携带智能手机，使用 Facebook 进行社交，以及使用 Google 查找信息已成为常态。特别是当一项服务具有网络效应时，人们选择 **不** 使用会产生社会成本。

因为一个服务会跟踪用户而拒绝使用它，这只是少数人才拥有的权力，他们有足够的时间与知识来了解隐私政策，并承受得起代价：错过社会参与，以及使用服务可能带来的专业机会。对于那些处境不太好的人而言，并没有真正意义上的选择：监控是不可避免的。

#### 隐私与数据使用

有时候，人们声称 “隐私已死”，理由是有些用户愿意把各种关于他们生活的事情发布到社交媒体上，有时是平凡俗套，但有时是高度私密的。但这种说法是错误的，而且是对 **隐私（privacy）** 一词的误解。

拥有隐私并不意味着保密一切东西；它意味着拥有选择向谁展示哪些东西的自由，要公开什么，以及要保密什么。**隐私权是一项决定权**：在从保密到透明的光谱上，隐私使得每个人都能决定自己在每种情境下想要位于光谱上的哪个位置【99】。这是一个人自由与自主的重要方面。

当通过监控基础设施从人身上提取数据时，隐私权不一定受到损害，而是转移到了数据收集者手中。获取数据的公司实际上是说 “相信我们会用你的数据做正确的事情”，这意味着，决定要透露什么和保密什么的权利从个体手中转移到了公司手中。

这些公司反过来选择保密这些监视结果，因为揭露这些会令人毛骨悚然，并损害它们的商业模式（比其他公司更了解人）。用户的私密信息只会间接地披露，例如针对特定人群定向投放广告的工具（比如那些患有特定疾病的人群）。

即使特定用户无法从特定广告定向的人群中以个体的形式区分出来，但他们已经失去了披露一些私密信息的能动性，例如他们是否患有某种疾病。决定向谁透露什么并不是由个体按照自己的喜好决定的，而是由 **公司**，以利润最大化为目标来行使隐私权的。

许多公司都有一个目标，不要让人 **感觉到** 毛骨悚然 —— 先不说它们收集数据实际上是多么具有侵犯性，让我们先关注对用户感受的管理。这些用户感受经常被管理得很糟糕：例如，在事实上可能正确的一些东西，如果会触发痛苦的回忆，用户可能并不希望被提醒【100】。对于任何类型的数据，我们都应当考虑它出错、不可取、不合时宜的可能性，并且需要建立处理这些失效的机制。无论是 “不可取” 还是 “不合时宜”，当然都是由人的判断决定的；除非我们明确地将算法编码设计为尊重人类的需求，否则算法会无视这些概念。作为这些系统的工程师，我们必须保持谦卑，充分规划，接受这些失效。

允许在线服务的用户控制其隐私设置，例如控制其他用户可以看到哪些东西，是将一些控制交还给用户的第一步。但无论怎么设置，服务本身仍然可以不受限制地访问数据，并能以隐私策略允许的任何方式自由使用它。即使服务承诺不会将数据出售给第三方，它通常会授予自己不受限制的权利，以便在内部处理与分析数据，而且往往比用户公开可见的部分要深入得多。

这种从个体到公司的大规模隐私权转移在历史上是史无前例的【99】。监控一直存在，但它过去是昂贵的、手动的，不是可伸缩的、自动化的。信任关系一直存在，例如患者与其医生之间，或被告与其律师之间 —— 但在这些情况下，数据的使用严格受到道德，法律和监管限制的约束。互联网服务使得在未经有意义的同意下收集大量敏感信息变得容易得多，而且无需用户理解他们的私人数据到底发生了什么。

#### 数据资产与权力

由于行为数据是用户与服务交互的副产品，因此有时被称为 “数据废气” —— 暗示数据是毫无价值的废料。从这个角度来看，行为和预测性分析可以被看作是一种从数据中提取价值的回收形式，否则这些数据就会被浪费。

更准确的看法恰恰相反：从经济的角度来看，如果定向广告是服务的金主，那么关于人的行为数据就是服务的核心资产。在这种情况下，用户与之交互的应用仅仅是一种诱骗用户将更多的个人信息提供给监控基础设施的手段【99】。在线服务中经常表现出的令人愉悦的人类创造力与社会关系，十分讽刺地被数据提取机器所滥用。

个人数据是珍贵资产的说法因为数据中介的存在得到支持，这是阴影中的秘密行业，购买、聚合、分析、推断以及转售私密个人数据，主要用于市场营销【90】。初创公司按照它们的用户数量，即 “眼球数” —— 也就是它们的监视能力 —— 来估值。

因为数据很有价值，所以很多人都想要它。当然，公司也想要它 —— 这就是为什么它们一开始就收集数据的原因。但政府也想获得它：通过秘密交易、胁迫、法律强制或者只是窃取【101】。当公司破产时，收集到的个人数据就是被出售的资产之一。而且数据安全很难保护，因此经常发生令人难堪的泄漏事件【102】。

这些观察已经导致批评者声称，数据不仅仅是一种资产，而且是一种 “有毒资产”【101】，或者至少是 “有害物质”【103】。即使我们认为自己有能力阻止数据滥用，但每当我们收集数据时，我们都需要平衡收益以及这些数据落入恶人手中的风险：计算机系统可能会被犯罪分子或敌国特务渗透，数据可能会被内鬼泄露，公司可能会落入不择手段的管理层手中，而这些管理者有着迥然不同的价值观，或者国家可能被能毫无愧色迫使我们交出数据的政权所接管。

俗话说，“知识就是力量”。更进一步，“在避免自己被审视的同时审视他人，是权力最重要的形式之一”【105】。这就是极权政府想要监控的原因：这让它们有能力控制全体居民。尽管今天的科技公司并没有公开地寻求政治权力，但是它们积累的数据与知识却给它们带来了很多权力，其中大部分是在公共监督之外偷偷进行的【106】。

#### 回顾工业革命

数据是信息时代的决定性特征。互联网，数据存储，处理和软件驱动的自动化正在对全球经济和人类社会产生重大影响。我们的日常生活与社会组织在过去十年中发生了变化，而且在未来的十年中可能会继续发生根本性的变化，所以我们会想到与工业革命对比【87,96】。

工业革命是通过重大的技术与农业进步实现的，它带来了持续的经济增长，长期的生活水平显著提高。然而它也带来了一些严重的问题：空气污染（由于烟雾和化学过程）和水污染（工业垃圾和人类垃圾）是可怖的。工厂老板生活在纷奢之中，而城市工人经常居住在非常糟糕的住房中，并且在恶劣的条件下长时间工作。童工很常见，甚至包括矿井中危险而低薪的工作。

制定保护措施花费了很长的时间，例如环境保护条例、工作场所安全条例、宣布使用童工非法以及食品卫生检查。毫无疑问，生产成本增加了，因为工厂再也不能把废物倒入河流、销售污染的食物或者剥削工人。但是整个社会都从中受益良多，我们中很少会有人想回到这些管制条例之前的日子【87】。

就像工业革命有着黑暗面需要应对一样，我们转向信息时代的过程中，也有需要应对与解决的重大问题。我相信数据的收集与使用就是其中一个问题。用 Bruce Schneier 的话来说【96】：

> 数据是信息时代的污染问题，保护隐私是环境挑战。几乎所有的电脑都能生产信息。它堆积在周围，开始溃烂。我们如何处理它 —— 我们如何控制它，以及如何摆脱它 —— 是信息经济健康发展的核心议题。正如我们今天回顾工业时代的早期年代，并想知道我们的祖先在忙于建设工业世界的过程时怎么能忽略污染问题；我们的孙辈在回望信息时代的早期年代时，将会就我们如何应对数据收集和滥用的挑战来评断我们。
>
> 我们应该设法让他们感到骄傲。

#### 立法与自律

数据保护法可能有助于维护个人的权利。例如，1995 年的 “欧洲数据保护指示” 规定，个人数据必须 “为特定的、明确的和合法的目的收集，而不是以与这些目的不相符的方式进一步处理”，并且数据必须 “就收集的目的而言适当、相关、不过分”【107】。

但是，这个立法在今天的互联网环境下是否有效还是有疑问的【108】。这些规则直接否定了大数据的哲学，即最大限度地收集数据，将其与其他数据集结合起来进行试验和探索，以便产生新的洞察。探索意味着将数据用于未曾预期的目的，这与用户同意的 “特定和明确” 目的相反（如果我们可以有意义地表示同意的话）【109】。更新的规章正在制定中【89】。

那些收集了大量有关人的数据的公司反对监管，认为这是创新的负担与阻碍。在某种程度上，这种反对是有道理的。例如，分享医疗数据时，存在明显的隐私风险，但也有潜在的机遇：如果数据分析能够帮助我们实现更好的诊断或找到更好的治疗方法，能够阻止多少人的死亡【110】？过度监管可能会阻止这种突破。在这种潜在机会与风险之间找出平衡是很困难的【105】。

从根本上说，我认为我们需要科技行业在个人数据方面的文化转变。我们应该停止将用户视作待优化的指标数据，并记住他们是值得尊重、有尊严和能动性的人。我们应当在数据收集和实际处理中自我约束，以建立和维持依赖我们软件的人们的信任【111】。我们应当将教育终端用户视为己任，告诉他们我们是如何使用他们的数据的，而不是将他们蒙在鼓里。

我们应该允许每个人保留自己的隐私 —— 即，对自己数据的控制 —— 而不是通过监视来窃取这种控制权。我们控制自己数据的个体权利就像是国家公园的自然环境：如果我们不去明确地保护它、关心它，它就会被破坏。这将是公地的悲剧，我们都会因此而变得更糟。无所不在的监视并非不可避免的 —— 我们现在仍然能阻止它。

我们究竟能做到哪一步，是一个开放的问题。首先，我们不应该永久保留数据，而是一旦不再需要就立即清除数据【111,112】。清除数据与不变性的想法背道而驰（请参阅 “[不变性的局限性](/v1/ch11#不变性的局限性)”），但这是可以解决的问题。我所看到的一种很有前景的方法是通过加密协议来实施访问控制，而不仅仅是通过策略【113,114】。总的来说，文化与态度的改变是必要的。


## 本章小结

在本章中，我们讨论了设计数据系统的新方式，而且也包括了我的个人观点，以及对未来的猜测。我们从这样一种观察开始：没有单种工具能高效服务所有可能的用例，因此应用必须组合使用几种不同的软件才能实现其目标。我们讨论了如何使用批处理与事件流来解决这一 **数据集成（data integration）** 问题，以便让数据变更在不同系统之间流动。

在这种方法中，某些系统被指定为记录系统，而其他数据则通过转换衍生自记录系统。通过这种方式，我们可以维护索引、物化视图、机器学习模型、统计摘要等等。通过使这些衍生和转换操作异步且松散耦合，能够防止一个区域中的问题扩散到系统中不相关部分，从而增加整个系统的稳健性与容错性。

将数据流表示为从一个数据集到另一个数据集的转换也有助于演化应用程序：如果你想变更其中一个处理步骤，例如变更索引或缓存的结构，则可以在整个输入数据集上重新运行新的转换代码，以便重新衍生输出。同样，出现问题时，你也可以修复代码并重新处理数据以便恢复。

这些过程与数据库内部已经完成的过程非常类似，因此我们将数据流应用的概念重新改写为，**分拆（unbundling）** 数据库组件，并通过组合这些松散耦合的组件来构建应用程序。

衍生状态可以通过观察底层数据的变更来更新。此外，衍生状态本身可以进一步被下游消费者观察。我们甚至可以将这种数据流一路传送至显示数据的终端用户设备，从而构建可动态更新以反映数据变更，并在离线时能继续工作的用户界面。

接下来，我们讨论了如何确保所有这些处理在出现故障时保持正确。我们看到可伸缩的强完整性保证可以通过异步事件处理来实现，通过使用端到端操作标识符使操作幂等，以及通过异步检查约束。客户端可以等到检查通过，或者不等待继续前进，但是可能要冒违反约束而需要道歉的风险。这种方法比使用分布式事务的传统方法更具可伸缩性与可靠性，并且在实践中适用于很多业务流程。

通过围绕数据流构建应用，并异步检查约束，我们可以避免绝大多数的协调工作，创建保证完整性且性能仍然表现良好的系统，即使在地理散布的情况下与出现故障时亦然。然后，我们对使用审计来验证数据完整性，以及损坏检测进行了一些讨论。

最后，我们退后一步，审视了构建数据密集型应用的一些道德问题。我们看到，虽然数据可以用来做好事，但它也可能造成很大伤害：作出严重影响人们生活的决定却难以申诉，导致歧视与剥削、监视常态化、曝光私密信息。我们也冒着数据被泄露的风险，并且可能会发现，即使是善意地使用数据也可能会导致意想不到的后果。

由于软件和数据对世界产生了如此巨大的影响，我们工程师们必须牢记，我们有责任为我们想要的那种世界而努力：一个尊重人们，尊重人性的世界。我希望我们能够一起为实现这一目标而努力。


## 参考文献

1. Rachid Belaid: “[Postgres Full-Text Search is Good Enough!](http://rachbelaid.com/postgres-full-text-search-is-good-enough/),” *rachbelaid.com*, July 13, 2015.
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Pat Helland and Dave Campbell: “[Building on Quicksand](https://web.archive.org/web/20220606172817/https://database.cs.wisc.edu/cidr/cidr2009/Paper_133.pdf),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
1. Jessica Kerr: “[Provenance and Causality in Distributed Systems](https://web.archive.org/web/20190425150540/http://blog.jessitron.com/2016/09/provenance-and-causality-in-distributed.html),” *blog.jessitron.com*, September 25, 2016.
1. Kostas Tzoumas: “[Batch Is a Special Case of Streaming](http://data-artisans.com/blog/batch-is-a-special-case-of-streaming/),” *data-artisans.com*, September 15, 2015.
1. Shinji Kim and Robert Blafford: “[Stream Windowing Performance Analysis: Concord and Spark Streaming](https://web.archive.org/web/20180125074821/http://concord.io/posts/windowing_performance_analysis_w_spark_streaming),” *concord.io*, July 6, 2016.
1. Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
1. Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20200730171311/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
1. “[Great Western Railway (1835–1948)](https://web.archive.org/web/20160122155425/https://www.networkrail.co.uk/VirtualArchive/great-western/),” Network Rail Virtual Archive, *networkrail.co.uk*.
1. Jacqueline Xu: “[Online Migrations at Scale](https://stripe.com/blog/online-migrations),” *stripe.com*, February 2, 2017.
1. Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
1. Nathan Marz and James Warren: [*Big Data: Principles and Best Practices of Scalable Real-Time Data Systems*](https://www.manning.com/books/big-data). Manning, 2015. ISBN: 978-1-617-29034-3
1. Oscar Boykin, Sam Ritchie, Ian O'Connell, and Jimmy Lin: “[Summingbird: A Framework for Integrating Batch and Online MapReduce Computations](http://www.vldb.org/pvldb/vol7/p1441-boykin.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
1. Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
1. Raul Castro Fernandez, Peter Pietzuch, Jay Kreps, et al.: “[Liquid: Unifying Nearline and Offline Big Data Integration](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper25u.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Dennis M. Ritchie and Ken Thompson: “[The UNIX Time-Sharing System](http://web.eecs.utk.edu/~qcao1/cs560/papers/paper-unix.pdf),” *Communications of the ACM*, volume 17, number 7, pages 365–375, July 1974. [doi:10.1145/361011.361061](http://dx.doi.org/10.1145/361011.361061)
1. Eric A. Brewer and Joseph M. Hellerstein: “[CS262a: Advanced Topics in Computer Systems](http://people.eecs.berkeley.edu/~brewer/cs262/systemr.html),” lecture notes, University of California, Berkeley, *cs.berkeley.edu*, August 2011.
1. Michael Stonebraker: “[The Case for Polystores](http://wp.sigmod.org/?p=1629),” *wp.sigmod.org*, July 13, 2015.
1. Jennie Duggan, Aaron J. Elmore, Michael Stonebraker, et al.: “[The BigDAWG Polystore System](https://dspace.mit.edu/handle/1721.1/100936),” *ACM SIGMOD Record*, volume 44, number 2, pages 11–16, June 2015. [doi:10.1145/2814710.2814713](http://dx.doi.org/10.1145/2814710.2814713)
1. Patrycja Dybka: “[Foreign Data Wrappers for PostgreSQL](https://web.archive.org/web/20221003115732/https://www.vertabelo.com/blog/foreign-data-wrappers-for-postgresql/),” *vertabelo.com*, March 24, 2015.
1. David B. Lomet, Alan Fekete, Gerhard Weikum, and Mike Zwilling: “[Unbundling Transaction Services in the Cloud](https://www.microsoft.com/en-us/research/publication/unbundling-transaction-services-in-the-cloud/),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
1. Martin Kleppmann and Jay Kreps: “[Kafka, Samza and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/papers/kafka-debull15.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 4, pages 4–14, December 2015.
1. John Hugg: “[Winning Now and in the Future: Where VoltDB Shines](https://voltdb.com/blog/winning-now-and-future-where-voltdb-shines),” *voltdb.com*, March 23, 2016.
1. Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard: “[Differential Dataflow](http://cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf),” at *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
1. Derek G Murray, Frank McSherry, Rebecca Isaacs, et al.: “[Naiad: A Timely Dataflow System](http://sigops.org/s/conferences/sosp/2013/papers/p439-murray.pdf),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), pages 439–455, November 2013. [doi:10.1145/2517349.2522738](http://dx.doi.org/10.1145/2517349.2522738)
1. Gwen Shapira: “[We have a bunch of customers who are implementing ‘database inside-out’ concept and they all ask ‘is anyone else doing it? are we crazy?’](https://twitter.com/gwenshap/status/758800071110430720)” *twitter.com*, July 28, 2016.
1. Martin Kleppmann: “[Turning the Database Inside-out with Apache Samza](http://martin.kleppmann.com/2015/03/04/turning-the-database-inside-out.html),” at *Strange Loop*, September 2014.
1. Peter Van Roy and Seif Haridi: [*Concepts, Techniques, and Models of Computer Programming*](https://www.info.ucl.ac.be/~pvr/book.html). MIT Press, 2004. ISBN: 978-0-262-22069-9
1. “[Juttle Documentation](http://juttle.github.io/juttle/),” *juttle.github.io*, 2016.
1. Evan Czaplicki and Stephen Chong: “[Asynchronous Functional Reactive Programming for GUIs](http://people.seas.harvard.edu/~chong/pubs/pldi13-elm.pdf),” at *34th ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2013. [doi:10.1145/2491956.2462161](http://dx.doi.org/10.1145/2491956.2462161)
1. Engineer Bainomugisha, Andoni Lombide Carreton, Tom van Cutsem, Stijn Mostinckx, and Wolfgang de Meuter: “[A Survey on Reactive Programming](http://soft.vub.ac.be/Publications/2012/vub-soft-tr-12-13.pdf),” *ACM Computing Surveys*, volume 45, number 4, pages 1–34, August 2013. [doi:10.1145/2501654.2501666](http://dx.doi.org/10.1145/2501654.2501666)
1. Peter Alvaro, Neil Conway, Joseph M. Hellerstein, and William R. Marczak: “[Consistency Analysis in Bloom: A CALM and Collected Approach](https://dsf.berkeley.edu/cs286/papers/calm-cidr2011.pdf),” at *5th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2011.
1. Felienne Hermans: “[Spreadsheets Are Code](https://vimeo.com/145492419),” at *Code Mesh*, November 2015.
1. Dan Bricklin and Bob Frankston: “[VisiCalc: Information from Its Creators](http://danbricklin.com/visicalc.htm),” *danbricklin.com*.
1. D. Sculley, Gary Holt, Daniel Golovin, et al.: “[Machine Learning: The High-Interest Credit Card of Technical Debt](http://research.google.com/pubs/pub43146.html),” at *NIPS Workshop on Software Engineering for Machine Learning* (SE4ML), December 2014.
1. Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](http://dx.doi.org/10.1145/2723372.2737784)
1. Guy Steele: “[Re: Need for Macros (Was Re: Icon)](https://people.csail.mit.edu/gregs/ll1-discuss-archive-html/msg01134.html),” email to *ll1-discuss* mailing list, *people.csail.mit.edu*, December 24, 2001.
1. David Gelernter: “[Generative Communication in Linda](http://cseweb.ucsd.edu/groups/csag/html/teaching/cse291s03/Readings/p80-gelernter.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 7, number 1, pages 80–112, January 1985. [doi:10.1145/2363.2433](http://dx.doi.org/10.1145/2363.2433)
1. Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
1. Ben Stopford: “[Microservices in a Streaming World](https://www.infoq.com/presentations/microservices-streaming),” at *QCon London*, March 2016.
1. Christian Posta: “[Why Microservices Should Be Event Driven: Autonomy vs Authority](http://blog.christianposta.com/microservices/why-microservices-should-be-event-driven-autonomy-vs-authority/),” *blog.christianposta.com*, May 27, 2016.
1. Alex Feyerke: “[Say Hello to Offline First](https://web.archive.org/web/20210420014747/http://hood.ie/blog/say-hello-to-offline-first.html),” *hood.ie*, November 5, 2013.
1. Sebastian Burckhardt, Daan Leijen, Jonathan Protzenko, and Manuel Fähndrich: “[Global Sequence Protocol: A Robust Abstraction for Replicated Shared State](http://drops.dagstuhl.de/opus/volltexte/2015/5238/),” at *29th European Conference on Object-Oriented Programming* (ECOOP), July 2015. [doi:10.4230/LIPIcs.ECOOP.2015.568](http://dx.doi.org/10.4230/LIPIcs.ECOOP.2015.568)
1. Mark Soper: “[Clearing Up React Data Management Confusion with Flux, Redux, and Relay](https://medium.com/@marksoper/clearing-up-react-data-management-confusion-with-flux-redux-and-relay-aad504e63cae),” *medium.com*, December 3, 2015.
1. Eno Thereska, Damian Guy, Michael Noll, and Neha Narkhede: “[Unifying Stream Processing and Interactive Queries in Apache Kafka](http://www.confluent.io/blog/unifying-stream-processing-and-interactive-queries-in-apache-kafka/),” *confluent.io*, October 26, 2016.
1. Frank McSherry: “[Dataflow as Database](https://github.com/frankmcsherry/blog/blob/master/posts/2016-07-17.md),” *github.com*, July 17, 2016.
1. Peter Alvaro: “[I See What You Mean](https://www.youtube.com/watch?v=R2Aa4PivG0g),” at *Strange Loop*, September 2015.
1. Nathan Marz: “[Trident: A High-Level Abstraction for Realtime Computation](https://blog.twitter.com/2012/trident-a-high-level-abstraction-for-realtime-computation),” *blog.twitter.com*, August 2, 2012.
1. Edi Bice: “[Low Latency Web Scale Fraud Prevention with Apache Samza, Kafka and Friends](http://www.slideshare.net/edibice/extremely-low-latency-web-scale-fraud-prevention-with-apache-samza-kafka-and-friends),” at *Merchant Risk Council MRC Vegas Conference*, March 2016.
1. Charity Majors: “[The Accidental DBA](https://charity.wtf/2016/10/02/the-accidental-dba/),” *charity.wtf*, October 2, 2016.
1. Arthur J. Bernstein, Philip M. Lewis, and Shiyong Lu: “[Semantic Conditions for Correctness at Different Isolation Levels](http://db.cs.berkeley.edu/cs286/papers/isolation-icde2000.pdf),” at *16th International Conference on Data Engineering* (ICDE), February 2000. [doi:10.1109/ICDE.2000.839387](http://dx.doi.org/10.1109/ICDE.2000.839387)
1. Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. Kyle Kingsbury: [Jepsen blog post series](https://aphyr.com/tags/jepsen), *aphyr.com*, 2013–2016.
1. Michael Jouravlev: “[Redirect After Post](http://www.theserverside.com/news/1365146/Redirect-After-Post),” *theserverside.com*, August 1, 2004.
1. Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-to-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
1. Peter Bailis, Alan Fekete, Michael J. Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
1. Alex Yarmula: “[Strong Consistency in Manhattan](https://blog.twitter.com/2016/strong-consistency-in-manhattan),” *blog.twitter.com*, March 17, 2016.
1. Douglas B Terry, Marvin M Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](http://css.csail.mit.edu/6.824/2014/papers/bayou-conflicts.pdf),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), pages 172–182, December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
1. Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
1. Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
1. Pat Helland: “[Memories, Guesses, and Apologies](https://web.archive.org/web/20160304020907/http://blogs.msdn.com/b/pathelland/archive/2007/05/15/memories-guesses-and-apologies.aspx),” *blogs.msdn.com*, May 15, 2007.
1. Yoongu Kim, Ross Daly, Jeremie Kim, et al.: “[Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf),” at *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.1145/2678373.2665726](http://dx.doi.org/10.1145/2678373.2665726)
1. Mark Seaborn and Thomas Dullien: “[Exploiting the DRAM Rowhammer Bug to Gain Kernel Privileges](https://googleprojectzero.blogspot.co.uk/2015/03/exploiting-dram-rowhammer-bug-to-gain.html),” *googleprojectzero.blogspot.co.uk*, March 9, 2015.
1. Jim N. Gray and Catharine van Ingen: “[Empirical Measurements of Disk Failure Rates and Error Rates](https://www.microsoft.com/en-us/research/publication/empirical-measurements-of-disk-failure-rates-and-error-rates/),” Microsoft Research, MSR-TR-2005-166, December 2005.
1. Annamalai Gurusami and Daniel Price: “[Bug #73170: Duplicates in Unique Secondary Index Because of Fix of Bug#68021](http://bugs.mysql.com/bug.php?id=73170),” *bugs.mysql.com*, July 2014.
1. Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
1. Xiao Chen: “[HDFS DataNode Scanners and Disk Checker Explained](http://blog.cloudera.com/blog/2016/12/hdfs-datanode-scanners-and-disk-checker-explained/),” *blog.cloudera.com*, December 20, 2016.
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
1. Sam Stokes: “[Move Fast with Confidence](http://blog.samstokes.co.uk/blog/2016/07/11/move-fast-with-confidence/),” *blog.samstokes.co.uk*, July 11, 2016.
1. “[Hyperledger Sawtooth documentation](https://web.archive.org/web/20220120211548/https://sawtooth.hyperledger.org/docs/core/releases/latest/introduction.html),” Intel Corporation, *sawtooth.hyperledger.org*, 2017.
1. Richard Gendal Brown: “[Introducing R3 Corda™: A Distributed Ledger Designed for Financial Services](https://gendal.me/2016/04/05/introducing-r3-corda-a-distributed-ledger-designed-for-financial-services/),” *gendal.me*, April 5, 2016.
1. Trent McConaghy, Rodolphe Marques, Andreas Müller, et al.: “[BigchainDB: A Scalable Blockchain Database](https://www.bigchaindb.com/whitepaper/bigchaindb-whitepaper.pdf),” *bigchaindb.com*, June 8, 2016.
1. Ralph C. Merkle: “[A Digital Signature Based on a Conventional Encryption Function](https://people.eecs.berkeley.edu/~raluca/cs261-f15/readings/merkle.pdf),” at *CRYPTO '87*, August 1987. [doi:10.1007/3-540-48184-2_32](http://dx.doi.org/10.1007/3-540-48184-2_32)
1. Ben Laurie: “[Certificate Transparency](http://queue.acm.org/detail.cfm?id=2668154),” *ACM Queue*, volume 12, number 8, pages 10–19, August 2014. [doi:10.1145/2668152.2668154](http://dx.doi.org/10.1145/2668152.2668154)
1. Mark D. Ryan: “[Enhanced Certificate Transparency and End-to-End Encrypted Mail](https://www.ndss-symposium.org/wp-content/uploads/2017/09/12_2_1.pdf),” at *Network and Distributed System Security Symposium* (NDSS), February 2014. [doi:10.14722/ndss.2014.23379](http://dx.doi.org/10.14722/ndss.2014.23379)
1. “[ACM Code of Ethics and Professional Conduct](https://www.acm.org/code-of-ethics),” Association for Computing Machinery, *acm.org*, 2018.
1. François Chollet: “[Software development is starting to involve important ethical choices](https://twitter.com/fchollet/status/792958695722201088),” *twitter.com*, October 30, 2016.
1. Igor Perisic: “[Making Hard Choices: The Quest for Ethics in Machine Learning](https://engineering.linkedin.com/blog/2016/11/making-hard-choices--the-quest-for-ethics-in-machine-learning),” *engineering.linkedin.com*, November 2016.
1. John Naughton: “[Algorithm Writers Need a Code of Conduct](https://www.theguardian.com/commentisfree/2015/dec/06/algorithm-writers-should-have-code-of-conduct),” *theguardian.com*, December 6, 2015.
1. Logan Kugler: “[What Happens When Big Data Blunders?](http://cacm.acm.org/magazines/2016/6/202655-what-happens-when-big-data-blunders/fulltext),” *Communications of the ACM*, volume 59, number 6, pages 15–16, June 2016. [doi:10.1145/2911975](http://dx.doi.org/10.1145/2911975)
1. Bill Davidow: “[Welcome to Algorithmic Prison](http://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/),” *theatlantic.com*, February 20, 2014.
1. Don Peck: “[They're Watching You at Work](http://www.theatlantic.com/magazine/archive/2013/12/theyre-watching-you-at-work/354681/),” *theatlantic.com*, December 2013.
1. Leigh Alexander: “[Is an Algorithm Any Less Racist Than a Human?](https://www.theguardian.com/technology/2016/aug/03/algorithm-racist-human-employers-work)” *theguardian.com*, August 3, 2016.
1. Jesse Emspak: “[How a Machine Learns Prejudice](https://www.scientificamerican.com/article/how-a-machine-learns-prejudice/),” *scientificamerican.com*, December 29, 2016.
1. Maciej Cegłowski: “[The Moral Economy of Tech](http://idlewords.com/talks/sase_panel.htm),” *idlewords.com*, June 2016.
1. Cathy O'Neil: [*Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*](https://web.archive.org/web/20210621234447/https://weaponsofmathdestructionbook.com/). Crown Publishing, 2016. ISBN: 978-0-553-41881-1
1. Julia Angwin: “[Make Algorithms Accountable](http://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html),” *nytimes.com*, August 1, 2016.
1. Bryce Goodman and Seth Flaxman: “[European Union Regulations on Algorithmic Decision-Making and a ‘Right to Explanation’](https://arxiv.org/abs/1606.08813),” *arXiv:1606.08813*, August 31, 2016.
1. “[A Review of the Data Broker Industry: Collection, Use, and Sale of Consumer Data for Marketing Purposes](https://web.archive.org/web/20240619042302/http://educationnewyork.com/files/rockefeller_databroker.pdf),” Staff Report, *United States Senate Committee on Commerce, Science, and Transportation*, *commerce.senate.gov*, December 2013.
1. Olivia Solon: “[Facebook’s Failure: Did Fake News and Polarized Politics Get Trump Elected?](https://www.theguardian.com/technology/2016/nov/10/facebook-fake-news-election-conspiracy-theories)” *theguardian.com*, November 10, 2016.
1. Donella H. Meadows and Diana Wright: *Thinking in Systems: A Primer*. Chelsea Green Publishing, 2008. ISBN: 978-1-603-58055-7
1. Daniel J. Bernstein: “[Listening to a ‘big data’/‘data science’ talk](https://twitter.com/hashbreaker/status/598076230437568512),” *twitter.com*, May 12, 2015.
1. Marc Andreessen: “[Why Software Is Eating the World](http://genius.com/Marc-andreessen-why-software-is-eating-the-world-annotated),” *The Wall Street Journal*, 20 August 2011.
1. J. M. Porup: “[‘Internet of Things’ Security Is Hilariously Broken and Getting Worse](http://arstechnica.com/security/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/),” *arstechnica.com*, January 23, 2016.
1. Bruce Schneier: [*Data and Goliath: The Hidden Battles to Collect Your Data and Control Your World*](https://www.schneier.com/books/data_and_goliath/). W. W. Norton, 2015. ISBN: 978-0-393-35217-7
1. The Grugq: “[Nothing to Hide](https://grugq.tumblr.com/post/142799983558/nothing-to-hide),” *grugq.tumblr.com*, April 15, 2016.
1. Tony Beltramelli: “[Deep-Spying: Spying Using Smartwatch and Deep Learning](https://arxiv.org/abs/1512.05616),” Masters Thesis, IT University of Copenhagen, December 2015. Available at *arxiv.org/abs/1512.05616*
1. Shoshana Zuboff: “[Big Other: Surveillance Capitalism and the Prospects of an Information Civilization](http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2594754),” *Journal of Information Technology*, volume 30, number 1, pages 75–89, April 2015. [doi:10.1057/jit.2015.5](http://dx.doi.org/10.1057/jit.2015.5)
1. Carina C. Zona: “[Consequences of an Insightful Algorithm](https://www.youtube.com/watch?v=YRI40A4tyWU),” at *GOTO Berlin*, November 2016.
1. Bruce Schneier: “[Data Is a Toxic Asset, So Why Not Throw It Out?](https://www.schneier.com/essays/archives/2016/03/data_is_a_toxic_asse.html),” *schneier.com*, March 1, 2016.
1. John E. Dunn: “[The UK’s 15 Most Infamous Data Breaches](https://web.archive.org/web/20161120070058/http://www.techworld.com/security/uks-most-infamous-data-breaches-2016-3604586/),” *techworld.com*, November 18, 2016.
1. Cory Scott: “[Data is not toxic - which implies no benefit - but rather hazardous material, where we must balance need vs. want](https://twitter.com/cory_scott/status/706586399483437056),” *twitter.com*, March 6, 2016.
1. Bruce Schneier: “[Mission Creep: When Everything Is Terrorism](https://www.schneier.com/essays/archives/2013/07/mission_creep_when_e.html),” *schneier.com*, July 16, 2013.
1. Lena Ulbricht and Maximilian von Grafenstein: “[Big Data: Big Power Shifts?](http://policyreview.info/articles/analysis/big-data-big-power-shifts),” *Internet Policy Review*, volume 5, number 1, March 2016. [doi:10.14763/2016.1.406](http://dx.doi.org/10.14763/2016.1.406)
1. Ellen P. Goodman and Julia Powles: “[Facebook and Google: Most Powerful and Secretive Empires We've Ever Known](https://www.theguardian.com/technology/2016/sep/28/google-facebook-powerful-secretive-empire-transparency),” *theguardian.com*, September 28, 2016.
1. [Directive 95/46/EC on the protection of individuals with regard to the processing of personal data and on the free movement of such data](http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:31995L0046), Official Journal of the European Communities No. L 281/31, *eur-lex.europa.eu*, November 1995.
1. Brendan Van Alsenoy: “[Regulating Data Protection: The Allocation of Responsibility and Risk Among Actors Involved in Personal Data Processing](https://lirias.kuleuven.be/handle/123456789/545027),” Thesis, KU Leuven Centre for IT and IP Law, August 2016.
1. Michiel Rhoen: “[Beyond Consent: Improving Data Protection Through Consumer Protection Law](http://policyreview.info/articles/analysis/beyond-consent-improving-data-protection-through-consumer-protection-law),” *Internet Policy Review*, volume 5, number 1, March 2016. [doi:10.14763/2016.1.404](http://dx.doi.org/10.14763/2016.1.404)
1. Jessica Leber: “[Your Data Footprint Is Affecting Your Life in Ways You Can’t Even Imagine](https://www.fastcoexist.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine),” *fastcoexist.com*, March 15, 2016.
1. Maciej Cegłowski: “[Haunted by Data](http://idlewords.com/talks/haunted_by_data.htm),” *idlewords.com*, October 2015.
1. Sam Thielman: “[You Are Not What You Read: Librarians Purge User Data to Protect Privacy](https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy),” *theguardian.com*, January 13, 2016.
1. Conor Friedersdorf: “[Edward Snowden’s Other Motive for Leaking](http://www.theatlantic.com/politics/archive/2014/05/edward-snowdens-other-motive-for-leaking/370068/),” *theatlantic.com*, May 13, 2014.
1. Phillip Rogaway: “[The Moral Character of Cryptographic Work](http://web.cs.ucdavis.edu/~rogaway/papers/moral-fn.pdf),” Cryptology ePrint 2015/1162, December 2015.


================================================
FILE: content/v1/ch2.md
================================================
---
title: "第二章：数据模型与查询语言"
linkTitle: "2. 数据模型与查询语言"
weight: 102
math: true
breadcrumbs: false
---


![](/map/ch02.png)

> 语言的边界就是思想的边界。
>
> —— 路德维希・维特根斯坦，《逻辑哲学论》（1922）


数据模型可能是软件开发中最重要的部分了，因为它们的影响如此深远：不仅仅影响着软件的编写方式，而且影响着我们的 **解题思路**。

多数应用使用层层叠加的数据模型构建。对于每层数据模型的关键问题是：它是如何用低一层数据模型来 **表示** 的？例如：

1. 作为一名应用开发人员，你观察现实世界（里面有人员、组织、货物、行为、资金流向、传感器等），并采用对象或数据结构，以及操控那些数据结构的 API 来进行建模。那些结构通常是特定于应用程序的。
2. 当要存储那些数据结构时，你可以利用通用数据模型来表示它们，如 JSON 或 XML 文档、关系数据库中的表或图模型。
3. 数据库软件的工程师选定如何以内存、磁盘或网络上的字节来表示 JSON / XML/ 关系 / 图数据。这类表示形式使数据有可能以各种方式来查询，搜索，操纵和处理。
4. 在更低的层次上，硬件工程师已经想出了使用电流、光脉冲、磁场或者其他东西来表示字节的方法。

一个复杂的应用程序可能会有更多的中间层次，比如基于 API 的 API，不过基本思想仍然是一样的：每个层都通过提供一个明确的数据模型来隐藏更低层次中的复杂性。这些抽象允许不同的人群有效地协作（例如数据库厂商的工程师和使用数据库的应用程序开发人员）。

数据模型种类繁多，每个数据模型都带有如何使用的设想。有些用法很容易，有些则不支持如此；有些操作运行很快，有些则表现很差；有些数据转换非常自然，有些则很麻烦。

掌握一个数据模型需要花费很多精力（想想关系数据建模有多少本书）。即便只使用一个数据模型，不用操心其内部工作机制，构建软件也是非常困难的。然而，因为数据模型对上层软件的功能（能做什么，不能做什么）有着至深的影响，所以选择一个适合的数据模型是非常重要的。

在本章中，我们将研究一系列用于数据存储和查询的通用数据模型（前面列表中的第 2 点）。特别地，我们将比较关系模型，文档模型和少量基于图形的数据模型。我们还将查看各种查询语言并比较它们的用例。在 [第三章](/v1/ch3) 中，我们将讨论存储引擎是如何工作的。也就是说，这些数据模型实际上是如何实现的（列表中的第 3 点）。


## 关系模型与文档模型

现在最著名的数据模型可能是 SQL。它基于 Edgar Codd 在 1970 年提出的关系模型【1】：数据被组织成 **关系**（SQL 中称作 **表**），其中每个关系是 **元组**（SQL 中称作 **行**）的无序集合。

关系模型曾是一个理论性的提议，当时很多人都怀疑是否能够有效实现它。然而到了 20 世纪 80 年代中期，关系数据库管理系统（RDBMSes）和 SQL 已成为大多数人们存储和查询某些常规结构的数据的首选工具。关系数据库已经持续称霸了大约 25~30 年 —— 这对计算机史来说是极其漫长的时间。

关系数据库起源于商业数据处理，在 20 世纪 60 年代和 70 年代用大型计算机来执行。从今天的角度来看，那些用例显得很平常：典型的 **事务处理**（将销售或银行交易，航空公司预订，库存管理信息记录在库）和 **批处理**（客户发票，工资单，报告）。

当时的其他数据库迫使应用程序开发人员必须考虑数据库内部的数据表示形式。关系模型致力于将上述实现细节隐藏在更简洁的接口之后。

多年来，在数据存储和查询方面存在着许多相互竞争的方法。在 20 世纪 70 年代和 80 年代初，网状模型（network model）和层次模型（hierarchical model）曾是主要的选择，但关系模型（relational model）随后占据了主导地位。对象数据库在 20 世纪 80 年代末和 90 年代初来了又去。XML 数据库在二十一世纪初出现，但只有小众采用过。关系模型的每个竞争者都在其时代产生了大量的炒作，但从来没有持续【2】。

随着电脑越来越强大和互联，它们开始用于日益多样化的目的。关系数据库非常成功地被推广到业务数据处理的原始范围之外更为广泛的用例上。你今天在网上看到的大部分内容依旧是由关系数据库来提供支持，无论是在线发布、讨论、社交网络、电子商务、游戏、软件即服务生产力应用程序等内容。

### NoSQL 的诞生

现在 - 2010 年代，NoSQL 开始了最新一轮尝试，试图推翻关系模型的统治地位。“NoSQL” 这个名字让人遗憾，因为实际上它并没有涉及到任何特定的技术。最初它只是作为一个醒目的 Twitter 标签，用在 2009 年一次关于开源、分布式、非关系数据库的聚会上。无论如何，这个术语触动了某些神经，并迅速在网络创业社区内外传播开来。好些有趣的数据库系统现在都与 *#NoSQL* 标签相关联，并且 NoSQL 被追溯性地重新解释为 **不仅是 SQL（Not Only SQL）** 【4】。

采用 NoSQL 数据库的背后有几个驱动因素，其中包括：

* 需要比关系数据库更好的可伸缩性，包括非常大的数据集或非常高的写入吞吐量
* 相比商业数据库产品，免费和开源软件更受偏爱
* 关系模型不能很好地支持一些特殊的查询操作
* 受挫于关系模型的限制性，渴望一种更具动态性与表现力的数据模型【5】

不同的应用程序有不同的需求，一个用例的最佳技术选择可能不同于另一个用例的最佳技术选择。因此，在可预见的未来，关系数据库似乎可能会继续与各种非关系数据库一起使用 - 这种想法有时也被称为 **混合持久化（polyglot persistence）**。

### 对象关系不匹配

目前大多数应用程序开发都使用面向对象的编程语言来开发，这导致了对 SQL 数据模型的普遍批评：如果数据存储在关系表中，那么需要一个笨拙的转换层，处于应用程序代码中的对象和表，行，列的数据库模型之间。模型之间的不连贯有时被称为 **阻抗不匹配（impedance mismatch）**[^i]。

[^i]: 一个从电子学借用的术语。每个电路的输入和输出都有一定的阻抗（交流电阻）。当你将一个电路的输出连接到另一个电路的输入时，如果两个电路的输出和输入阻抗匹配，则连接上的功率传输将被最大化。阻抗不匹配会导致信号反射及其他问题。

像 ActiveRecord 和 Hibernate 这样的 **对象关系映射（ORM object-relational mapping）** 框架可以减少这个转换层所需的样板代码的数量，但是它们不能完全隐藏这两个模型之间的差异。

![](/v1/ddia_0201.png)

**图 2-1 使用关系型模式来表示领英简介**

例如，[图 2-1](/v1/ddia_0201.png) 展示了如何在关系模式中表示简历（一个 LinkedIn 简介）。整个简介可以通过一个唯一的标识符 `user_id` 来标识。像 `first_name` 和 `last_name` 这样的字段每个用户只出现一次，所以可以在 User 表上将其建模为列。但是，大多数人在职业生涯中拥有多于一份的工作，人们可能有不同数量的教育经历和任意数量的联系信息。从用户到这些项目之间存在一对多的关系，可以用多种方式来表示：

* 传统 SQL 模型（SQL：1999 之前）中，最常见的规范化表示形式是将职位，教育和联系信息放在单独的表中，对 User 表提供外键引用，如 [图 2-1](/v1/ddia_0201.png) 所示。
* 后续的 SQL 标准增加了对结构化数据类型和 XML 数据的支持；这允许将多值数据存储在单行内，并支持在这些文档内查询和索引。这些功能在 Oracle，IBM DB2，MS SQL Server 和 PostgreSQL 中都有不同程度的支持【6,7】。JSON 数据类型也得到多个数据库的支持，包括 IBM DB2，MySQL 和 PostgreSQL 【8】。
* 第三种选择是将职业，教育和联系信息编码为 JSON 或 XML 文档，将其存储在数据库的文本列中，并让应用程序解析其结构和内容。这种配置下，通常不能使用数据库来查询该编码列中的值。

对于一个像简历这样自包含文档的数据结构而言，JSON 表示是非常合适的：请参阅 [例 2-1]()。JSON 比 XML 更简单。面向文档的数据库（如 MongoDB 【9】，RethinkDB 【10】，CouchDB 【11】和 Espresso【12】）支持这种数据模型。

**例 2-1. 用 JSON 文档表示一个 LinkedIn 简介**

```json
{
  "user_id": 251,
  "first_name": "Bill",
  "last_name": "Gates",
  "summary": "Co-chair of the Bill & Melinda Gates... Active blogger.",
  "region_id": "us:91",
  "industry_id": 131,
  "photo_url": "/p/7/000/253/05b/308dd6e.jpg",
  "positions": [
    {
      "job_title": "Co-chair",
      "organization": "Bill & Melinda Gates Foundation"
    },
    {
      "job_title": "Co-founder, Chairman",
      "organization": "Microsoft"
    }
  ],
  "education": [
    {
      "school_name": "Harvard University",
      "start": 1973,
      "end": 1975
    },
    {
      "school_name": "Lakeside School, Seattle",
      "start": null,
      "end": null
    }
  ],
  "contact_info": {
    "blog": "http://thegatesnotes.com",
    "twitter": "http://twitter.com/BillGates"
  }
}
```

有一些开发人员认为 JSON 模型减少了应用程序代码和存储层之间的阻抗不匹配。不过，正如我们将在 [第四章](/v1/ch4) 中看到的那样，JSON 作为数据编码格式也存在问题。无模式对 JSON 模型来说往往被认为是一个优势；我们将在 “[文档模型中的模式灵活性](#文档模型中的模式灵活性)” 中讨论这个问题。

JSON 表示比 [图 2-1](/v1/ddia_0201.png) 中的多表模式具有更好的 **局部性（locality）**。如果在前面的关系型示例中获取简介，那需要执行多个查询（通过 `user_id` 查询每个表），或者在 User 表与其下属表之间混乱地执行多路连接。而在 JSON 表示中，所有相关信息都在同一个地方，一个查询就足够了。

从用户简介文件到用户职位，教育历史和联系信息，这种一对多关系隐含了数据中的一个树状结构，而 JSON 表示使得这个树状结构变得明确（见 [图 2-2](/v1/ddia_0202.png)）。

![](/v1/ddia_0202.png)

**图 2-2 一对多关系构建了一个树结构**

### 多对一和多对多的关系

在上一节的 [例 2-1]() 中，`region_id` 和 `industry_id` 是以 ID，而不是纯字符串 “Greater Seattle Area” 和 “Philanthropy” 的形式给出的。为什么？

如果用户界面用一个自由文本字段来输入区域和行业，那么将他们存储为纯文本字符串是合理的。另一方式是给出地理区域和行业的标准化的列表，并让用户从下拉列表或自动填充器中进行选择，其优势如下：

* 各个简介之间样式和拼写统一
* 避免歧义（例如，如果有几个同名的城市）
* 易于更新 —— 名称只存储在一个地方，如果需要更改（例如，由于政治事件而改变城市名称），很容易进行全面更新。
* 本地化支持 —— 当网站翻译成其他语言时，标准化的列表可以被本地化，使得地区和行业可以使用用户的语言来显示
* 更好的搜索 —— 例如，搜索华盛顿州的慈善家就会匹配这份简介，因为地区列表可以编码记录西雅图在华盛顿这一事实（从 “Greater Seattle Area” 这个字符串中看不出来）

存储 ID 还是文本字符串，这是个 **副本（duplication）** 问题。当使用 ID 时，对人类有意义的信息（比如单词：Philanthropy）只存储在一处，所有引用它的地方使用 ID（ID 只在数据库中有意义）。当直接存储文本时，对人类有意义的信息会复制在每处使用记录中。

使用 ID 的好处是，ID 对人类没有任何意义，因而永远不需要改变：ID 可以保持不变，即使它标识的信息发生变化。任何对人类有意义的东西都可能需要在将来某个时候改变 —— 如果这些信息被复制，所有的冗余副本都需要更新。这会导致写入开销，也存在不一致的风险（一些副本被更新了，还有些副本没有被更新）。去除此类重复是数据库 **规范化（normalization）** 的关键思想。[^ii]

[^ii]: 关于关系模型的文献区分了几种不同的规范形式，但这些区别几乎没有实际意义。一个经验法则是，如果重复存储了可以存储在一个地方的值，则模式就不是 **规范化（normalized）** 的。

> 数据库管理员和开发人员喜欢争论规范化和非规范化，让我们暂时保留判断吧。在本书的 [第三部分](/v1/part-iii)，我们将回到这个话题，探讨系统的方法用以处理缓存，非规范化和衍生数据。

不幸的是，对这些数据进行规范化需要多对一的关系（许多人生活在一个特定的地区，许多人在一个特定的行业工作），这与文档模型不太吻合。在关系数据库中，通过 ID 来引用其他表中的行是正常的，因为连接很容易。在文档数据库中，一对多树结构没有必要用连接，对连接的支持通常很弱 [^iii]。

[^iii]: 在撰写本文时，RethinkDB 支持连接，MongoDB 不支持连接，而 CouchDB 只支持预先声明的视图。

如果数据库本身不支持连接，则必须在应用程序代码中通过对数据库进行多个查询来模拟连接。（在这种情况中，地区和行业的列表可能很小，改动很少，应用程序可以简单地将其保存在内存中。不过，执行连接的工作从数据库被转移到应用程序代码上。）

此外，即便应用程序的最初版本适合无连接的文档模型，随着功能添加到应用程序中，数据会变得更加互联。例如，考虑一下对简历例子进行的一些修改：

组织和学校作为实体
: 在前面的描述中，`organization`（用户工作的公司）和 `school_name`（他们学习的地方）只是字符串。也许他们应该是对实体的引用呢？然后，每个组织、学校或大学都可以拥有自己的网页（标识、新闻提要等）。每个简历可以链接到它所提到的组织和学校，并且包括他们的图标和其他信息（请参阅 [图 2-3](/v1/ddia_0203.png)，来自 LinkedIn 的一个例子）。

推荐
: 假设你想添加一个新的功能：一个用户可以为另一个用户写一个推荐。在用户的简历上显示推荐，并附上推荐用户的姓名和照片。如果推荐人更新他们的照片，那他们写的任何推荐都需要显示新的照片。因此，推荐应该拥有作者个人简介的引用。

![](/v1/ddia_0203.png)

**图 2-3 公司名不仅是字符串，还是一个指向公司实体的链接（LinkedIn 截图）**

[图 2-4](/v1/ddia_0204.png) 阐明了这些新功能需要如何使用多对多关系。每个虚线矩形内的数据可以分组成一个文档，但是对单位，学校和其他用户的引用需要表示成引用，并且在查询时需要连接。

![](/v1/ddia_0204.png)

**图 2-4 使用多对多关系扩展简历**

### 文档数据库是否在重蹈覆辙？

在多对多的关系和连接已常规用在关系数据库时，文档数据库和 NoSQL 重启了辩论：如何以最佳方式在数据库中表示多对多关系。那场辩论可比 NoSQL 古老得多，事实上，最早可以追溯到计算机化数据库系统。

20 世纪 70 年代最受欢迎的业务数据处理数据库是 IBM 的信息管理系统（IMS），最初是为了阿波罗太空计划的库存管理而开发的，并于 1968 年有了首次商业发布【13】。目前它仍在使用和维护，运行在 IBM 大型机的 OS/390 上【14】。

IMS 的设计中使用了一个相当简单的数据模型，称为 **层次模型（hierarchical model）**，它与文档数据库使用的 JSON 模型有一些惊人的相似之处【2】。它将所有数据表示为嵌套在记录中的记录树，这很像 [图 2-2](/v1/ddia_0202.png) 的 JSON 结构。

同文档数据库一样，IMS 能良好处理一对多的关系，但是很难应对多对多的关系，并且不支持连接。开发人员必须决定是否复制（非规范化）数据或手动解决从一个记录到另一个记录的引用。这些二十世纪六七十年代的问题与现在开发人员遇到的文档数据库问题非常相似【15】。

那时人们提出了各种不同的解决方案来解决层次模型的局限性。其中最突出的两个是 **关系模型**（relational model，它变成了 SQL，并统治了世界）和 **网状模型**（network model，最初很受关注，但最终变得冷门）。这两个阵营之间的 “大辩论” 在 70 年代持续了很长时间【2】。

那两个模型解决的问题与当前的问题相关，因此值得简要回顾一下那场辩论。

#### 网状模型

网状模型由一个称为数据系统语言会议（CODASYL）的委员会进行了标准化，并被数个不同的数据库厂商实现；它也被称为 CODASYL 模型【16】。

CODASYL 模型是层次模型的推广。在层次模型的树结构中，每条记录只有一个父节点；在网状模型中，每条记录可能有多个父节点。例如，“Greater Seattle Area” 地区可能是一条记录，每个居住在该地区的用户都可以与之相关联。这允许对多对一和多对多的关系进行建模。

网状模型中记录之间的链接不是外键，而更像编程语言中的指针（同时仍然存储在磁盘上）。访问记录的唯一方法是跟随从根记录起沿这些链路所形成的路径。这被称为 **访问路径（access path）**。

最简单的情况下，访问路径类似遍历链表：从列表头开始，每次查看一条记录，直到找到所需的记录。但在多对多关系的情况中，数条不同的路径可以到达相同的记录，网状模型的程序员必须跟踪这些不同的访问路径。

CODASYL 中的查询是通过遍历记录列表和跟随访问路径，在数据库中移动游标来执行的。如果记录有多个父节点（即多个来自其他记录的传入指针），则应用程序代码必须跟踪所有的各种关系。甚至 CODASYL 委员会成员也承认，这就像在 n 维数据空间中进行导航【17】。

尽管手动选择访问路径能够最有效地利用 20 世纪 70 年代非常有限的硬件功能（如磁带驱动器，其搜索速度非常慢），但这使得查询和更新数据库的代码变得复杂不灵活。无论是分层还是网状模型，如果你没有所需数据的路径，就会陷入困境。你可以改变访问路径，但是必须浏览大量手写数据库查询代码，并重写来处理新的访问路径。更改应用程序的数据模型是很难的。

#### 关系模型

相比之下，关系模型做的就是将所有的数据放在光天化日之下：一个 **关系（表）** 只是一个 **元组（行）** 的集合，仅此而已。如果你想读取数据，它没有迷宫似的嵌套结构，也没有复杂的访问路径。你可以选中符合任意条件的行，读取表中的任何或所有行。你可以通过指定某些列作为匹配关键字来读取特定行。你可以在任何表中插入一个新的行，而不必担心与其他表的外键关系 [^iv]。

[^iv]: 外键约束允许对修改进行限制，但对于关系模型这并不是必选项。即使有约束，外键连接在查询时执行，而在 CODASYL 中，连接在插入时高效完成。

在关系数据库中，查询优化器自动决定查询的哪些部分以哪个顺序执行，以及使用哪些索引。这些选择实际上是 “访问路径”，但最大的区别在于它们是由查询优化器自动生成的，而不是由程序员生成，所以我们很少需要考虑它们。

如果想按新的方式查询数据，你可以声明一个新的索引，查询会自动使用最合适的那些索引。无需更改查询来利用新的索引（请参阅 “[数据查询语言](#数据查询语言)”）。关系模型因此使添加应用程序新功能变得更加容易。

关系数据库的查询优化器是复杂的，已耗费了多年的研究和开发精力【18】。关系模型的一个关键洞察是：只需构建一次查询优化器，随后使用该数据库的所有应用程序都可以从中受益。如果你没有查询优化器的话，那么为特定查询手动编写访问路径比编写通用优化器更容易 —— 不过从长期看通用解决方案更好。

#### 与文档数据库相比

在一个方面，文档数据库还原为层次模型：在其父记录中存储嵌套记录（[图 2-1](/v1/ddia_0201.png) 中的一对多关系，如 `positions`，`education` 和 `contact_info`），而不是在单独的表中。

但是，在表示多对一和多对多的关系时，关系数据库和文档数据库并没有根本的不同：在这两种情况下，相关项目都被一个唯一的标识符引用，这个标识符在关系模型中被称为 **外键**，在文档模型中称为 **文档引用**【9】。该标识符在读取时通过连接或后续查询来解析。迄今为止，文档数据库没有走 CODASYL 的老路。

### 关系型数据库与文档数据库在今日的对比

将关系数据库与文档数据库进行比较时，可以考虑许多方面的差异，包括它们的容错属性（请参阅 [第五章](/v1/ch5)）和处理并发性（请参阅 [第七章](/v1/ch7)）。本章将只关注数据模型中的差异。

支持文档数据模型的主要论据是架构灵活性，因局部性而拥有更好的性能，以及对于某些应用程序而言更接近于应用程序使用的数据结构。关系模型通过为连接提供更好的支持以及支持多对一和多对多的关系来反击。

#### 哪种数据模型更有助于简化应用代码？

如果应用程序中的数据具有类似文档的结构（即，一对多关系树，通常一次性加载整个树），那么使用文档模型可能是一个好主意。将类似文档的结构分解成多个表（如 [图 2-1](/v1/ddia_0201.png) 中的 `positions`、`education` 和 `contact_info`）的关系技术可能导致繁琐的模式和不必要的复杂的应用程序代码。

文档模型有一定的局限性：例如，不能直接引用文档中的嵌套的项目，而是需要说 “用户 251 的职位列表中的第二项”（很像层次模型中的访问路径）。但是，只要文档嵌套不太深，这通常不是问题。

文档数据库对连接的糟糕支持可能是个问题，也可能不是问题，这取决于应用程序。例如，如果某分析型应用程序使用一个文档数据库来记录何时何地发生了何事，那么多对多关系可能永远也用不上【19】。

但如果你的应用程序确实会用到多对多关系，那么文档模型就没有那么诱人了。尽管可以通过反规范化来消除对连接的需求，但这需要应用程序代码来做额外的工作以确保数据一致性。尽管应用程序代码可以通过向数据库发出多个请求的方式来模拟连接，但这也将复杂性转移到应用程序中，而且通常也会比由数据库内的专用代码更慢。在这种情况下，使用文档模型可能会导致更复杂的应用代码与更差的性能【15】。

我们没有办法说哪种数据模型更有助于简化应用代码，因为它取决于数据项之间的关系种类。对高度关联的数据而言，文档模型是极其糟糕的，关系模型是可以接受的，而选用图形模型（请参阅 “[图数据模型](#图数据模型)”）是最自然的。

#### 文档模型中的模式灵活性

大多数文档数据库以及关系数据库中的 JSON 支持都不会强制文档中的数据采用何种模式。关系数据库的 XML 支持通常带有可选的模式验证。没有模式意味着可以将任意的键和值添加到文档中，并且当读取时，客户端无法保证文档可能包含的字段。

文档数据库有时称为 **无模式（schemaless）**，但这具有误导性，因为读取数据的代码通常假定某种结构 —— 即存在隐式模式，但不由数据库强制执行【20】。一个更精确的术语是 **读时模式**（即 schema-on-read，数据的结构是隐含的，只有在数据被读取时才被解释），相应的是 **写时模式**（即 schema-on-write，传统的关系数据库方法中，模式明确，且数据库确保所有的数据都符合其模式）【21】。

读时模式类似于编程语言中的动态（运行时）类型检查，而写时模式类似于静态（编译时）类型检查。就像静态和动态类型检查的相对优点具有很大的争议性一样【22】，数据库中模式的强制性是一个具有争议的话题，一般来说没有正确或错误的答案。

在应用程序想要改变其数据格式的情况下，这些方法之间的区别尤其明显。例如，假设你把每个用户的全名存储在一个字段中，而现在想分别存储名字和姓氏【23】。在文档数据库中，只需开始写入具有新字段的新文档，并在应用程序中使用代码来处理读取旧文档的情况。例如：

```js
if (user && user.name && !user.first_name) {
  // Documents written before Dec 8, 2013 don't have first_name
  user.first_name = user.name.split(" ")[0];
}
```

另一方面，在 “静态类型” 数据库模式中，通常会执行以下 **迁移（migration）** 操作：

```sql
ALTER TABLE users ADD COLUMN first_name text;
UPDATE users SET first_name = split_part(name, ' ', 1);      -- PostgreSQL
UPDATE users SET first_name = substring_index(name, ' ', 1);      -- MySQL
```

模式变更因速度慢且需要停机而名声不佳。这种坏名声并不是完全应得的：大多数关系数据库系统可在几毫秒内执行 `ALTER TABLE` 语句。MySQL 是一个值得注意的例外，它执行 `ALTER TABLE` 时会复制整个表，这可能意味着在更改一个大型表时会花费几分钟甚至几个小时的停机时间，尽管存在各种工具来解决这个限制【24,25,26】。

大型表上运行 `UPDATE` 语句在任何数据库上都可能会很慢，因为每一行都需要重写。要是不可接受的话，应用程序可以将 `first_name` 设置为默认值 `NULL`，并在读取时再填充，就像使用文档数据库一样。

当由于某种原因（例如，数据是异构的）集合中的项目并不都具有相同的结构时，读时模式更具优势。例如，如果：

* 存在许多不同类型的对象，将每种类型的对象放在自己的表中是不现实的。
* 数据的结构由外部系统决定。你无法控制外部系统且它随时可能变化。

在上述情况下，模式的坏处远大于它的帮助，无模式文档可能是一个更加自然的数据模型。但是，要是所有记录都具有相同的结构，那么模式是记录并强制这种结构的有效机制。第四章将更详细地讨论模式和模式演化。

#### 查询的数据局部性

文档通常以单个连续字符串形式进行存储，编码为 JSON、XML 或其二进制变体（如 MongoDB 的 BSON）。如果应用程序经常需要访问整个文档（例如，将其渲染至网页），那么存储局部性会带来性能优势。如果将数据分割到多个表中（如 [图 2-1](/v1/ddia_0201.png) 所示），则需要进行多次索引查找才能将其全部检索出来，这可能需要更多的磁盘查找并花费更多的时间。

局部性仅仅适用于同时需要文档绝大部分内容的情况。即使只访问文档其中的一小部分，数据库通常需要加载整个文档，对于大型文档来说这种加载行为是很浪费的。更新文档时，通常需要整个重写。只有不改变文档大小的修改才可以容易地原地执行。因此，通常建议保持相对小的文档，并避免增加文档大小的写入【9】。这些性能限制大大减少了文档数据库的实用场景。

值得指出的是，为了局部性而分组集合相关数据的想法并不局限于文档模型。例如，Google 的 Spanner 数据库在关系数据模型中提供了同样的局部性属性，允许模式声明一个表的行应该交错（嵌套）在父表内【27】。Oracle 类似地允许使用一个称为 **多表索引集群表（multi-table index cluster tables）** 的类似特性【28】。Bigtable 数据模型（用于 Cassandra 和 HBase）中的 **列族（column-family）** 概念与管理局部性的目的类似【29】。

在 [第三章](/v1/ch3) 将还会看到更多关于局部性的内容。

#### 文档和关系数据库的融合

自 2000 年代中期以来，大多数关系数据库系统（MySQL 除外）都已支持 XML。这包括对 XML 文档进行本地修改的功能，以及在 XML 文档中进行索引和查询的功能。这允许应用程序使用那种与文档数据库应当使用的非常类似的数据模型。

从 9.3 版本开始的 PostgreSQL 【8】，从 5.7 版本开始的 MySQL 以及从版本 10.5 开始的 IBM DB2【30】也对 JSON 文档提供了类似的支持级别。鉴于用在 Web APIs 的 JSON 流行趋势，其他关系数据库很可能会跟随他们的脚步并添加 JSON 支持。

在文档数据库中，RethinkDB 在其查询语言中支持类似关系的连接，一些 MongoDB 驱动程序可以自动解析数据库引用（有效地执行客户端连接，尽管这可能比在数据库中执行的连接慢，需要额外的网络往返，并且优化更少）。

随着时间的推移，关系数据库和文档数据库似乎变得越来越相似，这是一件好事：数据模型相互补充 [^v]，如果一个数据库能够处理类似文档的数据，并能够对其执行关系查询，那么应用程序就可以使用最符合其需求的功能组合。

关系模型和文档模型的混合是未来数据库一条很好的路线。

[^v]: Codd 对关系模型【1】的原始描述实际上允许在关系模式中存在与 JSON 文档非常相似的东西。他称之为 **非简单域（nonsimple domains）**。这个想法是，一行中的值不一定是一个像数字或字符串一样的原始数据类型，也可以是一个嵌套的关系（表），因此可以把一个任意嵌套的树结构作为一个值，这很像 30 年后添加到 SQL 中的 JSON 或 XML 支持。


## 数据查询语言

当引入关系模型时，关系模型包含了一种查询数据的新方法：SQL 是一种 **声明式** 查询语言，而 IMS 和 CODASYL 使用 **命令式** 代码来查询数据库。那是什么意思？

许多常用的编程语言是命令式的。例如，给定一个动物物种的列表，返回列表中的鲨鱼可以这样写：

```js
function getSharks() {
    var sharks = [];
    for (var i = 0; i < animals.length; i++) {
        if (animals[i].family === "Sharks") {
            sharks.push(animals[i]);
        }
    }
    return sharks;
}
```

而在关系代数中，你可以这样写：

$$
sharks = \sigma_{family = "Sharks"}(animals)
$$

其中 $\sigma$（希腊字母西格玛）是选择操作符，只返回符合 `family = "Sharks"` 条件的动物。

定义 SQL 时，它紧密地遵循关系代数的结构：

```sql
SELECT * FROM animals WHERE family ='Sharks';
```

命令式语言告诉计算机以特定顺序执行某些操作。可以想象一下，逐行地遍历代码，评估条件，更新变量，并决定是否再循环一遍。

在声明式查询语言（如 SQL 或关系代数）中，你只需指定所需数据的模式 - 结果必须符合哪些条件，以及如何将数据转换（例如，排序，分组和聚合） - 但不是如何实现这一目标。数据库系统的查询优化器决定使用哪些索引和哪些连接方法，以及以何种顺序执行查询的各个部分。

声明式查询语言是迷人的，因为它通常比命令式 API 更加简洁和容易。但更重要的是，它还隐藏了数据库引擎的实现细节，这使得数据库系统可以在无需对查询做任何更改的情况下进行性能提升。

例如，在本节开头所示的命令代码中，动物列表以特定顺序出现。如果数据库想要在后台回收未使用的磁盘空间，则可能需要移动记录，这会改变动物出现的顺序。数据库能否安全地执行，而不会中断查询？

SQL 示例不确保任何特定的顺序，因此不在意顺序是否改变。但是如果查询用命令式的代码来写的话，那么数据库就永远不可能确定代码是否依赖于排序。SQL 相当有限的功能性为数据库提供了更多自动优化的空间。

最后，声明式语言往往适合并行执行。现在，CPU 的速度通过核心（core）的增加变得更快，而不是以比以前更高的时钟速度运行【31】。命令代码很难在多个核心和多个机器之间并行化，因为它指定了指令必须以特定顺序执行。声明式语言更具有并行执行的潜力，因为它们仅指定结果的模式，而不指定用于确定结果的算法。在适当情况下，数据库可以自由使用查询语言的并行实现【32】。

### Web 上的声明式查询

声明式查询语言的优势不仅限于数据库。为了说明这一点，让我们在一个完全不同的环境中比较声明式和命令式方法：一个 Web 浏览器。

假设你有一个关于海洋动物的网站。用户当前正在查看鲨鱼页面，因此你将当前所选的导航项目 “鲨鱼” 标记为当前选中项目。

```html
<ul>
    <li class="selected">
        <p>Sharks</p>
        <ul>
            <li>Great White Shark</li>
            <li>Tiger Shark</li>
            <li>Hammerhead Shark</li>
        </ul>
    </li>
    <li><p>Whales</p>
        <ul>
            <li>Blue Whale</li>
            <li>Humpback Whale</li>
            <li>Fin Whale</li>
        </ul>
    </li>
</ul>
```

现在想让当前所选页面的标题具有一个蓝色的背景，以便在视觉上突出显示。使用 CSS 实现起来非常简单：

```css
li.selected > p {
  background-color: blue;
}
```

这里的 CSS 选择器 `li.selected > p` 声明了我们想要应用蓝色样式的元素的模式：即其直接父元素是具有 CSS 类 `selected` 的 `<li>` 元素的所有 `<p>` 元素。示例中的元素 `<p>Sharks</p>` 匹配此模式，但 `<p>Whales</p>` 不匹配，因为其 `<li>` 父元素缺少 `class="selected"`。

如果使用 XSL 而不是 CSS，你可以做类似的事情：

```xml
<xsl:template match="li[@class='selected']/p">
    <fo:block background-color="blue">
        <xsl:apply-templates/>
    </fo:block>
</xsl:template>
```

这里的 XPath 表达式 `li[@class='selected']/p` 相当于上例中的 CSS 选择器 `li.selected > p`。CSS 和 XSL 的共同之处在于，它们都是用于指定文档样式的声明式语言。

想象一下，必须使用命令式方法的情况会是如何。在 JavaScript 中，使用 **文档对象模型（DOM）** API，其结果可能如下所示：

```js
var liElements = document.getElementsByTagName("li");
for (var i = 0; i < liElements.length; i++) {
    if (liElements[i].className === "selected") {
        var children = liElements[i].childNodes;
        for (var j = 0; j < children.length; j++) {
            var child = children[j];
            if (child.nodeType === Node.ELEMENT_NODE && child.tagName === "P") {
                child.setAttribute("style", "background-color: blue");
            }
        }
    }
}
```

这段 JavaScript 代码命令式地将元素设置为蓝色背景，但是代码看起来很糟糕。不仅比 CSS 和 XSL 等价物更长，更难理解，而且还有一些严重的问题：

* 如果选定的类被移除（例如，因为用户点击了不同的页面），即使代码重新运行，蓝色背景也不会被移除 - 因此该项目将保持突出显示，直到整个页面被重新加载。使用 CSS，浏览器会自动检测 `li.selected > p` 规则何时不再适用，并在选定的类被移除后立即移除蓝色背景。

* 如果你想要利用新的 API（例如 `document.getElementsByClassName("selected")` 甚至 `document.evaluate()`）来提高性能，则必须重写代码。另一方面，浏览器供应商可以在不破坏兼容性的情况下提高 CSS 和 XPath 的性能。

在 Web 浏览器中，使用声明式 CSS 样式比使用 JavaScript 命令式地操作样式要好得多。类似地，在数据库中，使用像 SQL 这样的声明式查询语言比使用命令式查询 API 要好得多 [^vi]。

[^vi]: IMS 和 CODASYL 都使用命令式 API。应用程序通常使用 COBOL 代码遍历数据库中的记录，一次一条记录【2,16】。

### MapReduce查询

MapReduce 是一个由 Google 推广的编程模型，用于在多台机器上批量处理大规模的数据【33】。一些 NoSQL 数据存储（包括 MongoDB 和 CouchDB）支持有限形式的 MapReduce，作为在多个文档中执行只读查询的机制。

关于 MapReduce 更详细的介绍在 [第十章](/v1/ch10)。现在我们只简要讨论一下 MongoDB 使用的模型。

MapReduce 既不是一个声明式的查询语言，也不是一个完全命令式的查询 API，而是处于两者之间：查询的逻辑用代码片段来表示，这些代码片段会被处理框架重复性调用。它基于 `map`（也称为 `collect`）和 `reduce`（也称为 `fold` 或 `inject`）函数，两个函数存在于许多函数式编程语言中。

最好举例来解释 MapReduce 模型。假设你是一名海洋生物学家，每当你看到海洋中的动物时，你都会在数据库中添加一条观察记录。现在你想生成一个报告，说明你每月看到多少鲨鱼。

在 PostgreSQL 中，你可以像这样表述这个查询：

```sql
SELECT
  date_trunc('month', observation_timestamp) AS observation_month,
  sum(num_animals)                           AS total_animals
FROM observations
WHERE family = 'Sharks'
GROUP BY observation_month;
```

`date_trunc('month', timestamp)` 函数用于确定包含 `timestamp` 的日历月份，并返回代表该月份开始的另一个时间戳。换句话说，它将时间戳舍入成最近的月份。

这个查询首先过滤观察记录，以只显示鲨鱼家族的物种，然后根据它们发生的日历月份对观察记录进行分组，最后将在该月的所有观察记录中看到的动物数目加起来。

同样的查询用 MongoDB 的 MapReduce 功能可以按如下来表述：

```js
db.observations.mapReduce(function map() {
        var year = this.observationTimestamp.getFullYear();
        var month = this.observationTimestamp.getMonth() + 1;
        emit(year + "-" + month, this.numAnimals);
    },
    function reduce(key, values) {
        return Array.sum(values);
    },
    {
        query: {
          family: "Sharks"
        },
        out: "monthlySharkReport"
    });
```

* 可以声明式地指定一个只考虑鲨鱼种类的过滤器（这是 MongoDB 特定的 MapReduce 扩展）。
* 每个匹配查询的文档都会调用一次 JavaScript 函数 `map`，将 `this` 设置为文档对象。
* `map` 函数发出一个键（包括年份和月份的字符串，如 `"2013-12"` 或 `"2014-1"`）和一个值（该观察记录中的动物数量）。
* `map` 发出的键值对按键来分组。对于具有相同键（即，相同的月份和年份）的所有键值对，调用一次 `reduce` 函数。
* `reduce` 函数将特定月份内所有观测记录中的动物数量相加。
* 将最终的输出写入到 `monthlySharkReport` 集合中。

例如，假设 `observations` 集合包含这两个文档：

```json
{
  observationTimestamp: Date.parse(  "Mon, 25 Dec 1995 12:34:56 GMT"),
  family: "Sharks",
  species: "Carcharodon carcharias",
  numAnimals: 3
}
{
  observationTimestamp: Date.parse("Tue, 12 Dec 1995 16:17:18 GMT"),
  family: "Sharks",
  species:    "Carcharias taurus",
  numAnimals: 4
}
```

对每个文档都会调用一次 `map` 函数，结果将是 `emit("1995-12",3)` 和 `emit("1995-12",4)`。随后，以 `reduce("1995-12",[3,4])` 调用 `reduce` 函数，将返回 `7`。

map 和 reduce 函数在功能上有所限制：它们必须是 **纯** 函数，这意味着它们只使用传递给它们的数据作为输入，它们不能执行额外的数据库查询，也不能有任何副作用。这些限制允许数据库在任何地方、以任何顺序运行这些函数，并在失败时重新运行它们。然而，map 和 reduce 函数仍然是强大的：它们可以解析字符串、调用库函数、执行计算等等。

MapReduce 是一个相当底层的编程模型，用于计算机集群上的分布式执行。像 SQL 这样的更高级的查询语言可以用一系列的 MapReduce 操作来实现（见 [第十章](/v1/ch10)），但是也有很多不使用 MapReduce 的分布式 SQL 实现。须注意，SQL 并没有限制它只能在单一机器上运行，而 MapReduce 也并没有垄断所有的分布式查询执行。

能够在查询中使用 JavaScript 代码是高级查询的一个重要特性，但这不限于 MapReduce，一些 SQL 数据库也可以用 JavaScript 函数进行扩展【34】。

MapReduce 的一个可用性问题是，必须编写两个密切合作的 JavaScript 函数，这通常比编写单个查询更困难。此外，声明式查询语言为查询优化器提供了更多机会来提高查询的性能。基于这些原因，MongoDB 2.2 添加了一种叫做 **聚合管道** 的声明式查询语言的支持【9】。用这种语言表述鲨鱼计数查询如下所示：

```js
db.observations.aggregate([
  { $match: { family: "Sharks" } },
  { $group: {
    _id: {
      year:  { $year:  "$observationTimestamp" },
      month: { $month: "$observationTimestamp" }
    },
    totalAnimals: { $sum: "$numAnimals" } }}
]);
```

聚合管道语言的表现力与（前述 PostgreSQL 例子的）SQL 子集相当，但是它使用基于 JSON 的语法而不是 SQL 那种接近英文句式的语法；这种差异也许只是口味问题。这个故事的寓意是：NoSQL 系统可能会意外发现自己只是重新发明了一套经过乔装改扮的 SQL。


## 图数据模型

如我们之前所见，多对多关系是不同数据模型之间具有区别性的重要特征。如果你的应用程序大多数的关系是一对多关系（树状结构化数据），或者大多数记录之间不存在关系，那么使用文档模型是合适的。

但是，要是多对多关系在你的数据中很常见呢？关系模型可以处理多对多关系的简单情况，但是随着数据之间的连接变得更加复杂，将数据建模为图形显得更加自然。

一个图由两种对象组成：**顶点**（vertices，也称为 **节点**，即 nodes，或 **实体**，即 entities），和 **边**（edges，也称为 **关系**，即 relationships，或 **弧**，即 arcs）。多种数据可以被建模为一个图形。典型的例子包括：

社交图谱
: 顶点是人，边指示哪些人彼此认识。

网络图谱
: 顶点是网页，边表示指向其他页面的 HTML 链接。

公路或铁路网络
: 顶点是交叉路口，边代表它们之间的道路或铁路线。

可以将那些众所周知的算法运用到这些图上：例如，汽车导航系统搜索道路网络中两点之间的最短路径，PageRank 可以用在网络图上来确定网页的流行程度，从而确定该网页在搜索结果中的排名。

在刚刚给出的例子中，图中的所有顶点代表了相同类型的事物（人、网页或交叉路口）。不过，图并不局限于这样的同类数据：同样强大的是，图提供了一种一致的方式，用来在单个数据存储中存储完全不同类型的对象。例如，Facebook 维护一个包含许多不同类型的顶点和边的单个图：顶点表示人、地点、事件、签到和用户的评论；边表示哪些人是好友、签到发生在哪里、谁评论了什么帖子、谁参与了什么事件等等【35】。

在本节中，我们将使用 [图 2-5](/v1/ddia_0205.png) 所示的示例。它可以从社交网络或系谱数据库中获得：它显示了两个人，来自爱达荷州的 Lucy 和来自法国 Beaune 的 Alain。他们已婚，住在伦敦。

![](/v1/ddia_0205.png)

**图 2-5 图数据结构示例（框代表顶点，箭头代表边）**

有几种不同但相关的方法用来构建和查询图中的数据。在本节中，我们将讨论属性图模型（由 Neo4j，Titan 和 InfiniteGraph 实现）和三元组存储（triple-store）模型（由 Datomic、AllegroGraph 等实现）。我们将查看图的三种声明式查询语言：Cypher，SPARQL 和 Datalog。除此之外，还有像 Gremlin 【36】这样的图形查询语言和像 Pregel 这样的图形处理框架（见 [第十章](/v1/ch10)）。

### 属性图

在属性图模型中，每个顶点（vertex）包括：

* 唯一的标识符
* 一组出边（outgoing edges）
* 一组入边（ingoing edges）
* 一组属性（键值对）

每条边（edge）包括：

* 唯一标识符
* 边的起点（**尾部顶点**，即 tail vertex）
* 边的终点（**头部顶点**，即 head vertex）
* 描述两个顶点之间关系类型的标签
* 一组属性（键值对）

可以将图存储看作由两个关系表组成：一个存储顶点，另一个存储边，如 [例 2-2]() 所示（该模式使用 PostgreSQL JSON 数据类型来存储每个顶点或每条边的属性）。每条边都存储了它的头部和尾部顶点；如果你想要某个顶点的入边或出边集合，你可以分别通过 `head_vertex` 或 `tail_vertex` 来查询 `edges` 表。

**例 2-2 使用关系模式来表示属性图**

```sql
CREATE TABLE vertices (
  vertex_id  INTEGER PRIMARY KEY,
  properties JSON
);

CREATE TABLE edges (
  edge_id     INTEGER PRIMARY KEY,
  tail_vertex INTEGER REFERENCES vertices (vertex_id),
  head_vertex INTEGER REFERENCES vertices (vertex_id),
  label       TEXT,
  properties  JSON
);

CREATE INDEX edges_tails ON edges (tail_vertex);
CREATE INDEX edges_heads ON edges (head_vertex);
```

关于这个模型的一些重要方面是：

1. 任何顶点都可以有一条边连接到任何其他顶点。没有模式限制哪种事物可不可以关联。
2. 给定任何顶点，可以高效地找到它的入边和出边，从而遍历图，即沿着一系列顶点的路径前后移动（这就是为什么 [例 2-2]() 在 `tail_vertex` 和 `head_vertex` 列上都有索引的原因）。
3. 通过对不同类型的关系使用不同的标签，可以在一个图中存储几种不同的信息，同时仍然保持一个清晰的数据模型。

这些特性为数据建模提供了很大的灵活性，如 [图 2-5](/v1/ddia_0205.png) 所示。图中显示了一些传统关系模式难以表达的事情，例如不同国家的不同地区结构（法国有省和大区，美国有县和州），国中国的怪事（先忽略主权国家和民族错综复杂的烂摊子），不同的数据粒度（Lucy 现在的住所记录具体到城市，而她的出生地点只是在一个州的级别）。

你可以想象该图还能延伸出许多关于 Lucy 和 Alain 的事实，或其他人的其他更多的事实。例如，你可以用它来表示食物过敏（为每个过敏源增加一个顶点，并增加人与过敏源之间的一条边来指示一种过敏情况），并链接到过敏源，每个过敏源具有一组顶点用来显示哪些食物含有哪些物质。然后，你可以写一个查询，找出每个人吃什么是安全的。图在可演化性方面是富有优势的：当你向应用程序添加功能时，可以轻松扩展图以适应程序数据结构的变化。

### Cypher 查询语言

Cypher 是属性图的声明式查询语言，为 Neo4j 图形数据库而发明【37】（它是以电影 “黑客帝国” 中的一个角色来命名的，而与密码学中的加密算法无关【38】）。

[例 2-3]() 显示了将 [图 2-5](/v1/ddia_0205.png) 的左边部分插入图形数据库的 Cypher 查询。你可以以类似的方式把图的剩余部分添加进去，但这里为了文章可读性而省略这部分的示例。每个顶点都有一个像 `USA` 或 `Idaho` 这样的符号名称，查询的其他部分可以使用这些名称在顶点之间创建边，使用箭头符号：`(Idaho) -[:WITHIN]-> (USA)` 创建一条标记为 `WITHIN` 的边，`Idaho` 为尾节点，`USA` 为头节点。

**例 2-3 将图 2-5 中的数据子集表示为 Cypher 查询**

```cypher
CREATE
  (NAmerica:Location {name:'North America', type:'continent'}),
  (USA:Location      {name:'United States', type:'country'  }),
  (Idaho:Location    {name:'Idaho',         type:'state'    }),
  (Lucy:Person       {name:'Lucy' }),
  (Idaho) -[:WITHIN]->  (USA)  -[:WITHIN]-> (NAmerica),
  (Lucy)  -[:BORN_IN]-> (Idaho)
```

当 [图 2-5](/v1/ddia_0205.png) 的所有顶点和边被添加到数据库后，让我们提些有趣的问题：例如，找到所有从美国移民到欧洲的人的名字。更确切地说，这里我们想要找到符合下面条件的所有顶点，并且返回这些顶点的 `name` 属性：该顶点拥有一条连到美国任一位置的 `BORN_IN` 边，和一条连到欧洲的任一位置的 `LIVES_IN` 边。

[例 2-4]() 展示了如何在 Cypher 中表达这个查询。在 MATCH 子句中使用相同的箭头符号来查找图中的模式：`(person) -[:BORN_IN]-> ()` 可以匹配由 `BORN_IN` 边相连的任意两个顶点。该边的尾节点被绑定了变量 `person`，头节点则未被绑定。

**例 2-4 查找所有从美国移民到欧洲的人的 Cypher 查询：**

```cypher
MATCH
  (person) -[:BORN_IN]->  () -[:WITHIN*0..]-> (us:Location {name:'United States'}),
  (person) -[:LIVES_IN]-> () -[:WITHIN*0..]-> (eu:Location {name:'Europe'})
RETURN person.name
```

查询按如下来解读：

> 找到满足以下两个条件的所有顶点（称之为 person 顶点）：
> 1.  `person` 顶点拥有一条到某个顶点的 `BORN_IN` 出边。从那个顶点开始，沿着一系列 `WITHIN` 出边最终到达一个类型为 `Location`，`name` 属性为 `United States` 的顶点。
>
> 2. `person` 顶点还拥有一条 `LIVES_IN` 出边。沿着这条边，可以通过一系列 `WITHIN` 出边最终到达一个类型为 `Location`，`name` 属性为 `Europe` 的顶点。
>
> 对于这样的 `Person` 顶点，返回其 `name` 属性。

执行这条查询可能会有几种可行的查询路径。这里给出的描述建议首先扫描数据库中的所有人，检查每个人的出生地和居住地，然后只返回符合条件的那些人。

等价地，也可以从两个 `Location` 顶点开始反向地查找。假如 `name` 属性上有索引，则可以高效地找到代表美国和欧洲的两个顶点。然后，沿着所有 `WITHIN` 入边，可以继续查找出所有在美国和欧洲的位置（州、地区、城市等）。最后，查找出那些可以由 `BORN_IN` 或 `LIVES_IN` 入边到那些位置顶点的人。

通常对于声明式查询语言来说，在编写查询语句时，不需要指定执行细节：查询优化程序会自动选择预测效率最高的策略，因此你可以专注于编写应用程序的其他部分。

### SQL 中的图查询

[例 2-2]() 指出，可以在关系数据库中表示图数据。但是，如果图数据已经以关系结构存储，我们是否也可以使用 SQL 查询它？

答案是肯定的，但有些困难。在关系数据库中，你通常会事先知道在查询中需要哪些连接。在图查询中，你可能需要在找到待查找的顶点之前，遍历可变数量的边。也就是说，连接的数量事先并不确定。

在我们的例子中，这发生在 Cypher 查询中的 `() -[:WITHIN*0..]-> ()` 规则中。一个人的 `LIVES_IN` 边可以指向任何类型的位置：街道、城市、地区、国家等。一个城市可以在（WITHIN）一个地区内，一个地区可以在（WITHIN）一个州内，一个州可以在（WITHIN）一个国家内，等等。`LIVES_IN` 边可以直接指向正在查找的位置，或者一个在位置层次结构中隔了数层的位置。

在 Cypher 中，用 `WITHIN*0..` 非常简洁地表述了上述事实：“沿着 `WITHIN` 边，零次或多次”。它很像正则表达式中的 `*` 运算符。

自 SQL:1999，查询可变长度遍历路径的思想可以使用称为 **递归公用表表达式**（`WITH RECURSIVE` 语法）的东西来表示。[例 2-5]() 显示了同样的查询 - 查找从美国移民到欧洲的人的姓名 - 在 SQL 使用这种技术（PostgreSQL、IBM DB2、Oracle 和 SQL Server 均支持）来表述。但是，与 Cypher 相比，其语法非常笨拙。

**例 2-5  与示例 2-4 同样的查询，在 SQL 中使用递归公用表表达式表示**

```sql
WITH RECURSIVE
  -- in_usa 包含所有的美国境内的位置 ID
    in_usa(vertex_id) AS (
    SELECT vertex_id FROM vertices WHERE properties ->> 'name' = 'United States'
    UNION
    SELECT edges.tail_vertex FROM edges
      JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
      WHERE edges.label = 'within'
  ),
  -- in_europe 包含所有的欧洲境内的位置 ID
    in_europe(vertex_id) AS (
    SELECT vertex_id FROM vertices WHERE properties ->> 'name' = 'Europe'
    UNION
    SELECT edges.tail_vertex FROM edges
      JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
      WHERE edges.label = 'within' ),

  -- born_in_usa 包含了所有类型为 Person，且出生在美国的顶点
    born_in_usa(vertex_id) AS (
      SELECT edges.tail_vertex FROM edges
        JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
        WHERE edges.label = 'born_in' ),

  -- lives_in_europe 包含了所有类型为 Person，且居住在欧洲的顶点。
    lives_in_europe(vertex_id) AS (
      SELECT edges.tail_vertex FROM edges
        JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
        WHERE edges.label = 'lives_in')

  SELECT vertices.properties ->> 'name'
  FROM vertices
    JOIN born_in_usa ON vertices.vertex_id = born_in_usa.vertex_id
    JOIN lives_in_europe ON vertices.vertex_id = lives_in_europe.vertex_id;
```

* 首先，查找 `name` 属性为 `United States` 的顶点，将其作为 `in_usa` 顶点的集合的第一个元素。
* 从 `in_usa` 集合的顶点出发，沿着所有的 `within` 入边，将其尾顶点加入同一集合，不断递归直到所有 `within` 入边都被访问完毕。
* 同理，从 `name` 属性为 `Europe` 的顶点出发，建立 `in_europe` 顶点的集合。
* 对于 `in_usa` 集合中的每个顶点，根据 `born_in` 入边来查找出生在美国某个地方的人。
* 同样，对于 `in_europe` 集合中的每个顶点，根据 `lives_in` 入边来查找居住在欧洲的人。
* 最后，把在美国出生的人的集合与在欧洲居住的人的集合相交。

同一个查询，用某一个查询语言可以写成 4 行，而用另一个查询语言需要 29 行，这恰恰说明了不同的数据模型是为不同的应用场景而设计的。选择适合应用程序的数据模型非常重要。

### 三元组存储和 SPARQL

三元组存储模式大体上与属性图模型相同，用不同的词来描述相同的想法。不过仍然值得讨论，因为三元组存储有很多现成的工具和语言，这些工具和语言对于构建应用程序的工具箱可能是宝贵的补充。

在三元组存储中，所有信息都以非常简单的三部分表示形式存储（**主语**，**谓语**，**宾语**）。例如，三元组 **(吉姆, 喜欢, 香蕉)** 中，**吉姆** 是主语，**喜欢** 是谓语（动词），**香蕉** 是宾语。

三元组的主语相当于图中的一个顶点。而宾语是下面两者之一：

1. 原始数据类型中的值，例如字符串或数字。在这种情况下，三元组的谓语和宾语相当于主语顶点上的属性的键和值。例如，`(lucy, age, 33)` 就像属性 `{"age": 33}` 的顶点 lucy。
2. 图中的另一个顶点。在这种情况下，谓语是图中的一条边，主语是其尾部顶点，而宾语是其头部顶点。例如，在 `(lucy, marriedTo, alain)` 中主语和宾语 `lucy` 和 `alain` 都是顶点，并且谓语 `marriedTo` 是连接他们的边的标签。

[例 2-6]() 展示了与 [例 2-3]() 相同的数据，以称为 Turtle 的格式（Notation3（N3）【39】的一个子集）写成三元组。

**例 2-6 图 2-5 中的数据子集，表示为 Turtle 三元组**

```turtle
@prefix : <urn:example:>.
_:lucy     a       :Person.
_:lucy     :name   "Lucy".
_:lucy     :bornIn _:idaho.
_:idaho    a       :Location.
_:idaho    :name   "Idaho".
_:idaho    :type   "state".
_:idaho    :within _:usa.
_:usa      a       :Location.
_:usa      :name   "United States".
_:usa      :type   "country".
_:usa      :within _:namerica.
_:namerica a       :Location.
_:namerica :name   "North America".
_:namerica :type   "continent".
```

在这个例子中，图的顶点被写为：`_:someName`。这个名字并不意味着这个文件以外的任何东西。它的存在只是帮助我们明确哪些三元组引用了同一顶点。当谓语表示边时，该宾语是一个顶点，如 `_:idaho :within _:usa.`。当谓语是一个属性时，该宾语是一个字符串，如 `_:usa :name "United States".`。

一遍又一遍地重复相同的主语看起来相当重复，但幸运的是，可以使用分号来说明关于同一主语的多个事情。这使得 Turtle 格式相当不错，可读性强：请参阅 [例 2-7]()。

**例 2-7 一种相对例 2-6 写入数据的更为简洁的方法。**

```
@prefix : <urn:example:>.
_:lucy      a :Person;   :name "Lucy";          :bornIn _:idaho.
_:idaho     a :Location; :name "Idaho";         :type "state";   :within _:usa.
_:usa       a :Location; :name "United States"; :type "country"; :within _:namerica.
_:namerica  a :Location; :name "North America"; :type "continent".
```

#### 语义网

如果你深入了解关于三元组存储的信息，可能会陷入关于**语义网**的讨论漩涡中。三元组存储模型其实是完全独立于语义网存在的，例如，Datomic【40】作为一种三元组存储数据库 [^vii]，从未被用于语义网中。但是，由于在很多人眼中这两者紧密相连，我们应该简要地讨论一下。

[^vii]: 从技术上讲，Datomic 使用的是五元组而不是三元组，两个额外的字段是用于版本控制的元数据

从本质上讲，语义网是一个简单且合理的想法：网站已经将信息发布为文字和图片供人类阅读，为什么不将信息作为机器可读的数据也发布给计算机呢？（基于三元组模型的）**资源描述框架**（**RDF**）【41】，被用作不同网站以统一的格式发布数据的一种机制，允许来自不同网站的数据自动合并成 **一个数据网络** —— 成为一种互联网范围内的 “通用语义网数据库”。

不幸的是，语义网在二十一世纪初被过度炒作，但到目前为止没有任何迹象表明已在实践中应用，这使得许多人嗤之以鼻。它还饱受眼花缭乱的缩略词、过于复杂的标准提案和狂妄自大的困扰。

然而，如果从过去的失败中汲取教训，语义网项目还是拥有很多优秀的成果。即使你没有兴趣在语义网上发布 RDF 数据，三元组这种模型也是一种好的应用程序内部数据模型。

#### RDF 数据模型

[例 2-7]() 中使用的 Turtle 语言是一种用于 RDF 数据的人类可读格式。有时候，RDF 也可以以 XML 格式编写，不过完成同样的事情会相对啰嗦，请参阅 [例 2-8]()。Turtle/N3 是更可取的，因为它更容易阅读，像 Apache Jena 【42】这样的工具可以根据需要在不同的 RDF 格式之间进行自动转换。

**例 2-8 用 RDF/XML 语法表示例 2-7 的数据**

```xml
<rdf:RDF xmlns="urn:example:"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <Location rdf:nodeID="idaho">
        <name>Idaho</name>
        <type>state</type>
        <within>
            <Location rdf:nodeID="usa">
                <name>United States</name>
                <type>country</type>
                <within>
                    <Location rdf:nodeID="namerica">
                        <name>North America</name>
                        <type>continent</type>
                    </Location>
                </within>
            </Location>
        </within>
    </Location>
    <Person rdf:nodeID="lucy">
        <name>Lucy</name>
        <bornIn rdf:nodeID="idaho"/>
    </Person>
</rdf:RDF>
```

RDF 有一些奇怪之处，因为它是为了在互联网上交换数据而设计的。三元组的主语，谓语和宾语通常是 URI。例如，谓语可能是一个 URI，如 `<http://my-company.com/namespace#within>` 或 `<http://my-company.com/namespace#lives_in>`，而不仅仅是 `WITHIN` 或 `LIVES_IN`。这个设计背后的原因是为了让你能够把你的数据和其他人的数据结合起来，如果他们赋予单词 `within` 或者 `lives_in` 不同的含义，两者也不会冲突，因为它们的谓语实际上是 `<http://other.org/foo#within>` 和 `<http://other.org/foo#lives_in>`。

从 RDF 的角度来看，URL `<http://my-company.com/namespace>` 不一定需要能解析成什么东西，它只是一个命名空间。为避免与 `http://` URL 混淆，本节中的示例使用不可解析的 URI，如 `urn:example:within`。幸运的是，你只需在文件顶部对这个前缀做一次声明，后续就不用再管了。

### SPARQL 查询语言

**SPARQL** 是一种用于三元组存储的面向 RDF 数据模型的查询语言【43】（它是 SPARQL 协议和 RDF 查询语言的缩写，发音为 “sparkle”）。SPARQL 早于 Cypher，并且由于 Cypher 的模式匹配借鉴于 SPARQL，这使得它们看起来非常相似【37】。

与之前相同的查询 —— 查找从美国移民到欧洲的人 —— 使用 SPARQL 比使用 Cypher 甚至更为简洁（请参阅 [例 2-9]()）。

**例 2-9 与示例 2-4 相同的查询，用 SPARQL 表示**

```sparql
PREFIX : <urn:example:>
SELECT ?personName WHERE {
  ?person :name ?personName.
  ?person :bornIn  / :within* / :name "United States".
  ?person :livesIn / :within* / :name "Europe".
}
```

结构非常相似。以下两个表达式是等价的（SPARQL 中的变量以问号开头）：

```
(person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (location)   # Cypher
?person :bornIn / :within* ?location.                   # SPARQL
```

因为 RDF 不区分属性和边，而只是将它们作为谓语，所以可以使用相同的语法来匹配属性。在下面的表达式中，变量 `usa` 被绑定到任意 `name` 属性为字符串值 `"United States"` 的顶点：

```
(usa {name:'United States'})   # Cypher
?usa :name "United States".    # SPARQL
```

SPARQL 是一种很好的查询语言 —— 尽管它构想的语义网从未实现，但它仍然是一种可用于应用程序内部的强大工具。

> #### 图形数据库与网状模型相比较
>
> 在 “[文档数据库是否在重蹈覆辙？](#文档数据库是否在重蹈覆辙？)” 中，我们讨论了 CODASYL 和关系模型如何竞相解决 IMS 中的多对多关系问题。乍一看，CODASYL 的网状模型看起来与图模型相似。图形数据库是否只是 CODASYL 改头换面后的卷土重来？
>
> 不，他们在几个重要方面有所不同：
>
> * 在 CODASYL 中，数据库有一个模式，用于指定哪种记录类型可以嵌套在其他记录类型中。在图形数据库中，不存在这样的限制：任何顶点都可以具有到其他任何顶点的边。这为应用程序适应不断变化的需求提供了更大的灵活性。
> * 在 CODASYL 中，达到特定记录的唯一方法是遍历其中的一个访问路径。在图形数据库中，可以通过其唯一 ID 直接引用任何顶点，也可以使用索引来查找具有特定值的顶点。
> * 在 CODASYL 中，记录的子项目是一个有序集合，所以数据库必须去管理它们的次序（这会影响存储布局），并且应用程序在插入新记录到数据库时必须关注新记录在这些集合中的位置。在图形数据库中，顶点和边是无序的（只能在查询时对结果进行排序）。
> * 在 CODASYL 中，所有查询都是命令式的，难以编写，并且很容易因模式变化而受到破坏。在图形数据库中，你可以在命令式代码中手写遍历过程，但大多数图形数据库都支持高级声明式查询，如 Cypher 或 SPARQL。
>
>

### 基础：Datalog

**Datalog** 是比 SPARQL、Cypher 更古老的语言，在 20 世纪 80 年代被学者广泛研究【44,45,46】。它在软件工程师中不太知名，但是它是重要的，因为它为以后的查询语言提供了基础。

实践中，Datalog 在有限的几个数据系统中使用：例如，它是 Datomic 【40】的查询语言，Cascalog 【47】是一种用于查询 Hadoop 大数据集的 Datalog 实现 [^viii]。

[^viii]: Datomic 和 Cascalog 使用 Datalog 的 Clojure S 表达式语法。在下面的例子中使用了一个更容易阅读的 Prolog 语法，但两者没有任何功能差异。

Datalog 的数据模型类似于三元组模式，但进行了一点泛化。把三元组写成 **谓语**（**主语，宾语**），而不是写三元组（**主语，谓语，宾语**）。[例 2-10]() 显示了如何用 Datalog 写入我们的例子中的数据。

**例 2-10 用 Datalog 来表示图 2-5 中的数据子集**

```prolog
name(namerica, 'North America').
type(namerica, continent).

name(usa, 'United States').
type(usa, country).
within(usa, namerica).

name(idaho, 'Idaho').
type(idaho, state).
within(idaho, usa).

name(lucy, 'Lucy').
born_in(lucy, idaho).
```

既然已经定义了数据，我们可以像之前一样编写相同的查询，如 [例 2-11]() 所示。它看起来与 Cypher 或 SPARQL 的语法差异较大，但请不要抗拒它。Datalog 是 Prolog 的一个子集，如果你是计算机科学专业的学生，可能已经见过 Prolog。

**例 2-11 与示例 2-4 相同的查询，用 Datalog 表示**

```
within_recursive(Location, Name) :- name(Location, Name). /* Rule 1 */

within_recursive(Location, Name) :- within(Location, Via), /* Rule 2 */
                  within_recursive(Via, Name).

migrated(Name, BornIn, LivingIn) :- name(Person, Name), /* Rule 3 */
                                    born_in(Person, BornLoc),
                                    within_recursive(BornLoc, BornIn),
                                    lives_in(Person, LivingLoc),
                                    within_recursive(LivingLoc, LivingIn).

?- migrated(Who, 'United States', 'Europe'). /* Who = 'Lucy'. */
```

Cypher 和 SPARQL 使用 SELECT 立即跳转，但是 Datalog 一次只进行一小步。我们定义 **规则**，以将新谓语告诉数据库：在这里，我们定义了两个新的谓语，`within_recursive` 和 `migrated`。这些谓语不是存储在数据库中的三元组中，而是从数据或其他规则派生而来的。规则可以引用其他规则，就像函数可以调用其他函数或者递归地调用自己一样。像这样，复杂的查询可以借由小的砖瓦构建起来。

在规则中，以大写字母开头的单词是变量，谓语则像 Cypher 和 SPARQL 中那样进行匹配。例如，`name(Location, Name)` 通过变量绑定 `Location = namerica` 和 `Name ='North America'` 可以匹配三元组 `name(namerica, 'North America')`。

要是系统可以在 `:-` 操作符的右侧找到与所有谓语的一个匹配，就运用该规则。当规则运用时，就好像通过 `:-` 的左侧将其添加到数据库（将变量替换成它们匹配的值）。

因此，一种可能的应用规则的方式是：

1. 数据库存在 `name(namerica, 'North America')`，故运用规则 1。它生成 `within_recursive(namerica, 'North America')`。
2. 数据库存在 `within(usa, namerica)`，在上一步骤中生成 `within_recursive(namerica, 'North America')`，故运用规则 2。它会产生 `within_recursive(usa, 'North America')`。
3. 数据库存在 `within(idaho, usa)`，在上一步生成 `within_recursive(usa, 'North America')`，故运用规则 2。它产生 `within_recursive(idaho, 'North America')`。

通过重复应用规则 1 和 2，`within_recursive` 谓语可以告诉我们数据库中所有位于北美（或任何其他位置名称）境内的位置。这个过程如 [图 2-6](/v1/ddia_0206.png) 所示。

![](/v1/ddia_0206.png)

**图 2-6 使用示例 2-11 中的 Datalog 规则来确定爱达荷州在北美。**

现在规则 3 可以找到出生在某个地方 `BornIn` 的人，并住在某个地方 `LivingIn`。通过查询 `BornIn ='United States'` 和 `LivingIn ='Europe'`，并将此人作为变量 `Who`，让 Datalog 系统找出变量 `Who` 会出现哪些值。因此，最后得到了与早先的 Cypher 和 SPARQL 查询相同的答案。

相对于本章讨论的其他查询语言，我们需要采取不同的思维方式来思考 Datalog 方法，但这是一种非常强大的方法，因为规则可以在不同的查询中进行组合和重用。虽然对于简单的一次性查询，显得不太方便，但是它可以更好地处理数据很复杂的情况。


## 本章小结

数据模型是一个巨大的课题，在本章中，我们快速浏览了各种不同的模型。我们没有足够的篇幅来详述每个模型的细节，但是希望这个概述足以激起你的兴趣，以更多地了解最适合你的应用需求的模型。

在历史上，数据最开始被表示为一棵大树（层次数据模型），但是这不利于表示多对多的关系，所以发明了关系模型来解决这个问题。最近，开发人员发现一些应用程序也不适合采用关系模型。新的非关系型 “NoSQL” 数据存储分化为两个主要方向：

**文档数据库**
: 主要关注自我包含的数据文档，而且文档之间的关系非常稀少。

**图形数据库**
: 用于相反的场景：任意事物之间都可能存在潜在的关联。

这三种模型（文档，关系和图形）在今天都被广泛使用，并且在各自的领域都发挥很好。一个模型可以用另一个模型来模拟 —— 例如，图数据可以在关系数据库中表示 —— 但结果往往是糟糕的。这就是为什么我们有着针对不同目的的不同系统，而不是一个单一的万能解决方案。

文档数据库和图数据库有一个共同点，那就是它们通常不会将存储的数据强制约束为特定模式，这可以使应用程序更容易适应不断变化的需求。但是应用程序很可能仍会假定数据具有一定的结构；区别仅在于模式是**明确的**（写入时强制）还是**隐含的**（读取时处理）。

每个数据模型都具有各自的查询语言或框架，我们讨论了几个例子：SQL、MapReduce、MongoDB 的聚合管道、Cypher、SPARQL 和 Datalog。我们也谈到了 CSS 和 XSL/XPath，它们不是数据库查询语言，但有着有趣的相似之处。

虽然我们已经覆盖了很多层面，但仍然有许多数据模型没有提到。举几个简单的例子：

* 使用基因组数据的研究人员通常需要执行 **序列相似性搜索**，这意味着需要一个很长的字符串（代表一个 DNA 序列），并在一个拥有类似但不完全相同的字符串的大型数据库中寻找匹配。这里所描述的数据库都不能处理这种用法，这就是为什么研究人员编写了像 GenBank 这样的专门的基因组数据库软件的原因【48】。
* 粒子物理学家数十年来一直在进行大数据类型的大规模数据分析，像大型强子对撞机（LHC）这样的项目现在会处理数百 PB 的数据！在这样的规模下，需要定制解决方案来阻止硬件成本的失控【49】。
* **全文搜索** 可以说是一种经常与数据库一起使用的数据模型。信息检索是一个很大的专业课题，我们不会在本书中详细介绍，但是我们将在第三章和第三部分中介绍搜索索引。

让我们暂时将其放在一边。在 [下一章](/v1/ch3) 中，我们将讨论在 **实现** 本章描述的数据模型时会遇到的一些权衡。


## 参考文献

1. Edgar F. Codd: “[A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf),” *Communications of the ACM*, volume 13, number 6, pages 377–387, June 1970. [doi:10.1145/362384.362685](http://dx.doi.org/10.1145/362384.362685)
1. Michael Stonebraker and Joseph M. Hellerstein: “[What Goes Around Comes Around](http://mitpress2.mit.edu/books/chapters/0262693143chapm1.pdf),” in *Readings in Database Systems*, 4th edition, MIT Press, pages 2–41, 2005. ISBN: 978-0-262-69314-1
1. Pramod J. Sadalage and Martin Fowler: *NoSQL Distilled*. Addison-Wesley, August 2012. ISBN: 978-0-321-82662-6
1. Eric Evans: “[NoSQL: What's in a Name?](https://web.archive.org/web/20190623045155/http://blog.sym-link.com/2009/10/30/nosql_whats_in_a_name.html),” *blog.sym-link.com*, October 30, 2009.
1. James Phillips: “[Surprises in Our NoSQL Adoption Survey](http://blog.couchbase.com/nosql-adoption-survey-surprises),” *blog.couchbase.com*, February 8, 2012.
1. Michael Wagner: *SQL/XML:2006 – Evaluierung der Standardkonformität ausgewählter Datenbanksysteme*. Diplomica Verlag, Hamburg, 2010. ISBN: 978-3-836-64609-3
1. “[XML Data (SQL Server)](https://docs.microsoft.com/en-us/sql/relational-databases/xml/xml-data-sql-server?view=sql-server-ver15),” SQL Server documentation, *docs.microsoft.com*, 2013.
1. “[PostgreSQL 9.3.1 Documentation](http://www.postgresql.org/docs/9.3/static/index.html),” The PostgreSQL Global Development Group, 2013.
1. “[The MongoDB 2.4 Manual](http://docs.mongodb.org/manual/),” MongoDB, Inc., 2013.
1. “[RethinkDB 1.11 Documentation](http://www.rethinkdb.com/docs/),” *rethinkdb.com*, 2013.
1. “[Apache CouchDB 1.6 Documentation](http://docs.couchdb.org/en/latest/),” *docs.couchdb.org*, 2014.
1. Lin Qiao, Kapil Surlaker, Shirshanka Das, et al.: “[On Brewing Fresh Espresso: LinkedIn’s Distributed Data Serving Platform](http://www.slideshare.net/amywtang/espresso-20952131),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Rick Long, Mark Harrington, Robert Hain, and Geoff Nicholls: [*IMS Primer*](http://www.redbooks.ibm.com/redbooks/pdfs/sg245352.pdf). IBM Redbook SG24-5352-00, IBM International Technical Support Organization, January 2000.
1. Stephen D. Bartlett: “[IBM’s IMS—Myths, Realities, and Opportunities](https://public.dhe.ibm.com/software/data/ims/pdf/TCG2013015LI.pdf),” The Clipper Group Navigator, TCG2013015LI, July 2013.
1. Sarah Mei: “[Why You Should Never Use MongoDB](http://www.sarahmei.com/blog/2013/11/11/why-you-should-never-use-mongodb/),” *sarahmei.com*, November 11, 2013.
1. J. S. Knowles and D. M. R. Bell: “The CODASYL Model,” in *Databases—Role and Structure: An Advanced Course*, edited by P. M. Stocker, P. M. D. Gray, and M. P. Atkinson, pages 19–56, Cambridge University Press, 1984. ISBN: 978-0-521-25430-4
1. Charles W. Bachman: “[The Programmer as Navigator](http://dl.acm.org/citation.cfm?id=362534),” *Communications of the ACM*, volume 16, number 11, pages 653–658, November 1973. [doi:10.1145/355611.362534](http://dx.doi.org/10.1145/355611.362534)
1. Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton: “[Architecture of a Database System](http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf),” *Foundations and Trends in Databases*, volume 1, number 2, pages 141–259, November 2007. [doi:10.1561/1900000002](http://dx.doi.org/10.1561/1900000002)
1. Sandeep Parikh and Kelly Stirman: “[Schema Design for Time Series Data in MongoDB](http://blog.mongodb.org/post/65517193370/schema-design-for-time-series-data-in-mongodb),” *blog.mongodb.org*, October 30, 2013.
1. Martin Fowler: “[Schemaless Data Structures](http://martinfowler.com/articles/schemaless/),” *martinfowler.com*, January 7, 2013.
1. Amr Awadallah: “[Schema-on-Read vs. Schema-on-Write](http://www.slideshare.net/awadallah/schemaonread-vs-schemaonwrite),” at *Berkeley EECS RAD Lab Retreat*, Santa Cruz, CA, May 2009.
1. Martin Odersky: “[The Trouble with Types](http://www.infoq.com/presentations/data-types-issues),” at *Strange Loop*, September 2013.
1. Conrad Irwin: “[MongoDB—Confessions of a PostgreSQL Lover](https://speakerdeck.com/conradirwin/mongodb-confessions-of-a-postgresql-lover),” at *HTML5DevConf*, October 2013.
1. “[Percona Toolkit Documentation: pt-online-schema-change](http://www.percona.com/doc/percona-toolkit/2.2/pt-online-schema-change.html),” Percona Ireland Ltd., 2013.
1. Rany Keddo, Tobias Bielohlawek, and Tobias Schmidt: “[Large Hadron Migrator](https://github.com/soundcloud/lhm),” SoundCloud, 2013.
1. Shlomi Noach: “[gh-ost: GitHub's Online Schema Migration Tool for MySQL](http://githubengineering.com/gh-ost-github-s-online-migration-tool-for-mysql/),” *githubengineering.com*, August 1, 2016.
1. James C. Corbett, Jeffrey Dean, Michael Epstein, et al.: “[Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/),” at *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
1. Donald K. Burleson: “[Reduce I/O with Oracle Cluster Tables](https://web.archive.org/web/20231207233228/http://www.dba-oracle.com/oracle_tip_hash_index_cluster_table.htm),” *dba-oracle.com*.
1. Fay Chang, Jeffrey Dean, Sanjay Ghemawat, et al.: “[Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Bobbie J. Cochrane and Kathy A. McKnight: “[DB2 JSON Capabilities, Part 1: Introduction to DB2 JSON](https://web.archive.org/web/20180516203043/https://www.ibm.com/developerworks/data/library/techarticle/dm-1306nosqlforjson1/),” IBM developerWorks, June 20, 2013.
1. Herb Sutter: “[The Free Lunch Is Over: A Fundamental Turn Toward Concurrency in Software](http://www.gotw.ca/publications/concurrency-ddj.htm),” *Dr. Dobb's Journal*, volume 30, number 3, pages 202-210, March 2005.
1. Joseph M. Hellerstein: “[The Declarative Imperative: Experiences and Conjectures in Distributed Logic](http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-90.pdf),” Electrical Engineering and Computer Sciences, University of California at Berkeley, Tech report UCB/EECS-2010-90, June 2010.
1. Jeffrey Dean and Sanjay Ghemawat: “[MapReduce: Simplified Data Processing on Large Clusters](https://research.google/pubs/pub62/),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Craig Kerstiens: “[JavaScript in Your Postgres](https://blog.heroku.com/javascript_in_your_postgres),” *blog.heroku.com*, June 5, 2013.
1. Nathan Bronson, Zach Amsden, George Cabrera, et al.: “[TAO: Facebook’s Distributed Data Store for the Social Graph](https://www.usenix.org/conference/atc13/technical-sessions/presentation/bronson),” at *USENIX Annual Technical Conference* (USENIX ATC), June 2013.
1. “[Apache TinkerPop3.2.3 Documentation](http://tinkerpop.apache.org/docs/3.2.3/reference/),” *tinkerpop.apache.org*, October 2016.
1. “[The Neo4j Manual v2.0.0](http://docs.neo4j.org/chunked/2.0.0/index.html),” Neo Technology, 2013.
1. Emil Eifrem: [Twitter correspondence](https://twitter.com/emileifrem/status/419107961512804352), January 3, 2014.
1. David Beckett and Tim Berners-Lee: “[Turtle – Terse RDF Triple Language](http://www.w3.org/TeamSubmission/turtle/),” W3C Team Submission, March 28, 2011.
1. “[Datomic Development Resources](http://docs.datomic.com/),” Metadata Partners, LLC, 2013.
1. W3C RDF Working Group: “[Resource Description Framework (RDF)](http://www.w3.org/RDF/),” *w3.org*, 10 February 2004.
1. “[Apache Jena](http://jena.apache.org/),” Apache Software Foundation.
1. Steve Harris, Andy Seaborne, and Eric Prud'hommeaux: “[SPARQL 1.1 Query Language](http://www.w3.org/TR/sparql11-query/),” W3C Recommendation, March 2013.
1. Todd J. Green, Shan Shan Huang, Boon Thau Loo, and Wenchao Zhou: “[Datalog and Recursive Query Processing](http://blogs.evergreen.edu/sosw/files/2014/04/Green-Vol5-DBS-017.pdf),” *Foundations and Trends in Databases*, volume 5, number 2, pages 105–195, November 2013. [doi:10.1561/1900000017](http://dx.doi.org/10.1561/1900000017)
1. Stefano Ceri, Georg Gottlob, and Letizia Tanca: “[What You Always Wanted to Know About Datalog (And Never Dared to Ask)](https://www.researchgate.net/profile/Letizia_Tanca/publication/3296132_What_you_always_wanted_to_know_about_Datalog_and_never_dared_to_ask/links/0fcfd50ca2d20473ca000000.pdf),” *IEEE Transactions on Knowledge and Data Engineering*, volume 1, number 1, pages 146–166, March 1989. [doi:10.1109/69.43410](http://dx.doi.org/10.1109/69.43410)
1. Serge Abiteboul, Richard Hull, and Victor Vianu: [*Foundations of Databases*](http://webdam.inria.fr/Alice/). Addison-Wesley, 1995. ISBN: 978-0-201-53771-0, available online at *webdam.inria.fr/Alice*
1. Nathan Marz: “[Cascalog](https://github.com/nathanmarz/cascalog),” *github.com*.
1. Dennis A. Benson, Ilene Karsch-Mizrachi, David J. Lipman, et al.: “[GenBank](https://academic.oup.com/nar/article/36/suppl_1/D25/2507746),” *Nucleic Acids Research*, volume 36, Database issue, pages D25–D30, December 2007. [doi:10.1093/nar/gkm929](http://dx.doi.org/10.1093/nar/gkm929)
1. Fons Rademakers: “[ROOT for Big Data Analysis](https://indico.cern.ch/event/246453/contributions/1566610/attachments/423154/587535/ROOT-BigData-Analysis-London-2013.pdf),” at *Workshop on the Future of Big Data Management*, London, UK, June 2013.


================================================
FILE: content/v1/ch3.md
================================================
---
title: "第三章：存储与检索"
linkTitle: "3. 存储与检索"
weight: 103
breadcrumbs: false
---


![](/map/ch03.png)

> 建立秩序，省却搜索
>
> —— 德国谚语

一个数据库在最基础的层次上需要完成两件事情：当你把数据交给数据库时，它应当把数据存储起来；而后当你向数据库要数据时，它应当把数据返回给你。

在 [第二章](/v1/ch2) 中，我们讨论了数据模型和查询语言，即程序员将数据录入数据库的格式，以及再次要回数据的机制。在本章中我们会从数据库的视角来讨论同样的问题：数据库如何存储我们提供的数据，以及如何在我们需要时重新找到数据。

作为程序员，为什么要关心数据库内部存储与检索的机理？你可能不会去从头开始实现自己的存储引擎，但是你 **确实** 需要从许多可用的存储引擎中选择一个合适的。而且为了让存储引擎能在你的工作负载类型上运行良好，你也需要大致了解存储引擎在底层究竟做了什么。

特别需要注意，针对 **事务性** 负载优化的和针对 **分析性** 负载优化的存储引擎之间存在巨大差异。稍后我们将在 “[事务处理还是分析？](#事务处理还是分析？)” 一节中探讨这一区别，并在 “[列式存储](#列式存储)” 中讨论一系列针对分析性负载而优化的存储引擎。

但首先，我们将从你可能已经很熟悉的两大类数据库（传统的关系型数据库和很多所谓的 “NoSQL” 数据库）中使用的 **存储引擎** 来开始本章的内容。我们将研究两大类存储引擎：**日志结构（log-structured）** 的存储引擎，以及 **面向页面（page-oriented）** 的存储引擎（例如 B 树）。

## 驱动数据库的数据结构

世界上最简单的数据库可以用两个 Bash 函数实现：

```bash
#!/bin/bash
db_set () {
  echo "$1,$2" >> database
}

db_get () {
  grep "^$1," database | sed -e "s/^$1,//" | tail -n 1
}
```

这两个函数实现了键值存储的功能。执行 `db_set key value` 会将 **键（key）** 和 **值（value）** 存储在数据库中。键和值（几乎）可以是你喜欢的任何东西，例如，值可以是 JSON 文档。然后调用 `db_get key` 会查找与该键关联的最新值并将其返回。

麻雀虽小，五脏俱全：

```bash
$ db_set 123456 '{"name":"London","attractions":["Big Ben","London Eye"]}'

$ db_set 42 '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
```

底层的存储格式非常简单：一个文本文件，每行包含一条逗号分隔的键值对（忽略转义问题的话，大致与 CSV 文件类似）。每次对 `db_set` 的调用都会向文件末尾追加记录，所以更新键的时候旧版本的值不会被覆盖 —— 因而查找最新值的时候，需要找到文件中键最后一次出现的位置（因此 `db_get` 中使用了 `tail -n 1`）。

```bash
$ db_set 42 '{"name":"San Francisco","attractions":["Exploratorium"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Exploratorium"]}

$ cat database
123456,{"name":"London","attractions":["Big Ben","London Eye"]}
42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
42,{"name":"San Francisco","attractions":["Exploratorium"]}
```

`db_set` 函数对于极其简单的场景其实有非常好的性能，因为在文件尾部追加写入通常是非常高效的。与 `db_set` 做的事情类似，许多数据库在内部使用了 **日志（log）**，也就是一个 **仅追加（append-only）** 的数据文件。真正的数据库有更多的问题需要处理（如并发控制，回收硬盘空间以避免日志无限增长，处理错误与部分写入的记录），但基本原理是一样的。日志极其有用，我们还将在本书的其它部分重复见到它好几次。

> **日志（log）** 这个词通常指应用日志：即应用程序输出的描述正在发生的事情的文本。本书在更普遍的意义下使用 **日志** 这一词：一个仅追加的记录序列。它可能压根就不是给人类看的，它可以使用二进制格式，并仅能由其他程序读取。

另一方面，如果这个数据库中有着大量记录，则这个 `db_get` 函数的性能会非常糟糕。每次你想查找一个键时，`db_get` 必须从头到尾扫描整个数据库文件来查找键的出现。用算法的语言来说，查找的开销是 `O(n)` ：如果数据库记录数量 n 翻了一倍，查找时间也要翻一倍。这就不好了。

为了高效查找数据库中特定键的值，我们需要一个数据结构：**索引（index）**。本章将介绍一系列的索引结构，并在它们之间进行比较。索引背后的大致思想是通过保存一些额外的元数据作为路标来帮助你找到想要的数据。如果你想以几种不同的方式搜索同一份数据，那么你也许需要在数据的不同部分上建立多个索引。

索引是从主数据衍生的 **额外的（additional）** 结构。许多数据库允许添加与删除索引，这不会影响数据的内容，而只会影响查询的性能。维护额外的结构会产生开销，特别是在写入时。写入性能很难超过简单地追加写入文件，因为追加写入是最简单的写入操作。任何类型的索引通常都会减慢写入速度，因为每次写入数据时都需要更新索引。

这是存储系统中一个重要的权衡：精心选择的索引加快了读查询的速度，但是每个索引都会拖慢写入速度。因为这个原因，数据库默认并不会索引所有的内容，而需要你，也就是程序员或数据库管理员（DBA），基于对应用的典型查询模式的了解来手动选择索引。你可以选择那些能为应用带来最大收益而且又不会引入超出必要开销的索引。


### 散列索引

让我们从 **键值数据（key-value Data）** 的索引开始。这不是你可以索引的唯一数据类型，但键值数据是很常见的。在引入更复杂的索引之前，它是重要的第一步。

键值存储与在大多数编程语言中可以找到的 **字典（dictionary）** 类型非常相似，通常字典都是用 **散列映射（hash map）** 或 **散列表（hash table）** 实现的。散列映射在许多算法教科书中都有描述【1,2】，所以这里我们不会讨论它的工作细节。既然我们已经可以用散列映射来表示 **内存中** 的数据结构，为什么不使用它来索引 **硬盘上** 的数据呢？

假设我们的数据存储只是一个追加写入的文件，就像前面的例子一样，那么最简单的索引策略就是：保留一个内存中的散列映射，其中每个键都映射到数据文件中的一个字节偏移量，指明了可以找到对应值的位置，如 [图 3-1](/v1/ddia_0301.png) 所示。当你将新的键值对追加写入文件中时，还要更新散列映射，以反映刚刚写入的数据的偏移量（这同时适用于插入新键与更新现有键）。当你想查找一个值时，使用散列映射来查找数据文件中的偏移量，**寻找（seek）** 该位置并读取该值即可。

![](/v1/ddia_0301.png)

**图 3-1 以类 CSV 格式存储键值对的日志，并使用内存散列映射进行索引。**

听上去简单，但这是一个可行的方法。现实中，Bitcask 实际上就是这么做的（Riak 中默认的存储引擎）【3】。Bitcask 提供高性能的读取和写入操作，但要求所有的键必须能放入可用内存中，因为散列映射完全保留在内存中。而数据值可以使用比可用内存更多的空间，因为可以在硬盘上通过一次硬盘查找操作来加载所需部分，如果数据文件的那部分已经在文件系统缓存中，则读取根本不需要任何硬盘 I/O。

像 Bitcask 这样的存储引擎非常适合每个键的值经常更新的情况。例如，键可能是某个猫咪视频的网址（URL），而值可能是该视频被播放的次数（每次有人点击播放按钮时递增）。在这种类型的工作负载中，有很多写操作，但是没有太多不同的键 —— 每个键有很多的写操作，但是将所有键保存在内存中是可行的。

到目前为止，我们只是在追加写入一个文件 —— 所以如何避免最终用完硬盘空间？一种好的解决方案是，将日志分为特定大小的 **段（segment）**，当日志增长到特定尺寸时关闭当前段文件，并开始写入一个新的段文件。然后，我们就可以对这些段进行 **压缩（compaction）**，如 [图 3-2](/v1/ddia_0302.png) 所示。这里的压缩意味着在日志中丢弃重复的键，只保留每个键的最近更新。

![](/v1/ddia_0302.png)

**图 3-2 键值更新日志（统计猫咪视频的播放次数）的压缩，只保留每个键的最近值**

而且，由于压缩经常会使得段变得很小（假设在一个段内键被平均重写了好几次），我们也可以在执行压缩的同时将多个段合并在一起，如 [图 3-3](/v1/ddia_0303.png) 所示。段被写入后永远不会被修改，所以合并的段被写入一个新的文件。冻结段的合并和压缩可以在后台线程中完成，这个过程进行的同时，我们仍然可以继续使用旧的段文件来正常提供读写请求。合并过程完成后，我们将读取请求转换为使用新合并的段而不是旧的段 —— 然后旧的段文件就可以简单地删除掉了。

![](/v1/ddia_0303.png)

**图 3-3 同时执行压缩和分段合并**

每个段现在都有自己的内存散列表，将键映射到文件偏移量。为了找到一个键的值，我们首先检查最近的段的散列映射；如果键不存在，我们就检查第二个最近的段，依此类推。合并过程将保持段的数量足够小，所以查找过程不需要检查太多的散列映射。

要让这个简单的想法在实际中能工作会涉及到大量的细节。简单来说，下面几点都是实现过程中需要认真考虑的问题：

* 文件格式

  CSV 不是日志的最佳格式。使用二进制格式更快，更简单：首先以字节为单位对字符串的长度进行编码，然后是原始的字符串（不需要转义）。

* 删除记录

  如果要删除一个键及其关联的值，则必须在数据文件中追加一个特殊的删除记录（逻辑删除，有时被称为墓碑，即 tombstone）。当日志段被合并时，合并过程会通过这个墓碑知道要将被删除键的所有历史值都丢弃掉。

* 崩溃恢复

  如果数据库重新启动，则内存散列映射将丢失。原则上，你可以通过从头到尾读取整个段文件并记录下来每个键的最近值来恢复每个段的散列映射。但是，如果段文件很大，可能需要很长时间，这会使服务的重启比较痛苦。Bitcask 通过将每个段的散列映射的快照存储在硬盘上来加速恢复，可以使散列映射更快地加载到内存中。

* 部分写入记录

  数据库随时可能崩溃，包括在将记录追加到日志的过程中。Bitcask 文件包含校验和，允许检测和忽略日志中的这些损坏部分。

* 并发控制

  由于写操作是以严格的顺序追加到日志中的，所以常见的实现是只有一个写入线程。也因为数据文件段是仅追加的或者说是不可变的，所以它们可以被多个线程同时读取。

乍一看，仅追加日志似乎很浪费：为什么不直接在文件里更新，用新值覆盖旧值？仅追加的设计之所以是个好的设计，有如下几个原因：

* 追加和分段合并都是顺序写入操作，通常比随机写入快得多，尤其是在磁性机械硬盘上。在某种程度上，顺序写入在基于闪存的 **固态硬盘（SSD）** 上也是好的选择【4】。我们将在“[比较 B 树和 LSM 树](#比较b树和lsm树)”中进一步讨论这个问题。
* 如果段文件是仅追加的或不可变的，并发和崩溃恢复就简单多了。例如，当一个数据值被更新的时候发生崩溃，你不用担心文件里将会同时包含旧值和新值各自的一部分。
* 合并旧段的处理也可以避免数据文件随着时间的推移而碎片化的问题。

但是，散列表索引也有其局限性：

* 散列表必须能放进内存。如果你有非常多的键，那真是倒霉。原则上可以在硬盘上维护一个散列映射，不幸的是硬盘散列映射很难表现优秀。它需要大量的随机访问 I/O，当它变满时想要再扩充是很昂贵的，并且需要很烦琐的逻辑去解决散列冲突【5】。
* 范围查询效率不高。例如，你无法轻松扫描 kitty00000 和 kitty99999 之间的所有键 —— 你必须在散列映射中单独查找每个键。

在下一节中，我们将看到一个没有这些限制的索引结构。


### SSTables和LSM树

在 [图 3-3](/v1/ddia_0303.png) 中，每个日志结构存储段都是一系列键值对。这些键值对按照它们写入的顺序排列，日志中稍后的值优先于日志中较早的相同键的值。除此之外，文件中键值对的顺序并不重要。

现在我们可以对段文件的格式做一个简单的改变：要求键值对的序列按键排序。乍一看，这个要求似乎打破了我们使用顺序写入的能力，我们将稍后再回到这个问题。

我们把这个格式称为 **排序字符串表（Sorted String Table）**，简称 SSTable。我们还要求每个键只在每个合并的段文件中出现一次（压缩过程已经保证）。与使用散列索引的日志段相比，SSTable 有几个大的优势：

1. 即使文件大于可用内存，合并段的操作仍然是简单而高效的。这种方法就像归并排序算法中使用的方法一样，如 [图 3-4](/v1/ddia_0304.png) 所示：你开始并排读取多个输入文件，查看每个文件中的第一个键，复制最低的键（根据排序顺序）到输出文件，不断重复此步骤，将产生一个新的合并段文件，而且它也是按键排序的。

   ![](/v1/ddia_0304.png)

   **图 3-4 合并几个 SSTable 段，只保留每个键的最新值**

   如果在几个输入段中出现相同的键，该怎么办？请记住，每个段都包含在一段时间内写入数据库的所有值。这意味着一个输入段中的所有值一定比另一个段中的所有值都更近（假设我们总是合并相邻的段）。当多个段包含相同的键时，我们可以保留最近段的值，并丢弃旧段中的值。

2. 为了在文件中找到一个特定的键，你不再需要在内存中保存所有键的索引。以 [图 3-5](/v1/ddia_0305.png) 为例：假设你正在寻找键 `handiwork`，但是你不知道这个键在段文件中的确切偏移量。然而，你知道 `handbag` 和 `handsome` 的偏移，而且由于排序特性，你知道 `handiwork` 必须出现在这两者之间。这意味着你可以跳到 `handbag` 的偏移位置并从那里扫描，直到你找到 `handiwork`（或没找到，如果该文件中没有该键）。

   ![](/v1/ddia_0305.png)

   **图 3-5 具有内存索引的 SSTable**

   你仍然需要一个内存中的索引来告诉你一些键的偏移量，但它可以是稀疏的：每几千字节的段文件有一个键就足够了，因为几千字节可以很快地被扫描完 [^i]。

[^i]: 如果所有的键与值都是定长的，你可以使用段文件上的二分查找并完全避免使用内存索引。然而实践中的键和值通常都是变长的，因此如果没有索引，就很难知道记录的分界点（前一条记录结束以及后一条记录开始的地方）。

3. 由于读取请求无论如何都需要扫描所请求范围内的多个键值对，因此可以将这些记录分组为块（block），并在将其写入硬盘之前对其进行压缩（如 [图 3-5](/v1/ddia_0305.png) 中的阴影区域所示）[^译注i] 。稀疏内存索引中的每个条目都指向压缩块的开始处。除了节省硬盘空间之外，压缩还可以减少对 I/O 带宽的使用。

[^译注i]: 这里的压缩是 compression，不是前文的 compaction，请注意区分。

#### 构建和维护SSTables

到目前为止还不错，但是如何让你的数据能够预先排好序呢？毕竟我们接收到的写入请求可能以任何顺序发生。

虽然在硬盘上维护有序结构也是可能的（请参阅 “[B 树](#b树)”），但在内存保存则要容易得多。有许多可以使用的众所周知的树形数据结构，例如红黑树或 AVL 树【2】。使用这些数据结构，你可以按任何顺序插入键，并按排序顺序读取它们。

现在我们可以让我们的存储引擎以如下方式工作：

* 有新写入时，将其添加到内存中的平衡树数据结构（例如红黑树）。这个内存树有时被称为 **内存表（memtable）**。
* 当 **内存表** 大于某个阈值（通常为几兆字节）时，将其作为 SSTable 文件写入硬盘。这可以高效地完成，因为树已经维护了按键排序的键值对。新的 SSTable 文件将成为数据库中最新的段。当该 SSTable 被写入硬盘时，新的写入可以在一个新的内存表实例上继续进行。
* 收到读取请求时，首先尝试在内存表中找到对应的键，如果没有就在最近的硬盘段中寻找，如果还没有就在下一个较旧的段中继续寻找，以此类推。
* 时不时地，在后台运行一个合并和压缩过程，以合并段文件并将已覆盖或已删除的值丢弃掉。

这个方案效果很好。它只会遇到一个问题：如果数据库崩溃，则最近的写入（在内存表中，但尚未写入硬盘）将丢失。为了避免这个问题，我们可以在硬盘上保存一个单独的日志，每个写入都会立即被追加到这个日志上，就像在前面的章节中所描述的那样。这个日志没有按排序顺序，但这并不重要，因为它的唯一目的是在崩溃后恢复内存表。每当内存表写出到 SSTable 时，相应的日志都可以被丢弃。

#### 用SSTables制作LSM树

这里描述的算法本质上是 LevelDB【6】和 RocksDB【7】这些键值存储引擎库所使用的技术，这些存储引擎被设计嵌入到其他应用程序中。除此之外，LevelDB 可以在 Riak 中用作 Bitcask 的替代品。在 Cassandra 和 HBase 中也使用了类似的存储引擎【8】，而且他们都受到了 Google 的 Bigtable 论文【9】（引入了术语 SSTable 和 memtable ）的启发。

这种索引结构最早由 Patrick O'Neil 等人发明，且被命名为日志结构合并树（或 LSM 树）【10】，它是基于更早之前的日志结构文件系统【11】来构建的。基于这种合并和压缩排序文件原理的存储引擎通常被称为 LSM 存储引擎。

Lucene，是一种全文搜索的索引引擎，在 Elasticsearch 和 Solr 被使用，它使用类似的方法来存储它的关键词词典【12,13】。全文索引比键值索引复杂得多，但是基于类似的想法：在搜索查询中，由一个给定的单词，找到提及单词的所有文档（网页、产品描述等）。这也是通过键值结构实现的：其中键是 **单词（term）**，值是所有包含该单词的文档的 ID 列表（**postings list**）。在 Lucene 中，从词语到记录列表的这种映射保存在类似于 SSTable 的有序文件中，并根据需要在后台执行合并【14】。

#### 性能优化

与往常一样，要让存储引擎在实践中表现良好涉及到大量设计细节。例如，当查找数据库中不存在的键时，LSM 树算法可能会很慢：你必须先检查内存表，然后查看从最近的到最旧的所有的段（可能还必须从硬盘读取每一个段文件），然后才能确定这个键不存在。为了优化这种访问，存储引擎通常使用额外的布隆过滤器（Bloom filters）【15】。（布隆过滤器是一种节省内存的数据结构，用于近似表达集合的内容，它可以告诉你某个键是否一定不在数据库中，从而为不存在的键节省掉许多不必要的硬盘读取操作。）

还有一些不同的策略来确定 SSTables 被压缩和合并的顺序和时间。最常见的选择是 size-tiered 和 leveled compaction。LevelDB 和 RocksDB 使用 leveled compaction（LevelDB 因此得名），HBase 使用 size-tiered，Cassandra 同时支持这两种【16】。对于 size-tiered，较新和较小的 SSTables 相继被合并到较旧的和较大的 SSTable 中。对于 leveled compaction，key （按照分布范围）被拆分到较小的 SSTables，而较旧的数据被移动到单独的层级（level），这使得压缩（compaction）能够更加增量地进行，并且使用较少的硬盘空间。

即使有许多微妙的东西，LSM 树的基本思想 —— 保存一系列在后台合并的 SSTables —— 简单而有效。即使数据集比可用内存大得多，它仍能继续正常工作。由于数据按排序顺序存储，你可以高效地执行范围查询（扫描所有从某个最小值到某个最大值之间的所有键），并且因为硬盘写入是连续的，所以 LSM 树可以支持非常高的写入吞吐量。


### B树

前面讨论的日志结构索引看起来已经相当可用了，但它们却不是最常见的索引类型。使用最广泛的索引结构和日志结构索引相当不同，它就是我们接下来要讨论的 B 树。

从 1970 年被引入【17】，仅不到 10 年后就变得 “无处不在”【18】，B 树很好地经受了时间的考验。在几乎所有的关系数据库中，它们仍然是标准的索引实现，许多非关系数据库也会使用到 B 树。

像 SSTables 一样，B 树保持按键排序的键值对，这允许高效的键值查找和范围查询。但这也就是仅有的相似之处了：B 树有着非常不同的设计理念。

我们前面看到的日志结构索引将数据库分解为可变大小的段，通常是几兆字节或更大的大小，并且总是按顺序写入段。相比之下，B 树将数据库分解成固定大小的 **块（block）** 或 **分页（page）**，传统上大小为 4KB（有时会更大），并且一次只能读取或写入一个页面。这种设计更接近于底层硬件，因为硬盘空间也是按固定大小的块来组织的。

每个页面都可以使用地址或位置来标识，这允许一个页面引用另一个页面 —— 类似于指针，但在硬盘而不是在内存中。我们可以使用这些页面引用来构建一个页面树，如 [图 3-6](/v1/ddia_0306.png) 所示。

![](/v1/ddia_0306.png)

**图 3-6 使用 B 树索引查找一个键**

一个页面会被指定为 B 树的根；在索引中查找一个键时，就从这里开始。该页面包含几个键和对子页面的引用。每个子页面负责一段连续范围的键，根页面上每两个引用之间的键，表示相邻子页面管理的键的范围（边界）。

在 [图 3-6](/v1/ddia_0306.png) 的例子中，我们正在寻找键 251 ，所以我们知道我们需要跟踪边界 200 和 300 之间的页面引用。这将我们带到一个类似的页面，进一步将 200 到 300 的范围拆分到子范围。

最终，我们将到达某个包含各个键的页面（叶子页面，leaf page），该页面或者直接包含每个键的值，或者包含了对可以找到值的页面的引用。

在 B 树的一个页面中对子页面的引用的数量称为 **分支因子（branching factor）**。例如，在 [图 3-6](/v1/ddia_0306.png) 中，分支因子是 6。在实践中，分支因子的大小取决于存储页面引用和范围边界所需的空间，但这个值通常是几百。

如果要更新 B 树中现有键的值，需要搜索包含该键的叶子页面，更改该页面中的值，并将该页面写回到硬盘（对该页面的任何引用都将保持有效）。如果你想添加一个新的键，你需要找到其范围能包含新键的页面，并将其添加到该页面。如果页面中没有足够的可用空间容纳新键，则将其分成两个半满页面，并更新父页面以反映新的键范围分区，如 [图 3-7](/v1/ddia_0307.png) 所示 [^ii]。

![](/v1/ddia_0307.png)

**图 3-7 通过分割页面来生长 B 树**

[^ii]: 向 B 树中插入一个新的键是相当符合直觉的，但删除一个键（同时保持树平衡）就会牵扯很多其他东西了【2】。

这个算法可以确保树保持平衡：具有 n 个键的 B 树总是具有 $O(\log n)$ 的深度。大多数数据库可以放入一个三到四层的 B 树，所以你不需要追踪多个页面引用来找到你正在查找的页面（分支因子为 500 的 4KB 页面的四层树可以存储多达 256TB 的数据）。

#### 让B树更可靠

B 树的基本底层写操作是用新数据覆写硬盘上的页面，并假定覆写不改变页面的位置：即，当页面被覆写时，对该页面的所有引用保持完整。这与日志结构索引（如 LSM 树）形成鲜明对比，后者只追加到文件（并最终删除过时的文件），但从不修改文件中已有的内容。

你可以把覆写硬盘上的页面对应为实际的硬件操作。在磁性硬盘驱动器上，这意味着将磁头移动到正确的位置，等待旋转盘上的正确位置出现，然后用新的数据覆写适当的扇区。在固态硬盘上，由于 SSD 必须一次擦除和重写相当大的存储芯片块，所以会发生更复杂的事情【19】。

而且，一些操作需要覆写几个不同的页面。例如，如果因为插入导致页面过满而拆分页面，则需要写入新拆分的两个页面，并覆写其父页面以更新对两个子页面的引用。这是一个危险的操作，因为如果数据库在系列操作进行到一半时崩溃，那么最终将导致一个损坏的索引（例如，可能有一个孤儿页面没有被任何页面引用）。

为了使数据库能处理异常崩溃的场景，B 树实现通常会带有一个额外的硬盘数据结构：**预写式日志**（WAL，即 write-ahead log，也称为 **重做日志**，即 redo log）。这是一个仅追加的文件，每个 B 树的修改在其能被应用到树本身的页面之前都必须先写入到该文件。当数据库在崩溃后恢复时，这个日志将被用来使 B 树恢复到一致的状态【5,20】。

另外还有一个更新页面的复杂情况是，如果多个线程要同时访问 B 树，则需要仔细的并发控制 —— 否则线程可能会看到树处于不一致的状态。这通常是通过使用 **锁存器**（latches，轻量级锁）保护树的数据结构来完成。日志结构化的方法在这方面更简单，因为它们在后台进行所有的合并，而不会干扰新接收到的查询，并且能够时不时地将段文件切换为新的（该切换是原子操作）。

#### B树的优化

由于 B 树已经存在了很久，所以并不奇怪这么多年下来有很多优化的设计被开发出来，仅举几例：

* 不同于覆写页面并维护 WAL 以支持崩溃恢复，一些数据库（如 LMDB）使用写时复制方案【21】。经过修改的页面被写入到不同的位置，并且还在树中创建了父页面的新版本，以指向新的位置。这种方法对于并发控制也很有用，我们将在 “[快照隔离和可重复读](/v1/ch7#快照隔离和可重复读)” 中看到。
* 我们可以通过不存储整个键，而是缩短其大小，来节省页面空间。特别是在树内部的页面上，键只需要提供足够的信息来充当键范围之间的边界。在页面中包含更多的键允许树具有更高的分支因子，因此也就允许更少的层级 [^iii]。
* 通常，页面可以放置在硬盘上的任何位置；没有什么要求相邻键范围的页面也放在硬盘上相邻的区域。如果某个查询需要按照排序顺序扫描大部分的键范围，那么这种按页面存储的布局可能会效率低下，因为每个页面的读取都需要执行一次硬盘查找。因此，许多 B 树的实现在布局树时会尽量使叶子页面按顺序出现在硬盘上。但是，随着树的增长，要维持这个顺序是很困难的。相比之下，由于 LSM 树在合并过程中一次性重写一大段存储，所以它们更容易使顺序键在硬盘上连续存储。
* 额外的指针被添加到树中。例如，每个叶子页面可以引用其左边和右边的兄弟页面，使得不用跳回父页面就能按顺序对键进行扫描。
* B 树的变体如 **分形树（fractal trees）**【22】借用了一些日志结构的思想来减少硬盘查找（而且它们与分形无关）。

[^iii]: 这个变种有时被称为 B+ 树，但因为这个优化已被广泛使用，所以经常无法区分于其它的 B 树变种。

### 比较B树和LSM树

尽管 B 树实现通常比 LSM 树实现更成熟，LSM 树由于其性能特征的关系，仍然引起了不少关注。根据经验，通常 LSM 树的写入速度更快，而 B 树的读取速度更快【23】。LSM 树上的读取通常比较慢，因为它们必须检查几种不同的数据结构和不同压缩（Compaction）层级的 SSTables。

然而，基准测试的结果通常和工作负载的细节相关。你需要用你特有的工作负载来测试系统，以便进行有效的比较。在本节中，我们将简要讨论一些在衡量存储引擎性能时值得考虑的事情。

#### LSM树的优点

B 树索引中的每块数据都必须至少写入两次：一次写入预写式日志（WAL），一次写入树页面本身（如果发生页面拆分还需要再写入一次）。即使在该页面中只有几个字节发生了变化，也需要接受写入整个页面的开销。有些存储引擎甚至会覆写同一个页面两次，以免在电源故障的情况下页面未完整更新【24,25】。

由于反复压缩和合并 SSTables，日志结构索引也会多次重写数据。这种影响 —— 在数据库的生命周期中每笔数据导致对硬盘的多次写入 —— 被称为 **写入放大（write amplification）**。使用固态硬盘的机器需要额外关注这点，固态硬盘的闪存寿命在覆写有限次数后就会耗尽。

在写入繁重的应用程序中，性能瓶颈可能是数据库可以写入硬盘的速度。在这种情况下，写放大会导致直接的性能代价：存储引擎写入硬盘的次数越多，可用硬盘带宽内它能处理的每秒写入次数就越少。

进而，LSM 树通常能够比 B 树支持更高的写入吞吐量，部分原因是它们有时具有较低的写放大（尽管这取决于存储引擎的配置和工作负载），部分是因为它们顺序地写入紧凑的 SSTable 文件而不是必须覆写树中的几个页面【26】。这种差异在机械硬盘上尤其重要，其顺序写入比随机写入要快得多。

LSM 树可以被压缩得更好，因此通常能比 B 树在硬盘上产生更小的文件。B 树存储引擎会由于碎片化（fragmentation）而留下一些未使用的硬盘空间：当页面被拆分或某行不能放入现有页面时，页面中的某些空间仍未被使用。由于 LSM 树不是面向页面的，并且会通过定期重写 SSTables 以去除碎片，所以它们具有较低的存储开销，特别是当使用分层压缩（leveled compaction）时【27】。

在许多固态硬盘上，固件内部使用了日志结构化算法，以将随机写入转变为顺序写入底层存储芯片，因此存储引擎写入模式的影响不太明显【19】。但是，较低的写入放大率和减少的碎片仍然对固态硬盘更有利：更紧凑地表示数据允许在可用的 I/O 带宽内处理更多的读取和写入请求。

#### LSM树的缺点

日志结构存储的缺点是压缩过程有时会干扰正在进行的读写操作。尽管存储引擎尝试增量地执行压缩以尽量不影响并发访问，但是硬盘资源有限，所以很容易发生某个请求需要等待硬盘先完成昂贵的压缩操作。对吞吐量和平均响应时间的影响通常很小，但是日志结构化存储引擎在更高百分位的响应时间（请参阅 “[描述性能](/v1/ch1#描述性能)”）有时会相当长，而 B 树的行为则相对更具有可预测性【28】。

压缩的另一个问题出现在高写入吞吐量时：硬盘的有限写入带宽需要在初始写入（记录日志和刷新内存表到硬盘）和在后台运行的压缩线程之间共享。写入空数据库时，可以使用全硬盘带宽进行初始写入，但数据库越大，压缩所需的硬盘带宽就越多。

如果写入吞吐量很高，并且压缩没有仔细配置好，有可能导致压缩跟不上写入速率。在这种情况下，硬盘上未合并段的数量不断增加，直到硬盘空间用完，读取速度也会减慢，因为它们需要检查更多的段文件。通常情况下，即使压缩无法跟上，基于 SSTable 的存储引擎也不会限制传入写入的速率，所以你需要进行明确的监控来检测这种情况【29,30】。

B 树的一个优点是每个键只存在于索引中的一个位置，而日志结构化的存储引擎可能在不同的段中有相同键的多个副本。这个方面使得 B 树在想要提供强大的事务语义的数据库中很有吸引力：在许多关系数据库中，事务隔离是通过在键范围上使用锁来实现的，在 B 树索引中，这些锁可以直接附加到树上【5】。在 [第七章](/v1/ch7) 中，我们将更详细地讨论这一点。

B 树在数据库架构中是非常根深蒂固的，为许多工作负载都提供了始终如一的良好性能，所以它们不可能在短期内消失。在新的数据库中，日志结构化索引变得越来越流行。没有简单易行的办法来判断哪种类型的存储引擎对你的使用场景更好，所以需要通过一些测试来得到相关经验。

### 其他索引结构

到目前为止，我们只讨论了键值索引，它们就像关系模型中的 **主键（primary key）** 索引。主键唯一标识关系表中的一行，或文档数据库中的一个文档或图形数据库中的一个顶点。数据库中的其他记录可以通过其主键（或 ID）引用该行 / 文档 / 顶点，索引就被用于解析这样的引用。

次级索引（secondary indexes）也很常见。在关系数据库中，你可以使用 `CREATE INDEX` 命令在同一个表上创建多个次级索引，而且这些索引通常对于有效地执行联接（join）而言至关重要。例如，在 [第二章](/v1/ch2) 中的 [图 2-1](/v1/ddia_0201.png) 中，很可能在 `user_id` 列上有一个次级索引，以便你可以在每个表中找到属于同一用户的所有行。

次级索引可以很容易地从键值索引构建。次级索引主要的不同是键不是唯一的，即可能有许多行（文档，顶点）具有相同的键。这可以通过两种方式来解决：将匹配行标识符的列表作为索引里的值（就像全文索引中的记录列表），或者通过向每个键添加行标识符来使键唯一。无论哪种方式，B 树和日志结构索引都可以用作次级索引。

#### 将值存储在索引中

索引中的键是查询要搜索的内容，而其值可以是以下两种情况之一：它可以是实际的行（文档，顶点），也可以是对存储在别处的行的引用。在后一种情况下，行被存储的地方被称为 **堆文件（heap file）**，并且存储的数据没有特定的顺序（它可以是仅追加的，或者它可以跟踪被删除的行以便后续可以用新的数据进行覆盖）。堆文件方法很常见，因为它避免了在存在多个次级索引时对数据的复制：每个索引只引用堆文件中的一个位置，实际的数据都保存在一个地方。

在不更改键的情况下更新值时，堆文件方法可以非常高效：只要新值的字节数不大于旧值，就可以覆盖该记录。如果新值更大，情况会更复杂，因为它可能需要移到堆中有足够空间的新位置。在这种情况下，要么所有的索引都需要更新，以指向记录的新堆位置，或者在旧堆位置留下一个转发指针【5】。

在某些情况下，从索引到堆文件的额外跳跃对读取来说性能损失太大，因此可能希望将被索引的行直接存储在索引中。这被称为聚集索引（clustered index）。例如，在 MySQL 的 InnoDB 存储引擎中，表的主键总是一个聚集索引，次级索引则引用主键（而不是堆文件中的位置）【31】。在 SQL Server 中，可以为每个表指定一个聚集索引【32】。

在 **聚集索引**（在索引中存储所有的行数据）和 **非聚集索引**（仅在索引中存储对数据的引用）之间的折衷被称为 **覆盖索引（covering index）** 或 **包含列的索引（index with included columns）**，其在索引内存储表的一部分列【33】。这允许通过单独使用索引来处理一些查询（这种情况下，可以说索引 **覆盖（cover）** 了查询）【32】。

与任何类型的数据重复一样，聚集索引和覆盖索引可以加快读取速度，但是它们需要额外的存储空间，并且会增加写入开销。数据库还需要额外的努力来执行事务保证，因为应用程序不应看到任何因为使用副本而导致的不一致。

#### 多列索引

至今讨论的索引只是将一个键映射到一个值。如果我们需要同时查询一个表中的多个列（或文档中的多个字段），这显然是不够的。

最常见的多列索引被称为 **连接索引（concatenated index）** ，它通过将一列的值追加到另一列后面，简单地将多个字段组合成一个键（索引定义中指定了字段的连接顺序）。这就像一个老式的纸质电话簿，它提供了一个从（姓氏，名字）到电话号码的索引。由于排序顺序，索引可以用来查找所有具有特定姓氏的人，或所有具有特定姓氏 - 名字组合的人。但如果你想找到所有具有特定名字的人，这个索引是没有用的。

**多维索引（multi-dimensional index）** 是一种查询多个列的更一般的方法，这对于地理空间数据尤为重要。例如，餐厅搜索网站可能有一个数据库，其中包含每个餐厅的经度和纬度。当用户在地图上查看餐馆时，网站需要搜索用户正在查看的矩形地图区域内的所有餐馆。这需要一个二维范围查询，如下所示：

```sql
SELECT * FROM restaurants WHERE latitude > 51.4946 AND latitude < 51.5079
                          AND longitude > -0.1162 AND longitude < -0.1004;
```

一个标准的 B 树或者 LSM 树索引不能够高效地处理这种查询：它可以返回一个纬度范围内的所有餐馆（但经度可能是任意值），或者返回在同一个经度范围内的所有餐馆（但纬度可能是北极和南极之间的任意地方），但不能同时满足两个条件。

一种选择是使用 **空间填充曲线（space-filling curve）** 将二维位置转换为单个数字，然后使用常规 B 树索引【34】。更普遍的是，使用特殊化的空间索引，例如 R 树。例如，PostGIS 使用 PostgreSQL 的通用 GiST 工具【35】将地理空间索引实现为 R 树。这里我们没有足够的地方来描述 R 树，但是有大量的文献可供参考。

有趣的是，多维索引不仅可以用于地理位置。例如，在电子商务网站上可以使用建立在（红，绿，蓝）维度上的三维索引来搜索特定颜色范围内的产品，也可以在天气观测数据库中建立（日期，温度）的二维索引，以便有效地搜索 2013 年内的温度在 25 至 30°C 之间的所有观测资料。如果使用一维索引，你将不得不扫描 2013 年的所有记录（不管温度如何），然后通过温度进行过滤，或者反之亦然。二维索引可以同时通过时间戳和温度来收窄数据集。这个技术被 HyperDex 所使用【36】。

#### 全文搜索和模糊索引

到目前为止所讨论的所有索引都假定你有确切的数据，并允许你查询键的确切值或具有排序顺序的键的值范围。他们不允许你做的是搜索**类似**的键，如拼写错误的单词。这种模糊的查询需要不同的技术。

例如，全文搜索引擎通常允许搜索目标从一个单词扩展为包括该单词的同义词，忽略单词的语法变体，搜索在相同文档中的近义词，并且支持各种其他取决于文本的语言分析功能。为了处理文档或查询中的拼写错误，Lucene 能够在一定的编辑距离内搜索文本【37】（编辑距离 1 意味着单词内发生了 1 个字母的添加、删除或替换）。

正如 “[用 SSTables 制作 LSM 树](#用sstables制作lsm树)” 中所提到的，Lucene 为其词典使用了一个类似于 SSTable 的结构。这个结构需要一个小的内存索引，告诉查询需要在排序文件中哪个偏移量查找键。在 LevelDB 中，这个内存中的索引是一些键的稀疏集合，但在 Lucene 中，内存中的索引是键中字符的有限状态自动机，类似于 trie 【38】。这个自动机可以转换成 Levenshtein 自动机，它支持在给定的编辑距离内有效地搜索单词【39】。

其他的模糊搜索技术正朝着文档分类和机器学习的方向发展。更多详细信息请参阅信息检索教科书，例如【40】。

#### 在内存中存储一切

本章到目前为止讨论的数据结构都是对硬盘限制的应对。与主内存相比，硬盘处理起来很麻烦。对于磁性硬盘和固态硬盘，如果要在读取和写入时获得良好性能，则需要仔细地布置硬盘上的数据。但是，我们能容忍这种麻烦，因为硬盘有两个显著的优点：它们是持久的（它们的内容在电源关闭时不会丢失），并且每 GB 的成本比 RAM 低。

随着 RAM 变得更便宜，每 GB 成本的论据被侵蚀了。许多数据集不是那么大，所以将它们全部保存在内存中是非常可行的，包括可能分布在多个机器上。这导致了内存数据库的发展。

某些内存中的键值存储（如 Memcached）仅用于缓存，在重新启动计算机时丢失的数据是可以接受的。但其他内存数据库的目标是持久性，可以通过特殊的硬件（例如电池供电的 RAM）来实现，也可以将更改日志写入硬盘，还可以将定时快照写入硬盘或者将内存中的状态复制到其他机器上。

内存数据库重新启动时，需要从硬盘或通过网络从副本重新加载其状态（除非使用特殊的硬件）。尽管写入硬盘，它仍然是一个内存数据库，因为硬盘仅出于持久性目的进行日志追加，读取请求完全由内存来处理。写入硬盘同时还有运维上的好处：硬盘上的文件可以很容易地由外部程序进行备份、检查和分析。

诸如 VoltDB、MemSQL 和 Oracle TimesTen 等产品是具有关系模型的内存数据库，供应商声称，通过消除与管理硬盘上的数据结构相关的所有开销，他们可以提供巨大的性能改进【41,42】。RAM Cloud 是一个开源的内存键值存储器，具有持久性（对内存和硬盘上的数据都使用日志结构化方法）【43】。Redis 和 Couchbase 通过异步写入硬盘提供了较弱的持久性。

反直觉的是，内存数据库的性能优势并不是因为它们不需要从硬盘读取的事实。只要有足够的内存即使是基于硬盘的存储引擎也可能永远不需要从硬盘读取，因为操作系统在内存中缓存了最近使用的硬盘块。相反，它们更快的原因在于省去了将内存数据结构编码为硬盘数据结构的开销【44】。

除了性能，内存数据库的另一个有趣的地方是提供了难以用基于硬盘的索引实现的数据模型。例如，Redis 为各种数据结构（如优先级队列和集合）提供了类似数据库的接口。因为它将所有数据保存在内存中，所以它的实现相对简单。

最近的研究表明，内存数据库体系结构可以扩展到支持比可用内存更大的数据集，而不必重新采用以硬盘为中心的体系结构【45】。所谓的 **反缓存（anti-caching）** 方法通过在内存不足的情况下将最近最少使用的数据从内存转移到硬盘，并在将来再次访问时将其重新加载到内存中。这与操作系统对虚拟内存和交换文件的操作类似，但数据库可以比操作系统更有效地管理内存，因为它可以按单个记录的粒度工作，而不是整个内存页面。尽管如此，这种方法仍然需要索引能完全放入内存中（就像本章开头的 Bitcask 例子）。

如果 **非易失性存储器（non-volatile memory, NVM）** 技术得到更广泛的应用，可能还需要进一步改变存储引擎设计【46】。目前这是一个新的研究领域，值得关注。


## 事务处理还是分析？

在早期的业务数据处理过程中，一次典型的数据库写入通常与一笔 *商业交易（commercial transaction）* 相对应：卖个货、向供应商下订单、支付员工工资等等。但随着数据库开始应用到那些不涉及到钱的领域，术语 **交易 / 事务（transaction）** 仍留了下来，用于指代一组读写操作构成的逻辑单元。

> 事务不一定具有 ACID（原子性，一致性，隔离性和持久性）属性。事务处理只是意味着允许客户端进行低延迟的读取和写入 —— 而不是只能定期运行（例如每天一次）的批处理作业。我们在 [第七章](/v1/ch7) 中讨论 ACID 属性，在 [第十章](/v1/ch10) 中讨论批处理。

即使数据库开始被用于许多不同类型的数据，比如博客文章的评论、游戏中的动作、地址簿中的联系人等等，基本的访问模式仍然类似于处理商业交易。应用程序通常使用索引通过某个键找少量记录。根据用户的输入来插入或更新记录。由于这些应用程序是交互式的，这种访问模式被称为 **在线事务处理（OLTP, OnLine Transaction Processing）**。

但是，数据库也开始越来越多地用于数据分析，这些数据分析具有非常不同的访问模式。通常，分析查询需要扫描大量记录，每个记录只读取几列，并计算汇总统计信息（如计数、总和或平均值），而不是将原始数据返回给用户。例如，如果你的数据是一个销售交易表，那么分析查询可能是：

* 一月份每个商店的总收入是多少？
* 在最近的推广活动中多卖了多少香蕉？
* 哪个牌子的婴儿食品最常与 X 品牌的尿布同时购买？

这些查询通常由业务分析师编写，并提供报告以帮助公司管理层做出更好的决策（商业智能）。为了将这种使用数据库的模式和事务处理区分开，它被称为 **在线分析处理（OLAP, OnLine Analytic Processing）**【47】[^iv]。OLTP 和 OLAP 之间的区别并不总是清晰的，但是一些典型的特征在表 3-1 中列出。

**表 3-1 比较事务处理和分析系统的特点**

|     属性     |      事务处理系统 OLTP       |      分析系统 OLAP       |
| :----------: | :--------------------------: | :----------------------: |
| 主要读取模式 |    查询少量记录，按键读取    |    在大批量记录上聚合    |
| 主要写入模式 |   随机访问，写入要求低延时   | 批量导入（ETL）或者事件流  |
|   主要用户   |    终端用户，通过 Web 应用     | 内部数据分析师，用于决策支持 |
|  处理的数据  | 数据的最新状态（当前时间点） |   随时间推移的历史事件   |
|  数据集尺寸  |           GB ~ TB            |         TB ~ PB          |

[^iv]: OLAP 中的首字母 O（online）的含义并不明确，它可能是指查询并不是用来生成预定义好的报告的事实，也可能是指分析师通常是交互式地使用 OLAP 系统来进行探索式的查询。

起初，事务处理和分析查询使用了相同的数据库。SQL 在这方面已证明是非常灵活的：对于 OLTP 类型的查询以及 OLAP 类型的查询来说效果都很好。尽管如此，在二十世纪八十年代末和九十年代初期，企业有停止使用 OLTP 系统进行分析的趋势，转而在单独的数据库上运行分析。这个单独的数据库被称为 **数据仓库（data warehouse）**。

### 数据仓库

一个企业可能有几十个不同的交易处理系统：面向终端客户的网站、控制实体商店的收银系统、仓库库存跟踪、车辆路线规划、供应链管理、员工管理等。这些系统中每一个都很复杂，需要专人维护，所以最终这些系统互相之间都是独立运行的。

这些 OLTP 系统往往对业务运作至关重要，因而通常会要求 **高可用** 与 **低延迟**。所以 DBA 会密切关注他们的 OLTP 数据库，他们通常不愿意让业务分析人员在 OLTP 数据库上运行临时的分析查询，因为这些查询通常开销巨大，会扫描大部分数据集，这会损害同时在执行的事务的性能。

相比之下，数据仓库是一个独立的数据库，分析人员可以查询他们想要的内容而不影响 OLTP 操作【48】。数据仓库包含公司各种 OLTP 系统中所有的只读数据副本。从 OLTP 数据库中提取数据（使用定期的数据转储或连续的更新流），转换成适合分析的模式，清理并加载到数据仓库中。将数据存入仓库的过程称为 “**抽取 - 转换 - 加载（ETL）**”，如 [图 3-8](/v1/ddia_0308.png) 所示。

![](/v1/ddia_0308.png)

**图 3-8 ETL 至数据仓库的简化提纲**

几乎所有的大型企业都有数据仓库，但在小型企业中几乎闻所未闻。这可能是因为大多数小公司没有这么多不同的 OLTP 系统，大多数小公司只有少量的数据 —— 可以在传统的 SQL 数据库中查询，甚至可以在电子表格中分析。在一家大公司里，要做一些在一家小公司很简单的事情，需要很多繁重的工作。

使用单独的数据仓库，而不是直接查询 OLTP 系统进行分析的一大优势是数据仓库可针对分析类的访问模式进行优化。事实证明，本章前半部分讨论的索引算法对于 OLTP 来说工作得很好，但对于处理分析查询并不是很好。在本章的其余部分中，我们将研究为分析而优化的存储引擎。

#### OLTP数据库和数据仓库之间的分歧

数据仓库的数据模型通常是关系型的，因为 SQL 通常很适合分析查询。有许多图形数据分析工具可以生成 SQL 查询，可视化结果，并允许分析人员探索数据（通过下钻、切片和切块等操作）。

表面上，一个数据仓库和一个关系型 OLTP 数据库看起来很相似，因为它们都有一个 SQL 查询接口。然而，系统的内部看起来可能完全不同，因为它们针对非常不同的查询模式进行了优化。现在许多数据库供应商都只是重点支持事务处理负载和分析工作负载这两者中的一个，而不是都支持。

一些数据库（例如 Microsoft SQL Server 和 SAP HANA）支持在同一产品中进行事务处理和数据仓库。但是，它们也正日益发展为两套独立的存储和查询引擎，只是这些引擎正好可以通过一个通用的 SQL 接口访问【49,50,51】。

Teradata、Vertica、SAP HANA 和 ParAccel 等数据仓库供应商通常使用昂贵的商业许可证销售他们的系统。Amazon RedShift 是 ParAccel 的托管版本。最近，大量的开源 SQL-on-Hadoop 项目已经出现，它们还很年轻，但是正在与商业数据仓库系统竞争，包括 Apache Hive、Spark SQL、Cloudera Impala、Facebook Presto、Apache Tajo 和 Apache Drill【52,53】。其中一些基于谷歌 Dremel 的想法【54】。

### 星型和雪花型：分析的模式

正如 [第二章](/v1/ch2) 所探讨的，根据应用程序的需要，在事务处理领域中使用了大量不同的数据模型。另一方面，在分析型业务中，数据模型的多样性则少得多。许多数据仓库都以相当公式化的方式使用，被称为星型模式（也称为维度建模【55】）。

[图 3-9](/v1/ddia_0309.png) 中的示例模式显示了可能在食品零售商处找到的数据仓库。在模式的中心是一个所谓的事实表（在这个例子中，它被称为 `fact_sales`）。事实表的每一行代表在特定时间发生的事件（这里，每一行代表客户购买的产品）。如果我们分析的是网站流量而不是零售量，则每行可能代表一个用户的页面浏览或点击。

![](/v1/ddia_0309.png)

**图 3-9 用于数据仓库的星型模式的示例**

通常情况下，事实被视为单独的事件，因为这样可以在以后分析中获得最大的灵活性。但是，这意味着事实表可以变得非常大。像苹果、沃尔玛或 eBay 这样的大企业在其数据仓库中可能有几十 PB 的交易历史，其中大部分保存在事实表中【56】。

事实表中的一些列是属性，例如产品销售的价格和从供应商那里购买的成本（可以用来计算利润率）。事实表中的其他列是对其他表（称为维度表）的外键引用。由于事实表中的每一行都表示一个事件，因此这些维度代表事件发生的对象、内容、地点、时间、方式和原因。

例如，在 [图 3-9](/v1/ddia_0309.png) 中，其中一个维度是已售出的产品。`dim_product` 表中的每一行代表一种待售产品，包括库存单位（SKU）、产品描述、品牌名称、类别、脂肪含量、包装尺寸等。`fact_sales` 表中的每一行都使用外键表明在特定交易中销售了什么产品。（简单起见，如果客户一次购买了几种不同的产品，则它们在事实表中被表示为单独的行）。

甚至日期和时间也通常使用维度表来表示，因为这允许对日期的附加信息（诸如公共假期）进行编码，从而允许区分假期和非假期的销售查询。

“星型模式” 这个名字来源于这样一个事实，即当我们对表之间的关系进行可视化时，事实表在中间，被维度表包围；与这些表的连接就像星星的光芒。

这个模板的变体被称为雪花模式，其中维度被进一步分解为子维度。例如，品牌和产品类别可能有单独的表格，并且 `dim_product` 表格中的每一行都可以将品牌和类别作为外键引用，而不是将它们作为字符串存储在 `dim_product` 表格中。雪花模式比星型模式更规范化，但是星型模式通常是首选，因为分析师使用它更简单【55】。

在典型的数据仓库中，表格通常非常宽：事实表通常有 100 列以上，有时甚至有数百列【51】。维度表也可以是非常宽的，因为它们包括了所有可能与分析相关的元数据 —— 例如，`dim_store` 表可以包括在每个商店提供哪些服务的细节、它是否具有店内面包房、店面面积、商店第一次开张的日期、最近一次改造的时间、离最近的高速公路的距离等等。


## 列式存储

如果事实表中有万亿行和数 PB 的数据，那么高效地存储和查询它们就成为一个具有挑战性的问题。维度表通常要小得多（数百万行），所以在本节中我们将主要关注事实表的存储。

尽管事实表通常超过 100 列，但典型的数据仓库查询一次只会访问其中 4 个或 5 个列（“`SELECT *`” 查询很少用于分析）【51】。以例 3-1 中的查询为例：它访问了大量的行（在 2013 年中所有购买了水果或糖果的记录），但只需访问 `fact_sales` 表的三列：`date_key, product_sk, quantity`。该查询忽略了所有其他的列。

**例 3-1 分析人们是否更倾向于在一周的某一天购买新鲜水果或糖果**

```sql
SELECT
  dim_date.weekday,
  dim_product.category,
  SUM(fact_sales.quantity) AS quantity_sold
FROM fact_sales
  JOIN dim_date ON fact_sales.date_key = dim_date.date_key
  JOIN dim_product ON fact_sales.product_sk = dim_product.product_sk
WHERE
  dim_date.year = 2013 AND
  dim_product.category IN ('Fresh fruit', 'Candy')
GROUP BY
  dim_date.weekday, dim_product.category;
```

我们如何有效地执行这个查询？

在大多数 OLTP 数据库中，存储都是以面向行的方式进行布局的：表格的一行中的所有值都相邻存储。文档数据库也是相似的：整个文档通常存储为一个连续的字节序列。你可以在 [图 3-1](/v1/ddia_0301.png) 的 CSV 例子中看到这个。

为了处理像例 3-1 这样的查询，你可能在 `fact_sales.date_key`、`fact_sales.product_sk` 上有索引，它们告诉存储引擎在哪里查找特定日期或特定产品的所有销售情况。但是，面向行的存储引擎仍然需要将所有这些行（每个包含超过 100 个属性）从硬盘加载到内存中，解析它们，并过滤掉那些不符合条件的行。这可能需要很长时间。

列式存储背后的想法很简单：不要将所有来自一行的值存储在一起，而是将来自每一列的所有值存储在一起。如果每一列都存储在一个单独的文件中，查询只需要读取和解析查询中使用的那些列，这可以节省大量的工作。这个原理如 [图 3-10](/v1/ddia_0310.png) 所示。

![](/v1/ddia_0310.png)

**图 3-10 按列存储关系型数据，而不是行**

> 列式存储在关系数据模型中是最容易理解的，但它同样适用于非关系数据。例如，Parquet【57】是一种列式存储格式，支持基于 Google 的 Dremel 的文档数据模型【54】。

列式存储布局依赖于每个列文件包含相同顺序的行。因此，如果你需要重新组装完整的行，你可以从每个单独的列文件中获取第 23 项，并将它们放在一起形成表的第 23 行。


### 列压缩

除了仅从硬盘加载查询所需的列以外，我们还可以通过压缩数据来进一步降低对硬盘吞吐量的需求。幸运的是，列式存储通常很适合压缩。

看看 [图 3-10](/v1/ddia_0310.png) 中每一列的值序列：它们通常看起来是相当重复的，这是压缩的好兆头。根据列中的数据，可以使用不同的压缩技术。在数据仓库中特别有效的一种技术是位图编码，如 [图 3-11](/v1/ddia_0311.png) 所示。

![](/v1/ddia_0311.png)

**图 3-11 压缩的位图索引存储布局**

通常情况下，一列中不同值的数量与行数相比要小得多（例如，零售商可能有数十亿的销售交易，但只有 100,000 个不同的产品）。现在我们可以拿一个有 n 个不同值的列，并把它转换成 n 个独立的位图：每个不同值对应一个位图，每行对应一个比特位。如果该行具有该值，则该位为 1，否则为 0。

如果 n 非常小（例如，国家 / 地区列可能有大约 200 个不同的值），则这些位图可以将每行存储成一个比特位。但是，如果 n 更大，大部分位图中将会有很多的零（我们说它们是稀疏的）。在这种情况下，位图可以另外再进行游程编码（run-length encoding，一种无损数据压缩技术），如 [图 3-11](/v1/ddia_0311.png) 底部所示。这可以使列的编码非常紧凑。

这些位图索引非常适合数据仓库中常见的各种查询。例如：

```sql
WHERE product_sk IN (30, 68, 69)
```

加载 `product_sk = 30`、`product_sk = 68` 和 `product_sk = 69` 这三个位图，并计算三个位图的按位或（OR），这可以非常有效地完成。

```sql
WHERE product_sk = 31 AND store_sk = 3
```

加载 `product_sk = 31` 和 `store_sk = 3` 的位图，并计算按位与（AND）。这是因为列按照相同的顺序包含行，因此一列的位图中的第 k 位和另一列的位图中的第 k 位对应相同的行。

对于不同种类的数据，也有各种不同的压缩方案，但我们不会详细讨论它们，请参阅【58】的概述。

> #### 列式存储和列族
>
> Cassandra 和 HBase 有一个列族（column families）的概念，他们从 Bigtable 继承【9】。然而，把它们称为列式（column-oriented）是非常具有误导性的：在每个列族中，它们将一行中的所有列与行键一起存储，并且不使用列压缩。因此，Bigtable 模型仍然主要是面向行的。
>

#### 内存带宽和矢量化处理

对于需要扫描数百万行的数据仓库查询来说，一个巨大的瓶颈是从硬盘获取数据到内存的带宽。但是，这不是唯一的瓶颈。分析型数据库的开发人员还需要有效地利用内存到 CPU 缓存的带宽，避免 CPU 指令处理流水线中的分支预测错误和闲置等待，以及在现代 CPU 上使用单指令多数据（SIMD）指令来加速运算【59,60】。

除了减少需要从硬盘加载的数据量以外，列式存储布局也可以有效利用 CPU 周期。例如，查询引擎可以将一整块压缩好的列数据放进 CPU 的 L1 缓存中，然后在紧密的循环（即没有函数调用）中遍历。相比于每条记录的处理都需要大量函数调用和条件判断的代码，CPU 执行这样一个循环要快得多。列压缩允许列中的更多行被同时放进容量有限的 L1 缓存。前面描述的按位 “与” 和 “或” 运算符可以被设计为直接在这样的压缩列数据块上操作。这种技术被称为矢量化处理（vectorized processing）【58,49】。


### 列式存储中的排序顺序

在列式存储中，存储行的顺序并不关键。按插入顺序存储它们是最简单的，因为插入一个新行只需要追加到每个列文件。但是，我们也可以选择按某种顺序来排列数据，就像我们之前对 SSTables 所做的那样，并将其用作索引机制。

注意，对每列分别执行排序是没有意义的，因为那样就没法知道不同列中的哪些项属于同一行。我们只能在明确一列中的第 k 项与另一列中的第 k 项属于同一行的情况下，才能重建出完整的行。

相反，数据的排序需要对一整行统一操作，即使它们的存储方式是按列的。数据库管理员可以根据他们对常用查询的了解，来选择表格中用来排序的列。例如，如果查询通常以日期范围为目标，例如“上个月”，则可以将 `date_key` 作为第一个排序键。这样查询优化器就可以只扫描近 1 个月范围的行了，这比扫描所有行要快得多。

对于第一排序列中具有相同值的行，可以用第二排序列来进一步排序。例如，如果 `date_key` 是 [图 3-10](/v1/ddia_0310.png) 中的第一个排序关键字，那么 `product_sk` 可能是第二个排序关键字，以便同一天的同一产品的所有销售数据都被存储在相邻位置。这将有助于需要在特定日期范围内按产品对销售进行分组或过滤的查询。

按顺序排序的另一个好处是它可以帮助压缩列。如果主要排序列没有太多个不同的值，那么在排序之后，将会得到一个相同的值连续重复多次的序列。一个简单的游程编码（就像我们用于 [图 3-11](/v1/ddia_0311.png) 中的位图一样）可以将该列压缩到几 KB —— 即使表中有数十亿行。

第一个排序键的压缩效果最强。第二和第三个排序键会更混乱，因此不会有这么长的连续的重复值。排序优先级更低的列以几乎随机的顺序出现，所以可能不会被压缩。但对前几列做排序在整体上仍然是有好处的。

#### 几个不同的排序顺序

对这个想法，有一个巧妙的扩展被 C-Store 发现，并在商业数据仓库 Vertica 中被采用【61,62】：既然不同的查询受益于不同的排序顺序，为什么不以几种不同的方式来存储相同的数据呢？反正数据都需要做备份，以防单点故障时丢失数据。因此你可以用不同排序方式来存储冗余数据，以便在处理查询时，调用最适合查询模式的版本。

在一个列式存储中有多个排序顺序有点类似于在一个面向行的存储中有多个次级索引。但最大的区别在于面向行的存储将每一行保存在一个地方（在堆文件或聚集索引中），次级索引只包含指向匹配行的指针。在列式存储中，通常在其他地方没有任何指向数据的指针，只有包含值的列。

### 写入列式存储

这些优化在数据仓库中是有意义的，因为其负载主要由分析人员运行的大型只读查询组成。列式存储、压缩和排序都有助于更快地读取这些查询。然而，他们的缺点是写入更加困难。

使用 B 树的就地更新方法对于压缩的列是不可能的。如果你想在排序表的中间插入一行，你很可能不得不重写所有的列文件。由于行由列中的位置标识，因此插入必须对所有列进行一致地更新。

幸运的是，本章前面已经看到了一个很好的解决方案：LSM 树。所有的写操作首先进入一个内存中的存储，在这里它们被添加到一个已排序的结构中，并准备写入硬盘。内存中的存储是面向行还是列的并不重要。当已经积累了足够的写入数据时，它们将与硬盘上的列文件合并，并批量写入新文件。这基本上是 Vertica 所做的【62】。

查询操作需要检查硬盘上的列数据和内存中的最近写入，并将两者的结果合并起来。但是，查询优化器对用户隐藏了这个细节。从分析师的角度来看，通过插入、更新或删除操作进行修改的数据会立即反映在后续的查询中。

### 聚合：数据立方体和物化视图

并非所有数据仓库都需要采用列式存储：传统的面向行的数据库和其他一些架构也被使用。然而，列式存储可以显著加快专门的分析查询，所以它正在迅速变得流行起来【51,63】。

数据仓库的另一个值得一提的方面是物化聚合（materialized aggregates）。如前所述，数据仓库查询通常涉及一个聚合函数，如 SQL 中的 COUNT、SUM、AVG、MIN 或 MAX。如果相同的聚合被许多不同的查询使用，那么每次都通过原始数据来处理可能太浪费了。为什么不将一些查询使用最频繁的计数或总和缓存起来？

创建这种缓存的一种方式是物化视图（Materialized View）。在关系数据模型中，它的定义方式通常与标准（虚拟）视图类似：一个类似于表的对象，其内容是一些查询的结果。不同的是，物化视图是查询结果的实际副本，会被写入硬盘，而虚拟视图只是编写查询的一个捷径。从虚拟视图读取时，SQL 引擎会将其展开到视图的底层查询中，然后再处理展开的查询。

当底层数据发生变化时，物化视图需要更新，因为它是数据的非规范化副本。数据库可以自动完成该操作，但是这样的更新使得写入成本更高，这就是在 OLTP 数据库中不经常使用物化视图的原因。在读取繁重的数据仓库中，它们可能更有意义（它们是否实际上改善了读取性能取决于使用场景）。

物化视图的常见特例称为数据立方体或 OLAP 立方【64】。它是按不同维度分组的聚合网格。[图 3-12](/v1/ddia_0312.png) 显示了一个例子。

![](/v1/ddia_0312.png)

**图 3-12 数据立方的两个维度，通过求和聚合**

想象一下，现在每个事实都只有两个维度表的外键 —— 在 [图 3-12](/v1/ddia_0312.png) 中分别是日期和产品。你现在可以绘制一个二维表格，一个轴线上是日期，另一个轴线上是产品。每个单元格包含具有该日期 - 产品组合的所有事实的属性（例如 `net_price`）的聚合（例如 `SUM`）。然后，你可以沿着每行或每列应用相同的汇总，并获得减少了一个维度的汇总（按产品的销售额，无论日期，或者按日期的销售额，无论产品）。

一般来说，事实往往有两个以上的维度。在图 3-9 中有五个维度：日期、产品、商店、促销和客户。要想象一个五维超立方体是什么样子是很困难的，但是原理是一样的：每个单元格都包含特定日期 - 产品 - 商店 - 促销 - 客户组合的销售额。这些值可以在每个维度上求和汇总。

物化数据立方体的优点是可以让某些查询变得非常快，因为它们已经被有效地预先计算了。例如，如果你想知道每个商店的总销售额，则只需查看合适维度的总计，而无需扫描数百万行的原始数据。

数据立方体的缺点是不具有查询原始数据的灵活性。例如，没有办法计算有多少比例的销售来自成本超过 100 美元的项目，因为价格不是其中的一个维度。因此，大多数数据仓库试图保留尽可能多的原始数据，并将聚合数据（如数据立方体）仅用作某些查询的性能提升手段。


## 本章小结

在本章中，我们试图深入了解数据库是如何处理存储和检索的。将数据存储在数据库中会发生什么？稍后再次查询数据时数据库会做什么？

在高层次上，我们看到存储引擎分为两大类：针对 **事务处理（OLTP）** 优化的存储引擎和针对 **在线分析（OLAP）** 优化的存储引擎。这两类使用场景的访问模式之间有很大的区别：

* OLTP 系统通常面向最终用户，这意味着系统可能会收到大量的请求。为了处理负载，应用程序在每个查询中通常只访问少量的记录。应用程序使用某种键来请求记录，存储引擎使用索引来查找所请求的键的数据。硬盘查找时间往往是这里的瓶颈。
* 数据仓库和类似的分析系统会少见一些，因为它们主要由业务分析人员使用，而不是最终用户。它们的查询量要比 OLTP 系统少得多，但通常每个查询开销高昂，需要在短时间内扫描数百万条记录。硬盘带宽（而不是查找时间）往往是瓶颈，列式存储是针对这种工作负载的日益流行的解决方案。

在 OLTP 这一边，我们能看到两派主流的存储引擎：

* 日志结构学派：只允许追加到文件和删除过时的文件，但不会更新已经写入的文件。Bitcask、SSTables、LSM 树、LevelDB、Cassandra、HBase、Lucene 等都属于这个类别。
* 就地更新学派：将硬盘视为一组可以覆写的固定大小的页面。B 树是这种理念的典范，用在所有主要的关系数据库和许多非关系型数据库中。

日志结构的存储引擎是相对较新的技术。他们的主要想法是，通过系统性地将随机访问写入转换为硬盘上的顺序写入，由于硬盘驱动器和固态硬盘的性能特点，可以实现更高的写入吞吐量。

关于 OLTP，我们最后还介绍了一些更复杂的索引结构，以及针对所有数据都放在内存里而优化的数据库。

然后，我们暂时放下了存储引擎的内部细节，查看了典型数据仓库的高级架构，并说明了为什么分析工作负载与 OLTP 差别很大：当你的查询需要在大量行中顺序扫描时，索引的重要性就会降低很多。相反，非常紧凑地编码数据变得非常重要，以最大限度地减少查询需要从硬盘读取的数据量。我们讨论了列式存储如何帮助实现这一目标。

作为一名应用程序开发人员，如果你掌握了有关存储引擎内部的知识，那么你就能更好地了解哪种工具最适合你的特定应用程序。当你调整数据库的优化参数时，这种理解让你能够设想增减某个值会产生怎样的效果。

尽管本章不能让你成为一个特定存储引擎的调参专家，但它至少大概率使你有了足够的概念与词汇储备去读懂你所选择的数据库的文档。


## 参考文献

1. Alfred V. Aho, John E. Hopcroft, and Jeffrey D. Ullman: *Data Structures and Algorithms*. Addison-Wesley, 1983. ISBN: 978-0-201-00023-8
1. Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein: *Introduction to Algorithms*, 3rd edition. MIT Press, 2009. ISBN: 978-0-262-53305-8
1. Justin Sheehy and David Smith: “[Bitcask: A Log-Structured Hash Table for Fast Key/Value Data](https://riak.com/assets/bitcask-intro.pdf),” Basho Technologies, April 2010.
1. Yinan Li, Bingsheng He, Robin Jun Yang, et al.: “[Tree Indexing on Solid State Drives](http://pages.cs.wisc.edu/~yinan/paper/fdtree_pvldb.pdf),” *Proceedings of the VLDB Endowment*, volume 3, number 1, pages 1195–1206, September 2010.
1. Goetz Graefe: “[Modern B-Tree Techniques](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=0b19f413ffb5bc68b43f3bd05a97c282a7c6d6ab),” *Foundations and Trends in Databases*, volume 3, number 4, pages 203–402, August 2011. [doi:10.1561/1900000028](http://dx.doi.org/10.1561/1900000028)
1. Jeffrey Dean and Sanjay Ghemawat: “[LevelDB Implementation Notes](https://github.com/google/leveldb/blob/master/doc/impl.md),” *github.com*.
1. Dhruba Borthakur: “[The History of RocksDB](https://rocksdb.blogspot.com/2013/11/the-history-of-rocksdb.html),” *rocksdb.blogspot.com*, November 24, 2013.
1. Matteo Bertozzi: “[Apache HBase I/O – HFile](https://blog.cloudera.com/apache-hbase-i-o-hfile/),” *blog.cloudera.com*, June 29, 2012.
1. Fay Chang, Jeffrey Dean, Sanjay Ghemawat, et al.: “[Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Patrick O'Neil, Edward Cheng, Dieter Gawlick, and Elizabeth O'Neil: “[The Log-Structured Merge-Tree (LSM-Tree)](http://www.cs.umb.edu/~poneil/lsmtree.pdf),” *Acta Informatica*, volume 33, number 4, pages 351–385, June 1996. [doi:10.1007/s002360050048](http://dx.doi.org/10.1007/s002360050048)
1. Mendel Rosenblum and John K. Ousterhout: “[The Design and Implementation of a Log-Structured File System](http://research.cs.wisc.edu/areas/os/Qual/papers/lfs.pdf),” *ACM Transactions on Computer Systems*, volume 10, number 1, pages 26–52, February 1992. [doi:10.1145/146941.146943](http://dx.doi.org/10.1145/146941.146943)
1. Adrien Grand: “[What Is in a Lucene Index?](http://www.slideshare.net/lucenerevolution/what-is-inaluceneagrandfinal),” at *Lucene/Solr Revolution*, November 14, 2013.
1. Deepak Kandepet: “[Hacking Lucene—The Index Format](https://web.archive.org/web/20160316190830/http://hackerlabs.github.io/blog/2011/10/01/hacking-lucene-the-index-format/index.html),” *hackerlabs.github.io*, October 1, 2011.
1. Michael McCandless: “[Visualizing Lucene's Segment Merges](http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html),” *blog.mikemccandless.com*, February 11, 2011.
1. Burton H. Bloom: “[Space/Time Trade-offs in Hash Coding with Allowable Errors](https://people.cs.umass.edu/~emery/classes/cmpsci691st/readings/Misc/p422-bloom.pdf),” *Communications of the ACM*, volume 13, number 7, pages 422–426, July 1970. [doi:10.1145/362686.362692](http://dx.doi.org/10.1145/362686.362692)
1. “[Operating Cassandra: Compaction](https://cassandra.apache.org/doc/latest/operating/compaction/index.html),” Apache Cassandra Documentation v4.0, 2016.
1. Rudolf Bayer and Edward M. McCreight: “[Organization and Maintenance of Large Ordered Indices](https://apps.dtic.mil/sti/citations/AD0712079),” Boeing Scientific Research Laboratories, Mathematical and Information Sciences Laboratory, report no. 20, July 1970.
1. Douglas Comer: “[The Ubiquitous B-Tree](https://carlosproal.com/ir/papers/p121-comer.pdf),” *ACM Computing Surveys*, volume 11, number 2, pages 121–137, June 1979. [doi:10.1145/356770.356776](http://dx.doi.org/10.1145/356770.356776)
1. Emmanuel Goossaert: “[Coding for SSDs](http://codecapsule.com/2014/02/12/coding-for-ssds-part-1-introduction-and-table-of-contents/),” *codecapsule.com*, February 12, 2014.
1. C. Mohan and Frank Levine: “[ARIES/IM: An Efficient and High Concurrency Index Management Method Using Write-Ahead Logging](http://www.ics.uci.edu/~cs223/papers/p371-mohan.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 1992. [doi:10.1145/130283.130338](http://dx.doi.org/10.1145/130283.130338)
1. Howard Chu: “[LDAP at Lightning Speed](https://buildstuff14.sched.com/event/08a1a368e272eb599a52e08b4c3c779d),” at *Build Stuff '14*, November 2014.
1. Bradley C. Kuszmaul: “[A Comparison of Fractal Trees to Log-Structured Merge (LSM) Trees](http://www.pandademo.com/wp-content/uploads/2017/12/A-Comparison-of-Fractal-Trees-to-Log-Structured-Merge-LSM-Trees.pdf),” *tokutek.com*, April 22, 2014.
1. Manos Athanassoulis, Michael S. Kester, Lukas M. Maas, et al.: “[Designing Access Methods: The RUM Conjecture](http://openproceedings.org/2016/conf/edbt/paper-12.pdf),” at *19th International Conference on Extending Database Technology* (EDBT), March 2016. [doi:10.5441/002/edbt.2016.42](http://dx.doi.org/10.5441/002/edbt.2016.42)
1. Peter Zaitsev: “[Innodb Double Write](https://www.percona.com/blog/2006/08/04/innodb-double-write/),” *percona.com*, August 4, 2006.
1. Tomas Vondra: “[On the Impact of Full-Page Writes](https://www.enterprisedb.com/blog/impact-full-page-writes),” *blog.2ndquadrant.com*, November 23, 2016.
1. Mark Callaghan: “[The Advantages of an LSM vs a B-Tree](http://smalldatum.blogspot.co.uk/2016/01/summary-of-advantages-of-lsm-vs-b-tree.html),” *smalldatum.blogspot.co.uk*, January 19, 2016.
1. Mark Callaghan: “[Choosing Between Efficiency and Performance with RocksDB](https://codemesh.io/codemesh2016/mark-callaghan),” at *Code Mesh*, November 4, 2016.
1. Michi Mutsuzaki: “[MySQL vs. LevelDB](https://github.com/m1ch1/mapkeeper/wiki/MySQL-vs.-LevelDB),” *github.com*, August 2011.
1. Benjamin Coverston, Jonathan Ellis, et al.: “[CASSANDRA-1608: Redesigned Compaction](https://issues.apache.org/jira/browse/CASSANDRA-1608),” *issues.apache.org*, July 2011.
1. Igor Canadi, Siying Dong, and Mark Callaghan: “[RocksDB Tuning Guide](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide),” *github.com*, 2016.
1. [*MySQL 5.7 Reference Manual*](http://dev.mysql.com/doc/refman/5.7/en/index.html). Oracle, 2014.
1. [*Books Online for SQL Server 2012*](https://learn.microsoft.com/en-us/previous-versions/sql/sql-server-2012/ms130214(v=sql.110)). Microsoft, 2012.
1. Joe Webb: “[Using Covering Indexes to Improve Query Performance](https://www.simple-talk.com/sql/learn-sql-server/using-covering-indexes-to-improve-query-performance/),” *simple-talk.com*, 29 September 2008.
1. Frank Ramsak, Volker Markl, Robert Fenk, et al.: “[Integrating the UB-Tree into a Database System Kernel](http://www.vldb.org/conf/2000/P263.pdf),” at *26th International Conference on Very Large Data Bases* (VLDB), September 2000.
1. The PostGIS Development Group: “[PostGIS 2.1.2dev Manual](http://postgis.net/docs/manual-2.1/),” *postgis.net*, 2014.
1. Robert Escriva, Bernard Wong, and Emin Gün Sirer: “[HyperDex: A Distributed, Searchable Key-Value Store](http://www.cs.princeton.edu/courses/archive/fall13/cos518/papers/hyperdex.pdf),” at *ACM SIGCOMM Conference*, August 2012. [doi:10.1145/2377677.2377681](http://dx.doi.org/10.1145/2377677.2377681)
1. Michael McCandless: “[Lucene's FuzzyQuery Is 100 Times Faster in 4.0](http://blog.mikemccandless.com/2011/03/lucenes-fuzzyquery-is-100-times-faster.html),” *blog.mikemccandless.com*, March 24, 2011.
1. Steffen Heinz, Justin Zobel, and Hugh E. Williams: “[Burst Tries: A Fast, Efficient Data Structure for String Keys](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),” *ACM Transactions on Information Systems*, volume 20, number 2, pages 192–223, April 2002. [doi:10.1145/506309.506312](http://dx.doi.org/10.1145/506309.506312)
1. Klaus U. Schulz and Stoyan Mihov: “[Fast String Correction with Levenshtein Automata](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.16.652),” *International Journal on Document Analysis and Recognition*, volume 5, number 1, pages 67–85, November 2002. [doi:10.1007/s10032-002-0082-8](http://dx.doi.org/10.1007/s10032-002-0082-8)
1. Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze: [*Introduction to Information Retrieval*](http://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at *nlp.stanford.edu/IR-book*
1. Michael Stonebraker, Samuel Madden, Daniel J. Abadi, et al.: “[The End of an Architectural Era (It’s Time for a Complete Rewrite)](http://nms.csail.mit.edu/~stavros/pubs/hstore.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. “[VoltDB Technical Overview White Paper](https://www.voltdb.com/files/voltdb-technical-overview/),” VoltDB, 2014.
1. Stephen M. Rumble, Ankita Kejriwal, and John K. Ousterhout: “[Log-Structured Memory for DRAM-Based Storage](https://www.usenix.org/system/files/conference/fast14/fast14-paper_rumble.pdf),” at *12th USENIX Conference on File and Storage Technologies* (FAST), February 2014.
1. Stavros Harizopoulos, Daniel J. Abadi, Samuel Madden, and Michael Stonebraker: “[OLTP Through the Looking Glass, and What We Found There](http://hstore.cs.brown.edu/papers/hstore-lookingglass.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376713](http://dx.doi.org/10.1145/1376616.1376713)
1. Justin DeBrabant, Andrew Pavlo, Stephen Tu, et al.: “[Anti-Caching: A New Approach to Database Management System Architecture](http://www.vldb.org/pvldb/vol6/p1942-debrabant.pdf),” *Proceedings of the VLDB Endowment*, volume 6, number 14, pages 1942–1953, September 2013.
1. Joy Arulraj, Andrew Pavlo, and Subramanya R. Dulloor: “[Let's Talk About Storage & Recovery Methods for Non-Volatile Memory Database Systems](http://www.pdl.cmu.edu/PDL-FTP/NVM/storage.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2749441](http://dx.doi.org/10.1145/2723372.2749441)
1. Edgar F. Codd, S. B. Codd, and C. T. Salley: “[Providing OLAP to User-Analysts: An IT Mandate](https://pdfs.semanticscholar.org/a0bd/1491a54a4de428c5eef9b836ef6ee2915fe7.pdf),” E. F. Codd Associates, 1993.
1. Surajit Chaudhuri and Umeshwar Dayal: “[An Overview of Data Warehousing and OLAP Technology](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/sigrecord.pdf),” *ACM SIGMOD Record*, volume 26, number 1, pages 65–74, March 1997. [doi:10.1145/248603.248616](http://dx.doi.org/10.1145/248603.248616)
1. Per-Åke Larson, Cipri Clinciu, Campbell Fraser, et al.: “[Enhancements to SQL Server Column Stores](http://research.microsoft.com/pubs/193599/Apollo3%20-%20Sigmod%202013%20-%20final.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Franz Färber, Norman May, Wolfgang Lehner, et al.: “[The SAP HANA Database – An Architecture Overview](http://sites.computer.org/debull/A12mar/hana.pdf),” *IEEE Data Engineering Bulletin*, volume 35, number 1, pages 28–33, March 2012.
1. Michael Stonebraker: “[The Traditional RDBMS Wisdom Is (Almost Certainly) All Wrong](http://slideshot.epfl.ch/talks/166),” presentation at *EPFL*, May 2013.
1. Daniel J. Abadi: “[Classifying the SQL-on-Hadoop Solutions](https://web.archive.org/web/20150622074951/http://hadapt.com/blog/2013/10/02/classifying-the-sql-on-hadoop-solutions/),” *hadapt.com*, October 2, 2013.
1. Marcel Kornacker, Alexander Behm, Victor Bittorf, et al.: “[Impala: A Modern, Open-Source SQL Engine for Hadoop](http://pandis.net/resources/cidr15impala.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Sergey Melnik, Andrey Gubarev, Jing Jing Long, et al.: “[Dremel: Interactive Analysis of Web-Scale Datasets](https://research.google/pubs/pub36632/),” at *36th International Conference on Very Large Data Bases* (VLDB), pages 330–339, September 2010.
1. Ralph Kimball and Margy Ross: *The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*, 3rd edition. John Wiley & Sons, July 2013. ISBN: 978-1-118-53080-1
1. Derrick Harris: “[Why Apple, eBay, and Walmart Have Some of the Biggest Data Warehouses You’ve Ever Seen](https://web.archive.org/web/20221129085658/https://old.gigaom.com/2013/03/27/why-apple-ebay-and-walmart-have-some-of-the-biggest-data-warehouses-youve-ever-seen/),” *gigaom.com*, March 27, 2013.
1. Julien Le Dem: “[Dremel Made Simple with Parquet](https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html),” *blog.twitter.com*, September 11, 2013.
1. Daniel J. Abadi, Peter Boncz, Stavros Harizopoulos, et al.: “[The Design and Implementation of Modern Column-Oriented Database Systems](http://cs-www.cs.yale.edu/homes/dna/papers/abadi-column-stores.pdf),” *Foundations and Trends in Databases*, volume 5, number 3, pages 197–280, December 2013. [doi:10.1561/1900000024](http://dx.doi.org/10.1561/1900000024)
1. Peter Boncz, Marcin Zukowski, and Niels Nes: “[MonetDB/X100: Hyper-Pipelining Query Execution](http://cidrdb.org/cidr2005/papers/P19.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Jingren Zhou and Kenneth A. Ross: “[Implementing Database Operations Using SIMD Instructions](http://www1.cs.columbia.edu/~kar/pubsk/simd.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), pages 145–156, June 2002. [doi:10.1145/564691.564709](http://dx.doi.org/10.1145/564691.564709)
1. Michael Stonebraker, Daniel J. Abadi, Adam Batkin, et al.: “[C-Store: A Column-oriented DBMS](http://www.cs.umd.edu/~abadi/vldb.pdf),” at *31st International Conference on Very Large Data Bases* (VLDB), pages 553–564, September 2005.
1. Andrew Lamb, Matt Fuller, Ramakrishna Varadarajan, et al.: “[The Vertica Analytic Database: C-Store 7 Years Later](http://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf),” *Proceedings of the VLDB Endowment*, volume 5, number 12, pages 1790–1801, August 2012.
1. Julien Le Dem and Nong Li: “[Efficient Data Storage for Analytics with Apache Parquet 2.0](http://www.slideshare.net/julienledem/th-210pledem),” at *Hadoop Summit*, San Jose, June 2014.
1. Jim Gray, Surajit Chaudhuri, Adam Bosworth, et al.: “[Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals](http://arxiv.org/pdf/cs/0701155.pdf),” *Data Mining and Knowledge Discovery*, volume 1, number 1, pages 29–53, March 2007. [doi:10.1023/A:1009726021843](http://dx.doi.org/10.1023/A:1009726021843)


================================================
FILE: content/v1/ch4.md
================================================
---
title: "第四章：编码与演化"
linkTitle: "4. 编码与演化"
weight: 104
breadcrumbs: false
---

![](/map/ch04.png)

> 唯变所适
>
> —— 以弗所的赫拉克利特，为柏拉图所引（公元前 360 年）


应用程序不可避免地随时间而变化。新产品的推出，对需求的深入理解，或者商业环境的变化，总会伴随着 **功能（feature）** 的增增改改。[第一章](/v1/ch1) 介绍了 **可演化性（evolvability）** 的概念：应该尽力构建能灵活适应变化的系统（请参阅 “[可演化性：拥抱变化](/v1/ch1#可演化性：拥抱变化)”）。

在大多数情况下，修改应用程序的功能也意味着需要更改其存储的数据：可能需要使用新的字段或记录类型，或者以新方式展示现有数据。

我们在 [第二章](/v1/ch2) 讨论的数据模型有不同的方法来应对这种变化。关系数据库通常假定数据库中的所有数据都遵循一个模式：尽管可以更改该模式（通过模式迁移，即 `ALTER` 语句），但是在任何时间点都有且仅有一个正确的模式。相比之下，**读时模式**（schema-on-read，或 **无模式**，即 schemaless）数据库不会强制一个模式，因此数据库可以包含在不同时间写入的新老数据格式的混合（请参阅 “[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)” ）。

当数据 **格式（format）** 或 **模式（schema）** 发生变化时，通常需要对应用程序代码进行相应的更改（例如，为记录添加新字段，然后修改程序开始读写该字段）。但在大型应用程序中，代码变更通常不会立即完成：

* 对于 **服务端（server-side）** 应用程序，可能需要执行 **滚动升级 （rolling upgrade）** （也称为 **阶段发布（staged rollout）** ），一次将新版本部署到少数几个节点，检查新版本是否运行正常，然后逐渐部署到所有节点。这样无需中断服务即可部署新版本，为频繁发布提供了可行性，从而带来更好的可演化性。
* 对于 **客户端（client-side）** 应用程序，升不升级就要看用户的心情了。用户可能相当长一段时间里都不会去升级软件。

这意味着，新旧版本的代码，以及新旧数据格式可能会在系统中同时共处。系统想要继续顺利运行，就需要保持 **双向兼容性**：

向后兼容 (backward compatibility)
: 新的代码可以读取由旧的代码写入的数据。

向前兼容 (forward compatibility)
: 旧的代码可以读取由新的代码写入的数据。

向后兼容性通常并不难实现：新代码的作者当然知道由旧代码使用的数据格式，因此可以显式地处理它（最简单的办法是，保留旧代码即可读取旧数据）。

向前兼容性可能会更棘手，因为旧版的程序需要忽略新版数据格式中新增的部分。

本章中将介绍几种编码数据的格式，包括 JSON、XML、Protocol Buffers、Thrift 和 Avro。尤其将关注这些格式如何应对模式变化，以及它们如何对新旧代码数据需要共存的系统提供支持。然后将讨论如何使用这些格式进行数据存储和通信：在 Web 服务中，**表述性状态传递（REST）** 和 **远程过程调用（RPC）**，以及 **消息传递系统**（如 Actor 和消息队列）。

## 编码数据的格式

程序通常（至少）使用两种形式的数据：

1. 在内存中，数据保存在对象、结构体、列表、数组、散列表、树等中。这些数据结构针对 CPU 的高效访问和操作进行了优化（通常使用指针）。
2. 如果要将数据写入文件，或通过网络发送，则必须将其 **编码（encode）** 为某种自包含的字节序列（例如，JSON 文档）。由于每个进程都有自己独立的地址空间，一个进程中的指针对任何其他进程都没有意义，所以这个字节序列表示会与通常在内存中使用的数据结构完全不同。

> [!TIP] 除一些特殊情况外，例如某些内存映射文件或直接在压缩数据上操作（如 “[列压缩](/v1/ch3#列压缩)” 中所述）。

所以，需要在两种表示之间进行某种类型的翻译。从内存中表示到字节序列的转换称为 **编码（Encoding）** （也称为 **序列化（serialization）** 或 **编组（marshalling）**），反过来称为 **解码（Decoding）**（**解析（Parsing）**，**反序列化（deserialization）**，**反编组（unmarshalling）**）。

> [!TIP] 请注意，**编码（encode）**  与 **加密（encryption）** 无关。本书不讨论加密。

> [!WARNING] 术语冲突
> 不幸的是，在 [第七章](/v1/ch7)： **事务（Transaction）** 的上下文里，**序列化（Serialization）** 这个术语也出现了，而且具有完全不同的含义。尽管序列化可能是更常见的术语，为了避免术语重载，本书中坚持使用 **编码（Encoding）** 表达此含义。

这是一个常见的问题，因而有许多库和编码格式可供选择。首先让我们概览一下。

### 语言特定的格式

许多编程语言都内建了将内存对象编码为字节序列的支持。例如，Java 有 `java.io.Serializable` 【1】，Ruby 有 `Marshal`【2】，Python 有 `pickle`【3】，等等。许多第三方库也存在，例如 `Kryo for Java` 【4】。

这些编码库非常方便，可以用很少的额外代码实现内存对象的保存与恢复。但是它们也有一些深层次的问题：

* 这类编码通常与特定的编程语言深度绑定，其他语言很难读取这种数据。如果以这类编码存储或传输数据，那你就和这门语言绑死在一起了。并且很难将系统与其他组织的系统（可能用的是不同的语言）进行集成。
* 为了恢复相同对象类型的数据，解码过程需要 **实例化任意类** 的能力，这通常是安全问题的一个来源【5】：如果攻击者可以让应用程序解码任意的字节序列，他们就能实例化任意的类，这会允许他们做可怕的事情，如远程执行任意代码【6,7】。
* 在这些库中，数据版本控制通常是事后才考虑的。因为它们旨在快速简便地对数据进行编码，所以往往忽略了向前和向后兼容性带来的麻烦问题。
* 效率（编码或解码所花费的 CPU 时间，以及编码结构的大小）往往也是事后才考虑的。例如，Java 的内置序列化由于其糟糕的性能和臃肿的编码而臭名昭著【8】。

因此，除非临时使用，采用语言内置编码通常是一个坏主意。

### JSON、XML和二进制变体

当我们谈到可以被多种编程语言读写的标准编码时，JSON 和 XML 是最显眼的角逐者。它们广为人知，广受支持，也 “广受憎恶”。XML 经常受到批评：过于冗长且过分复杂【9】。JSON 的流行则主要源于（通过成为 JavaScript 的一个子集）Web 浏览器的内置支持，以及相对于 XML 的简单性。CSV 是另一种流行的与语言无关的格式，尽管其功能相对较弱。

JSON，XML 和 CSV 属于文本格式，因此具有人类可读性（尽管它们的语法是一个热门争议话题）。除了表面的语法问题之外，它们也存在一些微妙的问题：

* **数字（numbers）** 编码有很多模糊之处。在 XML 和 CSV 中，无法区分数字和碰巧由数字组成的字符串（除了引用外部模式）。JSON 虽然区分字符串与数字，但并不区分整数和浮点数，并且不能指定精度。
这在处理大数字时是个问题。例如大于 $2^{53}$ 的整数无法使用 IEEE 754 双精度浮点数精确表示，因此在使用浮点数的语言（例如 JavaScript）中进行解析时，这些数字会变得不准确。Twitter 有一个关于大于 $2^{53}$ 的数字的例子，它使用 64 位整数来标识每条推文。Twitter API 返回的 JSON 中包含了两次推文 ID，一次是 JSON 数字，另一次是十进制字符串，以解决 JavaScript 程序中无法正确解析数字的问题【10】。
* JSON 和 XML 对 Unicode 字符串（即人类可读的文本）有很好的支持，但是它们不支持二进制数据（即不带 **字符编码（character encoding）** 的字节序列）。二进制串是很有用的功能，人们通过使用 Base64 将二进制数据编码为文本来绕过此限制。其特有的模式标识着这个值应当被解释为 Base64 编码的二进制数据。这种方案虽然管用，但比较 Hacky，并且会增加三分之一的数据大小。
*  XML 【11】和 JSON 【12】都有可选的模式支持。这些模式语言相当强大，所以学习和实现起来都相当复杂。XML 模式的使用相当普遍，但许多基于 JSON 的工具才不会去折腾模式。对数据的正确解读（例如区分数值与二进制串）取决于模式中的信息，因此不使用 XML/JSON 模式的应用程序可能需要对相应的编码 / 解码逻辑进行硬编码。
* CSV 没有任何模式，因此每行和每列的含义完全由应用程序自行定义。如果应用程序变更添加了新的行或列，那么这种变更必须通过手工处理。CSV 也是一个相当模糊的格式（如果一个值包含逗号或换行符，会发生什么？）。尽管其转义规则已经被正式指定【13】，但并不是所有的解析器都正确地实现了标准。

尽管存在这些缺陷，但 JSON、XML 和 CSV 对很多需求来说已经足够好了。它们很可能会继续流行下去，特别是作为数据交换格式来说（即将数据从一个组织发送到另一个组织）。在这种情况下，只要人们对格式是什么意见一致，格式有多美观或者效率有多高就无所谓了。让不同的组织就这些东西达成一致的难度超过了绝大多数问题。

#### 二进制编码

对于仅在组织内部使用的数据，使用最小公分母式的编码格式压力较小。例如，可以选择更紧凑或解析更快的格式。虽然对小数据集来说，收益可以忽略不计；但一旦达到 TB 级别，数据格式的选型就会产生巨大的影响。

JSON 比 XML 简洁，但与二进制格式相比还是太占空间。这一事实导致大量二进制编码版本 JSON（MessagePack、BSON、BJSON、UBJSON、BISON 和 Smile 等） 和 XML（例如 WBXML 和 Fast Infoset）的出现。这些格式已经在各种各样的领域中采用，但是没有一个能像文本版 JSON 和 XML 那样被广泛采用。

这些格式中的一些扩展了一组数据类型（例如，区分整数和浮点数，或者增加对二进制字符串的支持），另一方面，它们没有改变 JSON / XML 的数据模型。特别是由于它们没有规定模式，所以它们需要在编码数据中包含所有的对象字段名称。也就是说，在 [例 4-1]() 中的 JSON 文档的二进制编码中，需要在某处包含字符串 `userName`，`favoriteNumber` 和 `interests`。

**例 4-1 本章中用于展示二进制编码的示例记录**

```json
{
    "userName": "Martin",
    "favoriteNumber": 1337,
    "interests": ["daydreaming", "hacking"]
}
```

我们来看一个 MessagePack 的例子，它是一个 JSON 的二进制编码。图 4-1 显示了如果使用 MessagePack 【14】对 [例 4-1]() 中的 JSON 文档进行编码，则得到的字节序列。前几个字节如下：

1. 第一个字节 `0x83` 表示接下来是 **3** 个字段（低四位 = `0x03`）的 **对象 object**（高四位 = `0x80`）。（如果你想知道一个对象有 15 个以上的字段、字段数量塞不进 4 个 bit 时会发生什么：它会使用另一个不同的类型标识符，字段的数量被编码为两个或四个字节）。
2. 第二个字节 `0xa8` 表示接下来是 **8** 字节长（低四位 = `0x08`）的字符串（高四位 = `0xa0`）。
3. 接下来八个字节是 ASCII 字符串形式的字段名称 `userName`。由于之前已经指明长度，不需要任何标记来标识字符串的结束位置（或者任何转义）。
4. 接下来的七个字节对前缀为 `0xa6` 的六个字母的字符串值 `Martin` 进行编码，依此类推。

二进制编码长度为 66 个字节，仅略小于文本 JSON 编码所占的 81 个字节（删除了空白）。所有的 JSON 的二进制编码在这方面是相似的。空间节省了一丁点（以及解析加速）是否能弥补可读性的损失，谁也说不准。

在下面的章节中，能达到比这好得多的结果，只用 32 个字节对相同的记录进行编码。

![](/v1/ddia_0401.png)

**图 4-1 使用 MessagePack 编码的记录（例 4-1）**

### Thrift与Protocol Buffers

Apache Thrift 【15】和 Protocol Buffers（protobuf）【16】是基于相同原理的二进制编码库。Protocol Buffers 最初是在 Google 开发的，Thrift 最初是在 Facebook 开发的，并且都是在 2007~2008 开源的【17】。
Thrift 和 Protocol Buffers 都需要一个模式来编码任何数据。要用 Thrift 对 [例 4-1]() 中的数据进行编码，可以使用 Thrift **接口定义语言（IDL）** 来描述模式，如下所示：

```c
struct Person {
    1: required string       userName,
    2: optional i64          favoriteNumber,
    3: optional list<string> interests
}
```

Protocol Buffers 的等效模式定义看起来非常相似：

```protobuf
message Person {
    required string user_name       = 1;
    optional int64  favorite_number = 2;
    repeated string interests       = 3;
}
```

Thrift 和 Protocol Buffers 每一个都带有一个代码生成工具，它采用了类似于这里所示的模式定义，并且生成了以各种编程语言实现模式的类【18】。你的应用程序代码可以调用此生成的代码来对模式的记录进行编码或解码。
用这个模式编码的数据是什么样的？令人困惑的是，Thrift 有两种不同的二进制编码格式 [^iii]，分别称为 BinaryProtocol 和 CompactProtocol。先来看看 BinaryProtocol。使用这种格式的编码来编码 [例 4-1]() 中的消息只需要 59 个字节，如 [图 4-2](/v1/ddia_0402.png) 所示【19】。

![](/v1/ddia_0402.png)

**图 4-2 使用 Thrift 二进制协议编码的记录**

[^iii]: 实际上，Thrift 有三种二进制协议：BinaryProtocol、CompactProtocol 和 DenseProtocol，尽管 DenseProtocol 只支持 C ++ 实现，所以不算作跨语言【18】。除此之外，它还有两种不同的基于 JSON 的编码格式【19】。真逗！

与 [图 4-1](/v1/ddia_0401.png) 类似，每个字段都有一个类型注释（用于指示它是一个字符串、整数、列表等），还可以根据需要指定长度（字符串的长度，列表中的项目数） 。出现在数据中的字符串 `(“Martin”, “daydreaming”, “hacking”)` 也被编码为 ASCII（或者说，UTF-8），与之前类似。

与 [图 4-1](/v1/ddia_0401.png) 相比，最大的区别是没有字段名 `(userName, favoriteNumber, interests)`。相反，编码数据包含字段标签，它们是数字 `(1, 2 和 3)`。这些是模式定义中出现的数字。字段标记就像字段的别名 - 它们是说我们正在谈论的字段的一种紧凑的方式，而不必拼出字段名称。

Thrift CompactProtocol 编码在语义上等同于 BinaryProtocol，但是如 [图 4-3](/v1/ddia_0403.png) 所示，它将相同的信息打包成只有 34 个字节。它通过将字段类型和标签号打包到单个字节中，并使用可变长度整数来实现。数字 1337 不是使用全部八个字节，而是用两个字节编码，每个字节的最高位用来指示是否还有更多的字节。这意味着 -64 到 63 之间的数字被编码为一个字节，-8192 和 8191 之间的数字以两个字节编码，等等。较大的数字使用更多的字节。

![](/v1/ddia_0403.png)

**图 4-3 使用 Thrift 压缩协议编码的记录**

最后，Protocol Buffers（只有一种二进制编码格式）对相同的数据进行编码，如 [图 4-4](/v1/ddia_0404.png) 所示。它的打包方式稍有不同，但与 Thrift 的 CompactProtocol 非常相似。Protobuf 将同样的记录塞进了 33 个字节中。

![](/v1/ddia_0404.png)

**图 4-4 使用 Protobuf 编码的记录**

需要注意的一个细节：在前面所示的模式中，每个字段被标记为必需或可选，但是这对字段如何编码没有任何影响（二进制数据中没有任何字段指示某字段是否必须）。区别在于，如果字段设置为 `required`，但未设置该字段，则所需的运行时检查将失败，这对于捕获错误非常有用。

#### 字段标签和模式演变

我们之前说过，模式不可避免地需要随着时间而改变。我们称之为模式演变。Thrift 和 Protocol Buffers 如何处理模式更改，同时保持向后和向前兼容性？

从示例中可以看出，编码的记录就是其编码字段的拼接。每个字段由其标签号码（样本模式中的数字 1,2,3）标识，并用数据类型（例如字符串或整数）注释。如果没有设置字段值，则简单地从编码记录中省略。从中可以看到，字段标记对编码数据的含义至关重要。你可以更改模式中字段的名称，因为编码的数据永远不会引用字段名称，但不能更改字段的标记，因为这会使所有现有的编码数据无效。

你可以添加新的字段到模式，只要你给每个字段一个新的标签号码。如果旧的代码（不知道你添加的新的标签号码）试图读取新代码写入的数据，包括一个新的字段，其标签号码不能识别，它可以简单地忽略该字段。数据类型注释允许解析器确定需要跳过的字节数。这保持了向前兼容性：旧代码可以读取由新代码编写的记录。

向后兼容性呢？只要每个字段都有一个唯一的标签号码，新的代码总是可以读取旧的数据，因为标签号码仍然具有相同的含义。唯一的细节是，如果你添加一个新的字段，你不能设置为必需。如果你要添加一个字段并将其设置为必需，那么如果新代码读取旧代码写入的数据，则该检查将失败，因为旧代码不会写入你添加的新字段。因此，为了保持向后兼容性，在模式的初始部署之后 **添加的每个字段必须是可选的或具有默认值**。

删除一个字段就像添加一个字段，只是这回要考虑的是向前兼容性。这意味着你只能删除可选的字段（必需字段永远不能删除），而且你不能再次使用相同的标签号码（因为你可能仍然有数据写在包含旧标签号码的地方，而该字段必须被新代码忽略）。

#### 数据类型和模式演变

如何改变字段的数据类型？这也许是可能的 —— 详细信息请查阅相关的文档 —— 但是有一个风险，值将失去精度或被截断。例如，假设你将一个 32 位的整数变成一个 64 位的整数。新代码可以轻松读取旧代码写入的数据，因为解析器可以用零填充任何缺失的位。但是，如果旧代码读取由新代码写入的数据，则旧代码仍使用 32 位变量来保存该值。如果解码的 64 位值不适合 32 位，则它将被截断。

Protobuf 的一个奇怪的细节是，它没有列表或数组数据类型，而是有一个字段的重复标记（`repeated`，这是除必需和可选之外的第三个选项）。如 [图 4-4](/v1/ddia_0404.png) 所示，重复字段的编码正如它所说的那样：同一个字段标记只是简单地在记录中出现多次。这具有很好的效果，可以将可选（单值）字段更改为重复（多值）字段。读取旧数据的新代码会看到一个包含零个或一个元素的列表（取决于该字段是否存在）。读取新数据的旧代码只能看到列表的最后一个元素。

Thrift 有一个专用的列表数据类型，它使用列表元素的数据类型进行参数化。这不允许 Protocol Buffers 所做的从单值到多值的演变，但是它具有支持嵌套列表的优点。

### Avro

Apache Avro 【20】是另一种二进制编码格式，与 Protocol Buffers 和 Thrift 有着有趣的不同。它是作为 Hadoop 的一个子项目在 2009 年开始的，因为 Thrift 不适合 Hadoop 的用例【21】。

Avro 也使用模式来指定正在编码的数据的结构。它有两种模式语言：一种（Avro IDL）用于人工编辑，一种（基于 JSON）更易于机器读取。

我们用 Avro IDL 编写的示例模式可能如下所示：

```c
record Person {
    string                userName;
    union { null, long }  favoriteNumber = null;
    array<string>         interests;
}
```

等价的 JSON 表示：

```json
{
    "type": "record",
    "name": "Person",
    "fields": [
        {"name": "userName", "type": "string"},
        {"name": "favoriteNumber", "type": ["null", "long"], "default": null},
        {"name": "interests", "type": {"type": "array", "items": "string"}}
    ]
}
```

首先，请注意模式中没有标签号码。如果我们使用这个模式编码我们的例子记录（[例 4-1]()），Avro 二进制编码只有 32 个字节长，这是我们所见过的所有编码中最紧凑的。编码字节序列的分解如 [图 4-5](/v1/ddia_0405.png) 所示。

如果你检查字节序列，你可以看到没有什么可以识别字段或其数据类型。编码只是由连在一起的值组成。一个字符串只是一个长度前缀，后跟 UTF-8 字节，但是在被包含的数据中没有任何内容告诉你它是一个字符串。它完全可能是一个整数，也可能是别的什么东西。整数使用可变长度编码（与 Thrift 的 CompactProtocol 相同）进行编码。

![](/v1/ddia_0405.png)

**图 4-5 使用 Avro 编码的记录**

为了解析二进制数据，你按照它们出现在模式中的顺序遍历这些字段，并使用模式来告诉你每个字段的数据类型。这意味着只有当读取数据的代码使用与写入数据的代码完全相同的模式时，才能正确解码二进制数据。Reader 和 Writer 之间的模式不匹配意味着错误地解码数据。

那么，Avro 如何支持模式演变呢？

#### Writer模式与Reader模式

有了 Avro，当应用程序想要编码一些数据（将其写入文件或数据库、通过网络发送等）时，它使用它知道的任何版本的模式编码数据，例如，模式可能被编译到应用程序中。这被称为 Writer 模式。

当一个应用程序想要解码一些数据（从一个文件或数据库读取数据、从网络接收数据等）时，它希望数据在某个模式中，这就是 Reader 模式。这是应用程序代码所依赖的模式，在应用程序的构建过程中，代码可能已经从该模式生成。

Avro 的关键思想是 Writer 模式和 Reader 模式不必是相同的 - 他们只需要兼容。当数据解码（读取）时，Avro 库通过并排查看 Writer 模式和 Reader 模式并将数据从 Writer 模式转换到 Reader 模式来解决差异。Avro 规范【20】确切地定义了这种解析的工作原理，如 [图 4-6](/v1/ddia_0406.png) 所示。

例如，如果 Writer 模式和 Reader 模式的字段顺序不同，这是没有问题的，因为模式解析通过字段名匹配字段。如果读取数据的代码遇到出现在 Writer 模式中但不在 Reader 模式中的字段，则忽略它。如果读取数据的代码需要某个字段，但是 Writer 模式不包含该名称的字段，则使用在 Reader 模式中声明的默认值填充。

![](/v1/ddia_0406.png)

**图 4-6 一个 Avro Reader 解决读写模式的差异**

#### 模式演变规则

使用 Avro，向前兼容性意味着你可以将新版本的模式作为 Writer，并将旧版本的模式作为 Reader。相反，向后兼容意味着你可以有一个作为 Reader 的新版本模式和作为 Writer 的旧版本模式。

为了保持兼容性，你只能添加或删除具有默认值的字段（我们的 Avro 模式中的字段 `favoriteNumber` 的默认值为 `null`）。例如，假设你添加了一个有默认值的字段，这个新的字段将存在于新模式而不是旧模式中。当使用新模式的 Reader 读取使用旧模式写入的记录时，将为缺少的字段填充默认值。

如果你要添加一个没有默认值的字段，新的 Reader 将无法读取旧 Writer 写的数据，所以你会破坏向后兼容性。如果你要删除没有默认值的字段，旧的 Reader 将无法读取新 Writer 写入的数据，因此你会打破向前兼容性。在一些编程语言中，null 是任何变量可以接受的默认值，但在 Avro 中并不是这样：如果要允许一个字段为 `null`，则必须使用联合类型。例如，`union {null, long, string} field;` 表示 field 可以是数字或字符串，也可以是 `null`。如果要将 null 作为默认值，则它必须是 union 的分支之一 [^iv]。这样的写法比默认情况下就允许任何变量是 `null` 显得更加冗长，但是通过明确什么可以和什么不可以是 `null`，有助于防止出错【22】。

[^iv]: 确切地说，默认值必须是联合的第一个分支的类型，尽管这是 Avro 的特定限制，而不是联合类型的一般特征。

因此，Avro 没有像 Protocol Buffers 和 Thrift 那样的 `optional` 和 `required` 标记（但它有联合类型和默认值）。

只要 Avro 可以支持相应的类型转换，就可以改变字段的数据类型。更改字段的名称也是可能的，但有点棘手：Reader 模式可以包含字段名称的别名，所以它可以匹配旧 Writer 的模式字段名称与别名。这意味着更改字段名称是向后兼容的，但不能向前兼容。同样，向联合类型添加分支也是向后兼容的，但不能向前兼容。

#### 但Writer模式到底是什么？

到目前为止，我们一直跳过了一个重要的问题：对于一段特定的编码数据，Reader 如何知道其 Writer 模式？我们不能只将整个模式包括在每个记录中，因为模式可能比编码的数据大得多，从而使二进制编码节省的所有空间都是徒劳的。

答案取决于 Avro 使用的上下文。举几个例子：

有很多记录的大文件
: Avro 的一个常见用途 - 尤其是在 Hadoop 环境中 - 用于存储包含数百万条记录的大文件，所有记录都使用相同的模式进行编码（我们将在 [第十章](/v1/ch10) 讨论这种情况）。在这种情况下，该文件的作者可以在文件的开头只包含一次 Writer 模式。Avro 指定了一个文件格式（对象容器文件）来做到这一点。

支持独立写入的记录的数据库
: 在一个数据库中，不同的记录可能会在不同的时间点使用不同的 Writer 模式来写入 - 你不能假定所有的记录都有相同的模式。最简单的解决方案是在每个编码记录的开始处包含一个版本号，并在数据库中保留一个模式版本列表。Reader 可以获取记录，提取版本号，然后从数据库中获取该版本号的 Writer 模式。使用该 Writer 模式，它可以解码记录的其余部分（例如 Espresso 【23】就是这样工作的）。

通过网络连接发送记录
: 当两个进程通过双向网络连接进行通信时，他们可以在连接设置上协商模式版本，然后在连接的生命周期中使用该模式。Avro RPC 协议（请参阅 “[服务中的数据流：REST 与 RPC](#服务中的数据流：REST与RPC)”）就是这样工作的。

具有模式版本的数据库在任何情况下都是非常有用的，因为它充当文档并为你提供了检查模式兼容性的机会【24】。作为版本号，你可以使用一个简单的递增整数，或者你可以使用模式的散列。

#### 动态生成的模式

与 Protocol Buffers 和 Thrift 相比，Avro 方法的一个优点是模式不包含任何标签号码。但为什么这很重要？在模式中保留一些数字有什么问题？

不同之处在于 Avro 对动态生成的模式更友善。例如，假如你有一个关系数据库，你想要把它的内容转储到一个文件中，并且你想使用二进制格式来避免前面提到的文本格式（JSON，CSV，SQL）的问题。如果你使用 Avro，你可以很容易地从关系模式生成一个 Avro 模式（在我们之前看到的 JSON 表示中），并使用该模式对数据库内容进行编码，并将其全部转储到 Avro 对象容器文件【25】中。你为每个数据库表生成一个记录模式，每个列成为该记录中的一个字段。数据库中的列名称映射到 Avro 中的字段名称。

现在，如果数据库模式发生变化（例如，一个表中添加了一列，删除了一列），则可以从更新的数据库模式生成新的 Avro 模式，并在新的 Avro 模式中导出数据。数据导出过程不需要注意模式的改变 - 每次运行时都可以简单地进行模式转换。任何读取新数据文件的人都会看到记录的字段已经改变，但是由于字段是通过名字来标识的，所以更新的 Writer 模式仍然可以与旧的 Reader 模式匹配。

相比之下，如果你为此使用 Thrift 或 Protocol Buffers，则字段标签可能必须手动分配：每次数据库模式更改时，管理员都必须手动更新从数据库列名到字段标签的映射（这可能会自动化，但模式生成器必须非常小心，不要分配以前使用的字段标签）。这种动态生成的模式根本不是 Thrift 或 Protocol Buffers 的设计目标，而是 Avro 的。

#### 代码生成和动态类型的语言

Thrift 和 Protobuf 依赖于代码生成：在定义了模式之后，可以使用你选择的编程语言生成实现此模式的代码。这在 Java、C++ 或 C# 等静态类型语言中很有用，因为它允许将高效的内存中的数据结构用于解码的数据，并且在编写访问数据结构的程序时允许在 IDE 中进行类型检查和自动补全。

在动态类型编程语言（如 JavaScript、Ruby 或 Python）中，生成代码没有太多意义，因为没有编译时类型检查器来满足。代码生成在这些语言中往往不受欢迎，因为它们本来可以避免显式的编译步骤。而且，对于动态生成的模式（例如从数据库表生成的 Avro 模式），代码生成对获取数据是一个不必要的障碍。

Avro 为静态类型编程语言提供了可选的代码生成功能，但是它也可以在不生成任何代码的情况下使用。如果你有一个对象容器文件（它嵌入了 Writer 模式），你可以简单地使用 Avro 库打开它，并以与查看 JSON 文件相同的方式查看数据。该文件是自描述的，因为它包含所有必要的元数据。

这个属性特别适用于动态类型的数据处理语言如 Apache Pig 【26】。在 Pig 中，你可以打开一些 Avro 文件，开始分析它们，并编写派生数据集以 Avro 格式输出文件，而无需考虑模式。

### 模式的优点

正如我们所看到的，Protocol Buffers、Thrift 和 Avro 都使用模式来描述二进制编码格式。他们的模式语言比 XML 模式或者 JSON 模式简单得多，而后者支持更详细的验证规则（例如，“该字段的字符串值必须与该正则表达式匹配” 或 “该字段的整数值必须在 0 和 100 之间” ）。由于 Protocol Buffers，Thrift 和 Avro 实现起来更简单，使用起来也更简单，所以它们已经发展到支持相当广泛的编程语言。

这些编码所基于的想法绝不是新的。例如，它们与 ASN.1 有很多相似之处，它是 1984 年首次被标准化的模式定义语言【27】。它被用来定义各种网络协议，例如其二进制编码（DER）仍然被用于编码 SSL 证书（X.509）【28】。ASN.1 支持使用标签号码的模式演进，类似于 Protocol Buffers 和 Thrift 【29】。然而，它也非常复杂，而且没有好的配套文档，所以 ASN.1 可能不是新应用程序的好选择。

许多数据系统也为其数据实现了某种专有的二进制编码。例如，大多数关系数据库都有一个网络协议，你可以通过该协议向数据库发送查询并获取响应。这些协议通常特定于特定的数据库，并且数据库供应商提供将来自数据库的网络协议的响应解码为内存数据结构的驱动程序（例如使用 ODBC 或 JDBC API）。

所以，我们可以看到，尽管 JSON、XML 和 CSV 等文本数据格式非常普遍，但基于模式的二进制编码也是一个可行的选择。他们有一些很好的属性：

* 它们可以比各种 “二进制 JSON” 变体更紧凑，因为它们可以省略编码数据中的字段名称。
* 模式是一种有价值的文档形式，因为模式是解码所必需的，所以可以确定它是最新的（而手动维护的文档可能很容易偏离现实）。
* 维护一个模式的数据库允许你在部署任何内容之前检查模式更改的向前和向后兼容性。
* 对于静态类型编程语言的用户来说，从模式生成代码的能力是有用的，因为它可以在编译时进行类型检查。

总而言之，模式演化保持了与 JSON 数据库提供的无模式 / 读时模式相同的灵活性（请参阅 “[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)”），同时还可以更好地保证你的数据并提供更好的工具。


## 数据流的类型

在本章的开始部分，我们曾经说过，无论何时你想要将某些数据发送到不共享内存的另一个进程，例如，只要你想通过网络发送数据或将其写入文件，就需要将它编码为一个字节序列。然后我们讨论了做这个的各种不同的编码。

我们讨论了向前和向后的兼容性，这对于可演化性来说非常重要（通过允许你独立升级系统的不同部分，而不必一次改变所有内容，可以轻松地进行更改）。兼容性是编码数据的一个进程和解码它的另一个进程之间的一种关系。

这是一个相当抽象的概念 - 数据可以通过多种方式从一个进程流向另一个进程。谁编码数据，谁解码？在本章的其余部分中，我们将探讨数据如何在进程之间流动的一些最常见的方式：

* 通过数据库（请参阅 “[数据库中的数据流](#数据库中的数据流)”）
* 通过服务调用（请参阅 “[服务中的数据流：REST 与 RPC](#服务中的数据流：REST与RPC)”）
* 通过异步消息传递（请参阅 “[消息传递中的数据流](#消息传递中的数据流)”）


### 数据库中的数据流

在数据库中，写入数据库的进程对数据进行编码，从数据库读取的进程对数据进行解码。可能只有一个进程访问数据库，在这种情况下，读取者只是同一进程的后续版本 - 此时，你可以把在数据库中存储内容看作向未来的自己发送消息。

向后兼容性显然是必要的。否则你未来的自己将无法解码你以前写的东西。

一般来说，几个不同的进程同时访问数据库是很常见的。这些进程可能是几个不同的应用程序或服务，或者它们可能只是几个相同服务的实例（为了可伸缩性或容错性而并行运行）。无论哪种方式，在应用程序发生变化的环境中，访问数据库的某些进程可能会运行较新的代码，有些进程可能会运行较旧的代码，例如，因为新版本当前正在部署滚动升级，所以有些实例已经更新，而其他实例尚未更新。

这意味着数据库中的一个值可能会被更新版本的代码写入，然后被仍旧运行的旧版本的代码读取。因此，数据库也经常需要向前兼容。

但是，还有一个额外的障碍。假设你将一个字段添加到记录模式，并且较新的代码将该新字段的值写入数据库。随后，旧版本的代码（尚不知道新字段）将读取记录，更新记录并将其写回。在这种情况下，理想的行为通常是旧代码保持新的字段不变，即使它不能被解释。

前面讨论的编码格式支持未知字段的保存，但是有时候需要在应用程序层面保持谨慎，如图 4-7 所示。例如，如果将数据库值解码为应用程序中的模型对象，稍后重新编码这些模型对象，那么未知字段可能会在该翻译过程中丢失。解决这个问题不是一个难题，你只需要意识到它。

![](/v1/ddia_0407.png)

**图 4-7 当较旧版本的应用程序更新以前由较新版本的应用程序编写的数据时，如果不小心，数据可能会丢失。**

#### 在不同的时间写入不同的值

数据库通常允许任何时候更新任何值。这意味着在一个单一的数据库中，可能有一些值是五毫秒前写的，而一些值是五年前写的。

在部署应用程序的新版本时，也许用不了几分钟就可以将所有的旧版本替换为新版本（至少服务器端应用程序是这样的）。但数据库内容并非如此：对于五年前的数据来说，除非对其进行显式重写，否则它仍然会以原始编码形式存在。这种现象有时被概括为：数据的生命周期超出代码的生命周期。

将数据重写（迁移）到一个新的模式当然是可能的，但是在一个大数据集上执行是一个昂贵的事情，所以大多数数据库如果可能的话就避免它。大多数关系数据库都允许简单的模式更改，例如添加一个默认值为空的新列，而不重写现有数据 [^v]。读取旧行时，对于磁盘上的编码数据缺少的任何列，数据库将填充空值。LinkedIn 的文档数据库 Espresso 使用 Avro 存储，允许它使用 Avro 的模式演变规则【23】。

因此，模式演变允许整个数据库看起来好像是用单个模式编码的，即使底层存储可能包含用各种历史版本的模式编码的记录。

[^v]: 除了 MySQL，即使并非真的必要，它也经常会重写整个表，正如 “[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)” 中所提到的。


#### 归档存储

也许你不时为数据库创建一个快照，例如备份或加载到数据仓库（请参阅 “[数据仓库](/v1/ch3#数据仓库)”）。在这种情况下，即使源数据库中的原始编码包含来自不同时代的模式版本的混合，数据转储通常也将使用最新模式进行编码。既然你不管怎样都要拷贝数据，那么你可以对这个数据拷贝进行一致的编码。

由于数据转储是一次写入的，而且以后是不可变的，所以 Avro 对象容器文件等格式非常适合。这也是一个很好的机会，可以将数据编码为面向分析的列式格式，例如 Parquet（请参阅 “[列压缩](/v1/ch3#列压缩)”）。

在 [第十章](/v1/ch10) 中，我们将详细讨论使用档案存储中的数据。


### 服务中的数据流：REST与RPC

当你需要通过网络进行进程间的通讯时，安排该通信的方式有几种。最常见的安排是有两个角色：客户端和服务器。服务器通过网络公开 API，并且客户端可以连接到服务器以向该 API 发出请求。服务器公开的 API 被称为服务。

Web 以这种方式工作：客户（Web 浏览器）向 Web 服务器发出请求，通过 GET 请求下载 HTML、CSS、JavaScript、图像等，并通过 POST 请求提交数据到服务器。API 包含一组标准的协议和数据格式（HTTP、URL、SSL/TLS、HTML 等）。由于网络浏览器、网络服务器和网站作者大多同意这些标准，你可以使用任何网络浏览器访问任何网站（至少在理论上！）。

Web 浏览器不是唯一的客户端类型。例如，在移动设备或桌面计算机上运行的本地应用程序也可以向服务器发出网络请求，并且在 Web 浏览器内运行的客户端 JavaScript 应用程序可以使用 XMLHttpRequest 成为 HTTP 客户端（该技术被称为 Ajax 【30】）。在这种情况下，服务器的响应通常不是用于显示给人的 HTML，而是便于客户端应用程序代码进一步处理的编码数据（如 JSON）。尽管 HTTP 可能被用作传输协议，但顶层实现的 API 是特定于应用程序的，客户端和服务器需要就该 API 的细节达成一致。

此外，服务器本身可以是另一个服务的客户端（例如，典型的 Web 应用服务器充当数据库的客户端）。这种方法通常用于将大型应用程序按照功能区域分解为较小的服务，这样当一个服务需要来自另一个服务的某些功能或数据时，就会向另一个服务发出请求。这种构建应用程序的方式传统上被称为 **面向服务的体系结构（service-oriented architecture，SOA）**，最近被改进和更名为 **微服务架构**【31,32】。

在某些方面，服务类似于数据库：它们通常允许客户端提交和查询数据。但是，虽然数据库允许使用我们在 [第二章](/v1/ch2) 中讨论的查询语言进行任意查询，但是服务公开了一个特定于应用程序的 API，它只允许由服务的业务逻辑（应用程序代码）预定的输入和输出【33】。这种限制提供了一定程度的封装：服务能够对客户可以做什么和不可以做什么施加细粒度的限制。

面向服务 / 微服务架构的一个关键设计目标是通过使服务独立部署和演化来使应用程序更易于更改和维护。例如，每个服务应该由一个团队拥有，并且该团队应该能够经常发布新版本的服务，而不必与其他团队协调。换句话说，我们应该期望服务器和客户端的旧版本和新版本同时运行，因此服务器和客户端使用的数据编码必须在不同版本的服务 API 之间兼容 —— 这正是我们在本章所一直在谈论的。

#### Web服务

**当服务使用 HTTP 作为底层通信协议时，可称之为 Web 服务**。这个叫法可能不太准确，因为 Web 服务不仅在 Web 上使用，而且在几个不同的环境中使用。例如：

1. 运行在用户设备上的客户端应用程序（例如，移动设备上的本地应用程序，或使用 Ajax 的 JavaScript web 应用程序）通过 HTTP 向服务发出请求。这些请求通常通过公共互联网进行。
2. 一种服务向同一组织拥有的另一项服务提出请求，这些服务通常位于同一数据中心内，作为面向服务 / 微服务架构的一部分。（支持这种用例的软件有时被称为 **中间件（middleware）** ）
3. 一种服务通过互联网向不同组织所拥有的服务提出请求。这用于不同组织后端系统之间的数据交换。此类别包括由在线服务（如信用卡处理系统）提供的公共 API，或用于共享访问用户数据的 OAuth。

有两种流行的 Web 服务方法：REST 和 SOAP。他们在哲学方面几乎是截然相反的，往往也是各自支持者之间的激烈辩论的主题 [^vi]。

[^vi]: 即使在每个阵营内也有很多争论。例如，**HATEOAS（超媒体作为应用程序状态的引擎）** 就经常引发讨论【35】。

REST 不是一个协议，而是一个基于 HTTP 原则的设计哲学【34,35】。它强调简单的数据格式，使用 URL 来标识资源，并使用 HTTP 功能进行缓存控制，身份验证和内容类型协商。与 SOAP 相比，REST 已经越来越受欢迎，至少在跨组织服务集成的背景下【36】，并经常与微服务相关【31】。根据 REST 原则设计的 API 称为 RESTful。

相比之下，SOAP 是用于制作网络 API 请求的基于 XML 的协议 [^vii]。虽然它最常用于 HTTP，但其目的是独立于 HTTP，并避免使用大多数 HTTP 功能。相反，它带有庞大而复杂的多种相关标准（Web 服务框架，称为 `WS-*`），它们增加了各种功能【37】。

[^vii]: 尽管首字母缩写词相似，SOAP 并不是 SOA 的要求。SOAP 是一种特殊的技术，而 SOA 是构建系统的一般方法。

SOAP Web 服务的 API 使用称为 Web 服务描述语言（WSDL）的基于 XML 的语言来描述。WSDL 支持代码生成，客户端可以使用本地类和方法调用（编码为 XML 消息并由框架再次解码）访问远程服务。这在静态类型编程语言中非常有用，但在动态类型编程语言中很少（请参阅 “[代码生成和动态类型的语言](#代码生成和动态类型的语言)”）。

由于 WSDL 的设计不是人类可读的，而且由于 SOAP 消息通常因为过于复杂而无法手动构建，所以 SOAP 的用户在很大程度上依赖于工具支持，代码生成和 IDE【38】。对于 SOAP 供应商不支持的编程语言的用户来说，与 SOAP 服务的集成是困难的。

尽管 SOAP 及其各种扩展表面上是标准化的，但是不同厂商的实现之间的互操作性往往会造成问题【39】。由于所有这些原因，尽管许多大型企业仍然使用 SOAP，但在大多数小公司中已经不再受到青睐。

REST 风格的 API 倾向于更简单的方法，通常涉及较少的代码生成和自动化工具。定义格式（如 OpenAPI，也称为 Swagger 【40】）可用于描述 RESTful API 并生成文档。

#### 远程过程调用（RPC）的问题

Web 服务仅仅是通过网络进行 API 请求的一系列技术的最新版本，其中许多技术受到了大量的炒作，但是存在严重的问题。Enterprise JavaBeans（EJB）和 Java 的 **远程方法调用（RMI）** 仅限于 Java。**分布式组件对象模型（DCOM）** 仅限于 Microsoft 平台。**公共对象请求代理体系结构（CORBA）** 过于复杂，不提供向后或向前兼容性【41】。

所有这些都是基于 **远程过程调用（RPC）** 的思想，该思想自 20 世纪 70 年代以来一直存在【42】。RPC 模型试图使向远程网络服务发出的请求看起来与在同一进程中调用编程语言中的函数或方法相同（这种抽象称为位置透明）。尽管 RPC 起初看起来很方便，但这种方法根本上是有缺陷的【43,44】。网络请求与本地函数调用非常不同：

* 本地函数调用是可预测的，并且成功或失败仅取决于受你控制的参数。网络请求是不可预测的：请求或响应可能由于网络问题而丢失，或者远程计算机可能很慢或不可用，这些问题完全不在你的控制范围之内。网络问题很常见，因此必须有所准备，例如重试失败的请求。
* 本地函数调用要么返回结果，要么抛出异常，或者永远不返回（因为进入无限循环或进程崩溃）。网络请求有另一个可能的结果：由于超时，它返回时可能没有结果。在这种情况下，你根本不知道发生了什么：如果你没有得到来自远程服务的响应，你无法知道请求是否通过（我们将在 [第八章](/v1/ch8) 更详细地讨论这个问题）。
* 如果你重试失败的网络请求，可能会发生请求实际上已经完成，只是响应丢失的情况。在这种情况下，重试将导致该操作被执行多次，除非你在协议中建立数据去重机制（**幂等性**，即 idempotence）。本地函数调用时没有这样的问题。（在 [第十一章](/v1/ch11) 更详细地讨论幂等性）
* 每次调用本地函数时，通常需要大致相同的时间来执行。网络请求比函数调用要慢得多，而且其延迟也是非常可变的：好的时候它可能会在不到一毫秒的时间内完成，但是当网络拥塞或者远程服务超载时，可能需要几秒钟的时间才能完成相同的操作。
* 调用本地函数时，可以高效地将引用（指针）传递给本地内存中的对象。当你发出一个网络请求时，所有这些参数都需要被编码成可以通过网络发送的一系列字节。如果参数是像数字或字符串这样的基本类型倒是没关系，但是对于较大的对象很快就会出现问题。
* 客户端和服务可以用不同的编程语言实现，所以 RPC 框架必须将数据类型从一种语言翻译成另一种语言。这可能会变得很丑陋，因为不是所有的语言都具有相同的类型 —— 例如回想一下 JavaScript 的数字大于 $2^{53}$ 的问题（请参阅 “[JSON、XML 和二进制变体](#JSON、XML和二进制变体)”）。用单一语言编写的单个进程中不存在此问题。

所有这些因素意味着尝试使远程服务看起来像编程语言中的本地对象一样毫无意义，因为这是一个根本不同的事情。REST 的部分吸引力在于，它并不试图隐藏它是一个网络协议的事实（尽管这似乎并没有阻止人们在 REST 之上构建 RPC 库）。

#### RPC的当前方向

尽管有这样那样的问题，RPC 不会消失。在本章提到的所有编码的基础上构建了各种 RPC 框架：例如，Thrift 和 Avro 带有 RPC 支持，gRPC 是使用 Protocol Buffers 的 RPC 实现，Finagle 也使用 Thrift，Rest.li 使用 JSON over HTTP。

这种新一代的 RPC 框架更加明确的是，远程请求与本地函数调用不同。例如，Finagle 和 Rest.li 使用 futures（promises）来封装可能失败的异步操作。`Futures` 还可以简化需要并行发出多项服务并将其结果合并的情况【45】。gRPC 支持流，其中一个调用不仅包括一个请求和一个响应，还可以是随时间的一系列请求和响应【46】。

其中一些框架还提供服务发现，即允许客户端找出在哪个 IP 地址和端口号上可以找到特定的服务。我们将在 “[请求路由](/v1/ch6#请求路由)” 中回到这个主题。

使用二进制编码格式的自定义 RPC 协议可以实现比通用的 JSON over REST 更好的性能。但是，RESTful API 还有其他一些显著的优点：方便实验和调试（只需使用 Web 浏览器或命令行工具 curl，无需任何代码生成或软件安装即可向其请求），能被所有主流的编程语言和平台所支持，还有大量可用的工具（服务器、缓存、负载平衡器、代理、防火墙、监控、调试工具、测试工具等）的生态系统。

由于这些原因，REST 似乎是公共 API 的主要风格。RPC 框架的主要重点在于同一组织拥有的服务之间的请求，通常在同一数据中心内。

#### 数据编码与RPC的演化

对于可演化性，重要的是可以独立更改和部署 RPC 客户端和服务器。与通过数据库流动的数据相比（如上一节所述），我们可以在通过服务进行数据流的情况下做一个简化的假设：假定所有的服务器都会先更新，其次是所有的客户端。因此，你只需要在请求上具有向后兼容性，并且对响应具有向前兼容性。

RPC 方案的向后和向前兼容性属性是从它使用的编码方式中继承而来：

* Thrift、gRPC（Protobuf）和 Avro RPC 可以根据相应编码格式的兼容性规则进行演变。
* 在 SOAP 中，请求和响应是使用 XML 模式指定的。这些可以演变，但有一些微妙的陷阱【47】。
* RESTful API 通常使用 JSON（没有正式指定的模式）用于响应，以及用于请求的 JSON 或 URI 编码 / 表单编码的请求参数。添加可选的请求参数并向响应对象添加新的字段通常被认为是保持兼容性的改变。

由于 RPC 经常被用于跨越组织边界的通信，服务的提供者往往无法控制其客户，也不能强迫他们升级，所以服务的兼容性变得更加困难。因此，需要长期保持兼容性，也许是无限期的。如果需要进行破坏兼容性的更改，则服务提供商通常会并排维护多个版本的服务 API。

关于 API 版本化应该如何工作（即，客户端如何指示它想要使用哪个版本的 API）没有一致意见【48】。对于 RESTful API，常用的方法是在 URL 或 HTTP Accept 头中使用版本号。对于使用 API 密钥来标识特定客户端的服务，另一种选择是将客户端请求的 API 版本存储在服务器上，并允许通过单独的管理界面更新该版本选项【49】。

### 消息传递中的数据流

我们一直在研究从一个过程到另一个过程的编码数据流的不同方式。到目前为止，我们已经讨论了 REST 和 RPC（其中一个进程通过网络向另一个进程发送请求并期望尽可能快的响应）以及数据库（一个进程写入编码数据，另一个进程在将来再次读取）。

在最后一节中，我们将简要介绍一下 RPC 和数据库之间的异步消息传递系统。它们与 RPC 类似，因为客户端的请求（通常称为消息）以低延迟传送到另一个进程。它们与数据库类似，不是通过直接的网络连接发送消息，而是通过称为消息代理（也称为消息队列或面向消息的中间件）的中介来临时存储消息。

与直接 RPC 相比，使用消息代理有几个优点：

* 如果收件人不可用或过载，可以充当缓冲区，从而提高系统的可靠性。
* 它可以自动将消息重新发送到已经崩溃的进程，从而防止消息丢失。
* 避免发件人需要知道收件人的 IP 地址和端口号（这在虚拟机经常出入的云部署中特别有用）。
* 它允许将一条消息发送给多个收件人。
* 将发件人与收件人逻辑分离（发件人只是发布消息，不关心谁来消费）。

然而，与 RPC 相比，差异在于消息传递通信通常是单向的：发送者通常不期望收到其消息的回复。一个进程可能发送一个响应，但这通常是在一个单独的通道上完成的。这种通信模式是异步的：发送者不会等待消息被传递，而只是发送它，然后忘记它。

#### 消息代理

过去，**消息代理（Message Broker）** 主要是 TIBCO、IBM WebSphere 和 webMethods 等公司的商业软件的秀场。最近像 RabbitMQ、ActiveMQ、HornetQ、NATS 和 Apache Kafka 这样的开源实现已经流行起来。我们将在 [第十一章](/v1/ch11) 中对它们进行更详细的比较。

详细的交付语义因实现和配置而异，但通常情况下，消息代理的使用方式如下：一个进程将消息发送到指定的队列或主题，代理确保将消息传递给那个队列或主题的一个或多个消费者或订阅者。在同一主题上可以有许多生产者和许多消费者。

一个主题只提供单向数据流。但是，消费者本身可能会将消息发布到另一个主题上（因此，可以将它们链接在一起，就像我们将在 [第十一章](/v1/ch11) 中看到的那样），或者发送给原始消息的发送者使用的回复队列（允许请求 / 响应数据流，类似于 RPC）。

消息代理通常不会执行任何特定的数据模型 —— 消息只是包含一些元数据的字节序列，因此你可以使用任何编码格式。如果编码是向后和向前兼容的，你可以灵活地对发布者和消费者的编码进行独立的修改，并以任意顺序进行部署。

如果消费者重新发布消息到另一个主题，则可能需要小心保留未知字段，以防止前面在数据库环境中描述的问题（[图 4-7](/v1/ddia_0407.png)）。

#### 分布式的Actor框架

Actor 模型是单个进程中并发的编程模型。逻辑被封装在 actor 中，而不是直接处理线程（以及竞争条件、锁定和死锁的相关问题）。每个 actor 通常代表一个客户或实体，它可能有一些本地状态（不与其他任何角色共享），它通过发送和接收异步消息与其他角色通信。不保证消息传送：在某些错误情况下，消息将丢失。由于每个角色一次只能处理一条消息，因此不需要担心线程，每个角色可以由框架独立调度。

在分布式 Actor 框架中，此编程模型用于跨多个节点伸缩应用程序。不管发送方和接收方是在同一个节点上还是在不同的节点上，都使用相同的消息传递机制。如果它们在不同的节点上，则该消息被透明地编码成字节序列，通过网络发送，并在另一侧解码。

位置透明在 actor 模型中比在 RPC 中效果更好，因为 actor 模型已经假定消息可能会丢失，即使在单个进程中也是如此。尽管网络上的延迟可能比同一个进程中的延迟更高，但是在使用 actor 模型时，本地和远程通信之间的基本不匹配是较少的。

分布式的 Actor 框架实质上是将消息代理和 actor 编程模型集成到一个框架中。但是，如果要执行基于 actor 的应用程序的滚动升级，则仍然需要担心向前和向后兼容性问题，因为消息可能会从运行新版本的节点发送到运行旧版本的节点，反之亦然。

三个流行的分布式 actor 框架处理消息编码如下：

* 默认情况下，Akka 使用 Java 的内置序列化，不提供向前或向后兼容性。但是，你可以用类似 Protocol Buffers 的东西替代它，从而获得滚动升级的能力【50】。
* Orleans 默认使用不支持滚动升级部署的自定义数据编码格式；要部署新版本的应用程序，你需要设置一个新的集群，将流量从旧集群迁移到新集群，然后关闭旧集群【51,52】。像 Akka 一样，可以使用自定义序列化插件。
* 在 Erlang OTP 中，对记录模式进行更改是非常困难的（尽管系统具有许多为高可用性设计的功能）。滚动升级是可能的，但需要仔细计划【53】。一个新的实验性的 `maps` 数据类型（2014 年在 Erlang R17 中引入的类似于 JSON 的结构）可能会使这件事在未来变得更容易【54】。


## 本章小结

在本章中，我们研究了将数据结构转换为网络中的字节或磁盘上的字节的几种方法。我们看到了这些编码的细节不仅影响其效率，更重要的是也影响了应用程序的体系结构和部署它们的选项。

特别是，许多服务需要支持滚动升级，其中新版本的服务逐步部署到少数节点，而不是同时部署到所有节点。滚动升级允许在不停机的情况下发布新版本的服务（从而鼓励频繁发布小型版本，而不是偶尔发布大型版本），并使部署风险降低（允许在影响大量用户之前检测并回滚有故障的版本）。这些属性对于可演化性，以及对应用程序进行更改的容易性都是非常有利的。

在滚动升级期间，或出于各种其他原因，我们必须假设不同的节点正在运行我们的应用程序代码的不同版本。因此，在系统周围流动的所有数据都是以提供向后兼容性（新代码可以读取旧数据）和向前兼容性（旧代码可以读取新数据）的方式进行编码是重要的。

我们讨论了几种数据编码格式及其兼容性属性：

* 编程语言特定的编码仅限于单一编程语言，并且往往无法提供向前和向后兼容性。
* JSON、XML 和 CSV 等文本格式非常普遍，其兼容性取决于你如何使用它们。他们有可选的模式语言，这有时是有用的，有时是一个障碍。这些格式对于数据类型有些模糊，所以你必须小心数字和二进制字符串。
* 像 Thrift、Protocol Buffers 和 Avro 这样的二进制模式驱动格式允许使用清晰定义的向前和向后兼容性语义进行紧凑、高效的编码。这些模式可以用于静态类型语言的文档和代码生成。但是，他们有一个缺点，就是在数据可读之前需要对数据进行解码。

我们还讨论了数据流的几种模式，说明了数据编码重要性的不同场景：

* 数据库，写入数据库的进程对数据进行编码，并从数据库读取进程对其进行解码
* RPC 和 REST API，客户端对请求进行编码，服务器对请求进行解码并对响应进行编码，客户端最终对响应进行解码
* 异步消息传递（使用消息代理或 Actor），其中节点之间通过发送消息进行通信，消息由发送者编码并由接收者解码

我们可以得出这样的结论：只要稍加小心，向后/向前兼容性和滚动升级是完全可以实现的。愿你的应用程序的演变迅速、敏捷部署。


## 参考文献

1. “[Java Object Serialization Specification](http://docs.oracle.com/javase/7/docs/platform/serialization/spec/serialTOC.html),” *docs.oracle.com*, 2010.
1. “[Ruby 2.2.0 API Documentation](http://ruby-doc.org/core-2.2.0/),” *ruby-doc.org*, Dec 2014.
1. “[The Python 3.4.3 Standard Library Reference Manual](https://docs.python.org/3/library/pickle.html),” *docs.python.org*, February 2015.
1. “[EsotericSoftware/kryo](https://github.com/EsotericSoftware/kryo),” *github.com*, October 2014.
1. “[CWE-502: Deserialization of Untrusted Data](http://cwe.mitre.org/data/definitions/502.html),” Common Weakness Enumeration, *cwe.mitre.org*, July 30, 2014.
1. Steve Breen: “[What Do WebLogic, WebSphere, JBoss, Jenkins, OpenNMS, and Your Application Have in Common? This Vulnerability](http://foxglovesecurity.com/2015/11/06/what-do-weblogic-websphere-jboss-jenkins-opennms-and-your-application-have-in-common-this-vulnerability/),” *foxglovesecurity.com*, November 6, 2015.
1. Patrick McKenzie: “[What the Rails Security Issue Means for Your Startup](http://www.kalzumeus.com/2013/01/31/what-the-rails-security-issue-means-for-your-startup/),” *kalzumeus.com*, January 31, 2013.
1. Eishay Smith: “[jvm-serializers wiki](https://github.com/eishay/jvm-serializers/wiki),” *github.com*, November 2014.
1. “[XML Is a Poor Copy of S-Expressions](http://c2.com/cgi/wiki?XmlIsaPoorCopyOfEssExpressions),” *c2.com* wiki.
1. Matt Harris: “[Snowflake: An Update and Some Very Important Information](https://groups.google.com/forum/#!topic/twitter-development-talk/ahbvo3VTIYI),” email to *Twitter Development Talk* mailing list, October 19, 2010.
1. Shudi (Sandy) Gao, C. M. Sperberg-McQueen, and Henry S. Thompson: “[XML Schema 1.1](http://www.w3.org/XML/Schema),” W3C Recommendation, May 2001.
1. Francis Galiegue, Kris Zyp, and Gary Court: “[JSON Schema](http://json-schema.org/),” IETF Internet-Draft, February 2013.
1. Yakov Shafranovich: “[RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://tools.ietf.org/html/rfc4180),” October 2005.
1. “[MessagePack Specification](http://msgpack.org/),” *msgpack.org*.
1. Mark Slee, Aditya Agarwal, and Marc Kwiatkowski: “[Thrift: Scalable Cross-Language Services Implementation](http://thrift.apache.org/static/files/thrift-20070401.pdf),” Facebook technical report, April 2007.
1. “[Protocol Buffers Developer Guide](https://developers.google.com/protocol-buffers/docs/overview),” Google, Inc., *developers.google.com*.
1. Igor Anishchenko: “[Thrift vs Protocol Buffers vs Avro - Biased Comparison](http://www.slideshare.net/IgorAnishchenko/pb-vs-thrift-vs-avro),” *slideshare.net*, September 17, 2012.
1. “[A Matrix of the Features Each Individual Language Library Supports](http://wiki.apache.org/thrift/LibraryFeatures),” *wiki.apache.org*.
1. Martin Kleppmann: “[Schema Evolution in Avro, Protocol Buffers and Thrift](http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html),” *martin.kleppmann.com*, December 5, 2012.
1. “[Apache Avro 1.7.7 Documentation](http://avro.apache.org/docs/1.7.7/),” *avro.apache.org*, July 2014.
1. Doug Cutting, Chad Walters, Jim Kellerman, et al.: “[&#91;PROPOSAL&#93; New Subproject: Avro](http://mail-archives.apache.org/mod_mbox/hadoop-general/200904.mbox/%3C49D53694.1050906@apache.org%3E),” email thread on *hadoop-general* mailing list, *mail-archives.apache.org*, April 2009.
1. Tony Hoare: “[Null References: The Billion Dollar Mistake](http://www.infoq.com/presentations/Null-References-The-Billion-Dollar-Mistake-Tony-Hoare),” at *QCon London*, March 2009.
1. Aditya Auradkar and Tom Quiggle: “[Introducing Espresso—LinkedIn's Hot New Distributed Document Store](https://engineering.linkedin.com/espresso/introducing-espresso-linkedins-hot-new-distributed-document-store),” *engineering.linkedin.com*, January 21, 2015.
1. Jay Kreps: “[Putting Apache Kafka to Use: A Practical Guide to Building a Stream Data Platform (Part 2)](http://blog.confluent.io/2015/02/25/stream-data-platform-2/),” *blog.confluent.io*, February 25, 2015.
1. Gwen Shapira: “[The Problem of Managing Schemas](http://radar.oreilly.com/2014/11/the-problem-of-managing-schemas.html),” *radar.oreilly.com*, November 4, 2014.
1. “[Apache Pig 0.14.0 Documentation](http://pig.apache.org/docs/r0.14.0/),” *pig.apache.org*, November 2014.
1. John Larmouth: [*ASN.1 Complete*](http://www.oss.com/asn1/resources/books-whitepapers-pubs/larmouth-asn1-book.pdf). Morgan Kaufmann, 1999. ISBN: 978-0-122-33435-1
1. Russell Housley, Warwick Ford, Tim Polk, and David Solo: “[RFC 2459: Internet X.509 Public Key Infrastructure: Certificate and CRL Profile](https://www.ietf.org/rfc/rfc2459.txt),” IETF Network Working Group, Standards Track, January 1999.
1. Lev Walkin: “[Question: Extensibility and Dropping Fields](http://lionet.info/asn1c/blog/2010/09/21/question-extensibility-removing-fields/),” *lionet.info*, September 21, 2010.
1. Jesse James Garrett: “[Ajax: A New Approach to Web Applications](https://web.archive.org/web/20181231094556/https://www.adaptivepath.com/ideas/ajax-new-approach-web-applications/),” *adaptivepath.com*, February 18, 2005.
1. Sam Newman: *Building Microservices*. O'Reilly Media, 2015. ISBN: 978-1-491-95035-7
1. Chris Richardson: “[Microservices: Decomposing Applications for Deployability and Scalability](http://www.infoq.com/articles/microservices-intro),” *infoq.com*, May 25, 2014.
1. Pat Helland: “[Data on the Outside Versus Data on the Inside](http://cidrdb.org/cidr2005/papers/P12.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Roy Thomas Fielding: “[Architectural Styles and the Design of Network-Based Software Architectures](https://www.ics.uci.edu/~fielding/pubs/dissertation/fielding_dissertation.pdf),” PhD Thesis, University of California, Irvine, 2000.
1. Roy Thomas Fielding: “[REST APIs Must Be Hypertext-Driven](http://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven),” *roy.gbiv.com*, October 20, 2008.
1. “[REST in Peace, SOAP](https://royal.pingdom.com/rest-in-peace-soap/),” *royal.pingdom.com*, October 15, 2010.
1. “[Web Services Standards as of Q1 2007](https://www.innoq.com/resources/ws-standards-poster/),” *innoq.com*, February 2007.
1. Pete Lacey: “[The S Stands for Simple](http://harmful.cat-v.org/software/xml/soap/simple),” *harmful.cat-v.org*, November 15, 2006.
1. Stefan Tilkov: “[Interview: Pete Lacey Criticizes Web Services](http://www.infoq.com/articles/pete-lacey-ws-criticism),” *infoq.com*, December 12, 2006.
1. “[OpenAPI Specification (fka Swagger RESTful API Documentation Specification) Version 2.0](http://swagger.io/specification/),” *swagger.io*, September 8, 2014.
1. Michi Henning: “[The Rise and Fall of CORBA](https://cacm.acm.org/magazines/2008/8/5336-the-rise-and-fall-of-corba/fulltext),” *Communications of the ACM*, volume 51, number 8, pages 52–57, August 2008. [doi:10.1145/1378704.1378718](http://dx.doi.org/10.1145/1378704.1378718)
1. Andrew D. Birrell and Bruce Jay Nelson: “[Implementing Remote Procedure Calls](http://www.cs.princeton.edu/courses/archive/fall03/cs518/papers/rpc.pdf),” *ACM Transactions on Computer Systems* (TOCS), volume 2, number 1, pages 39–59, February 1984. [doi:10.1145/2080.357392](http://dx.doi.org/10.1145/2080.357392)
1. Jim Waldo, Geoff Wyant, Ann Wollrath, and Sam Kendall: “[A Note on Distributed Computing](http://m.mirror.facebook.net/kde/devel/smli_tr-94-29.pdf),” Sun Microsystems Laboratories, Inc., Technical Report TR-94-29, November 1994.
1. Steve Vinoski: “[Convenience over Correctness](http://steve.vinoski.net/pdf/IEEE-Convenience_Over_Correctness.pdf),” *IEEE Internet Computing*, volume 12, number 4, pages 89–92, July 2008. [doi:10.1109/MIC.2008.75](http://dx.doi.org/10.1109/MIC.2008.75)
1. Marius Eriksen: “[Your Server as a Function](http://monkey.org/~marius/funsrv.pdf),” at *7th Workshop on Programming Languages and Operating Systems* (PLOS), November 2013. [doi:10.1145/2525528.2525538](http://dx.doi.org/10.1145/2525528.2525538)
1. “[gRPC concepts](https://grpc.io/docs/guides/concepts/),” The Linux Foundation, *grpc.io*.
1. Aditya Narayan and Irina Singh: “[Designing and Versioning Compatible Web Services](https://web.archive.org/web/20141016000136/http://www.ibm.com/developerworks/websphere/library/techarticles/0705_narayan/0705_narayan.html),” *ibm.com*, March 28, 2007.
1. Troy Hunt: “[Your API Versioning Is Wrong, Which Is Why I Decided to Do It 3 Different Wrong Ways](http://www.troyhunt.com/2014/02/your-api-versioning-is-wrong-which-is.html),” *troyhunt.com*, February 10, 2014.
1. “[API Upgrades](https://stripe.com/docs/upgrades),” Stripe, Inc., April 2015.
1. Jonas Bonér: “[Upgrade in an Akka Cluster](http://grokbase.com/t/gg/akka-user/138wd8j9e3/upgrade-in-an-akka-cluster),” email to *akka-user* mailing list, *grokbase.com*, August 28, 2013.
1. Philip A. Bernstein, Sergey Bykov, Alan Geller, et al.: “[Orleans: Distributed Virtual Actors for Programmability and Scalability](https://www.microsoft.com/en-us/research/publication/orleans-distributed-virtual-actors-for-programmability-and-scalability/),” Microsoft Research Technical Report MSR-TR-2014-41, March 2014.
1. “[Microsoft Project Orleans Documentation](http://dotnet.github.io/orleans/),” Microsoft Research, *dotnet.github.io*, 2015.
1. David Mercer, Sean Hinde, Yinso Chen, and Richard A O'Keefe: “[beginner: Updating Data Structures](http://erlang.org/pipermail/erlang-questions/2007-October/030318.html),” email thread on *erlang-questions* mailing list, *erlang.com*, October 29, 2007.
1. Fred Hebert: “[Postscript: Maps](http://learnyousomeerlang.com/maps),” *learnyousomeerlang.com*, April 9, 2014.


================================================
FILE: content/v1/ch5.md
================================================
---
title: "第五章：复制"
linkTitle: "5. 复制"
weight: 205
breadcrumbs: false
math: true
---

![](/map/ch05.png)

> 与可能出错的东西比，“不可能”出错的东西最显著的特点就是：一旦真的出错，通常就彻底玩完了。
>
> —— 道格拉斯・亚当斯（1992）


复制意味着在通过网络连接的多台机器上保留相同数据的副本。正如在 [第二部分](/v1/part-ii) 的介绍中所讨论的那样，我们希望能复制数据，可能出于各种各样的原因：

* 使得数据与用户在地理上接近（从而减少延迟）
* 即使系统的一部分出现故障，系统也能继续工作（从而提高可用性）
* 伸缩可以接受读请求的机器数量（从而提高读取吞吐量）

本章将假设你的数据集非常小，每台机器都可以保存整个数据集的副本。在 [第六章](/v1/ch6) 中将放宽这个假设，讨论对单个机器来说太大的数据集的分割（分片）。在后面的章节中，我们将讨论复制数据系统中可能发生的各种故障，以及如何处理这些故障。

如果复制中的数据不会随时间而改变，那复制就很简单：将数据复制到每个节点一次就万事大吉。复制的困难之处在于处理复制数据的 **变更（change）**，这就是本章所要讲的。我们将讨论三种流行的变更复制算法：**单领导者（single leader，单主）**，**多领导者（multi leader，多主）** 和 **无领导者（leaderless，无主）**。几乎所有分布式数据库都使用这三种方法之一。

在复制时需要进行许多权衡：例如，使用同步复制还是异步复制？如何处理失败的副本？这些通常是数据库中的配置选项，细节因数据库而异，但原理在许多不同的实现中都类似。本章会讨论这些决策的后果。

数据库的复制算得上是老生常谈了 ——70 年代研究得出的基本原则至今没有太大变化【1】，因为网络的基本约束仍保持不变。然而在研究之外，许多开发人员仍然假设一个数据库只有一个节点。分布式数据库变为主流只是最近发生的事。许多程序员都是这一领域的新手，因此对于诸如 **最终一致性（eventual consistency）** 等问题存在许多误解。在 “[复制延迟问题](#复制延迟问题)” 一节，我们将更加精确地了解最终一致性，并讨论诸如 **读己之写（read-your-writes）** 和 **单调读（monotonic read）** 等内容。

## 领导者与追随者

存储了数据库拷贝的每个节点被称为 **副本（replica）** 。当存在多个副本时，会不可避免地出现一个问题：如何确保所有数据都落在了所有的副本上？

每一次向数据库的写入操作都需要传播到所有副本上，否则副本就会包含不一样的数据。最常见的解决方案被称为 **基于领导者的复制（leader-based replication）** （也称 **主动/被动（active/passive）** 复制或 **主/从（master/slave）** 复制），如 [图 5-1](/v1/ddia_0501.png) 所示。它的工作原理如下：

1. 其中一个副本被指定为 **领导者（leader）**，也称为 **主库（master|primary）** 。当客户端要向数据库写入时，它必须将请求发送给该 **领导者**，其会将新数据写入其本地存储。
2. 其他副本被称为 **追随者（followers）**，亦称为 **只读副本（read replicas）**、**从库（slaves）**、**备库（secondaries）** 或 **热备（hot-standby）**[^i]。每当领导者将新数据写入本地存储时，它也会将数据变更发送给所有的追随者，称之为 **复制日志（replication log）** 或 **变更流（change stream）**。每个追随者从领导者拉取日志，并相应更新其本地数据库副本，方法是按照与领导者相同的处理顺序来进行所有写入。
3. 当客户想要从数据库中读取数据时，它可以向领导者或任一追随者进行查询。但只有领导者才能接受写入操作（从客户端的角度来看从库都是只读的）。

[^i]: 不同的人对 **热（hot）**、**温（warm）** 和 **冷（cold）** 备份服务器有不同的定义。例如在 PostgreSQL 中，**热备（hot standby）** 指的是能接受客户端读请求的副本。而 **温备（warm standby）** 只是追随领导者，但不处理客户端的任何查询。就本书而言，这些差异并不重要。

![](/v1/ddia_0501.png)

**图 5-1 基于领导者的（主/从）复制**

这种复制模式是许多关系数据库的内置功能，如 PostgreSQL（从 9.0 版本开始）、MySQL、Oracle Data Guard【2】和 SQL Server 的 AlwaysOn 可用性组【3】。它也被用于一些非关系数据库，包括 MongoDB、RethinkDB 和 Espresso【4】。最后，基于领导者的复制并不仅限于数据库：像 Kafka【5】和 RabbitMQ 高可用队列【6】这样的分布式消息代理也使用它。某些网络文件系统，以及 DRBD 这样的复制块设备也与之类似。

### 同步复制与异步复制

复制系统的一个重要细节是：复制是 **同步（synchronously）** 发生的还是 **异步（asynchronously）** 发生的。（在关系型数据库中这通常是一个配置项，其他系统则通常硬编码为其中一个）。

想象一下 [图 5-1](/v1/ddia_0501.png) 中发生的场景，即网站的用户更新他们的个人头像。在某个时间点，客户向主库发送更新请求；不久之后主库就收到了请求。在某个时间点，主库又会将数据变更转发给自己的从库。最终，主库通知客户更新成功。

[图 5-2](/v1/ddia_0502.png) 显示了系统各个组件之间的通信：用户客户端、主库和两个从库。时间从左向右流动。请求或响应消息用粗箭头表示。

![](/v1/ddia_0502.png)

**图 5-2 基于领导者的复制：一个同步从库和一个异步从库**

在 [图 5-2](/v1/ddia_0502.png) 的示例中，从库 1 的复制是同步的：在向用户报告写入成功并使结果对其他用户可见之前，主库需要等待从库 1 的确认，确保从库 1 已经收到写入操作。而从库 2 的复制是异步的：主库发送消息，但不等待该从库的响应。

在这幅图中，从库 2 处理消息前存在一个显著的延迟。通常情况下，复制的速度相当快：大多数数据库系统能在不到一秒内完成从库的同步，但它们不能提供复制用时的保证。有些情况下，从库可能落后主库几分钟或更久，例如：从库正在从故障中恢复，系统正在最大容量附近运行，或者当节点间存在网络问题时。

同步复制的优点是，从库能保证有与主库一致的最新数据副本。如果主库突然失效，我们可以确信这些数据仍然能在从库上找到。缺点是，如果同步从库没有响应（比如它已经崩溃，或者出现网络故障，或其它任何原因），主库就无法处理写入操作。主库必须阻止所有写入，并等待同步副本再次可用。

因此，将所有从库都设置为同步的是不切实际的：任何一个节点的中断都会导致整个系统停滞不前。实际上，如果在数据库上启用同步复制，通常意味着其中 **一个** 从库是同步的，而其他的从库则是异步的。如果该同步从库变得不可用或缓慢，则将一个异步从库改为同步运行。这保证你至少在两个节点上拥有最新的数据副本：主库和同步从库。这种配置有时也被称为 **半同步（semi-synchronous）**【7】。

通常情况下，基于领导者的复制都配置为完全异步。在这种情况下，如果主库失效且不可恢复，则任何尚未复制给从库的写入都会丢失。这意味着即使已经向客户端确认成功，写入也不能保证是 **持久（Durable）** 的。然而，一个完全异步的配置也有优点：即使所有的从库都落后了，主库也可以继续处理写入。

弱化的持久性可能听起来像是一个坏的折衷，但异步复制其实已经被广泛使用了，特别是在有很多从库的场景下，或者当从库在地理上分布很广的时候。我们将在讨论 “[复制延迟问题](#复制延迟问题)” 时回到这个问题。

> ### 关于复制的研究
>
> 对于异步复制系统而言，主库故障时会丢失数据可能是一个严重的问题，因此研究人员仍在研究不丢数据但仍能提供良好性能和可用性的复制方法。例如，**链式复制（chain replication）**【8,9】是同步复制的一种变体，已经在一些系统（如 Microsoft Azure Storage【10,11】）中成功实现。
>
> 复制的一致性与 **共识**（consensus，使几个节点就某个值达成一致）之间有着密切的联系，[第九章](/v1/ch9) 将详细地探讨这一领域的理论。本章主要讨论实践中的数据库常用的简单复制形式。
>

### 设置新从库

有时候需要设置一个新的从库：也许是为了增加副本的数量，或替换失败的节点。如何确保新的从库拥有主库数据的精确副本？

简单地将数据文件从一个节点复制到另一个节点通常是不够的：客户端不断向数据库写入数据，数据总是在不断地变化，标准的文件复制会看到数据库的不同部分在不同的时间点的内容，其结果可能没有任何意义。

可以通过锁定数据库（使其不可用于写入）来使磁盘上的文件保持一致，但是这会违背高可用的目标。幸运的是，设置新从库通常并不需要停机。从概念上讲，其过程如下所示：

1. 在某个时刻获取主库的一致性快照（如果可能，不必锁定整个数据库）。大多数数据库都具有这个功能，因为它是备份必需的。对于某些场景，可能需要第三方工具，例如用于 MySQL 的 innobackupex【12】。
2. 将快照复制到新的从库节点。
3. 从库连接到主库，并拉取快照之后发生的所有数据变更。这要求快照与主库复制日志中的位置精确关联。该位置有不同的名称，例如 PostgreSQL 将其称为 **日志序列号（log sequence number，LSN）**，MySQL 将其称为 **二进制日志坐标（binlog coordinates）**。
4. 当从库处理完快照之后积累的数据变更，我们就说它 **赶上（caught up）** 了主库，现在它可以继续及时处理主库产生的数据变化了。

建立从库的实际步骤因数据库而异。在某些系统中，这个过程是完全自动化的，而在另外一些系统中，它可能是一个需要由管理员手动执行的、有点神秘的多步骤工作流。

### 处理节点宕机

系统中的任何节点都可能宕机，可能因为意外的故障，也可能由于计划内的维护（例如，重启机器以安装内核安全补丁）。对运维而言，能在系统不中断服务的情况下重启单个节点好处多多。我们的目标是，即使个别节点失效，也能保持整个系统运行，并尽可能控制节点停机带来的影响。

如何通过基于领导者的复制实现高可用？

#### 从库失效：追赶恢复

在其本地磁盘上，每个从库记录从主库收到的数据变更。如果从库崩溃并重新启动，或者，如果主库和从库之间的网络暂时中断，则比较容易恢复：从库可以从日志中知道，在发生故障之前处理的最后一个事务。因此，从库可以连接到主库，并请求在从库断开期间发生的所有数据变更。当应用完所有这些变更后，它就赶上了主库，并可以像以前一样继续接收数据变更流。

#### 主库失效：故障切换

主库失效处理起来相当棘手：其中一个从库需要被提升为新的主库，需要重新配置客户端，以将它们的写操作发送给新的主库，其他从库需要开始拉取来自新主库的数据变更。这个过程被称为 **故障切换（failover）**。

故障切换可以手动进行（通知管理员主库挂了，并采取必要的步骤来创建新的主库）或自动进行。自动的故障切换过程通常由以下步骤组成：

1. 确认主库失效。有很多事情可能会出错：崩溃、停电、网络问题等等。没有万无一失的方法来检测出现了什么问题，所以大多数系统只是简单使用 **超时（Timeout）** ：节点频繁地相互来回传递消息，如果一个节点在一段时间内（例如 30 秒）没有响应，就认为它挂了（因为计划内维护而故意关闭主库不算）。
2. 选择一个新的主库。这可以通过选举过程（主库由剩余副本以多数选举产生）来完成，或者可以由之前选定的 **控制器节点（controller node）** 来指定新的主库。主库的最佳人选通常是拥有旧主库最新数据副本的从库（以最小化数据损失）。让所有的节点同意一个新的领导者，是一个 **共识** 问题，将在 [第九章](/v1/ch9) 详细讨论。
3. 重新配置系统以启用新的主库。客户端现在需要将它们的写请求发送给新主库（将在 “[请求路由](/v1/ch6#请求路由)” 中讨论这个问题）。如果旧主库恢复，可能仍然认为自己是主库，而没有意识到其他副本已经让它失去领导权了。系统需要确保旧主库意识到新主库的存在，并成为一个从库。

故障切换的过程中有很多地方可能出错：

* 如果使用异步复制，则新主库可能没有收到老主库宕机前最后的写入操作。在选出新主库后，如果老主库重新加入集群，又该如何处理这些老主库尚未复制的写入？在此期间，新主库可能已经收到了与老主库尚未复制的写入相冲突的写入。最常见的解决方案是简单丢弃老主库未复制的写入，这很可能打破客户对于数据持久性的期望。

* 如果数据库需要和其他外部存储相协调，那么丢弃写入内容是极其危险的操作。例如在 GitHub 【13】的一场事故中，一个过时的 MySQL 从库被提升为主库。数据库使用自增 ID 作为主键，因为新主库的计数器落后于老主库的计数器，所以新主库重新分配了一些已经被老主库分配掉的 ID 作为主键。这些主键也在 Redis 中使用，主键重用使得 MySQL 和 Redis 中的数据产生不一致，最后导致一些私有数据泄漏到错误的用户手中。

* 发生某些故障时（见 [第八章](/v1/ch8)）可能会出现两个节点都以为自己是主库的情况。这种情况称为 **脑裂（split brain）**，非常危险：如果两个主库都可以接受写操作，却没有冲突解决机制（请参阅 “[多主复制](#多主复制)”），那么数据就可能丢失或损坏。一些系统采取了安全防范措施：当检测到两个主库节点同时存在时会关闭其中一个节点 [^ii]，但设计粗糙的机制可能最后会导致两个节点都被关闭【14】。

  [^ii]: 这种机制称为 **屏障（fencing）**，或者更充满感情的术语是：**爆彼之头（Shoot The Other Node In The Head, STONITH）**。我们将在 “[领导者和锁](/v1/ch8#领导者和锁)” 中对屏障进行详细讨论。

* 主库被宣告死亡之前的正确超时应该怎么配置？在主库失效的情况下，超时时间越长意味着恢复时间也越长。但是如果超时设置太短，又可能会出现不必要的故障切换。例如，临时的负载峰值可能导致节点的响应时间增加到超出超时时间，或者网络故障也可能导致数据包延迟。如果系统已经处于高负载或网络问题的困扰之中，那么不必要的故障切换可能会让情况变得更糟糕。

这些问题没有简单的解决方案。因此，即使软件支持自动故障切换，不少运维团队还是更愿意手动执行故障切换。

节点故障、不可靠的网络、对副本一致性、持久性、可用性和延迟的权衡，这些问题实际上是分布式系统中的基本问题。[第八章](/v1/ch8) 和 [第九章](/v1/ch9) 将更深入地讨论它们。

### 复制日志的实现

基于领导者的复制在底层是如何工作的？实践中有好几种不同的复制方式，所以先简要地看一下。

#### 基于语句的复制

在最简单的情况下，主库记录下它执行的每个写入请求（**语句**，即 statement）并将该语句日志发送给从库。对于关系数据库来说，这意味着每个 `INSERT`、`UPDATE` 或 `DELETE` 语句都被转发给每个从库，每个从库解析并执行该 SQL 语句，就像直接从客户端收到一样。

虽然听上去很合理，但有很多问题会搞砸这种复制方式：

* 任何调用 **非确定性函数（nondeterministic）** 的语句，可能会在每个副本上生成不同的值。例如，使用 `NOW()` 获取当前日期时间，或使用 `RAND()` 获取一个随机数。
* 如果语句使用了 **自增列（auto increment）**，或者依赖于数据库中的现有数据（例如，`UPDATE ... WHERE <某些条件>`），则必须在每个副本上按照完全相同的顺序执行它们，否则可能会产生不同的效果。当有多个并发执行的事务时，这可能成为一个限制。
* 有副作用的语句（例如：触发器、存储过程、用户定义的函数）可能会在每个副本上产生不同的副作用，除非副作用是绝对确定性的。

的确有办法绕开这些问题 —— 例如，当语句被记录时，主库可以用固定的返回值替换掉任何不确定的函数调用，以便所有从库都能获得相同的值。但是由于边缘情况实在太多了，现在通常会选择其他的复制方法。

基于语句的复制在 5.1 版本前的 MySQL 中被使用到。因为它相当紧凑，现在有时候也还在用。但现在在默认情况下，如果语句中存在任何不确定性，MySQL 会切换到基于行的复制（稍后讨论）。VoltDB 使用了基于语句的复制，但要求事务必须是确定性的，以此来保证安全【15】。

#### 传输预写式日志（WAL）

在 [第三章](/v1/ch3) 中，我们讨论了存储引擎如何在磁盘上表示数据，我们也发现了通常会将写操作追加到日志中：

* 对于日志结构存储引擎（请参阅 “[SSTables 和 LSM 树](/v1/ch3#SSTables和LSM树)”），日志是主要的存储位置。日志段在后台压缩，并进行垃圾回收。
* 对于覆写单个磁盘块的 [B 树](/v1/ch3#B树)，每次修改都会先写入 **预写式日志（Write Ahead Log, WAL）**，以便崩溃后索引可以恢复到一个一致的状态。

在任何一种情况下，该日志都是包含了所有数据库写入的仅追加字节序列。可以使用完全相同的日志在另一个节点上构建副本：除了将日志写入磁盘之外，主库还可以通过网络将其发送给从库。

通过使用这个日志，从库可以构建一个与主库一模一样的数据结构拷贝。

这种复制方法在 PostgreSQL 和 Oracle 等一些产品中被使用到【16】。其主要缺点是日志记录的数据非常底层：WAL 包含哪些磁盘块中的哪些字节发生了更改。这使复制与存储引擎紧密耦合。如果数据库将其存储格式从一个版本更改为另一个版本，通常不可能在主库和从库上运行不同版本的数据库软件。

看上去这可能只是一个小的实现细节，但却可能对运维产生巨大的影响。如果复制协议允许从库使用比主库更新的软件版本，则可以先升级从库，然后执行故障切换，使升级后的节点之一成为新的主库，从而允许数据库软件的零停机升级。如果复制协议不允许版本不匹配（传输 WAL 经常出现这种情况），则此类升级需要停机。

#### 逻辑日志复制（基于行）

另一种方法是对复制和存储引擎使用不同的日志格式，这样可以将复制日志从存储引擎的内部实现中解耦出来。这种复制日志被称为逻辑日志（logical log），以将其与存储引擎的（物理）数据表示区分开来。

关系数据库的逻辑日志通常是以行的粒度来描述对数据库表的写入记录的序列：

* 对于插入的行，日志包含所有列的新值。
* 对于删除的行，日志包含足够的信息来唯一标识被删除的行，这通常是主键，但如果表上没有主键，则需要记录所有列的旧值。
* 对于更新的行，日志包含足够的信息来唯一标识被更新的行，以及所有列的新值（或至少所有已更改的列的新值）。

修改多行的事务会生成多条这样的日志记录，后面跟着一条指明事务已经提交的记录。MySQL 的二进制日志（当配置为使用基于行的复制时）使用了这种方法【17】。

由于逻辑日志与存储引擎的内部实现是解耦的，系统可以更容易地做到向后兼容，从而使主库和从库能够运行不同版本的数据库软件，或者甚至不同的存储引擎。

对于外部应用程序来说，逻辑日志格式也更容易解析。如果要将数据库的内容发送到外部系统，例如复制到数据仓库进行离线分析，或建立自定义索引和缓存【18】，这一点会很有用。这种技术被称为 **数据变更捕获（change data capture）**，[第十一章](/v1/ch11) 将重新讲到它。

#### 基于触发器的复制

到目前为止描述的复制方法是由数据库系统实现的，不涉及任何应用程序代码。在很多情况下，这就是你想要的。但在某些情况下需要更多的灵活性。例如，如果你只想复制数据的一个子集，或者想从一种数据库复制到另一种数据库，或者如果你需要冲突解决逻辑（请参阅 “[处理写入冲突](#处理写入冲突)”），则可能需要将复制操作上移到应用程序层。

一些工具，如 Oracle Golden Gate【19】，可以通过读取数据库日志，使得其他应用程序可以使用数据。另一种方法是使用许多关系数据库自带的功能：触发器和存储过程。

触发器允许你将数据更改（写入事务）发生时自动执行的自定义应用程序代码注册在数据库系统中。触发器有机会将更改记录到一个单独的表中，使用外部程序读取这个表，再加上一些必要的业务逻辑，就可以将数据变更复制到另一个系统去。例如，Databus for Oracle【20】和 Bucardo for Postgres【21】就是这样工作的。

基于触发器的复制通常比其他复制方法具有更高的开销，并且比数据库内置的复制更容易出错，也有很多限制。然而由于其灵活性，它仍然是很有用的。


## 复制延迟问题

容忍节点故障只是需要复制的一个原因。正如在 [第二部分](/v1/part-ii) 的介绍中提到的，其它原因还包括可伸缩性（处理比单个机器更多的请求）和延迟（让副本在地理位置上更接近用户）。

基于领导者的复制要求所有写入都由单个节点处理，但只读查询可以由任何一个副本来处理。所以对于读多写少的场景（Web 上的常见模式），一个有吸引力的选择是创建很多从库，并将读请求分散到所有的从库上去。这样能减小主库的负载，并允许由附近的副本来处理读请求。

在这种读伸缩（read-scaling）的体系结构中，只需添加更多的从库，就可以提高只读请求的服务容量。但是，这种方法实际上只适用于异步复制 —— 如果尝试同步复制到所有从库，则单个节点故障或网络中断将导致整个系统都无法写入。而且节点越多越有可能出现个别节点宕机的情况，所以完全同步的配置将是非常不可靠的。

不幸的是，当应用程序从异步从库读取时，如果从库落后，它可能会看到过时的信息。这会导致数据库中出现明显的不一致：同时对主库和从库执行相同的查询，可能得到不同的结果，因为并非所有的写入都反映在从库中。这种不一致只是一个暂时的状态 —— 如果停止写入数据库并等待一段时间，从库最终会赶上并与主库保持一致。出于这个原因，这种效应被称为 **最终一致性（eventual consistency）**【22,23】。[^iii]

[^iii]: 道格拉斯・特里（Douglas Terry）等人【24】创造了最终一致性这个术语，并经由 Werner Vogels【22】的推广，成为了许多 NoSQL 项目的口号。然而，最终一致性并不只属于 NoSQL 数据库：关系型数据库中的异步复制从库也有相同的特性。

最终一致性中的 “最终” 一词有意进行了模糊化：总的来说，副本落后的程度是没有限制的。在正常的操作中，**复制延迟（replication lag）**，即写入主库到反映至从库之间的延迟，可能仅仅是几分之一秒，在实践中并不显眼。但如果系统在接近极限的情况下运行，或网络中存在问题时，延迟可以轻而易举地超过几秒，甚至达到几分钟。

因为滞后时间太长引入的不一致性，不仅仅是一个理论问题，更是应用设计中会遇到的真实问题。本节将重点介绍三个在复制延迟时可能发生的问题实例，并简述解决这些问题的一些方法。

### 读己之写

许多应用让用户提交一些数据，然后查看他们提交的内容。可能是用户数据库中的记录，也可能是对讨论主题的评论，或其他类似的内容。提交新数据时，必须将其发送给主库，但是当用户查看数据时，可以通过从库进行读取。如果数据经常被查看，但只是偶尔写入，这是非常合适的。

但对于异步复制，问题就来了。如 [图 5-3](/v1/ddia_0503.png) 所示：如果用户在写入后马上就查看数据，则新数据可能尚未到达副本。对用户而言，看起来好像是刚提交的数据丢失了，所以他们不高兴是可以理解的。

![](/v1/ddia_0503.png)

**图 5-3 用户写入后从旧副本中读取数据。需要写后读 (read-after-write) 的一致性来防止这种异常**

在这种情况下，我们需要 **写后读一致性（read-after-write consistency）**，也称为 **读己之写一致性（read-your-writes consistency）**【24】。这是一个保证，如果用户重新加载页面，他们总会看到他们自己提交的任何更新。它不会对其他用户的写入做出承诺：其他用户的更新可能稍等才会看到。它保证用户自己的输入已被正确保存。

如何在基于领导者的复制系统中实现写后读一致性？有各种可能的技术，这里说一些：

* 对于用户 **可能修改过** 的内容，总是从主库读取；这就要求得有办法不通过实际的查询就可以知道用户是否修改了某些东西。举个例子，社交网络上的用户个人资料信息通常只能由用户本人编辑，而不能由其他人编辑。因此一个简单的规则就是：总是从主库读取用户自己的档案，如果要读取其他用户的档案就去从库。

* 如果应用中的大部分内容都可能被用户编辑，那这种方法就没用了，因为大部分内容都必须从主库读取（读伸缩就没效果了）。在这种情况下可以使用其他标准来决定是否从主库读取。例如可以跟踪上次更新的时间，在上次更新后的一分钟内，从主库读。还可以监控从库的复制延迟，防止向任何滞后主库超过一分钟的从库发出查询。

* 客户端可以记住最近一次写入的时间戳，系统需要确保从库在处理该用户的读取请求时，该时间戳前的变更都已经传播到了本从库中。如果当前从库不够新，则可以从另一个从库读取，或者等待从库追赶上来。这里的时间戳可以是逻辑时间戳（表示写入顺序的东西，例如日志序列号）或实际的系统时钟（在这种情况下，时钟同步变得至关重要，请参阅 “[不可靠的时钟](/v1/ch8#不可靠的时钟)”）。

* 如果你的副本分布在多个数据中心（为了在地理上接近用户或者出于可用性目的），还会有额外的复杂性。任何需要由主库提供服务的请求都必须路由到包含该主库的数据中心。

另一种复杂的情况发生在同一位用户从多个设备（例如桌面浏览器和移动 APP）请求服务的时候。这种情况下可能就需要提供跨设备的写后读一致性：如果用户在一个设备上输入了一些信息，然后在另一个设备上查看，则应该看到他们刚输入的信息。

在这种情况下，还有一些需要考虑的问题：

* 记住用户上次更新时间戳的方法变得更加困难，因为一个设备上运行的程序不知道另一个设备上发生了什么。需要对这些元数据进行中心化的存储。
* 如果副本分布在不同的数据中心，很难保证来自不同设备的连接会路由到同一数据中心。（例如，用户的台式计算机使用家庭宽带连接，而移动设备使用蜂窝数据网络，则设备的网络路由可能完全不同）。如果你的方法需要读主库，可能首先需要把来自该用户所有设备的请求都路由到同一个数据中心。


### 单调读

在从异步从库读取时可能发生的异常的第二个例子是用户可能会遇到 **时光倒流（moving backward in time）**。

如果用户从不同从库进行多次读取，就可能发生这种情况。例如，[图 5-4](/v1/ddia_0504.png) 显示了用户 2345 两次进行相同的查询，首先查询了一个延迟很小的从库，然后是一个延迟较大的从库（如果用户刷新网页时每个请求都被路由到一个随机的服务器，这种情况就很有可能发生）。第一个查询返回了最近由用户 1234 添加的评论，但是第二个查询不返回任何东西，因为滞后的从库还没有拉取到该写入内容。实际上可以认为第二个查询是在比第一个查询更早的时间点上观察系统。如果第一个查询没有返回任何内容，那问题并不大，因为用户 2345 可能不知道用户 1234 最近添加了评论。但如果用户 2345 先看见用户 1234 的评论，然后又看到它消失，这就会让人觉得非常困惑了。

![](/v1/ddia_0504.png)

**图 5-4 用户首先从新副本读取，然后从旧副本读取。时间看上去回退了。为了防止这种异常，我们需要单调的读取。**

**单调读（monotonic reads）**【23】可以保证这种异常不会发生。这是一个比 **强一致性（strong consistency）** 更弱，但比 **最终一致性（eventual consistency）** 更强的保证。当读取数据时，你可能会看到一个旧值；单调读仅意味着如果一个用户顺序地进行多次读取，则他们不会看到时间回退，也就是说，如果已经读取到较新的数据，后续的读取不会得到更旧的数据。

实现单调读的一种方式是确保每个用户总是从同一个副本进行读取（不同的用户可以从不同的副本读取）。例如，可以基于用户 ID 的散列来选择副本，而不是随机选择副本。但是，如果该副本出现故障，用户的查询将需要重新路由到另一个副本。


### 一致前缀读

第三个复制延迟异常的例子违反了因果律。想象一下 Poons 先生和 Cake 夫人之间的以下简短对话：

*Mr. Poons*
> Mrs. Cake，你能看到多远的未来？

*Mrs. Cake*
> 通常约十秒钟，Mr. Poons.

这两句话之间有因果关系：Cake 夫人听到了 Poons 先生的问题并回答了这个问题。

现在，想象第三个人正在通过从库来听这个对话。Cake 夫人说的内容是从一个延迟很低的从库读取的，但 Poons 先生所说的内容，从库的延迟要大的多（见 [图 5-5](/v1/ddia_0505.png)）。于是，这个观察者会听到以下内容：

*Mrs. Cake*
> 通常约十秒钟，Mr. Poons.

*Mr. Poons*
> Mrs. Cake，你能看到多远的未来？

对于观察者来说，看起来好像 Cake 夫人在 Poons 先生提问前就回答了这个问题。这种超能力让人印象深刻，但也会把人搞糊涂【25】。

![](/v1/ddia_0505.png)

**图 5-5 如果某些分区的复制速度慢于其他分区，那么观察者可能会在看到问题之前先看到答案。**

要防止这种异常，需要另一种类型的保证：**一致前缀读（consistent prefix reads）**【23】。这个保证的意思是说：如果一系列写入按某个顺序发生，那么任何人读取这些写入时，也会看见它们以同样的顺序出现。

这是 **分区（partitioned）** 或 **分片（sharded）** 数据库中的一个特殊问题，我们将在 [第六章](/v1/ch6) 中讨论分区数据库。如果数据库总是以相同的顺序应用写入，而读取总是看到一致的前缀，那么这种异常不会发生。但是在许多分布式数据库中，不同的分区独立运行，因此不存在 **全局的写入顺序**：当用户从数据库中读取数据时，可能会看到数据库的某些部分处于较旧的状态，而某些则处于较新的状态。

一种解决方案是，确保任何因果相关的写入都写入相同的分区，但在一些应用中可能无法高效地完成这种操作。还有一些显式跟踪因果依赖关系的算法，我们将在 “[“此前发生” 的关系和并发](#“此前发生”的关系和并发)” 一节中回到这个话题。

### 复制延迟的解决方案

在使用最终一致的系统时，如果复制延迟增加到几分钟甚至几小时，则应该考虑应用程序的行为。如果答案是 “没问题”，那很好。但如果结果对于用户来说是不好的体验，那么设计系统来提供更强的保证（例如 **写后读**）是很重要的。明明是异步复制却假设复制是同步的，这是很多麻烦的根源。

如前所述，应用程序可以提供比底层数据库更强有力的保证，例如通过主库进行某种读取。但在应用程序代码中处理这些问题是复杂的，容易出错。

如果应用程序开发人员不必担心微妙的复制问题，并可以信赖他们的数据库 “做了正确的事情”，那该多好呀。这就是 **事务（transaction）** 存在的原因：**数据库通过事务提供强大的保证**，所以应用程序可以更加简单。

单节点事务已经存在了很长时间。然而在走向分布式（复制和分区）数据库时，许多系统放弃了事务，声称事务在性能和可用性上的代价太高，并断言在可伸缩系统中最终一致性是不可避免的。这个叙述有一些道理，但过于简单了，本书其余部分将提出更为细致的观点。我们将在 [第七章](/v1/ch7) 和 [第九章](/v1/ch9) 回到事务的话题，并将在 [第三部分](/v1/part-iii) 讨论一些替代机制。


## 多主复制

本章到目前为止，我们只考虑了使用单个主库的复制架构。虽然这是一种常见的方法，但还有其它一些有趣的选择。

基于领导者的复制有一个主要的缺点：只有一个主库，而且所有的写入都必须通过它 [^iv]。如果出于任何原因（例如和主库之间的网络连接中断）无法连接到主库，就无法向数据库写入。

[^iv]: 如果数据库被分区（见 [第六章](/v1/ch6)），每个分区都有一个主库。不同的分区的主库可能在不同的节点上，但是每个分区都必须有一个主库。

基于领导者的复制模型的自然延伸是允许多个节点接受写入。复制仍然以同样的方式发生：处理写入的每个节点都必须将该数据变更转发给所有其他节点。我们将其称之为 **多领导者配置**（multi-leader configuration，也称多主、多活复制，即 master-master replication 或 active/active replication）。在这种情况下，每个主库同时是其他主库的从库。

### 多主复制的应用场景

在单个数据中心内部使用多个主库的配置没有太大意义，因为其导致的复杂性已经超过了能带来的好处。但在一些情况下，这种配置也是合理的。

#### 运维多个数据中心

假如你有一个数据库，副本分散在好几个不同的数据中心（可能会用来容忍单个数据中心的故障，或者为了在地理上更接近用户）。如果使用常规的基于领导者的复制设置，主库必须位于其中一个数据中心，且所有写入都必须经过该数据中心。

多主配置中可以在每个数据中心都有主库。[图 5-6](/v1/ddia_0506.png) 展示了这个架构。在每个数据中心内使用常规的主从复制；在数据中心之间，每个数据中心的主库都会将其更改复制到其他数据中心的主库中。

![](/v1/ddia_0506.png)

**图 5-6 跨多个数据中心的多主复制**

我们来比较一下在运维多个数据中心时，单主和多主的适应情况：

* 性能

  在单主配置中，每个写入都必须穿过互联网，进入主库所在的数据中心。这可能会增加写入时间，并可能违背了设置多个数据中心的初心。在多主配置中，每个写操作都可以在本地数据中心进行处理，并与其他数据中心异步复制。因此，数据中心之间的网络延迟对用户来说是透明的，这意味着感觉到的性能可能会更好。

* 容忍数据中心停机

  在单主配置中，如果主库所在的数据中心发生故障，故障切换必须使另一个数据中心里的从库成为主库。在多主配置中，每个数据中心可以独立于其他数据中心继续运行，并且当发生故障的数据中心归队时，复制会自动赶上。

* 容忍网络问题

  数据中心之间的通信通常穿过公共互联网，这可能不如数据中心内的本地网络可靠。单主配置对数据中心之间的连接问题非常敏感，因为通过这个连接进行的写操作是同步的。采用异步复制功能的多主配置通常能更好地承受网络问题：临时的网络中断并不会妨碍正在处理的写入。

有些数据库默认情况下支持多主配置，但使用外部工具实现也很常见，例如用于 MySQL 的 Tungsten Replicator 【26】，用于 PostgreSQL 的 BDR【27】以及用于 Oracle 的 GoldenGate 【19】。

尽管多主复制有这些优势，但也有一个很大的缺点：两个不同的数据中心可能会同时修改相同的数据，写冲突是必须解决的（如 [图 5-6](/v1/ddia_0506.png) 中的 “冲突解决（conflict resolution）”）。本书将在 “[处理写入冲突](#处理写入冲突)” 中详细讨论这个问题。

由于多主复制在许多数据库中都属于改装的功能，所以常常存在微妙的配置缺陷，且经常与其他数据库功能之间出现意外的反应。比如自增主键、触发器、完整性约束等都可能会有麻烦。因此，多主复制往往被认为是危险的领域，应尽可能避免【28】。

#### 需要离线操作的客户端

多主复制的另一种适用场景是：应用程序在断网之后仍然需要继续工作。

例如，考虑手机，笔记本电脑和其他设备上的日历应用。无论设备目前是否有互联网连接，你需要能随时查看你的会议（发出读取请求），输入新的会议（发出写入请求）。如果在离线状态下进行任何更改，则设备下次上线时，需要与服务器和其他设备同步。

在这种情况下，每个设备都有一个充当主库的本地数据库（它接受写请求），并且在所有设备上的日历副本之间同步时，存在异步的多主复制过程。复制延迟可能是几小时甚至几天，具体取决于何时可以访问互联网。

从架构的角度来看，这种设置实际上与数据中心之间的多主复制类似，每个设备都是一个 “数据中心”，而它们之间的网络连接是极度不可靠的。从历史上各类日历同步功能的破烂实现可以看出，想把多主复制用好是多么困难的一件事。

有一些工具旨在使这种多主配置更容易。例如，CouchDB 就是为这种操作模式而设计的【29】。

#### 协同编辑

实时协作编辑应用程序允许多个人同时编辑文档。例如，Etherpad 【30】和 Google Docs 【31】允许多人同时编辑文本文档或电子表格（该算法在 “[自动冲突解决](#自动冲突解决)” 中简要讨论）。我们通常不会将协作式编辑视为数据库复制问题，但它与前面提到的离线编辑用例有许多相似之处。当一个用户编辑文档时，所做的更改将立即应用到其本地副本（Web 浏览器或客户端应用程序中的文档状态），并异步复制到服务器和编辑同一文档的任何其他用户。

如果要保证不会发生编辑冲突，则应用程序必须先取得文档的锁定，然后用户才能对其进行编辑。如果另一个用户想要编辑同一个文档，他们首先必须等到第一个用户提交修改并释放锁定。这种协作模式相当于主从复制模型下在主节点上执行事务操作。

但是，为了加速协作，你可能希望将更改的单位设置得非常小（例如单次按键），并避免锁定。这种方法允许多个用户同时进行编辑，但同时也带来了多主复制的所有挑战，包括需要解决冲突【32】。

### 处理写入冲突

多主复制的最大问题是可能发生写冲突，这意味着需要解决冲突。

例如，考虑一个由两个用户同时编辑的维基页面，如 [图 5-7](/v1/ddia_0507.png) 所示。用户 1 将页面的标题从 A 更改为 B，并且用户 2 同时将标题从 A 更改为 C。每个用户的更改已成功应用到其本地主库。但当异步复制时，会发现冲突【33】。单主数据库中不会出现此问题。

![](/v1/ddia_0507.png)

**图 5-7 两个主库同时更新同一记录引起的写入冲突**

#### 同步与异步冲突检测

在单主数据库中，第二个写入将被阻塞并等待第一个写入完成，或者中止第二个写入事务并强制用户重试。另一方面，在多主配置中，两个写入都是成功的，在稍后的某个时间点才能异步地检测到冲突。那时再来要求用户解决冲突可能为时已晚。

原则上，可以使冲突检测同步 - 即等待写入被复制到所有副本，然后再告诉用户写入成功。但是，通过这样做，你将失去多主复制的主要优点：允许每个副本独立地接受写入。如果你想要同步冲突检测，那么你可能不如直接使用单主复制。

#### 避免冲突

处理冲突的最简单的策略就是避免它们：如果应用程序可以确保特定记录的所有写入都通过同一个主库，那么冲突就不会发生。由于许多的多主复制实现在处理冲突时处理得相当不好，避免冲突是一个经常被推荐的方法【34】。

例如，在一个用户可以编辑自己数据的应用程序中，可以确保来自特定用户的请求始终路由到同一数据中心，并使用该数据中心的主库进行读写。不同的用户可能有不同的 “主” 数据中心（可能根据用户的地理位置选择），但从任何一位用户的角度来看，本质上就是单主配置了。

但是，有时你可能需要更改被指定的主库 —— 可能是因为某个数据中心出现故障，你需要将流量重新路由到另一个数据中心，或者可能是因为用户已经迁移到另一个位置，现在更接近其它的数据中心。在这种情况下，冲突避免将失效，你必须处理不同主库同时写入的可能性。

#### 收敛至一致的状态

单主数据库按顺序进行写操作：如果同一个字段有多个更新，则最后一个写操作将决定该字段的最终值。

在多主配置中，没有明确的写入顺序，所以最终值应该是什么并不清楚。在 [图 5-7](/v1/ddia_0507.png) 中，在主库 1 中标题首先更新为 B 而后更新为 C；在主库 2 中，首先更新为 C，然后更新为 B。两种顺序都不比另一种“更正确”。

如果每个副本只是按照它看到写入的顺序写入，那么数据库最终将处于不一致的状态：最终值将是在主库 1 的 C 和主库 2 的 B。这是不可接受的，每个复制方案都必须确保数据最终在所有副本中都是相同的。因此，数据库必须以一种 **收敛（convergent）** 的方式解决冲突，这意味着所有副本必须在所有变更复制完成时收敛至一个相同的最终值。

实现收敛的冲突解决有多种途径：

* 给每个写入一个唯一的 ID（例如时间戳、长随机数、UUID 或者键和值的哈希），挑选最高 ID 的写入作为胜利者，并丢弃其他写入。如果使用时间戳，这种技术被称为 **最后写入胜利（LWW, last write wins）**。虽然这种方法很流行，但是很容易造成数据丢失【35】。我们将在本章末尾的 [检测并发写入](#检测并发写入) 一节更详细地讨论 LWW。
* 为每个副本分配一个唯一的 ID，来自 ID 编号更高的副本的写入具有更高的优先级。这种方法也意味着数据丢失。
* 以某种方式将这些值合并在一起 - 例如，按字母顺序排序，然后连接它们（在 [图 5-7](/v1/ddia_0507.png) 中，合并的标题可能类似于 “B/C”）。
* 用一种可保留所有信息的显式数据结构来记录冲突，并编写解决冲突的应用程序代码（也许通过提示用户的方式）。


#### 自定义冲突解决逻辑

解决冲突的最合适的方法可能取决于应用程序，大多数多主复制工具允许使用应用程序代码编写冲突解决逻辑。该代码可以在写入或读取时执行：

写时执行
: 只要数据库系统检测到复制更改日志中存在冲突，就会调用冲突处理程序。例如，Bucardo 允许你为此编写一段 Perl 代码。这个处理程序通常不能提示用户 —— 它在后台进程中运行，并且必须快速执行。

读时执行
: 当检测到冲突时，所有冲突写入被存储。下一次读取数据时，会将这些多个版本的数据返回给应用程序。应用程序可以提示用户或自动解决冲突，并将结果写回数据库。例如 CouchDB 就以这种方式工作。

请注意，冲突解决通常适用于单行记录或单个文档的层面，而不是整个事务【36】。因此，如果你有一个事务会原子性地进行几次不同的写入（请参阅 [第七章](/v1/ch7)），对于冲突解决而言，每个写入仍需分开单独考虑。


> #### 自动冲突解决
>
> 冲突解决规则可能很容易变得越来越复杂，自定义代码可能也很容易出错。亚马逊是一个经常被引用的例子，由于冲突解决处理程序而产生了令人意外的效果：一段时间以来，购物车上的冲突解决逻辑将保留添加到购物车的物品，但不包括从购物车中移除的物品。因此，顾客有时会看到物品重新出现在他们的购物车中，即使它们之前已经被移走【37】。
>
> 已经有一些有趣的研究来自动解决由于数据修改引起的冲突。有几项研究值得一提：
>
> * **无冲突复制数据类型（Conflict-free replicated datatypes，CRDT）**【32,38】是可以由多个用户同时编辑的集合、映射、有序列表、计数器等一系列数据结构，它们以合理的方式自动解决冲突。一些 CRDT 已经在 Riak 2.0 中实现【39,40】。
> * **可合并的持久数据结构（Mergeable persistent data structures）**【41】显式跟踪历史记录，类似于 Git 版本控制系统，并使用三向合并功能（而 CRDT 使用双向合并）。
> * **操作转换（operational transformation）**【42】是 Etherpad 【30】和 Google Docs 【31】等协同编辑应用背后的冲突解决算法。它是专为有序列表的并发编辑而设计的，例如构成文本文档的字符列表。
>
> 这些算法在数据库中的实现还很年轻，但很可能将来它们会被集成到更多的复制数据系统中。自动冲突解决方案可以使应用程序处理多主数据同步更为简单。


#### 什么是冲突？

有些冲突是显而易见的。在 [图 5-7](/v1/ddia_0507.png) 的例子中，两个写操作并发地修改了同一条记录中的同一个字段，并将其设置为两个不同的值。毫无疑问这是一个冲突。

其他类型的冲突可能更为微妙而难以发现。例如，考虑一个会议室预订系统：它记录谁订了哪个时间段的哪个房间。应用程序需要确保每个房间在任意时刻都只能被一组人进行预订（即不得有相同房间的重叠预订）。在这种情况下，如果为同一个房间同时创建两个不同的预订，则可能会发生冲突。即使应用程序在允许用户进行预订之前先检查会议室的可用性，如果两次预订是由两个不同的主库进行的，则仍然可能会有冲突。

虽然现在还没有一个现成的答案，但在接下来的章节中，我们将更好地了解这个问题。我们将在 [第七章](/v1/ch7) 中看到更多的冲突示例，在 [第十二章](/v1/ch12) 中我们将讨论用于检测和解决复制系统中冲突的可伸缩方法。


### 多主复制拓扑

**复制拓扑**（replication topology）用来描述写入操作从一个节点传播到另一个节点的通信路径。如果你有两个主库，如 [图 5-7](/v1/ddia_0507.png) 所示，只有一个合理的拓扑结构：主库 1 必须把它所有的写入都发送到主库 2，反之亦然。当有两个以上的主库，多种不同的拓扑都是可能的。[图 5-8](/v1/ddia_0508.png) 举例说明了一些例子。

![](/v1/ddia_0508.png)

**图 5-8 三种可以在多主复制中使用的拓扑示例。**

最通用的拓扑是全部到全部（all-to-all，如 [图 5-8 (c)](/v1/ddia_0508.png)），其中每个主库都将其写入发送给其他所有的主库。然而，一些更受限的拓扑也会被使用到：例如，默认情况下 MySQL 仅支持 **环形拓扑（circular topology）**【34】，其中每个节点都从一个节点接收写入，并将这些写入（加上自己的写入）转发给另一个节点。另一种流行的拓扑结构具有星形的形状 [^v]：一个指定的根节点将写入转发给所有其他节点。星形拓扑可以推广到树。

[^v]: 不要与星型模式混淆（请参阅 “[星型和雪花型：分析的模式](/v1/ch3#星型和雪花型：分析的模式)”），其中描述了数据模型的结构，而不是节点之间的通信拓扑。

在环形和星形拓扑中，写入可能需要在到达所有副本之前通过多个节点。因此，节点需要转发从其他节点收到的数据更改。为了防止无限复制循环，每个节点被赋予一个唯一的标识符，并且在复制日志中，每次写入都会使用其经过的所有节点的标识符进行标记【43】。当一个节点收到用自己的标识符标记的数据更改时，该数据更改将被忽略，因为节点知道它已经被处理过。

环形和星形拓扑的问题是，如果只有一个节点发生故障，则可能会中断其他节点之间的复制消息流，导致它们无法通信，除非节点被修复。拓扑结构可以重新配置为跳过发生故障的节点，但在大多数部署中，这种重新配置必须手动完成。更密集连接的拓扑结构（例如全部到全部）的容错性更好，因为它允许消息沿着不同的路径传播，可以避免单点故障。

另一方面，全部到全部的拓扑也可能有问题。特别是，一些网络链接可能比其他网络链接更快（例如由于网络拥塞），结果是一些复制消息可能 “超越” 其他复制消息，如 [图 5-9](/v1/ddia_0509.png) 所示。

![](/v1/ddia_0509.png)

**图 5-9 使用多主复制时，写入可能会以错误的顺序到达某些副本。**

在 [图 5-9](/v1/ddia_0509.png) 中，客户端 A 向主库 1 的表中插入一行，客户端 B 在主库 3 上更新该行。然而，主库 2 可以以不同的顺序接收写入：它可能先接收到更新（从它的角度来看，是对数据库中不存在的行的更新），稍后才接收到相应的插入（其应该在更新之前）。

这是一个因果关系的问题，类似于我们在 “[一致前缀读](#一致前缀读)” 中看到的：更新取决于先前的插入，所以我们需要确保所有节点先处理插入，然后再处理更新。仅仅在每一次写入时添加一个时间戳是不够的，因为时钟不可能被充分地同步，所以主库 2 就无法正确地对这些事件进行排序（见 [第八章](/v1/ch8)）。

要正确排序这些事件，可以使用一种称为 **版本向量（version vectors）** 的技术，本章稍后将讨论这种技术（请参阅 “[检测并发写入](#检测并发写入)”）。然而，许多多主复制系统中的冲突检测技术实现得并不好。例如，在撰写本文时，PostgreSQL BDR 不提供写入的因果排序【27】，而 Tungsten Replicator for MySQL 甚至都不尝试检测冲突【34】。

如果你正在使用基于多主复制的系统，那么你应该多了解这些问题，仔细阅读文档，并彻底测试你的数据库，以确保它确实提供了你想要的保证。


## 无主复制

我们在本章到目前为止所讨论的复制方法 —— 单主复制、多主复制 —— 都是这样的想法：客户端向一个主库发送写请求，而数据库系统负责将写入复制到其他副本。主库决定写入的顺序，而从库按相同顺序应用主库的写入。

一些数据存储系统采用不同的方法，放弃主库的概念，并允许任何副本直接接受来自客户端的写入。最早的一些复制数据系统是 **无主的（leaderless）**【1,44】，但是在关系数据库主导的时代，这个想法几乎已被忘却。在亚马逊将其用于其内部的 Dynamo 系统 [^vi] 之后，它再一次成为数据库的一种时尚架构【37】。Riak，Cassandra 和 Voldemort 是受 Dynamo 启发的无主复制模型的开源数据存储，所以这类数据库也被称为 *Dynamo 风格*。

[^vi]: Dynamo 并不对 Amazon 以外的用户开放。令人困惑的是，AWS 提供了一个名为 DynamoDB 的托管数据库产品，它使用了完全不同的体系结构：它基于单主复制。

在一些无主复制的实现中，客户端直接将写入发送到几个副本中，而另一些情况下，由一个 **协调者（coordinator）** 节点代表客户端进行写入。但与基于主库的数据库不同，协调者不执行特定的写入顺序。我们将会看到，这种设计上的差异对数据库的使用方式有着深远的影响。

### 当节点故障时写入数据库

假设你有一个带有三个副本的数据库，而其中一个副本目前不可用，或许正在重新启动以安装系统更新。在基于领导者的配置中，如果要继续处理写入，则可能需要执行故障切换（请参阅 “[处理节点宕机](#处理节点宕机)”）。

另一方面，在无主配置中，不存在故障切换。[图 5-10](/v1/ddia_0510.png) 演示了会发生什么事情：客户端（用户 1234）并行发送写入到所有三个副本，并且两个可用副本接受写入，但是不可用副本错过了它。假设三个副本中的两个确认写入是足够的：在用户 1234 已经收到两个确定的响应之后，我们认为写入成功。客户简单地忽略了其中一个副本错过了写入的事实。

![](/v1/ddia_0510.png)

**图 5-10 法定写入，法定读取，并在节点中断后读修复。**

现在想象一下，不可用的节点重新联机，客户端开始读取它。节点关闭期间发生的任何写入都不在该节点上。因此，如果你从该节点读取数据，则可能会从响应中拿到陈旧的（过时的）值。

为了解决这个问题，当一个客户端从数据库中读取数据时，它不仅仅把它的请求发送到一个副本：读请求将被并行地发送到多个节点。客户可能会从不同的节点获得不同的响应，即来自一个节点的最新值和来自另一个节点的陈旧值。版本号将被用于确定哪个值是更新的（请参阅 “[检测并发写入](#检测并发写入)”）。

#### 读修复和反熵

复制方案应确保最终将所有数据复制到每个副本。在一个不可用的节点重新联机之后，它如何赶上它错过的写入？

在 Dynamo 风格的数据存储中经常使用两种机制：

读修复（Read repair）
: 当客户端并行读取多个节点时，它可以检测到任何陈旧的响应。例如，在 [图 5-10](/v1/ddia_0510.png) 中，用户 2345 获得了来自副本 3 的版本 6 值和来自副本 1 和 2 的版本 7 值。客户端发现副本 3 具有陈旧值，并将新值写回到该副本。这种方法适用于读频繁的值。

反熵过程（Anti-entropy process）
: 此外，一些数据存储具有后台进程，该进程不断查找副本之间的数据差异，并将任何缺少的数据从一个副本复制到另一个副本。与基于领导者的复制中的复制日志不同，此反熵过程不会以任何特定的顺序复制写入，并且在复制数据之前可能会有显著的延迟。

并不是所有的系统都实现了这两种机制，例如，Voldemort 目前没有反熵过程。请注意，如果没有反熵过程，很少被读取的值可能会从某些副本中丢失，从而降低了持久性，因为只有在应用程序读取值时才执行读修复。

#### 读写的法定人数

在 [图 5-10](/v1/ddia_0510.png) 的示例中，我们认为即使仅在三个副本中的两个上进行处理，写入仍然是成功的。如果三个副本中只有一个接受了写入，会怎样？以此类推，究竟多少个副本完成才可以认为写入成功？

如果我们知道，每个成功的写操作意味着在三个副本中至少有两个出现，这意味着至多有一个副本可能是陈旧的。因此，如果我们从至少两个副本读取，我们可以确定至少有一个是最新的。如果第三个副本停机或响应速度缓慢，则读取仍可以继续返回最新值。

更一般地说，如果有 n 个副本，每个写入必须由 w 个节点确认才能被认为是成功的，并且我们必须至少为每个读取查询 r 个节点。（在我们的例子中，$n = 3$，$w = 2$，$r = 2$）。只要 $w + r > n$，我们可以预期在读取时能获得最新的值，因为 r 个读取中至少有一个节点是最新的。遵循这些 r 值和 w 值的读写称为 **法定人数（quorum）**[^vii] 的读和写【44】。你可以认为，r 和 w 是有效读写所需的最低票数。

[^vii]: 有时候这种法定人数被称为严格的法定人数，其相对 “宽松的法定人数” 而言（见 “[宽松的法定人数与提示移交](#宽松的法定人数与提示移交)”）

在 Dynamo 风格的数据库中，参数 n、w 和 r 通常是可配置的。一个常见的选择是使 n 为奇数（通常为 3 或 5）并设置 $w = r = (n + 1) / 2$（向上取整）。但是你可以根据需要更改数字。例如，写入次数较少且读取次数较多的工作负载可以从设置 $w = n$ 和 $r = 1$ 中受益。这会使得读取速度更快，但缺点是只要有一个不可用的节点就会导致所有的数据库写入都失败。

> 集群中可能有多于 n 个的节点（集群的机器数可能多于副本数目）。但是任何给定的值只能存储在 n 个节点上。这允许对数据集进行分区，从而可以支持比单个节点的存储能力更大的数据集。我们将在 [第六章](/v1/ch6) 继续讨论分区。

法定人数条件 $w + r > n$ 允许系统容忍不可用的节点，如下所示：

* 如果 $w < n$，当节点不可用时，我们仍然可以处理写入。
* 如果 $r < n$，当节点不可用时，我们仍然可以处理读取。
* 对于 $n = 3$，$w = 2$，$r = 2$，我们可以容忍一个不可用的节点。
* 对于 $n = 5$，$w = 3$，$r = 3$，我们可以容忍两个不可用的节点。这个案例如 [图 5-11](/v1/ddia_0511.png) 所示。
* 通常，读取和写入操作始终并行发送到所有 n 个副本。参数 w 和 r 决定我们等待多少个节点，即在我们认为读或写成功之前，有多少个节点需要报告成功。

![](/v1/ddia_0511.png)

**图 5-11  如果 $w + r > n$，读取 r 个副本，至少有一个副本必然包含了最近的成功写入。**

如果可用的节点少于所需的 w 或 r，则写入或读取将返回错误。节点可能由于多种原因而不可用，比如：节点关闭（异常崩溃，电源关闭）、操作执行过程中的错误（由于磁盘已满而无法写入）、客户端和服务器节点之间的网络中断或任何其他原因。我们只需要关心节点是否返回了成功的响应，而不需要区分不同类型的错误。


### 法定人数一致性的局限性

如果你有 n 个副本，并且你选择了满足 $w + r > n$ 的 w 和 r，你通常可以期望每次读取都能返回最近写入的值。情况就是这样，因为你写入的节点集合和你读取的节点集合必然有重叠。也就是说，你读取的节点中必然至少有一个节点具有最新值（如 [图 5-11](/v1/ddia_0511.png) 所示）。

通常，r 和 w 被选为多数（超过 $n/2$ ）节点，因为这确保了 $w + r > n$，同时仍然容忍多达 $n/2$ 个节点故障。但是，法定人数不一定必须是大多数，重要的是读写使用的节点至少有一个节点的交集。其他法定人数的配置是可能的，这使得分布式算法的设计有一定的灵活性【45】。

你也可以将 w 和 r 设置为较小的数字，以使 $w + r ≤ n$（即法定条件不满足）。在这种情况下，读取和写入操作仍将被发送到 n 个节点，但操作成功只需要少量的成功响应。

较小的 w 和 r 更有可能会读取到陈旧的数据，因为你的读取更有可能未包含具有最新值的节点。另一方面，这种配置允许更低的延迟和更高的可用性：如果存在网络中断，并且许多副本变得无法访问，则有更大的机会可以继续处理读取和写入。只有当可达副本的数量低于 w 或 r 时，数据库才变得不可写入或读取。

但是，即使在 $w + r > n$ 的情况下，也可能存在返回陈旧值的边缘情况。这取决于实现，但可能的情况包括：

* 如果使用宽松的法定人数（见 “[宽松的法定人数与提示移交](#宽松的法定人数与提示移交)”），w 个写入和 r 个读取有可能落在完全不同的节点上，因此 r 个节点和 w 个节点之间不再保证有重叠【46】。
* 如果两个写入同时发生，不清楚哪一个先发生。在这种情况下，唯一安全的解决方案是合并并发写入（请参阅 “[处理写入冲突](#处理写入冲突)”）。如果根据时间戳（最后写入胜利）挑选出一个胜者，则写入可能由于时钟偏差【35】而丢失。我们将在 “[检测并发写入](#检测并发写入)” 继续讨论此话题。
* 如果写操作与读操作同时发生，写操作可能仅反映在某些副本上。在这种情况下，不确定读取返回的是旧值还是新值。
* 如果写操作在某些副本上成功，而在其他节点上失败（例如，因为某些节点上的磁盘已满），并且总共在少于 w 个副本上写入成功，那么整体判定写入失败，但写入成功的那些副本并不会回滚。这意味着一个写入虽然报告失败，后续的读取仍然可能会读取这次失败写入的值【47】。
* 如果携带新值的节点发生故障，需要从其他带有旧值的副本进行恢复，则存储新值的副本数可能会低于 w，从而打破法定人数条件。
* 即使一切工作正常，有时也会不幸地出现关于 **时序（timing）** 的边缘情况，我们将在 “[线性一致性和法定人数](/v1/ch9#线性一致性和法定人数)” 中看到这点。

因此，尽管法定人数似乎保证读取返回最新的写入值，但在实践中并不那么简单。Dynamo 风格的数据库通常针对可以忍受最终一致性的用例进行优化。你可以通过参数 w 和 r 来调整读取到陈旧值的概率，但把它们当成绝对的保证是不明智的。

尤其是，因为通常得不到 “[复制延迟问题](#复制延迟问题)” 中讨论的那些保证（读己之写，单调读，一致前缀读），前面提到的异常可能会发生在应用程序中。更强有力的保证通常需要 **事务** 或 **共识**。我们将在 [第七章](/v1/ch7) 和 [第九章](/v1/ch9) 回到这些话题。

#### 监控陈旧度

从运维的角度来看，监视你的数据库是否返回最新的结果是很重要的。即使应用可以容忍陈旧的读取，你也需要了解复制的健康状况。如果显著落后，它应该提醒你以便你可以调查原因（例如网络中的问题或过载的节点）。

对于基于领导者的复制，数据库通常会提供复制延迟的测量值，你可以将其提供给监视系统。这之所以能做到，是因为写入是按照相同的顺序应用于主库和从库，并且每个节点对应了复制日志中的一个位置（已经在本地应用的写入数量）。通过从主库的当前位置中减去从库的当前位置，你可以测量复制延迟的程度。

然而，在无主复制的系统中，没有固定的写入顺序，这使得监控变得更加困难。而且，如果数据库只使用读修复（没有反熵过程），那么对于一个值可能会有多陈旧其实是没有限制的 - 如果一个值很少被读取，那么由一个陈旧副本返回的值可能是古老的。

已经有一些关于衡量无主复制数据库中的复制陈旧度的研究，并根据参数 n、w 和 r 来预测陈旧读取的预期百分比【48】。不幸的是，这还不是很常见的做法，但是将陈旧测量值包含在数据库的标准度量集中是一件好事。虽然最终一致性是一种有意模糊的保证，但是从可操作性角度来说，能够量化 “最终” 也是很重要的。

### 宽松的法定人数与提示移交

合理配置的法定人数可以使数据库无需故障切换即可容忍个别节点的故障。它也可以容忍个别节点变慢，因为请求不必等待所有 n 个节点响应 —— 当 w 或 r 个节点响应时它们就可以返回。对于需要高可用、低延时、且能够容忍偶尔读到陈旧值的应用场景来说，这些特性使无主复制的数据库很有吸引力。

然而，法定人数（如迄今为止所描述的）并不像它们可能的那样具有容错性。网络中断可以很容易地将客户端从大量的数据库节点上切断。虽然这些节点是活着的，而其他客户端可能也能够连接到它们，但是对于与这些数据库节点断开连接的客户端来说，它们和已经死亡没有区别。在这种情况下，剩余的可用节点可能会少于 w 或 r，因此客户端不再能达到法定人数。

在一个大型的集群中（节点数量明显多于 n 个），网络中断期间客户端可能仍能连接到一些数据库节点，但又不足以组成一个特定的法定人数。在这种情况下，数据库设计人员需要权衡一下：

* 对于所有无法达到 w 或 r 个节点法定人数的请求，是否返回错误是更好的？
* 或者我们是否应该接受写入，然后将它们写入一些可达的节点，但不在这些值通常所存在的 n 个节点上？

后者被认为是一个 **宽松的法定人数（sloppy quorum）**【37】：写和读仍然需要 w 和 r 个成功的响应，但这些响应可能来自不在指定的 n 个 “主” 节点中的其它节点。就好比说，如果你把自己锁在房子外面了，你可能会去敲开邻居的门，问是否可以暂时呆在他们的沙发上。

一旦网络中断得到解决，一个节点代表另一个节点临时接受的任何写入都将被发送到适当的 “主” 节点。这就是所谓的 **提示移交（hinted handoff）**（一旦你再次找到你的房子的钥匙，你的邻居可以礼貌地要求你离开沙发回家）。

宽松的法定人数对写入可用性的提高特别有用：只要有任何 w 个节点可用，数据库就可以接受写入。然而，这意味着即使当 $w + r > n$ 时，也不能确保读取到某个键的最新值，因为最新的值可能已经临时写入了 n 之外的某些节点【47】。

因此，在传统意义上，宽松的法定人数实际上并不是法定人数。它只是一个持久性的保证，即数据已存储在某处的 w 个节点。但不能保证 r 个节点的读取能看到它，除非提示移交已经完成。

在所有常见的 Dynamo 实现中，宽松的法定人数是可选的。在 Riak 中，它们默认是启用的，而在 Cassandra 和 Voldemort 中它们默认是禁用的【46,49,50】。

#### 运维多个数据中心

我们先前讨论了跨数据中心复制，作为多主复制的用例（请参阅 “[多主复制](#多主复制)”）。其实无主复制也适用于多数据中心操作，既然它旨在容忍冲突的并发写入、网络中断和延迟尖峰。

Cassandra 和 Voldemort 在正常的无主模型中实现了他们的多数据中心支持：副本的数量 n 包括所有数据中心的节点，你可以在配置中指定每个数据中心所拥有的副本的数量。无论数据中心如何，每个来自客户端的写入都会发送到所有副本，但客户端通常只等待来自其本地数据中心内的法定节点的确认，从而不会受到跨数据中心链路延迟和中断的影响。对其他数据中心的高延迟写入通常被配置为异步执行，尽管该配置仍有一定的灵活性【50,51】。

Riak 将客户端和数据库节点之间的所有通信保持在一个本地的数据中心，因此 n 描述了一个数据中心内的副本数量。数据库集群之间的跨数据中心复制在后台异步发生，其风格类似于多主复制【52】。

### 检测并发写入

Dynamo 风格的数据库允许多个客户端同时写入相同的键（Key），这意味着即使使用严格的法定人数也会发生冲突。这种情况与多主复制相似（请参阅 “[处理写入冲突](#处理写入冲突)”），但在 Dynamo 风格的数据库中，在 **读修复** 或 **提示移交** 期间也可能会产生冲突。

其问题在于，由于可变的网络延迟和部分节点的故障，事件可能以不同的顺序到达不同的节点。例如，[图 5-12](/v1/ddia_0512.png) 显示了两个客户机 A 和 B 同时写入三节点数据存储中的键 X：

* 节点 1 接收来自 A 的写入，但由于暂时中断，未接收到来自 B 的写入。
* 节点 2 首先接收来自 A 的写入，然后接收来自 B 的写入。
* 节点 3 首先接收来自 B 的写入，然后接收来自 A 的写入。

![](/v1/ddia_0512.png)

**图 5-12 并发写入 Dynamo 风格的数据存储：没有明确定义的顺序。**

如果每个节点只要接收到来自客户端的写入请求就简单地覆写某个键值，那么节点就会永久地不一致，如 [图 5-12](/v1/ddia_0512.png) 中的最终获取请求所示：节点 2 认为 X 的最终值是 B，而其他节点认为值是 A 。

为了最终达成一致，副本应该趋于相同的值。如何做到这一点？有人可能希望复制的数据库能够自动处理，但不幸的是，大多数的实现都很糟糕：如果你想避免丢失数据，你（应用程序开发人员）需要知道很多有关数据库冲突处理的内部信息。

在 “[处理写入冲突](#处理写入冲突)” 一节中已经简要介绍了一些解决冲突的技术。在总结本章之前，让我们来更详细地探讨这个问题。

#### 最后写入胜利（丢弃并发写入）

实现最终收敛的一种方法是声明每个副本只需要存储 **“最近”** 的值，并允许 **“更旧”** 的值被覆盖和抛弃。然后，只要我们有一种明确的方式来确定哪个写是 “最近的”，并且每个写入最终都被复制到每个副本，那么复制最终会收敛到相同的值。

正如 **“最近”** 的引号所表明的，这个想法其实颇具误导性。在 [图 5-12](/v1/ddia_0512.png) 的例子中，当客户端向数据库节点发送写入请求时，两个客户端都不知道另一个客户端，因此不清楚哪一个先发送请求。事实上，说这两种情况谁先发送请求是没有意义的：既然我们说写入是 **并发（concurrent）** 的，那么它们的顺序就是不确定的。

即使写入没有自然的排序，我们也可以强制进行排序。例如，可以为每个写入附加一个时间戳，然后挑选最大的时间戳作为 **“最近的”**，并丢弃具有较早时间戳的任何写入。这种冲突解决算法被称为 **最后写入胜利（LWW, last write wins）**，是 Cassandra 唯一支持的冲突解决方法【53】，也是 Riak 中的一个可选特征【35】。

LWW 实现了最终收敛的目标，但以 **持久性** 为代价：如果同一个键有多个并发写入，即使它们反馈给客户端的结果都是成功的（因为它们被写入 w 个副本），也只有一个写入将被保留，而其他写入将被默默地丢弃。此外，LWW 甚至可能会丢弃不是并发的写入，我们将在 “[有序事件的时间戳](/v1/ch8#有序事件的时间戳)” 中进行讨论。

在类似缓存的一些情况下，写入丢失可能是可以接受的。但如果数据丢失不可接受，LWW 是解决冲突的一个很烂的选择。

在数据库中使用 LWW 的唯一安全方法是确保一个键只写入一次，然后视为不可变，从而避免对同一个键进行并发更新。例如，Cassandra 推荐使用的方法是使用 UUID 作为键，从而为每个写操作提供一个唯一的键【53】。

#### “此前发生”的关系和并发

我们如何判断两个操作是否是并发的？为了建立一个直觉，让我们看看一些例子：

* 在 [图 5-9](/v1/ddia_0509.png) 中，两个写入不是并发的：A 的插入发生在 B 的递增之前，因为 B 递增的值是 A 插入的值。换句话说，B 的操作建立在 A 的操作上，所以 B 的操作必须后发生。我们也可以说 B **因果依赖（causally dependent）** 于 A。
* 另一方面，[图 5-12](/v1/ddia_0512.png) 中的两个写入是并发的：当每个客户端启动操作时，它不知道另一个客户端也正在对同样的键执行操作。因此，操作之间不存在因果关系。

如果操作 B 了解操作 A，或者依赖于 A，或者以某种方式构建于操作 A 之上，则操作 A 在操作 B 之前发生（happens before）。一个操作是否在另一个操作之前发生是定义并发含义的关键。事实上，我们可以简单地说，如果两个操作中的任何一个都不在另一个之前发生（即，两个操作都不知道对方），那么这两个操作是并发的【54】。

因此，只要有两个操作 A 和 B，就有三种可能性：A 在 B 之前发生，或者 B 在 A 之前发生，或者 A 和 B 并发。我们需要的是一个算法来告诉我们两个操作是否是并发的。如果一个操作发生在另一个操作之前，则后面的操作应该覆盖前面的操作，但是如果这些操作是并发的，则存在需要解决的冲突。


> #### 并发性、时间和相对性
>
> 如果两个操作 **“同时”** 发生，似乎应该称为并发 —— 但事实上，它们在字面时间上重叠与否并不重要。由于分布式系统中的时钟问题，现实中是很难判断两个事件是否是 **同时** 发生的，这个问题我们将在 [第八章](/v1/ch8) 中详细讨论。
>
> 为了定义并发性，确切的时间并不重要：如果两个操作都意识不到对方的存在，就称这两个操作 **并发**，而不管它们实际发生的物理时间。人们有时把这个原理和物理学中的狭义相对论联系起来【54】，该理论引入了信息不能比光速更快的思想。因此，如果两个事件发生的时间差小于光通过它们之间的距离所需要的时间，那么这两个事件不可能相互影响。
>
> 在计算机系统中，即使光速原则上允许一个操作影响另一个操作，但两个操作也可能是 **并发的**。例如，如果网络缓慢或中断，两个操作间可能会出现一段时间间隔，但仍然是并发的，因为网络问题阻止一个操作意识到另一个操作的存在。


#### 捕获"此前发生"关系

我们来看一个算法，它可以确定两个操作是否为并发的，还是一个在另一个之前。简单起见，我们从一个只有一个副本的数据库开始。一旦我们知道了如何在单个副本上完成这项工作，我们可以将该方法推广到具有多个副本的无主数据库。

[图 5-13](/v1/ddia_0513.png) 显示了两个客户端同时向同一购物车添加项目。（如果这样的例子让你觉得无趣，那么可以想象一下两个空中交通管制员同时把飞机添加到他们正在跟踪的区域。）最初，购物车是空的。然后客户端向数据库发出五次写入：

1. 客户端 1 将牛奶加入购物车。这是该键的第一次写入，服务器成功存储了它并为其分配版本号 1，最后将值与版本号一起回送给客户端。
2. 客户端 2 将鸡蛋加入购物车，不知道客户端 1 同时添加了牛奶（客户端 2 认为它的鸡蛋是购物车中的唯一物品）。服务器为此写入分配版本号 2，并将鸡蛋和牛奶存储为两个单独的值。然后它将这两个值 **都** 返回给客户端 2 ，并附上版本号 2。
3. 客户端 1 不知道客户端 2 的写入，想要将面粉加入购物车，因此认为当前的购物车内容应该是 [牛奶，面粉]。它将此值与服务器先前向客户端 1 提供的版本号 1 一起发送到服务器。服务器可以从版本号中知道 [牛奶，面粉] 的写入取代了 [牛奶] 的先前值，但与 [鸡蛋] 的值是 **并发** 的。因此，服务器将版本号 3 分配给 [牛奶，面粉]，覆盖版本 1 的值 [牛奶]，但保留版本 2 的值 [鸡蛋]，并将所有的值返回给客户端 1。
4. 同时，客户端 2 想要加入火腿，不知道客户端 1 刚刚加了面粉。客户端 2 在最近一次响应中从服务器收到了两个值 [牛奶] 和 [鸡蛋]，所以客户端 2 现在合并这些值，并添加火腿形成一个新的值 [鸡蛋，牛奶，火腿]。它将这个值发送到服务器，带着之前的版本号 2 。服务器检测到新值会覆盖版本 2 的值 [鸡蛋]，但新值也会与版本 3 的值 [牛奶，面粉] **并发**，所以剩下的两个值是版本 3 的 [牛奶，面粉]，和版本 4 的 [鸡蛋，牛奶，火腿]。
5. 最后，客户端 1 想要加培根。它之前从服务器接收到了版本 3 的 [牛奶，面粉] 和 [鸡蛋]，所以它合并这些，添加培根，并将最终值 [牛奶，面粉，鸡蛋，培根] 连同版本号 3 发往服务器。这会覆盖版本 3 的值 [牛奶，面粉]（请注意 [鸡蛋] 已经在上一步被覆盖），但与版本 4 的值 [鸡蛋，牛奶，火腿] 并发，所以服务器将保留这两个并发值。

![](/v1/ddia_0513.png)

**图 5-13  在同时编辑购物车时捕获两个客户端之间的因果关系。**

[图 5-13](/v1/ddia_0513.png) 中的操作之间的数据流如 [图 5-14](/v1/ddia_0514.png) 所示。箭头表示哪个操作发生在其他操作之前，意味着后面的操作知道或依赖于较早的操作。在这个例子中，客户端永远不会完全拿到服务器上的最新数据，因为总是有另一个操作同时进行。但是旧版本的值最终会被覆盖，并且不会丢失任何写入。

![](/v1/ddia_0514.png)

**图 5-14 图 5-13 中的因果依赖关系图。**

请注意，服务器可以只通过查看版本号来确定两个操作是否是并发的 —— 它不需要对值本身进行解释（因此该值可以是任何数据结构）。该算法的工作原理如下：

* 服务器为每个键维护一个版本号，每次写入该键时都递增版本号，并将新版本号与写入的值一起存储。
* 当客户端读取键时，服务器将返回所有未覆盖的值以及最新的版本号。客户端在写入前必须先读取。
* 当客户端写入键时，必须包含之前读取的版本号，并且必须将之前读取的所有值合并在一起（针对写入请求的响应可以像读取请求一样，返回所有当前值，这使得我们可以像购物车示例那样将多个写入串联起来）。
* 当服务器接收到具有特定版本号的写入时，它可以覆盖该版本号或更低版本的所有值（因为它知道它们已经被合并到新的值中），但是它必须保留所有具有更高版本号的值（因为这些值与这次传入的写入是并发的）。

当一个写入包含前一次读取的版本号时，它会告诉我们这次写入是基于之前的哪一种状态。如果在不包含版本号的情况下进行写操作，则与所有其他写操作并发，因此它不会覆盖任何内容 —— 只会在随后的读取中作为其中一个值返回。

#### 合并并发写入的值

这种算法可以确保没有数据被无声地丢弃，但不幸的是，客户端需要做一些额外的工作：客户端随后必须合并并发写入的值。Riak 称这些并发值为 **兄弟（siblings）**。

合并并发值，本质上是与多主复制中的冲突解决问题相同，我们先前讨论过（请参阅 “[处理写入冲突](#处理写入冲突)”）。一个简单的方法是根据版本号或时间戳（最后写入胜利）来选择一个值，但这意味着丢失数据。所以，你可能需要在应用程序代码中额外做些更聪明的事情。

以购物车为例，一种合理的合并值的方法就是做并集。在 [图 5-14](/v1/ddia_0514.png) 中，最后的两个兄弟是 [牛奶，面粉，鸡蛋，培根] 和 [鸡蛋，牛奶，火腿]。注意牛奶和鸡蛋虽然同时出现在两个并发值里，但他们每个只被写过一次。合并的值可以是 [牛奶，面粉，鸡蛋，培根，火腿]，不再有重复了。

然而，如果你想让人们也可以从他们的购物车中 **移除** 东西，而不是仅仅添加东西，那么把并发值做并集可能不会产生正确的结果：如果你合并了两个客户端的购物车，并且只在其中一个客户端里面移除了一个项目，那么被移除的项目将会重新出现在这两个兄弟值的并集结果中【37】。为了防止这个问题，要移除一个项目时不能简单地直接从数据库中删除；相反，系统必须留下一个具有适当版本号的标记，以在兄弟合并时表明该项目已被移除。这种删除标记被称为 **墓碑（tombstone）**（我们上一次看到墓碑是在 “[散列索引](/v1/ch3#散列索引)” 章节的日志压缩部分）。

因为在应用程序代码中做兄弟合并是复杂且容易出错的，所以有一些数据结构被设计出来用于自动执行这种合并，比如在 “[自动冲突解决](#自动冲突解决)” 中讨论过的那些。举例来说，Riak 的数据类型就支持使用称为 CRDT 【38,39,55】的能以合理方式自动进行兄弟合并的数据结构家族，包括对保留删除的支持。

#### 版本向量

[图 5-13](/v1/ddia_0513.png) 中的示例只使用了一个副本。当有多个副本但又没有主库时，算法该如何修改？

[图 5-13](/v1/ddia_0513.png) 使用单个版本号来捕获操作之间的依赖关系，但是当多个副本并发接受写入时，这是不够的。相反，除了对每个键，我们还需要对 **每个副本** 使用版本号。每个副本在处理写入时增加自己的版本号，并且跟踪从其他副本中看到的版本号。这个信息指出了要覆盖哪些并发值，以及要保留哪些并发值或兄弟值。

所有副本的版本号集合称为 **版本向量（version vector）**【56】。这个想法的一些变体正在被使用，但最有趣的可能是在 Riak 2.0 【58,59】中使用的 **虚线版本向量（dotted version vector）**【57】。我们不会深入细节，但是它的工作方式与我们在购物车示例中看到的非常相似。

与 [图 5-13](/v1/ddia_0513.png) 中的版本号一样，当读取值时，版本向量会从数据库副本发送到客户端，并且随后写入值时需要将其发送回数据库。（Riak 将版本向量编码为一个字符串，并称其为 **因果上下文**，即 causal context）。版本向量允许数据库区分覆盖写入和并发写入。

另外，就像在单个副本中的情况一样，应用程序可能需要合并并发值。版本向量结构能够确保从一个副本读取并随后写回到另一个副本是安全的。这样做虽然可能会产生兄弟值，但只要能正确合并兄弟值就不会丢失数据。

> #### 版本向量和向量时钟
>
> 版本向量有时也被称为向量时钟，即使它们不完全相同。其中的差别很微妙 —— 细节请参阅参考资料【57,60,61】。简而言之，在比较副本的状态时，版本向量才是正确的数据结构。


## 本章小结

在本章中，我们考察了复制的问题。复制可以用于几个目的：

高可用性
: 即使在一台机器（或多台机器，或整个数据中心）停机的情况下也能保持系统正常运行

断开连接的操作
: 允许应用程序在网络中断时继续工作

延迟
: 将数据放置在地理上距离用户较近的地方，以便用户能够更快地与其交互

可伸缩性
: 通过在副本上读，能够处理比单机更大的读取量


尽管是一个简单的目标 - 在几台机器上保留相同数据的副本，但复制却是一个非常棘手的问题。它需要仔细考虑并发和所有可能出错的事情，并处理这些故障的后果。至少，我们需要处理不可用的节点和网络中断（这还不包括更隐蔽的故障，例如由于软件错误导致的静默数据损坏）。

我们讨论了复制的三种主要方法：

单主复制
: 客户端将所有写入操作发送到单个节点（主库），该节点将数据更改事件流发送到其他副本（从库）。读取可以在任何副本上执行，但从库的读取结果可能是陈旧的。

多主复制
: 客户端将每个写入发送到几个主库节点之一，其中任何一个主库都可以接受写入。主库将数据更改事件流发送给彼此以及任何从库节点。

无主复制
: 客户端将每个写入发送到几个节点，并从多个节点并行读取，以检测和纠正具有陈旧数据的节点。

每种方法都有优点和缺点。单主复制是非常流行的，因为它很容易理解，不需要担心冲突解决。在出现故障节点、网络中断和延迟峰值的情况下，多主复制和无主复制可以更加健壮，其代价是难以推理并且仅提供非常弱的一致性保证。

复制可以是同步的，也可以是异步的，这在发生故障时对系统行为有深远的影响。尽管在系统运行平稳时异步复制速度很快，但是要弄清楚在复制延迟增加和服务器故障时会发生什么，这一点很重要。如果主库失败后你将一个异步更新的从库提升为新的主库，那么最近提交的数据可能会丢失。

我们研究了一些可能由复制延迟引起的奇怪效应，我们也讨论了一些有助于决定应用程序在复制延迟时的行为的一致性模型：

读己之写一致性
: 用户应该总是能看到自己提交的数据。

单调读
: 用户在看到某个时间点的数据后，他们不应该再看到该数据在更早时间点的情况。

一致前缀读
: 用户应该看到数据处于一种具有因果意义的状态：例如，按正确的顺序看到一个问题和对应的回答。

最后，我们讨论了多主复制和无主复制方法所固有的并发问题：因为他们允许多个写入并发发生，这可能会导致冲突。我们研究了一个数据库可以使用的算法来确定一个操作是否发生在另一个操作之前，或者它们是否并发发生。我们还谈到了通过合并并发更新来解决冲突的方法。

在下一章中，我们将继续考察数据分布在多台机器间的另一种不同于 **复制** 的形式：将大数据集分割成 **分区**。


## 参考文献

1. Bruce G. Lindsay, Patricia Griffiths Selinger, C. Galtieri, et al.: “[Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf),” IBM Research, Research Report RJ2571(33471), July 1979.
1. “[Oracle Active Data Guard Real-Time Data Protection and Availability](http://www.oracle.com/technetwork/database/availability/active-data-guard-wp-12c-1896127.pdf),” Oracle White Paper, June 2013.
1. “[AlwaysOn Availability Groups](http://msdn.microsoft.com/en-us/library/hh510230.aspx),” in *SQL Server Books Online*, Microsoft, 2012.
1. Lin Qiao, Kapil Surlaker, Shirshanka Das, et al.: “[On Brewing Fresh Espresso: LinkedIn’s Distributed Data Serving Platform](http://www.slideshare.net/amywtang/espresso-20952131),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Jun Rao: “[Intra-Cluster Replication for Apache Kafka](http://www.slideshare.net/junrao/kafka-replication-apachecon2013),” at *ApacheCon North America*, February 2013.
1. “[Highly Available Queues](https://www.rabbitmq.com/ha.html),” in *RabbitMQ Server Documentation*, Pivotal Software, Inc., 2014.
1. Yoshinori Matsunobu: “[Semi-Synchronous Replication at Facebook](http://yoshinorimatsunobu.blogspot.co.uk/2014/04/semi-synchronous-replication-at-facebook.html),” *yoshinorimatsunobu.blogspot.co.uk*, April 1, 2014.
1. Robbert van Renesse and Fred B. Schneider: “[Chain Replication for Supporting High Throughput and Availability](http://static.usenix.org/legacy/events/osdi04/tech/full_papers/renesse/renesse.pdf),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Jeff Terrace and Michael J. Freedman: “[Object Storage on CRAQ: High-Throughput Chain Replication for Read-Mostly Workloads](https://www.usenix.org/legacy/event/usenix09/tech/full_papers/terrace/terrace.pdf),” at *USENIX Annual Technical Conference* (ATC), June 2009.
1. Brad Calder, Ju Wang, Aaron Ogus, et al.: “[Windows Azure Storage: A Highly Available Cloud Storage Service with Strong Consistency](http://sigops.org/sosp/sosp11/current/2011-Cascais/printable/11-calder.pdf),” at *23rd ACM Symposium on Operating Systems Principles* (SOSP), October 2011.
1. Andrew Wang: “[Windows Azure Storage](https://www.umbrant.com/2016/02/04/windows-azure-storage/),” *umbrant.com*, February 4, 2016.
1. “[Percona Xtrabackup - Documentation](https://www.percona.com/doc/percona-xtrabackup/2.1/index.html),” Percona LLC, 2014.
1. Jesse Newland: “[GitHub Availability This Week](https://github.com/blog/1261-github-availability-this-week),” *github.com*, September 14, 2012.
1. Mark Imbriaco: “[Downtime Last Saturday](https://github.com/blog/1364-downtime-last-saturday),” *github.com*, December 26, 2012.
1. John Hugg: “[‘All in’ with Determinism for Performance and Testing in Distributed Systems](https://www.youtube.com/watch?v=gJRj3vJL4wE),” at *Strange Loop*, September 2015.
1. Amit Kapila: “[WAL Internals of PostgreSQL](http://www.pgcon.org/2012/schedule/attachments/258_212_Internals%20Of%20PostgreSQL%20Wal.pdf),” at *PostgreSQL Conference* (PGCon), May 2012.
1. [*MySQL Documentation*](https://dev.mysql.com/doc/refman/en/binary-log.html). Oracle, 2025.
1. Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, et al.: “[Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. “[Oracle GoldenGate 12c: Real-Time Access to Real-Time Information](https://web.archive.org/web/20200110231516/http://www.oracle.com/us/products/middleware/data-integration/oracle-goldengate-realtime-access-2031152.pdf),” Oracle White Paper, October 2013.
1. Shirshanka Das, Chavdar Botev, Kapil Surlaker, et al.: “[All Aboard the Databus!](http://www.socc2012.org/s18-das.pdf),” at *ACM Symposium on Cloud Computing* (SoCC), October 2012.
1. Greg Sabino Mullane: “[Version 5 of Bucardo Database Replication System](https://www.endpointdev.com/blog/2014/06/bucardo-5-multimaster-postgres-released/),” *blog.endpoint.com*, June 23, 2014.
1. Werner Vogels: “[Eventually Consistent](http://queue.acm.org/detail.cfm?id=1466448),” *ACM Queue*, volume 6, number 6, pages 14–19, October 2008. [doi:10.1145/1466443.1466448](http://dx.doi.org/10.1145/1466443.1466448)
1. Douglas B. Terry: “[Replicated Data Consistency Explained Through Baseball](https://www.microsoft.com/en-us/research/publication/replicated-data-consistency-explained-through-baseball/),” Microsoft Research, Technical Report MSR-TR-2011-137, October 2011.
1. Douglas B. Terry, Alan J. Demers, Karin Petersen, et al.: “[Session Guarantees for Weakly Consistent Replicated Data](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.71.2269&rep=rep1&type=pdf),” at *3rd International Conference on Parallel and Distributed Information Systems* (PDIS), September 1994. [doi:10.1109/PDIS.1994.331722](http://dx.doi.org/10.1109/PDIS.1994.331722)
1. Terry Pratchett: *Reaper Man: A Discworld Novel*. Victor Gollancz, 1991. ISBN: 978-0-575-04979-6
1. “[Tungsten Replicator](https://github.com/holys/tungsten-replicator),” *github.com*.
1. “[BDR 0.10.0 Documentation](https://web.archive.org/web/20160728020040/http://bdr-project.org/docs/next/index.html),” The PostgreSQL Global Development Group, *bdr-project.org*, 2015.
1. Robert Hodges: “[If You *Must* Deploy Multi-Master Replication, Read This First](http://scale-out-blog.blogspot.co.uk/2012/04/if-you-must-deploy-multi-master.html),” *scale-out-blog.blogspot.co.uk*, March 30, 2012.
1. J. Chris Anderson, Jan Lehnardt, and Noah Slater: *CouchDB: The Definitive Guide*. O'Reilly Media, 2010. ISBN: 978-0-596-15589-6
1. AppJet, Inc.: “[Etherpad and EasySync Technical Manual](https://github.com/ether/etherpad-lite/blob/e2ce9dc/doc/easysync/easysync-full-description.pdf),” *github.com*, March 26, 2011.
1. John Day-Richter: “[What’s Different About the New Google Docs: Making Collaboration Fast](https://drive.googleblog.com/2010/09/whats-different-about-new-google-docs.html),” *drive.googleblog.com*, September 23, 2010.
1. Martin Kleppmann and Alastair R. Beresford: “[A Conflict-Free Replicated JSON Datatype](http://arxiv.org/abs/1608.03960),” arXiv:1608.03960, August 13, 2016.
1. Frazer Clement: “[Eventual Consistency – Detecting Conflicts](http://messagepassing.blogspot.co.uk/2011/10/eventual-consistency-detecting.html),” *messagepassing.blogspot.co.uk*, October 20, 2011.
1. Robert Hodges: “[State of the Art for MySQL Multi-Master Replication](https://web.archive.org/web/20161010052017/https://www.percona.com/live/mysql-conference-2013/sites/default/files/slides/mysql-multi-master-state-of-art-2013-04-24_0.pdf),” at *Percona Live: MySQL Conference & Expo*, April 2013.
1. John Daily: “[Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/),” *riak.com*, November 12, 2013.
1. Riley Berton: “[Is Bi-Directional Replication (BDR) in Postgres Transactional?](https://web.archive.org/web/20211204170610/http://sdf.org/~riley/blog/2016/01/04/is-bi-directional-replication-bdr-in-postgres-transactional/),” *sdf.org*, January 4, 2016.
1. Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, et al.: “[Dynamo: Amazon's Highly Available Key-Value Store](http://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf),” at *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007.
1. Marc Shapiro, Nuno Preguiça, Carlos Baquero, and Marek Zawirski: “[A Comprehensive Study of Convergent and Commutative Replicated Data Types](http://hal.inria.fr/inria-00555588/),” INRIA Research Report no. 7506, January 2011.
1. Sam Elliott: “[CRDTs: An UPDATE (or Maybe Just a PUT)](https://speakerdeck.com/lenary/crdts-an-update-or-just-a-put),” at *RICON West*, October 2013.
1. Russell Brown: “[A Bluffers Guide to CRDTs in Riak](https://gist.github.com/russelldb/f92f44bdfb619e089a4d),” *gist.github.com*, October 28, 2013.
1. Benjamin Farinier, Thomas Gazagnaire, and Anil Madhavapeddy: “[Mergeable Persistent Data Structures](http://gazagnaire.org/pub/FGM15.pdf),” at *26es Journées Francophones des Langages Applicatifs* (JFLA), January 2015.
1. Chengzheng Sun and Clarence Ellis: “[Operational Transformation in Real-Time Group Editors: Issues, Algorithms, and Achievements](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.53.933&rep=rep1&type=pdf),” at *ACM Conference on Computer Supported Cooperative Work* (CSCW), November 1998.
1. Lars Hofhansl: “[HBASE-7709: Infinite Loop Possible in Master/Master Replication](https://issues.apache.org/jira/browse/HBASE-7709),” *issues.apache.org*, January 29, 2013.
1. David K. Gifford: “[Weighted Voting for Replicated Data](https://www.cs.cmu.edu/~15-749/READINGS/required/availability/gifford79.pdf),” at *7th ACM Symposium on Operating Systems Principles* (SOSP), December 1979. [doi:10.1145/800215.806583](http://dx.doi.org/10.1145/800215.806583)
1. Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman: “[Flexible Paxos: Quorum Intersection Revisited](https://arxiv.org/abs/1608.06696),” *arXiv:1608.06696*, August 24, 2016.
1. Joseph Blomstedt: “[Re: Absolute Consistency](https://web.archive.org/web/20190919171316/http://lists.basho.com:80/pipermail/riak-users_lists.basho.com/2012-January/007157.html),” email to *riak-users* mailing list, *lists.basho.com*, January 11, 2012.
1. Joseph Blomstedt: “[Bringing Consistency to Riak](https://vimeo.com/51973001),” at *RICON West*, October 2012.
1. Peter Bailis, Shivaram Venkataraman, Michael J. Franklin, et al.: “[Quantifying Eventual Consistency with PBS](http://www.bailis.org/papers/pbs-cacm2014.pdf),” *Communications of the ACM*, volume 57, number 8, pages 93–102, August 2014. [doi:10.1145/2632792](http://dx.doi.org/10.1145/2632792)
1. Jonathan Ellis: “[Modern Hinted Handoff](http://www.datastax.com/dev/blog/modern-hinted-handoff),” *datastax.com*, December 11, 2012.
1. “[Project Voldemort Wiki](https://github.com/voldemort/voldemort/wiki),” *github.com*, 2013.
1. “[Apache Cassandra Documentation](https://cassandra.apache.org/doc/latest/),” Apache Software Foundation, *cassandra.apache.org*.
1. “[Riak Enterprise: Multi-Datacenter Replication](https://web.archive.org/web/20150513041837/http://basho.com/assets/MultiDatacenter_Replication.pdf).” Technical whitepaper, Basho Technologies, Inc., September 2014.
1. Jonathan Ellis: “[Why Cassandra Doesn't Need Vector Clocks](http://www.datastax.com/dev/blog/why-cassandra-doesnt-need-vector-clocks),” *datastax.com*, September 2, 2013.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Joel Jacobson: “[Riak 2.0: Data Types](https://web.archive.org/web/20160327135816/http://blog.joeljacobson.com/riak-2-0-data-types/),” *blog.joeljacobson.com*, March 23, 2014.
1. D. Stott Parker Jr., Gerald J. Popek, Gerard Rudisin, et al.: “[Detection of Mutual Inconsistency in Distributed Systems](https://web.archive.org/web/20170808212704/https://zoo.cs.yale.edu/classes/cs426/2013/bib/parker83detection.pdf),” *IEEE Transactions on Software Engineering*, volume 9, number 3, pages 240–247, May 1983. [doi:10.1109/TSE.1983.236733](http://dx.doi.org/10.1109/TSE.1983.236733)
1. Nuno Preguiça, Carlos Baquero, Paulo Sérgio Almeida, et al.: “[Dotted Version Vectors: Logical Clocks for Optimistic Replication](http://arxiv.org/pdf/1011.5808v1.pdf),” arXiv:1011.5808, November 26, 2010.
1. Sean Cribbs: “[A Brief History of Time in Riak](https://speakerdeck.com/seancribbs/a-brief-history-of-time-in-riak),” at *RICON*, October 2014.
1. Russell Brown: “[Vector Clocks Revisited Part 2: Dotted Version Vectors](https://riak.com/posts/technical/vector-clocks-revisited-part-2-dotted-version-vectors/),” *basho.com*, November 10, 2015.
1. Carlos Baquero: “[Version Vectors Are Not Vector Clocks](https://haslab.wordpress.com/2011/07/08/version-vectors-are-not-vector-clocks/),” *haslab.wordpress.com*, July 8, 2011.
1. Reinhard Schwarz and Friedemann Mattern: “[Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](http://dcg.ethz.ch/lectures/hs08/seminar/papers/mattern4.pdf),” *Distributed Computing*, volume 7, number 3, pages 149–174, March 1994. [doi:10.1007/BF02277859](http://dx.doi.org/10.1007/BF02277859)


================================================
FILE: content/v1/ch6.md
================================================
---
title: "第六章：分区"
linkTitle: "6. 分区"
weight: 206
breadcrumbs: false
---


![](/map/ch06.png)

> 我们必须跳出电脑指令序列的窠臼。叙述定义、描述元数据、梳理关系，而不是编写过程。
>
> —— Grace Murray Hopper，未来的计算机及其管理（1962）


在 [第五章](/v1/ch5) 中，我们讨论了复制 —— 即数据在不同节点上的副本，对于非常大的数据集，或非常高的吞吐量，仅仅进行复制是不够的：我们需要将数据进行 **分区（partitions）**，也称为 **分片（sharding）**[^i]。

[^i]: 正如本章所讨论的，分区是一种有意将大型数据库分解成小型数据库的方式。它与 **网络分区（network partitions, netsplits）** 无关，这是节点之间网络故障的一种。我们将在 [第八章](/v1/ch8) 讨论这些错误。

> [!TIP] 术语澄清
>
> 上文中的 **分区（partition）**，在 MongoDB，Elasticsearch 和 Solr Cloud 中被称为 **分片（shard）**，在 HBase 中称之为 **区域（Region）**，Bigtable 中则是 **表块（tablet）**，Cassandra 和 Riak 中是 **虚节点（vnode）**，Couchbase 中叫做 **虚桶（vBucket）**。但是 **分区（partitioning）** 是最约定俗成的叫法。

通常情况下，每条数据（每条记录，每行或每个文档）属于且仅属于一个分区。有很多方法可以实现这一点，本章将进行深入讨论。实际上，每个分区都是自己的小型数据库，尽管数据库可能支持同时进行多个分区的操作。

分区主要是为了 **可伸缩性**。不同的分区可以放在不共享集群中的不同节点上（请参阅 [第二部分](/v1/part-ii) 关于 [无共享架构](/v1/part-ii#无共享架构) 的定义）。因此，大数据集可以分布在多个磁盘上，并且查询负载可以分布在多个处理器上。

对于在单个分区上运行的查询，每个节点可以独立执行对自己分区的查询，因此可以通过添加更多的节点来扩大查询吞吐量。大型，复杂的查询可能会跨越多个节点并行处理，尽管这也带来了新的困难。

分区数据库在 20 世纪 80 年代由 Teradata 和 NonStop SQL【1】等产品率先推出，最近因为 NoSQL 数据库和基于 Hadoop 的数据仓库重新被关注。有些系统是为事务性工作设计的，有些系统则用于分析（请参阅 “[事务处理还是分析](/v1/ch3#事务处理还是分析？)”）：这种差异会影响系统的运作方式，但是分区的基本原理均适用于这两种工作方式。

在本章中，我们将首先介绍分割大型数据集的不同方法，并观察索引如何与分区配合。然后我们将讨论 [分区再平衡（rebalancing）](#分区再平衡)，如果想要添加或删除集群中的节点，则必须进行再平衡。最后，我们将概述数据库如何将请求路由到正确的分区并执行查询。

## 分区与复制

分区通常与复制结合使用，使得每个分区的副本存储在多个节点上。这意味着，即使每条记录属于一个分区，它仍然可以存储在多个不同的节点上以获得容错能力。

一个节点可能存储多个分区。如果使用主从复制模型，则分区和复制的组合如 [图 6-1](/v1/ddia_0601.png) 所示。每个分区领导者（主库）被分配给一个节点，追随者（从库）被分配给其他节点。每个节点可能是某些分区的主库，同时是其他分区的从库。

我们在 [第五章](/v1/ch5) 讨论的关于数据库复制的所有内容同样适用于分区的复制。大多数情况下，分区方案的选择与复制方案的选择是独立的，为简单起见，本章中将忽略复制。

![](/v1/ddia_0601.png)

**图 6-1 组合使用复制和分区：每个节点充当某些分区的主库，同时充当其他分区的从库。**

## 键值数据的分区

假设你有大量数据并且想要分区，如何决定在哪些节点上存储哪些记录呢？

分区目标是将数据和查询负载均匀分布在各个节点上。如果每个节点公平分享数据和负载，那么理论上 10 个节点应该能够处理 10 倍的数据量和 10 倍的单个节点的读写吞吐量（暂时忽略复制）。

如果分区是不公平的，一些分区比其他分区有更多的数据或查询，我们称之为 **偏斜（skew）**。数据偏斜的存在使分区效率下降很多。在极端的情况下，所有的负载可能压在一个分区上，其余 9 个节点都处于空闲状态，瓶颈落在这一个繁忙的节点上。不均衡导致的高负载的分区被称为 **热点（hot spot）**。

避免热点最简单的方法是将记录随机分配给节点。这将在所有节点上平均分配数据，但是它有一个很大的缺点：当你试图读取一个特定的值时，你无法知道它在哪个节点上，所以你必须并行地查询所有的节点。

我们可以做得更好。现在假设你有一个简单的键值数据模型，其中你总是通过其主键访问记录。例如，在一本老式的纸质百科全书中，你可以通过标题来查找一个条目；由于所有条目按字母顺序排序，因此你可以快速找到你要查找的条目。

### 根据键的范围分区

一种分区的方法是为每个分区指定一块连续的键范围（从最小值到最大值），如纸质百科全书的卷（[图 6-2](/v1/ddia_0602.png)）。如果知道范围之间的边界，则可以轻松确定哪个分区包含某个值。如果你还知道分区所在的节点，那么可以直接向相应的节点发出请求（对于百科全书而言，就像从书架上选取正确的书籍）。

![](/v1/ddia_0602.png)

**图 6-2 印刷版百科全书按照关键字范围进行分区**

键的范围不一定均匀分布，因为数据也很可能不均匀分布。例如在 [图 6-2](/v1/ddia_0602.png) 中，第 1 卷包含以 A 和 B 开头的单词，但第 12 卷则包含以 T、U、V、X、Y 和 Z 开头的单词。只是简单的规定每个卷包含两个字母会导致一些卷比其他卷大。为了均匀分配数据，分区边界需要依据数据调整。

分区边界可以由管理员手动选择，也可以由数据库自动选择（我们会在 “[分区再平衡](#分区再平衡)” 中更详细地讨论分区边界的选择）。Bigtable 使用了这种分区策略，以及其开源等价物 HBase 【2, 3】、RethinkDB 和 2.4 版本之前的 MongoDB 【4】。

在每个分区中，我们可以按照一定的顺序保存键（请参阅 “[SSTables 和 LSM 树](/v1/ch3#SSTables和LSM树)”）。好处是进行范围扫描非常简单，你可以将键作为联合索引来处理，以便在一次查询中获取多个相关记录（请参阅 “[多列索引](/v1/ch3#多列索引)”）。例如，假设我们有一个程序来存储传感器网络的数据，其中主键是测量的时间戳（年月日时分秒）。范围扫描在这种情况下非常有用，因为我们可以轻松获取某个月份的所有数据。

然而，Key Range 分区的缺点是某些特定的访问模式会导致热点。如果主键是时间戳，则分区对应于时间范围，例如，给每天分配一个分区。不幸的是，由于我们在测量发生时将数据从传感器写入数据库，因此所有写入操作都会转到同一个分区（即今天的分区），这样分区可能会因写入而过载，而其他分区则处于空闲状态【5】。

为了避免传感器数据库中的这个问题，需要使用除了时间戳以外的其他东西作为主键的第一个部分。例如，可以在每个时间戳前添加传感器名称，这样会首先按传感器名称，然后按时间进行分区。假设有多个传感器同时运行，写入负载将最终均匀分布在不同分区上。现在，当想要在一个时间范围内获取多个传感器的值时，你需要为每个传感器名称执行一个单独的范围查询。

### 根据键的散列分区

由于偏斜和热点的风险，许多分布式数据存储使用散列函数来确定给定键的分区。

一个好的散列函数可以将偏斜的数据均匀分布。假设你有一个 32 位散列函数，无论何时给定一个新的字符串输入，它将返回一个 0 到 $2^{32} - 1$ 之间的 “随机” 数。即使输入的字符串非常相似，它们的散列也会均匀分布在这个数字范围内。

出于分区的目的，散列函数不需要多么强壮的加密算法：例如，Cassandra 和 MongoDB 使用 MD5，Voldemort 使用 Fowler-Noll-Vo 函数。许多编程语言都有内置的简单哈希函数（它们用于散列表），但是它们可能不适合分区：例如，在 Java 的 `Object.hashCode()` 和 Ruby 的 `Object#hash`，同一个键可能在不同的进程中有不同的哈希值【6】。

一旦你有一个合适的键散列函数，你可以为每个分区分配一个散列范围（而不是键的范围），每个通过哈希散列落在分区范围内的键将被存储在该分区中。如 [图 6-3](/v1/ddia_0603.png) 所示。

![](/v1/ddia_0603.png)

**图 6-3 按哈希键分区**

这种技术擅长在分区之间公平地分配键。分区边界可以是均匀间隔的，也可以是伪随机选择的（在这种情况下，该技术有时也被称为 **一致性哈希**，即 consistent hashing）。

> #### 一致性哈希
>
> 一致性哈希由 Karger 等人定义【7】，是一种在互联网规模的缓存系统（例如 CDN）中均匀分配负载的方法。它使用随机选择的 **分区边界（partition boundaries）** 来避免中央控制或分布式共识的需要。请注意，这里的一致性与复制一致性（请参阅 [第五章](/v1/ch5)）或 ACID 一致性（请参阅 [第七章](/v1/ch7)）无关，而只是描述了一种再平衡（rebalancing）的特定方法。
>
> 正如我们将在 “[分区再平衡](#分区再平衡)” 中所看到的，这种特殊的方法对于数据库实际上并不是很好，所以在实际中很少使用（某些数据库的文档仍然会使用一致性哈希的说法，但是它往往是不准确的）。因为有可能产生混淆，所以最好避免使用一致性哈希这个术语，而只是把它称为 **散列分区（hash partitioning）**。

不幸的是，通过使用键散列进行分区，我们失去了键范围分区的一个很好的属性：高效执行范围查询的能力。曾经相邻的键现在分散在所有分区中，所以它们之间的顺序就丢失了。在 MongoDB 中，如果你使用了基于散列的分区模式，则任何范围查询都必须发送到所有分区【4】。Riak【9】、Couchbase 【10】或 Voldemort 不支持主键上的范围查询。

Cassandra 采取了折衷的策略【11, 12, 13】。Cassandra 中的表可以使用由多个列组成的复合主键来声明。键中只有第一列会作为散列的依据，而其他列则被用作 Cassandra 的 SSTables 中排序数据的联合索引。尽管查询无法在复合主键的第一列中按范围扫表，但如果第一列已经指定了固定值，则可以对该键的其他列执行有效的范围扫描。

组合索引方法为一对多关系提供了一个优雅的数据模型。例如，在社交媒体网站上，一个用户可能会发布很多更新。如果更新的主键被选择为 `(user_id, update_timestamp)`，那么你可以有效地检索特定用户在某个时间间隔内按时间戳排序的所有更新。不同的用户可以存储在不同的分区上，对于每个用户，更新按时间戳顺序存储在单个分区上。

### 负载偏斜与热点消除

如前所述，哈希分区可以帮助减少热点。但是，它不能完全避免它们：在极端情况下，所有的读写操作都是针对同一个键的，所有的请求都会被路由到同一个分区。

这种场景也许并不常见，但并非闻所未闻：例如，在社交媒体网站上，一个拥有数百万追随者的名人用户在做某事时可能会引发一场风暴【14】。这个事件可能导致同一个键的大量写入（键可能是名人的用户 ID，或者人们正在评论的动作的 ID）。哈希策略不起作用，因为两个相同 ID 的哈希值仍然是相同的。

如今，大多数数据系统无法自动补偿这种高度偏斜的负载，因此应用程序有责任减少偏斜。例如，如果一个主键被认为是非常火爆的，一个简单的方法是在主键的开始或结尾添加一个随机数。只要一个两位数的十进制随机数就可以将主键分散为 100 种不同的主键，从而存储在不同的分区中。

然而，将主键进行分割之后，任何读取都必须要做额外的工作，因为他们必须从所有 100 个主键分布中读取数据并将其合并。此技术还需要额外的记录：只需要对少量热点附加随机数；对于写入吞吐量低的绝大多数主键来说是不必要的开销。因此，你还需要一些方法来跟踪哪些键需要被分割。

也许在将来，数据系统将能够自动检测和补偿偏斜的工作负载；但现在，你需要自己来权衡。


## 分区与次级索引


到目前为止，我们讨论的分区方案依赖于键值数据模型。如果只通过主键访问记录，我们可以从该键确定分区，并使用它来将读写请求路由到负责该键的分区。

如果涉及次级索引，情况会变得更加复杂（参考 “[其他索引结构](/v1/ch3#其他索引结构)”）。次级索引通常并不能唯一地标识记录，而是一种搜索记录中出现特定值的方式：查找用户 123 的所有操作、查找包含词语 `hogwash` 的所有文章、查找所有颜色为红色的车辆等等。

次级索引是关系型数据库的基础，并且在文档数据库中也很普遍。许多键值存储（如 HBase 和 Voldemort）为了减少实现的复杂度而放弃了次级索引，但是一些（如 Riak）已经开始添加它们，因为它们对于数据模型实在是太有用了。并且次级索引也是 Solr 和 Elasticsearch 等搜索服务器的基石。

次级索引的问题是它们不能整齐地映射到分区。有两种用次级索引对数据库进行分区的方法：**基于文档的分区（document-based）** 和 **基于关键词（term-based）的分区**。

### 基于文档的次级索引进行分区

假设你正在经营一个销售二手车的网站（如 [图 6-4](/v1/ddia_0604.png) 所示）。每个列表都有一个唯一的 ID—— 称之为文档 ID—— 并且用文档 ID 对数据库进行分区（例如，分区 0 中的 ID 0 到 499，分区 1 中的 ID 500 到 999 等）。

你想让用户搜索汽车，允许他们通过颜色和厂商过滤，所以需要一个在颜色和厂商上的次级索引（文档数据库中这些是 **字段（field）**，关系数据库中这些是 **列（column）** ）。如果你声明了索引，则数据库可以自动执行索引 [^ii]。例如，无论何时将红色汽车添加到数据库，数据库分区都会自动将其添加到索引条目 `color:red` 的文档 ID 列表中。

[^ii]: 如果数据库仅支持键值模型，则你可能会尝试在应用程序代码中创建从值到文档 ID 的映射来实现次级索引。如果沿着这条路线走下去，请万分小心，确保你的索引与底层数据保持一致。竞争条件和间歇性写入失败（其中一些更改已保存，但其他更改未保存）很容易导致数据不同步 - 请参阅 “[多对象事务的需求](/v1/ch7#多对象事务的需求)”。

![](/v1/ddia_0604.png)

**图 6-4 基于文档的次级索引进行分区**

在这种索引方法中，每个分区是完全独立的：每个分区维护自己的次级索引，仅覆盖该分区中的文档。它不关心存储在其他分区的数据。无论何时你需要写入数据库（添加，删除或更新文档），只需处理包含你正在编写的文档 ID 的分区即可。出于这个原因，**文档分区索引** 也被称为 **本地索引**（而不是将在下一节中描述的 **全局索引**）。

但是，从文档分区索引中读取需要注意：除非你对文档 ID 做了特别的处理，否则没有理由将所有具有特定颜色或特定品牌的汽车放在同一个分区中。在 [图 6-4](/v1/ddia_0604.png) 中，红色汽车出现在分区 0 和分区 1 中。因此，如果要搜索红色汽车，则需要将查询发送到所有分区，并合并所有返回的结果。


这种查询分区数据库的方法有时被称为 **分散 / 聚集（scatter/gather）**，并且可能会使次级索引上的读取查询相当昂贵。即使并行查询分区，分散 / 聚集也容易导致尾部延迟放大（请参阅 “[实践中的百分位点](/v1/ch1#实践中的百分位点)”）。然而，它被广泛使用：MongoDB，Riak 【15】，Cassandra 【16】，Elasticsearch 【17】，SolrCloud 【18】和 VoltDB 【19】都使用文档分区次级索引。大多数数据库供应商建议你构建一个能从单个分区提供次级索引查询的分区方案，但这并不总是可行，尤其是当在单个查询中使用多个次级索引时（例如同时需要按颜色和制造商查询）。


### 基于关键词(Term)的次级索引进行分区

我们可以构建一个覆盖所有分区数据的 **全局索引**，而不是给每个分区创建自己的次级索引（本地索引）。但是，我们不能只把这个索引存储在一个节点上，因为它可能会成为瓶颈，违背了分区的目的。全局索引也必须进行分区，但可以采用与主键不同的分区方式。

[图 6-5](/v1/ddia_0605.png) 描述了这可能是什么样子：来自所有分区的红色汽车在红色索引中，并且索引是分区的，首字母从 `a` 到 `r` 的颜色在分区 0 中，`s` 到 `z` 的在分区 1。汽车制造商的索引也与之类似（分区边界在 `f` 和 `h` 之间）。

![](/v1/ddia_0605.png)

**图 6-5 基于关键词对次级索引进行分区**

我们将这种索引称为 **关键词分区（term-partitioned）**，因为我们寻找的关键词决定了索引的分区方式。例如，一个关键词可能是：`color:red`。**关键词（Term）** 这个名称来源于全文搜索索引（一种特殊的次级索引），指文档中出现的所有单词。

和之前一样，我们可以通过 **关键词** 本身或者它的散列进行索引分区。根据关键词本身来分区对于范围扫描非常有用（例如对于数值类的属性，像汽车的报价），而对关键词的哈希分区提供了负载均衡的能力。

关键词分区的全局索引优于文档分区索引的地方是它可以使读取更有效率：不需要 **分散 / 聚集** 所有分区，客户端只需要向包含关键词的分区发出请求。全局索引的缺点在于写入速度较慢且较为复杂，因为写入单个文档现在可能会影响索引的多个分区（文档中的每个关键词可能位于不同的分区或者不同的节点上）。

理想情况下，索引总是最新的，写入数据库的每个文档都会立即反映在索引中。但是，在关键词分区索引中，这需要跨分区的分布式事务，并不是所有数据库都支持（请参阅 [第七章](/v1/ch7) 和 [第九章](/v1/ch9)）。

在实践中，对全局次级索引的更新通常是 **异步** 的（也就是说，如果在写入之后不久读取索引，刚才所做的更改可能尚未反映在索引中）。例如，Amazon DynamoDB 声称在正常情况下，其全局次级索引会在不到一秒的时间内更新，但在基础架构出现故障的情况下可能会有延迟【20】。

全局关键词分区索引的其他用途包括 Riak 的搜索功能【21】和 Oracle 数据仓库，它允许你在本地和全局索引之间进行选择【22】。我们将在 [第十二章](/v1/ch12) 中继续关键词分区次级索引实现的话题。

## 分区再平衡

随着时间的推移，数据库会有各种变化：

* 查询吞吐量增加，所以你想要添加更多的 CPU 来处理负载。
* 数据集大小增加，所以你想添加更多的磁盘和 RAM 来存储它。
* 机器出现故障，其他机器需要接管故障机器的责任。

所有这些更改都需要数据和请求从一个节点移动到另一个节点。将负载从集群中的一个节点向另一个节点移动的过程称为 **再平衡（rebalancing）**。

无论使用哪种分区方案，再平衡通常都要满足一些最低要求：

* 再平衡之后，负载（数据存储，读取和写入请求）应该在集群中的节点之间公平地共享。
* 再平衡发生时，数据库应该继续接受读取和写入。
* 节点之间只移动必须的数据，以便快速再平衡，并减少网络和磁盘 I/O 负载。


### 再平衡策略

有几种不同的分区分配方法【23】，让我们依次简要讨论一下。

#### 反面教材：hash mod N

我们在前面说过（[图 6-3](/v1/ddia_0603.png)），最好将可能的散列分成不同的范围，并将每个范围分配给一个分区（例如，如果 $0 ≤ hash(key) < b_0$，则将键分配给分区 0，如果 $b_0 ≤ hash(key) < b_1$，则分配给分区 1）。

也许你想知道为什么我们不使用 ***取模（mod）***（许多编程语言中的 % 运算符）。例如，`hash(key) mod 10` 会返回一个介于 0 和 9 之间的数字（如果我们将散列写为十进制数，散列模 10 将是最后一个数字）。如果我们有 10 个节点，编号为 0 到 9，这似乎是将每个键分配给一个节点的简单方法。

模 N（$mod N$）方法的问题是，如果节点数量 N 发生变化，大多数键将需要从一个节点移动到另一个节点。例如，假设 $hash(key)=123456$。如果最初有 10 个节点，那么这个键一开始放在节点 6 上（因为 $123456\ mod\ 10 = 6$）。当你增长到 11 个节点时，键需要移动到节点 3（$123456\ mod\ 11 = 3$），当你增长到 12 个节点时，需要移动到节点 0（$123456\ mod\ 12 = 0$）。这种频繁的数据迁移使得再平衡的成本过高。

我们需要一种只移动必需数据的方法。

#### 固定数量的分区

幸运的是，有一个相当简单的解决方案：创建比节点更多的分区，并为每个节点分配多个分区。例如，运行在 10 个节点的集群上的数据库可能会从一开始就被拆分为 1,000 个分区，因此大约有 100 个分区被分配给每个节点。

现在，如果一个节点被添加到集群中，新节点可以从当前每个节点中 **窃取** 一些分区，直到分区再次公平分配。这个过程如 [图 6-6](/v1/ddia_0606.png) 所示。如果从集群中删除一个节点，则会发生相反的情况。

只有整个分区在节点之间移动。分区的数量不会改变，键所指定的分区也不会改变。唯一改变的是分区所在的节点。这种变更并不是即时的 — 在网络上传输大量的数据需要一些时间 — 所以在传输过程中，原有分区仍然会接受读写操作。

![](/v1/ddia_0606.png)

**图 6-6 将新节点添加到每个节点具有多个分区的数据库集群。**

原则上，你甚至可以解决集群中的硬件不匹配问题：通过为更强大的节点分配更多的分区，可以强制这些节点承载更多的负载。在 Riak 【15】、Elasticsearch 【24】、Couchbase 【10】和 Voldemort 【25】中使用了这种再平衡的方法。

在这种配置中，分区的数量通常在数据库第一次建立时确定，之后不会改变。虽然原则上可以分割和合并分区（请参阅下一节），但固定数量的分区在操作上更简单，因此许多固定分区数据库选择不实施分区分割。因此，一开始配置的分区数就是你可以拥有的最大节点数量，所以你需要选择足够多的分区以适应未来的增长。但是，每个分区也有管理开销，所以选择太大的数字会适得其反。

如果数据集的总大小难以预估（例如，可能它开始很小，但随着时间的推移会变得更大），选择正确的分区数是困难的。由于每个分区包含了总数据量固定比率的数据，因此每个分区的大小与集群中的数据总量成比例增长。如果分区非常大，再平衡和从节点故障恢复变得昂贵。但是，如果分区太小，则会产生太多的开销。当分区大小 “恰到好处” 的时候才能获得很好的性能，如果分区数量固定，但数据量变动很大，则难以达到最佳性能。

#### 动态分区

对于使用键范围分区的数据库（请参阅 “[根据键的范围分区](#根据键的范围分区)”），具有固定边界的固定数量的分区将非常不便：如果边界设置错误，可能会导致所有数据都在一个分区中，而其他分区则为空。手动重新配置分区边界将非常繁琐。

出于这个原因，按键的范围进行分区的数据库（如 HBase 和 RethinkDB）会动态创建分区。当分区增长到超过配置的大小时（在 HBase 上，默认值是 10GB），会被分成两个分区，每个分区约占一半的数据【26】。与之相反，如果大量数据被删除并且分区缩小到某个阈值以下，则可以将其与相邻分区合并。此过程与 B 树顶层发生的过程类似（请参阅 “[B 树](/v1/ch3#B树)”）。

每个分区分配给一个节点，每个节点可以处理多个分区，就像固定数量的分区一样。大型分区拆分后，可以将其中的一半转移到另一个节点，以平衡负载。在 HBase 中，分区文件的传输通过 HDFS（底层使用的分布式文件系统）来实现【3】。

动态分区的一个优点是分区数量适应总数据量。如果只有少量的数据，少量的分区就足够了，所以开销很小；如果有大量的数据，每个分区的大小被限制在一个可配置的最大值【23】。

需要注意的是，一个空的数据库从一个分区开始，因为没有关于在哪里绘制分区边界的先验信息。数据集开始时很小，直到达到第一个分区的分割点，所有写入操作都必须由单个节点处理，而其他节点则处于空闲状态。为了解决这个问题，HBase 和 MongoDB 允许在一个空的数据库上配置一组初始分区（这被称为 **预分割**，即 pre-splitting）。在键范围分区的情况中，预分割需要提前知道键是如何进行分配的【4,26】。

动态分区不仅适用于数据的范围分区，而且也适用于散列分区。从版本 2.4 开始，MongoDB 同时支持范围和散列分区，并且都支持动态分割分区。

#### 按节点比例分区

通过动态分区，分区的数量与数据集的大小成正比，因为拆分和合并过程将每个分区的大小保持在固定的最小值和最大值之间。另一方面，对于固定数量的分区，每个分区的大小与数据集的大小成正比。在这两种情况下，分区的数量都与节点的数量无关。

Cassandra 和 Ketama 使用的第三种方法是使分区数与节点数成正比 —— 换句话说，每个节点具有固定数量的分区【23,27,28】。在这种情况下，当节点数量保持不变时，每个分区的大小与数据集大小成比例地增长；但是当增加节点数时，分区将再次变小。由于较大的数据量通常需要较大数量的节点进行存储，因此这种方法也使每个分区的大小较为稳定。

当一个新节点加入集群时，它随机选择固定数量的现有分区进行拆分，然后占有这些拆分分区中每个分区的一半，同时将每个分区的另一半留在原地。随机化可能会产生不公平的分割，但是平均在更大数量的分区上时（在 Cassandra 中，默认情况下，每个节点有 256 个分区），新节点最终从现有节点获得公平的负载份额。Cassandra 3.0 引入了另一种再平衡的算法来避免不公平的分割【29】。

随机选择分区边界要求使用基于散列的分区（可以从散列函数产生的数字范围中挑选边界）。实际上，这种方法最符合一致性哈希的原始定义【7】（请参阅 “[一致性哈希](#一致性哈希)”）。最新的哈希函数可以在较低元数据开销的情况下达到类似的效果【8】。

### 运维：手动还是自动再平衡

关于再平衡有一个重要问题：自动还是手动进行？

在全自动再平衡（系统自动决定何时将分区从一个节点移动到另一个节点，无须人工干预）和完全手动（分区指派给节点由管理员明确配置，仅在管理员明确重新配置时才会更改）之间有一个权衡。例如，Couchbase、Riak 和 Voldemort 会自动生成建议的分区分配，但需要管理员提交才能生效。

全自动再平衡可以很方便，因为正常维护的操作工作较少。然而，它可能是不可预测的。再平衡是一个昂贵的操作，因为它需要重新路由请求并将大量数据从一个节点移动到另一个节点。如果没有做好，这个过程可能会使网络或节点负载过重，降低其他请求的性能。

这种自动化与自动故障检测相结合可能十分危险。例如，假设一个节点过载，并且对请求的响应暂时很慢。其他节点得出结论：过载的节点已经死亡，并自动重新平衡集群，使负载离开它。这会对已经超负荷的节点，其他节点和网络造成额外的负载，从而使情况变得更糟，并可能导致级联失败。

出于这个原因，再平衡的过程中有人参与是一件好事。这比全自动的过程慢，但可以帮助防止运维意外。

## 请求路由

现在我们已经将数据集分割到多个机器上运行的多个节点上。但是仍然存在一个悬而未决的问题：当客户想要发出请求时，如何知道要连接哪个节点？随着分区的重新平衡，分区对节点的分配也发生变化。为了回答这个问题，需要有人知晓这些变化：如果我想读或写键 “foo”，需要连接哪个 IP 地址和端口号？

这个问题可以概括为 **服务发现（service discovery）** ，它不仅限于数据库。任何可通过网络访问的软件都有这个问题，特别是如果它的目标是高可用性（在多台机器上运行冗余配置）。许多公司已经编写了自己的内部服务发现工具，其中许多已经作为开源发布【30】。

概括来说，这个问题有几种不同的方案（如图 6-7 所示）:

1. 允许客户联系任何节点（例如，通过 **循环策略的负载均衡**，即 Round-Robin Load Balancer）。如果该节点恰巧拥有请求的分区，则它可以直接处理该请求；否则，它将请求转发到适当的节点，接收回复并传递给客户端。
2. 首先将所有来自客户端的请求发送到路由层，它决定了应该处理请求的节点，并相应地转发。此路由层本身不处理任何请求；它仅充当一个分区感知的负载均衡器。
3. 要求客户端知道分区和节点的分配。在这种情况下，客户端可以直接连接到适当的节点，而不需要任何中介。

以上所有情况中的关键问题是：作出路由决策的组件（可能是节点之一、路由层或客户端）如何了解分区 - 节点之间的分配关系变化？

![](/v1/ddia_0607.png)

**图 6-7 将请求路由到正确节点的三种不同方式。**

这是一个具有挑战性的问题，因为重要的是所有参与者都达成共识 - 否则请求将被发送到错误的节点，得不到正确的处理。在分布式系统中有达成共识的协议，但很难正确地实现（见 [第九章](/v1/ch9)）。

许多分布式数据系统都依赖于一个独立的协调服务，比如 ZooKeeper 来跟踪集群元数据，如 [图 6-8](/v1/ddia_0608.png) 所示。每个节点在 ZooKeeper 中注册自己，ZooKeeper 维护分区到节点的可靠映射。其他参与者（如路由层或分区感知客户端）可以在 ZooKeeper 中订阅此信息。只要分区分配发生了改变，或者集群中添加或删除了一个节点，ZooKeeper 就会通知路由层使路由信息保持最新状态。

![](/v1/ddia_0608.png)

**图 6-8 使用 ZooKeeper 跟踪分区分配给节点。**

例如，LinkedIn 的 Espresso 使用 Helix 【31】进行集群管理（依靠 ZooKeeper），实现了如 [图 6-8](/v1/ddia_0608.png) 所示的路由层。HBase、SolrCloud 和 Kafka 也使用 ZooKeeper 来跟踪分区分配。MongoDB 具有类似的体系结构，但它依赖于自己的 **配置服务器（config server）** 实现和 mongos 守护进程作为路由层。

Cassandra 和 Riak 采取不同的方法：他们在节点之间使用 **流言协议（gossip protocol）** 来传播集群状态的变化。请求可以发送到任意节点，该节点会转发到包含所请求的分区的适当节点（[图 6-7](/v1/ddia_0607.png) 中的方法 1）。这个模型在数据库节点中增加了更多的复杂性，但是避免了对像 ZooKeeper 这样的外部协调服务的依赖。

Couchbase 不会自动进行再平衡，这简化了设计。通常情况下，它配置了一个名为 moxi 的路由层，它会从集群节点了解路由变化【32】。

当使用路由层或向随机节点发送请求时，客户端仍然需要找到要连接的 IP 地址。这些地址并不像分区的节点分布变化得那么快，所以使用 DNS 通常就足够了。

### 执行并行查询

到目前为止，我们只关注读取或写入单个键的非常简单的查询（加上基于文档分区的次级索引场景下的分散 / 聚集查询）。这也是大多数 NoSQL 分布式数据存储所支持的访问层级。

然而，通常用于分析的 **大规模并行处理（MPP, Massively parallel processing）** 关系型数据库产品在其支持的查询类型方面要复杂得多。一个典型的数据仓库查询包含多个连接，过滤，分组和聚合操作。MPP 查询优化器将这个复杂的查询分解成许多执行阶段和分区，其中许多可以在数据库集群的不同节点上并行执行。涉及扫描大规模数据集的查询特别受益于这种并行执行。

数据仓库查询的快速并行执行是一个专门的话题，由于分析有很重要的商业意义，它受到了大量的商业关注。我们将在 [第十章](/v1/ch10) 讨论并行查询执行的一些技巧。有关并行数据库中使用的技术的更详细的概述，请参阅参考文献【1,33】。

## 本章小结

在本章中，我们探讨了将大数据集划分成更小的子集的不同方法。数据量非常大的时候，在单台机器上存储和处理不再可行，而分区则十分必要。分区的目标是在多台机器上均匀分布数据和查询负载，避免出现热点（负载不成比例的节点）。这需要选择适合于你的数据的分区方案，并在将节点添加到集群或从集群删除时重新平衡分区。

我们讨论了两种主要的分区方法：

* 键范围分区

  其中键是有序的，并且分区拥有从某个最小值到某个最大值的所有键。排序的优势在于可以进行有效的范围查询，但是如果应用程序经常访问相邻的键，则存在热点的风险。

  在这种方法中，当分区变得太大时，通常将分区分成两个子分区来动态地重新平衡分区。

* 散列分区

  散列函数应用于每个键，分区拥有一定范围的散列。这种方法破坏了键的排序，使得范围查询效率低下，但可以更均匀地分配负载。

  通过散列进行分区时，通常先提前创建固定数量的分区，为每个节点分配多个分区，并在添加或删除节点时将整个分区从一个节点移动到另一个节点。也可以使用动态分区。

两种方法搭配使用也是可行的，例如使用复合主键：使用键的一部分来标识分区，而使用另一部分作为排序顺序。

我们还讨论了分区和次级索引之间的相互作用。次级索引也需要分区，有两种方法：

* 基于文档分区（本地索引），其中次级索引存储在与主键和值相同的分区中。这意味着只有一个分区需要在写入时更新，但是读取次级索引需要在所有分区之间进行分散 / 聚集。
* 基于关键词分区（全局索引），其中次级索引存在不同的分区中。次级索引中的条目可以包括来自主键的所有分区的记录。当文档写入时，需要更新多个分区中的次级索引；但是可以从单个分区中进行读取。

最后，我们讨论了将查询路由到适当的分区的技术，从简单的分区负载平衡到复杂的并行查询执行引擎。

按照设计，多数情况下每个分区是独立运行的 — 这就是分区数据库可以伸缩到多台机器的原因。但是，需要写入多个分区的操作结果可能难以预料：例如，如果写入一个分区成功，但另一个分区失败，会发生什么情况？我们将在下面的章节中讨论这个问题。


## 参考文献

1. David J. DeWitt and Jim N. Gray: “[Parallel Database Systems: The Future of High Performance Database Systems](http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/dewittgray92.pdf),” *Communications of the ACM*, volume 35, number 6, pages 85–98, June 1992. [doi:10.1145/129888.129894](http://dx.doi.org/10.1145/129888.129894)
1. Lars George: “[HBase vs. BigTable Comparison](http://www.larsgeorge.com/2009/11/hbase-vs-bigtable-comparison.html),” *larsgeorge.com*, November 2009.
1. “[The Apache HBase Reference Guide](https://hbase.apache.org/book/book.html),” Apache Software Foundation, *hbase.apache.org*, 2014.
1. MongoDB, Inc.: “[New Hash-Based Sharding Feature in MongoDB 2.4](https://web.archive.org/web/20230610080235/https://www.mongodb.com/blog/post/new-hash-based-sharding-feature-in-mongodb-24),” *blog.mongodb.org*, April 10, 2013.
1. Ikai Lan: “[App Engine Datastore Tip: Monotonically Increasing Values Are Bad](http://ikaisays.com/2011/01/25/app-engine-datastore-tip-monotonically-increasing-values-are-bad/),” *ikaisays.com*, January 25, 2011.
1. Martin Kleppmann: “[Java's hashCode Is Not Safe for Distributed Systems](http://martin.kleppmann.com/2012/06/18/java-hashcode-unsafe-for-distributed-systems.html),” *martin.kleppmann.com*, June 18, 2012.
1. David Karger, Eric Lehman, Tom Leighton, et al.: “[Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web](https://www.akamai.com/site/en/documents/research-paper/consistent-hashing-and-random-trees-distributed-caching-protocols-for-relieving-hot-spots-on-the-world-wide-web-technical-publication.pdf),” at *29th Annual ACM Symposium on Theory of Computing* (STOC), pages 654–663, 1997. [doi:10.1145/258533.258660](http://dx.doi.org/10.1145/258533.258660)
1. John Lamping and Eric Veach: “[A Fast, Minimal Memory, Consistent Hash Algorithm](http://arxiv.org/pdf/1406.2294.pdf),” *arxiv.org*, June 2014.
1. Eric Redmond: “[A Little Riak Book](https://web.archive.org/web/20160807123307/http://www.littleriakbook.com/),” Version 1.4.0, Basho Technologies, September 2013.
1. “[Couchbase 2.5 Administrator Guide](http://docs.couchbase.com/couchbase-manual-2.5/cb-admin/),” Couchbase, Inc., 2014.
1. Avinash Lakshman and Prashant Malik: “[Cassandra – A Decentralized Structured Storage System](http://www.cs.cornell.edu/Projects/ladis2009/papers/Lakshman-ladis2009.PDF),” at *3rd ACM SIGOPS International Workshop on Large Scale Distributed Systems and Middleware* (LADIS), October 2009.
1. Jonathan Ellis: “[Facebook’s Cassandra Paper, Annotated and Compared to Apache Cassandra 2.0](https://docs.datastax.com/en/articles/cassandra/cassandrathenandnow.html),” *docs.datastax.com*, September 12, 2013.
1. “[Introduction to Cassandra Query Language](https://docs.datastax.com/en/cql-oss/3.1/cql/cql_intro_c.html),” DataStax, Inc., 2014.
1. Samuel Axon: “[3% of Twitter's Servers Dedicated to Justin Bieber](https://web.archive.org/web/20201109041636/https://mashable.com/2010/09/07/justin-bieber-twitter/?europe=true),” *mashable.com*, September 7, 2010.
1. “[Riak KV Docs](https://docs.riak.com/riak/kv/latest/index.html),” *docs.riak.com*.
1. Richard Low: “[The Sweet Spot for Cassandra Secondary Indexing](https://web.archive.org/web/20190831132955/http://www.wentnet.com/blog/?p=77),” *wentnet.com*, October 21, 2013.
1. Zachary Tong: “[Customizing Your Document Routing](https://www.elastic.co/blog/customizing-your-document-routing/),” *elastic.co*, June 3, 2013.
1. “[Apache Solr Reference Guide](https://cwiki.apache.org/confluence/display/solr/Apache+Solr+Reference+Guide),” Apache Software Foundation, 2014.
1. Andrew Pavlo: “[H-Store Frequently Asked Questions](http://hstore.cs.brown.edu/documentation/faq/),” *hstore.cs.brown.edu*, October 2013.
1. “[Amazon DynamoDB Developer Guide](http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/),” Amazon Web Services, Inc., 2014.
1. Rusty Klophaus: “[Difference Between 2I and Search](https://web.archive.org/web/20150926053350/http://lists.basho.com/pipermail/riak-users_lists.basho.com/2011-October/006220.html),” email to *riak-users* mailing list, *lists.basho.com*, October 25, 2011.
1. Donald K. Burleson: “[Object Partitioning in Oracle](http://www.dba-oracle.com/art_partit.htm),” *dba-oracle.com*, November 8, 2000.
1. Eric Evans: “[Rethinking Topology in Cassandra](http://www.slideshare.net/jericevans/virtual-nodes-rethinking-topology-in-cassandra),” at *ApacheCon Europe*, November 2012.
1. Rafał Kuć: “[Reroute API Explained](https://web.archive.org/web/20190706215750/http://elasticsearchserverbook.com/reroute-api-explained/),” *elasticsearchserverbook.com*, September 30, 2013.
1. “[Project Voldemort Documentation](https://web.archive.org/web/20250107145644/http://www.project-voldemort.com/voldemort/),” *project-voldemort.com*.
1. Enis Soztutar: “[Apache HBase Region Splitting and Merging](http://hortonworks.com/blog/apache-hbase-region-splitting-and-merging/),” *hortonworks.com*, February 1, 2013.
1. Brandon Williams: “[Virtual Nodes in Cassandra 1.2](http://www.datastax.com/dev/blog/virtual-nodes-in-cassandra-1-2),” *datastax.com*, December 4, 2012.
1. Richard Jones: “[libketama: Consistent Hashing Library for Memcached Clients](https://www.metabrew.com/article/libketama-consistent-hashing-algo-memcached-clients),” *metabrew.com*, April 10, 2007.
1. Branimir Lambov: “[New Token Allocation Algorithm in Cassandra 3.0](http://www.datastax.com/dev/blog/token-allocation-algorithm),” *datastax.com*, January 28, 2016.
1. Jason Wilder: “[Open-Source Service Discovery](http://jasonwilder.com/blog/2014/02/04/service-discovery-in-the-cloud/),” *jasonwilder.com*, February 2014.
1. Kishore Gopalakrishna, Shi Lu, Zhen Zhang, et al.: “[Untangling Cluster Management with Helix](http://www.socc2012.org/helix_onecol.pdf?attredirects=0),” at *ACM Symposium on Cloud Computing* (SoCC), October 2012. [doi:10.1145/2391229.2391248](http://dx.doi.org/10.1145/2391229.2391248)
1. “[Moxi 1.8 Manual](http://docs.couchbase.com/moxi-manual-1.8/),” Couchbase, Inc., 2014.
1. Shivnath Babu and Herodotos Herodotou: “[Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf),” *Foundations and Trends in Databases*, volume 5, number 1, pages 1–104, November 2013. [doi:10.1561/1900000036](http://dx.doi.org/10.1561/1900000036)


================================================
FILE: content/v1/ch7.md
================================================
---
title: "第七章：事务"
linkTitle: "7. 事务"
weight: 207
breadcrumbs: false
---

![](/map/ch07.png)

> 一些作者声称，支持通用的两阶段提交代价太大，会带来性能与可用性的问题。让程序员来处理过度使用事务导致的性能问题，总比缺少事务编程好得多。
>
> —— James Corbett 等人，Spanner：Google 的全球分布式数据库（2012）

在数据系统的残酷现实中，很多事情都可能出错：

- 数据库软件、硬件可能在任意时刻发生故障（包括写操作进行到一半时）。
- 应用程序可能在任意时刻崩溃（包括一系列操作的中间）。
- 网络中断可能会意外切断数据库与应用的连接，或数据库之间的连接。
- 多个客户端可能会同时写入数据库，覆盖彼此的更改。
- 客户端可能读取到无意义的数据，因为数据只更新了一部分。
- 客户端之间的竞争条件可能导致令人惊讶的错误。

为了实现可靠性，系统必须处理这些故障，确保它们不会导致整个系统的灾难性故障。但是实现容错机制工作量巨大。需要仔细考虑所有可能出错的事情，并进行大量的测试，以确保解决方案真正管用。

数十年来，**事务（transaction）** 一直是简化这些问题的首选机制。事务是应用程序将多个读写操作组合成一个逻辑单元的一种方式。从概念上讲，事务中的所有读写操作被视作单个操作来执行：整个事务要么成功 **提交**（commit），要么失败 **中止**（abort）或 **回滚**（rollback）。如果失败，应用程序可以安全地重试。对于事务来说，应用程序的错误处理变得简单多了，因为它不用再担心部分失败的情况了，即某些操作成功，某些失败（无论出于何种原因）。

和事务打交道时间长了，你可能会觉得它显而易见。但我们不应将其视为理所当然。事务不是天然存在的；它们是为了 **简化应用编程模型** 而创建的。通过使用事务，应用程序可以自由地忽略某些潜在的错误情况和并发问题，因为数据库会替应用处理好这些。（我们称之为 **安全保证**，即 safety guarantees）。

并不是所有的应用都需要事务，有时候弱化事务保证、或完全放弃事务也是有好处的（例如，为了获得更高性能或更高可用性）。一些安全属性也可以在没有事务的情况下实现。

怎样知道你是否需要事务？为了回答这个问题，首先需要确切理解事务可以提供的安全保障，以及它们的代价。尽管乍看事务似乎很简单，但实际上有许多微妙但重要的细节在起作用。

本章将研究许多出错案例，并探索数据库用于防范这些问题的算法。尤其会深入 **并发控制** 的领域，讨论各种可能发生的竞争条件，以及数据库如何实现 **读已提交（read committed）**，**快照隔离（snapshot isolation）** 和 **可串行化（serializability）** 等隔离级别。

本章同时适用于单机数据库与分布式数据库；在 [第八章](/v1/ch8) 中将重点讨论仅出现在分布式系统中的特殊挑战。


## 事务的棘手概念

现今，几乎所有的关系型数据库和一些非关系数据库都支持 **事务**。其中大多数遵循 IBM System R（第一个 SQL 数据库）在 1975 年引入的风格【1,2,3】。40 年里，尽管一些实现细节发生了变化，但总体思路大同小异：MySQL、PostgreSQL、Oracle 和 SQL Server 等数据库中的事务支持与 System R 异乎寻常地相似。

2000 年以后，非关系（NoSQL）数据库开始普及。它们的目标是在关系数据库的现状基础上，通过提供新的数据模型选择（请参阅 [第二章](/v1/ch2)）并默认包含复制（第五章）和分区（第六章）来进一步提升。事务是这次运动的主要牺牲品：这些新一代数据库中的许多数据库完全放弃了事务，或者重新定义了这个词，描述比以前所理解的更弱得多的一套保证【4】。

随着这种新型分布式数据库的炒作，人们普遍认为事务是可伸缩性的对立面，任何大型系统都必须放弃事务以保持良好的性能和高可用性【5,6】。另一方面，数据库厂商有时将事务保证作为 “重要应用” 和 “有价值数据” 的基本要求。这两种观点都是 **纯粹的夸张**。

事实并非如此简单：与其他技术设计选择一样，事务有其优势和局限性。为了理解这些权衡，让我们了解事务所提供保证的细节 —— 无论是在正常运行中还是在各种极端（但是现实存在）的情况下。

### ACID的含义

事务所提供的安全保证，通常由众所周知的首字母缩略词 ACID 来描述，ACID 代表 **原子性（Atomicity）**，**一致性（Consistency）**，**隔离性（Isolation）** 和 **持久性（Durability）**。它由 Theo Härder 和 Andreas Reuter 于 1983 年提出【7】，旨在为数据库中的容错机制建立精确的术语。

但实际上，不同数据库的 ACID 实现并不相同。例如，我们将会看到，关于 **隔离性** 的含义就有许多含糊不清【8】。高层次上的想法很美好，但魔鬼隐藏在细节里。今天，当一个系统声称自己 “符合 ACID” 时，实际上能期待的是什么保证并不清楚。不幸的是，ACID 现在几乎已经变成了一个营销术语。

（不符合 ACID 标准的系统有时被称为 BASE，它代表 **基本可用性（Basically Available）**，**软状态（Soft State）** 和 **最终一致性（Eventual consistency）**【9】，这比 ACID 的定义更加模糊，似乎 BASE 的唯一合理的定义是 “不是 ACID”，即它几乎可以代表任何你想要的东西。）

让我们深入了解原子性，一致性，隔离性和持久性的定义，这可以让我们提炼出事务的思想。

#### 原子性

一般来说，原子是指不能分解成小部分的东西。这个词在计算机的不同领域中意味着相似但又微妙不同的东西。例如，在多线程编程中，如果一个线程执行一个原子操作，这意味着另一个线程无法看到该操作的一半结果。系统只能处于操作之前或操作之后的状态，而不是介于两者之间的状态。

相比之下，ACID 的原子性并 **不** 是关于 **并发（concurrent）** 的。它并不是在描述如果几个进程试图同时访问相同的数据会发生什么情况，这种情况包含在 [**隔离性**](#隔离性) 中。

ACID 的原子性描述了当客户想进行多次写入，但在一些写操作处理完之后出现故障的情况。例如进程崩溃，网络连接中断，磁盘变满或者某种完整性约束被违反。如果这些写操作被分组到一个原子事务中，并且该事务由于错误而不能完成（提交），则该事务将被中止，并且数据库必须丢弃或撤消该事务中迄今为止所做的任何写入。

如果没有原子性，在多处更改进行到一半时发生错误，很难知道哪些更改已经生效，哪些没有生效。该应用程序可以再试一次，但冒着进行两次相同变更的风险，可能会导致数据重复或错误的数据。原子性简化了这个问题：如果事务被 **中止（abort）**，应用程序可以确定它没有改变任何东西，所以可以安全地重试。

ACID 原子性的定义特征是：**能够在错误时中止事务，丢弃该事务进行的所有写入变更的能力。** 或许 **可中止性（abortability）** 是更好的术语，但本书将继续使用原子性，因为这是惯用词。

#### 一致性

一致性这个词被赋予太多含义：

* 在 [第五章](/v1/ch5) 中，我们讨论了副本一致性，以及异步复制系统中的最终一致性问题（请参阅 “[复制延迟问题](/v1/ch5#复制延迟问题)”）。
* [一致性哈希](/v1/ch6#一致性哈希) 是某些系统用于重新分区的一种分区方法。
* 在 [CAP 定理](/v1/ch9#CAP定理) 中，一致性一词用于表示 [线性一致性](/v1/ch9#线性一致性)。
* 在 ACID 的上下文中，**一致性** 是指数据库在应用程序的特定概念中处于 “良好状态”。

很不幸，这一个词就至少有四种不同的含义。

ACID 一致性的概念是，**对数据的一组特定约束必须始终成立**，即 **不变式（invariants）**。例如，在会计系统中，所有账户整体上必须借贷相抵。如果一个事务开始于一个满足这些不变式的有效数据库，且在事务处理期间的任何写入操作都保持这种有效性，那么可以确定，不变式总是满足的。

但是，一致性的这种概念取决于应用程序对不变式的理解，应用程序负责正确定义它的事务，并保持一致性。这并不是数据库可以保证的事情：如果你写入违反不变式的脏数据，数据库也无法阻止你（一些特定类型的不变式可以由数据库检查，例如外键约束或唯一约束，但是一般来说，是应用程序来定义什么样的数据是有效的，什么样是无效的 —— 数据库只管存储）。

原子性、隔离性和持久性是数据库的属性，而一致性（在 ACID 意义上）是应用程序的属性。应用可能依赖数据库的原子性和隔离性来实现一致性，但这并不仅取决于数据库。因此，字母 C 不属于 ACID [^i]。

[^i]: 乔・海勒斯坦（Joe Hellerstein）指出，在 Härder 与 Reuter 的论文中，“ACID 中的 C” 是被 “扔进去凑缩写单词的”【7】，而且那时候大家都不怎么在乎一致性。

#### 隔离性

大多数数据库都会同时被多个客户端访问。如果它们各自读写数据库的不同部分，这是没有问题的，但是如果它们访问相同的数据库记录，则可能会遇到 **并发** 问题（**竞争条件**，即 race conditions）。

[图 7-1](/v1/ddia_0701.png) 是这类问题的一个简单例子。假设你有两个客户端同时在数据库中增长一个计数器。（假设数据库没有内建的自增操作）每个客户端需要读取计数器的当前值，加 1 ，再回写新值。[图 7-1](/v1/ddia_0701.png) 中，因为发生了两次增长，计数器应该从 42 增至 44；但由于竞争条件，实际上只增至 43 。

ACID 意义上的隔离性意味着，**同时执行的事务是相互隔离的**：它们不能互相干扰。传统的数据库教科书将隔离性形式化为 **可串行化（Serializability）**，这意味着每个事务可以假装它是唯一在整个数据库上运行的事务。数据库确保当多个事务被提交时，结果与它们串行运行（一个接一个）是一样的，尽管实际上它们可能是并发运行的【10】。

![](/v1/ddia_0701.png)

**图 7-1 两个客户端同时递增计数器时的竞争条件**

然而实践中很少会使用可串行的隔离，因为它有性能损失。一些流行的数据库如 Oracle 11g，甚至没有实现它。在 Oracle 中有一个名为 “可串行的” 隔离级别，但实际上它实现了一种叫做 **快照隔离（snapshot isolation）** 的功能，**这是一种比可串行化更弱的保证**【8,11】。我们将在 “[弱隔离级别](#弱隔离级别)” 中研究快照隔离和其他形式的隔离。

#### 持久性

数据库系统的目的是，提供一个安全的地方存储数据，而不用担心丢失。**持久性** 是一个承诺，即一旦事务成功完成，即使发生硬件故障或数据库崩溃，写入的任何数据也不会丢失。

在单节点数据库中，持久性通常意味着数据已被写入非易失性存储设备，如硬盘或 SSD。它通常还包括预写日志或类似的文件（请参阅 “[让 B 树更可靠](/v1/ch3#让B树更可靠)”），以便在磁盘上的数据结构损坏时进行恢复。在带复制的数据库中，持久性可能意味着数据已成功复制到一些节点。为了提供持久性保证，数据库必须等到这些写入或复制完成后，才能报告事务成功提交。

如 “[可靠性](/v1/ch1#可靠性)” 一节所述，**完美的持久性是不存在的** ：如果所有硬盘和所有备份同时被销毁，那显然没有任何数据库能救得了你。

> #### 复制与持久性
>
> 在历史上，持久性意味着写入归档磁带。后来它被理解为写入磁盘或 SSD。再后来它又有了新的内涵即 “复制（replication）”。哪种实现更好一些？
>
> 真相是，没有什么是完美的：
>
> * 如果你写入磁盘然后机器宕机，即使数据没有丢失，在修复机器或将磁盘转移到其他机器之前，也是无法访问的。这种情况下，复制系统可以保持可用性。
> * 一个相关性故障（停电，或一个特定输入导致所有节点崩溃的 Bug）可能会一次性摧毁所有副本（请参阅「[可靠性](/v1/ch1#可靠性)」），任何仅存储在内存中的数据都会丢失，故内存数据库仍然要和磁盘写入打交道。
> * 在异步复制系统中，当主库不可用时，最近的写入操作可能会丢失（请参阅「[处理节点宕机](/v1/ch5#处理节点宕机)」）。
> * 当电源突然断电时，特别是固态硬盘，有证据显示有时会违反应有的保证：甚至 fsync 也不能保证正常工作【12】。硬盘固件可能有错误，就像任何其他类型的软件一样【13,14】。
> * 存储引擎和文件系统之间的微妙交互可能会导致难以追踪的错误，并可能导致磁盘上的文件在崩溃后被损坏【15,16】。
> * 磁盘上的数据可能会在没有检测到的情况下逐渐损坏【17】。如果数据已损坏一段时间，副本和最近的备份也可能损坏。这种情况下，需要尝试从历史备份中恢复数据。
> * 一项关于固态硬盘的研究发现，在运行的前四年中，30% 到 80% 的硬盘会产生至少一个坏块【18】。相比固态硬盘，磁盘的坏道率较低，但完全失效的概率更高。
> * 如果 SSD 断电，可能会在几周内开始丢失数据，具体取决于温度【19】。
>
> 在实践中，没有一种技术可以提供绝对保证。只有各种降低风险的技术，包括写入磁盘，复制到远程机器和备份 —— 它们可以且应该一起使用。与往常一样，最好抱着怀疑的态度接受任何理论上的 “保证”。

### 单对象和多对象操作

回顾一下，在 ACID 中，原子性和隔离性描述了客户端在同一事务中执行多次写入时，数据库应该做的事情：

原子性
: 如果在一系列写操作的中途发生错误，则应中止事务处理，并丢弃当前事务的所有写入。换句话说，数据库免去了用户对部分失败的担忧 —— 通过提供 “**宁为玉碎，不为瓦全（all-or-nothing）**” 的保证。

隔离性
: 同时运行的事务不应该互相干扰。例如，如果一个事务进行多次写入，则另一个事务要么看到全部写入结果，要么什么都看不到，但不应该是一些子集。

这些定义假设你想同时修改多个对象（行，文档，记录）。通常需要 **多对象事务（multi-object transaction）** 来保持多块数据同步。[图 7-2](/v1/ddia_0702.png) 展示了一个来自电邮应用的例子。执行以下查询来显示用户未读邮件数量：

```sql
SELECT COUNT(*) FROM emails WHERE recipient_id = 2 AND unread_flag = true
```

但如果邮件太多，你可能会觉得这个查询太慢，并决定用单独的字段存储未读邮件的数量（一种反规范化）。现在每当一个新消息写入时，必须也增长未读计数器，每当一个消息被标记为已读时，也必须减少未读计数器。

在 [图 7-2](/v1/ddia_0702.png) 中，用户 2 遇到异常情况：邮件列表里显示有未读消息，但计数器显示为零未读消息，因为计数器增长还没有发生 [^ii]。隔离性可以避免这个问题：通过确保用户 2 要么同时看到新邮件和增长后的计数器，要么都看不到，而不是一个前后矛盾的中间结果。

[^ii]: 可以说邮件应用中的错误计数器并不是什么特别重要的问题。但换种方式来看，你可以把未读计数器换成客户账户余额，把邮件收发看成支付交易。

![](/v1/ddia_0702.png)

**图 7-2 违反隔离性：一个事务读取另一个事务的未提交的写入（“脏读”）。**

[图 7-3](/v1/ddia_0703.png) 说明了对原子性的需求：如果在事务过程中发生错误，邮箱和未读计数器的内容可能会失去同步。在原子事务中，如果对计数器的更新失败，事务将被中止，并且插入的电子邮件将被回滚。

![](/v1/ddia_0703.png)

**图 7-3 原子性确保发生错误时，事务先前的任何写入都会被撤消，以避免状态不一致**

多对象事务需要某种方式来确定哪些读写操作属于同一个事务。在关系型数据库中，通常基于客户端与数据库服务器的 TCP 连接：在任何特定连接上，`BEGIN TRANSACTION` 和 `COMMIT` 语句之间的所有内容，被认为是同一事务的一部分。[^iii]

[^iii]: 这并不完美。如果 TCP 连接中断，则事务必须中止。如果中断发生在客户端请求提交之后，但在服务器确认提交发生之前，客户端并不知道事务是否已提交。为了解决这个问题，事务管理器可以通过一个唯一事务标识符来对操作进行分组，这个标识符并未绑定到特定 TCP 连接。后续在 “[数据库的端到端原则](/v1/ch12#数据库的端到端原则)” 一节将回到这个主题。

另一方面，许多非关系数据库并没有将这些操作组合在一起的方法。即使存在多对象 API（例如，某键值存储可能具有在一个操作中更新几个键的 multi-put 操作），但这并不一定意味着它具有事务语义：该命令可能在一些键上成功，在其他的键上失败，使数据库处于部分更新的状态。

#### 单对象写入

当单个对象发生改变时，原子性和隔离性也是适用的。例如，假设你正在向数据库写入一个 20 KB 的 JSON 文档：

- 如果在发送第一个 10 KB 之后网络连接中断，数据库是否存储了不可解析的 10KB JSON 片段？
- 如果在数据库正在覆盖磁盘上的前一个值的过程中电源发生故障，是否最终将新旧值拼接在一起？
- 如果另一个客户端在写入过程中读取该文档，是否会看到部分更新的值？

这些问题非常让人头大，故存储引擎一个几乎普遍的目标是：对单节点上的单个对象（例如键值对）上提供原子性和隔离性。原子性可以通过使用日志来实现崩溃恢复（请参阅 “[让 B 树更可靠](/v1/ch3#让B树更可靠)”），并且可以使用每个对象上的锁来实现隔离（每次只允许一个线程访问对象） 。

一些数据库也提供更复杂的原子操作 [^iv]，例如自增操作，这样就不再需要像 [图 7-1](/v1/ddia_0701.png) 那样的读取 - 修改 - 写入序列了。同样流行的是 **[比较和设置（CAS, compare-and-set）](#比较并设置（CAS）)** 操作，仅当值没有被其他并发修改过时，才允许执行写操作。

[^iv]: 严格地说，**原子自增（atomic increment）** 这个术语在多线程编程的意义上使用了原子这个词。在 ACID 的情况下，它实际上应该被称为 **隔离的（isolated）** 或 **可串行的（serializable）** 增量。但这就太吹毛求疵了。

这些单对象操作很有用，因为它们可以防止在多个客户端尝试同时写入同一个对象时丢失更新（请参阅 “[防止丢失更新](#防止丢失更新)”）。但它们不是通常意义上的事务。CAS 以及其他单一对象操作被称为 “轻量级事务”，甚至出于营销目的被称为 “ACID”【20,21,22】，但是这个术语是误导性的。事务通常被理解为，**将多个对象上的多个操作合并为一个执行单元的机制**。

#### 多对象事务的需求

许多分布式数据存储已经放弃了多对象事务，因为多对象事务很难跨分区实现，而且在需要高可用性或高性能的情况下，它们可能会碍事。但说到底，在分布式数据库中实现事务，并没有什么根本性的障碍。[第九章](/v1/ch9) 将讨论分布式事务的实现。

但是我们是否需要多对象事务？**是否有可能只用键值数据模型和单对象操作来实现任何应用程序？**

有一些场景中，单对象插入，更新和删除是足够的。但是许多其他场景需要协调写入几个不同的对象：

* 在关系数据模型中，一个表中的行通常具有对另一个表中的行的外键引用。（类似的是，在一个图数据模型中，一个顶点有着到其他顶点的边）。多对象事务使你确保这些引用始终有效：当插入几个相互引用的记录时，外键必须是正确的和最新的，不然数据就没有意义。
* 在文档数据模型中，需要一起更新的字段通常在同一个文档中，这被视为单个对象 —— 更新单个文档时不需要多对象事务。但是，缺乏连接功能的文档数据库会鼓励非规范化（请参阅 “[关系型数据库与文档数据库在今日的对比](/v1/ch2#关系型数据库与文档数据库在今日的对比)”）。当需要更新非规范化的信息时，如 [图 7-2](/v1/ddia_0702.png) 所示，需要一次更新多个文档。事务在这种情况下非常有用，可以防止非规范化的数据不同步。
* 在具有次级索引的数据库中（除了纯粹的键值存储以外几乎都有），每次更改值时都需要更新索引。从事务角度来看，这些索引是不同的数据库对象：例如，如果没有事务隔离性，记录可能出现在一个索引中，但没有出现在另一个索引中，因为第二个索引的更新还没有发生。

这些应用仍然可以在没有事务的情况下实现。然而，**没有原子性，错误处理就要复杂得多，缺乏隔离性，就会导致并发问题**。我们将在 “[弱隔离级别](#弱隔离级别)” 中讨论这些问题，并在 [第十二章](/v1/ch12) 中探讨其他方法。

#### 处理错误和中止

事务的一个关键特性是，如果发生错误，它可以中止并安全地重试。ACID 数据库基于这样的哲学：如果数据库有违反其原子性，隔离性或持久性的危险，则宁愿完全放弃事务，而不是留下半成品。

然而并不是所有的系统都遵循这个哲学。特别是具有 [无主复制](/v1/ch5#无主复制) 的数据存储，主要是在 “尽力而为” 的基础上进行工作。可以概括为 “数据库将做尽可能多的事，运行遇到错误时，它不会撤消它已经完成的事情” —— 所以，从错误中恢复是应用程序的责任。

错误发生不可避免，但许多软件开发人员倾向于只考虑乐观情况，而不是错误处理的复杂性。例如，像 Rails 的 ActiveRecord 和 Django 这样的 **对象关系映射（ORM, object-relational mapping）** 框架不会重试中止的事务 —— 这个错误通常会导致一个从堆栈向上传播的异常，所以任何用户输入都会被丢弃，用户拿到一个错误信息。这实在是太可惜了，因为中止的重点就是允许安全的重试。

尽管重试一个中止的事务是一个简单而有效的错误处理机制，但它并不完美：

- 如果事务实际上成功了，但是在服务器试图向客户端确认提交成功时网络发生故障（所以客户端认为提交失败了），那么重试事务会导致事务被执行两次 —— 除非你有一个额外的应用级去重机制。
- 如果错误是由于负载过大造成的，则重试事务将使问题变得更糟，而不是更好。为了避免这种正反馈循环，可以限制重试次数，使用指数退避算法，并单独处理与过载相关的错误（如果允许）。
- 仅在临时性错误（例如，由于死锁、违反隔离性、临时性网络中断和故障切换）后才值得重试。在发生永久性错误（例如，违反约束）之后重试是毫无意义的。
- 如果事务在数据库之外也有副作用，即使事务被中止，也可能发生这些副作用。例如，如果你正在发送电子邮件，那你肯定不希望每次重试事务时都重新发送电子邮件。如果你想确保几个不同的系统一起提交或放弃，**两阶段提交（2PC, two-phase commit）** 可以提供帮助（“[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)” 中将讨论这个问题）。
- 如果客户端进程在重试中失效，任何试图写入数据库的数据都将丢失。

## 弱隔离级别

如果两个事务不触及相同的数据，它们可以安全地 **并行（parallel）** 运行，因为两者都不依赖于另一个。当一个事务读取由另一个事务同时修改的数据时，或者当两个事务试图同时修改相同的数据时，并发问题（竞争条件）才会出现。

并发 BUG 很难通过测试找到，因为这样的错误只有在特殊时序下才会触发。这样的时序问题可能非常少发生，通常很难重现 [^译注i]。并发性也很难推理，特别是在大型应用中，你不一定知道哪些其他代码正在访问数据库。在一次只有一个用户时，应用开发已经很麻烦了，有许多并发用户使得它更加困难，因为任何一个数据都可能随时改变。

[^译注i]: 轶事：偶然出现的瞬时错误有时称为 ***Heisenbug***，而确定性的问题对应地称为 ***Bohrbugs***

出于这个原因，数据库一直试图通过提供 **事务隔离（transaction isolation）** 来对应用程序开发者隐藏并发问题。从理论上讲，隔离可以通过假装没有并发发生，让你的生活更加轻松：**可串行的（serializable）** 隔离等级意味着数据库保证事务的效果如同串行运行（即一次一个，没有任何并发）。

实际上不幸的是：隔离并没有那么简单。**可串行的隔离** 会有性能损失，许多数据库不愿意支付这个代价【8】。因此，系统通常使用较弱的隔离级别来防止一部分，而不是全部的并发问题。这些隔离级别难以理解，并且会导致微妙的错误，但是它们仍然在实践中被使用【23】。

弱事务隔离级别导致的并发性错误不仅仅是一个理论问题。它们造成了很多的资金损失【24,25】，招致了财务审计人员的调查【26】，并导致客户数据被破坏【27】。关于这类问题的一个流行的评论是 “如果你正在处理财务数据，请使用 ACID 数据库！” —— 但是这种说法没有抓住重点。即使是很多流行的关系型数据库系统（通常被认为是 “ACID”）也使用弱隔离级别，所以它们也不一定能防止这些错误的发生。

比起盲目地依赖工具，我们需要对存在的各种并发问题，以及如何防止这些问题有深入的理解。然后就可以使用我们所掌握的工具来构建可靠和正确的应用程序。

在本节中，我们将看几个在实践中使用的弱（**非串行的**，即 nonserializable）隔离级别，并详细讨论哪种竞争条件可能发生也可能不发生，以便你可以决定什么级别适合你的应用程序。一旦我们完成了这个工作，我们将详细讨论可串行化（请参阅 “[可串行化](#可串行化)”）。我们讨论的隔离级别将是非正式的，通过示例来进行。如果你需要严格的定义和分析它们的属性，你可以在学术文献中找到它们【28,29,30】。

### 读已提交

最基本的事务隔离级别是 **读已提交（Read Committed）**[^v]，它提供了两个保证：

1. 从数据库读时，只能看到已提交的数据（没有 **脏读**，即 dirty reads）。
2. 写入数据库时，只会覆盖已提交的数据（没有 **脏写**，即 dirty writes）。

我们来更详细地讨论这两个保证。

[^v]: 某些数据库支持甚至更弱的隔离级别，称为 **读未提交（Read uncommitted）**。它可以防止脏写，但不防止脏读。

#### 没有脏读

设想一个事务已经将一些数据写入数据库，但事务还没有提交或中止。另一个事务可以看到未提交的数据吗？如果是的话，那就叫做 **脏读（dirty reads）**【2】。

在 **读已提交** 隔离级别运行的事务必须防止脏读。这意味着事务的任何写入操作只有在该事务提交时才能被其他人看到（然后所有的写入操作都会立即变得可见）。如 [图 7-4](/v1/ddia_0704.png) 所示，用户 1 设置了 `x = 3`，但用户 2 的 `get x` 仍旧返回旧值 2 （当用户 1 尚未提交时）。

![](/v1/ddia_0704.png)

**图 7-4 没有脏读：用户 2 只有在用户 1 的事务已经提交后才能看到 x 的新值。**

为什么要防止脏读，有几个原因：

- 如果事务需要更新多个对象，脏读取意味着另一个事务可能会只看到一部分更新。例如，在 [图 7-2](/v1/ddia_0702.png) 中，用户看到新的未读电子邮件，但看不到更新的计数器。这就是电子邮件的脏读。看到处于部分更新状态的数据库会让用户感到困惑，并可能导致其他事务做出错误的决定。
- 如果事务中止，则所有写入操作都需要回滚（如 [图 7-3](/v1/ddia_0703.png) 所示）。如果数据库允许脏读，那就意味着一个事务可能会看到稍后需要回滚的数据，即从未实际提交给数据库的数据。想想后果就让人头大。

#### 没有脏写

如果两个事务同时尝试更新数据库中的相同对象，会发生什么情况？我们不知道写入的顺序是怎样的，但是我们通常认为后面的写入会覆盖前面的写入。

但是，如果先前的写入是尚未提交事务的一部分，使得后面的写入覆盖了一个尚未提交的值，这时会发生什么呢？这被称作 **脏写（dirty write）**【28】。在 **读已提交** 的隔离级别上运行的事务必须防止脏写，通常是延迟第二次写入，直到第一次写入事务提交或中止为止。

通过防止脏写，这个隔离级别避免了一些并发问题：

- 如果事务更新多个对象，脏写会导致不好的结果。例如，考虑 [图 7-5](/v1/ddia_0705.png)，以一个二手车销售网站为例，Alice 和 Bob 两个人同时试图购买同一辆车。购买汽车需要两次数据库写入：网站上的商品列表需要更新，以反映买家的购买，销售发票需要发送给买家。在 [图 7-5](/v1/ddia_0705.png) 的情况下，销售是属于 Bob 的（因为他成功更新了商品列表），但发票却寄送给了 Alice（因为她成功更新了发票表）。读已提交会防止这样的事故。
- 但是，读已提交并不能防止 [图 7-1](/v1/ddia_0701.png) 中两个计数器增量之间的竞争状态。在这种情况下，第二次写入发生在第一个事务提交后，所以它不是一个脏写。这仍然是不正确的，但是出于不同的原因，在 “[防止丢失更新](#防止丢失更新)” 中将讨论如何使这种计数器增量安全。

![](/v1/ddia_0705.png)

**图 7-5 如果存在脏写，来自不同事务的冲突写入可能会混淆在一起**

#### 实现读已提交

**读已提交** 是一个非常流行的隔离级别。这是 Oracle 11g、PostgreSQL、SQL Server 2012、MemSQL 和其他许多数据库的默认设置【8】。

最常见的情况是，数据库通过使用 **行锁（row-level lock）** 来防止脏写：当事务想要修改特定对象（行或文档）时，它必须首先获得该对象的锁。然后必须持有该锁直到事务被提交或中止。一次只有一个事务可持有任何给定对象的锁；如果另一个事务要写入同一个对象，则必须等到第一个事务提交或中止后，才能获取该锁并继续。这种锁定是读已提交模式（或更强的隔离级别）的数据库自动完成的。

如何防止脏读？一种选择是使用相同的锁，并要求任何想要读取对象的事务来简单地获取该锁，然后在读取之后立即再次释放该锁。这将确保在对象具有脏的、未提交的值时不会发生读取（因为在此期间，锁将由进行写入的事务持有）。

但是要求读锁的办法在实践中效果并不好。因为一个长时间运行的写入事务会迫使许多只读事务等到这个慢写入事务完成。这会影响只读事务的响应时间，并且不利于可操作性：因为等待锁，应用某个部分的迟缓可能由于连锁效应，导致其他部分出现问题。

出于这个原因，大多数数据库 [^vi] 使用 [图 7-4](/v1/ddia_0704.png) 的方式防止脏读：对于写入的每个对象，数据库都会记住旧的已提交值，和由当前持有写入锁的事务设置的新值。当事务正在进行时，任何其他读取对象的事务都会拿到旧值。只有当新值提交后，事务才会切换到读取新值。

[^vi]: 在撰写本文时，唯一在读已提交隔离级别使用读锁的主流数据库是 IBM DB2 和使用 `read_committed_snapshot = off` 配置的 Microsoft SQL Server【23,36】。

### 快照隔离和可重复读

如果只从表面上看读已提交隔离级别，你可能就认为它完成了事务所需的一切，这是情有可原的。它允许 **中止**（原子性的要求）；它防止读取不完整的事务结果，并且防止并发写入造成的混乱。事实上这些功能非常有用，比起没有事务的系统来，可以提供更多的保证。

但是在使用此隔离级别时，仍然有很多地方可能会产生并发错误。例如 [图 7-6](/v1/ddia_0706.png) 说明了读已提交时可能发生的问题。

![](/v1/ddia_0706.png)

**图 7-6 读取偏差：Alice 观察数据库处于不一致的状态**

Alice 在银行有 1000 美元的储蓄，分为两个账户，每个 500 美元。现在有一笔事务从她的一个账户转移了 100 美元到另一个账户。如果她非常不幸地在事务处理的过程中查看其账户余额列表，她可能会在收到付款之前先看到一个账户的余额（收款账户，余额仍为 500 美元），在发出转账之后再看到另一个账户的余额（付款账户，新的余额为 400 美元）。对 Alice 来说，现在她的账户似乎总共只有 900 美元 —— 看起来有 100 美元已经凭空消失了。

这种异常被称为 **不可重复读（nonrepeatable read）** 或 **读取偏差（read skew）**：如果 Alice 在事务结束时再次读取账户 1 的余额，她将看到与她之前的查询中看到的不同的值（600 美元）。在读已提交的隔离条件下，**不可重复读** 被认为是可接受的：Alice 看到的帐户余额确实在阅读时已经提交了。

> 不幸的是，**偏差（skew）** 这个术语是过载的：之前我们用它来指代存在热点的不均衡负载（请参阅 “[负载偏斜与热点消除](/v1/ch6#负载偏斜与热点消除)”），而这里偏差指的是时序上的异常。

对于 Alice 的情况，这不是一个长期持续的问题。因为如果她几秒钟后刷新银行网站的页面，她很可能会看到一致的帐户余额。但是有些情况下，不能容忍这种暂时的不一致：

备份
: 进行备份需要复制整个数据库，对大型数据库而言可能需要花费数小时才能完成。备份进程运行时，数据库仍然会接受写入操作。因此备份可能会包含一些旧的部分和一些新的部分。如果从这样的备份中恢复，那么不一致（如消失的钱）就会变成永久的。

分析查询和完整性检查
: 有时，你可能需要运行一个查询，扫描大部分的数据库。这样的查询在分析中很常见（请参阅 “[事务处理还是分析？](/v1/ch3#事务处理还是分析？)”），也可能是定期完整性检查（即监视数据损坏）的一部分。如果这些查询在不同时间点观察数据库的不同部分，则可能会返回毫无意义的结果。

**快照隔离（snapshot isolation）**【28】是这个问题最常见的解决方案。想法是，每个事务都从数据库的 **一致快照（consistent snapshot）** 中读取 —— 也就是说，事务可以看到事务开始时在数据库中提交的所有数据。即使这些数据随后被另一个事务更改，每个事务也只能看到该特定时间点的旧数据。

快照隔离对长时间运行的只读查询（如备份和分析）非常有用。如果查询的数据在查询执行的同时发生变化，则很难理解查询的含义。当一个事务可以看到数据库在某个特定时间点冻结时的一致快照，理解起来就很容易了。

快照隔离是一个流行的功能：PostgreSQL、使用 InnoDB 引擎的 MySQL、Oracle、SQL Server 等都支持【23,31,32】。

#### 实现快照隔离

与读已提交隔离类似，快照隔离的实现通常使用写锁来防止脏写（请参阅 “[读已提交](#读已提交)”），这意味着进行写入的事务会阻止另一个事务修改同一个对象。但是读取则不需要加锁。从性能的角度来看，快照隔离的一个关键原则是：**读不阻塞写，写不阻塞读**。这允许数据库在处理一致性快照上的长时间查询时，可以正常地同时处理写入操作，且两者间没有任何锁争用。

为了实现快照隔离，数据库使用了一种机制，它是前面 [图 7-4](/v1/ddia_0704.png) 中用于防止脏读的机制的推广。数据库可能需要保留一个对象的几个不同的已提交版本，因为各种正在进行的事务可能需要看到数据库在不同的时间点的状态。因为它同时维护着单个对象的多个版本，所以这种技术被称为 **多版本并发控制（MVCC, multi-version concurrency control）**。

如果一个数据库只需要提供 **读已提交** 的隔离级别，而不提供 **快照隔离**，那么保留一个对象的两个版本就足够了：已提交的版本和被覆盖但尚未提交的版本。不过支持快照隔离的存储引擎通常也使用 MVCC 来实现 **读已提交** 隔离级别。一种典型的方法是 **读已提交** 为每个查询使用单独的快照，而 **快照隔离** 对整个事务使用相同的快照。

[图 7-7](/v1/ddia_0707.png) 说明了 PostgreSQL 如何实现基于 MVCC 的快照隔离【31】（其他实现类似）。当一个事务开始时，它被赋予一个唯一的，永远增长 [^vii] 的事务 ID（`txid`）。每当事务向数据库写入任何内容时，它所写入的数据都会被标记上写入者的事务 ID。

[^vii]: 事实上，事务 ID 是 32 位整数，所以大约会在 40 亿次事务之后溢出。PostgreSQL 的 Vacuum 过程会清理老旧的事务 ID，确保事务 ID 溢出（回卷）不会影响到数据。

![](/v1/ddia_0707.png)

**图 7-7 使用多版本对象实现快照隔离**

表中的每一行都有一个 `created_by` 字段，其中包含将该行插入到表中的事务 ID。此外，每行都有一个 `deleted_by` 字段，最初是空的。如果某个事务删除了一行，那么该行实际上并未从数据库中删除，而是通过将 `deleted_by` 字段设置为请求删除的事务的 ID 来标记为删除。在稍后的时间，当确定没有事务可以再访问已删除的数据时，数据库中的垃圾收集过程会将所有带有删除标记的行移除，并释放其空间。[^译注ii]

[^译注ii]: 在 PostgreSQL 中，`created_by` 的实际名称为 `xmin`，`deleted_by` 的实际名称为 `xmax`

`UPDATE` 操作在内部翻译为 `DELETE` 和 `INSERT` 。例如，在 [图 7-7](/v1/ddia_0707.png) 中，事务 13 从账户 2 中扣除 100 美元，将余额从 500 美元改为 400 美元。实际上包含两条账户 2 的记录：余额为 \$500 的行被标记为 **被事务 13 删除**，余额为 \$400 的行 **由事务 13 创建**。

#### 观察一致性快照的可见性规则

当一个事务从数据库中读取时，事务 ID 用于决定它可以看见哪些对象，看不见哪些对象。通过仔细定义可见性规则，数据库可以向应用程序呈现一致的数据库快照。工作如下：

1. 在每次事务开始时，数据库列出当时所有其他（尚未提交或尚未中止）的事务清单，即使之后提交了，这些事务已执行的任何写入也都会被忽略。
2. 被中止事务所执行的任何写入都将被忽略。
3. 由具有较晚事务 ID（即，在当前事务开始之后开始的）的事务所做的任何写入都被忽略，而不管这些事务是否已经提交。
4. 所有其他写入，对应用都是可见的。

这些规则适用于创建和删除对象。在 [图 7-7](/v1/ddia_0707.png) 中，当事务 12 从账户 2 读取时，它会看到 \$500 的余额，因为 \$500 余额的删除是由事务 13 完成的（根据规则 3，事务 12 看不到事务 13 执行的删除），且 400 美元记录的创建也是不可见的（按照相同的规则）。

换句话说，如果以下两个条件都成立，则可见一个对象：

- 读事务开始时，创建该对象的事务已经提交。
- 对象未被标记为删除，或如果被标记为删除，请求删除的事务在读事务开始时尚未提交。

长时间运行的事务可能会长时间使用快照，并继续读取（从其他事务的角度来看）早已被覆盖或删除的值。由于从来不原地更新值，而是每次值改变时创建一个新的版本，数据库可以在提供一致快照的同时只产生很小的额外开销。

#### 索引和快照隔离

索引如何在多版本数据库中工作？一种选择是使索引简单地指向对象的所有版本，并且需要索引查询来过滤掉当前事务不可见的任何对象版本。当垃圾收集删除任何事务不再可见的旧对象版本时，相应的索引条目也可以被删除。

在实践中，许多实现细节决定了多版本并发控制的性能。例如，如果同一对象的不同版本可以放入同一个页面中，PostgreSQL 的优化可以避免更新索引【31】。

在 CouchDB、Datomic 和 LMDB 中使用另一种方法。虽然它们也使用 [B 树](/v1/ch3#B树)，但它们使用的是一种 **仅追加 / 写时拷贝（append-only/copy-on-write）** 的变体，它们在更新时不覆盖树的页面，而为每个修改页面创建一份副本。从父页面直到树根都会级联更新，以指向它们子页面的新版本。任何不受写入影响的页面都不需要被复制，并且保持不变【33,34,35】。

使用仅追加的 B 树，每个写入事务（或一批事务）都会创建一棵新的 B 树，当创建时，从该特定树根生长的树就是数据库的一个一致性快照。没必要根据事务 ID 过滤掉对象，因为后续写入不能修改现有的 B 树；它们只能创建新的树根。但这种方法也需要一个负责压缩和垃圾收集的后台进程。

#### 可重复读与命名混淆

快照隔离是一个有用的隔离级别，特别对于只读事务而言。但是，许多数据库实现了它，却用不同的名字来称呼。在 Oracle 中称为 **可串行化（Serializable）** 的，在 PostgreSQL 和 MySQL 中称为 **可重复读（repeatable read）**【23】。

这种命名混淆的原因是 SQL 标准没有 **快照隔离** 的概念，因为标准是基于 System R 1975 年定义的隔离级别【2】，那时候 **快照隔离** 尚未发明。相反，它定义了 **可重复读**，表面上看起来与快照隔离很相似。PostgreSQL 和 MySQL 称其 **快照隔离** 级别为 **可重复读（repeatable read）**，因为这样符合标准要求，所以它们可以声称自己 “标准兼容”。

不幸的是，SQL 标准对隔离级别的定义是有缺陷的 —— 模糊，不精确，并不像标准应有的样子独立于实现【28】。有几个数据库实现了可重复读，但它们实际提供的保证存在很大的差异，尽管表面上是标准化的【23】。在研究文献【29,30】中已经有了可重复读的正式定义，但大多数的实现并不能满足这个正式定义。最后，IBM DB2 使用 “可重复读” 来引用可串行化【8】。

结果，没有人真正知道 **可重复读** 的意思。

### 防止丢失更新

到目前为止已经讨论的 **读已提交** 和 **快照隔离** 级别，主要保证了 **只读事务在并发写入时** 可以看到什么，却基本忽略了两个事务并发写入的问题 —— 我们只讨论了脏写（请参阅 “[没有脏写](#没有脏写)”），而它只是可能出现的写 - 写冲突中的一种特定类型。

并发的写入事务之间还有其他几种有趣的冲突。其中最著名的是 **丢失更新（lost update）** 问题，如 [图 7-1](/v1/ddia_0701.png) 所示，以两个并发计数器增量为例。

如果应用从数据库中读取一些值，修改它并写回修改的值（读取 - 修改 - 写入序列），则可能会发生丢失更新的问题。如果两个事务同时执行，则其中一个的修改可能会丢失，因为第二个写入的内容并没有包括第一个事务的修改（有时会说后面写入 **狠揍（clobber）** 了前面的写入）。这种模式发生在各种不同的情况下：

- 增加计数器或更新账户余额（需要读取当前值，计算新值并写回更新后的值）
- 将本地修改写入一个复杂值中：例如，将元素添加到 JSON 文档中的一个列表（需要解析文档，进行更改并写回修改的文档）
- 两个用户同时编辑 wiki 页面，每个用户通过将整个页面内容发送到服务器来保存其更改，覆写数据库中当前的任何内容。

这是一个普遍的问题，所以已经开发了各种解决方案。

#### 原子写

许多数据库提供了原子更新操作，从而消除了在应用程序代码中执行读取 - 修改 - 写入序列的需要。如果你的代码可以用这些操作来表达，那这通常是最好的解决方案。例如，下面的指令在大多数关系数据库中是并发安全的：

```sql
UPDATE counters SET value = value + 1 WHERE key = 'foo';
```

类似地，像 MongoDB 这样的文档数据库提供了对 JSON 文档的一部分进行本地修改的原子操作，Redis 提供了修改数据结构（如优先级队列）的原子操作。并不是所有的写操作都可以用原子操作的方式来表达，例如 wiki 页面的更新涉及到任意文本编辑 [^viii]，但是在可以使用原子操作的情况下，它们通常是最好的选择。

[^viii]: 将文本文档的编辑表示为原子的变化流是可能的，尽管相当复杂。请参阅 “[自动冲突解决](/v1/ch5#自动冲突解决)”。

原子操作通常通过在读取对象时获取其上的排它锁来实现，以便更新完成之前没有其他事务可以读取它。这种技术有时被称为 **游标稳定性（cursor stability）**【36,37】。另一个选择是简单地强制所有的原子操作在单一线程上执行。

不幸的是，ORM 框架很容易意外地执行不安全的读取 - 修改 - 写入序列，而不是使用数据库提供的原子操作【38】。如果你知道自己在做什么那当然不是问题，但它经常产生那种很难测出来的微妙 Bug。

#### 显式锁定

如果数据库的内置原子操作没有提供必要的功能，防止丢失更新的另一个选择是让应用程序显式地锁定将要更新的对象。然后应用程序可以执行读取 - 修改 - 写入序列，如果任何其他事务尝试同时读取同一个对象，则强制等待，直到第一个 **读取 - 修改 - 写入序列** 完成。

例如，考虑一个多人游戏，其中几个玩家可以同时移动相同的棋子。在这种情况下，一个原子操作可能是不够的，因为应用程序还需要确保玩家的移动符合游戏规则，这可能涉及到一些不能合理地用数据库查询实现的逻辑。但你可以使用锁来防止两名玩家同时移动相同的棋子，如例 7-1 所示。

**例 7-1 显式锁定行以防止丢失更新**

```sql
BEGIN TRANSACTION;
SELECT * FROM figures
  WHERE name = 'robot' AND game_id = 222
FOR UPDATE;

-- 检查玩家的操作是否有效，然后更新先前 SELECT 返回棋子的位置。
UPDATE figures SET position = 'c4' WHERE id = 1234;
COMMIT;
```

- `FOR UPDATE` 子句告诉数据库应该对该查询返回的所有行加锁。

这是有效的，但要做对，你需要仔细考虑应用逻辑。忘记在代码某处加锁很容易引入竞争条件。

#### 自动检测丢失的更新

原子操作和锁是通过强制 **读取 - 修改 - 写入序列** 按顺序发生，来防止丢失更新的方法。另一种方法是允许它们并行执行，如果事务管理器检测到丢失更新，则中止事务并强制它们重试其 **读取 - 修改 - 写入序列**。

这种方法的一个优点是，数据库可以结合快照隔离高效地执行此检查。事实上，PostgreSQL 的可重复读，Oracle 的可串行化和 SQL Server 的快照隔离级别，都会自动检测到丢失更新，并中止惹麻烦的事务。但是，MySQL/InnoDB 的可重复读并不会检测 **丢失更新**【23】。一些作者【28,30】认为，数据库必须能防止丢失更新才称得上是提供了 **快照隔离**，所以在这个定义下，MySQL 并不提供快照隔离。

丢失更新检测是一个很好的功能，因为它不需要应用代码使用任何特殊的数据库功能，你可能会忘记使用锁或原子操作，从而引入错误；但丢失更新的检测是自动发生的，因此不太容易出错。

#### 比较并设置（CAS）

在不提供事务的数据库中，有时会发现一种原子操作：**比较并设置**（CAS, 即 Compare And Set，先前在 “[单对象写入](#单对象写入)” 中提到）。此操作的目的是为了避免丢失更新：只有当前值从上次读取时一直未改变，才允许更新发生。如果当前值与先前读取的值不匹配，则更新不起作用，且必须重试读取 - 修改 - 写入序列。

例如，为了防止两个用户同时更新同一个 wiki 页面，可以尝试类似这样的方式，只有当用户开始编辑后页面内容未发生改变时，才会更新成功：

```sql
-- 根据数据库的实现情况，这可能安全也可能不安全
UPDATE wiki_pages SET content = '新内容'
  WHERE id = 1234 AND content = '旧内容';
```

如果内容已经更改并且不再与 “旧内容” 相匹配，则此更新将不起作用，因此你需要检查更新是否生效，必要时重试。但是，如果数据库允许 `WHERE` 子句从旧快照中读取，则此语句可能无法防止丢失更新，因为即使发生了另一个并发写入，`WHERE` 条件也可能为真。在依赖数据库的 CAS 操作前要检查其是否安全。

#### 冲突解决和复制

在复制数据库中（请参阅 [第五章](/v1/ch5)），防止丢失的更新需要考虑另一个维度：由于在多个节点上存在数据副本，并且在不同节点上的数据可能被并发地修改，因此需要采取一些额外的步骤来防止丢失更新。

锁和 CAS 操作假定只有一个最新的数据副本。但是多主或无主复制的数据库通常允许多个写入并发执行，并异步复制到副本上，因此无法保证只有一个最新数据的副本。所以基于锁或 CAS 操作的技术不适用于这种情况（我们将在 “[线性一致性](/v1/ch9#线性一致性)” 中更详细地讨论这个问题）。

相反，如 “[检测并发写入](/v1/ch5#检测并发写入)” 一节所述，这种复制数据库中的一种常见方法是允许并发写入创建多个冲突版本的值（也称为兄弟），并使用应用代码或特殊数据结构在事实发生之后解决和合并这些版本。

原子操作可以在复制的上下文中很好地工作，尤其当它们具有可交换性时（即，可以在不同的副本上以不同的顺序应用它们，且仍然可以得到相同的结果）。例如，递增计数器或向集合添加元素是可交换的操作。这是 Riak 2.0 数据类型背后的思想，它可以防止复制副本丢失更新。当不同的客户端同时更新一个值时，Riak 自动将更新合并在一起，以免丢失更新【39】。

另一方面，最后写入胜利（LWW）的冲突解决方法很容易丢失更新，如 “[最后写入胜利（丢弃并发写入）](/v1/ch5#最后写入胜利（丢弃并发写入）)” 中所述。不幸的是，LWW 是许多复制数据库中的默认方案。

### 写入偏差与幻读

前面的章节中，我们看到了 **脏写** 和 **丢失更新**，当不同的事务并发地尝试写入相同的对象时，会出现这两种竞争条件。为了避免数据损坏，这些竞争条件需要被阻止 —— 既可以由数据库自动执行，也可以通过锁和原子写操作这类手动安全措施来防止。

但是，并发写入间可能发生的竞争条件还没有完。在本节中，我们将看到一些更微妙的冲突例子。

首先，想象一下这个例子：你正在为医院写一个医生轮班管理程序。医院通常会同时要求几位医生待命，但底线是至少有一位医生在待命。医生可以放弃他们的班次（例如，如果他们自己生病了），只要至少有一个同事在这一班中继续工作【40,41】。

现在想象一下，Alice 和 Bob 是两位值班医生。两人都感到不适，所以他们都决定请假。不幸的是，他们恰好在同一时间点击按钮下班。[图 7-8](/v1/ddia_0708.png) 说明了接下来的事情。

![](/v1/ddia_0708.png)

**图 7-8 写入偏差导致应用程序错误的示例**

在两个事务中，应用首先检查是否有两个或以上的医生正在值班；如果是的话，它就假定一名医生可以安全地休班。由于数据库使用快照隔离，两次检查都返回 2 ，所以两个事务都进入下一个阶段。Alice 更新自己的记录休班了，而 Bob 也做了一样的事情。两个事务都成功提交了，现在没有医生值班了。违反了至少有一名医生在值班的要求。

#### 写入偏差的特征

这种异常称为 **写入偏差**【28】。它既不是 **脏写**，也不是 **丢失更新**，因为这两个事务正在更新两个不同的对象（Alice 和 Bob 各自的待命记录）。在这里发生的冲突并不是那么明显，但是这显然是一个竞争条件：如果两个事务一个接一个地运行，那么第二个医生就不能歇班了。异常行为只有在事务并发进行时才有可能发生。

可以将写入偏差视为丢失更新问题的一般化。如果两个事务读取相同的对象，然后更新其中一些对象（不同的事务可能更新不同的对象），则可能发生写入偏差。在多个事务更新同一个对象的特殊情况下，就会发生脏写或丢失更新（取决于时序）。

我们已经看到，有各种不同的方法来防止丢失的更新。但对于写入偏差，我们的选择更受限制：

* 由于涉及多个对象，单对象的原子操作不起作用。
* 不幸的是，在一些快照隔离的实现中，自动检测丢失更新对此并没有帮助。在 PostgreSQL 的可重复读，MySQL/InnoDB 的可重复读，Oracle 可串行化或 SQL Server 的快照隔离级别中，都不会自动检测写入偏差【23】。自动防止写入偏差需要真正的可串行化隔离（请参阅 “[可串行化](#可串行化)”）。
* 某些数据库允许配置约束，然后由数据库强制执行（例如，唯一性，外键约束或特定值限制）。但是为了指定至少有一名医生必须在线，需要一个涉及多个对象的约束。大多数数据库没有内置对这种约束的支持，但是你可以使用触发器，或者物化视图来实现它们，这取决于不同的数据库【42】。
* 如果无法使用可串行化的隔离级别，则此情况下的次优选项可能是显式锁定事务所依赖的行。在例子中，你可以写下如下的代码：

```sql
BEGIN TRANSACTION;
SELECT * FROM doctors
  WHERE on_call = TRUE
  AND shift_id = 1234 FOR UPDATE;

UPDATE doctors
  SET on_call = FALSE
  WHERE name = 'Alice'
  AND shift_id = 1234;
  
COMMIT;
```

* 和以前一样，`FOR UPDATE` 告诉数据库锁定返回的所有行以用于更新。

#### 写入偏差的更多例子

写入偏差乍看像是一个深奥的问题，但一旦意识到这一点，很容易会注意到它可能发生在更多场景下。以下是一些例子：

会议室预订系统
: 比如你想要规定不能在同一时间对同一个会议室进行多次的预订【43】。当有人想要预订时，首先检查是否存在相互冲突的预订（即预订时间范围重叠的同一房间），如果没有找到，则创建会议（请参阅示例 7-2）[^ix]。

  [^ix]: 在 PostgreSQL 中，你可以使用范围类型优雅地执行此操作，但在其他数据库中并未得到广泛支持。

  **例 7-2 会议室预订系统试图避免重复预订（在快照隔离下不安全）**

  ```sql
  BEGIN TRANSACTION;

  -- 检查所有现存的与 12:00~13:00 重叠的预定
  SELECT COUNT(*) FROM bookings
  WHERE room_id = 123 AND
    end_time > '2015-01-01 12:00' AND start_time < '2015-01-01 13:00';

  -- 如果之前的查询返回 0
  INSERT INTO bookings(room_id, start_time, end_time, user_id)
    VALUES (123, '2015-01-01 12:00', '2015-01-01 13:00', 666);

  COMMIT;
  ```

  不幸的是，快照隔离并不能防止另一个用户同时插入冲突的会议。为了确保不会遇到调度冲突，你又需要可串行化的隔离级别了。

多人游戏
: 在 [例 7-1](#显式锁定) 中，我们使用一个锁来防止丢失更新（也就是确保两个玩家不能同时移动同一个棋子）。但是锁定并不妨碍玩家将两个不同的棋子移动到棋盘上的相同位置，或者采取其他违反游戏规则的行为。取决于你正在执行的规则类型，也许可以使用唯一约束（unique constraint），否则你很容易发生写入偏差。

抢注用户名
: 在每个用户拥有唯一用户名的网站上，两个用户可能会尝试同时创建具有相同用户名的帐户。可以在事务中检查名称是否被抢占，如果没有则使用该名称创建账户。但是像在前面的例子中那样，在快照隔离下这是不安全的。幸运的是，唯一约束是一个简单的解决办法（第二个事务在提交时会因为违反用户名唯一约束而被中止）。

防止双重开支
: 允许用户花钱或使用积分的服务，需要检查用户的支付数额不超过其余额。可以通过在用户的帐户中插入一个试探性的消费项目来实现这一点，列出帐户中的所有项目，并检查总和是否为正值【44】。在写入偏差场景下，可能会发生两个支出项目同时插入，一起导致余额变为负值，但这两个事务都不会注意到另一个。

#### 导致写入偏差的幻读

所有这些例子都遵循类似的模式：

1. 一个 `SELECT` 查询找出符合条件的行，并检查是否符合一些要求。（例如：至少有两名医生在值班；不存在对该会议室同一时段的预定；棋盘上的位置没有被其他棋子占据；用户名还没有被抢注；账户里还有足够余额）

2. 按照第一个查询的结果，应用代码决定是否继续。（可能会继续操作，也可能中止并报错）

3. 如果应用决定继续操作，就执行写入（插入、更新或删除），并提交事务。

   这个写入的效果改变了步骤 2 中的先决条件。换句话说，如果在提交写入后，重复执行一次步骤 1 的 SELECT 查询，将会得到不同的结果。因为写入改变了符合搜索条件的行集（现在少了一个医生值班，那时候的会议室现在已经被预订了，棋盘上的这个位置已经被占据了，用户名已经被抢注，账户余额不够了）。

这些步骤可能以不同的顺序发生。例如可以首先进行写入，然后进行 SELECT 查询，最后根据查询结果决定是放弃还是提交。

在医生值班的例子中，在步骤 3 中修改的行，是步骤 1 中返回的行之一，所以我们可以通过锁定步骤 1 中的行（`SELECT FOR UPDATE`）来使事务安全并避免写入偏差。但是其他四个例子是不同的：它们检查是否 **不存在** 某些满足条件的行，写入会 **添加** 一个匹配相同条件的行。如果步骤 1 中的查询没有返回任何行，则 `SELECT FOR UPDATE` 锁不了任何东西。

这种效应：一个事务中的写入改变另一个事务的搜索查询的结果，被称为 **幻读**【3】。快照隔离避免了只读查询中的幻读，但是在像我们讨论的例子那样的读写事务中，幻读会导致特别棘手的写入偏差情况。

#### 物化冲突

如果幻读的问题是没有对象可以加锁，也许可以人为地在数据库中引入一个锁对象？

例如，在会议室预订的场景中，可以想象创建一个关于时间槽和房间的表。此表中的每一行对应于特定时间段（例如 15 分钟）的特定房间。可以提前插入房间和时间的所有可能组合行（例如接下来的六个月）。

现在，要创建预订的事务可以锁定（`SELECT FOR UPDATE`）表中与所需房间和时间段对应的行。在获得锁定之后，它可以检查重叠的预订并像以前一样插入新的预订。请注意，这个表并不是用来存储预订相关的信息 —— 它完全就是一组锁，用于防止同时修改同一房间和时间范围内的预订。

这种方法被称为 **物化冲突（materializing conflicts）**，因为它将幻读变为数据库中一组具体行上的锁冲突【11】。不幸的是，弄清楚如何物化冲突可能很难，也很容易出错，并且让并发控制机制泄漏到应用数据模型是很丑陋的做法。出于这些原因，如果没有其他办法可以实现，物化冲突应被视为最后的手段。在大多数情况下，**可串行化（Serializable）** 的隔离级别是更可取的。


## 可串行化

在本章中，已经看到了几个易于出现竞争条件的事务例子。**读已提交** 和 **快照隔离** 级别会阻止某些竞争条件，但不会阻止另一些。我们遇到了一些特别棘手的例子，**写入偏差** 和 **幻读**。这是一个可悲的情况：

- 隔离级别难以理解，并且在不同的数据库中实现的不一致（例如，“可重复读” 的含义天差地别）。
- 光检查应用代码很难判断在特定的隔离级别运行是否安全。特别是在大型应用程序中，你可能并不知道并发发生的所有事情。
- 没有检测竞争条件的好工具。原则上来说，静态分析可能会有帮助【26】，但研究中的技术还没法实际应用。并发问题的测试是很难的，因为它们通常是非确定性的 —— 只有在倒霉的时序下才会出现问题。

这不是一个新问题，从 20 世纪 70 年代以来就一直是这样了，当时首先引入了较弱的隔离级别【2】。一直以来，研究人员的答案都很简单：使用 **可串行化（serializable）** 的隔离级别！

**可串行化（Serializability）** 隔离通常被认为是最强的隔离级别。它保证即使事务可以并行执行，最终的结果也是一样的，就好像它们没有任何并发性，连续挨个执行一样。因此数据库保证，如果事务在单独运行时正常运行，则它们在并发运行时继续保持正确 —— 换句话说，数据库可以防止 **所有** 可能的竞争条件。

但如果可串行化隔离级别比弱隔离级别的烂摊子要好得多，那为什么没有人见人爱？为了回答这个问题，我们需要看看实现可串行化的选项，以及它们如何执行。目前大多数提供可串行化的数据库都使用了三种技术之一，本章的剩余部分将会介绍这些技术：

- 字面意义上地串行顺序执行事务（请参阅 “[真的串行执行](#真的串行执行)”）
- **两阶段锁定（2PL, two-phase locking）**，几十年来唯一可行的选择（请参阅 “[两阶段锁定](#两阶段锁定)”）
- 乐观并发控制技术，例如 **可串行化快照隔离**（serializable snapshot isolation，请参阅 “[可串行化快照隔离](#可串行化快照隔离)”）

现在将主要在单节点数据库的背景下讨论这些技术；在 [第九章](/v1/ch9) 中，我们将研究如何将它们推广到涉及分布式系统中多个节点的事务。

### 真的串行执行

避免并发问题的最简单方法就是完全不要并发：在单个线程上按顺序一次只执行一个事务。这样做就完全绕开了检测 / 防止事务间冲突的问题，由此产生的隔离，正是可串行化的定义。

尽管这似乎是一个明显的主意，但数据库设计人员只是在 2007 年左右才决定，单线程循环执行事务是可行的【45】。如果多线程并发在过去的 30 年中被认为是获得良好性能的关键所在，那么究竟是什么改变致使单线程执行变为可能呢？

两个进展引发了这个反思：

- RAM 足够便宜了，许多场景现在都可以将完整的活跃数据集保存在内存中（请参阅 “[在内存中存储一切](/v1/ch3#在内存中存储一切)”）。当事务需要访问的所有数据都在内存中时，事务处理的执行速度要比等待数据从磁盘加载时快得多。
- 数据库设计人员意识到 OLTP 事务通常很短，而且只进行少量的读写操作（请参阅 “[事务处理还是分析？](/v1/ch3#事务处理还是分析？)”）。相比之下，长时间运行的分析查询通常是只读的，因此它们可以在串行执行循环之外的一致快照（使用快照隔离）上运行。

串行执行事务的方法在 VoltDB/H-Store、Redis 和 Datomic 中实现【46,47,48】。设计用于单线程执行的系统有时可以比支持并发的系统性能更好，因为它可以避免锁的协调开销。但是其吞吐量仅限于单个 CPU 核的吞吐量。为了充分利用单一线程，需要有与传统形式的事务不同的结构。

#### 在存储过程中封装事务

在数据库的早期阶段，意图是数据库事务可以包含整个用户活动流程。例如，预订机票是一个多阶段的过程（搜索路线，票价和可用座位，决定行程，在每段行程的航班上订座，输入乘客信息，付款）。数据库设计者认为，如果整个过程是一个事务，那么它就可以被原子化地执行。

不幸的是，人类做出决定和回应的速度非常缓慢。如果数据库事务需要等待来自用户的输入，则数据库需要支持潜在的大量并发事务，其中大部分是空闲的。大多数数据库不能高效完成这项工作，因此几乎所有的 OLTP 应用程序都避免在事务中等待交互式的用户输入，以此来保持事务的简短。在 Web 上，这意味着事务在同一个 HTTP 请求中被提交 —— 一个事务不会跨越多个请求。一个新的 HTTP 请求开始一个新的事务。

即使已经将人类从关键路径中排除，事务仍然以交互式的客户端 / 服务器风格执行，一次一个语句。应用程序进行查询，读取结果，可能根据第一个查询的结果进行另一个查询，依此类推。查询和结果在应用程序代码（在一台机器上运行）和数据库服务器（在另一台机器上）之间来回发送。

在这种交互式的事务方式中，应用程序和数据库之间的网络通信耗费了大量的时间。如果不允许在数据库中进行并发处理，且一次只处理一个事务，则吞吐量将会非常糟糕，因为数据库大部分的时间都花费在等待应用程序发出当前事务的下一个查询。在这种数据库中，为了获得合理的性能，需要同时处理多个事务。

出于这个原因，具有单线程串行事务处理的系统不允许交互式的多语句事务。取而代之，应用程序必须提前将整个事务代码作为存储过程提交给数据库。这些方法之间的差异如 [图 7-9](/v1/ddia_0709.png) 所示。如果事务所需的所有数据都在内存中，则存储过程可以非常快地执行，而不用等待任何网络或磁盘 I/O。

![](/v1/ddia_0709.png)

**图 7-9 交互式事务和存储过程之间的区别（使用图 7-8 的示例事务）**

#### 存储过程的优点和缺点

存储过程在关系型数据库中已经存在了一段时间了，自 1999 年以来它们一直是 SQL 标准（SQL/PSM）的一部分。出于各种原因，它们的名声有点不太好：

- 每个数据库厂商都有自己的存储过程语言（Oracle 有 PL/SQL，SQL Server 有 T-SQL，PostgreSQL 有 PL/pgSQL，等等）。这些语言并没有跟上通用编程语言的发展，所以从今天的角度来看，它们看起来相当丑陋和陈旧，而且缺乏大多数编程语言中能找到的库的生态系统。
- 在数据库中运行的代码难以管理：与应用服务器相比，它更难调试，更难以保持版本控制和部署，更难测试，并且难以集成到指标收集系统来进行监控。
- 数据库通常比应用服务器对性能敏感得多，因为单个数据库实例通常由许多应用服务器共享。数据库中一个写得不好的存储过程（例如，占用大量内存或 CPU 时间）会比在应用服务器中相同的代码造成更多的麻烦。

但是这些问题都是可以克服的。现代的存储过程实现放弃了 PL/SQL，而是使用现有的通用编程语言：VoltDB 使用 Java 或 Groovy，Datomic 使用 Java 或 Clojure，而 Redis 使用 Lua。

**存储过程与内存存储**，使得在单个线程上执行所有事务变得可行。由于不需要等待 I/O，且避免了并发控制机制的开销，它们可以在单个线程上实现相当好的吞吐量。

VoltDB 还使用存储过程进行复制：但不是将事务的写入结果从一个节点复制到另一个节点，而是在每个节点上执行相同的存储过程。因此 VoltDB 要求存储过程是 **确定性的**（在不同的节点上运行时，它们必须产生相同的结果）。举个例子，如果事务需要使用当前的日期和时间，则必须通过特殊的确定性 API 来实现。

#### 分区

顺序执行所有事务使并发控制简单多了，但数据库的事务吞吐量被限制为单机单核的速度。只读事务可以使用快照隔离在其它地方执行，但对于写入吞吐量较高的应用，单线程事务处理器可能成为一个严重的瓶颈。

为了伸缩至多个 CPU 核心和多个节点，可以对数据进行分区（请参阅 [第六章](/v1/ch6)），在 VoltDB 中支持这样做。如果你可以找到一种对数据集进行分区的方法，以便每个事务只需要在单个分区中读写数据，那么每个分区就可以拥有自己独立运行的事务处理线程。在这种情况下可以为每个分区指派一个独立的 CPU 核，事务吞吐量就可以与 CPU 核数保持线性伸缩【47】。

但是，对于需要访问多个分区的任何事务，数据库必须在触及的所有分区之间协调事务。存储过程需要在所有分区上步调一致地（lock-step）执行，以确保整个系统的可串行性。

由于跨分区事务具有额外的协调开销，所以它们比单分区事务慢得多。VoltDB 报告的吞吐量大约是每秒 1000 个跨分区写入，比单分区吞吐量低几个数量级，并且不能通过增加更多的机器来增加吞吐量【49】。

事务是否可以划分至单个分区很大程度上取决于应用数据的结构。简单的键值数据通常可以非常容易地进行分区，但是具有多个次级索引的数据可能需要大量的跨分区协调（请参阅 “[分区与次级索引](/v1/ch6#分区与次级索引)”）。

#### 串行执行小结

在特定约束条件下，真的串行执行事务，已经成为一种实现可串行化隔离等级的可行办法。

- 每个事务都必须小而快，只要有一个缓慢的事务，就会拖慢所有事务处理。
- 仅限于活跃数据集可以放入内存的情况。很少访问的数据可能会被移动到磁盘，但如果需要在单线程执行的事务中访问这些磁盘中的数据，系统就会变得非常慢 [^x]。
- 写入吞吐量必须低到能在单个 CPU 核上处理，如若不然，事务需要能划分至单个分区，且不需要跨分区协调。
- 跨分区事务是可能的，但是它们能被使用的程度有很大的限制。

[^x]: 如果事务需要访问不在内存中的数据，最好的解决方案可能是中止事务，异步地将数据提取到内存中，同时继续处理其他事务，然后在数据加载完毕时重新启动事务。这种方法被称为 **反缓存（anti-caching）**，正如前面在 “[在内存中存储一切](/v1/ch3#在内存中存储一切)” 中所述。

### 两阶段锁定

大约 30 年来，在数据库中只有一种广泛使用的串行化算法：**两阶段锁定（2PL，two-phase locking）** [^xi]

[^xi]: 有时也称为 **强严格两阶段锁定（SS2PL, strong strict two-phase locking）**，以便和其他 2PL 变体区分。

> #### 2PL不是2PC
>
> 请注意，虽然两阶段锁定（2PL）听起来非常类似于两阶段提交（2PC），但它们是完全不同的东西。我们将在 [第九章](/v1/ch9) 讨论 2PC。

之前我们看到锁通常用于防止脏写（请参阅 “[没有脏写](#没有脏写)” 一节）：如果两个事务同时尝试写入同一个对象，则锁可确保第二个写入必须等到第一个写入完成事务（中止或提交），然后才能继续。

两阶段锁定类似，但是锁的要求更强得多。只要没有写入，就允许多个事务同时读取同一个对象。但对象只要有写入（修改或删除），就需要 **独占访问（exclusive access）** 权限：

- 如果事务 A 读取了一个对象，并且事务 B 想要写入该对象，那么 B 必须等到 A 提交或中止才能继续（这确保 B 不能在 A 底下意外地改变对象）。
- 如果事务 A 写入了一个对象，并且事务 B 想要读取该对象，则 B 必须等到 A 提交或中止才能继续（像 [图 7-1](/v1/ddia_0701.png) 那样读取旧版本的对象在 2PL 下是不可接受的）。

在 2PL 中，写入不仅会阻塞其他写入，也会阻塞读，反之亦然。快照隔离使得 **读不阻塞写，写也不阻塞读**（请参阅 “[实现快照隔离](#实现快照隔离)”），这是 2PL 和快照隔离之间的关键区别。另一方面，因为 2PL 提供了可串行化的性质，它可以防止早先讨论的所有竞争条件，包括丢失更新和写入偏差。

#### 实现两阶段锁

2PL 用于 MySQL（InnoDB）和 SQL Server 中的可串行化隔离级别，以及 DB2 中的可重复读隔离级别【23,36】。

读与写的阻塞是通过为数据库中每个对象添加锁来实现的。锁可以处于 **共享模式（shared mode）** 或 **独占模式（exclusive mode）**。锁使用如下：

- 若事务要读取对象，则须先以共享模式获取锁。允许多个事务同时持有共享锁。但如果另一个事务已经在对象上持有排它锁，则这些事务必须等待。
- 若事务要写入一个对象，它必须首先以独占模式获取该锁。没有其他事务可以同时持有锁（无论是共享模式还是独占模式），所以如果对象上存在任何锁，该事务必须等待。
- 如果事务先读取再写入对象，则它可能会将其共享锁升级为独占锁。升级锁的工作与直接获得独占锁相同。
- 事务获得锁之后，必须继续持有锁直到事务结束（提交或中止）。这就是 “两阶段” 这个名字的来源：第一阶段（当事务正在执行时）获取锁，第二阶段（在事务结束时）释放所有的锁。

由于使用了这么多的锁，因此很可能会发生：事务 A 等待事务 B 释放它的锁，反之亦然。这种情况叫做 **死锁（Deadlock）**。数据库会自动检测事务之间的死锁，并中止其中一个，以便另一个继续执行。被中止的事务需要由应用程序重试。

#### 两阶段锁定的性能

两阶段锁定的巨大缺点，以及 70 年代以来没有被所有人使用的原因，是其性能问题。两阶段锁定下的事务吞吐量与查询响应时间要比弱隔离级别下要差得多。

这一部分是由于获取和释放所有这些锁的开销，但更重要的是由于并发性的降低。按照设计，如果两个并发事务试图做任何可能导致竞争条件的事情，那么必须等待另一个完成。

传统的关系数据库不限制事务的持续时间，因为它们是为等待人类输入的交互式应用而设计的。因此，当一个事务需要等待另一个事务时，等待的时长并没有限制。即使你保证所有的事务都很短，如果有多个事务想要访问同一个对象，那么可能会形成一个队列，所以事务可能需要等待几个其他事务才能完成。

因此，运行 2PL 的数据库可能具有相当不稳定的延迟，如果在工作负载中存在争用，那么可能高百分位点处的响应会非常的慢（请参阅 “[描述性能](/v1/ch1#描述性能)”）。可能只需要一个缓慢的事务，或者一个访问大量数据并获取许多锁的事务，就能把系统的其他部分拖慢，甚至迫使系统停机。当需要稳健的操作时，这种不稳定性是有问题的。

基于锁实现的读已提交隔离级别可能发生死锁，但在基于 2PL 实现的可串行化隔离级别中，它们会出现得频繁得多（取决于事务的访问模式）。这可能是一个额外的性能问题：当事务由于死锁而被中止并被重试时，它需要从头重做它的工作。如果死锁很频繁，这可能意味着巨大的浪费。

#### 谓词锁

在前面关于锁的描述中，我们掩盖了一个微妙而重要的细节。在 “[导致写入偏差的幻读](#导致写入偏差的幻读)” 中，我们讨论了 **幻读（phantoms）** 的问题。即一个事务改变另一个事务的搜索查询的结果。具有可串行化隔离级别的数据库必须防止 **幻读**。

在会议室预订的例子中，这意味着如果一个事务在某个时间窗口内搜索了一个房间的现有预订（见 [例 7-2](#写入偏差的更多例子)），则另一个事务不能同时插入或更新同一时间窗口与同一房间的另一个预订（可以同时插入其他房间的预订，或在不影响另一个预订的条件下预订同一房间的其他时间段）。

如何实现这一点？从概念上讲，我们需要一个 **谓词锁（predicate lock）**【3】。它类似于前面描述的共享 / 排它锁，但不属于特定的对象（例如，表中的一行），它属于所有符合某些搜索条件的对象，如：

```sql
SELECT * FROM bookings
WHERE room_id = 123 AND
      end_time > '2018-01-01 12:00' AND
      start_time < '2018-01-01 13:00';
```

谓词锁限制访问，如下所示：

- 如果事务 A 想要读取匹配某些条件的对象，就像在这个 `SELECT` 查询中那样，它必须获取查询条件上的 **共享谓词锁（shared-mode predicate lock）**。如果另一个事务 B 持有任何满足这一查询条件对象的排它锁，那么 A 必须等到 B 释放它的锁之后才允许进行查询。
- 如果事务 A 想要插入，更新或删除任何对象，则必须首先检查旧值或新值是否与任何现有的谓词锁匹配。如果事务 B 持有匹配的谓词锁，那么 A 必须等到 B 已经提交或中止后才能继续。

这里的关键思想是，谓词锁甚至适用于数据库中尚不存在，但将来可能会添加的对象（幻象）。如果两阶段锁定包含谓词锁，则数据库将阻止所有形式的写入偏差和其他竞争条件，因此其隔离实现了可串行化。

#### 索引范围锁

不幸的是谓词锁性能不佳：**如果活跃事务持有很多锁，检查匹配的锁会非常耗时。** 因此，大多数使用 2PL 的数据库实际上实现了索引范围锁（index-range locking，也称为 **next-key locking**），这是一个简化的近似版谓词锁【41,50】。

通过使谓词匹配到一个更大的集合来简化谓词锁是安全的。例如，如果你有在中午和下午 1 点之间预订 123 号房间的谓词锁，则锁定 123 号房间的所有时间段，或者锁定 12:00~13:00 时间段的所有房间（不只是 123 号房间）是一个安全的近似，因为任何满足原始谓词的写入也一定会满足这种更松散的近似。

在房间预订数据库中，你可能会在 `room_id` 列上有一个索引，并且 / 或者在 `start_time` 和 `end_time` 上有索引（否则前面的查询在大型数据库上的速度会非常慢）：

- 假设你的索引位于 `room_id` 上，并且数据库使用此索引查找 123 号房间的现有预订。现在数据库可以简单地将共享锁附加到这个索引项上，指示事务已搜索 123 号房间用于预订。
- 或者，如果数据库使用基于时间的索引来查找现有预订，那么它可以将共享锁附加到该索引中的一系列值，指示事务已经将 12:00~13:00 时间段标记为用于预定。

无论哪种方式，搜索条件的近似值都附加到其中一个索引上。现在，如果另一个事务想要插入、更新或删除同一个房间和 / 或重叠时间段的预订，则它将不得不更新索引的相同部分。在这样做的过程中，它会遇到共享锁，它将被迫等到锁被释放。

这种方法能够有效防止幻读和写入偏差。索引范围锁并不像谓词锁那样精确（它们可能会锁定更大范围的对象，而不是维持可串行化所必需的范围），但是由于它们的开销较低，所以是一个很好的折衷。

如果没有可以挂载范围锁的索引，数据库可以退化到使用整个表上的共享锁。这对性能不利，因为它会阻止所有其他事务写入表格，但这是一个安全的回退位置。


### 可串行化快照隔离

本章描绘了数据库中并发控制的黯淡画面。一方面，我们有性能不好（2PL）或者伸缩性不好（串行执行）的可串行化隔离级别实现。另一方面，我们有性能良好的弱隔离级别，但容易出现各种竞争条件（丢失更新、写入偏差、幻读等）。可串行化的隔离级别和高性能是从根本上相互矛盾的吗？

也许不是：一个称为 **可串行化快照隔离（SSI, serializable snapshot isolation）** 的算法是非常有前途的。它提供了完整的可串行化隔离级别，但与快照隔离相比只有很小的性能损失。SSI 是相当新的：它在 2008 年首次被描述【40】，并且是 Michael Cahill 的博士论文【51】的主题。

今天，SSI 既用于单节点数据库（PostgreSQL 9.1 以后的可串行化隔离级别），也用于分布式数据库（FoundationDB 使用类似的算法）。由于 SSI 与其他并发控制机制相比还很年轻，还处于在实践中证明自己表现的阶段。但它有可能因为足够快而在未来成为新的默认选项。

#### 悲观与乐观的并发控制

两阶段锁是一种所谓的 **悲观并发控制机制（pessimistic）** ：它是基于这样的原则：如果有事情可能出错（如另一个事务所持有的锁所表示的），最好等到情况安全后再做任何事情。这就像互斥，用于保护多线程编程中的数据结构。

从某种意义上说，串行执行可以称为悲观到了极致：在事务持续期间，每个事务对整个数据库（或数据库的一个分区）具有排它锁，作为对悲观的补偿，我们让每笔事务执行得非常快，所以只需要短时间持有 “锁”。

相比之下，**可串行化快照隔离** 是一种 **乐观（optimistic）** 的并发控制技术。在这种情况下，乐观意味着，如果存在潜在的危险也不阻止事务，而是继续执行事务，希望一切都会好起来。当一个事务想要提交时，数据库检查是否有什么不好的事情发生（即隔离是否被违反）；如果是的话，事务将被中止，并且必须重试。只有可串行化的事务才被允许提交。

乐观并发控制是一个古老的想法【52】，其优点和缺点已经争论了很长时间【53】。如果存在很多 **争用**（contention，即很多事务试图访问相同的对象），则表现不佳，因为这会导致很大一部分事务需要中止。如果系统已经接近最大吞吐量，来自重试事务的额外负载可能会使性能变差。

但是，如果有足够的空闲容量，并且事务之间的争用不是太高，乐观的并发控制技术往往比悲观的性能要好。可交换的原子操作可以减少争用：例如，如果多个事务同时要增加一个计数器，那么应用增量的顺序（只要计数器不在同一个事务中读取）就无关紧要了，所以并发增量可以全部应用且不会有冲突。

顾名思义，SSI 基于快照隔离 —— 也就是说，事务中的所有读取都是来自数据库的一致性快照（请参阅 “[快照隔离和可重复读](#快照隔离和可重复读)”）。与早期的乐观并发控制技术相比，这是主要的区别。在快照隔离的基础上，SSI 添加了一种算法来检测写入之间的串行化冲突，并确定要中止哪些事务。

#### 基于过时前提的决策

先前讨论了快照隔离中的写入偏差（请参阅 “[写入偏差与幻读](#写入偏差与幻读)”）时，我们观察到一个循环模式：事务从数据库读取一些数据，检查查询的结果，并根据它看到的结果决定采取一些操作（写入数据库）。但是，在快照隔离的情况下，原始查询的结果在事务提交时可能不再是最新的，因为数据可能在同一时间被修改。

换句话说，事务基于一个 **前提（premise）** 采取行动（事务开始时候的事实，例如：“目前有两名医生正在值班”）。之后当事务要提交时，原始数据可能已经改变 —— 前提可能不再成立。

当应用程序进行查询时（例如，“当前有多少医生正在值班？”），数据库不知道应用逻辑如何使用该查询结果。在这种情况下为了安全，数据库需要假设任何对该结果集的变更都可能会使该事务中的写入变得无效。换而言之，事务中的查询与写入可能存在因果依赖。为了提供可串行化的隔离级别，如果事务在过时的前提下执行操作，数据库必须能检测到这种情况，并中止事务。

数据库如何知道查询结果是否可能已经改变？有两种情况需要考虑：

- 检测对旧 MVCC 对象版本的读取（读之前存在未提交的写入）
- 检测影响先前读取的写入（读之后发生写入）

#### 检测旧MVCC读取

回想一下，快照隔离通常是通过多版本并发控制（MVCC；见 [图 7-10](/v1/ddia_0710.png)）来实现的。当一个事务从 MVCC 数据库中的一致快照读时，它将忽略取快照时尚未提交的任何其他事务所做的写入。在 [图 7-10](/v1/ddia_0710.png) 中，事务 43 认为 Alice 的 `on_call = true` ，因为事务 42（修改 Alice 的待命状态）未被提交。然而，在事务 43 想要提交时，事务 42 已经提交。这意味着在读一致性快照时被忽略的写入已经生效，事务 43 的前提不再为真。

![](/v1/ddia_0710.png)

**图 7-10 检测事务何时从 MVCC 快照读取过时的值**

为了防止这种异常，数据库需要跟踪一个事务由于 MVCC 可见性规则而忽略另一个事务的写入。当事务想要提交时，数据库检查是否有任何被忽略的写入现在已经被提交。如果是这样，事务必须中止。

为什么要等到提交？当检测到陈旧的读取时，为什么不立即中止事务 43 ？因为如果事务 43 是只读事务，则不需要中止，因为没有写入偏差的风险。当事务 43 进行读取时，数据库还不知道事务是否要稍后执行写操作。此外，事务 42 可能在事务 43 被提交的时候中止或者可能仍然未被提交，因此读取可能终究不是陈旧的。通过避免不必要的中止，SSI 保留了快照隔离从一致快照中长时间读取的能力。

#### 检测影响之前读取的写入

第二种情况要考虑的是另一个事务在读取数据之后修改数据。这种情况如 [图 7-11](/v1/ddia_0711.png) 所示。

![](/v1/ddia_0711.png)

**图 7-11 在可串行化快照隔离中，检测一个事务何时修改另一个事务的读取。**

在两阶段锁定的上下文中，我们讨论了索引范围锁（请参阅 “[索引范围锁](#索引范围锁)”），它允许数据库锁定与某个搜索查询匹配的所有行的访问权，例如 `WHERE shift_id = 1234`。可以在这里使用类似的技术，除了 SSI 锁不会阻塞其他事务。

在 [图 7-11](/v1/ddia_0711.png) 中，事务 42 和 43 都在班次 1234 查找值班医生。如果在 `shift_id` 上有索引，则数据库可以使用索引项 1234 来记录事务 42 和 43 读取这个数据的事实。（如果没有索引，这个信息可以在表级别进行跟踪）。这个信息只需要保留一段时间：在一个事务完成（提交或中止），并且所有的并发事务完成之后，数据库就可以忘记它读取的数据了。

当事务写入数据库时，它必须在索引中查找最近曾读取受影响数据的其他事务。这个过程类似于在受影响的键范围上获取写锁，但锁并不会阻塞事务直到其他读事务完成，而是像警戒线一样只是简单通知其他事务：你们读过的数据可能不是最新的啦。

在 [图 7-11](/v1/ddia_0711.png) 中，事务 42 通知事务 43 其先前读已过时，反之亦然。事务 42 首先提交并成功，尽管事务 43 的写影响了 42 ，但因为事务 43 尚未提交，所以写入尚未生效。然而当事务 43 想要提交时，来自事务 42 的冲突写入已经被提交，所以事务 43 必须中止。

#### 可串行化快照隔离的性能

与往常一样，许多工程细节会影响算法的实际表现。例如一个权衡是跟踪事务的读取和写入的 **粒度（granularity）**。如果数据库详细地跟踪每个事务的活动（细粒度），那么可以准确地确定哪些事务需要中止，但是簿记开销可能变得很显著。简略的跟踪速度更快（粗粒度），但可能会导致更多不必要的事务中止。

在某些情况下，事务可以读取被另一个事务覆盖的信息：这取决于发生了什么，有时可以证明执行结果无论如何都是可串行化的。PostgreSQL 使用这个理论来减少不必要的中止次数【11,41】。

与两阶段锁定相比，可串行化快照隔离的最大优点是一个事务不需要阻塞等待另一个事务所持有的锁。就像在快照隔离下一样，写不会阻塞读，反之亦然。这种设计原则使得查询延迟更可预测，波动更少。特别是，只读查询可以运行在一致快照上，而不需要任何锁定，这对于读取繁重的工作负载非常有吸引力。

与串行执行相比，可串行化快照隔离并不局限于单个 CPU 核的吞吐量：FoundationDB 将串行化冲突的检测分布在多台机器上，允许扩展到很高的吞吐量。即使数据可能跨多台机器进行分区，事务也可以在保证可串行化隔离等级的同时读写多个分区中的数据【54】。

中止率显著影响 SSI 的整体表现。例如，长时间读取和写入数据的事务很可能会发生冲突并中止，因此 SSI 要求同时读写的事务尽量短（只读的长事务可能没问题）。对于慢事务，SSI 可能比两阶段锁定或串行执行更不敏感。


## 本章小结

事务是一个抽象层，允许应用程序假装某些并发问题和某些类型的硬件和软件故障不存在。各式各样的错误被简化为一种简单情况：**事务中止（transaction abort）**，而应用需要的仅仅是重试。

在本章中介绍了很多问题，事务有助于防止这些问题发生。并非所有应用都易受此类问题影响：具有非常简单访问模式的应用（例如每次读写单条记录）可能无需事务管理。但是对于更复杂的访问模式，事务可以大大减少需要考虑的潜在错误情景数量。

如果没有事务处理，各种错误情况（进程崩溃、网络中断、停电、磁盘已满、意外并发等）意味着数据可能以各种方式变得不一致。例如，非规范化的数据可能很容易与源数据不同步。如果没有事务处理，就很难推断复杂的交互访问可能对数据库造成的影响。

本章深入讨论了 **并发控制** 的话题。我们讨论了几个广泛使用的隔离级别，特别是 **读已提交**、**快照隔离**（有时称为可重复读）和 **可串行化**。并通过研究竞争条件的各种例子，来描述这些隔离等级：

脏读
: 一个客户端读取到另一个客户端尚未提交的写入。**读已提交** 或更强的隔离级别可以防止脏读。

脏写
: 一个客户端覆盖写入了另一个客户端尚未提交的写入。几乎所有的事务实现都可以防止脏写。

读偏差（不可重复读）
: 在同一个事务中，客户端在不同的时间点会看见数据库的不同状态。**快照隔离** 经常用于解决这个问题，它允许事务从一个特定时间点的一致性快照中读取数据。快照隔离通常使用 **多版本并发控制（MVCC）** 来实现。

丢失更新
: 两个客户端同时执行 **读取 - 修改 - 写入序列**。其中一个写操作，在没有合并另一个写入变更情况下，直接覆盖了另一个写操作的结果。所以导致数据丢失。快照隔离的一些实现可以自动防止这种异常，而另一些实现则需要手动锁定（`SELECT FOR UPDATE`）。

写偏差
: 一个事务读取一些东西，根据它所看到的值作出决定，并将该决定写入数据库。但是，写入时，该决定的前提不再是真实的。只有可串行化的隔离才能防止这种异常。

幻读
: 事务读取符合某些搜索条件的对象。另一个客户端进行写入，影响搜索结果。快照隔离可以防止直接的幻像读取，但是写入偏差上下文中的幻读需要特殊处理，例如索引范围锁定。

弱隔离级别可以防止其中一些异常情况，但要求你，也就是应用程序开发人员手动处理剩余那些（例如，使用显式锁定）。只有可串行化的隔离才能防范所有这些问题。我们讨论了实现可串行化事务的三种不同方法：

字面意义上的串行执行
: 如果每个事务的执行速度非常快，并且事务吞吐量足够低，足以在单个 CPU 核上处理，这是一个简单而有效的选择。

两阶段锁定
: 数十年来，两阶段锁定一直是实现可串行化的标准方式，但是许多应用出于性能问题的考虑避免使用它。

可串行化快照隔离（SSI）
: 一个相当新的算法，避免了先前方法的大部分缺点。它使用乐观的方法，允许事务执行而无需阻塞。当一个事务想要提交时，它会进行检查，如果执行不可串行化，事务就会被中止。

本章中的示例主要是在关系数据模型的上下文中。但是，正如在 **“[多对象事务的需求](#多对象事务的需求)”** 中所讨论的，无论使用哪种数据模型，事务都是有价值的数据库功能。

本章主要是在单机数据库的上下文中，探讨了各种想法和算法。分布式数据库中的事务，则引入了一系列新的困难挑战，我们将在接下来的两章中讨论。


## 参考文献

1. Donald D. Chamberlin, Morton M. Astrahan, Michael W. Blasgen, et al.: “[A History and Evaluation of System R](https://citeseerx.ist.psu.edu/pdf/ebb29a0ca16e04e7eeb6b606b22a9eadb3a9d531),” *Communications of the ACM*, volume 24, number 10, pages 632–646, October 1981. [doi:10.1145/358769.358784](http://dx.doi.org/10.1145/358769.358784)
1. Jim N. Gray, Raymond A. Lorie, Gianfranco R. Putzolu, and Irving L. Traiger: “[Granularity of Locks and Degrees of Consistency in a Shared Data Base](https://citeseerx.ist.psu.edu/pdf/e127f0a6a912bb9150ecfe03c0ebf7fbc289a023),” in *Modelling in Data Base Management Systems: Proceedings of the IFIP Working Conference on Modelling in Data Base Management Systems*, edited by G. M. Nijssen, pages 364–394, Elsevier/North Holland Publishing, 1976. Also in *Readings in Database Systems*, 4th edition, edited by Joseph M. Hellerstein and Michael Stonebraker, MIT Press, 2005. ISBN: 978-0-262-69314-1
1. Kapali P. Eswaran, Jim N. Gray, Raymond A. Lorie, and Irving L. Traiger: “[The Notions of Consistency and Predicate Locks in a Database System](http://research.microsoft.com/en-us/um/people/gray/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf),” *Communications of the ACM*, volume 19, number 11, pages 624–633, November 1976.
1. “[ACID Transactions Are Incredibly Helpful](http://web.archive.org/web/20150320053809/https://foundationdb.com/acid-claims),” FoundationDB, LLC, 2013.
1. John D. Cook: “[ACID Versus BASE for Database Transactions](http://www.johndcook.com/blog/2009/07/06/brewer-cap-theorem-base/),” *johndcook.com*, July 6, 2009.
1. Gavin Clarke: “[NoSQL's CAP Theorem Busters: We Don't Drop ACID](http://www.theregister.co.uk/2012/11/22/foundationdb_fear_of_cap_theorem/),” *theregister.co.uk*, November 22, 2012.
1. Theo Härder and Andreas Reuter: “[Principles of Transaction-Oriented Database Recovery](https://citeseerx.ist.psu.edu/pdf/11ef7c142295aeb1a28a0e714c91fc8d610c3047),” *ACM Computing Surveys*, volume 15, number 4, pages 287–317, December 1983. [doi:10.1145/289.291](http://dx.doi.org/10.1145/289.291)
1. Peter Bailis, Alan Fekete, Ali Ghodsi, et al.: “[HAT, not CAP: Towards Highly Available Transactions](http://www.bailis.org/papers/hat-hotos2013.pdf),” at *14th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2013.
1. Armando Fox, Steven D. Gribble, Yatin Chawathe, et al.: “[Cluster-Based Scalable Network Services](https://people.eecs.berkeley.edu/~brewer/cs262b/TACC.pdf),” at *16th ACM Symposium on Operating Systems Principles* (SOSP), October 1997.
1. Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman: [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at *research.microsoft.com*.
1. Alan Fekete, Dimitrios Liarokapis, Elizabeth O'Neil, et al.: “[Making Snapshot Isolation Serializable](https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/2009/Papers/p492-fekete.pdf),” *ACM Transactions on Database Systems*, volume 30, number 2, pages 492–528, June 2005. [doi:10.1145/1071610.1071615](http://dx.doi.org/10.1145/1071610.1071615)
1. Mai Zheng, Joseph Tucek, Feng Qin, and Mark Lillibridge: “[Understanding the Robustness of SSDs Under Power Fault](https://www.usenix.org/system/files/conference/fast13/fast13-final80.pdf),” at *11th USENIX Conference on File and Storage Technologies* (FAST), February 2013.
1. Laurie Denness: “[SSDs: A Gift and a Curse](https://laur.ie/blog/2015/06/ssds-a-gift-and-a-curse/),” *laur.ie*, June 2, 2015.
1. Adam Surak: “[When Solid State Drives Are Not That Solid](https://blog.algolia.com/when-solid-state-drives-are-not-that-solid/),” *blog.algolia.com*, June 15, 2015.
1. Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, et al.: “[All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](http://research.cs.wisc.edu/wind/Publications/alice-osdi14.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Chris Siebenmann: “[Unix's File Durability Problem](https://utcc.utoronto.ca/~cks/space/blog/unix/FileSyncProblem),” *utcc.utoronto.ca*, April 14, 2016.
1. Lakshmi N. Bairavasundaram, Garth R. Goodson, Bianca Schroeder, et al.: “[An Analysis of Data Corruption in the Storage Stack](http://research.cs.wisc.edu/adsl/Publications/corruption-fast08.pdf),” at *6th USENIX Conference on File and Storage Technologies* (FAST), February 2008.
1. Bianca Schroeder, Raghav Lagisetty, and Arif Merchant: “[Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/conference/fast16/technical-sessions/presentation/schroeder),” at *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
1. Don Allison: “[SSD Storage – Ignorance of Technology Is No Excuse](https://blog.korelogic.com/blog/2015/03/24),” *blog.korelogic.com*, March 24, 2015.
1. Dave Scherer: “[Those Are Not Transactions (Cassandra 2.0)](http://web.archive.org/web/20150526065247/http://blog.foundationdb.com/those-are-not-transactions-cassandra-2-0),” *blog.foundationdb.com*, September 6, 2013.
1. Kyle Kingsbury: “[Call Me Maybe: Cassandra](http://aphyr.com/posts/294-call-me-maybe-cassandra/),” *aphyr.com*, September 24, 2013.
1. “[ACID Support in Aerospike](https://web.archive.org/web/20170305002118/https://www.aerospike.com/docs/architecture/assets/AerospikeACIDSupport.pdf),” Aerospike, Inc., June 2014.
1. Martin Kleppmann: “[Hermitage: Testing the 'I' in ACID](http://martin.kleppmann.com/2014/11/25/hermitage-testing-the-i-in-acid.html),” *martin.kleppmann.com*, November 25, 2014.
1. Tristan D'Agosta: “[BTC Stolen from Poloniex](https://bitcointalk.org/index.php?topic=499580),” *bitcointalk.org*, March 4, 2014.
1. bitcointhief2: “[How I Stole Roughly 100 BTC from an Exchange and How I Could Have Stolen More!](http://www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/),” *reddit.com*, February 2, 2014.
1. Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. Michael Melanson: “[Transactions: The Limits of Isolation](https://www.michaelmelanson.net/posts/transactions-the-limits-of-isolation/),” *michaelmelanson.net*, November 30, 2014.
1. Hal Berenson, Philip A. Bernstein, Jim N. Gray, et al.: “[A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1995.
1. Atul Adya: “[Weak Consistency: A Generalized Theory and Optimistic Implementations for Distributed Transactions](http://pmg.csail.mit.edu/papers/adya-phd.pdf),” PhD Thesis, Massachusetts Institute of Technology, March 1999.
1. Peter Bailis, Aaron Davidson, Alan Fekete, et al.: “[Highly Available Transactions: Virtues and Limitations (Extended Version)](http://arxiv.org/pdf/1302.0309.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
1. Bruce Momjian: “[MVCC Unmasked](http://momjian.us/main/presentations/internals.html#mvcc),” *momjian.us*, July 2014.
1. Annamalai Gurusami: “[Repeatable Read Isolation Level in InnoDB – How Consistent Read View Works](https://web.archive.org/web/20161225080947/https://blogs.oracle.com/mysqlinnodb/entry/repeatable_read_isolation_level_in),” *blogs.oracle.com*, January 15, 2013.
1. Nikita Prokopov: “[Unofficial Guide to Datomic Internals](http://tonsky.me/blog/unofficial-guide-to-datomic-internals/),” *tonsky.me*, May 6, 2014.
1. Baron Schwartz: “[Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20220122020806/https://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/),” *xaprb.com*, December 28, 2013.
1. J. Chris Anderson, Jan Lehnardt, and Noah Slater: *CouchDB: The Definitive Guide*. O'Reilly Media, 2010. ISBN: 978-0-596-15589-6
1. Rikdeb Mukherjee: “[Isolation in DB2 (Repeatable Read, Read Stability, Cursor Stability, Uncommitted Read) with Examples](http://mframes.blogspot.co.uk/2013/07/isolation-in-cursor.html),” *mframes.blogspot.co.uk*, July 4, 2013.
1. Steve Hilker: “[Cursor Stability (CS) – IBM DB2 Community](https://web.archive.org/web/20150420001721/http://www.toadworld.com/platforms/ibmdb2/w/wiki/6661.cursor-stability-cs.aspx),” *toadworld.com*, March 14, 2013.
1. Nate Wiger: “[An Atomic Rant](https://nateware.com/2010/02/18/an-atomic-rant/),” *nateware.com*, February 18, 2010.
1. Joel Jacobson: “[Riak 2.0: Data Types](https://web.archive.org/web/20160327135816/http://blog.joeljacobson.com/riak-2-0-data-types/),” *blog.joeljacobson.com*, March 23, 2014.
1. Michael J. Cahill, Uwe Röhm, and Alan Fekete: “[Serializable Isolation for Snapshot Databases](https://web.archive.org/web/20200709144151/https://cs.nyu.edu/courses/Fall12/CSCI-GA.2434-001/p729-cahill.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376690](http://dx.doi.org/10.1145/1376616.1376690)
1. Dan R. K. Ports and Kevin Grittner: “[Serializable Snapshot Isolation in PostgreSQL](http://drkp.net/papers/ssi-vldb12.pdf),” at *38th International Conference on Very Large Databases* (VLDB), August 2012.
1. Tony Andrews: “[Enforcing Complex Constraints in Oracle](http://tonyandrews.blogspot.co.uk/2004/10/enforcing-complex-constraints-in.html),” *tonyandrews.blogspot.co.uk*, October 15, 2004.
1. Douglas B. Terry, Marvin M. Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](https://citeseerx.ist.psu.edu/pdf/20c450f099b661c5a2dff3f348773a0d1af1b09b),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
1. Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
1. Michael Stonebraker, Samuel Madden, Daniel J. Abadi, et al.: “[The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://citeseerx.ist.psu.edu/pdf/775d54c66d271028a7d4dadf07cce6f918584cd3),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. John Hugg: “[H-Store/VoltDB Architecture vs. CEP Systems and Newer Streaming Architectures](https://www.youtube.com/watch?v=hD5M4a1UVz8),” at *Data @Scale Boston*, November 2014.
1. Robert Kallman, Hideaki Kimura, Jonathan Natkins, et al.: “[H-Store: A High-Performance, Distributed Main Memory Transaction Processing System](http://www.vldb.org/pvldb/vol1/1454211.pdf),” *Proceedings of the VLDB Endowment*, volume 1, number 2, pages 1496–1499, August 2008.
1. Rich Hickey: “[The Architecture of Datomic](http://www.infoq.com/articles/Architecture-Datomic),” *infoq.com*, November 2, 2012.
1. John Hugg: “[Debunking Myths About the VoltDB In-Memory Database](https://dzone.com/articles/debunking-myths-about-voltdb),” *dzone.com*, May 28, 2014.
1. Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton: “[Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf),” *Foundations and Trends in Databases*, volume 1, number 2, pages 141–259, November 2007. [doi:10.1561/1900000002](http://dx.doi.org/10.1561/1900000002)
1. Michael J. Cahill: “[Serializable Isolation for Snapshot Databases](https://ses.library.usyd.edu.au/bitstream/handle/2123/5353/michael-cahill-2009-thesis.pdf),” PhD Thesis, University of Sydney, July 2009.
1. D. Z. Badal: “[Correctness of Concurrency Control and Implications in Distributed Databases](http://ieeexplore.ieee.org/abstract/document/762563/),” at *3rd International IEEE Computer Software and Applications Conference* (COMPSAC), November 1979.
1. Rakesh Agrawal, Michael J. Carey, and Miron Livny: “[Concurrency Control Performance Modeling: Alternatives and Implications](http://www.eecs.berkeley.edu/~brewer/cs262/ConcControl.pdf),” *ACM Transactions on Database Systems* (TODS), volume 12, number 4, pages 609–654, December 1987. [doi:10.1145/32204.32220](http://dx.doi.org/10.1145/32204.32220)
1. Dave Rosenthal: “[Databases at 14.4MHz](http://web.archive.org/web/20150427041746/http://blog.foundationdb.com/databases-at-14.4mhz),” *blog.foundationdb.com*, December 10, 2014.


================================================
FILE: content/v1/ch8.md
================================================
---
title: "第八章：分布式系统的麻烦"
linkTitle: "8. 分布式系统的麻烦"
weight: 208
math: true
breadcrumbs: false
---


![](/map/ch08.png)

> 邂逅相遇
>
> 网络延迟
>
> 存之为吾
>
> 无食我数
>
> —— Kyle Kingsbury, Carly Rae Jepsen 《网络分区的危害》（2013 年）[^译著1]


最近几章中反复出现的主题是，系统如何处理错误的事情。例如，我们讨论了 **副本故障切换**（“[处理节点宕机](/v1/ch5#处理节点宕机)”），**复制延迟**（“[复制延迟问题](/v1/ch5#复制延迟问题)”）和事务控制（“[弱隔离级别](/v1/ch7#弱隔离级别)”）。当我们了解可能在实际系统中出现的各种边缘情况时，我们会更好地处理它们。

但是，尽管我们已经谈了很多错误，但之前几章仍然过于乐观。现实更加黑暗。我们现在将悲观主义最大化，假设任何可能出错的东西 **都会** 出错 [^i]。（经验丰富的系统运维会告诉你，这是一个合理的假设。如果你问得好，他们可能会一边治疗心理创伤一边告诉你一些可怕的故事）

[^i]: 除了一个例外：我们将假定故障是非拜占庭式的（请参阅 “[拜占庭故障](#拜占庭故障)”）。

使用分布式系统与在一台计算机上编写软件有着根本的区别，主要的区别在于，有许多新颖和刺激的方法可以使事情出错【1,2】。在这一章中，我们将了解实践中出现的问题，理解我们能够依赖，和不可以依赖的东西。

最后，作为工程师，我们的任务是构建能够完成工作的系统（即满足用户期望的保证），尽管一切都出错了。在 [第九章](/v1/ch9) 中，我们将看看一些可以在分布式系统中提供这种保证的算法的例子。但首先，在本章中，我们必须了解我们面临的挑战。

本章对分布式系统中可能出现的问题进行彻底的悲观和沮丧的总结。我们将研究网络的问题（“[不可靠的网络](#不可靠的网络)”）; 时钟和时序问题（“[不可靠的时钟](#不可靠的时钟)”）; 我们将讨论它们可以避免的程度。所有这些问题的后果都令人困惑，所以我们将探索如何思考一个分布式系统的状态，以及如何推理发生的事情（“[知识、真相与谎言](#知识、真相与谎言)”）。


## 故障与部分失效

当你在一台计算机上编写一个程序时，它通常会以一种相当可预测的方式运行：无论是工作还是不工作。充满错误的软件可能会让人觉得电脑有时候也会有 “糟糕的一天”（这种问题通常是重新启动就恢复了），但这主要是软件写得不好的结果。

单个计算机上的软件没有根本性的不可靠原因：当硬件正常工作时，相同的操作总是产生相同的结果（这是确定性的）。如果存在硬件问题（例如，内存损坏或连接器松动），其后果通常是整个系统故障（例如，内核恐慌，“蓝屏死机”，启动失败）。装有良好软件的个人计算机通常要么功能完好，要么完全失效，而不是介于两者之间。

这是计算机设计中的一个有意的选择：如果发生内部错误，我们宁愿电脑完全崩溃，而不是返回错误的结果，因为错误的结果很难处理。因此，计算机隐藏了模糊不清的物理实现，并呈现出一个理想化的系统模型，以数学一样的完美的方式运作。CPU 指令总是做同样的事情；如果你将一些数据写入内存或磁盘，那么这些数据将保持不变，并且不会被随机破坏。从第一台数字计算机开始，*始终正确地计算* 这个设计目标贯穿始终【3】。

当你编写运行在多台计算机上的软件时，情况有本质上的区别。在分布式系统中，我们不再处于理想化的系统模型中，我们别无选择，只能面对现实世界的混乱现实。而在现实世界中，各种各样的事情都可能会出现问题【4】，如下面的轶事所述：

> 在我有限的从业经历中，我已经和很多东西打过交道：单个 **数据中心（DC）** 中长期存在的网络分区，配电单元 PDU 故障，交换机故障，整个机架的意外重启，整个数据中心主干网络故障，整个数据中心的电源故障，以及一个低血糖的司机把他的福特皮卡撞在数据中心的 HVAC（加热，通风和空调）系统上。而且我甚至不是一个运维。
>
> —— 柯达黑尔

在分布式系统中，尽管系统的其他部分工作正常，但系统的某些部分可能会以某种不可预知的方式被破坏。这被称为 **部分失效（partial failure）**。难点在于部分失效是 **不确定性的（nondeterministic）**：如果你试图做任何涉及多个节点和网络的事情，它有时可能会工作，有时会出现不可预知的失败。正如我们将要看到的，你甚至不知道是否成功了，因为消息通过网络传播的时间也是不确定的！

这种不确定性和部分失效的可能性，使得分布式系统难以工作【5】。

### 云计算与超级计算机

关于如何构建大型计算系统有一系列的哲学：

* 一个极端是高性能计算（HPC）领域。具有数千个 CPU 的超级计算机通常用于计算密集型科学计算任务，如天气预报或分子动力学（模拟原子和分子的运动）。
* 另一个极端是 **云计算（cloud computing）**，云计算并不是一个良好定义的概念【6】，但通常与多租户数据中心，连接 IP 网络（通常是以太网）的商用计算机，弹性 / 按需资源分配以及计量计费等相关联。
* 传统企业数据中心位于这两个极端之间。

不同的哲学会导致不同的故障处理方式。在超级计算机中，作业通常会不时地将计算的状态存盘到持久存储中。如果一个节点出现故障，通常的解决方案是简单地停止整个集群的工作负载。故障节点修复后，计算从上一个检查点重新开始【7,8】。因此，超级计算机更像是一个单节点计算机而不是分布式系统：通过让部分失败升级为完全失败来处理部分失败 —— 如果系统的任何部分发生故障，只是让所有的东西都崩溃（就像单台机器上的内核恐慌一样）。

在本书中，我们将重点放在实现互联网服务的系统上，这些系统通常与超级计算机看起来有很大不同：

* 许多与互联网有关的应用程序都是 **在线（online）** 的，因为它们需要能够随时以低延迟服务用户。使服务不可用（例如，停止集群以进行修复）是不可接受的。相比之下，像天气模拟这样的离线（批处理）工作可以停止并重新启动，影响相当小。

* 超级计算机通常由专用硬件构建而成，每个节点相当可靠，节点通过共享内存和 **远程直接内存访问（RDMA）** 进行通信。另一方面，云服务中的节点是由商用机器构建而成的，由于规模经济，可以以较低的成本提供相同的性能，但也具有较高的故障率。

* 大型数据中心网络通常基于 IP 和以太网，以 CLOS 拓扑排列，以提供更高的对分（bisection）带宽【9】。超级计算机通常使用专门的网络拓扑结构，例如多维网格和 Torus 网络 【10】，这为具有已知通信模式的 HPC 工作负载提供了更好的性能。

* 系统越大，其组件之一就越有可能坏掉。随着时间的推移，坏掉的东西得到修复，新的东西又坏掉，但是在一个有成千上万个节点的系统中，有理由认为总是有一些东西是坏掉的【7】。当错误处理的策略只由简单放弃组成时，一个大的系统最终会花费大量时间从错误中恢复，而不是做有用的工作【8】。

* 如果系统可以容忍发生故障的节点，并继续保持整体工作状态，那么这对于运营和维护非常有用：例如，可以执行滚动升级（请参阅 [第四章](/v1/ch4)），一次重新启动一个节点，同时继续给用户提供不中断的服务。在云环境中，如果一台虚拟机运行不佳，可以杀死它并请求一台新的虚拟机（希望新的虚拟机速度更快）。

* 在地理位置分散的部署中（保持数据在地理位置上接近用户以减少访问延迟），通信很可能通过互联网进行，与本地网络相比，通信速度缓慢且不可靠。超级计算机通常假设它们的所有节点都靠近在一起。

如果要使分布式系统工作，就必须接受部分故障的可能性，并在软件中建立容错机制。换句话说，我们需要从不可靠的组件构建一个可靠的系统（正如 “[可靠性](/v1/ch1#可靠性)” 中所讨论的那样，没有完美的可靠性，所以我们需要理解我们可以实际承诺的极限）。

即使在只有少数节点的小型系统中，考虑部分故障也是很重要的。在一个小系统中，很可能大部分组件在大部分时间都正常工作。然而，迟早会有一部分系统出现故障，软件必须以某种方式处理。故障处理必须是软件设计的一部分，并且作为软件的运维，你需要知道在发生故障的情况下，软件可能会表现出怎样的行为。

简单地假设缺陷很罕见并希望始终保持最好的状况是不明智的。考虑一系列可能的错误（甚至是不太可能的错误），并在测试环境中人为地创建这些情况来查看会发生什么是非常重要的。在分布式系统中，怀疑，悲观和偏执狂是值得的。

> #### 从不可靠的组件构建可靠的系统
>
> 你可能想知道这是否有意义 —— 直观地看来，系统只能像其最不可靠的组件（最薄弱的环节）一样可靠。事实并非如此：事实上，从不太可靠的潜在基础构建更可靠的系统是计算机领域的一个古老思想【11】。例如：
>
> * 纠错码允许数字数据在偶尔会出现一些比特错误的通信信道上准确传输，例如由于无线网络上的无线电干扰而出错的信道【12】。
> * **互联网协议（Internet Protocol, IP）** 不可靠：可能丢弃、延迟、重复或重排数据包。传输控制协议（Transmission Control Protocol, TCP）在互联网协议（IP）之上提供了更可靠的传输层：它确保丢失的数据包被重新传输，消除重复，并且数据包被重新组装成它们被发送的顺序。
>
> 虽然这个系统可以比它的底层部分更可靠，但它的可靠性总是有限的。例如，纠错码可以处理少量的单比特错误，但是如果你的信号被干扰所淹没，那么通过信道可以得到多少数据，是有根本性的限制的【13】。TCP 可以隐藏数据包的丢失，重复和重新排序，但是它不能神奇地消除网络中的延迟。
>
> 虽然更可靠的高级系统并不完美，但它仍然有用，因为它处理了一些棘手的低级错误，所以其余的错误通常更容易推理和处理。我们将在 “[数据库的端到端原则](/v1/ch12#数据库的端到端原则)” 中进一步探讨这个问题。


## 不可靠的网络

正如在 [第二部分](/v1/part-ii) 的介绍中所讨论的那样，我们在本书中关注的分布式系统是无共享的系统，即通过网络连接的一堆机器。网络是这些机器可以通信的唯一途径 —— 我们假设每台机器都有自己的内存和磁盘，一台机器不能访问另一台机器的内存或磁盘（除了通过网络向服务器发出请求）。

**无共享** 并不是构建系统的唯一方式，但它已经成为构建互联网服务的主要方式，其原因如下：相对便宜，因为它不需要特殊的硬件，可以利用商品化的云计算服务，通过跨多个地理分布的数据中心进行冗余可以实现高可靠性。

互联网和数据中心（通常是以太网）中的大多数内部网络都是 **异步分组网络（asynchronous packet networks）**。在这种网络中，一个节点可以向另一个节点发送一个消息（一个数据包），但是网络不能保证它什么时候到达，或者是否到达。如果你发送请求并期待响应，则很多事情可能会出错（其中一些如 [图 8-1](/v1/ddia_0801.png) 所示）：

1. 请求可能已经丢失（可能有人拔掉了网线）。
2. 请求可能正在排队，稍后将交付（也许网络或接收方过载）。
3. 远程节点可能已经失效（可能是崩溃或关机）。
4. 远程节点可能暂时停止了响应（可能会遇到长时间的垃圾回收暂停；请参阅 “[进程暂停](#进程暂停)”），但稍后会再次响应。
5. 远程节点可能已经处理了请求，但是网络上的响应已经丢失（可能是网络交换机配置错误）。
6. 远程节点可能已经处理了请求，但是响应已经被延迟，并且稍后将被传递（可能是网络或者你自己的机器过载）。

![](/v1/ddia_0801.png)

**图 8-1 如果发送请求并没有得到响应，则无法区分（a）请求是否丢失，（b）远程节点是否关闭，或（c）响应是否丢失。**

发送者甚至不能分辨数据包是否已经送达：唯一的选择是让接收者发送响应消息，而响应消息本身也可能会丢失或延迟。这些问题在异步网络中难以区分：你所拥有的唯一信息是，你尚未收到响应。如果你向另一个节点发送请求并且没有收到响应，则不可能判断是什么原因。

处理这个问题的通常方法是 **超时（Timeout）**：在一段时间之后放弃等待，并且认为响应不会到达。但是，当发生超时时，你仍然不知道远程节点是否收到了请求（如果请求仍然在某个地方排队，那么即使发送者已经放弃了该请求，仍然可能会将其发送给接收者）。

### 真实世界的网络故障

我们几十年来一直在建设计算机网络 —— 有人可能希望现在我们已经找出了使网络变得可靠的方法。但是现在似乎还没有成功。

有一些系统的研究和大量的轶事证据表明，即使在像一家公司运营的数据中心那样的受控环境中，网络问题也可能出乎意料地普遍。在一家中型数据中心进行的一项研究发现，每个月大约有 12 个网络故障，其中一半断开一台机器，一半断开整个机架【15】。另一项研究测量了架顶式交换机，汇聚交换机和负载平衡器等组件的故障率【16】。它发现添加冗余网络设备不会像你所希望的那样减少故障，因为它不能防范人为错误（例如，错误配置的交换机），这是造成中断的主要原因。

诸如 EC2 之类的公有云服务因频繁的暂态网络故障而臭名昭著【14】，管理良好的私有数据中心网络可能是更稳定的环境。尽管如此，没有人不受网络问题的困扰：例如，交换机软件升级过程中的一个问题可能会引发网络拓扑重构，在此期间网络数据包可能会延迟超过一分钟【17】。鲨鱼可能咬住海底电缆并损坏它们 【18】。其他令人惊讶的故障包括网络接口有时会丢弃所有入站数据包，但是成功发送出站数据包 【19】：仅仅因为网络链接在一个方向上工作，并不能保证它也在相反的方向工作。

> #### 网络分区
>
> 当网络的一部分由于网络故障而被切断时，有时称为 **网络分区（network partition）** 或 **网络断裂（netsplit）**。在本书中，我们通常会坚持使用更一般的术语 **网络故障（network fault）**，以避免与 [第六章](/v1/ch6) 讨论的存储系统的分区（分片）相混淆。

即使网络故障在你的环境中非常罕见，故障可能发生的事实，意味着你的软件需要能够处理它们。无论何时通过网络进行通信，都可能会失败，这是无法避免的。

如果网络故障的错误处理没有经过定义与测试，那么任意糟糕的事情都可能发生：例如，即使网络恢复之后，集群也可能陷入 **死锁**，永久无法为请求提供服务【20】，甚至可能会删除所有的数据【21】。如果软件被置于意料之外的情况下，它可能会做出出乎意料的事情。

处理网络故障并不一定意味着容忍它们：如果你的网络通常是相当可靠的，一个有效的方法可能是当你的网络遇到问题时，简单地向用户显示一条错误信息。但是，你确实需要知道你的软件如何应对网络问题，并确保系统能够从中恢复。有意识地触发网络问题并测试系统的响应可能是有意义的（这是 Chaos Monkey 背后的想法；请参阅 “[可靠性](/v1/ch1#可靠性)”）。

### 检测故障

许多系统需要自动检测故障节点。例如：

* 负载平衡器需要停止向已死亡的节点转发请求（从轮询列表移出，即 out of rotation）。
* 在单主复制功能的分布式数据库中，如果主库失效，则需要将从库之一升级为新主库（请参阅 “[处理节点宕机](/v1/ch5#处理节点宕机)”）。

不幸的是，网络的不确定性使得很难判断一个节点是否工作。在某些特定的情况下，你可能会收到一些反馈信息，明确告诉你某些事情没有成功：

* 如果你可以连接到运行节点的机器，但没有进程正在侦听目标端口（例如，因为进程崩溃），操作系统将通过发送 FIN 或 RST 来关闭或拒绝 TCP 连接。但是，如果节点在处理请求时发生崩溃，则无法知道远程节点实际处理了多少数据【22】。
* 如果节点进程崩溃（或被管理员杀死），但节点的操作系统仍在运行，则脚本可以通知其他节点有关该崩溃的信息，以便另一个节点可以快速接管，而无需等待超时到期。例如，HBase 就是这么做的【23】。
* 如果你有权访问数据中心网络交换机的管理界面，则可以通过它们检测硬件级别的链路故障（例如，远程机器是否关闭电源）。如果你通过互联网连接，或者如果你处于共享数据中心而无法访问交换机，或者由于网络问题而无法访问管理界面，则排除此选项。
* 如果路由器确认你尝试连接的 IP 地址不可用，则可能会使用 ICMP 目标不可达数据包回复你。但是，路由器不具备神奇的故障检测能力 —— 它受到与网络其他参与者相同的限制。

关于远程节点关闭的快速反馈很有用，但是你不能指望它。即使 TCP 确认已经传送了一个数据包，应用程序在处理之前可能已经崩溃。如果你想确保一个请求是成功的，你需要应用程序本身的正确响应【24】。

相反，如果出了什么问题，你可能会在堆栈的某个层次上得到一个错误响应，但总的来说，你必须假设你可能根本就得不到任何回应。你可以重试几次（TCP 重试是透明的，但是你也可以在应用程序级别重试），等待超时过期，并且如果在超时时间内没有收到响应，则最终声明节点已经死亡。

### 超时与无穷的延迟

如果超时是检测故障的唯一可靠方法，那么超时应该等待多久？不幸的是没有简单的答案。

长时间的超时意味着长时间等待，直到一个节点被宣告死亡（在这段时间内，用户可能不得不等待，或者看到错误信息）。短的超时可以更快地检测到故障，但有更高的风险误将一个节点宣布为失效，而该节点实际上只是暂时地变慢了（例如由于节点或网络上的负载峰值）。

过早地声明一个节点已经死了是有问题的：如果这个节点实际上是活着的，并且正在执行一些动作（例如，发送一封电子邮件），而另一个节点接管，那么这个动作可能会最终执行两次。我们将在 “[知识、真相与谎言](#知识、真相与谎言)” 以及 [第九章](/v1/ch9) 和 [第十一章](/v1/ch11) 中更详细地讨论这个问题。

当一个节点被宣告死亡时，它的职责需要转移到其他节点，这会给其他节点和网络带来额外的负担。如果系统已经处于高负荷状态，则过早宣告节点死亡会使问题更严重。特别是如果节点实际上没有死亡，只是由于过载导致其响应缓慢；这时将其负载转移到其他节点可能会导致 **级联失效**（即 cascading failure，表示在极端情况下，所有节点都宣告对方死亡，所有节点都将停止工作）。

设想一个虚构的系统，其网络可以保证数据包的最大延迟 —— 每个数据包要么在一段时间内传送，要么丢失，但是传递永远不会比 $d$ 更长。此外，假设你可以保证一个非故障节点总是在一段时间 $r$ 内处理一个请求。在这种情况下，你可以保证每个成功的请求在 $2d + r$ 时间内都能收到响应，如果你在此时间内没有收到响应，则知道网络或远程节点不工作。如果这是成立的，$2d + r$ 会是一个合理的超时设置。

不幸的是，我们所使用的大多数系统都没有这些保证：异步网络具有无限的延迟（即尽可能快地传送数据包，但数据包到达可能需要的时间没有上限），并且大多数服务器实现并不能保证它们可以在一定的最大时间内处理请求（请参阅 “[响应时间保证](#响应时间保证)”）。对于故障检测，即使系统大部分时间快速运行也是不够的：如果你的超时时间很短，往返时间只需要一个瞬时尖峰就可以使系统失衡。

#### 网络拥塞和排队

在驾驶汽车时，由于交通拥堵，道路交通网络的通行时间往往不尽相同。同样，计算机网络上数据包延迟的可变性通常是由于排队【25】：

* 如果多个不同的节点同时尝试将数据包发送到同一目的地，则网络交换机必须将它们排队并将它们逐个送入目标网络链路（如 [图 8-2](/v1/ddia_0802.png) 所示）。在繁忙的网络链路上，数据包可能需要等待一段时间才能获得一个插槽（这称为网络拥塞）。如果传入的数据太多，交换机队列填满，数据包将被丢弃，因此需要重新发送数据包 - 即使网络运行良好。
* 当数据包到达目标机器时，如果所有 CPU 核心当前都处于繁忙状态，则来自网络的传入请求将被操作系统排队，直到应用程序准备好处理它为止。根据机器上的负载，这可能需要一段任意的时间。
* 在虚拟化环境中，正在运行的操作系统经常暂停几十毫秒，因为另一个虚拟机正在使用 CPU 核心。在这段时间内，虚拟机不能从网络中消耗任何数据，所以传入的数据被虚拟机监视器 【26】排队（缓冲），进一步增加了网络延迟的可变性。
* TCP 执行 **流量控制**（flow control，也称为 **拥塞避免**，即 congestion avoidance，或 **背压**，即 backpressure），其中节点会限制自己的发送速率以避免网络链路或接收节点过载【27】。这意味着甚至在数据进入网络之前，在发送者处就需要进行额外的排队。

![](/v1/ddia_0802.png)

**图 8-2 如果有多台机器将网络流量发送到同一目的地，则其交换机队列可能会被填满。在这里，端口 1,2 和 4 都试图发送数据包到端口 3**

而且，如果数据包在某个超时时间内没有被确认（超时时间是根据观察到的往返时间计算的），TCP 会认为该数据包已经丢失，并自动重新发送丢失的数据包。尽管应用程序没有看到数据包丢失和重新传输，但它看到了由此产生的延迟（等待超时到期，然后等待重新传输的数据包得到确认）。


> #### TCP与UDP
>
> 一些对延迟敏感的应用程序，比如视频会议和 IP 语音（VoIP），使用了 UDP 而不是 TCP。这是在可靠性和延迟变化之间的折衷：由于 UDP 不执行流量控制并且不重传丢失的分组，所以避免了网络延迟变化的一些原因（尽管它仍然易受交换机队列和调度延迟的影响）。
>
> 在延迟数据毫无价值的情况下，UDP 是一个不错的选择。例如，在 VoIP 电话呼叫中，可能没有足够的时间赶在数据需要在扬声器上播放之前重新发送丢失的数据包。在这种情况下，重发数据包没有意义 —— 应用程序必须使用静音填充丢失数据包的时隙（导致声音短暂中断），然后在数据流中继续。重试发生在人类层（“你能再说一遍吗？声音刚刚断了一会儿。”）。

所有这些因素都会造成网络延迟的变化。当系统接近其最大容量时，排队延迟的变化范围特别大：拥有足够备用容量的系统可以轻松排空队列，而在高利用率的系统中，很快就能积累很长的队列。

在公共云和多租户数据中心中，资源被许多客户共享：网络链接和交换机，甚至每个机器的网卡和 CPU（在虚拟机上运行时）。批处理工作负载（如 MapReduce，请参阅 [第十章](/v1/ch10)）能够很容易使网络链接饱和。由于无法控制或了解其他客户对共享资源的使用情况，如果附近的某个人（嘈杂的邻居）正在使用大量资源，则网络延迟可能会发生剧烈变化【28,29】。

在这种环境下，你只能通过实验方式选择超时：在一段较长的时期内、在多台机器上测量网络往返时间的分布，以确定延迟的预期变化。然后，考虑到应用程序的特性，可以确定 **故障检测延迟** 与 **过早超时风险** 之间的适当折衷。

更好的一种做法是，系统不是使用配置的常量超时时间，而是连续测量响应时间及其变化（抖动），并根据观察到的响应时间分布自动调整超时时间。这可以通过 Phi Accrual 故障检测器【30】来完成，该检测器在例如 Akka 和 Cassandra 【31】中使用。TCP 的超时重传机制也是以类似的方式工作【27】。

### 同步网络与异步网络

如果我们可以依靠网络来传递一些 **最大延迟固定** 的数据包，而不是丢弃数据包，那么分布式系统就会简单得多。为什么我们不能在硬件层面上解决这个问题，使网络可靠，使软件不必担心呢？

为了回答这个问题，将数据中心网络与非常可靠的传统固定电话网络（非蜂窝，非 VoIP）进行比较是很有趣的：延迟音频帧和掉话是非常罕见的。一个电话需要一个很低的端到端延迟，以及足够的带宽来传输你声音的音频采样数据。在计算机网络中有类似的可靠性和可预测性不是很好吗？

当你通过电话网络拨打电话时，它会建立一个电路：在两个呼叫者之间的整个路线上为呼叫分配一个固定的，有保证的带宽量。这个电路会保持至通话结束【32】。例如，ISDN 网络以每秒 4000 帧的固定速率运行。呼叫建立时，每个帧内（每个方向）分配 16 位空间。因此，在通话期间，每一方都保证能够每 250 微秒发送一个精确的 16 位音频数据【33,34】。

这种网络是同步的：即使数据经过多个路由器，也不会受到排队的影响，因为呼叫的 16 位空间已经在网络的下一跳中保留了下来。而且由于没有排队，网络的最大端到端延迟是固定的。我们称之为 **有限延迟（bounded delay）**。

#### 我们不能简单地使网络延迟可预测吗？

请注意，电话网络中的电路与 TCP 连接有很大不同：电路是固定数量的预留带宽，在电路建立时没有其他人可以使用，而 TCP 连接的数据包 **机会性地** 使用任何可用的网络带宽。你可以给 TCP 一个可变大小的数据块（例如，一个电子邮件或一个网页），它会尽可能在最短的时间内传输它。TCP 连接空闲时，不使用任何带宽 [^ii]。

[^ii]: 除了偶尔的 keepalive 数据包，如果 TCP keepalive 被启用。

如果数据中心网络和互联网是电路交换网络，那么在建立电路时就可以建立一个受保证的最大往返时间。但是，它们并不能这样：以太网和 IP 是 **分组交换协议**，不得不忍受排队的折磨和因此导致的网络无限延迟，这些协议没有电路的概念。

为什么数据中心网络和互联网使用分组交换？答案是，它们针对 **突发流量（bursty traffic）** 进行了优化。一个电路适用于音频或视频通话，在通话期间需要每秒传送相当数量的比特。另一方面，请求网页，发送电子邮件或传输文件没有任何特定的带宽要求 —— 我们只是希望它尽快完成。

如果想通过电路传输文件，你得预测一个带宽分配。如果你猜的太低，传输速度会不必要的太慢，导致网络容量闲置。如果你猜的太高，电路就无法建立（因为如果无法保证其带宽分配，网络不能建立电路）。因此，将电路用于突发数据传输会浪费网络容量，并且使传输不必要地缓慢。相比之下，TCP 动态调整数据传输速率以适应可用的网络容量。

已经有一些尝试去建立同时支持电路交换和分组交换的混合网络，比如 ATM [^iii]。InfiniBand 有一些相似之处【35】：它在链路层实现了端到端的流量控制，从而减少了在网络中排队的需要，尽管它仍然可能因链路拥塞而受到延迟【36】。通过仔细使用 **服务质量**（quality of service，即 QoS，数据包的优先级和调度）和 **准入控制**（admission control，限速发送器），可以在分组网络上模拟电路交换，或提供统计上的 **有限延迟**【25,32】。

[^iii]: **异步传输模式（Asynchronous Transfer Mode, ATM）** 在 20 世纪 80 年代是以太网的竞争对手【32】，但在电话网核心交换机之外并没有得到太多的采用。它与自动柜员机（也称为自动取款机）无关，尽管共用一个缩写词。或许，在一些平行的世界里，互联网是基于像 ATM 这样的东西，因此它们的互联网视频通话可能比我们的更可靠，因为它们不会遭受包的丢失和延迟。

但是，目前在多租户数据中心和公共云或通过互联网 [^iv] 进行通信时，此类服务质量尚未启用。当前部署的技术不允许我们对网络的延迟或可靠性作出任何保证：我们必须假设网络拥塞，排队和无限的延迟总是会发生。因此，超时时间没有 “正确” 的值 —— 它需要通过实验来确定。

[^iv]: 互联网服务提供商之间的对等协议和通过 **边界网关协议（BGP）** 建立的路由，与 IP 协议相比，更接近于电路交换。在这个级别上，可以购买专用带宽。但是，互联网路由在网络级别运行，而不是主机之间的单独连接，而且运行时间要长得多。

> ### 延迟和资源利用
>
> 更一般地说，可以将 **延迟变化** 视为 **动态资源分区** 的结果。
>
> 假设两台电话交换机之间有一条线路，可以同时进行 10,000 个呼叫。通过此线路切换的每个电路都占用其中一个呼叫插槽。因此，你可以将线路视为可由多达 10,000 个并发用户共享的资源。资源以静态方式分配：即使你现在是线路上唯一的呼叫，并且所有其他 9,999 个插槽都未使用，你的电路仍将分配与线路充分利用时相同的固定数量的带宽。
>
> 相比之下，互联网动态分享网络带宽。发送者互相推挤和争夺，以让他们的数据包尽可能快地通过网络，并且网络交换机决定从一个时刻到另一个时刻发送哪个分组（即，带宽分配）。这种方法有排队的缺点，但其优点是它最大限度地利用了线路。线路固定成本，所以如果你更好地利用它，你通过线路发送的每个字节都会更便宜。
>
> CPU 也会出现类似的情况：如果你在多个线程间动态共享每个 CPU 核心，则一个线程有时必须在操作系统的运行队列里等待，而另一个线程正在运行，这样每个线程都有可能被暂停一个不定的时间长度。但是，与为每个线程分配静态数量的 CPU 周期相比，这会更好地利用硬件（请参阅 “[响应时间保证](#响应时间保证)”）。更好的硬件利用率也是使用虚拟机的重要动机。
>
> 如果资源是静态分区的（例如，专用硬件和专用带宽分配），则在某些环境中可以实现 **延迟保证**。但是，这是以降低利用率为代价的 —— 换句话说，它是更昂贵的。另一方面，动态资源分配的多租户提供了更好的利用率，所以它更便宜，但它具有可变延迟的缺点。
>
> 网络中的可变延迟不是一种自然规律，而只是成本 / 收益权衡的结果。


## 不可靠的时钟

时钟和时间很重要。应用程序以各种方式依赖于时钟来回答以下问题：

1. 这个请求是否超时了？
2. 这项服务的第 99 百分位响应时间是多少？
3. 在过去五分钟内，该服务平均每秒处理多少个查询？
4. 用户在我们的网站上花了多长时间？
5. 这篇文章在何时发布？
6. 在什么时间发送提醒邮件？
7. 这个缓存条目何时到期？
8. 日志文件中此错误消息的时间戳是什么？

上面的第 1-4 个例子测量的是 **持续时间**（durations，例如，请求发送与响应接收之间的时间间隔），而第 5-8 个例子描述的是 **时间点**（point in time，在特定日期和特定时间发生的事件）。

在分布式系统中，时间是一件棘手的事情，因为通信不是即时的：消息通过网络从一台机器传送到另一台机器需要时间。收到消息的时间总是晚于发送的时间，但是由于网络中的可变延迟，我们不知道晚了多少时间。这个事实导致有时很难确定在涉及多台机器时发生事情的顺序。

而且，网络上的每台机器都有自己的时钟，这是一个实际的硬件设备：通常是石英晶体振荡器。这些设备不是完全准确的，所以每台机器都有自己的时间概念，可能比其他机器稍快或更慢。可以在一定程度上同步时钟：最常用的机制是 **网络时间协议（NTP）**，它允许根据一组服务器报告的时间来调整计算机时钟【37】。服务器则从更精确的时间源（如 GPS 接收机）获取时间。

### 单调钟与日历时钟

现代计算机至少有两种不同的时钟：日历时钟（time-of-day clock）和单调钟（monotonic clock）。尽管它们都衡量时间，但区分这两者很重要，因为它们有不同的目的。

#### 日历时钟

日历时钟是你直观地了解时钟的依据：它根据某个日历（也称为 **挂钟时间**，即 wall-clock time）返回当前日期和时间。例如，Linux 上的 `clock_gettime(CLOCK_REALTIME)`[^v] 和 Java 中的 `System.currentTimeMillis()` 返回自 epoch（UTC 时间 1970 年 1 月 1 日午夜）以来的秒数（或毫秒），根据公历（Gregorian）日历，不包括闰秒。有些系统使用其他日期作为参考点。

[^v]: 虽然该时钟被称为实时时钟，但它与实时操作系统无关，如 “[响应时间保证](#响应时间保证)” 中所述。

日历时钟通常与 NTP 同步，这意味着来自一台机器的时间戳（理想情况下）与另一台机器上的时间戳相同。但是如下节所述，日历时钟也具有各种各样的奇特之处。特别是，如果本地时钟比 NTP 服务器快太多，则它可能会被强制重置，看上去好像跳回了先前的时间点。这些跳跃以及它们经常忽略闰秒的事实，使日历时钟不能用于测量经过时间（elapsed time）【38】。

历史上的日历时钟还具有相当粗略的分辨率，例如，在较早的 Windows 系统上以 10 毫秒为单位前进【39】。在最近的系统中这已经不是一个问题了。

#### 单调钟

单调钟适用于测量持续时间（时间间隔），例如超时或服务的响应时间：Linux 上的 `clock_gettime(CLOCK_MONOTONIC)`，和 Java 中的 `System.nanoTime()` 都是单调时钟。这个名字来源于他们保证总是往前走的事实（而日历时钟可以往回跳）。

你可以在某个时间点检查单调钟的值，做一些事情，且稍后再次检查它。这两个值之间的差异告诉你两次检查之间经过了多长时间。但单调钟的绝对值是毫无意义的：它可能是计算机启动以来的纳秒数，或类似的任意值。特别是比较来自两台不同计算机的单调钟的值是没有意义的，因为它们并不是一回事。

在具有多个 CPU 插槽的服务器上，每个 CPU 可能有一个单独的计时器，但不一定与其他 CPU 同步。操作系统会补偿所有的差异，并尝试向应用线程表现出单调钟的样子，即使这些线程被调度到不同的 CPU 上。当然，明智的做法是不要太把这种单调性保证当回事【40】。

如果 NTP 协议检测到计算机的本地石英钟比 NTP 服务器要更快或更慢，则可以调整单调钟向前走的频率（这称为 **微调（slewing）** 时钟）。默认情况下，NTP 允许时钟速率增加或减慢最高至 0.05%，但 NTP 不能使单调时钟向前或向后跳转。单调时钟的分辨率通常相当好：在大多数系统中，它们能在几微秒或更短的时间内测量时间间隔。

在分布式系统中，使用单调钟测量 **经过时间**（elapsed time，比如超时）通常很好，因为它不假定不同节点的时钟之间存在任何同步，并且对测量的轻微不准确性不敏感。

### 时钟同步与准确性

单调钟不需要同步，但是日历时钟需要根据 NTP 服务器或其他外部时间源来设置才能有用。不幸的是，我们获取时钟的方法并不像你所希望的那样可靠或准确 —— 硬件时钟和 NTP 可能会变幻莫测。举几个例子：

* 计算机中的石英钟不够精确：它会 **漂移**（drifts，即运行速度快于或慢于预期）。时钟漂移取决于机器的温度。Google 假设其服务器时钟漂移为 200 ppm（百万分之一）【41】，相当于每 30 秒与服务器重新同步一次的时钟漂移为 6 毫秒，或者每天重新同步的时钟漂移为 17 秒。即使一切工作正常，此漂移也会限制可以达到的最佳准确度。
* 如果计算机的时钟与 NTP 服务器的时钟差别太大，可能会拒绝同步，或者本地时钟将被强制重置【37】。任何观察重置前后时间的应用程序都可能会看到时间倒退或突然跳跃。
* 如果某个节点被防火墙意外地与 NTP 服务器隔离开来，这种错误配置可能会持续一段时间都没有人注意到。有证据表明，这在实践中确实发生过。
* NTP 同步只能和网络延迟一样好，所以当你在拥有可变数据包延迟的拥塞网络上时，NTP 同步的准确性会受到限制。一个实验表明，当通过互联网同步时，35 毫秒的最小误差是可以实现的【42】，尽管偶尔的网络延迟峰值会导致大约一秒的误差。根据配置，较大的网络延迟会导致 NTP 客户端完全放弃。
* 一些 NTP 服务器是错误的或者配置错误的，报告的时间可能相差几个小时【43,44】。还好 NTP 客户端非常健壮，因为他们会查询多个服务器并忽略异常值。无论如何，依赖于互联网上的陌生人所告诉你的时间来保证你的系统的正确性，这还挺让人担忧的。
* 闰秒导致一分钟可能有 59 秒或 61 秒，这会打破一些在设计之时未考虑闰秒的系统的时序假设【45】。闰秒已经使许多大型系统崩溃的事实【38,46】说明了，关于时钟的错误假设是多么容易偷偷溜入系统中。处理闰秒的最佳方法可能是让 NTP 服务器 “撒谎”，并在一天中逐渐执行闰秒调整（这被称为 **拖尾**，即 smearing）【47,48】，虽然实际的 NTP 服务器表现各异【49】。
* 在虚拟机中，硬件时钟被虚拟化，这对于需要精确计时的应用程序提出了额外的挑战【50】。当一个 CPU 核心在虚拟机之间共享时，每个虚拟机都会暂停几十毫秒，与此同时另一个虚拟机正在运行。从应用程序的角度来看，这种停顿表现为时钟突然向前跳跃【26】。
* 如果你在没有完整控制权的设备（例如，移动设备或嵌入式设备）上运行软件，则可能完全不能信任该设备的硬件时钟。一些用户故意将其硬件时钟设置为不正确的日期和时间，例如，为了规避游戏中的时间限制，时钟可能会被设置到很远的过去或将来。

如果你足够在乎这件事并投入大量资源，就可以达到非常好的时钟精度。例如，针对金融机构的欧洲法规草案 MiFID II 要求所有高频率交易基金在 UTC 时间 100 微秒内同步时钟，以便调试 “闪崩” 等市场异常现象，并帮助检测市场操纵【51】。

通过 GPS 接收机，精确时间协议（PTP）【52】以及仔细的部署和监测可以实现这种精确度。然而，这需要很多努力和专业知识，而且有很多东西都会导致时钟同步错误。如果你的 NTP 守护进程配置错误，或者防火墙阻止了 NTP 通信，由漂移引起的时钟误差可能很快就会变大。

### 依赖同步时钟

时钟的问题在于，虽然它们看起来简单易用，但却具有令人惊讶的缺陷：一天可能不会有精确的 86,400 秒，**日历时钟** 可能会前后跳跃，而一个节点上的时间可能与另一个节点上的时间完全不同。

本章早些时候，我们讨论了网络丢包和任意延迟包的问题。尽管网络在大多数情况下表现良好，但软件的设计必须假定网络偶尔会出现故障，而软件必须正常处理这些故障。时钟也是如此：尽管大多数时间都工作得很好，但需要准备健壮的软件来处理不正确的时钟。

有一部分问题是，不正确的时钟很容易被视而不见。如果一台机器的 CPU 出现故障或者网络配置错误，很可能根本无法工作，所以很快就会被注意和修复。另一方面，如果它的石英时钟有缺陷，或者它的 NTP 客户端配置错误，大部分事情似乎仍然可以正常工作，即使它的时钟逐渐偏离现实。如果某个软件依赖于精确同步的时钟，那么结果更可能是悄无声息的，仅有微量的数据丢失，而不是一次惊天动地的崩溃【53,54】。

因此，如果你使用需要同步时钟的软件，必须仔细监控所有机器之间的时钟偏移。时钟偏离其他时钟太远的节点应当被宣告死亡，并从集群中移除。这样的监控可以确保你在损失发生之前注意到破损的时钟。

#### 有序事件的时间戳

让我们考虑一个特别的情况，一件很有诱惑但也很危险的事情：依赖时钟，在多个节点上对事件进行排序。例如，如果两个客户端写入分布式数据库，谁先到达？ 哪一个更近？

[图 8-3](/v1/ddia_0803.png) 显示了在具有多主复制的数据库中对时钟的危险使用（该例子类似于 [图 5-9](/v1/ddia_0509.png)）。客户端 A 在节点 1 上写入 `x = 1`；写入被复制到节点 3；客户端 B 在节点 3 上增加 x（我们现在有 `x = 2`）；最后这两个写入都被复制到节点 2。

![](/v1/ddia_0803.png)

**图 8-3 客户端 B 的写入比客户端 A 的写入要晚，但是 B 的写入具有较早的时间戳。**

在 [图 8-3](/v1/ddia_0803.png) 中，当一个写入被复制到其他节点时，它会根据发生写入的节点上的日历时钟标记一个时间戳。在这个例子中，时钟同步是非常好的：节点 1 和节点 3 之间的偏差小于 3ms，这可能比你在实践中能预期的更好。

尽管如此，[图 8-3](/v1/ddia_0803.png) 中的时间戳却无法正确排列事件：写入 `x = 1` 的时间戳为 42.004 秒，但写入 `x = 2` 的时间戳为 42.003 秒，即使 `x = 2` 在稍后出现。当节点 2 接收到这两个事件时，会错误地推断出 `x = 1` 是最近的值，而丢弃写入 `x = 2`。效果上表现为，客户端 B 的增量操作会丢失。

这种冲突解决策略被称为 **最后写入胜利（LWW）**，它在多主复制和无主数据库（如 Cassandra 【53】和 Riak 【54】）中被广泛使用（请参阅 “[最后写入胜利（丢弃并发写入）](/v1/ch5#最后写入胜利（丢弃并发写入）)” 一节）。有些实现会在客户端而不是服务器上生成时间戳，但这并不能改变 LWW 的基本问题：

* 数据库写入可能会神秘地消失：具有滞后时钟的节点无法覆盖之前具有快速时钟的节点写入的值，直到节点之间的时钟偏差消逝【54,55】。此方案可能导致一定数量的数据被悄悄丢弃，而未向应用报告任何错误。
* LWW 无法区分 **高频顺序写入**（在 [图 8-3](/v1/ddia_0803.png) 中，客户端 B 的增量操作 **一定** 发生在客户端 A 的写入之后）和 **真正并发写入**（写入者意识不到其他写入者）。需要额外的因果关系跟踪机制（例如版本向量），以防止违背因果关系（请参阅 “[检测并发写入](/v1/ch5#检测并发写入)”）。
* 两个节点很可能独立地生成具有相同时间戳的写入，特别是在时钟仅具有毫秒分辨率的情况下。为了解决这样的冲突，还需要一个额外的 **决胜值**（tiebreaker，可以简单地是一个大随机数），但这种方法也可能会导致违背因果关系【53】。

因此，尽管通过保留 “最近” 的值并放弃其他值来解决冲突是很诱惑人的，但是要注意，“最近” 的定义取决于本地的 **日历时钟**，这很可能是不正确的。即使用严格同步的 NTP 时钟，一个数据包也可能在时间戳 100 毫秒（根据发送者的时钟）时发送，并在时间戳 99 毫秒（根据接收者的时钟）处到达 —— 看起来好像数据包在发送之前已经到达，这是不可能的。

NTP 同步是否能足够准确，以至于这种不正确的排序不会发生？也许不能，因为 NTP 的同步精度本身，除了石英钟漂移这类误差源之外，还受到网络往返时间的限制。为了进行正确的排序，你需要一个比测量对象（即网络延迟）要精确得多的时钟。

所谓的 **逻辑时钟（logical clock）**【56,57】是基于递增计数器而不是振荡石英晶体，对于排序事件来说是更安全的选择（请参阅 “[检测并发写入](/v1/ch5#检测并发写入)”）。逻辑时钟不测量一天中的时间或经过的秒数，而仅测量事件的相对顺序（无论一个事件发生在另一个事件之前还是之后）。相反，用来测量实际经过时间的 **日历时钟** 和 **单调钟** 也被称为 **物理时钟（physical clock）**。我们将在 “[顺序保证](/v1/ch9#顺序保证)” 中来看顺序问题。

#### 时钟读数存在置信区间

你可能能够以微秒或甚至纳秒的精度读取机器的时钟。但即使可以得到如此细致的测量结果，这并不意味着这个值对于这样的精度实际上是准确的。实际上，大概率是不准确的 —— 如前所述，即使你每分钟与本地网络上的 NTP 服务器进行同步，几毫秒的时间漂移也很容易在不精确的石英时钟上发生。使用公共互联网上的 NTP 服务器，最好的准确度可能达到几十毫秒，而且当网络拥塞时，误差可能会超过 100 毫秒【57】。

因此，将时钟读数视为一个时间点是没有意义的 —— 它更像是一段时间范围：例如，一个系统可能以 95% 的置信度认为当前时间处于本分钟内的第 10.3 秒和 10.5 秒之间，它可能没法比这更精确了【58】。如果我们只知道 ±100 毫秒的时间，那么时间戳中的微秒数字部分基本上是没有意义的。

不确定性界限可以根据你的时间源来计算。如果你的 GPS 接收器或原子（铯）时钟直接连接到你的计算机上，预期的错误范围由制造商告知。如果从服务器获得时间，则不确定性取决于自上次与服务器同步以来的石英钟漂移的期望值，加上 NTP 服务器的不确定性，再加上到服务器的网络往返时间（只是获取粗略近似值，并假设服务器是可信的）。

不幸的是，大多数系统不公开这种不确定性：例如，当调用 `clock_gettime()` 时，返回值不会告诉你时间戳的预期错误，所以你不知道其置信区间是 5 毫秒还是 5 年。

一个有趣的例外是 Spanner 中的 Google TrueTime API 【41】，它明确地报告了本地时钟的置信区间。当你询问当前时间时，你会得到两个值：[最早，最晚]，这是最早可能的时间戳和最晚可能的时间戳。在不确定性估计的基础上，时钟知道当前的实际时间落在该区间内。区间的宽度取决于自从本地石英钟最后与更精确的时钟源同步以来已经过了多长时间。

#### 全局快照的同步时钟

在 “[快照隔离和可重复读](/v1/ch7#快照隔离和可重复读)” 中，我们讨论了快照隔离，这是数据库中非常有用的功能，需要支持小型快速读写事务和大型长时间运行的只读事务（用于备份或分析）。它允许只读事务看到特定时间点的处于一致状态的数据库，且不会锁定和干扰读写事务。

快照隔离最常见的实现需要单调递增的事务 ID。如果写入比快照晚（即，写入具有比快照更大的事务 ID），则该写入对于快照事务是不可见的。在单节点数据库上，一个简单的计数器就足以生成事务 ID。

但是当数据库分布在许多机器上，也许可能在多个数据中心中时，由于需要协调，（跨所有分区）全局单调递增的事务 ID 会很难生成。事务 ID 必须反映因果关系：如果事务 B 读取由事务 A 写入的值，则 B 必须具有比 A 更大的事务 ID，否则快照就无法保持一致。在有大量的小规模、高频率的事务情景下，在分布式系统中创建事务 ID 成为一个难以处理的瓶颈 [^vi]。

[^vi]: 存在分布式序列号生成器，例如 Twitter 的雪花（Snowflake），其以可伸缩的方式（例如，通过将 ID 空间的块分配给不同节点）近似单调地增加唯一 ID。但是，它们通常无法保证与因果关系一致的排序，因为分配的 ID 块的时间范围比数据库读取和写入的时间范围要长。另请参阅 “[顺序保证](/v1/ch9#顺序保证)”。

我们可以使用同步时钟的时间戳作为事务 ID 吗？如果我们能够获得足够好的同步性，那么这种方法将具有很合适的属性：更晚的事务会有更大的时间戳。当然，问题在于时钟精度的不确定性。

Spanner 以这种方式实现跨数据中心的快照隔离【59，60】。它使用 TrueTime API 报告的时钟置信区间，并基于以下观察结果：如果你有两个置信区间，每个置信区间包含最早和最晚可能的时间戳（$A = [A_{earliest}, A_{latest}]$，$B=[B_{earliest}, B_{latest}]$），这两个区间不重叠（即：$A_{earliest} <A_{latest} <B_{earliest} <B_{latest}$）的话，那么 B 肯定发生在 A 之后 —— 这是毫无疑问的。只有当区间重叠时，我们才不确定 A 和 B 发生的顺序。

为了确保事务时间戳反映因果关系，Spanner 在提交读写事务之前，会故意等待置信区间长度的时间。通过这样，它可以确保任何可能读取数据的事务处于足够晚的时间，因此它们的置信区间不会重叠。为了保持尽可能短的等待时间，Spanner 需要保持尽可能小的时钟不确定性，为此，Google 在每个数据中心都部署了一个 GPS 接收器或原子钟，这允许时钟同步到大约 7 毫秒以内【41】。

对分布式事务语义使用时钟同步是一个活跃的研究领域【57,61,62】。这些想法很有趣，但是它们还没有在谷歌之外的主流数据库中实现。

### 进程暂停

让我们考虑在分布式系统中危险地使用时钟的另一个例子。假设你有一个数据库，每个分区只有一个领导者。只有领导者被允许接受写入。一个节点如何知道它仍然是领导者（它并没有被别人宣告为死亡），并且它可以安全地接受写入？

一种选择是领导者从其他节点获得一个 **租约（lease）**，类似一个带超时的锁【63】。任一时刻只有一个节点可以持有租约 —— 因此，当一个节点获得一个租约时，它知道它在某段时间内自己是领导者，直到租约到期。为了保持领导地位，节点必须周期性地在租约过期前续期。

如果节点发生故障，就会停止续期，所以当租约过期时，另一个节点可以接管。

可以想象，请求处理循环看起来像这样：

```java
while (true) {
  request = getIncomingRequest();
  // 确保租约还剩下至少 10 秒
  if (lease.expiryTimeMillis - System.currentTimeMillis() < 10000){
    lease = lease.renew();
  }

  if (lease.isValid()) {
    process(request);
  }
}
```

这个代码有什么问题？首先，它依赖于同步时钟：租约到期时间由另一台机器设置（例如，当前时间加上 30 秒，计算到期时间），并将其与本地系统时钟进行比较。如果时钟不同步超过几秒，这段代码将开始做奇怪的事情。

其次，即使我们将协议更改为仅使用本地单调时钟，也存在另一个问题：代码假定在执行剩余时间检查 `System.currentTimeMillis()` 和实际执行请求 `process(request)` 中间的时间间隔非常短。通常情况下，这段代码运行得非常快，所以 10 秒的缓冲区已经足够确保 **租约** 在请求处理到一半时不会过期。

但是，如果程序执行中出现了意外的停顿呢？例如，想象一下，线程在 `lease.isValid()` 行周围停止 15 秒，然后才继续。在这种情况下，在请求被处理的时候，租约可能已经过期，而另一个节点已经接管了领导。然而，没有什么可以告诉这个线程已经暂停了这么长时间了，所以这段代码不会注意到租约已经到期了，直到循环的下一个迭代 —— 到那个时候它可能已经做了一些不安全的处理请求。

假设一个线程可能会暂停很长时间，这是疯了吗？不幸的是，这种情况发生的原因有很多种：

* 许多编程语言运行时（如 Java 虚拟机）都有一个垃圾收集器（GC），偶尔需要停止所有正在运行的线程。这些 “**停止所有处理（stop-the-world）**”GC 暂停有时会持续几分钟【64】！甚至像 HotSpot JVM 的 CMS 这样的所谓的 “并发” 垃圾收集器也不能完全与应用程序代码并行运行，它需要不时地停止所有处理【65】。尽管通常可以通过改变分配模式或调整 GC 设置来减少暂停【66】，但是如果我们想要提供健壮的保证，就必须假设最坏的情况发生。
* 在虚拟化环境中，可以 **挂起（suspend）** 虚拟机（暂停执行所有进程并将内存内容保存到磁盘）并恢复（恢复内存内容并继续执行）。这个暂停可以在进程执行的任何时候发生，并且可以持续任意长的时间。这个功能有时用于虚拟机从一个主机到另一个主机的实时迁移，而不需要重新启动，在这种情况下，暂停的长度取决于进程写入内存的速率【67】。
* 在最终用户的设备（如笔记本电脑）上，执行也可能被暂停并随意恢复，例如当用户关闭笔记本电脑的盖子时。
* 当操作系统上下文切换到另一个线程时，或者当管理程序切换到另一个虚拟机时（在虚拟机中运行时），当前正在运行的线程可能在代码中的任意点处暂停。在虚拟机的情况下，在其他虚拟机中花费的 CPU 时间被称为 **窃取时间（steal time）**。如果机器处于沉重的负载下（即，如果等待运行的线程队列很长），暂停的线程再次运行可能需要一些时间。
* 如果应用程序执行同步磁盘访问，则线程可能暂停，等待缓慢的磁盘 I/O 操作完成【68】。在许多语言中，即使代码没有包含文件访问，磁盘访问也可能出乎意料地发生 —— 例如，Java 类加载器在第一次使用时惰性加载类文件，这可能在程序执行过程中随时发生。I/O 暂停和 GC 暂停甚至可能合谋组合它们的延迟【69】。如果磁盘实际上是一个网络文件系统或网络块设备（如亚马逊的 EBS），I/O 延迟进一步受到网络延迟变化的影响【29】。
* 如果操作系统配置为允许交换到磁盘（页面交换），则简单的内存访问可能导致 **页面错误（page fault）**，要求将磁盘中的页面装入内存。当这个缓慢的 I/O 操作发生时，线程暂停。如果内存压力很高，则可能需要将另一个页面换出到磁盘。在极端情况下，操作系统可能花费大部分时间将页面交换到内存中，而实际上完成的工作很少（这被称为 **抖动**，即 thrashing）。为了避免这个问题，通常在服务器机器上禁用页面调度（如果你宁愿干掉一个进程来释放内存，也不愿意冒抖动风险）。
* 可以通过发送 SIGSTOP 信号来暂停 Unix 进程，例如通过在 shell 中按下 Ctrl-Z。这个信号立即阻止进程继续执行更多的 CPU 周期，直到 SIGCONT 恢复为止，此时它将继续运行。即使你的环境通常不使用 SIGSTOP，也可能由运维工程师意外发送。

所有这些事件都可以随时 **抢占（preempt）** 正在运行的线程，并在稍后的时间恢复运行，而线程甚至不会注意到这一点。这个问题类似于在单个机器上使多线程代码线程安全：你不能对时序做任何假设，因为随时可能发生上下文切换，或者出现并行运行。

当在一台机器上编写多线程代码时，我们有相当好的工具来实现线程安全：互斥量、信号量、原子计数器、无锁数据结构、阻塞队列等等。不幸的是，这些工具并不能直接转化为分布式系统操作，因为分布式系统没有共享内存，只有通过不可靠网络发送的消息。

分布式系统中的节点，必须假定其执行可能在任意时刻暂停相当长的时间，即使是在一个函数的中间。在暂停期间，世界的其它部分在继续运转，甚至可能因为该节点没有响应，而宣告暂停节点的死亡。最终暂停的节点可能会继续运行，在再次检查自己的时钟之前，甚至可能不会意识到自己进入了睡眠。

#### 响应时间保证

在许多编程语言和操作系统中，线程和进程可能暂停一段无限制的时间，正如讨论的那样。如果你足够努力，导致暂停的原因是 **可以** 消除的。

某些软件的运行环境要求很高，不能在特定时间内响应可能会导致严重的损失：控制飞机、火箭、机器人、汽车和其他物体的计算机必须对其传感器输入做出快速而可预测的响应。在这些系统中，软件必须有一个特定的 **截止时间（deadline）**，如果截止时间不满足，可能会导致整个系统的故障。这就是所谓的 **硬实时（hard real-time）** 系统。

> #### 实时是真的吗？
>
> 在嵌入式系统中，实时是指系统经过精心设计和测试，以满足所有情况下的特定时间保证。这个含义与 Web 上对实时术语的模糊使用相反，后者描述了服务器将数据推送到客户端以及没有严格的响应时间限制的流处理（见 [第十一章](/v1/ch11)）。

例如，如果车载传感器检测到当前正在经历碰撞，你肯定不希望安全气囊释放系统因为 GC 暂停而延迟弹出。

在系统中提供 **实时保证** 需要各级软件栈的支持：一个实时操作系统（RTOS），允许在指定的时间间隔内保证 CPU 时间的分配。库函数必须申明最坏情况下的执行时间；动态内存分配可能受到限制或完全不允许（实时垃圾收集器存在，但是应用程序仍然必须确保它不会给 GC 太多的负担）；必须进行大量的测试和测量，以确保达到保证。

所有这些都需要大量额外的工作，严重限制了可以使用的编程语言、库和工具的范围（因为大多数语言和工具不提供实时保证）。由于这些原因，开发实时系统非常昂贵，并且它们通常用于安全关键的嵌入式设备。而且，“**实时**” 与 “**高性能**” 不一样 —— 事实上，实时系统可能具有较低的吞吐量，因为他们必须让及时响应的优先级高于一切（另请参阅 “[延迟和资源利用](#延迟和资源利用)”）。

对于大多数服务器端数据处理系统来说，实时保证是不经济或不合适的。因此，这些系统必须承受在非实时环境中运行的暂停和时钟不稳定性。

#### 限制垃圾收集的影响

进程暂停的负面影响可以在不诉诸昂贵的实时调度保证的情况下得到缓解。语言运行时在计划垃圾回收时具有一定的灵活性，因为它们可以跟踪对象分配的速度和随着时间的推移剩余的空闲内存。

一个新兴的想法是将 GC 暂停视为一个节点的短暂计划中断，并在这个节点收集其垃圾的同时，让其他节点处理来自客户端的请求。如果运行时可以警告应用程序一个节点很快需要 GC 暂停，那么应用程序可以停止向该节点发送新的请求，等待它完成处理未完成的请求，然后在没有请求正在进行时执行 GC。这个技巧向客户端隐藏了 GC 暂停，并降低了响应时间的高百分位数【70,71】。一些对延迟敏感的金融交易系统【72】使用这种方法。

这个想法的一个变种是只用垃圾收集器来处理短命对象（这些对象可以快速收集），并定期在积累大量长寿对象（因此需要完整 GC）之前重新启动进程【65,73】。一次可以重新启动一个节点，在计划重新启动之前，流量可以从该节点移开，就像 [第四章](/v1/ch4) 里描述的滚动升级一样。

这些措施不能完全阻止垃圾回收暂停，但可以有效地减少它们对应用的影响。


## 知识、真相与谎言

本章到目前为止，我们已经探索了分布式系统与运行在单台计算机上的程序的不同之处：没有共享内存，只有通过可变延迟的不可靠网络传递的消息，系统可能遭受部分失效，不可靠的时钟和处理暂停。

如果你不习惯于分布式系统，那么这些问题的后果就会让人迷惑不解。网络中的一个节点无法确切地知道任何事情 —— 它只能根据它通过网络接收到（或没有接收到）的消息进行猜测。节点只能通过交换消息来找出另一个节点所处的状态（存储了哪些数据，是否正确运行等等）。如果远程节点没有响应，则无法知道它处于什么状态，因为网络中的问题不能可靠地与节点上的问题区分开来。

这些系统的讨论与哲学有关：在系统中什么是真什么是假？如果感知和测量的机制都是不可靠的，那么关于这些知识我们又能多么确定呢？软件系统应该遵循我们对物理世界所期望的法则，如因果关系吗？

幸运的是，我们不需要去搞清楚生命的意义。在分布式系统中，我们可以陈述关于行为（系统模型）的假设，并以满足这些假设的方式设计实际系统。算法可以被证明在某个系统模型中正确运行。这意味着即使底层系统模型提供了很少的保证，也可以实现可靠的行为。

但是，尽管可以使软件在不可靠的系统模型中表现良好，但这并不是可以直截了当实现的。在本章的其余部分中，我们将进一步探讨分布式系统中的知识和真相的概念，这将有助于我们思考我们可以做出的各种假设以及我们可能希望提供的保证。在 [第九章](/v1/ch9) 中，我们将着眼于分布式系统的一些例子，这些算法在特定的假设条件下提供了特定的保证。

### 真相由多数所定义

设想一个具有不对称故障的网络：一个节点能够接收发送给它的所有消息，但是来自该节点的任何传出消息被丢弃或延迟【19】。即使该节点运行良好，并且正在接收来自其他节点的请求，其他节点也无法听到其响应。经过一段时间后，其他节点宣布它已经死亡，因为他们没有听到节点的消息。这种情况就像梦魇一样：**半断开（semi-disconnected）** 的节点被拖向墓地，敲打尖叫道 “我没死！” —— 但是由于没有人能听到它的尖叫，葬礼队伍继续以坚忍的决心继续行进。

在一个稍微不那么梦魇的场景中，半断开的节点可能会注意到它发送的消息没有被其他节点确认，因此意识到网络中必定存在故障。尽管如此，节点被其他节点错误地宣告为死亡，而半断开的节点对此无能为力。

第三种情况，想象一个正在经历长时间 **垃圾收集暂停（stop-the-world GC Pause）** 的节点，节点的所有线程被 GC 抢占并暂停一分钟，因此没有请求被处理，也没有响应被发送。其他节点等待，重试，不耐烦，并最终宣布节点死亡，并将其丢到灵车上。最后，GC 完成，节点的线程继续，好像什么也没有发生。其他节点感到惊讶，因为所谓的死亡节点突然从棺材中抬起头来，身体健康，开始和旁观者高兴地聊天。GC 后的节点最初甚至没有意识到已经经过了整整一分钟，而且自己已被宣告死亡。从它自己的角度来看，从最后一次与其他节点交谈以来，几乎没有经过任何时间。

这些故事的寓意是，节点不一定能相信自己对于情况的判断。分布式系统不能完全依赖单个节点，因为节点可能随时失效，可能会使系统卡死，无法恢复。相反，许多分布式算法都依赖于法定人数，即在节点之间进行投票（请参阅 “[读写的法定人数](/v1/ch5#读写的法定人数)”）：决策需要来自多个节点的最小投票数，以减少对于某个特定节点的依赖。

这也包括关于宣告节点死亡的决定。如果法定数量的节点宣告另一个节点已经死亡，那么即使该节点仍感觉自己活着，它也必须被认为是死的。个体节点必须遵守法定决定并下台。

最常见的法定人数是超过一半的绝对多数（尽管其他类型的法定人数也是可能的）。多数法定人数允许系统在个别节点发生故障时继续工作（三个节点可以容忍单节点故障；五个节点可以容忍双节点故障）。系统仍然是安全的，因为在这个制度中只能有一个多数 —— 不能同时存在两个相互冲突的多数决定。当我们在 [第九章](/v1/ch9) 中讨论 **共识算法（consensus algorithms）** 时，我们将更详细地讨论法定人数的应用。

#### 领导者和锁

通常情况下，一些东西在一个系统中只能有一个。例如：

* 数据库分区的领导者只能有一个节点，以避免 **脑裂**（即 split brain，请参阅 “[处理节点宕机](/v1/ch5#处理节点宕机)”）。
* 特定资源的锁或对象只允许一个事务 / 客户端持有，以防同时写入和损坏。
* 一个特定的用户名只能被一个用户所注册，因为用户名必须唯一标识一个用户。

在分布式系统中实现这一点需要注意：即使一个节点认为它是 “**天选者（the chosen one）**”（分区的负责人，锁的持有者，成功获取用户名的用户的请求处理程序），但这并不一定意味着有法定人数的节点同意！一个节点可能以前是领导者，但是如果其他节点在此期间宣布它死亡（例如，由于网络中断或 GC 暂停），则它可能已被降级，且另一个领导者可能已经当选。

如果一个节点继续表现为 **天选者**，即使大多数节点已经声明它已经死了，则在考虑不周的系统中可能会导致问题。这样的节点能以自己赋予的权能向其他节点发送消息，如果其他节点相信，整个系统可能会做一些不正确的事情。

例如，[图 8-4](/v1/ddia_0804.png) 显示了由于不正确的锁实现导致的数据损坏错误。（这个错误不仅仅是理论上的：HBase 曾经有这个问题【74,75】）假设你要确保一个存储服务中的文件一次只能被一个客户访问，因为如果多个客户试图对此写入，该文件将被损坏。你尝试通过在访问文件之前要求客户端从锁定服务获取租约来实现此目的。

![](/v1/ddia_0804.png)

**图 8-4 分布式锁的实现不正确：客户端 1 认为它仍然具有有效的租约，即使它已经过期，从而破坏了存储中的文件**

这个问题就是我们先前在 “[进程暂停](#进程暂停)” 中讨论过的一个例子：如果持有租约的客户端暂停太久，它的租约将到期。另一个客户端可以获得同一文件的租约，并开始写入文件。当暂停的客户端回来时，它认为（不正确）它仍然有一个有效的租约，并继续写入文件。结果，客户的写入将产生冲突并损坏文件。

#### 防护令牌

当使用锁或租约来保护对某些资源（如 [图 8-4](/v1/ddia_0804.png) 中的文件存储）的访问时，需要确保一个被误认为自己是 “天选者” 的节点不能扰乱系统的其它部分。实现这一目标的一个相当简单的技术就是 **防护（fencing）**，如 [图 8-5](/v1/ddia_0805.png) 所示。

![](/v1/ddia_0805.png)

**图 8-5 只允许以增加防护令牌的顺序进行写操作，从而保证存储安全**

我们假设每次锁定服务器授予锁或租约时，它还会返回一个 **防护令牌（fencing token）**，这个数字在每次授予锁定时都会增加（例如，由锁定服务增加）。然后，我们可以要求客户端每次向存储服务发送写入请求时，都必须包含当前的防护令牌。

在 [图 8-5](/v1/ddia_0805.png) 中，客户端 1 以 33 的令牌获得租约，但随后进入一个长时间的停顿并且租约到期。客户端 2 以 34 的令牌（该数字总是增加）获取租约，然后将其写入请求发送到存储服务，包括 34 的令牌。稍后，客户端 1 恢复生机并将其写入存储服务，包括其令牌值 33。但是，存储服务器会记住它已经处理了一个具有更高令牌编号（34）的写入，因此它会拒绝带有令牌 33 的请求。

如果将 ZooKeeper 用作锁定服务，则可将事务标识 `zxid` 或节点版本 `cversion` 用作防护令牌。由于它们保证单调递增，因此它们具有所需的属性【74】。

请注意，这种机制要求资源本身在检查令牌方面发挥积极作用：它需要拒绝任何所带令牌比已处理过的令牌更旧的写操作 —— 仅仅依靠客户端检查自己的锁状态是不够的。对于不明确支持防护令牌的资源，可能仍然可以解决此限制（例如，在文件存储服务的情况下，可以将防护令牌包含在文件名中）。但是，为了避免在锁的保护之外处理请求，需要进行某种检查。

在服务器端检查一个令牌可能看起来像是一个缺点，但这可以说是一件好事：一个服务假定它的客户总是守规矩并不明智，因为使用客户端的人与运行服务的人优先级非常不一样【76】。因此，任何服务保护自己免受意外客户的滥用是一个好主意。

### 拜占庭故障

防护令牌可以检测和阻止无意中发生错误的节点（例如，因为它尚未发现其租约已过期）。但是，如果节点有意破坏系统的保证，则可以通过使用假防护令牌发送消息来轻松完成此操作。

在本书中，我们假设节点是不可靠但诚实的：它们可能很慢或者从不响应（由于故障），并且它们的状态可能已经过时（由于 GC 暂停或网络延迟），但是我们假设如果节点做出了回应，它正在说出 “真相”：尽其所知，它正在按照协议的规则扮演其角色。

如果存在节点可能 “撒谎”（发送任意错误或损坏的响应）的风险，则分布式系统的问题变得更困难了 —— 例如，如果节点可能声称收到了某条它实际上并未收到的消息。这种行为被称为 **拜占庭故障（Byzantine fault）**，**在不信任的环境中达成共识的问题被称为拜占庭将军问题**【77】。

> ### 拜占庭将军问题
>
> 拜占庭将军问题是对所谓 “两将军问题” 的泛化【78】，它想象两个将军需要就战斗计划达成一致的情况。由于他们在两个不同的地点建立了营地，他们只能通过信使进行沟通，信使有时会被延迟或丢失（就像网络中的信息包一样）。我们将在 [第九章](/v1/ch9) 讨论这个共识问题。
>
> 在这个问题的拜占庭版本里，有 n 位将军需要同意，他们的努力因为有一些叛徒在他们中间而受到阻碍。大多数的将军都是忠诚的，因而发出了真实的信息，但是叛徒可能会试图通过发送虚假或不真实的信息来欺骗和混淆他人（在试图保持未被发现的同时）。事先并不知道叛徒是谁。
>
> 拜占庭是后来成为君士坦丁堡的古希腊城市，现在在土耳其的伊斯坦布尔。没有任何历史证据表明拜占庭将军比其他地方更容易出现诡计和阴谋。相反，这个名字来源于拜占庭式的过度复杂，官僚，迂回等意义，早在计算机之前就已经在政治中被使用了【79】。Lamport 想要选一个不会冒犯任何读者的国家，他被告知将其称为阿尔巴尼亚将军问题并不是一个好主意【80】。

当一个系统在部分节点发生故障、不遵守协议、甚至恶意攻击、扰乱网络时仍然能继续正确工作，称之为 **拜占庭容错（Byzantine fault-tolerant）** 的，这种担忧在某些特定情况下是有意义的：

* 在航空航天环境中，计算机内存或 CPU 寄存器中的数据可能被辐射破坏，导致其以任意不可预知的方式响应其他节点。由于系统故障的代价非常高昂（例如，飞机坠毁并导致机上所有人员遇难，或火箭与国际空间站相撞），飞行控制系统必须容忍拜占庭故障【81,82】。
* 在多个参与组织的系统中，一些参与者可能会试图欺骗或诈骗他人。在这种情况下，节点仅仅信任另一个节点的消息是不安全的，因为它们可能是出于恶意的目的而被发送的。例如，像比特币和其他区块链一样的对等网络可以被认为是让互不信任的各方同意交易是否发生的一种方式，而不依赖于中心机构（central authority）【83】。

然而，在本书讨论的那些系统中，我们通常可以安全地假设没有拜占庭式的错误。在你的数据中心里，所有的节点都是由你的组织控制的（所以他们可以信任），辐射水平足够低，内存损坏不是一个大问题。制作拜占庭容错系统的协议相当复杂【84】，而容错嵌入式系统依赖于硬件层面的支持【81】。在大多数服务器端数据系统中，部署拜占庭容错解决方案的成本使其变得不切实际。

Web 应用程序确实需要预期受终端用户控制的客户端（如 Web 浏览器）的任意和恶意行为。这就是为什么输入验证，数据清洗和输出转义如此重要：例如，防止 SQL 注入和跨站点脚本。然而，我们通常不在这里使用拜占庭容错协议，而只是让服务器有权决定是否允许客户端行为。但在没有这种中心机构的对等网络中，拜占庭容错更为重要。

软件中的一个错误（bug）可能被认为是拜占庭式的错误，但是如果你将相同的软件部署到所有节点上，那么拜占庭式的容错算法帮不到你。大多数拜占庭式容错算法要求超过三分之二的节点能够正常工作（即，如果有四个节点，最多只能有一个故障）。要使用这种方法对付 bug，你必须有四个独立的相同软件的实现，并希望一个 bug 只出现在四个实现之一中。

同样，如果一个协议可以保护我们免受漏洞，安全渗透和恶意攻击，那么这将是有吸引力的。不幸的是，这也是不现实的：在大多数系统中，如果攻击者可以渗透一个节点，那他们可能会渗透所有这些节点，因为它们可能都运行着相同的软件。因此，传统机制（认证，访问控制，加密，防火墙等）仍然是抵御攻击者的主要保护措施。

#### 弱谎言形式

尽管我们假设节点通常是诚实的，但值得向软件中添加防止 “撒谎” 弱形式的机制 —— 例如，由硬件问题导致的无效消息，软件错误和错误配置。这种保护机制并不是完全的拜占庭容错，因为它们不能抵挡决心坚定的对手，但它们仍然是简单而实用的步骤，以提高可靠性。例如：

* 由于硬件问题或操作系统、驱动程序、路由器等中的错误，网络数据包有时会受到损坏。通常，损坏的数据包会被内建于 TCP 和 UDP 中的校验和所俘获，但有时它们也会逃脱检测【85,86,87】 。要对付这种破坏通常使用简单的方法就可以做到，例如应用程序级协议中的校验和。
* 可公开访问的应用程序必须仔细清理来自用户的任何输入，例如检查值是否在合理的范围内，并限制字符串的大小以防止通过大内存分配的拒绝服务。防火墙后面的内部服务对于输入也许可以只采取一些不那么严格的检查，但是采取一些基本的合理性检查（例如，在协议解析中）仍然是一个好主意。
* NTP 客户端可以配置多个服务器地址。同步时，客户端联系所有的服务器，估计它们的误差，并检查大多数服务器是否对某个时间范围达成一致。只要大多数的服务器没问题，一个配置错误的 NTP 服务器报告的时间会被当成特异值从同步中排除【37】。使用多个服务器使 NTP 更健壮（比起只用单个服务器来）。

### 系统模型与现实

已经有很多算法被设计以解决分布式系统问题 —— 例如，我们将在 [第九章](/v1/ch9) 讨论共识问题的解决方案。为了有用，这些算法需要容忍我们在本章中讨论的分布式系统的各种故障。

算法的编写方式不应该过分依赖于运行的硬件和软件配置的细节。这就要求我们以某种方式将我们期望在系统中发生的错误形式化。我们通过定义一个系统模型来做到这一点，这个模型是一个抽象，描述一个算法可以假设的事情。

关于时序假设，三种系统模型是常用的：

同步模型
: **同步模型（synchronous model）** 假设网络延迟、进程暂停和时钟误差都是受限的。这并不意味着完全同步的时钟或零网络延迟；这只意味着你知道网络延迟、暂停和时钟漂移将永远不会超过某个固定的上限【88】。同步模型并不是大多数实际系统的现实模型，因为（如本章所讨论的）无限延迟和暂停确实会发生。

部分同步模型
: **部分同步（partially synchronous）** 意味着一个系统在大多数情况下像一个同步系统一样运行，但有时候会超出网络延迟，进程暂停和时钟漂移的界限【88】。这是很多系统的现实模型：大多数情况下，网络和进程表现良好，否则我们永远无法完成任何事情，但是我们必须承认，在任何时刻都存在时序假设偶然被破坏的事实。发生这种情况时，网络延迟、暂停和时钟错误可能会变得相当大。

异步模型
: 在这个模型中，一个算法不允许对时序做任何假设 —— 事实上它甚至没有时钟（所以它不能使用超时）。一些算法被设计为可用于异步模型，但非常受限。


进一步来说，除了时序问题，我们还要考虑 **节点失效**。三种最常见的节点系统模型是：

崩溃 - 停止故障
: 在 **崩溃停止（crash-stop）** 模型中，算法可能会假设一个节点只能以一种方式失效，即通过崩溃。这意味着节点可能在任意时刻突然停止响应，此后该节点永远消失 —— 它永远不会回来。

崩溃 - 恢复故障
: 我们假设节点可能会在任何时候崩溃，但也许会在未知的时间之后再次开始响应。在 **崩溃 - 恢复（crash-recovery）** 模型中，假设节点具有稳定的存储（即，非易失性磁盘存储）且会在崩溃中保留，而内存中的状态会丢失。

拜占庭（任意）故障
: 节点可以做（绝对意义上的）任何事情，包括试图戏弄和欺骗其他节点，如上一节所述。

对于真实系统的建模，具有 **崩溃 - 恢复故障（crash-recovery）** 的 **部分同步模型（partially synchronous）** 通常是最有用的模型。分布式算法如何应对这种模型？

#### 算法的正确性

为了定义算法是正确的，我们可以描述它的属性。例如，排序算法的输出具有如下特性：对于输出列表中的任何两个不同的元素，左边的元素比右边的元素小。这只是定义对列表进行排序含义的一种形式方式。

同样，我们可以写下我们想要的分布式算法的属性来定义它的正确含义。例如，如果我们正在为一个锁生成防护令牌（请参阅 “[防护令牌](#防护令牌)”），我们可能要求算法具有以下属性：

唯一性（uniqueness）
: 没有两个防护令牌请求返回相同的值。

单调序列（monotonic sequence）
: 如果请求 $x$ 返回了令牌 $t_x$，并且请求 $y$ 返回了令牌 $t_y$，并且 $x$ 在 $y$ 开始之前已经完成，那么 $t_x < t_y$。

可用性（availability）
: 请求防护令牌并且不会崩溃的节点，最终会收到响应。

如果一个系统模型中的算法总是满足它在所有我们假设可能发生的情况下的性质，那么这个算法是正确的。但这如何有意义？如果所有的节点崩溃，或者所有的网络延迟突然变得无限长，那么没有任何算法能够完成任何事情。

#### 安全性和活性

为了澄清这种情况，有必要区分两种不同的属性：**安全（safety）属性** 和 **活性（liveness）属性**。在刚刚给出的例子中，**唯一性** 和 **单调序列** 是安全属性，而 **可用性** 是活性属性。

这两种性质有什么区别？一个试金石就是，活性属性在定义中通常包括 “**最终**” 一词（是的，你猜对了 —— 最终一致性是一个活性属性【89】）。

安全通常被非正式地定义为：**没有坏事发生**，而活性通常就类似：**最终好事发生**。但是，最好不要过度解读那些非正式的定义，因为好与坏的含义是主观的。安全和活性的实际定义是精确的和数学的【90】：

* 如果安全属性被违反，我们可以指向一个特定的安全属性被破坏的时间点（例如，如果违反了唯一性属性，我们可以确定重复的防护令牌被返回的特定操作）。违反安全属性后，违规行为不能被撤销 —— 损失已经发生。
* 活性属性反过来：在某个时间点（例如，一个节点可能发送了一个请求，但还没有收到响应），它可能不成立，但总是希望在未来能成立（即通过接收答复）。

区分安全属性和活性属性的一个优点是可以帮助我们处理困难的系统模型。对于分布式算法，在系统模型的所有可能情况下，要求 **始终** 保持安全属性是常见的【88】。也就是说，即使所有节点崩溃，或者整个网络出现故障，算法仍然必须确保它不会返回错误的结果（即保证安全属性得到满足）。

但是，对于活性属性，我们可以提出一些注意事项：例如，只有在大多数节点没有崩溃的情况下，只有当网络最终从中断中恢复时，我们才可以说请求需要接收响应。部分同步模型的定义要求系统最终返回到同步状态 —— 即任何网络中断的时间段只会持续一段有限的时间，然后进行修复。

#### 将系统模型映射到现实世界

安全属性和活性属性以及系统模型对于推理分布式算法的正确性非常有用。然而，在实践中实施算法时，现实的混乱事实再一次地让你咬牙切齿，很明显系统模型是对现实的简化抽象。

例如，在崩溃 - 恢复（crash-recovery）模型中的算法通常假设稳定存储器中的数据在崩溃后可以幸存。但是，如果磁盘上的数据被破坏，或者由于硬件错误或错误配置导致数据被清除，会发生什么情况【91】？如果服务器存在固件错误并且在重新启动时无法识别其硬盘驱动器，即使驱动器已正确连接到服务器，那又会发生什么情况【92】？

法定人数算法（请参阅 “[读写的法定人数](/v1/ch5#读写的法定人数)”）依赖节点来记住它声称存储的数据。如果一个节点可能患有健忘症，忘记了以前存储的数据，这会打破法定条件，从而破坏算法的正确性。也许需要一个新的系统模型，在这个模型中，我们假设稳定的存储大多能在崩溃后幸存，但有时也可能会丢失。但是那个模型就变得更难以推理了。

算法的理论描述可以简单宣称一些事是不会发生的 —— 在非拜占庭式系统中，我们确实需要对可能发生和不可能发生的故障做出假设。然而，真实世界的实现，仍然会包括处理 “假设上不可能” 情况的代码，即使代码可能就是 `printf("Sucks to be you")` 和 `exit(666)`，实际上也就是留给运维来擦屁股【93】。（这可以说是计算机科学和软件工程间的一个差异）。

这并不是说理论上抽象的系统模型是毫无价值的，恰恰相反。它们对于将实际系统的复杂性提取成一个个我们可以推理的可处理的错误类型是非常有帮助的，以便我们能够理解这个问题，并试图系统地解决这个问题。我们可以证明算法是正确的，通过表明它们的属性在某个系统模型中总是成立的。

证明算法正确并不意味着它在真实系统上的实现必然总是正确的。但这迈出了很好的第一步，因为理论分析可以发现算法中的问题，这种问题可能会在现实系统中长期潜伏，直到你的假设（例如，时序）因为不寻常的情况被打破。理论分析与经验测试同样重要。


## 本章小结

在本章中，我们讨论了分布式系统中可能发生的各种问题，包括：

* 当你尝试通过网络发送数据包时，数据包可能会丢失或任意延迟。同样，答复可能会丢失或延迟，所以如果你没有得到答复，你不知道消息是否发送成功了。
* 节点的时钟可能会与其他节点显著不同步（尽管你尽最大努力设置 NTP），它可能会突然跳转或跳回，依靠它是很危险的，因为你很可能没有好的方法来测量你的时钟的错误间隔。
* 一个进程可能会在其执行的任何时候暂停一段相当长的时间（可能是因为停止所有处理的垃圾收集器），被其他节点宣告死亡，然后再次复活，却没有意识到它被暂停了。

这类 **部分失效（partial failure）** 可能发生的事实是分布式系统的决定性特征。每当软件试图做任何涉及其他节点的事情时，偶尔就有可能会失败，或者随机变慢，或者根本没有响应（最终超时）。在分布式系统中，我们试图在软件中建立 **部分失效** 的容错机制，这样整个系统在即使某些组成部分被破坏的情况下，也可以继续运行。

为了容忍错误，第一步是 **检测** 它们，但即使这样也很难。大多数系统没有检测节点是否发生故障的准确机制，所以大多数分布式算法依靠 **超时** 来确定远程节点是否仍然可用。但是，超时无法区分网络失效和节点失效，并且可变的网络延迟有时会导致节点被错误地怀疑发生故障。此外，有时一个节点可能处于降级状态：例如，由于驱动程序错误，千兆网卡可能突然下降到 1 Kb/s 的吞吐量【94】。这样一个 “跛行” 而不是死掉的节点可能比一个干净的失效节点更难处理。

一旦检测到故障，使系统容忍它也并不容易：没有全局变量，没有共享内存，没有共同的知识，或机器之间任何其他种类的共享状态。节点甚至不能就现在是什么时间达成一致，就不用说更深奥的了。信息从一个节点流向另一个节点的唯一方法是通过不可靠的网络发送信息。重大决策不能由一个节点安全地完成，因此我们需要一个能从其他节点获得帮助的协议，并争取达到法定人数以达成一致。

如果你习惯于在理想化的数学完美的单机环境（同一个操作总能确定地返回相同的结果）中编写软件，那么转向分布式系统的凌乱的物理现实可能会有些令人震惊。相反，如果能够在单台计算机上解决一个问题，那么分布式系统工程师通常会认为这个问题是平凡的【5】，现在单个计算机确实可以做很多事情【95】。如果你可以避免打开潘多拉的盒子，把东西放在一台机器上，那么通常是值得的。

但是，正如在 [第二部分](/v1/part-ii) 的介绍中所讨论的那样，可伸缩性并不是使用分布式系统的唯一原因。容错和低延迟（通过将数据放置在距离用户较近的地方）是同等重要的目标，而这些不能用单个节点实现。

在本章中，我们也转换了几次话题，探讨了网络、时钟和进程的不可靠性是否是不可避免的自然规律。我们看到这并不是：有可能给网络提供硬实时的响应保证和有限的延迟，但是这样做非常昂贵，且导致硬件资源的利用率降低。大多数非安全关键系统会选择 **便宜而不可靠**，而不是 **昂贵和可靠**。

我们还谈到了超级计算机，它们采用可靠的组件，因此当组件发生故障时必须完全停止并重新启动。相比之下，分布式系统可以永久运行而不会在服务层面中断，因为所有的错误和维护都可以在节点级别进行处理 —— 至少在理论上是如此。（实际上，如果一个错误的配置变更被应用到所有的节点，仍然会使分布式系统瘫痪）。

本章一直在讲存在的问题，给我们展现了一幅黯淡的前景。在 [下一章](/v1/ch9) 中，我们将继续讨论解决方案，并讨论一些旨在解决分布式系统中所有问题的算法。


## 参考文献

1. Mark Cavage: “[There’s Just No Getting Around It: You’re Building a Distributed System](http://queue.acm.org/detail.cfm?id=2482856),” *ACM Queue*, volume 11, number 4, pages 80-89, April 2013. [doi:10.1145/2466486.2482856](http://dx.doi.org/10.1145/2466486.2482856)
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. Sydney Padua: *The Thrilling Adventures of Lovelace and Babbage: The (Mostly) True Story of the First Computer*. Particular Books, April 2015. ISBN: 978-0-141-98151-2
1. Coda Hale: “[You Can’t Sacrifice Partition Tolerance](http://codahale.com/you-cant-sacrifice-partition-tolerance/),” *codahale.com*, October 7, 2010.
1. Jeff Hodges: “[Notes on Distributed Systems for Young Bloods](https://web.archive.org/web/20200218095605/https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/),” *somethingsimilar.com*, January 14, 2013.
1. Antonio Regalado: “[Who Coined 'Cloud Computing'?](https://www.technologyreview.com/2011/10/31/257406/who-coined-cloud-computing/),” *technologyreview.com*, October 31, 2011.
1. Luiz André Barroso, Jimmy Clidaras, and Urs Hölzle: “[The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale Machines, Second Edition](https://web.archive.org/web/20140404113735/http://www.morganclaypool.com/doi/abs/10.2200/S00516ED2V01Y201306CAC024),” *Synthesis Lectures on Computer Architecture*, volume 8, number 3, Morgan & Claypool Publishers, July 2013. [doi:10.2200/S00516ED2V01Y201306CAC024](http://dx.doi.org/10.2200/S00516ED2V01Y201306CAC024), ISBN: 978-1-627-05010-4
1. David Fiala, Frank Mueller, Christian Engelmann, et al.: “[Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing](http://moss.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/sc12.pdf),” at *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC12), November 2012.
1. Arjun Singh, Joon Ong, Amit Agarwal, et al.: “[Jupiter Rising: A Decade of Clos Topologies and Centralized Control in Google’s Datacenter Network](http://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p183.pdf),” at *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2785956.2787508](http://dx.doi.org/10.1145/2785956.2787508)
1. Glenn K. Lockwood: “[Hadoop's Uncomfortable Fit in HPC](http://glennklockwood.blogspot.co.uk/2014/05/hadoops-uncomfortable-fit-in-hpc.html),” *glennklockwood.blogspot.co.uk*, May 16, 2014.
1. John von Neumann: “[Probabilistic Logics and the Synthesis of Reliable Organisms from Unreliable Components](https://personalpages.manchester.ac.uk/staff/nikolaos.kyparissas/uploads/VonNeumann1956.pdf),” in *Automata Studies (AM-34)*, edited by Claude E. Shannon and John McCarthy, Princeton University Press, 1956. ISBN: 978-0-691-07916-5
1. Richard W. Hamming: *The Art of Doing Science and Engineering*. Taylor & Francis, 1997. ISBN: 978-9-056-99500-3
1. Claude E. Shannon: “[A Mathematical Theory of Communication](http://cs.brynmawr.edu/Courses/cs380/fall2012/shannon1948.pdf),” *The Bell System Technical Journal*, volume 27, number 3, pages 379–423 and 623–656, July 1948.
1. Peter Bailis and Kyle Kingsbury: “[The Network Is Reliable](https://queue.acm.org/detail.cfm?id=2655736),” *ACM Queue*, volume 12, number 7, pages 48-55, July 2014. [doi:10.1145/2639988.2639988](http://dx.doi.org/10.1145/2639988.2639988)
1. Joshua B. Leners, Trinabh Gupta, Marcos K. Aguilera, and Michael Walfish: “[Taming Uncertainty in Distributed Systems with Help from the Network](http://www.cs.nyu.edu/~mwalfish/papers/albatross-eurosys15.pdf),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741976](http://dx.doi.org/10.1145/2741948.2741976)
1. Phillipa Gill, Navendu Jain, and Nachiappan Nagappan: “[Understanding Network Failures in Data Centers: Measurement, Analysis, and Implications](http://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p350.pdf),” at *ACM SIGCOMM Conference*, August 2011. [doi:10.1145/2018436.2018477](http://dx.doi.org/10.1145/2018436.2018477)
1. Mark Imbriaco: “[Downtime Last Saturday](https://github.com/blog/1364-downtime-last-saturday),” *github.com*, December 26, 2012.
1. Will Oremus: “[The Global Internet Is Being Attacked by Sharks, Google Confirms](http://www.slate.com/blogs/future_tense/2014/08/15/shark_attacks_threaten_google_s_undersea_internet_cables_video.html),” *slate.com*, August 15, 2014.
1. Marc A. Donges: “[Re: bnx2 cards Intermittantly Going Offline](http://www.spinics.net/lists/netdev/msg210485.html),” Message to Linux *netdev* mailing list, *spinics.net*, September 13, 2012.
1. Kyle Kingsbury: “[Call Me Maybe: Elasticsearch](https://aphyr.com/posts/317-call-me-maybe-elasticsearch),” *aphyr.com*, June 15, 2014.
1. Salvatore Sanfilippo: “[A Few Arguments About Redis Sentinel Properties and Fail Scenarios](http://antirez.com/news/80),” *antirez.com*, October 21, 2014.
1. Bert Hubert: “[The Ultimate SO_LINGER Page, or: Why Is My TCP Not Reliable](http://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable),” *blog.netherlabs.nl*, January 18, 2009.
1. Nicolas Liochon: “[CAP: If All You Have Is a Timeout, Everything Looks Like a Partition](http://blog.thislongrun.com/2015/05/CAP-theorem-partition-timeout-zookeeper.html),” *blog.thislongrun.com*, May 25, 2015.
1. Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-To-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
1. Matthew P. Grosvenor, Malte Schwarzkopf, Ionel Gog, et al.: “[Queues Don’t Matter When You Can JUMP Them!](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-grosvenor_update.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. Guohui Wang and T. S. Eugene Ng: “[The Impact of Virtualization on Network Performance of Amazon EC2 Data Center](http://www.cs.rice.edu/~eugeneng/papers/INFOCOM10-ec2.pdf),” at *29th IEEE International Conference on Computer Communications* (INFOCOM), March 2010. [doi:10.1109/INFCOM.2010.5461931](http://dx.doi.org/10.1109/INFCOM.2010.5461931)
1. Van Jacobson: “[Congestion Avoidance and Control](http://www.cs.usask.ca/ftp/pub/discus/seminars2002-2003/p314-jacobson.pdf),” at *ACM Symposium on Communications Architectures and Protocols* (SIGCOMM), August 1988. [doi:10.1145/52324.52356](http://dx.doi.org/10.1145/52324.52356)
1. Brandon Philips: “[etcd: Distributed Locking and Service Discovery](https://www.youtube.com/watch?v=HJIjTTHWYnE),” at *Strange Loop*, September 2014.
1. Steve Newman: “[A Systematic Look at EC2 I/O](https://web.archive.org/web/20141211094156/http://blog.scalyr.com/2012/10/a-systematic-look-at-ec2-io/),” *blog.scalyr.com*, October 16, 2012.
1. Naohiro Hayashibara, Xavier Défago, Rami Yared, and Takuya Katayama: “[The ϕ Accrual Failure Detector](http://hdl.handle.net/10119/4784),” Japan Advanced Institute of Science and Technology, School of Information Science, Technical Report IS-RR-2004-010, May 2004.
1. Jeffrey Wang: “[Phi Accrual Failure Detector](http://ternarysearch.blogspot.co.uk/2013/08/phi-accrual-failure-detector.html),” *ternarysearch.blogspot.co.uk*, August 11, 2013.
1. Srinivasan Keshav: *An Engineering Approach to Computer Networking: ATM Networks, the Internet, and the Telephone Network*. Addison-Wesley Professional, May 1997. ISBN: 978-0-201-63442-6
1. Cisco, “[Integrated Services Digital Network](https://web.archive.org/web/20181229220921/http://docwiki.cisco.com/wiki/Integrated_Services_Digital_Network),” *docwiki.cisco.com*.
1. Othmar Kyas: *ATM Networks*. International Thomson Publishing, 1995. ISBN: 978-1-850-32128-6
1. “[InfiniBand FAQ](http://www.mellanox.com/related-docs/whitepapers/InfiniBandFAQ_FQ_100.pdf),” Mellanox Technologies, December 22, 2014.
1. Jose Renato Santos, Yoshio Turner, and G. (John) Janakiraman: “[End-to-End Congestion Control for InfiniBand](http://www.hpl.hp.com/techreports/2002/HPL-2002-359.pdf),” at *22nd Annual Joint Conference of the IEEE Computer and Communications Societies* (INFOCOM), April 2003. Also published by HP Laboratories Palo Alto, Tech Report HPL-2002-359. [doi:10.1109/INFCOM.2003.1208949](http://dx.doi.org/10.1109/INFCOM.2003.1208949)
1. Ulrich Windl, David Dalton, Marc Martinec, and Dale R. Worley: “[The NTP FAQ and HOWTO](http://www.ntp.org/ntpfaq/NTP-a-faq.htm),” *ntp.org*, November 2006.
1. John Graham-Cumming: “[How and why the leap second affected Cloudflare DNS](https://blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/),” *blog.cloudflare.com*, January 1, 2017.
1. David Holmes: “[Inside the Hotspot VM: Clocks, Timers and Scheduling Events – Part I – Windows](https://web.archive.org/web/20160308031939/https://blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks),” *blogs.oracle.com*, October 2, 2006.
1. Steve Loughran: “[Time on Multi-Core, Multi-Socket Servers](http://steveloughran.blogspot.co.uk/2015/09/time-on-multi-core-multi-socket-servers.html),” *steveloughran.blogspot.co.uk*, September 17, 2015.
1. James C. Corbett, Jeffrey Dean, Michael Epstein, et al.: “[Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/),” at *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
1. M. Caporaloni and R. Ambrosini: “[How Closely Can a Personal Computer Clock Track the UTC Timescale Via the Internet?](https://iopscience.iop.org/0143-0807/23/4/103/),” *European Journal of Physics*, volume 23, number 4, pages L17–L21, June 2002. [doi:10.1088/0143-0807/23/4/103](http://dx.doi.org/10.1088/0143-0807/23/4/103)
1. Nelson Minar: “[A Survey of the NTP Network](http://alumni.media.mit.edu/~nelson/research/ntp-survey99/),” *alumni.media.mit.edu*, December 1999.
1. Viliam Holub: “[Synchronizing Clocks in a Cassandra Cluster Pt. 1 – The Problem](https://blog.rapid7.com/2014/03/14/synchronizing-clocks-in-a-cassandra-cluster-pt-1-the-problem/),” *blog.rapid7.com*, March 14, 2014.
1. Poul-Henning Kamp: “[The One-Second War (What Time Will You Die?)](http://queue.acm.org/detail.cfm?id=1967009),” *ACM Queue*, volume 9, number 4, pages 44–48, April 2011. [doi:10.1145/1966989.1967009](http://dx.doi.org/10.1145/1966989.1967009)
1. Nelson Minar: “[Leap Second Crashes Half the Internet](http://www.somebits.com/weblog/tech/bad/leap-second-2012.html),” *somebits.com*, July 3, 2012.
1. Christopher Pascoe: “[Time, Technology and Leaping Seconds](http://googleblog.blogspot.co.uk/2011/09/time-technology-and-leaping-seconds.html),” *googleblog.blogspot.co.uk*, September 15, 2011.
1. Mingxue Zhao and Jeff Barr: “[Look Before You Leap – The Coming Leap Second and AWS](https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/),” *aws.amazon.com*, May 18, 2015.
1. Darryl Veitch and Kanthaiah Vijayalayan: “[Network Timing and the 2015 Leap Second](https://tklab.feit.uts.edu.au/~darryl/Publications/LeapSecond_camera.pdf),” at *17th International Conference on Passive and Active Measurement* (PAM), April 2016. [doi:10.1007/978-3-319-30505-9_29](http://dx.doi.org/10.1007/978-3-319-30505-9_29)
1. “[Timekeeping in VMware Virtual Machines](https://www.vmware.com/content/dam/digitalmarketing/vmware/en/pdf/techpaper/Timekeeping-In-VirtualMachines.pdf),” Information Guide, VMware, Inc., December 2011.
1. “[MiFID II / MiFIR: Regulatory Technical and Implementing Standards – Annex I (Draft)](https://www.esma.europa.eu/sites/default/files/library/2015/11/2015-esma-1464_annex_i_-_draft_rts_and_its_on_mifid_ii_and_mifir.pdf),” European Securities and Markets Authority, Report ESMA/2015/1464, September 2015.
1. Luke Bigum: “[Solving MiFID II Clock Synchronisation With Minimum Spend (Part 1)](https://web.archive.org/web/20170704030310/https://www.lmax.com/blog/staff-blogs/2015/11/27/solving-mifid-ii-clock-synchronisation-minimum-spend-part-1/),” *lmax.com*, November 27, 2015.
1. Kyle Kingsbury: “[Call Me Maybe: Cassandra](https://aphyr.com/posts/294-call-me-maybe-cassandra/),” *aphyr.com*, September 24, 2013.
1. John Daily: “[Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/),” *riak.com*, November 12, 2013.
1. Kyle Kingsbury: “[The Trouble with Timestamps](https://aphyr.com/posts/299-the-trouble-with-timestamps),” *aphyr.com*, October 12, 2013.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Sandeep Kulkarni, Murat Demirbas, Deepak Madeppa, et al.: “[Logical Physical Clocks and Consistent Snapshots in Globally Distributed Databases](http://www.cse.buffalo.edu/tech-reports/2014-04.pdf),” State University of New York at Buffalo, Computer Science and Engineering Technical Report 2014-04, May 2014.
1. Justin Sheehy: “[There Is No Now: Problems With Simultaneity in Distributed Systems](https://queue.acm.org/detail.cfm?id=2745385),” *ACM Queue*, volume 13, number 3, pages 36–41, March 2015. [doi:10.1145/2733108](http://dx.doi.org/10.1145/2733108)
1. Murat Demirbas: “[Spanner: Google's Globally-Distributed Database](http://muratbuffalo.blogspot.co.uk/2013/07/spanner-googles-globally-distributed_4.html),” *muratbuffalo.blogspot.co.uk*, July 4, 2013.
1. Dahlia Malkhi and Jean-Philippe Martin: “[Spanner's Concurrency Control](http://www.cs.cornell.edu/~ie53/publications/DC-col51-Sep13.pdf),” *ACM SIGACT News*, volume 44, number 3, pages 73–77, September 2013. [doi:10.1145/2527748.2527767](http://dx.doi.org/10.1145/2527748.2527767)
1. Manuel Bravo, Nuno Diegues, Jingna Zeng, et al.: “[On the Use of Clocks to Enforce Consistency in the Cloud](http://sites.computer.org/debull/A15mar/p18.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 1, pages 18–31, March 2015.
1. Spencer Kimball: “[Living Without Atomic Clocks](http://www.cockroachlabs.com/blog/living-without-atomic-clocks/),” *cockroachlabs.com*, February 17, 2016.
1. Cary G. Gray and David R. Cheriton: “[Leases: An Efficient Fault-Tolerant Mechanism for Distributed File Cache Consistency](https://web.archive.org/web/20230325205928/http://web.stanford.edu/class/cs240/readings/89-leases.pdf),” at *12th ACM Symposium on Operating Systems Principles* (SOSP), December 1989. [doi:10.1145/74850.74870](http://dx.doi.org/10.1145/74850.74870)
1. Todd Lipcon: “[Avoiding Full GCs in Apache HBase with MemStore-Local Allocation Buffers: Part 1](https://web.archive.org/web/20121101040711/http://blog.cloudera.com/blog/2011/02/avoiding-full-gcs-in-hbase-with-memstore-local-allocation-buffers-part-1/),” *blog.cloudera.com*, February 24, 2011.
1. Martin Thompson: “[Java Garbage Collection Distilled](http://mechanical-sympathy.blogspot.co.uk/2013/07/java-garbage-collection-distilled.html),” *mechanical-sympathy.blogspot.co.uk*, July 16, 2013.
1. Alexey Ragozin: “[How to Tame Java GC Pauses? Surviving 16GiB Heap and Greater](https://dzone.com/articles/how-tame-java-gc-pauses),” *dzone.com*, June 28, 2011.
1. Christopher Clark, Keir Fraser, Steven Hand, et al.: “[Live Migration of Virtual Machines](http://www.cl.cam.ac.uk/research/srg/netos/papers/2005-nsdi-migration.pdf),” at *2nd USENIX Symposium on Networked Systems Design & Implementation* (NSDI), May 2005.
1. Mike Shaver: “[fsyncers and Curveballs](https://web.archive.org/web/20220107141023/http://shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/),” *shaver.off.net*, May 25, 2008.
1. Zhenyun Zhuang and Cuong Tran: “[Eliminating Large JVM GC Pauses Caused by Background IO Traffic](https://engineering.linkedin.com/blog/2016/02/eliminating-large-jvm-gc-pauses-caused-by-background-io-traffic),” *engineering.linkedin.com*, February 10, 2016.
1. David Terei and Amit Levy: “[Blade: A Data Center Garbage Collector](http://arxiv.org/pdf/1504.02578.pdf),” arXiv:1504.02578, April 13, 2015.
1. Martin Maas, Tim Harris, Krste Asanović, and John Kubiatowicz: “[Trash Day: Coordinating Garbage Collection in Distributed Systems](https://timharris.uk/papers/2015-hotos.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. “[Predictable Low Latency](http://cdn2.hubspot.net/hubfs/1624455/Website_2016/content/White%20papers/Cinnober%20on%20GC%20pause%20free%20Java%20applications.pdf),” Cinnober Financial Technology AB, *cinnober.com*, November 24, 2013.
1. Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
1. Flavio P. Junqueira and Benjamin Reed: *ZooKeeper: Distributed Process Coordination*. O'Reilly Media, 2013. ISBN: 978-1-449-36130-3
1. Enis Söztutar: “[HBase and HDFS: Understanding Filesystem Usage in HBase](http://www.slideshare.net/enissoz/hbase-and-hdfs-understanding-filesystem-usage),” at *HBaseCon*, June 2013.
1. Caitie McCaffrey: “[Clients Are Jerks: AKA How Halo 4 DoSed the Services at Launch & How We Survived](https://web.archive.org/web/20230128065851/http://caitiem.com/2015/06/23/clients-are-jerks-aka-how-halo-4-dosed-the-services-at-launch-how-we-survived/),” *caitiem.com*, June 23, 2015.
1. Leslie Lamport, Robert Shostak, and Marshall Pease: “[The Byzantine Generals Problem](https://www.microsoft.com/en-us/research/publication/byzantine-generals-problem/),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 4, number 3, pages 382–401, July 1982. [doi:10.1145/357172.357176](http://dx.doi.org/10.1145/357172.357176)
1. Jim N. Gray: “[Notes on Data Base Operating Systems](http://jimgray.azurewebsites.net/papers/dbos.pdf),” in *Operating Systems: An Advanced Course*, Lecture Notes in Computer Science, volume 60, edited by R. Bayer, R. M. Graham, and G. Seegmüller, pages 393–481, Springer-Verlag, 1978. ISBN: 978-3-540-08755-7
1. Brian Palmer: “[How Complicated Was the Byzantine Empire?](http://www.slate.com/articles/news_and_politics/explainer/2011/10/the_byzantine_tax_code_how_complicated_was_byzantium_anyway_.html),” *slate.com*, October 20, 2011.
1. Leslie Lamport: “[My Writings](http://lamport.azurewebsites.net/pubs/pubs.html),” *lamport.azurewebsites.net*, December 16, 2014. This page can be found by searching the web for the 23-character string obtained by removing the hyphens from the string `allla-mport-spubso-ntheweb`.
1. John Rushby: “[Bus Architectures for Safety-Critical Embedded Systems](http://www.csl.sri.com/papers/emsoft01/emsoft01.pdf),” at *1st International Workshop on Embedded Software* (EMSOFT), October 2001.
1. Jake Edge: “[ELC: SpaceX Lessons Learned](http://lwn.net/Articles/540368/),” *lwn.net*, March 6, 2013.
1. Andrew Miller and Joseph J. LaViola, Jr.: “[Anonymous Byzantine Consensus from Moderately-Hard Puzzles: A Model for Bitcoin](http://nakamotoinstitute.org/static/docs/anonymous-byzantine-consensus.pdf),” University of Central Florida, Technical Report CS-TR-14-01, April 2014.
1. James Mickens: “[The Saddest Moment](https://www.usenix.org/system/files/login-logout_1305_mickens.pdf),” *USENIX ;login: logout*, May 2013.
1. Evan Gilman: “[The Discovery of Apache ZooKeeper’s Poison Packet](http://www.pagerduty.com/blog/the-discovery-of-apache-zookeepers-poison-packet/),” *pagerduty.com*, May 7, 2015.
1. Jonathan Stone and Craig Partridge: “[When the CRC and TCP Checksum Disagree](https://web.archive.org/web/20220818235232/https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.27.7611&rep=rep1&type=pdf),” at *ACM Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication* (SIGCOMM), August 2000. [doi:10.1145/347059.347561](http://dx.doi.org/10.1145/347059.347561)
1. Evan Jones: “[How Both TCP and Ethernet Checksums Fail](http://www.evanjones.ca/tcp-and-ethernet-checksums-fail.html),” *evanjones.ca*, October 5, 2015.
1. Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer: “[Consensus in the Presence of Partial Synchrony](https://dl.acm.org/doi/10.1145/42282.42283),” *Journal of the ACM*, volume 35, number 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](http://dx.doi.org/10.1145/42282.42283)
1. Peter Bailis and Ali Ghodsi: “[Eventual Consistency Today: Limitations, Extensions, and Beyond](http://queue.acm.org/detail.cfm?id=2462076),” *ACM Queue*, volume 11, number 3, pages 55-63, March 2013. [doi:10.1145/2460276.2462076](http://dx.doi.org/10.1145/2460276.2462076)
1. Bowen Alpern and Fred B. Schneider: “[Defining Liveness](https://www.cs.cornell.edu/fbs/publications/DefLiveness.pdf),” *Information Processing Letters*, volume 21, number 4, pages 181–185, October 1985. [doi:10.1016/0020-0190(85)90056-0](http://dx.doi.org/10.1016/0020-0190(85)90056-0)
1. Flavio P. Junqueira: “[Dude, Where’s My Metadata?](https://web.archive.org/web/20230604215314/https://fpj.systems/2015/05/28/dude-wheres-my-metadata/),” *fpj.me*, May 28, 2015.
1. Scott Sanders: “[January 28th Incident Report](https://github.com/blog/2106-january-28th-incident-report),” *github.com*, February 3, 2016.
1. Jay Kreps: “[A Few Notes on Kafka and Jepsen](http://blog.empathybox.com/post/62279088548/a-few-notes-on-kafka-and-jepsen),” *blog.empathybox.com*, September 25, 2013.
1. Thanh Do, Mingzhe Hao, Tanakorn Leesatapornwongsa, et al.: “[Limplock: Understanding the Impact of Limpware on Scale-out Cloud Systems](http://ucare.cs.uchicago.edu/pdf/socc13-limplock.pdf),” at *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523627](http://dx.doi.org/10.1145/2523616.2523627)
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.

[^译著1]: 原诗为：Hey I just met you. The network’s laggy. But here’s my data. So store it maybe. 应改编自《Call Me Maybe》歌词：Hey, I just met you, And this is crazy, But here's my number, So call me, maybe?


================================================
FILE: content/v1/ch9.md
================================================
---
title: "第九章：一致性与共识"
linkTitle: "9. 一致性与共识"
weight: 209
math: true
breadcrumbs: false
---


![](/map/ch09.png)

> 好死还是赖活着？
> —— Jay Kreps, 关于 Kafka 与 Jepsen 的若干笔记 (2013)


正如 [第八章](/v1/ch8) 所讨论的，分布式系统中的许多事情可能会出错。处理这种故障的最简单方法是简单地让整个服务失效，并向用户显示错误消息。如果无法接受这个解决方案，我们就需要找到容错的方法 —— 即使某些内部组件出现故障，服务也能正常运行。

在本章中，我们将讨论构建容错分布式系统的算法和协议的一些例子。我们将假设 [第八章](/v1/ch8) 的所有问题都可能发生：网络中的数据包可能会丢失、重新排序、重复推送或任意延迟；时钟只是尽其所能地近似；且节点可以暂停（例如，由于垃圾收集）或随时崩溃。

构建容错系统的最好方法，是找到一些带有实用保证的通用抽象，实现一次，然后让应用依赖这些保证。这与 [第七章](/v1/ch7) 中的事务处理方法相同：通过使用事务，应用可以假装没有崩溃（原子性），没有其他人同时访问数据库（隔离），存储设备是完全可靠的（持久性）。即使发生崩溃，竞态条件和磁盘故障，事务抽象隐藏了这些问题，因此应用不必担心它们。

现在我们将继续沿着同样的路线前进，寻求可以让应用忽略分布式系统部分问题的抽象概念。例如，分布式系统最重要的抽象之一就是 **共识（consensus）**：**就是让所有的节点对某件事达成一致**。正如我们在本章中将会看到的那样，要可靠地达成共识，且不被网络故障和进程故障所影响，是一个令人惊讶的棘手问题。

一旦达成共识，应用可以将其用于各种目的。例如，假设你有一个单主复制的数据库。如果主库挂掉，并且需要故障切换到另一个节点，剩余的数据库节点可以使用共识来选举新的领导者。正如在 “[处理节点宕机](/v1/ch5#处理节点宕机)” 中所讨论的那样，重要的是只有一个领导者，且所有的节点都认同其领导。如果两个节点都认为自己是领导者，这种情况被称为 **脑裂（split brain）**，它经常会导致数据丢失。正确实现共识有助于避免这种问题。

在本章后面的 “[分布式事务与共识](#分布式事务与共识)” 中，我们将研究解决共识和相关问题的算法。但首先，我们需要探索可以在分布式系统中提供的保证和抽象的范围。

我们需要了解可以做什么和不可以做什么的范围：在某些情况下，系统可以容忍故障并继续工作；在其他情况下，这是不可能的。我们将深入研究什么可能而什么不可能的限制，既通过理论证明，也通过实际实现。我们将在本章中概述这些基本限制。

分布式系统领域的研究人员几十年来一直在研究这些主题，所以有很多资料 —— 我们只能介绍一些皮毛。在本书中，我们没有空间去详细介绍形式模型和证明的细节，所以我们会按照直觉来介绍。如果你有兴趣，参考文献可以提供更多的深度。


## 一致性保证

在 “[复制延迟问题](/v1/ch5#复制延迟问题)” 中，我们看到了数据库复制中发生的一些时序问题。如果你在同一时刻查看两个数据库节点，则可能在两个节点上看到不同的数据，因为写请求在不同的时间到达不同的节点。无论数据库使用何种复制方法（单主复制、多主复制或无主复制），都会出现这些不一致情况。

大多数复制的数据库至少提供了 **最终一致性**，这意味着如果你停止向数据库写入数据并等待一段不确定的时间，那么最终所有的读取请求都会返回相同的值【1】。换句话说，不一致性是暂时的，最终会自行解决（假设网络中的任何故障最终都会被修复）。最终一致性的一个更好的名字可能是 **收敛（convergence）**，因为我们预计所有的副本最终会收敛到相同的值【2】。

然而，这是一个非常弱的保证 —— 它并没有说什么时候副本会收敛。在收敛之前，读操作可能会返回任何东西或什么都没有【1】。例如，如果你写入了一个值，然后立即再次读取，这并不能保证你能看到刚才写入的值，因为读请求可能会被路由到另外的副本上。（请参阅 “[读己之写](/v1/ch5#读己之写)” ）。

对于应用开发人员而言，最终一致性是很困难的，因为它与普通单线程程序中变量的行为有很大区别。对于后者，如果将一个值赋给一个变量，然后很快地再次读取，不可能读到旧的值，或者读取失败。数据库表面上看起来像一个你可以读写的变量，但实际上它有更复杂的语义【3】。

在与只提供弱保证的数据库打交道时，你需要始终意识到它的局限性，而不是意外地作出太多假设。错误往往是微妙的，很难找到，也很难测试，因为应用可能在大多数情况下运行良好。当系统出现故障（例如网络中断）或高并发时，最终一致性的边缘情况才会显现出来。

本章将探索数据系统可能选择提供的更强一致性模型。它不是免费的：具有较强保证的系统可能会比保证较差的系统具有更差的性能或更少的容错性。尽管如此，更强的保证能够吸引人，因为它们更容易用对。只有见过不同的一致性模型后，才能更好地决定哪一个最适合自己的需求。

**分布式一致性模型** 和我们之前讨论的事务隔离级别的层次结构有一些相似之处【4,5】（请参阅 “[弱隔离级别](/v1/ch7#弱隔离级别)”）。尽管两者有一部分内容重叠，但它们大多是无关的问题：事务隔离主要是为了 **避免由于同时执行事务而导致的竞争状态**，而分布式一致性主要关于 **在面对延迟和故障时如何协调副本间的状态**。

本章涵盖了广泛的话题，但我们将会看到这些领域实际上是紧密联系在一起的：

* 首先看一下常用的 **最强一致性模型** 之一，**线性一致性（linearizability）**，并考察其优缺点。
* 然后我们将检查分布式系统中 [**事件顺序**](#顺序保证) 的问题，特别是因果关系和全局顺序的问题。
* 在第三节的（“[分布式事务与共识](#分布式事务与共识)”）中将探讨如何原子地提交分布式事务，这将最终引领我们走向共识问题的解决方案。


## 线性一致性

在 **最终一致** 的数据库中，如果你在同一时刻问两个不同副本相同的问题，可能会得到两个不同的答案。这很让人困惑。如果数据库可以提供只有一个副本的假象（即，只有一个数据副本），那么事情就简单太多了。那么每个客户端都会有相同的数据视图，且不必担心复制滞后了。

这就是 **线性一致性（linearizability）** 背后的想法【6】（也称为 **原子一致性（atomic consistency）**【7】，**强一致性（strong consistency）**，**立即一致性（immediate consistency）** 或 **外部一致性（external consistency ）**【8】）。线性一致性的精确定义相当微妙，我们将在本节的剩余部分探讨它。但是基本的想法是让一个系统看起来好像只有一个数据副本，而且所有的操作都是原子性的。有了这个保证，即使实际中可能有多个副本，应用也不需要担心它们。

在一个线性一致的系统中，只要一个客户端成功完成写操作，所有客户端从数据库中读取数据必须能够看到刚刚写入的值。要维护数据的单个副本的假象，系统应保障读到的值是最近的、最新的，而不是来自陈旧的缓存或副本。换句话说，线性一致性是一个 **新鲜度保证（recency guarantee）**。为了阐明这个想法，我们来看看一个非线性一致系统的例子。

![](/v1/ddia_0901.png)

**图 9-1 这个系统是非线性一致的，导致了球迷的困惑**

[图 9-1](/v1/ddia_0901.png) 展示了一个关于体育网站的非线性一致例子【9】。Alice 和 Bob 正坐在同一个房间里，都盯着各自的手机，关注着 2014 年 FIFA 世界杯决赛的结果。在最后得分公布后，Alice 刷新页面，看到宣布了获胜者，并兴奋地告诉 Bob。Bob 难以置信地刷新了自己的手机，但他的请求路由到了一个落后的数据库副本上，手机显示比赛仍在进行。

如果 Alice 和 Bob 在同一时间刷新并获得了两个不同的查询结果，也许就没有那么令人惊讶了。因为他们不知道服务器处理他们请求的精确时刻。然而 Bob 是在听到 Alice 惊呼最后得分 **之后**，点击了刷新按钮（启动了他的查询），因此他希望查询结果至少与 Alice 一样新鲜。但他的查询返回了陈旧结果，这一事实违背了线性一致性的要求。

### 什么使得系统线性一致？

线性一致性背后的基本思想很简单：使系统看起来好像只有一个数据副本。然而确切来讲，实际上有更多要操心的地方。为了更好地理解线性一致性，让我们再看几个例子。

[图 9-2](/v1/ddia_0902.png) 显示了三个客户端在线性一致数据库中同时读写相同的键 `x`。在分布式系统文献中，`x` 被称为 **寄存器（register）**，例如，它可以是键值存储中的一个 **键**，关系数据库中的一 **行**，或文档数据库中的一个 **文档**。

![](/v1/ddia_0902.png)

**图 9-2 如果读取请求与写入请求并发，则可能会返回旧值或新值**

为了简单起见，[图 9-2](/v1/ddia_0902.png) 采用了用户请求的视角，而不是数据库内部的视角。每个横柱都是由客户端发出的请求，其中柱头是请求发送的时刻，柱尾是客户端收到响应的时刻。因为网络延迟变化无常，客户端不知道数据库处理其请求的精确时间 —— 只知道它发生在发送请求和接收响应之间的某个时刻。[^i]

[^i]: 这个图的一个微妙的细节是它假定存在一个全局时钟，由水平轴表示。虽然真实的系统通常没有准确的时钟（请参阅 “[不可靠的时钟](/v1/ch8#不可靠的时钟)”），但这种假设是允许的：为了分析分布式算法，我们可以假设存在一个精确的全局时钟，不过算法无法访问它【47】。算法只能看到由石英振荡器和 NTP 产生的对真实时间的逼近。

在这个例子中，寄存器有两种类型的操作：

* $read(x)⇒v$ 表示客户端请求读取寄存器 `x` 的值，数据库返回值 `v`。
* $write(x,v)⇒r$ 表示客户端请求将寄存器 `x` 设置为值 `v` ，数据库返回响应 `r` （可能正确，可能错误）。

在 [图 9-2](/v1/ddia_0902.png) 中，`x` 的值最初为 `0`，客户端 C 执行写请求将其设置为 `1`。发生这种情况时，客户端 A 和 B 反复轮询数据库以读取最新值。A 和 B 的请求可能会收到怎样的响应？

* 客户端 A 的第一个读操作，完成于写操作开始之前，因此必须返回旧值 `0`。
* 客户端 A 的最后一个读操作，开始于写操作完成之后。如果数据库是线性一致性的，它必然返回新值 `1`：因为读操作和写操作一定是在其各自的起止区间内的某个时刻被处理。如果在写入结束后开始读取，则读取处理一定发生在写入完成之后，因此它必须看到写入的新值。
* 与写操作在时间上重叠的任何读操作，可能会返回 `0` 或 `1` ，因为我们不知道读取时，写操作是否已经生效。这些操作是 **并发（concurrent）** 的。

但是，这还不足以完全描述线性一致性：如果与写入同时发生的读取可以返回旧值或新值，那么读者可能会在写入期间看到数值在旧值和新值之间来回翻转。这个系统对 “单一数据副本” 的模拟还不是我们所期望的。[^ii]

[^ii]: 如果读取（与写入同时发生时）可能返回旧值或新值，则称该寄存器为 **常规寄存器（regular register）**【7,25】

为了使系统线性一致，我们需要添加另一个约束，如 [图 9-3](/v1/ddia_0903.png) 所示

![](/v1/ddia_0903.png)

**图 9-3 任何一个读取返回新值后，所有后续读取（在相同或其他客户端上）也必须返回新值。**

在一个线性一致的系统中，我们可以想象，必定存在某个时间点（在写操作的开始和结束之间），`x` 的值在这一刻从 `0` 原子性地翻转到 `1`。因此，如果一个客户端的读取返回新的值 `1`，即使写操作尚未完成，所有后续读取也必须返回新值。

[图 9-3](/v1/ddia_0903.png) 中的箭头说明了这个时序依赖关系。客户端 A 是第一个读取到新值 `1` 的客户端。在 A 的读取返回之后，B 开始新的读取。由于 B 的读取严格发生于 A 的读取之后，因此即使 C 的写入仍在进行中，也必须返回 `1`（与 [图 9-1](/v1/ddia_0901.png) 中的 Alice 和 Bob 的情况相同：在 Alice 读取新值之后，Bob 也希望读取新的值）。

我们可以进一步细化这个时序图，展示每个操作是如何在特定时刻原子性生效的。[图 9-4](/v1/ddia_0904.png) 显示了一个更复杂的例子【10】。

在 [图 9-4](/v1/ddia_0904.png) 中，除了读写之外，还增加了第三种类型的操作：

* $cas(x, v_{old}, v_{new})⇒r$ 表示客户端请求进行原子性的 [**比较并设置**](/v1/ch7#比较并设置（CAS）) 操作。如果寄存器 $x$ 的当前值等于 $v_{old}$ ，则应该原子地设置为 $v_{new}$ 。如果 $x$ 不等于 $v_{old}$ ，则操作应该保持寄存器不变并返回一个错误。$r$ 是数据库的响应（正确或错误）。

[图 9-4](/v1/ddia_0904.png) 中的每个操作都在我们认为操作被执行的时候用竖线标出（在每个操作的横柱之内）。这些标记按顺序连在一起，其结果必须是一个有效的寄存器读写序列（**每次读取都必须返回最近一次写入设置的值**）。

线性一致性的要求是，操作标记的连线总是按时间（从左到右）向前移动，而不是向后移动。这个要求确保了我们之前讨论的新鲜度保证：一旦新的值被写入或读取，所有后续的读都会看到写入的值，直到它被再次覆盖。

![](/v1/ddia_0904.png)

**图 9-4 将读取和写入看起来已经生效的时间点进行可视化。客户端 B 的最后一次读取不是线性一致的**

[图 9-4](/v1/ddia_0904.png) 中有一些有趣的细节需要指出：

* 首先，客户端 B 发送一个读取 `x` 的请求，然后客户端 D 发送一个请求将 `x` 设置为 `0`，然后客户端 A 发送请求将 `x` 设置为 `1`。然而，返回给 B 的读取值为 `1`（由 A 写入的值）。这是可以的：这意味着数据库首先处理 D 的写入，然后是 A 的写入，最后是 B 的读取。虽然这不是请求发送的顺序，但这是一个可以接受的顺序，因为这三个请求是并发的。也许 B 的读请求在网络上略有延迟，所以它在两次写入之后才到达数据库。

* 在客户端 A 从数据库收到响应之前，客户端 B 的读取返回 `1` ，表示写入值 `1` 已成功。这也是可以的：这并不意味着在写之前读到了值，这只是意味着从数据库到客户端 A 的正确响应在网络中略有延迟。

* 此模型不假设有任何事务隔离：另一个客户端可能随时更改值。例如，C 首先读取到 `1` ，然后读取到 `2` ，因为两次读取之间的值被 B 所更改。可以使用原子 **比较并设置（cas）** 操作来检查该值是否未被另一客户端同时更改：B 和 C 的 **cas** 请求成功，但是 D 的 **cas** 请求失败（在数据库处理它时，`x` 的值不再是 `0` ）。

* 客户端 B 的最后一次读取（阴影条柱中）不是线性一致的。该操作与 C 的 **cas** 写操作并发（它将 `x` 从 `2` 更新为 `4` ）。在没有其他请求的情况下，B 的读取返回 `2` 是可以的。然而，在 B 的读取开始之前，客户端 A 已经读取了新的值 `4` ，因此不允许 B 读取比 A 更旧的值。这同样与 [图 9-1](/v1/ddia_0901.png) 中的 Alice 和 Bob 的情况相同。

这就是线性一致性背后的直觉。正式的定义【6】更准确地描述了它。通过记录所有请求和响应的时序，并检查它们是否可以排列成有效的顺序，就可以测试一个系统的行为是否线性一致（尽管计算开销很大）【11】。


> [!NOTE] 线性一致性与可串行化
>
> **线性一致性** 容易和 [**可串行化**](/v1/ch7#可串行化) 相混淆，因为两个词似乎都是类似 “可以按顺序排列” 的东西。但它们是两种完全不同的保证，区分两者非常重要：
>
> ***可串行化***
> : **可串行化（Serializability）** 是事务的隔离属性，每个事务可以读写多个对象（行，文档，记录）—— 请参阅 “[单对象和多对象操作](/v1/ch7#单对象和多对象操作)”。它确保事务的行为，与它们按照 **某种** 顺序依次执行的结果相同（每个事务在下一个事务开始之前运行完成）。这种执行顺序可以与事务实际执行的顺序不同。【12】。
>
> ***线性一致性***
> : **线性一致性（Linearizability）** 是读取和写入寄存器（单个对象）的 **新鲜度保证**。它不会将操作组合为事务，因此它也不会阻止写入偏差等问题（请参阅 “[写入偏差和幻读](/v1/ch7#写入偏差与幻读)”），除非采取其他措施（例如 [物化冲突](/v1/ch7#物化冲突)）。
>
> 一个数据库可以提供可串行化和线性一致性，这种组合被称为严格的可串行化或 **强的单副本可串行化（strong-1SR）**【4,13】。基于两阶段锁定的可串行化实现（请参阅 “[两阶段锁定](/v1/ch7#两阶段锁定)” 一节）或 **真的串行执行**（请参阅 “[真的串行执行](/v1/ch7#真的串行执行)”一节）通常是线性一致性的。
>
> 但是，可串行化的快照隔离（请参阅 “[可串行化快照隔离](/v1/ch7#可串行化快照隔离)”）不是线性一致性的：按照设计，它从一致的快照中进行读取，以避免读者和写者之间的锁竞争。一致性快照的要点就在于 **它不会包括该快照之后的写入**，因此从快照读取不是线性一致性的。


### 依赖线性一致性

线性一致性在什么情况下有用？观看体育比赛的最后得分可能是一个轻率的例子：滞后了几秒钟的结果不太可能在这种情况下造成任何真正的伤害。然而对于少数领域，线性一致性是系统正确工作的一个重要条件。

#### 锁定和领导选举

一个使用单主复制的系统，需要确保领导者真的只有一个，而不是几个（脑裂）。一种选择领导者的方法是使用锁：每个节点在启动时尝试获取锁，成功者成为领导者【14】。不管这个锁是如何实现的，它必须是线性一致的：所有节点必须就哪个节点拥有锁达成一致，否则就没用了。

诸如 Apache ZooKeeper 【15】和 etcd 【16】之类的协调服务通常用于实现分布式锁和领导者选举。它们使用共识算法，以容错的方式实现线性一致的操作（在本章后面的 “[容错共识](#容错共识)” 中讨论此类算法）[^iii]。还有许多微妙的细节来正确地实现锁和领导者选举（例如，请参阅 “[领导者和锁](/v1/ch8#领导者和锁)” 中的防护问题），而像 Apache Curator 【17】这样的库则通过在 ZooKeeper 之上提供更高级别的配方来提供帮助。但是，线性一致性存储服务是这些协调任务的基础。

[^iii]: 严格地说，ZooKeeper 和 etcd 提供线性一致性的写操作，但读取可能是陈旧的，因为默认情况下，它们可以由任何一个副本提供服务。你可以选择请求线性一致性读取：etcd 称之为 **法定人数读取（quorum read）**【16】，而在 ZooKeeper 中，你需要在读取之前调用 `sync()`【15】。请参阅 “[使用全序广播实现线性一致的存储](#使用全序广播实现线性一致的存储)”。

分布式锁也在一些分布式数据库（如 Oracle Real Application Clusters（RAC）【18】）中有更细粒度级别的使用。RAC 对每个磁盘页面使用一个锁，多个节点共享对同一个磁盘存储系统的访问权限。由于这些线性一致的锁处于事务执行的关键路径上，RAC 部署通常具有用于数据库节点之间通信的专用集群互连网络。

#### 约束和唯一性保证

唯一性约束在数据库中很常见：例如，用户名或电子邮件地址必须唯一标识一个用户，而在文件存储服务中，不能有两个具有相同路径和文件名的文件。如果要在写入数据时强制执行此约束（例如，如果两个人试图同时创建一个具有相同名称的用户或文件，其中一个将返回一个错误），则需要线性一致性。

这种情况实际上类似于一个锁：当一个用户注册你的服务时，可以认为他们获得了所选用户名的 “锁”。该操作与原子性的比较与设置（CAS）非常相似：将用户名赋予声明它的用户，前提是用户名尚未被使用。

如果想要确保银行账户余额永远不会为负数，或者不会出售比仓库里的库存更多的物品，或者两个人不会都预定了航班或剧院里同一时间的同一个位置，也会出现类似的问题。这些约束条件都要求所有节点都同意一个最新的值（账户余额，库存水平，座位占用率）。

在实际应用中，宽松地处理这些限制有时是可以接受的（例如，如果航班超额预订，你可以将客户转移到不同的航班并为其提供补偿）。在这种情况下，可能不需要线性一致性，我们将在 “[及时性与完整性](/v1/ch12#及时性与完整性)” 中讨论这种宽松的约束。

然而，一个硬性的唯一性约束（关系型数据库中常见的那种）需要线性一致性。其他类型的约束，如外键或属性约束，可以不需要线性一致性【19】。

#### 跨信道的时序依赖

注意 [图 9-1](/v1/ddia_0901.png) 中的一个细节：如果 Alice 没有惊呼得分，Bob 就不会知道他的查询结果是陈旧的。他会在几秒钟之后再次刷新页面，并最终看到最后的分数。由于系统中存在额外的信道（Alice 的声音传到了 Bob 的耳朵中），线性一致性的违背才被注意到。

计算机系统也会出现类似的情况。例如，假设有一个网站，用户可以上传照片，一个后台进程会调整照片大小，降低分辨率以加快下载速度（缩略图）。该系统的架构和数据流如 [图 9-5](/v1/ddia_0905.png) 所示。

图像缩放器需要明确的指令来执行尺寸缩放作业，指令是 Web 服务器通过消息队列发送的（请参阅 [第十一章](/v1/ch11)）。Web 服务器不会将整个照片放在队列中，因为大多数消息代理都是针对较短的消息而设计的，而一张照片的空间占用可能达到几兆字节。取而代之的是，首先将照片写入文件存储服务，写入完成后再将给缩放器的指令放入消息队列。

![](/v1/ddia_0905.png)

**图 9-5 Web 服务器和图像缩放器通过文件存储和消息队列进行通信，打开竞争条件的可能性。**

如果文件存储服务是线性一致的，那么这个系统应该可以正常工作。如果它不是线性一致的，则存在竞争条件的风险：消息队列（[图 9-5](/v1/ddia_0905.png) 中的步骤 3 和 4）可能比存储服务内部的复制（replication）更快。在这种情况下，当缩放器读取图像（步骤 5）时，可能会看到图像的旧版本，或者什么都没有。如果它处理的是旧版本的图像，则文件存储中的全尺寸图和缩略图就产生了永久性的不一致。

出现这个问题是因为 Web 服务器和缩放器之间存在两个不同的信道：文件存储与消息队列。没有线性一致性的新鲜性保证，这两个信道之间的竞争条件是可能的。这种情况类似于 [图 9-1](/v1/ddia_0901.png)，数据库复制与 Alice 的嘴到 Bob 耳朵之间的真人音频信道之间也存在竞争条件。

线性一致性并不是避免这种竞争条件的唯一方法，但它是最容易理解的。如果你可以控制额外信道（例如消息队列的例子，而不是在 Alice 和 Bob 的例子），则可以使用在 “[读己之写](/v1/ch5#读己之写)” 讨论过的类似方法，不过会有额外的复杂度代价。

### 实现线性一致的系统

我们已经见到了几个线性一致性有用的例子，让我们思考一下，如何实现一个提供线性一致语义的系统。

由于线性一致性本质上意味着 “表现得好像只有一个数据副本，而且所有的操作都是原子的”，所以最简单的答案就是，真的只用一个数据副本。但是这种方法无法容错：如果持有该副本的节点失效，数据将会丢失，或者至少无法访问，直到节点重新启动。

使系统容错最常用的方法是使用复制。我们再来回顾 [第五章](/v1/ch5) 中的复制方法，并比较它们是否可以满足线性一致性：

单主复制（可能线性一致）
: 在具有单主复制功能的系统中（请参阅 “[领导者与追随者](/v1/ch5#领导者与追随者)”），主库具有用于写入的数据的主副本，而追随者在其他节点上保留数据的备份副本。如果从主库或同步更新的从库读取数据，它们 **可能（potential）** 是线性一致性的 [^iv]。然而，实际上并不是每个单主数据库都是线性一致性的，无论是因为设计的原因（例如，因为使用了快照隔离）还是因为在并发处理上存在错误【10】。

  [^iv]: 对单主数据库进行分区（分片），使得每个分区有一个单独的领导者，不会影响线性一致性，因为线性一致性只是对单一对象的保证。交叉分区事务是一个不同的问题（请参阅 “[分布式事务与共识](#分布式事务与共识)”）。

  从主库读取依赖一个假设，你确切地知道领导者是谁。正如在 “[真相由多数所定义](/v1/ch8#真相由多数所定义)” 中所讨论的那样，一个节点很可能会认为它是领导者，而事实上并非如此 —— 如果具有错觉的领导者继续为请求提供服务，可能违反线性一致性【20】。使用异步复制，故障切换时甚至可能会丢失已提交的写入（请参阅 “[处理节点宕机](/v1/ch5#处理节点宕机)”），这同时违反了持久性和线性一致性。

共识算法（线性一致）
: 一些在本章后面讨论的共识算法，与单主复制类似。然而，共识协议包含防止脑裂和陈旧副本的措施。正是由于这些细节，共识算法可以安全地实现线性一致性存储。例如，Zookeeper 【21】和 etcd 【22】就是这样工作的。

多主复制（非线性一致）
: 具有多主复制的系统通常不是线性一致的，因为它们同时在多个节点上处理写入，并将其异步复制到其他节点。因此，它们可能会产生需要被解决的写入冲突（请参阅 “[处理写入冲突](/v1/ch5#处理写入冲突)”）。这种冲突是因为缺少单一数据副本所导致的。

无主复制（也许不是线性一致的）
: 对于无主复制的系统（Dynamo 风格；请参阅 “[无主复制](/v1/ch5#无主复制)”），有时候人们会声称通过要求法定人数读写（ $w + r > n$ ）可以获得 “强一致性”。这取决于法定人数的具体配置，以及强一致性如何定义（通常不完全正确）。
  
  基于日历时钟（例如，在 Cassandra 中；请参阅 “[依赖同步时钟](/v1/ch8#依赖同步时钟)”）的 “最后写入胜利” 冲突解决方法几乎可以确定是非线性一致的，由于时钟偏差，不能保证时钟的时间戳与实际事件顺序一致。宽松的法定人数（请参阅 “[宽松的法定人数与提示移交](/v1/ch5#宽松的法定人数与提示移交)”）也破坏了线性一致的可能性。即使使用严格的法定人数，非线性一致的行为也是可能的，如下节所示。

#### 线性一致性和法定人数

直觉上在 Dynamo 风格的模型中，严格的法定人数读写应该是线性一致性的。但是当我们有可变的网络延迟时，就可能存在竞争条件，如 [图 9-6](/v1/ddia_0906.png) 所示。

![](/v1/ddia_0906.png)

**图 9-6 非线性一致的执行，尽管使用了严格的法定人数**

在 [图 9-6](/v1/ddia_0906.png) 中，$x$ 的初始值为 0，写入客户端通过向所有三个副本（ $n = 3, w = 3$ ）发送写入将 $x$ 更新为 `1`。客户端 A 并发地从两个节点组成的法定人数（ $r = 2$ ）中读取数据，并在其中一个节点上看到新值 `1` 。客户端 B 也并发地从两个不同的节点组成的法定人数中读取，并从两个节点中取回了旧值 `0` 。

法定人数条件满足（ $w + r> n$ ），但是这个执行是非线性一致的：B 的请求在 A 的请求完成后开始，但是 B 返回旧值，而 A 返回新值。（又一次，如同 Alice 和 Bob 的例子 [图 9-1](/v1/ddia_0901.png)）

有趣的是，通过牺牲性能，可以使 Dynamo 风格的法定人数线性化：读取者必须在将结果返回给应用之前，同步执行读修复（请参阅 “[读修复和反熵](/v1/ch5#读修复和反熵)”） ，并且写入者必须在发送写入之前，读取法定数量节点的最新状态【24,25】。然而，由于性能损失，Riak 不执行同步读修复【26】。Cassandra 在进行法定人数读取时，**确实** 在等待读修复完成【27】；但是由于使用了最后写入胜利的冲突解决方案，当同一个键有多个并发写入时，将不能保证线性一致性。

而且，这种方式只能实现线性一致的读写；不能实现线性一致的比较和设置（CAS）操作，因为它需要一个共识算法【28】。

总而言之，最安全的做法是：假设采用 Dynamo 风格无主复制的系统不能提供线性一致性。


### 线性一致性的代价

一些复制方法可以提供线性一致性，另一些复制方法则不能，因此深入地探讨线性一致性的优缺点是很有趣的。

我们已经在 [第五章](/v1/ch5) 中讨论了不同复制方法的一些用例。例如对多数据中心的复制而言，多主复制通常是理想的选择（请参阅 “[运维多个数据中心](/v1/ch5#运维多个数据中心)”）。[图 9-7](/v1/ddia_0907.png) 说明了这种部署的一个例子。

![](/v1/ddia_0907.png)

**图 9-7 网络中断迫使在线性一致性和可用性之间做出选择。**

考虑这样一种情况：如果两个数据中心之间发生网络中断会发生什么？我们假设每个数据中心内的网络正在工作，客户端可以访问数据中心，但数据中心之间彼此无法互相连接。

使用多主数据库，每个数据中心都可以继续正常运行：由于在一个数据中心写入的数据是异步复制到另一个数据中心的，所以在恢复网络连接时，写入操作只是简单地排队并交换。

另一方面，如果使用单主复制，则主库必须位于其中一个数据中心。任何写入和任何线性一致的读取请求都必须发送给该主库，因此对于连接到从库所在数据中心的客户端，这些读取和写入请求必须通过网络同步发送到主库所在的数据中心。

在单主配置的条件下，如果数据中心之间的网络被中断，则连接到从库数据中心的客户端无法联系到主库，因此它们无法对数据库执行任何写入，也不能执行任何线性一致的读取。它们仍能从从库读取，但结果可能是陈旧的（非线性一致）。如果应用需要线性一致的读写，却又位于与主库网络中断的数据中心，则网络中断将导致这些应用不可用。

如果客户端可以直接连接到主库所在的数据中心，这就不是问题了，那些应用可以继续正常工作。但只能访问从库数据中心的客户端会中断运行，直到网络连接得到修复。

#### CAP定理

这个问题不仅仅是单主复制和多主复制的后果：任何线性一致的数据库都有这个问题，不管它是如何实现的。这个问题也不仅仅局限于多数据中心部署，而可能发生在任何不可靠的网络上，即使在同一个数据中心内也是如此。问题面临的权衡如下：[^v]

* 如果应用需要线性一致性，且某些副本因为网络问题与其他副本断开连接，那么这些副本掉线时不能处理请求。请求必须等到网络问题解决，或直接返回错误。（无论哪种方式，服务都 **不可用**）。
* 如果应用不需要线性一致性，那么某个副本即使与其他副本断开连接，也可以独立处理请求（例如多主复制）。在这种情况下，应用可以在网络问题解决前保持可用，但其行为不是线性一致的。

[^v]: 这两种选择有时分别称为 CP（在网络分区下一致但不可用）和 AP（在网络分区下可用但不一致）。但是，这种分类方案存在一些缺陷【9】，所以最好不要这样用。

因此，不需要线性一致性的应用对网络问题有更强的容错能力。这种见解通常被称为 CAP 定理【29,30,31,32】，由 Eric Brewer 于 2000 年命名，尽管 70 年代的分布式数据库设计者早就知道了这种权衡【33,34,35,36】。

CAP 最初是作为一个经验法则提出的，没有准确的定义，目的是开始讨论数据库的权衡。那时候许多分布式数据库侧重于在共享存储的集群上提供线性一致性的语义【18】，CAP 定理鼓励数据库工程师向分布式无共享系统的设计领域深入探索，这类架构更适合实现大规模的网络服务【37】。对于这种文化上的转变，CAP 值得赞扬 —— 它见证了自 00 年代中期以来新数据库的技术爆炸（即 NoSQL）。

> #### CAP定理没有帮助
>
> CAP 有时以这种面目出现：一致性，可用性和分区容错性：三者只能择其二。不幸的是这种说法很有误导性【32】，因为网络分区是一种故障类型，所以它并不是一个选项：不管你喜不喜欢它都会发生【38】。
>
> 在网络正常工作的时候，系统可以提供一致性（线性一致性）和整体可用性。发生网络故障时，你必须在线性一致性和整体可用性之间做出选择。因此，CAP 更好的表述是：在分区时要么选择一致，要么选择可用【39】。网络越可靠，需要做出这种选择的次数就越少，但在某些时候选择仍是不可避免的。
>
> 在 CAP 的讨论中，术语可用性有几个相互矛盾的定义，形式化作为一个定理【30】并不符合其通常的含义【40】。许多所谓的 “高可用”（容错）系统实际上不符合 CAP 对可用性的特殊定义。总而言之，围绕着 CAP 有很多误解和困惑，并不能帮助我们更好地理解系统，所以最好避免使用 CAP。

CAP 定理的正式定义仅限于很狭隘的范围【30】，它只考虑了一个一致性模型（即线性一致性）和一种故障（网络分区 [^vi]，或活跃但彼此断开的节点）。它没有讨论任何关于网络延迟，死亡节点或其他权衡的事。因此，尽管 CAP 在历史上有一些影响力，但对于设计系统而言并没有实际价值【9,40】。

在分布式系统中有更多有趣的 “不可能” 的结果【41】，且 CAP 定理现在已经被更精确的结果取代【2,42】，所以它现在基本上成了历史古迹了。

[^vi]: 正如 “[真实世界的网络故障](/v1/ch8#真实世界的网络故障)” 中所讨论的，本书使用 **分区（partition）** 指代将大数据集细分为小数据集的操作（分片；请参阅 [第六章](/v1/ch6)）。与之对应的是，**网络分区（network partition）** 是一种特定类型的网络故障，我们通常不会将其与其他类型的故障分开考虑。但是，由于它是 CAP 的 P，所以这种情况下我们无法避免混乱。

#### 线性一致性和网络延迟

虽然线性一致是一个很有用的保证，但实际上，线性一致的系统惊人的少。例如，现代多核 CPU 上的内存甚至都不是线性一致的【43】：如果一个 CPU 核上运行的线程写入某个内存地址，而另一个 CPU 核上运行的线程不久之后读取相同的地址，并没有保证一定能读到第一个线程写入的值（除非使用了 **内存屏障（memory barrier）** 或 **围栏（fence）**【44】）。

这种行为的原因是每个 CPU 核都有自己的内存缓存和存储缓冲区。默认情况下，内存访问首先走缓存，任何变更会异步写入主存。因为缓存访问比主存要快得多【45】，所以这个特性对于现代 CPU 的良好性能表现至关重要。但是现在就有几个数据副本（一个在主存中，也许还有几个在不同缓存中的其他副本），而且这些副本是异步更新的，所以就失去了线性一致性。

为什么要做这个权衡？对多核内存一致性模型而言，CAP 定理是没有意义的：在同一台计算机中，我们通常假定通信都是可靠的。并且我们并不指望一个 CPU 核能在脱离计算机其他部分的条件下继续正常工作。牺牲线性一致性的原因是 **性能（performance）**，而不是容错。

许多分布式数据库也是如此：它们是 **为了提高性能** 而选择了牺牲线性一致性，而不是为了容错【46】。线性一致的速度很慢 —— 这始终是事实，而不仅仅是网络故障期间。

能找到一个更高效的线性一致存储实现吗？看起来答案是否定的：Attiya 和 Welch 【47】证明，如果你想要线性一致性，读写请求的响应时间至少与网络延迟的不确定性成正比。在像大多数计算机网络一样具有高度可变延迟的网络中（请参阅 “[超时与无穷的延迟](/v1/ch8#超时与无穷的延迟)”），线性一致读写的响应时间不可避免地会很高。更快的线性一致算法不存在，但更弱的一致性模型可以快得多，所以对延迟敏感的系统而言，这类权衡非常重要。在 [第十二章](/v1/ch12) 中将讨论一些在不牺牲正确性的前提下，绕开线性一致性的方法。


## 顺序保证

之前说过，线性一致寄存器的行为就好像只有单个数据副本一样，且每个操作似乎都是在某个时间点以原子性的方式生效的。这个定义意味着操作是按照某种良好定义的顺序执行的。我们将操作以看上去被执行的顺序连接起来，以此说明了 [图 9-4](/v1/ddia_0904.png) 中的顺序。

**顺序（ordering）** 这一主题在本书中反复出现，这表明它可能是一个重要的基础性概念。让我们简要回顾一下其它曾经出现过 **顺序** 的上下文：

* 在 [第五章](/v1/ch5) 中我们看到，领导者在单主复制中的主要目的就是，在复制日志中确定 **写入顺序（order of write）**—— 也就是从库应用这些写入的顺序。如果不存在一个领导者，则并发操作可能导致冲突（请参阅 “[处理写入冲突](/v1/ch5#处理写入冲突)”）。
* 在 [第七章](/v1/ch7) 中讨论的 **可串行化**，是关于事务表现的像按 **某种先后顺序（some sequential order）** 执行的保证。它可以字面意义上地以 **串行顺序（serial order）** 执行事务来实现，或者允许并行执行，但同时防止序列化冲突来实现（通过锁或中止事务）。
* 在 [第八章](/v1/ch8) 讨论过的在分布式系统中使用时间戳和时钟（请参阅 “[依赖同步时钟](/v1/ch8#依赖同步时钟)”）是另一种将顺序引入无序世界的尝试，例如，确定两个写入操作哪一个更晚发生。

事实证明，顺序、线性一致性和共识之间有着深刻的联系。尽管这个概念比本书其他部分更加理论化和抽象，但对于明确系统的能力范围（可以做什么和不可以做什么）而言是非常有帮助的。我们将在接下来的几节中探讨这个话题。

### 顺序与因果关系

**顺序** 反复出现有几个原因，其中一个原因是，它有助于保持 **因果关系（causality）**。在本书中我们已经看到了几个例子，其中因果关系是很重要的：

* 在 “[一致前缀读](/v1/ch5#一致前缀读)”（[图 5-5](/v1/ddia_0505.png)）中，我们看到一个例子：一个对话的观察者首先看到问题的答案，然后才看到被回答的问题。这是令人困惑的，因为它违背了我们对 **因（cause）** 与 **果（effect）** 的直觉：如果一个问题被回答，显然问题本身得先在那里，因为给出答案的人必须先看到这个问题（假如他们并没有预见未来的超能力）。我们认为在问题和答案之间存在 **因果依赖（causal dependency）**。
* [图 5-9](/v1/ddia_0509.png) 中出现了类似的模式，我们看到三位领导者之间的复制，并注意到由于网络延迟，一些写入可能会 “压倒” 其他写入。从其中一个副本的角度来看，好像有一个对尚不存在的记录的更新操作。这里的因果意味着，一条记录必须先被创建，然后才能被更新。
* 在 “[检测并发写入](/v1/ch5#检测并发写入)” 中我们观察到，如果有两个操作 A 和 B，则存在三种可能性：A 发生在 B 之前，或 B 发生在 A 之前，或者 A 和 B**并发**。这种 **此前发生（happened before）** 关系是因果关系的另一种表述：如果 A 在 B 前发生，那么意味着 B 可能已经知道了 A，或者建立在 A 的基础上，或者依赖于 A。如果 A 和 B 是 **并发** 的，那么它们之间并没有因果联系；换句话说，我们确信 A 和 B 不知道彼此。
* 在事务快照隔离的上下文中（“[快照隔离和可重复读](/v1/ch7#快照隔离和可重复读)”），我们说事务是从一致性快照中读取的。但此语境中 “一致” 到底又是什么意思？这意味着 **与因果关系保持一致（consistent with causality）**：如果快照包含答案，它也必须包含被回答的问题【48】。在某个时间点观察整个数据库，与因果关系保持一致意味着：因果上在该时间点之前发生的所有操作，其影响都是可见的，但因果上在该时间点之后发生的操作，其影响对观察者不可见。**读偏差（read skew）** 意味着读取的数据处于违反因果关系的状态（不可重复读，如 [图 7-6](/v1/ddia_0706.png) 所示）。
* 事务之间 **写偏差（write skew）** 的例子（请参阅 “[写入偏差与幻读](/v1/ch7#写入偏差与幻读)”）也说明了因果依赖：在 [图 7-8](/v1/ddia_0708.png) 中，爱丽丝被允许离班，因为事务认为鲍勃仍在值班，反之亦然。在这种情况下，离班的动作因果依赖于对当前值班情况的观察。[可串行化快照隔离](/v1/ch7#可串行化快照隔离) 通过跟踪事务之间的因果依赖来检测写偏差。
* 在爱丽丝和鲍勃看球的例子中（[图 9-1](/v1/ddia_0901.png)），在听到爱丽丝惊呼比赛结果后，鲍勃从服务器得到陈旧结果的事实违背了因果关系：爱丽丝的惊呼因果依赖于得分宣告，所以鲍勃应该也能在听到爱丽斯惊呼后查询到比分。相同的模式在 “[跨信道的时序依赖](#跨信道的时序依赖)” 一节中，以 “图像大小调整服务” 的伪装再次出现。

因果关系对事件施加了一种 **顺序**：因在果之前；消息发送在消息收取之前。而且就像现实生活中一样，一件事会导致另一件事：某个节点读取了一些数据然后写入一些结果，另一个节点读取其写入的内容，并依次写入一些其他内容，等等。这些因果依赖的操作链定义了系统中的因果顺序，即，什么在什么之前发生。

如果一个系统服从因果关系所规定的顺序，我们说它是 **因果一致（causally consistent）** 的。例如，快照隔离提供了因果一致性：当你从数据库中读取到一些数据时，你一定还能够看到其因果前驱（假设在此期间这些数据还没有被删除）。


#### 因果顺序不是全序的

**全序（total order）** 允许任意两个元素进行比较，所以如果有两个元素，你总是可以说出哪个更大，哪个更小。例如，自然数集是全序的：给定两个自然数，比如说 5 和 13，那么你可以告诉我，13 大于 5。

然而数学集合并不完全是全序的：`{a, b}` 比 `{b, c}` 更大吗？好吧，你没法真正比较它们，因为二者都不是对方的子集。我们说它们是 **无法比较（incomparable）** 的，因此数学集合是 **偏序的（partially ordered）** ：在某些情况下，可以说一个集合大于另一个（如果一个集合包含另一个集合的所有元素），但在其他情况下它们是无法比较的 [^译注i]。

[^译注i]: 设 R 为非空集合 A 上的关系，如果 R 是自反的、反对称的和可传递的，则称 R 为 A 上的偏序关系。简称偏序，通常记作≦。一个集合 A 与 A 上的偏序关系 R 一起叫作偏序集，记作 $(A,R)$ 或 $(A, ≦)$。全序、偏序、关系、集合，这些概念的精确定义可以参考任意一本离散数学教材。

全序和偏序之间的差异反映在不同的数据库一致性模型中：

* 线性一致性

  在线性一致的系统中，操作是全序的：如果系统表现的就好像只有一个数据副本，并且所有操作都是原子性的，这意味着对任何两个操作，我们总是能判定哪个操作先发生。这个全序在 [图 9-4](/v1/ddia_0904.png) 中以时间线表示。

* 因果性

  我们说过，如果两个操作都没有在彼此 **之前发生**，那么这两个操作是并发的（请参阅 [“此前发生” 的关系和并发](/v1/ch5#“此前发生”的关系和并发)）。换句话说，如果两个事件是因果相关的（一个发生在另一个事件之前），则它们之间是有序的，但如果它们是并发的，则它们之间的顺序是无法比较的。这意味着因果关系定义了一个偏序，而不是一个全序：一些操作相互之间是有顺序的，但有些则是无法比较的。

因此，根据这个定义，在线性一致的数据存储中是不存在并发操作的：必须有且仅有一条时间线，所有的操作都在这条时间线上，构成一个全序关系。可能有几个请求在等待处理，但是数据存储确保了每个请求都是在唯一时间线上的某个时间点原子地处理的，不存在任何并发。

并发意味着时间线会分岔然后合并 —— 在这种情况下，不同分支上的操作是无法比较的（即并发操作）。在 [第五章](/v1/ch5) 中我们看到了这种现象：例如，[图 5-14](/v1/ddia_0514.png) 并不是一条直线的全序关系，而是一堆不同的操作并发进行。图中的箭头指明了因果依赖 —— 操作的偏序。

如果你熟悉像 Git 这样的分布式版本控制系统，那么其版本历史与因果关系图极其相似。通常，一个 **提交（Commit）** 发生在另一个提交之后，在一条直线上。但是有时你会遇到分支（当多个人同时在一个项目上工作时），**合并（Merge）** 会在这些并发创建的提交相融合时创建。

#### 线性一致性强于因果一致性

那么因果顺序和线性一致性之间的关系是什么？答案是线性一致性 **隐含着（implies）** 因果关系：任何线性一致的系统都能正确保持因果性【7】。特别是，如果系统中有多个通信通道（如 [图 9-5](/v1/ddia_0905.png) 中的消息队列和文件存储服务），线性一致性可以自动保证因果性，系统无需任何特殊操作（如在不同组件间传递时间戳）。

线性一致性确保因果性的事实使线性一致系统变得简单易懂，更有吸引力。然而，正如 “[线性一致性的代价](#线性一致性的代价)” 中所讨论的，使系统线性一致可能会损害其性能和可用性，尤其是在系统具有严重的网络延迟的情况下（例如，如果系统在地理上散布）。出于这个原因，一些分布式数据系统已经放弃了线性一致性，从而获得更好的性能，但它们用起来也更为困难。

好消息是存在折衷的可能性。线性一致性并不是保持因果性的唯一途径 —— 还有其他方法。一个系统可以是因果一致的，而无需承担线性一致带来的性能折损（特别是，CAP 定理在此并不适用）。实际上，在所有不会被网络延迟拖慢、且在网络故障时仍能保持可用的一致性模型中，因果一致性是可行的最强的一致性模型【2,42】。

在许多情况下，看上去需要线性一致性的系统，实际上需要的只是因果一致性，因果一致性可以更高效地实现。基于这种观察结果，研究人员正在探索新型的数据库，既能保证因果一致性，且性能与可用性与最终一致的系统类似【49,50,51】。

这方面的研究相当新鲜，其中很多尚未应用到生产系统，仍然有不少挑战需要克服【52,53】。但对于未来的系统而言，这是一个有前景的方向。

#### 捕获因果关系

我们不会在这里讨论非线性一致的系统如何保证因果性的细节，而只是简要地探讨一些关键的思想。

为了维持因果性，你需要知道哪个操作发生在哪个其他操作之前（**happened before**）。这是一个偏序：并发操作可以以任意顺序进行，但如果一个操作发生在另一个操作之前，那它们必须在所有副本上以那个顺序被处理。因此，当一个副本处理一个操作时，它必须确保所有因果前驱的操作（之前发生的所有操作）已经被处理；如果前面的某个操作丢失了，后面的操作必须等待，直到前面的操作被处理完毕。

为了确定因果依赖，我们需要一些方法来描述系统中节点的 “知识”。如果节点在发出写入 Y 的请求时已经看到了 X 的值，则 X 和 Y 可能存在因果关系。这个分析使用了那些在欺诈指控刑事调查中常见的问题：CEO 在做出决定 Y 时是否 **知道** X ？

用于确定 *哪些操作发生在其他操作之前* 的技术，与我们在 “[检测并发写入](/v1/ch5#检测并发写入)” 中所讨论的内容类似。那一节讨论了无领导者数据存储中的因果性：为了防止丢失更新，我们需要检测到对同一个键的并发写入。因果一致性则更进一步：它需要跟踪整个数据库中的因果依赖，而不仅仅是一个键。可以推广版本向量以解决此类问题【54】。

为了确定因果顺序，数据库需要知道应用读取了哪个版本的数据。这就是为什么在 [图 5-13](/v1/ddia_0513.png) 中，来自先前操作的版本号在写入时被传回到数据库的原因。在 SSI 的冲突检测中会出现类似的想法，如 “[可串行化快照隔离](/v1/ch7#可串行化快照隔离)” 中所述：当事务要提交时，数据库将检查它所读取的数据版本是否仍然是最新的。为此，数据库跟踪哪些数据被哪些事务所读取。


### 序列号顺序

虽然因果是一个重要的理论概念，但实际上跟踪所有的因果关系是不切实际的。在许多应用中，客户端在写入内容之前会先读取大量数据，我们无法弄清写入因果依赖于先前全部的读取内容，还是仅包括其中一部分。显式跟踪所有已读数据意味着巨大的额外开销。

但还有一个更好的方法：我们可以使用 **序列号（sequence number）** 或 **时间戳（timestamp）** 来排序事件。时间戳不一定来自日历时钟（或物理时钟，它们存在许多问题，如 “[不可靠的时钟](/v1/ch8#不可靠的时钟)” 中所述）。它可以来自一个 **逻辑时钟（logical clock）**，这是一个用来生成标识操作的数字序列的算法，典型实现是使用一个每次操作自增的计数器。

这样的序列号或时间戳是紧凑的（只有几个字节大小），它提供了一个全序关系：也就是说每个操作都有一个唯一的序列号，而且总是可以比较两个序列号，确定哪一个更大（即哪些操作后发生）。

特别是，我们可以使用 **与因果一致（consistent with causality）** 的全序来生成序列号 [^vii]：我们保证，如果操作 A 因果地发生在操作 B 前，那么在这个全序中 A 在 B 前（ A 具有比 B 更小的序列号）。并行操作之间可以任意排序。这样一个全序关系捕获了所有关于因果的信息，但也施加了一个比因果性要求更为严格的顺序。

[^vii]: 与因果关系不一致的全序很容易创建，但没啥用。例如你可以为每个操作生成随机的 UUID，并按照字典序比较 UUID，以定义操作的全序。这是一个有效的全序，但是随机的 UUID 并不能告诉你哪个操作先发生，或者操作是否为并发的。

在单主复制的数据库中（请参阅 “[领导者与追随者](/v1/ch5#领导者与追随者)”），复制日志定义了与因果一致的写操作。主库可以简单地为每个操作自增一个计数器，从而为复制日志中的每个操作分配一个单调递增的序列号。如果一个从库按照它们在复制日志中出现的顺序来应用写操作，那么从库的状态始终是因果一致的（即使它落后于领导者）。

#### 非因果序列号生成器

如果主库不存在（可能因为使用了多主数据库或无主数据库，或者因为使用了分区的数据库），如何为操作生成序列号就没有那么明显了。在实践中有各种各样的方法：

* 每个节点都可以生成自己独立的一组序列号。例如有两个节点，一个节点只能生成奇数，而另一个节点只能生成偶数。通常，可以在序列号的二进制表示中预留一些位，用于唯一的节点标识符，这样可以确保两个不同的节点永远不会生成相同的序列号。
* 可以将日历时钟（物理时钟）的时间戳附加到每个操作上【55】。这种时间戳并不连续，但是如果它具有足够高的分辨率，那也许足以提供一个操作的全序关系。这一事实应用于 **最后写入胜利** 的冲突解决方法中（请参阅 “[有序事件的时间戳](/v1/ch8#有序事件的时间戳)”）。
* 可以预先分配序列号区块。例如，节点 A 可能要求从序列号 1 到 1,000 区块的所有权，而节点 B 可能要求序列号 1,001 到 2,000 区块的所有权。然后每个节点可以独立分配所属区块中的序列号，并在序列号告急时请求分配一个新的区块。

这三个选项都比单一主库的自增计数器表现要好，并且更具可伸缩性。它们为每个操作生成一个唯一的，近似自增的序列号。然而它们都有同一个问题：生成的序列号与因果不一致。

因为这些序列号生成器不能正确地捕获跨节点的操作顺序，所以会出现因果关系的问题：

* 每个节点每秒可以处理不同数量的操作。因此，如果一个节点产生偶数序列号而另一个产生奇数序列号，则偶数计数器可能落后于奇数计数器，反之亦然。如果你有一个奇数编号的操作和一个偶数编号的操作，你无法准确地说出哪一个操作在因果上先发生。

* 来自物理时钟的时间戳会受到时钟偏移的影响，这可能会使其与因果不一致。例如 [图 8-3](/v1/ddia_0803.png) 展示了一个例子，其中因果上晚发生的操作，却被分配了一个更早的时间戳。[^viii]

  [^viii]: 可以使物理时钟时间戳与因果关系保持一致：在 “[全局快照的同步时钟](/v1/ch8#全局快照的同步时钟)” 中，我们讨论了 Google 的 Spanner，它可以估计预期的时钟偏差，并在提交写入之前等待不确定性间隔。这种方法确保了实际上靠后的事务会有更大的时间戳。但是大多数时钟不能提供这种所需的不确定性度量。

* 在分配区块的情况下，某个操作可能会被赋予一个范围在 1,001 到 2,000 内的序列号，然而一个因果上更晚的操作可能被赋予一个范围在 1 到 1,000 之间的数字。这里序列号与因果关系也是不一致的。


#### 兰伯特时间戳

尽管刚才描述的三个序列号生成器与因果不一致，但实际上有一个简单的方法来产生与因果关系一致的序列号。它被称为兰伯特时间戳，莱斯利・兰伯特（Leslie Lamport）于 1978 年提出【56】，现在是分布式系统领域中被引用最多的论文之一。

[图 9-8](/v1/ddia_0908.png) 说明了兰伯特时间戳的应用。每个节点都有一个唯一标识符，和一个保存自己执行操作数量的计数器。兰伯特时间戳就是两者的简单组合：（计数器，节点 ID）$(counter, node ID)$。两个节点有时可能具有相同的计数器值，但通过在时间戳中包含节点 ID，每个时间戳都是唯一的。

![](/v1/ddia_0908.png)

**图 9-8  Lamport 时间戳提供了与因果关系一致的全序。**


兰伯特时间戳与物理的日历时钟没有任何关系，但是它提供了一个全序：如果你有两个时间戳，则 **计数器** 值大者是更大的时间戳。如果计数器值相同，则节点 ID 越大的，时间戳越大。

迄今，这个描述与上节所述的奇偶计数器基本类似。使兰伯特时间戳因果一致的关键思想如下所示：每个节点和每个客户端跟踪迄今为止所见到的最大 **计数器** 值，并在每个请求中包含这个最大计数器值。当一个节点收到最大计数器值大于自身计数器值的请求或响应时，它立即将自己的计数器设置为这个最大值。

这如 [图 9-8](/v1/ddia_0908.png) 所示，其中客户端 A 从节点 2 接收计数器值 `5` ，然后将最大值 `5` 发送到节点 1 。此时，节点 1 的计数器仅为 `1` ，但是它立即前移至 `5` ，所以下一个操作的计数器的值为 `6` 。

只要每一个操作都携带着最大计数器值，这个方案确保兰伯特时间戳的排序与因果一致，因为每个因果依赖都会导致时间戳增长。

兰伯特时间戳有时会与我们在 “[检测并发写入](/v1/ch5#检测并发写入)” 中看到的版本向量相混淆。虽然两者有一些相似之处，但它们有着不同的目的：版本向量可以区分两个操作是并发的，还是一个因果依赖另一个；而兰伯特时间戳总是施行一个全序。从兰伯特时间戳的全序中，你无法分辨两个操作是并发的还是因果依赖的。兰伯特时间戳优于版本向量的地方是，它更加紧凑。

#### 光有时间戳排序还不够

虽然兰伯特时间戳定义了一个与因果一致的全序，但它还不足以解决分布式系统中的许多常见问题。

例如，考虑一个需要确保用户名能唯一标识用户帐户的系统。如果两个用户同时尝试使用相同的用户名创建帐户，则其中一个应该成功，另一个应该失败（我们之前在 “[领导者和锁](/v1/ch8#领导者和锁)” 中提到过这个问题）。

乍看之下，似乎操作的全序关系足以解决这一问题（例如使用兰伯特时间戳）：如果创建了两个具有相同用户名的帐户，选择时间戳较小的那个作为胜者（第一个抓到用户名的人），并让带有更大时间戳者失败。由于时间戳上有全序关系，所以这个比较总是可行的。

这种方法适用于事后确定胜利者：一旦你收集了系统中的所有用户名创建操作，就可以比较它们的时间戳。然而当某个节点需要实时处理用户创建用户名的请求时，这样的方法就无法满足了。节点需要 **马上（right now）** 决定这个请求是成功还是失败。在那个时刻，节点并不知道是否存在其他节点正在并发执行创建同样用户名的操作，遑论其它节点可能分配给那个操作的时间戳。

为了确保没有其他节点正在使用相同的用户名和较小的时间戳并发创建同名账户，你必须检查其它每个节点，看看它在做什么【56】。如果其中一个节点由于网络问题出现故障或不可达，则整个系统可能被拖至停机。这不是我们需要的那种容错系统。

这里的问题是，只有在所有的操作都被收集之后，操作的全序才会出现。如果另一个节点已经产生了一些操作，但你还不知道那些操作是什么，那就无法构造所有操作最终的全序关系：来自另一个节点的未知操作可能需要被插入到全序中的不同位置。

总之：为了实现诸如用户名上的唯一约束这种东西，仅有操作的全序是不够的，你还需要知道这个全序何时会尘埃落定。如果你有一个创建用户名的操作，并且确定在全序中没有任何其他节点可以在你的操作之前插入对同一用户名的声称，那么你就可以安全地宣告操作执行成功。

如何确定全序关系已经尘埃落定，这将在 [全序广播](#全序广播) 一节中详细说明。

### 全序广播

如果你的程序只运行在单个 CPU 核上，那么定义一个操作全序是很容易的：可以简单认为就是 CPU 执行这些操作的顺序。但是在分布式系统中，让所有节点对同一个全局操作顺序达成一致可能相当棘手。在上一节中，我们讨论了按时间戳或序列号进行排序，但发现它还不如单主复制给力（如果你使用时间戳排序来实现唯一性约束，就不能容忍任何错误，因为你必须要从每个节点都获取到最新的序列号）。

如前所述，单主复制通过选择一个节点作为主库来确定操作的全序，并在主库的单个 CPU 核上对所有操作进行排序。接下来的挑战是，如果吞吐量超出单个主库的处理能力，这种情况下如何扩展系统；以及，如果主库失效（“[处理节点宕机](/v1/ch5#处理节点宕机)”），如何处理故障切换。在分布式系统文献中，这个问题被称为 **全序广播（total order broadcast）** 或 **原子广播（atomic broadcast）**[^ix]【25,57,58】。

[^ix]: “原子广播” 是一个传统的术语，非常混乱，而且与 “原子” 一词的其他用法不一致：它与 ACID 事务中的原子性没有任何关系，只是与原子操作（在多线程编程的意义上 ）或原子寄存器（线性一致存储）有间接的联系。全序组播（total order multicast）是另一个同义词。

> #### 顺序保证的范围
>
> 每个分区各有一个主库的分区数据库，通常只在每个分区内维持顺序，这意味着它们不能提供跨分区的一致性保证（例如，一致性快照，外键引用）。跨所有分区的全序是可能的，但需要额外的协调【59】。

全序广播通常被描述为在节点间交换消息的协议。非正式地讲，它要满足两个安全属性：

* 可靠交付（reliable delivery）

  没有消息丢失：如果消息被传递到一个节点，它将被传递到所有节点。

* 全序交付（totally ordered delivery）

  消息以相同的顺序传递给每个节点。

正确的全序广播算法必须始终保证可靠性和有序性，即使节点或网络出现故障。当然在网络中断的时候，消息是传不出去的，但是算法可以不断重试，以便在网络最终修复时，消息能及时通过并送达（当然它们必须仍然按照正确的顺序传递）。

#### 使用全序广播

像 ZooKeeper 和 etcd 这样的共识服务实际上实现了全序广播。这一事实暗示了全序广播与共识之间有着紧密联系，我们将在本章稍后进行探讨。

全序广播正是数据库复制所需的：如果每个消息都代表一次数据库的写入，且每个副本都按相同的顺序处理相同的写入，那么副本间将相互保持一致（除了临时的复制延迟）。这个原理被称为 **状态机复制（state machine replication）**【60】，我们将在 [第十一章](/v1/ch11) 中重新回到这个概念。

与之类似，可以使用全序广播来实现可串行化的事务：如 “[真的串行执行](/v1/ch7#真的串行执行)” 中所述，如果每个消息都表示一个确定性事务，以存储过程的形式来执行，且每个节点都以相同的顺序处理这些消息，那么数据库的分区和副本就可以相互保持一致【61】。

全序广播的一个重要表现是，顺序在消息送达时被固化：如果后续的消息已经送达，节点就不允许追溯地将（先前）消息插入顺序中的较早位置。这个事实使得全序广播比时间戳排序更强。

考量全序广播的另一种方式是，这是一种创建日志的方式（如在复制日志、事务日志或预写式日志中）：传递消息就像追加写入日志。由于所有节点必须以相同的顺序传递相同的消息，因此所有节点都可以读取日志，并看到相同的消息序列。

全序广播对于实现提供防护令牌的锁服务也很有用（请参阅 “[防护令牌](/v1/ch8#防护令牌)”）。每个获取锁的请求都作为一条消息追加到日志末尾，并且所有的消息都按它们在日志中出现的顺序依次编号。序列号可以当成防护令牌用，因为它是单调递增的。在 ZooKeeper 中，这个序列号被称为 `zxid` 【15】。

#### 使用全序广播实现线性一致的存储

如 [图 9-4](/v1/ddia_0904.png) 所示，在线性一致的系统中，存在操作的全序。这是否意味着线性一致与全序广播一样？不尽然，但两者之间有着密切的联系 [^x]。

[^x]: 从形式上讲，线性一致读写寄存器是一个 “更容易” 的问题。全序广播等价于共识【67】，而共识问题在异步的崩溃 - 停止模型【68】中没有确定性的解决方案，而线性一致的读写寄存器 **可以** 在这种模型中实现【23,24,25】。然而，支持诸如 **比较并设置（CAS, compare-and-set）**，或 **自增并返回（increment-and-get）** 的原子操作使它等价于共识问题【28】。因此，共识问题与线性一致寄存器问题密切相关。

全序广播是异步的：消息被保证以固定的顺序可靠地传送，但是不能保证消息 **何时** 被送达（所以一个接收者可能落后于其他接收者）。相比之下，线性一致性是新鲜性的保证：读取一定能看见最新的写入值。

但如果有了全序广播，你就可以在此基础上构建线性一致的存储。例如，你可以确保用户名能唯一标识用户帐户。

设想对于每一个可能的用户名，你都可以有一个带有 CAS 原子操作的线性一致寄存器。每个寄存器最初的值为空值（表示未使用该用户名）。当用户想要创建一个用户名时，对该用户名的寄存器执行 CAS 操作，在先前寄存器值为空的条件，将其值设置为用户的账号 ID。如果多个用户试图同时获取相同的用户名，则只有一个 CAS 操作会成功，因为其他用户会看到非空的值（由于线性一致性）。

你可以通过将全序广播当成仅追加日志【62,63】的方式来实现这种线性一致的 CAS 操作：

1. 在日志中追加一条消息，试探性地指明你要声明的用户名。
2. 读日志，并等待你刚才追加的消息被读回。[^xi]
3. 检查是否有任何消息声称目标用户名的所有权。如果这些消息中的第一条就是你自己的消息，那么你就成功了：你可以提交声称的用户名（也许是通过向日志追加另一条消息）并向客户端确认。如果所需用户名的第一条消息来自其他用户，则中止操作。

[^xi]: 如果你不等待，而是在消息入队之后立即确认写入，则会得到类似于多核 x86 处理器内存的一致性模型【43】。该模型既不是线性一致的也不是顺序一致的。

由于日志项是以相同顺序送达至所有节点，因此如果有多个并发写入，则所有节点会对最先到达者达成一致。选择冲突写入中的第一个作为胜利者，并中止后来者，以此确定所有节点对某个写入是提交还是中止达成一致。类似的方法可以在一个日志的基础上实现可串行化的多对象事务【62】。

尽管这一过程保证写入是线性一致的，但它并不保证读取也是线性一致的 —— 如果你从与日志异步更新的存储中读取数据，结果可能是陈旧的。（精确地说，这里描述的过程提供了 **顺序一致性（sequential consistency）**【47,64】，有时也称为 **时间线一致性（timeline consistency）**【65,66】，比线性一致性稍微弱一些的保证）。为了使读取也线性一致，有几个选项：

* 你可以通过在日志中追加一条消息，然后读取日志，直到该消息被读回才执行实际的读取操作。消息在日志中的位置因此定义了读取发生的时间点（etcd 的法定人数读取有些类似这种情况【16】）。
* 如果日志允许以线性一致的方式获取最新日志消息的位置，则可以查询该位置，等待该位置前的所有消息都传达到你，然后执行读取。（这是 Zookeeper `sync()` 操作背后的思想【15】）。
* 你可以从同步更新的副本中进行读取，因此可以确保结果是最新的（这种技术用于链式复制（chain replication）【63】；请参阅 “[关于复制的研究](/v1/ch5#关于复制的研究)”）。

#### 使用线性一致性存储实现全序广播

上一节介绍了如何从全序广播构建一个线性一致的 CAS 操作。我们也可以把它反过来，假设我们有线性一致的存储，接下来会展示如何在此基础上构建全序广播。

最简单的方法是假设你有一个线性一致的寄存器来存储一个整数，并且有一个原子 **自增并返回** 操作【28】。或者原子 CAS 操作也可以完成这项工作。

该算法很简单：每个要通过全序广播发送的消息首先对线性一致寄存器执行 **自增并返回** 操作。然后将从寄存器获得的值作为序列号附加到消息中。然后你可以将消息发送到所有节点（重新发送任何丢失的消息），而收件人将按序列号依序传递（deliver）消息。

请注意，与兰伯特时间戳不同，通过自增线性一致性寄存器获得的数字形式上是一个没有间隙的序列。因此，如果一个节点已经传递了消息 4 并且接收到序列号为 6 的传入消息，则它知道它在传递消息 6 之前必须等待消息 5 。兰伯特时间戳则与之不同 —— 事实上，这是全序广播和时间戳排序间的关键区别。

实现一个带有原子性 **自增并返回** 操作的线性一致寄存器有多困难？像往常一样，如果事情从来不出差错，那很容易：你可以简单地把它保存在单个节点内的变量中。问题在于处理当该节点的网络连接中断时的情况，并在该节点失效时能恢复这个值【59】。一般来说，如果你对线性一致性的序列号生成器进行过足够深入的思考，你不可避免地会得出一个共识算法。

这并非巧合：可以证明，线性一致的 CAS（或自增并返回）寄存器与全序广播都等价于 **共识** 问题【28,67】。也就是说，如果你能解决其中的一个问题，你可以把它转化成为其他问题的解决方案。这是相当深刻和令人惊讶的洞察！

现在是时候正面处理共识问题了，我们将在本章的其余部分进行讨论。


## 分布式事务与共识

**共识** 是分布式计算中最重要也是最基本的问题之一。从表面上看似乎很简单：非正式地讲，目标只是 **让几个节点达成一致（get several nodes to agree on something）**。你也许会认为这不会太难。不幸的是，许多出故障的系统都是因为错误地轻信这个问题很容易解决。

尽管共识非常重要，但关于它的内容出现在本书的后半部分，因为这个主题非常微妙，欣赏细微之处需要一些必要的知识。即使在学术界，对共识的理解也是在几十年的过程中逐渐沉淀而来，一路上也有着许多误解。现在我们已经讨论了复制（[第五章](/v1/ch5)），事务（[第七章](/v1/ch7)），系统模型（[第八章](/v1/ch8)），线性一致以及全序广播（本章），我们终于准备好解决共识问题了。

节点能达成一致，在很多场景下都非常重要，例如：

* 领导选举

  在单主复制的数据库中，所有节点需要就哪个节点是领导者达成一致。如果一些节点由于网络故障而无法与其他节点通信，则可能会对领导权的归属引起争议。在这种情况下，共识对于避免错误的故障切换非常重要。错误的故障切换会导致两个节点都认为自己是领导者（**脑裂**，请参阅 “[处理节点宕机](/v1/ch5#处理节点宕机)”）。如果有两个领导者，它们都会接受写入，它们的数据会发生分歧，从而导致不一致和数据丢失。

* 原子提交

  在支持跨多节点或跨多分区事务的数据库中，一个事务可能在某些节点上失败，但在其他节点上成功。如果我们想要维护事务的原子性（就 ACID 而言，请参阅 “[原子性](/v1/ch7#原子性)”），我们必须让所有节点对事务的结果达成一致：要么全部中止 / 回滚（如果出现任何错误），要么它们全部提交（如果没有出错）。这个共识的例子被称为 **原子提交（atomic commit）** 问题 [^xii]。

  [^xii]: 原子提交的形式化与共识稍有不同：原子事务只有在 **所有** 参与者投票提交的情况下才能提交，如果有任何参与者需要中止，则必须中止。共识则允许就 **任意一个** 被参与者提出的候选值达成一致。然而，原子提交和共识可以相互归约【70,71】。**非阻塞** 原子提交则要比共识更为困难 —— 请参阅 “[三阶段提交](#三阶段提交)”。

> ### 共识的不可能性
>
> 你可能已经听说过以作者 Fischer，Lynch 和 Paterson 命名的 FLP 结果【68】，它证明，如果存在节点可能崩溃的风险，则不存在 **总是** 能够达成共识的算法。在分布式系统中，我们必须假设节点可能会崩溃，所以可靠的共识是不可能的。然而这里我们正在讨论达成共识的算法，到底是怎么回事？
>
> 答案是 FLP 结果是在 **异步系统模型** 中被证明的（请参阅 “[系统模型与现实](/v1/ch8#系统模型与现实)”），而这是一种限制性很强的模型，它假定确定性算法不能使用任何时钟或超时。如果允许算法使用 **超时** 或其他方法来识别可疑的崩溃节点（即使怀疑有时是错误的），则共识变为一个可解的问题【67】。即使仅仅允许算法使用随机数，也足以绕过这个不可能的结果【69】。
>
> 因此，虽然 FLP 是关于共识不可能性的重要理论结果，但现实中的分布式系统通常是可以达成共识的。

在本节中，我们将首先更详细地研究 **原子提交** 问题。具体来说，我们将讨论 **两阶段提交（2PC, two-phase commit）** 算法，这是解决原子提交问题最常见的办法，并在各种数据库、消息队列和应用服务器中被实现。事实证明 2PC 是一种共识算法，但不是一个非常好的共识算法【70,71】。

通过对 2PC 的学习，我们将继续努力实现更好的共识算法，比如 ZooKeeper（Zab）和 etcd（Raft）中使用的算法。


### 原子提交与两阶段提交

在 [第七章](/v1/ch7) 中我们了解到，事务原子性的目的是在多次写操作中途出错的情况下，提供一种简单的语义。事务的结果要么是成功提交，在这种情况下，事务的所有写入都是持久化的；要么是中止，在这种情况下，事务的所有写入都被回滚（即撤消或丢弃）。

原子性可以防止失败的事务搅乱数据库，避免数据库陷入半成品结果和半更新状态。这对于多对象事务（请参阅 “[单对象和多对象操作](/v1/ch7#单对象和多对象操作)”）和维护次级索引的数据库尤其重要。每个次级索引都是与主数据相分离的数据结构 —— 因此，如果你修改了一些数据，则还需要在次级索引中进行相应的更改。原子性确保次级索引与主数据保持一致（如果索引与主数据不一致，就没什么用了）。

#### 从单节点到分布式原子提交

对于在单个数据库节点执行的事务，原子性通常由存储引擎实现。当客户端请求数据库节点提交事务时，数据库将使事务的写入持久化（通常在预写式日志中，请参阅 “[让 B 树更可靠](/v1/ch3#让B树更可靠)”），然后将提交记录追加到磁盘中的日志里。如果数据库在这个过程中间崩溃，当节点重启时，事务会从日志中恢复：如果提交记录在崩溃之前成功地写入磁盘，则认为事务被提交；否则来自该事务的任何写入都被回滚。

因此，在单个节点上，事务的提交主要取决于数据持久化落盘的 **顺序**：首先是数据，然后是提交记录【72】。事务提交或终止的关键决定时刻是磁盘完成写入提交记录的时刻：在此之前，仍有可能中止（由于崩溃），但在此之后，事务已经提交（即使数据库崩溃）。因此，是单一的设备（连接到单个磁盘的控制器，且挂载在单台机器上）使得提交具有原子性。

但是，如果一个事务中涉及多个节点呢？例如，你也许在分区数据库中会有一个多对象事务，或者是一个按关键词分区的次级索引（其中索引条目可能位于与主数据不同的节点上；请参阅 “[分区与次级索引](/v1/ch6#分区与次级索引)”）。大多数 “NoSQL” 分布式数据存储不支持这种分布式事务，但是很多关系型数据库集群支持（请参阅 “[实践中的分布式事务](#实践中的分布式事务)”）。

在这些情况下，仅向所有节点发送提交请求并独立提交每个节点的事务是不够的。这样很容易发生违反原子性的情况：提交在某些节点上成功，而在其他节点上失败：

* 某些节点可能会检测到违反约束或冲突，因此需要中止，而其他节点则可以成功进行提交。
* 某些提交请求可能在网络中丢失，最终由于超时而中止，而其他提交请求则通过。
* 在提交记录完全写入之前，某些节点可能会崩溃，并在恢复时回滚，而其他节点则成功提交。

如果某些节点提交了事务，但其他节点却放弃了这些事务，那么这些节点就会彼此不一致（如 [图 7-3](/v1/ddia_0703.png) 所示）。而且一旦在某个节点上提交了一个事务，如果事后发现它在其它节点上被中止了，它是无法撤回的。出于这个原因，节点只有在确定事务中的所有其他节点也将提交之后，才能进行提交。

事务提交必须是不可撤销的 —— 事务提交之后，你不能改变主意，并追溯性地中止事务。这个规则的原因是，一旦数据被提交，其结果就对其他事务可见，因此其他客户端可能会开始依赖这些数据。这个原则构成了 **读已提交** 隔离等级的基础，在 “[读已提交](/v1/ch7#读已提交)” 一节中讨论了这个问题。如果一个事务在提交后被允许中止，所有那些读取了 **已提交却又被追溯声明不存在数据** 的事务也必须回滚。

（提交事务的结果有可能通过事后执行另一个补偿事务（compensating transaction）来取消【73,74】，但从数据库的角度来看，这是一个单独的事务，因此任何关于跨事务正确性的保证都是应用自己的问题。）

#### 两阶段提交简介

**两阶段提交（two-phase commit）** 是一种用于实现跨多个节点的原子事务提交的算法，即确保所有节点提交或所有节点中止。它是分布式数据库中的经典算法【13,35,75】。2PC 在某些数据库内部使用，也以 **XA 事务** 的形式对应用可用【76,77】（例如 Java Transaction API 支持）或以 SOAP Web 服务的 `WS-AtomicTransaction` 形式提供给应用【78,79】。

[图 9-9](/v1/ddia_0909.png) 说明了 2PC 的基本流程。2PC 中的提交 / 中止过程分为两个阶段（因此而得名），而不是单节点事务中的单个提交请求。

![](/v1/ddia_0909.png)

**图 9-9 两阶段提交（2PC）的成功执行**

> #### 不要把2PC和2PL搞混了
>
> 两阶段提交（2PC）和两阶段锁定（请参阅 “[两阶段锁定](/v1/ch7#两阶段锁定)”）是两个完全不同的东西。2PC 在分布式数据库中提供原子提交，而 2PL 提供可串行化的隔离等级。为了避免混淆，最好把它们看作完全独立的概念，并忽略名称中不幸的相似性。

2PC 使用一个通常不会出现在单节点事务中的新组件：**协调者**（coordinator，也称为 **事务管理器**，即 transaction manager）。协调者通常在请求事务的相同应用进程中以库的形式实现（例如，嵌入在 Java EE 容器中），但也可以是单独的进程或服务。这种协调者的例子包括 Narayana、JOTM、BTM 或 MSDTC。

正常情况下，2PC 事务以应用在多个数据库节点上读写数据开始。我们称这些数据库节点为 **参与者（participants）**。当应用准备提交时，协调者开始阶段 1 ：它发送一个 **准备（prepare）** 请求到每个节点，询问它们是否能够提交。然后协调者会跟踪参与者的响应：

* 如果所有参与者都回答 “是”，表示它们已经准备好提交，那么协调者在阶段 2 发出 **提交（commit）** 请求，然后提交真正发生。
* 如果任意一个参与者回复了 “否”，则协调者在阶段 2 中向所有节点发送 **中止（abort）** 请求。

这个过程有点像西方传统婚姻仪式：司仪分别询问新娘和新郎是否要结婚，通常是从两方都收到 “我愿意” 的答复。收到两者的回复后，司仪宣布这对情侣成为夫妻：事务就提交了，这一幸福事实会广播至所有的参与者中。如果新娘与新郎之一没有回复 “我愿意”，婚礼就会中止【73】。

#### 系统承诺

这个简短的描述可能并没有说清楚为什么两阶段提交保证了原子性，而跨多个节点的一阶段提交却没有。在两阶段提交的情况下，准备请求和提交请求当然也可以轻易丢失。2PC 又有什么不同呢？

为了理解它的工作原理，我们必须更详细地分解这个过程：

1. 当应用想要启动一个分布式事务时，它向协调者请求一个事务 ID。此事务 ID 是全局唯一的。
2. 应用在每个参与者上启动单节点事务，并在单节点事务上捎带上这个全局事务 ID。所有的读写都是在这些单节点事务中各自完成的。如果在这个阶段出现任何问题（例如，节点崩溃或请求超时），则协调者或任何参与者都可以中止。
3. 当应用准备提交时，协调者向所有参与者发送一个 **准备** 请求，并打上全局事务 ID 的标记。如果任意一个请求失败或超时，则协调者向所有参与者发送针对该事务 ID 的中止请求。
4. 参与者收到准备请求时，需要确保在任意情况下都的确可以提交事务。这包括将所有事务数据写入磁盘（出现崩溃、电源故障或硬盘空间不足都不能是稍后拒绝提交的理由）以及检查是否存在任何冲突或违反约束。通过向协调者回答 “是”，节点承诺，只要请求，这个事务一定可以不出差错地提交。换句话说，参与者放弃了中止事务的权利，但没有实际提交。
5. 当协调者收到所有准备请求的答复时，会就提交或中止事务作出明确的决定（只有在所有参与者投赞成票的情况下才会提交）。协调者必须把这个决定写到磁盘上的事务日志中，如果它随后就崩溃，恢复后也能知道自己所做的决定。这被称为 **提交点（commit point）**。
6. 一旦协调者的决定落盘，提交或中止请求会发送给所有参与者。如果这个请求失败或超时，协调者必须永远保持重试，直到成功为止。没有回头路：如果已经做出决定，不管需要多少次重试它都必须被执行。如果参与者在此期间崩溃，事务将在其恢复后提交 —— 由于参与者投了赞成，因此恢复后它不能拒绝提交。

因此，该协议包含两个关键的 “不归路” 点：当参与者投票 “是” 时，它承诺它稍后肯定能够提交（尽管协调者可能仍然选择放弃）；以及一旦协调者做出决定，这一决定是不可撤销的。这些承诺保证了 2PC 的原子性（单节点原子提交将这两个事件合为了一体：将提交记录写入事务日志）。

回到婚姻的比喻，在说 “我愿意” 之前，你和你的新娘 / 新郎有中止这个事务的自由，只要回复 “没门！” 就行（或者有类似效果的话）。然而在说了 “我愿意” 之后，你就不能撤回那个声明了。如果你说 “我愿意” 后晕倒了，没有听到司仪说 “你们现在是夫妻了”，那也并不会改变事务已经提交的现实。当你稍后恢复意识时，可以通过查询司仪的全局事务 ID 状态来确定你是否已经成婚，或者你可以等待司仪重试下一次提交请求（因为重试将在你无意识期间一直持续）。

#### 协调者失效

我们已经讨论了在 2PC 期间，如果参与者之一或网络发生故障时会发生什么情况：如果任何一个 **准备** 请求失败或者超时，协调者就会中止事务。如果任何提交或中止请求失败，协调者将无条件重试。但是如果协调者崩溃，会发生什么情况就不太清楚了。

如果协调者在发送 **准备** 请求之前失败，参与者可以安全地中止事务。但是，一旦参与者收到了准备请求并投了 “是”，就不能再单方面放弃 —— 必须等待协调者回答事务是否已经提交或中止。如果此时协调者崩溃或网络出现故障，参与者什么也做不了只能等待。参与者的这种事务状态称为 **存疑（in doubt）** 的或 **不确定（uncertain）** 的。

情况如 [图 9-10](/v1/ddia_0910.png) 所示。在这个特定的例子中，协调者实际上决定提交，数据库 2 收到提交请求。但是，协调者在将提交请求发送到数据库 1 之前发生崩溃，因此数据库 1 不知道是否提交或中止。即使 **超时** 在这里也没有帮助：如果数据库 1 在超时后单方面中止，它将最终与执行提交的数据库 2 不一致。同样，单方面提交也是不安全的，因为另一个参与者可能已经中止了。

![](/v1/ddia_0910.png)

**图 9-10 参与者投赞成票后，协调者崩溃。数据库 1 不知道是否提交或中止**

没有协调者的消息，参与者无法知道是提交还是放弃。原则上参与者可以相互沟通，找出每个参与者是如何投票的，并达成一致，但这不是 2PC 协议的一部分。

可以完成 2PC 的唯一方法是等待协调者恢复。这就是为什么协调者必须在向参与者发送提交或中止请求之前，将其提交或中止决定写入磁盘上的事务日志：协调者恢复后，通过读取其事务日志来确定所有存疑事务的状态。任何在协调者日志中没有提交记录的事务都会中止。因此，2PC 的 **提交点** 归结为协调者上的常规单节点原子提交。

#### 三阶段提交

两阶段提交被称为 **阻塞（blocking）** 原子提交协议，因为存在 2PC 可能卡住并等待协调者恢复的情况。理论上，可以使一个原子提交协议变为 **非阻塞（nonblocking）** 的，以便在节点失败时不会卡住。但是让这个协议能在实践中工作并没有那么简单。

作为 2PC 的替代方案，已经提出了一种称为 **三阶段提交（3PC）** 的算法【13,80】。然而，3PC 假定网络延迟有界，节点响应时间有限；在大多数具有无限网络延迟和进程暂停的实际系统中（见 [第八章](/v1/ch8)），它并不能保证原子性。

通常，非阻塞原子提交需要一个 **完美的故障检测器（perfect failure detector）**【67,71】—— 即一个可靠的机制来判断一个节点是否已经崩溃。在具有无限延迟的网络中，超时并不是一种可靠的故障检测机制，因为即使没有节点崩溃，请求也可能由于网络问题而超时。出于这个原因，2PC 仍然被使用，尽管大家都清楚可能存在协调者故障的问题。


### 实践中的分布式事务

分布式事务的名声毁誉参半，尤其是那些通过两阶段提交实现的。一方面，它被视作提供了一个难以实现的重要的安全性保证；另一方面，它们因为导致运维问题，造成性能下降，做出超过能力范围的承诺而饱受批评【81,82,83,84】。许多云服务由于其导致的运维问题，而选择不实现分布式事务【85,86】。

分布式事务的某些实现会带来严重的性能损失 —— 例如据报告称，MySQL 中的分布式事务比单节点事务慢 10 倍以上【87】，所以当人们建议不要使用它们时就不足为奇了。两阶段提交所固有的性能成本，大部分是由于崩溃恢复所需的额外强制刷盘（`fsync`）【88】以及额外的网络往返。

但我们不应该直接忽视分布式事务，而应当更加仔细地审视这些事务，因为从中可以汲取重要的经验教训。首先，我们应该精确地说明 “**分布式事务**” 的含义。两种截然不同的分布式事务类型经常被混淆：

* 数据库内部的分布式事务

  一些分布式数据库（即在其标准配置中使用复制和分区的数据库）支持数据库节点之间的内部事务。例如，VoltDB 和 MySQL Cluster 的 NDB 存储引擎就有这样的内部事务支持。在这种情况下，所有参与事务的节点都运行相同的数据库软件。

* 异构分布式事务

  在 **异构（heterogeneous）** 事务中，参与者是由两种或两种以上的不同技术组成的：例如来自不同供应商的两个数据库，甚至是非数据库系统（如消息代理）。跨系统的分布式事务必须确保原子提交，尽管系统可能完全不同。

数据库内部事务不必与任何其他系统兼容，因此它们可以使用任何协议，并能针对特定技术进行特定的优化。因此数据库内部的分布式事务通常工作得很好。另一方面，跨异构技术的事务则更有挑战性。

#### 恰好一次的消息处理

异构的分布式事务处理能够以强大的方式集成不同的系统。例如：消息队列中的一条消息可以被确认为已处理，当且仅当用于处理消息的数据库事务成功提交。这是通过在同一个事务中原子提交 **消息确认** 和 **数据库写入** 两个操作来实现的。藉由分布式事务的支持，即使消息代理和数据库是在不同机器上运行的两种不相关的技术，这种操作也是可能的。

如果消息传递或数据库事务任意一者失败，两者都会中止，因此消息代理可能会在稍后安全地重传消息。因此，通过原子提交 **消息处理及其副作用**，即使在成功之前需要几次重试，也可以确保消息被 **有效地（effectively）** 恰好处理一次。中止会抛弃部分完成事务所导致的任何副作用。

然而，只有当所有受事务影响的系统都使用同样的 **原子提交协议（atomic commit protocol）** 时，这样的分布式事务才是可能的。例如，假设处理消息的副作用是发送一封邮件，而邮件服务器并不支持两阶段提交：如果消息处理失败并重试，则可能会发送两次或更多次的邮件。但如果处理消息的所有副作用都可以在事务中止时回滚，那么这样的处理流程就可以安全地重试，就好像什么都没有发生过一样。

在 [第十一章](/v1/ch11) 中将再次回到 “恰好一次” 消息处理的主题。让我们先来看看允许这种异构分布式事务的原子提交协议。

#### XA事务

*X/Open XA*（**扩展架构（eXtended Architecture）** 的缩写）是跨异构技术实现两阶段提交的标准【76,77】。它于 1991 年推出并得到了广泛的实现：许多传统关系数据库（包括 PostgreSQL、MySQL、DB2、SQL Server 和 Oracle）和消息代理（包括 ActiveMQ、HornetQ、MSMQ 和 IBM MQ） 都支持 XA。

XA 不是一个网络协议 —— 它只是一个用来与事务协调者连接的 C API。其他语言也有这种 API 的绑定；例如在 Java EE 应用的世界中，XA 事务是使用 **Java 事务 API（JTA, Java Transaction API）** 实现的，而许多使用 **Java 数据库连接（JDBC, Java Database Connectivity）** 的数据库驱动，以及许多使用 **Java 消息服务（JMS）** API 的消息代理都支持 **Java 事务 API（JTA）**。

XA 假定你的应用使用网络驱动或客户端库来与 **参与者**（数据库或消息服务）进行通信。如果驱动支持 XA，则意味着它会调用 XA API 以查明操作是否为分布式事务的一部分 —— 如果是，则将必要的信息发往数据库服务器。驱动还会向协调者暴露回调接口，协调者可以通过回调来要求参与者准备、提交或中止。

事务协调者需要实现 XA API。标准没有指明应该如何实现，但实际上协调者通常只是一个库，被加载到发起事务的应用的同一个进程中（而不是单独的服务）。它在事务中跟踪所有的参与者，并在要求它们 **准备** 之后收集参与者的响应（通过驱动回调），并使用本地磁盘上的日志记录每次事务的决定（提交 / 中止）。

如果应用进程崩溃，或者运行应用的机器报销了，协调者也随之往生极乐。然后任何带有 **准备了** 但未提交事务的参与者都会在疑虑中卡死。由于协调程序的日志位于应用服务器的本地磁盘上，因此必须重启该服务器，且协调程序库必须读取日志以恢复每个事务的提交 / 中止结果。只有这样，协调者才能使用数据库驱动的 XA 回调来要求参与者提交或中止。数据库服务器不能直接联系协调者，因为所有通信都必须通过客户端库。

#### 怀疑时持有锁

为什么我们这么关心存疑事务？系统的其他部分就不能继续正常工作，无视那些终将被清理的存疑事务吗？

问题在于 **锁（locking）**。正如在 “[读已提交](/v1/ch7#读已提交)” 中所讨论的那样，数据库事务通常获取待修改的行上的 **行级排他锁**，以防止脏写。此外，如果要使用可串行化的隔离等级，则使用两阶段锁定的数据库也必须为事务所读取的行加上共享锁（请参阅 “[两阶段锁定](/v1/ch7#两阶段锁定)”）。

在事务提交或中止之前，数据库不能释放这些锁（如 [图 9-9](/v1/ddia_0909.png) 中的阴影区域所示）。因此，在使用两阶段提交时，事务必须在整个存疑期间持有这些锁。如果协调者已经崩溃，需要 20 分钟才能重启，那么这些锁将会被持有 20 分钟。如果协调者的日志由于某种原因彻底丢失，这些锁将被永久持有 —— 或至少在管理员手动解决该情况之前。

当这些锁被持有时，其他事务不能修改这些行。根据数据库的不同，其他事务甚至可能因为读取这些行而被阻塞。因此，其他事务没法儿简单地继续它们的业务了 —— 如果它们要访问同样的数据，就会被阻塞。这可能会导致应用大面积进入不可用状态，直到存疑事务被解决。

#### 从协调者故障中恢复

理论上，如果协调者崩溃并重新启动，它应该干净地从日志中恢复其状态，并解决任何存疑事务。然而在实践中，**孤立（orphaned）** 的存疑事务确实会出现【89,90】，即无论出于何种理由，协调者无法确定事务的结果（例如事务日志已经由于软件错误丢失或损坏）。这些事务无法自动解决，所以它们永远待在数据库中，持有锁并阻塞其他事务。

即使重启数据库服务器也无法解决这个问题，因为在 2PC 的正确实现中，即使重启也必须保留存疑事务的锁（否则就会冒违反原子性保证的风险）。这是一种棘手的情况。

唯一的出路是让管理员手动决定提交还是回滚事务。管理员必须检查每个存疑事务的参与者，确定是否有任何参与者已经提交或中止，然后将相同的结果应用于其他参与者。解决这个问题潜在地需要大量的人力，并且可能发生在严重的生产中断期间（不然为什么协调者处于这种糟糕的状态），并很可能要在巨大精神压力和时间压力下完成。

许多 XA 的实现都有一个叫做 **启发式决策（heuristic decisions）** 的紧急逃生舱口：允许参与者单方面决定放弃或提交一个存疑事务，而无需协调者做出最终决定【76,77,91】。要清楚的是，这里 **启发式** 是 **可能破坏原子性（probably breaking atomicity）** 的委婉说法，因为它违背了两阶段提交的系统承诺。因此，启发式决策只是为了逃出灾难性的情况而准备的，而不是为了日常使用的。

#### 分布式事务的限制

XA 事务解决了保持多个参与者（数据系统）相互一致的现实的和重要的问题，但正如我们所看到的那样，它也引入了严重的运维问题。特别来讲，这里的核心认识是：事务协调者本身就是一种数据库（存储了事务的结果），因此需要像其他重要数据库一样小心地打交道：

* 如果协调者没有复制，而是只在单台机器上运行，那么它是整个系统的失效单点（因为它的失效会导致其他应用服务器阻塞在存疑事务持有的锁上）。令人惊讶的是，许多协调者实现默认情况下并不是高可用的，或者只有基本的复制支持。
* 许多服务器端应用都是使用无状态模式开发的（受 HTTP 的青睐），所有持久状态都存储在数据库中，因此具有应用服务器可随意按需添加删除的优点。但是，当协调者成为应用服务器的一部分时，它会改变部署的性质。突然间，协调者的日志成为持久系统状态的关键部分 —— 与数据库本身一样重要，因为协调者日志是为了在崩溃后恢复存疑事务所必需的。这样的应用服务器不再是无状态的了。
* 由于 XA 需要兼容各种数据系统，因此它必须是所有系统的最小公分母。例如，它不能检测不同系统间的死锁（因为这将需要一个标准协议来让系统交换每个事务正在等待的锁的信息），而且它无法与 SSI（请参阅 [可串行化快照隔离](/v1/ch7#可串行化快照隔离)）协同工作，因为这需要一个跨系统定位冲突的协议。
* 对于数据库内部的分布式事务（不是 XA），限制没有这么大 —— 例如，分布式版本的 SSI 是可能的。然而仍然存在问题：2PC 成功提交一个事务需要所有参与者的响应。因此，如果系统的 **任何** 部分损坏，事务也会失败。因此，分布式事务又有 **扩大失效（amplifying failures）** 的趋势，这又与我们构建容错系统的目标背道而驰。

这些事实是否意味着我们应该放弃保持几个系统相互一致的所有希望？不完全是 —— 还有其他的办法，可以让我们在没有异构分布式事务的痛苦的情况下实现同样的事情。我们将在 [第十一章](/v1/ch11) 和 [第十二章](/v1/ch12) 回到这些话题。但首先，我们应该概括一下关于 **共识** 的话题。


### 容错共识

非正式地，共识意味着让几个节点就某事达成一致。例如，如果有几个人 **同时（concurrently）** 尝试预订飞机上的最后一个座位，或剧院中的同一个座位，或者尝试使用相同的用户名注册一个帐户，共识算法可以用来确定这些 **互不相容（mutually incompatible）** 的操作中，哪一个才是赢家。

共识问题通常形式化如下：一个或多个节点可以 **提议（propose）** 某些值，而共识算法 **决定（decides）** 采用其中的某个值。在座位预订的例子中，当几个顾客同时试图订购最后一个座位时，处理顾客请求的每个节点可以 **提议** 将要服务的顾客的 ID，而 **决定** 指明了哪个顾客获得了座位。

在这种形式下，共识算法必须满足以下性质【25】：[^xiii]

[^xiii]: 这种共识的特殊形式被称为 **统一共识（uniform consensus）**，相当于在具有不可靠故障检测器的异步系统中的 **常规共识（regular consensus）**【71】。学术文献通常指的是 **进程（process）** 而不是节点，但我们在这里使用 **节点（node）** 来与本书的其余部分保持一致。

一致同意（Uniform agreement）
: 没有两个节点的决定不同。

完整性（Integrity）
: 没有节点决定两次。

有效性（Validity）
: 如果一个节点决定了值 `v` ，则 `v` 由某个节点所提议。

终止（Termination）
: 每个未崩溃的节点最终都会决定某个值。

**一致同意** 和 **完整性** 属性定义了共识的核心思想：所有人都决定了相同的结果，一旦决定了，你就不能改变主意。**有效性** 属性主要是为了排除平凡的解决方案：例如，无论提议了什么值，你都可以有一个始终决定值为 `null` 的算法，该算法满足 **一致同意** 和 **完整性** 属性，但不满足 **有效性** 属性。

如果你不关心容错，那么满足前三个属性很容易：你可以将一个节点硬编码为 “独裁者”，并让该节点做出所有的决定。但如果该节点失效，那么系统就无法再做出任何决定。事实上，这就是我们在两阶段提交的情况中所看到的：如果协调者失效，那么存疑的参与者就无法决定提交还是中止。

**终止** 属性形式化了容错的思想。它实质上说的是，一个共识算法不能简单地永远闲坐着等死 —— 换句话说，它必须取得进展。即使部分节点出现故障，其他节点也必须达成一项决定（**终止** 是一种 **活性属性**，而另外三种是 **安全属性** —— 请参阅 “[安全性和活性](/v1/ch8#安全性和活性)”）。

共识的系统模型假设，当一个节点 “崩溃” 时，它会突然消失而且永远不会回来。（不像软件崩溃，想象一下地震，包含你的节点的数据中心被山体滑坡所摧毁，你必须假设节点被埋在 30 英尺以下的泥土中，并且永远不会重新上线）在这个系统模型中，任何需要等待节点恢复的算法都不能满足 **终止** 属性。特别是，2PC 不符合终止属性的要求。

当然如果 **所有** 的节点都崩溃了，没有一个在运行，那么所有算法都不可能决定任何事情。算法可以容忍的失效数量是有限的：事实上可以证明，任何共识算法都需要至少占总体 **多数（majority）** 的节点正确工作，以确保终止属性【67】。多数可以安全地组成法定人数（请参阅 “[读写的法定人数](/v1/ch5#读写的法定人数)”）。

因此 **终止** 属性取决于一个假设，**少于一半的节点崩溃或不可达**。然而即使多数节点出现故障或存在严重的网络问题，绝大多数共识的实现都能始终确保安全属性得到满足 —— 一致同意，完整性和有效性【92】。因此，大规模的中断可能会阻止系统处理请求，但是它不能通过使系统做出无效的决定来破坏共识系统。

大多数共识算法假设不存在 **拜占庭式错误**，正如在 “[拜占庭故障](/v1/ch8#拜占庭故障)” 一节中所讨论的那样。也就是说，如果一个节点没有正确地遵循协议（例如，如果它向不同节点发送矛盾的消息），它就可能会破坏协议的安全属性。克服拜占庭故障，稳健地达成共识是可能的，只要少于三分之一的节点存在拜占庭故障【25,93】。但我们没有地方在本书中详细讨论这些算法了。

#### 共识算法和全序广播

最著名的容错共识算法是 **视图戳复制（VSR, Viewstamped Replication）**【94,95】，Paxos 【96,97,98,99】，Raft 【22,100,101】以及 Zab 【15,21,102】 。这些算法之间有不少相似之处，但它们并不相同【103】。在本书中我们不会介绍各种算法的详细细节：了解一些它们共通的高级思想通常已经足够了，除非你准备自己实现一个共识系统。（可能并不明智，相当难【98,104】）

大多数这些算法实际上并不直接使用这里描述的形式化模型（提议与决定单个值，并满足一致同意、完整性、有效性和终止属性）。取而代之的是，它们决定了值的 **顺序（sequence）**，这使它们成为全序广播算法，正如本章前面所讨论的那样（请参阅 “[全序广播](#全序广播)”）。

请记住，全序广播要求将消息按照相同的顺序，恰好传递一次，准确传送到所有节点。如果仔细思考，这相当于进行了几轮共识：在每一轮中，节点提议下一条要发送的消息，然后决定在全序中下一条要发送的消息【67】。

所以，全序广播相当于重复进行多轮共识（每次共识决定与一次消息传递相对应）：

* 由于 **一致同意** 属性，所有节点决定以相同的顺序传递相同的消息。
* 由于 **完整性** 属性，消息不会重复。
* 由于 **有效性** 属性，消息不会被损坏，也不能凭空编造。
* 由于 **终止** 属性，消息不会丢失。

视图戳复制，Raft 和 Zab 直接实现了全序广播，因为这样做比重复 **一次一值（one value at a time）** 的共识更高效。在 Paxos 的情况下，这种优化被称为 Multi-Paxos。

#### 单主复制与共识

在 [第五章](/v1/ch5) 中，我们讨论了单主复制（请参阅 “[领导者与追随者](/v1/ch5#领导者与追随者)”），它将所有的写入操作都交给主库，并以相同的顺序将它们应用到从库，从而使副本保持在最新状态。这实际上不就是一个全序广播吗？为什么我们在 [第五章](/v1/ch5) 里一点都没担心过共识问题呢？

答案取决于如何选择领导者。如果主库是由运维人员手动选择和配置的，那么你实际上拥有一种 **独裁类型** 的 “共识算法”：只有一个节点被允许接受写入（即决定写入复制日志的顺序），如果该节点发生故障，则系统将无法写入，直到运维手动配置其他节点作为主库。这样的系统在实践中可以表现良好，但它无法满足共识的 **终止** 属性，因为它需要人为干预才能取得 **进展**。

一些数据库会自动执行领导者选举和故障切换，如果旧主库失效，会提拔一个从库为新主库（请参阅 “[处理节点宕机](/v1/ch5#处理节点宕机)”）。这使我们向容错的全序广播更进一步，从而达成共识。

但是还有一个问题。我们之前曾经讨论过脑裂的问题，并且说过所有的节点都需要同意是谁领导，否则两个不同的节点都会认为自己是领导者，从而导致数据库进入不一致的状态。因此，选出一位领导者需要共识。但如果这里描述的共识算法实际上是全序广播算法，并且全序广播就像单主复制，而单主复制需要一个领导者，那么...

这样看来，要选出一个领导者，我们首先需要一个领导者。要解决共识问题，我们首先需要解决共识问题。我们如何跳出这个先有鸡还是先有蛋的问题？

#### 纪元编号和法定人数

迄今为止所讨论的所有共识协议，在内部都以某种形式使用一个领导者，但它们并不能保证领导者是独一无二的。相反，它们可以做出更弱的保证：协议定义了一个 **纪元编号**（epoch number，在 Paxos 中被称为 **投票编号**，即 ballot number，在视图戳复制中被称为 **视图编号**，即 view number，以及在 Raft 中被称为 **任期号码**，即 term number），并确保在每个纪元中，领导者都是唯一的。

每次当现任领导被认为挂掉的时候，节点间就会开始一场投票，以选出一个新领导。这次选举被赋予一个递增的纪元编号，因此纪元编号是全序且单调递增的。如果两个不同纪元的领导者之间出现冲突（也许是因为前任领导者实际上并未死亡），那么带有更高纪元编号的领导说了算。

在任何领导者被允许决定任何事情之前，必须先检查是否存在其他带有更高纪元编号的领导者，它们可能会做出相互冲突的决定。领导者如何知道自己没有被另一个节点赶下台？回想一下在 “[真相由多数所定义](/v1/ch8#真相由多数所定义)” 中提到的：一个节点不一定能相信自己的判断 —— 因为只有节点自己认为自己是领导者，并不一定意味着其他节点接受它作为它们的领导者。

相反，它必须从 **法定人数（quorum）** 的节点中获取选票（请参阅 “[读写的法定人数](/v1/ch5#读写的法定人数)”）。对领导者想要做出的每一个决定，都必须将提议值发送给其他节点，并等待法定人数的节点响应并赞成提案。法定人数通常（但不总是）由多数节点组成【105】。只有在没有意识到任何带有更高纪元编号的领导者的情况下，一个节点才会投票赞成提议。

因此，我们有两轮投票：第一次是为了选出一位领导者，第二次是对领导者的提议进行表决。关键的洞察在于，这两次投票的 **法定人数** 必须相互 **重叠（overlap）**：如果一个提案的表决通过，则至少得有一个参与投票的节点也必须参加过最近的领导者选举【105】。因此，如果在一个提案的表决过程中没有出现更高的纪元编号，那么现任领导者就可以得出这样的结论：没有发生过更高纪元的领导选举，因此可以确定自己仍然在领导。然后它就可以安全地对提议值做出决定。

这一投票过程表面上看起来很像两阶段提交。最大的区别在于，2PC 中协调者不是由选举产生的，而且 2PC 则要求 **所有** 参与者都投赞成票，而容错共识算法只需要多数节点的投票。而且，共识算法还定义了一个恢复过程，节点可以在选举出新的领导者之后进入一个一致的状态，确保始终能满足安全属性。这些区别正是共识算法正确性和容错性的关键。

#### 共识的局限性

共识算法对于分布式系统来说是一个巨大的突破：它为其他充满不确定性的系统带来了基础的安全属性（一致同意，完整性和有效性），然而它们还能保持容错（只要多数节点正常工作且可达，就能取得进展）。它们提供了全序广播，因此它们也可以以一种容错的方式实现线性一致的原子操作（请参阅 “[使用全序广播实现线性一致的存储](#使用全序广播实现线性一致的存储)”）。

尽管如此，它们并不是在所有地方都用上了，因为好处总是有代价的。

节点在做出决定之前对提议进行投票的过程是一种同步复制。如 “[同步复制与异步复制](/v1/ch5#同步复制与异步复制)” 中所述，通常数据库会配置为异步复制模式。在这种配置中发生故障切换时，一些已经提交的数据可能会丢失 —— 但是为了获得更好的性能，许多人选择接受这种风险。

共识系统总是需要严格多数来运转。这意味着你至少需要三个节点才能容忍单节点故障（其余两个构成多数），或者至少有五个节点来容忍两个节点发生故障（其余三个构成多数）。如果网络故障切断了某些节点同其他节点的连接，则只有多数节点所在的网络可以继续工作，其余部分将被阻塞（请参阅 “[线性一致性的代价](#线性一致性的代价)”）。

大多数共识算法假定参与投票的节点是固定的集合，这意味着你不能简单地在集群中添加或删除节点。共识算法的 **动态成员扩展（dynamic membership extension）** 允许集群中的节点集随时间推移而变化，但是它们比静态成员算法要难理解得多。

共识系统通常依靠超时来检测失效的节点。在网络延迟高度变化的环境中，特别是在地理上散布的系统中，经常发生一个节点由于暂时的网络问题，错误地认为领导者已经失效。虽然这种错误不会损害安全属性，但频繁的领导者选举会导致糟糕的性能表现，因系统最后可能花在权力倾轧上的时间要比花在建设性工作的多得多。

有时共识算法对网络问题特别敏感。例如 Raft 已被证明存在让人不悦的极端情况【106】：如果整个网络工作正常，但只有一条特定的网络连接一直不可靠，Raft 可能会进入领导者在两个节点间频繁切换的局面，或者当前领导者不断被迫辞职以致系统实质上毫无进展。其他共识算法也存在类似的问题，而设计能健壮应对不可靠网络的算法仍然是一个开放的研究问题。

### 成员与协调服务

像 ZooKeeper 或 etcd 这样的项目通常被描述为 “分布式键值存储” 或 “协调与配置服务”。这种服务的 API 看起来非常像数据库：你可以读写给定键的值，并遍历键。所以如果它们基本上算是数据库的话，为什么它们要把工夫全花在实现一个共识算法上呢？是什么使它们区别于其他任意类型的数据库？

为了理解这一点，简单了解如何使用 ZooKeeper 这类服务是很有帮助的。作为应用开发人员，你很少需要直接使用 ZooKeeper，因为它实际上不适合当成通用数据库来用。更有可能的是，你会通过其他项目间接依赖它，例如 HBase、Hadoop YARN、OpenStack Nova 和 Kafka 都依赖 ZooKeeper 在后台运行。这些项目从它那里得到了什么？

ZooKeeper 和 etcd 被设计为容纳少量完全可以放在内存中的数据（虽然它们仍然会写入磁盘以保证持久性），所以你不会想着把所有应用数据放到这里。这些少量数据会通过容错的全序广播算法复制到所有节点上。正如前面所讨论的那样，数据库复制需要的就是全序广播：如果每条消息代表对数据库的写入，则以相同的顺序应用相同的写入操作可以使副本之间保持一致。

ZooKeeper 模仿了 Google 的 Chubby 锁服务【14,98】，不仅实现了全序广播（因此也实现了共识），而且还构建了一组有趣的其他特性，这些特性在构建分布式系统时变得特别有用：

线性一致性的原子操作
: 使用原子 CAS 操作可以实现锁：如果多个节点同时尝试执行相同的操作，只有一个节点会成功。共识协议保证了操作的原子性和线性一致性，即使节点发生故障或网络在任意时刻中断。分布式锁通常以 **租约（lease）** 的形式实现，租约有一个到期时间，以便在客户端失效的情况下最终能被释放（请参阅 “[进程暂停](/v1/ch8#进程暂停)”）。

操作的全序排序
: 如 “[领导者和锁](/v1/ch8#领导者和锁)” 中所述，当某个资源受到锁或租约的保护时，你需要一个防护令牌来防止客户端在进程暂停的情况下彼此冲突。防护令牌是每次锁被获取时单调增加的数字。ZooKeeper 通过全序化所有操作来提供这个功能，它为每个操作提供一个单调递增的事务 ID（`zxid`）和版本号（`cversion`）【15】。

失效检测
: 客户端在 ZooKeeper 服务器上维护一个长期会话，客户端和服务器周期性地交换心跳包来检查节点是否还活着。即使连接暂时中断，或者 ZooKeeper 节点失效，会话仍保持在活跃状态。但如果心跳停止的持续时间超出会话超时，ZooKeeper 会宣告该会话已死亡。会话持有的任何锁都可以配置为在会话超时时自动释放（ZooKeeper 称之为 **临时节点**，即 ephemeral nodes）。

变更通知
: 客户端不仅可以读取其他客户端创建的锁和值，还可以监听它们的变更。因此，客户端可以知道另一个客户端何时加入集群（基于新客户端写入 ZooKeeper 的值），或发生故障（因其会话超时，而其临时节点消失）。通过订阅通知，客户端不用再通过频繁轮询的方式来找出变更。

在这些功能中，只有线性一致的原子操作才真的需要共识。但正是这些功能的组合，使得像 ZooKeeper 这样的系统在分布式协调中非常有用。

#### 将工作分配给节点

ZooKeeper/Chubby 模型运行良好的一个例子是，如果你有几个进程实例或服务，需要选择其中一个实例作为主库或首选服务。如果领导者失败，其他节点之一应该接管。这对单主数据库当然非常实用，但对作业调度程序和类似的有状态系统也很好用。

另一个例子是，当你有一些分区资源（数据库、消息流、文件存储、分布式 Actor 系统等），并需要决定将哪个分区分配给哪个节点时。当新节点加入集群时，需要将某些分区从现有节点移动到新节点，以便重新平衡负载（请参阅 “[分区再平衡](/v1/ch6#分区再平衡)”）。当节点被移除或失效时，其他节点需要接管失效节点的工作。

这类任务可以通过在 ZooKeeper 中明智地使用原子操作，临时节点与通知来实现。如果设计得当，这种方法允许应用自动从故障中恢复而无需人工干预。不过这并不容易，尽管已经有不少在 ZooKeeper 客户端 API 基础之上提供更高层工具的库，例如 Apache Curator 【17】。但它仍然要比尝试从头实现必要的共识算法要好得多，这样的尝试鲜有成功记录【107】。

应用最初只能在单个节点上运行，但最终可能会增长到数千个节点。试图在如此之多的节点上进行多数投票将是非常低效的。相反，ZooKeeper 在固定数量的节点（通常是三到五个）上运行，并在这些节点之间执行其多数票，同时支持潜在的大量客户端。因此，ZooKeeper 提供了一种将协调节点（共识，操作排序和故障检测）的一些工作 “外包” 到外部服务的方式。

通常，由 ZooKeeper 管理的数据类型的变化十分缓慢：代表 “分区 7 中的节点运行在 `10.1.1.23` 上” 的信息可能会在几分钟或几小时的时间内发生变化。它不是用来存储应用的运行时状态的，后者每秒可能会改变数千甚至数百万次。如果应用状态需要从一个节点复制到另一个节点，则可以使用其他工具（如 Apache BookKeeper 【108】）。

#### 服务发现

ZooKeeper、etcd 和 Consul 也经常用于服务发现 —— 也就是找出你需要连接到哪个 IP 地址才能到达特定的服务。在云数据中心环境中，虚拟机来来往往很常见，你通常不会事先知道服务的 IP 地址。相反，你可以配置你的服务，使其在启动时将自己的网络端点注册到服务注册表中，然后其他服务就可以找到它们。

但是，服务发现是否需要达成共识还不太清楚。DNS 是查找服务名称的 IP 地址的传统方式，它使用多层缓存来实现良好的性能和可用性。从 DNS 读取绝对不是线性一致的，如果 DNS 查询的结果有点陈旧，通常不会有问题【109】。DNS 的可用性和对网络中断的鲁棒性更重要。

尽管服务发现并不需要共识，但领导者选举却需要。因此，如果你的共识系统已经知道领导是谁，那么也可以使用这些信息来帮助其他服务发现领导是谁。为此，一些共识系统支持只读缓存副本。这些副本异步接收共识算法所有决策的日志，但不主动参与投票。因此，它们能够处理不需要线性一致性的读取请求。

#### 成员资格服务

ZooKeeper 和它的小伙伴们可以看作是成员资格服务（membership services）研究的悠久历史的一部分，这个历史可以追溯到 20 世纪 80 年代，并且对建立高度可靠的系统（例如空中交通管制）非常重要【110】。

成员资格服务确定哪些节点当前处于活动状态并且是集群的活动成员。正如我们在 [第八章](/v1/ch8) 中看到的那样，由于无限的网络延迟，无法可靠地检测到另一个节点是否发生故障。但是，如果你通过共识来进行故障检测，那么节点可以就哪些节点应该被认为是存在或不存在达成一致。

仍然可能发生这样的情况：一个节点实际上还活着，却被共识错误地宣告死亡。但是对于一个系统来说，知道哪些节点构成了当前的成员关系是非常有用的。例如，选择领导者可能意味着简单地选择当前成员中编号最小的成员，但如果不同的节点对现有的成员都有谁有不同意见，则这种方法将不起作用。


## 本章小结

在本章中，我们从几个不同的角度审视了关于一致性与共识的话题。我们深入研究了线性一致性（一种流行的一致性模型）：其目标是使多副本数据看起来好像只有一个副本一样，并使其上所有操作都原子性地生效。虽然线性一致性因为简单易懂而很吸引人 —— 它使数据库表现得好像单线程程序中的一个变量一样，但它有着速度缓慢的缺点，特别是在网络延迟很大的环境中。

我们还探讨了因果性，因果性对系统中的事件施加了顺序（什么发生在什么之前，基于因与果）。与线性一致不同，线性一致性将所有操作放在单一的全序时间线中，因果一致性为我们提供了一个较弱的一致性模型：某些事件可以是 **并发** 的，所以版本历史就像是一条不断分叉与合并的时间线。因果一致性没有线性一致性的协调开销，而且对网络问题的敏感性要低得多。

但即使捕获到因果顺序（例如使用兰伯特时间戳），我们发现有些事情也不能通过这种方式实现：在 “[光有时间戳排序还不够](#光有时间戳排序还不够)” 一节的例子中，我们需要确保用户名是唯一的，并拒绝同一用户名的其他并发注册。如果一个节点要通过注册，则需要知道其他的节点没有在并发抢注同一用户名的过程中。这个问题引领我们走向 **共识**。

我们看到，达成共识意味着以这样一种方式决定某件事：所有节点一致同意所做决定，且这一决定不可撤销。通过深入挖掘，结果我们发现很广泛的一系列问题实际上都可以归结为共识问题，并且彼此等价（从这个意义上来讲，如果你有其中之一的解决方案，就可以轻易将它转换为其他问题的解决方案）。这些等价的问题包括：

线性一致性的 CAS 寄存器
: 寄存器需要基于当前值是否等于操作给出的参数，原子地 **决定** 是否设置新值。

原子事务提交
: 数据库必须 **决定** 是否提交或中止分布式事务。

全序广播
: 消息系统必须 **决定** 传递消息的顺序。

锁和租约
: 当几个客户端争抢锁或租约时，由锁来 **决定** 哪个客户端成功获得锁。

成员 / 协调服务
: 给定某种故障检测器（例如超时），系统必须 **决定** 哪些节点活着，哪些节点因为会话超时需要被宣告死亡。

唯一性约束
: 当多个事务同时尝试使用相同的键创建冲突记录时，约束必须 **决定** 哪一个被允许，哪些因为违反约束而失败。

如果你只有一个节点，或者你愿意将决策的权能分配给单个节点，所有这些事都很简单。这就是在单领导者数据库中发生的事情：所有决策权归属于领导者，这就是为什么这样的数据库能够提供线性一致的操作，唯一性约束，完全有序的复制日志，以及更多。

但如果该领导者失效，或者如果网络中断导致领导者不可达，这样的系统就无法取得任何进展。应对这种情况可以有三种方法：

1. 等待领导者恢复，接受系统将在这段时间阻塞的事实。许多 XA/JTA 事务协调者选择这个选项。这种方法并不能完全达成共识，因为它不能满足 **终止** 属性的要求：如果领导者续命失败，系统可能会永久阻塞。
2. 人工故障切换，让人类选择一个新的领导者节点，并重新配置系统使之生效，许多关系型数据库都采用这种方式。这是一种来自 “天意” 的共识 —— 由计算机系统之外的运维人员做出决定。故障切换的速度受到人类行动速度的限制，通常要比计算机慢（得多）。
3. 使用算法自动选择一个新的领导者。这种方法需要一种共识算法，使用成熟的算法来正确处理恶劣的网络条件是明智之举【107】。

尽管单领导者数据库可以提供线性一致性，且无需对每个写操作都执行共识算法，但共识对于保持及变更领导权仍然是必须的。因此从某种意义上说，使用单个领导者不过是 “缓兵之计”：共识仍然是需要的，只是在另一个地方，而且没那么频繁。好消息是，容错的共识算法与容错的共识系统是存在的，我们在本章中简要地讨论了它们。

像 ZooKeeper 这样的工具为应用提供了 “外包” 的共识、故障检测和成员服务。它们扮演了重要的角色，虽说使用不易，但总比自己去开发一个能经受 [第八章](/v1/ch8) 中所有问题考验的算法要好得多。如果你发现自己想要解决的问题可以归结为共识，并且希望它能容错，使用一个类似 ZooKeeper 的东西是明智之举。

尽管如此，并不是所有系统都需要共识：例如，无领导者复制和多领导者复制系统通常不会使用全局的共识。这些系统中出现的冲突（请参阅 “[处理写入冲突](/v1/ch5#处理写入冲突)”）正是不同领导者之间没有达成共识的结果，但这也许并没有关系：也许我们只是需要接受没有线性一致性的事实，并学会更好地与具有分支与合并版本历史的数据打交道。

本章引用了大量关于分布式系统理论的研究。虽然理论论文和证明并不总是容易理解，有时也会做出不切实际的假设，但它们对于指导这一领域的实践有着极其重要的价值：它们帮助我们推理什么可以做，什么不可以做，帮助我们找到反直觉的分布式系统缺陷。如果你有时间，这些参考资料值得探索。

这里已经到了本书 [第二部分](/v1/part-ii) 的末尾，第二部分介绍了复制（[第五章](/v1/ch5)）、分区（[第六章](/v1/ch6)）、事务（[第七章](/v1/ch7)）、分布式系统的故障模型（[第八章](/v1/ch8)）以及最后的一致性与共识（[第九章](/v1/ch9)）。现在我们已经奠定了扎实的理论基础，我们将在 [第三部分](/v1/part-iii) 再次转向更实际的系统，并讨论如何使用异构的组件积木块构建强大的应用。


## 参考文献

1. Peter Bailis and Ali Ghodsi: “[Eventual Consistency Today: Limitations, Extensions, and Beyond](http://queue.acm.org/detail.cfm?id=2462076),” *ACM Queue*, volume 11, number 3, pages 55-63, March 2013. [doi:10.1145/2460276.2462076](http://dx.doi.org/10.1145/2460276.2462076)
1. Prince Mahajan, Lorenzo Alvisi, and Mike Dahlin: “[Consistency, Availability, and Convergence](http://apps.cs.utexas.edu/tech_reports/reports/tr/TR-2036.pdf),” University of Texas at Austin, Department of Computer Science, Tech Report UTCS TR-11-22, May 2011.
1. Alex Scotti: “[Adventures in Building Your Own Database](http://www.slideshare.net/AlexScotti1/allyourbase-55212398),” at *All Your Base*, November 2015.
1. Peter Bailis, Aaron Davidson, Alan Fekete, et al.: “[Highly Available Transactions: Virtues and Limitations](http://arxiv.org/pdf/1302.0309.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014. Extended version published as pre-print arXiv:1302.0309 &#91;cs.DB&#93;.
1. Paolo Viotti and Marko Vukolić: “[Consistency in Non-Transactional Distributed Storage Systems](http://arxiv.org/abs/1512.00168),” arXiv:1512.00168, 12 April 2016.
1. Maurice P. Herlihy and Jeannette M. Wing: “[Linearizability: A Correctness Condition for Concurrent Objects](http://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 12, number 3, pages 463–492, July 1990. [doi:10.1145/78969.78972](http://dx.doi.org/10.1145/78969.78972)
1. Leslie Lamport: “[On interprocess communication](https://www.microsoft.com/en-us/research/publication/interprocess-communication-part-basic-formalism-part-ii-algorithms/),” *Distributed Computing*, volume 1, number 2, pages 77–101, June 1986. [doi:10.1007/BF01786228](http://dx.doi.org/10.1007/BF01786228)
1. David K. Gifford: “[Information Storage in a Decentralized Computer System](http://www.mirrorservice.org/sites/www.bitsavers.org/pdf/xerox/parc/techReports/CSL-81-8_Information_Storage_in_a_Decentralized_Computer_System.pdf),” Xerox Palo Alto Research Centers, CSL-81-8, June 1981.
1. Martin Kleppmann: “[Please Stop Calling Databases CP or AP](http://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html),” *martin.kleppmann.com*, May 11, 2015.
1. Kyle Kingsbury: “[Call Me Maybe: MongoDB Stale Reads](https://aphyr.com/posts/322-call-me-maybe-mongodb-stale-reads),” *aphyr.com*, April 20, 2015.
1. Kyle Kingsbury: “[Computational Techniques in Knossos](https://aphyr.com/posts/314-computational-techniques-in-knossos),” *aphyr.com*, May 17, 2014.
1. Peter Bailis: “[Linearizability Versus Serializability](http://www.bailis.org/blog/linearizability-versus-serializability/),” *bailis.org*, September 24, 2014.
1. Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman: [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at *research.microsoft.com*.
1. Mike Burrows: “[The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Flavio P. Junqueira and Benjamin Reed: *ZooKeeper: Distributed Process Coordination*. O'Reilly Media, 2013. ISBN: 978-1-449-36130-3
1. “[etcd Documentation](https://etcd.io/docs/),” The Linux Foundation, *etcd.io*.
1. “[Apache Curator](http://curator.apache.org/),” Apache Software Foundation, *curator.apache.org*, 2015.
1. Murali Vallath: *Oracle 10g RAC Grid, Services & Clustering*. Elsevier Digital Press, 2006. ISBN: 978-1-555-58321-7
1. Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
1. Kyle Kingsbury: “[Call Me Maybe: etcd and Consul](https://aphyr.com/posts/316-call-me-maybe-etcd-and-consul),” *aphyr.com*, June 9, 2014.
1. Flavio P. Junqueira, Benjamin C. Reed, and Marco Serafini: “[Zab: High-Performance Broadcast for Primary-Backup Systems](https://web.archive.org/web/20220419064903/https://marcoserafini.github.io/papers/zab.pdf),” at *41st IEEE International Conference on Dependable Systems and Networks* (DSN), June 2011. [doi:10.1109/DSN.2011.5958223](http://dx.doi.org/10.1109/DSN.2011.5958223)
1. Diego Ongaro and John K. Ousterhout: “[In Search of an Understandable Consensus Algorithm](https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf),” at *USENIX Annual Technical Conference* (ATC), June 2014.
1. Hagit Attiya, Amotz Bar-Noy, and Danny Dolev: “[Sharing Memory Robustly in Message-Passing Systems](http://www.cse.huji.ac.il/course/2004/dist/p124-attiya.pdf),” *Journal of the ACM*, volume 42, number 1, pages 124–142, January 1995. [doi:10.1145/200836.200869](http://dx.doi.org/10.1145/200836.200869)
1. Nancy Lynch and Alex Shvartsman: “[Robust Emulation of Shared Memory Using Dynamic Quorum-Acknowledged Broadcasts](http://groups.csail.mit.edu/tds/papers/Lynch/FTCS97.pdf),” at *27th Annual International Symposium on Fault-Tolerant Computing* (FTCS), June 1997. [doi:10.1109/FTCS.1997.614100](http://dx.doi.org/10.1109/FTCS.1997.614100)
1. Christian Cachin, Rachid Guerraoui, and Luís Rodrigues: [*Introduction to Reliable and Secure Distributed Programming*](http://www.distributedprogramming.net/), 2nd edition. Springer, 2011. ISBN: 978-3-642-15259-7, [doi:10.1007/978-3-642-15260-3](http://dx.doi.org/10.1007/978-3-642-15260-3)
1. Sam Elliott, Mark Allen, and Martin Kleppmann: [personal communication](https://web.archive.org/web/20230620021338/https://twitter.com/lenary/status/654761711933648896), thread on *twitter.com*, October 15, 2015.
1. Niklas Ekström, Mikhail Panchenko, and Jonathan Ellis: “[Possible Issue with Read Repair?](http://mail-archives.apache.org/mod_mbox/cassandra-dev/201210.mbox/%3CFA480D1DC3964E2C8B0A14E0880094C9%40Robotech%3E),” email thread on *cassandra-dev* mailing list, October 2012.
1. Maurice P. Herlihy: “[Wait-Free Synchronization](https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 13, number 1, pages 124–149, January 1991. [doi:10.1145/114005.102808](http://dx.doi.org/10.1145/114005.102808)
1. Armando Fox and Eric A. Brewer: “[Harvest, Yield, and Scalable Tolerant Systems](http://radlab.cs.berkeley.edu/people/fox/static/pubs/pdf/c18.pdf),” at *7th Workshop on Hot Topics in Operating Systems* (HotOS), March 1999. [doi:10.1109/HOTOS.1999.798396](http://dx.doi.org/10.1109/HOTOS.1999.798396)
1. Seth Gilbert and Nancy Lynch: “[Brewer’s Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](http://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf),” *ACM SIGACT News*, volume 33, number 2, pages 51–59, June 2002. [doi:10.1145/564585.564601](http://dx.doi.org/10.1145/564585.564601)
1. Seth Gilbert and Nancy Lynch: “[Perspectives on the CAP Theorem](http://groups.csail.mit.edu/tds/papers/Gilbert/Brewer2.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 30–36, February 2012. [doi:10.1109/MC.2011.389](http://dx.doi.org/10.1109/MC.2011.389)
1. Eric A. Brewer: “[CAP Twelve Years Later: How the 'Rules' Have Changed](https://web.archive.org/web/20221222092656/http://cs609.cs.ua.edu/CAP12.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 23–29, February 2012. [doi:10.1109/MC.2012.37](http://dx.doi.org/10.1109/MC.2012.37)
1. Susan B. Davidson, Hector Garcia-Molina, and Dale Skeen: “[Consistency in Partitioned Networks](http://delab.csd.auth.gr/~dimitris/courses/mpc_fall05/papers/invalidation/acm_csur85_partitioned_network_consistency.pdf),” *ACM Computing Surveys*, volume 17, number 3, pages 341–370, September 1985. [doi:10.1145/5505.5508](http://dx.doi.org/10.1145/5505.5508)
1. Paul R. Johnson and Robert H. Thomas: “[RFC 677: The Maintenance of Duplicate Databases](https://tools.ietf.org/html/rfc677),” Network Working Group, January 27, 1975.
1. Bruce G. Lindsay, Patricia Griffiths Selinger, C. Galtieri, et al.: “[Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf),” IBM Research, Research Report RJ2571(33471), July 1979.
1. Michael J. Fischer and Alan Michael: “[Sacrificing Serializability to Attain High Availability of Data in an Unreliable Network](http://www.cs.ucsb.edu/~agrawal/spring2011/ugrad/p70-fischer.pdf),” at *1st ACM Symposium on Principles of Database Systems* (PODS), March 1982. [doi:10.1145/588111.588124](http://dx.doi.org/10.1145/588111.588124)
1. Eric A. Brewer: “[NoSQL: Past, Present, Future](http://www.infoq.com/presentations/NoSQL-History),” at *QCon San Francisco*, November 2012.
1. Henry Robinson: “[CAP Confusion: Problems with 'Partition Tolerance,'](https://web.archive.org/web/20160304020135/http://blog.cloudera.com/blog/2010/04/cap-confusion-problems-with-partition-tolerance/)” *blog.cloudera.com*, April 26, 2010.
1. Adrian Cockcroft: “[Migrating to Microservices](http://www.infoq.com/presentations/migration-cloud-native),” at *QCon London*, March 2014.
1. Martin Kleppmann: “[A Critique of the CAP Theorem](http://arxiv.org/abs/1509.05393),” arXiv:1509.05393, September 17, 2015.
1. Nancy A. Lynch: “[A Hundred Impossibility Proofs for Distributed Computing](http://groups.csail.mit.edu/tds/papers/Lynch/podc89.pdf),” at *8th ACM Symposium on Principles of Distributed Computing* (PODC), August 1989. [doi:10.1145/72981.72982](http://dx.doi.org/10.1145/72981.72982)
1. Hagit Attiya, Faith Ellen, and Adam Morrison: “[Limitations of Highly-Available Eventually-Consistent Data Stores](https://www.cs.tau.ac.il/~mad/publications/podc2015-replds.pdf),” at *ACM Symposium on Principles of Distributed Computing* (PODC), July 2015. [doi:10.1145/2767386.2767419](http://dx.doi.org/10.1145/2767386.2767419)
1. Peter Sewell, Susmit Sarkar, Scott Owens, et al.: “[x86-TSO: A Rigorous and Usable Programmer's Model for x86 Multiprocessors](http://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf),” *Communications of the ACM*, volume 53, number 7, pages 89–97, July 2010. [doi:10.1145/1785414.1785443](http://dx.doi.org/10.1145/1785414.1785443)
1. Martin Thompson: “[Memory Barriers/Fences](http://mechanical-sympathy.blogspot.co.uk/2011/07/memory-barriersfences.html),” *mechanical-sympathy.blogspot.co.uk*, July 24, 2011.
1. Ulrich Drepper: “[What Every Programmer Should Know About Memory](http://www.akkadia.org/drepper/cpumemory.pdf),” *akkadia.org*, November 21, 2007.
1. Daniel J. Abadi: “[Consistency Tradeoffs in Modern Distributed Database System Design](http://cs-www.cs.yale.edu/homes/dna/papers/abadi-pacelc.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 37–42, February 2012. [doi:10.1109/MC.2012.33](http://dx.doi.org/10.1109/MC.2012.33)
1. Hagit Attiya and Jennifer L. Welch: “[Sequential Consistency Versus Linearizability](http://courses.csail.mit.edu/6.852/01/papers/p91-attiya.pdf),” *ACM Transactions on Computer Systems* (TOCS), volume 12, number 2, pages 91–122, May 1994. [doi:10.1145/176575.176576](http://dx.doi.org/10.1145/176575.176576)
1. Mustaque Ahamad, Gil Neiger, James E. Burns, et al.: “[Causal Memory: Definitions, Implementation, and Programming](http://www-i2.informatik.rwth-aachen.de/i2/fileadmin/user_upload/documents/Seminar_MCMM11/Causal_memory_1996.pdf),” *Distributed Computing*, volume 9, number 1, pages 37–49, March 1995. [doi:10.1007/BF01784241](http://dx.doi.org/10.1007/BF01784241)
1. Wyatt Lloyd, Michael J. Freedman, Michael Kaminsky, and David G. Andersen: “[Stronger Semantics for Low-Latency Geo-Replicated Storage](https://www.usenix.org/system/files/conference/nsdi13/nsdi13-final149.pdf),” at *10th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2013.
1. Marek Zawirski, Annette Bieniusa, Valter Balegas, et al.: “[SwiftCloud: Fault-Tolerant Geo-Replication Integrated All the Way to the Client Machine](http://arxiv.org/abs/1310.3107),” INRIA Research Report 8347, August 2013.
1. Peter Bailis, Ali Ghodsi, Joseph M Hellerstein, and Ion Stoica: “[Bolt-on Causal Consistency](http://db.cs.berkeley.edu/papers/sigmod13-bolton.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Peter Bailis: “[Causality Is Expensive (and What to Do About It)](http://www.bailis.org/blog/causality-is-expensive-and-what-to-do-about-it/),” *bailis.org*, February 5, 2014.
1. Ricardo Gonçalves, Paulo Sérgio Almeida, Carlos Baquero, and Victor Fonte: “[Concise Server-Wide Causality Management for Eventually Consistent Data Stores](https://web.archive.org/web/20220810205439/http://haslab.uminho.pt/tome/files/global_logical_clocks.pdf),” at *15th IFIP International Conference on Distributed Applications and Interoperable Systems* (DAIS), June 2015. [doi:10.1007/978-3-319-19129-4_6](http://dx.doi.org/10.1007/978-3-319-19129-4_6)
1. Rob Conery: “[A Better ID Generator for PostgreSQL](https://web.archive.org/web/20220118044729/http://rob.conery.io/2014/05/29/a-better-id-generator-for-postgresql/),” *rob.conery.io*, May 29, 2014.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Xavier Défago, André Schiper, and Péter Urbán: “[Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey](https://dspace.jaist.ac.jp/dspace/bitstream/10119/4883/1/defago_et_al.pdf),” *ACM Computing Surveys*, volume 36, number 4, pages 372–421, December 2004. [doi:10.1145/1041680.1041682](http://dx.doi.org/10.1145/1041680.1041682)
1. Hagit Attiya and Jennifer Welch: *Distributed Computing: Fundamentals, Simulations and Advanced Topics*, 2nd edition. John Wiley & Sons, 2004. ISBN: 978-0-471-45324-6, [doi:10.1002/0471478210](http://dx.doi.org/10.1002/0471478210)
1. Mahesh Balakrishnan, Dahlia Malkhi, Vijayan Prabhakaran, et al.: “[CORFU: A Shared Log Design for Flash Clusters](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final30.pdf),” at *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
1. Fred B. Schneider: “[Implementing Fault-Tolerant Services Using the State Machine Approach: A Tutorial](http://www.cs.cornell.edu/fbs/publications/smsurvey.pdf),” *ACM Computing Surveys*, volume 22, number 4, pages 299–319, December 1990.
1. Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, et al.: “[Calvin: Fast Distributed Transactions for Partitioned Database Systems](http://cs.yale.edu/homes/thomson/publications/calvin-sigmod12.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 2012.
1. Mahesh Balakrishnan, Dahlia Malkhi, Ted Wobber, et al.: “[Tango: Distributed Data Structures over a Shared Log](https://www.microsoft.com/en-us/research/publication/tango-distributed-data-structures-over-a-shared-log/),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), November 2013. [doi:10.1145/2517349.2522732](http://dx.doi.org/10.1145/2517349.2522732)
1. Robbert van Renesse and Fred B. Schneider: “[Chain Replication for Supporting High Throughput and Availability](http://static.usenix.org/legacy/events/osdi04/tech/full_papers/renesse/renesse.pdf),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Leslie Lamport: “[How to Make a Multiprocessor Computer That Correctly Executes Multiprocess Programs](https://lamport.azurewebsites.net/pubs/multi.pdf),” *IEEE Transactions on Computers*, volume 28, number 9, pages 690–691, September 1979. [doi:10.1109/TC.1979.1675439](http://dx.doi.org/10.1109/TC.1979.1675439)
1. Enis Söztutar, Devaraj Das, and Carter Shanklin: “[Apache HBase High Availability at the Next Level](https://web.archive.org/web/20160405122821/http://hortonworks.com/blog/apache-hbase-high-availability-next-level/),” *hortonworks.com*, January 22, 2015.
1. Brian F Cooper, Raghu Ramakrishnan, Utkarsh Srivastava, et al.: “[PNUTS: Yahoo!’s Hosted Data Serving Platform](http://www.mpi-sws.org/~druschel/courses/ds/papers/cooper-pnuts.pdf),” at *34th International Conference on Very Large Data Bases* (VLDB), August 2008. [doi:10.14778/1454159.1454167](http://dx.doi.org/10.14778/1454159.1454167)
1. Tushar Deepak Chandra and Sam Toueg: “[Unreliable Failure Detectors for Reliable Distributed Systems](http://courses.csail.mit.edu/6.852/08/papers/CT96-JACM.pdf),” *Journal of the ACM*, volume 43, number 2, pages 225–267, March 1996. [doi:10.1145/226643.226647](http://dx.doi.org/10.1145/226643.226647)
1. Michael J. Fischer, Nancy Lynch, and Michael S. Paterson: “[Impossibility of Distributed Consensus with One Faulty Process](https://groups.csail.mit.edu/tds/papers/Lynch/jacm85.pdf),” *Journal of the ACM*, volume 32, number 2, pages 374–382, April 1985. [doi:10.1145/3149.214121](http://dx.doi.org/10.1145/3149.214121)
1. Michael Ben-Or: “Another Advantage of Free Choice: Completely Asynchronous Agreement Protocols,” at *2nd ACM Symposium on Principles of Distributed Computing* (PODC), August 1983. [doi:10.1145/800221.806707](http://dl.acm.org/citation.cfm?id=806707)
1. Jim N. Gray and Leslie Lamport: “[Consensus on Transaction Commit](http://db.cs.berkeley.edu/cs286/papers/paxoscommit-tods2006.pdf),” *ACM Transactions on Database Systems* (TODS), volume 31, number 1, pages 133–160, March 2006. [doi:10.1145/1132863.1132867](http://dx.doi.org/10.1145/1132863.1132867)
1. Rachid Guerraoui: “[Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus](https://citeseerx.ist.psu.edu/pdf/5d06489503b6f791aa56d2d7942359c2592e44b0),” at *9th International Workshop on Distributed Algorithms* (WDAG), September 1995. [doi:10.1007/BFb0022140](http://dx.doi.org/10.1007/BFb0022140)
1. Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, et al.: “[All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](http://research.cs.wisc.edu/wind/Publications/alice-osdi14.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
1. Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
1. C. Mohan, Bruce G. Lindsay, and Ron Obermarck: “[Transaction Management in the R* Distributed Database Management System](https://cs.brown.edu/courses/csci2270/archives/2012/papers/dtxn/p378-mohan.pdf),” *ACM Transactions on Database Systems*, volume 11, number 4, pages 378–396, December 1986. [doi:10.1145/7239.7266](http://dx.doi.org/10.1145/7239.7266)
1. “[Distributed Transaction Processing: The XA Specification](http://pubs.opengroup.org/onlinepubs/009680699/toc.pdf),” X/Open Company Ltd., Technical Standard XO/CAE/91/300, December 1991. ISBN: 978-1-872-63024-3
1. Mike Spille: “[XA Exposed, Part II](http://www.jroller.com/pyrasun/entry/xa_exposed_part_ii_schwartz),” *jroller.com*, April 3, 2004.
1. Ivan Silva Neto and Francisco Reverbel: “[Lessons Learned from Implementing WS-Coordination and WS-AtomicTransaction](http://www.ime.usp.br/~reverbel/papers/icis2008.pdf),” at *7th IEEE/ACIS International Conference on Computer and Information Science* (ICIS), May 2008. [doi:10.1109/ICIS.2008.75](http://dx.doi.org/10.1109/ICIS.2008.75)
1. James E. Johnson, David E. Langworthy, Leslie Lamport, and Friedrich H. Vogt: “[Formal Specification of a Web Services Protocol](https://www.microsoft.com/en-us/research/publication/formal-specification-of-a-web-services-protocol/),” at *1st International Workshop on Web Services and Formal Methods* (WS-FM), February 2004. [doi:10.1016/j.entcs.2004.02.022](http://dx.doi.org/10.1016/j.entcs.2004.02.022)
1. Dale Skeen: “[Nonblocking Commit Protocols](http://www.cs.utexas.edu/~lorenzo/corsi/cs380d/papers/Ske81.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), April 1981. [doi:10.1145/582318.582339](http://dx.doi.org/10.1145/582318.582339)
1. Gregor Hohpe: “[Your Coffee Shop Doesn’t Use Two-Phase Commit](http://www.martinfowler.com/ieeeSoftware/coffeeShop.pdf),” *IEEE Software*, volume 22, number 2, pages 64–66, March 2005. [doi:10.1109/MS.2005.52](http://dx.doi.org/10.1109/MS.2005.52)
1. Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20210303104924/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
1. Jonathan Oliver: “[My Beef with MSDTC and Two-Phase Commits](http://blog.jonathanoliver.com/my-beef-with-msdtc-and-two-phase-commits/),” *blog.jonathanoliver.com*, April 4, 2011.
1. Oren Eini (Ayende Rahien): “[The Fallacy of Distributed Transactions](http://ayende.com/blog/167362/the-fallacy-of-distributed-transactions),” *ayende.com*, July 17, 2014.
1. Clemens Vasters: “[Transactions in Windows Azure (with Service Bus) – An Email Discussion](https://blogs.msdn.microsoft.com/clemensv/2012/07/30/transactions-in-windows-azure-with-service-bus-an-email-discussion/),” *vasters.com*, July 30, 2012.
1. “[Understanding Transactionality in Azure](https://docs.particular.net/nservicebus/azure/understanding-transactionality-in-azure),” NServiceBus Documentation, Particular Software, 2015.
1. Randy Wigginton, Ryan Lowe, Marcos Albe, and Fernando Ipar: “[Distributed Transactions in MySQL](https://web.archive.org/web/20161010054152/https://www.percona.com/live/mysql-conference-2013/sites/default/files/slides/XA_final.pdf),” at *MySQL Conference and Expo*, April 2013.
1. Mike Spille: “[XA Exposed, Part I](https://web.archive.org/web/20130523064202/http://www.jroller.com/pyrasun/entry/xa_exposed),” *jroller.com*, April 3, 2004.
1. Ajmer Dhariwal: “[Orphaned MSDTC Transactions (-2 spids)](https://www.eraofdata.com/posts/2008/orphaned-msdtc-transactions-2-spids/),” *eraofdata.com*, December 12, 2008.
1. Paul Randal: “[Real World Story of DBCC PAGE Saving the Day](http://www.sqlskills.com/blogs/paul/real-world-story-of-dbcc-page-saving-the-day/),” *sqlskills.com*, June 19, 2013.
1. “[in-doubt xact resolution Server Configuration Option](https://msdn.microsoft.com/en-us/library/ms179586.aspx),” SQL Server 2016 documentation, Microsoft, Inc., 2016.
1. Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer: “[Consensus in the Presence of Partial Synchrony](https://web.archive.org/web/20210318133551/https://www.net.t-labs.tu-berlin.de/~petr/ADC-07/papers/DLS88.pdf),” *Journal of the ACM*, volume 35, number 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](http://dx.doi.org/10.1145/42282.42283)
1. Miguel Castro and Barbara H. Liskov: “[Practical Byzantine Fault Tolerance and Proactive Recovery](https://web.archive.org/web/20181123142540/http://zoo.cs.yale.edu/classes/cs426/2012/bib/castro02practical.pdf),” *ACM Transactions on Computer Systems*, volume 20, number 4, pages 396–461, November 2002. [doi:10.1145/571637.571640](http://dx.doi.org/10.1145/571637.571640)
1. Brian M. Oki and Barbara H. Liskov: “[Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems](http://www.cs.princeton.edu/courses/archive/fall11/cos518/papers/viewstamped.pdf),” at *7th ACM Symposium on Principles of Distributed Computing* (PODC), August 1988. [doi:10.1145/62546.62549](http://dx.doi.org/10.1145/62546.62549)
1. Barbara H. Liskov and James Cowling: “[Viewstamped Replication Revisited](http://pmg.csail.mit.edu/papers/vr-revisited.pdf),” Massachusetts Institute of Technology, Tech Report MIT-CSAIL-TR-2012-021, July 2012.
1. Leslie Lamport: “[The Part-Time Parliament](https://www.microsoft.com/en-us/research/publication/part-time-parliament/),” *ACM Transactions on Computer Systems*, volume 16, number 2, pages 133–169, May 1998. [doi:10.1145/279227.279229](http://dx.doi.org/10.1145/279227.279229)
1. Leslie Lamport: “[Paxos Made Simple](https://www.microsoft.com/en-us/research/publication/paxos-made-simple/),” *ACM SIGACT News*, volume 32, number 4, pages 51–58, December 2001.
1. Tushar Deepak Chandra, Robert Griesemer, and Joshua Redstone: “[Paxos Made Live – An Engineering Perspective](http://www.read.seas.harvard.edu/~kohler/class/08w-dsi/chandra07paxos.pdf),” at *26th ACM Symposium on Principles of Distributed Computing* (PODC), June 2007.
1. Robbert van Renesse: “[Paxos Made Moderately Complex](http://www.cs.cornell.edu/home/rvr/Paxos/paxos.pdf),” *cs.cornell.edu*, March 2011.
1. Diego Ongaro: “[Consensus: Bridging Theory and Practice](https://github.com/ongardie/dissertation),” PhD Thesis, Stanford University, August 2014.
1. Heidi Howard, Malte Schwarzkopf, Anil Madhavapeddy, and Jon Crowcroft: “[Raft Refloated: Do We Have Consensus?](https://web.archive.org/web/20230319151303/https://www.cl.cam.ac.uk/~ms705/pub/papers/2015-osr-raft.pdf),” *ACM SIGOPS Operating Systems Review*, volume 49, number 1, pages 12–21, January 2015. [doi:10.1145/2723872.2723876](http://dx.doi.org/10.1145/2723872.2723876)
1. André Medeiros: “[ZooKeeper’s Atomic Broadcast Protocol: Theory and Practice](http://www.tcs.hut.fi/Studies/T-79.5001/reports/2012-deSouzaMedeiros.pdf),” Aalto University School of Science, March 20, 2012.
1. Robbert van Renesse, Nicolas Schiper, and Fred B. Schneider: “[Vive La Différence: Paxos vs. Viewstamped Replication vs. Zab](http://arxiv.org/abs/1309.5671),” *IEEE Transactions on Dependable and Secure Computing*, volume 12, number 4, pages 472–484, September 2014. [doi:10.1109/TDSC.2014.2355848](http://dx.doi.org/10.1109/TDSC.2014.2355848)
1. Will Portnoy: “[Lessons Learned from Implementing Paxos](http://blog.willportnoy.com/2012/06/lessons-learned-from-paxos.html),” *blog.willportnoy.com*, June 14, 2012.
1. Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman: “[Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/opus/volltexte/2017/7094/pdf/LIPIcs-OPODIS-2016-25.pdf),” at *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](http://dx.doi.org/10.4230/LIPIcs.OPODIS.2016.25)
1. Heidi Howard and Jon Crowcroft: “[Coracle: Evaluating Consensus at the Internet Edge](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p85.pdf),” at *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2829988.2790010](http://dx.doi.org/10.1145/2829988.2790010)
1. Kyle Kingsbury: “[Call Me Maybe: Elasticsearch 1.5.0](https://aphyr.com/posts/323-call-me-maybe-elasticsearch-1-5-0),” *aphyr.com*, April 27, 2015.
1. Ivan Kelly: “[BookKeeper Tutorial](https://github.com/ivankelly/bookkeeper-tutorial),” *github.com*, October 2014.
1. Camille Fournier: “[Consensus Systems for the Skeptical Architect](https://vimeo.com/102667163),” at *Philly ETE*, Philadelphia, PA, USA, April 2014.
1. Kenneth P. Birman: “[A History of the Virtual Synchrony Replication Model](https://ptolemy.berkeley.edu/projects/truststc/pubs/713/History%20of%20the%20Virtual%20Synchrony%20Replication%20Model%202010.pdf),” in *Replication: Theory and Practice*, Springer LNCS volume 5959, chapter 6, pages 91–120, 2010. ISBN: 978-3-642-11293-5, [doi:10.1007/978-3-642-11294-2_6](http://dx.doi.org/10.1007/978-3-642-11294-2_6)


================================================
FILE: content/v1/colophon.md
================================================
---
title: 后记
weight: 600
breadcrumbs: false
---

## 关于作者

**Martin Kleppmann** 是英国剑桥大学分布式系统方向的研究员。此前他是一名软件工程师和创业者，曾在 LinkedIn 和 Rapportive 等互联网公司从事大规模数据基础设施方面的工作。在这个过程中，他吃过不少苦头才学到了一些经验，他希望这本书能够让你避免重蹈覆辙。

Martin 经常在技术会议上演讲，同时也是一位博主和开源贡献者。他认为，深刻的技术思想应该让每个人都能理解，而更深入的理解能帮助我们开发出更好的软件。

![](http://martin.kleppmann.com/2017/03/ddia-poster.jpg)


## 关于译者

[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
PostgreSQL 发行版 [**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，数据库/云计算 KOL。


## 后记

《设计数据密集型应用》封面上的动物是 **印度野猪（Sus scrofa cristatus）**，它是在印度、缅甸、尼泊尔、斯里兰卡和泰国发现的一种野猪的亚种。与欧洲野猪不同，它们有更高的背部鬃毛，没有体表绒毛，以及更大更直的头骨。

印度野猪的毛发呈灰色或黑色，脊背上长有短而硬的鬃毛。雄性有突出的犬齿（称为獠牙，tushes），用来与对手战斗或抵御掠食者。雄性比雌性大，这一物种平均肩高 33-35 英寸，体重 200-300 磅。它们的天敌包括熊、老虎和各种大型猫科动物。

这些动物夜行且杂食 —— 它们吃各种各样的东西，包括根、昆虫、腐肉、坚果、浆果和小动物。野猪经常因为破坏农作物的根被人们所熟知，它们造成大量的破坏，并被农民所敌视。它们每天需要摄入 4,000 ~ 4,500 卡路里的能量。野猪有发达的嗅觉，这有助于它们寻找地下的植物和穴居动物。然而，它们的视力很差。

野猪在人类文化中一直具有重要意义。在印度教传说中，野猪是毗湿奴神的化身。在古希腊的丧葬纪念碑中，它是一个勇敢失败者的象征（与胜利的狮子相反）。由于它的攻击性，它被描绘在斯堪的纳维亚、日耳曼和盎格鲁撒克逊战士的盔甲和武器上。在中国十二生肖中，它象征着决心和急躁。

O'Reilly 封面上的许多动物都濒临灭绝，而所有这些动物对世界都很重要。要了解有关如何提供帮助的更多信息，请访问 animals.oreilly.com。

封面图片来自 Shaw's Zoology。封面字体是 URW Typewriter 和 Guardian Sans。文字字体是 Adobe Minion Pro；图中的字体是 Adobe Myriad Pro；标题字体是 Adobe Myriad Condensed；代码字体是 Dalton Maag 的 Ubuntu Mono。

================================================
FILE: content/v1/contrib.md
================================================
---
title: 贡献者
weight: 800
breadcrumbs: false
---

## 译者

[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
[**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，公众号：《老冯云数》，数据库 KOL。

## 校订与维护

Yin Gang [@yingang](https://github.com/yingang) 对本书进行了全文校订，并持续维护。

## 繁体中文版本

[繁體中文](/tw) **版本维护** by [@afunTW](https://github.com/afunTW)

## 贡献列表

[GitHub 贡献者列表](https://github.com/Vonng/ddia/graphs/contributors)

0. 全文校订 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章语法标点校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 与[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/v1/part-i)前言，[ch2](/v1/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [词汇表](/v1/glossary)、[后记](/v1/colophon)关于野猪的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本与转换脚本 by [@afunTW](https://github.com/afunTW)
7. 多处翻译修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)


感谢所有提出意见，作出贡献的朋友们，您可以在这里找到所有贡献的 [Issue 列表](https://github.com/Vonng/ddia/issues) 与 [PR 列表](https://github.com/Vonng/ddia/pulls)：

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一处拼写错误                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一处拼写错误                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一处标点错误                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一处格式错误                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一处参考链接                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正两处引用错误                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支持输出为 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一处格式错误                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一处图像链接                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 优化一处翻译                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 优化一处翻译                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 优化两处翻译                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 优化多处翻译                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 优化一处翻译                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一处繁体中文错误                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一处繁体中文错误                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一处翻译错误                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正几处拼写错误                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 优化一处翻译                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一处翻译错误                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一处翻译遗漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 优化一处翻译                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 优化一处翻译                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 优化一处翻译                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 优化一处翻译                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正两处错误                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一处列表错误                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一处错别字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一处公式问题                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多处内部链接错误                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正内部链接错误                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 显示的问题                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 发现了繁体中文版本中的错误翻译                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 链接                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正错别字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 统一了 write skew 的翻译                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 统一了 rebalancing 的翻译                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻译                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正译文中的重复单词                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不准确的翻译                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一处翻译错误                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一处拼写错误                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可串行化”相关内容的多处翻译不当                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重复读”相关内容的多处翻译不当                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“读已提交”相关内容的多处翻译不当                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁体中文的转译错误                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁体中文的转译错误                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通顺的翻译                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通顺的翻译                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正确的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通顺的翻译                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻译                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正错误的图片链接                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁体中文的转译错误：复杂                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正导航栏中的章节名称                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正线性一致的繁体中文翻译                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正错误的翻译                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 优化译文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通顺的翻译                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不准确的翻译                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻译                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正错别字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小标题跳转的问题                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的网址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正错别字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建议docsify的主题风格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻译错误                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁体中文的转译错误                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支持 Github Pages 里的公式显示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 语义网相关翻译更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不变式相关翻译更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正确的中文用词和标点符号                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻译                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重复的译文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通顺的翻译                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 发现错误的文献索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正错误的标点符号                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正错误字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建议将 network model 翻译为网状模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正错误字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正缩略图的错别字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修订sibling相关的翻译                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一处不准确的翻译                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 识别了当前简繁转译过程中处理不当的地方，暂通过转换脚本规避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻译`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新残留的机翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建议去除段首的制表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 发现一处错误格式的章节引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章节Summary中多处不通顺的翻译                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多处不通顺的或错误的翻译                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 优化多处不通顺的或错误的翻译                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 优化多处不通顺的或错误的翻译                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化多处错误的或不通顺的翻译                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化一处容易产生歧义的翻译                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正两处错误的翻译                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正两处强调文本和四处代码变量名称                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一处错误的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一处错误的翻译（功能 -> 函数）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 优化 how best 的翻译（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 统一每章的标题格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重复词语                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改语句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 读已写入数据                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死还是赖活着                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼杀 → 破坏                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改变" rathr than "盖面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:钟-->种，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否则 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->调用，显着->显著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改链接错误                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4几处翻译                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 几个疏漏和格式错误                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 删除一个多余的右括号                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部链接错误                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假简单"->"更加简单"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修复 ch1 中的无序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 纠正多处的翻译小错误                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修复多个链接错误 2.名词优化修订 3.错误修订                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修复一些明显错误                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修复链接错误                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改词语顺序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正错别字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 纠正翻译错误                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目录和本章标题不符的情况                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修复语句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 详细修改了后记中和印度野猪相关的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻译                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01语法微调                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |


================================================
FILE: content/v1/glossary.md
================================================
---
title: 术语表
weight: 500
breadcrumbs: false
---

> 请注意，本术语表中的定义简短而简单，旨在传达核心思想，而非死扣完整细节。有关更多详细信息，请参阅正文中的参考资料。


## **异步（asynchronous）**

  不等待某些事情完成（例如，将数据发送到网络中的另一个节点），并且不会假设要花多长时间。请参阅“[同步复制与异步复制](/v1/ch5#同步复制与异步复制)”、“[同步网络与异步网络](/v1/ch8#同步网络与异步网络)”以及“[系统模型与现实](/v1/ch8#系统模型与现实)”。

## **原子（atomic）**

   在并发操作的上下文中：描述一个在单个时间点看起来生效的操作，所以另一个并发进程永远不会遇到处于“半完成”状态的操作。另见隔离。

   在事务的上下文中：将一些写入操作分为一组，这组写入要么全部提交成功，要么遇到错误时全部回滚。请参阅“[原子性](/v1/ch7#原子性)”和“[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)”。

## **背压（backpressure）**

  接收方接收数据速度较慢时，强制降低发送方的数据发送速度。也称为流量控制。请参阅“[消息传递系统](/v1/ch11#消息传递系统)”。

## **批处理（batch process）**

  一种计算，它将一些固定的（通常是大的）数据集作为输入，并将其他一些数据作为输出，而不修改输入。见[第十章](/v1/ch10)。

## **边界（bounded）**

  有一些已知的上限或大小。例如，网络延迟情况（请参阅“[超时与无穷的延迟](/v1/ch8#超时与无穷的延迟)”）和数据集（请参阅[第十一章](/v1/ch11)的介绍）。

## **拜占庭故障（Byzantine fault）**

  表现异常的节点，这种异常可能以任意方式出现，例如向其他节点发送矛盾或恶意消息。请参阅“[拜占庭故障](/v1/ch8#拜占庭故障)”。

## **缓存（cache）**

  一种组件，通过存储最近使用过的数据，加快未来对相同数据的读取速度。缓存中通常存放部分数据：因此，如果缓存中缺少某些数据，则必须从某些底层较慢的数据存储系统中，获取完整的数据副本。

## **CAP定理（CAP theorem）**

  一个被广泛误解的理论结果，在实践中是没有用的。请参阅“[CAP定理](/v1/ch9#CAP定理)”。

## **因果关系（causality）**

  事件之间的依赖关系，当一件事发生在另一件事情之前。例如，后面的事件是对早期事件的回应，或者依赖于更早的事件，或者应该根据先前的事件来理解。请参阅“[“此前发生”的关系和并发](/v1/ch5#此前发生的关系和并发)”和“[顺序与因果关系](/v1/ch9#顺序与因果关系)”。

## **共识（consensus）**

  分布式计算的一个基本问题，就是让几个节点同意某些事情（例如，哪个节点应该是数据库集群的领导者）。问题比乍看起来要困难得多。请参阅“[容错共识](/v1/ch9#容错共识)”。

## **数据仓库（data warehouse）**

  一个数据库，其中来自几个不同的OLTP系统的数据已经被合并和准备用于分析目的。请参阅“[数据仓库](/v1/ch3#数据仓库)”。

## **声明式（declarative）**

  描述某些东西应有的属性，但不指明如何实现它的确切步骤。在查询的上下文中，查询优化器采用声明性查询并决定如何最好地执行它。请参阅“[数据查询语言](/v1/ch2#数据查询语言)”。

## **非规范化（denormalize）**

  为了加速读取，在标准数据集中引入一些冗余或重复数据，通常采用缓存或索引的形式。非规范化的值是一种预先计算的查询结果，像物化视图。请参阅“[单对象和多对象操作](/v1/ch7#单对象和多对象操作)”和“[从同一事件日志中派生多个视图](/v1/ch11#从同一事件日志中派生多个视图)”。

## **衍生数据（derived data）**

  一种数据集，根据其他数据通过可重复运行的流程创建。必要时，你可以运行该流程再次创建衍生数据。衍生数据通常用于提高特定数据的读取速度。常见的衍生数据有索引、缓存和物化视图。请参阅[第三部分](/v1/part-iii)的介绍。

## **确定性（deterministic）**

  描述一个函数，如果给它相同的输入，则总是产生相同的输出。这意味着它不能依赖于随机数字、时间、网络通信或其他不可预测的事情。

## **分布式（distributed）**

  在由网络连接的多个节点上运行。对于部分节点故障，具有容错性：系统的一部分发生故障时，其他部分仍可以正常工作，通常情况下，软件无需了解故障相关的确切情况。请参阅“[故障与部分失效](/v1/ch8#故障与部分失效)”。

## **持久（durable）**

  以某种方式存储数据，即使发生各种故障，也不会丢失数据。请参阅“[持久性](/v1/ch7#持久性)”。

## **ETL（Extract-Transform-Load）**

  提取-转换-加载（Extract-Transform-Load）。从源数据库中提取数据，将其转换为更适合分析查询的形式，并将其加载到数据仓库或批处理系统中的过程。请参阅“[数据仓库](/v1/ch3#数据仓库)”。

## **故障切换（failover）**

  在具有单一领导者的系统中，故障切换是将领导角色从一个节点转移到另一个节点的过程。请参阅“[处理节点宕机](/v1/ch5#处理节点宕机)”。

## **容错（fault-tolerant）**

  如果出现问题（例如，机器崩溃或网络连接失败），可以自动恢复。请参阅“[可靠性](/v1/ch1#可靠性)”。

## **流量控制（flow control）**

  见背压（backpressure）。

## **追随者（follower）**

  一种数据副本，仅处理领导者或主库发出的数据变更，不直接接受来自客户端的任何写入。也称为备库、从库、只读副本或热备份。请参阅“[领导者与追随者](/v1/ch5#领导者与追随者)”。

## **全文检索（full-text search）**

  通过任意关键字来搜索文本，通常具有附加特征，例如匹配类似的拼写词或同义词。全文索引是一种支持这种查询的次级索引。请参阅“[全文搜索和模糊索引](/v1/ch3#全文搜索和模糊索引)”。

## **图（graph）**

  一种数据结构，由顶点（可以指向的东西，也称为节点或实体）和边（从一个顶点到另一个顶点的连接，也称为关系或弧）组成。请参阅“[图数据模型](/v1/ch2#图数据模型)”。

## **散列（hash）**

  将输入转换为看起来像随机数值的函数。相同的输入会转换为相同的数值，不同的输入一般会转换为不同的数值，也可能转换为相同数值（也被称为冲突）。请参阅“[根据键的散列分区](/v1/ch6#根据键的散列分区)”。

## **幂等（idempotent）**

  用于描述一种操作可以安全地重试执行，即执行多次的效果和执行一次的效果相同。请参阅“[幂等性](/v1/ch11#幂等性)”。

## **索引（index）**

  一种数据结构。通过索引，你可以根据特定字段的值，在所有数据记录中进行高效检索。请参阅“[驱动数据库的数据结构](/v1/ch3#驱动数据库的数据结构)”。

## **隔离性（isolation）**

  在事务上下文中，用于描述并发执行事务的互相干扰程度。串行运行具有最强的隔离性，不过其它程度的隔离也通常被使用。请参阅“[隔离性](/v1/ch7#隔离性)”。

## **连接（join）**

  汇集有共同点的记录。在一个记录与另一个记录有关（外键，文档参考，图中的边）的情况下最常用，查询需要获取参考所指向的记录。请参阅“[多对一和多对多的关系](/v1/ch2#多对一和多对多的关系)”和“[Reduce侧连接与分组](/v1/ch10#Reduce侧连接与分组)”。

## **领导者（leader）**

  当数据或服务被复制到多个节点时，领导者是被指定为可以接受数据变更的副本。领导者可以通过某些协议选举产生，也可以由管理员手动选择。领导者也被称为主库。请参阅“[领导者与追随者](/v1/ch5#领导者与追随者)”。

## **线性化（linearizable）**

  表现为系统中只有一份通过原子操作更新的数据副本。请参阅“[线性一致性](/v1/ch9#线性一致性)”。

## **局部性（locality）**

  一种性能优化方式，如果经常在相同的时间请求一些离散数据，把这些数据放到一个位置。请参阅“[查询的数据局部性](/v1/ch2#查询的数据局部性)”。

## **锁（lock）**

  一种保证只有一个线程、节点或事务可以访问的机制，如果其它线程、节点或事务想访问相同元素，则必须等待锁被释放。请参阅“[两阶段锁定](/v1/ch7#两阶段锁定)”和“[领导者和锁](/v1/ch8#领导者和锁)”。

## **日志（log）**

  日志是一个只能以追加方式写入的文件，用于存放数据。预写式日志用于在存储引擎崩溃时恢复数据（请参阅“[让B树更可靠](/v1/ch3#让B树更可靠)”）；结构化日志存储引擎使用日志作为它的主要存储格式（请参阅“[SSTables和LSM树](/v1/ch3#SSTables和LSM树)”）；复制型日志用于把写入从领导者复制到追随者（请参阅“[领导者与追随者](/v1/ch5#领导者与追随者)”）；事件性日志可以表现为数据流（请参阅“[分区日志](/v1/ch11#分区日志)”）。

## **物化（materialize）**

  急切地计算并写出结果，而不是在请求时计算。请参阅“[聚合：数据立方体和物化视图](/v1/ch3#聚合数据立方体和物化视图)”和“[物化中间状态](/v1/ch10#物化中间状态)”。

## **节点（node）**

  计算机上运行的一些软件的实例，通过网络与其他节点通信以完成某项任务。

## **规范化（normalized）**

  以没有冗余或重复的方式进行结构化。在规范化数据库中，当某些数据发生变化时，你只需要在一个地方进行更改，而不是在许多不同的地方复制很多次。请参阅“[多对一和多对多的关系](/v1/ch2#多对一和多对多的关系)”。

## **OLAP（Online Analytic Processing）**

  在线分析处理。通过对大量记录进行聚合（例如，计数，总和，平均）来表征的访问模式。请参阅“[事务处理还是分析？](/v1/ch3#事务处理还是分析)”。

## **OLTP（Online Transaction Processing）**

  在线事务处理。访问模式的特点是快速查询，读取或写入少量记录，这些记录通常通过键索引。请参阅“[事务处理还是分析？](/v1/ch3#事务处理还是分析)”。

## **分区（partitioning）**

  将单机上的大型数据集或计算结果拆分为较小部分，并将其分布到多台机器上。也称为分片。见[第六章](/v1/ch6)。

## **百分位点（percentile）**

  通过计算有多少值高于或低于某个阈值来衡量值分布的方法。例如，某个时间段的第95个百分位响应时间是时间t，则该时间段中，95%的请求完成时间小于t，5%的请求完成时间要比t长。请参阅“[描述性能](/v1/ch1#描述性能)”。

## **主键（primary key）**

  唯一标识记录的值（通常是数字或字符串）。在许多应用程序中，主键由系统在创建记录时生成（例如，按顺序或随机）；它们通常不由用户设置。另请参阅次级索引。

## **法定人数（quorum）**

  在操作完成之前，需要对操作进行投票的最少节点数量。请参阅“[读写的法定人数](/v1/ch5#读写的法定人数)”。

## **再平衡（rebalance）**

  将数据或服务从一个节点移动到另一个节点以实现负载均衡。请参阅“[分区再平衡](/v1/ch6#分区再平衡)”。

## **复制（replication）**

  在几个节点（副本）上保留相同数据的副本，以便在某些节点无法访问时，数据仍可访问。请参阅[第五章](/v1/ch5)。

## **模式（schema）**

  一些数据结构的描述，包括其字段和数据类型。可以在数据生命周期的不同点检查某些数据是否符合模式（请参阅“[文档模型中的模式灵活性](/v1/ch2#文档模型中的模式灵活性)”），模式可以随时间变化（请参阅[第四章](/v1/ch4)）。

## **次级索引（secondary index）**

  与主要数据存储器一起维护的附加数据结构，使你可以高效地搜索与某种条件相匹配的记录。请参阅“[其他索引结构](/v1/ch3#其他索引结构)”和“[分区与次级索引](/v1/ch6#分区与次级索引)”。

## **可串行化（serializable）**

  保证多个并发事务同时执行时，它们的行为与按顺序逐个执行事务相同。请参阅第七章的“[可串行化](/v1/ch7#可串行化)”。

## **无共享（shared-nothing）**

  与共享内存或共享磁盘架构相比，独立节点（每个节点都有自己的CPU，内存和磁盘）通过传统网络连接。见[第二部分](/v1/part-ii)的介绍。

## **偏斜（skew）**

  各分区负载不平衡，例如某些分区有大量请求或数据，而其他分区则少得多。也被称为热点。请参阅“[负载偏斜与热点消除](/v1/ch6#负载偏斜与热点消除)”和“[处理偏斜](/v1/ch10#处理偏斜)”。

  时间线异常导致事件以不期望的顺序出现。请参阅“[快照隔离和可重复读](/v1/ch7#快照隔离和可重复读)”中的关于读取偏差的讨论，“[写入偏差与幻读](/v1/ch7#写入偏差与幻读)”中的写入偏差以及“[有序事件的时间戳](/v1/ch8#有序事件的时间戳)”中的时钟偏斜。

## **脑裂（split brain）**

  两个节点同时认为自己是领导者的情况，这种情况可能违反系统保证。请参阅“[处理节点宕机](/v1/ch5#处理节点宕机)”和“[真相由多数所定义](/v1/ch8#真相由多数所定义)”。

## **存储过程（stored procedure）**

  一种对事务逻辑进行编码的方式，它可以完全在数据库服务器上执行，事务执行期间无需与客户端通信。请参阅“[真的串行执行](/v1/ch7#真的串行执行)”。

## **流处理（stream process）**

  持续运行的计算。可以持续接收事件流作为输入，并得出一些输出。见[第十一章](/v1/ch11)。

## **同步（synchronous）**

  异步的反义词。

## **记录系统（system of record）**

  一个保存主要权威版本数据的系统，也被称为真相的来源。首先在这里写入数据变更，其他数据集可以从记录系统衍生。请参阅[第三部分](/v1/part-iii)的介绍。

## **超时（timeout）**

  检测故障的最简单方法之一，即在一段时间内观察是否缺乏响应。但是，不可能知道超时是由于远程节点的问题还是网络中的问题造成的。请参阅“[超时与无穷的延迟](/v1/ch8#超时与无穷的延迟)”。

## **全序（total order）**

  一种比较事物的方法（例如时间戳），可以让你总是说出两件事中哪一件更大，哪件更小。如果一种顺序中存在某些无法比较的东西（不能说哪个更大或更小），则称为偏序。请参阅“[因果顺序不是全序的](/v1/ch9#因果顺序不是全序的)”。

## **事务（transaction）**

  为了简化错误处理和并发问题，将几个读写操作分组到一个逻辑单元中。见[第七章](/v1/ch7)。

## **两阶段提交（2PC, two-phase commit）**

  一种确保多个数据库节点全部提交或全部中止事务的算法。请参阅“[原子提交与两阶段提交](/v1/ch9#原子提交与两阶段提交)”。

## **两阶段锁定（2PL, two-phase locking）**

  一种用于实现可串行化隔离的算法，该算法通过事务获取对其读取或写入的所有数据的锁，直到事务结束。请参阅“[两阶段锁定](/v1/ch7#两阶段锁定)”。

## **无边界（unbounded）**

  没有任何已知的上限或大小。反义词是边界（bounded）。

================================================
FILE: content/v1/part-i.md
================================================
---
title: 第一部分：数据系统基础
weight: 100
breadcrumbs: false
---

本书前四章介绍了数据系统底层的基础概念，无论是在单台机器上运行的单点数据系统，还是分布在多台机器上的分布式数据系统都适用。

1. [第一章](/v1/ch1) 将介绍本书使用的术语和方法。**可靠性，可伸缩性和可维护性** ，这些词汇到底意味着什么？如何实现这些目标？
2. [第二章](/v1/ch2) 将对几种不同的 **数据模型和查询语言** 进行比较。从程序员的角度看，这是数据库之间最明显的区别。不同的数据模型适用于不同的应用场景。
3. [第三章](/v1/ch3) 将深入 **存储引擎** 内部，研究数据库如何在磁盘上摆放数据。不同的存储引擎针对不同的负载进行优化，选择合适的存储引擎对系统性能有巨大影响。
4. [第四章](/v1/ch4) 将对几种不同的 **数据编码** 进行比较。特别研究了这些格式在应用需求经常变化、模式需要随时间演变的环境中表现如何。

第二部分将专门讨论在 **分布式数据系统** 中特有的问题。


## 索引

* [第一章：可靠性、可伸缩性和可维护性](/v1/ch1)
    * [关于数据系统的思考](/v1/ch1#关于数据系统的思考)
    * [可靠性](/v1/ch1#可靠性)
    * [可伸缩性](/v1/ch1#可伸缩性)
    * [可维护性](/v1/ch1#可维护性)
    * [本章小结](/v1/ch1#本章小结)
* [第二章：数据模型与查询语言](/v1/ch2)
    * [关系模型与文档模型](/v1/ch2#关系模型与文档模型)
    * [数据查询语言](/v1/ch2#数据查询语言)
    * [图数据模型](/v1/ch2#图数据模型)
    * [本章小结](/v1/ch2#本章小结)
* [第三章：存储与检索](/v1/ch3)
    * [驱动数据库的数据结构](/v1/ch3#驱动数据库的数据结构)
    * [事务处理还是分析？](/v1/ch3#事务处理还是分析)
    * [列式存储](/v1/ch3#列式存储)
    * [本章小结](/v1/ch3#本章小结)
* [第四章：编码与演化](/v1/ch4)
    * [编码数据的格式](/v1/ch4#编码数据的格式)
    * [数据流的类型](/v1/ch4#数据流的类型)
    * [本章小结](/v1/ch4#本章小结)


================================================
FILE: content/v1/part-ii.md
================================================
---
title: 第二部分：分布式数据
weight: 200
breadcrumbs: false
---

> 一个成功的技术，现实的优先级必须高于公关，你可以糊弄别人，但糊弄不了自然规律。
>
> —— 罗杰斯委员会报告（1986）
>

-------

在本书的 [第一部分](/v1/part-i) 中，我们讨论了数据系统的各个方面，但仅限于数据存储在单台机器上的情况。现在我们到了 [第二部分](/v1/part-ii)，进入更高的层次，并提出一个问题：如果 **多台机器** 参与数据的存储和检索，会发生什么？

你可能会出于各种各样的原因，希望将数据库分布到多台机器上：

可伸缩性
: 如果你的数据量、读取负载、写入负载超出单台机器的处理能力，可以将负载分散到多台计算机上。

容错 / 高可用性
: 如果你的应用需要在单台机器（或多台机器，网络或整个数据中心）出现故障的情况下仍然能继续工作，则可使用多台机器，以提供冗余。一台故障时，另一台可以接管。

延迟
: 如果在世界各地都有用户，你也许会考虑在全球范围部署多个服务器，从而每个用户可以从地理上最近的数据中心获取服务，避免了等待网络数据包穿越半个世界。

## 伸缩至更高的载荷

如果你需要的只是伸缩至更高的 **载荷（load）**，最简单的方法就是购买更强大的机器（有时称为 **垂直伸缩**，即 vertical scaling，或 **向上伸缩**，即 scale up）。许多处理器，内存和磁盘可以在同一个操作系统下相互连接，快速的相互连接允许任意处理器访问内存或磁盘的任意部分。在这种 **共享内存架构（shared-memory architecture）** 中，所有的组件都可以看作一台单独的机器 [^i]。

[^i]: 在大型机中，尽管任意处理器都可以访问内存的任意部分，但总有一些内存区域与一些处理器更接近（称为 **非均匀内存访问（nonuniform memory access, NUMA）**【1】）。为了有效利用这种架构特性，需要对处理进行细分，以便每个处理器主要访问临近的内存，这意味着即使表面上看起来只有一台机器在运行，**分区（partitioning）** 仍然是必要的。

共享内存方法的问题在于，成本增长速度快于线性增长：一台有着双倍处理器数量，双倍内存大小，双倍磁盘容量的机器，通常成本会远远超过原来的两倍。而且可能因为存在瓶颈，并不足以处理双倍的载荷。

共享内存架构可以提供有限的容错能力，高端机器可以使用热插拔的组件（不关机更换磁盘，内存模块，甚至处理器）—— 但它必然囿于单个地理位置的桎梏。

另一种方法是 **共享磁盘架构（shared-disk architecture）**，它使用多台具有独立处理器和内存的机器，但将数据存储在机器之间共享的磁盘阵列上，这些磁盘通过快速网络连接 [^ii]。这种架构用于某些数据仓库，但竞争和锁定的开销限制了共享磁盘方法的可伸缩性【2】。

[^ii]: 网络附属存储（Network Attached Storage, NAS），或 **存储区网络（Storage Area Network, SAN）**

### 无共享架构

相比之下，**无共享架构**【3】（shared-nothing architecture，有时被称为 **水平伸缩**，即 horizontal scaling，或 **向外伸缩**，即 scaling out）已经相当普及。在这种架构中，运行数据库软件的每台机器 / 虚拟机都称为 **节点（node）**。每个节点只使用各自的处理器，内存和磁盘。节点之间的任何协调，都是在软件层面使用传统网络实现的。

无共享系统不需要使用特殊的硬件，所以你可以用任意机器 —— 比如性价比最好的机器。你也许可以跨多个地理区域分布数据从而减少用户延迟，或者在损失一整个数据中心的情况下幸免于难。随着云端虚拟机部署的出现，即使是小公司，现在无需 Google 级别的运维，也可以实现异地分布式架构。

在这一部分里，我们将重点放在无共享架构上。它不见得是所有场景的最佳选择，但它是最需要你谨慎从事的架构。如果你的数据分布在多个节点上，你需要意识到这样一个分布式系统中约束和权衡 —— 数据库并不能魔术般地把这些东西隐藏起来。

虽然分布式无共享架构有许多优点，但它通常也会给应用带来额外的复杂度，有时也会限制你可用数据模型的表达力。在某些情况下，一个简单的单线程程序可以比一个拥有超过 100 个 CPU 核的集群表现得更好【4】。另一方面，无共享系统可以非常强大。接下来的几章，将详细讨论分布式数据会带来的问题。

### 复制 vs 分区

数据分布在多个节点上有两种常见的方式：

复制（Replication）
: 在几个不同的节点上保存数据的相同副本，可能放在不同的位置。复制提供了冗余：如果一些节点不可用，剩余的节点仍然可以提供数据服务。复制也有助于改善性能。[第五章](/v1/ch5) 将讨论复制。

分区（Partitioning）
: 将一个大型数据库拆分成较小的子集（称为 **分区**，即 partitions），从而不同的分区可以指派给不同的 **节点**（nodes）。这种做法亦称 **分片**（sharding）。[第六章](/v1/ch6) 将讨论分区。

复制和分区是不同的机制，但它们经常同时使用。如 [图 II-1](/v1/ddia_part-ii_01.png) 所示。

![](/v1/ddia_part-ii_01.png)

**图 II-1 一个数据库切分为两个分区，每个分区都有两个副本**

理解了这些概念，就可以开始讨论在分布式系统中需要做出的困难抉择。[第七章](/v1/ch7) 将讨论 **事务（Transaction）**，这对于了解数据系统中可能出现的各种问题，以及我们可以做些什么很有帮助。[第八章](/v1/ch8) 和 [第九章](/v1/ch9) 将讨论分布式系统的根本局限性。

在本书的 [第三部分](/v1/part-iii) 中，将讨论如何将多个（可能是分布式的）数据存储集成为一个更大的系统，以满足复杂的应用需求。但首先，我们来聊聊分布式的数据。


## 索引

* [第五章：复制](/v1/ch5)
  * [领导者与追随者](/v1/ch5#领导者与追随者)
  * [复制延迟问题](/v1/ch5#复制延迟问题)
  * [多主复制](/v1/ch5#多主复制)
  * [无主复制](/v1/ch5#无主复制)
  * [本章小结](/v1/ch5#本章小结)
* [第六章：分区](/v1/ch6)
  * [分区与复制](/v1/ch6#分区与复制)
  * [键值数据的分区](/v1/ch6#键值数据的分区)
  * [分区与次级索引](/v1/ch6#分区与次级索引)
  * [分区再平衡](/v1/ch6#分区再平衡)
  * [请求路由](/v1/ch6#请求路由)
  * [本章小结](/v1/ch6#本章小结)
* [第七章：事务](/v1/ch7)
  * [事务的棘手概念](/v1/ch7#事务的棘手概念)
  * [弱隔离级别](/v1/ch7#弱隔离级别)
  * [可串行化](/v1/ch7#可串行化)
  * [本章小结](/v1/ch7#本章小结)
* [第八章：分布式系统的麻烦](/v1/ch8)
  * [故障与部分失效](/v1/ch8#故障与部分失效)
  * [不可靠的网络](/v1/ch8#不可靠的网络)
  * [不可靠的时钟](/v1/ch8#不可靠的时钟)
  * [知识、真相与谎言](/v1/ch8#知识真相与谎言)
  * [本章小结](/v1/ch8#本章小结)
* [第九章：一致性与共识](/v1/ch9)
  * [一致性保证](/v1/ch9#一致性保证)
  * [线性一致性](/v1/ch9#线性一致性)
  * [顺序保证](/v1/ch9#顺序保证)
  * [分布式事务与共识](/v1/ch9#分布式事务与共识)
  * [本章小结](/v1/ch9#本章小结)


## 参考文献

1. Ulrich Drepper: “[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf),” akkadia.org, November 21, 2007.
1. Ben Stopford: “[Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/),” benstopford.com, November 24, 2009.
1. Michael Stonebraker: “[The Case for Shared Nothing](http://db.cs.berkeley.edu/papers/hpts85-nothing.pdf),” IEEE Database Engineering Bulletin, volume 9, number 1, pages 4–9, March 1986.
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at 15th USENIX Workshop on Hot Topics in Operating Systems (HotOS), May 2015.

================================================
FILE: content/v1/part-iii.md
================================================
---
title: 第三部分：衍生数据
weight: 300
breadcrumbs: false
---


在本书的 [第一部分](/v1/part-i) 和 [第二部分](/v1/part-ii) 中，我们自底向上地把所有关于分布式数据库的主要考量都过了一遍。从数据在磁盘上的布局，一直到出现故障时分布式系统一致性的局限。但所有的讨论都假定了应用中只用了一种数据库。

现实世界中的数据系统往往更为复杂。大型应用程序经常需要以多种方式访问和处理数据，没有一个数据库可以同时满足所有这些不同的需求。因此应用程序通常组合使用多种组件：数据存储、索引、缓存、分析系统等等，并实现在这些组件中移动数据的机制。

本书的最后一部分，会研究将多个不同数据系统（可能有着不同数据模型，并针对不同的访问模式进行优化）集成为一个协调一致的应用架构时，会遇到的问题。软件供应商经常会忽略这一方面的生态建设，并声称他们的产品能够满足你的所有需求。在现实世界中，集成不同的系统是实际应用中最重要的事情之一。

## 记录系统和衍生数据系统

从高层次上看，存储和处理数据的系统可以分为两大类：

* 记录系统（System of record）

  **记录系统**，也被称为 **真相源（source of truth）**，持有数据的权威版本。当新的数据进入时（例如，用户输入）首先会记录在这里。每个事实正正好好表示一次（表示通常是 **规范化的**，即 normalized）。如果其他系统和 **记录系统** 之间存在任何差异，那么记录系统中的值是正确的（根据定义）。

* 衍生数据系统（Derived data systems）

  **衍生系统** 中的数据，通常是另一个系统中的现有数据以某种方式进行转换或处理的结果。如果丢失衍生数据，可以从原始来源重新创建。典型的例子是 **缓存（cache）**：如果数据在缓存中，就可以由缓存提供服务；如果缓存不包含所需数据，则降级由底层数据库提供。非规范化的值，索引和物化视图亦属此类。在推荐系统中，预测汇总数据通常衍生自用户日志。

从技术上讲，衍生数据是 **冗余的（redundant）**，因为它重复了已有的信息。但是衍生数据对于获得良好的只读查询性能通常是至关重要的。它通常是非规范化的。可以从单个源头衍生出多个不同的数据集，使你能从不同的 “视角” 洞察数据。

并不是所有的系统都在其架构中明确区分 **记录系统** 和 **衍生数据系统**，但是这是一种有用的区分方式，因为它明确了系统中的数据流：系统的哪一部分具有哪些输入和哪些输出，以及它们如何相互依赖。

大多数数据库，存储引擎和查询语言，本质上既不是记录系统也不是衍生系统。数据库只是一个工具：如何使用它取决于你自己。**记录系统和衍生数据系统之间的区别不在于工具，而在于应用程序中的使用方式。**

通过梳理数据的衍生关系，可以清楚地理解一个令人困惑的系统架构。这将贯穿本书的这一部分。

## 章节概述

我们将从 [第十章](/v1/ch10) 开始，研究例如 MapReduce 这样 **面向批处理（batch-oriented）** 的数据流系统。对于建设大规模数据系统，我们将看到，它们提供了优秀的工具和思想。[第十一章](/v1/ch11) 将把这些思想应用到 **流式数据（data streams）** 中，使我们能用更低的延迟完成同样的任务。[第十二章](/v1/ch12) 将对本书进行总结，探讨如何使用这些工具来构建可靠，可伸缩和可维护的应用。

## 索引

* [第十章：批处理](/v1/ch10)
  * [使用Unix工具的批处理](/v1/ch10#使用Unix工具的批处理)
  * [MapReduce和分布式文件系统](/v1/ch10#MapReduce和分布式文件系统)
  * [MapReduce之后](/v1/ch10#MapReduce之后)
  * [本章小结](/v1/ch10#本章小结)
* [第十一章：流处理](/v1/ch11)
  * [传递事件流](/v1/ch11#传递事件流)
  * [数据库与流](/v1/ch11#数据库与流)
  * [流处理](/v1/ch11#流处理)
  * [本章小结](/v1/ch11#本章小结)
* [第十二章：数据系统的未来](/v1/ch12)
  * [数据集成](/v1/ch12#数据集成)
  * [分拆数据库](/v1/ch12#分拆数据库)
  * [将事情做正确](/v1/ch12#将事情做正确)
  * [做正确的事情](/v1/ch12#做正确的事情)
  * [本章小结](/v1/ch12#本章小结)


================================================
FILE: content/v1/preface.md
================================================
---
title: 序言
weight: 50
breadcrumbs: false
---


如果近几年从业于软件工程，特别是服务器端和后端系统开发，那么你很有可能已经被大量关于数据存储和处理的时髦词汇轰炸过了： NoSQL！大数据！Web-Scale！分片！最终一致性！ACID！CAP 定理！云服务！MapReduce！实时！

在最近十年中，我们看到了很多有趣的进展，关于数据库，分布式系统，以及在此基础上构建应用程序的方式。这些进展有着各种各样的驱动力：

* 谷歌、雅虎、亚马逊、脸书、领英、微软和推特等互联网公司正在和巨大的流量 / 数据打交道，这迫使他们去创造能有效应对如此规模的新工具。
* 企业需要变得敏捷，需要低成本地检验假设，需要通过缩短开发周期和保持数据模型的灵活性，快速地响应新的市场洞察。
* 免费和开源软件变得非常成功，在许多环境中比商业软件和定制软件更受欢迎。
* 处理器主频几乎没有增长，但是多核处理器已经成为标配，网络也越来越快。这意味着并行化程度只增不减。
* 即使你在一个小团队中工作，现在也可以构建分布在多台计算机甚至多个地理区域的系统，这要归功于譬如亚马逊网络服务（AWS）等基础设施即服务（IaaS）概念的践行者。
* 许多服务都要求高可用，因停电或维护导致的服务不可用，变得越来越难以接受。

**数据密集型应用（data-intensive applications）** 正在通过使用这些技术进步来推动可能性的边界。一个应用被称为 **数据密集型** 的，如果 **数据是其主要挑战**（数据量，数据复杂度或数据变化速度）—— 与之相对的是 **计算密集型**，即处理器速度是其瓶颈。

帮助数据密集型应用存储和处理数据的工具与技术，正迅速地适应这些变化。新型数据库系统（“NoSQL”）已经备受关注，而消息队列，缓存，搜索索引，批处理和流处理框架以及相关技术也非常重要。很多应用组合使用这些工具与技术。

这些生意盎然的时髦词汇体现出人们对新的可能性的热情，这是一件好事。但是作为软件工程师和架构师，如果要开发优秀的应用，我们还需要对各种层出不穷的技术及其利弊权衡有精准的技术理解。为了获得这种洞察，我们需要深挖时髦词汇背后的内容。

幸运的是，在技术迅速变化的背后总是存在一些持续成立的原则，无论你使用了特定工具的哪个版本。如果你理解了这些原则，就可以领会这些工具的适用场景，如何充分利用它们，以及如何避免其中的陷阱。这正是本书的初衷。

本书的目标是帮助你在飞速变化的数据处理和数据存储技术大观园中找到方向。本书并不是某个特定工具的教程，也不是一本充满枯燥理论的教科书。相反，我们将看到一些成功数据系统的样例：许多流行应用每天都要在生产中满足可伸缩性、性能、以及可靠性的要求，而这些技术构成了这些应用的基础。

我们将深入这些系统的内部，理清它们的关键算法，讨论背后的原则和它们必须做出的权衡。在这个过程中，我们将尝试寻找 **思考** 数据系统的有效方式 —— 不仅关于它们 **如何** 工作，还包括它们 **为什么** 以这种方式工作，以及哪些问题是我们需要问的。

阅读本书后，你能很好地决定哪种技术适合哪种用途，并了解如何将工具组合起来，为一个良好应用架构奠定基础。本书并不足以使你从头开始构建自己的数据库存储引擎，不过幸运的是这基本上很少有必要。你将获得对系统底层发生事情的敏锐直觉，这样你就有能力推理它们的行为，做出优秀的设计决策，并追踪任何可能出现的问题。


## 本书的目标读者

如果你开发的应用具有用于存储或处理数据的某种服务器 / 后端系统，而且使用网络（例如，Web 应用、移动应用或连接到互联网的传感器），那么本书就是为你准备的。

本书是为软件工程师，软件架构师，以及喜欢写代码的技术经理准备的。如果你需要对所从事系统的架构做出决策 —— 例如你需要选择解决某个特定问题的工具，并找出如何最好地使用这些工具，那么这本书对你尤有价值。但即使你无法选择你的工具，本书仍将帮助你更好地了解所使用工具的长处和短处。

你应当具有一些开发 Web 应用或网络服务的经验，且应当熟悉关系型数据库和 SQL。任何你了解的非关系型数据库和其他与数据相关工具都会有所帮助，但不是必需的。对常见网络协议如 TCP 和 HTTP 的大概理解是有帮助的。编程语言或框架的选择对阅读本书没有任何不同影响。

如果以下任意一条对你为真，你会发现这本书很有价值：

* 你想了解如何使数据系统可伸缩，例如，支持拥有数百万用户的 Web 或移动应用。
* 你需要提高应用程序的可用性（最大限度地减少停机时间），保持稳定运行。
* 你正在寻找使系统在长期运行过程中易于维护的方法，即使系统规模增长，需求与技术也发生变化。
* 你对事物的运作方式有着天然的好奇心，并且希望知道一些主流网站和在线服务背后发生的事情。这本书剖析了各种数据库和数据处理系统的内部原理，探索这些系统设计中的智慧是非常有趣的。

有时在讨论可伸缩的数据系统时，人们会说：“你又不在谷歌或亚马逊，别操心可伸缩性了，直接上关系型数据库”。这个陈述有一定的道理：为了不必要的伸缩性而设计程序，不仅会浪费不必要的精力，并且可能会把你锁死在一个不灵活的设计中。实际上这是一种 “过早优化” 的形式。不过，选择合适的工具确实很重要，而不同的技术各有优缺点。我们将看到，关系数据库虽然很重要，但绝不是数据处理的终章。


## 本书涉及的领域

本书并不会尝试告诉读者如何安装或使用特定的软件包或 API，因为已经有大量文档给出了详细的使用说明。相反，我们会讨论数据系统的基础 —— 各种原则与利弊权衡，并探讨了不同产品所做出的不同设计决策。

在电子书中包含了在线资源全文的链接。所有链接在出版时都进行了验证，但不幸的是，由于网络的自然规律，链接往往会频繁地破损。如果你遇到链接断开的情况，或者正在阅读本书的打印副本，可以使用搜索引擎查找参考文献。对于学术论文，你可以在 Google 学术中搜索标题，查找可以公开获取的 PDF 文件。或者，你也可以在 https://github.com/ept/ddia-references 中找到所有的参考资料，我们在那儿维护最新的链接。

我们主要关注的是数据系统的 **架构（architecture）**，以及它们被集成到数据密集型应用中的方式。本书没有足够的空间覆盖部署、运维、安全、管理等领域 —— 这些都是复杂而重要的主题，仅仅在本书中用粗略的注解讨论这些对它们很不公平。每个领域都值得用单独的书去讲。

本书中描述的许多技术都被涵盖在 **大数据（Big Data）** 这个时髦词的范畴中。然而 “大数据” 这个术语被滥用，缺乏明确定义，以至于在严肃的工程讨论中没有用处。这本书使用歧义更小的术语，如 “单节点” 之于 “分布式系统”，或 “在线 / 交互式系统” 之于 “离线 / 批处理系统”。

本书对 **自由和开源软件（FOSS）** 有一定偏好，因为阅读、修改和执行源码是了解某事物详细工作原理的好方法。开放的平台也可以降低供应商垄断的风险。然而在适当的情况下，我们也会讨论专有软件（闭源软件，软件即服务 SaaS，或一些在文献中描述过但未公开发行的公司内部软件）。

## 本书纲要

本书分为三部分：

1. 在 [第一部分](/v1/part-i) 中，我们会讨论设计数据密集型应用所依赖的基本思想。我们从 [第一章](/v1/ch1) 开始，讨论我们实际要达到的目标：可靠性、可伸缩性和可维护性；我们该如何思考这些概念；以及如何实现它们。在 [第二章](/v1/ch2) 中，我们比较了几种不同的数据模型和查询语言，看看它们如何适用于不同的场景。在 [第三章](/v1/ch3) 中将讨论存储引擎：数据库如何在磁盘上摆放数据，以便能高效地再次找到它。[第四章](/v1/ch4) 转向数据编码（序列化），以及随时间演化的模式。

2. 在 [第二部分](/v1/part-ii) 中，我们从讨论存储在一台机器上的数据转向讨论分布在多台机器上的数据。这对于可伸缩性通常是必需的，但带来了各种独特的挑战。我们首先讨论复制（[第五章](/v1/ch5)）、分区 / 分片（[第六章](/v1/ch6)）和事务（[第七章](/v1/ch7)）。然后我们将探索关于分布式系统问题的更多细节（[第八章](/v1/ch8)），以及在分布式系统中实现一致性与共识意味着什么（[第九章](/v1/ch9)）。

3. 在 [第三部分](/v1/part-iii) 中，我们讨论那些从其他数据集衍生出一些数据集的系统。衍生数据经常出现在异构系统中：当没有单个数据库可以把所有事情都做得很好时，应用需要集成几种不同的数据库、缓存、索引等。在 [第十章](/v1/ch10) 中我们将从一种衍生数据的批处理方法开始，然后在此基础上建立在 [第十一章](/v1/ch11) 中讨论的流处理。最后，在 [第十二章](/v1/ch12) 中，我们将所有内容汇总，讨论在将来构建可靠、可伸缩和可维护的应用程序的方法。


## 参考文献与延伸阅读

本书中讨论的大部分内容已经在其它地方以某种形式出现过了 —— 会议演示文稿、研究论文、博客文章、代码、BUG 跟踪器、邮件列表以及工程习惯中。本书总结了不同来源资料中最重要的想法，并在文本中包含了指向原始文献的链接。如果你想更深入地探索一个领域，那么每章末尾的参考文献都是很好的资源，其中大部分可以免费在线获取。


## O’Reilly Safari

[Safari](http://oreilly.com/safari) (formerly Safari Books Online) is a membership-based training and reference platform for enterprise, government, educators, and individuals.

Members have access to thousands of books, training videos, Learning Paths, interactive tutorials, and curated playlists from over 250 publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others.

For more information, please visit http://oreilly.com/safari.


## 致谢

本书融合了学术研究和工业实践的经验，融合并系统化了大量其他人的想法与知识。在计算领域，我们往往会被各种新鲜花样所吸引，但我认为前人完成的工作中，有太多值得我们学习的地方了。本书有 800 多处引用：文章、博客、讲座、文档等，对我来说这些都是宝贵的学习资源。我非常感谢这些材料的作者分享他们的知识。

我也从与人交流中学到了很多东西，很多人花费了宝贵的时间与我讨论想法并耐心解释。特别感谢 Joe Adler, Ross Anderson, Peter Bailis, Márton Balassi, Alastair Beresford, Mark Callaghan, Mat Clayton, Patrick Collison, Sean Cribbs, Shirshanka Das, Niklas Ekström, Stephan Ewen, Alan Fekete, Gyula Fóra, Camille Fournier, Andres Freund, John Garbutt, Seth Gilbert, Tom Haggett, Pat Helland, Joe Hellerstein, Jakob Homan, Heidi Howard, John Hugg, Julian Hyde, Conrad Irwin, Evan Jones, Flavio Junqueira, Jessica Kerr, Kyle Kingsbury, Jay Kreps, Carl Lerche, Nicolas Liochon, Steve Loughran, Lee Mallabone, Nathan Marz, Caitie McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede, Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann, Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, 以及 Brett Wooldridge。

更多人通过审阅草稿并提供反馈意见在本书的创作过程中做出了无价的贡献。我要特别感谢 Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj, David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard, Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski, Phil Potter, Hamid Ramazani, Sam Stokes, 以及 Ben Summers。当然对于本书中的任何遗留错误或难以接受的见解，我都承担全部责任。

为了帮助这本书落地，并且耐心地处理我缓慢的写作和不寻常的要求，我要对编辑 Marie Beaugureau，Mike Loukides，Ann Spencer 和 O'Reilly 的所有团队表示感谢。我要感谢 Rachel Head 帮我找到了合适的术语。我要感谢 Alastair Beresford，Susan Goodhue，Neha Narkhede 和 Kevin Scott，在其他工作事务之外给了我充分的创作时间和自由。

特别感谢 Shabbir Diwan 和 Edie Freedman，他们非常用心地为各章配了地图。他们提出了不落俗套的灵感，创作了这些地图，美丽而引人入胜，真是太棒了。

最后我要表达对家人和朋友们的爱，没有他们，我将无法走完这个将近四年的写作历程。你们是最棒的。


================================================
FILE: content/v1/toc.md
================================================
---
title: "目录"
linkTitle: "目录"
weight: 10
breadcrumbs: false
---


![](/title-v1.jpg)

## [序言](/v1/preface)

## [第一部分：数据系统基础](/v1/part-i)

### [第一章：可靠性、可伸缩性和可维护性](/v1/ch1)
* [关于数据系统的思考](/v1/ch1#关于数据系统的思考)
* [可靠性](/v1/ch1#可靠性)
* [可伸缩性](/v1/ch1#可伸缩性)
* [可维护性](/v1/ch1#可维护性)
* [本章小结](/v1/ch1#本章小结)
### [第二章：数据模型与查询语言](/v1/ch2)
* [关系模型与文档模型](/v1/ch2#关系模型与文档模型)
* [数据查询语言](/v1/ch2#数据查询语言)
* [图数据模型](/v1/ch2#图数据模型)
* [本章小结](/v1/ch2#本章小结)
### [第三章：存储与检索](/v1/ch3)
* [驱动数据库的数据结构](/v1/ch3#驱动数据库的数据结构)
* [事务处理还是分析？](/v1/ch3#事务处理还是分析)
* [列式存储](/v1/ch3#列式存储)
* [本章小结](/v1/ch3#本章小结)
### [第四章：编码与演化](/v1/ch4)
* [编码数据的格式](/v1/ch4#编码数据的格式)
* [数据流的类型](/v1/ch4#数据流的类型)
* [本章小结](/v1/ch4#本章小结)

## [第二部分：分布式数据](/v1/part-ii)

### [第五章：复制](/v1/ch5)
* [领导者与追随者](/v1/ch5#领导者与追随者)
* [复制延迟问题](/v1/ch5#复制延迟问题)
* [多主复制](/v1/ch5#多主复制)
* [无主复制](/v1/ch5#无主复制)
* [本章小结](/v1/ch5#本章小结)
### [第六章：分区](/v1/ch6)
* [分区与复制](/v1/ch6#分区与复制)
* [键值数据的分区](/v1/ch6#键值数据的分区)
* [分区与次级索引](/v1/ch6#分区与次级索引)
* [分区再平衡](/v1/ch6#分区再平衡)
* [请求路由](/v1/ch6#请求路由)
* [本章小结](/v1/ch6#本章小结)
### [第七章：事务](/v1/ch7)
* [事务的棘手概念](/v1/ch7#事务的棘手概念)
* [弱隔离级别](/v1/ch7#弱隔离级别)
* [可串行化](/v1/ch7#可串行化)
* [本章小结](/v1/ch7#本章小结)
### [第八章：分布式系统的麻烦](/v1/ch8)
* [故障与部分失效](/v1/ch8#故障与部分失效)
* [不可靠的网络](/v1/ch8#不可靠的网络)
* [不可靠的时钟](/v1/ch8#不可靠的时钟)
* [知识、真相与谎言](/v1/ch8#知识真相与谎言)
* [本章小结](/v1/ch8#本章小结)
### [第九章：一致性与共识](/v1/ch9)
* [一致性保证](/v1/ch9#一致性保证)
* [线性一致性](/v1/ch9#线性一致性)
* [顺序保证](/v1/ch9#顺序保证)
* [分布式事务与共识](/v1/ch9#分布式事务与共识)
* [本章小结](/v1/ch9#本章小结)

## [第三部分：衍生数据](/v1/part-iii)

### [第十章：批处理](/v1/ch10)
* [使用Unix工具的批处理](/v1/ch10#使用Unix工具的批处理)
* [MapReduce和分布式文件系统](/v1/ch10#MapReduce和分布式文件系统)
* [MapReduce之后](/v1/ch10#MapReduce之后)
* [本章小结](/v1/ch10#本章小结)
### [第十一章：流处理](/v1/ch11)
* [传递事件流](/v1/ch11#传递事件流)
* [数据库与流](/v1/ch11#数据库与流)
* [流处理](/v1/ch11#流处理)
* [本章小结](/v1/ch11#本章小结)
### [第十二章：数据系统的未来](/v1/ch12)
* [数据集成](/v1/ch12#数据集成)
* [分拆数据库](/v1/ch12#分拆数据库)
* [将事情做正确](/v1/ch12#将事情做正确)
* [做正确的事情](/v1/ch12#做正确的事情)
* [本章小结](/v1/ch12#本章小结)

### [术语表](/v1/glossary)

### [后记](/v1/colophon)


================================================
FILE: content/v1_tw/_index.md
================================================
---
title: 設計資料密集型應用（第一版）
linkTitle: DDIA
cascade:
  type: docs
breadcrumbs: false
---


**作者**： [Martin Kleppmann](https://martin.kleppmann.com)，[《Designing Data-Intensive Applications 2nd Edition》](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html) ： 英國劍橋大學分散式系統研究員，演講者，博主和開源貢獻者，軟體工程師和企業家，曾在 LinkedIn 和 Rapportive 負責資料基礎架構。

**譯者**：[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
[**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，公眾號：《老馮雲數》，資料庫 KOL。

**校訂**： [@yingang](https://github.com/yingang)  ｜  [繁體中文](/tw) **版本維護** by  [@afunTW](https://github.com/afunTW) ｜ [完整貢獻者列表](/contrib)

> [!NOTE]
> DDIA [**第二版**](/zh) 正在翻譯中 ([`content/v2`](https://github.com/Vonng/ddia/tree/main) 目錄)，歡迎加入並提出您的寶貴意見！[點選此處閱覽第二版](/zh)。


## 譯序

> 不懂資料庫的全棧工程師不是好架構師 —— 馮若航 / Vonng

現今，尤其是在網際網路領域，大多數應用都屬於資料密集型應用。本書從底層資料結構到頂層架構設計，將資料系統設計中的精髓娓娓道來。其中的寶貴經驗無論是對架構師、DBA、還是後端工程師、甚至產品經理都會有幫助。

這是一本理論結合實踐的書，書中很多問題，譯者在實際場景中都曾遇到過，讀來讓人擊節扼腕。如果能早點讀到這本書，該少走多少彎路啊！

這也是一本深入淺出的書，講述概念的來龍去脈而不是賣弄定義，介紹事物發展演化歷程而不是事實堆砌，將複雜的概念講述的淺顯易懂，但又直擊本質不失深度。每章最後的引用質量非常好，是深入學習各個主題的絕佳索引。

本書為資料系統的設計、實現、與評價提供了很好的概念框架。讀完並理解本書內容後，讀者可以輕鬆看破大多數的技術忽悠，與技術磚家撕起來虎虎生風。

這是 2017 年譯者讀過最好的一本技術類書籍，這麼好的書沒有中文翻譯，實在是遺憾。某不才，願為先進技術文化的傳播貢獻一份力量。既可以深入學習有趣的技術主題，又可以鍛鍊中英文語言文字功底，何樂而不為？


## 前言

> 在我們的社會中，技術是一種強大的力量。資料、軟體、通訊可以用於壞的方面：不公平的階級固化，損害公民權利，保護既得利益集團。但也可以用於好的方面：讓底層人民發出自己的聲音，讓每個人都擁有機會，避免災難。本書獻給所有將技術用於善途的人們。


> 計算是一種流行文化，流行文化鄙視歷史。流行文化關乎個體身份和參與感，但與合作無關。流行文化活在當下，也與過去和未來無關。我認為大部分（為了錢）編寫程式碼的人就是這樣的，他們不知道自己的文化來自哪裡。
>
>  —— 阿蘭・凱接受 Dobb 博士的雜誌採訪時（2012 年）


## 目錄

### [序言](/v1_tw/preface)

### [第一部分：資料系統基礎](/v1_tw/part-i)

* [第一章：可靠性、可伸縮性和可維護性](/v1_tw/ch1)
* [第二章：資料模型與查詢語言](/v1_tw/ch2)
* [第三章：儲存與檢索](/v1_tw/ch3)
* [第四章：編碼與演化](/v1_tw/ch4)

### [第二部分：分散式資料](/v1_tw/part-ii)

* [第五章：複製](/v1_tw/ch5)
* [第六章：分割槽](/v1_tw/ch6)
* [第七章：事務](/v1_tw/ch7)
* [第八章：分散式系統的麻煩](/v1_tw/ch8)
* [第九章：一致性與共識](/v1_tw/ch9)

### [第三部分：衍生資料](/v1_tw/part-iii)

* [第十章：批處理](/v1_tw/ch10)
* [第十一章：流處理](/v1_tw/ch11)
* [第十二章：資料系統的未來](/v1_tw/ch12)

### [術語表](/v1_tw/glossary)

### [後記](/v1_tw/colophon)

<br>

---------

## 法律宣告

從原作者處得知，已經有簡體中文的翻譯計劃，將於 2018 年末完成。[購買地址](https://search.jd.com/Search?keyword=設計資料密集型應用)

譯者純粹出於 **學習目的** 與 **個人興趣** 翻譯本書，不追求任何經濟利益。

譯者保留對此版本譯文的署名權，其他權利以原作者和出版社的主張為準。

本譯文只供學習研究參考之用，不得公開傳播發行或用於商業用途。有能力閱讀英文書籍者請購買正版支援。


---------

## 貢獻

0. 全文校訂 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章語法標點校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 與[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/v1_tw/part-i) 前言，ch2 校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. 詞彙表、後記關於野豬的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. 繁體中文版本與轉換指令碼 by [@afunTW](https://github.com/afunTW)
7. 多處翻譯修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)
8. [感謝所有作出貢獻，提出意見的朋友們](/contrib)：

<details>
<summary><a href="https://github.com/Vonng/ddia/pulls">Pull Requests</a> & <a href="https://github.com/Vonng/ddia/issues">Issues</a></summary>

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [386](https://github.com/Vonng/ddia/pull/386)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch2: 最佳化一處翻譯                                                    |
| [384](https://github.com/Vonng/ddia/pull/384)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 最佳化中文文件的措辭和表達                                              |
| [383](https://github.com/Vonng/ddia/pull/383)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 修正 ch4 中的術語和表達錯誤                                          |
| [382](https://github.com/Vonng/ddia/pull/382)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch1: 最佳化一處翻譯                                                    |
| [381](https://github.com/Vonng/ddia/pull/381)   | [@Max-Tortoise](https://github.com/Max-Tortoise)           | ch4: 修正一處術語不完整問題                                               |
| [377](https://github.com/Vonng/ddia/pull/377)   | [@huang06](https://github.com/huang06)                     | 最佳化翻譯術語                                                        |
| [375](https://github.com/Vonng/ddia/issues/375) | [@z-soulx](https://github.com/z-soulx)                     | 對於是否100%全中文翻譯的必要性討論？個人-沒必要100%，特別是“名詞”，有原單詞更加適合it人員                 |
| [371](https://github.com/Vonng/ddia/pull/371)   | [@lewiszlw](https://github.com/lewiszlw)                   | CPU core -> CPU 核心                                          |
| [369](https://github.com/Vonng/ddia/pull/369)   | [@bbwang-gl](https://github.com/bbwang-gl)                 | ch7: 可序列化快照隔離檢測一個事務何時修改另一個事務的讀取                                 |
| [368](https://github.com/Vonng/ddia/pull/368)   | [@yhao3](https://github.com/yhao3)                         | 更新 zh-tw.py 與 zh-tw 內容                                       |
| [367](https://github.com/Vonng/ddia/pull/367)   | [@yhao3](https://github.com/yhao3)                         | 修正拼寫、格式和標點問題                                                  |
| [366](https://github.com/Vonng/ddia/pull/366)   | [@yangshangde](https://github.com/yangshangde)             | ch8: 將“電源失敗”改為“電源失效”                                           |
| [365](https://github.com/Vonng/ddia/pull/365)   | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化“儲存與計算分離”相關翻譯                                           |
| [364](https://github.com/Vonng/ddia/issues/364) | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化“儲存與計算分離”相關翻譯                                           |
| [363](https://github.com/Vonng/ddia/pull/363)   | [@xyohn](https://github.com/xyohn)                         | #362: 最佳化一處翻譯                                                 |
| [362](https://github.com/Vonng/ddia/issues/362) | [@xyohn](https://github.com/xyohn)                         | ch1: 最佳化一處翻譯                                                   |
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一處拼寫錯誤                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一處拼寫錯誤                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一處標點錯誤                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一處格式錯誤                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一處參考連結                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正兩處引用錯誤                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支援輸出為 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一處格式錯誤                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一處影像連結                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 最佳化一處翻譯                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 最佳化一處翻譯                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 最佳化兩處翻譯                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 最佳化多處翻譯                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 最佳化一處翻譯                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一處繁體中文錯誤                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一處繁體中文錯誤                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一處翻譯錯誤                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正幾處拼寫錯誤                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 最佳化一處翻譯                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一處翻譯錯誤                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一處翻譯遺漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 最佳化一處翻譯                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 最佳化一處翻譯                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 最佳化一處翻譯                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 最佳化一處翻譯                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正兩處錯誤                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一處列表錯誤                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一處錯別字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一處公式問題                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多處內部連結錯誤                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正內部連結錯誤                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 顯示的問題                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 發現了繁體中文版本中的錯誤翻譯                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 連結                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正錯別字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 統一了 write skew 的翻譯                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 統一了 rebalancing 的翻譯                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻譯                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正譯文中的重複單詞                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不準確的翻譯                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一處翻譯錯誤                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一處拼寫錯誤                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可序列化”相關內容的多處翻譯不當                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重複讀”相關內容的多處翻譯不當                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“讀已提交”相關內容的多處翻譯不當                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁體中文的轉譯錯誤                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁體中文的轉譯錯誤                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通順的翻譯                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通順的翻譯                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正確的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通順的翻譯                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻譯                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正錯誤的圖片連結                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁體中文的轉譯錯誤：複雜                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正導航欄中的章節名稱                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正線性一致的繁體中文翻譯                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正錯誤的翻譯                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 最佳化譯文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通順的翻譯                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不準確的翻譯                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻譯                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正錯別字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小標題跳轉的問題                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的網址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正錯別字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建議docsify的主題風格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻譯錯誤                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁體中文的轉譯錯誤                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支援 Github Pages 裡的公式顯示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 語義網相關翻譯更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不變式相關翻譯更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正確的中文用詞和標點符號                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻譯                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重複的譯文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通順的翻譯                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 發現錯誤的文獻索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正錯誤的標點符號                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正錯誤字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建議將 network model 翻譯為網狀模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正錯誤字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正縮圖的錯別字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修訂sibling相關的翻譯                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一處不準確的翻譯                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 識別了當前簡繁轉譯過程中處理不當的地方，暫透過轉換指令碼規避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻譯`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新殘留的機翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建議去除段首的製表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 發現一處錯誤格式的章節引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章節Summary中多處不通順的翻譯                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多處不通順的或錯誤的翻譯                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 最佳化多處不通順的或錯誤的翻譯                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 最佳化多處不通順的或錯誤的翻譯                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化多處錯誤的或不通順的翻譯                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化一處容易產生歧義的翻譯                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正兩處錯誤的翻譯                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正兩處強調文字和四處程式碼變數名稱                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一處錯誤的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一處錯誤的翻譯（功能 -> 函式）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 最佳化 how best 的翻譯（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 統一每章的標題格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重複詞語                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改語句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 讀已寫入資料                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死還是賴活著                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼殺 → 破壞                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改變" rathr than "蓋面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:鍾-->種，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否則 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->呼叫，顯著->顯著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改連結錯誤                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4幾處翻譯                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 幾個疏漏和格式錯誤                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 刪除一個多餘的右括號                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部連結錯誤                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假簡單"->"更加簡單"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修復 ch1 中的無序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 糾正多處的翻譯小錯誤                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修復多個連結錯誤 2.名詞最佳化修訂 3.錯誤修訂                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修復一些明顯錯誤                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修復連結錯誤                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改詞語順序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正錯別字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 糾正翻譯錯誤                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目錄和本章標題不符的情況                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修復語句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 詳細修改了後記中和印度野豬相關的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻譯                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01語法微調                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |

</details><br />


---------

## 許可證

本專案採用 [CC-BY 4.0](https://github.com/Vonng/ddia/blob/master/LICENSE) 許可證，您可以在這裡找到完整說明：

- [署名 4.0 協議國際版 CC BY 4.0 Deed](https://creativecommons.org/licenses/by/4.0/deed.zh-hans)
- [Attribution 4.0 International CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.en)

================================================
FILE: content/v1_tw/ch1.md
================================================
---
title: "第一章：可靠性、可伸縮性和可維護性"
linkTitle: "1. 可靠性、可伸縮性和可維護性"
weight: 101
breadcrumbs: false
---


![](/map/ch01.png)

> 網際網路做得太棒了，以至於大多數人將它看作像太平洋這樣的自然資源，而不是什麼人工產物。上一次出現這種大規模且無差錯的技術，你還記得是什麼時候嗎？
>
> —— [艾倫・凱](http://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442) 在接受 Dobb 博士雜誌採訪時說（2012 年）

現今很多應用程式都是 **資料密集型（data-intensive）** 的，而非 **計算密集型（compute-intensive）** 的。因此 CPU 很少成為這類應用的瓶頸，更大的問題通常來自資料量、資料複雜性、以及資料的變更速度。

資料密集型應用通常由標準元件構建而成，標準元件提供了很多通用的功能；例如，許多應用程式都需要：

 - 儲存資料，以便自己或其他應用程式之後能再次找到 （*資料庫，即 databases*）
 - 記住開銷昂貴操作的結果，加快讀取速度（*快取，即 caches*）
 - 允許使用者按關鍵字搜尋資料，或以各種方式對資料進行過濾（*搜尋索引，即 search indexes*）
 - 向其他程序傳送訊息，進行非同步處理（*流處理，即 stream processing*）
 - 定期處理累積的大批次資料（*批處理，即 batch processing*）

如果這些功能聽上去平淡無奇，那是因為這些 **資料系統（data system）** 是非常成功的抽象：我們一直不假思索地使用它們並習以為常。絕大多數工程師不會幻想從零開始編寫儲存引擎，因為在開發應用時，資料庫已經是足夠完美的工具了。

但現實沒有這麼簡單。不同的應用有著不同的需求，因而資料庫系統也是百花齊放，有著各式各樣的特性。實現快取有很多種手段，建立搜尋索引也有好幾種方法，諸如此類。因此在開發應用前，我們依然有必要先弄清楚最適合手頭工作的工具和方法。而且當單個工具解決不了你的問題時，組合使用這些工具可能還是有些難度的。

本書將是一趟關於資料系統原理、實踐與應用的旅程，並講述了設計資料密集型應用的方法。我們將探索不同工具之間的共性與特性，以及各自的實現原理。

本章將從我們所要實現的基礎目標開始：可靠、可伸縮、可維護的資料系統。我們將澄清這些詞語的含義，概述考量這些目標的方法，並回顧一些後續章節所需的基礎知識。在接下來的章節中我們將抽絲剝繭，研究設計資料密集型應用時可能遇到的設計決策。


## 關於資料系統的思考

我們通常認為，資料庫、訊息佇列、快取等工具分屬於幾個差異顯著的類別。雖然資料庫和訊息佇列表面上有一些相似性 —— 它們都會儲存一段時間的資料 —— 但它們有迥然不同的訪問模式，這意味著迥異的效能特徵和實現手段。

那我們為什麼要把這些東西放在 **資料系統（data system）** 的總稱之下混為一談呢？

近些年來，出現了許多新的資料儲存工具與資料處理工具。它們針對不同應用場景進行最佳化，因此不再適合生硬地歸入傳統類別【1】。類別之間的界限變得越來越模糊，例如：資料儲存可以被當成訊息佇列用（Redis），訊息佇列則帶有類似資料庫的持久保證（Apache Kafka）。

其次，越來越多的應用程式有著各種嚴格而廣泛的要求，單個工具不足以滿足所有的資料處理和儲存需求。取而代之的是，總體工作被拆分成一系列能被單個工具高效完成的任務，並透過應用程式碼將它們縫合起來。

例如，如果將快取（應用管理的快取層，Memcached 或同類產品）和全文搜尋（全文搜尋伺服器，例如 Elasticsearch 或 Solr）功能從主資料庫剝離出來，那麼使快取 / 索引與主資料庫保持同步通常是應用程式碼的責任。[圖 1-1](/v1/ddia_0101.png) 給出了這種架構可能的樣子（細節將在後面的章節中詳細介紹）。

![](/v1/ddia_0101.png)

**圖 1-1 一個可能的組合使用多個元件的資料系統架構**

當你將多個工具組合在一起提供服務時，服務的介面或 **應用程式程式設計介面（API, Application Programming Interface）** 通常向客戶端隱藏這些實現細節。現在，你基本上已經使用較小的通用元件建立了一個全新的、專用的資料系統。這個新的複合資料系統可能會提供特定的保證，例如：快取在寫入時會作廢或更新，以便外部客戶端獲取一致的結果。現在你不僅是應用程式開發人員，還是資料系統設計人員了。

設計資料系統或服務時可能會遇到很多棘手的問題，例如：當系統出問題時，如何確保資料的正確性和完整性？當部分系統退化降級時，如何為客戶提供始終如一的良好效能？當負載增加時，如何擴容應對？什麼樣的 API 才是好的 API？

影響資料系統設計的因素很多，包括參與人員的技能和經驗、歷史遺留問題、系統路徑依賴、交付時限、公司的風險容忍度、監管約束等，這些因素都需要具體問題具體分析。

本書著重討論三個在大多數軟體系統中都很重要的問題：

可靠性（Reliability）
: 系統在 **困境**（adversity，比如硬體故障、軟體故障、人為錯誤）中仍可正常工作（正確完成功能，並能達到期望的效能水準）。請參閱 “[可靠性](#可靠性)”。

可伸縮性（Scalability）
: 有合理的辦法應對系統的增長（資料量、流量、複雜性）。請參閱 “[可伸縮性](#可伸縮性)”。

可維護性（Maintainability）
: 許多不同的人（工程師、運維）在不同的生命週期，都能高效地在系統上工作（使系統保持現有行為，並適應新的應用場景）。請參閱 “[可維護性](#可維護性)”。

人們經常追求這些詞彙，卻沒有清楚理解它們到底意味著什麼。為了工程的嚴謹性，本章的剩餘部分將探討可靠性、可伸縮性和可維護性的含義。為實現這些目標而使用的各種技術，架構和演算法將在後續的章節中研究。


## 可靠性

人們對於一個東西是否可靠，都有一個直觀的想法。人們對可靠軟體的典型期望包括：

* 應用程式表現出使用者所期望的功能。
* 允許使用者犯錯，允許使用者以出乎意料的方式使用軟體。
* 在預期的負載和資料量下，效能滿足要求。
* 系統能防止未經授權的訪問和濫用。

如果所有這些在一起意味著 “正確工作”，那麼可以把可靠性粗略理解為 “即使出現問題，也能繼續正確工作”。

造成錯誤的原因叫做 **故障（fault）**，能預料並應對故障的系統特性可稱為 **容錯（fault-tolerant）** 或 **回彈性（resilient）**。“**容錯**” 一詞可能會產生誤導，因為它暗示著系統可以容忍所有可能的錯誤，但在實際中這是不可能的。比方說，如果整個地球（及其上的所有伺服器）都被黑洞吞噬了，想要容忍這種錯誤，需要把網路託管到太空中 —— 這種預算能不能批准就祝你好運了。所以在討論容錯時，只有談論特定型別的錯誤才有意義。

注意 **故障（fault）** 不同於 **失效（failure）**【2】。**故障** 通常定義為系統的一部分狀態偏離其標準，而 **失效** 則是系統作為一個整體停止向使用者提供服務。故障的機率不可能降到零，因此最好設計容錯機制以防因 **故障** 而導致 **失效**。本書中我們將介紹幾種用不可靠的部件構建可靠系統的技術。

反直覺的是，在這類容錯系統中，透過故意觸發來 **提高** 故障率是有意義的，例如：在沒有警告的情況下隨機地殺死單個程序。許多高危漏洞實際上是由糟糕的錯誤處理導致的【3】，因此我們可以透過故意引發故障來確保容錯機制不斷執行並接受考驗，從而提高故障自然發生時系統能正確處理的信心。Netflix 公司的 *Chaos Monkey*【4】就是這種方法的一個例子。

儘管比起 **阻止錯誤（prevent error）**，我們通常更傾向於 **容忍錯誤**，但也有 **預防勝於治療** 的情況（比如不存在治療方法時）。安全問題就屬於這種情況。例如，如果攻擊者破壞了系統，並獲取了敏感資料，這種事是撤銷不了的。但本書主要討論的是可以恢復的故障種類，正如下面幾節所述。

### 硬體故障

當想到系統失效的原因時，**硬體故障（hardware faults）** 總會第一個進入腦海。硬碟崩潰、記憶體出錯、機房斷電、有人拔錯網線…… 任何與大型資料中心打過交道的人都會告訴你：一旦你擁有很多機器，這些事情 **總** 會發生！

據報道稱，硬碟的 **平均無故障時間（MTTF, mean time to failure）** 約為 10 到 50 年【5】【6】。因此從數學期望上講，在擁有 10000 個磁碟的儲存叢集上，平均每天會有 1 個磁碟出故障。

為了減少系統的故障率，第一反應通常都是增加單個硬體的冗餘度，例如：磁碟可以組建 RAID，伺服器可能有雙路電源和熱插拔 CPU，資料中心可能有電池和柴油發電機作為後備電源，某個元件掛掉時冗餘元件可以立刻接管。這種方法雖然不能完全防止由硬體問題導致的系統失效，但它簡單易懂，通常也足以讓機器不間斷執行很多年。

直到最近，硬體冗餘對於大多數應用來說已經足夠了，它使單臺機器完全失效變得相當罕見。只要你能快速地把備份恢復到新機器上，故障停機時間對大多數應用而言都算不上災難性的。只有少量高可用性至關重要的應用才會要求有多套硬體冗餘。

但是隨著資料量和應用計算需求的增加，越來越多的應用開始大量使用機器，這會相應地增加硬體故障率。此外，在類似亞馬遜 AWS（Amazon Web Services）的一些雲服務平臺上，虛擬機器例項不可用卻沒有任何警告也是很常見的【7】，因為雲平臺的設計就是優先考慮 **靈活性（flexibility）** 和 **彈性（elasticity）**[^i]，而不是單機可靠性。

如果在硬體冗餘的基礎上進一步引入軟體容錯機制，那麼系統在容忍整個（單臺）機器故障的道路上就更進一步了。這樣的系統也有運維上的便利，例如：如果需要重啟機器（例如應用作業系統安全補丁），單伺服器系統就需要計劃停機。而允許機器失效的系統則可以一次修復一個節點，無需整個系統停機。

[^i]: 在 [應對負載的方法](#應對負載的方法) 一節定義

### 軟體錯誤

我們通常認為硬體故障是隨機的、相互獨立的：一臺機器的磁碟失效並不意味著另一臺機器的磁碟也會失效。雖然大量硬體元件之間可能存在微弱的相關性（例如伺服器機架的溫度等共同的原因），但同時發生故障也是極為罕見的。

另一類錯誤是內部的 **系統性錯誤（systematic error）**【8】。這類錯誤難以預料，而且因為是跨節點相關的，所以比起不相關的硬體故障往往可能造成更多的 **系統失效**【5】。例子包括：

* 接受特定的錯誤輸入，便導致所有應用伺服器例項崩潰的 BUG。例如 2012 年 6 月 30 日的閏秒，由於 Linux 核心中的一個錯誤【9】，許多應用同時掛掉了。
* 失控程序會用盡一些共享資源，包括 CPU 時間、記憶體、磁碟空間或網路頻寬。
* 系統依賴的服務變慢，沒有響應，或者開始返回錯誤的響應。
* 級聯故障，一個元件中的小故障觸發另一個元件中的故障，進而觸發更多的故障【10】。

導致這類軟體故障的 BUG 通常會潛伏很長時間，直到被異常情況觸發為止。這種情況意味著軟體對其環境做出了某種假設 —— 雖然這種假設通常來說是正確的，但由於某種原因最後不再成立了【11】。

雖然軟體中的系統性故障沒有速效藥，但我們還是有很多小辦法，例如：仔細考慮系統中的假設和互動；徹底的測試；程序隔離；允許程序崩潰並重啟；測量、監控並分析生產環境中的系統行為。如果系統能夠提供一些保證（例如在一個訊息佇列中，進入與發出的訊息數量相等），那麼系統就可以在執行時不斷自檢，並在出現 **差異（discrepancy）** 時報警【12】。

### 人為錯誤

設計並構建了軟體系統的工程師是人類，維持系統執行的運維也是人類。即使他們懷有最大的善意，人類也是不可靠的。舉個例子，一項關於大型網際網路服務的研究發現，運維配置錯誤是導致服務中斷的首要原因，而硬體故障（伺服器或網路）僅導致了 10-25% 的服務中斷【13】。

儘管人類不可靠，但怎麼做才能讓系統變得可靠？最好的系統會組合使用以下幾種辦法：

* 以最小化犯錯機會的方式設計系統。例如，精心設計的抽象、API 和管理後臺使做對事情更容易，搞砸事情更困難。但如果介面限制太多，人們就會忽略它們的好處而想辦法繞開。很難正確把握這種微妙的平衡。
* 將人們最容易犯錯的地方與可能導致失效的地方 **解耦（decouple）**。特別是提供一個功能齊全的非生產環境 **沙箱（sandbox）**，使人們可以在不影響真實使用者的情況下，使用真實資料安全地探索和實驗。
* 在各個層次進行徹底的測試【3】，從單元測試、全系統整合測試到手動測試。自動化測試易於理解，已經被廣泛使用，特別適合用來覆蓋正常情況中少見的 **邊緣場景（corner case）**。
* 允許從人為錯誤中簡單快速地恢復，以最大限度地減少失效情況帶來的影響。例如，快速回滾配置變更，分批發布新程式碼（以便任何意外錯誤只影響一小部分使用者），並提供資料重算工具（以備舊的計算出錯）。
* 配置詳細和明確的監控，比如效能指標和錯誤率。在其他工程學科中這指的是 **遙測（telemetry）**（一旦火箭離開了地面，遙測技術對於跟蹤發生的事情和理解失敗是至關重要的）。監控可以向我們發出預警訊號，並允許我們檢查是否有任何地方違反了假設和約束。當出現問題時，指標資料對於問題診斷是非常寶貴的。
* 良好的管理實踐與充分的培訓 —— 一個複雜而重要的方面，但超出了本書的範圍。

### 可靠性有多重要？

可靠性不僅僅是針對核電站和空中交通管制軟體而言，我們也期望更多平凡的應用能可靠地執行。商務應用中的錯誤會導致生產力損失（也許資料報告不完整還會有法律風險），而電商網站的中斷則可能會導致收入和聲譽的巨大損失。

即使在 “非關鍵” 應用中，我們也對使用者負有責任。試想一位家長把所有的照片和孩子的影片儲存在你的照片應用裡【15】。如果資料庫突然損壞，他們會感覺如何？他們可能會知道如何從備份恢復嗎？

在某些情況下，我們可能會選擇犧牲可靠性來降低開發成本（例如為未經證實的市場開發產品原型）或運營成本（例如利潤率極低的服務），但我們偷工減料時，應該清楚意識到自己在做什麼。


## 可伸縮性

系統今天能可靠執行，並不意味未來也能可靠執行。服務 **降級（degradation）** 的一個常見原因是負載增加，例如：系統負載已經從一萬個併發使用者增長到十萬個併發使用者，或者從一百萬增長到一千萬。也許現在處理的資料量級要比過去大得多。

**可伸縮性（Scalability）** 是用來描述系統應對負載增長能力的術語。但是請注意，這不是貼在系統上的一維標籤：說 “X 可伸縮” 或 “Y 不可伸縮” 是沒有任何意義的。相反，討論可伸縮性意味著考慮諸如 “如果系統以特定方式增長，有什麼選項可以應對增長？” 和 “如何增加計算資源來處理額外的負載？” 等問題。

### 描述負載

在討論增長問題（如果負載加倍會發生什麼？）前，首先要能簡要描述系統的當前負載。負載可以用一些稱為 **負載引數（load parameters）** 的數字來描述。引數的最佳選擇取決於系統架構，它可能是每秒向 Web 伺服器發出的請求、資料庫中的讀寫比率、聊天室中同時活躍的使用者數量、快取命中率或其他東西。除此之外，也許平均情況對你很重要，也許你的瓶頸是少數極端場景。

為了使這個概念更加具體，我們以推特在 2012 年 11 月釋出的資料【16】為例。推特的兩個主要業務是：

釋出推文
: 使用者可以向其粉絲釋出新訊息（平均 4.6k 請求 / 秒，峰值超過 12k 請求 / 秒）。

主頁時間線
: 使用者可以查閱他們關注的人釋出的推文（300k 請求 / 秒）。

處理每秒 12,000 次寫入（發推文的速率峰值）還是很簡單的。然而推特的伸縮性挑戰並不是主要來自推文量，而是來自 **扇出（fan-out）**[^ii]—— 每個使用者關注了很多人，也被很多人關注。

[^ii]: 扇出：從電子工程學中借用的術語，它描述了連線到另一個邏輯閘輸出端的邏輯閘輸入數量。輸出需要提供足夠的電流來驅動所有連線的輸入。在事務處理系統中，我們使用它來描述為了服務一個傳入請求而需要向其他服務發出的請求數量。

大體上講，這一對操作有兩種實現方式。

1. 釋出推文時，只需將新推文插入全域性推文集合即可。當一個使用者請求自己的主頁時間線時，首先查詢他關注的所有人，查詢這些被關注使用者釋出的推文並按時間順序合併。在如 [圖 1-2](/v1/ddia_0102.png) 所示的關係型資料庫中，可以編寫這樣的查詢：

    ```sql
    SELECT tweets.*, users.*
      FROM tweets
      JOIN users   ON tweets.sender_id = users.id
      JOIN follows ON follows.followee_id = users.id
      WHERE follows.follower_id = current_user
    ```

    ![](/v1/ddia_0102.png)

    **圖 1-2 推特主頁時間線的關係型模式簡單實現**

2. 為每個使用者的主頁時間線維護一個快取，就像每個使用者的推文收件箱（[圖 1-3](/v1/ddia_0103.png)）。當一個使用者釋出推文時，查詢所有關注該使用者的人，並將新的推文插入到每個主頁時間線快取中。因此讀取主頁時間線的請求開銷很小，因為結果已經提前計算好了。

    ![](/v1/ddia_0103.png)

    **圖 1-3 用於分發推文至關注者的資料流水線，2012 年 11 月的負載引數【16】**

推特的第一個版本使用了方法 1，但系統很難跟上主頁時間線查詢的負載。所以公司轉向了方法 2，方法 2 的效果更好，因為發推頻率比查詢主頁時間線的頻率幾乎低了兩個數量級，所以在這種情況下，最好在寫入時做更多的工作，而在讀取時做更少的工作。

然而方法 2 的缺點是，發推現在需要大量的額外工作。平均來說，一條推文會發往約 75 個關注者，所以每秒 4.6k 的發推寫入，變成了對主頁時間線快取每秒 345k 的寫入。但這個平均值隱藏了使用者粉絲數差異巨大這一現實，一些使用者有超過 3000 萬的粉絲，這意味著一條推文就可能會導致主頁時間線快取的 3000 萬次寫入！及時完成這種操作是一個巨大的挑戰 —— 推特嘗試在 5 秒內向粉絲傳送推文。

在推特的例子中，每個使用者粉絲數的分佈（可能按這些使用者的發推頻率來加權）是探討可伸縮性的一個關鍵負載引數，因為它決定了扇出負載。你的應用程式可能具有非常不同的特徵，但可以採用相似的原則來考慮它的負載。

推特軼事的最終轉折：現在已經穩健地實現了方法 2，推特逐步轉向了兩種方法的混合。大多數使用者發的推文會被扇出寫入其粉絲主頁時間線快取中。但是少數擁有海量粉絲的使用者（即名流）會被排除在外。當使用者讀取主頁時間線時，分別地獲取出該使用者所關注的每位名流的推文，再與使用者的主頁時間線快取合併，如方法 1 所示。這種混合方法能始終如一地提供良好效能。在覆蓋更多技術層面之後，我們將在 [第十二章](/v1_tw/ch12) 中重新討論這個例子。

### 描述效能

一旦系統的負載被描述好，就可以研究當負載增加會發生什麼。我們可以從兩種角度來看：

* 增加負載引數並保持系統資源（CPU、記憶體、網路頻寬等）不變時，系統性能將受到什麼影響？
* 增加負載引數並希望保持效能不變時，需要增加多少系統資源？

這兩個問題都需要效能資料，所以讓我們簡單地看一下如何描述系統性能。

對於 Hadoop 這樣的批處理系統，通常關心的是 **吞吐量（throughput）**，即每秒可以處理的記錄數量，或者在特定規模資料集上執行作業的總時間 [^iii]。對於線上系統，通常更重要的是服務的 **響應時間（response time）**，即客戶端傳送請求到接收響應之間的時間。

[^iii]: 理想情況下，批次作業的執行時間是資料集的大小除以吞吐量。在實踐中由於資料傾斜（資料不是均勻分佈在每個工作程序中），需要等待最慢的任務完成，所以執行時間往往更長。

> #### 延遲和響應時間
>
> **延遲（latency）** 和 **響應時間（response time）** 經常用作同義詞，但實際上它們並不一樣。響應時間是客戶所看到的，除了實際處理請求的時間（ **服務時間（service time）** ）之外，還包括網路延遲和排隊延遲。延遲是某個請求等待處理的 **持續時長**，在此期間它處於 **休眠（latent）** 狀態，並等待服務【17】。

即使不斷重複傳送同樣的請求，每次得到的響應時間也都會略有不同。現實世界的系統會處理各式各樣的請求，響應時間可能會有很大差異。因此我們需要將響應時間視為一個可以測量的數值 **分佈（distribution）**，而不是單個數值。

在 [圖 1-4](/v1/ddia_0104.png) 中，每個灰條代表一次對服務的請求，其高度表示請求花費了多長時間。大多數請求是相當快的，但偶爾會出現需要更長的時間的異常值。這也許是因為緩慢的請求實質上開銷更大，例如它們可能會處理更多的資料。但即使（你認為）所有請求都花費相同時間的情況下，隨機的附加延遲也會導致結果變化，例如：上下文切換到後臺程序，網路資料包丟失與 TCP 重傳，垃圾收集暫停，強制從磁碟讀取的頁面錯誤，伺服器機架中的震動【18】，還有很多其他原因。

![](/v1/ddia_0104.png)

**圖 1-4 展示了一個服務 100 次請求響應時間的均值與百分位數**

通常報表都會展示服務的平均響應時間。（嚴格來講 “平均” 一詞並不指代任何特定公式，但實際上它通常被理解為 **算術平均值（arithmetic mean）**：給定 n 個值，加起來除以 n ）。然而如果你想知道 “**典型（typical）**” 響應時間，那麼平均值並不是一個非常好的指標，因為它不能告訴你有多少使用者實際上經歷了這個延遲。

通常使用 **百分位點（percentiles）** 會更好。如果將響應時間列表按最快到最慢排序，那麼 **中位數（median）** 就在正中間：舉個例子，如果你的響應時間中位數是 200 毫秒，這意味著一半請求的返回時間少於 200 毫秒，另一半比這個要長。

如果想知道典型場景下使用者需要等待多長時間，那麼中位數是一個好的度量標準：一半使用者請求的響應時間少於響應時間的中位數，另一半的響應時間比中位數長。中位數也被稱為第 50 百分位點，有時縮寫為 p50。注意中位數是關於單個請求的；如果使用者同時發出幾個請求（在一個會話過程中，或者由於一個頁面中包含了多個資源），則至少一個請求比中位數慢的機率遠大於 50%。

為了弄清異常值有多糟糕，可以看看更高的百分位點，例如第 95、99 和 99.9 百分位點（縮寫為 p95，p99 和 p999）。它們意味著 95%、99% 或 99.9% 的請求響應時間要比該閾值快，例如：如果第 95 百分位點響應時間是 1.5 秒，則意味著 100 個請求中的 95 個響應時間快於 1.5 秒，而 100 個請求中的 5 個響應時間超過 1.5 秒。如 [圖 1-4](/v1/ddia_0104.png) 所示。

響應時間的高百分位點（也稱為 **尾部延遲**，即 **tail latencies**）非常重要，因為它們直接影響使用者的服務體驗。例如亞馬遜在描述內部服務的響應時間要求時是以 99.9 百分位點為準，即使它只影響一千個請求中的一個。這是因為請求響應最慢的客戶往往也是資料最多的客戶，也可以說是最有價值的客戶 —— 因為他們掏錢了【19】。保證網站響應迅速對於保持客戶的滿意度非常重要，亞馬遜觀察到：響應時間增加 100 毫秒，銷售量就減少 1%【20】；而另一些報告說：慢 1 秒鐘會讓客戶滿意度指標減少 16%【21，22】。

另一方面，最佳化第 99.99 百分位點（一萬個請求中最慢的一個）被認為太昂貴了，不能為亞馬遜的目標帶來足夠好處。減小高百分位點處的響應時間相當困難，因為它很容易受到隨機事件的影響，這超出了控制範圍，而且效益也很小。

百分位點通常用於 **服務級別目標（SLO, service level objectives）** 和 **服務級別協議（SLA, service level agreements）**，即定義服務預期效能和可用性的合同。SLA 可能會宣告，如果服務響應時間的中位數小於 200 毫秒，且 99.9 百分位點低於 1 秒，則認為服務工作正常（如果響應時間更長，就認為服務不達標）。這些指標為客戶設定了期望值，並允許客戶在 SLA 未達標的情況下要求退款。

**排隊延遲（queueing delay）** 通常佔了高百分位點處響應時間的很大一部分。由於伺服器只能並行處理少量的事務（如受其 CPU 核數的限制），所以只要有少量緩慢的請求就能阻礙後續請求的處理，這種效應有時被稱為 **頭部阻塞（head-of-line blocking）** 。即使後續請求在伺服器上處理得非常迅速，由於需要等待先前請求完成，客戶端最終看到的是緩慢的總體響應時間。因為存在這種效應，測量客戶端的響應時間非常重要。

為測試系統的可伸縮性而人為產生負載時，產生負載的客戶端要獨立於響應時間不斷傳送請求。如果客戶端在傳送下一個請求之前等待先前的請求完成，這種行為會人為地使測試時的佇列比現實情況更短，使測量結果產生偏差【23】。

> #### 實踐中的百分位點
>
> 在多重呼叫的後端服務裡，高百分位數變得特別重要。即使並行呼叫，終端使用者請求仍然需要等待最慢的並行呼叫完成。如 [圖 1-5](/v1/ddia_0105.png) 所示，只需要一個緩慢的呼叫就可以使整個終端使用者請求變慢。即使只有一小部分後端呼叫速度較慢，如果終端使用者請求需要多個後端呼叫，則獲得較慢呼叫的機會也會增加，因此較高比例的終端使用者請求速度會變慢（該效果稱為尾部延遲放大，即 tail latency amplification【24】）。
>
> 如果你想將響應時間百分位點新增到你的服務的監視儀表板，則需要持續有效地計算它們。例如，你可以使用滑動視窗來跟蹤連續 10 分鐘內的請求響應時間。每一分鐘，你都會計算出該視窗中的響應時間中位數和各種百分位點，並將這些度量值繪製在圖上。
>
> 簡單的實現是在時間視窗內儲存所有請求的響應時間列表，並且每分鐘對列表進行排序。如果對你來說效率太低，那麼有一些演算法（如前向衰減【25】、t-digest【26】或 HdrHistogram【27】）能夠以最小的 CPU 和記憶體成本來計算百分位數的近似值。請注意，對百分位點求平均（例如，為了降低時間解析度或合併來自多臺機器的資料）在數學上沒有意義 —— 聚合響應時間資料的正確方法是將直方圖相加【28】。

![](/v1/ddia_0105.png)

**圖 1-5 當一個請求需要多個後端請求時，單個後端慢請求就會拖慢整個終端使用者的請求**

### 應對負載的方法

現在我們已經討論了用於描述負載的引數和用於衡量效能的指標。可以開始認真討論可伸縮性了：當負載引數增加時，如何保持良好的效能？

適應某個級別負載的架構不太可能應付 10 倍於此的負載。如果你正在開發一個快速增長的服務，那麼每次負載發生數量級的增長時，你可能都需要重新考慮架構 —— 或者更頻繁。

人們經常討論 **縱向伸縮**（scaling up，也稱為垂直伸縮，即 vertical scaling，轉向更強大的機器）和 **橫向伸縮**（scaling out，也稱為水平伸縮，即 horizontal scaling，將負載分佈到多臺小機器上）之間的對立。跨多臺機器分配負載也稱為 “**無共享（shared-nothing）**” 架構。可以在單臺機器上執行的系統通常更簡單，但高階機器可能非常貴，所以非常密集的負載通常無法避免地需要橫向伸縮。現實世界中的優秀架構需要將這兩種方法務實地結合，因為使用幾臺足夠強大的機器可能比使用大量的小型虛擬機器更簡單也更便宜。

有些系統是 **彈性（elastic）** 的，這意味著可以在檢測到負載增加時自動增加計算資源，而其他系統則是手動伸縮（人工分析容量並決定向系統新增更多的機器）。如果負載 **極難預測（highly unpredictable）**，則彈性系統可能很有用，但手動伸縮系統更簡單，並且意外操作可能會更少（請參閱 “[分割槽再平衡](/v1_tw/ch6#分割槽再平衡)”）。

跨多臺機器部署 **無狀態服務（stateless services）** 非常簡單，但將帶狀態的資料系統從單節點變為分散式配置則可能引入許多額外複雜度。出於這個原因，常識告訴我們應該將資料庫放在單個節點上（縱向伸縮），直到伸縮成本或可用性需求迫使其改為分散式。

隨著分散式系統的工具和抽象越來越好，至少對於某些型別的應用而言，這種常識可能會改變。可以預見分散式資料系統將成為未來的預設設定，即使對不處理大量資料或流量的場景也如此。本書的其餘部分將介紹多種分散式資料系統，不僅討論它們在可伸縮性方面的表現，還包括易用性和可維護性。

大規模的系統架構通常是應用特定的 —— 沒有一招鮮吃遍天的通用可伸縮架構（不正式的叫法：**萬金油（magic scaling sauce）** ）。應用的問題可能是讀取量、寫入量、要儲存的資料量、資料的複雜度、響應時間要求、訪問模式或者所有問題的大雜燴。

舉個例子，用於處理每秒十萬個請求（每個大小為 1 kB）的系統與用於處理每分鐘 3 個請求（每個大小為 2GB）的系統看上去會非常不一樣，儘管兩個系統有同樣的資料吞吐量。

一個良好適配應用的可伸縮架構，是圍繞著 **假設（assumption）** 建立的：哪些操作是常見的？哪些操作是罕見的？這就是所謂負載引數。如果假設最終是錯誤的，那麼為伸縮所做的工程投入就白費了，最糟糕的是適得其反。在早期創業公司或非正式產品中，通常支援產品快速迭代的能力，要比可伸縮至未來的假想負載重要得多。

儘管這些架構是應用程式特定的，但可伸縮的架構通常也是從通用的積木塊搭建而成的，並以常見的模式排列。在本書中，我們將討論這些構件和模式。


## 可維護性

眾所周知，軟體的大部分開銷並不在最初的開發階段，而是在持續的維護階段，包括修復漏洞、保持系統正常執行、調查失效、適配新的平臺、為新的場景進行修改、償還技術債和新增新的功能。

不幸的是，許多從事軟體系統行業的人不喜歡維護所謂的 **遺留（legacy）** 系統 —— 也許因為涉及修復其他人的錯誤、和過時的平臺打交道，或者系統被迫使用於一些份外工作。每一個遺留系統都以自己的方式讓人不爽，所以很難給出一個通用的建議來和它們打交道。

但是我們可以，也應該以這樣一種方式來設計軟體：在設計之初就考慮儘可能減少維護期間的痛苦，從而避免自己的軟體系統變成遺留系統。為此，我們將特別關注軟體系統的三個設計原則：

可操作性（Operability）
: 便於運維團隊保持系統平穩執行。

簡單性（Simplicity）
: 從系統中消除儘可能多的 **複雜度（complexity）**，使新工程師也能輕鬆理解系統（注意這和使用者介面的簡單性不一樣）。

可演化性（Evolvability）
: 使工程師在未來能輕鬆地對系統進行更改，當需求變化時為新應用場景做適配。也稱為 **可擴充套件性（extensibility）**、**可修改性（modifiability）** 或 **可塑性（plasticity）**。

和之前提到的可靠性、可伸縮性一樣，實現這些目標也沒有簡單的解決方案。不過我們會試著想象具有可操作性，簡單性和可演化性的系統會是什麼樣子。

### 可操作性：人生苦短，關愛運維

有人認為，“良好的運維經常可以繞開垃圾（或不完整）軟體的侷限性，而再好的軟體攤上垃圾運維也沒法可靠執行”。儘管運維的某些方面可以，而且應該是自動化的，但在最初建立正確運作的自動化機制仍然取決於人。

運維團隊對於保持軟體系統順利執行至關重要。一個優秀運維團隊的典型職責如下（或者更多）【29】：

* 監控系統的執行狀況，並在服務狀態不佳時快速恢復服務。
* 跟蹤問題的原因，例如系統故障或效能下降。
* 及時更新軟體和平臺，比如安全補丁。
* 瞭解系統間的相互作用，以便在異常變更造成損失前進行規避。
* 預測未來的問題，並在問題出現之前加以解決（例如，容量規劃）。
* 建立部署、配置、管理方面的良好實踐，編寫相應工具。
* 執行複雜的維護任務，例如將應用程式從一個平臺遷移到另一個平臺。
* 當配置變更時，維持系統的安全性。
* 定義工作流程，使運維操作可預測，並保持生產環境穩定。
* 鐵打的營盤流水的兵，維持組織對系統的瞭解。

良好的可操作性意味著更輕鬆的日常工作，進而運維團隊能專注於高價值的事情。資料系統可以透過各種方式使日常任務更輕鬆：

* 透過良好的監控，提供對系統內部狀態和執行時行為的 **可見性（visibility）**。
* 為自動化提供良好支援，將系統與標準化工具相整合。
* 避免依賴單臺機器（在整個系統繼續不間斷執行的情況下允許機器停機維護）。
* 提供良好的文件和易於理解的操作模型（“如果做 X，會發生 Y”）。
* 提供良好的預設行為，但需要時也允許管理員自由覆蓋預設值。
* 有條件時進行自我修復，但需要時也允許管理員手動控制系統狀態。
* 行為可預測，最大限度減少意外。


### 簡單性：管理複雜度

小型軟體專案可以使用簡單討喜的、富表現力的程式碼，但隨著專案越來越大，程式碼往往變得非常複雜，難以理解。這種複雜度拖慢了所有系統相關人員，進一步增加了維護成本。一個陷入複雜泥潭的軟體專案有時被描述為 **爛泥潭（a big ball of mud）** 【30】。

**複雜度（complexity）** 有各種可能的症狀，例如：狀態空間激增、模組間緊密耦合、糾結的依賴關係、不一致的命名和術語、解決效能問題的 Hack、需要繞開的特例等等，現在已經有很多關於這個話題的討論【31,32,33】。

因為複雜度導致維護困難時，預算和時間安排通常會超支。在複雜的軟體中進行變更，引入錯誤的風險也更大：當開發人員難以理解系統時，隱藏的假設、無意的後果和意外的互動就更容易被忽略。相反，降低複雜度能極大地提高軟體的可維護性，因此簡單性應該是構建系統的一個關鍵目標。

簡化系統並不一定意味著減少功能；它也可以意味著消除 **額外的（accidental）** 複雜度。Moseley 和 Marks【32】把 **額外複雜度** 定義為：由具體實現中湧現，而非（從使用者視角看，系統所解決的）問題本身固有的複雜度。

用於消除 **額外複雜度** 的最好工具之一是 **抽象（abstraction）**。一個好的抽象可以將大量實現細節隱藏在一個乾淨，簡單易懂的外觀下面。一個好的抽象也可以廣泛用於各類不同應用。比起重複造很多輪子，重用抽象不僅更有效率，而且有助於開發高質量的軟體。抽象元件的質量改進將使所有使用它的應用受益。

例如，高階程式語言是一種抽象，隱藏了機器碼、CPU 暫存器和系統呼叫。SQL 也是一種抽象，隱藏了複雜的磁碟 / 記憶體資料結構、來自其他客戶端的併發請求、崩潰後的不一致性。當然在用高階語言程式設計時，我們仍然用到了機器碼；只不過沒有 **直接（directly）** 使用罷了，正是因為程式語言的抽象，我們才不必去考慮這些實現細節。

抽象可以幫助我們將系統的複雜度控制在可管理的水平，不過，找到好的抽象是非常困難的。在分散式系統領域雖然有許多好的演算法，但我們並不清楚它們應該打包成什麼樣的抽象。

本書將緊盯那些允許我們將大型系統的部分提取為定義明確的、可重用的元件的優秀抽象。

### 可演化性：擁抱變化

系統的需求永遠不變，基本是不可能的。更可能的情況是，它們處於常態的變化中，例如：你瞭解了新的事實、出現意想不到的應用場景、業務優先順序發生變化、使用者要求新功能、新平臺取代舊平臺、法律或監管要求發生變化、系統增長迫使架構變化等。

在組織流程方面，**敏捷（agile）** 工作模式為適應變化提供了一個框架。敏捷社群還開發了對在頻繁變化的環境中開發軟體很有幫助的技術工具和模式，如 **測試驅動開發（TDD, test-driven development）** 和 **重構（refactoring）** 。

這些敏捷技術的大部分討論都集中在相當小的規模（同一個應用中的幾個程式碼檔案）。本書將探索在更大的資料系統（可能由幾個不同的應用或服務組成）層面上提高敏捷性的方法。例如，為了將裝配主頁時間線的方法從方法 1 變為方法 2，你會如何 “重構” 推特的架構？

修改資料系統並使其適應不斷變化需求的容易程度，是與 **簡單性** 和 **抽象性** 密切相關的：簡單易懂的系統通常比複雜系統更容易修改。但由於這是一個非常重要的概念，我們將用一個不同的詞來指代資料系統層面的敏捷性： **可演化性（evolvability）** 【34】。


## 本章小結

本章探討了一些關於資料密集型應用的基本思考方式。這些原則將指導我們閱讀本書的其餘部分，那裡將會深入技術細節。

一個應用必須滿足各種需求才稱得上有用。有一些 **功能需求**（functional requirements，即它應該做什麼，比如允許以各種方式儲存，檢索，搜尋和處理資料）以及一些 **非功能性需求**（nonfunctional requirements，即通用屬性，例如安全性、可靠性、合規性、可伸縮性、相容性和可維護性）。本章詳細討論了可靠性、可伸縮性和可維護性。

**可靠性（Reliability）** 意味著即使發生故障，系統也能正常工作。故障可能發生在硬體（通常是隨機的和不相關的）、軟體（通常是系統性的 Bug，很難處理）和人類（不可避免地時不時出錯）。**容錯技術** 可以對終端使用者隱藏某些型別的故障。

**可伸縮性（Scalability）** 意味著即使在負載增加的情況下也有保持效能的策略。為了討論可伸縮性，我們首先需要定量描述負載和效能的方法。我們簡要了解了推特主頁時間線的例子，介紹描述負載的方法，並將響應時間百分位點作為衡量效能的一種方式。在可伸縮的系統中可以新增 **處理容量（processing capacity）** 以在高負載下保持可靠。

**可維護性（Maintainability）** 有許多方面，但實質上是關於工程師和運維團隊的生活質量的。良好的抽象可以幫助降低複雜度，並使系統易於修改和適應新的應用場景。良好的可操作性意味著對系統的健康狀態具有良好的可見性，並擁有有效的管理手段。

不幸的是，使應用可靠、可伸縮或可維護並不容易。但是某些模式和技術會不斷重新出現在不同的應用中。在接下來的幾章中，我們將看到一些資料系統的例子，並分析它們如何實現這些目標。

在本書後面的 [第三部分](/v1_tw/part-iii) 中，我們將看到一種模式：幾個元件協同工作以構成一個完整的系統（如 [圖 1-1](/v1/ddia_0101.png) 中的例子）。


## 參考文獻

1. Michael Stonebraker and Uğur Çetintemel: “['One Size Fits All': An Idea Whose Time Has Come and Gone](https://cs.brown.edu/~ugur/fits_all.pdf),” at *21st International Conference on Data Engineering* (ICDE), April 2005.
1. Walter L. Heimerdinger and Charles B. Weinstock: “[A Conceptual Framework for System Fault Tolerance](https://resources.sei.cmu.edu/asset_files/TechnicalReport/1992_005_001_16112.pdf),” Technical Report CMU/SEI-92-TR-033, Software Engineering Institute, Carnegie Mellon University, October 1992.
1. Ding Yuan, Yu Luo, Xin Zhuang, et al.: “[Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-yuan.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Yury Izrailevsky and Ariel Tseitlin: “[The Netflix Simian Army](https://netflixtechblog.com/the-netflix-simian-army-16e57fbab116),” *netflixtechblog.com*, July 19, 2011.
1. Daniel Ford, François Labelle, Florentina I. Popovici, et al.: “[Availability in Globally Distributed Storage Systems](http://research.google.com/pubs/archive/36737.pdf),” at *9th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2010.
1. Brian Beach: “[Hard Drive Reliability Update – Sep 2014](https://www.backblaze.com/blog/hard-drive-reliability-update-september-2014/),” *backblaze.com*, September 23, 2014.
1. Laurie Voss: “[AWS: The Good, the Bad and the Ugly](https://web.archive.org/web/20160429075023/http://blog.awe.sm/2012/12/18/aws-the-good-the-bad-and-the-ugly/),” *blog.awe.sm*, December 18, 2012.
1. Haryadi S. Gunawi, Mingzhe Hao, Tanakorn Leesatapornwongsa, et al.: “[What Bugs Live in the Cloud?](http://ucare.cs.uchicago.edu/pdf/socc14-cbs.pdf),” at *5th ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670986](http://dx.doi.org/10.1145/2670979.2670986)
1. Nelson Minar: “[Leap Second Crashes Half the Internet](http://www.somebits.com/weblog/tech/bad/leap-second-2012.html),” *somebits.com*, July 3, 2012.
1. Amazon Web Services: “[Summary of the Amazon EC2 and Amazon RDS Service Disruption in the US East Region](http://aws.amazon.com/message/65648/),” *aws.amazon.com*, April 29, 2011.
1. Richard I. Cook: “[How Complex Systems Fail](https://www.adaptivecapacitylabs.com/HowComplexSystemsFail.pdf),” Cognitive Technologies Laboratory, April 2000.
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. David Oppenheimer, Archana Ganapathi, and David A. Patterson: “[Why Do Internet Services Fail, and What Can Be Done About It?](http://static.usenix.org/legacy/events/usits03/tech/full_papers/oppenheimer/oppenheimer.pdf),” at *4th USENIX Symposium on Internet Technologies and Systems* (USITS), March 2003.
1. Nathan Marz: “[Principles of Software Engineering, Part 1](http://nathanmarz.com/blog/principles-of-software-engineering-part-1.html),” *nathanmarz.com*, April 2, 2013.
1. Michael Jurewitz: “[The Human Impact of Bugs](http://jury.me/blog/2013/3/14/the-human-impact-of-bugs),” *jury.me*, March 15, 2013.
1. Raffi Krikorian: “[Timelines at Scale](http://www.infoq.com/presentations/Twitter-Timeline-Scalability),” at *QCon San Francisco*, November 2012.
1. Martin Fowler: *Patterns of Enterprise Application Architecture*. Addison Wesley, 2002. ISBN: 978-0-321-12742-6
1. Kelly Sommers: “[After all that run around, what caused 500ms disk latency even when we replaced physical server?](https://twitter.com/kellabyte/status/532930540777635840)” *twitter.com*, November 13, 2014.
1. Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, et al.: “[Dynamo: Amazon's Highly Available Key-Value Store](http://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf),” at *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007.
1. Greg Linden: “[Make Data Useful](http://glinden.blogspot.co.uk/2006/12/slides-from-my-talk-at-stanford.html),” slides from presentation at Stanford University Data Mining class (CS345), December 2006.
1. Tammy Everts: “[The Real Cost of Slow Time vs Downtime](https://www.slideshare.net/Radware/radware-cmg2014-tammyevertsslowtimevsdowntime),” *slideshare.net*, November 5, 2014.
1. Jake Brutlag: “[Speed Matters](https://ai.googleblog.com/2009/06/speed-matters.html),” *ai.googleblog.com*, June 23, 2009.
1. Tyler Treat: “[Everything You Know About Latency Is Wrong](http://bravenewgeek.com/everything-you-know-about-latency-is-wrong/),” *bravenewgeek.com*, December 12, 2015.
1. Jeffrey Dean and Luiz André Barroso: “[The Tail at Scale](http://cacm.acm.org/magazines/2013/2/160173-the-tail-at-scale/fulltext),” *Communications of the ACM*, volume 56, number 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](http://dx.doi.org/10.1145/2408776.2408794)
1. Graham Cormode, Vladislav Shkapenyuk, Divesh Srivastava, and Bojian Xu: “[Forward Decay: A Practical Time Decay Model for Streaming Systems](http://dimacs.rutgers.edu/~graham/pubs/papers/fwddecay.pdf),” at *25th IEEE International Conference on Data Engineering* (ICDE), March 2009.
1. Ted Dunning and Otmar Ertl: “[Computing Extremely Accurate Quantiles Using t-Digests](https://github.com/tdunning/t-digest),” *github.com*, March 2014.
1. Gil Tene: “[HdrHistogram](http://www.hdrhistogram.org/),” *hdrhistogram.org*.
1. Baron Schwartz: “[Why Percentiles Don’t Work the Way You Think](https://orangematter.solarwinds.com/2016/11/18/why-percentiles-dont-work-the-way-you-think/),” *solarwinds.com*, November 18, 2016.
1. James Hamilton: “[On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton.pdf),” at *21st Large Installation System Administration Conference* (LISA), November 2007.
1. Brian Foote and Joseph Yoder: “[Big Ball of Mud](http://www.laputan.org/pub/foote/mud.pdf),” at *4th Conference on Pattern Languages of Programs* (PLoP), September 1997.
1. Frederick P Brooks: “No Silver Bullet – Essence and Accident in Software Engineering,” in *The Mythical Man-Month*, Anniversary edition, Addison-Wesley, 1995. ISBN: 978-0-201-83595-3
1. Ben Moseley and Peter Marks: “[Out of the Tar Pit](https://curtclifton.net/papers/MoseleyMarks06a.pdf),” at *BCS Software Practice Advancement* (SPA), 2006.
1. Rich Hickey: “[Simple Made Easy](http://www.infoq.com/presentations/Simple-Made-Easy),” at *Strange Loop*, September 2011.
1. Hongyu Pei Breivold, Ivica Crnkovic, and Peter J. Eriksson: “[Analyzing Software Evolvability](http://www.es.mdh.se/pdf_publications/1251.pdf),” at *32nd Annual IEEE International Computer Software and Applications Conference* (COMPSAC), July 2008. [doi:10.1109/COMPSAC.2008.50](http://dx.doi.org/10.1109/COMPSAC.2008.50)

================================================
FILE: content/v1_tw/ch10.md
================================================
---
title: "第十章：批處理"
linkTitle: "10. 批處理"
weight: 310
breadcrumbs: false
---

![](/map/ch10.png)

> 帶有太強個人色彩的系統無法成功。當最初的設計完成並且相對穩定時，不同的人們以自己的方式進行測試，真正的考驗才開始。
>
> —— 高德納

在本書的前兩部分中，我們討論了很多關於 **請求** 和 **查詢** 以及相應的 **響應** 或 **結果**。許多現有資料系統中都採用這種資料處理方式：你傳送請求指令，一段時間後（我們期望）系統會給出一個結果。資料庫、快取、搜尋索引、Web 伺服器以及其他一些系統都以這種方式工作。

像這樣的 **線上（online）** 系統，無論是瀏覽器請求頁面還是呼叫遠端 API 的服務，我們通常認為請求是由人類使用者觸發的，並且正在等待響應。他們不應該等太久，所以我們非常關注系統的響應時間（請參閱 “[描述效能](/v1_tw/ch1#描述效能)”）。

Web 和越來越多的基於 HTTP/REST 的 API 使互動的請求 / 響應風格變得如此普遍，以至於很容易將其視為理所當然。但我們應該記住，這不是構建系統的唯一方式，其他方法也有其優點。我們來看看三種不同型別的系統：

服務（線上系統）
: 服務等待客戶的請求或指令到達。每收到一個，服務會試圖儘快處理它，並發回一個響應。響應時間通常是服務效能的主要衡量指標，可用性通常非常重要（如果客戶端無法訪問服務，使用者可能會收到錯誤訊息）。

批處理系統（離線系統）
: 一個批處理系統有大量的輸入資料，跑一個 **作業（job）** 來處理它，並生成一些輸出資料，這往往需要一段時間（從幾分鐘到幾天），所以通常不會有使用者等待作業完成。相反，批次作業通常會定期執行（例如，每天一次）。批處理作業的主要效能衡量標準通常是吞吐量（處理特定大小的輸入所需的時間）。本章中討論的就是批處理。

流處理系統（準即時系統）
: 流處理介於線上和離線（批處理）之間，所以有時候被稱為 **準即時（near-real-time）** 或 **準線上（nearline）** 處理。像批處理系統一樣，流處理消費輸入並產生輸出（並不需要響應請求）。但是，流式作業在事件發生後不久就會對事件進行操作，而批處理作業則需等待固定的一組輸入資料。這種差異使流處理系統比起批處理系統具有更低的延遲。由於流處理基於批處理，我們將在 [第十一章](/v1_tw/ch11) 討論它。

正如我們將在本章中看到的那樣，批處理是構建可靠、可伸縮和可維護應用程式的重要組成部分。例如，2004 年釋出的批處理演算法 Map-Reduce（可能被過分熱情地）被稱為 “造就 Google 大規模可伸縮性的演算法”【2】。隨後在各種開源資料系統中得到應用，包括 Hadoop、CouchDB 和 MongoDB。

與多年前為資料倉庫開發的並行處理系統【3,4】相比，MapReduce 是一個相當低級別的程式設計模型，但它使得在商用硬體上能進行的處理規模邁上一個新的臺階。雖然 MapReduce 的重要性正在下降【5】，但它仍然值得去理解，因為它描繪了一幅關於批處理為什麼有用，以及如何做到有用的清晰圖景。

實際上，批處理是一種非常古老的計算方式。早在可程式設計數字計算機誕生之前，打孔卡製表機（例如 1890 年美國人口普查【6】中使用的霍爾里斯機）實現了半機械化的批處理形式，從大量輸入中彙總計算。Map-Reduce 與 1940 年代和 1950 年代廣泛用於商業資料處理的機電 IBM 卡片分類機器有著驚人的相似之處【7】。正如我們所說，歷史總是在不斷重複自己。

在本章中，我們將瞭解 MapReduce 和其他一些批處理演算法和框架，並探索它們在現代資料系統中的作用。但首先我們將看看使用標準 Unix 工具的資料處理。即使你已經熟悉了它們，Unix 的哲學也值得一讀，Unix 的思想和經驗教訓可以遷移到大規模、異構的分散式資料系統中。


## 使用Unix工具的批處理

我們從一個簡單的例子開始。假設你有一臺 Web 伺服器，每次處理請求時都會在日誌檔案中附加一行。例如，使用 nginx 預設的訪問日誌格式，日誌的一行可能如下所示：

```bash
216.58.210.78 - - [27/Feb/2015:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
200 3377 "http://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36"
```

（實際上這只是一行，分成多行只是為了便於閱讀。）這一行中有很多資訊。為了解釋它，你需要了解日誌格式的定義，如下所示：

```bash
 $remote_addr - $remote_user [$time_local] "$request"
 $status $body_bytes_sent "$http_referer" "$http_user_agent"
```

日誌的這一行表明在 UTC 時間的 2015 年 2 月 27 日 17 點 55 分 11 秒，伺服器從客戶端 IP 地址 `216.58.210.78` 接收到對檔案 `/css/typography.css` 的請求。使用者沒有認證，所以 `$remote_user` 被設定為連字元（`-`）。響應狀態是 200（即請求成功），響應的大小是 3377 位元組。網頁瀏覽器是 Chrome 40，它載入了這個檔案是因為該檔案在網址為 `http://martin.kleppmann.com/` 的頁面中被引用到了。


### 簡單日誌分析

很多工具可以從這些日誌檔案生成關於網站流量的漂亮的報告，但為了練手，讓我們使用基本的 Unix 功能建立自己的工具。例如，假設你想在你的網站上找到五個最受歡迎的網頁。則可以在 Unix shell 中這樣做：[^i]

[^i]: 有些人認為 `cat` 這裡並沒有必要，因為輸入檔案可以直接作為 awk 的引數。但這種寫法讓線性管道更為顯眼。

```bash
cat /var/log/nginx/access.log | #1
  awk '{print $7}' | #2
  sort             | #3
  uniq -c          | #4
  sort -r -n       | #5
  head -n 5          #6
```

1. 讀取日誌檔案
2. 將每一行按空格分割成不同的欄位，每行只輸出第七個欄位，恰好是請求的 URL。在我們的例子中是 `/css/typography.css`。
3. 按字母順序排列請求的 URL 列表。如果某個 URL 被請求過 n 次，那麼排序後，檔案將包含連續重複出現 n 次的該 URL。
4. `uniq` 命令透過檢查兩個相鄰的行是否相同來過濾掉輸入中的重複行。`-c` 則表示還要輸出一個計數器：對於每個不同的 URL，它會報告輸入中出現該 URL 的次數。
5. 第二種排序按每行起始處的數字（`-n`）排序，這是 URL 的請求次數。然後逆序（`-r`）返回結果，大的數字在前。
6. 最後，只輸出前五行（`-n 5`），並丟棄其餘的。該系列命令的輸出如下所示：

```bash
    4189 /favicon.ico
    3631 /2013/05/24/improving-security-of-ssh-private-keys.html
    2124 /2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html
    1369 /
     915 /css/typography.css
```

如果你不熟悉 Unix 工具，上面的命令列可能看起來有點吃力，但是它非常強大。它能在幾秒鐘內處理幾 GB 的日誌檔案，並且你可以根據需要輕鬆修改命令。例如，如果要從報告中省略 CSS 檔案，可以將 awk 引數更改為 `'$7 !~ /\.css$/ {print $7}'`, 如果想統計最多的客戶端 IP 地址，可以把 awk 引數改為 `'{print $1}'`，等等。

我們不會在這裡詳細探索 Unix 工具，但是它非常值得學習。令人驚訝的是，使用 awk、sed、grep、sort、uniq 和 xargs 的組合，可以在幾分鐘內完成許多資料分析，並且它們的效能相當的好【8】。

#### 命令鏈與自定義程式

除了 Unix 命令鏈，你還可以寫一個簡單的程式來做同樣的事情。例如在 Ruby 中，它可能看起來像這樣：

```ruby
counts = Hash.new(0)         # 1
File.open('/var/log/nginx/access.log') do |file|
    file.each do |line|
        url = line.split[6]  # 2
        counts[url] += 1     # 3
    end
end

top5 = counts.map{|url, count| [count, url] }.sort.reverse[0...5] # 4
top5.each{|count, url| puts "#{count} #{url}" }                   # 5
```

1. `counts` 是一個儲存計數器的雜湊表，儲存了每個 URL 被瀏覽的次數，預設為 0。
2. 逐行讀取日誌，抽取每行第七個被空格分隔的欄位為 URL（這裡的陣列索引是 6，因為 Ruby 的陣列索引從 0 開始計數）
3. 將日誌當前行中 URL 對應的計數器值加一。
4. 按計數器值（降序）對雜湊表內容進行排序，並取前五位。
5. 打印出前五個條目。

這個程式並不像 Unix 管道那樣簡潔，但是它的可讀性很強，喜歡哪一種屬於口味的問題。但兩者除了表面上的差異之外，執行流程也有很大差異，如果你在大檔案上執行此分析，則會變得明顯。

#### 排序 VS 記憶體中的聚合

Ruby 指令碼在記憶體中儲存了一個 URL 的雜湊表，將每個 URL 對映到它出現的次數。Unix 管道沒有這樣的雜湊表，而是依賴於對 URL 列表的排序，在這個 URL 列表中，同一個 URL 只是簡單地重複出現。

哪種方法更好？這取決於你有多少個不同的 URL。對於大多數中小型網站，你可能可以為所有不同網址提供一個計數器（假設我們使用 1GB 記憶體）。在此例中，作業的 **工作集**（working set，即作業需要隨機訪問的記憶體大小）僅取決於不同 URL 的數量：如果日誌中只有單個 URL，重複出現一百萬次，則散列表所需的空間就只有一個 URL 加上一個計數器的大小。當工作集足夠小時，記憶體散列表表現良好，甚至在效能較差的筆記型電腦上也可以正常工作。

另一方面，如果作業的工作集大於可用記憶體，則排序方法的優點是可以高效地使用磁碟。這與我們在 “[SSTables 和 LSM 樹](/v1_tw/ch3#SSTables和LSM樹)” 中討論過的原理是一樣的：資料塊可以在記憶體中排序並作為段檔案寫入磁碟，然後多個排序好的段可以合併為一個更大的排序檔案。歸併排序具有在磁碟上執行良好的順序訪問模式。（請記住，針對順序 I/O 進行最佳化是 [第三章](/v1_tw/ch3) 中反覆出現的主題，相同的模式在此重現）

GNU Coreutils（Linux）中的 `sort` 程式透過溢位至磁碟的方式來自動應對大於記憶體的資料集，並能同時使用多個 CPU 核進行並行排序【9】。這意味著我們之前看到的簡單的 Unix 命令鏈很容易伸縮至大資料集，且不會耗盡記憶體。瓶頸可能是從磁碟讀取輸入檔案的速度。


### Unix哲學

我們可以非常容易地使用前一個例子中的一系列命令來分析日誌檔案，這並非巧合：事實上，這正是 Unix 的關鍵設計思想之一，而且它直至今天也仍然令人訝異地重要。讓我們更深入地研究一下，以便從 Unix 中借鑑一些想法【10】。

Unix 管道的發明者道格・麥克羅伊（Doug McIlroy）在 1964 年首先描述了這種情況【11】：“我們需要一種類似園藝膠管的方式來拼接程式 —— 當我們需要將訊息從一個程式傳遞到另一個程式時，直接接上去就行。I/O 應該也按照這種方式進行”。水管的類比仍然在生效，透過管道連線程式的想法成為了現在被稱為 **Unix 哲學** 的一部分 —— 這一組設計原則在 Unix 使用者與開發者之間流行起來，該哲學在 1978 年表述如下【12,13】：

1. 讓每個程式都做好一件事。要做一件新的工作，寫一個新程式，而不是透過新增 “功能” 讓老程式複雜化。
2. 期待每個程式的輸出成為另一個程式的輸入。不要將無關資訊混入輸出。避免使用嚴格的列資料或二進位制輸入格式。不要堅持互動式輸入。
3. 設計和構建軟體時，即使是作業系統，也讓它們能夠儘早地被試用，最好在幾周內完成。不要猶豫，扔掉笨拙的部分，重建它們。
4. 優先使用工具來減輕程式設計任務，即使必須繞道去編寫工具，且在用完後很可能要扔掉大部分。

這種方法 —— 自動化，快速原型設計，增量式迭代，對實驗友好，將大型專案分解成可管理的塊 —— 聽起來非常像今天的敏捷開發和 DevOps 運動。奇怪的是，四十年來變化不大。

`sort` 工具是一個很好的例子。可以說它比大多數程式語言標準庫中的實現（它們不會利用磁碟或使用多執行緒，即使這樣做有很大好處）要更好。然而，單獨使用 `sort` 幾乎沒什麼用。它只有與其他 Unix 工具（如 `uniq`）結合使用時才變得強大。

像 `bash` 這樣的 Unix shell 可以讓我們輕鬆地將這些小程式組合成令人訝異的強大資料處理任務。儘管這些程式中有很多是由不同人群編寫的，但它們可以靈活地結合在一起。Unix 如何實現這種可組合性？

#### 統一的介面

如果你希望一個程式的輸出成為另一個程式的輸入，那意味著這些程式必須使用相同的資料格式 —— 換句話說，一個相容的介面。如果你希望能夠將任何程式的輸出連線到任何程式的輸入，那意味著所有程式必須使用相同的 I/O 介面。

在 Unix 中，這種介面是一個 **檔案**（file，更準確地說，是一個檔案描述符）。一個檔案只是一串有序的位元組序列。因為這是一個非常簡單的介面，所以可以使用相同的介面來表示許多不同的東西：檔案系統上的真實檔案，到另一個程序（Unix 套接字，stdin，stdout）的通訊通道，裝置驅動程式（比如 `/dev/audio` 或 `/dev/lp0`），表示 TCP 連線的套接字，等等。很容易將這些設計視為理所當然的，但實際上能讓這些差異巨大的東西共享一個統一的介面是非常厲害的，這使得它們可以很容易地連線在一起 [^ii]。

[^ii]: 統一介面的另一個例子是 URL 和 HTTP，這是 Web 的基石。一個 URL 標識一個網站上的一個特定的東西（資源），你可以連結到任何其他網站的任何網址。具有網路瀏覽器的使用者因此可以透過跟隨連結在網站之間無縫跳轉，即使伺服器可能由完全不相關的組織維護。這個原則現在似乎非常明顯，但它卻是網路能取得今天成就的關鍵。之前的系統並不是那麼統一：例如，在公告板系統（BBS）時代，每個系統都有自己的電話號碼和波特率配置。從一個 BBS 到另一個 BBS 的引用必須以電話號碼和調變解調器設定的形式；使用者將不得不掛斷，撥打其他 BBS，然後手動找到他們正在尋找的資訊。直接連結到另一個 BBS 內的一些內容當時是不可能的。

按照慣例，許多（但不是全部）Unix 程式將這個位元組序列視為 ASCII 文字。我們的日誌分析示例使用了這個事實：`awk`、`sort`、`uniq` 和 `head` 都將它們的輸入檔案視為由 `\n`（換行符，ASCII `0x0A`）字元分隔的記錄列表。`\n` 的選擇是任意的 —— 可以說，ASCII 記錄分隔符 `0x1E` 本來就是一個更好的選擇，因為它是為了這個目的而設計的【14】，但是無論如何，所有這些程式都使用相同的記錄分隔符允許它們互操作。

每條記錄（即一行輸入）的解析則更加模糊。Unix 工具通常透過空白或製表符將行分割成欄位，但也使用 CSV（逗號分隔），管道分隔和其他編碼。即使像 `xargs` 這樣一個相當簡單的工具也有六個命令列選項，用於指定如何解析輸入。

ASCII 文字的統一介面大多數時候都能工作，但它不是很優雅：我們的日誌分析示例使用 `{print $7}` 來提取網址，這樣可讀性不是很好。在理想的世界中可能是 `{print $request_url}` 或類似的東西。我們稍後會回顧這個想法。

儘管幾十年後還不夠完美，但統一的 Unix 介面仍然是非常出色的設計。沒有多少軟體能像 Unix 工具一樣互動組合的這麼好：你不能透過自定義分析工具輕鬆地將電子郵件帳戶的內容和線上購物歷史記錄以管道傳送至電子表格中，並將結果釋出到社交網路或維基。今天，像 Unix 工具一樣流暢地執行程式是一種例外，而不是規範。

即使是具有 **相同資料模型** 的資料庫，將資料從一種資料庫匯出再匯入到另一種資料庫也並不容易。缺乏整合導致了資料的 **巴爾幹化**[^譯註i]。

[^譯註i]: **巴爾幹化（Balkanization）** 是一個常帶有貶義的地緣政治學術語，其定義為：一個國家或政區分裂成多個互相敵對的國家或政區的過程。


#### 邏輯與佈線相分離

Unix 工具的另一個特點是使用標準輸入（`stdin`）和標準輸出（`stdout`）。如果你執行一個程式，而不指定任何其他的東西，標準輸入來自鍵盤，標準輸出指向螢幕。但是，你也可以從檔案輸入和 / 或將輸出重定向到檔案。管道允許你將一個程序的標準輸出附加到另一個程序的標準輸入（有個小記憶體緩衝區，而不需要將整個中間資料流寫入磁碟）。

如果需要，程式仍然可以直接讀取和寫入檔案，但 Unix 方法在程式不關心特定的檔案路徑、只使用標準輸入和標準輸出時效果最好。這允許 shell 使用者以任何他們想要的方式連線輸入和輸出；該程式不知道或不關心輸入來自哪裡以及輸出到哪裡。（人們可以說這是一種 **松耦合（loose coupling）**，**晚期繫結（late binding）**【15】或 **控制反轉（inversion of control）**【16】）。將輸入 / 輸出佈線與程式邏輯分開，可以將小工具組合成更大的系統。

你甚至可以編寫自己的程式，並將它們與作業系統提供的工具組合在一起。你的程式只需要從標準輸入讀取輸入，並將輸出寫入標準輸出，它就可以加入資料處理的管道中。在日誌分析示例中，你可以編寫一個將 User-Agent 字串轉換為更易理解的瀏覽器識別符號，或者將 IP 地址轉換為國家程式碼的工具，並將其插入管道。`sort` 程式並不關心它是否與作業系統的另一部分或者你寫的程式通訊。

但是，使用 `stdin` 和 `stdout` 能做的事情是有限的。需要多個輸入或輸出的程式雖然可能，卻非常棘手。你沒法將程式的輸出管道連線至網路連線中【17,18】[^iii] 。如果程式直接開啟檔案進行讀取和寫入，或者將另一個程式作為子程序啟動，或者開啟網路連線，那麼 I/O 的佈線就取決於程式本身了。它仍然可以被配置（例如透過命令列選項），但在 Shell 中對輸入和輸出進行佈線的靈活性就少了。

[^iii]: 除了使用一個單獨的工具，如 `netcat` 或 `curl`。Unix 起初試圖將所有東西都表示為檔案，但是 BSD 套接字 API 偏離了這個慣例【17】。研究用作業系統 Plan 9 和 Inferno 在使用檔案方面更加一致：它們將 TCP 連線表示為 `/net/tcp` 中的檔案【18】。


#### 透明度和實驗

使 Unix 工具如此成功的部分原因是，它們使檢視正在發生的事情變得非常容易：

- Unix 命令的輸入檔案通常被視為不可變的。這意味著你可以隨意執行命令，嘗試各種命令列選項，而不會損壞輸入檔案。
- 你可以在任何時候結束管道，將管道輸出到 `less`，然後檢視它是否具有預期的形式。這種檢查能力對除錯非常有用。
- 你可以將一個流水線階段的輸出寫入檔案，並將該檔案用作下一階段的輸入。這使你可以重新啟動後面的階段，而無需重新執行整個管道。

因此，與關係資料庫的查詢最佳化器相比，即使 Unix 工具非常簡單，但仍然非常有用，特別是對於實驗而言。

然而，Unix 工具的最大侷限在於它們只能在一臺機器上執行 —— 而 Hadoop 這樣的工具即應運而生。


## MapReduce和分散式檔案系統

MapReduce 有點像 Unix 工具，但分佈在數千臺機器上。像 Unix 工具一樣，它相當簡單粗暴，但令人驚異地管用。一個 MapReduce 作業可以和一個 Unix 程序相類比：它接受一個或多個輸入，並產生一個或多個輸出。

和大多數 Unix 工具一樣，執行 MapReduce 作業通常不會修改輸入，除了生成輸出外沒有任何副作用。輸出檔案以連續的方式一次性寫入（一旦寫入檔案，不會修改任何現有的檔案部分）。

雖然 Unix 工具使用 `stdin` 和 `stdout` 作為輸入和輸出，但 MapReduce 作業在分散式檔案系統上讀寫檔案。在 Hadoop 的 MapReduce 實現中，該檔案系統被稱為 **HDFS（Hadoop 分散式檔案系統）**，一個 Google 檔案系統（GFS）的開源實現【19】。

除 HDFS 外，還有各種其他分散式檔案系統，如 GlusterFS 和 Quantcast File System（QFS）【20】。諸如 Amazon S3、Azure Blob 儲存和 OpenStack Swift【21】等物件儲存服務在很多方面都是相似的 [^iv]。在本章中，我們將主要使用 HDFS 作為示例，但是這些原則適用於任何分散式檔案系統。

[^iv]: 一個不同之處在於，對於 HDFS，可以將計算任務安排在儲存特定檔案副本的計算機上執行，而物件儲存通常將儲存和計算分開。如果網路頻寬是一個瓶頸，從本地磁碟讀取有效能優勢。但是請注意，如果使用糾刪碼（Erasure Coding），則會丟失區域性，因為來自多臺機器的資料必須進行合併以重建原始檔案【20】。

與網路連線儲存（NAS）和儲存區域網路（SAN）架構的共享磁碟方法相比，HDFS 基於 **無共享** 原則（請參閱 [第二部分](/v1_tw/part-ii) 的介紹）。共享磁碟儲存由集中式儲存裝置實現，通常使用定製硬體和專用網路基礎設施（如光纖通道）。而另一方面，無共享方法不需要特殊的硬體，只需要透過傳統資料中心網路連線的計算機。

HDFS 在每臺機器上運行了一個守護程序，它對外暴露網路服務，允許其他節點訪問儲存在該機器上的檔案（假設資料中心中的每臺通用計算機都掛載著一些磁碟）。名為 **NameNode** 的中央伺服器會跟蹤哪個檔案塊儲存在哪臺機器上。因此，HDFS 在概念上建立了一個大型檔案系統，可以使用所有執行有守護程序的機器的磁碟。

為了容忍機器和磁碟故障，檔案塊被複制到多臺機器上。複製可能意味著多個機器上的相同資料的多個副本，如 [第五章](/v1_tw/ch5) 中所述，或者諸如 Reed-Solomon 碼這樣的糾刪碼方案，它能以比完全複製更低的儲存開銷來支援恢復丟失的資料【20,22】。這些技術與 RAID 相似，後者可以在連線到同一臺機器的多個磁碟上提供冗餘；區別在於在分散式檔案系統中，檔案訪問和複製是在傳統的資料中心網路上完成的，沒有特殊的硬體。

HDFS 的可伸縮性已經很不錯了：在撰寫本書時，最大的 HDFS 部署執行在上萬臺機器上，總儲存容量達數百 PB【23】。如此大的規模已經變得可行，因為使用商品硬體和開源軟體的 HDFS 上的資料儲存和訪問成本遠低於在專用儲存裝置上支援同等容量的成本【24】。

### MapReduce作業執行

MapReduce 是一個程式設計框架，你可以使用它編寫程式碼來處理 HDFS 等分散式檔案系統中的大型資料集。理解它的最簡單方法是參考 “[簡單日誌分析](#簡單日誌分析)” 中的 Web 伺服器日誌分析示例。MapReduce 中的資料處理模式與此示例非常相似：

1. 讀取一組輸入檔案，並將其分解成 **記錄（records）**。在 Web 伺服器日誌示例中，每條記錄都是日誌中的一行（即 `\n` 是記錄分隔符）。
2. 呼叫 Mapper 函式，從每條輸入記錄中提取一對鍵值。在前面的例子中，Mapper 函式是 `awk '{print $7}'`：它提取 URL（`$7`）作為鍵，並將值留空。
3. 按鍵排序所有的鍵值對。在日誌的例子中，這由第一個 `sort` 命令完成。
4. 呼叫 Reducer 函式遍歷排序後的鍵值對。如果同一個鍵出現多次，排序使它們在列表中相鄰，所以很容易組合這些值而不必在記憶體中保留很多狀態。在前面的例子中，Reducer 是由 `uniq -c` 命令實現的，該命令使用相同的鍵來統計相鄰記錄的數量。

這四個步驟可以作為一個 MapReduce 作業執行。步驟 2（Map）和 4（Reduce）是你編寫自定義資料處理程式碼的地方。步驟 1（將檔案分解成記錄）由輸入格式解析器處理。步驟 3 中的排序步驟隱含在 MapReduce 中 —— 你不必編寫它，因為 Mapper 的輸出始終在送往 Reducer 之前進行排序。

要建立 MapReduce 作業，你需要實現兩個回撥函式，Mapper 和 Reducer，其行為如下（請參閱 “[MapReduce 查詢](/v1_tw/ch2#MapReduce查詢)”）：

Mapper
: Mapper 會在每條輸入記錄上呼叫一次，其工作是從輸入記錄中提取鍵值。對於每個輸入，它可以生成任意數量的鍵值對（包括零個）。它不會保留從一個輸入記錄到下一個記錄的任何狀態，因此每個記錄都是獨立處理的。

Reducer
: MapReduce 框架拉取由 Mapper 生成的鍵值對，收集屬於同一個鍵的所有值，並在這組值上迭代呼叫 Reducer。Reducer 可以產生輸出記錄（例如相同 URL 的出現次數）。

在 Web 伺服器日誌的例子中，我們在第 5 步中有第二個 `sort` 命令，它按請求數對 URL 進行排序。在 MapReduce 中，如果你需要第二個排序階段，則可以透過編寫第二個 MapReduce 作業並將第一個作業的輸出用作第二個作業的輸入來實現它。這樣看來，Mapper 的作用是將資料整理成適合排序的形式，而 Reducer 的作用是處理已排序的資料。

#### 分散式執行MapReduce

MapReduce 與 Unix 命令管道的主要區別在於，MapReduce 可以在多臺機器上並行執行計算，而無需編寫程式碼來顯式處理並行問題。Mapper 和 Reducer 一次只能處理一條記錄；它們不需要知道它們的輸入來自哪裡，或者輸出去往什麼地方，所以框架可以處理在機器之間移動資料的複雜性。

在分散式計算中可以使用標準的 Unix 工具作為 Mapper 和 Reducer【25】，但更常見的是，它們被實現為傳統程式語言的函式。在 Hadoop MapReduce 中，Mapper 和 Reducer 都是實現特定介面的 Java 類。在 MongoDB 和 CouchDB 中，Mapper 和 Reducer 都是 JavaScript 函式（請參閱 “[MapReduce 查詢](/v1_tw/ch2#MapReduce查詢)”）。

[圖 10-1](/v1/ddia_1001.png) 顯示了 Hadoop MapReduce 作業中的資料流。其並行化基於分割槽（請參閱 [第六章](/v1_tw/ch6)）：作業的輸入通常是 HDFS 中的一個目錄，輸入目錄中的每個檔案或檔案塊都被認為是一個單獨的分割槽，可以單獨處理 map 任務（[圖 10-1](/v1/ddia_1001.png) 中的 m1，m2 和 m3 標記）。

每個輸入檔案的大小通常是數百兆位元組。MapReduce 排程器（圖中未顯示）試圖在其中一臺儲存輸入檔案副本的機器上執行每個 Mapper，只要該機器有足夠的備用 RAM 和 CPU 資源來執行 Mapper 任務【26】。這個原則被稱為 **將計算放在資料附近**【27】：它節省了透過網路複製輸入檔案的開銷，減少網路負載並增加區域性。

![](/v1/ddia_1001.png)

**圖 10-1 具有三個 Mapper 和三個 Reducer 的 MapReduce 任務**

在大多數情況下，應該在 Mapper 任務中執行的應用程式碼在將要執行它的機器上還不存在，所以 MapReduce 框架首先將程式碼（例如 Java 程式中的 JAR 檔案）複製到適當的機器。然後啟動 Map 任務並開始讀取輸入檔案，一次將一條記錄傳入 Mapper 回撥函式。Mapper 的輸出由鍵值對組成。

計算的 Reduce 端也被分割槽。雖然 Map 任務的數量由輸入檔案塊的數量決定，但 Reducer 的任務的數量是由作業作者配置的（它可以不同於 Map 任務的數量）。為了確保具有相同鍵的所有鍵值對最終落在相同的 Reducer 處，框架使用鍵的雜湊值來確定哪個 Reduce 任務應該接收到特定的鍵值對（請參閱 “[根據鍵的雜湊分割槽](/v1_tw/ch6#根據鍵的雜湊分割槽)”）。

鍵值對必須進行排序，但資料集可能太大，無法在單臺機器上使用常規排序演算法進行排序。相反，排序是分階段進行的。首先每個 Map 任務都按照 Reducer 對輸出進行分割槽。每個分割槽都被寫入 Mapper 程式的本地磁碟，使用的技術與我們在 “[SSTables 與 LSM 樹](/v1_tw/ch3#SSTables和LSM樹)” 中討論的類似。

只要當 Mapper 讀取完輸入檔案，並寫完排序後的輸出檔案，MapReduce 排程器就會通知 Reducer 可以從該 Mapper 開始獲取輸出檔案。Reducer 連線到每個 Mapper，並下載自己相應分割槽的有序鍵值對檔案。按 Reducer 分割槽，排序，從 Mapper 向 Reducer 複製分割槽資料，這一整個過程被稱為 **混洗（shuffle）**【26】（一個容易混淆的術語  —— 不像洗牌，在 MapReduce 中的混洗沒有隨機性）。

Reduce 任務從 Mapper 獲取檔案，並將它們合併在一起，並保留有序特性。因此，如果不同的 Mapper 生成了鍵相同的記錄，則在 Reducer 的輸入中，這些記錄將會相鄰。

Reducer 呼叫時會收到一個鍵，和一個迭代器作為引數，迭代器會順序地掃過所有具有該鍵的記錄（因為在某些情況可能無法完全放入記憶體中）。Reducer 可以使用任意邏輯來處理這些記錄，並且可以生成任意數量的輸出記錄。這些輸出記錄會寫入分散式檔案系統上的檔案中（通常是在跑 Reducer 的機器本地磁碟上留一份，並在其他機器上留幾份副本）。

#### MapReduce工作流

單個 MapReduce 作業可以解決的問題範圍很有限。以日誌分析為例，單個 MapReduce 作業可以確定每個 URL 的頁面瀏覽次數，但無法確定最常見的 URL，因為這需要第二輪排序。

因此將 MapReduce 作業連結成 **工作流（workflow）** 是極為常見的，即一個作業的輸出成為下一個作業的輸入。Hadoop MapReduce 框架對工作流沒有特殊支援，所以這個鏈是透過目錄名隱式實現的：第一個作業必須將其輸出配置為 HDFS 中的指定目錄，第二個作業必須將其輸入配置為同一個目錄。從 MapReduce 框架的角度來看，這是兩個獨立的作業。

因此，被連結的 MapReduce 作業並沒有那麼像 Unix 命令管道（它直接將一個程序的輸出作為另一個程序的輸入，僅用一個很小的記憶體緩衝區）。它更像是一系列命令，其中每個命令的輸出寫入臨時檔案，下一個命令從臨時檔案中讀取。這種設計有利也有弊，我們將在 “[物化中間狀態](#物化中間狀態)” 中討論。

只有當作業成功完成後，批處理作業的輸出才會被視為有效的（MapReduce 會丟棄失敗作業的部分輸出）。因此，工作流中的一項作業只有在先前的作業 —— 即生產其輸入的作業 —— 成功完成後才能開始。為了處理這些作業之間的依賴，有很多針對 Hadoop 的工作流排程器被開發出來，包括 Oozie、Azkaban、Luigi、Airflow 和 Pinball 【28】。

這些排程程式還具有管理功能，在維護大量批處理作業時非常有用。在構建推薦系統時，由 50 到 100 個 MapReduce 作業組成的工作流是常見的【29】。而在大型組織中，許多不同的團隊可能執行不同的作業來讀取彼此的輸出。工具支援對於管理這樣複雜的資料流而言非常重要。

Hadoop 的各種高階工具（如 Pig 【30】、Hive 【31】、Cascading 【32】、Crunch 【33】和 FlumeJava 【34】）也能自動佈線組裝多個 MapReduce 階段，生成合適的工作流。

### Reduce側連線與分組

我們在 [第二章](/v1_tw/ch2) 中討論了資料模型和查詢語言的連線，但是我們還沒有深入探討連線是如何實現的。現在是我們再次撿起這條線索的時候了。

在許多資料集中，一條記錄與另一條記錄存在關聯是很常見的：關係模型中的 **外部索引鍵**，文件模型中的 **文件引用** 或圖模型中的 **邊**。當你需要同時訪問這一關聯的兩側（持有引用的記錄與被引用的記錄）時，連線就是必須的。正如 [第二章](/v1_tw/ch2) 所討論的，反正規化可以減少對連線的需求，但通常無法將其完全移除 [^v]。

[^v]: 我們在本書中討論的連線通常是等值連線，即最常見的連線型別，其中記錄透過與其他記錄在特定欄位（例如 ID）中具有 **相同值** 相關聯。有些資料庫支援更通用的連線型別，例如使用小於運算子而不是等號運算子，但是我們沒有地方來講這些東西。

在資料庫中，如果執行只涉及少量記錄的查詢，資料庫通常會使用 **索引** 來快速定位感興趣的記錄（請參閱 [第三章](/v1_tw/ch3)）。如果查詢涉及到連線，則可能涉及到查詢多個索引。然而 MapReduce 沒有索引的概念 —— 至少在通常意義上沒有。

當 MapReduce 作業被賦予一組檔案作為輸入時，它讀取所有這些檔案的全部內容；資料庫會將這種操作稱為 **全表掃描**。如果你只想讀取少量的記錄，則全表掃描與索引查詢相比，代價非常高昂。但是在分析查詢中（請參閱 “[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析？)”），通常需要計算大量記錄的聚合。在這種情況下，特別是如果能在多臺機器上並行處理時，掃描整個輸入可能是相當合理的事情。

當我們在批處理的語境中討論連線時，我們指的是在資料集中解析某種關聯的全量存在。例如我們假設一個作業是同時處理所有使用者的資料，而非僅僅是為某個特定使用者查詢資料（而這能透過索引更高效地完成）。

#### 示例：使用者活動事件分析

[圖 10-2](/v1/ddia_1002.png) 給出了一個批處理作業中連線的典型例子。左側是事件日誌，描述登入使用者在網站上做的事情（稱為 **活動事件**，即 activity events，或 **點選流資料**，即 clickstream data），右側是使用者資料庫。你可以將此示例看作是星型模式的一部分（請參閱 “[星型和雪花型：分析的模式](/v1_tw/ch3#星型和雪花型：分析的模式)”）：事件日誌是事實表，使用者資料庫是其中的一個維度。

![](/v1/ddia_1002.png)

**圖 10-2 使用者行為日誌與使用者檔案的連線**

分析任務可能需要將使用者活動與使用者檔案資訊相關聯：例如，如果檔案包含使用者的年齡或出生日期，系統就可以確定哪些頁面更受哪些年齡段的使用者歡迎。然而活動事件僅包含使用者 ID，而沒有包含完整的使用者檔案資訊。在每個活動事件中嵌入這些檔案資訊很可能會非常浪費。因此，活動事件需要與使用者檔案資料庫相連線。

實現這一連線的最簡單方法是，逐個遍歷活動事件，並為每個遇到的使用者 ID 查詢使用者資料庫（在遠端伺服器上）。這是可能的，但是它的效能可能會非常差：處理吞吐量將受限於資料庫伺服器的往返時間，本地快取的有效性很大程度上取決於資料的分佈，並行執行大量查詢可能會輕易壓垮資料庫【35】。

為了在批處理過程中實現良好的吞吐量，計算必須（儘可能）限於單臺機器上進行。為待處理的每條記錄發起隨機訪問的網路請求實在是太慢了。而且，查詢遠端資料庫意味著批處理作業變為 **非確定的（nondeterministic）**，因為遠端資料庫中的資料可能會改變。

因此，更好的方法是獲取使用者資料庫的副本（例如，使用 ETL 程序從資料庫備份中提取資料，請參閱 “[資料倉庫](/v1_tw/ch3#資料倉庫)”），並將它和使用者行為日誌放入同一個分散式檔案系統中。然後你可以將使用者資料庫儲存在 HDFS 中的一組檔案中，而使用者活動記錄儲存在另一組檔案中，並能用 MapReduce 將所有相關記錄集中到同一個地方進行高效處理。

#### 排序合併連線

回想一下，Mapper 的目的是從每個輸入記錄中提取一對鍵值。在 [圖 10-2](/v1/ddia_1002.png) 的情況下，這個鍵就是使用者 ID：一組 Mapper 會掃過活動事件（提取使用者 ID 作為鍵，活動事件作為值），而另一組 Mapper 將會掃過使用者資料庫（提取使用者 ID 作為鍵，使用者的出生日期作為值）。這個過程如 [圖 10-3](/v1/ddia_1003.png) 所示。

![](/v1/ddia_1003.png)

**圖 10-3 在使用者 ID 上進行的 Reduce 端連線。如果輸入資料集分割槽為多個檔案，則每個分割槽都會被多個 Mapper 並行處理**

當 MapReduce 框架透過鍵對 Mapper 輸出進行分割槽，然後對鍵值對進行排序時，效果是具有相同 ID 的所有活動事件和使用者記錄在 Reducer 輸入中彼此相鄰。Map-Reduce 作業甚至可以安排這些記錄的順序，使 Reducer 總能先看到來自使用者資料庫的記錄，緊接著是按時間戳順序排序的活動事件 —— 這種技術被稱為 **二次排序（secondary sort）**【26】。

然後 Reducer 可以容易地執行實際的連線邏輯：每個使用者 ID 都會被呼叫一次 Reducer 函式，且因為二次排序，第一個值應該是來自使用者資料庫的出生日期記錄。Reducer 將出生日期儲存在區域性變數中，然後使用相同的使用者 ID 遍歷活動事件，輸出 **已觀看網址** 和 **觀看者年齡** 的結果對。隨後的 Map-Reduce 作業可以計算每個 URL 的檢視者年齡分佈，並按年齡段進行聚集。

由於 Reducer 一次處理一個特定使用者 ID 的所有記錄，因此一次只需要將一條使用者記錄儲存在記憶體中，而不需要透過網路發出任何請求。這個演算法被稱為 **排序合併連線（sort-merge join）**，因為 Mapper 的輸出是按鍵排序的，然後 Reducer 將來自連線兩側的有序記錄列表合併在一起。

#### 把相關資料放在一起

在排序合併連線中，Mapper 和排序過程確保了所有對特定使用者 ID 執行連線操作的必須資料都被放在同一個地方：單次呼叫 Reducer 的地方。預先排好了所有需要的資料，Reducer 可以是相當簡單的單執行緒程式碼，能夠以高吞吐量與低記憶體開銷掃過這些記錄。

這種架構可以看做，Mapper 將 “訊息” 傳送給 Reducer。當一個 Mapper 發出一個鍵值對時，這個鍵的作用就像值應該傳遞到的目標地址。即使鍵只是一個任意的字串（不是像 IP 地址和埠號那樣的實際的網路地址），它表現的就像一個地址：所有具有相同鍵的鍵值對將被傳遞到相同的目標（一次 Reducer 的呼叫）。

使用 MapReduce 程式設計模型，能將計算的物理網路通訊層面（從正確的機器獲取資料）從應用邏輯中剝離出來（獲取資料後執行處理）。這種分離與資料庫的典型用法形成了鮮明對比，從資料庫中獲取資料的請求經常出現在應用程式碼內部【36】。由於 MapReduce 處理了所有的網路通訊，因此它也避免了讓應用程式碼去擔心部分故障，例如另一個節點的崩潰：MapReduce 在不影響應用邏輯的情況下能透明地重試失敗的任務。

#### 分組

除了連線之外，“把相關資料放在一起” 的另一種常見模式是，按某個鍵對記錄分組（如 SQL 中的 GROUP BY 子句）。所有帶有相同鍵的記錄構成一個組，而下一步往往是在每個組內進行某種聚合操作，例如：

- 統計每個組中記錄的數量（例如在統計 PV 的例子中，在 SQL 中表示為 `COUNT(*)` 聚合）
- 對某個特定欄位求和（SQL 中的 `SUM(fieldname)`）
- 按某種分級函式取出排名前 k 條記錄。

使用 MapReduce 實現這種分組操作的最簡單方法是設定 Mapper，以便它們生成的鍵值對使用所需的分組鍵。然後分割槽和排序過程將所有具有相同分割槽鍵的記錄導向同一個 Reducer。因此在 MapReduce 之上實現分組和連線看上去非常相似。

分組的另一個常見用途是整理特定使用者會話的所有活動事件，以找出使用者進行的一系列操作（稱為 **會話化（sessionization）**【37】）。例如，可以使用這種分析來確定顯示新版網站的使用者是否比那些顯示舊版本的使用者更有購買慾（A/B 測試），或者計算某個營銷活動是否值得。

如果你有多個 Web 伺服器處理使用者請求，則特定使用者的活動事件很可能分散在各個不同的伺服器的日誌檔案中。你可以透過使用會話 cookie，使用者 ID 或類似的識別符號作為分組鍵，以將特定使用者的所有活動事件放在一起來實現會話化，與此同時，不同使用者的事件仍然散佈在不同的分割槽中。

#### 處理偏斜

如果存在與單個鍵關聯的大量資料，則 “將具有相同鍵的所有記錄放到相同的位置” 這種模式就被破壞了。例如在社交網路中，大多數使用者可能會與幾百人有連線，但少數名人可能有數百萬的追隨者。這種不成比例的活動資料庫記錄被稱為 **關鍵物件（linchpin object）**【38】或 **熱鍵（hot key）**。

在單個 Reducer 中收集與某個名人相關的所有活動（例如他們釋出內容的回覆）可能導致嚴重的 **偏斜**（也稱為 **熱點**，即 hot spot）—— 也就是說，一個 Reducer 必須比其他 Reducer 處理更多的記錄（請參閱 “[負載偏斜與熱點消除](/v1_tw/ch6#負載偏斜與熱點消除)”）。由於 MapReduce 作業只有在所有 Mapper 和 Reducer 都完成時才完成，所有後續作業必須等待最慢的 Reducer 才能啟動。

如果連線的輸入存在熱鍵，可以使用一些演算法進行補償。例如，Pig 中的 **偏斜連線（skewed join）** 方法首先執行一個抽樣作業（Sampling Job）來確定哪些鍵是熱鍵【39】。連線實際執行時，Mapper 會將熱鍵的關聯記錄 **隨機**（相對於傳統 MapReduce 基於鍵雜湊的確定性方法）傳送到幾個 Reducer 之一。對於另外一側的連線輸入，與熱鍵相關的記錄需要被複制到 **所有** 處理該鍵的 Reducer 上【40】。

這種技術將處理熱鍵的工作分散到多個 Reducer 上，這樣可以使其更好地並行化，代價是需要將連線另一側的輸入記錄複製到多個 Reducer 上。Crunch 中的 **分片連線（sharded join）** 方法與之類似，但需要顯式指定熱鍵而不是使用抽樣作業。這種技術也非常類似於我們在 “[負載偏斜與熱點消除](/v1_tw/ch6#負載偏斜與熱點消除)” 中討論的技術，使用隨機化來緩解分割槽資料庫中的熱點。

Hive 的偏斜連線最佳化採取了另一種方法。它需要在表格元資料中顯式指定熱鍵，並將與這些鍵相關的記錄單獨存放，與其它檔案分開。當在該表上執行連線時，對於熱鍵，它會使用 Map 端連線（請參閱下一節）。

當按照熱鍵進行分組並聚合時，可以將分組分兩個階段進行。第一個 MapReduce 階段將記錄傳送到隨機 Reducer，以便每個 Reducer 只對熱鍵的子集執行分組，為每個鍵輸出一個更緊湊的中間聚合結果。然後第二個 MapReduce 作業將所有來自第一階段 Reducer 的中間聚合結果合併為每個鍵一個值。


### Map側連線

上一節描述的連線演算法在 Reducer 中執行實際的連線邏輯，因此被稱為 Reduce 側連線。Mapper 扮演著預處理輸入資料的角色：從每個輸入記錄中提取鍵值，將鍵值對分配給 Reducer 分割槽，並按鍵排序。

Reduce 側方法的優點是不需要對輸入資料做任何假設：無論其屬性和結構如何，Mapper 都可以對其預處理以備連線。然而不利的一面是，排序，複製至 Reducer，以及合併 Reducer 輸入，所有這些操作可能開銷巨大。當資料透過 MapReduce 階段時，資料可能需要落盤好幾次，取決於可用的記憶體緩衝區【37】。

另一方面，如果你 **能** 對輸入資料作出某些假設，則透過使用所謂的 Map 側連線來加快連線速度是可行的。這種方法使用了一個裁減掉 Reducer 與排序的 MapReduce 作業，每個 Mapper 只是簡單地從分散式檔案系統中讀取一個輸入檔案塊，然後將輸出檔案寫入檔案系統，僅此而已。

#### 廣播雜湊連線

適用於執行 Map 端連線的最簡單場景是大資料集與小資料集連線的情況。要點在於小資料集需要足夠小，以便可以將其全部載入到每個 Mapper 的記憶體中。

例如，假設在 [圖 10-2](/v1/ddia_1002.png) 的情況下，使用者資料庫小到足以放進記憶體中。在這種情況下，當 Mapper 啟動時，它可以首先將使用者資料庫從分散式檔案系統讀取到記憶體中的散列表中。完成此操作後，Mapper 可以掃描使用者活動事件，並簡單地在散列表中查詢每個事件的使用者 ID [^vi]。

[^vi]: 這個例子假定散列表中的每個鍵只有一個條目，這對使用者資料庫（使用者 ID 唯一標識一個使用者）可能是正確的。通常，雜湊表可能需要包含具有相同鍵的多個條目，而連線運算子將對每個鍵輸出所有的匹配。

參與連線的較大輸入的每個檔案塊各有一個 Mapper（在 [圖 10-2](/v1/ddia_1002.png) 的例子中活動事件是較大的輸入）。每個 Mapper 都會將較小輸入整個載入到記憶體中。

這種簡單有效的演算法被稱為 **廣播雜湊連線（broadcast hash join）**：**廣播** 一詞反映了這樣一個事實，每個連線較大輸入端分割槽的 Mapper 都會將較小輸入端資料集整個讀入記憶體中（所以較小輸入實際上 “廣播” 到較大資料的所有分割槽上），**雜湊** 一詞反映了它使用一個散列表。Pig（名為 “**複製連結（replicated join）**”），Hive（“**MapJoin**”），Cascading 和 Crunch 支援這種連線。它也被諸如 Impala 的資料倉庫查詢引擎使用【41】。

除了將較小的連線輸入載入到記憶體散列表中，另一種方法是將較小輸入儲存在本地磁碟上的只讀索引中【42】。索引中經常使用的部分將保留在作業系統的頁面快取中，因而這種方法可以提供與記憶體散列表幾乎一樣快的隨機查詢效能，但實際上並不需要資料集能放入記憶體中。

#### 分割槽雜湊連線

如果 Map 側連線的輸入以相同的方式進行分割槽，則雜湊連線方法可以獨立應用於每個分割槽。在 [圖 10-2](/v1/ddia_1002.png) 的情況中，你可以根據使用者 ID 的最後一位十進位制數字來對活動事件和使用者資料庫進行分割槽（因此連線兩側各有 10 個分割槽）。例如，Mapper3 首先將所有 ID 以 3 結尾的使用者載入到散列表中，然後掃描 ID 以 3 結尾的每個使用者的所有活動事件。

如果分割槽正確無誤，可以確定的是，所有你可能需要連線的記錄都落在同一個編號的分割槽中。因此每個 Mapper 只需要從輸入兩端各讀取一個分割槽就足夠了。好處是每個 Mapper 都可以在記憶體散列表中少放點資料。

這種方法只有當連線兩端輸入有相同的分割槽數，且兩側的記錄都是使用相同的鍵與相同的雜湊函式做分割槽時才適用。如果輸入是由之前執行過這種分組的 MapReduce 作業生成的，那麼這可能是一個合理的假設。

分割槽雜湊連線在 Hive 中稱為 **Map 側桶連線（bucketed map joins）**【37】。

#### Map側合併連線

如果輸入資料集不僅以相同的方式進行分割槽，而且還基於相同的鍵進行 **排序**，則可適用另一種 Map 側連線的變體。在這種情況下，輸入是否小到能放入記憶體並不重要，因為這時候 Mapper 同樣可以執行（通常由 Reducer 執行的）歸併操作：按鍵遞增的順序依次讀取兩個輸入檔案，將具有相同鍵的記錄配對。

如果能進行 Map 側合併連線，這通常意味著前一個 MapReduce 作業可能一開始就已經把輸入資料做了分割槽並進行了排序。原則上這個連線就可以在前一個作業的 Reduce 階段進行。但使用獨立的僅 Map 作業有時也是合適的，例如，分好區且排好序的中間資料集可能還會用於其他目的。

#### MapReduce工作流與Map側連線

當下遊作業使用 MapReduce 連線的輸出時，選擇 Map 側連線或 Reduce 側連線會影響輸出的結構。Reduce 側連線的輸出是按照 **連線鍵** 進行分割槽和排序的，而 Map 端連線的輸出則按照與較大輸入相同的方式進行分割槽和排序（因為無論是使用分割槽連線還是廣播連線，連線較大輸入端的每個檔案塊都會啟動一個 Map 任務）。

如前所述，Map 側連線也對輸入資料集的大小，有序性和分割槽方式做出了更多假設。在最佳化連線策略時，瞭解分散式檔案系統中資料集的物理佈局變得非常重要：僅僅知道編碼格式和資料儲存目錄的名稱是不夠的；你還必須知道資料是按哪些鍵做的分割槽和排序，以及分割槽的數量。

在 Hadoop 生態系統中，這種關於資料集分割槽的元資料通常在 HCatalog 和 Hive Metastore 中維護【37】。


### 批處理工作流的輸出

我們已經說了很多用於實現 MapReduce 工作流的演算法，但卻忽略了一個重要的問題：這些處理完成之後的最終結果是什麼？我們最開始為什麼要跑這些作業？

在資料庫查詢的場景中，我們將事務處理（OLTP）與分析兩種目的區分開來（請參閱 “[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析？)”）。我們看到，OLTP 查詢通常根據鍵查詢少量記錄，使用索引，並將其呈現給使用者（比如在網頁上）。另一方面，分析查詢通常會掃描大量記錄，執行分組與聚合，輸出通常有著報告的形式：顯示某個指標隨時間變化的圖表，或按照某種排位取前 10 項，或將一些數字細化為子類。這種報告的消費者通常是需要做出商業決策的分析師或經理。

批處理放哪裡合適？它不屬於事務處理，也不是分析。它和分析比較接近，因為批處理通常會掃過輸入資料集的絕大部分。然而 MapReduce 作業工作流與用於分析目的的 SQL 查詢是不同的（請參閱 “[Hadoop 與分散式資料庫的對比](#Hadoop與分散式資料庫的對比)”）。批處理過程的輸出通常不是報表，而是一些其他型別的結構。

#### 建立搜尋索引

Google 最初使用 MapReduce 是為其搜尋引擎建立索引，其實現為由 5 到 10 個 MapReduce 作業組成的工作流【1】。雖然 Google 後來不再為這個目的使用 MapReduce 【43】，但如果從構建搜尋索引的角度來看，更能幫助理解 MapReduce。（直至今日，Hadoop MapReduce 仍然是為 Lucene/Solr 構建索引的好方法【44】）

我們在 “[全文搜尋和模糊索引](/v1_tw/ch3#全文搜尋和模糊索引)” 中簡要地瞭解了 Lucene 這樣的全文搜尋索引是如何工作的：它是一個檔案（關鍵詞字典），你可以在其中高效地查詢特定關鍵字，並找到包含該關鍵字的所有文件 ID 列表（文章列表）。這是一種非常簡化的看法 —— 實際上，搜尋索引需要各種額外資料，以便根據相關性對搜尋結果進行排名、糾正拼寫錯誤、解析同義詞等等 —— 但這個原則是成立的。

如果需要對一組固定文件執行全文搜尋，則批處理是一種構建索引的高效方法：Mapper 根據需要對文件集合進行分割槽，每個 Reducer 構建該分割槽的索引，並將索引檔案寫入分散式檔案系統。構建這樣的文件分割槽索引（請參閱 “[分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)”）並行處理效果拔群。

由於按關鍵字查詢搜尋索引是隻讀操作，因而這些索引檔案一旦建立就是不可變的。

如果索引的文件集合發生更改，一種選擇是定期重跑整個索引工作流，並在完成後用新的索引檔案批次替換以前的索引檔案。如果只有少量的文件發生了變化，這種方法的計算成本可能會很高。但它的優點是索引過程很容易理解：文件進，索引出。

另一個選擇是，可以增量建立索引。如 [第三章](/v1_tw/ch3) 中討論的，如果要在索引中新增，刪除或更新文件，Lucene 會寫新的段檔案，並在後臺非同步合併壓縮段檔案。我們將在 [第十一章](/v1_tw/ch11) 中看到更多這種增量處理。

#### 鍵值儲存作為批處理輸出

搜尋索引只是批處理工作流可能輸出的一個例子。批處理的另一個常見用途是構建機器學習系統，例如分類器（比如垃圾郵件過濾器，異常檢測，影像識別）與推薦系統（例如，你可能認識的人，你可能感興趣的產品或相關的搜尋【29】）。

這些批處理作業的輸出通常是某種資料庫：例如，可以透過給定使用者 ID 查詢該使用者推薦好友的資料庫，或者可以透過產品 ID 查詢相關產品的資料庫【45】。

這些資料庫需要被處理使用者請求的 Web 應用所查詢，而它們通常是獨立於 Hadoop 基礎設施的。那麼批處理過程的輸出如何回到 Web 應用可以查詢的資料庫中呢？

最直接的選擇可能是，直接在 Mapper 或 Reducer 中使用你最愛的資料庫的客戶端庫，並從批處理作業直接寫入資料庫伺服器，一次寫入一條記錄。它能工作（假設你的防火牆規則允許從你的 Hadoop 環境直接訪問你的生產資料庫），但這並不是一個好主意，出於以下幾個原因：

- 正如前面在連線的上下文中討論的那樣，為每條記錄發起一個網路請求，要比批處理任務的正常吞吐量慢幾個數量級。即使客戶端庫支援批處理，效能也可能很差。
- MapReduce 作業經常並行執行許多任務。如果所有 Mapper 或 Reducer 都同時寫入相同的輸出資料庫，並以批處理的預期速率工作，那麼該資料庫很可能被輕易壓垮，其查詢效能可能變差。這可能會導致系統其他部分的執行問題【35】。
- 通常情況下，MapReduce 為作業輸出提供了一個乾淨利落的 “全有或全無” 保證：如果作業成功，則結果就是每個任務恰好執行一次所產生的輸出，即使某些任務失敗且必須一路重試。如果整個作業失敗，則不會生成輸出。然而從作業內部寫入外部系統，會產生外部可見的副作用，這種副作用是不能以這種方式被隱藏的。因此，你不得不去操心對其他系統可見的部分完成的作業結果，並需要理解 Hadoop 任務嘗試與預測執行的複雜性。

更好的解決方案是在批處理作業 **內** 建立一個全新的資料庫，並將其作為檔案寫入分散式檔案系統中作業的輸出目錄，就像上節中的搜尋索引一樣。這些資料檔案一旦寫入就是不可變的，可以批次載入到處理只讀查詢的伺服器中。不少鍵值儲存都支援在 MapReduce 作業中構建資料庫檔案，包括 Voldemort 【46】、Terrapin 【47】、ElephantDB 【48】和 HBase 批次載入【49】。

構建這些資料庫檔案是 MapReduce 的一種好用法：使用 Mapper 提取出鍵並按該鍵排序，已經完成了構建索引所必需的大量工作。由於這些鍵值儲存大多都是隻讀的（檔案只能由批處理作業一次性寫入，然後就不可變），所以資料結構非常簡單。比如它們就不需要預寫式日誌（WAL，請參閱 “[讓 B 樹更可靠](/v1_tw/ch3#讓B樹更可靠)”）。

將資料載入到 Voldemort 時，伺服器將繼續用舊資料檔案服務請求，同時將新資料檔案從分散式檔案系統複製到伺服器的本地磁碟。一旦複製完成，伺服器會自動將查詢切換到新檔案。如果在這個過程中出現任何問題，它可以輕易回滾至舊檔案，因為它們仍然存在而且不可變【46】。

#### 批處理輸出的哲學

本章前面討論過的 Unix 哲學（“[Unix 哲學](#Unix哲學)”）鼓勵以顯式指明資料流的方式進行實驗：程式讀取輸入並寫入輸出。在這一過程中，輸入保持不變，任何先前的輸出都被新輸出完全替換，且沒有其他副作用。這意味著你可以隨心所欲地重新執行一個命令，略做改動或進行除錯，而不會攪亂系統的狀態。

MapReduce 作業的輸出處理遵循同樣的原理。透過將輸入視為不可變且避免副作用（如寫入外部資料庫），批處理作業不僅實現了良好的效能，而且更容易維護：

- 如果在程式碼中引入了一個錯誤，而輸出錯誤或損壞了，則可以簡單地回滾到程式碼的先前版本，然後重新執行該作業，輸出將重新被糾正。或者，甚至更簡單，你可以將舊的輸出儲存在不同的目錄中，然後切換回原來的目錄。具有讀寫事務的資料庫沒有這個屬性：如果你部署了錯誤的程式碼，將錯誤的資料寫入資料庫，那麼回滾程式碼將無法修復資料庫中的資料。（能夠從錯誤程式碼中恢復的概念被稱為 **人類容錯（human fault tolerance）**【50】）
- 由於回滾很容易，比起在錯誤意味著不可挽回的傷害的環境，功能開發進展能快很多。這種 **最小化不可逆性（minimizing irreversibility）** 的原則有利於敏捷軟體開發【51】。
- 如果 Map 或 Reduce 任務失敗，MapReduce 框架將自動重新排程，並在同樣的輸入上再次執行它。如果失敗是由程式碼中的錯誤造成的，那麼它會不斷崩潰，並最終導致作業在幾次嘗試之後失敗。但是如果故障是由於臨時問題導致的，那麼故障就會被容忍。因為輸入不可變，這種自動重試是安全的，而失敗任務的輸出會被 MapReduce 框架丟棄。
- 同一組檔案可用作各種不同作業的輸入，包括用於計算指標、評估作業輸出是否具有預期性質的監控作業（例如，將其與前一次執行的輸出進行比較並測量差異）。
- 與 Unix 工具類似，MapReduce 作業將邏輯與佈線（配置輸入和輸出目錄）分離，這使得關注點分離，可以重用程式碼：一個團隊可以專注實現一個做好一件事的作業；而其他團隊可以決定何時何地執行這項作業。

在這些領域，在 Unix 上表現良好的設計原則似乎也適用於 Hadoop，但 Unix 和 Hadoop 在某些方面也有所不同。例如，因為大多數 Unix 工具都假設輸入輸出是無型別文字檔案，所以它們必須做大量的輸入解析工作（本章開頭的日誌分析示例使用 `{print $7}` 來提取 URL）。在 Hadoop 上可以透過使用更結構化的檔案格式消除一些低價值的語法轉換：比如 Avro（請參閱 “[Avro](/v1_tw/ch4#Avro)”）和 Parquet（請參閱 “[列式儲存](/v1_tw/ch3#列式儲存)”）經常使用，因為它們提供了基於模式的高效編碼，並允許模式隨時間推移而演進（見 [第四章](/v1_tw/ch4)）。

### Hadoop與分散式資料庫的對比

正如我們所看到的，Hadoop 有點像 Unix 的分散式版本，其中 HDFS 是檔案系統，而 MapReduce 是 Unix 程序的怪異實現（總是在 Map 階段和 Reduce 階段執行 `sort` 工具）。我們瞭解了如何在這些原語的基礎上實現各種連線和分組操作。

當 MapReduce 論文發表時【1】，它從某種意義上來說 —— 並不新鮮。我們在前幾節中討論的所有處理和並行連線演算法已經在十多年前所謂的 **大規模並行處理（MPP，massively parallel processing）** 資料庫中實現了【3,40】。比如 Gamma database machine、Teradata 和 Tandem NonStop SQL 就是這方面的先驅【52】。

最大的區別是，MPP 資料庫專注於在一組機器上並行執行分析 SQL 查詢，而 MapReduce 和分散式檔案系統【19】的組合則更像是一個可以執行任意程式的通用作業系統。

#### 儲存多樣性

資料庫要求你根據特定的模型（例如關係或文件）來構造資料，而分散式檔案系統中的檔案只是位元組序列，可以使用任何資料模型和編碼來編寫。它們可能是資料庫記錄的集合，但同樣可以是文字、影像、影片、感測器讀數、稀疏矩陣、特徵向量、基因組序列或任何其他型別的資料。

說白了，Hadoop 開放了將資料不加區分地轉儲到 HDFS 的可能性，允許後續再研究如何進一步處理【53】。相比之下，在將資料匯入資料庫專有儲存格式之前，MPP 資料庫通常需要對資料和查詢模式進行仔細的前期建模。

在純粹主義者看來，這種仔細的建模和匯入似乎是可取的，因為這意味著資料庫的使用者有更高質量的資料來處理。然而實踐經驗表明，簡單地使資料快速可用 —— 即使它很古怪，難以使用，使用原始格式 —— 也通常要比事先決定理想資料模型要更有價值【54】。

這個想法與資料倉庫類似（請參閱 “[資料倉庫](/v1_tw/ch3#資料倉庫)”）：將大型組織的各個部分的資料集中在一起是很有價值的，因為它可以跨越以前相互分離的資料集進行連線。MPP 資料庫所要求的謹慎模式設計拖慢了集中式資料收集速度；以原始形式收集資料，稍後再操心模式的設計，能使資料收集速度加快（有時被稱為 “**資料湖（data lake）**” 或 “**企業資料中心（enterprise data hub）**”【55】）。

不加區分的資料轉儲轉移了解釋資料的負擔：資料集的生產者不再需要強制將其轉化為標準格式，資料的解釋成為消費者的問題（**讀時模式** 方法【56】；請參閱 “[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)”）。如果生產者和消費者是不同優先順序的不同團隊，這可能是一種優勢。甚至可能不存在一個理想的資料模型，對於不同目的有不同的合適視角。以原始形式簡單地轉儲資料，可以允許多種這樣的轉換。這種方法被稱為 **壽司原則（sushi principle）**：“原始資料更好”【57】。

因此，Hadoop 經常被用於實現 ETL 過程（請參閱 “[資料倉庫](/v1_tw/ch3#資料倉庫)”）：事務處理系統中的資料以某種原始形式轉儲到分散式檔案系統中，然後編寫 MapReduce 作業來清理資料，將其轉換為關係形式，並將其匯入 MPP 資料倉庫以進行分析。資料建模仍然在進行，但它在一個單獨的步驟中進行，與資料收集相解耦。這種解耦是可行的，因為分散式檔案系統支援以任何格式編碼的資料。

#### 處理模型的多樣性

MPP 資料庫是單體的，緊密整合的軟體，負責磁碟上的儲存佈局，查詢計劃，排程和執行。由於這些元件都可以針對資料庫的特定需求進行調整和最佳化，因此整個系統可以在其設計針對的查詢型別上取得非常好的效能。而且，SQL 查詢語言允許以優雅的語法表達查詢，而無需編寫程式碼，可以在業務分析師使用的視覺化工具（例如 Tableau）中訪問到。

另一方面，並非所有型別的處理都可以合理地表達為 SQL 查詢。例如，如果要構建機器學習和推薦系統，或者使用相關性排名模型的全文搜尋索引，或者執行影像分析，則很可能需要更一般的資料處理模型。這些型別的處理通常是特別針對特定應用的（例如機器學習的特徵工程，機器翻譯的自然語言模型，欺詐預測的風險評估函式），因此它們不可避免地需要編寫程式碼，而不僅僅是查詢。

MapReduce 使工程師能夠輕鬆地在大型資料集上執行自己的程式碼。如果你有 HDFS 和 MapReduce，那麼你 **可以** 在它之上建立一個 SQL 查詢執行引擎，事實上這正是 Hive 專案所做的【31】。但是，你也可以編寫許多其他形式的批處理，這些批處理不必非要用 SQL 查詢表示。

隨後，人們發現 MapReduce 對於某些型別的處理而言侷限性很大，表現很差，因此在 Hadoop 之上其他各種處理模型也被開發出來（我們將在 “[MapReduce 之後](#MapReduce之後)” 中看到其中一些）。只有兩種處理模型，SQL 和 MapReduce，還不夠，需要更多不同的模型！而且由於 Hadoop 平臺的開放性，實施一整套方法是可行的，而這在單體 MPP 資料庫的範疇內是不可能的【58】。

至關重要的是，這些不同的處理模型都可以在共享的單個機器叢集上執行，所有這些機器都可以訪問分散式檔案系統上的相同檔案。在 Hadoop 方式中，不需要將資料匯入到幾個不同的專用系統中進行不同型別的處理：系統足夠靈活，可以支援同一個叢集內不同的工作負載。不需要移動資料，使得從資料中挖掘價值變得容易得多，也使採用新的處理模型容易的多。

Hadoop 生態系統包括隨機訪問的 OLTP 資料庫，如 HBase（請參閱 “[SSTables 和 LSM 樹](/v1_tw/ch3#SSTables和LSM樹)”）和 MPP 風格的分析型資料庫，如 Impala 【41】。HBase 與 Impala 都不使用 MapReduce，但都使用 HDFS 進行儲存。它們是迥異的資料訪問與處理方法，但是它們可以共存，並被整合到同一個系統中。

#### 針對頻繁故障設計

當比較 MapReduce 和 MPP 資料庫時，兩種不同的設計思路出現了：處理故障和使用記憶體與磁碟的方式。與線上系統相比，批處理對故障不太敏感，因為就算失敗也不會立即影響到使用者，而且它們總是能再次執行。

如果一個節點在執行查詢時崩潰，大多數 MPP 資料庫會中止整個查詢，並讓使用者重新提交查詢或自動重新執行它【3】。由於查詢通常最多執行幾秒鐘或幾分鐘，所以這種錯誤處理的方法是可以接受的，因為重試的代價不是太大。MPP 資料庫還傾向於在記憶體中保留儘可能多的資料（例如，使用雜湊連線）以避免從磁碟讀取的開銷。

另一方面，MapReduce 透過以單個任務的粒度重試工作，可以容忍單個 Map 或 Reduce 任務的失敗，而不會影響作業的整體。它也會非常急切地將資料寫入磁碟，一方面是為了容錯，另一方面是因為假設資料集太大而不能適應記憶體。

MapReduce 方式更適用於較大的作業：要處理如此之多的資料並執行很長時間的作業，以至於在此過程中很可能至少遇到一個任務故障。在這種情況下，由於單個任務失敗而重新執行整個作業將是非常浪費的。即使以單個任務的粒度進行恢復引入了使得無故障處理更慢的開銷，但如果任務失敗率足夠高，這仍然是一種合理的權衡。

但是這些假設有多麼現實呢？在大多數叢集中，機器故障確實會發生，但是它們不是很頻繁 —— 可能少到絕大多數作業都不會經歷機器故障。為了容錯，真的值得帶來這麼大的額外開銷嗎？

要了解 MapReduce 節約使用記憶體和在任務的層次進行恢復的原因，瞭解最初設計 MapReduce 的環境是很有幫助的。Google 有著混用的資料中心，線上生產服務和離線批處理作業在同樣機器上執行。每個任務都有一個透過容器強制執行的資源配給（CPU 核心、RAM、磁碟空間等）。每個任務也具有優先順序，如果優先順序較高的任務需要更多的資源，則可以終止（搶佔）同一臺機器上較低優先順序的任務以釋放資源。優先順序還決定了計算資源的定價：團隊必須為他們使用的資源付費，而優先順序更高的程序花費更多【59】。

這種架構允許非生產（低優先順序）計算資源被 **過量使用（overcommitted）**，因為系統知道必要時它可以回收資源。與分離生產和非生產任務的系統相比，過量使用資源可以更好地利用機器並提高效率。但由於 MapReduce 作業以低優先順序執行，它們隨時都有被搶佔的風險，因為優先順序較高的程序可能需要其資源。在高優先順序程序拿走所需資源後，批次作業能有效地 “撿麵包屑”，利用剩下的任何計算資源。

在谷歌，執行一個小時的 MapReduce 任務大約有 5% 的風險被終止，以便為更高優先順序的程序騰出資源。這一機率比硬體問題、機器重啟或其他原因的機率高了一個數量級【59】。按照這種搶佔率，如果一個作業有 100 個任務，每個任務執行 10 分鐘，那麼至少有一個任務在完成之前被終止的風險大於 50%。

這就是 MapReduce 被設計為容忍頻繁意外任務終止的原因：不是因為硬體很不可靠，而是因為任意終止程序的自由有利於提高計算叢集中的資源利用率。

在開源的叢集排程器中，搶佔的使用較少。YARN 的 CapacityScheduler 支援搶佔，以平衡不同佇列的資源分配【58】，但在編寫本文時，YARN，Mesos 或 Kubernetes 不支援通用的優先順序搶佔【60】。在任務不經常被終止的環境中，MapReduce 的這一設計決策就沒有多少意義了。在下一節中，我們將研究一些與 MapReduce 設計決策相異的替代方案。


## MapReduce之後

雖然 MapReduce 在 2000 年代後期變得非常流行，並受到大量的炒作，但它只是分散式系統的許多可能的程式設計模型之一。對於不同的資料量，資料結構和處理型別，其他工具可能更適合表示計算。


不管如何，我們在這一章花了大把時間來討論 MapReduce，因為它是一種有用的學習工具，它是分散式檔案系統的一種相當簡單明晰的抽象。在這裡，**簡單** 意味著我們能理解它在做什麼，而不是意味著使用它很簡單。恰恰相反：使用原始的 MapReduce API 來實現複雜的處理工作實際上是非常困難和費力的 —— 例如，任意一種連線演算法都需要你從頭開始實現【37】。

針對直接使用 MapReduce 的困難，在 MapReduce 上有很多高階程式設計模型（Pig、Hive、Cascading、Crunch）被創造出來，作為建立在 MapReduce 之上的抽象。如果你瞭解 MapReduce 的原理，那麼它們學起來相當簡單。而且它們的高階結構能顯著簡化許多常見批處理任務的實現。

但是，MapReduce 執行模型本身也存在一些問題，這些問題並沒有透過增加另一個抽象層次而解決，而對於某些型別的處理，它表現得非常差勁。一方面，MapReduce 非常穩健：你可以使用它在任務會頻繁終止的多租戶系統上處理幾乎任意大量級的資料，並且仍然可以完成工作（雖然速度很慢）。另一方面，對於某些型別的處理而言，其他工具有時會快上幾個數量級。

在本章的其餘部分中，我們將介紹其中一些批處理的替代方案。在 [第十一章](/v1_tw/ch11) 我們將轉向流處理，它可以看作是加速批處理的另一種方法。

### 物化中間狀態

如前所述，每個 MapReduce 作業都獨立於其他任何作業。作業與世界其他地方的主要連線點是分散式檔案系統上的輸入和輸出目錄。如果希望一個作業的輸出成為第二個作業的輸入，則需要將第二個作業的輸入目錄配置為第一個作業輸出目錄，且外部工作流排程程式必須在第一個作業完成後再啟動第二個。

如果第一個作業的輸出是要在組織內廣泛釋出的資料集，則這種配置是合理的。在這種情況下，你需要透過名稱引用它，並將其重用為多個不同作業的輸入（包括由其他團隊開發的作業）。將資料釋出到分散式檔案系統中眾所周知的位置能夠帶來 **松耦合**，這樣作業就不需要知道是誰在提供輸入或誰在消費輸出（請參閱 “[邏輯與佈線相分離](#邏輯與佈線相分離)”）。

但在很多情況下，你知道一個作業的輸出只能用作另一個作業的輸入，這些作業由同一個團隊維護。在這種情況下，分散式檔案系統上的檔案只是簡單的 **中間狀態（intermediate state）**：一種將資料從一個作業傳遞到下一個作業的方式。在一個用於構建推薦系統的，由 50 或 100 個 MapReduce 作業組成的複雜工作流中，存在著很多這樣的中間狀態【29】。

將這個中間狀態寫入檔案的過程稱為 **物化（materialization）**。（在 “[聚合：資料立方體和物化檢視](/v1_tw/ch3#聚合：資料立方體和物化檢視)” 中已經在物化檢視的背景中遇到過這個術語。它意味著對某個操作的結果立即求值並寫出來，而不是在請求時按需計算）

作為對照，本章開頭的日誌分析示例使用 Unix 管道將一個命令的輸出與另一個命令的輸入連線起來。管道並沒有完全物化中間狀態，而是隻使用一個小的記憶體緩衝區，將輸出增量地 **流（stream）** 向輸入。

與 Unix 管道相比，MapReduce 完全物化中間狀態的方法存在不足之處：

- MapReduce 作業只有在前驅作業（生成其輸入）中的所有任務都完成時才能啟動，而由 Unix 管道連線的程序會同時啟動，輸出一旦生成就會被消費。不同機器上的資料偏斜或負載不均意味著一個作業往往會有一些掉隊的任務，比其他任務要慢得多才能完成。必須等待至前驅作業的所有任務完成，拖慢了整個工作流程的執行。
- Mapper 通常是多餘的：它們僅僅是讀取剛剛由 Reducer 寫入的同樣檔案，為下一個階段的分割槽和排序做準備。在許多情況下，Mapper 程式碼可能是前驅 Reducer 的一部分：如果 Reducer 和 Mapper 的輸出有著相同的分割槽與排序方式，那麼 Reducer 就可以直接串在一起，而不用與 Mapper 相互交織。
- 將中間狀態儲存在分散式檔案系統中意味著這些檔案被複制到多個節點，對這些臨時資料這麼搞就比較過分了。

#### 資料流引擎

為了解決 MapReduce 的這些問題，幾種用於分散式批處理的新執行引擎被開發出來，其中最著名的是 Spark 【61,62】，Tez 【63,64】和 Flink 【65,66】。它們的設計方式有很多區別，但有一個共同點：把整個工作流作為單個作業來處理，而不是把它分解為獨立的子作業。

由於它們將工作流顯式建模為資料從幾個處理階段穿過，所以這些系統被稱為 **資料流引擎（dataflow engines）**。像 MapReduce 一樣，它們在單個執行緒上透過反覆呼叫使用者定義的函式來一次處理一條記錄，它們透過輸入分割槽來並行化載荷，它們透過網路將一個函式的輸出複製到另一個函式的輸入。

與 MapReduce 不同，這些函式不需要嚴格扮演交織的 Map 與 Reduce 的角色，而是可以以更靈活的方式進行組合。我們稱這些函式為 **運算元（operators）**，資料流引擎提供了幾種不同的選項來將一個運算元的輸出連線到另一個運算元的輸入：

- 一種選項是對記錄按鍵重新分割槽並排序，就像在 MapReduce 的混洗階段一樣（請參閱 “[分散式執行 MapReduce](#分散式執行MapReduce)”）。這種功能可以用於實現排序合併連線和分組，就像在 MapReduce 中一樣。
- 另一種可能是接受多個輸入，並以相同的方式進行分割槽，但跳過排序。當記錄的分割槽重要但順序無關緊要時，這省去了分割槽雜湊連線的工作，因為構建散列表還是會把順序隨機打亂。
- 對於廣播雜湊連線，可以將一個運算元的輸出，傳送到連線運算元的所有分割槽。

這種型別的處理引擎是基於像 Dryad【67】和 Nephele【68】這樣的研究系統，與 MapReduce 模型相比，它有幾個優點：

- 排序等昂貴的工作只需要在實際需要的地方執行，而不是預設地在每個 Map 和 Reduce 階段之間出現。
- 沒有不必要的 Map 任務，因為 Mapper 所做的工作通常可以合併到前面的 Reduce 運算元中（因為 Mapper 不會更改資料集的分割槽）。
- 由於工作流中的所有連線和資料依賴都是顯式宣告的，因此排程程式能夠總覽全域性，知道哪裡需要哪些資料，因而能夠利用區域性進行最佳化。例如，它可以嘗試將消費某些資料的任務放在與生成這些資料的任務相同的機器上，從而資料可以透過共享記憶體緩衝區傳輸，而不必透過網路複製。
- 通常，運算元間的中間狀態足以儲存在記憶體中或寫入本地磁碟，這比寫入 HDFS 需要更少的 I/O（必須將其複製到多臺機器，並將每個副本寫入磁碟）。MapReduce 已經對 Mapper 的輸出做了這種最佳化，但資料流引擎將這種思想推廣至所有的中間狀態。
- 運算元可以在輸入就緒後立即開始執行；後續階段無需等待前驅階段整個完成後再開始。
- 與 MapReduce（為每個任務啟動一個新的 JVM）相比，現有 Java 虛擬機器（JVM）程序可以重用來執行新運算元，從而減少啟動開銷。

你可以使用資料流引擎執行與 MapReduce 工作流同樣的計算，而且由於此處所述的最佳化，通常執行速度要明顯快得多。既然運算元是 Map 和 Reduce 的泛化，那麼相同的處理程式碼就可以在任一執行引擎上執行：Pig，Hive 或 Cascading 中實現的工作流無需修改程式碼，只需修改配置，就可以簡單地從 MapReduce 切換到 Tez 或 Spark【64】。

Tez 是一個相當薄的庫，它依賴於 YARN shuffle 服務來實現節點間資料的實際複製【58】，而 Spark 和 Flink 則是包含了獨立網路通訊層，排程器，及面向使用者的 API 的大型框架。我們稍後會討論這些高階 API。

#### 容錯

完全物化中間狀態至分散式檔案系統的一個優點是，它具有永續性，這使得 MapReduce 中的容錯相當容易：如果一個任務失敗，它可以在另一臺機器上重新啟動，並從檔案系統重新讀取相同的輸入。

Spark、Flink 和 Tez 避免將中間狀態寫入 HDFS，因此它們採取了不同的方法來容錯：如果一臺機器發生故障，並且該機器上的中間狀態丟失，則它會從其他仍然可用的資料重新計算（在可行的情況下是先前的中間狀態，要麼就只能是原始輸入資料，通常在 HDFS 上）。

為了實現這種重新計算，框架必須跟蹤一個給定的資料是如何計算的 —— 使用了哪些輸入分割槽？應用了哪些運算元？ Spark 使用 **彈性分散式資料集（RDD，Resilient Distributed Dataset）** 的抽象來跟蹤資料的譜系【61】，而 Flink 對運算元狀態存檔，允許恢復執行在執行過程中遇到錯誤的運算元【66】。

在重新計算資料時，重要的是要知道計算是否是 **確定性的**：也就是說，給定相同的輸入資料，運算元是否始終產生相同的輸出？如果一些丟失的資料已經發送給下游運算元，這個問題就很重要。如果運算元重新啟動，重新計算的資料與原有的丟失資料不一致，下游運算元很難解決新舊資料之間的矛盾。對於不確定性運算元來說，解決方案通常是殺死下游運算元，然後再重跑新資料。

為了避免這種級聯故障，最好讓運算元具有確定性。但需要注意的是，非確定性行為很容易悄悄溜進來：例如，許多程式語言在迭代雜湊表的元素時不能對順序作出保證，許多機率和統計算法顯式依賴於使用隨機數，以及用到系統時鐘或外部資料來源，這些都是不確定性的行為。為了能可靠地從故障中恢復，需要消除這種不確定性因素，例如使用固定的種子生成偽隨機數。

透過重算資料來從故障中恢復並不總是正確的答案：如果中間狀態資料要比源資料小得多，或者如果計算量非常大，那麼將中間資料物化為檔案可能要比重新計算廉價的多。

#### 關於物化的討論

回到 Unix 的類比，我們看到，MapReduce 就像是將每個命令的輸出寫入臨時檔案，而資料流引擎看起來更像是 Unix 管道。尤其是 Flink 是基於管道執行的思想而建立的：也就是說，將運算元的輸出增量地傳遞給其他運算元，不待輸入完成便開始處理。

排序運算元不可避免地需要消費全部的輸入後才能生成任何輸出，因為輸入中最後一條輸入記錄可能具有最小的鍵，因此需要作為第一條記錄輸出。因此，任何需要排序的運算元都需要至少暫時地累積狀態。但是工作流的許多其他部分可以以流水線方式執行。

當作業完成時，它的輸出需要持久化到某個地方，以便使用者可以找到並使用它 —— 很可能它會再次寫入分散式檔案系統。因此，在使用資料流引擎時，HDFS 上的物化資料集通常仍是作業的輸入和最終輸出。和 MapReduce 一樣，輸入是不可變的，輸出被完全替換。比起 MapReduce 的改進是，你不用再自己去將中間狀態寫入檔案系統了。

### 圖與迭代處理

在 “[圖資料模型](/v1_tw/ch2#圖資料模型)” 中，我們討論了使用圖來建模資料，並使用圖查詢語言來遍歷圖中的邊與點。[第二章](/v1_tw/ch2) 的討論集中在 OLTP 風格的應用場景：快速執行查詢來查詢少量符合特定條件的頂點。

批處理上下文中的圖也很有趣，其目標是在整個圖上執行某種離線處理或分析。這種需求經常出現在機器學習應用（如推薦引擎）或排序系統中。例如，最著名的圖形分析演算法之一是 PageRank 【69】，它試圖根據連結到某個網頁的其他網頁來估計該網頁的流行度。它作為配方的一部分，用於確定網路搜尋引擎呈現結果的順序。

> 像 Spark、Flink 和 Tez 這樣的資料流引擎（請參閱 “[物化中間狀態](#物化中間狀態)”）通常將運算元作為 **有向無環圖（DAG）** 的一部分安排在作業中。這與圖處理不一樣：在資料流引擎中，**從一個運算元到另一個運算元的資料流** 被構造成一個圖，而資料本身通常由關係型元組構成。在圖處理中，資料本身具有圖的形式。又一個不幸的命名混亂！

許多圖演算法是透過一次遍歷一條邊來表示的，將一個頂點與近鄰的頂點連線起來，以傳播一些資訊，並不斷重複，直到滿足一些條件為止 —— 例如，直到沒有更多的邊要跟進，或直到一些指標收斂。我們在 [圖 2-6](/v1/ddia_0206.png) 中看到一個例子，它透過重複跟進標明地點歸屬關係的邊，生成了資料庫中北美包含的所有地點列表（這種演算法被稱為 **傳遞閉包**，即 transitive closure）。

可以在分散式檔案系統中儲存圖（包含頂點和邊的列表的檔案），但是這種 “重複至完成” 的想法不能用普通的 MapReduce 來表示，因為它只掃過一趟資料。這種演算法因此經常以 **迭代** 的風格實現：

1. 外部排程程式執行批處理來計算演算法的一個步驟。
2. 當批處理過程完成時，排程器檢查它是否完成（基於完成條件 —— 例如，沒有更多的邊要跟進，或者與上次迭代相比的變化低於某個閾值）。
3. 如果尚未完成，則排程程式返回到步驟 1 並執行另一輪批處理。

這種方法是有效的，但是用 MapReduce 實現它往往非常低效，因為 MapReduce 沒有考慮演算法的迭代性質：它總是讀取整個輸入資料集併產生一個全新的輸出資料集，即使與上次迭代相比，改變的僅僅是圖中的一小部分。

#### Pregel處理模型

針對圖批處理的最佳化 —— **批次同步並行（BSP，Bulk Synchronous Parallel）** 計算模型【70】已經開始流行起來。其中，Apache Giraph 【37】，Spark 的 GraphX API 和 Flink 的 Gelly API 【71】實現了它。它也被稱為 **Pregel** 模型，因為 Google 的 Pregel 論文推廣了這種處理圖的方法【72】。

回想一下在 MapReduce 中，Mapper 在概念上向 Reducer 的特定呼叫 “傳送訊息”，因為框架將所有具有相同鍵的 Mapper 輸出集中在一起。Pregel 背後有一個類似的想法：一個頂點可以向另一個頂點 “傳送訊息”，通常這些訊息是沿著圖的邊傳送的。

在每次迭代中，為每個頂點呼叫一個函式，將所有傳送給它的訊息傳遞給它 —— 就像呼叫 Reducer 一樣。與 MapReduce 的不同之處在於，在 Pregel 模型中，頂點在一次迭代到下一次迭代的過程中會記住它的狀態，所以這個函式只需要處理新的傳入訊息。如果圖的某個部分沒有被傳送訊息，那裡就不需要做任何工作。

這與 Actor 模型有些相似（請參閱 “[分散式的 Actor 框架](/v1_tw/ch4#分散式的Actor框架)”），除了頂點狀態和頂點之間的訊息具有容錯性和永續性，且通訊以固定的回合進行：在每次迭代中，框架遞送上次迭代中傳送的所有訊息。Actor 通常沒有這樣的時序保證。

#### 容錯

頂點只能透過訊息傳遞進行通訊（而不是直接相互查詢）的事實有助於提高 Pregel 作業的效能，因為訊息可以成批處理，且等待通訊的次數也減少了。唯一的等待是在迭代之間：由於 Pregel 模型保證所有在一輪迭代中傳送的訊息都在下輪迭代中送達，所以在下一輪迭代開始前，先前的迭代必須完全完成，而所有的訊息必須在網路上完成複製。

即使底層網路可能丟失、重複或任意延遲訊息（請參閱 “[不可靠的網路](/v1_tw/ch8#不可靠的網路)”），Pregel 的實現能保證在後續迭代中訊息在其目標頂點恰好處理一次。像 MapReduce 一樣，框架能從故障中透明地恢復，以簡化在 Pregel 上實現演算法的程式設計模型。

這種容錯是透過在迭代結束時，定期存檔所有頂點的狀態來實現的，即將其全部狀態寫入持久化儲存。如果某個節點發生故障並且其記憶體中的狀態丟失，則最簡單的解決方法是將整個圖計算回滾到上一個存檔點，然後重啟計算。如果演算法是確定性的，且訊息記錄在日誌中，那麼也可以選擇性地只恢復丟失的分割槽（就像之前討論過的資料流引擎）【72】。

#### 並行執行

頂點不需要知道它在哪臺物理機器上執行；當它向其他頂點發送訊息時，它只是簡單地將訊息發往某個頂點 ID。圖的分割槽取決於框架 —— 即，確定哪個頂點執行在哪臺機器上，以及如何透過網路路由訊息，以便它們到達正確的地方。

由於程式設計模型一次僅處理一個頂點（有時稱為 “像頂點一樣思考”），所以框架可以以任意方式對圖分割槽。理想情況下如果頂點需要進行大量的通訊，那麼它們最好能被分割槽到同一臺機器上。然而找到這樣一種最佳化的分割槽方法是很困難的 —— 在實踐中，圖經常按照任意分配的頂點 ID 分割槽，而不會嘗試將相關的頂點分組在一起。

因此，圖演算法通常會有很多跨機器通訊的額外開銷，而中間狀態（節點之間傳送的訊息）往往比原始圖大。透過網路傳送訊息的開銷會顯著拖慢分散式圖演算法的速度。

出於這個原因，如果你的圖可以放入一臺計算機的記憶體中，那麼單機（甚至可能是單執行緒）演算法很可能會超越分散式批處理【73,74】。圖比記憶體大也沒關係，只要能放入單臺計算機的磁碟，使用 GraphChi 等框架進行單機處理就是一個可行的選擇【75】。如果圖太大，不適合單機處理，那麼像 Pregel 這樣的分散式方法是不可避免的。高效的並行圖演算法是一個進行中的研究領域【76】。


### 高階API和語言

自 MapReduce 開始流行的這幾年以來，分散式批處理的執行引擎已經很成熟了。到目前為止，基礎設施已經足夠強大，能夠儲存和處理超過 10,000 臺機器叢集上的數 PB 的資料。由於在這種規模下物理執行批處理的問題已經被認為或多或少解決了，所以關注點已經轉向其他領域：改進程式設計模型，提高處理效率，擴大這些技術可以解決的問題集。

如前所述，Hive、Pig、Cascading 和 Crunch 等高階語言和 API 變得越來越流行，因為手寫 MapReduce 作業實在是個苦力活。隨著 Tez 的出現，這些高階語言還有一個額外好處，可以遷移到新的資料流執行引擎，而無需重寫作業程式碼。Spark 和 Flink 也有它們自己的高階資料流 API，通常是從 FlumeJava 中獲取的靈感【34】。

這些資料流 API 通常使用關係型構建塊來表達一個計算：按某個欄位連線資料集；按鍵對元組做分組；按某些條件過濾；並透過計數求和或其他函式來聚合元組。在內部，這些操作是使用本章前面討論過的各種連線和分組演算法來實現的。

除了少寫程式碼的明顯優勢之外，這些高階介面還支援互動式用法，在這種互動式使用中，你可以在 Shell 中增量式編寫分析程式碼，頻繁執行來觀察它做了什麼。這種開發風格在探索資料集和試驗處理方法時非常有用。這也讓人聯想到 Unix 哲學，我們在 “[Unix 哲學](#Unix哲學)” 中討論過這個問題。

此外，這些高階介面不僅提高了人類的工作效率，也提高了機器層面的作業執行效率。

#### 向宣告式查詢語言的轉變

與硬寫執行連線的程式碼相比，指定連線關係運算元的優點是，框架可以分析連線輸入的屬性，並自動決定哪種上述連線演算法最適合當前任務。Hive、Spark 和 Flink 都有基於代價的查詢最佳化器可以做到這一點，甚至可以改變連線順序，最小化中間狀態的數量【66,77,78,79】。

連線演算法的選擇可以對批處理作業的效能產生巨大影響，而如果不必理解和記住本章中討論的各種連線演算法，那自然再好不過。如果連線是以 **宣告式（declarative）** 的方式指定的，那這就是可行的：應用只是簡單地說明哪些連線是必需的，查詢最佳化器決定如何最好地執行連線。我們以前在 “[資料查詢語言](/v1_tw/ch2#資料查詢語言)” 中見過這個想法。

但 MapReduce 及其資料流後繼者在其他方面，與 SQL 的完全宣告式查詢模型有很大區別。MapReduce 是圍繞著回撥函式的概念建立的：對於每條記錄或者一組記錄，呼叫一個使用者定義的函式（Mapper 或 Reducer），並且該函式可以自由地呼叫任意程式碼來決定輸出什麼。這種方法的優點是可以基於大量已有庫的生態系統創作：解析、自然語言分析、影像分析以及執行數值或統計算法等。

自由執行任意程式碼，長期以來都是傳統 MapReduce 批處理系統與 MPP 資料庫的區別所在（請參閱 “[Hadoop 與分散式資料庫的對比](#Hadoop與分散式資料庫的對比)” 一節）。雖然資料庫具有編寫使用者定義函式的功能，但是它們通常使用起來很麻煩，而且與大多數程式語言中廣泛使用的程式包管理器和依賴管理系統相容不佳（例如 Java 的 Maven、Javascript 的 npm 以及 Ruby 的 gems）。

然而資料流引擎已經發現，支援除連線之外的更多 **宣告式特性** 還有其他的優勢。例如，如果一個回撥函式只包含一個簡單的過濾條件，或者只是從一條記錄中選擇了一些欄位，那麼在為每條記錄呼叫函式時會有相當大的額外 CPU 開銷。如果以宣告方式表示這些簡單的過濾和對映操作，那麼查詢最佳化器可以利用列式儲存佈局（請參閱 “[列式儲存](/v1_tw/ch3#列式儲存)”），只從磁碟讀取所需的列。Hive、Spark DataFrames 和 Impala 還使用了向量化執行（請參閱 “[記憶體頻寬和向量化處理](/v1_tw/ch3#記憶體頻寬和向量化處理)”）：在對 CPU 快取友好的內部迴圈中迭代資料，避免函式呼叫。Spark 生成 JVM 位元組碼【79】，Impala 使用 LLVM 為這些內部迴圈生成本機程式碼【41】。

透過在高階 API 中引入宣告式的部分，並使查詢最佳化器可以在執行期間利用這些來做最佳化，批處理框架看起來越來越像 MPP 資料庫了（並且能實現可與之媲美的效能）。同時，透過擁有執行任意程式碼和以任意格式讀取資料的可擴充套件性，它們保持了靈活性的優勢。

#### 專業化的不同領域

儘管能夠執行任意程式碼的可擴充套件性是很有用的，但是也有很多常見的例子，不斷重複著標準的處理模式。因而這些模式值得擁有自己的可重用通用構建模組實現。傳統上，MPP 資料庫滿足了商業智慧分析和業務報表的需求，但這只是許多使用批處理的領域之一。

另一個越來越重要的領域是統計和數值演算法，它們是機器學習應用所需要的（例如分類器和推薦系統）。可重用的實現正在出現：例如，Mahout 在 MapReduce、Spark 和 Flink 之上實現了用於機器學習的各種演算法，而 MADlib 在關係型 MPP 資料庫（Apache HAWQ）中實現了類似的功能【54】。

空間演算法也是有用的，例如 **k 近鄰搜尋（k-nearest neighbors, kNN）**【80】，它在一些多維空間中搜索與給定項最近的專案 —— 這是一種相似性搜尋。近似搜尋對於基因組分析演算法也很重要，它們需要找到相似但不相同的字串【81】。

批處理引擎正被用於分散式執行日益廣泛的各領域演算法。隨著批處理系統獲得各種內建功能以及高階宣告式運算元，且隨著 MPP 資料庫變得更加靈活和易於程式設計，兩者開始看起來相似了：最終，它們都只是儲存和處理資料的系統。


## 本章小結

在本章中，我們探索了批處理的主題。我們首先看到了諸如 awk、grep 和 sort 之類的 Unix 工具，然後我們看到了這些工具的設計理念是如何應用到 MapReduce 和更近的資料流引擎中的。一些設計原則包括：輸入是不可變的，輸出是為了作為另一個（仍未知的）程式的輸入，而複雜的問題是透過編寫 “做好一件事” 的小工具來解決的。

在 Unix 世界中，允許程式與程式組合的統一介面是檔案與管道；在 MapReduce 中，該介面是一個分散式檔案系統。我們看到資料流引擎添加了自己的管道式資料傳輸機制，以避免將中間狀態物化至分散式檔案系統，但作業的初始輸入和最終輸出通常仍是 HDFS。

分散式批處理框架需要解決的兩個主要問題是：

分割槽
: 在 MapReduce 中，Mapper 根據輸入檔案塊進行分割槽。Mapper 的輸出被重新分割槽、排序併合併到可配置數量的 Reducer 分割槽中。這一過程的目的是把所有的 **相關** 資料（例如帶有相同鍵的所有記錄）都放在同一個地方。
  後 MapReduce 時代的資料流引擎若非必要會盡量避免排序，但它們也採取了大致類似的分割槽方法。

容錯
: MapReduce 經常寫入磁碟，這使得從單個失敗的任務恢復很輕鬆，無需重新啟動整個作業，但在無故障的情況下減慢了執行速度。資料流引擎更多地將中間狀態儲存在記憶體中，更少地物化中間狀態，這意味著如果節點發生故障，則需要重算更多的資料。確定性運算元減少了需要重算的資料量。


我們討論了幾種 MapReduce 的連線演算法，其中大多數也在 MPP 資料庫和資料流引擎內部使用。它們也很好地演示了分割槽演算法是如何工作的：

排序合併連線
: 每個參與連線的輸入都透過一個提取連線鍵的 Mapper。透過分割槽、排序和合並，具有相同鍵的所有記錄最終都會進入相同的 Reducer 呼叫。這個函式能輸出連線好的記錄。

廣播雜湊連線
: 兩個連線輸入之一很小，所以它並沒有分割槽，而且能被完全載入進一個雜湊表中。因此，你可以為連線輸入大端的每個分割槽啟動一個 Mapper，將輸入小端的散列表載入到每個 Mapper 中，然後掃描大端，一次一條記錄，併為每條記錄查詢散列表。

分割槽雜湊連線
: 如果兩個連線輸入以相同的方式分割槽（使用相同的鍵，相同的雜湊函式和相同數量的分割槽），則可以獨立地對每個分割槽應用散列表方法。

分散式批處理引擎有一個刻意限制的程式設計模型：回撥函式（比如 Mapper 和 Reducer）被假定是無狀態的，而且除了指定的輸出外，必須沒有任何外部可見的副作用。這一限制允許框架在其抽象下隱藏一些困難的分散式系統問題：當遇到崩潰和網路問題時，任務可以安全地重試，任何失敗任務的輸出都被丟棄。如果某個分割槽的多個任務成功，則其中只有一個能使其輸出實際可見。

得益於這個框架，你在批處理作業中的程式碼無需操心實現容錯機制：框架可以保證作業的最終輸出與沒有發生錯誤的情況相同，雖然實際上也許不得不重試各種任務。比起線上服務一邊處理使用者請求一邊將寫入資料庫作為處理請求的副作用，批處理提供的這種可靠性語義要強得多。

批處理作業的顯著特點是，它讀取一些輸入資料併產生一些輸出資料，但不修改輸入 —— 換句話說，輸出是從輸入衍生出的。最關鍵的是，輸入資料是 **有界的（bounded）**：它有一個已知的，固定的大小（例如，它包含一些時間點的日誌檔案或資料庫內容的快照）。因為它是有界的，一個作業知道自己什麼時候完成了整個輸入的讀取，所以一個工作在做完後，最終總是會完成的。

在下一章中，我們將轉向流處理，其中的輸入是 **無界的（unbounded）** —— 也就是說，你依然有一個作業，但它的輸入是永無止境的資料流。在這種情況下，作業永無完成之日，因為在任何時候都可能有更多的工作湧入。我們將看到，在某些方面上，流處理和批處理是相似的，但是關於無盡資料流的假設也對我們構建系統的方式產生了很多改變。


## 參考文獻

1. Jeffrey Dean and Sanjay Ghemawat: “[MapReduce: Simplified Data Processing on Large Clusters](https://research.google/pubs/pub62/),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Joel Spolsky: “[The Perils of JavaSchools](https://www.joelonsoftware.com/2005/12/29/the-perils-of-javaschools-2/),” *joelonsoftware.com*, December 29, 2005.
1. Shivnath Babu and Herodotos Herodotou: “[Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf),” *Foundations and Trends in Databases*, volume 5, number 1, pages 1–104, November 2013. [doi:10.1561/1900000036](http://dx.doi.org/10.1561/1900000036)
1. David J. DeWitt and Michael Stonebraker: “[MapReduce: A Major Step Backwards](https://homes.cs.washington.edu/~billhowe/mapreduce_a_major_step_backwards.html),” originally published at *databasecolumn.vertica.com*, January 17, 2008.
1. Henry Robinson: “[The Elephant Was a Trojan Horse: On the Death of Map-Reduce at Google](https://www.the-paper-trail.org/post/2014-06-25-the-elephant-was-a-trojan-horse-on-the-death-of-map-reduce-at-google/),” *the-paper-trail.org*, June 25, 2014.
1. “[The Hollerith Machine](https://www.census.gov/history/www/innovations/technology/the_hollerith_tabulator.html),” United States Census Bureau, *census.gov*.
1. “[IBM 82, 83, and 84 Sorters Reference Manual](https://bitsavers.org/pdf/ibm/punchedCard/Sorter/A24-1034-1_82-83-84_sorters.pdf),” Edition A24-1034-1, International Business Machines Corporation, July 1962.
1. Adam Drake: “[Command-Line Tools Can Be 235x Faster than Your Hadoop Cluster](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html),” *aadrake.com*, January 25, 2014.
1. “[GNU Coreutils 8.23 Documentation](http://www.gnu.org/software/coreutils/manual/html_node/index.html),” Free Software Foundation, Inc., 2014.
1. Martin Kleppmann: “[Kafka, Samza, and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/2015/08/05/kafka-samza-unix-philosophy-distributed-data.html),” *martin.kleppmann.com*, August 5, 2015.
1. Doug McIlroy: [Internal Bell Labs memo](https://swtch.com/~rsc/thread/mdmpipe.pdf), October 1964. Cited in: Dennis M. Ritchie: “[Advice from Doug McIlroy](https://www.bell-labs.com/usr/dmr/www/mdmpipe.html),” *bell-labs.com*.
1. M. D. McIlroy, E. N. Pinson, and B. A. Tague: “[UNIX Time-Sharing System: Foreword](https://archive.org/details/bstj57-6-1899),” *The Bell System Technical Journal*, volume 57, number 6, pages 1899–1904, July 1978.
1. Eric S. Raymond: [*The Art of UNIX Programming*](http://www.catb.org/~esr/writings/taoup/html/). Addison-Wesley, 2003. ISBN: 978-0-13-142901-7
1. Ronald Duncan: “[Text File Formats – ASCII Delimited Text – Not CSV or TAB Delimited Text](https://ronaldduncan.wordpress.com/2009/10/31/text-file-formats-ascii-delimited-text-not-csv-or-tab-delimited-text/),” *ronaldduncan.wordpress.com*, October 31, 2009.
1. Alan Kay: “[Is 'Software Engineering' an Oxymoron?](http://tinlizzie.org/~takashi/IsSoftwareEngineeringAnOxymoron.pdf),” *tinlizzie.org*.
1. Martin Fowler: “[InversionOfControl](http://martinfowler.com/bliki/InversionOfControl.html),” *martinfowler.com*, June 26, 2005.
1. Daniel J. Bernstein: “[Two File Descriptors for Sockets](http://cr.yp.to/tcpip/twofd.html),” *cr.yp.to*.
1. Rob Pike and Dennis M. Ritchie: “[The Styx Architecture for Distributed Systems](http://doc.cat-v.org/inferno/4th_edition/styx),” *Bell Labs Technical Journal*, volume 4, number 2, pages 146–152, April 1999.
1. Sanjay Ghemawat, Howard Gobioff, and Shun-Tak Leung: “[The Google File System](http://research.google.com/archive/gfs-sosp2003.pdf),” at *19th ACM Symposium on Operating Systems Principles* (SOSP), October 2003. [doi:10.1145/945445.945450](http://dx.doi.org/10.1145/945445.945450)
1. Michael Ovsiannikov, Silvius Rus, Damian Reeves, et al.: “[The Quantcast File System](http://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p808-ovsiannikov.pdf),” *Proceedings of the VLDB Endowment*, volume 6, number 11, pages 1092–1101, August 2013. [doi:10.14778/2536222.2536234](http://dx.doi.org/10.14778/2536222.2536234)
1. “[OpenStack Swift 2.6.1 Developer Documentation](http://docs.openstack.org/developer/swift/),” OpenStack Foundation, *docs.openstack.org*, March 2016.
1. Zhe Zhang, Andrew Wang, Kai Zheng, et al.: “[Introduction to HDFS Erasure Coding in Apache Hadoop](https://blog.cloudera.com/introduction-to-hdfs-erasure-coding-in-apache-hadoop/),” *blog.cloudera.com*, September 23, 2015.
1. Peter Cnudde: “[Hadoop Turns 10](https://web.archive.org/web/20190119112713/https://yahoohadoop.tumblr.com/post/138739227316/hadoop-turns-10),” *yahoohadoop.tumblr.com*, February 5, 2016.
1. Eric Baldeschwieler: “[Thinking About the HDFS vs. Other Storage Technologies](https://web.archive.org/web/20190529215115/http://hortonworks.com/blog/thinking-about-the-hdfs-vs-other-storage-technologies/),” *hortonworks.com*, July 25, 2012.
1. Brendan Gregg: “[Manta: Unix Meets Map Reduce](https://web.archive.org/web/20220125052545/http://dtrace.org/blogs/brendan/2013/06/25/manta-unix-meets-map-reduce/),” *dtrace.org*, June 25, 2013.
1. Tom White: *Hadoop: The Definitive Guide*, 4th edition. O'Reilly Media, 2015. ISBN: 978-1-491-90163-2
1. Jim N. Gray: “[Distributed Computing Economics](http://arxiv.org/pdf/cs/0403019.pdf),” Microsoft Research Tech Report MSR-TR-2003-24, March 2003.
1. Márton Trencséni: “[Luigi vs Airflow vs Pinball](http://bytepawn.com/luigi-airflow-pinball.html),” *bytepawn.com*, February 6, 2016.
1. Roshan Sumbaly, Jay Kreps, and Sam Shah: “[The 'Big Data' Ecosystem at LinkedIn](http://www.slideshare.net/s_shah/the-big-data-ecosystem-at-linkedin-23512853),” at *ACM International Conference on Management of Data* (SIGMOD), July 2013. [doi:10.1145/2463676.2463707](http://dx.doi.org/10.1145/2463676.2463707)
1. Alan F. Gates, Olga Natkovich, Shubham Chopra, et al.: “[Building a High-Level Dataflow System on Top of Map-Reduce: The Pig Experience](http://www.vldb.org/pvldb/vol2/vldb09-1074.pdf),” at *35th International Conference on Very Large Data Bases* (VLDB), August 2009.
1. Ashish Thusoo, Joydeep Sen Sarma, Namit Jain, et al.: “[Hive – A Petabyte Scale Data Warehouse Using Hadoop](http://i.stanford.edu/~ragho/hive-icde2010.pdf),” at *26th IEEE International Conference on Data Engineering* (ICDE), March 2010. [doi:10.1109/ICDE.2010.5447738](http://dx.doi.org/10.1109/ICDE.2010.5447738)
1. “[Cascading 3.0 User Guide](https://web.archive.org/web/20231206195311/http://docs.cascading.org/cascading/3.0/userguide/),” Concurrent, Inc., *docs.cascading.org*, January 2016.
1. “[Apache Crunch User Guide](https://crunch.apache.org/user-guide.html),” Apache Software Foundation, *crunch.apache.org*.
1. Craig Chambers, Ashish Raniwala, Frances Perry, et al.: “[FlumeJava: Easy, Efficient Data-Parallel Pipelines](https://research.google.com/pubs/archive/35650.pdf),” at *31st ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2010. [doi:10.1145/1806596.1806638](http://dx.doi.org/10.1145/1806596.1806638)
1. Jay Kreps: “[Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing),” *oreilly.com*, July 31, 2014.
1. Martin Kleppmann: “[Rethinking Caching in Web Apps](http://martin.kleppmann.com/2012/10/01/rethinking-caching-in-web-apps.html),” *martin.kleppmann.com*, October 1, 2012.
1. Mark Grover, Ted Malaska, Jonathan Seidman, and Gwen Shapira: *[Hadoop Application Architectures](http://shop.oreilly.com/product/0636920033196.do)*. O'Reilly Media, 2015. ISBN: 978-1-491-90004-8
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Sriranjan Manjunath: “[Skewed Join](https://web.archive.org/web/20151228114742/https://wiki.apache.org/pig/PigSkewedJoinSpec),” *wiki.apache.org*, 2009.
1. David J. DeWitt, Jeffrey F. Naughton, Donovan A. Schneider, and S. Seshadri: “[Practical Skew Handling in Parallel Joins](http://www.vldb.org/conf/1992/P027.PDF),” at *18th International Conference on Very Large Data Bases* (VLDB), August 1992.
1. Marcel Kornacker, Alexander Behm, Victor Bittorf, et al.: “[Impala: A Modern, Open-Source SQL Engine for Hadoop](http://pandis.net/resources/cidr15impala.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Matthieu Monsch: “[Open-Sourcing PalDB, a Lightweight Companion for Storing Side Data](https://engineering.linkedin.com/blog/2015/10/open-sourcing-paldb--a-lightweight-companion-for-storing-side-da),” *engineering.linkedin.com*, October 26, 2015.
1. Daniel Peng and Frank Dabek: “[Large-Scale Incremental Processing Using Distributed Transactions and Notifications](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Peng.pdf),” at *9th USENIX conference on Operating Systems Design and Implementation* (OSDI), October 2010.
1. “[Cloudera Search User Guide](http://www.cloudera.com/documentation/cdh/5-1-x/Search/Cloudera-Search-User-Guide/Cloudera-Search-User-Guide.html),” Cloudera, Inc., September 2015.
1. Lili Wu, Sam Shah, Sean Choi, et al.: “[The Browsemaps: Collaborative Filtering at LinkedIn](http://ceur-ws.org/Vol-1271/Paper3.pdf),” at *6th Workshop on Recommender Systems and the Social Web* (RSWeb), October 2014.
1. Roshan Sumbaly, Jay Kreps, Lei Gao, et al.: “[Serving Large-Scale Batch Computed Data with Project Voldemort](http://static.usenix.org/events/fast12/tech/full_papers/Sumbaly.pdf),” at *10th USENIX Conference on File and Storage Technologies* (FAST), February 2012.
1. Varun Sharma: “[Open-Sourcing Terrapin: A Serving System for Batch Generated Data](https://web.archive.org/web/20170215032514/https://engineering.pinterest.com/blog/open-sourcing-terrapin-serving-system-batch-generated-data-0),” *engineering.pinterest.com*, September 14, 2015.
1. Nathan Marz: “[ElephantDB](http://www.slideshare.net/nathanmarz/elephantdb),” *slideshare.net*, May 30, 2011.
1. Jean-Daniel (JD) Cryans: “[How-to: Use HBase Bulk Loading, and Why](https://blog.cloudera.com/how-to-use-hbase-bulk-loading-and-why/),” *blog.cloudera.com*, September 27, 2013.
1. Nathan Marz: “[How to Beat the CAP Theorem](http://nathanmarz.com/blog/how-to-beat-the-cap-theorem.html),” *nathanmarz.com*, October 13, 2011.
1. Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
1. David J. DeWitt and Jim N. Gray: “[Parallel Database Systems: The Future of High Performance Database Systems](http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/dewittgray92.pdf),” *Communications of the ACM*, volume 35, number 6, pages 85–98, June 1992. [doi:10.1145/129888.129894](http://dx.doi.org/10.1145/129888.129894)
1. Jay Kreps: “[But the multi-tenancy thing is actually really really hard](https://twitter.com/jaykreps/status/528235702480142336),” tweetstorm, *twitter.com*, October 31, 2014.
1. Jeffrey Cohen, Brian Dolan, Mark Dunlap, et al.: “[MAD Skills: New Analysis Practices for Big Data](http://www.vldb.org/pvldb/vol2/vldb09-219.pdf),” *Proceedings of the VLDB Endowment*, volume 2, number 2, pages 1481–1492, August 2009. [doi:10.14778/1687553.1687576](http://dx.doi.org/10.14778/1687553.1687576)
1. Ignacio Terrizzano, Peter Schwarz, Mary Roth, and John E. Colino: “[Data Wrangling: The Challenging Journey from the Wild to the Lake](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper2.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Paige Roberts: “[To Schema on Read or to Schema on Write, That Is the Hadoop Data Lake Question](https://web.archive.org/web/20171105001306/http://adaptivesystemsinc.com/blog/to-schema-on-read-or-to-schema-on-write-that-is-the-hadoop-data-lake-question/),” *adaptivesystemsinc.com*, July 2, 2015.
1. Bobby Johnson and Joseph Adler: “[The Sushi Principle: Raw Data Is Better](https://web.archive.org/web/20161126104941/https://conferences.oreilly.com/strata/big-data-conference-ca-2015/public/schedule/detail/38737),” at *Strata+Hadoop World*, February 2015.
1. Vinod Kumar Vavilapalli, Arun C. Murthy, Chris Douglas, et al.: “[Apache Hadoop YARN: Yet Another Resource Negotiator](https://www.cs.cmu.edu/~garth/15719/papers/yarn.pdf),” at *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523633](http://dx.doi.org/10.1145/2523616.2523633)
1. Abhishek Verma, Luis Pedrosa, Madhukar Korupolu, et al.: “[Large-Scale Cluster Management at Google with Borg](http://research.google.com/pubs/pub43438.html),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741964](http://dx.doi.org/10.1145/2741948.2741964)
1. Malte Schwarzkopf: “[The Evolution of Cluster Scheduler Architectures](https://web.archive.org/web/20201109052657/http://www.firmament.io/blog/scheduler-architectures.html),” *firmament.io*, March 9, 2016.
1. Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, et al.: “[Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf),” at *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
1. Holden Karau, Andy Konwinski, Patrick Wendell, and Matei Zaharia: *Learning Spark*. O'Reilly Media, 2015. ISBN: 978-1-449-35904-1
1. Bikas Saha and Hitesh Shah: “[Apache Tez: Accelerating Hadoop Query Processing](http://www.slideshare.net/Hadoop_Summit/w-1205phall1saha),” at *Hadoop Summit*, June 2014.
1. Bikas Saha, Hitesh Shah, Siddharth Seth, et al.: “[Apache Tez: A Unifying Framework for Modeling and Building Data Processing Applications](http://home.cse.ust.hk/~weiwa/teaching/Fall15-COMP6611B/reading_list/Tez.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742790](http://dx.doi.org/10.1145/2723372.2742790)
1. Kostas Tzoumas: “[Apache Flink: API, Runtime, and Project Roadmap](http://www.slideshare.net/KostasTzoumas/apache-flink-api-runtime-and-project-roadmap),” *slideshare.net*, January 14, 2015.
1. Alexander Alexandrov, Rico Bergmann, Stephan Ewen, et al.: “[The Stratosphere Platform for Big Data Analytics](https://ssc.io/pdf/2014-VLDBJ_Stratosphere_Overview.pdf),” *The VLDB Journal*, volume 23, number 6, pages 939–964, May 2014. [doi:10.1007/s00778-014-0357-y](http://dx.doi.org/10.1007/s00778-014-0357-y)
1. Michael Isard, Mihai Budiu, Yuan Yu, et al.: “[Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks](https://www.microsoft.com/en-us/research/publication/dryad-distributed-data-parallel-programs-from-sequential-building-blocks/),” at *European Conference on Computer Systems* (EuroSys), March 2007. [doi:10.1145/1272996.1273005](http://dx.doi.org/10.1145/1272996.1273005)
1. Daniel Warneke and Odej Kao: “[Nephele: Efficient Parallel Data Processing in the Cloud](https://stratosphere2.dima.tu-berlin.de/assets/papers/Nephele_09.pdf),” at *2nd Workshop on Many-Task Computing on Grids and Supercomputers* (MTAGS), November 2009. [doi:10.1145/1646468.1646476](http://dx.doi.org/10.1145/1646468.1646476)
1. Lawrence Page, Sergey Brin, Rajeev Motwani, and Terry Winograd: “[The PageRank Citation Ranking: Bringing Order to the Web](https://web.archive.org/web/20230219170930/http://ilpubs.stanford.edu:8090/422/),” Stanford InfoLab Technical Report 422, 1999.
1. Leslie G. Valiant: “[A Bridging Model for Parallel Computation](http://dl.acm.org/citation.cfm?id=79181),” *Communications of the ACM*, volume 33, number 8, pages 103–111, August 1990. [doi:10.1145/79173.79181](http://dx.doi.org/10.1145/79173.79181)
1. Stephan Ewen, Kostas Tzoumas, Moritz Kaufmann, and Volker Markl: “[Spinning Fast Iterative Data Flows](http://vldb.org/pvldb/vol5/p1268_stephanewen_vldb2012.pdf),” *Proceedings of the VLDB Endowment*, volume 5, number 11, pages 1268-1279, July 2012. [doi:10.14778/2350229.2350245](http://dx.doi.org/10.14778/2350229.2350245)
1. Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, et al.: “[Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2010. [doi:10.1145/1807167.1807184](http://dx.doi.org/10.1145/1807167.1807184)
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Ionel Gog, Malte Schwarzkopf, Natacha Crooks, et al.: “[Musketeer: All for One, One for All in Data Processing Systems](http://www.cl.cam.ac.uk/research/srg/netos/camsas/pubs/eurosys15-musketeer.pdf),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741968](http://dx.doi.org/10.1145/2741948.2741968)
1. Aapo Kyrola, Guy Blelloch, and Carlos Guestrin: “[GraphChi: Large-Scale Graph Computation on Just a PC](https://www.usenix.org/system/files/conference/osdi12/osdi12-final-126.pdf),” at *10th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2012.
1. Andrew Lenharth, Donald Nguyen, and Keshav Pingali: “[Parallel Graph Analytics](http://cacm.acm.org/magazines/2016/5/201591-parallel-graph-analytics/fulltext),” *Communications of the ACM*, volume 59, number 5, pages 78–87, May 2016. [doi:10.1145/2901919](http://dx.doi.org/10.1145/2901919)
1. Fabian Hüske: “[Peeking into Apache Flink's Engine Room](http://flink.apache.org/news/2015/03/13/peeking-into-Apache-Flinks-Engine-Room.html),” *flink.apache.org*, March 13, 2015.
1. Mostafa Mokhtar: “[Hive 0.14 Cost Based Optimizer (CBO) Technical Overview](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/),” *hortonworks.com*, March 2, 2015.
1. Michael Armbrust, Reynold S Xin, Cheng Lian, et al.: “[Spark SQL: Relational Data Processing in Spark](http://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742797](http://dx.doi.org/10.1145/2723372.2742797)
1. Daniel Blazevski: “[Planting Quadtrees for Apache Flink](https://blog.insightdatascience.com/planting-quadtrees-for-apache-flink-b396ebc80d35),” *insightdataengineering.com*, March 25, 2016.
1. Tom White: “[Genome Analysis Toolkit: Now Using Apache Spark for Data Processing](https://web.archive.org/web/20190215132904/http://blog.cloudera.com/blog/2016/04/genome-analysis-toolkit-now-using-apache-spark-for-data-processing/),” *blog.cloudera.com*, April 6, 2016.

================================================
FILE: content/v1_tw/ch11.md
================================================
---
title: "第十一章：流處理"
linkTitle: "11. 流處理"
weight: 311
math: true
breadcrumbs: false
---


![](/map/ch11.png)

> 有效的複雜系統總是從簡單的系統演化而來。反之亦然：從零設計的複雜系統沒一個能有效工作的。
>
> —— 約翰・加爾，Systemantics（1975）


在 [第十章](/v1_tw/ch10) 中，我們討論了批處理技術，它讀取一組檔案作為輸入，並生成一組新的檔案作為輸出。輸出是 **衍生資料（derived data）** 的一種形式；也就是說，如果需要，可以透過再次執行批處理過程來重新建立資料集。我們看到了如何使用這個簡單而強大的想法來建立搜尋索引、推薦系統、做分析等等。

然而，在 [第十章](/v1_tw/ch10) 中仍然有一個很大的假設：即輸入是有界的，即已知和有限的大小，所以批處理知道它何時完成輸入的讀取。例如，MapReduce 核心的排序操作必須讀取其全部輸入，然後才能開始生成輸出：可能發生這種情況：最後一條輸入記錄具有最小的鍵，因此需要第一個被輸出，所以提早開始輸出是不可行的。

實際上，很多資料是 **無界限** 的，因為它隨著時間的推移而逐漸到達：你的使用者在昨天和今天產生了資料，明天他們將繼續產生更多的資料。除非你停業，否則這個過程永遠都不會結束，所以資料集從來就不會以任何有意義的方式 “完成”【1】。因此，批處理程式必須將資料人為地分成固定時間段的資料塊，例如，在每天結束時處理一天的資料，或者在每小時結束時處理一小時的資料。

日常批處理中的問題是，輸入的變更只會在一天之後的輸出中反映出來，這對於許多急躁的使用者來說太慢了。為了減少延遲，我們可以更頻繁地執行處理 ——  比如說，在每秒鐘的末尾 —— 或者甚至更連續一些，完全拋開固定的時間切片，當事件發生時就立即進行處理，這就是 **流處理（stream processing）** 背後的想法。

一般來說，“流” 是指隨著時間的推移逐漸可用的資料。這個概念出現在很多地方：Unix 的 stdin 和 stdout、程式語言（惰性列表）【2】、檔案系統 API（如 Java 的 `FileInputStream`）、TCP 連線、透過網際網路傳送音訊和影片等等。

在本章中，我們將把 **事件流（event stream）** 視為一種資料管理機制：無界限，增量處理，與上一章中的批次資料相對應。我們將首先討論怎樣表示、儲存、透過網路傳輸流。在 “[資料庫與流](#資料庫與流)” 中，我們將研究流和資料庫之間的關係。最後在 “[流處理](#流處理)” 中，我們將研究連續處理這些流的方法和工具，以及它們用於應用構建的方式。


## 傳遞事件流

在批處理領域，作業的輸入和輸出是檔案（也許在分散式檔案系統上）。流處理領域中的等價物看上去是什麼樣子的？

當輸入是一個檔案（一個位元組序列），第一個處理步驟通常是將其解析為一系列記錄。在流處理的上下文中，記錄通常被叫做 **事件（event）** ，但它本質上是一樣的：一個小的、自包含的、不可變的物件，包含某個時間點發生的某件事情的細節。一個事件通常包含一個來自日曆時鐘的時間戳，以指明事件發生的時間（請參閱 “[單調鍾與日曆時鐘](/v1_tw/ch8#單調鍾與日曆時鐘)”）。

例如，發生的事件可能是使用者採取的行動，例如檢視頁面或進行購買。它也可能來源於機器，例如對溫度感測器或 CPU 利用率的週期性測量。在 “[使用 Unix 工具的批處理](/v1_tw/ch10#使用Unix工具的批處理)” 的示例中，Web 伺服器日誌的每一行都是一個事件。

事件可能被編碼為文字字串或 JSON，或者某種二進位制編碼，如 [第四章](/v1_tw/ch4) 所述。這種編碼允許你儲存一個事件，例如將其追加到一個檔案，將其插入關係表，或將其寫入文件資料庫。它還允許你透過網路將事件傳送到另一個節點以進行處理。

在批處理中，檔案被寫入一次，然後可能被多個作業讀取。類似地，在流處理術語中，一個事件由 **生產者（producer）** （也稱為 **釋出者（publisher）** 或 **傳送者（sender）** ）生成一次，然後可能由多個 **消費者（consumer）** （ **訂閱者（subscribers）** 或 **接收者（recipients）** ）進行處理【3】。在檔案系統中，檔名標識一組相關記錄；在流式系統中，相關的事件通常被聚合為一個 **主題（topic）** 或 **流（stream）** 。

原則上講，檔案或資料庫就足以連線生產者和消費者：生產者將其生成的每個事件寫入資料儲存，且每個消費者定期輪詢資料儲存，檢查自上次執行以來新出現的事件。這實際上正是批處理在每天結束時處理當天資料時所做的事情。

但當我們想要進行低延遲的連續處理時，如果資料儲存不是為這種用途專門設計的，那麼輪詢開銷就會很大。輪詢的越頻繁，能返回新事件的請求比例就越低，而額外開銷也就越高。相比之下，最好能在新事件出現時直接通知消費者。

資料庫在傳統上對這種通知機制支援的並不好，關係型資料庫通常有 **觸發器（trigger）** ，它們可以對變化（如，插入表中的一行）作出反應，但是它們的功能非常有限，並且在資料庫設計中多少算是事後才補上的功能【4,5】。相應的是，已經開發了專門的工具來提供事件通知。


### 訊息傳遞系統

向消費者通知新事件的常用方式是使用 **訊息傳遞系統（messaging system）**：生產者傳送包含事件的訊息，然後將訊息推送給消費者。我們之前在 “[訊息傳遞中的資料流](/v1_tw/ch4#訊息傳遞中的資料流)” 中談到了這些系統，但現在我們將詳細介紹這些系統。

像生產者和消費者之間的 Unix 管道或 TCP 連線這樣的直接通道，是實現訊息傳遞系統的簡單方法。但是，大多數訊息傳遞系統都在這一基本模型上進行了擴充套件。特別的是，Unix 管道和 TCP 將恰好一個傳送者與恰好一個接收者連線，而一個訊息傳遞系統允許多個生產者節點將訊息傳送到同一個主題，並允許多個消費者節點接收主題中的訊息。

在這個 **釋出 / 訂閱** 模式中，不同的系統採取各種各樣的方法，並沒有針對所有目的的通用答案。為了區分這些系統，問一下這兩個問題會特別有幫助：

1. **如果生產者傳送訊息的速度比消費者能夠處理的速度快會發生什麼？** 一般來說，有三種選擇：系統可以丟掉訊息，將訊息放入緩衝佇列，或使用 **背壓**（backpressure，也稱為 **流量控制**，即 flow control：阻塞生產者，以免其傳送更多的訊息）。例如 Unix 管道和 TCP 就使用了背壓：它們有一個固定大小的小緩衝區，如果填滿，傳送者會被阻塞，直到接收者從緩衝區中取出資料（請參閱 “[網路擁塞和排隊](/v1_tw/ch8#網路擁塞和排隊)”）。

   如果訊息被快取在佇列中，那麼理解佇列增長會發生什麼是很重要的。當佇列裝不進記憶體時系統會崩潰嗎？還是將訊息寫入磁碟？如果是這樣，磁碟訪問又會如何影響訊息傳遞系統的效能【6】？

2. **如果節點崩潰或暫時離線，會發生什麼情況？ —— 是否會有訊息丟失？** 與資料庫一樣，永續性可能需要寫入磁碟和 / 或複製的某種組合（請參閱 “[複製與永續性](/v1_tw/ch7#複製與永續性)”），這是有代價的。如果你能接受有時訊息會丟失，則可能在同一硬體上獲得更高的吞吐量和更低的延遲。

是否可以接受訊息丟失取決於應用。例如，對於週期傳輸的感測器讀數和指標，偶爾丟失的資料點可能並不重要，因為更新的值會在短時間內發出。但要注意，如果大量的訊息被丟棄，可能無法立刻意識到指標已經不正確了【7】。如果你正在對事件計數，那麼它們能夠可靠送達是更重要的，因為每個丟失的訊息都意味著使計數器的錯誤擴大。

我們在 [第十章](/v1_tw/ch10) 中探討的批處理系統的一個很好的特性是，它們提供了強大的可靠性保證：失敗的任務會自動重試，失敗任務的部分輸出會自動丟棄。這意味著輸出與沒有發生故障一樣，這有助於簡化程式設計模型。在本章的後面，我們將研究如何在流處理的上下文中提供類似的保證。

#### 直接從生產者傳遞給消費者

許多訊息傳遞系統使用生產者和消費者之間的直接網路通訊，而不透過中間節點：

* UDP 組播廣泛應用於金融行業，例如股票市場，其中低時延非常重要【8】。雖然 UDP 本身是不可靠的，但應用層的協議可以恢復丟失的資料包（生產者必須記住它傳送的資料包，以便能按需重新發送資料包）。
* 無代理的訊息庫，如 ZeroMQ 【9】和 nanomsg 採取類似的方法，透過 TCP 或 IP 多播實現釋出 / 訂閱訊息傳遞。
* StatsD 【10】和 Brubeck 【7】使用不可靠的 UDP 訊息傳遞來收集網路中所有機器的指標並對其進行監控。（在 StatsD 協議中，只有接收到所有訊息，才認為計數器指標是正確的；使用 UDP 將使得指標處在一種最佳近似狀態【11】。另請參閱 “[TCP 與 UDP](/v1_tw/ch8#TCP與UDP)”）
* 如果消費者在網路上公開了服務，生產者可以直接傳送 HTTP 或 RPC 請求（請參閱 “[服務中的資料流：REST 與 RPC](/v1_tw/ch4#服務中的資料流：REST與RPC)”）將訊息推送給使用者。這就是 webhooks 背後的想法【12】，一種服務的回撥 URL 被註冊到另一個服務中，並且每當事件發生時都會向該 URL 發出請求。

儘管這些直接訊息傳遞系統在設計它們的環境中執行良好，但是它們通常要求應用程式碼意識到訊息丟失的可能性。它們的容錯程度極為有限：即使協議檢測到並重傳在網路中丟失的資料包，它們通常也只是假設生產者和消費者始終線上。

如果消費者處於離線狀態，則可能會丟失其不可達時傳送的訊息。一些協議允許生產者重試失敗的訊息傳遞，但當生產者崩潰時，它可能會丟失訊息緩衝區及其本應傳送的訊息，這種方法可能就沒用了。

#### 訊息代理

一種廣泛使用的替代方法是透過 **訊息代理**（message broker，也稱為 **訊息佇列**，即 message queue）傳送訊息，訊息代理實質上是一種針對處理訊息流而最佳化的資料庫。它作為伺服器執行，生產者和消費者作為客戶端連線到伺服器。生產者將訊息寫入代理，消費者透過從代理那裡讀取來接收訊息。

透過將資料集中在代理上，這些系統可以更容易地容忍來來去去的客戶端（連線，斷開連線和崩潰），而永續性問題則轉移到代理的身上。一些訊息代理只將訊息儲存在記憶體中，而另一些訊息代理（取決於配置）將其寫入磁碟，以便在代理崩潰的情況下不會丟失。針對緩慢的消費者，它們通常會允許無上限的排隊（而不是丟棄訊息或背壓），儘管這種選擇也可能取決於配置。

排隊的結果是，消費者通常是 **非同步（asynchronous）** 的：當生產者傳送訊息時，通常只會等待代理確認訊息已經被快取，而不等待訊息被消費者處理。向消費者遞送訊息將發生在未來某個未定的時間點 —— 通常在幾分之一秒之內，但有時當訊息堆積時會顯著延遲。

#### 訊息代理與資料庫的對比

有些訊息代理甚至可以使用 XA 或 JTA 參與兩階段提交協議（請參閱 “[實踐中的分散式事務](/v1_tw/ch9#實踐中的分散式事務)”）。這個功能與資料庫在本質上非常相似，儘管訊息代理和資料庫之間仍存在實踐上很重要的差異：

* 資料庫通常保留資料直至顯式刪除，而大多數訊息代理在訊息成功遞送給消費者時會自動刪除訊息。這樣的訊息代理不適合長期的資料儲存。
* 由於它們很快就能刪除訊息，大多數訊息代理都認為它們的工作集相當小 —— 即佇列很短。如果代理需要緩衝很多訊息，比如因為消費者速度較慢（如果記憶體裝不下訊息，可能會溢位到磁碟），每個訊息需要更長的處理時間，整體吞吐量可能會惡化【6】。
* 資料庫通常支援次級索引和各種搜尋資料的方式，而訊息代理通常支援按照某種模式匹配主題，訂閱其子集。雖然機制並不一樣，但對於客戶端選擇想要了解的資料的一部分，都是基本的方式。
* 查詢資料庫時，結果通常基於某個時間點的資料快照；如果另一個客戶端隨後向資料庫寫入一些改變了查詢結果的內容，則第一個客戶端不會發現其先前結果現已過期（除非它重複查詢或輪詢變更）。相比之下，訊息代理不支援任意查詢，但是當資料發生變化時（即新訊息可用時），它們會通知客戶端。

這是關於訊息代理的傳統觀點，它被封裝在諸如 JMS 【14】和 AMQP 【15】的標準中，並且被諸如 RabbitMQ、ActiveMQ、HornetQ、Qpid、TIBCO 企業訊息服務、IBM MQ、Azure Service Bus 和 Google Cloud Pub/Sub 所實現 【16】。

#### 多個消費者

當多個消費者從同一主題中讀取訊息時，有兩種主要的訊息傳遞模式，如 [圖 11-1](/v1/ddia_1101.png) 所示：

負載均衡（load balancing）
: 每條訊息都被傳遞給消費者 **之一**，所以處理該主題下訊息的工作能被多個消費者共享。代理可以為消費者任意分配訊息。當處理訊息的代價高昂，希望能並行處理訊息時，此模式非常有用（在 AMQP 中，可以透過讓多個客戶端從同一個佇列中消費來實現負載均衡，而在 JMS 中則稱之為 **共享訂閱**，即 shared subscription）。

扇出（fan-out）
: 每條訊息都被傳遞給 **所有** 消費者。扇出允許幾個獨立的消費者各自 “收聽” 相同的訊息廣播，而不會相互影響 ——  這個流處理中的概念對應批處理中多個不同批處理作業讀取同一份輸入檔案 （JMS 中的主題訂閱與 AMQP 中的交換器繫結（exchange bindings）提供了這一功能）。

![](/v1/ddia_1101.png)

**圖 11-1 （a）負載平衡：在消費者間共享消費主題；（b）扇出：將每條訊息傳遞給多個消費者。**

兩種模式可以組合使用：例如，兩個獨立的消費者組可以每組各訂閱同一個主題，每一組都共同收到所有訊息，但在每一組內部，每條訊息僅由單個節點處理。

#### 確認與重新傳遞

消費者隨時可能會崩潰，所以有一種可能的情況是：代理向消費者遞送訊息，但消費者沒有處理，或者在消費者崩潰之前只進行了部分處理。為了確保訊息不會丟失，訊息代理使用 **確認（acknowledgments）**：客戶端必須顯式告知代理訊息處理完畢的時間，以便代理能將訊息從佇列中移除。

如果與客戶端的連線關閉，或者代理超出一段時間未收到確認，代理則認為訊息沒有被處理，因此它將訊息再遞送給另一個消費者。（請注意可能發生這樣的情況，訊息 **實際上是** 處理完畢的，但 **確認** 在網路中丟失了。需要一種原子提交協議才能處理這種情況，正如在 “[實踐中的分散式事務](/v1_tw/ch9#實踐中的分散式事務)” 中所討論的那樣）

當與負載均衡相結合時，這種重傳行為對訊息的順序有種有趣的影響。在 [圖 11-2](/v1/ddia_1102.png) 中，消費者通常按照生產者傳送的順序處理訊息。然而消費者 2 在處理訊息 m3 時崩潰，與此同時消費者 1 正在處理訊息 m4。未確認的訊息 m3 隨後被重新發送給消費者 1，結果消費者 1 按照 m4，m3，m5 的順序處理訊息。因此 m3 和 m4 的交付順序與生產者 1 的傳送順序不同。

![](/v1/ddia_1102.png)

**圖 11-2 在處理 m3 時消費者 2 崩潰，因此稍後重傳至消費者 1**

即使訊息代理試圖保留訊息的順序（如 JMS 和 AMQP 標準所要求的），負載均衡與重傳的組合也不可避免地導致訊息被重新排序。為避免此問題，你可以讓每個消費者使用單獨的佇列（即不使用負載均衡功能）。如果訊息是完全獨立的，則訊息順序重排並不是一個問題。但正如我們將在本章後續部分所述，如果訊息之間存在因果依賴關係，這就是一個很重要的問題。

### 分割槽日誌

透過網路傳送資料包或向網路服務傳送請求通常是短暫的操作，不會留下永久的痕跡。儘管可以永久記錄（透過抓包與日誌），但我們通常不這麼做。即使是將訊息持久地寫入磁碟的訊息代理，在送達給消費者之後也會很快刪除訊息，因為它們建立在短暫訊息傳遞的思維方式上。

資料庫和檔案系統採用截然相反的方法論：至少在某人顯式刪除前，通常寫入資料庫或檔案的所有內容都要被永久記錄下來。

這種思維方式上的差異對建立衍生資料的方式有巨大影響。如 [第十章](/v1_tw/ch10) 所述，批處理過程的一個關鍵特性是，你可以反覆執行它們，試驗處理步驟，不用擔心損壞輸入（因為輸入是隻讀的）。而 AMQP/JMS 風格的訊息傳遞並非如此：收到訊息是具有破壞性的，因為確認可能導致訊息從代理中被刪除，因此你不能期望再次運行同一個消費者能得到相同的結果。

如果你將新的消費者新增到訊息傳遞系統，通常只能接收到消費者註冊之後開始傳送的訊息。先前的任何訊息都隨風而逝，一去不復返。作為對比，你可以隨時為檔案和資料庫新增新的客戶端，且能讀取任意久遠的資料（只要應用沒有顯式覆蓋或刪除這些資料）。

為什麼我們不能把它倆雜交一下，既有資料庫的持久儲存方式，又有訊息傳遞的低延遲通知？這就是 **基於日誌的訊息代理（log-based message brokers）** 背後的想法。

#### 使用日誌進行訊息儲存

日誌只是磁碟上簡單的僅追加記錄序列。我們先前在 [第三章](/v1_tw/ch3) 中日誌結構儲存引擎和預寫式日誌的上下文中討論了日誌，在 [第五章](/v1_tw/ch5) 複製的上下文裡也討論了它。

同樣的結構可以用於實現訊息代理：生產者透過將訊息追加到日誌末尾來發送訊息，而消費者透過依次讀取日誌來接收訊息。如果消費者讀到日誌末尾，則會等待新訊息追加的通知。Unix 工具 `tail -f` 能監視檔案被追加寫入的資料，基本上就是這樣工作的。

為了伸縮超出單個磁碟所能提供的更高吞吐量，可以對日誌進行 **分割槽**（按 [第六章](/v1_tw/ch6) 的定義）。不同的分割槽可以託管在不同的機器上，使得每個分割槽都有一份能獨立於其他分割槽進行讀寫的日誌。一個主題可以定義為一組攜帶相同型別訊息的分割槽。這種方法如 [圖 11-3](/v1/ddia_1103.png) 所示。

在每個分割槽內，代理為每個訊息分配一個單調遞增的序列號或 **偏移量**（offset，在 [圖 11-3](/v1/ddia_1103.png) 中，框中的數字是訊息偏移量）。這種序列號是有意義的，因為分割槽是僅追加寫入的，所以分割槽內的訊息是完全有序的。沒有跨不同分割槽的順序保證。

![](/v1/ddia_1103.png)

**圖 11-3 生產者透過將訊息追加寫入主題分割槽檔案來發送訊息，消費者依次讀取這些檔案**

Apache Kafka 【17,18】、Amazon Kinesis Streams 【19】和 Twitter 的 DistributedLog 【20,21】都是基於日誌的訊息代理。Google Cloud Pub/Sub 在架構上類似，但對外暴露的是 JMS 風格的 API，而不是日誌抽象【16】。儘管這些訊息代理將所有訊息寫入磁碟，但透過跨多臺機器分割槽，每秒能夠實現數百萬條訊息的吞吐量，並透過複製訊息來實現容錯性【22,23】。

#### 日誌與傳統的訊息傳遞相比

基於日誌的方法天然支援扇出式訊息傳遞，因為多個消費者可以獨立讀取日誌，而不會相互影響 —— 讀取訊息不會將其從日誌中刪除。為了在一組消費者之間實現負載平衡，代理可以將整個分割槽分配給消費者組中的節點，而不是將單條訊息分配給消費者客戶端。

然後每個客戶端將消費被指派分割槽中的 **所有** 訊息。通常情況下，當一個使用者被指派了一個日誌分割槽時，它會以簡單的單執行緒方式順序地讀取分割槽中的訊息。這種粗粒度的負載均衡方法有一些缺點：

* 共享消費主題工作的節點數，最多為該主題中的日誌分割槽數，因為同一個分割槽內的所有訊息被遞送到同一個節點 [^i]。
* 如果某條訊息處理緩慢，則它會阻塞該分割槽中後續訊息的處理（一種行首阻塞的形式；請參閱 “[描述效能](/v1_tw/ch1#描述效能)”）。

因此在訊息處理代價高昂，希望逐條並行處理，以及訊息的順序並沒有那麼重要的情況下，JMS/AMQP 風格的訊息代理是可取的。另一方面，在訊息吞吐量很高，處理迅速，順序很重要的情況下，基於日誌的方法表現得非常好。

[^i]: 要設計一種負載均衡方案也是有可能的，在這種方案中，兩個消費者透過讀取全部訊息來共享分割槽處理的工作，但是其中一個只考慮具有偶數偏移量的訊息，而另一個消費者只處理奇數編號的偏移量。或者你可以將訊息攤到一個執行緒池中來處理，但這種方法會使消費者偏移量管理變得複雜。一般來說，單執行緒處理單分割槽是合適的，可以透過增加更多分割槽來提高並行度。

#### 消費者偏移量

順序消費一個分割槽使得判斷訊息是否已經被處理變得相當容易：所有偏移量小於消費者的當前偏移量的訊息已經被處理，而具有更大偏移量的訊息還沒有被看到。因此，代理不需要跟蹤確認每條訊息，只需要定期記錄消費者的偏移即可。這種方法減少了額外的簿記開銷，並且提供了批次（batching）與流水線（pipelining）操作的機會，有助於提高基於日誌的系統的吞吐量。

實際上，這種偏移量與單領導者資料庫複製中常見的日誌序列號非常相似，我們在 “[設定新從庫](/v1_tw/ch5#設定新從庫)” 中討論了這種情況。在資料庫複製中，日誌序列號允許跟隨者斷開連線後，重新連線到領導者，並在不跳過任何寫入的情況下恢復複製。這裡原理完全相同：訊息代理表現得像一個主庫，而消費者就像一個從庫。

如果消費者節點失效，則失效消費者的分割槽將指派給其他節點，並從最後記錄的偏移量開始消費訊息。如果消費者已經處理了後續的訊息，但還沒有記錄它們的偏移量，那麼重啟後這些訊息將被處理兩次。我們將在本章後面討論這個問題的處理方法。

#### 磁碟空間使用

如果只追加寫入日誌，則磁碟空間終究會耗盡。為了回收磁碟空間，日誌實際上被分割成段，並不時地將舊段刪除或移動到歸檔儲存。（我們將在後面討論一種更為複雜的磁碟空間釋放方式）

這就意味著如果一個慢消費者跟不上訊息產生的速率而落後得太多，它的消費偏移量指向了刪除的段，那麼它就會錯過一些訊息。實際上，日誌實現了一個有限大小的緩衝區，當緩衝區填滿時會丟棄舊訊息，它也被稱為 **迴圈緩衝區（circular buffer）** 或 **環形緩衝區（ring buffer）**。不過由於緩衝區在磁碟上，因此緩衝區可能相當的大。

讓我們做個簡單計算。在撰寫本文時，典型的大型硬碟容量為 6TB，順序寫入吞吐量為 150MB/s。如果以最快的速度寫訊息，則需要大約 11 個小時才能填滿磁碟。因而磁碟可以緩衝 11 個小時的訊息，之後它將開始覆蓋舊的訊息。即使使用多個磁碟和機器，這個比率也是一樣的。實踐中的部署很少能用滿磁碟的寫入頻寬，所以通常可以儲存一個幾天甚至幾周的日誌緩衝區。

不管保留多長時間的訊息，日誌的吞吐量或多或少保持不變，因為無論如何，每個訊息都會被寫入磁碟【18】。這種行為與預設將訊息儲存在記憶體中，僅當佇列太長時才寫入磁碟的訊息傳遞系統形成鮮明對比。當佇列很短時，這些系統非常快；而當這些系統開始寫入磁碟時，就要慢的多，所以吞吐量取決於保留的歷史數量。

#### 當消費者跟不上生產者時

在 “[訊息傳遞系統](#訊息傳遞系統)” 中，如果消費者無法跟上生產者傳送資訊的速度時，我們討論了三種選擇：丟棄資訊，進行緩衝或施加背壓。在這種分類法裡，基於日誌的方法是緩衝的一種形式，具有很大但大小固定的緩衝區（受可用磁碟空間的限制）。

如果消費者遠遠落後，而所要求的資訊比保留在磁碟上的資訊還要舊，那麼它將不能讀取這些資訊，所以代理實際上丟棄了比緩衝區容量更大的舊資訊。你可以監控消費者落後日誌頭部的距離，如果落後太多就發出報警。由於緩衝區很大，因而有足夠的時間讓運維人員來修復慢消費者，並在訊息開始丟失之前讓其趕上。

即使消費者真的落後太多開始丟失訊息，也只有那個消費者受到影響；它不會中斷其他消費者的服務。這是一個巨大的運維優勢：你可以實驗性地消費生產日誌，以進行開發，測試或除錯，而不必擔心會中斷生產服務。當消費者關閉或崩潰時，會停止消耗資源，唯一剩下的只有消費者偏移量。

這種行為也與傳統的訊息代理形成了鮮明對比，在那種情況下，你需要小心地刪除那些消費者已經關閉的佇列 —— 否則那些佇列就會累積不必要的訊息，從其他仍活躍的消費者那裡佔走記憶體。

#### 重播舊訊息

我們之前提到，使用 AMQP 和 JMS 風格的訊息代理，處理和確認訊息是一個破壞性的操作，因為它會導致訊息在代理上被刪除。另一方面，在基於日誌的訊息代理中，使用訊息更像是從檔案中讀取資料：這是隻讀操作，不會更改日誌。

除了消費者的任何輸出之外，處理的唯一副作用是消費者偏移量的前進。但偏移量是在消費者的控制之下的，所以如果需要的話可以很容易地操縱：例如你可以用昨天的偏移量跑一個消費者副本，並將輸出寫到不同的位置，以便重新處理最近一天的訊息。你可以使用各種不同的處理程式碼重複任意次。

這一方面使得基於日誌的訊息傳遞更像上一章的批處理，其中衍生資料透過可重複的轉換過程與輸入資料顯式分離。它允許進行更多的實驗，更容易從錯誤和漏洞中恢復，使其成為在組織內整合資料流的良好工具【24】。


## 資料庫與流

我們已經在訊息代理和資料庫之間進行了一些比較。儘管傳統上它們被視為單獨的工具類別，但是我們看到基於日誌的訊息代理已經成功地從資料庫中獲取靈感並將其應用於訊息傳遞。我們也可以反過來：從訊息傳遞和流中獲取靈感，並將它們應用於資料庫。

我們之前曾經說過，事件是某個時刻發生的事情的記錄。發生的事情可能是使用者操作（例如鍵入搜尋查詢）或讀取感測器，但也可能是 **寫入資料庫**。某些東西被寫入資料庫的事實是可以被捕獲、儲存和處理的事件。這一觀察結果表明，資料庫和資料流之間的聯絡不僅僅是磁碟日誌的物理儲存 —— 而是更深層的聯絡。

事實上，複製日誌（請參閱 “[複製日誌的實現](/v1_tw/ch5#複製日誌的實現)”）是一個由資料庫寫入事件組成的流，由主庫在處理事務時生成。從庫將寫入流應用到它們自己的資料庫副本，從而最終得到相同資料的精確副本。複製日誌中的事件描述發生的資料更改。

我們還在 “[全序廣播](/v1_tw/ch9#全序廣播)” 中遇到了狀態機複製原理，其中指出：如果每個事件代表對資料庫的寫入，並且每個副本按相同的順序處理相同的事件，則副本將達到相同的最終狀態 （假設事件處理是一個確定性的操作）。這是事件流的又一種場景！

在本節中，我們將首先看看異構資料系統中出現的一個問題，然後探討如何透過將事件流的想法帶入資料庫來解決這個問題。

### 保持系統同步

正如我們在本書中所看到的，沒有一個系統能夠滿足所有的資料儲存、查詢和處理需求。在實踐中，大多數重要應用都需要組合使用幾種不同的技術來滿足所有的需求：例如，使用 OLTP 資料庫來為使用者請求提供服務，使用快取來加速常見請求，使用全文索引來處理搜尋查詢，使用資料倉庫用於分析。每一種技術都有自己的資料副本，並根據自己的目的進行儲存方式的最佳化。

由於相同或相關的資料出現在了不同的地方，因此相互間需要保持同步：如果某個專案在資料庫中被更新，它也應當在快取、搜尋索引和資料倉庫中被更新。對於資料倉庫，這種同步通常由 ETL 程序執行（請參閱 “[資料倉庫](/v1_tw/ch3#資料倉庫)”），通常是先取得資料庫的完整副本，然後執行轉換，並批次載入到資料倉庫中 —— 換句話說，批處理。我們在 “[批處理工作流的輸出](/v1_tw/ch10#批處理工作流的輸出)” 中同樣看到了如何使用批處理建立搜尋索引、推薦系統和其他衍生資料系統。

如果週期性的完整資料庫轉儲過於緩慢，有時會使用的替代方法是 **雙寫（dual write）**，其中應用程式碼在資料變更時明確寫入每個系統：例如，首先寫入資料庫，然後更新搜尋索引，然後使快取項失效（甚至同時執行這些寫入）。

但是，雙寫有一些嚴重的問題，其中一個是競爭條件，如 [圖 11-4](/v1/ddia_1104.png) 所示。在這個例子中，兩個客戶端同時想要更新一個專案 X：客戶端 1 想要將值設定為 A，客戶端 2 想要將其設定為 B。兩個客戶端首先將新值寫入資料庫，然後將其寫入到搜尋索引。因為運氣不好，這些請求的時序是交錯的：資料庫首先看到來自客戶端 1 的寫入將值設定為 A，然後來自客戶端 2 的寫入將值設定為 B，因此資料庫中的最終值為 B。搜尋索引首先看到來自客戶端 2 的寫入，然後是客戶端 1 的寫入，所以搜尋索引中的最終值是 A。即使沒發生錯誤，這兩個系統現在也永久地不一致了。

![](/v1/ddia_1104.png)

**圖 11-4 在資料庫中 X 首先被設定為 A，然後被設定為 B，而在搜尋索引處，寫入以相反的順序到達**

除非有一些額外的併發檢測機制，例如我們在 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)” 中討論的版本向量，否則你甚至不會意識到發生了併發寫入 —— 一個值將簡單地以無提示方式覆蓋另一個值。

雙重寫入的另一個問題是，其中一個寫入可能會失敗，而另一個成功。這是一個容錯問題，而不是一個併發問題，但也會造成兩個系統互相不一致的結果。確保它們要麼都成功要麼都失敗，是原子提交問題的一個例子，解決這個問題的代價是昂貴的（請參閱 “[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)”）。

如果你只有一個單領導者複製的資料庫，那麼這個領導者決定了寫入順序，而狀態機複製方法可以在資料庫副本上工作。然而，在 [圖 11-4](/v1/ddia_1104.png) 中，沒有單個主庫：資料庫可能有一個領導者，搜尋索引也可能有一個領導者，但是兩者都不追隨對方，所以可能會發生衝突（請參閱 “[多主複製](/v1_tw/ch5#多主複製)”）。

如果實際上只有一個領導者 —— 例如，資料庫 —— 而且我們能讓搜尋索引成為資料庫的追隨者，情況要好得多。但這在實踐中可能嗎？

### 變更資料捕獲

大多數資料庫的複製日誌的問題在於，它們一直被當做資料庫的內部實現細節，而不是公開的 API。客戶端應該透過其資料模型和查詢語言來查詢資料庫，而不是解析複製日誌並嘗試從中提取資料。

數十年來，許多資料庫根本沒有記錄在檔的獲取變更日誌的方式。由於這個原因，捕獲資料庫中所有的變更，然後將其複製到其他儲存技術（搜尋索引、快取或資料倉庫）中是相當困難的。

最近，人們對 **變更資料捕獲（change data capture, CDC）** 越來越感興趣，這是一種觀察寫入資料庫的所有資料變更，並將其提取並轉換為可以複製到其他系統中的形式的過程。CDC 是非常有意思的，尤其是當變更能在被寫入後立刻用於流時。

例如，你可以捕獲資料庫中的變更，並不斷將相同的變更應用至搜尋索引。如果變更日誌以相同的順序應用，則可以預期搜尋索引中的資料與資料庫中的資料是匹配的。搜尋索引和任何其他衍生資料系統只是變更流的消費者，如 [圖 11-5](/v1/ddia_1105.png) 所示。

![](/v1/ddia_1105.png)

**圖 11-5 將資料按順序寫入一個資料庫，然後按照相同的順序將這些更改應用到其他系統**

#### 變更資料捕獲的實現

我們可以將日誌消費者叫做 **衍生資料系統**，正如在 [第三部分](/v1_tw/part-iii) 的介紹中所討論的：儲存在搜尋索引和資料倉庫中的資料，只是 **記錄系統** 資料的額外檢視。變更資料捕獲是一種機制，可確保對記錄系統所做的所有更改都反映在衍生資料系統中，以便衍生系統具有資料的準確副本。

從本質上說，變更資料捕獲使得一個資料庫成為領導者（被捕獲變化的資料庫），並將其他元件變為追隨者。基於日誌的訊息代理非常適合從源資料庫傳輸變更事件，因為它保留了訊息的順序（避免了 [圖 11-2](/v1/ddia_1102.png) 的重新排序問題）。

資料庫觸發器可用來實現變更資料捕獲（請參閱 “[基於觸發器的複製](/v1_tw/ch5#基於觸發器的複製)”），透過註冊觀察所有變更的觸發器，並將相應的變更項寫入變更日誌表中。但是它們往往是脆弱的，而且有顯著的效能開銷。解析複製日誌可能是一種更穩健的方法，但它也很有挑戰，例如如何應對模式變更。

LinkedIn 的 Databus【25】，Facebook 的 Wormhole【26】和 Yahoo! 的 Sherpa【27】大規模地應用這個思路。Bottled Water 使用解碼 WAL 的 API 實現了 PostgreSQL 的 CDC【28】，Maxwell 和 Debezium 透過解析 binlog 對 MySQL 做了類似的事情【29,30,31】，Mongoriver 讀取 MongoDB oplog【32,33】，而 GoldenGate 為 Oracle 提供類似的功能【34,35】。

類似於訊息代理，變更資料捕獲通常是非同步的：記錄資料庫系統在提交變更之前不會等待消費者應用變更。這種設計具有的運維優勢是，新增緩慢的消費者不會過度影響記錄系統。不過，所有複製延遲可能有的問題在這裡都可能出現（請參閱 “[複製延遲問題](/v1_tw/ch5#複製延遲問題)”）。

#### 初始快照

如果你擁有 **所有** 對資料庫進行變更的日誌，則可以透過重播該日誌，來重建資料庫的完整狀態。但是在許多情況下，永遠保留所有更改會耗費太多磁碟空間，且重播過於費時，因此日誌需要被截斷。

例如，構建新的全文索引需要整個資料庫的完整副本 —— 僅僅應用最近變更的日誌是不夠的，因為這樣會丟失最近未曾更新的專案。因此，如果你沒有完整的歷史日誌，則需要從一個一致的快照開始，如先前的 “[設定新從庫](/v1_tw/ch5#設定新從庫)” 中所述。

資料庫的快照必須與變更日誌中的已知位置或偏移量相對應，以便在處理完快照後知道從哪裡開始應用變更。一些 CDC 工具集成了這種快照功能，而其他工具則把它留給你手動執行。

#### 日誌壓縮

如果你只能保留有限的歷史日誌，則每次要新增新的衍生資料系統時，都需要做一次快照。但 **日誌壓縮（log compaction）** 提供了一個很好的備選方案。

我們之前在 “[雜湊索引](/v1_tw/ch3#雜湊索引)” 中關於日誌結構儲存引擎的上下文中討論了日誌壓縮（請參閱 [圖 3-2](/v1/ddia_0302.png) 的示例）。原理很簡單：儲存引擎定期在日誌中查詢具有相同鍵的記錄，丟掉所有重複的內容，並只保留每個鍵的最新更新。這個壓縮與合併過程在後臺執行。

在日誌結構儲存引擎中，具有特殊值 NULL（**墓碑**，即 tombstone）的更新表示該鍵被刪除，並會在日誌壓縮過程中被移除。但只要鍵不被覆蓋或刪除，它就會永遠留在日誌中。這種壓縮日誌所需的磁碟空間僅取決於資料庫的當前內容，而不取決於資料庫中曾經發生的寫入次數。如果相同的鍵經常被覆蓋寫入，則先前的值最終將被垃圾回收，只有最新的值會保留下來。

在基於日誌的訊息代理與變更資料捕獲的上下文中也適用相同的想法。如果 CDC 系統被配置為，每個變更都包含一個主鍵，且每個鍵的更新都替換了該鍵以前的值，那麼只需要保留對鍵的最新寫入就足夠了。

現在，無論何時需要重建衍生資料系統（如搜尋索引），你可以從壓縮日誌主題的零偏移量處啟動新的消費者，然後依次掃描日誌中的所有訊息。日誌能保證包含資料庫中每個鍵的最新值（也可能是一些較舊的值）—— 換句話說，你可以使用它來獲取資料庫內容的完整副本，而無需從 CDC 源資料庫取一個快照。

Apache Kafka 支援這種日誌壓縮功能。正如我們將在本章後面看到的，它允許訊息代理被當成永續性儲存使用，而不僅僅是用於臨時訊息。

#### 變更流的API支援

越來越多的資料庫開始將變更流作為第一等的介面，而不像傳統上要去做加裝改造，或者費工夫逆向工程一個 CDC。例如，RethinkDB 允許查詢訂閱通知，當查詢結果變更時獲得通知【36】，Firebase 【37】和 CouchDB 【38】基於變更流進行同步，該變更流同樣可用於應用。而 Meteor 使用 MongoDB oplog 訂閱資料變更，並據此更新使用者介面【39】。

VoltDB 允許事務以流的形式連續地從資料庫中匯出資料【40】。資料庫將關係資料模型中的輸出流表示為一個表，事務可以向其中插入元組，但不能查詢。已提交事務按照提交順序寫入這個特殊表，而流則由該表中的元組日誌構成。外部消費者可以非同步消費該日誌，並使用它來更新衍生資料系統。

Kafka Connect【41】致力於將廣泛的資料庫系統的變更資料捕獲工具與 Kafka 整合。一旦變更事件進入 Kafka 中，它就可以用於更新衍生資料系統，比如搜尋索引，也可以用於本章稍後討論的流處理系統。

### 事件溯源

我們在這裡討論的想法和 **事件溯源（Event Sourcing）** 之間有一些相似之處，這是一個在 **領域驅動設計（domain-driven design, DDD）** 社群中折騰出來的技術。我們將簡要討論事件溯源，因為它包含了一些關於流處理系統的有用想法。

與變更資料捕獲類似，事件溯源涉及到 **將所有對應用狀態的變更** 儲存為變更事件日誌。最大的區別是事件溯源將這一想法應用到了一個不同的抽象層次上：

* 在變更資料捕獲中，應用以 **可變方式（mutable way）** 使用資料庫，可以任意更新和刪除記錄。變更日誌是從資料庫的底層提取的（例如，透過解析複製日誌），從而確保從資料庫中提取的寫入順序與實際寫入的順序相匹配，從而避免 [圖 11-4](/v1/ddia_1104.png) 中的競態條件。寫入資料庫的應用不需要知道 CDC 的存在。
* 在事件溯源中，應用邏輯顯式構建在寫入事件日誌的不可變事件之上。在這種情況下，事件儲存是僅追加寫入的，更新與刪除是不鼓勵的或禁止的。事件被設計為旨在反映應用層面發生的事情，而不是底層的狀態變更。

事件溯源是一種強大的資料建模技術：從應用的角度來看，將使用者的行為記錄為不可變的事件更有意義，而不是在可變資料庫中記錄這些行為的影響。事件溯源使得應用隨時間演化更為容易，透過更容易理解事情發生的原因來幫助除錯的進行，並有利於防止應用 Bug（請參閱 “[不可變事件的優點](#不可變事件的優點)”）。

例如，儲存 “學生取消選課” 事件以中性的方式清楚地表達了單個行為的意圖，而其副作用 “從登記表中刪除了一個條目，而一條取消原因的記錄被新增到學生反饋表” 則嵌入了很多有關稍後對資料的使用方式的假設。如果引入一個新的應用功能，例如 “將位置留給等待列表中的下一個人” —— 事件溯源方法允許將新的副作用輕鬆地掛接到現有事件之上。

事件溯源類似於 **編年史（chronicle）** 資料模型【45】，事件日誌與星型模式中的事實表之間也存在相似之處（請參閱 “[星型和雪花型：分析的模式](/v1_tw/ch3#星型和雪花型：分析的模式)”） 。

諸如 Event Store【46】這樣的專業資料庫已經被開發出來，供使用事件溯源的應用使用，但總的來說，這種方法獨立於任何特定的工具。傳統的資料庫或基於日誌的訊息代理也可以用來構建這種風格的應用。

#### 從事件日誌中派生出當前狀態

事件日誌本身並不是很有用，因為使用者通常期望看到的是系統的當前狀態，而不是變更歷史。例如，在購物網站上，使用者期望能看到他們購物車裡的當前內容，而不是他們購物車所有變更的一個僅追加列表。

因此，使用事件溯源的應用需要拉取事件日誌（表示 **寫入** 系統的資料），並將其轉換為適合向使用者顯示的應用狀態（從系統 **讀取** 資料的方式【47】）。這種轉換可以使用任意邏輯，但它應當是確定性的，以便能再次執行，並從事件日誌中衍生出相同的應用狀態。

與變更資料捕獲一樣，重播事件日誌允許讓你重新構建系統的當前狀態。不過，日誌壓縮需要採用不同的方式處理：

* 用於記錄更新的 CDC 事件通常包含記錄的 **完整新版本**，因此主鍵的當前值完全由該主鍵的最近事件確定，而日誌壓縮可以丟棄相同主鍵的先前事件。
* 另一方面，事件溯源在更高層次進行建模：事件通常表示使用者操作的意圖，而不是因為操作而發生的狀態更新機制。在這種情況下，後面的事件通常不會覆蓋先前的事件，所以你需要完整的歷史事件來重新構建最終狀態。這裡進行同樣的日誌壓縮是不可能的。

使用事件溯源的應用通常有一些機制，用於儲存從事件日誌中匯出的當前狀態快照，因此它們不需要重複處理完整的日誌。然而這只是一種效能最佳化，用來加速讀取，提高從崩潰中恢復的速度；真正的目的是系統能夠永久儲存所有原始事件，並在需要時重新處理完整的事件日誌。我們將在 “[不變性的侷限性](#不變性的侷限性)” 中討論這個假設。

#### 命令和事件

事件溯源的哲學是仔細區分 **事件（event）** 和 **命令（command）**【48】。當來自使用者的請求剛到達時，它一開始是一個命令：在這個時間點上它仍然可能失敗，比如，因為違反了一些完整性條件。應用必須首先驗證它是否可以執行該命令。如果驗證成功並且命令被接受，則它變為一個持久化且不可變的事件。

例如，如果使用者試圖註冊特定使用者名稱，或預定飛機或劇院的座位，則應用需要檢查使用者名稱或座位是否已被佔用。（先前在 “[容錯共識](/v1_tw/ch9#容錯共識)” 中討論過這個例子）當檢查成功時，應用可以生成一個事件，指示特定的使用者名稱是由特定的使用者 ID 註冊的，或者座位已經預留給特定的顧客。

在事件生成的時刻，它就成為了 **事實（fact）**。即使客戶稍後決定更改或取消預訂，他們之前曾預定了某個特定座位的事實仍然成立，而更改或取消是之後新增的單獨的事件。

事件流的消費者不允許拒絕事件：當消費者看到事件時，它已經成為日誌中不可變的一部分，並且可能已經被其他消費者看到了。因此任何對命令的驗證，都需要在它成為事件之前同步完成。例如，透過使用一個可以原子性地驗證命令並釋出事件的可序列事務。

或者，預訂座位的使用者請求可以拆分為兩個事件：第一個是暫時預約，第二個是驗證預約後的獨立的確認事件（如 “[使用全序廣播實現線性一致的儲存](/v1_tw/ch9#使用全序廣播實現線性一致的儲存)” 中所述） 。這種分割方式允許驗證發生在一個非同步的過程中。

### 狀態、流和不變性

我們在 [第十章](/v1_tw/ch10) 中看到，批處理因其輸入檔案不變性而受益良多，你可以在現有輸入檔案上執行實驗性處理作業，而不用擔心損壞它們。這種不變性原則也是使得事件溯源與變更資料捕獲如此強大的原因。

我們通常將資料庫視為應用程式當前狀態的儲存 —— 這種表示針對讀取進行了最佳化，而且通常對於服務查詢而言是最為方便的表示。狀態的本質是，它會變化，所以資料庫才會支援資料的增刪改。這又該如何匹配不變性呢？

只要你的狀態發生了變化，那麼這個狀態就是這段時間中事件修改的結果。例如，當前可用的座位列表是你已處理的預訂所產生的結果，當前帳戶餘額是帳戶中的借與貸的結果，而 Web 伺服器的響應時間圖，是所有已發生 Web 請求的獨立響應時間的聚合結果。

無論狀態如何變化，總是有一系列事件導致了這些變化。即使事情已經執行與回滾，這些事件出現是始終成立的。關鍵的想法是：可變的狀態與不可變事件的僅追加日誌相互之間並不矛盾：它們是一體兩面，互為陰陽的。所有變化的日誌 —— **變化日誌（changelog）**，表示了隨時間演變的狀態。

如果你傾向於數學表示，那麼你可能會說，應用狀態是事件流對時間求積分得到的結果，而變更流是狀態對時間求微分的結果，如 [圖 11-6](/v1/ddia_1106.png) 所示【49,50,51】。這個比喻有一些侷限性（例如，狀態的二階導似乎沒有意義），但這是考慮資料的一個實用出發點。
$$
state(now) = \int_{t=0}^{now}{stream(t) \ dt} \\
stream(t) = \frac{d\ state(t)}{dt}
$$

![](/v1/ddia_1106.png)

**圖 11-6 應用當前狀態與事件流之間的關係**

如果你持久儲存了變更日誌，那麼重現狀態就非常簡單。如果你認為事件日誌是你的記錄系統，而所有的衍生狀態都從它派生而來，那麼系統中的資料流動就容易理解的多。正如帕特・赫蘭（Pat Helland）所說的【52】：

> 事務日誌記錄了資料庫的所有變更。高速追加是更改日誌的唯一方法。從這個角度來看，資料庫的內容其實是日誌中記錄最新值的快取。日誌才是真相，資料庫是日誌子集的快取，這一快取子集恰好來自日誌中每條記錄與索引值的最新值。

日誌壓縮（如 “[日誌壓縮](#日誌壓縮)” 中所述）是連線日誌與資料庫狀態之間的橋樑：它只保留每條記錄的最新版本，並丟棄被覆蓋的版本。

#### 不可變事件的優點

資料庫中的不變性是一個古老的概念。例如，會計在幾個世紀以來一直在財務記賬中應用不變性。一筆交易發生時，它被記錄在一個僅追加寫入的分類帳中，實質上是描述貨幣、商品或服務轉手的事件日誌。賬目，比如利潤、虧損、資產負債表，是從分類賬中的交易求和衍生而來【53】。

如果發生錯誤，會計師不會刪除或更改分類帳中的錯誤交易 —— 而是新增另一筆交易以補償錯誤，例如退還一筆不正確的費用。不正確的交易將永遠保留在分類帳中，對於審計而言可能非常重要。如果從不正確的分類賬衍生出的錯誤數字已經公佈，那麼下一個會計週期的數字就會包括一個更正。這個過程在會計事務中是很常見的【54】。

儘管這種可審計性在金融系統中尤其重要，但對於不受這種嚴格監管的許多其他系統，也是很有幫助的。如 “[批處理輸出的哲學](/v1_tw/ch10#批處理輸出的哲學)” 中所討論的，如果你意外地部署了將錯誤資料寫入資料庫的錯誤程式碼，當程式碼會破壞性地覆寫資料時，恢復要困難得多。使用不可變事件的僅追加日誌，診斷問題與故障恢復就要容易的多。

不可變的事件也包含了比當前狀態更多的資訊。例如在購物網站上，顧客可以將物品新增到他們的購物車，然後再將其移除。雖然從履行訂單的角度，第二個事件取消了第一個事件，但對分析目的而言，知道客戶考慮過某個特定項而之後又反悔，可能是很有用的。也許他們會選擇在未來購買，或者他們已經找到了替代品。這個資訊被記錄在事件日誌中，但對於移出購物車就刪除記錄的資料庫而言，這個資訊在移出購物車時可能就丟失了【42】。

#### 從同一事件日誌中派生多個檢視

此外，透過從不變的事件日誌中分離出可變的狀態，你可以針對不同的讀取方式，從相同的事件日誌中衍生出幾種不同的表現形式。效果就像一個流的多個消費者一樣（[圖 11-5](/v1/ddia_1105.png)）：例如，分析型資料庫 Druid 使用這種方式直接從 Kafka 攝取資料【55】，Pistachio 是一個分散式的鍵值儲存，使用 Kafka 作為提交日誌【56】，Kafka Connect 能將來自 Kafka 的資料匯出到各種不同的資料庫與索引【41】。這對於許多其他儲存和索引系統（如搜尋伺服器）來說是很有意義的，當系統要從分散式日誌中獲取輸入時亦然（請參閱 “[保持系統同步](#保持系統同步)”）。

新增從事件日誌到資料庫的顯式轉換，能夠使應用更容易地隨時間演進：如果你想要引入一個新功能，以新的方式表示現有資料，則可以使用事件日誌來構建一個單獨的、針對新功能的讀取最佳化檢視，無需修改現有系統而與之共存。並行執行新舊系統通常比在現有系統中執行複雜的模式遷移更容易。一旦不再需要舊的系統，你可以簡單地關閉它並回收其資源【47,57】。

如果你不需要擔心如何查詢與訪問資料，那麼儲存資料通常是非常簡單的。模式設計、索引和儲存引擎的許多複雜性，都是希望支援某些特定查詢和訪問模式的結果（請參閱 [第三章](/v1_tw/ch3)）。出於這個原因，透過將資料寫入的形式與讀取形式相分離，並允許幾個不同的讀取檢視，你能獲得很大的靈活性。這個想法有時被稱為 **命令查詢責任分離（command query responsibility segregation, CQRS）**【42,58,59】。

資料庫和模式設計的傳統方法是基於這樣一種謬論，資料必須以與查詢相同的形式寫入。如果可以將資料從針對寫入最佳化的事件日誌轉換為針對讀取最佳化的應用狀態，那麼有關正規化和反正規化的爭論就變得無關緊要了（請參閱 “[多對一和多對多的關係](/v1_tw/ch2#多對一和多對多的關係)”）：在針對讀取最佳化的檢視中對資料進行反正規化是完全合理的，因為翻譯過程提供了使其與事件日誌保持一致的機制。

在 “[描述負載](/v1_tw/ch1#描述負載)” 中，我們討論了推特主頁時間線，它是特定使用者關注的人群所發推特的快取（類似郵箱）。這是 **針對讀取最佳化的狀態** 的又一個例子：主頁時間線是高度反正規化的，因為你的推文與你所有粉絲的時間線都構成了重複。然而，扇出服務保持了這種重複狀態與新推特以及新關注關係的同步，從而保證了重複的可管理性。

#### 併發控制

事件溯源和變更資料捕獲的最大缺點是，事件日誌的消費者通常是非同步的，所以可能會出現這樣的情況：使用者會寫入日誌，然後從日誌衍生檢視中讀取，結果發現他的寫入還沒有反映在讀取檢視中。我們之前在 “[讀己之寫](/v1_tw/ch5#讀己之寫)” 中討論了這個問題以及可能的解決方案。

一種解決方案是將事件追加到日誌時同步執行讀取檢視的更新。而將這些寫入操作合併為一個原子單元需要 **事務**，所以要麼將事件日誌和讀取檢視儲存在同一個儲存系統中，要麼就需要跨不同系統進行分散式事務。或者，你也可以使用在 “[使用全序廣播實現線性一致的儲存](/v1_tw/ch9#使用全序廣播實現線性一致的儲存)” 中討論的方法。

另一方面，從事件日誌匯出當前狀態也簡化了併發控制的某些部分。許多對於多物件事務的需求（請參閱 “[單物件和多物件操作](/v1_tw/ch7#單物件和多物件操作)”）源於單個使用者操作需要在多個不同的位置更改資料。透過事件溯源，你可以設計一個自包含的事件以表示一個使用者操作。然後使用者操作就只需要在一個地方進行單次寫入操作 —— 即將事件附加到日誌中 —— 這個還是很容易使原子化的。

如果事件日誌與應用狀態以相同的方式分割槽（例如，處理分割槽 3 中的客戶事件只需要更新分割槽 3 中的應用狀態），那麼直接使用單執行緒日誌消費者就不需要寫入併發控制了。它從設計上一次只處理一個事件（請參閱 “[真的序列執行](/v1_tw/ch7#真的序列執行)”）。日誌透過在分割槽中定義事件的序列順序，消除了併發性的不確定性【24】。如果一個事件觸及多個狀態分割槽，那麼需要做更多的工作，我們將在 [第十二章](/v1_tw/ch12) 討論。

#### 不變性的侷限性

許多不使用事件溯源模型的系統也還是依賴不可變性：各種資料庫在內部使用不可變的資料結構或多版本資料來支援時間點快照（請參閱 “[索引和快照隔離](/v1_tw/ch7#索引和快照隔離)” ）。Git、Mercurial 和 Fossil 等版本控制系統也依靠不可變的資料來儲存檔案的版本歷史記錄。

永遠保持所有變更的不變歷史，在多大程度上是可行的？答案取決於資料集的流失率。一些工作負載主要是新增資料，很少更新或刪除；它們很容易保持不變。其他工作負載在相對較小的資料集上有較高的更新 / 刪除率；在這些情況下，不可變的歷史可能增至難以接受的巨大，碎片化可能成為一個問題，壓縮與垃圾收集的表現對於運維的穩健性變得至關重要【60,61】。

除了效能方面的原因外，也可能有出於管理方面的原因需要刪除資料的情況，儘管這些資料都是不可變的。例如，隱私條例可能要求在使用者關閉帳戶後刪除他們的個人資訊，資料保護立法可能要求刪除錯誤的資訊，或者可能需要阻止敏感資訊的意外洩露。

在這種情況下，僅僅在日誌中新增另一個事件來指明先前的資料應該被視為刪除是不夠的 —— 你實際上是想改寫歷史，並假裝資料從一開始就沒有寫入。例如，Datomic 管這個特性叫 **切除（excision）** 【62】，而 Fossil 版本控制系統有一個類似的概念叫 **避免（shunning）** 【63】。

真正刪除資料是非常非常困難的【64】，因為副本可能存在於很多地方：例如，儲存引擎，檔案系統和 SSD 通常會向一個新位置寫入，而不是原地覆蓋舊資料【52】，而備份通常是特意做成不可變的，防止意外刪除或損壞。刪除操作更多的是指 “使取回資料更困難”，而不是指 “使取回資料不可能”。無論如何，有時你必須得嘗試，正如我們在 “[立法與自律](/v1_tw/ch12#立法與自律)” 中所看到的。


## 流處理

到目前為止，本章中我們已經討論了流的來源（使用者活動事件，感測器和寫入資料庫），我們討論了流如何傳輸（直接透過訊息傳送，透過訊息代理，透過事件日誌）。

剩下的就是討論一下你可以用流做什麼 —— 也就是說，你可以處理它。一般來說，有三種選項：

1. 你可以將事件中的資料寫入資料庫、快取、搜尋索引或類似的儲存系統，然後能被其他客戶端查詢。如 [圖 11-5](/v1/ddia_1105.png) 所示，這是資料庫與系統其他部分所發生的變更保持同步的好方法 —— 特別是當流消費者是寫入資料庫的唯一客戶端時。如 “[批處理工作流的輸出](/v1_tw/ch10#批處理工作流的輸出)” 中所討論的，它是寫入儲存系統的流等價物。
2. 你能以某種方式將事件推送給使用者，例如傳送報警郵件或推送通知，或將事件流式傳輸到可即時顯示的儀表板上。在這種情況下，人是流的最終消費者。
3. 你可以處理一個或多個輸入流，並產生一個或多個輸出流。流可能會經過由幾個這樣的處理階段組成的流水線，最後再輸出（選項 1 或 2）。

在本章的剩餘部分中，我們將討論選項 3：處理流以產生其他衍生流。處理這樣的流的程式碼片段，被稱為 **運算元（operator）** 或 **作業（job）**。它與我們在 [第十章](/v1_tw/ch10) 中討論過的 Unix 程序和 MapReduce 作業密切相關，資料流的模式是相似的：一個流處理器以只讀的方式使用輸入流，並將其輸出以僅追加的方式寫入一個不同的位置。

流處理中的分割槽和並行化模式也非常類似於 [第十章](/v1_tw/ch10) 中介紹的 MapReduce 和資料流引擎，因此我們不再重複這些主題。基本的 Map 操作（如轉換和過濾記錄）也是一樣的。

與批次作業相比的一個關鍵區別是，流不會結束。這種差異會帶來很多隱含的結果。正如本章開始部分所討論的，排序對無界資料集沒有意義，因此無法使用 **排序合併連線**（請參閱 “[Reduce 側連線與分組](/v1_tw/ch10#Reduce側連線與分組)”）。容錯機制也必須改變：對於已經運行了幾分鐘的批處理作業，可以簡單地從頭開始重啟失敗任務，但是對於已經執行數年的流作業，重啟後從頭開始跑可能並不是一個可行的選項。

### 流處理的應用

長期以來，流處理一直用於監控目的，如果某個事件發生，組織希望能得到警報。例如：

* 欺詐檢測系統需要確定信用卡的使用模式是否有意外地變化，如果信用卡可能已被盜刷，則鎖卡。
* 交易系統需要檢查金融市場的價格變化，並根據指定的規則進行交易。
* 製造系統需要監控工廠中機器的狀態，如果出現故障，可以快速定位問題。
* 軍事和情報系統需要跟蹤潛在侵略者的活動，並在出現襲擊徵兆時發出警報。

這些型別的應用需要非常精密複雜的模式匹配與相關檢測。然而隨著時代的進步，流處理的其他用途也開始出現。在本節中，我們將簡要比較一下這些應用。

#### 複合事件處理

**複合事件處理（complex event processing, CEP）** 是 20 世紀 90 年代為分析事件流而開發出的一種方法，尤其適用於需要搜尋某些事件模式的應用【65,66】。與正則表示式允許你在字串中搜索特定字元模式的方式類似，CEP 允許你指定規則以在流中搜索某些事件模式。

CEP 系統通常使用高層次的宣告式查詢語言，比如 SQL，或者圖形使用者介面，來描述應該檢測到的事件模式。這些查詢被提交給處理引擎，該引擎消費輸入流，並在內部維護一個執行所需匹配的狀態機。當發現匹配時，引擎發出一個 **複合事件**（即 complex event，CEP 因此得名），並附有檢測到的事件模式詳情【67】。

在這些系統中，查詢和資料之間的關係與普通資料庫相比是顛倒的。通常情況下，資料庫會持久儲存資料，並將查詢視為臨時的：當查詢進入時，資料庫搜尋與查詢匹配的資料，然後在查詢完成時丟掉查詢。CEP 引擎反轉了角色：查詢是長期儲存的，來自輸入流的事件不斷流過它們，搜尋匹配事件模式的查詢【68】。

CEP 的實現包括 Esper【69】、IBM InfoSphere Streams【70】、Apama、TIBCO StreamBase 和 SQLstream。像 Samza 這樣的分散式流處理元件，支援使用 SQL 在流上進行宣告式查詢【71】。

#### 流分析

使用流處理的另一個領域是對流進行分析。CEP 與流分析之間的邊界是模糊的，但一般來說，分析往往對找出特定事件序列並不關心，而更關注大量事件上的聚合與統計指標 —— 例如：

* 測量某種型別事件的速率（每個時間間隔內發生的頻率）
* 滾動計算一段時間視窗內某個值的平均值
* 將當前的統計值與先前的時間區間的值對比（例如，檢測趨勢，當指標與上週同比異常偏高或偏低時報警）

這些統計值通常是在固定時間區間內進行計算的，例如，你可能想知道在過去 5 分鐘內服務每秒查詢次數的均值，以及此時間段內響應時間的第 99 百分位點。在幾分鐘內取平均，能抹平秒和秒之間的無關波動，且仍然能向你展示流量模式的時間圖景。聚合的時間間隔稱為 **視窗（window）**，我們將在 “[時間推理](#時間推理)” 中更詳細地討論視窗。

流分析系統有時會使用機率演算法，例如 Bloom filter（我們在 “[效能最佳化](/v1_tw/ch3#效能最佳化)” 中遇到過）來管理成員資格，HyperLogLog【72】用於基數估計以及各種百分位點估計演算法（請參閱 “[實踐中的百分位點](/v1_tw/ch1#實踐中的百分位點)”）。機率演算法產出近似的結果，但比起精確演算法的優點是記憶體使用要少得多。使用近似演算法有時讓人們覺得流處理系統總是有損的和不精確的，但這是錯誤看法：流處理並沒有任何內在的近似性，而機率演算法只是一種最佳化【73】。

許多開源分散式流處理框架的設計都是針對分析設計的：例如 Apache Storm、Spark Streaming、Flink、Concord、Samza 和 Kafka Streams 【74】。託管服務包括 Google Cloud Dataflow 和 Azure Stream Analytics。

#### 維護物化檢視

我們在 “[資料庫與流](#資料庫與流)” 中看到，資料庫的變更流可以用於維護衍生資料系統（如快取、搜尋索引和資料倉庫），並使其與源資料庫保持最新。我們可以將這些示例視作維護 **物化檢視（materialized view）** 的一種具體場景（請參閱 “[聚合：資料立方體和物化檢視](/v1_tw/ch3#聚合：資料立方體和物化檢視)”）：在某個資料集上衍生出一個替代檢視以便高效查詢，並在底層資料變更時更新檢視【50】。

同樣，在事件溯源中，應用程式的狀態是透過應用事件日誌來維護的；這裡的應用程式狀態也是一種物化檢視。與流分析場景不同的是，僅考慮某個時間視窗內的事件通常是不夠的：構建物化檢視可能需要任意時間段內的 **所有** 事件，除了那些可能由日誌壓縮丟棄的過時事件（請參閱 “[日誌壓縮](#日誌壓縮)”）。實際上，你需要一個可以一直延伸到時間開端的視窗。

原則上講，任何流處理元件都可以用於維護物化檢視，儘管 “永遠執行” 與一些面向分析的框架假設的 “主要在有限時間段視窗上執行” 背道而馳，Samza 和 Kafka Streams 支援這種用法，建立在 Kafka 對日誌壓縮的支援上【75】。

#### 在流上搜索

除了允許搜尋由多個事件構成模式的 CEP 外，有時也存在基於複雜標準（例如全文搜尋查詢）來搜尋單個事件的需求。

例如，媒體監測服務可以訂閱新聞文章 Feed 與來自媒體的播客，搜尋任何關於公司、產品或感興趣的話題的新聞。這是透過預先構建一個搜尋查詢來完成的，然後不斷地將新聞項的流與該查詢進行匹配。在一些網站上也有類似的功能：例如，當市場上出現符合其搜尋條件的新房產時，房地產網站的使用者可以要求網站通知他們。Elasticsearch 的 percolator 功能，是實現這種流搜尋的一種選擇【76】。

傳統的搜尋引擎首先索引文件，然後在索引上跑查詢。相比之下，搜尋一個資料流則反了過來：查詢被儲存下來，文件從查詢中流過，就像在 CEP 中一樣。最簡單的情況就是，你可以為每個文件測試每個查詢。但是如果你有大量查詢，這可能會變慢。為了最佳化這個過程，可以像對文件一樣，為查詢建立索引，從而收窄可能匹配的查詢集合【77】。

#### 訊息傳遞和RPC

在 “[訊息傳遞中的資料流](/v1_tw/ch4#訊息傳遞中的資料流)” 中我們討論過，訊息傳遞系統可以作為 RPC 的替代方案，即作為一種服務間通訊的機制，比如在 Actor 模型中所使用的那樣。儘管這些系統也是基於訊息和事件，但我們通常不會將其視作流處理元件：

* Actor 框架主要是管理模組通訊的併發和分散式執行的一種機制，而流處理主要是一種資料管理技術。
* Actor 之間的交流往往是短暫的、一對一的；而事件日誌則是持久的、多訂閱者的。
* Actor 可以以任意方式進行通訊（包括迴圈的請求 / 響應模式），但流處理通常配置在無環流水線中，其中每個流都是一個特定作業的輸出，由良好定義的輸入流中派生而來。

也就是說，RPC 類系統與流處理之間有一些交叉領域。例如，Apache Storm 有一個稱為 **分散式 RPC** 的功能，它允許將使用者查詢分散到一系列也處理事件流的節點上；然後這些查詢與來自輸入流的事件交織，而結果可以被彙總並發回給使用者【78】（另請參閱 “[多分割槽資料處理](/v1_tw/ch12#多分割槽資料處理)”）。

也可以使用 Actor 框架來處理流。但是，很多這樣的框架在崩潰時不能保證訊息的傳遞，除非你實現了額外的重試邏輯，否則這種處理不是容錯的。

### 時間推理

流處理通常需要與時間打交道，尤其是用於分析目的時候，會頻繁使用時間視窗，例如 “過去五分鐘的平均值”。“過去五分鐘” 的含義看上去似乎是清晰而無歧義的，但不幸的是，這個概念非常棘手。

在批處理過程中，大量的歷史事件被快速地處理。如果需要按時間來分析，批處理器需要檢查每個事件中嵌入的時間戳。讀取執行批處理機器的系統時鐘沒有任何意義，因為處理執行的時間與事件實際發生的時間無關。

批處理可以在幾分鐘內讀取一年的歷史事件；在大多數情況下，感興趣的時間線是歷史中的一年，而不是處理中的幾分鐘。而且使用事件中的時間戳，使得處理是 **確定性** 的：在相同的輸入上再次執行相同的處理過程會得到相同的結果（請參閱 “[容錯](/v1_tw/ch10#容錯)”）。

另一方面，許多流處理框架使用處理機器上的本地系統時鐘（**處理時間**，即 processing time）來確定 **視窗（windowing）**【79】。這種方法的優點是簡單，如果事件建立與事件處理之間的延遲可以忽略不計，那也是合理的。然而，如果存在任何顯著的處理延遲 —— 即，事件處理顯著地晚於事件實際發生的時間，這種處理方式就失效了。

#### 事件時間與處理時間

很多原因都可能導致處理延遲：排隊，網路故障（請參閱 “[不可靠的網路](/v1_tw/ch8#不可靠的網路)”），效能問題導致訊息代理 / 訊息處理器出現爭用，流消費者重啟，從故障中恢復時重新處理過去的事件（請參閱 “[重播舊訊息](#重播舊訊息)”），或者在修復程式碼 BUG 之後。

而且，訊息延遲還可能導致無法預測訊息順序。例如，假設使用者首先發出一個 Web 請求（由 Web 伺服器 A 處理），然後發出第二個請求（由伺服器 B 處理）。A 和 B 發出描述它們所處理請求的事件，但是 B 的事件先於 A 的事件到達訊息代理。現在，流處理器將首先看到 B 事件，然後看到 A 事件，即使它們實際上是以相反的順序發生的。

有一個類比也許能幫助理解，“星球大戰” 電影：第四集於 1977 年發行，第五集於 1980 年，第六集於 1983 年，緊隨其後的是 1999 年的第一集，2002 年的第二集，和 2005 年的第三集，以及 2015 年的第七集【80】[^ii]。如果你按照它們上映的順序觀看電影，你處理電影的順序與它們敘事的順序就是不一致的。（集數編號就像事件時間戳，而你觀看電影的日期就是處理時間）作為人類，我們能夠應對這種不連續性，但是流處理演算法需要專門編寫，以適應這種時序與順序的問題。

[^ii]: 感謝 Flink 社群的 Kostas Kloudas 提出這個比喻。

將事件時間和處理時間搞混會導致錯誤的資料。例如，假設你有一個流處理器用於測量請求速率（計算每秒請求數）。如果你重新部署流處理器，它可能會停止一分鐘，並在恢復之後處理積壓的事件。如果你按處理時間來衡量速率，那麼在處理積壓日誌時，請求速率看上去就像有一個異常的突發尖峰，而實際上請求速率是穩定的（[圖 11-7](/v1/ddia_1107.png)）。

![](/v1/ddia_1107.png)

**圖 11-7 按處理時間分窗，會因為處理速率的變動引入人為因素**

#### 知道什麼時候準備好了

用事件時間來定義視窗的一個棘手的問題是，你永遠也無法確定是不是已經收到了特定視窗的所有事件，還是說還有一些事件正在來的路上。

例如，假設你將事件分組為一分鐘的視窗，以便統計每分鐘的請求數。你已經計數了一些帶有本小時內第 37 分鐘時間戳的事件，時間流逝，現在進入的主要都是本小時內第 38 和第 39 分鐘的事件。什麼時候才能宣佈你已經完成了第 37 分鐘的視窗計數，並輸出其計數器值？

在一段時間沒有看到任何新的事件之後，你可以超時並宣佈一個視窗已經就緒，但仍然可能發生這種情況：某些事件被緩衝在另一臺機器上，由於網路中斷而延遲。你需要能夠處理這種在視窗宣告完成之後到達的 **滯留（straggler）** 事件。大體上，你有兩種選擇【1】：

1. 忽略這些滯留事件，因為在正常情況下它們可能只是事件中的一小部分。你可以將丟棄事件的數量作為一個監控指標，並在出現大量丟訊息的情況時報警。
2. 釋出一個 **更正（correction）**，一個包括滯留事件的更新視窗值。你可能還需要收回以前的輸出。

在某些情況下，可以使用特殊的訊息來指示 “從現在開始，不會有比 t 更早時間戳的訊息了”，消費者可以使用它來觸發視窗【81】。但是，如果不同機器上的多個生產者都在生成事件，每個生產者都有自己的最小時間戳閾值，則消費者需要分別跟蹤每個生產者。在這種情況下，新增和刪除生產者都是比較棘手的。

#### 你用的是誰的時鐘？

當事件可能在系統內多個地方進行緩衝時，為事件分配時間戳更加困難了。例如，考慮一個移動應用向伺服器上報關於用量的事件。該應用可能會在裝置處於離線狀態時被使用，在這種情況下，它將在裝置本地緩衝事件，並在下一次網際網路連線可用時向伺服器上報這些事件（可能是幾小時甚至幾天）。對於這個流的任意消費者而言，它們就如延遲極大的滯留事件一樣。

在這種情況下，事件上的時間戳實際上應當是使用者互動發生的時間，取決於移動裝置的本地時鐘。然而使用者控制的裝置上的時鐘通常是不可信的，因為它可能會被無意或故意設定成錯誤的時間（請參閱 “[時鐘同步與準確性](/v1_tw/ch8#時鐘同步與準確性)”）。伺服器收到事件的時間（取決於伺服器的時鐘）可能是更準確的，因為伺服器在你的控制之下，但在描述使用者互動方面意義不大。

要校正不正確的裝置時鐘，一種方法是記錄三個時間戳【82】：

* 事件發生的時間，取決於裝置時鐘
* 事件傳送往伺服器的時間，取決於裝置時鐘
* 事件被伺服器接收的時間，取決於伺服器時鐘

透過從第三個時間戳中減去第二個時間戳，可以估算裝置時鐘和伺服器時鐘之間的偏移（假設網路延遲與所需的時間戳精度相比可忽略不計）。然後可以將該偏移應用於事件時間戳，從而估計事件實際發生的真實時間（假設裝置時鐘偏移在事件發生時與送往伺服器之間沒有變化）。

這並不是流處理獨有的問題，批處理有著完全一樣的時間推理問題。只是在流處理的上下文中，我們更容易意識到時間的流逝。

#### 視窗的型別

當你知道如何確定一個事件的時間戳後，下一步就是如何定義時間段的視窗。然後視窗就可以用於聚合，例如事件計數，或計算視窗內值的平均值。有幾種視窗很常用【79,83】：

滾動視窗（Tumbling Window）
: 滾動視窗有著固定的長度，每個事件都僅能屬於一個視窗。例如，假設你有一個 1 分鐘的滾動視窗，則所有時間戳在 `10:03:00` 和 `10:03:59` 之間的事件會被分組到一個視窗中，`10:04:00` 和 `10:04:59` 之間的事件被分組到下一個視窗，依此類推。透過將每個事件時間戳向下舍入至最近的分鐘來確定它所屬的視窗，可以實現 1 分鐘的滾動視窗。

跳動視窗（Hopping Window）
: 跳動視窗也有著固定的長度，但允許視窗重疊以提供一些平滑。例如，一個帶有 1 分鐘跳躍步長的 5 分鐘視窗將包含 `10:03:00` 至 `10:07:59` 之間的事件，而下一個視窗將覆蓋 `10:04:00` 至 `10:08:59` 之間的事件，等等。透過首先計算 1 分鐘的滾動視窗（tumbling window），然後在幾個相鄰視窗上進行聚合，可以實現這種跳動視窗。

滑動視窗（Sliding Window）
: 滑動視窗包含了彼此間距在特定時長內的所有事件。例如，一個 5 分鐘的滑動視窗應當覆蓋 `10:03:39` 和 `10:08:12` 的事件，因為它們相距不超過 5 分鐘（注意 5 分鐘的滾動視窗與跳動視窗可能不會把這兩個事件分組到同一個視窗中，因為它們使用固定的邊界）。透過維護一個按時間排序的事件緩衝區，並不斷從視窗中移除過期的舊事件，可以實現滑動視窗。

會話視窗（Session window）
: 與其他視窗型別不同，會話視窗沒有固定的持續時間，而定義為：將同一使用者出現時間相近的所有事件分組在一起，而當用戶一段時間沒有活動時（例如，如果 30 分鐘內沒有事件）視窗結束。會話切分是網站分析的常見需求（請參閱 “[分組](/v1_tw/ch10#分組)”）。

### 流連線

在 [第十章](/v1_tw/ch10) 中，我們討論了批處理作業如何透過鍵來連線資料集，以及這種連線是如何成為資料管道的重要組成部分的。由於流處理將資料管道泛化為對無限資料集進行增量處理，因此對流進行連線的需求也是完全相同的。

然而，新事件隨時可能出現在一個流中，這使得流連線要比批處理連線更具挑戰性。為了更好地理解情況，讓我們先來區分三種不同型別的連線：**流 - 流** 連線，**流 - 表** 連線，與 **表 - 表** 連線【84】。我們將在下面的章節中透過例子來說明。

#### 流流連線（視窗連線）

假設你的網站上有搜尋功能，而你想要找出搜尋 URL 的近期趨勢。每當有人鍵入搜尋查詢時，都會記錄下一個包含查詢與其返回結果的事件。每當有人點選其中一個搜尋結果時，就會記錄另一個表示點選的事件。為了計算搜尋結果中每個 URL 的點選率，你需要將搜尋動作與點選動作的事件連在一起，這些事件透過相同的會話 ID 進行連線。廣告系統中需要類似的分析【85】。

如果使用者丟棄了搜尋結果，點選可能永遠不會發生，即使它出現了，搜尋與點選之間的時間可能是高度可變的：在很多情況下，它可能是幾秒鐘，但也可能長達幾天或幾周（如果使用者執行搜尋，忘掉了這個瀏覽器頁面，過了一段時間後重新回到這個瀏覽器頁面上，並點選了一個結果）。由於可變的網路延遲，點選事件甚至可能先於搜尋事件到達。你可以選擇合適的連線視窗 —— 例如，如果點選與搜尋之間的時間間隔在一小時內，你可能會選擇連線兩者。

請注意，在點選事件中嵌入搜尋詳情與事件連線並不一樣：這樣做的話，只有當用戶點選了一個搜尋結果時你才能知道，而那些沒有點選的搜尋就無能為力了。為了衡量搜尋質量，你需要準確的點選率，為此搜尋事件和點選事件兩者都是必要的。

為了實現這種型別的連線，流處理器需要維護 **狀態**：例如，按會話 ID 索引最近一小時內發生的所有事件。無論何時發生搜尋事件或點選事件，都會被新增到合適的索引中，而流處理器也會檢查另一個索引是否有具有相同會話 ID 的事件到達。如果有匹配事件就會發出一個表示搜尋結果被點選的事件；如果搜尋事件直到過期都沒看見有匹配的點選事件，就會發出一個表示搜尋結果未被點選的事件。

#### 流表連線（流擴充）

在 “[示例：使用者活動事件分析](/v1_tw/ch10#示例：使用者活動事件分析)”（[圖 10-2](/v1/ddia_1002.png)）中，我們看到了連線兩個資料集的批處理作業示例：一組使用者活動事件和一個使用者檔案資料庫。將使用者活動事件視為流，並在流處理器中連續執行相同的連線是很自然的想法：輸入是包含使用者 ID 的活動事件流，而輸出還是活動事件流，但其中使用者 ID 已經被擴充套件為使用者的檔案資訊。這個過程有時被稱為使用資料庫的資訊來 **擴充（enriching）** 活動事件。

要執行此連線，流處理器需要一次處理一個活動事件，在資料庫中查詢事件的使用者 ID，並將檔案資訊新增到活動事件中。資料庫查詢可以透過查詢遠端資料庫來實現。但正如在 “[示例：使用者活動事件分析](/v1_tw/ch10#示例：使用者活動事件分析)” 一節中討論的，此類遠端查詢可能會很慢，並且有可能導致資料庫過載【75】。

另一種方法是將資料庫副本載入到流處理器中，以便在本地進行查詢而無需網路往返。這種技術與我們在 “[Map 側連線](/v1_tw/ch10#Map側連線)” 中討論的雜湊連線非常相似：如果資料庫的本地副本足夠小，則可以是記憶體中的散列表，比較大的話也可以是本地磁碟上的索引。

與批處理作業的區別在於，批處理作業使用資料庫的時間點快照作為輸入，而流處理器是長時間執行的，且資料庫的內容可能隨時間而改變，所以流處理器資料庫的本地副本需要保持更新。這個問題可以透過變更資料捕獲來解決：流處理器可以訂閱使用者檔案資料庫的更新日誌，如同活動事件流一樣。當增添或修改檔案時，流處理器會更新其本地副本。因此，我們有了兩個流之間的連線：活動事件和檔案更新。

流表連線實際上非常類似於流流連線；最大的區別在於對於表的變更日誌流，連線使用了一個可以回溯到 “時間起點” 的視窗（概念上是無限的視窗），新版本的記錄會覆蓋更早的版本。對於輸入的流，連線可能壓根兒就沒有維護任何視窗。

#### 表表連線（維護物化檢視）

我們在 “[描述負載](/v1_tw/ch1#描述負載)” 中討論的推特時間線例子時說過，當使用者想要檢視他們的主頁時間線時，迭代使用者所關注人群的推文並合併它們是一個開銷巨大的操作。

相反，我們需要一個時間線快取：一種每個使用者的 “收件箱”，在傳送推文的時候寫入這些資訊，因而讀取時間線時只需要簡單地查詢即可。物化與維護這個快取需要處理以下事件：

* 當用戶 u 傳送新的推文時，它將被新增到每個關注使用者 u 的時間線上。
* 使用者刪除推文時，推文將從所有使用者的時間線中刪除。
* 當用戶 $u_1$ 開始關注使用者 $u_2$ 時，$u_2$ 最近的推文將被新增到 $u_1$ 的時間線上。
* 當用戶 $u_1$ 取消關注使用者 $u_2$ 時，$u_2$ 的推文將從 $u_1$ 的時間線中移除。

要在流處理器中實現這種快取維護，你需要推文事件流（傳送與刪除）和關注關係事件流（關注與取消關注）。流處理需要維護一個資料庫，包含每個使用者的粉絲集合，以便知道當一條新推文到達時，需要更新哪些時間線【86】。

觀察這個流處理過程的另一種視角是：它維護了一個連線了兩個表（推文與關注）的物化檢視，如下所示：

```sql
SELECT follows.follower_id AS timeline_id,
    array_agg(tweets.* ORDER BY tweets.timestamp DESC)
FROM tweets
JOIN follows ON follows.followee_id = tweets.sender_id
GROUP BY follows.follower_id
```

流連線直接對應於這個查詢中的表連線。時間線實際上是這個查詢結果的快取，每當底層的表發生變化時都會更新 [^iii]。

[^iii]: 如果你將流視作表的導數，如 [圖 11-6](/v1/ddia_1106.png) 所示，而把一個連線看作是兩個表的乘法 u·v，那麼會發生一些有趣的事情：物化連線的變化流遵循乘積法則：(u·v)'= u'v + uv'。換句話說，任何推文的變化量都與當前的關注聯絡在一起，任何關注的變化量都與當前的推文相連線【49,50】。

#### 連線的時間依賴性

這裡描述的三種連線（流流，流表，表表）有很多共通之處：它們都需要流處理器維護連線一側的一些狀態（搜尋與點選事件，使用者檔案，關注列表），然後當連線另一側的訊息到達時查詢該狀態。

用於維護狀態的事件順序是很重要的（先關注然後取消關注，或者其他類似操作）。在分割槽日誌中，單個分割槽內的事件順序是保留下來的。但典型情況下是沒有跨流或跨分割槽的順序保證的。

這就產生了一個問題：如果不同流中的事件發生在近似的時間範圍內，則應該按照什麼樣的順序進行處理？在流表連線的例子中，如果使用者更新了它們的檔案，哪些活動事件與舊檔案連線（在檔案更新前處理），哪些又與新檔案連線（在檔案更新之後處理）？換句話說：你需要對一些狀態做連線，如果狀態會隨著時間推移而變化，那應當使用什麼時間點來連線呢【45】？

這種時序依賴可能出現在很多地方。例如銷售東西需要對發票應用適當的稅率，這取決於所處的國家 / 州，產品型別，銷售日期（因為稅率時不時會變化）。當連線銷售額與稅率表時，你可能期望的是使用銷售時的稅率參與連線。如果你正在重新處理歷史資料，銷售時的稅率可能和現在的稅率有所不同。

如果跨越流的事件順序是未定的，則連線會變為不確定性的【87】，這意味著你在同樣輸入上重跑相同的作業未必會得到相同的結果：當你重跑任務時，輸入流上的事件可能會以不同的方式交織。

在資料倉庫中，這個問題被稱為 **緩慢變化的維度（slowly changing dimension, SCD）**，通常透過對特定版本的記錄使用唯一的識別符號來解決：例如，每當稅率改變時都會獲得一個新的識別符號，而發票在銷售時會帶有稅率的識別符號【88,89】。這種變化使連線變為確定性的，但也會導致日誌壓縮無法進行：表中所有的記錄版本都需要保留。

### 容錯

在本章的最後一節中，讓我們看一看流處理是如何容錯的。我們在 [第十章](/v1_tw/ch10) 中看到，批處理框架可以很容易地容錯：如果 MapReduce 作業中的任務失敗，可以簡單地在另一臺機器上再次啟動，並且丟棄失敗任務的輸出。這種透明的重試是可能的，因為輸入檔案是不可變的，每個任務都將其輸出寫入到 HDFS 上的獨立檔案中，而輸出僅當任務成功完成後可見。

特別是，批處理容錯方法可確保批處理作業的輸出與沒有出錯的情況相同，即使實際上某些任務失敗了。看起來好像每條輸入記錄都被處理了恰好一次 —— 沒有記錄被跳過，而且沒有記錄被處理兩次。儘管重啟任務意味著實際上可能會多次處理記錄，但輸出中的可見效果看上去就像只處理過一次。這個原則被稱為 **恰好一次語義（exactly-once semantics）**，儘管 **等效一次（effectively-once）** 可能會是一個更寫實的術語【90】。

在流處理中也出現了同樣的容錯問題，但是處理起來沒有那麼直觀：等待某個任務完成之後再使其輸出可見並不是一個可行選項，因為你永遠無法處理完一個無限的流。

#### 微批次與存檔點

一個解決方案是將流分解成小塊，並像微型批處理一樣處理每個塊。這種方法被稱為 **微批次（microbatching）**，它被用於 Spark Streaming 【91】。批次的大小通常約為 1 秒，這是對效能妥協的結果：較小的批次會導致更大的排程與協調開銷，而較大的批次意味著流處理器結果可見之前的延遲要更長。

微批次也隱式提供了一個與批次大小相等的滾動視窗（按處理時間而不是事件時間戳分窗）。任何需要更大視窗的作業都需要顯式地將狀態從一個微批次轉移到下一個微批次。

Apache Flink 則使用不同的方法，它會定期生成狀態的滾動存檔點並將其寫入持久儲存【92,93】。如果流運算元崩潰，它可以從最近的存檔點重啟，並丟棄從最近存檔點到崩潰之間的所有輸出。存檔點會由訊息流中的 **壁障（barrier）** 觸發，類似於微批次之間的邊界，但不會強制一個特定的視窗大小。

在流處理框架的範圍內，微批次與存檔點方法提供了與批處理一樣的 **恰好一次語義**。但是，只要輸出離開流處理器（例如，寫入資料庫，向外部訊息代理傳送訊息，或傳送電子郵件），框架就無法拋棄失敗批次的輸出了。在這種情況下，重啟失敗任務會導致外部副作用發生兩次，只有微批次或存檔點不足以阻止這一問題。

#### 原子提交再現

為了在出現故障時表現出恰好處理一次的樣子，我們需要確保事件處理的所有輸出和副作用 **當且僅當** 處理成功時才會生效。這些影響包括傳送給下游運算元或外部訊息傳遞系統（包括電子郵件或推送通知）的任何訊息，任何資料庫寫入，對運算元狀態的任何變更，以及對輸入訊息的任何確認（包括在基於日誌的訊息代理中將消費者偏移量前移）。

這些事情要麼都原子地發生，要麼都不發生，但是它們不應當失去同步。如果這種方法聽起來很熟悉，那是因為我們在分散式事務和兩階段提交的上下文中討論過它（請參閱 “[恰好一次的訊息處理](/v1_tw/ch9#恰好一次的訊息處理)”）。

在 [第九章](/v1_tw/ch9) 中，我們討論了分散式事務傳統實現中的問題（如 XA）。然而在限制更為嚴苛的環境中，也是有可能高效實現這種原子提交機制的。Google Cloud Dataflow【81,92】和 VoltDB 【94】中使用了這種方法，Apache Kafka 有計劃加入類似的功能【95,96】。與 XA 不同，這些實現不會嘗試跨異構技術提供事務，而是透過在流處理框架中同時管理狀態變更與訊息傳遞來內化事務。事務協議的開銷可以透過在單個事務中處理多個輸入訊息來分攤。

#### 冪等性

我們的目標是丟棄任何失敗任務的部分輸出，以便能安全地重試，而不會生效兩次。分散式事務是實現這個目標的一種方式，而另一種方式是依賴 **冪等性（idempotence）**【97】。

冪等操作是多次重複執行與單次執行效果相同的操作。例如，將鍵值儲存中的某個鍵設定為某個特定值是冪等的（再次寫入該值，只是用同樣的值替代），而遞增一個計數器不是冪等的（再次執行遞增意味著該值遞增兩次）。

即使一個操作不是天生冪等的，往往可以透過一些額外的元資料做成冪等的。例如，在使用來自 Kafka 的訊息時，每條訊息都有一個持久的、單調遞增的偏移量。將值寫入外部資料庫時可以將這個偏移量帶上，這樣你就可以判斷一條更新是不是已經執行過了，因而避免重複執行。

Storm 的 Trident 基於類似的想法來處理狀態【78】。依賴冪等性意味著隱含了一些假設：重啟一個失敗的任務必須以相同的順序重播相同的訊息（基於日誌的訊息代理能做這些事），處理必須是確定性的，沒有其他節點能同時更新相同的值【98,99】。

當從一個處理節點故障切換到另一個節點時，可能需要進行 **防護**（fencing，請參閱 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)”），以防止被假死節點干擾。儘管有這麼多注意事項，冪等操作是一種實現 **恰好一次語義** 的有效方式，僅需很小的額外開銷。

#### 失敗後重建狀態

任何需要狀態的流處理 —— 例如，任何視窗聚合（例如計數器，平均值和直方圖）以及任何用於連線的表和索引，都必須確保在失敗之後能恢復其狀態。

一種選擇是將狀態儲存在遠端資料儲存中，並進行複製，然而正如在 “[流表連線（流擴充）](#流表連線（流擴充）)” 中所述，每個訊息都要查詢遠端資料庫可能會很慢。另一種方法是在流處理器本地儲存狀態，並定期複製。然後當流處理器從故障中恢復時，新任務可以讀取狀態副本，恢復處理而不丟失資料。

例如，Flink 定期捕獲運算元狀態的快照，並將它們寫入 HDFS 等持久儲存中【92,93】。Samza 和 Kafka Streams 透過將狀態變更傳送到具有日誌壓縮功能的專用 Kafka 主題來複制狀態變更，這與變更資料捕獲類似【84,100】。VoltDB 透過在多個節點上對每個輸入訊息進行冗餘處理來複制狀態（請參閱 “[真的序列執行](/v1_tw/ch7#真的序列執行)”）。

在某些情況下，甚至可能都不需要複製狀態，因為它可以從輸入流重建。例如，如果狀態是從相當短的視窗中聚合而成，則簡單地重播該視窗中的輸入事件可能是足夠快的。如果狀態是透過變更資料捕獲來維護的資料庫的本地副本，那麼也可以從日誌壓縮的變更流中重建資料庫（請參閱 “[日誌壓縮](#日誌壓縮)”）。

然而，所有這些權衡取決於底層基礎架構的效能特徵：在某些系統中，網路延遲可能低於磁碟訪問延遲，網路頻寬也可能與磁碟頻寬相當。沒有針對所有情況的普適理想權衡，隨著儲存和網路技術的發展，本地狀態與遠端狀態的優點也可能會互換。


## 本章小結

在本章中，我們討論了事件流，它們所服務的目的，以及如何處理它們。在某些方面，流處理非常類似於在 [第十章](/v1_tw/ch10) 中討論的批處理，不過是在無限的（永無止境的）流而不是固定大小的輸入上持續進行。從這個角度來看，訊息代理和事件日誌可以視作檔案系統的流式等價物。

我們花了一些時間比較兩種訊息代理：

AMQP/JMS 風格的訊息代理
: 代理將單條訊息分配給消費者，消費者在成功處理單條訊息後確認訊息。訊息被確認後從代理中刪除。這種方法適合作為一種非同步形式的 RPC（另請參閱 “[訊息傳遞中的資料流](/v1_tw/ch4#訊息傳遞中的資料流)”），例如在任務佇列中，訊息處理的確切順序並不重要，而且訊息在處理完之後，不需要回頭重新讀取舊訊息。

基於日誌的訊息代理
: 代理將一個分割槽中的所有訊息分配給同一個消費者節點，並始終以相同的順序傳遞訊息。並行是透過分割槽實現的，消費者透過存檔最近處理訊息的偏移量來跟蹤工作進度。訊息代理將訊息保留在磁碟上，因此如有必要的話，可以回跳並重新讀取舊訊息。

基於日誌的方法與資料庫中的複製日誌（請參閱 [第五章](/v1_tw/ch5)）和日誌結構儲存引擎（請參閱 [第三章](/v1_tw/ch3)）有相似之處。我們看到，這種方法對於消費輸入流，並產生衍生狀態或衍生輸出資料流的系統而言特別適用。

就流的來源而言，我們討論了幾種可能性：使用者活動事件，定期讀數的感測器，和 Feed 資料（例如，金融中的市場資料）能夠自然地表示為流。我們發現將資料庫寫入視作流也是很有用的：我們可以捕獲變更日誌 —— 即對資料庫所做的所有變更的歷史記錄 —— 隱式地透過變更資料捕獲，或顯式地透過事件溯源。日誌壓縮允許流也能保有資料庫內容的完整副本。

將資料庫表示為流為系統整合帶來了很多強大機遇。透過消費變更日誌並將其應用至衍生系統，你能使諸如搜尋索引、快取以及分析系統這類衍生資料系統不斷保持更新。你甚至能從頭開始，透過讀取從創世至今的所有變更日誌，為現有資料建立全新的檢視。

像流一樣維護狀態以及訊息重播的基礎設施，是在各種流處理框架中實現流連線和容錯的基礎。我們討論了流處理的幾種目的，包括搜尋事件模式（複雜事件處理），計算分窗聚合（流分析），以及保證衍生資料系統處於最新狀態（物化檢視）。

然後我們討論了在流處理中對時間進行推理的困難，包括處理時間與事件時間戳之間的區別，以及當你認為視窗已經完事之後，如何處理到達的掉隊事件的問題。

我們區分了流處理中可能出現的三種連線型別：

流流連線
: 兩個輸入流都由活動事件組成，而連線運算元在某個時間視窗內搜尋相關的事件。例如，它可能會將同一個使用者 30 分鐘內進行的兩個活動聯絡在一起。如果你想要找出一個流內的相關事件，連線的兩側輸入可能實際上都是同一個流（**自連線**，即 self-join）。

流表連線
: 一個輸入流由活動事件組成，另一個輸入流是資料庫變更日誌。變更日誌保證了資料庫的本地副本是最新的。對於每個活動事件，連線運算元將查詢資料庫，並輸出一個擴充套件的活動事件。

表表連線
: 兩個輸入流都是資料庫變更日誌。在這種情況下，一側的每一個變化都與另一側的最新狀態相連線。結果是兩表連線所得物化檢視的變更流。

最後，我們討論了在流處理中實現容錯和恰好一次語義的技術。與批處理一樣，我們需要放棄任何失敗任務的部分輸出。然而由於流處理長時間執行並持續產生輸出，所以不能簡單地丟棄所有的輸出。相反，可以使用更細粒度的恢復機制，基於微批次、存檔點、事務或冪等寫入。

## 參考文獻

1. Tyler Akidau, Robert Bradshaw, Craig Chambers, et al.: “[The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 12, pages 1792–1803, August 2015. [doi:10.14778/2824032.2824076](http://dx.doi.org/10.14778/2824032.2824076)
1. Harold Abelson, Gerald Jay Sussman, and Julie Sussman: [*Structure and Interpretation of Computer Programs*](https://web.archive.org/web/20220807043536/https://mitpress.mit.edu/sites/default/files/sicp/index.html), 2nd edition. MIT Press, 1996. ISBN: 978-0-262-51087-5, available online at *mitpress.mit.edu*
1. Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
1. Joseph M. Hellerstein and Michael Stonebraker: [*Readings in Database Systems*](http://redbook.cs.berkeley.edu/), 4th edition. MIT Press, 2005. ISBN: 978-0-262-69314-1, available online at *redbook.cs.berkeley.edu*
1. Don Carney, Uğur Çetintemel, Mitch Cherniack, et al.: “[Monitoring Streams – A New Class of Data Management Applications](http://www.vldb.org/conf/2002/S07P02.pdf),” at *28th International Conference on Very Large Data Bases* (VLDB), August 2002.
1. Matthew Sackman: “[Pushing Back](https://wellquite.org/posts/lshift/pushing_back/),” *lshift.net*, May 5, 2016.
1. Vicent Martí: “[Brubeck, a statsd-Compatible Metrics Aggregator](http://githubengineering.com/brubeck/),” *githubengineering.com*, June 15, 2015.
1. Seth Lowenberger: “[MoldUDP64 Protocol Specification V 1.00](http://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf),” *nasdaqtrader.com*, July 2009.
1. Pieter Hintjens: [*ZeroMQ – The Guide*](http://zguide.zeromq.org/page:all). O'Reilly Media, 2013. ISBN: 978-1-449-33404-8
1. Ian Malpass: “[Measure Anything, Measure Everything](https://codeascraft.com/2011/02/15/measure-anything-measure-everything/),” *codeascraft.com*, February 15, 2011.
1. Dieter Plaetinck: “[25 Graphite, Grafana and statsd Gotchas](https://grafana.com/blog/2016/03/03/25-graphite-grafana-and-statsd-gotchas/),” *grafana.com*, March 3, 2016.
1. Jeff Lindsay: “[Web Hooks to Revolutionize the Web](https://web.archive.org/web/20180928201955/http://progrium.com/blog/2007/05/03/web-hooks-to-revolutionize-the-web/),” *progrium.com*, May 3, 2007.
1. Jim N. Gray: “[Queues Are Databases](https://arxiv.org/pdf/cs/0701158.pdf),” Microsoft Research Technical Report MSR-TR-95-56, December 1995.
1. Mark Hapner, Rich Burridge, Rahul Sharma, et al.: “[JSR-343 Java Message Service (JMS) 2.0 Specification](https://jcp.org/en/jsr/detail?id=343),” *jms-spec.java.net*, March 2013.
1. Sanjay Aiyagari, Matthew Arrott, Mark Atwell, et al.: “[AMQP: Advanced Message Queuing Protocol Specification](http://www.rabbitmq.com/resources/specs/amqp0-9-1.pdf),” Version 0-9-1, November 2008.
1. “[Google Cloud Pub/Sub: A Google-Scale Messaging Service](https://cloud.google.com/pubsub/architecture),” *cloud.google.com*, 2016.
1. “[Apache Kafka 0.9 Documentation](http://kafka.apache.org/documentation.html),” *kafka.apache.org*, November 2015.
1. Jay Kreps, Neha Narkhede, and Jun Rao: “[Kafka: A Distributed Messaging System for Log Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/09/Kafka.pdf),” at *6th International Workshop on Networking Meets Databases* (NetDB), June 2011.
1. “[Amazon Kinesis Streams Developer Guide](http://docs.aws.amazon.com/streams/latest/dev/introduction.html),” *docs.aws.amazon.com*, April 2016.
1. Leigh Stewart and Sijie Guo: “[Building DistributedLog: Twitter’s High-Performance Replicated Log Service](https://blog.twitter.com/2015/building-distributedlog-twitter-s-high-performance-replicated-log-service),” *blog.twitter.com*, September 16, 2015.
1. “[DistributedLog Documentation](https://web.archive.org/web/20210517201308/https://bookkeeper.apache.org/distributedlog/docs/latest/),” Apache Software Foundation, *distributedlog.io*.
1. Jay Kreps: “[Benchmarking Apache Kafka: 2 Million Writes Per Second (On Three Cheap Machines)](https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines),” *engineering.linkedin.com*, April 27, 2014.
1. Kartik Paramasivam: “[How We’re Improving and Advancing Kafka at LinkedIn](https://engineering.linkedin.com/apache-kafka/how-we_re-improving-and-advancing-kafka-linkedin),” *engineering.linkedin.com*, September 2, 2015.
1. Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
1. Shirshanka Das, Chavdar Botev, Kapil Surlaker, et al.: “[All Aboard the Databus!](http://www.socc2012.org/s18-das.pdf),” at *3rd ACM Symposium on Cloud Computing* (SoCC), October 2012.
1. Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, et al.: “[Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. P. P. S. Narayan: “[Sherpa Update](http://web.archive.org/web/20160801221400/https://developer.yahoo.com/blogs/ydn/sherpa-7992.html),” *developer.yahoo.com*, June 8, 2010.
1. Martin Kleppmann: “[Bottled Water: Real-Time Integration of PostgreSQL and Kafka](http://martin.kleppmann.com/2015/04/23/bottled-water-real-time-postgresql-kafka.html),” *martin.kleppmann.com*, April 23, 2015.
1. Ben Osheroff: “[Introducing Maxwell, a mysql-to-kafka Binlog Processor](https://web.archive.org/web/20170208100334/https://developer.zendesk.com/blog/introducing-maxwell-a-mysql-to-kafka-binlog-processor),” *developer.zendesk.com*, August 20, 2015.
1. Randall Hauch: “[Debezium 0.2.1 Released](https://debezium.io/blog/2016/06/10/Debezium-0.2.1-Released/),” *debezium.io*, June 10, 2016.
1. Prem Santosh Udaya Shankar: “[Streaming MySQL Tables in Real-Time to Kafka](https://engineeringblog.yelp.com/2016/08/streaming-mysql-tables-in-real-time-to-kafka.html),” *engineeringblog.yelp.com*, August 1, 2016.
1. “[Mongoriver](https://github.com/stripe/mongoriver),” Stripe, Inc., *github.com*, September 2014.
1. Dan Harvey: “[Change Data Capture with Mongo + Kafka](http://www.slideshare.net/danharvey/change-data-capture-with-mongodb-and-kafka),” at *Hadoop Users Group UK*, August 2015.
1. “[Oracle GoldenGate 12c: Real-Time Access to Real-Time Information](https://web.archive.org/web/20160923105841/http://www.oracle.com/us/products/middleware/data-integration/oracle-goldengate-realtime-access-2031152.pdf),” Oracle White Paper, March 2015.
1. “[Oracle GoldenGate Fundamentals: How Oracle GoldenGate Works](https://www.youtube.com/watch?v=6H9NibIiPQE),” Oracle Corporation, *youtube.com*, November 2012.
1. Slava Akhmechet: “[Advancing the Realtime Web](http://rethinkdb.com/blog/realtime-web/),” *rethinkdb.com*, January 27, 2015.
1. “[Firebase Realtime Database Documentation](https://firebase.google.com/docs/database/),” Google, Inc., *firebase.google.com*, May 2016.
1. “[Apache CouchDB 1.6 Documentation](http://docs.couchdb.org/en/latest/),” *docs.couchdb.org*, 2014.
1. Matt DeBergalis: “[Meteor 0.7.0: Scalable Database Queries Using MongoDB Oplog Instead of Poll-and-Diff](https://web.archive.org/web/20160324055429/http://info.meteor.com/blog/meteor-070-scalable-database-queries-using-mongodb-oplog-instead-of-poll-and-diff),” *info.meteor.com*, December 17, 2013.
1. “[Chapter 15. Importing and Exporting Live Data](https://docs.voltdb.com/UsingVoltDB/ChapExport.php),” VoltDB 6.4 User Manual, *docs.voltdb.com*, June 2016.
1. Neha Narkhede: “[Announcing Kafka Connect: Building Large-Scale Low-Latency Data Pipelines](http://www.confluent.io/blog/announcing-kafka-connect-building-large-scale-low-latency-data-pipelines),” *confluent.io*, February 18, 2016.
1. Greg Young: “[CQRS and Event Sourcing](https://www.youtube.com/watch?v=JHGkaShoyNs),” at *Code on the Beach*, August 2014.
1. Martin Fowler: “[Event Sourcing](http://martinfowler.com/eaaDev/EventSourcing.html),” *martinfowler.com*, December 12, 2005.
1. Vaughn Vernon: [*Implementing Domain-Driven Design*](https://www.informit.com/store/implementing-domain-driven-design-9780321834577). Addison-Wesley Professional, 2013. ISBN: 978-0-321-83457-7
1. H. V. Jagadish, Inderpal Singh Mumick, and Abraham Silberschatz: “[View Maintenance Issues for the Chronicle Data Model](https://dl.acm.org/doi/10.1145/212433.220201),” at *14th ACM SIGACT-SIGMOD-SIGART Symposium on Principles of Database Systems* (PODS), May 1995. [doi:10.1145/212433.220201](http://dx.doi.org/10.1145/212433.220201)
1. “[Event Store 3.5.0 Documentation](http://docs.geteventstore.com/),” Event Store LLP, *docs.geteventstore.com*, February 2016.
1. Martin Kleppmann: [*Making Sense of Stream Processing*](http://www.oreilly.com/data/free/stream-processing.csp). Report, O'Reilly Media, May 2016.
1. Sander Mak: “[Event-Sourced Architectures with Akka](http://www.slideshare.net/SanderMak/eventsourced-architectures-with-akka),” at *JavaOne*, September 2014.
1. Julian Hyde: [personal communication](https://twitter.com/julianhyde/status/743374145006641153), June 2016.
1. Ashish Gupta and Inderpal Singh Mumick: *Materialized Views: Techniques, Implementations, and Applications*. MIT Press, 1999. ISBN: 978-0-262-57122-7
1. Timothy Griffin and Leonid Libkin: “[Incremental Maintenance of Views with Duplicates](http://homepages.inf.ed.ac.uk/libkin/papers/sigmod95.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1995. [doi:10.1145/223784.223849](http://dx.doi.org/10.1145/223784.223849)
1. Pat Helland: “[Immutability Changes Everything](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper16.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Martin Kleppmann: “[Accounting for Computer Scientists](http://martin.kleppmann.com/2011/03/07/accounting-for-computer-scientists.html),” *martin.kleppmann.com*, March 7, 2011.
1. Pat Helland: “[Accountants Don't Use Erasers](https://web.archive.org/web/20200220161036/https://blogs.msdn.microsoft.com/pathelland/2007/06/14/accountants-dont-use-erasers/),” *blogs.msdn.com*, June 14, 2007.
1. Fangjin Yang: “[Dogfooding with Druid, Samza, and Kafka: Metametrics at Metamarkets](https://metamarkets.com/2015/dogfooding-with-druid-samza-and-kafka-metametrics-at-metamarkets/),” *metamarkets.com*, June 3, 2015.
1. Gavin Li, Jianqiu Lv, and Hang Qi: “[Pistachio: Co-Locate the Data and Compute for Fastest Cloud Compute](https://web.archive.org/web/20181214032620/https://yahoohadoop.tumblr.com/post/116365275781/pistachio-co-locate-the-data-and-compute-for),” *yahoohadoop.tumblr.com*, April 13, 2015.
1. Kartik Paramasivam: “[Stream Processing Hard Problems – Part 1: Killing Lambda](https://engineering.linkedin.com/blog/2016/06/stream-processing-hard-problems-part-1-killing-lambda),” *engineering.linkedin.com*, June 27, 2016.
1. Martin Fowler: “[CQRS](http://martinfowler.com/bliki/CQRS.html),” *martinfowler.com*, July 14, 2011.
1. Greg Young: “[CQRS Documents](https://cqrs.files.wordpress.com/2010/11/cqrs_documents.pdf),” *cqrs.files.wordpress.com*, November 2010.
1. Baron Schwartz: “[Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20161110094746/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/),” *xaprb.com*, December 28, 2013.
1. Daniel Eloff, Slava Akhmechet, Jay Kreps, et al.: “[Re: Turning the Database Inside-out with Apache Samza](https://news.ycombinator.com/item?id=9145197),” Hacker News discussion, *news.ycombinator.com*, March 4, 2015.
1. “[Datomic Development Resources: Excision](http://docs.datomic.com/excision.html),” Cognitect, Inc., *docs.datomic.com*.
1. “[Fossil Documentation: Deleting Content from Fossil](http://fossil-scm.org/index.html/doc/trunk/www/shunning.wiki),” *fossil-scm.org*, 2016.
1. Jay Kreps: “[The irony of distributed systems is that data loss is really easy but deleting data is surprisingly hard,](https://twitter.com/jaykreps/status/582580836425330688)” *twitter.com*, March 30, 2015.
1. David C. Luckham: “[What’s the Difference Between ESP and CEP?](http://www.complexevents.com/2006/08/01/what%E2%80%99s-the-difference-between-esp-and-cep/),” *complexevents.com*, August 1, 2006.
1. Srinath Perera: “[How Is Stream Processing and Complex Event Processing (CEP) Different?](https://www.quora.com/How-is-stream-processing-and-complex-event-processing-CEP-different),” *quora.com*, December 3, 2015.
1. Arvind Arasu, Shivnath Babu, and Jennifer Widom: “[The CQL Continuous Query Language: Semantic Foundations and Query Execution](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cql.pdf),” *The VLDB Journal*, volume 15, number 2, pages 121–142, June 2006. [doi:10.1007/s00778-004-0147-z](http://dx.doi.org/10.1007/s00778-004-0147-z)
1. Julian Hyde: “[Data in Flight: How Streaming SQL Technology Can Help Solve the Web 2.0 Data Crunch](http://queue.acm.org/detail.cfm?id=1667562),” *ACM Queue*, volume 7, number 11, December 2009. [doi:10.1145/1661785.1667562](http://dx.doi.org/10.1145/1661785.1667562)
1. “[Esper Reference, Version 5.4.0](http://esper.espertech.com/release-5.4.0/esper-reference/html_single/index.html),” EsperTech, Inc., *espertech.com*, April 2016.
1. Zubair Nabi, Eric Bouillet, Andrew Bainbridge, and Chris Thomas: “[Of Streams and Storms](https://web.archive.org/web/20170711081434/https://developer.ibm.com/streamsdev/wp-content/uploads/sites/15/2014/04/Streams-and-Storm-April-2014-Final.pdf),” IBM technical report, *developer.ibm.com*, April 2014.
1. Milinda Pathirage, Julian Hyde, Yi Pan, and Beth Plale: “[SamzaSQL: Scalable Fast Data Management with Streaming SQL](https://github.com/milinda/samzasql-hpbdc2016/blob/master/samzasql-hpbdc2016.pdf),” at *IEEE International Workshop on High-Performance Big Data Computing* (HPBDC), May 2016. [doi:10.1109/IPDPSW.2016.141](http://dx.doi.org/10.1109/IPDPSW.2016.141)
1. Philippe Flajolet, Éric Fusy, Olivier Gandouet, and Frédéric Meunier: “[HyperLogLog: The Analysis of a Near-Optimal Cardinality Estimation Algorithm](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf),” at *Conference on Analysis of Algorithms* (AofA), June 2007.
1. Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
1. Ian Hellström: “[An Overview of Apache Streaming Technologies](https://databaseline.bitbucket.io/an-overview-of-apache-streaming-technologies/),” *databaseline.bitbucket.io*, March 12, 2016.
1. Jay Kreps: “[Why Local State Is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing),” *oreilly.com*, July 31, 2014.
1. Shay Banon: “[Percolator](https://www.elastic.co/blog/percolator),” *elastic.co*, February 8, 2011.
1. Alan Woodward and Martin Kleppmann: “[Real-Time Full-Text Search with Luwak and Samza](http://martin.kleppmann.com/2015/04/13/real-time-full-text-search-luwak-samza.html),” *martin.kleppmann.com*, April 13, 2015.
1. “[Apache Storm 2.1.0 Documentation](https://storm.apache.org/releases/2.1.0/index.html),” *storm.apache.org*, October 2019.
1. Tyler Akidau: “[The World Beyond Batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102),” *oreilly.com*, January 20, 2016.
1. Stephan Ewen: “[Streaming Analytics with Apache Flink](https://www.confluent.io/resources/kafka-summit-2016/advanced-streaming-analytics-apache-flink-apache-kafka/),” at *Kafka Summit*, April 2016.
1. Tyler Akidau, Alex Balikov, Kaya Bekiroğlu, et al.: “[MillWheel: Fault-Tolerant Stream Processing at Internet Scale](http://research.google.com/pubs/pub41378.html),” at *39th International Conference on Very Large Data Bases* (VLDB), August 2013.
1. Alex Dean: “[Improving Snowplow's Understanding of Time](https://snowplow.io/blog/improving-snowplows-understanding-of-time/),” *snowplowanalytics.com*, September 15, 2015.
1. “[Windowing (Azure Stream Analytics)](https://msdn.microsoft.com/en-us/library/azure/dn835019.aspx),” Microsoft Azure Reference, *msdn.microsoft.com*, April 2016.
1. “[State Management](http://samza.apache.org/learn/documentation/0.10/container/state-management.html),” Apache Samza 0.10 Documentation, *samza.apache.org*, December 2015.
1. Rajagopal Ananthanarayanan, Venkatesh Basker, Sumit Das, et al.: “[Photon: Fault-Tolerant and Scalable Joining of Continuous Data Streams](http://research.google.com/pubs/pub41318.html),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2465272](http://dx.doi.org/10.1145/2463676.2465272)
1. Martin Kleppmann: “[Samza Newsfeed Demo](https://github.com/ept/newsfeed),” *github.com*, September 2014.
1. Ben Kirwin: “[Doing the Impossible: Exactly-Once Messaging Patterns in Kafka](http://ben.kirw.in/2014/11/28/kafka-patterns/),” *ben.kirw.in*, November 28, 2014.
1. Pat Helland: “[Data on the Outside Versus Data on the Inside](http://cidrdb.org/cidr2005/papers/P12.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Ralph Kimball and Margy Ross: *The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*, 3rd edition. John Wiley & Sons, 2013. ISBN: 978-1-118-53080-1
1. Viktor Klang: “[I'm coining the phrase 'effectively-once' for message processing with at-least-once + idempotent operations](https://twitter.com/viktorklang/status/789036133434978304),” *twitter.com*, October 20, 2016.
1. Matei Zaharia, Tathagata Das, Haoyuan Li, et al.: “[Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](https://www.usenix.org/system/files/conference/hotcloud12/hotcloud12-final28.pdf),” at *4th USENIX Conference in Hot Topics in Cloud Computing* (HotCloud), June 2012.
1. Kostas Tzoumas, Stephan Ewen, and Robert Metzger: “[High-Throughput, Low-Latency, and Exactly-Once Stream Processing with Apache Flink](https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink),” *ververica.com*, August 5, 2015.
1. Paris Carbone, Gyula Fóra, Stephan Ewen, et al.: “[Lightweight Asynchronous Snapshots for Distributed Dataflows](http://arxiv.org/abs/1506.08603),” arXiv:1506.08603 &#91;cs.DC&#93;, June 29, 2015.
1. Ryan Betts and John Hugg: [*Fast Data: Smart and at Scale*](http://www.oreilly.com/data/free/fast-data-smart-and-at-scale.csp). Report, O'Reilly Media, October 2015.
1. Flavio Junqueira: “[Making Sense of Exactly-Once Semantics](https://web.archive.org/web/20160812172900/http://conferences.oreilly.com/strata/hadoop-big-data-eu/public/schedule/detail/49690),” at *Strata+Hadoop World London*, June 2016.
1. Jason Gustafson, Flavio Junqueira, Apurva Mehta, Sriram Subramanian, and Guozhang Wang: “[KIP-98 – Exactly Once Delivery and Transactional Messaging](https://cwiki.apache.org/confluence/display/KAFKA/KIP-98+-+Exactly+Once+Delivery+and+Transactional+Messaging),” *cwiki.apache.org*, November 2016.
1. Pat Helland: “[Idempotence Is Not a Medical Condition](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=4b6dda7fe75b51e1c543a87ca7b3b322fbf55614),” *Communications of the ACM*, volume 55, number 5, page 56, May 2012. [doi:10.1145/2160718.2160734](http://dx.doi.org/10.1145/2160718.2160734)
1. Jay Kreps: “[Re: Trying to Achieve Deterministic Behavior on Recovery/Rewind](http://mail-archives.apache.org/mod_mbox/samza-dev/201409.mbox/%3CCAOeJiJg%2Bc7Ei%3DgzCuOz30DD3G5Hm9yFY%3DUJ6SafdNUFbvRgorg%40mail.gmail.com%3E),” email to *samza-dev* mailing list, September 9, 2014.
1. E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson: “[A Survey of Rollback-Recovery Protocols in Message-Passing Systems](http://www.cs.utexas.edu/~lorenzo/papers/SurveyFinal.pdf),” *ACM Computing Surveys*, volume 34, number 3, pages 375–408, September 2002. [doi:10.1145/568522.568525](http://dx.doi.org/10.1145/568522.568525)
1. Adam Warski: “[Kafka Streams – How Does It Fit the Stream Processing Landscape?](https://softwaremill.com/kafka-streams-how-does-it-fit-stream-landscape/),” *softwaremill.com*, June 1, 2016.


================================================
FILE: content/v1_tw/ch12.md
================================================
---
title: "第十二章：資料系統的未來"
linkTitle: "12. 資料系統的未來"
weight: 312
breadcrumbs: false
---


![](/map/ch12.png)

> 如果船長的終極目標是保護船隻，他應該永遠待在港口。
>
> —— 聖托馬斯・阿奎那《神學大全》（1265-1274）

到目前為止，本書主要描述的是 **現狀**。在這最後一章中，我們將放眼 **未來**，討論應該是怎麼樣的：我將提出一些想法與方法，我相信它們能從根本上改進我們設計與構建應用的方式。

對未來的看法與推測當然具有很大的主觀性。所以在撰寫本章時，當提及我個人的觀點時會使用第一人稱。你完全可以不同意這些觀點並提出自己的看法，但我希望本章中的概念，至少能成為富有成效的討論出發點，並澄清一些經常被混淆的概念。

[第一章](/v1_tw/ch1) 概述了本書的目標：探索如何建立 **可靠**、**可伸縮** 和 **可維護** 的應用與系統。這一主題貫穿了所有的章節：例如，我們討論了許多有助於提高可靠性的容錯演算法，有助於提高可伸縮性的分割槽，以及有助於提高可維護性的演化與抽象機制。在本章中，我們將把所有這些想法結合在一起，並在它們的基礎上展望未來。我們的目標是，發現如何設計出比現有應用更好的應用 —— 健壯、正確、可演化、且最終對人類有益。

## 資料整合

本書中反覆出現的主題是，對於任何給定的問題都會有好幾種解決方案，所有這些解決方案都有不同的優缺點與利弊權衡。例如在 [第三章](/v1_tw/ch3) 討論儲存引擎時，我們看到了日誌結構儲存、B 樹以及列式儲存。在 [第五章](/v1_tw/ch5) 討論複製時，我們看到了單領導者、多領導者和無領導者的方法。

如果你有一個類似於 “我想儲存一些資料並稍後再查詢” 的問題，那麼並沒有一種正確的解決方案。但對於不同的具體環境，總會有不同的合適方法。軟體實現通常必須選擇一種特定的方法。使單條程式碼路徑能做到穩定健壯且表現良好已經是一件非常困難的事情了 —— 嘗試在單個軟體中完成所有事情，幾乎可以保證，實現效果會很差。

因此軟體工具的最佳選擇也取決於情況。每一種軟體，甚至所謂的 “通用” 資料庫，都是針對特定的使用模式設計的。

面對讓人眼花繚亂的諸多替代品，第一個挑戰就是弄清軟體與其適用環境的對映關係。供應商不願告訴你他們軟體不適用的工作負載，這是可以理解的。但是希望先前的章節能給你提供一些問題，讓你讀出字裡行間的言外之意，並更好地理解這些權衡。

但是，即使你已經完全理解各種工具與其適用環境間的關係，還有一個挑戰：在複雜的應用中，資料的用法通常花樣百出。不太可能存在適用於 **所有** 不同資料應用場景的軟體，因此你不可避免地需要拼湊幾個不同的軟體，以提供應用所需的功能。

### 組合使用衍生資料的工具

例如，為了處理任意關鍵詞的搜尋查詢，將 OLTP 資料庫與全文搜尋索引整合在一起是很常見的需求。儘管一些資料庫（例如 PostgreSQL）包含了全文索引功能，對於簡單的應用完全夠了【1】，但更複雜的搜尋能力就需要專業的資訊檢索工具了。相反的是，搜尋索引通常不適合作為持久的記錄系統，因此許多應用需要組合這兩種不同的工具以滿足所有需求。

我們在 “[保持系統同步](/v1_tw/ch11#保持系統同步)” 中接觸過整合資料系統的問題。隨著資料不同表示形式的增加，整合問題變得越來越困難。除了資料庫和搜尋索引之外，也許你需要在分析系統（資料倉庫，或批處理和流處理系統）中維護資料副本；維護從原始資料中衍生的快取，或反正規化的資料版本；將資料灌入機器學習、分類、排名或推薦系統中；或者基於資料變更傳送通知。

令人驚訝的是，我經常看到軟體工程師做出這樣的陳述：“根據我的經驗，99% 的人只需要 X” 或者 “...... 不需要 X”（對於各種各樣的 X）。我認為這種陳述更像是發言人自己的經驗，而不是技術實際上的實用性。可能對資料執行的操作，其範圍極其寬廣。某人認為雞肋而毫無意義的功能可能是別人的核心需求。當你拉高視角，並考慮跨越整個組織範圍的資料流時，資料整合的需求往往就會變得明顯起來。

#### 理解資料流

當需要在多個儲存系統中維護相同資料的副本以滿足不同的訪問模式時，你要對輸入和輸出瞭如指掌：哪些資料先寫入，哪些資料表示衍生自哪些來源？如何以正確的格式，將所有資料匯入正確的地方？

例如，你可能會首先將資料寫入 **記錄系統** 資料庫，捕獲對該資料庫所做的變更（請參閱 “[變更資料捕獲](/v1_tw/ch11#變更資料捕獲)”），然後將變更以相同的順序應用於搜尋索引。如果變更資料捕獲（CDC）是更新索引的唯一方式，則可以確定該索引完全派生自記錄系統，因此與其保持一致（除軟體錯誤外）。寫入資料庫是向該系統提供新輸入的唯一方式。

允許應用程式直接寫入搜尋索引和資料庫引入了如 [圖 11-4](/v1/ddia_1104.png) 所示的問題，其中兩個客戶端同時傳送衝突的寫入，且兩個儲存系統按不同順序處理它們。在這種情況下，既不是資料庫說了算，也不是搜尋索引說了算，所以它們做出了相反的決定，進入彼此間永續性的不一致狀態。

如果你可以透過單個系統來提供所有使用者輸入，從而決定所有寫入的排序，則透過按相同順序處理寫入，可以更容易地衍生出其他資料表示。這是狀態機複製方法的一個應用，我們在 “[全序廣播](/v1_tw/ch9#全序廣播)” 中看到。無論你使用變更資料捕獲還是事件溯源日誌，都不如簡單的基於全序的決策原則更重要。

基於事件日誌來更新衍生資料的系統，通常可以做到 **確定性** 與 **冪等性**（請參閱 “[冪等性](/v1_tw/ch11#冪等性)”），使得從故障中恢復相當容易。

#### 衍生資料與分散式事務

保持不同資料系統彼此一致的經典方法涉及分散式事務，如 “[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)” 中所述。與分散式事務相比，使用衍生資料系統的方法如何？

在抽象層面，它們透過不同的方式達到類似的目標。分散式事務透過 **鎖** 進行互斥來決定寫入的順序（請參閱 “[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)”），而 CDC 和事件溯源使用日誌進行排序。分散式事務使用原子提交來確保變更只生效一次，而基於日誌的系統通常基於 **確定性重試** 和 **冪等性**。

最大的不同之處在於事務系統通常提供 [線性一致性](/v1_tw/ch9#線性一致性)，這包含著有用的保證，例如 [讀己之寫](/v1_tw/ch5#讀己之寫)。另一方面，衍生資料系統通常是非同步更新的，因此它們預設不會提供相同的時序保證。

在願意為分散式事務付出代價的有限場景中，它們已被成功應用。但是，我認為 XA 的容錯能力和效能很差勁（請參閱 “[實踐中的分散式事務](/v1_tw/ch9#實踐中的分散式事務)”），這嚴重限制了它的實用性。我相信為分散式事務設計一種更好的協議是可行的。但使這樣一種協議被現有工具廣泛接受是很有挑戰的，且不是立竿見影的事。

在沒有廣泛支援的良好分散式事務協議的情況下，我認為基於日誌的衍生資料是整合不同資料系統的最有前途的方法。然而，諸如讀己之寫的保證是有用的，我認為告訴所有人 “最終一致性是不可避免的 —— 忍一忍並學會和它打交道” 是沒有什麼建設性的（至少在缺乏 **如何** 應對的良好指導時）。

在 “[將事情做正確](#將事情做正確)” 中，我們將討論一些在非同步衍生系統之上實現更強保障的方法，並邁向分散式事務和基於日誌的非同步系統之間的中間地帶。

#### 全序的限制

對於足夠小的系統，構建一個完全有序的事件日誌是完全可行的（正如單主複製資料庫的流行所證明的那樣，它正好建立了這樣一種日誌）。但是，隨著系統向更大更複雜的工作負載伸縮，限制開始出現：

* 在大多數情況下，構建完全有序的日誌，需要所有事件彙集於決定順序的 **單個領導者節點**。如果事件吞吐量大於單臺計算機的處理能力，則需要將其分割槽到多臺計算機上（請參閱 “[分割槽日誌](/v1_tw/ch11#分割槽日誌)”）。然後兩個不同分割槽中的事件順序關係就不明確了。
* 如果伺服器分佈在多個 **地理位置分散** 的資料中心上，例如為了容忍整個資料中心掉線，你通常在每個資料中心都有單獨的主庫，因為網路延遲會導致同步的跨資料中心協調效率低下（請參閱 “[多主複製](/v1_tw/ch5#多主複製)”）。這意味著源自兩個不同資料中心的事件順序未定義。
* 將應用程式部署為微服務時（請參閱 “[服務中的資料流：REST 與 RPC](/v1_tw/ch4#服務中的資料流：REST與RPC)”），常見的設計選擇是將每個服務及其持久狀態作為獨立單元進行部署，服務之間不共享持久狀態。當兩個事件來自不同的服務時，這些事件間的順序未定義。
* 某些應用程式在客戶端儲存狀態，該狀態在使用者輸入時立即更新（無需等待伺服器確認），甚至可以繼續離線工作（請參閱 “[需要離線操作的客戶端](/v1_tw/ch5#需要離線操作的客戶端)”）。對於這樣的應用程式，客戶端和伺服器很可能以不同的順序看到事件。

在形式上，決定事件的全域性順序稱為 **全序廣播**，相當於 **共識**（請參閱 “[共識演算法和全序廣播](/v1_tw/ch9#共識演算法和全序廣播)”）。大多數共識演算法都是針對單個節點的吞吐量足以處理整個事件流的情況而設計的，並且這些演算法不提供多個節點共享事件排序工作的機制。設計可以伸縮至單個節點的吞吐量之上，且在地理位置分散的環境中仍然工作良好的共識演算法仍然是一個開放的研究問題。

#### 排序事件以捕獲因果關係

在事件之間不存在因果關係的情況下，全序的缺乏並不是一個大問題，因為併發事件可以任意排序。其他一些情況很容易處理：例如，當同一物件有多個更新時，它們可以透過將特定物件 ID 的所有更新路由到相同的日誌分割槽來完全排序。然而，因果關係有時會以更微妙的方式出現（請參閱 “[順序與因果關係](/v1_tw/ch9#順序與因果關係)”）。

例如，考慮一個社交網路服務，以及一對曾處於戀愛關係但剛分手的使用者。其中一個使用者將另一個使用者從好友中移除，然後向剩餘的好友傳送訊息，抱怨他們的前任。使用者的心思是他們的前任不應該看到這些粗魯的訊息，因為訊息是在好友狀態解除後傳送的。

但是如果好友關係狀態與訊息儲存在不同的地方，在這樣一個系統中，可能會出現 **解除好友** 事件與 **傳送訊息** 事件之間的因果依賴丟失的情況。如果因果依賴關係沒有被捕捉到，則傳送有關新訊息的通知的服務可能會在 **解除好友** 事件之前處理 **傳送訊息** 事件，從而錯誤地向前任傳送通知。

在本例中，通知實際上是訊息和好友列表之間的連線，使得它與我們先前討論的連線的時序問題有關（請參閱 “[連線的時間依賴性](/v1_tw/ch11#連線的時間依賴性)”）。不幸的是，這個問題似乎並沒有一個簡單的答案【2,3】。起點包括：

* 邏輯時間戳可以提供無需協調的全域性順序（請參閱 “[序列號順序](/v1_tw/ch9#序列號順序)”），因此它們可能有助於全序廣播不可行的情況。但是，它們仍然要求收件人處理不按順序傳送的事件，並且需要傳遞其他元資料。
* 如果你可以記錄一個事件來記錄使用者在做出決定之前所看到的系統狀態，並給該事件一個唯一的識別符號，那麼後面的任何事件都可以引用該事件識別符號來記錄因果關係【4】。我們將在 “[讀也是事件](#讀也是事件)” 中回到這個想法。
* 衝突解決演算法（請參閱 “[自動衝突解決](/v1_tw/ch5#自動衝突解決)”）有助於處理以意外順序傳遞的事件。它們對於維護狀態很有用，但如果行為有外部副作用（例如，給使用者傳送通知），就沒什麼幫助了。

也許，隨著時間的推移，應用開發模式將出現，使得能夠有效地捕獲因果依賴關係，並且保持正確的衍生狀態，而不會迫使所有事件經歷全序廣播的瓶頸。

### 批處理與流處理

我會說資料整合的目標是，確保資料最終能在所有正確的地方表現出正確的形式。這樣做需要消費輸入、轉換、連線、過濾、聚合、訓練模型、評估、以及最終寫出適當的輸出。批處理和流處理是實現這一目標的工具。

批處理和流處理的輸出是衍生資料集，例如搜尋索引、物化檢視、向用戶顯示的建議、聚合指標等（請參閱 “[批處理工作流的輸出](/v1_tw/ch10#批處理工作流的輸出)” 和 “[流處理的應用](/v1_tw/ch11#流處理的應用)”）。

正如我們在 [第十章](/v1_tw/ch10) 和 [第十一章](/v1_tw/ch11) 中看到的，批處理和流處理有許多共同的原則，主要的根本區別在於流處理器在無限資料集上執行，而批處理輸入是已知的有限大小。處理引擎的實現方式也有很多細節上的差異，但是這些區別已經開始模糊。

Spark 在批處理引擎上執行流處理，將流分解為 **微批次（microbatches）**，而 Apache Flink 則在流處理引擎上執行批處理【5】。原則上，一種型別的處理可以用另一種型別來模擬，但是效能特徵會有所不同：例如，在跳躍或滑動視窗上，微批次可能表現不佳【6】。

#### 維護衍生狀態

批處理有著很強的函式式風格（即使其程式碼不是用函式式語言編寫的）：它鼓勵確定性的純函式，其輸出僅依賴於輸入，除了顯式輸出外沒有副作用，將輸入視作不可變的，且輸出是僅追加的。流處理與之類似，但它擴充套件了運算元以允許受管理的、容錯的狀態（請參閱 “[失敗後重建狀態](/v1_tw/ch11#失敗後重建狀態)”）。

具有良好定義的輸入和輸出的確定性函式的原理不僅有利於容錯（請參閱 “[冪等性](/v1_tw/ch11#冪等性)”），也簡化了有關組織中資料流的推理【7】。無論衍生資料是搜尋索引、統計模型還是快取，採用這種觀點思考都是很有幫助的：將其視為從一個東西衍生出另一個的資料管道，透過函式式應用程式碼推送一個系統的狀態變更，並將其效果應用至衍生系統中。

原則上，衍生資料系統可以同步地維護，就像關係資料庫在與索引表寫入操作相同的事務中同步更新次級索引一樣。然而，非同步是使基於事件日誌的系統穩健的原因：它允許系統的一部分故障被抑制在本地。而如果任何一個參與者失敗，分散式事務將中止，因此它們傾向於透過將故障傳播到系統的其餘部分來放大故障（請參閱 “[分散式事務的限制](/v1_tw/ch9#分散式事務的限制)”）。

我們在 “[分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)” 中看到，次級索引經常跨越分割槽邊界。具有次級索引的分割槽系統需要將寫入傳送到多個分割槽（如果索引按關鍵詞分割槽的話）或將讀取傳送到所有分割槽（如果索引是按文件分割槽的話）。如果索引是非同步維護的，這種跨分割槽通訊也是最可靠和最可伸縮的【8】（另請參閱 “[多分割槽資料處理](#多分割槽資料處理)”）。

#### 應用演化後重新處理資料

在維護衍生資料時，批處理和流處理都是有用的。流處理允許將輸入中的變化以低延遲反映在衍生檢視中，而批處理允許重新處理大量累積的歷史資料以便將新檢視匯出到現有資料集上。

特別是，重新處理現有資料為維護系統、演化並支援新功能和需求變更提供了一個良好的機制（請參閱 [第四章](/v1_tw/ch4)）。沒有重新進行處理，模式演化將僅限於簡單的變化，例如向記錄中新增新的可選欄位或新增新型別的記錄。無論是在寫時模式還是在讀時模式中都是如此（請參閱 “[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)”）。另一方面，透過重新處理，可以將資料集重組為一個完全不同的模型，以便更好地滿足新的要求。

> ### 鐵路上的模式遷移
>
> 大規模的 “模式遷移” 也發生在非計算機系統中。例如，在 19 世紀英國鐵路建設初期，軌距（兩軌之間的距離）就有了各種各樣的競爭標準。為一種軌距而建的列車不能在另一種軌距的軌道上執行，這限制了火車網路中可能的相互連線【9】。
>
> 在 1846 年最終確定了一個標準軌距之後，其他軌距的軌道必須轉換 —— 但是如何在不停運火車線路的情況下進行數月甚至數年的遷移？解決的辦法是首先透過新增第三條軌道將軌道轉換為 **雙軌距（dual gauge）** 或 **混合軌距**。這種轉換可以逐漸完成，當完成時，兩種軌距的列車都可以在線路上跑，使用三條軌道中的兩條。最終，一旦所有的列車都轉換成標準軌距，那麼可以移除提供非標準軌距的軌道。
>
> 以這種方式 “再加工” 現有的軌道，讓新舊版本並存，可以在幾年的時間內逐漸改變軌距。然而，這是一項昂貴的事業，這就是今天非標準軌距仍然存在的原因。例如，舊金山灣區的 BART 系統使用了與美國大部分地區不同的軌距。

衍生檢視允許 **漸進演化（gradual evolution）**。如果你想重新構建資料集，不需要執行突然切換式的遷移。取而代之的是，你可以將舊架構和新架構並排維護為相同基礎資料上的兩個獨立衍生檢視。然後可以開始將少量使用者轉移到新檢視，以測試其效能並發現任何錯誤，而大多數使用者仍然會被路由到舊檢視。你可以逐漸地增加訪問新檢視的使用者比例，最終可以刪除舊檢視【10】。

這種逐漸遷移的美妙之處在於，如果出現問題，每個階段的過程都很容易逆轉：你始終有一個可以回滾的可用系統。透過降低不可逆損害的風險，你能對繼續前進更有信心，從而更快地改善系統【11】。

#### Lambda架構

如果批處理用於重新處理歷史資料，而流處理用於處理最近的更新，那麼如何將這兩者結合起來？Lambda 架構【12】是這方面的一個建議，引起了很多關注。

Lambda 架構的核心思想是透過將不可變事件附加到不斷增長的資料集來記錄傳入資料，這類似於事件溯源（請參閱 “[事件溯源](/v1_tw/ch11#事件溯源)”）。為了從這些事件中衍生出讀取最佳化的檢視，Lambda 架構建議並行執行兩個不同的系統：批處理系統（如 Hadoop MapReduce）和獨立的流處理系統（如 Storm）。

在 Lambda 方法中，流處理器消耗事件並快速生成對檢視的近似更新；批處理器稍後將使用同一組事件並生成衍生檢視的更正版本。這個設計背後的原因是批處理更簡單，因此不易出錯，而流處理器被認為是不太可靠和難以容錯的（請參閱 “[容錯](/v1_tw/ch11#容錯)”）。而且，流處理可以使用快速近似演算法，而批處理使用較慢的精確演算法。

Lambda 架構是一種有影響力的想法，它將資料系統的設計變得更好，尤其是透過推廣這樣的原則：在不可變事件流上建立衍生檢視，並在需要時重新處理事件。但是我也認為它有一些實際問題：

* 在批處理和流處理框架中維護相同的邏輯是很顯著的額外工作。雖然像 Summingbird【13】這樣的庫提供了一種可以在批處理和流處理的上下文中執行的計算抽象，但除錯、調整和維護兩個不同系統的操作複雜性依然存在【14】。
* 由於流管道和批處理管道產生獨立的輸出，因此需要合併它們以響應使用者請求。如果計算是基於滾動視窗的簡單聚合，則合併相當容易，但如果檢視基於更複雜的操作（例如連線和會話化）而匯出，或者輸出不是時間序列，則會變得非常困難。
* 儘管有能力重新處理整個歷史資料集是很好的，但在大型資料集上這樣做經常會開銷巨大。因此，批處理流水線通常需要設定為處理增量批處理（例如，在每小時結束時處理一小時的資料），而不是重新處理所有內容。這引發了 “[時間推理](/v1_tw/ch11#時間推理)” 中討論的問題，例如處理滯留事件和處理跨批次邊界的視窗。增量化批處理計算會增加複雜性，使其更類似於流式傳輸層，這與保持批處理層儘可能簡單的目標背道而馳。

#### 統一批處理和流處理

最近的工作使得 Lambda 架構的優點在沒有其缺點的情況下得以實現，允許批處理計算（重新處理歷史資料）和流計算（在事件到達時即處理）在同一個系統中實現【15】。

在一個系統中統一批處理和流處理需要以下功能，這些功能也正在越來越廣泛地被提供：

* 透過處理最近事件流的相同處理引擎來重播歷史事件的能力。例如，基於日誌的訊息代理可以重播訊息（請參閱 “[重播舊訊息](/v1_tw/ch11#重播舊訊息)”），某些流處理器可以從 HDFS 等分散式檔案系統讀取輸入。
* 對於流處理器來說，恰好一次語義 —— 即確保輸出與未發生故障的輸出相同，即使事實上發生故障（請參閱 “[容錯](/v1_tw/ch11#容錯)”）。與批處理一樣，這需要丟棄任何失敗任務的部分輸出。
* 按事件時間進行視窗化的工具，而不是按處理時間進行視窗化，因為處理歷史事件時，處理時間毫無意義（請參閱 “[時間推理](/v1_tw/ch11#時間推理)”）。例如，Apache Beam 提供了用於表達這種計算的 API，可以在 Apache Flink 或 Google Cloud Dataflow 使用。


## 分拆資料庫

在最抽象的層面上，資料庫，Hadoop 和作業系統都發揮相同的功能：它們儲存一些資料，並允許你處理和查詢這些資料【16】。資料庫將資料儲存為特定資料模型的記錄（表中的行、文件、圖中的頂點等），而作業系統的檔案系統則將資料儲存在檔案中 —— 但其核心都是 “資訊管理” 系統【17】。正如我們在 [第十章](/v1_tw/ch10) 中看到的，Hadoop 生態系統有點像 Unix 的分散式版本。

當然，有很多實際的差異。例如，許多檔案系統都不能很好地處理包含 1000 萬個小檔案的目錄，而包含 1000 萬個小記錄的資料庫完全是尋常而不起眼的。無論如何，作業系統和資料庫之間的相似之處和差異值得探討。

Unix 和關係資料庫以非常不同的哲學來處理資訊管理問題。Unix 認為它的目的是為程式設計師提供一種相當低層次的硬體的邏輯抽象，而關係資料庫則希望為應用程式設計師提供一種高層次的抽象，以隱藏磁碟上資料結構的複雜性、併發性、崩潰恢復等等。Unix 發展出的管道和檔案只是位元組序列，而資料庫則發展出了 SQL 和事務。

哪種方法更好？當然這取決於你想要的是什麼。Unix 是 “簡單的”，因為它是對硬體資源相當薄的包裝；關係資料庫是 “更簡單” 的，因為一個簡短的宣告性查詢可以利用很多強大的基礎設施（查詢最佳化、索引、連線方法、併發控制、複製等），而不需要查詢的作者理解其實現細節。

這些哲學之間的矛盾已經持續了幾十年（Unix 和關係模型都出現在 70 年代初），仍然沒有解決。例如，我將 NoSQL 運動解釋為，希望將類 Unix 的低級別抽象方法應用於分散式 OLTP 資料儲存的領域。

在這一部分我將試圖調和這兩個哲學，希望我們能各取其美。

### 組合使用資料儲存技術

在本書的過程中，我們討論了資料庫提供的各種功能及其工作原理，其中包括：

* 次級索引，使你可以根據欄位的值有效地搜尋記錄（請參閱 “[其他索引結構](/v1_tw/ch3#其他索引結構)”）
* 物化檢視，這是一種預計算的查詢結果快取（請參閱 “[聚合：資料立方體和物化檢視](/v1_tw/ch3#聚合：資料立方體和物化檢視)”）
* 複製日誌，保持其他節點上資料的副本最新（請參閱 “[複製日誌的實現](/v1_tw/ch5#複製日誌的實現)”）
* 全文搜尋索引，允許在文字中進行關鍵字搜尋（請參閱 “[全文搜尋和模糊索引](/v1_tw/ch3#全文搜尋和模糊索引)”），也內置於某些關係資料庫【1】

在 [第十章](/v1_tw/ch10) 和 [第十一章](/v1_tw/ch11) 中，出現了類似的主題。我們討論了如何構建全文搜尋索引（請參閱 “[批處理工作流的輸出](/v1_tw/ch10#批處理工作流的輸出)”），瞭解了如何維護物化檢視（請參閱 “[維護物化檢視](/v1_tw/ch11#維護物化檢視)”）以及如何將變更從資料庫複製到衍生資料系統（請參閱 “[變更資料捕獲](/v1_tw/ch11#變更資料捕獲)”）。

資料庫中內建的功能與人們用批處理和流處理器構建的衍生資料系統似乎有相似之處。

#### 建立索引

想想當你執行 `CREATE INDEX` 在關係資料庫中建立一個新的索引時會發生什麼。資料庫必須掃描表的一致性快照，挑選出所有被索引的欄位值，對它們進行排序，然後寫出索引。然後它必須處理自一致快照以來所做的寫入操作（假設表在建立索引時未被鎖定，所以寫操作可能會繼續）。一旦完成，只要事務寫入表中，資料庫就必須繼續保持索引最新。

此過程非常類似於設定新的從庫副本（請參閱 “[設定新從庫](/v1_tw/ch5#設定新從庫)”），也非常類似於流處理系統中的 **引導（bootstrap）** 變更資料捕獲（請參閱 “[初始快照](/v1_tw/ch11#初始快照)”）。

無論何時執行 `CREATE INDEX`，資料庫都會重新處理現有資料集（如 “[應用演化後重新處理資料](#應用演化後重新處理資料)” 中所述），並將該索引作為新檢視匯出到現有資料上。現有資料可能是狀態的快照，而不是所有發生變化的日誌，但兩者密切相關（請參閱 “[狀態、流和不變性](/v1_tw/ch11#狀態、流和不變性)”）。

#### 一切的元資料庫

有鑑於此，我認為整個組織的資料流開始像一個巨大的資料庫【7】。每當批處理、流或 ETL 過程將資料從一個地方傳輸到另一個地方並組裝時，它表現地就像資料庫子系統一樣，使索引或物化檢視保持最新。

從這種角度來看，批處理和流處理器就像精心實現的觸發器、儲存過程和物化檢視維護例程。它們維護的衍生資料系統就像不同的索引型別。例如，關係資料庫可能支援 B 樹索引、雜湊索引、空間索引（請參閱 “[多列索引](/v1_tw/ch3#多列索引)”）以及其他型別的索引。在新興的衍生資料系統架構中，不是將這些設施作為單個整合資料庫產品的功能實現，而是由各種不同的軟體提供，執行在不同的機器上，由不同的團隊管理。

這些發展在未來將會把我們帶到哪裡？如果我們從沒有適合所有訪問模式的單一資料模型或儲存格式的前提出發，我推測有兩種途徑可以將不同的儲存和處理工具組合成一個有凝聚力的系統：

**聯合資料庫：統一讀取**

可以為各種各樣的底層儲存引擎和處理方法提供一個統一的查詢介面 —— 一種稱為 **聯合資料庫（federated database）** 或 **多型儲存（polystore）** 的方法【18,19】。例如，PostgreSQL 的 **外部資料包裝器（foreign data wrapper）** 功能符合這種模式【20】。需要專用資料模型或查詢介面的應用程式仍然可以直接訪問底層儲存引擎，而想要組合來自不同位置的資料的使用者可以透過聯合介面輕鬆完成操作。

聯合查詢介面遵循著單一整合系統的關係型傳統，帶有高階查詢語言和優雅的語義，但實現起來非常複雜。

**分拆資料庫：統一寫入**

雖然聯合能解決跨多個不同系統的只讀查詢問題，但它並沒有很好的解決跨系統 **同步** 寫入的問題。我們說過，在單個數據庫中，建立一致的索引是一項內建功能。當我們構建多個儲存系統時，我們同樣需要確保所有資料變更都會在所有正確的位置結束，即使在出現故障時也是如此。想要更容易地將儲存系統可靠地插接在一起（例如，透過變更資料捕獲和事件日誌），就像將資料庫的索引維護功能以可以跨不同技術同步寫入的方式分開【7,21】。

分拆方法遵循 Unix 傳統的小型工具，它可以很好地完成一件事【22】，透過統一的低層級 API（管道）進行通訊，並且可以使用更高層級的語言進行組合（shell）【16】。

#### 開展分拆工作

聯合和分拆是一個硬幣的兩面：用不同的元件構成可靠、可伸縮和可維護的系統。聯合只讀查詢需要將一個數據模型對映到另一個數據模型，這需要一些思考，但最終還是一個可解決的問題。而我認為同步寫入到幾個儲存系統是更困難的工程問題，所以我將重點關注它。

傳統的同步寫入方法需要跨異構儲存系統的分散式事務【18】，我認為這是錯誤的解決方案（請參閱 “[衍生資料與分散式事務](#衍生資料與分散式事務)”）。單個儲存或流處理系統內的事務是可行的，但是當資料跨越不同技術之間的邊界時，我認為具有冪等寫入的非同步事件日誌是一種更加健壯和實用的方法。

例如，分散式事務在某些流處理元件內部使用，以匹配 **恰好一次（exactly-once）** 語義（請參閱 “[原子提交再現](/v1_tw/ch11#原子提交再現)”），這可以很好地工作。然而，當事務需要涉及由不同人群編寫的系統時（例如，當資料從流處理元件寫入分散式鍵值儲存或搜尋索引時），缺乏標準化的事務協議會使整合更難。有冪等消費者的有序事件日誌（請參閱 “[冪等性](/v1_tw/ch11#冪等性)”）是一種更簡單的抽象，因此在異構系統中實現更加可行【7】。

基於日誌的整合的一大優勢是各個元件之間的 **鬆散耦合（loose coupling）**，這體現在兩個方面：

1. 在系統級別，非同步事件流使整個系統在個別元件的中斷或效能下降時更加穩健。如果消費者執行緩慢或失敗，那麼事件日誌可以緩衝訊息（請參閱 “[磁碟空間使用](/v1_tw/ch11#磁碟空間使用)”），以便生產者和任何其他消費者可以繼續不受影響地執行。有問題的消費者可以在問題修復後趕上，因此不會錯過任何資料，並且故障也得以被遏制。相比之下，分散式事務的同步互動往往會將本地故障升級為大規模故障（請參閱 “[分散式事務的限制](/v1_tw/ch9#分散式事務的限制)”）。
2. 在人力方面，分拆資料系統允許不同的團隊獨立開發，改進和維護不同的軟體元件和服務。專業化使得每個團隊都可以專注於做好一件事，並與其他團隊的系統以明確的介面互動。事件日誌提供了一個足夠強大的介面，以捕獲相當強的一致性屬性（由於永續性和事件的順序），但也足夠普適於幾乎任何型別的資料。

#### 分拆系統vs整合系統

如果分拆確實成為未來的方式，它也不會取代目前形式的資料庫 —— 它們仍然會像以往一樣被需要。為了維護流處理元件中的狀態，資料庫仍然是需要的，並且為批處理和流處理器的輸出提供查詢服務（請參閱 “[批處理工作流的輸出](/v1_tw/ch10#批處理工作流的輸出)” 與 “[流處理](/v1_tw/ch11#流處理)”）。專用查詢引擎對於特定的工作負載仍然非常重要：例如，MPP 資料倉庫中的查詢引擎針對探索性分析查詢進行了最佳化，並且能夠很好地處理這種型別的工作負載（請參閱 “[Hadoop 與分散式資料庫的對比](/v1_tw/ch10#Hadoop與分散式資料庫的對比)”）。

執行幾種不同基礎設施的複雜性可能是一個問題：每種軟體都有一個學習曲線，配置問題和操作怪癖，因此部署儘可能少的移動部件是很有必要的。比起使用應用程式碼拼接多個工具而成的系統，單一整合軟體產品也可以在其設計應對的工作負載型別上實現更好、更可預測的效能【23】。正如在前言中所說的那樣，為了不需要的規模而構建系統是白費精力，而且可能會將你鎖死在一個不靈活的設計中。實際上，這是一種過早最佳化的形式。

分拆的目標不是要針對個別資料庫與特定工作負載的效能進行競爭；我們的目標是允許你結合多個不同的資料庫，以便在比單個軟體可能實現的更廣泛的工作負載範圍內實現更好的效能。這是關於廣度，而不是深度 —— 與我們在 “[Hadoop 與分散式資料庫的對比](/v1_tw/ch10#Hadoop與分散式資料庫的對比)” 中討論的儲存和處理模型的多樣性一樣。

因此，如果有一項技術可以滿足你的所有需求，那麼最好使用該產品，而不是試圖用更低層級的元件重新實現它。只有當沒有單一軟體滿足你的所有需求時，才會出現拆分和聯合的優勢。

#### 少了什麼？

用於組成資料系統的工具正在變得越來越好，但我認為還缺少一個主要的東西：我們還沒有與 Unix shell 類似的分拆資料庫等價物（即，一種宣告式的、簡單的、用於組裝儲存和處理系統的高階語言）。

例如，要是我們可以簡單地宣告 `mysql | elasticsearch`（類似於 Unix 管道【22】），作為 `CREATE INDEX` 的分拆等價物，那就太好了：它將讀取 MySQL 資料庫中的所有文件並將其索引到 Elasticsearch 叢集中。然後它會不斷捕獲對資料庫所做的所有變更，並自動將它們應用於搜尋索引，而無需編寫自定義應用程式碼。這種整合應當支援幾乎任何型別的儲存或索引系統。

同樣，能夠更容易地預先計算和更新快取將是一件好事。回想一下，物化檢視本質上是一個預先計算的快取，所以你可以透過為複雜查詢宣告指定物化檢視來建立快取，包括圖上的遞迴查詢（請參閱 “[圖資料模型](/v1_tw/ch2#圖資料模型)”）和應用邏輯。在這方面有一些有趣的早期研究，如 **差分資料流（differential dataflow）**【24,25】，我希望這些想法最終能夠被應用到生產系統中。

### 圍繞資料流設計應用

使用應用程式碼組合專用儲存與處理系統來分拆資料庫的方法，也被稱為 “**資料庫由內而外（database inside-out）**” 方法【26】，該名稱來源於我在 2014 年的一次會議演講標題【27】。然而稱它為 “新架構” 過於誇大，我僅將其看作是一種設計模式，一個討論的起點，我們只是簡單地給它起一個名字，以便我們能更好地討論它。

這些想法不是我的；它們是很多人的思想的融合，這些思想非常值得我們學習。尤其是，以 Oz【28】和 Juttle【29】為代表的資料流語言，以 Elm【30,31】為代表的 **函式式響應式程式設計（functional reactive programming, FRP）**，以 Bloom【32】為代表的邏輯程式語言。在這一語境中的術語 **分拆（unbundling）** 是由 Jay Kreps 提出的【7】。

即使是 **電子表格** 也在資料流程式設計能力上甩開大多數主流程式語言幾條街【33】。在電子表格中，可以將公式放入一個單元格中（例如，對另一列中的單元格求和），並且只要公式的任何輸入發生變更，公式的結果都會自動重新計算。這正是我們在資料系統層次所需要的：當資料庫中的記錄發生變更時，我們希望自動更新該記錄的任何索引，並且自動重新整理依賴於記錄的任何快取檢視或聚合。你不必擔心這種重新整理如何發生的技術細節，但能夠簡單地相信它可以正常工作。

因此，我認為絕大多數資料系統仍然可以從 VisiCalc 在 1979 年已經具備的功能中學習【34】。與電子表格的不同之處在於，今天的資料系統需要具有容錯性，可伸縮性以及持久儲存資料。它們還需要能夠整合不同人群編寫的不同技術，並重用現有的庫和服務：期望使用某一種特定的語言、框架或工具來開發所有軟體是不切實際的。

在本節中，我將詳細介紹這些想法，並探討一些圍繞分拆資料庫和資料流的想法構建應用的方法。

#### 應用程式碼作為衍生函式

當一個數據集衍生自另一個數據集時，它會經歷某種轉換函式。例如：

* 次級索引是由一種直白的轉換函式生成的衍生資料集：對於基礎表中的每行或每個文件，它挑選被索引的列或欄位中的值，並按這些值排序（假設使用 B 樹或 SSTable 索引，按鍵排序，如 [第三章](/v1_tw/ch3) 所述）。
* 全文搜尋索引是透過應用各種自然語言處理函式而建立的，諸如語言檢測、分詞、詞幹或詞彙化、拼寫糾正和同義詞識別，然後構建用於高效查詢的資料結構（例如倒排索引）。
* 在機器學習系統中，我們可以將模型視作從訓練資料透過應用各種特徵提取、統計分析函式衍生的資料，當模型應用於新的輸入資料時，模型的輸出是從輸入和模型（因此間接地從訓練資料）中衍生的。
* 快取通常包含將以使用者介面（UI）顯示的形式的資料聚合。因此填充快取需要知道 UI 中引用的欄位；UI 中的變更可能需要更新快取填充方式的定義，並重建快取。

用於次級索引的衍生函式是如此常用的需求，以致於它作為核心功能被內建至許多資料庫中，你可以簡單地透過 `CREATE INDEX` 來呼叫它。對於全文索引，常見語言的基本語言特徵可能內建到資料庫中，但更複雜的特徵通常需要領域特定的調整。在機器學習中，特徵工程是眾所周知的特定於應用的特徵，通常需要包含很多關於使用者互動與應用部署的詳細知識【35】。

當建立衍生資料集的函式不是像建立次級索引那樣的標準搬磚函式時，需要自定義程式碼來處理特定於應用的東西。而這個自定義程式碼是讓許多資料庫掙扎的地方，雖然關係資料庫通常支援觸發器、儲存過程和使用者定義的函式，可以用它們來在資料庫中執行應用程式碼，但它們有點像資料庫設計裡的事後反思（請參閱 “[傳遞事件流](/v1_tw/ch11#傳遞事件流)”）。

#### 應用程式碼和狀態的分離

理論上，資料庫可以是任意應用程式碼的部署環境，就如同作業系統一樣。然而實踐中它們對這一目標適配的很差。它們不滿足現代應用開發的要求，例如依賴和軟體包管理、版本控制、滾動升級、可演化性、監控、指標、對網路服務的呼叫以及與外部系統的整合。

另一方面，Mesos、YARN、Docker、Kubernetes 等部署和叢集管理工具專為執行應用程式碼而設計。透過專注於做好一件事情，他們能夠做得比將資料庫作為其眾多功能之一執行使用者定義的功能要好得多。

我認為讓系統的某些部分專門用於持久資料儲存並讓其他部分專門執行應用程式程式碼是有意義的。這兩者可以在保持獨立的同時互動。

現在大多數 Web 應用程式都是作為無狀態服務部署的，其中任何使用者請求都可以路由到任何應用程式伺服器，並且伺服器在傳送響應後會忘記所有請求。這種部署方式很方便，因為可以隨意新增或刪除伺服器，但狀態必須到某個地方：通常是資料庫。趨勢是將無狀態應用程式邏輯與狀態管理（資料庫）分開：不將應用程式邏輯放入資料庫中，也不將持久狀態置於應用程式中【36】。正如函數語言程式設計社群喜歡開玩笑說的那樣，“我們相信 **教會（Church）** 與 **國家（state）** 的分離”【37】 [^i]

[^i]: 解釋笑話很少會讓人感覺更好，但我不想讓任何人感到被遺漏。在這裡，Church 指代的是數學家的阿隆佐・邱奇，他創立了 lambda 演算，這是計算的早期形式，是大多數函數語言程式設計語言的基礎。lambda 演算不具有可變狀態（即沒有變數可以被覆蓋），所以可以說可變狀態與 Church 的工作是分離的。

在這個典型的 Web 應用模型中，資料庫充當一種可以透過網路同步訪問的可變共享變數。應用程式可以讀取和更新變數，而資料庫負責維持它的永續性，提供一些諸如併發控制和容錯的功能。

但是，在大多數程式語言中，你無法訂閱可變變數中的變更 —— 你只能定期讀取它。與電子表格不同，如果變數的值發生變化，變數的讀者不會收到通知（你可以在自己的程式碼中實現這樣的通知 —— 這被稱為 **觀察者模式** —— 但大多數語言沒有將這種模式作為內建功能）。

資料庫繼承了這種可變資料的被動方法：如果你想知道資料庫的內容是否發生了變化，通常你唯一的選擇就是輪詢（即定期重複你的查詢）。訂閱變更只是剛剛開始出現的功能（請參閱 “[變更流的 API 支援](/v1_tw/ch11#變更流的API支援)”）。

#### 資料流：應用程式碼與狀態變化的互動

從資料流的角度思考應用程式，意味著重新協調應用程式碼和狀態管理之間的關係。我們不再將資料庫視作被應用操縱的被動變數，取而代之的是更多地考慮狀態，狀態變更和處理它們的程式碼之間的相互作用與協同關係。應用程式碼透過在另一個地方觸發狀態變更來響應狀態變更。

我們在 “[資料庫與流](/v1_tw/ch11#資料庫與流)” 中看到了這一思路，我們討論了將資料庫的變更日誌視為一種我們可以訂閱的事件流。諸如 Actor 的訊息傳遞系統（請參閱 “[訊息傳遞中的資料流](/v1_tw/ch4#訊息傳遞中的資料流)”）也具有響應事件的概念。早在 20 世紀 80 年代，**元組空間（tuple space）** 模型就已經探索了表達分散式計算的方式：觀察狀態變更並作出反應的過程【38,39】。

如前所述，當觸發器由於資料變更而被觸發時，或次級索引更新以反映索引表中的變更時，資料庫內部也發生著類似的情況。分拆資料庫意味著將這個想法應用於在主資料庫之外，用於建立衍生資料集：快取、全文搜尋索引、機器學習或分析系統。我們可以為此使用流處理和訊息傳遞系統。

需要記住的重要一點是，維護衍生資料不同於執行非同步任務。傳統的訊息傳遞系統通常是為執行非同步任務設計的（請參閱 “[日誌與傳統的訊息傳遞相比](/v1_tw/ch11#日誌與傳統的訊息傳遞相比)”）：

* 在維護衍生資料時，狀態變更的順序通常很重要（如果多個檢視是從事件日誌衍生的，則需要按照相同的順序處理事件，以便它們之間保持一致）。如 “[確認與重新傳遞](/v1_tw/ch11#確認與重新傳遞)” 中所述，許多訊息代理在重傳未確認訊息時沒有此屬性，雙寫也被排除在外（請參閱 “[保持系統同步](/v1_tw/ch11#保持系統同步)”）。
* 容錯是衍生資料的關鍵：僅僅丟失單個訊息就會導致衍生資料集永遠與其資料來源失去同步。訊息傳遞和衍生狀態更新都必須可靠。例如，許多 Actor 系統預設在記憶體中維護 Actor 的狀態和訊息，所以如果執行 Actor 的機器崩潰，狀態和訊息就會丟失。

穩定的訊息排序和容錯訊息處理是相當嚴格的要求，但與分散式事務相比，它們開銷更小，執行更穩定。現代流處理元件可以提供這些排序和可靠性保證，並允許應用程式碼以流運算元的形式執行。

這些應用程式碼可以執行任意處理，包括資料庫內建衍生函式通常不提供的功能。就像透過管道連結的 Unix 工具一樣，流運算元可以圍繞著資料流構建大型系統。每個運算元接受狀態變更的流作為輸入，併產生其他狀態變化的流作為輸出。

#### 流處理器和服務

當今流行的應用開發風格涉及將功能分解為一組透過同步網路請求（如 REST API）進行通訊的 **服務**（service，請參閱 “[服務中的資料流：REST 與 RPC](/v1_tw/ch4#服務中的資料流：REST與RPC)”）。這種面向服務的架構優於單一龐大應用的優勢主要在於：通過鬆散耦合來提供組織上的可伸縮性：不同的團隊可以專職於不同的服務上，從而減少團隊之間的協調工作（因為服務可以獨立部署和更新）。

在資料流中組裝流運算元與微服務方法有很多相似之處【40】。但底層通訊機制是有很大區別：資料流採用單向非同步訊息流，而不是同步的請求 / 響應式互動。

除了在 “[訊息傳遞中的資料流](/v1_tw/ch4#訊息傳遞中的資料流)” 中列出的優點（如更好的容錯性），資料流系統還能實現更好的效能。例如，假設客戶正在購買以一種貨幣定價，但以另一種貨幣支付的商品。為了執行貨幣換算，你需要知道當前的匯率。這個操作可以透過兩種方式實現【40,41】：

1. 在微服務方法中，處理購買的程式碼可能會查詢匯率服務或資料庫，以獲取特定貨幣的當前匯率。
2. 在資料流方法中，處理訂單的程式碼會提前訂閱匯率變更流，並在匯率發生變動時將當前匯率儲存在本地資料庫中。處理訂單時只需查詢本地資料庫即可。

第二種方法能將對另一服務的同步網路請求替換為對本地資料庫的查詢（可能在同一臺機器甚至同一個程序中）[^ii]。資料流方法不僅更快，而且當其他服務失效時也更穩健。最快且最可靠的網路請求就是壓根沒有網路請求！我們現在不再使用 RPC，而是在購買事件和匯率更新事件之間建立流聯接（請參閱 “[流表連線（流擴充）](/v1_tw/ch11#流表連線（流擴充）)”）。

[^ii]: 在微服務方法中，你也可以透過在處理購買的服務中本地快取匯率來避免同步網路請求。但是為了保證快取的新鮮度，你需要定期輪詢匯率以獲取其更新，或訂閱變更流 —— 這恰好是資料流方法中發生的事情。

連線是時間相關的：如果購買事件在稍後的時間點被重新處理，匯率可能已經改變。如果要重建原始輸出，則需要獲取原始購買時的歷史匯率。無論是查詢服務還是訂閱匯率更新流，你都需要處理這種時間相關性（請參閱 “[連線的時間依賴性](/v1_tw/ch11#連線的時間依賴性)”）。

訂閱變更流，而不是在需要時查詢當前狀態，使我們更接近類似電子表格的計算模型：當某些資料發生變更時，依賴於此的所有衍生資料都可以快速更新。還有很多未解決的問題，例如關於時間相關連線等問題，但我認為圍繞資料流構建應用的想法是一個非常有希望的方向。

### 觀察衍生資料狀態

在抽象層面，上一節討論的資料流系統提供了建立衍生資料集（例如搜尋索引、物化檢視和預測模型）並使其保持更新的過程。我們將這個過程稱為 **寫路徑（write path）**：只要某些資訊被寫入系統，它可能會經歷批處理與流處理的多個階段，而最終每個衍生資料集都會被更新，以適配寫入的資料。[圖 12-1](/v1/ddia_1201.png) 顯示了一個更新搜尋索引的例子。

![](/v1/ddia_1201.png)

**圖 12-1 在搜尋索引中，寫（文件更新）遇上讀（查詢）**

但你為什麼一開始就要建立衍生資料集？很可能是因為你想在以後再次查詢它。這就是 **讀路徑（read path）**：當服務使用者請求時，你需要從衍生資料集中讀取，也許還要對結果進行一些額外處理，然後構建給使用者的響應。

總而言之，寫路徑和讀路徑涵蓋了資料的整個旅程，從收集資料開始，到使用資料結束（可能是由另一個人）。寫路徑是預計算過程的一部分 —— 即，一旦資料進入，即刻完成，無論是否有人需要看它。讀路徑是這個過程中只有當有人請求時才會發生的部分。如果你熟悉函數語言程式設計語言，則可能會注意到寫路徑類似於立即求值，讀路徑類似於惰性求值。

如 [圖 12-1](/v1/ddia_1201.png) 所示，衍生資料集是寫路徑和讀路徑相遇的地方。它代表了在寫入時需要完成的工作量與在讀取時需要完成的工作量之間的權衡。

#### 物化檢視和快取

全文搜尋索引就是一個很好的例子：寫路徑更新索引，讀路徑在索引中搜索關鍵字。讀寫都需要做一些工作。寫入需要更新文件中出現的所有關鍵詞的索引條目。讀取需要搜尋查詢中的每個單詞，並應用布林邏輯來查詢包含查詢中所有單詞（AND 運算子）的文件，或者每個單詞（OR 運算子）的任何同義詞。

如果沒有索引，搜尋查詢將不得不掃描所有文件（如 grep），如果有著大量文件，這樣做的開銷巨大。沒有索引意味著寫入路徑上的工作量較少（沒有要更新的索引），但是在讀取路徑上需要更多工作。

另一方面，可以想象為所有可能的查詢預先計算搜尋結果。在這種情況下，讀路徑上的工作量會減少：不需要布林邏輯，只需查詢查詢結果並返回即可。但寫路徑會更加昂貴：可能的搜尋查詢集合是無限大的，因此預先計算所有可能的搜尋結果將需要無限的時間和儲存空間。那肯定沒戲 [^iii]。

[^iii]: 假設一個有限的語料庫，那麼返回非空搜尋結果的搜尋查詢集合是有限的。然而，它是與語料庫中的術語數量呈指數關係，這仍是一個壞訊息。

另一種選擇是預先計算一組固定的最常見查詢的搜尋結果，以便可以快速提供它們而無需轉到索引。不常見的查詢仍然可以透過索引來提供服務。這通常被稱為常見查詢的 **快取（cache）**，儘管我們也可以稱之為 **物化檢視（materialized view）**，因為當新文件出現，且需要被包含在這些常見查詢的搜尋結果之中時，這些索引就需要更新。

從這個例子中我們可以看到，索引不是寫路徑和讀路徑之間唯一可能的邊界；快取常見搜尋結果也是可行的；而在少量文件上使用沒有索引的類 grep 掃描也是可行的。由此來看，快取，索引和物化檢視的作用很簡單：它們改變了讀路徑與寫路徑之間的邊界。透過預先計算結果，從而允許我們在寫路徑上做更多的工作，以節省讀路徑上的工作量。

在寫路徑上完成的工作和讀路徑之間的界限，實際上是本書開始處在 “[描述負載](/v1_tw/ch1#描述負載)” 中推特例子裡談到的主題。在該例中，我們還看到了與普通使用者相比，名人的寫路徑和讀路徑可能有所不同。在 500 頁之後，我們已經繞回了起點！

#### 有狀態、可離線的客戶端

我發現寫路徑和讀路徑之間的邊界很有趣，因為我們可以試著改變這個邊界，並探討這種改變的實際意義。我們來看看不同上下文中的這一想法。

過去二十年來，Web 應用的火熱讓我們對應用開發作出了一些很容易視作理所當然的假設。具體來說就是，客戶端 / 伺服器模型 —— 客戶端大多是無狀態的，而伺服器擁有資料的權威 —— 已經普遍到我們幾乎忘掉了還有其他任何模型的存在。但是技術在不斷地發展，我認為不時地質疑現狀非常重要。

傳統上，網路瀏覽器是無狀態的客戶端，只有當連線到網際網路時才能做一些有用的事情（能離線執行的唯一事情基本上就是上下滾動之前線上時載入好的頁面）。然而，最近的 “單頁面” JavaScript Web 應用已經獲得了很多有狀態的功能，包括客戶端使用者介面互動，以及 Web 瀏覽器中的持久化本地儲存。移動應用可以類似地在裝置上儲存大量狀態，而且大多數使用者互動都不需要與伺服器往返互動。

這些不斷變化的功能重新引發了對 **離線優先（offline-first）** 應用的興趣，這些應用盡可能地在同一裝置上使用本地資料庫，無需連線網際網路，並在後臺網路連線可用時與遠端伺服器同步【42】。由於移動裝置通常具有緩慢且不可靠的蜂窩網路連線，因此，如果使用者的使用者介面不必等待同步網路請求，且應用主要是離線工作的，則這是一個巨大優勢（請參閱 “[需要離線操作的客戶端](/v1_tw/ch5#需要離線操作的客戶端)”）。

當我們擺脫無狀態客戶端與中央資料庫互動的假設，並轉向在終端使用者裝置上維護狀態時，這就開啟了新世界的大門。特別是，我們可以將裝置上的狀態視為 **伺服器狀態的快取**。螢幕上的畫素是客戶端應用中模型物件的物化檢視；模型物件是遠端資料中心的本地狀態副本【27】。

#### 將狀態變更推送給客戶端

在典型的網頁中，如果你在 Web 瀏覽器中載入頁面，並且隨後伺服器上的資料發生變更，則瀏覽器在重新載入頁面之前對此一無所知。瀏覽器只能在一個時間點讀取資料，假設它是靜態的 —— 它不會訂閱來自伺服器的更新。因此裝置上的狀態是陳舊的快取，除非你顯式輪詢變更否則不會更新。（像 RSS 這樣基於 HTTP 的 Feed 訂閱協議實際上只是一種基本的輪詢形式）

最近的協議已經超越了 HTTP 的基本請求 / 響應模式：服務端傳送的事件（EventSource API）和 WebSockets 提供了通訊通道，透過這些通道，Web 瀏覽器可以與伺服器保持開啟的 TCP 連線，只要瀏覽器仍然連線著，伺服器就能主動向瀏覽器推送資訊。這為伺服器提供了主動通知終端使用者客戶端的機會，伺服器能告知客戶端其本地儲存狀態的任何變化，從而減少客戶端狀態的陳舊程度。

用我們的寫路徑與讀路徑模型來講，主動將狀態變更推至到客戶端裝置，意味著將寫路徑一直延伸到終端使用者。當客戶端首次初始化時，它仍然需要使用讀路徑來獲取其初始狀態，但此後它就能夠依賴伺服器傳送的狀態變更流了。我們在流處理和訊息傳遞部分討論的想法並不侷限於資料中心中：我們可以進一步採納這些想法，並將它們一直延伸到終端使用者裝置【43】。

這些裝置有時會離線，並在此期間無法收到伺服器狀態變更的任何通知。但是我們已經解決了這個問題：在 “[消費者偏移量](/v1_tw/ch11#消費者偏移量)” 中，我們討論了基於日誌的訊息代理的消費者能在失敗或斷開連線後重連，並確保它不會錯過掉線期間任何到達的訊息。同樣的技術適用於單個使用者，每個裝置都是一個小事件流的小小訂閱者。

#### 端到端的事件流

最近用於開發有狀態的客戶端與使用者介面的工具，例如 Elm 語言【30】和 Facebook 的 React、Flux 和 Redux 工具鏈，已經透過訂閱表示使用者輸入或伺服器響應的事件流來管理客戶端的內部狀態，其結構與事件溯源相似（請參閱 “[事件溯源](/v1_tw/ch11#事件溯源)”）。

將這種程式設計模型擴充套件為：允許伺服器將狀態變更事件推送到客戶端的事件管道中，是非常自然的。因此，狀態變化可以透過 **端到端（end-to-end）** 的寫路徑流動：從一個裝置上的互動觸發狀態變更開始，經由事件日誌，並穿過幾個衍生資料系統與流處理器，一直到另一臺裝置上的使用者介面，而有人正在觀察使用者介面上的狀態變化。這些狀態變化能以相當低的延遲傳播 —— 比如說，在一秒內從一端到另一端。

一些應用（如即時訊息傳遞與線上遊戲）已經具有這種 “即時” 架構（在低延遲互動的意義上，不是在 “[響應時間保證](/v1_tw/ch8#響應時間保證)” 中的意義上）。但我們為什麼不用這種方式構建所有的應用？

挑戰在於，關於無狀態客戶端和請求 / 響應互動的假設已經根深蒂固地植入在我們的資料庫、庫、框架以及協議之中。許多資料儲存支援讀取與寫入操作，為請求返回一個響應，但只有極少數提供訂閱變更的能力 —— 請求返回一個隨時間推移的響應流（請參閱 “[變更流的 API 支援](/v1_tw/ch11#變更流的API支援)” ）。

為了將寫路徑延伸至終端使用者，我們需要從根本上重新思考我們構建這些系統的方式：從請求 / 響應互動轉向釋出 / 訂閱資料流【27】。更具響應性的使用者介面與更好的離線支援，我認為這些優勢值得我們付出努力。如果你正在設計資料系統，我希望你對訂閱變更的選項留有印象，而不只是查詢當前狀態。

#### 讀也是事件

我們討論過，當流處理器將衍生資料寫入儲存（資料庫，快取或索引）時，以及當用戶請求查詢該儲存時，儲存將充當寫路徑和讀路徑之間的邊界。該儲存應當允許對資料進行隨機訪問的讀取查詢，否則這些查詢將需要掃描整個事件日誌。

在很多情況下，資料儲存與流處理系統是分開的。但回想一下，流處理器還是需要維護狀態以執行聚合和連線的（請參閱 “[流連線](/v1_tw/ch11#流連線)”）。這種狀態通常隱藏在流處理器內部，但一些框架也允許這些狀態被外部客戶端查詢【45】，將流處理器本身變成一種簡單的資料庫。

我願意進一步思考這個想法。正如到目前為止所討論的那樣，對儲存的寫入是透過事件日誌進行的，而讀取是臨時的網路請求，直接流向儲存著待查資料的節點。這是一個合理的設計，但不是唯一可行的設計。也可以將讀取請求表示為事件流，並同時將讀事件與寫事件送往流處理器；流處理器透過將讀取結果傳送到輸出流來響應讀取事件【46】。

當寫入和讀取都被表示為事件，並且被路由到同一個流運算元以便處理時，我們實際上是在讀取查詢流和資料庫之間執行流表連線。讀取事件需要被送往儲存資料的資料庫分割槽（請參閱 “[請求路由](/v1_tw/ch6#請求路由)”），就像批處理和流處理器在連線時需要在同一個鍵上對輸入分割槽一樣（請參閱 “[Reduce 側連線與分組](/v1_tw/ch10#Reduce側連線與分組)”）。

服務請求與執行連線之間的這種相似之處是非常關鍵的【47】。一次性讀取請求只是將請求傳過連線運算元，然後請求馬上就被忘掉了；而一個訂閱請求，則是與連線另一側過去與未來事件的持久化連線。

記錄讀取事件的日誌可能對於追蹤整個系統中的因果關係與資料來源也有好處：它可以讓你重現出當用戶做出特定決策之前看見了什麼。例如在網商中，向客戶顯示的預測送達日期與庫存狀態，可能會影響他們是否選擇購買一件商品【4】。要分析這種聯絡，則需要記錄使用者查詢運輸與庫存狀態的結果。

將讀取事件寫入持久儲存可以更好地跟蹤因果關係（請參閱 “[排序事件以捕獲因果關係](#排序事件以捕獲因果關係)”），但會產生額外的儲存與 I/O 成本。最佳化這些系統以減少開銷仍然是一個開放的研究問題【2】。但如果你已經出於運維目的留下了讀取請求日誌，將其作為請求處理的副作用，那麼將這份日誌作為請求事件源並不是什麼特別大的變更。

#### 多分割槽資料處理

對於只涉及單個分割槽的查詢，透過流來發送查詢與收集響應可能是殺雞用牛刀了。然而，這個想法開啟了分散式執行複雜查詢的可能性，這需要合併來自多個分割槽的資料，利用了流處理器已經提供的訊息路由、分割槽和連線的基礎設施。

Storm 的分散式 RPC 功能支援這種使用模式（請參閱 “[訊息傳遞和 RPC](/v1_tw/ch11#訊息傳遞和RPC)”）。例如，它已經被用來計算瀏覽過某個推特 URL 的人數 —— 即，發推包含該 URL 的所有人的粉絲集合的並集【48】。由於推特的使用者是分割槽的，因此這種計算需要合併來自多個分割槽的結果。

這種模式的另一個例子是欺詐預防：為了評估特定購買事件是否具有欺詐風險，你可以檢查該使用者 IP 地址，電子郵件地址，帳單地址，送貨地址的信用分。這些信用資料庫中的每一個都是有分割槽的，因此為特定購買事件採集分數需要連線一系列不同的分割槽資料集【49】。

MPP 資料庫的內部查詢執行圖有著類似的特徵（請參閱 “[Hadoop 與分散式資料庫的對比](/v1_tw/ch10#Hadoop與分散式資料庫的對比)”）。如果需要執行這種多分割槽連線，則直接使用提供此功能的資料庫，可能要比使用流處理器實現它要更簡單。然而將查詢視為流提供了一種選項，可以用於實現超出傳統現成解決方案的大規模應用。


## 將事情做正確

對於只讀取資料的無狀態服務，出問題也沒什麼大不了的：你可以修復該錯誤並重啟服務，而一切都恢復正常。像資料庫這樣的有狀態系統就沒那麼簡單了：它們被設計為永遠記住事物（或多或少），所以如果出現問題，這種（錯誤的）效果也將潛在地永遠持續下去，這意味著它們需要更仔細的思考【50】。

我們希望構建可靠且 **正確** 的應用（即使面對各種故障，程式的語義也能被很好地定義與理解）。約四十年來，原子性、隔離性和永續性（[第七章](/v1_tw/ch7)）等事務特性一直是構建正確應用的首選工具。然而這些地基沒有看上去那麼牢固：例如弱隔離級別帶來的困惑可以佐證（請參閱 “[弱隔離級別](/v1_tw/ch7#弱隔離級別)”）。

事務在某些領域被完全拋棄，並被提供更好效能與可伸縮性的模型取代，但後者有更複雜的語義（例如，請參閱 “[無主複製](/v1_tw/ch5#無主複製)”）。**一致性（Consistency）** 經常被談起，但其定義並不明確（請參閱 “[一致性](/v1_tw/ch7#一致性)” 和 [第九章](/v1_tw/ch9)）。有些人斷言我們應當為了高可用而 “擁抱弱一致性”，但卻對這些概念實際上意味著什麼缺乏清晰的認識。

對於如此重要的話題，我們的理解，以及我們的工程方法卻是驚人地薄弱。例如，確定在特定事務隔離等級或複製配置下執行特定應用是否安全是非常困難的【51,52】。通常簡單的解決方案似乎在低併發性的情況下工作正常，並且沒有錯誤，但在要求更高的情況下卻會出現許多微妙的錯誤。

例如，Kyle Kingsbury 的 Jepsen 實驗【53】標出了一些產品聲稱的安全保證與其在網路問題與崩潰時的實際行為之間的明顯差異。即使像資料庫這樣的基礎設施產品沒有問題，應用程式碼仍然需要正確使用它們提供的功能才行，如果配置很難理解，這是很容易出錯的（在這種情況下指的是弱隔離級別，法定人數配置等）。

如果你的應用可以容忍偶爾的崩潰，以及以不可預料的方式損壞或丟失資料，那生活就要簡單得多，而你可能只要雙手合十念阿彌陀佛，期望佛祖能保佑最好的結果。另一方面，如果你需要更強的正確性保證，那麼可序列化與原子提交就是久經考驗的方法，但它們是有代價的：它們通常只在單個數據中心中工作（這就排除了地理位置分散的架構），並限制了系統能夠實現的規模與容錯特性。

雖然傳統的事務方法不會消失，但我也相信，在使應用正確並能靈活應對故障這方面，事務並不是最終的定論。在本節中，我將提出一些在資料流架構中考量正確性的方式。

### 資料庫的端到端原則

僅僅因為一個應用程式使用了具有相對較強安全屬性的資料系統（例如可序列化的事務），並不意味著就可以保證沒有資料丟失或損壞。例如，如果某個應用有個 Bug，導致它寫入不正確的資料，或者從資料庫中刪除資料，那麼可序列化的事務也救不了你。

這個例子可能看起來很無聊，但值得認真對待：應用會出 Bug，而人也會犯錯誤。我在 “[狀態、流和不變性](/v1_tw/ch11#狀態、流和不變性)” 中使用了這個例子來支援不可變和僅追加的資料，閹割掉錯誤程式碼摧毀良好資料的能力，能讓從錯誤中恢復更為容易。

雖然不變性很有用，但它本身並非萬靈藥。讓我們來看一個可能發生的、非常微妙的資料損壞案例。

#### 正好執行一次操作

在 “[容錯](/v1_tw/ch11#容錯)” 中，我們見到了 **恰好一次**（或 **等效一次**）語義的概念。如果在處理訊息時出現問題，你可以選擇放棄（丟棄訊息 —— 導致資料丟失）或重試。如果重試，就會有這種風險：第一次實際上成功了，只不過你沒有發現。結果這個訊息就被處理了兩次。

處理兩次是資料損壞的一種形式：為同樣的服務向客戶收費兩次（收費太多）或增長計數器兩次（誇大指標）都不是我們想要的。在這種情況下，恰好一次意味著安排計算，使得最終效果與沒有發生錯誤的情況一樣，即使操作實際上因為某種錯誤而重試。我們先前討論過實現這一目標的幾種方法。

最有效的方法之一是使操作 **冪等**（idempotent，請參閱 “[冪等性](/v1_tw/ch11#冪等性)”）：即確保它無論是執行一次還是執行多次都具有相同的效果。但是，將不是天生冪等的操作變為冪等的操作需要一些額外的努力與關注：你可能需要維護一些額外的元資料（例如更新了值的操作 ID 集合），並在從一個節點故障切換至另一個節點時做好防護（請參閱 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)”）。

#### 抑制重複

除了流處理之外，其他許多地方也需要抑制重複的模式。例如，TCP 使用了資料包上的序列號，以便接收方可以將它們正確排序，並確定網路上是否有資料包丟失或重複。在將資料交付應用前，TCP 協議棧會重新傳輸任何丟失的資料包，也會移除任何重複的資料包。

但是，這種重複抑制僅適用於單條 TCP 連線的場景中。假設 TCP 連線是一個客戶端與資料庫的連線，並且它正在執行 [例 12-1]() 中的事務。在許多資料庫中，事務是繫結在客戶端連線上的（如果客戶端傳送了多個查詢，資料庫就知道它們屬於同一個事務，因為它們是在同一個 TCP 連線上傳送的）。如果客戶端在傳送 `COMMIT` 之後並在從資料庫伺服器收到響應之前遇到網路中斷與連線超時，客戶端是不知道事務是否已經被提交的（[圖 8-1](/v1/ddia_0801.png)）。

**例 12-1 資金從一個賬戶到另一個賬戶的非冪等轉移**

```sql
BEGIN TRANSACTION;
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

客戶端可以重連到資料庫並重試事務，但現在已經處於 TCP 重複抑制的範圍之外了。因為 [例 12-1]() 中的事務不是冪等的，可能會發生轉了 \$22 而不是期望的 \$11。因此，儘管 [例 12-1]() 是一個事務原子性的標準樣例，但它實際上並不正確，而真正的銀行並不會這樣辦事【3】。

兩階段提交（請參閱 “[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)”）協議會破壞 TCP 連線與事務之間的 1:1 對映，因為它們必須在故障後允許事務協調器重連到資料庫，告訴資料庫將存疑事務提交還是中止。這足以確保事務只被恰好執行一次嗎？不幸的是，並不能。

即使我們可以抑制資料庫客戶端與伺服器之間的重複事務，我們仍然需要擔心終端使用者裝置與應用伺服器之間的網路。例如，如果終端使用者的客戶端是 Web 瀏覽器，則它可能會使用 HTTP POST 請求向伺服器提交指令。也許使用者正處於一個訊號微弱的蜂窩資料網路連線中，它們成功地傳送了 POST，但卻在能夠從伺服器接收響應之前沒了訊號。

在這種情況下，可能會向用戶顯示錯誤訊息，而他們可能會手動重試。Web 瀏覽器警告說，“你確定要再次提交這個表單嗎？”  —— 使用者選 “是”，因為他們希望操作發生（Post/Redirect/Get 模式【54】可以避免在正常操作中出現此警告訊息，但 POST 請求超時就沒辦法了）。從 Web 伺服器的角度來看，重試是一個獨立的請求；從資料庫的角度來看，這是一個獨立的事務。通常的除重機制無濟於事。

#### 操作識別符號

要在通過幾跳的網路通訊上使操作具有冪等性，僅僅依賴資料庫提供的事務機制是不夠的 —— 你需要考慮 **端到端（end-to-end）** 的請求流。
例如，你可以為操作生成一個唯一的識別符號（例如 UUID），並將其作為隱藏表單欄位包含在客戶端應用中，或透過計算所有表單相關欄位的雜湊來生成操作 ID 【3】。如果 Web 瀏覽器提交了兩次 POST 請求，這兩個請求將具有相同的操作 ID。然後，你可以將該操作 ID 一路傳遞到資料庫，並檢查你是否曾經使用給定的 ID 執行過一個操作，如 [例 12-2]() 中所示。

**例 12-2 使用唯一 ID 來抑制重複請求**

```sql
ALTER TABLE requests ADD UNIQUE (request_id);

BEGIN TRANSACTION;
    INSERT INTO requests
        (request_id, from_account, to_account, amount)
        VALUES('0286FDB8-D7E1-423F-B40B-792B3608036C', 4321, 1234, 11.00);
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

[例 12-2]() 依賴於 `request_id` 列上的唯一約束。如果一個事務嘗試插入一個已經存在的 ID，那麼 `INSERT` 失敗，事務被中止，使其無法生效兩次。即使在較弱的隔離級別下，關係資料庫也能正確地維護唯一性約束（而在 “[寫入偏差與幻讀](/v1_tw/ch7#寫入偏差與幻讀)” 中討論過，應用級別的 **檢查 - 然後 - 插入** 可能會在不可序列化的隔離下失敗）。

除了抑制重複的請求之外，[例 12-2]() 中的請求表表現得就像一種事件日誌，暗示著事件溯源的想法（請參閱 “[事件溯源](/v1_tw/ch11#事件溯源)”）。更新賬戶餘額事實上不必與插入事件發生在同一個事務中，因為它們是冗餘的，而能由下游消費者從請求事件中衍生出來 —— 只要該事件被恰好處理一次，這又一次可以使用請求 ID 來強制執行。

#### 端到端原則

抑制重複事務的這種情況只是一個更普遍的原則的一個例子，這個原則被稱為 **端到端原則（end-to-end argument）**，它在 1984 年由 Saltzer、Reed 和 Clark 闡述【55】：

> 只有在通訊系統兩端應用的知識與幫助下，所討論的功能才能完全地正確地實現。因而將這種被質疑的功能作為通訊系統本身的功能是不可能的（有時，通訊系統可以提供這種功能的不完備版本，可能有助於提高效能）。
>

在我們的例子中 **所討論的功能** 是重複抑制。我們看到 TCP 在 TCP 連線層次抑制了重複的資料包，一些流處理器在訊息處理層次提供了所謂的恰好一次語義，但這些都無法阻止當一個請求超時時，使用者親自提交重複的請求。TCP，資料庫事務，以及流處理器本身並不能完全排除這些重複。解決這個問題需要一個端到端的解決方案：從終端使用者的客戶端一路傳遞到資料庫的事務識別符號。

端到端原則也適用於檢查資料的完整性：乙太網，TCP 和 TLS 中內建的校驗和可以檢測網路中資料包的損壞情況，但是它們無法檢測到由連線兩端傳送 / 接收軟體中 Bug 導致的損壞，或資料儲存所在磁碟上的損壞。如果你想捕獲資料所有可能的損壞來源，你也需要端到端的校驗和。

類似的原則也適用於加密【55】：家庭 WiFi 網路上的密碼可以防止人們竊聽你的 WiFi 流量，但無法阻止網際網路上其他地方攻擊者的窺探；客戶端與伺服器之間的 TLS/SSL 可以阻擋網路攻擊者，但無法阻止惡意伺服器。只有端到端的加密和認證可以防止所有這些事情。

儘管低層級的功能（TCP 重複抑制、乙太網校驗和、WiFi 加密）無法單獨提供所需的端到端功能，但它們仍然很有用，因為它們能降低較高層級出現問題的可能性。例如，如果我們沒有 TCP 來將資料包排成正確的順序，那麼 HTTP 請求通常就會被攪爛。我們只需要記住，低級別的可靠性功能本身並不足以確保端到端的正確性。

#### 在資料系統中應用端到端思考

這將我帶回最初的論點：僅僅因為應用使用了提供相對較強安全屬性的資料系統，例如可序列化的事務，並不意味著應用的資料就不會丟失或損壞了。應用本身也需要採取端到端的措施，例如除重。

這實在是一個遺憾，因為容錯機制很難弄好。低層級的可靠機制（比如 TCP 中的那些）執行的相當好，因而剩下的高層級錯誤基本很少出現。如果能將這些剩下的高層級容錯機制打包成抽象，而應用不需要再去操心，那該多好呀 —— 但恐怕我們還沒有找到這一正確的抽象。

長期以來，事務被認為是一個很好的抽象，我相信它們確實是很有用的。正如 [第七章](/v1_tw/ch7) 導言中所討論的，它們將各種可能的問題（併發寫入、違背約束、崩潰、網路中斷、磁碟故障）合併為兩種可能結果：提交或中止。這對程式設計模型而言是一種巨大的簡化，但恐怕這還不夠。

事務是代價高昂的，當涉及異構儲存技術時尤為甚（請參閱 “[實踐中的分散式事務](/v1_tw/ch9#實踐中的分散式事務)”）。我們拒絕使用分散式事務是因為它開銷太大，結果我們最後不得不在應用程式碼中重新實現容錯機制。正如本書中大量的例子所示，對併發性與部分失敗的推理是困難且違反直覺的，所以我懷疑大多數應用級別的機制都不能正確工作，最終結果是資料丟失或損壞。

出於這些原因，我認為探索對容錯的抽象是很有價值的。它使提供應用特定的端到端的正確性屬性變得更簡單，而且還能在大規模分散式環境中提供良好的效能與運維特性。

### 強制約束

讓我們思考一下在 [分拆資料庫](#分拆資料庫) 上下文中的 **正確性（correctness）**。我們看到端到端的除重可以透過從客戶端一路透傳到資料庫的請求 ID 實現。那麼其他型別的約束呢？

我們先來特別關注一下 **唯一性約束** —— 例如我們在 [例 12-2]() 中所依賴的約束。在 “[約束和唯一性保證](/v1_tw/ch9#約束和唯一性保證)” 中，我們看到了幾個其他需要強制實施唯一性的應用功能例子：使用者名稱或電子郵件地址必須唯一標識使用者，檔案儲存服務不能包含多個重名檔案，兩個人不能在航班或劇院預訂同一個座位。

其他型別的約束也非常類似：例如，確保帳戶餘額永遠不會變為負數，確保不會超賣庫存，或者會議室沒有重複的預訂。執行唯一性約束的技術通常也可以用於這些約束。

#### 唯一性約束需要達成共識

在 [第九章](/v1_tw/ch9) 中我們看到，在分散式環境中，強制執行唯一性約束需要共識：如果存在多個具有相同值的併發請求，則系統需要決定衝突操作中的哪一個被接受，並拒絕其他違背約束的操作。

達成這一共識的最常見方式是使單個節點作為領導，並使其負責所有決策。只要你不介意所有請求都擠過單個節點（即使客戶端位於世界的另一端），只要該節點沒有失效，系統就能正常工作。如果你需要容忍領導者失效，那麼就又回到了共識問題（請參閱 “[單主複製與共識](/v1_tw/ch9#單主複製與共識)”）。

唯一性檢查可以透過對唯一性欄位分割槽做橫向伸縮。例如，如果需要透過請求 ID 確保唯一性（如 [例 12-2]() 所示），你可以確保所有具有相同請求 ID 的請求都被路由到同一分割槽（請參閱 [第六章](/v1_tw/ch6)）。如果你需要讓使用者名稱是唯一的，則可以按使用者名稱的雜湊值做分割槽。

但非同步多主複製排除在外，因為可能會發生不同主庫同時接受衝突寫操作的情況，因而這些值不再是唯一的（請參閱 “[實現線性一致的系統](/v1_tw/ch9#實現線性一致的系統)”）。如果你想立刻拒絕任何違背約束的寫入，同步協調是無法避免的【56】。

#### 基於日誌訊息傳遞中的唯一性

日誌確保所有消費者以相同的順序看見訊息 —— 這種保證在形式上被稱為 **全序廣播（total order broadcast）** 並且等價於共識（請參閱 “[全序廣播](/v1_tw/ch9#全序廣播)”）。在使用基於日誌的訊息傳遞的分拆資料庫方法中，我們可以使用非常類似的方法來執行唯一性約束。

流處理器在單個執行緒上依次消費單個日誌分割槽中的所有訊息（請參閱 “[日誌與傳統的訊息傳遞相比](/v1_tw/ch11#日誌與傳統的訊息傳遞相比)”）。因此，如果日誌是按需要確保唯一的值做的分割槽，則流處理器可以無歧義地、確定性地決定幾個衝突操作中的哪一個先到達。例如，在多個使用者嘗試宣告相同使用者名稱的情況下【57】：

1. 每個對使用者名稱的請求都被編碼為一條訊息，並追加到按使用者名稱雜湊值確定的分割槽。
2. 流處理器依序讀取日誌中的請求，並使用本地資料庫來追蹤哪些使用者名稱已經被佔用了。對於所有申請可用使用者名稱的請求，它都會記錄該使用者名稱，並向輸出流傳送一條成功訊息。對於所有申請已佔用使用者名稱的請求，它都會向輸出流傳送一條拒絕訊息。
3. 請求使用者名稱的客戶端監視輸出流，等待與其請求相對應的成功或拒絕訊息。

該演算法基本上與 “[使用全序廣播實現線性一致的儲存](/v1_tw/ch9#使用全序廣播實現線性一致的儲存)” 中的演算法相同。它可以簡單地透過增加分割槽數伸縮至較大的請求吞吐量，因為每個分割槽都可以被獨立處理。

該方法不僅適用於唯一性約束，而且適用於許多其他型別的約束。其基本原理是，任何可能衝突的寫入都會路由到相同的分割槽並按順序處理。正如 “[什麼是衝突？](/v1_tw/ch5#什麼是衝突？)” 與 “[寫入偏差與幻讀](/v1_tw/ch7#寫入偏差與幻讀)” 中所述，衝突的定義可能取決於應用，但流處理器可以使用任意邏輯來驗證請求。這個想法與 Bayou 在 90 年代開創的方法類似【58】。

#### 多分割槽請求處理

當涉及多個分割槽時，確保操作以原子方式執行且同時滿足約束就變得很有趣了。在 [例 12-2]() 中，可能有三個分割槽：一個包含請求 ID，一個包含收款人賬戶，另一個包含付款人賬戶。沒有理由把這三種東西放入同一個分割槽，因為它們都是相互獨立的。

在資料庫的傳統方法中，執行此事務需要跨全部三個分割槽進行原子提交，就這些分割槽上的所有其他事務而言，這實質上是將該事務嵌入一個全序。而這樣就要求跨分割槽協調，不同的分割槽無法再獨立地進行處理，因此吞吐量很可能會受到影響。

但事實證明，使用分割槽日誌可以達到等價的正確性而無需原子提交：

1. 從賬戶 A 向賬戶 B 轉賬的請求由客戶端提供一個唯一的請求 ID，並按請求 ID 追加寫入相應日誌分割槽。
2. 流處理器讀取請求日誌。對於每個請求訊息，它向輸出流發出兩條訊息：付款人賬戶 A 的借記指令（按 A 分割槽），收款人 B 的貸記指令（按 B 分割槽）。被發出的訊息中會帶有原始的請求 ID。
3. 後續處理器消費借記 / 貸記指令流，按照請求 ID 除重，並將變更應用至賬戶餘額。

步驟 1 和步驟 2 是必要的，因為如果客戶直接傳送貸記與借記指令，則需要在這兩個分割槽之間進行原子提交，以確保兩者要麼都發生，要麼都不發生。為了避免對分散式事務的需要，我們首先將請求持久化記錄為單條訊息，然後從這第一條訊息中衍生出貸記指令與借記指令。幾乎在所有資料系統中，單物件寫入都是原子性的（請參閱 “[單物件寫入](/v1_tw/ch7#單物件寫入)”），因此請求要麼出現在日誌中，要麼就不出現，無需多分割槽原子提交。

如果流處理器在步驟 2 中崩潰，則它會從上一個存檔點恢復處理。這樣做時，它不會跳過任何請求訊息，但可能會多次處理請求併產生重複的貸記與借記指令。但由於它是確定性的，因此它只是再次生成相同的指令，而步驟 3 中的處理器可以使用端到端請求 ID 輕鬆地對其除重。

如果你想確保付款人的帳戶不會因此次轉賬而透支，則可以使用一個額外的流處理器來維護賬戶餘額並校驗事務（按付款人賬戶分割槽），只有有效的事務會被記錄在步驟 1 中的請求日誌中。

透過將多分割槽事務分解為兩個不同分割槽方式的階段，並使用端到端的請求 ID，我們實現了同樣的正確性屬性（每個請求對付款人與收款人都恰好生效一次），即使在出現故障，且沒有使用原子提交協議的情況下依然如此。使用多個不同分割槽階段的想法與我們在 “[多分割槽資料處理](#多分割槽資料處理)” 中討論的想法類似（也請參閱 “[併發控制](/v1_tw/ch11#併發控制)”）。

### 及時性與完整性

事務的一個便利屬性是，它們通常是線性一致的（請參閱 “[線性一致性](/v1_tw/ch9#線性一致性)”），也就是說，寫入者會等到事務提交，而之後其寫入立刻對所有讀取者可見。

當我們把一個操作拆分為跨越多個階段的流處理器時，卻並非如此：日誌的消費者在設計上就是非同步的，因此傳送者不會等其訊息被消費者處理完。但是，客戶端等待輸出流中的特定訊息是可能的。這正是我們在 “[基於日誌訊息傳遞中的唯一性](#基於日誌訊息傳遞中的唯一性)” 一節中檢查唯一性約束時所做的事情。

在這個例子中，唯一性檢查的正確性不取決於訊息傳送者是否等待結果。等待的目的僅僅是同步通知傳送者唯一性檢查是否成功。但該通知可以與訊息處理的結果相解耦。

更一般地來講，我認為 **一致性（consistency）** 這個術語混淆了兩個值得分別考慮的需求：

* 及時性（Timeliness）

  及時性意味著確保使用者觀察到系統的最新狀態。我們之前看到，如果使用者從陳舊的資料副本中讀取資料，它們可能會觀察到系統處於不一致的狀態（請參閱 “[複製延遲問題](/v1_tw/ch5#複製延遲問題)”）。但這種不一致是暫時的，而最終會透過等待與重試簡單地得到解決。

  CAP 定理（請參閱 “[線性一致性的代價](/v1_tw/ch9#線性一致性的代價)”）使用 **線性一致性（linearizability）** 意義上的一致性，這是實現及時性的強有力方法。像 **寫後讀** 這樣及時性更弱的一致性也很有用（請參閱 “[讀己之寫](/v1_tw/ch5#讀己之寫)”）。

* 完整性（Integrity）

  完整性意味著沒有損壞；即沒有資料丟失，並且沒有矛盾或錯誤的資料。尤其是如果某些衍生資料集是作為底層資料之上的檢視而維護的（請參閱 “[從事件日誌中派生出當前狀態](/v1_tw/ch11#從事件日誌中派生出當前狀態)”），這種衍生必須是正確的。例如，資料庫索引必須正確地反映資料庫的內容 —— 缺失某些記錄的索引並不是很有用。

  如果完整性被違背，這種不一致是永久的：在大多數情況下，等待與重試並不能修復資料庫損壞。相反的是，需要顯式地檢查與修復。在 ACID 事務的上下文中（請參閱 “[ACID 的含義](/v1_tw/ch7#ACID的含義)”），一致性通常被理解為某種特定於應用的完整性概念。原子性和永續性是保持完整性的重要工具。


口號形式：違反及時性，“最終一致性”；違反完整性，“永無一致性”。

我斷言在大多數應用中，完整性比及時性重要得多。違反及時性可能令人困惑與討厭，但違反完整性的結果可能是災難性的。

例如在你的信用卡對賬單上，如果某一筆過去 24 小時內完成的交易尚未出現並不令人奇怪 —— 這些系統有一定的滯後是正常的。我們知道銀行是非同步核算與敲定交易的，這裡的及時性並不是非常重要【3】。但如果當期對賬單餘額與上期對賬單餘額加交易總額對不上（求和錯誤），或者出現一筆向你收費但未向商家付款的交易（消失的錢），那就實在是太糟糕了，這樣的問題就違背了系統的完整性。

#### 資料流系統的正確性

ACID 事務通常既提供及時性（例如線性一致性）也提供完整性保證（例如原子提交）。因此如果你從 ACID 事務的角度來看待應用的正確性，那麼及時性與完整性的區別是無關緊要的。

另一方面，對於在本章中討論的基於事件的資料流系統而言，它們的一個有趣特性就是將及時性與完整性分開。在非同步處理事件流時不能保證及時性，除非你顯式構建一個在返回之前明確等待特定訊息到達的消費者。但完整性實際上才是流處理系統的核心。

**恰好一次** 或 **等效一次** 語義（請參閱 “[容錯](/v1_tw/ch11#容錯)”）是一種保持完整性的機制。如果事件丟失或者生效兩次，就有可能違背資料系統的完整性。因此在出現故障時，容錯訊息傳遞與重複抑制（例如，冪等操作）對於維護資料系統的完整性是很重要的。

正如我們在上一節看到的那樣，可靠的流處理系統可以在無需分散式事務與原子提交協議的情況下保持完整性，這意味著它們有潛力達到與後者相當的正確性，同時還具備好得多的效能與運維穩健性。為了達成這種正確性，我們組合使用了多種機制：

* 將寫入操作的內容表示為單條訊息，從而可以輕鬆地被原子寫入 —— 與事件溯源搭配效果拔群（請參閱 “[事件溯源](/v1_tw/ch11#事件溯源)”）。
* 使用與儲存過程類似的確定性衍生函式，從這一訊息中衍生出所有其他的狀態變更（請參閱 “[真的序列執行](/v1_tw/ch7#真的序列執行)” 和 “[應用程式碼作為衍生函式](/v1_tw/ch12#應用程式碼作為衍生函式)”）
* 將客戶端生成的請求 ID 傳遞透過所有的處理層次，從而允許端到端的除重，帶來冪等性。
* 使訊息不可變，並允許衍生資料能隨時被重新處理，這使從錯誤中恢復更加容易（請參閱 “[不可變事件的優點](/v1_tw/ch11#不可變事件的優點)”）

這種機制組合在我看來，是未來構建容錯應用的一個非常有前景的方向。

#### 寬鬆地解釋約束

如前所述，執行唯一性約束需要共識，通常透過在單個節點中彙集特定分割槽中的所有事件來實現。如果我們想要傳統的唯一性約束形式，這種限制是不可避免的，流處理也不例外。

然而另一個需要了解的事實是，許多真實世界的應用實際上可以擺脫這種形式，接受弱得多的唯一性：

* 如果兩個人同時註冊了相同的使用者名稱或預訂了相同的座位，你可以給其中一個人發訊息道歉，並要求他們換一個不同的使用者名稱或座位。這種糾正錯誤的變化被稱為 **補償性事務（compensating transaction）**【59,60】。
* 如果客戶訂購的物品多於倉庫中的物品，你可以下單補倉，併為延誤向客戶道歉，向他們提供折扣。實際上，這麼說吧，如果叉車在倉庫中軋過了你的貨物，剩下的貨物比你想象的要少，那麼你也是得這麼做【61】。因此，既然道歉工作流無論如何已經成為你商業過程中的一部分了，那麼對庫存物品數目新增線性一致的約束可能就沒必要了。
* 與之類似，許多航空公司都會超賣機票，打著一些旅客可能會錯過航班的算盤；許多旅館也會超賣客房，抱著部分客人可能會取消預訂的期望。在這些情況下，出於商業原因而故意違反了 “一人一座” 的約束；當需求超過供給的情況出現時，就會進入補償流程（退款、升級艙位 / 房型、提供隔壁酒店的免費的房間）。即使沒有超賣，為了應對由惡劣天氣或員工罷工導致的航班取消，你還是需要道歉與補償流程 —— 從這些問題中恢復僅僅是商業活動的正常組成部分。
* 如果有人從賬戶超額取款，銀行可以向他們收取透支費用，並要求他們償還欠款。透過限制每天的提款總額，銀行的風險是有限的。

在許多商業場景中，臨時違背約束並稍後透過道歉來修復，實際上是可以接受的。道歉的成本各不相同，但通常很低（以金錢或名聲來算）：你無法撤回已傳送的電子郵件，但可以傳送一封後續電子郵件進行更正。如果你不小心向信用卡收取了兩次費用，則可以將其中一項收費退款，而代價僅僅是手續費，也許還有客戶的投訴。儘管一旦 ATM 吐了錢，你無法直接取回，但原則上如果賬戶透支而客戶拒不支付，你可以派催收員收回欠款。

道歉的成本是否能接受是一個商業決策。如果可以接受的話，在寫入資料之前檢查所有約束的傳統模型反而會帶來不必要的限制，而線性一致性的約束也不是必須的。樂觀寫入，事後檢查可能是一種合理的選擇。你仍然可以在做一些挽回成本高昂的事情前確保有相關的驗證，但這並不意味著寫入資料之前必須先進行驗證。

這些應用 **確實** 需要完整性：你不會希望丟失預訂資訊，或者由於借方貸方不匹配導致資金消失。但是它們在執行約束時 **並不需要** 及時性：如果你銷售的貨物多於倉庫中的庫存，可以在事後道歉並彌補問題。這種做法與我們在 “[處理寫入衝突](/v1_tw/ch5#處理寫入衝突)” 中討論的衝突解決方法類似。

#### 無協調資料系統

我們現在已經做了兩個有趣的觀察：

1. 資料流系統可以維持衍生資料的完整性保證，而無需原子提交、線性一致性或者同步的跨分割槽協調。
2. 雖然嚴格的唯一性約束要求及時性和協調，但許多應用實際上可以接受寬鬆的約束：只要整個過程保持完整性，這些約束可能會被臨時違反並在稍後被修復。

總之這些觀察意味著，資料流系統可以為許多應用提供無需協調的資料管理服務，且仍能給出很強的完整性保證。這種 **無協調（coordination-avoiding）** 的資料系統有著很大的吸引力：比起需要執行同步協調的系統，它們能達到更好的效能與更強的容錯能力【56】。

例如，這種系統可以使用多領導者配置運維，跨越多個資料中心，在區域間非同步複製。任何一個資料中心都可以持續獨立執行，因為不需要同步的跨區域協調。這樣的系統的及時性保證會很弱 —— 如果不引入協調，它就不可能是線性一致的 —— 但它仍然可以提供有力的完整性保證。

在這種情況下，可序列化事務作為維護衍生狀態的一部分仍然是有用的，但它們只能在小範圍內執行，在那裡它們工作得很好【8】。異構分散式事務（如 XA 事務，請參閱 “[實踐中的分散式事務](/v1_tw/ch9#實踐中的分散式事務)”）不是必需的。同步協調仍然可以在需要的地方引入（例如在無法恢復的操作之前強制執行嚴格的約束），但是如果只是應用的一小部分地方需要它，沒必要讓所有操作都付出協調的代價【43】。

另一種審視協調與約束的角度是：它們減少了由於不一致而必須做出的道歉數量，但也可能會降低系統的效能和可用性，從而可能增加由於宕機中斷而需要做出的道歉數量。你不可能將道歉數量減少到零，但可以根據自己的需求尋找最佳平衡點 —— 既不存在太多不一致性，又不存在太多可用性問題。

### 信任但驗證

我們所有關於正確性，完整性和容錯的討論都基於一些假設，假設某些事情可能會出錯，但其他事情不會。我們將這些假設稱為我們的 **系統模型**（system model，請參閱 “[將系統模型對映到現實世界](/v1_tw/ch8#將系統模型對映到現實世界)”）：例如，我們應該假設程序可能會崩潰，機器可能突然斷電，網路可能會任意延遲或丟棄訊息。但是我們也可能假設寫入磁碟的資料在執行 `fsync` 後不會丟失，記憶體中的資料沒有損壞，而 CPU 的乘法指令總是能返回正確的結果。

這些假設是相當合理的，因為大多數時候它們都是成立的，如果我們不得不經常擔心計算機出錯，那麼基本上寸步難行。在傳統上，系統模型採用二元方法處理故障：我們假設有些事情可能會發生，而其他事情 **永遠** 不會發生。實際上，這更像是一個機率問題：有些事情更有可能，其他事情不太可能。問題在於違反我們假設的情況是否經常發生，以至於我們可能在實踐中遇到它們。

我們已經看到，資料可能會在尚未落盤時損壞（請參閱 “[複製與永續性](/v1_tw/ch7#複製與永續性)”），而網路上的資料損壞有時可能規避了 TCP 校驗和（請參閱 “[弱謊言形式](/v1_tw/ch8#弱謊言形式)” ）。也許我們應當更關注這些事情？

我過去所從事的一個應用收集了來自客戶端的崩潰報告，我們收到的一些報告，只有在這些裝置記憶體中出現了隨機位翻轉才解釋得通。這看起來不太可能，但是如果有足夠多的裝置執行你的軟體，那麼即使再不可能發生的事也確實會發生。除了由於硬體故障或輻射導致的隨機儲存器損壞之外，一些病態的儲存器訪問模式甚至可以在沒有故障的儲存器中翻轉位【62】 —— 一種可用於破壞作業系統安全機制的效應【63】（這種技術被稱為 **Rowhammer**）。一旦你仔細觀察，硬體並不是看上去那樣完美的抽象。

要澄清的是，隨機位翻轉在現代硬體上仍是非常罕見的【64】。我只想指出，它們並沒有超越可能性的範疇，所以值得一些關注。

#### 維護完整性，儘管軟體有Bug

除了這些硬體問題之外，總是存在軟體 Bug 的風險，這些錯誤不會被較低層次的網路、記憶體或檔案系統校驗和所捕獲。即使廣泛使用的資料庫軟體也有 Bug：即使像 MySQL 與 PostgreSQL 這樣穩健、口碑良好、多年來被許多人充分測試過的軟體，就我個人所見也有 Bug，比如 MySQL 未能正確維護唯一約束【65】，以及 PostgreSQL 的可序列化隔離等級存在特定的寫入偏差異常【66】。對於不那麼成熟的軟體來說，情況可能要糟糕得多。

儘管在仔細設計，測試，以及審查上做出很多努力，但 Bug 仍然會在不知不覺中產生。儘管它們很少，而且最終會被發現並被修復，但總會有那麼一段時間，這些 Bug 可能會損壞資料。

而對於應用程式碼，我們不得不假設會有更多的錯誤，因為絕大多數應用的程式碼經受的評審與測試遠遠無法與資料庫的程式碼相比。許多應用甚至沒有正確使用資料庫提供的用於維持完整性的功能，例如外部索引鍵或唯一性約束【36】。

ACID 意義下的一致性（請參閱 “[一致性](/v1_tw/ch7#一致性)”）基於這樣一種想法：資料庫以一致的狀態啟動，而事務將其從一個一致狀態轉換至另一個一致的狀態。因此，我們期望資料庫始終處於一致狀態。然而，只有當你假設事務沒有 Bug 時，這種想法才有意義。如果應用以某種錯誤的方式使用資料庫，例如，不安全地使用弱隔離等級，資料庫的完整性就無法得到保證。

#### 不要盲目信任承諾

由於硬體和軟體並不總是符合我們的理想，所以資料損壞似乎早晚不可避免。因此，我們至少應該有辦法查明資料是否已經損壞，以便我們能夠修復它，並嘗試追查錯誤的來源。檢查資料完整性稱為 **審計（auditing）**。

如 “[不可變事件的優點](/v1_tw/ch11#不可變事件的優點)” 一節中所述，審計不僅僅適用於財務應用程式。不過，可審計性在財務中是非常非常重要的，因為每個人都知道錯誤總會發生，我們也都認為能夠檢測和解決問題是合理的需求。

成熟的系統同樣傾向於考慮不太可能的事情出錯的可能性，並管理這種風險。例如，HDFS 和 Amazon S3 等大規模儲存系統並不完全信任磁碟：它們執行後臺程序持續回讀檔案，並將其與其他副本進行比較，並將檔案從一個磁碟移動到另一個，以便降低靜默損壞的風險【67】。

如果你想確保你的資料仍然存在，你必須真正讀取它並進行檢查。大多數時候它們仍然會在那裡，但如果不是這樣，你一定想盡早知道答案，而不是更晚。按照同樣的原則，不時地嘗試從備份中恢復是非常重要的 —— 否則當你發現備份損壞時，你可能已經遇到了資料丟失，那時候就真的太晚了。不要盲目地相信它們全都管用。

#### 驗證的文化

像 HDFS 和 S3 這樣的系統仍然需要假設磁碟大部分時間都能正常工作 —— 這是一個合理的假設，但與它們 **始終** 能正常工作的假設並不相同。然而目前還沒有多少系統採用這種 “信任但是驗證” 的方式來持續審計自己。許多人認為正確性保證是絕對的，並且沒有為罕見的資料損壞的可能性做過準備。我希望未來能看到更多的 **自我驗證（self-validating）** 或 **自我審計（self-auditing）** 系統，不斷檢查自己的完整性，而不是依賴盲目的信任【68】。

我擔心 ACID 資料庫的文化導致我們在盲目信任技術（如事務機制）的基礎上開發應用，而忽視了這種過程中的任何可審計性。由於我們所信任的技術在大多數情況下工作得很好，通常會認為審計機制並不值得投資。

但隨之而來的是，資料庫的格局發生了變化：在 NoSQL 的旗幟下，更弱的一致性保證成為常態，更不成熟的儲存技術越來越被廣泛使用。但是由於審計機制還沒有被開發出來，儘管這種方式越來越危險，我們仍不斷在盲目信任的基礎上構建應用。讓我們想一想如何針對可審計性而設計吧。

#### 為可審計性而設計

如果一個事務在一個資料庫中改變了多個物件，在這一事實發生後，很難說清這個事務到底意味著什麼。即使你捕獲了事務日誌（請參閱 “[變更資料捕獲](/v1_tw/ch11#變更資料捕獲)”），各種表中的插入、更新和刪除操作並不一定能清楚地表明 **為什麼** 要執行這些變更。決定這些變更的是應用邏輯中的呼叫，而這一應用邏輯稍縱即逝，無法重現。

相比之下，基於事件的系統可以提供更好的可審計性。在事件溯源方法中，系統的使用者輸入被表示為一個單一不可變事件，而任何其導致的狀態變更都衍生自該事件。衍生可以實現為具有確定性與可重複性，因而相同的事件日誌透過相同版本的衍生程式碼時，會導致相同的狀態變更。

顯式處理資料流（請參閱 “[批處理輸出的哲學](/v1_tw/ch10#批處理輸出的哲學)”）可以使資料的 **來龍去脈（provenance）** 更加清晰，從而使完整性檢查更具可行性。對於事件日誌，我們可以使用雜湊來檢查事件儲存沒有被破壞。對於任何衍生狀態，我們可以重新執行從事件日誌中衍生它的批處理器與流處理器，以檢查是否獲得相同的結果，或者，甚至並行執行冗餘的衍生流程。

具有確定性且定義良好的資料流，也使除錯與跟蹤系統的執行變得容易，以便確定它 **為什麼** 做了某些事情【4,69】。如果出現意想之外的事情，那麼重現導致意外事件的確切事故現場的診斷能力 —— 一種時間旅行除錯功能是非常有價值的。

#### 端到端原則重現

如果我們不能完全相信系統的每個元件都不會損壞 —— 每一個硬體都沒缺陷，每一個軟體都沒有 Bug —— 那我們至少必須定期檢查資料的完整性。如果我們不檢查，我們就不能發現損壞，直到無可挽回地導致對下游的破壞時，那時候再去追蹤問題就要難得多，且代價也要高得多。

檢查資料系統的完整性，最好是以端到端的方式進行（請參閱 “[資料庫的端到端原則](#資料庫的端到端原則)”）：我們能在完整性檢查中涵蓋的系統越多，某些處理階段中出現不被察覺損壞的機率就越小。如果我們能檢查整個衍生資料管道端到端的正確性，那麼沿著這一路徑的任何磁碟、網路、服務以及演算法的正確性檢查都隱含在其中了。

持續的端到端完整性檢查可以不斷提高你對系統正確性的信心，從而使你能更快地進步【70】。與自動化測試一樣，審計提高了快速發現錯誤的可能性，從而降低了系統變更或新儲存技術可能導致損失的風險。如果你不害怕進行變更，就可以更好地充分演化一個應用，使其滿足不斷變化的需求。

#### 用於可審計資料系統的工具

目前，將可審計性作為頂層關注點的資料系統並不多。一些應用實現了自己的審計機制，例如將所有變更記錄到單獨的審計表中，但是確保審計日誌與資料庫狀態的完整性仍然是很困難的。可以定期使用硬體安全模組對事務日誌進行簽名來防止篡改，但這無法保證正確的事務一開始就能進入到日誌中。

使用密碼學工具來證明系統的完整性是十分有趣的，這種方式對於寬泛的硬體與軟體問題，甚至是潛在的惡意行為都很穩健有效。加密貨幣、區塊鏈、以及諸如比特幣、以太坊、Ripple、Stellar 的分散式賬本技術已經迅速出現在這一領域【71,72,73】。

我沒有資格評論這些技術用於貨幣，或者合同商定機制的價值。但從資料系統的角度來看，它們包含了一些有趣的想法。實質上，它們是分散式資料庫，具有資料模型與事務機制，而不同副本可以由互不信任的組織託管。副本不斷檢查其他副本的完整性，並使用共識協議對應當執行的事務達成一致。

我對這些技術的拜占庭容錯方面有些懷疑（請參閱 “[拜占庭故障](/v1_tw/ch8#拜占庭故障)”），而且我發現 **工作證明（proof of work）** 技術非常浪費（比如，比特幣挖礦）。比特幣的交易吞吐量相當低，儘管更多是出於政治與經濟原因而非技術上的原因。不過，完整性檢查的方面是很有趣的。

密碼學審計與完整性檢查通常依賴 **默克爾樹（Merkle tree）**【74】，這是一棵雜湊值的樹，能夠用於高效地證明一條記錄出現在一個資料集中（以及其他一些特性）。除了炒作的沸沸揚揚的加密貨幣之外，**證書透明性（certificate transparency）** 也是一種依賴 Merkle 樹的安全技術，用來檢查 TLS/SSL 證書的有效性【75,76】。

我可以想象，那些在證書透明性與分散式賬本中使用的完整性檢查和審計演算法，將會在通用資料系統中得到越來越廣泛的應用。要使這些演算法具備與沒有密碼學審計的系統同等的可伸縮性，並儘可能降低效能損失，還需要一些工作。但我認為這是一個值得關注的有趣領域。


## 做正確的事情

在本書的最後部分，我想退後一步。在本書中，我們考察了各種不同的資料系統架構，評價了它們的優點與缺點，並探討了構建可靠，可伸縮，可維護應用的技術。但是，我們忽略了討論中一個重要而基礎的部分，現在我想補充一下。

每個系統都服務於一個目的；我們採取的每個舉措都會同時產生期望的後果與意外的後果。這個目的可能只是簡單地賺錢，但其對世界的影響，可能會遠遠超出最初的目的。我們，建立這些系統的工程師，有責任去仔細考慮這些後果，並有意識地決定，我們希望生活在怎樣的世界中。

我們將資料當成一種抽象的東西來討論，但請記住，許多資料集都是關於人的：他們的行為，他們的興趣，他們的身份。對待這些資料，我們必須懷著人性與尊重。使用者也是人類，人類的尊嚴是至關重要的。

軟體開發越來越多地涉及重要的道德抉擇。有一些指導原則可以幫助軟體工程師解決這些問題，例如 ACM 的軟體工程道德規範與專業實踐【77】，但實踐中很少會討論這些，更不用說應用與強制執行了。因此，工程師和產品經理有時會對隱私與產品潛在的負面後果抱有非常傲慢的態度【78,79,80】。

技術本身並無好壞之分 —— 關鍵在於它被如何使用，以及它如何影響人們。這對槍械這樣的武器是成立的，而搜尋引擎這樣的軟體系統與之類似。我認為，軟體工程師僅僅專注於技術而忽視其後果是不夠的：道德責任也是我們的責任。對道德問題進行推理很困難，但它太重要了，我們無法忽視。

### 預測性分析

舉個例子，預測性分析是 “大資料” 炒作的主要內容之一。使用資料分析預測天氣或疾病傳播是一碼事【81】；而預測一個罪犯是否可能再犯，一個貸款申請人是否有可能違約，或者一個保險客戶是否可能進行昂貴的索賠，則是另外一碼事。後者會直接影響到個人的生活。

當然，支付網路希望防止欺詐交易，銀行希望避免不良貸款，航空公司希望避免劫機，公司希望避免僱傭效率低下或不值得信任的人。從它們的角度來看，失去商機的成本很低，而不良貸款或問題員工的成本則要高得多，因而組織希望保持謹慎也是自然而然的事情。所以如果存疑，它們通常會 Say No。

然而，隨著演算法決策變得越來越普遍，被某種演算法（準確地或錯誤地）標記為有風險的某人可能會遭受大量這種 “No” 的決定。系統性地被排除在工作，航旅，保險，租賃，金融服務，以及其他社會關鍵領域之外。這是一種對個體自由的極大約束，因此被稱為 “演算法監獄”【82】。在尊重人權的國家，刑事司法系統會做無罪推定（預設清白，直到被證明有罪）。另一方面，自動化系統可以系統地，任意地將一個人排除在社會參與之外，不需要任何有罪的證明，而且幾乎沒有申訴的機會。

#### 偏見與歧視

演算法做出的決定不一定比人類更好或更差。每個人都可能有偏見，即使他們主動抗拒這一點；而歧視性做法也可能已經在文化上被制度化了。人們希望根據資料做出決定，而不是透過人的主觀評價與直覺，希望這樣能更加公平，並給予傳統體制中經常被忽視的人更好的機會【83】。

當我們開發預測性分析系統時，不是僅僅用軟體透過一系列 IF ELSE 規則將人類的決策過程自動化，那些規則本身甚至都是從資料中推斷出來的。但這些系統學到的模式是個黑盒：即使資料中存在一些相關性，我們可能也壓根不知道為什麼。如果演算法的輸入中存在系統性的偏見，則系統很有可能會在輸出中學習並放大這種偏見【84】。

在許多國家，反歧視法律禁止按種族、年齡、性別、性取向、殘疾或信仰等受保護的特徵區分對待不同的人。其他的個人特徵可能是允許用於分析的，但是如果這些特徵與受保護的特徵存在關聯，又會發生什麼？例如在種族隔離地區中，一個人的郵政編碼，甚至是他們的 IP 地址，都是很強的種族指示物。這樣的話，相信一種演算法可以以某種方式將有偏見的資料作為輸入，併產生公平和公正的輸出【85】似乎是很荒謬的。然而這種觀點似乎常常潛伏在資料驅動型決策的支持者中，這種態度被諷刺為 “在處理偏差上，機器學習與洗錢類似”（machine learning is like money laundering for bias）【86】。

預測性分析系統只是基於過去進行推斷；如果過去是歧視性的，它們就會將這種歧視歸納為規律。如果我們希望未來比過去更好，那麼就需要道德想象力，而這是隻有人類才能提供的東西【87】。資料與模型應該是我們的工具，而不是我們的主人。

#### 責任與問責

自動決策引發了關於責任與問責的問題【87】。如果一個人犯了錯誤，他可以被追責，受決定影響的人可以申訴。演算法也會犯錯誤，但是如果它們出錯，誰來負責【88】？當一輛自動駕駛汽車引發事故時，誰來負責？如果自動信用評分演算法系統性地歧視特定種族或宗教的人，這些人是否有任何追索權？如果機器學習系統的決定要受到司法審查，你能向法官解釋演算法是如何做出決定的嗎？

收集關於人的資料並進行決策，信用評級機構是一個很經典的例子。不良的信用評分會使生活變得更艱難，但至少信用分通常是基於個人 **實際的** 借款歷史記錄，而記錄中的任何錯誤都能被糾正（儘管機構通常會設定門檻）。然而，基於機器學習的評分演算法通常會使用更寬泛的輸入，並且更不透明；因而很難理解特定決策是怎樣作出的，以及是否有人被不公正地，歧視性地對待【89】。

信用分總結了 “你過去的表現如何？”，而預測性分析通常是基於 “誰與你類似，以及與你類似的人過去表現的如何？”。與他人的行為畫上等號意味著刻板印象，例如，根據他們居住的地方（與種族和階級關係密切的特徵）。那麼那些放錯位置的人怎麼辦？而且，如果是因為錯誤資料導致的錯誤決定，追索幾乎是不可能的【87】。

很多資料本質上是統計性的，這意味著即使機率分佈在總體上是正確的，對於個例也可能是錯誤的。例如，如果貴國的平均壽命是 80 歲，這並不意味著你在 80 歲生日時就會死掉。很難從平均值與機率分佈中對某個特定個體的壽命作出什麼判斷，同樣，預測系統的輸出是機率性的，對於個例可能是錯誤的。

盲目相信資料決策至高無上，這不僅僅是一種妄想，而是有切實危險的。隨著資料驅動的決策變得越來越普遍，我們需要弄清楚，如何使演算法更負責任且更加透明，如何避免加強現有的偏見，以及如何在它們不可避免地出錯時加以修復。

我們還需要想清楚，如何避免資料被用於害人，如何認識資料的積極潛力。例如，分析可以揭示人們生活的財務特點與社會特點。一方面，這種權力可以用來將援助與支援集中在幫助那些最需要援助的人身上。另一方面，它有時會被掠奪性企業用於識別弱勢群體，並向其兜售高風險產品，比如高利貸和沒有價值的大學文憑【87,90】。

#### 反饋迴圈

即使是那些對人直接影響比較小的預測性應用，比如推薦系統，也有一些必須正視的難題。當服務變得善於預測使用者想要看到什麼內容時，它最終可能只會向人們展示他們已經同意的觀點，將人們帶入滋生刻板印象，誤導資訊，與極端思想的 **迴音室**。我們已經看到過社交媒體迴音室對競選的影響了【91】。

當預測性分析影響人們的生活時，自我強化的反饋迴圈會導致非常有害的問題。例如，考慮僱主使用信用分來評估候選人的例子。你可能是一個信用分不錯的好員工，但因不可抗力的意外而陷入財務困境。由於不能按期付賬單，你的信用分會受到影響，進而導致找到工作更為困難。失業使你陷入貧困，這進一步惡化了你的分數，使你更難找到工作【87】。在資料與數學嚴謹性的偽裝背後，隱藏的是由惡毒假設導致的惡性迴圈。

我們無法預測這種反饋迴圈何時發生。然而透過對整個系統（不僅僅是計算機化的部分，而且還有與之互動的人）進行整體思考，許多後果是可以預測的 —— 一種稱為 **系統思維（systems thinking）** 的方法【92】。我們可以嘗試理解資料分析系統如何響應不同的行為，結構或特性。該系統是否加強和增大了人們之間現有的差異（例如，損不足以奉有餘，富者愈富，貧者愈貧），還是試圖與不公作鬥爭？而且即使有著最好的動機，我們也必須當心意想不到的後果。

### 隱私和追蹤

除了預測性分析 —— 使用資料來做出關於人的自動決策 —— 資料收集本身也存在道德問題。收集資料的組織，與被收集資料的人之間，到底屬於什麼關係？

當系統只儲存使用者明確輸入的資料時，是因為使用者希望系統以特定方式儲存和處理這些資料，**系統是在為使用者提供服務**：使用者就是客戶。但是，當使用者的活動被跟蹤並記錄，作為他們正在做的其他事情的副作用時，這種關係就沒有那麼清晰了。該服務不再僅僅完成使用者想要它做的事情，而是服務於它自己的利益，而這可能與使用者的利益相衝突。

追蹤使用者行為資料對於許多面向使用者的線上服務而言，變得越來越重要：追蹤使用者點選了哪些搜尋結果有助於改善搜尋結果的排名；推薦 “喜歡 X 的人也喜歡 Y”，可以幫助使用者發現實用有趣的東西；A/B 測試和使用者流量分析有助於改善使用者介面。這些功能需要一定量的使用者行為跟蹤，而使用者也可以從中受益。

但不同公司有著不同的商業模式，追蹤並未止步於此。如果服務是透過廣告盈利的，那麼廣告主才是真正的客戶，而使用者的利益則屈居其次。跟蹤的資料會變得更詳細，分析變得更深入，資料會保留很長時間，以便為每個人建立詳細畫像，用於營銷。

現在，公司與被收集資料的使用者之間的關係，看上去就不太一樣了。公司會免費服務使用者，並引誘使用者儘可能多地使用服務。對使用者的追蹤，主要不是服務於該使用者個體，而是服務於掏錢資助該服務的廣告商。我認為這種關係可以用一個更具罪犯內涵的詞來恰當地描述：**監視（surveillance）**。

#### 監視

讓我們做一個思想實驗，嘗試用 **監視（surveillance）** 一詞替換 **資料（data）**，再看看常見的短語是不是聽起來還那麼漂亮【93】。比如：“在我們的監視驅動的組織中，我們收集即時監視流並將它們儲存在我們的監視倉庫中。我們的監視科學家使用高階分析和監視處理來獲得新的見解。”

對於本書《設計監控密集型應用》而言，這個思想實驗是罕見的爭議性內容，但我認為需要激烈的言辭來強調這一點。在我們嘗試製造軟體 “吞噬世界” 的過程中【94】，我們已經建立了世界上迄今為止所見過的最偉大的大規模監視基礎設施。我們正朝著萬物互聯邁進，我們正在迅速走近這樣一個世界：每個有人居住的空間至少包含一個帶網際網路連線的麥克風，以智慧手機、智慧電視、語音控制助理裝置、嬰兒監視器甚至兒童玩具的形式存在，並使用基於雲的語音識別。這些裝置中的很多都有著可怕的安全記錄【95】。

即使是最為極權與專制的政權，可能也只會想著在每個房間裝一個麥克風，並強迫每個人始終攜帶能夠追蹤其位置與動向的裝置。然而，我們顯然是自願地，甚至熱情地投身於這個全域監視的世界。不同之處在於，資料是由公司，而不是由政府機構收集的【96】。

並不是所有的資料收集都稱得上監視，但檢視這一點有助於理解我們與資料收集者之間的關係。為什麼我們似乎很樂意接受企業的監視呢？也許你覺得自己沒有什麼好隱瞞的 —— 換句話說，你與當權階級穿一條褲子，你不是被邊緣化的少數派，也不必害怕受到迫害【97】。不是每個人都如此幸運。或者，也許這是因為目的似乎是溫和的 —— 這不是公然脅迫，也不是強制性的，而只是更好的推薦與更個性化的營銷。但是，結合上一節中對預測性分析的討論，這種區別似乎並不是很清晰。

我們已經看到與汽車追蹤裝置掛鉤的汽車保險費，以及取決於需要人佩戴健身追蹤裝置來確定的健康保險範圍。當監視被用於決定生活的重要方面時，例如保險或就業，它就開始變得不那麼溫和了。此外，資料分析可以揭示出令人驚訝的私密事物：例如，智慧手錶或健身追蹤器中的運動感測器能以相當好的精度計算出你正在輸入的內容（比如密碼）【98】。而分析演算法只會變得越來越精確。

#### 同意與選擇的自由

我們可能會斷言使用者是自願選擇使用了會跟蹤其活動的服務，而且他們已經同意了服務條款與隱私政策，因此他們同意資料收集。我們甚至可以聲稱，使用者在用所提供的資料來 **換取** 有價值的服務，並且為了提供服務，追蹤是必要的。毫無疑問，社交網路、搜尋引擎以及各種其他免費的線上服務對於使用者來說都是有價值的，但是這個說法卻存在問題。

使用者幾乎不知道他們提供給我們的是什麼資料，哪些資料被放進了資料庫，資料又是怎樣被保留與處理的 —— 大多數隱私政策都是模稜兩可的，忽悠使用者而不敢開啟天窗說亮話。如果使用者不瞭解他們的資料會發生什麼，就無法給出任何有意義的同意。有時來自一個使用者的資料還會提到一些關於其他人的事，而其他那些人既不是該服務的使用者，也沒有同意任何條款。我們在本書這一部分中討論的衍生資料集 —— 來自整個使用者群的資料，加上行為追蹤與外部資料來源 —— 就恰好是使用者無法（在真正意義上）理解的資料型別。

而且從使用者身上挖掘資料是一個單向過程，而不是真正的互惠關係，也不是公平的價值交換。使用者對能用多少資料換來什麼樣的服務，既沒有發言權也沒有選擇權：服務與使用者之間的關係是非常不對稱與單邊的。這些條款是由服務提出的，而不是由使用者提出的【99】。

對於不同意監視的使用者，唯一真正管用的備選項，就是簡單地不使用服務。但這個選擇也不是真正自由的：如果一項服務如此受歡迎，以至於 “被大多數人認為是基本社會參與的必要條件”【99】，那麼指望人們選擇退出這項服務是不合理的 —— 使用它 **事實上（de facto）** 是強制性的。例如，在大多數西方社會群體中，攜帶智慧手機，使用 Facebook 進行社交，以及使用 Google 查詢資訊已成為常態。特別是當一項服務具有網路效應時，人們選擇 **不** 使用會產生社會成本。

因為一個服務會跟蹤使用者而拒絕使用它，這只是少數人才擁有的權力，他們有足夠的時間與知識來了解隱私政策，並承受得起代價：錯過社會參與，以及使用服務可能帶來的專業機會。對於那些處境不太好的人而言，並沒有真正意義上的選擇：監控是不可避免的。

#### 隱私與資料使用

有時候，人們聲稱 “隱私已死”，理由是有些使用者願意把各種關於他們生活的事情釋出到社交媒體上，有時是平凡俗套，但有時是高度私密的。但這種說法是錯誤的，而且是對 **隱私（privacy）** 一詞的誤解。

擁有隱私並不意味著保密一切東西；它意味著擁有選擇向誰展示哪些東西的自由，要公開什麼，以及要保密什麼。**隱私權是一項決定權**：在從保密到透明的光譜上，隱私使得每個人都能決定自己想要在什麼地方位於光譜上的哪個位置【99】。這是一個人自由與自主的重要方面。

當透過監控基礎設施從人身上提取資料時，隱私權不一定受到損害，而是轉移到了資料收集者手中。獲取資料的公司實際上是說 “相信我們會用你的資料做正確的事情”，這意味著，決定要透露什麼和保密什麼的權利從個體手中轉移到了公司手中。

這些公司反過來選擇保密這些監視結果，因為揭露這些會令人毛骨悚然，並損害它們的商業模式（比其他公司更瞭解人）。使用者的私密資訊只會間接地披露，例如針對特定人群定向投放廣告的工具（比如那些患有特定疾病的人群）。

即使特定使用者無法從特定廣告定向的人群中以個體的形式區分出來，但他們已經失去了披露一些私密資訊的能動性，例如他們是否患有某種疾病。決定向誰透露什麼並不是由個體按照自己的喜好決定的，而是由 **公司**，以利潤最大化為目標來行使隱私權的。

許多公司都有一個目標，不要讓人 **感覺到** 毛骨悚然 —— 先不說它們收集資料實際上是多麼具有侵犯性，讓我們先關注對使用者感受的管理。這些使用者感受經常被管理得很糟糕：例如，在事實上可能正確的一些東西，如果會觸發痛苦的回憶，使用者可能並不希望被提醒【100】。對於任何型別的資料，我們都應當考慮它出錯、不可取、不合時宜的可能性，並且需要建立處理這些失效的機制。無論是 “不可取” 還是 “不合時宜”，當然都是由人的判斷決定的；除非我們明確地將演算法編碼設計為尊重人類的需求，否則演算法會無視這些概念。作為這些系統的工程師，我們必須保持謙卑，充分規劃，接受這些失效。

允許線上服務的使用者控制其隱私設定，例如控制其他使用者可以看到哪些東西，是將一些控制交還給使用者的第一步。但無論怎麼設定，服務本身仍然可以不受限制地訪問資料，並能以隱私策略允許的任何方式自由使用它。即使服務承諾不會將資料出售給第三方，它通常會授予自己不受限制的權利，以便在內部處理與分析資料，而且往往比使用者公開可見的部分要深入的多。

這種從個體到公司的大規模隱私權轉移在歷史上是史無前例的【99】。監控一直存在，但它過去是昂貴的、手動的，不是可伸縮的、自動化的。信任關係一直存在，例如患者與其醫生之間，或被告與其律師之間 —— 但在這些情況下，資料的使用嚴格受到道德，法律和監管限制的約束。網際網路服務使得在未經有意義的同意下收集大量敏感資訊變得容易得多，而且無需使用者理解他們的私人資料到底發生了什麼。

#### 資料資產與權力

由於行為資料是使用者與服務互動的副產品，因此有時被稱為 “資料廢氣” —— 暗示資料是毫無價值的廢料。從這個角度來看，行為和預測性分析可以被看作是一種從資料中提取價值的回收形式，否則這些資料就會被浪費。

更準確的看法恰恰相反：從經濟的角度來看，如果定向廣告是服務的金主，那麼關於人的行為資料就是服務的核心資產。在這種情況下，使用者與之互動的應用僅僅是一種誘騙使用者將更多的個人資訊提供給監控基礎設施的手段【99】。線上服務中經常表現出的令人愉悅的人類創造力與社會關係，十分諷刺地被資料提取機器所濫用。

個人資料是珍貴資產的說法因為資料中介的存在得到支援，這是陰影中的秘密行業，購買、聚合、分析、推斷以及轉售私密個人資料，主要用於市場營銷【90】。初創公司按照它們的使用者數量，“眼球數”，—— 即它們的監視能力來估值。

因為資料很有價值，所以很多人都想要它。當然，公司也想要它 —— 這就是為什麼它們一開始就收集資料的原因。但政府也想獲得它：透過秘密交易、脅迫、法律強制或者只是竊取【101】。當公司破產時，收集到的個人資料就是被出售的資產之一。而且資料安全很難保護，因此經常發生令人難堪的洩漏事件【102】。

這些觀察已經導致批評者聲稱，資料不僅僅是一種資產，而且是一種 “有毒資產”【101】，或者至少是 “有害物質”【103】。即使我們認為自己有能力阻止資料濫用，但每當我們收集資料時，我們都需要平衡收益以及這些資料落入惡人手中的風險：計算機系統可能會被犯罪分子或敵國特務滲透，資料可能會被內鬼洩露，公司可能會落入不擇手段的管理層手中，而這些管理者有著迥然不同的價值觀，或者國家可能被能毫無愧色迫使我們交出資料的政權所接管。

俗話說，“知識就是力量”。更進一步，“在避免自己被審視的同時審視他人，是權力最重要的形式之一”【105】。這就是極權政府想要監控的原因：這讓它們有能力控制全體居民。儘管今天的科技公司並沒有公開地尋求政治權力，但是它們積累的資料與知識卻給它們帶來了很多權力，其中大部分是在公共監督之外偷偷進行的【106】。

#### 回顧工業革命

資料是資訊時代的決定性特徵。網際網路，資料儲存，處理和軟體驅動的自動化正在對全球經濟和人類社會產生重大影響。我們的日常生活與社會組織在過去十年中發生了變化，而且在未來的十年中可能會繼續發生根本性的變化，所以我們會想到與工業革命對比【87,96】。

工業革命是透過重大的技術與農業進步實現的，它帶來了持續的經濟增長，長期的生活水平顯著提高。然而它也帶來了一些嚴重的問題：空氣汙染（由於煙霧和化學過程）和水汙染（工業垃圾和人類垃圾）是可怖的。工廠老闆生活在紛奢之中，而城市工人經常居住在非常糟糕的住房中，並且在惡劣的條件下長時間工作。童工很常見，甚至包括礦井中危險而低薪的工作。

制定保護措施花費了很長的時間，例如環境保護條例、工作場所安全條例、宣佈使用童工非法以及食品衛生檢查。毫無疑問，生產成本增加了，因為工廠再也不能把廢物倒入河流、銷售汙染的食物或者剝削工人。但是整個社會都從中受益良多，我們中很少會有人想回到這些管制條例之前的日子【87】。

就像工業革命有著黑暗面需要應對一樣，我們轉向資訊時代的過程中，也有需要應對與解決的重大問題。我相信資料的收集與使用就是其中一個問題。用 Bruce Schneier 的話來說【96】：

> 資料是資訊時代的汙染問題，保護隱私是環境挑戰。幾乎所有的電腦都能生產資訊。它堆積在周圍，開始潰爛。我們如何處理它 —— 我們如何控制它，以及如何擺脫它 —— 是資訊經濟健康發展的核心議題。正如我們今天回顧工業時代的早期年代，並想知道我們的祖先在忙於建設工業世界的過程時怎麼能忽略汙染問題；我們的孫輩在回望資訊時代的早期年代時，將會就我們如何應對資料收集和濫用的挑戰來評斷我們。
>
> 我們應該設法讓他們感到驕傲。

#### 立法與自律

資料保護法可能有助於維護個人的權利。例如，1995 年的 “歐洲資料保護指示” 規定，個人資料必須 “為特定的、明確的和合法的目的收集，而不是以與這些目的不相符的方式進一步處理”，並且資料必須 “就收集的目的而言適當、相關、不過分。”【107】。

但是，這個立法在今天的網際網路環境下是否有效還是有疑問的【108】。這些規則直接否定了大資料的哲學，即最大限度地收集資料，將其與其他資料集結合起來進行試驗和探索，以便產生新的洞察。探索意味著將資料用於未曾預期的目的，這與使用者同意的 “特定和明確” 目的相反（如果我們可以有意義地表示同意的話）【109】。更新的規章正在制定中【89】。

那些收集了大量有關人的資料的公司反對監管，認為這是創新的負擔與阻礙。在某種程度上，這種反對是有道理的。例如，分享醫療資料時，存在明顯的隱私風險，但也有潛在的機遇：如果資料分析能夠幫助我們實現更好的診斷或找到更好的治療方法，能夠阻止多少人的死亡【110】？過度監管可能會阻止這種突破。在這種潛在機會與風險之間找出平衡是很困難的【105】。

從根本上說，我認為我們需要科技行業在個人資料方面的文化轉變。我們應該停止將使用者視作待最佳化的指標資料，並記住他們是值得尊重、有尊嚴和能動性的人。我們應當在資料收集和實際處理中自我約束，以建立和維持依賴我們軟體的人們的信任【111】。我們應當將教育終端使用者視為己任，告訴他們我們是如何使用他們的資料的，而不是將他們矇在鼓裡。

我們應該允許每個人保留自己的隱私 —— 即，對自己資料的控制 —— 而不是透過監視來竊取這種控制權。我們控制自己資料的個體權利就像是國家公園的自然環境：如果我們不去明確地保護它、關心它，它就會被破壞。這將是公地的悲劇，我們都會因此而變得更糟。無所不在的監視並非不可避免的 —— 我們現在仍然能阻止它。

我們究竟能做到哪一步，是一個開放的問題。首先，我們不應該永久保留資料，而是一旦不再需要就立即清除資料【111,112】。清除資料與不變性的想法背道而馳（請參閱 “[不變性的侷限性](/v1_tw/ch11#不變性的侷限性)”），但這是可以解決的問題。我所看到的一種很有前景的方法是透過加密協議來實施訪問控制，而不僅僅是透過策略【113,114】。總的來說，文化與態度的改變是必要的。


## 本章小結

在本章中，我們討論了設計資料系統的新方式，而且也包括了我的個人觀點，以及對未來的猜測。我們從這樣一種觀察開始：沒有單種工具能高效服務所有可能的用例，因此應用必須組合使用幾種不同的軟體才能實現其目標。我們討論了如何使用批處理與事件流來解決這一 **資料整合（data integration）** 問題，以便讓資料變更在不同系統之間流動。

在這種方法中，某些系統被指定為記錄系統，而其他資料則透過轉換衍生自記錄系統。透過這種方式，我們可以維護索引、物化檢視、機器學習模型、統計摘要等等。透過使這些衍生和轉換操作非同步且鬆散耦合，能夠防止一個區域中的問題擴散到系統中不相關部分，從而增加整個系統的穩健性與容錯性。

將資料流表示為從一個資料集到另一個資料集的轉換也有助於演化應用程式：如果你想變更其中一個處理步驟，例如變更索引或快取的結構，則可以在整個輸入資料集上重新執行新的轉換程式碼，以便重新衍生輸出。同樣，出現問題時，你也可以修復程式碼並重新處理資料以便恢復。

這些過程與資料庫內部已經完成的過程非常類似，因此我們將資料流應用的概念重新改寫為，**分拆（unbundling）** 資料庫元件，並透過組合這些鬆散耦合的元件來構建應用程式。

衍生狀態可以透過觀察底層資料的變更來更新。此外，衍生狀態本身可以進一步被下游消費者觀察。我們甚至可以將這種資料流一路傳送至顯示資料的終端使用者裝置，從而構建可動態更新以反映資料變更，並在離線時能繼續工作的使用者介面。

接下來，我們討論了如何確保所有這些處理在出現故障時保持正確。我們看到可伸縮的強完整性保證可以透過非同步事件處理來實現，透過使用端到端操作識別符號使操作冪等，以及透過非同步檢查約束。客戶端可以等到檢查透過，或者不等待繼續前進，但是可能會冒有違反約束需要道歉的風險。這種方法比使用分散式事務的傳統方法更具可伸縮性與可靠性，並且在實踐中適用於很多業務流程。

透過圍繞資料流構建應用，並非同步檢查約束，我們可以避免絕大多數的協調工作，建立保證完整性且效能仍然表現良好的系統，即使在地理散佈的情況下與出現故障時亦然。然後，我們對使用審計來驗證資料完整性，以及損壞檢測進行了一些討論。

最後，我們退後一步，審視了構建資料密集型應用的一些道德問題。我們看到，雖然資料可以用來做好事，但它也可能造成很大傷害：作出嚴重影響人們生活的決定卻難以申訴，導致歧視與剝削、監視常態化、曝光私密資訊。我們也冒著資料被洩露的風險，並且可能會發現，即使是善意地使用資料也可能會導致意想不到的後果。

由於軟體和資料對世界產生了如此巨大的影響，我們工程師們必須牢記，我們有責任為我們想要的那種世界而努力：一個尊重人們，尊重人性的世界。我希望我們能夠一起為實現這一目標而努力。


## 參考文獻

1. Rachid Belaid: “[Postgres Full-Text Search is Good Enough!](http://rachbelaid.com/postgres-full-text-search-is-good-enough/),” *rachbelaid.com*, July 13, 2015.
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Pat Helland and Dave Campbell: “[Building on Quicksand](https://web.archive.org/web/20220606172817/https://database.cs.wisc.edu/cidr/cidr2009/Paper_133.pdf),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
1. Jessica Kerr: “[Provenance and Causality in Distributed Systems](https://web.archive.org/web/20190425150540/http://blog.jessitron.com/2016/09/provenance-and-causality-in-distributed.html),” *blog.jessitron.com*, September 25, 2016.
1. Kostas Tzoumas: “[Batch Is a Special Case of Streaming](http://data-artisans.com/blog/batch-is-a-special-case-of-streaming/),” *data-artisans.com*, September 15, 2015.
1. Shinji Kim and Robert Blafford: “[Stream Windowing Performance Analysis: Concord and Spark Streaming](https://web.archive.org/web/20180125074821/http://concord.io/posts/windowing_performance_analysis_w_spark_streaming),” *concord.io*, July 6, 2016.
1. Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
1. Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20200730171311/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
1. “[Great Western Railway (1835–1948)](https://web.archive.org/web/20160122155425/https://www.networkrail.co.uk/VirtualArchive/great-western/),” Network Rail Virtual Archive, *networkrail.co.uk*.
1. Jacqueline Xu: “[Online Migrations at Scale](https://stripe.com/blog/online-migrations),” *stripe.com*, February 2, 2017.
1. Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
1. Nathan Marz and James Warren: [*Big Data: Principles and Best Practices of Scalable Real-Time Data Systems*](https://www.manning.com/books/big-data). Manning, 2015. ISBN: 978-1-617-29034-3
1. Oscar Boykin, Sam Ritchie, Ian O'Connell, and Jimmy Lin: “[Summingbird: A Framework for Integrating Batch and Online MapReduce Computations](http://www.vldb.org/pvldb/vol7/p1441-boykin.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
1. Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
1. Raul Castro Fernandez, Peter Pietzuch, Jay Kreps, et al.: “[Liquid: Unifying Nearline and Offline Big Data Integration](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper25u.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Dennis M. Ritchie and Ken Thompson: “[The UNIX Time-Sharing System](http://web.eecs.utk.edu/~qcao1/cs560/papers/paper-unix.pdf),” *Communications of the ACM*, volume 17, number 7, pages 365–375, July 1974. [doi:10.1145/361011.361061](http://dx.doi.org/10.1145/361011.361061)
1. Eric A. Brewer and Joseph M. Hellerstein: “[CS262a: Advanced Topics in Computer Systems](http://people.eecs.berkeley.edu/~brewer/cs262/systemr.html),” lecture notes, University of California, Berkeley, *cs.berkeley.edu*, August 2011.
1. Michael Stonebraker: “[The Case for Polystores](http://wp.sigmod.org/?p=1629),” *wp.sigmod.org*, July 13, 2015.
1. Jennie Duggan, Aaron J. Elmore, Michael Stonebraker, et al.: “[The BigDAWG Polystore System](https://dspace.mit.edu/handle/1721.1/100936),” *ACM SIGMOD Record*, volume 44, number 2, pages 11–16, June 2015. [doi:10.1145/2814710.2814713](http://dx.doi.org/10.1145/2814710.2814713)
1. Patrycja Dybka: “[Foreign Data Wrappers for PostgreSQL](https://web.archive.org/web/20221003115732/https://www.vertabelo.com/blog/foreign-data-wrappers-for-postgresql/),” *vertabelo.com*, March 24, 2015.
1. David B. Lomet, Alan Fekete, Gerhard Weikum, and Mike Zwilling: “[Unbundling Transaction Services in the Cloud](https://www.microsoft.com/en-us/research/publication/unbundling-transaction-services-in-the-cloud/),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
1. Martin Kleppmann and Jay Kreps: “[Kafka, Samza and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/papers/kafka-debull15.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 4, pages 4–14, December 2015.
1. John Hugg: “[Winning Now and in the Future: Where VoltDB Shines](https://voltdb.com/blog/winning-now-and-future-where-voltdb-shines),” *voltdb.com*, March 23, 2016.
1. Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard: “[Differential Dataflow](http://cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf),” at *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
1. Derek G Murray, Frank McSherry, Rebecca Isaacs, et al.: “[Naiad: A Timely Dataflow System](http://sigops.org/s/conferences/sosp/2013/papers/p439-murray.pdf),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), pages 439–455, November 2013. [doi:10.1145/2517349.2522738](http://dx.doi.org/10.1145/2517349.2522738)
1. Gwen Shapira: “[We have a bunch of customers who are implementing ‘database inside-out’ concept and they all ask ‘is anyone else doing it? are we crazy?’](https://twitter.com/gwenshap/status/758800071110430720)” *twitter.com*, July 28, 2016.
1. Martin Kleppmann: “[Turning the Database Inside-out with Apache Samza,](http://martin.kleppmann.com/2015/03/04/turning-the-database-inside-out.html)” at *Strange Loop*, September 2014.
1. Peter Van Roy and Seif Haridi: [*Concepts, Techniques, and Models of Computer Programming*](https://www.info.ucl.ac.be/~pvr/book.html). MIT Press, 2004. ISBN: 978-0-262-22069-9
1. “[Juttle Documentation](http://juttle.github.io/juttle/),” *juttle.github.io*, 2016.
1. Evan Czaplicki and Stephen Chong: “[Asynchronous Functional Reactive Programming for GUIs](http://people.seas.harvard.edu/~chong/pubs/pldi13-elm.pdf),” at *34th ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2013. [doi:10.1145/2491956.2462161](http://dx.doi.org/10.1145/2491956.2462161)
1. Engineer Bainomugisha, Andoni Lombide Carreton, Tom van Cutsem, Stijn Mostinckx, and Wolfgang de Meuter: “[A Survey on Reactive Programming](http://soft.vub.ac.be/Publications/2012/vub-soft-tr-12-13.pdf),” *ACM Computing Surveys*, volume 45, number 4, pages 1–34, August 2013. [doi:10.1145/2501654.2501666](http://dx.doi.org/10.1145/2501654.2501666)
1. Peter Alvaro, Neil Conway, Joseph M. Hellerstein, and William R. Marczak: “[Consistency Analysis in Bloom: A CALM and Collected Approach](https://dsf.berkeley.edu/cs286/papers/calm-cidr2011.pdf),” at *5th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2011.
1. Felienne Hermans: “[Spreadsheets Are Code](https://vimeo.com/145492419),” at *Code Mesh*, November 2015.
1. Dan Bricklin and Bob Frankston: “[VisiCalc: Information from Its Creators](http://danbricklin.com/visicalc.htm),” *danbricklin.com*.
1. D. Sculley, Gary Holt, Daniel Golovin, et al.: “[Machine Learning: The High-Interest Credit Card of Technical Debt](http://research.google.com/pubs/pub43146.html),” at *NIPS Workshop on Software Engineering for Machine Learning* (SE4ML), December 2014.
1. Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](http://dx.doi.org/10.1145/2723372.2737784)
1. Guy Steele: “[Re: Need for Macros (Was Re: Icon)](https://people.csail.mit.edu/gregs/ll1-discuss-archive-html/msg01134.html),” email to *ll1-discuss* mailing list, *people.csail.mit.edu*, December 24, 2001.
1. David Gelernter: “[Generative Communication in Linda](http://cseweb.ucsd.edu/groups/csag/html/teaching/cse291s03/Readings/p80-gelernter.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 7, number 1, pages 80–112, January 1985. [doi:10.1145/2363.2433](http://dx.doi.org/10.1145/2363.2433)
1. Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
1. Ben Stopford: “[Microservices in a Streaming World](https://www.infoq.com/presentations/microservices-streaming),” at *QCon London*, March 2016.
1. Christian Posta: “[Why Microservices Should Be Event Driven: Autonomy vs Authority](http://blog.christianposta.com/microservices/why-microservices-should-be-event-driven-autonomy-vs-authority/),” *blog.christianposta.com*, May 27, 2016.
1. Alex Feyerke: “[Say Hello to Offline First](https://web.archive.org/web/20210420014747/http://hood.ie/blog/say-hello-to-offline-first.html),” *hood.ie*, November 5, 2013.
1. Sebastian Burckhardt, Daan Leijen, Jonathan Protzenko, and Manuel Fähndrich: “[Global Sequence Protocol: A Robust Abstraction for Replicated Shared State](http://drops.dagstuhl.de/opus/volltexte/2015/5238/),” at *29th European Conference on Object-Oriented Programming* (ECOOP), July 2015. [doi:10.4230/LIPIcs.ECOOP.2015.568](http://dx.doi.org/10.4230/LIPIcs.ECOOP.2015.568)
1. Mark Soper: “[Clearing Up React Data Management Confusion with Flux, Redux, and Relay](https://medium.com/@marksoper/clearing-up-react-data-management-confusion-with-flux-redux-and-relay-aad504e63cae),” *medium.com*, December 3, 2015.
1. Eno Thereska, Damian Guy, Michael Noll, and Neha Narkhede: “[Unifying Stream Processing and Interactive Queries in Apache Kafka](http://www.confluent.io/blog/unifying-stream-processing-and-interactive-queries-in-apache-kafka/),” *confluent.io*, October 26, 2016.
1. Frank McSherry: “[Dataflow as Database](https://github.com/frankmcsherry/blog/blob/master/posts/2016-07-17.md),” *github.com*, July 17, 2016.
1. Peter Alvaro: “[I See What You Mean](https://www.youtube.com/watch?v=R2Aa4PivG0g),” at *Strange Loop*, September 2015.
1. Nathan Marz: “[Trident: A High-Level Abstraction for Realtime Computation](https://blog.twitter.com/2012/trident-a-high-level-abstraction-for-realtime-computation),” *blog.twitter.com*, August 2, 2012.
1. Edi Bice: “[Low Latency Web Scale Fraud Prevention with Apache Samza, Kafka and Friends](http://www.slideshare.net/edibice/extremely-low-latency-web-scale-fraud-prevention-with-apache-samza-kafka-and-friends),” at *Merchant Risk Council MRC Vegas Conference*, March 2016.
1. Charity Majors: “[The Accidental DBA](https://charity.wtf/2016/10/02/the-accidental-dba/),” *charity.wtf*, October 2, 2016.
1. Arthur J. Bernstein, Philip M. Lewis, and Shiyong Lu: “[Semantic Conditions for Correctness at Different Isolation Levels](http://db.cs.berkeley.edu/cs286/papers/isolation-icde2000.pdf),” at *16th International Conference on Data Engineering* (ICDE), February 2000. [doi:10.1109/ICDE.2000.839387](http://dx.doi.org/10.1109/ICDE.2000.839387)
1. Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. Kyle Kingsbury: [Jepsen blog post series](https://aphyr.com/tags/jepsen), *aphyr.com*, 2013–2016.
1. Michael Jouravlev: “[Redirect After Post](http://www.theserverside.com/news/1365146/Redirect-After-Post),” *theserverside.com*, August 1, 2004.
1. Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-to-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
1. Peter Bailis, Alan Fekete, Michael J. Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
1. Alex Yarmula: “[Strong Consistency in Manhattan](https://blog.twitter.com/2016/strong-consistency-in-manhattan),” *blog.twitter.com*, March 17, 2016.
1. Douglas B Terry, Marvin M Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](http://css.csail.mit.edu/6.824/2014/papers/bayou-conflicts.pdf),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), pages 172–182, December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
1. Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
1. Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
1. Pat Helland: “[Memories, Guesses, and Apologies](https://web.archive.org/web/20160304020907/http://blogs.msdn.com/b/pathelland/archive/2007/05/15/memories-guesses-and-apologies.aspx),” *blogs.msdn.com*, May 15, 2007.
1. Yoongu Kim, Ross Daly, Jeremie Kim, et al.: “[Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf),” at *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.1145/2678373.2665726](http://dx.doi.org/10.1145/2678373.2665726)
1. Mark Seaborn and Thomas Dullien: “[Exploiting the DRAM Rowhammer Bug to Gain Kernel Privileges](https://googleprojectzero.blogspot.co.uk/2015/03/exploiting-dram-rowhammer-bug-to-gain.html),” *googleprojectzero.blogspot.co.uk*, March 9, 2015.
1. Jim N. Gray and Catharine van Ingen: “[Empirical Measurements of Disk Failure Rates and Error Rates](https://www.microsoft.com/en-us/research/publication/empirical-measurements-of-disk-failure-rates-and-error-rates/),” Microsoft Research, MSR-TR-2005-166, December 2005.
1. Annamalai Gurusami and Daniel Price: “[Bug #73170: Duplicates in Unique Secondary Index Because of Fix of Bug#68021](http://bugs.mysql.com/bug.php?id=73170),” *bugs.mysql.com*, July 2014.
1. Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
1. Xiao Chen: “[HDFS DataNode Scanners and Disk Checker Explained](http://blog.cloudera.com/blog/2016/12/hdfs-datanode-scanners-and-disk-checker-explained/),” *blog.cloudera.com*, December 20, 2016.
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
1. Sam Stokes: “[Move Fast with Confidence](http://blog.samstokes.co.uk/blog/2016/07/11/move-fast-with-confidence/),” *blog.samstokes.co.uk*, July 11, 2016.
1. “[Hyperledger Sawtooth documentation](https://web.archive.org/web/20220120211548/https://sawtooth.hyperledger.org/docs/core/releases/latest/introduction.html),” Intel Corporation, *sawtooth.hyperledger.org*, 2017.
1. Richard Gendal Brown: “[Introducing R3 Corda™: A Distributed Ledger Designed for Financial Services](https://gendal.me/2016/04/05/introducing-r3-corda-a-distributed-ledger-designed-for-financial-services/),” *gendal.me*, April 5, 2016.
1. Trent McConaghy, Rodolphe Marques, Andreas Müller, et al.: “[BigchainDB: A Scalable Blockchain Database](https://www.bigchaindb.com/whitepaper/bigchaindb-whitepaper.pdf),” *bigchaindb.com*, June 8, 2016.
1. Ralph C. Merkle: “[A Digital Signature Based on a Conventional Encryption Function](https://people.eecs.berkeley.edu/~raluca/cs261-f15/readings/merkle.pdf),” at *CRYPTO '87*, August 1987. [doi:10.1007/3-540-48184-2_32](http://dx.doi.org/10.1007/3-540-48184-2_32)
1. Ben Laurie: “[Certificate Transparency](http://queue.acm.org/detail.cfm?id=2668154),” *ACM Queue*, volume 12, number 8, pages 10–19, August 2014. [doi:10.1145/2668152.2668154](http://dx.doi.org/10.1145/2668152.2668154)
1. Mark D. Ryan: “[Enhanced Certificate Transparency and End-to-End Encrypted Mail](https://www.ndss-symposium.org/wp-content/uploads/2017/09/12_2_1.pdf),” at *Network and Distributed System Security Symposium* (NDSS), February 2014. [doi:10.14722/ndss.2014.23379](http://dx.doi.org/10.14722/ndss.2014.23379)
1. “[ACM Code of Ethics and Professional Conduct](https://www.acm.org/code-of-ethics),” Association for Computing Machinery, *acm.org*, 2018.
1. François Chollet: “[Software development is starting to involve important ethical choices](https://twitter.com/fchollet/status/792958695722201088),” *twitter.com*, October 30, 2016.
1. Igor Perisic: “[Making Hard Choices: The Quest for Ethics in Machine Learning](https://engineering.linkedin.com/blog/2016/11/making-hard-choices--the-quest-for-ethics-in-machine-learning),” *engineering.linkedin.com*, November 2016.
1. John Naughton: “[Algorithm Writers Need a Code of Conduct](https://www.theguardian.com/commentisfree/2015/dec/06/algorithm-writers-should-have-code-of-conduct),” *theguardian.com*, December 6, 2015.
1. Logan Kugler: “[What Happens When Big Data Blunders?](http://cacm.acm.org/magazines/2016/6/202655-what-happens-when-big-data-blunders/fulltext),” *Communications of the ACM*, volume 59, number 6, pages 15–16, June 2016. [doi:10.1145/2911975](http://dx.doi.org/10.1145/2911975)
1. Bill Davidow: “[Welcome to Algorithmic Prison](http://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/),” *theatlantic.com*, February 20, 2014.
1. Don Peck: “[They're Watching You at Work](http://www.theatlantic.com/magazine/archive/2013/12/theyre-watching-you-at-work/354681/),” *theatlantic.com*, December 2013.
1. Leigh Alexander: “[Is an Algorithm Any Less Racist Than a Human?](https://www.theguardian.com/technology/2016/aug/03/algorithm-racist-human-employers-work)” *theguardian.com*, August 3, 2016.
1. Jesse Emspak: “[How a Machine Learns Prejudice](https://www.scientificamerican.com/article/how-a-machine-learns-prejudice/),” *scientificamerican.com*, December 29, 2016.
1. Maciej Cegłowski: “[The Moral Economy of Tech](http://idlewords.com/talks/sase_panel.htm),” *idlewords.com*, June 2016.
1. Cathy O'Neil: [*Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*](https://web.archive.org/web/20210621234447/https://weaponsofmathdestructionbook.com/). Crown Publishing, 2016. ISBN: 978-0-553-41881-1
1. Julia Angwin: “[Make Algorithms Accountable](http://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html),” *nytimes.com*, August 1, 2016.
1. Bryce Goodman and Seth Flaxman: “[European Union Regulations on Algorithmic Decision-Making and a ‘Right to Explanation’](https://arxiv.org/abs/1606.08813),” *arXiv:1606.08813*, August 31, 2016.
1. “[A Review of the Data Broker Industry: Collection, Use, and Sale of Consumer Data for Marketing Purposes](https://web.archive.org/web/20240619042302/http://educationnewyork.com/files/rockefeller_databroker.pdf),” Staff Report, *United States Senate Committee on Commerce, Science, and Transportation*, *commerce.senate.gov*, December 2013.
1. Olivia Solon: “[Facebook’s Failure: Did Fake News and Polarized Politics Get Trump Elected?](https://www.theguardian.com/technology/2016/nov/10/facebook-fake-news-election-conspiracy-theories)” *theguardian.com*, November 10, 2016.
1. Donella H. Meadows and Diana Wright: *Thinking in Systems: A Primer*. Chelsea Green Publishing, 2008. ISBN: 978-1-603-58055-7
1. Daniel J. Bernstein: “[Listening to a ‘big data’/‘data science’ talk](https://twitter.com/hashbreaker/status/598076230437568512),” *twitter.com*, May 12, 2015.
1. Marc Andreessen: “[Why Software Is Eating the World](http://genius.com/Marc-andreessen-why-software-is-eating-the-world-annotated),” *The Wall Street Journal*, 20 August 2011.
1. J. M. Porup: “[‘Internet of Things’ Security Is Hilariously Broken and Getting Worse](http://arstechnica.com/security/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/),” *arstechnica.com*, January 23, 2016.
1. Bruce Schneier: [*Data and Goliath: The Hidden Battles to Collect Your Data and Control Your World*](https://www.schneier.com/books/data_and_goliath/). W. W. Norton, 2015. ISBN: 978-0-393-35217-7
1. The Grugq: “[Nothing to Hide](https://grugq.tumblr.com/post/142799983558/nothing-to-hide),” *grugq.tumblr.com*, April 15, 2016.
1. Tony Beltramelli: “[Deep-Spying: Spying Using Smartwatch and Deep Learning](https://arxiv.org/abs/1512.05616),” Masters Thesis, IT University of Copenhagen, December 2015. Available at *arxiv.org/abs/1512.05616*
1. Shoshana Zuboff: “[Big Other: Surveillance Capitalism and the Prospects of an Information Civilization](http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2594754),” *Journal of Information Technology*, volume 30, number 1, pages 75–89, April 2015. [doi:10.1057/jit.2015.5](http://dx.doi.org/10.1057/jit.2015.5)
1. Carina C. Zona: “[Consequences of an Insightful Algorithm](https://www.youtube.com/watch?v=YRI40A4tyWU),” at *GOTO Berlin*, November 2016.
1. Bruce Schneier: “[Data Is a Toxic Asset, So Why Not Throw It Out?](https://www.schneier.com/essays/archives/2016/03/data_is_a_toxic_asse.html),” *schneier.com*, March 1, 2016.
1. John E. Dunn: “[The UK’s 15 Most Infamous Data Breaches](https://web.archive.org/web/20161120070058/http://www.techworld.com/security/uks-most-infamous-data-breaches-2016-3604586/),” *techworld.com*, November 18, 2016.
1. Cory Scott: “[Data is not toxic - which implies no benefit - but rather hazardous material, where we must balance need vs. want](https://twitter.com/cory_scott/status/706586399483437056),” *twitter.com*, March 6, 2016.
1. Bruce Schneier: “[Mission Creep: When Everything Is Terrorism](https://www.schneier.com/essays/archives/2013/07/mission_creep_when_e.html),” *schneier.com*, July 16, 2013.
1. Lena Ulbricht and Maximilian von Grafenstein: “[Big Data: Big Power Shifts?](http://policyreview.info/articles/analysis/big-data-big-power-shifts),” *Internet Policy Review*, volume 5, number 1, March 2016. [doi:10.14763/2016.1.406](http://dx.doi.org/10.14763/2016.1.406)
1. Ellen P. Goodman and Julia Powles: “[Facebook and Google: Most Powerful and Secretive Empires We've Ever Known](https://www.theguardian.com/technology/2016/sep/28/google-facebook-powerful-secretive-empire-transparency),” *theguardian.com*, September 28, 2016.
1. [Directive 95/46/EC on the protection of individuals with regard to the processing of personal data and on the free movement of such data](http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:31995L0046), Official Journal of the European Communities No. L 281/31, *eur-lex.europa.eu*, November 1995.
1. Brendan Van Alsenoy: “[Regulating Data Protection: The Allocation of Responsibility and Risk Among Actors Involved in Personal Data Processing](https://lirias.kuleuven.be/handle/123456789/545027),” Thesis, KU Leuven Centre for IT and IP Law, August 2016.
1. Michiel Rhoen: “[Beyond Consent: Improving Data Protection Through Consumer Protection Law](http://policyreview.info/articles/analysis/beyond-consent-improving-data-protection-through-consumer-protection-law),” *Internet Policy Review*, volume 5, number 1, March 2016. [doi:10.14763/2016.1.404](http://dx.doi.org/10.14763/2016.1.404)
1. Jessica Leber: “[Your Data Footprint Is Affecting Your Life in Ways You Can’t Even Imagine](https://www.fastcoexist.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine),” *fastcoexist.com*, March 15, 2016.
1. Maciej Cegłowski: “[Haunted by Data](http://idlewords.com/talks/haunted_by_data.htm),” *idlewords.com*, October 2015.
1. Sam Thielman: “[You Are Not What You Read: Librarians Purge User Data to Protect Privacy](https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy),” *theguardian.com*, January 13, 2016.
1. Conor Friedersdorf: “[Edward Snowden’s Other Motive for Leaking](http://www.theatlantic.com/politics/archive/2014/05/edward-snowdens-other-motive-for-leaking/370068/),” *theatlantic.com*, May 13, 2014.
1. Phillip Rogaway: “[The Moral Character of Cryptographic Work](http://web.cs.ucdavis.edu/~rogaway/papers/moral-fn.pdf),” Cryptology ePrint 2015/1162, December 2015.

================================================
FILE: content/v1_tw/ch2.md
================================================
---
title: "第二章：資料模型與查詢語言"
linkTitle: "2. 資料模型與查詢語言"
weight: 102
math: true
breadcrumbs: false
---


![](/map/ch02.png)

> 語言的邊界就是思想的邊界。
>
> —— 路德維希・維特根斯坦，《邏輯哲學論》（1922）


資料模型可能是軟體開發中最重要的部分了，因為它們的影響如此深遠：不僅僅影響著軟體的編寫方式，而且影響著我們的 **解題思路**。

多數應用使用層層疊加的資料模型構建。對於每層資料模型的關鍵問題是：它是如何用低一層資料模型來 **表示** 的？例如：

1. 作為一名應用開發人員，你觀察現實世界（裡面有人員、組織、貨物、行為、資金流向、感測器等），並採用物件或資料結構，以及操控那些資料結構的 API 來進行建模。那些結構通常是特定於應用程式的。
2. 當要儲存那些資料結構時，你可以利用通用資料模型來表示它們，如 JSON 或 XML 文件、關係資料庫中的表或圖模型。
3. 資料庫軟體的工程師選定如何以記憶體、磁碟或網路上的位元組來表示 JSON / XML / 關係 / 圖資料。這類表示形式使資料有可能以各種方式來查詢，搜尋，操縱和處理。
4. 在更低的層次上，硬體工程師已經想出了使用電流、光脈衝、磁場或者其他東西來表示位元組的方法。

一個複雜的應用程式可能會有更多的中間層次，比如基於 API 的 API，不過基本思想仍然是一樣的：每個層都透過提供一個明確的資料模型來隱藏更低層次中的複雜性。這些抽象允許不同的人群有效地協作（例如資料庫廠商的工程師和使用資料庫的應用程式開發人員）。

資料模型種類繁多，每個資料模型都帶有如何使用的設想。有些用法很容易，有些則不支援如此；有些操作執行很快，有些則表現很差；有些資料轉換非常自然，有些則很麻煩。

掌握一個資料模型需要花費很多精力（想想關係資料建模有多少本書）。即便只使用一個資料模型，不用操心其內部工作機制，構建軟體也是非常困難的。然而，因為資料模型對上層軟體的功能（能做什麼，不能做什麼）有著至深的影響，所以選擇一個適合的資料模型是非常重要的。

在本章中，我們將研究一系列用於資料儲存和查詢的通用資料模型（前面列表中的第 2 點）。特別地，我們將比較關係模型，文件模型和少量基於圖形的資料模型。我們還將檢視各種查詢語言並比較它們的用例。在 [第三章](/v1_tw/ch3) 中，我們將討論儲存引擎是如何工作的。也就是說，這些資料模型實際上是如何實現的（列表中的第 3 點）。


## 關係模型與文件模型

現在最著名的資料模型可能是 SQL。它基於 Edgar Codd 在 1970 年提出的關係模型【1】：資料被組織成 **關係**（SQL 中稱作 **表**），其中每個關係是 **元組**（SQL 中稱作 **行**）的無序集合。

關係模型曾是一個理論性的提議，當時很多人都懷疑是否能夠有效實現它。然而到了 20 世紀 80 年代中期，關係資料庫管理系統（RDBMSes）和 SQL 已成為大多數人們儲存和查詢某些常規結構的資料的首選工具。關係資料庫已經持續稱霸了大約 25~30 年 —— 這對計算機史來說是極其漫長的時間。

關係資料庫起源於商業資料處理，在 20 世紀 60 年代和 70 年代用大型計算機來執行。從今天的角度來看，那些用例顯得很平常：典型的 **事務處理**（將銷售或銀行交易，航空公司預訂，庫存管理資訊記錄在庫）和 **批處理**（客戶發票，工資單，報告）。

當時的其他資料庫迫使應用程式開發人員必須考慮資料庫內部的資料表示形式。關係模型致力於將上述實現細節隱藏在更簡潔的介面之後。

多年來，在資料儲存和查詢方面存在著許多相互競爭的方法。在 20 世紀 70 年代和 80 年代初，網狀模型（network model）和層次模型（hierarchical model）曾是主要的選擇，但關係模型（relational model）隨後佔據了主導地位。物件資料庫在 20 世紀 80 年代末和 90 年代初來了又去。XML 資料庫在二十一世紀初出現，但只有小眾採用過。關係模型的每個競爭者都在其時代產生了大量的炒作，但從來沒有持續【2】。

隨著電腦越來越強大和互聯，它們開始用於日益多樣化的目的。關係資料庫非常成功地被推廣到業務資料處理的原始範圍之外更為廣泛的用例上。你今天在網上看到的大部分內容依舊是由關係資料庫來提供支援，無論是線上釋出、討論、社交網路、電子商務、遊戲、軟體即服務生產力應用程式等內容。

### NoSQL 的誕生

現在 - 2010 年代，NoSQL 開始了最新一輪嘗試，試圖推翻關係模型的統治地位。“NoSQL” 這個名字讓人遺憾，因為實際上它並沒有涉及到任何特定的技術。最初它只是作為一個醒目的 Twitter 標籤，用在 2009 年一個關於分散式，非關係資料庫上的開源聚會上。無論如何，這個術語觸動了某些神經，並迅速在網路創業社群內外傳播開來。好些有趣的資料庫系統現在都與 *#NoSQL* 標籤相關聯，並且 NoSQL 被追溯性地重新解釋為 **不僅是 SQL（Not Only SQL）** 【4】。

採用 NoSQL 資料庫的背後有幾個驅動因素，其中包括：

* 需要比關係資料庫更好的可伸縮性，包括非常大的資料集或非常高的寫入吞吐量
* 相比商業資料庫產品，免費和開源軟體更受偏愛
* 關係模型不能很好地支援一些特殊的查詢操作
* 受挫於關係模型的限制性，渴望一種更具動態性與表現力的資料模型【5】

不同的應用程式有不同的需求，一個用例的最佳技術選擇可能不同於另一個用例的最佳技術選擇。因此，在可預見的未來，關係資料庫似乎可能會繼續與各種非關係資料庫一起使用 - 這種想法有時也被稱為 **混合持久化（polyglot persistence）**。

### 物件關係不匹配

目前大多數應用程式開發都使用物件導向的程式語言來開發，這導致了對 SQL 資料模型的普遍批評：如果資料儲存在關係表中，那麼需要一個笨拙的轉換層，處於應用程式程式碼中的物件和表，行，列的資料庫模型之間。模型之間的不連貫有時被稱為 **阻抗不匹配（impedance mismatch）**[^i]。

[^i]: 一個從電子學借用的術語。每個電路的輸入和輸出都有一定的阻抗（交流電阻）。當你將一個電路的輸出連線到另一個電路的輸入時，如果兩個電路的輸出和輸入阻抗匹配，則連線上的功率傳輸將被最大化。阻抗不匹配會導致訊號反射及其他問題。

像 ActiveRecord 和 Hibernate 這樣的 **物件關係對映（ORM object-relational mapping）** 框架可以減少這個轉換層所需的樣板程式碼的數量，但是它們不能完全隱藏這兩個模型之間的差異。

![](/v1/ddia_0201.png)

**圖 2-1 使用關係型模式來表示領英簡介**

例如，[圖 2-1](/v1/ddia_0201.png) 展示了如何在關係模式中表示簡歷（一個 LinkedIn 簡介）。整個簡介可以透過一個唯一的識別符號 `user_id` 來標識。像 `first_name` 和 `last_name` 這樣的欄位每個使用者只出現一次，所以可以在 User 表上將其建模為列。但是，大多數人在職業生涯中擁有多於一份的工作，人們可能有不同樣的教育階段和任意數量的聯絡資訊。從使用者到這些專案之間存在一對多的關係，可以用多種方式來表示：

* 傳統 SQL 模型（SQL：1999 之前）中，最常見的正規化表示形式是將職位，教育和聯絡資訊放在單獨的表中，對 User 表提供外部索引鍵引用，如 [圖 2-1](/v1/ddia_0201.png) 所示。
* 後續的 SQL 標準增加了對結構化資料型別和 XML 資料的支援；這允許將多值資料儲存在單行內，並支援在這些文件內查詢和索引。這些功能在 Oracle，IBM DB2，MS SQL Server 和 PostgreSQL 中都有不同程度的支援【6,7】。JSON 資料型別也得到多個資料庫的支援，包括 IBM DB2，MySQL 和 PostgreSQL 【8】。
* 第三種選擇是將職業，教育和聯絡資訊編碼為 JSON 或 XML 文件，將其儲存在資料庫的文字列中，並讓應用程式解析其結構和內容。這種配置下，通常不能使用資料庫來查詢該編碼列中的值。

對於一個像簡歷這樣自包含文件的資料結構而言，JSON 表示是非常合適的：請參閱 [例 2-1]()。JSON 比 XML 更簡單。面向文件的資料庫（如 MongoDB 【9】，RethinkDB 【10】，CouchDB 【11】和 Espresso【12】）支援這種資料模型。

**例 2-1. 用 JSON 文件表示一個 LinkedIn 簡介**

```json
{
  "user_id": 251,
  "first_name": "Bill",
  "last_name": "Gates",
  "summary": "Co-chair of the Bill & Melinda Gates... Active blogger.",
  "region_id": "us:91",
  "industry_id": 131,
  "photo_url": "/p/7/000/253/05b/308dd6e.jpg",
  "positions": [
    {
      "job_title": "Co-chair",
      "organization": "Bill & Melinda Gates Foundation"
    },
    {
      "job_title": "Co-founder, Chairman",
      "organization": "Microsoft"
    }
  ],
  "education": [
    {
      "school_name": "Harvard University",
      "start": 1973,
      "end": 1975
    },
    {
      "school_name": "Lakeside School, Seattle",
      "start": null,
      "end": null
    }
  ],
  "contact_info": {
    "blog": "http://thegatesnotes.com",
    "twitter": "http://twitter.com/BillGates"
  }
}
```

有一些開發人員認為 JSON 模型減少了應用程式程式碼和儲存層之間的阻抗不匹配。不過，正如我們將在 [第四章](/v1_tw/ch4) 中看到的那樣，JSON 作為資料編碼格式也存在問題。無模式對 JSON 模型來說往往被認為是一個優勢；我們將在 “[文件模型中的模式靈活性](#文件模型中的模式靈活性)” 中討論這個問題。

JSON 表示比 [圖 2-1](/v1/ddia_0201.png) 中的多表模式具有更好的 **區域性（locality）**。如果在前面的關係型示例中獲取簡介，那需要執行多個查詢（透過 `user_id` 查詢每個表），或者在 User 表與其下屬表之間混亂地執行多路連線。而在 JSON 表示中，所有相關資訊都在同一個地方，一個查詢就足夠了。

從使用者簡介檔案到使用者職位，教育歷史和聯絡資訊，這種一對多關係隱含了資料中的一個樹狀結構，而 JSON 表示使得這個樹狀結構變得明確（見 [圖 2-2](/v1/ddia_0202.png)）。

![](/v1/ddia_0202.png)

**圖 2-2 一對多關係構建了一個樹結構**

### 多對一和多對多的關係

在上一節的 [例 2-1]() 中，`region_id` 和 `industry_id` 是以 ID，而不是純字串 “Greater Seattle Area” 和 “Philanthropy” 的形式給出的。為什麼？

如果使用者介面用一個自由文字欄位來輸入區域和行業，那麼將他們儲存為純文字字串是合理的。另一方式是給出地理區域和行業的標準化的列表，並讓使用者從下拉列表或自動填充器中進行選擇，其優勢如下：

* 各個簡介之間樣式和拼寫統一
* 避免歧義（例如，如果有幾個同名的城市）
* 易於更新 —— 名稱只儲存在一個地方，如果需要更改（例如，由於政治事件而改變城市名稱），很容易進行全面更新。
* 本地化支援 —— 當網站翻譯成其他語言時，標準化的列表可以被本地化，使得地區和行業可以使用使用者的語言來顯示
* 更好的搜尋 —— 例如，搜尋華盛頓州的慈善家就會匹配這份簡介，因為地區列表可以編碼記錄西雅圖在華盛頓這一事實（從 “Greater Seattle Area” 這個字串中看不出來）

儲存 ID 還是文字字串，這是個 **副本（duplication）** 問題。當使用 ID 時，對人類有意義的資訊（比如單詞：Philanthropy）只儲存在一處，所有引用它的地方使用 ID（ID 只在資料庫中有意義）。當直接儲存文字時，對人類有意義的資訊會複製在每處使用記錄中。

使用 ID 的好處是，ID 對人類沒有任何意義，因而永遠不需要改變：ID 可以保持不變，即使它標識的資訊發生變化。任何對人類有意義的東西都可能需要在將來某個時候改變 —— 如果這些資訊被複制，所有的冗餘副本都需要更新。這會導致寫入開銷，也存在不一致的風險（一些副本被更新了，還有些副本沒有被更新）。去除此類重複是資料庫 **正規化（normalization）** 的關鍵思想。[^ii]

[^ii]: 關於關係模型的文獻區分了幾種不同的規範形式，但這些區別幾乎沒有實際意義。一個經驗法則是，如果重複儲存了可以儲存在一個地方的值，則模式就不是 **正規化（normalized）** 的。

> 資料庫管理員和開發人員喜歡爭論正規化和反正規化，讓我們暫時保留判斷吧。在本書的 [第三部分](/v1_tw/part-iii)，我們將回到這個話題，探討系統的方法用以處理快取，反正規化和衍生資料。

不幸的是，對這些資料進行正規化需要多對一的關係（許多人生活在一個特定的地區，許多人在一個特定的行業工作），這與文件模型不太吻合。在關係資料庫中，透過 ID 來引用其他表中的行是正常的，因為連線很容易。在文件資料庫中，一對多樹結構沒有必要用連線，對連線的支援通常很弱 [^iii]。

[^iii]: 在撰寫本文時，RethinkDB 支援連線，MongoDB 不支援連線，而 CouchDB 只支援預先宣告的檢視。

如果資料庫本身不支援連線，則必須在應用程式程式碼中透過對資料庫進行多個查詢來模擬連線。（在這種情況中，地區和行業的列表可能很小，改動很少，應用程式可以簡單地將其儲存在記憶體中。不過，執行連線的工作從資料庫被轉移到應用程式程式碼上。）

此外，即便應用程式的最初版本適合無連線的文件模型，隨著功能新增到應用程式中，資料會變得更加互聯。例如，考慮一下對簡歷例子進行的一些修改：

組織和學校作為實體
: 在前面的描述中，`organization`（使用者工作的公司）和 `school_name`（他們學習的地方）只是字串。也許他們應該是對實體的引用呢？然後，每個組織、學校或大學都可以擁有自己的網頁（標識、新聞提要等）。每個簡歷可以連結到它所提到的組織和學校，並且包括他們的圖示和其他資訊（請參閱 [圖 2-3](/v1/ddia_0203.png)，來自 LinkedIn 的一個例子）。

推薦
: 假設你想新增一個新的功能：一個使用者可以為另一個使用者寫一個推薦。在使用者的簡歷上顯示推薦，並附上推薦使用者的姓名和照片。如果推薦人更新他們的照片，那他們寫的任何推薦都需要顯示新的照片。因此，推薦應該擁有作者個人簡介的引用。

![](/v1/ddia_0203.png)

**圖 2-3 公司名不僅是字串，還是一個指向公司實體的連結（LinkedIn 截圖）**

[圖 2-4](/v1/ddia_0204.png) 闡明了這些新功能需要如何使用多對多關係。每個虛線矩形內的資料可以分組成一個文件，但是對單位，學校和其他使用者的引用需要表示成引用，並且在查詢時需要連線。

![](/v1/ddia_0204.png)

**圖 2-4 使用多對多關係擴充套件簡歷**

### 文件資料庫是否在重蹈覆轍？

在多對多的關係和連線已常規用在關係資料庫時，文件資料庫和 NoSQL 重啟了辯論：如何以最佳方式在資料庫中表示多對多關係。那場辯論可比 NoSQL 古老得多，事實上，可以追溯到最早的計算機化資料庫系統。

20 世紀 70 年代最受歡迎的業務資料處理資料庫是 IBM 的資訊管理系統（IMS），最初是為了阿波羅太空計劃的庫存管理而開發的，並於 1968 年有了首次商業釋出【13】。目前它仍在使用和維護，執行在 IBM 大型機的 OS/390 上【14】。

IMS 的設計中使用了一個相當簡單的資料模型，稱為 **層次模型（hierarchical model）**，它與文件資料庫使用的 JSON 模型有一些驚人的相似之處【2】。它將所有資料表示為巢狀在記錄中的記錄樹，這很像 [圖 2-2](/v1/ddia_0202.png) 的 JSON 結構。

同文件資料庫一樣，IMS 能良好處理一對多的關係，但是很難應對多對多的關係，並且不支援連線。開發人員必須決定是否複製（反正規化）資料或手動解決從一個記錄到另一個記錄的引用。這些二十世紀六七十年代的問題與現在開發人員遇到的文件資料庫問題非常相似【15】。

那時人們提出了各種不同的解決方案來解決層次模型的侷限性。其中最突出的兩個是 **關係模型**（relational model，它變成了 SQL，並統治了世界）和 **網狀模型**（network model，最初很受關注，但最終變得冷門）。這兩個陣營之間的 “大辯論” 在 70 年代持續了很長時間【2】。

那兩個模式解決的問題與當前的問題相關，因此值得簡要回顧一下那場辯論。

#### 網狀模型

網狀模型由一個稱為資料系統語言會議（CODASYL）的委員會進行了標準化，並被數個不同的資料庫廠商實現；它也被稱為 CODASYL 模型【16】。

CODASYL 模型是層次模型的推廣。在層次模型的樹結構中，每條記錄只有一個父節點；在網狀模型中，每條記錄可能有多個父節點。例如，“Greater Seattle Area” 地區可能是一條記錄，每個居住在該地區的使用者都可以與之相關聯。這允許對多對一和多對多的關係進行建模。

網狀模型中記錄之間的連結不是外部索引鍵，而更像程式語言中的指標（同時仍然儲存在磁碟上）。訪問記錄的唯一方法是跟隨從根記錄起沿這些鏈路所形成的路徑。這被稱為 **訪問路徑（access path）**。

最簡單的情況下，訪問路徑類似遍歷連結串列：從列表頭開始，每次檢視一條記錄，直到找到所需的記錄。但在多對多關係的情況中，數條不同的路徑可以到達相同的記錄，網狀模型的程式設計師必須跟蹤這些不同的訪問路徑。

CODASYL 中的查詢是透過遍歷記錄列表和跟隨訪問路徑，在資料庫中移動遊標來執行的。如果記錄有多個父結點（即多個來自其他記錄的傳入指標），則應用程式程式碼必須跟蹤所有的各種關係。甚至 CODASYL 委員會成員也承認，這就像在 n 維資料空間中進行導航【17】。

儘管手動選擇訪問路徑能夠最有效地利用 20 世紀 70 年代非常有限的硬體功能（如磁帶驅動器，其搜尋速度非常慢），但這使得查詢和更新資料庫的程式碼變得複雜不靈活。無論是分層還是網狀模型，如果你沒有所需資料的路徑，就會陷入困境。你可以改變訪問路徑，但是必須瀏覽大量手寫資料庫查詢程式碼，並重寫來處理新的訪問路徑。更改應用程式的資料模型是很難的。

#### 關係模型

相比之下，關係模型做的就是將所有的資料放在光天化日之下：一個 **關係（表）** 只是一個 **元組（行）** 的集合，僅此而已。如果你想讀取資料，它沒有迷宮似的巢狀結構，也沒有複雜的訪問路徑。你可以選中符合任意條件的行，讀取表中的任何或所有行。你可以透過指定某些列作為匹配關鍵字來讀取特定行。你可以在任何表中插入一個新的行，而不必擔心與其他表的外部索引鍵關係 [^iv]。

[^iv]: 外部索引鍵約束允許對修改進行限制，但對於關係模型這並不是必選項。即使有約束，外部索引鍵連線在查詢時執行，而在 CODASYL 中，連線在插入時高效完成。

在關係資料庫中，查詢最佳化器自動決定查詢的哪些部分以哪個順序執行，以及使用哪些索引。這些選擇實際上是 “訪問路徑”，但最大的區別在於它們是由查詢最佳化器自動生成的，而不是由程式設計師生成，所以我們很少需要考慮它們。

如果想按新的方式查詢資料，你可以宣告一個新的索引，查詢會自動使用最合適的那些索引。無需更改查詢來利用新的索引（請參閱 “[資料查詢語言](#資料查詢語言)”）。關係模型因此使新增應用程式新功能變得更加容易。

關係資料庫的查詢最佳化器是複雜的，已耗費了多年的研究和開發精力【18】。關係模型的一個關鍵洞察是：只需構建一次查詢最佳化器，隨後使用該資料庫的所有應用程式都可以從中受益。如果你沒有查詢最佳化器的話，那麼為特定查詢手動編寫訪問路徑比編寫通用最佳化器更容易 —— 不過從長期看通用解決方案更好。

#### 與文件資料庫相比

在一個方面，文件資料庫還原為層次模型：在其父記錄中儲存巢狀記錄（[圖 2-1](/v1/ddia_0201.png) 中的一對多關係，如 `positions`，`education` 和 `contact_info`），而不是在單獨的表中。

但是，在表示多對一和多對多的關係時，關係資料庫和文件資料庫並沒有根本的不同：在這兩種情況下，相關專案都被一個唯一的識別符號引用，這個識別符號在關係模型中被稱為 **外部索引鍵**，在文件模型中稱為 **文件引用**【9】。該識別符號在讀取時透過連線或後續查詢來解析。迄今為止，文件資料庫沒有走 CODASYL 的老路。

### 關係型資料庫與文件資料庫在今日的對比

將關係資料庫與文件資料庫進行比較時，可以考慮許多方面的差異，包括它們的容錯屬性（請參閱 [第五章](/v1_tw/ch5)）和處理併發性（請參閱 [第七章](/v1_tw/ch7)）。本章將只關注資料模型中的差異。

支援文件資料模型的主要論據是模式靈活性，因區域性而擁有更好的效能，以及對於某些應用程式而言更接近於應用程式使用的資料結構。關係模型透過為連線提供更好的支援以及支援多對一和多對多的關係來反擊。

#### 哪種資料模型更有助於簡化應用程式碼？

如果應用程式中的資料具有類似文件的結構（即，一對多關係樹，通常一次性載入整個樹），那麼使用文件模型可能是一個好主意。將類似文件的結構分解成多個表（如 [圖 2-1](/v1/ddia_0201.png) 中的 `positions`、`education` 和 `contact_info`）的關係技術可能導致繁瑣的模式和不必要的複雜的應用程式程式碼。

文件模型有一定的侷限性：例如，不能直接引用文件中的巢狀的專案，而是需要說 “使用者 251 的職位列表中的第二項”（很像層次模型中的訪問路徑）。但是，只要文件巢狀不太深，這通常不是問題。

文件資料庫對連線的糟糕支援可能是個問題，也可能不是問題，這取決於應用程式。例如，如果某分析型應用程式使用一個文件資料庫來記錄何時何地發生了何事，那麼多對多關係可能永遠也用不上【19】。

但如果你的應用程式確實會用到多對多關係，那麼文件模型就沒有那麼誘人了。儘管可以透過反正規化來消除對連線的需求，但這需要應用程式程式碼來做額外的工作以確保資料一致性。儘管應用程式程式碼可以透過向資料庫發出多個請求的方式來模擬連線，但這也將複雜性轉移到應用程式中，而且通常也會比由資料庫內的專用程式碼更慢。在這種情況下，使用文件模型可能會導致更複雜的應用程式碼與更差的效能【15】。

我們沒有辦法說哪種資料模型更有助於簡化應用程式碼，因為它取決於資料項之間的關係種類。對高度關聯的資料而言，文件模型是極其糟糕的，關係模型是可以接受的，而選用圖形模型（請參閱 “[圖資料模型](#圖資料模型)”）是最自然的。

#### 文件模型中的模式靈活性

大多數文件資料庫以及關係資料庫中的 JSON 支援都不會強制文件中的資料採用何種模式。關係資料庫的 XML 支援通常帶有可選的模式驗證。沒有模式意味著可以將任意的鍵和值新增到文件中，並且當讀取時，客戶端無法保證文件可能包含的欄位。

文件資料庫有時稱為 **無模式（schemaless）**，但這具有誤導性，因為讀取資料的程式碼通常假定某種結構 —— 即存在隱式模式，但不由資料庫強制執行【20】。一個更精確的術語是 **讀時模式**（即 schema-on-read，資料的結構是隱含的，只有在資料被讀取時才被解釋），相應的是 **寫時模式**（即 schema-on-write，傳統的關係資料庫方法中，模式明確，且資料庫確保所有的資料都符合其模式）【21】。

讀時模式類似於程式語言中的動態（執行時）型別檢查，而寫時模式類似於靜態（編譯時）型別檢查。就像靜態和動態型別檢查的相對優點具有很大的爭議性一樣【22】，資料庫中模式的強制性是一個具有爭議的話題，一般來說沒有正確或錯誤的答案。

在應用程式想要改變其資料格式的情況下，這些方法之間的區別尤其明顯。例如，假設你把每個使用者的全名儲存在一個欄位中，而現在想分別儲存名字和姓氏【23】。在文件資料庫中，只需開始寫入具有新欄位的新文件，並在應用程式中使用程式碼來處理讀取舊文件的情況。例如：

```js
if (user && user.name && !user.first_name) {
  // Documents written before Dec 8, 2013 don't have first_name
  user.first_name = user.name.split(" ")[0];
}
```

另一方面，在 “靜態型別” 資料庫模式中，通常會執行以下 **遷移（migration）** 操作：

```sql
ALTER TABLE users ADD COLUMN first_name text;
UPDATE users SET first_name = split_part(name, ' ', 1);      -- PostgreSQL
UPDATE users SET first_name = substring_index(name, ' ', 1);      -- MySQL
```

模式變更素有速度慢、需要停機的壞名聲。這種壞名聲並不是完全應得的：大多數關係資料庫系統可在幾毫秒內執行 `ALTER TABLE` 語句。MySQL 是一個值得注意的例外，它執行 `ALTER TABLE` 時會複製整個表，這可能意味著在更改一個大型表時會花費幾分鐘甚至幾個小時的停機時間，儘管存在各種工具來解決這個限制【24,25,26】。

大型表上執行 `UPDATE` 語句在任何資料庫上都可能會很慢，因為每一行都需要重寫。要是不可接受的話，應用程式可以將 `first_name` 設定為預設值 `NULL`，並在讀取時再填充，就像使用文件資料庫一樣。

當由於某種原因（例如，資料是異構的）集合中的專案並不都具有相同的結構時，讀時模式更具優勢。例如，如果：

* 存在許多不同型別的物件，將每種型別的物件放在自己的表中是不現實的。
* 資料的結構由外部系統決定。你無法控制外部系統且它隨時可能變化。

在上述情況下，模式的壞處遠大於它的幫助，無模式文件可能是一個更加自然的資料模型。但是，要是所有記錄都具有相同的結構，那麼模式是記錄並強制這種結構的有效機制。第四章將更詳細地討論模式和模式演化。

#### 查詢的資料區域性

文件通常以單個連續字串形式進行儲存，編碼為 JSON、XML 或其二進位制變體（如 MongoDB 的 BSON）。如果應用程式經常需要訪問整個文件（例如，將其渲染至網頁），那麼儲存區域性會帶來效能優勢。如果將資料分割到多個表中（如 [圖 2-1](/v1/ddia_0201.png) 所示），則需要進行多次索引查詢才能將其全部檢索出來，這可能需要更多的磁碟查詢並花費更多的時間。

區域性僅僅適用於同時需要文件絕大部分內容的情況。即使只訪問文件其中的一小部分，資料庫通常需要載入整個文件，對於大型文件來說這種載入行為是很浪費的。更新文件時，通常需要整個重寫。只有不改變文件大小的修改才可以容易地原地執行。因此，通常建議保持相對小的文件，並避免增加文件大小的寫入【9】。這些效能限制大大減少了文件資料庫的實用場景。

值得指出的是，為了區域性而分組集合相關資料的想法並不侷限於文件模型。例如，Google 的 Spanner 資料庫在關係資料模型中提供了同樣的區域性屬性，允許模式宣告一個表的行應該交錯（巢狀）在父表內【27】。Oracle 類似地允許使用一個稱為 **多表索引叢集表（multi-table index cluster tables）** 的類似特性【28】。Bigtable 資料模型（用於 Cassandra 和 HBase）中的 **列族（column-family）** 概念與管理區域性的目的類似【29】。

在 [第三章](/v1_tw/ch3) 將還會看到更多關於區域性的內容。

#### 文件和關係資料庫的融合

自 2000 年代中期以來，大多數關係資料庫系統（MySQL 除外）都已支援 XML。這包括對 XML 文件進行本地修改的功能，以及在 XML 文件中進行索引和查詢的功能。這允許應用程式使用那種與文件資料庫應當使用的非常類似的資料模型。

從 9.3 版本開始的 PostgreSQL 【8】，從 5.7 版本開始的 MySQL 以及從版本 10.5 開始的 IBM DB2【30】也對 JSON 文件提供了類似的支援級別。鑑於用在 Web APIs 的 JSON 流行趨勢，其他關係資料庫很可能會跟隨他們的腳步並新增 JSON 支援。

在文件資料庫中，RethinkDB 在其查詢語言中支援類似關係的連線，一些 MongoDB 驅動程式可以自動解析資料庫引用（有效地執行客戶端連線，儘管這可能比在資料庫中執行的連線慢，需要額外的網路往返，並且最佳化更少）。

隨著時間的推移，關係資料庫和文件資料庫似乎變得越來越相似，這是一件好事：資料模型相互補充 [^v]，如果一個資料庫能夠處理類似文件的資料，並能夠對其執行關係查詢，那麼應用程式就可以使用最符合其需求的功能組合。

關係模型和文件模型的混合是未來資料庫一條很好的路線。

[^v]: Codd 對關係模型【1】的原始描述實際上允許在關係模式中存在與 JSON 文件非常相似的東西。他稱之為 **非簡單域（nonsimple domains）**。這個想法是，一行中的值不一定是一個像數字或字串一樣的原始資料型別，也可以是一個巢狀的關係（表），因此可以把一個任意巢狀的樹結構作為一個值，這很像 30 年後新增到 SQL 中的 JSON 或 XML 支援。


## 資料查詢語言

當引入關係模型時，關係模型包含了一種查詢資料的新方法：SQL 是一種 **宣告式** 查詢語言，而 IMS 和 CODASYL 使用 **命令式** 程式碼來查詢資料庫。那是什麼意思？

許多常用的程式語言是命令式的。例如，給定一個動物物種的列表，返回列表中的鯊魚可以這樣寫：

```js
function getSharks() {
    var sharks = [];
    for (var i = 0; i < animals.length; i++) {
        if (animals[i].family === "Sharks") {
            sharks.push(animals[i]);
        }
    }
    return sharks;
}
```

而在關係代數中，你可以這樣寫：

$$
sharks = \sigma_{family = "Sharks"}(animals)
$$

其中 $\sigma$（希臘字母西格瑪）是選擇運算子，只返回符合 `family="Sharks"` 條件的動物。

定義 SQL 時，它緊密地遵循關係代數的結構：

```sql
SELECT * FROM animals WHERE family ='Sharks';
```

命令式語言告訴計算機以特定順序執行某些操作。可以想象一下，逐行地遍歷程式碼，評估條件，更新變數，並決定是否再迴圈一遍。

在宣告式查詢語言（如 SQL 或關係代數）中，你只需指定所需資料的模式 - 結果必須符合哪些條件，以及如何將資料轉換（例如，排序，分組和聚合） - 但不是如何實現這一目標。資料庫系統的查詢最佳化器決定使用哪些索引和哪些連線方法，以及以何種順序執行查詢的各個部分。

宣告式查詢語言是迷人的，因為它通常比命令式 API 更加簡潔和容易。但更重要的是，它還隱藏了資料庫引擎的實現細節，這使得資料庫系統可以在無需對查詢做任何更改的情況下進行效能提升。

例如，在本節開頭所示的命令式程式碼中，動物列表以特定順序出現。如果資料庫想要在後臺回收未使用的磁碟空間，則可能需要移動記錄，這會改變動物出現的順序。資料庫能否安全地執行，而不會中斷查詢？

SQL 示例不確保任何特定的順序，因此不在意順序是否改變。但是如果查詢用命令式的程式碼來寫的話，那麼資料庫就永遠不可能確定程式碼是否依賴於排序。SQL 相當有限的功能性為資料庫提供了更多自動最佳化的空間。

最後，宣告式語言往往適合並行執行。現在，CPU 的速度透過核心（core）的增加變得更快，而不是以比以前更高的時鐘速度執行【31】。命令式程式碼很難在多個核心和多個機器之間並行化，因為它指定了指令必須以特定順序執行。宣告式語言更具有並行執行的潛力，因為它們僅指定結果的模式，而不指定用於確定結果的演算法。在適當情況下，資料庫可以自由使用查詢語言的並行實現【32】。

### Web 上的宣告式查詢

宣告式查詢語言的優勢不僅限於資料庫。為了說明這一點，讓我們在一個完全不同的環境中比較宣告式和命令式方法：一個 Web 瀏覽器。

假設你有一個關於海洋動物的網站。使用者當前正在檢視鯊魚頁面，因此你將當前所選的導航專案 “鯊魚” 標記為當前選中專案。

```html
<ul>
    <li class="selected">
        <p>Sharks</p>
        <ul>
            <li>Great White Shark</li>
            <li>Tiger Shark</li>
            <li>Hammerhead Shark</li>
        </ul>
    </li>
    <li><p>Whales</p>
        <ul>
            <li>Blue Whale</li>
            <li>Humpback Whale</li>
            <li>Fin Whale</li>
        </ul>
    </li>
</ul>
```

現在想讓當前所選頁面的標題具有一個藍色的背景，以便在視覺上突出顯示。使用 CSS 實現起來非常簡單：

```css
li.selected > p {
  background-color: blue;
}
```

這裡的 CSS 選擇器 `li.selected > p` 聲明了我們想要應用藍色樣式的元素的模式：即其直接父元素是具有 CSS 類 `selected` 的 `<li>` 元素的所有 `<p>` 元素。示例中的元素 `<p>Sharks</p>` 匹配此模式，但 `<p>Whales</p>` 不匹配，因為其 `<li>` 父元素缺少 `class="selected"`。

如果使用 XSL 而不是 CSS，你可以做類似的事情：

```xml
<xsl:template match="li[@class='selected']/p">
    <fo:block background-color="blue">
        <xsl:apply-templates/>
    </fo:block>
</xsl:template>
```

這裡的 XPath 表示式 `li[@class='selected']/p` 相當於上例中的 CSS 選擇器 `li.selected > p`。CSS 和 XSL 的共同之處在於，它們都是用於指定文件樣式的宣告式語言。

想象一下，必須使用命令式方法的情況會是如何。在 JavaScript 中，使用 **文件物件模型（DOM）** API，其結果可能如下所示：

```js
var liElements = document.getElementsByTagName("li");
for (var i = 0; i < liElements.length; i++) {
    if (liElements[i].className === "selected") {
        var children = liElements[i].childNodes;
        for (var j = 0; j < children.length; j++) {
            var child = children[j];
            if (child.nodeType === Node.ELEMENT_NODE && child.tagName === "P") {
                child.setAttribute("style", "background-color: blue");
            }
        }
    }
}
```

這段 JavaScript 程式碼命令式地將元素設定為藍色背景，但是程式碼看起來很糟糕。不僅比 CSS 和 XSL 等價物更長，更難理解，而且還有一些嚴重的問題：

* 如果選定的類被移除（例如，因為使用者點選了不同的頁面），即使程式碼重新執行，藍色背景也不會被移除 - 因此該專案將保持突出顯示，直到整個頁面被重新載入。使用 CSS，瀏覽器會自動檢測 `li.selected > p` 規則何時不再適用，並在選定的類被移除後立即移除藍色背景。

* 如果你想要利用新的 API（例如 `document.getElementsByClassName("selected")` 甚至 `document.evaluate()`）來提高效能，則必須重寫程式碼。另一方面，瀏覽器供應商可以在不破壞相容性的情況下提高 CSS 和 XPath 的效能。

在 Web 瀏覽器中，使用宣告式 CSS 樣式比使用 JavaScript 命令式地操作樣式要好得多。類似地，在資料庫中，使用像 SQL 這樣的宣告式查詢語言比使用命令式查詢 API 要好得多 [^vi]。

[^vi]: IMS 和 CODASYL 都使用命令式 API。應用程式通常使用 COBOL 程式碼遍歷資料庫中的記錄，一次一條記錄【2,16】。

### MapReduce查詢

MapReduce 是一個由 Google 推廣的程式設計模型，用於在多臺機器上批次處理大規模的資料【33】。一些 NoSQL 資料儲存（包括 MongoDB 和 CouchDB）支援有限形式的 MapReduce，作為在多個文件中執行只讀查詢的機制。

關於 MapReduce 更詳細的介紹在 [第十章](/v1_tw/ch10)。現在我們只簡要討論一下 MongoDB 使用的模型。

MapReduce 既不是一個宣告式的查詢語言，也不是一個完全命令式的查詢 API，而是處於兩者之間：查詢的邏輯用程式碼片段來表示，這些程式碼片段會被處理框架重複性呼叫。它基於 `map`（也稱為 `collect`）和 `reduce`（也稱為 `fold` 或 `inject`）函式，兩個函式存在於許多函數語言程式設計語言中。

最好舉例來解釋 MapReduce 模型。假設你是一名海洋生物學家，每當你看到海洋中的動物時，你都會在資料庫中新增一條觀察記錄。現在你想生成一個報告，說明你每月看到多少鯊魚。

在 PostgreSQL 中，你可以像這樣表述這個查詢：

```sql
SELECT
  date_trunc('month', observation_timestamp) AS observation_month,
  sum(num_animals)                           AS total_animals
FROM observations
WHERE family = 'Sharks'
GROUP BY observation_month;
```

`date_trunc('month', timestamp)` 函式用於確定包含 `timestamp` 的日曆月份，並返回代表該月份開始的另一個時間戳。換句話說，它將時間戳向下舍入到最近的月份。

這個查詢首先過濾觀察記錄，以只顯示鯊魚家族的物種，然後根據它們發生的日曆月份對觀察記錄進行分組，最後將在該月的所有觀察記錄中看到的動物數目加起來。

同樣的查詢用 MongoDB 的 MapReduce 功能可以按如下來表述：

```js
db.observations.mapReduce(function map() {
        var year = this.observationTimestamp.getFullYear();
        var month = this.observationTimestamp.getMonth() + 1;
        emit(year + "-" + month, this.numAnimals);
    },
    function reduce(key, values) {
        return Array.sum(values);
    },
    {
        query: {
          family: "Sharks"
        },
        out: "monthlySharkReport"
    });
```

* 可以宣告式地指定一個只考慮鯊魚種類的過濾器（這是 MongoDB 特定的 MapReduce 擴充套件）。
* 每個匹配查詢的文件都會呼叫一次 JavaScript 函式 `map`，將 `this` 設定為文件物件。
* `map` 函式發出一個鍵（包括年份和月份的字串，如 `"2013-12"` 或 `"2014-1"`）和一個值（該觀察記錄中的動物數量）。
* `map` 發出的鍵值對按鍵來分組。對於具有相同鍵（即，相同的月份和年份）的所有鍵值對，呼叫一次 `reduce` 函式。
* `reduce` 函式將特定月份內所有觀測記錄中的動物數量相加。
* 將最終的輸出寫入到 `monthlySharkReport` 集合中。

例如，假設 `observations` 集合包含這兩個文件：

```json
{
  observationTimestamp: Date.parse(  "Mon, 25 Dec 1995 12:34:56 GMT"),
  family: "Sharks",
  species: "Carcharodon carcharias",
  numAnimals: 3
}
{
  observationTimestamp: Date.parse("Tue, 12 Dec 1995 16:17:18 GMT"),
  family: "Sharks",
  species:    "Carcharias taurus",
  numAnimals: 4
}
```

對每個文件都會呼叫一次 `map` 函式，結果將是 `emit("1995-12",3)` 和 `emit("1995-12",4)`。隨後，以 `reduce("1995-12",[3,4])` 呼叫 `reduce` 函式，將返回 `7`。

map 和 reduce 函式在功能上有所限制：它們必須是 **純** 函式，這意味著它們只使用傳遞給它們的資料作為輸入，它們不能執行額外的資料庫查詢，也不能有任何副作用。這些限制允許資料庫在任何地方、以任何順序執行這些函式，並在失敗時重新執行它們。然而，map 和 reduce 函式仍然是強大的：它們可以解析字串、呼叫庫函式、執行計算等等。

MapReduce 是一個相當底層的程式設計模型，用於計算機叢集上的分散式執行。像 SQL 這樣的更高階的查詢語言可以用一系列的 MapReduce 操作來實現（見 [第十章](/v1_tw/ch10)），但是也有很多不使用 MapReduce 的分散式 SQL 實現。須注意，SQL 並沒有限制它只能在單一機器上執行，而 MapReduce 也並沒有壟斷所有的分散式查詢執行。

能夠在查詢中使用 JavaScript 程式碼是高階查詢的一個重要特性，但這不限於 MapReduce，一些 SQL 資料庫也可以用 JavaScript 函式進行擴充套件【34】。

MapReduce 的一個可用性問題是，必須編寫兩個密切合作的 JavaScript 函式，這通常比編寫單個查詢更困難。此外，宣告式查詢語言為查詢最佳化器提供了更多機會來提高查詢的效能。基於這些原因，MongoDB 2.2 添加了一種叫做 **聚合管道** 的宣告式查詢語言的支援【9】。用這種語言表述鯊魚計數查詢如下所示：

```js
db.observations.aggregate([
  { $match: { family: "Sharks" } },
  { $group: {
    _id: {
      year:  { $year:  "$observationTimestamp" },
      month: { $month: "$observationTimestamp" }
    },
    totalAnimals: { $sum: "$numAnimals" } }}
]);
```

聚合管道語言的表現力與（前述 PostgreSQL 例子的）SQL 子集相當，但是它使用基於 JSON 的語法而不是 SQL 那種接近英文句式的語法；這種差異也許只是口味問題。這個故事的寓意是：NoSQL 系統可能會意外發現自己只是重新發明了一套經過喬裝改扮的 SQL。


## 圖資料模型

如我們之前所見，多對多關係是不同資料模型之間具有區別性的重要特徵。如果你的應用程式大多數的關係是一對多關係（樹狀結構化資料），或者大多數記錄之間不存在關係，那麼使用文件模型是合適的。

但是，要是多對多關係在你的資料中很常見呢？關係模型可以處理多對多關係的簡單情況，但是隨著資料之間的連線變得更加複雜，將資料建模為圖形顯得更加自然。

一個圖由兩種物件組成：**頂點**（vertices，也稱為 **節點**，即 nodes，或 **實體**，即 entities），和 **邊**（edges，也稱為 **關係**，即 relationships，或 **弧**，即 arcs）。多種資料可以被建模為一個圖形。典型的例子包括：

社交圖譜
: 頂點是人，邊指示哪些人彼此認識。

網路圖譜
: 頂點是網頁，邊表示指向其他頁面的 HTML 連結。

公路或鐵路網路
: 頂點是交叉路口，邊代表它們之間的道路或鐵路線。

可以將那些眾所周知的演算法運用到這些圖上：例如，汽車導航系統搜尋道路網路中兩點之間的最短路徑，PageRank 可以用在網路圖上來確定網頁的流行程度，從而確定該網頁在搜尋結果中的排名。

在剛剛給出的例子中，圖中的所有頂點代表了相同型別的事物（人、網頁或交叉路口）。不過，圖並不侷限於這樣的同類資料：同樣強大的是，圖提供了一種一致的方式，用來在單個資料儲存中儲存完全不同型別的物件。例如，Facebook 維護一個包含許多不同型別的頂點和邊的單個圖：頂點表示人、地點、事件、簽到和使用者的評論；邊表示哪些人是好友、簽到發生在哪裡、誰評論了什麼帖子、誰參與了什麼事件等等【35】。

在本節中，我們將使用 [圖 2-5](/v1/ddia_0205.png) 所示的示例。它可以從社交網路或系譜資料庫中獲得：它顯示了兩個人，來自愛達荷州的 Lucy 和來自法國 Beaune 的 Alain。他們已婚，住在倫敦。

![](/v1/ddia_0205.png)

**圖 2-5 圖資料結構示例（框代表頂點，箭頭代表邊）**

有幾種不同但相關的方法用來構建和查詢圖中的資料。在本節中，我們將討論屬性圖模型（由 Neo4j，Titan 和 InfiniteGraph 實現）和三元組儲存（triple-store）模型（由 Datomic、AllegroGraph 等實現）。我們將檢視圖的三種宣告式查詢語言：Cypher，SPARQL 和 Datalog。除此之外，還有像 Gremlin 【36】這樣的圖形查詢語言和像 Pregel 這樣的圖形處理框架（見 [第十章](/v1_tw/ch10)）。

### 屬性圖

在屬性圖模型中，每個頂點（vertex）包括：

* 唯一的識別符號
* 一組出邊（outgoing edges）
* 一組入邊（ingoing edges）
* 一組屬性（鍵值對）

每條邊（edge）包括：

* 唯一識別符號
* 邊的起點（**尾部頂點**，即 tail vertex）
* 邊的終點（**頭部頂點**，即 head vertex）
* 描述兩個頂點之間關係型別的標籤
* 一組屬性（鍵值對）

可以將圖儲存看作由兩個關係表組成：一個儲存頂點，另一個儲存邊，如 [例 2-2]() 所示（該模式使用 PostgreSQL JSON 資料型別來儲存每個頂點或每條邊的屬性）。每條邊都儲存了它的頭部和尾部頂點；如果你想要某個頂點的入邊或出邊的集合，你可以分別透過 `head_vertex` 或 `tail_vertex` 來查詢 `edges` 表。

**例 2-2 使用關係模式來表示屬性圖**

```sql
CREATE TABLE vertices (
  vertex_id  INTEGER PRIMARY KEY,
  properties JSON
);

CREATE TABLE edges (
  edge_id     INTEGER PRIMARY KEY,
  tail_vertex INTEGER REFERENCES vertices (vertex_id),
  head_vertex INTEGER REFERENCES vertices (vertex_id),
  label       TEXT,
  properties  JSON
);

CREATE INDEX edges_tails ON edges (tail_vertex);
CREATE INDEX edges_heads ON edges (head_vertex);
```

關於這個模型的一些重要方面是：

1. 任何頂點都可以有一條邊連線到任何其他頂點。沒有模式限制哪種事物可不可以關聯。
2. 給定任何頂點，可以高效地找到它的入邊和出邊，從而遍歷圖，即沿著一系列頂點的路徑前後移動（這就是為什麼 [例 2-2]() 在 `tail_vertex` 和 `head_vertex` 列上都有索引的原因）。
3. 透過對不同型別的關係使用不同的標籤，可以在一個圖中儲存幾種不同的資訊，同時仍然保持一個清晰的資料模型。

這些特性為資料建模提供了很大的靈活性，如 [圖 2-5](/v1/ddia_0205.png) 所示。圖中顯示了一些傳統關係模式難以表達的事情，例如不同國家的不同地區結構（法國有省和大區，美國有縣和州），國中國的怪事（先忽略主權國家和民族錯綜複雜的爛攤子），不同的資料粒度（Lucy 現在的住所記錄具體到城市，而她的出生地點只是在一個州的級別）。

你可以想象該圖還能延伸出許多關於 Lucy 和 Alain 的事實，或其他人的其他更多的事實。例如，你可以用它來表示食物過敏（為每個過敏源增加一個頂點，並增加人與過敏源之間的一條邊來指示一種過敏情況），並連結到過敏源，每個過敏源具有一組頂點用來顯示哪些食物含有哪些物質。然後，你可以寫一個查詢，找出每個人吃什麼是安全的。圖在可演化性方面是富有優勢的：當你向應用程式新增功能時，可以輕鬆擴充套件圖以適應程式資料結構的變化。

### Cypher 查詢語言

Cypher 是屬性圖的宣告式查詢語言，為 Neo4j 圖形資料庫而發明【37】（它是以電影 “駭客帝國” 中的一個角色來命名的，而與密碼學中的加密演算法無關【38】）。

[例 2-3]() 顯示了將 [圖 2-5](/v1/ddia_0205.png) 的左邊部分插入圖形資料庫的 Cypher 查詢。你可以以類似的方式把圖的剩餘部分新增進去，但這裡為了文章可閱讀性而省略這部分的示例。每個頂點都有一個像 `USA` 或 `Idaho` 這樣的符號名稱，查詢的其他部分可以使用這些名稱在頂點之間建立邊，使用箭頭符號：`(Idaho) -[:WITHIN]-> (USA)` 建立一條標記為 `WITHIN` 的邊，`Idaho` 為尾節點，`USA` 為頭節點。

**例 2-3 將圖 2-5 中的資料子集表示為 Cypher 查詢**

```cypher
CREATE
  (NAmerica:Location {name:'North America', type:'continent'}),
  (USA:Location      {name:'United States', type:'country'  }),
  (Idaho:Location    {name:'Idaho',         type:'state'    }),
  (Lucy:Person       {name:'Lucy' }),
  (Idaho) -[:WITHIN]->  (USA)  -[:WITHIN]-> (NAmerica),
  (Lucy)  -[:BORN_IN]-> (Idaho)
```

當 [圖 2-5](/v1/ddia_0205.png) 的所有頂點和邊被新增到資料庫後，讓我們提些有趣的問題：例如，找到所有從美國移民到歐洲的人的名字。更確切地說，這裡我們想要找到符合下面條件的所有頂點，並且返回這些頂點的 `name` 屬性：該頂點擁有一條連到美國任一位置的 `BORN_IN` 邊，和一條連到歐洲的任一位置的 `LIVES_IN` 邊。

[例 2-4]() 展示了如何在 Cypher 中表達這個查詢。在 MATCH 子句中使用相同的箭頭符號來查詢圖中的模式：`(person) -[:BORN_IN]-> ()` 可以匹配 `BORN_IN` 邊的任意兩個頂點。該邊的尾節點被綁定了變數 `person`，頭節點則未被繫結。

**例 2-4 查詢所有從美國移民到歐洲的人的 Cypher 查詢：**

```cypher
MATCH
  (person) -[:BORN_IN]->  () -[:WITHIN*0..]-> (us:Location {name:'United States'}),
  (person) -[:LIVES_IN]-> () -[:WITHIN*0..]-> (eu:Location {name:'Europe'})
RETURN person.name
```

查詢按如下來解讀：

> 找到滿足以下兩個條件的所有頂點（稱之為 person 頂點）：
> 1.  `person` 頂點擁有一條到某個頂點的 `BORN_IN` 出邊。從那個頂點開始，沿著一系列 `WITHIN` 出邊最終到達一個型別為 `Location`，`name` 屬性為 `United States` 的頂點。
>
> 2. `person` 頂點還擁有一條 `LIVES_IN` 出邊。沿著這條邊，可以透過一系列 `WITHIN` 出邊最終到達一個型別為 `Location`，`name` 屬性為 `Europe` 的頂點。
>
> 對於每個這樣的 `person` 頂點，返回其 `name` 屬性。

執行這條查詢可能會有幾種可行的查詢路徑。這裡給出的描述建議首先掃描資料庫中的所有人，檢查每個人的出生地和居住地，然後只返回符合條件的那些人。

等價地，也可以從兩個 `Location` 頂點開始反向地查詢。假如 `name` 屬性上有索引，則可以高效地找到代表美國和歐洲的兩個頂點。然後，沿著所有 `WITHIN` 入邊，可以繼續查找出所有在美國和歐洲的位置（州、地區、城市等）。最後，查找出那些可以由 `BORN_IN` 或 `LIVES_IN` 入邊到那些位置頂點的人。

通常對於宣告式查詢語言來說，在編寫查詢語句時，不需要指定執行細節：查詢最佳化程式會自動選擇預測效率最高的策略，因此你可以專注於編寫應用程式的其他部分。

### SQL 中的圖查詢

[例 2-2]() 指出，可以在關係資料庫中表示圖資料。但是，如果圖資料已經以關係結構儲存，我們是否也可以使用 SQL 查詢它？

答案是肯定的，但有些困難。在關係資料庫中，你通常會事先知道在查詢中需要哪些連線。在圖查詢中，你可能需要在找到待查詢的頂點之前，遍歷可變數量的邊。也就是說，連線的數量事先並不確定。

在我們的例子中，這發生在 Cypher 查詢中的 `() -[:WITHIN*0..]-> ()` 規則中。一個人的 `LIVES_IN` 邊可以指向任何型別的位置：街道、城市、地區、國家等。一個城市可以在（WITHIN）一個地區內，一個地區可以在（WITHIN）一個州內，一個州可以在（WITHIN）一個國家內，等等。`LIVES_IN` 邊可以直接指向正在查詢的位置，或者一個在位置層次結構中隔了數層的位置。

在 Cypher 中，用 `WITHIN*0..` 非常簡潔地表述了上述事實：“沿著 `WITHIN` 邊，零次或多次”。它很像正則表示式中的 `*` 運算子。

自 SQL:1999，查詢可變長度遍歷路徑的思想可以使用稱為 **遞迴公用表表達式**（`WITH RECURSIVE` 語法）的東西來表示。[例 2-5]() 顯示了同樣的查詢 - 查詢從美國移民到歐洲的人的姓名 - 在 SQL 使用這種技術（PostgreSQL、IBM DB2、Oracle 和 SQL Server 均支援）來表述。但是，與 Cypher 相比，其語法非常笨拙。

**例 2-5  與示例 2-4 同樣的查詢，在 SQL 中使用遞迴公用表表達式表示**

```sql
WITH RECURSIVE
  -- in_usa 包含所有的美國境內的位置 ID
    in_usa(vertex_id) AS (
    SELECT vertex_id FROM vertices WHERE properties ->> 'name' = 'United States'
    UNION
    SELECT edges.tail_vertex FROM edges
      JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
      WHERE edges.label = 'within'
  ),
  -- in_europe 包含所有的歐洲境內的位置 ID
    in_europe(vertex_id) AS (
    SELECT vertex_id FROM vertices WHERE properties ->> 'name' = 'Europe'
    UNION
    SELECT edges.tail_vertex FROM edges
      JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
      WHERE edges.label = 'within' ),

  -- born_in_usa 包含了所有型別為 Person，且出生在美國的頂點
    born_in_usa(vertex_id) AS (
      SELECT edges.tail_vertex FROM edges
        JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
        WHERE edges.label = 'born_in' ),

  -- lives_in_europe 包含了所有型別為 Person，且居住在歐洲的頂點。
    lives_in_europe(vertex_id) AS (
      SELECT edges.tail_vertex FROM edges
        JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
        WHERE edges.label = 'lives_in')

  SELECT vertices.properties ->> 'name'
  FROM vertices
    JOIN born_in_usa ON vertices.vertex_id = born_in_usa.vertex_id
    JOIN lives_in_europe ON vertices.vertex_id = lives_in_europe.vertex_id;
```

* 首先，查詢 `name` 屬性為 `United States` 的頂點，將其作為 `in_usa` 頂點的集合的第一個元素。
* 從 `in_usa` 集合的頂點出發，沿著所有的 `within` 入邊，將其尾頂點加入同一集合，不斷遞迴直到所有 `within` 入邊都被訪問完畢。
* 同理，從 `name` 屬性為 `Europe` 的頂點出發，建立 `in_europe` 頂點的集合。
* 對於 `in_usa` 集合中的每個頂點，根據 `born_in` 入邊來查找出生在美國某個地方的人。
* 同樣，對於 `in_europe` 集合中的每個頂點，根據 `lives_in` 入邊來查詢居住在歐洲的人。
* 最後，把在美國出生的人的集合與在歐洲居住的人的集合相交。

同一個查詢，用某一個查詢語言可以寫成 4 行，而用另一個查詢語言需要 29 行，這恰恰說明了不同的資料模型是為不同的應用場景而設計的。選擇適合應用程式的資料模型非常重要。

### 三元組儲存和 SPARQL

三元組儲存模式大體上與屬性圖模型相同，用不同的詞來描述相同的想法。不過仍然值得討論，因為三元組儲存有很多現成的工具和語言，這些工具和語言對於構建應用程式的工具箱可能是寶貴的補充。

在三元組儲存中，所有資訊都以非常簡單的三部分表示形式儲存（**主語**，**謂語**，**賓語**）。例如，三元組 **(吉姆, 喜歡, 香蕉)** 中，**吉姆** 是主語，**喜歡** 是謂語（動詞），**香蕉** 是賓語。

三元組的主語相當於圖中的一個頂點。而賓語是下面兩者之一：

1. 原始資料型別中的值，例如字串或數字。在這種情況下，三元組的謂語和賓語相當於主語頂點上的屬性的鍵和值。例如，`(lucy, age, 33)` 就像屬性 `{"age": 33}` 的頂點 lucy。
2. 圖中的另一個頂點。在這種情況下，謂語是圖中的一條邊，主語是其尾部頂點，而賓語是其頭部頂點。例如，在 `(lucy, marriedTo, alain)` 中主語和賓語 `lucy` 和 `alain` 都是頂點，並且謂語 `marriedTo` 是連線他們的邊的標籤。

[例 2-6]() 展示了與 [例 2-3]() 相同的資料，以稱為 Turtle 的格式（Notation3（N3）【39】的一個子集）寫成三元組。

**例 2-6 圖 2-5 中的資料子集，表示為 Turtle 三元組**

```reStructuredText
@prefix : <urn:example:>.
_:lucy     a       :Person.
_:lucy     :name   "Lucy".
_:lucy     :bornIn _:idaho.
_:idaho    a       :Location.
_:idaho    :name   "Idaho".
_:idaho    :type   "state".
_:idaho    :within _:usa.
_:usa      a       :Location.
_:usa      :name   "United States".
_:usa      :type   "country".
_:usa      :within _:namerica.
_:namerica a       :Location.
_:namerica :name   "North America".
_:namerica :type   "continent".
```

在這個例子中，圖的頂點被寫為：`_:someName`。這個名字並不意味著這個檔案以外的任何東西。它的存在只是幫助我們明確哪些三元組引用了同一頂點。當謂語表示邊時，該賓語是一個頂點，如 `_:idaho :within _:usa.`。當謂語是一個屬性時，該賓語是一個字串，如 `_:usa :name "United States".`。

一遍又一遍地重複相同的主語看起來相當重複，但幸運的是，可以使用分號來說明關於同一主語的多個事情。這使得 Turtle 格式相當不錯，可讀性強：請參閱 [例 2-7]()。

**例 2-7 一種相對例 2-6 寫入資料的更為簡潔的方法。**

```
@prefix : <urn:example:>.
_:lucy      a :Person;   :name "Lucy";          :bornIn _:idaho.
_:idaho     a :Location; :name "Idaho";         :type "state";   :within _:usa.
_:usa       a :Location; :name "United States"; :type "country"; :within _:namerica.
_:namerica  a :Location; :name "North America"; :type "continent".
```

#### 語義網

如果你深入瞭解關於三元組儲存的資訊，可能會陷入關於**語義網**的討論漩渦中。三元組儲存模型其實是完全獨立於語義網存在的，例如，Datomic【40】作為一種三元組儲存資料庫 [^vii]，從未被用於語義網中。但是，由於在很多人眼中這兩者緊密相連，我們應該簡要地討論一下。

[^vii]: 從技術上講，Datomic 使用的是五元組而不是三元組，兩個額外的欄位是用於版本控制的元資料

從本質上講，語義網是一個簡單且合理的想法：網站已經將資訊釋出為文字和圖片供人類閱讀，為什麼不將資訊作為機器可讀的資料也釋出給計算機呢？（基於三元組模型的）**資源描述框架**（**RDF**）【41】，被用作不同網站以統一的格式釋出資料的一種機制，允許來自不同網站的資料自動合併成 **一個數據網路** —— 成為一種網際網路範圍內的 “通用語義網資料庫”。

不幸的是，語義網在二十一世紀初被過度炒作，但到目前為止沒有任何跡象表明已在實踐中應用，這使得許多人嗤之以鼻。它還飽受眼花繚亂的縮略詞、過於複雜的標準提案和狂妄自大的困擾。

然而，如果從過去的失敗中汲取教訓，語義網專案還是擁有很多優秀的成果。即使你沒有興趣在語義網上釋出 RDF 資料，三元組這種模型也是一種好的應用程式內部資料模型。

#### RDF 資料模型

[例 2-7]() 中使用的 Turtle 語言是一種用於 RDF 資料的人類可讀格式。有時候，RDF 也可以以 XML 格式編寫，不過完成同樣的事情會相對囉嗦，請參閱 [例 2-8]()。Turtle/N3 是更可取的，因為它更容易閱讀，像 Apache Jena 【42】這樣的工具可以根據需要在不同的 RDF 格式之間進行自動轉換。

**例 2-8 用 RDF/XML 語法表示例 2-7 的資料**

```xml
<rdf:RDF xmlns="urn:example:"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <Location rdf:nodeID="idaho">
        <name>Idaho</name>
        <type>state</type>
        <within>
            <Location rdf:nodeID="usa">
                <name>United States</name>
                <type>country</type>
                <within>
                    <Location rdf:nodeID="namerica">
                        <name>North America</name>
                        <type>continent</type>
                    </Location>
                </within>
            </Location>
        </within>
    </Location>
    <Person rdf:nodeID="lucy">
        <name>Lucy</name>
        <bornIn rdf:nodeID="idaho"/>
    </Person>
</rdf:RDF>
```

RDF 有一些奇怪之處，因為它是為了在網際網路上交換資料而設計的。三元組的主語，謂語和賓語通常是 URI。例如，謂語可能是一個 URI，如 `<http://my-company.com/namespace#within>` 或 `<http://my-company.com/namespace#lives_in>`，而不僅僅是 `WITHIN` 或 `LIVES_IN`。這個設計背後的原因是為了讓你能夠把你的資料和其他人的資料結合起來，如果他們賦予單詞 `within` 或者 `lives_in` 不同的含義，兩者也不會衝突，因為它們的謂語實際上是 `<http://other.org/foo#within>` 和 `<http://other.org/foo#lives_in>`。

從 RDF 的角度來看，URL `<http://my-company.com/namespace>` 不一定需要能解析成什麼東西，它只是一個名稱空間。為避免與 `http://URL` 混淆，本節中的示例使用不可解析的 URI，如 `urn:example:within`。幸運的是，你只需在檔案頂部對這個字首做一次宣告，後續就不用再管了。

### SPARQL 查詢語言

**SPARQL** 是一種用於三元組儲存的面向 RDF 資料模型的查詢語言【43】（它是 SPARQL 協議和 RDF 查詢語言的縮寫，發音為 “sparkle”）。SPARQL 早於 Cypher，並且由於 Cypher 的模式匹配借鑑於 SPARQL，這使得它們看起來非常相似【37】。

與之前相同的查詢 —— 查詢從美國移民到歐洲的人 —— 使用 SPARQL 比使用 Cypher 甚至更為簡潔（請參閱 [例 2-9]()）。

**例 2-9 與示例 2-4 相同的查詢，用 SPARQL 表示**

```sparql
PREFIX : <urn:example:>
SELECT ?personName WHERE {
  ?person :name ?personName.
  ?person :bornIn  / :within* / :name "United States".
  ?person :livesIn / :within* / :name "Europe".
}
```

結構非常相似。以下兩個表示式是等價的（SPARQL 中的變數以問號開頭）：

```
(person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (location)   # Cypher
?person :bornIn / :within* ?location.                   # SPARQL
```

因為 RDF 不區分屬性和邊，而只是將它們作為謂語，所以可以使用相同的語法來匹配屬性。在下面的表示式中，變數 `usa` 被繫結到任意 `name` 屬性為字串值 `"United States"` 的頂點：

```
(usa {name:'United States'})   # Cypher
?usa :name "United States".    # SPARQL
```

SPARQL 是一種很好的查詢語言 —— 儘管它構想的語義網從未實現，但它仍然是一種可用於應用程式內部的強大工具。

> #### 圖形資料庫與網狀模型相比較
>
> 在 “[文件資料庫是否在重蹈覆轍？](#文件資料庫是否在重蹈覆轍？)” 中，我們討論了 CODASYL 和關係模型如何競相解決 IMS 中的多對多關係問題。乍一看，CODASYL 的網狀模型看起來與圖模型相似。圖形資料庫是否只是喬裝改扮後捲土重來的 CODASYL？
>
> 不，他們在幾個重要方面有所不同：
>
> * 在 CODASYL 中，資料庫有一個模式，用於指定哪種記錄型別可以巢狀在其他記錄型別中。在圖形資料庫中，不存在這樣的限制：任何頂點都可以具有到其他任何頂點的邊。這為應用程式適應不斷變化的需求提供了更大的靈活性。
> * 在 CODASYL 中，達到特定記錄的唯一方法是遍歷其中的一個訪問路徑。在圖形資料庫中，可以透過其唯一 ID 直接引用任何頂點，也可以使用索引來查詢具有特定值的頂點。
> * 在 CODASYL 中，記錄的子專案是一個有序集合，所以資料庫必須去管理它們的次序（這會影響儲存佈局），並且應用程式在插入新記錄到資料庫時必須關注新記錄在這些集合中的位置。在圖形資料庫中，頂點和邊是無序的（只能在查詢時對結果進行排序）。
> * 在 CODASYL 中，所有查詢都是命令式的，難以編寫，並且很容易因架構變化而受到破壞。在圖形資料庫中，你可以在命令式程式碼中手寫遍歷過程，但大多數圖形資料庫都支援高階宣告式查詢，如 Cypher 或 SPARQL。
>
>

### 基礎：Datalog

**Datalog** 是比 SPARQL、Cypher 更古老的語言，在 20 世紀 80 年代被學者廣泛研究【44,45,46】。它在軟體工程師中不太知名，但是它是重要的，因為它為以後的查詢語言提供了基礎。

實踐中，Datalog 在有限的幾個資料系統中使用：例如，它是 Datomic 【40】的查詢語言，Cascalog 【47】是一種用於查詢 Hadoop 大資料集的 Datalog 實現 [^viii]。

[^viii]: Datomic 和 Cascalog 使用 Datalog 的 Clojure S 表示式語法。在下面的例子中使用了一個更容易閱讀的 Prolog 語法，但兩者沒有任何功能差異。

Datalog 的資料模型類似於三元組模式，但進行了一點泛化。把三元組寫成 **謂語**（**主語，賓語**），而不是寫成三元組（**主語，謂語，賓語**）。[例 2-10]() 顯示了如何用 Datalog 寫入我們的例子中的資料。

**例 2-10 用 Datalog 來表示圖 2-5 中的資料子集**

```prolog
name(namerica, 'North America').
type(namerica, continent).

name(usa, 'United States').
type(usa, country).
within(usa, namerica).

name(idaho, 'Idaho').
type(idaho, state).
within(idaho, usa).

name(lucy, 'Lucy').
born_in(lucy, idaho).
```

既然已經定義了資料，我們可以像之前一樣編寫相同的查詢，如 [例 2-11]() 所示。它看起來與 Cypher 或 SPARQL 的語法差異較大，但請不要抗拒它。Datalog 是 Prolog 的一個子集，如果你是計算機科學專業的學生，可能已經見過 Prolog。

**例 2-11 與示例 2-4 相同的查詢，用 Datalog 表示**

```
within_recursive(Location, Name) :- name(Location, Name). /* Rule 1 */

within_recursive(Location, Name) :- within(Location, Via), /* Rule 2 */
                  within_recursive(Via, Name).

migrated(Name, BornIn, LivingIn) :- name(Person, Name), /* Rule 3 */
                                    born_in(Person, BornLoc),
                                    within_recursive(BornLoc, BornIn),
                                    lives_in(Person, LivingLoc),
                                    within_recursive(LivingLoc, LivingIn).

?- migrated(Who, 'United States', 'Europe'). /* Who = 'Lucy'. */
```

Cypher 和 SPARQL 一上來就用 SELECT 直奔主題，但是 Datalog 一次只進行一小步。我們定義 **規則**，以將新謂語告訴資料庫：在這裡，我們定義了兩個新的謂語，`within_recursive` 和 `migrated`。這些謂語不是儲存在資料庫中的三元組，而是從資料或其他規則派生而來的。規則可以引用其他規則，就像函式可以呼叫其他函式或者遞迴地呼叫自己一樣。像這樣，複雜的查詢可以藉由小的磚瓦構建起來。

在規則中，以大寫字母開頭的單詞是變數，謂語的匹配方式則與 Cypher 和 SPARQL 中的一樣。例如，`name(Location, Name)` 透過變數繫結 `Location = namerica` 和 `Name = 'North America'` 可以匹配三元組 `name(namerica, 'North America')`。

要是系統可以在 `:-` 運算子的右側找到與所有謂語的一個匹配，就運用該規則。當規則運用時，就好像透過 `:-` 的左側將其新增到資料庫（將變數替換成它們匹配的值）。

因此，一種可能的應用規則的方式是：

1. 資料庫存在 `name (namerica, 'North America')`，故運用規則 1。它生成 `within_recursive (namerica, 'North America')`。
2. 資料庫存在 `within (usa, namerica)`，在上一步驟中生成 `within_recursive (namerica, 'North America')`，故運用規則 2。它會產生 `within_recursive (usa, 'North America')`。
3. 資料庫存在 `within (idaho, usa)`，在上一步生成 `within_recursive (usa, 'North America')`，故運用規則 2。它產生 `within_recursive (idaho, 'North America')`。

透過重複應用規則 1 和 2，`within_recursive` 謂語可以告訴我們在資料庫中包含北美（或任何其他位置名稱）的所有位置。這個過程如 [圖 2-6](/v1/ddia_0206.png) 所示。

![](/v1/ddia_0206.png)

**圖 2-6 使用示例 2-11 中的 Datalog 規則來確定愛達荷州在北美。**

現在規則 3 可以找到出生在某個地方 `BornIn` 的人，並住在某個地方 `LivingIn`。透過查詢 `BornIn ='United States'` 和 `LivingIn ='Europe'`，並將此人作為變數 `Who`，讓 Datalog 系統找出變數 `Who` 會出現哪些值。因此，最後得到了與早先的 Cypher 和 SPARQL 查詢相同的答案。

相對於本章討論的其他查詢語言，我們需要採取不同的思維方式來思考 Datalog 方法，但這是一種非常強大的方法，因為規則可以在不同的查詢中進行組合和重用。雖然對於簡單的一次性查詢，顯得不太方便，但是它可以更好地處理資料很複雜的情況。


## 本章小結

資料模型是一個巨大的課題，在本章中，我們快速瀏覽了各種不同的模型。我們沒有足夠的篇幅來詳述每個模型的細節，但是希望這個概述足以激起你的興趣，以更多地瞭解最適合你的應用需求的模型。

在歷史上，資料最開始被表示為一棵大樹（層次資料模型），但是這不利於表示多對多的關係，所以發明了關係模型來解決這個問題。最近，開發人員發現一些應用程式也不適合採用關係模型。新的非關係型 “NoSQL” 資料儲存分化為兩個主要方向：

**文件資料庫**
: 主要關注自我包含的資料文件，而且文件之間的關係非常稀少。

**圖形資料庫**
: 用於相反的場景：任意事物之間都可能存在潛在的關聯。

這三種模型（文件，關係和圖形）在今天都被廣泛使用，並且在各自的領域都發揮很好。一個模型可以用另一個模型來模擬 —— 例如，圖資料可以在關係資料庫中表示 —— 但結果往往是糟糕的。這就是為什麼我們有著針對不同目的的不同系統，而不是一個單一的萬能解決方案。

文件資料庫和圖資料庫有一個共同點，那就是它們通常不會將儲存的資料強制約束為特定模式，這可以使應用程式更容易適應不斷變化的需求。但是應用程式很可能仍會假定資料具有一定的結構；區別僅在於模式是**明確的**（寫入時強制）還是**隱含的**（讀取時處理）。

每個資料模型都具有各自的查詢語言或框架，我們討論了幾個例子：SQL、MapReduce、MongoDB 的聚合管道、Cypher、SPARQL 和 Datalog。我們也談到了 CSS 和 XSL/XPath，它們雖然不是資料庫查詢語言，但有著有趣的相似之處。

雖然我們已經覆蓋了很多層面，但仍然有許多資料模型沒有提到。舉幾個簡單的例子：

* 使用基因組資料的研究人員通常需要執行 **序列相似性搜尋**，這意味著需要一個很長的字串（代表一個 DNA 序列），並在一個擁有類似但不完全相同的字串的大型資料庫中尋找匹配。這裡所描述的資料庫都不能處理這種用法，這就是為什麼研究人員編寫了像 GenBank 這樣的專門的基因組資料庫軟體的原因【48】。
* 粒子物理學家數十年來一直在進行大資料型別的大規模資料分析，像大型強子對撞機（LHC）這樣的專案現在會處理數百 PB 的資料！在這樣的規模下，需要定製解決方案來阻止硬體成本的失控【49】。
* **全文搜尋** 可以說是一種經常與資料庫一起使用的資料模型。資訊檢索是一個很大的專業課題，我們不會在本書中詳細介紹，但是我們將在第三章和第三部分中介紹搜尋索引。

讓我們暫時將其放在一邊。在 [下一章](/v1_tw/ch3) 中，我們將討論在 **實現** 本章描述的資料模型時會遇到的一些權衡。


## 參考文獻

1. Edgar F. Codd: “[A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf),” *Communications of the ACM*, volume 13, number 6, pages 377–387, June 1970. [doi:10.1145/362384.362685](http://dx.doi.org/10.1145/362384.362685)
1. Michael Stonebraker and Joseph M. Hellerstein: “[What Goes Around Comes Around](http://mitpress2.mit.edu/books/chapters/0262693143chapm1.pdf),” in *Readings in Database Systems*, 4th edition, MIT Press, pages 2–41, 2005. ISBN: 978-0-262-69314-1
1. Pramod J. Sadalage and Martin Fowler: *NoSQL Distilled*. Addison-Wesley, August 2012. ISBN: 978-0-321-82662-6
1. Eric Evans: “[NoSQL: What's in a Name?](https://web.archive.org/web/20190623045155/http://blog.sym-link.com/2009/10/30/nosql_whats_in_a_name.html),” *blog.sym-link.com*, October 30, 2009.
1. James Phillips: “[Surprises in Our NoSQL Adoption Survey](http://blog.couchbase.com/nosql-adoption-survey-surprises),” *blog.couchbase.com*, February 8, 2012.
1. Michael Wagner: *SQL/XML:2006 – Evaluierung der Standardkonformität ausgewählter Datenbanksysteme*. Diplomica Verlag, Hamburg, 2010. ISBN: 978-3-836-64609-3
1. “[XML Data (SQL Server)](https://docs.microsoft.com/en-us/sql/relational-databases/xml/xml-data-sql-server?view=sql-server-ver15),” SQL Server documentation, *docs.microsoft.com*, 2013.
1. “[PostgreSQL 9.3.1 Documentation](http://www.postgresql.org/docs/9.3/static/index.html),” The PostgreSQL Global Development Group, 2013.
1. “[The MongoDB 2.4 Manual](http://docs.mongodb.org/manual/),” MongoDB, Inc., 2013.
1. “[RethinkDB 1.11 Documentation](http://www.rethinkdb.com/docs/),” *rethinkdb.com*, 2013.
1. “[Apache CouchDB 1.6 Documentation](http://docs.couchdb.org/en/latest/),” *docs.couchdb.org*, 2014.
1. Lin Qiao, Kapil Surlaker, Shirshanka Das, et al.: “[On Brewing Fresh Espresso: LinkedIn’s Distributed Data Serving Platform](http://www.slideshare.net/amywtang/espresso-20952131),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Rick Long, Mark Harrington, Robert Hain, and Geoff Nicholls: [*IMS Primer*](http://www.redbooks.ibm.com/redbooks/pdfs/sg245352.pdf). IBM Redbook SG24-5352-00, IBM International Technical Support Organization, January 2000.
1. Stephen D. Bartlett: “[IBM’s IMS—Myths, Realities, and Opportunities](https://public.dhe.ibm.com/software/data/ims/pdf/TCG2013015LI.pdf),” The Clipper Group Navigator, TCG2013015LI, July 2013.
1. Sarah Mei: “[Why You Should Never Use MongoDB](http://www.sarahmei.com/blog/2013/11/11/why-you-should-never-use-mongodb/),” *sarahmei.com*, November 11, 2013.
1. J. S. Knowles and D. M. R. Bell: “The CODASYL Model,” in *Databases—Role and Structure: An Advanced Course*, edited by P. M. Stocker, P. M. D. Gray, and M. P. Atkinson, pages 19–56, Cambridge University Press, 1984. ISBN: 978-0-521-25430-4
1. Charles W. Bachman: “[The Programmer as Navigator](http://dl.acm.org/citation.cfm?id=362534),” *Communications of the ACM*, volume 16, number 11, pages 653–658, November 1973. [doi:10.1145/355611.362534](http://dx.doi.org/10.1145/355611.362534)
1. Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton: “[Architecture of a Database System](http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf),” *Foundations and Trends in Databases*, volume 1, number 2, pages 141–259, November 2007. [doi:10.1561/1900000002](http://dx.doi.org/10.1561/1900000002)
1. Sandeep Parikh and Kelly Stirman: “[Schema Design for Time Series Data in MongoDB](http://blog.mongodb.org/post/65517193370/schema-design-for-time-series-data-in-mongodb),” *blog.mongodb.org*, October 30, 2013.
1. Martin Fowler: “[Schemaless Data Structures](http://martinfowler.com/articles/schemaless/),” *martinfowler.com*, January 7, 2013.
1. Amr Awadallah: “[Schema-on-Read vs. Schema-on-Write](http://www.slideshare.net/awadallah/schemaonread-vs-schemaonwrite),” at *Berkeley EECS RAD Lab Retreat*, Santa Cruz, CA, May 2009.
1. Martin Odersky: “[The Trouble with Types](http://www.infoq.com/presentations/data-types-issues),” at *Strange Loop*, September 2013.
1. Conrad Irwin: “[MongoDB—Confessions of a PostgreSQL Lover](https://speakerdeck.com/conradirwin/mongodb-confessions-of-a-postgresql-lover),” at *HTML5DevConf*, October 2013.
1. “[Percona Toolkit Documentation: pt-online-schema-change](http://www.percona.com/doc/percona-toolkit/2.2/pt-online-schema-change.html),” Percona Ireland Ltd., 2013.
1. Rany Keddo, Tobias Bielohlawek, and Tobias Schmidt: “[Large Hadron Migrator](https://github.com/soundcloud/lhm),” SoundCloud, 2013.
1. Shlomi Noach: “[gh-ost: GitHub's Online Schema Migration Tool for MySQL](http://githubengineering.com/gh-ost-github-s-online-migration-tool-for-mysql/),” *githubengineering.com*, August 1, 2016.
1. James C. Corbett, Jeffrey Dean, Michael Epstein, et al.: “[Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/),” at *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
1. Donald K. Burleson: “[Reduce I/O with Oracle Cluster Tables](https://web.archive.org/web/20231207233228/http://www.dba-oracle.com/oracle_tip_hash_index_cluster_table.htm),” *dba-oracle.com*.
1. Fay Chang, Jeffrey Dean, Sanjay Ghemawat, et al.: “[Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Bobbie J. Cochrane and Kathy A. McKnight: “[DB2 JSON Capabilities, Part 1: Introduction to DB2 JSON](https://web.archive.org/web/20180516203043/https://www.ibm.com/developerworks/data/library/techarticle/dm-1306nosqlforjson1/),” IBM developerWorks, June 20, 2013.
1. Herb Sutter: “[The Free Lunch Is Over: A Fundamental Turn Toward Concurrency in Software](http://www.gotw.ca/publications/concurrency-ddj.htm),” *Dr. Dobb's Journal*, volume 30, number 3, pages 202-210, March 2005.
1. Joseph M. Hellerstein: “[The Declarative Imperative: Experiences and Conjectures in Distributed Logic](http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-90.pdf),” Electrical Engineering and Computer Sciences, University of California at Berkeley, Tech report UCB/EECS-2010-90, June 2010.
1. Jeffrey Dean and Sanjay Ghemawat: “[MapReduce: Simplified Data Processing on Large Clusters](https://research.google/pubs/pub62/),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Craig Kerstiens: “[JavaScript in Your Postgres](https://blog.heroku.com/javascript_in_your_postgres),” *blog.heroku.com*, June 5, 2013.
1. Nathan Bronson, Zach Amsden, George Cabrera, et al.: “[TAO: Facebook’s Distributed Data Store for the Social Graph](https://www.usenix.org/conference/atc13/technical-sessions/presentation/bronson),” at *USENIX Annual Technical Conference* (USENIX ATC), June 2013.
1. “[Apache TinkerPop3.2.3 Documentation](http://tinkerpop.apache.org/docs/3.2.3/reference/),” *tinkerpop.apache.org*, October 2016.
1. “[The Neo4j Manual v2.0.0](http://docs.neo4j.org/chunked/2.0.0/index.html),” Neo Technology, 2013.
1. Emil Eifrem: [Twitter correspondence](https://twitter.com/emileifrem/status/419107961512804352), January 3, 2014.
1. David Beckett and Tim Berners-Lee: “[Turtle – Terse RDF Triple Language](http://www.w3.org/TeamSubmission/turtle/),” W3C Team Submission, March 28, 2011.
1. “[Datomic Development Resources](http://docs.datomic.com/),” Metadata Partners, LLC, 2013.
1. W3C RDF Working Group: “[Resource Description Framework (RDF)](http://www.w3.org/RDF/),” *w3.org*, 10 February 2004.
1. “[Apache Jena](http://jena.apache.org/),” Apache Software Foundation.
1. Steve Harris, Andy Seaborne, and Eric Prud'hommeaux: “[SPARQL 1.1 Query Language](http://www.w3.org/TR/sparql11-query/),” W3C Recommendation, March 2013.
1. Todd J. Green, Shan Shan Huang, Boon Thau Loo, and Wenchao Zhou: “[Datalog and Recursive Query Processing](http://blogs.evergreen.edu/sosw/files/2014/04/Green-Vol5-DBS-017.pdf),” *Foundations and Trends in Databases*, volume 5, number 2, pages 105–195, November 2013. [doi:10.1561/1900000017](http://dx.doi.org/10.1561/1900000017)
1. Stefano Ceri, Georg Gottlob, and Letizia Tanca: “[What You Always Wanted to Know About Datalog (And Never Dared to Ask)](https://www.researchgate.net/profile/Letizia_Tanca/publication/3296132_What_you_always_wanted_to_know_about_Datalog_and_never_dared_to_ask/links/0fcfd50ca2d20473ca000000.pdf),” *IEEE Transactions on Knowledge and Data Engineering*, volume 1, number 1, pages 146–166, March 1989. [doi:10.1109/69.43410](http://dx.doi.org/10.1109/69.43410)
1. Serge Abiteboul, Richard Hull, and Victor Vianu: [*Foundations of Databases*](http://webdam.inria.fr/Alice/). Addison-Wesley, 1995. ISBN: 978-0-201-53771-0, available online at *webdam.inria.fr/Alice*
1. Nathan Marz: “[Cascalog](https://github.com/nathanmarz/cascalog),” *github.com*.
1. Dennis A. Benson, Ilene Karsch-Mizrachi, David J. Lipman, et al.: “[GenBank](https://academic.oup.com/nar/article/36/suppl_1/D25/2507746),” *Nucleic Acids Research*, volume 36, Database issue, pages D25–D30, December 2007. [doi:10.1093/nar/gkm929](http://dx.doi.org/10.1093/nar/gkm929)
1. Fons Rademakers: “[ROOT for Big Data Analysis](https://indico.cern.ch/event/246453/contributions/1566610/attachments/423154/587535/ROOT-BigData-Analysis-London-2013.pdf),” at *Workshop on the Future of Big Data Management*, London, UK, June 2013.

================================================
FILE: content/v1_tw/ch3.md
================================================
---
title: "第三章：儲存與檢索"
linkTitle: "3. 儲存與檢索"
weight: 103
breadcrumbs: false
---


![](/map/ch03.png)

> 建立秩序，省卻搜尋
>
> —— 德國諺語

一個資料庫在最基礎的層次上需要完成兩件事情：當你把資料交給資料庫時，它應當把資料儲存起來；而後當你向資料庫要資料時，它應當把資料返回給你。

在 [第二章](/v1_tw/ch2) 中，我們討論了資料模型和查詢語言，即程式設計師將資料錄入資料庫的格式，以及再次要回資料的機制。在本章中我們會從資料庫的視角來討論同樣的問題：資料庫如何儲存我們提供的資料，以及如何在我們需要時重新找到資料。

作為程式設計師，為什麼要關心資料庫內部儲存與檢索的機理？你可能不會去從頭開始實現自己的儲存引擎，但是你 **確實** 需要從許多可用的儲存引擎中選擇一個合適的。而且為了讓儲存引擎能在你的工作負載型別上執行良好，你也需要大致瞭解儲存引擎在底層究竟做了什麼。

特別需要注意，針對 **事務性** 負載最佳化的和針對 **分析性** 負載最佳化的儲存引擎之間存在巨大差異。稍後我們將在 “[事務處理還是分析？](#事務處理還是分析？)” 一節中探討這一區別，並在 “[列式儲存](#列式儲存)” 中討論一系列針對分析性負載而最佳化的儲存引擎。

但首先，我們將從你可能已經很熟悉的兩大類資料庫（傳統的關係型資料庫和很多所謂的 “NoSQL” 資料庫）中使用的 **儲存引擎** 來開始本章的內容。我們將研究兩大類儲存引擎：**日誌結構（log-structured）** 的儲存引擎，以及 **面向頁面（page-oriented）** 的儲存引擎（例如 B 樹）。

## 驅動資料庫的資料結構

世界上最簡單的資料庫可以用兩個 Bash 函式實現：

```bash
#!/bin/bash
db_set () {
  echo "$1,$2" >> database
}

db_get () {
  grep "^$1," database | sed -e "s/^$1,//" | tail -n 1
}
```

這兩個函式實現了鍵值儲存的功能。執行 `db_set key value` 會將 **鍵（key）** 和 **值（value）** 儲存在資料庫中。鍵和值（幾乎）可以是你喜歡的任何東西，例如，值可以是 JSON 文件。然後呼叫 `db_get key` 會查詢與該鍵關聯的最新值並將其返回。

麻雀雖小，五臟俱全：

```bash
$ db_set 123456 '{"name":"London","attractions":["Big Ben","London Eye"]}'

$ db_set 42 '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
```

底層的儲存格式非常簡單：一個文字檔案，每行包含一條逗號分隔的鍵值對（忽略轉義問題的話，大致與 CSV 檔案類似）。每次對 `db_set` 的呼叫都會向檔案末尾追加記錄，所以更新鍵的時候舊版本的值不會被覆蓋 —— 因而查詢最新值的時候，需要找到檔案中鍵最後一次出現的位置（因此 `db_get` 中使用了 `tail -n 1`）。

```bash
$ db_set 42 '{"name":"San Francisco","attractions":["Exploratorium"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Exploratorium"]}

$ cat database
123456,{"name":"London","attractions":["Big Ben","London Eye"]}
42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
42,{"name":"San Francisco","attractions":["Exploratorium"]}
```

`db_set` 函式對於極其簡單的場景其實有非常好的效能，因為在檔案尾部追加寫入通常是非常高效的。與 `db_set` 做的事情類似，許多資料庫在內部使用了 **日誌（log）**，也就是一個 **僅追加（append-only）** 的資料檔案。真正的資料庫有更多的問題需要處理（如併發控制，回收硬碟空間以避免日誌無限增長，處理錯誤與部分寫入的記錄），但基本原理是一樣的。日誌極其有用，我們還將在本書的其它部分重複見到它好幾次。

> **日誌（log）** 這個詞通常指應用日誌：即應用程式輸出的描述正在發生的事情的文字。本書在更普遍的意義下使用 **日誌** 這一詞：一個僅追加的記錄序列。它可能壓根就不是給人類看的，它可以使用二進位制格式，並僅能由其他程式讀取。

另一方面，如果這個資料庫中有著大量記錄，則這個 `db_get` 函式的效能會非常糟糕。每次你想查詢一個鍵時，`db_get` 必須從頭到尾掃描整個資料庫檔案來查詢鍵的出現。用演算法的語言來說，查詢的開銷是 `O(n)` ：如果資料庫記錄數量 n 翻了一倍，查詢時間也要翻一倍。這就不好了。

為了高效查詢資料庫中特定鍵的值，我們需要一個資料結構：**索引（index）**。本章將介紹一系列的索引結構，並在它們之間進行比較。索引背後的大致思想是透過儲存一些額外的元資料作為路標來幫助你找到想要的資料。如果你想以幾種不同的方式搜尋同一份資料，那麼你也許需要在資料的不同部分上建立多個索引。

索引是從主資料衍生的 **額外的（additional）** 結構。許多資料庫允許新增與刪除索引，這不會影響資料的內容，而只會影響查詢的效能。維護額外的結構會產生開銷，特別是在寫入時。寫入效能很難超過簡單地追加寫入檔案，因為追加寫入是最簡單的寫入操作。任何型別的索引通常都會減慢寫入速度，因為每次寫入資料時都需要更新索引。

這是儲存系統中一個重要的權衡：精心選擇的索引加快了讀查詢的速度，但是每個索引都會拖慢寫入速度。因為這個原因，資料庫預設並不會索引所有的內容，而需要你，也就是程式設計師或資料庫管理員（DBA），基於對應用的典型查詢模式的瞭解來手動選擇索引。你可以選擇那些能為應用帶來最大收益而且又不會引入超出必要開銷的索引。


### 雜湊索引

讓我們從 **鍵值資料（key-value Data）** 的索引開始。這不是你可以索引的唯一資料型別，但鍵值資料是很常見的。在引入更複雜的索引之前，它是重要的第一步。

鍵值儲存與在大多數程式語言中可以找到的 **字典（dictionary）** 型別非常相似，通常字典都是用 **雜湊對映（hash map）** 或 **散列表（hash table）** 實現的。雜湊對映在許多演算法教科書中都有描述【1,2】，所以這裡我們不會討論它的工作細節。既然我們已經可以用雜湊對映來表示 **記憶體中** 的資料結構，為什麼不使用它來索引 **硬碟上** 的資料呢？

假設我們的資料儲存只是一個追加寫入的檔案，就像前面的例子一樣，那麼最簡單的索引策略就是：保留一個記憶體中的雜湊對映，其中每個鍵都對映到資料檔案中的一個位元組偏移量，指明了可以找到對應值的位置，如 [圖 3-1](/v1/ddia_0301.png) 所示。當你將新的鍵值對追加寫入檔案中時，還要更新雜湊對映，以反映剛剛寫入的資料的偏移量（這同時適用於插入新鍵與更新現有鍵）。當你想查詢一個值時，使用雜湊對映來查詢資料檔案中的偏移量，**尋找（seek）** 該位置並讀取該值即可。

![](/v1/ddia_0301.png)

**圖 3-1 以類 CSV 格式儲存鍵值對的日誌，並使用記憶體雜湊對映進行索引。**

聽上去簡單，但這是一個可行的方法。現實中，Bitcask 實際上就是這麼做的（Riak 中預設的儲存引擎）【3】。Bitcask 提供高效能的讀取和寫入操作，但要求所有的鍵必須能放入可用記憶體中，因為雜湊對映完全保留在記憶體中。而資料值可以使用比可用記憶體更多的空間，因為可以在硬碟上透過一次硬碟查詢操作來載入所需部分，如果資料檔案的那部分已經在檔案系統快取中，則讀取根本不需要任何硬碟 I/O。

像 Bitcask 這樣的儲存引擎非常適合每個鍵的值經常更新的情況。例如，鍵可能是某個貓咪影片的網址（URL），而值可能是該影片被播放的次數（每次有人點選播放按鈕時遞增）。在這種型別的工作負載中，有很多寫操作，但是沒有太多不同的鍵 —— 每個鍵有很多的寫操作，但是將所有鍵儲存在記憶體中是可行的。

到目前為止，我們只是在追加寫入一個檔案 —— 所以如何避免最終用完硬碟空間？一種好的解決方案是，將日誌分為特定大小的 **段（segment）**，當日誌增長到特定尺寸時關閉當前段檔案，並開始寫入一個新的段檔案。然後，我們就可以對這些段進行 **壓縮（compaction）**，如 [圖 3-2](/v1/ddia_0302.png) 所示。這裡的壓縮意味著在日誌中丟棄重複的鍵，只保留每個鍵的最近更新。

![](/v1/ddia_0302.png)

**圖 3-2 鍵值更新日誌（統計貓咪影片的播放次數）的壓縮，只保留每個鍵的最近值**

而且，由於壓縮經常會使得段變得很小（假設在一個段內鍵被平均重寫了好幾次），我們也可以在執行壓縮的同時將多個段合併在一起，如 [圖 3-3](/v1/ddia_0303.png) 所示。段被寫入後永遠不會被修改，所以合併的段被寫入一個新的檔案。凍結段的合併和壓縮可以在後臺執行緒中完成，這個過程進行的同時，我們仍然可以繼續使用舊的段檔案來正常提供讀寫請求。合併過程完成後，我們將讀取請求轉換為使用新合併的段而不是舊的段 —— 然後舊的段檔案就可以簡單地刪除掉了。

![](/v1/ddia_0303.png)

**圖 3-3 同時執行壓縮和分段合併**

每個段現在都有自己的記憶體散列表，將鍵對映到檔案偏移量。為了找到一個鍵的值，我們首先檢查最近的段的雜湊對映；如果鍵不存在，我們就檢查第二個最近的段，依此類推。合併過程將保持段的數量足夠小，所以查詢過程不需要檢查太多的雜湊對映。

要讓這個簡單的想法在實際中能工作會涉及到大量的細節。簡單來說，下面幾點都是實現過程中需要認真考慮的問題：

* 檔案格式

  CSV 不是日誌的最佳格式。使用二進位制格式更快，更簡單：首先以位元組為單位對字串的長度進行編碼，然後是原始的字串（不需要轉義）。

* 刪除記錄

  如果要刪除一個鍵及其關聯的值，則必須在資料檔案中追加一個特殊的刪除記錄（邏輯刪除，有時被稱為墓碑，即 tombstone）。當日誌段被合併時，合併過程會透過這個墓碑知道要將被刪除鍵的所有歷史值都丟棄掉。

* 崩潰恢復

  如果資料庫重新啟動，則記憶體雜湊對映將丟失。原則上，你可以透過從頭到尾讀取整個段檔案並記錄下來每個鍵的最近值來恢復每個段的雜湊對映。但是，如果段檔案很大，可能需要很長時間，這會使服務的重啟比較痛苦。Bitcask 透過將每個段的雜湊對映的快照儲存在硬碟上來加速恢復，可以使雜湊對映更快地載入到記憶體中。

* 部分寫入記錄

  資料庫隨時可能崩潰，包括在將記錄追加到日誌的過程中。Bitcask 檔案包含校驗和，允許檢測和忽略日誌中的這些損壞部分。

* 併發控制

  由於寫操作是以嚴格的順序追加到日誌中的，所以常見的實現是隻有一個寫入執行緒。也因為資料檔案段是僅追加的或者說是不可變的，所以它們可以被多個執行緒同時讀取。

乍一看，僅追加日誌似乎很浪費：為什麼不直接在檔案裡更新，用新值覆蓋舊值？僅追加的設計之所以是個好的設計，有如下幾個原因：

* 追加和分段合併都是順序寫入操作，通常比隨機寫入快得多，尤其是在磁性機械硬碟上。在某種程度上，順序寫入在基於快閃記憶體的 **固態硬碟（SSD）** 上也是好的選擇【4】。我們將在“[比較 B 樹和 LSM 樹](#比較b樹和lsm樹)”中進一步討論這個問題。
* 如果段檔案是僅追加的或不可變的，併發和崩潰恢復就簡單多了。例如，當一個數據值被更新的時候發生崩潰，你不用擔心檔案裡將會同時包含舊值和新值各自的一部分。
* 合併舊段的處理也可以避免資料檔案隨著時間的推移而碎片化的問題。

但是，散列表索引也有其侷限性：

* 散列表必須能放進記憶體。如果你有非常多的鍵，那真是倒楣。原則上可以在硬碟上維護一個雜湊對映，不幸的是硬碟雜湊對映很難表現優秀。它需要大量的隨機訪問 I/O，當它變滿時想要再擴充是很昂貴的，並且需要很煩瑣的邏輯去解決雜湊衝突【5】。
* 範圍查詢效率不高。例如，你無法輕鬆掃描 kitty00000 和 kitty99999 之間的所有鍵 —— 你必須在雜湊對映中單獨查詢每個鍵。

在下一節中，我們將看到一個沒有這些限制的索引結構。


### SSTables和LSM樹

在 [圖 3-3](/v1/ddia_0303.png) 中，每個日誌結構儲存段都是一系列鍵值對。這些鍵值對按照它們寫入的順序排列，日誌中稍後的值優先於日誌中較早的相同鍵的值。除此之外，檔案中鍵值對的順序並不重要。

現在我們可以對段檔案的格式做一個簡單的改變：要求鍵值對的序列按鍵排序。乍一看，這個要求似乎打破了我們使用順序寫入的能力，我們將稍後再回到這個問題。

我們把這個格式稱為 **排序字串表（Sorted String Table）**，簡稱 SSTable。我們還要求每個鍵只在每個合併的段檔案中出現一次（壓縮過程已經保證）。與使用雜湊索引的日誌段相比，SSTable 有幾個大的優勢：

1. 即使檔案大於可用記憶體，合併段的操作仍然是簡單而高效的。這種方法就像歸併排序演算法中使用的方法一樣，如 [圖 3-4](/v1/ddia_0304.png) 所示：你開始並排讀取多個輸入檔案，檢視每個檔案中的第一個鍵，複製最低的鍵（根據排序順序）到輸出檔案，不斷重複此步驟，將產生一個新的合併段檔案，而且它也是按鍵排序的。

   ![](/v1/ddia_0304.png)

   **圖 3-4 合併幾個 SSTable 段，只保留每個鍵的最新值**

   如果在幾個輸入段中出現相同的鍵，該怎麼辦？請記住，每個段都包含在一段時間內寫入資料庫的所有值。這意味著一個輸入段中的所有值一定比另一個段中的所有值都更近（假設我們總是合併相鄰的段）。當多個段包含相同的鍵時，我們可以保留最近段的值，並丟棄舊段中的值。

2. 為了在檔案中找到一個特定的鍵，你不再需要在記憶體中儲存所有鍵的索引。以 [圖 3-5](/v1/ddia_0305.png) 為例：假設你正在尋找鍵 `handiwork`，但是你不知道這個鍵在段檔案中的確切偏移量。然而，你知道 `handbag` 和 `handsome` 的偏移，而且由於排序特性，你知道 `handiwork` 必須出現在這兩者之間。這意味著你可以跳到 `handbag` 的偏移位置並從那裡掃描，直到你找到 `handiwork`（或沒找到，如果該檔案中沒有該鍵）。

   ![](/v1/ddia_0305.png)

   **圖 3-5 具有記憶體索引的 SSTable**

   你仍然需要一個記憶體中的索引來告訴你一些鍵的偏移量，但它可以是稀疏的：每幾千位元組的段檔案有一個鍵就足夠了，因為幾千位元組可以很快地被掃描完 [^i]。

[^i]: 如果所有的鍵與值都是定長的，你可以使用段檔案上的二分查詢並完全避免使用記憶體索引。然而實踐中的鍵和值通常都是變長的，因此如果沒有索引，就很難知道記錄的分界點（前一條記錄結束以及後一條記錄開始的地方）。

3. 由於讀取請求無論如何都需要掃描所請求範圍內的多個鍵值對，因此可以將這些記錄分組為塊（block），並在將其寫入硬碟之前對其進行壓縮（如 [圖 3-5](/v1/ddia_0305.png) 中的陰影區域所示）[^譯註i] 。稀疏記憶體索引中的每個條目都指向壓縮塊的開始處。除了節省硬碟空間之外，壓縮還可以減少對 I/O 頻寬的使用。

[^譯註i]: 這裡的壓縮是 compression，不是前文的 compaction，請注意區分。

#### 構建和維護SSTables

到目前為止還不錯，但是如何讓你的資料能夠預先排好序呢？畢竟我們接收到的寫入請求可能以任何順序發生。

雖然在硬碟上維護有序結構也是可能的（請參閱 “[B 樹](#B樹)”），但在記憶體儲存則要容易得多。有許多可以使用的眾所周知的樹形資料結構，例如紅黑樹或 AVL 樹【2】。使用這些資料結構，你可以按任何順序插入鍵，並按排序順序讀取它們。

現在我們可以讓我們的儲存引擎以如下方式工作：

* 有新寫入時，將其新增到記憶體中的平衡樹資料結構（例如紅黑樹）。這個記憶體樹有時被稱為 **記憶體表（memtable）**。
* 當 **記憶體表** 大於某個閾值（通常為幾兆位元組）時，將其作為 SSTable 檔案寫入硬碟。這可以高效地完成，因為樹已經維護了按鍵排序的鍵值對。新的 SSTable 檔案將成為資料庫中最新的段。當該 SSTable 被寫入硬碟時，新的寫入可以在一個新的記憶體表例項上繼續進行。
* 收到讀取請求時，首先嘗試在記憶體表中找到對應的鍵，如果沒有就在最近的硬碟段中尋找，如果還沒有就在下一個較舊的段中繼續尋找，以此類推。
* 時不時地，在後臺執行一個合併和壓縮過程，以合併段檔案並將已覆蓋或已刪除的值丟棄掉。

這個方案效果很好。它只會遇到一個問題：如果資料庫崩潰，則最近的寫入（在記憶體表中，但尚未寫入硬碟）將丟失。為了避免這個問題，我們可以在硬碟上儲存一個單獨的日誌，每個寫入都會立即被追加到這個日誌上，就像在前面的章節中所描述的那樣。這個日誌沒有按排序順序，但這並不重要，因為它的唯一目的是在崩潰後恢復記憶體表。每當記憶體表寫出到 SSTable 時，相應的日誌都可以被丟棄。

#### 用SSTables製作LSM樹

這裡描述的演算法本質上是 LevelDB【6】和 RocksDB【7】這些鍵值儲存引擎庫所使用的技術，這些儲存引擎被設計嵌入到其他應用程式中。除此之外，LevelDB 可以在 Riak 中用作 Bitcask 的替代品。在 Cassandra 和 HBase 中也使用了類似的儲存引擎【8】，而且他們都受到了 Google 的 Bigtable 論文【9】（引入了術語 SSTable 和 memtable ）的啟發。

這種索引結構最早由 Patrick O'Neil 等人發明，且被命名為日誌結構合併樹（或 LSM 樹）【10】，它是基於更早之前的日誌結構檔案系統【11】來構建的。基於這種合併和壓縮排序檔案原理的儲存引擎通常被稱為 LSM 儲存引擎。

Lucene，是一種全文搜尋的索引引擎，在 Elasticsearch 和 Solr 被使用，它使用類似的方法來儲存它的關鍵詞詞典【12,13】。全文索引比鍵值索引複雜得多，但是基於類似的想法：在搜尋查詢中，由一個給定的單詞，找到提及單詞的所有文件（網頁、產品描述等）。這也是透過鍵值結構實現的：其中鍵是 **單詞（term）**，值是所有包含該單詞的文件的 ID 列表（**postings list**）。在 Lucene 中，從詞語到記錄列表的這種對映儲存在類似於 SSTable 的有序檔案中，並根據需要在後臺執行合併【14】。

#### 效能最佳化

與往常一樣，要讓儲存引擎在實踐中表現良好涉及到大量設計細節。例如，當查詢資料庫中不存在的鍵時，LSM 樹演算法可能會很慢：你必須先檢查記憶體表，然後檢視從最近的到最舊的所有的段（可能還必須從硬碟讀取每一個段檔案），然後才能確定這個鍵不存在。為了最佳化這種訪問，儲存引擎通常使用額外的布隆過濾器（Bloom filters）【15】。（布隆過濾器是一種節省記憶體的資料結構，用於近似表達集合的內容，它可以告訴你資料庫中是否存在某個鍵，從而為不存在的鍵節省掉許多不必要的硬碟讀取操作。）

還有一些不同的策略來確定 SSTables 被壓縮和合並的順序和時間。最常見的選擇是 size-tiered 和 leveled compaction。LevelDB 和 RocksDB 使用 leveled compaction（LevelDB 因此得名），HBase 使用 size-tiered，Cassandra 同時支援這兩種【16】。對於 size-tiered，較新和較小的 SSTables 相繼被合併到較舊的和較大的 SSTable 中。對於 leveled compaction，key （按照分佈範圍）被拆分到較小的 SSTables，而較舊的資料被移動到單獨的層級（level），這使得壓縮（compaction）能夠更加增量地進行，並且使用較少的硬碟空間。

即使有許多微妙的東西，LSM 樹的基本思想 —— 儲存一系列在後臺合併的 SSTables —— 簡單而有效。即使資料集比可用記憶體大得多，它仍能繼續正常工作。由於資料按排序順序儲存，你可以高效地執行範圍查詢（掃描所有從某個最小值到某個最大值之間的所有鍵），並且因為硬碟寫入是連續的，所以 LSM 樹可以支援非常高的寫入吞吐量。


### B樹

前面討論的日誌結構索引看起來已經相當可用了，但它們卻不是最常見的索引型別。使用最廣泛的索引結構和日誌結構索引相當不同，它就是我們接下來要討論的 B 樹。

從 1970 年被引入【17】，僅不到 10 年後就變得 “無處不在”【18】，B 樹很好地經受了時間的考驗。在幾乎所有的關係資料庫中，它們仍然是標準的索引實現，許多非關係資料庫也會使用到 B 樹。

像 SSTables 一樣，B 樹保持按鍵排序的鍵值對，這允許高效的鍵值查詢和範圍查詢。但這也就是僅有的相似之處了：B 樹有著非常不同的設計理念。

我們前面看到的日誌結構索引將資料庫分解為可變大小的段，通常是幾兆位元組或更大的大小，並且總是按順序寫入段。相比之下，B 樹將資料庫分解成固定大小的 **塊（block）** 或 **分頁（page）**，傳統上大小為 4KB（有時會更大），並且一次只能讀取或寫入一個頁面。這種設計更接近於底層硬體，因為硬碟空間也是按固定大小的塊來組織的。

每個頁面都可以使用地址或位置來標識，這允許一個頁面引用另一個頁面 —— 類似於指標，但在硬碟而不是在記憶體中。我們可以使用這些頁面引用來構建一個頁面樹，如 [圖 3-6](/v1/ddia_0306.png) 所示。

![](/v1/ddia_0306.png)

**圖 3-6 使用 B 樹索引查詢一個鍵**

一個頁面會被指定為 B 樹的根；在索引中查詢一個鍵時，就從這裡開始。該頁面包含幾個鍵和對子頁面的引用。每個子頁面負責一段連續範圍的鍵，根頁面上每兩個引用之間的鍵，表示相鄰子頁面管理的鍵的範圍（邊界）。

在 [圖 3-6](/v1/ddia_0306.png) 的例子中，我們正在尋找鍵 251 ，所以我們知道我們需要跟蹤邊界 200 和 300 之間的頁面引用。這將我們帶到一個類似的頁面，進一步將 200 到 300 的範圍拆分到子範圍。

最終，我們將到達某個包含各個鍵的頁面（葉子頁面，leaf page），該頁面或者直接包含每個鍵的值，或者包含了對可以找到值的頁面的引用。

在 B 樹的一個頁面中對子頁面的引用的數量稱為 **分支因子（branching factor）**。例如，在 [圖 3-6](/v1/ddia_0306.png) 中，分支因子是 6。在實踐中，分支因子的大小取決於儲存頁面引用和範圍邊界所需的空間，但這個值通常是幾百。

如果要更新 B 樹中現有鍵的值，需要搜尋包含該鍵的葉子頁面，更改該頁面中的值，並將該頁面寫回到硬碟（對該頁面的任何引用都將保持有效）。如果你想新增一個新的鍵，你需要找到其範圍能包含新鍵的頁面，並將其新增到該頁面。如果頁面中沒有足夠的可用空間容納新鍵，則將其分成兩個半滿頁面，並更新父頁面以反映新的鍵範圍分割槽，如 [圖 3-7](/v1/ddia_0307.png) 所示 [^ii]。

![](/v1/ddia_0307.png)

**圖 3-7 透過分割頁面來生長 B 樹**

[^ii]: 向 B 樹中插入一個新的鍵是相當符合直覺的，但刪除一個鍵（同時保持樹平衡）就會牽扯很多其他東西了【2】。

這個演算法可以確保樹保持平衡：具有 n 個鍵的 B 樹總是具有 $O(\log n)$ 的深度。大多數資料庫可以放入一個三到四層的 B 樹，所以你不需要追蹤多個頁面引用來找到你正在查詢的頁面（分支因子為 500 的 4KB 頁面的四層樹可以儲存多達 256TB 的資料）。

#### 讓B樹更可靠

B 樹的基本底層寫操作是用新資料覆寫硬碟上的頁面，並假定覆寫不改變頁面的位置：即，當頁面被覆寫時，對該頁面的所有引用保持完整。這與日誌結構索引（如 LSM 樹）形成鮮明對比，後者只追加到檔案（並最終刪除過時的檔案），但從不修改檔案中已有的內容。

你可以把覆寫硬碟上的頁面對應為實際的硬體操作。在磁性硬碟驅動器上，這意味著將磁頭移動到正確的位置，等待旋轉盤上的正確位置出現，然後用新的資料覆寫適當的扇區。在固態硬碟上，由於 SSD 必須一次擦除和重寫相當大的儲存晶片塊，所以會發生更複雜的事情【19】。

而且，一些操作需要覆寫幾個不同的頁面。例如，如果因為插入導致頁面過滿而拆分頁面，則需要寫入新拆分的兩個頁面，並覆寫其父頁面以更新對兩個子頁面的引用。這是一個危險的操作，因為如果資料庫在系列操作進行到一半時崩潰，那麼最終將導致一個損壞的索引（例如，可能有一個孤兒頁面沒有被任何頁面引用） 。

為了使資料庫能處理異常崩潰的場景，B 樹實現通常會帶有一個額外的硬碟資料結構：**預寫式日誌**（WAL，即 write-ahead log，也稱為 **重做日誌**，即 redo log）。這是一個僅追加的檔案，每個 B 樹的修改在其能被應用到樹本身的頁面之前都必須先寫入到該檔案。當資料庫在崩潰後恢復時，這個日誌將被用來使 B 樹恢復到一致的狀態【5,20】。

另外還有一個更新頁面的複雜情況是，如果多個執行緒要同時訪問 B 樹，則需要仔細的併發控制 —— 否則執行緒可能會看到樹處於不一致的狀態。這通常是透過使用 **鎖存器**（latches，輕量級鎖）保護樹的資料結構來完成。日誌結構化的方法在這方面更簡單，因為它們在後臺進行所有的合併，而不會干擾新接收到的查詢，並且能夠時不時地將段檔案切換為新的（該切換是原子操作）。

#### B樹的最佳化

由於 B 樹已經存在了很久，所以並不奇怪這麼多年下來有很多最佳化的設計被開發出來，僅舉幾例：

* 不同於覆寫頁面並維護 WAL 以支援崩潰恢復，一些資料庫（如 LMDB）使用寫時複製方案【21】。經過修改的頁面被寫入到不同的位置，並且還在樹中建立了父頁面的新版本，以指向新的位置。這種方法對於併發控制也很有用，我們將在 “[快照隔離和可重複讀](/v1_tw/ch7#快照隔離和可重複讀)” 中看到。
* 我們可以透過不儲存整個鍵，而是縮短其大小，來節省頁面空間。特別是在樹內部的頁面上，鍵只需要提供足夠的資訊來充當鍵範圍之間的邊界。在頁面中包含更多的鍵允許樹具有更高的分支因子，因此也就允許更少的層級 [^iii]。
* 通常，頁面可以放置在硬碟上的任何位置；沒有什麼要求相鄰鍵範圍的頁面也放在硬碟上相鄰的區域。如果某個查詢需要按照排序順序掃描大部分的鍵範圍，那麼這種按頁面儲存的佈局可能會效率低下，因為每個頁面的讀取都需要執行一次硬碟查詢。因此，許多 B 樹的實現在佈局樹時會盡量使葉子頁面按順序出現在硬碟上。但是，隨著樹的增長，要維持這個順序是很困難的。相比之下，由於 LSM 樹在合併過程中一次性重寫一大段儲存，所以它們更容易使順序鍵在硬碟上連續儲存。
* 額外的指標被新增到樹中。例如，每個葉子頁面可以引用其左邊和右邊的兄弟頁面，使得不用跳回父頁面就能按順序對鍵進行掃描。
* B 樹的變體如 **分形樹（fractal trees）**【22】借用了一些日誌結構的思想來減少硬碟查詢（而且它們與分形無關）。

[^iii]: 這個變種有時被稱為 B+ 樹，但因為這個最佳化已被廣泛使用，所以經常無法區分於其它的 B 樹變種。

### 比較B樹和LSM樹

儘管 B 樹實現通常比 LSM 樹實現更成熟，LSM 樹由於其效能特徵的關係，仍然引起了不少關注。根據經驗，通常 LSM 樹的寫入速度更快，而 B 樹的讀取速度更快【23】。LSM 樹上的讀取通常比較慢，因為它們必須檢查幾種不同的資料結構和不同壓縮（Compaction）層級的 SSTables。

然而，基準測試的結果通常和工作負載的細節相關。你需要用你特有的工作負載來測試系統，以便進行有效的比較。在本節中，我們將簡要討論一些在衡量儲存引擎效能時值得考慮的事情。

#### LSM樹的優點

B 樹索引中的每塊資料都必須至少寫入兩次：一次寫入預先寫入日誌（WAL），一次寫入樹頁面本身（如果有分頁還需要再寫入一次）。即使在該頁面中只有幾個位元組發生了變化，也需要接受寫入整個頁面的開銷。有些儲存引擎甚至會覆寫同一個頁面兩次，以免在電源故障的情況下頁面未完整更新【24,25】。

由於反覆壓縮和合並 SSTables，日誌結構索引也會多次重寫資料。這種影響 —— 在資料庫的生命週期中每筆資料導致對硬碟的多次寫入 —— 被稱為 **寫入放大（write amplification）**。使用固態硬碟的機器需要額外關注這點，固態硬碟的快閃記憶體壽命在覆寫有限次數後就會耗盡。

在寫入繁重的應用程式中，效能瓶頸可能是資料庫可以寫入硬碟的速度。在這種情況下，寫放大會導致直接的效能代價：儲存引擎寫入硬碟的次數越多，可用硬碟頻寬內它能處理的每秒寫入次數就越少。

進而，LSM 樹通常能夠比 B 樹支援更高的寫入吞吐量，部分原因是它們有時具有較低的寫放大（儘管這取決於儲存引擎的配置和工作負載），部分是因為它們順序地寫入緊湊的 SSTable 檔案而不是必須覆寫樹中的幾個頁面【26】。這種差異在機械硬碟上尤其重要，其順序寫入比隨機寫入要快得多。

LSM 樹可以被壓縮得更好，因此通常能比 B 樹在硬碟上產生更小的檔案。B 樹儲存引擎會由於碎片化（fragmentation）而留下一些未使用的硬碟空間：當頁面被拆分或某行不能放入現有頁面時，頁面中的某些空間仍未被使用。由於 LSM 樹不是面向頁面的，並且會透過定期重寫 SSTables 以去除碎片，所以它們具有較低的儲存開銷，特別是當使用分層壓縮（leveled compaction）時【27】。

在許多固態硬碟上，韌體內部使用了日誌結構化演算法，以將隨機寫入轉變為順序寫入底層儲存晶片，因此儲存引擎寫入模式的影響不太明顯【19】。但是，較低的寫入放大率和減少的碎片仍然對固態硬碟更有利：更緊湊地表示資料允許在可用的 I/O 頻寬內處理更多的讀取和寫入請求。

#### LSM樹的缺點

日誌結構儲存的缺點是壓縮過程有時會干擾正在進行的讀寫操作。儘管儲存引擎嘗試增量地執行壓縮以儘量不影響併發訪問，但是硬碟資源有限，所以很容易發生某個請求需要等待硬碟先完成昂貴的壓縮操作。對吞吐量和平均響應時間的影響通常很小，但是日誌結構化儲存引擎在更高百分位的響應時間（請參閱 “[描述效能](/v1_tw/ch1#描述效能)”）有時會相當長，而 B 樹的行為則相對更具有可預測性【28】。

壓縮的另一個問題出現在高寫入吞吐量時：硬碟的有限寫入頻寬需要在初始寫入（記錄日誌和重新整理記憶體表到硬碟）和在後臺執行的壓縮執行緒之間共享。寫入空資料庫時，可以使用全硬碟頻寬進行初始寫入，但資料庫越大，壓縮所需的硬碟頻寬就越多。

如果寫入吞吐量很高，並且壓縮沒有仔細配置好，有可能導致壓縮跟不上寫入速率。在這種情況下，硬碟上未合併段的數量不斷增加，直到硬碟空間用完，讀取速度也會減慢，因為它們需要檢查更多的段檔案。通常情況下，即使壓縮無法跟上，基於 SSTable 的儲存引擎也不會限制傳入寫入的速率，所以你需要進行明確的監控來檢測這種情況【29,30】。

B 樹的一個優點是每個鍵只存在於索引中的一個位置，而日誌結構化的儲存引擎可能在不同的段中有相同鍵的多個副本。這個方面使得 B 樹在想要提供強大的事務語義的資料庫中很有吸引力：在許多關係資料庫中，事務隔離是透過在鍵範圍上使用鎖來實現的，在 B 樹索引中，這些鎖可以直接附加到樹上【5】。在 [第七章](/v1_tw/ch7) 中，我們將更詳細地討論這一點。

B 樹在資料庫架構中是非常根深蒂固的，為許多工作負載都提供了始終如一的良好效能，所以它們不可能在短期內消失。在新的資料庫中，日誌結構化索引變得越來越流行。沒有簡單易行的辦法來判斷哪種型別的儲存引擎對你的使用場景更好，所以需要透過一些測試來得到相關經驗。

### 其他索引結構

到目前為止，我們只討論了鍵值索引，它們就像關係模型中的 **主鍵（primary key）** 索引。主鍵唯一標識關係表中的一行，或文件資料庫中的一個文件或圖形資料庫中的一個頂點。資料庫中的其他記錄可以透過其主鍵（或 ID）引用該行 / 文件 / 頂點，索引就被用於解析這樣的引用。

次級索引（secondary indexes）也很常見。在關係資料庫中，你可以使用 `CREATE INDEX` 命令在同一個表上建立多個次級索引，而且這些索引通常對於有效地執行聯接（join）而言至關重要。例如，在 [第二章](/v1_tw/ch2) 中的 [圖 2-1](/v1/ddia_0201.png) 中，很可能在 `user_id` 列上有一個次級索引，以便你可以在每個表中找到屬於同一使用者的所有行。

次級索引可以很容易地從鍵值索引構建。次級索引主要的不同是鍵不是唯一的，即可能有許多行（文件，頂點）具有相同的鍵。這可以透過兩種方式來解決：將匹配行識別符號的列表作為索引裡的值（就像全文索引中的記錄列表），或者透過向每個鍵新增行識別符號來使鍵唯一。無論哪種方式，B 樹和日誌結構索引都可以用作次級索引。

#### 將值儲存在索引中

索引中的鍵是查詢要搜尋的內容，而其值可以是以下兩種情況之一：它可以是實際的行（文件，頂點），也可以是對儲存在別處的行的引用。在後一種情況下，行被儲存的地方被稱為 **堆檔案（heap file）**，並且儲存的資料沒有特定的順序（它可以是僅追加的，或者它可以跟蹤被刪除的行以便後續可以用新的資料進行覆蓋）。堆檔案方法很常見，因為它避免了在存在多個次級索引時對資料的複製：每個索引只引用堆檔案中的一個位置，實際的資料都儲存在一個地方。

在不更改鍵的情況下更新值時，堆檔案方法可以非常高效：只要新值的位元組數不大於舊值，就可以覆蓋該記錄。如果新值更大，情況會更複雜，因為它可能需要移到堆中有足夠空間的新位置。在這種情況下，要麼所有的索引都需要更新，以指向記錄的新堆位置，或者在舊堆位置留下一個轉發指標【5】。

在某些情況下，從索引到堆檔案的額外跳躍對讀取來說效能損失太大，因此可能希望將被索引的行直接儲存在索引中。這被稱為聚集索引（clustered index）。例如，在 MySQL 的 InnoDB 儲存引擎中，表的主鍵總是一個聚集索引，次級索引則引用主鍵（而不是堆檔案中的位置）【31】。在 SQL Server 中，可以為每個表指定一個聚集索引【32】。

在 **聚集索引**（在索引中儲存所有的行資料）和 **非聚集索引**（僅在索引中儲存對資料的引用）之間的折衷被稱為 **覆蓋索引（covering index）** 或 **包含列的索引（index with included columns）**，其在索引內儲存表的一部分列【33】。這允許透過單獨使用索引來處理一些查詢（這種情況下，可以說索引 **覆蓋（cover）** 了查詢）【32】。

與任何型別的資料重複一樣，聚集索引和覆蓋索引可以加快讀取速度，但是它們需要額外的儲存空間，並且會增加寫入開銷。資料庫還需要額外的努力來執行事務保證，因為應用程式不應看到任何因為使用副本而導致的不一致。

#### 多列索引

至今討論的索引只是將一個鍵對映到一個值。如果我們需要同時查詢一個表中的多個列（或文件中的多個欄位），這顯然是不夠的。

最常見的多列索引被稱為 **連線索引（concatenated index）** ，它透過將一列的值追加到另一列後面，簡單地將多個欄位組合成一個鍵（索引定義中指定了欄位的連線順序）。這就像一個老式的紙質電話簿，它提供了一個從（姓氏，名字）到電話號碼的索引。由於排序順序，索引可以用來查詢所有具有特定姓氏的人，或所有具有特定姓氏 - 名字組合的人。但如果你想找到所有具有特定名字的人，這個索引是沒有用的。

**多維索引（multi-dimensional index）** 是一種查詢多個列的更一般的方法，這對於地理空間資料尤為重要。例如，餐廳搜尋網站可能有一個資料庫，其中包含每個餐廳的經度和緯度。當使用者在地圖上檢視餐館時，網站需要搜尋使用者正在檢視的矩形地圖區域內的所有餐館。這需要一個二維範圍查詢，如下所示：

```sql
SELECT * FROM restaurants WHERE latitude > 51.4946 AND latitude < 51.5079
                          AND longitude > -0.1162 AND longitude < -0.1004;
```

一個標準的 B 樹或者 LSM 樹索引不能夠高效地處理這種查詢：它可以返回一個緯度範圍內的所有餐館（但經度可能是任意值），或者返回在同一個經度範圍內的所有餐館（但緯度可能是北極和南極之間的任意地方），但不能同時滿足兩個條件。

一種選擇是使用 **空間填充曲線（space-filling curve）** 將二維位置轉換為單個數字，然後使用常規 B 樹索引【34】。更普遍的是，使用特殊化的空間索引，例如 R 樹。例如，PostGIS 使用 PostgreSQL 的通用 GiST 工具【35】將地理空間索引實現為 R 樹。這裡我們沒有足夠的地方來描述 R 樹，但是有大量的文獻可供參考。

有趣的是，多維索引不僅可以用於地理位置。例如，在電子商務網站上可以使用建立在（紅，綠，藍）維度上的三維索引來搜尋特定顏色範圍內的產品，也可以在天氣觀測資料庫中建立（日期，溫度）的二維索引，以便有效地搜尋 2013 年內的溫度在 25 至 30°C 之間的所有觀測資料。如果使用一維索引，你將不得不掃描 2013 年的所有記錄（不管溫度如何），然後透過溫度進行過濾，或者反之亦然。二維索引可以同時透過時間戳和溫度來收窄資料集。這個技術被 HyperDex 所使用【36】。

#### 全文搜尋和模糊索引

到目前為止所討論的所有索引都假定你有確切的資料，並允許你查詢鍵的確切值或具有排序順序的鍵的值範圍。他們不允許你做的是搜尋**類似**的鍵，如拼寫錯誤的單詞。這種模糊的查詢需要不同的技術。

例如，全文搜尋引擎通常允許搜尋目標從一個單詞擴充套件為包括該單詞的同義詞，忽略單詞的語法變體，搜尋在相同文件中的近義詞，並且支援各種其他取決於文字的語言分析功能。為了處理文件或查詢中的拼寫錯誤，Lucene 能夠在一定的編輯距離內搜尋文字【37】（編輯距離 1 意味著單詞內發生了 1 個字母的新增、刪除或替換）。

正如 “[用 SSTables 製作 LSM 樹](#用SSTables製作LSM樹)” 中所提到的，Lucene 為其詞典使用了一個類似於 SSTable 的結構。這個結構需要一個小的記憶體索引，告訴查詢需要在排序檔案中哪個偏移量查詢鍵。在 LevelDB 中，這個記憶體中的索引是一些鍵的稀疏集合，但在 Lucene 中，記憶體中的索引是鍵中字元的有限狀態自動機，類似於 trie 【38】。這個自動機可以轉換成 Levenshtein 自動機，它支援在給定的編輯距離內有效地搜尋單詞【39】。

其他的模糊搜尋技術正朝著文件分類和機器學習的方向發展。更多詳細資訊請參閱資訊檢索教科書，例如【40】。

#### 在記憶體中儲存一切

本章到目前為止討論的資料結構都是對硬碟限制的應對。與主記憶體相比，硬碟處理起來很麻煩。對於磁性硬碟和固態硬碟，如果要在讀取和寫入時獲得良好效能，則需要仔細地佈置硬碟上的資料。但是，我們能容忍這種麻煩，因為硬碟有兩個顯著的優點：它們是持久的（它們的內容在電源關閉時不會丟失），並且每 GB 的成本比 RAM 低。

隨著 RAM 變得更便宜，每 GB 成本的論據被侵蝕了。許多資料集不是那麼大，所以將它們全部儲存在記憶體中是非常可行的，包括可能分佈在多個機器上。這導致了記憶體資料庫的發展。

某些記憶體中的鍵值儲存（如 Memcached）僅用於快取，在重新啟動計算機時丟失的資料是可以接受的。但其他記憶體資料庫的目標是永續性，可以透過特殊的硬體（例如電池供電的 RAM）來實現，也可以將更改日誌寫入硬碟，還可以將定時快照寫入硬碟或者將記憶體中的狀態複製到其他機器上。

記憶體資料庫重新啟動時，需要從硬碟或透過網路從副本重新載入其狀態（除非使用特殊的硬體）。儘管寫入硬碟，它仍然是一個記憶體資料庫，因為硬碟僅出於永續性目的進行日誌追加，讀取請求完全由記憶體來處理。寫入硬碟同時還有運維上的好處：硬碟上的檔案可以很容易地由外部程式進行備份、檢查和分析。

諸如 VoltDB、MemSQL 和 Oracle TimesTen 等產品是具有關係模型的記憶體資料庫，供應商聲稱，透過消除與管理硬碟上的資料結構相關的所有開銷，他們可以提供巨大的效能改進【41,42】。RAMCloud 是一個開源的記憶體鍵值儲存器，具有永續性（對記憶體和硬碟上的資料都使用日誌結構化方法）【43】。Redis 和 Couchbase 透過非同步寫入硬碟提供了較弱的永續性。

反直覺的是，記憶體資料庫的效能優勢並不是因為它們不需要從硬碟讀取的事實。只要有足夠的記憶體即使是基於硬碟的儲存引擎也可能永遠不需要從硬碟讀取，因為作業系統在記憶體中快取了最近使用的硬碟塊。相反，它們更快的原因在於省去了將記憶體資料結構編碼為硬碟資料結構的開銷【44】。

除了效能，記憶體資料庫的另一個有趣的地方是提供了難以用基於硬碟的索引實現的資料模型。例如，Redis 為各種資料結構（如優先順序佇列和集合）提供了類似資料庫的介面。因為它將所有資料儲存在記憶體中，所以它的實現相對簡單。

最近的研究表明，記憶體資料庫體系結構可以擴充套件到支援比可用記憶體更大的資料集，而不必重新採用以硬碟為中心的體系結構【45】。所謂的 **反快取（anti-caching）** 方法的做法是：在記憶體不足的情況下將最近最少使用的資料從記憶體轉移到硬碟，並在將來再次訪問時將其重新載入到記憶體中。這與作業系統對虛擬記憶體和交換檔案的操作類似，但資料庫可以比作業系統更有效地管理記憶體，因為它可以按單個記錄的粒度工作，而不是整個記憶體頁面。儘管如此，這種方法仍然需要索引能完全放入記憶體中（就像本章開頭的 Bitcask 例子）。

如果 **非易失性儲存器（non-volatile memory, NVM）** 技術得到更廣泛的應用，可能還需要進一步改變儲存引擎設計【46】。目前這是一個新的研究領域，值得關注。


## 事務處理還是分析？

在早期的業務資料處理過程中，一次典型的資料庫寫入通常與一筆 *商業交易（commercial transaction）* 相對應：賣個貨、向供應商下訂單、支付員工工資等等。但隨著資料庫開始應用到那些不涉及到錢的領域，術語 **交易 / 事務（transaction）** 仍留了下來，用於指代一組讀寫操作構成的邏輯單元。

> 事務不一定具有 ACID（原子性，一致性，隔離性和永續性）屬性。事務處理只是意味著允許客戶端進行低延遲的讀取和寫入 —— 而不是隻能定期執行（例如每天一次）的批處理作業。我們在 [第七章](/v1_tw/ch7) 中討論 ACID 屬性，在 [第十章](/v1_tw/ch10) 中討論批處理。

即使資料庫開始被用於許多不同型別的資料，比如部落格文章的評論、遊戲中的動作、地址簿中的聯絡人等等，基本的訪問模式仍然類似於處理商業交易。應用程式通常使用索引透過某個鍵找少量記錄。根據使用者的輸入來插入或更新記錄。由於這些應用程式是互動式的，這種訪問模式被稱為 **線上事務處理（OLTP, OnLine Transaction Processing）**。

但是，資料庫也開始越來越多地用於資料分析，這些資料分析具有非常不同的訪問模式。通常，分析查詢需要掃描大量記錄，每個記錄只讀取幾列，並計算彙總統計資訊（如計數、總和或平均值），而不是將原始資料返回給使用者。例如，如果你的資料是一個銷售交易表，那麼分析查詢可能是：

* 一月份每個商店的總收入是多少？
* 在最近的推廣活動中多賣了多少香蕉？
* 哪個牌子的嬰兒食品最常與 X 品牌的尿布同時購買？

這些查詢通常由業務分析師編寫，並提供報告以幫助公司管理層做出更好的決策（商業智慧）。為了將這種使用資料庫的模式和事務處理區分開，它被稱為 **線上分析處理（OLAP, OnLine Analytic Processing）**【47】[^iv]。OLTP 和 OLAP 之間的區別並不總是清晰的，但是一些典型的特徵在 [表 3-1]() 中列出。

**表 3-1 比較事務處理和分析系統的特點**

|     屬性     |      事務處理系統 OLTP       |      分析系統 OLAP       |
| :----------: | :--------------------------: | :----------------------: |
| 主要讀取模式 |    查詢少量記錄，按鍵讀取    |    在大批次記錄上聚合    |
| 主要寫入模式 |   隨機訪問，寫入要求低延時   | 批次匯入（ETL）或者事件流  |
|   主要使用者   |    終端使用者，透過 Web 應用     | 內部資料分析師，用於決策支援 |
|  處理的資料  | 資料的最新狀態（當前時間點） |   隨時間推移的歷史事件   |
|  資料集尺寸  |           GB ~ TB            |         TB ~ PB          |

[^iv]: OLAP 中的首字母 O（online）的含義並不明確，它可能是指查詢並不是用來生成預定義好的報告的事實，也可能是指分析師通常是互動式地使用 OLAP 系統來進行探索式的查詢。

起初，事務處理和分析查詢使用了相同的資料庫。SQL 在這方面已證明是非常靈活的：對於 OLTP 型別的查詢以及 OLAP 型別的查詢來說效果都很好。儘管如此，在二十世紀八十年代末和九十年代初期，企業有停止使用 OLTP 系統進行分析的趨勢，轉而在單獨的資料庫上執行分析。這個單獨的資料庫被稱為 **資料倉庫（data warehouse）**。

### 資料倉庫

一個企業可能有幾十個不同的交易處理系統：面向終端客戶的網站、控制實體商店的收銀系統、倉庫庫存跟蹤、車輛路線規劃、供應鏈管理、員工管理等。這些系統中每一個都很複雜，需要專人維護，所以最終這些系統互相之間都是獨立執行的。

這些 OLTP 系統往往對業務運作至關重要，因而通常會要求 **高可用** 與 **低延遲**。所以 DBA 會密切關注他們的 OLTP 資料庫，他們通常不願意讓業務分析人員在 OLTP 資料庫上執行臨時的分析查詢，因為這些查詢通常開銷巨大，會掃描大部分資料集，這會損害同時在執行的事務的效能。

相比之下，資料倉庫是一個獨立的資料庫，分析人員可以查詢他們想要的內容而不影響 OLTP 操作【48】。資料倉庫包含公司各種 OLTP 系統中所有的只讀資料副本。從 OLTP 資料庫中提取資料（使用定期的資料轉儲或連續的更新流），轉換成適合分析的模式，清理並載入到資料倉庫中。將資料存入倉庫的過程稱為 “**抽取 - 轉換 - 載入（ETL）**”，如 [圖 3-8](/v1/ddia_0308.png) 所示。

![](/v1/ddia_0308.png)

**圖 3-8 ETL 至資料倉庫的簡化提綱**

幾乎所有的大型企業都有資料倉庫，但在小型企業中幾乎聞所未聞。這可能是因為大多數小公司沒有這麼多不同的 OLTP 系統，大多數小公司只有少量的資料 —— 可以在傳統的 SQL 資料庫中查詢，甚至可以在電子表格中分析。在一家大公司裡，要做一些在一家小公司很簡單的事情，需要很多繁重的工作。

使用單獨的資料倉庫，而不是直接查詢 OLTP 系統進行分析的一大優勢是資料倉庫可針對分析類的訪問模式進行最佳化。事實證明，本章前半部分討論的索引演算法對於 OLTP 來說工作得很好，但對於處理分析查詢並不是很好。在本章的其餘部分中，我們將研究為分析而最佳化的儲存引擎。

#### OLTP資料庫和資料倉庫之間的分歧

資料倉庫的資料模型通常是關係型的，因為 SQL 通常很適合分析查詢。有許多圖形資料分析工具可以生成 SQL 查詢，視覺化結果，並允許分析人員探索資料（透過下鑽、切片和切塊等操作）。

表面上，一個資料倉庫和一個關係型 OLTP 資料庫看起來很相似，因為它們都有一個 SQL 查詢介面。然而，系統的內部看起來可能完全不同，因為它們針對非常不同的查詢模式進行了最佳化。現在許多資料庫供應商都只是重點支援事務處理負載和分析工作負載這兩者中的一個，而不是都支援。

一些資料庫（例如 Microsoft SQL Server 和 SAP HANA）支援在同一產品中進行事務處理和資料倉庫。但是，它們也正日益發展為兩套獨立的儲存和查詢引擎，只是這些引擎正好可以透過一個通用的 SQL 介面訪問【49,50,51】。

Teradata、Vertica、SAP HANA 和 ParAccel 等資料倉庫供應商通常使用昂貴的商業許可證銷售他們的系統。Amazon RedShift 是 ParAccel 的託管版本。最近，大量的開源 SQL-on-Hadoop 專案已經出現，它們還很年輕，但是正在與商業資料倉庫系統競爭，包括 Apache Hive、Spark SQL、Cloudera Impala、Facebook Presto、Apache Tajo 和 Apache Drill【52,53】。其中一些基於了谷歌 Dremel 的想法【54】。

### 星型和雪花型：分析的模式

正如 [第二章](/v1_tw/ch2) 所探討的，根據應用程式的需要，在事務處理領域中使用了大量不同的資料模型。另一方面，在分析型業務中，資料模型的多樣性則少得多。許多資料倉庫都以相當公式化的方式使用，被稱為星型模式（也稱為維度建模【55】）。

[圖 3-9](/v1/ddia_0309.png) 中的示例模式顯示了可能在食品零售商處找到的資料倉庫。在模式的中心是一個所謂的事實表（在這個例子中，它被稱為 `fact_sales`）。事實表的每一行代表在特定時間發生的事件（這裡，每一行代表客戶購買的產品）。如果我們分析的是網站流量而不是零售量，則每行可能代表一個使用者的頁面瀏覽或點選。

![](/v1/ddia_0309.png)

**圖 3-9 用於資料倉庫的星型模式的示例**

通常情況下，事實被視為單獨的事件，因為這樣可以在以後分析中獲得最大的靈活性。但是，這意味著事實表可以變得非常大。像蘋果、沃爾瑪或 eBay 這樣的大企業在其資料倉庫中可能有幾十 PB 的交易歷史，其中大部分儲存在事實表中【56】。

事實表中的一些列是屬性，例如產品銷售的價格和從供應商那裡購買的成本（可以用來計算利潤率）。事實表中的其他列是對其他表（稱為維度表）的外部索引鍵引用。由於事實表中的每一行都表示一個事件，因此這些維度代表事件發生的物件、內容、地點、時間、方式和原因。

例如，在 [圖 3-9](/v1/ddia_0309.png) 中，其中一個維度是已售出的產品。`dim_product` 表中的每一行代表一種待售產品，包括庫存單位（SKU）、產品描述、品牌名稱、類別、脂肪含量、包裝尺寸等。`fact_sales` 表中的每一行都使用外部索引鍵表明在特定交易中銷售了什麼產品。（簡單起見，如果客戶一次購買了幾種不同的產品，則它們在事實表中被表示為單獨的行）。

甚至日期和時間也通常使用維度表來表示，因為這允許對日期的附加資訊（諸如公共假期）進行編碼，從而允許區分假期和非假期的銷售查詢。

“星型模式” 這個名字來源於這樣一個事實，即當我們對表之間的關係進行視覺化時，事實表在中間，被維度表包圍；與這些表的連線就像星星的光芒。

這個模板的變體被稱為雪花模式，其中維度被進一步分解為子維度。例如，品牌和產品類別可能有單獨的表格，並且 `dim_product` 表格中的每一行都可以將品牌和類別作為外部索引鍵引用，而不是將它們作為字串儲存在 `dim_product` 表格中。雪花模式比星形模式更正規化，但是星形模式通常是首選，因為分析師使用它更簡單【55】。

在典型的資料倉庫中，表格通常非常寬：事實表通常有 100 列以上，有時甚至有數百列【51】。維度表也可以是非常寬的，因為它們包括了所有可能與分析相關的元資料 —— 例如，`dim_store` 表可以包括在每個商店提供哪些服務的細節、它是否具有店內麵包房、店面面積、商店第一次開張的日期、最近一次改造的時間、離最近的高速公路的距離等等。


## 列式儲存

如果事實表中有萬億行和數 PB 的資料，那麼高效地儲存和查詢它們就成為一個具有挑戰性的問題。維度表通常要小得多（數百萬行），所以在本節中我們將主要關注事實表的儲存。

儘管事實表通常超過 100 列，但典型的資料倉庫查詢一次只會訪問其中 4 個或 5 個列（ “`SELECT *`” 查詢很少用於分析）【51】。以 [例 3-1]() 中的查詢為例：它訪問了大量的行（在 2013 年中所有購買了水果或糖果的記錄），但只需訪問 `fact_sales` 表的三列：`date_key, product_sk, quantity`。該查詢忽略了所有其他的列。

**例 3-1 分析人們是否更傾向於在一週的某一天購買新鮮水果或糖果**

```sql
SELECT
  dim_date.weekday,
  dim_product.category,
  SUM(fact_sales.quantity) AS quantity_sold
FROM fact_sales
  JOIN dim_date ON fact_sales.date_key = dim_date.date_key
  JOIN dim_product ON fact_sales.product_sk = dim_product.product_sk
WHERE
  dim_date.year = 2013 AND
  dim_product.category IN ('Fresh fruit', 'Candy')
GROUP BY
  dim_date.weekday, dim_product.category;
```

我們如何有效地執行這個查詢？

在大多數 OLTP 資料庫中，儲存都是以面向行的方式進行佈局的：表格的一行中的所有值都相鄰儲存。文件資料庫也是相似的：整個文件通常儲存為一個連續的位元組序列。你可以在 [圖 3-1](/v1/ddia_0301.png) 的 CSV 例子中看到這個。

為了處理像 [例 3-1]() 這樣的查詢，你可能在 `fact_sales.date_key`、`fact_sales.product_sk` 上有索引，它們告訴儲存引擎在哪裡查詢特定日期或特定產品的所有銷售情況。但是，面向行的儲存引擎仍然需要將所有這些行（每個包含超過 100 個屬性）從硬碟載入到記憶體中，解析它們，並過濾掉那些不符合條件的行。這可能需要很長時間。

列式儲存背後的想法很簡單：不要將所有來自一行的值儲存在一起，而是將來自每一列的所有值儲存在一起。如果每一列都儲存在一個單獨的檔案中，查詢只需要讀取和解析查詢中使用的那些列，這可以節省大量的工作。這個原理如 [圖 3-10](/v1/ddia_0310.png) 所示。

![](/v1/ddia_0310.png)

**圖 3-10 按列儲存關係型資料，而不是行**

> 列式儲存在關係資料模型中是最容易理解的，但它同樣適用於非關係資料。例如，Parquet【57】是一種列式儲存格式，支援基於 Google 的 Dremel 的文件資料模型【54】。

列式儲存佈局依賴於每個列檔案包含相同順序的行。因此，如果你需要重新組裝完整的行，你可以從每個單獨的列檔案中獲取第 23 項，並將它們放在一起形成表的第 23 行。


### 列壓縮

除了僅從硬碟載入查詢所需的列以外，我們還可以透過壓縮資料來進一步降低對硬碟吞吐量的需求。幸運的是，列式儲存通常很適合壓縮。

看看 [圖 3-10](/v1/ddia_0310.png) 中每一列的值序列：它們通常看起來是相當重複的，這是壓縮的好兆頭。根據列中的資料，可以使用不同的壓縮技術。在資料倉庫中特別有效的一種技術是點陣圖編碼，如 [圖 3-11](/v1/ddia_0311.png) 所示。

![](/v1/ddia_0311.png)

**圖 3-11 壓縮的點陣圖索引儲存佈局**

通常情況下，一列中不同值的數量與行數相比要小得多（例如，零售商可能有數十億的銷售交易，但只有 100,000 個不同的產品）。現在我們可以拿一個有 n 個不同值的列，並把它轉換成 n 個獨立的點陣圖：每個不同值對應一個點陣圖，每行對應一個位元位。如果該行具有該值，則該位為 1，否則為 0。

如果 n 非常小（例如，國家 / 地區列可能有大約 200 個不同的值），則這些點陣圖可以將每行儲存成一個位元位。但是，如果 n 更大，大部分點陣圖中將會有很多的零（我們說它們是稀疏的）。在這種情況下，點陣圖可以另外再進行遊程編碼（run-length encoding，一種無損資料壓縮技術），如 [圖 3-11](/v1/ddia_0311.png) 底部所示。這可以使列的編碼非常緊湊。

這些點陣圖索引非常適合資料倉庫中常見的各種查詢。例如：

```sql
WHERE product_sk IN (30, 68, 69)
```

載入 `product_sk = 30`、`product_sk = 68` 和 `product_sk = 69` 這三個點陣圖，並計算三個點陣圖的按位或（OR），這可以非常有效地完成。

```sql
WHERE product_sk = 31 AND store_sk = 3
```

載入 `product_sk = 31` 和 `store_sk = 3` 的點陣圖，並計算按位與（AND）。這是因為列按照相同的順序包含行，因此一列的點陣圖中的第 k 位和另一列的點陣圖中的第 k 位對應相同的行。

對於不同種類的資料，也有各種不同的壓縮方案，但我們不會詳細討論它們，請參閱【58】的概述。

> #### 列式儲存和列族
>
> Cassandra 和 HBase 有一個列族（column families）的概念，他們從 Bigtable 繼承【9】。然而，把它們稱為列式（column-oriented）是非常具有誤導性的：在每個列族中，它們將一行中的所有列與行鍵一起儲存，並且不使用列壓縮。因此，Bigtable 模型仍然主要是面向行的。
>

#### 記憶體頻寬和向量化處理

對於需要掃描數百萬行的資料倉庫查詢來說，一個巨大的瓶頸是從硬碟獲取資料到記憶體的頻寬。但是，這不是唯一的瓶頸。分析型資料庫的開發人員還需要有效地利用記憶體到 CPU 快取的頻寬，避免 CPU 指令處理流水線中的分支預測錯誤和閒置等待，以及在現代 CPU 上使用單指令多資料（SIMD）指令來加速運算【59,60】。

除了減少需要從硬碟載入的資料量以外，列式儲存佈局也可以有效利用 CPU 週期。例如，查詢引擎可以將一整塊壓縮好的列資料放進 CPU 的 L1 快取中，然後在緊密的迴圈（即沒有函式呼叫）中遍歷。相比於每條記錄的處理都需要大量函式呼叫和條件判斷的程式碼，CPU 執行這樣一個迴圈要快得多。列壓縮允許列中的更多行被同時放進容量有限的 L1 快取。前面描述的按位 “與” 和 “或” 運算子可以被設計為直接在這樣的壓縮列資料塊上操作。這種技術被稱為向量化處理（vectorized processing）【58,49】。


### 列式儲存中的排序順序

在列式儲存中，儲存行的順序並不關鍵。按插入順序儲存它們是最簡單的，因為插入一個新行只需要追加到每個列檔案。但是，我們也可以選擇按某種順序來排列資料，就像我們之前對 SSTables 所做的那樣，並將其用作索引機制。

注意，對每列分別執行排序是沒有意義的，因為那樣就沒法知道不同列中的哪些項屬於同一行。我們只能在明確一列中的第 k 項與另一列中的第 k 項屬於同一行的情況下，才能重建出完整的行。

相反，資料的排序需要對一整行統一操作，即使它們的儲存方式是按列的。資料庫管理員可以根據他們對常用查詢的瞭解，來選擇表格中用來排序的列。例如，如果查詢通常以日期範圍為目標，例如“上個月”，則可以將 `date_key` 作為第一個排序鍵。這樣查詢最佳化器就可以只掃描近1個月範圍的行了，這比掃描所有行要快得多。

對於第一排序列中具有相同值的行，可以用第二排序列來進一步排序。例如，如果 `date_key` 是 [圖 3-10](/v1/ddia_0310.png) 中的第一個排序關鍵字，那麼 `product_sk` 可能是第二個排序關鍵字，以便同一天的同一產品的所有銷售資料都被儲存在相鄰位置。這將有助於需要在特定日期範圍內按產品對銷售進行分組或過濾的查詢。

按順序排序的另一個好處是它可以幫助壓縮列。如果主要排序列沒有太多個不同的值，那麼在排序之後，將會得到一個相同的值連續重複多次的序列。一個簡單的遊程編碼（就像我們用於 [圖 3-11](/v1/ddia_0311.png) 中的點陣圖一樣）可以將該列壓縮到幾 KB —— 即使表中有數十億行。

第一個排序鍵的壓縮效果最強。第二和第三個排序鍵會更混亂，因此不會有這麼長的連續的重複值。排序優先順序更低的列以幾乎隨機的順序出現，所以可能不會被壓縮。但對前幾列做排序在整體上仍然是有好處的。

#### 幾個不同的排序順序

對這個想法，有一個巧妙的擴充套件被 C-Store 發現，並在商業資料倉庫 Vertica 中被採用【61,62】：既然不同的查詢受益於不同的排序順序，為什麼不以幾種不同的方式來儲存相同的資料呢？反正資料都需要做備份，以防單點故障時丟失資料。因此你可以用不同排序方式來儲存冗餘資料，以便在處理查詢時，呼叫最適合查詢模式的版本。

在一個列式儲存中有多個排序順序有點類似於在一個面向行的儲存中有多個次級索引。但最大的區別在於面向行的儲存將每一行儲存在一個地方（在堆檔案或聚集索引中），次級索引只包含指向匹配行的指標。在列式儲存中，通常在其他地方沒有任何指向資料的指標，只有包含值的列。

### 寫入列式儲存

這些最佳化在資料倉庫中是有意義的，因為其負載主要由分析人員執行的大型只讀查詢組成。列式儲存、壓縮和排序都有助於更快地讀取這些查詢。然而，他們的缺點是寫入更加困難。

使用 B 樹的就地更新方法對於壓縮的列是不可能的。如果你想在排序表的中間插入一行，你很可能不得不重寫所有的列檔案。由於行由列中的位置標識，因此插入必須對所有列進行一致地更新。

幸運的是，本章前面已經看到了一個很好的解決方案：LSM 樹。所有的寫操作首先進入一個記憶體中的儲存，在這裡它們被新增到一個已排序的結構中，並準備寫入硬碟。記憶體中的儲存是面向行還是列的並不重要。當已經積累了足夠的寫入資料時，它們將與硬碟上的列檔案合併，並批次寫入新檔案。這基本上是 Vertica 所做的【62】。

查詢操作需要檢查硬碟上的列資料和記憶體中的最近寫入，並將兩者的結果合併起來。但是，查詢最佳化器對使用者隱藏了這個細節。從分析師的角度來看，透過插入、更新或刪除操作進行修改的資料會立即反映在後續的查詢中。

### 聚合：資料立方體和物化檢視

並非所有資料倉庫都需要採用列式儲存：傳統的面向行的資料庫和其他一些架構也被使用。然而，列式儲存可以顯著加快專門的分析查詢，所以它正在迅速變得流行起來【51,63】。

資料倉庫的另一個值得一提的方面是物化聚合（materialized aggregates）。如前所述，資料倉庫查詢通常涉及一個聚合函式，如 SQL 中的 COUNT、SUM、AVG、MIN 或 MAX。如果相同的聚合被許多不同的查詢使用，那麼每次都透過原始資料來處理可能太浪費了。為什麼不將一些查詢使用最頻繁的計數或總和快取起來？

建立這種快取的一種方式是物化檢視（Materialized View）。在關係資料模型中，它的定義方式通常與標準（虛擬）檢視類似：一個類似於表的物件，其內容是一些查詢的結果。不同的是，物化檢視是查詢結果的實際副本，會被寫入硬碟，而虛擬檢視只是編寫查詢的一個捷徑。從虛擬檢視讀取時，SQL 引擎會將其展開到檢視的底層查詢中，然後再處理展開的查詢。

當底層資料發生變化時，物化檢視需要更新，因為它是資料的反正規化副本。資料庫可以自動完成該操作，但是這樣的更新使得寫入成本更高，這就是在 OLTP 資料庫中不經常使用物化檢視的原因。在讀取繁重的資料倉庫中，它們可能更有意義（它們是否實際上改善了讀取效能取決於使用場景）。

物化檢視的常見特例稱為資料立方體或 OLAP 立方【64】。它是按不同維度分組的聚合網格。[圖 3-12](/v1/ddia_0312.png) 顯示了一個例子。

![](/v1/ddia_0312.png)

**圖 3-12 資料立方的兩個維度，透過求和聚合**

想象一下，現在每個事實都只有兩個維度表的外部索引鍵 —— 在 [圖 3-12](/v1/ddia_0312.png) 中分別是日期和產品。你現在可以繪製一個二維表格，一個軸線上是日期，另一個軸線上是產品。每個單元格包含具有該日期 - 產品組合的所有事實的屬性（例如 `net_price`）的聚合（例如 `SUM`）。然後，你可以沿著每行或每列應用相同的彙總，並獲得減少了一個維度的彙總（按產品的銷售額，無論日期，或者按日期的銷售額，無論產品）。

一般來說，事實往往有兩個以上的維度。在圖 3-9 中有五個維度：日期、產品、商店、促銷和客戶。要想象一個五維超立方體是什麼樣子是很困難的，但是原理是一樣的：每個單元格都包含特定日期 - 產品 - 商店 - 促銷 - 客戶組合的銷售額。這些值可以在每個維度上求和彙總。

物化資料立方體的優點是可以讓某些查詢變得非常快，因為它們已經被有效地預先計算了。例如，如果你想知道每個商店的總銷售額，則只需檢視合適維度的總計，而無需掃描數百萬行的原始資料。

資料立方體的缺點是不具有查詢原始資料的靈活性。例如，沒有辦法計算有多少比例的銷售來自成本超過 100 美元的專案，因為價格不是其中的一個維度。因此，大多數資料倉庫試圖保留儘可能多的原始資料，並將聚合資料（如資料立方體）僅用作某些查詢的效能提升手段。


## 本章小結

在本章中，我們試圖深入瞭解資料庫是如何處理儲存和檢索的。將資料儲存在資料庫中會發生什麼？稍後再次查詢資料時資料庫會做什麼？

在高層次上，我們看到儲存引擎分為兩大類：針對 **事務處理（OLTP）** 最佳化的儲存引擎和針對 **線上分析（OLAP）** 最佳化的儲存引擎。這兩類使用場景的訪問模式之間有很大的區別：

* OLTP 系統通常面向終端使用者，這意味著系統可能會收到大量的請求。為了處理負載，應用程式在每個查詢中通常只訪問少量的記錄。應用程式使用某種鍵來請求記錄，儲存引擎使用索引來查詢所請求的鍵的資料。硬碟查詢時間往往是這裡的瓶頸。
* 資料倉庫和類似的分析系統會少見一些，因為它們主要由業務分析人員使用，而不是終端使用者。它們的查詢量要比 OLTP 系統少得多，但通常每個查詢開銷高昂，需要在短時間內掃描數百萬條記錄。硬碟頻寬（而不是查詢時間）往往是瓶頸，列式儲存是針對這種工作負載的日益流行的解決方案。

在 OLTP 這一邊，我們能看到兩派主流的儲存引擎：

* 日誌結構學派：只允許追加到檔案和刪除過時的檔案，但不會更新已經寫入的檔案。Bitcask、SSTables、LSM 樹、LevelDB、Cassandra、HBase、Lucene 等都屬於這個類別。
* 就地更新學派：將硬碟視為一組可以覆寫的固定大小的頁面。B 樹是這種理念的典範，用在所有主要的關係資料庫和許多非關係型資料庫中。

日誌結構的儲存引擎是相對較新的技術。他們的主要想法是，透過系統性地將隨機訪問寫入轉換為硬碟上的順序寫入，由於硬碟驅動器和固態硬碟的效能特點，可以實現更高的寫入吞吐量。

關於 OLTP，我們最後還介紹了一些更複雜的索引結構，以及針對所有資料都放在記憶體裡而最佳化的資料庫。

然後，我們暫時放下了儲存引擎的內部細節，查看了典型資料倉庫的高階架構，並說明了為什麼分析工作負載與 OLTP 差別很大：當你的查詢需要在大量行中順序掃描時，索引的重要性就會降低很多。相反，非常緊湊地編碼資料變得非常重要，以最大限度地減少查詢需要從硬碟讀取的資料量。我們討論了列式儲存如何幫助實現這一目標。

作為一名應用程式開發人員，如果你掌握了有關儲存引擎內部的知識，那麼你就能更好地瞭解哪種工具最適合你的特定應用程式。當你調整資料庫的最佳化引數時，這種理解讓你能夠設想增減某個值會產生怎樣的效果。

儘管本章不能讓你成為一個特定儲存引擎的調參專家，但它至少大機率使你有了足夠的概念與詞彙儲備去讀懂你所選擇的資料庫的文件。


## 參考文獻

1. Alfred V. Aho, John E. Hopcroft, and Jeffrey D. Ullman: *Data Structures and Algorithms*. Addison-Wesley, 1983. ISBN: 978-0-201-00023-8
1. Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein: *Introduction to Algorithms*, 3rd edition. MIT Press, 2009. ISBN: 978-0-262-53305-8
1. Justin Sheehy and David Smith: “[Bitcask: A Log-Structured Hash Table for Fast Key/Value Data](https://riak.com/assets/bitcask-intro.pdf),” Basho Technologies, April 2010.
1. Yinan Li, Bingsheng He, Robin Jun Yang, et al.: “[Tree Indexing on Solid State Drives](http://pages.cs.wisc.edu/~yinan/paper/fdtree_pvldb.pdf),” *Proceedings of the VLDB Endowment*, volume 3, number 1, pages 1195–1206, September 2010.
1. Goetz Graefe: “[Modern B-Tree Techniques](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=0b19f413ffb5bc68b43f3bd05a97c282a7c6d6ab),” *Foundations and Trends in Databases*, volume 3, number 4, pages 203–402, August 2011. [doi:10.1561/1900000028](http://dx.doi.org/10.1561/1900000028)
1. Jeffrey Dean and Sanjay Ghemawat: “[LevelDB Implementation Notes](https://github.com/google/leveldb/blob/master/doc/impl.md),” *github.com*.
1. Dhruba Borthakur: “[The History of RocksDB](https://rocksdb.blogspot.com/2013/11/the-history-of-rocksdb.html),” *rocksdb.blogspot.com*, November 24, 2013.
1. Matteo Bertozzi: “[Apache HBase I/O – HFile](https://blog.cloudera.com/apache-hbase-i-o-hfile/),” *blog.cloudera.com*, June 29, 2012.
1. Fay Chang, Jeffrey Dean, Sanjay Ghemawat, et al.: “[Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Patrick O'Neil, Edward Cheng, Dieter Gawlick, and Elizabeth O'Neil: “[The Log-Structured Merge-Tree (LSM-Tree)](http://www.cs.umb.edu/~poneil/lsmtree.pdf),” *Acta Informatica*, volume 33, number 4, pages 351–385, June 1996. [doi:10.1007/s002360050048](http://dx.doi.org/10.1007/s002360050048)
1. Mendel Rosenblum and John K. Ousterhout: “[The Design and Implementation of a Log-Structured File System](http://research.cs.wisc.edu/areas/os/Qual/papers/lfs.pdf),” *ACM Transactions on Computer Systems*, volume 10, number 1, pages 26–52, February 1992. [doi:10.1145/146941.146943](http://dx.doi.org/10.1145/146941.146943)
1. Adrien Grand: “[What Is in a Lucene Index?](http://www.slideshare.net/lucenerevolution/what-is-inaluceneagrandfinal),” at *Lucene/Solr Revolution*, November 14, 2013.
1. Deepak Kandepet: “[Hacking Lucene—The Index Format](https://web.archive.org/web/20160316190830/http://hackerlabs.github.io/blog/2011/10/01/hacking-lucene-the-index-format/index.html),” *hackerlabs.github.io*, October 1, 2011.
1. Michael McCandless: “[Visualizing Lucene's Segment Merges](http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html),” *blog.mikemccandless.com*, February 11, 2011.
1. Burton H. Bloom: “[Space/Time Trade-offs in Hash Coding with Allowable Errors](https://people.cs.umass.edu/~emery/classes/cmpsci691st/readings/Misc/p422-bloom.pdf),” *Communications of the ACM*, volume 13, number 7, pages 422–426, July 1970. [doi:10.1145/362686.362692](http://dx.doi.org/10.1145/362686.362692)
1. “[Operating Cassandra: Compaction](https://cassandra.apache.org/doc/latest/operating/compaction/index.html),” Apache Cassandra Documentation v4.0, 2016.
1. Rudolf Bayer and Edward M. McCreight: “[Organization and Maintenance of Large Ordered Indices](https://apps.dtic.mil/sti/citations/AD0712079),” Boeing Scientific Research Laboratories, Mathematical and Information Sciences Laboratory, report no. 20, July 1970.
1. Douglas Comer: “[The Ubiquitous B-Tree](https://carlosproal.com/ir/papers/p121-comer.pdf),” *ACM Computing Surveys*, volume 11, number 2, pages 121–137, June 1979. [doi:10.1145/356770.356776](http://dx.doi.org/10.1145/356770.356776)
1. Emmanuel Goossaert: “[Coding for SSDs](http://codecapsule.com/2014/02/12/coding-for-ssds-part-1-introduction-and-table-of-contents/),” *codecapsule.com*, February 12, 2014.
1. C. Mohan and Frank Levine: “[ARIES/IM: An Efficient and High Concurrency Index Management Method Using Write-Ahead Logging](http://www.ics.uci.edu/~cs223/papers/p371-mohan.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 1992. [doi:10.1145/130283.130338](http://dx.doi.org/10.1145/130283.130338)
1. Howard Chu: “[LDAP at Lightning Speed](https://buildstuff14.sched.com/event/08a1a368e272eb599a52e08b4c3c779d),” at *Build Stuff '14*, November 2014.
1. Bradley C. Kuszmaul: “[A Comparison of Fractal Trees to Log-Structured Merge (LSM) Trees](http://www.pandademo.com/wp-content/uploads/2017/12/A-Comparison-of-Fractal-Trees-to-Log-Structured-Merge-LSM-Trees.pdf),” *tokutek.com*, April 22, 2014.
1. Manos Athanassoulis, Michael S. Kester, Lukas M. Maas, et al.: “[Designing Access Methods: The RUM Conjecture](http://openproceedings.org/2016/conf/edbt/paper-12.pdf),” at *19th International Conference on Extending Database Technology* (EDBT), March 2016. [doi:10.5441/002/edbt.2016.42](http://dx.doi.org/10.5441/002/edbt.2016.42)
1. Peter Zaitsev: “[Innodb Double Write](https://www.percona.com/blog/2006/08/04/innodb-double-write/),” *percona.com*, August 4, 2006.
1. Tomas Vondra: “[On the Impact of Full-Page Writes](https://www.enterprisedb.com/blog/impact-full-page-writes),” *blog.2ndquadrant.com*, November 23, 2016.
1. Mark Callaghan: “[The Advantages of an LSM vs a B-Tree](http://smalldatum.blogspot.co.uk/2016/01/summary-of-advantages-of-lsm-vs-b-tree.html),” *smalldatum.blogspot.co.uk*, January 19, 2016.
1. Mark Callaghan: “[Choosing Between Efficiency and Performance with RocksDB](https://codemesh.io/codemesh2016/mark-callaghan),” at *Code Mesh*, November 4, 2016.
1. Michi Mutsuzaki: “[MySQL vs. LevelDB](https://github.com/m1ch1/mapkeeper/wiki/MySQL-vs.-LevelDB),” *github.com*, August 2011.
1. Benjamin Coverston, Jonathan Ellis, et al.: “[CASSANDRA-1608: Redesigned Compaction](https://issues.apache.org/jira/browse/CASSANDRA-1608),” *issues.apache.org*, July 2011.
1. Igor Canadi, Siying Dong, and Mark Callaghan: “[RocksDB Tuning Guide](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide),” *github.com*, 2016.
1. [*MySQL 5.7 Reference Manual*](http://dev.mysql.com/doc/refman/5.7/en/index.html). Oracle, 2014.
1. [*Books Online for SQL Server 2012*](https://learn.microsoft.com/en-us/previous-versions/sql/sql-server-2012/ms130214(v=sql.110)). Microsoft, 2012.
1. Joe Webb: “[Using Covering Indexes to Improve Query Performance](https://www.simple-talk.com/sql/learn-sql-server/using-covering-indexes-to-improve-query-performance/),” *simple-talk.com*, 29 September 2008.
1. Frank Ramsak, Volker Markl, Robert Fenk, et al.: “[Integrating the UB-Tree into a Database System Kernel](http://www.vldb.org/conf/2000/P263.pdf),” at *26th International Conference on Very Large Data Bases* (VLDB), September 2000.
1. The PostGIS Development Group: “[PostGIS 2.1.2dev Manual](http://postgis.net/docs/manual-2.1/),” *postgis.net*, 2014.
1. Robert Escriva, Bernard Wong, and Emin Gün Sirer: “[HyperDex: A Distributed, Searchable Key-Value Store](http://www.cs.princeton.edu/courses/archive/fall13/cos518/papers/hyperdex.pdf),” at *ACM SIGCOMM Conference*, August 2012. [doi:10.1145/2377677.2377681](http://dx.doi.org/10.1145/2377677.2377681)
1. Michael McCandless: “[Lucene's FuzzyQuery Is 100 Times Faster in 4.0](http://blog.mikemccandless.com/2011/03/lucenes-fuzzyquery-is-100-times-faster.html),” *blog.mikemccandless.com*, March 24, 2011.
1. Steffen Heinz, Justin Zobel, and Hugh E. Williams: “[Burst Tries: A Fast, Efficient Data Structure for String Keys](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),” *ACM Transactions on Information Systems*, volume 20, number 2, pages 192–223, April 2002. [doi:10.1145/506309.506312](http://dx.doi.org/10.1145/506309.506312)
1. Klaus U. Schulz and Stoyan Mihov: “[Fast String Correction with Levenshtein Automata](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.16.652),” *International Journal on Document Analysis and Recognition*, volume 5, number 1, pages 67–85, November 2002. [doi:10.1007/s10032-002-0082-8](http://dx.doi.org/10.1007/s10032-002-0082-8)
1. Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze: [*Introduction to Information Retrieval*](http://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at *nlp.stanford.edu/IR-book*
1. Michael Stonebraker, Samuel Madden, Daniel J. Abadi, et al.: “[The End of an Architectural Era (It’s Time for a Complete Rewrite)](http://nms.csail.mit.edu/~stavros/pubs/hstore.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. “[VoltDB Technical Overview White Paper](https://www.voltdb.com/files/voltdb-technical-overview/),” VoltDB, 2014.
1. Stephen M. Rumble, Ankita Kejriwal, and John K. Ousterhout: “[Log-Structured Memory for DRAM-Based Storage](https://www.usenix.org/system/files/conference/fast14/fast14-paper_rumble.pdf),” at *12th USENIX Conference on File and Storage Technologies* (FAST), February 2014.
1. Stavros Harizopoulos, Daniel J. Abadi, Samuel Madden, and Michael Stonebraker: “[OLTP Through the Looking Glass, and What We Found There](http://hstore.cs.brown.edu/papers/hstore-lookingglass.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376713](http://dx.doi.org/10.1145/1376616.1376713)
1. Justin DeBrabant, Andrew Pavlo, Stephen Tu, et al.: “[Anti-Caching: A New Approach to Database Management System Architecture](http://www.vldb.org/pvldb/vol6/p1942-debrabant.pdf),” *Proceedings of the VLDB Endowment*, volume 6, number 14, pages 1942–1953, September 2013.
1. Joy Arulraj, Andrew Pavlo, and Subramanya R. Dulloor: “[Let's Talk About Storage & Recovery Methods for Non-Volatile Memory Database Systems](http://www.pdl.cmu.edu/PDL-FTP/NVM/storage.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2749441](http://dx.doi.org/10.1145/2723372.2749441)
1. Edgar F. Codd, S. B. Codd, and C. T. Salley: “[Providing OLAP to User-Analysts: An IT Mandate](https://pdfs.semanticscholar.org/a0bd/1491a54a4de428c5eef9b836ef6ee2915fe7.pdf),” E. F. Codd Associates, 1993.
1. Surajit Chaudhuri and Umeshwar Dayal: “[An Overview of Data Warehousing and OLAP Technology](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/sigrecord.pdf),” *ACM SIGMOD Record*, volume 26, number 1, pages 65–74, March 1997. [doi:10.1145/248603.248616](http://dx.doi.org/10.1145/248603.248616)
1. Per-Åke Larson, Cipri Clinciu, Campbell Fraser, et al.: “[Enhancements to SQL Server Column Stores](http://research.microsoft.com/pubs/193599/Apollo3%20-%20Sigmod%202013%20-%20final.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Franz Färber, Norman May, Wolfgang Lehner, et al.: “[The SAP HANA Database – An Architecture Overview](http://sites.computer.org/debull/A12mar/hana.pdf),” *IEEE Data Engineering Bulletin*, volume 35, number 1, pages 28–33, March 2012.
1. Michael Stonebraker: “[The Traditional RDBMS Wisdom Is (Almost Certainly) All Wrong](http://slideshot.epfl.ch/talks/166),” presentation at *EPFL*, May 2013.
1. Daniel J. Abadi: “[Classifying the SQL-on-Hadoop Solutions](https://web.archive.org/web/20150622074951/http://hadapt.com/blog/2013/10/02/classifying-the-sql-on-hadoop-solutions/),” *hadapt.com*, October 2, 2013.
1. Marcel Kornacker, Alexander Behm, Victor Bittorf, et al.: “[Impala: A Modern, Open-Source SQL Engine for Hadoop](http://pandis.net/resources/cidr15impala.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
1. Sergey Melnik, Andrey Gubarev, Jing Jing Long, et al.: “[Dremel: Interactive Analysis of Web-Scale Datasets](https://research.google/pubs/pub36632/),” at *36th International Conference on Very Large Data Bases* (VLDB), pages 330–339, September 2010.
1. Ralph Kimball and Margy Ross: *The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*, 3rd edition. John Wiley & Sons, July 2013. ISBN: 978-1-118-53080-1
1. Derrick Harris: “[Why Apple, eBay, and Walmart Have Some of the Biggest Data Warehouses You’ve Ever Seen](https://web.archive.org/web/20221129085658/https://old.gigaom.com/2013/03/27/why-apple-ebay-and-walmart-have-some-of-the-biggest-data-warehouses-youve-ever-seen/),” *gigaom.com*, March 27, 2013.
1. Julien Le Dem: “[Dremel Made Simple with Parquet](https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html),” *blog.twitter.com*, September 11, 2013.
1. Daniel J. Abadi, Peter Boncz, Stavros Harizopoulos, et al.: “[The Design and Implementation of Modern Column-Oriented Database Systems](http://cs-www.cs.yale.edu/homes/dna/papers/abadi-column-stores.pdf),” *Foundations and Trends in Databases*, volume 5, number 3, pages 197–280, December 2013. [doi:10.1561/1900000024](http://dx.doi.org/10.1561/1900000024)
1. Peter Boncz, Marcin Zukowski, and Niels Nes: “[MonetDB/X100: Hyper-Pipelining Query Execution](http://cidrdb.org/cidr2005/papers/P19.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Jingren Zhou and Kenneth A. Ross: “[Implementing Database Operations Using SIMD Instructions](http://www1.cs.columbia.edu/~kar/pubsk/simd.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), pages 145–156, June 2002. [doi:10.1145/564691.564709](http://dx.doi.org/10.1145/564691.564709)
1. Michael Stonebraker, Daniel J. Abadi, Adam Batkin, et al.: “[C-Store: A Column-oriented DBMS](http://www.cs.umd.edu/~abadi/vldb.pdf),” at *31st International Conference on Very Large Data Bases* (VLDB), pages 553–564, September 2005.
1. Andrew Lamb, Matt Fuller, Ramakrishna Varadarajan, et al.: “[The Vertica Analytic Database: C-Store 7 Years Later](http://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf),” *Proceedings of the VLDB Endowment*, volume 5, number 12, pages 1790–1801, August 2012.
1. Julien Le Dem and Nong Li: “[Efficient Data Storage for Analytics with Apache Parquet 2.0](http://www.slideshare.net/julienledem/th-210pledem),” at *Hadoop Summit*, San Jose, June 2014.
1. Jim Gray, Surajit Chaudhuri, Adam Bosworth, et al.: “[Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals](http://arxiv.org/pdf/cs/0701155.pdf),” *Data Mining and Knowledge Discovery*, volume 1, number 1, pages 29–53, March 2007. [doi:10.1023/A:1009726021843](http://dx.doi.org/10.1023/A:1009726021843)

================================================
FILE: content/v1_tw/ch4.md
================================================
---
title: "第四章：編碼與演化"
linkTitle: "4. 編碼與演化"
weight: 104
breadcrumbs: false
---

![](/map/ch04.png)

> 唯變所適
>
> —— 以弗所的赫拉克利特，為柏拉圖所引（公元前 360 年）


應用程式不可避免地隨時間而變化。新產品的推出，對需求的深入理解，或者商業環境的變化，總會伴隨著 **功能（feature）** 的增增改改。[第一章](/v1_tw/ch1) 介紹了 **可演化性（evolvability）** 的概念：應該盡力構建能靈活適應變化的系統（請參閱 “[可演化性：擁抱變化](/v1_tw/ch1#可演化性：擁抱變化)”）。

在大多數情況下，修改應用程式的功能也意味著需要更改其儲存的資料：可能需要使用新的欄位或記錄型別，或者以新方式展示現有資料。

我們在 [第二章](/v1_tw/ch2) 討論的資料模型有不同的方法來應對這種變化。關係資料庫通常假定資料庫中的所有資料都遵循一個模式：儘管可以更改該模式（透過模式遷移，即 `ALTER` 語句），但是在任何時間點都有且僅有一個正確的模式。相比之下，**讀時模式**（schema-on-read，或 **無模式**，即 schemaless）資料庫不會強制一個模式，因此資料庫可以包含在不同時間寫入的新老資料格式的混合（請參閱 “[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)” ）。

當資料 **格式（format）** 或 **模式（schema）** 發生變化時，通常需要對應用程式程式碼進行相應的更改（例如，為記錄新增新欄位，然後修改程式開始讀寫該欄位）。但在大型應用程式中，程式碼變更通常不會立即完成：

* 對於 **服務端（server-side）** 應用程式，可能需要執行 **滾動升級 （rolling upgrade）** （也稱為 **階段釋出（staged rollout）** ），一次將新版本部署到少數幾個節點，檢查新版本是否執行正常，然後逐漸部署到所有的節點。這樣無需中斷服務即可部署新版本，為頻繁釋出提供了可行性，從而帶來更好的可演化性。
* 對於 **客戶端（client-side）** 應用程式，升不升級就要看使用者的心情了。使用者可能相當長一段時間裡都不會去升級軟體。

這意味著，新舊版本的程式碼，以及新舊資料格式可能會在系統中同時共處。系統想要繼續順利執行，就需要保持 **雙向相容性**：

向後相容 (backward compatibility)
: 新的程式碼可以讀取由舊的程式碼寫入的資料。

向前相容 (forward compatibility)
: 舊的程式碼可以讀取由新的程式碼寫入的資料。

向後相容性通常並不難實現：新程式碼的作者當然知道由舊程式碼使用的資料格式，因此可以顯式地處理它（最簡單的辦法是，保留舊程式碼即可讀取舊資料）。

向前相容性可能會更棘手，因為舊版的程式需要忽略新版資料格式中新增的部分。

本章中將介紹幾種編碼資料的格式，包括 JSON、XML、Protocol Buffers、Thrift 和 Avro。尤其將關注這些格式如何應對模式變化，以及它們如何對新舊程式碼資料需要共存的系統提供支援。然後將討論如何使用這些格式進行資料儲存和通訊：在 Web 服務中，**表述性狀態傳遞（REST）** 和 **遠端過程呼叫（RPC）**，以及 **訊息傳遞系統**（如 Actor 和訊息佇列）。

## 編碼資料的格式

程式通常（至少）使用兩種形式的資料：

1. 在記憶體中，資料儲存在物件、結構體、列表、陣列、散列表、樹等中。這些資料結構針對 CPU 的高效訪問和操作進行了最佳化（通常使用指標）。
2. 如果要將資料寫入檔案，或透過網路傳送，則必須將其 **編碼（encode）** 為某種自包含的位元組序列（例如，JSON 文件）。由於每個程序都有自己獨立的地址空間，一個程序中的指標對任何其他程序都沒有意義，所以這個位元組序列表示會與通常在記憶體中使用的資料結構完全不同。

> [!TIP] 除一些特殊情況外，例如某些記憶體對映檔案或直接在壓縮資料上操作（如 “[列壓縮](/v1_tw/ch3#列壓縮)” 中所述）。

所以，需要在兩種表示之間進行某種型別的翻譯。從記憶體中表示到位元組序列的轉換稱為 **編碼（Encoding）** （也稱為 **序列化（serialization）** 或 **編組（marshalling）**），反過來稱為 **解碼（Decoding）**（**解析（Parsing）**，**反序列化（deserialization）**，**反編組（unmarshalling）**）。

> [!TIP] 請注意，**編碼（encode）**  與 **加密（encryption）** 無關。本書不討論加密。

> [!WARNING] 術語衝突
> 不幸的是，在 [第七章](/v1_tw/ch7)： **事務（Transaction）** 的上下文裡，**序列化（Serialization）** 這個術語也出現了，而且具有完全不同的含義。儘管序列化可能是更常見的術語，為了避免術語過載，本書中堅持使用 **編碼（Encoding）** 表達此含義。

這是一個常見的問題，因而有許多庫和編碼格式可供選擇。首先讓我們概覽一下。

### 語言特定的格式

許多程式語言都內建了將記憶體物件編碼為位元組序列的支援。例如，Java 有 `java.io.Serializable` 【1】，Ruby 有 `Marshal`【2】，Python 有 `pickle`【3】，等等。許多第三方庫也存在，例如 `Kryo for Java` 【4】。

這些編碼庫非常方便，可以用很少的額外程式碼實現記憶體物件的儲存與恢復。但是它們也有一些深層次的問題：

* 這類編碼通常與特定的程式語言深度繫結，其他語言很難讀取這種資料。如果以這類編碼儲存或傳輸資料，那你就和這門語言綁死在一起了。並且很難將系統與其他組織的系統（可能用的是不同的語言）進行整合。
* 為了恢復相同物件型別的資料，解碼過程需要 **例項化任意類** 的能力，這通常是安全問題的一個來源【5】：如果攻擊者可以讓應用程式解碼任意的位元組序列，他們就能例項化任意的類，這會允許他們做可怕的事情，如遠端執行任意程式碼【6,7】。
* 在這些庫中，資料版本控制通常是事後才考慮的。因為它們旨在快速簡便地對資料進行編碼，所以往往忽略了向前和向後相容性帶來的麻煩問題。
* 效率（編碼或解碼所花費的 CPU 時間，以及編碼結構的大小）往往也是事後才考慮的。例如，Java 的內建序列化由於其糟糕的效能和臃腫的編碼而臭名昭著【8】。

因此，除非臨時使用，採用語言內建編碼通常是一個壞主意。

### JSON、XML和二進位制變體

當我們談到可以被多種程式語言讀寫的標準編碼時，JSON 和 XML 是最顯眼的角逐者。它們廣為人知，廣受支援，也 “廣受憎惡”。XML 經常受到批評：過於冗長且過分複雜【9】。JSON 的流行則主要源於（透過成為 JavaScript 的一個子集）Web 瀏覽器的內建支援，以及相對於 XML 的簡單性。CSV 是另一種流行的與語言無關的格式，儘管其功能相對較弱。

JSON，XML 和 CSV 屬於文字格式，因此具有人類可讀性（儘管它們的語法是一個熱門爭議話題）。除了表面的語法問題之外，它們也存在一些微妙的問題：

* **數字（numbers）** 編碼有很多模糊之處。在 XML 和 CSV 中，無法區分數字和碰巧由數字組成的字串（除了引用外部模式）。JSON 雖然區分字串與數字，但並不區分整數和浮點數，並且不能指定精度。
這在處理大數字時是個問題。例如大於 $2^{53}$ 的整數無法使用 IEEE 754 雙精度浮點數精確表示，因此在使用浮點數（例如 JavaScript）的語言進行分析時，這些數字會變得不準確。Twitter 有一個關於大於 $2^{53}$ 的數字的例子，它使用 64 位整數來標識每條推文。Twitter API 返回的 JSON 包含了兩個推特 ID，一個是 JSON 數字，另一個是十進位制字串，以解決 JavaScript 程式中無法正確解析數字的問題【10】。
* JSON 和 XML 對 Unicode 字串（即人類可讀的文字）有很好的支援，但是它們不支援二進位制資料（即不帶 **字元編碼（character encoding）** 的位元組序列）。二進位制串是很有用的功能，人們透過使用 Base64 將二進位制資料編碼為文字來繞過此限制。其特有的模式標識著這個值應當被解釋為 Base64 編碼的二進位制資料。這種方案雖然管用，但比較 Hacky，並且會增加三分之一的資料大小。
*  XML 【11】和 JSON 【12】都有可選的模式支援。這些模式語言相當強大，所以學習和實現起來都相當複雜。XML 模式的使用相當普遍，但許多基於 JSON 的工具才不會去折騰模式。對資料的正確解讀（例如區分數值與二進位制串）取決於模式中的資訊，因此不使用 XML/JSON 模式的應用程式可能需要對相應的編碼 / 解碼邏輯進行硬編碼。
* CSV 沒有任何模式，因此每行和每列的含義完全由應用程式自行定義。如果應用程式變更添加了新的行或列，那麼這種變更必須透過手工處理。CSV 也是一個相當模糊的格式（如果一個值包含逗號或換行符，會發生什麼？）。儘管其轉義規則已經被正式指定【13】，但並不是所有的解析器都正確地實現了標準。

儘管存在這些缺陷，但 JSON、XML 和 CSV 對很多需求來說已經足夠好了。它們很可能會繼續流行下去，特別是作為資料交換格式來說（即將資料從一個組織傳送到另一個組織）。在這種情況下，只要人們對格式是什麼意見一致，格式有多美觀或者效率有多高效就無所謂了。讓不同的組織就這些東西達成一致的難度超過了絕大多數問題。

#### 二進位制編碼

對於僅在組織內部使用的資料，使用最小公約數式的編碼格式壓力較小。例如，可以選擇更緊湊或更快的解析格式。雖然對小資料集來說，收益可以忽略不計；但一旦達到 TB 級別，資料格式的選型就會產生巨大的影響。

JSON 比 XML 簡潔，但與二進位制格式相比還是太佔空間。這一事實導致大量二進位制編碼版本 JSON（MessagePack、BSON、BJSON、UBJSON、BISON 和 Smile 等） 和 XML（例如 WBXML 和 Fast Infoset）的出現。這些格式已經在各種各樣的領域中採用，但是沒有一個能像文字版 JSON 和 XML 那樣被廣泛採用。

這些格式中的一些擴充套件了一組資料型別（例如，區分整數和浮點數，或者增加對二進位制字串的支援），另一方面，它們沒有改變 JSON / XML 的資料模型。特別是由於它們沒有規定模式，所以它們需要在編碼資料中包含所有的物件欄位名稱。也就是說，在 [例 4-1]() 中的 JSON 文件的二進位制編碼中，需要在某處包含字串 `userName`，`favoriteNumber` 和 `interests`。

**例 4-1 本章中用於展示二進位制編碼的示例記錄**

```json
{
    "userName": "Martin",
    "favoriteNumber": 1337,
    "interests": ["daydreaming", "hacking"]
}
```

我們來看一個 MessagePack 的例子，它是一個 JSON 的二進位制編碼。圖 4-1 顯示了如果使用 MessagePack 【14】對 [例 4-1]() 中的 JSON 文件進行編碼，則得到的位元組序列。前幾個位元組如下：

1. 第一個位元組 `0x83` 表示接下來是 **3** 個欄位（低四位 = `0x03`）的 **物件 object**（高四位 = `0x80`）。（如果你想知道一個物件有 15 個以上的欄位時會發生什麼情況：欄位的數量塞不進 4 個 bit 裡，那麼它會用另一個不同的型別識別符號，欄位的數量被編碼為兩個或四個位元組）。
2. 第二個位元組 `0xa8` 表示接下來是 **8** 位元組長（低四位 = `0x08`）的字串（高四位 = `0xa0`）。
3. 接下來八個位元組是 ASCII 字串形式的欄位名稱 `userName`。由於之前已經指明長度，不需要任何標記來標識字串的結束位置（或者任何轉義）。
4. 接下來的七個位元組對字首為 `0xa6` 的六個字母的字串值 `Martin` 進行編碼，依此類推。

二進位制編碼長度為 66 個位元組，僅略小於文字 JSON 編碼所佔的 81 個位元組（刪除了空白）。所有的 JSON 的二進位制編碼在這方面是相似的。空間節省了一丁點（以及解析加速）是否能彌補可讀性的損失，誰也說不準。

在下面的章節中，能達到比這好得多的結果，只用 32 個位元組對相同的記錄進行編碼。

![](/v1/ddia_0401.png)

**圖 4-1 使用 MessagePack 編碼的記錄（例 4-1）**

### Thrift與Protocol Buffers

Apache Thrift 【15】和 Protocol Buffers（protobuf）【16】是基於相同原理的二進位制編碼庫。Protocol Buffers 最初是在 Google 開發的，Thrift 最初是在 Facebook 開發的，並且都是在 2007~2008 開源的【17】。
Thrift 和 Protocol Buffers 都需要一個模式來編碼任何資料。要用 Thrift 對 [例 4-1]() 中的資料進行編碼，可以使用 Thrift **介面定義語言（IDL）** 來描述模式，如下所示：

```c
struct Person {
    1: required string       userName,
    2: optional i64          favoriteNumber,
    3: optional list<string> interests
}
```

Protocol Buffers 的等效模式定義看起來非常相似：

```protobuf
message Person {
    required string user_name       = 1;
    optional int64  favorite_number = 2;
    repeated string interests       = 3;
}
```

Thrift 和 Protocol Buffers 每一個都帶有一個程式碼生成工具，它採用了類似於這裡所示的模式定義，並且生成了以各種程式語言實現模式的類【18】。你的應用程式程式碼可以呼叫此生成的程式碼來對模式的記錄進行編碼或解碼。
用這個模式編碼的資料是什麼樣的？令人困惑的是，Thrift 有兩種不同的二進位制編碼格式 [^iii]，分別稱為 BinaryProtocol 和 CompactProtocol。先來看看 BinaryProtocol。使用這種格式的編碼來編碼 [例 4-1]() 中的訊息只需要 59 個位元組，如 [圖 4-2](/v1/ddia_0402.png) 所示【19】。

![](/v1/ddia_0402.png)

**圖 4-2 使用 Thrift 二進位制協議編碼的記錄**

[^iii]: 實際上，Thrift 有三種二進位制協議：BinaryProtocol、CompactProtocol 和 DenseProtocol，儘管 DenseProtocol 只支援 C ++ 實現，所以不算作跨語言【18】。除此之外，它還有兩種不同的基於 JSON 的編碼格式【19】。真逗！

與 [圖 4-1](/v1/ddia_0401.png) 類似，每個欄位都有一個型別註釋（用於指示它是一個字串、整數、列表等），還可以根據需要指定長度（字串的長度，列表中的專案數） 。出現在資料中的字串 `(“Martin”, “daydreaming”, “hacking”)` 也被編碼為 ASCII（或者說，UTF-8），與之前類似。

與 [圖 4-1](/v1/ddia_0401.png) 相比，最大的區別是沒有欄位名 `(userName, favoriteNumber, interests)`。相反，編碼資料包含欄位標籤，它們是數字 `(1, 2 和 3)`。這些是模式定義中出現的數字。欄位標記就像欄位的別名 - 它們是說我們正在談論的欄位的一種緊湊的方式，而不必拼出欄位名稱。

Thrift CompactProtocol 編碼在語義上等同於 BinaryProtocol，但是如 [圖 4-3](/v1/ddia_0403.png) 所示，它將相同的資訊打包成只有 34 個位元組。它透過將欄位型別和標籤號打包到單個位元組中，並使用可變長度整數來實現。數字 1337 不是使用全部八個位元組，而是用兩個位元組編碼，每個位元組的最高位用來指示是否還有更多的位元組。這意味著 -64 到 63 之間的數字被編碼為一個位元組，-8192 和 8191 之間的數字以兩個位元組編碼，等等。較大的數字使用更多的位元組。

![](/v1/ddia_0403.png)

**圖 4-3 使用 Thrift 壓縮協議編碼的記錄**

最後，Protocol Buffers（只有一種二進位制編碼格式）對相同的資料進行編碼，如 [圖 4-4](/v1/ddia_0404.png) 所示。它的打包方式稍有不同，但與 Thrift 的 CompactProtocol 非常相似。Protobuf 將同樣的記錄塞進了 33 個位元組中。

![](/v1/ddia_0404.png)

**圖 4-4 使用 Protobuf 編碼的記錄**

需要注意的一個細節：在前面所示的模式中，每個欄位被標記為必需或可選，但是這對欄位如何編碼沒有任何影響（二進位制資料中沒有任何欄位指示某欄位是否必須）。區別在於，如果欄位設定為 `required`，但未設定該欄位，則所需的執行時檢查將失敗，這對於捕獲錯誤非常有用。

#### 欄位標籤和模式演變

我們之前說過，模式不可避免地需要隨著時間而改變。我們稱之為模式演變。Thrift 和 Protocol Buffers 如何處理模式更改，同時保持向後相容性？

從示例中可以看出，編碼的記錄就是其編碼欄位的拼接。每個欄位由其標籤號碼（樣本模式中的數字 1,2,3）標識，並用資料型別（例如字串或整數）註釋。如果沒有設定欄位值，則簡單地從編碼記錄中省略。從中可以看到，欄位標記對編碼資料的含義至關重要。你可以更改架構中欄位的名稱，因為編碼的資料永遠不會引用欄位名稱，但不能更改欄位的標記，因為這會使所有現有的編碼資料無效。

你可以新增新的欄位到架構，只要你給每個欄位一個新的標籤號碼。如果舊的程式碼（不知道你新增的新的標籤號碼）試圖讀取新程式碼寫入的資料，包括一個新的欄位，其標籤號碼不能識別，它可以簡單地忽略該欄位。資料型別註釋允許解析器確定需要跳過的位元組數。這保持了向前相容性：舊程式碼可以讀取由新程式碼編寫的記錄。

向後相容性呢？只要每個欄位都有一個唯一的標籤號碼，新的程式碼總是可以讀取舊的資料，因為標籤號碼仍然具有相同的含義。唯一的細節是，如果你新增一個新的欄位，你不能設定為必需。如果你要新增一個欄位並將其設定為必需，那麼如果新程式碼讀取舊程式碼寫入的資料，則該檢查將失敗，因為舊程式碼不會寫入你新增的新欄位。因此，為了保持向後相容性，在模式的初始部署之後 **新增的每個欄位必須是可選的或具有預設值**。

刪除一個欄位就像新增一個欄位，只是這回要考慮的是向前相容性。這意味著你只能刪除可選的欄位（必需欄位永遠不能刪除），而且你不能再次使用相同的標籤號碼（因為你可能仍然有資料寫在包含舊標籤號碼的地方，而該欄位必須被新程式碼忽略）。

#### 資料型別和模式演變

如何改變欄位的資料型別？這也許是可能的 —— 詳細資訊請查閱相關的文件 —— 但是有一個風險，值將失去精度或被截斷。例如，假設你將一個 32 位的整數變成一個 64 位的整數。新程式碼可以輕鬆讀取舊程式碼寫入的資料，因為解析器可以用零填充任何缺失的位。但是，如果舊程式碼讀取由新程式碼寫入的資料，則舊程式碼仍使用 32 位變數來儲存該值。如果解碼的 64 位值不適合 32 位，則它將被截斷。

Protobuf 的一個奇怪的細節是，它沒有列表或陣列資料型別，而是有一個欄位的重複標記（`repeated`，這是除必需和可選之外的第三個選項）。如 [圖 4-4](/v1/ddia_0404.png) 所示，重複欄位的編碼正如它所說的那樣：同一個欄位標記只是簡單地在記錄中出現多次。這具有很好的效果，可以將可選（單值）欄位更改為重複（多值）欄位。讀取舊資料的新程式碼會看到一個包含零個或一個元素的列表（取決於該欄位是否存在）。讀取新資料的舊程式碼只能看到列表的最後一個元素。

Thrift 有一個專用的列表資料型別，它使用列表元素的資料型別進行引數化。這不允許 Protocol Buffers 所做的從單值到多值的演變，但是它具有支援巢狀列表的優點。

### Avro

Apache Avro 【20】是另一種二進位制編碼格式，與 Protocol Buffers 和 Thrift 有著有趣的不同。它是作為 Hadoop 的一個子專案在 2009 年開始的，因為 Thrift 不適合 Hadoop 的用例【21】。

Avro 也使用模式來指定正在編碼的資料的結構。它有兩種模式語言：一種（Avro IDL）用於人工編輯，一種（基於 JSON）更易於機器讀取。

我們用 Avro IDL 編寫的示例模式可能如下所示：

```c
record Person {
    string                userName;
    union { null, long }  favoriteNumber = null;
    array<string>         interests;
}
```

等價的 JSON 表示：

```json
{
    "type": "record",
    "name": "Person",
    "fields": [
        {"name": "userName", "type": "string"},
        {"name": "favoriteNumber", "type": ["null", "long"], "default": null},
        {"name": "interests", "type": {"type": "array", "items": "string"}}
    ]
}
```

首先，請注意模式中沒有標籤號碼。如果我們使用這個模式編碼我們的例子記錄（[例 4-1]()），Avro 二進位制編碼只有 32 個位元組長，這是我們所見過的所有編碼中最緊湊的。編碼位元組序列的分解如 [圖 4-5](/v1/ddia_0405.png) 所示。

如果你檢查位元組序列，你可以看到沒有什麼可以識別欄位或其資料型別。編碼只是由連在一起的值組成。一個字串只是一個長度字首，後跟 UTF-8 位元組，但是在被包含的資料中沒有任何內容告訴你它是一個字串。它也完全可能是一個整數，或者是其他型別的資料。整數使用可變長度編碼（與 Thrift 的 CompactProtocol 相同）進行編碼。

![](/v1/ddia_0405.png)

**圖 4-5 使用 Avro 編碼的記錄**

為了解析二進位制資料，你按照它們出現在模式中的順序遍歷這些欄位，並使用模式來告訴你每個欄位的資料型別。這意味著只有當讀取資料的程式碼使用與寫入資料的程式碼完全相同的模式時，才能正確解碼二進位制資料。Reader 和 Writer 之間的模式不匹配意味著錯誤地解碼資料。

那麼，Avro 如何支援模式演變呢？

#### Writer模式與Reader模式

有了 Avro，當應用程式想要編碼一些資料（將其寫入檔案或資料庫、透過網路傳送等）時，它使用它知道的任何版本的模式編碼資料，例如，模式可能被編譯到應用程式中。這被稱為 Writer 模式。

當一個應用程式想要解碼一些資料（從一個檔案或資料庫讀取資料、從網路接收資料等）時，它希望資料在某個模式中，這就是 Reader 模式。這是應用程式程式碼所依賴的模式，在應用程式的構建過程中，程式碼可能已經從該模式生成。

Avro 的關鍵思想是 Writer 模式和 Reader 模式不必是相同的 - 他們只需要相容。當資料解碼（讀取）時，Avro 庫透過並排檢視 Writer 模式和 Reader 模式並將資料從 Writer 模式轉換到 Reader 模式來解決差異。Avro 規範【20】確切地定義了這種解析的工作原理，如 [圖 4-6](/v1/ddia_0406.png) 所示。

例如，如果 Writer 模式和 Reader 模式的欄位順序不同，這是沒有問題的，因為模式解析透過欄位名匹配欄位。如果讀取資料的程式碼遇到出現在 Writer 模式中但不在 Reader 模式中的欄位，則忽略它。如果讀取資料的程式碼需要某個欄位，但是 Writer 模式不包含該名稱的欄位，則使用在 Reader 模式中宣告的預設值填充。

![](/v1/ddia_0406.png)

**圖 4-6 一個 Avro Reader 解決讀寫模式的差異**

#### 模式演變規則

使用 Avro，向前相容性意味著你可以將新版本的模式作為 Writer，並將舊版本的模式作為 Reader。相反，向後相容意味著你可以有一個作為 Reader 的新版本模式和作為 Writer 的舊版本模式。

為了保持相容性，你只能新增或刪除具有預設值的欄位（我們的 Avro 模式中的欄位 `favoriteNumber` 的預設值為 `null`）。例如，假設你添加了一個有預設值的欄位，這個新的欄位將存在於新模式而不是舊模式中。當使用新模式的 Reader 讀取使用舊模式寫入的記錄時，將為缺少的欄位填充預設值。

如果你要新增一個沒有預設值的欄位，新的 Reader 將無法讀取舊 Writer 寫的資料，所以你會破壞向後相容性。如果你要刪除沒有預設值的欄位，舊的 Reader 將無法讀取新 Writer 寫入的資料，因此你會打破向前相容性。在一些程式語言中，null 是任何變數可以接受的預設值，但在 Avro 中並不是這樣：如果要允許一個欄位為 `null`，則必須使用聯合型別。例如，`union {null, long, string} field;` 表示 field 可以是數字或字串，也可以是 `null`。如果要將 null 作為預設值，則它必須是 union 的分支之一 [^iv]。這樣的寫法比預設情況下就允許任何變數是 `null` 顯得更加冗長，但是透過明確什麼可以和什麼不可以是 `null`，有助於防止出錯【22】。

[^iv]: 確切地說，預設值必須是聯合的第一個分支的型別，儘管這是 Avro 的特定限制，而不是聯合型別的一般特徵。

因此，Avro 沒有像 Protocol Buffers 和 Thrift 那樣的 `optional` 和 `required` 標記（但它有聯合型別和預設值）。

只要 Avro 可以支援相應的型別轉換，就可以改變欄位的資料型別。更改欄位的名稱也是可能的，但有點棘手：Reader 模式可以包含欄位名稱的別名，所以它可以匹配舊 Writer 的模式欄位名稱與別名。這意味著更改欄位名稱是向後相容的，但不能向前相容。同樣，向聯合型別新增分支也是向後相容的，但不能向前相容。

#### 但Writer模式到底是什麼？

到目前為止，我們一直跳過了一個重要的問題：對於一段特定的編碼資料，Reader 如何知道其 Writer 模式？我們不能只將整個模式包括在每個記錄中，因為模式可能比編碼的資料大得多，從而使二進位制編碼節省的所有空間都是徒勞的。

答案取決於 Avro 使用的上下文。舉幾個例子：

有很多記錄的大檔案
: Avro 的一個常見用途 - 尤其是在 Hadoop 環境中 - 用於儲存包含數百萬條記錄的大檔案，所有記錄都使用相同的模式進行編碼（我們將在 [第十章](/v1_tw/ch10) 討論這種情況）。在這種情況下，該檔案的作者可以在檔案的開頭只包含一次 Writer 模式。Avro 指定了一個檔案格式（物件容器檔案）來做到這一點。

支援獨立寫入的記錄的資料庫
: 在一個資料庫中，不同的記錄可能會在不同的時間點使用不同的 Writer 模式來寫入 - 你不能假定所有的記錄都有相同的模式。最簡單的解決方案是在每個編碼記錄的開始處包含一個版本號，並在資料庫中保留一個模式版本列表。Reader 可以獲取記錄，提取版本號，然後從資料庫中獲取該版本號的 Writer 模式。使用該 Writer 模式，它可以解碼記錄的其餘部分（例如 Espresso 【23】就是這樣工作的）。

透過網路連線傳送記錄
: 當兩個程序透過雙向網路連線進行通訊時，他們可以在連線設定上協商模式版本，然後在連線的生命週期中使用該模式。Avro RPC 協議（請參閱 “[服務中的資料流：REST 與 RPC](#服務中的資料流：REST與RPC)”）就是這樣工作的。

具有模式版本的資料庫在任何情況下都是非常有用的，因為它充當文件併為你提供了檢查模式相容性的機會【24】。作為版本號，你可以使用一個簡單的遞增整數，或者你可以使用模式的雜湊。

#### 動態生成的模式

與 Protocol Buffers 和 Thrift 相比，Avro 方法的一個優點是架構不包含任何標籤號碼。但為什麼這很重要？在模式中保留一些數字有什麼問題？

不同之處在於 Avro 對動態生成的模式更友善。例如，假如你有一個關係資料庫，你想要把它的內容轉儲到一個檔案中，並且你想使用二進位制格式來避免前面提到的文字格式（JSON，CSV，SQL）的問題。如果你使用 Avro，你可以很容易地從關係模式生成一個 Avro 模式（在我們之前看到的 JSON 表示中），並使用該模式對資料庫內容進行編碼，並將其全部轉儲到 Avro 物件容器檔案【25】中。你為每個資料庫表生成一個記錄模式，每個列成為該記錄中的一個欄位。資料庫中的列名稱對映到 Avro 中的欄位名稱。

現在，如果資料庫模式發生變化（例如，一個表中添加了一列，刪除了一列），則可以從更新的資料庫模式生成新的 Avro 模式，並在新的 Avro 模式中匯出資料。資料匯出過程不需要注意模式的改變 - 每次執行時都可以簡單地進行模式轉換。任何讀取新資料檔案的人都會看到記錄的欄位已經改變，但是由於欄位是透過名字來標識的，所以更新的 Writer 模式仍然可以與舊的 Reader 模式匹配。

相比之下，如果你為此使用 Thrift 或 Protocol Buffers，則欄位標籤可能必須手動分配：每次資料庫模式更改時，管理員都必須手動更新從資料庫列名到欄位標籤的對映（這可能會自動化，但模式生成器必須非常小心，不要分配以前使用的欄位標籤）。這種動態生成的模式根本不是 Thrift 或 Protocol Buffers 的設計目標，而是 Avro 的。

#### 程式碼生成和動態型別的語言

Thrift 和 Protobuf 依賴於程式碼生成：在定義了模式之後，可以使用你選擇的程式語言生成實現此模式的程式碼。這在 Java、C++ 或 C# 等靜態型別語言中很有用，因為它允許將高效的記憶體中的資料結構用於解碼的資料，並且在編寫訪問資料結構的程式時允許在 IDE 中進行型別檢查和自動補全。

在動態型別程式語言（如 JavaScript、Ruby 或 Python）中，生成程式碼沒有太多意義，因為沒有編譯時型別檢查器來滿足。程式碼生成在這些語言中往往不受歡迎，因為它們本來就避免了顯式的編譯步驟。而且，對於動態生成的模式（例如從資料庫表生成的 Avro 模式），程式碼生成對獲取資料是一個不必要的障礙。

Avro 為靜態型別程式語言提供了可選的程式碼生成功能，但是它也可以在不生成任何程式碼的情況下使用。如果你有一個物件容器檔案（它嵌入了 Writer 模式），你可以簡單地使用 Avro 庫開啟它，並以與檢視 JSON 檔案相同的方式檢視資料。該檔案是自描述的，因為它包含所有必要的元資料。

這個屬性特別適用於動態型別的資料處理語言如 Apache Pig 【26】。在 Pig 中，你可以開啟一些 Avro 檔案，開始分析它們，並編寫派生資料集以 Avro 格式輸出檔案，而無需考慮模式。

### 模式的優點

正如我們所看到的，Protocol Buffers、Thrift 和 Avro 都使用模式來描述二進位制編碼格式。他們的模式語言比 XML 模式或者 JSON 模式簡單得多，而後者支援更詳細的驗證規則（例如，“該欄位的字串值必須與該正則表示式匹配” 或 “該欄位的整數值必須在 0 和 100 之間” ）。由於 Protocol Buffers，Thrift 和 Avro 實現起來更簡單，使用起來也更簡單，所以它們已經發展到支援相當廣泛的程式語言。

這些編碼所基於的想法絕不是新的。例如，它們與 ASN.1 有很多相似之處，它是 1984 年首次被標準化的模式定義語言【27】。它被用來定義各種網路協議，例如其二進位制編碼（DER）仍然被用於編碼 SSL 證書（X.509）【28】。ASN.1 支援使用標籤號碼的模式演進，類似於 Protocol Buffers 和 Thrift 【29】。然而，它也非常複雜，而且沒有好的配套文件，所以 ASN.1 可能不是新應用程式的好選擇。

許多資料系統也為其資料實現了某種專有的二進位制編碼。例如，大多數關係資料庫都有一個網路協議，你可以透過該協議向資料庫傳送查詢並獲取響應。這些協議通常特定於特定的資料庫，並且資料庫供應商提供將來自資料庫的網路協議的響應解碼為記憶體資料結構的驅動程式（例如使用 ODBC 或 JDBC API）。

所以，我們可以看到，儘管 JSON、XML 和 CSV 等文字資料格式非常普遍，但基於模式的二進位制編碼也是一個可行的選擇。他們有一些很好的屬性：

* 它們可以比各種 “二進位制 JSON” 變體更緊湊，因為它們可以省略編碼資料中的欄位名稱。
* 模式是一種有價值的文件形式，因為模式是解碼所必需的，所以可以確定它是最新的（而手動維護的文件可能很容易偏離現實）。
* 維護一個模式的資料庫允許你在部署任何內容之前檢查模式更改的向前和向後相容性。
* 對於靜態型別程式語言的使用者來說，從模式生成程式碼的能力是有用的，因為它可以在編譯時進行型別檢查。

總而言之，模式演化保持了與 JSON 資料庫提供的無模式 / 讀時模式相同的靈活性（請參閱 “[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)”），同時還可以更好地保證你的資料並提供更好的工具。


## 資料流的型別

在本章的開始部分，我們曾經說過，無論何時你想要將某些資料傳送到不共享記憶體的另一個程序，例如，只要你想透過網路傳送資料或將其寫入檔案，就需要將它編碼為一個位元組序列。然後我們討論了做這個的各種不同的編碼。

我們討論了向前和向後的相容性，這對於可演化性來說非常重要（透過允許你獨立升級系統的不同部分，而不必一次改變所有內容，可以輕鬆地進行更改）。相容性是編碼資料的一個程序和解碼它的另一個程序之間的一種關係。

這是一個相當抽象的概念 - 資料可以透過多種方式從一個流程流向另一個流程。誰編碼資料，誰解碼？在本章的其餘部分中，我們將探討資料如何在流程之間流動的一些最常見的方式：

* 透過資料庫（請參閱 “[資料庫中的資料流](#資料庫中的資料流)”）
* 透過服務呼叫（請參閱 “[服務中的資料流：REST 與 RPC](#服務中的資料流：REST與RPC)”）
* 透過非同步訊息傳遞（請參閱 “[訊息傳遞中的資料流](#訊息傳遞中的資料流)”）


### 資料庫中的資料流

在資料庫中，寫入資料庫的過程對資料進行編碼，從資料庫讀取的過程對資料進行解碼。可能只有一個程序訪問資料庫，在這種情況下，讀者只是相同程序的後續版本 - 在這種情況下，你可以考慮將資料庫中的內容儲存為向未來的自我傳送訊息。

向後相容性顯然是必要的。否則你未來的自己將無法解碼你以前寫的東西。

一般來說，幾個不同的程序同時訪問資料庫是很常見的。這些程序可能是幾個不同的應用程式或服務，或者它們可能只是幾個相同服務的例項（為了可伸縮性或容錯性而並行執行）。無論哪種方式，在應用程式發生變化的環境中，訪問資料庫的某些程序可能會執行較新的程式碼，有些程序可能會執行較舊的程式碼，例如，因為新版本當前正在部署滾動升級，所以有些例項已經更新，而其他例項尚未更新。

這意味著資料庫中的一個值可能會被更新版本的程式碼寫入，然後被仍舊執行的舊版本的程式碼讀取。因此，資料庫也經常需要向前相容。

但是，還有一個額外的障礙。假設你將一個欄位新增到記錄模式，並且較新的程式碼將該新欄位的值寫入資料庫。隨後，舊版本的程式碼（尚不知道新欄位）將讀取記錄，更新記錄並將其寫回。在這種情況下，理想的行為通常是舊程式碼保持新的欄位不變，即使它不能被解釋。

前面討論的編碼格式支援未知欄位的儲存，但是有時候需要在應用程式層面保持謹慎，如圖 4-7 所示。例如，如果將資料庫值解碼為應用程式中的模型物件，稍後重新編碼這些模型物件，那麼未知欄位可能會在該翻譯過程中丟失。解決這個問題不是一個難題，你只需要意識到它。

![](/v1/ddia_0407.png)

**圖 4-7 當較舊版本的應用程式更新以前由較新版本的應用程式編寫的資料時，如果不小心，資料可能會丟失。**

#### 在不同的時間寫入不同的值

資料庫通常允許任何時候更新任何值。這意味著在一個單一的資料庫中，可能有一些值是五毫秒前寫的，而一些值是五年前寫的。

在部署應用程式的新版本時，也許用不了幾分鐘就可以將所有的舊版本替換為新版本（至少伺服器端應用程式是這樣的）。但資料庫內容並非如此：對於五年前的資料來說，除非對其進行顯式重寫，否則它仍然會以原始編碼形式存在。這種現象有時被概括為：資料的生命週期超出程式碼的生命週期。

將資料重寫（遷移）到一個新的模式當然是可能的，但是在一個大資料集上執行是一個昂貴的事情，所以大多數資料庫如果可能的話就避免它。大多數關係資料庫都允許簡單的模式更改，例如新增一個預設值為空的新列，而不重寫現有資料 [^v]。讀取舊行時，對於磁碟上的編碼資料缺少的任何列，資料庫將填充空值。LinkedIn 的文件資料庫 Espresso 使用 Avro 儲存，允許它使用 Avro 的模式演變規則【23】。

因此，模式演變允許整個資料庫看起來好像是用單個模式編碼的，即使底層儲存可能包含用各種歷史版本的模式編碼的記錄。

[^v]: 除了 MySQL，即使並非真的必要，它也經常會重寫整個表，正如 “[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)” 中所提到的。


#### 歸檔儲存

也許你不時為資料庫建立一個快照，例如備份或載入到資料倉庫（請參閱 “[資料倉庫](/v1_tw/ch3#資料倉庫)”）。在這種情況下，即使源資料庫中的原始編碼包含來自不同時代的模式版本的混合，資料轉儲通常也將使用最新模式進行編碼。既然你不管怎樣都要複製資料，那麼你可以對這個資料複製進行一致的編碼。

由於資料轉儲是一次寫入的，而且以後是不可變的，所以 Avro 物件容器檔案等格式非常適合。這也是一個很好的機會，可以將資料編碼為面向分析的列式格式，例如 Parquet（請參閱 “[列壓縮](/v1_tw/ch3#列壓縮)”）。

在 [第十章](/v1_tw/ch10) 中，我們將詳細討論使用檔案儲存中的資料。


### 服務中的資料流：REST與RPC

當你需要透過網路進行程序間的通訊時，安排該通訊的方式有幾種。最常見的安排是有兩個角色：客戶端和伺服器。伺服器透過網路公開 API，並且客戶端可以連線到伺服器以向該 API 發出請求。伺服器公開的 API 被稱為服務。

Web 以這種方式工作：客戶（Web 瀏覽器）向 Web 伺服器發出請求，透過 GET 請求下載 HTML、CSS、JavaScript、影像等，並透過 POST 請求提交資料到伺服器。API 包含一組標準的協議和資料格式（HTTP、URL、SSL/TLS、HTML 等）。由於網路瀏覽器、網路伺服器和網站作者大多同意這些標準，你可以使用任何網路瀏覽器訪問任何網站（至少在理論上！）。

Web 瀏覽器不是唯一的客戶端型別。例如，在移動裝置或桌面計算機上執行的本地應用程式也可以向伺服器發出網路請求，並且在 Web 瀏覽器內執行的客戶端 JavaScript 應用程式可以使用 XMLHttpRequest 成為 HTTP 客戶端（該技術被稱為 Ajax 【30】）。在這種情況下，伺服器的響應通常不是用於顯示給人的 HTML，而是便於客戶端應用程式程式碼進一步處理的編碼資料（如 JSON）。儘管 HTTP 可能被用作傳輸協議，但頂層實現的 API 是特定於應用程式的，客戶端和伺服器需要就該 API 的細節達成一致。

此外，伺服器本身可以是另一個服務的客戶端（例如，典型的 Web 應用伺服器充當資料庫的客戶端）。這種方法通常用於將大型應用程式按照功能區域分解為較小的服務，這樣當一個服務需要來自另一個服務的某些功能或資料時，就會向另一個服務發出請求。這種構建應用程式的方式傳統上被稱為 **面向服務的體系結構（service-oriented architecture，SOA）**，最近被改進和更名為 **微服務架構**【31,32】。

在某些方面，服務類似於資料庫：它們通常允許客戶端提交和查詢資料。但是，雖然資料庫允許使用我們在 [第二章](/v1_tw/ch2) 中討論的查詢語言進行任意查詢，但是服務公開了一個特定於應用程式的 API，它只允許由服務的業務邏輯（應用程式程式碼）預定的輸入和輸出【33】。這種限制提供了一定程度的封裝：服務能夠對客戶可以做什麼和不可以做什麼施加細粒度的限制。

面向服務 / 微服務架構的一個關鍵設計目標是透過使服務獨立部署和演化來使應用程式更易於更改和維護。例如，每個服務應該由一個團隊擁有，並且該團隊應該能夠經常釋出新版本的服務，而不必與其他團隊協調。換句話說，我們應該期望伺服器和客戶端的舊版本和新版本同時執行，因此伺服器和客戶端使用的資料編碼必須在不同版本的服務 API 之間相容 —— 這正是我們在本章所一直在談論的。

#### Web服務

**當服務使用 HTTP 作為底層通訊協議時，可稱之為 Web 服務**。這個名稱可能有點不夠準確，因為 Web 服務不僅在 Web 上使用，而且在幾個不同的環境中使用。例如：

1. 執行在使用者裝置上的客戶端應用程式（例如，移動裝置上的本地應用程式，或使用 Ajax 的 JavaScript web 應用程式）透過 HTTP 向服務發出請求。這些請求通常透過公共網際網路進行。
2. 一種服務向同一組織擁有的另一項服務提出請求，這些服務通常位於同一資料中心內，作為面向服務 / 微服務架構的一部分。（支援這種用例的軟體有時被稱為 **中介軟體（middleware）** ）
3. 一種服務透過網際網路向不同組織所擁有的服務提出請求。這用於不同組織後端系統之間的資料交換。此類別包括由線上服務（如信用卡處理系統）提供的公共 API，或用於共享訪問使用者資料的 OAuth。

有兩種流行的 Web 服務方法：REST 和 SOAP。他們在哲學方面幾乎是截然相反的，往往也是各自支持者之間的激烈辯論的主題 [^vi]。

[^vi]: 即使在每個陣營內也有很多爭論。例如，**HATEOAS（超媒體作為應用程式狀態的引擎）** 就經常引發討論【35】。

REST 不是一個協議，而是一個基於 HTTP 原則的設計哲學【34,35】。它強調簡單的資料格式，使用 URL 來標識資源，並使用 HTTP 功能進行快取控制，身份驗證和內容型別協商。與 SOAP 相比，REST 已經越來越受歡迎，至少在跨組織服務整合的背景下【36】，並經常與微服務相關【31】。根據 REST 原則設計的 API 稱為 RESTful。

相比之下，SOAP 是用於製作網路 API 請求的基於 XML 的協議 [^vii]。雖然它最常用於 HTTP，但其目的是獨立於 HTTP，並避免使用大多數 HTTP 功能。相反，它帶有龐大而複雜的多種相關標準（Web 服務框架，稱為 `WS-*`），它們增加了各種功能【37】。

[^vii]: 儘管首字母縮寫詞相似，SOAP 並不是 SOA 的要求。SOAP 是一種特殊的技術，而 SOA 是構建系統的一般方法。

SOAP Web 服務的 API 使用稱為 Web 服務描述語言（WSDL）的基於 XML 的語言來描述。WSDL 支援程式碼生成，客戶端可以使用本地類和方法呼叫（編碼為 XML 訊息並由框架再次解碼）訪問遠端服務。這在靜態型別程式語言中非常有用，但在動態型別程式語言中用處較小（請參閱 “[程式碼生成和動態型別的語言](#程式碼生成和動態型別的語言)”）。

由於 WSDL 的設計不是人類可讀的，而且由於 SOAP 訊息通常因為過於複雜而無法手動構建，所以 SOAP 的使用者在很大程度上依賴於工具支援，程式碼生成和 IDE【38】。對於 SOAP 供應商不支援的程式語言的使用者來說，與 SOAP 服務的整合是困難的。

儘管 SOAP 及其各種擴充套件表面上是標準化的，但是不同廠商的實現之間的互操作性往往會造成問題【39】。由於所有這些原因，儘管許多大型企業仍然使用 SOAP，但在大多數小公司中已經不再受到青睞。

REST 風格的 API 傾向於更簡單的方法，通常涉及較少的程式碼生成和自動化工具。定義格式（如 OpenAPI，也稱為 Swagger 【40】）可用於描述 RESTful API 並生成文件。

#### 遠端過程呼叫（RPC）的問題

Web 服務僅僅是透過網路進行 API 請求的一系列技術的最新版本，其中許多技術受到了大量的炒作，但是存在嚴重的問題。Enterprise JavaBeans（EJB）和 Java 的 **遠端方法呼叫（RMI）** 僅限於 Java。**分散式元件物件模型（DCOM）** 僅限於 Microsoft 平臺。**公共物件請求代理體系結構（CORBA）** 過於複雜，不提供向後或向前相容性【41】。

所有這些都是基於 **遠端過程呼叫（RPC）** 的思想，該過程呼叫自 20 世紀 70 年代以來一直存在【42】。RPC 模型試圖向遠端網路服務發出請求，看起來與在同一程序中呼叫程式語言中的函式或方法相同（這種抽象稱為位置透明）。儘管 RPC 起初看起來很方便，但這種方法根本上是有缺陷的【43,44】。網路請求與本地函式呼叫非常不同：

* 本地函式呼叫是可預測的，並且成功或失敗僅取決於受你控制的引數。網路請求是不可預測的：請求或響應可能由於網路問題會丟失，或者遠端計算機可能很慢或不可用，這些問題完全不在你的控制範圍之內。網路問題很常見，因此必須有所準備，例如重試失敗的請求。
* 本地函式呼叫要麼返回結果，要麼丟擲異常，或者永遠不返回（因為進入無限迴圈或程序崩潰）。網路請求有另一個可能的結果：由於超時，它返回時可能沒有結果。在這種情況下，你根本不知道發生了什麼：如果你沒有得到來自遠端服務的響應，你無法知道請求是否已送達（我們將在 [第八章](/v1_tw/ch8) 更詳細地討論這個問題）。
* 如果你重試失敗的網路請求，可能會發生請求實際上已經完成，只是響應丟失的情況。在這種情況下，重試將導致該操作被執行多次，除非你在協議中建立資料去重機制（**冪等性**，即 idempotence）。本地函式呼叫時沒有這樣的問題。（在 [第十一章](/v1_tw/ch11) 更詳細地討論冪等性）
* 每次呼叫本地函式時，通常需要大致相同的時間來執行。網路請求比函式呼叫要慢得多，而且其延遲也是非常可變的：好的時候它可能會在不到一毫秒的時間內完成，但是當網路擁塞或者遠端服務超載時，可能需要幾秒鐘的時間才能完成相同的操作。
* 呼叫本地函式時，可以高效地將引用（指標）傳遞給本地記憶體中的物件。當你發出一個網路請求時，所有這些引數都需要被編碼成可以透過網路傳送的一系列位元組。如果引數是像數字或字串這樣的基本型別倒是沒關係，但是對於較大的物件很快就會出現問題。
* 客戶端和服務可以用不同的程式語言實現，所以 RPC 框架必須將資料型別從一種語言翻譯成另一種語言。這可能會變得很醜陋，因為不是所有的語言都具有相同的型別 —— 例如回想一下 JavaScript 的數字大於 $2^{53}$ 的問題（請參閱 “[JSON、XML 和二進位制變體](#JSON、XML和二進位制變體)”）。用單一語言編寫的單個程序中不存在此問題。

所有這些因素意味著嘗試使遠端服務看起來像程式語言中的本地物件一樣毫無意義，因為這是一個根本不同的事情。REST 的部分吸引力在於，它並不試圖隱藏它是一個網路協議的事實（儘管這似乎並沒有阻止人們在 REST 之上構建 RPC 庫）。

#### RPC的當前方向

儘管有這樣那樣的問題，RPC 不會消失。在本章提到的所有編碼的基礎上構建了各種 RPC 框架：例如，Thrift 和 Avro 帶有 RPC 支援，gRPC 是使用 Protocol Buffers 的 RPC 實現，Finagle 也使用 Thrift，Rest.li 使用 JSON over HTTP。

這種新一代的 RPC 框架更加明確的是，遠端請求與本地函式呼叫不同。例如，Finagle 和 Rest.li 使用 futures（promises）來封裝可能失敗的非同步操作。`Futures` 還可以簡化需要並行請求多項服務並將其結果合併的情況【45】。gRPC 支援流，其中一個呼叫不僅包括一個請求和一個響應，還可以是隨時間的一系列請求和響應【46】。

其中一些框架還提供服務發現，即允許客戶端找出在哪個 IP 地址和埠號上可以找到特定的服務。我們將在 “[請求路由](/v1_tw/ch6#請求路由)” 中回到這個主題。

使用二進位制編碼格式的自定義 RPC 協議可以實現比通用的 JSON over REST 更好的效能。但是，RESTful API 還有其他一些顯著的優點：方便實驗和除錯（只需使用 Web 瀏覽器或命令列工具 curl，無需任何程式碼生成或軟體安裝即可向其請求），能被所有主流的程式語言和平臺所支援，還有大量可用的工具（伺服器、快取、負載平衡器、代理、防火牆、監控、除錯工具、測試工具等）的生態系統。

由於這些原因，REST 似乎是公共 API 的主要風格。RPC 框架的主要重點在於同一組織擁有的服務之間的請求，通常在同一資料中心內。

#### 資料編碼與RPC的演化

對於可演化性，重要的是可以獨立更改和部署 RPC 客戶端和伺服器。與透過資料庫流動的資料相比（如上一節所述），我們可以在透過服務進行資料流的情況下做一個簡化的假設：假定所有的伺服器都會先更新，其次是所有的客戶端。因此，你只需要在請求上具有向後相容性，並且對響應具有向前相容性。

RPC 方案的向後和向前相容性屬性是從它使用的編碼方式中繼承而來：

* Thrift、gRPC（Protobuf）和 Avro RPC 可以根據相應編碼格式的相容性規則進行演變。
* 在 SOAP 中，請求和響應是使用 XML 模式指定的。這些可以演變，但有一些微妙的陷阱【47】。
* RESTful API 通常使用 JSON（沒有正式指定的模式）用於響應，以及用於請求的 JSON 或 URI 編碼 / 表單編碼的請求引數。新增可選的請求引數並向響應物件新增新的欄位通常被認為是保持相容性的改變。

由於 RPC 經常被用於跨越組織邊界的通訊，服務的提供者經常無法控制其客戶，也不能強迫他們升級，所以服務的相容性變得更加困難。因此，需要長期保持相容性，也許是無限期的。如果需要進行破壞相容性的更改，則服務提供商通常會並排維護多個版本的服務 API。

關於 API 版本化應該如何工作（即，客戶端如何指示它想要使用哪個版本的 API）沒有一致意見【48】。對於 RESTful API，常用的方法是在 URL 或 HTTP Accept 頭中使用版本號。對於使用 API 金鑰來標識特定客戶端的服務，另一種選擇是將客戶端請求的 API 版本儲存在伺服器上，並允許透過單獨的管理介面更新該版本選項【49】。

### 訊息傳遞中的資料流

我們一直在研究從一個過程到另一個過程的編碼資料流的不同方式。到目前為止，我們已經討論了 REST 和 RPC（其中一個程序透過網路向另一個程序傳送請求並期望儘可能快的響應）以及資料庫（一個程序寫入編碼資料，另一個程序在將來再次讀取）。

在最後一節中，我們將簡要介紹一下 RPC 和資料庫之間的非同步訊息傳遞系統。它們與 RPC 類似，因為客戶端的請求（通常稱為訊息）以低延遲傳送到另一個程序。它們與資料庫類似，不是透過直接的網路連線傳送訊息，而是透過稱為訊息代理（也稱為訊息佇列或面向訊息的中介軟體）的中介來臨時儲存訊息。

與直接 RPC 相比，使用訊息代理有幾個優點：

* 如果收件人不可用或過載，可以充當緩衝區，從而提高系統的可靠性。
* 它可以自動將訊息重新發送到已經崩潰的程序，從而防止訊息丟失。
* 避免發件人需要知道收件人的 IP 地址和埠號（這在虛擬機器經常出入的雲部署中特別有用）。
* 它允許將一條訊息傳送給多個收件人。
* 將發件人與收件人邏輯分離（發件人只是釋出訊息，不關心誰來消費）。

然而，與 RPC 相比，差異在於訊息傳遞通訊通常是單向的：傳送者通常不期望收到其訊息的回覆。一個程序可能傳送一個響應，但這通常是在一個單獨的通道上完成的。這種通訊模式是非同步的：傳送者不會等待訊息被傳遞，而只是傳送它，然後忘記它。

#### 訊息代理

過去，**訊息代理（Message Broker）** 主要是 TIBCO、IBM WebSphere 和 webMethods 等公司的商業軟體的秀場。最近像 RabbitMQ、ActiveMQ、HornetQ、NATS 和 Apache Kafka 這樣的開源實現已經流行起來。我們將在 [第十一章](/v1_tw/ch11) 中對它們進行更詳細的比較。

詳細的交付語義因實現和配置而異，但通常情況下，訊息代理的使用方式如下：一個程序將訊息傳送到指定的佇列或主題，代理確保將訊息傳遞給那個佇列或主題的一個或多個消費者或訂閱者。在同一主題上可以有許多生產者和許多消費者。

一個主題只提供單向資料流。但是，消費者本身可能會將訊息釋出到另一個主題上（因此，可以將它們連結在一起，就像我們將在 [第十一章](/v1_tw/ch11) 中看到的那樣），或者傳送給原始訊息的傳送者使用的回覆佇列（允許請求 / 響應資料流，類似於 RPC）。

訊息代理通常不會執行任何特定的資料模型 —— 訊息只是包含一些元資料的位元組序列，因此你可以使用任何編碼格式。如果編碼是向後和向前相容的，你可以靈活地對釋出者和消費者的編碼進行獨立的修改，並以任意順序進行部署。

如果消費者重新發布訊息到另一個主題，則可能需要小心保留未知欄位，以防止前面在資料庫環境中描述的問題（[圖 4-7](/v1/ddia_0407.png)）。

#### 分散式的Actor框架

Actor 模型是單個程序中併發的程式設計模型。邏輯被封裝在 actor 中，而不是直接處理執行緒（以及競爭條件、鎖定和死鎖的相關問題）。每個 actor 通常代表一個客戶或實體，它可能有一些本地狀態（不與其他任何角色共享），它透過傳送和接收非同步訊息與其他角色通訊。不保證訊息傳送：在某些錯誤情況下，訊息將丟失。由於每個角色一次只能處理一條訊息，因此不需要擔心執行緒，每個角色可以由框架獨立排程。

在分散式 Actor 框架中，此程式設計模型用於跨多個節點伸縮應用程式。不管傳送方和接收方是在同一個節點上還是在不同的節點上，都使用相同的訊息傳遞機制。如果它們在不同的節點上，則該訊息被透明地編碼成位元組序列，透過網路傳送，並在另一側解碼。

位置透明在 actor 模型中比在 RPC 中效果更好，因為 actor 模型已經假定訊息可能會丟失，即使在單個程序中也是如此。儘管網路上的延遲可能比同一個程序中的延遲更高，但是在使用 actor 模型時，本地和遠端通訊之間的基本不匹配是較少的。

分散式的 Actor 框架實質上是將訊息代理和 actor 程式設計模型整合到一個框架中。但是，如果要執行基於 actor 的應用程式的滾動升級，則仍然需要擔心向前和向後相容性問題，因為訊息可能會從執行新版本的節點發送到執行舊版本的節點，反之亦然。

三個流行的分散式 actor 框架處理訊息編碼如下：

* 預設情況下，Akka 使用 Java 的內建序列化，不提供向前或向後相容性。但是，你可以用類似 Protocol Buffers 的東西替代它，從而獲得滾動升級的能力【50】。
* Orleans 預設使用不支援滾動升級部署的自定義資料編碼格式；要部署新版本的應用程式，你需要設定一個新的叢集，將流量從舊叢集遷移到新叢集，然後關閉舊叢集【51,52】。像 Akka 一樣，可以使用自定義序列化外掛。
* 在 Erlang OTP 中，對記錄模式進行更改是非常困難的（儘管系統具有許多為高可用性設計的功能）。滾動升級是可能的，但需要仔細計劃【53】。一個新的實驗性的 `maps` 資料型別（2014 年在 Erlang R17 中引入的類似於 JSON 的結構）可能使得這類更改在未來更容易【54】。


## 本章小結

在本章中，我們研究了將資料結構轉換為網路中的位元組或磁碟上的位元組的幾種方法。我們看到了這些編碼的細節不僅影響其效率，更重要的是也影響了應用程式的體系結構和部署它們的選項。

特別是，許多服務需要支援滾動升級，其中新版本的服務逐步部署到少數節點，而不是同時部署到所有節點。滾動升級允許在不停機的情況下發布新版本的服務（從而鼓勵頻繁釋出小型版本，而不是偶爾釋出大型版本），並使部署風險降低（允許在影響大量使用者之前檢測並回滾有故障的版本）。這些屬性對於可演化性，以及對應用程式進行更改的容易性都是非常有利的。

在滾動升級期間，或出於各種其他原因，我們必須假設不同的節點正在執行我們的應用程式程式碼的不同版本。因此，在系統周圍流動的所有資料都是以提供向後相容性（新程式碼可以讀取舊資料）和向前相容性（舊程式碼可以讀取新資料）的方式進行編碼是重要的。

我們討論了幾種資料編碼格式及其相容性屬性：

* 程式語言特定的編碼僅限於單一程式語言，並且往往無法提供向前和向後相容性。
* JSON、XML 和 CSV 等文字格式非常普遍，其相容性取決於你如何使用它們。他們有可選的模式語言，這有時是有用的，有時是一個障礙。這些格式對於資料型別有些模糊，所以你必須小心數字和二進位制字串。
* 像 Thrift、Protocol Buffers 和 Avro 這樣的二進位制模式驅動格式允許使用清晰定義的向前和向後相容性語義進行緊湊、高效的編碼。這些模式可以用於靜態型別語言的文件和程式碼生成。但是，他們有一個缺點，就是在資料可讀之前需要對資料進行解碼。

我們還討論了資料流的幾種模式，說明了資料編碼重要性的不同場景：

* 資料庫，寫入資料庫的程序對資料進行編碼，從資料庫讀取的程序對其進行解碼
* RPC 和 REST API，客戶端對請求進行編碼，伺服器對請求進行解碼並對響應進行編碼，客戶端最終對響應進行解碼
* 非同步訊息傳遞（使用訊息代理或 Actor），其中節點之間透過傳送訊息進行通訊，訊息由傳送者編碼並由接收者解碼

我們可以得出這樣的結論：只要稍加小心，向後 / 向前相容性和滾動升級是完全可以實現的。願你的應用程式快速演化、頻繁部署。


## 參考文獻

1. “[Java Object Serialization Specification](http://docs.oracle.com/javase/7/docs/platform/serialization/spec/serialTOC.html),” *docs.oracle.com*, 2010.
1. “[Ruby 2.2.0 API Documentation](http://ruby-doc.org/core-2.2.0/),” *ruby-doc.org*, Dec 2014.
1. “[The Python 3.4.3 Standard Library Reference Manual](https://docs.python.org/3/library/pickle.html),” *docs.python.org*, February 2015.
1. “[EsotericSoftware/kryo](https://github.com/EsotericSoftware/kryo),” *github.com*, October 2014.
1. “[CWE-502: Deserialization of Untrusted Data](http://cwe.mitre.org/data/definitions/502.html),” Common Weakness Enumeration, *cwe.mitre.org*, July 30, 2014.
1. Steve Breen: “[What Do WebLogic, WebSphere, JBoss, Jenkins, OpenNMS, and Your Application Have in Common? This Vulnerability](http://foxglovesecurity.com/2015/11/06/what-do-weblogic-websphere-jboss-jenkins-opennms-and-your-application-have-in-common-this-vulnerability/),” *foxglovesecurity.com*, November 6, 2015.
1. Patrick McKenzie: “[What the Rails Security Issue Means for Your Startup](http://www.kalzumeus.com/2013/01/31/what-the-rails-security-issue-means-for-your-startup/),” *kalzumeus.com*, January 31, 2013.
1. Eishay Smith: “[jvm-serializers wiki](https://github.com/eishay/jvm-serializers/wiki),” *github.com*, November 2014.
1. “[XML Is a Poor Copy of S-Expressions](http://c2.com/cgi/wiki?XmlIsaPoorCopyOfEssExpressions),” *c2.com* wiki.
1. Matt Harris: “[Snowflake: An Update and Some Very Important Information](https://groups.google.com/forum/#!topic/twitter-development-talk/ahbvo3VTIYI),” email to *Twitter Development Talk* mailing list, October 19, 2010.
1. Shudi (Sandy) Gao, C. M. Sperberg-McQueen, and Henry S. Thompson: “[XML Schema 1.1](http://www.w3.org/XML/Schema),” W3C Recommendation, May 2001.
1. Francis Galiegue, Kris Zyp, and Gary Court: “[JSON Schema](http://json-schema.org/),” IETF Internet-Draft, February 2013.
1. Yakov Shafranovich: “[RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://tools.ietf.org/html/rfc4180),” October 2005.
1. “[MessagePack Specification](http://msgpack.org/),” *msgpack.org*.
1. Mark Slee, Aditya Agarwal, and Marc Kwiatkowski: “[Thrift: Scalable Cross-Language Services Implementation](http://thrift.apache.org/static/files/thrift-20070401.pdf),” Facebook technical report, April 2007.
1. “[Protocol Buffers Developer Guide](https://developers.google.com/protocol-buffers/docs/overview),” Google, Inc., *developers.google.com*.
1. Igor Anishchenko: “[Thrift vs Protocol Buffers vs Avro - Biased Comparison](http://www.slideshare.net/IgorAnishchenko/pb-vs-thrift-vs-avro),” *slideshare.net*, September 17, 2012.
1. “[A Matrix of the Features Each Individual Language Library Supports](http://wiki.apache.org/thrift/LibraryFeatures),” *wiki.apache.org*.
1. Martin Kleppmann: “[Schema Evolution in Avro, Protocol Buffers and Thrift](http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html),” *martin.kleppmann.com*, December 5, 2012.
1. “[Apache Avro 1.7.7 Documentation](http://avro.apache.org/docs/1.7.7/),” *avro.apache.org*, July 2014.
1. Doug Cutting, Chad Walters, Jim Kellerman, et al.: “[&#91;PROPOSAL&#93; New Subproject: Avro](http://mail-archives.apache.org/mod_mbox/hadoop-general/200904.mbox/%3C49D53694.1050906@apache.org%3E),” email thread on *hadoop-general* mailing list, *mail-archives.apache.org*, April 2009.
1. Tony Hoare: “[Null References: The Billion Dollar Mistake](http://www.infoq.com/presentations/Null-References-The-Billion-Dollar-Mistake-Tony-Hoare),” at *QCon London*, March 2009.
1. Aditya Auradkar and Tom Quiggle: “[Introducing Espresso—LinkedIn's Hot New Distributed Document Store](https://engineering.linkedin.com/espresso/introducing-espresso-linkedins-hot-new-distributed-document-store),” *engineering.linkedin.com*, January 21, 2015.
1. Jay Kreps: “[Putting Apache Kafka to Use: A Practical Guide to Building a Stream Data Platform (Part 2)](http://blog.confluent.io/2015/02/25/stream-data-platform-2/),” *blog.confluent.io*, February 25, 2015.
1. Gwen Shapira: “[The Problem of Managing Schemas](http://radar.oreilly.com/2014/11/the-problem-of-managing-schemas.html),” *radar.oreilly.com*, November 4, 2014.
1. “[Apache Pig 0.14.0 Documentation](http://pig.apache.org/docs/r0.14.0/),” *pig.apache.org*, November 2014.
1. John Larmouth: [*ASN.1 Complete*](http://www.oss.com/asn1/resources/books-whitepapers-pubs/larmouth-asn1-book.pdf). Morgan Kaufmann, 1999. ISBN: 978-0-122-33435-1
1. Russell Housley, Warwick Ford, Tim Polk, and David Solo: “[RFC 2459: Internet X.509 Public Key Infrastructure: Certificate and CRL Profile](https://www.ietf.org/rfc/rfc2459.txt),” IETF Network Working Group, Standards Track, January 1999.
1. Lev Walkin: “[Question: Extensibility and Dropping Fields](http://lionet.info/asn1c/blog/2010/09/21/question-extensibility-removing-fields/),” *lionet.info*, September 21, 2010.
1. Jesse James Garrett: “[Ajax: A New Approach to Web Applications](https://web.archive.org/web/20181231094556/https://www.adaptivepath.com/ideas/ajax-new-approach-web-applications/),” *adaptivepath.com*, February 18, 2005.
1. Sam Newman: *Building Microservices*. O'Reilly Media, 2015. ISBN: 978-1-491-95035-7
1. Chris Richardson: “[Microservices: Decomposing Applications for Deployability and Scalability](http://www.infoq.com/articles/microservices-intro),” *infoq.com*, May 25, 2014.
1. Pat Helland: “[Data on the Outside Versus Data on the Inside](http://cidrdb.org/cidr2005/papers/P12.pdf),” at *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
1. Roy Thomas Fielding: “[Architectural Styles and the Design of Network-Based Software Architectures](https://www.ics.uci.edu/~fielding/pubs/dissertation/fielding_dissertation.pdf),” PhD Thesis, University of California, Irvine, 2000.
1. Roy Thomas Fielding: “[REST APIs Must Be Hypertext-Driven](http://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven),” *roy.gbiv.com*, October 20 2008.
1. “[REST in Peace, SOAP](https://royal.pingdom.com/rest-in-peace-soap/),” *royal.pingdom.com*, October 15, 2010.
1. “[Web Services Standards as of Q1 2007](https://www.innoq.com/resources/ws-standards-poster/),” *innoq.com*, February 2007.
1. Pete Lacey: “[The S Stands for Simple](http://harmful.cat-v.org/software/xml/soap/simple),” *harmful.cat-v.org*, November 15, 2006.
1. Stefan Tilkov: “[Interview: Pete Lacey Criticizes Web Services](http://www.infoq.com/articles/pete-lacey-ws-criticism),” *infoq.com*, December 12, 2006.
1. “[OpenAPI Specification (fka Swagger RESTful API Documentation Specification) Version 2.0](http://swagger.io/specification/),” *swagger.io*, September 8, 2014.
1. Michi Henning: “[The Rise and Fall of CORBA](https://cacm.acm.org/magazines/2008/8/5336-the-rise-and-fall-of-corba/fulltext),” *Communications of the ACM*, volume 51, number 8, pages 52–57, August 2008. [doi:10.1145/1378704.1378718](http://dx.doi.org/10.1145/1378704.1378718)
1. Andrew D. Birrell and Bruce Jay Nelson: “[Implementing Remote Procedure Calls](http://www.cs.princeton.edu/courses/archive/fall03/cs518/papers/rpc.pdf),” *ACM Transactions on Computer Systems* (TOCS), volume 2, number 1, pages 39–59, February 1984. [doi:10.1145/2080.357392](http://dx.doi.org/10.1145/2080.357392)
1. Jim Waldo, Geoff Wyant, Ann Wollrath, and Sam Kendall: “[A Note on Distributed Computing](http://m.mirror.facebook.net/kde/devel/smli_tr-94-29.pdf),” Sun Microsystems Laboratories, Inc., Technical Report TR-94-29, November 1994.
1. Steve Vinoski: “[Convenience over Correctness](http://steve.vinoski.net/pdf/IEEE-Convenience_Over_Correctness.pdf),” *IEEE Internet Computing*, volume 12, number 4, pages 89–92, July 2008. [doi:10.1109/MIC.2008.75](http://dx.doi.org/10.1109/MIC.2008.75)
1. Marius Eriksen: “[Your Server as a Function](http://monkey.org/~marius/funsrv.pdf),” at *7th Workshop on Programming Languages and Operating Systems* (PLOS), November 2013. [doi:10.1145/2525528.2525538](http://dx.doi.org/10.1145/2525528.2525538)
1. “[gRPC concepts](https://grpc.io/docs/guides/concepts/),” The Linux Foundation, *grpc.io*.
1. Aditya Narayan and Irina Singh: “[Designing and Versioning Compatible Web Services](https://web.archive.org/web/20141016000136/http://www.ibm.com/developerworks/websphere/library/techarticles/0705_narayan/0705_narayan.html),” *ibm.com*, March 28, 2007.
1. Troy Hunt: “[Your API Versioning Is Wrong, Which Is Why I Decided to Do It 3 Different Wrong Ways](http://www.troyhunt.com/2014/02/your-api-versioning-is-wrong-which-is.html),” *troyhunt.com*, February 10, 2014.
1. “[API Upgrades](https://stripe.com/docs/upgrades),” Stripe, Inc., April 2015.
1. Jonas Bonér: “[Upgrade in an Akka Cluster](http://grokbase.com/t/gg/akka-user/138wd8j9e3/upgrade-in-an-akka-cluster),” email to *akka-user* mailing list, *grokbase.com*, August 28, 2013.
1. Philip A. Bernstein, Sergey Bykov, Alan Geller, et al.: “[Orleans: Distributed Virtual Actors for Programmability and Scalability](https://www.microsoft.com/en-us/research/publication/orleans-distributed-virtual-actors-for-programmability-and-scalability/),” Microsoft Research Technical Report MSR-TR-2014-41, March 2014.
1. “[Microsoft Project Orleans Documentation](http://dotnet.github.io/orleans/),” Microsoft Research, *dotnet.github.io*, 2015.
1. David Mercer, Sean Hinde, Yinso Chen, and Richard A O'Keefe: “[beginner: Updating Data Structures](http://erlang.org/pipermail/erlang-questions/2007-October/030318.html),” email thread on *erlang-questions* mailing list, *erlang.com*, October 29, 2007.
1. Fred Hebert: “[Postscript: Maps](http://learnyousomeerlang.com/maps),” *learnyousomeerlang.com*, April 9, 2014.

================================================
FILE: content/v1_tw/ch5.md
================================================
---
title: "第五章：複製"
linkTitle: "5. 複製"
weight: 205
breadcrumbs: false
math: true
---

![](/map/ch05.png)

> 與可能出錯的東西比，“不可能”出錯的東西最顯著的特點就是：一旦真的出錯，通常就徹底玩完了。
>
> —— 道格拉斯・亞當斯（1992）


複製意味著在透過網路連線的多臺機器上保留相同資料的副本。正如在 [第二部分](/v1_tw/part-ii) 的介紹中所討論的那樣，我們希望能複製資料，可能出於各種各樣的原因：

* 使得資料與使用者在地理上接近（從而減少延遲）
* 即使系統的一部分出現故障，系統也能繼續工作（從而提高可用性）
* 伸縮可以接受讀請求的機器數量（從而提高讀取吞吐量）

本章將假設你的資料集非常小，每臺機器都可以儲存整個資料集的副本。在 [第六章](/v1_tw/ch6) 中將放寬這個假設，討論對單個機器來說太大的資料集的分割（分片）。在後面的章節中，我們將討論複製資料系統中可能發生的各種故障，以及如何處理這些故障。

如果複製中的資料不會隨時間而改變，那複製就很簡單：將資料複製到每個節點一次就萬事大吉。複製的困難之處在於處理複製資料的 **變更（change）**，這就是本章所要講的。我們將討論三種流行的變更複製演算法：**單領導者（single leader，單主）**，**多領導者（multi leader，多主）** 和 **無領導者（leaderless，無主）**。幾乎所有分散式資料庫都使用這三種方法之一。

在複製時需要進行許多權衡：例如，使用同步複製還是非同步複製？如何處理失敗的副本？這些通常是資料庫中的配置選項，細節因資料庫而異，但原理在許多不同的實現中都類似。本章會討論這些決策的後果。

資料庫的複製算得上是老生常談了 ——70 年代研究得出的基本原則至今沒有太大變化【1】，因為網路的基本約束仍保持不變。然而在研究之外，許多開發人員仍然假設一個資料庫只有一個節點。分散式資料庫變為主流只是最近發生的事。許多程式設計師都是這一領域的新手，因此對於諸如 **最終一致性（eventual consistency）** 等問題存在許多誤解。在 “[複製延遲問題](#複製延遲問題)” 一節，我們將更加精確地瞭解最終一致性，並討論諸如 **讀己之寫（read-your-writes）** 和 **單調讀（monotonic read）** 等內容。

## 領導者與追隨者

每個儲存了一份資料庫資料的節點被稱為 **副本（replica）** 。當存在多個副本時，會不可避免的出現一個問題：如何確保所有資料都落在了所有的副本上？

每一次向資料庫的寫入操作都需要傳播到所有副本上，否則副本就會包含不一樣的資料。最常見的解決方案被稱為 **基於領導者的複製（leader-based replication）** （也稱 **主動/被動（active/passive）** 複製或 **主/從（master/slave）** 複製），如 [圖 5-1](/v1/ddia_0501.png) 所示。它的工作原理如下：

1. 其中一個副本被指定為 **領導者（leader）**，也稱為 **主庫（master|primary）** 。當客戶端要向資料庫寫入時，它必須將請求傳送給該 **領導者**，其會將新資料寫入其本地儲存。
2. 其他副本被稱為 **追隨者（followers）**，亦稱為 **只讀副本（read replicas）**、**從庫（slaves）**、**備庫（secondaries）** 或 **熱備（hot-standby）**[^i]。每當領導者將新資料寫入本地儲存時，它也會將資料變更傳送給所有的追隨者，稱之為 **複製日誌（replication log）** 或 **變更流（change stream）**。每個追隨者從領導者拉取日誌，並相應更新其本地資料庫副本，方法是按照與領導者相同的處理順序來進行所有寫入。
3. 當客戶想要從資料庫中讀取資料時，它可以向領導者或任一追隨者進行查詢。但只有領導者才能接受寫入操作（從客戶端的角度來看從庫都是隻讀的）。

[^i]: 不同的人對 **熱（hot）**、**溫（warm）** 和 **冷（cold）** 備份伺服器有不同的定義。例如在 PostgreSQL 中，**熱備（hot standby）** 指的是能接受客戶端讀請求的副本。而 **溫備（warm standby）** 只是追隨領導者，但不處理客戶端的任何查詢。就本書而言，這些差異並不重要。

![](/v1/ddia_0501.png)

**圖 5-1 基於領導者的（主/從）複製**

這種複製模式是許多關係資料庫的內建功能，如 PostgreSQL（從 9.0 版本開始）、MySQL、Oracle Data Guard【2】和 SQL Server 的 AlwaysOn 可用性組【3】。它也被用於一些非關係資料庫，包括 MongoDB、RethinkDB 和 Espresso【4】。最後，基於領導者的複製並不僅限於資料庫：像 Kafka【5】和 RabbitMQ 高可用佇列【6】這樣的分散式訊息代理也使用它。某些網路檔案系統，以及 DRBD 這樣的複製塊裝置也與之類似。

### 同步複製與非同步複製

複製系統的一個重要細節是：複製是 **同步（synchronously）** 發生的還是 **非同步（asynchronously）** 發生的。（在關係型資料庫中這通常是一個配置項，其他系統則通常硬編碼為其中一個）。

想象一下 [圖 5-1](/v1/ddia_0501.png) 中發生的場景，即網站的使用者更新他們的個人頭像。在某個時間點，客戶向主庫傳送更新請求；不久之後主庫就收到了請求。在某個時間點，主庫又會將資料變更轉發給自己的從庫。最終，主庫通知客戶更新成功。

[圖 5-2](/v1/ddia_0502.png) 顯示了系統各個元件之間的通訊：使用者客戶端、主庫和兩個從庫。時間從左向右流動。請求或響應訊息用粗箭頭表示。

![](/v1/ddia_0502.png)

**圖 5-2 基於領導者的複製：一個同步從庫和一個非同步從庫**

在 [圖 5-2](/v1/ddia_0502.png) 的示例中，從庫 1 的複製是同步的：在向用戶報告寫入成功並使結果對其他使用者可見之前，主庫需要等待從庫 1 的確認，確保從庫 1 已經收到寫入操作。而從庫 2 的複製是非同步的：主庫傳送訊息，但不等待該從庫的響應。

在這幅圖中，從庫 2 處理訊息前存在一個顯著的延遲。通常情況下，複製的速度相當快：大多數資料庫系統能在不到一秒內完成從庫的同步，但它們不能提供複製用時的保證。有些情況下，從庫可能落後主庫幾分鐘或更久，例如：從庫正在從故障中恢復，系統正在最大容量附近執行，或者當節點間存在網路問題時。

同步複製的優點是，從庫能保證有與主庫一致的最新資料副本。如果主庫突然失效，我們可以確信這些資料仍然能在從庫上找到。缺點是，如果同步從庫沒有響應（比如它已經崩潰，或者出現網路故障，或其它任何原因），主庫就無法處理寫入操作。主庫必須阻止所有寫入，並等待同步副本再次可用。

因此，將所有從庫都設定為同步的是不切實際的：任何一個節點的中斷都會導致整個系統停滯不前。實際上，如果在資料庫上啟用同步複製，通常意味著其中 **一個** 從庫是同步的，而其他的從庫則是非同步的。如果該同步從庫變得不可用或緩慢，則將一個非同步從庫改為同步執行。這保證你至少在兩個節點上擁有最新的資料副本：主庫和同步從庫。這種配置有時也被稱為 **半同步（semi-synchronous）**【7】。

通常情況下，基於領導者的複製都配置為完全非同步。在這種情況下，如果主庫失效且不可恢復，則任何尚未複製給從庫的寫入都會丟失。這意味著即使已經向客戶端確認成功，寫入也不能保證是 **持久（Durable）** 的。然而，一個完全非同步的配置也有優點：即使所有的從庫都落後了，主庫也可以繼續處理寫入。

弱化的永續性可能聽起來像是一個壞的折衷，但非同步複製其實已經被廣泛使用了，特別是在有很多從庫的場景下，或者當從庫在地理上分佈很廣的時候。我們將在討論 “[複製延遲問題](#複製延遲問題)” 時回到這個問題。

> ### 關於複製的研究
>
> 對於非同步複製系統而言，主庫故障時會丟失資料可能是一個嚴重的問題，因此研究人員仍在研究不丟資料但仍能提供良好效能和可用性的複製方法。例如，**鏈式複製（chain replication）**【8,9】是同步複製的一種變體，已經在一些系統（如 Microsoft Azure Storage【10,11】）中成功實現。
>
> 複製的一致性與 **共識**（consensus，使幾個節點就某個值達成一致）之間有著密切的聯絡，[第九章](/v1_tw/ch9) 將詳細地探討這一領域的理論。本章主要討論實踐中的資料庫常用的簡單複製形式。
>

### 設定新從庫

有時候需要設定一個新的從庫：也許是為了增加副本的數量，或替換失敗的節點。如何確保新的從庫擁有主庫資料的精確副本？

簡單地將資料檔案從一個節點複製到另一個節點通常是不夠的：客戶端不斷向資料庫寫入資料，資料總是在不斷地變化，標準的檔案複製會看到資料庫的不同部分在不同的時間點的內容，其結果可能沒有任何意義。

可以透過鎖定資料庫（使其不可用於寫入）來使磁碟上的檔案保持一致，但是這會違背高可用的目標。幸運的是，設定新從庫通常並不需要停機。從概念上講，其過程如下所示：

1. 在某個時刻獲取主庫的一致性快照（如果可能，不必鎖定整個資料庫）。大多數資料庫都具有這個功能，因為它是備份必需的。對於某些場景，可能需要第三方工具，例如用於 MySQL 的 innobackupex【12】。
2. 將快照複製到新的從庫節點。
3. 從庫連線到主庫，並拉取快照之後發生的所有資料變更。這要求快照與主庫複製日誌中的位置精確關聯。該位置有不同的名稱，例如 PostgreSQL 將其稱為 **日誌序列號（log sequence number，LSN）**，MySQL 將其稱為 **二進位制日誌座標（binlog coordinates）**。
4. 當從庫處理完快照之後積累的資料變更，我們就說它 **趕上（caught up）** 了主庫，現在它可以繼續及時處理主庫產生的資料變化了。

建立從庫的實際步驟因資料庫而異。在某些系統中，這個過程是完全自動化的，而在另外一些系統中，它可能是一個需要由管理員手動執行的、有點神秘的多步驟工作流。

### 處理節點宕機

系統中的任何節點都可能宕機，可能因為意外的故障，也可能由於計劃內的維護（例如，重啟機器以安裝核心安全補丁）。對運維而言，能在系統不中斷服務的情況下重啟單個節點好處多多。我們的目標是，即使個別節點失效，也能保持整個系統執行，並儘可能控制節點停機帶來的影響。

如何透過基於領導者的複製實現高可用？

#### 從庫失效：追趕恢復

在其本地磁碟上，每個從庫記錄從主庫收到的資料變更。如果從庫崩潰並重新啟動，或者，如果主庫和從庫之間的網路暫時中斷，則比較容易恢復：從庫可以從日誌中知道，在發生故障之前處理的最後一個事務。因此，從庫可以連線到主庫，並請求在從庫斷開期間發生的所有資料變更。當應用完所有這些變更後，它就趕上了主庫，並可以像以前一樣繼續接收資料變更流。

#### 主庫失效：故障切換

主庫失效處理起來相當棘手：其中一個從庫需要被提升為新的主庫，需要重新配置客戶端，以將它們的寫操作傳送給新的主庫，其他從庫需要開始拉取來自新主庫的資料變更。這個過程被稱為 **故障切換（failover）**。

故障切換可以手動進行（通知管理員主庫掛了，並採取必要的步驟來建立新的主庫）或自動進行。自動的故障切換過程通常由以下步驟組成：

1. 確認主庫失效。有很多事情可能會出錯：崩潰、停電、網路問題等等。沒有萬無一失的方法來檢測出現了什麼問題，所以大多數系統只是簡單使用 **超時（Timeout）** ：節點頻繁地相互來回傳遞訊息，如果一個節點在一段時間內（例如 30 秒）沒有響應，就認為它掛了（因為計劃內維護而故意關閉主庫不算）。
2. 選擇一個新的主庫。這可以透過選舉過程（主庫由剩餘副本以多數選舉產生）來完成，或者可以由之前選定的 **控制器節點（controller node）** 來指定新的主庫。主庫的最佳人選通常是擁有舊主庫最新資料副本的從庫（以最小化資料損失）。讓所有的節點同意一個新的領導者，是一個 **共識** 問題，將在 [第九章](/v1_tw/ch9) 詳細討論。
3. 重新配置系統以啟用新的主庫。客戶端現在需要將它們的寫請求傳送給新主庫（將在 “[請求路由](/v1_tw/ch6#請求路由)” 中討論這個問題）。如果舊主庫恢復，可能仍然認為自己是主庫，而沒有意識到其他副本已經讓它失去領導權了。系統需要確保舊主庫意識到新主庫的存在，併成為一個從庫。

故障切換的過程中有很多地方可能出錯：

* 如果使用非同步複製，則新主庫可能沒有收到老主庫宕機前最後的寫入操作。在選出新主庫後，如果老主庫重新加入叢集，又該如何處理這些老主庫尚未複製的寫入？在此期間，新主庫可能已經收到了與老主庫尚未複製的寫入相沖突的寫入。最常見的解決方案是簡單丟棄老主庫未複製的寫入，這很可能打破客戶對於資料永續性的期望。

* 如果資料庫需要和其他外部儲存相協調，那麼丟棄寫入內容是極其危險的操作。例如在 GitHub 【13】的一場事故中，一個過時的 MySQL 從庫被提升為主庫。資料庫使用自增 ID 作為主鍵，因為新主庫的計數器落後於老主庫的計數器，所以新主庫重新分配了一些已經被老主庫分配掉的 ID 作為主鍵。這些主鍵也在 Redis 中使用，主鍵重用使得 MySQL 和 Redis 中的資料產生不一致，最後導致一些私有資料洩漏到錯誤的使用者手中。

* 發生某些故障時（見 [第八章](/v1_tw/ch8)）可能會出現兩個節點都以為自己是主庫的情況。這種情況稱為 **腦裂（split brain）**，非常危險：如果兩個主庫都可以接受寫操作，卻沒有衝突解決機制（請參閱 “[多主複製](#多主複製)”），那麼資料就可能丟失或損壞。一些系統採取了安全防範措施：當檢測到兩個主庫節點同時存在時會關閉其中一個節點 [^ii]，但設計粗糙的機制可能最後會導致兩個節點都被關閉【14】。

  [^ii]: 這種機制稱為 **屏障（fencing）**，或者更充滿感情的術語是：**爆彼之頭（Shoot The Other Node In The Head, STONITH）**。我們將在 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)” 中對屏障進行詳細討論。

* 主庫被宣告死亡之前的正確超時應該怎麼配置？在主庫失效的情況下，超時時間越長意味著恢復時間也越長。但是如果超時設定太短，又可能會出現不必要的故障切換。例如，臨時的負載峰值可能導致節點的響應時間增加到超出超時時間，或者網路故障也可能導致資料包延遲。如果系統已經處於高負載或網路問題的困擾之中，那麼不必要的故障切換可能會讓情況變得更糟糕。

這些問題沒有簡單的解決方案。因此，即使軟體支援自動故障切換，不少運維團隊還是更願意手動執行故障切換。

節點故障、不可靠的網路、對副本一致性、永續性、可用性和延遲的權衡，這些問題實際上是分散式系統中的基本問題。[第八章](/v1_tw/ch8) 和 [第九章](/v1_tw/ch9) 將更深入地討論它們。

### 複製日誌的實現

基於領導者的複製在底層是如何工作的？實踐中有好幾種不同的複製方式，所以先簡要地看一下。

#### 基於語句的複製

在最簡單的情況下，主庫記錄下它執行的每個寫入請求（**語句**，即 statement）並將該語句日誌傳送給從庫。對於關係資料庫來說，這意味著每個 `INSERT`、`UPDATE` 或 `DELETE` 語句都被轉發給每個從庫，每個從庫解析並執行該 SQL 語句，就像直接從客戶端收到一樣。

雖然聽上去很合理，但有很多問題會搞砸這種複製方式：

* 任何呼叫 **非確定性函式（nondeterministic）** 的語句，可能會在每個副本上生成不同的值。例如，使用 `NOW()` 獲取當前日期時間，或使用 `RAND()` 獲取一個隨機數。
* 如果語句使用了 **自增列（auto increment）**，或者依賴於資料庫中的現有資料（例如，`UPDATE ... WHERE <某些條件>`），則必須在每個副本上按照完全相同的順序執行它們，否則可能會產生不同的效果。當有多個併發執行的事務時，這可能成為一個限制。
* 有副作用的語句（例如：觸發器、儲存過程、使用者定義的函式）可能會在每個副本上產生不同的副作用，除非副作用是絕對確定性的。

的確有辦法繞開這些問題 —— 例如，當語句被記錄時，主庫可以用固定的返回值替換掉任何不確定的函式呼叫，以便所有從庫都能獲得相同的值。但是由於邊緣情況實在太多了，現在通常會選擇其他的複製方法。

基於語句的複製在 5.1 版本前的 MySQL 中被使用到。因為它相當緊湊，現在有時候也還在用。但現在在預設情況下，如果語句中存在任何不確定性，MySQL 會切換到基於行的複製（稍後討論）。VoltDB 使用了基於語句的複製，但要求事務必須是確定性的，以此來保證安全【15】。

#### 傳輸預寫式日誌（WAL）

在 [第三章](/v1_tw/ch3) 中，我們討論了儲存引擎如何在磁碟上表示資料，我們也發現了通常會將寫操作追加到日誌中：

* 對於日誌結構儲存引擎（請參閱 “[SSTables 和 LSM 樹](/v1_tw/ch3#SSTables和LSM樹)”），日誌是主要的儲存位置。日誌段在後臺壓縮，並進行垃圾回收。
* 對於覆寫單個磁碟塊的 [B 樹](/v1_tw/ch3#B樹)，每次修改都會先寫入 **預寫式日誌（Write Ahead Log, WAL）**，以便崩潰後索引可以恢復到一個一致的狀態。

在任何一種情況下，該日誌都是包含了所有資料庫寫入的僅追加位元組序列。可以使用完全相同的日誌在另一個節點上構建副本：除了將日誌寫入磁碟之外，主庫還可以透過網路將其傳送給從庫。

透過使用這個日誌，從庫可以構建一個與主庫一模一樣的資料結構副本。

這種複製方法在 PostgreSQL 和 Oracle 等一些產品中被使用到【16】。其主要缺點是日誌記錄的資料非常底層：WAL 包含哪些磁碟塊中的哪些位元組發生了更改。這使複製與儲存引擎緊密耦合。如果資料庫將其儲存格式從一個版本更改為另一個版本，通常不可能在主庫和從庫上執行不同版本的資料庫軟體。

看上去這可能只是一個小的實現細節，但卻可能對運維產生巨大的影響。如果複製協議允許從庫使用比主庫更新的軟體版本，則可以先升級從庫，然後執行故障切換，使升級後的節點之一成為新的主庫，從而允許資料庫軟體的零停機升級。如果複製協議不允許版本不匹配（傳輸 WAL 經常出現這種情況），則此類升級需要停機。

#### 邏輯日誌複製（基於行）

另一種方法是對複製和儲存引擎使用不同的日誌格式，這樣可以將複製日誌從儲存引擎的內部實現中解耦出來。這種複製日誌被稱為邏輯日誌（logical log），以將其與儲存引擎的（物理）資料表示區分開來。

關係資料庫的邏輯日誌通常是以行的粒度來描述對資料庫表的寫入記錄的序列：

* 對於插入的行，日誌包含所有列的新值。
* 對於刪除的行，日誌包含足夠的資訊來唯一標識被刪除的行，這通常是主鍵，但如果表上沒有主鍵，則需要記錄所有列的舊值。
* 對於更新的行，日誌包含足夠的資訊來唯一標識被更新的行，以及所有列的新值（或至少所有已更改的列的新值）。

修改多行的事務會生成多條這樣的日誌記錄，後面跟著一條指明事務已經提交的記錄。MySQL 的二進位制日誌（當配置為使用基於行的複製時）使用了這種方法【17】。

由於邏輯日誌與儲存引擎的內部實現是解耦的，系統可以更容易地做到向後相容，從而使主庫和從庫能夠執行不同版本的資料庫軟體，或者甚至不同的儲存引擎。

對於外部應用程式來說，邏輯日誌格式也更容易解析。如果要將資料庫的內容傳送到外部系統，例如複製到資料倉庫進行離線分析，或建立自定義索引和快取【18】，這一點會很有用。這種技術被稱為 **資料變更捕獲（change data capture）**，[第十一章](/v1_tw/ch11) 將重新講到它。

#### 基於觸發器的複製

到目前為止描述的複製方法是由資料庫系統實現的，不涉及任何應用程式程式碼。在很多情況下，這就是你想要的。但在某些情況下需要更多的靈活性。例如，如果你只想複製資料的一個子集，或者想從一種資料庫複製到另一種資料庫，或者如果你需要衝突解決邏輯（請參閱 “[處理寫入衝突](#處理寫入衝突)”），則可能需要將複製操作上移到應用程式層。

一些工具，如 Oracle Golden Gate【19】，可以透過讀取資料庫日誌，使得其他應用程式可以使用資料。另一種方法是使用許多關係資料庫自帶的功能：觸發器和儲存過程。

觸發器允許你將資料更改（寫入事務）發生時自動執行的自定義應用程式程式碼註冊在資料庫系統中。觸發器有機會將更改記錄到一個單獨的表中，使用外部程式讀取這個表，再加上一些必要的業務邏輯，就可以將資料變更複製到另一個系統去。例如，Databus for Oracle【20】和 Bucardo for Postgres【21】就是這樣工作的。

基於觸發器的複製通常比其他複製方法具有更高的開銷，並且比資料庫內建的複製更容易出錯，也有很多限制。然而由於其靈活性，它仍然是很有用的。


## 複製延遲問題

容忍節點故障只是需要複製的一個原因。正如在 [第二部分](/v1_tw/part-ii) 的介紹中提到的，其它原因還包括可伸縮性（處理比單個機器更多的請求）和延遲（讓副本在地理位置上更接近使用者）。

基於領導者的複製要求所有寫入都由單個節點處理，但只讀查詢可以由任何一個副本來處理。所以對於讀多寫少的場景（Web 上的常見模式），一個有吸引力的選擇是建立很多從庫，並將讀請求分散到所有的從庫上去。這樣能減小主庫的負載，並允許由附近的副本來處理讀請求。

在這種讀伸縮（read-scaling）的體系結構中，只需新增更多的從庫，就可以提高只讀請求的服務容量。但是，這種方法實際上只適用於非同步複製 —— 如果嘗試同步複製到所有從庫，則單個節點故障或網路中斷將導致整個系統都無法寫入。而且節點越多越有可能出現個別節點宕機的情況，所以完全同步的配置將是非常不可靠的。

不幸的是，當應用程式從非同步從庫讀取時，如果從庫落後，它可能會看到過時的資訊。這會導致資料庫中出現明顯的不一致：同時對主庫和從庫執行相同的查詢，可能得到不同的結果，因為並非所有的寫入都反映在從庫中。這種不一致只是一個暫時的狀態 —— 如果停止寫入資料庫並等待一段時間，從庫最終會趕上並與主庫保持一致。出於這個原因，這種效應被稱為 **最終一致性（eventual consistency）**【22,23】。[^iii]

[^iii]: 道格拉斯・特里（Douglas Terry）等人【24】創造了最終一致性這個術語，並經由 Werner Vogels【22】的推廣，成為了許多 NoSQL 專案的口號。然而，最終一致性並不只屬於 NoSQL 資料庫：關係型資料庫中的非同步複製從庫也有相同的特性。

最終一致性中的 “最終” 一詞有意進行了模糊化：總的來說，副本落後的程度是沒有限制的。在正常的操作中，**複製延遲（replication lag）**，即寫入主庫到反映至從庫之間的延遲，可能僅僅是幾分之一秒，在實踐中並不顯眼。但如果系統在接近極限的情況下執行，或網路中存在問題時，延遲可以輕而易舉地超過幾秒，甚至達到幾分鐘。

因為滯後時間太長引入的不一致性，不僅僅是一個理論問題，更是應用設計中會遇到的真實問題。本節將重點介紹三個在複製延遲時可能發生的問題例項，並簡述解決這些問題的一些方法。

### 讀己之寫

許多應用讓使用者提交一些資料，然後檢視他們提交的內容。可能是使用者資料庫中的記錄，也可能是對討論主題的評論，或其他類似的內容。提交新資料時，必須將其傳送給主庫，但是當用戶檢視資料時，可以透過從庫進行讀取。如果資料經常被檢視，但只是偶爾寫入，這是非常合適的。

但對於非同步複製，問題就來了。如 [圖 5-3](/v1/ddia_0503.png) 所示：如果使用者在寫入後馬上就檢視資料，則新資料可能尚未到達副本。對使用者而言，看起來好像是剛提交的資料丟失了，所以他們不高興是可以理解的。

![](/v1/ddia_0503.png)

**圖 5-3 使用者寫入後從舊副本中讀取資料。需要寫後讀 (read-after-write) 的一致性來防止這種異常**

在這種情況下，我們需要 **寫後讀一致性（read-after-write consistency）**，也稱為 **讀己之寫一致性（read-your-writes consistency）**【24】。這是一個保證，如果使用者重新載入頁面，他們總會看到他們自己提交的任何更新。它不會對其他使用者的寫入做出承諾：其他使用者的更新可能稍等才會看到。它保證使用者自己的輸入已被正確儲存。

如何在基於領導者的複製系統中實現寫後讀一致性？有各種可能的技術，這裡說一些：

* 對於使用者 **可能修改過** 的內容，總是從主庫讀取；這就要求得有辦法不透過實際的查詢就可以知道使用者是否修改了某些東西。舉個例子，社交網路上的使用者個人資料資訊通常只能由使用者本人編輯，而不能由其他人編輯。因此一個簡單的規則就是：總是從主庫讀取使用者自己的檔案，如果要讀取其他使用者的檔案就去從庫。

* 如果應用中的大部分內容都可能被使用者編輯，那這種方法就沒用了，因為大部分內容都必須從主庫讀取（讀伸縮就沒效果了）。在這種情況下可以使用其他標準來決定是否從主庫讀取。例如可以跟蹤上次更新的時間，在上次更新後的一分鐘內，從主庫讀。還可以監控從庫的複製延遲，防止向任何滯後主庫超過一分鐘的從庫發出查詢。

* 客戶端可以記住最近一次寫入的時間戳，系統需要確保從庫在處理該使用者的讀取請求時，該時間戳前的變更都已經傳播到了本從庫中。如果當前從庫不夠新，則可以從另一個從庫讀取，或者等待從庫追趕上來。這裡的時間戳可以是邏輯時間戳（表示寫入順序的東西，例如日誌序列號）或實際的系統時鐘（在這種情況下，時鐘同步變得至關重要，請參閱 “[不可靠的時鐘](/v1_tw/ch8#不可靠的時鐘)”）。

* 如果你的副本分佈在多個資料中心（為了在地理上接近使用者或者出於可用性目的），還會有額外的複雜性。任何需要由主庫提供服務的請求都必須路由到包含該主庫的資料中心。

另一種複雜的情況發生在同一位使用者從多個裝置（例如桌面瀏覽器和移動 APP）請求服務的時候。這種情況下可能就需要提供跨裝置的寫後讀一致性：如果使用者在一個裝置上輸入了一些資訊，然後在另一個裝置上檢視，則應該看到他們剛輸入的資訊。

在這種情況下，還有一些需要考慮的問題：

* 記住使用者上次更新時間戳的方法變得更加困難，因為一個裝置上執行的程式不知道另一個裝置上發生了什麼。需要對這些元資料進行中心化的儲存。
* 如果副本分佈在不同的資料中心，很難保證來自不同裝置的連線會路由到同一資料中心。（例如，使用者的臺式計算機使用家庭寬頻連線，而移動裝置使用蜂窩資料網路，則裝置的網路路由可能完全不同）。如果你的方法需要讀主庫，可能首先需要把來自該使用者所有裝置的請求都路由到同一個資料中心。


### 單調讀

在從非同步從庫讀取時可能發生的異常的第二個例子是使用者可能會遇到 **時光倒流（moving backward in time）**。

如果使用者從不同從庫進行多次讀取，就可能發生這種情況。例如，[圖 5-4](/v1/ddia_0504.png) 顯示了使用者 2345 兩次進行相同的查詢，首先查詢了一個延遲很小的從庫，然後是一個延遲較大的從庫（如果使用者重新整理網頁時每個請求都被路由到一個隨機的伺服器，這種情況就很有可能發生）。第一個查詢返回了最近由使用者 1234 新增的評論，但是第二個查詢不返回任何東西，因為滯後的從庫還沒有拉取到該寫入內容。實際上可以認為第二個查詢是在比第一個查詢更早的時間點上觀察系統。如果第一個查詢沒有返回任何內容，那問題並不大，因為使用者 2345 可能不知道使用者 1234 最近添加了評論。但如果使用者 2345 先看見使用者 1234 的評論，然後又看到它消失，這就會讓人覺得非常困惑了。

![](/v1/ddia_0504.png)

**圖 5-4 使用者首先從新副本讀取，然後從舊副本讀取。時間看上去回退了。為了防止這種異常，我們需要單調的讀取。**

**單調讀（monotonic reads）**【23】可以保證這種異常不會發生。這是一個比 **強一致性（strong consistency）** 更弱，但比 **最終一致性（eventual consistency）** 更強的保證。當讀取資料時，你可能會看到一個舊值；單調讀僅意味著如果一個使用者順序地進行多次讀取，則他們不會看到時間回退，也就是說，如果已經讀取到較新的資料，後續的讀取不會得到更舊的資料。

實現單調讀的一種方式是確保每個使用者總是從同一個副本進行讀取（不同的使用者可以從不同的副本讀取）。例如，可以基於使用者 ID 的雜湊來選擇副本，而不是隨機選擇副本。但是，如果該副本出現故障，使用者的查詢將需要重新路由到另一個副本。


### 一致字首讀

第三個複製延遲異常的例子違反了因果律。想象一下 Poons 先生和 Cake 夫人之間的以下簡短對話：

*Mr. Poons*
> Mrs. Cake，你能看到多遠的未來？

*Mrs. Cake*
> 通常約十秒鐘，Mr. Poons.

這兩句話之間有因果關係：Cake 夫人聽到了 Poons 先生的問題並回答了這個問題。

現在，想象第三個人正在透過從庫來聽這個對話。Cake 夫人說的內容是從一個延遲很低的從庫讀取的，但 Poons 先生所說的內容，從庫的延遲要大的多（見 [圖 5-5](/v1/ddia_0505.png)）。於是，這個觀察者會聽到以下內容：

*Mrs. Cake*
> 通常約十秒鐘，Mr. Poons.

*Mr. Poons*
> Mrs. Cake，你能看到多遠的未來？

對於觀察者來說，看起來好像 Cake 夫人在 Poons 先生提問前就回答了這個問題。這種超能力讓人印象深刻，但也會把人搞糊塗【25】。

![](/v1/ddia_0505.png)

**圖 5-5 如果某些分割槽的複製速度慢於其他分割槽，那麼觀察者可能會在看到問題之前先看到答案。**

要防止這種異常，需要另一種型別的保證：**一致字首讀（consistent prefix reads）**【23】。這個保證的意思是說：如果一系列寫入按某個順序發生，那麼任何人讀取這些寫入時，也會看見它們以同樣的順序出現。

這是 **分割槽（partitioned）** 或 **分片（sharded）** 資料庫中的一個特殊問題，我們將在 [第六章](/v1_tw/ch6) 中討論分割槽資料庫。如果資料庫總是以相同的順序應用寫入，而讀取總是看到一致的字首，那麼這種異常不會發生。但是在許多分散式資料庫中，不同的分割槽獨立執行，因此不存在 **全域性的寫入順序**：當用戶從資料庫中讀取資料時，可能會看到資料庫的某些部分處於較舊的狀態，而某些則處於較新的狀態。

一種解決方案是，確保任何因果相關的寫入都寫入相同的分割槽，但在一些應用中可能無法高效地完成這種操作。還有一些顯式跟蹤因果依賴關係的演算法，我們將在 “[“此前發生” 的關係和併發](#“此前發生”的關係和併發)” 一節中回到這個話題。

### 複製延遲的解決方案

在使用最終一致的系統時，如果複製延遲增加到幾分鐘甚至幾小時，則應該考慮應用程式的行為。如果答案是 “沒問題”，那很好。但如果結果對於使用者來說是不好的體驗，那麼設計系統來提供更強的保證（例如 **寫後讀**）是很重要的。明明是非同步複製卻假設複製是同步的，這是很多麻煩的根源。

如前所述，應用程式可以提供比底層資料庫更強有力的保證，例如透過主庫進行某種讀取。但在應用程式程式碼中處理這些問題是複雜的，容易出錯。

如果應用程式開發人員不必擔心微妙的複製問題，並可以信賴他們的資料庫 “做了正確的事情”，那該多好呀。這就是 **事務（transaction）** 存在的原因：**資料庫透過事務提供強大的保證**，所以應用程式可以更加簡單。

單節點事務已經存在了很長時間。然而在走向分散式（複製和分割槽）資料庫時，許多系統放棄了事務，聲稱事務在效能和可用性上的代價太高，並斷言在可伸縮系統中最終一致性是不可避免的。這個敘述有一些道理，但過於簡單了，本書其餘部分將提出更為細緻的觀點。我們將在 [第七章](/v1_tw/ch7) 和 [第九章](/v1_tw/ch9) 回到事務的話題，並將在 [第三部分](/v1_tw/part-iii) 討論一些替代機制。


## 多主複製

本章到目前為止，我們只考慮了使用單個主庫的複製架構。雖然這是一種常見的方法，但還有其它一些有趣的選擇。

基於領導者的複製有一個主要的缺點：只有一個主庫，而且所有的寫入都必須透過它 [^iv]。如果出於任何原因（例如和主庫之間的網路連線中斷）無法連線到主庫，就無法向資料庫寫入。

[^iv]: 如果資料庫被分割槽（見 [第六章](/v1_tw/ch6)），每個分割槽都有一個主庫。不同的分割槽的主庫可能在不同的節點上，但是每個分割槽都必須有一個主庫。

基於領導者的複製模型的自然延伸是允許多個節點接受寫入。複製仍然以同樣的方式發生：處理寫入的每個節點都必須將該資料變更轉發給所有其他節點。我們將其稱之為 **多領導者配置**（multi-leader configuration，也稱多主、多活複製，即 master-master replication 或 active/active replication）。在這種情況下，每個主庫同時是其他主庫的從庫。

### 多主複製的應用場景

在單個資料中心內部使用多個主庫的配置沒有太大意義，因為其導致的複雜性已經超過了能帶來的好處。但在一些情況下，這種配置也是合理的。

#### 運維多個數據中心

假如你有一個資料庫，副本分散在好幾個不同的資料中心（可能會用來容忍單個資料中心的故障，或者為了在地理上更接近使用者）。如果使用常規的基於領導者的複製設定，主庫必須位於其中一個資料中心，且所有寫入都必須經過該資料中心。

多主配置中可以在每個資料中心都有主庫。[圖 5-6](/v1/ddia_0506.png) 展示了這個架構。在每個資料中心內使用常規的主從複製；在資料中心之間，每個資料中心的主庫都會將其更改複製到其他資料中心的主庫中。

![](/v1/ddia_0506.png)

**圖 5-6 跨多個資料中心的多主複製**

我們來比較一下在運維多個資料中心時，單主和多主的適應情況：

* 效能

  在單主配置中，每個寫入都必須穿過網際網路，進入主庫所在的資料中心。這可能會增加寫入時間，並可能違背了設定多個資料中心的初心。在多主配置中，每個寫操作都可以在本地資料中心進行處理，並與其他資料中心非同步複製。因此，資料中心之間的網路延遲對使用者來說是透明的，這意味著感覺到的效能可能會更好。

* 容忍資料中心停機

  在單主配置中，如果主庫所在的資料中心發生故障，故障切換必須使另一個資料中心裡的從庫成為主庫。在多主配置中，每個資料中心可以獨立於其他資料中心繼續執行，並且當發生故障的資料中心歸隊時，複製會自動趕上。

* 容忍網路問題

  資料中心之間的通訊通常穿過公共網際網路，這可能不如資料中心內的本地網路可靠。單主配置對資料中心之間的連線問題非常敏感，因為透過這個連線進行的寫操作是同步的。採用非同步複製功能的多主配置通常能更好地承受網路問題：臨時的網路中斷並不會妨礙正在處理的寫入。

有些資料庫預設情況下支援多主配置，但使用外部工具實現也很常見，例如用於 MySQL 的 Tungsten Replicator 【26】，用於 PostgreSQL 的 BDR【27】以及用於 Oracle 的 GoldenGate 【19】。

儘管多主複製有這些優勢，但也有一個很大的缺點：兩個不同的資料中心可能會同時修改相同的資料，寫衝突是必須解決的（如 [圖 5-6](/v1/ddia_0506.png) 中的 “衝突解決（conflict resolution）”）。本書將在 “[處理寫入衝突](#處理寫入衝突)” 中詳細討論這個問題。

由於多主複製在許多資料庫中都屬於改裝的功能，所以常常存在微妙的配置缺陷，且經常與其他資料庫功能之間出現意外的反應。比如自增主鍵、觸發器、完整性約束等都可能會有麻煩。因此，多主複製往往被認為是危險的領域，應儘可能避免【28】。

#### 需要離線操作的客戶端

多主複製的另一種適用場景是：應用程式在斷網之後仍然需要繼續工作。

例如，考慮手機，筆記型電腦和其他裝置上的日曆應用。無論裝置目前是否有網際網路連線，你需要能隨時檢視你的會議（發出讀取請求），輸入新的會議（發出寫入請求）。如果在離線狀態下進行任何更改，則裝置下次上線時，需要與伺服器和其他裝置同步。

在這種情況下，每個裝置都有一個充當主庫的本地資料庫（它接受寫請求），並且在所有裝置上的日曆副本之間同步時，存在非同步的多主複製過程。複製延遲可能是幾小時甚至幾天，具體取決於何時可以訪問網際網路。

從架構的角度來看，這種設定實際上與資料中心之間的多主複製類似，每個裝置都是一個 “資料中心”，而它們之間的網路連線是極度不可靠的。從歷史上各類日曆同步功能的破爛實現可以看出，想把多主複製用好是多麼困難的一件事。

有一些工具旨在使這種多主配置更容易。例如，CouchDB 就是為這種操作模式而設計的【29】。

#### 協同編輯

即時協作編輯應用程式允許多個人同時編輯文件。例如，Etherpad 【30】和 Google Docs 【31】允許多人同時編輯文字文件或電子表格（該演算法在 “[自動衝突解決](#自動衝突解決)” 中簡要討論）。我們通常不會將協作式編輯視為資料庫複製問題，但它與前面提到的離線編輯用例有許多相似之處。當一個使用者編輯文件時，所做的更改將立即應用到其本地副本（Web 瀏覽器或客戶端應用程式中的文件狀態），並非同步複製到伺服器和編輯同一文件的任何其他使用者。

如果要保證不會發生編輯衝突，則應用程式必須先取得文件的鎖定，然後使用者才能對其進行編輯。如果另一個使用者想要編輯同一個文件，他們首先必須等到第一個使用者提交修改並釋放鎖定。這種協作模式相當於主從複製模型下在主節點上執行事務操作。

但是，為了加速協作，你可能希望將更改的單位設定得非常小（例如單次按鍵），並避免鎖定。這種方法允許多個使用者同時進行編輯，但同時也帶來了多主複製的所有挑戰，包括需要解決衝突【32】。

### 處理寫入衝突

多主複製的最大問題是可能發生寫衝突，這意味著需要解決衝突。

例如，考慮一個由兩個使用者同時編輯的維基頁面，如 [圖 5-7](/v1/ddia_0507.png) 所示。使用者 1 將頁面的標題從 A 更改為 B，並且使用者 2 同時將標題從 A 更改為 C。每個使用者的更改已成功應用到其本地主庫。但當非同步複製時，會發現衝突【33】。單主資料庫中不會出現此問題。

![](/v1/ddia_0507.png)

**圖 5-7 兩個主庫同時更新同一記錄引起的寫入衝突**

#### 同步與非同步衝突檢測

在單主資料庫中，第二個寫入將被阻塞並等待第一個寫入完成，或者中止第二個寫入事務並強制使用者重試。另一方面，在多主配置中，兩個寫入都是成功的，在稍後的某個時間點才能非同步地檢測到衝突。那時再來要求使用者解決衝突可能為時已晚。

原則上，可以使衝突檢測同步 - 即等待寫入被複制到所有副本，然後再告訴使用者寫入成功。但是，透過這樣做，你將失去多主複製的主要優點：允許每個副本獨立地接受寫入。如果你想要同步衝突檢測，那麼你可能不如直接使用單主複製。

#### 避免衝突

處理衝突的最簡單的策略就是避免它們：如果應用程式可以確保特定記錄的所有寫入都透過同一個主庫，那麼衝突就不會發生。由於許多的多主複製實現在處理衝突時處理得相當不好，避免衝突是一個經常被推薦的方法【34】。

例如，在一個使用者可以編輯自己資料的應用程式中，可以確保來自特定使用者的請求始終路由到同一資料中心，並使用該資料中心的主庫進行讀寫。不同的使用者可能有不同的 “主” 資料中心（可能根據使用者的地理位置選擇），但從任何一位使用者的角度來看，本質上就是單主配置了。

但是，有時你可能需要更改被指定的主庫 —— 可能是因為某個資料中心出現故障，你需要將流量重新路由到另一個資料中心，或者可能是因為使用者已經遷移到另一個位置，現在更接近其它的資料中心。在這種情況下，衝突避免將失效，你必須處理不同主庫同時寫入的可能性。

#### 收斂至一致的狀態

單主資料庫按順序進行寫操作：如果同一個欄位有多個更新，則最後一個寫操作將決定該欄位的最終值。

在多主配置中，沒有明確的寫入順序，所以最終值應該是什麼並不清楚。在 [圖 5-7](/v1/ddia_0507.png) 中，在主庫 1 中標題首先更新為 B 而後更新為 C；在主庫 2 中，首先更新為 C，然後更新為 B。兩種順序都不比另一種“更正確”。

如果每個副本只是按照它看到寫入的順序寫入，那麼資料庫最終將處於不一致的狀態：最終值將是在主庫 1 的 C 和主庫 2 的 B。這是不可接受的，每個複製方案都必須確保資料最終在所有副本中都是相同的。因此，資料庫必須以一種 **收斂（convergent）** 的方式解決衝突，這意味著所有副本必須在所有變更複製完成時收斂至一個相同的最終值。

實現衝突合併解決有多種途徑：

* 給每個寫入一個唯一的 ID（例如時間戳、長隨機數、UUID 或者鍵和值的雜湊），挑選最高 ID 的寫入作為勝利者，並丟棄其他寫入。如果使用時間戳，這種技術被稱為 **最後寫入勝利（LWW, last write wins）**。雖然這種方法很流行，但是很容易造成資料丟失【35】。我們將在本章末尾的 [檢測併發寫入](#檢測併發寫入) 一節更詳細地討論 LWW。
* 為每個副本分配一個唯一的 ID，來自 ID 編號更高的副本的寫入具有更高的優先順序。這種方法也意味著資料丟失。
* 以某種方式將這些值合併在一起 - 例如，按字母順序排序，然後連線它們（在 [圖 5-7](/v1/ddia_0507.png) 中，合併的標題可能類似於 “B/C”）。
* 用一種可保留所有資訊的顯式資料結構來記錄衝突，並編寫解決衝突的應用程式程式碼（也許透過提示使用者的方式）。


#### 自定義衝突解決邏輯

解決衝突的最合適的方法可能取決於應用程式，大多數多主複製工具允許使用應用程式程式碼編寫衝突解決邏輯。該程式碼可以在寫入或讀取時執行：

寫時執行
: 只要資料庫系統檢測到複製更改日誌中存在衝突，就會呼叫衝突處理程式。例如，Bucardo 允許你為此編寫一段 Perl 程式碼。這個處理程式通常不能提示使用者 —— 它在後臺程序中執行，並且必須快速執行。

讀時執行
: 當檢測到衝突時，所有衝突寫入被儲存。下一次讀取資料時，會將這些多個版本的資料返回給應用程式。應用程式可以提示使用者或自動解決衝突，並將結果寫回資料庫。例如 CouchDB 就以這種方式工作。

請注意，衝突解決通常適用於單行記錄或單個文件的層面，而不是整個事務【36】。因此，如果你有一個事務會原子性地進行幾次不同的寫入（請參閱 [第七章](/v1_tw/ch7)），對於衝突解決而言，每個寫入仍需分開單獨考慮。


> #### 自動衝突解決
>
> 衝突解決規則可能很容易變得越來越複雜，自定義程式碼可能也很容易出錯。亞馬遜是一個經常被引用的例子，由於衝突解決處理程式而產生了令人意外的效果：一段時間以來，購物車上的衝突解決邏輯將保留新增到購物車的物品，但不包括從購物車中移除的物品。因此，顧客有時會看到物品重新出現在他們的購物車中，即使它們之前已經被移走【37】。
>
> 已經有一些有趣的研究來自動解決由於資料修改引起的衝突。有幾項研究值得一提：
>
> * **無衝突複製資料型別（Conflict-free replicated datatypes，CRDT）**【32,38】是可以由多個使用者同時編輯的集合、對映、有序列表、計數器等一系列資料結構，它們以合理的方式自動解決衝突。一些 CRDT 已經在 Riak 2.0 中實現【39,40】。
> * **可合併的持久資料結構（Mergeable persistent data structures）**【41】顯式跟蹤歷史記錄，類似於 Git 版本控制系統，並使用三向合併功能（而 CRDT 使用雙向合併）。
> * **操作轉換（operational transformation）**【42】是 Etherpad 【30】和 Google Docs 【31】等協同編輯應用背後的衝突解決演算法。它是專為有序列表的併發編輯而設計的，例如構成文字文件的字元列表。
>
> 這些演算法在資料庫中的實現還很年輕，但很可能將來它們會被整合到更多的複製資料系統中。自動衝突解決方案可以使應用程式處理多主資料同步更為簡單。


#### 什麼是衝突？

有些衝突是顯而易見的。在 [圖 5-7](/v1/ddia_0507.png) 的例子中，兩個寫操作併發地修改了同一條記錄中的同一個欄位，並將其設定為兩個不同的值。毫無疑問這是一個衝突。

其他型別的衝突可能更為微妙而難以發現。例如，考慮一個會議室預訂系統：它記錄誰訂了哪個時間段的哪個房間。應用程式需要確保每個房間在任意時刻都只能被一組人進行預訂（即不得有相同房間的重疊預訂）。在這種情況下，如果為同一個房間同時建立兩個不同的預訂，則可能會發生衝突。即使應用程式在允許使用者進行預訂之前先檢查會議室的可用性，如果兩次預訂是由兩個不同的主庫進行的，則仍然可能會有衝突。

雖然現在還沒有一個現成的答案，但在接下來的章節中，我們將更好地瞭解這個問題。我們將在 [第七章](/v1_tw/ch7) 中看到更多的衝突示例，在 [第十二章](/v1_tw/ch12) 中我們將討論用於檢測和解決複製系統中衝突的可伸縮方法。


### 多主複製拓撲

**複製拓撲**（replication topology）用來描述寫入操作從一個節點傳播到另一個節點的通訊路徑。如果你有兩個主庫，如 [圖 5-7](/v1/ddia_0507.png) 所示，只有一個合理的拓撲結構：主庫 1 必須把它所有的寫入都發送到主庫 2，反之亦然。當有兩個以上的主庫，多種不同的拓撲都是可能的。[圖 5-8](/v1/ddia_0508.png) 舉例說明了一些例子。

![](/v1/ddia_0508.png)

**圖 5-8 三種可以在多主複製中使用的拓撲示例。**

最常見的拓撲是全部到全部（all-to-all，如 [圖 5-8 (c)](/v1/ddia_0508.png)），其中每個主庫都將其寫入傳送給其他所有的主庫。然而，一些更受限的拓撲也會被使用到：例如，預設情況下 MySQL 僅支援 **環形拓撲（circular topology）**【34】，其中每個節點都從一個節點接收寫入，並將這些寫入（加上自己的寫入）轉發給另一個節點。另一種流行的拓撲結構具有星形的形狀 [^v]：一個指定的根節點將寫入轉發給所有其他節點。星形拓撲可以推廣到樹。

[^v]: 不要與星型模式混淆（請參閱 “[星型和雪花型：分析的模式](/v1_tw/ch3#星型和雪花型：分析的模式)”），其中描述了資料模型的結構，而不是節點之間的通訊拓撲。

在環形和星形拓撲中，寫入可能需要在到達所有副本之前透過多個節點。因此，節點需要轉發從其他節點收到的資料更改。為了防止無限複製迴圈，每個節點被賦予一個唯一的識別符號，並且在複製日誌中，每次寫入都會使用其經過的所有節點的識別符號進行標記【43】。當一個節點收到用自己的識別符號標記的資料更改時，該資料更改將被忽略，因為節點知道它已經被處理過。

環形和星形拓撲的問題是，如果只有一個節點發生故障，則可能會中斷其他節點之間的複製訊息流，導致它們無法通訊，除非節點被修復。拓撲結構可以重新配置為跳過發生故障的節點，但在大多數部署中，這種重新配置必須手動完成。更密集連線的拓撲結構（例如全部到全部）的容錯性更好，因為它允許訊息沿著不同的路徑傳播，可以避免單點故障。

另一方面，全部到全部的拓撲也可能有問題。特別是，一些網路連結可能比其他網路連結更快（例如由於網路擁塞），結果是一些複製訊息可能 “超越” 其他複製訊息，如 [圖 5-9](/v1/ddia_0509.png) 所示。

![](/v1/ddia_0509.png)

**圖 5-9 使用多主複製時，寫入可能會以錯誤的順序到達某些副本。**

在 [圖 5-9](/v1/ddia_0509.png) 中，客戶端 A 向主庫 1 的表中插入一行，客戶端 B 在主庫 3 上更新該行。然而，主庫 2 可以以不同的順序接收寫入：它可能先接收到更新（從它的角度來看，是對資料庫中不存在的行的更新），稍後才接收到相應的插入（其應該在更新之前）。

這是一個因果關係的問題，類似於我們在 “[一致字首讀](#一致字首讀)” 中看到的：更新取決於先前的插入，所以我們需要確保所有節點先處理插入，然後再處理更新。僅僅在每一次寫入時新增一個時間戳是不夠的，因為時鐘不可能被充分地同步，所以主庫 2 就無法正確地對這些事件進行排序（見 [第八章](/v1_tw/ch8)）。

要正確排序這些事件，可以使用一種稱為 **版本向量（version vectors）** 的技術，本章稍後將討論這種技術（請參閱 “[檢測併發寫入](#檢測併發寫入)”）。然而，許多多主複製系統中的衝突檢測技術實現得並不好。例如，在撰寫本文時，PostgreSQL BDR 不提供寫入的因果排序【27】，而 Tungsten Replicator for MySQL 甚至都不嘗試檢測衝突【34】。

如果你正在使用基於多主複製的系統，那麼你應該多瞭解這些問題，仔細閱讀文件，並徹底測試你的資料庫，以確保它確實提供了你想要的保證。


## 無主複製

我們在本章到目前為止所討論的複製方法 —— 單主複製、多主複製 —— 都是這樣的想法：客戶端向一個主庫傳送寫請求，而資料庫系統負責將寫入複製到其他副本。主庫決定寫入的順序，而從庫按相同順序應用主庫的寫入。

一些資料儲存系統採用不同的方法，放棄主庫的概念，並允許任何副本直接接受來自客戶端的寫入。最早的一些複製資料系統是 **無主的（leaderless）**【1,44】，但是在關係資料庫主導的時代，這個想法幾乎已被忘卻。在亞馬遜將其用於其內部的 Dynamo 系統 [^vi] 之後，它再一次成為資料庫的一種時尚架構【37】。Riak，Cassandra 和 Voldemort 是受 Dynamo 啟發的無主複製模型的開源資料儲存，所以這類資料庫也被稱為 *Dynamo 風格*。

[^vi]: Dynamo 並不對 Amazon 以外的使用者開放。令人困惑的是，AWS 提供了一個名為 DynamoDB 的託管資料庫產品，它使用了完全不同的體系結構：它基於單主複製。

在一些無主複製的實現中，客戶端直接將寫入傳送到幾個副本中，而另一些情況下，由一個 **協調者（coordinator）** 節點代表客戶端進行寫入。但與主庫資料庫不同，協調者不執行特定的寫入順序。我們將會看到，這種設計上的差異對資料庫的使用方式有著深遠的影響。

### 當節點故障時寫入資料庫

假設你有一個帶有三個副本的資料庫，而其中一個副本目前不可用，或許正在重新啟動以安裝系統更新。在基於領導者的配置中，如果要繼續處理寫入，則可能需要執行故障切換（請參閱「[處理節點宕機](#處理節點宕機)」）。

另一方面，在無主配置中，不存在故障切換。[圖 5-10](/v1/ddia_0510.png) 演示了會發生什麼事情：客戶端（使用者 1234）並行傳送寫入到所有三個副本，並且兩個可用副本接受寫入，但是不可用副本錯過了它。假設三個副本中的兩個承認寫入是足夠的：在使用者 1234 已經收到兩個確定的響應之後，我們認為寫入成功。客戶簡單地忽略了其中一個副本錯過了寫入的事實。

![](/v1/ddia_0510.png)

**圖 5-10 法定寫入，法定讀取，並在節點中斷後讀修復。**

現在想象一下，不可用的節點重新聯機，客戶端開始讀取它。節點關閉期間發生的任何寫入都不在該節點上。因此，如果你從該節點讀取資料，則可能會從響應中拿到陳舊的（過時的）值。

為了解決這個問題，當一個客戶端從資料庫中讀取資料時，它不僅僅把它的請求傳送到一個副本：讀請求將被並行地傳送到多個節點。客戶可能會從不同的節點獲得不同的響應，即來自一個節點的最新值和來自另一個節點的陳舊值。版本號將被用於確定哪個值是更新的（請參閱 “[檢測併發寫入](#檢測併發寫入)”）。

#### 讀修復和反熵

複製方案應確保最終將所有資料複製到每個副本。在一個不可用的節點重新聯機之後，它如何趕上它錯過的寫入？

在 Dynamo 風格的資料儲存中經常使用兩種機制：

讀修復（Read repair）
: 當客戶端並行讀取多個節點時，它可以檢測到任何陳舊的響應。例如，在 [圖 5-10](/v1/ddia_0510.png) 中，使用者 2345 獲得了來自副本 3 的版本 6 值和來自副本 1 和 2 的版本 7 值。客戶端發現副本 3 具有陳舊值，並將新值寫回到該副本。這種方法適用於讀頻繁的值。

反熵過程（Anti-entropy process）
: 此外，一些資料儲存具有後臺程序，該程序不斷查詢副本之間的資料差異，並將任何缺少的資料從一個副本複製到另一個副本。與基於領導者的複製中的複製日誌不同，此反熵過程不會以任何特定的順序複製寫入，並且在複製資料之前可能會有顯著的延遲。

並不是所有的系統都實現了這兩種機制，例如，Voldemort 目前沒有反熵過程。請注意，如果沒有反熵過程，很少被讀取的值可能會從某些副本中丟失，從而降低了永續性，因為只有在應用程式讀取值時才執行讀修復。

#### 讀寫的法定人數

在 [圖 5-10](/v1/ddia_0510.png) 的示例中，我們認為即使僅在三個副本中的兩個上進行處理，寫入仍然是成功的。如果三個副本中只有一個接受了寫入，會怎樣？以此類推，究竟多少個副本完成才可以認為寫入成功？

如果我們知道，每個成功的寫操作意味著在三個副本中至少有兩個出現，這意味著至多有一個副本可能是陳舊的。因此，如果我們從至少兩個副本讀取，我們可以確定至少有一個是最新的。如果第三個副本停機或響應速度緩慢，則讀取仍可以繼續返回最新值。

更一般地說，如果有 n 個副本，每個寫入必須由 w 個節點確認才能被認為是成功的，並且我們必須至少為每個讀取查詢 r 個節點。（在我們的例子中，$n = 3，w = 2，r = 2$）。只要 $w + r > n$，我們可以預期在讀取時能獲得最新的值，因為 r 個讀取中至少有一個節點是最新的。遵循這些 r 值和 w 值的讀寫稱為 **法定人數（quorum）**[^vii] 的讀和寫【44】。你可以認為，r 和 w 是有效讀寫所需的最低票數。

[^vii]: 有時候這種法定人數被稱為嚴格的法定人數，其相對 “寬鬆的法定人數” 而言（見 “[寬鬆的法定人數與提示移交](#寬鬆的法定人數與提示移交)”）

在 Dynamo 風格的資料庫中，引數 n、w 和 r 通常是可配置的。一個常見的選擇是使 n 為奇數（通常為 3 或 5）並設定 $w = r = (n + 1) / 2$（向上取整）。但是你可以根據需要更改數字。例如，寫入次數較少且讀取次數較多的工作負載可以從設定 $w = n$ 和 $r = 1$ 中受益。這會使得讀取速度更快，但缺點是只要有一個不可用的節點就會導致所有的資料庫寫入都失敗。

> 叢集中可能有多於 n 個的節點（叢集的機器數可能多於副本數目）。但是任何給定的值只能儲存在 n 個節點上。這允許對資料集進行分割槽，從而可以支援比單個節點的儲存能力更大的資料集。我們將在 [第六章](/v1_tw/ch6) 繼續討論分割槽。

法定人數條件 $w + r > n$ 允許系統容忍不可用的節點，如下所示：

* 如果 $w < n$，當節點不可用時，我們仍然可以處理寫入。
* 如果 $r < n$，當節點不可用時，我們仍然可以處理讀取。
* 對於 $n = 3，w = 2，r = 2$，我們可以容忍一個不可用的節點。
* 對於 $n = 5，w = 3，r = 3$，我們可以容忍兩個不可用的節點。這個案例如 [圖 5-11](/v1/ddia_0511.png) 所示。
* 通常，讀取和寫入操作始終並行傳送到所有 n 個副本。引數 w 和 r 決定我們等待多少個節點，即在我們認為讀或寫成功之前，有多少個節點需要報告成功。

![](/v1/ddia_0511.png)

**圖 5-11  如果 $w + r > n$，讀取 r 個副本，至少有一個副本必然包含了最近的成功寫入。**

如果可用的節點少於所需的 w 或 r，則寫入或讀取將返回錯誤。節點可能由於多種原因而不可用，比如：節點關閉（異常崩潰，電源關閉）、操作執行過程中的錯誤（由於磁碟已滿而無法寫入）、客戶端和伺服器節點之間的網路中斷或任何其他原因。我們只需要關心節點是否返回了成功的響應，而不需要區分不同型別的錯誤。


### 法定人數一致性的侷限性

如果你有 n 個副本，並且你選擇了滿足 $w + r > n$ 的 w 和 r，你通常可以期望每次讀取都能返回最近寫入的值。情況就是這樣，因為你寫入的節點集合和你讀取的節點集合必然有重疊。也就是說，你讀取的節點中必然至少有一個節點具有最新值（如 [圖 5-11](/v1/ddia_0511.png) 所示）。

通常，r 和 w 被選為多數（超過 $n/2$ ）節點，因為這確保了 $w + r > n$，同時仍然容忍多達 $n/2$ 個節點故障。但是，法定人數不一定必須是大多數，重要的是讀寫使用的節點至少有一個節點的交集。其他法定人數的配置是可能的，這使得分散式演算法的設計有一定的靈活性【45】。

你也可以將 w 和 r 設定為較小的數字，以使 $w + r ≤ n$（即法定條件不滿足）。在這種情況下，讀取和寫入操作仍將被傳送到 n 個節點，但操作成功只需要少量的成功響應。

較小的 w 和 r 更有可能會讀取到陳舊的資料，因為你的讀取更有可能未包含具有最新值的節點。另一方面，這種配置允許更低的延遲和更高的可用性：如果存在網路中斷，並且許多副本變得無法訪問，則有更大的機會可以繼續處理讀取和寫入。只有當可達副本的數量低於 w 或 r 時，資料庫才變得不可寫入或讀取。

但是，即使在 $w + r > n$ 的情況下，也可能存在返回陳舊值的邊緣情況。這取決於實現，但可能的情況包括：

* 如果使用寬鬆的法定人數（見 “[寬鬆的法定人數與提示移交](#寬鬆的法定人數與提示移交)”），w 個寫入和 r 個讀取有可能落在完全不同的節點上，因此 r 個讀取節點和 w 個寫入節點之間不再保證有重疊【46】。
* 如果兩個寫入同時發生，不清楚哪一個先發生。在這種情況下，唯一安全的解決方案是合併併發寫入（請參閱 “[處理寫入衝突](#處理寫入衝突)”）。如果根據時間戳（最後寫入勝利）挑選出一個勝者，則寫入可能由於時鐘偏差【35】而丟失。我們將在 “[檢測併發寫入](#檢測併發寫入)” 繼續討論此話題。
* 如果寫操作與讀操作同時發生，寫操作可能僅反映在某些副本上。在這種情況下，不確定讀取返回的是舊值還是新值。
* 如果寫操作在某些副本上成功，而在其他節點上失敗（例如，因為某些節點上的磁碟已滿），在小於 w 個副本上寫入成功。所以整體判定寫入失敗，但整體寫入失敗並沒有在寫入成功的副本上回滾。這意味著一個寫入雖然報告失敗，後續的讀取仍然可能會讀取這次失敗寫入的值【47】。
* 如果攜帶新值的節點發生故障，需要從其他帶有舊值的副本進行恢復，則儲存新值的副本數可能會低於 w，從而打破法定人數條件。
* 即使一切工作正常，有時也會不幸地出現關於 **時序（timing）** 的邊緣情況，我們將在 “[線性一致性和法定人數](/v1_tw/ch9#線性一致性和法定人數)” 中看到這點。

因此，儘管法定人數似乎保證讀取返回最新的寫入值，但在實踐中並不那麼簡單。Dynamo 風格的資料庫通常針對可以忍受最終一致性的用例進行最佳化。你可以透過引數 w 和 r 來調整讀取到陳舊值的機率，但把它們當成絕對的保證是不明智的。

尤其是，因為通常得不到 “[複製延遲問題](#複製延遲問題)” 中討論的那些保證（讀己之寫，單調讀，一致字首讀），前面提到的異常可能會發生在應用程式中。更強有力的保證通常需要 **事務** 或 **共識**。我們將在 [第七章](/v1_tw/ch7) 和 [第九章](/v1_tw/ch9) 回到這些話題。

#### 監控陳舊度

從運維的角度來看，監視你的資料庫是否返回最新的結果是很重要的。即使應用可以容忍陳舊的讀取，你也需要了解複製的健康狀況。如果顯著落後，它應該提醒你以便你可以調查原因（例如網路中的問題或過載的節點）。

對於基於領導者的複製，資料庫通常會提供複製延遲的測量值，你可以將其提供給監視系統。這之所以能做到，是因為寫入是按照相同的順序應用於主庫和從庫，並且每個節點對應了複製日誌中的一個位置（已經在本地應用的寫入數量）。透過從主庫的當前位置中減去從庫的當前位置，你可以測量複製延遲的程度。

然而，在無主複製的系統中，沒有固定的寫入順序，這使得監控變得更加困難。而且，如果資料庫只使用讀修復（沒有反熵過程），那麼對於一個值可能會有多陳舊其實是沒有限制的 - 如果一個值很少被讀取，那麼由一個陳舊副本返回的值可能是古老的。

已經有一些關於衡量無主複製資料庫中的複製陳舊度的研究，並根據引數 n、w 和 r 來預測陳舊讀取的預期百分比【48】。不幸的是，這還不是很常見的做法，但是將陳舊測量值包含在資料庫的標準度量集中是一件好事。雖然最終一致性是一種有意模糊的保證，但是從可操作性角度來說，能夠量化 “最終” 也是很重要的。

### 寬鬆的法定人數與提示移交

合理配置的法定人數可以使資料庫無需故障切換即可容忍個別節點的故障。它也可以容忍個別節點變慢，因為請求不必等待所有 n 個節點響應 —— 當 w 或 r 個節點響應時它們就可以返回。對於需要高可用、低延時、且能夠容忍偶爾讀到陳舊值的應用場景來說，這些特性使無主複製的資料庫很有吸引力。

然而，法定人數（如迄今為止所描述的）並不像它們可能的那樣具有容錯性。網路中斷可以很容易地將客戶端從大量的資料庫節點上切斷。雖然這些節點是活著的，而其他客戶端可能也能夠連線到它們，但是從資料庫節點切斷的客戶端來看，它們也可能已經死亡。在這種情況下，剩餘的可用節點可能會少於 w 或 r，因此客戶端不再能達到法定人數。

在一個大型的叢集中（節點數量明顯多於 n 個），網路中斷期間客戶端可能仍能連線到一些資料庫節點，但又不足以組成一個特定的法定人數。在這種情況下，資料庫設計人員需要權衡一下：

* 對於所有無法達到 w 或 r 個節點法定人數的請求，是否返回錯誤是更好的？
* 或者我們是否應該接受寫入，然後將它們寫入一些可達的節點，但不在這些值通常所存在的 n 個節點上？

後者被認為是一個 **寬鬆的法定人數（sloppy quorum）**【37】：寫和讀仍然需要 w 和 r 個成功的響應，但這些響應可能來自不在指定的 n 個 “主” 節點中的其它節點。就好比說，如果你把自己鎖在房子外面了，你可能會去敲開鄰居的門，問是否可以暫時呆在他們的沙發上。

一旦網路中斷得到解決，一個節點代表另一個節點臨時接受的任何寫入都將被傳送到適當的 “主” 節點。這就是所謂的 **提示移交（hinted handoff）**（一旦你再次找到你的房子的鑰匙，你的鄰居可以禮貌地要求你離開沙發回家）。

寬鬆的法定人數對寫入可用性的提高特別有用：只要有任何 w 個節點可用，資料庫就可以接受寫入。然而，這意味著即使當 $w + r > n$ 時，也不能確保讀取到某個鍵的最新值，因為最新的值可能已經臨時寫入了 n 之外的某些節點【47】。

因此，在傳統意義上，寬鬆的法定人數實際上並不是法定人數。它只是一個永續性的保證，即資料已儲存在某處的 w 個節點。但不能保證 r 個節點的讀取能看到它，除非提示移交已經完成。

在所有常見的 Dynamo 實現中，寬鬆的法定人數是可選的。在 Riak 中，它們預設是啟用的，而在 Cassandra 和 Voldemort 中它們預設是停用的【46,49,50】。

#### 運維多個數據中心

我們先前討論了跨資料中心複製，作為多主複製的用例（請參閱 “[多主複製](#多主複製)”）。其實無主複製也適用於多資料中心操作，既然它旨在容忍衝突的併發寫入、網路中斷和延遲尖峰。

Cassandra 和 Voldemort 在正常的無主模型中實現了他們的多資料中心支援：副本的數量 n 包括所有資料中心的節點，你可以在配置中指定每個資料中心所擁有的副本的數量。無論資料中心如何，每個來自客戶端的寫入都會發送到所有副本，但客戶端通常只等待來自其本地資料中心內的法定節點的確認，從而不會受到跨資料中心鏈路延遲和中斷的影響。對其他資料中心的高延遲寫入通常被配置為非同步執行，儘管該配置仍有一定的靈活性【50,51】。

Riak 將客戶端和資料庫節點之間的所有通訊保持在一個本地的資料中心，因此 n 描述了一個數據中心內的副本數量。資料庫叢集之間的跨資料中心複製在後臺非同步發生，其風格類似於多主複製【52】。

### 檢測併發寫入

Dynamo 風格的資料庫允許多個客戶端同時寫入相同的鍵（Key），這意味著即使使用嚴格的法定人數也會發生衝突。這種情況與多主複製相似（請參閱 “[處理寫入衝突](#處理寫入衝突)”），但在 Dynamo 風格的資料庫中，在 **讀修復** 或 **提示移交** 期間也可能會產生衝突。

其問題在於，由於可變的網路延遲和部分節點的故障，事件可能以不同的順序到達不同的節點。例如，[圖 5-12](/v1/ddia_0512.png) 顯示了兩個客戶機 A 和 B 同時寫入三節點資料儲存中的鍵 X：

* 節點 1 接收來自 A 的寫入，但由於暫時中斷，未接收到來自 B 的寫入。
* 節點 2 首先接收來自 A 的寫入，然後接收來自 B 的寫入。
* 節點 3 首先接收來自 B 的寫入，然後接收來自 A 的寫入。

![](/v1/ddia_0512.png)

**圖 5-12 併發寫入 Dynamo 風格的資料儲存：沒有明確定義的順序。**

如果每個節點只要接收到來自客戶端的寫入請求就簡單地覆寫某個鍵值，那麼節點就會永久地不一致，如 [圖 5-12](/v1/ddia_0512.png) 中的最終獲取請求所示：節點 2 認為 X 的最終值是 B，而其他節點認為值是 A 。

為了最終達成一致，副本應該趨於相同的值。如何做到這一點？有人可能希望複製的資料庫能夠自動處理，但不幸的是，大多數的實現都很糟糕：如果你想避免丟失資料，你（應用程式開發人員）需要知道很多有關資料庫衝突處理的內部資訊。

在 “[處理寫入衝突](#處理寫入衝突)” 一節中已經簡要介紹了一些解決衝突的技術。在總結本章之前，讓我們來更詳細地探討這個問題。

#### 最後寫入勝利（丟棄併發寫入）

實現最終收斂的一種方法是宣告每個副本只需要儲存 **“最近”** 的值，並允許 **“更舊”** 的值被覆蓋和拋棄。然後，只要我們有一種明確的方式來確定哪個寫是 “最近的”，並且每個寫入最終都被複制到每個副本，那麼複製最終會收斂到相同的值。

正如 **“最近”** 的引號所表明的，這個想法其實頗具誤導性。在 [圖 5-12](/v1/ddia_0512.png) 的例子中，當客戶端向資料庫節點發送寫入請求時，兩個客戶端都不知道另一個客戶端，因此不清楚哪一個先發送請求。事實上，說這兩種情況誰先發送請求是沒有意義的：既然我們說寫入是 **併發（concurrent）** 的，那麼它們的順序就是不確定的。

即使寫入沒有自然的排序，我們也可以強制進行排序。例如，可以為每個寫入附加一個時間戳，然後挑選最大的時間戳作為 **“最近的”**，並丟棄具有較早時間戳的任何寫入。這種衝突解決演算法被稱為 **最後寫入勝利（LWW, last write wins）**，是 Cassandra 唯一支援的衝突解決方法【53】，也是 Riak 中的一個可選特徵【35】。

LWW 實現了最終收斂的目標，但以 **永續性** 為代價：如果同一個鍵有多個併發寫入，即使它們反饋給客戶端的結果都是成功的（因為它們被寫入 w 個副本），也只有一個寫入將被保留，而其他寫入將被默默地丟棄。此外，LWW 甚至可能會丟棄不是併發的寫入，我們將在 “[有序事件的時間戳](/v1_tw/ch8#有序事件的時間戳)” 中進行討論。

在類似快取的一些情況下，寫入丟失可能是可以接受的。但如果資料丟失不可接受，LWW 是解決衝突的一個很爛的選擇。

在資料庫中使用 LWW 的唯一安全方法是確保一個鍵只寫入一次，然後視為不可變，從而避免對同一個鍵進行併發更新。例如，Cassandra 推薦使用的方法是使用 UUID 作為鍵，從而為每個寫操作提供一個唯一的鍵【53】。

#### “此前發生”的關係和併發

我們如何判斷兩個操作是否是併發的？為了建立一個直覺，讓我們看看一些例子：

* 在 [圖 5-9](/v1/ddia_0509.png) 中，兩個寫入不是併發的：A 的插入發生在 B 的遞增之前，因為 B 遞增的值是 A 插入的值。換句話說，B 的操作建立在 A 的操作上，所以 B 的操作必須後發生。我們也可以說 B **因果依賴（causally dependent）** 於 A。
* 另一方面，[圖 5-12](/v1/ddia_0512.png) 中的兩個寫入是併發的：當每個客戶端啟動操作時，它不知道另一個客戶端也正在對同樣的鍵執行操作。因此，操作之間不存在因果關係。

如果操作 B 瞭解操作 A，或者依賴於 A，或者以某種方式構建於操作 A 之上，則操作 A 在操作 B 之前發生（happens before）。一個操作是否在另一個操作之前發生是定義併發含義的關鍵。事實上，我們可以簡單地說，如果兩個操作中的任何一個都不在另一個之前發生（即，兩個操作都不知道對方），那麼這兩個操作是併發的【54】。

因此，只要有兩個操作 A 和 B，就有三種可能性：A 在 B 之前發生，或者 B 在 A 之前發生，或者 A 和 B 併發。我們需要的是一個演算法來告訴我們兩個操作是否是併發的。如果一個操作發生在另一個操作之前，則後面的操作應該覆蓋前面的操作，但是如果這些操作是併發的，則存在需要解決的衝突。


> #### 併發性、時間和相對性
>
> 如果兩個操作 **“同時”** 發生，似乎應該稱為併發 —— 但事實上，它們在字面時間上重疊與否並不重要。由於分散式系統中的時鐘問題，現實中是很難判斷兩個事件是否是 **同時** 發生的，這個問題我們將在 [第八章](/v1_tw/ch8) 中詳細討論。
>
> 為了定義併發性，確切的時間並不重要：如果兩個操作都意識不到對方的存在，就稱這兩個操作 **併發**，而不管它們實際發生的物理時間。人們有時把這個原理和物理學中的狹義相對論聯絡起來【54】，該理論引入了資訊不能比光速更快的思想。因此，如果兩個事件發生的時間差小於光透過它們之間的距離所需要的時間，那麼這兩個事件不可能相互影響。
>
> 在計算機系統中，即使光速原則上允許一個操作影響另一個操作，但兩個操作也可能是 **併發的**。例如，如果網路緩慢或中斷，兩個操作間可能會出現一段時間間隔，但仍然是併發的，因為網路問題阻止一個操作意識到另一個操作的存在。


#### 捕獲"此前發生"關係

我們來看一個演算法，它可以確定兩個操作是否為併發的，還是一個在另一個之前。簡單起見，我們從一個只有一個副本的資料庫開始。一旦我們知道了如何在單個副本上完成這項工作，我們可以將該方法推廣到具有多個副本的無主資料庫。

[圖 5-13](/v1/ddia_0513.png) 顯示了兩個客戶端同時向同一購物車新增專案。（如果這樣的例子讓你覺得無趣，那麼可以想象一下兩個空中交通管制員同時把飛機新增到他們正在跟蹤的區域。）最初，購物車是空的。然後客戶端向資料庫發出五次寫入：

1. 客戶端 1 將牛奶加入購物車。這是該鍵的第一次寫入，伺服器成功儲存了它並為其分配版本號 1，最後將值與版本號一起回送給客戶端。
2. 客戶端 2 將雞蛋加入購物車，不知道客戶端 1 同時添加了牛奶（客戶端 2 認為它的雞蛋是購物車中的唯一物品）。伺服器為此寫入分配版本號 2，並將雞蛋和牛奶儲存為兩個單獨的值。然後它將這兩個值 **都** 返回給客戶端 2 ，並附上版本號 2。
3. 客戶端 1 不知道客戶端 2 的寫入，想要將麵粉加入購物車，因此認為當前的購物車內容應該是 [牛奶，麵粉]。它將此值與伺服器先前向客戶端 1 提供的版本號 1 一起傳送到伺服器。伺服器可以從版本號中知道 [牛奶，麵粉] 的寫入取代了 [牛奶] 的先前值，但與 [雞蛋] 的值是 **併發** 的。因此，伺服器將版本號 3 分配給 [牛奶，麵粉]，覆蓋版本 1 的值 [牛奶]，但保留版本 2 的值 [雞蛋]，並將所有的值返回給客戶端 1。
4. 同時，客戶端 2 想要加入火腿，不知道客戶端 1 剛剛加了麵粉。客戶端 2 在最近一次響應中從伺服器收到了兩個值 [牛奶] 和 [雞蛋]，所以客戶端 2 現在合併這些值，並新增火腿形成一個新的值 [雞蛋，牛奶，火腿]。它將這個值傳送到伺服器，帶著之前的版本號 2 。伺服器檢測到新值會覆蓋版本 2 的值 [雞蛋]，但新值也會與版本 3 的值 [牛奶，麵粉] **併發**，所以剩下的兩個值是版本 3 的 [牛奶，麵粉]，和版本 4 的 [雞蛋，牛奶，火腿]。
5. 最後，客戶端 1 想要加培根。它之前從伺服器接收到了版本 3 的 [牛奶，麵粉] 和 [雞蛋]，所以它合併這些，新增培根，並將最終值 [牛奶，麵粉，雞蛋，培根] 連同版本號 3 發往伺服器。這會覆蓋版本 3 的值 [牛奶，麵粉]（請注意 [雞蛋] 已經在上一步被覆蓋），但與版本 4 的值 [雞蛋，牛奶，火腿] 併發，所以伺服器將保留這兩個併發值。

![](/v1/ddia_0513.png)

**圖 5-13  在同時編輯購物車時捕獲兩個客戶端之間的因果關係。**

[圖 5-13](/v1/ddia_0513.png) 中的操作之間的資料流如 [圖 5-14](/v1/ddia_0514.png) 所示。箭頭表示哪個操作發生在其他操作之前，意味著後面的操作知道或依賴於較早的操作。在這個例子中，客戶端永遠不會完全拿到伺服器上的最新資料，因為總是有另一個操作同時進行。但是舊版本的值最終會被覆蓋，並且不會丟失任何寫入。

![](/v1/ddia_0514.png)

**圖 5-14 圖 5-13 中的因果依賴關係圖。**

請注意，伺服器可以只通過檢視版本號來確定兩個操作是否是併發的 —— 它不需要對值本身進行解釋（因此該值可以是任何資料結構）。該演算法的工作原理如下：

* 伺服器為每個鍵維護一個版本號，每次寫入該鍵時都遞增版本號，並將新版本號與寫入的值一起儲存。
* 當客戶端讀取鍵時，伺服器將返回所有未覆蓋的值以及最新的版本號。客戶端在寫入前必須先讀取。
* 當客戶端寫入鍵時，必須包含之前讀取的版本號，並且必須將之前讀取的所有值合併在一起（針對寫入請求的響應可以像讀取請求一樣，返回所有當前值，這使得我們可以像購物車示例那樣將多個寫入串聯起來）。
* 當伺服器接收到具有特定版本號的寫入時，它可以覆蓋該版本號或更低版本的所有值（因為它知道它們已經被合併到新的值中），但是它必須保留所有版本號更高的值（因為這些值與新到達的寫入是併發的）。

當一個寫入包含前一次讀取的版本號時，它會告訴我們的寫入是基於之前的哪一種狀態。如果在不包含版本號的情況下進行寫操作，則與所有其他寫操作併發，因此它不會覆蓋任何內容 —— 只會在隨後的讀取中作為其中一個值返回。

#### 合併併發寫入的值

這種演算法可以確保沒有資料被無聲地丟棄，但不幸的是，客戶端需要做一些額外的工作：客戶端隨後必須合併併發寫入的值。Riak 稱這些併發值為 **兄弟（siblings）**。

合併併發值，本質上是與多主複製中的衝突解決問題相同，我們先前討論過（請參閱 “[處理寫入衝突](#處理寫入衝突)”）。一個簡單的方法是根據版本號或時間戳（最後寫入勝利）來選擇一個值，但這意味著丟失資料。所以，你可能需要在應用程式程式碼中額外做些更聰明的事情。

以購物車為例，一種合理的合併值的方法就是做並集。在 [圖 5-14](/v1/ddia_0514.png) 中，最後的兩個兄弟是 [牛奶，麵粉，雞蛋，培根] 和 [雞蛋，牛奶，火腿]。注意牛奶和雞蛋雖然同時出現在兩個併發值裡，但他們每個只被寫過一次。合併的值可以是 [牛奶，麵粉，雞蛋，培根，火腿]，不再有重複了。

然而，如果你想讓人們也可以從他們的購物車中 **移除** 東西，而不是僅僅新增東西，那麼把併發值做並集可能不會產生正確的結果：如果你合併了兩個客戶端的購物車，並且只在其中一個客戶端裡面移除了一個專案，那麼被移除的專案將會重新出現在這兩個客戶端的並集結果中【37】。為了防止這個問題，要移除一個專案時不能簡單地直接從資料庫中刪除；相反，系統必須留下一個具有適當版本號的標記，以在兄弟合併時表明該專案已被移除。這種刪除標記被稱為 **墓碑（tombstone）**（我們上一次看到墓碑是在 “[雜湊索引](/v1_tw/ch3#雜湊索引)” 章節的日誌壓縮部分）。

因為在應用程式程式碼中做兄弟合併是複雜且容易出錯的，所以有一些資料結構被設計出來用於自動執行這種合併，比如在 “[自動衝突解決](#自動衝突解決)” 中討論過的那些。舉例來說，Riak 的資料型別就支援使用稱為 CRDT 【38,39,55】的能以合理方式自動進行兄弟合併的資料結構家族，包括對保留刪除的支援。

#### 版本向量

[圖 5-13](/v1/ddia_0513.png) 中的示例只使用了一個副本。當有多個副本但又沒有主庫時，演算法該如何修改？

[圖 5-13](/v1/ddia_0513.png) 使用單個版本號來捕獲操作之間的依賴關係，但是當多個副本併發接受寫入時，這是不夠的。相反，除了對每個鍵，我們還需要對 **每個副本** 使用版本號。每個副本在處理寫入時增加自己的版本號，並且跟蹤從其他副本中看到的版本號。這個資訊指出了要覆蓋哪些併發值，以及要保留哪些併發值或兄弟值。

所有副本的版本號集合稱為 **版本向量（version vector）**【56】。這個想法的一些變體正在被使用，但最有趣的可能是在 Riak 2.0 【58,59】中使用的 **虛線版本向量（dotted version vector）**【57】。我們不會深入細節，但是它的工作方式與我們在購物車示例中看到的非常相似。

與 [圖 5-13](/v1/ddia_0513.png) 中的版本號一樣，當讀取值時，版本向量會從資料庫副本傳送到客戶端，並且隨後寫入值時需要將其傳送回資料庫。（Riak 將版本向量編碼為一個字串，並稱其為 **因果上下文**，即 causal context）。版本向量允許資料庫區分覆蓋寫入和併發寫入。

另外，就像在單個副本中的情況一樣，應用程式可能需要合併併發值。版本向量結構能夠確保從一個副本讀取並隨後寫回到另一個副本是安全的。這樣做雖然可能會產生兄弟值，但只要能正確合併這些兄弟值就不會丟失資料。

> #### 版本向量和向量時鐘
>
> 版本向量有時也被稱為向量時鐘，即使它們不完全相同。其中的差別很微妙 —— 細節請參閱參考資料【57,60,61】。簡而言之，在比較副本的狀態時，版本向量才是正確的資料結構。


## 本章小結

在本章中，我們考察了複製的問題。複製可以用於幾個目的：

高可用性
: 即使在一臺機器（或多臺機器，或整個資料中心）停機的情況下也能保持系統正常執行

斷開連線的操作
: 允許應用程式在網路中斷時繼續工作

延遲
: 將資料放置在地理上距離使用者較近的地方，以便使用者能夠更快地與其互動

可伸縮性
: 透過在副本上讀，能夠處理比單機更大的讀取量


儘管是一個簡單的目標 - 在幾臺機器上保留相同資料的副本，但複製卻是一個非常棘手的問題。它需要仔細考慮併發和所有可能出錯的事情，並處理這些故障的後果。至少，我們需要處理不可用的節點和網路中斷（這還不包括更隱蔽的故障，例如由於軟體錯誤導致的靜默資料損壞）。

我們討論了複製的三種主要方法：

單主複製
: 客戶端將所有寫入操作傳送到單個節點（主庫），該節點將資料更改事件流傳送到其他副本（從庫）。讀取可以在任何副本上執行，但從庫的讀取結果可能是陳舊的。

多主複製
: 客戶端將每個寫入傳送到幾個主庫節點之一，其中任何一個主庫都可以接受寫入。主庫將資料更改事件流傳送給彼此以及任何從庫節點。

無主複製
: 客戶端將每個寫入傳送到幾個節點，並從多個節點並行讀取，以檢測和糾正具有陳舊資料的節點。

每種方法都有優點和缺點。單主複製是非常流行的，因為它很容易理解，不需要擔心衝突解決。在出現故障節點、網路中斷和延遲峰值的情況下，多主複製和無主複製可以更加健壯，其代價是難以推理並且僅提供非常弱的一致性保證。

複製可以是同步的，也可以是非同步的，這在發生故障時對系統行為有深遠的影響。儘管在系統執行平穩時非同步複製速度很快，但是要弄清楚在複製延遲增加和伺服器故障時會發生什麼，這一點很重要。如果主庫失敗後你將一個非同步更新的從庫提升為新的主庫，那麼最近提交的資料可能會丟失。

我們研究了一些可能由複製延遲引起的奇怪效應，我們也討論了一些有助於決定應用程式在複製延遲時的行為的一致性模型：

讀己之寫一致性
: 使用者應該總是能看到自己提交的資料。

單調讀
: 使用者在看到某個時間點的資料後，他們不應該再看到該資料在更早時間點的情況。

一致字首讀
: 使用者應該看到資料處於一種具有因果意義的狀態：例如，按正確的順序看到一個問題和對應的回答。

最後，我們討論了多主複製和無主複製方法所固有的併發問題：因為他們允許多個寫入併發發生，這可能會導致衝突。我們研究了一個數據庫可以使用的演算法來確定一個操作是否發生在另一個操作之前，或者它們是否併發發生。我們還談到了透過合併併發更新來解決衝突的方法。

在下一章中，我們將繼續考察資料分佈在多臺機器間的另一種不同於 **複製** 的形式：將大資料集分割成 **分割槽**。


## 參考文獻

1. Bruce G. Lindsay, Patricia Griffiths Selinger, C. Galtieri, et al.: “[Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf),” IBM Research, Research Report RJ2571(33471), July 1979.
1. “[Oracle Active Data Guard Real-Time Data Protection and Availability](http://www.oracle.com/technetwork/database/availability/active-data-guard-wp-12c-1896127.pdf),” Oracle White Paper, June 2013.
1. “[AlwaysOn Availability Groups](http://msdn.microsoft.com/en-us/library/hh510230.aspx),” in *SQL Server Books Online*, Microsoft, 2012.
1. Lin Qiao, Kapil Surlaker, Shirshanka Das, et al.: “[On Brewing Fresh Espresso: LinkedIn’s Distributed Data Serving Platform](http://www.slideshare.net/amywtang/espresso-20952131),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Jun Rao: “[Intra-Cluster Replication for Apache Kafka](http://www.slideshare.net/junrao/kafka-replication-apachecon2013),” at *ApacheCon North America*, February 2013.
1. “[Highly Available Queues](https://www.rabbitmq.com/ha.html),” in *RabbitMQ Server Documentation*, Pivotal Software, Inc., 2014.
1. Yoshinori Matsunobu: “[Semi-Synchronous Replication at Facebook](http://yoshinorimatsunobu.blogspot.co.uk/2014/04/semi-synchronous-replication-at-facebook.html),” *yoshinorimatsunobu.blogspot.co.uk*, April 1, 2014.
1. Robbert van Renesse and Fred B. Schneider: “[Chain Replication for Supporting High Throughput and Availability](http://static.usenix.org/legacy/events/osdi04/tech/full_papers/renesse/renesse.pdf),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Jeff Terrace and Michael J. Freedman: “[Object Storage on CRAQ: High-Throughput Chain Replication for Read-Mostly Workloads](https://www.usenix.org/legacy/event/usenix09/tech/full_papers/terrace/terrace.pdf),” at *USENIX Annual Technical Conference* (ATC), June 2009.
1. Brad Calder, Ju Wang, Aaron Ogus, et al.: “[Windows Azure Storage: A Highly Available Cloud Storage Service with Strong Consistency](http://sigops.org/sosp/sosp11/current/2011-Cascais/printable/11-calder.pdf),” at *23rd ACM Symposium on Operating Systems Principles* (SOSP), October 2011.
1. Andrew Wang: “[Windows Azure Storage](https://www.umbrant.com/2016/02/04/windows-azure-storage/),” *umbrant.com*, February 4, 2016.
1. “[Percona Xtrabackup - Documentation](https://www.percona.com/doc/percona-xtrabackup/2.1/index.html),” Percona LLC, 2014.
1. Jesse Newland: “[GitHub Availability This Week](https://github.com/blog/1261-github-availability-this-week),” *github.com*, September 14, 2012.
1. Mark Imbriaco: “[Downtime Last Saturday](https://github.com/blog/1364-downtime-last-saturday),” *github.com*, December 26, 2012.
1. John Hugg: “[‘All in’ with Determinism for Performance and Testing in Distributed Systems](https://www.youtube.com/watch?v=gJRj3vJL4wE),” at *Strange Loop*, September 2015.
1. Amit Kapila: “[WAL Internals of PostgreSQL](http://www.pgcon.org/2012/schedule/attachments/258_212_Internals%20Of%20PostgreSQL%20Wal.pdf),” at *PostgreSQL Conference* (PGCon), May 2012.
1. [*MySQL Documentation*](https://dev.mysql.com/doc/refman/en/binary-log.html). Oracle, 2025.
1. Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, et al.: “[Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. “[Oracle GoldenGate 12c: Real-Time Access to Real-Time Information](https://web.archive.org/web/20200110231516/http://www.oracle.com/us/products/middleware/data-integration/oracle-goldengate-realtime-access-2031152.pdf),” Oracle White Paper, October 2013.
1. Shirshanka Das, Chavdar Botev, Kapil Surlaker, et al.: “[All Aboard the Databus!](http://www.socc2012.org/s18-das.pdf),” at *ACM Symposium on Cloud Computing* (SoCC), October 2012.
1. Greg Sabino Mullane: “[Version 5 of Bucardo Database Replication System](https://www.endpointdev.com/blog/2014/06/bucardo-5-multimaster-postgres-released/),” *blog.endpoint.com*, June 23, 2014.
1. Werner Vogels: “[Eventually Consistent](http://queue.acm.org/detail.cfm?id=1466448),” *ACM Queue*, volume 6, number 6, pages 14–19, October 2008. [doi:10.1145/1466443.1466448](http://dx.doi.org/10.1145/1466443.1466448)
1. Douglas B. Terry: “[Replicated Data Consistency Explained Through Baseball](https://www.microsoft.com/en-us/research/publication/replicated-data-consistency-explained-through-baseball/),” Microsoft Research, Technical Report MSR-TR-2011-137, October 2011.
1. Douglas B. Terry, Alan J. Demers, Karin Petersen, et al.: “[Session Guarantees for Weakly Consistent Replicated Data](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.71.2269&rep=rep1&type=pdf),” at *3rd International Conference on Parallel and Distributed Information Systems* (PDIS), September 1994. [doi:10.1109/PDIS.1994.331722](http://dx.doi.org/10.1109/PDIS.1994.331722)
1. Terry Pratchett: *Reaper Man: A Discworld Novel*. Victor Gollancz, 1991. ISBN: 978-0-575-04979-6
1. “[Tungsten Replicator](https://github.com/holys/tungsten-replicator),” *github.com*.
1. “[BDR 0.10.0 Documentation](https://web.archive.org/web/20160728020040/http://bdr-project.org/docs/next/index.html),” The PostgreSQL Global Development Group, *bdr-project.org*, 2015.
1. Robert Hodges: “[If You *Must* Deploy Multi-Master Replication, Read This First](http://scale-out-blog.blogspot.co.uk/2012/04/if-you-must-deploy-multi-master.html),” *scale-out-blog.blogspot.co.uk*, March 30, 2012.
1. J. Chris Anderson, Jan Lehnardt, and Noah Slater: *CouchDB: The Definitive Guide*. O'Reilly Media, 2010. ISBN: 978-0-596-15589-6
1. AppJet, Inc.: “[Etherpad and EasySync Technical Manual](https://github.com/ether/etherpad-lite/blob/e2ce9dc/doc/easysync/easysync-full-description.pdf),” *github.com*, March 26, 2011.
1. John Day-Richter: “[What’s Different About the New Google Docs: Making Collaboration Fast](https://drive.googleblog.com/2010/09/whats-different-about-new-google-docs.html),” *drive.googleblog.com*, September 23, 2010.
1. Martin Kleppmann and Alastair R. Beresford: “[A Conflict-Free Replicated JSON Datatype](http://arxiv.org/abs/1608.03960),” arXiv:1608.03960, August 13, 2016.
1. Frazer Clement: “[Eventual Consistency – Detecting Conflicts](http://messagepassing.blogspot.co.uk/2011/10/eventual-consistency-detecting.html),” *messagepassing.blogspot.co.uk*, October 20, 2011.
1. Robert Hodges: “[State of the Art for MySQL Multi-Master Replication](https://web.archive.org/web/20161010052017/https://www.percona.com/live/mysql-conference-2013/sites/default/files/slides/mysql-multi-master-state-of-art-2013-04-24_0.pdf),” at *Percona Live: MySQL Conference & Expo*, April 2013.
1. John Daily: “[Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/),” *riak.com*, November 12, 2013.
1. Riley Berton: “[Is Bi-Directional Replication (BDR) in Postgres Transactional?](https://web.archive.org/web/20211204170610/http://sdf.org/~riley/blog/2016/01/04/is-bi-directional-replication-bdr-in-postgres-transactional/),” *sdf.org*, January 4, 2016.
1. Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, et al.: “[Dynamo: Amazon's Highly Available Key-Value Store](http://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf),” at *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007.
1. Marc Shapiro, Nuno Preguiça, Carlos Baquero, and Marek Zawirski: “[A Comprehensive Study of Convergent and Commutative Replicated Data Types](http://hal.inria.fr/inria-00555588/),” INRIA Research Report no. 7506, January 2011.
1. Sam Elliott: “[CRDTs: An UPDATE (or Maybe Just a PUT)](https://speakerdeck.com/lenary/crdts-an-update-or-just-a-put),” at *RICON West*, October 2013.
1. Russell Brown: “[A Bluffers Guide to CRDTs in Riak](https://gist.github.com/russelldb/f92f44bdfb619e089a4d),” *gist.github.com*, October 28, 2013.
1. Benjamin Farinier, Thomas Gazagnaire, and Anil Madhavapeddy: “[Mergeable Persistent Data Structures](http://gazagnaire.org/pub/FGM15.pdf),” at *26es Journées Francophones des Langages Applicatifs* (JFLA), January 2015.
1. Chengzheng Sun and Clarence Ellis: “[Operational Transformation in Real-Time Group Editors: Issues, Algorithms, and Achievements](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.53.933&rep=rep1&type=pdf),” at *ACM Conference on Computer Supported Cooperative Work* (CSCW), November 1998.
1. Lars Hofhansl: “[HBASE-7709: Infinite Loop Possible in Master/Master Replication](https://issues.apache.org/jira/browse/HBASE-7709),” *issues.apache.org*, January 29, 2013.
1. David K. Gifford: “[Weighted Voting for Replicated Data](https://www.cs.cmu.edu/~15-749/READINGS/required/availability/gifford79.pdf),” at *7th ACM Symposium on Operating Systems Principles* (SOSP), December 1979. [doi:10.1145/800215.806583](http://dx.doi.org/10.1145/800215.806583)
1. Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman: “[Flexible Paxos: Quorum Intersection Revisited](https://arxiv.org/abs/1608.06696),” *arXiv:1608.06696*, August 24, 2016.
1. Joseph Blomstedt: “[Re: Absolute Consistency](https://web.archive.org/web/20190919171316/http://lists.basho.com:80/pipermail/riak-users_lists.basho.com/2012-January/007157.html),” email to *riak-users* mailing list, *lists.basho.com*, January 11, 2012.
1. Joseph Blomstedt: “[Bringing Consistency to Riak](https://vimeo.com/51973001),” at *RICON West*, October 2012.
1. Peter Bailis, Shivaram Venkataraman, Michael J. Franklin, et al.: “[Quantifying Eventual Consistency with PBS](http://www.bailis.org/papers/pbs-cacm2014.pdf),” *Communications of the ACM*, volume 57, number 8, pages 93–102, August 2014. [doi:10.1145/2632792](http://dx.doi.org/10.1145/2632792)
1. Jonathan Ellis: “[Modern Hinted Handoff](http://www.datastax.com/dev/blog/modern-hinted-handoff),” *datastax.com*, December 11, 2012.
1. “[Project Voldemort Wiki](https://github.com/voldemort/voldemort/wiki),” *github.com*, 2013.
1. “[Apache Cassandra Documentation](https://cassandra.apache.org/doc/latest/),” Apache Software Foundation, *cassandra.apache.org*.
1. “[Riak Enterprise: Multi-Datacenter Replication](https://web.archive.org/web/20150513041837/http://basho.com/assets/MultiDatacenter_Replication.pdf).” Technical whitepaper, Basho Technologies, Inc., September 2014.
1. Jonathan Ellis: “[Why Cassandra Doesn't Need Vector Clocks](http://www.datastax.com/dev/blog/why-cassandra-doesnt-need-vector-clocks),” *datastax.com*, September 2, 2013.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Joel Jacobson: “[Riak 2.0: Data Types](https://web.archive.org/web/20160327135816/http://blog.joeljacobson.com/riak-2-0-data-types/),” *blog.joeljacobson.com*, March 23, 2014.
1. D. Stott Parker Jr., Gerald J. Popek, Gerard Rudisin, et al.: “[Detection of Mutual Inconsistency in Distributed Systems](https://web.archive.org/web/20170808212704/https://zoo.cs.yale.edu/classes/cs426/2013/bib/parker83detection.pdf),” *IEEE Transactions on Software Engineering*, volume 9, number 3, pages 240–247, May 1983. [doi:10.1109/TSE.1983.236733](http://dx.doi.org/10.1109/TSE.1983.236733)
1. Nuno Preguiça, Carlos Baquero, Paulo Sérgio Almeida, et al.: “[Dotted Version Vectors: Logical Clocks for Optimistic Replication](http://arxiv.org/pdf/1011.5808v1.pdf),” arXiv:1011.5808, November 26, 2010.
1. Sean Cribbs: “[A Brief History of Time in Riak](https://speakerdeck.com/seancribbs/a-brief-history-of-time-in-riak),” at *RICON*, October 2014.
1. Russell Brown: “[Vector Clocks Revisited Part 2: Dotted Version Vectors](https://riak.com/posts/technical/vector-clocks-revisited-part-2-dotted-version-vectors/),” *basho.com*, November 10, 2015.
1. Carlos Baquero: “[Version Vectors Are Not Vector Clocks](https://haslab.wordpress.com/2011/07/08/version-vectors-are-not-vector-clocks/),” *haslab.wordpress.com*, July 8, 2011.
1. Reinhard Schwarz and Friedemann Mattern: “[Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](http://dcg.ethz.ch/lectures/hs08/seminar/papers/mattern4.pdf),” *Distributed Computing*, volume 7, number 3, pages 149–174, March 1994. [doi:10.1007/BF02277859](http://dx.doi.org/10.1007/BF02277859)


================================================
FILE: content/v1_tw/ch6.md
================================================
---
title: "第六章：分割槽"
linkTitle: "6. 分割槽"
weight: 206
breadcrumbs: false
---


![](/map/ch06.png)

> 我們必須跳出電腦指令序列的窠臼。敘述定義、描述元資料、梳理關係，而不是編寫過程。
>
> —— Grace Murray Hopper，未來的計算機及其管理（1962）


在 [第五章](/v1_tw/ch5) 中，我們討論了複製 —— 即資料在不同節點上的副本。對於非常大的資料集或非常高的吞吐量，僅僅進行複製是不夠的：我們需要將資料進行 **分割槽（partitions）**，也稱為 **分片（sharding）**[^i]。

[^i]: 正如本章所討論的，分割槽是一種有意將大型資料庫分解成小型資料庫的方式。它與 **網路分割槽（network partitions, netsplits）** 無關，這是節點之間網路故障的一種。我們將在 [第八章](/v1_tw/ch8) 討論這些錯誤。

> [!TIP] 術語澄清
>
> 上文中的 **分割槽（partition）**，在 MongoDB，Elasticsearch 和 Solr Cloud 中被稱為 **分片（shard）**，在 HBase 中稱之為 **區域（Region）**，Bigtable 中則是 **表塊（tablet）**，Cassandra 和 Riak 中是 **虛節點（vnode）**，Couchbase 中叫做 **虛桶（vBucket）**。但是 **分割槽（partitioning）** 是最約定俗成的叫法。

通常情況下，每條資料（每條記錄，每行或每個文件）屬於且僅屬於一個分割槽。有很多方法可以實現這一點，本章將進行深入討論。實際上，每個分割槽都是自己的小型資料庫，儘管資料庫可能支援同時進行多個分割槽的操作。

分割槽主要是為了 **可伸縮性**。不同的分割槽可以放在不共享叢集中的不同節點上（請參閱 [第二部分](/v1_tw/part-ii) 關於 [無共享架構](/v1_tw/part-ii#無共享架構) 的定義）。因此，大資料集可以分佈在多個磁碟上，並且查詢負載可以分佈在多個處理器上。

對於在單個分割槽上執行的查詢，每個節點可以獨立執行對自己分割槽的查詢，因此可以透過新增更多的節點來擴大查詢吞吐量。大型、複雜的查詢可能會跨越多個節點並行處理，儘管這也帶來了新的困難。

分割槽資料庫在 20 世紀 80 年代由 Teradata 和 NonStop SQL【1】等產品率先推出，最近因為 NoSQL 資料庫和基於 Hadoop 的資料倉庫重新被關注。有些系統是為事務性工作設計的，有些系統則用於分析（請參閱 “[事務處理還是分析](/v1_tw/ch3#事務處理還是分析？)”）：這種差異會影響系統的運作方式，但是分割槽的基本原理均適用於這兩種工作方式。

在本章中，我們將首先介紹分割大型資料集的不同方法，並觀察索引如何與分割槽配合。然後我們將討論 [分割槽再平衡（rebalancing）](#分割槽再平衡)，如果想要新增或刪除叢集中的節點，則必須進行再平衡。最後，我們將概述資料庫如何將請求路由到正確的分割槽並執行查詢。

## 分割槽與複製

分割槽通常與複製結合使用，使得每個分割槽的副本儲存在多個節點上。這意味著，即使每條記錄屬於一個分割槽，它仍然可以儲存在多個不同的節點上以獲得容錯能力。

一個節點可能儲存多個分割槽。如果使用主從複製模型，則分割槽和複製的組合如 [圖 6-1](/v1/ddia_0601.png) 所示。每個分割槽領導者（主庫）被分配給一個節點，追隨者（從庫）被分配給其他節點。每個節點可能是某些分割槽的主庫，同時是其他分割槽的從庫。

我們在 [第五章](/v1_tw/ch5) 討論的關於資料庫複製的所有內容同樣適用於分割槽的複製。大多數情況下，分割槽方案的選擇與複製方案的選擇是獨立的，為簡單起見，本章中將忽略複製。

![](/v1/ddia_0601.png)

**圖 6-1 組合使用複製和分割槽：每個節點充當某些分割槽的主庫，同時充當其他分割槽的從庫。**

## 鍵值資料的分割槽

假設你有大量資料並且想要分割槽，如何決定在哪些節點上儲存哪些記錄呢？

分割槽目標是將資料和查詢負載均勻分佈在各個節點上。如果每個節點公平分享資料和負載，那麼理論上 10 個節點應該能夠處理 10 倍的資料量和 10 倍的單個節點的讀寫吞吐量（暫時忽略複製）。

如果分割槽是不公平的，一些分割槽比其他分割槽有更多的資料或查詢，我們稱之為 **偏斜（skew）**。資料偏斜的存在使分割槽效率下降很多。在極端的情況下，所有的負載可能壓在一個分割槽上，其餘 9 個節點空閒，瓶頸落在這一個繁忙的節點上。不均衡導致的高負載的分割槽被稱為 **熱點（hot spot）**。

避免熱點最簡單的方法是將記錄隨機分配給節點。這將在所有節點上平均分配資料，但是它有一個很大的缺點：當你試圖讀取一個特定的值時，你無法知道它在哪個節點上，所以你必須並行地查詢所有的節點。

我們可以做得更好。現在假設你有一個簡單的鍵值資料模型，其中你總是透過其主鍵訪問記錄。例如，在一本老式的紙質百科全書中，你可以透過標題來查詢一個條目；由於所有條目按字母順序排序，因此你可以快速找到你要查詢的條目。

### 根據鍵的範圍分割槽

一種分割槽的方法是為每個分割槽指定一塊連續的鍵範圍（從最小值到最大值），如紙質百科全書的卷（[圖 6-2](/v1/ddia_0602.png)）。如果知道範圍之間的邊界，則可以輕鬆確定哪個分割槽包含某個值。如果你還知道分割槽所在的節點，那麼可以直接向相應的節點發出請求（對於百科全書而言，就像從書架上選取正確的書籍）。

![](/v1/ddia_0602.png)

**圖 6-2 印刷版百科全書按照關鍵字範圍進行分割槽**

鍵的範圍不一定均勻分佈，因為資料也很可能不均勻分佈。例如在 [圖 6-2](/v1/ddia_0602.png) 中，第 1 卷包含以 A 和 B 開頭的單詞，但第 12 卷則包含以 T、U、V、X、Y 和 Z 開頭的單詞。只是簡單地規定每個卷包含兩個字母會導致一些卷比其他卷大。為了均勻分配資料，分割槽邊界需要依據資料調整。

分割槽邊界可以由管理員手動選擇，也可以由資料庫自動選擇（我們會在 “[分割槽再平衡](#分割槽再平衡)” 中更詳細地討論分割槽邊界的選擇）。Bigtable 使用了這種分割槽策略，以及其開源等價物 HBase 【2, 3】、RethinkDB 和 2.4 版本之前的 MongoDB 【4】。

在每個分割槽中，我們可以按照一定的順序儲存鍵（請參閱 “[SSTables 和 LSM 樹](/v1_tw/ch3#SSTables和LSM樹)”）。好處是進行範圍掃描非常簡單，你可以將鍵作為聯合索引來處理，以便在一次查詢中獲取多個相關記錄（請參閱 “[多列索引](/v1_tw/ch3#多列索引)”）。例如，假設我們有一個程式來儲存感測器網路的資料，其中主鍵是測量的時間戳（年月日時分秒）。範圍掃描在這種情況下非常有用，因為我們可以輕鬆獲取某個月份的所有資料。

然而，Key Range 分割槽的缺點是某些特定的訪問模式會導致熱點。如果主鍵是時間戳，則分割槽對應於時間範圍，例如，給每天分配一個分割槽。不幸的是，由於我們在測量發生時將資料從感測器寫入資料庫，因此所有寫入操作都會轉到同一個分割槽（即今天的分割槽），這樣分割槽可能會因寫入而過載，而其他分割槽則處於空閒狀態【5】。

為了避免感測器資料庫中的這個問題，需要使用除了時間戳以外的其他東西作為主鍵的第一個部分。例如，可以在每個時間戳前新增感測器名稱，這樣會首先按感測器名稱，然後按時間進行分割槽。假設有多個感測器同時執行，寫入負載將最終均勻分佈在不同分割槽上。現在，當想要在一個時間範圍內獲取多個感測器的值時，你需要為每個感測器名稱執行一個單獨的範圍查詢。

### 根據鍵的雜湊分割槽

由於偏斜和熱點的風險，許多分散式資料儲存使用雜湊函式來確定給定鍵的分割槽。

一個好的雜湊函式可以將偏斜的資料均勻分佈。假設你有一個 32 位雜湊函式，無論何時給定一個新的字串輸入，它將返回一個 0 到 $2^{32} - 1$ 之間的 “隨機” 數。即使輸入的字串非常相似，它們的雜湊也會均勻分佈在這個數字範圍內。

出於分割槽的目的，雜湊函式不需要多麼強壯的加密演算法：例如，Cassandra 和 MongoDB 使用 MD5，Voldemort 使用 Fowler-Noll-Vo 函式。許多程式語言都有內建的簡單雜湊函式（它們用於散列表），但是它們可能不適合分割槽：例如，在 Java 的 `Object.hashCode()` 和 Ruby 的 `Object#hash`，同一個鍵可能在不同的程序中有不同的雜湊值【6】。

一旦你有一個合適的鍵雜湊函式，你可以為每個分割槽分配一個雜湊範圍（而不是鍵的範圍），每個雜湊值落在分割槽範圍內的鍵將被儲存在該分割槽中，如 [圖 6-3](/v1/ddia_0603.png) 所示。

![](/v1/ddia_0603.png)

**圖 6-3 按雜湊鍵分割槽**

這種技術擅長在分割槽之間公平地分配鍵。分割槽邊界可以是均勻間隔的，也可以是偽隨機選擇的（在這種情況下，該技術有時也被稱為 **一致性雜湊**，即 consistent hashing）。

> #### 一致性雜湊
>
> 一致性雜湊由 Karger 等人定義【7】，用於跨網際網路級別的快取系統（例如 CDN），是一種能均勻分配負載的方法。它使用隨機選擇的 **分割槽邊界（partition boundaries）** 來避免中央控制或分散式共識的需要。請注意，這裡的一致性與複製一致性（請參閱 [第五章](/v1_tw/ch5)）或 ACID 一致性（請參閱 [第七章](/v1_tw/ch7)）無關，而只是描述了一種再平衡（rebalancing）的特定方法。
>
> 正如我們將在 “[分割槽再平衡](#分割槽再平衡)” 中所看到的，這種特殊的方法對於資料庫實際上並不是很好，所以在實際中很少使用（某些資料庫的文件仍然會使用一致性雜湊的說法，但是它往往是不準確的）。因為有可能產生混淆，所以最好避免使用一致性雜湊這個術語，而只是把它稱為 **雜湊分割槽（hash partitioning）**。

不幸的是，透過使用鍵雜湊進行分割槽，我們失去了鍵範圍分割槽的一個很好的屬性：高效執行範圍查詢的能力。曾經相鄰的鍵現在分散在所有分割槽中，所以它們之間的順序就丟失了。在 MongoDB 中，如果你使用了基於雜湊的分割槽模式，則任何範圍查詢都必須傳送到所有分割槽【4】。Riak【9】、Couchbase 【10】或 Voldemort 不支援主鍵上的範圍查詢。

Cassandra 採取了折衷的策略【11, 12, 13】。Cassandra 中的表可以使用由多個列組成的複合主鍵來宣告。鍵中只有第一列會作為雜湊的依據，而其他列則被用作 Cassandra 的 SSTables 中排序資料的連線索引。儘管查詢無法在複合主鍵的第一列中按範圍掃表，但如果第一列已經指定了固定值，則可以對該鍵的其他列執行有效的範圍掃描。

組合索引方法為一對多關係提供了一個優雅的資料模型。例如，在社交媒體網站上，一個使用者可能會發布很多更新。如果更新的主鍵被選擇為 `(user_id, update_timestamp)`，那麼你可以有效地檢索特定使用者在某個時間間隔內按時間戳排序的所有更新。不同的使用者可以儲存在不同的分割槽上，對於每個使用者，更新按時間戳順序儲存在單個分割槽上。

### 負載偏斜與熱點消除

如前所述，雜湊分割槽可以幫助減少熱點。但是，它不能完全避免它們：在極端情況下，所有的讀寫操作都是針對同一個鍵的，所有的請求都會被路由到同一個分割槽。

這種場景也許並不常見，但並非聞所未聞：例如，在社交媒體網站上，一個擁有數百萬追隨者的名人使用者在做某事時可能會引發一場風暴【14】。這個事件可能導致同一個鍵的大量寫入（鍵可能是名人的使用者 ID，或者人們正在評論的動作的 ID）。雜湊策略不起作用，因為兩個相同 ID 的雜湊值仍然是相同的。

如今，大多數資料系統無法自動補償這種高度偏斜的負載，因此應用程式有責任減少偏斜。例如，如果一個主鍵被認為是非常火爆的，一個簡單的方法是在主鍵的開始或結尾新增一個隨機數。只要一個兩位數的十進位制隨機數就可以將主鍵分散為 100 種不同的主鍵，從而儲存在不同的分割槽中。

然而，將主鍵進行分割之後，任何讀取都必須要做額外的工作，因為他們必須從所有 100 個主鍵分佈中讀取資料並將其合併。此技術還需要額外的記錄：只需要對少量熱點附加隨機數；對於寫入吞吐量低的絕大多數主鍵來說是不必要的開銷。因此，你還需要一些方法來跟蹤哪些鍵需要被分割。

也許在將來，資料系統將能夠自動檢測和補償偏斜的工作負載；但現在，你需要自己來權衡。


## 分割槽與次級索引


到目前為止，我們討論的分割槽方案依賴於鍵值資料模型。如果只通過主鍵訪問記錄，我們可以從該鍵確定分割槽，並使用它來將讀寫請求路由到負責該鍵的分割槽。

如果涉及次級索引，情況會變得更加複雜（參考 “[其他索引結構](/v1_tw/ch3#其他索引結構)”）。次級索引通常並不能唯一地標識記錄，而是一種搜尋記錄中出現特定值的方式：查詢使用者 123 的所有操作、查詢包含詞語 `hogwash` 的所有文章、查詢所有顏色為紅色的車輛等等。

次級索引是關係型資料庫的基礎，並且在文件資料庫中也很普遍。許多鍵值儲存（如 HBase 和 Voldemort）為了減少實現的複雜度而放棄了次級索引，但是一些（如 Riak）已經開始新增它們，因為它們對於資料模型實在是太有用了。並且次級索引也是 Solr 和 Elasticsearch 等搜尋伺服器的基石。

次級索引的問題是它們不能整齊地對映到分割槽。有兩種用次級索引對資料庫進行分割槽的方法：**基於文件的分割槽（document-based）** 和 **基於關鍵詞（term-based）的分割槽**。

### 基於文件的次級索引進行分割槽

假設你正在經營一個銷售二手車的網站（如 [圖 6-4](/v1/ddia_0604.png) 所示）。每個列表都有一個唯一的 ID—— 稱之為文件 ID—— 並且用文件 ID 對資料庫進行分割槽（例如，分割槽 0 中的 ID 0 到 499，分割槽 1 中的 ID 500 到 999 等）。

你想讓使用者搜尋汽車，允許他們透過顏色和廠商過濾，所以需要一個在顏色和廠商上的次級索引（文件資料庫中這些是 **欄位（field）**，關係資料庫中這些是 **列（column）** ）。如果你聲明了索引，則資料庫可以自動執行索引 [^ii]。例如，無論何時將紅色汽車新增到資料庫，資料庫分割槽都會自動將其新增到索引條目 `color:red` 的文件 ID 列表中。

[^ii]: 如果資料庫僅支援鍵值模型，則你可能會嘗試在應用程式程式碼中建立從值到文件 ID 的對映來實現次級索引。如果沿著這條路線走下去，請萬分小心，確保你的索引與底層資料保持一致。競爭條件和間歇性寫入失敗（其中一些更改已儲存，但其他更改未儲存）很容易導致資料不同步 - 請參閱 “[多物件事務的需求](/v1_tw/ch7#多物件事務的需求)”。

![](/v1/ddia_0604.png)

**圖 6-4 基於文件的次級索引進行分割槽**

在這種索引方法中，每個分割槽是完全獨立的：每個分割槽維護自己的次級索引，僅覆蓋該分割槽中的文件。它不關心儲存在其他分割槽的資料。無論何時你需要寫入資料庫（新增，刪除或更新文件），只需處理包含你正在編寫的文件 ID 的分割槽即可。出於這個原因，**文件分割槽索引** 也被稱為 **本地索引**（而不是將在下一節中描述的 **全域性索引**）。

但是，從文件分割槽索引中讀取需要注意：除非你對文件 ID 做了特別的處理，否則沒有理由將所有具有特定顏色或特定品牌的汽車放在同一個分割槽中。在 [圖 6-4](/v1/ddia_0604.png) 中，紅色汽車出現在分割槽 0 和分割槽 1 中。因此，如果要搜尋紅色汽車，則需要將查詢傳送到所有分割槽，並合併所有返回的結果。


這種查詢分割槽資料庫的方法有時被稱為 **分散 / 聚集（scatter/gather）**，並且可能會使次級索引上的讀取查詢相當昂貴。即使並行查詢分割槽，分散 / 聚集也容易導致尾部延遲放大（請參閱 “[實踐中的百分位點](/v1_tw/ch1#實踐中的百分位點)”）。然而，它被廣泛使用：MongoDB，Riak 【15】，Cassandra 【16】，Elasticsearch 【17】，SolrCloud 【18】和 VoltDB 【19】都使用文件分割槽次級索引。大多數資料庫供應商建議你構建一個能從單個分割槽提供次級索引查詢的分割槽方案，但這並不總是可行，尤其是當在單個查詢中使用多個次級索引時（例如同時需要按顏色和製造商查詢）。


### 基於關鍵詞(Term)的次級索引進行分割槽

我們可以構建一個覆蓋所有分割槽資料的 **全域性索引**，而不是給每個分割槽建立自己的次級索引（本地索引）。但是，我們不能只把這個索引儲存在一個節點上，因為它可能會成為瓶頸，違背了分割槽的目的。全域性索引也必須進行分割槽，但可以採用與主鍵不同的分割槽方式。

[圖 6-5](/v1/ddia_0605.png) 描述了這可能是什麼樣子：來自所有分割槽的紅色汽車在紅色索引中，並且索引是分割槽的，首字母從 `a` 到 `r` 的顏色在分割槽 0 中，`s` 到 `z` 的在分割槽 1。汽車製造商的索引也與之類似（分割槽邊界在 `f` 和 `h` 之間）。

![](/v1/ddia_0605.png)

**圖 6-5 基於關鍵詞對次級索引進行分割槽**

我們將這種索引稱為 **關鍵詞分割槽（term-partitioned）**，因為我們尋找的關鍵詞決定了索引的分割槽方式。例如，一個關鍵詞可能是：`color:red`。**關鍵詞（Term）** 這個名稱來源於全文搜尋索引（一種特殊的次級索引），指文件中出現的所有單詞。

和之前一樣，我們可以透過 **關鍵詞** 本身或者它的雜湊進行索引分割槽。根據關鍵詞本身來分割槽對於範圍掃描非常有用（例如對於數值類的屬性，像汽車的報價），而對關鍵詞的雜湊分割槽提供了負載均衡的能力。

關鍵詞分割槽的全域性索引優於文件分割槽索引的地方是它可以使讀取更有效率：不需要 **分散 / 聚集** 所有分割槽，客戶端只需要向包含關鍵詞的分割槽發出請求。全域性索引的缺點在於寫入速度較慢且較為複雜，因為寫入單個文件現在可能會影響索引的多個分割槽（文件中的每個關鍵詞可能位於不同的分割槽或者不同的節點上）。

理想情況下，索引總是最新的，寫入資料庫的每個文件都會立即反映在索引中。但是，在關鍵詞分割槽索引中，這需要跨分割槽的分散式事務，並不是所有資料庫都支援（請參閱 [第七章](/v1_tw/ch7) 和 [第九章](/v1_tw/ch9)）。

在實踐中，對全域性次級索引的更新通常是 **非同步** 的（也就是說，如果在寫入之後不久讀取索引，剛才所做的更改可能尚未反映在索引中）。例如，Amazon DynamoDB 聲稱在正常情況下，其全域性次級索引會在不到一秒的時間內更新，但在基礎架構出現故障的情況下可能會有延遲【20】。

全域性關鍵詞分割槽索引的其他用途包括 Riak 的搜尋功能【21】和 Oracle 資料倉庫，它允許你在本地和全域性索引之間進行選擇【22】。我們將在 [第十二章](/v1_tw/ch12) 中繼續關鍵詞分割槽次級索引實現的話題。

## 分割槽再平衡

隨著時間的推移，資料庫會有各種變化：

* 查詢吞吐量增加，所以你想要新增更多的 CPU 來處理負載。
* 資料集大小增加，所以你想新增更多的磁碟和 RAM 來儲存它。
* 機器出現故障，其他機器需要接管故障機器的責任。

所有這些更改都需要資料和請求從一個節點移動到另一個節點。將負載從叢集中的一個節點向另一個節點移動的過程稱為 **再平衡（rebalancing）**。

無論使用哪種分割槽方案，再平衡通常都要滿足一些最低要求：

* 再平衡之後，負載（資料儲存，讀取和寫入請求）應該在叢集中的節點之間公平地共享。
* 再平衡發生時，資料庫應該繼續接受讀取和寫入。
* 節點之間只移動必須的資料，以便快速再平衡，並減少網路和磁碟 I/O 負載。


### 再平衡策略

有幾種不同的分割槽分配方法【23】，讓我們依次簡要討論一下。

#### 反面教材：hash mod N

我們在前面說過（[圖 6-3](/v1/ddia_0603.png)），最好將可能的雜湊分成不同的範圍，並將每個範圍分配給一個分割槽（例如，如果 $0 ≤ hash(key) < b_0$，則將鍵分配給分割槽 0，如果 $b_0 ≤ hash(key) < b_1$，則分配給分割槽 1）。

也許你想知道為什麼我們不使用 ***取模（mod）***（許多程式語言中的 % 運算子）。例如，`hash(key) mod 10` 會返回一個介於 0 和 9 之間的數字（如果我們將雜湊寫為十進位制數，雜湊模 10 將是最後一個數字）。如果我們有 10 個節點，編號為 0 到 9，這似乎是將每個鍵分配給一個節點的簡單方法。

模 N（$mod\ N$）方法的問題是，如果節點數量 N 發生變化，大多數鍵將需要從一個節點移動到另一個節點。例如，假設 $hash(key)=123456$。如果最初有 10 個節點，那麼這個鍵一開始放在節點 6 上（因為 $123456\ mod\ 10 = 6$）。當你增長到 11 個節點時，鍵需要移動到節點 3（$123456\ mod\ 11 = 3$），當你增長到 12 個節點時，需要移動到節點 0（$123456\ mod\ 12 = 0$）。這種頻繁的資料遷移使得再平衡的成本過高。

我們需要一種只移動必需資料的方法。

#### 固定數量的分割槽

幸運的是，有一個相當簡單的解決方案：建立比節點更多的分割槽，並為每個節點分配多個分割槽。例如，執行在 10 個節點的叢集上的資料庫可能會從一開始就被拆分為 1,000 個分割槽，因此大約有 100 個分割槽被分配給每個節點。

現在，如果一個節點被新增到叢集中，新節點可以從當前每個節點中 **竊取** 一些分割槽，直到分割槽再次公平分配。這個過程如 [圖 6-6](/v1/ddia_0606.png) 所示。如果從叢集中刪除一個節點，則會發生相反的情況。

只有整個分割槽在節點之間移動。分割槽的數量不會改變，鍵所指定的分割槽也不會改變。唯一改變的是分割槽所在的節點。這種變更並不是即時的 — 在網路上傳輸大量的資料需要一些時間 — 所以在傳輸過程中，原有分割槽仍然會接受讀寫操作。

![](/v1/ddia_0606.png)

**圖 6-6 將新節點新增到每個節點具有多個分割槽的資料庫叢集。**

原則上，你甚至可以解決叢集中的硬體不匹配問題：透過為更強大的節點分配更多的分割槽，可以強制這些節點承載更多的負載。在 Riak 【15】、Elasticsearch 【24】、Couchbase 【10】和 Voldemort 【25】中使用了這種再平衡的方法。

在這種配置中，分割槽的數量通常在資料庫第一次建立時確定，之後不會改變。雖然原則上可以分割和合併分割槽（請參閱下一節），但固定數量的分割槽在操作上更簡單，因此許多固定分割槽資料庫選擇不實施分割槽分割。因此，一開始配置的分割槽數就是你可以擁有的最大節點數量，所以你需要選擇足夠多的分割槽以適應未來的增長。但是，每個分割槽也有管理開銷，所以選擇太大的數字會適得其反。

如果資料集的總大小難以預估（例如，可能它開始很小，但隨著時間的推移會變得更大），選擇正確的分割槽數是困難的。由於每個分割槽包含了總資料量固定比率的資料，因此每個分割槽的大小與叢集中的資料總量成比例增長。如果分割槽非常大，再平衡和從節點故障恢復變得昂貴。但是，如果分割槽太小，則會產生太多的開銷。當分割槽大小 “恰到好處” 的時候才能獲得很好的效能，如果分割槽數量固定，但資料量變動很大，則難以達到最佳效能。

#### 動態分割槽

對於使用鍵範圍分割槽的資料庫（請參閱 “[根據鍵的範圍分割槽](#根據鍵的範圍分割槽)”），具有固定邊界的固定數量的分割槽將非常不便：如果邊界設定錯誤，可能會導致所有資料都在一個分割槽中，而其他分割槽則為空。手動重新配置分割槽邊界將非常繁瑣。

出於這個原因，按鍵的範圍進行分割槽的資料庫（如 HBase 和 RethinkDB）會動態建立分割槽。當分割槽增長到超過配置的大小時（在 HBase 上，預設值是 10GB），會被分成兩個分割槽，每個分割槽約佔一半的資料【26】。與之相反，如果大量資料被刪除並且分割槽縮小到某個閾值以下，則可以將其與相鄰分割槽合併。此過程與 B 樹頂層發生的過程類似（請參閱 “[B 樹](/v1_tw/ch3#B樹)”）。

每個分割槽分配給一個節點，每個節點可以處理多個分割槽，就像固定數量的分割槽一樣。大型分割槽拆分後，可以將其中的一半轉移到另一個節點，以平衡負載。在 HBase 中，分割槽檔案的傳輸透過 HDFS（底層使用的分散式檔案系統）來實現【3】。

動態分割槽的一個優點是分割槽數量適應總資料量。如果只有少量的資料，少量的分割槽就足夠了，所以開銷很小；如果有大量的資料，每個分割槽的大小被限制在一個可配置的最大值【23】。

需要注意的是，一個空的資料庫從一個分割槽開始，因為沒有關於在哪裡繪製分割槽邊界的先驗資訊。資料集開始時很小，直到達到第一個分割槽的分割點，所有寫入操作都必須由單個節點處理，而其他節點則處於空閒狀態。為了解決這個問題，HBase 和 MongoDB 允許在一個空的資料庫上配置一組初始分割槽（這被稱為 **預分割**，即 pre-splitting）。在鍵範圍分割槽的情況中，預分割需要提前知道鍵是如何進行分配的【4,26】。

動態分割槽不僅適用於資料的範圍分割槽，而且也適用於雜湊分割槽。從版本 2.4 開始，MongoDB 同時支援範圍和雜湊分割槽，並且都支援動態分割分割槽。

#### 按節點比例分割槽

透過動態分割槽，分割槽的數量與資料集的大小成正比，因為拆分和合併過程將每個分割槽的大小保持在固定的最小值和最大值之間。另一方面，對於固定數量的分割槽，每個分割槽的大小與資料集的大小成正比。在這兩種情況下，分割槽的數量都與節點的數量無關。

Cassandra 和 Ketama 使用的第三種方法是使分割槽數與節點數成正比 —— 換句話說，每個節點具有固定數量的分割槽【23,27,28】。在這種情況下，當節點數量保持不變時，每個分割槽的大小與資料集大小成比例地增長；但是當增加節點數時，分割槽將再次變小。由於較大的資料量通常需要較大數量的節點進行儲存，因此這種方法也使每個分割槽的大小較為穩定。

當一個新節點加入叢集時，它隨機選擇固定數量的現有分割槽進行拆分，然後佔有這些拆分分割槽中每個分割槽的一半，同時將每個分割槽的另一半留在原地。隨機化可能會產生不公平的分割，但是平均在更大數量的分割槽上時（在 Cassandra 中，預設情況下，每個節點有 256 個分割槽），新節點最終從現有節點獲得公平的負載份額。Cassandra 3.0 引入了另一種再平衡的演算法來避免不公平的分割【29】。

隨機選擇分割槽邊界要求使用基於雜湊的分割槽（可以從雜湊函式產生的數字範圍中挑選邊界）。實際上，這種方法最符合一致性雜湊的原始定義【7】（請參閱 “[一致性雜湊](#一致性雜湊)”）。最新的雜湊函式可以在較低元資料開銷的情況下達到類似的效果【8】。

### 運維：手動還是自動再平衡

關於再平衡有一個重要問題：自動還是手動進行？

在全自動再平衡（系統自動決定何時將分割槽從一個節點移動到另一個節點，無須人工干預）和完全手動（分割槽指派給節點由管理員明確配置，僅在管理員明確重新配置時才會更改）之間有一個權衡。例如，Couchbase、Riak 和 Voldemort 會自動生成建議的分割槽分配，但需要管理員提交才能生效。

全自動再平衡可以很方便，因為正常維護的操作工作較少。然而，它可能是不可預測的。再平衡是一個昂貴的操作，因為它需要重新路由請求並將大量資料從一個節點移動到另一個節點。如果沒有做好，這個過程可能會使網路或節點負載過重，降低其他請求的效能。

這種自動化與自動故障檢測相結合可能十分危險。例如，假設一個節點過載，並且對請求的響應暫時很慢。其他節點得出結論：過載的節點已經死亡，並自動重新平衡叢集，使負載離開它。這會對已經超負荷的節點，其他節點和網路造成額外的負載，從而使情況變得更糟，並可能導致級聯失敗。

出於這個原因，再平衡的過程中有人參與是一件好事。這比全自動的過程慢，但可以幫助防止運維意外。

## 請求路由

現在我們已經將資料集分割到多個機器上執行的多個節點上。但是仍然存在一個懸而未決的問題：當客戶想要發出請求時，如何知道要連線哪個節點？隨著分割槽的重新平衡，分割槽對節點的分配也發生變化。為了回答這個問題，需要有人知曉這些變化：如果我想讀或寫鍵 “foo”，需要連線哪個 IP 地址和埠號？

這個問題可以概括為 **服務發現（service discovery）** ，它不僅限於資料庫。任何可透過網路訪問的軟體都有這個問題，特別是如果它的目標是高可用性（在多臺機器上執行冗餘配置）。許多公司已經編寫了自己的內部服務發現工具，其中許多已經作為開源釋出【30】。

概括來說，這個問題有幾種不同的方案（如 [圖 6-7](/v1/ddia_0607.png) 所示）：

1. 允許客戶聯絡任何節點（例如，透過 **迴圈策略的負載均衡**，即 Round-Robin Load Balancer）。如果該節點恰巧擁有請求的分割槽，則它可以直接處理該請求；否則，它將請求轉發到適當的節點，接收回復並傳遞給客戶端。
2. 首先將所有來自客戶端的請求傳送到路由層，它決定了應該處理請求的節點，並相應地轉發。此路由層本身不處理任何請求；它僅負責分割槽的負載均衡。
3. 要求客戶端知道分割槽和節點的分配。在這種情況下，客戶端可以直接連線到適當的節點，而不需要任何中介。

以上所有情況中的關鍵問題是：作出路由決策的元件（可能是節點之一、路由層或客戶端）如何瞭解分割槽 - 節點之間的分配關係變化？

![](/v1/ddia_0607.png)

**圖 6-7 將請求路由到正確節點的三種不同方式。**

這是一個具有挑戰性的問題，因為重要的是所有參與者都達成共識 - 否則請求將被傳送到錯誤的節點，得不到正確的處理。在分散式系統中有達成共識的協議，但很難正確地實現（見 [第九章](/v1_tw/ch9)）。

許多分散式資料系統都依賴於一個獨立的協調服務，比如 ZooKeeper 來跟蹤叢集元資料，如 [圖 6-8](/v1/ddia_0608.png) 所示。每個節點在 ZooKeeper 中註冊自己，ZooKeeper 維護分割槽到節點的可靠對映。其他參與者（如路由層或分割槽感知客戶端）可以在 ZooKeeper 中訂閱此資訊。只要分割槽分配發生了改變，或者叢集中新增或刪除了一個節點，ZooKeeper 就會通知路由層使路由資訊保持最新狀態。

![](/v1/ddia_0608.png)

**圖 6-8 使用 ZooKeeper 跟蹤分割槽分配給節點。**

例如，LinkedIn 的 Espresso 使用 Helix 【31】進行叢集管理（依靠 ZooKeeper），實現了如 [圖 6-8](/v1/ddia_0608.png) 所示的路由層。HBase、SolrCloud 和 Kafka 也使用 ZooKeeper 來跟蹤分割槽分配。MongoDB 具有類似的體系結構，但它依賴於自己的 **配置伺服器（config server）** 實現和 mongos 守護程序作為路由層。

Cassandra 和 Riak 採取不同的方法：他們在節點之間使用 **流言協議（gossip protocol）** 來傳播叢集狀態的變化。請求可以傳送到任意節點，該節點會轉發到包含所請求的分割槽的適當節點（[圖 6-7](/v1/ddia_0607.png) 中的方法 1）。這個模型在資料庫節點中增加了更多的複雜性，但是避免了對像 ZooKeeper 這樣的外部協調服務的依賴。

Couchbase 不會自動進行再平衡，這簡化了設計。通常情況下，它配置了一個名為 moxi 的路由層，它會從叢集節點了解路由變化【32】。

當使用路由層或向隨機節點發送請求時，客戶端仍然需要找到要連線的 IP 地址。這些地址並不像分割槽的節點分佈變化的那麼快，所以使用 DNS 通常就足夠了。

### 執行並行查詢

到目前為止，我們只關注讀取或寫入單個鍵的非常簡單的查詢（加上基於文件分割槽的次級索引場景下的分散 / 聚集查詢）。這也是大多數 NoSQL 分散式資料儲存所支援的訪問層級。

然而，通常用於分析的 **大規模並行處理（MPP, Massively parallel processing）** 關係型資料庫產品在其支援的查詢型別方面要複雜得多。一個典型的資料倉庫查詢包含多個連線，過濾，分組和聚合操作。MPP 查詢最佳化器將這個複雜的查詢分解成許多執行階段和分割槽，其中許多可以在資料庫叢集的不同節點上並行執行。涉及掃描大規模資料集的查詢特別受益於這種並行執行。

資料倉庫查詢的快速並行執行是一個專門的話題，由於分析有很重要的商業意義，它受到了很多商業上的關注。我們將在 [第十章](/v1_tw/ch10) 討論並行查詢執行的一些技巧。有關並行資料庫中使用的技術的更詳細的概述，請參閱參考文獻【1,33】。

## 本章小結

在本章中，我們探討了將大資料集劃分成更小的子集的不同方法。資料量非常大的時候，在單臺機器上儲存和處理不再可行，而分割槽則十分必要。分割槽的目標是在多臺機器上均勻分佈資料和查詢負載，避免出現熱點（負載不成比例的節點）。這需要選擇適合於你的資料的分割槽方案，並在將節點新增到叢集或從叢集刪除時重新平衡分割槽。

我們討論了兩種主要的分割槽方法：

* 鍵範圍分割槽

  其中鍵是有序的，並且分割槽擁有從某個最小值到某個最大值的所有鍵。排序的優勢在於可以進行有效的範圍查詢，但是如果應用程式經常訪問相鄰的鍵，則存在熱點的風險。

  在這種方法中，當分割槽變得太大時，通常將分割槽分成兩個子分割槽來動態地重新平衡分割槽。

* 雜湊分割槽

  雜湊函式應用於每個鍵，分割槽擁有一定範圍的雜湊。這種方法破壞了鍵的排序，使得範圍查詢效率低下，但可以更均勻地分配負載。

  透過雜湊進行分割槽時，通常先提前建立固定數量的分割槽，為每個節點分配多個分割槽，並在新增或刪除節點時將整個分割槽從一個節點移動到另一個節點。也可以使用動態分割槽。

兩種方法搭配使用也是可行的，例如使用複合主鍵：使用鍵的一部分來標識分割槽，而使用另一部分作為排序順序。

我們還討論了分割槽和次級索引之間的相互作用。次級索引也需要分割槽，有兩種方法：

* 基於文件分割槽（本地索引），其中次級索引儲存在與主鍵和值相同的分割槽中。這意味著只有一個分割槽需要在寫入時更新，但是讀取次級索引需要在所有分割槽之間進行分散 / 聚集。
* 基於關鍵詞分割槽（全域性索引），其中次級索引存在不同的分割槽中。次級索引中的條目可以包括來自主鍵的所有分割槽的記錄。當文件寫入時，需要更新多個分割槽中的次級索引；但是可以從單個分割槽中進行讀取。

最後，我們討論了將查詢路由到適當的分割槽的技術，從簡單的分割槽負載平衡到複雜的並行查詢執行引擎。

按照設計，多數情況下每個分割槽是獨立執行的 — 這就是分割槽資料庫可以伸縮到多臺機器的原因。但是，需要寫入多個分割槽的操作結果可能難以預料：例如，如果寫入一個分割槽成功，但另一個分割槽失敗，會發生什麼情況？我們將在下面的章節中討論這個問題。


## 參考文獻

1. David J. DeWitt and Jim N. Gray: “[Parallel Database Systems: The Future of High Performance Database Systems](http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/dewittgray92.pdf),” *Communications of the ACM*, volume 35, number 6, pages 85–98, June 1992. [doi:10.1145/129888.129894](http://dx.doi.org/10.1145/129888.129894)
1. Lars George: “[HBase vs. BigTable Comparison](http://www.larsgeorge.com/2009/11/hbase-vs-bigtable-comparison.html),” *larsgeorge.com*, November 2009.
1. “[The Apache HBase Reference Guide](https://hbase.apache.org/book/book.html),” Apache Software Foundation, *hbase.apache.org*, 2014.
1. MongoDB, Inc.: “[New Hash-Based Sharding Feature in MongoDB 2.4](https://web.archive.org/web/20230610080235/https://www.mongodb.com/blog/post/new-hash-based-sharding-feature-in-mongodb-24),” *blog.mongodb.org*, April 10, 2013.
1. Ikai Lan: “[App Engine Datastore Tip: Monotonically Increasing Values Are Bad](http://ikaisays.com/2011/01/25/app-engine-datastore-tip-monotonically-increasing-values-are-bad/),” *ikaisays.com*, January 25, 2011.
1. Martin Kleppmann: “[Java's hashCode Is Not Safe for Distributed Systems](http://martin.kleppmann.com/2012/06/18/java-hashcode-unsafe-for-distributed-systems.html),” *martin.kleppmann.com*, June 18, 2012.
1. David Karger, Eric Lehman, Tom Leighton, et al.: “[Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web](https://www.akamai.com/site/en/documents/research-paper/consistent-hashing-and-random-trees-distributed-caching-protocols-for-relieving-hot-spots-on-the-world-wide-web-technical-publication.pdf),” at *29th Annual ACM Symposium on Theory of Computing* (STOC), pages 654–663, 1997. [doi:10.1145/258533.258660](http://dx.doi.org/10.1145/258533.258660)
1. John Lamping and Eric Veach: “[A Fast, Minimal Memory, Consistent Hash Algorithm](http://arxiv.org/pdf/1406.2294.pdf),” *arxiv.org*, June 2014.
1. Eric Redmond: “[A Little Riak Book](https://web.archive.org/web/20160807123307/http://www.littleriakbook.com/),” Version 1.4.0, Basho Technologies, September 2013.
1. “[Couchbase 2.5 Administrator Guide](http://docs.couchbase.com/couchbase-manual-2.5/cb-admin/),” Couchbase, Inc., 2014.
1. Avinash Lakshman and Prashant Malik: “[Cassandra – A Decentralized Structured Storage System](http://www.cs.cornell.edu/Projects/ladis2009/papers/Lakshman-ladis2009.PDF),” at *3rd ACM SIGOPS International Workshop on Large Scale Distributed Systems and Middleware* (LADIS), October 2009.
1. Jonathan Ellis: “[Facebook’s Cassandra Paper, Annotated and Compared to Apache Cassandra 2.0](https://docs.datastax.com/en/articles/cassandra/cassandrathenandnow.html),” *docs.datastax.com*, September 12, 2013.
1. “[Introduction to Cassandra Query Language](https://docs.datastax.com/en/cql-oss/3.1/cql/cql_intro_c.html),” DataStax, Inc., 2014.
1. Samuel Axon: “[3% of Twitter's Servers Dedicated to Justin Bieber](https://web.archive.org/web/20201109041636/https://mashable.com/2010/09/07/justin-bieber-twitter/?europe=true),” *mashable.com*, September 7, 2010.
1. “[Riak KV Docs](https://docs.riak.com/riak/kv/latest/index.html),” *docs.riak.com*.
1. Richard Low: “[The Sweet Spot for Cassandra Secondary Indexing](https://web.archive.org/web/20190831132955/http://www.wentnet.com/blog/?p=77),” *wentnet.com*, October 21, 2013.
1. Zachary Tong: “[Customizing Your Document Routing](https://www.elastic.co/blog/customizing-your-document-routing/),” *elastic.co*, June 3, 2013.
1. “[Apache Solr Reference Guide](https://cwiki.apache.org/confluence/display/solr/Apache+Solr+Reference+Guide),” Apache Software Foundation, 2014.
1. Andrew Pavlo: “[H-Store Frequently Asked Questions](http://hstore.cs.brown.edu/documentation/faq/),” *hstore.cs.brown.edu*, October 2013.
1. “[Amazon DynamoDB Developer Guide](http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/),” Amazon Web Services, Inc., 2014.
1. Rusty Klophaus: “[Difference Between 2I and Search](https://web.archive.org/web/20150926053350/http://lists.basho.com/pipermail/riak-users_lists.basho.com/2011-October/006220.html),” email to *riak-users* mailing list, *lists.basho.com*, October 25, 2011.
1. Donald K. Burleson: “[Object Partitioning in Oracle](http://www.dba-oracle.com/art_partit.htm),” *dba-oracle.com*, November 8, 2000.
1. Eric Evans: “[Rethinking Topology in Cassandra](http://www.slideshare.net/jericevans/virtual-nodes-rethinking-topology-in-cassandra),” at *ApacheCon Europe*, November 2012.
1. Rafał Kuć: “[Reroute API Explained](https://web.archive.org/web/20190706215750/http://elasticsearchserverbook.com/reroute-api-explained/),” *elasticsearchserverbook.com*, September 30, 2013.
1. “[Project Voldemort Documentation](https://web.archive.org/web/20250107145644/http://www.project-voldemort.com/voldemort/),” *project-voldemort.com*.
1. Enis Soztutar: “[Apache HBase Region Splitting and Merging](http://hortonworks.com/blog/apache-hbase-region-splitting-and-merging/),” *hortonworks.com*, February 1, 2013.
1. Brandon Williams: “[Virtual Nodes in Cassandra 1.2](http://www.datastax.com/dev/blog/virtual-nodes-in-cassandra-1-2),” *datastax.com*, December 4, 2012.
1. Richard Jones: “[libketama: Consistent Hashing Library for Memcached Clients](https://www.metabrew.com/article/libketama-consistent-hashing-algo-memcached-clients),” *metabrew.com*, April 10, 2007.
1. Branimir Lambov: “[New Token Allocation Algorithm in Cassandra 3.0](http://www.datastax.com/dev/blog/token-allocation-algorithm),” *datastax.com*, January 28, 2016.
1. Jason Wilder: “[Open-Source Service Discovery](http://jasonwilder.com/blog/2014/02/04/service-discovery-in-the-cloud/),” *jasonwilder.com*, February 2014.
1. Kishore Gopalakrishna, Shi Lu, Zhen Zhang, et al.: “[Untangling Cluster Management with Helix](http://www.socc2012.org/helix_onecol.pdf?attredirects=0),” at *ACM Symposium on Cloud Computing* (SoCC), October 2012. [doi:10.1145/2391229.2391248](http://dx.doi.org/10.1145/2391229.2391248)
1. “[Moxi 1.8 Manual](http://docs.couchbase.com/moxi-manual-1.8/),” Couchbase, Inc., 2014.
1. Shivnath Babu and Herodotos Herodotou: “[Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf),” *Foundations and Trends in Databases*, volume 5, number 1, pages 1–104, November 2013. [doi:10.1561/1900000036](http://dx.doi.org/10.1561/1900000036)

================================================
FILE: content/v1_tw/ch7.md
================================================
---
title: "第七章：事務"
linkTitle: "7. 事務"
weight: 207
breadcrumbs: false
---

![](/map/ch07.png)

> 一些作者聲稱，支援通用的兩階段提交代價太大，會帶來效能與可用性的問題。讓程式設計師來處理過度使用事務導致的效能問題，總比缺少事務程式設計好得多。
>
> —— James Corbett 等人，Spanner：Google 的全球分散式資料庫（2012）

在資料系統的殘酷現實中，很多事情都可能出錯：

- 資料庫軟體、硬體可能在任意時刻發生故障（包括寫操作進行到一半時）。
- 應用程式可能在任意時刻崩潰（包括一系列操作的中間）。
- 網路中斷可能會意外切斷資料庫與應用的連線，或資料庫之間的連線。
- 多個客戶端可能會同時寫入資料庫，覆蓋彼此的更改。
- 客戶端可能讀取到無意義的資料，因為資料只更新了一部分。
- 客戶端之間的競爭條件可能導致令人驚訝的錯誤。

為了實現可靠性，系統必須處理這些故障，確保它們不會導致整個系統的災難性故障。但是實現容錯機制工作量巨大。需要仔細考慮所有可能出錯的事情，並進行大量的測試，以確保解決方案真正管用。

數十年來，**事務（transaction）** 一直是簡化這些問題的首選機制。事務是應用程式將多個讀寫操作組合成一個邏輯單元的一種方式。從概念上講，事務中的所有讀寫操作被視作單個操作來執行：整個事務要麼成功 **提交**（commit），要麼失敗 **中止**（abort）或 **回滾**（rollback）。如果失敗，應用程式可以安全地重試。對於事務來說，應用程式的錯誤處理變得簡單多了，因為它不用再擔心部分失敗的情況了，即某些操作成功，某些失敗（無論出於何種原因）。

和事務打交道時間長了，你可能會覺得它顯而易見。但我們不應將其視為理所當然。事務不是天然存在的；它們是為了 **簡化應用程式設計模型** 而建立的。透過使用事務，應用程式可以自由地忽略某些潛在的錯誤情況和併發問題，因為資料庫會替應用處理好這些。（我們稱之為 **安全保證**，即 safety guarantees）。

並不是所有的應用都需要事務，有時候弱化事務保證、或完全放棄事務也是有好處的（例如，為了獲得更高效能或更高可用性）。一些安全屬性也可以在沒有事務的情況下實現。

怎樣知道你是否需要事務？為了回答這個問題，首先需要確切理解事務可以提供的安全保障，以及它們的代價。儘管乍看事務似乎很簡單，但實際上有許多微妙但重要的細節在起作用。

本章將研究許多出錯案例，並探索資料庫用於防範這些問題的演算法。尤其會深入 **併發控制** 的領域，討論各種可能發生的競爭條件，以及資料庫如何實現 **讀已提交（read committed）**，**快照隔離（snapshot isolation）** 和 **可序列化（serializability）** 等隔離級別。

本章同時適用於單機資料庫與分散式資料庫；在 [第八章](/v1_tw/ch8) 中將重點討論僅出現在分散式系統中的特殊挑戰。


## 事務的棘手概念

現今，幾乎所有的關係型資料庫和一些非關係資料庫都支援 **事務**。其中大多數遵循 IBM System R（第一個 SQL 資料庫）在 1975 年引入的風格【1,2,3】。40 年裡，儘管一些實現細節發生了變化，但總體思路大同小異：MySQL、PostgreSQL、Oracle 和 SQL Server 等資料庫中的事務支援與 System R 異乎尋常地相似。

2000 年以後，非關係（NoSQL）資料庫開始普及。它們的目標是在關係資料庫的現狀基礎上，透過提供新的資料模型選擇（請參閱 [第二章](/v1_tw/ch2)）並預設包含複製（第五章）和分割槽（第六章）來進一步提升。事務是這次運動的主要犧牲品：這些新一代資料庫中的許多資料庫完全放棄了事務，或者重新定義了這個詞，描述比以前所理解的更弱得多的一套保證【4】。

隨著這種新型分散式資料庫的炒作，人們普遍認為事務是可伸縮性的對立面，任何大型系統都必須放棄事務以保持良好的效能和高可用性【5,6】。另一方面，資料庫廠商有時將事務保證作為 “重要應用” 和 “有價值資料” 的基本要求。這兩種觀點都是 **純粹的誇張**。

事實並非如此簡單：與其他技術設計選擇一樣，事務有其優勢和侷限性。為了理解這些權衡，讓我們瞭解事務所提供保證的細節 —— 無論是在正常執行中還是在各種極端（但是現實存在）的情況下。

### ACID的含義

事務所提供的安全保證，通常由眾所周知的首字母縮略詞 ACID 來描述，ACID 代表 **原子性（Atomicity）**，**一致性（Consistency）**，**隔離性（Isolation）** 和 **永續性（Durability）**。它由 Theo Härder 和 Andreas Reuter 於 1983 年提出，旨在為資料庫中的容錯機制建立精確的術語。

但實際上，不同資料庫的 ACID 實現並不相同。例如，我們將會看到，關於 **隔離性** 的含義就有許多含糊不清【8】。高層次上的想法很美好，但魔鬼隱藏在細節裡。今天，當一個系統聲稱自己 “符合 ACID” 時，實際上能期待的是什麼保證並不清楚。不幸的是，ACID 現在幾乎已經變成了一個營銷術語。

（不符合 ACID 標準的系統有時被稱為 BASE，它代表 **基本可用性（Basically Available）**，**軟狀態（Soft State）** 和 **最終一致性（Eventual consistency）**【9】，這比 ACID 的定義更加模糊，似乎 BASE 的唯一合理的定義是 “不是 ACID”，即它幾乎可以代表任何你想要的東西。）

讓我們深入瞭解原子性，一致性，隔離性和永續性的定義，這可以讓我們提煉出事務的思想。

#### 原子性

一般來說，原子是指不能分解成小部分的東西。這個詞在計算機的不同領域中意味著相似但又微妙不同的東西。例如，在多執行緒程式設計中，如果一個執行緒執行一個原子操作，這意味著另一個執行緒無法看到該操作的一半結果。系統只能處於操作之前或操作之後的狀態，而不是介於兩者之間的狀態。

相比之下，ACID 的原子性並 **不** 是關於 **併發（concurrent）** 的。它並不是在描述如果幾個程序試圖同時訪問相同的資料會發生什麼情況，這種情況包含在 [**隔離性**](#隔離性) 中。

ACID 的原子性描述了當客戶想進行多次寫入，但在一些寫操作處理完之後出現故障的情況。例如程序崩潰，網路連線中斷，磁碟變滿或者某種完整性約束被違反。如果這些寫操作被分組到一個原子事務中，並且該事務由於錯誤而不能完成（提交），則該事務將被中止，並且資料庫必須丟棄或撤消該事務中迄今為止所做的任何寫入。

如果沒有原子性，在多處更改進行到一半時發生錯誤，很難知道哪些更改已經生效，哪些沒有生效。該應用程式可以再試一次，但冒著進行兩次相同變更的風險，可能會導致資料重複或錯誤的資料。原子性簡化了這個問題：如果事務被 **中止（abort）**，應用程式可以確定它沒有改變任何東西，所以可以安全地重試。

ACID 原子性的定義特徵是：**能夠在錯誤時中止事務，丟棄該事務進行的所有寫入變更的能力。** 或許 **可中止性（abortability）** 是更好的術語，但本書將繼續使用原子性，因為這是慣用詞。

#### 一致性

一致性這個詞被賦予太多含義：

* 在 [第五章](/v1_tw/ch5) 中，我們討論了副本一致性，以及非同步複製系統中的最終一致性問題（請參閱 “[複製延遲問題](/v1_tw/ch5#複製延遲問題)”）。
* [一致性雜湊](/v1_tw/ch6#一致性雜湊) 是某些系統用於重新分割槽的一種分割槽方法。
* 在 [CAP 定理](/v1_tw/ch9#CAP定理) 中，一致性一詞用於表示 [線性一致性](/v1_tw/ch9#線性一致性)。
* 在 ACID 的上下文中，**一致性** 是指資料庫在應用程式的特定概念中處於 “良好狀態”。

很不幸，這一個詞就至少有四種不同的含義。

ACID 一致性的概念是，**對資料的一組特定約束必須始終成立**，即 **不變式（invariants）**。例如，在會計系統中，所有賬戶整體上必須借貸相抵。如果一個事務開始於一個滿足這些不變式的有效資料庫，且在事務處理期間的任何寫入操作都保持這種有效性，那麼可以確定，不變式總是滿足的。

但是，一致性的這種概念取決於應用程式對不變式的理解，應用程式負責正確定義它的事務，並保持一致性。這並不是資料庫可以保證的事情：如果你寫入違反不變式的髒資料，資料庫也無法阻止你（一些特定型別的不變式可以由資料庫檢查，例如外部索引鍵約束或唯一約束，但是一般來說，是應用程式來定義什麼樣的資料是有效的，什麼樣是無效的 —— 資料庫只管儲存）。

原子性、隔離性和永續性是資料庫的屬性，而一致性（在 ACID 意義上）是應用程式的屬性。應用可能依賴資料庫的原子性和隔離性來實現一致性，但這並不僅取決於資料庫。因此，字母 C 不屬於 ACID [^i]。

[^i]: 喬・海勒斯坦（Joe Hellerstein）指出，在 Härder 與 Reuter 的論文中，“ACID 中的 C” 是被 “扔進去湊縮寫單詞的”【7】，而且那時候大家都不怎麼在乎一致性。

#### 隔離性

大多數資料庫都會同時被多個客戶端訪問。如果它們各自讀寫資料庫的不同部分，這是沒有問題的，但是如果它們訪問相同的資料庫記錄，則可能會遇到 **併發** 問題（**競爭條件**，即 race conditions）。

[圖 7-1](/v1/ddia_0701.png) 是這類問題的一個簡單例子。假設你有兩個客戶端同時在資料庫中增長一個計數器。（假設資料庫沒有內建的自增操作）每個客戶端需要讀取計數器的當前值，加 1 ，再回寫新值。[圖 7-1](/v1/ddia_0701.png) 中，因為發生了兩次增長，計數器應該從 42 增至 44；但由於競態條件，實際上只增至 43 。

ACID 意義上的隔離性意味著，**同時執行的事務是相互隔離的**：它們不能相互干擾。傳統的資料庫教科書將隔離性形式化為 **可序列化（Serializability）**，這意味著每個事務可以假裝它是唯一在整個資料庫上執行的事務。資料庫確保當多個事務被提交時，結果與它們序列執行（一個接一個）是一樣的，儘管實際上它們可能是併發執行的【10】。

![](/v1/ddia_0701.png)

**圖 7-1 兩個客戶端同時遞增計數器時的競爭條件**

然而實踐中很少會使用可序列的隔離，因為它有效能損失。一些流行的資料庫如 Oracle 11g，甚至沒有實現它。在 Oracle 中有一個名為 “可序列的” 隔離級別，但實際上它實現了一種叫做 **快照隔離（snapshot isolation）** 的功能，**這是一種比可序列化更弱的保證**【8,11】。我們將在 “[弱隔離級別](#弱隔離級別)” 中研究快照隔離和其他形式的隔離。

#### 永續性

資料庫系統的目的是，提供一個安全的地方儲存資料，而不用擔心丟失。**永續性** 是一個承諾，即一旦事務成功完成，即使發生硬體故障或資料庫崩潰，寫入的任何資料也不會丟失。

在單節點資料庫中，永續性通常意味著資料已被寫入非易失性儲存裝置，如硬碟或 SSD。它通常還包括預寫日誌或類似的檔案（請參閱 “[讓 B 樹更可靠](/v1_tw/ch3#讓B樹更可靠)”），以便在磁碟上的資料結構損壞時進行恢復。在帶複製的資料庫中，永續性可能意味著資料已成功複製到一些節點。為了提供永續性保證，資料庫必須等到這些寫入或複製完成後，才能報告事務成功提交。

如 “[可靠性](/v1_tw/ch1#可靠性)” 一節所述，**完美的永續性是不存在的** ：如果所有硬碟和所有備份同時被銷燬，那顯然沒有任何資料庫能救得了你。

> #### 複製與永續性
>
> 在歷史上，永續性意味著寫入歸檔磁帶。後來它被理解為寫入磁碟或 SSD。再後來它又有了新的內涵即 “複製（replication）”。哪種實現更好一些？
>
> 真相是，沒有什麼是完美的：
>
> * 如果你寫入磁碟然後機器宕機，即使資料沒有丟失，在修復機器或將磁碟轉移到其他機器之前，也是無法訪問的。這種情況下，複製系統可以保持可用性。
> * 一個相關性故障（停電，或一個特定輸入導致所有節點崩潰的 Bug）可能會一次性摧毀所有副本（請參閱「[可靠性](/v1_tw/ch1#可靠性)」），任何僅儲存在記憶體中的資料都會丟失，故記憶體資料庫仍然要和磁碟寫入打交道。
> * 在非同步複製系統中，當主庫不可用時，最近的寫入操作可能會丟失（請參閱「[處理節點宕機](/v1_tw/ch5#處理節點宕機)」）。
> * 當電源突然斷電時，特別是固態硬碟，有證據顯示有時會違反應有的保證：甚至 fsync 也不能保證正常工作【12】。硬碟韌體可能有錯誤，就像任何其他型別的軟體一樣【13,14】。
> * 儲存引擎和檔案系統之間的微妙互動可能會導致難以追蹤的錯誤，並可能導致磁碟上的檔案在崩潰後被損壞【15,16】。
> * 磁碟上的資料可能會在沒有檢測到的情況下逐漸損壞【17】。如果資料已損壞一段時間，副本和最近的備份也可能損壞。這種情況下，需要嘗試從歷史備份中恢復資料。
> * 一項關於固態硬碟的研究發現，在執行的前四年中，30% 到 80% 的硬碟會產生至少一個壞塊【18】。相比固態硬碟，磁碟的壞道率較低，但完全失效的機率更高。
> * 如果 SSD 斷電，可能會在幾周內開始丟失資料，具體取決於溫度【19】。
>
> 在實踐中，沒有一種技術可以提供絕對保證。只有各種降低風險的技術，包括寫入磁碟，複製到遠端機器和備份 —— 它們可以且應該一起使用。與往常一樣，最好抱著懷疑的態度接受任何理論上的 “保證”。

### 單物件和多物件操作

回顧一下，在 ACID 中，原子性和隔離性描述了客戶端在同一事務中執行多次寫入時，資料庫應該做的事情：

原子性
: 如果在一系列寫操作的中途發生錯誤，則應中止事務處理，並丟棄當前事務的所有寫入。換句話說，資料庫免去了使用者對部分失敗的擔憂 —— 透過提供 “**寧為玉碎，不為瓦全（all-or-nothing）**” 的保證。

隔離性
: 同時執行的事務不應該互相干擾。例如，如果一個事務進行多次寫入，則另一個事務要麼看到全部寫入結果，要麼什麼都看不到，但不應該是一些子集。

這些定義假設你想同時修改多個物件（行，文件，記錄）。通常需要 **多物件事務（multi-object transaction）** 來保持多塊資料同步。[圖 7-2](/v1/ddia_0702.png) 展示了一個來自電郵應用的例子。執行以下查詢來顯示使用者未讀郵件數量：

```sql
SELECT COUNT(*) FROM emails WHERE recipient_id = 2 AND unread_flag = true
```

但如果郵件太多，你可能會覺得這個查詢太慢，並決定用單獨的欄位儲存未讀郵件的數量（一種反正規化）。現在每當一個新訊息寫入時，必須也增長未讀計數器，每當一個訊息被標記為已讀時，也必須減少未讀計數器。

在 [圖 7-2](/v1/ddia_0702.png) 中，使用者 2 遇到異常情況：郵件列表裡顯示有未讀訊息，但計數器顯示為零未讀訊息，因為計數器增長還沒有發生 [^ii]。隔離性可以避免這個問題：透過確保使用者 2 要麼同時看到新郵件和增長後的計數器，要麼都看不到，而不是一個前後矛盾的中間結果。

[^ii]: 可以說郵件應用中的錯誤計數器並不是什麼特別重要的問題。但換種方式來看，你可以把未讀計數器換成客戶賬戶餘額，把郵件收發看成支付交易。

![](/v1/ddia_0702.png)

**圖 7-2 違反隔離性：一個事務讀取另一個事務的未提交的寫入（“髒讀”）。**

[圖 7-3](/v1/ddia_0703.png) 說明了對原子性的需求：如果在事務過程中發生錯誤，郵箱和未讀計數器的內容可能會失去同步。在原子事務中，如果對計數器的更新失敗，事務將被中止，並且插入的電子郵件將被回滾。

![](/v1/ddia_0703.png)

**圖 7-3 原子性確保發生錯誤時，事務先前的任何寫入都會被撤消，以避免狀態不一致**

多物件事務需要某種方式來確定哪些讀寫操作屬於同一個事務。在關係型資料庫中，通常基於客戶端與資料庫伺服器的 TCP 連線：在任何特定連線上，`BEGIN TRANSACTION` 和 `COMMIT` 語句之間的所有內容，被認為是同一事務的一部分 [^iii]。

[^iii]: 這並不完美。如果 TCP 連線中斷，則事務必須中止。如果中斷發生在客戶端請求提交之後，但在伺服器確認提交發生之前，客戶端並不知道事務是否已提交。為了解決這個問題，事務管理器可以透過一個唯一事務識別符號來對操作進行分組，這個識別符號並未繫結到特定 TCP 連線。後續在 “[資料庫的端到端原則](/v1_tw/ch12#資料庫的端到端原則)” 一節將回到這個主題。

另一方面，許多非關係資料庫並沒有將這些操作組合在一起的方法。即使存在多物件 API（例如，某鍵值儲存可能具有在一個操作中更新幾個鍵的 multi-put 操作），但這並不一定意味著它具有事務語義：該命令可能在一些鍵上成功，在其他的鍵上失敗，使資料庫處於部分更新的狀態。

#### 單物件寫入

當單個物件發生改變時，原子性和隔離性也是適用的。例如，假設你正在向資料庫寫入一個 20 KB 的 JSON 文件：

- 如果在傳送第一個 10 KB 之後網路連線中斷，資料庫是否儲存了不可解析的 10KB JSON 片段？
- 如果在資料庫正在覆蓋磁碟上的前一個值的過程中電源發生故障，是否最終將新舊值拼接在一起？
- 如果另一個客戶端在寫入過程中讀取該文件，是否會看到部分更新的值？

這些問題非常讓人頭大，故儲存引擎一個幾乎普遍的目標是：對單節點上的單個物件（例如鍵值對）上提供原子性和隔離性。原子性可以透過使用日誌來實現崩潰恢復（請參閱 “[讓 B 樹更可靠](/v1_tw/ch3#讓B樹更可靠)”），並且可以使用每個物件上的鎖來實現隔離（每次只允許一個執行緒訪問物件） 。

一些資料庫也提供更複雜的原子操作 [^iv]，例如自增操作，這樣就不再需要像 [圖 7-1](/v1/ddia_0701.png) 那樣的讀取 - 修改 - 寫入序列了。同樣流行的是 **[比較和設定（CAS, compare-and-set）](#比較並設定（CAS）)** 操作，僅當值沒有被其他併發修改過時，才允許執行寫操作。

[^iv]: 嚴格地說，**原子自增（atomic increment）** 這個術語在多執行緒程式設計的意義上使用了原子這個詞。在 ACID 的情況下，它實際上應該被稱為 **隔離的（isolated）** 或 **可序列的（serializable）** 增量。但這就太吹毛求疵了。

這些單物件操作很有用，因為它們可以防止在多個客戶端嘗試同時寫入同一個物件時丟失更新（請參閱 “[防止丟失更新](#防止丟失更新)”）。但它們不是通常意義上的事務。CAS 以及其他單一物件操作被稱為 “輕量級事務”，甚至出於營銷目的被稱為 “ACID”【20,21,22】，但是這個術語是誤導性的。事務通常被理解為，**將多個物件上的多個操作合併為一個執行單元的機制**。

#### 多物件事務的需求

許多分散式資料儲存已經放棄了多物件事務，因為多物件事務很難跨分割槽實現，而且在需要高可用性或高效能的情況下，它們可能會礙事。但說到底，在分散式資料庫中實現事務，並沒有什麼根本性的障礙。[第九章](/v1_tw/ch9) 將討論分散式事務的實現。

但是我們是否需要多物件事務？**是否有可能只用鍵值資料模型和單物件操作來實現任何應用程式？**

有一些場景中，單物件插入，更新和刪除是足夠的。但是許多其他場景需要協調寫入幾個不同的物件：

* 在關係資料模型中，一個表中的行通常具有對另一個表中的行的外部索引鍵引用。（類似的是，在一個圖資料模型中，一個頂點有著到其他頂點的邊）。多物件事務使你確保這些引用始終有效：當插入幾個相互引用的記錄時，外部索引鍵必須是正確的和最新的，不然資料就沒有意義。
* 在文件資料模型中，需要一起更新的欄位通常在同一個文件中，這被視為單個物件 —— 更新單個文件時不需要多物件事務。但是，缺乏連線功能的文件資料庫會鼓勵反正規化（請參閱 “[關係型資料庫與文件資料庫在今日的對比](/v1_tw/ch2#關係型資料庫與文件資料庫在今日的對比)”）。當需要更新反正規化的資訊時，如 [圖 7-2](/v1/ddia_0702.png) 所示，需要一次更新多個文件。事務在這種情況下非常有用，可以防止反正規化的資料不同步。
* 在具有次級索引的資料庫中（除了純粹的鍵值儲存以外幾乎都有），每次更改值時都需要更新索引。從事務角度來看，這些索引是不同的資料庫物件：例如，如果沒有事務隔離性，記錄可能出現在一個索引中，但沒有出現在另一個索引中，因為第二個索引的更新還沒有發生。

這些應用仍然可以在沒有事務的情況下實現。然而，**沒有原子性，錯誤處理就要複雜得多，缺乏隔離性，就會導致併發問題**。我們將在 “[弱隔離級別](#弱隔離級別)” 中討論這些問題，並在 [第十二章](/v1_tw/ch12) 中探討其他方法。

#### 處理錯誤和中止

事務的一個關鍵特性是，如果發生錯誤，它可以中止並安全地重試。ACID 資料庫基於這樣的哲學：如果資料庫有違反其原子性，隔離性或永續性的危險，則寧願完全放棄事務，而不是留下半成品。

然而並不是所有的系統都遵循這個哲學。特別是具有 [無主複製](/v1_tw/ch5#無主複製) 的資料儲存，主要是在 “盡力而為” 的基礎上進行工作。可以概括為 “資料庫將做盡可能多的事，執行遇到錯誤時，它不會撤消它已經完成的事情” —— 所以，從錯誤中恢復是應用程式的責任。

錯誤發生不可避免，但許多軟體開發人員傾向於只考慮樂觀情況，而不是錯誤處理的複雜性。例如，像 Rails 的 ActiveRecord 和 Django 這樣的 **物件關係對映（ORM, object-relational mapping）** 框架不會重試中斷的事務 —— 這個錯誤通常會導致一個從堆疊向上傳播的異常，所以任何使用者輸入都會被丟棄，使用者拿到一個錯誤資訊。這實在是太可惜了，因為中止的重點就是允許安全的重試。

儘管重試一個中止的事務是一個簡單而有效的錯誤處理機制，但它並不完美：

- 如果事務實際上成功了，但是在伺服器試圖向客戶端確認提交成功時網路發生故障（所以客戶端認為提交失敗了），那麼重試事務會導致事務被執行兩次 —— 除非你有一個額外的應用級去重機制。
- 如果錯誤是由於負載過大造成的，則重試事務將使問題變得更糟，而不是更好。為了避免這種正反饋迴圈，可以限制重試次數，使用指數退避演算法，並單獨處理與過載相關的錯誤（如果允許）。
- 僅在臨時性錯誤（例如，由於死鎖，異常情況，臨時性網路中斷和故障切換）後才值得重試。在發生永久性錯誤（例如，違反約束）之後重試是毫無意義的。
- 如果事務在資料庫之外也有副作用，即使事務被中止，也可能發生這些副作用。例如，如果你正在傳送電子郵件，那你肯定不希望每次重試事務時都重新發送電子郵件。如果你想確保幾個不同的系統一起提交或放棄，**兩階段提交（2PC, two-phase commit）** 可以提供幫助（“[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)” 中將討論這個問題）。
- 如果客戶端程序在重試中失效，任何試圖寫入資料庫的資料都將丟失。

## 弱隔離級別

如果兩個事務不觸及相同的資料，它們可以安全地 **並行（parallel）** 執行，因為兩者都不依賴於另一個。當一個事務讀取由另一個事務同時修改的資料時，或者當兩個事務試圖同時修改相同的資料時，併發問題（競爭條件）才會出現。

併發 BUG 很難透過測試找到，因為這樣的錯誤只有在特殊時序下才會觸發。這樣的時序問題可能非常少發生，通常很難重現 [^譯註i]。併發性也很難推理，特別是在大型應用中，你不一定知道哪些其他程式碼正在訪問資料庫。在一次只有一個使用者時，應用開發已經很麻煩了，有許多併發使用者使得它更加困難，因為任何一個數據都可能隨時改變。

[^譯註i]: 軼事：偶然出現的瞬時錯誤有時稱為 ***Heisenbug***，而確定性的問題對應地稱為 ***Bohrbugs***

出於這個原因，資料庫一直試圖透過提供 **事務隔離（transaction isolation）** 來向應用程式開發者隱藏併發問題。從理論上講，隔離可以透過假裝沒有併發發生，讓你的生活更加輕鬆：**可序列的（serializable）** 隔離等級意味著資料庫保證事務的效果如同序列執行（即一次一個，沒有任何併發）。

實際上不幸的是：隔離並沒有那麼簡單。**可序列的隔離** 會有效能損失，許多資料庫不願意支付這個代價【8】。因此，系統通常使用較弱的隔離級別來防止一部分，而不是全部的併發問題。這些隔離級別難以理解，並且會導致微妙的錯誤，但是它們仍然在實踐中被使用【23】。

弱事務隔離級別導致的併發性錯誤不僅僅是一個理論問題。它們造成了很多的資金損失【24,25】，引發了財務審計人員的調查【26】，並導致客戶資料被破壞【27】。關於這類問題的一個流行的評論是 “如果你正在處理財務資料，請使用 ACID 資料庫！” —— 但是這並沒有說到重點。即使是很多流行的關係型資料庫系統（通常被認為是 “ACID”）也使用弱隔離級別，所以它們也不一定能防止這些錯誤的發生。

比起盲目地依賴工具，我們需要對存在的各種併發問題，以及如何防止這些問題有深入的理解。然後就可以使用我們所掌握的工具來構建可靠和正確的應用程式。

在本節中，我們將看幾個在實踐中使用的弱（**非序列的**，即 nonserializable）隔離級別，並詳細討論哪種競爭條件可能發生也可能不發生，以便你可以決定什麼級別適合你的應用程式。一旦我們完成了這個工作，我們將詳細討論可序列化（請參閱 “[可序列化](#可序列化)”）。我們討論的隔離級別將是非正式的，透過示例來進行。如果你需要嚴格的定義和分析它們的屬性，你可以在學術文獻中找到它們【28,29,30】。

### 讀已提交

最基本的事務隔離級別是 **讀已提交（Read Committed）**[^v]，它提供了兩個保證：

1. 從資料庫讀時，只能看到已提交的資料（沒有 **髒讀**，即 dirty reads）。
2. 寫入資料庫時，只會覆蓋已提交的資料（沒有 **髒寫**，即 dirty writes）。

我們來更詳細地討論這兩個保證。

[^v]: 某些資料庫支援甚至更弱的隔離級別，稱為 **讀未提交（Read uncommitted）**。它可以防止髒寫，但不防止髒讀。

#### 沒有髒讀

設想一個事務已經將一些資料寫入資料庫，但事務還沒有提交或中止。另一個事務可以看到未提交的資料嗎？如果是的話，那就叫做 **髒讀（dirty reads）**【2】。

在 **讀已提交** 隔離級別執行的事務必須防止髒讀。這意味著事務的任何寫入操作只有在該事務提交時才能被其他人看到（然後所有的寫入操作都會立即變得可見）。如 [圖 7-4](/v1/ddia_0704.png) 所示，使用者 1 設定了 `x = 3`，但使用者 2 的 `get x` 仍舊返回舊值 2 （當用戶 1 尚未提交時）。

![](/v1/ddia_0704.png)

**圖 7-4 沒有髒讀：使用者 2 只有在使用者 1 的事務已經提交後才能看到 x 的新值。**

為什麼要防止髒讀，有幾個原因：

- 如果事務需要更新多個物件，髒讀取意味著另一個事務可能會只看到一部分更新。例如，在 [圖 7-2](/v1/ddia_0702.png) 中，使用者看到新的未讀電子郵件，但看不到更新的計數器。這就是電子郵件的髒讀。看到處於部分更新狀態的資料庫會讓使用者感到困惑，並可能導致其他事務做出錯誤的決定。
- 如果事務中止，則所有寫入操作都需要回滾（如 [圖 7-3](/v1/ddia_0703.png) 所示）。如果資料庫允許髒讀，那就意味著一個事務可能會看到稍後需要回滾的資料，即從未實際提交給資料庫的資料。想想後果就讓人頭大。

#### 沒有髒寫

如果兩個事務同時嘗試更新資料庫中的相同物件，會發生什麼情況？我們不知道寫入的順序是怎樣的，但是我們通常認為後面的寫入會覆蓋前面的寫入。

但是，如果先前的寫入是尚未提交事務的一部分，使得後面的寫入覆蓋了一個尚未提交的值，這時會發生什麼呢？這被稱作 **髒寫（dirty write）**【28】。在 **讀已提交** 的隔離級別上執行的事務必須防止髒寫，通常是延遲第二次寫入，直到第一次寫入事務提交或中止為止。

透過防止髒寫，這個隔離級別避免了一些併發問題：

- 如果事務更新多個物件，髒寫會導致不好的結果。例如，考慮 [圖 7-5](/v1/ddia_0705.png)，以一個二手車銷售網站為例，Alice 和 Bob 兩個人同時試圖購買同一輛車。購買汽車需要兩次資料庫寫入：網站上的商品列表需要更新，以反映買家的購買，銷售發票需要傳送給買家。在 [圖 7-5](/v1/ddia_0705.png) 的情況下，銷售是屬於 Bob 的（因為他成功更新了商品列表），但發票卻寄送給了 Alice（因為她成功更新了發票表）。讀已提交會防止這樣的事故。
- 但是，讀已提交並不能防止 [圖 7-1](/v1/ddia_0701.png) 中兩個計數器增量之間的競爭狀態。在這種情況下，第二次寫入發生在第一個事務提交後，所以它不是一個髒寫。這仍然是不正確的，但是出於不同的原因，在 “[防止丟失更新](#防止丟失更新)” 中將討論如何使這種計數器增量安全。

![](/v1/ddia_0705.png)

**圖 7-5 如果存在髒寫，來自不同事務的衝突寫入可能會混淆在一起**

#### 實現讀已提交

**讀已提交** 是一個非常流行的隔離級別。這是 Oracle 11g、PostgreSQL、SQL Server 2012、MemSQL 和其他許多資料庫的預設設定【8】。

最常見的情況是，資料庫透過使用 **行鎖（row-level lock）** 來防止髒寫：當事務想要修改特定物件（行或文件）時，它必須首先獲得該物件的鎖。然後必須持有該鎖直到事務被提交或中止。一次只有一個事務可持有任何給定物件的鎖；如果另一個事務要寫入同一個物件，則必須等到第一個事務提交或中止後，才能獲取該鎖並繼續。這種鎖定是讀已提交模式（或更強的隔離級別）的資料庫自動完成的。

如何防止髒讀？一種選擇是使用相同的鎖，並要求任何想要讀取物件的事務來簡單地獲取該鎖，然後在讀取之後立即再次釋放該鎖。這將確保在物件具有髒的、未提交的值時不會發生讀取（因為在此期間，鎖將由進行寫入的事務持有）。

但是要求讀鎖的辦法在實踐中效果並不好。因為一個長時間執行的寫入事務會迫使許多只讀事務等到這個慢寫入事務完成。這會影響只讀事務的響應時間，並且不利於可操作性：因為等待鎖，應用某個部分的遲緩可能由於連鎖效應，導致其他部分出現問題。

出於這個原因，大多數資料庫 [^vi] 使用 [圖 7-4](/v1/ddia_0704.png) 的方式防止髒讀：對於寫入的每個物件，資料庫都會記住舊的已提交值，和由當前持有寫入鎖的事務設定的新值。當事務正在進行時，任何其他讀取物件的事務都會拿到舊值。只有當新值提交後，事務才會切換到讀取新值。

[^vi]: 在撰寫本文時，唯一在讀已提交隔離級別使用讀鎖的主流資料庫是 IBM DB2 和使用 `read_committed_snapshot = off` 配置的 Microsoft SQL Server【23,36】。

### 快照隔離和可重複讀

如果只從表面上看讀已提交隔離級別，你可能就認為它完成了事務所需的一切，這是情有可原的。它允許 **中止**（原子性的要求）；它防止讀取不完整的事務結果，並且防止併發寫入造成的混亂。事實上這些功能非常有用，比起沒有事務的系統來，可以提供更多的保證。

但是在使用此隔離級別時，仍然有很多地方可能會產生併發錯誤。例如 [圖 7-6](/v1/ddia_0706.png) 說明了讀已提交時可能發生的問題。

![](/v1/ddia_0706.png)

**圖 7-6 讀取偏差：Alice 觀察資料庫處於不一致的狀態**

Alice 在銀行有 1000 美元的儲蓄，分為兩個賬戶，每個 500 美元。現在有一筆事務從她的一個賬戶轉移了 100 美元到另一個賬戶。如果她非常不幸地在事務處理的過程中檢視其賬戶餘額列表，她可能會在收到付款之前先看到一個賬戶的餘額（收款賬戶，餘額仍為 500 美元），在發出轉賬之後再看到另一個賬戶的餘額（付款賬戶，新的餘額為 400 美元）。對 Alice 來說，現在她的賬戶似乎總共只有 900 美元 —— 看起來有 100 美元已經憑空消失了。

這種異常被稱為 **不可重複讀（nonrepeatable read）** 或 **讀取偏差（read skew）**：如果 Alice 在事務結束時再次讀取賬戶 1 的餘額，她將看到與她之前的查詢中看到的不同的值（600 美元）。在讀已提交的隔離條件下，**不可重複讀** 被認為是可接受的：Alice 看到的帳戶餘額確實在讀取時已經提交了。

> 不幸的是，術語 **偏差（skew）** 這個詞是過載的：以前使用它是因為熱點的不平衡工作量（請參閱 “[負載偏斜與熱點消除](/v1_tw/ch6#負載偏斜與熱點消除)”），而這裡偏差意味著異常的時序。

對於 Alice 的情況，這不是一個長期持續的問題。因為如果她幾秒鐘後重新整理銀行網站的頁面，她很可能會看到一致的帳戶餘額。但是有些情況下，不能容忍這種暫時的不一致：

備份
: 進行備份需要複製整個資料庫，對大型資料庫而言可能需要花費數小時才能完成。備份程序執行時，資料庫仍然會接受寫入操作。因此備份可能會包含一些舊的部分和一些新的部分。如果從這樣的備份中恢復，那麼不一致（如消失的錢）就會變成永久的。

分析查詢和完整性檢查
: 有時，你可能需要執行一個查詢，掃描大部分的資料庫。這樣的查詢在分析中很常見（請參閱 “[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析？)”），也可能是定期完整性檢查（即監視資料損壞）的一部分。如果這些查詢在不同時間點觀察資料庫的不同部分，則可能會返回毫無意義的結果。

**快照隔離（snapshot isolation）**【28】是這個問題最常見的解決方案。想法是，每個事務都從資料庫的 **一致快照（consistent snapshot）** 中讀取 —— 也就是說，事務可以看到事務開始時在資料庫中提交的所有資料。即使這些資料隨後被另一個事務更改，每個事務也只能看到該特定時間點的舊資料。

快照隔離對長時間執行的只讀查詢（如備份和分析）非常有用。如果查詢的資料在查詢執行的同時發生變化，則很難理解查詢的含義。當一個事務可以看到資料庫在某個特定時間點凍結時的一致快照，理解起來就很容易了。

快照隔離是一個流行的功能：PostgreSQL、使用 InnoDB 引擎的 MySQL、Oracle、SQL Server 等都支援【23,31,32】。

#### 實現快照隔離

與讀已提交的隔離類似，快照隔離的實現通常使用寫鎖來防止髒寫（請參閱 “[讀已提交](#讀已提交)”），這意味著進行寫入的事務會阻止另一個事務修改同一個物件。但是讀取則不需要加鎖。從效能的角度來看，快照隔離的一個關鍵原則是：**讀不阻塞寫，寫不阻塞讀**。這允許資料庫在處理一致性快照上的長時間查詢時，可以正常地同時處理寫入操作，且兩者間沒有任何鎖爭用。

為了實現快照隔離，資料庫使用了我們看到的用於防止 [圖 7-4](/v1/ddia_0704.png) 中的髒讀的機制的一般化。資料庫可能必須保留一個物件的幾個不同的提交版本，因為各種正在進行的事務可能需要看到資料庫在不同的時間點的狀態。因為它同時維護著單個物件的多個版本，所以這種技術被稱為 **多版本併發控制（MVCC, multi-version concurrency control）**。

如果一個數據庫只需要提供 **讀已提交** 的隔離級別，而不提供 **快照隔離**，那麼保留一個物件的兩個版本就足夠了：已提交的版本和被覆蓋但尚未提交的版本。不過支援快照隔離的儲存引擎通常也使用 MVCC 來實現 **讀已提交** 隔離級別。一種典型的方法是 **讀已提交** 為每個查詢使用單獨的快照，而 **快照隔離** 對整個事務使用相同的快照。

[圖 7-7](/v1/ddia_0707.png) 說明了 PostgreSQL 如何實現基於 MVCC 的快照隔離【31】（其他實現類似）。當一個事務開始時，它被賦予一個唯一的，永遠增長 [^vii] 的事務 ID（`txid`）。每當事務向資料庫寫入任何內容時，它所寫入的資料都會被標記上寫入者的事務 ID。

[^vii]: 事實上，事務 ID 是 32 位整數，所以大約會在 40 億次事務之後溢位。PostgreSQL 的 Vacuum 過程會清理老舊的事務 ID，確保事務 ID 溢位（回捲）不會影響到資料。

![](/v1/ddia_0707.png)

**圖 7-7 使用多版本物件實現快照隔離**

表中的每一行都有一個 `created_by` 欄位，其中包含將該行插入到表中的事務 ID。此外，每行都有一個 `deleted_by` 欄位，最初是空的。如果某個事務刪除了一行，那麼該行實際上並未從資料庫中刪除，而是透過將 `deleted_by` 欄位設定為請求刪除的事務的 ID 來標記為刪除。在稍後的時間，當確定沒有事務可以再訪問已刪除的資料時，資料庫中的垃圾收集過程會將所有帶有刪除標記的行移除，並釋放其空間。[^譯註ii]

[^譯註ii]: 在 PostgreSQL 中，`created_by` 的實際名稱為 `xmin`，`deleted_by` 的實際名稱為 `xmax`

`UPDATE` 操作在內部翻譯為 `DELETE` 和 `INSERT` 。例如，在 [圖 7-7](/v1/ddia_0707.png) 中，事務 13 從賬戶 2 中扣除 100 美元，將餘額從 500 美元改為 400 美元。實際上包含兩條賬戶 2 的記錄：餘額為 \$500 的行被標記為 **被事務 13 刪除**，餘額為 \$400 的行 **由事務 13 建立**。

#### 觀察一致性快照的可見性規則

當一個事務從資料庫中讀取時，事務 ID 用於決定它可以看見哪些物件，看不見哪些物件。透過仔細定義可見性規則，資料庫可以向應用程式呈現一致的資料庫快照。工作如下：

1. 在每次事務開始時，資料庫列出當時所有其他（尚未提交或尚未中止）的事務清單，即使之後提交了，這些事務已執行的任何寫入也都會被忽略。
2. 被中止事務所執行的任何寫入都將被忽略。
3. 由具有較晚事務 ID（即，在當前事務開始之後開始的）的事務所做的任何寫入都被忽略，而不管這些事務是否已經提交。
4. 所有其他寫入，對應用都是可見的。

這些規則適用於建立和刪除物件。在 [圖 7-7](/v1/ddia_0707.png) 中，當事務 12 從賬戶 2 讀取時，它會看到 \$500 的餘額，因為 \$500 餘額的刪除是由事務 13 完成的（根據規則 3，事務 12 看不到事務 13 執行的刪除），且 400 美元記錄的建立也是不可見的（按照相同的規則）。

換句話說，如果以下兩個條件都成立，則可見一個物件：

- 讀事務開始時，建立該物件的事務已經提交。
- 物件未被標記為刪除，或如果被標記為刪除，請求刪除的事務在讀事務開始時尚未提交。

長時間執行的事務可能會長時間使用快照，並繼續讀取（從其他事務的角度來看）早已被覆蓋或刪除的值。由於從來不原地更新值，而是每次值改變時建立一個新的版本，資料庫可以在提供一致快照的同時只產生很小的額外開銷。

#### 索引和快照隔離

索引如何在多版本資料庫中工作？一種選擇是使索引簡單地指向物件的所有版本，並且需要索引查詢來過濾掉當前事務不可見的任何物件版本。當垃圾收集刪除任何事務不再可見的舊物件版本時，相應的索引條目也可以被刪除。

在實踐中，許多實現細節決定了多版本併發控制的效能。例如，如果同一物件的不同版本可以放入同一個頁面中，PostgreSQL 的最佳化可以避免更新索引【31】。

在 CouchDB、Datomic 和 LMDB 中使用另一種方法。雖然它們也使用 [B 樹](/v1_tw/ch3#B樹)，但它們使用的是一種 **僅追加 / 寫時複製（append-only/copy-on-write）** 的變體，它們在更新時不覆蓋樹的頁面，而為每個修改頁面建立一份副本。從父頁面直到樹根都會級聯更新，以指向它們子頁面的新版本。任何不受寫入影響的頁面都不需要被複製，並且保持不變【33,34,35】。

使用僅追加的 B 樹，每個寫入事務（或一批事務）都會建立一棵新的 B 樹，當建立時，從該特定樹根生長的樹就是資料庫的一個一致性快照。沒必要根據事務 ID 過濾掉物件，因為後續寫入不能修改現有的 B 樹；它們只能建立新的樹根。但這種方法也需要一個負責壓縮和垃圾收集的後臺程序。

#### 可重複讀與命名混淆

快照隔離是一個有用的隔離級別，特別對於只讀事務而言。但是，許多資料庫實現了它，卻用不同的名字來稱呼。在 Oracle 中稱為 **可序列化（Serializable）** 的，在 PostgreSQL 和 MySQL 中稱為 **可重複讀（repeatable read）**【23】。

這種命名混淆的原因是 SQL 標準沒有 **快照隔離** 的概念，因為標準是基於 System R 1975 年定義的隔離級別【2】，那時候 **快照隔離** 尚未發明。相反，它定義了 **可重複讀**，表面上看起來與快照隔離很相似。PostgreSQL 和 MySQL 稱其 **快照隔離** 級別為 **可重複讀（repeatable read）**，因為這樣符合標準要求，所以它們可以聲稱自己 “標準相容”。

不幸的是，SQL 標準對隔離級別的定義是有缺陷的 —— 模糊，不精確，並不像標準應有的樣子獨立於實現【28】。有幾個資料庫實現了可重複讀，但它們實際提供的保證存在很大的差異，儘管表面上是標準化的【23】。在研究文獻【29,30】中已經有了可重複讀的正式定義，但大多數的實現並不能滿足這個正式定義。最後，IBM DB2 使用 “可重複讀” 來引用可序列化【8】。

結果，沒有人真正知道 **可重複讀** 的意思。

### 防止丟失更新

到目前為止已經討論的 **讀已提交** 和 **快照隔離** 級別，主要保證了 **只讀事務在併發寫入時** 可以看到什麼。卻忽略了兩個事務併發寫入的問題 —— 我們只討論了髒寫（請參閱 “[沒有髒寫](#沒有髒寫)”），它只是可能出現的寫 - 寫衝突中的一種特定型別。

併發的寫入事務之間還有其他幾種有趣的衝突。其中最著名的是 **丟失更新（lost update）** 問題，如 [圖 7-1](/v1/ddia_0701.png) 所示，以兩個併發計數器增量為例。

如果應用從資料庫中讀取一些值，修改它並寫回修改的值（讀取 - 修改 - 寫入序列），則可能會發生丟失更新的問題。如果兩個事務同時執行，則其中一個的修改可能會丟失，因為第二個寫入的內容並沒有包括第一個事務的修改（有時會說後面寫入 **狠揍（clobber）** 了前面的寫入）。這種模式發生在各種不同的情況下：

- 增加計數器或更新賬戶餘額（需要讀取當前值，計算新值並寫回更新後的值）
- 將本地修改寫入一個複雜值中：例如，將元素新增到 JSON 文件中的一個列表（需要解析文件，進行更改並寫回修改的文件）
- 兩個使用者同時編輯 wiki 頁面，每個使用者透過將整個頁面內容傳送到伺服器來儲存其更改，覆寫資料庫中當前的任何內容。

這是一個普遍的問題，所以已經開發了各種解決方案。

#### 原子寫

許多資料庫提供了原子更新操作，從而消除了在應用程式程式碼中執行讀取 - 修改 - 寫入序列的需要。如果你的程式碼可以用這些操作來表達，那這通常是最好的解決方案。例如，下面的指令在大多數關係資料庫中是併發安全的：

```sql
UPDATE counters SET value = value + 1 WHERE key = 'foo';
```

類似地，像 MongoDB 這樣的文件資料庫提供了對 JSON 文件的一部分進行本地修改的原子操作，Redis 提供了修改資料結構（如優先順序佇列）的原子操作。並不是所有的寫操作都可以用原子操作的方式來表達，例如 wiki 頁面的更新涉及到任意文字編輯 [^viii]，但是在可以使用原子操作的情況下，它們通常是最好的選擇。

[^viii]: 將文字文件的編輯表示為原子的變化流是可能的，儘管相當複雜。請參閱 “[自動衝突解決](/v1_tw/ch5#自動衝突解決)”。

原子操作通常透過在讀取物件時，獲取其上的排它鎖來實現，以便更新完成之前沒有其他事務可以讀取它。這種技術有時被稱為 **遊標穩定性（cursor stability）**【36,37】。另一個選擇是簡單地強制所有的原子操作在單一執行緒上執行。

不幸的是，ORM 框架很容易意外地執行不安全的讀取 - 修改 - 寫入序列，而不是使用資料庫提供的原子操作【38】。如果你知道自己在做什麼那當然不是問題，但它經常產生那種很難測出來的微妙 Bug。

#### 顯式鎖定

如果資料庫的內建原子操作沒有提供必要的功能，防止丟失更新的另一個選擇是讓應用程式顯式地鎖定將要更新的物件。然後應用程式可以執行讀取 - 修改 - 寫入序列，如果任何其他事務嘗試同時讀取同一個物件，則強制等待，直到第一個 **讀取 - 修改 - 寫入序列** 完成。

例如，考慮一個多人遊戲，其中幾個玩家可以同時移動相同的棋子。在這種情況下，一個原子操作可能是不夠的，因為應用程式還需要確保玩家的移動符合遊戲規則，這可能涉及到一些不能合理地用資料庫查詢實現的邏輯。但你可以使用鎖來防止兩名玩家同時移動相同的棋子，如例 7-1 所示。

**例 7-1 顯式鎖定行以防止丟失更新**

```sql
BEGIN TRANSACTION;
SELECT * FROM figures
  WHERE name = 'robot' AND game_id = 222
FOR UPDATE;

-- 檢查玩家的操作是否有效，然後更新先前 SELECT 返回棋子的位置。
UPDATE figures SET position = 'c4' WHERE id = 1234;
COMMIT;
```

- `FOR UPDATE` 子句告訴資料庫應該對該查詢返回的所有行加鎖。

這是有效的，但要做對，你需要仔細考慮應用邏輯。忘記在程式碼某處加鎖很容易引入競爭條件。

#### 自動檢測丟失的更新

原子操作和鎖是透過強制 **讀取 - 修改 - 寫入序列** 按順序發生，來防止丟失更新的方法。另一種方法是允許它們並行執行，如果事務管理器檢測到丟失更新，則中止事務並強制它們重試其 **讀取 - 修改 - 寫入序列**。

這種方法的一個優點是，資料庫可以結合快照隔離高效地執行此檢查。事實上，PostgreSQL 的可重複讀，Oracle 的可序列化和 SQL Server 的快照隔離級別，都會自動檢測到丟失更新，並中止惹麻煩的事務。但是，MySQL/InnoDB 的可重複讀並不會檢測 **丟失更新**【23】。一些作者【28,30】認為，資料庫必須能防止丟失更新才稱得上是提供了 **快照隔離**，所以在這個定義下，MySQL 並不提供快照隔離。

丟失更新檢測是一個很好的功能，因為它不需要應用程式碼使用任何特殊的資料庫功能，你可能會忘記使用鎖或原子操作，從而引入錯誤；但丟失更新的檢測是自動發生的，因此不太容易出錯。

#### 比較並設定（CAS）

在不提供事務的資料庫中，有時會發現一種原子操作：**比較並設定**（CAS, 即 Compare And Set，先前在 “[單物件寫入](#單物件寫入)” 中提到）。此操作的目的是為了避免丟失更新：只有當前值從上次讀取時一直未改變，才允許更新發生。如果當前值與先前讀取的值不匹配，則更新不起作用，且必須重試讀取 - 修改 - 寫入序列。

例如，為了防止兩個使用者同時更新同一個 wiki 頁面，可以嘗試類似這樣的方式，只有當用戶開始編輯後頁面內容未發生改變時，才會更新成功：

```sql
-- 根據資料庫的實現情況，這可能安全也可能不安全
UPDATE wiki_pages SET content = '新內容'
  WHERE id = 1234 AND content = '舊內容';
```

如果內容已經更改並且不再與 “舊內容” 相匹配，則此更新將不起作用，因此你需要檢查更新是否生效，必要時重試。但是，如果資料庫允許 `WHERE` 子句從舊快照中讀取，則此語句可能無法防止丟失更新，因為即使發生了另一個併發寫入，`WHERE` 條件也可能為真。在依賴資料庫的 CAS 操作前要檢查其是否安全。

#### 衝突解決和複製

在複製資料庫中（請參閱 [第五章](/v1_tw/ch5)），防止丟失的更新需要考慮另一個維度：由於在多個節點上存在資料副本，並且在不同節點上的資料可能被併發地修改，因此需要採取一些額外的步驟來防止丟失更新。

鎖和 CAS 操作假定只有一個最新的資料副本。但是多主或無主複製的資料庫通常允許多個寫入併發執行，並以非同步方式複製到副本上，因此無法保證只有一個最新資料的副本。所以基於鎖或 CAS 操作的技術不適用於這種情況（我們將在 “[線性一致性](/v1_tw/ch9#線性一致性)” 中更詳細地討論這個問題）。

相反，如 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)” 一節所述，這種複製資料庫中的一種常見方法是允許併發寫入建立多個衝突版本的值（也稱為兄弟），並使用應用程式碼或特殊資料結構在事實發生之後解決和合並這些版本。

原子操作可以在複製的上下文中很好地工作，尤其當它們具有可交換性時（即，可以在不同的副本上以不同的順序應用它們，且仍然可以得到相同的結果）。例如，遞增計數器或向集合新增元素是可交換的操作。這是 Riak 2.0 資料型別背後的思想，它可以防止複製副本丟失更新。當不同的客戶端同時更新一個值時，Riak 自動將更新合併在一起，以免丟失更新【39】。

另一方面，最後寫入勝利（LWW）的衝突解決方法很容易丟失更新，如 “[最後寫入勝利（丟棄併發寫入）](/v1_tw/ch5#最後寫入勝利（丟棄併發寫入）)” 中所述。不幸的是，LWW 是許多複製資料庫中的預設方案。

### 寫入偏差與幻讀

前面的章節中，我們看到了 **髒寫** 和 **丟失更新**，當不同的事務併發地嘗試寫入相同的物件時，會出現這兩種競爭條件。為了避免資料損壞，這些競爭條件需要被阻止 —— 既可以由資料庫自動執行，也可以透過鎖和原子寫操作這類手動安全措施來防止。

但是，併發寫入間可能發生的競爭條件還沒有完。在本節中，我們將看到一些更微妙的衝突例子。

首先，想象一下這個例子：你正在為醫院寫一個醫生輪班管理程式。醫院通常會同時要求幾位醫生待命，但底線是至少有一位醫生在待命。醫生可以放棄他們的班次（例如，如果他們自己生病了），只要至少有一個同事在這一班中繼續工作【40,41】。

現在想象一下，Alice 和 Bob 是兩位值班醫生。兩人都感到不適，所以他們都決定請假。不幸的是，他們恰好在同一時間點選按鈕下班。[圖 7-8](/v1/ddia_0708.png) 說明了接下來的事情。

![](/v1/ddia_0708.png)

**圖 7-8 寫入偏差導致應用程式錯誤的示例**

在兩個事務中，應用首先檢查是否有兩個或以上的醫生正在值班；如果是的話，它就假定一名醫生可以安全地休班。由於資料庫使用快照隔離，兩次檢查都返回 2 ，所以兩個事務都進入下一個階段。Alice 更新自己的記錄休班了，而 Bob 也做了一樣的事情。兩個事務都成功提交了，現在沒有醫生值班了。違反了至少有一名醫生在值班的要求。

#### 寫入偏差的特徵

這種異常稱為 **寫入偏差**【28】。它既不是 **髒寫**，也不是 **丟失更新**，因為這兩個事務正在更新兩個不同的物件（Alice 和 Bob 各自的待命記錄）。在這裡發生的衝突並不是那麼明顯，但是這顯然是一個競爭條件：如果兩個事務一個接一個地執行，那麼第二個醫生就不能歇班了。異常行為只有在事務併發進行時才有可能發生。

可以將寫入偏差視為丟失更新問題的一般化。如果兩個事務讀取相同的物件，然後更新其中一些物件（不同的事務可能更新不同的物件），則可能發生寫入偏差。在多個事務更新同一個物件的特殊情況下，就會發生髒寫或丟失更新（取決於時序）。

我們已經看到，有各種不同的方法來防止丟失的更新。但對於寫入偏差，我們的選擇更受限制：

* 由於涉及多個物件，單物件的原子操作不起作用。
* 不幸的是，在一些快照隔離的實現中，自動檢測丟失更新對此並沒有幫助。在 PostgreSQL 的可重複讀，MySQL/InnoDB 的可重複讀，Oracle 可序列化或 SQL Server 的快照隔離級別中，都不會自動檢測寫入偏差【23】。自動防止寫入偏差需要真正的可序列化隔離（請參閱 “[可序列化](#可序列化)”）。
* 某些資料庫允許配置約束，然後由資料庫強制執行（例如，唯一性，外部索引鍵約束或特定值限制）。但是為了指定至少有一名醫生必須線上，需要一個涉及多個物件的約束。大多數資料庫沒有內建對這種約束的支援，但是你可以使用觸發器，或者物化檢視來實現它們，這取決於不同的資料庫【42】。
* 如果無法使用可序列化的隔離級別，則此情況下的次優選項可能是顯式鎖定事務所依賴的行。在例子中，你可以寫下如下的程式碼：

```sql
BEGIN TRANSACTION;
SELECT * FROM doctors
  WHERE on_call = TRUE
  AND shift_id = 1234 FOR UPDATE;

UPDATE doctors
  SET on_call = FALSE
  WHERE name = 'Alice'
  AND shift_id = 1234;

COMMIT;
```

* 和以前一樣，`FOR UPDATE` 告訴資料庫鎖定返回的所有行以用於更新。

#### 寫入偏差的更多例子

寫入偏差乍看像是一個深奧的問題，但一旦意識到這一點，很容易會注意到它可能發生在更多場景下。以下是一些例子：

會議室預訂系統
: 比如你想要規定不能在同一時間對同一個會議室進行多次的預訂【43】。當有人想要預訂時，首先檢查是否存在相互衝突的預訂（即預訂時間範圍重疊的同一房間），如果沒有找到，則建立會議（請參閱示例 7-2）[^ix]。

  [^ix]: 在 PostgreSQL 中，你可以使用範圍型別優雅地執行此操作，但在其他資料庫中並未得到廣泛支援。

  **例 7-2 會議室預訂系統試圖避免重複預訂（在快照隔離下不安全）**

  ```sql
  BEGIN TRANSACTION;

  -- 檢查所有現存的與 12:00~13:00 重疊的預定
  SELECT COUNT(*) FROM bookings
  WHERE room_id = 123 AND
    end_time > '2015-01-01 12:00' AND start_time < '2015-01-01 13:00';

  -- 如果之前的查詢返回 0
  INSERT INTO bookings(room_id, start_time, end_time, user_id)
    VALUES (123, '2015-01-01 12:00', '2015-01-01 13:00', 666);

  COMMIT;
  ```

  不幸的是，快照隔離並不能防止另一個使用者同時插入衝突的會議。為了確保不會遇到排程衝突，你又需要可序列化的隔離級別了。

多人遊戲
: 在例 7-1 中，我們使用一個鎖來防止丟失更新（也就是確保兩個玩家不能同時移動同一個棋子）。但是鎖定並不妨礙玩家將兩個不同的棋子移動到棋盤上的相同位置，或者採取其他違反遊戲規則的行為。取決於你正在執行的規則型別，也許可以使用唯一約束（unique constraint），否則你很容易發生寫入偏差。

搶注使用者名稱
: 在每個使用者擁有唯一使用者名稱的網站上，兩個使用者可能會嘗試同時建立具有相同使用者名稱的帳戶。可以在事務中檢查名稱是否被搶佔，如果沒有則使用該名稱建立賬戶。但是像在前面的例子中那樣，在快照隔離下這是不安全的。幸運的是，唯一約束是一個簡單的解決辦法（第二個事務在提交時會因為違反使用者名稱唯一約束而被中止）。

防止雙重開支
: 允許使用者花錢或使用積分的服務，需要檢查使用者的支付數額不超過其餘額。可以透過在使用者的帳戶中插入一個試探性的消費專案來實現這一點，列出帳戶中的所有專案，並檢查總和是否為正值【44】。在寫入偏差場景下，可能會發生兩個支出專案同時插入，一起導致餘額變為負值，但這兩個事務都不會注意到另一個。

#### 導致寫入偏差的幻讀

所有這些例子都遵循類似的模式：

1. 一個 `SELECT` 查詢找出符合條件的行，並檢查是否符合一些要求。（例如：至少有兩名醫生在值班；不存在對該會議室同一時段的預定；棋盤上的位置沒有被其他棋子佔據；使用者名稱還沒有被搶注；賬戶裡還有足夠餘額）

2. 按照第一個查詢的結果，應用程式碼決定是否繼續。（可能會繼續操作，也可能中止並報錯）

3. 如果應用決定繼續操作，就執行寫入（插入、更新或刪除），並提交事務。

   這個寫入的效果改變了步驟 2 中的先決條件。換句話說，如果在提交寫入後，重複執行一次步驟 1 的 SELECT 查詢，將會得到不同的結果。因為寫入改變了符合搜尋條件的行集（現在少了一個醫生值班，那時候的會議室現在已經被預訂了，棋盤上的這個位置已經被佔據了，使用者名稱已經被搶注，賬戶餘額不夠了）。

這些步驟可能以不同的順序發生。例如可以首先進行寫入，然後進行 SELECT 查詢，最後根據查詢結果決定是放棄還是提交。

在醫生值班的例子中，在步驟 3 中修改的行，是步驟 1 中返回的行之一，所以我們可以透過鎖定步驟 1 中的行（`SELECT FOR UPDATE`）來使事務安全並避免寫入偏差。但是其他四個例子是不同的：它們檢查是否 **不存在** 某些滿足條件的行，寫入會 **新增** 一個匹配相同條件的行。如果步驟 1 中的查詢沒有返回任何行，則 `SELECT FOR UPDATE` 鎖不了任何東西。

這種效應：一個事務中的寫入改變另一個事務的搜尋查詢的結果，被稱為 **幻讀**【3】。快照隔離避免了只讀查詢中幻讀，但是在像我們討論的例子那樣的讀寫事務中，幻讀會導致特別棘手的寫入偏差情況。

#### 物化衝突

如果幻讀的問題是沒有物件可以加鎖，也許可以人為地在資料庫中引入一個鎖物件？

例如，在會議室預訂的場景中，可以想象建立一個關於時間槽和房間的表。此表中的每一行對應於特定時間段（例如 15 分鐘）的特定房間。可以提前插入房間和時間的所有可能組合行（例如接下來的六個月）。

現在，要建立預訂的事務可以鎖定（`SELECT FOR UPDATE`）表中與所需房間和時間段對應的行。在獲得鎖定之後，它可以檢查重疊的預訂並像以前一樣插入新的預訂。請注意，這個表並不是用來儲存預訂相關的資訊 —— 它完全就是一組鎖，用於防止同時修改同一房間和時間範圍內的預訂。

這種方法被稱為 **物化衝突（materializing conflicts）**，因為它將幻讀變為資料庫中一組具體行上的鎖衝突【11】。不幸的是，弄清楚如何物化衝突可能很難，也很容易出錯，並且讓併發控制機制洩漏到應用資料模型是很醜陋的做法。出於這些原因，如果沒有其他辦法可以實現，物化衝突應被視為最後的手段。在大多數情況下，**可序列化（Serializable）** 的隔離級別是更可取的。


## 可序列化

在本章中，已經看到了幾個易於出現競爭條件的事務例子。**讀已提交** 和 **快照隔離** 級別會阻止某些競爭條件，但不會阻止另一些。我們遇到了一些特別棘手的例子，**寫入偏差** 和 **幻讀**。這是一個可悲的情況：

- 隔離級別難以理解，並且在不同的資料庫中實現的不一致（例如，“可重複讀” 的含義天差地別）。
- 光檢查應用程式碼很難判斷在特定的隔離級別執行是否安全。特別是在大型應用程式中，你可能並不知道併發發生的所有事情。
- 沒有檢測競爭條件的好工具。原則上來說，靜態分析可能會有幫助【26】，但研究中的技術還沒法實際應用。併發問題的測試是很難的，因為它們通常是非確定性的 —— 只有在倒楣的時序下才會出現問題。

這不是一個新問題，從 20 世紀 70 年代以來就一直是這樣了，當時首先引入了較弱的隔離級別【2】。一直以來，研究人員的答案都很簡單：使用 **可序列化（serializable）** 的隔離級別！

**可序列化（Serializability）** 隔離通常被認為是最強的隔離級別。它保證即使事務可以並行執行，最終的結果也是一樣的，就好像它們沒有任何併發性，連續挨個執行一樣。因此資料庫保證，如果事務在單獨執行時正常執行，則它們在併發執行時繼續保持正確 —— 換句話說，資料庫可以防止 **所有** 可能的競爭條件。

但如果可序列化隔離級別比弱隔離級別的爛攤子要好得多，那為什麼沒有人見人愛？為了回答這個問題，我們需要看看實現可序列化的選項，以及它們如何執行。目前大多數提供可序列化的資料庫都使用了三種技術之一，本章的剩餘部分將會介紹這些技術：

- 字面意義上地序列順序執行事務（請參閱 “[真的序列執行](#真的序列執行)”）
- **兩階段鎖定（2PL, two-phase locking）**，幾十年來唯一可行的選擇（請參閱 “[兩階段鎖定](#兩階段鎖定)”）
- 樂觀併發控制技術，例如 **可序列化快照隔離**（serializable snapshot isolation，請參閱 “[可序列化快照隔離](#可序列化快照隔離)”）

現在將主要在單節點資料庫的背景下討論這些技術；在 [第九章](/v1_tw/ch9) 中，我們將研究如何將它們推廣到涉及分散式系統中多個節點的事務。

### 真的序列執行

避免併發問題的最簡單方法就是完全不要併發：在單個執行緒上按順序一次只執行一個事務。這樣做就完全繞開了檢測 / 防止事務間衝突的問題，由此產生的隔離，正是可序列化的定義。

儘管這似乎是一個明顯的主意，但資料庫設計人員只是在 2007 年左右才決定，單執行緒迴圈執行事務是可行的【45】。如果多執行緒併發在過去的 30 年中被認為是獲得良好效能的關鍵所在，那麼究竟是什麼改變致使單執行緒執行變為可能呢？

兩個進展引發了這個反思：

- RAM 足夠便宜了，許多場景現在都可以將完整的活躍資料集儲存在記憶體中（請參閱 “[在記憶體中儲存一切](/v1_tw/ch3#在記憶體中儲存一切)”）。當事務需要訪問的所有資料都在記憶體中時，事務處理的執行速度要比等待資料從磁碟載入時快得多。
- 資料庫設計人員意識到 OLTP 事務通常很短，而且只進行少量的讀寫操作（請參閱 “[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析？)”）。相比之下，長時間執行的分析查詢通常是只讀的，因此它們可以在序列執行迴圈之外的一致快照（使用快照隔離）上執行。

序列執行事務的方法在 VoltDB/H-Store、Redis 和 Datomic 中實現【46,47,48】。設計用於單執行緒執行的系統有時可以比支援併發的系統性能更好，因為它可以避免鎖的協調開銷。但是其吞吐量僅限於單個 CPU 核的吞吐量。為了充分利用單一執行緒，需要有與傳統形式的事務不同的結構。

#### 在儲存過程中封裝事務

在資料庫的早期階段，意圖是資料庫事務可以包含整個使用者活動流程。例如，預訂機票是一個多階段的過程（搜尋路線，票價和可用座位，決定行程，在每段行程的航班上訂座，輸入乘客資訊，付款）。資料庫設計者認為，如果整個過程是一個事務，那麼它就可以被原子化地執行。

不幸的是，人類做出決定和回應的速度非常緩慢。如果資料庫事務需要等待來自使用者的輸入，則資料庫需要支援潛在的大量併發事務，其中大部分是空閒的。大多數資料庫不能高效完成這項工作，因此幾乎所有的 OLTP 應用程式都避免在事務中等待互動式的使用者輸入，以此來保持事務的簡短。在 Web 上，這意味著事務在同一個 HTTP 請求中被提交 —— 一個事務不會跨越多個請求。一個新的 HTTP 請求開始一個新的事務。

即使已經將人類從關鍵路徑中排除，事務仍然以互動式的客戶端 / 伺服器風格執行，一次一個語句。應用程式進行查詢，讀取結果，可能根據第一個查詢的結果進行另一個查詢，依此類推。查詢和結果在應用程式程式碼（在一臺機器上執行）和資料庫伺服器（在另一臺機器上）之間來回傳送。

在這種互動式的事務方式中，應用程式和資料庫之間的網路通訊耗費了大量的時間。如果不允許在資料庫中進行併發處理，且一次只處理一個事務，則吞吐量將會非常糟糕，因為資料庫大部分的時間都花費在等待應用程式發出當前事務的下一個查詢。在這種資料庫中，為了獲得合理的效能，需要同時處理多個事務。

出於這個原因，具有單執行緒序列事務處理的系統不允許互動式的多語句事務。取而代之，應用程式必須提前將整個事務程式碼作為儲存過程提交給資料庫。這些方法之間的差異如 [圖 7-9](/v1/ddia_0709.png) 所示。如果事務所需的所有資料都在記憶體中，則儲存過程可以非常快地執行，而不用等待任何網路或磁碟 I/O。

![](/v1/ddia_0709.png)

**圖 7-9 互動式事務和儲存過程之間的區別（使用圖 7-8 的示例事務）**

#### 儲存過程的優點和缺點

儲存過程在關係型資料庫中已經存在了一段時間了，自 1999 年以來它們一直是 SQL 標準（SQL/PSM）的一部分。出於各種原因，它們的名聲有點不太好：

- 每個資料庫廠商都有自己的儲存過程語言（Oracle 有 PL/SQL，SQL Server 有 T-SQL，PostgreSQL 有 PL/pgSQL，等等）。這些語言並沒有跟上通用程式語言的發展，所以從今天的角度來看，它們看起來相當醜陋和陳舊，而且缺乏大多數程式語言中能找到的庫的生態系統。
- 在資料庫中執行的程式碼難以管理：與應用伺服器相比，它更難除錯，更難以保持版本控制和部署，更難測試，並且難以整合到指標收集系統來進行監控。
- 資料庫通常比應用伺服器對效能敏感得多，因為單個數據庫例項通常由許多應用伺服器共享。資料庫中一個寫得不好的儲存過程（例如，佔用大量記憶體或 CPU 時間）會比在應用伺服器中相同的程式碼造成更多的麻煩。

但是這些問題都是可以克服的。現代的儲存過程實現放棄了 PL/SQL，而是使用現有的通用程式語言：VoltDB 使用 Java 或 Groovy，Datomic 使用 Java 或 Clojure，而 Redis 使用 Lua。

**儲存過程與記憶體儲存**，使得在單個執行緒上執行所有事務變得可行。由於不需要等待 I/O，且避免了併發控制機制的開銷，它們可以在單個執行緒上實現相當好的吞吐量。

VoltDB 還使用儲存過程進行複製：但不是將事務的寫入結果從一個節點複製到另一個節點，而是在每個節點上執行相同的儲存過程。因此 VoltDB 要求儲存過程是 **確定性的**（在不同的節點上執行時，它們必須產生相同的結果）。舉個例子，如果事務需要使用當前的日期和時間，則必須透過特殊的確定性 API 來實現。

#### 分割槽

順序執行所有事務使併發控制簡單多了，但資料庫的事務吞吐量被限制為單機單核的速度。只讀事務可以使用快照隔離在其它地方執行，但對於寫入吞吐量較高的應用，單執行緒事務處理器可能成為一個嚴重的瓶頸。

為了伸縮至多個 CPU 核心和多個節點，可以對資料進行分割槽（請參閱 [第六章](/v1_tw/ch6)），在 VoltDB 中支援這樣做。如果你可以找到一種對資料集進行分割槽的方法，以便每個事務只需要在單個分割槽中讀寫資料，那麼每個分割槽就可以擁有自己獨立執行的事務處理執行緒。在這種情況下可以為每個分割槽指派一個獨立的 CPU 核，事務吞吐量就可以與 CPU 核數保持線性伸縮【47】。

但是，對於需要訪問多個分割槽的任何事務，資料庫必須在觸及的所有分割槽之間協調事務。儲存過程需要跨越所有分割槽鎖定執行，以確保整個系統的可序列性。

由於跨分割槽事務具有額外的協調開銷，所以它們比單分割槽事務慢得多。VoltDB 報告的吞吐量大約是每秒 1000 個跨分割槽寫入，比單分割槽吞吐量低幾個數量級，並且不能透過增加更多的機器來增加吞吐量【49】。

事務是否可以劃分至單個分割槽很大程度上取決於應用資料的結構。簡單的鍵值資料通常可以非常容易地進行分割槽，但是具有多個次級索引的資料可能需要大量的跨分割槽協調（請參閱 “[分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)”）。

#### 序列執行小結

在特定約束條件下，真的序列執行事務，已經成為一種實現可序列化隔離等級的可行辦法。

- 每個事務都必須小而快，只要有一個緩慢的事務，就會拖慢所有事務處理。
- 僅限於活躍資料集可以放入記憶體的情況。很少訪問的資料可能會被移動到磁碟，但如果需要在單執行緒執行的事務中訪問這些磁碟中的資料，系統就會變得非常慢 [^x]。
- 寫入吞吐量必須低到能在單個 CPU 核上處理，如若不然，事務需要能劃分至單個分割槽，且不需要跨分割槽協調。
- 跨分割槽事務是可能的，但是它們能被使用的程度有很大的限制。

[^x]: 如果事務需要訪問不在記憶體中的資料，最好的解決方案可能是中止事務，非同步地將資料提取到記憶體中，同時繼續處理其他事務，然後在資料載入完畢時重新啟動事務。這種方法被稱為 **反快取（anti-caching）**，正如前面在 “[在記憶體中儲存一切](/v1_tw/ch3#在記憶體中儲存一切)” 中所述。

### 兩階段鎖定

大約 30 年來，在資料庫中只有一種廣泛使用的序列化演算法：**兩階段鎖定（2PL，two-phase locking）** [^xi]

[^xi]: 有時也稱為 **強嚴格兩階段鎖定（SS2PL, strong strict two-phase locking）**，以便和其他 2PL 變體區分。

> #### 2PL不是2PC
>
> 請注意，雖然兩階段鎖定（2PL）聽起來非常類似於兩階段提交（2PC），但它們是完全不同的東西。我們將在 [第九章](/v1_tw/ch9) 討論 2PC。

之前我們看到鎖通常用於防止髒寫（請參閱 “[沒有髒寫](#沒有髒寫)” 一節）：如果兩個事務同時嘗試寫入同一個物件，則鎖可確保第二個寫入必須等到第一個寫入完成事務（中止或提交），然後才能繼續。

兩階段鎖定類似，但是鎖的要求更強得多。只要沒有寫入，就允許多個事務同時讀取同一個物件。但物件只要有寫入（修改或刪除），就需要 **獨佔訪問（exclusive access）** 許可權：

- 如果事務 A 讀取了一個物件，並且事務 B 想要寫入該物件，那麼 B 必須等到 A 提交或中止才能繼續（這確保 B 不能在 A 底下意外地改變物件）。
- 如果事務 A 寫入了一個物件，並且事務 B 想要讀取該物件，則 B 必須等到 A 提交或中止才能繼續（像 [圖 7-1](/v1/ddia_0701.png) 那樣讀取舊版本的物件在 2PL 下是不可接受的）。

在 2PL 中，寫入不僅會阻塞其他寫入，也會阻塞讀，反之亦然。快照隔離使得 **讀不阻塞寫，寫也不阻塞讀**（請參閱 “[實現快照隔離](#實現快照隔離)”），這是 2PL 和快照隔離之間的關鍵區別。另一方面，因為 2PL 提供了可序列化的性質，它可以防止早先討論的所有競爭條件，包括丟失更新和寫入偏差。

#### 實現兩階段鎖

2PL 用於 MySQL（InnoDB）和 SQL Server 中的可序列化隔離級別，以及 DB2 中的可重複讀隔離級別【23,36】。

讀與寫的阻塞是透過為資料庫中每個物件新增鎖來實現的。鎖可以處於 **共享模式（shared mode）** 或 **獨佔模式（exclusive mode）**。鎖使用如下：

- 若事務要讀取物件，則須先以共享模式獲取鎖。允許多個事務同時持有共享鎖。但如果另一個事務已經在物件上持有排它鎖，則這些事務必須等待。
- 若事務要寫入一個物件，它必須首先以獨佔模式獲取該鎖。沒有其他事務可以同時持有鎖（無論是共享模式還是獨佔模式），所以如果物件上存在任何鎖，該事務必須等待。
- 如果事務先讀取再寫入物件，則它可能會將其共享鎖升級為獨佔鎖。升級鎖的工作與直接獲得獨佔鎖相同。
- 事務獲得鎖之後，必須繼續持有鎖直到事務結束（提交或中止）。這就是 “兩階段” 這個名字的來源：第一階段（當事務正在執行時）獲取鎖，第二階段（在事務結束時）釋放所有的鎖。

由於使用了這麼多的鎖，因此很可能會發生：事務 A 等待事務 B 釋放它的鎖，反之亦然。這種情況叫做 **死鎖（Deadlock）**。資料庫會自動檢測事務之間的死鎖，並中止其中一個，以便另一個繼續執行。被中止的事務需要由應用程式重試。

#### 兩階段鎖定的效能

兩階段鎖定的巨大缺點，以及 70 年代以來沒有被所有人使用的原因，是其效能問題。兩階段鎖定下的事務吞吐量與查詢響應時間比弱隔離級別下要差得多。

這一部分是由於獲取和釋放所有這些鎖的開銷，但更重要的是由於併發性的降低。按照設計，如果兩個併發事務試圖做任何可能導致競爭條件的事情，那麼必須等待另一個完成。

傳統的關係資料庫不限制事務的持續時間，因為它們是為等待人類輸入的互動式應用而設計的。因此，當一個事務需要等待另一個事務時，等待的時長並沒有限制。即使你保證所有的事務都很短，如果有多個事務想要訪問同一個物件，那麼可能會形成一個佇列，所以事務可能需要等待幾個其他事務才能完成。

因此，執行 2PL 的資料庫可能具有相當不穩定的延遲，如果在工作負載中存在爭用，那麼可能高百分位點處的響應會非常的慢（請參閱 “[描述效能](/v1_tw/ch1#描述效能)”）。可能只需要一個緩慢的事務，或者一個訪問大量資料並獲取許多鎖的事務，就能把系統的其他部分拖慢，甚至迫使系統停機。當需要穩健的操作時，這種不穩定性是有問題的。

基於鎖實現的讀已提交隔離級別可能發生死鎖，但在基於 2PL 實現的可序列化隔離級別中，它們會出現得頻繁得多（取決於事務的訪問模式）。這可能是一個額外的效能問題：當事務由於死鎖而被中止並被重試時，它需要從頭重做它的工作。如果死鎖很頻繁，這可能意味著巨大的浪費。

#### 謂詞鎖

在前面關於鎖的描述中，我們掩蓋了一個微妙而重要的細節。在 “[導致寫入偏差的幻讀](#導致寫入偏差的幻讀)” 中，我們討論了 **幻讀（phantoms）** 的問題。即一個事務改變另一個事務的搜尋查詢的結果。具有可序列化隔離級別的資料庫必須防止 **幻讀**。

在會議室預訂的例子中，這意味著如果一個事務在某個時間視窗內搜尋了一個房間的現有預訂（見例 7-2），則另一個事務不能同時插入或更新同一時間視窗與同一房間的另一個預訂（可以同時插入其他房間的預訂，或在不影響另一個預定的條件下預定同一房間的其他時間段）。

如何實現這一點？從概念上講，我們需要一個 **謂詞鎖（predicate lock）**【3】。它類似於前面描述的共享 / 排它鎖，但不屬於特定的物件（例如，表中的一行），它屬於所有符合某些搜尋條件的物件，如：

```sql
SELECT * FROM bookings
WHERE room_id = 123 AND
      end_time > '2018-01-01 12:00' AND
      start_time < '2018-01-01 13:00';
```

謂詞鎖限制訪問，如下所示：

- 如果事務 A 想要讀取匹配某些條件的物件，就像在這個 `SELECT` 查詢中那樣，它必須獲取查詢條件上的 **共享謂詞鎖（shared-mode predicate lock）**。如果另一個事務 B 持有任何滿足這一查詢條件物件的排它鎖，那麼 A 必須等到 B 釋放它的鎖之後才允許進行查詢。
- 如果事務 A 想要插入，更新或刪除任何物件，則必須首先檢查舊值或新值是否與任何現有的謂詞鎖匹配。如果事務 B 持有匹配的謂詞鎖，那麼 A 必須等到 B 已經提交或中止後才能繼續。

這裡的關鍵思想是，謂詞鎖甚至適用於資料庫中尚不存在，但將來可能會新增的物件（幻象）。如果兩階段鎖定包含謂詞鎖，則資料庫將阻止所有形式的寫入偏差和其他競爭條件，因此其隔離實現了可序列化。

#### 索引範圍鎖

不幸的是謂詞鎖效能不佳：**如果活躍事務持有很多鎖，檢查匹配的鎖會非常耗時。** 因此，大多數使用 2PL 的資料庫實際上實現了索引範圍鎖（index-range locking，也稱為 **next-key locking**），這是一個簡化的近似版謂詞鎖【41,50】。

透過使謂詞匹配到一個更大的集合來簡化謂詞鎖是安全的。例如，如果你有在中午和下午 1 點之間預訂 123 號房間的謂詞鎖，則鎖定 123 號房間的所有時間段，或者鎖定 12:00~13:00 時間段的所有房間（不只是 123 號房間）是一個安全的近似，因為任何滿足原始謂詞的寫入也一定會滿足這種更鬆散的近似。

在房間預訂資料庫中，你可能會在 `room_id` 列上有一個索引，並且 / 或者在 `start_time` 和 `end_time` 上有索引（否則前面的查詢在大型資料庫上的速度會非常慢）：

- 假設你的索引位於 `room_id` 上，並且資料庫使用此索引查詢 123 號房間的現有預訂。現在資料庫可以簡單地將共享鎖附加到這個索引項上，指示事務已搜尋 123 號房間用於預訂。
- 或者，如果資料庫使用基於時間的索引來查詢現有預訂，那麼它可以將共享鎖附加到該索引中的一系列值，指示事務已經將 12:00~13:00 時間段標記為用於預定。

無論哪種方式，搜尋條件的近似值都附加到其中一個索引上。現在，如果另一個事務想要插入、更新或刪除同一個房間和 / 或重疊時間段的預訂，則它將不得不更新索引的相同部分。在這樣做的過程中，它會遇到共享鎖，它將被迫等到鎖被釋放。

這種方法能夠有效防止幻讀和寫入偏差。索引範圍鎖並不像謂詞鎖那樣精確（它們可能會鎖定更大範圍的物件，而不是維持可序列化所必需的範圍），但是由於它們的開銷較低，所以是一個很好的折衷。

如果沒有可以掛載範圍鎖的索引，資料庫可以退化到使用整個表上的共享鎖。這對效能不利，因為它會阻止所有其他事務寫入表格，但這是一個安全的回退位置。


### 可序列化快照隔離

本章描繪了資料庫中併發控制的黯淡畫面。一方面，我們實現了效能不好（2PL）或者伸縮性不好（序列執行）的可序列化隔離級別。另一方面，我們有效能良好的弱隔離級別，但容易出現各種競爭條件（丟失更新、寫入偏差、幻讀等）。序列化的隔離級別和高效能是從根本上相互矛盾的嗎？

也許不是：一個稱為 **可序列化快照隔離（SSI, serializable snapshot isolation）** 的演算法是非常有前途的。它提供了完整的可序列化隔離級別，但與快照隔離相比只有很小的效能損失。SSI 是相當新的：它在 2008 年首次被描述【40】，並且是 Michael Cahill 的博士論文【51】的主題。

今天，SSI 既用於單節點資料庫（PostgreSQL 9.1 以後的可序列化隔離級別），也用於分散式資料庫（FoundationDB 使用類似的演算法）。由於 SSI 與其他併發控制機制相比還很年輕，還處於在實踐中證明自己表現的階段。但它有可能因為足夠快而在未來成為新的預設選項。

#### 悲觀與樂觀的併發控制

兩階段鎖是一種所謂的 **悲觀併發控制機制（pessimistic）** ：它是基於這樣的原則：如果有事情可能出錯（如另一個事務所持有的鎖所表示的），最好等到情況安全後再做任何事情。這就像互斥，用於保護多執行緒程式設計中的資料結構。

從某種意義上說，序列執行可以稱為悲觀到了極致：在事務持續期間，每個事務對整個資料庫（或資料庫的一個分割槽）具有排它鎖，作為對悲觀的補償，我們讓每筆事務執行得非常快，所以只需要短時間持有 “鎖”。

相比之下，**可序列化快照隔離** 是一種 **樂觀（optimistic）** 的併發控制技術。在這種情況下，樂觀意味著，如果存在潛在的危險也不阻止事務，而是繼續執行事務，希望一切都會好起來。當一個事務想要提交時，資料庫檢查是否有什麼不好的事情發生（即隔離是否被違反）；如果是的話，事務將被中止，並且必須重試。只有可序列化的事務才被允許提交。

樂觀併發控制是一個古老的想法【52】，其優點和缺點已經爭論了很長時間【53】。如果存在很多 **爭用**（contention，即很多事務試圖訪問相同的物件），則表現不佳，因為這會導致很大一部分事務需要中止。如果系統已經接近最大吞吐量，來自重試事務的額外負載可能會使效能變差。

但是，如果有足夠的空閒容量，並且事務之間的爭用不是太高，樂觀的併發控制技術往往比悲觀的效能要好。可交換的原子操作可以減少爭用：例如，如果多個事務同時要增加一個計數器，那麼應用增量的順序（只要計數器不在同一個事務中讀取）就無關緊要了，所以併發增量可以全部應用且不會有衝突。

顧名思義，SSI 基於快照隔離 —— 也就是說，事務中的所有讀取都是來自資料庫的一致性快照（請參閱 “[快照隔離和可重複讀](#快照隔離和可重複讀)”）。與早期的樂觀併發控制技術相比這是主要的區別。在快照隔離的基礎上，SSI 添加了一種演算法來檢測寫入之間的序列化衝突，並確定要中止哪些事務。

#### 基於過時前提的決策

先前討論了快照隔離中的寫入偏差（請參閱 “[寫入偏差與幻讀](#寫入偏差與幻讀)”）時，我們觀察到一個反覆出現的模式：事務從資料庫讀取一些資料，檢查查詢的結果，並根據它看到的結果決定採取一些操作（寫入資料庫）。但是，在快照隔離的情況下，原始查詢的結果在事務提交時可能不再是最新的，因為資料可能在同一時間被修改。

換句話說，事務基於一個 **前提（premise）** 採取行動（事務開始時候的事實，例如：“目前有兩名醫生正在值班”）。之後當事務要提交時，原始資料可能已經改變 —— 前提可能不再成立。

當應用程式進行查詢時（例如，“當前有多少醫生正在值班？”），資料庫不知道應用邏輯如何使用該查詢結果。在這種情況下為了安全，資料庫需要假設任何對該結果集的變更都可能會使該事務中的寫入變得無效。換而言之，事務中的查詢與寫入可能存在因果依賴。為了提供可序列化的隔離級別，如果事務在過時的前提下執行操作，資料庫必須能檢測到這種情況，並中止事務。

資料庫如何知道查詢結果是否可能已經改變？有兩種情況需要考慮：

- 檢測對舊 MVCC 物件版本的讀取（讀之前存在未提交的寫入）
- 檢測影響先前讀取的寫入（讀之後發生寫入）

#### 檢測舊MVCC讀取

回想一下，快照隔離通常是透過多版本併發控制（MVCC；見 [圖 7-10](/v1/ddia_0710.png)）來實現的。當一個事務從 MVCC 資料庫中的一致快照讀時，它將忽略取快照時尚未提交的任何其他事務所做的寫入。在 [圖 7-10](/v1/ddia_0710.png) 中，事務 43 認為 Alice 的 `on_call = true` ，因為事務 42（修改 Alice 的待命狀態）未被提交。然而，在事務 43 想要提交時，事務 42 已經提交。這意味著在讀一致性快照時被忽略的寫入已經生效，事務 43 的前提不再為真。

![](/v1/ddia_0710.png)

**圖 7-10 檢測事務何時從 MVCC 快照讀取過時的值**

為了防止這種異常，資料庫需要跟蹤一個事務由於 MVCC 可見性規則而忽略另一個事務的寫入。當事務想要提交時，資料庫檢查是否有任何被忽略的寫入現在已經被提交。如果是這樣，事務必須中止。

為什麼要等到提交？當檢測到陳舊的讀取時，為什麼不立即中止事務 43 ？因為如果事務 43 是只讀事務，則不需要中止，因為沒有寫入偏差的風險。當事務 43 進行讀取時，資料庫還不知道事務是否要稍後執行寫操作。此外，事務 42 可能在事務 43 被提交的時候中止或者可能仍然未被提交，因此讀取可能終究不是陳舊的。透過避免不必要的中止，SSI 保留了快照隔離從一致快照中長時間讀取的能力。

#### 檢測影響之前讀取的寫入

第二種情況要考慮的是另一個事務在讀取資料之後修改資料。這種情況如 [圖 7-11](/v1/ddia_0711.png) 所示。

![](/v1/ddia_0711.png)

**圖 7-11 在可序列化快照隔離中，檢測一個事務何時修改另一個事務的讀取。**

在兩階段鎖定的上下文中，我們討論了索引範圍鎖（請參閱 “[索引範圍鎖](#索引範圍鎖)”），它允許資料庫鎖定與某個搜尋查詢匹配的所有行的訪問權，例如 `WHERE shift_id = 1234`。可以在這裡使用類似的技術，除了 SSI 鎖不會阻塞其他事務。

在 [圖 7-11](/v1/ddia_0711.png) 中，事務 42 和 43 都在班次 1234 查詢值班醫生。如果在 `shift_id` 上有索引，則資料庫可以使用索引項 1234 來記錄事務 42 和 43 讀取這個資料的事實。（如果沒有索引，這個資訊可以在表級別進行跟蹤）。這個資訊只需要保留一段時間：在一個事務完成（提交或中止），並且所有的併發事務完成之後，資料庫就可以忘記它讀取的資料了。

當事務寫入資料庫時，它必須在索引中查詢最近曾讀取受影響資料的其他事務。這個過程類似於在受影響的鍵範圍上獲取寫鎖，但鎖並不會阻塞事務直到其他讀事務完成，而是像警戒線一樣只是簡單通知其他事務：你們讀過的資料可能不是最新的啦。

在 [圖 7-11](/v1/ddia_0711.png) 中，事務 42 通知事務 43 其先前讀已過時，反之亦然。事務 42 首先提交並成功，儘管事務 43 的寫影響了 42 ，但因為事務 43 尚未提交，所以寫入尚未生效。然而當事務 43 想要提交時，來自事務 42 的衝突寫入已經被提交，所以事務 43 必須中止。

#### 可序列化快照隔離的效能

與往常一樣，許多工程細節會影響演算法的實際表現。例如一個權衡是跟蹤事務的讀取和寫入的 **粒度（granularity）**。如果資料庫詳細地跟蹤每個事務的活動（細粒度），那麼可以準確地確定哪些事務需要中止，但是簿記開銷可能變得很顯著。簡略的跟蹤速度更快（粗粒度），但可能會導致更多不必要的事務中止。

在某些情況下，事務可以讀取被另一個事務覆蓋的資訊：這取決於發生了什麼，有時可以證明執行結果無論如何都是可序列化的。PostgreSQL 使用這個理論來減少不必要的中止次數【11,41】。

與兩階段鎖定相比，可序列化快照隔離的最大優點是一個事務不需要阻塞等待另一個事務所持有的鎖。就像在快照隔離下一樣，寫不會阻塞讀，反之亦然。這種設計原則使得查詢延遲更可預測，波動更少。特別是，只讀查詢可以執行在一致快照上，而不需要任何鎖定，這對於讀取繁重的工作負載非常有吸引力。

與序列執行相比，可序列化快照隔離並不侷限於單個 CPU 核的吞吐量：FoundationDB 將序列化衝突的檢測分佈在多臺機器上，允許擴充套件到很高的吞吐量。即使資料可能跨多臺機器進行分割槽，事務也可以在保證可序列化隔離等級的同時讀寫多個分割槽中的資料【54】。

中止率顯著影響 SSI 的整體表現。例如，長時間讀取和寫入資料的事務很可能會發生衝突並中止，因此 SSI 要求同時讀寫的事務儘量短（只讀的長事務可能沒問題）。對於慢事務，SSI 可能比兩階段鎖定或序列執行更不敏感。


## 本章小結

事務是一個抽象層，允許應用程式假裝某些併發問題和某些型別的硬體和軟體故障不存在。各式各樣的錯誤被簡化為一種簡單情況：**事務中止（transaction abort）**，而應用需要的僅僅是重試。

在本章中介紹了很多問題，事務有助於防止這些問題發生。並非所有應用都易受此類問題影響：具有非常簡單訪問模式的應用（例如每次讀寫單條記錄）可能無需事務管理。但是對於更複雜的訪問模式，事務可以大大減少需要考慮的潛在錯誤情景數量。

如果沒有事務處理，各種錯誤情況（程序崩潰、網路中斷、停電、磁碟已滿、意外併發等）意味著資料可能以各種方式變得不一致。例如，反正規化的資料可能很容易與源資料不同步。如果沒有事務處理，就很難推斷複雜的互動訪問可能對資料庫造成的影響。

本章深入討論了 **併發控制** 的話題。我們討論了幾個廣泛使用的隔離級別，特別是 **讀已提交**、**快照隔離**（有時稱為可重複讀）和 **可序列化**。並透過研究競爭條件的各種例子，來描述這些隔離等級：

髒讀
: 一個客戶端讀取到另一個客戶端尚未提交的寫入。**讀已提交** 或更強的隔離級別可以防止髒讀。

髒寫
: 一個客戶端覆蓋寫入了另一個客戶端尚未提交的寫入。幾乎所有的事務實現都可以防止髒寫。

讀偏差（不可重複讀）
: 在同一個事務中，客戶端在不同的時間點會看見資料庫的不同狀態。**快照隔離** 經常用於解決這個問題，它允許事務從一個特定時間點的一致性快照中讀取資料。快照隔離通常使用 **多版本併發控制（MVCC）** 來實現。

丟失更新
: 兩個客戶端同時執行 **讀取 - 修改 - 寫入序列**。其中一個寫操作，在沒有合併另一個寫入變更情況下，直接覆蓋了另一個寫操作的結果。所以導致資料丟失。快照隔離的一些實現可以自動防止這種異常，而另一些實現則需要手動鎖定（`SELECT FOR UPDATE`）。

寫偏差
: 一個事務讀取一些東西，根據它所看到的值作出決定，並將該決定寫入資料庫。但是，寫入時，該決定的前提不再是真實的。只有可序列化的隔離才能防止這種異常。

幻讀
: 事務讀取符合某些搜尋條件的物件。另一個客戶端進行寫入，影響搜尋結果。快照隔離可以防止直接的幻像讀取，但是寫入偏差上下文中的幻讀需要特殊處理，例如索引範圍鎖定。

弱隔離級別可以防止其中一些異常情況，但要求你，也就是應用程式開發人員手動處理剩餘那些（例如，使用顯式鎖定）。只有可序列化的隔離才能防範所有這些問題。我們討論了實現可序列化事務的三種不同方法：

字面意義上的序列執行
: 如果每個事務的執行速度非常快，並且事務吞吐量足夠低，足以在單個 CPU 核上處理，這是一個簡單而有效的選擇。

兩階段鎖定
: 數十年來，兩階段鎖定一直是實現可序列化的標準方式，但是許多應用出於效能問題的考慮避免使用它。

可序列化快照隔離（SSI）
: 一個相當新的演算法，避免了先前方法的大部分缺點。它使用樂觀的方法，允許事務執行而無需阻塞。當一個事務想要提交時，它會進行檢查，如果執行不可序列化，事務就會被中止。

本章中的示例主要是在關係資料模型的上下文中。但是，正如在 **“[多物件事務的需求](#多物件事務的需求)”** 中所討論的，無論使用哪種資料模型，事務都是有價值的資料庫功能。

本章主要是在單機資料庫的上下文中，探討了各種想法和演算法。分散式資料庫中的事務，則引入了一系列新的困難挑戰，我們將在接下來的兩章中討論。


## 參考文獻

1. Donald D. Chamberlin, Morton M. Astrahan, Michael W. Blasgen, et al.: “[A History and Evaluation of System R](https://citeseerx.ist.psu.edu/pdf/ebb29a0ca16e04e7eeb6b606b22a9eadb3a9d531),” *Communications of the ACM*, volume 24, number 10, pages 632–646, October 1981. [doi:10.1145/358769.358784](http://dx.doi.org/10.1145/358769.358784)
1. Jim N. Gray, Raymond A. Lorie, Gianfranco R. Putzolu, and Irving L. Traiger: “[Granularity of Locks and Degrees of Consistency in a Shared Data Base](https://citeseerx.ist.psu.edu/pdf/e127f0a6a912bb9150ecfe03c0ebf7fbc289a023),” in *Modelling in Data Base Management Systems: Proceedings of the IFIP Working Conference on Modelling in Data Base Management Systems*, edited by G. M. Nijssen, pages 364–394, Elsevier/North Holland Publishing, 1976. Also in *Readings in Database Systems*, 4th edition, edited by Joseph M. Hellerstein and Michael Stonebraker, MIT Press, 2005. ISBN: 978-0-262-69314-1
1. Kapali P. Eswaran, Jim N. Gray, Raymond A. Lorie, and Irving L. Traiger: “[The Notions of Consistency and Predicate Locks in a Database System](http://research.microsoft.com/en-us/um/people/gray/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf),” *Communications of the ACM*, volume 19, number 11, pages 624–633, November 1976.
1. “[ACID Transactions Are Incredibly Helpful](http://web.archive.org/web/20150320053809/https://foundationdb.com/acid-claims),” FoundationDB, LLC, 2013.
1. John D. Cook: “[ACID Versus BASE for Database Transactions](http://www.johndcook.com/blog/2009/07/06/brewer-cap-theorem-base/),” *johndcook.com*, July 6, 2009.
1. Gavin Clarke: “[NoSQL's CAP Theorem Busters: We Don't Drop ACID](http://www.theregister.co.uk/2012/11/22/foundationdb_fear_of_cap_theorem/),” *theregister.co.uk*, November 22, 2012.
1. Theo Härder and Andreas Reuter: “[Principles of Transaction-Oriented Database Recovery](https://citeseerx.ist.psu.edu/pdf/11ef7c142295aeb1a28a0e714c91fc8d610c3047),” *ACM Computing Surveys*, volume 15, number 4, pages 287–317, December 1983. [doi:10.1145/289.291](http://dx.doi.org/10.1145/289.291)
1. Peter Bailis, Alan Fekete, Ali Ghodsi, et al.: “[HAT, not CAP: Towards Highly Available Transactions](http://www.bailis.org/papers/hat-hotos2013.pdf),” at *14th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2013.
1. Armando Fox, Steven D. Gribble, Yatin Chawathe, et al.: “[Cluster-Based Scalable Network Services](https://people.eecs.berkeley.edu/~brewer/cs262b/TACC.pdf),” at *16th ACM Symposium on Operating Systems Principles* (SOSP), October 1997.
1. Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman: [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at *research.microsoft.com*.
1. Alan Fekete, Dimitrios Liarokapis, Elizabeth O'Neil, et al.: “[Making Snapshot Isolation Serializable](https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/2009/Papers/p492-fekete.pdf),” *ACM Transactions on Database Systems*, volume 30, number 2, pages 492–528, June 2005. [doi:10.1145/1071610.1071615](http://dx.doi.org/10.1145/1071610.1071615)
1. Mai Zheng, Joseph Tucek, Feng Qin, and Mark Lillibridge: “[Understanding the Robustness of SSDs Under Power Fault](https://www.usenix.org/system/files/conference/fast13/fast13-final80.pdf),” at *11th USENIX Conference on File and Storage Technologies* (FAST), February 2013.
1. Laurie Denness: “[SSDs: A Gift and a Curse](https://laur.ie/blog/2015/06/ssds-a-gift-and-a-curse/),” *laur.ie*, June 2, 2015.
1. Adam Surak: “[When Solid State Drives Are Not That Solid](https://blog.algolia.com/when-solid-state-drives-are-not-that-solid/),” *blog.algolia.com*, June 15, 2015.
1. Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, et al.: “[All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](http://research.cs.wisc.edu/wind/Publications/alice-osdi14.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Chris Siebenmann: “[Unix's File Durability Problem](https://utcc.utoronto.ca/~cks/space/blog/unix/FileSyncProblem),” *utcc.utoronto.ca*, April 14, 2016.
1. Lakshmi N. Bairavasundaram, Garth R. Goodson, Bianca Schroeder, et al.: “[An Analysis of Data Corruption in the Storage Stack](http://research.cs.wisc.edu/adsl/Publications/corruption-fast08.pdf),” at *6th USENIX Conference on File and Storage Technologies* (FAST), February 2008.
1. Bianca Schroeder, Raghav Lagisetty, and Arif Merchant: “[Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/conference/fast16/technical-sessions/presentation/schroeder),” at *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
1. Don Allison: “[SSD Storage – Ignorance of Technology Is No Excuse](https://blog.korelogic.com/blog/2015/03/24),” *blog.korelogic.com*, March 24, 2015.
1. Dave Scherer: “[Those Are Not Transactions (Cassandra 2.0)](http://web.archive.org/web/20150526065247/http://blog.foundationdb.com/those-are-not-transactions-cassandra-2-0),” *blog.foundationdb.com*, September 6, 2013.
1. Kyle Kingsbury: “[Call Me Maybe: Cassandra](http://aphyr.com/posts/294-call-me-maybe-cassandra/),” *aphyr.com*, September 24, 2013.
1. “[ACID Support in Aerospike](https://web.archive.org/web/20170305002118/https://www.aerospike.com/docs/architecture/assets/AerospikeACIDSupport.pdf),” Aerospike, Inc., June 2014.
1. Martin Kleppmann: “[Hermitage: Testing the 'I' in ACID](http://martin.kleppmann.com/2014/11/25/hermitage-testing-the-i-in-acid.html),” *martin.kleppmann.com*, November 25, 2014.
1. Tristan D'Agosta: “[BTC Stolen from Poloniex](https://bitcointalk.org/index.php?topic=499580),” *bitcointalk.org*, March 4, 2014.
1. bitcointhief2: “[How I Stole Roughly 100 BTC from an Exchange and How I Could Have Stolen More!](http://www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/),” *reddit.com*, February 2, 2014.
1. Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. Michael Melanson: “[Transactions: The Limits of Isolation](https://www.michaelmelanson.net/posts/transactions-the-limits-of-isolation/),” *michaelmelanson.net*, November 30, 2014.
1. Hal Berenson, Philip A. Bernstein, Jim N. Gray, et al.: “[A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1995.
1. Atul Adya: “[Weak Consistency: A Generalized Theory and Optimistic Implementations for Distributed Transactions](http://pmg.csail.mit.edu/papers/adya-phd.pdf),” PhD Thesis, Massachusetts Institute of Technology, March 1999.
1. Peter Bailis, Aaron Davidson, Alan Fekete, et al.: “[Highly Available Transactions: Virtues and Limitations (Extended Version)](http://arxiv.org/pdf/1302.0309.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
1. Bruce Momjian: “[MVCC Unmasked](http://momjian.us/main/presentations/internals.html#mvcc),” *momjian.us*, July 2014.
1. Annamalai Gurusami: “[Repeatable Read Isolation Level in InnoDB – How Consistent Read View Works](https://web.archive.org/web/20161225080947/https://blogs.oracle.com/mysqlinnodb/entry/repeatable_read_isolation_level_in),” *blogs.oracle.com*, January 15, 2013.
1. Nikita Prokopov: “[Unofficial Guide to Datomic Internals](http://tonsky.me/blog/unofficial-guide-to-datomic-internals/),” *tonsky.me*, May 6, 2014.
1. Baron Schwartz: “[Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20220122020806/https://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/),” *xaprb.com*, December 28, 2013.
1. J. Chris Anderson, Jan Lehnardt, and Noah Slater: *CouchDB: The Definitive Guide*. O'Reilly Media, 2010. ISBN: 978-0-596-15589-6
1. Rikdeb Mukherjee: “[Isolation in DB2 (Repeatable Read, Read Stability, Cursor Stability, Uncommitted Read) with Examples](http://mframes.blogspot.co.uk/2013/07/isolation-in-cursor.html),” *mframes.blogspot.co.uk*, July 4, 2013.
1. Steve Hilker: “[Cursor Stability (CS) – IBM DB2 Community](https://web.archive.org/web/20150420001721/http://www.toadworld.com/platforms/ibmdb2/w/wiki/6661.cursor-stability-cs.aspx),” *toadworld.com*, March 14, 2013.
1. Nate Wiger: “[An Atomic Rant](https://nateware.com/2010/02/18/an-atomic-rant/),” *nateware.com*, February 18, 2010.
1. Joel Jacobson: “[Riak 2.0: Data Types](https://web.archive.org/web/20160327135816/http://blog.joeljacobson.com/riak-2-0-data-types/),” *blog.joeljacobson.com*, March 23, 2014.
1. Michael J. Cahill, Uwe Röhm, and Alan Fekete: “[Serializable Isolation for Snapshot Databases](https://web.archive.org/web/20200709144151/https://cs.nyu.edu/courses/Fall12/CSCI-GA.2434-001/p729-cahill.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376690](http://dx.doi.org/10.1145/1376616.1376690)
1. Dan R. K. Ports and Kevin Grittner: “[Serializable Snapshot Isolation in PostgreSQL](http://drkp.net/papers/ssi-vldb12.pdf),” at *38th International Conference on Very Large Databases* (VLDB), August 2012.
1. Tony Andrews: “[Enforcing Complex Constraints in Oracle](http://tonyandrews.blogspot.co.uk/2004/10/enforcing-complex-constraints-in.html),” *tonyandrews.blogspot.co.uk*, October 15, 2004.
1. Douglas B. Terry, Marvin M. Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](https://citeseerx.ist.psu.edu/pdf/20c450f099b661c5a2dff3f348773a0d1af1b09b),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
1. Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
1. Michael Stonebraker, Samuel Madden, Daniel J. Abadi, et al.: “[The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://citeseerx.ist.psu.edu/pdf/775d54c66d271028a7d4dadf07cce6f918584cd3),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
1. John Hugg: “[H-Store/VoltDB Architecture vs. CEP Systems and Newer Streaming Architectures](https://www.youtube.com/watch?v=hD5M4a1UVz8),” at *Data @Scale Boston*, November 2014.
1. Robert Kallman, Hideaki Kimura, Jonathan Natkins, et al.: “[H-Store: A High-Performance, Distributed Main Memory Transaction Processing System](http://www.vldb.org/pvldb/vol1/1454211.pdf),” *Proceedings of the VLDB Endowment*, volume 1, number 2, pages 1496–1499, August 2008.
1. Rich Hickey: “[The Architecture of Datomic](http://www.infoq.com/articles/Architecture-Datomic),” *infoq.com*, November 2, 2012.
1. John Hugg: “[Debunking Myths About the VoltDB In-Memory Database](https://dzone.com/articles/debunking-myths-about-voltdb),” *dzone.com*, May 28, 2014.
1. Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton: “[Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf),” *Foundations and Trends in Databases*, volume 1, number 2, pages 141–259, November 2007. [doi:10.1561/1900000002](http://dx.doi.org/10.1561/1900000002)
1. Michael J. Cahill: “[Serializable Isolation for Snapshot Databases](https://ses.library.usyd.edu.au/bitstream/handle/2123/5353/michael-cahill-2009-thesis.pdf),” PhD Thesis, University of Sydney, July 2009.
1. D. Z. Badal: “[Correctness of Concurrency Control and Implications in Distributed Databases](http://ieeexplore.ieee.org/abstract/document/762563/),” at *3rd International IEEE Computer Software and Applications Conference* (COMPSAC), November 1979.
1. Rakesh Agrawal, Michael J. Carey, and Miron Livny: “[Concurrency Control Performance Modeling: Alternatives and Implications](http://www.eecs.berkeley.edu/~brewer/cs262/ConcControl.pdf),” *ACM Transactions on Database Systems* (TODS), volume 12, number 4, pages 609–654, December 1987. [doi:10.1145/32204.32220](http://dx.doi.org/10.1145/32204.32220)
1. Dave Rosenthal: “[Databases at 14.4MHz](http://web.archive.org/web/20150427041746/http://blog.foundationdb.com/databases-at-14.4mhz),” *blog.foundationdb.com*, December 10, 2014.

================================================
FILE: content/v1_tw/ch8.md
================================================
---
title: "第八章：分散式系統的麻煩"
linkTitle: "8. 分散式系統的麻煩"
weight: 208
math: true
breadcrumbs: false
---


![](/map/ch08.png)

> 邂逅相遇
>
> 網路延遲
>
> 存之為吾
>
> 無食我數
>
> —— Kyle Kingsbury, Carly Rae Jepsen 《網路分割槽的危害》（2013 年）[^譯著1]


最近幾章中反覆出現的主題是，系統如何處理錯誤的事情。例如，我們討論了 **副本故障切換**（“[處理節點宕機](/v1_tw/ch5#處理節點宕機)”），**複製延遲**（“[複製延遲問題](/v1_tw/ch5#複製延遲問題)”）和事務控制（“[弱隔離級別](/v1_tw/ch7#弱隔離級別)”）。當我們瞭解可能在實際系統中出現的各種邊緣情況時，我們會更好地處理它們。

但是，儘管我們已經談了很多錯誤，但之前幾章仍然過於樂觀。現實更加黑暗。我們現在將悲觀主義最大化，假設任何可能出錯的東西 **都會** 出錯 [^i]。（經驗豐富的系統運維會告訴你，這是一個合理的假設。如果你問得好，他們可能會一邊治療心理創傷一邊告訴你一些可怕的故事）

[^i]: 除了一個例外：我們將假定故障是非拜占庭式的（請參閱 “[拜占庭故障](#拜占庭故障)”）。

使用分散式系統與在一臺計算機上編寫軟體有著根本的區別，主要的區別在於，有許多新穎和刺激的方法可以使事情出錯【1,2】。在這一章中，我們將瞭解實踐中出現的問題，理解我們能夠依賴，和不可以依賴的東西。

最後，作為工程師，我們的任務是構建能夠完成工作的系統（即滿足使用者期望的保證），儘管一切都出錯了。在 [第九章](/v1_tw/ch9) 中，我們將看看一些可以在分散式系統中提供這種保證的演算法的例子。但首先，在本章中，我們必須瞭解我們面臨的挑戰。

本章對分散式系統中可能出現的問題進行徹底的悲觀和沮喪的總結。我們將研究網路的問題（“[不可靠的網路](#不可靠的網路)”）; 時鐘和時序問題（“[不可靠的時鐘](#不可靠的時鐘)”）; 我們將討論他們可以避免的程度。所有這些問題的後果都令人困惑，所以我們將探索如何思考一個分散式系統的狀態，以及如何推理發生的事情（“[知識、真相與謊言](#知識、真相與謊言)”）。


## 故障與部分失效

當你在一臺計算機上編寫一個程式時，它通常會以一種相當可預測的方式執行：無論是工作還是不工作。充滿錯誤的軟體可能會讓人覺得電腦有時候也會有 “糟糕的一天”（這種問題通常是重新啟動就恢復了），但這主要是軟體寫得不好的結果。

單個計算機上的軟體沒有根本性的不可靠原因：當硬體正常工作時，相同的操作總是產生相同的結果（這是確定性的）。如果存在硬體問題（例如，記憶體損壞或聯結器鬆動），其後果通常是整個系統故障（例如，核心恐慌，“藍色畫面宕機”，啟動失敗）。裝有良好軟體的個人計算機通常要麼功能完好，要麼完全失效，而不是介於兩者之間。

這是計算機設計中的一個有意的選擇：如果發生內部錯誤，我們寧願電腦完全崩潰，而不是返回錯誤的結果，因為錯誤的結果很難處理。因此，計算機隱藏了模糊不清的物理實現，並呈現出一個理想化的系統模型，並以數學一樣的完美的方式運作。CPU 指令總是做同樣的事情；如果你將一些資料寫入記憶體或磁碟，那麼這些資料將保持不變，並且不會被隨機破壞。從第一臺數字計算機開始，*始終正確地計算* 這個設計目標貫穿始終【3】。

當你編寫執行在多臺計算機上的軟體時，情況有本質上的區別。在分散式系統中，我們不再處於理想化的系統模型中，我們別無選擇，只能面對現實世界的混亂現實。而在現實世界中，各種各樣的事情都可能會出現問題【4】，如下面的軼事所述：

> 在我有限的從業經歷中，我已經和很多東西打過交道：單個 **資料中心（DC）** 中長期存在的網路分割槽，配電單元 PDU 故障，交換機故障，整個機架的意外重啟，整個資料中心主幹網路故障，整個資料中心的電源故障，以及一個低血糖的司機把他的福特皮卡撞在資料中心的 HVAC（加熱，通風和空調）系統上。而且我甚至不是一個運維。
>
> —— Coda Hale

在分散式系統中，儘管系統的其他部分工作正常，但系統的某些部分可能會以某種不可預知的方式被破壞。這被稱為 **部分失效（partial failure）**。難點在於部分失效是 **不確定性的（nondeterministic）**：如果你試圖做任何涉及多個節點和網路的事情，它有時可能會工作，有時會出現不可預知的失敗。正如我們將要看到的，你甚至不知道是否成功了，因為訊息透過網路傳播的時間也是不確定的！

這種不確定性和部分失效的可能性，使得分散式系統難以工作【5】。

### 雲計算與超級計算機

關於如何構建大型計算系統有一系列的哲學：

* 一個極端是高效能計算（HPC）領域。具有數千個 CPU 的超級計算機通常用於計算密集型科學計算任務，如天氣預報或分子動力學（模擬原子和分子的運動）。
* 另一個極端是 **雲計算（cloud computing）**，雲計算並不是一個良好定義的概念【6】，但通常與多租戶資料中心，連線 IP 網路（通常是乙太網）的商用計算機，彈性 / 按需資源分配以及計量計費等相關聯。
* 傳統企業資料中心位於這兩個極端之間。

不同的哲學會導致不同的故障處理方式。在超級計算機中，作業通常會不時地將計算的狀態存檔到持久儲存中。如果一個節點出現故障，通常的解決方案是簡單地停止整個叢集的工作負載。故障節點修復後，計算從上一個檢查點重新開始【7,8】。因此，超級計算機更像是一個單節點計算機而不是分散式系統：透過讓部分失敗升級為完全失敗來處理部分失敗 —— 如果系統的任何部分發生故障，只是讓所有的東西都崩潰（就像單臺機器上的核心恐慌一樣）。

在本書中，我們將重點放在實現網際網路服務的系統上，這些系統通常與超級計算機看起來有很大不同：

* 許多與網際網路有關的應用程式都是 **線上（online）** 的，因為它們需要能夠隨時以低延遲服務使用者。使服務不可用（例如，停止叢集以進行修復）是不可接受的。相比之下，像天氣模擬這樣的離線（批處理）工作可以停止並重新啟動，影響相當小。

* 超級計算機通常由專用硬體構建而成，每個節點相當可靠，節點透過共享記憶體和 **遠端直接記憶體訪問（RDMA）** 進行通訊。另一方面，雲服務中的節點是由商用機器構建而成的，由於規模經濟，可以以較低的成本提供相同的效能，而且具有較高的故障率。

* 大型資料中心網路通常基於 IP 和乙太網，以 CLOS 拓撲排列，以提供更高的對分（bisection）頻寬【9】。超級計算機通常使用專門的網路拓撲結構，例如多維網格和 Torus 網路 【10】，這為具有已知通訊模式的 HPC 工作負載提供了更好的效能。

* 系統越大，其元件之一就越有可能壞掉。隨著時間的推移，壞掉的東西得到修復，新的東西又壞掉，但是在一個有成千上萬個節點的系統中，有理由認為總是有一些東西是壞掉的【7】。當錯誤處理的策略只由簡單放棄組成時，一個大的系統最終會花費大量時間從錯誤中恢復，而不是做有用的工作【8】。

* 如果系統可以容忍發生故障的節點，並繼續保持整體工作狀態，那麼這對於運營和維護非常有用：例如，可以執行滾動升級（請參閱 [第四章](/v1_tw/ch4)），一次重新啟動一個節點，同時繼續給使用者提供不中斷的服務。在雲環境中，如果一臺虛擬機器執行不佳，可以殺死它並請求一臺新的虛擬機器（希望新的虛擬機器速度更快）。

* 在地理位置分散的部署中（保持資料在地理位置上接近使用者以減少訪問延遲），通訊很可能透過網際網路進行，與本地網路相比，通訊速度緩慢且不可靠。超級計算機通常假設它們的所有節點都靠近在一起。

如果要使分散式系統工作，就必須接受部分故障的可能性，並在軟體中建立容錯機制。換句話說，我們需要從不可靠的元件構建一個可靠的系統（正如 “[可靠性](/v1_tw/ch1#可靠性)” 中所討論的那樣，沒有完美的可靠性，所以我們需要理解我們可以實際承諾的極限）。

即使在只有少數節點的小型系統中，考慮部分故障也是很重要的。在一個小系統中，很可能大部分元件在大部分時間都正常工作。然而，遲早會有一部分系統出現故障，軟體必須以某種方式處理。故障處理必須是軟體設計的一部分，並且作為軟體的運維，你需要知道在發生故障的情況下，軟體可能會表現出怎樣的行為。

簡單地假設缺陷很罕見並希望始終保持最好的狀況是不明智的。考慮一系列可能的錯誤（甚至是不太可能的錯誤），並在測試環境中人為地建立這些情況來檢視會發生什麼是非常重要的。在分散式系統中，懷疑，悲觀和偏執狂是值得的。

> #### 從不可靠的元件構建可靠的系統
>
> 你可能想知道這是否有意義 —— 直觀地看來，系統只能像其最不可靠的元件（最薄弱的環節）一樣可靠。事實並非如此：事實上，從不太可靠的潛在基礎構建更可靠的系統是計算機領域的一個古老思想【11】。例如：
>
> * 糾錯碼允許數字資料在通訊通道上準確傳輸，偶爾會出現一些錯誤，例如由於無線網路上的無線電干擾【12】。
> * **網際網路協議（Internet Protocol, IP）** 不可靠：可能丟棄、延遲、重複或重排資料包。傳輸控制協議（Transmission Control Protocol, TCP）在網際網路協議（IP）之上提供了更可靠的傳輸層：它確保丟失的資料包被重新傳輸，消除重複，並且資料包被重新組裝成它們被傳送的順序。
>
> 雖然這個系統可以比它的底層部分更可靠，但它的可靠性總是有限的。例如，糾錯碼可以處理少量的單位元錯誤，但是如果你的訊號被幹擾所淹沒，那麼透過通道可以得到多少資料，是有根本性的限制的【13】。TCP 可以隱藏資料包的丟失，重複和重新排序，但是它不能神奇地消除網路中的延遲。
>
> 雖然更可靠的高階系統並不完美，但它仍然有用，因為它處理了一些棘手的低階錯誤，所以其餘的錯誤通常更容易推理和處理。我們將在 “[資料庫的端到端原則](/v1_tw/ch12#資料庫的端到端原則)” 中進一步探討這個問題。


## 不可靠的網路

正如在 [第二部分](/v1_tw/part-ii) 的介紹中所討論的那樣，我們在本書中關注的分散式系統是無共享的系統，即透過網路連線的一堆機器。網路是這些機器可以通訊的唯一途徑 —— 我們假設每臺機器都有自己的記憶體和磁碟，一臺機器不能訪問另一臺機器的記憶體或磁碟（除了透過網路向伺服器發出請求）。

**無共享** 並不是構建系統的唯一方式，但它已經成為構建網際網路服務的主要方式，其原因如下：相對便宜，因為它不需要特殊的硬體，可以利用商品化的雲計算服務，透過跨多個地理分佈的資料中心進行冗餘可以實現高可靠性。

網際網路和資料中心（通常是乙太網）中的大多數內部網路都是 **非同步分組網路（asynchronous packet networks）**。在這種網路中，一個節點可以向另一個節點發送一個訊息（一個數據包），但是網路不能保證它什麼時候到達，或者是否到達。如果你傳送請求並期待響應，則很多事情可能會出錯（其中一些如 [圖 8-1](/v1/ddia_0801.png) 所示）：

1. 請求可能已經丟失（可能有人拔掉了網線）。
2. 請求可能正在排隊，稍後將交付（也許網路或接收方過載）。
3. 遠端節點可能已經失效（可能是崩潰或關機）。
4. 遠端節點可能暫時停止了響應（可能會遇到長時間的垃圾回收暫停；請參閱 “[程序暫停](#程序暫停)”），但稍後會再次響應。
5. 遠端節點可能已經處理了請求，但是網路上的響應已經丟失（可能是網路交換機配置錯誤）。
6. 遠端節點可能已經處理了請求，但是響應已經被延遲，並且稍後將被傳遞（可能是網路或者你自己的機器過載）。

![](/v1/ddia_0801.png)

**圖 8-1 如果傳送請求並沒有得到響應，則無法區分（a）請求是否丟失，（b）遠端節點是否關閉，或（c）響應是否丟失。**

傳送者甚至不能分辨資料包是否已送達：唯一的選擇是讓接收者傳送響應訊息，而響應訊息本身也可能會丟失或延遲。這些問題在非同步網路中難以區分：你所擁有的唯一資訊是，你尚未收到響應。如果你向另一個節點發送請求並且沒有收到響應，則不可能判斷是什麼原因。

處理這個問題的通常方法是 **超時（Timeout）**：在一段時間之後放棄等待，並且認為響應不會到達。但是，當發生超時時，你仍然不知道遠端節點是否收到了請求（如果請求仍然在某個地方排隊，那麼即使傳送者已經放棄了該請求，仍然可能會將其傳送給接收者）。

### 真實世界的網路故障

我們幾十年來一直在建設計算機網路 —— 有人可能希望現在我們已經找出了使網路變得可靠的方法。但是現在似乎還沒有成功。

有一些系統的研究和大量的軼事證據表明，即使在像一家公司運營的資料中心那樣的受控環境中，網路問題也可能出乎意料地普遍。在一家中型資料中心進行的一項研究發現，每個月大約有 12 個網路故障，其中一半斷開一臺機器，一半斷開整個機架【15】。另一項研究測量了架頂式交換機，匯聚交換機和負載平衡器等元件的故障率【16】。它發現新增冗餘網路裝置不會像你所希望的那樣減少故障，因為它不能防範人為錯誤（例如，錯誤配置的交換機），這是造成中斷的主要原因。

諸如 EC2 之類的公有云服務因頻繁的暫態網路故障而臭名昭著【14】，管理良好的私有資料中心網路可能是更穩定的環境。儘管如此，沒有人不受網路問題的困擾：例如，交換機軟體升級過程中的一個問題可能會引發網路拓撲重構，在此期間網路資料包可能會延遲超過一分鐘【17】。鯊魚可能咬住海底電纜並損壞它們 【18】。其他令人驚訝的故障包括網路介面有時會丟棄所有入站資料包，但是成功傳送出站資料包 【19】：僅僅因為網路連結在一個方向上工作，並不能保證它也在相反的方向工作。

> #### 網路分割槽
>
> 當網路的一部分由於網路故障而被切斷時，有時稱為 **網路分割槽（network partition）** 或 **網路斷裂（netsplit）**。在本書中，我們通常會堅持使用更一般的術語 **網路故障（network fault）**，以避免與 [第六章](/v1_tw/ch6) 討論的儲存系統的分割槽（分片）相混淆。

即使網路故障在你的環境中非常罕見，故障可能發生的事實，意味著你的軟體需要能夠處理它們。無論何時透過網路進行通訊，都可能會失敗，這是無法避免的。

如果網路故障的錯誤處理沒有定義與測試，各種任意糟糕的事情都可能發生：例如，叢集可能會發生 **死鎖**，即使網路恢復後也永久無法為請求提供服務【20】，甚至可能會刪除所有的資料【21】。如果軟體被置於意料之外的情況下，它可能會做出出乎意料的事情。

處理網路故障並不一定意味著容忍它們：如果你的網路通常是相當可靠的，一個有效的方法可能是當你的網路遇到問題時，簡單地向用戶顯示一條錯誤資訊。但是，你確實需要知道你的軟體如何應對網路問題，並確保系統能夠從中恢復。你可能需要有意識地觸發網路問題並測試系統響應（這是 Chaos Monkey 背後的想法；請參閱 “[可靠性](/v1_tw/ch1#可靠性)”）。

### 檢測故障

許多系統需要自動檢測故障節點。例如：

* 負載平衡器需要停止向已死亡的節點轉發請求（從輪詢列表移出，即 out of rotation）。
* 在單主複製功能的分散式資料庫中，如果主庫失效，則需要將從庫之一升級為新主庫（請參閱 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)”）。

不幸的是，網路的不確定性使得很難判斷一個節點是否工作。在某些特定的情況下，你可能會收到一些反饋資訊，明確告訴你某些事情沒有成功：

* 如果你可以連線到執行節點的機器，但沒有程序正在偵聽目標埠（例如，因為程序崩潰），作業系統將透過傳送 FIN 或 RST 來關閉或拒絕 TCP 連線。但是，如果節點在處理請求時發生崩潰，則無法知道遠端節點實際處理了多少資料【22】。
* 如果節點程序崩潰（或被管理員殺死），但節點的作業系統仍在執行，則指令碼可以通知其他節點有關該崩潰的資訊，以便另一個節點可以快速接管，而無需等待超時到期。例如，HBase 就是這麼做的【23】。
* 如果你有權訪問資料中心網路交換機的管理介面，則可以透過它們檢測硬體級別的鏈路故障（例如，遠端機器是否關閉電源）。如果你透過網際網路連線，或者如果你處於共享資料中心而無法訪問交換機，或者由於網路問題而無法訪問管理介面，則排除此選項。
* 如果路由器確認你嘗試連線的 IP 地址不可用，則可能會使用 ICMP 目標不可達資料包回覆你。但是，路由器不具備神奇的故障檢測能力 —— 它受到與網路其他參與者相同的限制。

關於遠端節點關閉的快速反饋很有用，但是你不能指望它。即使 TCP 確認已經傳送了一個數據包，應用程式在處理之前可能已經崩潰。如果你想確保一個請求是成功的，你需要應用程式本身的正確響應【24】。

相反，如果出了什麼問題，你可能會在堆疊的某個層次上得到一個錯誤響應，但總的來說，你必須假設你可能根本就得不到任何回應。你可以重試幾次（TCP 重試是透明的，但是你也可以在應用程式級別重試），等待超時過期，並且如果在超時時間內沒有收到響應，則最終宣告節點已經死亡。

### 超時與無窮的延遲

如果超時是檢測故障的唯一可靠方法，那麼超時應該等待多久？不幸的是沒有簡單的答案。

長時間的超時意味著長時間等待，直到一個節點被宣告死亡（在這段時間內，使用者可能不得不等待，或者看到錯誤資訊）。短的超時可以更快地檢測到故障，但有更高的風險誤將一個節點宣佈為失效，而該節點實際上只是暫時地變慢了（例如由於節點或網路上的負載峰值）。

過早地宣告一個節點已經死了是有問題的：如果這個節點實際上是活著的，並且正在執行一些動作（例如，傳送一封電子郵件），而另一個節點接管，那麼這個動作可能會最終執行兩次。我們將在 “[知識、真相與謊言](#知識、真相與謊言)” 以及 [第九章](/v1_tw/ch9) 和 [第十一章](/v1_tw/ch11) 中更詳細地討論這個問題。

當一個節點被宣告死亡時，它的職責需要轉移到其他節點，這會給其他節點和網路帶來額外的負擔。如果系統已經處於高負荷狀態，則過早宣告節點死亡會使問題更嚴重。特別是如果節點實際上沒有死亡，只是由於過載導致其響應緩慢；這時將其負載轉移到其他節點可能會導致 **級聯失效**（即 cascading failure，表示在極端情況下，所有節點都宣告對方死亡，所有節點都將停止工作）。

設想一個虛構的系統，其網路可以保證資料包的最大延遲 —— 每個資料包要麼在一段時間內傳送，要麼丟失，但是傳遞永遠不會比 $d$ 更長。此外，假設你可以保證一個非故障節點總是在一段時間 $r$ 內處理一個請求。在這種情況下，你可以保證每個成功的請求在 $2d + r$ 時間內都能收到響應，如果你在此時間內沒有收到響應，則知道網路或遠端節點不工作。如果這是成立的，$2d + r$ 會是一個合理的超時設定。

不幸的是，我們所使用的大多數系統都沒有這些保證：非同步網路具有無限的延遲（即儘可能快地傳送資料包，但資料包到達可能需要的時間沒有上限），並且大多數伺服器實現並不能保證它們可以在一定的最大時間內處理請求（請參閱 “[響應時間保證](#響應時間保證)”）。對於故障檢測，即使系統大部分時間快速執行也是不夠的：如果你的超時時間很短，往返時間只需要一個瞬時尖峰就可以使系統失衡。

#### 網路擁塞和排隊

在駕駛汽車時，由於交通擁堵，道路交通網路的通行時間往往不盡相同。同樣，計算機網路上資料包延遲的可變性通常是由於排隊【25】：

* 如果多個不同的節點同時嘗試將資料包傳送到同一目的地，則網路交換機必須將它們排隊並將它們逐個送入目標網路鏈路（如 [圖 8-2](/v1/ddia_0802.png) 所示）。在繁忙的網路鏈路上，資料包可能需要等待一段時間才能獲得一個插槽（這稱為網路擁塞）。如果傳入的資料太多，交換機佇列填滿，資料包將被丟棄，因此需要重新發送資料包 - 即使網路執行良好。
* 當資料包到達目標機器時，如果所有 CPU 核心當前都處於繁忙狀態，則來自網路的傳入請求將被作業系統排隊，直到應用程式準備好處理它為止。根據機器上的負載，這可能需要一段任意的時間。
* 在虛擬化環境中，正在執行的作業系統經常暫停幾十毫秒，因為另一個虛擬機器正在使用 CPU 核心。在這段時間內，虛擬機器不能從網路中消耗任何資料，所以傳入的資料被虛擬機器監視器 【26】排隊（緩衝），進一步增加了網路延遲的可變性。
* TCP 執行 **流量控制**（flow control，也稱為 **擁塞避免**，即 congestion avoidance，或 **背壓**，即 backpressure），其中節點會限制自己的傳送速率以避免網路鏈路或接收節點過載【27】。這意味著甚至在資料進入網路之前，在傳送者處就需要進行額外的排隊。

![](/v1/ddia_0802.png)

**圖 8-2 如果有多臺機器將網路流量傳送到同一目的地，則其交換機佇列可能會被填滿。在這裡，埠 1,2 和 4 都試圖傳送資料包到埠 3**

而且，如果資料包在某個超時時間內沒有被確認（超時時間是根據觀察到的往返時間計算的），TCP 就會認為該資料包丟失，丟失的資料包將自動重新發送。儘管應用程式沒有看到資料包丟失和重新傳輸，但它看到了延遲（等待超時到期，然後等待重新傳輸的資料包得到確認）。


> #### TCP與UDP
>
> 一些對延遲敏感的應用程式，比如視訊會議和 IP 語音（VoIP），使用了 UDP 而不是 TCP。這是在可靠性和延遲變化之間的折衷：由於 UDP 不執行流量控制並且不重傳丟失的分組，所以避免了網路延遲變化的一些原因（儘管它仍然易受交換機佇列和排程延遲的影響）。
>
> 在延遲資料毫無價值的情況下，UDP 是一個不錯的選擇。例如，在 VoIP 電話呼叫中，可能沒有足夠的時間在資料需要於揚聲器上播放之前重新發送丟失的資料包。在這種情況下，重發資料包沒有意義 —— 應用程式必須使用靜音填充丟失資料包的時隙（導致聲音短暫中斷），然後在資料流中繼續。重試發生在人類層（“你能再說一遍嗎？聲音剛剛斷了一會兒。”）。

所有這些因素都會造成網路延遲的變化。當系統接近其最大容量時，排隊延遲的變化範圍特別大：擁有足夠備用容量的系統可以輕鬆排空佇列，而在高利用率的系統中，很快就能積累很長的佇列。

在公共雲和多租戶資料中心中，資源被許多客戶共享：網路連結和交換機，甚至每個機器的網絡卡和 CPU（在虛擬機器上執行時）。批處理工作負載（如 MapReduce，請參閱 [第十章](/v1_tw/ch10)）能夠很容易使網路連結飽和。由於無法控制或瞭解其他客戶對共享資源的使用情況，如果附近的某個人（嘈雜的鄰居）正在使用大量資源，則網路延遲可能會發生劇烈變化【28,29】。

在這種環境下，你只能透過實驗方式選擇超時：在一段較長的時期內、在多臺機器上測量網路往返時間的分佈，以確定延遲的預期變化。然後，考慮到應用程式的特性，可以確定 **故障檢測延遲** 與 **過早超時風險** 之間的適當折衷。

更好的一種做法是，系統不是使用配置的常量超時時間，而是連續測量響應時間及其變化（抖動），並根據觀察到的響應時間分佈自動調整超時時間。這可以透過 Phi Accrual 故障檢測器【30】來完成，該檢測器在例如 Akka 和 Cassandra 【31】中使用。TCP 的超時重傳機制也是以類似的方式工作【27】。

### 同步網路與非同步網路

如果我們可以依靠網路來傳遞一些 **最大延遲固定** 的資料包，而不是丟棄資料包，那麼分散式系統就會簡單得多。為什麼我們不能在硬體層面上解決這個問題，使網路可靠，使軟體不必擔心呢？

為了回答這個問題，將資料中心網路與非常可靠的傳統固定電話網路（非蜂窩，非 VoIP）進行比較是很有趣的：延遲音訊幀和掉話是非常罕見的。一個電話需要一個很低的端到端延遲，以及足夠的頻寬來傳輸你聲音的音訊取樣資料。在計算機網路中有類似的可靠性和可預測性不是很好嗎？

當你透過電話網路撥打電話時，它會建立一個電路：在兩個呼叫者之間的整個路線上為呼叫分配一個固定的，有保證的頻寬量。這個電路會保持至通話結束【32】。例如，ISDN 網路以每秒 4000 幀的固定速率執行。呼叫建立時，每個幀內（每個方向）分配 16 位空間。因此，在通話期間，每一方都保證能夠每 250 微秒傳送一個精確的 16 位音訊資料【33,34】。

這種網路是同步的：即使資料經過多個路由器，也不會受到排隊的影響，因為呼叫的 16 位空間已經在網路的下一跳中保留了下來。而且由於沒有排隊，網路的最大端到端延遲是固定的。我們稱之為 **有限延遲（bounded delay）**。

#### 我們不能簡單地使網路延遲可預測嗎？

請注意，電話網路中的電路與 TCP 連線有很大不同：電路是固定數量的預留頻寬，在電路建立時沒有其他人可以使用，而 TCP 連線的資料包 **機會性地** 使用任何可用的網路頻寬。你可以給 TCP 一個可變大小的資料塊（例如，一個電子郵件或一個網頁），它會盡可能在最短的時間內傳輸它。TCP 連線空閒時，不使用任何頻寬 [^ii]。

[^ii]: 除了偶爾的 keepalive 資料包，如果 TCP keepalive 被啟用。

如果資料中心網路和網際網路是電路交換網路，那麼在建立電路時就可以建立一個受保證的最大往返時間。但是，它們並不能這樣：乙太網和 IP 是 **分組交換協議**，不得不忍受排隊的折磨和因此導致的網路無限延遲，這些協議沒有電路的概念。

為什麼資料中心網路和網際網路使用分組交換？答案是，它們針對 **突發流量（bursty traffic）** 進行了最佳化。一個電路適用於音訊或視訊通話，在通話期間需要每秒傳送相當數量的位元。另一方面，請求網頁，傳送電子郵件或傳輸檔案沒有任何特定的頻寬要求 —— 我們只是希望它儘快完成。

如果想透過電路傳輸檔案，你得預測一個頻寬分配。如果你猜的太低，傳輸速度會不必要的太慢，導致網路容量閒置。如果你猜的太高，電路就無法建立（因為如果無法保證其頻寬分配，網路不能建立電路）。因此，將電路用於突發資料傳輸會浪費網路容量，並且使傳輸不必要地緩慢。相比之下，TCP 動態調整資料傳輸速率以適應可用的網路容量。

已經有一些嘗試去建立同時支援電路交換和分組交換的混合網路，比如 ATM [^iii]。InfiniBand 有一些相似之處【35】：它在鏈路層實現了端到端的流量控制，從而減少了在網路中排隊的需要，儘管它仍然可能因鏈路擁塞而受到延遲【36】。透過仔細使用 **服務質量**（quality of service，即 QoS，資料包的優先順序和排程）和 **准入控制**（admission control，限速傳送器），可以在分組網路上類比電路交換，或提供統計上的 **有限延遲**【25,32】。

[^iii]: **非同步傳輸模式（Asynchronous Transfer Mode, ATM）** 在 20 世紀 80 年代是乙太網的競爭對手【32】，但在電話網核心交換機之外並沒有得到太多的採用。它與自動櫃員機（也稱為自動取款機）無關，儘管共用一個縮寫詞。或許，在一些平行的世界裡，網際網路是基於像 ATM 這樣的東西，因此它們的網際網路視訊通話可能比我們的更可靠，因為它們不會遭受包的丟失和延遲。

但是，目前在多租戶資料中心和公共雲或透過網際網路 [^iv] 進行通訊時，此類服務質量尚未啟用。當前部署的技術不允許我們對網路的延遲或可靠性作出任何保證：我們必須假設網路擁塞，排隊和無限的延遲總是會發生。因此，超時時間沒有 “正確” 的值 —— 它需要透過實驗來確定。

[^iv]: 網際網路服務提供商之間的對等協議和透過 **邊界閘道器協議（BGP）** 建立的路由，與 IP 協議相比，更接近於電路交換。在這個級別上，可以購買專用頻寬。但是，網際網路路由在網路級別執行，而不是主機之間的單獨連線，而且執行時間要長得多。

> ### 延遲和資源利用
>
> 更一般地說，可以將 **延遲變化** 視為 **動態資源分割槽** 的結果。
>
> 假設兩臺電話交換機之間有一條線路，可以同時進行 10,000 個呼叫。透過此線路切換的每個電路都佔用其中一個呼叫插槽。因此，你可以將線路視為可由多達 10,000 個併發使用者共享的資源。資源以靜態方式分配：即使你現在是線路上唯一的呼叫，並且所有其他 9,999 個插槽都未使用，你的電路仍將分配與線路充分利用時相同的固定數量的頻寬。
>
> 相比之下，網際網路動態分享網路頻寬。傳送者互相推擠和爭奪，以讓他們的資料包儘可能快地透過網路，並且網路交換機決定從一個時刻到另一個時刻傳送哪個分組（即，頻寬分配）。這種方法有排隊的缺點，但其優點是它最大限度地利用了線路。線路固定成本，所以如果你更好地利用它，你透過線路傳送的每個位元組都會更便宜。
>
> CPU 也會出現類似的情況：如果你在多個執行緒間動態共享每個 CPU 核心，則一個執行緒有時必須在作業系統的執行佇列裡等待，而另一個執行緒正在執行，這樣每個執行緒都有可能被暫停一個不定的時間長度。但是，與為每個執行緒分配靜態數量的 CPU 週期相比，這會更好地利用硬體（請參閱 “[響應時間保證](#響應時間保證)”）。更好的硬體利用率也是使用虛擬機器的重要動機。
>
> 如果資源是靜態分割槽的（例如，專用硬體和專用頻寬分配），則在某些環境中可以實現 **延遲保證**。但是，這是以降低利用率為代價的 —— 換句話說，它是更昂貴的。另一方面，動態資源分配的多租戶提供了更好的利用率，所以它更便宜，但它具有可變延遲的缺點。
>
> 網路中的可變延遲不是一種自然規律，而只是成本 / 收益權衡的結果。


## 不可靠的時鐘

時鐘和時間很重要。應用程式以各種方式依賴於時鐘來回答以下問題：

1. 這個請求是否超時了？
2. 這項服務的第 99 百分位響應時間是多少？
3. 在過去五分鐘內，該服務平均每秒處理多少個查詢？
4. 使用者在我們的網站上花了多長時間？
5. 這篇文章在何時釋出？
6. 在什麼時間傳送提醒郵件？
7. 這個快取條目何時到期？
8. 日誌檔案中此錯誤訊息的時間戳是什麼？

上面列表中的第 1-4 個例子測量的是 **持續時間**（durations，例如，請求傳送與響應接收之間的時間間隔），而第 5-8 個例子描述的是 **時間點**（point in time，在特定日期和特定時間發生的事件）。

在分散式系統中，時間是一件棘手的事情，因為通訊不是即時的：訊息透過網路從一臺機器傳送到另一臺機器需要時間。收到訊息的時間總是晚於傳送的時間，但是由於網路中的可變延遲，我們不知道晚了多少時間。這個事實導致有時很難確定在涉及多臺機器時發生事情的順序。

而且，網路上的每臺機器都有自己的時鐘，這是一個實際的硬體裝置：通常是石英晶體振盪器。這些裝置不是完全準確的，所以每臺機器都有自己的時間概念，可能比其他機器稍快或更慢。可以在一定程度上同步時鐘：最常用的機制是 **網路時間協議（NTP）**，它允許根據一組伺服器報告的時間來調整計算機時鐘【37】。伺服器則從更精確的時間源（如 GPS 接收機）獲取時間。

### 單調鍾與日曆時鐘

現代計算機至少有兩種不同的時鐘：日曆時鐘（time-of-day clock）和單調鍾（monotonic clock）。儘管它們都衡量時間，但區分這兩者很重要，因為它們有不同的目的。

#### 日曆時鐘

日曆時鐘是你直觀地瞭解時鐘的依據：它根據某個日曆（也稱為 **掛鐘時間**，即 wall-clock time）返回當前日期和時間。例如，Linux 上的 `clock_gettime(CLOCK_REALTIME)`[^v] 和 Java 中的 `System.currentTimeMillis()` 返回自 epoch（UTC 時間 1970 年 1 月 1 日午夜）以來的秒數（或毫秒），根據公曆（Gregorian）日曆，不包括閏秒。有些系統使用其他日期作為參考點。

[^v]: 雖然該時鐘被稱為即時時鐘，但它與即時作業系統無關，如 “[響應時間保證](#響應時間保證)” 中所述。

日曆時鐘通常與 NTP 同步，這意味著來自一臺機器的時間戳（理想情況下）與另一臺機器上的時間戳相同。但是如下節所述，日曆時鐘也具有各種各樣的奇特之處。特別是，如果本地時鐘在 NTP 伺服器之前太遠，則它可能會被強制重置，看上去好像跳回了先前的時間點。這些跳躍以及他們經常忽略閏秒的事實，使日曆時鐘不能用於測量經過時間（elapsed time）【38】。

歷史上的日曆時鐘還具有相當粗略的解析度，例如，在較早的 Windows 系統上以 10 毫秒為單位前進【39】。在最近的系統中這已經不是一個問題了。

#### 單調鍾

單調鍾適用於測量持續時間（時間間隔），例如超時或服務的響應時間：Linux 上的 `clock_gettime(CLOCK_MONOTONIC)`，和 Java 中的 `System.nanoTime()` 都是單調時鐘。這個名字來源於他們保證總是往前走的事實（而日曆時鐘可以往回跳）。

你可以在某個時間點檢查單調鐘的值，做一些事情，且稍後再次檢查它。這兩個值之間的差異告訴你兩次檢查之間經過了多長時間。但單調鐘的絕對值是毫無意義的：它可能是計算機啟動以來的納秒數，或類似的任意值。特別是比較來自兩臺不同計算機的單調鐘的值是沒有意義的，因為它們並不是一回事。

在具有多個 CPU 插槽的伺服器上，每個 CPU 可能有一個單獨的計時器，但不一定與其他 CPU 同步。作業系統會補償所有的差異，並嘗試嚮應用執行緒表現出單調鐘的樣子，即使這些執行緒被排程到不同的 CPU 上。當然，明智的做法是不要太把這種單調性保證當回事【40】。

如果 NTP 協議檢測到計算機的本地石英鐘比 NTP 伺服器要更快或更慢，則可以調整單調鐘向前走的頻率（這稱為 **偏移（slewing）** 時鐘）。預設情況下，NTP 允許時鐘速率增加或減慢最高至 0.05%，但 NTP 不能使單調時鐘向前或向後跳轉。單調時鐘的解析度通常相當好：在大多數系統中，它們能在幾微秒或更短的時間內測量時間間隔。

在分散式系統中，使用單調鍾測量 **經過時間**（elapsed time，比如超時）通常很好，因為它不假定不同節點的時鐘之間存在任何同步，並且對測量的輕微不準確性不敏感。

### 時鐘同步與準確性

單調鐘不需要同步，但是日曆時鐘需要根據 NTP 伺服器或其他外部時間源來設定才能有用。不幸的是，我們獲取時鐘的方法並不像你所希望的那樣可靠或準確 —— 硬體時鐘和 NTP 可能會變幻莫測。舉幾個例子：

* 計算機中的石英鐘不夠精確：它會 **漂移**（drifts，即執行速度快於或慢於預期）。時鐘漂移取決於機器的溫度。Google 假設其伺服器時鐘漂移為 200 ppm（百萬分之一）【41】，相當於每 30 秒與伺服器重新同步一次的時鐘漂移為 6 毫秒，或者每天重新同步的時鐘漂移為 17 秒。即使一切工作正常，此漂移也會限制可以達到的最佳準確度。
* 如果計算機的時鐘與 NTP 伺服器的時鐘差別太大，可能會拒絕同步，或者本地時鐘將被強制重置【37】。任何觀察重置前後時間的應用程式都可能會看到時間倒退或突然跳躍。
* 如果某個節點被 NTP 伺服器的防火牆意外阻塞，有可能會持續一段時間都沒有人會注意到。有證據表明，這在實踐中確實發生過。
* NTP 同步只能和網路延遲一樣好，所以當你在擁有可變資料包延遲的擁塞網路上時，NTP 同步的準確性會受到限制。一個實驗表明，當透過網際網路同步時，35 毫秒的最小誤差是可以實現的【42】，儘管偶爾的網路延遲峰值會導致大約一秒的誤差。根據配置，較大的網路延遲會導致 NTP 客戶端完全放棄。
* 一些 NTP 伺服器是錯誤的或者配置錯誤的，報告的時間可能相差幾個小時【43,44】。還好 NTP 客戶端非常健壯，因為他們會查詢多個伺服器並忽略異常值。無論如何，依賴於網際網路上的陌生人所告訴你的時間來保證你的系統的正確性，這還挺讓人擔憂的。
* 閏秒導致一分鐘可能有 59 秒或 61 秒，這會打破一些在設計之時未考慮閏秒的系統的時序假設【45】。閏秒已經使許多大型系統崩潰的事實【38,46】說明了，關於時鐘的錯誤假設是多麼容易偷偷溜入系統中。處理閏秒的最佳方法可能是讓 NTP 伺服器 “撒謊”，並在一天中逐漸執行閏秒調整（這被稱為 **拖尾**，即 smearing）【47,48】，雖然實際的 NTP 伺服器表現各異【49】。
* 在虛擬機器中，硬體時鐘被虛擬化，這對於需要精確計時的應用程式提出了額外的挑戰【50】。當一個 CPU 核心在虛擬機器之間共享時，每個虛擬機器都會暫停幾十毫秒，與此同時另一個虛擬機器正在執行。從應用程式的角度來看，這種停頓表現為時鐘突然向前跳躍【26】。
* 如果你在沒有完整控制權的裝置（例如，移動裝置或嵌入式裝置）上執行軟體，則可能完全不能信任該裝置的硬體時鐘。一些使用者故意將其硬體時鐘設定為不正確的日期和時間，例如，為了規避遊戲中的時間限制，時鐘可能會被設定到很遠的過去或將來。

如果你足夠在乎這件事並投入大量資源，就可以達到非常好的時鐘精度。例如，針對金融機構的歐洲法規草案 MiFID II 要求所有高頻率交易基金在 UTC 時間 100 微秒內同步時鐘，以便除錯 “閃崩” 等市場異常現象，並幫助檢測市場操縱【51】。

透過 GPS 接收機，精確時間協議（PTP）【52】以及仔細的部署和監測可以實現這種精確度。然而，這需要很多努力和專業知識，而且有很多東西都會導致時鐘同步錯誤。如果你的 NTP 守護程序配置錯誤，或者防火牆阻止了 NTP 通訊，由漂移引起的時鐘誤差可能很快就會變大。

### 依賴同步時鐘

時鐘的問題在於，雖然它們看起來簡單易用，但卻具有令人驚訝的缺陷：一天可能不會有精確的 86,400 秒，**日曆時鐘** 可能會前後跳躍，而一個節點上的時間可能與另一個節點上的時間完全不同。

本章早些時候，我們討論了網路丟包和任意延遲包的問題。儘管網路在大多數情況下表現良好，但軟體的設計必須假定網路偶爾會出現故障，而軟體必須正常處理這些故障。時鐘也是如此：儘管大多數時間都工作得很好，但需要準備健壯的軟體來處理不正確的時鐘。

有一部分問題是，不正確的時鐘很容易被視而不見。如果一臺機器的 CPU 出現故障或者網路配置錯誤，很可能根本無法工作，所以很快就會被注意和修復。另一方面，如果它的石英時鐘有缺陷，或者它的 NTP 客戶端配置錯誤，大部分事情似乎仍然可以正常工作，即使它的時鐘逐漸偏離現實。如果某個軟體依賴於精確同步的時鐘，那麼結果更可能是悄無聲息的，僅有微量的資料丟失，而不是一次驚天動地的崩潰【53,54】。

因此，如果你使用需要同步時鐘的軟體，必須仔細監控所有機器之間的時鐘偏移。時鐘偏離其他時鐘太遠的節點應當被宣告死亡，並從叢集中移除。這樣的監控可以確保你在損失發生之前注意到破損的時鐘。

#### 有序事件的時間戳

讓我們考慮一個特別的情況，一件很有誘惑但也很危險的事情：依賴時鐘，在多個節點上對事件進行排序。例如，如果兩個客戶端寫入分散式資料庫，誰先到達？ 哪一個更近？

[圖 8-3](/v1/ddia_0803.png) 顯示了在具有多主複製的資料庫中對時鐘的危險使用（該例子類似於 [圖 5-9](/v1/ddia_0509.png)）。客戶端 A 在節點 1 上寫入 `x = 1`；寫入被複制到節點 3；客戶端 B 在節點 3 上增加 x（我們現在有 `x = 2`）；最後這兩個寫入都被複制到節點 2。

![](/v1/ddia_0803.png)

**圖 8-3 客戶端 B 的寫入比客戶端 A 的寫入要晚，但是 B 的寫入具有較早的時間戳。**

在 [圖 8-3](/v1/ddia_0803.png) 中，當一個寫入被複制到其他節點時，它會根據發生寫入的節點上的日曆時鐘標記一個時間戳。在這個例子中，時鐘同步是非常好的：節點 1 和節點 3 之間的偏差小於 3ms，這可能比你在實踐中能預期的更好。

儘管如此，[圖 8-3](/v1/ddia_0803.png) 中的時間戳卻無法正確排列事件：寫入 `x = 1` 的時間戳為 42.004 秒，但寫入 `x = 2` 的時間戳為 42.003 秒，即使 `x = 2` 在稍後出現。當節點 2 接收到這兩個事件時，會錯誤地推斷出 `x = 1` 是最近的值，而丟棄寫入 `x = 2`。效果上表現為，客戶端 B 的增量操作會丟失。

這種衝突解決策略被稱為 **最後寫入勝利（LWW）**，它在多主複製和無主資料庫（如 Cassandra 【53】和 Riak 【54】）中被廣泛使用（請參閱 “[最後寫入勝利（丟棄併發寫入）](/v1_tw/ch5#最後寫入勝利（丟棄併發寫入）)” 一節）。有些實現會在客戶端而不是伺服器上生成時間戳，但這並不能改變 LWW 的基本問題：

* 資料庫寫入可能會神秘地消失：具有滯後時鐘的節點無法覆蓋之前具有快速時鐘的節點寫入的值，直到節點之間的時鐘偏差消逝【54,55】。此方案可能導致一定數量的資料被悄悄丟棄，而未嚮應用報告任何錯誤。
* LWW 無法區分 **高頻順序寫入**（在 [圖 8-3](/v1/ddia_0803.png) 中，客戶端 B 的增量操作 **一定** 發生在客戶端 A 的寫入之後）和 **真正併發寫入**（寫入者意識不到其他寫入者）。需要額外的因果關係跟蹤機制（例如版本向量），以防止違背因果關係（請參閱 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)”）。
* 兩個節點很可能獨立地生成具有相同時間戳的寫入，特別是在時鐘僅具有毫秒解析度的情況下。為了解決這樣的衝突，還需要一個額外的 **決勝值**（tiebreaker，可以簡單地是一個大隨機數），但這種方法也可能會導致違背因果關係【53】。

因此，儘管透過保留 “最近” 的值並放棄其他值來解決衝突是很誘惑人的，但是要注意，“最近” 的定義取決於本地的 **日曆時鐘**，這很可能是不正確的。即使用嚴格同步的 NTP 時鐘，一個數據包也可能在時間戳 100 毫秒（根據傳送者的時鐘）時傳送，並在時間戳 99 毫秒（根據接收者的時鐘）處到達 —— 看起來好像資料包在傳送之前已經到達，這是不可能的。

NTP 同步是否能足夠準確，以至於這種不正確的排序不會發生？也許不能，因為 NTP 的同步精度本身，除了石英鐘漂移這類誤差源之外，還受到網路往返時間的限制。為了進行正確的排序，你需要一個比測量物件（即網路延遲）要精確得多的時鐘。

所謂的 **邏輯時鐘（logical clock）**【56,57】是基於遞增計數器而不是振盪石英晶體，對於排序事件來說是更安全的選擇（請參閱 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)”）。邏輯時鐘不測量一天中的時間或經過的秒數，而僅測量事件的相對順序（即一個事件發生在另一個事件之前還是之後）。相反，用來測量實際經過時間的 **日曆時鐘** 和 **單調鐘** 也被稱為 **物理時鐘（physical clock）**。我們將在 “[順序保證](/v1_tw/ch9#順序保證)” 中來看順序問題。

#### 時鐘讀數存在置信區間

你可能能夠以微秒或甚至納秒的精度讀取機器的時鐘。但即使可以得到如此細緻的測量結果，這並不意味著這個值對於這樣的精度實際上是準確的。實際上，大機率是不準確的 —— 如前所述，即使你每分鐘與本地網路上的 NTP 伺服器進行同步，幾毫秒的時間漂移也很容易在不精確的石英時鐘上發生。使用公共網際網路上的 NTP 伺服器，最好的準確度可能達到幾十毫秒，而且當網路擁塞時，誤差可能會超過 100 毫秒【57】。

因此，將時鐘讀數視為一個時間點是沒有意義的 —— 它更像是一段時間範圍：例如，一個系統可能以 95% 的置信度認為當前時間處於本分鐘內的第 10.3 秒和 10.5 秒之間，它可能沒法比這更精確了【58】。如果我們只知道 ±100 毫秒的時間，那麼時間戳中的微秒數字部分基本上是沒有意義的。

不確定性界限可以根據你的時間源來計算。如果你的 GPS 接收器或原子（銫）時鐘直接連線到你的計算機上，預期的錯誤範圍由製造商告知。如果從伺服器獲得時間，則不確定性取決於自上次與伺服器同步以來的石英鐘漂移的期望值，加上 NTP 伺服器的不確定性，再加上到伺服器的網路往返時間（只是獲取粗略近似值，並假設伺服器是可信的）。

不幸的是，大多數系統不公開這種不確定性：例如，當呼叫 `clock_gettime()` 時，返回值不會告訴你時間戳的預期錯誤，所以你不知道其置信區間是 5 毫秒還是 5 年。

一個有趣的例外是 Spanner 中的 Google TrueTime API 【41】，它明確地報告了本地時鐘的置信區間。當你詢問當前時間時，你會得到兩個值：[最早，最晚]，這是最早可能的時間戳和最晚可能的時間戳。在不確定性估計的基礎上，時鐘知道當前的實際時間落在該區間內。區間的寬度取決於自從本地石英鐘最後與更精確的時鐘源同步以來已經過了多長時間。

#### 全域性快照的同步時鐘

在 “[快照隔離和可重複讀](/v1_tw/ch7#快照隔離和可重複讀)” 中，我們討論了快照隔離，這是資料庫中非常有用的功能，需要支援小型快速讀寫事務和大型長時間執行的只讀事務（用於備份或分析）。它允許只讀事務看到特定時間點的處於一致狀態的資料庫，且不會鎖定和干擾讀寫事務。

快照隔離最常見的實現需要單調遞增的事務 ID。如果寫入比快照晚（即，寫入具有比快照更大的事務 ID），則該寫入對於快照事務是不可見的。在單節點資料庫上，一個簡單的計數器就足以生成事務 ID。

但是當資料庫分佈在許多機器上，也許可能在多個數據中心中時，由於需要協調，（跨所有分割槽）全域性單調遞增的事務 ID 會很難生成。事務 ID 必須反映因果關係：如果事務 B 讀取由事務 A 寫入的值，則 B 必須具有比 A 更大的事務 ID，否則快照就無法保持一致。在有大量的小規模、高頻率的事務情景下，在分散式系統中建立事務 ID 成為一個難以處理的瓶頸 [^vi]。

[^vi]: 存在分散式序列號生成器，例如 Twitter 的雪花（Snowflake），其以可伸縮的方式（例如，透過將 ID 空間的塊分配給不同節點）近似單調地增加唯一 ID。但是，它們通常無法保證與因果關係一致的排序，因為分配的 ID 塊的時間範圍比資料庫讀取和寫入的時間範圍要長。另請參閱 “[順序保證](/v1_tw/ch9#順序保證)”。

我們可以使用同步時鐘的時間戳作為事務 ID 嗎？如果我們能夠獲得足夠好的同步性，那麼這種方法將具有很合適的屬性：更晚的事務會有更大的時間戳。當然，問題在於時鐘精度的不確定性。

Spanner 以這種方式實現跨資料中心的快照隔離【59，60】。它使用 TrueTime API 報告的時鐘置信區間，並基於以下觀察結果：如果你有兩個置信區間，每個置信區間包含最早和最晚可能的時間戳（$A = [A_{earliest}, A_{latest}]$，$B=[B_{earliest}, B_{latest}]$），這兩個區間不重疊（即：$A_{earliest} <A_{latest} <B_{earliest} <B_{latest}$）的話，那麼 B 肯定發生在 A 之後 —— 這是毫無疑問的。只有當區間重疊時，我們才不確定 A 和 B 發生的順序。

為了確保事務時間戳反映因果關係，Spanner 在提交讀寫事務之前，會故意等待置信區間長度的時間。透過這樣，它可以確保任何可能讀取資料的事務處於足夠晚的時間，因此它們的置信區間不會重疊。為了保持儘可能短的等待時間，Spanner 需要保持儘可能小的時鐘不確定性，為此，Google 在每個資料中心都部署了一個 GPS 接收器或原子鐘，這允許時鐘同步到大約 7 毫秒以內【41】。

對分散式事務語義使用時鐘同步是一個活躍的研究領域【57,61,62】。這些想法很有趣，但是它們還沒有在谷歌之外的主流資料庫中實現。

### 程序暫停

讓我們考慮在分散式系統中危險地使用時鐘的另一個例子。假設你有一個數據庫，每個分割槽只有一個領導者。只有領導者被允許接受寫入。一個節點如何知道它仍然是領導者（它並沒有被別人宣告為死亡），並且它可以安全地接受寫入？

一種選擇是領導者從其他節點獲得一個 **租約（lease）**，類似一個帶超時的鎖【63】。任一時刻只有一個節點可以持有租約 —— 因此，當一個節點獲得一個租約時，它知道它在某段時間內自己是領導者，直到租約到期。為了保持領導地位，節點必須週期性地在租約過期前續期。

如果節點發生故障，就會停止續期，所以當租約過期時，另一個節點可以接管。

可以想象，請求處理迴圈看起來像這樣：

```java
while (true) {
  request = getIncomingRequest();
  // 確保租約還剩下至少 10 秒
  if (lease.expiryTimeMillis - System.currentTimeMillis() < 10000){
    lease = lease.renew();
  }

  if (lease.isValid()) {
    process(request);
  }
}
```

這個程式碼有什麼問題？首先，它依賴於同步時鐘：租約到期時間由另一臺機器設定（例如，當前時間加上 30 秒，計算到期時間），並將其與本地系統時鐘進行比較。如果時鐘不同步超過幾秒，這段程式碼將開始做奇怪的事情。

其次，即使我們將協議更改為僅使用本地單調時鐘，也存在另一個問題：程式碼假定在執行剩餘時間檢查 `System.currentTimeMillis()` 和實際執行請求 `process(request)` 中間的時間間隔非常短。通常情況下，這段程式碼執行得非常快，所以 10 秒的緩衝區已經足夠確保 **租約** 在請求處理到一半時不會過期。

但是，如果程式執行中出現了意外的停頓呢？例如，想象一下，執行緒在 `lease.isValid()` 行周圍停止 15 秒，然後才繼續。在這種情況下，在請求被處理的時候，租約可能已經過期，而另一個節點已經接管了領導。然而，沒有什麼可以告訴這個執行緒已經暫停了這麼長時間了，所以這段程式碼不會注意到租約已經到期了，直到迴圈的下一個迭代 —— 到那個時候它可能已經做了一些不安全的處理請求。

假設一個執行緒可能會暫停很長時間，這是瘋了嗎？不幸的是，這種情況發生的原因有很多種：

* 許多程式語言執行時（如 Java 虛擬機器）都有一個垃圾收集器（GC），偶爾需要停止所有正在執行的執行緒。這些 “**停止所有處理（stop-the-world）**”GC 暫停有時會持續幾分鐘【64】！甚至像 HotSpot JVM 的 CMS 這樣的所謂的 “並行” 垃圾收集器也不能完全與應用程式程式碼並行執行，它需要不時地停止所有處理【65】。儘管通常可以透過改變分配模式或調整 GC 設定來減少暫停【66】，但是如果我們想要提供健壯的保證，就必須假設最壞的情況發生。
* 在虛擬化環境中，可以 **掛起（suspend）** 虛擬機器（暫停執行所有程序並將記憶體內容儲存到磁碟）並恢復（恢復記憶體內容並繼續執行）。這個暫停可以在程序執行的任何時候發生，並且可以持續任意長的時間。這個功能有時用於虛擬機器從一個主機到另一個主機的即時遷移，而不需要重新啟動，在這種情況下，暫停的長度取決於程序寫入記憶體的速率【67】。
* 在終端使用者的裝置（如筆記型電腦）上，執行也可能被暫停並隨意恢復，例如當用戶關閉筆記型電腦的蓋子時。
* 當作業系統上下文切換到另一個執行緒時，或者當管理程式切換到另一個虛擬機器時（在虛擬機器中執行時），當前正在執行的執行緒可能在程式碼中的任意點處暫停。在虛擬機器的情況下，在其他虛擬機器中花費的 CPU 時間被稱為 **竊取時間（steal time）**。如果機器處於沉重的負載下（即，如果等待執行的執行緒佇列很長），暫停的執行緒再次執行可能需要一些時間。
* 如果應用程式執行同步磁碟訪問，則執行緒可能暫停，等待緩慢的磁碟 I/O 操作完成【68】。在許多語言中，即使程式碼沒有包含檔案訪問，磁碟訪問也可能出乎意料地發生 —— 例如，Java 類載入器在第一次使用時惰性載入類檔案，這可能在程式執行過程中隨時發生。I/O 暫停和 GC 暫停甚至可能合謀組合它們的延遲【69】。如果磁碟實際上是一個網路檔案系統或網路塊裝置（如亞馬遜的 EBS），I/O 延遲進一步受到網路延遲變化的影響【29】。
* 如果作業系統配置為允許交換到磁碟（頁面交換），則簡單的記憶體訪問可能導致 **頁面錯誤（page fault）**，要求將磁碟中的頁面裝入記憶體。當這個緩慢的 I/O 操作發生時，執行緒暫停。如果記憶體壓力很高，則可能需要將另一個頁面換出到磁碟。在極端情況下，作業系統可能花費大部分時間將頁面交換到記憶體中，而實際上完成的工作很少（這被稱為 **抖動**，即 thrashing）。為了避免這個問題，通常在伺服器機器上停用頁面排程（如果你寧願幹掉一個程序來釋放記憶體，也不願意冒抖動風險）。
* 可以透過傳送 SIGSTOP 訊號來暫停 Unix 程序，例如透過在 shell 中按下 Ctrl-Z。這個訊號立即阻止程序繼續執行更多的 CPU 週期，直到 SIGCONT 恢復為止，此時它將繼續執行。即使你的環境通常不使用 SIGSTOP，也可能由運維工程師意外發送。

所有這些事件都可以隨時 **搶佔（preempt）** 正在執行的執行緒，並在稍後的時間恢復執行，而執行緒甚至不會注意到這一點。這個問題類似於在單個機器上使多執行緒程式碼執行緒安全：你不能對時序做任何假設，因為隨時可能發生上下文切換，或者出現並行執行。

當在一臺機器上編寫多執行緒程式碼時，我們有相當好的工具來實現執行緒安全：互斥量、訊號量、原子計數器、無鎖資料結構、阻塞佇列等等。不幸的是，這些工具並不能直接轉化為分散式系統操作，因為分散式系統沒有共享記憶體，只有透過不可靠網路傳送的訊息。

分散式系統中的節點，必須假定其執行可能在任意時刻暫停相當長的時間，即使是在一個函式的中間。在暫停期間，世界的其它部分在繼續運轉，甚至可能因為該節點沒有響應，而宣告暫停節點的死亡。最終暫停的節點可能會繼續執行，在再次檢查自己的時鐘之前，甚至可能不會意識到自己進入了睡眠。

#### 響應時間保證

在許多程式語言和作業系統中，執行緒和程序可能暫停一段無限制的時間，正如討論的那樣。如果你足夠努力，導致暫停的原因是 **可以** 消除的。

某些軟體的執行環境要求很高，不能在特定時間內響應可能會導致嚴重的損失：控制飛機、火箭、機器人、汽車和其他物體的計算機必須對其感測器輸入做出快速而可預測的響應。在這些系統中，軟體必須有一個特定的 **截止時間（deadline）**，如果截止時間不滿足，可能會導致整個系統的故障。這就是所謂的 **硬即時（hard real-time）** 系統。

> #### 即時是真的嗎？
>
> 在嵌入式系統中，即時是指系統經過精心設計和測試，以滿足所有情況下的特定時間保證。這個含義與 Web 上對即時術語的模糊使用相反，後者描述了伺服器將資料推送到客戶端以及沒有嚴格的響應時間限制的流處理（見 [第十一章](/v1_tw/ch11)）。

例如，如果車載感測器檢測到當前正在經歷碰撞，你肯定不希望安全氣囊釋放系統因為 GC 暫停而延遲彈出。

在系統中提供 **即時保證** 需要各級軟體棧的支援：一個即時作業系統（RTOS），允許在指定的時間間隔內保證 CPU 時間的分配。庫函式必須申明最壞情況下的執行時間；動態記憶體分配可能受到限制或完全不允許（即時垃圾收集器存在，但是應用程式仍然必須確保它不會給 GC 太多的負擔）；必須進行大量的測試和測量，以確保達到保證。

所有這些都需要大量額外的工作，嚴重限制了可以使用的程式語言、庫和工具的範圍（因為大多數語言和工具不提供即時保證）。由於這些原因，開發即時系統非常昂貴，並且它們通常用於安全關鍵的嵌入式裝置。而且，“**即時**” 與 “**高效能**” 不一樣 —— 事實上，即時系統可能具有較低的吞吐量，因為他們必須讓及時響應的優先順序高於一切（另請參閱 “[延遲和資源利用](#延遲和資源利用)”）。

對於大多數伺服器端資料處理系統來說，即時保證是不經濟或不合適的。因此，這些系統必須承受在非即時環境中執行的暫停和時鐘不穩定性。

#### 限制垃圾收集的影響

程序暫停的負面影響可以在不訴諸昂貴的即時排程保證的情況下得到緩解。語言執行時在計劃垃圾回收時具有一定的靈活性，因為它們可以跟蹤物件分配的速度和隨著時間的推移剩餘的空閒記憶體。

一個新興的想法是將 GC 暫停視為一個節點的短暫計劃中斷，並在這個節點收集其垃圾的同時，讓其他節點處理來自客戶端的請求。如果執行時可以警告應用程式一個節點很快需要 GC 暫停，那麼應用程式可以停止向該節點發送新的請求，等待它完成處理未完成的請求，然後在沒有請求正在進行時執行 GC。這個技巧向客戶端隱藏了 GC 暫停，並降低了響應時間的高百分位點【70,71】。一些對延遲敏感的金融交易系統【72】使用這種方法。

這個想法的一個變種是隻用垃圾收集器來處理短命物件（這些物件可以快速收集），並定期在積累大量長壽物件（因此需要完整 GC）之前重新啟動程序【65,73】。一次可以重新啟動一個節點，在計劃重新啟動之前，流量可以從該節點移開，就像 [第四章](/v1_tw/ch4) 裡描述的滾動升級一樣。

這些措施不能完全阻止垃圾回收暫停，但可以有效地減少它們對應用的影響。


## 知識、真相與謊言

本章到目前為止，我們已經探索了分散式系統與執行在單臺計算機上的程式的不同之處：沒有共享記憶體，只有透過可變延遲的不可靠網路傳遞的訊息，系統可能遭受部分失效，不可靠的時鐘和處理暫停。

如果你不習慣於分散式系統，那麼這些問題的後果就會讓人迷惑不解。網路中的一個節點無法確切地知道任何事情 —— 它只能根據它透過網路接收到（或沒有接收到）的訊息進行猜測。節點只能透過交換訊息來找出另一個節點所處的狀態（儲存了哪些資料，是否正確執行等等）。如果遠端節點沒有響應，則無法知道它處於什麼狀態，因為網路中的問題不能可靠地與節點上的問題區分開來。

這些系統的討論與哲學有關：在系統中什麼是真什麼是假？如果感知和測量的機制都是不可靠的，那麼關於這些知識我們又能多麼確定呢？軟體系統應該遵循我們對物理世界所期望的法則，如因果關係嗎？

幸運的是，我們不需要去搞清楚生命的意義。在分散式系統中，我們可以陳述關於行為（系統模型）的假設，並以滿足這些假設的方式設計實際系統。演算法可以被證明在某個系統模型中正確執行。這意味著即使底層系統模型提供了很少的保證，也可以實現可靠的行為。

但是，儘管可以使軟體在不可靠的系統模型中表現良好，但這並不是可以直截了當實現的。在本章的其餘部分中，我們將進一步探討分散式系統中的知識和真相的概念，這將有助於我們思考我們可以做出的各種假設以及我們可能希望提供的保證。在 [第九章](/v1_tw/ch9) 中，我們將著眼於分散式演算法的一些例子，這些演算法在特定的假設條件下提供了特定的保證。

### 真相由多數所定義

設想一個具有不對稱故障的網路：一個節點能夠接收發送給它的所有訊息，但是來自該節點的任何傳出訊息被丟棄或延遲【19】。即使該節點執行良好，並且正在接收來自其他節點的請求，其他節點也無法聽到其響應。經過一段時間後，其他節點宣佈它已經死亡，因為他們沒有聽到節點的訊息。這種情況就像夢魘一樣：**半斷開（semi-disconnected）** 的節點被拖向墓地，敲打尖叫道 “我沒死！” —— 但是由於沒有人能聽到它的尖叫，葬禮隊伍繼續以堅忍的決心繼續行進。

在一個稍微不那麼夢魘的場景中，半斷開的節點可能會注意到它傳送的訊息沒有被其他節點確認，因此意識到網路中必定存在故障。儘管如此，節點被其他節點錯誤地宣告為死亡，而半斷開的節點對此無能為力。

第三種情況，想象一個正在經歷長時間 **垃圾收集暫停（stop-the-world GC Pause）** 的節點，節點的所有執行緒被 GC 搶佔並暫停一分鐘，因此沒有請求被處理，也沒有響應被傳送。其他節點等待，重試，不耐煩，並最終宣佈節點死亡，並將其丟到靈車上。最後，GC 完成，節點的執行緒繼續，好像什麼也沒有發生。其他節點感到驚訝，因為所謂的死亡節點突然從棺材中抬起頭來，身體健康，開始和旁觀者高興地聊天。GC 後的節點最初甚至沒有意識到已經經過了整整一分鐘，而且自己已被宣告死亡。從它自己的角度來看，從最後一次與其他節點交談以來，幾乎沒有經過任何時間。

這些故事的寓意是，節點不一定能相信自己對於情況的判斷。分散式系統不能完全依賴單個節點，因為節點可能隨時失效，可能會使系統卡死，無法恢復。相反，許多分散式演算法都依賴於法定人數，即在節點之間進行投票（請參閱 “[讀寫的法定人數](/v1_tw/ch5#讀寫的法定人數)”）：決策需要來自多個節點的最小投票數，以減少對於某個特定節點的依賴。

這也包括關於宣告節點死亡的決定。如果法定數量的節點宣告另一個節點已經死亡，那麼即使該節點仍感覺自己活著，它也必須被認為是死的。個體節點必須遵守法定決定並下臺。

最常見的法定人數是超過一半的絕對多數（儘管其他型別的法定人數也是可能的）。多數法定人數允許系統繼續工作，如果單個節點發生故障（三個節點可以容忍單節點故障；五個節點可以容忍雙節點故障）。系統仍然是安全的，因為在這個制度中只能有一個多數 —— 不能同時存在兩個相互衝突的多數決定。當我們在 [第九章](/v1_tw/ch9) 中討論 **共識演算法（consensus algorithms）** 時，我們將更詳細地討論法定人數的應用。

#### 領導者和鎖

通常情況下，一些東西在一個系統中只能有一個。例如：

* 資料庫分割槽的領導者只能有一個節點，以避免 **腦裂**（即 split brain，請參閱 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)”）。
* 特定資源的鎖或物件只允許一個事務 / 客戶端持有，以防同時寫入和損壞。
* 一個特定的使用者名稱只能被一個使用者所註冊，因為使用者名稱必須唯一標識一個使用者。

在分散式系統中實現這一點需要注意：即使一個節點認為它是 “**天選者（the chosen one）**”（分割槽的負責人，鎖的持有者，成功獲取使用者名稱的使用者的請求處理程式），但這並不一定意味著有法定人數的節點同意！一個節點可能以前是領導者，但是如果其他節點在此期間宣佈它死亡（例如，由於網路中斷或 GC 暫停），則它可能已被降級，且另一個領導者可能已經當選。

如果一個節點繼續表現為 **天選者**，即使大多數節點已經宣告它已經死了，則在考慮不周的系統中可能會導致問題。這樣的節點能以自己賦予的權能向其他節點發送訊息，如果其他節點相信，整個系統可能會做一些不正確的事情。

例如，[圖 8-4](/v1/ddia_0804.png) 顯示了由於不正確的鎖實現導致的資料損壞錯誤。（這個錯誤不僅僅是理論上的：HBase 曾經有這個問題【74,75】）假設你要確保一個儲存服務中的檔案一次只能被一個客戶訪問，因為如果多個客戶試圖對此寫入，該檔案將被損壞。你嘗試透過在訪問檔案之前要求客戶端從鎖定服務獲取租約來實現此目的。

![](/v1/ddia_0804.png)

**圖 8-4 分散式鎖的實現不正確：客戶端 1 認為它仍然具有有效的租約，即使它已經過期，從而破壞了儲存中的檔案**

這個問題就是我們先前在 “[程序暫停](#程序暫停)” 中討論過的一個例子：如果持有租約的客戶端暫停太久，它的租約將到期。另一個客戶端可以獲得同一檔案的租約，並開始寫入檔案。當暫停的客戶端回來時，它認為（不正確）它仍然有一個有效的租約，並繼續寫入檔案。結果，客戶的寫入將產生衝突並損壞檔案。

#### 防護令牌

當使用鎖或租約來保護對某些資源（如 [圖 8-4](/v1/ddia_0804.png) 中的檔案儲存）的訪問時，需要確保一個誤以為自己是 “天選者” 的節點不能擾亂系統的其它部分。實現這一目標的一個相當簡單的技術就是 **防護（fencing）**，如 [圖 8-5](/v1/ddia_0805.png) 所示。

![](/v1/ddia_0805.png)

**圖 8-5 只允許以增加防護令牌的順序進行寫操作，從而保證儲存安全**

我們假設每次鎖定伺服器授予鎖或租約時，它還會返回一個 **防護令牌（fencing token）**，這個數字在每次授予鎖定時都會增加（例如，由鎖定服務增加）。然後，我們可以要求客戶端每次向儲存服務傳送寫入請求時，都必須包含當前的防護令牌。

在 [圖 8-5](/v1/ddia_0805.png) 中，客戶端 1 以 33 的令牌獲得租約，但隨後進入一個長時間的停頓並且租約到期。客戶端 2 以 34 的令牌（該數字總是增加）獲取租約，然後將其寫入請求傳送到儲存服務，包括 34 的令牌。稍後，客戶端 1 恢復生機並將其寫入儲存服務，包括其令牌值 33。但是，儲存伺服器會記住它已經處理了一個具有更高令牌編號（34）的寫入，因此它會拒絕帶有令牌 33 的請求。

如果將 ZooKeeper 用作鎖定服務，則可將事務標識 `zxid` 或節點版本 `cversion` 用作防護令牌。由於它們保證單調遞增，因此它們具有所需的屬性【74】。

請注意，這種機制要求資源本身在檢查令牌方面發揮積極作用，拒絕任何令牌比已處理過的令牌更舊的寫入 —— 僅僅依靠客戶端檢查自己的鎖狀態是不夠的。對於不明確支援防護令牌的資源，可能仍然可以解決此限制（例如，在檔案儲存服務的情況下，可以將防護令牌包含在檔名中）。但是，為了避免在鎖的保護之外處理請求，需要進行某種檢查。

在伺服器端檢查一個令牌可能看起來像是一個缺點，但這可以說是一件好事：一個服務假定它的客戶總是守規矩並不明智，因為使用客戶端的人與執行服務的人優先順序非常不一樣【76】。因此，任何服務保護自己免受意外客戶的濫用是一個好主意。

### 拜占庭故障

防護令牌可以檢測和阻止無意中發生錯誤的節點（例如，因為它尚未發現其租約已過期）。但是，如果節點有意破壞系統的保證，則可以透過使用假防護令牌傳送訊息來輕鬆完成此操作。

在本書中，我們假設節點是不可靠但誠實的：它們可能很慢或者從不響應（由於故障），並且它們的狀態可能已經過時（由於 GC 暫停或網路延遲），但是我們假設如果節點做出了回應，它正在說出 “真相”：盡其所知，它正在按照協議的規則扮演其角色。

如果存在節點可能 “撒謊”（傳送任意錯誤或損壞的響應）的風險，則分散式系統的問題變得更困難了 —— 例如，如果節點可能聲稱收到了某條它實際上並未收到的訊息。這種行為被稱為 **拜占庭故障（Byzantine fault）**，**在不信任的環境中達成共識的問題被稱為拜占庭將軍問題**【77】。

> ### 拜占庭將軍問題
>
> 拜占庭將軍問題是對所謂 “兩將軍問題” 的泛化【78】，它想象兩個將軍需要就戰鬥計劃達成一致的情況。由於他們在兩個不同的地點建立了營地，他們只能透過信使進行溝通，信使有時會被延遲或丟失（就像網路中的資訊包一樣）。我們將在 [第九章](/v1_tw/ch9) 討論這個共識問題。
>
> 在這個問題的拜占庭版本里，有 n 位將軍需要同意，他們的努力因為有一些叛徒在他們中間而受到阻礙。大多數的將軍都是忠誠的，因而發出了真實的資訊，但是叛徒可能會試圖透過傳送虛假或不真實的資訊來欺騙和混淆他人（在試圖保持未被發現的同時）。事先並不知道叛徒是誰。
>
> 拜占庭是後來成為君士坦丁堡的古希臘城市，現在在土耳其的伊斯坦布林。沒有任何歷史證據表明拜占庭將軍比其他地方更容易出現詭計和陰謀。相反，這個名字來源於拜占庭式的過度複雜，官僚，迂迴等意義，早在計算機之前就已經在政治中被使用了【79】。Lamport 想要選一個不會冒犯任何讀者的國家，他被告知將其稱為阿爾巴尼亞將軍問題並不是一個好主意【80】。

當一個系統在部分節點發生故障、不遵守協議、甚至惡意攻擊、擾亂網路時仍然能繼續正確工作，稱之為 **拜占庭容錯（Byzantine fault-tolerant）** 的，這種擔憂在某些特定情況下是有意義的：

* 在航空航天環境中，計算機記憶體或 CPU 暫存器中的資料可能被輻射破壞，導致其以任意不可預知的方式響應其他節點。由於系統故障非常昂貴（例如，飛機墜毀並導致機上所有人員喪生，或火箭與國際空間站相撞），飛行控制系統必須容忍拜占庭故障【81,82】。
* 在多個參與組織的系統中，一些參與者可能會試圖欺騙或詐騙他人。在這種情況下，節點僅僅信任另一個節點的訊息是不安全的，因為它們可能是出於惡意的目的而被傳送的。例如，像比特幣和其他區塊鏈一樣的對等網路可以被認為是讓互不信任的各方同意交易是否發生的一種方式，而不依賴於中心機構（central authority）【83】。

然而，在本書討論的那些系統中，我們通常可以安全地假設沒有拜占庭式的錯誤。在你的資料中心裡，所有的節點都是由你的組織控制的（所以他們可以信任），輻射水平足夠低，記憶體損壞不是一個大問題。製作拜占庭容錯系統的協議相當複雜【84】，而容錯嵌入式系統依賴於硬體層面的支援【81】。在大多數伺服器端資料系統中，部署拜占庭容錯解決方案的成本使其變得不切實際。

Web 應用程式確實需要預期受終端使用者控制的客戶端（如 Web 瀏覽器）的任意和惡意行為。這就是為什麼輸入驗證，資料清洗和輸出轉義如此重要：例如，防止 SQL 注入和跨站點指令碼。然而，我們通常不在這裡使用拜占庭容錯協議，而只是讓伺服器有權決定是否允許客戶端行為。但在沒有這種中心機構的對等網路中，拜占庭容錯更為重要。

軟體中的一個錯誤（bug）可能被認為是拜占庭式的錯誤，但是如果你將相同的軟體部署到所有節點上，那麼拜占庭式的容錯演算法幫不到你。大多數拜占庭式容錯演算法要求超過三分之二的節點能夠正常工作（即，如果有四個節點，最多隻能有一個故障）。要使用這種方法對付 bug，你必須有四個獨立的相同軟體的實現，並希望一個 bug 只出現在四個實現之一中。

同樣，如果一個協議可以保護我們免受漏洞，安全滲透和惡意攻擊，那麼這將是有吸引力的。不幸的是，這也是不現實的：在大多數系統中，如果攻擊者可以滲透一個節點，那他們可能會滲透所有這些節點，因為它們可能都執行著相同的軟體。因此，傳統機制（認證，訪問控制，加密，防火牆等）仍然是抵禦攻擊者的主要保護措施。

#### 弱謊言形式

儘管我們假設節點通常是誠實的，但值得向軟體中新增防止 “撒謊” 弱形式的機制 —— 例如，由硬體問題導致的無效訊息，軟體錯誤和錯誤配置。這種保護機制並不是完全的拜占庭容錯，因為它們不能抵擋決心堅定的對手，但它們仍然是簡單而實用的步驟，以提高可靠性。例如：

* 由於硬體問題或作業系統、驅動程式、路由器等中的錯誤，網路資料包有時會受到損壞。通常，損壞的資料包會被內建於 TCP 和 UDP 中的校驗和所俘獲，但有時它們也會逃脫檢測【85,86,87】 。要對付這種破壞通常使用簡單的方法就可以做到，例如應用程式級協議中的校驗和。
* 可公開訪問的應用程式必須仔細清理來自使用者的任何輸入，例如檢查值是否在合理的範圍內，並限制字串的大小以防止透過大記憶體分配的拒絕服務。防火牆後面的內部服務對於輸入也許可以只採取一些不那麼嚴格的檢查，但是採取一些基本的合理性檢查（例如，在協議解析中）仍然是一個好主意。
* NTP 客戶端可以配置多個伺服器地址。同步時，客戶端聯絡所有的伺服器，估計它們的誤差，並檢查大多數伺服器是否對某個時間範圍達成一致。只要大多數的伺服器沒問題，一個配置錯誤的 NTP 伺服器報告的時間會被當成特異值從同步中排除【37】。使用多個伺服器使 NTP 更健壯（比起只用單個伺服器來）。

### 系統模型與現實

已經有很多演算法被設計以解決分散式系統問題 —— 例如，我們將在 [第九章](/v1_tw/ch9) 討論共識問題的解決方案。為了有用，這些演算法需要容忍我們在本章中討論的分散式系統的各種故障。

演算法的編寫方式不應該過分依賴於執行的硬體和軟體配置的細節。這就要求我們以某種方式將我們期望在系統中發生的錯誤形式化。我們透過定義一個系統模型來做到這一點，這個模型是一個抽象，描述一個演算法可以假設的事情。

關於時序假設，三種系統模型是常用的：

同步模型
: **同步模型（synchronous model）** 假設網路延遲、程序暫停和時鐘誤差都是受限的。這並不意味著完全同步的時鐘或零網路延遲；這只意味著你知道網路延遲、暫停和時鐘漂移將永遠不會超過某個固定的上限【88】。同步模型並不是大多數實際系統的現實模型，因為（如本章所討論的）無限延遲和暫停確實會發生。

部分同步模型
: **部分同步（partial synchronous）** 意味著一個系統在大多數情況下像一個同步系統一樣執行，但有時候會超出網路延遲，程序暫停和時鐘漂移的界限【88】。這是很多系統的現實模型：大多數情況下，網路和程序表現良好，否則我們永遠無法完成任何事情，但是我們必須承認，在任何時刻都存在時序假設偶然被破壞的事實。發生這種情況時，網路延遲、暫停和時鐘錯誤可能會變得相當大。

非同步模型
: 在這個模型中，一個演算法不允許對時序做任何假設 —— 事實上它甚至沒有時鐘（所以它不能使用超時）。一些演算法被設計為可用於非同步模型，但非常受限。


進一步來說，除了時序問題，我們還要考慮 **節點失效**。三種最常見的節點系統模型是：

崩潰 - 停止故障
: 在 **崩潰停止（crash-stop）** 模型中，演算法可能會假設一個節點只能以一種方式失效，即透過崩潰。這意味著節點可能在任意時刻突然停止響應，此後該節點永遠消失 —— 它永遠不會回來。

崩潰 - 恢復故障
: 我們假設節點可能會在任何時候崩潰，但也許會在未知的時間之後再次開始響應。在 **崩潰 - 恢復（crash-recovery）** 模型中，假設節點具有穩定的儲存（即，非易失性磁碟儲存）且會在崩潰中保留，而記憶體中的狀態會丟失。

拜占庭（任意）故障
: 節點可以做（絕對意義上的）任何事情，包括試圖戲弄和欺騙其他節點，如上一節所述。

對於真實系統的建模，具有 **崩潰 - 恢復故障（crash-recovery）** 的 **部分同步模型（partial synchronous）** 通常是最有用的模型。分散式演算法如何應對這種模型？

#### 演算法的正確性

為了定義演算法是正確的，我們可以描述它的屬性。例如，排序演算法的輸出具有如下特性：對於輸出列表中的任何兩個不同的元素，左邊的元素比右邊的元素小。這只是定義對列表進行排序含義的一種形式方式。

同樣，我們可以寫下我們想要的分散式演算法的屬性來定義它的正確含義。例如，如果我們正在為一個鎖生成防護令牌（請參閱 “[防護令牌](#防護令牌)”），我們可能要求演算法具有以下屬性：

唯一性（uniqueness）
: 沒有兩個防護令牌請求返回相同的值。

單調序列（monotonic sequence）
: 如果請求 $x$ 返回了令牌 $t_x$，並且請求 $y$ 返回了令牌 $t_y$，並且 $x$ 在 $y$ 開始之前已經完成，那麼 $t_x < t_y$。

可用性（availability）
: 請求防護令牌並且不會崩潰的節點，最終會收到響應。

如果一個系統模型中的演算法總是滿足它在所有我們假設可能發生的情況下的性質，那麼這個演算法是正確的。但這如何有意義？如果所有的節點崩潰，或者所有的網路延遲突然變得無限長，那麼沒有任何演算法能夠完成任何事情。

#### 安全性和活性

為了澄清這種情況，有必要區分兩種不同的屬性：**安全（safety）屬性** 和 **活性（liveness）屬性**。在剛剛給出的例子中，**唯一性** 和 **單調序列** 是安全屬性，而 **可用性** 是活性屬性。

這兩種性質有什麼區別？一個試金石就是，活性屬性的定義中通常包括 “**最終**” 一詞（是的，你猜對了 —— 最終一致性是一個活性屬性【89】）。

安全通常被非正式地定義為：**沒有壞事發生**，而活性通常就類似：**最終好事發生**。但是，最好不要對這些非正式的定義作過度解讀，因為好與壞的含義是主觀的。安全和活性的實際定義是精確的和數學的【90】：

* 如果安全屬性被違反，我們可以指向一個特定的安全屬性被破壞的時間點（例如，如果違反了唯一性屬性，我們可以確定重複的防護令牌被返回的特定操作）。違反安全屬性後，違規行為不能被撤銷 —— 損失已經發生。
* 活性屬性反過來：在某個時間點（例如，一個節點可能傳送了一個請求，但還沒有收到響應），它可能不成立，但總是希望在未來能成立（即透過接受答覆）。

區分安全屬性和活性屬性的一個優點是可以幫助我們處理困難的系統模型。對於分散式演算法，在系統模型的所有可能情況下，要求 **始終** 保持安全屬性是常見的【88】。也就是說，即使所有節點崩潰，或者整個網路出現故障，演算法仍然必須確保它不會返回錯誤的結果（即保證安全屬性得到滿足）。

但是，對於活性屬性，我們可以提出一些注意事項：例如，只有在大多數節點沒有崩潰的情況下，只有當網路最終從中斷中恢復時，我們才可以說請求需要接收響應。部分同步模型的定義要求系統最終返回到同步狀態 —— 即任何網路中斷的時間段只會持續一段有限的時間，然後進行修復。

#### 將系統模型對映到現實世界

安全屬性和活性屬性以及系統模型對於推理分散式演算法的正確性非常有用。然而，在實踐中實施演算法時，現實的混亂事實再一次地讓你咬牙切齒，很明顯系統模型是對現實的簡化抽象。

例如，在崩潰 - 恢復（crash-recovery）模型中的演算法通常假設穩定儲存器中的資料在崩潰後可以倖存。但是，如果磁碟上的資料被破壞，或者由於硬體錯誤或錯誤配置導致資料被清除，會發生什麼情況【91】？如果伺服器存在韌體錯誤並且在重新啟動時無法識別其硬碟驅動器，即使驅動器已正確連線到伺服器，那又會發生什麼情況【92】？

法定人數演算法（請參閱 “[讀寫的法定人數](/v1_tw/ch5#讀寫的法定人數)”）依賴節點來記住它聲稱儲存的資料。如果一個節點可能患有健忘症，忘記了以前儲存的資料，這會打破法定條件，從而破壞演算法的正確性。也許需要一個新的系統模型，在這個模型中，我們假設穩定的儲存大多能在崩潰後倖存，但有時也可能會丟失。但是那個模型就變得更難以推理了。

演算法的理論描述可以簡單宣稱一些事是不會發生的 —— 在非拜占庭式系統中，我們確實需要對可能發生和不可能發生的故障做出假設。然而，真實世界的實現，仍然會包括處理 “假設上不可能” 情況的程式碼，即使程式碼可能就是 `printf("Sucks to be you")` 和 `exit(666)`，實際上也就是留給運維來擦屁股【93】。（這可以說是計算機科學和軟體工程間的一個差異）。

這並不是說理論上抽象的系統模型是毫無價值的，恰恰相反。它們對於將實際系統的複雜性提取成一個個我們可以推理的可處理的錯誤型別是非常有幫助的，以便我們能夠理解這個問題，並試圖系統地解決這個問題。我們可以證明演算法是正確的，透過表明它們的屬性在某個系統模型中總是成立的。

證明演算法正確並不意味著它在真實系統上的實現必然總是正確的。但這邁出了很好的第一步，因為理論分析可以發現演算法中的問題，這種問題可能會在現實系統中長期潛伏，直到你的假設（例如，時序）因為不尋常的情況被打破。理論分析與經驗測試同樣重要。


## 本章小結

在本章中，我們討論了分散式系統中可能發生的各種問題，包括：

* 當你嘗試透過網路傳送資料包時，資料包可能會丟失或任意延遲。同樣，答覆可能會丟失或延遲，所以如果你沒有得到答覆，你不知道訊息是否傳送成功了。
* 節點的時鐘可能會與其他節點顯著不同步（儘管你盡最大努力設定 NTP），它可能會突然跳轉或跳回，依靠它是很危險的，因為你很可能沒有好的方法來測量你的時鐘的錯誤間隔。
* 一個程序可能會在其執行的任何時候暫停一段相當長的時間（可能是因為停止所有處理的垃圾收集器），被其他節點宣告死亡，然後再次復活，卻沒有意識到它被暫停了。

這類 **部分失效（partial failure）** 可能發生的事實是分散式系統的決定性特徵。每當軟體試圖做任何涉及其他節點的事情時，偶爾就有可能會失敗，或者隨機變慢，或者根本沒有響應（最終超時）。在分散式系統中，我們試圖在軟體中建立 **部分失效** 的容錯機制，這樣整個系統在即使某些組成部分被破壞的情況下，也可以繼續執行。

為了容忍錯誤，第一步是 **檢測** 它們，但即使這樣也很難。大多數系統沒有檢測節點是否發生故障的準確機制，所以大多數分散式演算法依靠 **超時** 來確定遠端節點是否仍然可用。但是，超時無法區分網路失效和節點失效，並且可變的網路延遲有時會導致節點被錯誤地懷疑發生故障。此外，有時一個節點可能處於降級狀態：例如，由於驅動程式錯誤，千兆網路卡可能突然下降到 1 Kb/s 的吞吐量【94】。這樣一個 “跛行” 而不是死掉的節點可能比一個乾淨的失效節點更難處理。

一旦檢測到故障，使系統容忍它也並不容易：沒有全域性變數，沒有共享記憶體，沒有共同的知識，或機器之間任何其他種類的共享狀態。節點甚至不能就現在是什麼時間達成一致，就不用說更深奧的了。資訊從一個節點流向另一個節點的唯一方法是透過不可靠的網路傳送資訊。重大決策不能由一個節點安全地完成，因此我們需要一個能從其他節點獲得幫助的協議，並爭取達到法定人數以達成一致。

如果你習慣於在理想化的數學完美的單機環境（同一個操作總能確定地返回相同的結果）中編寫軟體，那麼轉向分散式系統的凌亂的物理現實可能會有些令人震驚。相反，如果能夠在單臺計算機上解決一個問題，那麼分散式系統工程師通常會認為這個問題是平凡的【5】，現在單個計算機確實可以做很多事情【95】。如果你可以避免開啟潘多拉的盒子，把東西放在一臺機器上，那麼通常是值得的。

但是，正如在 [第二部分](/v1_tw/part-ii) 的介紹中所討論的那樣，可伸縮性並不是使用分散式系統的唯一原因。容錯和低延遲（透過將資料放置在距離使用者較近的地方）是同等重要的目標，而這些不能用單個節點實現。

在本章中，我們也轉換了幾次話題，探討了網路、時鐘和程序的不可靠性是否是不可避免的自然規律。我們看到這並不是：有可能給網路提供硬即時的響應保證和有限的延遲，但是這樣做非常昂貴，且導致硬體資源的利用率降低。大多數非安全關鍵系統會選擇 **便宜而不可靠**，而不是 **昂貴和可靠**。

我們還談到了超級計算機，它們採用可靠的元件，因此當元件發生故障時必須完全停止並重新啟動。相比之下，分散式系統可以永久執行而不會在服務層面中斷，因為所有的錯誤和維護都可以在節點級別進行處理 —— 至少在理論上是如此。（實際上，如果一個錯誤的配置變更被應用到所有的節點，仍然會使分散式系統癱瘓）。

本章一直在講存在的問題，給我們展現了一幅黯淡的前景。在 [下一章](/v1_tw/ch9) 中，我們將繼續討論解決方案，並討論一些旨在解決分散式系統中所有問題的演算法。


## 參考文獻

1. Mark Cavage: “[There’s Just No Getting Around It: You’re Building a Distributed System](http://queue.acm.org/detail.cfm?id=2482856),” *ACM Queue*, volume 11, number 4, pages 80-89, April 2013. [doi:10.1145/2466486.2482856](http://dx.doi.org/10.1145/2466486.2482856)
1. Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
1. Sydney Padua: *The Thrilling Adventures of Lovelace and Babbage: The (Mostly) True Story of the First Computer*. Particular Books, April 2015. ISBN: 978-0-141-98151-2
1. Coda Hale: “[You Can’t Sacrifice Partition Tolerance](http://codahale.com/you-cant-sacrifice-partition-tolerance/),” *codahale.com*, October 7, 2010.
1. Jeff Hodges: “[Notes on Distributed Systems for Young Bloods](https://web.archive.org/web/20200218095605/https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/),” *somethingsimilar.com*, January 14, 2013.
1. Antonio Regalado: “[Who Coined 'Cloud Computing'?](https://www.technologyreview.com/2011/10/31/257406/who-coined-cloud-computing/),” *technologyreview.com*, October 31, 2011.
1. Luiz André Barroso, Jimmy Clidaras, and Urs Hölzle: “[The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale Machines, Second Edition](https://web.archive.org/web/20140404113735/http://www.morganclaypool.com/doi/abs/10.2200/S00516ED2V01Y201306CAC024),” *Synthesis Lectures on Computer Architecture*, volume 8, number 3, Morgan & Claypool Publishers, July 2013. [doi:10.2200/S00516ED2V01Y201306CAC024](http://dx.doi.org/10.2200/S00516ED2V01Y201306CAC024), ISBN: 978-1-627-05010-4
1. David Fiala, Frank Mueller, Christian Engelmann, et al.: “[Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing](http://moss.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/sc12.pdf),” at *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC12), November 2012.
1. Arjun Singh, Joon Ong, Amit Agarwal, et al.: “[Jupiter Rising: A Decade of Clos Topologies and Centralized Control in Google’s Datacenter Network](http://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p183.pdf),” at *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2785956.2787508](http://dx.doi.org/10.1145/2785956.2787508)
1. Glenn K. Lockwood: “[Hadoop's Uncomfortable Fit in HPC](http://glennklockwood.blogspot.co.uk/2014/05/hadoops-uncomfortable-fit-in-hpc.html),” *glennklockwood.blogspot.co.uk*, May 16, 2014.
1. John von Neumann: “[Probabilistic Logics and the Synthesis of Reliable Organisms from Unreliable Components](https://personalpages.manchester.ac.uk/staff/nikolaos.kyparissas/uploads/VonNeumann1956.pdf),” in *Automata Studies (AM-34)*, edited by Claude E. Shannon and John McCarthy, Princeton University Press, 1956. ISBN: 978-0-691-07916-5
1. Richard W. Hamming: *The Art of Doing Science and Engineering*. Taylor & Francis, 1997. ISBN: 978-9-056-99500-3
1. Claude E. Shannon: “[A Mathematical Theory of Communication](http://cs.brynmawr.edu/Courses/cs380/fall2012/shannon1948.pdf),” *The Bell System Technical Journal*, volume 27, number 3, pages 379–423 and 623–656, July 1948.
1. Peter Bailis and Kyle Kingsbury: “[The Network Is Reliable](https://queue.acm.org/detail.cfm?id=2655736),” *ACM Queue*, volume 12, number 7, pages 48-55, July 2014. [doi:10.1145/2639988.2639988](http://dx.doi.org/10.1145/2639988.2639988)
1. Joshua B. Leners, Trinabh Gupta, Marcos K. Aguilera, and Michael Walfish: “[Taming Uncertainty in Distributed Systems with Help from the Network](http://www.cs.nyu.edu/~mwalfish/papers/albatross-eurosys15.pdf),” at *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741976](http://dx.doi.org/10.1145/2741948.2741976)
1. Phillipa Gill, Navendu Jain, and Nachiappan Nagappan: “[Understanding Network Failures in Data Centers: Measurement, Analysis, and Implications](http://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p350.pdf),” at *ACM SIGCOMM Conference*, August 2011. [doi:10.1145/2018436.2018477](http://dx.doi.org/10.1145/2018436.2018477)
1. Mark Imbriaco: “[Downtime Last Saturday](https://github.com/blog/1364-downtime-last-saturday),” *github.com*, December 26, 2012.
1. Will Oremus: “[The Global Internet Is Being Attacked by Sharks, Google Confirms](http://www.slate.com/blogs/future_tense/2014/08/15/shark_attacks_threaten_google_s_undersea_internet_cables_video.html),” *slate.com*, August 15, 2014.
1. Marc A. Donges: “[Re: bnx2 cards Intermittantly Going Offline](http://www.spinics.net/lists/netdev/msg210485.html),” Message to Linux *netdev* mailing list, *spinics.net*, September 13, 2012.
1. Kyle Kingsbury: “[Call Me Maybe: Elasticsearch](https://aphyr.com/posts/317-call-me-maybe-elasticsearch),” *aphyr.com*, June 15, 2014.
1. Salvatore Sanfilippo: “[A Few Arguments About Redis Sentinel Properties and Fail Scenarios](http://antirez.com/news/80),” *antirez.com*, October 21, 2014.
1. Bert Hubert: “[The Ultimate SO_LINGER Page, or: Why Is My TCP Not Reliable](http://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable),” *blog.netherlabs.nl*, January 18, 2009.
1. Nicolas Liochon: “[CAP: If All You Have Is a Timeout, Everything Looks Like a Partition](http://blog.thislongrun.com/2015/05/CAP-theorem-partition-timeout-zookeeper.html),” *blog.thislongrun.com*, May 25, 2015.
1. Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-To-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
1. Matthew P. Grosvenor, Malte Schwarzkopf, Ionel Gog, et al.: “[Queues Don’t Matter When You Can JUMP Them!](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-grosvenor_update.pdf),” at *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
1. Guohui Wang and T. S. Eugene Ng: “[The Impact of Virtualization on Network Performance of Amazon EC2 Data Center](http://www.cs.rice.edu/~eugeneng/papers/INFOCOM10-ec2.pdf),” at *29th IEEE International Conference on Computer Communications* (INFOCOM), March 2010. [doi:10.1109/INFCOM.2010.5461931](http://dx.doi.org/10.1109/INFCOM.2010.5461931)
1. Van Jacobson: “[Congestion Avoidance and Control](http://www.cs.usask.ca/ftp/pub/discus/seminars2002-2003/p314-jacobson.pdf),” at *ACM Symposium on Communications Architectures and Protocols* (SIGCOMM), August 1988. [doi:10.1145/52324.52356](http://dx.doi.org/10.1145/52324.52356)
1. Brandon Philips: “[etcd: Distributed Locking and Service Discovery](https://www.youtube.com/watch?v=HJIjTTHWYnE),” at *Strange Loop*, September 2014.
1. Steve Newman: “[A Systematic Look at EC2 I/O](https://web.archive.org/web/20141211094156/http://blog.scalyr.com/2012/10/a-systematic-look-at-ec2-io/),” *blog.scalyr.com*, October 16, 2012.
1. Naohiro Hayashibara, Xavier Défago, Rami Yared, and Takuya Katayama: “[The ϕ Accrual Failure Detector](http://hdl.handle.net/10119/4784),” Japan Advanced Institute of Science and Technology, School of Information Science, Technical Report IS-RR-2004-010, May 2004.
1. Jeffrey Wang: “[Phi Accrual Failure Detector](http://ternarysearch.blogspot.co.uk/2013/08/phi-accrual-failure-detector.html),” *ternarysearch.blogspot.co.uk*, August 11, 2013.
1. Srinivasan Keshav: *An Engineering Approach to Computer Networking: ATM Networks, the Internet, and the Telephone Network*. Addison-Wesley Professional, May 1997. ISBN: 978-0-201-63442-6
1. Cisco, “[Integrated Services Digital Network](https://web.archive.org/web/20181229220921/http://docwiki.cisco.com/wiki/Integrated_Services_Digital_Network),” *docwiki.cisco.com*.
1. Othmar Kyas: *ATM Networks*. International Thomson Publishing, 1995. ISBN: 978-1-850-32128-6
1. “[InfiniBand FAQ](http://www.mellanox.com/related-docs/whitepapers/InfiniBandFAQ_FQ_100.pdf),” Mellanox Technologies, December 22, 2014.
1. Jose Renato Santos, Yoshio Turner, and G. (John) Janakiraman: “[End-to-End Congestion Control for InfiniBand](http://www.hpl.hp.com/techreports/2002/HPL-2002-359.pdf),” at *22nd Annual Joint Conference of the IEEE Computer and Communications Societies* (INFOCOM), April 2003. Also published by HP Laboratories Palo Alto, Tech Report HPL-2002-359. [doi:10.1109/INFCOM.2003.1208949](http://dx.doi.org/10.1109/INFCOM.2003.1208949)
1. Ulrich Windl, David Dalton, Marc Martinec, and Dale R. Worley: “[The NTP FAQ and HOWTO](http://www.ntp.org/ntpfaq/NTP-a-faq.htm),” *ntp.org*, November 2006.
1. John Graham-Cumming: “[How and why the leap second affected Cloudflare DNS](https://blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/),” *blog.cloudflare.com*, January 1, 2017.
1. David Holmes: “[Inside the Hotspot VM: Clocks, Timers and Scheduling Events – Part I – Windows](https://web.archive.org/web/20160308031939/https://blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks),” *blogs.oracle.com*, October 2, 2006.
1. Steve Loughran: “[Time on Multi-Core, Multi-Socket Servers](http://steveloughran.blogspot.co.uk/2015/09/time-on-multi-core-multi-socket-servers.html),” *steveloughran.blogspot.co.uk*, September 17, 2015.
1. James C. Corbett, Jeffrey Dean, Michael Epstein, et al.: “[Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/),” at *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
1. M. Caporaloni and R. Ambrosini: “[How Closely Can a Personal Computer Clock Track the UTC Timescale Via the Internet?](https://iopscience.iop.org/0143-0807/23/4/103/),” *European Journal of Physics*, volume 23, number 4, pages L17–L21, June 2012. [doi:10.1088/0143-0807/23/4/103](http://dx.doi.org/10.1088/0143-0807/23/4/103)
1. Nelson Minar: “[A Survey of the NTP Network](http://alumni.media.mit.edu/~nelson/research/ntp-survey99/),” *alumni.media.mit.edu*, December 1999.
1. Viliam Holub: “[Synchronizing Clocks in a Cassandra Cluster Pt. 1 – The Problem](https://blog.rapid7.com/2014/03/14/synchronizing-clocks-in-a-cassandra-cluster-pt-1-the-problem/),” *blog.rapid7.com*, March 14, 2014.
1. Poul-Henning Kamp: “[The One-Second War (What Time Will You Die?)](http://queue.acm.org/detail.cfm?id=1967009),” *ACM Queue*, volume 9, number 4, pages 44–48, April 2011. [doi:10.1145/1966989.1967009](http://dx.doi.org/10.1145/1966989.1967009)
1. Nelson Minar: “[Leap Second Crashes Half the Internet](http://www.somebits.com/weblog/tech/bad/leap-second-2012.html),” *somebits.com*, July 3, 2012.
1. Christopher Pascoe: “[Time, Technology and Leaping Seconds](http://googleblog.blogspot.co.uk/2011/09/time-technology-and-leaping-seconds.html),” *googleblog.blogspot.co.uk*, September 15, 2011.
1. Mingxue Zhao and Jeff Barr: “[Look Before You Leap – The Coming Leap Second and AWS](https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/),” *aws.amazon.com*, May 18, 2015.
1. Darryl Veitch and Kanthaiah Vijayalayan: “[Network Timing and the 2015 Leap Second](https://tklab.feit.uts.edu.au/~darryl/Publications/LeapSecond_camera.pdf),” at *17th International Conference on Passive and Active Measurement* (PAM), April 2016. [doi:10.1007/978-3-319-30505-9_29](http://dx.doi.org/10.1007/978-3-319-30505-9_29)
1. “[Timekeeping in VMware Virtual Machines](https://www.vmware.com/content/dam/digitalmarketing/vmware/en/pdf/techpaper/Timekeeping-In-VirtualMachines.pdf),” Information Guide, VMware, Inc., December 2011.
1. “[MiFID II / MiFIR: Regulatory Technical and Implementing Standards – Annex I (Draft)](https://www.esma.europa.eu/sites/default/files/library/2015/11/2015-esma-1464_annex_i_-_draft_rts_and_its_on_mifid_ii_and_mifir.pdf),” European Securities and Markets Authority, Report ESMA/2015/1464, September 2015.
1. Luke Bigum: “[Solving MiFID II Clock Synchronisation With Minimum Spend (Part 1)](https://web.archive.org/web/20170704030310/https://www.lmax.com/blog/staff-blogs/2015/11/27/solving-mifid-ii-clock-synchronisation-minimum-spend-part-1/),” *lmax.com*, November 27, 2015.
1. Kyle Kingsbury: “[Call Me Maybe: Cassandra](https://aphyr.com/posts/294-call-me-maybe-cassandra/),” *aphyr.com*, September 24, 2013.
1. John Daily: “[Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/),” *riak.com*, November 12, 2013.
1. Kyle Kingsbury: “[The Trouble with Timestamps](https://aphyr.com/posts/299-the-trouble-with-timestamps),” *aphyr.com*, October 12, 2013.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Sandeep Kulkarni, Murat Demirbas, Deepak Madeppa, et al.: “[Logical Physical Clocks and Consistent Snapshots in Globally Distributed Databases](http://www.cse.buffalo.edu/tech-reports/2014-04.pdf),” State University of New York at Buffalo, Computer Science and Engineering Technical Report 2014-04, May 2014.
1. Justin Sheehy: “[There Is No Now: Problems With Simultaneity in Distributed Systems](https://queue.acm.org/detail.cfm?id=2745385),” *ACM Queue*, volume 13, number 3, pages 36–41, March 2015. [doi:10.1145/2733108](http://dx.doi.org/10.1145/2733108)
1. Murat Demirbas: “[Spanner: Google's Globally-Distributed Database](http://muratbuffalo.blogspot.co.uk/2013/07/spanner-googles-globally-distributed_4.html),” *muratbuffalo.blogspot.co.uk*, July 4, 2013.
1. Dahlia Malkhi and Jean-Philippe Martin: “[Spanner's Concurrency Control](http://www.cs.cornell.edu/~ie53/publications/DC-col51-Sep13.pdf),” *ACM SIGACT News*, volume 44, number 3, pages 73–77, September 2013. [doi:10.1145/2527748.2527767](http://dx.doi.org/10.1145/2527748.2527767)
1. Manuel Bravo, Nuno Diegues, Jingna Zeng, et al.: “[On the Use of Clocks to Enforce Consistency in the Cloud](http://sites.computer.org/debull/A15mar/p18.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 1, pages 18–31, March 2015.
1. Spencer Kimball: “[Living Without Atomic Clocks](http://www.cockroachlabs.com/blog/living-without-atomic-clocks/),” *cockroachlabs.com*, February 17, 2016.
1. Cary G. Gray and David R. Cheriton: “[Leases: An Efficient Fault-Tolerant Mechanism for Distributed File Cache Consistency](https://web.archive.org/web/20230325205928/http://web.stanford.edu/class/cs240/readings/89-leases.pdf),” at *12th ACM Symposium on Operating Systems Principles* (SOSP), December 1989. [doi:10.1145/74850.74870](http://dx.doi.org/10.1145/74850.74870)
1. Todd Lipcon: “[Avoiding Full GCs in Apache HBase with MemStore-Local Allocation Buffers: Part 1](https://web.archive.org/web/20121101040711/http://blog.cloudera.com/blog/2011/02/avoiding-full-gcs-in-hbase-with-memstore-local-allocation-buffers-part-1/),” *blog.cloudera.com*, February 24, 2011.
1. Martin Thompson: “[Java Garbage Collection Distilled](http://mechanical-sympathy.blogspot.co.uk/2013/07/java-garbage-collection-distilled.html),” *mechanical-sympathy.blogspot.co.uk*, July 16, 2013.
1. Alexey Ragozin: “[How to Tame Java GC Pauses? Surviving 16GiB Heap and Greater](https://dzone.com/articles/how-tame-java-gc-pauses),” *dzone.com*, June 28, 2011.
1. Christopher Clark, Keir Fraser, Steven Hand, et al.: “[Live Migration of Virtual Machines](http://www.cl.cam.ac.uk/research/srg/netos/papers/2005-nsdi-migration.pdf),” at *2nd USENIX Symposium on Networked Systems Design & Implementation* (NSDI), May 2005.
1. Mike Shaver: “[fsyncers and Curveballs](https://web.archive.org/web/20220107141023/http://shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/),” *shaver.off.net*, May 25, 2008.
1. Zhenyun Zhuang and Cuong Tran: “[Eliminating Large JVM GC Pauses Caused by Background IO Traffic](https://engineering.linkedin.com/blog/2016/02/eliminating-large-jvm-gc-pauses-caused-by-background-io-traffic),” *engineering.linkedin.com*, February 10, 2016.
1. David Terei and Amit Levy: “[Blade: A Data Center Garbage Collector](http://arxiv.org/pdf/1504.02578.pdf),” arXiv:1504.02578, April 13, 2015.
1. Martin Maas, Tim Harris, Krste Asanović, and John Kubiatowicz: “[Trash Day: Coordinating Garbage Collection in Distributed Systems](https://timharris.uk/papers/2015-hotos.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. “[Predictable Low Latency](http://cdn2.hubspot.net/hubfs/1624455/Website_2016/content/White%20papers/Cinnober%20on%20GC%20pause%20free%20Java%20applications.pdf),” Cinnober Financial Technology AB, *cinnober.com*, November 24, 2013.
1. Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
1. Flavio P. Junqueira and Benjamin Reed: *ZooKeeper: Distributed Process Coordination*. O'Reilly Media, 2013. ISBN: 978-1-449-36130-3
1. Enis Söztutar: “[HBase and HDFS: Understanding Filesystem Usage in HBase](http://www.slideshare.net/enissoz/hbase-and-hdfs-understanding-filesystem-usage),” at *HBaseCon*, June 2013.
1. Caitie McCaffrey: “[Clients Are Jerks: AKA How Halo 4 DoSed the Services at Launch & How We Survived](https://web.archive.org/web/20230128065851/http://caitiem.com/2015/06/23/clients-are-jerks-aka-how-halo-4-dosed-the-services-at-launch-how-we-survived/),” *caitiem.com*, June 23, 2015.
1. Leslie Lamport, Robert Shostak, and Marshall Pease: “[The Byzantine Generals Problem](https://www.microsoft.com/en-us/research/publication/byzantine-generals-problem/),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 4, number 3, pages 382–401, July 1982. [doi:10.1145/357172.357176](http://dx.doi.org/10.1145/357172.357176)
1. Jim N. Gray: “[Notes on Data Base Operating Systems](http://jimgray.azurewebsites.net/papers/dbos.pdf),” in *Operating Systems: An Advanced Course*, Lecture Notes in Computer Science, volume 60, edited by R. Bayer, R. M. Graham, and G. Seegmüller, pages 393–481, Springer-Verlag, 1978. ISBN: 978-3-540-08755-7
1. Brian Palmer: “[How Complicated Was the Byzantine Empire?](http://www.slate.com/articles/news_and_politics/explainer/2011/10/the_byzantine_tax_code_how_complicated_was_byzantium_anyway_.html),” *slate.com*, October 20, 2011.
1. Leslie Lamport: “[My Writings](http://lamport.azurewebsites.net/pubs/pubs.html),” *lamport.azurewebsites.net*, December 16, 2014. This page can be found by searching the web for the 23-character string obtained by removing the hyphens from the string `allla-mport-spubso-ntheweb`.
1. John Rushby: “[Bus Architectures for Safety-Critical Embedded Systems](http://www.csl.sri.com/papers/emsoft01/emsoft01.pdf),” at *1st International Workshop on Embedded Software* (EMSOFT), October 2001.
1. Jake Edge: “[ELC: SpaceX Lessons Learned](http://lwn.net/Articles/540368/),” *lwn.net*, March 6, 2013.
1. Andrew Miller and Joseph J. LaViola, Jr.: “[Anonymous Byzantine Consensus from Moderately-Hard Puzzles: A Model for Bitcoin](http://nakamotoinstitute.org/static/docs/anonymous-byzantine-consensus.pdf),” University of Central Florida, Technical Report CS-TR-14-01, April 2014.
1. James Mickens: “[The Saddest Moment](https://www.usenix.org/system/files/login-logout_1305_mickens.pdf),” *USENIX ;login: logout*, May 2013.
1. Evan Gilman: “[The Discovery of Apache ZooKeeper’s Poison Packet](http://www.pagerduty.com/blog/the-discovery-of-apache-zookeepers-poison-packet/),” *pagerduty.com*, May 7, 2015.
1. Jonathan Stone and Craig Partridge: “[When the CRC and TCP Checksum Disagree](https://web.archive.org/web/20220818235232/https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.27.7611&rep=rep1&type=pdf),” at *ACM Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication* (SIGCOMM), August 2000. [doi:10.1145/347059.347561](http://dx.doi.org/10.1145/347059.347561)
1. Evan Jones: “[How Both TCP and Ethernet Checksums Fail](http://www.evanjones.ca/tcp-and-ethernet-checksums-fail.html),” *evanjones.ca*, October 5, 2015.
1. Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer: “[Consensus in the Presence of Partial Synchrony](https://dl.acm.org/doi/10.1145/42282.42283),” *Journal of the ACM*, volume 35, number 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](http://dx.doi.org/10.1145/42282.42283)
1. Peter Bailis and Ali Ghodsi: “[Eventual Consistency Today: Limitations, Extensions, and Beyond](http://queue.acm.org/detail.cfm?id=2462076),” *ACM Queue*, volume 11, number 3, pages 55-63, March 2013. [doi:10.1145/2460276.2462076](http://dx.doi.org/10.1145/2460276.2462076)
1. Bowen Alpern and Fred B. Schneider: “[Defining Liveness](https://www.cs.cornell.edu/fbs/publications/DefLiveness.pdf),” *Information Processing Letters*, volume 21, number 4, pages 181–185, October 1985. [doi:10.1016/0020-0190(85)90056-0](http://dx.doi.org/10.1016/0020-0190(85)90056-0)
1. Flavio P. Junqueira: “[Dude, Where’s My Metadata?](https://web.archive.org/web/20230604215314/https://fpj.systems/2015/05/28/dude-wheres-my-metadata/),” *fpj.me*, May 28, 2015.
1. Scott Sanders: “[January 28th Incident Report](https://github.com/blog/2106-january-28th-incident-report),” *github.com*, February 3, 2016.
1. Jay Kreps: “[A Few Notes on Kafka and Jepsen](http://blog.empathybox.com/post/62279088548/a-few-notes-on-kafka-and-jepsen),” *blog.empathybox.com*, September 25, 2013.
1. Thanh Do, Mingzhe Hao, Tanakorn Leesatapornwongsa, et al.: “[Limplock: Understanding the Impact of Limpware on Scale-out Cloud Systems](http://ucare.cs.uchicago.edu/pdf/socc13-limplock.pdf),” at *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523627](http://dx.doi.org/10.1145/2523616.2523627)
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.

[^譯著1]: 原詩為：Hey I just met you. The network’s laggy. But here’s my data. So store it maybe. 應改編自《Call Me Maybe》歌詞：Hey, I just met you, And this is crazy, But here's my number, So call me, maybe?

================================================
FILE: content/v1_tw/ch9.md
================================================
---
title: "第九章：一致性與共識"
linkTitle: "9. 一致性與共識"
weight: 209
math: true
breadcrumbs: false
---


![](/map/ch09.png)

> 好死還是賴活著？
> —— Jay Kreps, 關於 Kafka 與 Jepsen 的若干筆記 (2013)


正如 [第八章](/v1_tw/ch8) 所討論的，分散式系統中的許多事情可能會出錯。處理這種故障的最簡單方法是簡單地讓整個服務失效，並向用戶顯示錯誤訊息。如果無法接受這個解決方案，我們就需要找到容錯的方法 —— 即使某些內部元件出現故障，服務也能正常執行。

在本章中，我們將討論構建容錯分散式系統的演算法和協議的一些例子。我們將假設 [第八章](/v1_tw/ch8) 的所有問題都可能發生：網路中的資料包可能會丟失、重新排序、重複推送或任意延遲；時鐘只是盡其所能地近似；且節點可以暫停（例如，由於垃圾收集）或隨時崩潰。

構建容錯系統的最好方法，是找到一些帶有實用保證的通用抽象，實現一次，然後讓應用依賴這些保證。這與 [第七章](/v1_tw/ch7) 中的事務處理方法相同：透過使用事務，應用可以假裝沒有崩潰（原子性），沒有其他人同時訪問資料庫（隔離），儲存裝置是完全可靠的（永續性）。即使發生崩潰，競態條件和磁碟故障，事務抽象隱藏了這些問題，因此應用不必擔心它們。

現在我們將繼續沿著同樣的路線前進，尋求可以讓應用忽略分散式系統部分問題的抽象概念。例如，分散式系統最重要的抽象之一就是 **共識（consensus）**：**就是讓所有的節點對某件事達成一致**。正如我們在本章中將會看到的那樣，要可靠地達成共識，且不被網路故障和程序故障所影響，是一個令人驚訝的棘手問題。

一旦達成共識，應用可以將其用於各種目的。例如，假設你有一個單主複製的資料庫。如果主庫掛掉，並且需要故障切換到另一個節點，剩餘的資料庫節點可以使用共識來選舉新的領導者。正如在 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)” 中所討論的那樣，重要的是隻有一個領導者，且所有的節點都認同其領導。如果兩個節點都認為自己是領導者，這種情況被稱為 **腦裂（split brain）**，它經常會導致資料丟失。正確實現共識有助於避免這種問題。

在本章後面的 “[分散式事務與共識](#分散式事務與共識)” 中，我們將研究解決共識和相關問題的演算法。但首先，我們需要探索可以在分散式系統中提供的保證和抽象的範圍。

我們需要了解可以做什麼和不可以做什麼的範圍：在某些情況下，系統可以容忍故障並繼續工作；在其他情況下，這是不可能的。我們將深入研究什麼可能而什麼不可能的限制，既透過理論證明，也透過實際實現。我們將在本章中概述這些基本限制。

分散式系統領域的研究人員幾十年來一直在研究這些主題，所以有很多資料 —— 我們只能介紹一些皮毛。在本書中，我們沒有空間去詳細介紹形式模型和證明的細節，所以我們會按照直覺來介紹。如果你有興趣，參考文獻可以提供更多的深度。


## 一致性保證

在 “[複製延遲問題](/v1_tw/ch5#複製延遲問題)” 中，我們看到了資料庫複製中發生的一些時序問題。如果你在同一時刻檢視兩個資料庫節點，則可能在兩個節點上看到不同的資料，因為寫請求在不同的時間到達不同的節點。無論資料庫使用何種複製方法（單主複製、多主複製或無主複製），都會出現這些不一致情況。

大多數複製的資料庫至少提供了 **最終一致性**，這意味著如果你停止向資料庫寫入資料並等待一段不確定的時間，那麼最終所有的讀取請求都會返回相同的值【1】。換句話說，不一致性是暫時的，最終會自行解決（假設網路中的任何故障最終都會被修復）。最終一致性的一個更好的名字可能是 **收斂（convergence）**，因為我們預計所有的副本最終會收斂到相同的值【2】。

然而，這是一個非常弱的保證 —— 它並沒有說什麼時候副本會收斂。在收斂之前，讀操作可能會返回任何東西或什麼都沒有【1】。例如，如果你寫入了一個值，然後立即再次讀取，這並不能保證你能看到剛才寫入的值，因為讀請求可能會被路由到另外的副本上。（請參閱 “[讀己之寫](/v1_tw/ch5#讀己之寫)” ）。

對於應用開發人員而言，最終一致性是很困難的，因為它與普通單執行緒程式中變數的行為有很大區別。對於後者，如果將一個值賦給一個變數，然後很快地再次讀取，不可能讀到舊的值，或者讀取失敗。資料庫表面上看起來像一個你可以讀寫的變數，但實際上它有更複雜的語義【3】。

在與只提供弱保證的資料庫打交道時，你需要始終意識到它的侷限性，而不是意外地作出太多假設。錯誤往往是微妙的，很難找到，也很難測試，因為應用可能在大多數情況下執行良好。當系統出現故障（例如網路中斷）或高併發時，最終一致性的邊緣情況才會顯現出來。

本章將探索資料系統可能選擇提供的更強一致性模型。它不是免費的：具有較強保證的系統可能會比保證較差的系統具有更差的效能或更少的容錯性。儘管如此，更強的保證能夠吸引人，因為它們更容易用對。只有見過不同的一致性模型後，才能更好地決定哪一個最適合自己的需求。

**分散式一致性模型** 和我們之前討論的事務隔離級別的層次結構有一些相似之處【4,5】（請參閱 “[弱隔離級別](/v1_tw/ch7#弱隔離級別)”）。儘管兩者有一部分內容重疊，但它們大多是無關的問題：事務隔離主要是為了 **避免由於同時執行事務而導致的競爭狀態**，而分散式一致性主要關於 **在面對延遲和故障時如何協調副本間的狀態**。

本章涵蓋了廣泛的話題，但我們將會看到這些領域實際上是緊密聯絡在一起的：

* 首先看一下常用的 **最強一致性模型** 之一，**線性一致性（linearizability）**，並考察其優缺點。
* 然後我們將檢查分散式系統中 [**事件順序**](#順序保證) 的問題，特別是因果關係和全域性順序的問題。
* 在第三節的（“[分散式事務與共識](#分散式事務與共識)”）中將探討如何原子地提交分散式事務，這將最終引領我們走向共識問題的解決方案。


## 線性一致性

在 **最終一致** 的資料庫中，如果你在同一時刻問兩個不同副本相同的問題，可能會得到兩個不同的答案。這很讓人困惑。如果資料庫可以提供只有一個副本的假象（即，只有一個資料副本），那麼事情就簡單太多了。那麼每個客戶端都會有相同的資料檢視，且不必擔心複製滯後了。

這就是 **線性一致性（linearizability）** 背後的想法【6】（也稱為 **原子一致性（atomic consistency）**【7】，**強一致性（strong consistency）**，**立即一致性（immediate consistency）** 或 **外部一致性（external consistency）**【8】）。線性一致性的精確定義相當微妙，我們將在本節的剩餘部分探討它。但是基本的想法是讓一個系統看起來好像只有一個資料副本，而且所有的操作都是原子性的。有了這個保證，即使實際中可能有多個副本，應用也不需要擔心它們。

在一個線性一致的系統中，只要一個客戶端成功完成寫操作，所有客戶端從資料庫中讀取資料必須能夠看到剛剛寫入的值。要維護資料的單個副本的假象，系統應保障讀到的值是最近的、最新的，而不是來自陳舊的快取或副本。換句話說，線性一致性是一個 **新鮮度保證（recency guarantee）**。為了闡明這個想法，我們來看看一個非線性一致系統的例子。

![](/v1/ddia_0901.png)

**圖 9-1 這個系統是非線性一致的，導致了球迷的困惑**

[圖 9-1](/v1/ddia_0901.png) 展示了一個關於體育網站的非線性一致例子【9】。Alice 和 Bob 正坐在同一個房間裡，都盯著各自的手機，關注著 2014 年 FIFA 世界盃決賽的結果。在最後得分公佈後，Alice 重新整理頁面，看到宣佈了獲勝者，並興奮地告訴 Bob。Bob 難以置信地重新整理了自己的手機，但他的請求路由到了一個落後的資料庫副本上，手機顯示比賽仍在進行。

如果 Alice 和 Bob 在同一時間重新整理並獲得了兩個不同的查詢結果，也許就沒有那麼令人驚訝了。因為他們不知道伺服器處理他們請求的精確時刻。然而 Bob 是在聽到 Alice 驚呼最後得分 **之後**，點選了重新整理按鈕（啟動了他的查詢），因此他希望查詢結果至少與 Alice 一樣新鮮。但他的查詢返回了陳舊結果，這一事實違背了線性一致性的要求。

### 什麼使得系統線性一致？

線性一致性背後的基本思想很簡單：使系統看起來好像只有一個資料副本。然而確切來講，實際上有更多要操心的地方。為了更好地理解線性一致性，讓我們再看幾個例子。

[圖 9-2](/v1/ddia_0902.png) 顯示了三個客戶端在線性一致資料庫中同時讀寫相同的鍵 `x`。在分散式系統文獻中，`x` 被稱為 **暫存器（register）**，例如，它可以是鍵值儲存中的一個 **鍵**，關係資料庫中的一 **行**，或文件資料庫中的一個 **文件**。

![](/v1/ddia_0902.png)

**圖 9-2 如果讀取請求與寫入請求併發，則可能會返回舊值或新值**

為了簡單起見，[圖 9-2](/v1/ddia_0902.png) 採用了使用者請求的視角，而不是資料庫內部的視角。每個橫柱都是由客戶端發出的請求，其中柱頭是請求傳送的時刻，柱尾是客戶端收到響應的時刻。因為網路延遲變化無常，客戶端不知道資料庫處理其請求的精確時間 —— 只知道它發生在傳送請求和接收響應之間的某個時刻。[^i]

[^i]: 這個圖的一個微妙的細節是它假定存在一個全域性時鐘，由水平軸表示。雖然真實的系統通常沒有準確的時鐘（請參閱 “[不可靠的時鐘](/v1_tw/ch8#不可靠的時鐘)”），但這種假設是允許的：為了分析分散式演算法，我們可以假設存在一個精確的全域性時鐘，不過演算法無法訪問它【47】。演算法只能看到由石英振盪器和 NTP 產生的對真實時間的逼近。

在這個例子中，暫存器有兩種型別的操作：

* $read(x)⇒v$ 表示客戶端請求讀取暫存器 `x` 的值，資料庫返回值 `v`。
* $write(x,v)⇒r$ 表示客戶端請求將暫存器 `x` 設定為值 `v` ，資料庫返回響應 `r` （可能正確，可能錯誤）。

在 [圖 9-2](/v1/ddia_0902.png) 中，`x` 的值最初為 `0`，客戶端 C 執行寫請求將其設定為 `1`。發生這種情況時，客戶端 A 和 B 反覆輪詢資料庫以讀取最新值。A 和 B 的請求可能會收到怎樣的響應？

* 客戶端 A 的第一個讀操作，完成於寫操作開始之前，因此必須返回舊值 `0`。
* 客戶端 A 的最後一個讀操作，開始於寫操作完成之後。如果資料庫是線性一致性的，它必然返回新值 `1`：因為讀操作和寫操作一定是在其各自的起止區間內的某個時刻被處理。如果在寫入結束後開始讀取，則讀取處理一定發生在寫入完成之後，因此它必須看到寫入的新值。
* 與寫操作在時間上重疊的任何讀操作，可能會返回 `0` 或 `1` ，因為我們不知道讀取時，寫操作是否已經生效。這些操作是 **併發（concurrent）** 的。

但是，這還不足以完全描述線性一致性：如果與寫入同時發生的讀取可以返回舊值或新值，那麼讀者可能會在寫入期間看到數值在舊值和新值之間來回翻轉。這個系統對 “單一資料副本” 的模擬還不是我們所期望的。[^ii]

[^ii]: 如果讀取（與寫入同時發生時）可能返回舊值或新值，則稱該暫存器為 **常規暫存器（regular register）**【7,25】

為了使系統線性一致，我們需要新增另一個約束，如 [圖 9-3](/v1/ddia_0903.png) 所示

![](/v1/ddia_0903.png)

**圖 9-3 任何一個讀取返回新值後，所有後續讀取（在相同或其他客戶端上）也必須返回新值。**

在一個線性一致的系統中，我們可以想象，必定存在某個時間點（在寫操作的開始和結束之間），`x` 的值在這一刻從 `0` 原子地翻轉到 `1`。因此，如果一個客戶端的讀取返回新的值 `1`，即使寫操作尚未完成，所有後續讀取也必須返回新值。

[圖 9-3](/v1/ddia_0903.png) 中的箭頭說明了這個時序依賴關係。客戶端 A 是第一個讀取到新值 `1` 的客戶端。在 A 的讀取返回之後，B 開始新的讀取。由於 B 的讀取嚴格發生於 A 的讀取之後，因此即使 C 的寫入仍在進行中，也必須返回 `1`（與 [圖 9-1](/v1/ddia_0901.png) 中的 Alice 和 Bob 的情況相同：在 Alice 讀取新值之後，Bob 也希望讀取新的值）。

我們可以進一步細化這個時序圖，展示每個操作是如何在特定時刻原子性生效的。[圖 9-4](/v1/ddia_0904.png) 顯示了一個更複雜的例子【10】。

在 [圖 9-4](/v1/ddia_0904.png) 中，除了讀寫之外，還增加了第三種類型的操作：

* $cas(x, v_{old}, v_{new})⇒r$ 表示客戶端請求進行原子性的 [**比較與設定**](/v1_tw/ch7#比較並設定（CAS）) 操作。如果暫存器 $x$ 的當前值等於 $v_{old}$ ，則應該原子地設定為 $v_{new}$ 。如果 $x$ 不等於 $v_{old}$ ，則操作應該保持暫存器不變並返回一個錯誤。$r$ 是資料庫的響應（正確或錯誤）。

[圖 9-4](/v1/ddia_0904.png) 中的每個操作都在我們認為操作被執行的時候用豎線標出（在每個操作的橫柱之內）。這些標記按順序連在一起，其結果必須是一個有效的暫存器讀寫序列（**每次讀取都必須返回最近一次寫入設定的值**）。

線性一致性的要求是，操作標記的連線總是按時間（從左到右）向前移動，而不是向後移動。這個要求確保了我們之前討論的新鮮度保證：一旦新的值被寫入或讀取，所有後續的讀都會看到寫入的值，直到它被再次覆蓋。

![](/v1/ddia_0904.png)

**圖 9-4 將讀取和寫入看起來已經生效的時間點進行視覺化。客戶端 B 的最後一次讀取不是線性一致的**

[圖 9-4](/v1/ddia_0904.png) 中有一些有趣的細節需要指出：

* 第一個客戶端 B 傳送一個讀取 `x` 的請求，然後客戶端 D 傳送一個請求將 `x` 設定為 `0`，然後客戶端 A 傳送請求將 `x` 設定為 `1`。然而，返回給 B 的讀取值為 `1`（由 A 寫入的值）。這是可以的：這意味著資料庫首先處理 D 的寫入，然後是 A 的寫入，最後是 B 的讀取。雖然這不是請求傳送的順序，但這是一個可以接受的順序，因為這三個請求是併發的。也許 B 的讀請求在網路上略有延遲，所以它在兩次寫入之後才到達資料庫。

* 在客戶端 A 從資料庫收到響應之前，客戶端 B 的讀取返回 `1` ，表示寫入值 `1` 已成功。這也是可以的：這並不意味著在寫之前讀到了值，這只是意味著從資料庫到客戶端 A 的正確響應在網路中略有延遲。

* 此模型不假設有任何事務隔離：另一個客戶端可能隨時更改值。例如，C 首先讀取到 `1` ，然後讀取到 `2` ，因為兩次讀取之間的值被 B 所更改。可以使用原子 **比較並設定（cas）** 操作來檢查該值是否未被另一客戶端同時更改：B 和 C 的 **cas** 請求成功，但是 D 的 **cas** 請求失敗（在資料庫處理它時，`x` 的值不再是 `0` ）。

* 客戶 B 的最後一次讀取（陰影條柱中）不是線性一致的。該操作與 C 的 **cas** 寫操作併發（它將 `x` 從 `2` 更新為 `4` ）。在沒有其他請求的情況下，B 的讀取返回 `2` 是可以的。然而，在 B 的讀取開始之前，客戶端 A 已經讀取了新的值 `4`  ，因此不允許 B 讀取比 A 更舊的值。再次，與 [圖 9-1](/v1/ddia_0901.png) 中的 Alice 和 Bob 的情況相同。

這就是線性一致性背後的直覺。正式的定義【6】更準確地描述了它。透過記錄所有請求和響應的時序，並檢查它們是否可以排列成有效的順序，就可以測試一個系統的行為是否線性一致（儘管在計算上是昂貴的）【11】。


> [!NOTE] 線性一致性與可序列化
>
> **線性一致性** 容易和 [**可序列化**](/v1_tw/ch7#可序列化) 相混淆，因為兩個詞似乎都是類似 “可以按順序排列” 的東西。但它們是兩種完全不同的保證，區分兩者非常重要：
>
> ***可序列化***
> : **可序列化（Serializability）** 是事務的隔離屬性，每個事務可以讀寫多個物件（行，文件，記錄）—— 請參閱 “[單物件和多物件操作](/v1_tw/ch7#單物件和多物件操作)”。它確保事務的行為，與它們按照 **某種** 順序依次執行的結果相同（每個事務在下一個事務開始之前執行完成）。這種執行順序可以與事務實際執行的順序不同【12】。
>
> ***線性一致性***
> : **線性一致性（Linearizability）** 是讀取和寫入暫存器（單個物件）的 **新鮮度保證**。它不會將操作組合為事務，因此它也不會阻止寫入偏差等問題（請參閱 “[寫入偏差和幻讀](/v1_tw/ch7#寫入偏差與幻讀)”），除非採取其他措施（例如 [物化衝突](/v1_tw/ch7#物化衝突)）。
>
> 一個數據庫可以提供可序列化和線性一致性，這種組合被稱為嚴格的可序列化或 **強的單副本可序列化（strong-1SR）**【4,13】。基於兩階段鎖定的可序列化實現（請參閱 “[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)” 一節）或 **真的序列執行**（請參閱 “[真的序列執行](/v1_tw/ch7#真的序列執行)”一節）通常是線性一致性的。
>
> 但是，可序列化的快照隔離（請參閱 “[可序列化快照隔離](/v1_tw/ch7#可序列化快照隔離)”）不是線性一致性的：按照設計，它從一致的快照中進行讀取，以避免讀者和寫者之間的鎖競爭。一致性快照的要點就在於 **它不會包括該快照之後的寫入**，因此從快照讀取不是線性一致性的。


### 依賴線性一致性

線性一致性在什麼情況下有用？觀看體育比賽的最後得分可能是一個輕率的例子：滯後了幾秒鐘的結果不太可能在這種情況下造成任何真正的傷害。然而對於少數領域，線性一致性是系統正確工作的一個重要條件。

#### 鎖定和領導選舉

一個使用單主複製的系統，需要確保領導者真的只有一個，而不是幾個（腦裂）。一種選擇領導者的方法是使用鎖：每個節點在啟動時嘗試獲取鎖，成功者成為領導者【14】。不管這個鎖是如何實現的，它必須是線性一致的：所有節點必須就哪個節點擁有鎖達成一致，否則就沒用了。

諸如 Apache ZooKeeper 【15】和 etcd 【16】之類的協調服務通常用於實現分散式鎖和領導者選舉。它們使用共識演算法，以容錯的方式實現線性一致的操作（在本章後面的 “[容錯共識](#容錯共識)” 中討論此類演算法）[^iii]。還有許多微妙的細節來正確地實現鎖和領導者選舉（例如，請參閱 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)” 中的防護問題），而像 Apache Curator 【17】這樣的庫則透過在 ZooKeeper 之上提供更高級別的配方來提供幫助。但是，線性一致性儲存服務是這些協調任務的基礎。

[^iii]: 嚴格地說，ZooKeeper 和 etcd 提供線性一致性的寫操作，但讀取可能是陳舊的，因為預設情況下，它們可以由任何一個副本提供服務。你可以選擇請求線性一致性讀取：etcd 稱之為 **法定人數讀取（quorum read）**【16】，而在 ZooKeeper 中，你需要在讀取之前呼叫 `sync()`【15】。請參閱 “[使用全序廣播實現線性一致的儲存](#使用全序廣播實現線性一致的儲存)”。

分散式鎖也在一些分散式資料庫（如 Oracle Real Application Clusters（RAC）【18】）中有更細粒度級別的使用。RAC 對每個磁碟頁面使用一個鎖，多個節點共享對同一個磁碟儲存系統的訪問許可權。由於這些線性一致的鎖處於事務執行的關鍵路徑上，RAC 部署通常具有用於資料庫節點之間通訊的專用叢集互連網路。

#### 約束和唯一性保證

唯一性約束在資料庫中很常見：例如，使用者名稱或電子郵件地址必須唯一標識一個使用者，而在檔案儲存服務中，不能有兩個具有相同路徑和檔名的檔案。如果要在寫入資料時強制執行此約束（例如，如果兩個人試圖同時建立一個具有相同名稱的使用者或檔案，其中一個將返回一個錯誤），則需要線性一致性。

這種情況實際上類似於一個鎖：當一個使用者註冊你的服務時，可以認為他們獲得了所選使用者名稱的 “鎖”。該操作與原子性的比較與設定（CAS）非常相似：將使用者名稱賦予宣告它的使用者，前提是使用者名稱尚未被使用。

如果想要確保銀行賬戶餘額永遠不會為負數，或者不會出售比倉庫裡的庫存更多的物品，或者兩個人不會都預定了航班或劇院裡同一時間的同一個位置，也會出現類似的問題。這些約束條件都要求所有節點都同意一個最新的值（賬戶餘額，庫存水平，座位佔用率）。

在實際應用中，寬鬆地處理這些限制有時是可以接受的（例如，如果航班超額預訂，你可以將客戶轉移到不同的航班併為其提供補償）。在這種情況下，可能不需要線性一致性，我們將在 “[及時性與完整性](/v1_tw/ch12#及時性與完整性)” 中討論這種寬鬆的約束。

然而，一個硬性的唯一性約束（關係型資料庫中常見的那種）需要線性一致性。其他型別的約束，如外部索引鍵或屬性約束，可以不需要線性一致性【19】。

#### 跨通道的時序依賴

注意 [圖 9-1](/v1/ddia_0901.png) 中的一個細節：如果 Alice 沒有驚呼得分，Bob 就不會知道他的查詢結果是陳舊的。他會在幾秒鐘之後再次重新整理頁面，並最終看到最後的分數。由於系統中存在額外的通道（Alice 的聲音傳到了 Bob 的耳朵中），線性一致性的違背才被注意到。

計算機系統也會出現類似的情況。例如，假設有一個網站，使用者可以上傳照片，一個後臺程序會調整照片大小，降低解析度以加快下載速度（縮圖）。該系統的架構和資料流如 [圖 9-5](/v1/ddia_0905.png) 所示。

影像縮放器需要明確的指令來執行尺寸縮放作業，指令是 Web 伺服器透過訊息佇列傳送的（請參閱 [第十一章](/v1_tw/ch11)）。Web 伺服器不會將整個照片放在佇列中，因為大多數訊息代理都是針對較短的訊息而設計的，而一張照片的空間佔用可能達到幾兆位元組。取而代之的是，首先將照片寫入檔案儲存服務，寫入完成後再將給縮放器的指令放入訊息佇列。

![](/v1/ddia_0905.png)

**圖 9-5 Web 伺服器和影像縮放器透過檔案儲存和訊息佇列進行通訊，開啟競爭條件的可能性。**

如果檔案儲存服務是線性一致的，那麼這個系統應該可以正常工作。如果它不是線性一致的，則存在競爭條件的風險：訊息佇列（[圖 9-5](/v1/ddia_0905.png) 中的步驟 3 和 4）可能比儲存服務內部的複製（replication）更快。在這種情況下，當縮放器讀取影像（步驟 5）時，可能會看到影像的舊版本，或者什麼都沒有。如果它處理的是舊版本的影像，則檔案儲存中的全尺寸圖和縮圖就產生了永久性的不一致。

出現這個問題是因為 Web 伺服器和縮放器之間存在兩個不同的通道：檔案儲存與訊息佇列。沒有線性一致性的新鮮性保證，這兩個通道之間的競爭條件是可能的。這種情況類似於 [圖 9-1](/v1/ddia_0901.png)，資料庫複製與 Alice 的嘴到 Bob 耳朵之間的真人音訊通道之間也存在競爭條件。

線性一致性並不是避免這種競爭條件的唯一方法，但它是最容易理解的。如果你可以控制額外通道（例如訊息佇列的例子，而不是在 Alice 和 Bob 的例子），則可以使用在 “[讀己之寫](/v1_tw/ch5#讀己之寫)” 討論過的類似方法，不過會有額外的複雜度代價。

### 實現線性一致的系統

我們已經見到了幾個線性一致性有用的例子，讓我們思考一下，如何實現一個提供線性一致語義的系統。

由於線性一致性本質上意味著 “表現得好像只有一個數據副本，而且所有的操作都是原子的”，所以最簡單的答案就是，真的只用一個數據副本。但是這種方法無法容錯：如果持有該副本的節點失效，資料將會丟失，或者至少無法訪問，直到節點重新啟動。

使系統容錯最常用的方法是使用複製。我們再來回顧 [第五章](/v1_tw/ch5) 中的複製方法，並比較它們是否可以滿足線性一致性：

單主複製（可能線性一致）
: 在具有單主複製功能的系統中（請參閱 “[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”），主庫具有用於寫入的資料的主副本，而追隨者在其他節點上保留資料的備份副本。如果從主庫或同步更新的從庫讀取資料，它們 **可能（potential）** 是線性一致性的 [^iv]。然而，實際上並不是每個單主資料庫都是線性一致性的，無論是因為設計的原因（例如，因為使用了快照隔離）還是因為在併發處理上存在錯誤【10】。

  [^iv]: 對單主資料庫進行分割槽（分片），使得每個分割槽有一個單獨的領導者，不會影響線性一致性，因為線性一致性只是對單一物件的保證。交叉分割槽事務是一個不同的問題（請參閱 “[分散式事務與共識](#分散式事務與共識)”）。

  從主庫讀取依賴一個假設，你確切地知道領導者是誰。正如在 “[真相由多數所定義](/v1_tw/ch8#真相由多數所定義)” 中所討論的那樣，一個節點很可能會認為它是領導者，而事實上並非如此 —— 如果具有錯覺的領導者繼續為請求提供服務，可能違反線性一致性【20】。使用非同步複製，故障切換時甚至可能會丟失已提交的寫入（請參閱 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)”），這同時違反了永續性和線性一致性。

共識演算法（線性一致）
: 一些在本章後面討論的共識演算法，與單主複製類似。然而，共識協議包含防止腦裂和陳舊副本的措施。正是由於這些細節，共識演算法可以安全地實現線性一致性儲存。例如，Zookeeper 【21】和 etcd 【22】就是這樣工作的。

多主複製（非線性一致）
: 具有多主複製的系統通常不是線性一致的，因為它們同時在多個節點上處理寫入，並將其非同步複製到其他節點。因此，它們可能會產生需要被解決的寫入衝突（請參閱 “[處理寫入衝突](/v1_tw/ch5#處理寫入衝突)”）。這種衝突是因為缺少單一資料副本所導致的。

無主複製（也許不是線性一致的）
: 對於無主複製的系統（Dynamo 風格；請參閱 “[無主複製](/v1_tw/ch5#無主複製)”），有時候人們會聲稱透過要求法定人數讀寫（ $w + r > n$ ）可以獲得 “強一致性”。這取決於法定人數的具體配置，以及強一致性如何定義（通常不完全正確）。

  基於日曆時鐘（例如，在 Cassandra 中；請參閱 “[依賴同步時鐘](/v1_tw/ch8#依賴同步時鐘)”）的 “最後寫入勝利” 衝突解決方法幾乎可以確定是非線性一致的，由於時鐘偏差，不能保證時鐘的時間戳與實際事件順序一致。寬鬆的法定人數（請參閱 “[寬鬆的法定人數與提示移交](/v1_tw/ch5#寬鬆的法定人數與提示移交)”）也破壞了線性一致的可能性。即使使用嚴格的法定人數，非線性一致的行為也是可能的，如下節所示。

#### 線性一致性和法定人數

直覺上在 Dynamo 風格的模型中，嚴格的法定人數讀寫應該是線性一致性的。但是當我們有可變的網路延遲時，就可能存在競爭條件，如 [圖 9-6](/v1/ddia_0906.png) 所示。

![](/v1/ddia_0906.png)

**圖 9-6 非線性一致的執行，儘管使用了嚴格的法定人數**

在 [圖 9-6](/v1/ddia_0906.png) 中，$x$ 的初始值為 0，寫入客戶端透過向所有三個副本（ $n = 3, w = 3$ ）傳送寫入將 $x$ 更新為 `1`。客戶端 A 併發地從兩個節點組成的法定人數（ $r = 2$ ）中讀取資料，並在其中一個節點上看到新值 `1` 。客戶端 B 也併發地從兩個不同的節點組成的法定人數中讀取，並從兩個節點中取回了舊值 `0` 。

法定人數條件滿足（ $w + r> n$ ），但是這個執行是非線性一致的：B 的請求在 A 的請求完成後開始，但是 B 返回舊值，而 A 返回新值。（又一次，如同 Alice 和 Bob 的例子 [圖 9-1](/v1/ddia_0901.png)）

有趣的是，透過犧牲效能，可以使 Dynamo 風格的法定人數線性化：讀取者必須在將結果返回給應用之前，同步執行讀修復（請參閱 “[讀修復和反熵](/v1_tw/ch5#讀修復和反熵)”） ，並且寫入者必須在傳送寫入之前，讀取法定數量節點的最新狀態【24,25】。然而，由於效能損失，Riak 不執行同步讀修復【26】。Cassandra 在進行法定人數讀取時，**確實** 在等待讀修復完成【27】；但是由於使用了最後寫入勝利的衝突解決方案，當同一個鍵有多個併發寫入時，將不能保證線性一致性。

而且，這種方式只能實現線性一致的讀寫；不能實現線性一致的比較和設定（CAS）操作，因為它需要一個共識演算法【28】。

總而言之，最安全的做法是：假設採用 Dynamo 風格無主複製的系統不能提供線性一致性。


### 線性一致性的代價

一些複製方法可以提供線性一致性，另一些複製方法則不能，因此深入地探討線性一致性的優缺點是很有趣的。

我們已經在 [第五章](/v1_tw/ch5) 中討論了不同複製方法的一些用例。例如對多資料中心的複製而言，多主複製通常是理想的選擇（請參閱 “[運維多個數據中心](/v1_tw/ch5#運維多個數據中心)”）。[圖 9-7](/v1/ddia_0907.png) 說明了這種部署的一個例子。

![](/v1/ddia_0907.png)

**圖 9-7 網路中斷迫使在線性一致性和可用性之間做出選擇。**

考慮這樣一種情況：如果兩個資料中心之間發生網路中斷會發生什麼？我們假設每個資料中心內的網路正在工作，客戶端可以訪問資料中心，但資料中心之間彼此無法互相連線。

使用多主資料庫，每個資料中心都可以繼續正常執行：由於在一個數據中心寫入的資料是非同步複製到另一個數據中心的，所以在恢復網路連線時，寫入操作只是簡單地排隊並交換。

另一方面，如果使用單主複製，則主庫必須位於其中一個數據中心。任何寫入和任何線性一致的讀取請求都必須傳送給該主庫，因此對於連線到從庫所在資料中心的客戶端，這些讀取和寫入請求必須透過網路同步傳送到主庫所在的資料中心。

在單主配置的條件下，如果資料中心之間的網路被中斷，則連線到從庫資料中心的客戶端無法聯絡到主庫，因此它們無法對資料庫執行任何寫入，也不能執行任何線性一致的讀取。它們仍能從從庫讀取，但結果可能是陳舊的（非線性一致）。如果應用需要線性一致的讀寫，卻又位於與主庫網路中斷的資料中心，則網路中斷將導致這些應用不可用。

如果客戶端可以直接連線到主庫所在的資料中心，這就不是問題了，那些應用可以繼續正常工作。但只能訪問從庫資料中心的客戶端會中斷執行，直到網路連線得到修復。

#### CAP定理

這個問題不僅僅是單主複製和多主複製的後果：任何線性一致的資料庫都有這個問題，不管它是如何實現的。這個問題也不僅僅侷限於多資料中心部署，而可能發生在任何不可靠的網路上，即使在同一個資料中心內也是如此。問題面臨的權衡如下：[^v]

* 如果應用需要線性一致性，且某些副本因為網路問題與其他副本斷開連線，那麼這些副本掉線時不能處理請求。請求必須等到網路問題解決，或直接返回錯誤。（無論哪種方式，服務都 **不可用**）。
* 如果應用不需要線性一致性，那麼某個副本即使與其他副本斷開連線，也可以獨立處理請求（例如多主複製）。在這種情況下，應用可以在網路問題解決前保持可用，但其行為不是線性一致的。

[^v]: 這兩種選擇有時分別稱為 CP（在網路分割槽下一致但不可用）和 AP（在網路分割槽下可用但不一致）。但是，這種分類方案存在一些缺陷【9】，所以最好不要這樣用。

因此，不需要線性一致性的應用對網路問題有更強的容錯能力。這種見解通常被稱為 CAP 定理【29,30,31,32】，由 Eric Brewer 於 2000 年命名，儘管 70 年代的分散式資料庫設計者早就知道了這種權衡【33,34,35,36】。

CAP 最初是作為一個經驗法則提出的，沒有準確的定義，目的是開始討論資料庫的權衡。那時候許多分散式資料庫側重於在共享儲存的叢集上提供線性一致性的語義【18】，CAP 定理鼓勵資料庫工程師向分散式無共享系統的設計領域深入探索，這類架構更適合實現大規模的網路服務【37】。對於這種文化上的轉變，CAP 值得讚揚 —— 它見證了自 00 年代中期以來新資料庫的技術爆炸（即 NoSQL）。

> #### CAP定理沒有幫助
>
> CAP 有時以這種面目出現：一致性，可用性和分割槽容錯性：三者只能擇其二。不幸的是這種說法很有誤導性【32】，因為網路分割槽是一種故障型別，所以它並不是一個選項：不管你喜不喜歡它都會發生【38】。
>
> 在網路正常工作的時候，系統可以提供一致性（線性一致性）和整體可用性。發生網路故障時，你必須在線性一致性和整體可用性之間做出選擇。因此，CAP 更好的表述成：在分割槽時要麼選擇一致，要麼選擇可用【39】。網路越可靠，需要做出這種選擇的次數就越少，但是在某些時候選擇是不可避免的。
>
> 在 CAP 的討論中，術語可用性有幾個相互矛盾的定義，形式化作為一個定理【30】並不符合其通常的含義【40】。許多所謂的 “高可用”（容錯）系統實際上不符合 CAP 對可用性的特殊定義。總而言之，圍繞著 CAP 有很多誤解和困惑，並不能幫助我們更好地理解系統，所以最好避免使用 CAP。

CAP 定理的正式定義僅限於很狹隘的範圍【30】，它只考慮了一個一致性模型（即線性一致性）和一種故障（網路分割槽 [^vi]，或活躍但彼此斷開的節點）。它沒有討論任何關於網路延遲，死亡節點或其他權衡的事。因此，儘管 CAP 在歷史上有一些影響力，但對於設計系統而言並沒有實際價值【9,40】。

在分散式系統中有更多有趣的 “不可能” 的結果【41】，且 CAP 定理現在已經被更精確的結果取代【2,42】，所以它現在基本上成了歷史古蹟了。

[^vi]: 正如 “[真實世界的網路故障](/v1_tw/ch8#真實世界的網路故障)” 中所討論的，本書使用 **分割槽（partition）** 指代將大資料集細分為小資料集的操作（分片；請參閱 [第六章](/v1_tw/ch6)）。與之對應的是，**網路分割槽（network partition）** 是一種特定型別的網路故障，我們通常不會將其與其他型別的故障分開考慮。但是，由於它是 CAP 的 P，所以這種情況下我們無法避免混亂。

#### 線性一致性和網路延遲

雖然線性一致是一個很有用的保證，但實際上，線性一致的系統驚人的少。例如，現代多核 CPU 上的記憶體甚至都不是線性一致的【43】：如果一個 CPU 核上執行的執行緒寫入某個記憶體地址，而另一個 CPU 核上執行的執行緒不久之後讀取相同的地址，並沒有保證一定能讀到第一個執行緒寫入的值（除非使用了 **記憶體屏障（memory barrier）** 或 **圍欄（fence）**【44】）。

這種行為的原因是每個 CPU 核都有自己的記憶體快取和儲存緩衝區。預設情況下，記憶體訪問首先走快取，任何變更會非同步寫入主存。因為快取訪問比主存要快得多【45】，所以這個特性對於現代 CPU 的良好效能表現至關重要。但是現在就有幾個資料副本（一個在主存中，也許還有幾個在不同快取中的其他副本），而且這些副本是非同步更新的，所以就失去了線性一致性。

為什麼要做這個權衡？對多核記憶體一致性模型而言，CAP 定理是沒有意義的：在同一臺計算機中，我們通常假定通訊都是可靠的。並且我們並不指望一個 CPU 核能在脫離計算機其他部分的條件下繼續正常工作。犧牲線性一致性的原因是 **效能（performance）**，而不是容錯。

許多分散式資料庫也是如此：它們是 **為了提高效能** 而選擇了犧牲線性一致性，而不是為了容錯【46】。線性一致的速度很慢 —— 這始終是事實，而不僅僅是網路故障期間。

能找到一個更高效的線性一致儲存實現嗎？看起來答案是否定的：Attiya 和 Welch 【47】證明，如果你想要線性一致性，讀寫請求的響應時間至少與網路延遲的不確定性成正比。在像大多數計算機網路一樣具有高度可變延遲的網路中（請參閱 “[超時與無窮的延遲](/v1_tw/ch8#超時與無窮的延遲)”），線性一致讀寫的響應時間不可避免地會很高。更快的線性一致演算法不存在，但更弱的一致性模型可以快得多，所以對延遲敏感的系統而言，這類權衡非常重要。在 [第十二章](/v1_tw/ch12) 中將討論一些在不犧牲正確性的前提下，繞開線性一致性的方法。


## 順序保證

之前說過，線性一致暫存器的行為就好像只有單個數據副本一樣，且每個操作似乎都是在某個時間點以原子性的方式生效的。這個定義意味著操作是按照某種良好定義的順序執行的。我們將操作以看上去被執行的順序連線起來，以此說明了 [圖 9-4](/v1/ddia_0904.png) 中的順序。

**順序（ordering）** 這一主題在本書中反覆出現，這表明它可能是一個重要的基礎性概念。讓我們簡要回顧一下其它曾經出現過 **順序** 的上下文：

* 在 [第五章](/v1_tw/ch5) 中我們看到，領導者在單主複製中的主要目的就是，在複製日誌中確定 **寫入順序（order of write）**—— 也就是從庫應用這些寫入的順序。如果不存在一個領導者，則併發操作可能導致衝突（請參閱 “[處理寫入衝突](/v1_tw/ch5#處理寫入衝突)”）。
* 在 [第七章](/v1_tw/ch7) 中討論的 **可序列化**，是關於事務表現的像按 **某種先後順序（some sequential order）** 執行的保證。它可以字面意義上地以 **序列順序（serial order）** 執行事務來實現，或者允許並行執行，但同時防止序列化衝突來實現（透過鎖或中止事務）。
* 在 [第八章](/v1_tw/ch8) 討論過的在分散式系統中使用時間戳和時鐘（請參閱 “[依賴同步時鐘](/v1_tw/ch8#依賴同步時鐘)”）是另一種將順序引入無序世界的嘗試，例如，確定兩個寫入操作哪一個更晚發生。

事實證明，順序、線性一致性和共識之間有著深刻的聯絡。儘管這個概念比本書其他部分更加理論化和抽象，但對於明確系統的能力範圍（可以做什麼和不可以做什麼）而言是非常有幫助的。我們將在接下來的幾節中探討這個話題。

### 順序與因果關係

**順序** 反覆出現有幾個原因，其中一個原因是，它有助於保持 **因果關係（causality）**。在本書中我們已經看到了幾個例子，其中因果關係是很重要的：

* 在 “[一致字首讀](/v1_tw/ch5#一致字首讀)”（[圖 5-5](/v1/ddia_0505.png)）中，我們看到一個例子：一個對話的觀察者首先看到問題的答案，然後才看到被回答的問題。這是令人困惑的，因為它違背了我們對 **因（cause）** 與 **果（effect）** 的直覺：如果一個問題被回答，顯然問題本身得先在那裡，因為給出答案的人必須先看到這個問題（假如他們並沒有預見未來的超能力）。我們認為在問題和答案之間存在 **因果依賴（causal dependency）**。
* [圖 5-9](/v1/ddia_0509.png) 中出現了類似的模式，我們看到三位領導者之間的複製，並注意到由於網路延遲，一些寫入可能會 “壓倒” 其他寫入。從其中一個副本的角度來看，好像有一個對尚不存在的記錄的更新操作。這裡的因果意味著，一條記錄必須先被建立，然後才能被更新。
* 在 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)” 中我們觀察到，如果有兩個操作 A 和 B，則存在三種可能性：A 發生在 B 之前，或 B 發生在 A 之前，或者 A 和 B**併發**。這種 **此前發生（happened before）** 關係是因果關係的另一種表述：如果 A 在 B 前發生，那麼意味著 B 可能已經知道了 A，或者建立在 A 的基礎上，或者依賴於 A。如果 A 和 B 是 **併發** 的，那麼它們之間並沒有因果聯絡；換句話說，我們確信 A 和 B 不知道彼此。
* 在事務快照隔離的上下文中（“[快照隔離和可重複讀](/v1_tw/ch7#快照隔離和可重複讀)”），我們說事務是從一致性快照中讀取的。但此語境中 “一致” 到底又是什麼意思？這意味著 **與因果關係保持一致（consistent with causality）**：如果快照包含答案，它也必須包含被回答的問題【48】。在某個時間點觀察整個資料庫，與因果關係保持一致意味著：因果上在該時間點之前發生的所有操作，其影響都是可見的，但因果上在該時間點之後發生的操作，其影響對觀察者不可見。**讀偏差（read skew）** 意味著讀取的資料處於違反因果關係的狀態（不可重複讀，如 [圖 7-6](/v1/ddia_0706.png) 所示）。
* 事務之間 **寫偏差（write skew）** 的例子（請參閱 “[寫入偏差與幻讀](/v1_tw/ch7#寫入偏差與幻讀)”）也說明了因果依賴：在 [圖 7-8](/v1/ddia_0708.png) 中，愛麗絲被允許離班，因為事務認為鮑勃仍在值班，反之亦然。在這種情況下，離班的動作因果依賴於對當前值班情況的觀察。[可序列化快照隔離](/v1_tw/ch7#可序列化快照隔離) 透過跟蹤事務之間的因果依賴來檢測寫偏差。
* 在愛麗絲和鮑勃看球的例子中（[圖 9-1](/v1/ddia_0901.png)），在聽到愛麗絲驚呼比賽結果後，鮑勃從伺服器得到陳舊結果的事實違背了因果關係：愛麗絲的驚呼因果依賴於得分宣告，所以鮑勃應該也能在聽到愛麗絲驚呼後查詢到比分。相同的模式在 “[跨通道的時序依賴](#跨通道的時序依賴)” 一節中，以 “影像大小調整服務” 的偽裝再次出現。

因果關係對事件施加了一種 **順序**：因在果之前；訊息傳送在訊息收取之前。而且就像現實生活中一樣，一件事會導致另一件事：某個節點讀取了一些資料然後寫入一些結果，另一個節點讀取其寫入的內容，並依次寫入一些其他內容，等等。這些因果依賴的操作鏈定義了系統中的因果順序，即，什麼在什麼之前發生。

如果一個系統服從因果關係所規定的順序，我們說它是 **因果一致（causally consistent）** 的。例如，快照隔離提供了因果一致性：當你從資料庫中讀取到一些資料時，你一定還能夠看到其因果前驅（假設在此期間這些資料還沒有被刪除）。


#### 因果順序不是全序的

**全序（total order）** 允許任意兩個元素進行比較，所以如果有兩個元素，你總是可以說出哪個更大，哪個更小。例如，自然數集是全序的：給定兩個自然數，比如說 5 和 13，那麼你可以告訴我，13 大於 5。

然而數學集合並不完全是全序的：`{a, b}` 比 `{b, c}` 更大嗎？好吧，你沒法真正比較它們，因為二者都不是對方的子集。我們說它們是 **無法比較（incomparable）** 的，因此數學集合是 **偏序的（partially ordered）** ：在某些情況下，可以說一個集合大於另一個（如果一個集合包含另一個集合的所有元素），但在其他情況下它們是無法比較的 [^譯註i]。

[^譯註i]: 設 R 為非空集合 A 上的關係，如果 R 是自反的、反對稱的和可傳遞的，則稱 R 為 A 上的偏序關係。簡稱偏序，通常記作≦。一個集合 A 與 A 上的偏序關係 R 一起叫作偏序集，記作 $(A,R)$ 或 $(A, ≦)$。全序、偏序、關係、集合，這些概念的精確定義可以參考任意一本離散數學教材。

全序和偏序之間的差異反映在不同的資料庫一致性模型中：

* 線性一致性

  在線性一致的系統中，操作是全序的：如果系統表現的就好像只有一個數據副本，並且所有操作都是原子性的，這意味著對任何兩個操作，我們總是能判定哪個操作先發生。這個全序在 [圖 9-4](/v1/ddia_0904.png) 中以時間線表示。

* 因果性

  我們說過，如果兩個操作都沒有在彼此 **之前發生**，那麼這兩個操作是併發的（請參閱 [“此前發生” 的關係和併發](/v1_tw/ch5#“此前發生”的關係和併發)）。換句話說，如果兩個事件是因果相關的（一個發生在另一個事件之前），則它們之間是有序的，但如果它們是併發的，則它們之間的順序是無法比較的。這意味著因果關係定義了一個偏序，而不是一個全序：一些操作相互之間是有順序的，但有些則是無法比較的。

因此，根據這個定義，在線性一致的資料儲存中是不存在併發操作的：必須有且僅有一條時間線，所有的操作都在這條時間線上，構成一個全序關係。可能有幾個請求在等待處理，但是資料儲存確保了每個請求都是在唯一時間線上的某個時間點原子地處理的，不存在任何併發。

併發意味著時間線會分岔然後合併 —— 在這種情況下，不同分支上的操作是無法比較的（即併發操作）。在 [第五章](/v1_tw/ch5) 中我們看到了這種現象：例如，[圖 5-14](/v1/ddia_0514.png) 並不是一條直線的全序關係，而是一堆不同的操作併發進行。圖中的箭頭指明了因果依賴 —— 操作的偏序。

如果你熟悉像 Git 這樣的分散式版本控制系統，那麼其版本歷史與因果關係圖極其相似。通常，一個 **提交（Commit）** 發生在另一個提交之後，在一條直線上。但是有時你會遇到分支（當多個人同時在一個專案上工作時），**合併（Merge）** 會在這些併發建立的提交相融合時建立。

#### 線性一致性強於因果一致性

那麼因果順序和線性一致性之間的關係是什麼？答案是線性一致性 **隱含著（implies）** 因果關係：任何線性一致的系統都能正確保持因果性【7】。特別是，如果系統中有多個通訊通道（如 [圖 9-5](/v1/ddia_0905.png) 中的訊息佇列和檔案儲存服務），線性一致性可以自動保證因果性，系統無需任何特殊操作（如在不同元件間傳遞時間戳）。

線性一致性確保因果性的事實使線性一致系統變得簡單易懂，更有吸引力。然而，正如 “[線性一致性的代價](#線性一致性的代價)” 中所討論的，使系統線性一致可能會損害其效能和可用性，尤其是在系統具有嚴重的網路延遲的情況下（例如，如果系統在地理上散佈）。出於這個原因，一些分散式資料系統已經放棄了線性一致性，從而獲得更好的效能，但它們用起來也更為困難。

好訊息是存在折衷的可能性。線性一致性並不是保持因果性的唯一途徑 —— 還有其他方法。一個系統可以是因果一致的，而無需承擔線性一致帶來的效能折損（尤其對於 CAP 定理不適用的情況）。實際上在所有的不會被網路延遲拖慢的一致性模型中，因果一致性是可行的最強的一致性模型。而且在網路故障時仍能保持可用【2,42】。

在許多情況下，看上去需要線性一致性的系統，實際上需要的只是因果一致性，因果一致性可以更高效地實現。基於這種觀察結果，研究人員正在探索新型的資料庫，既能保證因果一致性，且效能與可用性與最終一致的系統類似【49,50,51】。

這方面的研究相當新鮮，其中很多尚未應用到生產系統，仍然有不少挑戰需要克服【52,53】。但對於未來的系統而言，這是一個有前景的方向。

#### 捕獲因果關係

我們不會在這裡討論非線性一致的系統如何保證因果性的細節，而只是簡要地探討一些關鍵的思想。

為了維持因果性，你需要知道哪個操作發生在哪個其他操作之前（**happened before**）。這是一個偏序：併發操作可以以任意順序進行，但如果一個操作發生在另一個操作之前，那它們必須在所有副本上以那個順序被處理。因此，當一個副本處理一個操作時，它必須確保所有因果前驅的操作（之前發生的所有操作）已經被處理；如果前面的某個操作丟失了，後面的操作必須等待，直到前面的操作被處理完畢。

為了確定因果依賴，我們需要一些方法來描述系統中節點的 “知識”。如果節點在發出寫入 Y 的請求時已經看到了 X 的值，則 X 和 Y 可能存在因果關係。這個分析使用了那些在欺詐指控刑事調查中常見的問題：CEO 在做出決定 Y 時是否 **知道** X ？

用於確定 *哪些操作發生在其他操作之前* 的技術，與我們在 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)” 中所討論的內容類似。那一節討論了無領導者資料儲存中的因果性：為了防止丟失更新，我們需要檢測到對同一個鍵的併發寫入。因果一致性則更進一步：它需要跟蹤整個資料庫中的因果依賴，而不僅僅是一個鍵。可以推廣版本向量以解決此類問題【54】。

為了確定因果順序，資料庫需要知道應用讀取了哪個版本的資料。這就是為什麼在 [圖 5-13](/v1/ddia_0513.png) 中，來自先前操作的版本號在寫入時被傳回到資料庫的原因。在 SSI 的衝突檢測中會出現類似的想法，如 “[可序列化快照隔離](/v1_tw/ch7#可序列化快照隔離)” 中所述：當事務要提交時，資料庫將檢查它所讀取的資料版本是否仍然是最新的。為此，資料庫跟蹤哪些資料被哪些事務所讀取。


### 序列號順序

雖然因果是一個重要的理論概念，但實際上跟蹤所有的因果關係是不切實際的。在許多應用中，客戶端在寫入內容之前會先讀取大量資料，我們無法弄清寫入因果依賴於先前全部的讀取內容，還是僅包括其中一部分。顯式跟蹤所有已讀資料意味著巨大的額外開銷。

但還有一個更好的方法：我們可以使用 **序列號（sequence number）** 或 **時間戳（timestamp）** 來排序事件。時間戳不一定來自日曆時鐘（或物理時鐘，它們存在許多問題，如 “[不可靠的時鐘](/v1_tw/ch8#不可靠的時鐘)” 中所述）。它可以來自一個 **邏輯時鐘（logical clock）**，這是一個用來生成標識操作的數字序列的演算法，典型實現是使用一個每次操作自增的計數器。

這樣的序列號或時間戳是緊湊的（只有幾個位元組大小），它提供了一個全序關係：也就是說每個操作都有一個唯一的序列號，而且總是可以比較兩個序列號，確定哪一個更大（即哪些操作後發生）。

特別是，我們可以使用 **與因果一致（consistent with causality）** 的全序來生成序列號 [^vii]：我們保證，如果操作 A 因果地發生在操作 B 前，那麼在這個全序中 A 在 B 前（ A 具有比 B 更小的序列號）。併發操作之間可以任意排序。這樣一個全序關係捕獲了所有關於因果的資訊，但也施加了一個比因果性要求更為嚴格的順序。

[^vii]: 與因果關係不一致的全序很容易建立，但沒啥用。例如你可以為每個操作生成隨機的 UUID，並按照字典序比較 UUID，以定義操作的全序。這是一個有效的全序，但是隨機的 UUID 並不能告訴你哪個操作先發生，或者操作是否為併發的。

在單主複製的資料庫中（請參閱 “[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”），複製日誌定義了與因果一致的寫操作全序。主庫可以簡單地為每個操作自增一個計數器，從而為複製日誌中的每個操作分配一個單調遞增的序列號。如果一個從庫按照它們在複製日誌中出現的順序來應用寫操作，那麼從庫的狀態始終是因果一致的（即使它落後於領導者）。

#### 非因果序列號生成器

如果主庫不存在（可能因為使用了多主資料庫或無主資料庫，或者因為使用了分割槽的資料庫），如何為操作生成序列號就沒有那麼明顯了。在實踐中有各種各樣的方法：

* 每個節點都可以生成自己獨立的一組序列號。例如有兩個節點，一個節點只能生成奇數，而另一個節點只能生成偶數。通常，可以在序列號的二進位制表示中預留一些位，用於唯一的節點識別符號，這樣可以確保兩個不同的節點永遠不會生成相同的序列號。
* 可以將日曆時鐘（物理時鐘）的時間戳附加到每個操作上【55】。這種時間戳並不連續，但是如果它具有足夠高的解析度，那也許足以提供一個操作的全序關係。這一事實應用於 *最後寫入勝利* 的衝突解決方法中（請參閱 “[有序事件的時間戳](/v1_tw/ch8#有序事件的時間戳)”）。
* 可以預先分配序列號區塊。例如，節點 A 可能要求從序列號 1 到 1,000 區塊的所有權，而節點 B 可能要求序列號 1,001 到 2,000 區塊的所有權。然後每個節點可以獨立分配所屬區塊中的序列號，並在序列號告急時請求分配一個新的區塊。

這三個選項都比單一主庫的自增計數器表現要好，並且更具可伸縮性。它們為每個操作生成一個唯一的，近似自增的序列號。然而它們都有同一個問題：生成的序列號與因果不一致。

因為這些序列號生成器不能正確地捕獲跨節點的操作順序，所以會出現因果關係的問題：

* 每個節點每秒可以處理不同數量的操作。因此，如果一個節點產生偶數序列號而另一個產生奇數序列號，則偶數計數器可能落後於奇數計數器，反之亦然。如果你有一個奇數編號的操作和一個偶數編號的操作，你無法準確地說出哪一個操作在因果上先發生。

* 來自物理時鐘的時間戳會受到時鐘偏移的影響，這可能會使其與因果不一致。例如 [圖 8-3](/v1/ddia_0803.png) 展示了一個例子，其中因果上晚發生的操作，卻被分配了一個更早的時間戳。[^viii]

  [^viii]: 可以使物理時鐘時間戳與因果關係保持一致：在 “[全域性快照的同步時鐘](/v1_tw/ch8#全域性快照的同步時鐘)” 中，我們討論了 Google 的 Spanner，它可以估計預期的時鐘偏差，並在提交寫入之前等待不確定性間隔。這種方法確保了實際上靠後的事務會有更大的時間戳。但是大多數時鐘不能提供這種所需的不確定性度量。

* 在分配區塊的情況下，某個操作可能會被賦予一個範圍在 1,001 到 2,000 內的序列號，然而一個因果上更晚的操作可能被賦予一個範圍在 1 到 1,000 之間的數字。這裡序列號與因果關係也是不一致的。


#### 蘭伯特時間戳

儘管剛才描述的三個序列號生成器與因果不一致，但實際上有一個簡單的方法來產生與因果關係一致的序列號。它被稱為蘭伯特時間戳，萊斯利・蘭伯特（Leslie Lamport）於 1978 年提出【56】，現在是分散式系統領域中被引用最多的論文之一。

[圖 9-8](/v1/ddia_0908.png) 說明了蘭伯特時間戳的應用。每個節點都有一個唯一識別符號，和一個儲存自己執行操作數量的計數器。蘭伯特時間戳就是兩者的簡單組合：（計數器，節點 ID）$(counter, node ID)$。兩個節點有時可能具有相同的計數器值，但透過在時間戳中包含節點 ID，每個時間戳都是唯一的。

![](/v1/ddia_0908.png)

**圖 9-8  Lamport 時間戳提供了與因果關係一致的全序。**


蘭伯特時間戳與物理的日曆時鐘沒有任何關係，但是它提供了一個全序：如果你有兩個時間戳，則 **計數器** 值大者是更大的時間戳。如果計數器值相同，則節點 ID 越大的，時間戳越大。

迄今，這個描述與上節所述的奇偶計數器基本類似。使蘭伯特時間戳因果一致的關鍵思想如下所示：每個節點和每個客戶端跟蹤迄今為止所見到的最大 **計數器** 值，並在每個請求中包含這個最大計數器值。當一個節點收到最大計數器值大於自身計數器值的請求或響應時，它立即將自己的計數器設定為這個最大值。

這如 [圖 9-8](/v1/ddia_0908.png) 所示，其中客戶端 A 從節點 2 接收計數器值 `5` ，然後將最大值 `5` 傳送到節點 1 。此時，節點 1 的計數器僅為 `1` ，但是它立即前移至 `5` ，所以下一個操作的計數器的值為 `6` 。

只要每一個操作都攜帶著最大計數器值，這個方案確保蘭伯特時間戳的排序與因果一致，因為每個因果依賴都會導致時間戳增長。

蘭伯特時間戳有時會與我們在 “[檢測併發寫入](/v1_tw/ch5#檢測併發寫入)” 中看到的版本向量相混淆。雖然兩者有一些相似之處，但它們有著不同的目的：版本向量可以區分兩個操作是併發的，還是一個因果依賴另一個；而蘭伯特時間戳總是施行一個全序。從蘭伯特時間戳的全序中，你無法分辨兩個操作是併發的還是因果依賴的。蘭伯特時間戳優於版本向量的地方是，它更加緊湊。

#### 光有時間戳排序還不夠

雖然蘭伯特時間戳定義了一個與因果一致的全序，但它還不足以解決分散式系統中的許多常見問題。

例如，考慮一個需要確保使用者名稱能唯一標識使用者帳戶的系統。如果兩個使用者同時嘗試使用相同的使用者名稱建立帳戶，則其中一個應該成功，另一個應該失敗（我們之前在 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)” 中提到過這個問題）。

乍看之下，似乎操作的全序關係足以解決這一問題（例如使用蘭伯特時間戳）：如果建立了兩個具有相同使用者名稱的帳戶，選擇時間戳較小的那個作為勝者（第一個抓到使用者名稱的人），並讓帶有更大時間戳者失敗。由於時間戳上有全序關係，所以這個比較總是可行的。

這種方法適用於事後確定勝利者：一旦你收集了系統中的所有使用者名稱建立操作，就可以比較它們的時間戳。然而當某個節點需要即時處理使用者建立使用者名稱的請求時，這樣的方法就無法滿足了。節點需要 **馬上（right now）** 決定這個請求是成功還是失敗。在那個時刻，節點並不知道是否存在其他節點正在併發執行建立同樣使用者名稱的操作，遑論其它節點可能分配給那個操作的時間戳。

為了確保沒有其他節點正在使用相同的使用者名稱和較小的時間戳併發建立同名賬戶，你必須檢查其它每個節點，看看它在做什麼【56】。如果其中一個節點由於網路問題出現故障或不可達，則整個系統可能被拖至停機。這不是我們需要的那種容錯系統。

這裡的問題是，只有在所有的操作都被收集之後，操作的全序才會出現。如果另一個節點已經產生了一些操作，但你還不知道那些操作是什麼，那就無法構造所有操作最終的全序關係：來自另一個節點的未知操作可能需要被插入到全序中的不同位置。

總之：為了實現諸如使用者名稱上的唯一約束這種東西，僅有操作的全序是不夠的，你還需要知道這個全序何時會塵埃落定。如果你有一個建立使用者名稱的操作，並且確定在全序中沒有任何其他節點可以在你的操作之前插入對同一使用者名稱的聲稱，那麼你就可以安全地宣告操作執行成功。

如何確定全序關係已經塵埃落定，這將在 [全序廣播](#全序廣播) 一節中詳細說明。

### 全序廣播

如果你的程式只執行在單個 CPU 核上，那麼定義一個操作全序是很容易的：可以簡單認為就是 CPU 執行這些操作的順序。但是在分散式系統中，讓所有節點對同一個全域性操作順序達成一致可能相當棘手。在上一節中，我們討論了按時間戳或序列號進行排序，但發現它還不如單主複製給力（如果你使用時間戳排序來實現唯一性約束，就不能容忍任何錯誤，因為你必須要從每個節點都獲取到最新的序列號）。

如前所述，單主複製透過選擇一個節點作為主庫來確定操作的全序，並在主庫的單個 CPU 核上對所有操作進行排序。接下來的挑戰是，如果吞吐量超出單個主庫的處理能力，這種情況下如何擴充套件系統；以及，如果主庫失效（“[處理節點宕機](/v1_tw/ch5#處理節點宕機)”），如何處理故障切換。在分散式系統文獻中，這個問題被稱為 **全序廣播（total order broadcast）** 或 **原子廣播（atomic broadcast）**[^ix]【25,57,58】。

[^ix]: “原子廣播” 是一個傳統的術語，非常混亂，而且與 “原子” 一詞的其他用法不一致：它與 ACID 事務中的原子性沒有任何關係，只是與原子操作（在多執行緒程式設計的意義上 ）或原子暫存器（線性一致儲存）有間接的聯絡。全序組播（total order multicast）是另一個同義詞。

> #### 順序保證的範圍
>
> 每個分割槽各有一個主庫的分割槽資料庫，通常只在每個分割槽內維持順序，這意味著它們不能提供跨分割槽的一致性保證（例如，一致性快照，外部索引鍵引用）。跨所有分割槽的全序是可能的，但需要額外的協調【59】。

全序廣播通常被描述為在節點間交換訊息的協議。非正式地講，它要滿足兩個安全屬性：

* 可靠交付（reliable delivery）

  沒有訊息丟失：如果訊息被傳遞到一個節點，它將被傳遞到所有節點。

* 全序交付（totally ordered delivery）

  訊息以相同的順序傳遞給每個節點。

正確的全序廣播演算法必須始終保證可靠性和有序性，即使節點或網路出現故障。當然在網路中斷的時候，訊息是傳不出去的，但是演算法可以不斷重試，以便在網路最終修復時，訊息能及時透過並送達（當然它們必須仍然按照正確的順序傳遞）。

#### 使用全序廣播

像 ZooKeeper 和 etcd 這樣的共識服務實際上實現了全序廣播。這一事實暗示了全序廣播與共識之間有著緊密聯絡，我們將在本章稍後進行探討。

全序廣播正是資料庫複製所需的：如果每個訊息都代表一次資料庫的寫入，且每個副本都按相同的順序處理相同的寫入，那麼副本間將相互保持一致（除了臨時的複製延遲）。這個原理被稱為 **狀態機複製（state machine replication）**【60】，我們將在 [第十一章](/v1_tw/ch11) 中重新回到這個概念。

與之類似，可以使用全序廣播來實現可序列化的事務：如 “[真的序列執行](/v1_tw/ch7#真的序列執行)” 中所述，如果每個訊息都表示一個確定性事務，以儲存過程的形式來執行，且每個節點都以相同的順序處理這些訊息，那麼資料庫的分割槽和副本就可以相互保持一致【61】。

全序廣播的一個重要表現是，順序在訊息送達時被固化：如果後續的訊息已經送達，節點就不允許追溯地將（先前）訊息插入順序中的較早位置。這個事實使得全序廣播比時間戳排序更強。

考量全序廣播的另一種方式是，這是一種建立日誌的方式（如在複製日誌、事務日誌或預寫式日誌中）：傳遞訊息就像追加寫入日誌。由於所有節點必須以相同的順序傳遞相同的訊息，因此所有節點都可以讀取日誌，並看到相同的訊息序列。

全序廣播對於實現提供防護令牌的鎖服務也很有用（請參閱 “[防護令牌](/v1_tw/ch8#防護令牌)”）。每個獲取鎖的請求都作為一條訊息追加到日誌末尾，並且所有的訊息都按它們在日誌中出現的順序依次編號。序列號可以當成防護令牌用，因為它是單調遞增的。在 ZooKeeper 中，這個序列號被稱為 `zxid` 【15】。

#### 使用全序廣播實現線性一致的儲存

如 [圖 9-4](/v1/ddia_0904.png) 所示，在線性一致的系統中，存在操作的全序。這是否意味著線性一致與全序廣播一樣？不盡然，但兩者之間有著密切的聯絡 [^x]。

[^x]: 從形式上講，線性一致讀寫暫存器是一個 “更容易” 的問題。全序廣播等價於共識【67】，而共識問題在非同步的崩潰 - 停止模型【68】中沒有確定性的解決方案，而線性一致的讀寫暫存器 **可以** 在這種模型中實現【23,24,25】。然而，支援諸如 **比較並設定（CAS, compare-and-set）**，或 **自增並返回（increment-and-get）** 的原子操作使它等價於共識問題【28】。因此，共識問題與線性一致暫存器問題密切相關。

全序廣播是非同步的：訊息被保證以固定的順序可靠地傳送，但是不能保證訊息 **何時** 被送達（所以一個接收者可能落後於其他接收者）。相比之下，線性一致性是新鮮性的保證：讀取一定能看見最新的寫入值。

但如果有了全序廣播，你就可以在此基礎上構建線性一致的儲存。例如，你可以確保使用者名稱能唯一標識使用者帳戶。

設想對於每一個可能的使用者名稱，你都可以有一個帶有 CAS 原子操作的線性一致暫存器。每個暫存器最初的值為空值（表示未使用該使用者名稱）。當用戶想要建立一個使用者名稱時，對該使用者名稱的暫存器執行 CAS 操作，在先前暫存器值為空的條件下，將其值設定為使用者的賬號 ID。如果多個使用者試圖同時獲取相同的使用者名稱，則只有一個 CAS 操作會成功，因為其他使用者會看到非空的值（由於線性一致性）。

你可以透過將全序廣播當成僅追加日誌【62,63】的方式來實現這種線性一致的 CAS 操作：

1. 在日誌中追加一條訊息，試探性地指明你要宣告的使用者名稱。
2. 讀日誌，並等待你剛才追加的訊息被讀回。[^xi]
3. 檢查是否有任何訊息聲稱目標使用者名稱的所有權。如果這些訊息中的第一條就是你自己的訊息，那麼你就成功了：你可以提交聲稱的使用者名稱（也許是透過向日志追加另一條訊息）並向客戶端確認。如果所需使用者名稱的第一條訊息來自其他使用者，則中止操作。

[^xi]: 如果你不等待，而是在訊息入隊之後立即確認寫入，則會得到類似於多核 x86 處理器記憶體的一致性模型【43】。該模型既不是線性一致的也不是順序一致的。

由於日誌項是以相同順序送達至所有節點，因此如果有多個併發寫入，則所有節點會對最先到達者達成一致。選擇衝突寫入中的第一個作為勝利者，並中止後來者，以此確定所有節點對某個寫入是提交還是中止達成一致。類似的方法可以在一個日誌的基礎上實現可序列化的多物件事務【62】。

儘管這一過程保證寫入是線性一致的，但它並不保證讀取也是線性一致的 —— 如果你從與日誌非同步更新的儲存中讀取資料，結果可能是陳舊的。（精確地說，這裡描述的過程提供了 **順序一致性（sequential consistency）**【47,64】，有時也稱為 **時間線一致性（timeline consistency）**【65,66】，比線性一致性稍微弱一些的保證）。為了使讀取也線性一致，有幾個選項：

* 你可以透過在日誌中追加一條訊息，然後讀取日誌，直到該訊息被讀回才執行實際的讀取操作。訊息在日誌中的位置因此定義了讀取發生的時間點（etcd 的法定人數讀取有些類似這種情況【16】）。
* 如果日誌允許以線性一致的方式獲取最新日誌訊息的位置，則可以查詢該位置，等待該位置前的所有訊息都傳達到你，然後執行讀取。（這是 Zookeeper `sync()` 操作背後的思想【15】）。
* 你可以從同步更新的副本中進行讀取，因此可以確保結果是最新的（這種技術用於鏈式複製（chain replication）【63】；請參閱 “[關於複製的研究](/v1_tw/ch5#關於複製的研究)”）。

#### 使用線性一致性儲存實現全序廣播

上一節介紹了如何從全序廣播構建一個線性一致的 CAS 操作。我們也可以把它反過來，假設我們有線性一致的儲存，接下來會展示如何在此基礎上構建全序廣播。

最簡單的方法是假設你有一個線性一致的暫存器來儲存一個整數，並且有一個原子 **自增並返回** 操作【28】。或者原子 CAS 操作也可以完成這項工作。

該演算法很簡單：每個要透過全序廣播發送的訊息首先對線性一致暫存器執行 **自增並返回** 操作。然後將從暫存器獲得的值作為序列號附加到訊息中。然後你可以將訊息傳送到所有節點（重新發送任何丟失的訊息），而收件人將按序列號依序傳遞（deliver）訊息。

請注意，與蘭伯特時間戳不同，透過自增線性一致性暫存器獲得的數字形式上是一個沒有間隙的序列。因此，如果一個節點已經傳遞了訊息 4 並且接收到序列號為 6 的傳入訊息，則它知道它在傳遞訊息 6 之前必須等待訊息 5 。蘭伯特時間戳則與之不同 —— 事實上，這是全序廣播和時間戳排序間的關鍵區別。

實現一個帶有原子性 **自增並返回** 操作的線性一致暫存器有多困難？像往常一樣，如果事情從來不出差錯，那很容易：你可以簡單地把它儲存在單個節點內的變數中。問題在於處理當該節點的網路連線中斷時的情況，並在該節點失效時能恢復這個值【59】。一般來說，如果你對線性一致性的序列號生成器進行過足夠深入的思考，你不可避免地會得出一個共識演算法。

這並非巧合：可以證明，線性一致的 CAS（或自增並返回）暫存器與全序廣播都等價於 **共識** 問題【28,67】。也就是說，如果你能解決其中的一個問題，你可以把它轉化成為其他問題的解決方案。這是相當深刻和令人驚訝的洞察！

現在是時候正面處理共識問題了，我們將在本章的其餘部分進行討論。


## 分散式事務與共識

**共識** 是分散式計算中最重要也是最基本的問題之一。從表面上看似乎很簡單：非正式地講，目標只是 **讓幾個節點達成一致（get several nodes to agree on something）**。你也許會認為這不會太難。不幸的是，許多出故障的系統都是因為錯誤地輕信這個問題很容易解決。

儘管共識非常重要，但關於它的內容出現在本書的後半部分，因為這個主題非常微妙，欣賞細微之處需要一些必要的知識。即使在學術界，對共識的理解也是在幾十年的過程中逐漸沉澱而來，一路上也有著許多誤解。現在我們已經討論了複製（[第五章](/v1_tw/ch5)），事務（[第七章](/v1_tw/ch7)），系統模型（[第八章](/v1_tw/ch8)），線性一致以及全序廣播（本章），我們終於準備好解決共識問題了。

節點能達成一致，在很多場景下都非常重要，例如：

* 領導選舉

  在單主複製的資料庫中，所有節點需要就哪個節點是領導者達成一致。如果一些節點由於網路故障而無法與其他節點通訊，則可能會對領導權的歸屬引起爭議。在這種情況下，共識對於避免錯誤的故障切換非常重要。錯誤的故障切換會導致兩個節點都認為自己是領導者（**腦裂**，請參閱 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)”）。如果有兩個領導者，它們都會接受寫入，它們的資料會發生分歧，從而導致不一致和資料丟失。

* 原子提交

  在支援跨多節點或跨多分割槽事務的資料庫中，一個事務可能在某些節點上失敗，但在其他節點上成功。如果我們想要維護事務的原子性（就 ACID 而言，請參閱 “[原子性](/v1_tw/ch7#原子性)”），我們必須讓所有節點對事務的結果達成一致：要麼全部中止 / 回滾（如果出現任何錯誤），要麼它們全部提交（如果沒有出錯）。這個共識的例子被稱為 **原子提交（atomic commit）** 問題 [^xii]。

  [^xii]: 原子提交的形式化與共識稍有不同：原子事務只有在 **所有** 參與者投票提交的情況下才能提交，如果有任何參與者需要中止，則必須中止。共識則允許就 **任意一個** 被參與者提出的候選值達成一致。然而，原子提交和共識可以相互歸約（reduce）為對方【70,71】。**非阻塞** 原子提交則要比共識更為困難 —— 請參閱 “[三階段提交](#三階段提交)”。

> ### 共識的不可能性
>
> 你可能已經聽說過以作者 Fischer，Lynch 和 Paterson 命名的 FLP 結果【68】，它證明，如果存在節點可能崩潰的風險，則不存在 **總是** 能夠達成共識的演算法。在分散式系統中，我們必須假設節點可能會崩潰，所以可靠的共識是不可能的。然而這裡我們正在討論達成共識的演算法，到底是怎麼回事？
>
> 答案是 FLP 結果是在 **非同步系統模型** 中被證明的（請參閱 “[系統模型與現實](/v1_tw/ch8#系統模型與現實)”），而這是一種限制性很強的模型，它假定確定性演算法不能使用任何時鐘或超時。如果允許演算法使用 **超時** 或其他方法來識別可疑的崩潰節點（即使懷疑有時是錯誤的），則共識變為一個可解的問題【67】。即使僅僅允許演算法使用隨機數，也足以繞過這個不可能的結果【69】。
>
> 因此，雖然 FLP 是關於共識不可能性的重要理論結果，但現實中的分散式系統通常是可以達成共識的。

在本節中，我們將首先更詳細地研究 **原子提交** 問題。具體來說，我們將討論 **兩階段提交（2PC, two-phase commit）** 演算法，這是解決原子提交問題最常見的辦法，並在各種資料庫、訊息佇列和應用伺服器中被實現。事實證明 2PC 是一種共識演算法，但不是一個非常好的共識演算法【70,71】。

透過對 2PC 的學習，我們將繼續努力實現更好的共識演算法，比如 ZooKeeper（Zab）和 etcd（Raft）中使用的演算法。


### 原子提交與兩階段提交

在 [第七章](/v1_tw/ch7) 中我們瞭解到，事務原子性的目的是在多次寫操作中途出錯的情況下，提供一種簡單的語義。事務的結果要麼是成功提交，在這種情況下，事務的所有寫入都是持久化的；要麼是中止，在這種情況下，事務的所有寫入都被回滾（即撤消或丟棄）。

原子性可以防止失敗的事務攪亂資料庫，避免資料庫陷入半成品結果和半更新狀態。這對於多物件事務（請參閱 “[單物件和多物件操作](/v1_tw/ch7#單物件和多物件操作)”）和維護次級索引的資料庫尤其重要。每個次級索引都是與主資料相分離的資料結構 —— 因此，如果你修改了一些資料，則還需要在次級索引中進行相應的更改。原子性確保次級索引與主資料保持一致（如果索引與主資料不一致，就沒什麼用了）。

#### 從單節點到分散式原子提交

對於在單個數據庫節點執行的事務，原子性通常由儲存引擎實現。當客戶端請求資料庫節點提交事務時，資料庫將使事務的寫入持久化（通常在預寫式日誌中，請參閱 “[讓 B 樹更可靠](/v1_tw/ch3#讓B樹更可靠)”），然後將提交記錄追加到磁碟中的日誌裡。如果資料庫在這個過程中間崩潰，當節點重啟時，事務會從日誌中恢復：如果提交記錄在崩潰之前成功地寫入磁碟，則認為事務被提交；否則來自該事務的任何寫入都被回滾。

因此，在單個節點上，事務的提交主要取決於資料持久化落盤的 **順序**：首先是資料，然後是提交記錄【72】。事務提交或終止的關鍵決定時刻是磁碟完成寫入提交記錄的時刻：在此之前，仍有可能中止（由於崩潰），但在此之後，事務已經提交（即使資料庫崩潰）。因此，是單一的裝置（連線到單個磁碟的控制器，且掛載在單臺機器上）使得提交具有原子性。

但是，如果一個事務中涉及多個節點呢？例如，你也許在分割槽資料庫中會有一個多物件事務，或者是一個按關鍵詞分割槽的次級索引（其中索引條目可能位於與主資料不同的節點上；請參閱 “[分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)”）。大多數 “NoSQL” 分散式資料儲存不支援這種分散式事務，但是很多關係型資料庫叢集支援（請參閱 “[實踐中的分散式事務](#實踐中的分散式事務)”）。

在這些情況下，僅向所有節點發送提交請求並獨立提交每個節點的事務是不夠的。這樣很容易發生違反原子性的情況：提交在某些節點上成功，而在其他節點上失敗：

* 某些節點可能會檢測到違反約束或衝突，因此需要中止，而其他節點則可以成功進行提交。
* 某些提交請求可能在網路中丟失，最終由於超時而中止，而其他提交請求則透過。
* 在提交記錄完全寫入之前，某些節點可能會崩潰，並在恢復時回滾，而其他節點則成功提交。

如果某些節點提交了事務，但其他節點卻放棄了這些事務，那麼這些節點就會彼此不一致（如 [圖 7-3](/v1/ddia_0703.png) 所示）。而且一旦在某個節點上提交了一個事務，如果事後發現它在其它節點上被中止了，它是無法撤回的。出於這個原因，節點只有在確定事務中的所有其他節點也都將提交之後，才能進行提交。

事務提交必須是不可撤銷的 —— 事務提交之後，你不能改變主意，並追溯性地中止事務。這個規則的原因是，一旦資料被提交，其結果就對其他事務可見，因此其他客戶端可能會開始依賴這些資料。這個原則構成了 **讀已提交** 隔離等級的基礎，在 “[讀已提交](/v1_tw/ch7#讀已提交)” 一節中討論了這個問題。如果一個事務在提交後被允許中止，所有那些讀取了 **已提交卻又被追溯宣告不存在資料** 的事務也必須回滾。

（提交事務的結果有可能透過事後執行另一個補償事務（compensating transaction）來取消【73,74】，但從資料庫的角度來看，這是一個單獨的事務，因此任何關於跨事務正確性的保證都是應用自己的問題。）

#### 兩階段提交簡介

**兩階段提交（two-phase commit）** 是一種用於實現跨多個節點的原子事務提交的演算法，即確保所有節點提交或所有節點中止。它是分散式資料庫中的經典演算法【13,35,75】。2PC 在某些資料庫內部使用，也以 **XA 事務** 的形式對應用可用【76,77】（例如 Java Transaction API 支援）或以 SOAP Web 服務的 `WS-AtomicTransaction` 形式提供給應用【78,79】。

[圖 9-9](/v1/ddia_0909.png) 說明了 2PC 的基本流程。2PC 中的提交 / 中止過程分為兩個階段（因此而得名），而不是單節點事務中的單個提交請求。

![](/v1/ddia_0909.png)

**圖 9-9 兩階段提交（2PC）的成功執行**

> #### 不要把2PC和2PL搞混了
>
> 兩階段提交（2PC）和兩階段鎖定（請參閱 “[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)”）是兩個完全不同的東西。2PC 在分散式資料庫中提供原子提交，而 2PL 提供可序列化的隔離等級。為了避免混淆，最好把它們看作完全獨立的概念，並忽略名稱中不幸的相似性。

2PC 使用一個通常不會出現在單節點事務中的新元件：**協調者**（coordinator，也稱為 **事務管理器**，即 transaction manager）。協調者通常在請求事務的相同應用程序中以庫的形式實現（例如，嵌入在 Java EE 容器中），但也可以是單獨的程序或服務。這種協調者的例子包括 Narayana、JOTM、BTM 或 MSDTC。

正常情況下，2PC 事務以應用在多個數據庫節點上讀寫資料開始。我們稱這些資料庫節點為 **參與者（participants）**。當應用準備提交時，協調者開始階段 1 ：它傳送一個 **準備（prepare）** 請求到每個節點，詢問它們是否能夠提交。然後協調者會跟蹤參與者的響應：

* 如果所有參與者都回答 “是”，表示它們已經準備好提交，那麼協調者在階段 2 發出 **提交（commit）** 請求，然後提交真正發生。
* 如果任意一個參與者回覆了 “否”，則協調者在階段 2 中向所有節點發送 **中止（abort）** 請求。

這個過程有點像西方傳統婚姻儀式：司儀分別詢問新娘和新郎是否要結婚，通常是從兩方都收到 “我願意” 的答覆。收到兩者的回覆後，司儀宣佈這對情侶成為夫妻：事務就提交了，這一幸福事實會廣播至所有的參與者中。如果新娘與新郎之一沒有回覆 “我願意”，婚禮就會中止【73】。

#### 系統承諾

這個簡短的描述可能並沒有說清楚為什麼兩階段提交保證了原子性，而跨多個節點的一階段提交卻沒有。在兩階段提交的情況下，準備請求和提交請求當然也可以輕易丟失。2PC 又有什麼不同呢？

為了理解它的工作原理，我們必須更詳細地分解這個過程：

1. 當應用想要啟動一個分散式事務時，它向協調者請求一個事務 ID。此事務 ID 是全域性唯一的。
2. 應用在每個參與者上啟動單節點事務，並在單節點事務上捎帶上這個全域性事務 ID。所有的讀寫都是在這些單節點事務中各自完成的。如果在這個階段出現任何問題（例如，節點崩潰或請求超時），則協調者或任何參與者都可以中止。
3. 當應用準備提交時，協調者向所有參與者傳送一個 **準備** 請求，並打上全域性事務 ID 的標記。如果任意一個請求失敗或超時，則協調者向所有參與者傳送針對該事務 ID 的中止請求。
4. 參與者收到準備請求時，需要確保在任意情況下都的確可以提交事務。這包括將所有事務資料寫入磁碟（出現崩潰、電源故障或硬碟空間不足都不能是稍後拒絕提交的理由）以及檢查是否存在任何衝突或違反約束。透過向協調者回答 “是”，節點承諾，只要請求，這個事務一定可以不出差錯地提交。換句話說，參與者放棄了中止事務的權利，但沒有實際提交。
5. 當協調者收到所有準備請求的答覆時，會就提交或中止事務作出明確的決定（只有在所有參與者投贊成票的情況下才會提交）。協調者必須把這個決定寫到磁碟上的事務日誌中，如果它隨後就崩潰，恢復後也能知道自己所做的決定。這被稱為 **提交點（commit point）**。
6. 一旦協調者的決定落盤，提交或中止請求會發送給所有參與者。如果這個請求失敗或超時，協調者必須永遠保持重試，直到成功為止。沒有回頭路：如果已經做出決定，不管需要多少次重試它都必須被執行。如果參與者在此期間崩潰，事務將在其恢復後提交 —— 由於參與者投了贊成，因此恢復後它不能拒絕提交。

因此，該協議包含兩個關鍵的 “不歸路” 點：當參與者投票 “是” 時，它承諾它稍後肯定能夠提交（儘管協調者可能仍然選擇放棄）；以及一旦協調者做出決定，這一決定是不可撤銷的。這些承諾保證了 2PC 的原子性（單節點原子提交將這兩個事件合為了一體：將提交記錄寫入事務日誌）。

回到婚姻的比喻，在說 “我願意” 之前，你和你的新娘 / 新郎有中止這個事務的自由，只要回覆 “沒門！” 就行（或者有類似效果的話）。然而在說了 “我願意” 之後，你就不能撤回那個聲明了。如果你說 “我願意” 後暈倒了，沒有聽到司儀說 “你們現在是夫妻了”，那也並不會改變事務已經提交的現實。當你稍後恢復意識時，可以透過查詢司儀的全域性事務 ID 狀態來確定你是否已經成婚，或者你可以等待司儀重試下一次提交請求（因為重試將在你無意識期間一直持續）。

#### 協調者失效

我們已經討論了在 2PC 期間，如果參與者之一或網路發生故障時會發生什麼情況：如果任何一個 **準備** 請求失敗或者超時，協調者就會中止事務。如果任何提交或中止請求失敗，協調者將無條件重試。但是如果協調者崩潰，會發生什麼情況就不太清楚了。

如果協調者在傳送 **準備** 請求之前失敗，參與者可以安全地中止事務。但是，一旦參與者收到了準備請求並投了 “是”，就不能再單方面放棄 —— 必須等待協調者回答事務是否已經提交或中止。如果此時協調者崩潰或網路出現故障，參與者什麼也做不了只能等待。參與者的這種事務狀態稱為 **存疑（in doubt）** 的或 **不確定（uncertain）** 的。

情況如 [圖 9-10](/v1/ddia_0910.png) 所示。在這個特定的例子中，協調者實際上決定提交，資料庫 2 收到提交請求。但是，協調者在將提交請求傳送到資料庫 1 之前發生崩潰，因此資料庫 1 不知道是否提交或中止。即使 **超時** 在這裡也沒有幫助：如果資料庫 1 在超時後單方面中止，它將最終與執行提交的資料庫 2 不一致。同樣，單方面提交也是不安全的，因為另一個參與者可能已經中止了。

![](/v1/ddia_0910.png)

**圖 9-10 參與者投贊成票後，協調者崩潰。資料庫 1 不知道是否提交或中止**

沒有協調者的訊息，參與者無法知道是提交還是放棄。原則上參與者可以相互溝通，找出每個參與者是如何投票的，並達成一致，但這不是 2PC 協議的一部分。

可以完成 2PC 的唯一方法是等待協調者恢復。這就是為什麼協調者必須在向參與者傳送提交或中止請求之前，將其提交或中止決定寫入磁碟上的事務日誌：協調者恢復後，透過讀取其事務日誌來確定所有存疑事務的狀態。任何在協調者日誌中沒有提交記錄的事務都會中止。因此，2PC 的 **提交點** 歸結為協調者上的常規單節點原子提交。

#### 三階段提交

兩階段提交被稱為 **阻塞（blocking）** 原子提交協議，因為存在 2PC 可能卡住並等待協調者恢復的情況。理論上，可以使一個原子提交協議變為 **非阻塞（nonblocking）** 的，以便在節點失敗時不會卡住。但是讓這個協議能在實踐中工作並沒有那麼簡單。

作為 2PC 的替代方案，已經提出了一種稱為 **三階段提交（3PC）** 的演算法【13,80】。然而，3PC 假定網路延遲有界，節點響應時間有限；在大多數具有無限網路延遲和程序暫停的實際系統中（見 [第八章](/v1_tw/ch8)），它並不能保證原子性。

通常，非阻塞原子提交需要一個 **完美的故障檢測器（perfect failure detector）**【67,71】—— 即一個可靠的機制來判斷一個節點是否已經崩潰。在具有無限延遲的網路中，超時並不是一種可靠的故障檢測機制，因為即使沒有節點崩潰，請求也可能由於網路問題而超時。出於這個原因，2PC 仍然被使用，儘管大家都清楚可能存在協調者故障的問題。


### 實踐中的分散式事務

分散式事務的名聲譭譽參半，尤其是那些透過兩階段提交實現的。一方面，它們被視作提供了一個用其他方式難以實現的重要安全性保證；另一方面，它們因為導致運維問題，造成效能下降，做出超過能力範圍的承諾而飽受批評【81,82,83,84】。許多雲服務由於其導致的運維問題，而選擇不實現分散式事務【85,86】。

分散式事務的某些實現會帶來嚴重的效能損失 —— 例如據報告稱，MySQL 中的分散式事務比單節點事務慢 10 倍以上【87】，所以當人們建議不要使用它們時就不足為奇了。兩階段提交所固有的效能成本，大部分是由於崩潰恢復所需的額外強制刷盤（`fsync`）【88】以及額外的網路往返。

但我們不應該直接忽視分散式事務，而應當更加仔細地審視這些事務，因為從中可以汲取重要的經驗教訓。首先，我們應該精確地說明 “**分散式事務**” 的含義。兩種截然不同的分散式事務型別經常被混淆：

* 資料庫內部的分散式事務

  一些分散式資料庫（即在其標準配置中使用複製和分割槽的資料庫）支援資料庫節點之間的內部事務。例如，VoltDB 和 MySQL Cluster 的 NDB 儲存引擎就有這樣的內部事務支援。在這種情況下，所有參與事務的節點都執行相同的資料庫軟體。

* 異構分散式事務

  在 **異構（heterogeneous）** 事務中，參與者是由兩種或兩種以上的不同技術組成的：例如來自不同供應商的兩個資料庫，甚至是非資料庫系統（如訊息代理）。跨系統的分散式事務必須確保原子提交，儘管系統可能完全不同。

資料庫內部事務不必與任何其他系統相容，因此它們可以使用任何協議，並能針對特定技術進行特定的最佳化。因此資料庫內部的分散式事務通常工作得很好。另一方面，跨異構技術的事務則更有挑戰性。

#### 恰好一次的訊息處理

異構的分散式事務處理能夠以強大的方式整合不同的系統。例如：訊息佇列中的一條訊息可以被確認為已處理，當且僅當用於處理訊息的資料庫事務成功提交。這是透過在同一個事務中原子提交 **訊息確認** 和 **資料庫寫入** 兩個操作來實現的。藉由分散式事務的支援，即使訊息代理和資料庫是在不同機器上執行的兩種不相關的技術，這種操作也是可能的。

如果訊息傳遞或資料庫事務任意一者失敗，兩者都會中止，因此訊息代理可能會在稍後安全地重傳訊息。因此，透過原子提交 **訊息處理及其副作用**，即使在成功之前需要幾次重試，也可以確保訊息被 **有效地（effectively）** 恰好處理一次。中止會拋棄部分完成事務所導致的任何副作用。

然而，只有當所有受事務影響的系統都使用同樣的 **原子提交協議（atomic commit protocol）** 時，這樣的分散式事務才是可能的。例如，假設處理訊息的副作用是傳送一封郵件，而郵件伺服器並不支援兩階段提交：如果訊息處理失敗並重試，則可能會發送兩次或更多次的郵件。但如果處理訊息的所有副作用都可以在事務中止時回滾，那麼這樣的處理流程就可以安全地重試，就好像什麼都沒有發生過一樣。

在 [第十一章](/v1_tw/ch11) 中將再次回到 “恰好一次” 訊息處理的主題。讓我們先來看看允許這種異構分散式事務的原子提交協議。

#### XA事務

*X/Open XA*（**擴充套件架構（eXtended Architecture）** 的縮寫）是跨異構技術實現兩階段提交的標準【76,77】。它於 1991 年推出並得到了廣泛的實現：許多傳統關係資料庫（包括 PostgreSQL、MySQL、DB2、SQL Server 和 Oracle）和訊息代理（包括 ActiveMQ、HornetQ、MSMQ 和 IBM MQ） 都支援 XA。

XA 不是一個網路協議 —— 它只是一個用來與事務協調者連線的 C API。其他語言也有這種 API 的繫結；例如在 Java EE 應用的世界中，XA 事務是使用 **Java 事務 API（JTA, Java Transaction API）** 實現的，而許多使用 **Java 資料庫連線（JDBC, Java Database Connectivity）** 的資料庫驅動，以及許多使用 **Java 訊息服務（JMS）** API 的訊息代理都支援 **Java 事務 API（JTA）**。

XA 假定你的應用使用網路驅動或客戶端庫來與 **參與者**（資料庫或訊息服務）進行通訊。如果驅動支援 XA，則意味著它會呼叫 XA API 以查明操作是否為分散式事務的一部分 —— 如果是，則將必要的資訊發往資料庫伺服器。驅動還會向協調者暴露回撥介面，協調者可以透過回撥來要求參與者準備、提交或中止。

事務協調者需要實現 XA API。標準沒有指明應該如何實現，但實際上協調者通常只是一個庫，被載入到發起事務的應用的同一個程序中（而不是單獨的服務）。它在事務中跟蹤所有的參與者，並在要求它們 **準備** 之後收集參與者的響應（透過驅動回撥），並使用本地磁碟上的日誌記錄每次事務的決定（提交 / 中止）。

如果應用程序崩潰，或者執行應用的機器報銷了，協調者也隨之往生極樂。然後任何帶有 **準備了** 但未提交事務的參與者都會在疑慮中卡死。由於協調程式的日誌位於應用伺服器的本地磁碟上，因此必須重啟該伺服器，且協調程式庫必須讀取日誌以恢復每個事務的提交 / 中止結果。只有這樣，協調者才能使用資料庫驅動的 XA 回撥來要求參與者提交或中止。資料庫伺服器不能直接聯絡協調者，因為所有通訊都必須透過客戶端庫。

#### 懷疑時持有鎖

為什麼我們這麼關心存疑事務？系統的其他部分就不能繼續正常工作，無視那些終將被清理的存疑事務嗎？

問題在於 **鎖（locking）**。正如在 “[讀已提交](/v1_tw/ch7#讀已提交)” 中所討論的那樣，資料庫事務通常獲取待修改的行上的 **行級排他鎖**，以防止髒寫。此外，如果要使用可序列化的隔離等級，則使用兩階段鎖定的資料庫也必須為事務所讀取的行加上共享鎖（請參閱 “[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)”）。

在事務提交或中止之前，資料庫不能釋放這些鎖（如 [圖 9-9](/v1/ddia_0909.png) 中的陰影區域所示）。因此，在使用兩階段提交時，事務必須在整個存疑期間持有這些鎖。如果協調者已經崩潰，需要 20 分鐘才能重啟，那麼這些鎖將會被持有 20 分鐘。如果協調者的日誌由於某種原因徹底丟失，這些鎖將被永久持有 —— 或至少在管理員手動解決該情況之前。

當這些鎖被持有時，其他事務不能修改這些行。根據資料庫的不同，其他事務甚至可能因為讀取這些行而被阻塞。因此，其他事務沒法兒簡單地繼續它們的業務了 —— 如果它們要訪問同樣的資料，就會被阻塞。這可能會導致應用大面積進入不可用狀態，直到存疑事務被解決。

#### 從協調者故障中恢復

理論上，如果協調者崩潰並重新啟動，它應該乾淨地從日誌中恢復其狀態，並解決任何存疑事務。然而在實踐中，**孤立（orphaned）** 的存疑事務確實會出現【89,90】，即無論出於何種理由，協調者無法確定事務的結果（例如事務日誌已經由於軟體錯誤丟失或損壞）。這些事務無法自動解決，所以它們永遠待在資料庫中，持有鎖並阻塞其他事務。

即使重啟資料庫伺服器也無法解決這個問題，因為在 2PC 的正確實現中，即使重啟也必須保留存疑事務的鎖（否則就會冒違反原子性保證的風險）。這是一種棘手的情況。

唯一的出路是讓管理員手動決定提交還是回滾事務。管理員必須檢查每個存疑事務的參與者，確定是否有任何參與者已經提交或中止，然後將相同的結果應用於其他參與者。解決這個問題潛在地需要大量的人力，並且可能發生在嚴重的生產中斷期間（不然為什麼協調者處於這種糟糕的狀態），並很可能要在巨大精神壓力和時間壓力下完成。

許多 XA 的實現都有一個叫做 **啟發式決策（heuristic decisions）** 的緊急逃生艙口：允許參與者單方面決定放棄或提交一個存疑事務，而無需協調者做出最終決定【76,77,91】。要清楚的是，這裡 **啟發式** 是 **可能破壞原子性（probably breaking atomicity）** 的委婉說法，因為它違背了兩階段提交的系統承諾。因此，啟發式決策只是為了逃出災難性的情況而準備的，而不是為了日常使用的。

#### 分散式事務的限制

XA 事務解決了保持多個參與者（資料系統）相互一致的現實的和重要的問題，但正如我們所看到的那樣，它也引入了嚴重的運維問題。特別來講，這裡的核心認識是：事務協調者本身就是一種資料庫（儲存了事務的結果），因此需要像其他重要資料庫一樣小心地打交道：

* 如果協調者沒有複製，而是隻在單臺機器上執行，那麼它是整個系統的失效單點（因為它的失效會導致其他應用伺服器阻塞在存疑事務持有的鎖上）。令人驚訝的是，許多協調者實現預設情況下並不是高可用的，或者只有基本的複製支援。
* 許多伺服器端應用都是使用無狀態模式開發的（受 HTTP 的青睞），所有持久狀態都儲存在資料庫中，因此具有應用伺服器可隨意按需新增刪除的優點。但是，當協調者成為應用伺服器的一部分時，它會改變部署的性質。突然間，協調者的日誌成為持久系統狀態的關鍵部分 —— 與資料庫本身一樣重要，因為協調者日誌是為了在崩潰後恢復存疑事務所必需的。這樣的應用伺服器不再是無狀態的了。
* 由於 XA 需要相容各種資料系統，因此它必須是所有系統的最小公分母。例如，它不能檢測不同系統間的死鎖（因為這將需要一個標準協議來讓系統交換每個事務正在等待的鎖的資訊），而且它無法與 SSI（請參閱 [可序列化快照隔離](/v1_tw/ch7#可序列化快照隔離)）協同工作，因為這需要一個跨系統定位衝突的協議。
* 對於資料庫內部的分散式事務（不是 XA），限制沒有這麼大 —— 例如，分散式版本的 SSI 是可能的。然而仍然存在問題：2PC 成功提交一個事務需要所有參與者的響應。因此，如果系統的 **任何** 部分損壞，事務也會失敗。因此，分散式事務又有 **擴大失效（amplifying failures）** 的趨勢，這又與我們構建容錯系統的目標背道而馳。

這些事實是否意味著我們應該放棄保持幾個系統相互一致的所有希望？不完全是 —— 還有其他的辦法，可以讓我們在沒有異構分散式事務的痛苦的情況下實現同樣的事情。我們將在 [第十一章](/v1_tw/ch11) 和 [第十二章](/v1_tw/ch12) 回到這些話題。但首先，我們應該概括一下關於 **共識** 的話題。


### 容錯共識

非正式地，共識意味著讓幾個節點就某事達成一致。例如，如果有幾個人 **同時（concurrently）** 嘗試預訂飛機上的最後一個座位，或劇院中的同一個座位，或者嘗試使用相同的使用者名稱註冊一個帳戶，共識演算法可以用來確定這些 **互不相容（mutually incompatible）** 的操作中，哪一個才是贏家。

共識問題通常形式化如下：一個或多個節點可以 **提議（propose）** 某些值，而共識演算法 **決定（decides）** 採用其中的某個值。在座位預訂的例子中，當幾個顧客同時試圖訂購最後一個座位時，處理顧客請求的每個節點可以 **提議** 將要服務的顧客的 ID，而 **決定** 指明了哪個顧客獲得了座位。

在這種形式下，共識演算法必須滿足以下性質【25】：[^xiii]

[^xiii]: 這種共識的特殊形式被稱為 **統一共識（uniform consensus）**，相當於在具有不可靠故障檢測器的非同步系統中的 **常規共識（regular consensus）**【71】。學術文獻通常指的是 **程序（process）** 而不是節點，但我們在這裡使用 **節點（node）** 來與本書的其餘部分保持一致。

一致同意（Uniform agreement）
: 沒有兩個節點的決定不同。

完整性（Integrity）
: 沒有節點決定兩次。

有效性（Validity）
: 如果一個節點決定了值 `v` ，則 `v` 由某個節點所提議。

終止（Termination）
: 所有未崩潰的節點最終都會決定某個值。

**一致同意** 和 **完整性** 屬性定義了共識的核心思想：所有人都決定了相同的結果，一旦決定了，你就不能改變主意。**有效性** 屬性主要是為了排除平凡的解決方案：例如，無論提議了什麼值，你都可以有一個始終決定值為 `null` 的演算法，該演算法滿足 **一致同意** 和 **完整性** 屬性，但不滿足 **有效性** 屬性。

如果你不關心容錯，那麼滿足前三個屬性很容易：你可以將一個節點硬編碼為 “獨裁者”，並讓該節點做出所有的決定。但如果該節點失效，那麼系統就無法再做出任何決定。事實上，這就是我們在兩階段提交的情況中所看到的：如果協調者失效，那麼存疑的參與者就無法決定提交還是中止。

**終止** 屬性形式化了容錯的思想。它實質上說的是，一個共識演算法不能簡單地永遠閒坐著等死 —— 換句話說，它必須取得進展。即使部分節點出現故障，其他節點也必須達成一項決定（**終止** 是一種 **活性屬性**，而另外三種是 **安全屬性** —— 請參閱 “[安全性和活性](/v1_tw/ch8#安全性和活性)”）。

共識的系統模型假設，當一個節點 “崩潰” 時，它會突然消失而且永遠不會回來。（不像軟體崩潰，想象一下地震，包含你的節點的資料中心被山體滑坡所摧毀，你必須假設節點被埋在 30 英尺以下的泥土中，並且永遠不會重新上線）在這個系統模型中，任何需要等待節點恢復的演算法都不能滿足 **終止** 屬性。特別是，2PC 不符合終止屬性的要求。

當然如果 **所有** 的節點都崩潰了，沒有一個在執行，那麼所有演算法都不可能決定任何事情。演算法可以容忍的失效數量是有限的：事實上可以證明，任何共識演算法都需要至少佔總體 **多數（majority）** 的節點正確工作，以確保終止屬性【67】。多數可以安全地組成法定人數（請參閱 “[讀寫的法定人數](/v1_tw/ch5#讀寫的法定人數)”）。

因此 **終止** 屬性取決於一個假設，**少於一半的節點崩潰或不可達**。然而即使多數節點出現故障或存在嚴重的網路問題，絕大多數共識的實現都能始終確保安全屬性得到滿足 —— 一致同意，完整性和有效性【92】。因此，大規模的中斷可能會阻止系統處理請求，但是它不能透過使系統做出無效的決定來破壞共識系統。

大多數共識演算法假設不存在 **拜占庭式錯誤**，正如在 “[拜占庭故障](/v1_tw/ch8#拜占庭故障)” 一節中所討論的那樣。也就是說，如果一個節點沒有正確地遵循協議（例如，如果它向不同節點發送矛盾的訊息），它就可能會破壞協議的安全屬性。克服拜占庭故障，穩健地達成共識是可能的，只要少於三分之一的節點存在拜占庭故障【25,93】。但我們沒有地方在本書中詳細討論這些演算法了。

#### 共識演算法和全序廣播

最著名的容錯共識演算法是 **檢視戳複製（VSR, Viewstamped Replication）**【94,95】，Paxos 【96,97,98,99】，Raft 【22,100,101】以及 Zab 【15,21,102】 。這些演算法之間有不少相似之處，但它們並不相同【103】。在本書中我們不會介紹各種演算法的詳細細節：瞭解一些它們共通的高階思想通常已經足夠了，除非你準備自己實現一個共識系統。（可能並不明智，相當難【98,104】）

大多數這些演算法實際上並不直接使用這裡描述的形式化模型（提議與決定單個值，並滿足一致同意、完整性、有效性和終止屬性）。取而代之的是，它們決定了值的 **順序（sequence）**，這使它們成為全序廣播演算法，正如本章前面所討論的那樣（請參閱 “[全序廣播](#全序廣播)”）。

請記住，全序廣播要求將訊息按照相同的順序，恰好傳遞一次，準確傳送到所有節點。如果仔細思考，這相當於進行了幾輪共識：在每一輪中，節點提議下一條要傳送的訊息，然後決定在全序中下一條要傳送的訊息【67】。

所以，全序廣播相當於重複進行多輪共識（每次共識決定與一次訊息傳遞相對應）：

* 由於 **一致同意** 屬性，所有節點決定以相同的順序傳遞相同的訊息。
* 由於 **完整性** 屬性，訊息不會重複。
* 由於 **有效性** 屬性，訊息不會被損壞，也不能憑空編造。
* 由於 **終止** 屬性，訊息不會丟失。

檢視戳複製，Raft 和 Zab 直接實現了全序廣播，因為這樣做比重複 **一次一值（one value at a time）** 的共識更高效。在 Paxos 的情況下，這種最佳化被稱為 Multi-Paxos。

#### 單主複製與共識

在 [第五章](/v1_tw/ch5) 中，我們討論了單主複製（請參閱 “[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”），它將所有的寫入操作都交給主庫，並以相同的順序將它們應用到從庫，從而使副本保持在最新狀態。這實際上不就是一個全序廣播嗎？為什麼我們在 [第五章](/v1_tw/ch5) 裡一點都沒擔心過共識問題呢？

答案取決於如何選擇領導者。如果主庫是由運維人員手動選擇和配置的，那麼你實際上擁有一種 **獨裁型別** 的 “共識演算法”：只有一個節點被允許接受寫入（即決定寫入複製日誌的順序），如果該節點發生故障，則系統將無法寫入，直到運維手動配置其他節點作為主庫。這樣的系統在實踐中可以表現良好，但它無法滿足共識的 **終止** 屬性，因為它需要人為干預才能取得 **進展**。

一些資料庫會自動執行領導者選舉和故障切換，如果舊主庫失效，會提拔一個從庫為新主庫（請參閱 “[處理節點宕機](/v1_tw/ch5#處理節點宕機)”）。這使我們向容錯的全序廣播更進一步，從而達成共識。

但是還有一個問題。我們之前曾經討論過腦裂的問題，並且說過所有的節點都需要同意是誰領導，否則兩個不同的節點都會認為自己是領導者，從而導致資料庫進入不一致的狀態。因此，選出一位領導者需要共識。但如果這裡描述的共識演算法實際上是全序廣播演算法，並且全序廣播就像單主複製，而單主複製需要一個領導者，那麼...

這樣看來，要選出一個領導者，我們首先需要一個領導者。要解決共識問題，我們首先需要解決共識問題。我們如何跳出這個先有雞還是先有蛋的問題？

#### 紀元編號和法定人數

迄今為止所討論的所有共識協議，在內部都以某種形式使用一個領導者，但它們並不能保證領導者是獨一無二的。相反，它們可以做出更弱的保證：協議定義了一個 **紀元編號**（epoch number，在 Paxos 中被稱為 **投票編號**，即 ballot number，在檢視戳複製中被稱為 **檢視編號**，即 view number，以及在 Raft 中被稱為 **任期號碼**，即 term number），並確保在每個紀元中，領導者都是唯一的。

每次當現任領導被認為掛掉的時候，節點間就會開始一場投票，以選出一個新領導。這次選舉被賦予一個遞增的紀元編號，因此紀元編號是全序且單調遞增的。如果兩個不同紀元的領導者之間出現衝突（也許是因為前任領導者實際上並未死亡），那麼帶有更高紀元編號的領導說了算。

在任何領導者被允許決定任何事情之前，必須先檢查是否存在其他帶有更高紀元編號的領導者，它們可能會做出相互衝突的決定。領導者如何知道自己沒有被另一個節點趕下臺？回想一下在 “[真相由多數所定義](/v1_tw/ch8#真相由多數所定義)” 中提到的：一個節點不一定能相信自己的判斷 —— 因為只有節點自己認為自己是領導者，並不一定意味著其他節點接受它作為它們的領導者。

相反，它必須從 **法定人數（quorum）** 的節點中獲取選票（請參閱 “[讀寫的法定人數](/v1_tw/ch5#讀寫的法定人數)”）。對領導者想要做出的每一個決定，都必須將提議值傳送給其他節點，並等待法定人數的節點響應並贊成提案。法定人數通常（但不總是）由多數節點組成【105】。只有在沒有意識到任何帶有更高紀元編號的領導者的情況下，一個節點才會投票贊成提議。

因此，我們有兩輪投票：第一次是為了選出一位領導者，第二次是對領導者的提議進行表決。關鍵的洞察在於，這兩次投票的 **法定人數** 必須相互 **重疊（overlap）**：如果一個提案的表決透過，則至少得有一個參與投票的節點也必須參加過最近的領導者選舉【105】。因此，如果在一個提案的表決過程中沒有出現更高的紀元編號，那麼現任領導者就可以得出這樣的結論：沒有發生過更高紀元的領導選舉，因此可以確定自己仍然在領導。然後它就可以安全地對提議值做出決定。

這一投票過程表面上看起來很像兩階段提交。最大的區別在於，2PC 中協調者不是由選舉產生的，而且 2PC 則要求 **所有** 參與者都投贊成票，而容錯共識演算法只需要多數節點的投票。而且，共識演算法還定義了一個恢復過程，節點可以在選舉出新的領導者之後進入一個一致的狀態，確保始終能滿足安全屬性。這些區別正是共識演算法正確性和容錯性的關鍵。

#### 共識的侷限性

共識演算法對於分散式系統來說是一個巨大的突破：它為其他充滿不確定性的系統帶來了基礎的安全屬性（一致同意，完整性和有效性），然而它們還能保持容錯（只要多數節點正常工作且可達，就能取得進展）。它們提供了全序廣播，因此它們也可以以一種容錯的方式實現線性一致的原子操作（請參閱 “[使用全序廣播實現線性一致的儲存](#使用全序廣播實現線性一致的儲存)”）。

儘管如此，它們並不是在所有地方都用上了，因為好處總是有代價的。

節點在做出決定之前對提議進行投票的過程是一種同步複製。如 “[同步複製與非同步複製](/v1_tw/ch5#同步複製與非同步複製)” 中所述，通常資料庫會配置為非同步複製模式。在這種配置中發生故障切換時，一些已經提交的資料可能會丟失 —— 但是為了獲得更好的效能，許多人選擇接受這種風險。

共識系統總是需要嚴格多數來運轉。這意味著你至少需要三個節點才能容忍單節點故障（其餘兩個構成多數），或者至少有五個節點來容忍兩個節點發生故障（其餘三個構成多數）。如果網路故障切斷了某些節點同其他節點的連線，則只有多數節點所在的網路可以繼續工作，其餘部分將被阻塞（請參閱 “[線性一致性的代價](#線性一致性的代價)”）。

大多數共識演算法假定參與投票的節點是固定的集合，這意味著你不能簡單地在叢集中新增或刪除節點。共識演算法的 **動態成員擴充套件（dynamic membership extension）** 允許叢集中的節點集隨時間推移而變化，但是它們比靜態成員演算法要難理解得多。

共識系統通常依靠超時來檢測失效的節點。在網路延遲高度變化的環境中，特別是在地理上散佈的系統中，經常發生一個節點由於暫時的網路問題，錯誤地認為領導者已經失效。雖然這種錯誤不會損害安全屬性，但頻繁的領導者選舉會導致糟糕的效能表現，因系統最後可能花在權力傾軋上的時間要比花在建設性工作的多得多。

有時共識演算法對網路問題特別敏感。例如 Raft 已被證明存在讓人不悅的極端情況【106】：如果整個網路工作正常，但只有一條特定的網路連線一直不可靠，Raft 可能會進入領導者在兩個節點間頻繁切換的局面，或者當前領導者不斷被迫辭職以致系統實質上毫無進展。其他共識演算法也存在類似的問題，而設計能健壯應對不可靠網路的演算法仍然是一個開放的研究問題。

### 成員與協調服務

像 ZooKeeper 或 etcd 這樣的專案通常被描述為 “分散式鍵值儲存” 或 “協調與配置服務”。這種服務的 API 看起來非常像資料庫：你可以讀寫給定鍵的值，並遍歷鍵。所以如果它們基本上算是資料庫的話，為什麼它們要把工夫全花在實現一個共識演算法上呢？是什麼使它們區別於其他任意型別的資料庫？

為了理解這一點，簡單瞭解如何使用 ZooKeeper 這類服務是很有幫助的。作為應用開發人員，你很少需要直接使用 ZooKeeper，因為它實際上不適合當成通用資料庫來用。更有可能的是，你會透過其他專案間接依賴它，例如 HBase、Hadoop YARN、OpenStack Nova 和 Kafka 都依賴 ZooKeeper 在後臺執行。這些專案從它那裡得到了什麼？

ZooKeeper 和 etcd 被設計為容納少量完全可以放在記憶體中的資料（雖然它們仍然會寫入磁碟以保證永續性），所以你不會想著把所有應用資料放到這裡。這些少量資料會透過容錯的全序廣播演算法複製到所有節點上。正如前面所討論的那樣，資料庫複製需要的就是全序廣播：如果每條訊息代表對資料庫的寫入，則以相同的順序應用相同的寫入操作可以使副本之間保持一致。

ZooKeeper 模仿了 Google 的 Chubby 鎖服務【14,98】，不僅實現了全序廣播（因此也實現了共識），而且還構建了一組有趣的其他特性，這些特性在構建分散式系統時變得特別有用：

線性一致性的原子操作
: 使用原子 CAS 操作可以實現鎖：如果多個節點同時嘗試執行相同的操作，只有一個節點會成功。共識協議保證了操作的原子性和線性一致性，即使節點發生故障或網路在任意時刻中斷。分散式鎖通常以 **租約（lease）** 的形式實現，租約有一個到期時間，以便在客戶端失效的情況下最終能被釋放（請參閱 “[程序暫停](/v1_tw/ch8#程序暫停)”）。

操作的全序排序
: 如 “[領導者和鎖](/v1_tw/ch8#領導者和鎖)” 中所述，當某個資源受到鎖或租約的保護時，你需要一個防護令牌來防止客戶端在程序暫停的情況下彼此衝突。防護令牌是每次鎖被獲取時單調增加的數字。ZooKeeper 透過全序化所有操作來提供這個功能，它為每個操作提供一個單調遞增的事務 ID（`zxid`）和版本號（`cversion`）【15】。

失效檢測
: 客戶端在 ZooKeeper 伺服器上維護一個長期會話，客戶端和伺服器週期性地交換心跳包來檢查節點是否還活著。即使連線暫時中斷，或者 ZooKeeper 節點失效，會話仍保持在活躍狀態。但如果心跳停止的持續時間超出會話超時，ZooKeeper 會宣告該會話已死亡。會話持有的任何鎖都可以配置為在會話超時時自動釋放（ZooKeeper 稱這些節點為 **臨時節點**，即 ephemeral nodes）。

變更通知
: 客戶端不僅可以讀取其他客戶端建立的鎖和值，還可以監聽它們的變更。因此，客戶端可以知道另一個客戶端何時加入叢集（基於新客戶端寫入 ZooKeeper 的值），或發生故障（因其會話超時，而其臨時節點消失）。透過訂閱通知，客戶端不用再透過頻繁輪詢的方式來找出變更。

在這些功能中，只有線性一致的原子操作才真的需要共識。但正是這些功能的組合，使得像 ZooKeeper 這樣的系統在分散式協調中非常有用。

#### 將工作分配給節點

ZooKeeper/Chubby 模型執行良好的一個例子是，如果你有幾個程序例項或服務，需要選擇其中一個例項作為主庫或首選服務。如果領導者失敗，其他節點之一應該接管。這對單主資料庫當然非常實用，但對作業排程程式和類似的有狀態系統也很好用。

另一個例子是，當你有一些分割槽資源（資料庫、訊息流、檔案儲存、分散式 Actor 系統等），並需要決定將哪個分割槽分配給哪個節點時。當新節點加入叢集時，需要將某些分割槽從現有節點移動到新節點，以便重新平衡負載（請參閱 “[分割槽再平衡](/v1_tw/ch6#分割槽再平衡)”）。當節點被移除或失效時，其他節點需要接管失效節點的工作。

這類任務可以透過在 ZooKeeper 中明智地使用原子操作，臨時節點與通知來實現。如果設計得當，這種方法允許應用自動從故障中恢復而無需人工干預。不過這並不容易，儘管已經有不少在 ZooKeeper 客戶端 API 基礎之上提供更高層工具的庫，例如 Apache Curator 【17】。但它仍然要比嘗試從頭實現必要的共識演算法要好得多，這樣的嘗試鮮有成功記錄【107】。

應用最初可能只在單個節點上執行，但最終可能會增長到數千個節點。試圖在如此之多的節點上進行多數投票將是非常低效的。相反，ZooKeeper 在固定數量的節點（通常是三到五個）上執行，並在這些節點之間執行其多數票，同時支援潛在的大量客戶端。因此，ZooKeeper 提供了一種將協調節點（共識，操作排序和故障檢測）的一些工作 “外包” 到外部服務的方式。

通常，由 ZooKeeper 管理的資料型別的變化十分緩慢：代表 “分割槽 7 中的節點執行在 `10.1.1.23` 上” 的資訊可能會在幾分鐘或幾小時的時間內發生變化。它不是用來儲存應用的執行時狀態的，後者每秒可能會改變數千甚至數百萬次。如果應用狀態需要從一個節點複製到另一個節點，則可以使用其他工具（如 Apache BookKeeper 【108】）。

#### 服務發現

ZooKeeper、etcd 和 Consul 也經常用於服務發現 —— 也就是找出你需要連線到哪個 IP 地址才能到達特定的服務。在雲資料中心環境中，虛擬機器來來往往很常見，你通常不會事先知道服務的 IP 地址。相反，你可以配置你的服務，使其在啟動時註冊服務登錄檔中的網路端點，然後可以由其他服務找到它們。

但是，服務發現是否需要達成共識還不太清楚。DNS 是查詢服務名稱的 IP 地址的傳統方式，它使用多層快取來實現良好的效能和可用性。從 DNS 讀取絕對不是線性一致的，如果 DNS 查詢的結果有點陳舊，通常不會有問題【109】。DNS 的可用性和對網路中斷的魯棒性更重要。

儘管服務發現並不需要共識，但領導者選舉卻是如此。因此，如果你的共識系統已經知道領導是誰，那麼也可以使用這些資訊來幫助其他服務發現領導是誰。為此，一些共識系統支援只讀快取副本。這些副本非同步接收共識演算法所有決策的日誌，但不主動參與投票。因此，它們能夠提供不需要線性一致性的讀取請求。

#### 成員資格服務

ZooKeeper 和它的小夥伴們可以看作是成員資格服務（membership services）研究的悠久歷史的一部分，這個歷史可以追溯到 20 世紀 80 年代，並且對建立高度可靠的系統（例如空中交通管制）非常重要【110】。

成員資格服務確定哪些節點當前處於活動狀態並且是叢集的活動成員。正如我們在 [第八章](/v1_tw/ch8) 中看到的那樣，由於無限的網路延遲，無法可靠地檢測到另一個節點是否發生故障。但是，如果你透過共識來進行故障檢測，那麼節點可以就哪些節點應該被認為是存在或不存在達成一致。

一個節點仍然可能被共識錯誤地宣告死亡，即使它實際上還活著。但是對於一個系統來說，知道哪些節點構成了當前的成員關係是非常有用的。例如，選擇領導者可能意味著簡單地選擇當前成員中編號最小的成員，但如果不同的節點對現有的成員都有誰有不同意見，則這種方法將不起作用。


## 本章小結

在本章中，我們從幾個不同的角度審視了關於一致性與共識的話題。我們深入研究了線性一致性（一種流行的一致性模型）：其目標是使多副本資料看起來好像只有一個副本一樣，並使其上所有操作都原子性地生效。雖然線性一致性因為簡單易懂而很吸引人 —— 它使資料庫表現得好像單執行緒程式中的一個變數一樣，但它有著速度緩慢的缺點，特別是在網路延遲很大的環境中。

我們還探討了因果性，因果性對系統中的事件施加了順序（什麼發生在什麼之前，基於因與果）。與線性一致不同，線性一致性將所有操作放在單一的全序時間線中，因果一致性為我們提供了一個較弱的一致性模型：某些事件可以是 **併發** 的，所以版本歷史就像是一條不斷分叉與合併的時間線。因果一致性沒有線性一致性的協調開銷，而且對網路問題的敏感性要低得多。

但即使捕獲到因果順序（例如使用蘭伯特時間戳），我們發現有些事情也不能透過這種方式實現：在 “[光有時間戳排序還不夠](#光有時間戳排序還不夠)” 一節的例子中，我們需要確保使用者名稱是唯一的，並拒絕同一使用者名稱的其他併發註冊。如果一個節點要透過註冊，則需要知道其他的節點沒有在併發搶注同一使用者名稱的過程中。這個問題引領我們走向 **共識**。

我們看到，達成共識意味著以這樣一種方式決定某件事：所有節點一致同意所做決定，且這一決定不可撤銷。透過深入挖掘，結果我們發現很廣泛的一系列問題實際上都可以歸結為共識問題，並且彼此等價（從這個意義上來講，如果你有其中之一的解決方案，就可以輕易將它轉換為其他問題的解決方案）。這些等價的問題包括：

線性一致性的 CAS 暫存器
: 暫存器需要基於當前值是否等於操作給出的引數，原子地 **決定** 是否設定新值。

原子事務提交
: 資料庫必須 **決定** 是否提交或中止分散式事務。

全序廣播
: 訊息系統必須 **決定** 傳遞訊息的順序。

鎖和租約
: 當幾個客戶端爭搶鎖或租約時，由鎖來 **決定** 哪個客戶端成功獲得鎖。

成員 / 協調服務
: 給定某種故障檢測器（例如超時），系統必須 **決定** 哪些節點活著，哪些節點因為會話超時需要被宣告死亡。

唯一性約束
: 當多個事務同時嘗試使用相同的鍵建立衝突記錄時，約束必須 **決定** 哪一個被允許，哪些因為違反約束而失敗。

如果你只有一個節點，或者你願意將決策的權能分配給單個節點，所有這些事都很簡單。這就是在單領導者資料庫中發生的事情：所有決策權歸屬於領導者，這就是為什麼這樣的資料庫能夠提供線性一致的操作，唯一性約束，完全有序的複製日誌，以及更多。

但如果該領導者失效，或者如果網路中斷導致領導者不可達，這樣的系統就無法取得任何進展。應對這種情況可以有三種方法：

1. 等待領導者恢復，接受系統將在這段時間阻塞的事實。許多 XA/JTA 事務協調者選擇這個選項。這種方法並不能完全達成共識，因為它不能滿足 **終止** 屬性的要求：如果領導者續命失敗，系統可能會永久阻塞。
2. 人工故障切換，讓人類選擇一個新的領導者節點，並重新配置系統使之生效，許多關係型資料庫都採用這種方式。這是一種來自 “天意” 的共識 —— 由計算機系統之外的運維人員做出決定。故障切換的速度受到人類行動速度的限制，通常要比計算機慢（得多）。
3. 使用演算法自動選擇一個新的領導者。這種方法需要一種共識演算法，使用成熟的演算法來正確處理惡劣的網路條件是明智之舉【107】。

儘管單領導者資料庫可以提供線性一致性，且無需對每個寫操作都執行共識演算法，但共識對於保持及變更領導權仍然是必須的。因此從某種意義上說，使用單個領導者不過是 “緩兵之計”：共識仍然是需要的，只是在另一個地方，而且沒那麼頻繁。好訊息是，容錯的共識演算法與容錯的共識系統是存在的，我們在本章中簡要地討論了它們。

像 ZooKeeper 這樣的工具為應用提供了 “外包” 的共識、故障檢測和成員服務。它們扮演了重要的角色，雖說使用不易，但總比自己去開發一個能經受 [第八章](/v1_tw/ch8) 中所有問題考驗的演算法要好得多。如果你發現自己想要解決的問題可以歸結為共識，並且希望它能容錯，使用一個類似 ZooKeeper 的東西是明智之舉。

儘管如此，並不是所有系統都需要共識：例如，無領導者複製和多領導者複製系統通常不會使用全域性的共識。這些系統中出現的衝突（請參閱 “[處理寫入衝突](/v1_tw/ch5#處理寫入衝突)”）正是不同領導者之間沒有達成共識的結果，但這也許並沒有關係：也許我們只是需要接受沒有線性一致性的事實，並學會更好地與具有分支與合併版本歷史的資料打交道。

本章引用了大量關於分散式系統理論的研究。雖然理論論文和證明並不總是容易理解，有時也會做出不切實際的假設，但它們對於指導這一領域的實踐有著極其重要的價值：它們幫助我們推理什麼可以做，什麼不可以做，幫助我們找到反直覺的分散式系統缺陷。如果你有時間，這些參考資料值得探索。

這裡已經到了本書 [第二部分](/v1_tw/part-ii) 的末尾，第二部介紹了複製（[第五章](/v1_tw/ch5)）、分割槽（[第六章](/v1_tw/ch6)）、事務（[第七章](/v1_tw/ch7)）、分散式系統的故障模型（[第八章](/v1_tw/ch8)）以及最後的一致性與共識（[第九章](/v1_tw/ch9)）。現在我們已經奠定了紮實的理論基礎，我們將在 [第三部分](/v1_tw/part-iii) 再次轉向更實際的系統，並討論如何使用異構的元件積木塊構建強大的應用。


## 參考文獻

1. Peter Bailis and Ali Ghodsi: “[Eventual Consistency Today: Limitations, Extensions, and Beyond](http://queue.acm.org/detail.cfm?id=2462076),” *ACM Queue*, volume 11, number 3, pages 55-63, March 2013. [doi:10.1145/2460276.2462076](http://dx.doi.org/10.1145/2460276.2462076)
1. Prince Mahajan, Lorenzo Alvisi, and Mike Dahlin: “[Consistency, Availability, and Convergence](http://apps.cs.utexas.edu/tech_reports/reports/tr/TR-2036.pdf),” University of Texas at Austin, Department of Computer Science, Tech Report UTCS TR-11-22, May 2011.
1. Alex Scotti: “[Adventures in Building Your Own Database](http://www.slideshare.net/AlexScotti1/allyourbase-55212398),” at *All Your Base*, November 2015.
1. Peter Bailis, Aaron Davidson, Alan Fekete, et al.: “[Highly Available Transactions: Virtues and Limitations](http://arxiv.org/pdf/1302.0309.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014. Extended version published as pre-print arXiv:1302.0309 &#91;cs.DB&#93;.
1. Paolo Viotti and Marko Vukolić: “[Consistency in Non-Transactional Distributed Storage Systems](http://arxiv.org/abs/1512.00168),” arXiv:1512.00168, 12 April 2016.
1. Maurice P. Herlihy and Jeannette M. Wing: “[Linearizability: A Correctness Condition for Concurrent Objects](http://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 12, number 3, pages 463–492, July 1990. [doi:10.1145/78969.78972](http://dx.doi.org/10.1145/78969.78972)
1. Leslie Lamport: “[On interprocess communication](https://www.microsoft.com/en-us/research/publication/interprocess-communication-part-basic-formalism-part-ii-algorithms/),” *Distributed Computing*, volume 1, number 2, pages 77–101, June 1986. [doi:10.1007/BF01786228](http://dx.doi.org/10.1007/BF01786228)
1. David K. Gifford: “[Information Storage in a Decentralized Computer System](http://www.mirrorservice.org/sites/www.bitsavers.org/pdf/xerox/parc/techReports/CSL-81-8_Information_Storage_in_a_Decentralized_Computer_System.pdf),” Xerox Palo Alto Research Centers, CSL-81-8, June 1981.
1. Martin Kleppmann: “[Please Stop Calling Databases CP or AP](http://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html),” *martin.kleppmann.com*, May 11, 2015.
1. Kyle Kingsbury: “[Call Me Maybe: MongoDB Stale Reads](https://aphyr.com/posts/322-call-me-maybe-mongodb-stale-reads),” *aphyr.com*, April 20, 2015.
1. Kyle Kingsbury: “[Computational Techniques in Knossos](https://aphyr.com/posts/314-computational-techniques-in-knossos),” *aphyr.com*, May 17, 2014.
1. Peter Bailis: “[Linearizability Versus Serializability](http://www.bailis.org/blog/linearizability-versus-serializability/),” *bailis.org*, September 24, 2014.
1. Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman: [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at *research.microsoft.com*.
1. Mike Burrows: “[The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/),” at *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
1. Flavio P. Junqueira and Benjamin Reed: *ZooKeeper: Distributed Process Coordination*. O'Reilly Media, 2013. ISBN: 978-1-449-36130-3
1. “[etcd Documentation](https://etcd.io/docs/),” The Linux Foundation, *etcd.io*.
1. “[Apache Curator](http://curator.apache.org/),” Apache Software Foundation, *curator.apache.org*, 2015.
1. Murali Vallath: *Oracle 10g RAC Grid, Services & Clustering*. Elsevier Digital Press, 2006. ISBN: 978-1-555-58321-7
1. Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
1. Kyle Kingsbury: “[Call Me Maybe: etcd and Consul](https://aphyr.com/posts/316-call-me-maybe-etcd-and-consul),” *aphyr.com*, June 9, 2014.
1. Flavio P. Junqueira, Benjamin C. Reed, and Marco Serafini: “[Zab: High-Performance Broadcast for Primary-Backup Systems](https://web.archive.org/web/20220419064903/https://marcoserafini.github.io/papers/zab.pdf),” at *41st IEEE International Conference on Dependable Systems and Networks* (DSN), June 2011. [doi:10.1109/DSN.2011.5958223](http://dx.doi.org/10.1109/DSN.2011.5958223)
1. Diego Ongaro and John K. Ousterhout: “[In Search of an Understandable Consensus Algorithm](https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf),” at *USENIX Annual Technical Conference* (ATC), June 2014.
1. Hagit Attiya, Amotz Bar-Noy, and Danny Dolev: “[Sharing Memory Robustly in Message-Passing Systems](http://www.cse.huji.ac.il/course/2004/dist/p124-attiya.pdf),” *Journal of the ACM*, volume 42, number 1, pages 124–142, January 1995. [doi:10.1145/200836.200869](http://dx.doi.org/10.1145/200836.200869)
1. Nancy Lynch and Alex Shvartsman: “[Robust Emulation of Shared Memory Using Dynamic Quorum-Acknowledged Broadcasts](http://groups.csail.mit.edu/tds/papers/Lynch/FTCS97.pdf),” at *27th Annual International Symposium on Fault-Tolerant Computing* (FTCS), June 1997. [doi:10.1109/FTCS.1997.614100](http://dx.doi.org/10.1109/FTCS.1997.614100)
1. Christian Cachin, Rachid Guerraoui, and Luís Rodrigues: [*Introduction to Reliable and Secure Distributed Programming*](http://www.distributedprogramming.net/), 2nd edition. Springer, 2011. ISBN: 978-3-642-15259-7, [doi:10.1007/978-3-642-15260-3](http://dx.doi.org/10.1007/978-3-642-15260-3)
1. Sam Elliott, Mark Allen, and Martin Kleppmann: [personal communication](https://web.archive.org/web/20230620021338/https://twitter.com/lenary/status/654761711933648896), thread on *twitter.com*, October 15, 2015.
1. Niklas Ekström, Mikhail Panchenko, and Jonathan Ellis: “[Possible Issue with Read Repair?](http://mail-archives.apache.org/mod_mbox/cassandra-dev/201210.mbox/%3CFA480D1DC3964E2C8B0A14E0880094C9%40Robotech%3E),” email thread on *cassandra-dev* mailing list, October 2012.
1. Maurice P. Herlihy: “[Wait-Free Synchronization](https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 13, number 1, pages 124–149, January 1991. [doi:10.1145/114005.102808](http://dx.doi.org/10.1145/114005.102808)
1. Armando Fox and Eric A. Brewer: “[Harvest, Yield, and Scalable Tolerant Systems](http://radlab.cs.berkeley.edu/people/fox/static/pubs/pdf/c18.pdf),” at *7th Workshop on Hot Topics in Operating Systems* (HotOS), March 1999. [doi:10.1109/HOTOS.1999.798396](http://dx.doi.org/10.1109/HOTOS.1999.798396)
1. Seth Gilbert and Nancy Lynch: “[Brewer’s Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](http://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf),” *ACM SIGACT News*, volume 33, number 2, pages 51–59, June 2002. [doi:10.1145/564585.564601](http://dx.doi.org/10.1145/564585.564601)
1. Seth Gilbert and Nancy Lynch: “[Perspectives on the CAP Theorem](http://groups.csail.mit.edu/tds/papers/Gilbert/Brewer2.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 30–36, February 2012. [doi:10.1109/MC.2011.389](http://dx.doi.org/10.1109/MC.2011.389)
1. Eric A. Brewer: “[CAP Twelve Years Later: How the 'Rules' Have Changed](https://web.archive.org/web/20221222092656/http://cs609.cs.ua.edu/CAP12.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 23–29, February 2012. [doi:10.1109/MC.2012.37](http://dx.doi.org/10.1109/MC.2012.37)
1. Susan B. Davidson, Hector Garcia-Molina, and Dale Skeen: “[Consistency in Partitioned Networks](http://delab.csd.auth.gr/~dimitris/courses/mpc_fall05/papers/invalidation/acm_csur85_partitioned_network_consistency.pdf),” *ACM Computing Surveys*, volume 17, number 3, pages 341–370, September 1985. [doi:10.1145/5505.5508](http://dx.doi.org/10.1145/5505.5508)
1. Paul R. Johnson and Robert H. Thomas: “[RFC 677: The Maintenance of Duplicate Databases](https://tools.ietf.org/html/rfc677),” Network Working Group, January 27, 1975.
1. Bruce G. Lindsay, Patricia Griffiths Selinger, C. Galtieri, et al.: “[Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf),” IBM Research, Research Report RJ2571(33471), July 1979.
1. Michael J. Fischer and Alan Michael: “[Sacrificing Serializability to Attain High Availability of Data in an Unreliable Network](http://www.cs.ucsb.edu/~agrawal/spring2011/ugrad/p70-fischer.pdf),” at *1st ACM Symposium on Principles of Database Systems* (PODS), March 1982. [doi:10.1145/588111.588124](http://dx.doi.org/10.1145/588111.588124)
1. Eric A. Brewer: “[NoSQL: Past, Present, Future](http://www.infoq.com/presentations/NoSQL-History),” at *QCon San Francisco*, November 2012.
1. Henry Robinson: “[CAP Confusion: Problems with 'Partition Tolerance,'](https://web.archive.org/web/20160304020135/http://blog.cloudera.com/blog/2010/04/cap-confusion-problems-with-partition-tolerance/)” *blog.cloudera.com*, April 26, 2010.
1. Adrian Cockcroft: “[Migrating to Microservices](http://www.infoq.com/presentations/migration-cloud-native),” at *QCon London*, March 2014.
1. Martin Kleppmann: “[A Critique of the CAP Theorem](http://arxiv.org/abs/1509.05393),” arXiv:1509.05393, September 17, 2015.
1. Nancy A. Lynch: “[A Hundred Impossibility Proofs for Distributed Computing](http://groups.csail.mit.edu/tds/papers/Lynch/podc89.pdf),” at *8th ACM Symposium on Principles of Distributed Computing* (PODC), August 1989. [doi:10.1145/72981.72982](http://dx.doi.org/10.1145/72981.72982)
1. Hagit Attiya, Faith Ellen, and Adam Morrison: “[Limitations of Highly-Available Eventually-Consistent Data Stores](https://www.cs.tau.ac.il/~mad/publications/podc2015-replds.pdf),” at *ACM Symposium on Principles of Distributed Computing* (PODC), July 2015. [doi:10.1145/2767386.2767419](http://dx.doi.org/10.1145/2767386.2767419)
1. Peter Sewell, Susmit Sarkar, Scott Owens, et al.: “[x86-TSO: A Rigorous and Usable Programmer's Model for x86 Multiprocessors](http://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf),” *Communications of the ACM*, volume 53, number 7, pages 89–97, July 2010. [doi:10.1145/1785414.1785443](http://dx.doi.org/10.1145/1785414.1785443)
1. Martin Thompson: “[Memory Barriers/Fences](http://mechanical-sympathy.blogspot.co.uk/2011/07/memory-barriersfences.html),” *mechanical-sympathy.blogspot.co.uk*, July 24, 2011.
1. Ulrich Drepper: “[What Every Programmer Should Know About Memory](http://www.akkadia.org/drepper/cpumemory.pdf),” *akkadia.org*, November 21, 2007.
1. Daniel J. Abadi: “[Consistency Tradeoffs in Modern Distributed Database System Design](http://cs-www.cs.yale.edu/homes/dna/papers/abadi-pacelc.pdf),” *IEEE Computer Magazine*, volume 45, number 2, pages 37–42, February 2012. [doi:10.1109/MC.2012.33](http://dx.doi.org/10.1109/MC.2012.33)
1. Hagit Attiya and Jennifer L. Welch: “[Sequential Consistency Versus Linearizability](http://courses.csail.mit.edu/6.852/01/papers/p91-attiya.pdf),” *ACM Transactions on Computer Systems* (TOCS), volume 12, number 2, pages 91–122, May 1994. [doi:10.1145/176575.176576](http://dx.doi.org/10.1145/176575.176576)
1. Mustaque Ahamad, Gil Neiger, James E. Burns, et al.: “[Causal Memory: Definitions, Implementation, and Programming](http://www-i2.informatik.rwth-aachen.de/i2/fileadmin/user_upload/documents/Seminar_MCMM11/Causal_memory_1996.pdf),” *Distributed Computing*, volume 9, number 1, pages 37–49, March 1995. [doi:10.1007/BF01784241](http://dx.doi.org/10.1007/BF01784241)
1. Wyatt Lloyd, Michael J. Freedman, Michael Kaminsky, and David G. Andersen: “[Stronger Semantics for Low-Latency Geo-Replicated Storage](https://www.usenix.org/system/files/conference/nsdi13/nsdi13-final149.pdf),” at *10th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2013.
1. Marek Zawirski, Annette Bieniusa, Valter Balegas, et al.: “[SwiftCloud: Fault-Tolerant Geo-Replication Integrated All the Way to the Client Machine](http://arxiv.org/abs/1310.3107),” INRIA Research Report 8347, August 2013.
1. Peter Bailis, Ali Ghodsi, Joseph M Hellerstein, and Ion Stoica: “[Bolt-on Causal Consistency](http://db.cs.berkeley.edu/papers/sigmod13-bolton.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2013.
1. Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
1. Peter Bailis: “[Causality Is Expensive (and What to Do About It)](http://www.bailis.org/blog/causality-is-expensive-and-what-to-do-about-it/),” *bailis.org*, February 5, 2014.
1. Ricardo Gonçalves, Paulo Sérgio Almeida, Carlos Baquero, and Victor Fonte: “[Concise Server-Wide Causality Management for Eventually Consistent Data Stores](https://web.archive.org/web/20220810205439/http://haslab.uminho.pt/tome/files/global_logical_clocks.pdf),” at *15th IFIP International Conference on Distributed Applications and Interoperable Systems* (DAIS), June 2015. [doi:10.1007/978-3-319-19129-4_6](http://dx.doi.org/10.1007/978-3-319-19129-4_6)
1. Rob Conery: “[A Better ID Generator for PostgreSQL](https://web.archive.org/web/20220118044729/http://rob.conery.io/2014/05/29/a-better-id-generator-for-postgresql/),” *rob.conery.io*, May 29, 2014.
1. Leslie Lamport: “[Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/),” *Communications of the ACM*, volume 21, number 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](http://dx.doi.org/10.1145/359545.359563)
1. Xavier Défago, André Schiper, and Péter Urbán: “[Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey](https://dspace.jaist.ac.jp/dspace/bitstream/10119/4883/1/defago_et_al.pdf),” *ACM Computing Surveys*, volume 36, number 4, pages 372–421, December 2004. [doi:10.1145/1041680.1041682](http://dx.doi.org/10.1145/1041680.1041682)
1. Hagit Attiya and Jennifer Welch: *Distributed Computing: Fundamentals, Simulations and Advanced Topics*, 2nd edition. John Wiley & Sons, 2004. ISBN: 978-0-471-45324-6, [doi:10.1002/0471478210](http://dx.doi.org/10.1002/0471478210)
1. Mahesh Balakrishnan, Dahlia Malkhi, Vijayan Prabhakaran, et al.: “[CORFU: A Shared Log Design for Flash Clusters](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final30.pdf),” at *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
1. Fred B. Schneider: “[Implementing Fault-Tolerant Services Using the State Machine Approach: A Tutorial](http://www.cs.cornell.edu/fbs/publications/smsurvey.pdf),” *ACM Computing Surveys*, volume 22, number 4, pages 299–319, December 1990.
1. Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, et al.: “[Calvin: Fast Distributed Transactions for Partitioned Database Systems](http://cs.yale.edu/homes/thomson/publications/calvin-sigmod12.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 2012.
1. Mahesh Balakrishnan, Dahlia Malkhi, Ted Wobber, et al.: “[Tango: Distributed Data Structures over a Shared Log](https://www.microsoft.com/en-us/research/publication/tango-distributed-data-structures-over-a-shared-log/),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), November 2013. [doi:10.1145/2517349.2522732](http://dx.doi.org/10.1145/2517349.2522732)
1. Robbert van Renesse and Fred B. Schneider: “[Chain Replication for Supporting High Throughput and Availability](http://static.usenix.org/legacy/events/osdi04/tech/full_papers/renesse/renesse.pdf),” at *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
1. Leslie Lamport: “[How to Make a Multiprocessor Computer That Correctly Executes Multiprocess Programs](https://lamport.azurewebsites.net/pubs/multi.pdf),” *IEEE Transactions on Computers*, volume 28, number 9, pages 690–691, September 1979. [doi:10.1109/TC.1979.1675439](http://dx.doi.org/10.1109/TC.1979.1675439)
1. Enis Söztutar, Devaraj Das, and Carter Shanklin: “[Apache HBase High Availability at the Next Level](https://web.archive.org/web/20160405122821/http://hortonworks.com/blog/apache-hbase-high-availability-next-level/),” *hortonworks.com*, January 22, 2015.
1. Brian F Cooper, Raghu Ramakrishnan, Utkarsh Srivastava, et al.: “[PNUTS: Yahoo!’s Hosted Data Serving Platform](http://www.mpi-sws.org/~druschel/courses/ds/papers/cooper-pnuts.pdf),” at *34th International Conference on Very Large Data Bases* (VLDB), August 2008. [doi:10.14778/1454159.1454167](http://dx.doi.org/10.14778/1454159.1454167)
1. Tushar Deepak Chandra and Sam Toueg: “[Unreliable Failure Detectors for Reliable Distributed Systems](http://courses.csail.mit.edu/6.852/08/papers/CT96-JACM.pdf),” *Journal of the ACM*, volume 43, number 2, pages 225–267, March 1996. [doi:10.1145/226643.226647](http://dx.doi.org/10.1145/226643.226647)
1. Michael J. Fischer, Nancy Lynch, and Michael S. Paterson: “[Impossibility of Distributed Consensus with One Faulty Process](https://groups.csail.mit.edu/tds/papers/Lynch/jacm85.pdf),” *Journal of the ACM*, volume 32, number 2, pages 374–382, April 1985. [doi:10.1145/3149.214121](http://dx.doi.org/10.1145/3149.214121)
1. Michael Ben-Or: “Another Advantage of Free Choice: Completely Asynchronous Agreement Protocols,” at *2nd ACM Symposium on Principles of Distributed Computing* (PODC), August 1983. [doi:10.1145/800221.806707](http://dx.doi.org/10.1145/800221.806707)
1. Jim N. Gray and Leslie Lamport: “[Consensus on Transaction Commit](http://db.cs.berkeley.edu/cs286/papers/paxoscommit-tods2006.pdf),” *ACM Transactions on Database Systems* (TODS), volume 31, number 1, pages 133–160, March 2006. [doi:10.1145/1132863.1132867](http://dx.doi.org/10.1145/1132863.1132867)
1. Rachid Guerraoui: “[Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus](https://citeseerx.ist.psu.edu/pdf/5d06489503b6f791aa56d2d7942359c2592e44b0),” at *9th International Workshop on Distributed Algorithms* (WDAG), September 1995. [doi:10.1007/BFb0022140](http://dx.doi.org/10.1007/BFb0022140)
1. Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, et al.: “[All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](http://research.cs.wisc.edu/wind/Publications/alice-osdi14.pdf),” at *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
1. Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
1. Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
1. C. Mohan, Bruce G. Lindsay, and Ron Obermarck: “[Transaction Management in the R* Distributed Database Management System](https://cs.brown.edu/courses/csci2270/archives/2012/papers/dtxn/p378-mohan.pdf),” *ACM Transactions on Database Systems*, volume 11, number 4, pages 378–396, December 1986. [doi:10.1145/7239.7266](http://dx.doi.org/10.1145/7239.7266)
1. “[Distributed Transaction Processing: The XA Specification](http://pubs.opengroup.org/onlinepubs/009680699/toc.pdf),” X/Open Company Ltd., Technical Standard XO/CAE/91/300, December 1991. ISBN: 978-1-872-63024-3
1. Mike Spille: “[XA Exposed, Part II](http://www.jroller.com/pyrasun/entry/xa_exposed_part_ii_schwartz),” *jroller.com*, April 3, 2004.
1. Ivan Silva Neto and Francisco Reverbel: “[Lessons Learned from Implementing WS-Coordination and WS-AtomicTransaction](http://www.ime.usp.br/~reverbel/papers/icis2008.pdf),” at *7th IEEE/ACIS International Conference on Computer and Information Science* (ICIS), May 2008. [doi:10.1109/ICIS.2008.75](http://dx.doi.org/10.1109/ICIS.2008.75)
1. James E. Johnson, David E. Langworthy, Leslie Lamport, and Friedrich H. Vogt: “[Formal Specification of a Web Services Protocol](https://www.microsoft.com/en-us/research/publication/formal-specification-of-a-web-services-protocol/),” at *1st International Workshop on Web Services and Formal Methods* (WS-FM), February 2004. [doi:10.1016/j.entcs.2004.02.022](http://dx.doi.org/10.1016/j.entcs.2004.02.022)
1. Dale Skeen: “[Nonblocking Commit Protocols](http://www.cs.utexas.edu/~lorenzo/corsi/cs380d/papers/Ske81.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), April 1981. [doi:10.1145/582318.582339](http://dx.doi.org/10.1145/582318.582339)
1. Gregor Hohpe: “[Your Coffee Shop Doesn’t Use Two-Phase Commit](http://www.martinfowler.com/ieeeSoftware/coffeeShop.pdf),” *IEEE Software*, volume 22, number 2, pages 64–66, March 2005. [doi:10.1109/MS.2005.52](http://dx.doi.org/10.1109/MS.2005.52)
1. Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20210303104924/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
1. Jonathan Oliver: “[My Beef with MSDTC and Two-Phase Commits](http://blog.jonathanoliver.com/my-beef-with-msdtc-and-two-phase-commits/),” *blog.jonathanoliver.com*, April 4, 2011.
1. Oren Eini (Ayende Rahien): “[The Fallacy of Distributed Transactions](http://ayende.com/blog/167362/the-fallacy-of-distributed-transactions),” *ayende.com*, July 17, 2014.
1. Clemens Vasters: “[Transactions in Windows Azure (with Service Bus) – An Email Discussion](https://blogs.msdn.microsoft.com/clemensv/2012/07/30/transactions-in-windows-azure-with-service-bus-an-email-discussion/),” *vasters.com*, July 30, 2012.
1. “[Understanding Transactionality in Azure](https://docs.particular.net/nservicebus/azure/understanding-transactionality-in-azure),” NServiceBus Documentation, Particular Software, 2015.
1. Randy Wigginton, Ryan Lowe, Marcos Albe, and Fernando Ipar: “[Distributed Transactions in MySQL](https://web.archive.org/web/20161010054152/https://www.percona.com/live/mysql-conference-2013/sites/default/files/slides/XA_final.pdf),” at *MySQL Conference and Expo*, April 2013.
1. Mike Spille: “[XA Exposed, Part I](https://web.archive.org/web/20130523064202/http://www.jroller.com/pyrasun/entry/xa_exposed),” *jroller.com*, April 3, 2004.
1. Ajmer Dhariwal: “[Orphaned MSDTC Transactions (-2 spids)](https://www.eraofdata.com/posts/2008/orphaned-msdtc-transactions-2-spids/),” *eraofdata.com*, December 12, 2008.
1. Paul Randal: “[Real World Story of DBCC PAGE Saving the Day](http://www.sqlskills.com/blogs/paul/real-world-story-of-dbcc-page-saving-the-day/),” *sqlskills.com*, June 19, 2013.
1. “[in-doubt xact resolution Server Configuration Option](https://msdn.microsoft.com/en-us/library/ms179586.aspx),” SQL Server 2016 documentation, Microsoft, Inc., 2016.
1. Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer: “[Consensus in the Presence of Partial Synchrony](https://web.archive.org/web/20210318133551/https://www.net.t-labs.tu-berlin.de/~petr/ADC-07/papers/DLS88.pdf),” *Journal of the ACM*, volume 35, number 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](http://dx.doi.org/10.1145/42282.42283)
1. Miguel Castro and Barbara H. Liskov: “[Practical Byzantine Fault Tolerance and Proactive Recovery](https://web.archive.org/web/20181123142540/http://zoo.cs.yale.edu/classes/cs426/2012/bib/castro02practical.pdf),” *ACM Transactions on Computer Systems*, volume 20, number 4, pages 396–461, November 2002. [doi:10.1145/571637.571640](http://dx.doi.org/10.1145/571637.571640)
1. Brian M. Oki and Barbara H. Liskov: “[Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems](http://www.cs.princeton.edu/courses/archive/fall11/cos518/papers/viewstamped.pdf),” at *7th ACM Symposium on Principles of Distributed Computing* (PODC), August 1988. [doi:10.1145/62546.62549](http://dx.doi.org/10.1145/62546.62549)
1. Barbara H. Liskov and James Cowling: “[Viewstamped Replication Revisited](http://pmg.csail.mit.edu/papers/vr-revisited.pdf),” Massachusetts Institute of Technology, Tech Report MIT-CSAIL-TR-2012-021, July 2012.
1. Leslie Lamport: “[The Part-Time Parliament](https://www.microsoft.com/en-us/research/publication/part-time-parliament/),” *ACM Transactions on Computer Systems*, volume 16, number 2, pages 133–169, May 1998. [doi:10.1145/279227.279229](http://dx.doi.org/10.1145/279227.279229)
1. Leslie Lamport: “[Paxos Made Simple](https://www.microsoft.com/en-us/research/publication/paxos-made-simple/),” *ACM SIGACT News*, volume 32, number 4, pages 51–58, December 2001.
1. Tushar Deepak Chandra, Robert Griesemer, and Joshua Redstone: “[Paxos Made Live – An Engineering Perspective](http://www.read.seas.harvard.edu/~kohler/class/08w-dsi/chandra07paxos.pdf),” at *26th ACM Symposium on Principles of Distributed Computing* (PODC), June 2007.
1. Robbert van Renesse: “[Paxos Made Moderately Complex](http://www.cs.cornell.edu/home/rvr/Paxos/paxos.pdf),” *cs.cornell.edu*, March 2011.
1. Diego Ongaro: “[Consensus: Bridging Theory and Practice](https://github.com/ongardie/dissertation),” PhD Thesis, Stanford University, August 2014.
1. Heidi Howard, Malte Schwarzkopf, Anil Madhavapeddy, and Jon Crowcroft: “[Raft Refloated: Do We Have Consensus?](https://web.archive.org/web/20230319151303/https://www.cl.cam.ac.uk/~ms705/pub/papers/2015-osr-raft.pdf),” *ACM SIGOPS Operating Systems Review*, volume 49, number 1, pages 12–21, January 2015. [doi:10.1145/2723872.2723876](http://dx.doi.org/10.1145/2723872.2723876)
1. André Medeiros: “[ZooKeeper’s Atomic Broadcast Protocol: Theory and Practice](http://www.tcs.hut.fi/Studies/T-79.5001/reports/2012-deSouzaMedeiros.pdf),” Aalto University School of Science, March 20, 2012.
1. Robbert van Renesse, Nicolas Schiper, and Fred B. Schneider: “[Vive La Différence: Paxos vs. Viewstamped Replication vs. Zab](http://arxiv.org/abs/1309.5671),” *IEEE Transactions on Dependable and Secure Computing*, volume 12, number 4, pages 472–484, September 2014. [doi:10.1109/TDSC.2014.2355848](http://dx.doi.org/10.1109/TDSC.2014.2355848)
1. Will Portnoy: “[Lessons Learned from Implementing Paxos](http://blog.willportnoy.com/2012/06/lessons-learned-from-paxos.html),” *blog.willportnoy.com*, June 14, 2012.
1. Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman: “[Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/opus/volltexte/2017/7094/pdf/LIPIcs-OPODIS-2016-25.pdf),” at *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](http://dx.doi.org/10.4230/LIPIcs.OPODIS.2016.25)
1. Heidi Howard and Jon Crowcroft: “[Coracle: Evaluating Consensus at the Internet Edge](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p85.pdf),” at *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2829988.2790010](http://dx.doi.org/10.1145/2829988.2790010)
1. Kyle Kingsbury: “[Call Me Maybe: Elasticsearch 1.5.0](https://aphyr.com/posts/323-call-me-maybe-elasticsearch-1-5-0),” *aphyr.com*, April 27, 2015.
1. Ivan Kelly: “[BookKeeper Tutorial](https://github.com/ivankelly/bookkeeper-tutorial),” *github.com*, October 2014.
1. Camille Fournier: “[Consensus Systems for the Skeptical Architect](https://vimeo.com/102667163),” at *Philly ETE*, Philadelphia, PA, USA, April 2014.
1. Kenneth P. Birman: “[A History of the Virtual Synchrony Replication Model](https://ptolemy.berkeley.edu/projects/truststc/pubs/713/History%20of%20the%20Virtual%20Synchrony%20Replication%20Model%202010.pdf),” in *Replication: Theory and Practice*, Springer LNCS volume 5959, chapter 6, pages 91–120, 2010. ISBN: 978-3-642-11293-5, [doi:10.1007/978-3-642-11294-2_6](http://dx.doi.org/10.1007/978-3-642-11294-2_6)

================================================
FILE: content/v1_tw/colophon.md
================================================
---
title: 後記
weight: 600
breadcrumbs: false
---

## 關於作者

**Martin Kleppmann** 是英國劍橋大學分散式系統的研究員。此前他曾在網際網路公司擔任過軟體工程師和企業家，其中包括 LinkedIn 和 Rapportive，負責大規模資料基礎架構。在這個過程中，他以艱難的方式學習了一些東西，他希望這本書能夠讓你避免重蹈覆轍。

Martin 經常在各類會議上發表演講，同時也是一位博主和開源貢獻者。他認為，深刻的技術理念應該讓每個人都能理解，而更深層次的理解能幫助我們開發出更好的軟體。

![](http://martin.kleppmann.com/2017/03/ddia-poster.jpg)


## 關於譯者

[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
PostgreSQL 發行版 [**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，資料庫/雲計算 KOL。


## 後記

《設計資料密集型應用》封面上的動物是 **印度野豬（Sus scrofa cristatus）**，它是在印度、緬甸、尼泊爾、斯里蘭卡和泰國發現的一種野豬的亞種。與歐洲野豬不同，它們有更高的背部鬃毛，沒有體表絨毛，以及更大更直的頭骨。

印度野豬有一身灰色或黑色的毛髮，脊背上有短而硬的鬃毛。雄性有突出的犬齒（稱為獠牙，tushes），用來與對手戰鬥或抵禦掠食者。雄性比雌性大，這些物種平均肩高 33-35 英寸，體重 200-300 磅。它們的天敵包括熊、老虎和各種大型貓科動物。

這些動物夜行且雜食 —— 它們吃各種各樣的東西，包括根、昆蟲、腐肉、堅果、漿果和小動物。野豬常因翻拱垃圾和農作物而為人所知，它們造成大量的破壞，並被農民所敵視。它們每天需要攝入 4,000 ~ 4,500 卡路里的能量。野豬有發達的嗅覺，這有助於尋找地下植物和穴居動物。然而，它們的視力很差。

野豬在人類文化中一直具有重要意義。在印度教傳說中，野豬是毗溼奴神的化身。在古希臘的喪葬紀念碑中，它是一個勇敢失敗者的象徵（與勝利的獅子相反）。由於它的攻擊性，它被描繪在斯堪的納維亞、日耳曼和盎格魯撒克遜戰士的盔甲和武器上。在中國十二生肖中，它象徵著決心和急躁。

O'Reilly 封面上的許多動物都受到威脅，這些動物對世界都很重要。要了解有關如何提供幫助的更多資訊，請訪問 animals.oreilly.com。

封面圖片來自 Shaw's Zoology。封面字型是 URW Typewriter 和 Guardian Sans。文字字型是 Adobe Minion Pro；圖中的字型是 Adobe Myriad Pro；標題字型是 Adobe Myriad Condensed；程式碼字型是 Dalton Maag 的 Ubuntu Mono。

================================================
FILE: content/v1_tw/contrib.md
================================================
---
title: 貢獻者
weight: 800
breadcrumbs: false
---

## 譯者

[**馮若航**](https://vonng.com)，網名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 專家，資料庫老司機，雲計算泥石流。
[**Pigsty**](https://pgsty.com) 作者與創始人。
架構師，DBA，全棧工程師 @ TanTan，Alibaba，Apple。
獨立開源貢獻者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[國區活躍 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版譯者，公眾號：《老馮雲數》，資料庫 KOL。

## 校訂與維護

Yin Gang [@yingang](https://github.com/yingang) 對本書進行了全文校訂，並持續維護。

## 繁體中文版本

[繁體中文](/tw) **版本維護** by  [@afunTW](https://github.com/afunTW)

## 貢獻列表

[GitHub 貢獻者列表](https://github.com/Vonng/ddia/graphs/contributors)

0. 全文校訂 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章語法標點校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 與[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/v1_tw/part-i)前言，[ch2](/v1_tw/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [詞彙表](/v1_tw/glossary)、[後記](/v1_tw/colophon)關於野豬的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本與轉換指令碼 by [@afunTW](https://github.com/afunTW)
7. 多處翻譯修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)


感謝所有提出意見，作出貢獻的朋友們，您可以在這裡找到所有貢獻的 [Issue 列表](https://github.com/Vonng/ddia/issues) 與 [PR 列表](https://github.com/Vonng/ddia/pulls)：

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一處拼寫錯誤                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一處拼寫錯誤                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一處標點錯誤                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一處格式錯誤                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一處參考連結                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正兩處引用錯誤                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支援輸出為 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一處格式錯誤                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一處影像連結                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 最佳化一處翻譯                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 最佳化一處翻譯                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 最佳化兩處翻譯                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 最佳化多處翻譯                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 最佳化一處翻譯                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一處繁體中文錯誤                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一處繁體中文錯誤                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一處翻譯錯誤                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正幾處拼寫錯誤                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 最佳化一處翻譯                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一處翻譯錯誤                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一處翻譯遺漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 最佳化一處翻譯                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 最佳化一處翻譯                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 最佳化一處翻譯                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 最佳化一處翻譯                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正兩處錯誤                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一處列表錯誤                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一處錯別字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一處公式問題                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多處內部連結錯誤                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正內部連結錯誤                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 顯示的問題                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 發現了繁體中文版本中的錯誤翻譯                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 連結                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正錯別字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 統一了 write skew 的翻譯                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 統一了 rebalancing 的翻譯                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻譯                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正譯文中的重複單詞                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不準確的翻譯                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一處翻譯錯誤                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一處拼寫錯誤                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可序列化”相關內容的多處翻譯不當                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重複讀”相關內容的多處翻譯不當                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“讀已提交”相關內容的多處翻譯不當                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁體中文的轉譯錯誤                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁體中文的轉譯錯誤                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通順的翻譯                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通順的翻譯                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正確的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通順的翻譯                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻譯                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正錯誤的圖片連結                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁體中文的轉譯錯誤：複雜                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正導航欄中的章節名稱                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正線性一致的繁體中文翻譯                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正錯誤的翻譯                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 最佳化譯文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通順的翻譯                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不準確的翻譯                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻譯                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正錯別字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小標題跳轉的問題                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的網址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正錯別字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建議docsify的主題風格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻譯錯誤                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁體中文的轉譯錯誤                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支援 Github Pages 裡的公式顯示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 語義網相關翻譯更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不變式相關翻譯更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正確的中文用詞和標點符號                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻譯                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重複的譯文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通順的翻譯                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 發現錯誤的文獻索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正錯誤的標點符號                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正錯誤字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建議將 network model 翻譯為網狀模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正錯誤字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通順的翻譯                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正縮圖的錯別字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修訂sibling相關的翻譯                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一處不準確的翻譯                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 識別了當前簡繁轉譯過程中處理不當的地方，暫透過轉換指令碼規避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻譯`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新殘留的機翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建議去除段首的製表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 發現一處錯誤格式的章節引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章節Summary中多處不通順的翻譯                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多處不通順的或錯誤的翻譯                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 最佳化多處不通順的或錯誤的翻譯                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 最佳化多處不通順的或錯誤的翻譯                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化多處錯誤的或不通順的翻譯                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 最佳化一處容易產生歧義的翻譯                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正兩處錯誤的翻譯                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正兩處強調文字和四處程式碼變數名稱                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一處錯誤的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一處錯誤的翻譯（功能 -> 函式）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 最佳化 how best 的翻譯（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 統一每章的標題格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重複詞語                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改語句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 讀已寫入資料                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死還是賴活著                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼殺 → 破壞                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改變" rathr than "蓋面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:鍾-->種，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否則 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->呼叫，顯著->顯著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改連結錯誤                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4幾處翻譯                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 幾個疏漏和格式錯誤                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 刪除一個多餘的右括號                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部連結錯誤                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假簡單"->"更加簡單"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修復 ch1 中的無序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 糾正多處的翻譯小錯誤                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修復多個連結錯誤 2.名詞最佳化修訂 3.錯誤修訂                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修復一些明顯錯誤                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修復連結錯誤                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改詞語順序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正錯別字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 糾正翻譯錯誤                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目錄和本章標題不符的情況                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修復語句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 詳細修改了後記中和印度野豬相關的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻譯                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01語法微調                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |


================================================
FILE: content/v1_tw/glossary.md
================================================
---
title: 術語表
weight: 500
breadcrumbs: false
---

> 請注意，本術語表中的定義簡短而簡單，旨在傳達核心思想，而非死扣完整細節。有關更多詳細資訊，請參閱正文中的參考資料。


## **非同步（asynchronous）**

  不等待某些事情完成（例如，將資料傳送到網路中的另一個節點），並且不會假設要花多長時間。請參閱“[同步複製與非同步複製](/v1_tw/ch5#同步複製與非同步複製)”、“[同步網路與非同步網路](/v1_tw/ch8#同步網路與非同步網路)”以及“[系統模型與現實](/v1_tw/ch8#系統模型與現實)”。

## **原子（atomic）**

   在併發操作的上下文中：描述一個在單個時間點看起來生效的操作，所以另一個併發程序永遠不會遇到處於“半完成”狀態的操作。另見隔離。

   在事務的上下文中：將一些寫入操作分為一組，這組寫入要麼全部提交成功，要麼遇到錯誤時全部回滾。請參閱“[原子性](/v1_tw/ch7#原子性)”和“[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)”。

## **背壓（backpressure）**

  接收方接收資料速度較慢時，強制降低傳送方的資料傳送速度。也稱為流量控制。請參閱“[訊息傳遞系統](/v1_tw/ch11#訊息傳遞系統)”。

## **批處理（batch process）**

  一種計算，它將一些固定的（通常是大的）資料集作為輸入，並將其他一些資料作為輸出，而不修改輸入。見[第十章](/v1_tw/ch10)。

## **邊界（bounded）**

  有一些已知的上限或大小。例如，網路延遲情況（請參閱“[超時與無窮的延遲](/v1_tw/ch8#超時與無窮的延遲)”）和資料集（請參閱[第十一章](/v1_tw/ch11)的介紹）。

## **拜占庭故障（Byzantine fault）**

  表現異常的節點，這種異常可能以任意方式出現，例如向其他節點發送矛盾或惡意訊息。請參閱“[拜占庭故障](/v1_tw/ch8#拜占庭故障)”。

## **快取（cache）**

  一種元件，透過儲存最近使用過的資料，加快未來對相同資料的讀取速度。快取中通常存放部分資料：因此，如果快取中缺少某些資料，則必須從某些底層較慢的資料儲存系統中，獲取完整的資料副本。

## **CAP定理（CAP theorem）**

  一個被廣泛誤解的理論結果，在實踐中是沒有用的。請參閱“[CAP定理](/v1_tw/ch9#CAP定理)”。

## **因果關係（causality）**

  事件之間的依賴關係，當一件事發生在另一件事情之前。例如，後面的事件是對早期事件的回應，或者依賴於更早的事件，或者應該根據先前的事件來理解。請參閱“[“此前發生”的關係和併發](/v1_tw/ch5#此前發生的關係和併發)”和“[順序與因果關係](/v1_tw/ch9#順序與因果關係)”。

## **共識（consensus）**

  分散式計算的一個基本問題，就是讓幾個節點同意某些事情（例如，哪個節點應該是資料庫叢集的領導者）。問題比乍看起來要困難得多。請參閱“[容錯共識](/v1_tw/ch9#容錯共識)”。

## **資料倉庫（data warehouse）**

  一個資料庫，其中來自幾個不同的OLTP系統的資料已經被合併和準備用於分析目的。請參閱“[資料倉庫](/v1_tw/ch3#資料倉庫)”。

## **宣告式（declarative）**

  描述某些東西應有的屬性，而不指定實現它的確切步驟。在查詢的上下文中，查詢最佳化器採用宣告性查詢並決定如何最好地執行它。請參閱“[資料查詢語言](/v1_tw/ch2#資料查詢語言)”。

## **反正規化（denormalize）**

  為了加速讀取，在標準資料集中引入一些冗餘或重複資料，通常採用快取或索引的形式。反正規化的值是一種預先計算的查詢結果，像物化檢視。請參閱“[單物件和多物件操作](/v1_tw/ch7#單物件和多物件操作)”和“[從同一事件日誌中派生多個檢視](/v1_tw/ch11#從同一事件日誌中派生多個檢視)”。

## **衍生資料（derived data）**

  一種資料集，根據其他資料透過可重複執行的流程建立。必要時，你可以執行該流程再次建立衍生資料。衍生資料通常用於提高特定資料的讀取速度。常見的衍生資料有索引、快取和物化檢視。請參閱[第三部分](/v1_tw/part-iii)的介紹。

## **確定性（deterministic）**

  描述一個函式，如果給它相同的輸入，則總是產生相同的輸出。這意味著它不能依賴於隨機數字、時間、網路通訊或其他不可預測的事情。

## **分散式（distributed）**

  在由網路連線的多個節點上執行。其特徵是部分失效：系統的一部分可能發生故障，而其他部分仍在正常工作，並且軟體通常無法得知究竟是哪一部分出了故障。請參閱“[故障與部分失效](/v1_tw/ch8#故障與部分失效)”。

## **持久（durable）**

  以某種方式儲存資料，即使發生各種故障，也不會丟失資料。請參閱“[永續性](/v1_tw/ch7#永續性)”。

## **ETL（Extract-Transform-Load）**

  提取-轉換-載入（Extract-Transform-Load）。從源資料庫中提取資料，將其轉換為更適合分析查詢的形式，並將其載入到資料倉庫或批處理系統中的過程。請參閱“[資料倉庫](/v1_tw/ch3#資料倉庫)”。

## **故障切換（failover）**

  在具有單一領導者的系統中，故障切換是將領導角色從一個節點轉移到另一個節點的過程。請參閱“[處理節點宕機](/v1_tw/ch5#處理節點宕機)”。

## **容錯（fault-tolerant）**

  如果出現問題（例如，機器崩潰或網路連線失敗），可以自動恢復。請參閱“[可靠性](/v1_tw/ch1#可靠性)”。

## **流量控制（flow control）**

  見背壓（backpressure）。

## **追隨者（follower）**

  一種資料副本，僅處理領導者或主庫發出的資料變更，不直接接受來自客戶端的任何寫入。也稱為備庫、從庫、只讀副本或熱備份。請參閱“[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”。

## **全文檢索（full-text search）**

  透過任意關鍵字來搜尋文字，通常具有附加特徵，例如匹配類似的拼寫詞或同義詞。全文索引是一種支援這種查詢的次級索引。請參閱“[全文搜尋和模糊索引](/v1_tw/ch3#全文搜尋和模糊索引)”。

## **圖（graph）**

  一種資料結構，由頂點（可以指向的東西，也稱為節點或實體）和邊（從一個頂點到另一個頂點的連線，也稱為關係或弧）組成。請參閱“[圖資料模型](/v1_tw/ch2#圖資料模型)”。

## **雜湊（hash）**

  將輸入轉換為看起來像隨機數值的函式。相同的輸入會轉換為相同的數值，不同的輸入一般會轉換為不同的數值，也可能轉換為相同數值（也被稱為衝突）。請參閱“[根據鍵的雜湊分割槽](/v1_tw/ch6#根據鍵的雜湊分割槽)”。

## **冪等（idempotent）**

  用於描述一種操作可以安全地重試執行，即執行多次的效果和執行一次的效果相同。請參閱“[冪等性](/v1_tw/ch11#冪等性)”。

## **索引（index）**

  一種資料結構。透過索引，你可以根據特定欄位的值，在所有資料記錄中進行高效檢索。請參閱“[驅動資料庫的資料結構](/v1_tw/ch3#驅動資料庫的資料結構)”。

## **隔離性（isolation）**

  在事務上下文中，用於描述併發執行事務的互相干擾程度。序列執行具有最強的隔離性，不過其它程度的隔離也通常被使用。請參閱“[隔離性](/v1_tw/ch7#隔離性)”。

## **連線（join）**

  彙集有共同點的記錄。在一個記錄與另一個記錄有關（外部索引鍵，文件參考，圖中的邊）的情況下最常用，查詢需要獲取參考所指向的記錄。請參閱“[多對一和多對多的關係](/v1_tw/ch2#多對一和多對多的關係)”和“[Reduce側連線與分組](/v1_tw/ch10#Reduce側連線與分組)”。

## **領導者（leader）**

  當資料或服務被複制到多個節點時，領導者是被指定為可以接受資料變更的副本。領導者可以透過某些協議選舉產生，也可以由管理員手動選擇。領導者也被稱為主庫。請參閱“[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”。

## **線性化（linearizable）**

  表現為系統中只有一份透過原子操作更新的資料副本。請參閱“[線性一致性](/v1_tw/ch9#線性一致性)”。

## **區域性（locality）**

  一種效能最佳化方式，如果經常在相同的時間請求一些離散資料，把這些資料放到一個位置。請參閱“[查詢的資料區域性](/v1_tw/ch2#查詢的資料區域性)”。

## **鎖（lock）**

  一種保證只有一個執行緒、節點或事務可以訪問的機制，如果其它執行緒、節點或事務想訪問相同元素，則必須等待鎖被釋放。請參閱“[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)”和“[領導者和鎖](/v1_tw/ch8#領導者和鎖)”。

## **日誌（log）**

  日誌是一個只能以追加方式寫入的檔案，用於存放資料。預寫式日誌用於在儲存引擎崩潰時恢復資料（請參閱“[讓B樹更可靠](/v1_tw/ch3#讓B樹更可靠)”）；日誌結構儲存引擎使用日誌作為它的主要儲存格式（請參閱“[SSTables和LSM樹](/v1_tw/ch3#SSTables和LSM樹)”）；複製型日誌用於把寫入從領導者複製到追隨者（請參閱“[領導者與追隨者](/v1_tw/ch5#領導者與追隨者)”）；事件性日誌可以表現為資料流（請參閱“[分割槽日誌](/v1_tw/ch11#分割槽日誌)”）。

## **物化（materialize）**

  急切地計算並寫出結果，而不是在請求時計算。請參閱“[聚合：資料立方體和物化檢視](/v1_tw/ch3#聚合資料立方體和物化檢視)”和“[物化中間狀態](/v1_tw/ch10#物化中間狀態)”。

## **節點（node）**

  計算機上執行的一些軟體的例項，透過網路與其他節點通訊以完成某項任務。

## **正規化（normalized）**

  以沒有冗餘或重複的方式進行結構化。在正規化資料庫中，當某些資料發生變化時，你只需要在一個地方進行更改，而不是在許多不同的地方複製很多次。請參閱“[多對一和多對多的關係](/v1_tw/ch2#多對一和多對多的關係)”。

## **OLAP（Online Analytic Processing）**

  線上分析處理。透過對大量記錄進行聚合（例如，計數，總和，平均）來表徵的訪問模式。請參閱“[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析)”。

## **OLTP（Online Transaction Processing）**

  線上事務處理。訪問模式的特點是快速查詢，讀取或寫入少量記錄，這些記錄通常透過鍵索引。請參閱“[事務處理還是分析？](/v1_tw/ch3#事務處理還是分析)”。

## **分割槽（partitioning）**

  將單機上的大型資料集或計算結果拆分為較小部分，並將其分佈到多臺機器上。也稱為分片。見[第六章](/v1_tw/ch6)。

## **百分位點（percentile）**

  透過計算有多少值高於或低於某個閾值來衡量值分佈的方法。例如，某個時間段的第95個百分位響應時間是時間t，則該時間段中，95%的請求完成時間小於t，5%的請求完成時間要比t長。請參閱“[描述效能](/v1_tw/ch1#描述效能)”。

## **主鍵（primary key）**

  唯一標識記錄的值（通常是數字或字串）。在許多應用程式中，主鍵由系統在建立記錄時生成（例如，按順序或隨機）；它們通常不由使用者設定。另請參閱次級索引。

## **法定人數（quorum）**

  在操作完成之前，需要對操作進行投票的最少節點數量。請參閱“[讀寫的法定人數](/v1_tw/ch5#讀寫的法定人數)”。

## **再平衡（rebalance）**

  將資料或服務從一個節點移動到另一個節點以實現負載均衡。請參閱“[分割槽再平衡](/v1_tw/ch6#分割槽再平衡)”。

## **複製（replication）**

  在幾個節點（副本）上保留相同資料的副本，以便在某些節點無法訪問時，資料仍可訪問。請參閱[第五章](/v1_tw/ch5)。

## **模式（schema）**

  一些資料結構的描述，包括其欄位和資料型別。可以在資料生命週期的不同點檢查某些資料是否符合模式（請參閱“[文件模型中的模式靈活性](/v1_tw/ch2#文件模型中的模式靈活性)”），模式可以隨時間變化（請參閱[第四章](/v1_tw/ch4)）。

## **次級索引（secondary index）**

  與主要資料儲存器一起維護的附加資料結構，使你可以高效地搜尋與某種條件相匹配的記錄。請參閱“[其他索引結構](/v1_tw/ch3#其他索引結構)”和“[分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)”。

## **可序列化（serializable）**

  保證多個併發事務同時執行時，它們的行為與按順序逐個執行事務相同。請參閱第七章的“[可序列化](/v1_tw/ch7#可序列化)”。

## **無共享（shared-nothing）**

  與共享記憶體或共享磁碟架構相比，獨立節點（每個節點都有自己的CPU，記憶體和磁碟）透過傳統網路連線。見[第二部分](/v1_tw/part-ii)的介紹。

## **偏斜（skew）**

  各分割槽負載不平衡，例如某些分割槽有大量請求或資料，而其他分割槽則少得多。也被稱為熱點。請參閱“[負載偏斜與熱點消除](/v1_tw/ch6#負載偏斜與熱點消除)”和“[處理偏斜](/v1_tw/ch10#處理偏斜)”。

  時間線異常導致事件以不期望的順序出現。請參閱“[快照隔離和可重複讀](/v1_tw/ch7#快照隔離和可重複讀)”中的關於讀取偏差的討論，“[寫入偏差與幻讀](/v1_tw/ch7#寫入偏差與幻讀)”中的寫入偏差以及“[有序事件的時間戳](/v1_tw/ch8#有序事件的時間戳)”中的時鐘偏斜。

## **腦裂（split brain）**

  兩個節點同時認為自己是領導者的情況，這種情況可能違反系統擔保。請參閱“[處理節點宕機](/v1_tw/ch5#處理節點宕機)”和“[真相由多數所定義](/v1_tw/ch8#真相由多數所定義)”。

## **儲存過程（stored procedure）**

  一種對事務邏輯進行編碼的方式，它可以完全在資料庫伺服器上執行，事務執行期間無需與客戶端通訊。請參閱“[真的序列執行](/v1_tw/ch7#真的序列執行)”。

## **流處理（stream process）**

  持續執行的計算。可以持續接收事件流作為輸入，並得出一些輸出。見[第十一章](/v1_tw/ch11)。

## **同步（synchronous）**

  非同步的反義詞。

## **記錄系統（system of record）**

  一個儲存主要權威版本資料的系統，也被稱為真相的來源。首先在這裡寫入資料變更，其他資料集可以從記錄系統衍生。請參閱[第三部分](/v1_tw/part-iii)的介紹。

## **超時（timeout）**

  檢測故障的最簡單方法之一，即在一段時間內觀察是否缺乏響應。但是，不可能知道超時是由於遠端節點的問題還是網路中的問題造成的。請參閱“[超時與無窮的延遲](/v1_tw/ch8#超時與無窮的延遲)”。

## **全序（total order）**

  一種比較事物的方法（例如時間戳），可以讓你總是說出兩件事中哪一件更大，哪件更小。如果一種順序中存在某些無法比較的東西（不能說哪個更大或更小），則稱之為偏序。請參閱“[因果順序不是全序的](/v1_tw/ch9#因果順序不是全序的)”。

## **事務（transaction）**

  為了簡化錯誤處理和併發問題，將幾個讀寫操作分組到一個邏輯單元中。見[第七章](/v1_tw/ch7)。

## **兩階段提交（2PC, two-phase commit）**

  一種確保多個資料庫節點全部提交或全部中止事務的演算法。請參閱“[原子提交與兩階段提交](/v1_tw/ch9#原子提交與兩階段提交)”。

## **兩階段鎖定（2PL, two-phase locking）**

  一種用於實現可序列化隔離的演算法，其工作方式是：事務對其讀取或寫入的所有資料獲取鎖，並一直持有這些鎖直到事務結束。請參閱“[兩階段鎖定](/v1_tw/ch7#兩階段鎖定)”。

## **無邊界（unbounded）**

  沒有任何已知的上限或大小。反義詞是邊界（bounded）。

================================================
FILE: content/v1_tw/part-i.md
================================================
---
title: 第一部分：資料系統基礎
weight: 100
breadcrumbs: false
---

本書前四章介紹了資料系統底層的基礎概念，無論是在單臺機器上執行的單點資料系統，還是分佈在多臺機器上的分散式資料系統都適用。

1. [第一章](/v1_tw/ch1) 將介紹本書使用的術語和方法。**可靠性，可伸縮性和可維護性** ，這些詞彙到底意味著什麼？如何實現這些目標？
2. [第二章](/v1_tw/ch2) 將對幾種不同的 **資料模型和查詢語言** 進行比較。從程式設計師的角度看，這是資料庫之間最明顯的區別。不同的資料模型適用於不同的應用場景。
3. [第三章](/v1_tw/ch3) 將深入 **儲存引擎** 內部，研究資料庫如何在磁碟上擺放資料。不同的儲存引擎針對不同的負載進行最佳化，選擇合適的儲存引擎對系統性能有巨大影響。
4. [第四章](/v1_tw/ch4) 將對幾種不同的 **資料編碼** 進行比較。特別研究了這些格式在應用需求經常變化、模式需要隨時間演變的環境中表現如何。

第二部分將專門討論在 **分散式資料系統** 中特有的問題。


## 索引

* [第一章：可靠性、可伸縮性和可維護性](/v1_tw/ch1)
    * [關於資料系統的思考](/v1_tw/ch1#關於資料系統的思考)
    * [可靠性](/v1_tw/ch1#可靠性)
    * [可伸縮性](/v1_tw/ch1#可伸縮性)
    * [可維護性](/v1_tw/ch1#可維護性)
    * [本章小結](/v1_tw/ch1#本章小結)
* [第二章：資料模型與查詢語言](/v1_tw/ch2)
    * [關係模型與文件模型](/v1_tw/ch2#關係模型與文件模型)
    * [資料查詢語言](/v1_tw/ch2#資料查詢語言)
    * [圖資料模型](/v1_tw/ch2#圖資料模型)
    * [本章小結](/v1_tw/ch2#本章小結)
* [第三章：儲存與檢索](/v1_tw/ch3)
    * [驅動資料庫的資料結構](/v1_tw/ch3#驅動資料庫的資料結構)
    * [事務處理還是分析？](/v1_tw/ch3#事務處理還是分析)
    * [列式儲存](/v1_tw/ch3#列式儲存)
    * [本章小結](/v1_tw/ch3#本章小結)
* [第四章：編碼與演化](/v1_tw/ch4)
    * [編碼資料的格式](/v1_tw/ch4#編碼資料的格式)
    * [資料流的型別](/v1_tw/ch4#資料流的型別)
    * [本章小結](/v1_tw/ch4#本章小結)

================================================
FILE: content/v1_tw/part-ii.md
================================================
---
title: 第二部分：分散式資料
weight: 200
breadcrumbs: false
---

> 一個成功的技術，現實的優先順序必須高於公關，你可以糊弄別人，但糊弄不了自然規律。
>
> —— 羅傑斯委員會報告（1986）
>

-------

在本書的 [第一部分](/v1_tw/part-i) 中，我們討論了資料系統的各個方面，但僅限於資料儲存在單臺機器上的情況。現在我們到了 [第二部分](/v1_tw/part-ii)，進入更高的層次，並提出一個問題：如果 **多臺機器** 參與資料的儲存和檢索，會發生什麼？

你可能會出於各種各樣的原因，希望將資料庫分佈到多臺機器上：

可伸縮性
: 如果你的資料量、讀取負載、寫入負載超出單臺機器的處理能力，可以將負載分散到多臺計算機上。

容錯 / 高可用性
: 如果你的應用需要在單臺機器（或多臺機器，網路或整個資料中心）出現故障的情況下仍然能繼續工作，則可使用多臺機器，以提供冗餘。一臺故障時，另一臺可以接管。

延遲
: 如果在世界各地都有使用者，你也許會考慮在全球範圍部署多個伺服器，從而每個使用者可以從地理上最近的資料中心獲取服務，避免了等待網路資料包穿越半個世界。

## 伸縮至更高的載荷

如果你需要的只是伸縮至更高的 **載荷（load）**，最簡單的方法就是購買更強大的機器（有時稱為 **垂直伸縮**，即 vertical scaling，或 **向上伸縮**，即 scale up）。許多處理器，記憶體和磁碟可以在同一個作業系統下相互連線，快速的相互連線允許任意處理器訪問記憶體或磁碟的任意部分。在這種 **共享記憶體架構（shared-memory architecture）** 中，所有的元件都可以看作一臺單獨的機器 [^i]。

[^i]: 在大型機中，儘管任意處理器都可以訪問記憶體的任意部分，但總有一些記憶體區域與一些處理器更接近（稱為 **非均勻記憶體訪問（nonuniform memory access, NUMA）**【1】）。為了有效利用這種架構特性，需要對處理進行細分，以便每個處理器主要訪問臨近的記憶體，這意味著即使表面上看起來只有一臺機器在執行，**分割槽（partitioning）** 仍然是必要的。

共享記憶體方法的問題在於，成本增長速度快於線性增長：一臺有著雙倍處理器數量，雙倍記憶體大小，雙倍磁碟容量的機器，通常成本會遠遠超過原來的兩倍。而且可能因為存在瓶頸，並不足以處理雙倍的載荷。

共享記憶體架構可以提供有限的容錯能力，高階機器可以使用熱插拔的元件（不關機更換磁碟，記憶體模組，甚至處理器）—— 但它必然囿於單個地理位置的桎梏。

另一種方法是 **共享磁碟架構（shared-disk architecture）**，它使用多臺具有獨立處理器和記憶體的機器，但將資料儲存在機器之間共享的磁碟陣列上，這些磁碟透過快速網路連線 [^ii]。這種架構用於某些資料倉庫，但競爭和鎖定的開銷限制了共享磁碟方法的可伸縮性【2】。

[^ii]: 網路附屬儲存（Network Attached Storage, NAS），或 **儲存區網路（Storage Area Network, SAN）**

### 無共享架構

相比之下，**無共享架構**【3】（shared-nothing architecture，有時被稱為 **水平伸縮**，即 horizontal scaling，或 **向外伸縮**，即 scaling out）已經相當普及。在這種架構中，執行資料庫軟體的每臺機器 / 虛擬機器都稱為 **節點（node）**。每個節點只使用各自的處理器，記憶體和磁碟。節點之間的任何協調，都是在軟體層面使用傳統網路實現的。

無共享系統不需要使用特殊的硬體，所以你可以用任意機器 —— 比如價效比最好的機器。你也許可以跨多個地理區域分佈資料從而減少使用者延遲，或者在損失一整個資料中心的情況下倖免於難。隨著雲端虛擬機器部署的出現，即使是小公司，現在無需 Google 級別的運維，也可以實現異地分散式架構。

在這一部分裡，我們將重點放在無共享架構上。它不見得是所有場景的最佳選擇，但它是最需要你謹慎從事的架構。如果你的資料分佈在多個節點上，你需要意識到這樣一個分散式系統中的約束和權衡 —— 資料庫並不能魔術般地把這些東西隱藏起來。

雖然分散式無共享架構有許多優點，但它通常也會給應用帶來額外的複雜度，有時也會限制你可用資料模型的表達力。在某些情況下，一個簡單的單執行緒程式可以比一個擁有超過 100 個 CPU 核的叢集表現得更好【4】。另一方面，無共享系統可以非常強大。接下來的幾章，將詳細討論分散式資料會帶來的問題。

### 複製 vs 分割槽

資料分佈在多個節點上有兩種常見的方式：

複製（Replication）
: 在幾個不同的節點上儲存資料的相同副本，可能放在不同的位置。複製提供了冗餘：如果一些節點不可用，剩餘的節點仍然可以提供資料服務。複製也有助於改善效能。[第五章](/v1_tw/ch5) 將討論複製。

分割槽 (Partitioning)
: 將一個大型資料庫拆分成較小的子集（稱為 **分割槽**，即 partitions），從而不同的分割槽可以指派給不同的 **節點**（nodes）。這種做法亦稱 **分片**（sharding）。[第六章](/v1_tw/ch6) 將討論分割槽。

複製和分割槽是不同的機制，但它們經常同時使用。如 [圖 II-1](/v1/ddia_part-ii_01.png) 所示。

![](/v1/ddia_part-ii_01.png)

**圖 II-1 一個資料庫切分為兩個分割槽，每個分割槽都有兩個副本**

理解了這些概念，就可以開始討論在分散式系統中需要做出的困難抉擇。[第七章](/v1_tw/ch7) 將討論 **事務（Transaction）**，這對於瞭解資料系統中可能出現的各種問題，以及我們可以做些什麼很有幫助。[第八章](/v1_tw/ch8) 和 [第九章](/v1_tw/ch9) 將討論分散式系統的根本侷限性。

在本書的 [第三部分](/v1_tw/part-iii) 中，將討論如何將多個（可能是分散式的）資料儲存整合為一個更大的系統，以滿足複雜的應用需求。但首先，我們來聊聊分散式的資料。


## 索引

* [第五章：複製](/v1_tw/ch5)
  * [領導者與追隨者](/v1_tw/ch5#領導者與追隨者)
  * [複製延遲問題](/v1_tw/ch5#複製延遲問題)
  * [多主複製](/v1_tw/ch5#多主複製)
  * [無主複製](/v1_tw/ch5#無主複製)
  * [本章小結](/v1_tw/ch5#本章小結)
* [第六章：分割槽](/v1_tw/ch6)
  * [分割槽與複製](/v1_tw/ch6#分割槽與複製)
  * [鍵值資料的分割槽](/v1_tw/ch6#鍵值資料的分割槽)
  * [分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)
  * [分割槽再平衡](/v1_tw/ch6#分割槽再平衡)
  * [請求路由](/v1_tw/ch6#請求路由)
  * [本章小結](/v1_tw/ch6#本章小結)
* [第七章：事務](/v1_tw/ch7)
  * [事務的棘手概念](/v1_tw/ch7#事務的棘手概念)
  * [弱隔離級別](/v1_tw/ch7#弱隔離級別)
  * [可序列化](/v1_tw/ch7#可序列化)
  * [本章小結](/v1_tw/ch7#本章小結)
* [第八章：分散式系統的麻煩](/v1_tw/ch8)
  * [故障與部分失效](/v1_tw/ch8#故障與部分失效)
  * [不可靠的網路](/v1_tw/ch8#不可靠的網路)
  * [不可靠的時鐘](/v1_tw/ch8#不可靠的時鐘)
  * [知識、真相與謊言](/v1_tw/ch8#知識真相與謊言)
  * [本章小結](/v1_tw/ch8#本章小結)
* [第九章：一致性與共識](/v1_tw/ch9)
  * [一致性保證](/v1_tw/ch9#一致性保證)
  * [線性一致性](/v1_tw/ch9#線性一致性)
  * [順序保證](/v1_tw/ch9#順序保證)
  * [分散式事務與共識](/v1_tw/ch9#分散式事務與共識)
  * [本章小結](/v1_tw/ch9#本章小結)


## 參考文獻

1. Ulrich Drepper: “[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf),” akkadia.org, November 21, 2007.
1. Ben Stopford: “[Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/),” benstopford.com, November 24, 2009.
1. Michael Stonebraker: “[The Case for Shared Nothing](http://db.cs.berkeley.edu/papers/hpts85-nothing.pdf),” IEEE Database Engineering Bulletin, volume 9, number 1, pages 4–9, March 1986.
1. Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at 15th USENIX Workshop on Hot Topics in Operating Systems (HotOS), May 2015.

================================================
FILE: content/v1_tw/part-iii.md
================================================
---
title: 第三部分：衍生資料
weight: 300
breadcrumbs: false
---


在本書的 [第一部分](/v1_tw/part-i) 和 [第二部分](/v1_tw/part-ii) 中，我們自底向上地把所有關於分散式資料庫的主要考量都過了一遍。從資料在磁碟上的佈局，一直到出現故障時分散式系統一致性的侷限。但所有的討論都假定了應用中只用了一種資料庫。

現實世界中的資料系統往往更為複雜。大型應用程式經常需要以多種方式訪問和處理資料，沒有一個資料庫可以同時滿足所有這些不同的需求。因此應用程式通常組合使用多種元件：資料儲存、索引、快取、分析系統等等，並實現在這些元件中移動資料的機制。

本書的最後一部分，會研究將多個不同資料系統（可能有著不同資料模型，並針對不同的訪問模式進行最佳化）整合為一個協調一致的應用架構時，會遇到的問題。軟體供應商經常會忽略這一方面的生態建設，並聲稱他們的產品能夠滿足你的所有需求。在現實世界中，整合不同的系統是實際應用中最重要的事情之一。

## 記錄系統和衍生資料系統

從高層次上看，儲存和處理資料的系統可以分為兩大類：

* 記錄系統（System of record）

  **記錄系統**，也被稱為 **真相源（source of truth）**，持有資料的權威版本。當新的資料進入時（例如，使用者輸入）首先會記錄在這裡。每個事實正正好好表示一次（表示通常是 **正規化的**，即 normalized）。如果其他系統和 **記錄系統** 之間存在任何差異，那麼記錄系統中的值是正確的（根據定義）。

* 衍生資料系統（Derived data systems）

  **衍生系統** 中的資料，通常是另一個系統中的現有資料以某種方式進行轉換或處理的結果。如果丟失衍生資料，可以從原始來源重新建立。典型的例子是 **快取（cache）**：如果資料在快取中，就可以由快取提供服務；如果快取不包含所需資料，則降級由底層資料庫提供。反正規化的值，索引和物化檢視亦屬此類。在推薦系統中，預測彙總資料通常衍生自使用者日誌。

從技術上講，衍生資料是 **冗餘的（redundant）**，因為它重複了已有的資訊。但是衍生資料對於獲得良好的只讀查詢效能通常是至關重要的。它通常是反正規化的。可以從單個源頭衍生出多個不同的資料集，使你能從不同的 “視角” 洞察資料。

並不是所有的系統都在其架構中明確區分 **記錄系統** 和 **衍生資料系統**，但是這是一種有用的區分方式，因為它明確了系統中的資料流：系統的哪一部分具有哪些輸入和哪些輸出，以及它們如何相互依賴。

大多數資料庫，儲存引擎和查詢語言，本質上既不是記錄系統也不是衍生系統。資料庫只是一個工具：如何使用它取決於你自己。**記錄系統和衍生資料系統之間的區別不在於工具，而在於應用程式中的使用方式。**

透過梳理資料的衍生關係，可以清楚地理解一個令人困惑的系統架構。這將貫穿本書的這一部分。

## 章節概述

我們將從 [第十章](/v1_tw/ch10) 開始，研究例如 MapReduce 這樣 **面向批處理（batch-oriented）** 的資料流系統。對於建設大規模資料系統，我們將看到，它們提供了優秀的工具和思想。[第十一章](/v1_tw/ch11) 將把這些思想應用到 **流式資料（data streams）** 中，使我們能用更低的延遲完成同樣的任務。[第十二章](/v1_tw/ch12) 將對本書進行總結，探討如何使用這些工具來構建可靠，可伸縮和可維護的應用。

## 索引

* [第十章：批處理](/v1_tw/ch10)
  * [使用Unix工具的批處理](/v1_tw/ch10#使用Unix工具的批處理)
  * [MapReduce和分散式檔案系統](/v1_tw/ch10#MapReduce和分散式檔案系統)
  * [MapReduce之後](/v1_tw/ch10#MapReduce之後)
  * [本章小結](/v1_tw/ch10#本章小結)
* [第十一章：流處理](/v1_tw/ch11)
  * [傳遞事件流](/v1_tw/ch11#傳遞事件流)
  * [資料庫與流](/v1_tw/ch11#資料庫與流)
  * [流處理](/v1_tw/ch11#流處理)
  * [本章小結](/v1_tw/ch11#本章小結)
* [第十二章：資料系統的未來](/v1_tw/ch12)
  * [資料整合](/v1_tw/ch12#資料整合)
  * [分拆資料庫](/v1_tw/ch12#分拆資料庫)
  * [將事情做正確](/v1_tw/ch12#將事情做正確)
  * [做正確的事情](/v1_tw/ch12#做正確的事情)
  * [本章小結](/v1_tw/ch12#本章小結)

================================================
FILE: content/v1_tw/preface.md
================================================
---
title: 序言
weight: 50
breadcrumbs: false
---


如果近幾年從業於軟體工程，特別是伺服器端和後端系統開發，那麼你很有可能已經被大量關於資料儲存和處理的時髦詞彙轟炸過了： NoSQL！大資料！Web-Scale！分片！最終一致性！ACID！CAP 定理！雲服務！MapReduce！即時！

在最近十年中，我們看到了很多有趣的進展，關於資料庫，分散式系統，以及在此基礎上構建應用程式的方式。這些進展有著各種各樣的驅動力：

* 谷歌、雅虎、亞馬遜、臉書、領英、微軟和推特等網際網路公司正在和巨大的流量 / 資料打交道，這迫使他們去創造能有效應對如此規模的新工具。
* 企業需要變得敏捷，需要低成本地檢驗假設，需要透過縮短開發週期和保持資料模型的靈活性，快速地響應新的市場洞察。
* 免費和開源軟體變得非常成功，在許多環境中比商業軟體和定製軟體更受歡迎。
* 處理器主頻幾乎沒有增長，但是多核處理器已經成為標配，網路也越來越快。這意味著並行化程度只增不減。
* 即使你在一個小團隊中工作，現在也可以構建分佈在多臺計算機甚至多個地理區域的系統，這要歸功於譬如亞馬遜網路服務（AWS）等基礎設施即服務（IaaS）概念的踐行者。
* 許多服務都要求高可用，因停電或維護導致的服務不可用，變得越來越難以接受。

**資料密集型應用（data-intensive applications）** 正在透過使用這些技術進步來推動可能性的邊界。一個應用被稱為 **資料密集型** 的，如果 **資料是其主要挑戰**（資料量，資料複雜度或資料變化速度）—— 與之相對的是 **計算密集型**，即處理器速度是其瓶頸。

幫助資料密集型應用儲存和處理資料的工具與技術，正迅速地適應這些變化。新型資料庫系統（“NoSQL”）已經備受關注，而訊息佇列，快取，搜尋索引，批處理和流處理框架以及相關技術也非常重要。很多應用組合使用這些工具與技術。

這些生意盎然的時髦詞彙體現出人們對新的可能性的熱情，這是一件好事。但是作為軟體工程師和架構師，如果要開發優秀的應用，我們還需要對各種層出不窮的技術及其利弊權衡有精準的技術理解。為了獲得這種洞察，我們需要深挖時髦詞彙背後的內容。

幸運的是，在技術迅速變化的背後總是存在一些持續成立的原則，無論你使用了特定工具的哪個版本。如果你理解了這些原則，就可以領會這些工具的適用場景，如何充分利用它們，以及如何避免其中的陷阱。這正是本書的初衷。

本書的目標是幫助你在飛速變化的資料處理和資料儲存技術大觀園中找到方向。本書並不是某個特定工具的教程，也不是一本充滿枯燥理論的教科書。相反，我們將看到一些成功資料系統的樣例：許多流行應用每天都要在生產中滿足可伸縮性、效能、以及可靠性的要求，而這些技術構成了這些應用的基礎。

我們將深入這些系統的內部，理清它們的關鍵演算法，討論背後的原則和它們必須做出的權衡。在這個過程中，我們將嘗試尋找 **思考** 資料系統的有效方式 —— 不僅關於它們 **如何** 工作，還包括它們 **為什麼** 以這種方式工作，以及哪些問題是我們需要問的。

閱讀本書後，你能很好地決定哪種技術適合哪種用途，並瞭解如何將工具組合起來，為一個良好應用架構奠定基礎。本書並不足以使你從頭開始構建自己的資料庫儲存引擎，不過幸運的是這基本上很少有必要。你將獲得對系統底層發生事情的敏銳直覺，這樣你就有能力推理它們的行為，做出優秀的設計決策，並追蹤任何可能出現的問題。


## 本書的目標讀者

如果你開發的應用具有用於儲存或處理資料的某種伺服器 / 後端系統，而且使用網路（例如，Web 應用、移動應用或連線到網際網路的感測器），那麼本書就是為你準備的。

本書是為軟體工程師，軟體架構師，以及喜歡寫程式碼的技術經理準備的。如果你需要對所從事系統的架構做出決策 —— 例如你需要選擇解決某個特定問題的工具，並找出如何最好地使用這些工具，那麼這本書對你尤有價值。但即使你無法選擇你的工具，本書仍將幫助你更好地瞭解所使用工具的長處和短處。

你應當具有一些開發 Web 應用或網路服務的經驗，且應當熟悉關係型資料庫和 SQL。任何你瞭解的非關係型資料庫和其他與資料相關工具都會有所幫助，但不是必需的。對常見網路協議如 TCP 和 HTTP 的大概理解是有幫助的。程式語言或框架的選擇對閱讀本書沒有任何不同影響。

如果以下任意一條對你為真，你會發現這本書很有價值：

* 你想了解如何使資料系統可伸縮，例如，支援擁有數百萬使用者的 Web 或移動應用。
* 你需要提高應用程式的可用性（最大限度地減少停機時間），保持穩定執行。
* 你正在尋找使系統在長期執行過程易於維護的方法，即使系統規模增長，需求與技術也發生變化。
* 你對事物的運作方式有著天然的好奇心，並且希望知道一些主流網站和線上服務背後發生的事情。這本書剖析了各種資料庫和資料處理系統的內幕，探索這些系統設計中的智慧是非常有趣的。

有時在討論可伸縮的資料系統時，人們會說：“你又不在谷歌或亞馬遜，別操心可伸縮性了，直接上關係型資料庫”。這個陳述有一定的道理：為了不必要的伸縮性而設計程式，不僅會浪費不必要的精力，並且可能會把你鎖死在一個不靈活的設計中。實際上這是一種 “過早最佳化” 的形式。不過，選擇合適的工具確實很重要，而不同的技術各有優缺點。我們將看到，關係資料庫雖然很重要，但絕不是資料處理的終章。


## 本書涉及的領域

本書並不會嘗試告訴讀者如何安裝或使用特定的軟體包或 API，因為已經有大量文件給出了詳細的使用說明。相反，我們會討論資料系統的基礎 —— 各種原則與利弊權衡，並探討了不同產品所做出的不同設計決策。

在電子書中包含了線上資源全文的連結。所有連結在出版時都進行了驗證，但不幸的是，由於網路的自然規律，連結往往會頻繁地破損。如果你遇到連結斷開的情況，或者正在閱讀本書的列印副本，可以使用搜索引擎查詢參考文獻。對於學術論文，你可以在 Google 學術中搜索標題，查詢可以公開獲取的 PDF 檔案。或者，你也可以在 https://github.com/ept/ddia-references 中找到所有的參考資料，我們在那兒維護最新的連結。

我們主要關注的是資料系統的 **架構（architecture）**，以及它們被整合到資料密集型應用中的方式。本書沒有足夠的空間覆蓋部署、運維、安全、管理等領域 —— 這些都是複雜而重要的主題，僅僅在本書中用粗略的註解討論這些對它們很不公平。每個領域都值得用單獨的書去講。

本書中描述的許多技術都被涵蓋在 **大資料（Big Data）** 這個時髦詞的範疇中。然而 “大資料” 這個術語被濫用，缺乏明確定義，以至於在嚴肅的工程討論中沒有用處。這本書使用歧義更小的術語，如 “單節點” 之於 “分散式系統”，或 “線上 / 互動式系統” 之於 “離線 / 批處理系統”。

本書對 **自由和開源軟體（FOSS）** 有一定偏好，因為閱讀、修改和執行原始碼是瞭解某事物詳細工作原理的好方法。開放的平臺也可以降低供應商壟斷的風險。然而在適當的情況下，我們也會討論專有軟體（閉源軟體，軟體即服務 SaaS，或一些在文獻中描述過但未公開發行的公司內部軟體）。

## 本書綱要

本書分為三部分：

1. 在 [第一部分](/v1_tw/part-i) 中，我們會討論設計資料密集型應用所賴的基本思想。我們從 [第一章](/v1_tw/ch1) 開始，討論我們實際要達到的目標：可靠性、可伸縮性和可維護性；我們該如何思考這些概念；以及如何實現它們。在 [第二章](/v1_tw/ch2) 中，我們比較了幾種不同的資料模型和查詢語言，看看它們如何適用於不同的場景。在 [第三章](/v1_tw/ch3) 中將討論儲存引擎：資料庫如何在磁碟上擺放資料，以便能高效地再次找到它。[第四章](/v1_tw/ch4) 轉向資料編碼（序列化），以及隨時間演化的模式。

2. 在 [第二部分](/v1_tw/part-ii) 中，我們從討論儲存在一臺機器上的資料轉向討論分佈在多臺機器上的資料。這對於可伸縮性通常是必需的，但帶來了各種獨特的挑戰。我們首先討論複製（[第五章](/v1_tw/ch5)）、分割槽 / 分片（[第六章](/v1_tw/ch6)）和事務（[第七章](/v1_tw/ch7)）。然後我們將探索關於分散式系統問題的更多細節（[第八章](/v1_tw/ch8)），以及在分散式系統中實現一致性與共識意味著什麼（[第九章](/v1_tw/ch9)）。

3. 在 [第三部分](/v1_tw/part-iii) 中，我們討論那些從其他資料集衍生出一些資料集的系統。衍生資料經常出現在異構系統中：當沒有單個資料庫可以把所有事情都做得很好時，應用需要整合幾種不同的資料庫、快取、索引等。在 [第十章](/v1_tw/ch10) 中我們將從一種衍生資料的批處理方法開始，然後在此基礎上建立在 [第十一章](/v1_tw/ch11) 中討論的流處理。最後，在 [第十二章](/v1_tw/ch12) 中，我們將所有內容彙總，討論在將來構建可靠、可伸縮和可維護的應用程式的方法。


## 參考文獻與延伸閱讀

本書中討論的大部分內容已經在其它地方以某種形式出現過了 —— 會議簡報、研究論文、部落格文章、程式碼、BUG 跟蹤器、郵件列表以及工程習慣中。本書總結了不同來源資料中最重要的想法，並在文字中包含了指向原始文獻的連結。如果你想更深入地探索一個領域，那麼每章末尾的參考文獻都是很好的資源，其中大部分可以免費線上獲取。


## O’Reilly Safari

[Safari](http://oreilly.com/safari) (formerly Safari Books Online) is a membership-based training and reference platform for enterprise, government, educators, and individuals.

Members have access to thousands of books, training videos, Learning Paths, interactive tutorials, and curated playlists from over 250 publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others.

For more information, please visit http://oreilly.com/safari.


## 致謝

本書融合了學術研究和工業實踐的經驗，融合並系統化了大量其他人的想法與知識。在計算領域，我們往往會被各種新鮮花樣所吸引，但我認為前人完成的工作中，有太多值得我們學習的地方了。本書有 800 多處引用：文章、部落格、講座、文件等，對我來說這些都是寶貴的學習資源。我非常感謝這些材料的作者分享他們的知識。

我也從與人交流中學到了很多東西，很多人花費了寶貴的時間與我討論想法並耐心解釋。特別感謝 Joe Adler, Ross Anderson, Peter Bailis, Márton Balassi, Alastair Beresford, Mark Callaghan, Mat Clayton, Patrick Collison, Sean Cribbs, Shirshanka Das, Niklas Ekström, Stephan Ewen, Alan Fekete, Gyula Fóra, Camille Fournier, Andres Freund, John Garbutt, Seth Gilbert, Tom Haggett, Pat Helland, Joe Hellerstein, Jakob Homan, Heidi Howard, John Hugg, Julian Hyde, Conrad Irwin, Evan Jones, Flavio Junqueira, Jessica Kerr, Kyle Kingsbury, Jay Kreps, Carl Lerche, Nicolas Liochon, Steve Loughran, Lee Mallabone, Nathan Marz, Caitie McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede, Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann, Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, 以及 Brett Wooldridge。

更多人透過審閱草稿並提供反饋意見在本書的創作過程中做出了無價的貢獻。我要特別感謝 Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj, David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard, Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski, Phil Potter, Hamid Ramazani, Sam Stokes, 以及 Ben Summers。當然對於本書中的任何遺留錯誤或難以接受的見解，我都承擔全部責任。

為了幫助這本書落地，並且耐心地處理我緩慢的寫作和不尋常的要求，我要對編輯 Marie Beaugureau，Mike Loukides，Ann Spencer 和 O'Reilly 的所有團隊表示感謝。我要感謝 Rachel Head 幫我找到了合適的術語。我要感謝 Alastair Beresford，Susan Goodhue，Neha Narkhede 和 Kevin Scott，在其他工作事務之外給了我充分的創作時間和自由。

特別感謝 Shabbir Diwan 和 Edie Freedman，他們非常用心地為各章配了地圖。他們提出了不落俗套的靈感，創作了這些地圖，美麗而引人入勝，真是太棒了。

最後我要表達對家人和朋友們的愛，沒有他們，我將無法走完這個將近四年的寫作歷程。你們是最棒的。

================================================
FILE: content/v1_tw/toc.md
================================================
---
title: "目錄"
linkTitle: "目錄"
weight: 10
breadcrumbs: false
---


![](/title-v1.jpg)

## [序言](/v1_tw/preface)

## [第一部分：資料系統基礎](/v1_tw/part-i)

### [第一章：可靠性、可伸縮性和可維護性](/v1_tw/ch1)
* [關於資料系統的思考](/v1_tw/ch1#關於資料系統的思考)
* [可靠性](/v1_tw/ch1#可靠性)
* [可伸縮性](/v1_tw/ch1#可伸縮性)
* [可維護性](/v1_tw/ch1#可維護性)
* [本章小結](/v1_tw/ch1#本章小結)
### [第二章：資料模型與查詢語言](/v1_tw/ch2)
* [關係模型與文件模型](/v1_tw/ch2#關係模型與文件模型)
* [資料查詢語言](/v1_tw/ch2#資料查詢語言)
* [圖資料模型](/v1_tw/ch2#圖資料模型)
* [本章小結](/v1_tw/ch2#本章小結)
### [第三章：儲存與檢索](/v1_tw/ch3)
* [驅動資料庫的資料結構](/v1_tw/ch3#驅動資料庫的資料結構)
* [事務處理還是分析？](/v1_tw/ch3#事務處理還是分析)
* [列式儲存](/v1_tw/ch3#列式儲存)
* [本章小結](/v1_tw/ch3#本章小結)
### [第四章：編碼與演化](/v1_tw/ch4)
* [編碼資料的格式](/v1_tw/ch4#編碼資料的格式)
* [資料流的型別](/v1_tw/ch4#資料流的型別)
* [本章小結](/v1_tw/ch4#本章小結)

## [第二部分：分散式資料](/v1_tw/part-ii)

### [第五章：複製](/v1_tw/ch5)
* [領導者與追隨者](/v1_tw/ch5#領導者與追隨者)
* [複製延遲問題](/v1_tw/ch5#複製延遲問題)
* [多主複製](/v1_tw/ch5#多主複製)
* [無主複製](/v1_tw/ch5#無主複製)
* [本章小結](/v1_tw/ch5#本章小結)
### [第六章：分割槽](/v1_tw/ch6)
* [分割槽與複製](/v1_tw/ch6#分割槽與複製)
* [鍵值資料的分割槽](/v1_tw/ch6#鍵值資料的分割槽)
* [分割槽與次級索引](/v1_tw/ch6#分割槽與次級索引)
* [分割槽再平衡](/v1_tw/ch6#分割槽再平衡)
* [請求路由](/v1_tw/ch6#請求路由)
* [本章小結](/v1_tw/ch6#本章小結)
### [第七章：事務](/v1_tw/ch7)
* [事務的棘手概念](/v1_tw/ch7#事務的棘手概念)
* [弱隔離級別](/v1_tw/ch7#弱隔離級別)
* [可序列化](/v1_tw/ch7#可序列化)
* [本章小結](/v1_tw/ch7#本章小結)
### [第八章：分散式系統的麻煩](/v1_tw/ch8)
* [故障與部分失效](/v1_tw/ch8#故障與部分失效)
* [不可靠的網路](/v1_tw/ch8#不可靠的網路)
* [不可靠的時鐘](/v1_tw/ch8#不可靠的時鐘)
* [知識、真相與謊言](/v1_tw/ch8#知識真相與謊言)
* [本章小結](/v1_tw/ch8#本章小結)
### [第九章：一致性與共識](/v1_tw/ch9)
* [一致性保證](/v1_tw/ch9#一致性保證)
* [線性一致性](/v1_tw/ch9#線性一致性)
* [順序保證](/v1_tw/ch9#順序保證)
* [分散式事務與共識](/v1_tw/ch9#分散式事務與共識)
* [本章小結](/v1_tw/ch9#本章小結)

## [第三部分：衍生資料](/v1_tw/part-iii)

### [第十章：批處理](/v1_tw/ch10)
* [使用Unix工具的批處理](/v1_tw/ch10#使用Unix工具的批處理)
* [MapReduce和分散式檔案系統](/v1_tw/ch10#MapReduce和分散式檔案系統)
* [MapReduce之後](/v1_tw/ch10#MapReduce之後)
* [本章小結](/v1_tw/ch10#本章小結)
### [第十一章：流處理](/v1_tw/ch11)
* [傳遞事件流](/v1_tw/ch11#傳遞事件流)
* [資料庫與流](/v1_tw/ch11#資料庫與流)
* [流處理](/v1_tw/ch11#流處理)
* [本章小結](/v1_tw/ch11#本章小結)
### [第十二章：資料系統的未來](/v1_tw/ch12)
* [資料整合](/v1_tw/ch12#資料整合)
* [分拆資料庫](/v1_tw/ch12#分拆資料庫)
* [將事情做正確](/v1_tw/ch12#將事情做正確)
* [做正確的事情](/v1_tw/ch12#做正確的事情)
* [本章小結](/v1_tw/ch12#本章小結)

### [術語表](/v1_tw/glossary)

### [後記](/v1_tw/colophon)


================================================
FILE: content/zh/_index.md
================================================
---
title: 设计数据密集型应用（第二版）
linkTitle: DDIA
cascade:
  type: docs
breadcrumbs: false
---


**作者**： [Martin Kleppmann](https://martin.kleppmann.com)，[《Designing Data-Intensive Applications 2nd Edition》](https://learning.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/ch01.html) ： 英国剑桥大学分布式系统研究员，演讲者，博主和开源贡献者，软件工程师和企业家，曾在 LinkedIn 和 Rapportive 负责数据基础架构。

**译者**：[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
[**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 600](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，公众号：《老冯云数》，数据库 KOL。

**校订**： [@yingang](https://github.com/yingang)  ｜  [繁體中文](/tw) **版本维护** by  [@afunTW](https://github.com/afunTW) ｜ [完整贡献者列表](/contrib)

> [!NOTE]
> **DDIA 第二版** 正在翻译中 ([`main`](https://github.com/Vonng/ddia/tree/main) 分支)，欢迎加入并提出您的宝贵意见！[点击此处阅览第一版](/v1)。


> [!TIP] 预览版读者须知
> 预览版电子书允许你在作者写作时就能获得最原始、未经编辑的内容 —— 这样你就能在这些技术正式发布之前很久就用上它们。
> 如果你想积极参与审阅和评论这份草稿，请在 GitHub 上联系。本书的 GitHub 仓库是 [ept/ddia2-feedback](https://github.com/ept/ddia2-feedback)，中文翻译版的仓库是 [Vonng/ddia](https://github.com/Vonng/ddia)。


## 译序

> 不懂数据库的全栈工程师不是好架构师 —— 冯若航 / Vonng

现今，尤其是在互联网领域，大多数应用都属于数据密集型应用。本书从底层数据结构到顶层架构设计，将数据系统设计中的精髓娓娓道来。其中的宝贵经验无论是对架构师、DBA、还是后端工程师、甚至产品经理都会有帮助。

这是一本理论结合实践的书，书中很多问题，译者在实际场景中都曾遇到过，读来让人击节扼腕。如果能早点读到这本书，该少走多少弯路啊！

这也是一本深入浅出的书，讲述概念的来龙去脉而不是卖弄定义，介绍事物发展演化历程而不是事实堆砌，将复杂的概念讲述得浅显易懂，但又直击本质不失深度。每章最后的引用质量非常好，是深入学习各个主题的绝佳索引。

本书为数据系统的设计、实现、与评价提供了很好的概念框架。读完并理解本书内容后，读者可以轻松看破大多数的技术忽悠，与技术砖家撕起来虎虎生风。

这是 2017 年译者读过最好的一本技术类书籍，这么好的书没有中文翻译，实在是遗憾。某不才，愿为先进技术文化的传播贡献一份力量。既可以深入学习有趣的技术主题，又可以锻炼中英文语言文字功底，何乐而不为？


## 前言

> 在我们的社会中，技术是一种强大的力量。数据、软件、通信可以用于坏的方面：不公平的阶级固化，损害公民权利，保护既得利益集团。但也可以用于好的方面：让底层人民发出自己的声音，让每个人都拥有机会，避免灾难。本书献给所有将技术用于善途的人们。


> 计算是一种流行文化，流行文化鄙视历史。流行文化关乎个体身份和参与感，但与合作无关。流行文化活在当下，也与过去和未来无关。我认为大部分（为了钱）编写代码的人就是这样的，他们不知道自己的文化来自哪里。
>
>  —— 阿兰・凯接受 Dobb 博士的杂志采访时（2012 年）


## 目录

### [序言](/preface)

### [第一部分：数据系统基础](/part-i)

- [1. 数据系统架构中的权衡](/ch1)
- [2. 定义非功能性需求](/ch2)
- [3. 数据模型与查询语言](/ch3)
- [4. 存储与检索](/ch4)
- [5. 编码与演化](/ch5)

### [第二部分：分布式数据](/part-ii)

- [6. 复制](/ch6)
- [7. 分片](/ch7)
- [8. 事务](/ch8)
- [9. 分布式系统的麻烦](/ch9)
- [10. 一致性与共识](/ch10)

### [第三部分：派生数据](/part-iii)

- [11. 批处理](/ch11)
- [12. 流处理](/ch12)
- [13. 流式系统的哲学](/ch13)
- [14. 将事情做正确](/ch14)
- [术语表](/glossary)
- [索引](/indexes)
- [后记](/colophon)


## 法律声明

从原作者处得知，已经有简体中文的翻译计划，将于 2018 年末完成。[购买地址](https://search.jd.com/Search?keyword=设计数据密集型应用)

译者纯粹出于 **学习目的** 与 **个人兴趣** 翻译本书，不追求任何经济利益。

译者保留对此版本译文的署名权，其他权利以原作者和出版社的主张为准。

本译文只供学习研究参考之用，不得公开传播发行或用于商业用途。有能力阅读英文书籍者请购买正版支持。


## 贡献

0. 全文校订 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章语法标点校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 与[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/part-i)前言，[ch2](/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [词汇表](/glossary)、[后记](/colophon)关于野猪的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本与转换脚本 by [@afunTW](https://github.com/afunTW)
7. 多处翻译修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)
8. [感谢所有作出贡献，提出意见的朋友们](/contrib)：

<details>
<summary><a href="https://github.com/Vonng/ddia/pulls">Pull Requests</a> & <a href="https://github.com/Vonng/ddia/issues">Issues</a></summary>

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [386](https://github.com/Vonng/ddia/pull/386)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch2: 优化一处翻译                                                    |
| [384](https://github.com/Vonng/ddia/pull/384)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 优化中文文档的措辞和表达                                              |
| [383](https://github.com/Vonng/ddia/pull/383)   | [@PanggNOTlovebean](https://github.com/PanggNOTlovebean)   | docs: 修正 ch4 中的术语和表达错误                                          |
| [382](https://github.com/Vonng/ddia/pull/382)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch1: 优化一处翻译                                                    |
| [381](https://github.com/Vonng/ddia/pull/381)   | [@Max-Tortoise](https://github.com/Max-Tortoise)           | ch4: 修正一处术语不完整问题                                               |
| [377](https://github.com/Vonng/ddia/pull/377)   | [@huang06](https://github.com/huang06)                     | 优化翻译术语                                                        |
| [375](https://github.com/Vonng/ddia/issues/375) | [@z-soulx](https://github.com/z-soulx)                     | 对于是否100%全中文翻译的必要性讨论？个人-没必要100%，特别是“名词”，有原单词更加适合it人员                 |
| [371](https://github.com/Vonng/ddia/pull/371)   | [@lewiszlw](https://github.com/lewiszlw)                   | CPU core -> CPU 核心                                          |
| [369](https://github.com/Vonng/ddia/pull/369)   | [@bbwang-gl](https://github.com/bbwang-gl)                 | ch7: 可串行化快照隔离检测一个事务何时修改另一个事务的读取                                 |
| [368](https://github.com/Vonng/ddia/pull/368)   | [@yhao3](https://github.com/yhao3)                         | 更新 zh-tw.py 与 zh-tw 内容                                       |
| [367](https://github.com/Vonng/ddia/pull/367)   | [@yhao3](https://github.com/yhao3)                         | 修正拼写、格式和标点问题                                                  |
| [366](https://github.com/Vonng/ddia/pull/366)   | [@yangshangde](https://github.com/yangshangde)             | ch8: 将“电源失败”改为“电源失效”                                           |
| [365](https://github.com/Vonng/ddia/pull/365)   | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [364](https://github.com/Vonng/ddia/issues/364) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化“存储与计算分离”相关翻译                                           |
| [363](https://github.com/Vonng/ddia/pull/363)   | [@xyohn](https://github.com/xyohn)                         | #362: 优化一处翻译                                                 |
| [362](https://github.com/Vonng/ddia/issues/362) | [@xyohn](https://github.com/xyohn)                         | ch1: 优化一处翻译                                                   |
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一处拼写错误                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一处拼写错误                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一处标点错误                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一处格式错误                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一处参考链接                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正两处引用错误                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支持输出为 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一处格式错误                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一处图像链接                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 优化一处翻译                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 优化一处翻译                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 优化两处翻译                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 优化多处翻译                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 优化一处翻译                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一处繁体中文错误                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一处繁体中文错误                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一处翻译错误                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正几处拼写错误                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 优化一处翻译                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一处翻译错误                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一处翻译遗漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 优化一处翻译                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 优化一处翻译                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 优化一处翻译                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 优化一处翻译                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正两处错误                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一处列表错误                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一处错别字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一处公式问题                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多处内部链接错误                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正内部链接错误                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 显示的问题                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 发现了繁体中文版本中的错误翻译                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 链接                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正错别字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 统一了 write skew 的翻译                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 统一了 rebalancing 的翻译                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻译                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正译文中的重复单词                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不准确的翻译                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一处翻译错误                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一处拼写错误                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可串行化”相关内容的多处翻译不当                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重复读”相关内容的多处翻译不当                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“读已提交”相关内容的多处翻译不当                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁体中文的转译错误                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁体中文的转译错误                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通顺的翻译                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通顺的翻译                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正确的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通顺的翻译                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻译                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正错误的图片链接                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁体中文的转译错误：复杂                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正导航栏中的章节名称                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正线性一致的繁体中文翻译                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正错误的翻译                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 优化译文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通顺的翻译                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不准确的翻译                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻译                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正错别字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小标题跳转的问题                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的网址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正错别字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建议docsify的主题风格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻译错误                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁体中文的转译错误                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支持 Github Pages 里的公式显示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 语义网相关翻译更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不变式相关翻译更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正确的中文用词和标点符号                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻译                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重复的译文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通顺的翻译                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 发现错误的文献索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正错误的标点符号                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正错误字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建议将 network model 翻译为网状模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正错误字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正缩略图的错别字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修订sibling相关的翻译                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一处不准确的翻译                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 识别了当前简繁转译过程中处理不当的地方，暂通过转换脚本规避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻译`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新残留的机翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建议去除段首的制表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 发现一处错误格式的章节引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章节Summary中多处不通顺的翻译                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多处不通顺的或错误的翻译                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 优化多处不通顺的或错误的翻译                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 优化多处不通顺的或错误的翻译                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化多处错误的或不通顺的翻译                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化一处容易产生歧义的翻译                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正两处错误的翻译                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正两处强调文本和四处代码变量名称                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一处错误的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一处错误的翻译（功能 -> 函数）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 优化 how best 的翻译（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 统一每章的标题格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重复词语                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改语句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 读已写入数据                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死还是赖活着                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼杀 → 破坏                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改变" rathr than "盖面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:钟-->种，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否则 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->调用，显着->显著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改链接错误                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4几处翻译                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 几个疏漏和格式错误                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 删除一个多余的右括号                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部链接错误                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假简单"->"更加简单"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修复 ch1 中的无序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 纠正多处的翻译小错误                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修复多个链接错误 2.名词优化修订 3.错误修订                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修复一些明显错误                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修复链接错误                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改词语顺序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正错别字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 纠正翻译错误                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目录和本章标题不符的情况                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修复语句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 详细修改了后记中和印度野猪相关的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻译                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01语法微调                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |

</details><br />


---------

## 许可证

本项目采用 [CC-BY 4.0](https://github.com/Vonng/ddia/blob/master/LICENSE) 许可证，您可以在这里找到完整说明：

- [署名 4.0 协议国际版 CC BY 4.0 Deed](https://creativecommons.org/licenses/by/4.0/deed.zh-hans)
- [Attribution 4.0 International CC BY 4.0](https://creativecommons.org/licenses/by/4.0/deed.en)


================================================
FILE: content/zh/ch1.md
================================================
---
title: "1. 数据系统架构中的权衡"
weight: 101
breadcrumbs: false
---

<a id="ch_tradeoffs"></a>

> *没有完美的解决方案，只有权衡取舍。[…] 你能做的就是努力获得最佳的权衡，这就是你所能期望的一切。*
>
> [Thomas Sowell](https://www.youtube.com/watch?v=2YUtKr8-_Fg)，接受 Fred Barnes 采访（2005）

> [!TIP] 早期读者注意事项
> 通过 Early Release 电子书，你可以在最早阶段读到作者写作中的原始、未编辑内容，从而在正式版发布前尽早使用这些技术。
>
> 这将是最终书籍的第 1 章。本书的 GitHub 仓库是 <https://github.com/ept/ddia2-feedback>。
> 如果你希望积极参与本草稿的审阅与评论，请在 GitHub 上联系。

数据是当今应用开发的核心。随着 Web 与移动应用、软件即服务（SaaS）和云服务普及，把许多不同用户的数据存放在共享的服务器端数据基础设施中，已经成为常态。来自用户行为、业务交易、设备与传感器的数据，需要被存储并可用于分析。用户每次与应用交互，既会读取已有数据，也会产生新数据。

当数据量较小、可在单机存储和处理时，问题往往并不复杂。但随着数据规模或查询速率增长，数据必须分布到多台机器上，挑战随之而来。随着需求变得更复杂，仅靠单一系统通常已不足够，你可能需要组合多个具备不同能力的存储与处理系统。

如果“管理数据”是开发过程中的主要挑战之一，我们称这样的应用为 **数据密集型（data-intensive）** 应用 [^1]。与之对照，在 **计算密集型（compute-intensive）** 系统中，难点是并行化超大规模计算；而在数据密集型应用中，我们更常关心的是：如何存储与处理海量数据、如何管理数据变化、如何在故障与并发下保持一致性，以及如何让服务保持高可用。

这类应用通常由若干标准构件搭建而成，每个构件负责一种常见能力。例如，很多应用都需要：

* 存储数据，以便它们或其他应用程序以后能再次找到（**数据库**）
* 记住昂贵操作的结果，以加快读取速度（**缓存**）
* 允许用户按关键字搜索数据或以各种方式过滤数据（**搜索索引**）
* 一旦事件和数据变更发生就立即处理（**流处理**）
* 定期处理累积的大量数据（**批处理**）

在构建应用时，我们通常会选择若干软件系统或服务（例如数据库或 API），再用应用代码把它们拼接起来。如果你的需求恰好落在这些系统的设计边界内，这并不困难。

但当应用目标更有野心时，问题就会出现。数据库有很多种，各自特性不同、适用场景也不同，如何选型？缓存有多种做法，搜索索引也有多种构建方式，如何权衡？当单个工具无法独立完成目标时，如何把多个工具可靠地组合起来？这些都并不简单。

本书正是用来帮助你做这类决策：该用什么技术、怎样组合技术。你会看到，没有哪种方案在根本上永远优于另一种；每种方案都有得失。通过本书，你将学会提出正确问题来评估和比较数据系统，从而为你的具体应用找到更合适的方案。

我们将从今天组织内数据的典型使用方式开始。这些思想很多源自 **企业软件**（即大型组织的软件需求与工程实践，例如大公司和政府机构），因为在历史上，只有这类组织才有足够大的数据规模，值得投入复杂技术方案。如果你的数据足够小，电子表格都可能够用；但近些年，小公司和初创团队构建数据密集型系统也越来越常见。

数据系统的核心难点之一在于：不同的人需要用同一份数据做完全不同的事。在公司里，你和你的团队有自己的优先级，另一个团队即使使用同一数据集，目标也可能完全不同。更麻烦的是，这些目标往往并未被明确表达，容易引发误解和分歧。

为了帮助你了解可以做出哪些选择，本章比较了几个对比概念，并探讨了它们的权衡：

* 事务型系统和分析型系统之间的区别（["分析型与事务型系统"](#sec_introduction_analytics)）；
* 云服务和自托管系统的利弊（["云服务与自托管"](#sec_introduction_cloud)）；
* 何时从单节点系统转向分布式系统（["分布式与单节点系统"](#sec_introduction_distributed)）；以及
* 平衡业务需求和用户权利（["数据系统、法律与社会"](#sec_introduction_compliance)）。

此外，本章还会引入贯穿全书的关键术语。

> [!TIP] 术语：前端和后端
> 本书讨论的大部分内容都与 **后端开发** 相关。对 Web 应用而言，运行在浏览器中的客户端代码称为 **前端**，处理用户请求的服务器端代码称为 **后端**。移动应用也类似前端：它们提供用户界面，通常经由互联网与服务器端后端通信。前端有时会在设备本地管理数据 [^2]，但更棘手的数据基础设施问题通常发生在后端：前端只处理单个用户的数据，而后端需要代表 **所有** 用户管理数据。
>
> 后端服务通常通过 HTTP（有时是 WebSocket）提供访问。其核心是应用代码：在一个或多个数据库中读写数据，并按需接入缓存、消息队列等其他系统（可统称为 **数据基础设施**）。应用代码往往是 **无状态** 的：处理完一个 HTTP 请求后，不保留该请求上下文。因此，凡是需要跨请求持久化的信息，都必须写在客户端，或写入服务器端数据基础设施。


## 分析型与事务型系统 {#sec_introduction_analytics}

如果你在企业中从事数据系统工作，往往会遇到几类不同的数据使用者。第一类是 **后端工程师**，他们构建服务来处理读取与更新数据的请求；这些服务通常直接面向外部用户，或通过其他服务间接提供能力（参见["微服务与无服务器"](#sec_introduction_microservices)）。有时服务也只供组织内部使用。

除了管理后端服务的团队外，通常还有两类人需要访问组织的数据：**业务分析师**，他们生成关于组织活动的报告，以帮助管理层做出更好的决策（**商业智能** 或 **BI**）；以及 **数据科学家**，他们在数据中寻找新的见解，或创建由数据分析和机器学习（AI）支持的面向用户的产品功能（例如，电子商务网站上的“购买了 X 的人也购买了 Y”推荐、风险评分或垃圾邮件过滤等预测分析，以及搜索结果排名）。

尽管业务分析师和数据科学家倾向于使用不同的工具并以不同的方式操作，但他们有一些共同点：两者都执行 **分析**，这意味着他们查看用户和后端服务生成的数据，但他们通常不修改这些数据（除了可能修复错误）。他们可能创建派生数据集，其中原始数据已经以某种方式处理过。这导致了两种类型系统之间的分离——我们将在本书中使用这种区别：

* **事务型系统** 由后端服务和数据基础设施组成，数据在这里被创建，例如在为外部用户提供服务的过程中产生。在这里，应用程序代码基于用户执行的操作读取和修改其数据库中的数据。
* **分析型系统** 服务于业务分析师和数据科学家的需求。它们包含来自事务型系统的只读数据副本，并针对分析所需的数据处理类型进行了优化。

正如我们将在下一节中看到的，事务型系统和分析型系统通常出于充分的理由而保持分离。随着这些系统的成熟，出现了两个新的专业角色：**数据工程师** 和 **分析工程师**。数据工程师是知道如何集成事务型系统和分析型系统的人，并更广泛地负责组织的数据基础设施 [^3]。分析工程师对数据进行建模和转换，使其对组织中的业务分析师和数据科学家更有用 [^4]。

许多工程师只专注于事务型或分析型其中一侧。然而，本书会同时覆盖这两类数据系统，因为它们都在组织内的数据生命周期中扮演关键角色。我们将深入讨论向内外部用户提供服务所需的数据基础设施，帮助你更好地与“另一侧”的同事协作。

### 事务处理与分析的特征 {#sec_introduction_oltp}

在商业数据处理的早期，对数据库的写入通常对应于发生的 **商业交易（commercial transaction）**：进行销售、向供应商下订单、支付员工工资等。随着数据库扩展到不涉及金钱交换的领域，**事务（transaction）** 这个术语仍然保留了下来，指的是形成逻辑单元的一组读取和写入。

> [!NOTE]
> [第 8 章](/ch8#ch_transactions) 详细探讨了我们所说的事务的含义。本章松散地使用该术语来指代低延迟的读取和写入。

尽管数据库开始用于许多不同类型的数据——社交媒体上的帖子、游戏中的移动、地址簿中的联系人等等——但是基本的访问模式仍然类似于处理商业交易。事务型系统通常通过某个键查找少量记录（这称为 **点查询**），并根据用户的输入插入、更新或删除记录。因为这些应用程序是交互式的，这种访问模式被称为 **联机事务处理**（OLTP）。

然而，数据库也越来越多地用于分析，与 OLTP 相比，分析具有非常不同的访问模式。通常，分析查询会扫描大量记录，并计算聚合统计信息（如计数、求和或平均值），而不是将单个记录返回给用户。例如，连锁超市的业务分析师可能想要回答以下分析查询：

* 我们每家商店在一月份的总收入是多少？
* 在我们最近的促销期间，我们比平时多卖出了多少香蕉？
* 哪个品牌的婴儿食品最常与 X 品牌尿布一起购买？

这些类型的查询产生的报告对商业智能很重要，可以帮助管理层决定下一步做什么。为了将这种使用数据库的模式与事务处理区分开来，它被称为 **联机分析处理**（OLAP）[^5]。OLTP 和分析之间的区别并不总是很明确，但[表 1-1](#tab_oltp_vs_olap) 列出了一些典型特征。

{{< figure id="tab_oltp_vs_olap" title="表 1-1. 事务型系统和分析型系统特征比较" class="w-full my-4" >}}

| 属性            | 事务型系统（OLTP）                      | 分析型系统（OLAP）                 |
|-----------------|----------------------------------------|-----------------------------------|
| 主要读取模式    | 点查询（通过键获取单个记录）            | 对大量记录进行聚合                 |
| 主要写入模式    | 创建、更新和删除单个记录                | 批量导入（ETL）或事件流            |
| 人类用户示例    | Web 或移动应用程序的最终用户              | 内部分析师，用于决策支持           |
| 机器使用示例    | 检查操作是否被授权                      | 检测欺诈/滥用模式                  |
| 查询类型        | 固定的查询集，由应用程序预定义          | 分析师可以进行任意查询             |
| 数据代表        | 数据的最新状态（当前时间点）            | 随时间发生的事件历史               |
| 数据集大小      | GB 到 TB                                | TB 到 PB                           |

> [!NOTE]
> OLAP 中 **联机（online）** 的含义并不明确；它大概是指查询不仅用于预定义的报告，分析师还会交互式地使用 OLAP 系统来进行探索性查询。

在事务型系统中，通常不允许用户构建自定义 SQL 查询并在数据库上运行它们，因为这可能会允许他们读取或修改他们没有权限访问的数据。此外，他们可能编写执行成本高昂的查询，从而影响其他用户的数据库性能。出于这些原因，OLTP 系统主要运行嵌入到应用程序代码中的固定查询集，只偶尔使用一次性的自定义查询来进行维护或故障排除。另一方面，分析数据库通常让用户可以自由地手动编写任意 SQL 查询，或使用 Tableau、Looker 或 Microsoft Power BI 等数据可视化或仪表板工具自动生成查询。

还有一种类型的系统是为分析型的工作负载（对许多记录进行聚合的查询）设计的，但嵌入到面向用户的产品中。这一类别被称为 **产品分析** 或 **实时分析**，为这种用途设计的系统包括 Pinot、Druid 和 ClickHouse [^6]。

### 数据仓库 {#sec_introduction_dwh}

起初，相同的数据库既用于事务处理，也用于分析查询。SQL 在这方面相当灵活：它对两种类型的查询都很有效。然而，在 20 世纪 80 年代末和 90 年代初，企业有停止使用其 OLTP 系统进行分析目的的趋势，转而在单独的数据库系统上运行分析。这个单独的数据库被称为 **数据仓库**。

一家大型企业可能有几十个甚至上百个联机事务处理系统：为面向客户的网站提供动力的系统、控制实体店中的销售点（收银台）系统、跟踪仓库中的库存、规划车辆路线、管理供应商、管理员工以及执行许多其他任务。这些系统中的每一个都很复杂，需要一个团队来维护它，因此这些系统最终主要是相互独立地运行。

出于几个原因，业务分析师和数据科学家直接查询这些 OLTP 系统通常是不可取的：

* 感兴趣的数据可能分布在多个事务型系统中，使得在单个查询中组合这些数据集变得困难（称为 **数据孤岛** 的问题）；
* 适合 OLTP 的模式和数据布局不太适合分析（参见["星型和雪花型：分析模式"](/ch3#sec_datamodels_analytics)）；
* 分析查询可能相当昂贵，在 OLTP 数据库上运行它们会影响其他用户的性能；以及
* 出于安全或合规原因，OLTP 系统可能位于不允许用户直接访问的单独网络中。

相比之下，**数据仓库** 是一个单独的数据库，分析师可以随心所欲地查询，而不会影响 OLTP 操作 [^7]。正如我们将在[第 4 章](/ch4#ch_storage)中看到的，数据仓库通常以与 OLTP 数据库非常不同的方式存储数据，以优化分析中常见的查询类型。

数据仓库包含公司中所有各种 OLTP 系统中数据的只读副本。数据从 OLTP 数据库中提取（使用定期数据转储或连续更新流），转换为分析友好的模式，进行清理，然后加载到数据仓库中。这种将数据导入数据仓库的过程称为 **提取-转换-加载**（ETL），如[图 1-1](#fig_dwh_etl) 所示。有时 **转换** 和 **加载** 步骤的顺序会互换（即，先加载，再在数据仓库中进行转换），从而产生 **ELT**。

{{< figure src="/fig/ddia_0101.png" id="fig_dwh_etl" caption="图 1-1. ETL 到数据仓库的简化概述。" class="w-full my-4" >}}

在某些情况下，ETL 过程的数据源是外部 SaaS 产品，如客户关系管理（CRM）、电子邮件营销或信用卡处理系统。在这些情况下，你无法直接访问原始数据库，因为它只能通过软件供应商的 API 访问。将这些外部系统的数据导入你自己的数据仓库可以实现通过 SaaS API 无法实现的分析。SaaS API 的 ETL 通常由专门的数据连接器服务（如 Fivetran、Singer 或 Airbyte）实现。

一些数据库系统提供 **混合事务/分析处理**（HTAP），目标是在单个系统中同时支持 OLTP 和分析，而无需从一个系统 ETL 到另一个系统 [^8] [^9]。然而，许多 HTAP 系统内部由一个 OLTP 系统与一个单独的分析系统耦合组成，隐藏在公共接口后面——因此两者之间的区别对于理解这些系统如何工作仍然很重要。

此外，尽管 HTAP 已出现，但由于目标和约束不同，事务型系统与分析型系统分离仍很常见。尤其是，让每个事务型系统拥有自己的数据库通常被视为良好实践（参见["微服务与无服务器"](#sec_introduction_microservices)），这会形成数百个相互独立的事务型数据库；与之对应，企业往往只有一个统一的数据仓库，以便分析师能在单个查询里组合多个事务型系统的数据。

因此，HTAP 不会取代数据仓库。相反，它在同一应用程序既需要执行扫描大量行的分析查询，又需要以低延迟读取和更新单个记录的场景中很有用。例如，欺诈检测可能涉及此类工作负载 [^10]。

事务型系统和分析型系统之间的分离是更广泛趋势的一部分：随着工作负载变得更加苛刻，系统变得更加专业化并针对特定工作负载进行优化。通用系统可以舒适地处理小数据量，但规模越大，系统往往变得越专业化 [^11]。

#### 从数据仓库到数据湖 {#from-data-warehouse-to-data-lake}

数据仓库通常使用通过 SQL 进行查询的 **关系** 数据模型（参见[第 3 章](/ch3#ch_datamodels)），可能使用专门的商业智能软件。这个模型很适合业务分析师需要进行的查询类型，但不太适合数据科学家的需求，他们可能需要执行以下任务：

* 将数据转换为适合训练机器学习模型的形式；这通常需要将数据库表的行和列转换为称为 **特征** 的数值向量或矩阵。以最大化训练模型性能的方式执行这种转换的过程称为 **特征工程**，它通常需要难以用 SQL 表达的自定义代码。
* 获取文本数据（例如，产品评论）并使用自然语言处理技术尝试从中提取结构化信息（例如，作者的情感或他们提到的主题）。同样，他们可能需要使用计算机视觉技术从照片中提取结构化信息。

尽管已经有人在努力将机器学习算子添加到 SQL 数据模型 [^12] 并在关系基础上构建高效的机器学习系统 [^13]，但许多数据科学家不喜欢在数据仓库等关系数据库中工作。相反，许多人更喜欢使用 Python 数据分析库（如 pandas 和 scikit-learn）、统计分析语言（如 R）和分布式分析框架（如 Spark）[^14]。我们将在["数据框、矩阵和数组"](/ch3#sec_datamodels_dataframes)中进一步讨论这些。

因此，组织面临着以适合数据科学家使用的形式提供数据的需求。答案是 **数据湖**：一个集中的数据存储库，保存任何可能对分析有用的数据副本，通过 ETL 过程从事务型系统获得。与数据仓库的区别在于，数据湖只是包含文件，而不强制任何特定的文件格式或数据模型。数据湖中的文件可能是数据库记录的集合，使用 Avro 或 Parquet 等文件格式编码（参见[第 5 章](/ch5#ch_encoding)），但它们同样可以包含文本、图像、视频、传感器读数、稀疏矩阵、特征向量、基因组序列或任何其他类型的数据 [^15]。除了更灵活之外，这通常也比关系数据存储更便宜，因为数据湖可以使用商品化的文件存储，如对象存储（参见["云原生系统架构"](#sec_introduction_cloud_native)）。

ETL 过程已经泛化为 **数据管道**，在某些情况下，数据湖已成为从事务型系统到数据仓库路径上的中间站。数据湖包含事务型系统产生的“原始”形式的数据，没有转换为关系数据仓库模式。这种方法的优势在于，每个数据消费者都可以将原始数据转换为最适合其需求的形式。它被称为 **寿司原则**：“原始数据更好”[^16]。

除了从数据湖加载数据到单独的数据仓库之外，还可以直接在数据湖中的文件上运行典型的数据仓库工作负载（SQL 查询和业务分析），以及数据科学和机器学习的工作负载。这种架构被称为 **数据湖仓**，它需要一个查询执行引擎和一个元数据（例如，模式管理）层来扩展数据湖的文件存储 [^17]。

Apache Hive、Spark SQL、Presto 和 Trino 是这种方法的例子。

#### 超越数据湖 {#beyond-the-data-lake}

随着分析实践的成熟，组织越来越重视分析系统与数据管道的管理和运维，这一点在 DataOps 宣言中已有体现 [^18]。其中一部分是治理、隐私以及对 GDPR、CCPA 等法规的遵从；我们会在["数据系统、法律与社会"](#sec_introduction_compliance)和["立法与行业自律"](/ch14#sec_future_legislation)中讨论。

此外，分析数据的提供形式也越来越多样：不仅有文件和关系表，也有事件流（见[第 12 章](/ch12#ch_stream)）。基于文件的分析通常通过周期性重跑（例如每天一次）来响应数据变化，而流处理能够让分析系统在秒级响应事件。对于时效性要求高的场景，这种方式很有价值，例如识别并阻断潜在的欺诈或滥用行为。

在某些场景中，分析系统的输出还会回流到事务型系统（这一过程有时称为 **反向 ETL** [^19]）。例如，在分析系统里训练出的机器学习模型会部署到生产环境，为终端用户生成“买了 X 的人也买了 Y”这类推荐。此类分析系统的投产结果也称为 **数据产品** [^20]。机器学习模型可借助 TFX、Kubeflow、MLflow 等专用工具部署到事务型系统。

### 记录系统与派生数据 {#sec_introduction_derived}

与事务型系统和分析型系统的区分相关，本书还区分 **记录系统** 与 **派生数据系统**。这组术语有助于你理清数据在系统中的流向：

权威记录系统
:   记录系统，也称 **真相来源（权威数据源）**，保存某类数据的权威（canonical）版本。新数据进入系统时（例如用户输入）首先写入这里。每个事实只表示一次（这种表示通常是 **规范化** 的；见["规范化、反规范化与连接"](/ch3#sec_datamodels_normalization)）。如果其他系统与记录系统不一致，则按定义以记录系统为准。

派生数据系统
:   派生系统中的数据，是对其他系统中已有数据进行转换或处理后的结果。如果派生数据丢失，可以从原始数据源重新构建。经典例子是缓存：命中时由缓存返回，未命中时回退到底层数据库。反规范化值、索引、物化视图、变换后的数据表示，以及在数据集上训练出的模型，都属于这一类。

从技术上说，派生数据是 **冗余** 的，因为它复制了已有信息。但它往往是读查询高性能的关键。你可以从同一个源数据派生出多个数据集，以不同“视角”观察同一份事实。

分析系统通常属于派生数据系统，因为它消费的是别处产生的数据。事务型服务往往同时包含记录系统和派生数据系统：前者是数据首先写入的主数据库，后者则是用于加速常见读取操作的索引与缓存，尤其针对记录系统难以高效回答的查询。

大多数数据库、存储引擎和查询语言本身并不天然属于“记录系统”或“派生系统”。数据库只是工具，关键在于你如何使用它。两者的区别不在工具本身，而在应用中的职责划分。只要明确“哪些数据由哪些数据派生而来”，原本混乱的系统架构就会清晰很多。

当一个系统的数据由另一个系统的数据派生而来时，你需要在记录系统原始数据变化时同步更新派生数据。不幸的是，很多数据库默认假设应用只依赖单一数据库，并不擅长在多系统之间传播这类更新。在["数据集成"](/ch13#sec_future_integration)中，我们会讨论如何组合多个数据系统，实现单一系统难以独立完成的能力。

至此，我们结束了对分析与事务处理的比较。下一节将讨论另一组常被反复争论的权衡。


## 云服务与自托管 {#sec_introduction_cloud}

对于组织需要做的任何事情，首要问题之一是：应该在内部完成，还是应该外包？应该自建还是购买？

归根结底，这是一个关于业务优先级的问题。公认的管理智慧是，作为组织核心竞争力或竞争优势的事物应该在内部完成，而非核心、例行或常见的事物应该留给供应商 [^21]。
举一个极端的例子，大多数公司不会自己发电（除非他们是能源公司，而且不考虑紧急备用电源），因为从电网购买电力更便宜。

对于软件，需要做出的两个重要决定是谁构建软件和谁部署它。有一系列可能性，每个决定都在不同程度上外包，如[图 1-2](#fig_cloud_spectrum) 所示。
一个极端是你自己编写并在内部运行的定制软件；另一个极端是广泛使用的云服务或软件即服务（SaaS）产品，由外部供应商实施和运营，你只能通过 Web 界面或 API 访问。

{{< figure src="/fig/ddia_0102.png" id="fig_cloud_spectrum" caption="图 1-2. 软件类型及其运维的范围。" class="w-full my-4" >}}

中间地带是你 **自托管** 的现成软件（开源或商业），即自己部署——例如，如果你下载 MySQL 并将其安装在你控制的服务器上。
这可能在你自己的硬件上（通常称为 **本地部署**，即使服务器实际上在租用的数据中心机架中而不是字面上在你自己的场所），或者在云中的虚拟机上（**基础设施即服务** 或 IaaS）。沿着这个范围还有更多的点，例如，采用开源软件并运行其修改版本。

与这个范围分开的还有 **如何** 部署服务的问题，无论是在云中还是在本地——例如，是否使用 Kubernetes 等编排框架。
然而，部署工具的选择超出了本书的范围，因为其他因素对数据系统的架构有更大的影响。

### 云服务的利弊 {#sec_introduction_cloud_tradeoffs}

使用云服务而不是自己运行对应的软件，本质上是将该软件的运维外包给云提供商。
使用云服务有充分的支持和反对理由。云提供商声称，使用他们的服务可以节省你的时间和金钱，并相比自建基础设施让你更敏捷。

云服务实际上是否比自托管更便宜、更容易，很大程度上取决于你的技能和系统的工作负载。
如果你已经有设置和运维所需系统的经验，并且你的负载相当可预测（即，你需要的机器数量不会剧烈波动），
那么购买自己的机器并自己在上面运行软件通常更便宜 [^22] [^23]。

另一方面，如果你需要一个你还不知道如何部署和运维的系统，那么采用云服务通常比学习自己管理系统更容易、更快。
如果你必须专门雇用和培训员工来维护和运营系统，那可能会变得非常昂贵。
使用云时你仍然需要一个运维团队（参见["云时代的运维"](#sec_introduction_operations)），但外包基本的系统管理可以让你的团队专注于更高层次的问题。

当你将系统的运维外包给专门运维该服务的公司时，可能会带来更好的服务，因为供应商在向许多客户提供服务中获得了专业运维知识。
另一方面，如果你自己运维服务，你可以配置和调整它，以专门针对你特定的工作负载进行优化，而云服务不太可能愿意替你进行此类定制。

如果你的系统负载随时间变化很大，云服务特别有价值。如果你配置机器以能够处理峰值负载，但这些计算资源大部分时间都处于空闲状态，系统就变得不太具有成本效益。
在这种情况下，云服务的优势在于它们可以更容易地根据需求变化向上或向下扩展你的计算资源。

例如，分析系统通常具有极其可变的负载：快速运行大型分析查询需要并行使用大量计算资源，但一旦查询完成，这些资源就会处于空闲状态，直到用户进行下一个查询。
预定义的查询（例如，每日报告）可以排队和调度以平滑负载，但对于交互式查询，你越希望它们完成得快，工作负载就变得越可变。
如果你的数据集如此之大，以至于快速查询需要大量的计算资源，使用云可以节省资金，因为你可以将未使用的资源返回给供应商，而不是让它们闲置。对于较小的数据集，这种差异不太显著。

云服务的最大缺点是你无法控制它：

* 如果它缺少你需要的功能，你所能做的就是礼貌地询问供应商是否会添加它；你通常无法自己实现它。
* 如果服务宕机，你所能做的就是等它恢复。
* 如果你以触发错误或导致性能问题的方式使用服务，你将很难诊断问题。对于你自己运行的软件，你可以从操作系统获取性能指标和调试信息来帮助你理解其行为，你可以查看服务器日志，但对于供应商托管的服务，你通常无法访问这些内部信息。
* 此外，如果服务关闭或变得无法接受的昂贵，或者如果供应商决定以你不喜欢的方式更改他们的产品，你就受制于他们 —— 继续运行旧版本的软件通常不是一个可行选项，所以你将被迫迁移到替代服务 [^24]。
  如果有暴露兼容 API 的替代服务，这种风险会得到缓解，但对于许多云服务，没有标准 API，这增加了切换成本，使供应商锁定成为一个问题。
* 云供应商需要被信任以保持数据安全，这可能会使遵守隐私和安全法规的过程复杂化。

尽管有所有这些风险，组织在云服务之上构建新应用程序或采用混合方法（在系统的某些部分使用云服务）变得越来越流行。然而，云服务不会取代所有内部数据系统：许多较旧的系统早于云，对于任何具有现有云服务无法满足的专业要求的服务，内部系统仍然是必要的。例如，对延迟非常敏感的应用程序（如高频交易）需要对硬件的完全控制。

### 云原生系统架构 {#sec_introduction_cloud_native}

除了具有不同的经济模型（订阅服务而不是购买硬件和许可软件在其上运行）之外，云的兴起也对数据系统在技术层面的实现产生了深远的影响。
术语 **云原生** 用于描述旨在利用云服务的架构。

原则上，几乎任何可自托管的软件都可以做成云服务；事实上，许多主流数据系统都已有托管版本。
不过，从零设计为云原生的系统已经展示出若干优势：同等硬件下性能更好、故障恢复更快、能更快按负载扩缩计算资源，并支持更大数据集 [^25] [^26] [^27]。[表 1-2](#tab_cloud_native_dbs) 给出两类系统的一些示例。

{{< figure id="tab_cloud_native_dbs" title="表 1-2. 自托管与云原生数据库系统示例" class="w-full my-4" >}}

| 类别              | 自托管系统                  | 云原生系统                                                            |
|------------------|----------------------------|----------------------------------------------------------------------|
| 事务型/OLTP      | MySQL、PostgreSQL、MongoDB  | AWS Aurora [^25]、Azure SQL DB Hyperscale [^26]、Google Cloud Spanner |
| 分析型/OLAP      | Teradata、ClickHouse、Spark | Snowflake [^27]、Google BigQuery、Azure Synapse Analytics             |

#### 云服务的分层 {#layering-of-cloud-services}

许多自托管数据系统的系统要求非常简单：它们在传统操作系统（如 Linux 或 Windows）上运行，将数据存储为文件系统上的文件，并通过 TCP/IP 等标准网络协议进行通信。
少数系统依赖于特殊硬件，如 GPU（用于机器学习）或 RDMA 网络接口，但总的来说，自托管软件倾向于使用非常通用的计算资源：CPU、RAM、文件系统和 IP 网络。

在云中，这种类型的软件可以在基础设施即服务（IaaS）环境中运行，使用一个或多个虚拟机（或 **实例**），分配一定的 CPU、内存、磁盘和网络带宽。
与物理机器相比，云实例可以更快地配置，并且有更多种类的大小，但除此之外，它们与传统计算机类似：你可以在上面运行任何你喜欢的软件，但你负责自己管理它。

相比之下，云原生服务的关键思想是不仅使用由操作系统管理的计算资源，还基于较低级别的云服务构建更高级别的服务。例如：

* 使用 **对象存储** 服务（如 Amazon S3、Azure Blob Storage 和 Cloudflare R2）存储大文件。它们提供比典型文件系统更有限的 API（基本文件读写），但它们的优势在于隐藏了底层物理机器：服务自动将数据分布在许多机器上，因此你不必担心任何一台机器上的磁盘空间用完。即使某些机器或其磁盘完全故障，也不会丢失数据。
* 在对象存储和其他云服务之上建立更多的服务：例如，Snowflake 是一个基于云的分析数据库（数据仓库），依赖于 S3 进行数据存储 [^27]，而一些其他服务反过来建立在 Snowflake 之上。

与计算中的抽象一样，没有一个正确的答案告诉你应该使用什么。作为一般规则，更高级别的抽象往往更面向特定的用例。如果你的需求恰好与某个更高级别系统的设计场景相匹配，使用现有的高级别系统可能会比自己从较低级别系统构建更轻松，且更能满足你的需求。另一方面，如果没有满足你需求的高级系统，那么从较低级别的组件自己构建它是唯一的选择。

#### 存储与计算的分离 {#sec_introduction_storage_compute}

在传统计算中，磁盘存储被认为是持久的（我们假设一旦某些东西被写入磁盘，它就不会丢失）。为了容忍单个硬盘的故障，通常使用 RAID（独立磁盘冗余阵列）在连接到同一台机器的几个磁盘上维护数据副本。RAID 可以在硬件中执行，也可以由操作系统在软件中执行，它对访问文件系统的应用程序是透明的。

在云中，计算实例（虚拟机）也可能有本地磁盘连接，但云原生系统通常将这些磁盘更多地视为临时缓存，而不是长期存储。这是因为如果关联的实例出现故障，或者为了适应负载变化而将实例替换为更大或更小的实例（在不同的物理机器上），本地磁盘就会变得不可访问。

作为本地磁盘的替代方案，云服务还提供可以从一个实例分离并附加到另一个实例的虚拟磁盘存储（Amazon EBS、Azure 托管磁盘和 Google Cloud 中的持久磁盘）。这种虚拟磁盘实际上不是物理磁盘，而是由一组单独的机器提供的云服务，它模拟磁盘的行为（**块设备**，其中每个块通常为 4 KiB 大小）。这项技术使得在云中运行传统的基于磁盘的软件成为可能，但块设备仿真所引入的开销在一开始就为云设计的系统中是可以避免的 [^25]。它还使应用程序对网络故障非常敏感，因为虚拟块设备上的每个 I/O 实际上都是网络调用 [^28]。

为了解决这个问题，云原生服务通常避免使用虚拟磁盘，而是建立在针对特定工作负载优化的专用存储服务之上。对象存储服务（如 S3）设计用于长期存储相当大的文件，大小从数百 KB 到几 GB 不等。数据库中存储的单个行或值通常比这小得多；因此，云数据库通常在单独的服务中管理较小的值，并将较大的数据块（包含许多单个值）存储在对象存储中 [^26] [^29]。我们将在[第 4 章](/ch4#ch_storage)中看到这样做的方法。

在传统的系统架构中，同一台计算机负责存储（磁盘）和计算（CPU 和 RAM），但在云原生系统中，这两个职责已经在某种程度上分离或 **解耦** [^9] [^27] [^30] [^31]：例如，S3 只存储文件，如果你想分析该数据，你必须在 S3 之外的某个地方运行分析代码。这意味着通过网络传输数据，我们将在["分布式与单节点系统"](#sec_introduction_distributed)中进一步讨论。

此外，云原生系统通常是 **多租户** 的，这意味着不是每个客户都有一台单独的机器，而是来自几个不同客户的数据和计算由同一服务在同一共享硬件上处理 [^32]。

多租户可以实现更好的硬件利用率、更容易的可伸缩性和云提供商更容易的管理，但它也需要仔细的工程设计，以确保一个客户的活动不会影响其他客户的系统的性能或安全性 [^33]。

### 云时代的运维 {#sec_introduction_operations}

传统上，管理组织服务器端数据基础设施的人员被称为 **数据库管理员**（DBA）或 **系统管理员**（sysadmins）。最近，许多组织已经尝试将软件开发和运维的角色整合到团队中，共同负责后端服务和数据基础设施；**DevOps** 理念引导了这一趋势。**站点可靠性工程师**（SRE）是 Google 对这个想法的实现 [^34]。

运维的作用是确保服务可靠地交付给用户（包括配置基础设施和部署应用程序），并确保稳定的生产环境（包括监控和诊断可能影响可靠性的任何问题）。对于自托管系统，运维传统上涉及大量在单个机器级别的工作，例如容量规划（例如，监控可用磁盘空间并在空间用完之前添加更多磁盘）、配置新机器、将服务从一台机器移动到另一台机器，以及安装操作系统补丁。

许多云服务提供了 API 来隐藏实际实现服务的单个机器。例如，云存储用 **计量计费** 替换固定大小的磁盘，你可以存储数据而无需提前规划容量需求，然后根据实际使用的空间收费。此外，即使在单个机器发生故障时，许多云服务仍能保持高可用性（参见["可靠性与容错"](/ch2#sec_introduction_reliability)）。

从单个机器到服务的重点转移伴随着运维角色的变化。提供可靠服务的高级目标保持不变，但流程和工具已经发展。DevOps/SRE 理念更加强调：

* 自动化——优先考虑可重复的流程而不是手动的一次性工作，
* 优先考虑短暂的虚拟机和服务而不是长期运行的服务器，
* 启用频繁的应用程序更新，
* 从事故中学习，以及
* 保留组织关于系统的知识，即使组织里的人员在不断流动 [^35]。

随着云服务的兴起，角色出现了分叉：基础设施公司的运维团队专门研究向大量客户提供可靠服务的细节，而服务的客户在基础设施上花费尽可能少的时间和精力 [^36]。

云服务的客户仍然需要运维，但他们专注于不同的方面，例如为给定任务选择最合适的服务、将不同服务相互集成，以及从一个服务迁移到另一个服务。即使计量计费消除了传统意义上的容量规划需求，了解你为哪个目的使用哪些资源仍然很重要，这样你就不会在不需要的云资源上浪费金钱：容量规划变成了财务规划，性能优化变成了成本优化 [^37]。

此外，云服务确实有资源限制或 **配额**（例如你可以同时运行的最大进程数），你需要在遇到它们之前了解并规划这些 [^38]。

采用云服务可能比运行自己的基础设施更容易、更快，尽管学习如何使用它也有成本，也许还要解决其限制。随着越来越多的供应商提供针对不同用例的更广泛的云服务，不同服务之间的集成成为一个特别的挑战 [^39] [^40]。

ETL（参见["数据仓库"](#sec_introduction_dwh)）只是故事的一部分；面向事务处理的云服务之间也需要相互集成。目前，缺乏能促进这类集成的标准，因此往往仍要投入大量手工工作。

无法完全外包给云服务的其他运维方面包括维护应用程序及其使用的库的安全性、管理你自己的服务之间的交互、监控服务的负载，以及追踪问题的原因，例如性能下降或中断。虽然云正在改变运维的角色，但对运维的需求比以往任何时候都大。


## 分布式与单节点系统 {#sec_introduction_distributed}

涉及多台机器通过网络通信的系统称为 **分布式系统**。参与分布式系统的每个进程称为 **节点**。你希望采用分布式系统的原因可能有多种：

固有的分布式系统
:   如果应用程序涉及两个或多个交互用户，每个用户使用自己的设备，那么系统不可避免地是分布式的：设备之间的通信必须通过网络进行。

云服务之间的请求
:   如果数据存储在一个服务中但在另一个服务中处理，则必须通过网络从一个服务传输到另一个服务。

容错/高可用性
:   如果你的应用程序需要在一台机器（或几台机器、网络或整个数据中心）发生故障时继续工作，你可以使用多台机器为你提供冗余。当一台故障时，另一台可以接管。参见["可靠性与容错"](/ch2#sec_introduction_reliability)和[第 6 章](/ch6#ch_replication)关于复制的内容。

可伸缩性
:   如果你的数据量或计算需求增长超过单台机器的处理能力，你可以潜在地将负载分散到多台机器上。参见["可伸缩性"](/ch2#sec_introduction_scalability)。

延迟
:   如果你在世界各地都有用户，你可能希望在全球各个地区都有服务器，以便每个用户都可以从地理位置接近他们的服务器获得服务。这避免了用户必须等待网络数据包绕地球半圈才能回答他们的请求。参见["描述性能"](/ch2#sec_introduction_percentiles)。

弹性
:   如果你的应用程序在某些时候很忙，在其他时候很空闲，云部署可以根据需求向上或向下伸缩，因此你只需为实际使用的资源付费。这在单台机器上更困难，它需要按处理最大负载的情况进行配置，即使在几乎不使用的时候也是如此。

使用专用硬件
:   系统的不同部分可以利用不同类型的硬件来匹配其工作负载。例如，对象存储可能使用具有许多磁盘但很少 CPU 的机器，而数据分析系统可能使用具有大量 CPU 和内存但没有磁盘的机器，机器学习系统可能使用具有 GPU 的机器（GPU 在训练深度神经网络和其他机器学习任务方面比 CPU 效率高得多）。

法律合规
:   一些国家有数据驻留法律，要求其管辖范围内的人员数据必须在该国地理范围内存储和处理 [^41]。这些规则的范围各不相同——例如，在某些情况下，它仅适用于医疗或金融数据，而其他情况则更广泛。因此，在几个这样的管辖区域中拥有用户的服务不得不将他们的数据分布在几个位置的服务器上。

可持续性
:   如果你能灵活把控作业运行的地点和时间，你可能能够在可再生电力充足的时间和地点运行它们，并避免在电网紧张时运行它们。这可以减少你的碳排放，并允许你利用到廉价的电力 [^42] [^43]。

这些原因既适用于你自己编写的服务（应用程序代码），也适用于由现成软件（如数据库）组成的服务。

### 分布式系统的问题 {#sec_introduction_dist_sys_problems}

分布式系统也有缺点。通过网络进行的每个请求和 API 调用都需要处理失败的可能性：网络可能中断，或者服务可能过载或崩溃，因此任何请求都可能超时而没有收到响应。在这种情况下，我们不知道服务是否收到了请求，简单地重试它可能不安全。我们将在[第 9 章](/ch9#ch_distributed)中详细讨论这些问题。

尽管数据中心网络很快，但调用另一个服务仍然比在同一进程中调用函数慢得多 [^44]。

在处理大量数据时，与其将数据从其存储处传输到处理它的单独机器，将计算带到已经拥有数据的机器上可能更快 [^45]。

更多的节点并不总是更快：在某些情况下，一个简单的单线程程序在单台计算机上运行的性能可以比在具有 100 多个 CPU 核心的集群上更好 [^46]。

对分布式系统进行故障排除通常很困难：如果系统响应缓慢，你如何找出问题所在？**可观测性** [^47] [^48] 技术可以用来对分布式系统中的问题进行诊断，这涉及到系统执行数据的收集，并提供查询方式来支持对高层级的指标或单个的事件的分析。**追踪** 工具（如 OpenTelemetry、Zipkin 和 Jaeger）允许你跟踪哪个客户端为哪个操作调用了哪个服务器，以及每次调用花费了多长时间 [^49]。

数据库提供了各种机制来确保数据一致性，正如我们将在[第 6 章](/ch6#ch_replication)和[第 8 章](/ch8#ch_transactions)中看到的。然而，当每个服务都有自己的数据库时，维护这些不同服务之间的数据一致性就成了应用程序的问题。分布式事务（我们在[第 8 章](/ch8#ch_transactions)中探讨）是确保一致性的一种可能技术，但它们在微服务上下文中很少使用，因为它们违背了使服务彼此独立的目标，而且许多数据库不支持它们 [^50]。

出于所有这些原因，如果你可以在单台机器上做某件事情，与搭建分布式系统相比通常要简单得多，成本也更低 [^23] [^46] [^51]。CPU、内存和磁盘已经变得更大、更快、更可靠。当与 DuckDB、SQLite 和 KùzuDB 等单节点数据库结合使用时，许多工作负载现在可以在单个节点上运行。我们将在[第 4 章](/ch4#ch_storage)中进一步探讨这个主题。

### 微服务与无服务器 {#sec_introduction_microservices}

在多台机器上分布系统的最常见方式是将它们分为客户端和服务器，并让客户端向服务器发出请求。最常见的是使用 HTTP 进行此通信，正如我们将在["流经服务的数据流：REST 和 RPC"](/ch5#sec_encoding_dataflow_rpc)中讨论的。同一进程可能既是服务器（处理传入请求）又是客户端（向其他服务发出出站请求）。

这种构建应用程序的方式传统上被称为 **面向服务的体系结构**（SOA）；最近，这个想法已经被细化为 **微服务** 架构 [^52] [^53]。在这种架构中，服务有一个明确定义的目的（例如，对于 S3 来说，这个目的是文件存储）；每个服务公开一个可以由客户端通过网络调用的 API，每个服务有一个负责其维护的团队。因此，复杂的应用程序可以分解为多个交互服务，每个服务由单独的团队管理。

将复杂的软件分解为多个服务有几个优点：每个服务可以独立更新，减少团队之间的协调工作；每个服务可以分配它需要的硬件资源；通过将实现细节隐藏在 API 后面，服务所有者可以自由地更改实现而不影响客户端。在数据存储方面，每个服务通常有自己的数据库，而不在服务之间共享数据库：共享数据库实际上会使整个数据库结构成为服务 API 的一部分，然后该结构将很难更改。共享数据库还可能导致一个服务的查询对其他服务的性能产生负面影响。

另一方面，拥有许多服务本身可能会带来复杂性：每个服务都需要用于部署新版本、调整分配的硬件资源以匹配负载、收集日志、监控服务健康状况以及在出现问题时向值班工程师发出警报的基础设施。**编排** 框架（如 Kubernetes）已成为部署服务的流行方式，因为它们为这种基础设施提供了基础。在开发期间测试服务可能很复杂，因为你还需要运行它所依赖的所有其他服务。

微服务 API 的演进可能具有挑战性。调用 API 的客户端期望 API 具有某些字段。开发人员可能希望根据业务需求的变化向 API 添加或删除字段，但这样做可能会导致客户端失败。更糟糕的是，这种失败通常要到开发周期的后期、更新后的服务 API 被部署到预生产或生产环境时才会被发现。API 描述标准（如 OpenAPI 和 gRPC）有助于管理客户端和服务器 API 之间的关系；我们将在[第 5 章](/ch5#ch_encoding)中进一步讨论这些。

微服务主要是人员问题的技术解决方案：允许不同的团队独立取得进展，而无需相互协调。这在大公司中很有价值，但在没有很多团队的小公司中，使用微服务可能是不必要的开销，最好以最简单的方式实现应用程序 [^52]。

**无服务器（Serverless）**，或 **函数即服务**（FaaS），是另一种部署方式：基础设施管理进一步外包给云厂商 [^33]。使用虚拟机时，你需要显式决定何时启动、何时关闭实例；而在无服务器模型中，云厂商会根据进入服务的请求自动分配和回收计算资源 [^54]。这种部署方式把更多运维负担转移给云厂商，并支持按使用量计费，而不是按实例计费。为实现这些优势，许多无服务器平台会限制函数执行时长、限制运行时环境，并在函数首次调用时出现较慢冷启动。术语“无服务器”本身也容易误导：每次函数执行依然运行在某台服务器上，只是后续执行未必在同一台机器上。此外，BigQuery 及多种 Kafka 产品也采用“Serverless”术语，强调其服务可自动扩缩容且按使用量计费。

就像云存储以计量计费取代了传统容量规划（预先决定买多少磁盘）一样，无服务器模式把同样的计费逻辑带到了代码执行层：你只为代码实际运行的时间付费，而不必预先准备固定资源。

### 云计算与超级计算 {#id17}

云计算不是构建大规模计算系统的唯一方式；另一种选择是 **高性能计算**（HPC），也称为 **超级计算**。尽管有重叠，但与云计算和企业数据中心系统相比，HPC 通常有不同的设计考量并使用不同的技术。其中一些差异是：

* 超级计算机通常用于计算密集型科学计算任务，例如天气预报、气候建模、分子动力学（模拟原子和分子的运动）、复杂的优化问题和求解偏微分方程。另一方面，云计算往往用于在线服务、业务数据系统和需要以高可用性为用户请求提供服务的类似系统。
* 超级计算机通常运行大型批处理作业，定期将其计算状态检查点保存到磁盘。如果节点发生故障，常见的解决方案是简单地停止整个集群工作负载，修复故障节点，然后从最后一个检查点重新启动计算 [^55] [^56]。对于云服务，通常不希望停止整个集群，因为服务需要以最小的中断持续为用户提供服务。
* 超级计算机节点通常通过共享内存和远程直接内存访问（RDMA）进行通信，这支持高带宽和低延迟，但假设系统用户之间有高度的信任 [^57]。在云计算中，网络和机器通常由相互不信任的组织共享，需要更强的安全机制，如资源隔离（例如虚拟机）、加密和身份验证。
* 云数据中心网络通常基于 IP 和以太网，以 Clos 拓扑排列以提供高对分带宽——这是网络整体性能的常用度量 [^55] [^58]。超级计算机通常使用专门的网络拓扑，例如多维网格和环面 [^59]，这能让具有已知通信模式的 HPC 工作负载产生更好的性能。
* 云计算允许节点分布在多个地理区域，而超级计算机通常假设它们的所有节点都靠近在一起。

大规模分析系统有时与超级计算共享一些特征，如果你在这个领域工作，了解这些技术可能是值得的。然而，本书主要关注需要持续可用的服务，如["可靠性与容错"](/ch2#sec_introduction_reliability)中所讨论的。

## 数据系统、法律与社会 {#sec_introduction_compliance}

到目前为止，你已经在本章中看到，数据系统的架构不仅受到技术目标和要求的影响，还受到它们所支持的组织的人力需求的影响。越来越多的数据系统工程师认识到，仅服务于自己企业的需求是不够的：我们还对整个社会负有责任。

一个特别的关注点是存储有关人员及其行为数据的系统。自 2018 年以来，**通用数据保护条例**（GDPR）赋予了许多欧洲国家居民对其个人数据更大的控制权和法律权利，类似的隐私法规已在世界各地的各个国家和州采用，例如加州消费者隐私法（CCPA）。关于 AI 的法规，例如 **欧盟 AI 法案**，对个人数据的使用方式施加了进一步的限制。

此外，即使在不直接受法规约束的领域，人们也越来越认识到计算机系统对人和社会的影响。社交媒体改变了个人消费新闻的方式，这影响了他们的政治观点，因此可能影响选举结果。自动化系统越来越多地做出对个人产生深远影响的决策，例如决定谁应该获得贷款或保险覆盖，谁应该被邀请参加工作面试，或者谁应该被怀疑犯罪 [^60]。

每个从事此类系统工作的人都有责任考虑道德影响并确保他们遵守相关法律。没有必要让每个人都成为法律和道德专家，但对法律和道德原则的基本认识与分布式系统中的一些基础知识同样重要。

法律考虑正在影响数据系统设计的基础 [^61]。例如，GDPR 授予个人在请求时删除其数据的权利（有时称为 **被遗忘权**）。然而，正如我们将在本书中看到的，许多数据系统依赖不可变构造（如仅追加日志）作为其设计的一部分；我们如何确保能够删除一个本应不可变的文件中间的某些数据？对于已被纳入派生数据集（参见["记录系统与派生数据"](#sec_introduction_derived)）的数据，例如机器学习模型的训练数据，我们又该如何处理其删除？回答这些问题会带来新的工程挑战。

目前，我们对于哪些特定技术或系统架构应被视为“符合 GDPR”没有明确的指导方针。法规故意不强制要求特定技术，因为随着技术的进步，这些技术可能会迅速变化。相反，法律文本规定了需要解释的高层级原则。这意味着如何遵守隐私法规的问题没有简单的答案，但我们将通过这个视角来看待本书中的一些技术。

一般来说，我们存储数据是因为我们认为其价值大于存储它的成本。然而，值得记住的是，存储成本不仅仅是你为 Amazon S3 或其他服务支付的账单：成本效益计算还应该考虑到如果数据被泄露或被对手入侵的责任和声誉损害风险，以及如果数据的存储和处理被发现不符合法律的法律成本和罚款风险 [^51]。

政府或警察部队也可能迫使公司交出数据。当存在数据可能暴露犯罪行为的风险时（例如，在几个中东和非洲国家的同性恋，或在几个美国州寻求堕胎），存储该数据会为用户创造真正的安全风险。例如，去堕胎诊所的行程很容易被位置数据泄露，甚至可能通过用户 IP 地址随时间的日志（表示大致位置）泄露。

一旦考虑到所有风险，可能合理地决定某些数据根本不值得存储，因此应该删除。这个 **数据最小化** 原则（有时以德语术语 **Datensparsamkeit** 为人所知）与“大数据”哲学相反，后者是投机性地存储大量数据，以防将来有用 [^62]。但它符合 GDPR，该法规要求个人数据只能为指定的、明确的目的收集，这些数据以后不得用于任何其他目的，并且数据不得保留超过收集目的所需的时间 [^63]。

企业也注意到了隐私和安全问题。信用卡公司要求处理支付的企业遵守严格的支付卡行业（PCI）标准。处理商需要经常接受独立审计师的评估，以验证持续的合规性。软件供应商也受到了更多的审查。现在许多买家要求他们的供应商遵守服务组织控制（SOC）类型 2 标准。与 PCI 合规性一样，供应商需要接受第三方审计以验证遵守情况。

总的来说，关键在于平衡业务目标与被收集、被处理数据的人们的权益。这个主题还有很多内容；在[第 14 章](/ch14#ch_right_thing)中，我们会进一步讨论伦理与法律合规，以及偏见与歧视等问题。


## 总结 {#summary}

本章的主线是理解“权衡”。对许多问题而言，并不存在唯一正确答案，而是有多种路径，各有利弊。我们讨论了影响数据系统架构的几个关键选择，并引入了后续章节会反复使用的术语。

我们首先区分了事务型（事务处理，OLTP）和分析型（OLAP）系统。它们不仅面对不同访问模式与数据类型，也服务于不同人群。我们还看到数据仓库与数据湖这两类体系，它们通过 ETL 接收来自事务型系统的数据。在[第 4 章](/ch4#ch_storage)中，我们会看到由于查询类型不同，事务型与分析型系统常常采用截然不同的内部数据布局。

随后，我们把相对较新的云服务模式与长期主导数据系统架构的自托管范式做了比较。哪种方式更具成本效益高度依赖具体情境，但不可否认，云原生架构正在深刻改变数据系统的构建方式，例如存储与计算的分离。

云系统天然是分布式系统，我们也简要讨论了它与单机方案之间的权衡。有些场景无法避免分布式，但如果单机可行，不必急于把系统分布式化。在[第 9 章](/ch9#ch_distributed)中，我们会更深入地讨论分布式系统的挑战。

最后，数据系统架构不仅由企业自身需求决定，也受保护数据主体权利的隐私法规所塑造，而这一点常被工程实践忽略。如何把法律要求转化为技术实现，目前仍无标准答案；但在阅读本书后续内容时，始终带着这个问题会很重要。

### 参考文献

[^1]: Richard T. Kouzes, Gordon A. Anderson, Stephen T. Elbert, Ian Gorton, and Deborah K. Gracio. [The Changing Paradigm of Data-Intensive Computing](http://www2.ic.uff.br/~boeres/slides_AP/papers/TheChanginParadigmDataIntensiveComputing_2009.pdf). *IEEE Computer*, volume 42, issue 1, January 2009. [doi:10.1109/MC.2009.26](https://doi.org/10.1109/MC.2009.26)
[^2]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: you own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *2019 ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^3]: Joe Reis and Matt Housley. [*Fundamentals of Data Engineering*](https://www.oreilly.com/library/view/fundamentals-of-data/9781098108298/). O’Reilly Media, 2022. ISBN: 9781098108304
[^4]: Rui Pedro Machado and Helder Russa. [*Analytics Engineering with SQL and dbt*](https://www.oreilly.com/library/view/analytics-engineering-with/9781098142377/). O’Reilly Media, 2023. ISBN: 9781098142384
[^5]: Edgar F. Codd, S. B. Codd, and C. T. Salley. [Providing OLAP to User-Analysts: An IT Mandate](https://www.estgv.ipv.pt/PaginasPessoais/jloureiro/ESI_AID2007_2008/fichas/codd.pdf). E. F. Codd Associates, 1993. Archived at [perma.cc/RKX8-2GEE](https://perma.cc/RKX8-2GEE)
[^6]: Chinmay Soman and Neha Pawar. [Comparing Three Real-Time OLAP Databases: Apache Pinot, Apache Druid, and ClickHouse](https://startree.ai/blog/a-tale-of-three-real-time-olap-databases). *startree.ai*, April 2023. Archived at [perma.cc/8BZP-VWPA](https://perma.cc/8BZP-VWPA)
[^7]: Surajit Chaudhuri and Umeshwar Dayal. [An Overview of Data Warehousing and OLAP Technology](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/sigrecord.pdf). *ACM SIGMOD Record*, volume 26, issue 1, pages 65–74, March 1997. [doi:10.1145/248603.248616](https://doi.org/10.1145/248603.248616)
[^8]: Fatma Özcan, Yuanyuan Tian, and Pinar Tözün. [Hybrid Transactional/Analytical Processing: A Survey](https://humming80.github.io/papers/sigmod-htaptut.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3054784](https://doi.org/10.1145/3035918.3054784)
[^9]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/abs/10.1145/3514221.3526055). At *International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055)
[^10]: Chao Zhang, Guoliang Li, Jintao Zhang, Xinning Zhang, and Jianhua Feng. [HTAP Databases: A Survey](https://arxiv.org/pdf/2404.15670). *IEEE Transactions on Knowledge and Data Engineering*, April 2024. [doi:10.1109/TKDE.2024.3389693](https://doi.org/10.1109/TKDE.2024.3389693)
[^11]: Michael Stonebraker and Uğur Çetintemel. [‘One Size Fits All’: An Idea Whose Time Has Come and Gone](https://pages.cs.wisc.edu/~shivaram/cs744-readings/fits_all.pdf). At *21st International Conference on Data Engineering* (ICDE), April 2005. [doi:10.1109/ICDE.2005.1](https://doi.org/10.1109/ICDE.2005.1)
[^12]: Jeffrey Cohen, Brian Dolan, Mark Dunlap, Joseph M. Hellerstein, and Caleb Welton. [MAD Skills: New Analysis Practices for Big Data](https://www.vldb.org/pvldb/vol2/vldb09-219.pdf). *Proceedings of the VLDB Endowment*, volume 2, issue 2, pages 1481–1492, August 2009. [doi:10.14778/1687553.1687576](https://doi.org/10.14778/1687553.1687576)
[^13]: Dan Olteanu. [The Relational Data Borg is Learning](https://www.vldb.org/pvldb/vol13/p3502-olteanu.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, August 2020. [doi:10.14778/3415478.3415572](https://doi.org/10.14778/3415478.3415572)
[^14]: Matt Bornstein, Martin Casado, and Jennifer Li. [Emerging Architectures for Modern Data Infrastructure: 2020](https://future.a16z.com/emerging-architectures-for-modern-data-infrastructure-2020/). *future.a16z.com*, October 2020. Archived at [perma.cc/LF8W-KDCC](https://perma.cc/LF8W-KDCC)
[^15]: Martin Fowler. [DataLake](https://www.martinfowler.com/bliki/DataLake.html). *martinfowler.com*, February 2015. Archived at [perma.cc/4WKN-CZUK](https://perma.cc/4WKN-CZUK)
[^16]: Bobby Johnson and Joseph Adler. [The Sushi Principle: Raw Data Is Better](https://learning.oreilly.com/videos/strata-hadoop/9781491924143/9781491924143-video210840/). At *Strata+Hadoop World*, February 2015.
[^17]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^18]: DataKitchen, Inc. [The DataOps Manifesto](https://dataopsmanifesto.org/en/). *dataopsmanifesto.org*, 2017. Archived at [perma.cc/3F5N-FUQ4](https://perma.cc/3F5N-FUQ4)
[^19]: Tejas Manohar. [What is Reverse ETL: A Definition & Why It’s Taking Off](https://hightouch.io/blog/reverse-etl/). *hightouch.io*, November 2021. Archived at [perma.cc/A7TN-GLYJ](https://perma.cc/A7TN-GLYJ)
[^20]: Simon O’Regan. [Designing Data Products](https://towardsdatascience.com/designing-data-products-b6b93edf3d23). *towardsdatascience.com*, August 2018. Archived at [perma.cc/HU67-3RV8](https://perma.cc/HU67-3RV8)
[^21]: Camille Fournier. [Why is it so hard to decide to buy?](https://skamille.medium.com/why-is-it-so-hard-to-decide-to-buy-d86fee98e88e) *skamille.medium.com*, July 2021. Archived at [perma.cc/6VSG-HQ5X](https://perma.cc/6VSG-HQ5X)
[^22]: David Heinemeier Hansson. [Why we’re leaving the cloud](https://world.hey.com/dhh/why-we-re-leaving-the-cloud-654b47e0). *world.hey.com*, October 2022. Archived at [perma.cc/82E6-UJ65](https://perma.cc/82E6-UJ65)
[^23]: Nima Badizadegan. [Use One Big Server](https://specbranch.com/posts/one-big-server/). *specbranch.com*, August 2022. Archived at [perma.cc/M8NB-95UK](https://perma.cc/M8NB-95UK)
[^24]: Steve Yegge. [Dear Google Cloud: Your Deprecation Policy is Killing You](https://steve-yegge.medium.com/dear-google-cloud-your-deprecation-policy-is-killing-you-ee7525dc05dc). *steve-yegge.medium.com*, August 2020. Archived at [perma.cc/KQP9-SPGU](https://perma.cc/KQP9-SPGU)
[^25]: Alexandre Verbitski, Anurag Gupta, Debanjan Saha, Murali Brahmadesam, Kamal Gupta, Raman Mittal, Sailesh Krishnamurthy, Sandor Maurice, Tengiz Kharatishvili, and Xiaofeng Bao. [Amazon Aurora: Design Considerations for High Throughput Cloud-Native Relational Databases](https://media.amazonwebservices.com/blog/2017/aurora-design-considerations-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1041–1052, May 2017. [doi:10.1145/3035918.3056101](https://doi.org/10.1145/3035918.3056101)
[^26]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047)
[^27]: Midhul Vuppalapati, Justin Miron, Rachit Agarwal, Dan Truong, Ashish Motivala, and Thierry Cruanes. [Building An Elastic Query Engine on Disaggregated Storage](https://www.usenix.org/system/files/nsdi20-paper-vuppalapati.pdf). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020.
[^28]: Nick Van Wiggeren. [The Real Failure Rate of EBS](https://planetscale.com/blog/the-real-fail-rate-of-ebs). *planetscale.com*, March 2025. Archived at [perma.cc/43CR-SAH5](https://perma.cc/43CR-SAH5)
[^29]: Colin Breck. [Predicting the Future of Distributed Systems](https://blog.colinbreck.com/predicting-the-future-of-distributed-systems/). *blog.colinbreck.com*, August 2024. Archived at [perma.cc/K5FC-4XX2](https://perma.cc/K5FC-4XX2)
[^30]: Gwen Shapira. [Compute-Storage Separation Explained](https://www.thenile.dev/blog/storage-compute). *thenile.dev*, January 2023. Archived at [perma.cc/QCV3-XJNZ](https://perma.cc/QCV3-XJNZ)
[^31]: Ravi Murthy and Gurmeet Goindi. [AlloyDB for PostgreSQL under the hood: Intelligent, database-aware storage](https://cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage). *cloud.google.com*, May 2022. Archived at [archive.org](https://web.archive.org/web/20220514021120/https%3A//cloud.google.com/blog/products/databases/alloydb-for-postgresql-intelligent-scalable-storage)
[^32]: Jack Vanlightly. [The Architecture of Serverless Data Systems](https://jack-vanlightly.com/blog/2023/11/14/the-architecture-of-serverless-data-systems). *jack-vanlightly.com*, November 2023. Archived at [perma.cc/UDV4-TNJ5](https://perma.cc/UDV4-TNJ5)
[^33]: Eric Jonas, Johann Schleier-Smith, Vikram Sreekanti, Chia-Che Tsai, Anurag Khandelwal, Qifan Pu, Vaishaal Shankar, Joao Carreira, Karl Krauth, Neeraja Yadwadkar, Joseph E. Gonzalez, Raluca Ada Popa, Ion Stoica, David A. Patterson. [Cloud Programming Simplified: A Berkeley View on Serverless Computing](https://arxiv.org/abs/1902.03383). *arxiv.org*, February 2019.
[^34]: Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy. [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O’Reilly Media, 2016. ISBN: 9781491929124
[^35]: Thomas Limoncelli. [The Time I Stole $10,000 from Bell Labs](https://queue.acm.org/detail.cfm?id=3434773). *ACM Queue*, volume 18, issue 5, November 2020. [doi:10.1145/3434571.3434773](https://doi.org/10.1145/3434571.3434773)
[^36]: Charity Majors. [The Future of Ops Jobs](https://acloudguru.com/blog/engineering/the-future-of-ops-jobs). *acloudguru.com*, August 2020. Archived at [perma.cc/GRU2-CZG3](https://perma.cc/GRU2-CZG3)
[^37]: Boris Cherkasky. [(Over)Pay As You Go for Your Datastore](https://medium.com/riskified-technology/over-pay-as-you-go-for-your-datastore-11a29ae49a8b). *medium.com*, September 2021. Archived at [perma.cc/Q8TV-2AM2](https://perma.cc/Q8TV-2AM2)
[^38]: Shlomi Kushchi. [Serverless Doesn’t Mean DevOpsLess or NoOps](https://thenewstack.io/serverless-doesnt-mean-devopsless-or-noops/). *thenewstack.io*, February 2023. Archived at [perma.cc/3NJR-AYYU](https://perma.cc/3NJR-AYYU)
[^39]: Erik Bernhardsson. [Storm in the stratosphere: how the cloud will be reshuffled](https://erikbern.com/2021/11/30/storm-in-the-stratosphere-how-the-cloud-will-be-reshuffled.html). *erikbern.com*, November 2021. Archived at [perma.cc/SYB2-99P3](https://perma.cc/SYB2-99P3)
[^40]: Benn Stancil. [The data OS](https://benn.substack.com/p/the-data-os). *benn.substack.com*, September 2021. Archived at [perma.cc/WQ43-FHS6](https://perma.cc/WQ43-FHS6)
[^41]: Maria Korolov. [Data residency laws pushing companies toward residency as a service](https://www.csoonline.com/article/3647761/data-residency-laws-pushing-companies-toward-residency-as-a-service.html). *csoonline.com*, January 2022. Archived at [perma.cc/CHE4-XZZ2](https://perma.cc/CHE4-XZZ2)
[^42]: Severin Borenstein. [Can Data Centers Flex Their Power Demand?](https://energyathaas.wordpress.com/2025/04/14/can-data-centers-flex-their-power-demand/) *energyathaas.wordpress.com*, April 2025. Archived at [perma.cc/MUD3-A6FF](https://perma.cc/MUD3-A6FF)
[^43]: Bilge Acun, Benjamin Lee, Fiodar Kazhamiaka, Aditya Sundarrajan, Kiwan Maeng, Manoj Chakkaravarthy, David Brooks, and Carole-Jean Wu. [Carbon Dependencies in Datacenter Design and Management](https://hotcarbon.org/assets/2022/pdf/hotcarbon22-acun.pdf). *ACM SIGENERGY Energy Informatics Review*, volume 3, issue 3, pages 21–26. [doi:10.1145/3630614.3630619](https://doi.org/10.1145/3630614.3630619)
[^44]: Kousik Nath. [These are the numbers every computer engineer should know](https://www.freecodecamp.org/news/must-know-numbers-for-every-computer-engineer/). *freecodecamp.org*, September 2019. Archived at [perma.cc/RW73-36RL](https://perma.cc/RW73-36RL)
[^45]: Joseph M. Hellerstein, Jose Faleiro, Joseph E. Gonzalez, Johann Schleier-Smith, Vikram Sreekanti, Alexey Tumanov, and Chenggang Wu. [Serverless Computing: One Step Forward, Two Steps Back](https://arxiv.org/abs/1812.03651). At *Conference on Innovative Data Systems Research* (CIDR), January 2019.
[^46]: Frank McSherry, Michael Isard, and Derek G. Murray. [Scalability! But at What COST?](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-mcsherry.pdf) At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^47]: Cindy Sridharan. *[Distributed Systems Observability: A Guide to Building Robust Systems](https://unlimited.humio.com/rs/756-LMY-106/images/Distributed-Systems-Observability-eBook.pdf)*. Report, O’Reilly Media, May 2018. Archived at [perma.cc/M6JL-XKCM](https://perma.cc/M6JL-XKCM)
[^48]: Charity Majors. [Observability — A 3-Year Retrospective](https://thenewstack.io/observability-a-3-year-retrospective/). *thenewstack.io*, August 2019. Archived at [perma.cc/CG62-TJWL](https://perma.cc/CG62-TJWL)
[^49]: Benjamin H. Sigelman, Luiz André Barroso, Mike Burrows, Pat Stephenson, Manoj Plakal, Donald Beaver, Saul Jaspan, and Chandan Shanbhag. [Dapper, a Large-Scale Distributed Systems Tracing Infrastructure](https://research.google/pubs/pub36356/). Google Technical Report dapper-2010-1, April 2010. Archived at [perma.cc/K7KU-2TMH](https://perma.cc/K7KU-2TMH)
[^50]: Rodrigo Laigner, Yongluan Zhou, Marcos Antonio Vaz Salles, Yijian Liu, and Marcos Kalinowski. [Data management in microservices: State of the practice, challenges, and research directions](https://www.vldb.org/pvldb/vol14/p3348-laigner.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 13, pages 3348–3361, September 2021. [doi:10.14778/3484224.3484232](https://doi.org/10.14778/3484224.3484232)
[^51]: Jordan Tigani. [Big Data is Dead](https://motherduck.com/blog/big-data-is-dead/). *motherduck.com*, February 2023. Archived at [perma.cc/HT4Q-K77U](https://perma.cc/HT4Q-K77U)
[^52]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O’Reilly Media, 2021. ISBN: 9781492034025
[^53]: Chris Richardson. [Microservices: Decomposing Applications for Deployability and Scalability](https://www.infoq.com/articles/microservices-intro/). *infoq.com*, May 2014. Archived at [perma.cc/CKN4-YEQ2](https://perma.cc/CKN4-YEQ2)
[^54]: Mohammad Shahrad, Rodrigo Fonseca, Íñigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, Ricardo Bianchini. [Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider](https://www.usenix.org/system/files/atc20-shahrad.pdf). At *USENIX Annual Technical Conference* (ATC), July 2020.
[^55]: Luiz André Barroso, Urs Hölzle, and Parthasarathy Ranganathan. [The Datacenter as a Computer: Designing Warehouse-Scale Machines](https://www.morganclaypool.com/doi/10.2200/S00874ED3V01Y201809CAC046), third edition. Morgan & Claypool Synthesis Lectures on Computer Architecture, October 2018. [doi:10.2200/S00874ED3V01Y201809CAC046](https://doi.org/10.2200/S00874ED3V01Y201809CAC046)
[^56]: David Fiala, Frank Mueller, Christian Engelmann, Rolf Riesen, Kurt Ferreira, and Ron Brightwell. [Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing](https://arcb.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/sc12.pdf). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2012. [doi:10.1109/SC.2012.49](https://doi.org/10.1109/SC.2012.49)
[^57]: Anna Kornfeld Simpson, Adriana Szekeres, Jacob Nelson, and Irene Zhang. [Securing RDMA for High-Performance Datacenter Storage Systems](https://www.usenix.org/conference/hotcloud20/presentation/kornfeld-simpson). At *12th USENIX Workshop on Hot Topics in Cloud Computing* (HotCloud), July 2020.
[^58]: Arjun Singh, Joon Ong, Amit Agarwal, Glen Anderson, Ashby Armistead, Roy Bannon, Seb Boving, Gaurav Desai, Bob Felderman, Paulie Germano, Anand Kanagala, Jeff Provost, Jason Simmons, Eiichi Tanda, Jim Wanderer, Urs Hölzle, Stephen Stuart, and Amin Vahdat. [Jupiter Rising: A Decade of Clos Topologies and Centralized Control in Google’s Datacenter Network](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p183.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2785956.2787508](https://doi.org/10.1145/2785956.2787508)
[^59]: Glenn K. Lockwood. [Hadoop’s Uncomfortable Fit in HPC](https://blog.glennklockwood.com/2014/05/hadoops-uncomfortable-fit-in-hpc.html). *glennklockwood.blogspot.co.uk*, May 2014. Archived at [perma.cc/S8XX-Y67B](https://perma.cc/S8XX-Y67B)
[^60]: Cathy O’Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 9780553418811
[^61]: Supreeth Shastri, Vinay Banakar, Melissa Wasserman, Arun Kumar, and Vijay Chidambaram. [Understanding and Benchmarking the Impact of GDPR on Database Systems](https://www.vldb.org/pvldb/vol13/p1064-shastri.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 7, pages 1064–1077, March 2020. [doi:10.14778/3384345.3384354](https://doi.org/10.14778/3384345.3384354)
[^62]: Martin Fowler. [Datensparsamkeit](https://www.martinfowler.com/bliki/Datensparsamkeit.html). *martinfowler.com*, December 2013. Archived at [perma.cc/R9QX-CME6](https://perma.cc/R9QX-CME6)
[^63]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 (General Data Protection Regulation)](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&from=EN). *Official Journal of the European Union* L 119/1, May 2016.


================================================
FILE: content/zh/ch10.md
================================================
---
title: "10. 一致性与共识"
weight: 210
breadcrumbs: false
---

<a id="ch_consistency"></a>

![](/map/ch09.png)

> *一句古老的格言告诫说："千万不要带着两块计时器出海；要么带一块，要么带三块。"*
>
> 弗雷德里克·P·布鲁克斯，《人月神话：软件工程随笔》（1995）

正如在 [第九章](/ch9) 中讨论的，分布式系统中会出现许多问题。如果我们希望服务在出现这些问题时仍能正确工作，就需要找到容错的方法。

我们拥有的最佳容错工具之一是 *复制*。然而，正如我们在 [第六章](/ch6) 中看到的，在多个副本上保存多份数据拷贝会带来不一致的风险。读取可能由一个非最新的副本处理，从而产生过时的结果。如果多个副本可以接受写入，我们必须处理在不同副本上并发写入的值之间的冲突。从高层次来看，处理这些问题有两种相互竞争的理念：

最终一致性
: 在这种理念中，系统被复制这一事实对应用程序是可见的，作为应用程序开发者，你需要处理可能出现的不一致和冲突。这种方法通常用于多主复制（见 ["多主复制"](/ch6#sec_replication_multi_leader)）和无主复制（见 ["无主复制"](/ch6#sec_replication_leaderless)）的系统中。

强一致性
: 这种理念认为应用程序不应该担心复制的内部细节，系统应该表现得就像单节点一样。这种方法的优点是对你（应用程序开发者）来说更简单。缺点是更强的一致性会带来性能成本，并且某些最终一致系统能够容忍的故障会导致强一致系统出现中断。

一如既往，哪种方法更好取决于你的应用程序。如果你有一个应用程序，用户可以在离线状态下对数据进行更改，那么最终一致性是不可避免的，如 ["同步引擎与本地优先软件"](/ch6#sec_replication_offline_clients) 中所讨论的。然而，最终一致性对应用程序来说也可能很难处理。如果你的副本位于具有快速、可靠通信的数据中心，那么强一致性通常是合适的，因为其成本是可以接受的。

在本章中，我们将深入探讨强一致性方法，关注三个领域：

1. 一个挑战是"强一致性"相当模糊，因此我们将制定一个更精确的定义，明确我们想要实现什么：*线性一致性*。
2. 我们将研究生成 ID 和时间戳的问题。这可能听起来与一致性无关，但实际上密切相关。
3. 我们将探讨分布式系统如何在保持容错的同时实现线性一致性；答案是 *共识* 算法。

在此过程中，我们将看到分布式系统中什么是可能的，什么是不可能的，存在一些基本限制。

本章的主题以难以正确实现而著称；我们很容易构建出这样的系统：它在没有故障时表现良好，但在面对设计者没有考虑到的不幸故障组合时却完全崩溃。人们已经发展出大量理论来帮助我们思考这些边界情况，这使我们能够构建可以稳健地容忍故障的系统。

本章只会触及表面：我们将坚持非正式的直觉，避免算法细节、形式化模型和证明。如果你想在共识系统和类似基础设施上进行认真的工作，你需要更深入地研究理论，才有机会让你的系统稳健。与往常一样，本章中的文献参考提供了一些初步的指引。


## 线性一致性 {#sec_consistency_linearizability}

如果你希望复制的数据库尽可能简单易用，你应该让它表现得就像根本没有复制一样。然后用户就不必担心复制延迟、冲突和其他不一致性。这将给我们带来容错的优势，但不会因为必须考虑多个副本而带来复杂性。

这就是 *线性一致性* [^1] 背后的想法（也称为 *原子一致性* [^2]、*强一致性*、*即时一致性* 或 *外部一致性* [^3]）。线性一致性的确切定义相当微妙，我们将在本节的其余部分探讨它。但基本思想是让系统看起来好像只有一份数据副本，并且对它的所有操作都是原子的。有了这个保证，即使实际上可能有多个副本，应用程序也不需要担心它们。

在线性一致系统中，一旦一个客户端成功完成写入，所有从数据库读取的客户端都必须能够看到刚刚写入的值。维护单一数据副本的假象，意味着要保证读取到的是最新值，而不是来自过时的缓存或副本。换句话说，线性一致性是一种 *新鲜度保证*。为了阐明这个想法，让我们看一个非线性一致系统的例子。

{{< figure src="/fig/ddia_1001.png" id="fig_consistency_linearizability_0" caption="图 10-1. 这个系统不是线性一致的，导致体育迷感到困惑。" class="w-full my-4" >}}

[图 10-1](#fig_consistency_linearizability_0) 显示了一个非线性一致的体育网站示例 [^4]。Aaliyah 和 Bryce 坐在同一个房间里，都在查看手机，想要了解他们最喜欢的球队比赛的结果。就在最终比分宣布后，Aaliyah 刷新了页面，看到了获胜者的公告，并兴奋地告诉了 Bryce。Bryce 怀疑地在自己的手机上点击了 *刷新*，但他的请求发送到了一个滞后的数据库副本，因此他的手机显示比赛仍在进行中。

如果 Aaliyah 和 Bryce 同时点击刷新，他们得到两个不同的查询结果就不会那么令人惊讶了，因为他们不知道他们各自的请求在服务器上被处理的确切时间。然而，Bryce 知道他是在听到 Aaliyah 宣布最终比分 *之后* 点击刷新按钮（发起查询）的，因此他期望他的查询结果至少与 Aaliyah 的一样新。他的查询返回过时结果这一事实违反了线性一致性。

### 什么使系统具有线性一致性？ {#sec_consistency_lin_definition}

为了更好地理解线性一致性，让我们再看一些例子。[图 10-2](#fig_consistency_linearizability_1) 显示了三个客户端在线性一致数据库中并发读取和写入同一个对象 *x*。在分布式系统理论中，*x* 被称为 *寄存器*——在实践中，它可能是（例如）键值存储中的一个键、关系数据库中的一行，或者文档数据库中的一个文档。

{{< figure src="/fig/ddia_1002.png" id="fig_consistency_linearizability_1" caption="图 10-2. 如果读取请求与写入请求并发，它可能返回旧值，也可能返回新值。" class="w-full my-4" >}}


为简单起见，[图 10-2](#fig_consistency_linearizability_1) 仅显示了从客户端角度看的请求，而不是数据库的内部。每个条形代表客户端发出的请求，条形的开始是发送请求的时间，条形的结束是客户端收到响应的时间。由于网络延迟可变，客户端不知道数据库确切何时处理了它的请求——它只知道必须在客户端发送请求和接收响应之间的某个时间发生。

在这个例子中，寄存器有两种类型的操作：

* *read*(*x*) ⇒ *v* 表示客户端请求读取寄存器 *x* 的值，数据库返回值 *v*。
* *write*(*x*, *v*) ⇒ *r* 表示客户端请求将寄存器 *x* 设置为值 *v*，数据库返回响应 *r*（可能是 *ok* 或 *error*）。

在 [图 10-2](#fig_consistency_linearizability_1) 中，*x* 的值最初为 0，客户端 C 执行写入请求将其设置为 1。在此期间，客户端 A 和 B 反复轮询数据库以读取最新值。A 和 B 的读取请求可能得到什么响应？

* 客户端 A 的第一个读取操作在写入开始之前完成，因此它必须明确返回旧值 0。
* 客户端 A 的最后一次读取在写入完成后开始，因此如果数据库是线性一致的，它必须明确返回新值 1，因为读取必须在写入之后被处理。
* 与写入操作在时间上重叠的任何读取操作可能返回 0 或 1，因为我们不知道在读取操作被处理时写入是否已经生效。这些操作与写入是 *并发* 的。

然而，这还不足以完全描述线性一致性：如果与写入并发的读取可以返回旧值或新值，那么读者可能会在写入进行时多次看到值在旧值和新值之间来回翻转。这不是我们对模拟"单一数据副本"的系统所期望的。

为了使系统线性一致，我们需要添加另一个约束，如 [图 10-3](#fig_consistency_linearizability_2) 所示。

{{< figure src="/fig/ddia_1003.png" id="fig_consistency_linearizability_2" caption="图 10-3. 任何一次读取返回新值之后，所有后续读取（无论来自同一客户端还是其他客户端）也必须返回新值。" class="w-full my-4" >}}


在线性一致系统中，我们想象必须有某个时间点（在写入操作的开始和结束之间），*x* 的值从 0 原子地翻转到 1。因此，如果一个客户端的读取返回新值 1，所有后续读取也必须返回新值，即使写入操作尚未完成。

这种时序依赖关系在 [图 10-3](#fig_consistency_linearizability_2) 中用箭头表示。客户端 A 是第一个读取新值 1 的。就在 A 的读取返回后，B 开始新的读取。由于 B 的读取严格发生在 A 的读取之后，它也必须返回 1，即使 C 的写入仍在进行中。（这与 [图 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情况相同：在 Aaliyah 读取新值后，Bryce 也期望读取新值。）

我们可以进一步细化这个时序图，以可视化每个操作在某个时间点原子地生效 [^5]，就像 [图 10-4](#fig_consistency_linearizability_3) 中显示的更复杂的例子。在这个例子中，除了 *read* 和 *write* 之外，我们添加了第三种操作类型：

* *cas*(*x*, *v*old, *v*new) ⇒ *r* 表示客户端请求一个原子 *比较并设置* 操作（见 ["条件写入（比较并设置）"](/ch8#sec_transactions_compare_and_set)）。如果寄存器 *x* 的当前值等于 *v*old，它应该原子地设置为 *v*new。如果 *x* 的值与 *v*old 不同，则操作应该保持寄存器不变并返回错误。*r* 是数据库的响应（*ok* 或 *error*）。

[图 10-4](#fig_consistency_linearizability_3) 中的每个操作都用一条垂直线（在每个操作的条形内）标记，表示我们认为操作执行的时间。这些标记按顺序连接起来，结果必须是寄存器的有效读写序列（每次读取必须返回最近写入设置的值）。

线性一致性的要求是连接操作标记的线始终向前移动（从左到右），永不后退。这个要求确保了我们之前讨论的新鲜度保证：一旦写入或读取了新值，所有后续读取都会看到写入的值，直到它再次被覆盖。

{{< figure src="/fig/ddia_1004.png" id="fig_consistency_linearizability_3" caption="图 10-4. 可视化读取和写入看起来生效的时间点。B 的最后一次读取不是线性一致的。" class="w-full my-4" >}}


[图 10-4](#fig_consistency_linearizability_3) 中有一些有趣的细节需要指出：

* 首先客户端 B 发送了读取 *x* 的请求，然后客户端 D 发送了将 *x* 设置为 0 的请求，然后客户端 A 发送了将 *x* 设置为 1 的请求。然而，返回给 B 的读取值是 1（A 写入的值）。这是可以的：这意味着数据库首先处理了 D 的写入，然后是 A 的写入，最后是 B 的读取。虽然这不是发送请求的顺序，但这是一个可接受的顺序，因为这三个请求是并发的。也许 B 的读取请求在网络中稍有延迟，因此它在两次写入之后才到达数据库。
* 客户端 B 的读取在客户端 A 收到数据库的响应之前返回了 1，表示值 1 的写入成功。这也是可以的：这只是意味着从数据库到客户端 A 的 *ok* 响应在网络中稍有延迟。
* 这个模型不假设任何事务隔离：另一个客户端可以随时更改值。例如，C 首先读取 1，然后读取 2，因为该值在两次读取之间被 B 更改了。原子比较并设置（*cas*）操作可用于检查值是否未被另一个客户端并发更改：B 和 C 的 *cas* 请求成功，但 D 的 *cas* 请求失败（到数据库处理它时，*x* 的值不再是 0）。
* 客户端 B 的最后一次读取（在阴影条中）不是线性一致的。该操作与 C 的 *cas* 写入并发，后者将 *x* 从 2 更新到 4。在没有其他请求的情况下，B 的读取返回 2 是可以的。然而，客户端 A 在 B 的读取开始之前已经读取了新值 4，因此 B 不允许读取比 A 更旧的值。同样，这与 [图 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情况相同。

这就是线性一致性背后的直觉；形式化定义 [^1] 更精确地描述了它。可以（尽管计算成本高昂）通过记录所有请求和响应的时序，并检查它们是否可以排列成有效的顺序序列来测试系统的行为是否线性一致 [^6] [^7]。

就像除了可串行化之外还有各种弱隔离级别用于事务（见 ["弱隔离级别"](/ch8#sec_transactions_isolation_levels)），除了线性一致性之外，复制系统也有各种较弱的一致性模型 [^8]。实际上，我们在 ["复制延迟问题"](/ch6#sec_replication_lag) 中看到的 *写后读*、*单调读* 和 *一致性前缀读* 属性就是这种较弱一致性模型的例子。线性一致性保证所有这些较弱的属性，以及更多。在本章中，我们将重点关注线性一致性，它是最常用的最强一致性模型。


--------

> [!TIP] 线性一致性与可串行化

线性一致性很容易与可串行化混淆（见 ["可串行化"](/ch8#sec_transactions_serializability)），因为这两个词似乎都意味着类似"可以按顺序排列"的东西。然而，它们是完全不同的保证，区分它们很重要：

可串行化
: 可串行化是事务的隔离属性，其中每个事务可能读取和写入 *多个对象*（行、文档、记录）。它保证事务的行为与它们按 *某种* 串行顺序执行时相同：也就是说，就好像你首先执行一个事务的所有操作，然后执行另一个事务的所有操作，依此类推，而不交错它们。该串行顺序可以与事务实际运行的顺序不同 [^9]。

线性一致性
: 线性一致性是对寄存器（*单个对象*）的读写保证。它不将操作分组到事务中，因此它不能防止涉及多个对象的问题，如写偏差（见 ["写偏差和幻读"](/ch8#sec_transactions_write_skew)）。然而，线性一致性是一个 *新鲜度* 保证：它要求如果一个操作在另一个操作开始之前完成，那么后一个操作必须观察到至少与前一个操作一样新的状态。可串行化没有这个要求：例如，可串行化允许过时读取 [^10]。

（*顺序一致性* 又是另外一回事 [^8]，但我们不会在这里讨论它。）

数据库可能同时提供可串行化和线性一致性，这种组合称为 *严格可串行化* 或 *强单副本可串行化*（*strong-1SR*）[^11] [^12]。单节点数据库通常是线性一致的。对于使用乐观方法（如可串行化快照隔离）的分布式数据库（见 ["可串行化快照隔离（SSI）"](/ch8#sec_transactions_ssi)），情况更加复杂：例如，CockroachDB 提供可串行化和对读取的一些新鲜度保证，但不是严格可串行化 [^13]，因为这需要事务之间进行昂贵的协调 [^14]。

也可以将较弱的隔离级别与线性一致性结合，或将较弱的一致性模型与可串行化结合；实际上，一致性模型和隔离级别可以在很大程度上相互独立地选择 [^15] [^16]。

--------

### 依赖线性一致性 {#sec_consistency_linearizability_usage}

在什么情况下线性一致性有用？查看体育比赛的最终比分也许是一个无关紧要的例子：过时几秒钟的结果在这种情况下不太可能造成任何实际伤害。然而，有几个领域中线性一致性是使系统正确工作的重要要求。

#### 锁定与领导者选举 {#locking-and-leader-election}

使用单主复制的系统需要确保确实只有一个主节点，而不是多个（脑裂）。选举领导者的一种方法是使用租约：每个启动的节点都尝试获取租约，成功的节点成为领导者 [^17]。无论这种机制如何实现，它都必须是线性一致的：两个不同的节点不应该能够同时获取租约。

像 Apache ZooKeeper [^18] 和 etcd 这样的协调服务通常用于实现分布式租约和领导者选举。它们使用共识算法以容错的方式实现线性一致的操作（我们将在本章后面讨论这些算法）。要正确地实现租约和领导者选举，仍然有许多微妙的细节需要处理（例如，参见 ["分布式锁和租约"](/ch9#sec_distributed_lock_fencing) 中的栅栏问题），像 Apache Curator 这样的库通过在 ZooKeeper 之上提供更高级别的配方来提供帮助。然而，线性一致的存储服务是这些协调任务的基础。

--------

> [!NOTE]
> 严格来说，ZooKeeper 提供线性一致的写入，但读取可能是过时的，因为不能保证它们由当前领导者提供 [^18]。etcd 从版本 3 开始默认提供线性一致的读取。

--------


分布式锁也在一些分布式数据库中以更细粒度的级别使用，例如 Oracle Real Application Clusters (RAC) [^19]。RAC 对每个磁盘页使用一个锁，多个节点共享对同一磁盘存储系统的访问。由于这些线性一致的锁位于事务执行的关键路径上，RAC 部署通常具有专用的集群互连网络用于数据库节点之间的通信。

#### 约束与唯一性保证 {#sec_consistency_uniqueness}

唯一性约束在数据库中很常见：例如，用户名或电子邮件地址必须唯一标识一个用户，在文件存储服务中不能有两个具有相同路径和文件名的文件。如果你想在数据写入时强制执行此约束（这样如果两个人同时尝试创建具有相同名称的用户或文件，其中一个将返回错误），你需要线性一致性。

这种情况实际上类似于锁：当用户注册你的服务时，你可以认为他们获取了所选用户名的"锁"。该操作也非常类似于原子比较并设置，将用户名设置为声明它的用户的 ID，前提是用户名尚未被占用。

如果你想确保银行账户余额永远不会变为负数，或者你不会销售超过仓库库存的物品，或者两个人不会同时预订同一航班或剧院的同一座位，也会出现类似的问题。这些约束都要求有一个所有节点都同意的单一最新值（账户余额、库存水平、座位占用情况）。

在实际应用中，有时可以接受宽松地对待这些约束（例如，如果航班超售，你可以将客户转移到其他航班，并为不便提供补偿）。在这种情况下，可能不需要线性一致性，我们将在 ["时效性与完整性"](/ch13#sec_future_integrity) 中讨论这种宽松解释的约束。

然而，硬唯一性约束，例如你通常在关系数据库中找到的约束，需要线性一致性。其他类型的约束，例如外键或属性约束，可以在没有线性一致性的情况下实现 [^20]。

#### 跨通道时序依赖 {#cross-channel-timing-dependencies}

注意 [图 10-1](#fig_consistency_linearizability_0) 中的一个细节：如果 Aaliyah 没有大声说出比分，Bryce 就不会知道他的查询结果是过时的。他只会在几秒钟后再次刷新页面，最终看到最终比分。线性一致性违规之所以被注意到，只是因为系统中有一个额外的通信通道（Aaliyah 的声音到 Bryce 的耳朵）。

类似的情况可能出现在计算机系统中。例如，假设你有一个网站，用户可以上传视频，后台进程将视频转码为较低质量，以便在慢速互联网连接上流式传输。该系统的架构和数据流如 [图 10-5](#fig_consistency_transcoder) 所示。

视频转码器需要明确指示执行转码作业，此指令通过消息队列从 Web 服务器发送到转码器（见 ["消息传递系统"](/ch12#sec_stream_messaging)）。Web 服务器不会将整个视频放在队列中，因为大多数消息代理都是为小消息设计的，而视频可能有许多兆字节大小。相反，视频首先写入文件存储服务，写入完成后，转码指令被放入队列。

{{< figure src="/fig/ddia_1005.png" id="fig_consistency_transcoder" caption="图 10-5. Web 服务器和视频转码器同时通过文件存储和消息队列进行通信，这可能引发竞态条件。" class="w-full my-4" >}}


如果文件存储服务是线性一致的，那么这个系统应该工作正常。如果它不是线性一致的，就存在竞态条件的风险：消息队列（[图 10-5](#fig_consistency_transcoder) 中的步骤 3 和 4）可能比存储服务内部的复制更快。在这种情况下，当转码器获取原始视频（步骤 5）时，它可能会看到文件的旧版本，或者根本看不到任何内容。如果它处理视频的旧版本，文件存储中的原始视频和转码视频将永久不一致。

这个问题的出现是因为 Web 服务器和转码器之间有两个不同的通信通道：文件存储和消息队列。如果没有线性一致性的新鲜度保证，这两个通道之间可能存在竞态条件。这种情况类似于 [图 10-1](#fig_consistency_linearizability_0)，其中也存在两个通信通道之间的竞态条件：数据库复制和 Aaliyah 嘴巴到 Bryce 耳朵之间的现实音频通道。

如果你有一个可以接收推送通知的移动应用程序，并且应用程序在收到推送通知时从服务器获取一些数据，就会发生类似的竞态条件。如果数据获取可能发送到滞后的副本，可能会发生推送通知快速通过，但后续获取没有看到推送通知所涉及的数据。

线性一致性不是避免这种竞态条件的唯一方法，但它是最容易理解的。如果你控制额外的通信通道（如消息队列的情况，但不是 Aaliyah 和 Bryce 的情况），你可以使用类似于我们在 ["读己之写"](/ch6#sec_replication_ryw) 中讨论的替代方法，但代价是额外的复杂性。


### 实现线性一致性系统 {#sec_consistency_implementing_linearizable}

现在我们已经看了线性一致性有用的几个例子，让我们思考如何实现一个提供线性一致语义的系统。

由于线性一致性本质上意味着"表现得好像只有一份数据副本，并且对它的所有操作都是原子的"，最简单的答案是真的只使用一份数据副本。然而，这种方法将无法容忍故障：如果持有该副本的节点失败，数据将丢失，或者至少在节点重新启动之前无法访问。

让我们重新审视 [第六章](/ch6) 中的复制方法，并比较它们是否可以实现线性一致：

单主复制（可能线性一致）
: 在单主复制系统中，主节点拥有用于写入的数据主副本，备库在其他节点上维护数据副本。只要你在主节点上执行所有读写操作，它们很可能是线性一致的。然而，这假设你确定知道谁是主节点。如 ["分布式锁和租约"](/ch9#sec_distributed_lock_fencing) 中所讨论的，一个节点很可能认为自己是主节点，而实际上并不是。如果这个“妄想中的主节点”继续处理请求，很可能会违反线性一致性 [^21]。使用异步复制时，故障切换甚至可能丢失已提交的写入，这违反了持久性和线性一致性。

    对单主数据库进行分片，每个分片有一个单独的主节点，不会影响线性一致性，因为它只是单对象保证。跨分片事务是另一回事（见 ["分布式事务"](/ch8#sec_transactions_distributed)）。

共识算法（可能线性一致）
: 一些共识算法本质上是带有自动领导者选举和故障切换的单主复制。它们经过精心设计以防止脑裂，使它们能够安全地实现线性一致的存储。例如，ZooKeeper 使用 Zab 共识算法 [^22]，etcd 使用 Raft [^23]。然而，系统使用了共识并不能保证其上的所有操作都是线性一致的：如果它允许在不检查节点是否仍然是领导者的情况下在该节点上读取，那么当刚刚选出了新的领导者时，读取的结果可能是过时的。

多主复制（非线性一致）
: 具有多主复制的系统通常不是线性一致的，因为它们在多个节点上并发处理写入，并将它们异步复制到其他节点。因此，它们可能产生需要解决的冲突写入（见 ["处理冲突写入"](/ch6#sec_replication_write_conflicts)）。

无主复制（可能非线性一致）
: 对于具有无主复制的系统（Dynamo 风格；见 ["无主复制"](/ch6#sec_replication_leaderless)），人们有时声称可以通过要求仲裁读写（*w* + *r* > *n*）来获得"强一致性"。根据确切的算法，以及你如何定义强一致性，这并不完全正确。

    基于日历时钟的"最后写入获胜"冲突解决方法（例如，在 Cassandra 和 ScyllaDB 中）几乎肯定是非线性一致的，因为时钟时间戳由于时钟偏差而无法保证与实际事件顺序一致（见 ["依赖同步时钟"](/ch9#sec_distributed_clocks_relying)）。即使使用仲裁，也可能出现非线性一致的行为，如下一节所示。

#### 线性一致性与仲裁 {#sec_consistency_quorum_linearizable}

直观地说，在 Dynamo 风格的模型中，仲裁读写似乎应该是线性一致的。然而，当我们有可变的网络延迟时，可能会出现竞态条件，如 [图 10-6](#fig_consistency_leaderless) 所示。

{{< figure src="/fig/ddia_1006.png" id="fig_consistency_leaderless" caption="图 10-6. 如果网络延迟是可变的，仲裁不足以确保线性一致性。" class="w-full my-4" >}}


在 [图 10-6](#fig_consistency_leaderless) 中，*x* 的初始值为 0，写入客户端通过向所有三个副本发送写入（*n* = 3，*w* = 3）将 *x* 更新为 1。与此同时，客户端 A 从两个节点组成的仲裁（*r* = 2）读取，并在其中一个节点上看到新值 1。同样与写入并发的客户端 B 从另外两个节点组成的仲裁读取，并从两者都得到旧值 0。

仲裁条件得到满足（*w* + *r* > *n*），但这种执行仍然不是线性一致的：B 的请求在 A 的请求完成后开始，但 B 返回旧值而 A 返回新值。（这又是 [图 10-1](#fig_consistency_linearizability_0) 中 Aaliyah 和 Bryce 的情况。）

可以使 Dynamo 风格的仲裁线性一致，但代价是降低性能：读者必须同步执行读修复（见 ["追赶错过的写入"](/ch6#sec_replication_read_repair)），然后才能将结果返回给应用程序 [^24]。此外，在写入之前，写入者必须读取节点仲裁的最新状态以获取任何先前写入的最新时间戳，并确保新写入具有更大的时间戳 [^25] [^26]。然而，Riak 由于性能损失而不执行同步读修复。Cassandra 确实等待仲裁读取时的读修复完成 [^27]，但由于它使用日历时钟作为时间戳而失去了线性一致性。

此外，只有线性一致的读写操作可以以这种方式实现；线性一致的比较并设置操作不能，因为它需要共识算法 [^28]。

总之，最安全的假设是，具有 Dynamo 风格复制的无主系统不提供线性一致性，即使使用仲裁读写。

### 线性一致性的代价 {#sec_linearizability_cost}

由于某些复制方法可以提供线性一致性而其他方法不能，因此更深入地探讨线性一致性的利弊是很有趣的。

我们已经在 [第六章](/ch6) 中讨论了不同复制方法的一些用例；例如，我们看到多主复制通常是多区域复制的良好选择（见 ["地理分布式操作"](/ch6#sec_replication_multi_dc)）。[图 10-7](#fig_consistency_cap_availability) 展示了这种部署的示例。

{{< figure src="/fig/ddia_1007.png" id="fig_consistency_cap_availability" caption="图 10-7. 如果客户端由于网络分区而无法联系足够的副本，它们就无法处理写入。" class="w-full my-4" >}}


考虑如果两个区域之间出现网络中断会发生什么。让我们假设每个区域内的网络正常工作，客户端可以到达其本地区域，但这些区域之间无法相互连接。这被称为 *网络分区*。

使用多主数据库，每个区域可以继续正常运行：由于来自一个区域的写入被异步复制到另一个区域，写入只是排队并在网络连接恢复时交换。

另一方面，如果使用单主复制，那么主节点必须在其中一个区域。任何写入和任何线性一致的读取都必须发送到主节点。因此，对于连接到备库所在区域的任何客户端，这些读写请求都必须通过网络同步发送到主节点区域。

如果在单主设置中区域之间的网络中断，连接到备库区域的客户端无法联系主节点，因此它们既不能对数据库进行任何写入，也不能进行任何线性一致的读取。它们仍然可以从备库读取，但这些读取可能是过时的（非线性一致）。如果应用程序需要线性一致的读写，网络中断会导致应用程序在无法联系主节点的区域中变得不可用。

如果客户端可以直接连接到主节点区域，这不是问题，因为应用程序在那里继续正常工作。但只能访问备库区域的客户端将在网络链路修复之前遇到中断。

#### CAP 定理 {#the-cap-theorem}

这个问题不仅仅是单主和多主复制的结果：任何线性一致的数据库都有这个问题，无论它如何实现。这个问题也不特定于多区域部署，而是可以发生在任何不可靠的网络上，即使在一个区域内。权衡如下：

* 如果你的应用程序 *需要* 线性一致性，并且某些副本由于网络问题与其他副本断开连接，那么某些副本在断开连接时无法处理请求：它们必须等待网络问题修复，或者返回错误（无论哪种方式，它们都变得 *不可用*）。这种选择有时被称为 *CP*（在网络分区下一致）。
* 如果你的应用程序 *不需要* 线性一致性，那么它可以以一种方式编写，使每个副本可以独立处理请求，即使它与其他副本断开连接（例如，多主）。在这种情况下，应用程序可以在面对网络问题时保持 *可用*，但其行为不是线性一致的。这种选择被称为 *AP*（在网络分区下可用）。

因此，不需要线性一致性的应用程序可以更好地容忍网络问题。这种见解通常被称为 *CAP 定理* [^29] [^30] [^31] [^32]，由 Eric Brewer 在 2000 年命名，尽管这种权衡自 1970 年代以来就为分布式数据库设计者所知 [^33] [^34] [^35]。

CAP 最初是作为经验法则提出的，没有精确的定义，目的是开始关于数据库中权衡的讨论。当时，许多分布式数据库专注于在具有共享存储的机器集群上提供线性一致语义 [^19]，CAP 鼓励数据库工程师探索更广泛的分布式无共享系统设计空间，这些系统更适合实现大规模 Web 服务 [^36]。CAP 在这种文化转变方面值得称赞——它帮助触发了 NoSQL 运动，这是 2000 年代中期左右的一系列新数据库技术。

> [!TIP] 无用的 CAP 定理

CAP 有时被表述为 *一致性、可用性、分区容错性：从 3 个中选择 2 个*。不幸的是，这样表述是误导性的 [^32]，因为网络分区是一种故障，所以它们不是你可以选择的：无论你喜欢与否，它们都会发生。

当网络正常工作时，系统可以同时提供一致性（线性一致性）和完全可用性。当发生网络故障时，你必须在线性一致性或完全可用性之间进行选择。因此，CAP 的更好表述方式是 *分区时要么一致要么可用* [^37]。更可靠的网络需要更少地做出这种选择，但在某个时候这种选择是不可避免的。

CP/AP 分类方案还有几个进一步的缺陷 [^4]。*一致性* 被形式化为线性一致性（定理没有说任何关于较弱一致性模型的内容），*可用性* 的形式化 [^30] 与该术语的通常含义不匹配 [^38]。许多高可用（容错）系统实际上不符合 CAP 对可用性的特殊定义。此外，一些系统设计者选择（有充分理由）既不提供线性一致性也不提供 CAP 定理假设的可用性形式，因此这些系统既不是 CP 也不是 AP [^39] [^40]。

总的来说，关于 CAP 有很多误解和混淆，它并不能帮助我们更好地理解系统，因此最好避免使用 CAP。

正式定义的 CAP 定理 [^30] 范围非常狭窄：它只考虑一种一致性模型（即线性一致性）和一种故障（网络分区，根据 Google 的数据，这是不到 8% 事件的原因 [^41]）。它没有说任何关于网络延迟、死节点或其他权衡的内容。因此，尽管 CAP 在历史上具有影响力，但对于设计系统几乎没有实际价值 [^4] [^38]。

已经有人尝试对 CAP 进行泛化。例如，*PACELC 原则* 观察到系统设计者也可能选择在网络正常工作时削弱一致性以减少延迟 [^39] [^40] [^42]。因此，在网络分区（P）期间，我们需要在可用性（A）和一致性（C）之间进行选择；否则（E），当没有分区时，我们可能在低延迟（L）和一致性（C）之间进行选择。然而，这个定义继承了 CAP 的几个问题，例如一致性和可用性的反直觉定义。

分布式系统中有许多更有趣的不可能性结果 [^43]，CAP 现在已被更精确的结果所取代 [^44] [^45]，因此它今天主要具有历史意义。

#### 线性一致性与网络延迟 {#linearizability-and-network-delays}

尽管线性一致性是一个有用的保证，但令人惊讶的是，实际上很少有系统是线性一致的。例如，即使现代多核 CPU 上的 RAM 也不是线性一致的 [^46]：如果在一个 CPU 核心上运行的线程写入内存地址，而另一个 CPU 核心上的线程随后读取相同的地址，不能保证读取第一个线程写入的值（除非使用 *内存屏障* 或 *栅栏* [^47]）。

这种行为的原因是每个 CPU 核心都有自己的内存缓存和存储缓冲区。默认情况下，内存访问首先进入缓存，任何更改都异步写出到主内存。由于访问缓存中的数据比访问主内存快得多 [^48]，这个特性对于现代 CPU 的良好性能至关重要。然而，现在有多份数据副本（一份在主内存中，可能还有几份在各种缓存中），这些副本是异步更新的，因此线性一致性丢失了。

为什么要做出这种权衡？用 CAP 定理来为多核内存一致性模型辩护是没有意义的：在一台计算机内，我们通常假设可靠的通信，我们不期望一个 CPU 核心在与计算机其余部分断开连接的情况下能够继续正常运行。放弃线性一致性的原因是 *性能*，而不是容错 [^39]。

许多选择不提供线性一致保证的分布式数据库也是如此：它们这样做主要是为了提高性能，而不是为了容错 [^42]。线性一致性很慢——这在任何时候都是真的，不仅在网络故障期间。

我们能否找到更高效的线性一致存储实现？答案似乎是否定的：Attiya 和 Welch [^49] 证明，如果你想要线性一致性，读写请求的响应时间至少与网络中延迟的不确定性成正比。在具有高度可变延迟的网络中，例如大多数计算机网络（见 ["超时和无界延迟"](/ch9#sec_distributed_queueing)），线性一致读写的响应时间不可避免地会很高。更快的线性一致性算法不存在，但较弱的一致性模型可能会快得多，因此这种权衡对于延迟敏感的系统很重要。在 ["时效性与完整性"](/ch13#sec_future_integrity) 中，我们将讨论一些在不牺牲正确性的情况下避免线性一致性的方法。


## ID 生成器和逻辑时钟 {#sec_consistency_logical}

在许多应用程序中，你需要在创建数据库记录时为它们分配某种唯一的 ID，这给了你一个可以引用这些记录的主键。在单节点数据库中，通常使用自增整数，它的优点是只需要 64 位（如果你确定永远不会有超过 40 亿条记录，甚至可以使用 32 位，但这是有风险的）来存储。

这种自增 ID 的另一个优点是，ID 的顺序告诉你记录创建的顺序。例如，[图 10-8](#fig_consistency_id_generator) 显示了一个聊天应用程序，它在发布聊天消息时为其分配自增 ID。然后，你可以按 ID 递增的顺序显示消息，生成的聊天线程将有意义：Aaliyah 发布了一个被分配 ID 1 的问题，而 Bryce 对该问题的回答被分配了一个更大的 ID，即 3。

{{< figure src="/fig/ddia_1008.png" id="fig_consistency_id_generator" caption="图 10-8. 使用自增 ID 为聊天应用中的消息排序。" class="w-full my-4" >}}


这个单节点 ID 生成器是线性一致系统的另一个例子。每个获取 ID 的请求都是一个原子地递增计数器并返回旧计数器值的操作（*获取并增加* 操作）；线性一致性确保如果 Aaliyah 的消息发布在 Bryce 的发布开始之前完成，那么 Bryce 的 ID 必须大于 Aaliyah 的。[图 10-8](#fig_consistency_id_generator) 中 Aaliyah 和 Caleb 的消息是并发的，因此线性一致性不指定它们的 ID 必须如何排序，只要它们是唯一的。

内存中的单节点 ID 生成器很容易实现：你可以使用 CPU 提供的原子递增指令，它允许多个线程安全地递增同一个计数器。使计数器持久化需要更多的努力，这样节点就可以崩溃并重新启动而不重置计数器值，这将导致重复的 ID。但真正的问题是：

* 单节点 ID 生成器不具容错性，因为该节点是单点故障。
* 如果你想在另一个区域创建记录，速度会很慢，因为你可能必须往返地球的另一端才能获得 ID。
* 如果你有高写入吞吐量，该单个节点可能成为瓶颈。

你可以考虑各种 ID 生成器的替代选项：

分片 ID 分配
: 你可以有多个分配 ID 的节点——例如，一个只生成偶数，一个只生成奇数。一般来说，你可以在 ID 中保留一些位来包含分片编号。这些 ID 仍然紧凑，但你失去了排序属性：例如，如果你有 ID 为 16 和 17 的聊天消息，你不知道消息 16 是否实际上是先发送的，因为 ID 是由不同的节点分配的，其中一个节点可能领先于另一个。

预分配 ID 块
: 不是从单节点 ID 生成器请求单个 ID，它可以分发 ID 块。例如，节点 A 可能声明从 1 到 1,000 的 ID 块，节点 B 可能声明从 1,001 到 2,000 的块。然后每个节点可以独立地从其块中分发 ID，并在其序列号供应开始不足时从单节点 ID 生成器请求新块。但是，这种方案也不能确保正确的排序：可能会发生这样的情况，一条消息被分配了 1,001 到 2,000 范围内的 ID，而后来的消息被分配了 1 到 1,000 范围内的 ID，如果 ID 是由不同的节点分配的。

随机 UUID
: 你可以使用 *通用唯一标识符*（UUID），也称为 *全局唯一标识符*（GUID）。它们的一大优点是可以在任何节点上本地生成，无需通信，但它们需要更多空间（128 位）。有几种不同版本的 UUID；最简单的是版本 4，它本质上是一个如此长的随机数，以至于两个节点选择相同的可能性非常小。不幸的是，这些 ID 的顺序也是随机的，因此比较两个 ID 不会告诉你哪个更新。

加上唯一性保证的时钟时间戳
: 如果你的节点的日历时钟使用 NTP 保持大致正确，你可以通过将该时钟的时间戳放在最高有效位中，并用额外信息填充剩余位来生成 ID，这些额外信息确保即使时间戳本身不唯一，ID 也是唯一的——例如，分片编号和每分片递增序列号，或长随机值。这种方法用于版本 7 UUID [^50]、Twitter 的 Snowflake [^51]、ULID [^52]、Hazelcast 的 Flake ID 生成器、MongoDB ObjectID 和许多类似方案 [^50]。你可以在应用程序代码或数据库中实现这些 ID 生成器 [^53]。

所有这些方案都生成唯一的 ID（至少有足够高的概率，使冲突极其罕见），但它们对 ID 的排序保证比单节点自增方案弱得多。

如 ["为事件排序的时间戳"](/ch9#sec_distributed_lww) 中所讨论的，时钟时间戳最多只能提供近似排序：如果较早的写入从稍快的时钟获得时间戳，而较晚写入的时间戳来自稍慢的时钟，则时间戳顺序可能与事件实际发生的顺序不一致。由于使用非单调时钟而导致的时钟跳跃，即使单个节点生成的时间戳也可能排序错误。因此，基于时钟时间的 ID 生成器不太可能是线性一致的。

你可以通过依赖高精度时钟同步，使用原子钟或 GPS 接收器来减少这种排序不一致。但如果能够在不依赖特殊硬件的情况下生成唯一且正确排序的 ID 也会很好。这就是 *逻辑时钟* 的用途。

### 逻辑时钟 {#sec_consistency_timestamps}

在 ["不可靠的时钟"](/ch9#sec_distributed_clocks) 中，我们讨论了日历时钟和单调时钟。这两种都是 *物理时钟*：它们测量经过的秒数（或毫秒、微秒等）。

在分布式系统中，通常还使用另一种时钟，称为 *逻辑时钟*。物理时钟是计算已经过的秒数的硬件设备，而逻辑时钟是计算已发生事件的算法。来自逻辑时钟的时间戳因此不会告诉你现在几点，但你 *可以* 比较来自逻辑时钟的两个时间戳，以判断哪个更早，哪个更晚。

逻辑时钟的要求通常是：

* 其时间戳紧凑（大小为几个字节）且唯一；
* 你可以比较任意两个时间戳（即它们是 *全序* 的）；并且
* 时间戳的顺序与因果关系 *一致*：如果操作 A 发生在 B 之前，那么 A 的时间戳小于 B 的时间戳。（我们之前在 ["“先发生”关系与并发"](/ch6#sec_replication_happens_before) 中讨论了因果关系。）

单节点 ID 生成器满足这些要求，但我们刚刚讨论的分布式 ID 生成器不满足因果排序要求。

#### Lamport 时间戳 {#lamport-timestamps}

幸运的是，有一种生成逻辑时间戳的简单方法，它与因果关系 *一致*，你可以将其用作分布式 ID 生成器。它被称为 *Lamport 时钟*，由 Leslie Lamport 在 1978 年提出 [^54]，现在是分布式系统领域被引用最多的论文之一。

[图 10-9](#fig_consistency_lamport_ts) 显示了 Lamport 时钟如何在 [图 10-8](#fig_consistency_id_generator) 的聊天示例中工作。每个节点都有一个唯一标识符，在 [图 10-9](#fig_consistency_lamport_ts) 中是名称"Aaliyah"、"Bryce"或"Caleb"，但在实践中可能是随机 UUID 或类似的东西。此外，每个节点都保留它已处理的操作数的计数器。Lamport 时间戳就是一对（*计数器*，*节点 ID*）。两个节点有时可能具有相同的计数器值，但通过在时间戳中包含节点 ID，每个时间戳都是唯一的。

{{< figure src="/fig/ddia_1009.png" id="fig_consistency_lamport_ts" caption="图 10-9. Lamport 时间戳提供与因果关系一致的全序。" class="w-full my-4" >}}


每次节点生成时间戳时，它都会递增其计数器值并使用新值。此外，每次节点看到来自另一个节点的时间戳时，如果该时间戳中的计数器值大于其本地计数器值，它会将其本地计数器增加到与时间戳中的值匹配。

在 [图 10-9](#fig_consistency_lamport_ts) 中，Aaliyah 在发布自己的消息时还没有看到 Caleb 的消息，反之亦然。假设两个用户都以初始计数器值 0 开始，因此都递增其本地计数器并将新计数器值 1 附加到其消息。当 Bryce 收到这些消息时，他将本地计数器值增加到 1。最后，Bryce 向 Aaliyah 的消息发送回复，为此他递增本地计数器并将新值 2 附加到消息。

要比较两个 Lamport 时间戳，我们首先比较它们的计数器值：例如，(2, "Bryce") 大于 (1, "Aaliyah")，也大于 (1, "Caleb")。如果两个时间戳具有相同的计数器，我们改为比较它们的节点 ID，使用通常的字典序字符串比较。因此，此示例中的时间戳顺序是 (1, "Aaliyah") < (1, "Caleb") < (2, "Bryce")。

#### 混合逻辑时钟 {#hybrid-logical-clocks}

Lamport 时间戳擅长捕获事物发生的顺序，但它们有一些限制：

* 由于它们与物理时间没有直接关系，你不能使用它们来查找，比如说，在特定日期发布的所有消息——你需要单独存储物理时间。
* 如果两个节点从不通信，一个节点的计数器递增将永远不会反映在另一个节点的计数器中。因此，可能会发生这样的情况，即在不同节点上大约同一时间生成的事件具有极不相同的计数器值。

*混合逻辑时钟* 结合了物理日历时钟的优势和 Lamport 时钟的排序保证 [^55]。像物理时钟一样，它计算秒或微秒。像 Lamport 时钟一样，当一个节点看到来自另一个节点的时间戳大于其本地时钟值时，它将自己的本地值向前移动以匹配另一个节点的时间戳。因此，如果一个节点的时钟运行得很快，其他节点在通信时也会类似地向前移动它们的时钟。

每次生成混合逻辑时钟的时间戳时，它也会递增，这确保时钟单调向前移动，即使底层物理时钟由于 NTP 调整而向后跳跃。因此，混合逻辑时钟可能略微领先于底层物理时钟。算法的细节确保这种差异尽可能小。

因此，你可以将混合逻辑时钟的时间戳几乎像传统日历时钟的时间戳一样对待，具有其排序与先发生关系一致的附加属性。它不依赖于任何特殊硬件，只需要大致同步的时钟。例如，CockroachDB 使用混合逻辑时钟。

#### Lamport/混合逻辑时钟 vs. 向量时钟 {#lamporthybrid-logical-clocks-vs-vector-clocks}

在 ["多版本并发控制（MVCC）"](/ch8#sec_transactions_snapshot_impl) 中，我们讨论了快照隔离通常是如何实现的：本质上，通过给每个事务一个事务 ID，并允许每个事务看到由 ID 较低的事务进行的写入，但使 ID 较高的事务的写入不可见。Lamport 时钟和混合逻辑时钟是生成这些事务 ID 的好方法，因为它们确保快照与因果关系一致 [^56]。

当并发生成多个时间戳时，这些算法会任意排序它们。这意味着当你查看两个时间戳时，你通常无法判断它们是并发生成的还是一个发生在另一个之前。（在 [图 10-9](#fig_consistency_lamport_ts) 的示例中，你实际上可以判断 Aaliyah 和 Caleb 的消息必须是并发的，因为它们具有相同的计数器值，但当计数器值不同时，你无法判断它们是否并发。）

如果你想能够确定记录何时并发创建，你需要不同的算法，例如 *向量时钟*。缺点是向量时钟的时间戳要大得多——可能是系统中每个节点一个整数。有关检测并发的更多详细信息，请参见 ["检测并发写入"](/ch6#sec_replication_concurrent)。

### 线性一致的 ID 生成器 {#sec_consistency_linearizable_id}

尽管 Lamport 时钟和混合逻辑时钟提供了有用的排序保证，但该排序仍然弱于我们之前讨论的线性一致单节点 ID 生成器。回想一下，线性一致性要求如果请求 A 在请求 B 开始之前完成，那么 B 必须具有更高的 ID，即使 A 和 B 从未相互通信。另一方面，Lamport 时钟只能确保节点生成的时间戳大于该节点看到的任何其他时间戳，但它不能对它没有看到的时间戳说任何话。

[图 10-10](#fig_consistency_permissions) 显示了非线性一致 ID 生成器如何导致问题。想象一个社交媒体网站，用户 A 想要与朋友私下分享一张尴尬的照片。A 的账户最初是公开的，但使用他们的笔记本电脑，A 首先将他们的账户设置更改为私密。然后 A 使用他们的手机上传照片。由于 A 按顺序执行了这些更新，他们可能合理地期望照片上传受到新的、受限的账户权限的约束。

{{< figure src="/fig/ddia_1010.png" id="fig_consistency_permissions" caption="图 10-10. 使用 Lamport 时间戳的权限系统示例。" class="w-full my-4" >}}


账户权限和照片存储在两个单独的数据库（或同一数据库的单独分片）中，让我们假设它们使用 Lamport 时钟或混合逻辑时钟为每次写入分配时间戳。由于照片数据库没有从账户数据库读取，照片数据库中的本地计数器可能稍微落后，因此照片上传被分配了比账户设置更新更低的时间戳。

接下来，假设一个查看者（不是 A 的朋友）正在查看 A 的个人资料，他们的读取使用快照隔离的 MVCC 实现。可能会发生这样的情况，查看者的读取具有大于照片上传的时间戳，但小于账户设置更新的时间戳。因此，系统将确定在读取时账户仍然是公开的，因此向查看者显示他们不应该看到的尴尬照片。

你可以想象几种可能的方法来解决这个问题。也许照片数据库应该在执行写入之前读取用户的账户状态，但很容易忘记这样的检查。如果 A 的操作是在同一设备上执行的，也许该设备上的应用程序可以跟踪该用户写入的最新时间戳——但如果用户使用笔记本电脑和手机，如示例中所示，那就不那么容易了。

在这种情况下，最简单的解决方案是使用线性一致的 ID 生成器，这将确保照片上传被分配比账户权限更改更大的 ID。

#### 实现线性一致的 ID 生成器 {#implementing-a-linearizable-id-generator}

确保 ID 分配线性一致的最简单方法实际上是为此目的使用单个节点。该节点只需要原子地递增计数器并在请求时返回其值，持久化计数器值（以便在节点崩溃并重新启动时不会生成重复的 ID），并使用单主复制进行容错复制。这种方法在实践中使用：例如，TiDB/TiKV 称之为 *时间戳预言机*，受 Google 的 Percolator [^57] 启发。

作为优化，你可以避免在每个请求上执行磁盘写入和复制。相反，ID 生成器可以写入描述一批 ID 的记录；一旦该记录被持久化并完成复制，节点就可以开始按顺序向客户端分发这些 ID。在它用完该批次中的 ID 之前，它可以为下一批持久化并复制记录。这样，如果节点崩溃并重启，或故障切换到备库，某些 ID 会被跳过，但不会发出任何重复或乱序的 ID。

你不能轻易地对 ID 生成器进行分片，因为如果你有多个分片独立分发 ID，你就无法再保证它们的顺序是线性一致的。你也不能轻易地将 ID 生成器分布在多个区域；因此，在地理分布式数据库中，所有 ID 请求都必须转到单个区域的节点。从好的方面来说，ID 生成器的工作非常简单，因此单个节点可以处理大量请求吞吐量。

如果你不想使用单节点 ID 生成器，可以使用替代方案：你可以做 Google 的 Spanner 所做的，如 ["全局快照的同步时钟"](/ch9#sec_distributed_spanner) 中所讨论的。它依赖于物理时钟，该时钟不仅返回单个时间戳，还返回表示时钟读数不确定性的时间戳范围。然后它等待该不确定性间隔的持续时间过去后再返回。

假设不确定性间隔是正确的（即真实的当前物理时间始终位于该间隔内），此过程还确保如果一个请求在另一个请求开始之前完成，后一个请求将具有更大的时间戳。这种方法确保了这种线性一致的 ID 分配，而无需任何通信：即使不同区域的请求也将被正确排序，无需等待跨区域请求。缺点是你需要硬件和软件支持，以使时钟紧密同步并计算必要的不确定性间隔。

#### 使用逻辑时钟强制约束 {#enforcing-constraints-using-logical-clocks}

在 ["约束与唯一性保证"](#sec_consistency_uniqueness) 中，我们看到线性一致的比较并设置操作可用于在分布式系统中实现锁、唯一性约束和类似构造。这提出了一个问题：逻辑时钟或线性一致的 ID 生成器是否也足以实现这些东西？

答案是：不完全。当你有几个节点都试图获取同一个锁或注册同一个用户名时，你可以使用逻辑时钟为这些请求分配时间戳，并选择具有最低时间戳的请求作为获胜者。如果时钟是线性一致的，你知道任何未来的请求都将始终生成更大的时间戳，因此你可以确定没有未来的请求会收到比获胜者更低的时间戳。

不幸的是，问题的一部分仍未解决：节点如何知道自己的时间戳是否最低？要确定，它需要听到可能生成时间戳的 *每个* 其他节点 [^54]。如果其他节点之一在此期间失败，或者由于网络问题无法访问，该系统将停止运行，因为我们无法确定该节点是否可能具有最低的时间戳。这不是我们需要的那种容错系统。

要以容错方式实现锁、租约和类似构造，我们需要比逻辑时钟或 ID 生成器更强大的东西：我们需要共识。


## 共识 {#sec_consistency_consensus}

在本章中，我们已经看到了几个只有单个节点时很容易，但如果你想要容错就会变得困难得多的例子：

* 如果你只有一个主节点，并且在该主节点上进行所有读写，数据库可以是线性一致的。但是，如果该主节点失败，如何进行故障切换，同时避免脑裂？如何确保一个认为自己是主节点的节点实际上没有被投票罢免？
* 单节点上的线性一致 ID 生成器只是一个带有原子获取并增加指令的计数器，但如果它崩溃了怎么办？
* 原子比较并设置（CAS）操作对许多事情都很有用，例如当多个进程竞相获取它时决定谁获得锁或租约，或确保具有给定名称的文件或用户的唯一性。在单个节点上，CAS 可能就像一条 CPU 指令一样简单，但如何使其容错？

事实证明，所有这些都是同一个基本分布式系统问题的实例：*共识*。共识是分布式计算中最重要和最基本的问题之一；它也是出了名的难以正确实现 [^58] [^59]，许多系统在过去都出错了。现在我们已经讨论了复制（[第六章](/ch6)）、事务（[第八章](/ch8)）、系统模型（[第九章](/ch9)）和线性一致性（本章），我们终于准备好解决共识问题了。

最著名的共识算法是 Viewstamped Replication [^60] [^61]、Paxos [^58] [^62] [^63] [^64]、Raft [^23] [^65] [^66] 和 Zab [^18] [^22] [^67]。这些算法之间有相当多的相似之处，但它们并不相同 [^68] [^69]。这些算法在非拜占庭系统模型中工作：也就是说，网络通信可能会被任意延迟或丢弃，节点可能会崩溃、重启和断开连接，但算法假设节点在其他方面正确遵循协议，不会恶意行为。

也有可以容忍某些拜占庭节点的共识算法，即不正确遵循协议的节点（例如，向其他节点发送矛盾消息）。一个常见的假设是少于三分之一的节点是拜占庭故障的 [^26] [^70]。这种 *拜占庭容错*（BFT）共识算法用于区块链 [^71]。然而，如 ["拜占庭故障"](/ch9#sec_distributed_byzantine) 中所解释的，BFT 算法超出了本书的范围。

--------

> [!TIP] 共识的不可能性

你可能听说过 FLP 结果 [^72]——以作者 Fischer、Lynch 和 Paterson 的名字命名——它证明如果存在节点可能崩溃的风险，就没有算法总是能够达成共识。在分布式系统中，我们必须假设节点可能会崩溃，因此可靠的共识是不可能的。然而，在这里我们正在讨论实现共识的算法。这是怎么回事？

首先，FLP 并不是说我们永远无法达成共识——它只是说我们不能保证共识算法 *总是* 终止。此外，FLP 结果是在异步系统模型中假设确定性算法的情况下证明的（见 ["系统模型与现实"](/ch9#sec_distributed_system_model)），这意味着算法不能使用任何时钟或超时。如果它可以使用超时来怀疑另一个节点可能已经崩溃（即使怀疑有时是错误的），那么共识就变得可解 [^73]。即使只是允许算法使用随机数也足以绕过不可能性结果 [^74]。

因此，尽管 FLP 关于共识不可能性的结果具有重要的理论意义，但分布式系统通常可以在实践中实现共识。

--------

### 共识的多面性 {#sec_consistency_faces}

共识可以用几种不同的方式表达：

* *单值共识* 非常类似于原子 *比较并设置* 操作，它可用于实现锁、租约和唯一性约束。
* 构建 *仅追加日志* 也需要共识；它通常形式化为 *全序广播*。有了日志，你可以构建 *状态机复制*、基于主节点的复制、事件溯源和其他有用的东西。
* 多数据库或多分片事务的 *原子提交* 要求所有参与者就是否提交或中止事务达成一致。

我们很快就会探讨所有这些。事实上，这些问题都是相互等价的：如果你有解决其中一个问题的算法，你可以将其转换为任何其他问题的解决方案。这是一个相当深刻且也许令人惊讶的见解！这就是为什么我们可以将所有这些东西归入"共识"之下，即使它们表面上看起来完全不同。

#### 单值共识 {#single-value-consensus}

共识的标准表述涉及让多个节点就单个值达成一致。例如：

* 当具有单主复制的数据库首次启动时，或者当现有主节点失败时，多个节点可能会同时尝试成为主节点。同样，多个节点可能竞相获取锁或租约。共识允许它们决定哪一个获胜。
* 如果几个人同时尝试预订飞机上的最后一个座位，或剧院中的同一个座位，或尝试使用相同的用户名注册账户，那么共识算法可以确定哪一个应该成功。

更一般地说，一个或多个节点可能 *提议* 值，共识算法 *决定* 其中一个值。在上述示例中，每个节点可以提议自己的 ID，算法决定哪个节点 ID 应该成为新的主节点、租约的持有者或飞机/剧院座位的购买者。在这种形式主义中，共识算法必须满足以下属性 [^26]：

一致同意
: 没有两个节点做出不同的决定。

完整性
: 一旦节点决定了一个值，它就不能通过决定另一个值来改变主意。

有效性
: 如果节点决定值 *v*，那么 *v* 是由某个节点提议的。

终止
: 每个未崩溃的节点最终都会决定某个值。

如果你想决定多个值，你可以为每个值运行共识算法的单独实例。例如，你可以为剧院中的每个可预订座位进行单独的共识运行，这样你就可以为每个座位获得一个决定（一个买家）。

一致同意和完整性属性定义了共识的核心思想：每个人都决定相同的结果，一旦你决定了，你就不能改变主意。有效性属性排除了琐碎的解决方案：例如，你可以有一个总是决定 `null` 的算法，无论提议什么；这个算法将满足一致同意和完整性属性，但不满足有效性属性。

如果你不关心容错，那么满足前三个属性很容易：你可以硬编码一个节点作为"独裁者"，让该节点做出所有决定。然而，如果那个节点失败，那么系统就无法再做出任何决定——就像没有故障切换的单主复制一样。所有的困难都来自对容错的需求。

终止属性形式化了容错的想法。它本质上是说共识算法不能简单地坐着什么都不做——换句话说，它必须取得进展。即使某些节点失败，其他节点仍必须达成决定。（终止是活性属性，而其他三个是安全属性——见 ["安全性和活性"](/ch9#sec_distributed_safety_liveness)。）

如果崩溃的节点可能恢复，你可以等待它回来。然而，共识必须确保即使崩溃的节点突然消失并且永远不会回来，它也会做出决定。（不要想象软件崩溃，而是想象有地震，包含你的节点的数据中心被山体滑坡摧毁。你必须假设你的节点被埋在 30 英尺的泥土下，永远不会重新上线。）

当然，如果 *所有* 节点都崩溃了，并且没有一个在运行，那么任何算法都不可能决定任何事情。算法可以容忍的故障数量是有限的：事实上，可以证明任何共识算法都需要至少大多数节点正常运行才能确保终止 [^73]。该多数可以安全地形成仲裁（见 ["读写仲裁"](/ch6#sec_replication_quorum_condition)）。

因此，终止属性受到少于一半节点崩溃或不可达的假设的约束。然而，大多数共识算法确保安全属性——一致同意、完整性和有效性——始终得到满足，即使大多数节点失败或存在严重的网络问题 [^75]。因此，大规模中断可能会阻止系统处理请求，但它不能通过导致做出不一致的决定来破坏共识系统。

#### 比较并设置作为共识 {#compare-and-set-as-consensus}

比较并设置（CAS）操作检查某个对象的当前值是否等于某个期望值；如果是，它原子地将对象更新为某个新值；如果不是，它保持对象不变并返回错误。

如果你有容错、线性一致的 CAS 操作，很容易解决共识问题：最初将对象设置为空值；每个想要提议值的节点都使用期望值为空、新值为它想要提议的值（假设它是非空的）调用 CAS。然后决定的值就是对象设置的任何值。

同样，如果你有共识的解决方案，你可以实现 CAS：每当一个或多个节点想要使用相同的期望值执行 CAS 时，你使用共识协议提议 CAS 调用中的新值，然后将对象设置为共识决定的任何值。任何新值未被决定的 CAS 调用都返回错误。具有不同期望值的 CAS 调用使用共识协议的单独运行。

这表明 CAS 和共识彼此等价 [^28] [^73]。同样，两者在单个节点上都很简单，但要使其容错则具有挑战性。作为分布式环境中 CAS 的示例，我们在 ["由对象存储支持的数据库"](/ch6#sec_replication_object_storage) 中看到了对象存储的条件写入操作，它允许写入仅在自当前客户端上次读取以来具有相同名称的对象未被另一个客户端创建或修改时发生。

然而，线性一致的读写寄存器不足以解决共识。FLP 结果告诉我们，共识不能由异步崩溃停止模型中的确定性算法解决 [^72]，但我们在 ["线性一致性与仲裁"](#sec_consistency_quorum_linearizable) 中看到，线性一致的寄存器可以使用此模型中的仲裁读/写来实现 [^24] [^25] [^26]。由此可见，线性一致的寄存器无法解决共识。

#### 共享日志作为共识 {#sec_consistency_shared_logs}

我们已经看到了几个日志的例子，例如复制日志、事务日志和预写日志。日志存储一系列 *日志条目*，任何读取它的人都会看到相同顺序的相同条目。有时日志有一个允许追加新条目的单个写入者，但 *共享日志* 是多个节点可以请求追加条目的日志。单主复制就是一个例子：任何客户端都可以要求主节点进行写入，主节点将其追加到复制日志，然后所有备库按照与主节点相同的顺序应用写入。

更正式地说，共享日志支持两种操作：你可以请求将值添加到日志中，并且可以读取日志中的条目。它必须满足以下属性：

最终追加
: 如果节点请求将某个值添加到日志中，并且节点不会崩溃，那么该节点最终必须在日志条目中读取该值。

可靠交付
: 没有日志条目丢失：如果一个节点读取某个日志条目，那么最终每个未崩溃的节点也必须读取该日志条目。

仅追加
: 一旦节点读取了某个日志条目，它就是不可变的，新的日志条目只能在它之后添加，而不能在之前。节点可能会重新读取日志，在这种情况下，它会以与最初读取它们时相同的顺序看到相同的日志条目（即使节点崩溃并重新启动）。

一致性
: 如果两个节点都读取某个日志条目 *e*，那么在 *e* 之前，它们必须以相同的顺序读取完全相同的日志条目序列。

有效性
: 如果节点读取包含某个值的日志条目，那么某个节点先前请求将该值添加到日志中。

--------

> [!NOTE]
> 共享日志在形式上被称为 *全序广播*、*原子广播* 或 *全序组播* 协议 [^26] [^76] [^77]。这是用不同的词描述的同一件事：请求将值添加到日志中然后称为"广播"它，读取日志条目称为"交付"它。

--------

如果你有共享日志的实现，很容易解决共识问题：每个想要提议值的节点都请求将其添加到日志中，第一个日志条目中读回的任何值就是决定的值。由于所有节点以相同的顺序读取日志条目，它们保证就首先交付哪个值达成一致 [^28]。

相反，如果你有共识的解决方案，你可以实现共享日志。细节有点复杂，但基本思想是这样的 [^73]：

1. 你为每个未来的日志条目在日志中都有一个槽，并且你为每个这样的槽运行共识算法的单独实例，以决定该条目中应该包含什么值。
2. 当节点想要向日志添加值时，它为尚未决定的槽之一提议该值。
3. 当共识算法为其中一个槽做出决定，并且所有先前的槽都已经决定时，则决定的值作为新的日志条目追加，并且已经决定的任何连续槽也将其决定的值追加到日志中。
4. 如果提议的值未被某个槽选择，想要添加它的节点会通过为稍后的槽提议它来重试。

这表明共识等价于全序广播和共享日志。没有故障切换的单主复制不满足活性要求，因为如果主节点崩溃，它将停止传递消息。像往常一样，挑战在于安全地自动执行故障切换。

#### 获取并增加作为共识 {#fetch-and-add-as-consensus}

我们在 ["线性一致的 ID 生成器"](#sec_consistency_linearizable_id) 中看到的线性一致 ID 生成器接近解决共识，但略有不足。我们可以使用获取并增加操作实现这样的 ID 生成器，该操作原子地递增计数器并返回旧的计数器值。

如果你有 CAS 操作，很容易实现获取并增加：首先读取计数器值，然后执行 CAS，其中期望值是你读取的值，新值是该值加一。如果 CAS 失败，你将重试整个过程，直到 CAS 成功。当存在争用时，这比本机获取并增加操作效率低，但在功能上是等效的。由于你可以使用共识实现 CAS，你也可以使用共识实现获取并增加。

相反，如果你有容错的获取并增加操作，你能解决共识问题吗？假设你将计数器初始化为零，每个想要提议值的节点都调用获取并增加操作来递增计数器。由于获取并增加操作是原子的，其中一个节点将读取初始值零，其他节点都将读取至少递增过一次的值。

现在假设读取零的节点是获胜者，它的值被决定。这对于读取零的节点有效，但其他节点有问题：它们知道自己不是获胜者，但它们不知道其他节点中哪一个获胜了。获胜者可以向其他节点发送消息，让它们知道它已经获胜，但如果获胜者在有机会发送此消息之前崩溃了怎么办？在这种情况下，其他节点将被挂起，无法决定任何值，因此共识不会终止。其他节点不能回退到另一个节点，因为读取零的节点可能会回来并正确地决定它提议的值。

一个例外是，如果我们确定不超过两个节点将提议值。在这种情况下，节点可以相互发送它们想要提议的值，然后每个都执行获取并增加操作。读取零的节点决定自己的值，读取一的节点决定另一个节点的值。这解决了两个节点之间的共识问题，这就是为什么我们可以说获取并增加的 *共识数* 为二 [^28]。相比之下，CAS 和共享日志解决了任意数量节点可能提议值的共识，因此它们的共识数为 ∞（无穷大）。

#### 原子提交作为共识 {#atomic-commitment-as-consensus}

在 ["分布式事务"](/ch8#sec_transactions_distributed) 中，我们看到了 *原子提交* 问题，即确保参与分布式事务的数据库或分片都提交或中止事务。我们还看到了 *两阶段提交* 算法，它依赖于作为单点故障的协调器。

共识和原子提交之间有什么关系？乍一看，它们似乎非常相似——两者都需要节点达成某种形式的一致。然而，有一个重要的区别：对于共识，可以决定提议的任何值，而对于原子提交，如果 *任何* 参与者投票中止，算法 *必须* 中止。更准确地说，原子提交需要以下属性 [^78]：

一致同意
: 没有两个节点决定不同的结果。

完整性
: 一旦节点决定了一个结果，它就不能通过决定另一个结果来改变主意。

有效性
: 如果节点决定提交，那么所有节点必须先前投票提交。如果任何节点投票中止，节点必须中止。

非平凡性
: 如果所有节点都投票提交，并且没有发生通信超时，那么所有节点必须决定提交。

终止
: 每个未崩溃的节点最终都会决定提交或中止。

有效性属性确保事务只有在所有节点都同意时才能提交；非平凡性属性确保算法不能简单地总是中止（但如果任何节点之间的通信超时，它允许中止）。其他三个属性基本上与共识相同。

如果你有共识的解决方案，有多种方法可以解决原子提交 [^78] [^79]。一种方法是这样的：当你想要提交事务时，每个节点将其提交或中止的投票发送给每个其他节点。从自己和每个其他节点收到提交投票的节点使用共识算法提议"提交"；收到中止投票或经历超时的节点使用共识算法提议"中止"。当节点发现共识算法决定了什么时，它会相应地提交或中止。

在这个算法中，只有当所有节点都投票提交时，才会提议"提交"。如果任何节点投票中止，共识算法中的所有提议都将是"中止"。如果所有节点都投票提交但某些通信超时，可能会发生某些节点提议"中止"而其他节点提议"提交"；在这种情况下，节点是提交还是中止并不重要，只要它们都做同样的事。

如果你有容错的原子提交协议，你也可以解决共识。每个想要提议值的节点都在节点仲裁上启动事务，并在每个节点上执行单节点 CAS，如果其值尚未被另一个事务设置，则将寄存器设置为提议的值。如果 CAS 成功，节点投票提交，否则投票中止。如果原子提交协议决定提交事务，其值将被决定用于共识；如果原子提交中止，提议节点将使用新事务重试。

这表明原子提交和共识也是彼此等价的。

### 共识的实践 {#sec_consistency_total_order}

我们已经看到，单值共识、CAS、共享日志和原子提交都彼此等价：你可以将其中一个的解决方案转换为任何其他的解决方案。这是一个有价值的理论见解，但它没有回答这个问题：在实践中，这些许多共识表述中哪一个最有用？

答案是大多数共识系统提供共享日志，也称为全序广播。Raft、Viewstamped Replication 和 Zab 直接提供共享日志。Paxos 提供单值共识，但在实践中，大多数使用 Paxos 的系统实际上使用称为 Multi-Paxos 的扩展，它也提供共享日志。

#### 使用共享日志 {#sec_consistency_smr}

共享日志非常适合数据库复制：如果每个日志条目代表对数据库的写入，并且每个副本使用确定性逻辑以相同的顺序处理相同的写入，那么副本将全部处于一致状态。这个想法被称为 *状态机复制* [^80]，它是事件溯源背后的原则，我们在 ["事件溯源和 CQRS"](/ch3#sec_datamodels_events) 中看到了。共享日志对于流处理也很有用，我们将在 [第十二章](/ch12#ch_stream) 中看到。

同样，共享日志可用于实现可串行化事务：如 ["实际串行执行"](/ch8#sec_transactions_serial) 中所讨论的，如果每个日志条目代表要作为存储过程执行的确定性事务，并且如果每个节点以相同的顺序执行这些事务，那么事务将是可串行化的 [^81] [^82]。

---------

> [!NOTE]
> 具有强一致性模型的分片数据库通常为每个分片维护一个单独的日志，这提高了可伸缩性，但限制了它们可以跨分片提供的一致性保证（例如，一致快照、外键引用）。跨分片的可串行化事务是可能的，但需要额外的协调 [^83]。

--------

共享日志也很强大，因为它可以很容易地适应其他形式的共识：

* 我们之前看到了如何使用它来实现单值共识和 CAS：只需决定日志中首先出现的值。
* 如果你想要许多单值共识实例（例如，几个人试图预订的剧院中每个座位一个），请在日志条目中包含座位编号，并决定包含给定座位编号的第一个日志条目。
* 如果你想要原子获取并增加，请将要添加到计数器的数字放入日志条目中，当前计数器值是到目前为止所有日志条目的总和。日志条目上的简单计数器可用于生成栅栏令牌（见 ["栅栏化僵尸和延迟请求"](/ch9#sec_distributed_fencing_tokens)）；例如，在 ZooKeeper 中，此序列号称为 `zxid` [^18]。

#### 从单主复制到共识 {#from-single-leader-replication-to-consensus}

我们之前看到，如果你有一个单一的"独裁者"节点做出决定，单值共识很容易，同样，如果单个主节点是唯一允许向其追加条目的节点，共享日志也很容易。问题是如果该节点失败如何提供容错。

传统上，具有单主复制的数据库没有解决这个问题：它们将主节点故障切换作为人类管理员必须手动执行的操作。不幸的是，这意味着大量的停机时间，因为人类反应的速度是有限的，并且它不满足共识的终止属性。对于共识，我们要求算法可以自动选择新的主节点。（并非所有共识算法都有主节点，但常用的算法有 [^84]。）

然而，有一个问题。我们之前讨论过脑裂的问题，并说所有节点都需要就谁是主节点达成一致——否则两个不同的节点可能各自认为自己是主节点，从而做出不一致的决定。因此，似乎我们需要共识来选举主节点，而我们需要主节点来解决共识。我们如何摆脱这个难题？

事实上，共识算法不要求在任何时候只有一个主节点。相反，它们做出了较弱的保证：它们定义了一个 *纪元编号*（在 Paxos 中称为 *投票编号*，在 Viewstamped Replication 中称为 *视图编号*，在 Raft 中称为 *任期编号*）并保证在每个纪元内，主节点是唯一的。

当节点因为在某个超时时间内没有收到主节点的消息而认为当前主节点已死时，它可能会开始投票选举新的主节点。这次选举被赋予一个大于任何先前纪元的新纪元编号。如果两个不同纪元中的两个不同主节点之间存在冲突（也许是因为先前的主节点实际上并没有死），那么具有更高纪元编号的主节点获胜。

在主节点被允许将下一个条目追加到共享日志之前，它必须首先检查是否有其他具有更高纪元编号的主节点可能追加不同的条目。它可以通过从一个节点仲裁收集投票来做到这一点，通常（但并非总是）是多数节点 [^85]。只有在节点不知道任何其他具有更高纪元的主节点时，节点才会投赞成票。

因此，我们有两轮投票：一次选择主节点，第二次对主节点提议的下一个要追加到日志的条目进行投票。这两次投票的仲裁必须重叠：如果对提议的投票成功，投票支持它的节点中至少有一个也必须参与了最近成功的主节点选举 [^85]。因此，如果对提议的投票通过而没有透露任何更高编号的纪元，当前主节点可以得出结论，没有选出具有更高纪元编号的主节点，因此它可以安全地将提议的条目追加到日志中 [^26] [^86]。

这两轮投票表面上看起来类似于两阶段提交，但它们是非常不同的协议。在共识算法中，任何节点都可以开始选举，它只需要节点仲裁的响应；在 2PC 中，只有协调器可以请求投票，它需要 *每个* 参与者的"是"投票才能提交。

#### 共识的微妙之处 {#subtleties-of-consensus}

这个基本结构是 Raft、Multi-Paxos、Zab 和 Viewstamped Replication 所共有的：节点仲裁的投票选举主节点，然后主节点想要追加到日志的每个条目都需要另一个仲裁投票 [^68] [^69]。每个新的日志条目在确认给请求写入的客户端之前都会同步复制到节点仲裁。这确保如果当前主节点失败，日志条目不会丢失。

然而，魔鬼在细节中，这也是这些算法采用不同方法的地方。例如，当旧主节点失败并选出新主节点时，算法需要确保新主节点遵守旧主节点在失败之前已经追加的任何日志条目。Raft 通过只允许其日志至少与其大多数追随者一样最新的节点成为新主节点来做到这一点 [^69]。相比之下，Paxos 允许任何节点成为新主节点，但要求它在开始追加自己的新条目之前使其日志与其他节点保持最新。


--------

> [!TIP] 主节点选举中的一致性与可用性

如果你希望共识算法严格保证 ["共享日志作为共识"](#sec_consistency_shared_logs) 中列出的属性，那么新主节点在处理任何写入或线性一致读取之前必须了解任何已确认的日志条目，这一点至关重要。如果具有过时数据的节点成为新主节点，它可能会将新值写入已经由旧主节点写入的日志条目，从而违反共享日志的仅追加属性。

在某些情况下，你可能选择削弱共识属性，以便更快地从主节点故障中恢复。例如，Kafka 提供了启用 *不干净的主节点选举* 的选项，它允许任何副本成为主节点，即使它不是最新的。此外，在采用异步复制的数据库中，当主节点失败时，你无法保证任何备库是最新的。

如果你放弃新主节点必须是最新的要求，你可能会提高性能和可用性，但你是在薄冰上，因为共识理论不再适用。虽然只要没有故障，事情就会正常工作，但 [第九章](/ch9) 中讨论的问题很容易导致大量数据丢失或损坏。

--------

另一个微妙之处是算法如何处理这样的日志条目：旧主节点在失败之前已经提议了它们，但将其追加到日志的投票尚未完成。你可以在本章的参考文献中找到这些细节的讨论 [^23] [^69] [^86]。

对于使用共识算法进行复制的数据库，不仅写入需要转换为日志条目并复制到仲裁。如果你想保证线性一致的读取，它们也必须像写入一样通过仲裁投票，以确认认为自己是主节点的节点确实仍然是最新的。例如，etcd 中的线性一致读取就是这样工作的。

在其标准形式中，大多数共识算法假设一组固定的节点——也就是说，节点可能会宕机并重新启动，但允许投票的节点集在创建集群时是固定的。在实践中，通常需要在系统配置中添加新节点或删除旧节点。共识算法已经扩展了 *重新配置* 功能，使这成为可能。这在向系统添加新区域或从一个位置迁移到另一个位置（通过首先添加新节点，然后删除旧节点）时特别有用。

#### 共识的利弊 {#pros-and-cons-of-consensus}

尽管它们复杂而微妙，但共识算法是分布式系统的巨大突破。共识本质上是"正确完成的单主复制"，在主节点故障时自动故障切换，确保没有已提交的数据丢失，也不可能出现脑裂，即使面对我们在 [第九章](/ch9) 中讨论的所有问题。

由于单主复制与自动故障切换本质上是共识的定义之一，任何提供自动故障切换但不使用经过验证的共识算法的系统都可能是不安全的 [^87]。使用经过验证的共识算法并不能保证整个系统的正确性——仍然有很多其他地方可能潜伏着错误——但这是一个好的开始。

然而，共识并不是到处都使用，因为好处是有代价的。共识系统总是需要严格的多数才能运行——容忍一个故障需要三个节点，或者容忍两个故障需要五个节点。每个操作都需要与仲裁通信，因此你不能通过添加更多节点来增加吞吐量（事实上，你添加的每个节点都会使算法变慢）。如果网络分区将某些节点与其余节点隔离，只有网络的多数部分可以取得进展，其余部分被阻塞。

共识系统通常依赖超时来检测失败的节点。在具有高度可变网络延迟的环境中，特别是跨多个地理区域分布的系统，调整这些超时可能很困难：如果它们太大，从故障中恢复需要很长时间；如果它们太小，可能会有很多不必要的主节点选举，导致糟糕的性能，因为系统最终花费更多时间选择主节点而不是做有用的工作。

有时，共识算法对网络问题特别敏感。例如，Raft 已被证明具有不愉快的边缘情况 [^88] [^89]：如果除了一个始终不可靠的特定网络链接之外，整个网络都正常工作，Raft 可能会进入主节点身份在两个节点之间不断跳跃的情况，或者当前主节点不断被迫辞职，因此系统实际上从未取得进展。设计对不可靠网络更稳健的算法仍然是一个开放的研究问题。

对于想要高可用但不想接受共识成本的系统，唯一真正的选择是使用较弱的一致性模型，例如 [第六章](/ch6) 中讨论的无主或多主复制提供的模型。这些方法通常不提供线性一致性，但对于不需要它的应用程序来说已经足够。


### 协调服务 {#sec_consistency_coordination}

共识算法对于任何希望提供线性一致操作的分布式数据库都很有价值，许多现代分布式数据库也都用共识来做复制。但有一类系统是共识算法的重度用户：*协调服务*，例如 ZooKeeper、etcd 和 Consul。虽然它们表面上看起来像普通键值存储，但它们并不是为通用数据存储而设计的。

相反，它们的目标是协调另一个分布式系统中的多个节点。例如，Kubernetes 依赖 etcd；Spark 和 Flink 在高可用模式下会在后台依赖 ZooKeeper。协调服务通常只存储小规模数据，这些数据可以完全放入内存（同时仍会写盘以保证持久性），并通过容错共识算法在多个节点间复制。

协调服务的设计思路来自 Google 的 Chubby 锁服务 [^17] [^58]。它把共识算法与一些在分布式系统里尤其有用的能力结合在一起：

锁与租约
: 我们前面看到，共识系统可以实现具备容错能力的原子比较并设置（CAS）操作。协调服务正是基于这一点来实现锁和租约：若多个节点并发尝试获取同一个租约，最终只会有一个成功。

支持栅栏
: 如 ["分布式锁和租约"](/ch9#sec_distributed_lock_fencing) 所述，当某个资源受租约保护时，需要 *栅栏* 机制来防止进程暂停或网络大延迟时的相互干扰。共识系统可通过为每个日志条目分配单调递增 ID 来生成栅栏令牌（ZooKeeper 中的 `zxid` 和 `cversion`，etcd 中的 revision）。

故障检测
: 客户端会在协调服务上维持长连接会话，并通过周期性心跳检查对端是否存活。即使连接临时中断或某台服务端故障，客户端持有的租约仍可保持有效；但如果超过租约超时时间仍未收到心跳，协调服务就会认为客户端已失效并释放租约（ZooKeeper 将其称为 *临时节点*）。

变更通知
: 客户端可以请求：当某些键发生变化时由协调服务主动通知。这样客户端就能知道另一个节点何时加入集群（基于其写入的值），或者何时失效（会话超时、临时节点消失）。这类通知避免了客户端频繁轮询。

故障检测和变更通知本身不需要共识，但与需要共识的原子操作、栅栏机制结合后，它们对分布式协调非常有用。

--------

> [!TIP] 用协调服务管理配置

应用与基础设施通常都有配置参数，例如超时时间、线程池大小等。有时会把这类配置数据以键值对形式存放在协调服务中。进程启动时加载最新配置，并订阅后续变更通知。配置更新后，进程可以立即应用新值，或重启后生效。

配置管理本身不需要协调服务里的共识能力；但如果系统本来就已经运行了协调服务，那么直接复用它的通知机制会很方便。另一种做法是进程周期性地从文件或 URL 拉取配置更新，以避免依赖专门的协调服务。

--------

#### 将工作分配给节点 {#allocating-work-to-nodes}

当你有某个进程或服务的多个实例，且其中一个需要被选为主节点时，协调服务很有用。如果主节点失效，其他节点之一应当接管。这不仅适用于单主数据库，也适用于作业调度器等有状态系统。

另一个场景是：你有某种分片资源（数据库、消息流、文件存储、分布式 Actor 系统等），需要决定每个分片由哪个节点负责。随着新节点加入集群，需要把部分分片从旧节点迁移到新节点以实现再平衡；当节点被移除或失效时，其他节点需要接手其工作。

这类任务可以通过协调服务中的原子操作、临时节点和通知机制配合完成。若实现得当，应用可以在无人值守的情况下自动从故障中恢复。即使有 Apache Curator 这类在 ZooKeeper 客户端 API 上封装的高级库，这件事仍不容易；但它仍远好于从零实现共识算法，后者极易引入缺陷。

专用协调服务还有一个优势：无论被协调系统有多少节点，协调服务本身通常都只需运行在一组固定节点上（常见是 3 个或 5 个）。例如，一个拥有数千分片的存储系统若在数千节点上直接跑共识会非常低效；把共识“外包”给少量协调服务节点通常更合理。

通常，协调服务管理的数据变化频率不高：例如“IP 为 10.1.1.23 的节点当前是分片 7 的主节点”这类信息，更新周期往往是分钟级或小时级。协调服务不适合存储每秒变化数千次的数据。对于高频变化数据，应该使用常规数据库；或者使用 Apache BookKeeper [^90] [^91] 这类工具复制服务内部的快速变化状态。

#### 服务发现 {#service-discovery}

ZooKeeper、etcd 和 Consul 也常用于 *服务发现*：即确定连接某个服务所需的 IP 地址（见 ["负载均衡、服务发现和服务网格"](/ch5#sec_encoding_service_discovery)）。在云环境下，虚拟机常常频繁上下线，因此你通常无法预先知道服务地址。常见做法是让服务启动时把自身网络端点注册到服务注册表，再供其他服务查询。

用协调服务做服务发现很方便，因为它的故障检测和变更通知能让客户端及时跟踪服务实例的增减。而且如果你本来就用协调服务做租约、锁或主节点选举，那么继续复用它做服务发现通常也很自然，因为它已经知道哪个节点应该接收请求。

不过，对服务发现使用共识往往有些“杀鸡用牛刀”：这个场景通常不要求线性一致性，更重要的是高可用和低延迟，因为没有服务发现，整个系统都会停滞。因此通常更倾向于缓存服务发现结果，并接受其可能略有陈旧。比如基于 DNS 的服务发现，就是通过多层缓存来获得良好的性能与可用性。

为支持这类需求，ZooKeeper 提供了 *observer*（观察者）节点：它接收日志并维护一份 ZooKeeper 数据副本，但不参与共识投票。来自 observer 的读取不具备线性一致性（可能陈旧），但即使网络中断仍然可用，并且能通过缓存提高系统可支持的读吞吐量。

## 总结 {#summary}

在本章中，我们研究了容错系统中强一致性的主题：它是什么，以及如何实现它。我们深入研究了线性一致性，这是强一致性的一种流行形式化：它意味着复制的数据看起来好像只有一个副本，所有操作都以原子方式作用于它。我们看到，当你需要在读取时某些数据是最新的，或者需要解决竞争条件（例如，如果多个节点并发地尝试做同样的事情，比如创建具有相同名称的文件）时，线性一致性是有用的。

虽然线性一致性很有吸引力，因为它易于理解——它使数据库的行为像单线程程序中的变量一样——但它的缺点是速度慢，特别是在网络延迟较大的环境中。许多复制算法不能保证线性一致性，即使表面上看起来它们可能提供强一致性。

接下来，我们在 ID 生成器的背景下应用了线性一致性的概念。单节点自增计数器是线性一致的，但不是容错的。许多分布式 ID 生成方案不能保证 ID 的顺序与事件实际发生的顺序一致。像 Lamport 时钟和混合逻辑时钟这样的逻辑时钟提供了与因果关系一致的顺序，但没有线性一致性。

这引导我们进入了共识的概念。我们看到，达成共识意味着对某件事做出决定，使得所有节点都认同该决定，并且决定一旦做出就不能反悔。广泛的问题实际上可以归约为共识，并且彼此等价（即，如果你有一个问题的解决方案，你可以将其转换为所有其他问题的解决方案）。这些等价的问题包括：

线性一致的比较并设置操作
: 寄存器需要根据其当前值是否等于操作中给定的参数，原子地 **决定** 是否设置其值。

锁和租约
: 当多个客户端并发地尝试获取锁或租约时，锁 **决定** 哪一个成功获取它。

唯一性约束
: 当多个事务并发地尝试创建具有相同键的冲突记录时，约束必须 **决定** 允许哪一个，哪一个应该因约束违反而失败。

共享日志
: 当多个节点并发地想要向日志追加条目时，日志 **决定** 它们被追加的顺序。全序广播也是等价的。

原子事务提交
: 参与分布式事务的数据库节点必须都以相同的方式 **决定** 是提交还是中止事务。

线性一致的获取并增加操作
: 这个操作可以用来实现 ID 生成器。多个节点可以并发调用该操作，它 **决定** 它们递增计数器的顺序。这种情况实际上只解决了两个节点之间的共识，而其他情况适用于任意数量的节点。

如果你只有一个节点，或者愿意把决策能力交给单个节点，所有这些都很简单。这就是单主数据库中发生的事情：所有决策权都授予主节点，这也是这类数据库能够提供线性一致操作、唯一性约束和复制日志等能力的原因。

然而，如果这个单一主节点失效，或者网络中断使其不可达，这样的系统就无法继续推进，直到人工完成手动故障切换。Raft 和 Paxos 等广泛使用的共识算法，本质上就是内置自动主节点选举与故障切换的“单主复制”。

共识算法经过精心设计，以确保在故障切换期间不会丢失任何已提交的写入，并且系统不会进入脑裂状态（多个节点接受写入）。这要求每个写入和每个线性一致的读取都由节点的仲裁（通常是多数）确认。这可能是昂贵的，特别是跨地理区域，但如果你想要共识提供的强一致性和容错性，这是不可避免的。

像 ZooKeeper 和 etcd 这样的协调服务也是建立在共识算法之上的。它们提供锁、租约、故障检测和变更通知功能，这些功能对于管理分布式应用程序的状态很有用。如果你发现自己想要做那些可以归约为共识的事情之一，并且你希望它是容错的，建议使用协调服务。它不会保证你做对，但它可能会有所帮助。

共识算法复杂而微妙，但其背后有自 1980 年代以来形成的丰富理论体系支持。正是这些理论，使我们能够构建出能够容忍 [第九章](/ch9#ch_distributed) 所述故障、同时仍保证数据不被破坏的系统。这是分布式系统工程中的重要成就，本章末尾参考文献展示了其中一些关键工作。

然而，共识并不总是正确的工具：在某些系统中，不需要它提供的强一致性属性，使用较弱一致性来换取更高可用性和更好性能反而更合适。在这些场景下，通常会使用无主或多主复制，这也是我们之前在 [第六章](/ch6#ch_replication) 讨论过的内容。我们在本章讨论的逻辑时钟在那类场景中也很有帮助。

### 参考文献

[^1]: Maurice P. Herlihy and Jeannette M. Wing. [Linearizability: A Correctness Condition for Concurrent Objects](https://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 12, issue 3, pages 463–492, July 1990. [doi:10.1145/78969.78972](https://doi.org/10.1145/78969.78972) 
[^2]: Leslie Lamport. [On interprocess communication](https://www.microsoft.com/en-us/research/publication/interprocess-communication-part-basic-formalism-part-ii-algorithms/). *Distributed Computing*, volume 1, issue 2, pages 77–101, June 1986. [doi:10.1007/BF01786228](https://doi.org/10.1007/BF01786228) 
[^3]: David K. Gifford. [Information Storage in a Decentralized Computer System](https://bitsavers.org/pdf/xerox/parc/techReports/CSL-81-8_Information_Storage_in_a_Decentralized_Computer_System.pdf). Xerox Palo Alto Research Centers, CSL-81-8, June 1981. Archived at [perma.cc/2XXP-3JPB](https://perma.cc/2XXP-3JPB) 
[^4]: Martin Kleppmann. [Please Stop Calling Databases CP or AP](https://martin.kleppmann.com/2015/05/11/please-stop-calling-databases-cp-or-ap.html). *martin.kleppmann.com*, May 2015. Archived at [perma.cc/MJ5G-75GL](https://perma.cc/MJ5G-75GL) 
[^5]: Kyle Kingsbury. [Call Me Maybe: MongoDB Stale Reads](https://aphyr.com/posts/322-call-me-maybe-mongodb-stale-reads). *aphyr.com*, April 2015. Archived at [perma.cc/DXB4-J4JC](https://perma.cc/DXB4-J4JC) 
[^6]: Kyle Kingsbury. [Computational Techniques in Knossos](https://aphyr.com/posts/314-computational-techniques-in-knossos). *aphyr.com*, May 2014. Archived at [perma.cc/2X5M-EHTU](https://perma.cc/2X5M-EHTU) 
[^7]: Kyle Kingsbury and Peter Alvaro. [Elle: Inferring Isolation Anomalies from Experimental Observations](https://www.vldb.org/pvldb/vol14/p268-alvaro.pdf). *Proceedings of the VLDB Endowment*, volume 14, issue 3, pages 268–280, November 2020. [doi:10.14778/3430915.3430918](https://doi.org/10.14778/3430915.3430918) 
[^8]: Paolo Viotti and Marko Vukolić. [Consistency in Non-Transactional Distributed Storage Systems](https://arxiv.org/abs/1512.00168). *ACM Computing Surveys* (CSUR), volume 49, issue 1, article no. 19, June 2016. [doi:10.1145/2926965](https://doi.org/10.1145/2926965) 
[^9]: Peter Bailis. [Linearizability Versus Serializability](http://www.bailis.org/blog/linearizability-versus-serializability/). *bailis.org*, September 2014. Archived at [perma.cc/386B-KAC3](https://perma.cc/386B-KAC3) 
[^10]: Daniel Abadi. [Correctness Anomalies Under Serializable Isolation](https://dbmsmusings.blogspot.com/2019/06/correctness-anomalies-under.html). *dbmsmusings.blogspot.com*, June 2019. Archived at [perma.cc/JGS7-BZFY](https://perma.cc/JGS7-BZFY) 
[^11]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). *Proceedings of the VLDB Endowment*, volume 7, issue 3, pages 181–192, November 2013. [doi:10.14778/2732232.2732237](https://doi.org/10.14778/2732232.2732237), extended version published as [arXiv:1302.0309](https://arxiv.org/abs/1302.0309) 
[^12]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/). 
[^13]: Andrei Matei. [CockroachDB’s consistency model](https://www.cockroachlabs.com/blog/consistency-model/). *cockroachlabs.com*, February 2021. Archived at [perma.cc/MR38-883B](https://perma.cc/MR38-883B) 
[^14]: Murat Demirbas. [Strict-serializability, but at what cost, for what purpose?](https://muratbuffalo.blogspot.com/2022/08/strict-serializability-but-at-what-cost.html) *muratbuffalo.blogspot.com*, August 2022. Archived at [perma.cc/T8AY-N3U9](https://perma.cc/T8AY-N3U9) 
[^15]: Ben Darnell. [How to talk about consistency and isolation in distributed DBs](https://www.cockroachlabs.com/blog/db-consistency-isolation-terminology/). *cockroachlabs.com*, February 2022. Archived at [perma.cc/53SV-JBGK](https://perma.cc/53SV-JBGK) 
[^16]: Daniel Abadi. [An explanation of the difference between Isolation levels vs. Consistency levels](https://dbmsmusings.blogspot.com/2019/08/an-explanation-of-difference-between.html). *dbmsmusings.blogspot.com*, August 2019. Archived at [perma.cc/QSF2-CD4P](https://perma.cc/QSF2-CD4P) 
[^17]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006. 
[^18]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3 
[^19]: Murali Vallath. [*Oracle 10g RAC Grid, Services & Clustering*](https://www.oreilly.com/library/view/oracle-10g-rac/9781555583217/). Elsevier Digital Press, 2006. ISBN: 978-1-555-58321-7 
[^20]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509) 
[^21]: Kyle Kingsbury. [Call Me Maybe: etcd and Consul](https://aphyr.com/posts/316-call-me-maybe-etcd-and-consul). *aphyr.com*, June 2014. Archived at [perma.cc/XL7U-378K](https://perma.cc/XL7U-378K) 
[^22]: Flavio P. Junqueira, Benjamin C. Reed, and Marco Serafini. [Zab: High-Performance Broadcast for Primary-Backup Systems](https://marcoserafini.github.io/assets/pdf/zab.pdf). At *41st IEEE International Conference on Dependable Systems and Networks* (DSN), June 2011. [doi:10.1109/DSN.2011.5958223](https://doi.org/10.1109/DSN.2011.5958223) 
[^23]: Diego Ongaro and John K. Ousterhout. [In Search of an Understandable Consensus Algorithm](https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf). At *USENIX Annual Technical Conference* (ATC), June 2014. 
[^24]: Hagit Attiya, Amotz Bar-Noy, and Danny Dolev. [Sharing Memory Robustly in Message-Passing Systems](https://www.cs.huji.ac.il/course/2004/dist/p124-attiya.pdf). *Journal of the ACM*, volume 42, issue 1, pages 124–142, January 1995. [doi:10.1145/200836.200869](https://doi.org/10.1145/200836.200869) 
[^25]: Nancy Lynch and Alex Shvartsman. [Robust Emulation of Shared Memory Using Dynamic Quorum-Acknowledged Broadcasts](https://groups.csail.mit.edu/tds/papers/Lynch/FTCS97.pdf). At *27th Annual International Symposium on Fault-Tolerant Computing* (FTCS), June 1997. [doi:10.1109/FTCS.1997.614100](https://doi.org/10.1109/FTCS.1997.614100) 
[^26]: Christian Cachin, Rachid Guerraoui, and Luís Rodrigues. [*Introduction to Reliable and Secure Distributed Programming*](https://www.distributedprogramming.net/), 2nd edition. Springer, 2011. ISBN: 978-3-642-15259-7, [doi:10.1007/978-3-642-15260-3](https://doi.org/10.1007/978-3-642-15260-3) 
[^27]: Niklas Ekström, Mikhail Panchenko, and Jonathan Ellis. [Possible Issue with Read Repair?](https://lists.apache.org/thread/wwsjnnc93mdlpw8nb0d5gn4q1bmpzbon) Email thread on *cassandra-dev* mailing list, October 2012. 
[^28]: Maurice P. Herlihy. [Wait-Free Synchronization](https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 13, issue 1, pages 124–149, January 1991. [doi:10.1145/114005.102808](https://doi.org/10.1145/114005.102808) 
[^29]: Armando Fox and Eric A. Brewer. [Harvest, Yield, and Scalable Tolerant Systems](https://radlab.cs.berkeley.edu/people/fox/static/pubs/pdf/c18.pdf). At *7th Workshop on Hot Topics in Operating Systems* (HotOS), March 1999. [doi:10.1109/HOTOS.1999.798396](https://doi.org/10.1109/HOTOS.1999.798396) 
[^30]: Seth Gilbert and Nancy Lynch. [Brewer’s Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services](https://www.comp.nus.edu.sg/~gilbert/pubs/BrewersConjecture-SigAct.pdf). *ACM SIGACT News*, volume 33, issue 2, pages 51–59, June 2002. [doi:10.1145/564585.564601](https://doi.org/10.1145/564585.564601) 
[^31]: Seth Gilbert and Nancy Lynch. [Perspectives on the CAP Theorem](https://groups.csail.mit.edu/tds/papers/Gilbert/Brewer2.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 30–36, February 2012. [doi:10.1109/MC.2011.389](https://doi.org/10.1109/MC.2011.389) 
[^32]: Eric A. Brewer. [CAP Twelve Years Later: How the ‘Rules’ Have Changed](https://sites.cs.ucsb.edu/~rich/class/cs293-cloud/papers/brewer-cap.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 23–29, February 2012. [doi:10.1109/MC.2012.37](https://doi.org/10.1109/MC.2012.37) 
[^33]: Susan B. Davidson, Hector Garcia-Molina, and Dale Skeen. [Consistency in Partitioned Networks](https://www.cs.rice.edu/~alc/old/comp520/papers/DGS85.pdf). *ACM Computing Surveys*, volume 17, issue 3, pages 341–370, September 1985. [doi:10.1145/5505.5508](https://doi.org/10.1145/5505.5508) 
[^34]: Paul R. Johnson and Robert H. Thomas. [RFC 677: The Maintenance of Duplicate Databases](https://tools.ietf.org/html/rfc677). Network Working Group, January 1975. 
[^35]: Michael J. Fischer and Alan Michael. [Sacrificing Serializability to Attain High Availability of Data in an Unreliable Network](https://sites.cs.ucsb.edu/~agrawal/spring2011/ugrad/p70-fischer.pdf). At *1st ACM Symposium on Principles of Database Systems* (PODS), March 1982. [doi:10.1145/588111.588124](https://doi.org/10.1145/588111.588124) 
[^36]: Eric A. Brewer. [NoSQL: Past, Present, Future](https://www.infoq.com/presentations/NoSQL-History/). At *QCon San Francisco*, November 2012. 
[^37]: Adrian Cockcroft. [Migrating to Microservices](https://www.infoq.com/presentations/migration-cloud-native/). At *QCon London*, March 2014. 
[^38]: Martin Kleppmann. [A Critique of the CAP Theorem](https://arxiv.org/abs/1509.05393). arXiv:1509.05393, September 2015. 
[^39]: Daniel Abadi. [Problems with CAP, and Yahoo’s little known NoSQL system](https://dbmsmusings.blogspot.com/2010/04/problems-with-cap-and-yahoos-little.html). *dbmsmusings.blogspot.com*, April 2010. Archived at [perma.cc/4NTZ-CLM9](https://perma.cc/4NTZ-CLM9) 
[^40]: Daniel Abadi. [Hazelcast and the Mythical PA/EC System](https://dbmsmusings.blogspot.com/2017/10/hazelcast-and-mythical-paec-system.html). *dbmsmusings.blogspot.com*, October 2017. Archived at [perma.cc/J5XM-U5C2](https://perma.cc/J5XM-U5C2) 
[^41]: Eric Brewer. [Spanner, TrueTime & The CAP Theorem](https://research.google.com/pubs/archive/45855.pdf). *research.google.com*, February 2017. Archived at [perma.cc/59UW-RH7N](https://perma.cc/59UW-RH7N) 
[^42]: Daniel J. Abadi. [Consistency Tradeoffs in Modern Distributed Database System Design](https://www.cs.umd.edu/~abadi/papers/abadi-pacelc.pdf). *IEEE Computer Magazine*, volume 45, issue 2, pages 37–42, February 2012. [doi:10.1109/MC.2012.33](https://doi.org/10.1109/MC.2012.33) 
[^43]: Nancy A. Lynch. [A Hundred Impossibility Proofs for Distributed Computing](https://groups.csail.mit.edu/tds/papers/Lynch/podc89.pdf). At *8th ACM Symposium on Principles of Distributed Computing* (PODC), August 1989. [doi:10.1145/72981.72982](https://doi.org/10.1145/72981.72982) 
[^44]: Prince Mahajan, Lorenzo Alvisi, and Mike Dahlin. [Consistency, Availability, and Convergence](https://apps.cs.utexas.edu/tech_reports/reports/tr/TR-2036.pdf). University of Texas at Austin, Department of Computer Science, Tech Report UTCS TR-11-22, May 2011. Archived at [perma.cc/SAV8-9JAJ](https://perma.cc/SAV8-9JAJ) 
[^45]: Hagit Attiya, Faith Ellen, and Adam Morrison. [Limitations of Highly-Available Eventually-Consistent Data Stores](https://www.cs.tau.ac.il/~mad/publications/podc2015-replds.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), July 2015. [doi:10.1145/2767386.2767419](https://doi.org/10.1145/2767386.2767419) 
[^46]: Peter Sewell, Susmit Sarkar, Scott Owens, Francesco Zappa Nardelli, and Magnus O. Myreen. [x86-TSO: A Rigorous and Usable Programmer’s Model for x86 Multiprocessors](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf). *Communications of the ACM*, volume 53, issue 7, pages 89–97, July 2010. [doi:10.1145/1785414.1785443](https://doi.org/10.1145/1785414.1785443) 
[^47]: Martin Thompson. [Memory Barriers/Fences](https://mechanical-sympathy.blogspot.com/2011/07/memory-barriersfences.html). *mechanical-sympathy.blogspot.co.uk*, July 2011. Archived at [perma.cc/7NXM-GC5U](https://perma.cc/7NXM-GC5U) 
[^48]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ) 
[^49]: Hagit Attiya and Jennifer L. Welch. [Sequential Consistency Versus Linearizability](https://courses.csail.mit.edu/6.852/01/papers/p91-attiya.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 12, issue 2, pages 91–122, May 1994. [doi:10.1145/176575.176576](https://doi.org/10.1145/176575.176576) 
[^50]: Kyzer R. Davis, Brad G. Peabody, and Paul J. Leach. [Universally Unique IDentifiers (UUIDs)](https://www.rfc-editor.org/rfc/rfc9562). RFC 9562, IETF, May 2024. 
[^51]: Ryan King. [Announcing Snowflake](https://blog.x.com/engineering/en_us/a/2010/announcing-snowflake). *blog.x.com*, June 2010. Archived at [archive.org](https://web.archive.org/web/20241128214604/https%3A//blog.x.com/engineering/en_us/a/2010/announcing-snowflake) 
[^52]: Alizain Feerasta. [Universally Unique Lexicographically Sortable Identifier](https://github.com/ulid/spec). *github.com*, 2016. Archived at [perma.cc/NV2Y-ZP8U](https://perma.cc/NV2Y-ZP8U) 
[^53]: Rob Conery. [A Better ID Generator for PostgreSQL](https://bigmachine.io/2014/05/29/a-better-id-generator-for-postgresql/). *bigmachine.io*, May 2014. Archived at [perma.cc/K7QV-3KFC](https://perma.cc/K7QV-3KFC) 
[^54]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563) 
[^55]: Sandeep S. Kulkarni, Murat Demirbas, Deepak Madeppa, Bharadwaj Avva, and Marcelo Leone. [Logical Physical Clocks](https://cse.buffalo.edu/~demirbas/publications/hlc.pdf). At *18th International Conference on Principles of Distributed Systems* (OPODIS), December 2014. [doi:10.1007/978-3-319-14472-6\_2](https://doi.org/10.1007/978-3-319-14472-6_2)
[^56]: Manuel Bravo, Nuno Diegues, Jingna Zeng, Paolo Romano, and Luís Rodrigues. [On the use of Clocks to Enforce Consistency in the Cloud](http://sites.computer.org/debull/A15mar/p18.pdf). *IEEE Data Engineering Bulletin*, volume 38, issue 1, pages 18–31, March 2015. Archived at [perma.cc/68ZU-45SH](https://perma.cc/68ZU-45SH) 
[^57]: Daniel Peng and Frank Dabek. [Large-Scale Incremental Processing Using Distributed Transactions and Notifications](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Peng.pdf). At *9th USENIX Conference on Operating Systems Design and Implementation* (OSDI), October 2010. 
[^58]: Tushar Deepak Chandra, Robert Griesemer, and Joshua Redstone. [Paxos Made Live – An Engineering Perspective](https://www.read.seas.harvard.edu/~kohler/class/08w-dsi/chandra07paxos.pdf). At *26th ACM Symposium on Principles of Distributed Computing* (PODC), June 2007. [doi:10.1145/1281100.1281103](https://doi.org/10.1145/1281100.1281103) 
[^59]: Will Portnoy. [Lessons Learned from Implementing Paxos](https://blog.willportnoy.com/2012/06/lessons-learned-from-paxos.html). *blog.willportnoy.com*, June 2012. Archived at [perma.cc/QHD9-FDD2](https://perma.cc/QHD9-FDD2) 
[^60]: Brian M. Oki and Barbara H. Liskov. [Viewstamped Replication: A New Primary Copy Method to Support Highly-Available Distributed Systems](https://pmg.csail.mit.edu/papers/vr.pdf). At *7th ACM Symposium on Principles of Distributed Computing* (PODC), August 1988. [doi:10.1145/62546.62549](https://doi.org/10.1145/62546.62549) 
[^61]: Barbara H. Liskov and James Cowling. [Viewstamped Replication Revisited](https://pmg.csail.mit.edu/papers/vr-revisited.pdf). Massachusetts Institute of Technology, Tech Report MIT-CSAIL-TR-2012-021, July 2012. Archived at [perma.cc/56SJ-WENQ](https://perma.cc/56SJ-WENQ) 
[^62]: Leslie Lamport. [The Part-Time Parliament](https://www.microsoft.com/en-us/research/publication/part-time-parliament/). *ACM Transactions on Computer Systems*, volume 16, issue 2, pages 133–169, May 1998. [doi:10.1145/279227.279229](https://doi.org/10.1145/279227.279229) 
[^63]: Leslie Lamport. [Paxos Made Simple](https://www.microsoft.com/en-us/research/publication/paxos-made-simple/). *ACM SIGACT News*, volume 32, issue 4, pages 51–58, December 2001. Archived at [perma.cc/82HP-MNKE](https://perma.cc/82HP-MNKE) 
[^64]: Robbert van Renesse and Deniz Altinbuken. [Paxos Made Moderately Complex](https://people.cs.umass.edu/~arun/590CC/papers/paxos-moderately-complex.pdf). *ACM Computing Surveys* (CSUR), volume 47, issue 3, article no. 42, February 2015. [doi:10.1145/2673577](https://doi.org/10.1145/2673577) 
[^65]: Diego Ongaro. [Consensus: Bridging Theory and Practice](https://github.com/ongardie/dissertation). PhD Thesis, Stanford University, August 2014. Archived at [perma.cc/5VTZ-2ADH](https://perma.cc/5VTZ-2ADH) 
[^66]: Heidi Howard, Malte Schwarzkopf, Anil Madhavapeddy, and Jon Crowcroft. [Raft Refloated: Do We Have Consensus?](https://www.cl.cam.ac.uk/research/srg/netos/papers/2015-raftrefloated-osr.pdf) *ACM SIGOPS Operating Systems Review*, volume 49, issue 1, pages 12–21, January 2015. [doi:10.1145/2723872.2723876](https://doi.org/10.1145/2723872.2723876) 
[^67]: André Medeiros. [ZooKeeper’s Atomic Broadcast Protocol: Theory and Practice](http://www.tcs.hut.fi/Studies/T-79.5001/reports/2012-deSouzaMedeiros.pdf). Aalto University School of Science, March 2012. Archived at [perma.cc/FVL4-JMVA](https://perma.cc/FVL4-JMVA) 
[^68]: Robbert van Renesse, Nicolas Schiper, and Fred B. Schneider. [Vive La Différence: Paxos vs. Viewstamped Replication vs. Zab](https://arxiv.org/abs/1309.5671). *IEEE Transactions on Dependable and Secure Computing*, volume 12, issue 4, pages 472–484, September 2014. [doi:10.1109/TDSC.2014.2355848](https://doi.org/10.1109/TDSC.2014.2355848) 
[^69]: Heidi Howard and Richard Mortier. [Paxos vs Raft: Have we reached consensus on distributed consensus?](https://arxiv.org/abs/2004.05074) At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393681](https://doi.org/10.1145/3380787.3393681)
[^70]: Miguel Castro and Barbara H. Liskov. [Practical Byzantine Fault Tolerance and Proactive Recovery](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/01/p398-castro-bft-tocs.pdf). *ACM Transactions on Computer Systems*, volume 20, issue 4, pages 396–461, November 2002. [doi:10.1145/571637.571640](https://doi.org/10.1145/571637.571640) 
[^71]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458) 
[^72]: Michael J. Fischer, Nancy Lynch, and Michael S. Paterson. [Impossibility of Distributed Consensus with One Faulty Process](https://groups.csail.mit.edu/tds/papers/Lynch/jacm85.pdf). *Journal of the ACM*, volume 32, issue 2, pages 374–382, April 1985. [doi:10.1145/3149.214121](https://doi.org/10.1145/3149.214121) 
[^73]: Tushar Deepak Chandra and Sam Toueg. [Unreliable Failure Detectors for Reliable Distributed Systems](https://courses.csail.mit.edu/6.852/08/papers/CT96-JACM.pdf). *Journal of the ACM*, volume 43, issue 2, pages 225–267, March 1996. [doi:10.1145/226643.226647](https://doi.org/10.1145/226643.226647) 
[^74]: Michael Ben-Or. [Another Advantage of Free Choice: Completely Asynchronous Agreement Protocols](https://homepage.cs.uiowa.edu/~ghosh/BenOr.pdf). At *2nd ACM Symposium on Principles of Distributed Computing* (PODC), August 1983. [doi:10.1145/800221.806707](https://doi.org/10.1145/800221.806707) 
[^75]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283) 
[^76]: Xavier Défago, André Schiper, and Péter Urbán. [Total Order Broadcast and Multicast Algorithms: Taxonomy and Survey](https://dspace.jaist.ac.jp/dspace/bitstream/10119/4883/1/defago_et_al.pdf). *ACM Computing Surveys*, volume 36, issue 4, pages 372–421, December 2004. [doi:10.1145/1041680.1041682](https://doi.org/10.1145/1041680.1041682) 
[^77]: Hagit Attiya and Jennifer Welch. *Distributed Computing: Fundamentals, Simulations and Advanced Topics*, 2nd edition. John Wiley & Sons, 2004. ISBN: 978-0-471-45324-6, [doi:10.1002/0471478210](https://doi.org/10.1002/0471478210) 
[^78]: Rachid Guerraoui. [Revisiting the Relationship Between Non-Blocking Atomic Commitment and Consensus](https://citeseerx.ist.psu.edu/pdf/5d06489503b6f791aa56d2d7942359c2592e44b0). At *9th International Workshop on Distributed Algorithms* (WDAG), September 1995. [doi:10.1007/BFb0022140](https://doi.org/10.1007/BFb0022140) 
[^79]: Jim N. Gray and Leslie Lamport. [Consensus on Transaction Commit](https://dsf.berkeley.edu/cs286/papers/paxoscommit-tods2006.pdf). *ACM Transactions on Database Systems* (TODS), volume 31, issue 1, pages 133–160, March 2006. [doi:10.1145/1132863.1132867](https://doi.org/10.1145/1132863.1132867) 
[^80]: Fred B. Schneider. [Implementing Fault-Tolerant Services Using the State Machine Approach: A Tutorial](https://www.cs.cornell.edu/fbs/publications/SMSurvey.pdf). *ACM Computing Surveys*, volume 22, issue 4, pages 299–319, December 1990. [doi:10.1145/98163.98167](https://doi.org/10.1145/98163.98167) 
[^81]: Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, Kun Ren, Philip Shao, and Daniel J. Abadi. [Calvin: Fast Distributed Transactions for Partitioned Database Systems](https://cs.yale.edu/homes/thomson/publications/calvin-sigmod12.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2012. [doi:10.1145/2213836.2213838](https://doi.org/10.1145/2213836.2213838) 
[^82]: Mahesh Balakrishnan, Dahlia Malkhi, Ted Wobber, Ming Wu, Vijayan Prabhakaran, Michael Wei, John D. Davis, Sriram Rao, Tao Zou, and Aviad Zuck. [Tango: Distributed Data Structures over a Shared Log](https://www.microsoft.com/en-us/research/publication/tango-distributed-data-structures-over-a-shared-log/). At *24th ACM Symposium on Operating Systems Principles* (SOSP), November 2013. [doi:10.1145/2517349.2522732](https://doi.org/10.1145/2517349.2522732) 
[^83]: Mahesh Balakrishnan, Dahlia Malkhi, Vijayan Prabhakaran, Ted Wobber, Michael Wei, and John D. Davis. [CORFU: A Shared Log Design for Flash Clusters](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final30.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012. 
[^84]: Vasilis Gavrielatos, Antonios Katsarakis, and Vijay Nagarajan. [Odyssey: the impact of modern hardware on strongly-consistent replication protocols](https://vasigavr1.github.io/files/Odyssey_Eurosys_2021.pdf). At *16th European Conference on Computer Systems* (EuroSys), April 2021. [doi:10.1145/3447786.3456240](https://doi.org/10.1145/3447786.3456240) 
[^85]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/opus/volltexte/2017/7094/pdf/LIPIcs-OPODIS-2016-25.pdf). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25) 
[^86]: Martin Kleppmann. [Distributed Systems lecture notes](https://www.cl.cam.ac.uk/teaching/2425/ConcDisSys/dist-sys-notes.pdf). *University of Cambridge*, October 2024. Archived at [perma.cc/SS3Q-FNS5](https://perma.cc/SS3Q-FNS5) 
[^87]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch 1.5.0](https://aphyr.com/posts/323-call-me-maybe-elasticsearch-1-5-0). *aphyr.com*, April 2015. Archived at [perma.cc/37MZ-JT7H](https://perma.cc/37MZ-JT7H) 
[^88]: Heidi Howard and Jon Crowcroft. [Coracle: Evaluating Consensus at the Internet Edge](https://conferences.sigcomm.org/sigcomm/2015/pdf/papers/p85.pdf). At *Annual Conference of the ACM Special Interest Group on Data Communication* (SIGCOMM), August 2015. [doi:10.1145/2829988.2790010](https://doi.org/10.1145/2829988.2790010) 
[^89]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY) 
[^90]: Ivan Kelly. [BookKeeper Tutorial](https://github.com/ivankelly/bookkeeper-tutorial). *github.com*, October 2014. Archived at [perma.cc/37Y6-VZWU](https://perma.cc/37Y6-VZWU) 
[^91]: Jack Vanlightly. [Apache BookKeeper Insights Part 1 — External Consensus and Dynamic Membership](https://medium.com/splunk-maas/apache-bookkeeper-insights-part-1-external-consensus-and-dynamic-membership-c259f388da21). *medium.com*, November 2021. Archived at [perma.cc/3MDB-8GFB](https://perma.cc/3MDB-8GFB)


================================================
FILE: content/zh/ch11.md
================================================
---
title: "第十一章：批处理"
linkTitle: "11. 批处理"
weight: 311
breadcrumbs: false
---

<a id="ch_batch"></a>

![](/map/ch10.png)

> *带有太强个人色彩的系统无法成功。当最初的设计完成并且相对稳健时，真正的考验才刚开始：此后会有许多持不同观点的人做出各自的实验。*
>
> 高德纳

到目前为止，本书大部分内容都围绕着 *请求（request）* 与 *查询（query）* 以及对应的 *响应（response）* 或 *结果（result）* 展开。现代很多数据系统都默认采用这种处理方式：你发出请求或指令，系统尽快给出答案。

网页浏览器请求页面、服务调用远程 API、数据库、缓存、搜索索引，以及很多其他系统都如此运作。我们称这类系统为 *在线系统（online systems）*。它们通常以响应时间作为主要性能指标，并且往往需要良好的容错能力来保证高可用。

但有时候，你需要执行的计算比一次交互式请求大得多，或者要处理的数据量远超单次请求能承载的范围。例如训练 AI 模型、把海量数据从一种形式转换成另一种形式、或者在超大数据集上做分析计算。我们把这类任务称为 *批处理（batch processing）* 作业，有时也称为 *离线系统（offline systems）*。

批处理作业读取一批输入数据（只读），并生成一批输出数据（每次运行都从头生成）。它通常不会像读写事务那样原地修改数据。因此，输出是由输入推导出的 *派生数据（derived data）*（见[“记录系统与派生数据”](/ch1#sec_introduction_derived)）：如果不满意输出，你可以直接删除它，修改作业逻辑，再跑一遍即可。把输入视为不可变并尽量避免副作用（例如直接写外部数据库），不仅有助于性能，也带来其他好处：

- 如果你在代码中引入了 bug 导致输出错误或损坏，可以直接回滚代码并重跑作业，输出就会恢复正确。更简单的做法是把旧输出保留在另一个目录，直接切回旧版本。多数对象存储与开放表格式（见[“云数据仓库”](/ch4#sec_cloud_data_warehouses)）都支持这种能力，通常称为 *时间旅行（time travel）*。大多数支持读写事务的数据库不具备这种特性：如果错误代码把坏数据写进数据库，仅回滚代码并不能修复已写入的数据。能够从错误代码中恢复的能力被称为 *容忍人为失误* [^1]。

- 因为回滚容易，功能开发能比“犯错会造成不可逆损害”的环境更快推进。这个 *最小化不可逆性* 的原则对敏捷开发非常有益 [^2]。

- 同一组文件可以作为多种作业的输入，包括监控类作业：例如计算指标、验证输出是否符合预期（如与上一次结果比较并度量偏差）。

- 批处理框架能更高效地利用计算资源。虽然也可以用 OLTP 数据库和应用服务器等在线系统做批处理，但资源成本通常显著更高。

批处理也有挑战。多数框架中，作业只有在整体完成后，其输出才能被下游进一步处理。批处理也可能低效：输入哪怕只变动一个字节，也可能需要重算整个输入数据集。尽管如此，批处理在大量场景中依然非常有用，我们会在[“批处理用例”](#sec_batch_output)中回到这个话题。

批处理作业可能运行很久：几分钟、几小时甚至几天。很多作业是周期调度的（例如每天一次）。它的核心性能指标通常是吞吐量：单位时间能处理多少数据。有些批处理系统通过“中止并整体重启”应对故障，也有些具备更细粒度容错能力，可以在部分节点崩溃时仍让作业完成。

> [!NOTE]
> 批处理的另一种替代形态是 *流处理（stream processing）*：作业不会在“处理完输入后结束”，而是持续监听输入，并在变化发生后很快处理。我们将在[第十二章](/ch12#ch_stream)讨论流处理。

在线处理与批处理的边界并不总是清晰：一个运行很久的数据库查询，看起来也很像批处理过程。但批处理有一些独特特性，使其成为构建可靠、可伸缩、可维护应用的重要积木。例如，它常在 *数据集成（data integration）* 中发挥作用，即把多个数据系统组合起来完成单一系统做不到的事。ETL（见[“数据仓库”](/ch1#sec_introduction_dwh)）就是典型例子。

现代批处理深受 MapReduce 影响。Google 在 2004 年发表了这一批处理算法 [^3]，随后 Hadoop、CouchDB、MongoDB 等开源系统都实现了它。MapReduce 是相对底层的编程模型，其能力不如数据仓库中的并行查询执行引擎成熟 [^4] [^5]。它在诞生时确实让商用硬件上的处理规模跃升一大步，但今天已大体过时，Google 内部也不再使用 [^6] [^7]。

如今批处理更常通过 Spark、Flink 或数据仓库查询引擎完成。它们与 MapReduce 一样高度依赖分片（见[第七章](/ch7#ch_sharding)）和并行执行，但缓存与执行策略更成熟。随着这些系统走向成熟，运维问题已大幅缓解，重点转向易用性：数据流 API、查询语言、DataFrame API 得到广泛支持；任务与工作流编排也显著进化。以 Hadoop 为中心的 Oozie、Azkaban 等调度器，正被 Airflow、Dagster、Prefect 这类更通用方案替代，它们可协调多种批处理框架与云数据仓库。

云计算已无处不在。批处理存储层也正在从 HDFS、GlusterFS、CephFS 这类分布式文件系统（DFS）向 S3 等对象存储迁移。BigQuery、Snowflake 这类可伸缩云数据仓库，正在模糊“数据仓库”和“批处理系统”之间的边界。

为了建立直觉，本章先从单机 Unix 工具示例出发，再扩展到分布式多机处理。你会看到，分布式批处理框架在很多方面很像操作系统：它也有调度器和文件系统。随后我们会讨论编写批处理作业的几种处理模型，最后给出常见应用场景。

## 使用 Unix 工具的批处理 {#sec_batch_unix}

假设你有一台 Web 服务器，每处理一个请求就在日志文件末尾追加一行。例如，使用 nginx 默认访问日志格式，一行可能像这样：

    216.58.210.78 - - [27/Jun/2025:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
    200 3377 "https://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
    10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"

（实际上这是一行，这里为了阅读方便换了行。）这一行包含了很多信息。要正确解释它，你需要日志格式定义：

    $remote_addr - $remote_user [$time_local] "$request"
    $status $body_bytes_sent "$http_referer" "$http_user_agent"

这表示：UTC 时间 2025 年 6 月 27 日 17:55:11，服务器收到来自客户端 IP `216.58.210.78` 对 `/css/typography.css` 的请求。用户未认证，因此 `$remote_user` 是连字符（`-`）。响应状态码是 200（成功），响应体大小 3,377 字节。浏览器是 Chrome 137，该文件是从页面 [https://martin.kleppmann.com/](https://martin.kleppmann.com/) 引用而来。

看起来“解析日志”有点朴素，但它在现代科技公司里是核心能力之一，从广告流水线到支付处理都大量依赖。事实上，这也是 MapReduce 与“大数据”浪潮快速兴起的重要推动力。

### 简单日志分析 {#sec_batch_log_analysis}

很多工具都能从日志生成漂亮的网站流量报告。这里为了练手，我们只用基础 Unix 工具自己做一个。比如你想找出网站最受欢迎的五个页面，可以在 shell 中这样做：

```bash
cat /var/log/nginx/access.log | #1
  awk '{print $7}' | #2
  sort             | #3
  uniq -c          | #4
  sort -r -n       | #5
  head -n 5          #6
```

1. 读取日志文件。（严格说这里不需要 `cat`，可直接把文件作为 `awk` 参数；但这样写更直观看出线性管道。）
2. 以空白字符切分每行，只输出第 7 个字段，也就是请求 URL。上面的样例中是 `/css/typography.css`。
3. 按字典序对 URL 排序。某个 URL 若出现 *n* 次，排序后会连续出现 *n* 行。
4. `uniq` 通过比较相邻两行是否相同来去重。`-c` 让它输出计数：每个不同 URL 出现了多少次。
5. 第二次 `sort` 按每行开头的数字（`-n`）排序，并用 `-r` 逆序，出现次数最多的排在最前。
6. `head` 只保留前 5 行（`-n 5`），丢弃其余。

输出大致如下：

```
    4189 /favicon.ico
    3631 /2016/02/08/how-to-do-distributed-locking.html
    2124 /2020/11/18/distributed-systems-and-elliptic-curves.html
    1369 /
     915 /css/typography.css
```

如果你不熟悉 Unix 工具，这条命令看起来可能有点晦涩，但它威力很强。它能在几秒内处理 GB 级日志，而且修改分析逻辑也非常方便：例如要排除 CSS 文件，可把 `awk` 参数改成 `'$7 !~ /\.css$/ {print $7}'`；若要统计访问最多的客户端 IP，把 `awk` 参数改成 `'{print $1}'` 即可。

本书篇幅有限，无法展开讲 Unix 工具，但它们非常值得学。令人惊讶的是，仅靠 `awk`、`sed`、`grep`、`sort`、`uniq`、`xargs` 的组合，就能在几分钟内做出很多数据分析，并且性能相当好 [^8]。

### 命令链与自定义程序 {#sec_batch_custom_program}

你也可以不用 Unix 管道，而写个小程序完成同样的事。比如用 Python：

```python
from collections import defaultdict

counts = defaultdict(int) #1

with open('/var/log/nginx/access.log', 'r') as file:
    for line in file:
        url = line.split()[6] #2
        counts[url] += 1 #3

top5 = sorted(((count, url) for url, count in counts.items()), reverse=True)[:5] #4

for count, url in top5:  #5
    print(f"{count} {url}")
```

1. `counts` 是散列表，记录每个 URL 出现次数，默认值为 0。
2. 每行按空白字符切分，取第 7 个字段作为 URL（Python 数组从 0 开始，所以索引是 6）。
3. 当前行对应 URL 的计数器加一。
4. 按计数降序排序，取前五项。
5. 打印前五项。

这个程序不如 Unix 管道简洁，但可读性也不错，偏好取决于习惯。不过两者除了语法差异，执行流程也很不一样；在大文件上运行时，这种差异会很明显。

### 排序与内存聚合 {#id275}

Python 脚本在内存里维护了一个“URL -> 出现次数”的散列表。Unix 管道示例没有这种散列表，而是通过排序把同一 URL 的多次出现排到一起。

哪种方法更好？取决于不同 URL 的数量。对多数中小网站而言，通常可以把所有不同 URL 及其计数器放进（比如）1GB 内存。这个作业的 *工作集（working set）*（需要随机访问的内存规模）只取决于不同 URL 的个数：即便一百万条日志都指向同一 URL，散列表也只存一个 URL 和一个计数器。工作集足够小时，内存散列表很好用，笔记本都能跑。

但如果工作集大于可用内存，排序法就有优势：它能高效使用磁盘。这与[“日志结构存储”](/ch4#sec_storage_log_structured)中的原理一样：先在内存对数据块排序并写成段文件，再把多个有序段合并成更大的有序文件。归并排序的顺序访问模式对磁盘很友好（见[“SSD 上的顺序写与随机写”](/ch4#sidebar_sequential)）。

GNU Coreutils（Linux）中的 `sort` 能自动把超内存数据溢写到磁盘，并自动利用多核并行排序 [^9]。这意味着前面的 Unix 命令链可以自然扩展到大数据集而不耗尽内存，瓶颈通常变成磁盘读取输入文件的速率。

Unix 工具的一个局限是它们只在单机运行。当数据大到单机内存或本地磁盘都放不下时，就需要分布式批处理框架。

## 分布式系统中的批处理 {#sec_batch_distributed}

在前面的 Unix 示例中，单机有几个协同组件在处理日志：

- 通过操作系统文件系统接口访问的存储设备。
- 决定进程何时运行、如何分配 CPU 资源的调度器。
- 一串通过管道把 `stdin`/`stdout` 连接起来的 Unix 程序。

分布式批处理框架也有对应组件。某种意义上，你可以把分布式处理框架看成“分布式操作系统”：它有文件系统、有任务调度器，还有通过文件系统或其他通道互相传递数据的程序。

### 分布式文件系统 {#sec_batch_dfs}

操作系统提供的文件系统由多层组成：

- 最底层是块设备驱动，直接与磁盘交互，向上层提供原始块读写。
- 块层之上是页缓存，缓存最近访问块以提升读取速度。
- 块 API 之上是文件系统层，负责把大文件切块，并维护 inode、目录、文件等元数据。Linux 常见实现如 ext4、XFS。
- 最上层，操作系统通过统一 API（虚拟文件系统，VFS）向应用暴露不同文件系统，让应用以统一方式读写底层不同实现。

分布式文件系统（DFS）工作方式很类似：文件被切成块并分散到多台机器。DFS 的块通常比本地文件系统大得多：HDFS 默认 128MB，JuiceFS 和许多对象存储常用 4MB，而 ext4 默认块通常是 4096 字节。块越大，需要维护的元数据越少，这对 PB 级数据非常关键；同时寻道开销占比也更低。

大多数物理存储设备不能做“部分块写入”，即使数据不足一个块也得写满块。DFS 的块更大且通常构建在操作系统文件系统之上，因此一般没有这个约束。比如一个 900MB 文件在 128MB 分块下，会有 7 个 128MB 块和 1 个 4MB 块。

读取 DFS 块需要通过网络请求到持有该块的集群节点。每台机器都运行守护进程，对外提供 API，使远程进程能把本地文件系统中的块当作文件读写。HDFS 把这些守护进程叫 DataNode，GlusterFS 叫 glusterfsd。后文统称 *数据节点（data node）*。

DFS 也实现了“分布式版本”的页缓存。因为 DFS 块作为文件存放在数据节点本地，读写会经过数据节点操作系统，自带内存页缓存，热门块会被缓存在内存中。某些 DFS 还提供更多缓存层，例如 JuiceFS 的客户端缓存和本地磁盘缓存。

像 ext4/XFS 这样的文件系统会维护空闲空间、块位置、目录结构、权限等元数据。DFS 同样需要记录“文件块分布在哪些机器”“权限如何”等信息。Hadoop 使用 NameNode 维护集群元数据；DeepSeek 的 3FS 使用元数据服务并把元数据持久化到 FoundationDB 之类键值存储。

在文件系统之上是 VFS。批处理系统里最接近它的是 DFS 协议：批处理框架需要通过协议/接口来读写存储。只要实现协议，就能作为可插拔存储接入。例如 S3 API 已被 MinIO、Cloudflare R2、Tigris、Backblaze B2 等大量系统兼容支持。具备 S3 支持的批处理系统通常可直接使用这些存储。

有些 DFS 还提供 POSIX 兼容文件系统，让操作系统 VFS 把它当普通文件系统。常见集成方式是 FUSE 或 NFS 协议。NFS 可能是最知名的分布式文件系统协议，最初用于让多个客户端读写单个服务器上的数据。后来 AWS EFS、Archil 等提供了更可伸缩的 NFS 兼容实现。NFS 客户端虽仍连到一个端点，但底层会与分布式元数据服务和数据节点交互完成读写。

> [!TIP] 分布式文件系统与网络存储
> 分布式文件系统基于 *无共享（shared-nothing）* 原则（见[“共享内存、共享磁盘与无共享架构”](/ch2#sec_introduction_shared_nothing)），与 NAS（网络附加存储）和 SAN（存储区域网络）等 *共享磁盘* 方案形成对照。共享磁盘通常依赖集中式存储设备、定制硬件和专用网络（如光纤通道）；无共享方案不要求专用硬件，只需普通数据中心网络互联的机器。

很多 DFS 构建在商用硬件上，成本更低但故障率高于企业级专用硬件。为容忍机器和磁盘故障，文件块通常复制到多台机器。这也让调度器更容易均衡负载：任务可在任一持有输入副本的节点运行。复制可以是多副本（见[第六章](/ch6#ch_replication)），也可以是 Reed-Solomon 等 *纠删码* 方案，以更低存储开销恢复丢失数据 [^10] [^11] [^12]。这与 RAID 思想类似，只是 RAID 面向同一机器上的多块磁盘，而 DFS 是通过普通数据中心网络跨机器做访问和复制。

### 对象存储 {#id277}

Amazon S3、Google Cloud Storage、Azure Blob Storage、OpenStack Swift 等对象存储，已成为批处理场景中对 DFS 的主流替代。实际上两者边界越来越模糊：正如前一节和[“由对象存储支撑的数据库”](/ch6#sec_replication_object_storage)所述，FUSE 可以把 S3 这类对象存储“挂载成文件系统”；JuiceFS、Ceph 等系统也同时提供对象 API 与文件系统 API。但这些接口、性能、以及一致性保证差异很大，即便 API 看似兼容，也需要仔细验证行为是否符合预期。

对象存储中的每个对象有一个 URL，例如 `s3://my-photo-bucket/2025/04/01/birthday.png`。其中主机部分（`my-photo-bucket`）是 bucket 名，后半部分是对象 *键（key）*（示例里是 `/2025/04/01/birthday.png`）。bucket 名全局唯一；对象键在 bucket 内必须唯一。

对象读取用 `get`，写入用 `put`。与文件系统文件不同，对象写入后通常不可变；更新对象需要通过 `put` 全量重写，类似键值存储。Azure Blob Storage 和 S3 Express One Zone 支持追加，但多数对象存储不支持。它也没有 `fopen`、`fseek` 这类文件句柄 API。

对象看起来像按目录组织，这很容易让人误解：对象存储并没有真正目录概念。所谓路径只是约定，斜杠也是 key 的一部分。这个约定允许你按前缀列出对象，类似“目录列表”，但与文件系统目录列举有两点不同：

- 前缀 `list` 行为更像 Unix 的递归 `ls -R`：会返回所有以该前缀开头的对象，包括“子路径”下的对象。
- 不存在“空目录”。如果你删除了 `s3://my-photo-bucket/2025/04/01` 下所有对象，再列 `s3://my-photo-bucket/2025/04` 时就看不到 `01`。常见做法是创建 0 字节对象表示空目录（如创建空对象 `s3://my-photo-bucket/2025/04/01` 以保留目录占位）。

DFS 常支持硬链接、符号链接、文件锁、原子重命名等文件系统操作，而对象存储通常缺失这些能力：链接和锁大多不支持；重命名也非原子，通常是“复制到新 key，再删除旧 key”。若要“重命名目录”，因为目录名是 key 的一部分，实际上要逐个对象重命名。

[第四章](/ch4#ch_storage)讨论的键值存储通常面向小值（通常 KB 级）和高频低延迟读写。相比之下，DFS 和对象存储通常优化的是大对象（MB 到 GB）和低频大块读写。不过近年对象存储也在增强小对象高频访问能力，例如 S3 Express One Zone 已提供单毫秒级延迟，计费模型也更接近键值存储。

DFS 与对象存储另一个区别是：HDFS 等 DFS 可把计算任务调度到持有文件副本的机器上，让任务本地读文件，减少网络传输（当任务代码远小于待读文件时尤其划算）。对象存储通常把存储和计算解耦，虽然可能用更多带宽，但现代数据中心网络很快，通常可接受。同时这种解耦让 CPU/内存与存储容量可以独立扩展。

### 分布式作业编排 {#id278}

前面的“操作系统类比”同样适用于作业编排。在单机上跑 Unix 批处理任务时，总得有东西真正去执行 `awk`、`sort`、`uniq`、`head` 进程；需要把一个进程输出送到另一个进程输入；要给每个进程分配内存；公平调度 CPU 指令；隔离内存与 I/O 边界，等等。单机里这由操作系统内核负责；分布式环境里，这就是作业编排器（orchestrator）的职责。

批处理框架会向编排器的调度器发起“运行作业”请求。请求通常包含如下元数据：

- 需要执行的任务数量；
- 每个任务所需内存、CPU、磁盘；
- 作业标识符；
- 访问凭据；
- 输入输出等作业参数；
- 所需硬件信息（如 GPU、磁盘类型）；
- 作业可执行代码的位置。

Kubernetes、Hadoop YARN（Yet Another Resource Negotiator）[^13] 等编排器会结合这些请求与集群状态，依靠以下组件执行任务：

任务执行器（Task executors）

:   每个节点上运行执行器守护进程，例如 YARN 的 *NodeManager* 或 Kubernetes 的 *kubelet*。执行器负责拉起任务、通过心跳上报存活状态、跟踪节点上的任务状态与资源占用。收到“启动任务”请求后，执行器会获取作业代码并执行启动命令；随后持续监控进程直至结束或失败，并更新对应状态元数据。

    很多执行器还配合操作系统实现安全与性能隔离，例如 YARN 和 Kubernetes 都会使用 Linux *cgroups*。这样可防止任务越权访问数据，或因资源滥用影响同机其他任务。

资源管理器（Resource Manager）

:   资源管理器维护各节点元数据：可用硬件（CPU、GPU、内存、磁盘等）、任务状态、网络位置、节点健康状态等，从而形成全局视图。其中心化特性可能成为可用性和可伸缩性瓶颈。YARN 借助 ZooKeeper，Kubernetes 借助 etcd 存储集群状态（见[“协调服务”](/ch10#sec_consistency_coordination)）。

调度器（Scheduler）

:   编排器通常包含中心化调度子系统，接收启动/停止作业与状态查询请求。例如收到“启动 10 个任务，使用指定 Docker 镜像，且必须运行在某类 GPU 节点上”的请求后，调度器会基于请求和资源管理器状态决定“哪些任务跑在哪些节点”，再通知执行器执行。

不同编排器命名各异，但几乎都具备这些核心组件。

> [!NOTE]
> 有些调度决策需要“应用特定调度器”参与，才能考虑更具体的业务约束，例如当查询量达到阈值时自动扩容只读副本。中心调度器与应用调度器协同决定如何执行任务。YARN 把这类子调度器称为 *ApplicationMaster*，Kubernetes 通常称为 *operator*。

#### 资源分配 {#id279}

调度器在编排系统中最具挑战的职责之一，就是在资源有限且作业需求冲突时，做出合理分配。它本质上是在公平与效率之间做平衡。

假设一个小集群有 5 个节点，共 160 个 CPU 核。调度器收到两个作业请求，每个都想要 100 核。怎么排最好？

- 可以给每个作业先分 80 个任务，剩余 20 个等前面的任务结束后再启动。
- 也可以先跑完其中一个作业，再等 100 核都空出来后跑另一个。这叫 *gang scheduling*（成组调度）。
- 如果一个请求先到，调度器还要决定是立即把 100 核都给它，还是为未来请求预留一部分资源。

这是很简化的例子，但已经能看到艰难权衡。以成组调度为例，如果调度器为了凑齐 100 核而长期预留资源，节点会闲置，资源利用率下降，若其他作业也在抢占式预留，还可能死锁。

反过来，如果只是被动等 100 核“自然可用”，中间可能被别的作业拿走，导致长时间凑不齐，从而产生 *饥饿（starvation）*。调度器也可以 *抢占（preempt）* 一部分先到作业任务，把它们杀掉给后到作业腾资源；但被杀任务之后还要重跑，整体效率同样下降。

把这个问题放大到数百甚至数百万个请求，想求全局最优几乎不可行。事实上这是 *NP-hard* 问题：除了很小规模，很难在可接受时间内算出最优解 [^14] [^15]。

因此工程上调度器通常采用启发式方法，在非最优前提下做“足够好”的决策。常见算法包括 FIFO、主导资源公平（DRF）、优先级队列、容量/配额调度、各种装箱算法等。细节超出本书范围，但这是非常有趣的研究领域。

#### 工作流调度 {#sec_batch_workflows}

本章开头的 Unix 示例是多个命令串联。分布式批处理中同样常见：一个作业输出要成为一个或多个后续作业输入，而每个作业又可能依赖多个上游输入。这个依赖结构称为 *工作流（workflow）* 或 *有向无环图（DAG）*。

> [!NOTE]
> 我们在[“持久化执行与工作流”](/ch5#sec_encoding_dataflow_workflows)中讨论过“按步骤执行 RPC”的工作流引擎；在批处理语境里，“工作流”指的是一串批处理过程：每一步读输入、产输出，通常不直接对外做 RPC。持久化执行引擎通常单次请求处理的数据量小于批处理系统，但两者边界并非绝对。

需要多作业工作流常见有以下原因：

- 一个作业输出可能被多个团队维护的下游作业消费。此时先把输出写到公共位置更合理，下游可按“数据更新触发”或定时方式运行。
- 你可能要在多个处理工具间传递数据。比如 Spark 作业写 HDFS，再由 Python 触发 Trino SQL 查询（见[“云数据仓库”](/ch4#sec_cloud_data_warehouses)）继续处理并写入 S3。
- 有些流水线内部天然需要多阶段。例如第一阶段按某键分片，下一阶段按另一键分片，那么第一阶段需要先产出符合第二阶段要求的数据布局。

在 Unix 里，管道用很小的内存缓冲连接前后命令，不落盘。若缓冲区满，上游必须等待下游消费，这是一种 *背压（backpressure）*。Spark、Flink 等批处理执行引擎也支持类似模式：一个任务输出直接传给下一任务（跨机时经网络传输）。

但在工作流中，更常见仍是“上游作业写 DFS/对象存储，下游再读”，这样可让作业在时间上解耦。若一个作业有多个输入，工作流调度器通常会等待所有上游输入生产成功后再启动它。

YARN ResourceManager 或 Spark 内置调度器主要做“作业内调度”，不负责整条工作流。为管理跨作业依赖，出现了 Airflow、Dagster、Prefect 等工作流调度器。它们在维护大量批作业时非常关键：包含 50~100 个作业的工作流并不罕见；大型组织内很多团队会跨系统互相消费输出。没有工具支撑，很难管理这种复杂数据流。

#### 故障处理 {#id281}

批处理作业往往运行时间长。长时间运行且并行任务多的作业，在执行过程中遇到至少一次任务失败几乎是常态。正如[“硬件与软件故障”](/ch2#sec_introduction_hardware_faults)和[“不可靠网络”](/ch9#sec_distributed_networks)所述，原因可能是硬件故障（商用硬件尤甚）、网络中断等。

任务无法完成的另一原因是被调度器主动抢占（kill）。当系统有多优先级队列时，这很常见：低优先级任务便宜、高优先级任务昂贵。低优先级任务可用空闲算力跑，但高优先级任务一到就可能把它们抢占掉。云厂商的对应产品名分别是：AWS 的 *spot instances*、Azure 的 *spot virtual machines*、GCP 的 *preemptible instances* [^16]。

批处理很多时候对实时性要求不高，因此很适合利用低优先级资源/抢占式实例降成本：本质上它是在“吃掉”原本会闲置的算力，提高集群利用率。但代价是更高的被杀概率：实际上抢占往往比硬件故障更常见 [^17]。

由于批处理每次都从头生成输出，任务失败比在线系统更容易处理：删掉失败任务的部分输出，把任务重新调度到别的机器重跑即可。若只因一个任务失败就重跑整个作业会非常浪费，因此 MapReduce 及其后继系统都尽量让并行任务彼此独立，从而把重试粒度降到单个任务 [^3]。

当一个任务输出成为另一任务输入（即在工作流内传递）时，容错更复杂。MapReduce 的做法是：中间数据总是写回 DFS，且只有写入任务成功后才允许下游读取。这个方案在频繁抢占环境中也能工作，但会带来大量 DFS 写入，效率不高。

Spark 更倾向把中间数据放内存或溢写本地磁盘，只把最终结果写 DFS；它还记录中间数据的计算血缘，丢失时可重算 [^18]。Flink 则采用定期检查点快照机制 [^19]。我们会在[“数据流引擎”](#sec_batch_dataflow)继续讨论。

## 批处理模型 {#id431}

前面我们讨论了分布式环境中批作业如何调度。现在转向“批处理框架如何处理数据”。最常见的两类模型是 MapReduce 与数据流引擎。尽管实践中数据流引擎已大面积替代 MapReduce，但理解 MapReduce 仍然重要，因为它深刻影响了现代批处理框架。

MapReduce 与数据流引擎都发展出多种编程接口：低层 API、关系查询语言、DataFrame API。它们让应用工程师、数据分析工程师、业务分析师乃至非技术人员都能参与数据处理。我们将在[“批处理用例”](#sec_batch_output)中讨论这些用途。

### MapReduce {#sec_batch_mapreduce}

MapReduce 的处理模式与[“简单日志分析”](#sec_batch_log_analysis)几乎同构：

1. 读取输入文件并切分为 *记录（records）*。在日志例子里，每条记录就是一行（`\n` 为记录分隔符）。在 Hadoop MapReduce 中，输入通常存放在 HDFS 或 S3 等对象存储，文件格式可能是 Parquet（列式，见[“面向列存储”](/ch4#sec_storage_column)）或 Avro（行式，见[“Avro”](/ch5#sec_encoding_avro)）。
2. 调用 mapper，从每条输入记录中提取键和值。Unix 示例中 mapper 相当于 `awk '{print $7}'`：URL（`$7`）是键，值可留空。
3. 按键排序所有键值对。日志示例中这一步对应第一次 `sort`。
4. 调用 reducer 遍历排序后的键值对。同键记录会相邻，因此可以在很小内存状态下合并。Unix 示例中 reducer 等价于 `uniq -c`，统计相邻同键记录数。

这四步就是一个 MapReduce 作业。第 2 步（map）与第 4 步（reduce）是你写业务逻辑的地方；第 1 步（文件切记录）由输入格式解析器完成；第 3 步排序在 MapReduce 中是隐式内置的，你无需手写。这一步是批处理的基础算法，我们会在[“混洗数据”](#sec_shuffle)再讨论。

要创建 MapReduce 作业，你需实现两个回调：mapper 与 reducer，其行为如下。

Mapper

:   对每条输入记录调用一次。它从输入记录中提取键和值，并可为每条输入产生任意数量键值对（包括 0 条）。它不保留跨记录状态，每条记录独立处理。

Reducer

:   框架收集 mapper 产生的键值对，把同一个键对应的所有值交给 reducer（以迭代器形式）。reducer 可输出结果记录（如同一 URL 的出现次数）。

在日志示例里，第 5 步还有一次 `sort` 用于按请求次数排名 URL。MapReduce 若要第二轮排序，通常要再写一个作业：前一个输出作为后一个输入。换个角度看，mapper 的作用是把数据整理成适合排序的形态；reducer 的作用是处理已排序数据。

> [!TIP] MapReduce 与函数式编程
> MapReduce 虽用于批处理，但其编程模型来自函数式编程。Lisp 把 *map* 与 *reduce/fold* 作为列表上的高阶函数引入，后来进入 Python、Rust、Java 等主流语言。包括 SQL 在内的大量数据处理操作都可在 MapReduce 之上表达。Map 和 reduce 以及函数式编程的一些特性恰好契合 MapReduce：可组合、天然适合数据处理链；map 还是典型“令人尴尬地并行”（每条输入独立处理）；reduce 则可按不同键并行。

但用原始 MapReduce API 写复杂处理其实很费力，例如各种连接算法都要自己实现 [^20]。MapReduce 相比现代批处理引擎也偏慢，一个重要原因是其“以文件为中心”的 I/O 让作业流水化困难：上游不结束，下游很难提前处理输出。

### 数据流引擎 {#sec_batch_dataflow}

为解决 MapReduce 的局限，出现了多种分布式批处理执行引擎，最著名的是 Spark [^18] [^21] 和 Flink [^19]。它们设计细节各异，但有一个共同点：把整条工作流当成一个作业处理，而不是拆成互相独立的小作业。

因为它们显式建模了跨多个处理阶段的数据流动，所以称为 *数据流引擎（dataflow engines）*。与 MapReduce 一样，它们提供低层 API（反复调用用户函数逐条处理记录），也提供更高层算子（如 *join*、*group by*）。它们通过分片并行输入，并通过网络把一个任务输出传给另一个任务输入。与 MapReduce 不同，算子不必严格在 map/reduce 两类角色间交替，而可以更灵活组合。

这些 API 通常以关系风格构件表达计算：按字段值连接数据集、按键分组、按条件过滤、按计数或求和等函数聚合。内部实现依赖的正是下一节要讲的混洗算法。

这种处理引擎风格可追溯到 Dryad [^22]、Nephele [^23] 等研究系统。相比 MapReduce，它有几个优势：

- 像排序这类昂贵操作只在“确实需要”的地方执行，而不是每个 map 与 reduce 阶段之间都默认做。
- 连续多个不改变分片方式的算子（如 map/filter）可融合成一个任务，减少数据复制开销。
- 由于工作流里的连接与数据依赖都显式声明，调度器能全局优化数据局部性。比如把“消费某数据”的任务放到“生产该数据”的同机上，用共享内存缓冲交换，而非走网络拷贝。
- 算子间中间状态通常放内存或本地磁盘即可，比写 DFS/对象存储 I/O 更低（后者要多副本并落到多机磁盘）。MapReduce 仅对 mapper 输出做了这类优化，数据流引擎把它推广到所有中间状态。
- 输入一就绪就能启动下游算子，无需等待整个上游阶段全部完成。
- 可复用已有进程运行新算子，减少启动开销；MapReduce 往往为每个任务起一个新 JVM。

因此，数据流引擎能实现与 MapReduce 工作流同样的计算，但通常速度明显更快。

### 混洗数据 {#sec_shuffle}

本章开头的 Unix 工具示例和 MapReduce 都建立在排序之上。批处理系统要能排序 PB 级数据，单机放不下，因此必须使用“输入与输出都分片”的分布式排序算法，这就是 *混洗（shuffle）*。

> [!NOTE] 混洗不是随机
> “shuffle” 容易引发误解。洗牌会得到随机顺序；而这里的 shuffle 产出的是排序后的确定顺序，不含随机性。

混洗是批处理系统的基础算法，连接与聚合都依赖它。MapReduce、Spark、Flink、Daft、Dataflow、BigQuery [^24] 都实现了高可伸缩且高性能的混洗机制以处理大数据集。这里用 Hadoop MapReduce 的混洗实现做说明 [^25]，但核心思想在其他系统同样适用。

[图 11-1](#fig_batch_mapreduce) 展示了一个 MapReduce 作业的数据流。假设输入已分片，标记为 *m1*、*m2*、*m3*。例如每个分片可以是 HDFS 中一个文件，或对象存储中的一个对象；同一数据集的所有分片可以放在同一 HDFS 目录，或使用同一对象前缀。

{{< figure src="/fig/ddia_1101.png" id="fig_batch_mapreduce" caption="图 11-1. 一个包含三个 mapper 和三个 reducer 的 MapReduce 作业。" class="w-full my-4" >}}

框架会为每个输入分片启动一个 map 任务。任务读取分配到的文件，并逐条记录调用 mapper 回调。reduce 侧也会分片。map 任务数由输入分片数决定；reduce 任务数由作业作者配置（可与 map 数不同）。

mapper 输出是键值对。框架需要保证：若不同 mapper 输出了同一个键，这些键值对最终必须由同一个 reducer 处理。为此，每个 mapper 会在本地磁盘为每个 reducer 维护一个输出文件（例如[图 11-1](#fig_batch_mapreduce)中的 *m1,r2*：由 mapper1 生成，目标是 reducer2）。mapper 每输出一条键值对，通常会按键的哈希决定写入哪个 reducer 文件（类似[“按键哈希分片”](/ch7#sec_sharding_hash)）。

mapper 写这些文件的同时，也会在每个文件内部按键排序。可用的正是[“日志结构存储”](/ch4#sec_storage_log_structured)中的技术：先在内存有序结构里积累一批键值对，写成有序段文件，再把小段逐步合并成大段。

每个 mapper 完成后，reducer 会连接到 mapper，把属于自己的有序文件拷贝到本地磁盘。reducer 拿到所有 mapper 的对应分片后，再用归并排序方式合并它们并保持有序。同键记录即便来自不同 mapper，也会在合并后相邻。随后 reducer 以“每个键一次调用”的方式执行，每次拿到一个迭代器，用于遍历该键的所有值。

reducer 输出记录会顺序写入文件，每个 reduce 任务一个文件。[图 11-1](#fig_batch_mapreduce)中的 *r1*、*r2*、*r3* 就是输出数据集的分片，最终写回 DFS 或对象存储。

MapReduce 在 map 与 reduce 之间执行混洗；现代数据流引擎和云数据仓库则更复杂。BigQuery 等系统已优化混洗，使数据尽量留在内存，并写入外部排序服务 [^24]，以提升速度并通过复制增强韧性。

#### JOIN 与 GROUP BY {#sec_batch_join}

下面看“有序数据”如何简化分布式连接与聚合。为便于说明仍以 MapReduce 为例，但概念适用于大多数批处理系统。

批处理里常见连接场景见[图 11-2](#fig_batch_join_example)。左边是用户活动日志（*activity events* 或 *clickstream data*），右边是用户数据库。它可以看作星型模型的一部分（见[“星型与雪花型：分析模式”](/ch3#sec_datamodels_analytics)）：活动日志是事实表，用户库是维度表之一。

{{< figure src="/fig/ddia_1102.png" id="fig_batch_join_example" caption="图 11-2. 用户活动日志与用户画像数据库的连接。" class="w-full my-4" >}}

如果你要做“结合用户库信息的活动分析”（例如利用用户出生日期字段，判断哪些页面更受年轻或年长用户欢迎），就需要连接这两张表。若两边都大到必须分片，怎么做？

可利用 MapReduce 的关键特性：混洗会把同键键值对汇聚到同一个 reducer，无论它们最初在哪个分片。这里用户 ID 就可以作为键。因此可写一个 mapper 扫活动日志，输出“按用户 ID 键控的页面访问 URL”（见[图 11-3](#fig_batch_join_reduce)）；再写一个 mapper 按行扫描用户表，提取“用户 ID 作为键、出生日期作为值”。

{{< figure src="/fig/ddia_1103.png" id="fig_batch_join_reduce" caption="图 11-3. 基于用户 ID 的排序合并连接。若输入数据集由多个文件分片组成，可并行启动多个 mapper 处理。" class="w-full my-4" >}}

混洗保证 reducer 能同时拿到某用户的出生日期和该用户全部页面访问事件。MapReduce 甚至可以把记录进一步排成 reducer 先看到用户记录、再按时间戳看到活动事件，这称为 *二次排序（secondary sort）* [^25]。

于是 reducer 很容易实现连接逻辑：先拿到出生日期并存入局部变量，再遍历同一用户 ID 的活动事件，输出“被访问 URL + 访问者出生日期”。因为 reducer 一次处理一个用户的全部记录，所以内存里只要保留一条用户记录，也无需发任何网络请求。这个算法称为 *排序合并连接（sort-merge join）*：mapper 输出先按键排序，reducer 再把连接两侧有序记录合并。

工作流中的下一个 MapReduce 作业就可以继续计算“每个 URL 的访问者年龄分布”：先按 URL 做一次混洗，再在 reducer 中遍历同 URL 的所有访问记录（含出生日期），按年龄段维护计数并逐条累加，从而实现 *group by* 与聚合。

### 查询语言 {#sec_batch_query_lanauges}

这些年分布式批处理执行引擎不断成熟。如今在上万台机器的集群上存储并处理数 PB 数据，基础设施已足够稳健。随着“如何在这规模下把系统跑起来”基本被解决，重点开始转向编程模型的可用性。

MapReduce、数据流引擎、云数据仓库都把 SQL 作为批处理“通用语”。这很自然：传统数据仓库本就用 SQL，数据分析/ETL 工具都支持 SQL，几乎所有开发者和分析师也都熟悉 SQL。

相比手写 MapReduce，查询语言接口不仅代码更少，还支持交互式使用：可在终端或 GUI 里写分析 SQL 并直接执行。这种交互式查询对于业务分析、产品、销售、财务等角色探索数据非常高效。虽然它不完全是“经典批处理”形态，但 SQL 让探索式查询也能在分布式批处理系统中高效完成。

高级查询语言不只提升人的生产力，也提高机器执行效率。正如[“云数据仓库”](/ch4#sec_cloud_data_warehouses)所述，查询引擎要把 SQL 转成在集群里执行的批处理作业。这个从查询到语法树再到物理算子的转换过程，让引擎有机会做优化。Hive、Trino、Spark、Flink 等查询引擎都具备代价优化器：它们可分析连接输入特征，自动选择更合适的连接算法，甚至重排连接顺序以减少中间状态 [^19] [^26] [^27] [^28]。

SQL 是最流行的通用批处理语言，但在一些细分场景中仍有其他语言。Apache Pig 提供了基于关系算子的逐步式数据流水线描述方式，而非“一个超大 SQL 查询”。DataFrame（下一节）有相似特征，Morel 则是受 Pig 影响的更现代语言。还有用户采用 jq、JMESPath、JsonPath 等 JSON 查询语言。

在[“图状数据模型”](/ch3#sec_datamodels_graph)中，我们讨论了图建模与图查询语言如何遍历边和顶点。许多图处理框架也支持通过查询语言做批计算，例如 Apache TinkerPop 的 Gremlin。我们会在[“批处理用例”](#sec_batch_output)继续看图处理场景。

> [!TIP] 批处理与云数据仓库正在收敛
> 历史上，数据仓库运行在专用硬件设备上，主要提供关系数据的 SQL 分析查询；而 MapReduce 等批处理框架强调更高可伸缩性与更高灵活性，允许使用通用编程语言写处理逻辑，并读写任意数据格式。
>
> 随着发展，两者越来越像。现代批处理框架已经支持 SQL，并借助 Parquet 等列式格式和优化执行引擎（见[“查询执行：编译与向量化”](/ch4#sec_storage_vectorized)）在关系查询上获得良好性能。与此同时，数据仓库通过云化（见[“云数据仓库”](/ch4#sec_cloud_data_warehouses)）获得更强可伸缩能力，并实现了许多与分布式批处理框架相同的调度、容错和混洗技术，很多也使用分布式文件系统。
>
> 正如批处理系统采纳 SQL，云仓库也在采纳 DataFrame 等替代处理模型（下一节）。例如 BigQuery 提供 BigQuery DataFrames，Snowflake 的 Snowpark 能与 Pandas 集成。Airflow、Prefect、Dagster 等批处理工作流编排器也已广泛集成云仓库。
>
> 当然，并非所有批任务都容易用 SQL 表达。PageRank 等迭代图算法、复杂机器学习任务都很难用 SQL 写。涉及图像、视频、音频等非关系多模态数据的 AI 处理同样如此。
>
> 此外，云数据仓库在某些负载上并不理想。行级逐条计算与列式存储不匹配，效率较低，此时更适合使用仓库的其他 API 或批处理系统。云仓库通常也比其他批处理系统更贵，某些大作业放到 Spark/Flink 等系统可能更具成本优势。
>
> 因此，“用批处理系统还是数据仓库”最终要看成本、便利性、实现复杂度、可用性等综合因素。大型企业往往并存多套系统以保留选择空间；小公司通常一套系统也能跑起来。

### DataFrames {#id287}

随着数据科学家和统计学家开始用分布式批处理框架做机器学习，他们发现原有处理模型不够顺手，因为他们更习惯 R 与 Pandas 里的 DataFrame 数据模型（见[“DataFrame、矩阵与数组”](/ch3#sec_datamodels_dataframes)）。DataFrame 与关系库里的表很像：由多行组成，同一列值类型一致。它不是写一个超大 SQL，而是通过调用对应关系算子的函数来做过滤、连接、排序、分组等操作。

早期 DataFrame 操作大多在本地内存执行，因此只能处理单机装得下的数据集。数据科学家希望在批处理环境中，仍用熟悉的 DataFrame API 处理大数据。Spark、Flink、Daft 等分布式框架都因此提供了 DataFrame API。需要注意的是，本地 DataFrame 通常带索引且有顺序，而分布式 DataFrame 往往没有 [^29]，迁移时可能出现性能“意外”。

DataFrame API 看起来和数据流 API 相似，但实现方式差别不小。Pandas 调用方法后通常立刻执行；Spark 则会先把 DataFrame API 调用翻译为查询计划，做查询优化后，再在分布式数据流引擎上执行，从而获得更好性能。

Daft 等框架甚至同时支持客户端与服务端计算：小规模内存操作在客户端执行，大数据与重计算在服务端执行。Apache Arrow 等列式格式提供统一数据模型，可被两侧执行引擎共享。

## 批处理用例 {#sec_batch_output}

了解了批处理如何工作后，我们来看它在不同应用中的落地。批处理非常适合“海量数据的批量计算”，但不适合低延迟场景。因此，只要数据多且新鲜度要求不高，几乎都能看到批处理的身影。这听起来像限制，但现实里大量工作都符合这个模型：

- 会计对账与库存核对：企业定期验证交易、银行账户与库存是否一致，常由批处理完成 [^30]。
- 制造业需求预测：通常以周期性批任务计算 [^31]。
- 电商、媒体、社交平台推荐模型训练：大量依赖批处理 [^32] [^33]。
- 许多金融系统也是批处理驱动。例如美国的银行业网络几乎完全基于批任务运行 [^34]。

下面分别讨论几个几乎所有行业都常见的批处理用例。

### 提取-转换-加载（ETL） {#sec_batch_etl_usage}

[“数据仓库”](/ch1#sec_introduction_dwh)介绍了 ETL/ELT：从生产数据库抽取数据、进行转换，再加载到下游系统。本节用“ETL”统称这两类负载。尤其当下游是数据仓库时，ETL 常由批处理作业承载。

批处理天然并行，非常适合数据转换。很多转换任务都是“令人尴尬地并行”：过滤、字段投影及大量常见仓库转换都可并行完成。

批处理环境通常自带成熟工作流调度器，便于安排、编排和调试 ETL 流水线。发生故障时，调度器常会自动重试以覆盖瞬时问题；若持续失败，则明确标记失败，便于工程师快速定位流水线中断点。像 Airflow 还内置大量 source/sink/query 算子，可直接对接 MySQL、PostgreSQL、Snowflake、Spark、Flink 等数十种系统。调度器与数据处理系统的紧密集成显著简化了数据集成。

我们也看到，批处理在“出错后排障与修复”方面很友好，这对调试数据流水线极其关键。失败文件可直接检查，ETL 作业可修复后重跑。比如输入文件不再包含某个转换逻辑依赖字段，数据工程师就能据此更新转换逻辑或修复上游生产作业。

过去数据流水线往往由单一数据工程团队集中维护，因为让产品团队自行编写和维护复杂批流水线不太现实。近年随着处理模型和元数据管理改进，组织内更多团队都能参与并维护自己的流水线。*data mesh* [^35] [^36]、*data contract* [^37]、*data fabric* [^38] 等实践，正通过规范和工具帮助团队安全发布可被全组织消费的数据。

如今数据流水线与分析查询不仅共享处理模型，也常共享执行引擎。很多 ETL 作业与消费其输出的分析查询都运行在同一系统里：例如同样以 SparkSQL、Trino 或 DuckDB 查询执行。这样的架构进一步模糊了应用工程、数据工程、分析工程与业务分析之间的界限。

### 分析（Analytics） {#sec_batch_olap}

在[“操作型系统与分析型系统”](/ch1#sec_introduction_analytics)中我们看到，分析查询（OLAP）通常要扫描大量记录并做分组聚合。这类负载可以与其他批任务一起运行在批处理系统中。分析人员写 SQL，经查询引擎执行，读写底层 DFS 或对象存储。表到文件映射、名称、类型等表元数据通常由 Apache Iceberg 等表格式与 Unity 等 catalog 管理（见[“云数据仓库”](/ch4#sec_cloud_data_warehouses)）。这种架构称为 *数据湖仓（data lakehouse）* [^39]。

与 ETL 类似，SQL 接口改进让很多组织用 Spark 等批框架直接承载分析。常见模式有两类：

- 预聚合查询：先把数据滚动聚合为 OLAP 立方体或数据集市，以提升查询速度（见[“物化视图与数据立方”](/ch4#sec_storage_materialized_views)）。预聚合结果可在仓库查询，或推送到 Apache Druid、Apache Pinot 这类实时 OLAP 系统。预聚合通常按固定周期运行，通常由[“工作流调度”](#sec_batch_workflows)中提到的调度器管理。
- 临时查询（ad hoc）：用户为回答具体业务问题、分析用户行为、排查运行问题等随时发起。该场景非常看重响应时间，分析师通常会根据每次结果继续迭代提问。执行快的批处理查询引擎可显著缩短等待。

SQL 支持还让批处理系统更易接入电子表格与可视化工具，如 Tableau、Power BI、Looker、Apache Superset。比如 Tableau 有 SparkSQL、Presto 连接器；Superset 支持 Trino、Hive、Spark SQL、Presto 等大量最终会触发批任务的数据系统。

### 机器学习 {#id290}

机器学习（ML）高度依赖批处理。数据科学家、ML 工程师、AI 工程师会用批处理框架探索数据模式、做数据转换、训练模型。常见用途包括：

- 特征工程：把原始数据过滤并转换为可训练数据。预测模型往往要求数值特征，因此文本或离散值等数据需要先转成目标格式。
- 模型训练：训练数据是批过程输入，训练后模型权重是输出。
- 批量推理：当数据集很大且不要求实时结果时，可对整批数据做预测，也包括在测试集上评估模型预测效果。

很多框架为这些场景提供了专用工具。例如 Spark 的 MLlib、Flink 的 FlinkML 都内置丰富的特征工程工具、统计函数与分类器。

推荐系统和排序系统等 ML 应用也大量使用图处理（见[“图状数据模型”](/ch3#sec_datamodels_graph)）。许多图算法表达为“沿边逐步传播信息并反复迭代”：把一个顶点与相邻顶点连接，传递某些信息，重复直到满足停止条件，例如无边可继续，或某个指标收敛。

*批同步并行（bulk synchronous parallel, BSP）* 计算模型 [^40] 已成为批图计算常用模型。Apache Giraph [^20]、Spark GraphX、Flink Gelly [^41] 等都实现了它。它也常被称为 *Pregel* 模型，因为 Google 的 Pregel 论文让这一方法广为人知 [^42]。

批处理同样是大语言模型（LLM）数据准备与训练的重要组成部分。网页等原始文本通常存放在 DFS 或对象存储中，必须先预处理才能用于训练。适合批处理框架的预处理步骤包括：

- 从 HTML 中提取纯文本，并修复损坏文本；
- 检测并清理低质量、无关或重复文档；
- 对文本做分词并转换为嵌入向量（词或片段的数值表示）。

Kubeflow、Flyte、Ray 等框架就专为这类负载构建。以 OpenAI 为例，ChatGPT 训练流程中就使用了 Ray [^43]。这些框架通常内置与 PyTorch、TensorFlow、XGBoost 等 LLM/AI 库的集成，并支持特征工程、模型训练、批量推理、微调等能力。

最后，数据科学家常在 Jupyter、Hex 等交互式 Notebook 中实验数据。Notebook 由多个 *cell* 组成，每个 cell 是一小段 Markdown、Python 或 SQL；按顺序执行可得到表格、图表或数据结果。很多 Notebook 背后通过 DataFrame API 或 SQL 调用批处理系统。

### 对外提供派生数据 {#sec_batch_serving_derived}

批处理常用于构建预计算/派生数据集，如商品推荐、面向用户的报表、机器学习特征等。这些数据通常由生产数据库、键值存储或搜索引擎对外服务。不论目标系统是什么，都需要把批处理环境中的 DFS/对象存储输出，回灌到线上服务数据库。

最直观的做法是：在批作业里直接使用数据库客户端库，一条条写生产数据库（假设防火墙允许）。这虽然能工作，但通常不是好主意，原因有三：

- 每条记录一次网络请求，比批任务正常吞吐低几个数量级。即便客户端支持批写，性能通常也不理想。
- 批处理框架常并行跑很多任务。若所有任务同时以批处理速率写同一数据库，很容易把数据库压垮，进而影响其在线查询性能，引发系统其他部分故障 [^44]。
- 批作业通常提供清晰的“全有或全无”输出语义：作业成功时，结果等价于每个任务恰好执行一次；作业失败时，无有效输出。但如果在作业内直接写外部系统，就产生了外部可见副作用，难以隐藏：部分完成结果可能被其他系统看到，任务失败重启还可能造成重复写。

更好的方案是把预计算结果先推送到 Kafka 这类流系统（我们会在[第十二章](/ch12#ch_stream)深入讨论）。Elasticsearch、Apache Pinot、Apache Druid、Venice 这类派生数据存储 [^45]，以及 ClickHouse 等云数仓，都支持从 Kafka 摄入数据。通过流系统过渡可以改善前述问题：

- 流系统针对顺序写优化，更适合批作业的大吞吐写入模式；
- 流系统可在批作业与生产库间充当缓冲层，下游可按自身能力限速读取，避免影响线上流量；
- 一个批作业输出可被多个下游系统同时消费；
- 流系统还可作为批处理网络与生产网络之间的安全边界（可部署在 DMZ）。

但“经由流”并不会自动解决“全有或全无”语义。要实现这一点，批作业需要在完成后向下游发出“作业完成，可对外可见”的通知。流消费者需要像 *读已提交（read committed）* 事务那样，在收到完成通知前让新数据对查询不可见（见[“读已提交”](/ch8#sec_transactions_read_committed)）。

另一种在数据库冷启动（bootstrap）时更常见的模式，是在批作业内直接构建一个全新数据库，再把文件从 DFS、对象存储或本地文件系统批量导入目标数据库。很多系统都提供这类批量导入工具，如 TiDB Lightning、Apache Pinot/Apache Druid 的 Hadoop 导入作业，RocksDB 也提供从批作业批量导入 SST 的 API。

“批构建 + 批导入”速度非常快，也更容易在不同数据版本间做原子切换。但对于需要持续增量更新的场景，这种“每次构建全新库”的方式会更难。很多系统采用混合策略，同时支持冷启动与增量加载。比如 Venice 就支持混合存储，可同时做基于行的批更新和全量数据集切换。

## 本章小结 {#id292}

本章讨论了批处理系统的设计与实现。我们先从经典 Unix 工具链（awk、sort、uniq 等）出发，说明了批处理的基础原语，例如排序和计数。

然后我们把视角扩展到分布式批处理系统。批处理以“不可变、有限（bounded）的输入数据集”为对象，生成输出数据，这使得重跑和调试可以不引入副作用。围绕这一模式，批处理框架通常包含三层核心能力：决定作业何时何地运行的编排层，负责持久化数据的存储层，以及执行实际计算的计算层。

我们看了分布式文件系统和对象存储如何通过分块复制、缓存和元数据服务管理大文件，也讨论了现代批处理框架如何通过可插拔 API 与这些存储交互。我们还讨论了编排器在大集群中如何调度任务、分配资源和处理故障，以及“按作业调度”的编排器与“按依赖图管理整组作业生命周期”的工作流编排器之间的区别。

在处理模型方面，我们回顾了 MapReduce 及其经典 map/reduce 函数，又介绍了 Spark、Flink 等更易用且性能更好的数据流引擎。为了理解批作业如何扩展到大规模，我们重点讲了混洗（shuffle）算法，它是实现分组、连接、聚合的基础操作。

随着批处理系统成熟，焦点转向可用性。高级查询语言（尤其 SQL）和 DataFrame API 让批处理作业更易编写，也更容易被优化器优化。查询优化器把声明式查询转换为高效执行计划。

最后我们回顾了批处理常见用例：

- ETL 流水线：通过定时工作流在不同系统间提取、转换、加载数据；
- 分析：既支持预聚合报表，也支持临时探索查询；
- 机器学习：用于准备与处理大规模训练数据；
- 把批处理输出灌入面向生产流量的系统：常通过流系统或批量导入工具，把派生数据提供给用户。

下一章我们将转向流处理。与批处理不同，流处理输入是 *无界（unbounded）* 的：作业仍在，但输入是持续不断的数据流，因此作业不会“完成”。我们会看到，流处理与批处理在一些方面很相似，但“输入无界”这一前提也会显著改变系统设计。


### 参考文献 {#references}

[^1]: Nathan Marz. [How to Beat the CAP Theorem](http://nathanmarz.com/blog/how-to-beat-the-cap-theorem.html). *nathanmarz.com*, October 2011. Archived at [perma.cc/4BS9-R9A4](https://perma.cc/4BS9-R9A4)
[^2]: Molly Bartlett Dishman and Martin Fowler. [Agile Architecture](https://www.youtube.com/watch?v=VjKYO6DP3fo&list=PL055Epbe6d5aFJdvWNtTeg_UEHZEHdInE). At *O'Reilly Software Architecture Conference*, March 2015.
[^3]: Jeffrey Dean and Sanjay Ghemawat. [MapReduce: Simplified Data Processing on Large Clusters](https://www.usenix.org/legacy/publications/library/proceedings/osdi04/tech/full_papers/dean/dean.pdf). At *6th USENIX Symposium on Operating System Design and Implementation* (OSDI), December 2004.
[^4]: Shivnath Babu and Herodotos Herodotou. [Massively Parallel Databases and MapReduce Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/db-mr-survey-final.pdf). *Foundations and Trends in Databases*, volume 5, issue 1, pages 1--104, November 2013. [doi:10.1561/1900000036](https://doi.org/10.1561/1900000036)
[^5]: David J. DeWitt and Michael Stonebraker. [MapReduce: A Major Step Backwards](https://homes.cs.washington.edu/~billhowe/mapreduce_a_major_step_backwards.html). Originally published at *databasecolumn.vertica.com*, January 2008. Archived at [perma.cc/U8PA-K48V](https://perma.cc/U8PA-K48V)
[^6]: Henry Robinson. [The Elephant Was a Trojan Horse: On the Death of Map-Reduce at Google](https://www.the-paper-trail.org/post/2014-06-25-the-elephant-was-a-trojan-horse-on-the-death-of-map-reduce-at-google/). *the-paper-trail.org*, June 2014. Archived at [perma.cc/9FEM-X787](https://perma.cc/9FEM-X787)
[^7]: Urs Hölzle. [R.I.P. MapReduce. After having served us well since 2003, today we removed the remaining internal codebase for good](https://twitter.com/uhoelzle/status/1177360023976067077). *twitter.com*, September 2019. Archived at [perma.cc/B34T-LLY7](https://perma.cc/B34T-LLY7)
[^8]: Adam Drake. [Command-Line Tools Can Be 235x Faster than Your Hadoop Cluster](https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html). *aadrake.com*, January 2014. Archived at [perma.cc/87SP-ZMCY](https://perma.cc/87SP-ZMCY)
[^9]: [`sort`: Sort text files](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html). GNU Coreutils 9.7 Documentation, Free Software Foundation, Inc., 2025.
[^10]: Michael Ovsiannikov, Silvius Rus, Damian Reeves, Paul Sutter, Sriram Rao, and Jim Kelly. [The Quantcast File System](https://db.disi.unitn.eu/pages/VLDBProgram/pdf/industry/p808-ovsiannikov.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1092--1101, August 2013. [doi:10.14778/2536222.2536234](https://doi.org/10.14778/2536222.2536234)
[^11]: Andrew Wang, Zhe Zhang, Kai Zheng, Uma Maheswara G., and Vinayakumar B. [Introduction to HDFS Erasure Coding in Apache Hadoop](https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html). *blog.cloudera.com*, September 2015. Archived at [archive.org](https://web.archive.org/web/20250731115546/https://www.cloudera.com/blog/technical/introduction-to-hdfs-erasure-coding-in-apache-hadoop.html)
[^12]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V)
[^13]: Vinod Kumar Vavilapalli, Arun C. Murthy, Chris Douglas, Sharad Agarwal, Mahadev Konar, Robert Evans, Thomas Graves, Jason Lowe, Hitesh Shah, Siddharth Seth, Bikas Saha, Carlo Curino, Owen O'Malley, Sanjay Radia, Benjamin Reed, and Eric Baldeschwieler. [Apache Hadoop YARN: Yet Another Resource Negotiator](https://opencourse.inf.ed.ac.uk/sites/default/files/2023-10/yarn-socc13.pdf). At *4th Annual Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523633](https://doi.org/10.1145/2523616.2523633)
[^14]: Richard M. Karp. [Reducibility Among Combinatorial Problems](https://www.cs.purdue.edu/homes/hosking/197/canon/karp.pdf). *Complexity of Computer Computations. The IBM Research Symposia Series*. Springer, 1972. [doi:10.1007/978-1-4684-2001-2_9](https://doi.org/10.1007/978-1-4684-2001-2_9)
[^15]: J. D. Ullman. [NP-Complete Scheduling Problems](https://www.cs.montana.edu/bhz/classes/fall-2018/csci460/paper4.pdf). *Journal of Computer and System Sciences*, volume 10, issue 3, June 1975. [doi:10.1016/S0022-0000(75)80008-0](https://doi.org/10.1016/S0022-0000(75)80008-0)
[^16]: Gilad David Maayan. [The complete guide to spot instances on AWS, Azure and GCP](https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/). *datacenterdynamics.com*, March 2021. Archived at [archive.org](https://web.archive.org/web/20250722114617/https://www.datacenterdynamics.com/en/opinions/complete-guide-spot-instances-aws-azure-and-gcp/)
[^17]: Abhishek Verma, Luis Pedrosa, Madhukar Korupolu, David Oppenheimer, Eric Tune, and John Wilkes. [Large-Scale Cluster Management at Google with Borg](https://dl.acm.org/doi/pdf/10.1145/2741948.2741964). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741964](https://doi.org/10.1145/2741948.2741964)
[^18]: Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauley, Michael J. Franklin, Scott Shenker, and Ion Stoica. [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf). At *9th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), April 2012.
[^19]: Paris Carbone, Stephan Ewen, Seif Haridi, Asterios Katsifodimos, Volker Markl, and Kostas Tzoumas. [Apache Flink™: Stream and Batch Processing in a Single Engine](http://sites.computer.org/debull/A15dec/p28.pdf). *Bulletin of the IEEE Computer Society Technical Committee on Data Engineering*, volume 38, issue 4, December 2015. Archived at [perma.cc/G3N3-BKX5](https://perma.cc/G3N3-BKX5)
[^20]: Mark Grover, Ted Malaska, Jonathan Seidman, and Gwen Shapira. *[Hadoop Application Architectures](https://learning.oreilly.com/library/view/hadoop-application-architectures/9781491910313/)*. O'Reilly Media, 2015. ISBN: 978-1-491-90004-8
[^21]: Jules S. Damji, Brooke Wenig, Tathagata Das, and Denny Lee. *[Learning Spark, 2nd Edition](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/)*. O'Reilly Media, 2020. ISBN: 978-1492050049
[^22]: Michael Isard, Mihai Budiu, Yuan Yu, Andrew Birrell, and Dennis Fetterly. [Dryad: Distributed Data-Parallel Programs from Sequential Building Blocks](https://www.microsoft.com/en-us/research/publication/dryad-distributed-data-parallel-programs-from-sequential-building-blocks/). At *2nd European Conference on Computer Systems* (EuroSys), March 2007. [doi:10.1145/1272996.1273005](https://doi.org/10.1145/1272996.1273005)
[^23]: Daniel Warneke and Odej Kao. [Nephele: Efficient Parallel Data Processing in the Cloud](https://stratosphere2.dima.tu-berlin.de/assets/papers/Nephele_09.pdf). At *2nd Workshop on Many-Task Computing on Grids and Supercomputers* (MTAGS), November 2009. [doi:10.1145/1646468.1646476](https://doi.org/10.1145/1646468.1646476)
[^24]: Hossein Ahmadi. [In-memory query execution in Google BigQuery](https://cloud.google.com/blog/products/bigquery/in-memory-query-execution-in-google-bigquery). *cloud.google.com*, August 2016. Archived at [perma.cc/DGG2-FL9W](https://perma.cc/DGG2-FL9W)
[^25]: Tom White. *[Hadoop: The Definitive Guide](https://learning.oreilly.com/library/view/hadoop-the-definitive/9781491901687/)*, 4th edition. O'Reilly Media, 2015. ISBN: 978-1-491-90163-2
[^26]: Fabian Hüske. [Peeking into Apache Flink's Engine Room](https://flink.apache.org/2015/03/13/peeking-into-apache-flinks-engine-room/). *flink.apache.org*, March 2015. Archived at [perma.cc/44BW-ALJX](https://perma.cc/44BW-ALJX)
[^27]: Mostafa Mokhtar. [Hive 0.14 Cost Based Optimizer (CBO) Technical Overview](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/). *hortonworks.com*, March 2015. Archived on [archive.org](https://web.archive.org/web/20170607112708/http://hortonworks.com/blog/hive-0-14-cost-based-optimizer-cbo-technical-overview/)
[^28]: Michael Armbrust, Reynold S. Xin, Cheng Lian, Yin Huai, Davies Liu, Joseph K. Bradley, Xiangrui Meng, Tomer Kaftan, Michael J. Franklin, Ali Ghodsi, and Matei Zaharia. [Spark SQL: Relational Data Processing in Spark](https://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2742797](https://doi.org/10.1145/2723372.2742797)
[^29]: Kaya Kupferschmidt. [Spark vs Pandas, part 2 -- Spark](https://towardsdatascience.com/spark-vs-pandas-part-2-spark-c57f8ea3a781/). *towardsdatascience.com*, October 2020. Archived at [perma.cc/5BRK-G4N5](https://perma.cc/5BRK-G4N5)
[^30]: Ammar Chalifah. [Tracking payments at scale](https://bolt.eu/en/blog/tracking-payments-at-scale). *bolt.eu*, June 2025. Archived at [perma.cc/Q4KX-8K3J](https://perma.cc/Q4KX-8K3J)
[^31]: Nafi Ahmet Turgut, Hamza Akyıldız, Hasan Burak Yel, Mehmet İkbal Özmen, Mutlu Polatcan, Pinar Baki, and Esra Kayabali. [Demand forecasting at Getir built with Amazon Forecast](https://aws.amazon.com/blogs/machine-learning/demand-forecasting-at-getir-built-with-amazon-forecast). *aws.amazon.com*, May 2023. Archived at [perma.cc/H3H6-GNL7](https://perma.cc/H3H6-GNL7)
[^32]: Jason (Siyu) Zhu. [Enhancing homepage feed relevance by harnessing the power of large corpus sparse ID embeddings](https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar). *linkedin.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20250225094424/https://www.linkedin.com/blog/engineering/feed/enhancing-homepage-feed-relevance-by-harnessing-the-power-of-lar)
[^33]: Avery Ching, Sital Kedia, and Shuojie Wang. [Apache Spark \@Scale: A 60 TB+ production use case](https://engineering.fb.com/2016/08/31/core-infra/apache-spark-scale-a-60-tb-production-use-case/). *engineering.fb.com*, August 2016. Archived at [perma.cc/F7R5-YFAV](https://perma.cc/F7R5-YFAV)
[^34]: Edward Kim. [How ACH works: A developer perspective --- Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/F67P-VBLK](https://perma.cc/F67P-VBLK)
[^35]: Zhamak Dehghani. [How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh](https://martinfowler.com/articles/data-monolith-to-mesh.html). *martinfowler.com*, May 2019. Archived at [perma.cc/LN2L-L4VC](https://perma.cc/LN2L-L4VC)
[^36]: Chris Riccomini. [What the Heck is a Data Mesh?!](https://cnr.sh/essays/what-the-heck-data-mesh) *cnr.sh*, June 2021. Archived at [perma.cc/NEJ2-BAX3](https://perma.cc/NEJ2-BAX3)
[^37]: Chad Sanderson, Mark Freeman, B. E. Schmidt. [*Data Contracts*](https://www.oreilly.com/library/view/data-contracts/9781098157623/). O'Reilly Media, 2025. ISBN: 9781098157623
[^38]: Daniel Abadi. [Data Fabric vs. Data Mesh: What's the Difference?](https://www.starburst.io/blog/data-fabric-vs-data-mesh-whats-the-difference/) *starburst.io*, November 2021. Archived at [perma.cc/RSK3-HXDK](https://perma.cc/RSK3-HXDK)
[^39]: Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. [Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics](https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf). At *11th Annual Conference on Innovative Data Systems Research* (CIDR), January 2021.
[^40]: Leslie G. Valiant. [A Bridging Model for Parallel Computation](https://dl.acm.org/doi/pdf/10.1145/79173.79181). *Communications of the ACM*, volume 33, issue 8, pages 103--111, August 1990. [doi:10.1145/79173.79181](https://doi.org/10.1145/79173.79181)
[^41]: Stephan Ewen, Kostas Tzoumas, Moritz Kaufmann, and Volker Markl. [Spinning Fast Iterative Data Flows](https://vldb.org/pvldb/vol5/p1268_stephanewen_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 11, pages 1268--1279, July 2012. [doi:10.14778/2350229.2350245](https://doi.org/10.14778/2350229.2350245)
[^42]: Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bik, James C. Dehnert, Ilan Horn, Naty Leiser, and Grzegorz Czajkowski. [Pregel: A System for Large-Scale Graph Processing](https://kowshik.github.io/JPregel/pregel_paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2010. [doi:10.1145/1807167.1807184](https://doi.org/10.1145/1807167.1807184)
[^43]: Richard MacManus. [OpenAI Chats about Scaling LLMs at Anyscale's Ray Summit](https://thenewstack.io/openai-chats-about-scaling-llms-at-anyscales-ray-summit/). *thenewstack.io*, September 2023. Archived at [perma.cc/YJD6-KUXU](https://perma.cc/YJD6-KUXU)
[^44]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^45]: Félix GV. [Open Sourcing Venice -- LinkedIn's Derived Data Platform](https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform). *linkedin.com*, September 2022. Archived at [archive.org](https://web.archive.org/web/20250226160927/https://www.linkedin.com/blog/engineering/open-source/open-sourcing-venice-linkedin-s-derived-data-platform)


================================================
FILE: content/zh/ch12.md
================================================
---
title: "第十二章：流处理"
linkTitle: "12. 流处理"
weight: 312
math: true
breadcrumbs: false
---

<a id="ch_stream"></a>

![](/map/ch11.png)

> 有效的复杂系统总是从简单的系统演化而来。反之亦然：从零设计的复杂系统没一个能有效工作的。
>
> —— 约翰・加尔，Systemantics（1975）

在 [第十一章](/ch11) 中，我们讨论了批处理技术，它读取一组文件作为输入，并生成一组新的文件作为输出。输出是 **派生数据（derived data）** 的一种形式；也就是说，如果需要，可以通过再次运行批处理过程来重新创建数据集。我们看到了如何使用这个简单而强大的想法来建立搜索索引、推荐系统、做分析等等。

然而，在 [第十一章](/ch11) 中仍然有一个很大的假设：即输入是有界的，即已知和有限的大小，所以批处理知道它何时完成输入的读取。例如，MapReduce 核心的排序操作必须读取其全部输入，然后才能开始生成输出：可能发生这种情况：最后一条输入记录具有最小的键，因此需要第一个被输出，所以提早开始输出是不可行的。

实际上，很多数据是 **无界限** 的，因为它随着时间的推移而逐渐到达：你的用户在昨天和今天产生了数据，明天他们将继续产生更多的数据。除非你停业，否则这个过程永远都不会结束，所以数据集从来就不会以任何有意义的方式 “完成”[^1]。因此，批处理程序必须将数据人为地分成固定时间段的数据块，例如，在每天结束时处理一天的数据，或者在每小时结束时处理一小时的数据。

日常批处理中的问题是，输入的变更只会在一天之后的输出中反映出来，这对于许多急躁的用户来说太慢了。为了减少延迟，我们可以更频繁地运行处理 —— 比如说，在每秒钟的末尾 —— 或者甚至更连续一些，完全抛开固定的时间切片，当事件发生时就立即进行处理，这就是 **流处理（stream processing）** 背后的想法。

一般来说，“流” 是指随着时间的推移逐渐可用的数据。这个概念出现在很多地方：Unix 的 stdin 和 stdout、编程语言（惰性列表）[^2]、文件系统 API（如 Java 的 `FileInputStream`）、TCP 连接、通过互联网传送音频和视频等等。

在本章中，我们将把 **事件流（event stream）** 视为一种数据管理机制：无界限，增量处理，与上一章中的批量数据相对应。我们将首先讨论怎样表示、存储、通过网络传输流。在 “[数据库与流](#sec_stream_databases)” 中，我们将研究流和数据库之间的关系。最后在 “[流处理](#sec_stream_processing)” 中，我们将研究连续处理这些流的方法和工具，以及它们用于应用构建的方式。


## 传递事件流 {#sec_stream_transmit}

在批处理领域，作业的输入和输出是文件（也许在分布式文件系统上）。流处理领域中的等价物看上去是什么样子的？

当输入是一个文件（一个字节序列），第一个处理步骤通常是将其解析为一系列记录。在流处理的上下文中，记录通常被叫做 **事件（event）** ，但它本质上是一样的：一个小的、自包含的、不可变的对象，包含某个时间点发生的某件事情的细节。一个事件通常包含一个来自日历时钟的时间戳，以指明事件发生的时间（请参阅 “[单调钟与日历时钟](/ch9#sec_distributed_monotonic_timeofday)”）。

例如，发生的事件可能是用户采取的行动，例如查看页面或进行购买。它也可能来源于机器，例如对温度传感器或 CPU 利用率的周期性测量。在 “[使用 Unix 工具的批处理](/ch11#sec_batch_unix)” 的示例中，Web 服务器日志的每一行都是一个事件。

事件可能被编码为文本字符串或 JSON，或者某种二进制编码，如 [第五章](/ch5) 所述。这种编码允许你存储一个事件，例如将其追加到一个文件，将其插入关系表，或将其写入文档数据库。它还允许你通过网络将事件发送到另一个节点以进行处理。

在批处理中，文件被写入一次，然后可能被多个作业读取。类似地，在流处理术语中，一个事件由 **生产者（producer）** （也称为 **发布者（publisher）** 或 **发送者（sender）** ）生成一次，然后可能由多个 **消费者（consumer）** （ **订阅者（subscribers）** 或 **接收者（recipients）** ）进行处理[^3]。在文件系统中，文件名标识一组相关记录；在流式系统中，相关的事件通常被聚合为一个 **主题（topic）** 或 **流（stream）** 。

原则上讲，文件或数据库就足以连接生产者和消费者：生产者将其生成的每个事件写入数据存储，且每个消费者定期轮询数据存储，检查自上次运行以来新出现的事件。这实际上正是批处理在每天结束时处理当天数据时所做的事情。

但当我们想要进行低延迟的连续处理时，如果数据存储不是为这种用途专门设计的，那么轮询开销就会很大。轮询的越频繁，能返回新事件的请求比例就越低，而额外开销也就越高。相比之下，最好能在新事件出现时直接通知消费者。

数据库在传统上对这种通知机制支持的并不好，关系型数据库通常有 **触发器（trigger）** ，它们可以对变化（如，插入表中的一行）作出反应，但是它们的功能非常有限，并且在数据库设计中有些后顾之忧[^4]。相应的是，已经开发了专门的工具来提供事件通知。


### 消息传递系统 {#sec_stream_messaging}

向消费者通知新事件的常用方式是使用 **消息传递系统（messaging system）**：生产者发送包含事件的消息，然后将消息推送给消费者。我们之前在 “[消息传递中的数据流](/ch5#sec_encoding_dataflow_msg)” 中谈到了这些系统，但现在我们将详细介绍这些系统。

像生产者和消费者之间的 Unix 管道或 TCP 连接这样的直接信道，是实现消息传递系统的简单方法。但是，大多数消息传递系统都在这一基本模型上进行了扩展。特别的是，Unix 管道和 TCP 将恰好一个发送者与恰好一个接收者连接，而一个消息传递系统允许多个生产者节点将消息发送到同一个主题，并允许多个消费者节点接收主题中的消息。

在这个 **发布 / 订阅** 模式中，不同的系统采取各种各样的方法，并没有针对所有目的的通用答案。为了区分这些系统，问一下这两个问题会特别有帮助：

1. **如果生产者发送消息的速度比消费者能够处理的速度快会发生什么？** 一般来说，有三种选择：系统可以丢掉消息，将消息放入缓冲队列，或使用 **背压**（backpressure，也称为 **流量控制**，即 flow control：阻塞生产者，以免其发送更多的消息）。例如 Unix 管道和 TCP 就使用了背压：它们有一个固定大小的小缓冲区，如果填满，发送者会被阻塞，直到接收者从缓冲区中取出数据（请参阅 “[网络拥塞和排队](/ch9#sec_distributed_congestion)”）。

   如果消息被缓存在队列中，那么理解队列增长会发生什么是很重要的。当队列装不进内存时系统会崩溃吗？还是将消息写入磁盘？如果是这样，磁盘访问又会如何影响消息传递系统的性能[^5]，磁盘写满又会发生什么[^6]？

2. **如果节点崩溃或暂时脱机，会发生什么情况？ —— 是否会有消息丢失？** 与数据库一样，持久性可能需要写入磁盘和 / 或复制的某种组合（请参阅 “[复制与持久性](/ch8#sidebar_transactions_durability)”），这是有代价的。如果你能接受有时消息会丢失，则可能在同一硬件上获得更高的吞吐量和更低的延迟。

是否可以接受消息丢失取决于应用。例如，对于周期传输的传感器读数和指标，偶尔丢失的数据点可能并不重要，因为更新的值会在短时间内发出。但要注意，如果大量的消息被丢弃，可能无法立刻意识到指标已经不正确了[^7]。如果你正在对事件计数，那么它们能够可靠送达是更重要的，因为每个丢失的消息都意味着使计数器的错误扩大。

我们在 [第十一章](/ch11) 中探讨的批处理系统的一个很好的特性是，它们提供了强大的可靠性保证：失败的任务会自动重试，失败任务的部分输出会自动丢弃。这意味着输出与没有发生故障一样，这有助于简化编程模型。在本章的后面，我们将研究如何在流处理的上下文中提供类似的保证。

#### 直接从生产者传递给消费者 {#id296}

许多消息传递系统使用生产者和消费者之间的直接网络通信，而不通过中间节点：

* UDP 组播广泛应用于金融行业，例如股票市场，其中低时延非常重要[^8]。虽然 UDP 本身是不可靠的，但应用层的协议可以恢复丢失的数据包（生产者必须记住它发送的数据包，以便能按需重新发送数据包）。
* 无代理的消息库，如 ZeroMQ 和 nanomsg 采取类似的方法，通过 TCP 或 IP 多播实现发布 / 订阅消息传递。
* 一些指标采集代理（例如 StatsD [^9]）使用不可靠的 UDP 消息传递来收集网络中所有机器的指标并进行监控。（在 StatsD 协议中，计数器指标只有在所有消息都被接收时才是准确的；使用 UDP 使得这些指标至多是近似值[^10]。另请参阅 “[TCP 与 UDP](/ch9#sidebar_distributed_tcp_udp)”）。
* 如果消费者在网络上公开了服务，生产者可以直接发送 HTTP 或 RPC 请求（请参阅 “[服务中的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)”）将消息推送给消费者。这就是 webhooks 背后的想法[^11]：把一个服务的回调 URL 注册到另一个服务中，当事件发生时向该 URL 发起请求。

尽管这些直接消息传递系统在其设计所针对的场景中运行良好，但是它们通常要求应用代码意识到消息丢失的可能性。它们的容错程度极为有限：即使协议检测到并重传在网络中丢失的数据包，它们通常也只是假设生产者和消费者始终在线。

如果消费者处于脱机状态，则可能会丢失其不可达时发送的消息。一些协议允许生产者重试失败的消息传递，但当生产者崩溃时，它可能会丢失消息缓冲区及其本应发送的消息，这种方法可能就没用了。

#### 消息代理 {#id433}

一种广泛使用的替代方法是通过 **消息代理**（message broker，也称为 **消息队列**，即 message queue）发送消息，消息代理实质上是一种针对处理消息流而优化的数据库[^12]。它作为服务器运行，生产者和消费者作为客户端连接到服务器。生产者将消息写入代理，消费者通过从代理那里读取来接收消息。

通过将数据集中在代理上，这些系统可以更容易地容忍来来去去的客户端（连接，断开连接和崩溃），而持久性问题则转移到代理的身上。一些消息代理只将消息保存在内存中，而另一些消息代理（取决于配置）将其写入磁盘，以便在代理崩溃的情况下不会丢失。针对缓慢的消费者，它们通常会允许无上限的排队（而不是丢弃消息或背压），尽管这种选择也可能取决于配置。

排队的结果是，消费者通常是 **异步（asynchronous）** 的：当生产者发送消息时，通常只会等待代理确认消息已经被缓存，而不等待消息被消费者处理。向消费者递送消息将发生在未来某个未定的时间点 —— 通常在几分之一秒之内，但有时当消息堆积时会显著延迟。

#### 消息代理与数据库的对比 {#id297}

有些消息代理甚至可以使用 XA 或 JTA 参与两阶段提交协议（请参阅 “[实践中的分布式事务](/ch8#sec_transactions_xa)”）。这个功能与数据库在本质上非常相似，尽管消息代理和数据库之间仍存在实践上很重要的差异：

* 数据库通常保留数据直至显式删除，而大多数消息代理在消息成功递送给消费者时会自动删除消息。这样的消息代理不适合长期的数据存储。
* 由于它们很快就能删除消息，大多数消息代理都认为它们的工作集相当小 —— 即队列很短。如果代理需要缓冲很多消息，比如因为消费者速度较慢（如果内存装不下消息，可能会溢出到磁盘），每个消息需要更长的处理时间，整体吞吐量可能会恶化[^5]。
* 数据库通常支持次级索引和各种搜索数据的方式，而消息代理通常支持按照某种模式匹配主题，订阅其子集。虽然机制并不一样，但对于客户端选择想要了解的数据的一部分，都是基本的方式。
* 查询数据库时，结果通常基于某个时间点的数据快照；如果另一个客户端随后向数据库写入一些改变了查询结果的内容，则第一个客户端不会发现其先前结果现已过期（除非它重复查询或轮询变更）。相比之下，消息代理不支持任意查询，但是当数据发生变化时（即新消息可用时），它们会通知客户端。

这是关于消息代理的传统观点，它被封装在诸如 JMS [^13] 和 AMQP [^14] 的标准中，并且被诸如 RabbitMQ、ActiveMQ、HornetQ、Qpid、TIBCO 企业消息服务、IBM MQ、Azure Service Bus 和 Google Cloud Pub/Sub 所实现[^15]。尽管可以把数据库当作队列来用，但要调优到理想性能并不容易[^16]。

#### 多个消费者 {#id298}

当多个消费者从同一主题中读取消息时，有两种主要的消息传递模式，如 [图 12-1](#fig_stream_broker_patterns) 所示：

负载均衡（load balancing）
: 每条消息都被传递给消费者 **之一**，所以处理该主题下消息的工作能被多个消费者共享。代理可以为消费者任意分配消息。当处理消息的代价高昂，希望能并行处理消息时，此模式非常有用（在 AMQP 中，可以通过让多个客户端从同一个队列中消费来实现负载均衡，而在 JMS 中则称之为 **共享订阅**，即 shared subscription）。

扇出（fan-out）
: 每条消息都被传递给 **所有** 消费者。扇出允许几个独立的消费者各自 “收听” 相同的消息广播，而不会相互影响 —— 这个流处理中的概念对应批处理中多个不同批处理作业读取同一份输入文件（JMS 中的主题订阅与 AMQP 中的交换器绑定提供了这一功能）。

{{< figure src="/fig/ddia_1201.png" id="fig_stream_broker_patterns" caption="图 12-1. （a）负载均衡：在消费者间共享消费主题；（b）扇出：将每条消息传递给多个消费者。" class="w-full my-4" >}}

两种模式可以组合使用：例如，两个独立的消费者组可以每组各订阅同一个主题，每一组都共同收到所有消息，但在每一组内部，每条消息仅由单个节点处理。

#### 确认与重新传递 {#sec_stream_reordering}

消费者随时可能会崩溃，所以有一种可能的情况是：代理向消费者递送消息，但消费者没有处理，或者在消费者崩溃之前只进行了部分处理。为了确保消息不会丢失，消息代理使用 **确认（acknowledgments）**：客户端必须在处理完消息时显式告知代理，以便代理能将消息从队列中移除。

如果与客户端的连接关闭，或者代理超出一段时间未收到确认，代理则认为消息没有被处理，因此它将消息再递送给另一个消费者。（请注意可能发生这样的情况，消息 **实际上是** 处理完毕的，但 **确认** 在网络中丢失了。需要一种原子提交协议才能处理这种情况，正如在 “[实践中的分布式事务](/ch8#sec_transactions_xa)” 中所讨论的那样）

当与负载均衡相结合时，这种重传行为对消息的顺序有种有趣的影响。在 [图 12-2](#fig_stream_redelivery_reordering) 中，消费者通常按照生产者发送的顺序处理消息。然而消费者 2 在处理消息 m3 时崩溃，与此同时消费者 1 正在处理消息 m4。未确认的消息 m3 随后被重新发送给消费者 1，结果消费者 1 按照 m4，m3，m5 的顺序处理消息。因此 m3 和 m4 的交付顺序与生产者 1 的发送顺序不同。

{{< figure src="/fig/ddia_1202.png" id="fig_stream_redelivery_reordering" caption="图 12-2. 在处理 m3 时消费者 2 崩溃，因此稍后重传至消费者 1。" class="w-full my-4" >}}

即使消息代理试图保留消息的顺序（如 JMS 和 AMQP 标准所要求的），负载均衡与重传的组合也不可避免地导致消息被重新排序。为避免此问题，你可以让每个消费者使用单独的队列（即不使用负载均衡功能）。如果消息是完全独立的，则消息顺序重排并不是一个问题。但正如我们将在本章后续部分所述，如果消息之间存在因果依赖关系，这就是一个很重要的问题。

重传还可能导致资源浪费、资源饥饿，甚至使流永久阻塞。一个常见场景是生产者错误地序列化消息，例如 JSON 对象缺少必填键。任何读取到该消息的消费者都会因为缺键而失败，无法发送确认，于是代理会不断重传，导致其他消费者也不断失败。如果代理提供强顺序保证，后续消息可能被彻底卡住；即便允许重排，也会持续浪费资源在永远无法确认的坏消息上。

这类问题通常通过 **死信队列（dead letter queue, DLQ）** 处理：不再无限重试，而是把问题消息移到另一条队列中，从而解堵主消费链路[^17] [^18]。运维通常会对死信队列设置告警 —— 只要有消息进入，就代表出现了错误。收到告警后，操作员可以决定永久丢弃该消息、人工修复后重新投递，或修复消费者代码以正确处理该消息。除了传统队列系统，基于日志的消息系统和流处理系统也开始支持 DLQ[^19]。

### 基于日志的消息代理 {#sec_stream_log}

通过网络发送数据包或向网络服务发送请求通常是短暂的操作，不会留下永久的痕迹。尽管可以永久记录（通过抓包与日志），但我们通常不这么做。即使是将消息持久地写入磁盘的消息代理，在送达给消费者之后也会很快删除消息，因为它们建立在短暂消息传递的思维方式上。

数据库和文件系统采用截然相反的方法论：至少在某人显式删除前，通常写入数据库或文件的所有内容都要被永久记录下来。

这种思维方式上的差异对创建派生数据的方式有巨大影响。如 [第十一章](/ch11) 所述，批处理过程的一个关键特性是，你可以反复运行它们，试验处理步骤，不用担心损坏输入（因为输入是只读的）。而 AMQP/JMS 风格的消息传递并非如此：收到消息是具有破坏性的，因为确认可能导致消息从代理中被删除，因此你不能期望再次运行同一个消费者能得到相同的结果。

如果你将新的消费者添加到消息传递系统，通常只能接收到消费者注册之后开始发送的消息。先前的任何消息都随风而逝，一去不复返。作为对比，你可以随时为文件和数据库添加新的客户端，且能读取任意久远的数据（只要应用没有显式覆盖或删除这些数据）。

为什么我们不能把它俩杂交一下，既有数据库的持久存储方式，又有消息传递的低延迟通知？这就是 **基于日志的消息代理（log-based message brokers）** 背后的想法。

#### 使用日志进行消息存储 {#id300}

日志只是磁盘上简单的仅追加记录序列。我们先前在 [第四章](/ch4) 中日志结构存储引擎和预写式日志的上下文中讨论了日志，在 [第六章](/ch6) 复制的上下文里也讨论了它。

同样的结构可以用于实现消息代理：生产者通过将消息追加到日志末尾来发送消息，而消费者通过依次读取日志来接收消息。如果消费者读到日志末尾，则会等待新消息追加的通知。Unix 工具 `tail -f` 能监视文件被追加写入的数据，基本上就是这样工作的。

为了伸缩超出单个磁盘所能提供的更高吞吐量，可以对日志进行 **分区**（按 [第七章](/ch7) 的定义）。不同的分区可以托管在不同的机器上，使得每个分区都有一份能独立于其他分区进行读写的日志。一个主题可以定义为一组携带相同类型消息的分区。这种方法如 [图 12-3](#fig_stream_log_partitions) 所示。

在每个分区内，代理为每个消息分配一个单调递增的序列号或 **偏移量**（offset，在 [图 12-3](#fig_stream_log_partitions) 中，框中的数字是消息偏移量）。这种序列号是有意义的，因为分区是仅追加写入的，所以分区内的消息是完全有序的。没有跨不同分区的顺序保证。

{{< figure src="/fig/ddia_1203.png" id="fig_stream_log_partitions" caption="图 12-3. 生产者通过将消息追加写入主题分区文件来发送消息，消费者依次读取这些文件。" class="w-full my-4" >}}

Apache Kafka [^20] 和 Amazon Kinesis Streams 都是按这种方式工作的基于日志的消息代理。Google Cloud Pub/Sub 在架构上类似，但对外暴露的是 JMS 风格的 API，而不是日志抽象[^15]。尽管这些消息代理将所有消息写入磁盘，但通过跨多台机器分区，依然能够达到每秒数百万条消息的吞吐量，并通过复制消息实现容错[^21] [^22]。

#### 日志与传统的消息传递相比 {#sec_stream_logs_vs_messaging}

基于日志的方法天然支持扇出式消息传递，因为多个消费者可以独立读取日志，而不会相互影响 —— 读取消息不会将其从日志中删除。为了在一组消费者之间实现负载平衡，代理可以将整个分区分配给消费者组中的节点，而不是将单条消息分配给消费者客户端。

然后每个客户端将消费被指派分区中的 **所有** 消息。通常情况下，当一个消费者被指派了一个日志分区时，它会以简单的单线程方式顺序地读取分区中的消息。这种粗粒度的负载均衡方法有一些缺点：

* 共享消费主题工作的节点数，最多为该主题中的日志分区数，因为同一个分区内的所有消息被递送到同一个节点。
* 如果某条消息处理缓慢，则它会阻塞该分区中后续消息的处理（一种头部阻塞的形式；请参阅 “[描述性能](/ch2#sec_introduction_percentiles)”）。

因此在消息处理代价高昂，希望逐条并行处理，以及消息的顺序并没有那么重要的情况下，JMS/AMQP 风格的消息代理是可取的。另一方面，在消息吞吐量很高，处理迅速，顺序很重要的情况下，基于日志的方法表现得非常好[^23] [^24]。不过，基于日志与传统消息系统的边界并不绝对：例如，一个主题分区通常一次只分配给一个消费者[^25] [^26]。

#### 消费者偏移量 {#sec_stream_log_offsets}

顺序消费一个分区使得判断消息是否已经被处理变得相当容易：所有偏移量小于消费者的当前偏移量的消息已经被处理，而具有更大偏移量的消息还没有被看到。因此，代理不需要跟踪确认每条消息，只需要定期记录消费者的偏移即可。这种方法减少了额外簿记开销，并且为批量化（batching）与流水线化（pipelining）提供了机会，这些都有助于提高基于日志的系统的吞吐量。

实际上，这种偏移量与单领导者数据库复制中常见的日志序列号非常相似，我们在 “[设置新从库](/ch6#sec_replication_new_replica)” 中讨论了这种情况。在数据库复制中，日志序列号允许跟随者断开连接后，重新连接到领导者，并在不跳过任何写入的情况下恢复复制。这里原理完全相同：消息代理表现得像一个主库，而消费者就像一个从库。

如果消费者节点失效，则失效消费者的分区将指派给其他节点，并从最后记录的偏移量开始消费消息。如果消费者已经处理了后续的消息，但还没有记录它们的偏移量，那么重启后这些消息将被处理两次。我们将在本章后面讨论这个问题的处理方法。

#### 磁盘空间使用 {#sec_stream_disk_usage}

如果只追加写入日志，则磁盘空间终究会耗尽。为了回收磁盘空间，日志实际上被分割成段，并不时地将旧段删除或移动到归档存储。（我们将在后面讨论一种更为复杂的磁盘空间释放方式）

这就意味着如果一个慢消费者跟不上消息产生的速率而落后得太多，它的消费偏移量指向了删除的段，那么它就会错过一些消息。实际上，日志实现了一个有限大小的缓冲区，当缓冲区填满时会丢弃旧消息，它也被称为 **循环缓冲区（circular buffer）** 或 **环形缓冲区（ring buffer）**。不过由于缓冲区在磁盘上，因此缓冲区可能相当的大。

让我们做个粗略估算。在撰写本文时，典型的大容量硬盘约为 20 TB，顺序写入吞吐量约为 250 MB/s。如果持续以最高速率写入消息，磁盘大约 22 小时就会写满并开始删除最旧消息。这意味着，即使在满速写入下，磁盘日志也至少可以缓冲约 22 小时的数据。实践中部署很少持续打满磁盘带宽，因此通常可以保留数天甚至数周的消息缓冲区。

许多基于日志的消息代理现在也将消息分层存储到对象存储中，以进一步提升容量，方式与我们在第六章中讨论“对象存储支撑数据库”时类似。像 Apache Kafka 和 Redpanda 这样的系统可以把较旧消息放在对象存储中按需读取；还有一些系统直接将全部消息保存在对象存储中。除了成本优势外，这种架构也有数据集成优势：如果对象存储中的消息以 Iceberg 表形式组织，批处理和数据仓库作业可以直接在这些数据上执行，而无需再复制一份数据。

#### 当消费者跟不上生产者时 {#id459}

在 “[消息传递系统](#sec_stream_messaging)” 中，如果消费者无法跟上生产者发送信息的速度时，我们讨论了三种选择：丢弃信息，进行缓冲或施加背压。在这种分类法里，基于日志的方法是缓冲的一种形式，具有很大但大小固定的缓冲区（受可用磁盘空间的限制）。

如果消费者远远落后，而所要求的信息比保留在磁盘上的信息还要旧，那么它将不能读取这些信息，所以代理实际上丢弃了缓冲区容纳不下的旧信息。你可以监控消费者落后日志头部的距离，如果落后太多就发出报警。由于缓冲区很大，因而有足够的时间让运维人员来修复慢消费者，并在消息开始丢失之前让其赶上。

即使消费者真的落后太多开始丢失消息，也只有那个消费者受到影响；它不会中断其他消费者的服务。这是一个巨大的运维优势：你可以实验性地消费生产日志，以进行开发，测试或调试，而不必担心会中断生产服务。当消费者关闭或崩溃时，会停止消耗资源，唯一剩下的只有消费者偏移量。

这种行为也与传统的消息代理形成了鲜明对比，在那种情况下，你需要小心地删除那些消费者已经关闭的队列 —— 否则那些队列就会累积不必要的消息，从其他仍活跃的消费者那里占走内存。

#### 重播旧消息 {#sec_stream_replay}

我们之前提到，使用 AMQP 和 JMS 风格的消息代理，处理和确认消息是一个破坏性的操作，因为它会导致消息在代理上被删除。另一方面，在基于日志的消息代理中，使用消息更像是从文件中读取数据：这是只读操作，不会更改日志。

除了消费者的任何输出之外，处理的唯一副作用是消费者偏移量的前进。但偏移量是在消费者的控制之下的，所以如果需要的话可以很容易地操纵：例如你可以用昨天的偏移量跑一个消费者副本，并将输出写到不同的位置，以便重新处理最近一天的消息。你可以使用各种不同的处理代码重复任意次。

这一方面使得基于日志的消息传递更像上一章的批处理，其中派生数据通过可重复的转换过程与输入数据显式分离。它允许进行更多的实验，更容易从错误和漏洞中恢复，使其成为在组织内集成数据流的良好工具[^27]。


## 数据库与流 {#sec_stream_databases}

我们已经在消息代理和数据库之间进行了一些比较。尽管传统上它们被视为单独的工具类别，但是我们看到基于日志的消息代理已经成功地从数据库中获取灵感并将其应用于消息传递。我们也可以反过来：从消息传递和流中获取灵感，并将它们应用于数据库。

我们之前曾经说过，事件是某个时刻发生的事情的记录。发生的事情可能是用户操作（例如键入搜索查询）或读取传感器，但也可能是 **写入数据库**。某些东西被写入数据库的事实是可以被捕获、存储和处理的事件。这一观察结果表明，数据库和数据流之间的联系不仅仅是磁盘日志的物理存储 —— 而是更深层的联系。

事实上，复制日志（请参阅 “[复制日志的实现](/ch6#sec_replication_implementation)”）是一个由数据库写入事件组成的流，由主库在处理事务时生成。从库将写入流应用到它们自己的数据库副本，从而最终得到相同数据的精确副本。复制日志中的事件描述发生的数据更改。

我们还在 “[使用共享日志](/ch10#sec_consistency_smr)” 中遇到了状态机复制原理，其中指出：如果每个事件代表对数据库的写入，并且每个副本按相同的顺序处理相同的事件，则副本将达到相同的最终状态（假设事件处理是一个确定性的操作）。这是事件流的又一种场景！

在本节中，我们将首先看看异构数据系统中出现的一个问题，然后探讨如何通过将事件流的想法带入数据库来解决这个问题。

### 保持系统同步 {#sec_stream_sync}

正如我们在本书中所看到的，没有一个系统能够满足所有的数据存储、查询和处理需求。在实践中，大多数重要应用都需要组合使用几种不同的技术来满足所有的需求：例如，使用 OLTP 数据库来为用户请求提供服务，使用缓存来加速常见请求，使用全文索引来处理搜索查询，使用数据仓库用于分析。每一种技术都有自己的数据副本，并根据自己的目的进行存储方式的优化。

由于相同或相关的数据出现在了不同的地方，因此相互间需要保持同步：如果某个项目在数据库中被更新，它也应当在缓存、搜索索引和数据仓库中被更新。对于数据仓库，这种同步通常由 ETL 进程执行（请参阅 “[数据仓库](/ch1#sec_introduction_dwh)”），通常是先取得数据库的完整副本，然后执行转换，并批量加载到数据仓库中 —— 换句话说，批处理。我们在 “[批处理工作流的输出](/ch11#sec_batch_output)” 中同样看到了如何使用批处理创建搜索索引、推荐系统和其他派生数据系统。

如果周期性的完整数据库转储过于缓慢，有时会使用的替代方法是 **双写（dual write）**，其中应用代码在数据变更时明确写入每个系统：例如，首先写入数据库，然后更新搜索索引，然后使缓存项失效（甚至同时执行这些写入）。

但是，双写有一些严重的问题，其中一个是竞争条件，如 [图 12-4](#fig_stream_dual_write_race) 所示。在这个例子中，两个客户端同时想要更新一个项目 X：客户端 1 想要将值设置为 A，客户端 2 想要将其设置为 B。两个客户端首先将新值写入数据库，然后将其写入到搜索索引。因为运气不好，这些请求的时序是交错的：数据库首先看到来自客户端 1 的写入将值设置为 A，然后来自客户端 2 的写入将值设置为 B，因此数据库中的最终值为 B。搜索索引首先看到来自客户端 2 的写入，然后是客户端 1 的写入，所以搜索索引中的最终值是 A。即使没发生错误，这两个系统现在也永久地不一致了。

{{< figure src="/fig/ddia_1204.png" id="fig_stream_dual_write_race" caption="图 12-4. 在数据库中 X 首先被设置为 A，然后被设置为 B，而在搜索索引处，写入以相反的顺序到达。" class="w-full my-4" >}}

除非有一些额外的并发检测机制，例如我们在 “[检测并发写入](/ch6#sec_replication_concurrent)” 中讨论的版本向量，否则你甚至不会意识到发生了并发写入 —— 一个值将简单地以无提示方式覆盖另一个值。

双重写入的另一个问题是，其中一个写入可能会失败，而另一个成功。这是一个容错问题，而不是一个并发问题，但也会造成两个系统互相不一致的结果。确保它们要么都成功要么都失败，是原子提交问题的一个例子，解决这个问题的代价是昂贵的（请参阅 “[原子提交与两阶段提交](/ch8#sec_transactions_2pc)”）。

如果你只有一个单领导者复制的数据库，那么这个领导者决定了写入顺序，而状态机复制方法可以在数据库副本上工作。然而，在 [图 12-4](#fig_stream_dual_write_race) 中，没有单个主库：数据库可能有一个领导者，搜索索引也可能有一个领导者，但是两者都不追随对方，所以可能会发生冲突（请参阅 “[多主复制](/ch6#sec_replication_multi_leader)”）。

如果实际上只有一个领导者 —— 例如，数据库 —— 而且我们能让搜索索引成为数据库的追随者，情况要好得多。但这在实践中可能吗？

### 数据变更捕获 {#sec_stream_cdc}

大多数数据库的复制日志的问题在于，它们一直被当做数据库的内部实现细节，而不是公开的 API。客户端应该通过其数据模型和查询语言来查询数据库，而不是解析复制日志并尝试从中提取数据。

数十年来，许多数据库根本没有记录在档的获取变更日志的方式。由于这个原因，捕获数据库中所有的变更，然后将其复制到其他存储技术（搜索索引、缓存或数据仓库）中是相当困难的。

最近，人们对 **数据变更捕获（change data capture, CDC）** 越来越感兴趣，这是一种观察写入数据库的所有数据变更，并将其提取并转换为可以复制到其他系统中的形式的过程。CDC 是非常有意思的，尤其是当变更能在被写入后立刻用于流时[^28]。

例如，你可以捕获数据库中的变更，并不断将相同的变更应用至搜索索引。如果变更日志以相同的顺序应用，则可以预期搜索索引中的数据与数据库中的数据是匹配的。搜索索引和任何其他派生数据系统只是变更流的消费者，如 [图 12-5](#fig_stream_cdc_flow) 所示。

{{< figure src="/fig/ddia_1205.png" id="fig_stream_cdc_flow" caption="图 12-5. 将数据按顺序写入一个数据库，然后按照相同的顺序将这些更改应用到其他系统。" class="w-full my-4" >}}

#### 数据变更捕获的实现 {#id307}

我们可以将日志消费者叫做 **派生数据系统**，正如在 [第一章](/ch1#sec_introduction_derived) 讨论“记录系统与派生数据”时所述：存储在搜索索引和数据仓库中的数据，只是 **记录系统** 数据的额外视图。数据变更捕获是一种机制，可确保对记录系统所做的所有更改都反映在派生数据系统中，以便派生系统具有数据的准确副本。

从本质上说，数据变更捕获使得一个数据库成为领导者（被捕获变化的数据库），并将其他组件变为追随者。基于日志的消息代理非常适合从源数据库传输变更事件，因为它保留了消息的顺序（避免了 [图 12-2](#fig_stream_redelivery_reordering) 的重新排序问题）。

数据库触发器可用来实现数据变更捕获（请参阅 “[基于触发器的复制](/ch6#sec_replication_logical)”），通过注册观察所有变更的触发器，并将相应的变更项写入变更日志表中。但是它们往往是脆弱的，而且有显著的性能开销。解析复制日志可能是一种更稳健的方法，但它也很有挑战，例如如何应对模式变更。

逻辑复制日志可以用于实现 CDC（请参阅 “[逻辑（基于行）的日志复制](/ch6#sec_replication_logical)”），但会带来不少挑战，例如模式变更和更新建模。Debezium 开源项目专门解决这些问题，提供了面向 MySQL、PostgreSQL、Oracle、SQL Server、Db2、Cassandra 等数据库的源连接器。Kafka Connect 也为多种数据库提供了 CDC 连接器；Maxwell 通过解析 binlog 为 MySQL 提供类似能力[^29]，GoldenGate 为 Oracle 提供类似能力，pgcapture 为 PostgreSQL 提供类似能力。

类似于消息代理，数据变更捕获通常是异步的：作为记录系统的数据库在提交变更之前不会等待消费者应用变更。这种设计具有的运维优势是，添加缓慢的消费者不会过度影响记录系统。不过，所有复制延迟可能有的问题在这里都可能出现（请参阅 “[复制延迟问题](/ch6#sec_replication_lag)”）。

#### 初始快照 {#sec_stream_cdc_snapshot}

如果你拥有 **所有** 对数据库进行变更的日志，则可以通过重播该日志，来重建数据库的完整状态。但是在许多情况下，永远保留所有更改会耗费太多磁盘空间，且重播过于费时，因此日志需要被截断。

例如，构建新的全文索引需要整个数据库的完整副本 —— 仅仅应用最近变更的日志是不够的，因为这样会丢失最近未曾更新的项目。因此，如果你没有完整的历史日志，则需要从一个一致的快照开始，如先前的 “[设置新从库](/ch6#sec_replication_new_replica)” 中所述。

数据库的快照必须与变更日志中的已知位置或偏移量相对应，以便在处理完快照后知道从哪里开始应用变更。一些 CDC 工具集成了这种快照功能，而其他工具则把它留给你手动执行。Debezium 使用 Netflix 的 DBLog 水位线算法提供增量快照能力[^30] [^31]。

#### 日志压缩 {#sec_stream_log_compaction}

如果你只能保留有限的历史日志，则每次要添加新的派生数据系统时，都需要做一次快照。但 **日志压缩（log compaction）** 提供了一个很好的备选方案。

我们之前在 “[日志结构存储](/ch4#sec_storage_log_structured)” 的上下文中讨论过日志压缩（可参阅 [图 4-3](/ch4#fig_storage_sstable_merging) 的示例）。原理很简单：存储引擎定期在日志中查找具有相同键的记录，丢掉所有重复的内容，并只保留每个键的最新更新。这个压缩与合并过程在后台运行，如 [图 12-6](#fig_stream_compaction) 所示。

{{< figure src="/fig/ddia_1206.png" id="fig_stream_compaction" caption="图 12-6. 一个键值对日志，其中键是猫视频的 ID（mew、purr、scratch、yawn），值是播放次数。日志压缩只保留每个键的最新值。" class="w-full my-4" >}}

在日志结构存储引擎中，具有特殊值 NULL（**墓碑**，即 tombstone）的更新表示该键被删除，并会在日志压缩过程中被移除。但只要键不被覆盖或删除，它就会永远留在日志中。这种压缩日志所需的磁盘空间仅取决于数据库的当前内容，而不取决于数据库中曾经发生的写入次数。如果相同的键经常被覆盖写入，则先前的值最终将被垃圾回收，只有最新的值会保留下来。

在基于日志的消息代理与数据变更捕获的上下文中也适用相同的想法。如果 CDC 系统被配置为，每个变更都包含一个主键，且每个键的更新都替换了该键以前的值，那么只需要保留对键的最新写入就足够了。

现在，无论何时需要重建派生数据系统（如搜索索引），你可以从压缩日志主题的零偏移量处启动新的消费者，然后依次扫描日志中的所有消息。日志能保证包含数据库中每个键的最新值（也可能是一些较旧的值）—— 换句话说，你可以使用它来获取数据库内容的完整副本，而无需从 CDC 源数据库取一个快照。

Apache Kafka 支持这种日志压缩功能。正如我们将在本章后面看到的，它允许消息代理被当成持久性存储使用，而不仅仅是用于临时消息。

#### 变更流的 API 支持 {#sec_stream_change_api}

如今许多主流数据库都把变更流作为一等接口提供，而不再像过去那样主要依赖“事后补丁式”或逆向工程式的 CDC。MySQL、PostgreSQL 等关系数据库通常通过与自身复制相同的日志通道输出变更；各大云厂商也提供了对应的 CDC 服务，例如 Google Cloud 的 Datastream 可向关系数据库与数据仓库提供流式数据访问。

即使是 Cassandra 这类最终一致、基于法定票数的数据库，也开始支持数据变更捕获。正如我们在第十章关于线性一致与法定票数中看到的，写入是否“可见”取决于读写一致性设置，这使得其 CDC 的统一抽象更困难。Cassandra 的做法通常是公开各节点原始日志段，而不是提供单一统一的变更流；消费方需要自己读取并合并各节点日志，生成业务可用的单一事件流[^32]。

Kafka Connect[^33]提供了大量数据库系统与 Kafka 的 CDC 集成能力。变更事件一旦进入 Kafka，就可以用于更新搜索索引等派生系统，也可以继续送入后续流处理链路。

#### 数据变更捕获与事件溯源 {#sec_stream_event_sourcing}

数据变更捕获与事件溯源都把状态变化表示成事件日志，但二者抽象层级不同：

* 在数据变更捕获中，应用仍以可变方式使用数据库，任意更新/删除记录；变更日志从数据库底层抽取（如复制日志），因此能保证抽取顺序与真实写入顺序一致，避免 [图 12-4](#fig_stream_dual_write_race) 这类竞态问题。
* 在事件溯源中，应用逻辑从一开始就构建在不可变事件之上，事件存储通常是仅追加写入，更新和删除被限制或禁止。事件语义是应用层行为，而非底层状态差异。

二者孰优取决于场景。对未采用事件溯源的系统而言，引入它通常是一次较大架构变更；而数据变更捕获通常可在现有数据库上以较小改动接入，应用层甚至可以感知不到 CDC 的存在。

> [!TIP] 数据变更捕获与数据库模式
> 数据变更捕获看上去比事件溯源更容易落地，但它也有自己的工程挑战。
>
> 在微服务架构中，数据库通常只由所属服务直接访问；其他服务通过该服务 API 交互，因此数据库模式本应是服务内部实现细节，可随服务演化。
>
> 但 CDC 往往直接复用上游数据库模式做复制，这会把原本“内部模式”变成“外部契约”。删除某个列可能会直接破坏下游消费者[^34]。
>
> 一种常见解法是 **Outbox 模式**：专门维护对外发布的 outbox 表，让 CDC 读取 outbox，而不是直接读取内部领域模型表。这样可以在尽量不影响外部消费者的前提下演化内部模式[^35] [^36]。它看起来像双写，实际上也是双写；但它把两次写入留在同一个数据库系统内，因此可放进同一事务，规避跨系统双写的一致性问题。

和数据变更捕获一样，重放事件日志也能重建当前状态，但日志压缩策略不同：

* 对于 CDC，更新事件通常携带记录的完整新版本，因此同一主键的最新事件就足以决定当前值，旧事件可被压缩。
* 对于事件溯源，事件通常描述用户意图而非状态覆盖，后续事件一般不会“覆盖”先前事件，因此重建状态通常需要完整历史，不能按 CDC 的方式压缩。

采用事件溯源的系统通常会保存由事件日志导出的状态快照，以降低读取与恢复成本；但快照本质上是性能优化。其核心假设仍是：原始事件可长期保存，并在需要时可完整重放。我们将在“不变性的局限性”中讨论这一假设的边界。

### 状态、流和不变性 {#sec_stream_immutability}

我们在 [第十一章](/ch11) 中看到，批处理因其输入文件不变性而受益良多，你可以在现有输入文件上运行实验性处理作业，而不用担心损坏它们。这种不变性原则也是使得事件溯源与数据变更捕获如此强大的原因。

我们通常将数据库视为应用程序当前状态的存储 —— 这种表示针对读取进行了优化，而且通常对于服务查询而言是最为方便的表示。状态的本质是，它会变化，所以数据库才会支持数据的增删改。这又该如何匹配不变性呢？

只要你的状态发生了变化，那么这个状态就是这段时间中事件修改的结果。例如，当前可用的座位列表是你已处理的预订所产生的结果，当前帐户余额是帐户中的借与贷的结果，而 Web 服务器的响应时间图，是所有已发生 Web 请求的独立响应时间的聚合结果。

无论状态如何变化，总是有一系列事件导致了这些变化。即使事情做了又被撤销，这些事件曾经发生过的事实也始终成立。关键的想法是：可变的状态与不可变事件的仅追加日志相互之间并不矛盾：它们是一体两面，互为阴阳的。所有变化的日志 —— **变化日志（changelog）**，表示了随时间演变的状态。

如果你倾向于数学表示，那么你可能会说，应用状态是事件流对时间求积分得到的结果，而变更流是状态对时间求微分的结果，如 [图 12-7](#fig_stream_state_derivative) 所示[^37] [^38]。这个比喻有一些局限性（例如，状态的二阶导似乎没有意义），但这是考虑数据的一个实用出发点。

$$
\begin{aligned}
state(now) &= \int_{t=0}^{now} stream(t)\,dt \\
stream(t) &= \frac{d\,state(t)}{dt}
\end{aligned}
$$

{{< figure src="/fig/ddia_1207.png" id="fig_stream_state_derivative" caption="图 12-7. 应用当前状态与事件流之间的关系。" class="w-full my-4" >}}

如果你持久存储了变更日志，那么重现状态就非常简单。如果你将事件日志视为记录系统，而把可变状态视为其派生结果，那么系统中的数据流就更容易推理。正如 Jim Gray 和 Andreas Reuter 在 1992 年所说[^39]：

> 从原理上讲，数据库并非必需；日志已经包含了全部信息。之所以要保留数据库（即日志末端的当前状态），只是为了提高读取性能。

日志压缩（如 “[日志压缩](#sec_stream_log_compaction)” 中所述）是连接日志与数据库状态之间的桥梁：它只保留每条记录的最新版本，并丢弃被覆盖的版本。

#### 不可变事件的优点 {#sec_stream_immutability_pros}

数据库中的不变性是一个古老的概念。例如，会计在几个世纪以来一直在财务记账中应用不变性。一笔交易发生时，它被记录在一个仅追加写入的分类帐中，实质上是描述货币、商品或服务转手的事件日志。账目，比如利润、亏损、资产负债表，是从分类账中的交易求和派生而来[^40]。

如果发生错误，会计师不会删除或更改分类帐中的错误交易 —— 而是添加另一笔交易以补偿错误，例如退还一笔不正确的费用。不正确的交易将永远保留在分类帐中，对于审计而言可能非常重要。如果从不正确的分类账派生出的错误数字已经公布，那么下一个会计周期的数字就会包括一个更正。这个过程在会计事务中是很常见的[^41]。

尽管这种可审计性在金融系统中尤其重要，但对于不受这种严格监管的许多其他系统，也是很有帮助的。如 “[批处理用例](/ch11#sec_batch_output)” 中所讨论的，如果你意外地部署了将错误数据写入数据库的错误代码，当代码会破坏性地覆写数据时，恢复要困难得多。使用不可变事件的仅追加日志，诊断问题与故障恢复就要容易得多。

不可变的事件也包含了比当前状态更多的信息。例如在购物网站上，顾客可以将物品添加到他们的购物车，然后再将其移除。虽然从履行订单的角度，第二个事件取消了第一个事件，但对分析目的而言，知道客户考虑过某个特定项而之后又反悔，可能是很有用的。也许他们会选择在未来购买，或者他们已经找到了替代品。这个信息被记录在事件日志中，但对于移出购物车就删除记录的数据库而言，这个信息在移出购物车时可能就丢失了。

#### 从同一事件日志中派生多个视图 {#sec_stream_deriving_views}

此外，通过从不变的事件日志中分离出可变的状态，你可以针对不同的读取方式，从相同的事件日志中派生出几种不同的表现形式。效果就像一个流的多个消费者一样（[图 12-5](#fig_stream_cdc_flow)）：例如，Kafka Connect 能将来自 Kafka 的数据导出到各种不同的数据库与索引[^33]。这对于许多其他存储和索引系统（如搜索服务器）来说也是有意义的，当系统要从分布式日志中获取输入时尤其如此（请参阅 “[保持系统同步](#sec_stream_sync)”）。

添加从事件日志到数据库的显式转换，能够使应用更容易地随时间演进：如果你想要引入一个新功能，以新的方式表示现有数据，则可以使用事件日志来构建一个单独的、针对新功能的读取优化视图，无需修改现有系统而与之共存。并行运行新旧系统通常比在现有系统中执行复杂的模式迁移更容易。一旦不再需要旧的系统，你可以简单地关闭它并回收其资源[^42] [^43]。

如果你不需要担心如何查询与访问数据，那么存储数据通常是非常简单的。模式设计、索引和存储引擎的许多复杂性，都是希望支持某些特定查询和访问模式的结果（请参阅 [第三章](/ch3)）。出于这个原因，通过将数据写入的形式与读取形式相分离，并允许几个不同的读取视图，你能获得很大的灵活性。这个想法有时被称为 **命令查询责任分离（command query responsibility segregation, CQRS）**[^44]。

数据库和模式设计的传统方法是基于这样一种谬论，数据必须以与查询相同的形式写入。如果可以将数据从针对写入优化的事件日志转换为针对读取优化的应用状态，那么有关规范化和非规范化的争论就变得无关紧要了（请参阅 “[多对一和多对多的关系](/ch3#sec_datamodels_normalization)”）：在针对读取优化的视图中对数据进行非规范化是完全合理的，因为翻译过程提供了使其与事件日志保持一致的机制。

在 “[描述负载](/ch2#sec_introduction_twitter)” 中，我们讨论了推特主页时间线，它是特定用户关注的人群所发推特的缓存（类似邮箱）。这是 **针对读取优化的状态** 的又一个例子：主页时间线是高度非规范化的，因为你的推文与你所有粉丝的时间线都构成了重复。然而，扇出服务保持了这种重复状态与新推特以及新关注关系的同步，从而保证了重复的可管理性。

#### 并发控制 {#sec_stream_concurrency}

事件溯源和数据变更捕获的最大缺点是，事件日志的消费者通常是异步的，所以可能会出现这样的情况：用户会写入日志，然后从日志派生视图中读取，结果发现他的写入还没有反映在读取视图中。我们之前在 “[读己之写](/ch6#sec_replication_ryw)” 中讨论了这个问题以及可能的解决方案。

一种解决方案是将事件追加到日志时同步执行读取视图的更新。而将这些写入操作合并为一个原子单元需要 **事务**，所以要么将事件日志和读取视图保存在同一个存储系统中，要么就需要跨不同系统进行分布式事务。或者，你也可以使用在 “[使用共享日志](/ch10#sec_consistency_smr)” 中讨论的方法。

另一方面，从事件日志导出当前状态也简化了并发控制的某些部分。许多对于多对象事务的需求（请参阅 “[单对象和多对象操作](/ch8#sec_transactions_multi_object)”）源于单个用户操作需要在多个不同的位置更改数据。通过事件溯源，你可以设计一个自包含的事件以表示一个用户操作。然后用户操作就只需要在一个地方进行单次写入操作 —— 即将事件附加到日志中 —— 这很容易做到原子化。

如果事件日志与应用状态以相同的方式分区（例如，处理分区 3 中的客户事件只需要更新分区 3 中的应用状态），那么直接使用单线程日志消费者就不需要写入并发控制了。它从设计上一次只处理一个事件（请参阅 “[真的串行执行](/ch8#sec_transactions_serial)”）。日志通过在分区中定义事件的序列顺序，消除了并发性的不确定性[^27]。如果一个事件触及多个状态分区，那么需要做更多的工作，我们将在 [第十三章](/ch13) 讨论。

#### 不变性的局限性 {#sec_stream_immutability_limitations}

许多不使用事件溯源模型的系统也还是依赖不可变性：各种数据库在内部使用不可变的数据结构或多版本数据来支持时间点快照（请参阅 “[索引和快照隔离](/ch8#sec_transactions_snapshot_indexes)” ）。Git、Mercurial 和 Fossil 等版本控制系统也依靠不可变的数据来保存文件的版本历史记录。

永远保持所有变更的不变历史，在多大程度上是可行的？答案取决于数据集的流失率。一些工作负载主要是添加数据，很少更新或删除；它们很容易保持不变。其他工作负载在相对较小的数据集上有较高的更新 / 删除率；在这些情况下，不可变的历史可能增至难以接受的巨大，碎片化可能成为一个问题，压缩与垃圾收集的表现对于运维的稳健性变得至关重要[^45] [^46]。

除了性能方面的原因外，也可能有出于管理方面的原因需要删除数据的情况，尽管这些数据都是不可变的。例如，隐私条例可能要求在用户关闭帐户后删除他们的个人信息，数据保护立法可能要求删除错误的信息，或者可能需要阻止敏感信息的意外泄露。

在这种情况下，仅仅在日志中添加另一个事件来指明先前的数据应该被视为删除是不够的 —— 你实际上是想改写历史，并假装数据从一开始就没有写入。例如，Datomic 管这个特性叫 **切除（excision）**[^47]，而 Fossil 版本控制系统有一个类似的概念叫 **避免（shunning）**[^48]。

真正删除数据是非常非常困难的[^49]，因为副本可能存在于很多地方：例如，存储引擎、文件系统和 SSD 通常会向新位置写入，而不是原地覆盖旧数据[^41]；而备份往往刻意设计为不可变，以防误删或损坏。

一种支持删除不可变数据的方法是 **加密粉碎（crypto-shredding）**[^50]：将未来可能需要删除的数据以加密形式保存，删除时仅销毁密钥。这样，密文仍在，但不可再被使用。从某种意义上说，这只是把可变性从“数据本身”转移到“密钥管理”上。

此外，你需要预先决定哪些数据共享同一密钥、哪些数据使用不同密钥，因为后续你能“粉碎”的粒度通常是“该密钥加密的全部数据”或“都不删”，很难只删其中一部分。若为每条记录单独存密钥，密钥存储规模又会变得不可控。像 puncturable encryption 这样的高级方案[^51]可以提供更细粒度的撤销能力，但尚未广泛落地。

总的来说，删除更多是在“让数据更难被取回”，而非“让数据绝对不可恢复”。尽管如此，在某些场景下仍必须尽力而为，正如我们在 “[立法与自律](/ch14#sec_future_legislation)” 中会看到的。


## 流处理 {#sec_stream_processing}

到目前为止，本章中我们已经讨论了流的来源（用户活动事件，传感器和写入数据库），我们讨论了流如何传输（直接通过消息传送，通过消息代理，通过事件日志）。

剩下的就是讨论一下你可以用流做什么 —— 也就是说，你可以处理它。一般来说，有三种选项：

1. 你可以将事件中的数据写入数据库、缓存、搜索索引或类似的存储系统，然后能被其他客户端查询。如 [图 12-5](#fig_stream_cdc_flow) 所示，这是数据库与系统其他部分所发生的变更保持同步的好方法 —— 特别是当流消费者是写入数据库的唯一客户端时。写入存储系统，相当于我们在 “[批处理工作流的输出](/ch11#sec_batch_output)” 中所讨论内容在流处理中的对应物。
2. 你能以某种方式将事件推送给用户，例如发送报警邮件或推送通知，或将事件流式传输到可实时显示的仪表板上。在这种情况下，人是流的最终消费者。
3. 你可以处理一个或多个输入流，并产生一个或多个输出流。流可能会经过由几个这样的处理阶段组成的流水线，最后再输出（选项 1 或 2）。

在本章的剩余部分中，我们将讨论选项 3：处理流以产生其他派生流。处理这样的流的代码片段，被称为 **算子（operator）** 或 **作业（job）**。它与我们在 [第十一章](/ch11) 中讨论过的 Unix 进程和 MapReduce 作业密切相关，数据流的模式是相似的：一个流处理器以只读的方式使用输入流，并将其输出以仅追加的方式写入一个不同的位置。

流处理中的分区和并行化模式也非常类似于 [第十一章](/ch11) 中介绍的 MapReduce 和数据流引擎，因此我们不再重复这些主题。基本的 Map 操作（如转换和过滤记录）也是一样的。

与批量作业相比的一个关键区别是，流不会结束。这种差异会带来很多隐含的结果。正如本章开始部分所讨论的，排序对无界数据集没有意义，因此无法使用 **排序合并连接**（请参阅 “[Reduce 侧连接与分组](/ch11#sec_batch_join)”）。容错机制也必须改变：对于已经运行了几分钟的批处理作业，可以简单地从头开始重启失败任务，但是对于已经运行数年的流作业，重启后从头开始跑可能并不是一个可行的选项。

### 流处理的应用 {#sec_stream_uses}

长期以来，流处理一直用于监控目的，如果某个事件发生，组织希望能得到警报。例如：

* 欺诈检测系统需要确定信用卡的使用模式是否发生了意外的变化，如果信用卡可能已被盗刷，则锁卡。
* 交易系统需要检查金融市场的价格变化，并根据指定的规则进行交易。
* 制造系统需要监控工厂中机器的状态，如果出现故障，可以快速定位问题。
* 军事和情报系统需要跟踪潜在侵略者的活动，并在出现袭击征兆时发出警报。

这些类型的应用需要非常精密复杂的模式匹配与相关检测。然而随着时代的进步，流处理的其他用途也开始出现。在本节中，我们将简要比较一下这些应用。

#### 复合事件处理 {#id317}

**复合事件处理（complex event processing, CEP）** 是 20 世纪 90 年代为分析事件流而开发出的一种方法，尤其适用于需要搜索某些事件模式的应用[^52]。与正则表达式允许你在字符串中搜索特定字符模式的方式类似，CEP 允许你指定规则以在流中搜索某些事件模式。

CEP 系统通常使用高层次的声明式查询语言，比如 SQL，或者图形用户界面，来描述应该检测到的事件模式。这些查询被提交给处理引擎，该引擎消费输入流，并在内部维护一个执行所需匹配的状态机。当发现匹配时，引擎发出一个 **复合事件**（即 complex event，CEP 因此得名），并附有检测到的事件模式详情[^53]。

在这些系统中，查询和数据之间的关系与普通数据库相比是颠倒的。通常情况下，数据库会持久存储数据，并将查询视为临时的：当查询进入时，数据库搜索与查询匹配的数据，然后在查询完成时丢掉查询。CEP 引擎反转了角色：查询是长期存储的，来自输入流的事件不断流过它们，搜索匹配事件模式的查询[^54]。

CEP 的实现包括 Esper、Apama 和 TIBCO StreamBase。像 Flink 和 Spark Streaming 这样的分布式流处理框架，也支持在流上使用 SQL 进行声明式查询。

#### 流分析 {#id318}

使用流处理的另一个领域是对流进行分析。CEP 与流分析之间的边界是模糊的，但一般来说，分析往往对找出特定事件序列并不关心，而更关注大量事件上的聚合与统计指标 —— 例如：

* 测量某种类型事件的速率（每个时间间隔内发生的频率）
* 滚动计算一段时间窗口内某个值的平均值
* 将当前的统计值与先前的时间区间的值对比（例如，检测趋势，当指标与上周同比异常偏高或偏低时报警）

这些统计值通常是在固定时间区间内进行计算的，例如，你可能想知道在过去 5 分钟内服务每秒查询次数的均值，以及此时间段内响应时间的第 99 百分位点。在几分钟内取平均，能抹平秒和秒之间的无关波动，且仍然能向你展示流量模式的时间图景。聚合的时间间隔称为 **窗口（window）**，我们将在 “[时间推理](#sec_stream_time)” 中更详细地讨论窗口。

流分析系统有时会使用概率算法，例如 Bloom filter（我们在 “[性能优化](/ch4#sec_storage_bloom_filter)” 中遇到过）来管理成员资格，HyperLogLog[^55]用于基数估计以及各种百分位点估计算法（请参阅 “[实践中的百分位点](/ch2#sec_introduction_percentiles)”）。概率算法产出近似的结果，但比起精确算法的优点是内存使用要少得多。使用近似算法有时让人们觉得流处理系统总是有损的和不精确的，但这是错误看法：流处理并没有任何内在的近似性，而概率算法只是一种优化[^56]。

许多开源分布式流处理框架都是针对分析设计的：例如 Apache Storm、Spark Streaming、Flink、Samza、Apache Beam 和 Kafka Streams[^57]。托管服务包括 Google Cloud Dataflow 和 Azure Stream Analytics。

#### 维护物化视图 {#sec_stream_mat_view}

我们在 “[数据库与流](#sec_stream_databases)” 中看到，数据库的变更流可以用于维护派生数据系统（如缓存、搜索索引和数据仓库），并使其与源数据库保持最新。我们可以将这些示例视作维护 **物化视图（materialized view）** 的一种具体场景：在某个数据集上派生出一个替代视图以便高效查询，并在底层数据变更时更新视图[^37]。

同样，在事件溯源中，应用程序的状态是通过应用事件日志来维护的；这里的应用程序状态也是一种物化视图。与流分析场景不同的是，仅考虑某个时间窗口内的事件通常是不够的：构建物化视图可能需要任意时间段内的 **所有** 事件，除了那些可能由日志压缩丢弃的过时事件（请参阅 “[日志压缩](#sec_stream_log_compaction)”）。实际上，你需要一个可以一直延伸到时间开端的窗口。

原则上讲，任何流处理组件都可以用于维护物化视图，尽管 “永远运行” 与一些面向分析的框架假设的 “主要在有限时间段窗口上运行” 背道而驰，Kafka Streams 和 Confluent 的 ksqlDB 支持这种用法，建立在 Kafka 对日志压缩的支持上[^58]。

> [!TIP] 增量视图维护
> 数据库看起来很适合做物化视图维护：它们本来就擅长保存完整数据副本，也常常支持物化视图。
>
> 但很多数据库刷新物化视图仍依赖批处理或按需触发（例如 PostgreSQL 的 `REFRESH MATERIALIZED VIEW`），而不是在源数据变化时做增量维护。这会带来两个问题：
>
> 1. 效率低：每次刷新都重算全量数据，而不是只处理变化部分[^38] [^59] [^60]。
> 2. 不够实时：刷新间隔内的变化不会立刻反映在视图里。
>
> Materialize、RisingWave、ClickHouse、Feldera 等系统都在探索更实时的增量维护路径[^61]。

#### 在流上搜索 {#id320}

除了允许搜索由多个事件构成模式的 CEP 外，有时也存在基于复杂标准（例如全文检索查询）来搜索单个事件的需求。

例如，媒体监测服务可以订阅新闻文章 Feed 与来自媒体机构的广播，搜索任何关于公司、产品或感兴趣的话题的新闻。这是通过预先构建一个搜索查询来完成的，然后不断地将新闻项的流与该查询进行匹配。在一些网站上也有类似的功能：例如，当市场上出现符合其搜索条件的新房产时，房地产网站的用户可以要求网站通知他们。Elasticsearch 的 percolator 功能，是实现这种流搜索的一种选择[^62]。

传统的搜索引擎首先索引文件，然后在索引上跑查询。相比之下，搜索一个数据流则反了过来：查询被存储下来，文档从查询中流过，就像在 CEP 中一样。最简单的情况就是，你可以为每个文档测试每个查询。但是如果你有大量查询，这可能会变慢。为了优化这个过程，可以像对文档一样，为查询建立索引，从而收窄可能匹配的查询集合[^63]。

#### 事件驱动架构与 RPC {#sec_stream_actors_drpc}

在 “[消息传递中的数据流](/ch5#sec_encoding_dataflow_msg)” 中我们讨论过，消息传递系统可以作为 RPC 的替代方案，即作为一种服务间通信的机制，比如在 Actor 模型中所使用的那样。尽管这些系统也是基于消息和事件，但我们通常不会将其视作流处理组件：

* Actor 框架主要是管理模块通信的并发和分布式执行的一种机制，而流处理主要是一种数据管理技术。
* Actor 之间的交流往往是短暂的、一对一的；而事件日志则是持久的、多订阅者的。
* Actor 可以以任意方式进行通信（包括循环的请求 / 响应模式），但流处理通常配置在无环流水线中，其中每个流都是一个特定作业的输出，由良好定义的输入流中派生而来。

也就是说，RPC 类系统与流处理之间有一些交叉领域。例如，Apache Storm 有一个称为 **分布式 RPC** 的功能，它允许将用户查询分散到一系列也处理事件流的节点上；然后这些查询与来自输入流的事件交织，而结果可以被汇总并发回给用户（另请参阅 “[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)”）。

也可以使用 Actor 框架来处理流。但是，很多这样的框架在崩溃时不能保证消息的传递，除非你实现了额外的重试逻辑，否则这种处理不是容错的。

### 时间推理 {#sec_stream_time}

流处理通常需要与时间打交道，尤其是用于分析目的的时候，会频繁使用时间窗口，例如 “过去五分钟的平均值”。“过去五分钟” 的含义看上去似乎是清晰而无歧义的，但不幸的是，这个概念非常棘手。

在批处理过程中，大量的历史事件被快速地处理。如果需要按时间来分析，批处理器需要检查每个事件中嵌入的时间戳。读取运行批处理机器的系统时钟没有任何意义，因为处理运行的时间与事件实际发生的时间无关。

批处理可以在几分钟内读取一年的历史事件；在大多数情况下，感兴趣的时间线是历史中的一年，而不是处理中的几分钟。而且使用事件中的时间戳，使得处理是 **确定性** 的：在相同的输入上再次运行相同的处理过程会得到相同的结果。

另一方面，许多流处理框架使用处理机器上的本地系统时钟（**处理时间**，即 processing time）来确定 **窗口（windowing）**[^64]。这种方法的优点是简单，如果事件创建与事件处理之间的延迟可以忽略不计，那也是合理的。然而，如果存在任何显著的处理延迟 —— 即，事件处理显著地晚于事件实际发生的时间，这种处理方式就失效了。

#### 事件时间与处理时间 {#id322}

很多原因都可能导致处理延迟：排队，网络故障（请参阅 “[不可靠的网络](/ch9#sec_distributed_networks)”），性能问题导致消息代理 / 消息处理器出现争用，流消费者重启，从故障中恢复时重新处理过去的事件（请参阅 “[重播旧消息](#sec_stream_replay)”），或者在修复代码 BUG 之后。

而且，消息延迟还可能导致无法预测消息顺序。例如，假设用户首先发出一个 Web 请求（由 Web 服务器 A 处理），然后发出第二个请求（由服务器 B 处理）。A 和 B 发出描述它们所处理请求的事件，但是 B 的事件在 A 的事件到达之前就先到达了消息代理。现在，流处理器将首先看到 B 事件，然后看到 A 事件，即使它们实际上是以相反的顺序发生的。

有一个类比也许能帮助理解，“星球大战” 电影：第四集于 1977 年发行，第五集于 1980 年，第六集于 1983 年，紧随其后的是 1999 年的第一集、2002 年的第二集、2005 年的第三集，以及 2015 年、2017 年和 2019 年的第七至第九集[^65]。如果你按照它们上映的顺序观看电影，你处理电影的顺序与它们叙事的顺序就是不一致的。（集数编号就像事件时间戳，而你观看电影的日期就是处理时间）作为人类，我们能够应对这种不连续性，但是流处理算法需要专门编写，以适应这种时序与顺序的问题。

将事件时间和处理时间搞混会导致错误的数据。例如，假设你有一个流处理器用于测量请求速率（计算每秒请求数）。如果你重新部署流处理器，它可能会停止一分钟，并在恢复之后处理积压的事件。如果你按处理时间来衡量速率，那么在处理积压日志时，请求速率看上去就像有一个异常的突发尖峰，而实际上请求速率是稳定的（[图 12-8](#fig_stream_processing_time_skew)）。

{{< figure src="/fig/ddia_1208.png" id="fig_stream_processing_time_skew" caption="图 12-8. 按处理时间分窗，会因为处理速率的变动引入人为因素。" class="w-full my-4" >}}

#### 处理滞留事件 {#id323}

用事件时间来定义窗口的一个棘手的问题是，你永远也无法确定是不是已经收到了特定窗口的所有事件，还是说还有一些事件正在来的路上。

例如，假设你将事件分组为一分钟的窗口，以便统计每分钟的请求数。你已经计数了一些带有本小时内第 37 分钟时间戳的事件，时间流逝，现在进入的主要都是本小时内第 38 和第 39 分钟的事件。什么时候才能宣布你已经完成了第 37 分钟的窗口计数，并输出其计数器值？

在一段时间没有看到任何新的事件之后，你可以超时并宣布一个窗口已经就绪，但仍然可能发生这种情况：某些事件被缓冲在另一台机器上，由于网络中断而延迟。你需要能够处理这种在窗口宣告完成之后到达的 **滞留（straggler）** 事件。大体上，你有两种选择[^1]：

1. 忽略这些滞留事件，因为在正常情况下它们可能只是事件中的一小部分。你可以将丢弃事件的数量作为一个监控指标，并在出现大量丢消息的情况时报警。
2. 发布一个 **更正（correction）**，一个包括滞留事件的更新窗口值。你可能还需要收回以前的输出。

在某些情况下，可以使用特殊的消息来指示 “从现在开始，不会有比 t 更早时间戳的消息了”，消费者可以使用它来触发窗口[^66]。但是，如果不同机器上的多个生产者都在生成事件，每个生产者都有自己的最小时间戳阈值，则消费者需要分别跟踪每个生产者。在这种情况下，添加和删除生产者都是比较棘手的。

#### 你用的是谁的时钟？ {#id438}

当事件可能在系统内多个地方进行缓冲时，为事件分配时间戳更加困难了。例如，考虑一个移动应用向服务器上报关于用量的事件。该应用可能会在设备处于脱机状态时被使用，在这种情况下，它将在设备本地缓冲事件，并在下一次互联网连接可用时向服务器上报这些事件（可能是几小时甚至几天之后）。对于这个流的任意消费者而言，它们就如延迟极大的滞留事件一样。

在这种情况下，事件上的时间戳实际上应当是用户交互发生的时间，取决于移动设备的本地时钟。然而用户控制的设备上的时钟通常是不可信的，因为它可能会被无意或故意设置成错误的时间（请参阅 “[时钟同步与准确性](/ch9#sec_distributed_clock_accuracy)”）。服务器收到事件的时间（取决于服务器的时钟）可能是更准确的，因为服务器在你的控制之下，但在描述用户交互方面意义不大。

要校正不正确的设备时钟，一种方法是记录三个时间戳[^67]：

* 事件发生的时间，取决于设备时钟
* 事件发送往服务器的时间，取决于设备时钟
* 事件被服务器接收的时间，取决于服务器时钟

通过从第三个时间戳中减去第二个时间戳，可以估算设备时钟和服务器时钟之间的偏移（假设网络延迟与所需的时间戳精度相比可忽略不计）。然后可以将该偏移应用于事件时间戳，从而估计事件实际发生的真实时间（假设设备时钟偏移在事件发生时与送往服务器之间没有变化）。

这并不是流处理独有的问题，批处理有着完全一样的时间推理问题。只是在流处理的上下文中，我们更容易意识到时间的流逝。

#### 窗口的类型 {#id324}

当你知道如何确定一个事件的时间戳后，下一步就是如何定义时间段的窗口。然后窗口就可以用于聚合，例如事件计数，或计算窗口内值的平均值。有几种窗口很常用[^64] [^68]：

滚动窗口（Tumbling Window）
: 滚动窗口有着固定的长度，每个事件都仅能属于一个窗口。例如，假设你有一个 1 分钟的滚动窗口，则所有时间戳在 `10:03:00` 和 `10:03:59` 之间的事件会被分组到一个窗口中，`10:04:00` 和 `10:04:59` 之间的事件被分组到下一个窗口，依此类推。通过将每个事件时间戳向下取整至最近的整分钟来确定它所属的窗口，可以实现 1 分钟的滚动窗口。

跳动窗口（Hopping Window）
: 跳动窗口也有着固定的长度，但允许窗口重叠以提供一些平滑。例如，一个带有 1 分钟跳跃步长的 5 分钟窗口将包含 `10:03:00` 至 `10:07:59` 之间的事件，而下一个窗口将覆盖 `10:04:00` 至 `10:08:59` 之间的事件，等等。通过首先计算 1 分钟的滚动窗口（tumbling window），然后在几个相邻窗口上进行聚合，可以实现这种跳动窗口。

滑动窗口（Sliding Window）
: 滑动窗口包含了彼此间距在特定时长内的所有事件。例如，一个 5 分钟的滑动窗口应当覆盖 `10:03:39` 和 `10:08:12` 的事件，因为它们相距不超过 5 分钟（注意长度为 5 分钟的滚动窗口与跳动窗口可能不会把这两个事件分组到同一个窗口中，因为它们使用固定的边界）。通过维护一个按时间排序的事件缓冲区，并不断从窗口中移除过期的旧事件，可以实现滑动窗口。

会话窗口（Session window）
: 与其他窗口类型不同，会话窗口没有固定的持续时间，而定义为：将同一用户出现时间相近的所有事件分组在一起，而当用户一段时间没有活动时（例如，如果 30 分钟内没有事件）窗口结束。会话切分是网站分析的常见需求（请参阅 “[JOIN 与 GROUP BY](/ch11#sec_batch_join)”）。

### 流连接 {#sec_stream_joins}

在 [第十一章](/ch11) 中，我们讨论了批处理作业如何通过键来连接数据集，以及这种连接是如何成为数据管道的重要组成部分的。由于流处理将数据管道泛化为对无限数据集进行增量处理，因此对流进行连接的需求也是完全相同的。

然而，新事件随时可能出现在一个流中，这使得流连接要比批处理连接更具挑战性。为了更好地理解情况，让我们先来区分三种不同类型的连接：**流 - 流** 连接，**流 - 表** 连接，与 **表 - 表** 连接。我们将在下面的章节中通过例子来说明。

#### 流流连接（窗口连接） {#id440}

假设你的网站上有搜索功能，而你想要找出搜索 URL 的近期趋势。每当有人键入搜索查询时，都会记录下一个包含查询与其返回结果的事件。每当有人点击其中一个搜索结果时，就会记录另一个描述该点击的事件。为了计算搜索结果中每个 URL 的点击率，你需要将搜索动作与点击动作的事件连在一起，这些事件通过相同的会话 ID 进行连接。广告系统中需要类似的分析[^69]。

如果用户丢弃了搜索结果，点击可能永远不会发生，即使它出现了，搜索与点击之间的时间可能是高度可变的：在很多情况下，它可能是几秒钟，但也可能长达几天或几周（如果用户执行搜索，忘掉了这个浏览器页面，过了一段时间后重新回到这个浏览器页面上，并点击了一个结果）。由于可变的网络延迟，点击事件甚至可能先于搜索事件到达。你可以选择合适的连接窗口 —— 例如，如果点击与搜索之间的时间间隔在一小时内，你可能会选择连接两者。

请注意，在点击事件中嵌入搜索详情与事件连接并不一样：这样做的话，只有当用户点击了一个搜索结果时你才能知道，而那些没有点击的搜索就无能为力了。为了衡量搜索质量，你需要准确的点击率，为此搜索事件和点击事件两者都是必要的。

为了实现这种类型的连接，流处理器需要维护 **状态**：例如，按会话 ID 索引最近一小时内发生的所有事件。无论何时发生搜索事件或点击事件，都会被添加到合适的索引中，而流处理器也会检查另一个索引是否有具有相同会话 ID 的事件到达。如果有匹配事件就会发出一个表示搜索结果被点击的事件；如果搜索事件直到过期都没看见有匹配的点击事件，就会发出一个表示搜索结果未被点击的事件。

#### 流表连接（流扩充） {#sec_stream_table_joins}

在 “[示例：用户活动事件分析](/ch11#sec_batch_join)”（[图 11-2](/ch11#fig_batch_join_example)）中，我们看到了连接两个数据集的批处理作业示例：一组用户活动事件和一个用户档案数据库。将用户活动事件视为流，并在流处理器中连续执行相同的连接是很自然的想法：输入是包含用户 ID 的活动事件流，而输出还是活动事件流，但其中用户 ID 已经被扩展为用户的档案信息。这个过程有时被称为使用数据库的信息来 **扩充（enriching）** 活动事件。

要执行此连接，流处理器需要一次处理一个活动事件，在数据库中查找事件的用户 ID，并将档案信息添加到活动事件中。数据库查询可以通过查询远程数据库来实现。但正如在 “[示例：用户活动事件分析](/ch11#sec_batch_join)” 一节中讨论的，此类远程查询可能会很慢，并且有可能导致数据库过载[^58]。

另一种方法是将数据库副本加载到流处理器中，以便在本地进行查询而无需网络往返。这种技术与我们在 “[JOIN 与 GROUP BY](/ch11#sec_batch_join)” 中讨论的散列连接非常相似：如果数据库的本地副本足够小，则可以是内存中的散列表，比较大的话也可以是本地磁盘上的索引。

与批处理作业的区别在于，批处理作业使用数据库的时间点快照作为输入，而流处理器是长时间运行的，且数据库的内容可能随时间而改变，所以流处理器数据库的本地副本需要保持更新。这个问题可以通过数据变更捕获来解决：流处理器可以订阅用户档案数据库的更新日志，如同活动事件流一样。当增添或修改档案时，流处理器会更新其本地副本。因此，我们有了两个流之间的连接：活动事件和档案更新。

流表连接实际上非常类似于流流连接；最大的区别在于对于表的变更日志流，连接使用了一个可以回溯到 “时间起点” 的窗口（概念上是无限的窗口），新版本的记录会覆盖更早的版本。对于输入的流，连接可能压根儿就没有维护任何窗口。

#### 表表连接（维护物化视图） {#id326}

我们在 “[描述负载](/ch2#sec_introduction_twitter)” 中讨论推特时间线的例子时说过，当用户想要查看他们的主页时间线时，迭代用户所关注人群的推文并合并它们是一个开销巨大的操作。

相反，我们需要一个时间线缓存：一种每个用户的 “收件箱”，在发送推文的时候写入这些信息，因而读取时间线时只需要简单地查询即可。物化与维护这个缓存需要处理以下事件：

* 当用户 *u* 发送新的推文时，它将被添加到每个关注 *u* 的用户的时间线上。
* 用户删除推文时，推文将从所有用户的时间线中删除。
* 当用户 *u*~1~ 开始关注用户 *u*~2~ 时，*u*~2~ 最近的推文将被添加到 *u*~1~ 的时间线上。
* 当用户 *u*~1~ 取消关注用户 *u*~2~ 时，*u*~2~ 的推文将从 *u*~1~ 的时间线中移除。

要在流处理器中实现这种缓存维护，你需要推文事件流（发送与删除）和关注关系事件流（关注与取消关注）。流处理需要维护一个数据库，包含每个用户的粉丝集合，以便知道当一条新推文到达时，需要更新哪些时间线。

观察这个流处理过程的另一种视角是：它维护了一个连接了两个表（推文与关注）的物化视图，如下所示：

```sql
SELECT follows.follower_id AS timeline_id,
    array_agg(tweets.* ORDER BY tweets.timestamp DESC)
FROM tweets
JOIN follows ON follows.followee_id = tweets.sender_id
GROUP BY follows.follower_id
```

流连接直接对应于这个查询中的表连接。时间线实际上是这个查询结果的缓存，每当底层的表发生变化时都会更新。

> [!NOTE]
> 如果你将流视作表的导数（如 [图 12-7](#fig_stream_state_derivative) 所示），并把连接看作两个表 *u·v* 的乘积，那么会出现一个有趣现象：物化连接的变化流遵循乘积法则 \( (u \cdot v)' = u'v + uv' \)。换句话说，任何推文变化都要和当前关注关系连接，任何关注关系变化都要和当前推文连接[^37]。

#### 连接的时间依赖性 {#sec_stream_join_time}

这里描述的三种连接（流流，流表，表表）有很多共通之处：它们都需要流处理器维护连接一侧的一些状态（搜索与点击事件，用户档案，关注列表），然后当连接另一侧的消息到达时查询该状态。

用于维护状态的事件顺序是很重要的（先关注然后取消关注，或者其他类似操作）。在分区日志中，单个分区内的事件顺序是保留下来的。但典型情况下是没有跨流或跨分区的顺序保证的。

这就产生了一个问题：如果不同流中的事件发生在近似的时间范围内，则应该按照什么样的顺序进行处理？在流表连接的例子中，如果用户更新了他们的档案，哪些活动事件与旧档案连接（在档案更新前处理），哪些又与新档案连接（在档案更新之后处理）？换句话说：你需要对一些状态做连接，如果状态会随着时间推移而变化，那应当使用什么时间点来连接呢？

这种时序依赖可能出现在很多地方。例如销售东西需要对发票应用适当的税率，这取决于所处的国家 / 州，产品类型，销售日期（因为税率时不时会变化）。当连接销售额与税率表时，你可能期望的是使用销售时的税率参与连接。如果你正在重新处理历史数据，销售时的税率可能和现在的税率有所不同。

如果跨越流的事件顺序是未定的，则连接会变为不确定性的[^70]，这意味着你在同样输入上重跑相同的作业未必会得到相同的结果：当你重跑任务时，输入流上的事件可能会以不同的方式交织。

在数据仓库中，这个问题被称为 **缓慢变化的维度（slowly changing dimension, SCD）**，通常通过对特定版本的记录使用唯一的标识符来解决：例如，每当税率改变时都会获得一个新的标识符，而发票在销售时会带有税率的标识符[^71] [^72]。这种变化使连接变为确定性的，但也会导致日志压缩无法进行：表中所有的记录版本都需要保留。

### 容错 {#sec_stream_fault_tolerance}

在本章的最后一节中，让我们看一看流处理是如何容错的。我们在 [第十一章](/ch11) 中看到，批处理框架可以很容易地容错：如果 MapReduce 作业中的任务失败，可以简单地在另一台机器上再次启动，并且丢弃失败任务的输出。这种透明的重试是可能的，因为输入文件是不可变的，每个任务都将其输出写入到 HDFS 上的独立文件中，而输出仅当任务成功完成后可见。

特别是，批处理容错方法可确保批处理作业的输出与没有出错的情况相同，即使实际上某些任务失败了。看起来好像每条输入记录都被处理了恰好一次 —— 没有记录被跳过，而且没有记录被处理两次。尽管重启任务意味着实际上可能会多次处理记录，但输出中的可见效果看上去就像只处理过一次。这个原则被称为 **恰好一次语义（exactly-once semantics）**，尽管 **等效一次（effectively-once）** 可能会是一个更写实的术语[^73]。

在流处理中也出现了同样的容错问题，但是处理起来没有那么直观：等待某个任务完成之后再使其输出可见并不是一个可行选项，因为你永远无法处理完一个无限的流。

#### 微批次与存档点 {#id329}

一个解决方案是将流分解成小块，并像微型批处理一样处理每个块。这种方法被称为 **微批次（microbatching）**，它被用于 Spark Streaming[^74]。批次的大小通常约为 1 秒，这是对性能妥协的结果：较小的批次会导致更大的调度与协调开销，而较大的批次意味着流处理器结果可见之前的延迟要更长。

微批次也隐式提供了一个与批次大小相等的滚动窗口（按处理时间而不是事件时间戳分窗）。任何需要更大窗口的作业都需要显式地将状态从一个微批次转移到下一个微批次。

Apache Flink 则使用不同的方法，它会定期生成状态的滚动存档点并将其写入持久存储[^75] [^76]。如果流算子崩溃，它可以从最近的存档点重启，并丢弃从最近存档点到崩溃之间的所有输出。存档点会由消息流中的 **壁障（barrier）** 触发，类似于微批次之间的边界，但不会强制一个特定的窗口大小。

在流处理框架的范围内，微批次与存档点方法提供了与批处理一样的 **恰好一次语义**。但是，只要输出离开流处理器（例如，写入数据库，向外部消息代理发送消息，或发送电子邮件），框架就无法抛弃失败批次的输出了。在这种情况下，重启失败任务会导致外部副作用发生两次，只有微批次或存档点不足以阻止这一问题。

#### 原子提交再现 {#sec_stream_atomic_commit}

为了在出现故障时表现出恰好处理一次的样子，我们需要确保事件处理的所有输出和副作用 **当且仅当** 处理成功时才会生效。这些影响包括发送给下游算子或外部消息传递系统（包括电子邮件或推送通知）的任何消息，任何数据库写入，对算子状态的任何变更，以及对输入消息的任何确认（包括在基于日志的消息代理中将消费者偏移量前移）。

这些事情要么都原子地发生，要么都不发生，但是它们不应当失去同步。如果这种方法听起来很熟悉，那是因为我们在分布式事务和两阶段提交的上下文中讨论过它（请参阅 “[恰好一次的消息处理](/ch8#sec_transactions_exactly_once)”）。

在 [第十章](/ch10) 中，我们讨论了分布式事务传统实现中的问题（如 XA）。然而在限制更为严苛的环境中，也是有可能高效实现这种原子提交机制的。Google Cloud Dataflow[^66] [^75]、VoltDB[^77] 和 Apache Kafka[^78] [^79] 中都使用了这种方法。与 XA 不同，这些实现不会尝试跨异构技术提供事务，而是通过在流处理框架中同时管理状态变更与消息传递来内化事务。事务协议的开销可以通过在单个事务中处理多个输入消息来分摊。

#### 幂等性 {#sec_stream_idempotence}

我们的目标是丢弃任何失败任务的部分输出，以便能安全地重试，而不会生效两次。分布式事务是实现这个目标的一种方式，而另一种方式是依赖 **幂等性（idempotence）**[^80]。

幂等操作是多次重复执行与单次执行效果相同的操作。例如，将键值存储中的某个键设置为某个特定值是幂等的（再次写入该值，只是用同样的值替代），而递增一个计数器不是幂等的（再次执行递增意味着该值递增两次）。

即使一个操作不是天生幂等的，往往也可以通过一些额外的元数据做成幂等的。例如，在消费来自 Kafka 的消息时，每条消息都有一个持久的、单调递增的偏移量。将值写入外部数据库时可以将这个偏移量带上，这样你就可以判断一条更新是不是已经执行过了，因而避免重复执行。

Storm 的 Trident 基于类似的想法来处理状态。依赖幂等性意味着隐含了一些假设：重启一个失败的任务必须以相同的顺序重播相同的消息（基于日志的消息代理能做这些事），处理必须是确定性的，没有其他节点能同时更新相同的值[^81] [^82]。

当从一个处理节点故障切换到另一个节点时，可能需要进行 **防护**（fencing，请参阅 “[领导者和锁](/ch9#sec_distributed_lock_fencing)”），以防止被假死节点干扰。尽管有这么多注意事项，幂等操作是一种实现 **恰好一次语义** 的有效方式，仅需很小的额外开销。

#### 失败后重建状态 {#sec_stream_state_fault_tolerance}

任何需要状态的流处理 —— 例如，任何窗口聚合（例如计数器，平均值和直方图）以及任何用于连接的表和索引，都必须确保在失败之后能恢复其状态。

一种选择是将状态保存在远程数据存储中，并进行复制，然而正如在 “[流表连接（流扩充）](#sec_stream_table_joins)” 中所述，每个消息都要查询远程数据库可能会很慢。另一种方法是在流处理器本地保存状态，并定期复制。然后当流处理器从故障中恢复时，新任务可以读取状态副本，恢复处理而不丢失数据。

例如，Flink 定期捕获算子状态的快照，并将它们写入 HDFS 等持久存储中[^75] [^76]。Kafka Streams 通过将状态变更发送到具有日志压缩功能的专用 Kafka 主题来复制状态变更，这与数据变更捕获类似[^83]。VoltDB 通过在多个节点上对每个输入消息进行冗余处理来复制状态（请参阅 “[真的串行执行](/ch8#sec_transactions_serial)”）。

在某些情况下，甚至可能都不需要复制状态，因为它可以从输入流重建。例如，如果状态是从相当短的窗口中聚合而成，则简单地重播该窗口中的输入事件可能是足够快的。如果状态是通过数据变更捕获来维护的数据库的本地副本，那么也可以从日志压缩的变更流中重建数据库（请参阅 “[日志压缩](#sec_stream_log_compaction)”）。

然而，所有这些权衡取决于底层基础架构的性能特征：在某些系统中，网络延迟可能低于磁盘访问延迟，网络带宽也可能与磁盘带宽相当。没有针对所有情况的普适理想权衡，随着存储和网络技术的发展，本地状态与远程状态的优点也可能会互换。


## 本章小结 {#id332}

在本章中，我们讨论了事件流，它们所服务的目的，以及如何处理它们。在某些方面，流处理非常类似于在 [第十一章](/ch11) 中讨论的批处理，不过是在无限的（永无止境的）流而不是固定大小的输入上持续进行[^84]。从这个角度来看，消息代理和事件日志可以视作文件系统的流式等价物。

我们花了一些时间比较两种消息代理：

AMQP/JMS 风格的消息代理
: 代理将单条消息分配给消费者，消费者在成功处理单条消息后确认消息。消息被确认后从代理中删除。这种方法适合作为一种异步形式的 RPC（另请参阅 “[事件驱动的架构](/ch5#sec_encoding_dataflow_msg)”），例如在任务队列中，消息处理的确切顺序并不重要，而且消息在处理完之后，不需要回头重新读取旧消息。

基于日志的消息代理
: 代理将一个分区中的所有消息分配给同一个消费者节点，并始终以相同的顺序传递消息。并行是通过分区实现的，消费者通过存档最近处理消息的偏移量来跟踪工作进度。消息代理将消息保留在磁盘上，因此如有必要的话，可以回跳并重新读取旧消息。

基于日志的方法与数据库中的复制日志（请参阅 [第六章](/ch6)）和日志结构存储引擎（请参阅 [第四章](/ch4)）有相似之处。我们看到，这种方法对于消费输入流，并产生派生状态或派生输出数据流的系统而言特别适用。

就流的来源而言，我们讨论了几种可能性：用户活动事件，定期读数的传感器，和 Feed 数据（例如，金融中的市场数据）能够自然地表示为流。我们发现将数据库写入视作流也是很有用的：我们可以捕获变更日志 —— 即对数据库所做的所有变更的历史记录 —— 隐式地通过数据变更捕获，或显式地通过事件溯源。日志压缩允许流也能保有数据库内容的完整副本。

将数据库表示为流为系统集成带来了很多强大机遇。通过消费变更日志并将其应用至派生系统，你能使诸如搜索索引、缓存以及分析系统这类派生数据系统不断保持更新。你甚至能从头开始，通过读取从创世至今的所有变更日志，为现有数据创建全新的视图。

像流一样维护状态以及消息重播的基础设施，是在各种流处理框架中实现流连接和容错的基础。我们讨论了流处理的几种目的，包括搜索事件模式（复杂事件处理），计算分窗聚合（流分析），以及保证派生数据系统处于最新状态（物化视图）。

然后我们讨论了在流处理中对时间进行推理的困难，包括处理时间与事件时间戳之间的区别，以及当你认为窗口已经完成之后，如何处理才到达的滞留事件的问题。

我们区分了流处理中可能出现的三种连接类型：

流流连接
: 两个输入流都由活动事件组成，而连接算子在某个时间窗口内搜索相关的事件。例如，它可能会将同一个用户 30 分钟内进行的两个活动联系在一起。如果你想要找出一个流内的相关事件，连接的两侧输入可能实际上都是同一个流（**自连接**，即 self-join）。

流表连接
: 一个输入流由活动事件组成，另一个输入流是数据库变更日志。变更日志保证了数据库的本地副本是最新的。对于每个活动事件，连接算子将查询数据库，并输出一个扩展的活动事件。

表表连接
: 两个输入流都是数据库变更日志。在这种情况下，一侧的每一个变化都与另一侧的最新状态相连接。结果是两表连接所得物化视图的变更流。

最后，我们讨论了在流处理中实现容错和恰好一次语义的技术。与批处理一样，我们需要放弃任何失败任务的部分输出。然而由于流处理长时间运行并持续产生输出，所以不能简单地丢弃所有的输出。相反，可以使用更细粒度的恢复机制，基于微批次、存档点、事务或幂等写入。


### 参考文献 {#references}

[^1]: Tyler Akidau, Robert Bradshaw, Craig Chambers, Slava Chernyak, Rafael J. Fernández-Moctezuma, Reuven Lax, Sam McVeety, Daniel Mills, Frances Perry, Eric Schmidt, and Sam Whittle. [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](https://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf). *Proceedings of the VLDB Endowment*, volume 8, issue 12, pages 1792--1803, August 2015. [doi:10.14778/2824032.2824076](https://doi.org/10.14778/2824032.2824076)
[^2]: Harold Abelson, Gerald Jay Sussman, and Julie Sussman. [*Structure and Interpretation of Computer Programs*](https://web.mit.edu/6.001/6.037/sicp.pdf), 2nd edition. MIT Press, 1996. ISBN: 978-0-262-51087-5, archived at [archive.org/details/sicp_20211010](https://archive.org/details/sicp_20211010)
[^3]: Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec. [The Many Faces of Publish/Subscribe](https://www.cs.ru.nl/~pieter/oss/manyfaces.pdf). *ACM Computing Surveys*, volume 35, issue 2, pages 114--131, June 2003. [doi:10.1145/857076.857078](https://doi.org/10.1145/857076.857078)
[^4]: Don Carney, Uğur Çetintemel, Mitch Cherniack, Christian Convey, Sangdon Lee, Greg Seidman, Michael Stonebraker, Nesime Tatbul, and Stan Zdonik. [Monitoring Streams -- A New Class of Data Management Applications](https://www.vldb.org/conf/2002/S07P02.pdf). At *28th International Conference on Very Large Data Bases* (VLDB), August 2002. [doi:10.1016/B978-155860869-6/50027-5](https://doi.org/10.1016/B978-155860869-6/50027-5)
[^5]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY)
[^6]: Thomas Figg (tef). [how (not) to write a pipeline](https://web.archive.org/web/20250107135013/https://cohost.org/tef/post/1764930-how-not-to-write-a). *cohost.org*, June 2023. Archived at [perma.cc/A3V8-NYCM](https://perma.cc/A3V8-NYCM)
[^7]: Vicent Martí. [Brubeck, a statsd-Compatible Metrics Aggregator](https://github.blog/news-insights/the-library/brubeck/). *github.blog*, June 2015. Archived at [perma.cc/TP3Q-DJYM](https://perma.cc/TP3Q-DJYM)
[^8]: Seth Lowenberger. [MoldUDP64 Protocol Specification V 1.00](https://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf). *nasdaqtrader.com*, July 2009. Archived at [perma.cc/7CRQ-QBD7](https://perma.cc/7CRQ-QBD7)
[^9]: Ian Malpass. [Measure Anything, Measure Everything](https://codeascraft.com/2011/02/15/measure-anything-measure-everything/). *codeascraft.com*, February 2011. Archived at [archive.org](https://web.archive.org/web/20250820034209/https://www.etsy.com/codeascraft/measure-anything-measure-everything/)
[^10]: Dieter Plaetinck. [25 Graphite, Grafana and statsd Gotchas](https://grafana.com/blog/2016/03/03/25-graphite-grafana-and-statsd-gotchas/). *grafana.com*, March 2016. Archived at [perma.cc/3NP3-67U7](https://perma.cc/3NP3-67U7)
[^11]: Jeff Lindsay. [Web Hooks to Revolutionize the Web](https://progrium.github.io/blog/2007/05/03/web-hooks-to-revolutionize-the-web/). *progrium.com*, May 2007. Archived at [perma.cc/BF9U-XNX4](https://perma.cc/BF9U-XNX4)
[^12]: Jim N. Gray. [Queues Are Databases](https://arxiv.org/pdf/cs/0701158.pdf). Microsoft Research Technical Report MSR-TR-95-56, December 1995. Archived at [arxiv.org](https://arxiv.org/pdf/cs/0701158)
[^13]: Mark Hapner, Rich Burridge, Rahul Sharma, Joseph Fialli, Kate Stout, and Nigel Deakin. [JSR-343 Java Message Service (JMS) 2.0 Specification](https://jcp.org/en/jsr/detail?id=343). *jms-spec.java.net*, March 2013. Archived at [perma.cc/E4YG-46TA](https://perma.cc/E4YG-46TA)
[^14]: Sanjay Aiyagari, Matthew Arrott, Mark Atwell, Jason Brome, Alan Conway, Robert Godfrey, Robert Greig, Pieter Hintjens, John O'Hara, Matthias Radestock, Alexis Richardson, Martin Ritchie, Shahrokh Sadjadi, Rafael Schloming, Steven Shaw, Martin Sustrik, Carl Trieloff, Kim van der Riet, and Steve Vinoski. [AMQP: Advanced Message Queuing Protocol Specification](https://www.rabbitmq.com/resources/specs/amqp0-9-1.pdf). Version 0-9-1, November 2008. Archived at [perma.cc/6YJJ-GM9X](https://perma.cc/6YJJ-GM9X)
[^15]: [Architectural overview of Pub/Sub](https://cloud.google.com/pubsub/architecture). *cloud.google.com*, 2025. Archived at [perma.cc/VWF5-ABP4](https://perma.cc/VWF5-ABP4)
[^16]: Aris Tzoumas. [Lessons from scaling PostgreSQL queues to 100k events per second](https://www.rudderstack.com/blog/scaling-postgres-queue/). *rudderstack.com*, July 2025. Archived at [perma.cc/QD8C-VA4Y](https://perma.cc/QD8C-VA4Y)
[^17]: Robin Moffatt. [Kafka Connect Deep Dive -- Error Handling and Dead Letter Queues](https://www.confluent.io/blog/kafka-connect-deep-dive-error-handling-dead-letter-queues/). *confluent.io*, March 2019. Archived at [perma.cc/KQ5A-AB28](https://perma.cc/KQ5A-AB28)
[^18]: Dunith Danushka. [Message reprocessing: How to implement the dead letter queue](https://redpanda.com/blog/reliable-message-processing-with-dead-letter-queue). *redpanda.com*. Archived at [perma.cc/R7UB-WEWF](https://perma.cc/R7UB-WEWF)
[^19]: Damien Gasparina, Loic Greffier, and Sebastien Viale. [KIP-1034: Dead letter queue in Kafka Streams](https://cwiki.apache.org/confluence/display/KAFKA/KIP-1034%3A+Dead+letter+queue+in+Kafka+Streams). *cwiki.apache.org*, April 2024. Archived at [perma.cc/3VXV-QXAN](https://perma.cc/3VXV-QXAN)
[^20]: Jay Kreps, Neha Narkhede, and Jun Rao. [Kafka: A Distributed Messaging System for Log Processing](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/09/Kafka.pdf). At *6th International Workshop on Networking Meets Databases* (NetDB), June 2011. Archived at [perma.cc/CSW7-TCQ5](https://perma.cc/CSW7-TCQ5)
[^21]: Jay Kreps. [Benchmarking Apache Kafka: 2 Million Writes Per Second (On Three Cheap Machines)](https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines). *engineering.linkedin.com*, April 2014. Archived at [archive.org](https://web.archive.org/web/20140921000742/https://engineering.linkedin.com/kafka/benchmarking-apache-kafka-2-million-writes-second-three-cheap-machines)
[^22]: Kartik Paramasivam. [How We're Improving and Advancing Kafka at LinkedIn](https://engineering.linkedin.com/apache-kafka/how-we_re-improving-and-advancing-kafka-linkedin). *engineering.linkedin.com*, September 2015. Archived at [perma.cc/3S3V-JCYJ](https://perma.cc/3S3V-JCYJ)
[^23]: Philippe Dobbelaere and Kyumars Sheykh Esmaili. [Kafka versus RabbitMQ: A comparative study of two industry reference publish/subscribe implementations](https://arxiv.org/abs/1709.00333). At *11th ACM International Conference on Distributed and Event-based Systems* (DEBS), June 2017. [doi:10.1145/3093742.3093908](https://doi.org/10.1145/3093742.3093908)
[^24]: Kate Holterhoff. [Why Message Queues Endure: A History](https://redmonk.com/kholterhoff/2024/12/12/why-message-queues-endure-a-history/). *redmonk.com*, December 2024. Archived at [perma.cc/6DX8-XK4W](https://perma.cc/6DX8-XK4W)
[^25]: Andrew Schofield. [KIP-932: Queues for Kafka](https://cwiki.apache.org/confluence/display/KAFKA/KIP-932%3A+Queues+for+Kafka). *cwiki.apache.org*, May 2023. Archived at [perma.cc/LBE4-BEMK](https://perma.cc/LBE4-BEMK)
[^26]: Jack Vanlightly. [The advantages of queues on logs](https://jack-vanlightly.com/blog/2023/10/2/the-advantages-of-queues-on-logs). *jack-vanlightly.com*, October 2023. Archived at [perma.cc/WJ7V-287K](https://perma.cc/WJ7V-287K)
[^27]: Jay Kreps. [The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying). *engineering.linkedin.com*, December 2013. Archived at [perma.cc/2JHR-FR64](https://perma.cc/2JHR-FR64)
[^28]: Andy Hattemer. [Change Data Capture is having a moment. Why?](https://materialize.com/blog/change-data-capture-is-having-a-moment-why/) *materialize.com*, September 2021. Archived at [perma.cc/AL37-P53C](https://perma.cc/AL37-P53C)
[^29]: Prem Santosh Udaya Shankar. [Streaming MySQL Tables in Real-Time to Kafka](https://engineeringblog.yelp.com/2016/08/streaming-mysql-tables-in-real-time-to-kafka.html). *engineeringblog.yelp.com*, August 2016. Archived at [perma.cc/5ZR3-2GVV](https://perma.cc/5ZR3-2GVV)
[^30]: Andreas Andreakis, Ioannis Papapanagiotou. [DBLog: A Watermark Based Change-Data-Capture Framework](https://arxiv.org/pdf/2010.12597). October 2020. Archived at [arxiv.org](https://arxiv.org/pdf/2010.12597)
[^31]: Jiri Pechanec. [Incremental Snapshots in Debezium](https://debezium.io/blog/2021/10/07/incremental-snapshots/). *debezium.io*, October 2021. Archived at [perma.cc/EQ8E-W6KQ](https://perma.cc/EQ8E-W6KQ)
[^32]: Debezium maintainers. [Debezium Connector for Cassandra](https://debezium.io/documentation/reference/stable/connectors/cassandra.html). *debezium.io*. Archived at [perma.cc/WR6K-EKMD](https://perma.cc/WR6K-EKMD)
[^33]: Neha Narkhede. [Announcing Kafka Connect: Building Large-Scale Low-Latency Data Pipelines](https://www.confluent.io/blog/announcing-kafka-connect-building-large-scale-low-latency-data-pipelines/). *confluent.io*, February 2016. Archived at [perma.cc/8WXJ-L6GF](https://perma.cc/8WXJ-L6GF)
[^34]: Chris Riccomini. [Kafka change data capture breaks database encapsulation](https://cnr.sh/posts/2018-11-05-kafka-change-data-capture-breaks-database-encapsulation/). *cnr.sh*, November 2018. Archived at [perma.cc/P572-9MKF](https://perma.cc/P572-9MKF)
[^35]: Gunnar Morling. ["Change Data Capture Breaks Encapsulation". Does it, though?](https://www.decodable.co/blog/change-data-capture-breaks-encapsulation-does-it-though) *decodable.co*, November 2023. Archived at [perma.cc/YX2P-WNWR](https://perma.cc/YX2P-WNWR)
[^36]: Gunnar Morling. [Revisiting the Outbox Pattern](https://www.decodable.co/blog/revisiting-the-outbox-pattern). *decodable.co*, October 2024. Archived at [perma.cc/M5ZL-RPS9](https://perma.cc/M5ZL-RPS9)
[^37]: Ashish Gupta and Inderpal Singh Mumick. [Maintenance of Materialized Views: Problems, Techniques, and Applications](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5). *IEEE Data Engineering Bulletin*, volume 18, issue 2, pages 3--18, June 1995. Archived at [archive.org](https://web.archive.org/web/20220407025818id_/http://sites.computer.org/debull/95JUN-CD.pdf#page=5)
[^38]: Mihai Budiu, Tej Chajed, Frank McSherry, Leonid Ryzhyk, Val Tannen. [DBSP: Incremental Computation on Streams and Its Applications to Databases](https://sigmodrecord.org/publications/sigmodRecord/2403/pdfs/20_dbsp-budiu.pdf). *SIGMOD Record*, volume 53, issue 1, pages 87--95, March 2024. [doi:10.1145/3665252.3665271](https://doi.org/10.1145/3665252.3665271)
[^39]: Jim Gray and Andreas Reuter. [*Transaction Processing: Concepts and Techniques*](https://learning.oreilly.com/library/view/transaction-processing/9780080519555/). Morgan Kaufmann, 1992. ISBN: 9781558601901
[^40]: Martin Kleppmann. [Accounting for Computer Scientists](https://martin.kleppmann.com/2011/03/07/accounting-for-computer-scientists.html). *martin.kleppmann.com*, March 2011. Archived at [perma.cc/9EGX-P38N](https://perma.cc/9EGX-P38N)
[^41]: Pat Helland. [Immutability Changes Everything](https://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper16.pdf). At *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^42]: Martin Kleppmann. [*Making Sense of Stream Processing*](https://martin.kleppmann.com/papers/stream-processing.pdf). Report, O'Reilly Media, May 2016. Archived at [perma.cc/RAY4-JDVX](https://perma.cc/RAY4-JDVX)
[^43]: Kartik Paramasivam. [Stream Processing Hard Problems -- Part 1: Killing Lambda](https://engineering.linkedin.com/blog/2016/06/stream-processing-hard-problems-part-1-killing-lambda). *engineering.linkedin.com*, June 2016. Archived at [archive.org](https://web.archive.org/web/20240621211312/https://www.linkedin.com/blog/engineering/data-streaming-processing/stream-processing-hard-problems-part-1-killing-lambda)
[^44]: Stéphane Derosiaux. [CQRS: What? Why? How?](https://sderosiaux.medium.com/cqrs-what-why-how-945543482313) *sderosiaux.medium.com*, September 2019. Archived at [perma.cc/FZ3U-HVJ4](https://perma.cc/FZ3U-HVJ4)
[^45]: Baron Schwartz. [Immutability, MVCC, and Garbage Collection](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/). *xaprb.com*, December 2013. Archived at [archive.org](https://web.archive.org/web/20220122020806/http://www.xaprb.com/blog/2013/12/28/immutability-mvcc-and-garbage-collection/)
[^46]: Daniel Eloff, Slava Akhmechet, Jay Kreps, et al. [Re: Turning the Database Inside-out with Apache Samza](https://news.ycombinator.com/item?id=9145197). Hacker News discussion, *news.ycombinator.com*, March 2015. Archived at [perma.cc/ML9E-JC83](https://perma.cc/ML9E-JC83)
[^47]: [Datomic Documentation: Excision](https://docs.datomic.com/operation/excision.html). Cognitect, Inc., *docs.datomic.com*. Archived at [perma.cc/J5QQ-SH32](https://perma.cc/J5QQ-SH32)
[^48]: [Fossil Documentation: Deleting Content from Fossil](https://fossil-scm.org/home/doc/trunk/www/shunning.wiki). *fossil-scm.org*, 2025. Archived at [perma.cc/DS23-GTNG](https://perma.cc/DS23-GTNG)
[^49]: Jay Kreps. [The irony of distributed systems is that data loss is really easy but deleting data is surprisingly hard.](https://x.com/jaykreps/status/582580836425330688) *x.com*, March 2015. Archived at [perma.cc/7RRZ-V7B7](https://perma.cc/7RRZ-V7B7)
[^50]: Brent Robinson. [Crypto shredding: How it can solve modern data retention challenges](https://medium.com/@brentrobinson5/crypto-shredding-how-it-can-solve-modern-data-retention-challenges-da874b01745b). *medium.com*, January 2019. Archived at [perma.cc/4LFK-S6XE](https://perma.cc/4LFK-S6XE)
[^51]: Matthew D. Green and Ian Miers. [Forward Secure Asynchronous Messaging from Puncturable Encryption](https://isi.jhu.edu/~mgreen/forward_sec.pdf). At *IEEE Symposium on Security and Privacy*, May 2015. [doi:10.1109/SP.2015.26](https://doi.org/10.1109/SP.2015.26)
[^52]: David C. Luckham. [What's the Difference Between ESP and CEP?](https://complexevents.com/2020/06/15/whats-the-difference-between-esp-and-cep-2/) *complexevents.com*, June 2019. Archived at [perma.cc/E7PZ-FDEF](https://perma.cc/E7PZ-FDEF)
[^53]: Arvind Arasu, Shivnath Babu, and Jennifer Widom. [The CQL Continuous Query Language: Semantic Foundations and Query Execution](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cql.pdf). *The VLDB Journal*, volume 15, issue 2, pages 121--142, June 2006. [doi:10.1007/s00778-004-0147-z](https://doi.org/10.1007/s00778-004-0147-z)
[^54]: Julian Hyde. [Data in Flight: How Streaming SQL Technology Can Help Solve the Web 2.0 Data Crunch](https://queue.acm.org/detail.cfm?id=1667562). *ACM Queue*, volume 7, issue 11, December 2009. [doi:10.1145/1661785.1667562](https://doi.org/10.1145/1661785.1667562)
[^55]: Philippe Flajolet, Éric Fusy, Olivier Gandouet, and Frédéric Meunier. [HyperLogLog: The Analysis of a Near-Optimal Cardinality Estimation Algorithm](https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf). At *Conference on Analysis of Algorithms* (AofA), June 2007. [doi:10.46298/dmtcs.3545](https://doi.org/10.46298/dmtcs.3545)
[^56]: Jay Kreps. [Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture). *oreilly.com*, July 2014. Archived at [perma.cc/2WY5-HC8Y](https://perma.cc/2WY5-HC8Y)
[^57]: Ian Reppel. [An Overview of Apache Streaming Technologies](https://ianreppel.org/an-overview-of-apache-streaming-technologies/). *ianreppel.org*, March 2016. Archived at [perma.cc/BB3E-QJLW](https://perma.cc/BB3E-QJLW)
[^58]: Jay Kreps. [Why Local State is a Fundamental Primitive in Stream Processing](https://www.oreilly.com/ideas/why-local-state-is-a-fundamental-primitive-in-stream-processing). *oreilly.com*, July 2014. Archived at [perma.cc/P8HU-R5LA](https://perma.cc/P8HU-R5LA)
[^59]: RisingWave Labs. [Deep Dive Into the RisingWave Stream Processing Engine - Part 2: Computational Model](https://risingwave.com/blog/deep-dive-into-the-risingwave-stream-processing-engine-part-2-computational-model/). *risingwave.com*, November 2023. Archived at [perma.cc/LM74-XDEL](https://perma.cc/LM74-XDEL)
[^60]: Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard. [Differential dataflow](https://www.cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf). At *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
[^61]: Andy Hattemer. [Incremental Computation in the Database](https://materialize.com/guides/incremental-computation/). *materialize.com*, March 2020. Archived at [perma.cc/AL94-YVRN](https://perma.cc/AL94-YVRN)
[^62]: Shay Banon. [Percolator](https://www.elastic.co/blog/percolator). *elastic.co*, February 2011. Archived at [perma.cc/LS5R-4FQX](https://perma.cc/LS5R-4FQX)
[^63]: Alan Woodward and Martin Kleppmann. [Real-Time Full-Text Search with Luwak and Samza](https://martin.kleppmann.com/2015/04/13/real-time-full-text-search-luwak-samza.html). *martin.kleppmann.com*, April 2015. Archived at [perma.cc/2U92-Q7R4](https://perma.cc/2U92-Q7R4)
[^64]: Tyler Akidau. [The World Beyond Batch: Streaming 102](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102). *oreilly.com*, January 2016. Archived at [perma.cc/4XF9-8M2K](https://perma.cc/4XF9-8M2K)
[^65]: Stephan Ewen. [Streaming Analytics with Apache Flink](https://www.slideshare.net/slideshow/advanced-streaming-analytics-with-apache-flink-and-apache-kafka-stephan-ewen/61920008). At *Kafka Summit*, April 2016. Archived at [perma.cc/QBQ4-F9MR](https://perma.cc/QBQ4-F9MR)
[^66]: Tyler Akidau, Alex Balikov, Kaya Bekiroğlu, Slava Chernyak, Josh Haberman, Reuven Lax, Sam McVeety, Daniel Mills, Paul Nordstrom, and Sam Whittle. [MillWheel: Fault-Tolerant Stream Processing at Internet Scale](https://www.vldb.org/pvldb/vol6/p1033-akidau.pdf). *Proceedings of the VLDB Endowment*, volume 6, issue 11, pages 1033--1044, August 2013. [doi:10.14778/2536222.2536229](https://doi.org/10.14778/2536222.2536229)
[^67]: Alex Dean. [Improving Snowplow's Understanding of Time](https://snowplow.io/blog/improving-snowplows-understanding-of-time). *snowplow.io*, September 2015. Archived at [perma.cc/6CT9-Z3Q2](https://perma.cc/6CT9-Z3Q2)
[^68]: [Azure Stream Analytics: Windowing functions](https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics). Microsoft Azure Reference, *learn.microsoft.com*, July 2025. Archived at [archive.org](https://web.archive.org/web/20250901140013/https://learn.microsoft.com/en-gb/stream-analytics-query/windowing-azure-stream-analytics)
[^69]: Rajagopal Ananthanarayanan, Venkatesh Basker, Sumit Das, Ashish Gupta, Haifeng Jiang, Tianhao Qiu, Alexey Reznichenko, Deomid Ryabkov, Manpreet Singh, and Shivakumar Venkataraman. [Photon: Fault-Tolerant and Scalable Joining of Continuous Data Streams](https://research.google.com/pubs/archive/41529.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2465272](https://doi.org/10.1145/2463676.2465272)
[^70]: Ben Kirwin. [Doing the Impossible: Exactly-Once Messaging Patterns in Kafka](https://ben.kirw.in/2014/11/28/kafka-patterns/). *ben.kirw.in*, November 2014. Archived at [perma.cc/A5QL-QRX7](https://perma.cc/A5QL-QRX7)
[^71]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^72]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, 2013. ISBN: 978-1-118-53080-1
[^73]: Viktor Klang. [I'm coining the phrase 'effectively-once' for message processing with at-least-once + idempotent operations](https://x.com/viktorklang/status/789036133434978304). *x.com*, October 2016. Archived at [perma.cc/7DT9-TDG2](https://perma.cc/7DT9-TDG2)
[^74]: Matei Zaharia, Tathagata Das, Haoyuan Li, Scott Shenker, and Ion Stoica. [Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](https://www.usenix.org/system/files/conference/hotcloud12/hotcloud12-final28.pdf). At *4th USENIX Conference in Hot Topics in Cloud Computing* (HotCloud), June 2012.
[^75]: Kostas Tzoumas, Stephan Ewen, and Robert Metzger. [High-Throughput, Low-Latency, and Exactly-Once Stream Processing with Apache Flink](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink). *ververica.com*, August 2015. Archived at [archive.org](https://web.archive.org/web/20250429165534/https://www.ververica.com/blog/high-throughput-low-latency-and-exactly-once-stream-processing-with-apache-flink)
[^76]: Paris Carbone, Gyula Fóra, Stephan Ewen, Seif Haridi, and Kostas Tzoumas. [Lightweight Asynchronous Snapshots for Distributed Dataflows](https://arxiv.org/abs/1506.08603). arXiv:1506.08603 [cs.DC], June 2015.
[^77]: Ryan Betts and John Hugg. [*Fast Data: Smart and at Scale*](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-ebook-fast-data-smart-and-at-scale.pdf). Report, O'Reilly Media, October 2015. Archived at [perma.cc/VQ6S-XQQY](https://perma.cc/VQ6S-XQQY)
[^78]: Neha Narkhede and Guozhang Wang. [Exactly-Once Semantics Are Possible: Here's How Kafka Does It](https://www.confluent.io/blog/exactly-once-semantics-are-possible-heres-how-apache-kafka-does-it/). *confluent.io*, June 2019. Archived at [perma.cc/Q2AU-Q2ED](https://perma.cc/Q2AU-Q2ED)
[^79]: Jason Gustafson, Flavio Junqueira, Apurva Mehta, Sriram Subramanian, and Guozhang Wang. [KIP-98 -- Exactly Once Delivery and Transactional Messaging](https://cwiki.apache.org/confluence/display/KAFKA/KIP-98+-+Exactly+Once+Delivery+and+Transactional+Messaging). *cwiki.apache.org*, November 2016. Archived at [perma.cc/95PT-RCTG](https://perma.cc/95PT-RCTG)
[^80]: Pat Helland. [Idempotence Is Not a Medical Condition](https://dl.acm.org/doi/pdf/10.1145/2160718.2160734). *Communications of the ACM*, volume 55, issue 5, page 56, May 2012. [doi:10.1145/2160718.2160734](https://doi.org/10.1145/2160718.2160734)
[^81]: Jay Kreps. [Re: Trying to Achieve Deterministic Behavior on Recovery/Rewind](https://lists.apache.org/thread/n0sz6zld72nvjtnytv09pxc57mdcf9ft). Email to *samza-dev* mailing list, September 2014. Archived at [perma.cc/7DPD-GJNL](https://perma.cc/7DPD-GJNL)
[^82]: E. N. (Mootaz) Elnozahy, Lorenzo Alvisi, Yi-Min Wang, and David B. Johnson. [A Survey of Rollback-Recovery Protocols in Message-Passing Systems](https://www.cs.utexas.edu/~lorenzo/papers/SurveyFinal.pdf). *ACM Computing Surveys*, volume 34, issue 3, pages 375--408, September 2002. [doi:10.1145/568522.568525](https://doi.org/10.1145/568522.568525)
[^83]: Adam Warski. [Kafka Streams -- How Does It Fit the Stream Processing Landscape?](https://softwaremill.com/kafka-streams-how-does-it-fit-stream-landscape/) *softwaremill.com*, June 2016. Archived at [perma.cc/WQ5Q-H2J2](https://perma.cc/WQ5Q-H2J2)
[^84]: Stephan Ewen, Fabian Hueske, and Xiaowei Jiang. [Batch as a Special Case of Streaming and Alibaba's contribution of Blink](https://flink.apache.org/2019/02/13/batch-as-a-special-case-of-streaming-and-alibabas-contribution-of-blink/). *flink.apache.org*, February 2019. Archived at [perma.cc/A529-SKA9](https://perma.cc/A529-SKA9)


================================================
FILE: content/zh/ch13.md
================================================
---
title: "第十三章：流式系统的哲学"
linkTitle: "13. 流式系统的哲学"
weight: 313
breadcrumbs: false
---

<a id="ch_philosophy"></a>
<a id="ch13"></a>

![](/map/ch12.png)

> 如果船长的终极目标是保护船只，他应该永远待在港口。
>
> —— 圣托马斯・阿奎那《神学大全》（1265-1274）

[第二章](/ch2) 讨论了构建 **可靠**、**可伸缩**、**可维护** 应用与系统的目标。这些主题贯穿了全书：例如，我们讨论了提升可靠性的多种容错算法、提升可伸缩性的分区方法，以及提升可维护性的演化与抽象机制。

在本章中，我们将把这些想法整合起来，并特别基于 [第十二章](/ch12) 的流式/事件驱动架构思路，提出一套满足这些目标的应用开发哲学。与前几章相比，本章立场更鲜明：不是并列比较多种方案，而是深入展开一种特定的设计哲学。

## 数据集成 {#sec_future_integration}

本书中反复出现的主题是，对于任何给定的问题都会有好几种解决方案，所有这些解决方案都有不同的优缺点与利弊权衡。例如在 [第四章](/ch4) 讨论存储引擎时，我们看到了日志结构存储、B 树以及列式存储。在 [第六章](/ch6) 讨论复制时，我们看到了单领导者、多领导者和无领导者的方法。

如果你有一个类似于 “我想存储一些数据并稍后再查询” 的问题，那么并没有一种正确的解决方案。但对于不同的具体环境，总会有不同的合适方法。软件实现通常必须选择一种特定的方法。使单条代码路径能做到稳定健壮且表现良好已经是一件非常困难的事情了 —— 尝试在单个软件中完成所有事情，几乎可以保证，实现效果会很差。

因此软件工具的最佳选择也取决于情况。每一种软件，甚至所谓的 “通用” 数据库，都是针对特定的使用模式设计的。

面对让人眼花缭乱的诸多替代品，第一个挑战就是弄清软件与其适用环境的映射关系。供应商不愿告诉你他们软件不适用的工作负载，这是可以理解的。但是希望先前的章节能给你提供一些问题，让你读出字里行间的言外之意，并更好地理解这些权衡。

但是，即使你已经完全理解各种工具与其适用环境间的关系，还有一个挑战：在复杂的应用中，数据的用法通常花样百出。不太可能存在适用于 **所有** 不同数据应用场景的软件，因此你不可避免地需要拼凑几个不同的软件，以提供应用所需的功能。

### 组合使用派生数据的工具 {#id442}

例如，为了处理任意关键词的搜索查询，将 OLTP 数据库与全文检索索引集成在一起是很常见的需求。尽管一些数据库（例如 PostgreSQL）包含了全文索引功能，对于简单的应用完全够了[^1]，但更复杂的搜索能力就需要专业的信息检索工具了。相反的是，搜索索引通常不适合作为持久的记录系统，因此许多应用需要组合这两种不同的工具以满足所有需求。

我们在 “[保持系统同步](/ch12#sec_stream_sync)” 中接触过集成数据系统的问题。随着数据不同表示形式的增加，集成问题变得越来越困难。除了数据库和搜索索引之外，也许你需要在分析系统（数据仓库，或批处理和流处理系统）中维护数据副本；维护从原始数据中派生的缓存，或反规范化的数据版本；将数据灌入机器学习、分类、排名或推荐系统中；或者基于数据变更发送通知。

#### 理解数据流 {#id443}

当需要在多个存储系统中维护相同数据的副本以满足不同的访问模式时，你要对输入和输出了如指掌：哪些数据先写入，哪些数据表示派生自哪些来源？如何以正确的格式，将所有数据导入正确的地方？

例如，你可能会首先将数据写入 **记录系统** 数据库，捕获对该数据库所做的变更（请参阅 “[变更数据捕获](/ch12#sec_stream_cdc)”），然后将变更以相同的顺序应用于搜索索引。如果变更数据捕获（CDC）是更新索引的唯一方式，则可以确定该索引完全派生自记录系统，因此与其保持一致（除软件错误外）。写入数据库是向该系统提供新输入的唯一方式。

允许应用程序直接写入搜索索引和数据库引入了如 [图 12-4](/ch12#fig_stream_dual_write_race) 所示的问题，其中两个客户端同时发送冲突的写入，且两个存储系统按不同顺序处理它们。在这种情况下，既不是数据库说了算，也不是搜索索引说了算，所以它们做出了相反的决定，进入彼此间持久性的不一致状态。

如果你可以通过单个系统来提供所有用户输入，从而决定所有写入的排序，则通过按相同顺序处理写入，可以更容易地派生出其他数据表示。这是状态机复制方法的一个应用，我们在 “[全序广播](/ch10#sec_consistency_total_order)” 中看到。至于使用的是变更数据捕获还是事件溯源日志，并不如 “先确定一个全序” 这一原则本身重要。

基于事件日志来更新派生数据的系统，通常可以做到 **确定性** 与 **幂等性**（请参阅 “[幂等性](/ch12#sec_stream_idempotence)”），使得从故障中恢复相当容易。

#### 派生数据与分布式事务 {#sec_future_derived_vs_transactions}

保持不同数据系统彼此一致的经典方法涉及分布式事务，如 “[原子提交与两阶段提交](/ch8#sec_transactions_2pc)” 中所述。与分布式事务相比，使用派生数据系统的方法如何？

在抽象层面，它们通过不同的方式达到类似的目标。分布式事务通过 **锁** 进行互斥来决定写入的顺序（请参阅 “[两阶段锁定](/ch8#sec_transactions_2pl)”），而 CDC 和事件溯源使用日志进行排序。分布式事务使用原子提交来确保变更只生效一次，而基于日志的系统通常基于 **确定性重试** 和 **幂等性**。

最大的不同之处在于事务系统通常提供 [线性一致性](/ch10#sec_consistency_linearizability)，这包含着有用的保证，例如 [读己之写](/ch6#sec_replication_ryw)。另一方面，派生数据系统通常是异步更新的，因此它们默认不会提供相同的时序保证。

在愿意为分布式事务付出代价的有限场景中，它们已被成功应用。但是，我认为 XA 的容错能力和性能很差劲（请参阅 “[实践中的分布式事务](/ch8#sec_transactions_xa)”），这严重限制了它的实用性。我相信为分布式事务设计一种更好的协议是可行的。但使这样一种协议被现有工具广泛接受是很有挑战的，且不是立竿见影的事。

在没有广泛支持的良好分布式事务协议的情况下，我认为基于日志的派生数据是集成不同数据系统的最有前途的方法。然而，诸如读己之写的保证是有用的，我认为告诉所有人 “最终一致性是不可避免的 —— 忍一忍并学会和它打交道” 是没有什么建设性的（至少在缺乏 **如何** 应对的良好指导时）。

在本章后文中，我们将讨论一些在异步派生系统之上实现更强保障的方法，并迈向分布式事务和基于日志的异步系统之间的中间地带。

#### 全序的限制 {#id335}

对于足够小的系统，构建一个完全有序的事件日志是完全可行的（正如单主复制数据库的流行所证明的那样，它正好建立了这样一种日志）。但是，随着系统向更大更复杂的工作负载伸缩，限制开始出现：

* 在大多数情况下，构建完全有序的日志，需要所有事件汇集于决定顺序的 **单个领导者节点**。如果事件吞吐量大于单台计算机的处理能力，则需要将其分区到多台计算机上（请参阅 “[分区日志](/ch12#sec_stream_log)”）。然后两个不同分区中的事件顺序关系就不明确了。
* 如果服务器分布在多个 **地理位置分散** 的数据中心上，例如为了容忍整个数据中心掉线，你通常在每个数据中心都有单独的主库，因为网络延迟会导致同步的跨数据中心协调效率低下（请参阅 “[多主复制](/ch6#sec_replication_multi_leader)”）。这意味着源自两个不同数据中心的事件顺序未定义。
* 将应用程序部署为微服务时（请参阅 “[服务中的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)”），常见的设计选择是将每个服务及其持久状态作为独立单元进行部署，服务之间不共享持久状态。当两个事件来自不同的服务时，这些事件间的顺序未定义。
* 某些应用程序在客户端保存状态，该状态在用户输入时立即更新（无需等待服务器确认），甚至可以继续脱机工作（请参阅 “[需要离线操作的客户端](/ch6#sec_replication_offline_clients)”）。对于这样的应用程序，客户端和服务器很可能以不同的顺序看到事件。

在形式上，决定事件的全局顺序称为 **全序广播**，相当于 **共识**（请参阅 “[共识算法和全序广播](/ch10#sec_consistency_faces)”）。大多数共识算法都是针对单个节点的吞吐量足以处理整个事件流的情况而设计的，并且这些算法不提供多个节点共享事件排序工作的机制。设计可以伸缩至单个节点的吞吐量之上，且在地理位置分散环境中仍能良好工作的共识算法仍然是一个开放研究问题。

#### 排序事件以捕获因果关系 {#sec_future_capture_causality}

在事件之间不存在因果关系的情况下，全序的缺乏并不是一个大问题，因为并发事件可以任意排序。其他一些情况很容易处理：例如，当同一对象有多个更新时，它们可以通过将特定对象 ID 的所有更新路由到相同的日志分区来完全排序。然而，因果关系有时会以更微妙的方式出现（请参阅 “[顺序与因果关系](/ch10#sec_consistency_logical)”）。

例如，考虑一个社交网络服务，以及一对曾处于恋爱关系但刚分手的用户。其中一个用户将另一个用户从好友中移除，然后向剩余的好友发送消息，抱怨他们的前任。用户的心思是他们的前任不应该看到这些粗鲁的消息，因为消息是在好友状态解除后发送的。

但是如果好友关系状态与消息存储在不同的地方，在这样一个系统中，可能会出现 **解除好友** 事件与 **发送消息** 事件之间的因果依赖丢失的情况。如果因果依赖关系没有被捕捉到，则发送有关新消息的通知的服务可能会在 **解除好友** 事件之前处理 **发送消息** 事件，从而错误地向前任发送通知。

在本例中，通知实际上是消息和好友列表之间的连接，使得它与我们先前讨论的连接的时序问题有关（请参阅 “[连接的时间依赖性](/ch12#sec_stream_join_time)”）。不幸的是，这个问题似乎并没有一个简单的答案[^2] [^3]。起点包括：

* 逻辑时间戳可以提供无需协调的全局顺序（请参阅 “[序列号顺序](/ch10#sec_consistency_logical)”），因此它们可能有助于全序广播不可行的情况。但是，它们仍然要求接收方处理不按顺序送达的事件，并且需要传递额外的元数据。
* 如果你可以记录一个事件来记录用户在做出决定之前所看到的系统状态，并给该事件一个唯一的标识符，那么后面的任何事件都可以引用该事件标识符来记录因果关系[^4]。我们将在 “[读也是事件](#sec_future_read_events)” 中回到这个想法。
* 冲突解决算法（请参阅 “[自动冲突解决](/ch6#automatic-conflict-resolution)”）有助于处理以意外顺序传递的事件。它们对于维护状态很有用，但如果行为有外部副作用（例如，给用户发送通知），就没什么帮助了。

也许，随着时间的推移，应用开发模式将出现，使得能够有效地捕获因果依赖关系，并且保持正确的派生状态，而不会迫使所有事件经历全序广播的瓶颈。

### 批处理与流处理 {#sec_future_batch_streaming}

我会说数据集成的目标是，确保数据最终能在所有正确的地方表现出正确的形式。这样做需要消费输入、转换、连接、过滤、聚合、训练模型、评估、以及最终写出适当的输出。批处理和流处理是实现这一目标的工具。

批处理和流处理的输出是派生数据集，例如搜索索引、物化视图、向用户显示的建议、聚合指标等（请参阅 “[批处理工作流的输出](/ch11#sec_batch_output)” 和 “[流处理的应用](/ch12#sec_stream_uses)”）。

正如我们在 [第十一章](/ch11) 和 [第十二章](/ch12) 中看到的，批处理和流处理有许多共同的原则，主要的根本区别在于流处理器在无限数据集上运行，而批处理输入是已知的有限大小。

#### 维护派生状态 {#id446}

批处理有着很强的函数式风格（即使其代码不是用函数式语言编写的）：它鼓励确定性的纯函数，其输出仅依赖于输入，除了显式输出外没有副作用，将输入视作不可变的，且输出是仅追加的。流处理与之类似，但它扩展了算子以允许受管理的、容错的状态（请参阅 “[失败后重建状态](/ch12#sec_stream_state_fault_tolerance)”）。

具有良好定义的输入和输出的确定性函数的原理不仅有利于容错（请参阅 “[幂等性](/ch12#sec_stream_idempotence)”），也简化了有关组织中数据流的推理[^7]。无论派生数据是搜索索引、统计模型还是缓存，采用这种观点思考都是很有帮助的：将其视为从一个东西派生出另一个的数据管道，通过函数式应用代码推送一个系统的状态变更，并将其效果应用至派生系统中。

原则上，派生数据系统可以同步地维护，就像关系数据库在与索引表写入操作相同的事务中同步更新次级索引一样。然而，异步是使基于事件日志的系统稳健的原因：它允许系统的一部分故障被抑制在本地。而如果任何一个参与者失败，分布式事务将中止，因此它们倾向于通过将故障传播到系统的其余部分来放大故障（请参阅 “[分布式事务的限制](/ch8#sec_transactions_xa)”）。

我们在 “[分区与次级索引](/ch7#sec_sharding_secondary_indexes)” 中看到，次级索引经常跨越分区边界。具有次级索引的分区系统需要将写入发送到多个分区（如果索引按关键词分区的话）或将读取发送到所有分区（如果索引是按文档分区的话）。如果索引是异步维护的，这种跨分区通信也是最可靠和最可伸缩的[^8]（另请参阅 “[多分区数据处理](#sec_future_unbundled_multi_shard)”）。

#### 应用演化后重新处理数据 {#sec_future_reprocessing}

在维护派生数据时，批处理和流处理都是有用的。流处理允许将输入中的变化以低延迟反映在派生视图中，而批处理允许重新处理大量累积的历史数据以便将新视图导出到现有数据集上。

特别是，重新处理现有数据为维护系统、演化并支持新功能和需求变更提供了一个良好的机制（请参阅 [第四章](/ch4)）。没有重新进行处理，模式演化将仅限于简单的变化，例如向记录中添加新的可选字段或添加新类型的记录。无论是在写时模式还是在读时模式中都是如此（请参阅 “[文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)”）。另一方面，通过重新处理，可以将数据集重组为一个完全不同的模型，以便更好地满足新的要求。

> ### 铁路上的模式迁移
>
> 大规模的 “模式迁移” 也发生在非计算机系统中。例如，在 19 世纪英国铁路建设初期，轨距（两轨之间的距离）就有了各种各样的竞争标准。为一种轨距而建的列车不能在另一种轨距的轨道上运行，这限制了火车网络中可能的相互连接[^9]。
>
> 在 1846 年最终确定了一个标准轨距之后，其他轨距的轨道必须转换 —— 但是如何在不停运火车线路的情况下进行数月甚至数年的迁移？解决的办法是首先通过添加第三条轨道将轨道转换为 **双轨距（dual gauge）** 或 **混合轨距**。这种转换可以逐渐完成，当完成时，两种轨距的列车都可以在线路上跑，使用三条轨道中的两条。最终，一旦所有的列车都转换成标准轨距，就可以移除提供非标准轨距的那条轨道。
>
> 以这种方式 “再加工” 现有的轨道，让新旧版本并存，可以在几年的时间内逐渐改变轨距。然而，这是一项昂贵的事业，这就是今天非标准轨距仍然存在的原因。例如，旧金山湾区的 BART 系统使用了与美国大部分地区不同的轨距。

派生视图允许 **渐进演化（gradual evolution）**。如果你想重新构建数据集，不需要执行突然切换式的迁移。取而代之的是，你可以将旧架构和新架构并排维护为相同基础数据上的两个独立派生视图。然后可以开始将少量用户转移到新视图，以测试其性能并发现任何错误，而大多数用户仍然会被路由到旧视图。你可以逐渐地增加访问新视图的用户比例，最终可以删除旧视图[^10]。

这种逐渐迁移的美妙之处在于，如果出现问题，每个阶段的过程都很容易逆转：你始终有一个可以回滚的可用系统。通过降低不可逆损害的风险，你能对继续前进更有信心，从而更快地改善系统[^11]。

#### 统一批处理和流处理 {#id338}

早期统一批处理与流处理的提案是 **Lambda 架构**[^12]，但它有不少问题，并且已经逐渐淡出主流。更新的系统允许在同一个系统中同时实现批计算（重处理历史数据）和流计算（事件到达即处理）[^15]。

在一个系统中统一批处理和流处理需要以下功能，这些功能也正在越来越广泛地被提供：

* 通过处理最近事件流的相同处理引擎来重播历史事件的能力。例如，基于日志的消息代理可以重播消息（请参阅 “[重播旧消息](/ch12#sec_stream_replay)”），某些流处理器可以从 HDFS 等分布式文件系统读取输入。
* 对于流处理器来说，恰好一次语义 —— 即确保输出与未发生故障的输出相同，即使事实上发生故障（请参阅 “[容错](/ch12#sec_stream_fault_tolerance)”）。与批处理一样，这需要丢弃任何失败任务的部分输出。
* 按事件时间进行窗口化的工具，而不是按处理时间进行窗口化，因为处理历史事件时，处理时间毫无意义（请参阅 “[时间推理](/ch12#sec_stream_time)”）。例如，Apache Beam 提供了用于表达这种计算的 API，可以在 Apache Flink 或 Google Cloud Dataflow 使用。


## 分拆数据库 {#sec_future_unbundling}

在最抽象的层面上，数据库、批/流处理器和操作系统都在做相似的事情：存储数据，并允许你处理和查询这些数据[^16]。数据库将数据存储为某种数据模型下的记录（例如表行、文档、图顶点等），而操作系统文件系统将数据存为文件；但它们本质上都可视作 “信息管理” 系统[^17]。正如我们在 [第十一章](/ch11) 中看到的，批处理系统在很多方面像是 Unix 的分布式版本。

当然，有很多实际的差异。例如，许多文件系统都不能很好地处理包含 1000 万个小文件的目录，而包含 1000 万个小记录的数据库完全是寻常而不起眼的。无论如何，操作系统和数据库之间的相似之处和差异值得探讨。

Unix 和关系数据库以非常不同的哲学来处理信息管理问题。Unix 认为它的目的是为程序员提供一种相当低层次的硬件的逻辑抽象，而关系数据库则希望为应用程序员提供一种高层次的抽象，以隐藏磁盘上数据结构的复杂性、并发性、崩溃恢复等等。Unix 发展出的管道和文件只是字节序列，而数据库则发展出了 SQL 和事务。

哪种方法更好？当然这取决于你想要的是什么。Unix 是 “简单的”，因为它是对硬件资源相当薄的包装；关系数据库是 “更简单” 的，因为一个简短的声明性查询可以利用很多强大的基础设施（查询优化、索引、连接方法、并发控制、复制等），而不需要查询的作者理解其实现细节。

这些哲学之间的矛盾已经持续了几十年（Unix 和关系模型都出现在 70 年代初），仍然没有解决。例如，我将 NoSQL 运动解释为，希望将类 Unix 的低级别抽象方法应用于分布式 OLTP 数据存储的领域。

在这一部分我将试图调和这两个哲学，希望我们能各取其美。

### 组合使用数据存储技术 {#id447}

在本书的过程中，我们讨论了数据库提供的各种功能及其工作原理，其中包括：

* 次级索引，使你可以根据字段的值有效地搜索记录（请参阅 “[其他索引结构](/ch4#sec_storage_index_multicolumn)”）
* 物化视图，这是一种预计算的查询结果缓存（请参阅 “[聚合：数据立方体和物化视图](/ch4#sec_storage_materialized_views)”）
* 复制日志，保持其他节点上数据的副本最新（请参阅 “[复制日志的实现](/ch6#sec_replication_implementation)”）
* 全文检索索引，允许在文本中进行关键字搜索（请参阅 “[全文检索与模糊索引](/ch4#sec_storage_full_text)”），也内置于某些关系数据库[^1]

在 [第十一章](/ch11) 和 [第十二章](/ch12) 中，出现了类似的主题。我们讨论了如何构建全文检索索引（请参阅 “[批处理工作流的输出](/ch11#sec_batch_output)”），了解了如何维护物化视图（请参阅 “[维护物化视图](/ch12#sec_stream_mat_view)”）以及如何将变更从数据库复制到派生数据系统（请参阅 “[变更数据捕获](/ch12#sec_stream_cdc)”）。

数据库中内置的功能与人们用批处理和流处理器构建的派生数据系统似乎有相似之处。

#### 创建索引 {#id340}

想想当你运行 `CREATE INDEX` 在关系数据库中创建一个新的索引时会发生什么。数据库必须扫描表的一致性快照，挑选出所有被索引的字段值，对它们进行排序，然后写出索引。然后它必须处理自一致快照以来所做的写入操作（假设表在创建索引时未被锁定，所以写操作可能会继续）。一旦完成，只要事务写入表中，数据库就必须继续保持索引最新。

此过程非常类似于设置新的从库副本（请参阅 “[设置新从库](/ch6#sec_replication_new_replica)”），也非常类似于流处理系统中的 **引导（bootstrap）** 变更数据捕获（请参阅 “[初始快照](/ch12#sec_stream_cdc_snapshot)”）。

无论何时运行 `CREATE INDEX`，数据库都会重新处理现有数据集（如 “[应用演化后重新处理数据](#sec_future_reprocessing)” 中所述），并将该索引作为新视图导出到现有数据上。现有数据可能是状态的快照，而不是所有发生变化的日志，但两者密切相关（请参阅 “[状态、流和不变性](/ch12#sec_stream_immutability)”）。

#### 一切的元数据库 {#id341}

有鉴于此，我认为整个组织的数据流开始像一个巨大的数据库[^7]。每当批处理、流处理或 ETL 过程将数据从一个地方传输并转换到另一个地方时，它都像数据库子系统在维护索引或物化视图。

从这种角度来看，批处理和流处理器就像精心实现的触发器、存储过程和物化视图维护例程。它们维护的派生数据系统就像不同的索引类型。例如，关系数据库可能支持 B 树索引、散列索引、空间索引（请参阅 “[多列索引](/ch4#sec_storage_index_multicolumn)”）以及其他类型的索引。在新兴的派生数据系统架构中，不是将这些设施作为单个集成数据库产品的功能实现，而是由各种不同的软件提供，运行在不同的机器上，由不同的团队管理。

这些发展在未来将会把我们带到哪里？如果我们从没有适合所有访问模式的单一数据模型或存储格式的前提出发，我推测有两种途径可以将不同的存储和处理工具组合成一个有凝聚力的系统：

**联合数据库：统一读取**

可以为各种各样的底层存储引擎和处理方法提供一个统一的查询接口 —— 一种称为 **联合数据库（federated database）** 或 **多态存储（polystore）** 的方法[^18] [^19]。例如，PostgreSQL 的 **外部数据包装器（foreign data wrapper）** 功能符合这种模式[^20]。需要专用数据模型或查询接口的应用程序仍然可以直接访问底层存储引擎，而想要组合来自不同位置的数据的用户可以通过联合接口轻松完成操作。

联合查询接口遵循着单一集成系统的关系型传统，带有高级查询语言和优雅的语义，但实现起来非常复杂。

**分拆数据库：统一写入**

虽然联合能解决跨多个不同系统的只读查询问题，但它并没有很好的解决跨系统 **同步** 写入的问题。我们说过，在单个数据库中，创建一致的索引是一项内置功能。当我们构建多个存储系统时，我们同样需要确保所有数据变更都会在所有正确的位置结束，即使在出现故障时也是如此。想要更容易地将存储系统可靠地插接在一起（例如，通过变更数据捕获和事件日志），就像将数据库的索引维护功能以可以跨不同技术同步写入的方式分开[^7] [^21]。

分拆方法遵循 Unix 的传统：使用只做好一件事的小型工具[^22]，它们通过统一的低层级 API（管道）进行通信，并且可以使用更高层级的语言（shell）进行组合[^16]。

#### 开展分拆工作 {#sec_future_unbundling_favor}

联合和分拆是一个硬币的两面：用不同的组件构成可靠、 可伸缩和可维护的系统。联合只读查询需要将一个数据模型映射到另一个数据模型，这需要一些思考，但最终还是一个可解决的问题。而我认为同步写入到几个存储系统是更困难的工程问题，所以我将重点关注它。

传统的同步写入方法需要跨异构存储系统的分布式事务[^18]，我认为这是错误的解决方案（请参阅 “[派生数据与分布式事务](#sec_future_derived_vs_transactions)”）。单个存储或流处理系统内的事务是可行的，但是当数据跨越不同技术之间的边界时，我认为具有幂等写入的异步事件日志是一种更加健壮和实用的方法。

例如，分布式事务在某些流处理组件内部使用，以匹配 **恰好一次（exactly-once）** 语义（请参阅 “[原子提交再现](/ch12#sec_stream_atomic_commit)”），这可以很好地工作。然而，当事务需要涉及由不同人群编写的系统时（例如，当数据从流处理组件写入分布式键值存储或搜索索引时），缺乏标准化的事务协议会使集成更难。有幂等消费者的有序事件日志（请参阅 “[幂等性](/ch12#sec_stream_idempotence)”）是一种更简单的抽象，因此在异构系统中实现更加可行[^7]。

基于日志的集成的一大优势是各个组件之间的 **松散耦合（loose coupling）**，这体现在两个方面：

1. 在系统级别，异步事件流使整个系统在个别组件的中断或性能下降时更加稳健。如果消费者运行缓慢或失败，那么事件日志可以缓冲消息（请参阅 “[磁盘空间使用](/ch12#sec_stream_disk_usage)”），以便生产者和任何其他消费者可以继续不受影响地运行。有问题的消费者可以在问题修复后赶上，因此不会错过任何数据，并且故障被控制在局部。相比之下，分布式事务的同步交互往往会将本地故障升级为大规模故障（请参阅 “[分布式事务的限制](/ch8#sec_transactions_xa)”）。
2. 在人力方面，分拆数据系统允许不同的团队独立开发，改进和维护不同的软件组件和服务。专业化使得每个团队都可以专注于做好一件事，并与其他团队的系统以明确的接口交互。事件日志提供了一个足够强大的接口，以捕获相当强的一致性属性（由于持久性和事件的顺序），但也足够普适于几乎任何类型的数据。

#### 分拆系统与集成系统 {#id448}

如果分拆确实成为未来的方式，它也不会取代目前形式的数据库 —— 它们仍然会像以往一样被需要。为了维护流处理组件中的状态，数据库仍然是需要的，并且为批处理和流处理器的输出提供查询服务（请参阅 “[批处理工作流的输出](/ch11#sec_batch_output)” 与 “[流处理](/ch12#sec_stream_processing)”）。专用查询引擎对于特定的工作负载仍然非常重要：例如，MPP 数据仓库中的查询引擎针对探索性分析查询进行了优化，并且能够很好地处理这种类型的工作负载（请参阅 “[Hadoop 与分布式数据库的对比](/ch11#sec_batch_distributed)”）。

运行几种不同基础设施的复杂性可能是一个问题：每种软件都有一个学习曲线，配置问题和操作怪癖，因此部署尽可能少的移动部件是很有必要的。比起使用应用代码拼接多个工具而成的系统，单一集成软件产品也可以在其设计应对的工作负载类型上实现更好、更可预测的性能[^23]。正如在前言中所说的那样，为了不需要的规模而构建系统是白费精力，而且可能会将你锁死在一个不灵活的设计中。实际上，这是一种过早优化的形式。

分拆的目标不是在特定工作负载的性能上与单个数据库竞争；我们的目标是允许你结合多个不同的数据库，以便在比单个软件可能实现的更广泛的工作负载范围内实现更好的性能。这是关于广度，而不是深度 —— 与我们在 “[Hadoop 与分布式数据库的对比](/ch11#sec_batch_distributed)” 中讨论的存储和处理模型的多样性一样。

因此，如果有一项技术可以满足你的所有需求，那么最好使用该产品，而不是试图用更低层级的组件重新实现它。只有当没有单一软件满足你的所有需求时，才会出现分拆和联合的优势。

### 围绕数据流设计应用 {#sec_future_dataflow}

当底层数据发生变化时去更新派生数据，这个思路并不新鲜。比如电子表格就有很强的数据流编程能力[^33]：你可以在一个单元格写公式（例如对另一列求和），只要输入变化，结果就会自动重算。这正是我们希望数据系统具备的能力：数据库记录一旦变化，相关索引、缓存视图和聚合结果都应自动刷新，而不需要应用开发者关心刷新细节。

从这个意义上说，今天很多数据系统仍可以向 VisiCalc 在 1979 年就具备的特性学习[^34]。与电子表格不同的是，现代数据系统还必须同时满足容错、可伸缩、持久化存储、跨团队异构技术集成等要求，也必须能够复用已有库与服务。指望所有软件都在一种语言、框架或工具上统一实现并不现实。

#### 应用代码作为派生函数 {#sec_future_dataflow_derivation}

当一个数据集派生自另一个数据集时，它会经历某种转换函数。例如：

* 次级索引是由一种直白的转换函数生成的派生数据集：对于基础表中的每行或每个文档，它挑选被索引的列或字段中的值，并按这些值排序（假设使用 B 树或 SSTable 索引，按键排序，如 [第四章](/ch4) 所述）。
* 全文检索索引是通过应用各种自然语言处理函数而创建的，诸如语言检测、分词、词干或词汇化、拼写纠正和同义词识别，然后构建用于高效查找的数据结构（例如倒排索引）。
* 在机器学习系统中，我们可以将模型视作从训练数据通过应用各种特征提取、统计分析函数派生的数据，当模型应用于新的输入数据时，模型的输出是从输入和模型（因此间接地从训练数据）中派生的。
* 缓存通常包含将以用户界面（UI）显示的形式的数据聚合。因此填充缓存需要知道 UI 中引用的字段；UI 中的变更可能需要更新缓存填充方式的定义，并重建缓存。

用于次级索引的派生函数是如此常用的需求，以致于它作为核心功能被内建至许多数据库中，你可以简单地通过 `CREATE INDEX` 来调用它。对于全文索引，常见语言的基本语言特征可能内置到数据库中，但更复杂的特征通常需要领域特定的调整。在机器学习中，特征工程是众所周知的特定于应用的特征，通常需要包含很多关于用户交互与应用部署的详细知识[^35]。

当创建派生数据集的函数不是像创建次级索引那样的标准搬砖函数时，需要自定义代码来处理特定于应用的东西。而这个自定义代码是让许多数据库挣扎的地方，虽然关系数据库通常支持触发器、存储过程和用户定义的函数，可以用它们来在数据库中执行应用代码，但它们有点像数据库设计中事后才补上的功能（请参阅 “[传递事件流](/ch12#sec_stream_transmit)”）。

#### 应用代码和状态的分离 {#id344}

理论上，数据库可以是任意应用代码的部署环境，就如同操作系统一样。然而实践中它们对这一目标适配的很差。它们不满足现代应用开发的要求，例如依赖和软件包管理、版本控制、滚动升级、可演化性、监控、指标、对网络服务的调用以及与外部系统的集成。

另一方面，Mesos、YARN、Docker、Kubernetes 等部署和集群管理工具专为运行应用代码而设计。通过专注于做好一件事情，它们能够做得比那些将执行用户定义函数作为众多功能之一的数据库好得多。

我认为让系统的某些部分专门用于持久数据存储并让其他部分专门运行应用程序代码是有意义的。这两者可以在保持独立的同时互动。

现在大多数 Web 应用程序都是作为无状态服务部署的，其中任何用户请求都可以路由到任何应用程序服务器，并且服务器在发送响应后会忘记所有请求。这种部署方式很方便，因为可以随意添加或删除服务器，但状态必须到某个地方：通常是数据库。趋势是将无状态应用程序逻辑与状态管理（数据库）分开：不将应用程序逻辑放入数据库中，也不将持久状态置于应用程序中[^36]。正如函数式编程社区喜欢开玩笑说的那样，“我们相信 **教会（Church）** 与 **国家（state）** 的分离”[^37]。

在这个典型的 Web 应用模型中，数据库充当一种可以通过网络同步访问的可变共享变量。应用程序可以读取和更新变量，而数据库负责维持它的持久性，提供一些诸如并发控制和容错的功能。

但是，在大多数编程语言中，你无法订阅可变变量中的变更 —— 你只能定期读取它。与电子表格不同，如果变量的值发生变化，变量的读者不会收到通知（你可以在自己的代码中实现这样的通知 —— 这被称为 **观察者模式** —— 但大多数语言没有将这种模式作为内置功能）。

数据库继承了这种可变数据的被动方法：如果你想知道数据库的内容是否发生了变化，通常你唯一的选择就是轮询（即定期重复你的查询）。订阅变更只是刚刚开始出现的功能（请参阅 “[变更流的 API 支持](/ch12#sec_stream_change_api)”）。

#### 数据流：应用代码与状态变化的交互 {#id450}

从数据流的角度思考应用程序，意味着重新协调应用代码和状态管理之间的关系。我们不再将数据库视作被应用操纵的被动变量，取而代之的是更多地考虑状态，状态变更和处理它们的代码之间的相互作用与协同关系。应用代码通过在另一个地方触发状态变更来响应状态变更。

我们在 “[数据库与流](/ch12#sec_stream_databases)” 中看到了这一思路，我们讨论了将数据库的变更日志视为一种我们可以订阅的事件流。诸如 Actor 的消息传递系统（请参阅 “[消息传递中的数据流](/ch5#sec_encoding_dataflow_msg)”）也具有响应事件的概念。早在 20 世纪 80 年代，**元组空间（tuple space）** 模型就已经探索了表达分布式计算的方式：观察状态变更并作出反应的过程[^38] [^39]。

如前所述，当触发器由于数据变更而被触发时，或次级索引更新以反映索引表中的变更时，数据库内部也发生着类似的情况。分拆数据库意味着采纳这一想法，并将其用于在主数据库之外创建派生数据集：缓存、全文检索索引、机器学习或分析系统。我们可以为此使用流处理和消息传递系统。

需要记住的重要一点是，维护派生数据不同于执行异步任务。传统的消息传递系统通常是为执行异步任务设计的（请参阅 “[日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging)”）：

* 在维护派生数据时，状态变更的顺序通常很重要（如果多个视图是从事件日志派生的，则需要按照相同的顺序处理事件，以便它们之间保持一致）。如 “[确认与重新传递](/ch12#sec_stream_reordering)” 中所述，许多消息代理在重传未确认消息时没有此属性，双写也被排除在外（请参阅 “[保持系统同步](/ch12#sec_stream_sync)”）。
* 容错是派生数据的关键：仅仅丢失单个消息就会导致派生数据集永远与其数据源失去同步。消息传递和派生状态更新都必须可靠。例如，许多 Actor 系统默认在内存中维护 Actor 的状态和消息，所以如果运行 Actor 的机器崩溃，状态和消息就会丢失。

稳定的消息排序和容错消息处理是相当严格的要求，但与分布式事务相比，它们开销更小，运行更稳定。现代流处理组件可以提供这些排序和可靠性保证，并允许应用代码以流算子的形式运行。

这些应用代码可以执行任意处理，包括数据库内置派生函数通常不提供的功能。就像通过管道链接的 Unix 工具一样，流算子可以围绕着数据流构建大型系统。每个算子接受状态变更的流作为输入，并产生其他状态变化的流作为输出。

#### 流处理器和服务 {#id345}

当今流行的应用开发风格涉及将功能分解为一组通过同步网络请求（如 REST API）进行通信的 **服务**（service，请参阅 “[服务中的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)”）。这种面向服务的架构优于单一庞大应用的优势主要在于：通过松散耦合来提供组织上的可伸缩性：不同的团队可以专职于不同的服务上，从而减少团队之间的协调工作（因为服务可以独立部署和更新）。

在数据流中组装流算子与微服务方法有很多相似之处[^40]。但底层通信机制有很大区别：数据流采用单向异步消息流，而不是同步的请求 / 响应式交互。

除了在 “[消息传递中的数据流](/ch5#sec_encoding_dataflow_msg)” 中列出的优点（如更好的容错性），数据流系统还能实现更好的性能。例如，假设客户正在购买以一种货币定价，但以另一种货币支付的商品。为了执行货币换算，你需要知道当前的汇率。这个操作可以通过两种方式实现[^40] [^41]：

1. 在微服务方法中，处理购买的代码可能会查询汇率服务或数据库，以获取特定货币的当前汇率。
2. 在数据流方法中，处理订单的代码会提前订阅汇率变更流，并在汇率发生变动时将当前汇率存储在本地数据库中。处理订单时只需查询本地数据库即可。

第二种方法能将对另一服务的同步网络请求替换为对本地数据库的查询（可能在同一台机器甚至同一个进程中）。数据流方法不仅更快，而且当其他服务失效时也更稳健。最快且最可靠的网络请求就是压根没有网络请求！我们现在不再使用 RPC，而是在购买事件和汇率更新事件之间建立流联接（请参阅 “[流表连接（流扩充）](/ch12#sec_stream_table_joins)”）。

连接是时间相关的：如果购买事件在稍后的时间点被重新处理，汇率可能已经改变。如果要重建原始输出，则需要获取原始购买时的历史汇率。无论是查询服务还是订阅汇率更新流，你都需要处理这种时间相关性（请参阅 “[连接的时间依赖性](/ch12#sec_stream_join_time)”）。

订阅变更流，而不是在需要时查询当前状态，使我们更接近类似电子表格的计算模型：当某些数据发生变更时，依赖于此的所有派生数据都可以快速更新。还有很多未解决的问题，例如关于时间相关连接等问题，但我认为围绕数据流构建应用的想法是一个非常有希望的方向。

### 观察派生数据状态 {#sec_future_observing}

在抽象层面，上一节讨论的数据流系统给出了创建并维护派生数据集（如搜索索引、物化视图、预测模型）的过程。我们把这称为 **写路径（write path）**：当信息写入系统后，它可能经过多个批处理与流处理阶段，最终所有相关派生数据集都会被更新。[图 13-1](#fig_future_write_read_paths) 展示了搜索索引更新的例子。

{{< figure src="/fig/ddia_1301.png" id="fig_future_write_read_paths" caption="图 13-1 在搜索索引中，写入（文档更新）与读取（查询）相遇。" class="w-full my-4" >}}

但你为什么一开始就要创建派生数据集？很可能是因为你想在以后再次查询它。这就是 **读路径（read path）**：当服务用户请求时，你需要从派生数据集中读取，也许还要对结果进行一些额外处理，然后构建给用户的响应。

总而言之，写路径和读路径涵盖了数据的整个旅程，从收集数据开始，到使用数据结束（可能是由另一个人）。写路径是预计算过程的一部分 —— 即，一旦数据进入，即刻完成，无论是否有人需要看它。读路径是这个过程中只有当有人请求时才会发生的部分。如果你熟悉函数式编程语言，则可能会注意到写路径类似于立即求值，读路径类似于惰性求值。

如 [图 13-1](#fig_future_write_read_paths) 所示，派生数据集是写路径和读路径相遇的地方。它代表了写入时工作量与读取时工作量之间的权衡。

#### 物化视图和缓存 {#id451}

全文检索索引就是一个很好的例子：写路径更新索引，读路径在索引中搜索关键字。读写都需要做一些工作。写入需要更新文档中出现的所有关键词的索引条目。读取需要搜索查询中的每个单词，并应用布尔逻辑来查找包含查询中所有单词（AND 运算符）的文档，或者每个单词（OR 运算符）的任何同义词。

如果没有索引，搜索查询将不得不扫描所有文档（如 grep），如果有着大量文档，这样做的开销巨大。没有索引意味着写入路径上的工作量较少（没有要更新的索引），但是在读取路径上需要更多工作。

另一方面，可以想象为所有可能的查询预先计算搜索结果。在这种情况下，读路径上的工作量会减少：不需要布尔逻辑，只需查找查询结果并返回即可。但写路径会更加昂贵：可能的搜索查询集合是无限大的，因此预先计算所有可能的搜索结果将需要无限的时间和存储空间，这在实践中不可行。

另一种选择是预先计算一组固定的最常见查询的搜索结果，以便可以快速提供它们而无需转到索引。不常见的查询仍然可以通过索引来提供服务。这通常被称为常见查询的 **缓存（cache）**，尽管我们也可以称之为 **物化视图（materialized view）**，因为当新文档出现，且需要被包含在这些常见查询的搜索结果之中时，这些预先计算的结果就需要更新。

从这个例子中我们可以看到，索引不是写路径和读路径之间唯一可能的边界；缓存常见搜索结果也是可行的；而在少量文档上使用没有索引的类 grep 扫描也是可行的。由此来看，缓存，索引和物化视图的作用很简单：它们改变了读路径与写路径之间的边界。通过预先计算结果，从而允许我们在写路径上做更多的工作，以节省读路径上的工作量。

在写路径上完成的工作和读路径之间的界限，实际上是本书开始处在 “[描述负载](/ch2#sec_introduction_twitter)” 中推特例子里谈到的主题。在该例中，我们还看到了与普通用户相比，名人的写路径和读路径可能有所不同。在 500 页之后，我们已经绕回了起点！

#### 有状态、可离线的客户端 {#id347}

我发现写路径和读路径之间的边界很有趣，因为我们可以试着改变这个边界，并探讨这种改变的实际意义。我们来看看不同上下文中的这一想法。

过去二十年来，Web 应用的火热让我们对应用开发作出了一些很容易视作理所当然的假设。具体来说就是，客户端 / 服务器模型 —— 客户端大多是无状态的，而服务器拥有数据的权威 —— 已经普遍到我们几乎忘掉了还有其他任何模型的存在。但是技术在不断地发展，我认为不时地质疑现状非常重要。

传统上，网络浏览器是无状态的客户端，只有当连接到互联网时才能做一些有用的事情（能离线执行的唯一事情基本上就是上下滚动之前在线时加载好的页面）。然而，最近的 “单页面” JavaScript Web 应用已经获得了很多有状态的功能，包括客户端用户界面交互，以及 Web 浏览器中的持久化本地存储。移动应用可以类似地在设备上存储大量状态，而且大多数用户交互都不需要与服务器往返交互。

这些不断变化的功能重新引发了对 **离线优先（offline-first）** 应用的兴趣，这些应用尽可能地在同一设备上使用本地数据库，无需连接互联网，并在后台网络连接可用时与远程服务器同步[^42]。由于移动设备通常具有缓慢且不可靠的蜂窝网络连接，因此，如果用户的用户界面不必等待同步网络请求，且应用主要是离线工作的，则这是一个巨大优势（请参阅 “[需要离线操作的客户端](/ch6#sec_replication_offline_clients)”）。

当我们摆脱无状态客户端与中央数据库交互的假设，并转向在终端用户设备上维护状态时，这就开启了新世界的大门。特别是，我们可以将设备上的状态视为 **服务器状态的缓存**。屏幕上的像素是客户端应用中模型对象的物化视图；模型对象是远程数据中心中状态的本地副本[^27]。

#### 将状态变更推送给客户端 {#id348}

在典型的网页中，如果你在 Web 浏览器中加载页面，并且随后服务器上的数据发生变更，则浏览器在重新加载页面之前对此一无所知。浏览器只能在一个时间点读取数据，假设它是静态的 —— 它不会订阅来自服务器的更新。因此设备上的状态是陈旧的缓存，除非你显式轮询变更否则不会更新。（像 RSS 这样基于 HTTP 的 Feed 订阅协议实际上只是一种基本的轮询形式）

最近的协议已经超越了 HTTP 的基本请求 / 响应模式：服务端发送的事件（EventSource API）和 WebSockets 提供了通信信道，通过这些信道，Web 浏览器可以与服务器保持打开的 TCP 连接，只要浏览器仍然连接着，服务器就能主动向浏览器推送信息。这为服务器提供了主动通知终端用户客户端的机会，服务器能告知客户端其本地存储状态的任何变化，从而减少客户端状态的陈旧程度。

用我们的写路径与读路径模型来讲，主动将状态变更推送到客户端设备，意味着将写路径一直延伸到终端用户。当客户端首次初始化时，它仍然需要使用读路径来获取其初始状态，但此后它就能够依赖服务器发送的状态变更流了。我们在流处理和消息传递部分讨论的想法并不局限于数据中心中：我们可以进一步采纳这些想法，并将它们一直延伸到终端用户设备[^43]。

这些设备有时会离线，并在此期间无法收到服务器状态变更的任何通知。但是我们已经解决了这个问题：在 “[消费者偏移量](/ch12#sec_stream_log_offsets)” 中，我们讨论了基于日志的消息代理的消费者能在失败或断开连接后重连，并确保它不会错过掉线期间任何到达的消息。同样的技术适用于单个用户，每个设备都是一个小事件流的小小订阅者。

#### 端到端的事件流 {#id349}

最近用于开发有状态的客户端与用户界面的工具，例如 Elm 语言[^30]和 Facebook 的 React、Flux 和 Redux 工具链，已经通过订阅表示用户输入或服务器响应的事件流来管理客户端的内部状态，其结构与事件溯源相似（请参阅 “[事件溯源](/ch12#sec_stream_event_sourcing)”）。

将这种编程模型扩展为：允许服务器将状态变更事件推送到客户端的事件管道中，是非常自然的。因此，状态变化可以通过 **端到端（end-to-end）** 的写路径流动：从一个设备上的交互触发状态变更开始，经由事件日志，并穿过几个派生数据系统与流处理器，一直到另一台设备上的用户界面，而有人正在观察用户界面上的状态变化。这些状态变化能以相当低的延迟传播 —— 比如说，在一秒内从一端到另一端。

一些应用（如即时消息传递与在线游戏）已经具有这种 “实时” 架构（在低延迟交互的意义上，不是在 “[响应时间保证](/ch9#sec_distributed_clocks_realtime)” 中的意义上）。但我们为什么不用这种方式构建所有的应用？

挑战在于，关于无状态客户端和请求 / 响应交互的假设已经根深蒂固地植入在我们的数据库、库、框架以及协议之中。许多数据存储支持读取与写入操作，为请求返回一个响应，但只有极少数提供订阅变更的能力 —— 请求返回一个随时间推移的响应流（请参阅 “[变更流的 API 支持](/ch12#sec_stream_change_api)” ）。

为了将写路径延伸至终端用户，我们需要从根本上重新思考我们构建这些系统的方式：从请求 / 响应交互转向发布 / 订阅数据流[^27]。更具响应性的用户界面与更好的离线支持，我认为这些优势值得我们付出努力。如果你正在设计数据系统，我希望你对订阅变更的选项留有印象，而不只是查询当前状态。

#### 读也是事件 {#sec_future_read_events}

我们讨论过，当流处理器将派生数据写入存储（数据库，缓存或索引）时，以及当用户请求查询该存储时，存储将充当写路径和读路径之间的边界。该存储应当允许对数据进行随机访问的读取查询，否则这些查询将需要扫描整个事件日志。

在很多情况下，数据存储与流处理系统是分开的。但回想一下，流处理器还是需要维护状态以执行聚合和连接的（请参阅 “[流连接](/ch12#sec_stream_joins)”）。这种状态通常隐藏在流处理器内部，但一些框架也允许这些状态被外部客户端查询[^45]，将流处理器本身变成一种简单的数据库。

我愿意进一步思考这个想法。正如到目前为止所讨论的那样，对存储的写入是通过事件日志进行的，而读取是临时的网络请求，直接流向存储着待查数据的节点。这是一个合理的设计，但不是唯一可行的设计。也可以将读取请求表示为事件流，并同时将读事件与写事件送往流处理器；流处理器通过将读取结果发送到输出流来响应读取事件[^46]。

当写入和读取都被表示为事件，并且被路由到同一个流算子以便处理时，我们实际上是在读取查询流和数据库之间执行流表连接。读取事件需要被送往保存数据的数据库分区（请参阅 “[请求路由](/ch7#sec_sharding_routing)”），就像批处理和流处理器在连接时需要在同一个键上对输入分区一样（请参阅 “[Reduce 侧连接与分组](/ch11#sec_batch_join)”）。

服务请求与执行连接之间的这种相似之处是非常关键的[^47]。一次性读取请求只是将请求传过连接算子，然后请求马上就被忘掉了；而一个订阅请求，则是与连接另一侧过去与未来事件的持久化连接。

记录读取事件的日志可能对于追踪整个系统中的因果关系与数据来源也有好处：它可以让你重现出当用户做出特定决策之前看见了什么。例如在网商中，向客户显示的预测送达日期与库存状态，可能会影响他们是否选择购买一件商品[^4]。要分析这种联系，则需要记录用户查询运输与库存状态的结果。

将读取事件写入持久存储可以更好地跟踪因果关系（请参阅 “[排序事件以捕获因果关系](#sec_future_capture_causality)”），但会产生额外的存储与 I/O 成本。优化这些系统以减少开销仍然是一个开放的研究问题[^2]。但如果你已经出于运维目的留下了读取请求日志，将其作为请求处理的副作用，那么将这份日志作为请求事件源并不是什么特别大的变更。

#### 多分区数据处理 {#sec_future_unbundled_multi_shard}

对于只涉及单个分区的查询，通过流来发送查询与收集响应可能是杀鸡用牛刀了。然而，这个想法开启了分布式执行复杂查询的可能性，这需要合并来自多个分区的数据，利用了流处理器已经提供的消息路由、分区和连接的基础设施。

Storm 的分布式 RPC 功能支持这种使用模式（请参阅 “[消息传递和 RPC](/ch12#sec_stream_actors_drpc)”）。例如，它已经被用来计算浏览过某个推特 URL 的人数 —— 即，发推包含该 URL 的所有人的粉丝集合的并集[^48]。由于推特的用户是分区的，因此这种计算需要合并来自多个分区的结果。

这种模式的另一个例子是欺诈预防：为了评估特定购买事件是否具有欺诈风险，你可以检查该用户 IP 地址，电子邮件地址，帐单地址，送货地址的信用分。这些信用数据库中的每一个都是有分区的，因此为特定购买事件采集分数需要连接一系列不同的分区数据集[^49]。

MPP 数据库的内部查询执行图有着类似的特征（请参阅 “[Hadoop 与分布式数据库的对比](/ch11#sec_batch_distributed)”）。如果需要执行这种多分区连接，则直接使用提供此功能的数据库，可能要比使用流处理器实现它要更简单。然而将查询视为流提供了一种选项，可以用于实现超出传统现成解决方案的大规模应用。


## 追求正确性 {#sec_future_correctness}

对于只读取数据的无状态服务，出问题也没什么大不了的：你可以修复该错误并重启服务，而一切都恢复正常。像数据库这样的有状态系统就没那么简单了：它们被设计为永远记住事物（或多或少），所以如果出现问题，这种（错误的）效果也将潜在地永远持续下去，这意味着它们需要更仔细的思考[^50]。

我们希望构建可靠且 **正确** 的应用（即使面对各种故障，程序的语义也能被很好地定义与理解）。约四十年来，原子性、隔离性和持久性（[第八章](/ch8)）等事务特性一直是构建正确应用的首选工具。然而这些地基没有看上去那么牢固：例如弱隔离级别带来的困惑可以佐证（请参阅 “[弱隔离级别](/ch8#sec_transactions_isolation_levels)”）。

事务在某些领域被完全抛弃，并被提供更好性能与可伸缩性的模型取代，但后者有更复杂的语义（例如，请参阅 “[无主复制](/ch6#sec_replication_leaderless)”）。**一致性（Consistency）** 经常被谈起，但其定义并不明确（请参阅 “[一致性](/ch8#sec_transactions_acid_consistency)” 和 [第十章](/ch10)）。有些人断言我们应当为了高可用而 “拥抱弱一致性”，但却对这些概念实际上意味着什么缺乏清晰的认识。

对于如此重要的话题，我们的理解，以及我们的工程方法却是惊人地薄弱。例如，确定在特定事务隔离等级或复制配置下运行特定应用是否安全是非常困难的[^51] [^52]。通常简单的解决方案似乎在低并发性的情况下工作正常，并且没有错误，但在要求更高的情况下却会出现许多微妙的错误。

例如，Kyle Kingsbury 的 Jepsen 实验[^53]标出了一些产品声称的安全保证与其在网络问题与崩溃时的实际行为之间的明显差异。即使像数据库这样的基础设施产品没有问题，应用代码仍然需要正确使用它们提供的功能才行，如果配置很难理解，这是很容易出错的（在这种情况下指的是弱隔离级别，法定人数配置等）。

如果你的应用可以容忍偶尔的崩溃，以及以不可预料的方式损坏或丢失数据，那生活就要简单得多，而你可能只要双手合十念阿弥陀佛，期望佛祖能保佑最好的结果。另一方面，如果你需要更强的正确性保证，那么可串行化与原子提交就是久经考验的方法，但它们是有代价的：它们通常只在单个数据中心中工作（这就排除了地理位置分散的架构），并限制了系统能够实现的规模与容错特性。

虽然传统的事务方法不会消失，但我也相信，在使应用正确并能灵活应对故障方面，事务并不是最终的答案。在本节中，我将提出一些在数据流架构中考量正确性的方式。

### 数据库的端到端原则 {#sec_future_end_to_end}

仅仅因为一个应用程序使用了具有相对较强安全属性的数据系统（例如可串行化的事务），并不意味着就可以保证没有数据丢失或损坏。例如，如果某个应用有个 Bug，导致它写入不正确的数据，或者从数据库中删除数据，那么可串行化的事务也救不了你。

这个例子可能看起来很无聊，但值得认真对待：应用会出 Bug，而人也会犯错误。我在 “[状态、流和不变性](/ch12#sec_stream_immutability)” 中使用了这个例子来支持不可变和仅追加的数据，阉割掉错误代码摧毁良好数据的能力，能让从错误中恢复更为容易。

虽然不变性很有用，但它本身并非万灵药。让我们来看一个可能发生的、非常微妙的数据损坏案例。

#### 恰好执行一次操作 {#id353}

在 “[容错](/ch12#sec_stream_fault_tolerance)” 中，我们见到了 **恰好一次**（或 **等效一次**）语义的概念。如果在处理消息时出现问题，你可以选择放弃（丢弃消息 —— 导致数据丢失）或重试。如果重试，就会有这种风险：第一次实际上成功了，只不过你没有发现。结果这个消息就被处理了两次。

处理两次是数据损坏的一种形式：为同样的服务向客户收费两次（收费太多）或增长计数器两次（夸大指标）都不是我们想要的。在这种情况下，恰好一次意味着安排计算，使得最终效果与没有发生错误的情况一样，即使操作实际上因为某种错误而重试。我们先前讨论过实现这一目标的几种方法。

最有效的方法之一是使操作 **幂等**（idempotent，请参阅 “[幂等性](/ch12#sec_stream_idempotence)”）：即确保它无论是执行一次还是执行多次都具有相同的效果。但是，将不是天生幂等的操作变为幂等的操作需要一些额外的努力与关注：你可能需要维护一些额外的元数据（例如更新了值的操作 ID 集合），并在从一个节点故障切换至另一个节点时做好防护（请参阅 “[领导者和锁](/ch9#sec_distributed_lock_fencing)”）。

#### 抑制重复 {#id354}

除了流处理之外，其他许多地方也需要抑制重复的模式。例如，TCP 使用了数据包上的序列号，以便接收方可以将它们正确排序，并确定网络上是否有数据包丢失或重复。在将数据交付应用前，TCP 协议栈会重新传输任何丢失的数据包，也会移除任何重复的数据包。

但是，这种重复抑制仅适用于单条 TCP 连接的场景中。假设 TCP 连接是一个客户端与数据库的连接，并且它正在执行 [例 13-1](#fig_future_non_idempotent) 中的事务。在许多数据库中，事务是绑定在客户端连接上的（如果客户端发送了多个查询，数据库就知道它们属于同一个事务，因为它们是在同一个 TCP 连接上发送的）。如果客户端在发送 `COMMIT` 之后并在从数据库服务器收到响应之前遇到网络中断与连接超时，客户端是不知道事务是否已经被提交的（[图 9-1](/ch9#fig_distributed_network)）。

<a id="fig_future_non_idempotent"></a>

##### 例 13-1 资金从一个账户到另一个账户的非幂等转移

```sql
BEGIN TRANSACTION;
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

客户端可以重连到数据库并重试事务，但现在已经处于 TCP 重复抑制的范围之外了。因为 [例 13-1](#fig_future_non_idempotent) 中的事务不是幂等的，可能会发生转了 \$22 而不是期望的 \$11。因此，尽管 [例 13-1](#fig_future_non_idempotent) 是一个事务原子性的标准样例，但它实际上并不正确，而真正的银行并不会这样办事[^3]。

两阶段提交（请参阅 “[原子提交与两阶段提交](/ch8#sec_transactions_2pc)”）协议会破坏 TCP 连接与事务之间的 1:1 映射，因为它们必须在故障后允许事务协调器重连到数据库，告诉数据库将存疑事务提交还是中止。这足以确保事务只被恰好执行一次吗？不幸的是，并不能。

即使我们可以抑制数据库客户端与服务器之间的重复事务，我们仍然需要担心终端用户设备与应用服务器之间的网络。例如，如果终端用户的客户端是 Web 浏览器，则它可能会使用 HTTP POST 请求向服务器提交指令。也许用户正处于一个信号微弱的蜂窝数据网络连接中，它们成功地发送了 POST，但却在能够从服务器接收响应之前没了信号。

在这种情况下，可能会向用户显示错误消息，而他们可能会手动重试。Web 浏览器警告说，“你确定要再次提交这个表单吗？”  —— 用户选 “是”，因为他们希望操作发生（Post/Redirect/Get 模式[^54]可以避免在正常操作中出现此警告消息，但 POST 请求超时就没办法了）。从 Web 服务器的角度来看，重试是一个独立的请求；从数据库的角度来看，这是一个独立的事务。通常的除重机制无济于事。

#### 操作标识符 {#id355}

要在通过几跳的网络通信上使操作具有幂等性，仅仅依赖数据库提供的事务机制是不够的，你需要考虑 **端到端（end-to-end）** 的请求流。  
例如，你可以为操作生成一个唯一标识符（例如 UUID），并将其作为隐藏表单字段包含在客户端应用中，或通过计算所有相关表单字段的哈希来生成操作 ID[^3]。如果浏览器提交了两次 POST，请求会携带相同操作 ID。你就可以把这个 ID 贯穿传递到数据库，并确保同一个 ID 最多只执行一次，如 [例 13-2](#fig_future_request_id) 所示。

<a id="fig_future_request_id"></a>

##### 例 13-2 使用唯一 ID 抑制重复请求

```sql
ALTER TABLE requests ADD UNIQUE (request_id);

BEGIN TRANSACTION;
    INSERT INTO requests
        (request_id, from_account, to_account, amount)
        VALUES('0286FDB8-D7E1-423F-B40B-792B3608036C', 4321, 1234, 11.00);
    UPDATE accounts SET balance = balance + 11.00 WHERE account_id = 1234;
    UPDATE accounts SET balance = balance - 11.00 WHERE account_id = 4321;
COMMIT;
```

[例 13-2](#fig_future_request_id) 依赖于 `request_id` 列上的唯一约束。如果事务尝试插入已存在的 ID，`INSERT` 会失败并中止事务，从而避免重复生效。即使在较弱隔离级别下，关系数据库通常也能正确维护唯一性约束（而应用层的 “先检查再插入” 在不可串行化隔离下可能失败，见 “[写入偏差与幻读](/ch8#sec_transactions_write_skew)”）。

除了抑制重复请求，[例 13-2](#fig_future_request_id) 中的 `requests` 表本身也像一份事件日志，可用于事件溯源或变更数据捕获。账户余额更新并不一定要与事件插入放在同一事务中，因为余额是可由下游消费者从请求事件派生出的冗余状态；只要请求事件被恰好处理一次（同样可通过请求 ID 保证），即可保持正确性。

#### 端到端原则 {#sec_future_e2e_argument}

抑制重复事务的这种情况只是一个更普遍的原则的一个例子，这个原则被称为 **端到端原则（end-to-end argument）**，它在 1984 年由 Saltzer、Reed 和 Clark 阐述[^55]：

> 只有在通信系统两端应用的知识与帮助下，所讨论的功能才能完全地正确地实现。因而将这种被质疑的功能作为通信系统本身的功能是不可能的（有时，通信系统可以提供这种功能的不完备版本，可能有助于提高性能）。
>

在我们的例子中 **所讨论的功能** 是重复抑制。我们看到 TCP 在 TCP 连接层次抑制了重复的数据包，一些流处理器在消息处理层次提供了所谓的恰好一次语义，但这些都无法阻止当一个请求超时时，用户亲自提交重复的请求。TCP，数据库事务，以及流处理器本身并不能完全排除这些重复。解决这个问题需要一个端到端的解决方案：从终端用户的客户端一路传递到数据库的事务标识符。

端到端原则也适用于检查数据的完整性：以太网，TCP 和 TLS 中内置的校验和可以检测网络中数据包的损坏情况，但是它们无法检测到由连接两端发送 / 接收软件中 Bug 导致的损坏，也无法检测到数据存储所在磁盘上的损坏。如果你想捕获数据所有可能的损坏来源，你也需要端到端的校验和。

类似的原则也适用于加密[^55]：家庭 WiFi 网络上的密码可以防止人们窃听你的 WiFi 流量，但无法阻止互联网上其他地方攻击者的窥探；客户端与服务器之间的 TLS/SSL 可以阻挡网络攻击者，但无法阻止恶意服务器。只有端到端的加密和认证可以防止所有这些事情。

尽管低层级的功能（TCP 重复抑制、以太网校验和、WiFi 加密）无法单独提供所需的端到端功能，但它们仍然很有用，因为它们能降低较高层级出现问题的可能性。例如，如果我们没有 TCP 来将数据包排成正确的顺序，那么 HTTP 请求通常就会被搅烂。我们只需要记住，低级别的可靠性功能本身并不足以确保端到端的正确性。

#### 在数据系统中应用端到端思考 {#id357}

这将我带回最初的论点：仅仅因为应用使用了提供相对较强安全属性的数据系统，例如可串行化的事务，并不意味着应用的数据就不会丢失或损坏了。应用本身也需要采取端到端的措施，例如除重。

这实在是一个遗憾，因为容错机制很难弄好。低层级的可靠机制（比如 TCP 中的那些）运行的相当好，因而剩下的高层级错误基本很少出现。如果能将这些剩下的高层级容错机制打包成抽象，而应用不需要再去操心，那该多好呀 —— 但恐怕我们还没有找到这一正确的抽象。

长期以来，事务被认为是一个很好的抽象，我相信它们确实是很有用的。正如 [第八章](/ch8) 中所讨论的，它们将各种可能的问题（并发写入、违背约束、崩溃、网络中断、磁盘故障）合并为两种可能结果：提交或中止。这是对编程模型而言的一种巨大简化，但这还不够。

事务是代价高昂的，当涉及异构存储技术时尤为甚（请参阅 “[实践中的分布式事务](/ch8#sec_transactions_xa)”）。我们拒绝使用分布式事务是因为它开销太大，结果我们最后不得不在应用代码中重新实现容错机制。正如本书中大量的例子所示，对并发性与部分失败的推理是困难且违反直觉的，所以我怀疑大多数应用级别的机制都不能正确工作，最终结果是数据丢失或损坏。

出于这些原因，我认为探索对容错的抽象是很有价值的。它使提供应用特定的端到端的正确性属性变得更简单，而且还能在大规模分布式环境中提供良好的性能与运维特性。

### 强制约束 {#sec_future_constraints}

让我们思考一下在 [分拆数据库](#sec_future_unbundling) 上下文中的 **正确性（correctness）**。我们看到端到端的除重可以通过从客户端一路透传到数据库的请求 ID 实现。那么其他类型的约束呢？

我们先来特别关注一下 **唯一性约束** —— 例如我们在 [例 13-2](#fig_future_request_id) 中所依赖的约束。在 “[约束和唯一性保证](/ch10#sec_consistency_uniqueness)” 中，我们看到了几个其他需要强制实施唯一性的应用功能例子：用户名或电子邮件地址必须唯一标识用户，文件存储服务不能包含多个重名文件，两个人不能在航班或剧院预订同一个座位。

其他类型的约束也非常类似：例如，确保帐户余额永远不会变为负数，确保不会超卖库存，或者会议室没有重复的预订。执行唯一性约束的技术通常也可以用于这些约束。

#### 唯一性约束需要达成共识 {#id452}

在 [第十章](/ch10) 中我们看到，在分布式环境中，强制执行唯一性约束需要共识：如果存在多个具有相同值的并发请求，则系统需要决定冲突操作中的哪一个被接受，并拒绝其他违背约束的操作。

达成这一共识的最常见方式是使单个节点作为领导，并使其负责所有决策。只要你不介意所有请求都挤过单个节点（即使客户端位于世界的另一端），只要该节点没有失效，系统就能正常工作。如果你需要容忍领导者失效，那么就又回到了共识问题（请参阅 “[单主复制与共识](/ch10#from-single-leader-replication-to-consensus)”）。

唯一性检查可以通过对唯一性字段分区做横向伸缩。例如，如果需要通过请求 ID 确保唯一性（如 [例 13-2](#fig_future_request_id) 所示），你可以确保所有具有相同请求 ID 的请求都被路由到同一分区（请参阅 [第七章](/ch7)）。如果你需要让用户名是唯一的，则可以按用户名的散列值做分区。

但异步多主复制排除在外，因为可能会发生不同主库同时接受冲突写操作的情况，因而这些值不再是唯一的（请参阅 “[实现线性一致的系统](/ch10#sec_consistency_implementing_linearizable)”）。如果你想立刻拒绝任何违背约束的写入，同步协调是无法避免的[^56]。

#### 基于日志消息传递中的唯一性 {#sec_future_uniqueness_log}

日志确保所有消费者以相同顺序看到消息，这在形式上称为 **全序广播（total order broadcast）**，并且等价于共识（请参阅 “[全序广播](/ch10#sec_consistency_total_order)”）。在基于日志消息传递的分拆数据库方案中，我们可以用同样的思路来实施唯一性约束。

流处理器在单个线程上依次消费单个日志分区中的所有消息（请参阅 “[日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging)”）。因此，如果日志是按需要确保唯一的值做的分区，则流处理器可以无歧义地、确定性地决定几个冲突操作中的哪一个先到达。例如，在多个用户尝试宣告相同用户名的情况下[^57]：

1. 每个对用户名的请求都被编码为一条消息，并追加到按用户名散列值确定的分区。
2. 流处理器依序读取日志中的请求，并使用本地数据库来追踪哪些用户名已经被占用了。对于所有申请可用用户名的请求，它都会记录该用户名，并向输出流发送一条成功消息。对于所有申请已占用用户名的请求，它都会向输出流发送一条拒绝消息。
3. 请求用户名的客户端监视输出流，等待与其请求相对应的成功或拒绝消息。

该算法基本上与 “[使用全序广播实现线性一致的存储](/ch10#sec_consistency_total_order)” 中的算法相同。它可以简单地通过增加分区数伸缩至较大的请求吞吐量，因为每个分区都可以被独立处理。

该方法不仅适用于唯一性约束，而且适用于许多其他类型的约束。其基本原理是，任何可能冲突的写入都会路由到相同的分区并按顺序处理。正如 “[什么是冲突？](/ch6#what-is-a-conflict)” 与 “[写入偏差与幻读](/ch8#sec_transactions_write_skew)” 中所述，冲突的定义可能取决于应用，但流处理器可以使用任意逻辑来验证请求。这个想法与 Bayou 在 90 年代开创的方法类似[^58]。

#### 多分区请求处理 {#id360}

当请求涉及多个分区时，如何在满足约束的同时保证原子效果，会更有挑战性。在 [例 13-2](#fig_future_request_id) 中，至少可能涉及三个分区：请求 ID 所在分区、收款账户所在分区、付款账户所在分区。它们彼此独立，并不必然位于同一分区。

在传统数据库方案里，这类事务通常需要跨分区原子提交；这会把事务强行纳入跨分区全序，从而引入同步协调开销并影响吞吐量。  
但使用分区日志与流处理器，也可以在不使用跨分区原子提交的情况下达到等价正确性。

{{< figure src="/fig/ddia_1302.png" id="fig_future_multi_shard" caption="图 13-2 使用事件日志与流处理器，检查源账户是否有足够余额，并将资金原子地划转到目标账户与手续费账户。" class="w-full my-4" >}}

1.  客户端为转账请求生成全局唯一请求 ID，并将请求按源账户 ID 路由到相应日志分区。
2.  一个流处理器消费该请求日志，并维护源账户本地状态及已处理请求 ID 集。遇到新请求 ID 时，先检查余额是否充足；若充足，则在本地状态中预留金额，并发出多个后续事件：源账户的出账事件、目标账户的入账事件、手续费账户的入账事件。所有事件都携带同一请求 ID。
3.  源账户处理器稍后会再次收到出账事件。它根据请求 ID 识别出这是先前预留过的支付，执行真正扣款并更新本地状态；若重复到达则忽略。
4.  目标账户与手续费账户各自由独立处理任务消费。收到入账事件后更新本地状态，并基于请求 ID 去重。

图 13-2 虽然画成三个账户落在三个分区中，但即使在同一分区也同样成立。关键条件是：同一账户的事件必须按日志顺序处理，且消息投递具备至少一次语义，处理逻辑保持确定性。

如果源账户处理器在处理中崩溃，恢复后会重放相同请求并做出相同决策，发出相同请求 ID 的后续事件。下游消费者会基于请求 ID 去重，因此不会重复生效。

这个系统的原子性不来自分布式事务，而来自初始请求事件写入源账户日志这一原子动作。只要这个起点事件写入成功，后续事件最终都会出现：它们可能因故障恢复而延迟，也可能短暂重复，但最终可达。

通过把多分区事务拆成多个按不同键分区的阶段，并贯穿端到端请求 ID，我们在故障场景下依然能保证“每个请求对付款方与收款方都恰好生效一次”，同时避免使用原子提交协议。

### 及时性与完整性 {#sec_future_integrity}

事务的一个便利属性是，它们通常是线性一致的（请参阅 “[线性一致性](/ch10#sec_consistency_linearizability)”），也就是说，写入者会等到事务提交，而之后其写入立刻对所有读取者可见。

当我们把一个操作拆分为跨越多个阶段的流处理器时，却并非如此：日志的消费者在设计上就是异步的，因此发送者不会等其消息被消费者处理完。但是，客户端等待输出流中的特定消息是可能的。这正是我们在 “[基于日志消息传递中的唯一性](#sec_future_uniqueness_log)” 一节中检查唯一性约束时所做的事情。

在这个例子中，唯一性检查的正确性不取决于消息发送者是否等待结果。等待的目的仅仅是同步通知发送者唯一性检查是否成功。但该通知可以与消息处理的结果相解耦。

更一般地来讲，我认为 **一致性（consistency）** 这个术语混淆了两个值得分别考虑的需求：

* 及时性（Timeliness）

  及时性意味着确保用户观察到系统的最新状态。我们之前看到，如果用户从陈旧的数据副本中读取数据，他们可能会观察到系统处于不一致的状态（请参阅 “[复制延迟问题](/ch6#sec_replication_lag)”）。但这种不一致是暂时的，而最终会通过等待与重试简单地得到解决。

  CAP 定理（请参阅 “[线性一致性的代价](/ch10#sec_linearizability_cost)”）使用 **线性一致性（linearizability）** 意义上的一致性，这是实现及时性的强有力方法。像 **写后读** 这样及时性更弱的一致性也很有用（请参阅 “[读己之写](/ch6#sec_replication_ryw)”）。

* 完整性（Integrity）

  完整性意味着没有损坏；即没有数据丢失，并且没有矛盾或错误的数据。尤其是如果某些派生数据集是作为底层数据之上的视图而维护的（请参阅 “[从事件日志中派生出当前状态](/ch12#sec_stream_deriving_views)”），这种派生必须是正确的。例如，数据库索引必须正确地反映数据库的内容 —— 缺失某些记录的索引并不是很有用。

  如果完整性被违背，这种不一致是永久的：在大多数情况下，等待与重试并不能修复数据库损坏。相反的是，需要显式地检查与修复。在 ACID 事务的上下文中（请参阅 “[ACID 的含义](/ch8#sec_transactions_acid)”），一致性通常被理解为某种特定于应用的完整性概念。原子性和持久性是保持完整性的重要工具。


口号形式：违反及时性，“最终一致性”；违反完整性，“永无一致性”。

我断言在大多数应用中，完整性比及时性重要得多。违反及时性可能令人困惑与讨厌，但违反完整性的结果可能是灾难性的。

例如在你的信用卡对账单上，如果某一笔过去 24 小时内完成的交易尚未出现并不令人奇怪 —— 这些系统有一定的滞后是正常的。我们知道银行是异步核算与敲定交易的，这里的及时性并不是非常重要[^3]。但如果当期对账单余额与上期对账单余额加交易总额对不上（求和错误），或者出现一笔向你收费但未向商家付款的交易（消失的钱），那就实在是太糟糕了，这样的问题就违背了系统的完整性。

#### 数据流系统的正确性 {#id453}

ACID 事务通常既提供及时性（例如线性一致性）也提供完整性保证（例如原子提交）。因此如果你从 ACID 事务的角度来看待应用的正确性，那么及时性与完整性的区别是无关紧要的。

另一方面，对于在本章中讨论的基于事件的数据流系统而言，它们的一个有趣特性就是将及时性与完整性分开。在异步处理事件流时不能保证及时性，除非你显式构建一个在返回之前明确等待特定消息到达的消费者。但完整性实际上才是流处理系统的核心。

**恰好一次** 或 **等效一次** 语义（请参阅 “[容错](/ch12#sec_stream_fault_tolerance)”）是一种保持完整性的机制。如果事件丢失或者生效两次，就有可能违背数据系统的完整性。因此在出现故障时，容错消息传递与重复抑制（例如，幂等操作）对于维护数据系统的完整性是很重要的。

正如我们在上一节看到的那样，可靠的流处理系统可以在无需分布式事务与原子提交协议的情况下保持完整性，这意味着它们有潜力达到与后者相当的正确性，同时还具备好得多的性能与运维稳健性。为了达成这种正确性，我们组合使用了多种机制：

* 将写入操作的内容表示为单条消息，从而可以轻松地被原子写入 —— 与事件溯源搭配效果拔群（请参阅 “[事件溯源](/ch12#sec_stream_event_sourcing)”）。
* 使用与存储过程类似的确定性派生函数，从这一消息中派生出所有其他的状态变更（请参阅 “[真的串行执行](/ch8#sec_transactions_serial)” 和 “[应用代码作为派生函数](#sec_future_dataflow_derivation)”）
* 将客户端生成的请求 ID 传递通过所有的处理层次，从而允许端到端的除重，带来幂等性。
* 使消息不可变，并允许派生数据能随时被重新处理，这使从错误中恢复更加容易（请参阅 “[不可变事件的优点](/ch12#sec_stream_immutability_pros)”）

这种机制组合在我看来，是未来构建容错应用的一个非常有前景的方向。

#### 宽松地解释约束 {#id362}

如前所述，执行唯一性约束需要共识，通常通过在单个节点中汇集特定分区中的所有事件来实现。如果我们想要传统的唯一性约束形式，这种限制是不可避免的，流处理也不例外。

然而另一个需要了解的事实是，许多真实世界的应用实际上可以摆脱这种形式，接受弱得多的唯一性：

* 如果两个人同时注册了相同的用户名或预订了相同的座位，你可以给其中一个人发消息道歉，并要求他们换一个不同的用户名或座位。这种纠正错误的变化被称为 **补偿性事务（compensating transaction）**[^59] [^60]。
* 如果客户订购的物品多于仓库中的物品，你可以下单补仓，并为延误向客户道歉，向他们提供折扣。实际上，这么说吧，如果叉车在仓库中轧过了你的货物，剩下的货物比你想象的要少，那么你也是得这么做[^61]。因此，既然道歉工作流无论如何已经成为你商业过程中的一部分了，那么对库存物品数目添加线性一致的约束可能就没必要了。
* 与之类似，许多航空公司都会超卖机票，打着一些旅客可能会错过航班的算盘；许多旅馆也会超卖客房，抱着部分客人可能会取消预订的期望。在这些情况下，出于商业原因而故意违反了 “一人一座” 的约束；当需求超过供给的情况出现时，就会进入补偿流程（退款、升级舱位 / 房型、提供隔壁酒店的免费的房间）。即使没有超卖，为了应对由恶劣天气或员工罢工导致的航班取消，你还是需要道歉与补偿流程 —— 从这些问题中恢复仅仅是商业活动的正常组成部分。
* 如果有人从账户超额取款，银行可以向他们收取透支费用，并要求他们偿还欠款。通过限制每天的提款总额，银行的风险是有限的。

在许多商业场景中，临时违背约束并稍后通过道歉来修复，实际上是可以接受的。道歉的成本各不相同，但通常很低（以金钱或名声来算）：你无法撤回已发送的电子邮件，但可以发送一封后续电子邮件进行更正。如果你不小心向信用卡收取了两次费用，则可以将其中一项收费退款，而代价仅仅是手续费，也许还有客户的投诉。尽管一旦 ATM 吐了钱，你无法直接取回，但原则上如果账户透支而客户拒不支付，你可以派催收员收回欠款。

道歉的成本是否能接受是一个商业决策。如果可以接受的话，在写入数据之前检查所有约束的传统模型反而会带来不必要的限制，而线性一致性的约束也不是必须的。乐观写入，事后检查可能是一种合理的选择。你仍然可以在做一些挽回成本高昂的事情前确保有相关的验证，但这并不意味着写入数据之前必须先进行验证。

这些应用 **确实** 需要完整性：你不会希望丢失预订信息，或者由于借方贷方不匹配导致资金消失。但是它们在执行约束时 **并不需要** 及时性：如果你销售的货物多于仓库中的库存，可以在事后道歉并弥补问题。这种做法与我们在 “[处理写入冲突](/ch6#sec_replication_write_conflicts)” 中讨论的冲突解决方法类似。

#### 无协调数据系统 {#id454}

我们现在已经做了两个有趣的观察：

1. 数据流系统可以维持派生数据的完整性保证，而无需原子提交、线性一致性或者同步的跨分区协调。
2. 虽然严格的唯一性约束要求及时性和协调，但许多应用实际上可以接受宽松的约束：只要整个过程保持完整性，这些约束可能会被临时违反并在稍后被修复。

总之这些观察意味着，数据流系统可以为许多应用提供无需协调的数据管理服务，且仍能给出很强的完整性保证。这种 **无协调（coordination-avoiding）** 的数据系统有着很大的吸引力：比起需要执行同步协调的系统，它们能达到更好的性能与更强的容错能力[^56]。

例如，这种系统可以使用多领导者配置运维，跨越多个数据中心，在区域间异步复制。任何一个数据中心都可以持续独立运行，因为不需要同步的跨区域协调。这样的系统的及时性保证会很弱 —— 如果不引入协调它是不可能是线性一致的 —— 但它仍然可以提供有力的完整性保证。

在这种情况下，可串行化事务作为维护派生状态的一部分仍然是有用的，但它们只能在小范围内运行，在那里它们工作得很好[^8]。异构分布式事务（如 XA 事务，请参阅 “[实践中的分布式事务](/ch8#sec_transactions_xa)”）不是必需的。同步协调仍然可以在需要的地方引入（例如在无法恢复的操作之前强制执行严格的约束），但是如果只是应用的一小部分地方需要它，没必要让所有操作都付出协调的代价[^43]。

另一种审视协调与约束的角度是：它们减少了由于不一致而必须做出的道歉数量，但也可能会降低系统的性能和可用性，从而可能增加由于宕机中断而需要做出的道歉数量。你不可能将道歉数量减少到零，但可以根据自己的需求寻找最佳平衡点 —— 既不存在太多不一致性，又不存在太多可用性问题。

### 信任但验证 {#sec_future_verification}

我们所有关于正确性，完整性和容错的讨论都基于一些假设，假设某些事情可能会出错，但其他事情不会。我们将这些假设称为我们的 **系统模型**（system model，请参阅 “[将系统模型映射到现实世界](/ch9#sec_distributed_system_model)”）：例如，我们应该假设进程可能会崩溃，机器可能突然断电，网络可能会任意延迟或丢弃消息。但是我们也可能假设写入磁盘的数据在执行 `fsync` 后不会丢失，内存中的数据没有损坏，而 CPU 的乘法指令总是能返回正确的结果。

这些假设是相当合理的，因为大多数时候它们都是成立的，如果我们不得不经常担心计算机出错，那么基本上寸步难行。在传统上，系统模型采用二元方法处理故障：我们假设有些事情可能会发生，而其他事情 **永远** 不会发生。实际上，这更像是一个概率问题：有些事情更有可能，其他事情不太可能。问题在于违反我们假设的情况是否经常发生，以至于我们可能在实践中遇到它们。

我们已经看到，数据可能会在内存中、磁盘上、以及网络传输过程中出现损坏。也许这件事值得我们投入更多关注：当系统规模足够大时，哪怕概率再低的问题也会在现实中发生。

#### 维护完整性，尽管软件有Bug {#id455}

除了这些硬件问题之外，总是存在软件 Bug 的风险，这些错误不会被较低层次的网络、内存或文件系统校验和所捕获。即使广泛使用的数据库软件也有 Bug：即使像 MySQL 与 PostgreSQL 这样稳健、口碑良好、多年来被许多人充分测试过的软件，就我个人所见也有 Bug，比如 MySQL 未能正确维护唯一约束[^65]，以及 PostgreSQL 的可串行化隔离等级存在特定的写入偏差异常[^66]。对于不那么成熟的软件来说，情况可能要糟糕得多。

尽管在仔细设计，测试，以及审查上做出很多努力，但 Bug 仍然会在不知不觉中产生。尽管它们很少，而且最终会被发现并被修复，但总会有那么一段时间，这些 Bug 可能会损坏数据。

而对于应用代码，我们不得不假设会有更多的错误，因为绝大多数应用的代码经受的评审与测试远远无法与数据库的代码相比。许多应用甚至没有正确使用数据库提供的用于维持完整性的功能，例如外键或唯一性约束[^36]。

ACID 意义下的一致性（请参阅 “[一致性](/ch8#sec_transactions_acid_consistency)”）基于这样一种想法：数据库以一致的状态启动，而事务将其从一个一致状态转换至另一个一致的状态。因此，我们期望数据库始终处于一致状态。然而，只有当你假设事务没有 Bug 时，这种想法才有意义。如果应用以某种错误的方式使用数据库，例如，不安全地使用弱隔离等级，数据库的完整性就无法得到保证。

#### 不要盲目信任承诺 {#id364}

由于硬件和软件并不总是符合我们的理想，所以数据损坏似乎早晚不可避免。因此，我们至少应该有办法查明数据是否已经损坏，以便我们能够修复它，并尝试追查错误的来源。检查数据完整性称为 **审计（auditing）**。

如 “[不可变事件的优点](/ch12#sec_stream_immutability_pros)” 一节中所述，审计不仅仅适用于财务应用程序。不过，可审计性在财务中是非常非常重要的，因为每个人都知道错误总会发生，我们也都认为能够检测和解决问题是合理的需求。

成熟的系统同样倾向于考虑不太可能的事情出错的可能性，并管理这种风险。例如，HDFS 和 Amazon S3 等大规模存储系统并不完全信任磁盘：它们运行后台进程持续回读文件，并将其与其他副本进行比较，并将文件从一个磁盘移动到另一个，以便降低静默损坏的风险[^67]。

如果你想确保你的数据仍然存在，你必须真正读取它并进行检查。大多数时候它们仍然会在那里，但如果不是这样，你一定想尽早知道答案，而不是更晚。按照同样的原则，不时地尝试从备份中恢复是非常重要的 —— 否则当你发现备份损坏时，你可能已经遇到了数据丢失，那时候就真的太晚了。不要盲目地相信它们全都管用。

#### 为可审计性而设计 {#id365}

如果一个事务在一个数据库中改变了多个对象，在这一事实发生后，很难说清这个事务到底意味着什么。即使你捕获了事务日志（请参阅 “[变更数据捕获](/ch12#sec_stream_cdc)”），各种表中的插入、更新和删除操作并不一定能清楚地表明 **为什么** 要执行这些变更。决定这些变更的是应用逻辑中的调用，而这一应用逻辑稍纵即逝，无法重现。

相比之下，基于事件的系统可以提供更好的可审计性。在事件溯源方法中，系统的用户输入被表示为一个单一不可变事件，而任何其导致的状态变更都派生自该事件。派生可以实现为具有确定性与可重复性，因而相同的事件日志通过相同版本的派生代码时，会导致相同的状态变更。

显式处理数据流（请参阅 “[批处理输出的哲学](/ch11#sec_batch_output)”）可以使数据的 **来龙去脉（provenance）** 更加清晰，从而使完整性检查更具可行性。对于事件日志，我们可以使用散列来检查事件存储没有被破坏。对于任何派生状态，我们可以重新运行从事件日志中派生它的批处理器与流处理器，以检查是否获得相同的结果，或者，甚至并行运行冗余的派生流程。

具有确定性且定义良好的数据流，也使调试与跟踪系统的执行变得容易，以便确定它 **为什么** 做了某些事情[^4] [^69]。如果出现意想之外的事情，那么重现导致意外事件的确切事故现场的诊断能力 —— 一种时间旅行调试功能 —— 是非常有价值的。

#### 端到端原则重现 {#id456}

如果我们不能完全相信系统的每个组件都不会损坏 —— 每一个硬件都没缺陷，每一个软件都没有 Bug —— 那我们至少必须定期检查数据的完整性。如果我们不检查，我们就不能发现损坏，直到无可挽回地导致对下游的破坏时，那时候再去追踪问题就要难得多，且代价也要高得多。

检查数据系统的完整性，最好是以端到端的方式进行（请参阅 “[数据库的端到端原则](#sec_future_end_to_end)”）：我们能在完整性检查中涵盖的系统越多，某些处理阶段中出现未被察觉的损坏的几率就越小。如果我们能检查整个派生数据管道端到端的正确性，那么沿着这一路径的任何磁盘、网络、服务以及算法的正确性检查都隐含在其中了。

持续的端到端完整性检查可以不断提高你对系统正确性的信心，从而使你能更快地进步[^70]。与自动化测试一样，审计提高了快速发现错误的可能性，从而降低了系统变更或新存储技术可能导致损失的风险。如果你不害怕进行变更，就可以更好地充分演化一个应用，使其满足不断变化的需求。

#### 用于可审计数据系统的工具 {#id366}

目前，把可审计性作为一级目标的数据系统还不多。一些应用会实现自己的审计机制（例如把变更写入独立审计表），但要同时保证审计日志与主数据库状态都不可篡改仍然很难。

像 Bitcoin、Ethereum 这样的区块链，本质上是带密码学一致性校验的共享仅追加日志；交易可视作事件，智能合约可视作流处理器。它们通过共识协议让所有节点同意同一事件序列。与本书 [第十章](/ch10) 的共识协议相比，区块链的一个差异是强调拜占庭容错：参与节点会持续相互校验完整性[^71] [^72] [^73]。

对多数应用而言，区块链整体开销仍偏高；但其中一些密码学工具可在更轻量的场景复用。比如 **默克尔树（Merkle tree）**[^74]可高效证明某条记录属于某数据集。**证书透明性（certificate transparency）** 使用可验证的仅追加日志与默克尔树来校验 TLS/SSL 证书有效性[^75] [^76]。

未来，这类完整性校验与审计算法可能会在通用数据系统中更广泛应用。要把它们做到与无密码学审计系统同等级别的可伸缩性，同时把性能开销压到足够低，仍需要工程改进，但方向值得重视。


## 本章小结 {#id367}

在本章中，我们讨论了设计数据系统的新方式，而且也包括了我的个人观点，以及对未来的猜测。我们从这样一种观察开始：没有单种工具能高效服务所有可能的用例，因此应用必须组合使用几种不同的软件才能实现其目标。我们讨论了如何使用批处理与事件流来解决这一 **数据集成（data integration）** 问题，以便让数据变更在不同系统之间流动。

在这种方法中，某些系统被指定为记录系统，而其他数据则通过转换派生自记录系统。通过这种方式，我们可以维护索引、物化视图、机器学习模型、统计摘要等等。通过使这些派生和转换操作异步且松散耦合，能够防止一个区域中的问题扩散到系统中不相关部分，从而增加整个系统的稳健性与容错性。

将数据流表示为从一个数据集到另一个数据集的转换也有助于演化应用程序：如果你想变更其中一个处理步骤，例如变更索引或缓存的结构，则可以在整个输入数据集上重新运行新的转换代码，以便重新派生输出。同样，出现问题时，你也可以修复代码并重新处理数据以便恢复。

这些过程与数据库内部已经完成的过程非常类似，因此我们将数据流应用的概念重新改写为，**分拆（unbundling）** 数据库组件，并通过组合这些松散耦合的组件来构建应用程序。

派生状态可以通过观察底层数据的变更来更新。此外，派生状态本身可以进一步被下游消费者观察。我们甚至可以将这种数据流一路传送至显示数据的终端用户设备，从而构建可动态更新以反映数据变更，并在离线时能继续工作的用户界面。

接下来，我们讨论了如何确保所有这些处理在出现故障时保持正确。我们看到可伸缩的强完整性保证可以通过异步事件处理来实现，通过使用端到端操作标识符使操作幂等，以及通过异步检查约束。客户端可以等到检查通过，或者不等待继续前进，但可能要冒违反约束而需要道歉的风险。这种方法比使用分布式事务的传统方法更具可伸缩性与可靠性，并且在实践中适用于很多业务流程。

通过围绕数据流构建应用，并异步检查约束，我们可以避免绝大多数协调，构建在地理分布和故障场景下依然保持完整性且性能良好的系统。随后我们还讨论了如何通过审计验证完整性、发现损坏，并指出区块链/分布式账本所使用的一些机制与事件驱动系统在思想上也存在共通之处。


##### Footnotes

### References {#references}

[^1]: Rachid Belaid: “[Postgres Full-Text Search is Good Enough!](http://rachbelaid.com/postgres-full-text-search-is-good-enough/),” *rachbelaid.com*, July 13, 2015.
[^2]: Philippe Ajoux, Nathan Bronson, Sanjeev Kumar, et al.: “[Challenges to Adopting Stronger Consistency at Scale](https://www.usenix.org/system/files/conference/hotos15/hotos15-paper-ajoux.pdf),” at *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015.
[^3]: Pat Helland and Dave Campbell: “[Building on Quicksand](https://web.archive.org/web/20220606172817/https://database.cs.wisc.edu/cidr/cidr2009/Paper_133.pdf),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^4]: Jessica Kerr: “[Provenance and Causality in Distributed Systems](https://web.archive.org/web/20190425150540/http://blog.jessitron.com/2016/09/provenance-and-causality-in-distributed.html),” *blog.jessitron.com*, September 25, 2016.
[^5]: Kostas Tzoumas: “[Batch Is a Special Case of Streaming](http://data-artisans.com/blog/batch-is-a-special-case-of-streaming/),” *data-artisans.com*, September 15, 2015.
[^6]: Shinji Kim and Robert Blafford: “[Stream Windowing Performance Analysis: Concord and Spark Streaming](https://web.archive.org/web/20180125074821/http://concord.io/posts/windowing_performance_analysis_w_spark_streaming),” *concord.io*, July 6, 2016.
[^7]: Jay Kreps: “[The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction](http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying),” *engineering.linkedin.com*, December 16, 2013.
[^8]: Pat Helland: “[Life Beyond Distributed Transactions: An Apostate’s Opinion](https://web.archive.org/web/20200730171311/http://www-db.cs.wisc.edu/cidr/cidr2007/papers/cidr07p15.pdf),” at *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
[^9]: “[Great Western Railway (1835–1948)](https://web.archive.org/web/20160122155425/https://www.networkrail.co.uk/VirtualArchive/great-western/),” Network Rail Virtual Archive, *networkrail.co.uk*.
[^10]: Jacqueline Xu: “[Online Migrations at Scale](https://stripe.com/blog/online-migrations),” *stripe.com*, February 2, 2017.
[^11]: Molly Bartlett Dishman and Martin Fowler: “[Agile Architecture](https://web.archive.org/web/20161130034721/http://conferences.oreilly.com/software-architecture/sa2015/public/schedule/detail/40388),” at *O'Reilly Software Architecture Conference*, March 2015.
[^12]: Nathan Marz and James Warren: [*Big Data: Principles and Best Practices of Scalable Real-Time Data Systems*](https://www.manning.com/books/big-data). Manning, 2015. ISBN: 978-1-617-29034-3
[^13]: Oscar Boykin, Sam Ritchie, Ian O'Connell, and Jimmy Lin: “[Summingbird: A Framework for Integrating Batch and Online MapReduce Computations](http://www.vldb.org/pvldb/vol7/p1441-boykin.pdf),” at *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
[^14]: Jay Kreps: “[Questioning the Lambda Architecture](https://www.oreilly.com/ideas/questioning-the-lambda-architecture),” *oreilly.com*, July 2, 2014.
[^15]: Raul Castro Fernandez, Peter Pietzuch, Jay Kreps, et al.: “[Liquid: Unifying Nearline and Offline Big Data Integration](http://cidrdb.org/cidr2015/Papers/CIDR15_Paper25u.pdf),” at *7th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2015.
[^16]: Dennis M. Ritchie and Ken Thompson: “[The UNIX Time-Sharing System](http://web.eecs.utk.edu/~qcao1/cs560/papers/paper-unix.pdf),” *Communications of the ACM*, volume 17, number 7, pages 365–375, July 1974. [doi:10.1145/361011.361061](http://dx.doi.org/10.1145/361011.361061)
[^17]: Eric A. Brewer and Joseph M. Hellerstein: “[CS262a: Advanced Topics in Computer Systems](http://people.eecs.berkeley.edu/~brewer/cs262/systemr.html),” lecture notes, University of California, Berkeley, *cs.berkeley.edu*, August 2011.
[^18]: Michael Stonebraker: “[The Case for Polystores](http://wp.sigmod.org/?p=1629),” *wp.sigmod.org*, July 13, 2015.
[^19]: Jennie Duggan, Aaron J. Elmore, Michael Stonebraker, et al.: “[The BigDAWG Polystore System](https://dspace.mit.edu/handle/1721.1/100936),” *ACM SIGMOD Record*, volume 44, number 2, pages 11–16, June 2015. [doi:10.1145/2814710.2814713](http://dx.doi.org/10.1145/2814710.2814713)
[^20]: Patrycja Dybka: “[Foreign Data Wrappers for PostgreSQL](https://web.archive.org/web/20221003115732/https://www.vertabelo.com/blog/foreign-data-wrappers-for-postgresql/),” *vertabelo.com*, March 24, 2015.
[^21]: David B. Lomet, Alan Fekete, Gerhard Weikum, and Mike Zwilling: “[Unbundling Transaction Services in the Cloud](https://www.microsoft.com/en-us/research/publication/unbundling-transaction-services-in-the-cloud/),” at *4th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2009.
[^22]: Martin Kleppmann and Jay Kreps: “[Kafka, Samza and the Unix Philosophy of Distributed Data](http://martin.kleppmann.com/papers/kafka-debull15.pdf),” *IEEE Data Engineering Bulletin*, volume 38, number 4, pages 4–14, December 2015.
[^23]: John Hugg: “[Winning Now and in the Future: Where VoltDB Shines](https://voltdb.com/blog/winning-now-and-future-where-voltdb-shines),” *voltdb.com*, March 23, 2016.
[^24]: Frank McSherry, Derek G. Murray, Rebecca Isaacs, and Michael Isard: “[Differential Dataflow](http://cidrdb.org/cidr2013/Papers/CIDR13_Paper111.pdf),” at *6th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2013.
[^25]: Derek G Murray, Frank McSherry, Rebecca Isaacs, et al.: “[Naiad: A Timely Dataflow System](http://sigops.org/s/conferences/sosp/2013/papers/p439-murray.pdf),” at *24th ACM Symposium on Operating Systems Principles* (SOSP), pages 439–455, November 2013. [doi:10.1145/2517349.2522738](http://dx.doi.org/10.1145/2517349.2522738)
[^26]: Gwen Shapira: “[We have a bunch of customers who are implementing ‘database inside-out’ concept and they all ask ‘is anyone else doing it? are we crazy?’](https://twitter.com/gwenshap/status/758800071110430720)” *twitter.com*, July 28, 2016.
[^27]: Martin Kleppmann: “[Turning the Database Inside-out with Apache Samza,](http://martin.kleppmann.com/2015/03/04/turning-the-database-inside-out.html)” at *Strange Loop*, September 2014.
[^28]: Peter Van Roy and Seif Haridi: [*Concepts, Techniques, and Models of Computer Programming*](https://www.info.ucl.ac.be/~pvr/book.html). MIT Press, 2004. ISBN: 978-0-262-22069-9
[^29]: “[Juttle Documentation](http://juttle.github.io/juttle/),” *juttle.github.io*, 2016.
[^30]: Evan Czaplicki and Stephen Chong: “[Asynchronous Functional Reactive Programming for GUIs](http://people.seas.harvard.edu/~chong/pubs/pldi13-elm.pdf),” at *34th ACM SIGPLAN Conference on Programming Language Design and Implementation* (PLDI), June 2013. [doi:10.1145/2491956.2462161](http://dx.doi.org/10.1145/2491956.2462161)
[^31]: Engineer Bainomugisha, Andoni Lombide Carreton, Tom van Cutsem, Stijn Mostinckx, and Wolfgang de Meuter: “[A Survey on Reactive Programming](http://soft.vub.ac.be/Publications/2012/vub-soft-tr-12-13.pdf),” *ACM Computing Surveys*, volume 45, number 4, pages 1–34, August 2013. [doi:10.1145/2501654.2501666](http://dx.doi.org/10.1145/2501654.2501666)
[^32]: Peter Alvaro, Neil Conway, Joseph M. Hellerstein, and William R. Marczak: “[Consistency Analysis in Bloom: A CALM and Collected Approach](https://dsf.berkeley.edu/cs286/papers/calm-cidr2011.pdf),” at *5th Biennial Conference on Innovative Data Systems Research* (CIDR), January 2011.
[^33]: Felienne Hermans: “[Spreadsheets Are Code](https://vimeo.com/145492419),” at *Code Mesh*, November 2015.
[^34]: Dan Bricklin and Bob Frankston: “[VisiCalc: Information from Its Creators](http://danbricklin.com/visicalc.htm),” *danbricklin.com*.
[^35]: D. Sculley, Gary Holt, Daniel Golovin, et al.: “[Machine Learning: The High-Interest Credit Card of Technical Debt](http://research.google.com/pubs/pub43146.html),” at *NIPS Workshop on Software Engineering for Machine Learning* (SE4ML), December 2014.
[^36]: Peter Bailis, Alan Fekete, Michael J Franklin, et al.: “[Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](http://dx.doi.org/10.1145/2723372.2737784)
[^37]: Guy Steele: “[Re: Need for Macros (Was Re: Icon)](https://people.csail.mit.edu/gregs/ll1-discuss-archive-html/msg01134.html),” email to *ll1-discuss* mailing list, *people.csail.mit.edu*, December 24, 2001.
[^38]: David Gelernter: “[Generative Communication in Linda](http://cseweb.ucsd.edu/groups/csag/html/teaching/cse291s03/Readings/p80-gelernter.pdf),” *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 7, number 1, pages 80–112, January 1985. [doi:10.1145/2363.2433](http://dx.doi.org/10.1145/2363.2433)
[^39]: Patrick Th. Eugster, Pascal A. Felber, Rachid Guerraoui, and Anne-Marie Kermarrec: “[The Many Faces of Publish/Subscribe](http://www.cs.ru.nl/~pieter/oss/manyfaces.pdf),” *ACM Computing Surveys*, volume 35, number 2, pages 114–131, June 2003. [doi:10.1145/857076.857078](http://dx.doi.org/10.1145/857076.857078)
[^40]: Ben Stopford: “[Microservices in a Streaming World](https://www.infoq.com/presentations/microservices-streaming),” at *QCon London*, March 2016.
[^41]: Christian Posta: “[Why Microservices Should Be Event Driven: Autonomy vs Authority](http://blog.christianposta.com/microservices/why-microservices-should-be-event-driven-autonomy-vs-authority/),” *blog.christianposta.com*, May 27, 2016.
[^42]: Alex Feyerke: “[Say Hello to Offline First](https://web.archive.org/web/20210420014747/http://hood.ie/blog/say-hello-to-offline-first.html),” *hood.ie*, November 5, 2013.
[^43]: Sebastian Burckhardt, Daan Leijen, Jonathan Protzenko, and Manuel Fähndrich: “[Global Sequence Protocol: A Robust Abstraction for Replicated Shared State](http://drops.dagstuhl.de/opus/volltexte/2015/5238/),” at *29th European Conference on Object-Oriented Programming* (ECOOP), July 2015. [doi:10.4230/LIPIcs.ECOOP.2015.568](http://dx.doi.org/10.4230/LIPIcs.ECOOP.2015.568)
[^44]: Mark Soper: “[Clearing Up React Data Management Confusion with Flux, Redux, and Relay](https://medium.com/@marksoper/clearing-up-react-data-management-confusion-with-flux-redux-and-relay-aad504e63cae),” *medium.com*, December 3, 2015.
[^45]: Eno Thereska, Damian Guy, Michael Noll, and Neha Narkhede: “[Unifying Stream Processing and Interactive Queries in Apache Kafka](http://www.confluent.io/blog/unifying-stream-processing-and-interactive-queries-in-apache-kafka/),” *confluent.io*, October 26, 2016.
[^46]: Frank McSherry: “[Dataflow as Database](https://github.com/frankmcsherry/blog/blob/master/posts/2016-07-17.md),” *github.com*, July 17, 2016.
[^47]: Peter Alvaro: “[I See What You Mean](https://www.youtube.com/watch?v=R2Aa4PivG0g),” at *Strange Loop*, September 2015.
[^48]: Nathan Marz: “[Trident: A High-Level Abstraction for Realtime Computation](https://blog.twitter.com/2012/trident-a-high-level-abstraction-for-realtime-computation),” *blog.twitter.com*, August 2, 2012.
[^49]: Edi Bice: “[Low Latency Web Scale Fraud Prevention with Apache Samza, Kafka and Friends](http://www.slideshare.net/edibice/extremely-low-latency-web-scale-fraud-prevention-with-apache-samza-kafka-and-friends),” at *Merchant Risk Council MRC Vegas Conference*, March 2016.
[^50]: Charity Majors: “[The Accidental DBA](https://charity.wtf/2016/10/02/the-accidental-dba/),” *charity.wtf*, October 2, 2016.
[^51]: Arthur J. Bernstein, Philip M. Lewis, and Shiyong Lu: “[Semantic Conditions for Correctness at Different Isolation Levels](http://db.cs.berkeley.edu/cs286/papers/isolation-icde2000.pdf),” at *16th International Conference on Data Engineering* (ICDE), February 2000. [doi:10.1109/ICDE.2000.839387](http://dx.doi.org/10.1109/ICDE.2000.839387)
[^52]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan: “[Automating the Detection of Snapshot Isolation Anomalies](http://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf),” at *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^53]: Kyle Kingsbury: [Jepsen blog post series](https://aphyr.com/tags/jepsen), *aphyr.com*, 2013–2016.
[^54]: Michael Jouravlev: “[Redirect After Post](http://www.theserverside.com/news/1365146/Redirect-After-Post),” *theserverside.com*, August 1, 2004.
[^55]: Jerome H. Saltzer, David P. Reed, and David D. Clark: “[End-to-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf),” *ACM Transactions on Computer Systems*, volume 2, number 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](http://dx.doi.org/10.1145/357401.357402)
[^56]: Peter Bailis, Alan Fekete, Michael J. Franklin, et al.: “[Coordination-Avoiding Database Systems](http://arxiv.org/pdf/1402.2237.pdf),” *Proceedings of the VLDB Endowment*, volume 8, number 3, pages 185–196, November 2014.
[^57]: Alex Yarmula: “[Strong Consistency in Manhattan](https://blog.twitter.com/2016/strong-consistency-in-manhattan),” *blog.twitter.com*, March 17, 2016.
[^58]: Douglas B Terry, Marvin M Theimer, Karin Petersen, et al.: “[Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](http://css.csail.mit.edu/6.824/2014/papers/bayou-conflicts.pdf),” at *15th ACM Symposium on Operating Systems Principles* (SOSP), pages 172–182, December 1995. [doi:10.1145/224056.224070](http://dx.doi.org/10.1145/224056.224070)
[^59]: Jim Gray: “[The Transaction Concept: Virtues and Limitations](http://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf),” at *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
[^60]: Hector Garcia-Molina and Kenneth Salem: “[Sagas](http://www.cs.cornell.edu/andru/cs711/2002fa/reading/sagas.pdf),” at *ACM International Conference on Management of Data* (SIGMOD), May 1987. [doi:10.1145/38713.38742](http://dx.doi.org/10.1145/38713.38742)
[^61]: Pat Helland: “[Memories, Guesses, and Apologies](https://web.archive.org/web/20160304020907/http://blogs.msdn.com/b/pathelland/archive/2007/05/15/memories-guesses-and-apologies.aspx),” *blogs.msdn.com*, May 15, 2007.
[^62]: Yoongu Kim, Ross Daly, Jeremie Kim, et al.: “[Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf),” at *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.1145/2678373.2665726](http://dx.doi.org/10.1145/2678373.2665726)
[^63]: Mark Seaborn and Thomas Dullien: “[Exploiting the DRAM Rowhammer Bug to Gain Kernel Privileges](https://googleprojectzero.blogspot.co.uk/2015/03/exploiting-dram-rowhammer-bug-to-gain.html),” *googleprojectzero.blogspot.co.uk*, March 9, 2015.
[^64]: Jim N. Gray and Catharine van Ingen: “[Empirical Measurements of Disk Failure Rates and Error Rates](https://www.microsoft.com/en-us/research/publication/empirical-measurements-of-disk-failure-rates-and-error-rates/),” Microsoft Research, MSR-TR-2005-166, December 2005.
[^65]: Annamalai Gurusami and Daniel Price: “[Bug #73170: Duplicates in Unique Secondary Index Because of Fix of Bug#68021](http://bugs.mysql.com/bug.php?id=73170),” *bugs.mysql.com*, July 2014.
[^66]: Gary Fredericks: “[Postgres Serializability Bug](https://github.com/gfredericks/pg-serializability-bug),” *github.com*, September 2015.
[^67]: Xiao Chen: “[HDFS DataNode Scanners and Disk Checker Explained](http://blog.cloudera.com/blog/2016/12/hdfs-datanode-scanners-and-disk-checker-explained/),” *blog.cloudera.com*, December 20, 2016.
[^68]: Jay Kreps: “[Getting Real About Distributed System Reliability](http://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability),” *blog.empathybox.com*, March 19, 2012.
[^69]: Martin Fowler: “[The LMAX Architecture](http://martinfowler.com/articles/lmax.html),” *martinfowler.com*, July 12, 2011.
[^70]: Sam Stokes: “[Move Fast with Confidence](http://blog.samstokes.co.uk/blog/2016/07/11/move-fast-with-confidence/),” *blog.samstokes.co.uk*, July 11, 2016.
[^71]: “[Hyperledger Sawtooth documentation](https://web.archive.org/web/20220120211548/https://sawtooth.hyperledger.org/docs/core/releases/latest/introduction.html),” Intel Corporation, *sawtooth.hyperledger.org*, 2017.
[^72]: Richard Gendal Brown: “[Introducing R3 Corda™: A Distributed Ledger Designed for Financial Services](https://gendal.me/2016/04/05/introducing-r3-corda-a-distributed-ledger-designed-for-financial-services/),” *gendal.me*, April 5, 2016.
[^73]: Trent McConaghy, Rodolphe Marques, Andreas Müller, et al.: “[BigchainDB: A Scalable Blockchain Database](https://www.bigchaindb.com/whitepaper/bigchaindb-whitepaper.pdf),” *bigchaindb.com*, June 8, 2016.
[^74]: Ralph C. Merkle: “[A Digital Signature Based on a Conventional Encryption Function](https://people.eecs.berkeley.edu/~raluca/cs261-f15/readings/merkle.pdf),” at *CRYPTO '87*, August 1987. [doi:10.1007/3-540-48184-2_32](http://dx.doi.org/10.1007/3-540-48184-2_32)
[^75]: Ben Laurie: “[Certificate Transparency](http://queue.acm.org/detail.cfm?id=2668154),” *ACM Queue*, volume 12, number 8, pages 10-19, August 2014. [doi:10.1145/2668152.2668154](http://dx.doi.org/10.1145/2668152.2668154)
[^76]: Mark D. Ryan: “[Enhanced Certificate Transparency and End-to-End Encrypted Mail](https://www.ndss-symposium.org/wp-content/uploads/2017/09/12_2_1.pdf),” at *Network and Distributed System Security Symposium* (NDSS), February 2014. [doi:10.14722/ndss.2014.23379](http://dx.doi.org/10.14722/ndss.2014.23379)


================================================
FILE: content/zh/ch14.md
================================================
---
title: "14. 将事情做正确"
weight: 314
breadcrumbs: false
---

<a id="ch_right_thing"></a>

![](/map/ch12.png)

> *将世界的美好、丑陋与残酷一起喂给 AI，却期待它只反映美好的一面，这是一种幻想。*
>
> Vinay Uday Prabhu 与 Abeba Birhane，《Large Datasets: A Pyrrhic Win for Computer Vision?》（2020）

在本书最后一章，让我们退一步看问题。整本书里，我们考察了各种数据系统架构，评估了它们的利弊，也探讨了如何构建可靠、可伸缩、可维护的应用。然而，我们一直略去了讨论中一个重要而基础的部分，现在该补上了。

每个系统都是为了某种目的而建；我们做的每个动作，都有预期后果，也有非预期后果。目的可能只是赚钱，但对世界产生的影响可能远远超出这个初始目的。构建这些系统的工程师，有责任认真思考这些后果，并且有意识地决定我们希望生活在怎样的世界中。

我们常把数据当成抽象事物来谈论，但请记住，许多数据集都是关于人的：他们的行为、兴趣、身份。我们必须以人性与尊重来对待这样的数据。用户也是人，而人的尊严至高无上 [^1]。

软件开发越来越涉及重要的伦理抉择。确实有一些指南帮助软件工程师应对这些问题，比如 ACM《伦理与职业行为准则》 [^2]，但在实践中，它们很少被讨论、应用与执行。因此，工程师和产品经理有时会对隐私以及产品可能带来的负面后果抱持一种轻率态度 [^3], [^4]。

技术本身并无善恶，关键在于它如何被使用，以及它如何影响人。这一点对搜索引擎这样的软件系统成立，对枪支这样的武器同样成立。软件工程师若只专注技术本身而忽视其后果，是不够的：伦理责任同样由我们承担。伦理推理很难，但它又重要到不能回避。

不过，什么算“好”或“坏”并没有清晰定义，而计算领域的大多数人甚至不讨论这个问题 [^5]。与计算领域中的很多概念不同，伦理的核心概念并没有严格且确定的单一含义，它们需要解释，而解释可能具有主观性 [^6]。伦理并不是走一遍检查清单、确认你“合规”就完事；它是一种参与式、迭代式的反思过程，要与相关人群对话，并对结果负责 [^7]。

## 预测分析 {#id369}

例如，预测分析是人们对大数据和 AI 感到兴奋的重要原因之一。用数据分析来预测天气或疾病传播是一回事 [^8]；预测一个罪犯是否可能再犯、一个贷款申请者是否可能违约，或一个保险客户是否可能提出高额理赔，又是另一回事 [^9]。后者会直接影响个人的生活。

支付网络当然想防止欺诈交易，银行想避免坏账，航空公司想避免劫机，公司想避免雇到低效或不可信的人。从它们的角度看，错过一笔业务机会的成本较低，而坏账或问题员工的成本更高，因此机构倾向于谨慎行事完全可以理解。拿不准时，说“不”更稳妥。

然而，随着算法决策越来越普遍，一个被某个算法标记为“高风险”的人（不管标记准确与否），可能会不断遭遇这种“不”。如果一个人系统性地被排除在工作、航空出行、保险保障、房屋租赁、金融服务以及社会其他关键领域之外，这对个体自由构成的约束之大，以至于有人称之为“算法监狱” [^10]。在尊重人权的国家，刑事司法讲究“未经证明有罪即推定无罪”；但自动化系统却可能在没有罪证、几乎无申诉机会的情况下，系统性且任意地把一个人排除在社会参与之外。

### 偏见与歧视 {#id370}

算法作出的决策并不必然比人更好，也不必然更差。每个人都可能有偏见，即使他们主动尝试纠偏也是如此；歧视性做法也可能被文化性地制度化。人们期待基于数据、而非基于人的主观直觉评估来作决定，可能更公平，也能让传统系统中常被忽视的人获得更好机会 [^11]。

当我们开发预测分析和 AI 系统时，我们并不只是把人的决策“自动化”——即用软件写明何时说“是”或“否”的规则；我们甚至把规则本身也交给数据去推断。然而，这些系统学到的模式往往是不透明的：即使数据中存在某种相关性，我们也未必知道为什么。如果算法输入中存在系统性偏差，系统很可能会在输出中学习并放大这种偏差 [^12]。

在许多国家，反歧视法禁止依据族裔、年龄、性别、性取向、残障或信仰等受保护特征而区别对待他人。一个人的其他数据特征也许可以分析，但如果这些特征与受保护特征相关怎么办？例如，在按种族隔离的社区里，一个人的邮编，甚至其 IP 地址，都可能是种族的强预测因子。这样一看，认为算法能把带偏见的数据作为输入，却产出公平中立的结果，几乎是荒谬的 [^13], [^14]。然而，数据驱动决策的支持者常隐含这种信念，这种态度甚至被讽刺为“机器学习就像给偏见洗钱” [^15]。

预测分析系统只是在外推过去；如果过去有歧视，它们就会把歧视编码并放大 [^16]。如果我们希望未来比过去更好，就需要道德想象力，而这只能由人提供 [^17]。数据和模型应当是我们的工具，而不是我们的主宰。

### 责任与问责 {#id371}

自动化决策把责任与问责问题摆到了台前 [^17]。如果人犯了错，可以追责，受影响者也可以申诉。算法同样会出错，但如果算法出了问题，谁来负责 [^18]？自动驾驶汽车造成事故，谁应承担责任？自动化信用评分算法如果系统性歧视某一族裔或宗教的人，受害者是否有救济途径？如果你的机器学习系统的决策受到司法审查，你能向法官解释算法是如何作出该决策的吗？人不应通过“怪算法”来逃避自己的责任。

信用评级机构是一个较早的先例：通过收集数据来对人作决策。糟糕的信用评分会让生活变难，但至少信用分通常基于与借贷历史直接相关的事实记录，记录中的错误也可以更正（尽管机构往往不会让这件事变得容易）。相比之下，基于机器学习的评分算法通常使用更广泛的输入且更不透明，使人更难理解某个具体决策是如何得出的，也更难判断某人是否受到了不公平或歧视性对待 [^19]。

信用分回答的是“你过去行为如何？”；而预测分析通常基于“谁和你相似，以及像你这样的人过去行为如何？”。把某人和“相似人群”类比，本质上就是在给人贴群体标签，比如按居住地（这往往是种族和社会经济阶层的近似代理）来推断。那被分错桶的人怎么办？此外，如果决策因错误数据而出错，几乎不可能得到救济 [^17]。

许多数据本质上是统计性的，这意味着即便总体概率分布正确，具体个案也可能是错的。比如，某国平均预期寿命是 80 岁，并不意味着你会在 80 岁生日那天去世。仅凭平均值和概率分布，我们很难判断某个具体个体会活到多少岁。同样，预测系统的输出是概率性的，在具体个案上完全可能出错。

盲目相信数据在决策中的至高地位，不仅是错觉，更是危险。随着数据驱动决策越来越普遍，我们必须找到办法让算法可问责、可透明，避免强化既有偏见，并在它们不可避免地犯错时加以纠正。

我们还需要想办法防止数据被用来伤害人，并实现其积极潜力。比如，分析可以揭示一个人财务和社会生活上的特征。一方面，这种能力可以用于把援助精准地送到最需要的人手中。另一方面，它有时被掠夺性企业用来识别脆弱人群，并向其兜售高成本贷款、含金量极低的学历项目等高风险产品 [^17], [^20]。

### 反馈回路 {#id372}

即便在对人影响没那么立竿见影的预测应用中，比如推荐系统，我们也必须直面棘手问题。当服务越来越擅长预测用户想看什么内容时，它可能最终只向人们展示他们本就认同的观点，形成回音室，让刻板印象、错误信息和社会极化不断滋生。我们已经看到社交媒体回音室对选举活动的影响。

当预测分析影响人的生活时，自我强化的反馈回路会带来尤其恶性的后果。比如，设想雇主用信用分来评估候选人。你原本是一个工作能力不错、信用也不错的人，但因某个无法控制的不幸事件突然陷入财务困境。账单逾期后，你的信用分下降，找到工作的可能性也随之下降。失业把你推向贫困，反过来让你的评分更差，进一步降低就业机会 [^17]。这就是一种下行螺旋：有毒假设披着数学严谨与数据客观的伪装。

反馈回路还有另一个例子：经济学家发现，德国加油站引入算法定价后，竞争反而减弱，消费者价格上升，因为算法学会了“合谋” [^21]。

我们并不总能预测这些反馈回路何时出现。不过，很多后果可以通过思考“整个系统”来预见（不仅是计算机化部分，还包括与系统交互的人）——这种方法称为 **系统思维** [^22]。我们可以尝试理解数据分析系统对不同行为、结构与特征的响应。系统是在强化和放大人与人之间既有差异（例如让富者更富、穷者更穷），还是在努力对抗不公？而且，即便出发点再好，我们也必须警惕非预期后果。

## 隐私与追踪 {#id373}

除了预测分析的问题——也就是用数据自动化地对人作决策——数据收集本身也有伦理问题。收集数据的组织，与数据被收集的人之间，到底是什么关系？

当系统只存储用户明确输入的数据，因为用户希望系统以某种方式存储和处理它时，系统是在为用户提供服务：用户是客户。但当用户活动是在做其他事情时被“顺带”追踪并记录下来，这种关系就不那么清晰了。服务不再只是执行用户指令，而开始拥有自己的利益，而这种利益可能与用户利益冲突。

行为数据追踪已成为许多在线服务面向用户功能的重要组成部分：追踪搜索结果点击有助于改进搜索排序；推荐“喜欢 X 的人也喜欢 Y”可帮助用户发现有趣且有用的内容；A/B 测试与用户流程分析可帮助改进用户界面。这些功能都需要一定程度的用户行为追踪，用户也能从中受益。

然而，取决于公司的商业模式，追踪往往不会止步于此。如果服务靠广告资助，那么广告主才是真正客户，用户利益就会退居次位。追踪数据会变得更细、分析会更深入、数据会被长期保留，以便为营销目的构建每个人的精细画像。

这时，公司与被收集数据的用户之间的关系，就开始显著改变了。用户得到“免费”服务，并被引导尽可能多地参与。对用户的追踪，主要服务的并不是这个个体，而是资助服务的广告主需求。这样的关系，用一个语义更阴暗的词来描述更贴切：**监视**。

### 监视 {#id374}

做个思想实验：把 *data* 一词替换为 *surveillance*（监视），看看常见说法是否还那么“好听” [^23]。例如：“在我们这个监视驱动的组织中，我们收集实时监视流并存入监视仓库。我们的监视科学家使用先进的分析与监视处理来产出新洞见。”

这个思想实验对本书来说少见地带有一点论战色彩，仿佛书名成了《设计监视密集型应用》（*Designing Surveillance-Intensive Applications*）。但为了强调这一点，我们需要更尖锐的词。在我们试图让软件“吞噬世界” [^24] 的过程中，我们构建了人类有史以来规模最大的群体监视基础设施。我们正快速接近这样一个世界：几乎每个有人居住的空间都至少有一个联网麦克风，存在于智能手机、智能电视、语音助手设备、婴儿监视器，甚至使用云语音识别的儿童玩具中。许多这类设备的安全记录都非常糟糕 [^25]。

与过去相比，新变化在于：数字化让大规模收集人的数据变得很容易。对我们位置与行动轨迹、社交关系与通信、购买与支付、健康信息的监视，几乎已不可避免。一个监视型组织最终掌握的个人信息，甚至可能比当事人自己知道的还多——例如，在当事人意识到之前就识别出其疾病或经济困境。

即便是过去最极权、最压迫的政权，也只能梦想把麦克风装进每个房间，并迫使每个人随身携带可追踪其位置与行动的设备。可是，由于数字技术带来的好处太大，我们如今却自愿接受这个全面监视的世界。区别只在于：数据由企业收集以向我们提供服务，而不是由政府机构为控制目的而收集 [^26]。

并非所有数据收集都一定构成监视，但把它放在“监视”的框架下审视，有助于我们理解自己与数据收集者的关系。为什么我们似乎乐于接受企业监视？也许你觉得自己“没什么可隐瞒”——换句话说，你与既有权力结构完全一致，不是边缘少数群体，也无需担心被迫害 [^27]。但不是每个人都这么幸运。又或者，你觉得目的似乎是善意的——不是公开的强制和驯化，而只是更好的推荐与更个性化的营销。然而，结合上一节对预测分析的讨论，这种区分就没那么清楚了。

我们已经看到，汽车在未经驾驶员同意的情况下追踪其驾驶行为，并影响保险费率 [^28]；也看到了与佩戴健身追踪设备绑定的健康保险保障。当监视被用于决定对生活关键方面有重大影响的事项（如保险保障或就业）时，它看起来就不再“无害”。而且，数据分析还能揭示极具侵入性的内容：例如，智能手表或健身手环里的运动传感器可以以相当高的准确率推断你在输入什么（包括密码） [^29]。传感器精度和分析算法只会越来越强。

### 同意与选择自由 {#id375}

我们或许会主张，用户是自愿选择使用会追踪其活动的服务，并且他们同意了服务条款和隐私政策，因此他们已同意数据收集。我们甚至可能声称，用户正以其提供的数据换取有价值的服务，而追踪是提供该服务所必需的。毫无疑问，社交网络、搜索引擎和各种其他免费在线服务确实对用户有价值——但这个论证有问题。

首先，我们应当问：追踪在哪种意义上是“必要的”？有些追踪形式确实直接用于改进用户功能：例如，追踪搜索结果点击率可提升搜索排序与相关性；追踪客户常一起购买哪些商品，可帮助网店推荐关联商品。然而，当追踪用户交互是为了内容推荐，或为了广告构建用户画像时，这是否真正符合用户利益就不那么清楚了——还是说，它“必要”仅仅因为广告在为服务买单？

其次，用户对自己向我们的数据库“喂入”了哪些数据、这些数据如何被保留与处理，几乎没有认知——而多数隐私政策更多是在遮蔽而非阐明。用户若不了解其数据会发生什么，就无法给出有意义的同意。并且，某个用户的数据往往也会透露并非该服务用户、也未同意任何条款的其他人的信息。我们在本书这部分讨论过的那些派生数据集——其中可能把全体用户数据与行为追踪及外部数据源结合——正是用户不可能形成有意义理解的数据类型。

此外，数据从用户身上被抽取是单向过程，不是具有真实互惠的关系，也不是公平的价值交换。这里没有对话，没有让用户协商“提供多少数据、换取什么服务”的空间：服务与用户之间的关系高度不对称、单向度。规则由服务制定，而非用户 [^30], [^31]。

在欧盟，《通用数据保护条例》（GDPR）要求同意必须是 “freely given, specific, informed, and unambiguous”，并且用户必须能够 “refuse or withdraw consent without detriment”——否则不被视为 “freely given”。任何征求同意的请求都必须以 “an intelligible and easily accessible form, using clear and plain language” 撰写。此外，“silence, pre-ticked boxes or inactivity \[do not\] constitute consent” [^32]。除同意外，个人数据处理还可基于其他合法基础，例如 *legitimate interest*，它允许某些数据用途，如防欺诈 [^33]。

你可能会说，不同意被监视的用户可以选择不用这项服务。但这种选择同样不自由：如果某项服务流行到“被大多数人视为基本社会参与所必需” [^30]，那就不能合理期待人们退出——使用它在事实上成了强制（*de facto* mandatory）。例如，在多数西方社群中，携带智能手机、通过社交网络社交、使用 Google 获取信息，已经成为常态。尤其当服务具有网络效应时，选择 *不* 使用它会付出社会成本。

因为追踪政策而拒绝使用某服务，说起来容易做起来难。这些平台本来就是为吸引用户而设计的。许多平台使用游戏机制和赌博常见策略来让用户反复回来 [^34]。即便用户能克服这一点，拒绝参与也往往只是少数特权人群的选项：他们有时间和知识去理解隐私政策，也有能力承担潜在代价——比如错过本可通过该服务获得的社会参与或职业机会。对于处境更不利的人来说，并不存在真正意义上的选择自由：监视变得无可逃避。

### 隐私与数据使用 {#id457}

有时有人声称“隐私已死”，理由是某些用户愿意在社交媒体上发布各种生活内容，有些琐碎，有些极度私密。但这个说法是错误的，它建立在对 *privacy* 一词的误解之上。

拥有隐私并不意味着把一切都保密；它意味着拥有选择自由：哪些内容向谁披露、哪些公开、哪些保密。隐私权是一种决策权：它让每个人在每种情境中，决定自己在“保密”与“透明”光谱上的位置 [^30]。这是个体自由与自主性的重要组成部分。

例如，一个患有罕见疾病的人，可能非常愿意把其私密医疗数据提供给研究者，只要这有助于开发治疗方法。但关键在于，这个人应当有权选择谁可以访问这些数据，以及出于什么目的。如果其病情信息可能损害其医疗保险、就业或其他重要权益，这个人很可能会更谨慎地共享数据。

当数据通过监视基础设施从人们身上被抽取时，隐私权未必是被侵蚀了，而是被转移了：转移给了数据收集者。获取数据的公司本质上是在说“相信我们会正确使用你的数据”，这意味着决定“披露什么、保密什么”的权利，从个人转移到了公司。

这些公司反过来会把监视结果中的很大一部分保密，因为一旦公开，会让人感到毛骨悚然，并伤害其商业模式（该模式依赖于“比其他公司更了解你”）。关于用户的私密信息通常只以间接方式被暴露，例如通过向特定人群（如患有某种疾病的人）定向投放广告的工具表现出来。

即便特定用户无法从某条广告所面向的人群桶中被个人重识别，他们仍失去了对某些私密信息披露的主导权。决定“向谁披露什么”不再基于用户自己的偏好，而是公司在行使这种隐私权，目标是利润最大化。

许多公司追求的目标是“不被 *感知* 为令人不适”，回避“数据收集到底有多侵入”这一问题，转而专注于管理用户感知。而且就连这种感知管理也常常做得不好：例如，某些内容也许在事实层面是正确的，但若会触发痛苦记忆，用户可能并不想被提醒 [^35]。面对任何数据，我们都应预期它可能出错、不可取或在某些情况下不合适，并且需要构建机制来处理这些失效。至于什么算“不可取”或“不合适”，当然属于人的判断；算法除非被我们显式编程去尊重人的需要，否则对这些概念是无感的。作为这些系统的工程师，我们必须保持谦逊，接受并预先规划这些失效。

在线服务里的隐私设置，允许用户控制其数据的哪些方面可被其他用户看到，这是把部分控制权还给用户的起点。然而，不管设置如何，服务本身仍可不受限制地访问这些数据，并可在隐私政策允许范围内任意使用。即使服务承诺不把数据出售给第三方，通常也会赋予自己在内部处理和分析数据的广泛权利，而这种处理常常远远超出用户可见范围。

这种把隐私权从个人大规模转移到企业的现象，在历史上前所未有 [^30]。监视并非从未存在，但过去它昂贵且依赖人工，不具备自动化与可伸缩性。信任关系也一直存在，比如病人与医生、被告与律师之间——但这些关系中的数据使用长期受伦理、法律与监管约束。互联网服务则让“在缺乏有意义同意的情况下聚合海量敏感信息，并在用户不知情时以大规模方式使用”变得容易得多。

### 数据作为资产与权力 {#id376}

由于行为数据是用户与服务交互的副产物，它有时被称为 “data exhaust”（数据尾气），暗示这些数据是无价值的废料。照这个角度看，行为分析与预测分析像一种“回收”，从原本会被丢弃的数据中提炼价值。

更准确的看法可能正相反：从经济学角度看，如果定向广告在为服务买单，那么生成行为数据的用户活动就可被视作一种劳动 [^36]。甚至可以更进一步主张：用户交互的应用本身，只是引诱用户不断向监视基础设施输入更多个人信息的手段 [^30]。在线服务中常见的人类创造力与社会关系，被数据抽取机器以冷酷方式利用。

个人数据是有价值资产，这从数据经纪商行业的存在即可见一斑：这是一个在隐秘中运作、颇为灰暗的行业，购买、聚合、分析、推断并转售关于个人的侵入性数据，多数用于营销 [^20]。初创公司的估值常以用户数、以“眼球”为基础——也就是以其监视能力为基础。

因为数据有价值，很多人都想要它。公司当然想要——这本就是它们收集数据的原因。政府也想拿到：通过秘密交易、胁迫、法律强制，或者直接窃取 [^37]。当公司破产时，其收集的个人数据会作为资产被出售。并且，数据很难彻底保护，泄露事件频发得令人不安。

这些观察促使批评者说，数据不只是资产，还是“有毒资产”（*toxic asset*） [^37]，或者至少是“危险材料”（*hazardous material*） [^38]。也许数据不是“新黄金”、不是“新石油”，而是“新铀” [^39]。即使我们认为自己有能力防止数据滥用，每次收集数据时也必须权衡收益与其落入错误之手的风险：计算机系统可能被犯罪分子或敌对外国情报机构攻破，数据可能被内部人员泄露，公司可能落入与我们价值观不一致的管理层手中，或国家可能被一个毫无顾忌、会强迫我们交出数据的政权接管。

收集数据时，我们不仅要考虑今天的政治环境，还要考虑未来所有可能的政府。无法保证未来每一届政府都会尊重人权与公民自由，因此，“安装那些未来可能助长警察国家的技术，是糟糕的公民卫生习惯” [^40]。

正如古老格言所说，“知识就是力量”。而且，“审视他人而避免自身被审视，是最重要的权力形式之一” [^41]。这正是极权政府追求监视的原因：它赋予其控制人口的力量。今天的科技公司虽未公开追求政治权力，但它们积累的数据与知识依然赋予其对我们生活的巨大影响力，其中很多是隐蔽的，处在公共监督之外 [^42]。

### 回顾工业革命 {#id377}

数据是信息时代的决定性特征。互联网、数据存储与处理、软件驱动自动化，正在深刻影响全球经济和人类社会。我们的日常生活与社会组织已被信息技术改变，并且在未来几十年很可能继续发生剧烈变化，这很容易让人联想到工业革命 [^17], [^26]。

工业革命建立在重大技术与农业进步之上，长期看带来了持续经济增长和生活水平显著改善。但它也伴随严重问题：空气污染（烟尘与化工过程）和水污染（工业与生活废弃物）都触目惊心。工厂主生活奢华，城市工人却常住在恶劣住房里、长时间在严苛条件下劳动。童工普遍存在，包括矿井中危险且低薪的工作。

社会花了很长时间才建立起各种防护措施：环境保护法规、工作场所安全规程、取缔童工、食品卫生检查。毫无疑问，当工厂不再被允许把废弃物排进河里、售卖污染食品、剥削工人时，做生意的成本上升了。但整个社会从这些规制中获益巨大，今天几乎没人愿意回到那之前 [^17]。

正如工业革命有其需要被管理的黑暗面一样，我们向信息时代的过渡也有重大问题，必须正视并解决 [^43], [^44]。数据的收集与使用就是其中之一。借用 Bruce Schneier 的话 [^26]：

> 数据是信息时代的污染问题，而保护隐私是环境挑战。几乎所有计算机都会产生信息。它会长期滞留、不断发酵。我们如何处理它——如何围堵它、如何处置它——对信息经济的健康至关重要。正如今天我们回望工业时代的早期几十年，会疑惑我们的祖先为何在建设工业世界的狂热中忽视了污染问题；我们的后代也将回望信息时代的这些早期几十年，并以我们如何应对数据收集与滥用的挑战来评判我们。
>
> 我们应努力让他们感到骄傲。

### 立法与自律 {#sec_future_legislation}

数据保护法也许能够帮助维护个体权利。例如，欧盟 GDPR 规定，个人数据必须“为特定、明确且合法的目的而收集，不得以与这些目的不相容的方式进一步处理”；并且数据必须“就处理目的而言充分、相关且限于必要范围” [^32]。

然而，这一 **数据最小化** 原则与大数据哲学正面冲突。大数据强调最大化数据收集，把数据与其他数据集合并，持续实验与探索，以产生新洞见。探索意味着为预见之外的目的使用数据，这与“特定且明确目的”正相反。尽管 GDPR 对在线广告行业产生了一些影响 [^45]，监管执行总体仍偏弱 [^46]，也似乎没有在更广泛的科技行业内真正带来文化与实践层面的显著转变。

那些收集大量个人数据的公司把监管视为负担和创新阻碍。这种反对在某种程度上也有其合理性。比如共享医疗数据时，隐私风险确实明确存在，但也有潜在机会：如果数据分析能帮助我们实现更好的诊断或找到更好的治疗方案，能减少多少死亡 [^47]？过度监管可能会阻碍这类突破。如何平衡机会与风险并不容易 [^41]。

归根结底，科技行业需要在个人数据问题上完成一次文化转向。我们应停止把用户当作可优化指标，记住他们是应被尊重、拥有尊严与主体性的人。我们应通过自律来约束数据收集与处理实践，以建立并维系依赖我们软件的人们的信任 [^48]。并且，我们应主动教育终端用户其数据如何被使用，而不是把他们蒙在鼓里。

我们应允许每个个体保有其隐私——也就是对自身数据的控制——而不是通过监视把这种控制偷走。个体对自身数据的控制权，就像国家公园中的自然环境：如果我们不明确保护并照料它，它就会被破坏。这会成为“公地悲剧”，最终所有人都更糟。无处不在的监视并非命中注定——我们仍有机会阻止它。

第一步是不要无限期保留数据，而应在不再需要时尽快清除，并在源头最小化收集 [^48], [^49]。只要你的数据不存在，它就不会被泄露、被盗，或被政府强制交出。总的来说，这需要文化与态度的改变。作为技术从业者，如果我们不考虑自己工作的社会影响，那就是没有尽到本职 [^50]。

## 总结 {#id594}

至此，本书接近尾声。我们已经走过了很长一段路：

- 在 [第 1 章](/ch1#ch_tradeoffs) 中，我们对比了分析型系统与事务型系统，比较了云与自托管，权衡了分布式与单节点系统，并讨论了如何平衡业务需求与用户需求。

- 在 [第 2 章](/ch2#ch_nonfunctional) 中，我们看到了如何定义非功能性需求，例如性能、可靠性、可伸缩性与可维护性。

- 在 [第 3 章](/ch3#ch_datamodels) 中，我们考察了从关系模型、文档模型到图模型的一系列数据模型，也讨论了事件溯源与 DataFrame。我们还看了多种查询语言示例，包括 SQL、Cypher、SPARQL、Datalog 与 GraphQL。

- 在 [第 4 章](/ch4#ch_storage) 中，我们讨论了面向 OLTP 的存储引擎（LSM 树与 B 树）、面向分析的存储（列式存储），以及面向信息检索的索引（全文检索与向量检索）。

- 在 [第 5 章](/ch5#ch_encoding) 中，我们考察了将数据对象编码为字节的不同方式，以及如何在需求变化时支持演化。我们还比较了进程间数据流动的几种方式：经由数据库、服务调用、工作流引擎或事件驱动架构。

- 在 [第 6 章](/ch6#ch_replication) 中，我们研究了单领导者、多领导者与无主（无领导者）复制之间的权衡，也讨论了写后读一致性等一致性模型，以及可让客户端离线工作的同步引擎。

- 在 [第 7 章](/ch7#ch_sharding) 中，我们深入讨论了分片，包括再平衡策略、请求路由与次级索引。

- 在 [第 8 章](/ch8#ch_transactions) 中，我们覆盖了事务：持久性、各种隔离级别（读已提交、快照隔离、可串行化）的实现方式，以及如何在分布式事务中保证原子性。

- 在 [第 9 章](/ch9#ch_distributed) 中，我们梳理了分布式系统中的基础问题（网络失效与延迟、时钟误差、进程暂停、崩溃），并看到这些问题如何让“实现一个看似简单的锁”都变得困难。

- 在 [第 10 章](/ch10#ch_consistency) 中，我们深入分析了各种共识形式，以及它所支持的一致性模型（线性一致性）。

- 在 [第 11 章](/ch11#ch_batch) 中，我们深入批处理，从简单的 Unix 工具链一直讲到基于分布式文件系统或对象存储的大规模分布式批处理系统。

- 在 [第 12 章](/ch12#ch_stream) 中，我们把批处理推广到流处理，讨论了底层消息代理、数据变更捕获、容错机制，以及流连接等处理模式。

- 在 [第 13 章](/ch13#ch_philosophy) 中，我们探讨了流式系统的一种哲学，它使异构数据系统更易于集成、系统更易于演化、应用更易于扩展。

最后，在本章中，我们后退一步，审视了构建数据密集型应用的一些伦理面向。我们看到，数据虽可为善，也可能造成严重伤害：作出深刻影响个人生活却难以申诉的决策，导致歧视与剥削，使监视常态化，并暴露私密信息。我们还面临数据泄露风险，也可能发现某些出于善意的数据使用产生了非预期后果。

随着软件与数据对世界产生如此巨大的影响，我们作为工程师必须记住：我们有责任朝着我们希望生活其中的世界努力——一个以人性与尊重对待人的世界。让我们共同朝这个目标前进。

### 参考文献 {#references}

[^1]: David Schmudde. [What If Data Is a Bad Idea?](https://schmud.de/posts/2024-08-18-data-is-a-bad-idea.html). *schmud.de*, August 2024. Archived at [perma.cc/ZXU5-XMCT](https://perma.cc/ZXU5-XMCT)
[^2]: [ACM Code of Ethics and Professional Conduct](https://www.acm.org/code-of-ethics). Association for Computing Machinery, *acm.org*, 2018. Archived at [perma.cc/SEA8-CMB8](https://perma.cc/SEA8-CMB8)
[^3]: Igor Perisic. [Making Hard Choices: The Quest for Ethics in Machine Learning](https://www.linkedin.com/blog/engineering/archive/making-hard-choices-the-quest-for-ethics-in-machine-learning). *linkedin.com*, November 2016. Archived at [perma.cc/DGF8-KNT7](https://perma.cc/DGF8-KNT7)
[^4]: John Naughton. [Algorithm Writers Need a Code of Conduct](https://www.theguardian.com/commentisfree/2015/dec/06/algorithm-writers-should-have-code-of-conduct). *theguardian.com*, December 2015. Archived at [perma.cc/TBG2-3NG6](https://perma.cc/TBG2-3NG6)
[^5]: Ben Green. ["Good" isn't good enough](https://www.benzevgreen.com/wp-content/uploads/2019/11/19-ai4sg.pdf). At *NeurIPS Joint Workshop on AI for Social Good*, December 2019. Archived at [perma.cc/H4LN-7VY3](https://perma.cc/H4LN-7VY3)
[^6]: Deborah G. Johnson and Mario Verdicchio. [Ethical AI is Not about AI](https://cacm.acm.org/opinion/ethical-ai-is-not-about-ai/). *Communications of the ACM*, volume 66, issue 2, pages 32--34, January 2023. [doi:10.1145/3576932](https://doi.org/10.1145/3576932)
[^7]: Marc Steen. [Ethics as a Participatory and Iterative Process](https://cacm.acm.org/opinion/ethics-as-a-participatory-and-iterative-process/). *Communications of the ACM*, volume 66, issue 5, pages 27--29, April 2023. [doi:10.1145/3550069](https://doi.org/10.1145/3550069)
[^8]: Logan Kugler. [What Happens When Big Data Blunders?](https://cacm.acm.org/news/what-happens-when-big-data-blunders/) *Communications of the ACM*, volume 59, issue 6, pages 15--16, June 2016. [doi:10.1145/2911975](https://doi.org/10.1145/2911975)
[^9]: Miri Zilka. [Algorithms and the criminal justice system: promises and challenges in deployment and research](https://www.cl.cam.ac.uk/research/security/seminars/archive/video/2023-03-07-t196231.html). At *University of Cambridge Security Seminar Series*, March 2023.
[^10]: Bill Davidow. [Welcome to Algorithmic Prison](https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/). *theatlantic.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20171019201812/https://www.theatlantic.com/technology/archive/2014/02/welcome-to-algorithmic-prison/283985/)
[^11]: Don Peck. [They're Watching You at Work](https://www.theatlantic.com/magazine/archive/2013/12/theyre-watching-you-at-work/354681/). *theatlantic.com*, December 2013. Archived at [perma.cc/YR9T-6M38](https://perma.cc/YR9T-6M38)
[^12]: Leigh Alexander. [Is an Algorithm Any Less Racist Than a Human?](https://www.theguardian.com/technology/2016/aug/03/algorithm-racist-human-employers-work) *theguardian.com*, August 2016. Archived at [perma.cc/XP93-DSVX](https://perma.cc/XP93-DSVX)
[^13]: Jesse Emspak. [How a Machine Learns Prejudice](https://www.scientificamerican.com/article/how-a-machine-learns-prejudice/). *scientificamerican.com*, December 2016. Archived at [perma.cc/R3L5-55E6](https://perma.cc/R3L5-55E6)
[^14]: Rohit Chopra, Kristen Clarke, Charlotte A. Burrows, and Lina M. Khan. [Joint Statement on Enforcement Efforts Against Discrimination and Bias in Automated Systems](https://www.ftc.gov/system/files/ftc_gov/pdf/EEOC-CRT-FTC-CFPB-AI-Joint-Statement%28final%29.pdf). *ftc.gov*, April 2023. Archived at [perma.cc/YY4Y-RCCA](https://perma.cc/YY4Y-RCCA)
[^15]: Maciej Cegłowski. [The Moral Economy of Tech](https://idlewords.com/talks/sase_panel.htm). *idlewords.com*, June 2016. Archived at [perma.cc/L8XV-BKTD](https://perma.cc/L8XV-BKTD)
[^16]: Greg Nichols. [Artificial Intelligence in healthcare is racist](https://www.zdnet.com/article/artificial-intelligence-in-healthcare-is-racist/). *zdnet.com*, November 2020. Archived at [perma.cc/3MKW-YKRS](https://perma.cc/3MKW-YKRS)
[^17]: Cathy O'Neil. *Weapons of Math Destruction: How Big Data Increases Inequality and Threatens Democracy*. Crown Publishing, 2016. ISBN: 978-0-553-41881-1
[^18]: Julia Angwin. [Make Algorithms Accountable](https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html). *nytimes.com*, August 2016. Archived at [archive.org](https://web.archive.org/web/20230819055242/https://www.nytimes.com/2016/08/01/opinion/make-algorithms-accountable.html)
[^19]: Bryce Goodman and Seth Flaxman. [European Union Regulations on Algorithmic Decision-Making and a 'Right to Explanation'](https://arxiv.org/abs/1606.08813). At *ICML Workshop on Human Interpretability in Machine Learning*, June 2016. Archived at [arxiv.org/abs/1606.08813](https://arxiv.org/abs/1606.08813)
[^20]: [A Review of the Data Broker Industry: Collection, Use, and Sale of Consumer Data for Marketing Purposes](https://www.commerce.senate.gov/services/files/0d2b3642-6221-4888-a631-08f2f255b577). Staff Report, *United States Senate Committee on Commerce, Science, and Transportation*, *commerce.senate.gov*, December 2013. Archived at [perma.cc/32NV-YWLQ](https://perma.cc/32NV-YWLQ)
[^21]: Stephanie Assad, Robert Clark, Daniel Ershov, and Lei Xu. [Algorithmic Pricing and Competition: Empirical Evidence from the German Retail Gasoline Market](https://economics.yale.edu/sites/default/files/clark_acex_jan_2021.pdf). *Journal of Political Economy*, volume 132, issue 3, pages 723--771, March 2024. [doi:10.1086/726906](https://doi.org/10.1086/726906)
[^22]: Donella H. Meadows and Diana Wright. *Thinking in Systems: A Primer*. Chelsea Green Publishing, 2008. ISBN: 978-1-603-58055-7
[^23]: Daniel J. Bernstein. [Listening to a "big data"/"data science" talk. Mentally translating "data" to "surveillance": "\...everything starts with surveillance\..."](https://x.com/hashbreaker/status/598076230437568512) *x.com*, May 2015. Archived at [perma.cc/EY3D-WBBJ](https://perma.cc/EY3D-WBBJ)
[^24]: Marc Andreessen. [Why Software Is Eating the World](https://a16z.com/why-software-is-eating-the-world/). *a16z.com*, August 2011. Archived at [perma.cc/3DCC-W3G6](https://perma.cc/3DCC-W3G6)
[^25]: J. M. Porup. ['Internet of Things' Security Is Hilariously Broken and Getting Worse](https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/). *arstechnica.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250823001716/https://arstechnica.com/information-technology/2016/01/how-to-search-the-internet-of-things-for-photos-of-sleeping-babies/)
[^26]: Bruce Schneier. [*Data and Goliath: The Hidden Battles to Collect Your Data and Control Your World*](https://www.schneier.com/books/data_and_goliath/). W. W. Norton, 2015. ISBN: 978-0-393-35217-7
[^27]: The Grugq. [Nothing to Hide](https://grugq.tumblr.com/post/142799983558/nothing-to-hide). *grugq.tumblr.com*, April 2016. Archived at [perma.cc/BL95-8W5M](https://perma.cc/BL95-8W5M)
[^28]: Federal Trade Commission. [FTC Takes Action Against General Motors for Sharing Drivers' Precise Location and Driving Behavior Data Without Consent](https://www.ftc.gov/news-events/news/press-releases/2025/01/ftc-takes-action-against-general-motors-sharing-drivers-precise-location-driving-behavior-data). *ftc.gov*, January 2025. Archived at [perma.cc/3XGV-3HRD](https://perma.cc/3XGV-3HRD)
[^29]: Tony Beltramelli. [Deep-Spying: Spying Using Smartwatch and Deep Learning](https://arxiv.org/abs/1512.05616). Masters Thesis, IT University of Copenhagen, December 2015. Archived at [arxiv.org/abs/1512.05616](https://arxiv.org/abs/1512.05616)
[^30]: Shoshana Zuboff. [Big Other: Surveillance Capitalism and the Prospects of an Information Civilization](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2594754). *Journal of Information Technology*, volume 30, issue 1, pages 75--89, April 2015. [doi:10.1057/jit.2015.5](https://doi.org/10.1057/jit.2015.5)
[^31]: Michiel Rhoen. [Beyond Consent: Improving Data Protection Through Consumer Protection Law](https://policyreview.info/articles/analysis/beyond-consent-improving-data-protection-through-consumer-protection-law). *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.404](https://doi.org/10.14763/2016.1.404)
[^32]: [Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016](https://eur-lex.europa.eu/eli/reg/2016/679/oj/eng). *Official Journal of the European Union*, L 119/1, May 2016.
[^33]: UK Information Commissioner's Office. [What is the 'legitimate interests' basis?](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/lawful-basis/legitimate-interests/what-is-the-legitimate-interests-basis/) *ico.org.uk*. Archived at [perma.cc/W8XR-F7ML](https://perma.cc/W8XR-F7ML)
[^34]: Tristan Harris. [How a handful of tech companies control billions of minds every day](https://www.ted.com/talks/tristan_harris_how_a_handful_of_tech_companies_control_billions_of_minds_every_day). At *TED2017*, April 2017.
[^35]: Carina C. Zona. [Consequences of an Insightful Algorithm](https://www.youtube.com/watch?v=YRI40A4tyWU). At *GOTO Berlin*, November 2016.
[^36]: Imanol Arrieta Ibarra, Leonard Goff, Diego Jiménez Hernández, Jaron Lanier, and E. Glen Weyl. [Should We Treat Data as Labor? Moving Beyond 'Free'](https://www.aeaweb.org/conference/2018/preliminary/paper/2Y7N88na). *American Economic Association Papers Proceedings*, volume 1, issue 1, December 2017.
[^37]: Bruce Schneier. [Data Is a Toxic Asset, So Why Not Throw It Out?](https://www.schneier.com/essays/archives/2016/03/data_is_a_toxic_asse.html) *schneier.com*, March 2016. Archived at [perma.cc/4GZH-WR3D](https://perma.cc/4GZH-WR3D)
[^38]: Cory Scott. [Data is not toxic - which implies no benefit - but rather hazardous material, where we must balance need vs. want](https://x.com/cory_scott/status/706586399483437056). *x.com*, March 2016. Archived at [perma.cc/CLV7-JF2E](https://perma.cc/CLV7-JF2E)
[^39]: Mark Pesce. [Data is the new uranium -- incredibly powerful and amazingly dangerous](https://www.theregister.com/2024/11/20/data_is_the_new_uranium/). *theregister.com*, November 2024. Archived at [perma.cc/NV8B-GYGV](https://perma.cc/NV8B-GYGV)
[^40]: Bruce Schneier. [Mission Creep: When Everything Is Terrorism](https://www.schneier.com/essays/archives/2013/07/mission_creep_when_e.html). *schneier.com*, July 2013. Archived at [perma.cc/QB2C-5RCE](https://perma.cc/QB2C-5RCE)
[^41]: Lena Ulbricht and Maximilian von Grafenstein. [Big Data: Big Power Shifts?](https://policyreview.info/articles/analysis/big-data-big-power-shifts) *Internet Policy Review*, volume 5, issue 1, March 2016. [doi:10.14763/2016.1.406](https://doi.org/10.14763/2016.1.406)
[^42]: Ellen P. Goodman and Julia Powles. [Facebook and Google: Most Powerful and Secretive Empires We've Ever Known](https://www.theguardian.com/technology/2016/sep/28/google-facebook-powerful-secretive-empire-transparency). *theguardian.com*, September 2016. Archived at [perma.cc/8UJA-43G6](https://perma.cc/8UJA-43G6)
[^43]: Judy Estrin and Sam Gill. [The World Is Choking on Digital Pollution](https://washingtonmonthly.com/2019/01/13/the-world-is-choking-on-digital-pollution/). *washingtonmonthly.com*, January 2019. Archived at [perma.cc/3VHF-C6UC](https://perma.cc/3VHF-C6UC)
[^44]: A. Michael Froomkin. [Regulating Mass Surveillance as Privacy Pollution: Learning from Environmental Impact Statements](https://repository.law.miami.edu/cgi/viewcontent.cgi?article=1062&context=fac_articles). *University of Illinois Law Review*, volume 2015, issue 5, August 2015. Archived at [perma.cc/24ZL-VK2T](https://perma.cc/24ZL-VK2T)
[^45]: Pengyuan Wang, Li Jiang, and Jian Yang. [The Early Impact of GDPR Compliance on Display Advertising: The Case of an Ad Publisher](https://openreview.net/pdf?id=TUnLHNo19S). *Journal of Marketing Research*, volume 61, issue 1, April 2023. [doi:10.1177/00222437231171848](https://doi.org/10.1177/00222437231171848)
[^46]: Johnny Ryan. [Don't be fooled by Meta's fine for data breaches](https://www.economist.com/by-invitation/2023/05/24/dont-be-fooled-by-metas-fine-for-data-breaches-says-johnny-ryan). *The Economist*, May 2023. Archived at [perma.cc/VCR6-55HR](https://perma.cc/VCR6-55HR)
[^47]: Jessica Leber. [Your Data Footprint Is Affecting Your Life in Ways You Can't Even Imagine](https://www.fastcompany.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine). *fastcompany.com*, March 2016. Archived at [archive.org](https://web.archive.org/web/20161128133016/https://www.fastcoexist.com/3057514/your-data-footprint-is-affecting-your-life-in-ways-you-cant-even-imagine)
[^48]: Maciej Cegłowski. [Haunted by Data](https://idlewords.com/talks/haunted_by_data.htm). *idlewords.com*, October 2015. Archived at [archive.org](https://web.archive.org/web/20161130143932/https://idlewords.com/talks/haunted_by_data.htm)
[^49]: Sam Thielman. [You Are Not What You Read: Librarians Purge User Data to Protect Privacy](https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy). *theguardian.com*, January 2016. Archived at [archive.org](https://web.archive.org/web/20250828224851/https://www.theguardian.com/us-news/2016/jan/13/us-library-records-purged-data-privacy)
[^50]: Jez Humble. [It's a cliché that people get into tech to "change the world". So then, you have to actually consider what the impact of your work is on the world. The idea that you can or should exclude societal and political discussions in tech is idiotic. It means you're not doing your job](https://x.com/jezhumble/status/1386758340894597122). *x.com*, April 2021. Archived at [perma.cc/3NYS-MHLC](https://perma.cc/3NYS-MHLC)


================================================
FILE: content/zh/ch2.md
================================================
---
title: "2. 定义非功能性需求"
weight: 102
breadcrumbs: false
---

<a id="ch_nonfunctional"></a>

![](/map/ch01.png)

> *互联网做得太好了，以至于大多数人把它看成像太平洋那样的自然资源，而不是人造产物。上一次出现这种规模且几乎无差错的技术是什么时候？*
>
> [艾伦・凯](https://www.drdobbs.com/architecture-and-design/interview-with-alan-kay/240003442)，
> 在接受 *Dr Dobb's Journal* 采访时（2012 年）

构建一个应用时，你通常会从一张需求清单开始。清单最上面的，往往是应用必须提供的功能：需要哪些页面和按钮，每个操作应该完成什么行为，才能实现软件的目标。这些就是 ***功能性需求***。

此外，你通常还会有一些 ***非功能性需求***：例如，应用应当足够快、足够可靠、足够安全、符合法规，而且易于维护。这些需求可能并没有明确写下来，因为它们看起来像是“常识”，但它们与功能需求同样重要。一个慢得无法忍受、或频繁出错的应用，几乎等于不存在。

许多非功能性需求（比如安全）超出了本书范围。但本章会讨论其中几项核心要求，并帮助你用更清晰的方式描述自己的系统：

* 如何定义并衡量系统的 **性能**（参见 ["描述性能"](#sec_introduction_percentiles)）；
* 服务 **可靠** 到底意味着什么：也就是在出错时仍能持续正确工作（参见 ["可靠性与容错"](#sec_introduction_reliability)）；
* 如何通过高效增加计算资源，让系统在负载增长时保持 **可伸缩性**（参见 ["可伸缩性"](#sec_introduction_scalability)）；以及
* 如何让系统在长期演进中保持 **可维护性**（参见 ["可维护性"](#sec_introduction_maintainability)）。

本章引入的术语，在后续章节深入实现细节时也会反复用到。不过纯定义往往比较抽象。为了把概念落到实处，本章先从一个案例研究开始：看看社交网络服务可能如何实现，并借此讨论性能与可伸缩性问题。


## 案例研究：社交网络首页时间线 {#sec_introduction_twitter}

假设你要实现一个类似 X（原 Twitter）的社交网络：用户可以发帖，并追随其他用户。这会极大简化真实系统的实现方式 [^1] [^2] [^3]，但足以说明大规模系统会遇到的一些关键问题。

我们假设：用户每天发帖 5 亿条，平均每秒约 5,700 条；在特殊事件期间，峰值可能冲到每秒 150,000 条 [^4]。再假设平均每位用户追随 200 人，并有 200 名追随者（实际分布非常不均匀：大多数人只有少量追随者，少数名人如巴拉克・奥巴马则有上亿追随者）。

### 表示用户、帖子与关注关系 {#id20}

假设我们将所有数据保存在关系数据库中，如 [图 2-1](#fig_twitter_relational) 所示。我们有一个用户表、一个帖子表和一个关注关系表。

{{< figure src="/fig/ddia_0201.png" id="fig_twitter_relational" caption="图 2-1. 社交网络的简单关系模式，用户可以相互关注。" class="w-full my-4" >}}

假设该社交网络最重要的读操作是 *首页时间线*：展示你所追随的人最近发布的帖子（为简化起见，我们忽略广告、未追随用户的推荐帖，以及其他扩展功能）。获取某个用户首页时间线的 SQL 可能如下：

```sql
SELECT posts.*, users.* FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    JOIN users ON posts.sender_id = users.id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

要执行此查询，数据库将使用 `follows` 表找到 `current_user` 关注的所有人，查找这些用户最近的帖子，并按时间戳排序以获取被关注用户的最新 1,000 条帖子。

帖子具有时效性。我们假设：某人发帖后，追随者应在 5 秒内看到。一个做法是客户端每 5 秒重复执行一次上述查询（即 *轮询*）。如果同时在线登录用户有 1000 万，就意味着每秒要执行 200 万次查询。即使把轮询间隔调大，这个量也很可观。

此外，这个查询本身也很昂贵。若你追随 200 人，系统就要分别抓取这 200 人的近期帖子列表，再把它们归并。每秒 200 万次时间线查询，等价于数据库每秒要执行约 4 亿次“按发送者查最近帖子”。这还只是平均情况。少数用户会追随数万账户，这个查询对他们尤其昂贵，也更难做快。

### 时间线的物化与更新 {#sec_introduction_materializing}

要如何优化？第一，与其轮询，不如由服务器主动向在线追随者推送新帖。第二，我们应该预先计算上述查询结果，让首页时间线请求可以直接从缓存返回。

设想我们为每个用户维护一个数据结构，保存其首页时间线，也就是其所追随的人的近期帖子。每当用户发帖，我们就找出其所有追随者，把这条帖子插入每个追随者的首页时间线中，就像往邮箱里投递信件。这样用户登录时，可以直接读取预先算好的时间线。若要接收新帖提醒，客户端只需订阅“写入该时间线”的帖子流即可。

这种方法的缺点是：每次发帖时都要做更多工作，因为首页时间线属于需要持续更新的派生数据。这个过程见 [图 2-2](#fig_twitter_timelines)。当一个初始请求触发多个下游请求时，我们用 *扇出* 描述请求数量被放大的倍数。

{{< figure src="/fig/ddia_0202.png" id="fig_twitter_timelines" caption="图 2-2. 扇出：将新帖子传递给发布帖子的用户的每个追随者。" class="w-full my-4" >}}

按每秒 5,700 条帖子计算，若平均每条帖到达 200 名追随者（扇出因子 200），则每秒需要略高于 100 万次首页时间线写入。这已经很多，但相比原先每秒 4 亿次“按发送者查帖”，仍是显著优化。

如果遇到特殊事件导致发帖速率激增，我们不必立刻完成时间线投递。可以先入队，接受“帖子出现在追随者时间线中”会暂时变慢。即便在这种峰值期，时间线加载仍然很快，因为读取仍来自缓存。

这种预先计算并持续更新查询结果的过程称为 *物化*。时间线缓存就是一种 *物化视图*（这个概念见 [“维护物化视图”](/ch12#sec_stream_mat_view)）。物化视图能加速读取，但代价是写入侧工作量增加。对大多数用户而言，这个写入成本仍可接受，但社交网络还要处理一些极端情况：

* 如果某用户追随了大量账户，且这些账户发帖频繁，那么该用户的物化时间线写入率会很高。但在这种场景下，用户通常也看不完全部帖子，因此可以丢弃部分时间线写入，只展示其追随账户帖子的一部分样本 [^5]。
* 如果一个拥有海量追随者的名人账号发帖，我们需要把这条帖子写入其数百万追随者的首页时间线，工作量极大。此时不能随意丢写。常见做法是把名人帖子与普通帖子分开处理：名人帖单独存储，读取时间线时再与物化时间线合并，从而省去写入数百万条时间线的成本。即便如此，服务名人账号仍需大量基础设施 [^6]。

## 描述性能 {#sec_introduction_percentiles}

软件性能通常围绕两类指标展开：

响应时间
: 从用户发出请求到收到响应所经历的时间。单位是秒（或毫秒、微秒）。

吞吐量
: 系统每秒可处理的请求数或数据量。对于给定硬件资源，系统存在一个可处理的 *最大吞吐量*。单位是“每秒某种工作量”。

在社交网络案例中，“每秒帖子数”和“每秒时间线写入数”属于吞吐量指标；“加载首页时间线所需时间”或“帖子送达追随者所需时间”属于响应时间指标。

吞吐量和响应时间之间通常相关。在线服务的典型关系如 [图 2-3](#fig_throughput)：低吞吐量时响应时间较低，负载升高后响应时间上升。原因是 *排队*。请求到达高负载系统时，CPU 往往已在处理前一个请求，新请求只能等待；当吞吐量逼近硬件上限，排队延迟会急剧上升。

{{< figure src="/fig/ddia_0203.png" id="fig_throughput" caption="图 2-3. 随着服务的吞吐量接近其容量，由于排队，响应时间急剧增加。" class="w-full my-4" >}}

--------

<a id="sidebar_metastable"></a>

> [!TIP] 当过载系统无法恢复时

如果系统已接近过载、吞吐量逼近极限，有时会进入恶性循环：效率下降，进而更加过载。例如，请求队列很长时，响应时间可能高到让客户端超时并重发请求，导致请求速率进一步上升，问题持续恶化，形成 *重试风暴*。即使负载后来回落，系统也可能仍卡在过载状态，直到重启或重置。这种现象叫 *亚稳态故障*（Metastable Failure），可能引发严重生产故障 [^7] [^8]。

为了避免重试把服务拖垮，可以在客户端拉大并随机化重试间隔（*指数退避* [^9] [^10]），并临时停止向近期报错或超时的服务发请求（例如 *熔断器* [^11] [^12] 或 *令牌桶* [^13]）。服务端也可在接近过载时主动拒绝请求（*负载卸除* [^14]），并通过响应要求客户端降速（*背压* [^1] [^15]）。此外，排队与负载均衡算法的选择也会影响结果 [^16]。

--------

从性能指标角度看，用户通常最关心响应时间；而吞吐量决定了所需计算资源（例如服务器数量），从而决定承载特定工作负载的成本。如果吞吐量增长可能超过当前硬件上限，就必须扩容；若系统可通过增加计算资源显著提升最大吞吐量，就称其 *可伸缩*。

本节主要讨论响应时间；吞吐量与可伸缩性会在 ["可伸缩性"](#sec_introduction_scalability) 一节再展开。

### 延迟与响应时间 {#id23}

“延迟”和“响应时间”有时会混用，但本书对它们有明确区分（见 [图 2-4](#fig_response_time)）：

* *响应时间* 是客户端看到的总时间，包含链路上各处产生的全部延迟。
* *服务时间* 是服务主动处理该请求的时间。
* *排队延迟* 可发生在流程中的多个位置。例如请求到达后，可能要等 CPU 空出来才能处理；同机其他任务若占满出站网卡，响应包也可能先在缓冲区等待发送。
* *延迟* 是对“请求未被主动处理这段时间”的统称，也就是请求处于 *潜伏（latent）* 状态的时间。尤其是 *网络延迟*（或网络时延）指请求与响应在网络中传播所花的时间。

{{< figure src="/fig/ddia_0204.png" id="fig_response_time" caption="图 2-4. 响应时间、服务时间、网络延迟和排队延迟。" class="w-full my-4" >}}

在 [图 2-4](#fig_response_time) 中，时间从左向右流动。每个通信节点画成一条水平线，请求/响应消息画成节点间的粗斜箭头。本书后文会频繁使用这种图示风格。

即便反复发送同一个请求，响应时间也可能显著波动。许多因素都会引入随机延迟：例如切换到后台进程、网络丢包与 TCP 重传、垃圾回收暂停、缺页导致的磁盘读取、服务器机架机械振动 [^17] 等。我们会在 ["超时与无界延迟"](/ch9#sec_distributed_queueing) 进一步讨论这个问题。

排队延迟常常是响应时间波动的主要来源。服务器并行处理能力有限（例如受 CPU 核数约束），少量慢请求就可能堵住后续请求，这就是 *头部阻塞*。即便后续请求本身服务时间很短，客户端仍会因为等待前序请求而看到较慢的总体响应。排队延迟不属于服务时间，因此必须在客户端侧测量响应时间。

### 平均值、中位数与百分位点 {#id24}

由于响应时间会随请求变化，我们应将其看作一个可测量的 *分布*，而非单一数字。在 [图 2-5](#fig_lognormal) 中，每个灰色柱表示一次请求，柱高是该请求耗时。大多数请求较快，但会有少量更慢的 *异常值*。网络时延波动也常称为 *抖动*。

{{< figure src="/fig/ddia_0205.png" id="fig_lognormal" caption="图 2-5. 说明平均值和百分位点：100 个服务请求的响应时间样本。" class="w-full my-4" >}}

报告服务 *平均* 响应时间很常见（严格说是 *算术平均值*：总响应时间除以请求数）。平均值对估算吞吐量上限有帮助 [^18]。但若你想知道“典型”响应时间，平均值并不理想，因为它不能反映到底有多少用户经历了这种延迟。

通常，*百分位点* 更有意义。把响应时间从快到慢排序，*中位数* 位于中间。例如中位响应时间为 200 毫秒，表示一半请求在 200 毫秒内返回，另一半更慢。因此中位数适合衡量用户“通常要等多久”。中位数也称 *第 50 百分位*，常记为 *p50*。

为了看清异常值有多糟，需要观察更高百分位点：常见的是 *p95*、*p99*、*p999*。它们表示 95%、99%、99.9% 的请求都快于该阈值。例如 p95 为 1.5 秒，表示 100 个请求里有 95 个小于 1.5 秒，另外 5 个不小于 1.5 秒。[图 2-5](#fig_lognormal) 展示了这一点。

响应时间的高百分位点（也叫 *尾部延迟*）非常重要，因为它直接影响用户体验。例如亚马逊内部服务常以第 99.9 百分位设定响应要求，尽管它只影响 1/1000 的请求。原因是最慢请求往往来自“账户数据最多”的客户，他们通常也是最有价值客户 [^19]。让这批用户也能获得快速响应，对业务很关键。

另一方面，继续优化到第 99.99 百分位（最慢的万分之一请求）通常成本过高、收益有限。越到高百分位，越容易受不可控随机因素影响，也更符合边际收益递减规律。

--------

> [!TIP] 响应时间对用户的影响

直觉上，快服务当然比慢服务更好 [^20]。但真正要拿到“延迟如何影响用户行为”的可靠量化数据，其实非常困难。

一些被频繁引用的统计并不可靠。2006 年，Google 曾报告：搜索结果从 400 毫秒变慢到 900 毫秒，与流量和收入下降 20% 相关 [^21]。但 2009 年 Google 另一项研究又称，延迟增加 400 毫秒仅导致日搜索量下降 0.6% [^22]；同年 Bing 发现，加载时间增加 2 秒会让广告收入下降 4.3% [^23]。这些公司的更新数据似乎并未公开。

Akamai 的一项较新研究 [^24] 声称：响应时间增加 100 毫秒会让电商网站转化率最多下降 7%。但细看可知，同一研究也显示“加载极快”的页面同样和较低转化率相关。这个看似矛盾的结果，很可能是因为加载最快的页面往往是“无有效内容”的页面（如 404）。而该研究并未把“页面内容影响”和“加载时间影响”区分开，因此结论可能并不可靠。

Yahoo 的一项研究 [^25] 在控制搜索结果质量后，比对了快慢加载对点击率的影响。结果显示：当快慢响应差异达到 1.25 秒或以上时，快速搜索的点击量会高出 20%–30%。

--------

### 响应时间指标的应用 {#sec_introduction_slo_sla}

对于“一个终端请求会触发多次后端调用”的服务，高百分位点尤其关键。即使并行调用，终端请求仍要等待最慢的那个返回。正如 [图 2-6](#fig_tail_amplification) 所示，只要一个调用慢，就能拖慢整个终端请求。即便慢调用比例很小，只要后端调用次数变多，撞上慢调用的概率就会上升，于是更大比例的终端请求会变慢（称为 *尾部延迟放大* [^26]）。

{{< figure src="/fig/ddia_0206.png" id="fig_tail_amplification" caption="图 2-6. 当需要几个后端调用来服务请求时，只需要一个慢的后端请求就可以减慢整个最终用户请求。" class="w-full my-4" >}}

百分位点也常用于定义 *服务级别目标*（SLO）和 *服务级别协议*（SLA）[^27]。例如，一个 SLO 可能要求：中位响应时间低于 200 毫秒、p99 低于 1 秒，并且至少 99.9% 的有效请求返回非错误响应。SLA 则是“未达成 SLO 时如何处理”的合同条款（例如客户可获赔偿）。这是基本思路；但在实践中，为 SLO/SLA 设计合理可用性指标并不容易 [^28] [^29]。

--------

> [!TIP] 计算百分位点

如果你想在监控面板中展示响应时间百分位点，就需要持续且高效地计算它们。例如，维护“最近 10 分钟请求响应时间”的滚动窗口，每分钟计算一次该窗口内的中位数与各百分位点，并绘图展示。

最简单的实现是保存窗口内全部请求的响应时间，并每分钟排序一次。若效率不够，可以用一些低 CPU/内存开销的算法来近似计算百分位点。常见开源库包括 HdrHistogram、t-digest [^30] [^31]、OpenHistogram [^32] 和 DDSketch [^33]。

要注意，“对百分位点再取平均”（例如降低时间分辨率，或合并多机器数据）在数学上没有意义。聚合响应时间数据的正确方式是聚合直方图 [^34]。

--------

## 可靠性与容错 {#sec_introduction_reliability}

每个人对“可靠”与“不可靠”都有直觉。对软件而言，典型期望包括：

* 应用能完成用户预期的功能。
* 能容忍用户犯错，或以意料之外的方式使用软件。
* 在预期负载与数据规模下，性能足以支撑目标用例。
* 能防止未授权访问与滥用。

如果把这些合起来称为“正确工作”，那么 *可靠性* 可以粗略理解为：即使出现问题，系统仍能持续正确工作。为了更精确地描述“出问题”，我们区分 *故障* 与 *失效* [^35] [^36] [^37]：

故障
: 指系统某个 *局部组件* 停止正常工作：例如单个硬盘损坏、单台机器宕机，或系统依赖的外部服务中断。

失效
: 指 *整个系统* 无法继续向用户提供所需服务；换言之，系统未满足服务级别目标（SLO）。

“故障”与“失效”的区别容易混淆，因为它们本质上是同一件事在不同层级上的表述。比如一个硬盘坏了，对“硬盘这个系统”来说是失效；但对“由许多硬盘组成的更大系统”来说，它只是一个故障。更大系统若在其他硬盘上有副本，就可能容忍该故障。

### 容错 {#id27}

如果系统在发生某些故障时仍继续向用户提供所需的服务，我们称系统为 *容错的*。如果系统不能容忍某个部分发生故障，我们称该部分为 *单点故障*（SPOF），因为该部分的故障会升级导致整个系统的失效。

例如在社交网络案例中，扇出流程里可能有机器崩溃或不可用，导致物化时间线更新中断。若要让该流程具备容错性，就必须保证有其他机器可接管任务，同时既不漏投帖子，也不重复投递。（这个思想称为 *恰好一次语义*，我们会在 [“数据库的端到端论证”](/ch13#sec_future_end_to_end) 中详细讨论。）

容错能力总是“有边界”的：它只针对某些类型、某个数量以内的故障。例如系统可能最多容忍 2 块硬盘同时故障，或 3 个节点里坏 1 个。若全部节点都崩溃，就无计可施，因此“容忍任意数量故障”并无意义。要是地球和上面的服务器都被黑洞吞噬，那就只能去太空托管了，预算审批祝你好运。

反直觉的是，在这类系统里，故意 *提高* 故障发生率反而有意义，例如无预警随机杀死某个进程。这叫 *故障注入*。许多关键故障本质上是错误处理做得不够好 [^38]。通过主动注入故障，可以持续演练并验证容错机制，提升对“真实故障发生时系统仍能正确处理”的信心。*混沌工程* 就是围绕这类实验建立起来的方法论 [^39]。

尽管我们通常更倾向于“容忍故障”，而非“阻止故障”，但也有“预防优于补救”的场景（例如根本无法补救）。安全问题就是如此：若攻击者已攻破系统并获取敏感数据，事件本身无法撤销。不过，本书主要讨论的是可恢复的故障类型。

### 硬件与软件故障 {#sec_introduction_hardware_faults}

当我们想到系统失效的原因时，硬件故障很快就会浮现在脑海中：

* 机械硬盘每年故障率约为 2%–5% [^40] [^41]；在 10,000 盘位的存储集群中，平均每天约有 1 块盘故障。近期数据表明磁盘可靠性在提升，但故障率仍不可忽视 [^42]。
* SSD 每年故障率约为 0.5%–1% [^43]。少量比特错误可自动纠正 [^44]，但不可纠正错误大约每盘每年一次，即使是磨损较轻的新盘也会出现；该错误率高于机械硬盘 [^45] [^46]。
* 其他硬件组件，如电源、RAID 控制器和内存模块也会发生故障，尽管频率低于硬盘驱动器 [^47] [^48]。
* 大约每 1000 台机器里就有 1 台存在“偶发算错结果”的 CPU 核心，可能由制造缺陷导致 [^49] [^50] [^51]。有时错误计算会直接导致崩溃；有时则只是悄悄返回错误结果。
* RAM 数据也可能损坏：要么来自宇宙射线等随机事件，要么来自永久性物理缺陷。即便使用 ECC 内存，任意一年内仍有超过 1% 的机器会遇到不可纠正错误，通常表现为机器崩溃并需要更换受影响内存条 [^52]。此外，某些病态访问模式还可能以较高概率触发比特翻转 [^53]。
* 整个数据中心也可能不可用（如停电、网络配置错误），甚至被永久摧毁（如火灾、洪水、地震 [^54]）。太阳风暴会在长距离导线中感应大电流，可能损坏电网和海底通信电缆 [^55]。这类大规模故障虽罕见，但若服务无法容忍数据中心丢失，后果将极其严重 [^56]。

这类事件在小系统里足够罕见，通常不必过度担心，只要能方便地更换故障硬件即可。但在大规模系统里，硬件故障足够频繁，已经是“正常运行”的一部分。

#### 通过冗余容忍硬件故障 {#tolerating-hardware-faults-through-redundancy}

我们对不可靠硬件的第一反应通常是向各个硬件组件添加冗余，以降低系统的故障率。磁盘可以设置为 RAID 配置（将数据分布在同一台机器的多个磁盘上，以便故障磁盘不会导致数据丢失），服务器可能有双电源和可热插拔的 CPU，数据中心可能有电池和柴油发电机作为备用电源。这种冗余通常可以使机器不间断运行多年。

当组件故障相互独立时，冗余最有效，即一个故障的发生不会改变另一个故障发生的可能性。然而，经验表明，组件故障之间通常存在显著的相关性 [^41] [^57] [^58]；整个服务器机架或整个数据中心不可用的情况，仍然比我们预期的更频繁。

硬件冗余确实能提升单机可用时间；但正如 ["分布式与单节点系统"](/ch1#sec_introduction_distributed) 所述，分布式系统还具备额外优势，例如可容忍整个数据中心中断。因此云系统通常不再过分追求“单机极致可靠”，而是通过软件层容忍节点故障来实现高可用。云厂商使用 *可用区* 标识资源是否物理共址；同一可用区内资源比跨地域资源更容易同时失效。

我们在本书中讨论的容错技术旨在容忍整台机器、整个机架或整个可用区的丢失。其通常做法是：当一个数据中心的机器发生故障或变得不可达时，由另一个数据中心的机器接管。我们将在 [第 6 章](/ch6)、[第 10 章](/ch10) 以及本书的其他各个地方讨论这种容错技术。

能够容忍整个机器丢失的系统也具有运营优势：如果你需要重新启动机器（例如，应用操作系统安全补丁），单服务器系统需要计划停机时间，而多节点容错系统可以一次修补一个节点，而不影响用户的服务。这称为 *滚动升级*，我们将在 [第 5 章](/ch5) 中进一步讨论它。

#### 软件故障 {#software-faults}

尽管硬件故障可能存在弱相关，但整体上仍相对独立：例如一块盘坏了，同机其他盘往往还能再正常工作一段时间。相比之下，软件故障常常高度相关，因为许多节点运行同一套软件，也就共享同一批 bug [^59] [^60]。这类故障更难预判，也往往比“相互独立的硬件故障”造成更多系统失效 [^47]。例如：

* 在特定情况下导致每个节点同时失效的软件错误。例如，2012 年 6 月 30 日，闰秒导致许多 Java 应用程序由于 Linux 内核中的错误而同时挂起 [^61]。由于固件错误，某些型号的所有 SSD 在精确运行 32,768 小时（不到 4 年）后突然失效，使其上的数据无法恢复 [^62]。
* 耗尽某些共享的有限资源（如 CPU 时间、内存、磁盘空间、网络带宽或线程）的失控进程 [^63]。例如，处理大请求时消耗过多内存的进程可能会被操作系统杀死。客户端库中的错误可能导致比预期更高的请求量 [^64]。
* 系统所依赖的服务变慢、无响应或开始返回损坏的响应。
* 不同系统交互后出现“单系统隔离测试中看不到”的涌现行为 [^65]。
* 级联故障：一个组件中的问题导致另一个组件过载并变慢，进而又导致下一个组件崩溃 [^66] [^67]。

导致这类软件故障的 bug 往往潜伏很久，直到一组不寻常条件把它触发出来。这时才暴露出：软件其实对运行环境做了某些假设，平时大多成立，但终有一天会因某种原因失效 [^68] [^69]。

软件系统性故障没有“速效药”。但许多小措施都有效：认真审视系统假设与交互、充分测试、进程隔离、允许进程崩溃并重启、避免反馈环路（如重试风暴，参见 ["当过载系统无法恢复时"](#sidebar_metastable)），以及在生产环境持续度量、监控和分析系统行为。

### 人类与可靠性 {#id31}

软件系统由人设计、构建和运维。与机器不同，人不会只按规则执行；人的优势在于创造性和适应性。但这也带来不可预测性，即使本意是好的，也会犯导致失效的错误。例如，一项针对大型互联网服务的研究发现：运维配置变更是中断首因，而硬件故障（服务器或网络）仅占 10%–25% [^70]。

遇到这类问题，人们很容易归咎于“人为错误”，并试图通过更严格流程和更强规则约束来控制人。但“责怪个人”通常适得其反。所谓“人为错误”往往不是事故根因，而是社会技术系统本身存在问题的征兆 [^71]。复杂系统里，组件意外交互产生的涌现行为也常导致故障 [^72]。

有多种技术手段可降低人为失误的影响：充分测试（含手写测试与大量随机输入的 *属性测试*）[^38]、可快速回滚配置变更的机制、新代码渐进发布、清晰细致的监控、用于排查生产问题的可观测性工具（参见 ["分布式系统的问题"](/ch1#sec_introduction_dist_sys_problems)），以及鼓励“正确操作”并抑制“错误操作”的良好界面设计。

但这些措施都需要时间和预算。在日常业务压力下，组织往往优先投入“直接创收”活动，而非提升抗错韧性的建设。若在“更多功能”和“更多测试”之间二选一，很多组织会自然选择前者。既然如此，当可预防错误最终发生时，责怪个人并无意义，问题本质在于组织的优先级选择。

越来越多组织在实践 *无责备事后分析*：事故发生后，鼓励参与者在不担心惩罚的前提下完整复盘细节，让组织其他人也能学习如何避免类似问题 [^73]。这个过程常会揭示出：业务优先级需要调整、某些长期被忽视的领域需要补投入、相关激励机制需要改，或其他应由管理层关注的系统性问题。

一般来说，调查事故时应警惕“过于简单”的答案。“鲍勃部署时应更小心”没有建设性，“我们必须用 Haskell 重写后端”同样不是。更可行的做法是：管理层借机从一线人员视角理解社会技术系统的真实运行方式，并据此推动改进 [^71]。

--------

<a id="sidebar_reliability_importance"></a>

> [!TIP] 可靠性有多重要？

可靠性不只适用于核电站或空管系统，普通应用同样需要可靠。企业软件中的 bug 会造成生产力损失（若报表错误还会带来法律风险）；电商网站故障则会带来直接收入损失和品牌伤害。

在许多应用里，几分钟乃至几小时的短暂中断尚可容忍 [^74]；但永久性数据丢失或损坏往往是灾难性的。想象一位家长把孩子的全部照片和视频都存在你的相册应用里 [^75]。若数据库突然损坏，他们会怎样？又是否知道如何从备份恢复？

另一个“软件不可靠伤害现实人群”的例子，是英国邮局 Horizon 丑闻。1999 到 2019 年间，数百名邮局网点负责人因会计系统显示“账目短缺”被判盗窃或欺诈。后来事实证明，许多“短缺”来自软件缺陷，且大量判决已被推翻 [^76]。造成这场可能是英国史上最大司法不公的一个关键前提，是英国法律默认计算机正常运行（因此其证据可靠），除非有相反证据 [^77]。软件工程师或许会觉得“软件无 bug”很荒谬，但这对那些因此被错判入狱、破产乃至自杀的人来说毫无安慰。

在某些场景下，我们也许会有意牺牲部分可靠性来降低开发成本（例如做未验证市场的原型产品）。但应明确知道自己在何处“走捷径”，并充分评估其后果。

--------

## 可伸缩性 {#sec_introduction_scalability}

即便系统今天运行可靠，也不代表将来一定如此。性能退化的常见原因之一是负载增长：比如并发用户从 1 万涨到 10 万，或从 100 万涨到 1000 万；也可能是处理的数据规模远大于从前。

*可伸缩性* 用来描述系统应对负载增长的能力。讨论这个话题时，常有人说：“你又不是 Google/Amazon，别担心规模，直接上关系数据库。”这句话是否成立，取决于你在做什么类型的应用。

如果你在做一个目前用户很少的新产品（例如创业早期），首要工程目标通常是“尽可能简单、尽可能灵活”，以便随着对用户需求理解加深而快速调整产品功能 [^78]。在这种环境下，过早担心“未来也许会有”的规模往往适得其反：最好情况是白费功夫、过早优化；最坏情况是把自己锁进僵化设计，反而阻碍演进。

原因在于，可伸缩性不是一维标签；“X 可伸缩”或“Y 不可伸缩”这种说法本身意义不大。更有意义的问题是：

* “如果系统按某种方式增长，我们有哪些应对选项？”
* “我们如何增加计算资源来承载额外负载？”
* “按当前增长趋势，现有架构何时会触顶？”

当你的产品真的做起来、负载持续上升时，你自然会看到瓶颈在哪里，也就知道该沿哪些维度扩展。那时再系统性投入可伸缩性技术，通常更合适。

### 描述负载 {#id33}

首先要简明描述系统当前负载，之后才能讨论“增长会怎样”（例如负载翻倍会发生什么）。最常见的是吞吐量指标：每秒请求数、每天新增数据量（GB）、每小时购物车结账次数等。有时你关心的是峰值变量，比如 ["案例研究：社交网络首页时间线"](#sec_introduction_twitter) 里的“同时在线用户数”。

此外还可能有其他统计特征会影响访问模式，进而影响可伸缩性要求。例如数据库读写比、缓存命中率、每用户数据项数量（如社交网络里的追随者数）。有时平均情况最关键，有时瓶颈由少数极端情况主导，具体取决于你的应用细节。

当负载被清楚描述后，就可以分析“负载增加时系统会怎样”。可从两个角度看：

* 以某种方式增大负载、但保持资源（CPU、内存、网络带宽等）不变时，性能如何变化？
* 若负载按某种方式增长、但你希望性能不变，需要增加多少资源？

通常目标是：在尽量降低运行成本的同时，让性能维持在 SLA 要求内（参见 ["响应时间指标的应用"](#sec_introduction_slo_sla)）。所需计算资源越多，成本越高。不同硬件的性价比不同，而且会随着新硬件出现而变化。

如果资源翻倍后能承载两倍负载且性能不变，这称为 *线性可伸缩性*，通常是理想状态。偶尔，借助规模效应或峰值负载更均匀分布，甚至可用不足两倍资源处理两倍负载 [^79] [^80]。但更常见的是成本增长快于线性，低效原因也很多。比如数据量增大后，即使请求大小相同，处理一次写请求也可能比数据量小时更耗资源。

### 共享内存、共享磁盘与无共享架构 {#sec_introduction_shared_nothing}

增加服务硬件资源的最简单方式，是迁移到更强的机器。虽然单核 CPU 不再明显提速，但你仍可购买（或租用）拥有更多 CPU 核心、更多 RAM、更多磁盘的实例。这叫 *纵向伸缩*（scaling up）。

在单机上，你可以通过多进程/多线程获得并行性。同一进程内线程共享同一块 RAM，因此这也叫 *共享内存架构*。问题是它的成本常常“超线性增长”：硬件资源翻倍的高端机器，价格往往远超两倍；且受限于瓶颈，性能提升通常又达不到两倍。

另一种方案是 *共享磁盘架构*：多台机器各有独立 CPU 和 RAM，但共享同一组磁盘阵列，通过高速网络连接（NAS 或 SAN）。该架构传统上用于本地数据仓库场景，但争用与锁开销限制了其可伸缩性 [^81]。

相比之下，*无共享架构* [^82]（即 *横向伸缩*、scaling out）已广泛流行。这种方案使用多节点分布式系统，每个节点拥有自己的 CPU、RAM 和磁盘；节点间协作通过常规网络在软件层完成。

无共享的优势在于：具备线性伸缩潜力、可灵活选用高性价比硬件（尤其在云上）、更容易随负载增减调整资源，并可通过跨多个数据中心/地域部署提升容错。代价是：需要显式分片（见 [第 7 章](/ch7)），并承担分布式系统的全部复杂性（见 [第 9 章](/ch9)）。

一些云原生数据库把“存储”和“事务执行”拆成独立服务（参见 ["存储与计算分离"](/ch1#sec_introduction_storage_compute)），由多个计算节点共享同一存储服务。这种模式与共享磁盘有相似性，但规避了老系统的可伸缩瓶颈：它不暴露 NAS/SAN 那种文件系统或块设备抽象，而是提供面向数据库场景定制的存储 API [^83]。

### 可伸缩性原则 {#id35}

能够大规模运行的系统架构，通常高度依赖具体应用，不存在通用“一招鲜”的可伸缩架构（俗称 *万金油*）。例如：面向“每秒 10 万次请求、每次 1 kB”的系统，与面向“每分钟 3 次请求、每次 2 GB”的系统，形态会完全不同，尽管二者数据吞吐量都约为 100 MB/s。

此外，适合某一级负载的架构，通常难以直接承受 10 倍负载。若你在做高速增长服务，几乎每跨一个数量级都要重新审视架构。考虑到业务需求本身也会变化，提前规划超过一个数量级的未来伸缩需求，往往不划算。

可伸缩性的一个通用原则，是把系统拆分成尽量可独立运行的小组件。这也是微服务（参见 ["微服务与无服务器"](/ch1#sec_introduction_microservices)）、分片（[第 7 章](/ch7)）、流处理（[第 12 章](/ch12#ch_stream)）和无共享架构的共同基础。难点在于：哪里该拆，哪里该合。微服务设计可参考其他书籍 [^84]；无共享系统的分片问题我们会在 [第 7 章](/ch7) 讨论。

另一个好原则是：不要把系统做得比必要更复杂。若单机数据库足够，就往往优于复杂分布式方案。自动伸缩（按需求自动加减资源）很吸引人，但若负载相对可预测，手动伸缩可能带来更少运维意外（参见 ["操作：自动或手动再平衡"](/ch7#sec_sharding_operations)）。5 个服务的系统通常比 50 个服务更简单。好架构往往是多种方案的务实组合。

## 可维护性 {#sec_introduction_maintainability}

软件不会像机械设备那样磨损或材料疲劳，但应用需求会变化，软件所处环境（依赖项、底层平台）也会变化，代码中还会持续暴露需要修复的缺陷。

业界普遍认同：软件成本的大头不在初始开发，而在后续维护，包括修 bug、保障系统稳定运行、排查故障、适配新平台、支持新场景、偿还技术债，以及持续交付新功能 [^85] [^86]。

然而维护并不容易。一个长期运行成功的系统，可能仍依赖今天少有人熟悉的旧技术（如大型机和 COBOL）；随着人员流动，系统为何如此设计的组织记忆也可能丢失；维护者往往还要修复前人留下的问题。更重要的是，计算机系统通常与其支撑的组织流程深度耦合，这使得 *遗留* 系统维护既是技术问题，也是人员与组织问题 [^87]。

如果今天构建的系统足够有价值并长期存活，它终有一天会变成遗留系统。为减少后继维护者的痛苦，我们应在设计阶段就考虑维护性。虽然难以准确预判哪些决策会在未来埋雷，但本书会强调几条广泛适用的原则：

可运维性（Operability）
: 让组织能够更容易地保持系统平稳运行。

简单性（Simplicity）
: 采用易理解且一致的模式与结构，避免不必要复杂性，让新工程师也能快速理解系统。

可演化性（Evolvability）
: 让工程师在未来能更容易修改系统，使其随着需求变化而持续适配并扩展到未预料场景。

### 可运维性：让运维更轻松 {#id37}

我们在 ["云时代的运维"](/ch1#sec_introduction_operations) 已讨论过运维角色：可靠运行不仅依赖工具，人类流程同样关键。甚至有人指出：“好的运维常能绕过糟糕（或不完整）软件的局限；但再好的软件，碰上糟糕运维也难以可靠运行” [^60]。

在由成千上万台机器组成的大规模系统中，纯手工维护成本不可接受，自动化必不可少。但自动化也是双刃剑：总会有边缘场景（如罕见故障）需要运维团队人工介入。并且“自动化处理不了”的往往恰恰最复杂，因此自动化越深，越需要 **更** 高水平的运维团队来兜底 [^88]。

另外，一旦自动化系统本身出错，往往比“部分依赖人工操作”的系统更难排查。因此自动化并非越多越好。合理自动化程度取决于你所在应用与组织的具体条件。

良好的可运维性意味着把日常任务做简单，让运维团队把精力投入到高价值工作。数据系统可以通过多种方式达成这一点 [^89]：

* 让监控工具能获取关键指标，并支持可观测性工具（参见 ["分布式系统的问题"](/ch1#sec_introduction_dist_sys_problems)）以洞察运行时行为。相关商业/开源工具都很多 [^90]。
* 避免依赖单机（系统整体不停机的前提下允许下线机器维护）。
* 提供完善文档和易理解的操作模型（“我做 X，会发生 Y”）。
* 提供良好默认值，同时允许管理员在需要时覆盖默认行为。
* 适当支持自愈，同时在必要时保留管理员对系统状态的手动控制权。
* 行为可预测，尽量减少“惊喜”。

### 简单性：管理复杂度 {#id38}

小型项目往往能保持简洁、优雅、富有表达力；但项目变大后，代码常会迅速变复杂且难理解。这种复杂性会拖慢所有参与者效率，进一步抬高维护成本。陷入这种状态的软件项目常被称为 *大泥团* [^91]。

当复杂性让维护变难时，预算和进度常常失控。在复杂软件里，变更时引入缺陷的风险也更高：系统越难理解和推理，隐藏假设、非预期后果和意外交互就越容易被忽略 [^69]。反过来，降低复杂性能显著提升可维护性，因此“追求简单”应是系统设计核心目标之一。

简单系统更容易理解，因此我们应尽可能用最简单方式解决问题。但“简单”知易行难。什么叫简单，往往带有主观判断，因为不存在绝对客观的简单性标准 [^92]。例如，一个系统可能“接口简单但实现复杂”，另一个可能“实现简单但暴露更多内部细节”，到底谁更简单，并不总有标准答案。

一种常见分析方法是把复杂性分成两类：**本质复杂性** 与 **偶然复杂性** [^93]。前者源于业务问题本身，后者源于工具与实现限制。但这种划分也并不完美，因为随着工具演进，“本质”和“偶然”的边界会移动 [^94]。

管理复杂度最重要的工具之一是 **抽象**。好的抽象能在清晰外观后隐藏大量实现细节，也能被多种场景复用。这种复用不仅比反复重写更高效，也能提升质量，因为抽象组件一旦改进，所有依赖它的应用都会受益。

例如，高级语言是对机器码、CPU 寄存器和系统调用的抽象。SQL 则抽象了磁盘/内存中的复杂数据结构、来自其他客户端的并发请求，以及崩溃后的不一致状态。用高级语言编程时，我们仍然在“使用机器码”，但不再 *直接* 面对它，因为语言抽象替我们屏蔽了细节。

应用代码层面的抽象，常借助 *设计模式* [^95]、*领域驱动设计*（DDD）[^96] 等方法来构建。本书重点不在这类应用专用抽象，而在你可以拿来构建应用的通用抽象，例如数据库事务、索引、事件日志等。若你想采用 DDD 等方法，也可以建立在本书介绍的基础能力之上。

### 可演化性：让变化更容易 {#sec_introduction_evolvability}

系统需求永远不变的概率极低。更常见的是持续变化：你会发现新事实，出现此前未预期用例，业务优先级会调整，用户会提出新功能，新平台会替换旧平台，法律与监管会变化，系统增长也会倒逼架构调整。

在组织层面，*敏捷* 方法为适应变化提供了框架；敏捷社区也发展出多种适用于高变化环境的技术与流程，如测试驱动开发（TDD）和重构。本书关注的是：如何在“由多个不同应用/服务组成的系统层级”提升这种敏捷能力。

数据系统对变化的适应难易度，与其简单性和抽象质量高度相关：松耦合、简单系统通常比紧耦合、复杂系统更容易修改。由于这一点极其重要，我们把“数据系统层面的敏捷性”单独称为 *可演化性* [^97]。

大型系统中让变更困难的一个关键因素，是某些操作不可逆，因此执行时必须极其谨慎 [^98]。例如从一个数据库迁移到另一个：若新库出问题后无法回切，风险就远高于可随时回退。尽量减少不可逆操作，能显著提升系统灵活性。

## 总结 {#summary}

本章讨论了几类核心非功能性需求：性能、可靠性、可伸缩性与可维护性。围绕这些主题，我们也建立了贯穿全书的一组概念与术语。章节从“社交网络首页时间线”案例切入，直观展示了系统在规模增长时会遇到的现实挑战。

我们讨论了如何衡量性能（例如响应时间百分位点）、如何描述系统负载（例如吞吐量指标），以及这些指标如何用于 SLA。与之紧密相关的是可伸缩性：当负载增长时，如何保持性能不退化。我们也给出了若干通用原则，例如将任务拆解为可独立运行的小组件。后续章节会深入展开相关技术细节。

为实现可靠性，可以使用容错机制，使系统在部分组件（如磁盘、机器或外部服务）故障时仍能持续提供服务。我们区分了硬件故障与软件故障，并指出软件故障常更难处理，因为它们往往高度相关。可靠性的另一面是“对人为失误的韧性”，其中 *无责备事后分析* 是重要学习机制。

最后，我们讨论了可维护性的多个维度：支持运维工作、管理复杂度、提升系统可演化性。实现这些目标没有银弹，但一个普遍有效的做法是：用清晰、可理解、具备良好抽象的构件来搭建系统。接下来全书会介绍一系列在实践中证明有效的构件。

### 参考文献

[^1]: Mike Cvet. [How We Learned to Stop Worrying and Love Fan-In at Twitter](https://www.youtube.com/watch?v=WEgCjwyXvwc). At *QCon San Francisco*, December 2016. 
[^2]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK) 
[^3]: Twitter. [Twitter's Recommendation Algorithm](https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm). *blog.twitter.com*, March 2023. Archived at [perma.cc/L5GT-229T](https://perma.cc/L5GT-229T) 
[^4]: Raffi Krikorian. [New Tweets per second record, and how!](https://blog.twitter.com/engineering/en_us/a/2013/new-tweets-per-second-record-and-how) *blog.twitter.com*, August 2013. Archived at [perma.cc/6JZN-XJYN](https://perma.cc/6JZN-XJYN) 
[^5]: Jaz Volpert. [When Imperfect Systems are Good, Actually: Bluesky's Lossy Timelines](https://jazco.dev/2025/02/19/imperfection/). *jazco.dev*, February 2025. Archived at [perma.cc/2PVE-L2MX](https://perma.cc/2PVE-L2MX) 
[^6]: Samuel Axon. [3% of Twitter's Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX) 
[^7]: Nathan Bronson, Abutalib Aghayev, Aleksey Charapko, and Timothy Zhu. [Metastable Failures in Distributed Systems](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s11-bronson.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), May 2021. [doi:10.1145/3458336.3465286](https://doi.org/10.1145/3458336.3465286) 
[^8]: Marc Brooker. [Metastability and Distributed Systems](https://brooker.co.za/blog/2021/05/24/metastable.html). *brooker.co.za*, May 2021. Archived at [perma.cc/7FGJ-7XRK](https://perma.cc/7FGJ-7XRK) 
[^9]: Marc Brooker. [Exponential Backoff And Jitter](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/). *aws.amazon.com*, March 2015. Archived at [perma.cc/R6MS-AZKH](https://perma.cc/R6MS-AZKH) 
[^10]: Marc Brooker. [What is Backoff For?](https://brooker.co.za/blog/2022/08/11/backoff.html) *brooker.co.za*, August 2022. Archived at [perma.cc/PW9N-55Q5](https://perma.cc/PW9N-55Q5) 
[^11]: Michael T. Nygard. [*Release It!*](https://learning.oreilly.com/library/view/release-it-2nd/9781680504552/), 2nd Edition. Pragmatic Bookshelf, January 2018. ISBN: 9781680502398 
[^12]: Frank Chen. [Slowing Down to Speed Up – Circuit Breakers for Slack's CI/CD](https://slack.engineering/circuit-breakers/). *slack.engineering*, August 2022. Archived at [perma.cc/5FGS-ZPH3](https://perma.cc/5FGS-ZPH3) 
[^13]: Marc Brooker. [Fixing retries with token buckets and circuit breakers](https://brooker.co.za/blog/2022/02/28/retries.html). *brooker.co.za*, February 2022. Archived at [perma.cc/MD6N-GW26](https://perma.cc/MD6N-GW26) 
[^14]: David Yanacek. [Using load shedding to avoid overload](https://aws.amazon.com/builders-library/using-load-shedding-to-avoid-overload/). Amazon Builders' Library, *aws.amazon.com*. Archived at [perma.cc/9SAW-68MP](https://perma.cc/9SAW-68MP) 
[^15]: Matthew Sackman. [Pushing Back](https://wellquite.org/posts/lshift/pushing_back/). *wellquite.org*, May 2016. Archived at [perma.cc/3KCZ-RUFY](https://perma.cc/3KCZ-RUFY) 
[^16]: Dmitry Kopytkov and Patrick Lee. [Meet Bandaid, the Dropbox service proxy](https://dropbox.tech/infrastructure/meet-bandaid-the-dropbox-service-proxy). *dropbox.tech*, March 2018. Archived at [perma.cc/KUU6-YG4S](https://perma.cc/KUU6-YG4S) 
[^17]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies*, February 2018. 
[^18]: Marc Brooker. [Is the Mean Really Useless?](https://brooker.co.za/blog/2017/12/28/mean.html) *brooker.co.za*, December 2017. Archived at [perma.cc/U5AE-CVEM](https://perma.cc/U5AE-CVEM) 
[^19]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon's Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1294261.1294281](https://doi.org/10.1145/1294261.1294281) 
[^20]: Kathryn Whitenton. [The Need for Speed, 23 Years Later](https://www.nngroup.com/articles/the-need-for-speed/). *nngroup.com*, May 2020. Archived at [perma.cc/C4ER-LZYA](https://perma.cc/C4ER-LZYA) 
[^21]: Greg Linden. [Marissa Mayer at Web 2.0](https://glinden.blogspot.com/2006/11/marissa-mayer-at-web-20.html). *glinden.blogspot.com*, November 2006. Archived at [perma.cc/V7EA-3VXB](https://perma.cc/V7EA-3VXB) 
[^22]: Jake Brutlag. [Speed Matters for Google Web Search](https://services.google.com/fh/files/blogs/google_delayexp.pdf). *services.google.com*, June 2009. Archived at [perma.cc/BK7R-X7M2](https://perma.cc/BK7R-X7M2) 
[^23]: Eric Schurman and Jake Brutlag. [Performance Related Changes and their User Impact](https://www.youtube.com/watch?v=bQSE51-gr2s). Talk at *Velocity 2009*. 
[^24]: Akamai Technologies, Inc. [The State of Online Retail Performance](https://web.archive.org/web/20210729180749/https%3A//www.akamai.com/us/en/multimedia/documents/report/akamai-state-of-online-retail-performance-spring-2017.pdf). *akamai.com*, April 2017. Archived at [perma.cc/UEK2-HYCS](https://perma.cc/UEK2-HYCS) 
[^25]: Xiao Bai, Ioannis Arapakis, B. Barla Cambazoglu, and Ana Freire. [Understanding and Leveraging the Impact of Response Latency on User Behaviour in Web Search](https://iarapakis.github.io/papers/TOIS17.pdf). *ACM Transactions on Information Systems*, volume 36, issue 2, article 21, April 2018. [doi:10.1145/3106372](https://doi.org/10.1145/3106372) 
[^26]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794) 
[^27]: Alex Hidalgo. [*Implementing Service Level Objectives: A Practical Guide to SLIs, SLOs, and Error Budgets*](https://www.oreilly.com/library/view/implementing-service-level/9781492076803/). O'Reilly Media, September 2020. ISBN: 1492076813 
[^28]: Jeffrey C. Mogul and John Wilkes. [Nines are Not Enough: Meaningful Metrics for Clouds](https://research.google/pubs/pub48033/). At *17th Workshop on Hot Topics in Operating Systems* (HotOS), May 2019. [doi:10.1145/3317550.3321432](https://doi.org/10.1145/3317550.3321432) 
[^29]: Tamás Hauer, Philipp Hoffmann, John Lunney, Dan Ardelean, and Amer Diwan. [Meaningful Availability](https://www.usenix.org/conference/nsdi20/presentation/hauer). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020. 
[^30]: Ted Dunning. [The t-digest: Efficient estimates of distributions](https://www.sciencedirect.com/science/article/pii/S2665963820300403). *Software Impacts*, volume 7, article 100049, February 2021. [doi:10.1016/j.simpa.2020.100049](https://doi.org/10.1016/j.simpa.2020.100049) 
[^31]: David Kohn. [How percentile approximation works (and why it's more useful than averages)](https://www.timescale.com/blog/how-percentile-approximation-works-and-why-its-more-useful-than-averages/). *timescale.com*, September 2021. Archived at [perma.cc/3PDP-NR8B](https://perma.cc/3PDP-NR8B) 
[^32]: Heinrich Hartmann and Theo Schlossnagle. [Circllhist — A Log-Linear Histogram Data Structure for IT Infrastructure Monitoring](https://arxiv.org/pdf/2001.06561.pdf). *arxiv.org*, January 2020. 
[^33]: Charles Masson, Jee E. Rim, and Homin K. Lee. [DDSketch: A Fast and Fully-Mergeable Quantile Sketch with Relative-Error Guarantees](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf). *Proceedings of the VLDB Endowment*, volume 12, issue 12, pages 2195–2205, August 2019. [doi:10.14778/3352063.3352135](https://doi.org/10.14778/3352063.3352135) 
[^34]: Baron Schwartz. [Why Percentiles Don't Work the Way You Think](https://orangematter.solarwinds.com/2016/11/18/why-percentiles-dont-work-the-way-you-think/). *solarwinds.com*, November 2016. Archived at [perma.cc/469T-6UGB](https://perma.cc/469T-6UGB) 
[^35]: Walter L. Heimerdinger and Charles B. Weinstock. [A Conceptual Framework for System Fault Tolerance](https://resources.sei.cmu.edu/asset_files/TechnicalReport/1992_005_001_16112.pdf). Technical Report CMU/SEI-92-TR-033, Software Engineering Institute, Carnegie Mellon University, October 1992. Archived at [perma.cc/GD2V-DMJW](https://perma.cc/GD2V-DMJW) 
[^36]: Felix C. Gärtner. [Fundamentals of fault-tolerant distributed computing in asynchronous environments](https://dl.acm.org/doi/pdf/10.1145/311531.311532). *ACM Computing Surveys*, volume 31, issue 1, pages 1–26, March 1999. [doi:10.1145/311531.311532](https://doi.org/10.1145/311531.311532) 
[^37]: Algirdas Avižienis, Jean-Claude Laprie, Brian Randell, and Carl Landwehr. [Basic Concepts and Taxonomy of Dependable and Secure Computing](https://hdl.handle.net/1903/6459). *IEEE Transactions on Dependable and Secure Computing*, volume 1, issue 1, January 2004. [doi:10.1109/TDSC.2004.2](https://doi.org/10.1109/TDSC.2004.2) 
[^38]: Ding Yuan, Yu Luo, Xin Zhuang, Guilherme Renna Rodrigues, Xu Zhao, Yongle Zhang, Pranay U. Jain, and Michael Stumm. [Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-yuan.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014. 
[^39]: Casey Rosenthal and Nora Jones. [*Chaos Engineering*](https://learning.oreilly.com/library/view/chaos-engineering/9781492043850/). O'Reilly Media, April 2020. ISBN: 9781492043867 
[^40]: Eduardo Pinheiro, Wolf-Dietrich Weber, and Luiz Andre Barroso. [Failure Trends in a Large Disk Drive Population](https://www.usenix.org/legacy/events/fast07/tech/full_papers/pinheiro/pinheiro_old.pdf). At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007. 
[^41]: Bianca Schroeder and Garth A. Gibson. [Disk failures in the real world: What does an MTTF of 1,000,000 hours mean to you?](https://www.usenix.org/legacy/events/fast07/tech/schroeder/schroeder.pdf) At *5th USENIX Conference on File and Storage Technologies* (FAST), February 2007. 
[^42]: Andy Klein. [Backblaze Drive Stats for Q2 2021](https://www.backblaze.com/blog/backblaze-drive-stats-for-q2-2021/). *backblaze.com*, August 2021. Archived at [perma.cc/2943-UD5E](https://perma.cc/2943-UD5E) 
[^43]: Iyswarya Narayanan, Di Wang, Myeongjae Jeon, Bikash Sharma, Laura Caulfield, Anand Sivasubramaniam, Ben Cutler, Jie Liu, Badriddine Khessib, and Kushagra Vaid. [SSD Failures in Datacenters: What? When? and Why?](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/08/a7-narayanan.pdf) At *9th ACM International on Systems and Storage Conference* (SYSTOR), June 2016. [doi:10.1145/2928275.2928278](https://doi.org/10.1145/2928275.2928278) 
[^44]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (1)](https://www.alibabacloud.com/blog/594375). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230522005034/https%3A//www.alibabacloud.com/blog/594375) 
[^45]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016. 
[^46]: Jacob Alter, Ji Xue, Alma Dimnaku, and Evgenia Smirni. [SSD failures in the field: symptoms, causes, and prediction models](https://dl.acm.org/doi/pdf/10.1145/3295500.3356172). At *International Conference for High Performance Computing, Networking, Storage and Analysis* (SC), November 2019. [doi:10.1145/3295500.3356172](https://doi.org/10.1145/3295500.3356172) 
[^47]: Daniel Ford, François Labelle, Florentina I. Popovici, Murray Stokely, Van-Anh Truong, Luiz Barroso, Carrie Grimes, and Sean Quinlan. [Availability in Globally Distributed Storage Systems](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Ford.pdf). At *9th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2010. 
[^48]: Kashi Venkatesh Vishwanath and Nachiappan Nagappan. [Characterizing Cloud Computing Hardware Reliability](https://www.microsoft.com/en-us/research/wp-content/uploads/2010/06/socc088-vishwanath.pdf). At *1st ACM Symposium on Cloud Computing* (SoCC), June 2010. [doi:10.1145/1807128.1807161](https://doi.org/10.1145/1807128.1807161) 
[^49]: Peter H. Hochschild, Paul Turner, Jeffrey C. Mogul, Rama Govindaraju, Parthasarathy Ranganathan, David E. Culler, and Amin Vahdat. [Cores that don't count](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s01-hochschild.pdf). At *Workshop on Hot Topics in Operating Systems* (HotOS), June 2021. [doi:10.1145/3458336.3465297](https://doi.org/10.1145/3458336.3465297) 
[^50]: Harish Dattatraya Dixit, Sneha Pendharkar, Matt Beadon, Chris Mason, Tejasvi Chakravarthy, Bharath Muthiah, and Sriram Sankar. [Silent Data Corruptions at Scale](https://arxiv.org/abs/2102.11245). *arXiv:2102.11245*, February 2021. 
[^51]: Diogo Behrens, Marco Serafini, Sergei Arnautov, Flavio P. Junqueira, and Christof Fetzer. [Scalable Error Isolation for Distributed Systems](https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/behrens). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015. 
[^52]: Bianca Schroeder, Eduardo Pinheiro, and Wolf-Dietrich Weber. [DRAM Errors in the Wild: A Large-Scale Field Study](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35162.pdf). At *11th International Joint Conference on Measurement and Modeling of Computer Systems* (SIGMETRICS), June 2009. [doi:10.1145/1555349.1555372](https://doi.org/10.1145/1555349.1555372) 
[^53]: Yoongu Kim, Ross Daly, Jeremie Kim, Chris Fallin, Ji Hye Lee, Donghyuk Lee, Chris Wilkerson, Konrad Lai, and Onur Mutlu. [Flipping Bits in Memory Without Accessing Them: An Experimental Study of DRAM Disturbance Errors](https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf). At *41st Annual International Symposium on Computer Architecture* (ISCA), June 2014. [doi:10.5555/2665671.2665726](https://doi.org/10.5555/2665671.2665726) 
[^54]: Tim Bray. [Worst Case](https://www.tbray.org/ongoing/When/202x/2021/10/08/The-WOrst-Case). *tbray.org*, October 2021. Archived at [perma.cc/4QQM-RTHN](https://perma.cc/4QQM-RTHN) 
[^55]: Sangeetha Abdu Jyothi. [Solar Superstorms: Planning for an Internet Apocalypse](https://ics.uci.edu/~sabdujyo/papers/sigcomm21-cme.pdf). At *ACM SIGCOMM Conference*, August 2021. [doi:10.1145/3452296.3472916](https://doi.org/10.1145/3452296.3472916) 
[^56]: Adrian Cockcroft. [Failure Modes and Continuous Resilience](https://adrianco.medium.com/failure-modes-and-continuous-resilience-6553078caad5). *adrianco.medium.com*, November 2019. Archived at [perma.cc/7SYS-BVJP](https://perma.cc/7SYS-BVJP) 
[^57]: Shujie Han, Patrick P. C. Lee, Fan Xu, Yi Liu, Cheng He, and Jiongzhou Liu. [An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers](https://www.usenix.org/conference/fast21/presentation/han). At *19th USENIX Conference on File and Storage Technologies* (FAST), February 2021. 
[^58]: Edmund B. Nightingale, John R. Douceur, and Vince Orgovan. [Cycles, Cells and Platters: An Empirical Analysis of Hardware Failures on a Million Consumer PCs](https://eurosys2011.cs.uni-salzburg.at/pdf/eurosys2011-nightingale.pdf). At *6th European Conference on Computer Systems* (EuroSys), April 2011. [doi:10.1145/1966445.1966477](https://doi.org/10.1145/1966445.1966477) 
[^59]: Haryadi S. Gunawi, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, Thanh Do, Jeffry Adityatama, Kurnia J. Eliazar, Agung Laksono, Jeffrey F. Lukman, Vincentius Martin, and Anang D. Satria. [What Bugs Live in the Cloud?](https://ucare.cs.uchicago.edu/pdf/socc14-cbs.pdf) At *5th ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670986](https://doi.org/10.1145/2670979.2670986) 
[^60]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW) 
[^61]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU) 
[^62]: Hewlett Packard Enterprise. [Support Alerts – Customer Bulletin a00092491en\_us](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/S5F6-7ZAC](https://perma.cc/S5F6-7ZAC) 
[^63]: Lorin Hochstein. [awesome limits](https://github.com/lorin/awesome-limits). *github.com*, November 2020. Archived at [perma.cc/3R5M-E5Q4](https://perma.cc/3R5M-E5Q4) 
[^64]: Caitie McCaffrey. [Clients Are Jerks: AKA How Halo 4 DoSed the Services at Launch & How We Survived](https://www.caitiem.com/2015/06/23/clients-are-jerks-aka-how-halo-4-dosed-the-services-at-launch-how-we-survived/). *caitiem.com*, June 2015. Archived at [perma.cc/MXX4-W373](https://perma.cc/MXX4-W373) 
[^65]: Lilia Tang, Chaitanya Bhandari, Yongle Zhang, Anna Karanika, Shuyang Ji, Indranil Gupta, and Tianyin Xu. [Fail through the Cracks: Cross-System Interaction Failures in Modern Cloud Systems](https://tianyin.github.io/pub/csi-failures.pdf). At *18th European Conference on Computer Systems* (EuroSys), May 2023. [doi:10.1145/3552326.3587448](https://doi.org/10.1145/3552326.3587448) 
[^66]: Mike Ulrich. [Addressing Cascading Failures](https://sre.google/sre-book/addressing-cascading-failures/). In Betsy Beyer, Jennifer Petoff, Chris Jones, and Niall Richard Murphy (ed). [*Site Reliability Engineering: How Google Runs Production Systems*](https://www.oreilly.com/library/view/site-reliability-engineering/9781491929117/). O'Reilly Media, 2016. ISBN: 9781491929124 
[^67]: Harri Faßbender. [Cascading failures in large-scale distributed systems](https://blog.mi.hdm-stuttgart.de/index.php/2022/03/03/cascading-failures-in-large-scale-distributed-systems/). *blog.mi.hdm-stuttgart.de*, March 2022. Archived at [perma.cc/K7VY-YJRX](https://perma.cc/K7VY-YJRX) 
[^68]: Richard I. Cook. [How Complex Systems Fail](https://www.adaptivecapacitylabs.com/HowComplexSystemsFail.pdf). Cognitive Technologies Laboratory, April 2000. Archived at [perma.cc/RDS6-2YVA](https://perma.cc/RDS6-2YVA) 
[^69]: David D. Woods. [STELLA: Report from the SNAFUcatchers Workshop on Coping With Complexity](https://snafucatchers.github.io/). *snafucatchers.github.io*, March 2017. Archived at [archive.org](https://web.archive.org/web/20230306130131/https%3A//snafucatchers.github.io/) 
[^70]: David Oppenheimer, Archana Ganapathi, and David A. Patterson. [Why Do Internet Services Fail, and What Can Be Done About It?](https://static.usenix.org/events/usits03/tech/full_papers/oppenheimer/oppenheimer.pdf) At *4th USENIX Symposium on Internet Technologies and Systems* (USITS), March 2003. 
[^71]: Sidney Dekker. [*The Field Guide to Understanding 'Human Error', 3rd Edition*](https://learning.oreilly.com/library/view/the-field-guide/9781317031833/). CRC Press, November 2017. ISBN: 9781472439055 
[^72]: Sidney Dekker. [*Drift into Failure: From Hunting Broken Components to Understanding Complex Systems*](https://www.taylorfrancis.com/books/mono/10.1201/9781315257396/drift-failure-sidney-dekker). CRC Press, 2011. ISBN: 9781315257396 
[^73]: John Allspaw. [Blameless PostMortems and a Just Culture](https://www.etsy.com/codeascraft/blameless-postmortems/). *etsy.com*, May 2012. Archived at [perma.cc/YMJ7-NTAP](https://perma.cc/YMJ7-NTAP) 
[^74]: Itzy Sabo. [Uptime Guarantees — A Pragmatic Perspective](https://world.hey.com/itzy/uptime-guarantees-a-pragmatic-perspective-736d7ea4). *world.hey.com*, March 2023. Archived at [perma.cc/F7TU-78JB](https://perma.cc/F7TU-78JB) 
[^75]: Michael Jurewitz. [The Human Impact of Bugs](http://jury.me/blog/2013/3/14/the-human-impact-of-bugs). *jury.me*, March 2013. Archived at [perma.cc/5KQ4-VDYL](https://perma.cc/5KQ4-VDYL) 
[^76]: Mark Halper. [How Software Bugs led to 'One of the Greatest Miscarriages of Justice' in British History](https://cacm.acm.org/news/how-software-bugs-led-to-one-of-the-greatest-miscarriages-of-justice-in-british-history/). *Communications of the ACM*, January 2025. [doi:10.1145/3703779](https://doi.org/10.1145/3703779) 
[^77]: Nicholas Bohm, James Christie, Peter Bernard Ladkin, Bev Littlewood, Paul Marshall, Stephen Mason, Martin Newby, Steven J. Murdoch, Harold Thimbleby, and Martyn Thomas. [The legal rule that computers are presumed to be operating correctly – unforeseen and unjust consequences](https://www.benthamsgaze.org/wp-content/uploads/2022/06/briefing-presumption-that-computers-are-reliable.pdf). Briefing note, *benthamsgaze.org*, June 2022. Archived at [perma.cc/WQ6X-TMW4](https://perma.cc/WQ6X-TMW4) 
[^78]: Dan McKinley. [Choose Boring Technology](https://mcfunley.com/choose-boring-technology). *mcfunley.com*, March 2015. Archived at [perma.cc/7QW7-J4YP](https://perma.cc/7QW7-J4YP) 
[^79]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/7LPK-TP7V](https://perma.cc/7LPK-TP7V) 
[^80]: Marc Brooker. [Surprising Scalability of Multitenancy](https://brooker.co.za/blog/2023/03/23/economics.html). *brooker.co.za*, March 2023. Archived at [perma.cc/ZZD9-VV8T](https://perma.cc/ZZD9-VV8T) 
[^81]: Ben Stopford. [Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/). *benstopford.com*, November 2009. Archived at [perma.cc/7BXH-EDUR](https://perma.cc/7BXH-EDUR) 
[^82]: Michael Stonebraker. [The Case for Shared Nothing](https://dsf.berkeley.edu/papers/hpts85-nothing.pdf). *IEEE Database Engineering Bulletin*, volume 9, issue 1, pages 4–9, March 1986. 
[^83]: Panagiotis Antonopoulos, Alex Budovski, Cristian Diaconu, Alejandro Hernandez Saenz, Jack Hu, Hanuma Kodavalla, Donald Kossmann, Sandeep Lingam, Umar Farooq Minhas, Naveen Prakash, Vijendra Purohit, Hugh Qu, Chaitanya Sreenivas Ravella, Krystyna Reisteter, Sheetal Shrotri, Dixin Tang, and Vikram Wakade. [Socrates: The New SQL Server in the Cloud](https://www.microsoft.com/en-us/research/uploads/prod/2019/05/socrates.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 1743–1756, June 2019. [doi:10.1145/3299869.3314047](https://doi.org/10.1145/3299869.3314047) 
[^84]: Sam Newman. [*Building Microservices*, second edition](https://www.oreilly.com/library/view/building-microservices-2nd/9781492034018/). O'Reilly Media, 2021. ISBN: 9781492034025 
[^85]: Nathan Ensmenger. [When Good Software Goes Bad: The Surprising Durability of an Ephemeral Technology](https://themaintainers.wpengine.com/wp-content/uploads/2021/04/ensmenger-maintainers-v2.pdf). At *The Maintainers Conference*, April 2016. Archived at [perma.cc/ZXT4-HGZB](https://perma.cc/ZXT4-HGZB) 
[^86]: Robert L. Glass. [*Facts and Fallacies of Software Engineering*](https://learning.oreilly.com/library/view/facts-and-fallacies/0321117425/). Addison-Wesley Professional, October 2002. ISBN: 9780321117427 
[^87]: Marianne Bellotti. [*Kill It with Fire*](https://learning.oreilly.com/library/view/kill-it-with/9781098128883/). No Starch Press, April 2021. ISBN: 9781718501188 
[^88]: Lisanne Bainbridge. [Ironies of automation](https://www.adaptivecapacitylabs.com/IroniesOfAutomation-Bainbridge83.pdf). *Automatica*, volume 19, issue 6, pages 775–779, November 1983. [doi:10.1016/0005-1098(83)90046-8](https://doi.org/10.1016/0005-1098%2883%2990046-8) 
[^89]: James Hamilton. [On Designing and Deploying Internet-Scale Services](https://www.usenix.org/legacy/events/lisa07/tech/full_papers/hamilton/hamilton.pdf). At *21st Large Installation System Administration Conference* (LISA), November 2007. 
[^90]: Dotan Horovits. [Open Source for Better Observability](https://horovits.medium.com/open-source-for-better-observability-8c65b5630561). *horovits.medium.com*, October 2021. Archived at [perma.cc/R2HD-U2ZT](https://perma.cc/R2HD-U2ZT) 
[^91]: Brian Foote and Joseph Yoder. [Big Ball of Mud](http://www.laputan.org/pub/foote/mud.pdf). At *4th Conference on Pattern Languages of Programs* (PLoP), September 1997. Archived at [perma.cc/4GUP-2PBV](https://perma.cc/4GUP-2PBV) 
[^92]: Marc Brooker. [What is a simple system?](https://brooker.co.za/blog/2022/05/03/simplicity.html) *brooker.co.za*, May 2022. Archived at [perma.cc/U72T-BFVE](https://perma.cc/U72T-BFVE) 
[^93]: Frederick P. Brooks. [No Silver Bullet – Essence and Accident in Software Engineering](https://worrydream.com/refs/Brooks_1986_-_No_Silver_Bullet.pdf). In [*The Mythical Man-Month*](https://www.oreilly.com/library/view/mythical-man-month-the/0201835959/), Anniversary edition, Addison-Wesley, 1995. ISBN: 9780201835953 
[^94]: Dan Luu. [Against essential and accidental complexity](https://danluu.com/essential-complexity/). *danluu.com*, December 2020. Archived at [perma.cc/H5ES-69KC](https://perma.cc/H5ES-69KC) 
[^95]: Erich Gamma, Richard Helm, Ralph Johnson, and John Vlissides. [*Design Patterns: Elements of Reusable Object-Oriented Software*](https://learning.oreilly.com/library/view/design-patterns-elements/0201633612/). Addison-Wesley Professional, October 1994. ISBN: 9780201633610 
[^96]: Eric Evans. [*Domain-Driven Design: Tackling Complexity in the Heart of Software*](https://learning.oreilly.com/library/view/domain-driven-design-tackling/0321125215/). Addison-Wesley Professional, August 2003. ISBN: 9780321125217 
[^97]: Hongyu Pei Breivold, Ivica Crnkovic, and Peter J. Eriksson. [Analyzing Software Evolvability](https://www.es.mdh.se/pdf_publications/1251.pdf). At *32nd Annual IEEE International Computer Software and Applications Conference* (COMPSAC), July 2008. [doi:10.1109/COMPSAC.2008.50](https://doi.org/10.1109/COMPSAC.2008.50) 
[^98]: Enrico Zaninotto. [From X programming to the X organisation](https://martinfowler.com/articles/zaninotto.pdf). At *XP Conference*, May 2002. Archived at [perma.cc/R9AR-QCKZ](https://perma.cc/R9AR-QCKZ)


================================================
FILE: content/zh/ch3.md
================================================
---
title: "3. 数据模型与查询语言"
weight: 103
breadcrumbs: false
---

<a id="ch_datamodels"></a>

![](/map/ch02.png)

> *语言的边界就是世界的边界。*
>
> 路德维希・维特根斯坦，《逻辑哲学论》（1922）

数据模型或许是开发软件最重要的部分，因为它们有着深远的影响：不仅影响软件的编写方式，还影响我们 **思考问题** 的方式。

大多数应用程序都是通过层层叠加的数据模型来构建的。每一层的关键问题是：如何用更低层次的数据模型来 **表示** 它？例如：

1. 作为应用程序开发者，你观察现实世界（其中有人员、组织、货物、行为、资金流动、传感器等），并用对象或数据结构，以及操作这些数据结构的 API 来建模。这些结构通常是特定于应用程序的。
2. 当你想要存储这些数据结构时，你用通用的数据模型来表达它们，例如 JSON 或 XML 文档、关系数据库中的表，或者图中的顶点和边。这些数据模型是本章的主题。
3. 构建你的数据库软件的工程师决定了如何用内存、磁盘或网络上的字节来表示文档/关系/图数据。这种表示可能允许以各种方式查询、搜索、操作和处理数据。我们将在 [第 4 章](/ch4#ch_storage) 中讨论这些存储引擎的设计。
4. 在更低的层次上，硬件工程师已经想出了如何用电流、光脉冲、磁场等来表示字节的方法。

在复杂的应用程序中可能有更多的中间层，例如基于 API 之上的 API，但基本思想仍然相同：每一层通过提供一个简洁的数据模型来隐藏下层的复杂性。这些抽象允许不同的人群 —— 例如，数据库供应商的工程师和使用他们数据库的应用程序开发者 —— 有效地合作。

在实践中广泛使用着几种不同的数据模型，通常用于不同的目的。某些类型的数据和某些查询在一种模型中很容易表达，而在另一种模型中则很困难。在本章中，我们将通过比较关系模型、文档模型、基于图的数据模型、事件溯源和数据框来探讨这些权衡。我们还将简要介绍允许你使用这些模型的查询语言。这种比较将帮助你决定何时使用哪种模型。

--------

> [!TIP] 术语：声明式查询语言
> 
> 本章中的许多查询语言（如 SQL、Cypher、SPARQL 或 Datalog）都是 **声明式** 的，这意味着你指定所需数据的模式 —— 
> 结果必须满足什么条件，以及你希望如何转换数据（例如，排序、分组和聚合）—— 但不指定 **如何** 实现该目标。
> 数据库系统的查询优化器可以决定使用哪些索引和哪些连接算法，以及以什么顺序执行查询的各个部分。
> 
> 相比之下，使用大多数编程语言，你必须编写一个 **算法** —— 即告诉计算机以什么顺序执行哪些操作。
> 声明式查询语言很有吸引力，因为它通常更简洁，比显式算法更容易编写。
> 但更重要的是，它还隐藏了查询引擎的实现细节，这使得数据库系统可以在不需要更改任何查询的情况下引入性能改进 [^1]。
> 
> 例如，数据库可能能够跨多个 CPU 核心和机器并行执行声明式查询，而你无需担心如何实现该并行性 [^2]。
> 如果用手写算法，实现这种并行执行将需要大量工作。

--------

## 关系模型与文档模型 {#sec_datamodels_history}

今天最广为人知的数据模型可能是 SQL，它基于 Edgar Codd 在 1970 年提出的关系模型 [^3]：
数据被组织成 **关系**（在 SQL 中称为 **表**），其中每个关系是 **元组**（在 SQL 中称为 **行**）的无序集合。

关系模型最初是一个理论提议，当时许多人怀疑它是否能够高效实现。
然而，到 20 世纪 80 年代中期，关系数据库管理系统（RDBMS）和 SQL 已成为大多数需要存储和查询具有某种规则结构的数据的人的首选工具。
许多数据管理用例在几十年后仍然由关系数据主导 —— 例如，商业分析（参见 ["星型与雪花型：分析模式"](#sec_datamodels_analytics)）。

多年来，出现了许多与数据存储和查询相关的竞争方法。在 20 世纪 70 年代和 80 年代初，**网状模型** 和 **层次模型** 是主要的替代方案，但关系模型最终战胜了它们。
对象数据库在 20 世纪 80 年代末和 90 年代初出现又消失。XML 数据库在 21 世纪初出现，但只获得了小众的采用。
每个关系模型的竞争者在其时代都产生了大量的炒作，但都没有持续下去 [^4]。
相反，SQL 已经发展到在其关系核心之外纳入其他数据类型 —— 例如，增加了对 XML、JSON 和图数据的支持 [^5]。

在 2010 年代，**NoSQL** 是试图推翻关系数据库主导地位的最新流行词。
NoSQL 指的不是单一技术，而是围绕新数据模型、模式灵活性、可伸缩性以及向开源许可模式转变的一系列松散的想法。
一些数据库将自己标榜为 *NewSQL*，因为它们旨在提供 NoSQL 系统的可伸缩性以及传统关系数据库的数据模型和事务保证。
NoSQL 和 NewSQL 的想法在数据系统设计中产生了很大的影响，但随着这些原则被广泛采用，这些术语的使用已经减少。

NoSQL 运动的一个持久影响是 **文档模型** 的流行，它通常将数据表示为 JSON。
这个模型最初由专门的文档数据库（如 MongoDB 和 Couchbase）推广，尽管大多数关系数据库现在也增加了 JSON 支持。
与通常被视为具有严格和不灵活模式的关系表相比，JSON 文档被认为更加灵活。

文档和关系数据的优缺点已经被广泛讨论；让我们来看看该辩论的一些关键点。

### 对象关系不匹配 {#sec_datamodels_document}

如今，大部分应用程序开发都是使用面向对象的编程语言完成的，这导致了对 SQL 数据模型的常见批评：如果数据存储在关系表中，则需要在应用程序代码中的对象和数据库的表、行、列模型之间建立一个笨拙的转换层。这种模型之间的脱节有时被称为 *阻抗不匹配*。

--------

> [!NOTE]
> 术语 *阻抗不匹配* 借自电子学。每个电路的输入和输出都有一定的阻抗（对交流电的阻力）。当你将一个电路的输出连接到另一个电路的输入时，如果两个电路的输出和输入阻抗匹配，则通过连接的功率传输将最大化。阻抗不匹配可能导致信号反射和其他问题。

--------

#### 对象关系映射（ORM） {#object-relational-mapping-orm}

对象关系映射（ORM）框架（如 ActiveRecord 和 Hibernate）减少了这个转换层所需的样板代码量，但它们经常受到批评 [^6]。一些常见的问题包括：

* ORM 很复杂，无法完全隐藏两种模型之间的差异，因此开发人员仍然需要考虑数据的关系和对象表示。
* ORM 通常仅用于 OLTP 应用程序开发（参见 ["表征事务处理和分析"](/ch1#sec_introduction_oltp)）；为分析目的提供数据的数据工程师仍然需要使用底层的关系表示，因此在使用 ORM 时，关系模式的设计仍然很重要。
* 许多 ORM 仅适用于关系型 OLTP 数据库。拥有多样化数据系统（如搜索引擎、图数据库和 NoSQL 系统）的组织可能会发现 ORM 支持不足。
* 一些 ORM 会自动生成关系模式，但这些模式对于直接访问关系数据的用户来说可能很尴尬，并且在底层数据库上可能效率低下。自定义 ORM 的模式和查询生成可能很复杂，并否定了首先使用 ORM 的好处。
* ORM 使得意外编写低效查询变得容易，例如 *N+1 查询问题* [^7]。例如，假设你想在页面上显示用户评论列表，因此你执行一个返回 *N* 条评论的查询，每条评论都包含其作者的 ID。要显示评论作者的姓名，你需要在用户表中查找 ID。在手写 SQL 中，你可能会在查询中执行此连接并返回每个评论的作者姓名，但使用 ORM 时，你可能最终会为 *N* 条评论中的每一条在用户表上进行单独的查询以查找其作者，总共产生 *N*+1 个数据库查询，这比在数据库中执行连接要慢。为了避免这个问题，你可能需要告诉 ORM 在获取评论的同时获取作者信息。

然而，ORM 也有优势：

* 对于非常适合关系模型的数据，持久关系和内存对象表示之间的某种转换是不可避免的，ORM 减少了这种转换所需的样板代码量。复杂的查询可能仍然需要在 ORM 之外处理，但 ORM 可以帮助处理简单和重复的情况。
* 一些 ORM 有助于缓存数据库查询的结果，这可以帮助减少数据库的负载。
* ORM 还可以帮助管理模式迁移和其他管理活动。

#### 用于一对多关系的文档数据模型 {#the-document-data-model-for-one-to-many-relationships}

并非所有数据都很适合关系表示；让我们通过一个例子来探讨关系模型的局限性。[图 3-1](#fig_obama_relational) 说明了如何在关系模式中表达简历（LinkedIn 个人资料）。整个个人资料可以通过唯一标识符 `user_id` 来识别。像 `first_name` 和 `last_name` 这样的字段每个用户只出现一次，因此它们可以建模为 `users` 表上的列。

大多数人在职业生涯中有多份工作（职位），人们可能有不同数量的教育经历和任意数量的联系信息。表示这种 *一对多关系* 的一种方法是将职位、教育和联系信息放在单独的表中，并使用外键引用 `users` 表，如 [图 3-1](#fig_obama_relational) 所示。

{{< figure src="/fig/ddia_0301.png" id="fig_obama_relational" caption="图 3-1. 使用关系模式表示 LinkedIn 个人资料。" class="w-full my-4" >}}

另一种表示相同信息的方式，可能更自然并且更接近应用程序代码中的对象结构，是作为 JSON 文档，如 [示例 3-1](#fig_obama_json) 所示。

{{< figure id="fig_obama_json" title="示例 3-1. 将 LinkedIn 个人资料表示为 JSON 文档" class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "headline": "Former President of the United States of America",
    "region_id": "us:91",
    "photo_url": "/p/7/000/253/05b/308dd6e.jpg",
    "positions": [
        {"job_title": "President", "organization": "United States of America"},
        {"job_title": "US Senator (D-IL)", "organization": "United States Senate"}
    ],
    "education": [
        {"school_name": "Harvard University", "start": 1988, "end": 1991},
        {"school_name": "Columbia University", "start": 1981, "end": 1983}
    ],
    "contact_info": {
        "website": "https://barackobama.com",
        "twitter": "https://twitter.com/barackobama"
    }
}
```

一些开发人员认为 JSON 模型减少了应用程序代码和存储层之间的阻抗不匹配。然而，正如我们将在 [第 5 章](/ch5#ch_encoding) 中看到的，JSON 作为数据编码格式也存在问题。缺乏模式通常被认为是一个优势；我们将在 ["文档模型中的模式灵活性"](#sec_datamodels_schema_flexibility) 中讨论这个问题。

与 [图 3-1](#fig_obama_relational) 中的多表模式相比，JSON 表示具有更好的 *局部性*（参见 ["读写的数据局部性"](#sec_datamodels_document_locality)）。如果你想在关系示例中获取个人资料，你需要执行多个查询（通过 `user_id` 查询每个表）或在 `users` 表与其从属表之间执行复杂的多表连接 [^8]。在 JSON 表示中，所有相关信息都在一个地方，使查询既更快又更简单。

从用户个人资料到用户职位、教育历史和联系信息的一对多关系暗示了数据中的树形结构，而 JSON 表示使这种树形结构变得明确（见 [图 3-2](#fig_json_tree)）。

{{< figure src="/fig/ddia_0302.png" id="fig_json_tree" caption="图 3-2. 一对多关系形成树状结构。" class="w-full my-4" >}}

--------

> [!NOTE]
> 这种类型的关系有时被称为 *一对少* 而不是 *一对多*，因为简历通常有少量的职位 [^9] [^10]。在可能存在真正大量相关项目的情况下 —— 比如名人社交媒体帖子上的评论，可能有成千上万条 —— 将它们全部嵌入同一个文档中可能太笨拙了，因此 [图 3-1](#fig_obama_relational) 中的关系方法更可取。

--------

### 规范化、反规范化与连接 {#sec_datamodels_normalization}

在前一节的 [示例 3-1](#fig_obama_json) 中，`region_id` 被给出为 ID，而不是纯文本字符串 `"Washington, DC, United States"`。为什么？

如果用户界面有一个用于输入地区的自由文本字段，将其存储为纯文本字符串是有意义的。但是，拥有标准化的地理区域列表并让用户从下拉列表或自动补全中选择也有其优势：

* 不同个人资料之间的风格和拼写保持一致
* 避免歧义：可能有几个同名的地方（如果字符串只是 "Washington"，它是指 DC 还是华盛顿州？）
* 易于更新 —— 名称只存储在一个地方，因此如果需要更改（例如，由于政治事件而更改城市名称），可以轻松地全面更新
* 本地化支持 —— 当网站被翻译成其他语言时，标准化列表可以被本地化，因此区域可以用查看者的语言显示
* 更好的搜索 —— 例如，搜索美国东海岸的人可以匹配此个人资料，因为区域列表可以编码华盛顿位于东海岸的事实（这从字符串 `"Washington, DC"` 中并不明显）

无论你存储 ID 还是文本字符串，这都是 *规范化* 的问题。当你使用 ID 时，你的数据更加规范化：对人类有意义的信息（如文本 *Washington, DC*）只存储在一个地方，所有引用它的地方都使用 ID（它只在数据库中有意义）。当你直接存储文本时，你在使用它的每条记录中都复制了对人类有意义的信息；这种表示是 *反规范化* 的。

使用 ID 的优势在于，因为它对人类没有意义，所以永远不需要更改：即使它标识的信息发生变化，ID 也可以保持不变。任何对人类有意义的东西将来某个时候可能需要更改 —— 如果该信息被复制，所有冗余副本都需要更新。这需要更多的代码、更多的写操作、更多的磁盘空间，并且存在不一致的风险（其中一些信息副本被更新但其他的没有）。

规范化表示的缺点是，每次要显示包含 ID 的记录时，都必须进行额外的查找以将 ID 解析为人类可读的内容。在关系数据模型中，这是使用 *连接* 完成的，例如：

```sql
SELECT users.*, regions.region_name
    FROM users
    JOIN regions ON users.region_id = regions.id
    WHERE users.id = 251;
```

文档数据库可以存储规范化和反规范化的数据，但它们通常与反规范化相关联 —— 部分是因为 JSON 数据模型使得存储额外的反规范化字段变得容易，部分是因为许多文档数据库中对连接的弱支持使得规范化不方便。一些文档数据库根本不支持连接，因此你必须在应用程序代码中执行它们 —— 也就是说，你首先获取包含 ID 的文档，然后执行第二个查询将该 ID 解析为另一个文档。在 MongoDB 中，也可以使用聚合管道中的 `$lookup` 算子执行连接：

```mongodb-json
db.users.aggregate([
    { $match: { _id: 251 } },
    { $lookup: {
        from: "regions",
        localField: "region_id",
        foreignField: "_id",
        as: "region"
    } }
])
```

#### 规范化的权衡 {#trade-offs-of-normalization}

在简历示例中，虽然 `region_id` 字段是对标准化区域集的引用，但 `organization`（人工作的公司或政府）和 `school_name`（他们学习的地方）的名称只是字符串。这种表示是反规范化的：许多人可能在同一家公司工作过，但没有 ID 将他们联系起来。

也许组织和学校应该是实体，个人资料应该引用它们的 ID 而不是它们的名称？引用区域 ID 的相同论点也适用于此。例如，假设我们想在他们的名字之外包括学校或公司的标志：

* 在反规范化表示中，我们会在每个人的个人资料中包含标志的图像 URL；这使得 JSON 文档自包含，但如果我们需要更改标志，就会产生麻烦，因为我们现在需要找到旧 URL 的所有出现并更新它们 [^9]。
* 在规范化表示中，我们将创建一个代表组织或学校的实体，并在该实体上存储其名称、标志 URL 以及可能的其他属性（描述、新闻提要等）一次。然后，每个提到该组织的简历都会简单地引用其 ID，更新标志很容易。

作为一般原则，规范化数据通常写入更快（因为只有一个副本），但查询更慢（因为它需要连接）；反规范化数据通常读取更快（连接更少），但写入更昂贵（更多副本要更新，使用更多磁盘空间）。你可能会发现将反规范化视为派生数据的一种形式很有帮助（参见 ["记录系统与派生数据"](/ch1#sec_introduction_derived)），因为你需要设置一个过程来更新数据的冗余副本。

除了执行所有这些更新的成本之外，如果进程在进行更新的过程中崩溃，你还需要考虑数据库的一致性。提供原子事务的数据库（参见 ["原子性"](/ch8#sec_transactions_acid_atomicity)）使保持一致性变得更容易，但并非所有数据库都在多个文档之间提供原子性。通过流处理确保一致性也是可能的，我们将在 ["保持系统同步"](/ch12#sec_stream_sync) 中讨论。

规范化往往更适合 OLTP 系统，其中读取和更新都需要快速；分析系统通常使用反规范化数据表现更好，因为它们批量执行更新，只读查询的性能是主要关注点。此外，在中小规模的系统中，规范化数据模型通常是最好的，因为你不必担心保持数据的多个副本相互一致，执行连接的成本是可以接受的。然而，在非常大规模的系统中，连接的成本可能会成为问题。

#### 社交网络案例研究中的反规范化 {#denormalization-in-the-social-networking-case-study}

在 ["案例研究：社交网络首页时间线"](/ch2#sec_introduction_twitter) 中，我们比较了规范化表示（[图 2-1](/ch2#fig_twitter_relational)）和反规范化表示（预计算的物化时间线）：这里，`posts` 和 `follows` 之间的连接太昂贵了，物化时间线是该连接结果的缓存。将新帖子插入关注者时间线的扇出过程是我们保持反规范化表示一致的方式。

然而，X（前 Twitter）的物化时间线实现实际上并不存储每个帖子的实际文本：每个条目实际上只存储帖子 ID、发布者的用户 ID，以及一些额外的信息来识别转发和回复 [^11]。换句话说，它大致是以下查询的预计算结果：

```sql
SELECT posts.id, posts.sender_id 
    FROM posts
    JOIN follows ON posts.sender_id = follows.followee_id
    WHERE follows.follower_id = current_user
    ORDER BY posts.timestamp DESC
    LIMIT 1000
```

这意味着每当读取时间线时，服务仍然需要执行两个连接：通过 ID 查找帖子以获取实际的帖子内容（以及点赞数和回复数等统计信息），并通过 ID 查找发送者的个人资料（以获取他们的用户名、个人资料图片和其他详细信息）。这个将 ID 补全为人类可读信息的过程称为 *hydrating* ID，本质上是在应用程序代码中执行的连接 [^11]。

在预计算时间线中仅存储 ID 的原因是它们引用的数据变化很快：热门帖子的点赞数和回复数可能每秒变化多次，一些用户定期更改他们的用户名或个人资料照片。由于时间线在查看时应该显示最新的点赞数和个人资料图片，因此将此信息反规范化到物化时间线中是没有意义的。此外，这种反规范化会显著增加存储成本。

这个例子表明，在读取数据时必须执行连接并不像有时声称的那样，是创建高性能、可扩展服务的障碍。*hydrating* 帖子 ID 和用户 ID 实际上是一个相当容易扩展的操作，因为它可以很好地并行化，并且成本不取决于你关注的账户数量或你拥有的关注者数量。

如果你需要决定是否在应用程序中反规范化某些内容，社交网络案例研究表明选择并不是立即显而易见的：最可扩展的方法可能涉及反规范化某些内容并保持其他内容规范化。你必须仔细考虑信息更改的频率以及读写成本（这可能由异常值主导，例如在典型社交网络的情况下拥有许多关注/关注者的用户）。规范化和反规范化本身并无好坏之分 —— 它们只是在读写性能以及实施工作量方面的权衡。

### 多对一与多对多关系 {#sec_datamodels_many_to_many}

虽然 [图 3-1](#fig_obama_relational) 中的 `positions` 和 `education` 是一对多或一对少关系的例子（一份简历有多个职位，但每个职位只属于一份简历），但 `region_id` 字段是 *多对一* 关系的例子（许多人住在同一个地区，但我们假设每个人在任何时候只住在一个地区）。

如果我们为组织和学校引入实体，并通过 ID 从简历中引用它们，那么我们也有 *多对多* 关系（一个人曾为多个组织工作，一个组织有多个过去或现在的员工）。在关系模型中，这种关系通常表示为 *关联表* 或 *连接表*，如 [图 3-3](#fig_datamodels_m2m_rel) 所示：每个职位将一个用户 ID 与一个组织 ID 关联起来。

{{< figure src="/fig/ddia_0303.png" id="fig_datamodels_m2m_rel" caption="图 3-3. 关系模型中的多对多关系。" class="w-full my-4" >}}

多对一和多对多关系不容易适应一个自包含的 JSON 文档；它们更适合规范化表示。在文档模型中，一种可能的表示如 [示例 3-2](#fig_datamodels_m2m_json) 所示，并在 [图 3-4](#fig_datamodels_many_to_many) 中说明：每个虚线矩形内的数据可以分组到一个文档中，但到组织和学校的链接最好表示为对其他文档的引用。

{{< figure id="fig_datamodels_m2m_json" title="示例 3-2. 通过 ID 引用组织的简历。" class="w-full my-4" >}}

```json
{
    "user_id": 251,
    "first_name": "Barack",
    "last_name": "Obama",
    "positions": [
        {"start": 2009, "end": 2017, "job_title": "President", "org_id": 513},
        {"start": 2005, "end": 2008, "job_title": "US Senator (D-IL)", "org_id": 514}
    ],
    ...
}
```

{{< figure src="/fig/ddia_0304.png" id="fig_datamodels_many_to_many" caption="图 3-4. 文档模型中的多对多关系：每个虚线框内的数据可以分组到一个文档中。" class="w-full my-4" >}}

多对多关系通常需要"双向"查询：例如，找到特定人员工作过的所有组织，以及找到在特定组织工作过的所有人员。启用此类查询的一种方法是在两边都存储 ID 引用，即简历包含该人工作过的每个组织的 ID，组织文档包含提到该组织的简历的 ID。这种表示是反规范化的，因为关系存储在两个地方，可能会相互不一致。

规范化表示仅在一个地方存储关系，并依赖 *二级索引*（我们将在 [第 4 章](/ch4#ch_storage) 中讨论）来允许有效地双向查询关系。在 [图 3-3](#fig_datamodels_m2m_rel) 的关系模式中，我们会告诉数据库在 `positions` 表的 `user_id` 和 `org_id` 列上创建索引。

在 [示例 3-2](#fig_datamodels_m2m_json) 的文档模型中，数据库需要索引 `positions` 数组内对象的 `org_id` 字段。许多文档数据库和具有 JSON 支持的关系数据库能够在文档内的值上创建此类索引。

### 星型与雪花型：分析模式 {#sec_datamodels_analytics}

数据仓库（参见 ["数据仓库"](/ch1#sec_introduction_dwh)）通常是关系型的，并且数据仓库中表结构有一些广泛使用的约定：*星型模式*、*雪花模式*、*维度建模* [^12]，以及 *一张大表*（OBT）。这些结构针对业务分析师的需求进行了优化。ETL 过程将来自运营系统的数据转换为此模式。

[图 3-5](#fig_dwh_schema) 显示了一个可能在杂货零售商的数据仓库中找到的星型模式示例。模式的中心是所谓的 *事实表*（在此示例中，它称为 `fact_sales`）。事实表的每一行代表在特定时间发生的事件（这里，每一行代表客户购买产品）。如果我们分析的是网站流量而不是零售销售，每一行可能代表用户的页面查看或点击。

{{< figure src="/fig/ddia_0305.png" id="fig_dwh_schema" caption="图 3-5. 用于数据仓库的星型模式示例。" class="w-full my-4" >}}

通常，事实被捕获为单个事件，因为这允许以后最大的分析灵活性。然而，这意味着事实表可能变得非常大。一个大型企业可能在其数据仓库中有许多 PB 的交易历史，主要表示为事实表。

事实表中的一些列是属性，例如产品售出的价格和从供应商那里购买它的成本（允许计算利润率）。事实表中的其他列是对其他表的外键引用，称为 *维度表*。由于事实表中的每一行代表一个事件，维度代表事件的 *谁*、*什么*、*哪里*、*何时*、*如何* 和 *为什么*。

例如，在 [图 3-5](#fig_dwh_schema) 中，其中一个维度是售出的产品。`dim_product` 表中的每一行代表一种待售产品类型，包括其库存单位（SKU）、描述、品牌名称、类别、脂肪含量、包装尺寸等。`fact_sales` 表中的每一行使用外键来指示在该特定交易中售出了哪种产品。查询通常涉及对多个维度表的多个连接。

即使日期和时间也经常使用维度表表示，因为这允许编码有关日期的附加信息（例如公共假期），允许查询区分假期和非假期的销售。

[图 3-5](#fig_dwh_schema) 是星型模式的一个例子。该名称来自这样一个事实：当表关系被可视化时，事实表位于中间，被其维度表包围；到这些表的连接就像星星的光芒。

这个模板的一个变体被称为 *雪花模式*，其中维度被进一步分解为子维度。例如，品牌和产品类别可能有单独的表，`dim_product` 表中的每一行都可以将品牌和类别作为外键引用，而不是将它们作为字符串存储在 `dim_product` 表中。雪花模式比星型模式更规范化，但星型模式通常更受欢迎，因为它们对分析师来说更简单 [^12]。

在典型的数据仓库中，表通常非常宽：事实表通常有超过 100 列，有时有几百列。维度表也可能很宽，因为它们包括所有可能与分析相关的元数据 —— 例如，`dim_store` 表可能包括每个商店提供哪些服务的详细信息、是否有店内面包房、平方英尺、商店首次开业的日期、最后一次改造的时间、距离最近的高速公路有多远等。

星型或雪花模式主要由多对一关系组成（例如，许多笔销售对应同一种产品、发生在同一家商店），表示为事实表对维度表的外键，或维度对子维度的外键。原则上，其他类型的关系可能存在，但它们通常被反规范化以简化查询。例如，如果客户一次购买多种不同的产品，则该多项交易不会被明确表示；相反，事实表中为每个购买的产品都有一个单独的行，这些事实都恰好具有相同的客户 ID、商店 ID 和时间戳。

一些数据仓库模式进一步进行反规范化，完全省略维度表，将维度中的信息折叠到事实表上的反规范化列中（本质上是预计算事实表和维度表之间的连接）。这种方法被称为 *一张大表*（OBT），虽然它需要更多的存储空间，但有时可以实现更快的查询 [^13]。

在分析的背景下，这种反规范化是没有问题的，因为数据通常代表不会改变的历史数据日志（除了偶尔纠正错误）。OLTP 系统中反规范化出现的数据一致性和写入开销问题在分析中并不那么紧迫。

### 何时使用哪种模型 {#sec_datamodels_document_summary}

文档数据模型的主要论点是模式灵活性、由于局部性而获得更好的性能，以及对于某些应用程序来说，它更接近应用程序使用的对象模型。关系模型通过为连接、多对一和多对多关系提供更好的支持来反击。让我们更详细地研究这些论点。

如果你的应用程序中的数据具有类似文档的结构（即一对多关系的树，通常一次加载整个树），那么使用文档模型可能是个好主意。将类似文档的结构 *切碎*（shredding）为多个表的关系技术（如 [图 3-1](#fig_obama_relational) 中的 `positions`、`education` 和 `contact_info`）可能导致繁琐的模式和不必要复杂的应用程序代码。

文档模型有局限性：例如，你不能直接引用文档中的嵌套项，而是需要说类似"用户 251 的职位列表中的第二项"之类的话。如果你确实需要引用嵌套项，关系方法效果更好，因为你可以通过其 ID 直接引用任何项。

一些应用程序允许用户选择项目的顺序：例如，想象一个待办事项列表或问题跟踪器，用户可以拖放任务来重新排序它们。文档模型很好地支持此类应用程序，因为项目（或它们的 ID）可以简单地存储在 JSON 数组中以确定它们的顺序。在关系数据库中，没有表示此类可重新排序列表的标准方法，并且使用各种技巧：按整数列排序（在插入中间时需要重新编号）、ID 的链表或分数索引 [^14] [^15] [^16]。

#### 文档模型中的模式灵活性 {#sec_datamodels_schema_flexibility}

大多数文档数据库以及关系数据库中的 JSON 支持不会对文档中的数据强制执行任何模式。关系数据库中的 XML 支持通常带有可选的模式验证。没有模式意味着可以将任意键和值添加到文档中，并且在读取时，客户端不能保证文档可能包含哪些字段。

文档数据库有时被称为 *无模式*，但这是误导性的，因为读取数据的代码通常假设某种结构 —— 即存在隐式模式，但数据库不强制执行 [^17]。更准确的术语是 *读时模式*（数据的结构是隐式的，只有在读取数据时才解释），与 *写时模式*（关系数据库的传统方法，其中模式是显式的，数据库确保所有数据在写入时都符合它）形成对比 [^18]。

读时模式类似于编程语言中的动态（运行时）类型检查，而写时模式类似于静态（编译时）类型检查。正如静态和动态类型检查的倡导者对它们的相对优点有很大的争论 [^19]，数据库中模式的强制执行是一个有争议的话题，通常没有正确或错误的答案。

当应用程序想要更改其数据格式时，这些方法之间的差异特别明显。例如，假设你当前在一个字段中存储每个用户的全名，而你想要分别存储名字和姓氏 [^20]。在文档数据库中，你只需开始编写具有新字段的新文档，并在应用程序中编写处理读取旧文档时的代码。例如：

```mongodb-json
if (user && user.name && !user.first_name) {
    // 2023 年 12 月 8 日之前写入的文档没有 first_name
    user.first_name = user.name.split(" ")[0];
}
```

这种方法的缺点是，从数据库读取的应用程序的每个部分现在都需要处理可能很久以前写入的旧格式的文档。另一方面，在写时模式数据库中，你通常会执行 *迁移*，如下所示：

```sql
ALTER TABLE users ADD COLUMN first_name text DEFAULT NULL;
UPDATE users SET first_name = split_part(name, ' ', 1); -- PostgreSQL
UPDATE users SET first_name = substring_index(name, ' ', 1); -- MySQL
```

在大多数关系数据库中，添加具有默认值的列即使在大表上也是快速且无问题的。然而，在大表上运行 `UPDATE` 语句可能会很慢，因为每一行都需要重写，其他模式操作（例如更改列的数据类型）通常也需要复制整个表。

存在各种工具允许在后台执行此类模式更改而无需停机 [^21] [^22] [^23] [^24]，但在大型数据库上执行此类迁移在操作上仍然具有挑战性。通过仅添加默认值为 `NULL` 的 `first_name` 列（这很快）并在读取时填充它，可以避免复杂的迁移，就像你在文档数据库中所做的那样。

如果集合中的项目由于某种原因并非都具有相同的结构（即数据是异构的），则读时模式方法是有利的 —— 例如，因为：

* 有许多不同类型的对象，将每种类型的对象放在自己的表中是不切实际的。
* 数据的结构由你无法控制且可能随时更改的外部系统决定。

在这样的情况下，模式可能弊大于利，无模式文档可能是更自然的数据模型。但在所有记录都应具有相同结构的情况下，模式是记录和强制该结构的有用机制。我们将在 [第 5 章](/ch5#ch_encoding) 中更详细地讨论模式和模式演化。

#### 读写的数据局部性 {#sec_datamodels_document_locality}

文档通常存储为单个连续字符串，编码为 JSON、XML 或二进制变体（如 MongoDB 的 BSON）。如果你的应用程序经常需要访问整个文档（例如，在网页上渲染它），则这种 *存储局部性* 具有性能优势。如果数据分布在多个表中，如 [图 3-1](#fig_obama_relational) 所示，则需要多次索引查找才能检索所有数据，这可能需要更多的磁盘寻道并花费更多时间。

局部性优势仅在你同时需要文档的大部分时才适用。数据库通常需要加载整个文档，如果你只需要访问大文档的一小部分，这可能会浪费。在文档更新时，通常需要重写整个文档。由于这些原因，通常建议你保持文档相当小，并避免频繁对文档进行小的更新。

然而，将相关数据存储在一起以获得局部性的想法并不限于文档模型。例如，Google 的 Spanner 数据库在关系数据模型中提供相同的局部性属性，允许模式声明表的行应该交错（嵌套）在父表中 [^25]。Oracle 允许相同的功能，使用称为 *多表索引集群表* 的功能 [^26]。由 Google 的 Bigtable 推广并在 HBase 和 Accumulo 等中使用的 *宽列* 数据模型具有 *列族* 的概念，其目的类似于管理局部性 [^27]。

#### 文档的查询语言 {#query-languages-for-documents}

关系数据库和文档数据库之间的另一个区别是你用来查询它的语言或 API。大多数关系数据库使用 SQL 查询，但文档数据库更加多样化。一些只允许通过主键进行键值访问，而另一些还提供二级索引来查询文档内的值，有些提供丰富的查询语言。

XML 数据库通常使用 XQuery 和 XPath 查询，它们旨在允许复杂的查询，包括跨多个文档的连接，并将其结果格式化为 XML [^28]。JSON Pointer [^29] 和 JSONPath [^30] 为 JSON 提供了等效于 XPath 的功能。

MongoDB 的聚合管道，我们在 ["规范化、反规范化与连接"](#sec_datamodels_normalization) 中看到了其用于连接的 `$lookup` 算子，是 JSON 文档集合查询语言的一个例子。

让我们看另一个例子来感受这种语言 —— 这次是聚合，这对分析特别需要。想象你是一名海洋生物学家，每次你在海洋中看到动物时，你都会向数据库添加一条观察记录。现在你想生成一份报告，说明你每个月看到了多少条鲨鱼。在 PostgreSQL 中，你可能会这样表达该查询：

```sql
SELECT date_trunc('month', observation_timestamp) AS observation_month, ❶ 
    sum(num_animals) AS total_animals
FROM observations
WHERE family = 'Sharks'
GROUP BY observation_month;
```

❶ : `date_trunc('month', timestamp)` 函数确定包含 `timestamp` 的日历月，并返回表示该月开始的另一个时间戳。换句话说，它将时间戳向下舍入到最近的月份。

此查询首先过滤观察结果以仅显示 `Sharks` 家族中的物种，然后按它们发生的日历月对观察结果进行分组，最后将该月所有观察中看到的动物数量相加。可以使用 MongoDB 的聚合管道表达相同的查询，如下所示：

```mongodb-json
db.observations.aggregate([
    { $match: { family: "Sharks" } },
    { $group: {
        _id: {
            year: { $year: "$observationTimestamp" },
            month: { $month: "$observationTimestamp" }
        },
        totalAnimals: { $sum: "$numAnimals" }
    } }
]);
```

聚合管道语言在表达能力上类似于 SQL 的子集，但它使用基于 JSON 的语法而不是 SQL 的英语句子风格语法；差异可能是品味问题。

#### 文档和关系数据库的融合 {#convergence-of-document-and-relational-databases}

文档数据库和关系数据库最初是非常不同的数据管理方法，但随着时间的推移，它们变得更加相似 [^31]。关系数据库增加了对 JSON 类型和查询算子的支持，以及索引文档内属性的能力。一些文档数据库（如 MongoDB、Couchbase 和 RethinkDB）增加了对连接、二级索引和声明式查询语言的支持。

模型的这种融合对应用程序开发人员来说是个好消息，因为当你可以在同一个数据库中组合两者时，关系模型和文档模型效果最好。许多文档数据库需要对其他文档进行关系式引用，许多关系数据库中也有一些部分能从模式灵活性中受益。关系-文档混合是一个强大的组合。

--------

> [!NOTE]
> Codd 对关系模型的原始描述 [^3] 实际上允许在关系模式中存在类似于 JSON 的东西。他称之为 *非简单域*。这个想法是，行中的值不必只是原始数据类型（如数字或字符串），但它也可以是嵌套关系（表）—— 所以你可以有一个任意嵌套的树结构作为值，很像 30 多年后添加到 SQL 的 JSON 或 XML 支持。

--------


## 图数据模型 {#sec_datamodels_graph}

我们之前看到，关系类型是不同数据模型之间的重要区别特征。如果你的应用程序主要具有一对多关系（树形结构数据）并且记录之间很少有其他关系，则文档模型是合适的。

但是，如果你的数据中多对多关系非常常见呢？关系模型可以处理多对多关系的简单情况，但随着数据内部连接变得更加复杂，开始将数据建模为图变得更加自然。

图由两种对象组成：*顶点*（也称为 *节点* 或 *实体*）和 *边*（也称为 *关系* 或 *弧*）。许多类型的数据可以建模为图。典型的例子包括：

社交图
: 顶点是人，边表示哪些人相互认识。

网页图
: 顶点是网页，边表示指向其他页面的 HTML 链接。

道路或铁路网络
: 顶点是交叉点，边表示它们之间的道路或铁路线。

众所周知的算法可以在这些图上运行：例如，地图导航应用程序搜索道路网络中两点之间的最短路径，PageRank 可用于网页图以确定网页的受欢迎程度，从而确定其在搜索结果中的排名 [^32]。

图可以用几种不同的方式表示。在 *邻接表* 模型中，每个顶点存储其相距一条边的邻居顶点的 ID。或者，你可以使用 *邻接矩阵*，这是一个二维数组，其中每一行和每一列对应一个顶点，当行顶点和列顶点之间没有边时值为零，如果有边则值为一。邻接表适合图遍历，矩阵适合机器学习（参见 ["数据框、矩阵与数组"](#sec_datamodels_dataframes)）。

在刚才给出的示例中，图中的所有顶点都表示相同类型的事物（分别是人、网页或道路交叉点）。然而，图不限于这种 *同质* 数据：图的一个同样强大的用途是提供一种一致的方式在单个数据库中存储完全不同类型的对象。例如：

* Facebook 维护一个包含许多不同类型顶点和边的单一图：顶点表示人员、位置、事件、签到和用户发表的评论；边表示哪些人彼此是朋友、哪个签到发生在哪个位置、谁评论了哪个帖子、谁参加了哪个事件等等 [^33]。
* 知识图被搜索引擎用来记录搜索查询中经常出现的实体（如组织、人员和地点）的事实 [^34]。这些信息通过爬取和分析网站上的文本获得；一些网站（如 Wikidata）也以结构化形式发布图数据。

在图中构建和查询数据有几种不同但相关的方式。在本节中，我们将讨论 *属性图* 模型（由 Neo4j、Memgraph、KùzuDB [^35] 和其他 [^36] 实现）和 *三元组存储* 模型（由 Datomic、AllegroGraph、Blazegraph 和其他实现）。这些模型在它们可以表达的内容方面相当相似，一些图数据库（如 Amazon Neptune）支持两种模型。

我们还将查看图的四种查询语言（Cypher、SPARQL、Datalog 和 GraphQL），以及用于查询图的 SQL 支持。还存在其他图查询语言，如 Gremlin [^37]，但这些将为我们提供代表性的概述。

为了说明这些不同的语言和模型，本节使用 [图 3-6](#fig_datamodels_graph) 中显示的图作为运行示例。它可能取自社交网络或家谱数据库：它显示了两个人，来自爱达荷州的 Lucy 和来自法国圣洛的 Alain。他们已婚并住在伦敦。每个人和每个位置都表示为顶点，它们之间的关系表示为边。此示例将帮助演示一些在图数据库中很容易但在其他模型中很困难的查询。

{{< figure src="/fig/ddia_0306.png" id="fig_datamodels_graph" caption="图 3-6. 图结构数据示例（框表示顶点，箭头表示边）。" class="w-full my-4" >}}

### 属性图 {#id56}

在 *属性图*（也称为 *标记属性图*）模型中，每个顶点包含：

* 唯一标识符
* 标签（字符串），描述此顶点表示的对象类型
* 一组出边
* 一组入边
* 属性集合（键值对）

每条边包含：

* 唯一标识符
* 边开始的顶点（*尾顶点*）
* 边结束的顶点（*头顶点*）
* 描述两个顶点之间关系类型的标签
* 属性集合（键值对）

你可以将图存储视为由两个关系表组成，一个用于顶点，一个用于边，如 [示例 3-3](#fig_graph_sql_schema) 所示（此模式使用 PostgreSQL `jsonb` 数据类型来存储每个顶点或边的属性）。每条边都存储头顶点和尾顶点；如果你想要顶点的入边或出边集，可以分别通过 `head_vertex` 或 `tail_vertex` 查询 `edges` 表。

{{< figure id="fig_graph_sql_schema" title="示例 3-3. 使用关系模式表示属性图" class="w-full my-4" >}}

```sql
CREATE TABLE vertices (
    vertex_id integer PRIMARY KEY,
    label text,
    properties jsonb
);

CREATE TABLE edges (
    edge_id integer PRIMARY KEY,
    tail_vertex integer REFERENCES vertices (vertex_id),
    head_vertex integer REFERENCES vertices (vertex_id),
    label text,
    properties jsonb
);

CREATE INDEX edges_tails ON edges (tail_vertex);
CREATE INDEX edges_heads ON edges (head_vertex);
```

此模型的一些重要方面是：

1. 任何顶点都可以有一条边将其与任何其他顶点连接。没有限制哪些类型的事物可以或不能关联的模式。
2. 给定任何顶点，你可以有效地找到其入边和出边，从而 *遍历* 图 —— 即通过顶点链跟随路径 —— 向前和向后。（这就是为什么 [示例 3-3](#fig_graph_sql_schema) 在 `tail_vertex` 和 `head_vertex` 列上都有索引。）
3. 通过对不同类型的顶点和关系使用不同的标签，你可以在单个图中存储几种不同类型的信息，同时仍保持简洁的数据模型。

边表就像我们在 ["多对一与多对多关系"](#sec_datamodels_many_to_many) 中看到的多对多关联表/连接表，泛化为允许在同一表中存储许多不同类型的关系。标签和属性上也可能有索引，允许有效地找到具有某些属性的顶点或边。

--------

> [!NOTE]
> 图模型的一个限制是边只能将两个顶点相互关联，而关系连接表可以通过在单行上具有多个外键引用来表示三元或甚至更高阶的关系。此类关系可以通过为连接表的每一行创建一个额外的顶点，以及到/从该顶点的边，或者使用 *超图* 在图中表示。

--------

这些功能为数据建模提供了极大的灵活性，如 [图 3-6](#fig_datamodels_graph) 所示。该图显示了一些在传统关系模式中难以表达的内容，例如不同国家的不同区域结构（法国有 *省* 和 *大区*，而美国有 *县* 和 *州*）、历史遗留的怪象（如国中之国，这里暂且忽略主权国家和民族的复杂性），以及不同粒度的数据（Lucy 的当前居住地指定为城市，而她的出生地仅在州级别指定）。

你可以想象扩展图以包括有关 Lucy 和 Alain 或其他人的许多其他事实。例如，你可以使用它来指示他们有哪些食物过敏（通过为每个过敏原引入一个顶点，并在人和过敏原之间设置边以指示过敏），并将过敏原与显示哪些食物含有哪些物质的一组顶点链接。然后你可以编写查询来找出每个人可以安全食用的食物。图适合可演化性：随着你向应用程序添加功能，图可以轻松扩展以适应应用程序数据结构的变化。

### Cypher 查询语言 {#id57}

*Cypher* 是用于属性图的查询语言，最初为 Neo4j 图数据库创建，后来作为 *openCypher* 发展为开放标准 [^38]。除了 Neo4j，Cypher 还得到 Memgraph、KùzuDB [^35]、Amazon Neptune、Apache AGE（在 PostgreSQL 中存储）等的支持。它以电影《黑客帝国》中的角色命名，与密码学中的密码无关 [^39]。

[示例 3-4](#fig_cypher_create) 显示了将 [图 3-6](#fig_datamodels_graph) 的左侧部分插入图数据库的 Cypher 查询。图的其余部分可以类似地添加。每个顶点都被赋予一个符号名称，如 `usa` 或 `idaho`。该名称不存储在数据库中，仅在查询内部使用以在顶点之间创建边，使用箭头符号：`(idaho) -[:WITHIN]-> (usa)` 创建一条标记为 `WITHIN` 的边，其中 `idaho` 作为尾顶点，`usa` 作为头顶点。

{{< figure link="#fig_datamodels_graph" id="fig_cypher_create" title="示例 3-4. 图 3-6 中数据的子集，表示为 Cypher 查询" class="w-full my-4" >}}

```
CREATE
    (namerica :Location {name:'North America', type:'continent'}),
    (usa :Location {name:'United States', type:'country' }),
    (idaho :Location {name:'Idaho', type:'state' }),
    (lucy :Person {name:'Lucy' }),
    (idaho) -[:WITHIN ]-> (usa) -[:WITHIN]-> (namerica),
    (lucy) -[:BORN_IN]-> (idaho)
```

当 [图 3-6](#fig_datamodels_graph) 的所有顶点和边都添加到数据库后，我们可以开始提出有趣的问题：例如，*查找所有从美国移民到欧洲的人的姓名*。也就是说，找到所有具有指向美国境内位置的 `BORN_IN` 边，以及指向欧洲境内位置的 `LIVES_IN` 边的顶点，并返回每个顶点的 `name` 属性。

[示例 3-5](#fig_cypher_query) 显示了如何在 Cypher 中表达该查询。相同的箭头符号用于 `MATCH` 子句中以在图中查找模式：`(person) -[:BORN_IN]-> ()` 匹配由标记为 `BORN_IN` 的边相关的任意两个顶点。该边的尾顶点绑定到变量 `person`，头顶点未命名。

{{< figure id="fig_cypher_query" title="示例 3-5. Cypher 查询查找从美国移民到欧洲的人" class="w-full my-4" >}}

```
MATCH
    (person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (:Location {name:'United States'}),
    (person) -[:LIVES_IN]-> () -[:WITHIN*0..]-> (:Location {name:'Europe'})
RETURN person.name
```

查询可以这样理解：

> 找到满足以下 *两个* 条件的任何顶点（称为 `person`）：
>
> 1. `person` 有一条出边 `BORN_IN` 指向某个顶点。从那个顶点，你可以跟随一条出边 `WITHIN` 链，直到最终到达一个类型为 `Location` 的顶点，其 `name` 属性等于 `"United States"`。
> 2. 同一个 `person` 顶点也有一条出边 `LIVES_IN`。跟随该边，然后是一条出边 `WITHIN` 链，你最终到达一个类型为 `Location` 的顶点，其 `name` 属性等于 `"Europe"`。
>
> 对于每个这样的 `person` 顶点，返回 `name` 属性。

有几种可能的执行查询的方法。这里给出的描述建议你从扫描数据库中的所有人开始，检查每个人的出生地和居住地，并仅返回符合条件的人。

但等效地，你可以从两个 `Location` 顶点开始并向后工作。如果 `name` 属性上有索引，你可以有效地找到表示美国和欧洲的两个顶点。然后你可以通过跟随所有传入的 `WITHIN` 边来查找美国和欧洲各自的所有位置（州、地区、城市等）。最后，你可以寻找可以通过位置顶点之一的传入 `BORN_IN` 或 `LIVES_IN` 边找到的人。

### SQL 中的图查询 {#id58}

[示例 3-3](#fig_graph_sql_schema) 建议图数据可以在关系数据库中表示。但如果我们将图数据放入关系结构中，我们还能使用 SQL 查询它吗？

答案是肯定的，但有一些困难。你在图查询中遍历的每条边实际上都是与 `edges` 表的连接。在关系数据库中，你通常事先知道查询中需要哪些连接。另一方面，在图查询中，你可能需要遍历可变数量的边才能找到你要查找的顶点 —— 也就是说，连接的数量不是预先固定的。

在我们的示例中，这发生在 Cypher 查询中的 `() -[:WITHIN*0..]-> ()` 模式中。一个人的 `LIVES_IN` 边可能指向任何类型的位置：街道、城市、区（district）、地区（region）、州等。一个城市可能在（`WITHIN`）某个地区，该地区在（`WITHIN`）某个州，该州在（`WITHIN`）某个国家，等等。`LIVES_IN` 边可能直接指向你要查找的位置顶点，或者它可能在位置层次结构中相距几个级别。

在 Cypher 中，`:WITHIN*0..` 非常简洁地表达了这个事实：它意味着"跟随 `WITHIN` 边，零次或多次"。它就像正则表达式中的 `*` 算子。

自 SQL:1999 以来，查询中可变长度遍历路径的想法可以使用称为 *递归公用表表达式*（`WITH RECURSIVE` 语法）的东西来表达。[示例 3-6](#fig_graph_sql_query) 显示了相同的查询 —— 查找从美国移民到欧洲的人的姓名 —— 使用此技术在 SQL 中表达。然而，与 Cypher 相比，语法非常笨拙。

{{< figure link="#fig_cypher_query" id="fig_graph_sql_query" title="示例 3-6. 与 示例 3-5 相同的查询，使用递归公用表表达式在 SQL 中编写" class="w-full my-4" >}}

```sql
WITH RECURSIVE

    -- in_usa 是美国境内所有位置的顶点 ID 集合
    in_usa(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'United States' ❶ 
      UNION
        SELECT edges.tail_vertex FROM edges ❷
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'within'
    ),
    
    -- in_europe 是欧洲境内所有位置的顶点 ID 集合
    in_europe(vertex_id) AS (
        SELECT vertex_id FROM vertices
            WHERE label = 'Location' AND properties->>'name' = 'Europe' ❸
      UNION
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'within'
    ),
    
    -- born_in_usa 是所有在美国出生的人的顶点 ID 集合
    born_in_usa(vertex_id) AS ( ❹
        SELECT edges.tail_vertex FROM edges
            JOIN in_usa ON edges.head_vertex = in_usa.vertex_id
            WHERE edges.label = 'born_in'
    ),
    
    -- lives_in_europe 是所有居住在欧洲的人的顶点 ID 集合
    lives_in_europe(vertex_id) AS ( ❺
        SELECT edges.tail_vertex FROM edges
            JOIN in_europe ON edges.head_vertex = in_europe.vertex_id
            WHERE edges.label = 'lives_in'
    )
    
    SELECT vertices.properties->>'name'
    FROM vertices
    -- 连接以找到那些既在美国出生 *又* 居住在欧洲的人
    JOIN born_in_usa ON vertices.vertex_id = born_in_usa.vertex_id ❻
    JOIN lives_in_europe ON vertices.vertex_id = lives_in_europe.vertex_id;
```

❶: 首先找到 `name` 属性值为 `"United States"` 的顶点，并使其成为顶点集 `in_usa` 的第一个元素。

❷: 从集合 `in_usa` 中的顶点跟随所有传入的 `within` 边，并将它们添加到同一集合中，直到访问了所有传入的 `within` 边。

❸: 从 `name` 属性值为 `"Europe"` 的顶点开始执行相同操作，并构建顶点集 `in_europe`。

❹: 对于集合 `in_usa` 中的每个顶点，跟随传入的 `born_in` 边以查找在美国某个地方出生的人。

❺: 类似地，对于集合 `in_europe` 中的每个顶点，跟随传入的 `lives_in` 边以查找居住在欧洲的人。

❻: 最后，通过连接它们来将在美国出生的人的集合与居住在欧洲的人的集合相交。

4 行 Cypher 查询需要 31 行 SQL 的事实表明，正确选择数据模型和查询语言可以产生多大的差异。这只是开始；还有更多细节需要考虑，例如，处理循环，以及在广度优先或深度优先遍历之间进行选择 [^40]。

Oracle 对递归查询有不同的 SQL 扩展，它称之为 *层次查询* [^41]。

然而，情况可能正在改善：在撰写本文时，有计划向 SQL 标准添加一种名为 GQL 的图查询语言 [^42] [^43]，它将提供受 Cypher、GSQL [^44] 和 PGQL [^45] 启发的语法。

### 三元组存储与 SPARQL {#id59}

三元组存储模型大多等同于属性图模型，使用不同的词来描述相同的想法。尽管如此，它仍值得讨论，因为有各种三元组存储的工具和语言，它们可以成为构建应用程序工具箱的宝贵补充。

在三元组存储中，所有信息都以非常简单的三部分语句的形式存储：（*主语*、*谓语*、*宾语*）。例如，在三元组（*Jim*、*likes*、*bananas*）中，*Jim* 是主语，*likes* 是谓语（动词），*bananas* 是宾语。

三元组的主语等同于图中的顶点。宾语是两种东西之一：

1. 原始数据类型的值，如字符串或数字。在这种情况下，三元组的谓语和宾语等同于主语顶点上属性的键和值。使用 [图 3-6](#fig_datamodels_graph) 中的示例，（*lucy*、*birthYear*、*1989*）就像一个顶点 `lucy`，其属性为 `{"birthYear": 1989}`。
2. 图中的另一个顶点。在这种情况下，谓语是图中的边，主语是尾顶点，宾语是头顶点。例如，在（*lucy*、*marriedTo*、*alain*）中，主语和宾语 *lucy* 和 *alain* 都是顶点，谓语 *marriedTo* 是连接它们的边的标签。

> [!NOTE]
> 准确地说，提供类似三元组数据模型的数据库通常需要在每个元组上存储一些额外的元数据。例如，AWS Neptune 使用四元组（4-tuples），通过向每个三元组添加图 ID [^46]；Datomic 使用 5 元组，用事务 ID 和一个表示删除的布尔值扩展每个三元组 [^47]。由于这些数据库保留了上面解释的基本 *主语-谓语-宾语* 结构，本书仍然称它们为三元组存储。

[示例 3-7](#fig_graph_n3_triples) 显示了与 [示例 3-4](#fig_cypher_create) 中相同的数据，以称为 *Turtle* 的格式编写为三元组，它是 *Notation3*（*N3*）的子集 [^48]。

{{< figure link="#fig_datamodels_graph" id="fig_graph_n3_triples" title="示例 3-7. 图 3-6 中数据的子集，表示为 Turtle 三元组" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person.
_:lucy :name "Lucy".
_:lucy :bornIn _:idaho.
_:idaho a :Location.
_:idaho :name "Idaho".
_:idaho :type "state".
_:idaho :within _:usa.
_:usa a :Location.
_:usa :name "United States".
_:usa :type "country".
_:usa :within _:namerica.
_:namerica a :Location.
_:namerica :name "North America".
_:namerica :type "continent".
```

在此示例中，图的顶点写为 `_:someName`。该名称在此文件之外没有任何意义；它的存在只是因为否则我们不知道哪些三元组引用同一个顶点。当谓语表示边时，宾语是顶点，如 `_:idaho :within _:usa`。当谓语是属性时，宾语是字符串字面量，如 `_:usa :name "United States"`。

一遍又一遍地重复相同的主语相当繁琐，但幸运的是，你可以使用分号来表达关于同一主语的多个内容。这使得 Turtle 格式非常易读：见 [示例 3-8](#fig_graph_n3_shorthand)。

{{< figure link="#fig_graph_n3_triples" id="fig_graph_n3_shorthand" title="示例 3-8. 编写 示例 3-7 中数据的更简洁方式" class="w-full my-4" >}}

```
@prefix : <urn:example:>.
_:lucy a :Person; :name "Lucy"; :bornIn _:idaho.
_:idaho a :Location; :name "Idaho"; :type "state"; :within _:usa.
_:usa a :Location; :name "United States"; :type "country"; :within _:namerica.
_:namerica a :Location; :name "North America"; :type "continent".
```

--------

> [!TIP] 语义网

一些三元组存储的研究和开发工作是由 *语义网* 推动的，这是 2000 年代初的一项努力，旨在通过不仅以人类可读的网页形式发布数据，还以标准化的机器可读格式发布数据来促进互联网范围的数据交换。尽管最初设想的语义网没有成功 [^49] [^50]，但语义网项目的遗产在几项特定技术中继续存在：*链接数据* 标准（如 JSON-LD [^51]）、生物医学科学中使用的 *本体* [^52]、Facebook 的开放图协议 [^53]（用于链接展开 [^54]）、知识图（如 Wikidata）以及由 [`schema.org`](https://schema.org/) 维护的结构化数据的标准化词汇表。

三元组存储是另一种在其原始用例之外找到用途的语义网技术：即使你对语义网没有兴趣，三元组也可以成为应用程序的良好内部数据模型。

--------

#### RDF 数据模型 {#the-rdf-data-model}

我们在 [示例 3-8](#fig_graph_n3_shorthand) 中使用的 Turtle 语言实际上是在 *资源描述框架*（RDF）[^55] 中编码数据的一种方式，这是为语义网设计的数据模型。RDF 数据也可以用其他方式编码，例如（更冗长地）用 XML，如 [示例 3-9](#fig_graph_rdf_xml) 所示。像 Apache Jena 这样的工具可以在不同的 RDF 编码之间自动转换。

{{< figure link="#fig_graph_n3_shorthand" id="fig_graph_rdf_xml" title="示例 3-9. 示例 3-8 的数据，使用 RDF/XML 语法表示" class="w-full my-4" >}}

```xml
<rdf:RDF xmlns="urn:example:"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">

    <Location rdf:nodeID="idaho">
        <name>Idaho</name>
        <type>state</type>
        <within>
            <Location rdf:nodeID="usa">
                <name>United States</name>
                <type>country</type>
                <within>
                    <Location rdf:nodeID="namerica">
                        <name>North America</name>
                        <type>continent</type>
                    </Location>
                </within>
            </Location>
        </within>
    </Location>

    <Person rdf:nodeID="lucy">
        <name>Lucy</name>
        <bornIn rdf:nodeID="idaho"/>
    </Person>
</rdf:RDF>
```

RDF 有一些怪癖，因为它是为互联网范围的数据交换而设计的。三元组的主语、谓语和宾语通常是 URI。例如，谓语可能是一个 URI，如 `<http://my-company.com/namespace#within>` 或 `<http://my-company.com/namespace#lives_in>`，而不仅仅是 `WITHIN` 或 `LIVES_IN`。这种设计背后的原因是，你应该能够将你的数据与其他人的数据结合起来，如果他们给单词 `within` 或 `lives_in` 附加了不同的含义，你不会发生冲突，因为他们的谓语实际上是 `<http://other.org/foo#within>` 和 `<http://other.org/foo#lives_in>`。

URL `<http://my-company.com/namespace>` 不一定需要解析为任何内容 —— 从 RDF 的角度来看，它只是一个命名空间。为了避免与 `http://` URL 的潜在混淆，本节中的示例使用不可解析的 URI，如 `urn:example:within`。幸运的是，你只需在文件顶部指定一次此前缀，然后就可以忘记它。

#### SPARQL 查询语言 {#the-sparql-query-language}

*SPARQL* 是使用 RDF 数据模型的三元组存储的查询语言 [^56]。（它是 *SPARQL Protocol and RDF Query Language* 的首字母缩略词，发音为 "sparkle"。）它早于 Cypher，由于 Cypher 的模式匹配是从 SPARQL 借用的，它们看起来非常相似。

与之前相同的查询 —— 查找从美国搬到欧洲的人 —— 在 SPARQL 中与在 Cypher 中一样简洁（见 [示例 3-10](#fig_sparql_query)）。

{{< figure link="#fig_cypher_query" id="fig_sparql_query" title="示例 3-10. 与示例 3-5 相同的查询，用 SPARQL 表示" class="w-full my-4" >}}

```
PREFIX : <urn:example:>

SELECT ?personName WHERE {
 ?person :name ?personName.
 ?person :bornIn / :within* / :name "United States".
 ?person :livesIn / :within* / :name "Europe".
}
```

结构非常相似。以下两个表达式是等效的（变量在 SPARQL 中以问号开头）：

```
(person) -[:BORN_IN]-> () -[:WITHIN*0..]-> (location) # Cypher

?person :bornIn / :within* ?location. # SPARQL
```

因为 RDF 不区分属性和边，而只是对两者都使用谓语，所以你可以使用相同的语法来匹配属性。在以下表达式中，变量 `usa` 绑定到任何具有 `name` 属性且其值为字符串 `"United States"` 的顶点：

```
(usa {name:'United States'}) # Cypher

?usa :name "United States". # SPARQL
```

SPARQL 得到 Amazon Neptune、AllegroGraph、Blazegraph、OpenLink Virtuoso、Apache Jena 和各种其他三元组存储的支持 [^36]。

### Datalog：递归关系查询 {#id62}

Datalog 是一种比 SPARQL 或 Cypher 更古老的语言：它源于 20 世纪 80 年代的学术研究 [^57] [^58] [^59]。它在软件工程师中不太为人所知，并且在主流数据库中没有得到广泛支持，但它应该更为人所知，因为它是一种非常有表现力的语言，对于复杂查询特别强大。几个小众数据库，包括 Datomic、LogicBlox、CozoDB 和 LinkedIn 的 LIquid [^60] 使用 Datalog 作为它们的查询语言。

Datalog 实际上基于关系数据模型，而不是图，但它出现在本书的图数据库部分，因为图上的递归查询是 Datalog 的特殊优势。

Datalog 数据库的内容由 *事实* 组成，每个事实对应于关系表中的一行。例如，假设我们有一个包含位置的表 *location*，它有三列：*ID*、*name* 和 *type*。美国是一个国家的事实可以写成 `location(2, "United States", "country")`，其中 `2` 是美国的 ID。一般来说，语句 `table(val1, val2, …​)` 意味着 `table` 包含一行，其中第一列包含 `val1`，第二列包含 `val2`，依此类推。

[示例 3-11](#fig_datalog_triples) 显示了如何在 Datalog 中编写 [图 3-6](#fig_datamodels_graph) 左侧的数据。图的边（`within`、`born_in` 和 `lives_in`）表示为两列连接表。例如，Lucy 的 ID 是 100，爱达荷州的 ID 是 3，所以关系"Lucy 出生在爱达荷州"表示为 `born_in(100, 3)`。

{{< figure link="#fig_datamodels_graph" id="fig_datalog_triples" title="示例 3-11. 图 3-6 中数据的子集，表示为 Datalog 事实" class="w-full my-4" >}}

```
location(1, "North America", "continent").
location(2, "United States", "country").
location(3, "Idaho", "state").

within(2, 1). /* 美国在北美 */
within(3, 2). /* 爱达荷州在美国 */

person(100, "Lucy").
born_in(100, 3). /* Lucy 出生在爱达荷州 */
```

现在我们已经定义了数据，我们可以编写与之前相同的查询，如 [示例 3-12](#fig_datalog_query) 所示。它看起来与 Cypher 或 SPARQL 中的等效查询有点不同，但不要让这吓倒你。Datalog 是 Prolog 的子集，这是一种编程语言，如果你学过计算机科学，你可能见过它。

{{< figure link="#fig_cypher_query" id="fig_datalog_query" title="示例 3-12. 与示例 3-5 相同的查询，用 Datalog 表示" class="w-full my-4" >}}

```sql
within_recursive(LocID, PlaceName) :- location(LocID, PlaceName, _). /* 规则 1 */

within_recursive(LocID, PlaceName) :- within(LocID, ViaID), /* 规则 2 */
 within_recursive(ViaID, PlaceName).

migrated(PName, BornIn, LivingIn) :- person(PersonID, PName), /* 规则 3 */
 born_in(PersonID, BornID),
 within_recursive(BornID, BornIn),
 lives_in(PersonID, LivingID),
 within_recursive(LivingID, LivingIn).

us_to_europe(Person) :- migrated(Person, "United States", "Europe"). /* 规则 4 */
/* us_to_europe 包含行 "Lucy"。 */
```

Cypher 和 SPARQL 直接用 `SELECT` 开始，但 Datalog 一次只迈出一小步。我们定义 *规则* 从底层事实派生新的虚拟表。这些派生表就像（虚拟）SQL 视图：它们不存储在数据库中，但你可以像查询包含存储事实的表一样查询它们。

在 [示例 3-12](#fig_datalog_query) 中，我们定义了三个派生表：`within_recursive`、`migrated` 和 `us_to_europe`。虚拟表的名称和列由每个规则的 `:-` 符号之前出现的内容定义。例如，`migrated(PName, BornIn, LivingIn)` 是一个具有三列的虚拟表：一个人的姓名、他们出生地的名称和他们居住地的名称。

虚拟表的内容由规则的 `:-` 符号之后的部分定义，我们在其中尝试查找表中匹配某种模式的行。例如，`person(PersonID, PName)` 匹配行 `person(100, "Lucy")`，变量 `PersonID` 绑定到值 `100`，变量 `PName` 绑定到值 `"Lucy"`。如果系统可以为 `:-` 算子右侧的 *所有* 模式找到匹配项，则规则适用。当规则适用时，就好像 `:-` 的左侧被添加到数据库中（变量被它们匹配的值替换）。

因此，应用规则的一种可能方式是（如 [图 3-7](#fig_datalog_naive) 所示）：

1. `location(1, "North America", "continent")` 存在于数据库中，因此规则 1 适用。它生成 `within_recursive(1, "North America")`。
2. `within(2, 1)` 存在于数据库中，前一步生成了 `within_recursive(1, "North America")`，因此规则 2 适用。它生成 `within_recursive(2, "North America")`。
3. `within(3, 2)` 存在于数据库中，前一步生成了 `within_recursive(2, "North America")`，因此规则 2 适用。它生成 `within_recursive(3, "North America")`。

通过重复应用规则 1 和 2，`within_recursive` 虚拟表可以告诉我们数据库中位于北美（或任何其他位置）之内的所有位置。

{{< figure link="#fig_datalog_query" src="/fig/ddia_0307.png" id="fig_datalog_naive" title="图 3-7. 使用示例 3-12 中的 Datalog 规则确定爱达荷州在北美。" class="w-full my-4" >}}

> 图 3-7. 使用 [示例 3-12](#fig_datalog_query) 中的 Datalog 规则确定爱达荷州在北美。

现在规则 3 可以找到出生在某个位置 `BornIn` 并居住在某个位置 `LivingIn` 的人。规则 4 使用 `BornIn = 'United States'` 和 `LivingIn = 'Europe'` 调用规则 3，并仅返回匹配搜索的人的姓名。通过查询虚拟 `us_to_europe` 表的内容，Datalog 系统最终得到与早期 Cypher 和 SPARQL 查询相同的答案。

与本章讨论的其他查询语言相比，Datalog 方法需要不同类型的思维。它允许逐条规则地构建复杂查询，一个规则引用其他规则，类似于你将代码分解为相互调用的函数的方式。就像函数可以递归一样，Datalog 规则也可以调用自己，如 [示例 3-12](#fig_datalog_query) 中的规则 2，这使得 Datalog 查询中的图遍历成为可能。

### GraphQL {#id63}

GraphQL 是一种查询语言，从设计上讲，它比我们在本章中看到的其他查询语言限制性更强。GraphQL 的目的是允许在用户设备上运行的客户端软件（如移动应用程序或 JavaScript Web 应用程序前端）请求具有特定结构的 JSON 文档，其中包含渲染其用户界面所需的字段。GraphQL 接口允许开发人员快速更改客户端代码中的查询，而无需更改服务器端 API。

GraphQL 的灵活性是有代价的。采用 GraphQL 的组织通常需要工具将 GraphQL 查询转换为对内部服务的请求，这些服务通常使用 REST 或 gRPC（参见 [第 5 章](/ch5#ch_encoding)）。授权、速率限制和性能挑战是额外的关注点 [^61]。GraphQL 的查询语言也受到限制，因为 GraphQL 查询来自不受信任的来源。该语言不允许任何可能执行成本高昂的操作，否则用户可能通过运行大量昂贵的查询对服务器执行拒绝服务攻击。特别是，GraphQL 不允许递归查询（与 Cypher、SPARQL、SQL 或 Datalog 不同），并且不允许任意搜索条件，如"查找在美国出生并现在居住在欧洲的人"（除非服务所有者特别选择提供此类搜索功能）。

尽管如此，GraphQL 还是很有用的。[示例 3-13](#fig_graphql_query) 显示了如何使用 GraphQL 实现 Discord 或 Slack 等群聊应用程序。查询请求用户有权访问的所有频道，包括频道名称和每个频道中的 50 条最新消息。对于每条消息，它请求时间戳、消息内容以及消息发送者的姓名和个人资料图片 URL。此外，如果消息是对另一条消息的回复，查询还会请求发送者姓名和它所回复的消息内容（可能以较小的字体呈现在回复上方，以提供一些上下文）。

{{< figure id="fig_graphql_query" title="示例 3-13. 群聊应用程序的示例 GraphQL 查询" class="w-full my-4" >}}

```
query ChatApp {
    channels {
        name
        recentMessages(latest: 50) {
            timestamp
            content
            sender {
                fullName
                imageUrl
            }
            replyTo {
                content
                sender {
                    fullName
                }
            }
        }
    }
}
```

[示例 3-14](#fig_graphql_response) 显示了对 [示例 3-13](#fig_graphql_query) 中查询的响应可能是什么样子。响应是一个反映查询结构的 JSON 文档：它正好包含请求的那些属性，不多也不少。这种方法的优点是服务器不需要知道客户端需要哪些属性来渲染用户界面；相反，客户端可以简单地请求它需要的内容。例如，此查询不会为 `replyTo` 消息的发送者请求个人资料图片 URL，但如果用户界面更改为添加该个人资料图片，客户端可以很容易地将所需的 `imageUrl` 属性添加到查询中，而无需更改服务器。

{{< figure link="#fig_graphql_query" id="fig_graphql_response" title="示例 3-14. 对 示例 3-13 中查询的可能响应" class="w-full my-4" >}}

```json
{
    "data": {
        "channels": [
            {
                "name": "#general",
                "recentMessages": [
                    {
                        "timestamp": 1693143014,
                        "content": "Hey! How are y'all doing?",
                        "sender": {"fullName": "Aaliyah", "imageUrl": "https://..."},
                        "replyTo": null
                    },
                    {
                        "timestamp": 1693143024,
                        "content": "Great! And you?",
                        "sender": {"fullName": "Caleb", "imageUrl": "https://..."},
                        "replyTo": {
                            "content": "Hey! How are y'all doing?",
                            "sender": {"fullName": "Aaliyah"}
                        }
                    },
                    ...
```

在 [示例 3-14](#fig_graphql_response) 中，消息发送者的姓名和图像 URL 直接嵌入在消息对象中。如果同一用户发送多条消息，此信息会在每条消息上重复。原则上，可以减少这种重复，但 GraphQL 做出了接受更大响应大小的设计选择，以便更简单地基于数据渲染用户界面。

`replyTo` 字段类似：在 [示例 3-14](#fig_graphql_response) 中，第二条消息是对第一条消息的回复，内容（"Hey!…"）和发送者 Aaliyah 在 `replyTo` 下重复。可以改为返回被回复消息的 ID，但如果该 ID 不在返回的 50 条最新消息中，客户端就必须向服务器发出额外的请求。重复内容使得处理数据变得更加简单。

服务器的数据库可以以更规范化的形式存储数据，并执行必要的连接来处理查询。例如，服务器可能存储消息以及发送者的用户 ID 和它所回复的消息的 ID；当它收到如上所示的查询时，服务器将解析这些 ID 以查找它们引用的记录。但是，客户端只能要求服务器执行 GraphQL 模式中明确提供的连接。

即使对 GraphQL 查询的响应看起来类似于文档数据库的响应，即使它的名称中有"graph"，GraphQL 也可以在任何类型的数据库之上实现 —— 关系型、文档型或图型。


## 事件溯源与 CQRS {#sec_datamodels_events}

在我们迄今为止讨论的所有数据模型中，数据以与写入相同的形式被查询 —— 无论是 JSON 文档、表中的行，还是图中的顶点和边。然而，在复杂的应用程序中，有时很难找到一种能够满足所有不同查询和呈现数据方式的单一数据表示。在这种情况下，以一种形式写入数据，然后从中派生出针对不同类型读取优化的多种表示形式可能是有益的。

我们之前在 ["记录系统与派生数据"](/ch1#sec_introduction_derived) 中看到了这个想法，ETL（参见 ["数据仓库"](/ch1#sec_introduction_dwh)）就是这种派生过程的一个例子。现在我们将进一步深入这个想法。如果我们无论如何都要从一种数据表示派生出另一种，我们可以选择分别针对写入和读取优化的不同表示。如果你只想为写入优化数据建模，而不关心高效查询，你会如何建模？

也许写入数据的最简单、最快速和最具表现力的方式是 *事件日志*：每次你想写入一些数据时，你将其编码为自包含的字符串（可能是 JSON），包括时间戳，然后将其追加到事件序列中。此日志中的事件是 *不可变的*：你永远不会更改或删除它们，你只会向日志追加更多事件（这可能会取代早期事件）。事件可以包含任意属性。

[图 3-8](#fig_event_sourcing) 显示了一个可能来自会议管理系统的示例。会议可能是一个复杂的业务领域：不仅个人参与者可以注册并用信用卡付款，公司也可以批量订购座位，通过发票付款，然后再将座位分配给个人。一些座位可能为演讲者、赞助商、志愿者助手等保留。预订也可能被取消，与此同时，会议组织者可能通过将其移至不同的房间来更改活动的容量。在所有这些情况发生时，简单地计算可用座位数量就成为一个具有挑战性的查询。

{{< figure src="/fig/ddia_0308.png" id="fig_event_sourcing" title="图 3-8. 使用不可变事件日志作为真相来源（权威数据源），并从中派生物化视图。" class="w-full my-4" >}}

在 [图 3-8](#fig_event_sourcing) 中，会议状态的每个变化（例如组织者开放注册，或参与者进行和取消注册）首先被存储为事件。每当事件追加到日志时，几个 *物化视图*（也称为 *投影* 或 *读模型*）也会更新以反映该事件的影响。在会议示例中，可能有一个物化视图收集与每个预订状态相关的所有信息，另一个为会议组织者的仪表板计算图表，第三个为打印参与者徽章的打印机生成文件。

使用事件作为真相来源（权威数据源），并将每个状态变化表达为事件的想法被称为 *事件溯源* [^62] [^63]。维护单独的读优化表示并从写优化表示派生它们的原则称为 *命令查询责任分离（CQRS）* [^64]。这些术语起源于领域驱动设计（DDD）社区，尽管类似的想法已经存在很长时间了，例如 *状态机复制*（参见 ["使用共享日志"](/ch10#sec_consistency_smr)）。

当用户的请求进来时，它被称为 *命令*，首先需要验证。只有在命令已执行并确定有效（例如，请求的预订有足够的可用座位）后，它才成为事实，相应的事件被添加到日志中。因此，事件日志应该只包含有效事件，构建物化视图的事件日志消费者不允许拒绝事件。

在以事件溯源风格建模数据时，建议你使用过去时态命名事件（例如，"座位已预订"），因为事件是记录过去发生的事情的记录。即使用户后来决定更改或取消，他们以前持有预订的事实仍然是真实的，更改或取消是稍后添加的单独事件。

事件溯源与星型模式事实表之间的相似之处（如 ["星型与雪花型：分析模式"](#sec_datamodels_analytics) 中所讨论的）是两者都是过去发生的事件的集合。然而，事实表中的行都具有相同的列集，而在事件溯源中可能有许多不同的事件类型，每种都有不同的属性。此外，事实表是无序集合，而在事件溯源中事件的顺序很重要：如果先进行预订然后取消，以错误的顺序处理这些事件将没有意义。

事件溯源和 CQRS 有几个优点：

* 对于开发系统的人来说，事件更好地传达了 *为什么* 发生某事的意图。例如，理解事件"预订已取消"比理解"`bookings` 表第 4001 行的 `active` 列被设置为 `false`，与该预订相关的三行从 `seat_assignments` 表中删除，并且在 `payments` 表中插入了一行代表退款"更容易。当物化视图处理取消事件时，这些行修改仍可能发生，但当它们由事件驱动时，更新的原因变得更加清晰。
* 事件溯源的关键原则是物化视图以可重现的方式从事件日志派生：你应该始终能够删除物化视图并通过以相同顺序处理相同事件，使用相同代码来重新计算它们。如果视图维护代码中有错误，你可以删除视图并使用新代码重新计算它。查找错误也更容易，因为你可以随意重新运行视图维护代码并检查其行为。
* 你可以有多个物化视图，针对应用程序所需的特定查询进行优化。它们可以存储在与事件相同的数据库中，也可以存储在不同的数据库中，具体取决于你的需求。它们可以使用任何数据模型，并且可以为快速读取而反规范化。你甚至可以只在内存中保留视图并避免持久化它，只要可以在服务重新启动时从事件日志重新计算视图即可。
* 如果你决定以新方式呈现现有信息，很容易从现有事件日志构建新的物化视图。你还可以通过添加新类型的事件或向现有事件类型添加新属性（任何旧事件保持未修改）来发展系统以支持新功能。你还可以将新行为链接到现有事件（例如，当会议参与者取消时，他们的座位可以提供给等候名单上的下一个人）。
* 如果某个事件被错误写入，你可以再把它删掉，这样你就能重建出一个没有这个被删除事件的视图。另一方面，在直接更新和删除数据的数据库中，已提交的事务通常很难撤销。因此，事件溯源可以减少系统中不可逆操作的数量，使其更容易更改（参见 ["可演化性：让变更变得容易"](/ch2#sec_introduction_evolvability)）。
* 事件日志还可以作为系统中发生的所有事情的审计日志，这在需要此类可审计性的受监管行业中很有价值。

然而，事件溯源和 CQRS 也有缺点：

* 如果涉及外部信息，你需要小心。例如，假设一个事件包含以一种货币给出的价格，对于其中一个视图，它需要转换为另一种货币。由于汇率可能会波动，在处理事件时从外部源获取汇率会有问题，因为如果你在另一个日期重新计算物化视图，你会得到不同的结果。为了使事件处理逻辑具有确定性，你要么需要在事件本身中包含汇率，要么有一种方法来查询事件中指示的时间戳处的历史汇率，确保此查询始终为相同的时间戳返回相同的结果。
* 事件不可变的要求会在事件包含用户的个人数据时产生问题，因为用户可能行使他们的权利（例如，根据 GDPR）请求删除他们的数据。如果事件日志是基于每个用户的，你可以删除该用户的整个日志，但如果你的事件日志包含与多个用户相关的事件，这就不起作用了。你可以尝试将个人数据存储在实际事件之外，或者使用密钥对其进行加密，你可以稍后选择删除该密钥，但这也使得在需要时更难重新计算派生状态。
* 如果存在外部可见的副作用，重新处理事件需要小心 —— 例如，你可能不希望每次重建物化视图时都重新发送确认电子邮件。

你可以在任何数据库之上实现事件溯源，但也有一些专门设计来支持这种模式的系统，例如 EventStoreDB、MartenDB（基于 PostgreSQL）和 Axon Framework。你还可以使用消息代理（如 Apache Kafka）来存储事件日志，流处理器可以使物化视图保持最新；我们将在 ["数据变更捕获与事件溯源"](/ch12#sec_stream_event_sourcing) 中回到这些主题。

唯一重要的要求是事件存储系统必须保证所有物化视图都按照事件在日志中出现的完全相同的顺序来处理事件；正如我们将在 [第 10 章](/ch10#ch_consistency) 中看到的，这在分布式系统中并不总是容易实现。


## 数据框、矩阵与数组 {#sec_datamodels_dataframes}

到目前为止，我们在本章中看到的数据模型通常用于事务处理和分析目的（参见 ["分析与运营系统"](/ch1#sec_introduction_analytics)）。还有一些数据模型你可能会在分析或科学环境中遇到，但很少出现在 OLTP 系统中：数据框和多维数字数组（如矩阵）。

数据框是 R 语言、Python 的 Pandas 库、Apache Spark、ArcticDB、Dask 和其他系统支持的数据模型。它们是数据科学家为训练机器学习模型准备数据的流行工具，但它们也广泛用于数据探索、统计数据分析、数据可视化和类似目的。

乍一看，数据框类似于关系数据库中的表或电子表格。它支持对数据框内容执行批量操作的类关系算子：例如，将函数应用于所有行、基于某些条件过滤行、按某些列对行进行分组并聚合其他列，以及基于某个键将一个数据框中的行与另一个数据框连接（关系数据库称为 *连接* 的操作在数据框上通常称为 *合并*）。

数据框通常不是通过声明式查询（如 SQL）而是通过一系列修改其结构和内容的命令来操作的。这符合数据科学家的典型工作流程，他们逐步"整理"数据，使其成为能够找到他们所提问题答案的形式。这些操作通常在数据科学家的数据集私有副本上进行，通常在他们的本地机器上，尽管最终结果可能与其他用户共享。

数据框 API 还提供了远远超出关系数据库提供的各种操作，数据模型的使用方式通常与典型的关系数据建模非常不同 [^65]。例如，数据框的常见用途是将数据从类似关系的表示转换为矩阵或多维数组表示，这是许多机器学习算法期望的输入形式。

[图 3-9](#fig_dataframe_to_matrix) 显示了这种转换的简单示例。左侧是不同用户如何评价各种电影的关系表（评分为 1 到 5），右侧数据已转换为矩阵，其中每列是一部电影，每行是一个用户（类似于电子表格中的 *数据透视表*）。矩阵是 *稀疏* 的，这意味着许多用户-电影组合没有数据，但这没关系。这个矩阵可能有数千列，因此不太适合关系数据库，但数据框和提供稀疏数组的库（如 Python 的 NumPy）可以轻松处理此类数据。

{{< figure src="/fig/ddia_0309.png" id="fig_dataframe_to_matrix" title="图 3-9. 将电影评分的关系数据库转换为矩阵表示。" class="w-full my-4" >}}

矩阵只能包含数字，各种技术用于将非数字数据转换为矩阵中的数字。例如：

* 日期（在 [图 3-9](#fig_dataframe_to_matrix) 的示例矩阵中省略了）可以缩放为某个合适范围内的浮点数。
* 对于只能取一小组固定值之一的列（例如，电影数据库中电影的类型），通常使用 *独热编码*：我们为每个可能的值创建一列（一个用于"喜剧"，一个用于"剧情"，一个用于"恐怖"等），对于代表电影的每一行，我们在对应于该电影类型的列中放置 1，在所有其他列中放置 0。这种表示也很容易推广到适合多种类型的电影。

一旦数据以数字矩阵的形式存在，它就适合线性代数运算，这构成了许多机器学习算法的基础。例如，[图 3-9](#fig_dataframe_to_matrix) 中的数据可能是某个系统的一部分，该系统向用户推荐他们可能喜欢的电影。数据框足够灵活，允许数据从关系形式逐渐演变为矩阵表示，同时让数据科学家控制最适合实现数据分析或模型训练过程目标的表示。

还有像 TileDB [^66] 这样专门存储大型多维数字数组的数据库；它们被称为 *数组数据库*，最常用于科学数据集，如地理空间测量（规则间隔网格上的栅格数据）、医学成像或天文望远镜的观测 [^67]。数据框在金融行业也用于表示 *时间序列数据*，如资产价格和随时间变化的交易 [^68]。

## 总结 {#summary}

数据模型是一个巨大的主题，在本章中，我们快速浏览了各种不同的模型。我们没有空间深入每个模型的所有细节，但希望这个概述足以激发你的兴趣，找出最适合你的应用需求的模型。

*关系模型* 尽管已有半个多世纪的历史，但对许多应用来说仍然是一个重要的数据模型——特别是在数据仓库和商业分析中，关系星型或雪花模式和 SQL 查询无处不在。然而，关系数据的几种替代方案也在其他领域变得流行：

* *文档模型* 针对数据以独立的 JSON 文档形式出现的用例，以及一个文档与另一个文档之间的关系很少的情况。
* *图数据模型* 走向相反的方向，针对任何事物都可能与其他任何事物相关的用例，以及查询可能需要经过多跳遍历才能找到感兴趣的数据的情况（可以使用 Cypher、SPARQL 或 Datalog 中的递归查询来表达）。
* *数据框* 将关系数据推广到大量列，从而在数据库和构成大量机器学习、统计数据分析和科学计算基础的多维数组之间提供桥梁。

在某种程度上，一个模型可以用另一个模型来模拟——例如，图数据可以在关系数据库中表示——但结果可能很别扭，正如我们在 SQL 中对递归查询的支持中看到的那样。

因此，为每个数据模型开发了各种专业数据库，提供针对特定模型优化的查询语言和存储引擎。然而，数据库也有通过添加对其他数据模型的支持来扩展到相邻领域的趋势：例如，关系数据库以 JSON 列的形式添加了对文档数据的支持，文档数据库添加了类似关系的连接，SQL 中对图数据的支持也在逐步改进。

我们讨论的另一个模型是 *事件溯源*，它将数据表示为不可变事件的仅追加日志，这对于建模复杂业务领域中的活动可能是有利的。仅追加日志有利于写入数据（正如我们将在 [第 4 章](/ch4#ch_storage) 中看到的）；为了支持高效查询，事件日志通过 CQRS 转换为读优化的物化视图。

非关系数据模型的一个共同点是，它们通常不会对存储的数据强制执行模式，这可以使应用更容易适应不断变化的需求。然而，你的应用很可能仍然假设数据具有某种结构；这只是模式是显式的（在写入时强制执行）还是隐式的（在读取时假设）的问题。

尽管我们涵盖了很多内容，但仍有数据模型未被提及。仅举几个简短的例子：

* 研究基因组数据的研究人员通常需要执行 *序列相似性搜索*，这意味着获取一个非常长的字符串（代表 DNA 分子）并将其与一个包含大量相似但不相同字符串的大型数据库进行匹配。这里描述的数据库都无法处理这种用法，这就是研究人员编写了像 GenBank [^69] 这样的专门基因组数据库软件的原因。
* 许多金融系统使用具有复式记账的 *账本* 作为其数据模型。这种类型的数据可以在关系数据库中表示，但也有像 TigerBeetle 这样专门研究这种数据模型的数据库。加密货币和区块链通常基于分布式账本，它们的数据模型中也内置了价值转移。
* *全文检索* 可以说是一种经常与数据库一起使用的数据模型。信息检索是一个大型的专业主题，我们不会在本书中详细介绍，但我们将在 ["全文检索"](/ch4#sec_storage_full_text) 中涉及搜索索引和向量搜索。

我们现在必须到此为止了。在下一章中，我们将讨论在 *实现* 本章中描述的数据模型时出现的一些权衡。


### 参考文献

[^1]: Jamie Brandon. [Unexplanations: query optimization works because sql is declarative](https://www.scattered-thoughts.net/writing/unexplanations-sql-declarative/). *scattered-thoughts.net*, February 2024. Archived at [perma.cc/P6W2-WMFZ](https://perma.cc/P6W2-WMFZ)
[^2]: Joseph M. Hellerstein. [The Declarative Imperative: Experiences and Conjectures in Distributed Logic](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-90.pdf). Tech report UCB/EECS-2010-90, Electrical Engineering and Computer Sciences, University of California at Berkeley, June 2010. Archived at [perma.cc/K56R-VVQM](https://perma.cc/K56R-VVQM)
[^3]: Edgar F. Codd. [A Relational Model of Data for Large Shared Data Banks](https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf). *Communications of the ACM*, volume 13, issue 6, pages 377–387, June 1970. [doi:10.1145/362384.362685](https://doi.org/10.1145/362384.362685)
[^4]: Michael Stonebraker and Joseph M. Hellerstein. [What Goes Around Comes Around](http://mitpress2.mit.edu/books/chapters/0262693143chapm1.pdf). In *Readings in Database Systems*, 4th edition, MIT Press, pages 2–41, 2005. ISBN: 9780262693141
[^5]: Markus Winand. [Modern SQL: Beyond Relational](https://modern-sql.com/). *modern-sql.com*, 2015. Archived at [perma.cc/D63V-WAPN](https://perma.cc/D63V-WAPN)
[^6]: Martin Fowler. [OrmHate](https://martinfowler.com/bliki/OrmHate.html). *martinfowler.com*, May 2012. Archived at [perma.cc/VCM8-PKNG](https://perma.cc/VCM8-PKNG)
[^7]: Vlad Mihalcea. [N+1 query problem with JPA and Hibernate](https://vladmihalcea.com/n-plus-1-query-problem/). *vladmihalcea.com*, January 2023. Archived at [perma.cc/79EV-TZKB](https://perma.cc/79EV-TZKB)
[^8]: Jens Schauder. [This is the Beginning of the End of the N+1 Problem: Introducing Single Query Loading](https://spring.io/blog/2023/08/31/this-is-the-beginning-of-the-end-of-the-n-1-problem-introducing-single-query). *spring.io*, August 2023. Archived at [perma.cc/6V96-R333](https://perma.cc/6V96-R333)
[^9]: William Zola. [6 Rules of Thumb for MongoDB Schema Design](https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design). *mongodb.com*, June 2014. Archived at [perma.cc/T2BZ-PPJB](https://perma.cc/T2BZ-PPJB)
[^10]: Sidney Andrews and Christopher McClister. [Data modeling in Azure Cosmos DB](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data). *learn.microsoft.com*, February 2023. Archived at [archive.org](https://web.archive.org/web/20230207193233/https%3A//learn.microsoft.com/en-us/azure/cosmos-db/nosql/modeling-data)
[^11]: Raffi Krikorian. [Timelines at Scale](https://www.infoq.com/presentations/Twitter-Timeline-Scalability/). At *QCon San Francisco*, November 2012. Archived at [perma.cc/V9G5-KLYK](https://perma.cc/V9G5-KLYK)
[^12]: Ralph Kimball and Margy Ross. [*The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling*](https://learning.oreilly.com/library/view/the-data-warehouse/9781118530801/), 3rd edition. John Wiley & Sons, July 2013. ISBN: 9781118530801
[^13]: Michael Kaminsky. [Data warehouse modeling: Star schema vs. OBT](https://www.fivetran.com/blog/star-schema-vs-obt). *fivetran.com*, August 2022. Archived at [perma.cc/2PZK-BFFP](https://perma.cc/2PZK-BFFP)
[^14]: Joe Nelson. [User-defined Order in SQL](https://begriffs.com/posts/2018-03-20-user-defined-order.html). *begriffs.com*, March 2018. Archived at [perma.cc/GS3W-F7AD](https://perma.cc/GS3W-F7AD)
[^15]: Evan Wallace. [Realtime Editing of Ordered Sequences](https://www.figma.com/blog/realtime-editing-of-ordered-sequences/). *figma.com*, March 2017. Archived at [perma.cc/K6ER-CQZW](https://perma.cc/K6ER-CQZW)
[^16]: David Greenspan. [Implementing Fractional Indexing](https://observablehq.com/%40dgreensp/implementing-fractional-indexing). *observablehq.com*, October 2020. Archived at [perma.cc/5N4R-MREN](https://perma.cc/5N4R-MREN)
[^17]: Martin Fowler. [Schemaless Data Structures](https://martinfowler.com/articles/schemaless/). *martinfowler.com*, January 2013.
[^18]: Amr Awadallah. [Schema-on-Read vs. Schema-on-Write](https://www.slideshare.net/awadallah/schemaonread-vs-schemaonwrite). At *Berkeley EECS RAD Lab Retreat*, Santa Cruz, CA, May 2009. Archived at [perma.cc/DTB2-JCFR](https://perma.cc/DTB2-JCFR)
[^19]: Martin Odersky. [The Trouble with Types](https://www.infoq.com/presentations/data-types-issues/). At *Strange Loop*, September 2013. Archived at [perma.cc/85QE-PVEP](https://perma.cc/85QE-PVEP)
[^20]: Conrad Irwin. [MongoDB—Confessions of a PostgreSQL Lover](https://speakerdeck.com/conradirwin/mongodb-confessions-of-a-postgresql-lover). At *HTML5DevConf*, October 2013. Archived at [perma.cc/C2J6-3AL5](https://perma.cc/C2J6-3AL5)
[^21]: [Percona Toolkit Documentation: pt-online-schema-change](https://docs.percona.com/percona-toolkit/pt-online-schema-change.html). *docs.percona.com*, 2023. Archived at [perma.cc/9K8R-E5UH](https://perma.cc/9K8R-E5UH)
[^22]: Shlomi Noach. [gh-ost: GitHub’s Online Schema Migration Tool for MySQL](https://github.blog/2016-08-01-gh-ost-github-s-online-migration-tool-for-mysql/). *github.blog*, August 2016. Archived at [perma.cc/7XAG-XB72](https://perma.cc/7XAG-XB72)
[^23]: Shayon Mukherjee. [pg-osc: Zero downtime schema changes in PostgreSQL](https://www.shayon.dev/post/2022/47/pg-osc-zero-downtime-schema-changes-in-postgresql/). *shayon.dev*, February 2022. Archived at [perma.cc/35WN-7WMY](https://perma.cc/35WN-7WMY)
[^24]: Carlos Pérez-Aradros Herce. [Introducing pgroll: zero-downtime, reversible, schema migrations for Postgres](https://xata.io/blog/pgroll-schema-migrations-postgres). *xata.io*, October 2023. Archived at [archive.org](https://web.archive.org/web/20231008161750/https%3A//xata.io/blog/pgroll-schema-migrations-postgres)
[^25]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
[^26]: Donald K. Burleson. [Reduce I/O with Oracle Cluster Tables](http://www.dba-oracle.com/oracle_tip_hash_index_cluster_table.htm). *dba-oracle.com*. Archived at [perma.cc/7LBJ-9X2C](https://perma.cc/7LBJ-9X2C)
[^27]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^28]: Priscilla Walmsley. [*XQuery, 2nd Edition*](https://learning.oreilly.com/library/view/xquery-2nd-edition/9781491915080/). O’Reilly Media, December 2015. ISBN: 9781491915080
[^29]: Paul C. Bryan, Kris Zyp, and Mark Nottingham. [JavaScript Object Notation (JSON) Pointer](https://www.rfc-editor.org/rfc/rfc6901). RFC 6901, IETF, April 2013.
[^30]: Stefan Gössner, Glyn Normington, and Carsten Bormann. [JSONPath: Query Expressions for JSON](https://www.rfc-editor.org/rfc/rfc9535.html). RFC 9535, IETF, February 2024.
[^31]: Michael Stonebraker and Andrew Pavlo. [What Goes Around Comes Around… And Around…](https://db.cs.cmu.edu/papers/2024/whatgoesaround-sigmodrec2024.pdf). *ACM SIGMOD Record*, volume 53, issue 2, pages 21–37. [doi:10.1145/3685980.3685984](https://doi.org/10.1145/3685980.3685984)
[^32]: Lawrence Page, Sergey Brin, Rajeev Motwani, and Terry Winograd. [The PageRank Citation Ranking: Bringing Order to the Web](http://ilpubs.stanford.edu:8090/422/). Technical Report 1999-66, Stanford University InfoLab, November 1999. Archived at [perma.cc/UML9-UZHW](https://perma.cc/UML9-UZHW)
[^33]: Nathan Bronson, Zach Amsden, George Cabrera, Prasad Chakka, Peter Dimov, Hui Ding, Jack Ferris, Anthony Giardullo, Sachin Kulkarni, Harry Li, Mark Marchukov, Dmitri Petrov, Lovro Puzar, Yee Jiun Song, and Venkat Venkataramani. [TAO: Facebook’s Distributed Data Store for the Social Graph](https://www.usenix.org/conference/atc13/technical-sessions/presentation/bronson). At *USENIX Annual Technical Conference* (ATC), June 2013.
[^34]: Natasha Noy, Yuqing Gao, Anshu Jain, Anant Narayanan, Alan Patterson, and Jamie Taylor. [Industry-Scale Knowledge Graphs: Lessons and Challenges](https://cacm.acm.org/magazines/2019/8/238342-industry-scale-knowledge-graphs/fulltext). *Communications of the ACM*, volume 62, issue 8, pages 36–43, August 2019. [doi:10.1145/3331166](https://doi.org/10.1145/3331166)
[^35]: Xiyang Feng, Guodong Jin, Ziyi Chen, Chang Liu, and Semih Salihoğlu. [KÙZU Graph Database Management System](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf). At *13th Annual Conference on Innovative Data Systems Research* (CIDR 2023), January 2023.
[^36]: Maciej Besta, Emanuel Peter, Robert Gerstenberger, Marc Fischer, Michał Podstawski, Claude Barthels, Gustavo Alonso, Torsten Hoefler. [Demystifying Graph Databases: Analysis and Taxonomy of Data Organization, System Designs, and Graph Queries](https://arxiv.org/pdf/1910.09017.pdf). *arxiv.org*, October 2019.
[^37]: [Apache TinkerPop 3.6.3 Documentation](https://tinkerpop.apache.org/docs/3.6.3/reference/). *tinkerpop.apache.org*, May 2023. Archived at [perma.cc/KM7W-7PAT](https://perma.cc/KM7W-7PAT)
[^38]: Nadime Francis, Alastair Green, Paolo Guagliardo, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Stefan Plantikow, Mats Rydberg, Petra Selmer, and Andrés Taylor. [Cypher: An Evolving Query Language for Property Graphs](https://core.ac.uk/download/pdf/158372754.pdf). At *International Conference on Management of Data* (SIGMOD), pages 1433–1445, May 2018. [doi:10.1145/3183713.3190657](https://doi.org/10.1145/3183713.3190657)
[^39]: Emil Eifrem. [Twitter correspondence](https://twitter.com/emileifrem/status/419107961512804352), January 2014. Archived at [perma.cc/WM4S-BW64](https://perma.cc/WM4S-BW64)
[^40]: Francesco Tisiot. [Explore the new SEARCH and CYCLE features in PostgreSQL® 14](https://aiven.io/blog/explore-the-new-search-and-cycle-features-in-postgresql-14). *aiven.io*, December 2021. Archived at [perma.cc/J6BT-83UZ](https://perma.cc/J6BT-83UZ)
[^41]: Gaurav Goel. [Understanding Hierarchies in Oracle](https://towardsdatascience.com/understanding-hierarchies-in-oracle-43f85561f3d9). *towardsdatascience.com*, May 2020. Archived at [perma.cc/5ZLR-Q7EW](https://perma.cc/5ZLR-Q7EW)
[^42]: Alin Deutsch, Nadime Francis, Alastair Green, Keith Hare, Bei Li, Leonid Libkin, Tobias Lindaaker, Victor Marsault, Wim Martens, Jan Michels, Filip Murlak, Stefan Plantikow, Petra Selmer, Oskar van Rest, Hannes Voigt, Domagoj Vrgoč, Mingxi Wu, and Fred Zemke. [Graph Pattern Matching in GQL and SQL/PGQ](https://arxiv.org/abs/2112.06217). At *International Conference on Management of Data* (SIGMOD), pages 2246–2258, June 2022. [doi:10.1145/3514221.3526057](https://doi.org/10.1145/3514221.3526057)
[^43]: Alastair Green. [SQL... and now GQL](https://opencypher.org/articles/2019/09/12/SQL-and-now-GQL/). *opencypher.org*, September 2019. Archived at [perma.cc/AFB2-3SY7](https://perma.cc/AFB2-3SY7)
[^44]: Alin Deutsch, Yu Xu, and Mingxi Wu. [Seamless Syntactic and Semantic Integration of Query Primitives over Relational and Graph Data in GSQL](https://cdn2.hubspot.net/hubfs/4114546/IntegrationQuery%20PrimitivesGSQL.pdf). *tigergraph.com*, November 2018. Archived at [perma.cc/JG7J-Y35X](https://perma.cc/JG7J-Y35X)
[^45]: Oskar van Rest, Sungpack Hong, Jinha Kim, Xuming Meng, and Hassan Chafi. [PGQL: a property graph query language](https://event.cwi.nl/grades/2016/07-VanRest.pdf). At *4th International Workshop on Graph Data Management Experiences and Systems* (GRADES), June 2016. [doi:10.1145/2960414.2960421](https://doi.org/10.1145/2960414.2960421)
[^46]: Amazon Web Services. [Neptune Graph Data Model](https://docs.aws.amazon.com/neptune/latest/userguide/feature-overview-data-model.html). Amazon Neptune User Guide, *docs.aws.amazon.com*. Archived at [perma.cc/CX3T-EZU9](https://perma.cc/CX3T-EZU9)
[^47]: Cognitect. [Datomic Data Model](https://docs.datomic.com/cloud/whatis/data-model.html). Datomic Cloud Documentation, *docs.datomic.com*. Archived at [perma.cc/LGM9-LEUT](https://perma.cc/LGM9-LEUT)
[^48]: David Beckett and Tim Berners-Lee. [Turtle – Terse RDF Triple Language](https://www.w3.org/TeamSubmission/turtle/). W3C Team Submission, March 2011.
[^49]: Sinclair Target. [Whatever Happened to the Semantic Web?](https://twobithistory.org/2018/05/27/semantic-web.html) *twobithistory.org*, May 2018. Archived at [perma.cc/M8GL-9KHS](https://perma.cc/M8GL-9KHS)
[^50]: Gavin Mendel-Gleason. [The Semantic Web is Dead – Long Live the Semantic Web!](https://terminusdb.com/blog/the-semantic-web-is-dead/) *terminusdb.com*, August 2022. Archived at [perma.cc/G2MZ-DSS3](https://perma.cc/G2MZ-DSS3)
[^51]: Manu Sporny. [JSON-LD and Why I Hate the Semantic Web](http://manu.sporny.org/2014/json-ld-origins-2/). *manu.sporny.org*, January 2014. Archived at [perma.cc/7PT4-PJKF](https://perma.cc/7PT4-PJKF)
[^52]: University of Michigan Library. [Biomedical Ontologies and Controlled Vocabularies](https://guides.lib.umich.edu/ontology), *guides.lib.umich.edu/ontology*. Archived at [perma.cc/Q5GA-F2N8](https://perma.cc/Q5GA-F2N8)
[^53]: Facebook. [The Open Graph protocol](https://ogp.me/), *ogp.me*. Archived at [perma.cc/C49A-GUSY](https://perma.cc/C49A-GUSY)
[^54]: Matt Haughey. [Everything you ever wanted to know about unfurling but were afraid to ask /or/ How to make your site previews look amazing in Slack](https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254). *medium.com*, November 2015. Archived at [perma.cc/C7S8-4PZN](https://perma.cc/C7S8-4PZN)
[^55]: W3C RDF Working Group. [Resource Description Framework (RDF)](https://www.w3.org/RDF/). *w3.org*, February 2004.
[^56]: Steve Harris, Andy Seaborne, and Eric Prud’hommeaux. [SPARQL 1.1 Query Language](https://www.w3.org/TR/sparql11-query/). W3C Recommendation, March 2013.
[^57]: Todd J. Green, Shan Shan Huang, Boon Thau Loo, and Wenchao Zhou. [Datalog and Recursive Query Processing](http://blogs.evergreen.edu/sosw/files/2014/04/Green-Vol5-DBS-017.pdf). *Foundations and Trends in Databases*, volume 5, issue 2, pages 105–195, November 2013. [doi:10.1561/1900000017](https://doi.org/10.1561/1900000017)
[^58]: Stefano Ceri, Georg Gottlob, and Letizia Tanca. [What You Always Wanted to Know About Datalog (And Never Dared to Ask)](https://www.researchgate.net/profile/Letizia_Tanca/publication/3296132_What_you_always_wanted_to_know_about_Datalog_and_never_dared_to_ask/links/0fcfd50ca2d20473ca000000.pdf). *IEEE Transactions on Knowledge and Data Engineering*, volume 1, issue 1, pages 146–166, March 1989. [doi:10.1109/69.43410](https://doi.org/10.1109/69.43410)
[^59]: Serge Abiteboul, Richard Hull, and Victor Vianu. [*Foundations of Databases*](http://webdam.inria.fr/Alice/). Addison-Wesley, 1995. ISBN: 9780201537710, available online at [*webdam.inria.fr/Alice*](http://webdam.inria.fr/Alice/)
[^60]: Scott Meyer, Andrew Carter, and Andrew Rodriguez. [LIquid: The soul of a new graph database, Part 2](https://engineering.linkedin.com/blog/2020/liquid--the-soul-of-a-new-graph-database--part-2). *engineering.linkedin.com*, September 2020. Archived at [perma.cc/K9M4-PD6Q](https://perma.cc/K9M4-PD6Q)
[^61]: Matt Bessey. [Why, after 6 years, I’m over GraphQL](https://bessey.dev/blog/2024/05/24/why-im-over-graphql/). *bessey.dev*, May 2024. Archived at [perma.cc/2PAU-JYRA](https://perma.cc/2PAU-JYRA)
[^62]: Dominic Betts, Julián Domínguez, Grigori Melnik, Fernando Simonazzi, and Mani Subramanian. [*Exploring CQRS and Event Sourcing*](https://learn.microsoft.com/en-us/previous-versions/msp-n-p/jj554200%28v%3Dpandp.10%29). Microsoft Patterns & Practices, July 2012. ISBN: 1621140164, archived at [perma.cc/7A39-3NM8](https://perma.cc/7A39-3NM8)
[^63]: Greg Young. [CQRS and Event Sourcing](https://www.youtube.com/watch?v=JHGkaShoyNs). At *Code on the Beach*, August 2014.
[^64]: Greg Young. [CQRS Documents](https://cqrs.files.wordpress.com/2010/11/cqrs_documents.pdf). *cqrs.wordpress.com*, November 2010. Archived at [perma.cc/X5R6-R47F](https://perma.cc/X5R6-R47F)
[^65]: Devin Petersohn, Stephen Macke, Doris Xin, William Ma, Doris Lee, Xiangxi Mo, Joseph E. Gonzalez, Joseph M. Hellerstein, Anthony D. Joseph, and Aditya Parameswaran. [Towards Scalable Dataframe Systems](https://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 11, pages 2033–2046. [doi:10.14778/3407790.3407807](https://doi.org/10.14778/3407790.3407807)
[^66]: Stavros Papadopoulos, Kushal Datta, Samuel Madden, and Timothy Mattson. [The TileDB Array Data Storage Manager](https://www.vldb.org/pvldb/vol10/p349-papadopoulos.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 4, pages 349–360, November 2016. [doi:10.14778/3025111.3025117](https://doi.org/10.14778/3025111.3025117)
[^67]: Florin Rusu. [Multidimensional Array Data Management](https://faculty.ucmerced.edu/frusu/Papers/Report/2022-09-fntdb-arrays.pdf). *Foundations and Trends in Databases*, volume 12, numbers 2–3, pages 69–220, February 2023. [doi:10.1561/1900000069](https://doi.org/10.1561/1900000069)
[^68]: Ed Targett. [Bloomberg, Man Group team up to develop open source “ArcticDB” database](https://www.thestack.technology/bloomberg-man-group-arcticdb-database-dataframe/). *thestack.technology*, March 2023. Archived at [perma.cc/M5YD-QQYV](https://perma.cc/M5YD-QQYV)
[^69]: Dennis A. Benson, Ilene Karsch-Mizrachi, David J. Lipman, James Ostell, and David L. Wheeler. [GenBank](https://academic.oup.com/nar/article/36/suppl_1/D25/2507746). *Nucleic Acids Research*, volume 36, database issue, pages D25–D30, December 2007. [doi:10.1093/nar/gkm929](https://doi.org/10.1093/nar/gkm929) 


================================================
FILE: content/zh/ch4.md
================================================
---
title: "4. 存储与检索"
weight: 104
breadcrumbs: false
---

<a id="ch_storage"></a>

![](/map/ch03.png)

> *生活的苦恼之一是，每个人对事物的命名都有些偏差。这让我们理解世界变得比本该有的样子困难一些，要是命名方式不同就好了。计算机的主要功能并不是传统意义上的计算，比如算术运算。[……] 它们主要是归档系统。*
>
> [理查德·费曼](https://www.youtube.com/watch?v=EKWGGDXe5MA&t=296s)，
> *特立独行的思考* 研讨会（1985）

在最基础的层面上，数据库需要做两件事：当你给它一些数据时，它应该存储这些数据；当你之后再询问时，它应该把数据返回给你。

在 [第 3 章](/ch3#ch_datamodels) 中，我们讨论了数据模型和查询语言 —— 即你向数据库提供数据的格式，以及之后再次请求数据的接口。在本章中，我们从数据库的角度讨论同样的问题：数据库如何存储你提供的数据，以及当你请求时如何再次找到这些数据。

作为应用开发者，你为什么要关心数据库内部如何处理存储和检索？你可能不会从头开始实现自己的存储引擎，但你 *确实* 需要从众多可用的存储引擎中选择一个适合你应用的。为了让存储引擎在你的工作负载类型上表现良好，你需要对存储引擎在底层做了什么有个大致的了解。

特别是，针对事务型工作负载（OLTP）优化的存储引擎和针对分析型工作负载优化的存储引擎之间存在巨大差异（我们在 ["分析型与事务型系统"](/ch1#sec_introduction_analytics) 中介绍了这种区别）。本章首先研究两种用于 OLTP 的存储引擎家族：写入不可变数据文件的 *日志结构* 存储引擎，以及像 *B 树* 这样就地更新数据的存储引擎。这些结构既用于键值存储，也用于二级索引。

随后在 ["分析型数据存储"](#sec_storage_analytics) 中，我们将讨论一系列针对分析优化的存储引擎；在 ["多维索引与全文索引"](#sec_storage_multidimensional) 中，我们将简要介绍用于更高级查询（如文本检索）的索引。

## OLTP 系统的存储与索引 {#sec_storage_oltp}

考虑世界上最简单的数据库，用两个 Bash 函数实现：

```bash
#!/bin/bash

db_set () {
  echo "$1,$2" >> database
}

db_get () {
  grep "^$1," database | sed -e "s/^$1,//" | tail -n 1
}
```

这两个函数实现了一个键值存储。你可以调用 `db_set key value`，它将在数据库中存储 `key` 和 `value`。键和值可以是（几乎）任何你喜欢的内容 —— 例如，值可以是一个 JSON 文档。然后你可以调用 `db_get key`，它会查找与该特定键关联的最新值并返回它。

麻雀虽小，五脏俱全：

```bash
$ db_set 12 '{"name":"London","attractions":["Big Ben","London Eye"]}'

$ db_set 42 '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
```

存储格式非常简单：一个文本文件，每行包含一个键值对，用逗号分隔（大致类似 CSV 文件，忽略转义问题）。每次调用 `db_set` 都会追加到文件末尾。如果你多次更新一个键，旧版本的值不会被覆盖 —— 你需要查看文件中键的最后一次出现来找到最新值（因此 `db_get` 中使用了 `tail -n 1`）：

```bash
$ db_set 42 '{"name":"San Francisco","attractions":["Exploratorium"]}'

$ db_get 42
{"name":"San Francisco","attractions":["Exploratorium"]}

$ cat database
12,{"name":"London","attractions":["Big Ben","London Eye"]}
42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}
42,{"name":"San Francisco","attractions":["Exploratorium"]}

```

对于如此简单的实现，`db_set` 函数实际上有相当好的性能，因为追加到文件通常非常高效。与 `db_set` 所做的类似，许多数据库内部使用 *日志*，这是一个仅追加的数据文件。真正的数据库有更多问题要处理（如处理并发写入、回收磁盘空间以防日志无限增长，以及从崩溃中恢复时处理部分写入的记录），但基本原理是相同的。日志非常有用，我们将在本书中多次遇到它们。

---------

> [!NOTE]
> *日志* 这个词通常用于指应用程序日志，应用程序输出描述正在发生什么的文本。在本书中，*日志* 用于更一般的含义：磁盘上仅追加的记录序列。它不一定是人类可读的；它可能是二进制的，仅供数据库系统内部使用。

--------


另一方面，如果你的数据库中有大量记录，`db_get` 函数的性能会很糟糕。每次你想查找一个键时，`db_get` 必须从头到尾扫描整个数据库文件，寻找该键的出现。用算法术语来说，查找的成本是 *O*(*n*)：如果你的数据库中的记录数 *n* 翻倍，查找时间也会翻倍。这并不好。

为了高效地找到数据库中特定键的值，我们需要一个不同的数据结构：*索引*。在本章中，我们将研究一系列索引结构并了解它们的比较；一般思想是以特定方式（例如，按某个键排序）构建数据，使定位所需数据更快。如果你想以几种不同的方式搜索相同的数据，你可能需要在数据的不同部分上建立几个不同的索引。

索引是从主数据派生出的 *额外* 结构。许多数据库允许你添加和删除索引，这不会影响数据库的内容；它只影响查询的性能。维护额外的结构会产生开销，特别是在写入时。对于写入，很难超越简单地追加到文件的性能，因为这是最简单的写入操作。任何类型的索引通常都会减慢写入速度，因为每次写入数据时也需要更新索引。

这是存储系统中的一个重要权衡：精心选择的索引加快了读查询速度，但每个索引都会消耗额外的磁盘空间并减慢写入速度，有时会大幅减慢 [^1]。因此，数据库通常不会默认为所有内容建立索引，而是要求你 —— 编写应用程序或管理数据库的人 —— 使用你对应用程序典型查询模式的了解来手动选择索引。然后你可以选择为你的应用程序带来最大收益的索引，而不会引入超过必要的写入开销。

### 日志结构存储 {#sec_storage_log_structured}

首先，让我们假设你想继续将数据存储在 `db_set` 写入的仅追加文件中，你只是想加快读取速度。一种方法是在内存中保留一个哈希映射，其中每个键都映射到文件中可以找到该键最新值的字节偏移量，如 [图 4-1](#fig_storage_csv_hash_index) 所示。

{{< figure src="/fig/ddia_0401.png" id="fig_storage_csv_hash_index" caption="图 4-1. 以类似 CSV 格式存储键值对日志，使用内存哈希映射建立索引。" class="w-full my-4" >}}

每当你向文件追加新的键值对时，你也会更新哈希映射以反映刚刚写入数据的偏移量。当你想查找一个值时，你使用哈希映射找到日志文件中的偏移量，定位（seek）到该位置，然后读取值。如果数据文件的那部分已经在文件系统缓存中，读取根本不需要任何磁盘 I/O。

这种方法速度更快，但仍然存在几个问题：

* 你永远不会释放被覆盖的旧日志条目占用的磁盘空间；如果你不断写入数据库，可能会耗尽磁盘空间。
* 哈希映射不是持久化的，所以当你重启数据库时必须重建它 —— 例如，通过扫描整个日志文件来找到每个键的最新字节偏移量。如果你有大量数据，这会使重启变慢。
* 哈希表必须能够完全放入内存。原则上，你可以在磁盘上维护哈希表，但不幸的是，很难让磁盘上的哈希映射表现良好。它需要大量的随机访问 I/O，当它变满时扩展成本高昂，哈希冲突需要复杂的逻辑 [^2]。
* 范围查询效率不高。例如，你不能轻松扫描 `10000` 和 `19999` 之间的所有键 —— 你必须在哈希映射中单独查找每个键。

#### SSTable 文件格式 {#the-sstable-file-format}

实际上，哈希表很少用于数据库索引，相反，保持数据 *按键排序* 的结构更为常见 [^3]。这种结构的一个例子是 *排序字符串表*（*Sorted String Table*），简称 *SSTable*，如 [图 4-2](#fig_storage_sstable_index) 所示。这种文件格式也存储键值对，但它确保它们按键排序，每个键在文件中只出现一次。

{{< figure src="/fig/ddia_0402.png" id="fig_storage_sstable_index" caption="图 4-2. 带有稀疏索引的 SSTable，允许查询跳转到正确的块。" class="w-full my-4" >}}

现在你不需要在内存中保留所有键：你可以将 SSTable 中的键值对分组为几千字节的 *块*，然后在索引中存储每个块的第一个键。这种只存储部分键的索引称为 *稀疏* 索引。这个索引存储在 SSTable 的单独部分，例如使用不可变 B 树、字典树或其他允许查询快速查找特定键的数据结构 [^4]。

例如，在 [图 4-2](#fig_storage_sstable_index) 中，一个块的第一个键是 `handbag`，下一个块的第一个键是 `handsome`。现在假设你要查找键 `handiwork`，它没有出现在稀疏索引中。由于排序，你知道 `handiwork` 必须出现在 `handbag` 和 `handsome` 之间。这意味着你可以定位（seek）到 `handbag` 的偏移量，然后从那里扫描文件，直到找到 `handiwork`（如果该键不在文件中，则找不到）。几千字节的块可以非常快速地扫描。

此外，每个记录块都可以压缩（在 [图 4-2](#fig_storage_sstable_index) 中用阴影区域表示）。除了节省磁盘空间外，压缩还减少了 I/O 带宽使用，代价是使用更多一点的 CPU 时间。

#### 构建和合并 SSTable {#constructing-and-merging-sstables}

SSTable 文件格式在读取方面比仅追加日志更好，但它使写入更加困难。我们不能简单地追加到末尾，因为那样文件就不再有序了（除非键恰好按升序写入）。如果我们每次在中间某处插入键时都必须重写整个 SSTable，写入将变得太昂贵。

我们可以用 *日志结构* 方法解决这个问题，这是仅追加日志和排序文件之间的混合：

1. 当写入操作到来时，将其添加到内存中的有序映射数据结构中，例如红黑树、跳表 [^5] 或字典树 [^6]。使用这些数据结构，你可以按任意顺序插入键，高效地查找它们，并按排序顺序读回它们。这个内存数据结构称为 *内存表*（*memtable*）。
2. 当内存表变得大于某个阈值（通常是几兆字节）时，将其按排序顺序作为 SSTable 文件写入磁盘。我们将这个新的 SSTable 文件称为数据库的最新 *段*，它与旧段一起作为单独的文件存储。每个段都有自己内容的单独索引。当新段被写入磁盘时，数据库可以继续写入新的内存表实例，当 SSTable 写入完成时，旧内存表的内存被释放。
3. 为了读取某个键的值，首先尝试在内存表和最新的磁盘段中找到该键。如果没有找到，就在下一个较旧的段中查找，依此类推，直到找到键或到达最旧的段。如果键没有出现在任何段中，则它不存在于数据库中。
4. 不时地在后台运行合并和压实过程，以合并段文件并丢弃被覆盖或删除的值。

合并段的工作方式类似于 *归并排序* 算法 [^5]。该过程如 [图 4-3](#fig_storage_sstable_merging) 所示：并排开始读取输入文件，查看每个文件中的第一个键，将最低的键（根据排序顺序）复制到输出文件，然后重复。如果同一个键出现在多个输入文件中，只保留较新的值。这会产生一个新的合并段文件，也按键排序，每个键只有一个值，并且它只需要极少的内存，因为我们可以逐个键地遍历这些 SSTable。

{{< figure src="/fig/ddia_0403.png" id="fig_storage_sstable_merging" caption="图 4-3. 合并多个 SSTable 段，仅保留每个键的最新值。" class="w-full my-4" >}}

为了确保数据库崩溃时内存表中的数据不会丢失，存储引擎在磁盘上保留一个单独的日志，每次写入都会立即追加到该日志中。此日志不按键排序，但这无关紧要，因为它的唯一目的是在崩溃后恢复内存表。每次内存表被写出到 SSTable 后，日志的相应部分就可以丢弃。

如果你想删除一个键及其关联的值，你必须向数据文件追加一个称为 *墓碑*（*tombstone*）的特殊删除记录。当日志段合并时，墓碑告诉合并过程丢弃已删除键的任何先前值。一旦墓碑合并到最旧的段中，它就可以被丢弃。

这里描述的算法本质上就是 RocksDB [^7]、Cassandra、Scylla 和 HBase [^8] 中使用的算法，它们都受到 Google 的 Bigtable 论文 [^9] 的启发（该论文引入了 *SSTable* 和 *memtable* 这两个术语）。

该算法最初于 1996 年以 *日志结构合并树*（*Log-Structured Merge-Tree*）或 *LSM 树*（*LSM-Tree*）[^10] 的名称发布，建立在早期日志结构文件系统工作的基础上 [^11]。因此，基于合并和压实排序文件原理的存储引擎通常被称为 *LSM 存储引擎*。

在 LSM 存储引擎中，段文件是一次性写入的（通过写出内存表或合并一些现有段），此后它是不可变的。段的合并和压实可以在后台线程中完成，当它进行时，我们仍然可以使用旧的段文件继续提供读取服务。当合并过程完成时，我们将读取请求切换到使用新的合并段而不是旧段，然后可以删除旧的段文件。

段文件不一定必须存储在本地磁盘上：它们也非常适合写入对象存储。例如，SlateDB 和 Delta Lake [^12] 采用了这种方法。

具有不可变段文件也简化了崩溃恢复：如果在写出内存表或合并段时发生崩溃，数据库可以删除未完成的 SSTable 并重新开始。用于持久化内存表写入的日志可能包含不完整的记录 —— 例如在写入某条记录的过程中发生了崩溃，或者磁盘已满；这类情况通常通过在日志中包含校验和来检测，损坏或不完整的日志条目会被丢弃。我们将在 [第 8 章](/ch8#ch_transactions) 中更多地讨论持久性和崩溃恢复。

<a id="sec_storage_bloom_filter"></a>

#### 布隆过滤器 {#bloom-filters}

使用 LSM 存储，读取很久以前更新的键或不存在的键可能会很慢，因为存储引擎需要检查多个段文件。为了加快此类读取，LSM 存储引擎通常在每个段中包含一个 *布隆过滤器*（*Bloom filter*）[^13]，它提供了一种快速但近似的方法来检查特定键是否出现在特定 SSTable 中。

[图 4-4](#fig_storage_bloom) 显示了一个包含两个键和 16 位的布隆过滤器示例（实际上，它会包含更多的键和更多的位）。对于 SSTable 中的每个键，我们计算一个哈希函数，产生一组数字，然后将其解释为位数组的索引 [^14]。我们将对应于这些索引的位设置为 1，其余保持为 0。例如，键 `handbag` 哈希为数字 (2, 9, 4)，所以我们将第 2、9 和 4 位设置为 1。然后将位图与键的稀疏索引一起存储为 SSTable 的一部分。这需要一点额外的空间，但与 SSTable 的其余部分相比，布隆过滤器通常很小。

{{< figure src="/fig/ddia_0404.png" id="fig_storage_bloom" caption="图 4-4. 布隆过滤器提供了一种快速的概率检查，用于判断特定键是否存在于特定 SSTable 中。" class="w-full my-4" >}}

当我们想知道一个键是否出现在 SSTable 中时，我们像以前一样计算该键的相同哈希，并检查这些索引处的位。例如，在 [图 4-4](#fig_storage_bloom) 中，我们查询键 `handheld`，它哈希为 (6, 11, 2)。其中一个位是 1（即第 2 位），而另外两个是 0。这些检查可以使用所有 CPU 都支持的位运算非常快速地进行。

如果至少有一个位是 0，我们知道该键肯定不在 SSTable 中。如果查询中的位都是 1，那么该键很可能在 SSTable 中，但也有可能是巧合，所有这些位都被其他键设置为 1。这种看起来键存在但实际上不存在的情况称为 *假阳性*（*false positive*）。

假阳性的概率取决于键的数量、每个键设置的位数和布隆过滤器中的总位数。你可以使用在线计算器工具为你的应用计算出正确的参数 [^15]。作为经验法则，你需要为 SSTable 中的每个键分配 10 位布隆过滤器空间以获得 1% 的假阳性概率；此后每个键每多分配 5 位，假阳性概率就会再降低到原来的十分之一。

在 LSM 存储引擎的上下文中，假阳性没有问题：

* 如果布隆过滤器说键 *不* 存在，我们可以安全地跳过该 SSTable，因为我们可以确定它不包含该键。
* 如果布隆过滤器说键 *存在*，我们必须查询稀疏索引并解码键值对块以检查键是否真的在那里。如果是假阳性，我们做了一些不必要的工作，但除此之外没有害处 —— 我们只是继续使用下一个最旧的段进行搜索。

#### 压实策略 {#sec_storage_lsm_compaction}

一个重要的细节是 LSM 存储如何选择何时执行压实，以及在压实中包括哪些 SSTable。许多基于 LSM 的存储系统允许你配置使用哪种压实策略，一些常见的选择是 [^16] [^17]：

分层压实（Size-tiered compaction）
: 较新和较小的 SSTable 依次合并到较旧和较大的 SSTable 中。包含较旧数据的 SSTable 可能变得非常大，合并它们需要大量的临时磁盘空间。这种策略的优点是它可以处理非常高的写入吞吐量。

分级压实（Leveled compaction）
: 键范围被分成较小的 SSTable，较旧的数据被移动到单独的"级别"中，这允许压实更增量地进行，并且比分层策略使用更少的磁盘空间。这种策略对于读取比分层压实更有效，因为存储引擎需要读取更少的 SSTable 来检查它们是否包含该键。

作为经验法则，如果你主要有写入而读取很少，分层压实表现更好，而如果你的工作负载以读取为主，分级压实表现更好。如果你频繁写入少量键，而很少写入大量键，那么分级压实也可能有优势 [^18]。

尽管有许多细微之处，但 LSM 树的基本思想 —— 保持在后台合并的 SSTable 级联 —— 简单而有效。我们将在 ["比较 B 树与 LSM 树"](#sec_storage_btree_lsm_comparison) 中更详细地讨论它们的性能特征。

--------

<a id="sidebar_embedded"></a>

> [!TIP] 嵌入式存储引擎

许多数据库作为接受网络查询的服务运行，但也有 *嵌入式* 数据库不公开网络 API。相反，它们是在与应用程序代码相同的进程中运行的库，通常读取和写入本地磁盘上的文件，你通过正常的函数调用与它们交互。嵌入式存储引擎的例子包括 RocksDB、SQLite、LMDB、DuckDB 和 KùzuDB [^19]。

嵌入式数据库在移动应用中非常常用，用于存储本地用户的数据。在后端，如果数据足够小以适合单台机器，并且没有太多并发事务，它们可能是一个合适的选择。例如，在多租户系统中，如果每个租户足够小且完全与其他租户分离（即，你不需要运行合并多个租户数据的查询），你可能可以为每个租户使用单独的嵌入式数据库实例 [^20]。

我们在本章讨论的存储和检索方法既用于嵌入式数据库，也用于客户端-服务器数据库。在 [第 6 章](/ch6#ch_replication) 和 [第 7 章](/ch7#ch_sharding) 中，我们将讨论跨多台机器扩展数据库的技术。

--------

### B 树 {#sec_storage_b_trees}

日志结构方法很流行，但它不是键值存储的唯一形式。按键读取和写入数据库记录最广泛使用的结构是 *B 树*。

B 树于 1970 年引入 [^21]，不到 10 年后就被称为"无处不在"[^22]，它们经受住了时间的考验。它们仍然是几乎所有关系数据库中的标准索引实现，许多非关系数据库也使用它们。

像 SSTable 一样，B 树按键保持键值对排序，这允许高效的键值查找和范围查询。但相似之处到此为止：B 树有着非常不同的设计理念。

我们之前看到的日志结构索引将数据库分解为可变大小的 *段*，通常为几兆字节或更大，写入一次后就不可变。相比之下，B 树将数据库分解为固定大小的 *块* 或 *页*，并可能就地覆盖页。页传统上大小为 4 KiB，但 PostgreSQL 现在默认使用 8 KiB，MySQL 默认使用 16 KiB。

每个页都可以使用页号来标识，这允许一个页引用另一个页 —— 类似于指针，但在磁盘上而不是在内存中。如果所有页都存储在同一个文件中，将页号乘以页大小就给我们文件中页所在位置的字节偏移量。我们可以使用这些页引用来构建页树，如 [图 4-5](#fig_storage_b_tree) 所示。

{{< figure src="/fig/ddia_0405.png" id="fig_storage_b_tree" caption="图 4-5. 使用 B 树索引查找键 251。从根页开始，我们首先跟随引用到键 200–300 的页，然后是键 250–270 的页。" class="w-full my-4" >}}

一个页被指定为 B 树的 *根*；每当你想在索引中查找一个键时，你就从这里开始。该页包含几个键和对子页的引用。每个子页负责一个连续的键范围，引用之间的键指示这些范围之间的边界在哪里。（这种结构有时称为 B+ 树，但我们不需要将其与其他 B 树变体区分开来。）

在 [图 4-5](#fig_storage_b_tree) 的例子中，我们正在查找键 251，所以我们知道我们需要跟随边界 200 和 300 之间的页引用。这将我们带到一个看起来相似的页，该页进一步将 200–300 范围分解为子范围。最终我们到达存放各个具体键的页（*叶页*），该页要么内联包含每个键的值，要么包含对可以找到值的页的引用。

B 树的一个页中对子页的引用数称为 *分支因子*。例如，在 [图 4-5](#fig_storage_b_tree) 中，分支因子为六。实际上，分支因子取决于存储页引用和范围边界所需的空间量，但通常为几百。

如果你想更新 B 树中现有键的值，你搜索包含该键的叶页，并用包含新值的版本覆盖磁盘上的该页。如果你想添加一个新键，你需要找到其范围包含新键的页并将其添加到该页。如果页中没有足够的空闲空间来容纳新键，则页被分成两个半满的页，并更新父页以说明键范围的新细分。

{{< figure src="/fig/ddia_0406.png" id="fig_storage_b_tree_split" caption="图 4-6. 通过在边界键 337 上分割页来增长 B 树。父页被更新以引用两个子页。" class="w-full my-4" >}}

在 [图 4-6](#fig_storage_b_tree_split) 的例子中，我们想插入键 334，但范围 333–345 的页已经满了。因此，我们将其分成范围 333–337（包括新键）的页和 337–344 的页。我们还必须更新父页以引用两个子页，它们之间的边界值为 337。如果父页没有足够的空间容纳新引用，它也可能需要被分割，分割可以一直持续到树的根。当根被分割时，我们在它上面创建一个新根。删除键（可能需要合并节点）更复杂 [^5]。

这个算法确保树保持 *平衡*：具有 *n* 个键的 B 树始终具有 *O*(log *n*) 的深度。大多数数据库都可以放进三到四层深的 B 树中，所以你不需要跟随许多页引用来找到你要查找的页。（一棵由 4 KiB 页组成、分支因子为 500 的四层树最多可以存储 250 TB。）

#### 使 B 树可靠 {#sec_storage_btree_wal}

B 树的基本底层写操作是用新数据覆盖磁盘上的页。假设覆盖不会改变页的位置；即，当页被覆盖时，对该页的所有引用保持不变。这与日志结构索引（如 LSM 树）形成鲜明对比，后者只追加到文件（并最终删除过时的文件），但从不就地修改文件。

一次覆盖多个页，如在页分割中，是一个危险的操作：如果数据库在只写入了部分页后崩溃，你最终会得到一个损坏的树（例如，可能有一个 *孤立* 页，它不是任何父页的子页）。如果硬件不能原子地写入整个页，你也可能最终得到部分写入的页（这称为 *撕裂页*（*torn page*）[^23]）。

为了使数据库对崩溃具有弹性，B 树实现通常包括磁盘上的额外数据结构：*预写日志*（*write-ahead log*，WAL）。这是一个仅追加文件，每个 B 树修改必须在应用于树本身的页之前写入其中。当数据库在崩溃后恢复时，此日志用于将 B 树恢复到一致状态 [^2] [^24]。在文件系统中，等效机制称为 *日志记录*（*journaling*）。

为了提高性能，B 树实现通常不会立即将每个修改的页写入磁盘，而是首先将 B 树页缓冲在内存中一段时间。预写日志还确保在崩溃的情况下数据不会丢失：只要数据已写入 WAL，并使用 `fsync()` 系统调用刷新到磁盘，数据就是持久的，因为数据库将能够在崩溃后恢复它 [^25]。

#### B 树变体 {#b-tree-variants}

由于 B 树已经存在了很长时间，多年来已经开发了许多变体。仅举几个例子：

* 一些数据库（如 LMDB）使用写时复制方案 [^26]，而不是覆盖页并维护 WAL 以进行崩溃恢复。修改的页被写入不同的位置，并创建树中父页的新版本，指向新位置。这种方法对于并发控制也很有用，我们将在 ["快照隔离和可重复读"](/ch8#sec_transactions_snapshot_isolation) 中看到。
* 我们可以通过不存储整个键而是缩写它来节省页中的空间。特别是在树内部的页中，键只需要提供足够的信息来充当键范围之间的边界。在页中打包更多键允许树具有更高的分支因子，从而减少层数。
* 为了加快按排序顺序扫描键范围，一些 B 树实现尝试布局树，使叶页按顺序出现在磁盘上，减少磁盘寻道次数。然而，随着树的增长，很难维持这种顺序。
* 已向树添加了其他指针。例如，每个叶页可能有对其左右兄弟页的引用，这允许按顺序扫描键而无需跳回父页。

### 比较 B 树与 LSM 树 {#sec_storage_btree_lsm_comparison}

作为经验法则，LSM 树更适合写入密集型应用，而 B 树对读取更快 [^27] [^28]。然而，基准测试通常对工作负载的细节很敏感。你需要使用特定的工作负载测试系统，以便进行有效的比较。此外，这不是 LSM 和 B 树之间的严格二选一选择：存储引擎有时会混合两种方法的特征，例如具有多个 B 树并以 LSM 风格合并它们。在本节中，我们将简要讨论在衡量存储引擎性能时值得考虑的几件事。

#### 读取性能 {#read-performance}

在 B 树中，查找键涉及在 B 树的每个层级读取一个页。由于层级数通常很小，这意味着从 B 树读取通常很快并且具有可预测的性能。在 LSM 存储引擎中，读取通常必须检查处于不同压实阶段的几个不同 SSTable，但布隆过滤器有助于减少所需的实际磁盘 I/O 操作数。两种方法都可以表现良好，哪个更快取决于存储引擎的细节和工作负载。

范围查询在 B 树上简单而快速，因为它们可以使用树的排序结构。在 LSM 存储上，范围查询也可以利用 SSTable 排序，但它们需要并行扫描所有段并组合结果。布隆过滤器对范围查询没有帮助（因为你需要计算范围内每个可能键的哈希，这是不切实际的），使得范围查询在 LSM 方法中比点查询更昂贵 [^29]。

如果内存表填满，高写入吞吐量可能会导致日志结构存储引擎中的延迟峰值。如果数据无法足够快地写入磁盘，可能是因为压实过程无法跟上传入的写入，就会发生这种情况。许多存储引擎，包括 RocksDB，在这种情况下执行 *背压*：它们暂停所有读取和写入，直到内存表被写入磁盘 [^30] [^31]。

关于读取吞吐量，现代 SSD（特别是 NVMe）可以并行执行许多独立的读请求。LSM 树和 B 树都能够提供高读取吞吐量，但存储引擎需要仔细设计以利用这种并行性 [^32]。

#### 顺序与随机写入 {#sidebar_sequential}

使用 B 树时，如果应用程序写入的键分散在整个键空间中，生成的磁盘操作也会随机分散，因为存储引擎需要覆盖的页可能位于磁盘的任何位置。另一方面，日志结构存储引擎一次写入整个段文件（无论是写出内存表还是压实现有段），这比 B 树中的页大得多。

许多小的、分散的写入模式（如 B 树中的）称为 *随机写入*，而较少的大写入模式（如 LSM 树中的）称为 *顺序写入*。磁盘通常具有比随机写入更高的顺序写入吞吐量，这意味着日志结构存储引擎通常可以在相同硬件上处理比 B 树更高的写入吞吐量。这种差异在旋转磁盘硬盘（HDD）上特别大；在今天大多数数据库使用的固态硬盘（SSD）上，差异较小，但仍然明显（参见 ["SSD 上的顺序与随机写入"](#sidebar_sequential)）。

--------

> [!TIP] SSD 上的顺序与随机写入

在旋转磁盘硬盘（HDD）上，顺序写入比随机写入快得多：随机写入必须机械地将磁头移动到新位置，并等待盘片的正确部分经过磁头下方，这需要几毫秒 —— 在计算时间尺度上是永恒的。然而，SSD（固态硬盘）包括 NVMe（Non-Volatile Memory Express，即连接到 PCI Express 总线的闪存）现在已经在许多场景中超越了 HDD，它们不受这种机械限制。

尽管如此，SSD 对顺序写入的吞吐量也高于随机写入。原因是闪存可以一次读取或写入一页（通常为 4 KiB），但只能一次擦除一个块（通常为 512 KiB）。块中的某些页可能包含有效数据，而其他页可能包含不再需要的数据。在擦除块之前，控制器必须首先将包含有效数据的页移动到其他块中；这个过程称为 *垃圾回收*（GC）[^33]。

顺序写入工作负载一次写入更大的数据块，因此整个 512 KiB 块很可能属于单个文件；当该文件稍后再次被删除时，整个块可以被擦除而无需执行任何 GC。另一方面，对于随机写入工作负载，块更可能包含有效和无效数据页的混合，因此 GC 必须在块可以擦除之前执行更多工作 [^34] [^35] [^36]。

GC 消耗的写入带宽就不能用于应用程序。此外，GC 执行的额外写入会导致闪存磨损；因此，随机写入比顺序写入更快地磨损驱动器。

--------

#### 写放大 {#write-amplification}

对于任何类型的存储引擎，来自应用程序的一次写请求都会转换为底层磁盘上的多个 I/O 操作。对于 LSM 树，一个值首先被写入日志以保证持久性，然后在内存表写入磁盘时再次写入，并且每次键值对参与压实时再次写入。（如果值明显大于键，可以通过将值与键分开存储，并仅对包含键和值引用的 SSTable 执行压实来减少这种开销 [^37]。）

B 树索引必须至少写入每条数据两次：一次写入预写日志，一次写入树页本身。此外，它们有时需要写出整个页，即使该页中只有几个字节发生了变化，以确保 B 树在崩溃或断电后可以正确恢复 [^38] [^39]。

如果你获取在某个工作负载中写入磁盘的总字节数，然后除以如果你只是写入没有索引的仅追加日志需要写入的字节数，你就得到了 *写放大*。（有时写放大是根据 I/O 操作而不是字节来定义的。）在写入密集型应用程序中，瓶颈可能是数据库可以写入磁盘的速率。在这种情况下，写放大越高，它在可用磁盘带宽内可以处理的每秒写入次数就越少。

写放大是 LSM 树和 B 树中的问题。哪个更好取决于各种因素，例如键和值的长度，以及你覆盖现有键与插入新键的频率。对于典型的工作负载，LSM 树往往具有较低的写放大，因为它们不必写入整个页，并且可以压缩 SSTable 的块 [^40]。这是使 LSM 存储引擎非常适合写入密集型工作负载的另一个因素。

除了影响吞吐量，写放大也与 SSD 的磨损有关：写放大较低的存储引擎将更慢地磨损 SSD。

在测量存储引擎的写入吞吐量时，重要的是要运行足够长的实验，以便写放大的影响变得清晰。当写入空的 LSM 树时，还没有进行压实，因此所有磁盘带宽都可用于新写入。随着数据库的增长，新写入需要与压实共享磁盘带宽。

#### 磁盘空间使用 {#disk-space-usage}

B 树可能会随着时间的推移变得 *碎片化*：例如，如果删除了大量键，数据库文件可能包含许多 B 树不再使用的页。对 B 树的后续添加可以使用这些空闲页，但它们不能轻易地归还给操作系统，因为它们在文件的中间，所以它们仍然占用文件系统上的空间。因此，数据库需要一个后台过程来移动页以更好地放置它们，例如 PostgreSQL 中的 vacuum（清理）过程 [^25]。

碎片化在 LSM 树中不太成问题，因为压实过程无论如何都会定期重写数据文件，而且 SSTable 没有未使用空间的页。此外，SSTable 中的键值对块可以更好地压缩，因此通常比 B 树在磁盘上产生更小的文件。被覆盖的键和值继续消耗空间，直到它们被压实删除，但使用分级压实时，这种开销相当低 [^40] [^41]。分层压实（参见 ["压实策略"](#sec_storage_lsm_compaction)）使用更多的磁盘空间，特别是在压实期间临时使用。

在磁盘上有一些数据的多个副本也可能是一个问题，当你需要删除一些数据，并确信它真的已被删除（也许是为了遵守数据保护法规）。例如，在大多数 LSM 存储引擎中，已删除的记录可能仍然存在于较高级别中，直到代表删除的墓碑通过所有压实级别传播，这可能需要很长时间。专门的存储引擎设计可以更快地传播删除 [^42]。

另一方面，SSTable 段文件的不可变性质在你想在某个时间点对数据库进行快照时很有用（例如，用于备份或创建数据库副本以进行测试）：你可以写出内存表并记录该时间点存在的段文件。只要你不删除属于该快照的文件，就不需要实际复制它们。在页会被就地覆盖的 B 树中，要高效地创建这样的快照则更为困难。


### 多列索引与二级索引 {#sec_storage_index_multicolumn}

到目前为止，我们只讨论了键值索引，它们就像关系模型中的 *主键* 索引。主键唯一标识关系表中的一行，或文档数据库中的一个文档，或图数据库中的一个顶点。数据库中的其他记录可以通过其主键（或 ID）引用该行/文档/顶点，索引用于解析此类引用。

拥有 *二级索引* 也非常常见。在关系数据库中，你可以使用 `CREATE INDEX` 命令在同一个表上创建多个二级索引，允许你按主键以外的列进行搜索。例如，在 [第 3 章](/ch3#ch_datamodels) 的 [图 3-1](/ch3#fig_obama_relational) 中，你很可能在 `user_id` 列上有一个二级索引，以便你可以在每个表中找到属于同一用户的所有行。

二级索引可以很容易地从键值索引构建。主要区别在于，在二级索引中，索引值不一定是唯一的；也就是说，同一索引条目下可能有许多行（文档、顶点）。这可以通过两种方式解决：要么使索引中的每个值成为匹配行标识符的列表（如全文索引中的倒排列表），要么通过向其追加行标识符使每个条目唯一。具有就地更新的存储引擎（如 B 树）和日志结构存储都可用于实现索引。

#### 在索引中存储值 {#sec_storage_index_heap}

索引中的键是查询搜索的内容，但值可以是几种东西之一：

* 如果实际数据（行、文档、顶点）直接存储在索引结构中，则称为 *聚簇索引*。例如，在 MySQL 的 InnoDB 存储引擎中，表的主键始终是聚簇索引，在 SQL Server 中，你可以为每个表指定一个聚簇索引 [^43]。
* 或者，值可以是对实际数据的引用：要么是相关行的主键（InnoDB 对二级索引这样做），要么是对磁盘上位置的直接引用。在后一种情况下，存储行的地方称为 *堆文件*，它以无特定顺序存储数据（它可能是仅追加的，或者它可能跟踪已删除的行以便稍后用新数据覆盖它们）。例如，Postgres 使用堆文件方法 [^44]。
* 两者之间的折中是 *覆盖索引* 或 *包含列的索引*：除了在堆上或主键聚簇索引中存储完整行之外，它还在索引中存储表的 *某些* 列 [^45]。这允许仅使用索引来回答某些查询，而无需解析主键或查看堆文件（在这种情况下，称该索引 *覆盖* 了查询）。这可以使某些查询更快，但数据的重复意味着索引使用更多的磁盘空间并减慢写入速度。

到目前为止讨论的索引只将单个键映射到值。如果你需要同时查询表的多个列（或文档中的多个字段），请参见 ["多维索引与全文索引"](#sec_storage_multidimensional)。

当更新值而不更改键时，堆文件方法可以允许记录就地覆盖，前提是新值不大于旧值。如果新值更大，情况会更复杂，因为它可能需要移动到堆中有足够空间的新位置。在这种情况下，要么所有索引都需要更新以指向记录的新堆位置，要么在旧堆位置留下转发指针 [^2]。

### 全内存存储 {#sec_storage_inmemory}

本章到目前为止讨论的数据结构都是对磁盘限制的回应。与主内存相比，磁盘很难处理。对于磁盘和 SSD，如果你想在读取和写入上获得良好的性能，磁盘上的数据需要仔细布局。然而，我们容忍这种尴尬，因为磁盘有两个显著的优势：它们是持久的（如果断电，其内容不会丢失），并且它们每千兆字节的成本比 RAM 低。

随着 RAM 变得更便宜，按每 GB 计价的成本优势正在减弱。许多数据集根本没有那么大，因此将它们完全保留在内存中是完全可行的，甚至可以分布在几台机器上。这导致了 *内存数据库* 的发展。

一些内存键值存储，例如 Memcached，仅用于缓存，如果机器重新启动，数据丢失是可以接受的。但其他内存数据库旨在实现持久性，这可以通过特殊硬件（例如电池供电的 RAM）、将更改日志写入磁盘、将定期快照写入磁盘或将内存状态复制到其他机器来实现。

当内存数据库重新启动时，它需要重新加载其状态，要么从磁盘，要么通过网络从副本（除非使用特殊硬件）。尽管写入磁盘，它仍然是一个内存数据库，因为磁盘仅用作持久性的仅追加日志，读取完全从内存提供。写入磁盘还具有运维上的优势：磁盘上的文件可以很容易地由外部工具备份、检查和分析。

VoltDB、SingleStore 和 Oracle TimesTen 等产品是具有关系模型的内存数据库，供应商声称，通过消除管理磁盘数据结构相关的所有开销，它们可以提供巨大的性能改进 [^46] [^47]。RAMCloud 是一个开源的内存键值存储，具有持久性（对内存中的数据以及磁盘上的数据使用日志结构方法）[^48]。

Redis 和 Couchbase 通过异步写入磁盘提供弱持久性。

反直觉的是，内存数据库的性能优势不是因为它们不需要从磁盘读取。即使是基于磁盘的存储引擎，如果你有足够的内存，也可能永远不需要从磁盘读取，因为操作系统无论如何都会在内存中缓存最近使用的磁盘块。相反，它们可以更快，因为它们可以避免将内存数据结构编码为可以写入磁盘的形式的开销 [^49]。

除了性能，内存数据库的另一个有趣领域是提供了基于磁盘的索引难以实现的数据模型。例如，Redis 为各种数据结构（例如优先队列和集合）提供类似数据库的接口。因为它将所有数据保留在内存中，其实现相对简单。


## 分析型数据存储 {#sec_storage_analytics}

数据仓库的数据模型最常见的是关系型，因为 SQL 通常非常适合分析查询。有许多图形化数据分析工具可以生成 SQL 查询、可视化结果，并允许分析师探索数据（通过 *下钻* 和 *切片切块* 等操作）。

表面上，数据仓库和关系型 OLTP 数据库看起来很相似，因为它们都有 SQL 查询接口。然而，系统的内部可能看起来完全不同，因为它们针对非常不同的查询模式进行了优化。许多数据库供应商现在专注于支持事务处理或分析工作负载，但不是两者兼而有之。

一些数据库，如 Microsoft SQL Server、SAP HANA 和 SingleStore，在同一产品中支持事务处理和数据仓库。然而，这些混合事务和分析处理（HTAP）数据库（在 ["数据仓库"](/ch1#sec_introduction_dwh) 中介绍）越来越多地成为两个独立的存储和查询引擎，它们恰好可以通过通用的 SQL 接口访问 [^50] [^51] [^52] [^53]。

### 云数据仓库 {#sec_cloud_data_warehouses}

Teradata、Vertica 和 SAP HANA 等数据仓库供应商既销售商业许可下的本地仓库，也销售基于云的解决方案。但随着他们的许多客户转向云，新的云数据仓库（如 Google Cloud BigQuery、Amazon Redshift 和 Snowflake）也变得广泛采用。与传统数据仓库不同，云数据仓库利用可扩展的云基础设施，如对象存储和无服务器计算平台。

云数据仓库往往与其他云服务更好地集成，并且更具弹性。例如，许多云仓库支持自动日志摄取，并提供与数据处理框架（如 Google Cloud 的 Dataflow 或 Amazon Web Services 的 Kinesis）的轻松集成。这些仓库也更具弹性，因为它们将查询计算与存储层解耦 [^54]。数据持久存储在对象存储而不是本地磁盘上，这使得可以独立调整存储容量和查询的计算资源，正如我们之前在 ["云原生系统架构"](/ch1#sec_introduction_cloud_native) 中看到的。

Apache Hive、Trino 和 Apache Spark 等开源数据仓库也随着云的发展而发展。随着分析数据存储转移到对象存储上的数据湖，开源仓库也开始解耦拆分 [^55]。以下组件以前集成在单个系统（如 Apache Hive）中，现在通常作为单独的组件实现：

查询引擎
: Trino、Apache DataFusion 和 Presto 等查询引擎解析 SQL 查询，将其优化为执行计划，并在数据上执行这些计划。执行通常需要并行、分布式的数据处理任务。一些查询引擎提供内置任务执行，而有些则选择使用第三方执行框架，如 Apache Spark 或 Apache Flink。

存储格式
: 存储格式确定表的行如何编码为文件中的字节，然后通常存储在对象存储或分布式文件系统中 [^12]。然后查询引擎可以访问这些数据，但使用数据湖的其他应用程序也可以访问。此类存储格式的示例包括 Parquet、ORC、Lance 或 Nimble，我们将在下一节中看到更多关于它们的内容。

表格式
: 以 Apache Parquet 和类似存储格式编写的文件一旦写入通常就是不可变的。为了支持行插入和删除，通常会使用 Apache Iceberg 或 Databricks Delta 等表格式。表格式规定了哪些文件构成一张表，以及表模式的定义格式。此类格式还提供高级功能，例如时间旅行（查询表在过去某个时间点状态的能力）、垃圾回收，甚至事务。

数据目录
: 就像表格式定义哪些文件构成表一样，数据目录定义哪些表组成数据库。目录用于创建、重命名和删除表。与存储和表格式不同，Snowflake 的 Polaris 和 Databricks 的 Unity Catalog 等数据目录通常作为可以使用 REST 接口查询的独立服务运行。Apache Iceberg 也提供目录，可以在客户端内运行或作为单独的进程运行。查询引擎在读取和写入表时使用目录信息。传统上，目录和查询引擎已经集成，但将它们解耦使数据发现和数据治理系统（在 ["数据系统、法律和社会"](/ch1#sec_introduction_compliance) 中讨论）也能够访问目录的元数据。

### 列式存储 {#sec_storage_column}

如 ["星型和雪花型：分析模式"](/ch3#sec_datamodels_analytics) 中所讨论的，数据仓库按照惯例通常使用带有大型事实表的关系模式，该表包含对维度表的外键引用。如果你的事实表中有数万亿行和数 PB 的数据，有效地存储和查询它们就成为一个具有挑战性的问题。维度表通常要小得多（数百万行），因此在本节中我们将重点关注事实的存储。

尽管事实表通常有超过 100 列，但典型的数据仓库查询一次只访问其中的 4 或 5 列（分析很少需要 `"SELECT *"` 查询）[^52]。以 [示例 4-1](#fig_storage_analytics_query) 中的查询为例：它访问大量行（2024 日历年期间每次有人购买水果或糖果的情况），但它只需要访问 `fact_sales` 表的三列：`date_key`、`product_sk` 和 `quantity`。查询忽略所有其他列。

{{< figure id="fig_storage_analytics_query" title="示例 4-1. 分析在一周中的不同日子，人们更倾向于购买新鲜水果还是糖果" class="w-full my-4" >}}

```sql
SELECT
    dim_date.weekday, dim_product.category,
    SUM(fact_sales.quantity) AS quantity_sold
FROM fact_sales
    JOIN dim_date ON fact_sales.date_key = dim_date.date_key
    JOIN dim_product ON fact_sales.product_sk = dim_product.product_sk
WHERE
    dim_date.year = 2024 AND
    dim_product.category IN ('Fresh fruit', 'Candy')
GROUP BY
    dim_date.weekday, dim_product.category;
```

我们如何高效地执行这个查询？

在大多数 OLTP 数据库中，存储是以 *面向行* 的方式布局的：表中一行的所有值彼此相邻存储。文档数据库类似：整个文档通常作为一个连续的字节序列存储。你可以在 [图 4-1](#fig_storage_csv_hash_index) 的 CSV 示例中看到这一点。

为了处理像 [示例 4-1](#fig_storage_analytics_query) 这样的查询，你可能在 `fact_sales.date_key` 和/或 `fact_sales.product_sk` 上有索引，告诉存储引擎在哪里找到特定日期或特定产品的所有销售。但是，面向行的存储引擎仍然需要将所有这些行（每行包含超过 100 个属性）从磁盘加载到内存中，解析它们，并过滤掉不符合所需条件的行。这可能需要很长时间。

*面向列*（或 *列式*）存储背后的想法很简单：不要将一行中的所有值存储在一起，而是将每 *列* 中的所有值存储在一起 [^56]。如果每列单独存储，查询只需要读取和解析该查询中使用的那些列，这可以节省大量工作。[图 4-7](#fig_column_store) 使用 [图 3-5](/ch3#fig_dwh_schema) 中事实表的扩展版本展示了这一原理。

--------

> [!NOTE]
> 列存储在关系数据模型中最容易理解，但它同样适用于非关系数据。例如，Parquet [^57] 是一种列式存储格式，它支持基于 Google 的 Dremel [^58] 的文档数据模型，使用一种称为 *分解*（*shredding*）或 *条带化*（*striping*）的技术 [^59]。

--------

{{< figure src="/fig/ddia_0407.png" id="fig_column_store" caption="图 4-7. 按列而不是按行存储关系数据。" class="w-full my-4" >}}

面向列的存储布局依赖于每列以相同顺序存储行。因此，如果你需要重新组装整行，你可以从每个单独的列中取出第 23 个条目，并将它们组合在一起形成表的第 23 行。

实际上，列式存储引擎并不真的一次存储整个列（可能包含数万亿行）。相反，它们将表分解为数千或数百万行的块，并且在每个块内，它们分别存储每列的值 [^60]。由于许多查询都限制在特定的日期范围内，因此通常使每个块包含特定时间戳范围的行。然后查询只需要在与所需日期范围重叠的那些块中加载它需要的列。

列式存储如今几乎用于所有分析数据库 [^60]，从大规模云数据仓库（如 Snowflake [^61]）到单节点嵌入式数据库（如 DuckDB [^62]），以及产品分析系统（如 Pinot [^63] 和 Druid [^64]）。它用于存储格式，如 Parquet、ORC [^65] [^66]、Lance [^67] 和 Nimble [^68]，以及内存分析格式，如 Apache Arrow [^65] [^69] 和 Pandas/NumPy [^70]。一些时间序列数据库，如 InfluxDB IOx [^71] 和 TimescaleDB [^72]，也基于面向列的存储。

#### 列压缩 {#sec_storage_column_compression}

除了只从磁盘加载查询所需的那些列之外，我们还可以通过压缩数据进一步减少对磁盘吞吐量和网络带宽的需求。幸运的是，面向列的存储通常非常适合压缩。

看看 [图 4-7](#fig_column_store) 中每列的值序列：它们看起来经常重复，这是压缩的良好迹象。根据列中的数据，可以使用不同的压缩技术。在数据仓库中特别有效的一种技术是 *位图编码*，如 [图 4-8](#fig_bitmap_index) 所示。

{{< figure src="/fig/ddia_0408.png" id="fig_bitmap_index" caption="图 4-8. 单列的压缩、位图索引存储。" class="w-full my-4" >}}

通常，列中不同值的数量与行数相比很小（例如，零售商可能有数十亿条销售交易，但只有 100,000 种不同的产品）。我们现在可以将具有 *n* 个不同值的列转换为 *n* 个单独的位图：每个不同值一个位图，每行一位。如果该行具有该值，则该位为 1，否则为 0。

一种选择是使用每行一位来存储这些位图。然而，这些位图通常包含大量零（我们说它们是 *稀疏* 的）。在这种情况下，位图可以另外进行游程编码：计算连续零或一的数量并存储该数字，如 [图 4-8](#fig_bitmap_index) 底部所示。诸如 *咆哮位图*（*roaring bitmaps*）之类的技术在两种位图表示之间切换，使用最紧凑的表示 [^73]。这可以使列的编码非常高效。

像这样的位图索引非常适合数据仓库中常见的查询类型。例如：

`WHERE product_sk IN (31, 68, 69):`
: 加载 `product_sk = 31`、`product_sk = 68` 和 `product_sk = 69` 的三个位图，并计算三个位图的按位 *OR*，这可以非常高效地完成。

`WHERE product_sk = 30 AND store_sk = 3:`
: 加载 `product_sk = 30` 和 `store_sk = 3` 的位图，并计算按位 *AND*。这有效是因为列以相同的顺序包含行，所以一列位图中的第 *k* 位对应于另一列位图中第 *k* 位的同一行。

位图也可用于回答图查询，例如查找社交网络中被用户 *X* 关注并且也关注用户 *Y* 的所有用户 [^74]。列式数据库还有各种其他压缩方案，你可以在参考文献中找到 [^75]。

--------

> [!NOTE]
> 不要将面向列的数据库与 *宽列*（也称为 *列族*）数据模型混淆，在该模型中，一行可以有数千列，并且不需要所有行都有相同的列 [^9]。尽管名称相似，宽列数据库是面向行的，因为它们将一行中的所有值存储在一起。Google 的 Bigtable、Apache Accumulo 和 HBase 是宽列模型的例子。

--------

#### 列存储中的排序顺序 {#sort-order-in-column-storage}

在列存储中，行的存储顺序并不一定重要。最简单的是按插入顺序存储它们，因为这样插入新行只需追加到每列。但是，我们可以选择强制执行顺序，就像我们之前对 SSTable 所做的那样，并将其用作索引机制。

请注意，独立排序每列是没有意义的，因为那样我们就不再知道列中的哪些项属于同一行。我们只能重建一行，因为我们知道一列中的第 *k* 个项与另一列中的第 *k* 个项属于同一行。

相反，数据需要一次排序整行，即使它是按列存储的。数据库管理员可以使用他们对常见查询的了解来选择表应按哪些列排序。例如，如果查询经常针对日期范围（例如上个月），则将 `date_key` 作为第一个排序键可能是有意义的。然后查询可以只扫描上个月的行，这将比扫描所有行快得多。

第二列可以确定在第一列中具有相同值的任何行的排序顺序。例如，如果 `date_key` 是 [图 4-7](#fig_column_store) 中的第一个排序键，那么 `product_sk` 作为第二个排序键可能是有意义的，这样同一天同一产品的所有销售都在存储中分组在一起。这将有助于需要在某个日期范围内按产品分组或过滤销售的查询。

排序顺序的另一个优点是它可以帮助压缩列。如果主排序列没有许多不同的值，那么排序后，它将有很长的序列，其中相同的值连续重复多次。简单的游程编码，就像我们在 [图 4-8](#fig_bitmap_index) 中用于位图的那样，可以将该列压缩到几千字节 —— 即使表有数十亿行。

该压缩效果在第一个排序键上最强。第二和第三个排序键将更加混乱，因此不会有如此长的连续重复值。排序优先级较低的列基本上以随机顺序出现，因此它们可能不会压缩得那么好。但是，让前几列有序，总体上仍然是划算的。

#### 写入列式存储 {#writing-to-column-oriented-storage}

我们在 ["事务处理和分析的特征"](/ch1#sec_introduction_oltp) 中看到，数据仓库中的读取往往包括大量行的聚合；列式存储、压缩和排序都有助于使这些读取查询更快。数据仓库中的写入往往是数据的批量导入，通常通过 ETL 过程。

使用列式存储，在排序表的中间某处写入单个行将非常低效，因为你必须从插入位置开始重写所有压缩列。但是，一次批量写入许多行会分摊重写这些列的成本，使其高效。

通常使用日志结构方法以批次执行写入。所有写入首先进入面向行的、排序的内存存储。当积累了足够的写入时，它们将与磁盘上的列编码文件合并，并批量写入新文件。由于旧文件保持不可变，新文件一次写入，对象存储非常适合存储这些文件。

查询需要检查磁盘上的列数据和内存中的最近写入，并将两者结合起来。查询执行引擎对用户隐藏了这种区别。从分析师的角度来看，已通过插入、更新或删除修改的数据会立即反映在后续查询中。Snowflake、Vertica、Apache Pinot、Apache Druid 和许多其他系统都这样做 [^61] [^63] [^64] [^76]。


### 查询执行：编译与向量化 {#sec_storage_vectorized}

用于分析的复杂 SQL 查询被分解为由多个阶段组成的 *查询计划*，称为 *算子*，这些算子可能分布在多台机器上以并行执行。查询规划器可以通过选择使用哪些算子、以何种顺序执行它们以及在哪里运行每个算子来执行大量优化。

在每个算子内，查询引擎需要对列中的值执行各种操作，例如查找值在特定值集中的所有行（可能作为连接的一部分），或检查值是否大于 15。它还需要查看同一行的几列，例如查找产品是香蕉且门店是某个特定目标门店的所有销售交易。

对于需要扫描数百万行的数据仓库查询，我们不仅需要担心它们需要从磁盘读取的数据量，还需要担心执行复杂算子所需的 CPU 时间。最简单的算子类型就像编程语言的解释器：在遍历每一行时，它检查表示查询的数据结构，以找出需要对哪些列执行哪些比较或计算。不幸的是，这对许多分析目的来说太慢了。高效查询执行的两种替代方法已经出现 [^77]：

查询编译
: 查询引擎获取 SQL 查询并生成用于执行它的代码。代码逐行迭代，查看感兴趣列中的值，执行所需的任何比较或计算，如果满足所需条件，则将必要的值复制到输出缓冲区。查询引擎将生成的代码编译为机器代码（通常使用现有编译器，如 LLVM），然后在已加载到内存中的列编码数据上运行它。这种代码生成方法类似于 Java 虚拟机（JVM）和类似运行时中使用的即时（JIT）编译方法。

向量化处理
: 查询被解释，而不是编译，但通过批量处理列中的许多值而不是逐行迭代来提高速度。一组固定的预定义算子内置在数据库中；我们可以向它们传递参数并获得一批结果 [^50] [^75]。

例如，我们可以将 `product_sk` 列和"香蕉"的 ID 传递给相等算子，并获得一个位图（输入列中每个值一位，如果是香蕉则为 1）；然后我们可以将 `store_sk` 列和感兴趣商店的 ID 传递给相同的相等算子，并获得另一个位图；然后我们可以将两个位图传递给"按位 AND"算子，如 [图 4-9](#fig_bitmap_and) 所示。结果将是一个位图，其中特定商店里每一笔香蕉销售所对应的位都为 1。

{{< figure src="/fig/ddia_0409.png" id="fig_bitmap_and" caption="图 4-9. 两个位图之间的按位 AND 适合向量化。" class="w-full my-4" >}}

这两种方法在实现方面非常不同，但两者都在实践中使用 [^77]。两者都可以通过利用现代 CPU 的特性来实现非常好的性能：

* 优先选择顺序内存访问而不是随机访问以减少缓存未命中 [^78]，
* 在紧密的内部循环中完成大部分工作（即，具有少量指令且没有函数调用）以保持 CPU 指令流水线繁忙并避免分支预测错误，
* 利用并行性，例如多线程和单指令多数据（SIMD）指令 [^79] [^80]，以及
* 直接对压缩数据进行操作，而无需将其解码为单独的内存表示，这可以节省内存分配和复制成本。

### 物化视图与数据立方体 {#sec_storage_materialized_views}

我们之前在 ["物化和更新时间线"](/ch2#sec_introduction_materializing) 中遇到了 *物化视图*：在关系数据模型中，它们是表状对象，其内容是某些查询的结果。区别在于物化视图是查询结果的实际副本，写入磁盘，而虚拟视图只是编写查询的快捷方式。当你从虚拟视图读取时，SQL 引擎会即时将其扩展为视图的基础查询，然后处理扩展的查询。

当基础数据更改时，物化视图需要相应更新。一些数据库可以自动执行此操作，还有像 Materialize 这样专门从事物化视图维护的系统 [^81]。执行此类更新意味着写入时需要更多工作，但物化视图可以改善在重复需要执行相同查询的工作负载中的读取性能。

*物化聚合* 是一种可以在数据仓库中有用的物化视图类型。如前所述，数据仓库查询通常涉及聚合函数，例如 SQL 中的 `COUNT`、`SUM`、`AVG`、`MIN` 或 `MAX`。如果许多不同的查询使用相同的聚合，每次都处理原始数据可能会很浪费。为什么不缓存查询最常使用的一些计数或总和？*数据立方体*（*OLAP 立方体*）通过创建按不同维度分组的聚合网格来做到这一点 [^82]。[图 4-10](#fig_data_cube) 显示了一个示例。

{{< figure src="/fig/ddia_0410.png" id="fig_data_cube" caption="图 4-10. 数据立方体的两个维度，通过求和聚合数据。" class="w-full my-4" >}}

现在假设每个事实只有两个维度表的外键 —— 在 [图 4-10](#fig_data_cube) 中，这些是 `date_key` 和 `product_sk`。你现在可以绘制一个二维表，日期沿着一个轴，产品沿着另一个轴。每个单元格包含具有该日期-产品组合的所有事实的属性（例如 `net_price`）的聚合（例如 `SUM`）。然后，你可以沿着每行或列应用相同的聚合，并获得已减少一个维度的摘要（不管日期的产品销售，或不管产品的日期销售）。

一般来说，事实通常有两个以上的维度。在 [图 3-5](/ch3#fig_dwh_schema) 中有五个维度：日期、产品、商店、促销和客户。很难想象五维超立方体会是什么样子，但原理保持不变：每个单元格包含特定日期-产品-商店-促销-客户组合的销售。然后可以沿着每个维度重复汇总这些值。

物化数据立方体的优点是某些查询会变得非常快，因为结果已经被预先计算好了。例如，如果你想知道昨天每个商店的总销售额，你只需要查看相应维度上的汇总值 —— 不需要扫描数百万行。

缺点是数据立方体不像直接查询原始数据那样灵活。例如，没有办法计算售价超过 100 美元的商品销售占比，因为价格并不是其中一个维度。因此，大多数数据仓库都会尽可能保留原始数据，只把这类聚合（如数据立方体）当作特定查询的性能加速手段。


## 多维索引与全文索引 {#sec_storage_multidimensional}

我们在本章前半部分看到的 B 树和 LSM 树允许对单个属性进行范围查询：例如，如果键是用户名，你可以使用它们作为索引来高效查找所有以 L 开头的名称。但有时，按单个属性搜索是不够的。

最常见的多列索引类型称为 *联合索引*，它通过将一列追加到另一列来将几个字段组合成一个键（索引定义指定字段以何种顺序连接）。这就像老式的纸质电话簿，它提供从（*姓氏*、*名字*）到电话号码的索引。由于排序顺序，索引可用于查找具有特定姓氏的所有人，或具有特定 *姓氏-名字* 组合的所有人。但是，如果你想查找具有特定名字的所有人，索引是无用的。

另一方面，*多维索引* 允许你一次查询多个列。在地理空间数据中这尤其重要。例如，餐厅搜索网站可能有一个包含每个餐厅的纬度和经度的数据库。当用户在地图上查看餐厅时，网站需要搜索用户当前查看的矩形地图区域内的所有餐厅。这需要像以下这样的二维范围查询：

```sql
SELECT * FROM restaurants WHERE latitude > 51.4946 AND latitude < 51.5079
    AND longitude > -0.1162 AND longitude < -0.1004;
```

纬度和经度列上的联合索引无法有效地回答这种查询：它可以为你提供纬度范围内的所有餐厅（但在任何经度），或经度范围内的所有餐厅（但在北极和南极之间的任何地方），但不能同时提供两者。

一种选择是使用空间填充曲线将二维位置转换为单个数字，然后使用常规 B 树索引 [^83]。更常见的是，使用专门的空间索引，如 R 树或 Bkd 树 [^84]；它们划分空间，使附近的数据点倾向于分组在同一子树中。例如，PostGIS 使用 PostgreSQL 的通用搜索树索引设施将地理空间索引实现为 R 树 [^85]。也可以使用规则间隔的三角形、正方形或六边形网格 [^86]。

多维索引不仅用于地理位置。例如，在电子商务网站上，你可以在维度（*红色*、*绿色*、*蓝色*）上使用三维索引来搜索某个颜色范围内的产品，或者在天气观测数据库中，你可以在（*日期*、*温度*）上有一个二维索引，以便有效地搜索 2013 年期间温度在 25 到 30°C 之间的所有观测。使用一维索引，你必须扫描 2013 年的所有记录（不管温度），然后按温度过滤它们，反之亦然。二维索引可以同时按时间戳和温度缩小范围 [^87]。

### 全文检索 {#sec_storage_full_text}

全文检索允许你通过可能出现在文本中任何位置的关键字搜索文本文档集合（网页、产品描述等）[^88]。信息检索是一个大的专业主题，通常涉及特定于语言的处理：例如，几种亚洲语言在单词之间没有空格或标点符号，因此将文本分割成单词需要一个指示哪些字符序列构成单词的模型。全文检索还经常涉及匹配相似但不相同的单词（例如拼写错误或单词的不同语法形式）和同义词。这些问题超出了本书的范围。

然而，在其核心，你可以将全文检索视为另一种多维查询：在这种情况下，可能出现在文本中的每个单词（*词项*）是一个维度。包含词项 *x* 的文档在维度 *x* 中的值为 1，不包含 *x* 的文档的值为 0。搜索提到“红苹果”的文档意味着查询在 *红* 维度中查找 1，同时在 *苹果* 维度中查找 1。维度数量可能因此非常大。

许多搜索引擎用来回答此类查询的数据结构称为 *倒排索引*。这是一个键值结构，其中键是词项，值是包含该词项的所有文档的 ID 列表（*倒排列表*）。如果文档 ID 是顺序数字，倒排列表也可以表示为稀疏位图，如 [图 4-8](#fig_bitmap_index) 所示：如果 ID 为 *n* 的文档包含词项 *x*，则词项 *x* 的位图中的第 *n* 位为 1 [^89]。

查找包含词项 *x* 和 *y* 的所有文档现在类似于搜索匹配两个条件的行的向量化数据仓库查询（[图 4-9](#fig_bitmap_and)）：加载词项 *x* 和 *y* 的两个位图并计算它们的按位 AND。即使位图是游程编码的，这也可以非常高效地完成。

例如，Elasticsearch 和 Solr 使用的全文索引引擎 Lucene 就是这样工作的 [^90]。它将词项到倒排列表的映射存储在类似 SSTable 的排序文件中，这些文件使用我们在本章前面看到的相同日志结构方法在后台合并 [^91]。PostgreSQL 的 GIN 索引类型也使用倒排列表来支持全文检索和 JSON 文档内的索引 [^92] [^93]。

除了将文本分解为单词，另一种选择是查找长度为 *n* 的所有子字符串，称为 *n-gram*（*n 元语法*）。例如，字符串 `"hello"` 的三元语法（*n* = 3）是 `"hel"`、`"ell"` 和 `"llo"`。如果我们为所有三元语法构建倒排索引，我们就可以搜索任意至少三个字符长的子字符串。三元语法索引甚至允许在搜索查询中使用正则表达式；缺点是它们相当大 [^94]。

为了处理文档或查询中的拼写错误，Lucene 能够在一定编辑距离内搜索文本中的单词（编辑距离为 1 意味着已添加、删除或替换了一个字母）[^95]。它通过将词项集存储为字符上的有限状态自动机（类似于 *字典树* [^96]）并将其转换为 *莱文斯坦自动机* 来实现，该自动机支持在给定编辑距离内高效搜索单词 [^97]。


### 向量嵌入 {#id92}

语义搜索超越了同义词和拼写错误，试图理解文档概念和用户意图。例如，如果你的帮助页面中有一个标题为“取消订阅”的页面，用户在搜索“如何关闭我的账户”或“终止合同”时，仍应能找到这个页面，即使查询词完全不同，但语义非常接近。

为了理解文档的语义 —— 它的含义 —— 语义搜索索引使用嵌入模型将文档转换为浮点值向量，称为 *向量嵌入*。向量表示多维空间中的一个点，每个浮点值表示文档沿着一个维度轴的位置。当输入文档在语义上相似时，嵌入模型为它们生成的向量嵌入（在这个多维空间中）也彼此接近。

--------

> [!NOTE]
> 我们在 ["查询执行：编译与向量化"](#sec_storage_vectorized) 中看到了术语 *向量化处理*。语义搜索中的向量有不同的含义。在向量化处理中，向量指的是可以用特别优化的代码处理的一批位。在嵌入模型中，向量是表示多维空间中位置的浮点数列表。

--------

例如，关于农业的维基百科页面的三维向量嵌入可能是 `[0.1, 0.22, 0.11]`。关于蔬菜的维基百科页面会非常接近，可能嵌入为 `[0.13, 0.19, 0.24]`。关于星型模式的页面可能有 `[0.82, 0.39, -0.74]` 的嵌入，相对较远。我们可以通过观察看出前两个向量比第三个更接近。

嵌入模型使用更大的向量（通常超过 1,000 个数字），但原理是相同的。我们不试图理解各个数字的含义；它们只是嵌入模型指向抽象多维空间中位置的一种方式。搜索引擎使用距离函数（如余弦相似度或欧几里得距离）来测量向量之间的距离。余弦相似度测量两个向量夹角的余弦以确定它们的接近程度，而欧几里得距离测量空间中两点之间的直线距离。

许多早期的嵌入模型，如 Word2Vec [^98]、BERT [^99] 和 GPT [^100] 都处理文本数据。这些模型通常实现为神经网络。研究人员继续为视频、音频和图像创建嵌入模型。最近，模型架构已经变成 *多模态* 的：单个模型可以为多种模态（如文本和图像）生成向量嵌入。

语义搜索引擎在用户输入查询时使用嵌入模型生成向量嵌入。用户的查询和相关上下文（例如用户的位置）被输入到嵌入模型中。嵌入模型生成查询的向量嵌入后，搜索引擎必须使用向量索引找到具有相似向量嵌入的文档。

向量索引存储文档集合的向量嵌入。要查询索引，你传入查询的向量嵌入，索引返回其向量最接近查询向量的文档。由于我们之前看到的 R 树不适用于维度很多的向量，因此需要使用专门的向量索引，例如：

平面索引（Flat indexes）
: 向量按原样存储在索引中。查询必须读取每个向量并测量其与查询向量的距离。平面索引是准确的，但测量查询与每个向量之间的距离很慢。

倒排文件（IVF）索引
: 向量空间被聚类为向量的分区（称为 *质心*），以减少必须比较的向量数量。IVF 索引比平面索引更快，但只能给出近似结果：即使查询和文档彼此接近，它们也可能落入不同的分区。对 IVF 索引的查询首先定义 *探针*，这只是要检查的分区数。使用更多探针的查询将更准确，但会更慢，因为必须比较更多向量。

分层可导航小世界（HNSW）
: HNSW 索引维护向量空间的多个层，如 [图 4-11](#fig_vector_hnsw) 所示。每一层都表示为一个图，其中节点表示向量，边表示与附近向量的接近度。查询首先在最顶层定位最近的向量，该层具有少量节点。然后查询移动到下面一层的同一节点，并跟随该层中的边，该层连接更密集，寻找更接近查询向量的向量。该过程继续直到到达最后一层。与 IVF 索引一样，HNSW 索引是近似的。

{{< figure src="/fig/ddia_0411.png" id="fig_vector_hnsw" caption="图 4-11. 在 HNSW 索引中搜索最接近给定查询向量的数据库条目。" class="w-full my-4" >}}


许多流行的向量数据库实现了 IVF 和 HNSW 索引。Facebook 的 Faiss 库为这两种索引都提供了许多变体 [^101]，PostgreSQL 的 pgvector 也支持两者 [^102]。IVF 和 HNSW 算法的完整细节超出了本书的范围，但它们的论文是极好的资源 [^103] [^104]。

## 总结 {#summary}

在本章中，我们试图深入了解数据库如何执行存储和检索。当你在数据库中存储数据时会发生什么，当你稍后再次查询数据时数据库会做什么？

["分析型与事务型系统"](/ch1#sec_introduction_analytics) 介绍了事务处理（OLTP）和分析（OLAP）之间的区别。在本章中，我们看到为 OLTP 优化的存储引擎与为分析优化的存储引擎看起来非常不同：

* OLTP 系统针对大量请求进行了优化，每个请求读取和写入少量记录，并且需要快速响应。记录通常通过主键或二级索引访问，这些索引通常是从键到记录的有序映射，也支持范围查询。
* 数据仓库和类似的分析系统针对扫描大量记录的复杂读取查询进行了优化。它们通常使用带有压缩的列式存储布局，以最小化此类查询需要从磁盘读取的数据量，并使用查询的即时编译或向量化来最小化处理数据所花费的 CPU 时间。

在 OLTP 方面，我们看到了两个主要思想流派的存储引擎：

* 日志结构方法，只允许追加到文件和删除过时文件，但从不更新已写入的文件。SSTable、LSM 树、RocksDB、Cassandra、HBase、Scylla、Lucene 等属于这一组。一般来说，日志结构存储引擎往往提供高写入吞吐量。
* 就地更新方法，将磁盘视为一组可以覆盖的固定大小页。B 树是这种理念最典型的例子，用于所有主要的关系型 OLTP 数据库以及许多非关系型数据库。作为经验法则，B 树往往更适合读取，提供比日志结构存储更高的读取吞吐量和更低的响应时间。

然后我们查看了可以同时搜索多个条件的索引：多维索引（如 R 树）可以同时按纬度和经度搜索地图上的点，全文检索索引可以搜索出现在同一文本中的多个关键字。最后，向量数据库用于文本文档和其他媒体的语义搜索；它们使用具有大量维度的向量，并通过比较向量相似性来查找相似文档。

作为应用开发者，如果你掌握了这些关于存储引擎内部机制的知识，就能更好地判断哪种工具最适合你的具体应用。如果你需要调整数据库的调优参数，这种理解也能帮助你预判参数调高或调低可能带来的影响。

尽管本章不能让你成为调优某个特定存储引擎的专家，但希望它已经为你提供了足够的术语和思路，使你能够读懂所选数据库的文档。


### 参考


[^1]: Nikolay Samokhvalov. [How partial, covering, and multicolumn indexes may slow down UPDATEs in PostgreSQL](https://postgres.ai/blog/20211029-how-partial-and-covering-indexes-affect-update-performance-in-postgresql). *postgres.ai*, October 2021. Archived at [perma.cc/PBK3-F4G9](https://perma.cc/PBK3-F4G9)
[^2]: Goetz Graefe. [Modern B-Tree Techniques](https://w6113.github.io/files/papers/btreesurvey-graefe.pdf). *Foundations and Trends in Databases*, volume 3, issue 4, pages 203–402, August 2011. [doi:10.1561/1900000028](https://doi.org/10.1561/1900000028)
[^3]: Evan Jones. [Why databases use ordered indexes but programming uses hash tables](https://www.evanjones.ca/ordered-vs-unordered-indexes.html). *evanjones.ca*, December 2019. Archived at [perma.cc/NJX8-3ZZD](https://perma.cc/NJX8-3ZZD)
[^4]: Branimir Lambov. [CEP-25: Trie-indexed SSTable format](https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-25%3A%2BTrie-indexed%2BSSTable%2Bformat). *cwiki.apache.org*, November 2022. Archived at [perma.cc/HD7W-PW8U](https://perma.cc/HD7W-PW8U). Linked Google Doc archived at [perma.cc/UL6C-AAAE](https://perma.cc/UL6C-AAAE)
[^5]: Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein: *Introduction to Algorithms*, 3rd edition. MIT Press, 2009. ISBN: 978-0-262-53305-8
[^6]: Branimir Lambov. [Trie Memtables in Cassandra](https://www.vldb.org/pvldb/vol15/p3359-lambov.pdf). *Proceedings of the VLDB Endowment*, volume 15, issue 12, pages 3359–3371, August 2022. [doi:10.14778/3554821.3554828](https://doi.org/10.14778/3554821.3554828)
[^7]: Dhruba Borthakur. [The History of RocksDB](https://rocksdb.blogspot.com/2013/11/the-history-of-rocksdb.html). *rocksdb.blogspot.com*, November 2013. Archived at [perma.cc/Z7C5-JPSP](https://perma.cc/Z7C5-JPSP)
[^8]: Matteo Bertozzi. [Apache HBase I/O – HFile](https://blog.cloudera.com/apache-hbase-i-o-hfile/). *blog.cloudera.com*, June 2012. Archived at [perma.cc/U9XH-L2KL](https://perma.cc/U9XH-L2KL)
[^9]: Fay Chang, Jeffrey Dean, Sanjay Ghemawat, Wilson C. Hsieh, Deborah A. Wallach, Mike Burrows, Tushar Chandra, Andrew Fikes, and Robert E. Gruber. [Bigtable: A Distributed Storage System for Structured Data](https://research.google/pubs/pub27898/). At *7th USENIX Symposium on Operating System Design and Implementation* (OSDI), November 2006.
[^10]: Patrick O’Neil, Edward Cheng, Dieter Gawlick, and Elizabeth O’Neil. [The Log-Structured Merge-Tree (LSM-Tree)](https://www.cs.umb.edu/~poneil/lsmtree.pdf). *Acta Informatica*, volume 33, issue 4, pages 351–385, June 1996. [doi:10.1007/s002360050048](https://doi.org/10.1007/s002360050048)
[^11]: Mendel Rosenblum and John K. Ousterhout. [The Design and Implementation of a Log-Structured File System](https://research.cs.wisc.edu/areas/os/Qual/papers/lfs.pdf). *ACM Transactions on Computer Systems*, volume 10, issue 1, pages 26–52, February 1992. [doi:10.1145/146941.146943](https://doi.org/10.1145/146941.146943)
[^12]: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu, Mukul Murthy, Joseph Torres, Herman van Hovell, Adrian Ionescu, Alicja Łuszczak, Michał Świtakowski, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, and Matei Zaharia. [Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores](https://vldb.org/pvldb/vol13/p3411-armbrust.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3411–3424, August 2020. [doi:10.14778/3415478.3415560](https://doi.org/10.14778/3415478.3415560)
[^13]: Burton H. Bloom. [Space/Time Trade-offs in Hash Coding with Allowable Errors](https://people.cs.umass.edu/~emery/classes/cmpsci691st/readings/Misc/p422-bloom.pdf). *Communications of the ACM*, volume 13, issue 7, pages 422–426, July 1970. [doi:10.1145/362686.362692](https://doi.org/10.1145/362686.362692)
[^14]: Adam Kirsch and Michael Mitzenmacher. [Less Hashing, Same Performance: Building a Better Bloom Filter](https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf). *Random Structures & Algorithms*, volume 33, issue 2, pages 187–218, September 2008. [doi:10.1002/rsa.20208](https://doi.org/10.1002/rsa.20208)
[^15]: Thomas Hurst. [Bloom Filter Calculator](https://hur.st/bloomfilter/). *hur.st*, September 2023. Archived at [perma.cc/L3AV-6VC2](https://perma.cc/L3AV-6VC2)
[^16]: Chen Luo and Michael J. Carey. [LSM-based storage techniques: a survey](https://arxiv.org/abs/1812.07527). *The VLDB Journal*, volume 29, pages 393–418, July 2019. [doi:10.1007/s00778-019-00555-y](https://doi.org/10.1007/s00778-019-00555-y)
[^17]: Subhadeep Sarkar and Manos Athanassoulis. [Dissecting, Designing, and Optimizing LSM-based Data Stores](https://www.youtube.com/watch?v=hkMkBZn2mGs). Tutorial at *ACM International Conference on Management of Data* (SIGMOD), June 2022. Slides archived at [perma.cc/93B3-E827](https://perma.cc/93B3-E827)
[^18]: Mark Callaghan. [Name that compaction algorithm](https://smalldatum.blogspot.com/2018/08/name-that-compaction-algorithm.html). *smalldatum.blogspot.com*, August 2018. Archived at [perma.cc/CN4M-82DY](https://perma.cc/CN4M-82DY)
[^19]: Prashanth Rao. [Embedded databases (1): The harmony of DuckDB, KùzuDB and LanceDB](https://thedataquarry.com/posts/embedded-db-1/). *thedataquarry.com*, August 2023. Archived at [perma.cc/PA28-2R35](https://perma.cc/PA28-2R35)
[^20]: Hacker News discussion. [Bluesky migrates to single-tenant SQLite](https://news.ycombinator.com/item?id=38171322). *news.ycombinator.com*, October 2023. Archived at [perma.cc/69LM-5P6X](https://perma.cc/69LM-5P6X)
[^21]: Rudolf Bayer and Edward M. McCreight. [Organization and Maintenance of Large Ordered Indices](https://dl.acm.org/doi/pdf/10.1145/1734663.1734671). Boeing Scientific Research Laboratories, Mathematical and Information Sciences Laboratory, report no. 20, July 1970. [doi:10.1145/1734663.1734671](https://doi.org/10.1145/1734663.1734671)
[^22]: Douglas Comer. [The Ubiquitous B-Tree](https://web.archive.org/web/20170809145513id_/http%3A//sites.fas.harvard.edu/~cs165/papers/comer.pdf). *ACM Computing Surveys*, volume 11, issue 2, pages 121–137, June 1979. [doi:10.1145/356770.356776](https://doi.org/10.1145/356770.356776)
[^23]: Alex Miller. [Torn Write Detection and Protection](https://transactional.blog/blog/2025-torn-writes). *transactional.blog*, April 2025. Archived at [perma.cc/G7EB-33EW](https://perma.cc/G7EB-33EW)
[^24]: C. Mohan and Frank Levine. [ARIES/IM: An Efficient and High Concurrency Index Management Method Using Write-Ahead Logging](https://ics.uci.edu/~cs223/papers/p371-mohan.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 1992. [doi:10.1145/130283.130338](https://doi.org/10.1145/130283.130338)
[^25]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^26]: Howard Chu. [LDAP at Lightning Speed](https://buildstuff14.sched.com/event/08a1a368e272eb599a52e08b4c3c779d). At *Build Stuff ’14*, November 2014. Archived at [perma.cc/GB6Z-P8YH](https://perma.cc/GB6Z-P8YH)
[^27]: Manos Athanassoulis, Michael S. Kester, Lukas M. Maas, Radu Stoica, Stratos Idreos, Anastasia Ailamaki, and Mark Callaghan. [Designing Access Methods: The RUM Conjecture](https://openproceedings.org/2016/conf/edbt/paper-12.pdf). At *19th International Conference on Extending Database Technology* (EDBT), March 2016. [doi:10.5441/002/edbt.2016.42](https://doi.org/10.5441/002/edbt.2016.42)
[^28]: Ben Stopford. [Log Structured Merge Trees](http://www.benstopford.com/2015/02/14/log-structured-merge-trees/). *benstopford.com*, February 2015. Archived at [perma.cc/E5BV-KUJ6](https://perma.cc/E5BV-KUJ6)
[^29]: Mark Callaghan. [The Advantages of an LSM vs a B-Tree](https://smalldatum.blogspot.com/2016/01/summary-of-advantages-of-lsm-vs-b-tree.html). *smalldatum.blogspot.co.uk*, January 2016. Archived at [perma.cc/3TYZ-EFUD](https://perma.cc/3TYZ-EFUD)
[^30]: Oana Balmau, Florin Dinu, Willy Zwaenepoel, Karan Gupta, Ravishankar Chandhiramoorthi, and Diego Didona. [SILK: Preventing Latency Spikes in Log-Structured Merge Key-Value Stores](https://www.usenix.org/conference/atc19/presentation/balmau). At *USENIX Annual Technical Conference*, July 2019.
[^31]: Igor Canadi, Siying Dong, Mark Callaghan, et al. [RocksDB Tuning Guide](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide). *github.com*, 2023. Archived at [perma.cc/UNY4-MK6C](https://perma.cc/UNY4-MK6C)
[^32]: Gabriel Haas and Viktor Leis. [What Modern NVMe Storage Can Do, and How to Exploit it: High-Performance I/O for High-Performance Storage Engines](https://www.vldb.org/pvldb/vol16/p2090-haas.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 9, pages 2090–2102. [doi:10.14778/3598581.3598584](https://doi.org/10.14778/3598581.3598584)
[^33]: Emmanuel Goossaert. [Coding for SSDs](https://codecapsule.com/2014/02/12/coding-for-ssds-part-1-introduction-and-table-of-contents/). *codecapsule.com*, February 2014.
[^34]: Jack Vanlightly. [Is sequential IO dead in the era of the NVMe drive?](https://jack-vanlightly.com/blog/2023/5/9/is-sequential-io-dead-in-the-era-of-the-nvme-drive) *jack-vanlightly.com*, May 2023. Archived at [perma.cc/7TMZ-TAPU](https://perma.cc/7TMZ-TAPU)
[^35]: Alibaba Cloud Storage Team. [Storage System Design Analysis: Factors Affecting NVMe SSD Performance (2)](https://www.alibabacloud.com/blog/594376). *alibabacloud.com*, January 2019. Archived at [archive.org](https://web.archive.org/web/20230510065132/https%3A//www.alibabacloud.com/blog/594376)
[^36]: Xiao-Yu Hu and Robert Haas. [The Fundamental Limit of Flash Random Write Performance: Understanding, Analysis and Performance Modelling](https://dominoweb.draco.res.ibm.com/reports/rz3771.pdf). *dominoweb.draco.res.ibm.com*, March 2010. Archived at [perma.cc/8JUL-4ZDS](https://perma.cc/8JUL-4ZDS)
[^37]: Lanyue Lu, Thanumalayan Sankaranarayana Pillai, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [WiscKey: Separating Keys from Values in SSD-conscious Storage](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf). At *4th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
[^38]: Peter Zaitsev. [Innodb Double Write](https://www.percona.com/blog/innodb-double-write/). *percona.com*, August 2006. Archived at [perma.cc/NT4S-DK7T](https://perma.cc/NT4S-DK7T)
[^39]: Tomas Vondra. [On the Impact of Full-Page Writes](https://www.2ndquadrant.com/en/blog/on-the-impact-of-full-page-writes/). *2ndquadrant.com*, November 2016. Archived at [perma.cc/7N6B-CVL3](https://perma.cc/7N6B-CVL3)
[^40]: Mark Callaghan. [Read, write & space amplification - B-Tree vs LSM](https://smalldatum.blogspot.com/2015/11/read-write-space-amplification-b-tree.html). *smalldatum.blogspot.com*, November 2015. Archived at [perma.cc/S487-WK5P](https://perma.cc/S487-WK5P)
[^41]: Mark Callaghan. [Choosing Between Efficiency and Performance with RocksDB](https://codemesh.io/codemesh2016/mark-callaghan). At *Code Mesh*, November 2016. Video at [youtube.com/watch?v=tgzkgZVXKB4](https://www.youtube.com/watch?v=tgzkgZVXKB4)
[^42]: Subhadeep Sarkar, Tarikul Islam Papon, Dimitris Staratzis, Zichen Zhu, and Manos Athanassoulis. [Enabling Timely and Persistent Deletion in LSM-Engines](https://subhadeep.net/assets/fulltext/Enabling_Timely_and_Persistent_Deletion_in_LSM-Engines.pdf). *ACM Transactions on Database Systems*, volume 48, issue 3, article no. 8, August 2023. [doi:10.1145/3599724](https://doi.org/10.1145/3599724)
[^43]: Lukas Fittl. [Postgres vs. SQL Server: B-Tree Index Differences & the Benefit of Deduplication](https://pganalyze.com/blog/postgresql-vs-sql-server-btree-index-deduplication). *pganalyze.com*, April 2025. Archived at [perma.cc/XY6T-LTPX](https://perma.cc/XY6T-LTPX)
[^44]: Drew Silcock. [How Postgres stores data on disk – this one’s a page turner](https://drew.silcock.dev/blog/how-postgres-stores-data-on-disk/). *drew.silcock.dev*, August 2024. Archived at [perma.cc/8K7K-7VJ2](https://perma.cc/8K7K-7VJ2)
[^45]: Joe Webb. [Using Covering Indexes to Improve Query Performance](https://www.red-gate.com/simple-talk/databases/sql-server/learn/using-covering-indexes-to-improve-query-performance/). *simple-talk.com*, September 2008. Archived at [perma.cc/6MEZ-R5VR](https://perma.cc/6MEZ-R5VR)
[^46]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^47]: [VoltDB Technical Overview White Paper](https://www.voltactivedata.com/wp-content/uploads/2017/03/hv-white-paper-voltdb-technical-overview.pdf). VoltDB, 2017. Archived at [perma.cc/B9SF-SK5G](https://perma.cc/B9SF-SK5G)
[^48]: Stephen M. Rumble, Ankita Kejriwal, and John K. Ousterhout. [Log-Structured Memory for DRAM-Based Storage](https://www.usenix.org/system/files/conference/fast14/fast14-paper_rumble.pdf). At *12th USENIX Conference on File and Storage Technologies* (FAST), February 2014.
[^49]: Stavros Harizopoulos, Daniel J. Abadi, Samuel Madden, and Michael Stonebraker. [OLTP Through the Looking Glass, and What We Found There](https://hstore.cs.brown.edu/papers/hstore-lookingglass.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376713](https://doi.org/10.1145/1376616.1376713)
[^50]: Per-Åke Larson, Cipri Clinciu, Campbell Fraser, Eric N. Hanson, Mostafa Mokhtar, Michal Nowakiewicz, Vassilis Papadimos, Susan L. Price, Srikumar Rangarajan, Remus Rusanu, and Mayukh Saubhasik. [Enhancements to SQL Server Column Stores](https://web.archive.org/web/20131203001153id_/http%3A//research.microsoft.com/pubs/193599/Apollo3%20-%20Sigmod%202013%20-%20final.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2013. [doi:10.1145/2463676.2463708](https://doi.org/10.1145/2463676.2463708)
[^51]: Franz Färber, Norman May, Wolfgang Lehner, Philipp Große, Ingo Müller, Hannes Rauhe, and Jonathan Dees. [The SAP HANA Database – An Architecture Overview](https://web.archive.org/web/20220208081111id_/http%3A//sites.computer.org/debull/A12mar/hana.pdf). *IEEE Data Engineering Bulletin*, volume 35, issue 1, pages 28–33, March 2012.
[^52]: Michael Stonebraker. [The Traditional RDBMS Wisdom Is (Almost Certainly) All Wrong](https://slideshot.epfl.ch/talks/166). Presentation at *EPFL*, May 2013.
[^53]: Adam Prout, Szu-Po Wang, Joseph Victor, Zhou Sun, Yongzhu Li, Jack Chen, Evan Bergeron, Eric Hanson, Robert Walzer, Rodrigo Gomes, and Nikita Shamgunov. [Cloud-Native Transactions and Analytics in SingleStore](https://dl.acm.org/doi/pdf/10.1145/3514221.3526055). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526055](https://doi.org/10.1145/3514221.3526055)
[^54]: Tino Tereshko and Jordan Tigani. [BigQuery under the hood](https://cloud.google.com/blog/products/bigquery/bigquery-under-the-hood). *cloud.google.com*, January 2016. Archived at [perma.cc/WP2Y-FUCF](https://perma.cc/WP2Y-FUCF)
[^55]: Wes McKinney. [The Road to Composable Data Systems: Thoughts on the Last 15 Years and the Future](https://wesmckinney.com/blog/looking-back-15-years/). *wesmckinney.com*, September 2023. Archived at [perma.cc/6L2M-GTJX](https://perma.cc/6L2M-GTJX)
[^56]: Michael Stonebraker, Daniel J. Abadi, Adam Batkin, Xuedong Chen, Mitch Cherniack, Miguel Ferreira, Edmond Lau, Amerson Lin, Sam Madden, Elizabeth O’Neil, Pat O’Neil, Alex Rasin, Nga Tran, and Stan Zdonik. [C-Store: A Column-oriented DBMS](https://www.vldb.org/archives/website/2005/program/paper/thu/p553-stonebraker.pdf). At *31st International Conference on Very Large Data Bases* (VLDB), pages 553–564, September 2005.
[^57]: Julien Le Dem. [Dremel Made Simple with Parquet](https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html). *blog.twitter.com*, September 2013.
[^58]: Sergey Melnik, Andrey Gubarev, Jing Jing Long, Geoffrey Romer, Shiva Shivakumar, Matt Tolton, and Theo Vassilakis. [Dremel: Interactive Analysis of Web-Scale Datasets](https://vldb.org/pvldb/vol3/R29.pdf). At *36th International Conference on Very Large Data Bases* (VLDB), pages 330–339, September 2010. [doi:10.14778/1920841.1920886](https://doi.org/10.14778/1920841.1920886)
[^59]: Joe Kearney. [Understanding Record Shredding: storing nested data in columns](https://www.joekearney.co.uk/posts/understanding-record-shredding). *joekearney.co.uk*, December 2016. Archived at [perma.cc/ZD5N-AX5D](https://perma.cc/ZD5N-AX5D)
[^60]: Jamie Brandon. [A shallow survey of OLAP and HTAP query engines](https://www.scattered-thoughts.net/writing/a-shallow-survey-of-olap-and-htap-query-engines). *scattered-thoughts.net*, September 2023. Archived at [perma.cc/L3KH-J4JF](https://perma.cc/L3KH-J4JF)
[^61]: Benoit Dageville, Thierry Cruanes, Marcin Zukowski, Vadim Antonov, Artin Avanes, Jon Bock, Jonathan Claybaugh, Daniel Engovatov, Martin Hentschel, Jiansheng Huang, Allison W. Lee, Ashish Motivala, Abdul Q. Munir, Steven Pelley, Peter Povinec, Greg Rahn, Spyridon Triantafyllis, and Philipp Unterbrunner. [The Snowflake Elastic Data Warehouse](https://dl.acm.org/doi/pdf/10.1145/2882903.2903741). At *ACM International Conference on Management of Data* (SIGMOD), pages 215–226, June 2016. [doi:10.1145/2882903.2903741](https://doi.org/10.1145/2882903.2903741)
[^62]: Mark Raasveldt and Hannes Mühleisen. [Data Management for Data Science Towards Embedded Analytics](https://duckdb.org/pdf/CIDR2020-raasveldt-muehleisen-duckdb.pdf). At *10th Conference on Innovative Data Systems Research* (CIDR), January 2020.
[^63]: Jean-François Im, Kishore Gopalakrishna, Subbu Subramaniam, Mayank Shrivastava, Adwait Tumbde, Xiaotian Jiang, Jennifer Dai, Seunghyun Lee, Neha Pawar, Jialiang Li, and Ravi Aringunram. [Pinot: Realtime OLAP for 530 Million Users](https://cwiki.apache.org/confluence/download/attachments/103092375/Pinot.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 583–594, May 2018. [doi:10.1145/3183713.3190661](https://doi.org/10.1145/3183713.3190661)
[^64]: Fangjin Yang, Eric Tschetter, Xavier Léauté, Nelson Ray, Gian Merlino, and Deep Ganguli. [Druid: A Real-time Analytical Data Store](https://static.druid.io/docs/druid.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2014. [doi:10.1145/2588555.2595631](https://doi.org/10.1145/2588555.2595631)
[^65]: Chunwei Liu, Anna Pavlenko, Matteo Interlandi, and Brandon Haynes. [Deep Dive into Common Open Formats for Analytical DBMSs](https://www.vldb.org/pvldb/vol16/p3044-liu.pdf). *Proceedings of the VLDB Endowment*, volume 16, issue 11, pages 3044–3056, July 2023. [doi:10.14778/3611479.3611507](https://doi.org/10.14778/3611479.3611507)
[^66]: Xinyu Zeng, Yulong Hui, Jiahong Shen, Andrew Pavlo, Wes McKinney, and Huanchen Zhang. [An Empirical Evaluation of Columnar Storage Formats](https://www.vldb.org/pvldb/vol17/p148-zeng.pdf). *Proceedings of the VLDB Endowment*, volume 17, issue 2, pages 148–161. [doi:10.14778/3626292.3626298](https://doi.org/10.14778/3626292.3626298)
[^67]: Weston Pace. [Lance v2: A columnar container format for modern data](https://blog.lancedb.com/lance-v2/). *blog.lancedb.com*, April 2024. Archived at [perma.cc/ZK3Q-S9VJ](https://perma.cc/ZK3Q-S9VJ)
[^68]: Yoav Helfman. [Nimble, A New Columnar File Format](https://www.youtube.com/watch?v=bISBNVtXZ6M). At *VeloxCon*, April 2024.
[^69]: Wes McKinney. [Apache Arrow: High-Performance Columnar Data Framework](https://www.youtube.com/watch?v=YhF8YR0OEFk). At *CMU Database Group – Vaccination Database Tech Talks*, December 2021.
[^70]: Wes McKinney. [Python for Data Analysis, 3rd Edition](https://learning.oreilly.com/library/view/python-for-data/9781098104023/). O’Reilly Media, August 2022. ISBN: 9781098104023
[^71]: Paul Dix. [The Design of InfluxDB IOx: An In-Memory Columnar Database Written in Rust with Apache Arrow](https://www.youtube.com/watch?v=_zbwz-4RDXg). At *CMU Database Group – Vaccination Database Tech Talks*, May 2021.
[^72]: Carlota Soto and Mike Freedman. [Building Columnar Compression for Large PostgreSQL Databases](https://www.timescale.com/blog/building-columnar-compression-in-a-row-oriented-database/). *timescale.com*, March 2024. Archived at [perma.cc/7KTF-V3EH](https://perma.cc/7KTF-V3EH)
[^73]: Daniel Lemire, Gregory Ssi‐Yan‐Kai, and Owen Kaser. [Consistently faster and smaller compressed bitmaps with Roaring](https://arxiv.org/pdf/1603.06549). *Software: Practice and Experience*, volume 46, issue 11, pages 1547–1569, November 2016. [doi:10.1002/spe.2402](https://doi.org/10.1002/spe.2402)
[^74]: Jaz Volpert. [An entire Social Network in 1.6GB (GraphD Part 2)](https://jazco.dev/2024/04/20/roaring-bitmaps/). *jazco.dev*, April 2024. Archived at [perma.cc/L27Z-QVMG](https://perma.cc/L27Z-QVMG)
[^75]: Daniel J. Abadi, Peter Boncz, Stavros Harizopoulos, Stratos Idreos, and Samuel Madden. [The Design and Implementation of Modern Column-Oriented Database Systems](https://www.cs.umd.edu/~abadi/papers/abadi-column-stores.pdf). *Foundations and Trends in Databases*, volume 5, issue 3, pages 197–280, December 2013. [doi:10.1561/1900000024](https://doi.org/10.1561/1900000024)
[^76]: Andrew Lamb, Matt Fuller, Ramakrishna Varadarajan, Nga Tran, Ben Vandiver, Lyric Doshi, and Chuck Bear. [The Vertica Analytic Database: C-Store 7 Years Later](https://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf). *Proceedings of the VLDB Endowment*, volume 5, issue 12, pages 1790–1801, August 2012. [doi:10.14778/2367502.2367518](https://doi.org/10.14778/2367502.2367518)
[^77]: Timo Kersten, Viktor Leis, Alfons Kemper, Thomas Neumann, Andrew Pavlo, and Peter Boncz. [Everything You Always Wanted to Know About Compiled and Vectorized Queries But Were Afraid to Ask](https://www.vldb.org/pvldb/vol11/p2209-kersten.pdf). *Proceedings of the VLDB Endowment*, volume 11, issue 13, pages 2209–2222, September 2018. [doi:10.14778/3275366.3284966](https://doi.org/10.14778/3275366.3284966)
[^78]: Forrest Smith. [Memory Bandwidth Napkin Math](https://www.forrestthewoods.com/blog/memory-bandwidth-napkin-math/). *forrestthewoods.com*, February 2020. Archived at [perma.cc/Y8U4-PS7N](https://perma.cc/Y8U4-PS7N)
[^79]: Peter Boncz, Marcin Zukowski, and Niels Nes. [MonetDB/X100: Hyper-Pipelining Query Execution](https://www.cidrdb.org/cidr2005/papers/P19.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005.
[^80]: Jingren Zhou and Kenneth A. Ross. [Implementing Database Operations Using SIMD Instructions](https://www1.cs.columbia.edu/~kar/pubsk/simd.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 145–156, June 2002. [doi:10.1145/564691.564709](https://doi.org/10.1145/564691.564709)
[^81]: Kevin Bartley. [OLTP Queries: Transfer Expensive Workloads to Materialize](https://materialize.com/blog/oltp-queries/). *materialize.com*, August 2024. Archived at [perma.cc/4TYM-TYD8](https://perma.cc/4TYM-TYD8)
[^82]: Jim Gray, Surajit Chaudhuri, Adam Bosworth, Andrew Layman, Don Reichart, Murali Venkatrao, Frank Pellow, and Hamid Pirahesh. [Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals](https://arxiv.org/pdf/cs/0701155). *Data Mining and Knowledge Discovery*, volume 1, issue 1, pages 29–53, March 2007. [doi:10.1023/A:1009726021843](https://doi.org/10.1023/A%3A1009726021843)
[^83]: Frank Ramsak, Volker Markl, Robert Fenk, Martin Zirkel, Klaus Elhardt, and Rudolf Bayer. [Integrating the UB-Tree into a Database System Kernel](https://www.vldb.org/conf/2000/P263.pdf). At *26th International Conference on Very Large Data Bases* (VLDB), September 2000.
[^84]: Octavian Procopiuc, Pankaj K. Agarwal, Lars Arge, and Jeffrey Scott Vitter. [Bkd-Tree: A Dynamic Scalable kd-Tree](https://users.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf). At *8th International Symposium on Spatial and Temporal Databases* (SSTD), pages 46–65, July 2003. [doi:10.1007/978-3-540-45072-6\_4](https://doi.org/10.1007/978-3-540-45072-6_4)
[^85]: Joseph M. Hellerstein, Jeffrey F. Naughton, and Avi Pfeffer. [Generalized Search Trees for Database Systems](https://dsf.berkeley.edu/papers/vldb95-gist.pdf). At *21st International Conference on Very Large Data Bases* (VLDB), September 1995.
[^86]: Isaac Brodsky. [H3: Uber’s Hexagonal Hierarchical Spatial Index](https://eng.uber.com/h3/). *eng.uber.com*, June 2018. Archived at [archive.org](https://web.archive.org/web/20240722003854/https%3A//www.uber.com/blog/h3/)
[^87]: Robert Escriva, Bernard Wong, and Emin Gün Sirer. [HyperDex: A Distributed, Searchable Key-Value Store](https://www.cs.princeton.edu/courses/archive/fall13/cos518/papers/hyperdex.pdf). At *ACM SIGCOMM Conference*, August 2012. [doi:10.1145/2377677.2377681](https://doi.org/10.1145/2377677.2377681)
[^88]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/)
[^89]: Jianguo Wang, Chunbin Lin, Yannis Papakonstantinou, and Steven Swanson. [An Experimental Study of Bitmap Compression vs. Inverted List Compression](https://cseweb.ucsd.edu/~swanson/papers/SIGMOD2017-ListCompression.pdf). At *ACM International Conference on Management of Data* (SIGMOD), pages 993–1008, May 2017. [doi:10.1145/3035918.3064007](https://doi.org/10.1145/3035918.3064007)
[^90]: Adrien Grand. [What is in a Lucene Index?](https://speakerdeck.com/elasticsearch/what-is-in-a-lucene-index) At *Lucene/Solr Revolution*, November 2013. Archived at [perma.cc/Z7QN-GBYY](https://perma.cc/Z7QN-GBYY)
[^91]: Michael McCandless. [Visualizing Lucene’s Segment Merges](https://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html). *blog.mikemccandless.com*, February 2011. Archived at [perma.cc/3ZV8-72W6](https://perma.cc/3ZV8-72W6)
[^92]: Lukas Fittl. [Understanding Postgres GIN Indexes: The Good and the Bad](https://pganalyze.com/blog/gin-index). *pganalyze.com*, December 2021. Archived at [perma.cc/V3MW-26H6](https://perma.cc/V3MW-26H6)
[^93]: Jimmy Angelakos. [The State of (Full) Text Search in PostgreSQL 12](https://www.youtube.com/watch?v=c8IrUHV70KQ). At *FOSDEM*, February 2020. Archived at [perma.cc/J6US-3WZS](https://perma.cc/J6US-3WZS)
[^94]: Alexander Korotkov. [Index support for regular expression search](https://wiki.postgresql.org/images/6/6c/Index_support_for_regular_expression_search.pdf). At *PGConf.EU Prague*, October 2012. Archived at [perma.cc/5RFZ-ZKDQ](https://perma.cc/5RFZ-ZKDQ)
[^95]: Michael McCandless. [Lucene’s FuzzyQuery Is 100 Times Faster in 4.0](https://blog.mikemccandless.com/2011/03/lucenes-fuzzyquery-is-100-times-faster.html). *blog.mikemccandless.com*, March 2011. Archived at [perma.cc/E2WC-GHTW](https://perma.cc/E2WC-GHTW)
[^96]: Steffen Heinz, Justin Zobel, and Hugh E. Williams. [Burst Tries: A Fast, Efficient Data Structure for String Keys](https://web.archive.org/web/20130903070248id_/http%3A//ww2.cs.mu.oz.au%3A80/~jz/fulltext/acmtois02.pdf). *ACM Transactions on Information Systems*, volume 20, issue 2, pages 192–223, April 2002. [doi:10.1145/506309.506312](https://doi.org/10.1145/506309.506312)
[^97]: Klaus U. Schulz and Stoyan Mihov. [Fast String Correction with Levenshtein Automata](https://dmice.ohsu.edu/bedricks/courses/cs655/pdf/readings/2002_Schulz.pdf). *International Journal on Document Analysis and Recognition*, volume 5, issue 1, pages 67–85, November 2002. [doi:10.1007/s10032-002-0082-8](https://doi.org/10.1007/s10032-002-0082-8)
[^98]: Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781). At *International Conference on Learning Representations* (ICLR), May 2013. [doi:10.48550/arXiv.1301.3781](https://doi.org/10.48550/arXiv.1301.3781)
[^99]: Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805). At *Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies*, volume 1, pages 4171–4186, June 2019. [doi:10.18653/v1/N19-1423](https://doi.org/10.18653/v1/N19-1423)
[^100]: Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. [Improving Language Understanding by Generative Pre-Training](https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf). *openai.com*, June 2018. Archived at [perma.cc/5N3C-DJ4C](https://perma.cc/5N3C-DJ4C)
[^101]: Matthijs Douze, Maria Lomeli, and Lucas Hosseini. [Faiss indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). *github.com*, August 2024. Archived at [perma.cc/2EWG-FPBS](https://perma.cc/2EWG-FPBS)
[^102]: Varik Matevosyan. [Understanding pgvector’s HNSW Index Storage in Postgres](https://lantern.dev/blog/pgvector-storage). *lantern.dev*, August 2024. Archived at [perma.cc/B2YB-JB59](https://perma.cc/B2YB-JB59)
[^103]: Dmitry Baranchuk, Artem Babenko, and Yury Malkov. [Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](https://arxiv.org/pdf/1802.02422). At *European Conference on Computer Vision* (ECCV), pages 202–216, September 2018. [doi:10.1007/978-3-030-01258-8\_13](https://doi.org/10.1007/978-3-030-01258-8_13)
[^104]: Yury A. Malkov and Dmitry A. Yashunin. [Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs](https://arxiv.org/pdf/1603.09320). *IEEE Transactions on Pattern Analysis and Machine Intelligence*, volume 42, issue 4, pages 824–836, April 2020. [doi:10.1109/TPAMI.2018.2889473](https://doi.org/10.1109/TPAMI.2018.2889473) 


================================================
FILE: content/zh/ch5.md
================================================
---
title: "5. 编码与演化"
weight: 105
math: true
breadcrumbs: false
---

<a id="ch_encoding"></a>

![](/map/ch04.png)

> *万物流转，无物常驻。*
>
> 赫拉克利特，引自柏拉图《克拉提鲁斯》（公元前 360 年）

应用程序不可避免地会随时间而变化。随着新产品的推出、用户需求被更深入地理解，或者业务环境发生变化，功能会被添加或修改。在 [第 2 章](/ch2#ch_nonfunctional) 中，我们介绍了 *可演化性* 的概念：我们应该致力于构建易于适应变化的系统（参见 ["可演化性：让变更更容易"](/ch2#sec_introduction_evolvability)）。

在大多数情况下，应用程序功能的变更也需要其存储数据的变更：可能需要捕获新的字段或记录类型，或者现有数据需要以新的方式呈现。

我们在 [第 3 章](/ch3#ch_datamodels) 中讨论的数据模型有不同的方式来应对这种变化。关系数据库通常假定数据库中的所有数据都遵循一个模式：尽管该模式可以更改（通过模式迁移；即 `ALTER` 语句），但在任何一个时间点只有一个模式生效。相比之下，读时模式（"无模式"）数据库不强制执行模式，因此数据库可以包含在不同时间写入的新旧数据格式的混合（参见 ["文档模型中的模式灵活性"](/ch3#sec_datamodels_schema_flexibility)）。

当数据格式或模式发生变化时，通常需要对应用程序代码进行相应的更改（例如，你向记录添加了一个新字段，应用程序代码开始读写该字段）。然而，在大型应用程序中，代码更改通常无法立即完成：

* 对于服务端应用程序，你可能希望执行 *滚动升级*（也称为 *阶段发布*），每次将新版本部署到几个节点，检查新版本是否运行顺利，然后逐步在所有节点上部署。这允许在不中断服务的情况下部署新版本，从而鼓励更频繁的发布和更好的可演化性。
* 对于客户端应用程序，你要看用户的意愿，他们可能很长时间都不安装更新。

这意味着新旧版本的代码，以及新旧数据格式，可能会同时在系统中共存。为了使系统继续平稳运行，我们需要在两个方向上保持兼容性：

向后兼容性
: 较新的代码可以读取由较旧代码写入的数据。

向前兼容性
: 较旧的代码可以读取由较新代码写入的数据。

向后兼容性通常不难实现：作为新代码的作者，你知道旧代码写入的数据格式，因此可以显式地处理它（如有必要，只需保留旧代码来读取旧数据）。向前兼容性可能更棘手，因为它需要旧代码忽略新版本代码添加的部分。

向前兼容性的另一个挑战如 [图 5-1](#fig_encoding_preserve_field) 所示。假设你向记录模式添加了一个字段，新代码创建了包含该新字段的记录并将其存储在数据库中。随后，旧版本的代码（尚不知道新字段）读取记录，更新它，然后写回。在这种情况下，理想的行为通常是旧代码保持新字段不变，即使它无法解释。但是，如果记录被解码为不显式保留未知字段的模型对象，数据可能会丢失，如 [图 5-1](#fig_encoding_preserve_field) 所示。

{{< figure src="/fig/ddia_0501.png" id="fig_encoding_preserve_field" caption="图 5-1. 当旧版本的应用程序更新之前由新版本应用程序写入的数据时，如果不小心，数据可能会丢失。" class="w-full my-4" >}}

在本章中，我们将研究几种编码数据的格式，包括 JSON、XML、Protocol Buffers 和 Avro。特别是，我们将研究它们如何处理模式变化，以及它们如何支持新旧数据和代码需要共存的系统。然后我们将讨论这些格式如何用于数据存储和通信：在数据库、Web 服务、REST API、远程过程调用（RPC）、工作流引擎以及事件驱动系统（如 actor 和消息队列）中。

## 编码数据的格式 {#sec_encoding_formats}

程序通常以（至少）两种不同的表示形式处理数据：

1. 在内存中，数据保存在对象、结构体、列表、数组、哈希表、树等中。这些数据结构针对 CPU 的高效访问和操作进行了优化（通常使用指针）。
2. 当你想要将数据写入文件或通过网络发送时，必须将其编码为某种自包含的字节序列（例如，JSON 文档）。由于指针对任何其他进程都没有意义，因此这种字节序列表示通常与内存中常用的数据结构看起来截然不同。

因此，我们需要在两种表示之间进行某种转换。从内存表示到字节序列的转换称为 *编码*（也称为 *序列化* 或 *编组*），反向过程称为 *解码*（*解析*、*反序列化*、*反编组*）。

--------

> [!TIP] 术语冲突
>
> *序列化* 这个术语不幸地也用于事务的上下文中（参见 [第 8 章](/ch8#ch_transactions)），具有完全不同的含义。为了避免词义重载，本书中我们将坚持使用 *编码*，尽管 *序列化* 可能是更常见的术语。

--------

也有例外情况不需要编码/解码——例如，当数据库直接对从磁盘加载的压缩数据进行操作时，如 ["查询执行：编译与向量化"](/ch4#sec_storage_vectorized) 中所讨论的。还有一些 *零拷贝* 数据格式，旨在在运行时和磁盘/网络上都可以使用，无需显式转换步骤，例如 Cap'n Proto 和 FlatBuffers。

然而，大多数系统需要在内存对象和平面字节序列之间进行转换。由于这是一个如此常见的问题，有无数不同的库和编码格式可供选择。让我们简要概述一下。

### 特定语言的格式 {#id96}

许多编程语言都内置了将内存对象编码为字节序列的支持。例如，Java 有 `java.io.Serializable`，Python 有 `pickle`，Ruby 有 `Marshal`，等等。许多第三方库也存在，例如 Java 的 Kryo。

这些编码库非常方便，因为它们允许用最少的额外代码保存和恢复内存对象。然而，它们也有许多深层次的问题：

* 编码通常与特定编程语言绑定，在另一种语言中读取会非常困难。如果你以这种编码存储或传输数据，就等于在相当长时间内把自己绑定在当前编程语言上，也排除了与其他组织（可能使用不同语言）的系统集成。
* 为了以相同的对象类型恢复数据，解码过程需要能够实例化任意类。这经常是安全问题的来源 [^1]：如果攻击者可以让你的应用程序解码任意字节序列，他们可以实例化任意类，这反过来通常允许他们做可怕的事情，例如远程执行任意代码 [^2] [^3]。
* 在这些库中，数据版本控制通常是事后考虑的：由于它们旨在快速轻松地编码数据，因此它们经常忽略向前和向后兼容性的不便问题 [^4]。
* 效率（编码或解码所需的 CPU 时间以及编码结构的大小）通常也是事后考虑的。例如，Java 的内置序列化因其糟糕的性能和臃肿的编码而臭名昭著 [^5]。

由于这些原因，除了非常临时的目的外，使用语言的内置编码通常是个坏主意。

### JSON、XML 及其二进制变体 {#sec_encoding_json}

当转向可以由许多编程语言编写和读取的标准化编码时，JSON 和 XML 是显而易见的竞争者。它们广为人知，广受支持，也几乎同样广受诟病。XML 经常因过于冗长和不必要的复杂而受到批评 [^6]。JSON 的流行主要是由于它在 Web 浏览器中的内置支持以及相对于 XML 的简单性。CSV 是另一种流行的与语言无关的格式，但它只支持表格数据而不支持嵌套。

JSON、XML 和 CSV 是文本格式，因此在某种程度上是人类可读的（尽管语法是一个热门的争论话题）。除了表面的语法问题之外，它们还有一些微妙的问题：

* 数字的编码有很多歧义。在 XML 和 CSV 中，你无法区分数字和恰好由数字组成的字符串（除非引用外部模式）。JSON 区分字符串和数字，但它不区分整数和浮点数，也不指定精度。

  这在处理大数字时是一个问题；例如，大于 2⁵³ 的整数无法在 IEEE 754 双精度浮点数中精确表示，因此在使用浮点数的语言（如 JavaScript）中解析时，此类数字会变得不准确 [^7]。大于 2⁵³ 的数字的一个例子出现在 X（前身为 Twitter）上，它使用 64 位数字来识别每个帖子。API 返回的 JSON 中包含两次帖子 ID，一次作为 JSON 数字，一次作为十进制字符串，以规避 JavaScript 应用程序无法正确解析此类数字的问题 [^8]。
* JSON 和 XML 对 Unicode 字符串（即人类可读文本）有很好的支持，但它们不支持二进制字符串（没有字符编码的字节序列）。二进制字符串是一个有用的功能，因此人们通过使用 Base64 将二进制数据编码为文本来绕过这个限制。然后模式用于指示该值应被解释为 Base64 编码。这虽然有效，但有点取巧，并且会将数据大小增加 33%。
* XML 模式和 JSON 模式功能强大，因此学习和实现起来相当复杂。由于数据的正确解释（如数字和二进制字符串）取决于模式中的信息，不使用 XML/JSON 模式的应用程序可能需要硬编码相应的编码/解码逻辑。
* CSV 没有任何模式，因此应用程序需要定义每行和每列的含义。如果应用程序更改添加了新行或列，你必须手动处理该更改。CSV 也是一种相当模糊的格式（如果值包含逗号或换行符会发生什么？）。尽管其转义规则已被正式指定 [^9]，但并非所有解析器都正确实现它们。

尽管存在这些缺陷，JSON、XML 和 CSV 对许多目的来说已经足够好了。它们可能会继续流行，特别是作为数据交换格式（即从一个组织向另一个组织发送数据）。在这些情况下，只要人们就格式达成一致，格式有多漂亮或高效通常并不重要。让不同组织就 *任何事情* 达成一致的困难超过了大多数其他问题。

#### JSON 模式 {#json-schema}

JSON 模式已被广泛采用，作为系统间交换或写入存储时对数据建模的一种方式。你会在 Web 服务中找到 JSON 模式（参见 ["Web 服务"](#sec_web_services)）作为 OpenAPI Web 服务规范的一部分，在模式注册表中如 Confluent 的 Schema Registry 和 Red Hat 的 Apicurio Registry，以及在数据库中如 PostgreSQL 的 pg_jsonschema 验证器扩展和 MongoDB 的 `$jsonSchema` 验证器语法。

JSON 模式规范提供了许多功能。模式包括标准原始类型，包括字符串、数字、整数、对象、数组、布尔值或空值。但 JSON 模式还提供了一个单独的验证规范，允许开发人员在字段上叠加约束。例如，`port` 字段可能具有最小值 1 和最大值 65535。

JSON 模式可以具有开放或封闭的内容模型。开放内容模型允许模式中未定义的任何字段以任何数据类型存在，而封闭内容模型只允许显式定义的字段。JSON 模式中的开放内容模型在 `additionalProperties` 设置为 `true` 时启用，这是默认值。因此，JSON 模式通常是对 *不允许* 内容的定义（即，任何已定义字段上的无效值），而不是对模式中 *允许* 内容的定义。

开放内容模型功能强大，但可能很复杂。例如，假设你想定义一个从整数（如 ID）到字符串的映射。JSON 没有映射或字典类型，只有一个可以包含字符串键和任何类型值的"对象"类型。然后，你可以使用 JSON 模式约束此类型，使键只能包含数字，值只能是字符串，使用 `patternProperties` 和 `additionalProperties`，如 [示例 5-1](#fig_encoding_json_schema) 所示。


{{< figure id="fig_encoding_json_schema" title="示例 5-1. 具有整数键和字符串值的示例 JSON 模式。整数键表示为仅包含整数的字符串，因为 JSON 模式要求所有键都是字符串。" class="w-full my-4" >}}

```json
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "patternProperties": {
        "^[0-9]+$": {
            "type": "string"
        }
    },
    "additionalProperties": false
}
```

除了开放和封闭内容模型以及验证器之外，JSON 模式还支持条件 if/else 模式逻辑、命名类型、对远程模式的引用等等。所有这些都构成了一种非常强大的模式语言。这些功能也使定义变得笨重。解析远程模式、推理条件规则或以向前或向后兼容的方式演化模式可能具有挑战性 [^10]。类似的问题也适用于 XML 模式 [^11]。

#### 二进制编码 {#binary-encoding}

JSON 比 XML 更简洁，但与二进制格式相比，两者仍然使用大量空间。这一观察导致了大量 JSON 二进制编码（MessagePack、CBOR、BSON、BJSON、UBJSON、BISON、Hessian 和 Smile 等等）和 XML 二进制编码（例如 WBXML 和 Fast Infoset）的发展。这些格式已在各种利基市场中被采用，因为它们更紧凑，有时解析速度更快，但它们都没有像 JSON 和 XML 的文本版本那样被广泛采用 [^12]。

其中一些格式扩展了数据类型集（例如，区分整数和浮点数，或添加对二进制字符串的支持），但除此之外，它们保持 JSON/XML 数据模型不变。特别是，由于它们不规定模式，因此需要在编码数据中包含所有对象字段名称。也就是说，在 [示例 5-2](#fig_encoding_json) 中的 JSON 文档的二进制编码中，它们需要在某处包含字符串 `userName`、`favoriteNumber` 和 `interests`。

{{< figure id="fig_encoding_json" title="示例 5-2. 本章中我们将以几种二进制格式编码的示例记录" class="w-full my-4" >}}

```json
{
    "userName": "Martin",
    "favoriteNumber": 1337,
    "interests": ["daydreaming", "hacking"]
}
```

让我们看一个 MessagePack 的例子，它是 JSON 的二进制编码。[图 5-2](#fig_encoding_messagepack) 显示了如果你使用 MessagePack 编码 [示例 5-2](#fig_encoding_json) 中的 JSON 文档所得到的字节序列。前几个字节如下：

1. 第一个字节 `0x83` 表示接下来是一个对象（前四位 = `0x80`），有三个字段（后四位 = `0x03`）。（如果你想知道如果对象有超过 15 个字段会发生什么，以至于字段数无法装入四位，那么它会获得不同的类型指示符，字段数会以两个或四个字节编码。）
2. 第二个字节 `0xa8` 表示接下来是一个字符串（前四位 = `0xa0`），长度为八个字节（后四位 = `0x08`）。
3. 接下来的八个字节是 ASCII 格式的字段名 `userName`。由于之前已经指示了长度，因此不需要任何标记来告诉我们字符串在哪里结束（或任何转义）。
4. 接下来的七个字节使用前缀 `0xa6` 编码六个字母的字符串值 `Martin`，依此类推。

二进制编码长度为 66 字节，仅比文本 JSON 编码（去除空格后）占用的 81 字节少一点。所有 JSON 的二进制编码在这方面都是相似的。目前尚不清楚这种小的空间减少（以及可能的解析速度提升）是否值得失去人类可读性。

在接下来的部分中，我们将看到如何做得更好，将相同的记录编码为仅 32 字节。

{{< figure link="#fig_encoding_json" src="/fig/ddia_0502.png" id="fig_encoding_messagepack" caption="图 5-2. 使用 MessagePack 编码的示例记录（示例 5-2）。" class="w-full my-4" >}}


### Protocol Buffers {#sec_encoding_protobuf}

Protocol Buffers (protobuf) 是 Google 开发的二进制编码库。它类似于 Apache Thrift，后者最初由 Facebook 开发 [^13]；本节关于 Protocol Buffers 的大部分内容也适用于 Thrift。

Protocol Buffers 需要为任何编码的数据提供模式。要在 Protocol Buffers 中编码 [示例 5-2](#fig_encoding_json) 中的数据，你需要像这样在 Protocol Buffers 接口定义语言（IDL）中描述模式：

```protobuf
syntax = "proto3";

message Person {
    string user_name = 1;
    int64 favorite_number = 2;
    repeated string interests = 3;
}
```

Protocol Buffers 附带了一个代码生成工具，它接受像这里显示的模式定义，并生成以各种编程语言实现该模式的类。你的应用程序代码可以调用此生成的代码来编码或解码模式的记录。使用 Protocol Buffers 编码器编码 [示例 5-2](#fig_encoding_json) 需要 33 字节，如 [图 5-3](#fig_encoding_protobuf) 所示 [^14]。

{{< figure src="/fig/ddia_0503.png" id="fig_encoding_protobuf" caption="图 5-3. 使用 Protocol Buffers 编码的示例记录。" class="w-full my-4" >}}


与 [图 5-2](#fig_encoding_messagepack) 类似，每个字段都有一个类型注释（指示它是字符串、整数等）以及必要时的长度指示（例如字符串的长度）。数据中出现的字符串（"Martin"、"daydreaming"、"hacking"）也编码为 ASCII（准确地说是 UTF-8），与之前类似。

与 [图 5-2](#fig_encoding_messagepack) 相比的最大区别是没有字段名（`userName`、`favoriteNumber`、`interests`）。相反，编码数据包含 *字段标签*，即数字（`1`、`2` 和 `3`）。这些是模式定义中出现的数字。字段标签就像字段的别名——它们是说明我们正在谈论哪个字段的紧凑方式，而无需拼写字段名。

如你所见，Protocol Buffers 通过将字段类型和标签号打包到单个字节中来节省更多空间。它使用可变长度整数：数字 1337 编码为两个字节，每个字节的最高位用于指示是否还有更多字节要来。这意味着 -64 到 63 之间的数字以一个字节编码，-8192 到 8191 之间的数字以两个字节编码，等等。更大的数字使用更多字节。

Protocol Buffers 没有显式的列表或数组数据类型。相反，`interests` 字段上的 `repeated` 修饰符表示该字段包含值列表，而不是单个值。在二进制编码中，列表元素只是简单地表示为同一记录中相同字段标签的重复出现。

#### 字段标签与模式演化 {#field-tags-and-schema-evolution}

我们之前说过，模式不可避免地需要随时间而变化。我们称之为 *模式演化*。Protocol Buffers 如何在保持向后和向前兼容性的同时处理模式更改？

从示例中可以看出，编码记录只是其编码字段的串联。每个字段由其标签号（示例模式中的数字 `1`、`2`、`3`）标识，并带有数据类型注释（例如字符串或整数）。如果未设置字段值，则它会从编码记录中省略。由此可以看出，字段标签对编码数据的含义至关重要。你可以更改模式中字段的名称，因为编码数据从不引用字段名，但你不能更改字段的标签，因为这会使所有现有的编码数据无效。

你可以向模式添加新字段，前提是你为每个字段提供新的标签号。如果旧代码（不知道你添加的新标签号）尝试读取由新代码写入的数据（包括具有它不识别的标签号的新字段），它可以简单地忽略该字段。数据类型注释允许解析器确定需要跳过多少字节，并保留未知字段以避免 [图 5-1](#fig_encoding_preserve_field) 中的问题。这保持了向前兼容性：旧代码可以读取由新代码编写的记录。

向后兼容性呢？只要每个字段都有唯一的标签号，新代码总是可以读取旧数据，因为标签号仍然具有相同的含义。如果在新模式中添加了字段，而你读取尚未包含该字段的旧数据，则它将填充默认值（例如，如果字段类型为字符串，则为空字符串；如果是数字，则为零）。

删除字段就像添加字段一样，向后和向前兼容性问题相反。你永远不能再次使用相同的标签号，因为你可能仍然有在某处写入的数据包含旧标签号，并且该字段必须被新代码忽略。可以在模式定义中保留过去使用的标签号，以确保它们不会被遗忘。

更改字段的数据类型呢？这在某些类型上是可能的——请查看文档了解详细信息——但存在值被截断的风险。例如，假设你将 32 位整数更改为 64 位整数。新代码可以轻松读取旧代码写入的数据，因为解析器可以用零填充任何缺失的位。但是，如果旧代码读取新代码写入的数据，则旧代码仍然使用 32 位变量来保存该值。如果解码的 64 位值无法装入 32 位，它将被截断。

### Avro {#sec_encoding_avro}

Apache Avro 是另一种二进制编码格式，与 Protocol Buffers 有着有趣的不同。它于 2009 年作为 Hadoop 的子项目启动，因为 Protocol Buffers 不太适合 Hadoop 的用例 [^15]。

Avro 也使用模式来指定正在编码的数据的结构。它有两种模式语言：一种（Avro IDL）用于人工编辑，另一种（基于 JSON）更容易被机器读取。与 Protocol Buffers 一样，此模式语言仅指定字段及其类型，而不像 JSON 模式那样指定复杂的验证规则。

我们的示例模式，用 Avro IDL 编写，可能如下所示：

```c
record Person {
    string                  userName;
    union { null, long }    favoriteNumber = null;
    array<string>           interests;
}
```

该模式的等效 JSON 表示如下：

```json
{
    "type": "record",
    "name": "Person",
    "fields": [
        {"name": "userName",        "type": "string"},
        {"name": "favoriteNumber",  "type": ["null", "long"], "default": null},
        {"name": "interests",       "type": {"type": "array", "items": "string"}}
    ]
}
```

首先，请注意模式中没有标签号。如果我们使用此模式编码示例记录（[示例 5-2](#fig_encoding_json)），Avro 二进制编码只有 32 字节长——是我们看到的所有编码中最紧凑的。编码字节序列的分解如 [图 5-4](#fig_encoding_avro) 所示。

如果你检查字节序列，你会发现没有任何东西来标识字段或其数据类型。编码只是由串联在一起的值组成。字符串只是一个长度前缀，后跟 UTF-8 字节，但编码数据中没有任何内容告诉你它是字符串。它也可能是整数，或完全是其他东西。整数使用可变长度编码进行编码。

{{< figure src="/fig/ddia_0504.png" id="fig_encoding_avro" caption="图 5-4. 使用 Avro 编码的示例记录。" class="w-full my-4" >}}


要解析二进制数据，你需要按照模式中出现的字段顺序进行遍历，并使用模式告诉你每个字段的数据类型。这意味着只有当读取数据的代码使用与写入数据的代码 *完全相同的模式* 时，二进制数据才能被正确解码。读取器和写入器之间的任何模式不匹配都意味着数据被错误解码。

那么，Avro 如何支持模式演化？

#### 写入者模式与读取者模式 {#the-writers-schema-and-the-readers-schema}

当应用程序想要编码一些数据（将其写入文件或数据库，通过网络发送等）时，它使用它知道的任何版本的模式对数据进行编码——例如，该模式可能被编译到应用程序中。这被称为 *写入者模式*。

当应用程序想要解码一些数据（从文件或数据库读取，从网络接收等）时，它使用两个模式：与用于编码相同的写入者模式，以及 *读取者模式*，后者可能不同。这在 [图 5-5](#fig_encoding_avro_schemas) 中说明。读取者模式定义了应用程序代码期望的每条记录的字段及其类型。

{{< figure src="/fig/ddia_0505.png" id="fig_encoding_avro_schemas" caption="图 5-5. 在 Protocol Buffers 中，编码和解码可以使用不同版本的模式。在 Avro 中，解码使用两个模式：写入者模式必须与用于编码的模式相同，但读取者模式可以是较旧或较新的版本。" class="w-full my-4" >}}

如果读取者模式和写入者模式相同，解码很容易。如果它们不同，Avro 通过并排查看写入者模式和读取者模式并将数据从写入者模式转换为读取者模式来解决差异。Avro 规范 [^16] [^17] 准确定义了此解析的工作方式，并在 [图 5-6](#fig_encoding_avro_resolution) 中进行了说明。

例如，如果写入者模式和读取者模式的字段顺序不同，这没有问题，因为模式解析通过字段名匹配字段。如果读取数据的代码遇到出现在写入者模式中但不在读取者模式中的字段，它将被忽略。如果读取数据的代码期望某个字段，但写入者模式不包含该名称的字段，则使用读取者模式中声明的默认值填充它。

{{< figure src="/fig/ddia_0506.png" id="fig_encoding_avro_resolution" caption="图 5-6. Avro 读取器解决写入者模式和读取者模式之间的差异。" class="w-full my-4" >}}

#### 模式演化规则 {#schema-evolution-rules}

使用 Avro，向前兼容性意味着你可以将新版本的模式作为写入者，将旧版本的模式作为读取者。相反，向后兼容性意味着你可以将新版本的模式作为读取者，将旧版本作为写入者。

为了保持兼容性，你只能添加或删除具有默认值的字段。（我们的 Avro 模式中的 `favoriteNumber` 字段的默认值为 `null`。）例如，假设你添加了一个具有默认值的字段，因此这个新字段存在于新模式中但不在旧模式中。当使用新模式的读取者读取使用旧模式编写的记录时，将为缺失的字段填充默认值。

如果你要添加一个没有默认值的字段，新读取者将无法读取旧写入者写入的数据，因此你会破坏向后兼容性。如果你要删除一个没有默认值的字段，旧读取者将无法读取新写入者写入的数据，因此你会破坏向前兼容性。

在某些编程语言中，`null` 是任何变量的可接受默认值，但在 Avro 中不是这样：如果你想允许字段为 null，你必须使用 *联合类型*。例如，`union { null, long, string } field;` 表示 `field` 可以是数字、字符串或 null。只有当 `null` 是联合的第一个分支时，你才能将其用作默认值。这比默认情况下一切都可为空更冗长一些，但它通过明确什么可以和不能为 null 来帮助防止错误 [^18]。

更改字段的数据类型是可能的，前提是 Avro 可以转换该类型。更改字段的名称是可能的，但有点棘手：读取者模式可以包含字段名的别名，因此它可以将旧写入者的模式字段名与别名匹配。这意味着更改字段名是向后兼容的，但不是向前兼容的。同样，向联合类型添加分支是向后兼容的，但不是向前兼容的。

#### 但什么是写入者模式？ {#but-what-is-the-writers-schema}

到目前为止，我们忽略了一个重要问题：读取者如何知道特定数据是用哪个写入者模式编码的？我们不能只在每条记录中包含整个模式，因为模式可能比编码数据大得多，使二进制编码节省的所有空间都白费了。

答案取决于 Avro 的使用环境。举几个例子：

包含大量记录的大文件
: Avro 的一个常见用途是存储包含数百万条记录的大文件，所有记录都使用相同的模式编码。（我们将在 [第 11 章](/ch11#ch_batch) 讨论这种情况。）在这种情况下，该文件的写入者可以在文件开头只包含一次写入者模式。Avro 指定了一种文件格式（对象容器文件）来执行此操作。

具有单独写入记录的数据库
: 在数据库中，不同的记录可能在不同的时间点使用不同的写入者模式编写——你不能假定所有记录都具有相同的模式。最简单的解决方案是在每个编码记录的开头包含一个版本号，并在数据库中保留模式版本列表。读取者可以获取记录，提取版本号，然后从数据库中获取该版本号的写入者模式。使用该写入者模式，它可以解码记录的其余部分。

  例如，Apache Kafka 的 Confluent 模式注册表 [^19] 和 LinkedIn 的 Espresso [^20] 就是这样工作的。

通过网络连接发送记录
: 当两个进程通过双向网络连接进行通信时，它们可以在连接设置时协商模式版本，然后在连接的生命周期内使用该模式。Avro RPC 协议（参见 ["流经服务的数据流：REST 与 RPC"](#sec_encoding_dataflow_rpc)）就是这样工作的。

无论如何，模式版本数据库都是有用的，因为它充当文档并让你有机会检查模式兼容性 [^21]。作为版本号，你可以使用简单的递增整数，或者可以使用模式的哈希值。

#### 动态生成的模式 {#dynamically-generated-schemas}

与 Protocol Buffers 相比，Avro 方法的一个优点是模式不包含任何标签号。但为什么这很重要？在模式中保留几个数字有什么问题？

区别在于 Avro 对 *动态生成* 的模式更友好。例如，假设你有一个关系数据库，其内容你想要转储到文件中，并且你想要使用二进制格式来避免前面提到的文本格式（JSON、CSV、XML）的问题。如果你使用 Avro，你可以相当容易地从关系模式生成 Avro 模式（我们之前看到的 JSON 表示），并使用该模式对数据库内容进行编码，将其全部转储到 Avro 对象容器文件中 [^22]。你可以为每个数据库表生成记录模式，每列成为该记录中的一个字段。数据库中的列名映射到 Avro 中的字段名。

现在，如果数据库模式发生变化（例如，表添加了一列并删除了一列），你可以从更新的数据库模式生成新的 Avro 模式，并以新的 Avro 模式导出数据。数据导出过程不需要关注模式更改——它可以在每次运行时简单地进行模式转换。读取新数据文件的任何人都会看到记录的字段已更改，但由于字段是按名称标识的，因此更新的写入者模式仍然可以与旧的读取者模式匹配。

相比之下，如果你为此目的使用 Protocol Buffers，字段标签可能必须手动分配：每次数据库模式更改时，管理员都必须手动更新从数据库列名到字段标签的映射。（这可能是可以自动化的，但模式生成器必须非常小心，不要分配以前使用过的字段标签。）这种动态生成的模式根本不是 Protocol Buffers 的设计目标，而 Avro 则是。

### 模式的优点 {#sec_encoding_schemas}

正如我们所见，Protocol Buffers 和 Avro 都使用模式来描述二进制编码格式。它们的模式语言比 XML 模式或 JSON 模式简单得多，后者支持更详细的验证规则（例如，"此字段的字符串值必须与此正则表达式匹配"或"此字段的整数值必须在 0 到 100 之间"）。由于 Protocol Buffers 和 Avro 在实现和使用上都更简单，它们已经发展到支持相当广泛的编程语言。

这些编码所基于的想法绝不是新的。例如，它们与 ASN.1 有很多共同之处，ASN.1 是 1984 年首次标准化的模式定义语言 [^23] [^24]。它用于定义各种网络协议，例如，其二进制编码（DER）至今仍用于编码 SSL 证书（X.509）[^25]。ASN.1 支持使用标签号的模式演化，类似于 Protocol Buffers [^26]。然而，它也非常复杂且文档记录不佳，因此 ASN.1 可能不是新应用程序的好选择。

许多数据系统也为其数据实现某种专有二进制编码。例如，大多数关系数据库都有一个网络协议，你可以通过它向数据库发送查询并获取响应。这些协议通常特定于特定数据库，数据库供应商提供驱动程序（例如，使用 ODBC 或 JDBC API），将数据库网络协议的响应解码为内存数据结构。

因此，我们可以看到，尽管文本数据格式（如 JSON、XML 和 CSV）广泛存在，但基于模式的二进制编码也是一个可行的选择。它们具有许多良好的属性：

* 它们可以比各种"二进制 JSON"变体紧凑得多，因为它们可以从编码数据中省略字段名。
* 模式是一种有价值的文档形式，并且由于解码需要模式，因此你可以确保它是最新的（而手动维护的文档很容易与现实脱节）。
* 保留模式数据库允许你在部署任何内容之前检查模式更改的向前和向后兼容性。
* 对于静态类型编程语言的用户，从模式生成代码的能力很有用，因为它可以在编译时进行类型检查。

总之，模式演化允许与无模式/读时模式 JSON 数据库相同的灵活性（参见 ["文档模型中的模式灵活性"](/ch3#sec_datamodels_schema_flexibility)），同时还提供更好的数据保证和更好的工具。

## 数据流的模式 {#sec_encoding_dataflow}

在本章开头，我们说过，当你想要将一些数据发送到与你不共享内存的另一个进程时——例如，当你想要通过网络发送数据或将其写入文件时——你需要将其编码为字节序列。然后，我们讨论了用于执行此操作的各种不同编码。

我们讨论了向前和向后兼容性，这对可演化性很重要（通过允许你独立升级系统的不同部分，而不必一次更改所有内容，使更改变得容易）。兼容性是编码数据的一个进程与解码数据的另一个进程之间的关系。

这是一个相当抽象的想法——数据可以通过许多方式从一个进程流向另一个进程。谁编码数据，谁解码数据？在本章的其余部分，我们将探讨数据在进程之间流动的一些最常见方式：

* 通过数据库（参见 ["流经数据库的数据流"](#sec_encoding_dataflow_db)）
* 通过服务调用（参见 ["流经服务的数据流：REST 与 RPC"](#sec_encoding_dataflow_rpc)）
* 通过工作流引擎（参见 ["持久化执行与工作流"](#sec_encoding_dataflow_workflows)）
* 通过异步消息（参见 ["事件驱动的架构"](#sec_encoding_dataflow_msg)）

### 流经数据库的数据流 {#sec_encoding_dataflow_db}

在数据库中，写入数据库的进程对数据进行编码，从数据库读取的进程对其进行解码。可能只有一个进程访问数据库，在这种情况下，读取者只是同一进程的后续版本——在这种情况下，你可以将在数据库中存储某些内容视为 *向未来的自己发送消息*。

向后兼容性在这里显然是必要的；否则你未来的自己将无法解码你之前写的内容。

一般来说，多个不同的进程同时访问数据库是很常见的。这些进程可能是几个不同的应用程序或服务，或者它们可能只是同一服务的几个实例（为了可伸缩性或容错而并行运行）。无论哪种方式，在应用程序正在更改的环境中，某些访问数据库的进程可能正在运行较新的代码，而某些进程正在运行较旧的代码——例如，因为新版本当前正在滚动升级中部署，因此某些实例已更新，而其他实例尚未更新。

这意味着数据库中的值可能由 *较新* 版本的代码写入，随后由仍在运行的 *较旧* 版本的代码读取。因此，数据库通常也需要向前兼容性。

#### 不同时间写入的不同值 {#different-values-written-at-different-times}

数据库通常允许在任何时间更新任何值。这意味着在单个数据库中，你可能有一些五毫秒前写入的值，以及一些五年前写入的值。

当你部署应用程序的新版本时（至少是服务端应用程序），你可能会在几分钟内用新版本完全替换旧版本。数据库内容并非如此：五年前的数据仍然存在，采用原始编码，除非你自那时以来明确重写了它。这种观察有时被总结为 *数据比代码更长寿*。

将数据重写（*迁移*）为新模式当然是可能的，但在大型数据集上这是一件昂贵的事情，因此大多数数据库尽可能避免它。大多数关系数据库允许简单的模式更改，例如添加具有 `null` 默认值的新列，而无需重写现有数据。读取旧行时，对于磁盘上的编码数据中缺少的任何列，数据库会为其填充 `null`。因此，模式演化允许整个数据库看起来好像是用单个模式编码的，即使底层存储可能包含用各种历史版本的模式编码的记录。

更复杂的模式更改——例如，将单值属性更改为多值，或将某些数据移动到单独的表中——仍然需要重写数据，通常在应用程序级别 [^27]。在此类迁移中保持向前和向后兼容性仍然是一个研究问题 [^28]。

#### 归档存储 {#archival-storage}

也许你会不时对数据库进行快照，例如用于备份目的或加载到数据仓库中（参见 ["数据仓库"](/ch1#sec_introduction_dwh)）。在这种情况下，数据转储通常将使用最新模式进行编码，即使源数据库中的原始编码包含来自不同时代的模式版本的混合。由于你无论如何都在复制数据，因此你不妨一致地对数据副本进行编码。

由于数据转储是一次性写入的，此后是不可变的，因此像 Avro 对象容器文件这样的格式非常适合。这也是将数据编码为分析友好的列式格式（如 Parquet）的好机会（参见 ["列压缩"](/ch4#sec_storage_column_compression)）。

在 [第 11 章](/ch11#ch_batch) 中，我们将更多地讨论如何使用归档存储中的数据。

### 流经服务的数据流：REST 与 RPC {#sec_encoding_dataflow_rpc}

当你有需要通过网络进行通信的进程时，有几种不同的方式来安排这种通信。最常见的安排是有两个角色：*客户端* 和 *服务器*。服务器通过网络公开 API，客户端可以连接到服务器以向该 API 发出请求。服务器公开的 API 称为 *服务*。

Web 就是这样工作的：客户端（Web 浏览器）向 Web 服务器发出请求，发出 `GET` 请求以下载 HTML、CSS、JavaScript、图像等，并发出 `POST` 请求以向服务器提交数据。API 由一组标准化的协议和数据格式（HTTP、URL、SSL/TLS、HTML 等）组成。由于 Web 浏览器、Web 服务器和网站作者大多同意这些标准，因此你可以使用任何 Web 浏览器访问任何网站（至少在理论上！）。

Web 浏览器不是唯一类型的客户端。例如，在移动设备和桌面计算机上运行的原生应用程序通常也与服务器通信，在 Web 浏览器内运行的客户端 JavaScript 应用程序也可以发出 HTTP 请求。在这种情况下，服务器的响应通常不是用于向人显示的 HTML，而是以便于客户端应用程序代码进一步处理的编码数据（最常见的是 JSON）。尽管 HTTP 可能用作传输协议，但在其之上实现的 API 是特定于应用程序的，客户端和服务器需要就该 API 的详细信息达成一致。

在某些方面，服务类似于数据库：它们通常允许客户端提交和查询数据。但是，虽然数据库允许使用我们在 [第 3 章](/ch3#ch_datamodels) 中讨论的查询语言进行任意查询，但服务公开了一个特定于应用程序的 API，该 API 仅允许由服务的业务逻辑（应用程序代码）预先确定的输入和输出 [^29]。这种限制提供了一定程度的封装：服务可以对客户端可以做什么和不能做什么施加细粒度的限制。

面向服务/微服务架构的一个关键设计目标是通过使服务可独立部署和演化来使应用程序更容易更改和维护。一个常见的原则是每个服务应该由一个团队拥有，该团队应该能够频繁发布服务的新版本，而无需与其他团队协调。因此，我们应该期望服务器和客户端的新旧版本同时运行，因此服务器和客户端使用的数据编码必须在服务 API 的各个版本之间兼容。

#### Web 服务 {#sec_web_services}

当 HTTP 用作与服务通信的底层协议时，它被称为 *Web 服务*。Web 服务通常用于构建面向服务或微服务架构（在 ["微服务与 Serverless"](/ch1#sec_introduction_microservices) 中讨论过）。术语"Web 服务"可能有点用词不当，因为 Web 服务不仅用于 Web，还用于几种不同的上下文。例如：

1. 在用户设备上运行的客户端应用程序（例如，移动设备上的原生应用程序，或浏览器中的 JavaScript Web 应用程序）向服务发出 HTTP 请求。这些请求通常通过公共互联网进行。
2. 一个服务向同一组织拥有的另一个服务发出请求，通常位于同一数据中心内，作为面向服务/微服务架构的一部分。
3. 一个服务向不同组织拥有的服务发出请求，通常通过互联网。这用于不同组织后端系统之间的数据交换。此类别包括在线服务提供的公共 API，例如信用卡处理系统或用于共享访问用户数据的 OAuth。

最流行的服务设计理念是 REST，它建立在 HTTP 的原则之上 [^30] [^31]。它强调简单的数据格式，使用 URL 来标识资源，并使用 HTTP 功能进行缓存控制、身份验证和内容类型协商。根据 REST 原则设计的 API 称为 *RESTful*。

需要调用 Web 服务 API 的代码必须知道要查询哪个 HTTP 端点，以及发送什么数据格式以及预期的响应。即使服务采用 RESTful 设计原则，客户端也需要以某种方式找出这些详细信息。服务开发人员通常使用接口定义语言（IDL）来定义和记录其服务的 API 端点和数据模型，并随着时间的推移演化它们。然后，其他开发人员可以使用服务定义来确定如何查询服务。两种最流行的服务 IDL 是 OpenAPI（也称为 Swagger [^32]）和 gRPC。OpenAPI 用于发送和接收 JSON 数据的 Web 服务，而 gRPC 服务发送和接收 Protocol Buffers。

开发人员通常用 JSON 或 YAML 编写 OpenAPI 服务定义；参见 [示例 5-3](#fig_open_api_def)。服务定义允许开发人员定义服务端点、文档、版本、数据模型等。gRPC 定义看起来类似，但使用 Protocol Buffers 服务定义进行定义。

{{< figure id="fig_open_api_def" title="示例 5-3. YAML 中的示例 OpenAPI 服务定义" class="w-full my-4" >}}

```yaml
openapi: 3.0.0
info:
  title: Ping, Pong
  version: 1.0.0
servers:
  - url: http://localhost:8080
paths:
  /ping:
    get:
      summary: Given a ping, returns a pong message
      responses:
        '200':
          description: A pong
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    example: Pong!
```

即使采用了设计理念和 IDL，开发人员仍必须编写实现其服务 API 调用的代码。通常采用服务框架来简化这项工作。Spring Boot、FastAPI 和 gRPC 等服务框架允许开发人员为每个 API 端点编写业务逻辑，而框架代码处理路由、指标、缓存、身份验证等。[示例 5-4](#fig_fastapi_def) 显示了 [示例 5-3](#fig_open_api_def) 中定义的服务的示例 Python 实现。

{{< figure id="fig_fastapi_def" title="示例 5-4. 实现 [示例 5-3](#fig_open_api_def) 中定义的示例 FastAPI 服务" class="w-full my-4" >}}

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Ping, Pong", version="1.0.0")

class PongResponse(BaseModel):
    message: str = "Pong!"

@app.get("/ping", response_model=PongResponse,
         summary="Given a ping, returns a pong message")
async def ping():
    return PongResponse()
```

许多框架将服务定义和服务器代码耦合在一起。在某些情况下，例如流行的 Python FastAPI 框架，服务器是用代码编写的，IDL 会自动生成。在其他情况下，例如 gRPC，首先编写服务定义，然后生成服务器代码脚手架。两种方法都允许开发人员从服务定义生成各种语言的客户端库和 SDK。除了代码生成之外，Swagger 等 IDL 工具还可以生成文档、验证模式更改兼容性，并为开发人员提供查询和测试服务的图形用户界面。

#### 远程过程调用（RPC）的问题 {#sec_problems_with_rpc}

Web 服务只是通过网络进行 API 请求的一长串技术的最新化身，其中许多技术获得了大量炒作但存在严重问题。Enterprise JavaBeans (EJB) 和 Java 的远程方法调用 (RMI) 仅限于 Java。分布式组件对象模型 (DCOM) 仅限于 Microsoft 平台。公共对象请求代理架构 (CORBA) 过于复杂，并且不提供向后或向前兼容性 [^33]。SOAP 和 WS-\* Web 服务框架旨在提供跨供应商的互操作性，但也受到复杂性和兼容性问题的困扰 [^34] [^35] [^36]。

所有这些都基于 *远程过程调用* (RPC) 的想法，这个想法自 1970 年代以来就存在了 [^37]。RPC 模型试图使向远程网络服务发出的请求，看起来与在同一进程内调用编程语言中的函数或方法一样（这种抽象称为 *位置透明性*）。尽管 RPC 起初似乎很方便，但这种方法从根本上是有缺陷的 [^38] [^39]。网络请求与本地函数调用非常不同：

* 本地函数调用是可预测的，要么成功要么失败，仅取决于你控制的参数。网络请求是不可预测的：由于网络问题，请求或响应可能会丢失，或者远程机器可能速度慢或不可用，而这些问题完全超出了你的控制。网络问题很常见，因此你必须预料到它们，例如通过重试失败的请求。
* 本地函数调用要么返回结果，要么抛出异常，要么永不返回（因为它进入无限循环或进程崩溃）。网络请求有另一种可能的结果：它可能由于 *超时* 而没有返回结果。在这种情况下，你根本不知道发生了什么：如果你没有从远程服务获得响应，你无法知道请求是否已送达。（我们将在 [第 9 章](/ch9#ch_distributed) 中更详细地讨论这个问题。）
* 如果你重试失败的网络请求，可能会发生前一个请求实际上已经成功，只是响应丢失了。在这种情况下，重试将导致操作执行多次，除非你在协议中构建去重机制（*幂等性*）[^40]。本地函数调用没有这个问题。（我们在 [“幂等性”](/ch12#sec_stream_idempotence) 中更详细地讨论幂等性。）
* 每次调用本地函数时，通常需要大约相同的时间来执行。网络请求比函数调用慢得多，其延迟也变化很大：在良好的时候，它可能在不到一毫秒内完成，但当网络拥塞或远程服务过载时，执行完全相同的操作可能需要许多秒。
* 当你调用本地函数时，你可以有效地将引用（指针）传递给本地内存中的对象。当你发出网络请求时，所有这些参数都需要编码为可以通过网络发送的字节序列。如果参数是不可变的原语，如数字或短字符串，那没问题，但对于更大量的数据和可变对象，它很快就会出现问题。
* 客户端和服务可能以不同的编程语言实现，因此 RPC 框架必须将数据类型从一种语言转换为另一种语言。这可能会变得很丑陋，因为并非所有语言都具有相同的类型——例如，回想一下 JavaScript 处理大于 2⁵³ 的数字的问题（参见 ["JSON、XML 及其二进制变体"](#sec_encoding_json)）。单一语言编写的单个进程中不存在此问题。

所有这些因素意味着，试图让远程服务看起来太像编程语言中的本地对象是没有意义的，因为它是根本不同的东西。REST 的部分吸引力在于它将网络上的状态传输视为与函数调用不同的过程。

#### 负载均衡器、服务发现和服务网格 {#sec_encoding_service_discovery}

所有服务都通过网络进行通信。因此，客户端必须知道它正在连接的服务的地址——这个问题称为 *服务发现*。最简单的方法是配置客户端连接到运行服务的 IP 地址和端口。此配置可以工作，但如果服务器离线、转移到新机器或变得过载，则必须手动重新配置客户端。

为了提供更高的可用性和可伸缩性，通常在不同的机器上运行服务的多个实例，其中任何一个都可以处理传入的请求。将请求分散到这些实例上称为 *负载均衡* [^41]。有许多负载均衡和服务发现解决方案可用：

* *硬件负载均衡器* 是安装在数据中心的专用设备。它们允许客户端连接到单个主机和端口，传入连接被路由到运行服务的服务器之一。此类负载均衡器在连接到下游服务器时检测网络故障，并将流量转移到其他服务器。
* *软件负载均衡器* 的行为方式与硬件负载均衡器大致相同。但是，软件负载均衡器（如 Nginx 和 HAProxy）不需要特殊设备，而是可以安装在标准机器上的应用程序。
* *域名服务 (DNS)* 是当你打开网页时在互联网上解析域名的方式。它通过允许多个 IP 地址与单个域名关联来支持负载均衡。然后，客户端可以配置为使用域名而不是 IP 地址连接到服务，并且客户端的网络层在建立连接时选择要使用的 IP 地址。这种方法的一个缺点是 DNS 旨在在较长时间内传播更改并缓存 DNS 条目。如果服务器频繁启动、停止或移动，客户端可能会看到不再有服务器运行的陈旧 IP 地址。
* *服务发现系统* 使用集中式注册表而不是 DNS 来跟踪哪些服务端点可用。当新服务实例启动时，它通过声明它正在侦听的主机和端口以及相关元数据（如分片所有权信息（参见 [第 7 章](/ch7#ch_sharding)）、数据中心位置等）向服务发现系统注册自己。然后，服务定期向发现系统发送心跳信号，以表明服务仍然可用。

  当客户端希望连接到服务时，它首先查询发现系统以获取可用端点列表，然后直接连接到端点。与 DNS 相比，服务发现支持服务实例频繁更改的更动态环境。发现系统还为客户端提供有关它们正在连接的服务的更多元数据，这使客户端能够做出更智能的负载均衡决策。
* *服务网格* 是一种复杂的负载均衡形式，它结合了软件负载均衡器和服务发现。与在单独机器上运行的传统软件负载均衡器不同，服务网格负载均衡器通常作为进程内客户端库或作为客户端和服务器上的进程或"边车"容器部署。客户端应用程序连接到它们自己的本地服务负载均衡器，该负载均衡器连接到服务器的负载均衡器。从那里，连接被路由到本地服务器进程。

  虽然复杂，但这种拓扑提供了许多优势。由于客户端和服务器完全通过本地连接路由，因此连接加密可以完全在负载均衡器级别处理。这使客户端和服务器免于处理 SSL 证书和 TLS 的复杂性。网格系统还提供复杂的可观测性。它们可以实时跟踪哪些服务正在相互调用，检测故障，跟踪流量负载等。

哪种解决方案合适取决于组织的需求。在使用 Kubernetes 等编排器的非常动态的服务环境中运行的组织通常选择运行 Istio 或 Linkerd 等服务网格。专门的基础设施（如数据库或消息传递系统）可能需要自己专门构建的负载均衡器。更简单的部署最适合使用软件负载均衡器。

#### RPC 的数据编码与演化 {#data-encoding-and-evolution-for-rpc}

对于可演化性，RPC 客户端和服务器可以独立更改和部署非常重要。与通过数据库流动的数据（如上一节所述）相比，我们可以在通过服务的数据流的情况下做出简化假设：假设所有服务器都先更新，然后所有客户端都更新是合理的。因此，你只需要在请求上向后兼容，在响应上向前兼容。

RPC 方案的向后和向前兼容性属性继承自它使用的任何编码：

* gRPC（Protocol Buffers）和 Avro RPC 可以根据各自编码格式的兼容性规则进行演化。
* RESTful API 最常使用 JSON 作为响应，以及 JSON 或 URI 编码/表单编码的请求参数作为请求。添加可选请求参数和向响应对象添加新字段通常被认为是保持兼容性的更改。

服务兼容性变得更加困难，因为 RPC 通常用于跨组织边界的通信，因此服务提供者通常无法控制其客户端，也无法强制它们升级。因此，兼容性需要保持很长时间，也许是无限期的。如果需要破坏兼容性的更改，服务提供者通常最终会并行维护服务 API 的多个版本。

关于 API 版本控制应该如何工作（即客户端如何指示它想要使用哪个版本的 API）没有达成一致 [^42]。对于 RESTful API，常见的方法是在 URL 中使用版本号或在 HTTP `Accept` 标头中使用。对于使用 API 密钥识别特定客户端的服务，另一个选项是在服务器上存储客户端请求的 API 版本，并允许通过单独的管理界面更新此版本选择 [^43]。

### 持久化执行与工作流 {#sec_encoding_dataflow_workflows}

根据定义，基于服务的架构具有多个服务，这些服务都负责应用程序的不同部分。考虑一个处理信用卡并将资金存入银行账户的支付处理应用程序。该系统可能有不同的服务负责欺诈检测、信用卡集成、银行集成等。

在我们的示例中，处理单个付款需要许多服务调用。支付处理器服务可能会调用欺诈检测服务以检查欺诈，调用信用卡服务以扣除信用卡费用，并调用银行服务以存入扣除的资金，如 [图 5-7](#fig_encoding_workflow) 所示。我们将这一系列步骤称为 *工作流*，每个步骤称为 *任务*。工作流通常定义为任务图。工作流定义可以用通用编程语言、领域特定语言 (DSL) 或标记语言（如业务流程执行语言 (BPEL)）[^44] 编写。

--------

> [!TIP] 任务、活动和函数
>
> 不同的工作流引擎对任务使用不同的名称。例如，Temporal 使用术语 *活动*。其他引擎将任务称为 *持久函数*。虽然名称不同，但概念是相同的。

--------

{{< figure src="/fig/ddia_0507.png" id="fig_encoding_workflow" caption="图 5-7. 使用业务流程模型和标记法 (BPMN) 表示的工作流示例，这是一种图形标记法。" class="w-full my-4" >}}


工作流由 *工作流引擎* 运行或执行。工作流引擎确定何时运行每个任务、任务必须在哪台机器上运行、如果任务失败该怎么办（例如，如果机器在任务运行时崩溃）、允许并行执行多少任务等。

工作流引擎通常由编排器和执行器组成。编排器负责调度要执行的任务，执行器负责执行任务。当工作流被触发时，执行开始。如果用户定义了基于时间的调度（例如每小时执行），则编排器会自行触发工作流。外部源（如 Web 服务）甚至人类也可以触发工作流执行。一旦触发，就会调用执行器来运行任务。

有许多类型的工作流引擎可以满足各种各样的用例。有些，如 Airflow、Dagster 和 Prefect，与数据系统集成并编排 ETL 任务。其他的，如 Camunda 和 Orkes，为工作流提供图形标记法（如 [图 5-7](#fig_encoding_workflow) 中使用的 BPMN），以便非工程师可以更轻松地定义和执行工作流。还有一些，如 Temporal 和 Restate，提供 *持久化执行*。

#### 持久化执行 {#durable-execution}

持久化执行框架已成为构建需要事务性的基于服务的架构的流行方式。在我们的支付示例中，我们希望每笔付款都恰好处理一次。工作流执行期间的故障可能导致信用卡扣费，但没有相应的银行账户存款。在基于服务的架构中，我们不能简单地将两个任务包装在数据库事务中。此外，我们可能正在与我们控制有限的第三方支付网关进行交互。

持久化执行框架是为工作流提供 *恰好一次语义* 的一种方式。如果任务失败，框架将重新执行该任务，但会跳过任务在失败之前成功完成的任何 RPC 调用或状态更改。相反，框架将假装进行调用，但实际上将返回先前调用的结果。这是可能的，因为持久化执行框架将所有 RPC 和状态更改记录到持久存储（如预写日志）[^45] [^46]。[示例 5-5](#fig_temporal_workflow) 显示了使用 Temporal 支持持久化执行的工作流定义示例。

{{< figure id="fig_temporal_workflow" title="示例 5-5. [图 5-7](#fig_encoding_workflow) 中支付工作流的 Temporal 工作流定义片段。" class="w-full my-4" >}}

```python
@workflow.defn
class PaymentWorkflow:
    @workflow.run
    async def run(self, payment: PaymentRequest) -> PaymentResult:
        is_fraud = await workflow.execute_activity(
            check_fraud,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        if is_fraud:
            return PaymentResultFraudulent
        credit_card_response = await workflow.execute_activity(
            debit_credit_card,
            payment,
            start_to_close_timeout=timedelta(seconds=15),
        )
        # ...
```

像 Temporal 这样的框架并非没有挑战。外部服务（例如我们示例中的第三方支付网关）仍必须提供幂等 API。开发人员必须记住为这些 API 使用唯一 ID 以防止重复执行 [^47]。由于持久化执行框架按顺序记录每个 RPC 调用，因此它期望后续执行以相同的顺序进行相同的 RPC 调用。这使得代码更改变得脆弱：你可能仅通过重新排序函数调用就引入未定义的行为 [^48]。与其修改现有工作流的代码，不如单独部署新版本的代码更安全，以便现有工作流调用的重新执行继续使用旧版本，只有新调用使用新代码 [^49]。

同样，由于持久化执行框架期望以确定性方式重放所有代码（相同的输入产生相同的输出），因此随机数生成器或系统时钟等非确定性代码会产生问题 [^48]。框架通常会为这类库函数提供自己的确定性实现，但你必须记得使用它们。在某些情况下，例如 Temporal 的 workflowcheck 工具，框架还会提供静态分析工具来判断是否引入了非确定性行为。

--------

> [!NOTE]
> 使代码具有确定性是一个强大的想法，但要稳健地做到这一点很棘手。在 ["确定性的力量"](/ch9#sidebar_distributed_determinism) 中，我们将回到这个话题。

--------

### 事件驱动的架构 {#sec_encoding_dataflow_msg}

在这最后一节中，我们将简要介绍 *事件驱动架构*，这是编码数据从一个进程流向另一个进程的另一种方式。请求称为 *事件* 或 *消息*；与 RPC 不同，发送者通常不会等待接收者处理事件。此外，事件通常不是通过直接网络连接发送给接收者，而是通过称为 *消息代理*（也称为 *事件代理*、*消息队列* 或 *面向消息的中间件*）的中介，它临时存储消息 [^50]。

使用消息代理与直接 RPC 相比有几个优点：

* 如果接收者不可用或过载，它可以充当缓冲区，从而提高系统可靠性。
* 它可以自动将消息重新传递给已崩溃的进程，从而防止消息丢失。
* 它避免了服务发现的需要，因为发送者不需要直接连接到接收者的 IP 地址。
* 它允许将相同的消息发送给多个接收者。
* 它在逻辑上将发送者与接收者解耦（发送者只是发布消息，不关心谁使用它们）。

通过消息代理的通信是 *异步的*：发送者不会等待消息被传递，而是简单地发送它然后忘记它。可以通过让发送者在单独的通道上等待响应来实现类似同步 RPC 的模型。

#### 消息代理 {#message-brokers}

过去，消息代理的格局由 TIBCO、IBM WebSphere 和 webMethods 等公司的商业企业软件主导，然后开源实现（如 RabbitMQ、ActiveMQ、HornetQ、NATS 和 Apache Kafka）变得流行。最近，云服务（如 Amazon Kinesis、Azure Service Bus 和 Google Cloud Pub/Sub）也获得了采用。我们将在 [“消息系统”](/ch12#sec_stream_messaging) 中更详细地比较它们。

详细的传递语义因实现和配置而异，但通常，最常使用两种消息分发模式：

* 一个进程将消息添加到命名 *队列*，代理将该消息传递给该队列的 *消费者*。如果有多个消费者，其中一个会收到消息。
* 一个进程将消息发布到命名 *主题*，代理将该消息传递给该主题的所有 *订阅者*。如果有多个订阅者，他们都会收到消息。

消息代理通常不强制执行任何特定的数据模型——消息只是带有一些元数据的字节序列，因此你可以使用任何编码格式。常见的方法是使用 Protocol Buffers、Avro 或 JSON，并在消息代理旁边部署模式注册表来存储所有有效的模式版本并检查其兼容性 [^19] [^21]。AsyncAPI（OpenAPI 的基于消息传递的等效物）也可用于指定消息的模式。

消息代理在消息的持久性方面有所不同。许多将消息写入磁盘，以便在消息代理崩溃或需要重新启动时不会丢失。与数据库不同，许多消息代理在消息被消费后会自动再次删除消息。某些代理可以配置为无限期地存储消息，如果你想使用事件溯源，这是必需的（参见 ["事件溯源与 CQRS"](/ch3#sec_datamodels_events)）。

如果消费者将消息重新发布到另一个主题，你可能需要小心保留未知字段，以防止前面在数据库上下文中描述的问题（[图 5-1](#fig_encoding_preserve_field)）。

#### 分布式 actor 框架 {#distributed-actor-frameworks}

*Actor 模型* 是一种用于单个进程内并发的编程模型。它不直接处理线程（以及相关的竞态条件、锁定和死锁问题），而是将逻辑封装在 *actor* 中。每个 actor 通常代表一个客户端或实体，它可能有一些本地状态（不与任何其他 actor 共享），并通过发送和接收异步消息与其他 actor 通信。消息传递没有保证：在某些错误场景中，消息会丢失。由于每个 actor 一次只处理一条消息，因此它不需要担心线程，并且每个 actor 可以由框架独立调度。

在 *分布式 actor 框架* 中，如 Akka、Orleans [^51] 和 Erlang/OTP，此编程模型用于跨多个节点扩展应用程序。无论发送者和接收者是在同一节点还是不同节点上，都使用相同的消息传递机制。如果它们在不同的节点上，消息将透明地编码为字节序列，通过网络发送，并在另一端解码。

位置透明性在 actor 模型中比在 RPC 中效果更好，因为 actor 模型已经假定消息可能会丢失，即使在单个进程内也是如此。尽管网络上的延迟可能比同一进程内的延迟更高，但在使用 actor 模型时，本地和远程通信之间的根本不匹配较少。

分布式 actor 框架本质上将消息代理和 actor 编程模型集成到单个框架中。但是，如果你想对基于 actor 的应用程序执行滚动升级，你仍然必须担心向前和向后兼容性，因为消息可能从运行新版本的节点发送到运行旧版本的节点，反之亦然。这可以通过使用本章中讨论的编码之一来实现。


## 总结 {#summary}

在本章中，我们研究了将数据结构转换为网络上的字节或磁盘上的字节的几种方法。我们看到了这些编码的细节不仅影响其效率，更重要的是还影响应用程序的架构和演化选项。

特别是，许多服务需要支持滚动升级，其中服务的新版本逐步部署到少数节点，而不是同时部署到所有节点。滚动升级允许在不停机的情况下发布服务的新版本（从而鼓励频繁的小版本发布而不是罕见的大版本发布），并使部署风险更低（允许在影响大量用户之前检测和回滚有故障的版本）。这些属性对 *可演化性* 非常有益，即轻松进行应用程序更改。

在滚动升级期间，或出于其他各种原因，我们必须假设不同的节点正在运行我们应用程序代码的不同版本。因此，重要的是系统中流动的所有数据都以提供向后兼容性（新代码可以读取旧数据）和向前兼容性（旧代码可以读取新数据）的方式进行编码。

我们讨论了几种数据编码格式及其兼容性属性：

* 特定于编程语言的编码仅限于单一编程语言，并且通常无法提供向前和向后兼容性。
* 文本格式（如 JSON、XML 和 CSV）广泛存在，其兼容性取决于你如何使用它们。它们有可选的模式语言，有时有帮助，有时是障碍。这些格式在数据类型方面有些模糊，因此你必须小心处理数字和二进制字符串等内容。
* 二进制模式驱动的格式（如 Protocol Buffers 和 Avro）允许使用明确定义的向前和向后兼容性语义进行紧凑、高效的编码。模式可用于文档，也可用于在静态类型语言中生成代码。但是，这些格式的缺点是数据需要先解码才能被人类阅读。

我们还讨论了几种数据流模式，说明了数据编码很重要的不同场景：

* 数据库，其中写入数据库的进程对数据进行编码，从数据库读取的进程对其进行解码
* RPC 和 REST API，其中客户端对请求进行编码，服务器对请求进行解码并对响应进行编码，客户端最终对响应进行解码
* 事件驱动架构（使用消息代理或 actor），其中节点通过相互发送消息进行通信，这些消息由发送者编码并由接收者解码

我们可以得出结论，通过一点小心，向后/向前兼容性和滚动升级是完全可以实现的。愿你的应用程序演化迅速，部署频繁。


### 参考

[^1]: [CWE-502: Deserialization of Untrusted Data](https://cwe.mitre.org/data/definitions/502.html). Common Weakness Enumeration, *cwe.mitre.org*, July 2006. Archived at [perma.cc/26EU-UK9Y](https://perma.cc/26EU-UK9Y) 
[^2]: Steve Breen. [What Do WebLogic, WebSphere, JBoss, Jenkins, OpenNMS, and Your Application Have in Common? This Vulnerability](https://foxglovesecurity.com/2015/11/06/what-do-weblogic-websphere-jboss-jenkins-opennms-and-your-application-have-in-common-this-vulnerability/). *foxglovesecurity.com*, November 2015. Archived at [perma.cc/9U97-UVVD](https://perma.cc/9U97-UVVD) 
[^3]: Patrick McKenzie. [What the Rails Security Issue Means for Your Startup](https://www.kalzumeus.com/2013/01/31/what-the-rails-security-issue-means-for-your-startup/). *kalzumeus.com*, January 2013. Archived at [perma.cc/2MBJ-7PZ6](https://perma.cc/2MBJ-7PZ6) 
[^4]: Brian Goetz. [Towards Better Serialization](https://openjdk.org/projects/amber/design-notes/towards-better-serialization). *openjdk.org*, June 2019. Archived at [perma.cc/UK6U-GQDE](https://perma.cc/UK6U-GQDE) 
[^5]: Eishay Smith. [jvm-serializers wiki](https://github.com/eishay/jvm-serializers/wiki). *github.com*, October 2023. Archived at [perma.cc/PJP7-WCNG](https://perma.cc/PJP7-WCNG) 
[^6]: [XML Is a Poor Copy of S-Expressions](https://wiki.c2.com/?XmlIsaPoorCopyOfEssExpressions). *wiki.c2.com*, May 2013. Archived at [perma.cc/7FAN-YBKL](https://perma.cc/7FAN-YBKL) 
[^7]: Julia Evans. [Examples of floating point problems](https://jvns.ca/blog/2023/01/13/examples-of-floating-point-problems/). *jvns.ca*, January 2023. Archived at [perma.cc/M57L-QKKW](https://perma.cc/M57L-QKKW) 
[^8]: Matt Harris. [Snowflake: An Update and Some Very Important Information](https://groups.google.com/g/twitter-development-talk/c/ahbvo3VTIYI). Email to *Twitter Development Talk* mailing list, October 2010. Archived at [perma.cc/8UBV-MZ3D](https://perma.cc/8UBV-MZ3D) 
[^9]: Yakov Shafranovich. [RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://tools.ietf.org/html/rfc4180). IETF, October 2005. 
[^10]: Andy Coates. [Evolving JSON Schemas - Part I](https://www.creekservice.org/articles/2024/01/08/json-schema-evolution-part-1.html) and [Part II](https://www.creekservice.org/articles/2024/01/09/json-schema-evolution-part-2.html). *creekservice.org*, January 2024. Archived at [perma.cc/MZW3-UA54](https://perma.cc/MZW3-UA54) and [perma.cc/GT5H-WKZ5](https://perma.cc/GT5H-WKZ5) 
[^11]: Pierre Genevès, Nabil Layaïda, and Vincent Quint. [Ensuring Query Compatibility with Evolving XML Schemas](https://arxiv.org/abs/0811.4324). INRIA Technical Report 6711, November 2008. 
[^12]: Tim Bray. [Bits On the Wire](https://www.tbray.org/ongoing/When/201x/2019/11/17/Bits-On-the-Wire). *tbray.org*, November 2019. Archived at [perma.cc/3BT3-BQU3](https://perma.cc/3BT3-BQU3) 
[^13]: Mark Slee, Aditya Agarwal, and Marc Kwiatkowski. [Thrift: Scalable Cross-Language Services Implementation](https://thrift.apache.org/static/files/thrift-20070401.pdf). Facebook technical report, April 2007. Archived at [perma.cc/22BS-TUFB](https://perma.cc/22BS-TUFB) 
[^14]: Martin Kleppmann. [Schema Evolution in Avro, Protocol Buffers and Thrift](https://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html). *martin.kleppmann.com*, December 2012. Archived at [perma.cc/E4R2-9RJT](https://perma.cc/E4R2-9RJT) 
[^15]: Doug Cutting, Chad Walters, Jim Kellerman, et al. [[PROPOSAL] New Subproject: Avro](https://lists.apache.org/thread/z571w0r5jmfsjvnl0fq4fgg0vh28d3bk). Email thread on *hadoop-general* mailing list, *lists.apache.org*, April 2009. Archived at [perma.cc/4A79-BMEB](https://perma.cc/4A79-BMEB) 
[^16]: Apache Software Foundation. [Apache Avro 1.12.0 Specification](https://avro.apache.org/docs/1.12.0/specification/). *avro.apache.org*, August 2024. Archived at [perma.cc/C36P-5EBQ](https://perma.cc/C36P-5EBQ) 
[^17]: Apache Software Foundation. [Avro schemas as LL(1) CFG definitions](https://avro.apache.org/docs/1.12.0/api/java/org/apache/avro/io/parsing/doc-files/parsing.html). *avro.apache.org*, August 2024. Archived at [perma.cc/JB44-EM9Q](https://perma.cc/JB44-EM9Q) 
[^18]: Tony Hoare. [Null References: The Billion Dollar Mistake](https://www.infoq.com/presentations/Null-References-The-Billion-Dollar-Mistake-Tony-Hoare/). Talk at *QCon London*, March 2009. 
[^19]: Confluent, Inc. [Schema Registry Overview](https://docs.confluent.io/platform/current/schema-registry/index.html). *docs.confluent.io*, 2024. Archived at [perma.cc/92C3-A9JA](https://perma.cc/92C3-A9JA) 
[^20]: Aditya Auradkar and Tom Quiggle. [Introducing Espresso—LinkedIn’s Hot New Distributed Document Store](https://engineering.linkedin.com/espresso/introducing-espresso-linkedins-hot-new-distributed-document-store). *engineering.linkedin.com*, January 2015. Archived at [perma.cc/FX4P-VW9T](https://perma.cc/FX4P-VW9T) 
[^21]: Jay Kreps. [Putting Apache Kafka to Use: A Practical Guide to Building a Stream Data Platform (Part 2)](https://www.confluent.io/blog/event-streaming-platform-2/). *confluent.io*, February 2015. Archived at [perma.cc/8UA4-ZS5S](https://perma.cc/8UA4-ZS5S) 
[^22]: Gwen Shapira. [The Problem of Managing Schemas](https://www.oreilly.com/content/the-problem-of-managing-schemas/). *oreilly.com*, November 2014. Archived at [perma.cc/BY8Q-RYV3](https://perma.cc/BY8Q-RYV3) 
[^23]: John Larmouth. [*ASN.1 Complete*](https://www.oss.com/asn1/resources/books-whitepapers-pubs/larmouth-asn1-book.pdf). Morgan Kaufmann, 1999. ISBN: 978-0-122-33435-1. Archived at [perma.cc/GB7Y-XSXQ](https://perma.cc/GB7Y-XSXQ) 
[^24]: Burton S. Kaliski Jr. [A Layman’s Guide to a Subset of ASN.1, BER, and DER](https://luca.ntop.org/Teaching/Appunti/asn1.html). Technical Note, RSA Data Security, Inc., November 1993. Archived at [perma.cc/2LMN-W9U8](https://perma.cc/2LMN-W9U8) 
[^25]: Jacob Hoffman-Andrews. [A Warm Welcome to ASN.1 and DER](https://letsencrypt.org/docs/a-warm-welcome-to-asn1-and-der/). *letsencrypt.org*, April 2020. Archived at [perma.cc/CYT2-GPQ8](https://perma.cc/CYT2-GPQ8) 
[^26]: Lev Walkin. [Question: Extensibility and Dropping Fields](https://lionet.info/asn1c/blog/2010/09/21/question-extensibility-removing-fields/). *lionet.info*, September 2010. Archived at [perma.cc/VX8E-NLH3](https://perma.cc/VX8E-NLH3) 
[^27]: Jacqueline Xu. [Online migrations at scale](https://stripe.com/blog/online-migrations). *stripe.com*, February 2017. Archived at [perma.cc/X59W-DK7Y](https://perma.cc/X59W-DK7Y) 
[^28]: Geoffrey Litt, Peter van Hardenberg, and Orion Henry. [Project Cambria: Translate your data with lenses](https://www.inkandswitch.com/cambria/). Technical Report, *Ink & Switch*, October 2020. Archived at [perma.cc/WA4V-VKDB](https://perma.cc/WA4V-VKDB) 
[^29]: Pat Helland. [Data on the Outside Versus Data on the Inside](https://www.cidrdb.org/cidr2005/papers/P12.pdf). At *2nd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2005. 
[^30]: Roy Thomas Fielding. [Architectural Styles and the Design of Network-Based Software Architectures](https://ics.uci.edu/~fielding/pubs/dissertation/fielding_dissertation.pdf). PhD Thesis, University of California, Irvine, 2000. Archived at [perma.cc/LWY9-7BPE](https://perma.cc/LWY9-7BPE) 
[^31]: Roy Thomas Fielding. [REST APIs must be hypertext-driven](https://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven). *roy.gbiv.com*, October 2008. Archived at [perma.cc/M2ZW-8ATG](https://perma.cc/M2ZW-8ATG) 
[^32]: [OpenAPI Specification Version 3.1.0](https://swagger.io/specification/). *swagger.io*, February 2021. Archived at [perma.cc/3S6S-K5M4](https://perma.cc/3S6S-K5M4) 
[^33]: Michi Henning. [The Rise and Fall of CORBA](https://cacm.acm.org/practice/the-rise-and-fall-of-corba/). *Communications of the ACM*, volume 51, issue 8, pages 52–57, August 2008. [doi:10.1145/1378704.1378718](https://doi.org/10.1145/1378704.1378718) 
[^34]: Pete Lacey. [The S Stands for Simple](https://harmful.cat-v.org/software/xml/soap/simple). *harmful.cat-v.org*, November 2006. Archived at [perma.cc/4PMK-Z9X7](https://perma.cc/4PMK-Z9X7) 
[^35]: Stefan Tilkov. [Interview: Pete Lacey Criticizes Web Services](https://www.infoq.com/articles/pete-lacey-ws-criticism/). *infoq.com*, December 2006. Archived at [perma.cc/JWF4-XY3P](https://perma.cc/JWF4-XY3P) 
[^36]: Tim Bray. [The Loyal WS-Opposition](https://www.tbray.org/ongoing/When/200x/2004/09/18/WS-Oppo). *tbray.org*, September 2004. Archived at [perma.cc/J5Q8-69Q2](https://perma.cc/J5Q8-69Q2) 
[^37]: Andrew D. Birrell and Bruce Jay Nelson. [Implementing Remote Procedure Calls](https://www.cs.princeton.edu/courses/archive/fall03/cs518/papers/rpc.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 2, issue 1, pages 39–59, February 1984. [doi:10.1145/2080.357392](https://doi.org/10.1145/2080.357392) 
[^38]: Jim Waldo, Geoff Wyant, Ann Wollrath, and Sam Kendall. [A Note on Distributed Computing](https://m.mirror.facebook.net/kde/devel/smli_tr-94-29.pdf). Sun Microsystems Laboratories, Inc., Technical Report TR-94-29, November 1994. Archived at [perma.cc/8LRZ-BSZR](https://perma.cc/8LRZ-BSZR) 
[^39]: Steve Vinoski. [Convenience over Correctness](https://steve.vinoski.net/pdf/IEEE-Convenience_Over_Correctness.pdf). *IEEE Internet Computing*, volume 12, issue 4, pages 89–92, July 2008. [doi:10.1109/MIC.2008.75](https://doi.org/10.1109/MIC.2008.75) 
[^40]: Brandur Leach. [Designing robust and predictable APIs with idempotency](https://stripe.com/blog/idempotency). *stripe.com*, February 2017. Archived at [perma.cc/JD22-XZQT](https://perma.cc/JD22-XZQT) 
[^41]: Sam Rose. [Load Balancing](https://samwho.dev/load-balancing/). *samwho.dev*, April 2023. Archived at [perma.cc/Q7BA-9AE2](https://perma.cc/Q7BA-9AE2) 
[^42]: Troy Hunt. [Your API versioning is wrong, which is why I decided to do it 3 different wrong ways](https://www.troyhunt.com/your-api-versioning-is-wrong-which-is/). *troyhunt.com*, February 2014. Archived at [perma.cc/9DSW-DGR5](https://perma.cc/9DSW-DGR5) 
[^43]: Brandur Leach. [APIs as infrastructure: future-proofing Stripe with versioning](https://stripe.com/blog/api-versioning). *stripe.com*, August 2017. Archived at [perma.cc/L63K-USFW](https://perma.cc/L63K-USFW) 
[^44]: Alexandre Alves, Assaf Arkin, Sid Askary, et al. [Web Services Business Process Execution Language Version 2.0](https://docs.oasis-open.org/wsbpel/2.0/wsbpel-v2.0.html). *docs.oasis-open.org*, April 2007. 
[^45]: [What is a Temporal Service?](https://docs.temporal.io/clusters) *docs.temporal.io*, 2024. Archived at [perma.cc/32P3-CJ9V](https://perma.cc/32P3-CJ9V) 
[^46]: Stephan Ewen. [Why we built Restate](https://restate.dev/blog/why-we-built-restate/). *restate.dev*, August 2023. Archived at [perma.cc/BJJ2-X75K](https://perma.cc/BJJ2-X75K) 
[^47]: Keith Tenzer and Joshua Smith. [Idempotency and Durable Execution](https://temporal.io/blog/idempotency-and-durable-execution). *temporal.io*, February 2024. Archived at [perma.cc/9LGW-PCLU](https://perma.cc/9LGW-PCLU) 
[^48]: [What is a Temporal Workflow?](https://docs.temporal.io/workflows) *docs.temporal.io*, 2024. Archived at [perma.cc/B5C5-Y396](https://perma.cc/B5C5-Y396) 
[^49]: Jack Kleeman. [Solving durable execution’s immutability problem](https://restate.dev/blog/solving-durable-executions-immutability-problem/). *restate.dev*, February 2024. Archived at [perma.cc/G55L-EYH5](https://perma.cc/G55L-EYH5) 
[^50]: Srinath Perera. [Exploring Event-Driven Architecture: A Beginner’s Guide for Cloud Native Developers](https://wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/). *wso2.com*, August 2023. Archived at [archive.org](https://web.archive.org/web/20240716204613/https%3A//wso2.com/blogs/thesource/exploring-event-driven-architecture-a-beginners-guide-for-cloud-native-developers/) 
[^51]: Philip A. Bernstein, Sergey Bykov, Alan Geller, Gabriel Kliot, and Jorgen Thelin. [Orleans: Distributed Virtual Actors for Programmability and Scalability](https://www.microsoft.com/en-us/research/publication/orleans-distributed-virtual-actors-for-programmability-and-scalability/). Microsoft Research Technical Report MSR-TR-2014-41, March 2014. Archived at [perma.cc/PD3U-WDMF](https://perma.cc/PD3U-WDMF) 


================================================
FILE: content/zh/ch6.md
================================================
---
title: "6. 复制"
weight: 206
breadcrumbs: false
---

<a id="ch_replication"></a>

![](/map/ch05.png)

> *可能出错的东西和“不可能”出错的东西之间，最大的区别在于：后者一旦出错，往往几乎无从下手，也难以修复。*
>
> 道格拉斯·亚当斯，《基本无害》（1992）

**复制** 指的是在通过网络连接的多台机器上保留相同数据的副本。如 ["分布式与单节点系统"](/ch1#sec_introduction_distributed) 中所讨论的，你可能出于以下几个原因希望复制数据：

* 使数据在地理上更接近用户（从而减少访问延迟）
* 即使系统的部分组件出现故障，也能让系统继续工作（从而提高可用性）
* 扩展能够处理读查询的机器数量（从而提高读吞吐量）

本章假设你的数据集足够小，每台机器都可以保存整个数据集的副本。在 [第 7 章](/ch7#ch_sharding) 中，我们将放宽这一假设，讨论单台机器无法容纳的、过大数据集的 **分片**（**分区**）。在后续章节中，我们将讨论复制数据系统中可能发生的各种故障，以及如何处理它们。

如果需要复制的数据不会随时间变化，那么复制就很简单：只需要将数据复制到每个节点一次就大功告成。处理复制的所有困难都在于处理复制数据的 **变更**，这也是本章的主题。我们将讨论三类在节点之间复制变更的算法：**单主**、**多主** 和 **无主** 复制。几乎所有分布式数据库都使用这三种方法之一。它们各有利弊，我们将详细研究。

复制需要考虑许多权衡：例如，是使用同步还是异步复制，以及如何处理失败的副本。这些通常是数据库中的配置选项，尽管不同数据库的细节有所不同，但许多不同实现的通用原则是相似的。我们将在本章中讨论这些选择的后果。

数据库复制是一个古老的话题——自 20 世纪 70 年代研究以来，原理并没有太大变化 [^1]，因为网络的基本约束保持不变。尽管如此古老，像 **最终一致性** 这样的概念仍然会引起困惑。在 ["复制延迟的问题"](#sec_replication_lag) 中，我们将更准确地了解最终一致性，并讨论诸如 **读己之写** 和 **单调读** 等保证。

--------

> [!TIP] 备份与复制
>
> 你可能会想，如果有了复制，是否还需要备份。答案是肯定的，因为它们有不同的目的：副本会快速将一个节点的写入反映到其他节点上，但备份存储数据的旧快照，以便你可以回到过去的时间点。如果你不小心删除了一些数据，复制并不能帮助你，因为删除操作也会传播到副本，所以如果你想恢复被删除的数据，就需要备份。
>
> 事实上，复制和备份通常是相互补充的。备份有时是设置复制过程的一部分，正如我们将在 ["设置新的副本"](#sec_replication_new_replica) 中看到的。反过来，归档复制日志可以成为备份过程的一部分。
>
> 一些数据库在内部维护过去状态的不可变快照，作为一种内部备份。然而，这意味着在与当前状态相同的存储介质上保留数据的旧版本。如果你有大量数据，将旧数据的备份保存在针对不常访问数据优化的对象存储中可能会更便宜，而只在主存储中存储数据库的当前状态。

--------

## 单主复制 {#sec_replication_leader}

存储数据库副本的每个节点称为 **副本**。有了多个副本，不可避免地会出现一个问题：我们如何确保所有数据最终都出现在所有副本上？

每次写入数据库都需要由每个副本处理；否则，副本将不再包含相同的数据。最常见的解决方案称为 **基于领导者的复制**、**主备复制** 或 **主动/被动复制**。它的工作原理如下（见 [图 6-1](#fig_replication_leader_follower)）：

1. 其中一个副本被指定为 **领导者**（也称为 **主库** 或 **源** [^2]）。当客户端想要写入数据库时，他们必须将请求发送给领导者，领导者首先将新数据写入其本地存储。
2. 其他副本称为 **追随者**（**只读副本**、**从库** 或 **热备**）。每当领导者将新数据写入其本地存储时，它也会将数据变更作为 **复制日志** 或 **变更流** 的一部分发送给所有追随者。每个追随者从领导者获取日志，并通过按照与领导者处理相同的顺序应用所有写入来相应地更新其本地数据库副本。
3. 当客户端想要从数据库读取时，它可以查询领导者或任何追随者。然而，只有领导者接受写入（从客户端的角度来看，追随者是只读的）。

{{< figure src="/fig/ddia_0601.png" id="fig_replication_leader_follower" caption="图 6-1. 单主复制将所有写入定向到指定的领导者，该领导者向追随者发送变更流。" class="w-full my-4" >}}

如果数据库是分片的（见 [第 7 章](/ch7#ch_sharding)），每个分片都有一个领导者。不同的分片可能在不同的节点上有其领导者，但每个分片仍必须有一个领导者。在 ["多主复制"](#sec_replication_multi_leader) 中，我们将讨论一种替代模型，其中系统可能同时为同一分片拥有多个领导者。

单主复制被广泛使用。它是许多关系数据库的内置功能，如 PostgreSQL、MySQL、Oracle Data Guard [^3] 和 SQL Server 的 Always On 可用性组 [^4]。它也用于一些文档数据库，如 MongoDB 和 DynamoDB [^5]，消息代理如 Kafka，复制块设备如 DRBD，以及一些网络文件系统。许多共识算法（如 Raft）也基于单个领导者，用于 CockroachDB [^6]、TiDB [^7]、etcd 和 RabbitMQ 仲裁队列（以及其他）中的复制，并在旧领导者失败时自动选举新领导者（我们将在 [第 10 章](/ch10#ch_consistency) 中更详细地讨论共识）。

--------

> [!NOTE]
> 在较旧的文档中，你可能会看到术语 **主从复制**。它与基于领导者的复制含义相同，但应该避免使用该术语，因为它被广泛认为是冒犯性的 [^8]。

--------

### 同步复制与异步复制 {#sec_replication_sync_async}

复制系统的一个重要细节是复制是 **同步** 发生还是 **异步** 发生。（在关系数据库中，这通常是一个可配置选项；其他系统通常硬编码为其中之一。）

想想 [图 6-1](#fig_replication_leader_follower) 中发生的情况，一个网站用户更新他们的个人资料图片。在某个时间点，客户端向领导者发送更新请求；不久之后，领导者收到了它。在某个时间点，领导者将数据变更转发给追随者。最终，领导者通知客户端更新成功。[图 6-2](#fig_replication_sync_replication) 显示了时序可能的工作方式。

{{< figure src="/fig/ddia_0602.png" id="fig_replication_sync_replication" caption="图 6-2. 基于领导者的复制，带有一个同步和一个异步追随者。" class="w-full my-4" >}}

在 [图 6-2](#fig_replication_sync_replication) 的示例中，对追随者 1 的复制是 **同步的**：领导者等待追随者 1 确认它已收到写入，然后才向用户报告成功，并使写入对其他客户端可见。对追随者 2 的复制是 **异步的**：领导者发送消息，但不等待追随者的响应。

图中显示，追随者 2 处理消息之前有相当大的延迟。通常，复制相当快：大多数数据库系统在不到一秒的时间内将变更应用到追随者。然而，不能保证需要多长时间。在某些情况下，追随者可能落后领导者几分钟或更长时间；例如，如果追随者正在从故障中恢复，如果系统正在接近最大容量运行，或者如果节点之间存在网络问题。

同步复制的优点是追随者保证拥有与领导者一致的最新数据副本。如果领导者突然失败，我们可以确信数据仍然在追随者上可用。缺点是，如果同步追随者没有响应（因为它已崩溃，或存在网络故障，或任何其他原因），写入就无法处理。领导者必须阻塞所有写入并等待同步副本再次可用。

因此，将所有追随者都设为同步是不切实际的：任何一个节点的中断都会导致整个系统停止。实际上，如果数据库提供同步复制，通常意味着 **一个** 追随者是同步的，其他的是异步的。如果同步追随者变得不可用或缓慢，异步追随者之一将变为同步。这保证了你至少在两个节点上拥有最新的数据副本：领导者和一个同步追随者。这种配置有时也称为 **半同步**。

在某些系统中，**多数**（例如，包括领导者在内的 5 个副本中的 3 个）副本被同步更新，其余少数是异步的。这是 **仲裁** 的一个例子，我们将在 ["读写仲裁"](#sec_replication_quorum_condition) 中进一步讨论。多数仲裁通常用于使用共识协议进行自动领导者选举的系统中，我们将在 [第 10 章](/ch10#ch_consistency) 中回到这个话题。

有时，基于领导者的复制被配置为完全异步。在这种情况下，如果领导者失败且无法恢复，任何尚未复制到追随者的写入都会丢失。这意味着即使已向客户端确认，写入也不能保证持久。然而，完全异步配置的优点是领导者可以继续处理写入，即使所有追随者都已落后。

弱化持久性可能听起来像是一个糟糕的权衡，但异步复制仍然被广泛使用，特别是如果有许多追随者或者它们在地理上分布广泛 [^9]。我们将在 ["复制延迟的问题"](#sec_replication_lag) 中回到这个问题。

### 设置新的副本 {#sec_replication_new_replica}

不时地，你需要设置新的追随者——也许是为了增加副本的数量，或者替换失败的节点。如何确保新的追随者拥有领导者数据的准确副本？

简单地将数据文件从一个节点复制到另一个节点通常是不够的：客户端不断向数据库写入，数据总是在变化，所以标准文件复制会在不同的时间点看到数据库的不同部分。结果可能没有任何意义。

你可以通过锁定数据库（使其不可用于写入）来使磁盘上的文件保持一致，但这将违背我们的高可用性目标。幸运的是，设置追随者通常可以在不停机的情况下完成。从概念上讲，过程如下所示：

1. 在某个时间点获取领导者数据库的一致快照——如果可能，不锁定整个数据库。大多数数据库都有此功能，因为备份也需要它。在某些情况下，需要第三方工具，例如用于 MySQL 的 Percona XtraBackup。
2. 将快照复制到新的追随者。
3. 追随者连接到领导者并请求自快照拍摄以来发生的所有数据变更。这要求快照与领导者复制日志中的确切位置相关联。该位置有各种名称：例如，PostgreSQL 称之为 **日志序列号**；MySQL 有两种机制，**binlog 位点** 和 **全局事务标识符**（GTID）。
4. 当追随者处理了自快照以来的数据变更积压后，我们说它已经 **追上进度**。它现在可以继续处理领导者发生的数据变更。

设置追随者的实际步骤因数据库而异。在某些系统中，该过程是完全自动化的，而在其他系统中，它可能是需要管理员手动执行的有些神秘的多步骤工作流程。

你也可以将复制日志归档到对象存储；连同对象存储中整个数据库的定期快照，这是实现数据库备份和灾难恢复的好方法。你还可以通过从对象存储下载这些文件来执行设置新追随者的步骤 1 和 2。例如，WAL-G 为 PostgreSQL、MySQL 和 SQL Server 执行此操作，Litestream 为 SQLite 执行等效操作。

--------

<a id="sec_replication_object_storage"></a>

> [!TIP] 由对象存储支持的数据库
>
> 对象存储可用于存档数据之外的更多用途。许多数据库开始使用对象存储（如 Amazon Web Services S3、Google Cloud Storage 和 Azure Blob Storage）来为实时查询提供数据。在对象存储中存储数据库数据有许多好处：
>
> * 与其他云存储选项相比，对象存储价格便宜，这使得云数据库可以将较少查询的数据存储在更便宜、更高延迟的存储上，同时从内存、SSD 和 NVMe 中提供工作集。
> * 对象存储还提供具有非常高持久性保证的多可用区、双地区或多地区复制。这也允许数据库绕过跨可用区网络费用。
> * 数据库可以使用对象存储的 **条件写入** 功能——本质上是 **比较并设置**（CAS）操作——来实现事务和领导者选举 [^10] [^11]。
> * 将来自多个数据库的数据存储在同一对象存储中可以简化数据集成，特别是在使用 Apache Parquet 和 Apache Iceberg 等开放格式时。
>
> 这些好处通过将事务、领导者选举和复制的责任转移到对象存储，大大简化了数据库架构。
>
> 采用对象存储进行复制的系统必须应对一些权衡。值得注意的是，对象存储的读写延迟比本地磁盘或 EBS 等虚拟块设备要高得多。许多云提供商还按每次 API 调用收费，这迫使系统批量读写以降低成本。这种批处理进一步增加了延迟。此外，许多对象存储不提供标准文件系统接口。这阻止了缺乏对象存储集成的系统利用对象存储。像 **用户空间文件系统**（FUSE）这样的接口允许操作员将对象存储桶挂载为文件系统，应用程序可以在不知道其数据存储在对象存储上的情况下使用。尽管如此，许多对象存储的 FUSE 接口缺乏系统可能依赖的 POSIX 功能，如非顺序写入或符号链接。
>
> 不同的系统以各种方式处理这些权衡。一些引入了 **分层存储** 架构，将较少访问的数据放在对象存储上，而新的或频繁访问的数据保存在更快的存储设备上，如 SSD、NVMe，甚至内存中。其他系统使用对象存储作为其主要存储层，但使用单独的低延迟存储系统（如 Amazon 的 EBS 或 Neon 的 Safekeepers [^12]）来存储其 WAL。最近，一些系统更进一步，采用了 **零磁盘架构**（ZDA）。基于 ZDA 的系统将所有数据持久化到对象存储，并严格将磁盘和内存用于缓存。这允许节点没有持久状态，这大大简化了运维。WarpStream、Confluent Freight、Buf 的 Bufstream 和 Redpanda Serverless 都是使用零磁盘架构构建的兼容 Kafka 的系统。几乎每个现代云数据仓库也采用这种架构，Turbopuffer（向量搜索引擎）和 SlateDB（云原生 LSM 存储引擎）也是如此。

--------

### 处理节点故障 {#sec_replication_failover}

系统中的任何节点都可能宕机，可能是由于故障而意外宕机，但同样可能是由于计划维护（例如，重新启动机器以安装内核安全补丁）。能够在不停机的情况下重新启动单个节点对于操作和维护来说是一个很大的优势。因此，我们的目标是尽管单个节点发生故障，但保持整个系统运行，并尽可能减小节点中断的影响。

如何通过基于领导者的复制实现高可用性？

#### 追随者故障：追赶恢复 {#follower-failure-catch-up-recovery}

在其本地磁盘上，每个追随者保留从领导者接收的数据变更日志。如果追随者崩溃并重新启动，或者如果领导者和追随者之间的网络暂时中断，追随者可以很容易地恢复：从其日志中，它知道在故障发生之前处理的最后一个事务。因此，追随者可以连接到领导者并请求在追随者断开连接期间发生的所有数据变更。当它应用了这些变更后，它就赶上了领导者，可以像以前一样继续接收数据变更流。

尽管追随者恢复在概念上很简单，但在性能方面可能具有挑战性：如果数据库具有高写入吞吐量，或者如果追随者已离线很长时间，可能有很多写入需要赶上。在进行这种追赶时，恢复的追随者和领导者（需要将写入积压发送到追随者）都会有高负载。

一旦所有追随者都确认已处理了日志，领导者就可以删除其写入日志，但如果追随者长时间不可用，领导者面临选择：要么保留日志直到追随者恢复并赶上（冒着领导者磁盘空间耗尽的风险），要么删除不可用追随者尚未确认的日志（在这种情况下，追随者无法从日志中恢复，并且在它回来时必须从备份中恢复）。

#### 领导者故障：故障转移 {#leader-failure-failover}

处理领导者故障更加棘手：其中一个追随者需要被提升为新的领导者，客户端需要重新配置以将其写入发送到新的领导者，其他追随者需要开始从新的领导者消费数据变更。这个过程称为 **故障转移**。

故障转移可以手动发生（管理员收到领导者失败的通知并采取必要步骤来创建新的领导者）或自动发生。自动故障转移过程通常包括以下步骤：

1. **确定领导者已失效。** 可能会出现许多问题：崩溃、停电、网络故障等。没有万无一失的方法能准确判断发生了什么，所以大多数系统只是依赖超时：节点之间会频繁来回发送消息，如果某个节点在一段时间内（例如 30 秒）没有响应，就认为它已经失效。（如果是计划维护而主动下线领导者，则不适用。）
2. **选择新的领导者。** 这可以通过选举过程完成（由剩余副本中的多数选出领导者），也可以由预先设定的 **控制器节点** 任命 [^13]。最适合担任领导者的通常是那个拥有旧领导者最新数据变更的副本（以尽量减少数据丢失）。让所有节点就新领导者达成一致是一个共识问题，我们会在 [第 10 章](/ch10#ch_consistency) 详细讨论。
3. **将系统重新配置为使用新的领导者。** 客户端现在需要把写请求发送到新领导者（我们在 ["请求路由"](/ch7#sec_sharding_routing) 中讨论这个问题）。如果旧领导者恢复，它可能仍然以为自己是领导者，并不知道其他副本已经让它下台。系统需要确保旧领导者降级为追随者，并识别新的领导者。

故障转移充满了可能出错的事情：

* 如果使用异步复制，新的领导者可能在失败之前没有收到来自旧领导者的所有写入。如果前领导者在选择了新领导者后重新加入集群，那些写入应该怎么办？新的领导者可能同时收到了冲突的写入。最常见的解决方案是简单地丢弃旧领导者未复制的写入，这意味着你认为已提交的写入实际上并不持久。
* 如果数据库之外的其他存储系统需要与数据库内容协调，丢弃写入尤其危险。例如，在 GitHub 的一次事故中 [^14]，一个过时的 MySQL 追随者被提升为领导者。数据库使用自增计数器为新行分配主键，但由于新领导者的计数器落后于旧领导者，它重用了旧领导者先前分配的一些主键。这些主键也在 Redis 存储中使用，因此主键的重用导致 MySQL 和 Redis 之间的不一致，这导致一些私人数据被错误地披露给错误的用户。
* 在某些故障场景中（见 [第 9 章](/ch9#ch_distributed)），可能会发生两个节点都认为自己是领导者的情况。这种情况称为 **脑裂**，这是危险的：如果两个领导者都接受写入，并且没有解决冲突的过程（见 ["多主复制"](#sec_replication_multi_leader)），数据很可能会丢失或损坏。作为安全措施，一些系统在检测到两个领导者时有一种机制来关闭一个节点。然而，如果这种机制设计不当，你最终可能会关闭两个节点 [^15]。此外，当检测到脑裂并关闭旧节点时，可能为时已晚，数据已经损坏。
* 在宣布领导者死亡之前，正确的超时是什么？更长的超时意味着在领导者失败的情况下恢复时间更长。然而，如果超时太短，可能会有不必要的故障转移。例如，临时负载峰值可能导致节点的响应时间增加到超时以上，或者网络故障可能导致数据包延迟。如果系统已经在高负载或网络问题上挣扎，不必要的故障转移可能会使情况变得更糟，而不是更好。

--------

> [!NOTE]
> 通过限制或关闭旧领导者来防止脑裂，被称为 **栅栏机制**（fencing），或者更直白地说，**爆彼之头**（STONITH）。我们将在 ["分布式锁和租约"](/ch9#sec_distributed_lock_fencing) 中更详细地讨论栅栏机制。

--------

这些问题没有简单的解决方案。因此，一些运维团队更喜欢手动执行故障转移，即使软件支持自动故障转移。

故障转移最重要的是选择一个最新的追随者作为新的领导者——如果使用同步或半同步复制，这将是旧领导者在确认写入之前等待的追随者。使用异步复制，你可以选择具有最大日志序列号的追随者。这最小化了故障转移期间丢失的数据量：丢失几分之一秒的写入可能是可以容忍的，但选择落后几天的追随者可能是灾难性的。

这些问题——节点故障；不可靠的网络；以及围绕副本一致性、持久性、可用性和延迟的权衡——实际上是分布式系统中的基本问题。在 [第 9 章](/ch9#ch_distributed) 和 [第 10 章](/ch10#ch_consistency) 中，我们将更深入地讨论它们。

### 复制日志的实现 {#sec_replication_implementation}

基于领导者的复制在底层是如何工作的？让我们简要地看看实践中使用的几种不同的复制方法。

#### 基于语句的复制 {#statement-based-replication}

在最简单的情况下，领导者记录它执行的每个写入请求（**语句**）并将该语句日志发送给其追随者。对于关系数据库，这意味着每个 `INSERT`、`UPDATE` 或 `DELETE` 语句都被转发到追随者，每个追随者解析并执行该 SQL 语句，就像它是从客户端接收的一样。

虽然这听起来合理，但这种复制方法可能会出现各种问题：

* 任何调用非确定性函数的语句，例如 `NOW()` 获取当前日期和时间或 `RAND()` 获取随机数，可能会在每个副本上生成不同的值。
* 如果语句使用自增列，或者如果它们依赖于数据库中的现有数据（例如，`UPDATE … WHERE <某条件>`），它们必须在每个副本上以完全相同的顺序执行，否则它们可能会产生不同的效果。当有多个并发执行的事务时，这可能会受到限制。
* 具有副作用的语句（例如，触发器、存储过程、用户定义的函数）可能会导致每个副本上发生不同的副作用，除非副作用是绝对确定的。

可以解决这些问题——例如，领导者可以在记录语句时用固定的返回值替换任何非确定性函数调用，以便追随者都获得相同的值。以固定顺序执行确定性语句的想法类似于我们之前在 ["事件溯源与 CQRS"](/ch3#sec_datamodels_events) 中讨论的事件溯源模型。这种方法也称为 **状态机复制**，我们将在 ["使用共享日志"](/ch10#sec_consistency_smr) 中讨论其背后的理论。

基于语句的复制在 MySQL 5.1 版本之前使用。它今天有时仍在使用，因为它相当紧凑，但默认情况下，如果语句中有任何非确定性，MySQL 现在会切换到基于行的复制（稍后讨论）。VoltDB 使用基于语句的复制，并通过要求事务是确定性的来使其安全 [^16]。然而，确定性在实践中很难保证，因此许多数据库更喜欢其他复制方法。

#### 预写日志（WAL）传输 {#write-ahead-log-wal-shipping}

在 [第 4 章](/ch4#ch_storage) 中，我们看到预写日志是使 B 树存储引擎健壮所必需的：每个修改首先写入 WAL，以便在崩溃后可以将树恢复到一致状态。由于 WAL 包含将索引和堆恢复到一致状态所需的所有信息，我们可以使用完全相同的日志在另一个节点上构建副本：除了将日志写入磁盘外，领导者还通过网络将其发送给其追随者。当追随者处理此日志时，它构建了与领导者上找到的完全相同的文件副本。

PostgreSQL 和 Oracle 等数据库使用了这种复制方法 [^17] [^18]。主要缺点是日志在非常低的级别描述数据：WAL 包含哪些字节在哪些磁盘块中被更改的详细信息。这使得复制与存储引擎紧密耦合。如果数据库从一个版本更改其存储格式到另一个版本，通常不可能在领导者和追随者上运行不同版本的数据库软件。

这可能看起来像是一个小的实现细节，但它可能会产生很大的操作影响。如果复制协议允许追随者使用比领导者更新的软件版本，你可以通过首先升级追随者然后执行故障转移以使其中一个升级的节点成为新的领导者来执行数据库软件的零停机升级。如果复制协议不允许此版本不匹配（如 WAL 传输的情况），此类升级需要停机。

<a id="sec_replication_logical"></a>

#### 逻辑（基于行）日志复制 {#logical-row-based-log-replication}

另一种选择是为复制和存储引擎使用不同的日志格式，这允许复制日志与存储引擎内部解耦。这种复制日志称为 **逻辑日志**，以区别于存储引擎的（**物理**）数据表示。

关系数据库的逻辑日志通常是描述以行粒度对数据库表的写入的记录序列：

* 对于插入的行，日志包含所有列的新值。
* 对于删除的行，日志包含足够的信息来唯一标识被删除的行。通常这将是主键，但如果表上没有主键，则需要记录所有列的旧值。
* 对于更新的行，日志包含足够的信息来唯一标识更新的行，以及所有列的新值（或至少所有已更改的列的新值）。

修改多行的事务会生成多个这样的日志记录，后跟指示事务已提交的记录。MySQL 除了 WAL 之外还保留一个单独的逻辑复制日志，称为 **binlog**（当配置为使用基于行的复制时）。PostgreSQL 通过将物理 WAL 解码为行插入/更新/删除事件来实现逻辑复制 [^19]。

由于逻辑日志与存储引擎内部解耦，因此可以更容易地保持向后兼容，允许领导者和追随者运行不同版本的数据库软件。这反过来又可以以最少的停机时间升级到新版本 [^20]。

逻辑日志格式也更容易被外部应用解析。如果你想把数据库内容发送到外部系统（例如用于离线分析的数据仓库），或者构建自定义索引和缓存 [^21]，这一点会很有用。这种技术称为 **数据变更捕获**，我们将在 ["数据变更捕获"](/ch12#sec_stream_cdc) 一节再回到它。


## 复制延迟的问题 {#sec_replication_lag}

能够容忍节点故障只是想要复制的一个原因。如 ["分布式与单节点系统"](/ch1#sec_introduction_distributed) 中所述，其他原因是可伸缩性（处理比单台机器能够处理的更多请求）和延迟（将副本在地理上放置得更接近用户）。

基于领导者的复制要求所有写入都通过单个节点，但只读查询可以转到任何副本。对于主要由读取和只有少量写入组成的工作负载（这通常是在线服务的情况），有一个有吸引力的选择：创建许多追随者，并将读取请求分布在这些追随者上。这减轻了领导者的负载，并允许附近的副本提供读取请求。

在这种 **读扩展** 架构中，你可以通过添加更多追随者来简单地增加服务只读请求的容量。然而，这种方法只有在使用异步复制时才现实可行——如果你试图同步复制到所有追随者，单个节点故障或网络中断将使整个系统无法写入。而且你拥有的节点越多，其中一个节点宕机的可能性就越大，因此完全同步的配置将非常不可靠。

不幸的是，如果应用程序从 **异步** 追随者读取，如果追随者已落后，它可能会看到过时的信息。这导致数据库中出现明显的不一致：如果你同时在领导者和追随者上运行相同的查询，你可能会得到不同的结果，因为并非所有写入都已反映在追随者中。这种不一致只是一种临时状态——如果你停止向数据库写入并等待一段时间，追随者最终将赶上并与领导者保持一致。因此，这种效果被称为 **最终一致性** [^22]。

--------

> [!NOTE]
> 术语 **最终一致性** 由 Douglas Terry 等人创造 [^23]，由 Werner Vogels 推广 [^24]，并成为许多 NoSQL 项目的战斗口号。然而，不仅 NoSQL 数据库是最终一致的：异步复制的关系数据库中的追随者具有相同的特征。

--------

术语"最终"是故意模糊的：一般来说，副本可以落后多远没有限制。在正常操作中，写入发生在领导者上并反映在追随者上之间的延迟——**复制延迟**——可能只是几分之一秒，在实践中不会被注意到。然而，如果系统在接近容量运行或网络中存在问题，延迟可以轻易增加到几秒甚至几分钟。

当延迟如此之大时，它引入的不一致不仅仅是一个理论问题，而是应用程序的真正问题。在本节中，我们将重点介绍存在复制延迟时可能发生的三个问题示例。我们还将概述解决它们的一些方法。

### 读己之写 {#sec_replication_ryw}

许多应用程序让用户提交一些数据，然后查看他们提交的内容。这可能是客户数据库中的记录，或讨论线程上的评论，或其他类似的东西。提交新数据时，必须将其发送到领导者，但当用户查看数据时，可以从追随者读取。如果数据经常被查看但只是偶尔被写入，这尤其合适。

使用异步复制，存在一个问题，如 [图 6-3](#fig_replication_read_your_writes) 所示：如果用户在写入后不久查看数据，新数据可能尚未到达副本。对用户来说，看起来他们提交的数据丢失了，所以他们自然会不高兴。

{{< figure src="/fig/ddia_0603.png" id="fig_replication_read_your_writes" caption="图 6-3. 用户进行写入，然后从陈旧副本读取。为了防止这种异常，我们需要写后读一致性。" class="w-full my-4" >}}

在这种情况下，我们需要 **写后读一致性**，也称为 **读己之写一致性** [^23]。这是一种保证，如果用户重新加载页面，他们将始终看到他们自己提交的任何更新。它不对其他用户做出承诺：其他用户的更新可能直到稍后才可见。然而，它向用户保证他们自己的输入已正确保存。

我们如何在基于领导者的复制系统中实现写后读一致性？有各种可能的技术。下面举几个例子：

* 当读取用户可能已修改的内容时，从领导者或同步更新的追随者读取；否则，从异步更新的追随者读取。这要求你有某种方法知道某物是否可能已被修改，而无需实际查询它。例如，社交网络上的用户个人资料信息通常只能由个人资料的所有者编辑，而不能由其他任何人编辑。因此，一个简单的规则是：始终从领导者读取用户自己的个人资料，从追随者读取任何其他用户的个人资料。
* 如果应用程序中的大多数东西都可能被用户编辑，那种方法将不会有效，因为大多数东西都必须从领导者读取（否定了读扩展的好处）。在这种情况下，可以使用其他标准来决定是否从领导者读取。例如，你可以跟踪上次更新的时间，并在上次更新后的一分钟内，使所有读取都来自领导者 [^25]。你还可以监控追随者上的复制延迟，并防止在落后领导者超过一分钟的任何追随者上进行查询。
* 客户端可以记住其最近写入的时间戳——然后系统可以确保为该用户提供任何读取的副本至少反映该时间戳之前的更新。如果副本不够最新，则可以由另一个副本处理读取，或者查询可以等待直到副本赶上 [^26]。时间戳可以是 **逻辑时间戳**（指示写入顺序的东西，例如日志序列号）或实际系统时钟（在这种情况下，时钟同步变得至关重要；见 ["不可靠的时钟"](/ch9#sec_distributed_clocks)）。
* 如果你的副本分布在各个地区（为了地理上接近用户或为了可用性），还有额外的复杂性。任何需要由领导者提供的请求都必须路由到包含领导者的地区。

当同一用户从多个设备访问你的服务时，会出现另一个复杂情况，例如桌面网络浏览器和移动应用程序。在这种情况下，你可能希望提供 **跨设备** 写后读一致性：如果用户在一个设备上输入一些信息，然后在另一个设备上查看它，他们应该看到他们刚刚输入的信息。

在这种情况下，需要考虑一些额外的问题：

* 需要记住用户上次更新的时间戳的方法变得更加困难，因为在一个设备上运行的代码不知道在另一个设备上发生了什么更新。此元数据将需要集中化。
* 如果你的副本分布在不同的地区，则无法保证来自不同设备的连接将路由到同一地区。（例如，如果用户的台式计算机使用家庭宽带连接，而他们的移动设备使用蜂窝数据网络，则设备的网络路由可能完全不同。）如果你的方法需要从领导者读取，你可能首先需要将来自用户所有设备的请求路由到同一地区。

--------

> [!TIP] 地区和可用区
>
> 我们用 **地区**（region）来指代一个地理位置中的一组数据中心。云服务提供商通常会在同一地区部署多个数据中心，每个数据中心称为 **可用区**（availability zone，简称 AZ）。因此，一个地区由多个可用区组成；每个可用区都是独立的物理设施，具有自己的供电、制冷等基础设施。
>
> 同一地区内各可用区通常通过高速网络互联，延迟足够低，因此大多数分布式系统可以把同一地区内的多个可用区近似看作一个机房。多可用区部署可以抵御单个可用区故障，但无法抵御整个地区不可用。要应对地区级中断，系统必须跨多个地区部署，这通常会带来更高延迟、更低吞吐和更高的云网络费用。我们将在 ["多主复制拓扑"](#sec_replication_topologies) 中进一步讨论这些权衡。这里你只需记住：本书所说的“地区”，是同一地理位置内多个可用区（数据中心）的集合。

--------

### 单调读 {#sec_replication_monotonic_reads}

从异步追随者读取时可能发生的第二个异常示例是，用户可能会看到事物 **在时间上倒退**。

如果用户从不同的副本进行多次读取，就可能发生这种情况。例如，[图 6-4](#fig_replication_monotonic_reads) 显示用户 2345 进行相同的查询两次，首先到延迟很小的追随者，然后到延迟更大的追随者。（如果用户刷新网页，并且每个请求都路由到随机服务器，这种情况很可能发生。）第一个查询返回用户 1234 最近添加的评论，但第二个查询没有返回任何内容，因为滞后的追随者尚未获取该写入。实际上，第二个查询观察到的是比第一个查询更早时间点的系统状态。如果第一个查询没有返回任何内容，这不会那么糟糕，因为用户 2345 可能不知道用户 1234 最近添加了评论。然而，如果用户 2345 首先看到用户 1234 的评论出现，然后又看到它消失，这对用户 2345 来说非常令人困惑。

{{< figure src="/fig/ddia_0604.png" id="fig_replication_monotonic_reads" caption="图 6-4. 用户首先从新鲜副本读取，然后从陈旧副本读取。时间似乎倒退了。为了防止这种异常，我们需要单调读。" class="w-full my-4" >}}

**单调读** [^22] 是一种保证这类异常不会发生的会话保证。它比强一致性弱，但比最终一致性强。当你读取数据时，仍可能看到旧值；单调读只保证同一用户按顺序进行多次读取时，不会出现“时间倒退”——也就是先读到新值，后又读到更旧的值。

实现单调读的一种方法是确保每个用户始终从同一副本进行读取（不同的用户可以从不同的副本读取）。例如，可以基于用户 ID 的哈希选择副本，而不是随机选择。然而，如果该副本失败，用户的查询将需要重新路由到另一个副本。

### 一致前缀读 {#sec_replication_consistent_prefix}

我们的第三个复制延迟异常示例涉及违反因果关系。想象一下 Poons 先生和 Cake 夫人之间的以下简短对话：

Poons 先生
:   你能看到多远的未来，Cake 夫人？

Cake 夫人
:   通常大约十秒钟，Poons 先生。

这两个句子之间存在因果依赖关系：Cake 夫人听到了 Poons 先生的问题并回答了它。

现在，想象第三个人通过追随者听这个对话。Cake 夫人说的话通过延迟很小的追随者，但 Poons 先生说的话有更长的复制延迟（见 [图 6-5](#fig_replication_consistent_prefix)）。这个观察者会听到以下内容：

Cake 夫人
:   通常大约十秒钟，Poons 先生。

Poons 先生
:   你能看到多远的未来，Cake 夫人？

对观察者来说，看起来 Cake 夫人在 Poons 先生甚至提出问题之前就回答了问题。这种通灵能力令人印象深刻，但非常令人困惑 [^27]。

{{< figure src="/fig/ddia_0605.png" id="fig_replication_consistent_prefix" caption="图 6-5. 如果某些分片的复制比其他分片慢，观察者可能会在看到问题之前看到答案。" class="w-full my-4" >}}

防止这种异常需要另一种类型的保证：**一致前缀读** [^22]。这种保证说，如果一系列写入以某个顺序发生，那么任何读取这些写入的人都会看到它们以相同的顺序出现。

这是分片（分区）数据库中的一个特殊问题，我们将在 [第 7 章](/ch7#ch_sharding) 中讨论。如果数据库始终以相同的顺序应用写入，读取始终会看到一致的前缀，因此这种异常不会发生。然而，在许多分布式数据库中，不同的分片独立运行，因此没有全局的写入顺序：当用户从数据库读取时，他们可能会看到数据库的某些部分处于较旧状态，而某些部分处于较新状态。

一种解决方案是确保任何因果相关的写入都写入同一分片——但在某些应用程序中，这无法有效完成。还有一些算法明确跟踪因果依赖关系，这是我们将在 ["先发生关系与并发"](#sec_replication_happens_before) 中回到的主题。

### 复制延迟的解决方案 {#id131}

在使用最终一致系统时，值得思考：如果复制延迟上升到几分钟甚至几小时，应用程序会如何表现。如果答案是“没问题”，那很好；但如果这会造成糟糕的用户体验，就应当设计系统提供更强的保证（如写后读一致性）。把异步复制当作同步复制来假设，往往会在系统承压时暴露问题。

如前所述，应用程序可以提供比底层数据库更强的保证——例如，通过在领导者或同步更新的追随者上执行某些类型的读取。然而，在应用程序代码中处理这些问题很复杂且容易出错。

对于应用程序开发人员来说，最简单的编程模型是选择一个为副本提供强一致性保证的数据库，例如线性一致性（见 [第 10 章](/ch10#ch_consistency)）和 ACID 事务（见 [第 8 章](/ch8#ch_transactions)）。这允许你大部分忽略复制带来的挑战，并将数据库视为只有一个节点。在 2010 年代初期，**NoSQL** 运动推广了这样的观点，即这些功能限制了可伸缩性，大规模系统必须接受最终一致性。

然而，从那时起，许多数据库开始提供强一致性和事务，同时还提供分布式数据库的容错、高可用性和可伸缩性优势。如 ["关系模型与文档模型"](/ch3#sec_datamodels_history) 中所述，这种趋势被称为 **NewSQL**，以与 NoSQL 形成对比（尽管它不太关于 SQL 本身，而更多关于可伸缩事务管理的新方法）。

尽管现在可以使用可伸缩、强一致的分布式数据库，但某些应用程序选择使用提供较弱一致性保证的不同形式的复制仍然有充分的理由：它们可以在面对网络中断时提供更强的韧性，并且与事务系统相比具有较低的开销。我们将在本章的其余部分探讨这些方法。


## 多主复制 {#sec_replication_multi_leader}

到目前为止，本章中我们只考虑了使用单个领导者的复制架构。尽管这是一种常见的方法，但还有一些有趣的替代方案。

单主复制有一个主要缺点：所有写入都必须通过一个领导者。如果由于任何原因无法连接到领导者，例如你和领导者之间的网络中断，你就无法写入数据库。

单主复制模型的自然扩展是允许多个节点接受写入。复制仍然以相同的方式进行：每个处理写入的节点必须将该数据变更转发给所有其他节点。我们称之为 **多主** 配置（也称为 **主动/主动** 或 **双向** 复制）。在这种设置中，每个领导者同时充当其他领导者的追随者。

与单主复制一样，可以选择使其同步或异步。假设你有两个领导者，*A* 和 *B*，你正在尝试写入 *A*。如果写入从 *A* 同步复制到 *B*，并且两个节点之间的网络中断，你就无法写入 *A* 直到网络恢复。同步多主复制因此给你一个非常类似于单主复制的模型，即如果你让 *B* 成为领导者，*A* 只是将任何写入请求转发给 *B* 执行。

因此，我们不会进一步讨论同步多主复制，而只是将其视为等同于单主复制。本节的其余部分专注于异步多主复制，其中任何领导者都可以处理写入，即使其与其他领导者的连接中断。

### 跨地域运行 {#sec_replication_multi_dc}

在单个地区内使用多主设置很少有意义，因为好处很少超过增加的复杂性。然而，在某些情况下，这种配置是合理的。

想象你有一个数据库，在几个不同的地区有副本（也许是为了能够容忍整个地区的故障，或者是为了更接近你的用户）。这被称为 **地理分布式**、**地域分布式** 或 **地域复制** 设置。使用单主复制，领导者必须在 **一个** 地区，所有写入都必须通过该地区。

在多主配置中，你可以在 **每个** 地区都部署一个领导者。[图 6-6](#fig_replication_multi_dc) 展示了这种架构：在每个地区内使用常规单主复制（追随者可能位于与领导者不同的可用区）；在地区之间，每个地区的领导者把变更复制给其他地区的领导者。

{{< figure src="/fig/ddia_0606.png" id="fig_replication_multi_dc" caption="图 6-6. 跨多个地区的多主复制。" class="w-full my-4" >}}

让我们比较单主和多主配置在多地区部署中的表现：

性能
:   在单主配置中，每次写入都必须通过互联网到拥有领导者的地区。这可能会给写入增加显著的延迟，并可能违背首先拥有多个地区的目的。在多主配置中，每次写入都可以在本地地区处理，并异步复制到其他地区。因此，跨地区网络延迟对用户是隐藏的，这意味着感知性能可能更好。

地区故障容忍
:   在单主配置中，如果拥有领导者的地区变得不可用，故障转移可以将另一个地区的追随者提升为领导者。在多主配置中，每个地区可以独立于其他地区继续运行，并在离线地区恢复上线时赶上复制。

网络问题容忍
:   即使有专用连接，地区之间的流量也可能比同一地区内各可用区之间或单个可用区内的流量更不可靠。单主配置对这种跨地区链路中的问题非常敏感，因为当一个地区的客户端想要写入另一个地区的领导者时，它必须通过该链路发送其请求并等待响应才能完成。

    具有异步复制的多主配置可以更好地容忍网络问题：在临时网络中断期间，每个地区的领导者可以继续独立处理写入。

一致性
:   单主系统可以提供强一致性保证，例如可串行化事务，我们将在 [第 8 章](/ch8#ch_transactions) 中讨论。多主系统的最大缺点是它们能够实现的一致性要弱得多。例如，你不能保证银行账户不会变成负数或用户名是唯一的：不同的领导者总是可能处理单独没问题的写入（从账户中支付一些钱，注册特定用户名），但当与另一个领导者上的另一个写入结合时违反了约束。

    这只是分布式系统的基本限制 [^28]。如果你必须强制执行这类约束，通常应选择单主系统。不过，正如我们将在 ["处理写入冲突"](#sec_replication_write_conflicts) 中看到的，多主系统在不需要这类约束的广泛应用里，仍然可以提供有用的一致性属性。

多主复制不如单主复制常见，但许多数据库仍然支持它，包括 MySQL、Oracle、SQL Server 和 YugabyteDB。在某些情况下，它是一个外部附加功能，例如在 Redis Enterprise、EDB Postgres Distributed 和 pglogical 中 [^29]。

由于多主复制在许多数据库中是一个有点改装的功能，因此通常存在微妙的配置陷阱和与其他数据库功能的令人惊讶的交互。例如，自增键、触发器和完整性约束可能会有问题。因此，多主复制通常被认为是应该尽可能避免的危险领域 [^30]。

#### 多主复制拓扑 {#sec_replication_topologies}

**复制拓扑** 描述了写入从一个节点传播到另一个节点的通信路径。如果你有两个领导者，如 [图 6-9](#fig_replication_write_conflict) 中，只有一种合理的拓扑：领导者 1 必须将其所有写入发送到领导者 2，反之亦然。有了两个以上的领导者，各种不同的拓扑是可能的。[图 6-7](#fig_replication_topologies) 中说明了一些示例。

{{< figure src="/fig/ddia_0607.png" id="fig_replication_topologies" caption="图 6-7. 可以设置多主复制的三个示例拓扑。" class="w-full my-4" >}}

最通用的拓扑是 **全对全**，如 [图 6-7](#fig_replication_topologies)(c) 所示，其中每个领导者将其写入发送到每个其他领导者。然而，也使用更受限制的拓扑：例如 **环形拓扑**，其中每个节点从一个节点接收写入并将这些写入（加上其自己的任何写入）转发到另一个节点。另一种流行的拓扑具有 **星形** 形状：一个指定的根节点将写入转发到所有其他节点。星形拓扑可以推广到树形。

--------

> [!NOTE]
> 不要将星形网络拓扑与 **星型模式** 混淆（见 ["星型与雪花型：分析模式"](/ch3#sec_datamodels_analytics)），后者描述了数据模型的结构。

--------

在环形和星形拓扑中，写入可能需要通过几个节点才能到达所有副本。因此，节点需要转发它们从其他节点接收的数据变更。为了防止无限复制循环，每个节点都被赋予一个唯一标识符，并且在复制日志中，每个写入都用它经过的所有节点的标识符标记 [^31]。当节点接收到用其自己的标识符标记的数据变更时，该数据变更将被忽略，因为节点知道它已经被处理过了。

#### 不同拓扑的问题 {#problems-with-different-topologies}

环形和星形拓扑的一个问题是，如果只有一个节点发生故障，它可能会中断其他节点之间的复制消息流，使它们无法通信，直到节点被修复。可以重新配置拓扑以绕过故障节点，但在大多数部署中，这种重新配置必须手动完成。更密集连接的拓扑（如全对全）的容错性更好，因为它允许消息沿着不同的路径传播，避免单点故障。

另一方面，全对全拓扑也可能有问题。特别是，一些网络链路可能比其他链路更快（例如，由于网络拥塞），结果是一些复制消息可能会"超越"其他消息，如 [图 6-8](#fig_replication_causality) 所示。

{{< figure src="/fig/ddia_0608.png" id="fig_replication_causality" caption="图 6-8. 使用多主复制，写入可能以错误的顺序到达某些副本。" class="w-full my-4" >}}

在 [图 6-8](#fig_replication_causality) 中，客户端 A 在领导者 1 上向表中插入一行，客户端 B 在领导者 3 上更新该行。然而，领导者 2 可能以不同的顺序接收写入：它可能首先接收更新（从其角度来看，这是对数据库中不存在的行的更新），然后才接收相应的插入（应该在更新之前）。

这是一个因果关系问题，类似于我们在 ["一致前缀读"](#sec_replication_consistent_prefix) 中看到的问题：更新依赖于先前的插入，因此我们需要确保所有节点首先处理插入，然后处理更新。简单地为每个写入附加时间戳是不够的，因为时钟不能被信任足够同步以在领导者 2 上正确排序这些事件（见 [第 9 章](/ch9#ch_distributed)）。

为了正确排序这些事件，可以使用一种称为 **版本向量** 的技术，我们将在本章后面讨论（见 ["检测并发写入"](#sec_replication_concurrent)）。然而，许多多主复制系统不使用良好的技术来排序更新，使它们容易受到像 [图 6-8](#fig_replication_causality) 中的问题的影响。如果你使用多主复制，值得了解这些问题，仔细阅读文档，并彻底测试你的数据库，以确保它真正提供你认为它具有的保证。

### 同步引擎与本地优先软件 {#sec_replication_offline_clients}

另一种适合多主复制的情况是，如果你有一个需要在与互联网断开连接时继续工作的应用程序。

例如，考虑你的手机、笔记本电脑和其他设备上的日历应用程序。你需要能够随时查看你的会议（进行读取请求）并输入新会议（进行写入请求），无论你的设备当前是否有互联网连接。如果你在离线时进行任何更改，它们需要在设备下次上线时与服务器和你的其他设备同步。

在这种情况下，每个设备都拥有一个充当领导者的本地数据库副本（可接受写入），并在你所有设备上的日历副本之间运行异步多主复制流程（即同步过程）。复制延迟可能是几小时甚至几天，具体取决于你何时能连上互联网。

从架构的角度来看，这种设置与地区之间的多主复制非常相似，达到了极端：每个设备是一个"地区"，它们之间的网络连接极其不可靠。

#### 实时协作、离线优先和本地优先应用 {#real-time-collaboration-offline-first-and-local-first-apps}

此外，许多现代 Web 应用程序提供 **实时协作** 功能，例如用于文本文档和电子表格的 Google Docs 和 Sheets，用于图形的 Figma，以及用于项目管理的 Linear。这些应用程序之所以响应如此迅速，是因为用户输入立即反映在用户界面中，无需等待到服务器的网络往返，并且一个用户的编辑以低延迟显示给他们的协作者 [^32] [^33] [^34]。

这再次导致多主架构：每个打开共享文件的 Web 浏览器选项卡都是一个副本，你对文件进行的任何更新都会异步复制到打开同一文件的其他用户的设备。即使应用程序不允许你在离线时继续编辑文件，多个用户可以进行编辑而无需等待服务器的响应这一事实已经使其成为多主。

离线编辑和实时协作都需要类似的复制基础设施：应用程序需要捕获用户对文件所做的任何更改，并立即将它们发送给协作者（如果在线），或本地存储它们以供稍后发送（如果离线）。此外，应用程序需要接收来自协作者的更改，将它们合并到用户的文件本地副本中，并更新用户界面以反映最新版本。如果多个用户同时更改了文件，可能需要冲突解决逻辑来合并这些更改。

支持此过程的软件库称为 **同步引擎**。尽管这个想法已经存在很长时间了，但这个术语最近才受到关注 [^35] [^36] [^37]。允许用户在离线时继续编辑文件的应用程序（可能使用同步引擎实现）称为 **离线优先** [^38]。术语 **本地优先软件** 指的是这样的协作应用程序：它们不仅是离线优先的，而且被设计为即使制作软件的开发人员关闭了所有在线服务也能继续工作 [^39]。这可以通过使用具有开放标准同步协议的同步引擎来实现，该协议有多个服务提供商可用 [^40]。例如，Git 是一个本地优先的协作系统（尽管不支持实时协作），因为你可以通过 GitHub、GitLab 或任何其他存储库托管服务进行同步。

#### 同步引擎的利弊 {#pros-and-cons-of-sync-engines}

今天构建 Web 应用程序的主导方式是在客户端保留很少的持久状态，并在需要显示新数据或需要更新某些数据时依赖向服务器发出请求。相比之下，当使用同步引擎时，你在客户端有持久状态，与服务器的通信被移到后台进程中。同步引擎方法有许多优点：

* 在本地拥有数据意味着用户界面的响应速度可以比必须等待服务调用获取某些数据时快得多。一些应用程序的目标是在图形系统的 **下一帧** 响应用户输入，这意味着在 60 Hz 刷新率的显示器上在 16 毫秒内渲染。
* 允许用户在离线时继续工作是有价值的，特别是在具有间歇性连接的移动设备上。使用同步引擎，应用程序不需要单独的离线模式：离线与具有非常大的网络延迟相同。
* 与在应用程序代码中执行显式服务调用相比，同步引擎简化了前端应用程序的编程模型。每个服务调用都需要错误处理，如 ["远程过程调用（RPC）的问题"](/ch5#sec_problems_with_rpc) 中所讨论的：例如，如果更新服务器上的数据的请求失败，用户界面需要以某种方式反映该错误。同步引擎允许应用程序对本地数据执行读写，这几乎从不失败，导致更具声明性的编程风格 [^41]。
* 为了实时显示其他用户的编辑，你需要接收这些编辑的通知并相应地有效更新用户界面。同步引擎与 **响应式编程** 模型相结合是实现此目的的好方法 [^42]。

当用户可能需要的所有数据都提前下载并持久存储在客户端时，同步引擎效果最佳。这意味着数据可用于离线访问，但这也意味着如果用户可以访问非常大量的数据，同步引擎就不适合。例如，下载用户自己创建的所有文件可能很好（一个用户通常不会生成那么多数据），但下载电子商务网站的整个目录可能没有意义。

同步引擎由 Lotus Notes 在 20 世纪 80 年代开创 [^43]（没有使用该术语），特定应用程序（如日历）的同步也已经存在很长时间了。今天有许多通用同步引擎，其中一些使用专有后端服务（例如，Google Firestore、Realm 或 Ditto），有些具有开源后端，使它们适合创建本地优先软件（例如，PouchDB/CouchDB、Automerge 或 Yjs）。

多人视频游戏有类似的需求，需要立即响应用户的本地操作，并将它们与通过网络异步接收的其他玩家的操作协调。在游戏开发术语中，同步引擎的等效物称为 **网络代码**。网络代码中使用的技术非常特定于游戏的要求 [^44]，并且不能直接应用于其他类型的软件，因此我们不会在本书中进一步考虑它们。


### 处理写入冲突 {#sec_replication_write_conflicts}

多主复制的最大问题——无论是在地域分布式服务器端数据库中还是在终端用户设备上的本地优先同步引擎中——是不同领导者上的并发写入可能导致需要解决的冲突。

例如，考虑一个维基页面同时被两个用户编辑，如 [图 6-9](#fig_replication_write_conflict) 所示。用户 1 将页面标题从 A 更改为 B，用户 2 独立地将标题从 A 更改为 C。每个用户的更改成功应用于其本地领导者。然而，当更改异步复制时，检测到冲突。这个问题在单主数据库中不会发生。

{{< figure src="/fig/ddia_0609.png" id="fig_replication_write_conflict" caption="图 6-9. 两个领导者并发更新同一记录导致的写入冲突。" class="w-full my-4" >}}

> [!NOTE]
> 我们说 [图 6-9](#fig_replication_write_conflict) 中的两个写入是 **并发的**，因为在最初进行写入时，两者都不“知道”对方。写入是否真的在同一时刻发生并不重要；实际上，如果写入发生在离线状态，它们在物理时间上可能相隔很久。关键在于：一个写入是否发生在另一个写入已经生效的状态之上。

在 ["检测并发写入"](#sec_replication_concurrent) 中，我们将解决数据库如何确定两个写入是否并发的问题。现在我们假设我们可以检测冲突，并且我们想找出解决它们的最佳方法。

#### 冲突避免 {#conflict-avoidance}

处理冲突的一种策略是从一开始就避免它们发生。例如，如果应用程序可以确保特定记录的所有写入都通过同一领导者，那么即使整个数据库是多主的，也不会发生冲突。这种方法在同步引擎客户端离线更新的情况下是不可能的，但在地域复制的服务器系统中有时是可能的 [^30]。

例如，在一个用户只能编辑自己数据的应用程序中，你可以确保来自特定用户的请求始终路由到同一地区，并使用该地区的领导者进行读写。不同的用户可能有不同的"主"地区（可能基于与用户的地理接近程度选择），但从任何一个用户的角度来看，配置本质上是单主的。

然而，有时你可能想要更改记录的指定领导者——也许是因为一个地区不可用，你需要将流量重新路由到另一个地区，或者也许是因为用户已经移动到不同的位置，现在更接近不同的地区。现在存在风险，即用户在指定领导者更改正在进行时执行写入，导致必须使用下面的方法之一解决的冲突。因此，如果你允许更改领导者，冲突避免就会失效。

冲突避免的另一个例子：想象你想要插入新记录并基于自增计数器为它们生成唯一 ID。如果你有两个领导者，你可以设置它们，使得一个领导者只生成奇数，另一个只生成偶数。这样你可以确保两个领导者不会同时为不同的记录分配相同的 ID。我们将在 ["ID 生成器和逻辑时钟"](/ch10#sec_consistency_logical) 中讨论其他 ID 分配方案。


#### 最后写入胜利（丢弃并发写入） {#sec_replication_lww}

如果无法避免冲突，解决它们的最简单方法是为每个写入附加时间戳，并始终使用具有最大时间戳的值。例如，在 [图 6-9](#fig_replication_write_conflict) 中，假设用户 1 的写入时间戳大于用户 2 的写入时间戳。在这种情况下，两个领导者都将确定页面的新标题应该是 B，并丢弃将其设置为 C 的写入。如果写入巧合地具有相同的时间戳，可以通过比较值来选择获胜者（例如，在字符串的情况下，取字母表中较早的那个）。

这种方法称为 **最后写入胜利**（LWW），因为具有最大时间戳的写入可以被认为是"最后"的。然而，这个术语是误导性的，因为当两个写入像 [图 6-9](#fig_replication_write_conflict) 中那样并发时，哪个更旧，哪个更新是未定义的，因此并发写入的时间戳顺序本质上是随机的。

因此，LWW 的真正含义是：当同一记录在不同的领导者上并发写入时，其中一个写入被随机选择为获胜者，其他写入被静默丢弃，即使它们在各自的领导者上成功处理。这实现了最终所有副本都处于一致状态的目标，但代价是数据丢失。

如果你可以避免冲突——例如，通过只插入具有唯一键（如 UUID）的记录，而从不更新它们——那么 LWW 没有问题。但是，如果你更新现有记录，或者如果不同的领导者可能插入具有相同键的记录，那么你必须决定丢失的更新对你的应用程序是否是个问题。如果丢失的更新是不可接受的，你需要使用下面描述的冲突解决方法之一。

LWW 的另一个问题是，如果使用实时时钟（例如 Unix 时间戳）作为写入的时间戳，系统对时钟同步变得非常敏感。如果一个节点的时钟领先于其他节点，并且你尝试覆盖该节点写入的值，你的写入可能会被忽略，因为它可能具有较低的时间戳，即使它明显发生得更晚。这个问题可以通过使用 **逻辑时钟** 来解决，我们将在 ["ID 生成器和逻辑时钟"](/ch10#sec_consistency_logical) 中讨论。

#### 手动冲突解决 {#manual-conflict-resolution}

如果随机丢弃你的一些写入是不可取的，下一个选择是手动解决冲突。你可能熟悉 Git 和其他版本控制系统中的手动冲突解决：如果两个不同分支上的提交编辑同一文件的相同行，并且你尝试合并这些分支，你将得到一个需要在合并完成之前解决的合并冲突。

在数据库里，让冲突阻塞整个复制流程、直到人工处理，通常并不现实。更常见的是，数据库会保留某条记录的所有并发写入值——例如 [图 6-9](#fig_replication_write_conflict) 中的 B 和 C。这些值有时称为 **兄弟**。下次查询该记录时，数据库会返回 **所有** 这些值，而不只是最新值。随后你可以按需要解决这些值：要么在应用代码里自动处理（例如把 B 和 C 合并成 "B/C"），要么让用户参与处理；最后再把新值写回数据库以消解冲突。

这种冲突解决方法在某些系统中使用，例如 CouchDB。然而，它也存在许多问题：

* 数据库的 API 发生变化：例如，以前维基页面的标题只是一个字符串，现在它变成了一组字符串，通常包含一个元素，但如果有冲突，有时可能包含多个元素。这可能使应用程序代码中的数据难以处理。
* 要求用户手动合并兄弟，会带来很大负担：开发者需要构建冲突解决界面，用户也可能不明白自己为何要做这件事。在很多场景下，自动合并比打扰用户更合适。
* 如果不够谨慎，自动合并兄弟也可能产生反直觉行为。例如，亚马逊购物车曾允许并发更新，并用“并集”策略合并（保留出现在任一兄弟中的所有商品）。这意味着：若用户在一个兄弟里删除了某商品，但另一个兄弟仍保留它，该商品会“复活”回购物车 [^45]。[图 6-10](#fig_replication_amazon_anomaly) 就是一个例子：设备 1 删除 Book，设备 2 并发删除 DVD，冲突合并后两个商品都回来了。
* 如果多个节点观察到冲突并并发解决它，冲突解决过程本身可能会引入新的冲突。这些解决结果甚至可能彼此不一致：例如，如果你没有小心地以一致的顺序排列它们，一个节点可能将 B 和 C 合并为"B/C"，另一个可能将它们合并为"C/B"。当"B/C"和"C/B"之间的冲突被合并时，可能会得到"B/C/C/B"或类似令人惊讶的结果。

{{< figure src="/fig/ddia_0610.png" id="fig_replication_amazon_anomaly" caption="图 6-10. 亚马逊购物车异常的示例：如果购物车上的冲突通过取并集合并，删除的项目可能会重新出现。" class="w-full my-4" >}}


#### 自动冲突解决 {#automatic-conflict-resolution}

对于许多应用程序，处理冲突的最佳方法是使用自动将并发写入合并为一致状态的算法。自动冲突解决确保所有副本 **收敛** 到相同的状态——即，处理了相同写入集的所有副本都具有相同的状态，无论写入到达的顺序如何。

LWW 是冲突解决算法的一个简单示例。已经为不同类型的数据开发了更复杂的合并算法，目标是尽可能保留所有更新的预期效果，从而避免数据丢失：

* 如果数据是文本（例如维基页面标题或正文），我们可以检测每次版本演进中的字符插入和删除。合并结果会保留任一兄弟中的所有插入和删除。如果多个用户并发在同一位置插入文本，还可以用确定性顺序来排序，以确保所有节点得到同样的合并结果。
* 如果数据是项目集合（像待办事项列表那样有序，或像购物车那样无序），我们可以像处理文本那样，通过跟踪插入和删除来合并它。为了避免 [图 6-10](#fig_replication_amazon_anomaly) 中的购物车问题，算法跟踪 Book 和 DVD 被删除的事实，因此合并的结果是 Cart = {Soap}。
* 如果数据是可增可减的整数计数器（例如社交媒体帖子的点赞数），合并算法可以统计每个兄弟上的递增和递减次数，并正确求和，既不重复计数，也不丢更新。
* 如果数据是键值映射，我们可以通过将其他冲突解决算法之一应用于该键下的值来合并对同一键的更新。对不同键的更新可以相互独立处理。

冲突解决的可能性是有限的。例如，如果你想强制一个列表不包含超过五个项目，并且多个用户并发地向列表添加项目，使得总共有五个以上，你唯一的选择是丢弃一些项目。尽管如此，自动冲突解决足以构建许多有用的应用程序。如果你从想要构建协作离线优先或本地优先应用程序的要求开始，那么冲突解决是不可避免的，自动化它通常是最好的方法。

### CRDT 与操作变换 {#sec_replication_crdts}

两个算法族通常用于实现自动冲突解决：**无冲突复制数据类型**（CRDT）[^46] 和 **操作变换**（OT）[^47]。它们具有不同的设计理念和性能特征，但都能够为前面提到的所有类型的数据执行自动合并。

[图 6-11](#fig_replication_ot_crdt) 显示了 OT 和 CRDT 如何合并对文本的并发更新的示例。假设你有两个副本，都从文本"ice"开始。一个副本在前面添加字母"n"以制作"nice"，而另一个副本并发地附加感叹号以制作"ice!"。

{{< figure src="/fig/ddia_0611.png" id="fig_replication_ot_crdt" caption="图 6-11. OT 和 CRDT 如何分别合并对字符串的两个并发插入。" class="w-full my-4" >}}

合并的结果"nice!"由两种类型的算法以不同的方式实现：

OT
:   我们记录插入或删除字符的索引："n"插入在索引 0，"!"插入在索引 3。接下来，副本交换它们的操作。在 0 处插入"n"可以按原样应用，但如果在 3 处插入"!"应用于状态"nice"，我们将得到"nic!e"，这是不正确的。因此，我们需要转换每个操作的索引以考虑已经应用的并发操作；在这种情况下，"!"的插入被转换为索引 4 以考虑在较早索引处插入"n"。

CRDT
:   大多数 CRDT 为每个字符提供唯一的、不可变的 ID，并使用这些 ID 来确定插入/删除的位置，而不是索引。例如，在 [图 6-11](#fig_replication_ot_crdt) 中，我们将 ID 1A 分配给"i"，ID 2A 分配给"c"等。插入感叹号时，我们生成一个包含新字符的 ID（4B）和我们想要在其后插入的现有字符的 ID（3A）的操作。要在字符串的开头插入，我们将"nil"作为前面的字符 ID。在同一位置的并发插入按字符的 ID 排序。这确保副本收敛而不执行任何转换。

有许多基于这些想法变体的算法。列表/数组可以类似地支持，使用列表元素而不是字符，其他数据类型（如键值映射）可以很容易地添加。OT 和 CRDT 之间存在一些性能和功能权衡，但可以在一个算法中结合 CRDT 和 OT 的优点 [^48]。

OT 最常用于文本的实时协作编辑，例如在 Google Docs 中 [^32]，而 CRDT 可以在分布式数据库中找到，例如 Redis Enterprise、Riak 和 Azure Cosmos DB [^49]。JSON 数据的同步引擎可以使用 CRDT（例如，Automerge 或 Yjs）和 OT（例如，ShareDB）实现。

#### 什么是冲突？ {#what-is-a-conflict}

某些类型的冲突是显而易见的。在 [图 6-9](#fig_replication_write_conflict) 的示例中，两个写入并发修改了同一记录中的同一字段，将其设置为两个不同的值。毫无疑问，这是一个冲突。

其他类型的冲突可能更难以检测。例如，考虑一个会议室预订系统：它跟踪哪个房间由哪组人在什么时间预订。此应用程序需要确保每个房间在任何时间只由一组人预订（即，同一房间不得有任何重叠的预订）。在这种情况下，如果为同一房间同时创建两个不同的预订，可能会出现冲突。即使应用程序在允许用户进行预订之前检查可用性，如果两个预订是在两个不同的领导者上进行的，也可能会发生冲突。

没有现成的快速答案，不过在后续章节中，我们会逐步建立对这个问题的理解。我们将在 [第 8 章](/ch8#ch_transactions) 看到更多冲突案例，并在 ["通过事件顺序捕获因果关系"](/ch13#sec_future_capture_causality) 中讨论在复制系统里可伸缩地检测和解决冲突的方法。


## 无主复制 {#sec_replication_leaderless}

到目前为止，我们在本章中讨论的复制方法——单主和多主复制——都基于这样的想法：客户端向一个节点（领导者）发送写入请求，数据库系统负责将该写入复制到其他副本。领导者确定写入应该处理的顺序，追随者以相同的顺序应用领导者的写入。

一些数据存储系统采用不同的方法，放弃领导者的概念，并允许任何副本直接接受来自客户端的写入。一些最早的复制数据系统是无主的 [^1] [^50]，但在关系数据库主导的时代，这个想法基本上被遗忘了。在亚马逊于 2007 年将其用于其内部 **Dynamo** 系统后，它再次成为数据库的时尚架构 [^45]。Riak、Cassandra 和 ScyllaDB 是受 Dynamo 启发的具有无主复制模型的开源数据存储，因此这种数据库也被称为 **Dynamo 风格**。

--------

> [!NOTE]
> 原始的 **Dynamo** 系统仅在论文中描述 [^45]，但从未在亚马逊之外发布。AWS 的名称相似的 **DynamoDB** 是一个更新的云数据库，但它具有完全不同的架构：它使用基于 Multi-Paxos 共识算法的单主复制 [^5]。

--------

在某些无主实现中，客户端直接将其写入发送到多个副本，而在其他实现中，协调器节点代表客户端执行此操作。然而，与领导者数据库不同，该协调器不强制执行特定的写入顺序。正如我们将看到的，这种设计差异对数据库的使用方式产生了深远的影响。

### 当节点故障时写入数据库 {#id287}

想象你有一个具有三个副本的数据库，其中一个副本当前不可用——也许它正在重新启动以安装系统更新。在单主配置中，如果你想继续处理写入，你可能需要执行故障转移（见 ["处理节点故障"](#sec_replication_failover)）。

另一方面，在无主配置中，故障转移不存在。[图 6-12](#fig_replication_quorum_node_outage) 显示了发生的情况：客户端（用户 1234）将写入并行发送到所有三个副本，两个可用副本接受写入，但不可用副本错过了它。假设三个副本中有两个确认写入就足够了：在用户 1234 收到两个 **ok** 响应后，我们认为写入成功。客户端只是忽略了其中一个副本错过写入的事实。

{{< figure src="/fig/ddia_0612.png" id="fig_replication_quorum_node_outage" caption="图 6-12. 节点中断后的仲裁写入、仲裁读取和读修复。" class="w-full my-4" >}}


现在想象不可用节点恢复上线，客户端开始从它读取。在节点宕机期间发生的任何写入都从该节点丢失。因此，如果你从该节点读取，你可能会得到 **陈旧**（过时）值作为响应。

为了解决这个问题，当客户端从数据库读取时，它不只是将其请求发送到一个副本：**读取请求也并行发送到多个节点**。客户端可能会从不同的节点获得不同的响应；例如，从一个节点获得最新值，从另一个节点获得陈旧值。

为了区分哪些响应是最新的，哪些是过时的，写入的每个值都需要用版本号或时间戳标记，类似于我们在 ["最后写入胜利（丢弃并发写入）"](#sec_replication_lww) 中看到的。当客户端收到对读取的多个值响应时，它使用具有最大时间戳的值（即使该值仅由一个副本返回，而其他几个副本返回较旧的值）。有关更多详细信息，请参见 ["检测并发写入"](#sec_replication_concurrent)。

#### 追赶错过的写入 {#sec_replication_read_repair}

复制系统应确保最终所有数据都复制到每个副本。在不可用节点恢复上线后，它如何赶上它错过的写入？在 Dynamo 风格的数据存储中使用了几种机制：

读修复
:   当客户端并行从多个节点读取时，它可以检测任何陈旧响应。例如，在 [图 6-12](#fig_replication_quorum_node_outage) 中，用户 2345 从副本 3 获得版本 6 的值，从副本 1 和 2 获得版本 7 的值。客户端发现副本 3 陈旧后，会把较新的值写回该副本。这种方法适用于经常被读取的值。

提示移交
:   如果一个副本不可用，另一个副本可能会以 **提示** 的形式代表其存储写入。当应该接收这些写入的副本恢复时，存储提示的副本将它们发送到恢复的副本，然后删除提示。这个 **移交** 过程有助于使副本保持最新，即使对于从未读取的值也是如此，因此不由读修复处理。

反熵
:   此外，还有一个后台进程定期查找副本之间数据的差异，并将任何缺失的数据从一个副本复制到另一个。与基于领导者的复制中的复制日志不同，这个 **反熵进程** 不以任何特定顺序复制写入，并且在复制数据之前可能会有显著的延迟。

#### 读写仲裁 {#sec_replication_quorum_condition}

在 [图 6-12](#fig_replication_quorum_node_outage) 的例子中，即使写入仅在三个副本中的两个上处理，我们也认为写入成功。如果三个副本中只有一个接受了写入呢？我们能推多远？

如果我们知道每次成功的写入都保证至少存在于三个副本中的两个上，这意味着最多一个副本可能是陈旧的。因此，如果我们从至少两个副本读取，我们可以确信两个中至少有一个是最新的。如果第三个副本宕机或响应缓慢，读取仍然可以继续返回最新值。

更一般地说，如果有 *n* 个副本，每次写入必须由 *w* 个节点确认才能被认为成功，并且我们必须为每次读取查询至少 *r* 个节点。（在我们的例子中，*n* = 3，*w* = 2，*r* = 2。）只要 *w* + *r* > *n*，我们在读取时期望获得最新值，因为我们读取的 *r* 个节点中至少有一个必须是最新的。遵守这些 *r* 和 *w* 值的读取和写入称为 **仲裁** 读取和写入 [^50]。你可以将 *r* 和 *w* 视为读取或写入有效所需的最小投票数。

在 Dynamo 风格的数据库中，参数 *n*、*w* 和 *r* 通常是可配置的。常见的选择是使 *n* 为奇数（通常为 3 或 5），并设置 *w* = *r* = (*n* + 1) / 2（向上舍入）。然而，你可以根据需要更改数字。例如，写入很少而读取很多的工作负载可能受益于设置 *w* = *n* 和 *r* = 1。这使读取更快，但缺点是仅一个失败的节点就会导致所有数据库写入失败。

--------

> [!NOTE]
> 集群中可能有超过 *n* 个节点，但任何给定值仅存储在 *n* 个节点上。这允许数据集进行分片，支持比单个节点能容纳的更大的数据集。我们将在 [第 7 章](/ch7#ch_sharding) 中回到分片。

--------

仲裁条件 *w* + *r* > *n* 允许系统容忍不可用节点，如下所示：

* 如果 *w* < *n*，那么即使有节点不可用，我们仍然可以处理写入。
* 如果 *r* < *n*，那么即使有节点不可用，我们仍然可以处理读取。
* 使用 *n* = 3，*w* = 2，*r* = 2，我们可以容忍一个不可用节点，如 [图 6-12](#fig_replication_quorum_node_outage) 中所示。
* 使用 *n* = 5，*w* = 3，*r* = 3，我们可以容忍两个不可用节点。这种情况在 [图 6-13](#fig_replication_quorum_overlap) 中说明。

通常，读取和写入总是并行发送到所有 *n* 个副本。参数 *w* 和 *r* 确定我们等待多少个节点——即，在我们认为读取或写入成功之前，*n* 个节点中有多少个需要报告成功。

{{< figure src="/fig/ddia_0613.png" id="fig_replication_quorum_overlap" caption="图 6-13. 如果 *w* + *r* > *n*，你读取的 *r* 个副本中至少有一个必须看到最近的成功写入。" class="w-full my-4" >}}


如果少于所需的 *w* 或 *r* 个节点可用，写入或读取将返回错误。节点可能因许多原因不可用：因为节点宕机（崩溃、断电）、由于执行操作时出错（无法写入因为磁盘已满）、由于客户端和节点之间的网络中断，或任何其他原因。我们只关心节点是否返回了成功响应，不需要区分不同类型的故障。

### 仲裁一致性的局限 {#sec_replication_quorum_limitations}

如果你有 *n* 个副本，并且你选择 *w* 和 *r* 使得 *w* + *r* > *n*，你通常可以期望每次读取都返回为键写入的最新值。这是因为你写入的节点集和你读取的节点集必须重叠。也就是说，在你读取的节点中，必须至少有一个具有最新值的节点（如 [图 6-13](#fig_replication_quorum_overlap) 所示）。

通常，*r* 和 *w* 被选择为多数（超过 *n*/2）节点，因为这确保了 *w* + *r* > *n*，同时仍然容忍最多 *n*/2（向下舍入）个节点故障。但仲裁不一定是多数——重要的是读取和写入操作使用的节点集至少在一个节点中重叠。其他仲裁分配是可能的，这允许分布式算法设计中的一些灵活性 [^51]。

你也可以将 *w* 和 *r* 设置为较小的数字，使得 *w* + *r* ≤ *n*（即，不满足仲裁条件）。在这种情况下，读取和写入仍将发送到 *n* 个节点，但需要较少的成功响应数才能使操作成功。

使用较小的 *w* 和 *r*，你更有可能读取陈旧值，因为你的读取更可能没有包含具有最新值的节点。从好的方面来说，这种配置允许更低的延迟和更高的可用性：如果存在网络中断并且许多副本变得无法访问，你继续处理读取和写入的机会更高。只有在可访问副本的数量低于 *w* 或 *r* 之后，数据库才分别变得无法写入或读取。

然而，即使使用 *w* + *r* > *n*，在某些边缘情况下，一致性属性可能会令人困惑。一些场景包括：

* 如果携带新值的节点失败，并且其数据从携带旧值的副本恢复，存储新值的副本数量可能低于 *w*，破坏仲裁条件。
* 在重新平衡正在进行时，其中一些数据从一个节点移动到另一个节点（见 [第 7 章](/ch7#ch_sharding)），节点可能对哪些节点应该持有特定值的 *n* 个副本有不一致的视图。这可能导致读取和写入仲裁不再重叠。
* 如果读取与写入操作并发，读取可能会或可能不会看到并发写入的值。特别是，一次读取可能看到新值，而后续读取看到旧值，正如我们将在 ["线性一致性与仲裁"](/ch10#sec_consistency_quorum_linearizable) 中看到的。
* 如果写入在某些副本上成功但在其他副本上失败（例如，因为某些节点上的磁盘已满），并且总体上在少于 *w* 个副本上成功，它不会在成功的副本上回滚。这意味着如果写入被报告为失败，后续读取可能会或可能不会返回该写入的值 [^52]。
* 如果数据库使用实时时钟的时间戳来确定哪个写入更新（如 Cassandra 和 ScyllaDB 所做的），如果另一个具有更快时钟的节点已写入同一键，写入可能会被静默丢弃——我们之前在 ["最后写入胜利（丢弃并发写入）"](#sec_replication_lww) 中看到的问题。我们将在 ["依赖同步时钟"](/ch9#sec_distributed_clocks_relying) 中更详细地讨论这一点。
* 如果两个写入并发发生，其中一个可能首先在一个副本上处理，另一个可能首先在另一个副本上处理。这导致冲突，类似于我们在多主复制中看到的（见 ["处理写入冲突"](#sec_replication_write_conflicts)）。我们将在 ["检测并发写入"](#sec_replication_concurrent) 中回到这个主题。

因此，尽管仲裁似乎保证读取返回最新写入的值，但实际上并不那么简单。Dynamo 风格的数据库通常针对可以容忍最终一致性的用例进行了优化。参数 *w* 和 *r* 允许你调整读取陈旧值的概率 [^53]，但明智的做法是不要将它们视为绝对保证。

#### 监控陈旧性 {#monitoring-staleness}

从操作角度来看，监控你的数据库是否返回最新结果很重要。即使你的应用程序可以容忍陈旧读取，你也需要了解复制的健康状况。如果它明显落后，它应该提醒你，以便你可以调查原因（例如，网络中的问题或过载的节点）。

对于基于领导者的复制，数据库通常公开复制延迟的指标，你可以将其输入到监控系统。这是可能的，因为写入以相同的顺序应用于领导者和追随者，每个节点在复制日志中都有一个位置（它在本地应用的写入数）。通过从领导者的当前位置减去追随者的当前位置，你可以测量复制延迟的量。

然而，在具有无主复制的系统中，没有固定的写入应用顺序，这使得监控更加困难。副本为移交而存储的提示数量可以作为系统健康状况的一个度量，但很难对其做出有用的解读 [^54]。最终一致性是一个故意模糊的保证，但为了可操作性，能够量化"最终"很重要。


### 单主与无主复制的性能 {#sec_replication_leaderless_perf}

基于单个领导者的复制系统可以提供在无主系统中难以或不可能实现的强一致性保证。然而，正如我们在 ["复制延迟的问题"](#sec_replication_lag) 中看到的，如果你在异步更新的追随者上进行读取，基于领导者的复制系统中的读取也可能返回陈旧值。

从领导者读取确保最新响应，但它存在性能问题：

* 读取吞吐量受领导者处理请求能力的限制（与读扩展相反，读扩展将读取分布在可能返回陈旧值的异步更新副本上）。
* 如果领导者失败，你必须等待检测到故障，并在继续处理请求之前完成故障转移。即使故障转移过程非常快，用户也会因为临时增加的响应时间而注意到它；如果故障转移需要很长时间，系统在其持续时间内不可用。
* 系统对领导者上的性能问题非常敏感：如果领导者响应缓慢，例如由于过载或某些资源争用，增加的响应时间也会立即影响用户。

无主架构的一大优势是它对此类问题更有弹性。因为没有故障转移，而且请求本来就是并行发往多个副本，所以某个副本变慢或不可用对响应时间影响较小：客户端只需采用更快副本的响应即可。利用最快响应的做法称为 **请求对冲**，它可以显著降低尾部延迟 [^55]。

从根本上说，无主系统的弹性来自于它不区分正常情况和故障情况的事实。这在处理所谓的 **灰色故障** 时特别有用，其中节点没有完全宕机，但以降级状态运行，处理请求异常缓慢 [^56]，或者当节点只是过载时（例如，如果节点已离线一段时间，通过提示移交恢复可能会导致大量额外负载）。基于领导者的系统必须判断情况是否已经糟糕到值得进行故障转移（故障转移本身可能会导致进一步的中断），而在无主系统中，这个问题甚至不会出现。

也就是说，无主系统也可能有性能问题：

* 即使系统不需要执行故障转移，一个副本确实需要检测另一个副本何时不可用，以便它可以存储有关不可用副本错过的写入的提示。当不可用副本恢复时，移交过程需要向其发送这些提示。这在系统已经处于压力下时给副本带来了额外的负载 [^54]。
* 你拥有的副本越多，你的仲裁就越大，在请求完成之前你必须等待的响应就越多。即使你只等待最快的 *r* 或 *w* 个副本响应，即使你并行发出请求，更大的 *r* 或 *w* 增加了你遇到慢副本的机会，增加了总体响应时间（见 ["响应时间指标的应用"](/ch2#sec_introduction_slo_sla)）。
* 大规模网络中断使客户端与大量副本断开连接，可能使形成仲裁变得不可能。一些无主数据库提供了一个配置选项，允许任何可访问的副本接受写入，即使它不是该键的通常副本之一（Riak 和 Dynamo 称之为 **宽松仲裁** [^45]；Cassandra 和 ScyllaDB 称之为 **一致性级别 ANY**）。不能保证后续读取会看到写入的值，但根据应用程序，它可能仍然比写入失败更好。

多主复制可以提供比无主复制更大的网络中断弹性，因为读取和写入只需要与一个领导者通信，该领导者可以与客户端位于同一位置。然而，由于一个领导者上的写入异步传播到其他领导者，读取可能任意过时。仲裁读取和写入提供了一种折衷：良好的容错性，同时也有很高的可能性读取最新数据。

#### 多地区操作 {#multi-region-operation}

我们之前讨论了跨地区复制作为多主复制的用例（见 ["多主复制"](#sec_replication_multi_leader)）。无主复制也适合多地区操作，因为它被设计为容忍冲突的并发写入、网络中断和延迟峰值。

Cassandra 和 ScyllaDB 在正常的无主模型中实现了它们的多地区支持：客户端直接将其写入发送到所有地区的副本，你可以从各种一致性级别中进行选择，这些级别确定请求成功所需的响应数。例如，你可以请求所有地区中副本的仲裁、每个地区中的单独仲裁，或仅客户端本地地区的仲裁。本地仲裁避免了必须等待到其他地区的缓慢请求，但它也更可能返回陈旧结果。

Riak 将客户端和数据库节点之间的所有通信保持在一个地区本地，因此 *n* 描述了一个地区内的副本数。数据库集群之间的跨地区复制在后台异步发生，其风格类似于多主复制。


### 检测并发写入 {#sec_replication_concurrent}

与多主复制一样，无主数据库允许对同一键进行并发写入，导致需要解决的冲突。此类冲突可能在写入发生时发生，但并非总是如此：它们也可能在读修复、提示移交或反熵期间稍后检测到。

问题在于，由于可变的网络延迟和部分故障，事件可能以不同的顺序到达不同的节点。例如，[图 6-14](#fig_replication_concurrency) 显示了两个客户端 A 和 B 同时写入三节点数据存储中的键 *X*：

* 节点 1 接收来自 A 的写入，但由于瞬时中断从未接收来自 B 的写入。
* 节点 2 首先接收来自 A 的写入，然后接收来自 B 的写入。
* 节点 3 首先接收来自 B 的写入，然后接收来自 A 的写入。

{{< figure src="/fig/ddia_0614.png" id="fig_replication_concurrency" caption="图 6-14. Dynamo 风格数据存储中的并发写入：没有明确定义的顺序。" class="w-full my-4" >}}

如果每个节点在接收到来自客户端的写入请求时只是覆盖键的值，节点将变得永久不一致，如 [图 6-14](#fig_replication_concurrency) 中的最终 *get* 请求所示：节点 2 认为 *X* 的最终值是 B，而其他节点认为值是 A。

为了最终保持一致，副本应该收敛到相同的值。为此，我们可以使用我们之前在 ["处理写入冲突"](#sec_replication_write_conflicts) 中讨论的任何冲突解决机制，例如最后写入胜利（由 Cassandra 和 ScyllaDB 使用）、手动解决或 CRDT（在 ["CRDT 与操作变换"](#sec_replication_crdts) 中描述，并由 Riak 使用）。

最后写入胜利很容易实现：每个写入都标有时间戳，具有更高时间戳的值总是覆盖具有较低时间戳的值。然而，时间戳不会告诉你两个值是否实际上冲突（即，它们是并发写入的）或不冲突（它们是一个接一个写入的）。如果你想显式解决冲突，系统需要更加小心地检测并发写入。

#### "先发生"关系与并发 {#sec_replication_happens_before}

我们如何决定两个操作是否并发？为了培养直觉，让我们看一些例子：

* 在 [图 6-8](#fig_replication_causality) 中，两个写入不是并发的：A 的插入 **先发生于** B 的递增，因为 B 递增的值是 A 插入的值。换句话说，B 的操作建立在 A 的操作之上，所以 B 的操作必须稍后发生。我们也说 B **因果依赖** 于 A。
* 另一方面，[图 6-14](#fig_replication_concurrency) 中的两个写入是并发的：当每个客户端开始操作时，它不知道另一个客户端也在对同一键执行操作。因此，操作之间没有因果依赖关系。

如果操作 B 知道 A，或依赖于 A，或以某种方式建立在 A 之上，则操作 A **先发生于** 另一个操作 B。一个操作是否先发生于另一个操作是定义并发含义的关键。事实上，我们可以简单地说，如果两个操作都不先发生于另一个（即，两者都不知道另一个），则它们是 **并发的** [^57]。

因此，每当你有两个操作 A 和 B 时，有三种可能性：要么 A 先发生于 B，要么 B 先发生于 A，要么 A 和 B 是并发的。我们需要的是一个算法来告诉我们两个操作是否并发。如果一个操作先发生于另一个，后面的操作应该覆盖前面的操作，但如果操作是并发的，我们有一个需要解决的冲突。

--------

> [!TIP] 并发、时间和相对论
>
> 似乎两个操作如果"同时"发生，应该称为并发——但实际上，它们是否真的在时间上重叠并不重要。由于分布式系统中的时钟问题，实际上很难判断两件事是否恰好在同一时间发生——我们将在 [第 9 章](/ch9#ch_distributed) 中更详细地讨论这个问题。
>
> 为了定义并发，确切的时间并不重要：我们只是称两个操作并发，如果它们都不知道对方，无论它们发生的物理时间如何。人们有时将这一原则与物理学中的狭义相对论联系起来 [^57]，它引入了信息不能比光速传播更快的想法。因此，如果两个事件之间的时间短于光在它们之间传播的时间，那么相隔一定距离发生的两个事件不可能相互影响。
>
> 在计算机系统中，即使光速原则上允许一个操作影响另一个，两个操作也可能是并发的。例如，如果网络在当时很慢或中断，两个操作可以相隔一段时间发生，仍然是并发的，因为网络问题阻止了一个操作能够知道另一个。

--------

#### 捕获先发生关系 {#capturing-the-happens-before-relationship}

让我们看一个确定两个操作是否并发或一个先发生于另一个的算法。为了简单起见，让我们从只有一个副本的数据库开始。一旦我们弄清楚如何在单个副本上执行此操作，我们就可以将该方法推广到具有多个副本的无主数据库。

[图 6-15](#fig_replication_causality_single) 显示了两个客户端并发地向同一购物车添加项目。（如果这个例子让你觉得太无聊，想象一下两个空中交通管制员并发地向他们正在跟踪的扇区添加飞机。）最初，购物车是空的。两个客户端总共向数据库发起了五次写入：

1. 客户端 1 将 `milk` 添加到购物车。这是对该键的第一次写入，因此服务器成功存储它并为其分配版本 1。服务器还将值连同版本号一起回显给客户端。
2. 客户端 2 将 `eggs` 添加到购物车，不知道客户端 1 并发地添加了 `milk`（客户端 2 认为它的 `eggs` 是购物车中的唯一项目）。服务器为此写入分配版本 2，并将 `eggs` 和 `milk` 存储为两个单独的值（兄弟）。然后，它将 **两个** 值连同版本号 2 一起返回给客户端。
3. 客户端 1，不知道客户端 2 的写入，想要将 `flour` 添加到购物车，因此它认为当前购物车内容应该是 `[milk, flour]`。它将此值连同服务器之前给客户端 1 的版本号 1 一起发送到服务器。服务器可以从版本号判断 `[milk, flour]` 的写入取代了 `[milk]` 的先前值，但它与 `[eggs]` 并发。因此，服务器将版本 3 分配给 `[milk, flour]`，覆盖版本 1 值 `[milk]`，但保留版本 2 值 `[eggs]` 并将两个剩余值返回给客户端。
4. 同时，客户端 2 想要将 `ham` 添加到购物车，不知道客户端 1 刚刚添加了 `flour`。客户端 2 在上次响应中从服务器接收了两个值 `[milk]` 和 `[eggs]`，因此客户端现在合并这些值并添加 `ham` 以形成新值 `[eggs, milk, ham]`。它将该值连同先前的版本号 2 一起发送到服务器。服务器检测到版本 2 覆盖 `[eggs]` 但与 `[milk, flour]` 并发，因此两个剩余值是版本 3 的 `[milk, flour]` 和版本 4 的 `[eggs, milk, ham]`。
5. 最后，客户端 1 想要添加 `bacon`。它之前从服务器接收了版本 3 的 `[milk, flour]` 和 `[eggs]`，因此它合并这些，添加 `bacon`，并将最终值 `[milk, flour, eggs, bacon]` 连同版本号 3 一起发送到服务器。这覆盖了 `[milk, flour]`（注意 `[eggs]` 已经在上一步中被覆盖）但与 `[eggs, milk, ham]` 并发，因此服务器保留这两个并发值。

{{< figure src="/fig/ddia_0615.png" id="fig_replication_causality_single" caption="图 6-15. 捕获两个客户端并发编辑购物车之间的因果依赖关系。" class="w-full my-4" >}}


[图 6-15](#fig_replication_causality_single) 中操作之间的数据流在 [图 6-16](#fig_replication_causal_dependencies) 中以图形方式说明。箭头指示哪个操作 **先发生于** 哪个其他操作，即后面的操作 **知道** 或 **依赖于** 前面的操作。在这个例子中，客户端从未完全了解服务器上的数据，因为总是有另一个并发进行的操作。但是值的旧版本最终会被覆盖，并且不会丢失任何写入。

{{< figure link="#fig_replication_causality_single" src="/fig/ddia_0616.png" id="fig_replication_causal_dependencies" caption="图 6-16. 图 6-15 中因果依赖关系的图。" class="w-full my-4" >}}


请注意，服务器可以通过查看版本号来确定两个操作是否并发——它不需要解释值本身（因此值可以是任何数据结构）。算法的工作原理如下：

* 服务器为每个键维护一个版本号，每次写入该键时递增版本号，并将新版本号与写入的值一起存储。
* 当客户端读取键时，服务器返回所有兄弟，即所有未被覆盖的值，以及最新的版本号。客户端必须在写入之前读取键。
* 当客户端写入键时，它必须包含来自先前读取的版本号，并且必须合并它在先前读取中收到的所有值，例如使用 CRDT 或通过询问用户。写入请求的响应就像读取一样，返回所有兄弟，这允许我们像购物车示例中那样链接多个写入。
* 当服务器接收到具有特定版本号的写入时，它可以覆盖具有该版本号或更低版本号的所有值（因为它知道它们已合并到新值中），但它必须保留具有更高版本号的所有值（因为这些值与传入写入并发）。

当写入包含来自先前读取的版本号时，这告诉我们写入基于哪个先前状态。如果你在不包含版本号的情况下进行写入，它与所有其他写入并发，因此它不会覆盖任何内容——它只会作为后续读取的值之一返回。

#### 版本向量 {#version-vectors}

[图 6-15](#fig_replication_causality_single) 中的示例只使用了单个副本。当存在多个副本、且没有领导者时，算法如何变化？

[图 6-15](#fig_replication_causality_single) 使用单个版本号来捕获操作间依赖关系，但当多个副本并发接受写入时，这还不够。我们需要为 **每个副本**、每个键分别维护版本号。每个副本在处理写入时递增自己的版本号，并追踪从其他副本看到的版本号。这些信息决定了哪些值该被覆盖，哪些值要作为兄弟保留。

来自所有副本的版本号集合称为 **版本向量** [^58]。这一思想有若干变体，其中较有代表性的是 **点版本向量** [^59] [^60]，Riak 2.0 使用了它 [^61] [^62]。这里不展开细节，它的工作方式与前面的购物车示例非常相似。

和 [图 6-15](#fig_replication_causality_single) 里的版本号一样，版本向量会在读取时由数据库副本返回给客户端，并在后续写入时再由客户端带回数据库。（Riak 把版本向量编码成一个字符串，称为 **因果上下文**。）版本向量让数据库能够区分“覆盖写入”和“并发写入”。

版本向量还保证了“从一个副本读取，再写回另一个副本”是安全的。这样做可能会产生兄弟，但只要正确合并兄弟，就不会丢失数据。

--------

> [!TIP] 版本向量和向量时钟
>
> **版本向量** 有时也称为 **向量时钟**，尽管它们不完全相同。差异很微妙——请参阅参考资料以获取详细信息 [^60] [^63] [^64]。简而言之，在比较副本状态时，版本向量是要使用的正确数据结构。

--------

## 总结 {#summary}

在本章中，我们研究了复制问题。复制可以服务于多种目的：

**高可用性**
:   即使一台机器（或几台机器、一个区域，甚至整个地区）宕机，也能保持系统运行

**离线操作**
:   允许应用程序在网络中断时继续工作

**延迟**
:   将数据在地理上放置在靠近用户的位置，以便用户可以更快地与其交互

**可伸缩性**
:   通过在副本上执行读取，能够处理比单台机器能够处理的更高的读取量

尽管目标很简单——在几台机器上保留相同数据的副本——复制却是一个非常棘手的问题。它需要仔细考虑并发性以及所有可能出错的事情，并处理这些故障的后果。至少，我们需要处理不可用的节点和网络中断（这甚至还没有考虑更隐蔽的故障类型，例如由于软件错误或硬件错误导致的静默数据损坏）。

我们讨论了三种主要的复制方法：

**单主复制**
:   客户端将所有写入发送到单个节点（领导者），该节点将数据变更事件流发送到其他副本（追随者）。读取可以在任何副本上执行，但从追随者读取可能是陈旧的。

**多主复制**
:   客户端将每个写入发送到几个领导者之一，任何领导者都可以接受写入。领导者相互发送数据变更事件流，并发送到任何追随者。

**无主复制**
:   客户端将每个写入发送到多个节点，并行从多个节点读取，以检测和纠正具有陈旧数据的节点。

每种方法都有优缺点。单主复制很受欢迎，因为它相当容易理解，并且提供强一致性。多主和无主复制在存在故障节点、网络中断和延迟峰值时可以更加健壮——代价是需要冲突解决并提供较弱的一致性保证。

复制可以是同步的或异步的，这对系统在出现故障时的行为有深远的影响。尽管异步复制在系统平稳运行时可能很快，但重要的是要弄清楚当复制延迟增加和服务器失败时会发生什么。如果领导者失败并且你将异步更新的追随者提升为新的领导者，最近提交的数据可能会丢失。

我们研究了复制延迟可能导致的一些奇怪效果，并讨论了一些有助于决定应用程序在复制延迟下应如何表现的一致性模型：

**写后读一致性**
:   用户应该始终看到他们自己提交的数据。

**单调读**
:   在用户在某个时间点看到数据后，他们不应该稍后从某个较早的时间点看到数据。

**一致前缀读**
:   用户看到的数据应处于符合因果关系的状态：例如，按正确顺序看到问题及其回复。

最后，我们讨论了多主和无主复制如何确保所有副本最终收敛到一致状态：通过使用版本向量或类似算法来检测哪些写入是并发的，并通过使用冲突解决算法（如 CRDT）来合并并发写入的值。最后写入胜利和手动冲突解决也是可能的。

本章假设每个副本都存储整个数据库的完整副本，这对于大型数据集是不现实的。在下一章中，我们将研究 **分片**，它允许每台机器只存储数据的子集。


### 参考

[^1]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD)
[^2]: Kenny Gryp. [MySQL Terminology Updates](https://dev.mysql.com/blog-archive/mysql-terminology-updates/). *dev.mysql.com*, July 2020. Archived at [perma.cc/S62G-6RJ2](https://perma.cc/S62G-6RJ2)
[^3]: Oracle Corporation. [Oracle (Active) Data Guard 19c: Real-Time Data Protection and Availability](https://www.oracle.com/technetwork/database/availability/dg-adg-technical-overview-wp-5347548.pdf). White Paper, *oracle.com*, March 2019. Archived at [perma.cc/P5ST-RPKE](https://perma.cc/P5ST-RPKE)
[^4]: Microsoft. [What is an Always On availability group?](https://learn.microsoft.com/en-us/sql/database-engine/availability-groups/windows/overview-of-always-on-availability-groups-sql-server) *learn.microsoft.com*, September 2024. Archived at [perma.cc/ABH6-3MXF](https://perma.cc/ABH6-3MXF)
[^5]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022.
[^6]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/abs/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134)
[^7]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535)
[^8]: Mallory Knodel and Niels ten Oever. [Terminology, Power, and Inclusive Language in Internet-Drafts and RFCs](https://www.ietf.org/archive/id/draft-knodel-terminology-14.html). *IETF Internet-Draft*, August 2023. Archived at [perma.cc/5ZY9-725E](https://perma.cc/5ZY9-725E)
[^9]: Buck Hodges. [Postmortem: VSTS 4 September 2018](https://devblogs.microsoft.com/devopsservice/?p=17485). *devblogs.microsoft.com*, September 2018. Archived at [perma.cc/ZF5R-DYZS](https://perma.cc/ZF5R-DYZS)
[^10]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y)
[^11]: Vignesh Chandramohan, Rohan Desai, and Chris Riccomini. [SlateDB Manifest Design](https://github.com/slatedb/slatedb/blob/main/rfcs/0001-manifest.md). *github.com*, May 2024. Archived at [perma.cc/8EUY-P32Z](https://perma.cc/8EUY-P32Z)
[^12]: Stas Kelvich. [Why does Neon use Paxos instead of Raft, and what’s the difference?](https://neon.tech/blog/paxos) *neon.tech*, August 2022. Archived at [perma.cc/SEZ4-2GXU](https://perma.cc/SEZ4-2GXU)
[^13]: Dimitri Fontaine. [An introduction to the pg\_auto\_failover project](https://tapoueh.org/blog/2021/11/an-introduction-to-the-pg_auto_failover-project/). *tapoueh.org*, November 2021. Archived at [perma.cc/3WH5-6BAF](https://perma.cc/3WH5-6BAF)
[^14]: Jesse Newland. [GitHub availability this week](https://github.blog/news-insights/the-library/github-availability-this-week/). *github.blog*, September 2012. Archived at [perma.cc/3YRF-FTFJ](https://perma.cc/3YRF-FTFJ)
[^15]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ)
[^16]: John Hugg. [‘All In’ with Determinism for Performance and Testing in Distributed Systems](https://www.youtube.com/watch?v=gJRj3vJL4wE). At *Strange Loop*, September 2015.
[^17]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^18]: Amit Kapila. [WAL Internals of PostgreSQL](https://www.pgcon.org/2012/schedule/attachments/258_212_Internals%20Of%20PostgreSQL%20Wal.pdf). At *PostgreSQL Conference* (PGCon), May 2012. Archived at [perma.cc/6225-3SUX](https://perma.cc/6225-3SUX)
[^19]: Amit Kapila. [Evolution of Logical Replication](https://amitkapila16.blogspot.com/2023/09/evolution-of-logical-replication.html). *amitkapila16.blogspot.com*, September 2023. Archived at [perma.cc/F9VX-JLER](https://perma.cc/F9VX-JLER)
[^20]: Aru Petchimuthu. [Upgrade your Amazon RDS for PostgreSQL or Amazon Aurora PostgreSQL database, Part 2: Using the pglogical extension](https://aws.amazon.com/blogs/database/part-2-upgrade-your-amazon-rds-for-postgresql-database-using-the-pglogical-extension/). *aws.amazon.com*, August 2021. Archived at [perma.cc/RXT8-FS2T](https://perma.cc/RXT8-FS2T)
[^21]: Yogeshwer Sharma, Philippe Ajoux, Petchean Ang, David Callies, Abhishek Choudhary, Laurent Demailly, Thomas Fersch, Liat Atsmon Guz, Andrzej Kotulski, Sachin Kulkarni, Sanjeev Kumar, Harry Li, Jun Li, Evgeniy Makeev, Kowshik Prakasam, Robbert van Renesse, Sabyasachi Roy, Pratyush Seth, Yee Jiun Song, Benjamin Wester, Kaushik Veeraraghavan, and Peter Xie. [Wormhole: Reliable Pub-Sub to Support Geo-Replicated Internet Services](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-sharma.pdf). At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015.
[^22]: Douglas B. Terry. [Replicated Data Consistency Explained Through Baseball](https://www.microsoft.com/en-us/research/publication/replicated-data-consistency-explained-through-baseball/). Microsoft Research, Technical Report MSR-TR-2011-137, October 2011. Archived at [perma.cc/F4KZ-AR38](https://perma.cc/F4KZ-AR38)
[^23]: Douglas B. Terry, Alan J. Demers, Karin Petersen, Mike J. Spreitzer, Marvin M. Theimer, and Brent B. Welch. [Session Guarantees for Weakly Consistent Replicated Data](https://csis.pace.edu/~marchese/CS865/Papers/SessionGuaranteesPDIS.pdf). At *3rd International Conference on Parallel and Distributed Information Systems* (PDIS), September 1994. [doi:10.1109/PDIS.1994.331722](https://doi.org/10.1109/PDIS.1994.331722)
[^24]: Werner Vogels. [Eventually Consistent](https://queue.acm.org/detail.cfm?id=1466448). *ACM Queue*, volume 6, issue 6, pages 14–19, October 2008. [doi:10.1145/1466443.1466448](https://doi.org/10.1145/1466443.1466448)
[^25]: Simon Willison. [Reply to: “My thoughts about Fly.io (so far) and other newish technology I’m getting into”](https://news.ycombinator.com/item?id=31434055). *news.ycombinator.com*, May 2022. Archived at [perma.cc/ZRV4-WWV8](https://perma.cc/ZRV4-WWV8)
[^26]: Nithin Tharakan. [Scaling Bitbucket’s Database](https://www.atlassian.com/blog/bitbucket/scaling-bitbuckets-database). *atlassian.com*, October 2020. Archived at [perma.cc/JAB7-9FGX](https://perma.cc/JAB7-9FGX)
[^27]: Terry Pratchett. *Reaper Man: A Discworld Novel*. Victor Gollancz, 1991. ISBN: 978-0-575-04979-6
[^28]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Coordination Avoidance in Database Systems](https://arxiv.org/abs/1402.2237). *Proceedings of the VLDB Endowment*, volume 8, issue 3, pages 185–196, November 2014. [doi:10.14778/2735508.2735509](https://doi.org/10.14778/2735508.2735509)
[^29]: Yaser Raja and Peter Celentano. [PostgreSQL bi-directional replication using pglogical](https://aws.amazon.com/blogs/database/postgresql-bi-directional-replication-using-pglogical/). *aws.amazon.com*, January 2022. Archived at [perma.cc/BUQ2-5QWN](https://perma.cc/BUQ2-5QWN)
[^30]: Robert Hodges. [If You \*Must\* Deploy Multi-Master Replication, Read This First](https://scale-out-blog.blogspot.com/2012/04/if-you-must-deploy-multi-master.html). *scale-out-blog.blogspot.com*, April 2012. Archived at [perma.cc/C2JN-F6Y8](https://perma.cc/C2JN-F6Y8)
[^31]: Lars Hofhansl. [HBASE-7709: Infinite Loop Possible in Master/Master Replication](https://issues.apache.org/jira/browse/HBASE-7709). *issues.apache.org*, January 2013. Archived at [perma.cc/24G2-8NLC](https://perma.cc/24G2-8NLC)
[^32]: John Day-Richter. [What’s Different About the New Google Docs: Making Collaboration Fast](https://drive.googleblog.com/2010/09/whats-different-about-new-google-docs.html). *drive.googleblog.com*, September 2010. Archived at [perma.cc/5TL8-TSJ2](https://perma.cc/5TL8-TSJ2)
[^33]: Evan Wallace. [How Figma’s multiplayer technology works](https://www.figma.com/blog/how-figmas-multiplayer-technology-works/). *figma.com*, October 2019. Archived at [perma.cc/L49H-LY4D](https://perma.cc/L49H-LY4D)
[^34]: Tuomas Artman. [Scaling the Linear Sync Engine](https://linear.app/blog/scaling-the-linear-sync-engine). *linear.app*, June 2023.
[^35]: Amr Saafan. [Why Sync Engines Might Be the Future of Web Applications](https://www.nilebits.com/blog/2024/09/sync-engines-future-web-applications/). *nilebits.com*, September 2024. Archived at [perma.cc/5N73-5M3V](https://perma.cc/5N73-5M3V)
[^36]: Isaac Hagoel. [Are Sync Engines The Future of Web Applications?](https://dev.to/isaachagoel/are-sync-engines-the-future-of-web-applications-1bbi) *dev.to*, July 2024. Archived at [perma.cc/R9HF-BKKL](https://perma.cc/R9HF-BKKL)
[^37]: Sujay Jayakar. [A Map of Sync](https://stack.convex.dev/a-map-of-sync). *stack.convex.dev*, October 2024. Archived at [perma.cc/82R3-H42A](https://perma.cc/82R3-H42A)
[^38]: Alex Feyerke. [Designing Offline-First Web Apps](https://alistapart.com/article/offline-first/). *alistapart.com*, December 2013. Archived at [perma.cc/WH7R-S2DS](https://perma.cc/WH7R-S2DS)
[^39]: Martin Kleppmann, Adam Wiggins, Peter van Hardenberg, and Mark McGranaghan. [Local-first software: You own your data, in spite of the cloud](https://www.inkandswitch.com/local-first/). At *ACM SIGPLAN International Symposium on New Ideas, New Paradigms, and Reflections on Programming and Software* (Onward!), October 2019, pages 154–178. [doi:10.1145/3359591.3359737](https://doi.org/10.1145/3359591.3359737)
[^40]: Martin Kleppmann. [The past, present, and future of local-first](https://martin.kleppmann.com/2024/05/30/local-first-conference.html). At *Local-First Conference*, May 2024.
[^41]: Conrad Hofmeyr. [API Calling is to Sync Engines as jQuery is to React](https://www.powersync.com/blog/api-calling-is-to-sync-engines-as-jquery-is-to-react). *powersync.com*, November 2024. Archived at [perma.cc/2FP9-7WJJ](https://perma.cc/2FP9-7WJJ)
[^42]: Peter van Hardenberg and Martin Kleppmann. [PushPin: Towards Production-Quality Peer-to-Peer Collaboration](https://martin.kleppmann.com/papers/pushpin-papoc20.pdf). At *7th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2020. [doi:10.1145/3380787.3393683](https://doi.org/10.1145/3380787.3393683)
[^43]: Leonard Kawell, Jr., Steven Beckhardt, Timothy Halvorsen, Raymond Ozzie, and Irene Greif. [Replicated document management in a group communication system](https://dl.acm.org/doi/pdf/10.1145/62266.1024798). At *ACM Conference on Computer-Supported Cooperative Work* (CSCW), September 1988. [doi:10.1145/62266.1024798](https://doi.org/10.1145/62266.1024798)
[^44]: Ricky Pusch. [Explaining how fighting games use delay-based and rollback netcode](https://words.infil.net/w02-netcode.html). *words.infil.net* and *arstechnica.com*, October 2019. Archived at [perma.cc/DE7W-RDJ8](https://perma.cc/DE7W-RDJ8)
[^45]: Giuseppe DeCandia, Deniz Hastorun, Madan Jampani, Gunavardhan Kakulapati, Avinash Lakshman, Alex Pilchin, Swaminathan Sivasubramanian, Peter Vosshall, and Werner Vogels. [Dynamo: Amazon’s Highly Available Key-Value Store](https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf). At *21st ACM Symposium on Operating Systems Principles* (SOSP), October 2007. [doi:10.1145/1323293.1294281](https://doi.org/10.1145/1323293.1294281)
[^46]: Marc Shapiro, Nuno Preguiça, Carlos Baquero, and Marek Zawirski. [A Comprehensive Study of Convergent and Commutative Replicated Data Types](https://inria.hal.science/inria-00555588v1/document). INRIA Research Report no. 7506, January 2011.
[^47]: Chengzheng Sun and Clarence Ellis. [Operational Transformation in Real-Time Group Editors: Issues, Algorithms, and Achievements](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=aef660812c5a9c4d3f06775f9455eeb090a4ff0f). At *ACM Conference on Computer Supported Cooperative Work* (CSCW), November 1998. [doi:10.1145/289444.289469](https://doi.org/10.1145/289444.289469)
[^48]: Joseph Gentle and Martin Kleppmann. [Collaborative Text Editing with Eg-walker: Better, Faster, Smaller](https://arxiv.org/abs/2409.14252). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696076](https://doi.org/10.1145/3689031.3696076)
[^49]: Dharma Shukla. [Azure Cosmos DB: Pushing the frontier of globally distributed databases](https://azure.microsoft.com/en-us/blog/azure-cosmos-db-pushing-the-frontier-of-globally-distributed-databases/). *azure.microsoft.com*, September 2018. Archived at [perma.cc/UT3B-HH6R](https://perma.cc/UT3B-HH6R)
[^50]: David K. Gifford. [Weighted Voting for Replicated Data](https://www.cs.cmu.edu/~15-749/READINGS/required/availability/gifford79.pdf). At *7th ACM Symposium on Operating Systems Principles* (SOSP), December 1979. [doi:10.1145/800215.806583](https://doi.org/10.1145/800215.806583)
[^51]: Heidi Howard, Dahlia Malkhi, and Alexander Spiegelman. [Flexible Paxos: Quorum Intersection Revisited](https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.OPODIS.2016.25). At *20th International Conference on Principles of Distributed Systems* (OPODIS), December 2016. [doi:10.4230/LIPIcs.OPODIS.2016.25](https://doi.org/10.4230/LIPIcs.OPODIS.2016.25)
[^52]: Joseph Blomstedt. [Bringing Consistency to Riak](https://vimeo.com/51973001). At *RICON West*, October 2012.
[^53]: Peter Bailis, Shivaram Venkataraman, Michael J. Franklin, Joseph M. Hellerstein, and Ion Stoica. [Quantifying eventual consistency with PBS](http://www.bailis.org/papers/pbs-vldbj2014.pdf). *The VLDB Journal*, volume 23, pages 279–302, April 2014. [doi:10.1007/s00778-013-0330-1](https://doi.org/10.1007/s00778-013-0330-1)
[^54]: Colin Breck. [Shared-Nothing Architectures for Server Replication and Synchronization](https://blog.colinbreck.com/shared-nothing-architectures-for-server-replication-and-synchronization/). *blog.colinbreck.com*, December 2019. Archived at [perma.cc/48P3-J6CJ](https://perma.cc/48P3-J6CJ)
[^55]: Jeffrey Dean and Luiz André Barroso. [The Tail at Scale](https://cacm.acm.org/research/the-tail-at-scale/). *Communications of the ACM*, volume 56, issue 2, pages 74–80, February 2013. [doi:10.1145/2408776.2408794](https://doi.org/10.1145/2408776.2408794)
[^56]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005)
[^57]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563)
[^58]: D. Stott Parker Jr., Gerald J. Popek, Gerard Rudisin, Allen Stoughton, Bruce J. Walker, Evelyn Walton, Johanna M. Chow, David Edwards, Stephen Kiser, and Charles Kline. [Detection of Mutual Inconsistency in Distributed Systems](https://pages.cs.wisc.edu/~remzi/Classes/739/Papers/parker83detection.pdf). *IEEE Transactions on Software Engineering*, volume SE-9, issue 3, pages 240–247, May 1983. [doi:10.1109/TSE.1983.236733](https://doi.org/10.1109/TSE.1983.236733)
[^59]: Nuno Preguiça, Carlos Baquero, Paulo Sérgio Almeida, Victor Fonte, and Ricardo Gonçalves. [Dotted Version Vectors: Logical Clocks for Optimistic Replication](https://arxiv.org/abs/1011.5808). arXiv:1011.5808, November 2010.
[^60]: Giridhar Manepalli. [Clocks and Causality - Ordering Events in Distributed Systems](https://www.exhypothesi.com/clocks-and-causality/). *exhypothesi.com*, November 2022. Archived at [perma.cc/8REU-KVLQ](https://perma.cc/8REU-KVLQ)
[^61]: Sean Cribbs. [A Brief History of Time in Riak](https://speakerdeck.com/seancribbs/a-brief-history-of-time-in-riak). At *RICON*, October 2014. Archived at [perma.cc/7U9P-6JFX](https://perma.cc/7U9P-6JFX)
[^62]: Russell Brown. [Vector Clocks Revisited Part 2: Dotted Version Vectors](https://riak.com/posts/technical/vector-clocks-revisited-part-2-dotted-version-vectors/). *riak.com*, November 2015. Archived at [perma.cc/96QP-W98R](https://perma.cc/96QP-W98R)
[^63]: Carlos Baquero. [Version Vectors Are Not Vector Clocks](https://haslab.wordpress.com/2011/07/08/version-vectors-are-not-vector-clocks/). *haslab.wordpress.com*, July 2011. Archived at [perma.cc/7PNU-4AMG](https://perma.cc/7PNU-4AMG)
[^64]: Reinhard Schwarz and Friedemann Mattern. [Detecting Causal Relationships in Distributed Computations: In Search of the Holy Grail](https://disco.ethz.ch/courses/hs08/seminar/papers/mattern4.pdf). *Distributed Computing*, volume 7, issue 3, pages 149–174, March 1994. [doi:10.1007/BF02277859](https://doi.org/10.1007/BF02277859)


================================================
FILE: content/zh/ch7.md
================================================
---
title: "7. 分片"
weight: 207
breadcrumbs: false
---

<a id="ch_sharding"></a>

![](/map/ch06.png)

> *显然，我们必须跳出顺序计算机指令的窠臼。我们必须叙述定义、提供优先级和数据描述。我们必须叙述关系，而不是过程。*
>
> Grace Murray Hopper，《未来的计算机及其管理》（1962）

分布式数据库通常通过两种方式在节点间分布数据：

1. 在多个节点上保存相同数据的副本：这是 *复制*，我们在 [第 6 章](/ch6#ch_replication) 中讨论过。
2. 如果我们不想让每个节点都存储所有数据，我们可以将大量数据分割成更小的 *分片（shards）* 或 *分区（partitions）*，并将不同的分片存储在不同的节点上。我们将在本章讨论分片。

通常，分片的定义方式使得每条数据（每条记录、行或文档）恰好属于一个分片。有多种方法可以实现这一点，我们将在本章深入讨论。实际上，每个分片本身就是一个小型数据库，尽管某些数据库系统支持同时涉及多个分片的操作。

分片通常与复制结合使用，以便每个分片的副本存储在多个节点上。这意味着，即使每条记录恰好属于一个分片，它仍然可以存储在多个不同的节点上以提供容错能力。

一个节点可能存储多个分片。例如，如果使用单领导者复制模型，分片与复制的组合可能如 [图 7-1](#fig_sharding_replicas) 所示。每个分片的领导者被分配到一个节点，追随者被分配到其他节点。每个节点可能是某些分片的领导者，同时又是其他分片的追随者，但每个分片仍然只有一个领导者。

{{< figure src="/fig/ddia_0701.png" id="fig_sharding_replicas" caption="图 7-1. 复制与分片结合使用：每个节点对某些分片充当领导者，对另一些分片充当追随者。" class="w-full my-4" >}}

我们在 [第 6 章](/ch6#ch_replication) 中讨论的关于数据库复制的所有内容同样适用于分片的复制。由于分片方案的选择大部分独立于复制方案的选择，为了简单起见，我们将在本章中忽略复制。

--------

> [!TIP] 分片和分区

在本章中我们称之为 *分片* 的东西，根据你使用的软件不同有许多不同的名称：在 Kafka 中称为 *分区（partition）*，在 CockroachDB 中称为 *范围（range）*，在 HBase 和 TiDB 中称为 *区域（region）*，在 Bigtable 和 YugabyteDB 中称为 *表块（tablet）*，在 Cassandra、ScyllaDB 和 Riak 中称为 *虚节点（vnode）*，在 Couchbase 中称为 *虚桶（vBucket）*，仅举几例。

一些数据库将分区和分片视为两个不同的概念。例如，在 PostgreSQL 中，分区是将大表拆分为存储在同一台机器上的多个文件的方法（这有几个优点，例如可以非常快速地删除整个分区），而分片则是将数据集拆分到多台机器上 [^1] [^2]。在许多其他系统中，分区只是分片的另一个词。

虽然 *分区* 相当具有描述性，但 *分片* 这个术语可能令人惊讶。根据一种理论，该术语源于在线角色扮演游戏《网络创世纪》（Ultima Online），其中一块魔法水晶被打碎成碎片，每个碎片都折射出游戏世界的副本 [^3]。*分片* 一词因此用来指一组并行游戏服务器中的一个，后来被引入数据库。另一种理论是 *分片* 最初是 *高可用复制数据系统*（System for Highly Available Replicated Data）的缩写——据说是 1980 年代的一个数据库，其细节已经失传。

顺便说一下，分区与 *网络分区*（netsplits）无关，后者是节点之间网络中的一种故障。我们将在 [第 9 章](/ch9#ch_distributed) 中讨论此类故障。

--------

## 分片的利与弊 {#sec_sharding_reasons}

对数据库进行分片的主要原因是 *可伸缩性*：如果数据量或写吞吐量已经超出单个节点的处理能力，这是一个解决方案，它允许你将数据和写入分散到多个节点上。（如果读吞吐量是问题，你不一定需要分片——你可以使用 [第 6 章](/ch6#ch_replication) 中讨论的 *读扩展*。）

事实上，分片是我们实现 *水平扩展*（*横向扩展* 架构）的主要工具之一，如 ["共享内存、共享磁盘和无共享架构"](/ch2#sec_introduction_shared_nothing) 中所讨论的：即，允许系统通过添加更多（较小的）机器而不是转移到更大的机器来增长其容量。如果你可以划分工作负载，使每个分片处理大致相等的份额，那么你可以将这些分片分配给不同的机器，以便并行处理它们的数据和查询。

虽然复制在小规模和大规模上都很有用，因为它支持容错和离线操作，但分片是一个重量级解决方案，主要在大规模场景下才有意义。如果你的数据量和写吞吐量可以在单台机器上处理（而单台机器现在可以做很多事情！），通常最好避免分片并坚持使用单分片数据库。

推荐这样做的原因是分片通常会增加复杂性：你通常必须通过选择 *分区键* 来决定将哪些记录放在哪个分片中；具有相同分区键的所有记录都放在同一个分片中 [^4]。这个选择很重要，因为如果你知道记录在哪个分片中，访问记录会很快，但如果你不知道分片，你必须在所有分片中进行低效的搜索，而且分片方案很难更改。

因此，分片通常适用于键值数据，你可以轻松地按键进行分片，但对于关系数据则较难，因为你可能想要通过二级索引搜索，或连接可能分布在不同分片中的记录。我们将在 ["分片与二级索引"](#sec_sharding_secondary_indexes) 中进一步讨论这个问题。

分片的另一个问题是写入可能需要更新多个不同分片中的相关记录。虽然单节点上的事务相当常见（见 [第 8 章](/ch8#ch_transactions)），但确保跨多个分片的一致性需要 *分布式事务*。正如我们将在 [第 8 章](/ch8#ch_transactions) 中看到的，分布式事务在某些数据库中可用，但它们通常比单节点事务慢得多，可能成为整个系统的瓶颈，有些系统根本不支持它们。

一些系统即使在单台机器上也使用分片，通常每个 CPU 核心运行一个单线程进程，以利用 CPU 的并行性，或者利用 *非统一内存访问*（NUMA）架构：在这种架构中，某些内存区块（bank）离某个 CPU 比离其他 CPU 更近 [^5]。例如，Redis、VoltDB 和 FoundationDB 每个核心使用一个进程，并依靠分片在同一台机器的 CPU 核心之间分散负载 [^6]。

### 面向多租户的分片 {#sec_sharding_multitenancy}

软件即服务（SaaS）产品和云服务通常是 *多租户* 的，其中每个租户是一个客户。多个用户可能在同一租户上拥有登录帐户，但每个租户都有一个独立的数据集，与其他租户分开。例如，在电子邮件营销服务中，每个注册的企业通常是一个单独的租户，因为一个企业的通讯订阅、投递数据等与其他企业的数据是分开的。

有时分片用于实现多租户系统：要么每个租户被分配一个单独的分片，要么多个小租户可能被分组到一个更大的分片中。这些分片可能是物理上分离的数据库（我们之前在 ["嵌入式存储引擎"](/ch4#sidebar_embedded) 中提到过），或者是更大逻辑数据库的可单独管理部分 [^7]。使用分片实现多租户有几个优点：

资源隔离
: 如果某个租户执行计算密集型操作，而它与其他租户运行在不同分片上，那么其他租户性能受影响的可能性更小。

权限隔离
: 如果访问控制逻辑有漏洞，而租户数据集又是彼此物理隔离存储的，那么误将一个租户的数据暴露给另一个租户的概率会更低。

基于单元的架构
: 你不仅可以在数据存储级别应用分片，还可以为运行应用程序代码的服务应用分片。在 *基于单元的架构* 中，特定租户集的服务和存储被分组到一个自包含的 *单元* 中，不同的单元被设置为可以在很大程度上彼此独立运行。这种方法提供了 *故障隔离*：即，一个单元中的故障仅限于该单元，其他单元中的租户不受影响 [^8]。

按租户备份和恢复
: 单独备份每个租户的分片使得可以从备份中恢复租户的状态而不影响其他租户，这在租户意外删除或覆盖重要数据的情况下很有用 [^9]。

法规合规性
: 数据隐私法规（如 GDPR）赋予个人访问和删除存储的所有关于他们的数据的权利。如果每个人的数据存储在单独的分片中，这就转化为对其分片的简单数据导出和删除操作 [^10]。

数据驻留
: 如果特定租户的数据需要存储在特定司法管辖区以符合数据驻留法律，具有区域感知的数据库可以允许你将该租户的分片分配给特定区域。

渐进式模式推出
: 模式迁移（之前在 ["文档模型中的模式灵活性"](/ch3#sec_datamodels_schema_flexibility) 中讨论过）可以逐步推出，一次一个租户。这降低了风险，因为你可以在影响所有租户之前检测到问题，但很难以事务方式执行 [^11]。

使用分片实现多租户的主要挑战是：

* 它假设每个单独的租户都足够小，可以适应单个节点。如果情况并非如此，并且你有一个对于一台机器来说太大的租户，你将需要在单个租户内额外执行分片，这将我们带回到为可伸缩性进行分片的主题 [^12]。
* 如果你有许多小租户，那么为每个租户创建单独的分片可能会产生太多开销。你可以将几个小租户组合到一个更大的分片中，但随后你会遇到如何在租户增长时将其从一个分片移动到另一个分片的问题。
* 如果你需要支持跨多个租户关联数据的功能，那么在必须跨多个分片做连接时，实现难度会显著增加。


## 键值数据的分片 {#sec_sharding_key_value}

假设你有大量数据，并且想要对其进行分片。如何决定将哪些记录存储在哪些节点上？

我们进行分片的目标是将数据和查询负载均匀地分布在各节点上。如果每个节点承担公平的份额，那么从理论上讲，10 个节点应该能够处理相当于单个节点 10 倍的数据量和 10 倍的读写吞吐量（忽略复制）。此外，如果我们添加或删除节点，我们希望能够 *再平衡* 负载，使其在添加时均匀分布在 11 个节点上（或删除时在剩余的 9 个节点上）。

如果分片不公平，使得某些分片比其他分片承载更多数据或查询，我们称之为 *偏斜*。偏斜会显著削弱分片效果。在极端情况下，所有负载都可能集中在一个分片上，导致 10 个节点中有 9 个处于空闲状态，而瓶颈落在那一个繁忙节点上。负载明显高于其他分片的分片称为 *热分片* 或 *热点*。如果某个键的负载特别高（例如社交网络中的名人），我们称之为 *热键*。

因此，我们需要一种算法，它以记录的分区键作为输入，并告诉我们该记录在哪个分片中。在键值存储中，分区键通常是键，或键的第一部分。在关系模型中，分区键可能是表的某一列（不一定是其主键）。该算法需要能够进行再平衡以缓解热点。


### 按键的范围分片 {#sec_sharding_key_range}

一种分片方法是为每个分片分配一个连续的分区键范围（从某个最小值到某个最大值），就像纸质百科全书的卷一样，如 [图 7-2](#fig_sharding_encyclopedia) 所示。在这个例子中，条目的分区键是其标题。如果你想查找特定标题的条目，你可以通过找到键范围包含你要查找标题的卷来轻松确定哪个分片包含该条目，从而从书架上挑选正确的书。

{{< figure src="/fig/ddia_0702.png" id="fig_sharding_encyclopedia" caption="图 7-2. 印刷版百科全书按键范围分片。" class="w-full my-4" >}}

键的范围不一定是均匀分布的，因为你的数据可能不是均匀分布的。例如，在 [图 7-2](#fig_sharding_encyclopedia) 中，第 1 卷包含以 A 和 B 开头的单词，但第 12 卷包含以 T、U、V、W、X、Y 和 Z 开头的单词。简单地为字母表的每两个字母分配一卷会导致某些卷比其他卷大得多。为了均匀分布数据，分片边界需要适应数据。

分片边界可能由管理员手动选择，或者数据库可以自动选择它们。手动键范围分片例如被 Vitess（MySQL 的分片层）使用；自动变体被 Bigtable、其开源等价物 HBase、MongoDB 中基于范围的分片选项、CockroachDB、RethinkDB 和 FoundationDB 使用 [^6]。YugabyteDB 提供手动和自动表块分割两种选项。

在每个分片内，键以排序顺序存储（例如，在 B 树或 SSTable 中，如 [第 4 章](/ch4#ch_storage) 中所讨论的）。这样做的优点是范围扫描很容易，你可以将键视为拼接索引（concatenated index），以便在一个查询中获取多个相关记录（参见 ["多维和全文索引"](/ch4#sec_storage_multidimensional)）。例如，考虑一个存储传感器网络数据的应用程序，其中键是测量的时间戳。范围扫描在这种情况下非常有用，因为它们让你可以轻松获取，比如说，特定月份的所有读数。

键范围分片的一个缺点是，如果有大量对相邻键的写入，你很容易得到一个热分片。例如，如果键是时间戳，那么分片对应于时间范围——例如，每个月一个分片。不幸的是，如果你在测量发生时将传感器数据写入数据库，所有写入最终都会进入同一个分片（本月的分片），因此该分片可能会因写入而过载，而其他分片则处于空闲状态 [^13]。

为了避免传感器数据库中的这个问题，你需要使用时间戳以外的东西作为键的第一个元素。例如，你可以在每个时间戳前加上传感器 ID，使键排序首先按传感器 ID，然后按时间戳。假设你有许多传感器同时活动，写入负载最终会更均匀地分布在各个分片上。缺点是当你想要在一个时间范围内获取多个传感器的值时，你现在需要为每个传感器执行单独的范围查询。

#### 重新平衡键范围分片数据 {#rebalancing-key-range-sharded-data}

当你首次设置数据库时，没有键范围可以分割成分片。一些数据库，如 HBase 和 MongoDB，允许你在空数据库上配置一组初始分片，这称为 *预分割*。这要求你已经对键分布将会是什么样子有所了解，以便你可以选择适当的键范围边界 [^14]。

后来，随着你的数据量和写吞吐量增长，具有键范围分片的系统通过将现有分片分割成两个或更多较小的分片来增长，每个分片都保存原始分片键范围的连续子范围。然后可以将生成的较小分片分布在多个节点上。如果删除了大量数据，你可能还需要将几个相邻的已变小的分片合并为一个更大的分片。这个过程类似于 B 树顶层发生的事情（参见 ["B 树"](/ch4#sec_storage_b_trees)）。

对于自动管理分片边界的数据库，分片分割通常由以下触发：

* 分片达到配置的大小（例如，在 HBase 上，默认值为 10 GB），或
* 在某些系统中，写吞吐量持续高于某个阈值。因此，即使热分片没有存储大量数据，也可能被分割，以便其写入负载可以更均匀地分布。

键范围分片的一个优点是分片数量适应数据量。如果只有少量数据，少量分片就足够了，因此开销很小；如果有大量数据，每个单独分片的大小被限制在可配置的最大值 [^15]。

这种方法的一个缺点是分割分片是一项昂贵的操作，因为它需要将其所有数据重写到新文件中，类似于日志结构存储引擎中的压实。需要分割的分片通常也是处于高负载下的分片，分割的成本可能会加剧该负载，有使其过载的风险。

### 按键的哈希分片 {#sec_sharding_hash}

键范围分片在你希望具有相邻（但不同）分区键的记录被分组到同一个分片中时很有用；例如，当分区键是时间戳时，就可能是这种情况。如果你不关心分区键是否彼此接近（例如，如果它们是多租户应用程序中的租户 ID），一种常见方法是先对分区键进行哈希，然后将其映射到分片。

一个好的哈希函数可以把偏斜的数据变得更均匀。假设你有一个 32 位哈希函数，输入是字符串。每当给它一个新字符串，它都会返回一个看似随机、介于 0 和 2³² − 1 之间的数字。即使输入字符串非常相似，它们的哈希值也会在这个范围内均匀分布（但相同输入总是产生相同输出）。

出于分片目的，哈希函数不需要是密码学强度的：例如，MongoDB 使用 MD5，而 Cassandra 和 ScyllaDB 使用 Murmur3。许多编程语言都内置了简单的哈希函数（因为它们用于哈希表），但它们可能不适合分片：例如，在 Java 的 `Object.hashCode()` 和 Ruby 的 `Object#hash` 中，相同的键在不同的进程中可能有不同的哈希值，使它们不适合分片 [^16]。

#### 哈希取模节点数 {#hash-modulo-number-of-nodes}

一旦你对键进行了哈希，如何选择将其存储在哪个分片中？也许你的第一个想法是取哈希值 *模* 系统中的节点数（在许多编程语言中使用 `%` 运算符）。例如，*hash*(*key*) % 10 将返回 0 到 9 之间的数字（如果我们将哈希写为十进制数，hash % 10 将是最后一位数字）。如果我们有 10 个节点，编号从 0 到 9，这似乎是将每个键分配给节点的简单方法。

*mod N* 方法的问题是，如果节点数 *N* 发生变化，大多数键必须从一个节点移动到另一个节点。[图 7-3](#fig_sharding_hash_mod_n) 显示了当你有三个节点并添加第四个节点时会发生什么。在再平衡之前，节点 0 存储哈希值为 0、3、6、9 等的键。添加第四个节点后，哈希值为 3 的键已移动到节点 3，哈希值为 6 的键已移动到节点 2，哈希值为 9 的键已移动到节点 1，依此类推。

{{< figure src="/fig/ddia_0703.png" id="fig_sharding_hash_mod_n" caption="图 7-3. 通过对键进行哈希并取模节点数来将键分配给节点。更改节点数会导致许多键从一个节点移动到另一个节点。" class="w-full my-4" >}}

*mod N* 函数易于计算，但它导致非常低效的再平衡，因为存在大量不必要的记录从一个节点移动到另一个节点。我们需要一种不会移动超过必要数据的方法。

#### 固定数量的分片 {#fixed-number-of-shards}

一个简单但广泛使用的解决方案是创建比节点多得多的分片，并为每个节点分配多个分片。例如，在 10 个节点的集群上运行的数据库可能从一开始就被分成 1,000 个分片，以便每个节点分配 100 个分片。然后将键存储在分片号 *hash*(*key*) % 1,000 中，系统单独跟踪哪个分片存储在哪个节点上。

现在，如果向集群添加一个节点，系统可以从现有节点重新分配一些分片到新节点，直到它们再次公平分布。这个过程在 [图 7-4](#fig_sharding_rebalance_fixed) 中说明。如果从集群中删除节点，则反向发生相同的事情。

{{< figure src="/fig/ddia_0704.png" id="fig_sharding_rebalance_fixed" caption="图 7-4. 向每个节点有多个分片的数据库集群添加新节点。" class="w-full my-4" >}}

在这个模型中，只有整个分片在节点之间移动，这比分割分片更便宜。分片的数量不会改变，也不会改变键到分片的分配。唯一改变的是分片到节点的分配。这种分配的变化不是立即的——通过网络传输大量数据需要一些时间——因此在传输进行时，旧的分片分配用于任何发生的读写。

选择分片数量为可被许多因子整除的数字是很常见的，这样数据集可以在各种不同数量的节点之间均匀分割——例如，不要求节点数必须是 2 的幂 [^4]。你甚至可以考虑集群中不匹配的硬件：通过为更强大的节点分配更多分片，你可以让这些节点承担更大份额的负载。

这种分片方法被 Citus（PostgreSQL 的分片层）、Riak、Elasticsearch 和 Couchbase 等使用。只要你对首次创建数据库时需要多少分片有很好的估计，它就很有效。然后你可以轻松添加或删除节点，但受限于你不能拥有比分片更多的节点。

如果你发现最初配置的分片数量是错误的——例如，如果你已经达到需要比分片更多节点的规模——那么需要进行昂贵的重新分片操作。它需要分割每个分片并将其写入新文件，在此过程中使用大量额外的磁盘空间。一些系统不允许在并发写入数据库时进行重新分片，这使得在没有停机时间的情况下更改分片数量变得困难。

如果数据集总大小高度可变（例如起初很小，但会随时间显著增长），选择合适的分片数量就很困难。由于每个分片包含总数据中的固定比例，每个分片的大小会随集群总数据量按比例增长。如果分片很大，再平衡和节点故障恢复都会很昂贵；但如果分片太小，又会产生过多管理开销。最佳性能通常出现在分片大小“恰到好处”时，但在分片数量固定、数据规模又持续变化的情况下，这很难做到。

#### 按哈希范围分片 {#sharding-by-hash-range}

如果无法提前预测所需的分片数量，最好使用一种方案，其中分片数量可以轻松适应工作负载。前面提到的键范围分片方案具有这个属性，但当有大量对相邻键的写入时，它有热点的风险。一种解决方案是将键范围分片与哈希函数结合，使每个分片包含 *哈希值* 的范围而不是 *键* 的范围。

[图 7-5](#fig_sharding_hash_range) 显示了使用 16 位哈希函数的示例，该函数返回 0 到 65,535 = 2¹⁶ − 1 之间的数字（实际上，哈希通常是 32 位或更多）。即使输入键非常相似（例如，连续的时间戳），它们的哈希值也会在该范围内均匀分布。然后我们可以为每个分片分配一个哈希值范围：例如，值 0 到 16,383 分配给分片 0，值 16,384 到 32,767 分配给分片 1，依此类推。

{{< figure src="/fig/ddia_0705.png" id="fig_sharding_hash_range" caption="图 7-5. 为每个分片分配连续的哈希值范围。" class="w-full my-4" >}}

与键范围分片一样，哈希范围分片中的分片在变得太大或负载太重时可以被分割。这仍然是一个昂贵的操作，但它可以根据需要发生，因此分片数量适应数据量而不是预先固定。

与键范围分片相比的缺点是，对分区键的范围查询效率不高，因为范围内的键现在分散在所有分片中。但是，如果键由两列或更多列组成，并且分区键只是这些列中的第一列，你仍然可以对第二列和后续列执行高效的范围查询：只要范围查询中的所有记录具有相同的分区键，它们就会在同一个分片中。

--------

> [!TIP] 数据仓库中的分区和范围查询

数据仓库如 BigQuery、Snowflake 和 Delta Lake 支持类似的索引方法，尽管术语不同。例如，在 BigQuery 中，分区键决定记录驻留在哪个分区中，而"聚集列"（clustering columns）决定记录在分区内如何排序。Snowflake 自动将记录分配给"微分区"，但允许用户为表定义聚集键（clustering key）。Delta Lake 支持手动和自动分区分配，并支持聚集键。聚集数据不仅可以提高范围扫描性能，还可以提高压缩和过滤性能。

--------

哈希范围分片被 YugabyteDB 和 DynamoDB 使用 [^17]，并且是 MongoDB 中的一个选项。Cassandra 和 ScyllaDB 使用这种方法的一个变体，如 [图 7-6](#fig_sharding_cassandra) 所示：哈希值空间被分割成与节点数成比例的范围数（[图 7-6](#fig_sharding_cassandra) 中每个节点 3 个范围，但实际数字在 Cassandra 中默认为每个节点 8 个，在 ScyllaDB 中为每个节点 256 个），这些范围之间有随机边界。这意味着某些范围比其他范围大，但通过每个节点有多个范围，这些不平衡倾向于平均化 [^15] [^18]。

{{< figure src="/fig/ddia_0706.png" id="fig_sharding_cassandra" caption="图 7-6. Cassandra 和 ScyllaDB 将可能的哈希值范围（这里是 0-1023）分割成具有随机边界的连续范围，并为每个节点分配多个范围。" class="w-full my-4" >}}

当添加或删除节点时，会添加和删除范围边界，并相应地分割或合并分片 [^19]。在 [图 7-6](#fig_sharding_cassandra) 的示例中，当添加节点 3 时，节点 1 将其两个范围的部分转移到节点 3，节点 2 将其一个范围的部分转移到节点 3。这样做的效果是给新节点一个大致公平的数据集份额，而不会在节点之间传输超过必要的数据。

#### 一致性哈希 {#sec_sharding_consistent_hashing}

*一致性哈希* 算法是一种哈希函数，它以满足两个属性的方式将键映射到指定数量的分片：

1. 映射到每个分片的键数大致相等，并且
2. 当分片数量变化时，尽可能少的键从一个分片移动到另一个分片。

注意这里的 *一致性* 与副本一致性（见 [第 6 章](/ch6#ch_replication)）或 ACID 一致性（见 [第 8 章](/ch8#ch_transactions)）无关，而是描述了键尽可能保持在同一个分片中的倾向。

Cassandra 和 ScyllaDB 使用的分片算法类似于一致性哈希的原始定义 [^20]，但人们也提出了其他几种一致性哈希算法 [^21]，如 *最高随机权重*，也称为 *会合哈希* [^22]，以及 *跳跃一致性哈希* [^23]。使用 Cassandra 的算法，如果添加一个节点，少量现有分片会被分割成子范围；另一方面，使用会合哈希和跳跃一致性哈希时，分配给新节点的是此前分散在所有其他节点上的一个个单独的键。哪种更可取取决于应用程序。

### 偏斜的工作负载与缓解热点 {#sec_sharding_skew}

一致性哈希保证键在节点间大致均匀分布，但这并不等于实际负载也均匀分布。如果工作负载高度偏斜，即某些分区键下的数据量远大于其他键，或某些键的请求速率远高于其他键，那么你仍可能出现部分服务器过载、其他服务器几乎空闲的情况。

例如，在社交媒体网站上，拥有数百万粉丝的名人用户在做某事时可能会引起活动风暴 [^24]。这个事件可能导致对同一个键的大量读写（其中分区键可能是名人的用户 ID，或者人们正在评论的动作的 ID）。

在这种情况下，需要更灵活的分片策略 [^25] [^26]。基于键范围（或哈希范围）定义分片的系统使得可以将单个热键放在自己的分片中，甚至可能为其分配专用机器 [^27]。

也可以在应用层补偿偏斜。例如，如果已知某个键非常热，一个简单方法是在键的开头或结尾附加一个随机数。仅用两位十进制随机数，就可以把对该键的写入均匀打散到 100 个不同键上，从而将它们分布到不同分片。

然而，将写入分散到不同的键之后，任何读取现在都必须做额外的工作，因为它们必须从所有 100 个键读取数据并将其组合。热键所在的每个分片承受的读取量并没有减少；被分摊的只有写入负载。这种技术还需要额外的记账：只对少数热键附加随机数是有意义的；对于写入吞吐量低的绝大多数键，这将是不必要的开销。因此，你还需要某种方法来跟踪哪些键正在被分割，以及将常规键转换为特殊管理的热键的过程。

问题因负载随时间变化而进一步复杂化：例如，一个已经病毒式传播的特定社交媒体帖子可能会在几天内经历高负载，但之后可能会再次平静下来。此外，某些键可能对写入很热，而其他键对读取很热，需要不同的策略来处理它们。

一些系统（特别是为大规模设计的云服务）有自动处理热分片的方法；例如，Amazon 称之为 *热管理* [^28] 或 *自适应容量* [^17]。这些系统如何工作的细节超出了本书的范围。

### 运维：自动/手动再平衡 {#sec_sharding_operations}

关于再平衡有一个我们已经忽略的重要问题：分片的分割和再平衡是自动发生还是手动发生？

一些系统自动决定何时分割分片以及何时将它们从一个节点移动到另一个节点，无需任何人工交互，而其他系统则让分片由管理员明确配置。还有一个中间地带：例如，Couchbase 和 Riak 自动生成建议的分片分配，但需要管理员提交才能生效。

完全自动的再平衡可能很方便，因为正常维护的操作工作较少，这样的系统甚至可以自动扩展以适应工作负载的变化。云数据库如 DynamoDB 被宣传为能够在几分钟内自动添加和删除分片以适应负载的大幅增加或减少 [^17] [^29]。

然而，自动分片管理也可能是不可预测的。再平衡是一项昂贵的操作，因为它需要重新路由请求并将大量数据从一个节点移动到另一个节点。如果操作不当，这个过程可能会使网络或节点过载，并可能损害其他请求的性能。系统必须在再平衡进行时继续处理写入；如果系统接近其最大写入吞吐量，分片分割过程甚至可能无法跟上传入写入的速率 [^29]。

这种自动化与自动故障检测结合可能很危险。例如，假设一个节点过载并暂时响应请求缓慢。其他节点得出结论，过载的节点已死，并自动重新平衡集群以将负载从它移开。这会对其他节点和网络施加额外负载，使情况变得更糟。存在导致级联故障的风险，其中其他节点变得过载并也被错误地怀疑已关闭。

出于这个原因，在再平衡过程中有人参与可能是件好事。它比完全自动的过程慢，但它可以帮助防止操作意外。


## 请求路由 {#sec_sharding_routing}

我们已经讨论了如何将数据集分片到多个节点上，以及如何在添加或删除节点时重新平衡这些分片。现在让我们继续讨论这个问题：如果你想读取或写入特定的键，你如何知道需要连接到哪个节点——即哪个 IP 地址和端口号？

我们称这个问题为 *请求路由*，它与 *服务发现* 非常相似，我们之前在 ["负载均衡器、服务发现和服务网格"](/ch5#sec_encoding_service_discovery) 中讨论过。两者之间最大的区别是，对于运行应用程序代码的服务，每个实例通常是无状态的，负载均衡器可以将请求发送到任何实例。对于分片数据库，对键的请求只能由包含该键的分片的副本节点处理。

这意味着请求路由必须知道键到分片的分配，以及分片到节点的分配。在高层次上，这个问题有几种不同的方法（在 [图 7-7](#fig_sharding_routing) 中说明）：

1. 允许客户端连接任何节点（例如，通过循环负载均衡器）。如果该节点恰好拥有请求适用的分片，它可以直接处理请求；否则，它将请求转发到适当的节点，接收回复，并将回复传递给客户端。
2. 首先将客户端的所有请求发送到路由层，该层确定应该处理每个请求的节点并相应地转发它。这个路由层本身不处理任何请求；它只充当分片感知的负载均衡器。
3. 要求客户端知道分片和分片到节点的分配。在这种情况下，客户端可以直接连接到适当的节点，而无需任何中介。

{{< figure src="/fig/ddia_0707.png" id="fig_sharding_routing" caption="图 7-7. 将请求路由到正确节点的三种不同方式。" class="w-full my-4" >}}

在所有情况下，都有一些关键问题：

* 谁决定哪个分片应该存在于哪个节点上？最简单的是有一个单一的协调器做出该决定，但在这种情况下，如果运行协调器的节点出现故障，如何使其容错？如果协调器角色可以故障转移到另一个节点，如何防止脑裂情况（见 ["处理节点中断"](/ch6#sec_replication_failover)），其中两个不同的协调器做出相互矛盾的分片分配？
* 执行路由的组件（可能是节点之一、路由层或客户端）如何了解分片到节点分配的变化？
* 当分片从一个节点移动到另一个节点时，有一个切换期，在此期间新节点已接管，但对旧节点的请求可能仍在传输中。如何处理这些？

许多分布式数据系统依赖于单独的协调服务（如 ZooKeeper 或 etcd）来跟踪分片分配，如 [图 7-8](#fig_sharding_zookeeper) 所示。它们使用共识算法（见 [第 10 章](/ch10#ch_consistency)）来提供容错和防止脑裂。每个节点在 ZooKeeper 中注册自己，ZooKeeper 维护分片到节点的权威映射。其他参与者，如路由层或分片感知客户端，可以在 ZooKeeper 中订阅此信息。每当分片所有权发生变化，或者添加或删除节点时，ZooKeeper 都会通知路由层，以便它可以保持其路由信息最新。

{{< figure src="/fig/ddia_0708.png" id="fig_sharding_zookeeper" caption="图 7-8. 使用 ZooKeeper 跟踪分片到节点的分配。" class="w-full my-4" >}}

例如，HBase 和 SolrCloud 使用 ZooKeeper 管理分片分配，Kubernetes 使用 etcd 跟踪哪个服务实例在哪里运行。MongoDB 有类似的架构，但它依赖于自己的 *配置服务器* 实现和 *mongos* 守护进程作为路由层。Kafka、YugabyteDB 和 TiDB 使用内置的 Raft 共识协议实现来执行此协调功能。

Cassandra、ScyllaDB 和 Riak 采用不同的方法：它们在节点之间使用 *流言协议* 来传播集群状态的任何变化。这提供了比共识协议弱得多的一致性；可能会出现脑裂，其中集群的不同部分对同一分片有不同的节点分配。无主数据库可以容忍这一点，因为它们通常提供弱一致性保证（见 ["仲裁一致性的限制"](/ch6#sec_replication_quorum_limitations)）。

当使用路由层或向随机节点发送请求时，客户端仍然需要找到要连接的 IP 地址。这些不像分片到节点的分配那样快速变化，因此通常使用 DNS 就足够了。

上面对请求路由的讨论，主要关注如何为单个键找到对应分片，这对分片 OLTP 数据库最相关。分析型数据库通常也使用分片，但其查询执行模型很不一样：查询往往需要并行聚合并连接来自多个分片的数据，而不是在单个分片内执行。我们将在 ["JOIN 和 GROUP BY"](/ch11#sec_batch_join) 中讨论这类并行查询执行技术。

## 分片与二级索引 {#sec_sharding_secondary_indexes}

到目前为止，我们讨论的分片方案依赖于客户端知道它想要访问的任何记录的分区键。这在键值数据模型中最容易做到，其中分区键是主键的第一部分（或整个主键），因此我们可以使用分区键来确定分片，从而将读写路由到负责该键的节点。

如果涉及二级索引，情况会变得更加复杂（另见 ["多列和二级索引"](/ch4#sec_storage_index_multicolumn)）。二级索引通常不唯一地标识记录，而是一种搜索特定值出现的方法：查找用户 `123` 的所有操作、查找包含单词 `hogwash` 的所有文章、查找颜色为 `red` 的所有汽车等。

键值存储通常没有二级索引；但在关系数据库中，二级索引是基础能力，在文档数据库中也很常见，而且它们正是 Solr、Elasticsearch 等全文检索引擎的 *立身之本*。二级索引的难点在于，它们不能整齐地映射到分片。带二级索引的分片数据库主要有两种做法：本地索引与全局索引。

### 本地二级索引 {#id166}

例如，假设你正在运营一个出售二手车的网站（如 [图 7-9](#fig_sharding_local_secondary) 所示）。每条车辆信息（listing）都有一个唯一的 ID——称之为文档 ID——你使用该 ID 作为分区键对数据库进行分片（例如，ID 0 到 499 在分片 0 中，ID 500 到 999 在分片 1 中，等等）。

如果你想让用户搜索汽车，允许他们按颜色和制造商过滤，你需要在 `color` 和 `make` 上建立二级索引（在文档数据库中这些是字段；在关系数据库中这些是列）。如果你已声明索引，数据库就可以自动维护索引。例如，每当一辆红色汽车被写入数据库，所在分片会自动将其 ID 加入索引条目 `color:red` 对应的文档 ID 列表。正如 [第 4 章](/ch4#ch_storage) 所述，这个 ID 列表也称为 *倒排列表*。

{{< figure src="/fig/ddia_0709.png" id="fig_sharding_local_secondary" caption="图 7-9. 本地二级索引：每个分片只索引其自己分片内的记录。" class="w-full my-4" >}}

> [!WARNING] 警告

如果你的数据库只支持键值模型，你可能会尝试通过在应用程序代码中创建从值到文档 ID 的映射来自己实现二级索引。如果你走这条路，你需要格外小心，确保你的索引与底层数据保持一致。竞态条件和间歇性写入失败（其中某些更改已保存但其他更改未保存）很容易导致数据不同步——见 ["多对象事务的需求"](/ch8#sec_transactions_need)。

--------

在这种索引方法中，每个分片是完全独立的：每个分片维护自己的二级索引，仅覆盖该分片中的文档。它不关心存储在其他分片中的数据。每当你需要写入数据库——添加、删除或更新记录——你只需要处理包含你正在写入的文档 ID 的分片。出于这个原因，这种类型的二级索引被称为 *本地索引*。在信息检索上下文中，它也被称为 *文档分区索引* [^30]。

当从本地二级索引读取时，如果你已经知道你正在查找的记录的分区键，你可以只在适当的分片上执行搜索。此外，如果你只想要 *一些* 结果，而不需要全部，你可以将请求发送到任何分片。

但是，如果你想要所有结果并且事先不知道它们的分区键，你需要将查询发送到所有分片，并组合你收到的结果，因为匹配的记录可能分散在所有分片中。在 [图 7-9](#fig_sharding_local_secondary) 中，红色汽车出现在分片 0 和分片 1 中。

这种查询分片数据库的方法有时称为 *分散/收集*（scatter/gather），它可能使二级索引读取变得相当昂贵。即使并行查询各分片，分散/收集也容易导致尾部延迟放大（见 ["响应时间指标的使用"](/ch2#sec_introduction_slo_sla)）。它还会限制应用的可伸缩性：增加分片可以提升可存储数据量，但若每个查询仍需所有分片参与，查询吞吐量并不会随分片数增加而提升。

尽管如此，本地二级索引被广泛使用 [^31]：例如，MongoDB、Riak、Cassandra [^32]、Elasticsearch [^33]、SolrCloud 和 VoltDB [^34] 都使用本地二级索引。

### 全局二级索引 {#id167}

我们可以构建一个覆盖所有分片数据的 *全局索引*，而不是每个分片有自己的本地二级索引。但是，我们不能只将该索引存储在一个节点上，因为它可能会成为瓶颈并违背分片的目的。全局索引也必须进行分片，但它可以以不同于主键索引的方式进行分片。

[图 7-10](#fig_sharding_global_secondary) 说明了这可能是什么样子：来自所有分片的红色汽车的 ID 出现在索引的 `color:red` 下，但索引是分片的，以便以字母 *a* 到 *r* 开头的颜色出现在分片 0 中，以 *s* 到 *z* 开头的颜色出现在分片 1 中。汽车制造商的索引也类似地分区（分片边界在 *f* 和 *h* 之间）。

{{< figure src="/fig/ddia_0710.png" id="fig_sharding_global_secondary" caption="图 7-10. 全局二级索引反映来自所有分片的数据，并且本身按索引值进行分片。" class="w-full my-4" >}}

这种索引也称为 *基于词项分区* [^30]：回忆一下 ["全文检索"](/ch4#sec_storage_full_text)，在全文检索中，*词项* 是你可以搜索的文本中的关键字。这里我们将其推广为指二级索引中你可以搜索的任何值。

全局索引使用词项作为分区键，因此当你查找特定词项或值时，你可以找出需要查询哪个分片。和以前一样，分片可以包含连续的词项范围（如 [图 7-10](#fig_sharding_global_secondary)），或者你可以基于词项的哈希将词项分配给分片。

全局索引的优点是，只有一个查询条件时（如 *color = red*），只需从一个分片读取即可获得倒排列表。但如果你不仅要 ID，还要取回完整记录，仍然必须去负责这些 ID 的各个分片读取。

如果你有多个搜索条件或词项（例如搜索某种颜色且某个制造商的汽车，或搜索同一文本中出现的多个单词），这些词项很可能会落在不同分片。要计算两个条件的逻辑 AND，系统需要找出同时出现在两个倒排列表中的 ID。若倒排列表较短，这没问题；但若很长，把它们通过网络发送后再算交集就可能很慢 [^30]。

全局二级索引的另一个挑战是写入比本地索引更复杂，因为写入单个记录可能会影响索引的多个分片（文档中的每个词项可能在不同的分片或不同的节点上）。这使得二级索引与底层数据保持同步更加困难。一种选择是使用分布式事务来原子地更新存储主记录的分片及其二级索引（见 [第 8 章](/ch8#ch_transactions)）。

全局二级索引被 CockroachDB、TiDB 和 YugabyteDB 使用；DynamoDB 同时支持本地与全局二级索引。在 DynamoDB 中，写入会异步反映到全局索引，因此从全局索引读取到的结果可能是陈旧的（类似复制延迟，见 ["复制延迟的问题"](/ch6#sec_replication_lag)）。尽管如此，在读吞吐量高于写吞吐量且倒排列表不太长的场景下，全局索引仍然很有价值。


## 总结 {#summary}

在本章中，我们探讨了将大型数据集分片为更小子集的不同方法。当你有如此多的数据以至于在单台机器上存储和处理它不再可行时，分片是必要的。

分片的目标是在多台机器上均匀分布数据和查询负载，避免热点（负载不成比例高的节点）。这需要选择适合你的数据的分片方案，并在节点添加到集群或从集群中删除时重新平衡分片。

我们讨论了两种主要的分片方法：

**键范围分片**
: 其中键是有序的，分片拥有从某个最小值到某个最大值的所有键。排序的优点是可以进行高效的范围查询，但如果应用程序经常访问排序顺序中彼此接近的键，则存在热点风险。

  在这种方法中，当分片变得太大时，通常通过将范围分成两个子范围来动态重新平衡分片。

**哈希分片**
: 其中对每个键应用哈希函数，分片拥有一个哈希值范围（或者可以使用另一种一致性哈希算法将哈希映射到分片）。这种方法破坏了键的顺序，使范围查询效率低下，但可能更均匀地分布负载。

  当按哈希分片时，通常预先创建固定数量的分片，为每个节点分配多个分片，并在添加或删除节点时将整个分片从一个节点移动到另一个节点。像键范围一样分割分片也是可能的。

通常使用键的第一部分作为分区键（即，识别分片），并在该分片内按键的其余部分对记录进行排序。这样，你仍然可以在具有相同分区键的记录之间进行高效的范围查询。

我们还讨论了分片和二级索引之间的交互。二级索引也需要进行分片，有两种方法：

**本地二级索引**
: 其中二级索引与主键和值存储在同一个分片中。这意味着写入时只需要更新一个分片，但二级索引的查找需要从所有分片读取。

**全局二级索引**
: 它们基于索引值单独分片。二级索引中的条目可能引用来自主键所有分片的记录。写入记录时，可能需要更新多个二级索引分片；但读取倒排列表时，可以由单个分片提供（获取实际记录仍需从多个分片读取）。

最后，我们讨论了将查询路由到正确分片的技术，以及如何借助协调服务维护分片到节点的分配信息。

按设计，每个分片大体独立运行，这正是分片数据库能够扩展到多台机器的原因。然而，凡是需要同时写多个分片的操作都会变得棘手：例如，一个分片写入成功、另一个分片写入失败时会发生什么？这个问题将在后续章节中讨论。


### 参考

[^1]: Claire Giordano. [Understanding partitioning and sharding in Postgres and Citus](https://www.citusdata.com/blog/2023/08/04/understanding-partitioning-and-sharding-in-postgres-and-citus/). *citusdata.com*, August 2023. Archived at [perma.cc/8BTK-8959](https://perma.cc/8BTK-8959) 
[^2]: Brandur Leach. [Partitioning in Postgres, 2022 edition](https://brandur.org/fragments/postgres-partitioning-2022). *brandur.org*, October 2022. Archived at [perma.cc/Z5LE-6AKX](https://perma.cc/Z5LE-6AKX) 
[^3]: Raph Koster. [Database “sharding” came from UO?](https://www.raphkoster.com/2009/01/08/database-sharding-came-from-uo/) *raphkoster.com*, January 2009. Archived at [perma.cc/4N9U-5KYF](https://perma.cc/4N9U-5KYF) 
[^4]: Garrett Fidalgo. [Herding elephants: Lessons learned from sharding Postgres at Notion](https://www.notion.com/blog/sharding-postgres-at-notion). *notion.com*, October 2021. Archived at [perma.cc/5J5V-W2VX](https://perma.cc/5J5V-W2VX) 
[^5]: Ulrich Drepper. [What Every Programmer Should Know About Memory](https://www.akkadia.org/drepper/cpumemory.pdf). *akkadia.org*, November 2007. Archived at [perma.cc/NU6Q-DRXZ](https://perma.cc/NU6Q-DRXZ) 
[^6]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559) 
[^7]: Marco Slot. [Citus 12: Schema-based sharding for PostgreSQL](https://www.citusdata.com/blog/2023/07/18/citus-12-schema-based-sharding-for-postgres/). *citusdata.com*, July 2023. Archived at [perma.cc/R874-EC9W](https://perma.cc/R874-EC9W) 
[^8]: Robisson Oliveira. [Reducing the Scope of Impact with Cell-Based Architecture](https://docs.aws.amazon.com/pdfs/wellarchitected/latest/reducing-scope-of-impact-with-cell-based-architecture/reducing-scope-of-impact-with-cell-based-architecture.pdf). AWS Well-Architected white paper, Amazon Web Services, September 2023. Archived at [perma.cc/4KWW-47NR](https://perma.cc/4KWW-47NR) 
[^9]: Gwen Shapira. [Things DBs Don’t Do - But Should](https://www.thenile.dev/blog/things-dbs-dont-do). *thenile.dev*, February 2023. Archived at [perma.cc/C3J4-JSFW](https://perma.cc/C3J4-JSFW) 
[^10]: Malte Schwarzkopf, Eddie Kohler, M. Frans Kaashoek, and Robert Morris. [Position: GDPR Compliance by Construction](https://cs.brown.edu/people/malte/pub/papers/2019-poly-gdpr.pdf). At *Towards Polystores that manage multiple Databases, Privacy, Security and/or Policy Issues for Heterogenous Data* (Poly), August 2019. [doi:10.1007/978-3-030-33752-0\_3](https://doi.org/10.1007/978-3-030-33752-0_3) 
[^11]: Gwen Shapira. [Introducing pg\_karnak: Transactional schema migration across tenant databases](https://www.thenile.dev/blog/distributed-ddl). *thenile.dev*, November 2024. Archived at [perma.cc/R5RD-8HR9](https://perma.cc/R5RD-8HR9) 
[^12]: Arka Ganguli, Guido Iaquinti, Maggie Zhou, and Rafael Chacón. [Scaling Datastores at Slack with Vitess](https://slack.engineering/scaling-datastores-at-slack-with-vitess/). *slack.engineering*, December 2020. Archived at [perma.cc/UW8F-ALJK](https://perma.cc/UW8F-ALJK) 
[^13]: Ikai Lan. [App Engine Datastore Tip: Monotonically Increasing Values Are Bad](https://ikaisays.com/2011/01/25/app-engine-datastore-tip-monotonically-increasing-values-are-bad/). *ikaisays.com*, January 2011. Archived at [perma.cc/BPX8-RPJB](https://perma.cc/BPX8-RPJB) 
[^14]: Enis Soztutar. [Apache HBase Region Splitting and Merging](https://www.cloudera.com/blog/technical/apache-hbase-region-splitting-and-merging.html). *cloudera.com*, February 2013. Archived at [perma.cc/S9HS-2X2C](https://perma.cc/S9HS-2X2C) 
[^15]: Eric Evans. [Rethinking Topology in Cassandra](https://www.youtube.com/watch?v=Qz6ElTdYjjU). At *Cassandra Summit*, June 2013. Archived at [perma.cc/2DKM-F438](https://perma.cc/2DKM-F438) 
[^16]: Martin Kleppmann. [Java’s hashCode Is Not Safe for Distributed Systems](https://martin.kleppmann.com/2012/06/18/java-hashcode-unsafe-for-distributed-systems.html). *martin.kleppmann.com*, June 2012. Archived at [perma.cc/LK5U-VZSN](https://perma.cc/LK5U-VZSN) 
[^17]: Mostafa Elhemali, Niall Gallagher, Nicholas Gordon, Joseph Idziorek, Richard Krog, Colin Lazier, Erben Mo, Akhilesh Mritunjai, Somu Perianayagam, Tim Rath, Swami Sivasubramanian, James Christopher Sorenson III, Sroaj Sosothikul, Doug Terry, and Akshat Vig. [Amazon DynamoDB: A Scalable, Predictably Performant, and Fully Managed NoSQL Database Service](https://www.usenix.org/conference/atc22/presentation/elhemali). At *USENIX Annual Technical Conference* (ATC), July 2022. 
[^18]: Brandon Williams. [Virtual Nodes in Cassandra 1.2](https://www.datastax.com/blog/virtual-nodes-cassandra-12). *datastax.com*, December 2012. Archived at [perma.cc/N385-EQXV](https://perma.cc/N385-EQXV) 
[^19]: Branimir Lambov. [New Token Allocation Algorithm in Cassandra 3.0](https://www.datastax.com/blog/new-token-allocation-algorithm-cassandra-30). *datastax.com*, January 2016. Archived at [perma.cc/2BG7-LDWY](https://perma.cc/2BG7-LDWY) 
[^20]: David Karger, Eric Lehman, Tom Leighton, Rina Panigrahy, Matthew Levine, and Daniel Lewin. [Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web](https://people.csail.mit.edu/karger/Papers/web.pdf). At *29th Annual ACM Symposium on Theory of Computing* (STOC), May 1997. [doi:10.1145/258533.258660](https://doi.org/10.1145/258533.258660) 
[^21]: Damian Gryski. [Consistent Hashing: Algorithmic Tradeoffs](https://dgryski.medium.com/consistent-hashing-algorithmic-tradeoffs-ef6b8e2fcae8). *dgryski.medium.com*, April 2018. Archived at [perma.cc/B2WF-TYQ8](https://perma.cc/B2WF-TYQ8) 
[^22]: David G. Thaler and Chinya V. Ravishankar. [Using name-based mappings to increase hit rates](https://www.cs.kent.edu/~javed/DL/web/p1-thaler.pdf). *IEEE/ACM Transactions on Networking*, volume 6, issue 1, pages 1–14, February 1998. [doi:10.1109/90.663936](https://doi.org/10.1109/90.663936) 
[^23]: John Lamping and Eric Veach. [A Fast, Minimal Memory, Consistent Hash Algorithm](https://arxiv.org/abs/1406.2294). *arxiv.org*, June 2014. 
[^24]: Samuel Axon. [3% of Twitter’s Servers Dedicated to Justin Bieber](https://mashable.com/archive/justin-bieber-twitter). *mashable.com*, September 2010. Archived at [perma.cc/F35N-CGVX](https://perma.cc/F35N-CGVX) 
[^25]: Gerald Guo and Thawan Kooburat. [Scaling services with Shard Manager](https://engineering.fb.com/2020/08/24/production-engineering/scaling-services-with-shard-manager/). *engineering.fb.com*, August 2020. Archived at [perma.cc/EFS3-XQYT](https://perma.cc/EFS3-XQYT) 
[^26]: Sangmin Lee, Zhenhua Guo, Omer Sunercan, Jun Ying, Thawan Kooburat, Suryadeep Biswal, Jun Chen, Kun Huang, Yatpang Cheung, Yiding Zhou, Kaushik Veeraraghavan, Biren Damani, Pol Mauri Ruiz, Vikas Mehta, and Chunqiang Tang. [Shard Manager: A Generic Shard Management Framework for Geo-distributed Applications](https://dl.acm.org/doi/pdf/10.1145/3477132.3483546). *28th ACM SIGOPS Symposium on Operating Systems Principles* (SOSP), pages 553–569, October 2021. [doi:10.1145/3477132.3483546](https://doi.org/10.1145/3477132.3483546) 
[^27]: Scott Lystig Fritchie. [A Critique of Resizable Hash Tables: Riak Core & Random Slicing](https://www.infoq.com/articles/dynamo-riak-random-slicing/). *infoq.com*, August 2018. Archived at [perma.cc/RPX7-7BLN](https://perma.cc/RPX7-7BLN) 
[^28]: Andy Warfield. [Building and operating a pretty big storage system called S3](https://www.allthingsdistributed.com/2023/07/building-and-operating-a-pretty-big-storage-system.html). *allthingsdistributed.com*, July 2023. Archived at [perma.cc/6S7P-GLM4](https://perma.cc/6S7P-GLM4) 
[^29]: Rich Houlihan. [DynamoDB adaptive capacity: smooth performance for chaotic workloads (DAT327)](https://www.youtube.com/watch?v=kMY0_m29YzU). At *AWS re:Invent*, November 2017. 
[^30]: Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. [*Introduction to Information Retrieval*](https://nlp.stanford.edu/IR-book/). Cambridge University Press, 2008. ISBN: 978-0-521-86571-5, available online at [nlp.stanford.edu/IR-book](https://nlp.stanford.edu/IR-book/) 
[^31]: Michael Busch, Krishna Gade, Brian Larson, Patrick Lok, Samuel Luckenbill, and Jimmy Lin. [Earlybird: Real-Time Search at Twitter](https://cs.uwaterloo.ca/~jimmylin/publications/Busch_etal_ICDE2012.pdf). At *28th IEEE International Conference on Data Engineering* (ICDE), April 2012. [doi:10.1109/ICDE.2012.149](https://doi.org/10.1109/ICDE.2012.149) 
[^32]: Nadav Har’El. [Indexing in Cassandra 3](https://github.com/scylladb/scylladb/wiki/Indexing-in-Cassandra-3). *github.com*, April 2017. Archived at [perma.cc/3ENV-8T9P](https://perma.cc/3ENV-8T9P) 
[^33]: Zachary Tong. [Customizing Your Document Routing](https://www.elastic.co/blog/customizing-your-document-routing/). *elastic.co*, June 2013. Archived at [perma.cc/97VM-MREN](https://perma.cc/97VM-MREN) 
[^34]: Andrew Pavlo. [H-Store Frequently Asked Questions](https://hstore.cs.brown.edu/documentation/faq/). *hstore.cs.brown.edu*, October 2013. Archived at [perma.cc/X3ZA-DW6Z](https://perma.cc/X3ZA-DW6Z) 


================================================
FILE: content/zh/ch8.md
================================================
---
title: "8. 事务"
weight: 208
math: true
breadcrumbs: false
---

<a id="ch_transactions"></a>

![](/map/ch07.png)

> *有些作者声称，支持通用的两阶段提交代价太大，会带来性能与可用性的问题。我们认为，让程序员来处理过度使用事务导致的性能问题，总比缺少事务编程好得多。*
>
> James Corbett 等人，*Spanner：Google 的全球分布式数据库*（2012）

在数据系统的残酷现实中，很多事情都可能出错：

* 数据库软件或硬件可能在任意时刻发生故障（包括写操作进行到一半时）。
* 应用程序可能在任意时刻崩溃（包括一系列操作的中间）。
* 网络中断可能会意外切断应用程序与数据库的连接，或数据库节点之间的连接。
* 多个客户端可能会同时写入数据库，覆盖彼此的更改。
* 客户端可能读取到无意义的数据，因为数据只更新了一部分。
* 客户端之间的竞态条件可能导致令人惊讶的错误。

为了实现可靠性，系统必须处理这些故障，确保它们不会导致整个系统的灾难性故障。然而，实现容错机制需要大量工作。它需要仔细考虑所有可能出错的事情，并进行大量测试，以确保解决方案真正有效。

数十年来，*事务*一直是简化这些问题的首选机制。事务是应用程序将多个读写操作组合成一个逻辑单元的一种方式。从概念上讲，事务中的所有读写操作被视作单个操作来执行：整个事务要么成功（*提交*），要么失败（*中止*、*回滚*）。如果失败，应用程序可以安全地重试。对于事务来说，应用程序的错误处理变得简单多了，因为它不用再担心部分失败——即某些操作成功，某些失败（无论出于何种原因）。

如果你与事务打交道多年，它们可能看起来显而易见，但我们不应该将其视为理所当然。事务不是自然法则；它们是有目的地创建的，即为了*简化应用程序的编程模型*。通过使用事务，应用程序可以自由地忽略某些潜在的错误场景和并发问题，因为数据库会替应用处理好这些（我们称之为*安全保证*）。

并非所有应用程序都需要事务，有时弱化事务保证或完全放弃事务也有好处（例如，为了获得更高的性能或更高的可用性）。某些安全属性可以在没有事务的情况下实现。另一方面，事务可以防止很多麻烦：例如，邮局 Horizon 丑闻（参见["可靠性有多重要？"](/ch2#sidebar_reliability_importance)）背后的技术原因可能是底层会计系统缺乏 ACID 事务[^1]。

你如何确定是否需要事务？为了回答这个问题，我们首先需要准确理解事务可以提供哪些安全保证，以及相关的成本。尽管事务乍看起来很简单，但实际上有许多细微但重要的细节在起作用。

在本章中，我们将研究许多可能出错的案例，并探索数据库用于防范这些问题的算法。我们将特别深入并发控制领域，讨论可能发生的各种竞态条件，以及数据库如何实现*读已提交*、*快照隔离*和*可串行化*等隔离级别。

并发控制对单节点和分布式数据库都很重要。在本章后面的["分布式事务"](#sec_transactions_distributed)部分，我们将研究*两阶段提交*协议和在分布式事务中实现原子性的挑战。

## 事务到底是什么？ {#sec_transactions_overview}

今天，几乎所有的关系型数据库和一些非关系数据库都支持事务。它们大多遵循 1975 年由 IBM System R（第一个 SQL 数据库）引入的风格[^2] [^3] [^4]。尽管一些实现细节发生了变化，但总体思路在 50 年里几乎保持不变：MySQL、PostgreSQL、Oracle、SQL Server 等的事务支持与 System R 惊人地相似。

在 2000 年代后期，非关系（NoSQL）数据库开始流行起来。它们旨在通过提供新的数据模型选择（参见[第 3 章](/ch3#ch_datamodels)），以及默认包含复制（[第 6 章](/ch6#ch_replication)）和分片（[第 7 章](/ch7#ch_sharding)）来改进关系型数据库的现状。事务是这一运动的主要牺牲品：许多这一代数据库完全放弃了事务，或者重新定义了这个词，用来描述比以前理解的更弱的保证集。

围绕 NoSQL 分布式数据库的炒作导致了一种流行的信念，即事务从根本上不可伸缩，任何大规模系统都必须放弃事务以保持良好的性能和高可用性。最近，这种信念被证明是错误的。所谓 "NewSQL" 数据库，如 CockroachDB[^5]、TiDB[^6]、Spanner[^7]、FoundationDB[^8] 和 YugabyteDB 已经证明，事务系统同样可以具备很强的可伸缩性，并支持大数据量与高吞吐量。这些系统将分片与共识协议（[第 10 章](/ch10#ch_consistency)）结合，在大规模下提供强 ACID 保证。

然而，这并不意味着每个系统都必须是事务型的：与任何其他技术设计选择一样，事务有优点也有局限性。为了理解这些权衡，让我们深入了解事务可以提供的保证的细节——无论是在正常操作中还是在各种极端（但现实）的情况下。

### ACID 的含义 {#sec_transactions_acid}

事务提供的安全保证通常由众所周知的首字母缩略词 *ACID* 来描述，它代表*原子性*（Atomicity）、*一致性*（Consistency）、*隔离性*（Isolation）和*持久性*（Durability）。它由 Theo Härder 和 Andreas Reuter 于 1983 年提出[^9]，旨在为数据库中的容错机制建立精确的术语。

然而，在实践中，一个数据库的 ACID 实现并不等同于另一个数据库的实现。例如，正如我们将看到的，*隔离性*的含义有很多歧义[^10]。高层次的想法是合理的，但魔鬼在细节中。今天，当一个系统声称自己"符合 ACID"时，实际上你能期待什么保证并不清楚。不幸的是，ACID 基本上已经成为了一个营销术语。

（不符合 ACID 标准的系统有时被称为 *BASE*，它代表*基本可用*（Basically Available）、*软状态*（Soft state）和*最终一致性*（Eventual consistency）[^11]。这比 ACID 的定义更加模糊。似乎 BASE 唯一合理的定义是"非 ACID"；即，它几乎可以代表任何你想要的东西。）

让我们深入了解原子性、一致性、隔离性和持久性的定义，这将让我们提炼出事务的思想。

#### 原子性 {#sec_transactions_acid_atomicity}

一般来说，*原子*是指不能分解成更小部分的东西。这个词在计算机的不同分支中意味着相似但又微妙不同的东西。例如，在多线程编程中，如果一个线程执行原子操作，这意味着另一个线程无法看到该操作的半完成结果。系统只能处于操作之前或操作之后的状态，而不是介于两者之间。

相比之下，在 ACID 的上下文中，原子性*不是*关于并发的。它不描述如果几个进程试图同时访问相同的数据会发生什么，因为这包含在字母 *I*（*隔离性*）中（参见["隔离性"](#sec_transactions_acid_isolation)）。

相反，ACID 原子性描述了当客户端想要进行多次写入，但在某些写入被处理后发生故障时会发生什么——例如，进程崩溃、网络连接中断、磁盘变满或违反了某些完整性约束。如果这些写入被分组到一个原子事务中，并且由于故障无法完成（*提交*）事务，则事务被*中止*，数据库必须丢弃或撤消该事务中迄今为止所做的任何写入。

如果没有原子性，如果在进行多处更改的中途发生错误，很难知道哪些更改已经生效，哪些没有。应用程序可以重试，但这有进行两次相同更改的风险，导致数据重复或错误。原子性简化了这个问题：如果事务被中止，应用程序可以确定它没有改变任何东西，因此可以安全地重试。

在错误时中止事务并丢弃该事务的所有写入的能力是 ACID 原子性的定义特征。也许*可中止性*比*原子性*更好，但我们将坚持使用*原子性*，因为这是常用词。

#### 一致性 {#sec_transactions_acid_consistency}

*一致性*这个词被严重滥用：

* 在[第 6 章](/ch6#ch_replication)中，我们讨论了*副本一致性*和异步复制系统中出现的*最终一致性*问题（参见["复制延迟的问题"](/ch6#sec_replication_lag)）。
* 数据库的*一致快照*（例如，用于备份）是整个数据库在某一时刻存在的快照。更准确地说，它与先发生关系（happens-before relation）一致（参见["“先发生”关系和并发"](/ch6#sec_replication_happens_before)）：也就是说，如果快照包含在特定时间写入的值，那么它也反映了在该值写入之前发生的所有写入。
* *一致性哈希*是某些系统用于再平衡的分片方法（参见["一致性哈希"](/ch7#sec_sharding_consistent_hashing)）。
* 在 CAP 定理中（参见[第 10 章](/ch10#ch_consistency)），*一致性*一词用于表示*线性一致性*（参见["线性一致性"](/ch10#sec_consistency_linearizability)）。
* 在 ACID 的上下文中，*一致性*是指应用程序特定的数据库处于"良好状态"的概念。

不幸的是，同一个词至少有五种不同的含义。

ACID 一致性的思想是，你对数据有某些陈述（*不变式*）必须始终为真——例如，在会计系统中，所有账户的贷方和借方必须始终平衡。如果事务从满足这些不变式的有效数据库开始，并且事务期间的任何写入都保持有效性，那么你可以确定不变式始终得到满足。（不变式可能在事务执行期间暂时违反，但在事务提交时应该再次满足。）

如果你希望数据库强制执行你的不变式，你需要将它们声明为模式的一部分的*约束*。例如，外键约束、唯一性约束或检查约束（限制单个行中可以出现的值）通常用于对特定类型的不变式建模。更复杂的一致性要求有时可以使用触发器或物化视图建模[^12]。

然而，复杂的不变式可能很难或不可能使用数据库通常提供的约束来建模。在这种情况下，应用程序有责任正确定义其事务，以便它们保持一致性。如果你写入违反不变式的错误数据，但你没有声明这些不变式，数据库无法阻止你。因此，ACID 中的 C 通常取决于应用程序如何使用数据库，而不仅仅是数据库的属性。

#### 隔离性 {#sec_transactions_acid_isolation}

大多数数据库都会同时被多个客户端访问。如果它们读写数据库的不同部分，这没有问题，但如果它们访问相同的数据库记录，你可能会遇到并发问题（竞态条件）。

[图 8-1](#fig_transactions_increment) 是这种问题的一个简单例子。假设你有两个客户端同时递增存储在数据库中的计数器。每个客户端需要读取当前值，加 1，然后写回新值（假设数据库中没有内置的递增操作）。在[图 8-1](#fig_transactions_increment) 中，计数器应该从 42 增加到 44，因为发生了两次递增，但实际上由于竞态条件只增加到 43。

{{< figure src="/fig/ddia_0801.png" id="fig_transactions_increment" caption="图 8-1. 两个客户端并发递增计数器之间的竞态条件。" class="w-full my-4" >}}


ACID 意义上的*隔离性*意味着同时执行的事务彼此隔离：它们不能相互干扰。经典的数据库教科书将隔离性形式化为*可串行化*，这意味着每个事务可以假装它是唯一在整个数据库上运行的事务。数据库确保当事务已经提交时，结果与它们*串行*运行（一个接一个）相同，即使实际上它们可能是并发运行的[^13]。

然而，可串行化有性能成本。在实践中，许多数据库使用比可串行化更弱的隔离形式：也就是说，它们允许并发事务以有限的方式相互干扰。一些流行的数据库，如 Oracle，甚至没有实现它（Oracle 有一个称为"可串行化"的隔离级别，但它实际上实现了*快照隔离*，这是比可串行化更弱的保证[^10] [^14]）。这意味着某些类型的竞态条件仍然可能发生。我们将在["弱隔离级别"](#sec_transactions_isolation_levels)中探讨快照隔离和其他形式的隔离。

#### 持久性 {#durability}

数据库系统的目的是提供一个安全的地方来存储数据，而不用担心丢失它。*持久性*是一个承诺，即一旦事务成功提交，它写入的任何数据都不会被遗忘，即使发生硬件故障或数据库崩溃。

在单节点数据库中，持久性通常意味着数据已经写入非易失性存储，如硬盘或 SSD。普通的文件写入通常在发送到磁盘之前在内存中缓冲，这意味着如果突然断电它们将丢失；因此，许多数据库使用 `fsync()` 系统调用来确保数据真正写入磁盘。数据库通常还有预写日志或类似的机制（参见["使 B 树可靠"](/ch4#sec_storage_btree_wal)），这允许它们在写入过程中发生崩溃时恢复。

在复制数据库中，持久性可能意味着数据已成功复制到某些节点。为了提供持久性保证，数据库必须等到这些写入或复制完成，然后才报告事务成功提交。然而，如["可靠性和容错"](/ch2#sec_introduction_reliability)中所讨论的，完美的持久性不存在：如果所有硬盘和所有备份同时被销毁，显然你的数据库无法挽救你。

--------

<a id="sidebar_transactions_durability"></a>

> [!TIP] 复制与持久性

历史上，持久性意味着写入归档磁带。然后它被理解为写入磁盘或 SSD。最近，它已经适应为意味着复制。哪种实现更好？

事实是，没有什么是完美的：

* 如果你写入磁盘而机器死机，即使你的数据没有丢失，在你修复机器或将磁盘转移到另一台机器之前，它也是不可访问的。复制系统可以保持可用。
* 相关故障——停电或导致每个节点在特定输入上崩溃的错误——可以一次性摧毁所有副本（参见["可靠性和容错"](/ch2#sec_introduction_reliability)），失去任何仅在内存中的数据。因此，写入磁盘对于复制数据库仍然相关。
* 在异步复制系统中，当领导者变得不可用时，最近的写入可能会丢失（参见["处理节点故障"](/ch6#sec_replication_failover)）。
* 当电源突然切断时，SSD 尤其已被证明有时会违反它们本应提供的保证：即使是 `fsync` 也不能保证正常工作[^15]。磁盘固件可能有错误，就像任何其他类型的软件一样[^16] [^17]，例如，导致驱动器在正好运行 32,768 小时后发生故障[^18]。而且 `fsync` 很难正确使用；就连 PostgreSQL 也错误地使用了它 20 多年[^19] [^20] [^21]。
* 存储引擎和文件系统实现之间的微妙交互可能导致难以追踪的错误，并可能导致磁盘上的文件在崩溃后损坏[^22] [^23]。一个副本上的文件系统错误有时也会传播到其他副本[^24]。
* 磁盘上的数据可能在未被检测到的情况下逐渐损坏[^25]。如果数据已经损坏了一段时间，副本和最近的备份也可能损坏。在这种情况下，你需要尝试从历史备份中恢复数据。
* 一项关于 SSD 的研究发现，在前四年的运行中，30% 到 80% 的驱动器会出现至少一个坏块，其中只有一些可以通过固件纠正[^26]。机械硬盘的坏扇区率较低，但完全故障率高于 SSD。
* 当磨损的 SSD（经历了许多写/擦除周期）断电时，它可能在几周到几个月的时间尺度上开始丢失数据，具体取决于温度[^27]。对于磨损水平较低的驱动器，这不是问题[^28]。

在实践中，没有一种技术可以提供绝对保证。只有各种降低风险的技术，包括写入磁盘、复制到远程机器和备份——它们可以而且应该一起使用。一如既往，明智的做法是对任何理论上的"保证"持健康的怀疑态度。

--------

### 单对象与多对象操作 {#sec_transactions_multi_object}

回顾一下，在 ACID 中，原子性和隔离性描述了如果客户端在同一事务中进行多次写入，数据库应该做什么：

原子性
: 如果在写入序列的中途发生错误，事务应该被中止，并且到该点为止所做的写入应该被丢弃。换句话说，数据库让你免于担心部分失败，通过提供全有或全无的保证。

隔离性
: 并发运行的事务不应该相互干扰。例如，如果一个事务进行多次写入，那么另一个事务要么看到全部这些写入，要么一个也看不到，而不应该只看到其中的某个子集。

这些定义假设你想要同时修改多个对象（行、文档、记录）。这种*多对象事务*通常需要保持多块数据同步。[图 8-2](#fig_transactions_read_uncommitted) 显示了一个来自电子邮件应用程序的示例。要显示用户的未读消息数，你可以查询类似这样的内容：

```sql
SELECT COUNT(*) FROM emails WHERE recipient_id = 2 AND unread_flag = true
```

{{< figure src="/fig/ddia_0802.png" id="fig_transactions_read_uncommitted" caption="图 8-2. 违反隔离性：一个事务读取另一个事务的未提交写入（“脏读”）。" class="w-full my-4" >}}


然而，如果有很多电子邮件，你可能会发现这个查询太慢，并决定将未读消息的数量存储在一个单独的字段中（一种反规范化，我们在["规范化、反规范化和连接"](/ch3#sec_datamodels_normalization)中讨论）。现在，每当有新消息进来时，你必须增加未读计数器，每当消息被标记为已读时，你也必须减少未读计数器。

在[图 8-2](#fig_transactions_read_uncommitted) 中，用户 2 遇到了异常：邮箱列表显示有未读消息，但计数器显示零未读消息，因为计数器增量尚未发生。（如果电子邮件应用程序中的错误计数器看起来太微不足道，请考虑客户账户余额而不是未读计数器，以及支付事务而不是电子邮件。）隔离本可以通过确保用户 2 看到插入的电子邮件和更新的计数器，或者两者都不看到，但不是不一致的中间点，来防止这个问题。

[图 8-3](#fig_transactions_atomicity) 说明了对原子性的需求：如果在事务过程中某处发生错误，邮箱的内容和未读计数器可能会失去同步。在原子事务中，如果对计数器的更新失败，事务将被中止，插入的电子邮件将被回滚。

{{< figure src="/fig/ddia_0803.png" id="fig_transactions_atomicity" caption="图 8-3. 原子性确保如果发生错误，该事务的任何先前写入都会被撤消，以避免不一致的状态。" class="w-full my-4" >}}


多对象事务需要某种方式来确定哪些读写操作属于同一事务。在关系数据库中，这通常基于客户端与数据库服务器的 TCP 连接：在任何特定连接上，`BEGIN TRANSACTION` 和 `COMMIT` 语句之间的所有内容都被认为是同一事务的一部分。如果 TCP 连接中断，事务必须被中止。

另一方面，许多非关系数据库没有这样的方式来将操作组合在一起。即使有多对象 API（例如，键值存储可能有一个*多重写入*（multi-put）操作，在一个操作中更新多个键），这并不一定意味着它具有事务语义：该命令可能在某些键上成功而在其他键上失败，使数据库处于部分更新状态。

#### 单对象写入 {#sec_transactions_single_object}

当单个对象被更改时，原子性和隔离性也适用。例如，假设你正在向数据库写入 20 KB 的 JSON 文档：

* 如果在发送了前 10 KB 后网络连接中断，数据库是否存储了无法解析的 10 KB JSON 片段？
* 如果数据库正在覆盖磁盘上的先前值的过程中电源失效，你是否最终会将新旧值拼接在一起？
* 如果另一个客户端在写入过程中读取该文档，它会看到部分更新的值吗？

这些问题会令人非常困惑，因此存储引擎几乎普遍的目标是在一个节点上的单个对象（如键值对）上提供原子性和隔离性。原子性可以使用日志实现崩溃恢复（参见["使 B 树可靠"](/ch4#sec_storage_btree_wal)），隔离性可以使用每个对象上的锁来实现（一次只允许一个线程访问对象）。

某些数据库还提供更复杂的原子操作，例如递增操作，它消除了像[图 8-1](#fig_transactions_increment) 中那样的读-修改-写循环的需求。类似流行的是*条件写入*操作，它允许仅在值未被其他人并发更改时才进行写入（参见["条件写入（比较并设置）"](#sec_transactions_compare_and_set)），类似于共享内存并发中的比较并设置或比较并交换（CAS）操作。

--------

> [!NOTE]
> 严格来说，术语*原子递增*在多线程编程的意义上使用了*原子*这个词。在 ACID 的上下文中，它实际上应该被称为*隔离*或*可串行化*递增，但这不是通常的术语。

--------

这些单对象操作很有用，因为它们可以防止多个客户端尝试同时写入同一对象时的丢失更新（参见["防止丢失更新"](#sec_transactions_lost_update)）。然而，它们不是通常意义上的事务。例如，Cassandra 和 ScyllaDB 的"轻量级事务"功能以及 Aerospike 的"强一致性"模式在单个对象上提供线性一致（参见["线性一致性"](/ch10#sec_consistency_linearizability)）读取和条件写入，但不保证跨多个对象。

#### 多对象事务的需求 {#sec_transactions_need}

我们是否需要多对象事务？是否可能仅使用键值数据模型和单对象操作来实现任何应用程序？

在某些用例中，单对象插入、更新和删除就足够了。然而，在许多其他情况下，需要协调对多个不同对象的写入：

* 在关系数据模型中，一个表中的行通常具有对另一个表中行的外键引用。类似地，在类似图的数据模型中，顶点具有指向其他顶点的边。多对象事务允许你确保这些引用保持有效：插入引用彼此的多个记录时，外键必须正确且最新，否则数据变得毫无意义。
* 在文档数据模型中，需要一起更新的字段通常在同一文档内，它被视为单个对象——更新单个文档时不需要多对象事务。然而，缺乏连接功能的文档数据库也鼓励反规范化（参见["何时使用哪种模型"](/ch3#sec_datamodels_document_summary)）。当需要更新反规范化信息时，如[图 8-2](#fig_transactions_read_uncommitted) 的示例，你需要一次更新多个文档。事务在这种情况下非常有用，可以防止反规范化数据失去同步。
* 在具有二级索引的数据库中（几乎除了纯键值存储之外的所有数据库），每次更改值时都需要更新索引。从事务的角度来看，这些索引是不同的数据库对象：例如，如果没有事务隔离，记录可能出现在一个索引中但不在另一个索引中，因为对第二个索引的更新尚未发生（参见["分片和二级索引"](/ch7#sec_sharding_secondary_indexes)）。

这些应用程序仍然可以在没有事务的情况下实现。然而，没有原子性的错误处理变得更加复杂，缺乏隔离性可能导致并发问题。我们将在["弱隔离级别"](#sec_transactions_isolation_levels)中讨论这些问题，并在["派生数据与分布式事务"](/ch13#sec_future_derived_vs_transactions)中探索替代方法。

#### 处理错误和中止 {#handling-errors-and-aborts}

事务的一个关键特性是，如果发生错误，它可以被中止并安全地重试。ACID 数据库基于这样的哲学：如果数据库有违反其原子性、隔离性或持久性保证的危险，它宁愿完全放弃事务，也不允许它保持半完成状态。

然而，并非所有系统都遵循这种哲学。特别是，具有无主（无领导者）复制的数据存储（参见["无主（无领导者）复制"](/ch6#sec_replication_leaderless)）更多地以"尽力而为"的方式工作，可以总结为"数据库将尽其所能，如果遇到错误，它不会撤消已经完成的操作"——因此，从错误中恢复是应用程序的责任。

错误不可避免地会发生，但许多软件开发人员更愿意只考虑快乐路径，而不是错误处理的复杂性。例如，流行的对象关系映射（ORM）框架，如 Rails 的 ActiveRecord 和 Django，不会重试中止的事务——错误通常导致异常冒泡到堆栈中，因此任何用户输入都被丢弃，用户收到错误消息。这是一种遗憾，因为中止的全部意义是启用安全重试。

尽管重试中止的事务是一种简单有效的错误处理机制，但它并不完美：

* 如果事务实际上成功了，但在服务器尝试向客户端确认成功提交时网络中断（因此从客户端的角度来看超时），那么重试事务会导致它被执行两次——除非你有额外的应用程序级去重机制。
* 如果错误是由于过载或并发事务之间的高争用，重试事务会使问题变得更糟，而不是更好。为了避免这种反馈循环，你可以限制重试次数，使用指数退避，并以不同的方式处理与过载相关的错误与其他错误（参见["当过载系统无法恢复时"](/ch2#sidebar_metastable)）。
* 仅在瞬态错误后重试才值得（例如，由于死锁、隔离违规、临时网络中断和故障转移）；在永久错误后（例如，约束违规）重试将毫无意义。
* 如果事务在数据库之外也有副作用，即使事务被中止，这些副作用也可能发生。例如，如果你正在发送电子邮件，你不会希望每次重试事务时都再次发送电子邮件。如果你想确保几个不同的系统一起提交或中止，两阶段提交可以提供帮助（我们将在["两阶段提交（2PC）"](#sec_transactions_2pc)中讨论这个问题）。
* 如果客户端进程在重试时崩溃，它试图写入数据库的任何数据都会丢失。


## 弱隔离级别 {#sec_transactions_isolation_levels}

如果两个事务不访问相同的数据，或者都是只读的，它们可以安全地并行运行，因为它们互不依赖。仅当一个事务读取另一个事务并发修改的数据时，或者当两个事务尝试同时修改相同的数据时，才会出现并发问题（竞态条件）。

并发错误很难通过测试发现，因为这些错误只有在时机不巧时才会触发。这种时机问题可能非常罕见，通常难以重现。并发也很难推理，特别是在大型应用程序中，你不一定知道代码的其他部分正在访问数据库。如果只有一个用户，应用程序开发就已经够困难了；有许多并发用户会让情况变得更加困难，因为任何数据都可能在任何时候意外地发生变化。

出于这个原因，数据库长期以来一直试图通过提供*事务隔离*来向应用程序开发人员隐藏并发问题。理论上，隔离应该让你的生活更轻松，让你假装没有并发发生：*可串行化*隔离意味着数据库保证事务具有与*串行*运行（即一次一个，没有任何并发）相同的效果。

在实践中，隔离不幸并不那么简单。可串行化隔离有性能成本，许多数据库不愿意支付这个代价[^10]。因此，系统通常使用较弱的隔离级别，这些级别可以防止*某些*并发问题，但不是全部。这些隔离级别更难理解，它们可能导致微妙的错误，但它们在实践中仍然被使用[^29]。

由弱事务隔离引起的并发错误不仅仅是理论问题。它们已经导致了巨额资金损失[^30] [^31] [^32]，引发了金融审计师的调查[^33]，并导致客户数据损坏[^34]。针对此类问题的披露，一种常见的评论是"如果你正在处理金融数据，请使用 ACID 数据库！"——但这没有抓住重点。即使是许多流行的关系数据库系统（通常被认为是"ACID"的）也使用弱隔离，因此它们不一定能防止这些错误发生。

--------

> [!NOTE]
> 顺便说一句，银行系统的大部分依赖于通过安全 FTP 交换的文本文件[^35]。在这种情况下，拥有审计跟踪和一些人为级别的欺诈预防措施实际上比 ACID 属性更重要。

--------

这些例子还强调了一个重要观点：即使并发问题在正常操作中很少见，你也必须考虑攻击者故意向你的 API 发送大量高度并发请求以故意利用并发错误的可能性[^30]。因此，为了构建可靠和安全的应用程序，你必须确保系统地防止此类错误。

在本节中，我们将研究实践中使用的几种弱（非可串行化）隔离级别，并详细讨论哪些竞态条件可以发生和不能发生，以便你可以决定哪个级别适合你的应用程序。完成后，我们将详细讨论可串行化（参见["可串行化"](#sec_transactions_serializability)）。我们对隔离级别的讨论将是非正式的，使用示例。如果你想要严格的定义和对其属性的分析，你可以在学术文献中找到它们[^36] [^37] [^38] [^39]。

### 读已提交 {#sec_transactions_read_committed}

最基本的事务隔离级别是*读已提交*。它提供两个保证：

1. 从数据库读取时，你只会看到已经提交的数据（没有*脏读*）。
2. 写入数据库时，你只会覆盖已经提交的数据（没有*脏写*）。

某些数据库支持更弱的隔离级别，称为*读未提交*。它防止脏写，但不防止脏读。让我们更详细地讨论这两个保证。

#### 没有脏读 {#no-dirty-reads}

想象一个事务已经向数据库写入了一些数据，但事务尚未提交或中止。另一个事务能看到那个未提交的数据吗？如果能，这称为*脏读*[^3]。

在读已提交隔离级别下运行的事务必须防止脏读。这意味着事务的任何写入只有在该事务提交时才对其他人可见（然后它的所有写入立即变得可见）。这在[图 8-4](#fig_transactions_read_committed) 中说明，其中用户 1 已设置 *x* = 3，但用户 2 的 *get x* 仍返回旧值 2，因为用户 1 尚未提交。

{{< figure src="/fig/ddia_0804.png" id="fig_transactions_read_committed" caption="图 8-4. 没有脏读：用户 2 只有在用户 1 的事务提交后才能看到 x 的新值。" class="w-full my-4" >}}

有几个原因说明为什么防止脏读是有用的：

* 如果事务需要更新多行，脏读意味着另一个事务可能看到某些更新但不是其他更新。例如，在[图 8-2](#fig_transactions_read_uncommitted) 中，用户看到新的未读电子邮件但没有看到更新的计数器。这是电子邮件的脏读。看到数据库处于部分更新状态会让用户感到困惑，并可能导致其他事务做出错误的决定。
* 如果事务中止，它所做的任何写入都需要回滚（如[图 8-3](#fig_transactions_atomicity)）。如果数据库允许脏读，这意味着事务可能看到后来被回滚的数据——即从未实际提交到数据库的数据。任何读取未提交数据的事务也需要被中止，导致称为*级联中止*的问题。

#### 没有脏写 {#sec_transactions_dirty_write}

如果两个事务并发尝试更新数据库中的同一行会发生什么？我们不知道写入将以什么顺序发生，但我们通常假设后面的写入会覆盖前面的写入。

然而，如果前面的写入是尚未提交的事务的一部分，因此后面的写入覆盖了一个未提交的值，会发生什么？这称为*脏写*[^36]。在读已提交隔离级别下运行的事务必须防止脏写，通常通过延迟第二个写入直到第一个写入的事务已提交或中止。

通过防止脏写，这个隔离级别避免了某些类型的并发问题：

* 如果事务更新多行，脏写可能导致糟糕的结果。例如，考虑[图 8-5](#fig_transactions_dirty_writes)，它说明了一个二手车销售网站，两个人 Aaliyah 和 Bryce 同时尝试购买同一辆车。购买汽车需要两次数据库写入：网站上的列表需要更新以反映买家，销售发票需要发送给买家。在[图 8-5](#fig_transactions_dirty_writes) 的情况下，销售被授予 Bryce（因为他对 `listings` 表执行了获胜的更新），但发票被发送给 Aaliyah（因为她对 `invoices` 表执行了获胜的更新）。读已提交防止了这种事故。
* 然而，读已提交*不*防止[图 8-1](#fig_transactions_increment) 中两个计数器递增之间的竞态条件。在这种情况下，第二个写入发生在第一个事务提交之后，所以它不是脏写。它仍然是不正确的，但原因不同——在["防止丢失更新"](#sec_transactions_lost_update)中，我们将讨论如何使此类计数器递增安全。

{{< figure src="/fig/ddia_0805.png" id="fig_transactions_dirty_writes" caption="图 8-5. 有了脏写，来自不同事务的冲突写入可能会混在一起。" class="w-full my-4" >}}


#### 实现读已提交 {#sec_transactions_read_committed_impl}

读已提交是一个非常流行的隔离级别。它是 Oracle Database、PostgreSQL、SQL Server 和许多其他数据库中的默认设置[^10]。

最常见的是，数据库通过使用行级锁来防止脏写：当事务想要修改特定行（或文档或其他对象）时，它必须首先获取该行的锁。然后它必须持有该锁直到事务提交或中止。任何给定行只能有一个事务持有锁；如果另一个事务想要写入同一行，它必须等到第一个事务提交或中止后才能获取锁并继续。这种锁定由数据库在读已提交模式（或更强的隔离级别）下自动完成。

我们如何防止脏读？一种选择是使用相同的锁，并要求任何想要读取行的事务短暂地获取锁，然后在读取后立即再次释放它。这将确保在行具有脏的、未提交的值时无法进行读取（因为在此期间锁将由进行写入的事务持有）。

然而，要求读锁的方法在实践中效果不佳，因为一个长时间运行的写事务可以强制许多其他事务等待，直到长时间运行的事务完成，即使其他事务只读取并且不向数据库写入任何内容。这会损害只读事务的响应时间，并且对可操作性不利：应用程序一个部分的减速可能会由于等待锁而在应用程序的完全不同部分产生连锁效应。

尽管如此，在某些数据库中使用锁来防止脏读，例如 IBM Db2 和 Microsoft SQL Server 在 `read_committed_snapshot=off` 设置中[^29]。

防止脏读的更常用方法是[图 8-4](#fig_transactions_read_committed) 中说明的方法：对于每个被写入的行，数据库记住旧的已提交值和当前持有写锁的事务设置的新值。当事务正在进行时，任何其他读取该行的事务都只是被给予旧值。只有当新值被提交时，事务才会切换到读取新值（有关更多详细信息，请参见["多版本并发控制（MVCC）"](#sec_transactions_snapshot_impl)）。

### 快照隔离与可重复读 {#sec_transactions_snapshot_isolation}

如果你只是粗略地看待读已提交隔离，你可能会认为它已经做了事务需要做的一切，这也情有可原：它允许中止（原子性所需），它防止读取事务的不完整结果，并且它防止并发写入混在一起。确实，这些是有用的功能，比没有事务的系统能获得的保证要强得多。

然而，使用这个隔离级别时，仍然有很多方式可能出现并发错误。例如，[图 8-6](#fig_transactions_item_many_preceders) 说明了读已提交可能发生的问题。

{{< figure src="/fig/ddia_0806.png" id="fig_transactions_item_many_preceders" caption="图 8-6. 读取偏差：Aaliyah 观察到数据库处于不一致状态。" class="w-full my-4" >}}


假设 Aaliyah 在银行有 1,000 美元的储蓄，分成两个账户，每个 500 美元。现在一笔事务从她的一个账户转账 100 美元到另一个账户。如果她不幸在该事务处理的同时查看她的账户余额列表，她可能会看到一个账户余额在收款到达之前（余额为 500 美元），另一个账户在转出之后（新余额为 400 美元）。对 Aaliyah 来说，现在她的账户总共只有 900 美元——似乎 100 美元凭空消失了。

这种异常称为*读取偏差*，它是*不可重复读*的一个例子：如果 Aaliyah 在事务结束时再次读取账户 1 的余额，她会看到与之前查询中看到的不同的值（600 美元）。读取偏差在读已提交隔离下被认为是可接受的：Aaliyah 看到的账户余额确实是在她读取它们时已提交的。

--------

> [!NOTE]
> 术语*偏差*（skew，在其他语境中也译作"倾斜"）不幸地被重载了：我们之前在*具有热点的不平衡工作负载*的意义上使用它（参见["倾斜负载和缓解热点"](/ch7#sec_sharding_skew)），而这里它意味着*时序异常*。

--------

在 Aaliyah 的情况下，这不是一个持久的问题，因为如果她几秒钟后重新加载在线银行网站，她很可能会看到一致的账户余额。然而，某些情况不能容忍这种临时的不一致性：

备份
: 进行备份需要复制整个数据库，对于大型数据库可能需要几个小时。在备份过程运行期间，写入将继续对数据库进行。因此，你最终可能会得到备份的某些部分包含较旧版本的数据，而其他部分包含较新版本。如果你需要从这样的备份恢复，不一致性（如消失的钱）将变成永久性的。

分析查询和完整性检查
: 有时，你可能想要运行扫描数据库大部分的查询。此类查询在分析中很常见（参见["分析与运营系统"](/ch1#sec_introduction_analytics)），或者可能是定期完整性检查的一部分，以确保一切正常（监控数据损坏）。如果这些查询在不同时间点观察数据库的不同部分，它们很可能返回无意义的结果。

*快照隔离*[^36] 是解决这个问题的最常见方法。其思想是每个事务从数据库的*一致快照*读取——也就是说，事务看到事务开始时数据库中已提交的所有数据。即使数据随后被另一个事务更改，每个事务也只能看到该特定时间点的旧数据。

快照隔离对于长时间运行的只读查询（如备份和分析）来说是一个福音。如果查询操作的数据在查询执行的同时发生变化，很难推理查询的含义。当事务可以看到数据库的一致快照（冻结在特定时间点）时，理解起来就容易得多。

快照隔离是一个流行的功能：它的变体受到 PostgreSQL、使用 InnoDB 存储引擎的 MySQL、Oracle、SQL Server 等的支持，尽管详细行为因系统而异[^29] [^40] [^41]。某些数据库，如 Oracle、TiDB 和 Aurora DSQL，甚至选择快照隔离作为它们的最高隔离级别。

#### 多版本并发控制（MVCC） {#sec_transactions_snapshot_impl}

与读已提交隔离一样，快照隔离的实现通常使用写锁来防止脏写（参见["实现读已提交"](#sec_transactions_read_committed_impl)），这意味着进行写入的事务可以阻止写入同一行的另一个事务的进度。但是，读取不需要任何锁。从性能的角度来看，快照隔离的一个关键原则是*读者永远不会阻塞写者，写者永远不会阻塞读者*。这允许数据库在一致快照上处理长时间运行的读查询，同时正常处理写入，两者之间没有任何锁争用。

为了实现快照隔离，数据库使用了我们在[图 8-4](#fig_transactions_read_committed) 中看到的防止脏读机制的泛化。数据库必须潜在地保留每行的几个不同的已提交版本，而不是每行的两个版本（已提交版本和被覆盖但尚未提交的版本），因为各种正在进行的事务可能需要在不同时间点看到数据库的状态。因为它并排维护一行的多个版本，所以这种技术被称为*多版本并发控制*（MVCC）。

[图 8-7](#fig_transactions_mvcc) 说明了 PostgreSQL 中如何实现基于 MVCC 的快照隔离[^40] [^42] [^43]（其他实现类似）。当事务启动时，它被赋予一个唯一的、始终递增的事务 ID（`txid`）。每当事务向数据库写入任何内容时，它写入的数据都用写入者的事务 ID 标记。（准确地说，PostgreSQL 中的事务 ID 是 32 位整数，因此它们在大约 40 亿个事务后溢出。vacuum 进程会执行清理工作，以确保溢出不会影响数据。）

{{< figure src="/fig/ddia_0807.png" id="fig_transactions_mvcc" caption="图 8-7. 使用多版本并发控制实现快照隔离。" class="w-full my-4" >}}


表中的每一行都有一个 `inserted_by` 字段，包含将此行插入表中的事务的 ID。此外，每行都有一个 `deleted_by` 字段，最初为空。如果事务删除一行，该行实际上不会从数据库中删除，而是通过将 `deleted_by` 字段设置为请求删除的事务的 ID 来标记为删除。在稍后的某个时间，当确定没有事务可以再访问已删除的数据时，数据库中的垃圾收集过程会删除任何标记为删除的行并释放它们的空间。

更新在内部被转换为删除和插入[^44]。例如，在[图 8-7](#fig_transactions_mvcc) 中，事务 13 从账户 2 中扣除 100 美元，将余额从 500 美元更改为 400 美元。`accounts` 表现在实际上包含账户 2 的两行：余额为 500 美元的行被事务 13 标记为已删除，余额为 400 美元的行由事务 13 插入。

行的所有版本都存储在同一个数据库堆中（参见["在索引中存储值"](/ch4#sec_storage_index_heap)），无论写入它们的事务是否已提交。同一行的版本形成一个链表，从最新版本到最旧版本或相反，以便查询可以在内部迭代行的所有版本[^45] [^46]。

#### 观察一致快照的可见性规则 {#sec_transactions_mvcc_visibility}

当事务从数据库读取时，事务 ID 用于决定它可以看到哪些行版本以及哪些是不可见的。通过仔细定义可见性规则，数据库可以向应用程序呈现数据库的一致快照。这大致如下工作[^43]：

1. 在每个事务开始时，数据库列出当时正在进行（尚未提交或中止）的所有其他事务。这些事务所做的任何写入都被忽略，即使事务随后提交。这确保我们看到一个不受另一个事务提交影响的一致快照。
2. 具有较晚事务 ID（即在当前事务开始后开始，因此不包括在正在进行的事务列表中）的事务所做的任何写入都被忽略，无论这些事务是否已提交。
3. 中止事务所做的任何写入都被忽略，无论该中止何时发生。这样做的好处是，当事务中止时，我们不需要立即从存储中删除它写入的行，因为可见性规则会将它们过滤掉。垃圾收集过程可以稍后删除它们。
4. 所有其他写入对应用程序的查询可见。

这些规则适用于行的插入和删除。在[图 8-7](#fig_transactions_mvcc) 中，当事务 12 从账户 2 读取时，它看到 500 美元的余额，因为 500 美元余额的删除是由事务 13 进行的（根据规则 2，事务 12 无法看到事务 13 进行的删除），而 400 美元余额的插入尚不可见（根据相同的规则）。

换句话说，如果以下两个条件都为真，则行是可见的：

* 在读者事务开始时，插入该行的事务已经提交。
* 该行未标记为删除，或者如果是，请求删除的事务在读者事务开始时尚未提交。

长时间运行的事务可能会长时间继续使用快照，继续读取（从其他事务的角度来看）早已被覆盖或删除的值。通过从不原地更新值，而是在每次更改值时插入新版本，数据库可以提供一致的快照，同时只产生很小的开销。

<a id="sec_transactions_snapshot_indexes"></a>

#### 索引与快照隔离 {#indexes-and-snapshot-isolation}

索引如何在多版本数据库中工作？最常见的方法是每个索引条目指向与该条目匹配的行的一个版本（最旧或最新版本）。每个行版本可能包含对下一个最旧或下一个最新版本的引用。使用索引的查询必须迭代行以找到可见的行，并且值与查询要查找的内容匹配。当垃圾收集删除不再对任何事务可见的旧行版本时，相应的索引条目也可以被删除。

许多实现细节影响多版本并发控制的性能[^45] [^46]。例如，如果同一行的不同版本可以适合同一页面，PostgreSQL 有避免索引更新的优化[^40]。其他一些数据库避免存储修改行的完整副本，而只存储版本之间的差异以节省空间。

CouchDB、Datomic 和 LMDB 使用另一种方法。尽管它们也使用 B 树（参见["B 树"](/ch4#sec_storage_b_trees)），但它们使用*不可变*（写时复制）变体，在更新时不会覆盖树的页面，而是创建每个修改页面的新副本。父页面，直到树的根，被复制并更新以指向其子页面的新版本。任何不受写入影响的页面都不需要复制，并且可以与新树共享[^47]。

使用不可变 B 树，每个写事务（或事务批次）都会创建一个新的 B 树根，特定的根是创建时数据库的一致快照。不需要基于事务 ID 过滤行，因为后续写入无法修改现有的 B 树；它们只能创建新的树根。这种方法还需要后台进程进行压缩和垃圾收集。

#### 快照隔离、可重复读和命名混淆 {#snapshot-isolation-repeatable-read-and-naming-confusion}

MVCC 是数据库常用的实现技术，通常用于实现快照隔离。然而，不同的数据库有时使用不同的术语来指代同一件事：例如，快照隔离在 PostgreSQL 中称为"可重复读"，在 Oracle 中称为"可串行化"[^29]。有时不同的系统使用相同的术语来表示不同的东西：例如，虽然在 PostgreSQL 中"可重复读"意味着快照隔离，但在 MySQL 中它意味着比快照隔离更弱一致性的 MVCC 实现[^41]。

这种命名混淆的原因是 SQL 标准没有快照隔离的概念，因为该标准基于 System R 1975 年的隔离级别定义[^3]，而快照隔离当时还没有被发明。相反，它定义了可重复读，表面上看起来类似于快照隔离。PostgreSQL 将其快照隔离级别称为"可重复读"，因为它符合标准的要求，因此他们可以声称符合标准。

不幸的是，SQL 标准对隔离级别的定义是有缺陷的——它是模糊的、不精确的，并且不像标准应该的那样独立于实现[^36]。即使几个数据库实现了可重复读，它们实际提供的保证也有很大差异，尽管表面上是标准化的[^29]。研究文献中有可重复读的正式定义[^37] [^38]，但大多数实现不满足该正式定义。更糟糕的是，IBM Db2 使用"可重复读"来指代可串行化[^10]。

因此，没有人真正知道可重复读意味着什么。

### 防止丢失更新 {#sec_transactions_lost_update}

到目前为止，我们讨论的读已提交和快照隔离级别主要是关于只读事务在并发写入存在的情况下可以看到什么的保证。我们大多忽略了两个事务并发写入的问题——我们只讨论了脏写（参见["没有脏写"](#sec_transactions_dirty_write)），这是可能发生的一种特定类型的写-写冲突。

并发写入事务之间还可能发生其他几种有趣的冲突。其中最著名的是*丢失更新*问题，在[图 8-1](#fig_transactions_increment) 中以两个并发计数器递增的例子说明。

如果应用程序从数据库读取某个值，修改它，然后写回修改后的值（*读-修改-写循环*），就会出现丢失更新问题。如果两个事务并发执行此操作，其中一个修改可能会丢失，因为第二个写入不包括第一个修改。（我们有时说后面的写入*覆盖*了前面的写入。）这种模式出现在各种不同的场景中：

* 递增计数器或更新账户余额（需要读取当前值，计算新值，并写回更新的值）
* 对复杂值进行本地更改，例如，向 JSON 文档中的列表添加元素（需要解析文档，进行更改，并写回修改后的文档）
* 两个用户同时编辑 wiki 页面，每个用户通过将整个页面内容发送到服务器来保存他们的更改，覆盖数据库中当前的任何内容

因为这是一个如此常见的问题，已经开发了各种解决方案[^48]。

#### 原子写操作 {#atomic-write-operations}

许多数据库提供原子更新操作，消除了在应用程序代码中实现读-修改-写循环的需要。如果你的代码可以用这些操作来表达，它们通常是最好的解决方案。例如，以下指令在大多数关系数据库中是并发安全的：

```sql
UPDATE counters SET value = value + 1 WHERE key = 'foo';
```

类似地，文档数据库（如 MongoDB）提供原子操作来对 JSON 文档的一部分进行本地修改，Redis 提供原子操作来修改数据结构（如优先级队列）。并非所有写入都可以轻松地用原子操作来表达——例如，对 wiki 页面的更新涉及任意文本编辑，可以使用["CRDT 和操作转换"](/ch6#sec_replication_crdts)中讨论的算法来处理——但在可以使用原子操作的情况下，它们通常是最佳选择。

原子操作通常通过在读取对象时对其加独占锁来实现，以便在应用更新之前没有其他事务可以读取它。另一种选择是简单地强制所有原子操作在单个线程上执行。

不幸的是，使用对象关系映射（ORM）框架时，很容易意外地写出执行不安全的读-修改-写循环的代码，而不是使用数据库提供的原子操作[^49] [^50] [^51]。这可能成为难以通过测试发现的微妙错误的来源。

#### 显式锁定 {#explicit-locking}

如果数据库的内置原子操作不提供必要的功能，另一个防止丢失更新的选项是应用程序显式锁定要更新的对象。然后应用程序可以执行读-修改-写循环，如果任何其他事务尝试并发更新或锁定同一对象，它将被迫等到第一个读-修改-写循环完成。

例如，考虑一个多人游戏，其中几个玩家可以同时移动同一个棋子。在这种情况下，原子操作可能不够，因为应用程序还需要确保玩家的移动遵守游戏规则，这涉及一些你无法合理地作为数据库查询实现的逻辑。相反，你可以使用锁来防止两个玩家同时移动同一个棋子，如[例 8-1](#fig_transactions_select_for_update) 所示。

{{< figure id="fig_transactions_select_for_update" title="例 8-1. 显式锁定行以防止丢失更新" class="w-full my-4" >}}

```sql
BEGIN TRANSACTION;

SELECT * FROM figures
    WHERE name = 'robot' AND game_id = 222
    FOR UPDATE; ❶

-- 检查移动是否有效，然后更新
-- 前一个 SELECT 返回的棋子的位置。
UPDATE figures SET position = 'c4' WHERE id = 1234;

COMMIT;
```

❶：`FOR UPDATE` 子句表示数据库应该对此查询返回的所有行进行锁定。

这是有效的，但要正确执行，你需要仔细考虑你的应用程序逻辑。很容易忘记在代码中的某个地方添加必要的锁，从而引入竞态条件。

此外，如果你锁定多个对象，则存在死锁的风险，其中两个或多个事务正在等待彼此释放锁。许多数据库会自动检测死锁，并中止涉及的事务之一，以便系统可以取得进展。你可以在应用程序级别通过重试中止的事务来处理这种情况。

#### 自动检测丢失的更新 {#automatically-detecting-lost-updates}

原子操作和锁是通过强制读-修改-写循环按顺序发生来防止丢失更新的方法。另一种选择是允许它们并行执行，如果事务管理器检测到丢失的更新，则中止事务并强制它重试其读-修改-写循环。

这种方法的一个优点是数据库可以与快照隔离一起有效地执行此检查。实际上，PostgreSQL 的可重复读、Oracle 的可串行化和 SQL Server 的快照隔离级别会自动检测何时发生丢失的更新并中止有问题的事务。然而，MySQL/InnoDB 的可重复读不检测丢失的更新[^29] [^41]。一些作者[^36] [^38] 认为数据库必须防止丢失的更新才能提供快照隔离，因此根据这个定义，MySQL 不提供快照隔离。

丢失更新检测是一个很好的功能，因为它不需要应用程序代码使用任何特殊的数据库功能——你可能忘记使用锁或原子操作从而引入错误，但丢失更新检测会自动发生，因此不太容易出错。但是，你还必须在应用程序级别重试中止的事务。

#### 条件写入（比较并设置） {#sec_transactions_compare_and_set}

在不提供事务的数据库中，你有时会发现一个*条件写入*操作，它可以通过仅在值自你上次读取以来未更改时才允许更新来防止丢失的更新（之前在["单对象写入"](#sec_transactions_single_object)中提到）。如果当前值与你之前读取的不匹配，则更新无效，必须重试读-修改-写循环。它是许多 CPU 支持的原子*比较并设置*或*比较并交换*（CAS）指令的数据库等价物。

例如，为了防止两个用户同时更新同一个 wiki 页面，你可以尝试类似这样的操作，期望仅当页面内容自用户开始编辑以来没有更改时才进行更新：

```sql
-- 这可能安全也可能不安全，取决于数据库实现
UPDATE wiki_pages SET content = 'new content'
    WHERE id = 1234 AND content = 'old content';
```

如果内容已更改并且不再匹配 `'old content'`，则此更新将无效，因此你需要检查更新是否生效并在必要时重试。你也可以使用在每次更新时递增的版本号列，并且仅在当前版本号未更改时才应用更新，而不是比较完整内容。这种方法有时称为*乐观锁定*[^52]。

请注意，如果另一个事务并发修改了 `content`，则根据 MVCC 可见性规则，新内容可能不可见（参见["观察一致快照的可见性规则"](#sec_transactions_mvcc_visibility)）。MVCC 的许多实现对此场景有可见性规则的例外，其中其他事务写入的值对 `UPDATE` 和 `DELETE` 查询的 `WHERE` 子句的评估可见，即使这些写入在快照中不可见。

#### 冲突解决与复制 {#conflict-resolution-and-replication}

在复制数据库中（参见[第 6 章](/ch6#ch_replication)），防止丢失的更新具有另一个维度：由于它们在多个节点上有数据副本，并且数据可能在不同节点上并发修改，因此需要采取一些额外的步骤来防止丢失的更新。

锁和条件写入操作假设有一个最新的数据副本。然而，具有多领导者或无主（无领导者）复制的数据库通常允许多个写入并发发生并异步复制它们，因此它们不能保证有一个最新的数据副本。因此，基于锁或条件写入的技术在此上下文中不适用。（我们将在["线性一致性"](/ch10#sec_consistency_linearizability)中更详细地重新讨论这个问题。）

相反，如["处理冲突写入"](/ch6#sec_replication_write_conflicts)中所讨论的，此类复制数据库中的常见方法是允许并发写入创建值的多个冲突版本（也称为*兄弟节点*），并使用应用程序代码或特殊数据结构在事后解决和合并这些版本。

如果更新是可交换的（即，你可以在不同副本上以不同顺序应用它们，仍然得到相同的结果），合并冲突值可以防止丢失的更新。例如，递增计数器或向集合添加元素是可交换操作。这就是 CRDT 背后的想法，我们在["CRDT 和操作转换"](/ch6#sec_replication_crdts)中遇到过。然而，某些操作（如条件写入）不能成为可交换的。

另一方面，*最后写入胜利*（LWW）冲突解决方法容易丢失更新，如["最后写入胜利（丢弃并发写入）"](/ch6#sec_replication_lww)中所讨论的。不幸的是，LWW 是许多复制数据库中的默认值。

### 写偏差与幻读 {#sec_transactions_write_skew}

在前面的部分中，我们看到了*脏写*和*丢失更新*，这是当不同事务并发尝试写入相同对象时可能发生的两种竞态条件。为了避免数据损坏，需要防止这些竞态条件——要么由数据库自动防止，要么通过使用锁或原子写操作等手动保护措施。

然而，这并不是并发写入之间可能发生的潜在竞态条件列表的结尾。在本节中，我们将看到一些更微妙的冲突示例。

首先，想象这个例子：你正在为医生编写一个应用程序来管理他们在医院的值班班次。医院通常试图在任何时候都有几位医生值班，但绝对必须至少有一位医生值班。医生可以放弃他们的班次（例如，如果他们自己生病了），前提是该班次中至少还有一位同事在值班[^53] [^54]。

现在想象 Aaliyah 和 Bryce 是特定班次的两位值班医生。两人都感觉不舒服，所以他们都决定请假。不幸的是，他们碰巧大约在同一时间点击了下班的按钮。接下来发生的事情如[图 8-8](#fig_transactions_write_skew) 所示。

{{< figure src="/fig/ddia_0808.png" id="fig_transactions_write_skew" caption="图 8-8. 写偏差导致应用程序错误的示例。" class="w-full my-4" >}}


在每个事务中，你的应用程序首先检查当前是否有两个或更多医生在值班；如果是，它假设一个医生下班是安全的。由于数据库使用快照隔离，两个检查都返回 `2`，因此两个事务都继续到下一阶段。Aaliyah 更新她自己的记录让自己下班，Bryce 同样更新他自己的记录。两个事务都提交，现在没有医生值班。你至少有一个医生值班的要求被违反了。

#### 写偏差的特征 {#characterizing-write-skew}

这种异常称为*写偏差*[^36]。它既不是脏写也不是丢失的更新，因为两个事务正在更新两个不同的对象（分别是 Aaliyah 和 Bryce 的值班记录）。这里发生冲突不太明显，但这绝对是一个竞态条件：如果两个事务一个接一个地运行，第二个医生将被阻止下班。异常行为只有在事务并发运行时才可能。

你可以将写偏差视为丢失更新问题的泛化。如果两个事务读取相同的对象，然后更新其中一些对象（不同的事务可能更新不同的对象），就会发生写偏差。在不同事务更新同一对象的特殊情况下，你会得到脏写或丢失更新异常（取决于时机）。

我们看到有各种不同的方法可以防止丢失的更新。对于写偏差，我们的选择更受限制：

* 原子单对象操作没有帮助，因为涉及多个对象。
* 不幸的是，你在某些快照隔离实现中发现的丢失更新的自动检测也没有帮助：写偏差在 PostgreSQL 的可重复读、MySQL/InnoDB 的可重复读、Oracle 的可串行化或 SQL Server 的快照隔离级别中不会自动检测到[^29]。自动防止写偏差需要真正的可串行化隔离（参见["可串行化"](#sec_transactions_serializability)）。
* 某些数据库允许你配置约束，然后由数据库强制执行（例如，唯一性、外键约束或对特定值的限制）。但是，为了指定至少有一个医生必须值班，你需要一个涉及多个对象的约束。大多数数据库没有对此类约束的内置支持，但你可能能够使用触发器或物化视图实现它们，如["一致性"](#sec_transactions_acid_consistency)中所讨论的[^12]。
* 如果你不能使用可串行化隔离级别，在这种情况下，第二好的选择可能是显式锁定事务所依赖的行。在医生示例中，你可以编写如下内容：

    ```sql
    BEGIN TRANSACTION;

    SELECT * FROM doctors
        WHERE on_call = true
        AND shift_id = 1234 FOR UPDATE; ❶

    UPDATE doctors
       SET on_call = false
       WHERE name = 'Aaliyah'
       AND shift_id = 1234;

    COMMIT;
    ```

❶：和以前一样，`FOR UPDATE` 告诉数据库锁定此查询返回的所有行。

#### 写偏差的更多例子 {#more-examples-of-write-skew}

写偏差起初可能看起来是一个深奥的问题，但一旦你意识到它，你可能会注意到更多可能发生的情况。以下是更多示例：

会议室预订系统
: 假设你想强制同一会议室在同一时间不能有两个预订[^55]。当有人想要预订时，你首先检查是否有任何冲突的预订（即，具有重叠时间范围的同一房间的预订），如果没有找到，你就创建会议（参见[例 8-2](#fig_transactions_meeting_rooms)）。
    
    {{< figure id="fig_transactions_meeting_rooms" title="例 8-2. 会议室预订系统试图避免重复预订（在快照隔离下不安全）" class="w-full my-4" >}}
    
    ```sql
    BEGIN TRANSACTION;
    
    -- 检查是否有任何现有预订与中午 12 点到 1 点的时间段重叠
    SELECT COUNT(*) FROM bookings
    WHERE room_id = 123 AND
    end_time > '2025-01-01 12:00' AND start_time < '2025-01-01 13:00';
    
    -- 如果前一个查询返回零：
    INSERT INTO bookings (room_id, start_time, end_time, user_id)
    VALUES (123, '2025-01-01 12:00', '2025-01-01 13:00', 666);
    
    COMMIT;
    ```

     不幸的是，快照隔离不会阻止另一个用户并发插入冲突的会议。为了保证你不会出现调度冲突，你再次需要可串行化隔离。

多人游戏
: 在[例 8-1](#fig_transactions_select_for_update) 中，我们使用锁来防止丢失的更新（即，确保两个玩家不能同时移动同一个棋子）。但是，锁不会阻止玩家将两个不同的棋子移动到棋盘上的同一位置，或者可能做出违反游戏规则的其他移动。根据你要执行的规则类型，你可能能够使用唯一约束，但否则你很容易受到写偏差的影响。

声明用户名
: 在每个用户都有唯一用户名的网站上，两个用户可能同时尝试使用相同的用户名创建账户。你可以使用事务来检查名称是否被占用，如果没有，使用该名称创建账户。但是，就像前面的例子一样，这在快照隔离下是不安全的。幸运的是，唯一约束在这里是一个简单的解决方案（尝试注册用户名的第二个事务将由于违反约束而被中止）。

防止重复消费
: 允许用户花钱或积分的服务需要检查用户不会花费超过他们拥有的。你可以通过在用户账户中插入暂定支出项目，列出账户中的所有项目，并检查总和是否为正来实现这一点。有了写偏差，可能会发生两个支出项目并发插入，它们一起导致余额变为负数，但没有任何事务注意到另一个。

#### 导致写偏差的幻读 {#sec_transactions_phantom}

所有这些例子都遵循类似的模式：

1. `SELECT` 查询通过搜索匹配某些搜索条件的行来检查是否满足某些要求（至少有两个医生值班，该房间在该时间没有现有预订，棋盘上的位置还没有另一个棋子，用户名尚未被占用，账户中仍有钱）。
2. 根据第一个查询的结果，应用程序代码决定如何继续（也许继续操作，或者向用户报告错误并中止）。
3. 如果应用程序决定继续，它会向数据库进行写入（`INSERT`、`UPDATE` 或 `DELETE`）并提交事务。

 此写入的效果改变了步骤 2 决策的前提条件。换句话说，如果你在提交写入后重复步骤 1 的 `SELECT` 查询，你会得到不同的结果，因为写入改变了匹配搜索条件的行集（现在少了一个医生值班，会议室现在已为该时间预订，棋盘上的位置现在被移动的棋子占据，用户名现在被占用，账户中的钱现在更少）。

步骤可能以不同的顺序发生。例如，你可以先进行写入，然后进行 `SELECT` 查询，最后根据查询结果决定是中止还是提交。

在医生值班示例的情况下，步骤 3 中被修改的行是步骤 1 中返回的行之一，因此我们可以通过锁定步骤 1 中的行（`SELECT FOR UPDATE`）来使事务安全并避免写偏差。但是，其他四个示例是不同的：它们检查*不存在*匹配某些搜索条件的行，而写入*添加*了匹配相同条件的行。如果步骤 1 中的查询不返回任何行，`SELECT FOR UPDATE` 就无法附加锁[^56]。

这种效果，其中一个事务中的写入改变另一个事务中搜索查询的结果，称为*幻读*[^4]。快照隔离避免了只读查询中的幻读，但在我们讨论的读写事务中，幻读可能导致特别棘手的写偏差情况。ORM 生成的 SQL 也容易出现写偏差[^50] [^51]。

#### 物化冲突 {#materializing-conflicts}

如果幻读的问题是没有对象可以附加锁，也许我们可以在数据库中人为地引入一个锁对象？

例如，在会议室预订情况下，你可以想象创建一个时间段和房间的表。此表中的每一行对应于特定时间段（例如，15 分钟）的特定房间。你提前为所有可能的房间和时间段组合创建行，例如，接下来的六个月。

现在，想要创建预订的事务可以锁定（`SELECT FOR UPDATE`）表中对应于所需房间和时间段的行。获取锁后，它可以像以前一样检查重叠的预订并插入新的预订。请注意，附加表不用于存储有关预订的信息——它纯粹是一组锁，用于防止同一房间和时间范围的预订被并发修改。

这种方法称为*物化冲突*，因为它把幻读转化为数据库中实际存在的一组具体行上的锁冲突[^14]。不幸的是，弄清楚如何物化冲突既困难又容易出错，并且让并发控制机制泄漏到应用程序数据模型中是丑陋的。出于这些原因，物化冲突应被视为在没有其他选择时的最后手段。在大多数情况下，可串行化隔离级别要好得多。


## 可串行化 {#sec_transactions_serializability}

在本章中，我们已经看到了几个容易出现竞态条件的事务示例。某些竞态条件被读已提交和快照隔离级别所防止，但其他的则没有。我们遇到了一些特别棘手的写偏差和幻读示例。这是一个令人沮丧的情况：

* 隔离级别很难理解，并且在不同数据库中的实现不一致（例如，"可重复读"的含义差异很大）。
* 如果你查看你的应用程序代码，很难判断在特定隔离级别下运行是否安全——特别是在大型应用程序中，你可能不知道所有可能并发发生的事情。
* 没有好的工具来帮助我们检测竞态条件。原则上，静态分析可能有所帮助[^33]，但研究技术尚未进入实际使用。测试并发问题很困难，因为它们通常是非确定性的——只有在时机不巧时才会出现问题。

这不是一个新问题——自 1970 年代引入弱隔离级别以来一直如此[^3]。一直以来，研究人员的答案都很简单：使用*可串行化*隔离！

可串行化隔离是最强的隔离级别。它保证即使事务可能并行执行，最终结果与它们*串行*执行（一次一个，没有任何并发）相同。因此，数据库保证如果事务在单独运行时行为正确，那么在并发运行时它们继续保持正确——换句话说，数据库防止了*所有*可能的竞态条件。

但如果可串行化隔离比弱隔离级别的混乱要好得多，那为什么不是每个人都在使用它？要回答这个问题，我们需要查看实现可串行化的选项，以及它们的性能如何。今天提供可串行化的大多数数据库使用以下三种技术之一，我们将在本章的其余部分探讨：

* 字面上串行执行事务（参见["实际串行执行"](#sec_transactions_serial)）
* 两阶段锁定（参见["两阶段锁定（2PL）"](#sec_transactions_2pl)），几十年来这是唯一可行的选择
* 乐观并发控制技术，如可串行化快照隔离（参见["可串行化快照隔离（SSI）"](#sec_transactions_ssi)）

### 实际串行执行 {#sec_transactions_serial}

避免并发问题的最简单方法是完全消除并发：在单个线程上按串行顺序一次执行一个事务。通过这样做，我们完全回避了检测和防止事务之间冲突的问题：所产生的隔离根据定义是可串行化的。

尽管这似乎是一个显而易见的想法，但直到 2000 年代，数据库设计者才决定执行事务的单线程循环是可行的[^57]。如果在过去 30 年中多线程并发被认为是获得良好性能的必要条件，那是什么改变使得单线程执行成为可能？

两个发展导致了这种重新思考：

* RAM 变得足够便宜，对于许多用例，现在可以将整个活动数据集保存在内存中（参见["将所有内容保存在内存中"](/ch4#sec_storage_inmemory)）。当事务需要访问的所有数据都在内存中时，事务的执行速度比必须等待从磁盘加载数据要快得多。
* 数据库设计者意识到 OLTP 事务通常很短，只进行少量读写（参见["分析与运营系统"](/ch1#sec_introduction_analytics)）。相比之下，长时间运行的分析查询通常是只读的，因此它们可以在串行执行循环之外的一致快照上运行（使用快照隔离）。

串行执行事务的方法在 VoltDB/H-Store、Redis 和 Datomic 等中实现[^58] [^59] [^60]。为单线程执行设计的系统有时可以比支持并发的系统性能更好，因为它可以避免锁定的协调开销。但是，其吞吐量限于单个 CPU 核心。为了充分利用该单线程，事务需要以不同于传统形式的方式构建。

#### 将事务封装在存储过程中 {#encapsulating-transactions-in-stored-procedures}

在数据库的早期，意图是数据库事务可以包含整个用户活动流程。例如，预订机票是一个多阶段过程（搜索路线、票价和可用座位；决定行程；预订行程中每个航班的座位；输入乘客详细信息；付款）。数据库设计者认为，如果整个过程是一个事务，以便可以原子地提交，那将是很好的。

不幸的是，人类做决定和响应的速度非常慢。如果数据库事务需要等待用户的输入，数据库需要支持潜在的大量并发事务，其中大多数是空闲的。大多数数据库无法有效地做到这一点，因此几乎所有 OLTP 应用程序都通过避免在事务中交互式地等待用户来保持事务简短。在 Web 上，这意味着事务在同一 HTTP 请求中提交——事务不跨越多个请求。新的 HTTP 请求开始新的事务。

即使人类已经从关键路径中移除，事务仍然以交互式客户端/服务器风格执行，一次一个语句。应用程序进行查询，读取结果，可能根据第一个查询的结果进行另一个查询，依此类推。查询和结果在应用程序代码（在一台机器上运行）和数据库服务器（在另一台机器上）之间来回发送。

在这种交互式事务风格中，大量时间花在应用程序和数据库之间的网络通信上。如果你要在数据库中禁止并发并一次只处理一个事务，吞吐量将是可怕的，因为数据库将大部分时间都在等待应用程序为当前事务发出下一个查询。在这种数据库中，为了获得合理的性能，必须并发处理多个事务。

因此，具有单线程串行事务处理的系统不允许交互式多语句事务。相反，应用程序必须将自己限制为包含单个语句的事务，或者提前将整个事务代码作为*存储过程*提交给数据库[^61]。

交互式事务和存储过程之间的差异如[图 8-9](#fig_transactions_stored_proc) 所示。前提是事务所需的所有数据都在内存中，存储过程可以非常快速地执行，而无需等待任何网络或磁盘 I/O。

{{< figure src="/fig/ddia_0809.png" id="fig_transactions_stored_proc" caption="图 8-9. 交互式事务和存储过程之间的差异（使用[图 8-8](#fig_transactions_write_skew)的示例事务）。" class="w-full my-4" >}}

#### 存储过程的利弊 {#sec_transactions_stored_proc_tradeoffs}

存储过程在关系数据库中已经存在了一段时间，自 1999 年以来一直是 SQL 标准（SQL/PSM）的一部分。它们因各种原因获得了一些不好的声誉：

* 传统上，每个数据库供应商都有自己的存储过程语言（Oracle 有 PL/SQL，SQL Server 有 T-SQL，PostgreSQL 有 PL/pgSQL 等）。这些语言没有跟上通用编程语言的发展，因此从今天的角度来看，它们看起来相当丑陋和过时，并且缺乏大多数编程语言中的库生态系统。
* 在数据库中运行的代码很难管理：与应用程序服务器相比，调试更困难，版本控制和部署更尴尬，测试更棘手，并且难以与监控的指标收集系统集成。
* 数据库通常比应用程序服务器对性能更敏感，因为单个数据库实例通常由许多应用程序服务器共享。数据库中编写不当的存储过程（例如，使用大量内存或 CPU 时间）可能比应用程序服务器中等效的编写不当的代码造成更多麻烦。
* 在允许租户编写自己的存储过程的多租户系统中，在与数据库内核相同的进程中执行不受信任的代码是一个安全风险[^62]。

然而，这些问题可以克服。存储过程的现代实现已经放弃了 PL/SQL，而是使用现有的通用编程语言：VoltDB 使用 Java 或 Groovy，Datomic 使用 Java 或 Clojure，Redis 使用 Lua，MongoDB 使用 JavaScript。

存储过程在应用程序逻辑无法轻松嵌入其他地方的情况下也很有用。例如，使用 GraphQL 的应用程序可能通过 GraphQL 代理直接公开其数据库。如果代理不支持复杂的验证逻辑，你可以使用存储过程将此类逻辑直接嵌入数据库中。如果数据库不支持存储过程，你必须在代理和数据库之间部署验证服务来进行验证。

使用存储过程和内存数据，在单个线程上执行所有事务变得可行。当存储过程不需要等待 I/O 并避免其他并发控制机制的开销时，它们可以在单个线程上实现相当好的吞吐量。

VoltDB 还使用存储过程进行复制：它不是将事务的写入从一个节点复制到另一个节点，而是在每个副本上执行相同的存储过程。因此，VoltDB 要求存储过程是*确定性的*（在不同节点上运行时，它们必须产生相同的结果）。例如，如果事务需要使用当前日期和时间，它必须通过特殊的确定性 API 来实现（有关确定性操作的更多详细信息，请参见["持久执行和工作流"](/ch5#sec_encoding_dataflow_workflows)）。这种方法称为*状态机复制*，我们将在[第 10 章](/ch10#ch_consistency)中回到它。

#### 分片 {#sharding}

串行执行所有事务使并发控制变得简单得多，但将数据库的事务吞吐量限制为单台机器上单个 CPU 核心的速度。只读事务可以使用快照隔离在其他地方执行，但对于具有高写入吞吐量的应用程序，单线程事务处理器可能成为严重的瓶颈。

为了扩展到多个 CPU 核心和多个节点，你可以对数据进行分片（参见[第 7 章](/ch7#ch_sharding)），VoltDB 支持这一点。如果你可以找到一种对数据集进行分片的方法，使每个事务只需要读取和写入单个分片内的数据，那么每个分片可以有自己的事务处理线程，独立于其他分片运行。在这种情况下，你可以给每个 CPU 核心分配自己的分片，这允许你的事务吞吐量与 CPU 核心数量线性扩展[^59]。

但是，对于需要访问多个分片的任何事务，数据库必须协调它所涉及的所有分片之间的事务。存储过程需要在所有分片上同步执行，以确保整个系统的可串行化。

由于跨分片事务具有额外的协调开销，因此它们比单分片事务慢得多。VoltDB 报告的跨分片写入吞吐量约为每秒 1,000 次，这比其单分片吞吐量低几个数量级，并且无法通过添加更多机器来增加[^61]。最近的研究探索了使多分片事务更具可伸缩性的方法[^63]。

事务是否可以是单分片的很大程度上取决于应用程序使用的数据结构。简单的键值数据通常可以很容易地分片，但具有多个二级索引的数据可能需要大量的跨分片协调（参见["分片和二级索引"](/ch7#sec_sharding_secondary_indexes)）。

#### 串行执行总结 {#summary-of-serial-execution}

串行执行事务已成为在某些约束条件下实现可串行化隔离的可行方法：

* 每个事务必须小而快，因为只需要一个缓慢的事务就可以阻止所有事务处理。
* 它最适合活动数据集可以适合内存的情况。很少访问的数据可能会移到磁盘，但如果需要在单线程事务中访问，系统会变得非常慢。
* 写入吞吐量必须足够低，可以在单个 CPU 核心上处理，否则事务需要分片而不需要跨分片协调。
* 跨分片事务是可能的，但它们的吞吐量很难扩展。

### 两阶段锁定（2PL） {#sec_transactions_2pl}

大约 30 年来，数据库中只有一种广泛使用的可串行化算法：*两阶段锁定*（2PL），有时称为*强严格两阶段锁定*（SS2PL），以区别于 2PL 的其他变体。


--------

> [!TIP] 2PL 不是 2PC

两阶段*锁定*（2PL）和两阶段*提交*（2PC）是两个非常不同的东西。2PL 提供可串行化隔离，而 2PC 在分布式数据库中提供原子提交（参见["两阶段提交（2PC）"](#sec_transactions_2pc)）。为避免混淆，最好将它们视为完全独立的概念，并忽略名称中不幸的相似性。

--------

我们之前看到锁通常用于防止脏写（参见["没有脏写"](#sec_transactions_dirty_write)）：如果两个事务并发尝试写入同一对象，锁确保第二个写入者必须等到第一个完成其事务（中止或提交）后才能继续。

两阶段锁定类似，但使锁要求更强。只要没有人写入，多个事务就可以并发读取同一对象。但是一旦有人想要写入（修改或删除）对象，就需要独占访问：

* 如果事务 A 已读取对象而事务 B 想要写入该对象，B 必须等到 A 提交或中止后才能继续。（这确保 B 不能在 A 背后意外地更改对象。）
* 如果事务 A 已写入对象而事务 B 想要读取该对象，B 必须等到 A 提交或中止后才能继续。（像[图 8-4](#fig_transactions_read_committed) 中那样读取对象的旧版本在 2PL 下是不可接受的。）

在 2PL 中，写入者不仅阻塞其他写入者；它们还阻塞读者，反之亦然。快照隔离有这样的口号：*读者永远不会阻塞写者，写者永远不会阻塞读者*（参见["多版本并发控制（MVCC）"](#sec_transactions_snapshot_impl)），这捕捉了快照隔离和两阶段锁定之间的关键区别。另一方面，因为 2PL 提供可串行化，它可以防止前面讨论的所有竞态条件，包括丢失的更新和写偏差。

#### 两阶段锁定的实现 {#implementation-of-two-phase-locking}

2PL 由 MySQL（InnoDB）和 SQL Server 中的可串行化隔离级别以及 Db2 中的可重复读隔离级别使用[^29]。

读者和写者的阻塞是通过在数据库中的每个对象上有一个锁来实现的。锁可以处于*共享模式*或*独占模式*（也称为*多读者单写者*锁）。锁的使用如下：

* 如果事务想要读取对象，它必须首先以共享模式获取锁。多个事务可以同时以共享模式持有锁，但如果另一个事务已经对该对象具有独占锁，则这些事务必须等待。
* 如果事务想要写入对象，它必须首先以独占模式获取锁。没有其他事务可以同时持有锁（无论是共享模式还是独占模式），因此如果对象上有任何现有锁，事务必须等待。
* 如果事务首先读取然后写入对象，它可以将其共享锁升级为独占锁。升级的工作方式与直接获取独占锁相同。
* 获取锁后，事务必须继续持有锁直到事务结束（提交或中止）。这就是"两阶段"名称的来源：第一阶段（事务执行时）是获取锁，第二阶段（事务结束时）是释放所有锁。

由于使用了如此多的锁，很容易发生事务 A 等待事务 B 释放其锁，反之亦然的情况。这种情况称为*死锁*。数据库自动检测事务之间的死锁并中止其中一个，以便其他事务可以取得进展。中止的事务需要由应用程序重试。

#### 两阶段锁定的性能 {#performance-of-two-phase-locking}

两阶段锁定的主要缺点，以及自 1970 年代以来并非每个人都使用它的原因，是性能：在两阶段锁定下，事务吞吐量和查询响应时间明显比弱隔离下差。

这部分是由于获取和释放所有这些锁的开销，但更重要的是由于并发性降低。按设计，如果两个并发事务尝试执行任何可能以任何方式导致竞态条件的操作，其中一个必须等待另一个完成。

例如，如果你有一个需要读取整个表的事务（例如，备份、分析查询或完整性检查，如["快照隔离与可重复读"](#sec_transactions_snapshot_isolation)中所讨论的），该事务必须对整个表进行共享锁。因此，读取事务首先必须等到所有正在写入该表的进行中事务完成；然后，在读取整个表时（对于大表可能需要很长时间），所有想要写入该表的其他事务都被阻塞，直到大型只读事务提交。实际上，数据库在很长一段时间内无法进行写入。

因此，运行 2PL 的数据库可能具有相当不稳定的延迟，如果工作负载中存在争用，它们在高百分位数可能非常慢（参见["描述性能"](/ch2#sec_introduction_percentiles)）。可能只需要一个缓慢的事务，或者一个访问大量数据并获取许多锁的事务，就会导致系统的其余部分停滞不前。

尽管死锁可能发生在基于锁的读已提交隔离级别下，但在 2PL 可串行化隔离下（取决于事务的访问模式）它们发生得更频繁。这可能是一个额外的性能问题：当事务由于死锁而被中止并重试时，它需要重新完成所有工作。如果死锁频繁，这可能意味着大量的浪费努力。

#### 谓词锁 {#predicate-locks}

在前面的锁描述中，我们掩盖了一个微妙但重要的细节。在["导致写偏差的幻读"](#sec_transactions_phantom)中，我们讨论了*幻读*的问题——即一个事务改变另一个事务的搜索查询结果。具有可串行化隔离的数据库必须防止幻读。

在会议室预订示例中，这意味着如果一个事务已经搜索了某个时间窗口内某个房间的现有预订（参见[例 8-2](#fig_transactions_meeting_rooms)），另一个事务不允许并发插入或更新同一房间和时间范围的另一个预订。（并发插入其他房间的预订，或同一房间不影响拟议预订的不同时间的预订是可以的。）

我们如何实现这一点？从概念上讲，我们需要一个*谓词锁*[^4]。它的工作方式类似于前面描述的共享/独占锁，但它不属于特定对象（例如，表中的一行），而是属于匹配某些搜索条件的所有对象，例如：

```
SELECT * FROM bookings
 WHERE room_id = 123 AND
 end_time > '2025-01-01 12:00' AND
 start_time < '2025-01-01 13:00';
```

谓词锁限制访问如下：

* 如果事务 A 想要读取匹配某些条件的对象，就像在该 `SELECT` 查询中一样，它必须在查询条件上获取共享模式谓词锁。如果另一个事务 B 当前对匹配这些条件的任何对象具有独占锁，A 必须等到 B 释放其锁后才允许进行查询。
* 如果事务 A 想要插入、更新或删除任何对象，它必须首先检查旧值或新值是否匹配任何现有的谓词锁。如果存在事务 B 持有的匹配谓词锁，则 A 必须等到 B 提交或中止后才能继续。

这里的关键思想是，谓词锁甚至适用于数据库中尚不存在但将来可能添加的对象（幻读）。如果两阶段锁定包括谓词锁，数据库将防止所有形式的写偏差和其他竞态条件，因此其隔离变为可串行化。

#### 索引范围锁 {#sec_transactions_2pl_range}

不幸的是，谓词锁的性能不佳：如果活动事务有许多锁，检查匹配锁变得耗时。因此，大多数具有 2PL 的数据库实际上实现了*索引范围锁定*（也称为*间隙锁*），这是谓词锁定的简化近似[^54] [^64]。

通过使谓词匹配更大的对象集来简化谓词是安全的。例如，如果你对中午到下午 1 点之间房间 123 的预订有谓词锁，你可以通过锁定房间 123 在任何时间的预订来近似它，或者你可以通过锁定中午到下午 1 点之间的所有房间（不仅仅是房间 123）来近似它。这是安全的，因为匹配原始谓词的任何写入肯定也会匹配近似。

在房间预订数据库中，你可能在 `room_id` 列上有索引，和/或在 `start_time` 和 `end_time` 上有索引（否则前面的查询在大型数据库上会非常慢）：

* 假设你的索引在 `room_id` 上，数据库使用此索引查找房间 123 的现有预订。现在数据库可以简单地将共享锁附加到此索引条目，表示事务已搜索房间 123 的预订。
* 或者，如果数据库使用基于时间的索引查找现有预订，它可以将共享锁附加到该索引中的值范围，表示事务已搜索与 2025 年 1 月 1 日中午到下午 1 点的时间段重叠的预订。

无论哪种方式，搜索条件的近似都附加到其中一个索引。现在，如果另一个事务想要插入、更新或删除同一房间和/或重叠时间段的预订，它将必须更新索引的相同部分。在这样做的过程中，它将遇到共享锁，并被迫等到锁被释放。

这提供了对幻读和写偏差的有效保护。索引范围锁不如谓词锁精确（它们可能锁定比严格维护可串行化所需的更大范围的对象），但由于它们的开销要低得多，它们是一个很好的折衷。

如果没有合适的索引可以附加范围锁，数据库可以退回到整个表的共享锁。这对性能不利，因为它将阻止所有其他事务写入表，但这是一个安全的后备位置。

### 可串行化快照隔离（SSI） {#sec_transactions_ssi}

本章描绘了数据库并发控制的黯淡画面。一方面，我们有性能不佳（两阶段锁定）或可伸缩性不佳（串行执行）的可串行化实现。另一方面，我们有性能良好但容易出现各种竞态条件（丢失的更新、写偏差、幻读等）的弱隔离级别。可串行化隔离和良好性能从根本上是对立的吗？

似乎不是：一种称为*可串行化快照隔离*（SSI）的算法提供完全可串行化，与快照隔离相比只有很小的性能损失。SSI 相对较新：它于 2008 年首次描述[^53] [^65]。

今天，SSI 和类似算法用于单节点数据库（PostgreSQL 中的可串行化隔离级别[^54]、SQL Server 的内存 OLTP/Hekaton[^66] 和 HyPer[^67]）、分布式数据库（CockroachDB[^5] 和 FoundationDB[^8]）以及嵌入式存储引擎（如 BadgerDB）。

#### 悲观并发控制与乐观并发控制 {#pessimistic-versus-optimistic-concurrency-control}

两阶段锁定是所谓的*悲观*并发控制机制：它基于这样的原则，即如果任何事情可能出错（如另一个事务持有的锁所示），最好等到情况再次安全后再做任何事情。它就像*互斥*，用于保护多线程编程中的数据结构。

串行执行在某种意义上是悲观到极端：它本质上相当于每个事务在事务期间对整个数据库（或数据库的一个分片）具有独占锁。我们通过使每个事务执行得非常快来补偿悲观主义，因此它只需要短时间持有"锁"。

相比之下，可串行化快照隔离是一种*乐观*并发控制技术。在这种情况下，乐观意味着，如果发生潜在危险的事情，事务不会阻塞，而是继续进行，希望一切都会好起来。当事务想要提交时，数据库会检查是否发生了任何不好的事情（即，是否违反了隔离）；如果是，事务将被中止并必须重试。只允许可串行执行的事务提交。

乐观并发控制是一个老想法[^68]，其优缺点已经争论了很长时间[^69]。如果存在高争用（许多事务尝试访问相同的对象），它的性能很差，因为这会导致大部分事务需要中止。如果系统已经接近其最大吞吐量，重试事务的额外事务负载可能会使性能变差。

但是，如果有足够的备用容量，并且事务之间的争用不太高，乐观并发控制技术往往比悲观技术性能更好。可交换原子操作可以减少争用：例如，如果几个事务并发想要递增计数器，应用递增的顺序无关紧要（只要计数器在同一事务中没有被读取），因此并发递增都可以应用而不会发生冲突。

顾名思义，SSI 基于快照隔离——也就是说，事务中的所有读取都从数据库的一致快照进行（参见["快照隔离与可重复读"](#sec_transactions_snapshot_isolation)）。在快照隔离的基础上，SSI 添加了一种算法来检测读写之间的串行化冲突，并确定要中止哪些事务。

#### 基于过时前提的决策 {#decisions-based-on-an-outdated-premise}

当我们之前讨论快照隔离中的写偏差时（参见["写偏差与幻读"](#sec_transactions_write_skew)），我们观察到一个反复出现的模式：事务从数据库读取一些数据，检查查询结果，并根据它看到的结果决定采取某些行动（写入数据库）。但是，在快照隔离下，原始查询的结果在事务提交时可能不再是最新的，因为数据可能在此期间被修改。

换句话说，事务基于*前提*（事务开始时为真的事实，例如，"当前有两名医生值班"）采取行动。后来，当事务想要提交时，原始数据可能已更改——前提可能不再为真。

当应用程序进行查询（例如，"当前有多少医生值班？"）时，数据库不知道应用程序逻辑如何使用该查询的结果。为了安全起见，数据库需要假设查询结果（前提）中的任何更改都意味着该事务中的写入可能无效。换句话说，事务中的查询和写入之间可能存在因果依赖关系。为了提供可串行化隔离，数据库必须检测事务可能基于过时前提采取行动的情况，并在这种情况下中止事务。

数据库如何知道查询结果是否可能已更改？有两种情况需要考虑：

* 检测陈旧的 MVCC 对象版本的读取（未提交的写入发生在读取之前）
* 检测影响先前读取的写入（写入发生在读取之后）

#### 检测陈旧的 MVCC 读取 {#detecting-stale-mvcc-reads}

回想一下，快照隔离通常由多版本并发控制（MVCC；参见["多版本并发控制（MVCC）"](#sec_transactions_snapshot_impl)）实现。当事务从 MVCC 数据库中的一致快照读取时，它会忽略在拍摄快照时尚未提交的任何其他事务所做的写入。

在[图 8-10](#fig_transactions_detect_mvcc) 中，事务 43 看到 Aaliyah 的 `on_call = true`，因为事务 42（修改了 Aaliyah 的值班状态）未提交。但是，当事务 43 想要提交时，事务 42 已经提交。这意味着从一致快照读取时被忽略的写入现在已生效，事务 43 的前提不再为真。当写入者插入以前不存在的数据时，事情变得更加复杂（参见["导致写偏差的幻读"](#sec_transactions_phantom)）。我们将在["检测影响先前读取的写入"](#sec_detecting_writes_affect_reads)中讨论为 SSI 检测幻写。

{{< figure src="/fig/ddia_0810.png" id="fig_transactions_detect_mvcc" caption="图 8-10. 检测事务何时从 MVCC 快照读取过时值。" class="w-full my-4" >}}


为了防止这种异常，数据库需要跟踪事务由于 MVCC 可见性规则而忽略另一个事务的写入的时间。当事务想要提交时，数据库会检查是否有任何被忽略的写入现在已经提交。如果是，事务必须被中止。

为什么要等到提交？为什么不在检测到陈旧读取时立即中止事务 43？好吧，如果事务 43 是只读事务，它就不需要被中止，因为没有写偏差的风险。在事务 43 进行读取时，数据库还不知道该事务是否稍后会执行写入。此外，事务 42 可能还会中止，或者在事务 43 提交时可能仍未提交，因此读取可能最终不是陈旧的。通过避免不必要的中止，SSI 保留了快照隔离对从一致快照进行长时间运行读取的支持。

#### 检测影响先前读取的写入 {#sec_detecting_writes_affect_reads}

要考虑的第二种情况是另一个事务在数据被读取后修改数据。这种情况如[图 8-11](#fig_transactions_detect_index_range) 所示。

{{< figure src="/fig/ddia_0811.png" id="fig_transactions_detect_index_range" caption="图 8-11. 在可串行化快照隔离中，检测一个事务何时修改另一个事务的读取。" class="w-full my-4" >}}


在两阶段锁定的上下文中，我们讨论了索引范围锁（参见["索引范围锁"](#sec_transactions_2pl_range)），它允许数据库锁定对匹配某些搜索查询的所有行的访问，例如 `WHERE shift_id = 1234`。我们可以在这里使用类似的技术，除了 SSI 锁不会阻塞其他事务。

在[图 8-11](#fig_transactions_detect_index_range) 中，事务 42 和 43 都在班次 `1234` 期间搜索值班医生。如果 `shift_id` 上有索引，数据库可以使用索引条目 1234 来记录事务 42 和 43 读取此数据的事实。（如果没有索引，可以在表级别跟踪此信息。）此信息只需要保留一段时间：在事务完成（提交或中止）并且所有并发事务完成后，数据库可以忘记它读取的数据。

当事务写入数据库时，它必须在索引中查找最近读取受影响数据的任何其他事务。此过程类似于获取受影响键范围的写锁，但它不是阻塞直到读者提交，而是充当绊线：它只是通知事务它们读取的数据可能不再是最新的。

在[图 8-11](#fig_transactions_detect_index_range) 中，事务 43 通知事务 42 其先前的读取已过时，反之亦然。事务 42 首先提交，并且成功：尽管事务 43 的写入影响了 42，但 43 尚未提交，因此写入尚未生效。但是，当事务 43 想要提交时，来自 42 的冲突写入已经提交，因此 43 必须中止。

#### 可串行化快照隔离的性能 {#performance-of-serializable-snapshot-isolation}

与往常一样，许多工程细节会影响算法在实践中的工作效果。例如，一个权衡是跟踪事务读写的粒度。如果数据库详细跟踪每个事务的活动，它可以精确地确定哪些事务需要中止，但簿记开销可能变得很大。不太详细的跟踪速度更快，但可能导致比严格必要更多的事务被中止。

在某些情况下，事务读取被另一个事务覆盖的信息是可以的：根据发生的其他情况，有时可以证明执行结果仍然是可串行化的。PostgreSQL 使用这一理论来减少不必要中止的数量[^14] [^54]。

与两阶段锁定相比，可串行化快照隔离的主要优点是一个事务不需要阻塞等待另一个事务持有的锁。与快照隔离一样，写入者不会阻塞读者，反之亦然。这种设计原则使查询延迟更可预测且变化更少。特别是，只读查询可以在一致快照上运行而无需任何锁，这对于读取密集型工作负载非常有吸引力。

与串行执行相比，可串行化快照隔离不限于单个 CPU 核心的吞吐量：例如，FoundationDB 将串行化冲突的检测分布在多台机器上，允许它扩展到非常高的吞吐量。即使数据可能分片在多台机器上，事务也可以在多个分片中读取和写入数据，同时确保可串行化隔离。

与非可串行化快照隔离相比，检查可串行化违规的需要引入了一些性能开销。这些开销有多大是一个争论的问题：有些人认为可串行化检查不值得[^70]，而其他人认为可串行化的性能现在已经很好，不再需要使用较弱的快照隔离[^67]。

中止率显著影响 SSI 的整体性能。例如，长时间读取和写入数据的事务可能会遇到冲突并中止，因此 SSI 要求读写事务相当短（长时间运行的只读事务是可以的）。但是，SSI 对慢事务的敏感性低于两阶段锁定或串行执行。

## 分布式事务 {#sec_transactions_distributed}

前几节重点讨论了隔离的并发控制，即 ACID 中的 I。我们看到的算法适用于单节点和分布式数据库：尽管在使并发控制算法可扩展方面存在挑战（例如，为 SSI 执行分布式可串行化检查），但分布式并发控制的高层思想与单节点并发控制相似[^8]。

一致性和持久性在转向分布式事务时也没有太大变化。但是，原子性需要更多关注。

对于在单个数据库节点执行的事务，原子性通常由存储引擎实现。当客户端要求数据库节点提交事务时，数据库使事务的写入持久化（通常在预写日志中；参见["使 B 树可靠"](/ch4#sec_storage_btree_wal)），然后将提交记录附加到磁盘上的日志。如果数据库在此过程中崩溃，事务将在节点重新启动时从日志中恢复：如果提交记录在崩溃前成功写入磁盘，则事务被认为已提交；如果没有，该事务的任何写入都将回滚。

因此，在单个节点上，事务提交关键取决于数据持久写入磁盘的*顺序*：首先是数据，然后是提交记录[^22]。事务提交或中止的关键决定时刻是磁盘完成写入提交记录的时刻：在那一刻之前，仍然可能中止（由于崩溃），但在那一刻之后，事务已提交（即使数据库崩溃）。因此，是单个设备（连接到特定节点的特定磁盘驱动器的控制器）使提交成为原子的。

但是，如果多个节点参与事务会怎样？例如，也许你在分片数据库中有多对象事务，或者有全局二级索引（其中索引条目可能与主数据在不同的节点上；参见["分片和二级索引"](/ch7#sec_sharding_secondary_indexes)）。大多数"NoSQL"分布式数据存储不支持此类分布式事务，但各种分布式关系数据库支持。

在这些情况下，仅向所有节点发送提交请求并在每个节点上独立提交事务是不够的。如[图 8-12](#fig_transactions_non_atomic) 所示，提交可能在某些节点上成功，在其他节点上失败：

* 某些节点可能检测到约束违规或冲突，需要中止，而其他节点能够成功提交。
* 某些提交请求可能在网络中丢失，最终由于超时而中止，而其他提交请求通过。
* 某些节点可能在提交记录完全写入之前崩溃并在恢复时回滚，而其他节点成功提交。

{{< figure src="/fig/ddia_0812.png" id="fig_transactions_non_atomic" caption="图 8-12. 当事务涉及多个数据库节点时，它可能在某些节点上提交，在其他节点上失败。" class="w-full my-4" >}}


如果某些节点提交事务而其他节点中止它，节点之间就会变得不一致。一旦事务在一个节点上提交，如果后来发现它在另一个节点上被中止，就不能撤回了。这是因为一旦数据被提交，它在*读已提交*或更强的隔离下对其他事务可见。例如，在[图 8-12](#fig_transactions_non_atomic) 中，当用户 1 注意到其在数据库 1 上的提交失败时，用户 2 已经从数据库 2 上的同一事务读取了数据。如果用户 1 的事务后来被中止，用户 2 的事务也必须被还原，因为它基于被追溯声明不存在的数据。

更好的方法是确保参与事务的节点要么全部提交，要么全部中止，并防止两者的混合。确保这一点被称为*原子提交*问题。

### 两阶段提交（2PC） {#sec_transactions_2pc}

两阶段提交是一种跨多个节点实现原子事务提交的算法。它是分布式数据库中的经典算法[^13] [^71] [^72]。2PC 在某些数据库内部使用，也以 *XA 事务*[^73] 的形式提供给应用程序（例如，Java 事务 API 支持），或通过 WS-AtomicTransaction 用于 SOAP Web 服务[^74] [^75]。

2PC 的基本流程如[图 8-13](#fig_transactions_two_phase_commit) 所示。与单节点事务的单个提交请求不同，2PC 中的提交/中止过程分为两个阶段（因此得名）。

{{< figure src="/fig/ddia_0813.png" id="fig_transactions_two_phase_commit" caption="图 8-13. 两阶段提交（2PC）的成功执行。" class="w-full my-4" >}}


2PC 使用一个通常不会出现在单节点事务中的新组件：*协调器*（也称为*事务管理器*）。协调器通常作为请求事务的同一应用程序进程中的库实现（例如，嵌入在 Java EE 容器中），但它也可以是单独的进程或服务。此类协调器的示例包括 Narayana、JOTM、BTM 或 MSDTC。

使用 2PC 时，分布式事务从应用程序在多个数据库节点上正常读写数据开始。我们称这些数据库节点为事务中的*参与者*。当应用程序准备提交时，协调器开始第 1 阶段：它向每个节点发送*准备*请求，询问它们是否能够提交。然后协调器跟踪参与者的响应：

* 如果所有参与者回复"是"，表示他们准备提交，那么协调器在第 2 阶段发出*提交*请求，提交实际发生。
* 如果任何参与者回复"否"，协调器在第 2 阶段向所有节点发送*中止*请求。

这个过程有点像西方文化中的传统婚礼仪式：牧师分别询问新娘和新郎是否愿意与对方结婚，通常从两人那里得到"我愿意"的答案。在收到两个确认后，牧师宣布这对夫妇为夫妻：事务已提交，这个快乐的事实向所有参加者广播。如果新娘或新郎没有说"是"，仪式就被中止了[^76]。

#### 系统性的承诺 {#a-system-of-promises}

从这个简短的描述中，可能不清楚为什么两阶段提交确保原子性，而跨多个节点的单阶段提交却不能。准备和提交请求在两阶段情况下同样容易丢失。是什么让 2PC 不同？

要理解它为什么有效，我们必须更详细地分解这个过程：

1. 当应用程序想要开始分布式事务时，它从协调器请求事务 ID。此事务 ID 是全局唯一的。
2. 应用程序在每个参与者上开始单节点事务，并将全局唯一的事务 ID 附加到单节点事务。所有读写都在这些单节点事务之一中完成。如果在此阶段出现任何问题（例如，节点崩溃或请求超时），协调器或任何参与者都可以中止。
3. 当应用程序准备提交时，协调器向所有参与者发送准备请求，标记有全局事务 ID。如果这些请求中的任何一个失败或超时，协调器向所有参与者发送该事务 ID 的中止请求。
4. 当参与者收到准备请求时，它确保它可以在任何情况下明确提交事务。

    这包括将所有事务数据写入磁盘（崩溃、电源故障或磁盘空间不足不是稍后拒绝提交的可接受借口），并检查任何冲突或约束违规。通过向协调器回复"是"，节点承诺在请求时无错误地提交事务。换句话说，参与者放弃了中止事务的权利，但没有实际提交它。
5. 当协调器收到所有准备请求的响应时，它对是否提交或中止事务做出明确决定（仅当所有参与者投票"是"时才提交）。协调器必须将该决定写入其磁盘上的事务日志，以便在随后崩溃时知道它是如何决定的。这称为*提交点*。
6. 一旦协调器的决定被写入磁盘，提交或中止请求就会发送给所有参与者。如果此请求失败或超时，协调器必须永远重试，直到成功。没有回头路：如果决定是提交，那么必须执行该决定，无论需要多少次重试。如果参与者在此期间崩溃，事务将在恢复时提交——因为参与者投票"是"，它在恢复时不能拒绝提交。

因此，该协议包含两个关键的"不归路"：当参与者投票"是"时，它承诺它肯定能够稍后提交（尽管协调器仍可能选择中止）；一旦协调器决定，该决定是不可撤销的。这些承诺确保了 2PC 的原子性。（单节点原子提交将这两个事件合并为一个：将提交记录写入事务日志。）

回到婚姻比喻，在说"我愿意"之前，你和你的新娘/新郎有自由通过说"不行！"（或类似的话）来中止事务。但是，在说"我愿意"之后，你不能撤回该声明。如果你在说"我愿意"后晕倒，没有听到牧师说"你们现在是夫妻"，这并不改变事务已提交的事实。当你稍后恢复意识时，你可以通过向牧师查询你的全局事务 ID 的状态来了解你是否已婚，或者你可以等待牧师下一次重试提交请求（因为重试将在你失去意识期间继续）。

#### 协调器故障 {#coordinator-failure}

我们已经讨论了如果参与者之一或网络在 2PC 期间失败会发生什么：如果任何准备请求失败或超时，协调器将中止事务；如果任何提交或中止请求失败，协调器将无限期地重试它们。但是，如果协调器崩溃会发生什么就不太清楚了。

如果协调器在发送准备请求之前失败，参与者可以安全地中止事务。但是一旦参与者收到准备请求并投票"是"，它就不能再单方面中止——它必须等待协调器回复事务是提交还是中止。如果协调器此时崩溃或网络失败，参与者除了等待别无他法。参与者在此状态下的事务称为*存疑*或*不确定*。

这种情况如[图 8-14](#fig_transactions_2pc_crash) 所示。在这个特定的例子中，协调器实际上决定提交，数据库 2 收到了提交请求。但是，协调器在向数据库 1 发送提交请求之前崩溃了，因此数据库 1 不知道是提交还是中止。即使超时在这里也没有帮助：如果数据库 1 在超时后单方面中止，它将与已提交的数据库 2 不一致。同样，单方面提交也不安全，因为另一个参与者可能已中止。

{{< figure src="/fig/ddia_0814.png" id="fig_transactions_2pc_crash" caption="图 8-14. 协调器在参与者投票“是”后崩溃。数据库 1 不知道是提交还是中止。" class="w-full my-4" >}}


没有协调器的消息，参与者无法知道是提交还是中止。原则上，参与者可以相互通信，了解每个参与者如何投票并达成某种协议，但这不是 2PC 协议的一部分。

2PC 完成的唯一方法是等待协调器恢复。这就是为什么协调器必须在向参与者发送提交或中止请求之前将其提交或中止决定写入磁盘上的事务日志：当协调器恢复时，它通过读取其事务日志来确定所有存疑事务的状态。协调器日志中没有提交记录的任何事务都将中止。因此，2PC 的提交点归结为协调器上的常规单节点原子提交。

#### 三阶段提交 {#three-phase-commit}

由于 2PC 可能会卡住等待协调器恢复，因此两阶段提交被称为*阻塞*原子提交协议。可以使原子提交协议*非阻塞*，以便在节点失败时不会卡住。但是，在实践中使其工作并不那么简单。

作为 2PC 的替代方案，已经提出了一种称为*三阶段提交*（3PC）的算法[^13] [^77]。但是，3PC 假设具有有界延迟的网络和具有有界响应时间的节点；在大多数具有无界网络延迟和进程暂停的实际系统中（参见[第 9 章](/ch9#ch_distributed)），它无法保证原子性。

实践中更好的解决方案是用容错共识协议替换单节点协调器。我们将在[第 10 章](/ch10#ch_consistency)中看到如何做到这一点。

### 跨不同系统的分布式事务 {#sec_transactions_xa}

分布式事务和两阶段提交的声誉参差不齐。一方面，它们被认为提供了一个重要的安全保证，否则很难实现；另一方面，它们因导致操作问题、扼杀性能并承诺超过它们可以提供的东西而受到批评[^78] [^79] [^80] [^81]。许多云服务由于它们引起的操作问题而选择不实现分布式事务[^82]。

某些分布式事务的实现会带来沉重的性能损失。两阶段提交固有的大部分性能成本来自崩溃恢复所需的额外强制刷盘（`fsync`）和额外的网络往返。

但是，与其直接否定分布式事务，我们应该更详细地研究它们，因为从中可以学到重要的教训。首先，我们应该准确说明"分布式事务"的含义。两种完全不同类型的分布式事务经常被混淆：

数据库内部分布式事务
: 某些分布式数据库（即，在其标准配置中使用复制和分片的数据库）支持该数据库节点之间的内部事务。例如，YugabyteDB、TiDB、FoundationDB、Spanner、VoltDB 和 MySQL Cluster 的 NDB 存储引擎都有这样的内部事务支持。在这种情况下，参与事务的所有节点都运行相同的数据库软件。

异构分布式事务
: 在*异构*事务中，参与者是两个或多个不同的技术：例如，来自不同供应商的两个数据库，甚至是非数据库系统（如消息代理）。跨这些系统的分布式事务必须确保原子提交，即使系统在底层可能完全不同。

数据库内部事务不必与任何其他系统兼容，因此它们可以使用任何协议并应用特定于该特定技术的优化。因此，数据库内部分布式事务通常可以很好地工作。另一方面，跨异构技术的事务更具挑战性。

#### 恰好一次消息处理 {#sec_transactions_exactly_once}

异构分布式事务允许以强大的方式集成各种系统。例如，当且仅当处理消息的数据库事务成功提交时，来自消息队列的消息才能被确认为已处理。这是通过在单个事务中原子地提交消息确认和数据库写入来实现的。有了分布式事务支持，即使消息代理和数据库是在不同机器上运行的两种不相关的技术，这也是可能的。

如果消息传递或数据库事务失败，两者都会中止，因此消息代理可以稍后安全地重新传递消息。因此，通过原子地提交消息及其处理的副作用，我们可以确保消息在效果上*恰好*处理一次，即使在成功之前需要几次重试。中止会丢弃部分完成事务的任何副作用。这被称为*恰好一次语义*。

但是，只有当受事务影响的所有系统都能够使用相同的原子提交协议时，这种分布式事务才有可能。例如，假设处理消息的副作用是发送电子邮件，而电子邮件服务器不支持两阶段提交：如果消息处理失败并重试，可能会发生电子邮件被发送两次或更多次。但是，如果处理消息的所有副作用在事务中止时都会回滚，那么处理步骤可以安全地重试，就好像什么都没有发生一样。

我们将在本章后面回到恰好一次语义的主题。让我们首先看看允许此类异构分布式事务的原子提交协议。

#### XA 事务 {#xa-transactions}

*X/Open XA*（*eXtended Architecture* 的缩写）是跨异构技术实现两阶段提交的标准[^73]。它于 1991 年推出并得到广泛实现：XA 受到许多传统关系数据库（包括 PostgreSQL、MySQL、Db2、SQL Server 和 Oracle）和消息代理（包括 ActiveMQ、HornetQ、MSMQ 和 IBM MQ）的支持。

XA 不是网络协议——它只是用于与事务协调器接口的 C API。此 API 的绑定存在于其他语言中；例如，在 Java EE 应用程序的世界中，XA 事务使用 Java 事务 API（JTA）实现，而 JTA 又由许多使用 Java 数据库连接（JDBC）的数据库驱动程序和使用 Java 消息服务（JMS）API 的消息代理驱动程序支持。

XA 假设你的应用程序使用网络驱动程序或客户端库与参与者数据库或消息服务进行通信。如果驱动程序支持 XA，这意味着它调用 XA API 来确定操作是否应该是分布式事务的一部分——如果是，它将必要的信息发送到数据库服务器。驱动程序还公开回调，协调器可以通过回调要求参与者准备、提交或中止。

事务协调器实现 XA API。该标准没有指定应该如何实现它，但在实践中，协调器通常只是加载到发出事务的应用程序的同一进程中的库（而不是单独的服务）。它跟踪事务中的参与者，在要求他们准备后收集参与者的响应（通过驱动程序的回调），并使用本地磁盘上的日志来跟踪每个事务的提交/中止决定。

如果应用程序进程崩溃，或者运行应用程序的机器死机，协调器也随之消失。任何准备但未提交事务的参与者都陷入存疑。由于协调器的日志在应用程序服务器的本地磁盘上，该服务器必须重新启动，协调器库必须读取日志以恢复每个事务的提交/中止结果。然后，协调器才能使用数据库驱动程序的 XA 回调来要求参与者提交或中止（视情况而定）。数据库服务器无法直接联系协调器，因为所有通信都必须通过其客户端库。

#### 存疑时持有锁 {#holding-locks-while-in-doubt}

为什么我们如此关心事务陷入存疑？系统的其余部分不能继续工作，忽略最终会被清理的存疑事务吗？

问题在于*锁定*。如["读已提交"](#sec_transactions_read_committed)中所讨论的，数据库事务通常对它们修改的任何行进行行级独占锁，以防止脏写。此外，如果你想要可串行化隔离，使用两阶段锁定的数据库还必须对事务*读取*的任何行进行共享锁。

数据库在事务提交或中止之前不能释放这些锁（如[图 8-13](#fig_transactions_two_phase_commit) 中的阴影区域所示）。因此，使用两阶段提交时，事务必须在存疑期间保持锁。如果协调器崩溃并需要 20 分钟才能重新启动，这些锁将保持 20 分钟。如果协调器的日志由于某种原因完全丢失，这些锁将永远保持——或者至少直到管理员手动解决情况。

当这些锁被持有时，没有其他事务可以修改这些行。根据隔离级别，其他事务甚至可能被阻止读取这些行。因此，其他事务不能简单地继续它们的业务——如果它们想要访问相同的数据，它们将被阻塞。这可能导致你的应用程序的大部分变得不可用，直到存疑事务得到解决。

#### 从协调器故障中恢复 {#recovering-from-coordinator-failure}

理论上，如果协调器崩溃并重新启动，它应该从日志中干净地恢复其状态并解决任何存疑事务。但是，在实践中，*孤立的*存疑事务确实会发生[^83] [^84]——也就是说，协调器由于某种原因（例如，由于软件错误导致事务日志丢失或损坏）无法决定结果的事务。这些事务无法自动解决，因此它们永远留在数据库中，持有锁并阻塞其他事务。

即使重新启动数据库服务器也无法解决此问题，因为 2PC 的正确实现必须即使在重新启动时也保留存疑事务的锁（否则它将冒着违反原子性保证的风险）。这是一个棘手的情况。

唯一的出路是管理员手动决定是提交还是回滚事务。管理员必须检查每个存疑事务的参与者，确定是否有任何参与者已经提交或中止，然后将相同的结果应用于其他参与者。解决问题可能需要大量的手动工作，并且很可能需要在严重的生产中断期间在高压力和时间压力下完成（否则，为什么协调器会处于如此糟糕的状态？）。

许多 XA 实现都有一个名为*启发式决策*的紧急逃生舱口：允许参与者在没有协调器明确决定的情况下单方面决定中止或提交存疑事务[^73]。明确地说，这里的*启发式*是*可能破坏原子性*的委婉说法，因为启发式决策违反了两阶段提交中的承诺系统。因此，启发式决策仅用于摆脱灾难性情况，而不用于常规使用。

#### XA 事务的问题 {#problems-with-xa-transactions}

单节点协调器是整个系统的单点故障，使其成为应用程序服务器的一部分也是有问题的，因为协调器在其本地磁盘上的日志成为持久系统状态的关键部分——与数据库本身一样重要。

原则上，XA 事务的协调器可以是高可用和复制的，就像我们对任何其他重要数据库的期望一样。不幸的是，这仍然不能解决 XA 的一个根本问题，即它没有为事务的协调器和参与者提供直接相互通信的方式。它们只能通过调用事务的应用程序代码以及调用参与者的数据库驱动程序进行通信。

即使协调器被复制，应用程序代码也将是单点故障。解决这个问题需要完全重新设计应用程序代码的运行方式，使其复制或可重启，这可能看起来类似于持久执行（参见["持久执行和工作流"](/ch5#sec_encoding_dataflow_workflows)）。但是，实践中似乎没有任何工具实际采用这种方法。

另一个问题是，由于 XA 需要与各种数据系统兼容，它必然是最低公分母。例如，它无法检测跨不同系统的死锁（因为这需要系统交换有关每个事务正在等待的锁的信息的标准化协议），并且它不适用于 SSI（参见["可串行化快照隔离（SSI）"](#sec_transactions_ssi)），因为这需要跨不同系统识别冲突的协议。

这些问题在某种程度上是跨异构技术执行事务所固有的。但是，保持几个异构数据系统彼此一致仍然是一个真实而重要的问题，因此我们需要为其找到不同的解决方案。这可以做到，我们将在下一节和["派生数据与分布式事务"](/ch13#sec_future_derived_vs_transactions)中看到。

### 数据库内部的分布式事务 {#sec_transactions_internal}

如前所述，跨多个异构存储技术的分布式事务与系统内部的分布式事务之间存在很大差异——即，参与节点都是运行相同软件的同一数据库的分片。此类内部分布式事务是"NewSQL"数据库的定义特征，例如 CockroachDB[^5]、TiDB[^6]、Spanner[^7]、FoundationDB[^8] 和 YugabyteDB。某些消息代理（如 Kafka）也支持内部分布式事务[^85]。

这些系统中的许多系统使用两阶段提交来确保写入多个分片的事务的原子性，但它们不会遇到与 XA 事务相同的问题。原因是，由于它们的分布式事务不需要与任何其他技术接口，它们避免了最低公分母陷阱——这些系统的设计者可以自由使用更可靠、更快的更好协议。

XA 的最大问题可以通过以下方式解决：

* 复制协调器，如果主协调器崩溃，自动故障转移到另一个协调器节点；
* 允许协调器和数据分片直接通信，而不通过应用程序代码；
* 复制参与分片，以减少由于分片中的故障而必须中止事务的风险；以及
* 将原子提交协议与支持跨分片死锁检测和一致读取的分布式并发控制协议耦合。

共识算法通常用于复制协调器和数据库分片。我们将在[第 10 章](/ch10#ch_consistency)中看到如何使用共识算法实现分布式事务的原子提交。这些算法通过自动从一个节点故障转移到另一个节点来容忍故障，无需任何人工干预，同时继续保证强一致性属性。

为分布式事务提供的隔离级别取决于系统，但跨分片的快照隔离和可串行化快照隔离都是可能的。有关其工作原理的详细信息，请参见本章末尾引用的论文。

#### 再谈恰好一次消息处理 {#exactly-once-message-processing-revisited}

我们在["恰好一次消息处理"](#sec_transactions_exactly_once)中看到，分布式事务的一个重要用例是确保某些操作恰好生效一次，即使在处理过程中发生崩溃并且需要重试处理。如果你可以跨消息代理和数据库原子地提交事务，则当且仅当成功处理消息并且从处理过程产生的数据库写入被提交时，你可以向代理确认消息。

但是，你实际上不需要这样的分布式事务来实现恰好一次语义。另一种方法如下，它只需要数据库中的事务：

1. 假设每条消息都有唯一的 ID，并且在数据库中有一个已处理消息 ID 的表。当你开始从代理处理消息时，你在数据库上开始一个新事务，并检查消息 ID。如果数据库中已经存在相同的消息 ID，你知道它已经被处理，因此你可以向代理确认消息并丢弃它。
2. 如果消息 ID 尚未在数据库中，你将其添加到表中。然后你处理消息，这可能会导致在同一事务中对数据库进行额外的写入。完成处理消息后，你提交数据库上的事务。
3. 一旦数据库事务成功提交，你就可以向代理确认消息。
4. 一旦消息成功确认给代理，你知道它不会再次尝试处理相同的消息，因此你可以从数据库中删除消息 ID（在单独的事务中）。

如果消息处理器在提交数据库事务之前崩溃，事务将被中止，消息代理将重试处理。如果它在提交后但在向代理确认消息之前崩溃，它也将重试处理，但重试将在数据库中看到消息 ID 并丢弃它。如果它在确认消息后但在从数据库中删除消息 ID 之前崩溃，你将有一个旧的消息 ID 留下，除了占用一点存储空间外不会造成任何伤害。如果在数据库事务中止之前发生重试（如果消息处理器和数据库之间的通信中断，这可能会发生），消息 ID 表上的唯一性约束应该防止两个并发事务插入相同的消息 ID。

因此，实现恰好一次处理只需要数据库中的事务——跨数据库和消息代理的原子性对于此用例不是必需的。在数据库中记录消息 ID 使消息处理具备*幂等性*，因此可以安全地重试消息处理而不会重复其副作用。流处理框架（如 Kafka Streams）中使用类似的方法来实现恰好一次语义，我们将在["容错"](/ch12#sec_stream_fault_tolerance)中看到。

但是，数据库内的内部分布式事务对于此类模式的可伸缩性仍然有用：例如，它们将允许消息 ID 存储在一个分片上，而消息处理更新的主数据存储在其他分片上，并确保跨这些分片的事务提交的原子性。


## 总结 {#summary}

事务是一个抽象层，允许应用程序假装某些并发问题和某些类型的硬件和软件故障不存在。大量错误被简化为简单的*事务中止*，应用程序只需要重试。

在本章中，我们看到了许多事务有助于防止的问题示例。并非所有应用程序都容易受到所有这些问题的影响：具有非常简单的访问模式的应用程序（例如，仅读取和写入单个记录）可能可以在没有事务的情况下管理。但是，对于更复杂的访问模式，事务可以大大减少你需要考虑的潜在错误情况的数量。

没有事务，各种错误场景（进程崩溃、网络中断、停电、磁盘已满、意外并发等）意味着数据可能以各种方式变得不一致。例如，反规范化数据很容易与源数据失去同步。没有事务，很难推理复杂的交互访问对数据库可能产生的影响。

在本章中，我们特别深入地探讨了并发控制的主题。我们讨论了几种广泛使用的隔离级别，特别是*读已提交*、*快照隔离*（有时称为*可重复读*）和*可串行化*。我们通过讨论各种竞态条件的示例来描述这些隔离级别，总结在 [表 8-1](#tab_transactions_isolation_levels) 中：

{{< figure id="tab_transactions_isolation_levels" title="表 8-1. 各种隔离级别可能发生的异常总结" class="w-full my-4" >}}

| 隔离级别 | 脏读   | 读取偏差  | 幻读   | 丢失更新  | 写偏差  |
|------|------|------|------|-------|------|
| 读未提交 | ✗ 可能 | ✗ 可能 | ✗ 可能 | ✗ 可能  | ✗ 可能 |
| 读已提交 | ✓ 防止 | ✗ 可能 | ✗ 可能 | ✗ 可能  | ✗ 可能 |
| 快照隔离 | ✓ 防止 | ✓ 防止 | ✓ 防止 | ? 视情况 | ✗ 可能 |
| 可串行化 | ✓ 防止 | ✓ 防止 | ✓ 防止 | ✓ 防止  | ✓ 防止 |

脏读
: 一个客户端在另一个客户端的写入提交之前读取它们。读已提交隔离级别和更强的级别防止脏读。

脏写
: 一个客户端覆盖另一个客户端已写入但尚未提交的数据。几乎所有事务实现都防止脏写。

读取偏差
: 客户端在不同时间点看到数据库的不同部分。某些读取偏差的情况也称为*不可重复读*。这个问题最常通过快照隔离来防止，它允许事务从对应于特定时间点的一致快照读取。它通常使用*多版本并发控制*（MVCC）实现。

丢失更新
: 两个客户端并发执行读-修改-写循环。一个覆盖另一个的写入而不合并其更改，因此数据丢失。某些快照隔离的实现会自动防止此异常，而其他实现需要手动锁（`SELECT FOR UPDATE`）。

写偏差
: 事务读取某些内容，根据它看到的值做出决定，并将决定写入数据库。但是，在进行写入时，决策的前提不再为真。只有可串行化隔离才能防止此异常。

幻读
: 事务读取匹配某些搜索条件的对象。另一个客户端进行影响该搜索结果的写入。快照隔离防止直接的幻读，但写偏差上下文中的幻读需要特殊处理，例如索引范围锁。

弱隔离级别可以防止某些异常，但让你（应用程序开发人员）手动处理其他异常（例如，使用显式锁定）。只有可串行化隔离可以防止所有这些问题。我们讨论了实现可串行化事务的三种不同方法：

字面上串行执行事务
: 如果你可以使每个事务执行得非常快（通常通过使用存储过程），并且事务吞吐量足够低，可以在单个 CPU 核心上处理或可以分片，这是一个简单有效的选择。

两阶段锁定
: 几十年来，这一直是实现可串行化的标准方法，但许多应用程序由于其性能不佳而避免使用它。

可串行化快照隔离（SSI）
: 一种相对较新的算法，避免了前面方法的大部分缺点。它使用乐观方法，允许事务在不阻塞的情况下进行。当事务想要提交时，它会被检查，如果执行不可串行化，它将被中止。

最后，我们研究了当事务分布在多个节点上时如何实现原子性，使用两阶段提交。如果这些节点都运行相同的数据库软件，分布式事务可以很好地工作，但跨不同存储技术（使用 XA 事务），2PC 是有问题的：它对协调器和驱动事务的应用程序代码中的故障非常敏感，并且与并发控制机制的交互很差。幸运的是，幂等性可以确保恰好一次语义，而无需跨不同存储技术的原子提交，我们将在后面的章节中看到更多相关内容。

本章中的示例使用了关系数据模型。但是，如["多对象事务的需求"](#sec_transactions_need)中所讨论的，无论使用哪种数据模型，事务都是有价值的数据库功能。


### 参考


[^1]: Steven J. Murdoch. [What went wrong with Horizon: learning from the Post Office Trial](https://www.benthamsgaze.org/2021/07/15/what-went-wrong-with-horizon-learning-from-the-post-office-trial/). *benthamsgaze.org*, July 2021. Archived at [perma.cc/CNM4-553F](https://perma.cc/CNM4-553F)
[^2]: Donald D. Chamberlin, Morton M. Astrahan, Michael W. Blasgen, James N. Gray, W. Frank King, Bruce G. Lindsay, Raymond Lorie, James W. Mehl, Thomas G. Price, Franco Putzolu, Patricia Griffiths Selinger, Mario Schkolnick, Donald R. Slutz, Irving L. Traiger, Bradford W. Wade, and Robert A. Yost. [A History and Evaluation of System R](https://dsf.berkeley.edu/cs262/2005/SystemR.pdf). *Communications of the ACM*, volume 24, issue 10, pages 632–646, October 1981. [doi:10.1145/358769.358784](https://doi.org/10.1145/358769.358784)
[^3]: Jim N. Gray, Raymond A. Lorie, Gianfranco R. Putzolu, and Irving L. Traiger. [Granularity of Locks and Degrees of Consistency in a Shared Data Base](https://citeseerx.ist.psu.edu/pdf/e127f0a6a912bb9150ecfe03c0ebf7fbc289a023). in *Modelling in Data Base Management Systems: Proceedings of the IFIP Working Conference on Modelling in Data Base Management Systems*, edited by G. M. Nijssen, pages 364–394, Elsevier/North Holland Publishing, 1976. Also in *Readings in Database Systems*, 4th edition, edited by Joseph M. Hellerstein and Michael Stonebraker, MIT Press, 2005. ISBN: 978-0-262-69314-1
[^4]: Kapali P. Eswaran, Jim N. Gray, Raymond A. Lorie, and Irving L. Traiger. [The Notions of Consistency and Predicate Locks in a Database System](https://jimgray.azurewebsites.net/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf?from=https://research.microsoft.com/en-us/um/people/gray/papers/On%20the%20Notions%20of%20Consistency%20and%20Predicate%20Locks%20in%20a%20Database%20System%20CACM.pdf). *Communications of the ACM*, volume 19, issue 11, pages 624–633, November 1976. [doi:10.1145/360363.360369](https://doi.org/10.1145/360363.360369)
[^5]: Rebecca Taft, Irfan Sharif, Andrei Matei, Nathan VanBenschoten, Jordan Lewis, Tobias Grieger, Kai Niemi, Andy Woods, Anne Birzin, Raphael Poss, Paul Bardea, Amruta Ranade, Ben Darnell, Bram Gruneir, Justin Jaffray, Lucy Zhang, and Peter Mattis. [CockroachDB: The Resilient Geo-Distributed SQL Database](https://dl.acm.org/doi/pdf/10.1145/3318464.3386134). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1493–1509, June 2020. [doi:10.1145/3318464.3386134](https://doi.org/10.1145/3318464.3386134)
[^6]: Dongxu Huang, Qi Liu, Qiu Cui, Zhuhe Fang, Xiaoyu Ma, Fei Xu, Li Shen, Liu Tang, Yuxing Zhou, Menglong Huang, Wan Wei, Cong Liu, Jian Zhang, Jianjun Li, Xuelian Wu, Lingyu Song, Ruoxi Sun, Shuaipeng Yu, Lei Zhao, Nicholas Cameron, Liquan Pei, and Xin Tang. [TiDB: a Raft-based HTAP database](https://www.vldb.org/pvldb/vol13/p3072-huang.pdf). *Proceedings of the VLDB Endowment*, volume 13, issue 12, pages 3072–3084. [doi:10.14778/3415478.3415535](https://doi.org/10.14778/3415478.3415535)
[^7]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating System Design and Implementation* (OSDI), October 2012.
[^8]: Jingyu Zhou, Meng Xu, Alexander Shraer, Bala Namasivayam, Alex Miller, Evan Tschannen, Steve Atherton, Andrew J. Beamon, Rusty Sears, John Leach, Dave Rosenthal, Xin Dong, Will Wilson, Ben Collins, David Scherer, Alec Grieser, Young Liu, Alvin Moore, Bhaskar Muppana, Xiaoge Su, and Vishesh Yadav. [FoundationDB: A Distributed Unbundled Transactional Key Value Store](https://www.foundationdb.org/files/fdb-paper.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457559](https://doi.org/10.1145/3448016.3457559)
[^9]: Theo Härder and Andreas Reuter. [Principles of Transaction-Oriented Database Recovery](https://citeseerx.ist.psu.edu/pdf/11ef7c142295aeb1a28a0e714c91fc8d610c3047). *ACM Computing Surveys*, volume 15, issue 4, pages 287–317, December 1983. [doi:10.1145/289.291](https://doi.org/10.1145/289.291)
[^10]: Peter Bailis, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [HAT, not CAP: Towards Highly Available Transactions](https://www.usenix.org/system/files/conference/hotos13/hotos13-final80.pdf). At *14th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2013.
[^11]: Armando Fox, Steven D. Gribble, Yatin Chawathe, Eric A. Brewer, and Paul Gauthier. [Cluster-Based Scalable Network Services](https://people.eecs.berkeley.edu/~brewer/cs262b/TACC.pdf). At *16th ACM Symposium on Operating Systems Principles* (SOSP), October 1997. [doi:10.1145/268998.266662](https://doi.org/10.1145/268998.266662)
[^12]: Tony Andrews. [Enforcing Complex Constraints in Oracle](https://tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html). *tonyandrews.blogspot.co.uk*, October 2004. Archived at [archive.org](https://web.archive.org/web/20220201190625/https%3A//tonyandrews.blogspot.com/2004/10/enforcing-complex-constraints-in.html)
[^13]: Philip A. Bernstein, Vassos Hadzilacos, and Nathan Goodman. [*Concurrency Control and Recovery in Database Systems*](https://www.microsoft.com/en-us/research/people/philbe/book/). Addison-Wesley, 1987. ISBN: 978-0-201-10715-9, available online at [*microsoft.com*](https://www.microsoft.com/en-us/research/people/philbe/book/).
[^14]: Alan Fekete, Dimitrios Liarokapis, Elizabeth O’Neil, Patrick O’Neil, and Dennis Shasha. [Making Snapshot Isolation Serializable](https://www.cse.iitb.ac.in/infolab/Data/Courses/CS632/2009/Papers/p492-fekete.pdf). *ACM Transactions on Database Systems*, volume 30, issue 2, pages 492–528, June 2005. [doi:10.1145/1071610.1071615](https://doi.org/10.1145/1071610.1071615)
[^15]: Mai Zheng, Joseph Tucek, Feng Qin, and Mark Lillibridge. [Understanding the Robustness of SSDs Under Power Fault](https://www.usenix.org/system/files/conference/fast13/fast13-final80.pdf). At *11th USENIX Conference on File and Storage Technologies* (FAST), February 2013.
[^16]: Laurie Denness. [SSDs: A Gift and a Curse](https://laur.ie/blog/2015/06/ssds-a-gift-and-a-curse/). *laur.ie*, June 2015. Archived at [perma.cc/6GLP-BX3T](https://perma.cc/6GLP-BX3T)
[^17]: Adam Surak. [When Solid State Drives Are Not That Solid](https://www.algolia.com/blog/engineering/when-solid-state-drives-are-not-that-solid). *blog.algolia.com*, June 2015. Archived at [perma.cc/CBR9-QZEE](https://perma.cc/CBR9-QZEE)
[^18]: Hewlett Packard Enterprise. [Bulletin: (Revision) HPE SAS Solid State Drives - Critical Firmware Upgrade Required for Certain HPE SAS Solid State Drive Models to Prevent Drive Failure at 32,768 Hours of Operation](https://support.hpe.com/hpesc/public/docDisplay?docId=emr_na-a00092491en_us). *support.hpe.com*, November 2019. Archived at [perma.cc/CZR4-AQBS](https://perma.cc/CZR4-AQBS)
[^19]: Craig Ringer et al. [PostgreSQL’s handling of fsync() errors is unsafe and risks data loss at least on XFS](https://www.postgresql.org/message-id/flat/CAMsr%2BYHh%2B5Oq4xziwwoEfhoTZgr07vdGG%2Bhu%3D1adXx59aTeaoQ%40mail.gmail.com). Email thread on pgsql-hackers mailing list, *postgresql.org*, March 2018. Archived at [perma.cc/5RKU-57FL](https://perma.cc/5RKU-57FL)
[^20]: Anthony Rebello, Yuvraj Patel, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Can Applications Recover from fsync Failures?](https://www.usenix.org/conference/atc20/presentation/rebello) At *USENIX Annual Technical Conference* (ATC), July 2020.
[^21]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Crash Consistency: Rethinking the Fundamental Abstractions of the File System](https://dl.acm.org/doi/pdf/10.1145/2800695.2801719). *ACM Queue*, volume 13, issue 7, pages 20–28, July 2015. [doi:10.1145/2800695.2801719](https://doi.org/10.1145/2800695.2801719)
[^22]: Thanumalayan Sankaranarayana Pillai, Vijay Chidambaram, Ramnatthan Alagappan, Samer Al-Kiswany, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications](https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf). At *11th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2014.
[^23]: Chris Siebenmann. [Unix’s File Durability Problem](https://utcc.utoronto.ca/~cks/space/blog/unix/FileSyncProblem). *utcc.utoronto.ca*, April 2016. Archived at [perma.cc/VSS8-5MC4](https://perma.cc/VSS8-5MC4)
[^24]: Aishwarya Ganesan, Ramnatthan Alagappan, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [Redundancy Does Not Imply Fault Tolerance: Analysis of Distributed Storage Reactions to Single Errors and Corruptions](https://www.usenix.org/conference/fast17/technical-sessions/presentation/ganesan). At *15th USENIX Conference on File and Storage Technologies* (FAST), February 2017.
[^25]: Lakshmi N. Bairavasundaram, Garth R. Goodson, Bianca Schroeder, Andrea C. Arpaci-Dusseau, and Remzi H. Arpaci-Dusseau. [An Analysis of Data Corruption in the Storage Stack](https://www.usenix.org/legacy/event/fast08/tech/full_papers/bairavasundaram/bairavasundaram.pdf). At *6th USENIX Conference on File and Storage Technologies* (FAST), February 2008.
[^26]: Bianca Schroeder, Raghav Lagisetty, and Arif Merchant. [Flash Reliability in Production: The Expected and the Unexpected](https://www.usenix.org/conference/fast16/technical-sessions/presentation/schroeder). At *14th USENIX Conference on File and Storage Technologies* (FAST), February 2016.
[^27]: Don Allison. [SSD Storage – Ignorance of Technology Is No Excuse](https://blog.korelogic.com/blog/2015/03/24). *blog.korelogic.com*, March 2015. Archived at [perma.cc/9QN4-9SNJ](https://perma.cc/9QN4-9SNJ)
[^28]: Gordon Mah Ung. [Debunked: Your SSD won’t lose data if left unplugged after all](https://www.pcworld.com/article/427602/debunked-your-ssd-wont-lose-data-if-left-unplugged-after-all.html). *pcworld.com*, May 2015. Archived at [perma.cc/S46H-JUDU](https://perma.cc/S46H-JUDU)
[^29]: Martin Kleppmann. [Hermitage: Testing the ‘I’ in ACID](https://martin.kleppmann.com/2014/11/25/hermitage-testing-the-i-in-acid.html). *martin.kleppmann.com*, November 2014. Archived at [perma.cc/KP2Y-AQGK](https://perma.cc/KP2Y-AQGK)
[^30]: Todd Warszawski and Peter Bailis. [ACIDRain: Concurrency-Related Attacks on Database-Backed Web Applications](http://www.bailis.org/papers/acidrain-sigmod2017.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 2017. [doi:10.1145/3035918.3064037](https://doi.org/10.1145/3035918.3064037)
[^31]: Tristan D’Agosta. [BTC Stolen from Poloniex](https://bitcointalk.org/index.php?topic=499580). *bitcointalk.org*, March 2014. Archived at [perma.cc/YHA6-4C5D](https://perma.cc/YHA6-4C5D)
[^32]: bitcointhief2. [How I Stole Roughly 100 BTC from an Exchange and How I Could Have Stolen More!](https://www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/) *reddit.com*, February 2014. Archived at [archive.org](https://web.archive.org/web/20250118042610/https%3A//www.reddit.com/r/Bitcoin/comments/1wtbiu/how_i_stole_roughly_100_btc_from_an_exchange_and/)
[^33]: Sudhir Jorwekar, Alan Fekete, Krithi Ramamritham, and S. Sudarshan. [Automating the Detection of Snapshot Isolation Anomalies](https://www.vldb.org/conf/2007/papers/industrial/p1263-jorwekar.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^34]: Michael Melanson. [Transactions: The Limits of Isolation](https://www.michaelmelanson.net/posts/transactions-the-limits-of-isolation/). *michaelmelanson.net*, November 2014. Archived at [perma.cc/RG5R-KMYZ](https://perma.cc/RG5R-KMYZ)
[^35]: Edward Kim. [How ACH works: A developer perspective — Part 1](https://engineering.gusto.com/how-ach-works-a-developer-perspective-part-1-339d3e7bea1). *engineering.gusto.com*, April 2014. Archived at [perma.cc/7B2H-PU94](https://perma.cc/7B2H-PU94)
[^36]: Hal Berenson, Philip A. Bernstein, Jim N. Gray, Jim Melton, Elizabeth O’Neil, and Patrick O’Neil. [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf). At *ACM International Conference on Management of Data* (SIGMOD), May 1995. [doi:10.1145/568271.223785](https://doi.org/10.1145/568271.223785)
[^37]: Atul Adya. [Weak Consistency: A Generalized Theory and Optimistic Implementations for Distributed Transactions](https://pmg.csail.mit.edu/papers/adya-phd.pdf). PhD Thesis, Massachusetts Institute of Technology, March 1999. Archived at [perma.cc/E97M-HW5Q](https://perma.cc/E97M-HW5Q)
[^38]: Peter Bailis, Aaron Davidson, Alan Fekete, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Highly Available Transactions: Virtues and Limitations](https://www.vldb.org/pvldb/vol7/p181-bailis.pdf). At *40th International Conference on Very Large Data Bases* (VLDB), September 2014.
[^39]: Natacha Crooks, Youer Pu, Lorenzo Alvisi, and Allen Clement. [Seeing is Believing: A Client-Centric Specification of Database Isolation](https://www.cs.cornell.edu/lorenzo/papers/Crooks17Seeing.pdf). At *ACM Symposium on Principles of Distributed Computing* (PODC), pages 73–82, July 2017. [doi:10.1145/3087801.3087802](https://doi.org/10.1145/3087801.3087802)
[^40]: Bruce Momjian. [MVCC Unmasked](https://momjian.us/main/writings/pgsql/mvcc.pdf). *momjian.us*, July 2014. Archived at [perma.cc/KQ47-9GYB](https://perma.cc/KQ47-9GYB)
[^41]: Peter Alvaro and Kyle Kingsbury. [MySQL 8.0.34](https://jepsen.io/analyses/mysql-8.0.34). *jepsen.io*, December 2023. Archived at [perma.cc/HGE2-Z878](https://perma.cc/HGE2-Z878)
[^42]: Egor Rogov. [PostgreSQL 14 Internals](https://postgrespro.com/community/books/internals). *postgrespro.com*, April 2023. Archived at [perma.cc/FRK2-D7WB](https://perma.cc/FRK2-D7WB)
[^43]: Hironobu Suzuki. [The Internals of PostgreSQL](https://www.interdb.jp/pg/). *interdb.jp*, 2017.
[^44]: Rohan Reddy Alleti. [Internals of MVCC in Postgres: Hidden costs of Updates vs Inserts](https://medium.com/%40rohanjnr44/internals-of-mvcc-in-postgres-hidden-costs-of-updates-vs-inserts-381eadd35844). *medium.com*, March 2025. Archived at [perma.cc/3ACX-DFXT](https://perma.cc/3ACX-DFXT)
[^45]: Andy Pavlo and Bohan Zhang. [The Part of PostgreSQL We Hate the Most](https://www.cs.cmu.edu/~pavlo/blog/2023/04/the-part-of-postgresql-we-hate-the-most.html). *cs.cmu.edu*, April 2023. Archived at [perma.cc/XSP6-3JBN](https://perma.cc/XSP6-3JBN)
[^46]: Yingjun Wu, Joy Arulraj, Jiexi Lin, Ran Xian, and Andrew Pavlo. [An empirical evaluation of in-memory multi-version concurrency control](https://vldb.org/pvldb/vol10/p781-Wu.pdf). *Proceedings of the VLDB Endowment*, volume 10, issue 7, pages 781–792, March 2017. [doi:10.14778/3067421.3067427](https://doi.org/10.14778/3067421.3067427)
[^47]: Nikita Prokopov. [Unofficial Guide to Datomic Internals](https://tonsky.me/blog/unofficial-guide-to-datomic-internals/). *tonsky.me*, May 2014.
[^48]: Daniil Svetlov. [A Practical Guide to Taming Postgres Isolation Anomalies](https://dansvetlov.me/postgres-anomalies/). *dansvetlov.me*, March 2025. Archived at [perma.cc/L7LE-TDLS](https://perma.cc/L7LE-TDLS)
[^49]: Nate Wiger. [An Atomic Rant](https://nateware.com/2010/02/18/an-atomic-rant/). *nateware.com*, February 2010. Archived at [perma.cc/5ZYB-PE44](https://perma.cc/5ZYB-PE44)
[^50]: James Coglan. [Reading and writing, part 3: web applications](https://blog.jcoglan.com/2020/10/12/reading-and-writing-part-3/). *blog.jcoglan.com*, October 2020. Archived at [perma.cc/A7EK-PJVS](https://perma.cc/A7EK-PJVS)
[^51]: Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, and Ion Stoica. [Feral Concurrency Control: An Empirical Investigation of Modern Application Integrity](http://www.bailis.org/papers/feral-sigmod2015.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2015. [doi:10.1145/2723372.2737784](https://doi.org/10.1145/2723372.2737784)
[^52]: Jaana Dogan. [Things I Wished More Developers Knew About Databases](https://rakyll.medium.com/things-i-wished-more-developers-knew-about-databases-2d0178464f78). *rakyll.medium.com*, April 2020. Archived at [perma.cc/6EFK-P2TD](https://perma.cc/6EFK-P2TD)
[^53]: Michael J. Cahill, Uwe Röhm, and Alan Fekete. [Serializable Isolation for Snapshot Databases](https://www.cs.cornell.edu/~sowell/dbpapers/serializable_isolation.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2008. [doi:10.1145/1376616.1376690](https://doi.org/10.1145/1376616.1376690)
[^54]: Dan R. K. Ports and Kevin Grittner. [Serializable Snapshot Isolation in PostgreSQL](https://drkp.net/papers/ssi-vldb12.pdf). At *38th International Conference on Very Large Data Bases* (VLDB), August 2012.
[^55]: Douglas B. Terry, Marvin M. Theimer, Karin Petersen, Alan J. Demers, Mike J. Spreitzer, and Carl H. Hauser. [Managing Update Conflicts in Bayou, a Weakly Connected Replicated Storage System](https://pdos.csail.mit.edu/6.824/papers/bayou-conflicts.pdf). At *15th ACM Symposium on Operating Systems Principles* (SOSP), December 1995. [doi:10.1145/224056.224070](https://doi.org/10.1145/224056.224070)
[^56]: Hans-Jürgen Schönig. [Constraints over multiple rows in PostgreSQL](https://www.cybertec-postgresql.com/en/postgresql-constraints-over-multiple-rows/). *cybertec-postgresql.com*, June 2021. Archived at [perma.cc/2TGH-XUPZ](https://perma.cc/2TGH-XUPZ)
[^57]: Michael Stonebraker, Samuel Madden, Daniel J. Abadi, Stavros Harizopoulos, Nabil Hachem, and Pat Helland. [The End of an Architectural Era (It’s Time for a Complete Rewrite)](https://vldb.org/conf/2007/papers/industrial/p1150-stonebraker.pdf). At *33rd International Conference on Very Large Data Bases* (VLDB), September 2007.
[^58]: John Hugg. [H-Store/VoltDB Architecture vs. CEP Systems and Newer Streaming Architectures](https://www.youtube.com/watch?v=hD5M4a1UVz8). At *Data @Scale Boston*, November 2014.
[^59]: Robert Kallman, Hideaki Kimura, Jonathan Natkins, Andrew Pavlo, Alexander Rasin, Stanley Zdonik, Evan P. C. Jones, Samuel Madden, Michael Stonebraker, Yang Zhang, John Hugg, and Daniel J. Abadi. [H-Store: A High-Performance, Distributed Main Memory Transaction Processing System](https://www.vldb.org/pvldb/vol1/1454211.pdf). *Proceedings of the VLDB Endowment*, volume 1, issue 2, pages 1496–1499, August 2008.
[^60]: Rich Hickey. [The Architecture of Datomic](https://www.infoq.com/articles/Architecture-Datomic/). *infoq.com*, November 2012. Archived at [perma.cc/5YWU-8XJK](https://perma.cc/5YWU-8XJK)
[^61]: John Hugg. [Debunking Myths About the VoltDB In-Memory Database](https://dzone.com/articles/debunking-myths-about-voltdb). *dzone.com*, May 2014. Archived at [perma.cc/2Z9N-HPKF](https://perma.cc/2Z9N-HPKF)
[^62]: Xinjing Zhou, Viktor Leis, Xiangyao Yu, and Michael Stonebraker. [OLTP Through the Looking Glass 16 Years Later: Communication is the New Bottleneck](https://www.vldb.org/cidrdb/papers/2025/p17-zhou.pdf). At *15th Annual Conference on Innovative Data Systems Research* (CIDR), January 2025.
[^63]: Xinjing Zhou, Xiangyao Yu, Goetz Graefe, and Michael Stonebraker. [Lotus: scalable multi-partition transactions on single-threaded partitioned databases](https://www.vldb.org/pvldb/vol15/p2939-zhou.pdf). *Proceedings of the VLDB Endowment* (PVLDB), volume 15, issue 11, pages 2939–2952, July 2022. [doi:10.14778/3551793.3551843](https://doi.org/10.14778/3551793.3551843)
[^64]: Joseph M. Hellerstein, Michael Stonebraker, and James Hamilton. [Architecture of a Database System](https://dsf.berkeley.edu/papers/fntdb07-architecture.pdf). *Foundations and Trends in Databases*, volume 1, issue 2, pages 141–259, November 2007. [doi:10.1561/1900000002](https://doi.org/10.1561/1900000002)
[^65]: Michael J. Cahill. [Serializable Isolation for Snapshot Databases](https://ses.library.usyd.edu.au/bitstream/handle/2123/5353/michael-cahill-2009-thesis.pdf). PhD Thesis, University of Sydney, July 2009. Archived at [perma.cc/727J-NTMP](https://perma.cc/727J-NTMP)
[^66]: Cristian Diaconu, Craig Freedman, Erik Ismert, Per-Åke Larson, Pravin Mittal, Ryan Stonecipher, Nitin Verma, and Mike Zwilling. [Hekaton: SQL Server’s Memory-Optimized OLTP Engine](https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 1243–1254, June 2013. [doi:10.1145/2463676.2463710](https://doi.org/10.1145/2463676.2463710)
[^67]: Thomas Neumann, Tobias Mühlbauer, and Alfons Kemper. [Fast Serializable Multi-Version Concurrency Control for Main-Memory Database Systems](https://db.in.tum.de/~muehlbau/papers/mvcc.pdf). At *ACM SIGMOD International Conference on Management of Data* (SIGMOD), pages 677–689, May 2015. [doi:10.1145/2723372.2749436](https://doi.org/10.1145/2723372.2749436)
[^68]: D. Z. Badal. [Correctness of Concurrency Control and Implications in Distributed Databases](https://ieeexplore.ieee.org/abstract/document/762563). At *3rd International IEEE Computer Software and Applications Conference* (COMPSAC), November 1979. [doi:10.1109/CMPSAC.1979.762563](https://doi.org/10.1109/CMPSAC.1979.762563)
[^69]: Rakesh Agrawal, Michael J. Carey, and Miron Livny. [Concurrency Control Performance Modeling: Alternatives and Implications](https://people.eecs.berkeley.edu/~brewer/cs262/ConcControl.pdf). *ACM Transactions on Database Systems* (TODS), volume 12, issue 4, pages 609–654, December 1987. [doi:10.1145/32204.32220](https://doi.org/10.1145/32204.32220)
[^70]: Marc Brooker. [Snapshot Isolation vs Serializability](https://brooker.co.za/blog/2024/12/17/occ-and-isolation.html). *brooker.co.za*, December 2024. Archived at [perma.cc/5TRC-CR5G](https://perma.cc/5TRC-CR5G)
[^71]: B. G. Lindsay, P. G. Selinger, C. Galtieri, J. N. Gray, R. A. Lorie, T. G. Price, F. Putzolu, I. L. Traiger, and B. W. Wade. [Notes on Distributed Databases](https://dominoweb.draco.res.ibm.com/reports/RJ2571.pdf). IBM Research, Research Report RJ2571(33471), July 1979. Archived at [perma.cc/EPZ3-MHDD](https://perma.cc/EPZ3-MHDD)
[^72]: C. Mohan, Bruce G. Lindsay, and Ron Obermarck. [Transaction Management in the R\* Distributed Database Management System](https://cs.brown.edu/courses/csci2270/archives/2012/papers/dtxn/p378-mohan.pdf). *ACM Transactions on Database Systems*, volume 11, issue 4, pages 378–396, December 1986. [doi:10.1145/7239.7266](https://doi.org/10.1145/7239.7266)
[^73]: X/Open Company Ltd. [Distributed Transaction Processing: The XA Specification](https://pubs.opengroup.org/onlinepubs/009680699/toc.pdf). Technical Standard XO/CAE/91/300, December 1991. ISBN: 978-1-872-63024-3, archived at [perma.cc/Z96H-29JB](https://perma.cc/Z96H-29JB)
[^74]: Ivan Silva Neto and Francisco Reverbel. [Lessons Learned from Implementing WS-Coordination and WS-AtomicTransaction](https://www.ime.usp.br/~reverbel/papers/icis2008.pdf). At *7th IEEE/ACIS International Conference on Computer and Information Science* (ICIS), May 2008. [doi:10.1109/ICIS.2008.75](https://doi.org/10.1109/ICIS.2008.75)
[^75]: James E. Johnson, David E. Langworthy, Leslie Lamport, and Friedrich H. Vogt. [Formal Specification of a Web Services Protocol](https://www.microsoft.com/en-us/research/publication/formal-specification-of-a-web-services-protocol/). At *1st International Workshop on Web Services and Formal Methods* (WS-FM), February 2004. [doi:10.1016/j.entcs.2004.02.022](https://doi.org/10.1016/j.entcs.2004.02.022)
[^76]: Jim Gray. [The Transaction Concept: Virtues and Limitations](https://jimgray.azurewebsites.net/papers/thetransactionconcept.pdf). At *7th International Conference on Very Large Data Bases* (VLDB), September 1981.
[^77]: Dale Skeen. [Nonblocking Commit Protocols](https://www.cs.utexas.edu/~lorenzo/corsi/cs380d/papers/Ske81.pdf). At *ACM International Conference on Management of Data* (SIGMOD), April 1981. [doi:10.1145/582318.582339](https://doi.org/10.1145/582318.582339)
[^78]: Gregor Hohpe. [Your Coffee Shop Doesn’t Use Two-Phase Commit](https://www.martinfowler.com/ieeeSoftware/coffeeShop.pdf). *IEEE Software*, volume 22, issue 2, pages 64–66, March 2005. [doi:10.1109/MS.2005.52](https://doi.org/10.1109/MS.2005.52)
[^79]: Pat Helland. [Life Beyond Distributed Transactions: An Apostate’s Opinion](https://www.cidrdb.org/cidr2007/papers/cidr07p15.pdf). At *3rd Biennial Conference on Innovative Data Systems Research* (CIDR), January 2007.
[^80]: Jonathan Oliver. [My Beef with MSDTC and Two-Phase Commits](https://blog.jonathanoliver.com/my-beef-with-msdtc-and-two-phase-commits/). *blog.jonathanoliver.com*, April 2011. Archived at [perma.cc/K8HF-Z4EN](https://perma.cc/K8HF-Z4EN)
[^81]: Oren Eini (Ayende Rahien). [The Fallacy of Distributed Transactions](https://ayende.com/blog/167362/the-fallacy-of-distributed-transactions). *ayende.com*, July 2014. Archived at [perma.cc/VB87-2JEF](https://perma.cc/VB87-2JEF)
[^82]: Clemens Vasters. [Transactions in Windows Azure (with Service Bus) – An Email Discussion](https://learn.microsoft.com/en-gb/archive/blogs/clemensv/transactions-in-windows-azure-with-service-bus-an-email-discussion). *learn.microsoft.com*, July 2012. Archived at [perma.cc/4EZ9-5SKW](https://perma.cc/4EZ9-5SKW)
[^83]: Ajmer Dhariwal. [Orphaned MSDTC Transactions (-2 spids)](https://www.eraofdata.com/posts/2008/orphaned-msdtc-transactions-2-spids/). *eraofdata.com*, December 2008. Archived at [perma.cc/YG6F-U34C](https://perma.cc/YG6F-U34C)
[^84]: Paul Randal. [Real World Story of DBCC PAGE Saving the Day](https://www.sqlskills.com/blogs/paul/real-world-story-of-dbcc-page-saving-the-day/). *sqlskills.com*, June 2013. Archived at [perma.cc/2MJN-A5QH](https://perma.cc/2MJN-A5QH)
[^85]: Guozhang Wang, Lei Chen, Ayusman Dikshit, Jason Gustafson, Boyang Chen, Matthias J. Sax, John Roesler, Sophie Blee-Goldman, Bruno Cadonna, Apurva Mehta, Varun Madan, and Jun Rao. [Consistency and Completeness: Rethinking Distributed Stream Processing in Apache Kafka](https://dl.acm.org/doi/pdf/10.1145/3448016.3457556). At *ACM International Conference on Management of Data* (SIGMOD), June 2021. [doi:10.1145/3448016.3457556](https://doi.org/10.1145/3448016.3457556)


================================================
FILE: content/zh/ch9.md
================================================
---
title: "9. 分布式系统的麻烦"
weight: 209
breadcrumbs: false
---

<a id="ch_distributed"></a>

![](/map/ch08.png)

> *意外这东西挺有意思：你没碰上之前，它就从来不会发生。*
>
> A.A. 米尔恩，《小熊维尼和老灰驴的家》（1928）

正如 ["可靠性与容错"](/ch2#sec_introduction_reliability) 中所讨论的，让系统可靠意味着确保系统作为一个整体继续工作，即使出了问题（即出现故障）。然而，预料所有可能的故障并处理它们并不是那么容易。作为开发者，我们很容易主要关注正常路径（毕竟，大多数时候事情都运行良好！）而忽略故障，因为故障会引入大量边界情况。

如果你希望系统在故障存在的情况下仍然可靠，你必须从根本上改变你的思维方式，并专注于可能出错的事情，即使它们可能性很低。一件事情出错的概率是否只有百万分之一并不重要：在一个足够大的系统中，百万分之一的事件每天都在发生。经验丰富的系统操作员会告诉你，任何 *可能* 出错的事情 *都会* 出错。

此外，使用分布式系统与在单台计算机上编写软件有着根本的不同 —— 主要区别在于有许多新的、令人兴奋的出错方式 [^1] [^2]。在本章中，你将体验实践中出现的问题，并理解你可以依赖和不能依赖的事物。

为了理解我们面临的挑战，我们现在将把悲观情绪发挥到极致，探索分布式系统中可能出错的事情。我们将研究网络问题（["不可靠的网络"](#sec_distributed_networks)）以及时钟和时序问题（["不可靠的时钟"](#sec_distributed_clocks)）。所有这些问题的后果令人迷惑，因此我们将探索如何思考分布式系统的状态以及如何推理已经发生的事情（["知识、真相与谎言"](#sec_distributed_truth)）。稍后，在 [第 10 章](/ch10#ch_consistency) 中，我们将看一些面对这些故障时如何实现容错的例子。

## 故障与部分失效 {#sec_distributed_partial_failure}

当你在单台计算机上编写程序时，它通常以相当可预测的方式运行：要么工作，要么不工作。有缺陷的软件可能会给人一种计算机有时 "状态不佳" 的印象（这个问题通常通过重启来解决），但这主要只是编写不良的软件的后果。

软件在单台计算机上不应该是不稳定的，这没有根本原因：当硬件正常工作时，相同的操作总是产生相同的结果（它是 *确定性的*）。如果存在硬件问题（例如，内存损坏或连接器松动），后果通常是整个系统故障（例如，内核恐慌、"蓝屏死机"、无法启动）。一台运行良好软件的单独计算机通常要么完全正常运行，要么完全故障，而不是介于两者之间。

这是计算机设计中的一个刻意选择：如果发生内部故障，我们宁愿计算机完全崩溃而不是返回错误的结果，因为错误的结果很难处理且令人困惑。因此，计算机隐藏了它们赖以实现的模糊物理现实，并呈现一个以数学般完美的方式运行的理想化系统模型。CPU 指令总是做同样的事情；如果你将一些数据写入内存或磁盘，该数据保持完整，不会被随机损坏。正如 ["硬件与软件故障"](/ch2#sec_introduction_hardware_faults) 中所讨论的，这实际上并不是真的 —— 实际上，数据确实会被静默损坏，CPU 有时会静默返回错误的结果 —— 但这种情况发生得足够少，以至于我们可以忽略它。

当你编写在多台计算机上运行的软件，通过网络连接时，情况就根本不同了。在分布式系统中，故障发生得更加频繁，因此我们不能再忽略它们 —— 我们别无选择，只能直面物理世界的混乱现实。在物理世界中，可能出错的事情范围非常广泛，正如这个轶事所说明的 [^3]：

> 在我有限的经验中，我处理过单个数据中心（DC）中的长期网络分区、PDU [配电单元] 故障、交换机故障、整个机架的意外断电、整个 DC 骨干网故障、整个 DC 电源故障，以及一个低血糖的司机将他的福特皮卡撞进 DC 的 HVAC [供暖、通风和空调] 系统。而我甚至不是运维人员。
>
> —— Coda Hale

在分布式系统中，系统的某些部分可能以某种不可预测的方式出现故障，即使系统的其他部分工作正常。这被称为 *部分失效*。困难在于部分失效是 *非确定性的*：如果你尝试做任何涉及多个节点和网络的事情，它有时可能工作，有时可能不可预测地失败。正如我们将看到的，你甚至可能不 *知道* 某事是否成功！

这种非确定性和部分失效的可能性使分布式系统难以使用 [^4]。另一方面，如果分布式系统可以容忍部分失效，这将开启强大的可能性：例如，它允许你执行滚动升级，一次重启一个节点以安装软件更新，而系统作为一个整体继续不间断地工作。因此，容错使我们能够从不可靠的组件构建比单节点系统更可靠的分布式系统。

但在我们实现容错之前，我们需要更多地了解我们应该容忍的故障。重要的是要考虑各种可能的故障 —— 即使是相当不太可能的故障 —— 并在你的测试环境中人为地创建这种情况以查看会发生什么。在分布式系统中，怀疑、悲观和偏执是有回报的。

## 不可靠的网络 {#sec_distributed_networks}

正如 ["共享内存、共享磁盘和无共享架构"](/ch2#sec_introduction_shared_nothing) 中所讨论的，我们在本书中关注的分布式系统主要是 *无共享系统*：即通过网络连接的一组机器。网络是这些机器进行通信的唯一方式 —— 我们假设每台机器都有自己的内存和磁盘，一台机器不能访问另一台机器的内存或磁盘（除非通过网络向服务发出请求）。即使存储是共享的，例如亚马逊的 S3，机器也是通过网络与共享存储服务通信。

互联网和数据中心中的大多数内部网络（通常是以太网）都是 *异步分组网络*。在这种网络中，一个节点可以向另一个节点发送消息（数据包），但网络不保证它何时到达，或者是否会到达。如果你发送请求并期望响应，许多事情可能会出错（其中一些如 [图 9-1](#fig_distributed_network) 所示）：

1. 你的请求可能已经丢失（也许有人拔掉了网线）。
2. 你的请求可能在队列中等待，稍后将被交付（也许网络或接收方过载）。
3. 远程节点可能已经失效（也许它崩溃了或被关闭了）。
4. 远程节点可能暂时停止响应（也许它正在经历长时间的垃圾回收暂停；见 ["进程暂停"](#sec_distributed_clocks_pauses)），但稍后会再次开始响应。
5. 远程节点可能已经处理了你的请求，但响应在网络上丢失了（也许网络交换机配置错误）。
6. 远程节点可能已经处理了你的请求，但响应被延迟了，稍后将被交付（也许网络或你自己的机器过载）。

{{< figure src="/fig/ddia_0901.png" id="fig_distributed_network" caption="图 9-1. 如果你发送请求但没有收到响应，无法区分是 (a) 请求丢失了，(b) 远程节点宕机了，还是 (c) 响应丢失了。" class="w-full my-4" >}}


发送方甚至无法判断数据包是否已交付：唯一的选择是让接收方发送响应消息，而响应消息本身也可能丢失或延迟。在异步网络中，这些问题是无法区分的：你拥有的唯一信息是你还没有收到响应。如果你向另一个节点发送请求但没有收到响应，*不可能* 判断原因。

处理这个问题的常用方法是 *超时*：在一段时间后，你放弃等待并假设响应不会到达。然而，当超时发生时，你仍然不知道远程节点是否收到了你的请求（如果请求仍在某处排队，即使发送方已经放弃了它，它仍可能被交付给接收方）。

### TCP 的局限性 {#sec_distributed_tcp}

网络数据包有最大大小（通常为几千字节），但许多应用程序需要发送太大而无法装入一个数据包的消息（请求、响应）。这些应用程序最常使用 TCP（传输控制协议）来建立一个 *连接*，将大型数据流分解为单个数据包，并在接收端将它们重新组合起来。

--------

> [!NOTE]
> 我们关于 TCP 的大部分内容也适用于其更新的替代方案 QUIC，以及 WebRTC 中使用的流控制传输协议（SCTP）、BitTorrent uTP 协议和其他传输协议。有关与 UDP 的比较，请参见 ["TCP 与 UDP"](#sidebar_distributed_tcp_udp)。

--------

TCP 通常被描述为提供 "可靠" 的交付，从某种意义上说，它检测并重传丢弃的数据包，检测重新排序的数据包并将它们恢复到正确的顺序，并使用简单的校验和检测数据包损坏。它还计算出可以发送数据的速度，以便尽快传输数据，但不会使网络或接收节点过载；这被称为 *拥塞控制*、*流量控制* 或 *背压* [^5]。

当你通过将数据写入套接字来 "发送" 一些数据时，它实际上不会立即发送，而只是放置在由操作系统管理的缓冲区中。当拥塞控制算法决定它有能力发送数据包时，它会从该缓冲区中获取下一个数据包的数据并将其传递给网络接口。数据包通过几个交换机和路由器，最终接收节点的操作系统将数据包的数据放置在接收缓冲区中并向发送方发送确认数据包。只有这样，接收操作系统才会通知应用程序有更多数据到达 [^6]。

那么，如果 TCP 提供 "可靠性"，这是否意味着我们不再需要担心网络不可靠？不幸的是，并非如此。如果在某个超时时间内没有收到确认，TCP 会认为数据包一定已经丢失，但 TCP 也无法判断丢失的是出站数据包还是确认。尽管 TCP 可以重新发送数据包，但它不能保证新数据包也会通过。如果网线被拔掉，TCP 不能为你重新插上它。最终，在可配置的超时后，TCP 放弃并向应用程序发出错误信号。

如果 TCP 连接因错误而关闭 —— 也许是因为远程节点崩溃了，或者是因为网络被中断了 —— 你不幸地无法知道远程节点实际处理了多少数据 [^6]。即使 TCP 确认数据包已交付，这也仅意味着远程节点上的操作系统内核收到了它，但应用程序可能在处理该数据之前就崩溃了。如果你想确保请求成功，你需要应用层返回明确的成功响应 [^7]。

尽管如此，TCP 非常有用，因为它提供了一种方便的方式来发送和接收太大而无法装入一个数据包的消息。一旦建立了 TCP 连接，你还可以使用它来发送多个请求和响应。这通常是通过首先发送一个标头来完成的，该标头以字节为单位指示后续消息的长度，然后是实际消息。HTTP 和许多 RPC 协议（见 ["通过服务的数据流：REST 和 RPC"](/ch5#sec_encoding_dataflow_rpc)）就是这样工作的。

### 实践中的网络故障 {#sec_distributed_network_faults}

我们已经建立计算机网络几十年了 —— 人们可能希望到现在我们已经弄清楚如何使它们可靠。不幸的是，我们还没有成功。有一些系统研究和大量轶事证据表明，网络问题可能出人意料地常见，即使在由一家公司运营的受控环境（如数据中心）中也是如此 [^8]：

* 一项在中型数据中心的研究发现，每月约有 12 次网络故障，其中一半断开了单台机器，一半断开了整个机架 [^9]。
* 另一项研究测量了组件（如机架顶部交换机、汇聚交换机和负载均衡器）的故障率 [^10]。它发现，添加冗余网络设备并不能像你希望的那样减少故障，因为它不能防范人为错误（例如，配置错误的交换机），这是停机的主要原因。
* 广域光纤链路的中断被归咎于奶牛 [^11]、海狸 [^12] 和鲨鱼 [^13]（尽管由于海底电缆屏蔽更好，鲨鱼咬伤已经变得更加罕见 [^14]）。人类也有过错，无论是由于意外配置错误 [^15]、拾荒 [^16] 还是破坏 [^17]。
* 在不同的云区域之间，已经观察到高百分位数下长达几 *分钟* 的往返时间 [^18]。即使在单个数据中心内，在网络拓扑重新配置期间（由交换机软件升级期间的问题触发），也可能发生超过一分钟的数据包延迟 [^19]。因此，我们必须假设消息可能被任意延迟。
* 有时通信部分中断，这取决于你在和谁交谈：例如，A 和 B 可以通信，B 和 C 可以通信，但 A 和 C 不能 [^20] [^21]。其他令人惊讶的故障包括网络接口有时会丢弃所有入站数据包但成功发送出站数据包 [^22]：仅仅因为网络链路在一个方向上工作并不能保证它在相反方向上也工作。
* 即使是短暂的网络中断也可能产生比原始问题持续时间更长的影响 [^8] [^20] [^23]。

--------

> [!TIP] 网络分区
>
> 当网络的一部分由于网络故障而与其余部分隔离时，有时称为 *网络分区* 或 *网络分裂*，但它与其他类型的网络中断没有根本区别。网络分区与存储系统的分片无关，后者有时也称为 *分区*（见 [第 7 章](/ch7#ch_sharding)）。

--------

即使网络故障在你的环境中很少见，故障 *可能* 发生的事实意味着你的软件需要能够处理它们。每当通过网络进行任何通信时，它都可能失败 —— 这是无法避免的。

如果网络故障的错误处理没有定义和测试，可能会发生任意糟糕的事情：例如，集群可能会陷入死锁并永久无法处理请求，即使网络恢复 [^24]，或者它甚至可能删除你的所有数据 [^25]。如果软件处于意料之外的情况，它可能会做任意意外的事情。

处理网络故障不一定意味着 *容忍* 它们：如果你的网络通常相当可靠，一个有效的方法可能是在网络出现问题时简单地向用户显示错误消息。但是，你确实需要知道你的软件如何对网络问题做出反应，并确保系统可以从中恢复。故意触发网络问题并测试系统的响应可能是有意义的（这被称为 *故障注入*；见 ["故障注入"](#sec_fault_injection)）。

### 检测故障 {#id307}

许多系统需要自动检测故障节点。例如：

* 负载均衡器需要停止向已死亡的节点发送请求（即，将其 *从轮询池中摘除*）。
* 在具有单主复制的分布式数据库中，如果主节点失效，其中一个从节点需要被提升为新的主节点（见 ["处理节点中断"](/ch6#sec_replication_failover)）。

不幸的是，网络的不确定性使得很难判断节点是否正常工作。在某些特定情况下，你可能会得到一些明确告诉你某事不工作的反馈：

* 如果你可以访问节点应该运行的机器，但没有进程监听目标端口（例如，因为进程崩溃了），操作系统将通过发送 `RST` 或 `FIN` 数据包来帮助关闭或拒绝 TCP 连接。
* 如果节点进程崩溃（或被管理员杀死）但节点的操作系统仍在运行，脚本可以通知其他节点有关崩溃的信息，以便另一个节点可以快速接管而无需等待超时到期。例如，HBase 就是这样做的 [^26]。
* 如果你可以访问数据中心中网络交换机的管理接口，你可以查询它们以在硬件级别检测链路故障（例如，如果远程机器已关闭电源）。如果你通过互联网连接，或者你在共享数据中心中无法访问交换机本身，或者由于网络问题无法访问管理接口，则此选项被排除。
* 如果路由器确定你尝试连接的 IP 地址不可达，它可能会向你回复 ICMP 目标不可达数据包。然而，路由器也没有神奇的故障检测能力 —— 它受到与网络其他参与者相同的限制。

关于远程节点宕机的快速反馈很有用，但你不能指望它。如果出了问题，你可能会在堆栈的某个级别收到错误响应，但通常你必须假设你根本不会收到任何响应。你可以重试几次，等待超时过去，如果在超时内没有收到回复，最终宣布节点死亡。

### 超时和无界延迟 {#sec_distributed_queueing}

如果超时是检测故障的唯一可靠方法，那么超时应该多长？不幸的是，没有简单的答案。

长超时意味着在节点被宣布死亡之前需要长时间等待（在此期间，用户可能不得不等待或看到错误消息）。短超时可以更快地检测故障，但当节点实际上只是遭受暂时的减速（例如，由于节点或网络上的负载峰值）时，错误地宣布节点死亡的风险更高。

过早地宣布节点死亡是有问题的：如果节点实际上是活着的并且正在执行某些操作（例如，发送电子邮件），而另一个节点接管，该操作可能最终被执行两次。我们将在 ["知识、真相与谎言"](#sec_distributed_truth) 以及第 10 章和后续章节中更详细地讨论这个问题。

当节点被宣布死亡时，其职责需要转移到其他节点，这会给其他节点和网络带来额外的负载。如果系统已经在高负载下挣扎，过早地宣布节点死亡可能会使问题变得更糟。特别是，可能发生的情况是，节点实际上并没有死亡，只是由于过载而响应缓慢；将其负载转移到其他节点可能会导致级联故障（在极端情况下，所有节点互相宣布对方死亡，一切都停止工作 —— 见 ["当过载系统无法恢复时"](/ch2#sidebar_metastable)）。

想象一个虚构的系统，其网络保证数据包的最大延迟 —— 每个数据包要么在某个时间 *d* 内交付，要么丢失，但交付时间绝不会超过 *d*。此外，假设你可以保证未失效的节点总是在某个时间 *r* 内处理请求。在这种情况下，你可以保证每个成功的请求在时间 2*d* + *r* 内收到响应 —— 如果你在该时间内没有收到响应，你就知道网络或远程节点不工作。如果这是真的，2*d* + *r* 将是一个合理的超时时间。

不幸的是，我们使用的大多数系统都没有这些保证：异步网络具有 *无界延迟*（即，它们尝试尽快交付数据包，但数据包到达所需的时间没有上限），大多数服务器实现无法保证它们可以在某个最大时间内处理请求（见 ["响应时间保证"](#sec_distributed_clocks_realtime)）。对于故障检测，系统大部分时间快速运行是不够的：如果你的超时很低，往返时间的瞬时峰值就足以使系统失去平衡。

<a id="sec_distributed_congestion"></a>

#### 网络拥塞和排队 {#network-congestion-and-queueing}

开车时，道路网络上的行驶时间通常因交通拥堵而变化最大。同样，计算机网络上数据包延迟的可变性最常是由于排队 [^27]：

* 如果几个不同的节点同时尝试向同一目的地发送数据包，网络交换机必须将它们排队并逐个送入目标网络链路（如 [图 9-2](#fig_distributed_switch_queueing) 所示）。在繁忙的网络链路上，数据包可能需要等待一段时间才能获得一个插槽（这称为 *网络拥塞*）。如果有太多的传入数据以至于交换机队列满了，数据包将被丢弃，因此需要重新发送 —— 即使网络运行正常。
* 当数据包到达目标机器时，如果所有 CPU 核心当前都很忙，来自网络的传入请求会被操作系统排队，直到应用程序准备处理它。根据机器上的负载，这可能需要任意长的时间 [^28]。
* 在虚拟化环境中，正在运行的操作系统经常会被暂停几十毫秒，因为此时另一个虚拟机正在使用该 CPU 核心。在此期间，VM 无法消耗来自网络的任何数据，因此传入数据由虚拟机监视器排队（缓冲）[^29]，进一步增加了网络延迟的可变性。
* 如前所述，为了避免网络过载，TCP 限制发送数据的速率。这意味着在数据甚至进入网络之前，发送方就有额外的排队。

{{< figure src="/fig/ddia_0902.png" id="fig_distributed_switch_queueing" caption="图 9-2. 如果几台机器向同一目的地发送网络流量，其交换机队列可能会满。这里，端口 1、2 和 4 都试图向端口 3 发送数据包。" class="w-full my-4" >}}

此外，当 TCP 检测到并自动重传丢失的数据包时，尽管应用程序不会直接看到数据包丢失，但它确实会看到由此产生的延迟（等待超时到期，然后等待重传的数据包被确认）。

--------

<a id="sidebar_distributed_tcp_udp"></a>

> [!TIP] TCP 与 UDP
>
> 一些对延迟敏感的应用程序，如视频会议和 IP 语音（VoIP），使用 UDP 而不是 TCP。这是可靠性和延迟可变性之间的权衡：由于 UDP 不执行流量控制并且不重传丢失的数据包，它避免了网络延迟可变的一些原因（尽管它仍然容易受到交换机队列和调度延迟的影响）。
>
> UDP 是延迟数据无价值的情况下的好选择。例如，在 VoIP 电话通话中，在数据应该通过扬声器播放之前，可能没有足够的时间重传丢失的数据包。在这种情况下，重传数据包没有意义 —— 应用程序必须用静音填充缺失数据包的时间槽（导致声音短暂中断）并继续处理后续的音频流。重试发生在人类层面。（"你能重复一下吗？声音刚刚中断了一会儿。"）

--------

所有这些因素都导致了网络延迟的可变性。当系统接近其最大容量时，排队延迟的范围特别大：具有充足备用容量的系统可以轻松排空队列，而在高度利用的系统中，长队列可以很快建立起来。

在公共云和多租户数据中心中，资源在许多客户之间共享：网络链路和交换机，甚至每台机器的网络接口和 CPU（在虚拟机上运行时）都是共享的。处理大量数据可以使用网络链路的全部容量（*饱和* 它们）。由于你无法控制或了解其他客户对共享资源的使用情况，如果你附近的某人（*吵闹的邻居*）正在使用大量资源，网络延迟可能会高度可变 [^30] [^31]。

在这种环境中，你只能通过实验选择超时：在较长时间内和许多机器上测量网络往返时间的分布，以确定延迟的预期可变性。然后，考虑到你的应用程序的特征，你可以在故障检测延迟和过早超时风险之间确定适当的权衡。

更好的是，系统可以持续测量响应时间及其可变性（*抖动*），并根据观察到的响应时间分布自动调整超时，而不是使用配置的常量超时。Phi 累积故障检测器 [^32]（例如在 Akka 和 Cassandra 中使用 [^33]）就是这样做的一种方法。TCP 重传超时也以类似的方式工作 [^5]。

### 同步与异步网络 {#sec_distributed_sync_networks}

如果我们可以依靠网络以某个固定的最大延迟交付数据包，并且不丢弃数据包，分布式系统将会简单得多。为什么我们不能在硬件级别解决这个问题，使网络可靠，这样软件就不需要担心它了？

要回答这个问题，比较数据中心网络与传统的固定电话网络（非蜂窝、非 VoIP）很有趣，后者极其可靠：延迟的音频帧和掉线非常罕见。电话通话需要持续的低端到端延迟和足够的带宽来传输你声音的音频样本。在计算机网络中拥有类似的可靠性和可预测性不是很好吗？

当你通过电话网络拨打电话时，它会建立一个 *电路*：在两个呼叫者之间的整个路线上分配固定、有保证的带宽量。该电路一直保持到通话结束 [^34]。例如，ISDN 网络以每秒 4,000 帧的固定速率运行。建立呼叫时，它在每帧内（在每个方向上）分配 16 位空间。因此，在通话期间，每一方都保证能够每 250 微秒准确发送 16 位音频数据 [^35]。

这种网络是 *同步的*：即使数据通过几个路由器，它也不会遭受排队，因为呼叫的 16 位空间已经在网络的下一跳中预留了。由于没有排队，网络的最大端到端延迟是固定的。我们称之为 *有界延迟*。

#### 我们不能简单地使网络延迟可预测吗？ {#can-we-not-simply-make-network-delays-predictable}

请注意，电话网络中的电路与 TCP 连接非常不同：电路是固定数量的预留带宽，在电路存续期间其他人无法使用，而 TCP 连接的数据包则机会主义地使用任何可用的网络带宽。你可以给 TCP 一个可变大小的数据块（例如，电子邮件或网页），它会尝试在尽可能短的时间内传输它。当 TCP 连接空闲时，它不使用任何带宽（除了偶尔的保活数据包）。

如果数据中心网络和互联网是电路交换网络，那么在建立电路时就可以建立有保证的最大往返时间。然而，它们不是：以太网和 IP 是分组交换协议，会遭受排队，因此在网络中有无界延迟。这些协议没有电路的概念。

为什么数据中心网络和互联网使用分组交换？答案是它们针对 *突发流量* 进行了优化。电路适合音频或视频通话，需要在通话期间传输相当恒定的每秒位数。另一方面，请求网页、发送电子邮件或传输文件没有任何特定的带宽要求 —— 我们只希望它尽快完成。

如果你想通过电路传输文件，你必须猜测带宽分配。如果你猜得太低，传输会不必要地慢，使网络容量未被使用。如果你猜得太高，电路无法建立（因为如果无法保证其带宽分配，网络无法允许创建电路）。因此，使用电路进行突发数据传输会浪费网络容量并使传输不必要地缓慢。相比之下，TCP 动态调整数据传输速率以适应可用的网络容量。

曾经有一些尝试构建既支持电路交换又支持分组交换的混合网络。*异步传输模式*（ATM）在 1980 年代是以太网的竞争对手，但除了电话网络核心交换机外，它没有获得太多采用。InfiniBand 有一些相似之处 [^36]：它在链路层实现端到端流量控制，减少了网络中排队的需要，尽管它仍然可能因链路拥塞而遭受延迟 [^37]。通过仔细使用 *服务质量*（QoS，数据包的优先级和调度）和 *准入控制*（对发送者的速率限制），可以在分组网络上模拟电路交换，或提供统计上有界的延迟 [^27] [^34]。新的网络算法，如低延迟、低损耗和可扩展吞吐量（L4S）试图在客户端和路由器级别缓解一些排队和拥塞控制问题。Linux 的流量控制器（TC）也允许应用程序为 QoS 目的重新优先排序数据包。

--------

<a id="sidebar_distributed_latency_utilization"></a>

> [!TIP] 延迟和资源利用率
>
> 更一般地说，你可以将可变延迟视为动态资源分区的结果。
>
> 假设你在两个电话交换机之间有一条可以承载多达 10,000 个同时呼叫的线路。通过此线路交换的每个电路都占用其中一个呼叫插槽。因此，你可以将该线路视为最多可由 10,000 个同时用户共享的资源。资源以 *静态* 方式划分：即使你现在是线路上唯一的呼叫，并且所有其他 9,999 个插槽都未使用，你的电路仍然分配与线路完全利用时相同的固定带宽量。
>
> 相比之下，互联网 *动态* 共享网络带宽。发送者互相推挤，尽可能快地通过线路发送数据包，网络交换机决定在每个时刻发送哪个数据包（即带宽分配）。这种方法的缺点是排队，但优点是它最大化了线路的利用率。线路有固定成本，所以如果你更好地利用它，你通过线路发送的每个字节都更便宜。
>
> CPU 也会出现类似的情况：如果你在几个线程之间动态共享每个 CPU 核心，一个线程有时必须在操作系统的运行队列中等待，而另一个线程正在运行，因此线程可能会暂停不同的时间长度 [^38]。然而，这比为每个线程分配静态数量的 CPU 周期更好地利用硬件（见 ["响应时间保证"](#sec_distributed_clocks_realtime)）。更好的硬件利用率也是云平台在同一物理机器上运行来自不同客户的多个虚拟机的原因。
>
> 如果资源是静态分区的（例如，专用硬件和独占带宽分配），则在某些环境中可以实现延迟保证。然而，这是以降低利用率为代价的 —— 换句话说，它更昂贵。另一方面，具有动态资源分区的多租户提供了更好的利用率，因此更便宜，但它有可变延迟的缺点。
>
> 网络中的可变延迟不是自然法则，而只是成本/收益权衡的结果。

--------

然而，这种服务质量目前在多租户数据中心和公共云中未启用，或者在通过互联网通信时未启用。当前部署的技术不允许我们对网络的延迟或可靠性做出任何保证：我们必须假设网络拥塞、排队和无界延迟会发生。因此，超时没有 "正确" 的值 —— 它们需要通过实验确定。

互联网服务提供商之间的对等协议和通过边界网关协议（BGP）建立路由，比 IP 本身更接近电路交换。在这个级别，可以购买专用带宽。然而，互联网路由在网络级别而不是主机之间的单个连接上运行，并且时间尺度要长得多。


## 不可靠的时钟 {#sec_distributed_clocks}

时钟和时间很重要。应用程序以各种方式依赖时钟来回答如下问题：

1. 这个请求超时了吗？
2. 这项服务的第 99 百分位响应时间是多少？
3. 这项服务在过去五分钟内平均每秒处理了多少查询？
4. 用户在我们的网站上花了多长时间？
5. 这篇文章是什么时候发表的？
6. 提醒邮件应该在什么日期和时间发送？
7. 这个缓存条目何时过期？
8. 日志文件中此错误消息的时间戳是什么？

示例 1-4 测量 *持续时间*（例如，发送请求和接收响应之间的时间间隔），而示例 5-8 描述 *时间点*（在特定日期、特定时间发生的事件）。

在分布式系统中，时间是一件棘手的事情，因为通信不是瞬时的：消息从一台机器通过网络传输到另一台机器需要时间。接收消息的时间总是晚于发送消息的时间，但由于网络中的可变延迟，我们不知道晚了多少。当涉及多台机器时，这个事实有时会使确定事情发生的顺序变得困难。

此外，网络上的每台机器都有自己的时钟，这是一个实际的硬件设备：通常是石英晶体振荡器。这些设备并不完全准确，因此每台机器都有自己的时间概念，可能比其他机器稍快或稍慢。可以在某种程度上同步时钟：最常用的机制是网络时间协议（NTP），它允许根据一组服务器报告的时间调整计算机时钟 [^39]。服务器反过来从更准确的时间源（如 GPS 接收器）获取时间。

### 单调时钟与日历时钟 {#sec_distributed_monotonic_timeofday}

现代计算机至少有两种不同类型的时钟：*日历时钟* 和 *单调时钟*。尽管它们都测量时间，但区分两者很重要，因为它们服务于不同的目的。

#### 日历时钟 {#time-of-day-clocks}

日历时钟做你直观期望时钟做的事情：它根据某个日历返回当前日期和时间（也称为 *墙上时钟时间*）。例如，Linux 上的 `clock_gettime(CLOCK_REALTIME)` 和 Java 中的 `System.currentTimeMillis()` 返回自 *纪元* 以来的秒数（或毫秒数）：根据格里高利历，1970 年 1 月 1 日午夜 UTC，不计算闰秒。一些系统使用其他日期作为参考点。（尽管 Linux 时钟被称为 *实时*，但它与实时操作系统无关，如 ["响应时间保证"](#sec_distributed_clocks_realtime) 中所讨论的。）

日历时钟通常与 NTP 同步，这意味着来自一台机器的时间戳（理想情况下）与另一台机器上的时间戳意思相同。然而，日历时钟也有各种奇怪之处，如下一节所述。特别是，如果本地时钟远远超前于 NTP 服务器，它可能会被强制重置并显示跳回到以前的时间点。这些跳跃，以及闰秒引起的类似跳跃，使日历时钟不适合测量经过的时间 [^40]。

日历时钟可能会因夏令时（DST）的开始和结束而经历跳跃；这些可以通过始终使用 UTC 作为时区来避免，UTC 没有 DST。日历时钟在历史上也具有相当粗粒度的分辨率，例如，在较旧的 Windows 系统上以 10 毫秒的步长前进 [^41]。在最近的系统上，这不再是一个问题。

#### 单调时钟 {#monotonic-clocks}

单调时钟适用于测量持续时间（时间间隔），例如超时或服务的响应时间：例如，Linux 上的 `clock_gettime(CLOCK_MONOTONIC)` 或 `clock_gettime(CLOCK_BOOTTIME)` [^42] 和 Java 中的 `System.nanoTime()` 是单调时钟。这个名字来源于它们保证始终向前移动的事实（而日历时钟可能会在时间上向后跳跃）。

你可以在某个时间点检查单调时钟的值，做一些事情，然后在稍后的时间再次检查时钟。两个值之间的 *差值* 告诉你两次检查之间经过了多少时间 —— 更像秒表而不是挂钟。然而，时钟的 *绝对* 值是没有意义的：它可能是自计算机启动以来的纳秒数，或类似的任意值。特别是，比较来自两台不同计算机的单调时钟值是没有意义的，因为它们不代表同样的东西。

在具有多个 CPU 插槽的服务器上，每个 CPU 可能有一个单独的计时器，它不一定与其他 CPU 同步 [^43]。操作系统会补偿任何差异，并尝试向应用程序线程呈现时钟的单调视图，即使它们被调度到不同的 CPU 上。然而，明智的做法是对这种单调性保证持保留态度 [^44]。

如果 NTP 检测到计算机的本地石英晶体比 NTP 服务器运行得更快或更慢，它可能会调整单调时钟前进的频率（这被称为 *微调*（slewing）时钟）。默认情况下，NTP 允许时钟速率加速或减速高达 0.05%，但 NTP 不能导致单调时钟向前或向后跳跃。单调时钟的分辨率通常相当好：在大多数系统上，它们可以测量微秒或更短的时间间隔。

在分布式系统中，使用单调时钟测量经过的时间（例如，超时）通常是可以的，因为它不假设不同节点的时钟之间有任何同步，并且对测量的轻微不准确不敏感。

### 时钟同步和准确性 {#sec_distributed_clock_accuracy}

单调时钟不需要同步，但日历时钟需要根据 NTP 服务器或其他外部时间源设置才能有用。不幸的是，我们让时钟显示正确时间的方法远不如你希望的那样可靠或准确 —— 硬件时钟和 NTP 可能是反复无常的野兽。仅举几个例子：

* 计算机中的石英时钟不是很准确：它会 *漂移*（比应该的运行得更快或更慢）。时钟漂移因机器的温度而异。Google 假设其服务器的时钟漂移高达 200 ppm（百万分之一）[^45]，这相当于每 30 秒与服务器重新同步的时钟有 6 毫秒漂移，或每天重新同步一次的时钟有 17 秒漂移。即使一切正常工作，这种漂移也限制了你可以达到的最佳精度。
* 如果计算机的时钟与 NTP 服务器相差太多，它可能会拒绝同步，或者本地时钟将被强制重置 [^39]。任何在重置前后观察时间的应用程序都可能看到时间倒退或突然向前跳跃。
* 如果节点意外地被防火墙与 NTP 服务器隔离，配置错误可能会在一段时间内未被注意到，在此期间漂移可能会累积成不同节点时钟之间的巨大差异。轶事证据表明，这在实践中确实会发生。
* NTP 同步只能与网络延迟一样好，因此当你在具有可变数据包延迟的拥塞网络上时，其准确性有限。一项实验表明，通过互联网同步时可以达到 35 毫秒的最小误差 [^46]，尽管网络延迟的偶尔峰值会导致大约一秒的误差。根据配置，大的网络延迟可能导致 NTP 客户端完全放弃。
* 一些 NTP 服务器是错误的或配置错误的，报告的时间相差数小时 [^47] [^48]。NTP 客户端通过查询多个服务器并忽略异常值来减轻此类错误。尽管如此，将系统的正确性押注在互联网上陌生人告诉你的时间上还是有些令人担忧的。
* 闰秒导致一分钟有 59 秒或 61 秒长，这会搞乱在设计时没有考虑闰秒的系统中的时序假设 [^49]。闰秒已经导致许多大型系统崩溃的事实 [^40] [^50] 表明，关于时钟的错误假设是多么容易潜入系统。处理闰秒的最佳方法可能是让 NTP 服务器 "撒谎"，通过在一天的过程中逐渐执行闰秒调整（这被称为 *平滑*）[^51] [^52]，尽管实际的 NTP 服务器行为在实践中有所不同 [^53]。从 2035 年起将不再使用闰秒，所以这个问题幸运地将会消失。
* 在虚拟机中，硬件时钟是虚拟化的，这为需要准确计时的应用程序带来了额外的挑战 [^54]。当 CPU 核心在虚拟机之间共享时，每个 VM 在另一个 VM 运行时会暂停数十毫秒。从应用程序的角度来看，这种暂停表现为时钟突然向前跳跃 [^29]。如果 VM 暂停几秒钟，时钟可能会比实际时间落后几秒钟，但 NTP 可能会继续报告时钟几乎完全同步 [^55]。
* 如果你在不完全控制的设备上运行软件（例如，移动或嵌入式设备），你可能根本无法信任设备的硬件时钟。一些用户故意将他们的硬件时钟设置为不正确的日期和时间，例如在游戏中作弊 [^56]。因此，时钟可能被设置为遥远的过去或未来的时间。

如果你足够关心时钟精度并愿意投入大量资源，就可以实现非常好的时钟精度。例如，欧洲金融机构的 MiFID II 法规要求所有高频交易基金将其时钟同步到 UTC 的 100 微秒以内，以帮助调试市场异常（如 "闪崩"）并帮助检测市场操纵 [^57]。

这种精度可以通过一些特殊硬件（GPS 接收器和/或原子钟）、精确时间协议（PTP）以及仔细的部署和监控来实现 [^58] [^59]。仅依赖 GPS 可能有风险，因为 GPS 信号很容易被干扰。在某些地方，这种情况经常发生，例如靠近军事设施 [^60]。一些云提供商已经开始为其虚拟机提供高精度时钟同步 [^61]。然而，时钟同步仍然需要很多注意。如果你的 NTP 守护进程配置错误，或者防火墙阻止了 NTP 流量，由于漂移导致的时钟误差可能会迅速变大。

### 对同步时钟的依赖 {#sec_distributed_clocks_relying}

时钟的问题在于，虽然它们看起来简单易用，但它们有惊人数量的陷阱：一天可能没有正好 86,400 秒，日历时钟可能会在时间上向后移动，根据一个节点的时钟的时间可能与另一个节点的时钟相差很大。

本章前面我们讨论了网络丢弃和任意延迟数据包。即使网络大部分时间表现良好，软件也必须设计成假设网络偶尔会出现故障，软件必须优雅地处理此类故障。时钟也是如此：尽管它们大部分时间工作得很好，但强健的软件需要准备好处理不正确的时钟。

问题的一部分是不正确的时钟很容易被忽视。如果机器的 CPU 有缺陷或其网络配置错误，它很可能根本无法工作，因此会很快被注意到并修复。另一方面，如果它的石英时钟有缺陷或其 NTP 客户端配置错误，大多数事情看起来会正常工作，即使它的时钟逐渐偏离现实越来越远。如果某些软件依赖于准确同步的时钟，结果更可能是静默和微妙的数据丢失，而不是戏剧性的崩溃 [^62] [^63]。

因此，如果你使用需要同步时钟的软件，你还必须仔细监控所有机器之间的时钟偏移。任何时钟偏离其他节点太远的节点都应该被宣布死亡并从集群中移除。这种监控确保你在损坏的时钟造成太多损害之前注意到它们。

#### 用于事件排序的时间戳 {#sec_distributed_lww}

让我们考虑一个特定的情况，其中依赖时钟是诱人但危险的：跨多个节点的事件排序 [^64]。例如，如果两个客户端写入分布式数据库，谁先到达？哪个写入是更新的？

[图 9-3](#fig_distributed_timestamps) 说明了在具有多主复制的数据库中日历时钟的危险使用（该示例类似于 [图 6-8](/ch6#fig_replication_causality)）。客户端 A 在节点 1 上写入 *x* = 1；写入被复制到节点 3；客户端 B 在节点 3 上递增 *x*（我们现在有 *x* = 2）；最后，两个写入都被复制到节点 2。

{{< figure src="/fig/ddia_0903.png" id="fig_distributed_timestamps" caption="图 9-3. 客户端 B 的写入在因果关系上晚于客户端 A 的写入，但 B 的写入具有更早的时间戳。" class="w-full my-4" >}}


在 [图 9-3](#fig_distributed_timestamps) 中，当写入被复制到其他节点时，它会根据写入起源节点上的日历时钟标记时间戳。此示例中的时钟同步非常好：节点 1 和节点 3 之间的偏差小于 3 毫秒，这可能比你在实践中可以期望的要好。

由于递增建立在 *x* = 1 的早期写入之上，我们可能期望 *x* = 2 的写入应该具有两者中更大的时间戳。不幸的是，[图 9-3](#fig_distributed_timestamps) 中发生的并非如此：写入 *x* = 1 的时间戳为 42.004 秒，但写入 *x* = 2 的时间戳为 42.003 秒。

如 ["最后写入胜利（丢弃并发写入）"](/ch6#sec_replication_lww) 中所讨论的，解决不同节点上并发写入值之间冲突的一种方法是 *最后写入胜利*（LWW），这意味着保留给定键的具有最大时间戳的写入，并丢弃所有具有较旧时间戳的写入。在 [图 9-3](#fig_distributed_timestamps) 的示例中，当节点 2 接收这两个事件时，它将错误地得出结论，认为 *x* = 1 是更新的值并丢弃写入 *x* = 2，因此递增丢失了。

可以通过确保当值被覆盖时，新值总是具有比被覆盖值更高的时间戳来防止这个问题，即使该时间戳超前于写入者的本地时钟。然而，这会产生额外的读取成本来查找最大的现有时间戳。一些系统，包括 Cassandra 和 ScyllaDB，希望在单次往返中写入所有副本，因此它们只是使用客户端时钟的时间戳以及最后写入胜利策略 [^62]。这种方法有一些严重的问题：

* 数据库写入可能会神秘地消失：具有滞后时钟的节点无法覆盖先前由具有快速时钟的节点写入的值，直到节点之间的时钟偏差时间过去 [^63] [^65]。这种情况可能导致任意数量的数据被静默丢弃，而不会向应用程序报告任何错误。
* LWW 无法区分快速连续发生的顺序写入（在 [图 9-3](#fig_distributed_timestamps) 中，客户端 B 的递增肯定发生在客户端 A 的写入 *之后*）和真正并发的写入（两个写入者都不知道对方）。需要额外的因果关系跟踪机制，如版本向量，以防止违反因果关系（见 ["检测并发写入"](/ch6#sec_replication_concurrent)）。
* 两个节点可能独立生成具有相同时间戳的写入，特别是当时钟只有毫秒分辨率时。需要额外的决胜值（可以简单地是一个大的随机数）来解决此类冲突，但这种方法也可能导致违反因果关系 [^62]。

因此，即使通过保留最 "新" 的值并丢弃其他值来解决冲突很诱人，但重要的是要意识到 "新" 的定义取决于本地日历时钟，它很可能是不正确的。即使使用紧密 NTP 同步的时钟，你也可能在时间戳 100 毫秒（根据发送者的时钟）发送数据包，并让它在时间戳 99 毫秒（根据接收者的时钟）到达 —— 因此看起来数据包在发送之前就到达了，这是不可能的。

NTP 同步能否足够准确以至于不会发生此类错误排序？可能不行，因为除了石英漂移等其他误差源之外，NTP 的同步精度本身受到网络往返时间的限制。要保证正确的排序，你需要时钟误差显著低于网络延迟，这是不可能的。

所谓的 *逻辑时钟* [^66]，基于递增计数器而不是振荡石英晶体，是排序事件的更安全替代方案（见 ["检测并发写入"](/ch6#sec_replication_concurrent)）。逻辑时钟不测量一天中的时间或经过的秒数，只测量事件的相对顺序（一个事件是在另一个事件之前还是之后发生）。相比之下，日历时钟和单调时钟测量实际经过的时间，也称为 *物理时钟*。我们将在 ["ID 生成器和逻辑时钟"](/ch10#sec_consistency_logical) 中更详细地研究逻辑时钟。

#### 带置信区间的时钟读数 {#clock-readings-with-a-confidence-interval}

你可能能够以微秒甚至纳秒分辨率读取机器的日历时钟。但即使你能获得如此细粒度的测量，也不意味着该值实际上精确到如此精度。事实上，它很可能不是 —— 如前所述，即使你每分钟与本地网络上的 NTP 服务器同步，不精确的石英时钟的漂移也很容易达到几毫秒。使用公共互联网上的 NTP 服务器，最佳可能精度可能是几十毫秒，当存在网络拥塞时，误差很容易超过 100 毫秒。

因此，将时钟读数视为时间点是没有意义的 —— 它更像是一个带有置信区间的时间范围：例如，系统可能有 95% 的信心认为现在的时间在当前这一分钟的第 10.3 秒到第 10.5 秒之间，但它不知道比这更精确的时间 [^67]。如果我们只知道时间 +/- 100 毫秒，时间戳中的微秒数字基本上是没有意义的。

不确定性边界可以根据你的时间源计算。如果你有直接连接到计算机的 GPS 接收器或原子钟，预期误差范围由设备决定，对于 GPS，由来自卫星的信号质量决定。如果你从服务器获取时间，不确定性基于自上次与服务器同步以来的预期石英漂移，加上 NTP 服务器的不确定性，加上到服务器的网络往返时间（作为第一近似，并假设你信任服务器）。

不幸的是，大多数系统不暴露这种不确定性：例如，当你调用 `clock_gettime()` 时，返回值不会告诉你时间戳的预期误差，所以你不知道它的置信区间是五毫秒还是五年。

有例外：Google Spanner 中的 *TrueTime* API [^45] 和亚马逊的 ClockBound 明确报告本地时钟的置信区间。当你询问当前时间时，你会得到两个值：`[earliest, latest]`，它们是 *最早可能* 和 *最晚可能* 的时间戳。基于其不确定性计算，时钟知道实际当前时间在该区间内的某处。区间的宽度取决于多种因素，包括本地石英时钟上次与更准确的时钟源同步以来已经过去了多长时间。

#### 用于全局快照的同步时钟 {#sec_distributed_spanner}

在 ["快照隔离和可重复读"](/ch8#sec_transactions_snapshot_isolation) 中，我们讨论了 *多版本并发控制*（MVCC）。对于既需要支持小型、快速的读写事务，又需要支持大型、长时间运行的只读事务（例如，用于备份或分析）的数据库来说，这是一个非常有用的功能。它允许只读事务看到数据库的 *快照*，即特定时间点的一致状态，而不会锁定和干扰读写事务。

通常，MVCC 需要单调递增的事务 ID。如果写入发生在快照之后（即，写入的事务 ID 大于快照），则该写入对快照事务不可见。在单节点数据库上，简单的计数器就足以生成事务 ID。

然而，当数据库分布在许多机器上，可能在多个数据中心时，全局单调递增的事务 ID（跨所有分片）很难生成，因为它需要协调。事务 ID 必须反映因果关系：如果事务 B 读取或覆盖先前由事务 A 写入的值，则 B 必须具有比 A 更高的事务 ID —— 否则，快照将不一致。对于大量小型、快速的事务，在分布式系统中创建事务 ID 成为难以承受的瓶颈。（我们将在 ["ID 生成器和逻辑时钟"](/ch10#sec_consistency_logical) 中讨论此类 ID 生成器。）

我们能否使用同步日历时钟的时间戳作为事务 ID？如果我们能够获得足够好的同步，它们将具有正确的属性：较晚的事务具有更高的时间戳。当然，问题是时钟精度的不确定性。

Spanner 以这种方式跨数据中心实现快照隔离 [^68] [^69]。它使用 TrueTime API 报告的时钟置信区间，并基于以下观察：如果你有两个置信区间，每个都由最早和最晚可能的时间戳组成（*A* = [*A*<sub>最早</sub>, *A*<sub>最晚</sub>] 和 *B* = [*B*<sub>最早</sub>, *B*<sub>最晚</sub>]），并且这两个区间不重叠（即，*A*<sub>最早</sub> < *A*<sub>最晚</sub> < *B*<sub>最早</sub> < *B*<sub>最晚</sub>），那么 B 肯定发生在 A 之后 —— 毫无疑问。只有当区间重叠时，我们才不确定 A 和 B 发生的顺序。

为了确保事务时间戳反映因果关系，Spanner 在提交读写事务之前故意等待置信区间的长度。通过这样做，它确保任何可能读取数据的事务都在足够晚的时间，因此它们的置信区间不会重叠。为了使等待时间尽可能短，Spanner 需要使时钟不确定性尽可能小；为此，Google 在每个数据中心部署 GPS 接收器或原子钟，使时钟能够同步到大约 7 毫秒以内 [^45]。

原子钟和 GPS 接收器在 Spanner 中并不是严格必要的：重要的是要有一个置信区间，准确的时钟源只是帮助保持该区间较小。其他系统开始采用类似的方法：例如，YugabyteDB 在 AWS 上运行时可以利用 ClockBound [^70]，其他几个系统现在也在不同程度上依赖时钟同步 [^71] [^72]。

### 进程暂停 {#sec_distributed_clocks_pauses}

让我们考虑分布式系统中危险使用时钟的另一个例子。假设你有一个每个分片都有单个主节点的数据库。只有主节点被允许接受写入。节点如何知道它仍然是主节点（它没有被其他节点宣布死亡），并且它可以安全地接受写入？

一种选择是让主节点从其他节点获取 *租约*，这类似于带有超时的锁 [^73]。任何时候只有一个节点可以持有租约 —— 因此，当节点获得租约时，它知道在租约到期之前的一段时间内它是主节点。为了保持主节点身份，节点必须在租约到期之前定期续订租约。如果节点失效，它会停止续订租约，因此另一个节点可以在租约到期时接管。

你可以想象请求处理循环看起来像这样：

```js
while (true) {
    request = getIncomingRequest();

    // 确保租约始终至少有 10 秒的剩余时间
    if (lease.expiryTimeMillis - System.currentTimeMillis() < 10000) {
        lease = lease.renew();
    }

    if (lease.isValid()) {
        process(request);
    }
}
```

这段代码有什么问题？首先，它依赖于同步时钟：租约的到期时间由不同的机器设置（到期时间可能计算为当前时间加 30 秒，例如），并且它与本地系统时钟进行比较。如果时钟相差超过几秒钟，这段代码将开始做奇怪的事情。

其次，即使我们更改协议以仅使用本地单调时钟，还有另一个问题：代码假设在检查时间（`System.currentTimeMillis()`）和处理请求（`process(request)`）之间经过的时间非常少。通常这段代码运行得非常快，所以 10 秒的缓冲时间足以确保租约不会在处理请求的过程中到期。

然而，如果程序执行中出现意外暂停会怎样？例如，想象线程在 `lease.isValid()` 行周围停止了 15 秒，然后才最终继续。在这种情况下，处理请求时租约很可能已经到期，另一个节点已经接管了主节点身份。然而，没有任何东西告诉这个线程它暂停了这么长时间，所以这段代码不会注意到租约已经到期，直到循环的下一次迭代 —— 到那时它可能已经通过处理请求做了一些不安全的事情。

假设线程可能暂停这么长时间是合理的吗？不幸的是，是的。有各种原因可能导致这种情况发生：

* 线程访问共享资源（如锁或队列）时的争用可能导致线程花费大量时间等待。转移到具有更多 CPU 核心的机器可能会使此类问题变得更糟，并且争用问题可能难以诊断 [^74]。
* 许多编程语言运行时（如 Java 虚拟机）有 *垃圾回收器*（GC），偶尔需要停止所有正在运行的线程。过去，这种 *"全局暂停" GC 暂停* 有时会持续几分钟 [^75]！使用现代 GC 算法，这不再是一个大问题，但 GC 暂停仍然可能很明显（见 ["限制垃圾回收的影响"](#sec_distributed_gc_impact)）。
* 在虚拟化环境中，虚拟机可以被 *挂起*（暂停所有进程的执行并将内存内容保存到磁盘）和 *恢复*（恢复内存内容并继续执行）。这种暂停可能发生在进程执行的任何时间，并且可能持续任意长的时间。这个功能有时用于虚拟机从一台主机到另一台主机的 *实时迁移*，无需重启，在这种情况下，暂停的长度取决于进程写入内存的速率 [^76]。
* 在笔记本电脑和手机等终端用户设备上，执行也可能被任意挂起和恢复，例如，当用户合上笔记本电脑盖时。
* 当操作系统上下文切换到另一个线程时，或者当虚拟机管理程序切换到不同的虚拟机时（在虚拟机中运行时），当前运行的线程可能在代码的任何任意点暂停。在虚拟机的情况下，在其他虚拟机中花费的 CPU 时间称为 *窃取时间*。如果机器负载很重 —— 即，如果有长队列的线程等待运行 —— 暂停的线程可能需要一些时间才能再次运行。
* 如果应用程序执行同步磁盘访问，线程可能会暂停等待缓慢的磁盘 I/O 操作完成 [^77]。在许多语言中，磁盘访问可能会令人惊讶地发生，即使代码没有明确提到文件访问 —— 例如，Java 类加载器在首次使用时会延迟加载类文件，这可能发生在程序执行的任何时间。I/O 暂停和 GC 暂停甚至可能共谋结合它们的延迟 [^78]。如果磁盘实际上是网络文件系统或网络块设备（如亚马逊的 EBS），I/O 延迟还会受到网络延迟可变性的影响 [^31]。
* 如果操作系统配置为允许 *交换到磁盘*（*分页*），简单的内存访问可能会导致页面错误，需要从磁盘加载页面到内存。线程在此缓慢的 I/O 操作进行时暂停。如果内存压力很高，这可能反过来需要将不同的页面交换到磁盘。在极端情况下，操作系统可能会花费大部分时间在内存中交换页面进出，而实际完成的工作很少（这被称为 *抖动*）。为了避免这个问题，服务器机器上通常禁用分页（如果你宁愿杀死进程以释放内存而不是冒抖动的风险）。
* Unix 进程可以通过向其发送 `SIGSTOP` 信号来暂停，例如通过在 shell 中按 Ctrl-Z。此信号立即停止进程获取更多 CPU 周期，直到使用 `SIGCONT` 恢复它，此时它从停止的地方继续运行。即使你的环境通常不使用 `SIGSTOP`，它也可能被运维工程师意外发送。

所有这些情况都可以在任何时候 *抢占* 正在运行的线程，并在稍后的某个时间恢复它，而线程甚至没有注意到。这个问题类似于在单台机器上使多线程代码线程安全：你不能对时序做任何假设，因为可能会发生任意的上下文切换和并行性。

在单台机器上编写多线程代码时，我们有相当好的工具来使其线程安全：互斥锁、信号量、原子计数器、无锁数据结构、阻塞队列等。不幸的是，这些工具不能直接转换到分布式系统，因为分布式系统没有共享内存 —— 只有通过不可靠网络发送的消息。

分布式系统中的节点必须假设其执行可以在任何时候暂停相当长的时间，即使在函数的中间。在暂停期间，世界的其余部分继续运行，甚至可能因为暂停的节点没有响应而宣布它死亡。最终，暂停的节点可能会继续运行，甚至没有注意到它在睡觉，直到它稍后某个时候检查其时钟。

#### 响应时间保证 {#sec_distributed_clocks_realtime}

在许多编程语言和操作系统中，如所讨论的，线程和进程可能会暂停无限长的时间。如果你足够努力，这些暂停的原因 *可以* 被消除。

在某些软件所运行的环境中，未能在指定时间内响应可能会造成严重损害：控制飞机、火箭、机器人、汽车和其他物理对象的计算机必须快速且可预测地响应其传感器输入。在这些系统中，有一个指定的 *截止时间*，软件必须在此之前响应；如果它错过了截止时间，可能会导致整个系统的故障。这些被称为 *硬实时* 系统。

--------

> [!NOTE]
> 在嵌入式系统中，*实时* 意味着系统经过精心设计和测试，以在所有情况下满足指定的时序保证。这个含义与网络上更模糊的 *实时* 术语使用形成对比，后者描述服务器向客户端推送数据和流处理，没有硬响应时间约束（见后续章节）。

--------

例如，如果你的汽车的车载传感器检测到你当前正在经历碰撞，你不希望安全气囊的释放因为安全气囊释放系统中不合时宜的 GC 暂停而延迟。

在系统中提供实时保证需要软件栈所有级别的支持：需要 *实时操作系统*（RTOS），它允许进程在指定的时间间隔内以有保证的 CPU 时间分配进行调度；库函数必须记录其最坏情况执行时间；动态内存分配可能受到限制或完全禁止（实时垃圾回收器存在，但应用程序仍必须确保它不会给 GC 太多工作）；必须进行大量的测试和测量以确保满足保证。

所有这些都需要大量的额外工作，并严重限制了可以使用的编程语言、库和工具的范围（因为大多数语言和工具不提供实时保证）。由于这些原因，开发实时系统非常昂贵，它们最常用于安全关键的嵌入式设备。此外，"实时" 不同于 "高性能" —— 事实上，实时系统可能具有较低的吞吐量，因为它们必须优先考虑及时响应高于一切（另见 ["延迟和资源利用率"](#sidebar_distributed_latency_utilization)）。

对于大多数服务器端数据处理系统，实时保证根本不经济或不合适。因此，这些系统必须承受在非实时环境中运行带来的暂停和时钟不稳定性。

#### 限制垃圾回收的影响 {#sec_distributed_gc_impact}

垃圾回收曾经是进程暂停的最大原因之一 [^79]，但幸运的是 GC 算法已经改进了很多：经过适当调整的回收器现在通常只会暂停几毫秒。Java 运行时提供了并发标记清除（CMS）、G1、Z 垃圾回收器（ZGC）、Epsilon 和 Shenandoah 等回收器。每个都针对不同的内存配置文件进行了优化，如高频对象创建、大堆等。相比之下，Go 提供了一个更简单的并发标记清除垃圾回收器，试图自我优化。

如果你需要完全避免 GC 暂停，一个选择是使用根本没有垃圾回收器的语言。例如，Swift 使用自动引用计数来确定何时可以释放内存；Rust 和 Mojo 使用类型系统跟踪对象的生命周期，以便编译器可以确定必须分配内存多长时间。

也可以使用垃圾回收语言，同时减轻暂停的影响。一种方法是将 GC 暂停视为节点的短暂计划中断，并让其他节点在一个节点收集垃圾时处理来自客户端的请求。如果运行时可以警告应用程序节点很快需要 GC 暂停，应用程序可以停止向该节点发送新请求，等待它完成处理未完成的请求，然后在没有请求进行时执行 GC。这个技巧从客户端隐藏了 GC 暂停，并减少了响应时间的高百分位数 [^80] [^81]。

这个想法的一个变体是仅对短期对象使用垃圾回收器（短期对象可以快速回收），并在进程积累到足够多的长期对象、以至于需要对长期对象执行完整 GC 之前，定期重启进程 [^79] [^82]。可以一次重启一个节点，并且可以在计划重启之前将流量从节点转移，就像滚动升级一样（见 [第 5 章](/ch5#ch_encoding)）。

这些措施不能完全防止垃圾回收暂停，但它们可以有效地减少对应用程序的影响。


## 知识、真相与谎言 {#sec_distributed_truth}

到目前为止，在本章中，我们已经探讨了分布式系统与在单台计算机上运行的程序的不同之处：没有共享内存，只有通过不可靠的网络进行消息传递，具有可变延迟，系统可能会遭受部分失效、不可靠的时钟和处理暂停。

如果你不习惯分布式系统，这些问题的后果会令人深感迷惑。网络中的节点不能 *确切地知道* 关于其他节点的任何事情 —— 它只能根据它接收（或未接收）的消息进行猜测。节点只能通过与另一个节点交换消息来了解它处于什么状态（它存储了什么数据，它是否正常运行等）。如果远程节点没有响应，就无法知道它处于什么状态，因为网络中的问题无法与节点的问题可靠地区分开来。

这些系统的讨论接近哲学：在我们的系统中，我们知道什么是真或假？如果感知和测量的机制不可靠，我们对这些知识有多确定 [^83]？软件系统是否应该遵守我们对物理世界的期望法则，如因果关系？

幸运的是，我们不需要走到弄清生命意义的程度。在分布式系统中，我们可以陈述我们对行为（*系统模型*）的假设，并以这样的方式设计实际系统，使其满足这些假设。算法可以被证明在某个系统模型内正确运行。这意味着即使底层系统模型提供的保证很少，也可以实现可靠的行为。

然而，尽管可以在不可靠的系统模型中使软件表现良好，但这样做并不简单。在本章的其余部分，我们将进一步探讨分布式系统中知识和真相的概念，这将帮助我们思考我们可以做出的假设类型和我们可能希望提供的保证。在 [第 10 章](/ch10#ch_consistency) 中，我们将继续查看在特定假设下提供特定保证的分布式算法的一些示例。

### 多数派原则 {#sec_distributed_majority}

想象一个具有不对称故障的网络：一个节点能够接收发送给它的所有消息，但该节点的任何传出消息都被丢弃或延迟 [^22]。即使该节点运行得非常好，并且正在接收来自其他节点的请求，其他节点也无法听到它的响应。在一些超时之后，其他节点宣布它死亡，因为它们没有收到该节点的消息。情况展开就像一场噩梦：半断开的节点被拖到墓地，踢腿尖叫着 "我没死！" —— 但由于没人能听到它的尖叫，葬礼队伍以坚忍的决心继续前进。

在稍微不那么可怕的情况下，半断开的节点可能会注意到它发送的消息没有被其他节点确认，因此意识到网络中一定有故障。尽管如此，该节点被其他节点错误地宣布死亡，半断开的节点对此无能为力。

作为第三种情况，想象一个节点暂停执行一分钟。在此期间，没有请求被处理，也没有响应被发送。其他节点等待、重试、变得不耐烦，最终宣布该节点死亡并将其装上灵车。最后，暂停结束，节点的线程继续运行，就好像什么都没发生过。其他节点惊讶地看到据称已死的节点突然从棺材里抬起头来，健康状况良好，开始愉快地与旁观者聊天。起初，暂停的节点甚至没有意识到整整一分钟已经过去，它被宣布死亡 —— 从它的角度来看，自从它上次与其他节点交谈以来，几乎没有时间过去。

这些故事的寓意是，节点不一定能信任自己对情况的判断。分布式系统不能完全依赖单个节点，因为节点可能随时失效，可能使系统陷入困境并无法恢复。相反，许多分布式算法依赖于 *仲裁*，即节点之间的投票（见 ["读写仲裁"](/ch6#sec_replication_quorum_condition)）：决策需要来自几个节点的最少票数，以减少对任何一个特定节点的依赖。

这包括关于宣布节点死亡的决定。如果节点的仲裁宣布另一个节点死亡，那么它必须被认为是死亡的，即使该节点仍然感觉自己非常活着。个别节点必须遵守仲裁决定并退出。

最常见的是，仲裁是超过半数节点的绝对多数（尽管其他类型的仲裁也是可能的）。多数仲裁允许系统在少数节点故障时继续工作（三个节点可以容忍一个故障节点；五个节点可以容忍两个故障节点）。然而，它仍然是安全的，因为系统中只能有一个多数 —— 不能同时有两个具有冲突决策的多数。当我们在 [第 10 章](/ch10#ch_consistency) 讨论 *共识算法* 时，我们将更详细地讨论仲裁的使用。

### 分布式锁和租约 {#sec_distributed_lock_fencing}

分布式应用程序中的锁和租约容易被误用，并且是错误的常见来源 [^84]。让我们看看它们如何出错的一个特定案例。

在 ["进程暂停"](#sec_distributed_clocks_pauses) 中，我们看到租约是一种超时的锁，如果旧所有者停止响应（可能是因为它崩溃了、暂停太久或与网络断开连接），可以分配给新所有者。你可以在系统需要只有一个某种东西的情况下使用租约。例如：

* 只允许一个节点成为数据库分片的主节点，以避免脑裂（见 ["处理节点中断"](/ch6#sec_replication_failover)）。
* 只允许一个事务或客户端更新特定资源或对象，以防止并发写入损坏它。
* 只有一个节点应该处理大型处理作业的给定输入文件，以避免由于多个节点冗余地执行相同工作而浪费精力。

值得仔细思考如果几个节点同时认为它们持有租约会发生什么，可能是由于进程暂停。在第三个例子中，后果只是一些浪费的计算资源，这不是什么大问题。但在前两种情况下，后果可能是数据丢失或损坏，这要严重得多。

例如，[图 9-4](#fig_distributed_lease_pause) 显示了由于锁的错误实现导致的数据损坏错误。（该错误不是理论上的：HBase 曾经有这个问题 [^85] [^86]。）假设你想确保存储服务中的文件一次只能由一个客户端访问，因为如果多个客户端试图写入它，文件将被损坏。你尝试通过要求客户端在访问文件之前从锁服务获取租约来实现这一点。这种锁服务通常使用共识算法实现；我们将在 [第 10 章](/ch10#ch_consistency) 中进一步讨论这一点。

{{< figure src="/fig/ddia_0904.png" id="fig_distributed_lease_pause" caption="图 9-4. 分布式锁的错误实现：客户端 1 认为它仍然有有效的租约，即使它已经过期，因此损坏了存储中的文件。" class="w-full my-4" >}}


问题是我们在 ["进程暂停"](#sec_distributed_clocks_pauses) 中讨论的一个例子：如果持有租约的客户端暂停太久，其租约就会过期。另一个客户端可以获得同一文件的租约，并开始写入文件。当暂停的客户端回来时，它（错误地）认为它仍然有有效的租约，并继续写入文件。我们现在有了脑裂情况：客户端的写入冲突并损坏了文件。

[图 9-5](#fig_distributed_lease_delay) 显示了具有类似后果的另一个问题。在这个例子中没有进程暂停，只有客户端 1 的崩溃。就在客户端 1 崩溃之前，它向存储服务发送了一个写请求，但这个请求在网络中被延迟了很长时间。（请记住 ["实践中的网络故障"](#sec_distributed_network_faults)，数据包有时可能会延迟一分钟或更长时间。）当写请求到达存储服务时，租约已经超时，允许客户端 2 获取它并发出自己的写入。结果是类似于 [图 9-4](#fig_distributed_lease_pause) 的损坏。

{{< figure src="/fig/ddia_0905.png" id="fig_distributed_lease_delay" caption="图 9-5. 来自前租约持有者的消息可能会延迟很长时间，并在另一个节点接管租约后到达。" class="w-full my-4" >}}


#### 隔离僵尸进程和延迟请求 {#sec_distributed_fencing_tokens}

术语 *僵尸* 有时用于描述这样的前租约持有者：它尚未发现自己已失去租约，仍在以当前租约持有者的身份行事。由于我们不能完全排除僵尸，我们必须确保它们不能以脑裂的形式造成任何损害。这被称为 *隔离* 僵尸。

一些系统试图通过关闭僵尸来隔离它们，例如通过断开它们与网络的连接 [^9]、通过云提供商的管理界面关闭 VM，甚至物理关闭机器 [^87]。这种方法被称为 *对端节点爆头*（STONITH）。不幸的是，它存在一些问题：它不能防范像 [图 9-5](#fig_distributed_lease_delay) 中那样的大网络延迟；可能会发生所有节点相互关闭的情况 [^19]；到检测到僵尸并关闭它时，可能已经太晚了，数据可能已经被损坏。

一个更强大的隔离解决方案，可以防范僵尸和延迟请求，如 [图 9-6](#fig_distributed_fencing) 所示。

{{< figure src="/fig/ddia_0906.png" id="fig_distributed_fencing" caption="图 9-6. 通过只允许按递增隔离令牌顺序写入来使存储访问安全。" class="w-full my-4" >}}


假设每次锁服务授予锁或租约时，它还返回一个 *隔离令牌*，这是一个每次授予锁时都会增加的数字（例如，由锁服务递增）。然后我们可以要求客户端每次向存储服务发送写请求时，都必须包含其当前的隔离令牌。

--------

> [!NOTE]
> 隔离令牌有几个替代名称。在 Google 的锁服务 Chubby 中，它们被称为 *序列器* [^88]，在 Kafka 中它们被称为 *纪元编号*。在共识算法中，我们将在 [第 10 章](/ch10#ch_consistency) 中讨论，*投票编号*（Paxos）或 *任期编号*（Raft）起着类似的作用。

--------

在 [图 9-6](#fig_distributed_fencing) 中，客户端 1 获得带有令牌 33 的租约，但随后进入长时间暂停，租约过期。客户端 2 获得带有令牌 34 的租约（数字总是增加），然后将其写请求发送到存储服务，包括令牌 34。稍后，客户端 1 恢复执行并将其写入发送到存储服务，包括其令牌值 33。然而，存储服务记得它已经处理了具有更高令牌编号（34）的写入，因此它拒绝带有令牌 33 的请求。刚刚获得租约的客户端必须立即向存储服务进行写入，一旦该写入完成，任何僵尸都被隔离了。

如果 ZooKeeper 是你的锁服务，你可以使用事务 ID `zxid` 或节点版本 `cversion` 作为隔离令牌 [^85]。使用 etcd，修订号与租约 ID 一起起着类似的作用 [^89]。Hazelcast 中的 FencedLock API 明确生成隔离令牌 [^90]。

这种机制要求存储服务有某种方法来检查写入是否基于过时的令牌。或者，服务支持仅在对象自当前客户端上次读取以来未被另一个客户端写入时才成功的写入就足够了，类似于原子比较并设置（CAS）操作。例如，对象存储服务支持这种检查：Amazon S3 称之为 *条件写入*，Azure Blob Storage 称之为 *条件标头*，Google Cloud Storage 称之为 *请求前提条件*。

#### 多副本隔离 {#fencing-with-multiple-replicas}

如果你的客户端只需要写入一个支持此类条件写入的存储服务，锁服务在某种程度上是多余的 [^91] [^92]，因为租约分配本可以直接基于该存储服务实现 [^93]。然而，一旦你有了隔离令牌，你也可以将其用于多个服务或副本，并确保旧的租约持有者在所有这些服务上都被隔离。

例如，想象存储服务是一个具有最后写入胜利冲突解决的无主复制键值存储（见 ["无主复制"](/ch6#sec_replication_leaderless)）。在这样的系统中，客户端直接向每个副本发送写入，每个副本根据客户端分配的时间戳独立决定是否接受写入。

如 [图 9-7](#fig_distributed_fencing_leaderless) 所示，你可以将写入者的隔离令牌放在时间戳的最高有效位或数字中。然后你可以确保新租约持有者生成的任何时间戳都将大于旧租约持有者的任何时间戳，即使旧租约持有者的写入发生得更晚。

{{< figure src="/fig/ddia_0907.png" id="fig_distributed_fencing_leaderless" caption="图 9-7. 使用隔离令牌保护对无主复制数据库的写入。" class="w-full my-4" >}}


在 [图 9-7](#fig_distributed_fencing_leaderless) 中，客户端 2 有隔离令牌 34，因此它所有以 34… 开头的时间戳都大于客户端 1 生成的任何以 33… 开头的时间戳。客户端 2 写入副本的仲裁，但它无法到达副本 3。这意味着当僵尸客户端 1 稍后尝试写入时，它的写入可能在副本 3 上成功，即使它被副本 1 和 2 忽略。这不是问题，因为后续的仲裁读取将更喜欢具有更大时间戳的客户端 2 的写入，读修复或反熵最终将覆盖客户端 1 写入的值。

从这些例子可以看出，假设任何时候只有一个节点持有租约是不安全的。幸运的是，通过一点小心，你可以使用隔离令牌来防止僵尸和延迟请求造成任何损害。

### 拜占庭故障 {#sec_distributed_byzantine}

隔离令牌可以检测并阻止 *无意中* 出错的节点（例如，因为它尚未发现其租约已过期）。然而，如果节点故意想要破坏系统的保证，它可以通过发送带有虚假隔离令牌的消息轻松做到。

在本书中，我们假设节点是不可靠但诚实的：它们可能很慢或从不响应（由于故障），它们的状态可能已过时（由于 GC 暂停或网络延迟），但我们假设如果节点 *确实* 响应，它就是在说 "真话"：据它所知，它正在按协议规则行事。

如果节点可能 "撒谎"（发送任意错误或损坏的响应）的风险存在，分布式系统问题会变得更加困难 —— 例如，它可能在同一次选举中投出多个相互矛盾的票。这种行为被称为 *拜占庭故障*，在这种不信任环境中达成共识的问题被称为 *拜占庭将军问题* [^94]。

> [!TIP] 拜占庭将军问题
>
> 拜占庭将军问题是所谓 *两将军问题* [^95] 的推广，它想象了两个军队将军需要就战斗计划达成一致的情况。由于他们在两个不同的地点扎营，他们只能通过信使进行通信，信使有时会延迟或丢失（就像网络中的数据包）。我们将在 [第 10 章](/ch10#ch_consistency) 中讨论这个 *共识* 问题。
>
> 在问题的拜占庭版本中，有 *n* 个需要达成一致的将军，他们的努力受到他们中间有一些叛徒的阻碍。大多数将军是忠诚的，因此发送真实的消息，但叛徒可能试图通过发送虚假或不真实的消息来欺骗和混淆其他人。事先不知道谁是叛徒。
>
> 拜占庭是一个古希腊城市，后来成为君士坦丁堡，位于现在土耳其的伊斯坦布尔。没有任何历史证据表明拜占庭的将军比其他地方的将军更容易搞阴谋和密谋。相反，这个名字源自 *拜占庭* 一词在 *过于复杂、官僚、狡猾* 的意义上的使用，这个词在计算机出现之前很久就在政治中使用了 [^96]。Lamport 想选择一个不会冒犯任何读者的国籍，他被建议称之为 *阿尔巴尼亚将军问题* 不是个好主意 [^97]。

--------

如果即使某些节点发生故障并且不遵守协议，或者恶意攻击者干扰网络，系统仍能继续正确运行，则该系统是 *拜占庭容错* 的。这种担忧在某些特定情况下是相关的。例如：

* 在航空航天环境中，计算机内存或 CPU 寄存器中的数据可能因辐射而损坏，导致它以任意不可预测的方式响应其他节点。由于系统故障的成本非常高昂（例如，飞机坠毁并杀死机上所有人，或火箭与国际空间站相撞），飞行控制系统必须容忍拜占庭故障 [^98] [^99]。
* 在有多个参与方的系统中，一些参与者可能试图欺骗或欺诈其他人。在这种情况下，节点简单地信任另一个节点的消息是不安全的，因为它们可能是恶意发送的。例如，比特币等加密货币和其他区块链可以被认为是让相互不信任的各方就交易是否发生达成一致的一种方式，而无需依赖中央权威 [^100]。

然而，在我们在本书中讨论的系统类型中，我们通常可以安全地假设没有拜占庭故障。在数据中心中，所有节点都由你的组织控制（因此它们有望被信任），辐射水平足够低，内存损坏不是主要问题（尽管正在考虑轨道数据中心 [^101]）。多租户系统有相互不信任的租户，但它们使用防火墙、虚拟化和访问控制策略相互隔离，而不是使用拜占庭容错。使系统拜占庭容错的协议相当昂贵 [^102]，容错嵌入式系统依赖于硬件级别的支持 [^98]。在大多数服务器端数据系统中，部署拜占庭容错解决方案的成本使它们不切实际。

Web 应用程序确实需要预期客户端在最终用户控制下的任意和恶意行为，例如 Web 浏览器。这就是输入验证、清理和输出转义如此重要的原因：例如，防止 SQL 注入和跨站脚本攻击。然而，我们通常不在这里使用拜占庭容错协议，而只是让服务器成为决定什么客户端行为被允许和不被允许的权威。在没有这种中央权威的点对点网络中，拜占庭容错更相关 [^103] [^104]。

软件中的错误可以被视为拜占庭故障，但如果你将相同的软件部署到所有节点，那么拜占庭容错算法无法拯救你。大多数拜占庭容错算法需要超过三分之二的节点的绝对多数才能正常运行（例如，如果你有四个节点，最多一个可能发生故障）。要使用这种方法对付错误，你必须有四个相同软件的独立实现，并希望错误只出现在四个实现中的一个。

同样，如果协议可以保护我们免受漏洞、安全妥协和恶意攻击，那将是很有吸引力的。不幸的是，这也不现实：在大多数系统中，如果攻击者可以破坏一个节点，他们可能可以破坏所有节点，因为它们可能运行相同的软件。因此，传统机制（身份验证、访问控制、加密、防火墙等）仍然是防范攻击者的主要保护。

<a id="sec_distributed_weak_lying"></a>

#### 弱形式的谎言 {#weak-forms-of-lying}

尽管我们假设节点通常是诚实的，但向软件添加防范弱形式 "谎言" 的机制可能是值得的 —— 例如，由于硬件问题、软件错误和配置错误导致的无效消息。这种保护机制不是完全的拜占庭容错，因为它们无法抵御坚定的对手，但它们仍然是朝着更好可靠性迈出的简单而务实的步骤。例如：

* 由于硬件问题或操作系统、驱动程序、路由器等中的错误，网络数据包有时确实会损坏。通常，损坏的数据包会被内置于 TCP 和 UDP 中的校验和捕获，但有时它们会逃避检测 [^105] [^106] [^107]。简单的措施通常足以防范此类损坏，例如应用程序级协议中的校验和。TLS 加密连接也提供防损坏保护。
* 公开可访问的应用程序必须仔细清理来自用户的任何输入，例如检查值是否在合理范围内，并限制字符串的大小以防止通过大内存分配进行拒绝服务。防火墙后面的内部服务可能能够在输入上进行较少严格的检查，但协议解析器中的基本检查仍然是个好主意 [^105]。
* NTP 客户端可以配置多个服务器地址。同步时，客户端联系所有服务器，估计它们的误差，并检查大多数服务器是否在某个时间范围内达成一致。只要大多数服务器都正常，报告不正确时间的配置错误的 NTP 服务器就会被检测为异常值并从同步中排除 [^39]。使用多个服务器使 NTP 比仅使用单个服务器更健壮。

### 系统模型与现实 {#sec_distributed_system_model}

许多算法被设计来解决分布式系统问题 —— 例如，我们将在 [第 10 章](/ch10#ch_consistency) 中研究共识问题的解决方案。为了有用，这些算法需要容忍我们在本章中讨论的分布式系统的各种故障。

算法需要以不过度依赖于它们运行的硬件和软件配置细节的方式编写。这反过来又要求我们以某种方式形式化我们期望在系统中发生的故障类型。我们通过定义 *系统模型* 来做到这一点，这是一个描述算法可能假设什么事情的抽象。

关于时序假设，三种系统模型常用：

同步模型
: 同步模型假设有界的网络延迟、有界的进程暂停和有界的时钟误差。这并不意味着精确同步的时钟或零网络延迟；它只是意味着你知道网络延迟、暂停和时钟漂移永远不会超过某个固定的上限 [^108]。同步模型不是大多数实际系统的现实模型，因为（如本章所讨论的）无界延迟和暂停确实会发生。

部分同步模型
: 部分同步意味着系统 *大部分时间* 表现得像同步系统，但有时会超过网络延迟、进程暂停和时钟漂移的界限 [^108]。这是许多系统的现实模型：大部分时间，网络和进程表现相当良好 —— 否则我们永远无法完成任何事情 —— 但我们必须考虑到任何时序假设偶尔可能会被打破的事实。发生这种情况时，网络延迟、暂停和时钟误差可能会变得任意大。

异步模型
: 在这个模型中，算法不允许做出任何时序假设 —— 事实上，它甚至没有时钟（因此它不能使用超时）。一些算法可以为异步模型设计，但它非常有限。

此外，除了时序问题，我们还必须考虑节点故障。节点的一些常见系统模型是：

崩溃停止故障
: 在 *崩溃停止*（或 *故障停止*）模型中，算法可以假设节点只能以一种方式失效，即崩溃 [^109]。这意味着节点可能在任何时刻突然停止响应，此后该节点永远消失 —— 它永远不会回来。

崩溃恢复故障
: 我们假设节点可能在任何时刻崩溃，并且可能在某个未知时间后再次开始响应。在崩溃恢复模型中，假设节点具有跨崩溃保留的稳定存储（即非易失性磁盘存储），而内存中的状态假设丢失。

性能下降和部分功能
: 除了崩溃和重启之外，节点可能变慢：它们可能仍然能够响应健康检查请求，但速度太慢而无法完成任何实际工作。例如，千兆网络接口可能由于驱动程序错误突然降至 1 Kb/s 吞吐量 [^110]；处于内存压力下的进程可能会花费大部分时间执行垃圾回收 [^111]；磨损的 SSD 可能具有不稳定的性能；硬件可能受到高温、松动的连接器、机械振动、电源问题、固件错误等的影响 [^112]。这种情况被称为 *跛行节点*、*灰色故障* 或 *慢速故障* [^113]，它可能比干净失效的节点更难处理。一个相关的问题是当进程停止执行它应该做的某些事情，而其他方面继续工作时，例如因为后台线程崩溃或死锁 [^114]。

拜占庭（任意）故障
: 节点可能做任何事情，包括试图戏弄和欺骗其他节点，如上一节所述。

对于建模真实系统，具有崩溃恢复故障的部分同步模型通常是最有用的模型。它允许无界的网络延迟、进程暂停和慢节点。但是分布式算法如何应对该模型？

#### 定义算法的正确性 {#defining-the-correctness-of-an-algorithm}

为了定义算法 *正确* 的含义，我们可以描述它的 *属性*。例如，排序算法的输出具有这样的属性：对于输出列表的任何两个不同元素，左边的元素小于右边的元素。这只是定义列表排序含义的正式方式。

同样，我们可以写下我们希望分布式算法具有的属性，以定义正确的含义。例如，如果我们为锁生成隔离令牌（见 ["隔离僵尸进程和延迟请求"](#sec_distributed_fencing_tokens)），我们可能要求算法具有以下属性：

唯一性
: 没有两个隔离令牌请求返回相同的值。

单调序列
: 如果请求 *x* 返回令牌 *t*<sub>*x*</sub>，请求 *y* 返回令牌 *t*<sub>*y*</sub>，并且 *x* 在 *y* 开始之前完成，则 *t*<sub>*x*</sub> < *t*<sub>*y*</sub>。

可用性
: 请求隔离令牌且不崩溃的节点最终会收到响应。

如果算法在我们假设该系统模型中可能发生的所有情况下始终满足其属性，则该算法在某个系统模型中是正确的。然而，如果所有节点崩溃，或者所有网络延迟突然变得无限长，那么没有算法能够完成任何事情。即使在允许完全失效的系统模型中，我们如何仍然做出有用的保证？

#### 安全性与活性 {#sec_distributed_safety_liveness}

为了澄清情况，值得区分两种不同类型的属性：*安全性* 和 *活性* 属性。在刚才给出的例子中，*唯一性* 和 *单调序列* 是安全属性，但 *可用性* 是活性属性。

什么区分这两种属性？一个迹象是活性属性通常在其定义中包含 "最终" 一词。（是的，你猜对了 —— *最终一致性* 是一个活性属性 [^115]。）

安全性通常被非正式地定义为 *没有坏事发生*，活性被定义为 *好事最终会发生*。然而，最好不要过多地解读这些非正式定义，因为 "好" 和 "坏" 是价值判断，不能很好地应用于算法。安全性和活性的实际定义更精确 [^116]：

* 如果违反了安全属性，我们可以指出它被破坏的特定时间点（例如，如果违反了唯一性属性，我们可以识别返回重复隔离令牌的特定操作）。在违反安全属性之后，违规无法撤消 —— 损害已经造成。
* 活性属性以相反的方式工作：它可能在某个时间点不成立（例如，节点可能已发送请求但尚未收到响应），但总有希望它将来可能得到满足（即通过接收响应）。

区分安全性和活性属性的一个优点是它有助于我们处理困难的系统模型。对于分布式算法，通常要求安全属性在系统模型的所有可能情况下 *始终* 成立 [^108]。也就是说，即使所有节点崩溃，或整个网络失效，算法也必须确保它不会返回错误的结果（即，安全属性保持满足）。

然而，对于活性属性，我们可以附加一些限定条件：例如，我们可以说请求只有在大多数节点没有崩溃时才需要收到响应，并且只有在网络最终从中断中恢复时才需要响应。部分同步模型的定义要求系统最终返回到同步状态 —— 也就是说，任何网络中断期只持续有限的时间，然后被修复。

#### 将系统模型映射到现实世界 {#mapping-system-models-to-the-real-world}

安全性和活性属性以及系统模型对于推理分布式算法的正确性非常有用。然而，在实践中实现算法时，现实的混乱事实又会回来咬你一口，很明显系统模型是现实的简化抽象。

例如，崩溃恢复模型中的算法通常假设稳定存储中的数据在崩溃后幸存。然而，如果磁盘上的数据损坏了，或者由于硬件错误或配置错误而擦除了数据，会发生什么 [^117]？如果服务器有固件错误并且在重启时无法识别其硬盘驱动器，即使驱动器正确连接到服务器，会发生什么 [^118]？

仲裁算法（见 ["读写仲裁"](/ch6#sec_replication_quorum_condition)）依赖于节点记住它声称已存储的数据。如果节点可能患有健忘症并忘记先前存储的数据，那会破坏仲裁条件，从而破坏算法的正确性。也许需要一个新的系统模型，其中我们假设稳定存储大多在崩溃后幸存，但有时可能会丢失。但该模型随后变得更难推理。

算法的理论描述可以声明某些事情被简单地假设不会发生 —— 在非拜占庭系统中，我们确实必须对可能和不可能发生的故障做出一些假设。然而，真正的实现可能仍然必须包含代码来处理被假设为不可能的事情发生的情况，即使该处理归结为 `printf("Sucks to be you")` 和 `exit(666)` —— 即，让人类操作员清理烂摊子 [^119]。（这是计算机科学和软件工程之间的一个区别。）

这并不是说理论上的、抽象的系统模型是无用的 —— 恰恰相反。它们非常有助于将真实系统的复杂性提炼为我们可以推理的可管理的故障集，以便我们可以理解问题并尝试系统地解决它。

### 形式化方法和随机测试 {#sec_distributed_formal}

我们如何知道算法满足所需的属性？由于并发性、部分失效和网络延迟，存在大量潜在状态。我们需要保证属性在每个可能的状态下都成立，并确保我们没有忘记任何边界情况。

一种方法是通过数学描述算法来形式验证它，并使用证明技术来表明它在系统模型允许的所有情况下都满足所需的属性。证明算法正确并不意味着它在真实系统上的 *实现* 必然总是正确运行。但这是一个非常好的第一步，因为理论分析可以发现算法中的问题，这些问题可能在真实系统中长时间隐藏，并且只有当你的假设（例如，关于时序）由于不寻常的情况而失败时才会咬你一口。

将理论分析与经验测试相结合以验证实现按预期运行是明智的。基于属性的测试、模糊测试和确定性模拟测试（DST）等技术使用随机化来在各种情况下测试系统。亚马逊网络服务等公司已成功地在其许多产品上使用了这些技术的组合 [^120] [^121]。

#### 模型检查与规范语言 {#model-checking-and-specification-languages}

*模型检查器* 是帮助验证算法或系统按预期运行的工具。算法规范是用专门构建的语言编写的，如 TLA+、Gallina 或 FizzBee。这些语言使得更容易专注于算法的行为，而不必担心代码实现细节。然后，模型检查器使用这些模型通过系统地尝试所有可能发生的事情来验证不变量在算法的所有状态中都成立。

模型检查实际上不能证明算法的不变量对每个可能的状态都成立，因为大多数现实世界的算法都有无限的状态空间。对所有状态的真正验证需要形式证明，这是可以做到的，但通常比运行模型检查器更困难。相反，模型检查器鼓励你将算法的模型减少到可以完全验证的近似值，或者将执行限制到某个上限（例如，通过设置可以发送的最大消息数）。任何只在更长执行时发生的错误将不会被发现。

尽管如此，模型检查器在易用性和查找非显而易见错误的能力之间取得了很好的平衡。CockroachDB、TiDB、Kafka 和许多其他分布式系统使用模型规范来查找和修复错误 [^122] [^123] [^124]。例如，使用 TLA+，研究人员能够证明由算法的散文描述中的歧义引起的视图戳复制（VR）中数据丢失的可能性 [^125]。

按设计，模型检查器不运行你的实际代码，而是运行一个简化的模型，该模型仅指定你的协议的核心思想。这使得系统地探索状态空间更易处理，但风险在于你的规范和你的实现彼此不同步 [^126]。可以检查模型和真实实现是否具有等效行为，但这需要在真实实现中进行插桩（instrumentation）[^127]。

#### 故障注入 {#sec_fault_injection}

许多错误是在机器和网络故障发生时触发的。故障注入是一种有效（有时令人恐惧）的技术，用于验证系统的实现在出错时是否按预期工作。这个想法很简单：将故障注入到正在运行的系统环境中，看看它如何表现。故障可以是网络故障、机器崩溃、磁盘损坏、暂停的进程 —— 你能想象到的计算机出错的任何事情。

故障注入测试通常在与系统将运行的生产环境非常相似的环境中运行。有些甚至直接将故障注入到他们的生产环境中。Netflix 通过他们的 Chaos Monkey 工具推广了这种方法 [^128]。生产故障注入通常被称为 *混沌工程*，我们在 ["可靠性与容错"](/ch2#sec_introduction_reliability) 中讨论过。

要运行故障注入测试，首先部署被测系统以及故障注入协调器和脚本。协调器负责决定执行什么故障以及何时执行它们。本地或远程脚本负责将故障注入到单个节点或进程中。注入脚本使用许多不同的工具来触发故障。可以使用 Linux 的 `kill` 命令暂停或杀死 Linux 进程，可以使用 `umount` 卸载磁盘，可以通过防火墙设置中断网络连接。你可以在注入故障期间和之后检查系统行为，以确保事情按预期工作。

触发故障所需的无数工具使故障注入测试编写起来很麻烦。采用像 Jepsen 这样的故障注入框架来运行故障注入测试以简化过程是常见的。这些框架带有各种操作系统的集成和许多预构建的故障注入器 [^129]。Jepsen 在许多广泛使用的系统中发现关键错误方面非常有效 [^130] [^131]。

#### 确定性模拟测试 {#deterministic-simulation-testing}

确定性模拟测试（DST）也已成为模型检查和故障注入的流行补充。它使用与模型检查器类似的状态空间探索过程，但它测试你的实际代码，而不是模型。

在 DST 中，模拟自动运行系统的大量随机执行。模拟期间的网络通信、I/O 和时钟时序都被模拟替换，允许模拟器控制事情发生的确切顺序，包括各种时序和故障场景。这允许模拟器探索比手写测试或故障注入更多的情况。如果测试失败，它可以重新运行，因为模拟器知道触发故障的确切操作顺序 —— 与故障注入相比，后者对系统没有如此细粒度的控制。

DST 要求模拟器能够控制所有非确定性来源，例如网络延迟。通常采用三种策略之一来使代码确定性：

应用程序级
: 一些系统从头开始构建，以便于确定性地执行代码。例如，DST 领域的先驱之一 FoundationDB 是使用称为 Flow 的异步通信库构建的。Flow 为开发人员提供了将确定性网络模拟注入系统的点 [^132]。类似地，TigerBeetle 是一个具有一流 DST 支持的在线事务处理（OLTP）数据库。系统的状态被建模为状态机，所有突变都发生在单个事件循环中。当与模拟确定性原语（如时钟）结合时，这种架构能够确定性地运行 [^133]。

运行时级
: 具有异步运行时和常用库的语言提供了引入确定性的插入点。使用单线程运行时强制所有异步代码按顺序运行。例如，FrostDB 修补 Go 的运行时以按顺序执行 goroutine [^134]。Rust 的 madsim 库以类似的方式工作。Madsim 提供了 Tokio 的异步运行时 API、AWS 的 S3 库、Kafka 的 Rust 库等的确定性实现。应用程序可以交换确定性库和运行时以获得确定性测试执行，而无需更改其代码。

机器级
: 与其在运行时修补代码，不如使整个机器具有确定性。这是一个微妙的过程，需要机器对所有通常非确定性的调用都给出确定性的响应。Antithesis 等工具通过构建自定义虚拟机管理程序来做到这一点，该虚拟机管理程序用确定性操作替换通常的非确定性操作。从时钟到网络和存储的一切都需要考虑。不过，一旦完成，开发人员可以在虚拟机管理程序内的容器集合中运行其整个分布式系统，并获得完全确定性的分布式系统。

DST 提供了超越可重放性的几个优势。Antithesis 等工具试图通过在发现不太常见的行为时将测试执行分支为多个子执行来探索应用程序代码中的许多不同代码路径。由于确定性测试通常使用模拟时钟和网络调用，因此此类测试可以比挂钟时间运行得更快。例如，TigerBeetle 的时间抽象允许模拟器模拟网络延迟和超时，而无需真正等待触发超时所需的全部时间。这些技术允许模拟器更快地探索更多代码路径。

#### 确定性的力量 {#sidebar_distributed_determinism}

非确定性是我们在本章中讨论的所有分布式系统挑战的核心：并发性、网络延迟、进程暂停、时钟跳跃和崩溃都以不可预测的方式发生，从系统的一次运行到下一次运行都不同。相反，如果你能使系统确定性，那可以极大地简化事情。

事实上，使事物确定性是一个简单但强大的想法，在分布式系统设计中一再出现。除了确定性模拟测试，我们在过去的章节中已经看到了几种使用确定性的方法：

* 事件溯源的一个关键优势（见 ["事件溯源和 CQRS"](/ch3#sec_datamodels_events)）是你可以确定性地重放事件日志以重建派生的物化视图。
* 工作流引擎（见 ["持久执行和工作流"](/ch5#sec_encoding_dataflow_workflows)）依赖于工作流定义是确定性的，以提供持久执行语义。
* *状态机复制*，我们将在 ["使用共享日志"](/ch10#sec_consistency_smr) 中讨论，通过在每个副本上独立执行相同的确定性事务序列来复制数据。我们已经看到了这个想法的两个变体：基于语句的复制（见 ["复制日志的实现"](/ch6#sec_replication_implementation)）和使用存储过程的串行事务执行（见 ["存储过程的利弊"](/ch8#sec_transactions_stored_proc_tradeoffs)）。

然而，使代码完全确定性需要小心。即使你已经删除了所有并发性并用确定性模拟替换了 I/O、网络通信、时钟和随机数生成器，非确定性元素可能仍然存在。例如，在某些编程语言中，迭代哈希表元素的顺序可能是非确定性的。是否遇到资源限制（内存分配失败、堆栈溢出）也是非确定性的。

## 总结 {#summary}

在本章中，我们讨论了分布式系统中可能发生的各种问题，包括：

* 每当你尝试通过网络发送数据包时，它可能会丢失或任意延迟。同样，回复可能会丢失或延迟，所以如果你没有得到回复，你不知道消息是否送达。
* 节点的时钟可能与其他节点严重不同步（尽管你尽最大努力设置了 NTP），它可能会突然向前或向后跳跃，而依赖它是危险的，因为你很可能没有一个好的时钟置信区间度量。
* 进程可能在其执行的任何时刻暂停相当长的时间，被其他节点宣告死亡，然后再次恢复活动而没有意识到它曾暂停。

这种 *部分失效* 可能发生的事实是分布式系统的决定性特征。每当软件尝试做任何涉及其他节点的事情时，都有可能偶尔失败、随机变慢或根本没有响应（并最终超时）。在分布式系统中，我们尝试将对部分失效的容忍构建到软件中，这样即使某些组成部分出现故障，整个系统也可以继续运行。

要容忍故障，第一步是 *检测* 它们，但即使这样也很困难。大多数系统没有准确的机制来检测节点是否已失败，因此大多数分布式算法依赖超时来确定远程节点是否仍然可用。然而，超时无法区分网络和节点故障，可变的网络延迟有时会导致节点被错误地怀疑崩溃。处理跛行节点（limping nodes）更加困难，这些节点正在响应但速度太慢而无法做任何有用的事情。

一旦检测到故障，让系统容忍它也不容易：没有全局变量、没有共享内存、没有公共知识或机器之间任何其他类型的共享状态 [^83]。节点甚至无法就现在是什么时间达成一致，更不用说任何更深刻的事情了。信息从一个节点流向另一个节点的唯一方式是通过不可靠的网络发送。单个节点无法安全地做出重大决策，因此我们需要协议来征求其他节点的帮助并尝试获得仲裁（法定人数）的同意。

如果你习惯于在单台计算机的理想数学完美环境中编写软件，其中相同的操作总是确定性地返回相同的结果，那么转向分布式系统混乱的物理现实可能会有点震惊。相反，分布式系统工程师通常会认为如果一个问题可以在单台计算机上解决，那它就是微不足道的 [^4]，而且单台计算机现在确实可以做很多事情。如果你可以避免打开潘多拉的盒子，只需将事情保持在单台机器上，例如使用嵌入式存储引擎（见 ["嵌入式存储引擎"](/ch4#sidebar_embedded)），通常值得这样做。

然而，正如在 ["分布式系统与单节点系统"](/ch1#sec_introduction_distributed) 中讨论的，可伸缩性并不是使用分布式系统的唯一原因。容错和低延迟（通过将数据在地理上放置在靠近用户的位置）是同样重要的目标，而这些事情无法通过单个节点实现。分布式系统的力量在于，原则上它们可以在服务层面永远运行而不被中断，因为所有故障和维护都可以在节点层面处理。（实际上，如果错误的配置更改被推送到所有节点，仍然会让分布式系统崩溃。）

在本章中，我们还探讨了网络、时钟和进程的不可靠性是否是不可避免的自然法则。我们看到它不是：可以在网络中提供硬实时响应保证和有界延迟，但这样做非常昂贵，并导致硬件资源利用率降低。大多数非安全关键系统选择便宜和不可靠而不是昂贵和可靠。

本章一直在讨论问题，给了我们一个暗淡的前景。在下一章中，我们将转向解决方案，并讨论一些为应对分布式系统中的问题而设计的算法。


### 参考

[^1]: Mark Cavage. [There’s Just No Getting Around It: You’re Building a Distributed System](https://queue.acm.org/detail.cfm?id=2482856). *ACM Queue*, volume 11, issue 4, pages 80-89, April 2013. [doi:10.1145/2466486.2482856](https://doi.org/10.1145/2466486.2482856) 
[^2]: Jay Kreps. [Getting Real About Distributed System Reliability](https://blog.empathybox.com/post/19574936361/getting-real-about-distributed-system-reliability). *blog.empathybox.com*, March 2012. Archived at [perma.cc/9B5Q-AEBW](https://perma.cc/9B5Q-AEBW) 
[^3]: Coda Hale. [You Can’t Sacrifice Partition Tolerance](https://codahale.com/you-cant-sacrifice-partition-tolerance/). *codahale.com*, October 2010. Archived at [perma.cc/6GJU-X4G5](https://perma.cc/6GJU-X4G5) 
[^4]: Jeff Hodges. [Notes on Distributed Systems for Young Bloods](https://www.somethingsimilar.com/2013/01/14/notes-on-distributed-systems-for-young-bloods/). *somethingsimilar.com*, January 2013. Archived at [perma.cc/B636-62CE](https://perma.cc/B636-62CE) 
[^5]: Van Jacobson. [Congestion Avoidance and Control](https://www.cs.usask.ca/ftp/pub/discus/seminars2002-2003/p314-jacobson.pdf). At *ACM Symposium on Communications Architectures and Protocols* (SIGCOMM), August 1988. [doi:10.1145/52324.52356](https://doi.org/10.1145/52324.52356) 
[^6]: Bert Hubert. [The Ultimate SO\_LINGER Page, or: Why Is My TCP Not Reliable](https://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable). *blog.netherlabs.nl*, January 2009. Archived at [perma.cc/6HDX-L2RR](https://perma.cc/6HDX-L2RR) 
[^7]: Jerome H. Saltzer, David P. Reed, and David D. Clark. [End-To-End Arguments in System Design](https://groups.csail.mit.edu/ana/Publications/PubPDFs/End-to-End%20Arguments%20in%20System%20Design.pdf). *ACM Transactions on Computer Systems*, volume 2, issue 4, pages 277–288, November 1984. [doi:10.1145/357401.357402](https://doi.org/10.1145/357401.357402) 
[^8]: Peter Bailis and Kyle Kingsbury. [The Network Is Reliable](https://queue.acm.org/detail.cfm?id=2655736). *ACM Queue*, volume 12, issue 7, pages 48-55, July 2014. [doi:10.1145/2639988.2639988](https://doi.org/10.1145/2639988.2639988) 
[^9]: Joshua B. Leners, Trinabh Gupta, Marcos K. Aguilera, and Michael Walfish. [Taming Uncertainty in Distributed Systems with Help from the Network](https://cs.nyu.edu/~mwalfish/papers/albatross-eurosys15.pdf). At *10th European Conference on Computer Systems* (EuroSys), April 2015. [doi:10.1145/2741948.2741976](https://doi.org/10.1145/2741948.2741976) 
[^10]: Phillipa Gill, Navendu Jain, and Nachiappan Nagappan. [Understanding Network Failures in Data Centers: Measurement, Analysis, and Implications](https://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p350.pdf). At *ACM SIGCOMM Conference*, August 2011. [doi:10.1145/2018436.2018477](https://doi.org/10.1145/2018436.2018477) 
[^11]: Urs Hölzle. [But recently a farmer had started grazing a herd of cows nearby. And whenever they stepped on the fiber link, they bent it enough to cause a blip](https://x.com/uhoelzle/status/1263333283107991558). *x.com*, May 2020. Archived at [perma.cc/WX8X-ZZA5](https://perma.cc/WX8X-ZZA5) 
[^12]: CBC News. [Hundreds lose internet service in northern B.C. after beaver chews through cable](https://www.cbc.ca/news/canada/british-columbia/beaver-internet-down-tumbler-ridge-1.6001594). *cbc.ca*, April 2021. Archived at [perma.cc/UW8C-H2MY](https://perma.cc/UW8C-H2MY) 
[^13]: Will Oremus. [The Global Internet Is Being Attacked by Sharks, Google Confirms](https://slate.com/technology/2014/08/shark-attacks-threaten-google-s-undersea-internet-cables-video.html). *slate.com*, August 2014. Archived at [perma.cc/P6F3-C6YG](https://perma.cc/P6F3-C6YG) 
[^14]: Jess Auerbach Jahajeeah. [Down to the wire: The ship fixing our internet](https://continent.substack.com/p/down-to-the-wire-the-ship-fixing). *continent.substack.com*, November 2023. Archived at [perma.cc/DP7B-EQ7S](https://perma.cc/DP7B-EQ7S) 
[^15]: Santosh Janardhan. [More details about the October 4 outage](https://engineering.fb.com/2021/10/05/networking-traffic/outage-details/). *engineering.fb.com*, October 2021. Archived at [perma.cc/WW89-VSXH](https://perma.cc/WW89-VSXH) 
[^16]: Tom Parfitt. [Georgian woman cuts off web access to whole of Armenia](https://www.theguardian.com/world/2011/apr/06/georgian-woman-cuts-web-access). *theguardian.com*, April 2011. Archived at [perma.cc/KMC3-N3NZ](https://perma.cc/KMC3-N3NZ) 
[^17]: Antonio Voce, Tural Ahmedzade, and Ashley Kirk. [‘Shadow fleets’ and subaquatic sabotage: are Europe’s undersea internet cables under attack?](https://www.theguardian.com/world/ng-interactive/2025/mar/05/shadow-fleets-subaquatic-sabotage-europe-undersea-internet-cables-under-attack) *theguardian.com*, March 2025. Archived at [perma.cc/HA7S-ZDBV](https://perma.cc/HA7S-ZDBV)
[^18]: Shengyun Liu, Paolo Viotti, Christian Cachin, Vivien Quéma, and Marko Vukolić. [XFT: Practical Fault Tolerance beyond Crashes](https://www.usenix.org/system/files/conference/osdi16/osdi16-liu.pdf). At *12th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2016. 
[^19]: Mark Imbriaco. [Downtime last Saturday](https://github.blog/news-insights/the-library/downtime-last-saturday/). *github.blog*, December 2012. Archived at [perma.cc/M7X5-E8SQ](https://perma.cc/M7X5-E8SQ) 
[^20]: Tom Lianza and Chris Snook. [A Byzantine failure in the real world](https://blog.cloudflare.com/a-byzantine-failure-in-the-real-world/). *blog.cloudflare.com*, November 2020. Archived at [perma.cc/83EZ-ALCY](https://perma.cc/83EZ-ALCY) 
[^21]: Mohammed Alfatafta, Basil Alkhatib, Ahmed Alquraan, and Samer Al-Kiswany. [Toward a Generic Fault Tolerance Technique for Partial Network Partitioning](https://www.usenix.org/conference/osdi20/presentation/alfatafta). At *14th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2020. 
[^22]: Marc A. Donges. [Re: bnx2 cards Intermittantly Going Offline](https://www.spinics.net/lists/netdev/msg210485.html). Message to Linux *netdev* mailing list, *spinics.net*, September 2012. Archived at [perma.cc/TXP6-H8R3](https://perma.cc/TXP6-H8R3) 
[^23]: Troy Toman. [Inside a CODE RED: Network Edition](https://signalvnoise.com/svn3/inside-a-code-red-network-edition/). *signalvnoise.com*, September 2020. Archived at [perma.cc/BET6-FY25](https://perma.cc/BET6-FY25) 
[^24]: Kyle Kingsbury. [Call Me Maybe: Elasticsearch](https://aphyr.com/posts/317-call-me-maybe-elasticsearch). *aphyr.com*, June 2014. Archived at [perma.cc/JK47-S89J](https://perma.cc/JK47-S89J)
[^25]: Salvatore Sanfilippo. [A Few Arguments About Redis Sentinel Properties and Fail Scenarios](https://antirez.com/news/80). *antirez.com*, October 2014. Archived at [perma.cc/8XEU-CLM8](https://perma.cc/8XEU-CLM8)
[^26]: Nicolas Liochon. [CAP: If All You Have Is a Timeout, Everything Looks Like a Partition](http://blog.thislongrun.com/2015/05/CAP-theorem-partition-timeout-zookeeper.html). *blog.thislongrun.com*, May 2015. Archived at [perma.cc/FS57-V2PZ](https://perma.cc/FS57-V2PZ) 
[^27]: Matthew P. Grosvenor, Malte Schwarzkopf, Ionel Gog, Robert N. M. Watson, Andrew W. Moore, Steven Hand, and Jon Crowcroft. [Queues Don’t Matter When You Can JUMP Them!](https://www.usenix.org/system/files/conference/nsdi15/nsdi15-paper-grosvenor_update.pdf) At *12th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), May 2015. 
[^28]: Theo Julienne. [Debugging network stalls on Kubernetes](https://github.blog/engineering/debugging-network-stalls-on-kubernetes/). *github.blog*, November 2019. Archived at [perma.cc/K9M8-XVGL](https://perma.cc/K9M8-XVGL) 
[^29]: Guohui Wang and T. S. Eugene Ng. [The Impact of Virtualization on Network Performance of Amazon EC2 Data Center](https://www.cs.rice.edu/~eugeneng/papers/INFOCOM10-ec2.pdf). At *29th IEEE International Conference on Computer Communications* (INFOCOM), March 2010. [doi:10.1109/INFCOM.2010.5461931](https://doi.org/10.1109/INFCOM.2010.5461931) 
[^30]: Brandon Philips. [etcd: Distributed Locking and Service Discovery](https://www.youtube.com/watch?v=HJIjTTHWYnE). At *Strange Loop*, September 2014. 
[^31]: Steve Newman. [A Systematic Look at EC2 I/O](https://www.sentinelone.com/blog/a-systematic-look-at-ec2-i-o/). *blog.scalyr.com*, October 2012. Archived at [perma.cc/FL4R-H2VE](https://perma.cc/FL4R-H2VE) 
[^32]: Naohiro Hayashibara, Xavier Défago, Rami Yared, and Takuya Katayama. [The ϕ Accrual Failure Detector](https://hdl.handle.net/10119/4784). Japan Advanced Institute of Science and Technology, School of Information Science, Technical Report IS-RR-2004-010, May 2004. Archived at [perma.cc/NSM2-TRYA](https://perma.cc/NSM2-TRYA) 
[^33]: Jeffrey Wang. [Phi Accrual Failure Detector](https://ternarysearch.blogspot.com/2013/08/phi-accrual-failure-detector.html). *ternarysearch.blogspot.co.uk*, August 2013. Archived at [perma.cc/L452-AMLV](https://perma.cc/L452-AMLV)
[^34]: Srinivasan Keshav. *An Engineering Approach to Computer Networking: ATM Networks, the Internet, and the Telephone Network*. Addison-Wesley Professional, May 1997. ISBN: 978-0-201-63442-6 
[^35]: Othmar Kyas. *ATM Networks*. International Thomson Publishing, 1995. ISBN: 978-1-850-32128-6 
[^36]: Mellanox Technologies. [InfiniBand FAQ, Rev 1.3](https://network.nvidia.com/related-docs/whitepapers/InfiniBandFAQ_FQ_100.pdf). *network.nvidia.com*, December 2014. Archived at [perma.cc/LQJ4-QZVK](https://perma.cc/LQJ4-QZVK) 
[^37]: Jose Renato Santos, Yoshio Turner, and G. (John) Janakiraman. [End-to-End Congestion Control for InfiniBand](https://infocom2003.ieee-infocom.org/papers/28_01.PDF). At *22nd Annual Joint Conference of the IEEE Computer and Communications Societies* (INFOCOM), April 2003. Also published by HP Laboratories Palo Alto, Tech Report HPL-2002-359. [doi:10.1109/INFCOM.2003.1208949](https://doi.org/10.1109/INFCOM.2003.1208949) 
[^38]: Jialin Li, Naveen Kr. Sharma, Dan R. K. Ports, and Steven D. Gribble. [Tales of the Tail: Hardware, OS, and Application-level Sources of Tail Latency](https://syslab.cs.washington.edu/papers/latency-socc14.pdf). At *ACM Symposium on Cloud Computing* (SoCC), November 2014. [doi:10.1145/2670979.2670988](https://doi.org/10.1145/2670979.2670988)
[^39]: Ulrich Windl, David Dalton, Marc Martinec, and Dale R. Worley. [The NTP FAQ and HOWTO](https://www.ntp.org/ntpfaq/). *ntp.org*, November 2006. 
[^40]: John Graham-Cumming. [How and why the leap second affected Cloudflare DNS](https://blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/). *blog.cloudflare.com*, January 2017. Archived at [archive.org](https://web.archive.org/web/20250202041444/https%3A//blog.cloudflare.com/how-and-why-the-leap-second-affected-cloudflare-dns/) 
[^41]: David Holmes. [Inside the Hotspot VM: Clocks, Timers and Scheduling Events – Part I – Windows](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks). *blogs.oracle.com*, October 2006. Archived at [archive.org](https://web.archive.org/web/20160308031939/https%3A//blogs.oracle.com/dholmes/entry/inside_the_hotspot_vm_clocks) 
[^42]: Joran Dirk Greef. [Three Clocks are Better than One](https://tigerbeetle.com/blog/2021-08-30-three-clocks-are-better-than-one/). *tigerbeetle.com*, August 2021. Archived at [perma.cc/5RXG-EU6B](https://perma.cc/5RXG-EU6B) 
[^43]: Oliver Yang. [Pitfalls of TSC usage](https://oliveryang.net/2015/09/pitfalls-of-TSC-usage/). *oliveryang.net*, September 2015. Archived at [perma.cc/Z2QY-5FRA](https://perma.cc/Z2QY-5FRA) 
[^44]: Steve Loughran. [Time on Multi-Core, Multi-Socket Servers](https://steveloughran.blogspot.com/2015/09/time-on-multi-core-multi-socket-servers.html). *steveloughran.blogspot.co.uk*, September 2015. Archived at [perma.cc/7M4S-D4U6](https://perma.cc/7M4S-D4U6) 
[^45]: James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, JJ Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Dale Woodford, Yasushi Saito, Christopher Taylor, Michal Szymaniak, and Ruth Wang. [Spanner: Google’s Globally-Distributed Database](https://research.google/pubs/pub39966/). At *10th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), October 2012.
[^46]: M. Caporaloni and R. Ambrosini. [How Closely Can a Personal Computer Clock Track the UTC Timescale Via the Internet?](https://iopscience.iop.org/0143-0807/23/4/103/) *European Journal of Physics*, volume 23, issue 4, pages L17–L21, June 2002. [doi:10.1088/0143-0807/23/4/103](https://doi.org/10.1088/0143-0807/23/4/103)
[^47]: Nelson Minar. [A Survey of the NTP Network](https://alumni.media.mit.edu/~nelson/research/ntp-survey99/). *alumni.media.mit.edu*, December 1999. Archived at [perma.cc/EV76-7ZV3](https://perma.cc/EV76-7ZV3) 
[^48]: Viliam Holub. [Synchronizing Clocks in a Cassandra Cluster Pt. 1 – The Problem](https://blog.rapid7.com/2014/03/14/synchronizing-clocks-in-a-cassandra-cluster-pt-1-the-problem/). *blog.rapid7.com*, March 2014. Archived at [perma.cc/N3RV-5LNL](https://perma.cc/N3RV-5LNL) 
[^49]: Poul-Henning Kamp. [The One-Second War (What Time Will You Die?)](https://queue.acm.org/detail.cfm?id=1967009) *ACM Queue*, volume 9, issue 4, pages 44–48, April 2011. [doi:10.1145/1966989.1967009](https://doi.org/10.1145/1966989.1967009) 
[^50]: Nelson Minar. [Leap Second Crashes Half the Internet](https://www.somebits.com/weblog/tech/bad/leap-second-2012.html). *somebits.com*, July 2012. Archived at [perma.cc/2WB8-D6EU](https://perma.cc/2WB8-D6EU) 
[^51]: Christopher Pascoe. [Time, Technology and Leaping Seconds](https://googleblog.blogspot.com/2011/09/time-technology-and-leaping-seconds.html). *googleblog.blogspot.co.uk*, September 2011. Archived at [perma.cc/U2JL-7E74](https://perma.cc/U2JL-7E74) 
[^52]: Mingxue Zhao and Jeff Barr. [Look Before You Leap – The Coming Leap Second and AWS](https://aws.amazon.com/blogs/aws/look-before-you-leap-the-coming-leap-second-and-aws/). *aws.amazon.com*, May 2015. Archived at [perma.cc/KPE9-XMFM](https://perma.cc/KPE9-XMFM) 
[^53]: Darryl Veitch and Kanthaiah Vijayalayan. [Network Timing and the 2015 Leap Second](https://opus.lib.uts.edu.au/bitstream/10453/43923/1/LeapSecond_camera.pdf). At *17th International Conference on Passive and Active Measurement* (PAM), April 2016. [doi:10.1007/978-3-319-30505-9\_29](https://doi.org/10.1007/978-3-319-30505-9_29) 
[^54]: VMware, Inc. [Timekeeping in VMware Virtual Machines](https://www.vmware.com/docs/vmware_timekeeping). *vmware.com*, October 2008. Archived at [perma.cc/HM5R-T5NF](https://perma.cc/HM5R-T5NF) 
[^55]: Victor Yodaiken. [Clock Synchronization in Finance and Beyond](https://www.yodaiken.com/wp-content/uploads/2018/05/financeandbeyond.pdf). *yodaiken.com*, November 2017. Archived at [perma.cc/9XZD-8ZZN](https://perma.cc/9XZD-8ZZN) 
[^56]: Mustafa Emre Acer, Emily Stark, Adrienne Porter Felt, Sascha Fahl, Radhika Bhargava, Bhanu Dev, Matt Braithwaite, Ryan Sleevi, and Parisa Tabriz. [Where the Wild Warnings Are: Root Causes of Chrome HTTPS Certificate Errors](https://acmccs.github.io/papers/p1407-acerA.pdf). At *ACM SIGSAC Conference on Computer and Communications Security* (CCS), pages 1407–1420, October 2017. [doi:10.1145/3133956.3134007](https://doi.org/10.1145/3133956.3134007) 
[^57]: European Securities and Markets Authority. [MiFID II / MiFIR: Regulatory Technical and Implementing Standards – Annex I](https://www.esma.europa.eu/sites/default/files/library/2015/11/2015-esma-1464_annex_i_-_draft_rts_and_its_on_mifid_ii_and_mifir.pdf). *esma.europa.eu*, Report ESMA/2015/1464, September 2015. Archived at [perma.cc/ZLX9-FGQ3](https://perma.cc/ZLX9-FGQ3) 
[^58]: Luke Bigum. [Solving MiFID II Clock Synchronisation With Minimum Spend (Part 1)](https://catach.blogspot.com/2015/11/solving-mifid-ii-clock-synchronisation.html). *catach.blogspot.com*, November 2015. Archived at [perma.cc/4J5W-FNM4](https://perma.cc/4J5W-FNM4) 
[^59]: Oleg Obleukhov and Ahmad Byagowi. [How Precision Time Protocol is being deployed at Meta](https://engineering.fb.com/2022/11/21/production-engineering/precision-time-protocol-at-meta/). *engineering.fb.com*, November 2022. Archived at [perma.cc/29G6-UJNW](https://perma.cc/29G6-UJNW) 
[^60]: John Wiseman. [gpsjam.org](https://gpsjam.org/), July 2022. 
[^61]: Josh Levinson, Julien Ridoux, and Chris Munns. [It’s About Time: Microsecond-Accurate Clocks on Amazon EC2 Instances](https://aws.amazon.com/blogs/compute/its-about-time-microsecond-accurate-clocks-on-amazon-ec2-instances/). *aws.amazon.com*, November 2023. Archived at [perma.cc/56M6-5VMZ](https://perma.cc/56M6-5VMZ) 
[^62]: Kyle Kingsbury. [Call Me Maybe: Cassandra](https://aphyr.com/posts/294-call-me-maybe-cassandra/). *aphyr.com*, September 2013. Archived at [perma.cc/4MBR-J96V](https://perma.cc/4MBR-J96V) 
[^63]: John Daily. [Clocks Are Bad, or, Welcome to the Wonderful World of Distributed Systems](https://riak.com/clocks-are-bad-or-welcome-to-distributed-systems/). *riak.com*, November 2013. Archived at [perma.cc/4XB5-UCXY](https://perma.cc/4XB5-UCXY) 
[^64]: Marc Brooker. [It’s About Time!](https://brooker.co.za/blog/2023/11/27/about-time.html) *brooker.co.za*, November 2023. Archived at [perma.cc/N6YK-DRPA](https://perma.cc/N6YK-DRPA) 
[^65]: Kyle Kingsbury. [The Trouble with Timestamps](https://aphyr.com/posts/299-the-trouble-with-timestamps). *aphyr.com*, October 2013. Archived at [perma.cc/W3AM-5VAV](https://perma.cc/W3AM-5VAV) 
[^66]: Leslie Lamport. [Time, Clocks, and the Ordering of Events in a Distributed System](https://www.microsoft.com/en-us/research/publication/time-clocks-ordering-events-distributed-system/). *Communications of the ACM*, volume 21, issue 7, pages 558–565, July 1978. [doi:10.1145/359545.359563](https://doi.org/10.1145/359545.359563) 
[^67]: Justin Sheehy. [There Is No Now: Problems With Simultaneity in Distributed Systems](https://queue.acm.org/detail.cfm?id=2745385). *ACM Queue*, volume 13, issue 3, pages 36–41, March 2015. [doi:10.1145/2733108](https://doi.org/10.1145/2733108) 
[^68]: Murat Demirbas. [Spanner: Google’s Globally-Distributed Database](https://muratbuffalo.blogspot.com/2013/07/spanner-googles-globally-distributed_4.html). *muratbuffalo.blogspot.co.uk*, July 2013. Archived at [perma.cc/6VWR-C9WB](https://perma.cc/6VWR-C9WB) 
[^69]: Dahlia Malkhi and Jean-Philippe Martin. [Spanner’s Concurrency Control](https://www.cs.cornell.edu/~ie53/publications/DC-col51-Sep13.pdf). *ACM SIGACT News*, volume 44, issue 3, pages 73–77, September 2013. [doi:10.1145/2527748.2527767](https://doi.org/10.1145/2527748.2527767) 
[^70]: Franck Pachot. [Achieving Precise Clock Synchronization on AWS](https://www.yugabyte.com/blog/aws-clock-synchronization/). *yugabyte.com*, December 2024. Archived at [perma.cc/UYM6-RNBS](https://perma.cc/UYM6-RNBS) 
[^71]: Spencer Kimball. [Living Without Atomic Clocks: Where CockroachDB and Spanner diverge](https://www.cockroachlabs.com/blog/living-without-atomic-clocks/). *cockroachlabs.com*, January 2022. Archived at [perma.cc/AWZ7-RXFT](https://perma.cc/AWZ7-RXFT) 
[^72]: Murat Demirbas. [Use of Time in Distributed Databases (part 4): Synchronized clocks in production databases](https://muratbuffalo.blogspot.com/2025/01/use-of-time-in-distributed-databases.html). *muratbuffalo.blogspot.com*, January 2025. Archived at [perma.cc/9WNX-Q9U3](https://perma.cc/9WNX-Q9U3) 
[^73]: Cary G. Gray and David R. Cheriton. [Leases: An Efficient Fault-Tolerant Mechanism for Distributed File Cache Consistency](https://courses.cs.duke.edu/spring11/cps210/papers/p202-gray.pdf). At *12th ACM Symposium on Operating Systems Principles* (SOSP), December 1989. [doi:10.1145/74850.74870](https://doi.org/10.1145/74850.74870) 
[^74]: Daniel Sturman, Scott Delap, Max Ross, et al. [Roblox Return to Service](https://corp.roblox.com/newsroom/2022/01/roblox-return-to-service-10-28-10-31-2021). *corp.roblox.com*, January 2022. Archived at [perma.cc/8ALT-WAS4](https://perma.cc/8ALT-WAS4) 
[^75]: Todd Lipcon. [Avoiding Full GCs with MemStore-Local Allocation Buffers](https://www.slideshare.net/slideshow/hbase-hug-presentation/7038178). *slideshare.net*, February 2011. Archived at [perma.cc/CH62-2EWJ](https://perma.cc/CH62-2EWJ)
[^76]: Christopher Clark, Keir Fraser, Steven Hand, Jacob Gorm Hansen, Eric Jul, Christian Limpach, Ian Pratt, and Andrew Warfield. [Live Migration of Virtual Machines](https://www.usenix.org/legacy/publications/library/proceedings/nsdi05/tech/full_papers/clark/clark.pdf). At *2nd USENIX Symposium on Networked Systems Design & Implementation* (NSDI), May 2005.
[^77]: Mike Shaver. [fsyncers and Curveballs](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/). *shaver.off.net*, May 2008. Archived at [archive.org](https://web.archive.org/web/20220107141023/http%3A//shaver.off.net/diary/2008/05/25/fsyncers-and-curveballs/) 
[^78]: Zhenyun Zhuang and Cuong Tran. [Eliminating Large JVM GC Pauses Caused by Background IO Traffic](https://engineering.linkedin.com/blog/2016/02/eliminating-large-jvm-gc-pauses-caused-by-background-io-traffic). *engineering.linkedin.com*, February 2016. Archived at [perma.cc/ML2M-X9XT](https://perma.cc/ML2M-X9XT) 
[^79]: Martin Thompson. [Java Garbage Collection Distilled](https://mechanical-sympathy.blogspot.com/2013/07/java-garbage-collection-distilled.html). *mechanical-sympathy.blogspot.co.uk*, July 2013. Archived at [perma.cc/DJT3-NQLQ](https://perma.cc/DJT3-NQLQ) 
[^80]: David Terei and Amit Levy. [Blade: A Data Center Garbage Collector](https://arxiv.org/pdf/1504.02578). arXiv:1504.02578, April 2015. 
[^81]: Martin Maas, Tim Harris, Krste Asanović, and John Kubiatowicz. [Trash Day: Coordinating Garbage Collection in Distributed Systems](https://timharris.uk/papers/2015-hotos.pdf). At *15th USENIX Workshop on Hot Topics in Operating Systems* (HotOS), May 2015. 
[^82]: Martin Fowler. [The LMAX Architecture](https://martinfowler.com/articles/lmax.html). *martinfowler.com*, July 2011. Archived at [perma.cc/5AV4-N6RJ](https://perma.cc/5AV4-N6RJ) 
[^83]: Joseph Y. Halpern and Yoram Moses. [Knowledge and common knowledge in a distributed environment](https://groups.csail.mit.edu/tds/papers/Halpern/JACM90.pdf). *Journal of the ACM* (JACM), volume 37, issue 3, pages 549–587, July 1990. [doi:10.1145/79147.79161](https://doi.org/10.1145/79147.79161) 
[^84]: Chuzhe Tang, Zhaoguo Wang, Xiaodong Zhang, Qianmian Yu, Binyu Zang, Haibing Guan, and Haibo Chen. [Ad Hoc Transactions in Web Applications: The Good, the Bad, and the Ugly](https://ipads.se.sjtu.edu.cn/_media/publications/concerto-sigmod22.pdf). At *ACM International Conference on Management of Data* (SIGMOD), June 2022. [doi:10.1145/3514221.3526120](https://doi.org/10.1145/3514221.3526120) 
[^85]: Flavio P. Junqueira and Benjamin Reed. [*ZooKeeper: Distributed Process Coordination*](https://www.oreilly.com/library/view/zookeeper/9781449361297/). O’Reilly Media, 2013. ISBN: 978-1-449-36130-3 
[^86]: Enis Söztutar. [HBase and HDFS: Understanding Filesystem Usage in HBase](https://www.slideshare.net/slideshow/hbase-and-hdfs-understanding-filesystem-usage/22990858). At *HBaseCon*, June 2013. Archived at [perma.cc/4DXR-9P88](https://perma.cc/4DXR-9P88) 
[^87]: SUSE LLC. [SUSE Linux Enterprise High Availability 15 SP6 Administration Guide, Section 12: Fencing and STONITH](https://documentation.suse.com/sle-ha/15-SP6/html/SLE-HA-all/cha-ha-fencing.html). *documentation.suse.com*, March 2025. Archived at [perma.cc/8LAR-EL9D](https://perma.cc/8LAR-EL9D) 
[^88]: Mike Burrows. [The Chubby Lock Service for Loosely-Coupled Distributed Systems](https://research.google/pubs/pub27897/). At *7th USENIX Symposium on Operating Systems Design and Implementation* (OSDI), November 2006.
[^89]: Kyle Kingsbury. [etcd 3.4.3](https://jepsen.io/analyses/etcd-3.4.3). *jepsen.io*, January 2020. Archived at [perma.cc/2P3Y-MPWU](https://perma.cc/2P3Y-MPWU) 
[^90]: Ensar Basri Kahveci. [Distributed Locks are Dead; Long Live Distributed Locks!](https://hazelcast.com/blog/long-live-distributed-locks/) *hazelcast.com*, April 2019. Archived at [perma.cc/7FS5-LDXE](https://perma.cc/7FS5-LDXE) 
[^91]: Martin Kleppmann. [How to do distributed locking](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html). *martin.kleppmann.com*, February 2016. Archived at [perma.cc/Y24W-YQ5L](https://perma.cc/Y24W-YQ5L) 
[^92]: Salvatore Sanfilippo. [Is Redlock safe?](https://antirez.com/news/101) *antirez.com*, February 2016. Archived at [perma.cc/B6GA-9Q6A](https://perma.cc/B6GA-9Q6A) 
[^93]: Gunnar Morling. [Leader Election With S3 Conditional Writes](https://www.morling.dev/blog/leader-election-with-s3-conditional-writes/). *www.morling.dev*, August 2024. Archived at [perma.cc/7V2N-J78Y](https://perma.cc/7V2N-J78Y) 
[^94]: Leslie Lamport, Robert Shostak, and Marshall Pease. [The Byzantine Generals Problem](https://www.microsoft.com/en-us/research/publication/byzantine-generals-problem/). *ACM Transactions on Programming Languages and Systems* (TOPLAS), volume 4, issue 3, pages 382–401, July 1982. [doi:10.1145/357172.357176](https://doi.org/10.1145/357172.357176) 
[^95]: Jim N. Gray. [Notes on Data Base Operating Systems](https://jimgray.azurewebsites.net/papers/dbos.pdf). In *Operating Systems: An Advanced Course*, Lecture Notes in Computer Science, volume 60, edited by R. Bayer, R. M. Graham, and G. Seegmüller, pages 393–481, Springer-Verlag, 1978. ISBN: 978-3-540-08755-7. Archived at [perma.cc/7S9M-2LZU](https://perma.cc/7S9M-2LZU)
[^96]: Brian Palmer. [How Complicated Was the Byzantine Empire?](https://slate.com/news-and-politics/2011/10/the-byzantine-tax-code-how-complicated-was-byzantium-anyway.html) *slate.com*, October 2011. Archived at [perma.cc/AN7X-FL3N](https://perma.cc/AN7X-FL3N) 
[^97]: Leslie Lamport. [My Writings](https://lamport.azurewebsites.net/pubs/pubs.html). *lamport.azurewebsites.net*, December 2014. Archived at [perma.cc/5NNM-SQGR](https://perma.cc/5NNM-SQGR) 
[^98]: John Rushby. [Bus Architectures for Safety-Critical Embedded Systems](https://www.csl.sri.com/papers/emsoft01/emsoft01.pdf). At *1st International Workshop on Embedded Software* (EMSOFT), October 2001. [doi:10.1007/3-540-45449-7\_22](https://doi.org/10.1007/3-540-45449-7_22) 
[^99]: Jake Edge. [ELC: SpaceX Lessons Learned](https://lwn.net/Articles/540368/). *lwn.net*, March 2013. Archived at [perma.cc/AYX8-QP5X](https://perma.cc/AYX8-QP5X) 
[^100]: Shehar Bano, Alberto Sonnino, Mustafa Al-Bassam, Sarah Azouvi, Patrick McCorry, Sarah Meiklejohn, and George Danezis. [SoK: Consensus in the Age of Blockchains](https://smeiklej.com/files/aft19a.pdf). At *1st ACM Conference on Advances in Financial Technologies* (AFT), October 2019. [doi:10.1145/3318041.3355458](https://doi.org/10.1145/3318041.3355458) 
[^101]: Ezra Feilden, Adi Oltean, and Philip Johnston. [Why we should train AI in space](https://www.starcloud.com/wp). White Paper, *starcloud.com*, September 2024. Archived at [perma.cc/7Y3S-8UB6](https://perma.cc/7Y3S-8UB6) 
[^102]: James Mickens. [The Saddest Moment](https://www.usenix.org/system/files/login-logout_1305_mickens.pdf). *USENIX ;login*, May 2013. Archived at [perma.cc/T7BZ-XCFR](https://perma.cc/T7BZ-XCFR) 
[^103]: Martin Kleppmann and Heidi Howard. [Byzantine Eventual Consistency and the Fundamental Limits of Peer-to-Peer Databases](https://arxiv.org/abs/2012.00472). *arxiv.org*, December 2020. [doi:10.48550/arXiv.2012.00472](https://doi.org/10.48550/arXiv.2012.00472) 
[^104]: Martin Kleppmann. [Making CRDTs Byzantine Fault Tolerant](https://martin.kleppmann.com/papers/bft-crdt-papoc22.pdf). At *9th Workshop on Principles and Practice of Consistency for Distributed Data* (PaPoC), April 2022. [doi:10.1145/3517209.3524042](https://doi.org/10.1145/3517209.3524042) 
[^105]: Evan Gilman. [The Discovery of Apache ZooKeeper’s Poison Packet](https://www.pagerduty.com/blog/the-discovery-of-apache-zookeepers-poison-packet/). *pagerduty.com*, May 2015. Archived at [perma.cc/RV6L-Y5CQ](https://perma.cc/RV6L-Y5CQ) 
[^106]: Jonathan Stone and Craig Partridge. [When the CRC and TCP Checksum Disagree](https://conferences2.sigcomm.org/sigcomm/2000/conf/paper/sigcomm2000-9-1.pdf). At *ACM Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication* (SIGCOMM), August 2000. [doi:10.1145/347059.347561](https://doi.org/10.1145/347059.347561) 
[^107]: Evan Jones. [How Both TCP and Ethernet Checksums Fail](https://www.evanjones.ca/tcp-and-ethernet-checksums-fail.html). *evanjones.ca*, October 2015. Archived at [perma.cc/9T5V-B8X5](https://perma.cc/9T5V-B8X5) 
[^108]: Cynthia Dwork, Nancy Lynch, and Larry Stockmeyer. [Consensus in the Presence of Partial Synchrony](https://groups.csail.mit.edu/tds/papers/Lynch/jacm88.pdf). *Journal of the ACM*, volume 35, issue 2, pages 288–323, April 1988. [doi:10.1145/42282.42283](https://doi.org/10.1145/42282.42283) 
[^109]: Richard D. Schlichting and Fred B. Schneider. [Fail-stop processors: an approach to designing fault-tolerant computing systems](https://www.cs.cornell.edu/fbs/publications/Fail_Stop.pdf). *ACM Transactions on Computer Systems* (TOCS), volume 1, issue 3, pages 222–238, August 1983. [doi:10.1145/357369.357371](https://doi.org/10.1145/357369.357371) 
[^110]: Thanh Do, Mingzhe Hao, Tanakorn Leesatapornwongsa, Tiratat Patana-anake, and Haryadi S. Gunawi. [Limplock: Understanding the Impact of Limpware on Scale-out Cloud Systems](https://ucare.cs.uchicago.edu/pdf/socc13-limplock.pdf). At *4th ACM Symposium on Cloud Computing* (SoCC), October 2013. [doi:10.1145/2523616.2523627](https://doi.org/10.1145/2523616.2523627) 
[^111]: Josh Snyder and Joseph Lynch. [Garbage collecting unhealthy JVMs, a proactive approach](https://netflixtechblog.medium.com/introducing-jvmquake-ec944c60ba70). Netflix Technology Blog, *netflixtechblog.medium.com*, November 2019. Archived at [perma.cc/8BTA-N3YB](https://perma.cc/8BTA-N3YB) 
[^112]: Haryadi S. Gunawi, Riza O. Suminto, Russell Sears, Casey Golliher, Swaminathan Sundararaman, Xing Lin, Tim Emami, Weiguang Sheng, Nematollah Bidokhti, Caitie McCaffrey, Gary Grider, Parks M. Fields, Kevin Harms, Robert B. Ross, Andree Jacobson, Robert Ricci, Kirk Webb, Peter Alvaro, H. Birali Runesha, Mingzhe Hao, and Huaicheng Li. [Fail-Slow at Scale: Evidence of Hardware Performance Faults in Large Production Systems](https://www.usenix.org/system/files/conference/fast18/fast18-gunawi.pdf). At *16th USENIX Conference on File and Storage Technologies*, February 2018. 
[^113]: Peng Huang, Chuanxiong Guo, Lidong Zhou, Jacob R. Lorch, Yingnong Dang, Murali Chintalapati, and Randolph Yao. [Gray Failure: The Achilles’ Heel of Cloud-Scale Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/paper-1.pdf). At *16th Workshop on Hot Topics in Operating Systems* (HotOS), May 2017. [doi:10.1145/3102980.3103005](https://doi.org/10.1145/3102980.3103005) 
[^114]: Chang Lou, Peng Huang, and Scott Smith. [Understanding, Detecting and Localizing Partial Failures in Large System Software](https://www.usenix.org/conference/nsdi20/presentation/lou). At *17th USENIX Symposium on Networked Systems Design and Implementation* (NSDI), February 2020. 
[^115]: Peter Bailis and Ali Ghodsi. [Eventual Consistency Today: Limitations, Extensions, and Beyond](https://queue.acm.org/detail.cfm?id=2462076). *ACM Queue*, volume 11, issue 3, pages 55–63, March 2013. [doi:10.1145/2460276.2462076](https://doi.org/10.1145/2460276.2462076)
[^116]: Bowen Alpern and Fred B. Schneider. [Defining Liveness](https://www.cs.cornell.edu/fbs/publications/DefLiveness.pdf). *Information Processing Letters*, volume 21, issue 4, pages 181–185, October 1985. [doi:10.1016/0020-0190(85)90056-0](https://doi.org/10.1016/0020-0190%2885%2990056-0) 
[^117]: Flavio P. Junqueira. [Dude, Where’s My Metadata?](https://fpj.me/2015/05/28/dude-wheres-my-metadata/) *fpj.me*, May 2015. Archived at [perma.cc/D2EU-Y9S5](https://perma.cc/D2EU-Y9S5) 
[^118]: Scott Sanders. [January 28th Incident Report](https://github.com/blog/2106-january-28th-incident-report). *github.com*, February 2016. Archived at [perma.cc/5GZR-88TV](https://perma.cc/5GZR-88TV) 
[^119]: Jay Kreps. [A Few Notes on Kafka and Jepsen](https://blog.empathybox.com/post/62279088548/a-few-notes-on-kafka-and-jepsen). *blog.empathybox.com*, September 2013. Archived at [perma.cc/XJ5C-F583](https://perma.cc/XJ5C-F583)
[^120]: Marc Brooker and Ankush Desai. [Systems Correctness Practices at AWS](https://dl.acm.org/doi/pdf/10.1145/3712057). *ACM Queue*, volume 22, issue 6, November/December 2024. [doi:10.1145/3712057](https://doi.org/10.1145/3712057)
[^121]: Andrey Satarin. [Testing Distributed Systems: Curated list of resources on testing distributed systems](https://asatarin.github.io/testing-distributed-systems/). *asatarin.github.io*. Archived at [perma.cc/U5V8-XP24](https://perma.cc/U5V8-XP24) 
[^122]: Jack Vanlightly. [Verifying Kafka transactions - Diary entry 2 - Writing an initial TLA+ spec](https://jack-vanlightly.com/analyses/2024/12/3/verifying-kafka-transactions-diary-entry-2-writing-an-initial-tla-spec). *jack-vanlightly.com*, December 2024. Archived at [perma.cc/NSQ8-MQ5N](https://perma.cc/NSQ8-MQ5N) 
[^123]: Siddon Tang. [From Chaos to Order — Tools and Techniques for Testing TiDB, A Distributed NewSQL Database](https://www.pingcap.com/blog/chaos-practice-in-tidb/). *pingcap.com*, April 2018. Archived at [perma.cc/5EJB-R29F](https://perma.cc/5EJB-R29F) 
[^124]: Nathan VanBenschoten. [Parallel Commits: An atomic commit protocol for globally distributed transactions](https://www.cockroachlabs.com/blog/parallel-commits/). *cockroachlabs.com*, November 2019. Archived at [perma.cc/5FZ7-QK6J](https://perma.cc/5FZ7-QK6J)
[^125]: Jack Vanlightly. [Paper: VR Revisited - State Transfer (part 3)](https://jack-vanlightly.com/analyses/2022/12/28/paper-vr-revisited-state-transfer-part-3). *jack-vanlightly.com*, December 2022. Archived at [perma.cc/KNK3-K6WS](https://perma.cc/KNK3-K6WS) 
[^126]: Hillel Wayne. [What if the spec doesn’t match the code?](https://buttondown.com/hillelwayne/archive/what-if-the-spec-doesnt-match-the-code/) *buttondown.com*, March 2024. Archived at [perma.cc/8HEZ-KHER](https://perma.cc/8HEZ-KHER) 
[^127]: Lingzhi Ouyang, Xudong Sun, Ruize Tang, Yu Huang, Madhav Jivrajani, Xiaoxing Ma, and Tianyin Xu. [Multi-Grained Specifications for Distributed System Model Checking and Verification](https://arxiv.org/abs/2409.14301). At *20th European Conference on Computer Systems* (EuroSys), March 2025. [doi:10.1145/3689031.3696069](https://doi.org/10.1145/3689031.3696069)
[^128]: Yury Izrailevsky and Ariel Tseitlin. [The Netflix Simian Army](https://netflixtechblog.com/the-netflix-simian-army-16e57fbab116). *netflixtechblog.com*, July 2011. Archived at [perma.cc/M3NY-FJW6](https://perma.cc/M3NY-FJW6)
[^129]: Kyle Kingsbury. [Jepsen: On the perils of network partitions](https://aphyr.com/posts/281-jepsen-on-the-perils-of-network-partitions). *aphyr.com*, May 2013. Archived at [perma.cc/W98G-6HQP](https://perma.cc/W98G-6HQP)
[^130]: Kyle Kingsbury. [Jepsen Analyses](https://jepsen.io/analyses). *jepsen.io*, 2024. Archived at [perma.cc/8LDN-D2T8](https://perma.cc/8LDN-D2T8) 
[^131]: Rupak Majumdar and Filip Niksic. [Why is random testing effective for partition tolerance bugs?](https://dl.acm.org/doi/pdf/10.1145/3158134) *Proceedings of the ACM on Programming Languages* (PACMPL), volume 2, issue POPL, article no. 46, December 2017. [doi:10.1145/3158134](https://doi.org/10.1145/3158134) 
[^132]: FoundationDB project authors. [Simulation and Testing](https://apple.github.io/foundationdb/testing.html). *apple.github.io*. Archived at [perma.cc/NQ3L-PM4C](https://perma.cc/NQ3L-PM4C) 
[^133]: Alex Kladov. [Simulation Testing For Liveness](https://tigerbeetle.com/blog/2023-07-06-simulation-testing-for-liveness/). *tigerbeetle.com*, July 2023. Archived at [perma.cc/RKD4-HGCR](https://perma.cc/RKD4-HGCR) 
[^134]: Alfonso Subiotto Marqués. [(Mostly) Deterministic Simulation Testing in Go](https://www.polarsignals.com/blog/posts/2024/05/28/mostly-dst-in-go). *polarsignals.com*, May 2024. Archived at [perma.cc/ULD6-TSA4](https://perma.cc/ULD6-TSA4) 


================================================
FILE: content/zh/colophon.md
================================================
---
title: 后记
weight: 600
breadcrumbs: false
---

{{< callout type="warning" >}}
当前页面来自本书第一版，第二版尚不可用
{{< /callout >}}

## 关于作者

**Martin Kleppmann** 是英国剑桥大学副教授，教授分布式系统与密码学协议。2017 年出版的《设计数据密集型应用》第一版确立了他在数据系统领域的权威地位；他在分布式系统方面的研究也推动了 local-first 软件运动。此前他曾在 LinkedIn、Rapportive 等互联网公司担任软件工程师和创业者，负责大规模数据基础设施。

**Chris Riccomini** 是软件工程师、创业投资人和作者，拥有 15 年以上在 PayPal、LinkedIn、WePay 的工作经验。他运营 Materialized View Capital，专注于基础设施初创企业投资；同时也是 Apache Samza 与 SlateDB 的共同创造者，并合著了 *The Missing README: A Guide for the New Software Engineer*。

![《设计数据密集型应用》海报](https://martin.kleppmann.com/2017/03/ddia-poster.jpg)

## 关于译者

[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
PostgreSQL 发行版 [**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，数据库/云计算 KOL。


## 后记

《设计数据密集型应用》封面上的动物是 **印度野猪（Sus scrofa cristatus）**，它是在印度、缅甸、尼泊尔、斯里兰卡和泰国发现的一种野猪的亚种。与欧洲野猪不同，它们有更高的背部鬃毛，没有体表绒毛，以及更大更直的头骨。

印度野猪长着灰色或黑色的鬃毛，脊背上有短而硬的毛。雄性有突出的犬齿（称为獠牙，tushes），用来与对手战斗或抵御掠食者。雄性比雌性大，该物种平均肩高 33-35 英寸，体重 200-300 磅。它们的天敌包括熊、老虎和各种大型猫科动物。

这些动物夜行且杂食 —— 它们吃各种各样的东西，包括根、昆虫、腐肉、坚果、浆果和小动物。野猪还常常翻拱垃圾堆和农田，造成大量的破坏，因而被农民所敌视。它们每天需要摄入 4,000 ~ 4,500 卡路里的能量。野猪有发达的嗅觉，这有助于它们寻找地下的植物和穴居动物。然而，它们的视力很差。

野猪在人类文化中一直具有重要意义。在印度教传说中，野猪是毗湿奴神的化身。在古希腊的丧葬纪念碑中，它是一个勇敢失败者的象征（与胜利的狮子相反）。由于它的攻击性，它被描绘在斯堪的纳维亚、日耳曼和盎格鲁撒克逊战士的盔甲和武器上。在中国十二生肖中，它象征着决心和急躁。

O'Reilly 封面上的许多动物都濒临灭绝，而它们对这个世界都很重要。要了解有关如何提供帮助的更多信息，请访问 animals.oreilly.com。

封面图片来自 Shaw's Zoology。封面字体是 URW Typewriter 和 Guardian Sans。文字字体是 Adobe Minion Pro；图中的字体是 Adobe Myriad Pro；标题字体是 Adobe Myriad Condensed；代码字体是 Dalton Maag 的 Ubuntu Mono。


================================================
FILE: content/zh/contrib.md
================================================
---
title: 贡献者
weight: 800
breadcrumbs: false
---

## 译者

[**冯若航**](https://vonng.com)，网名 [@Vonng](https://github.com/Vonng)。
PostgreSQL 专家，数据库老司机，云计算泥石流。
[**Pigsty**](https://pgsty.com) 作者与创始人。
架构师，DBA，全栈工程师 @ TanTan，Alibaba，Apple。
独立开源贡献者，[GitStar Ranking 585](https://gitstar-ranking.com/Vonng)，[国区活跃 Top20](https://committers.top/china)。
[DDIA](https://ddia.pigsty.io) / [PG Internal](https://pgint.vonng.com) 中文版译者，公众号：《老冯云数》，数据库 KOL。

## 校订与维护

YinGang [@yingang](https://github.com/yingang) 对本书进行了全文校订，并持续维护。

## 繁体中文版本

[繁體中文](/tw) **版本维护** by [@afunTW](https://github.com/afunTW)

## 贡献列表

[GitHub 贡献者列表](https://github.com/Vonng/ddia/graphs/contributors)

0. 全文校订 by [@yingang](https://github.com/Vonng/ddia/commits?author=yingang)
1. [序言初翻修正](https://github.com/Vonng/ddia/commit/afb5edab55c62ed23474149f229677e3b42dfc2c) by [@seagullbird](https://github.com/Vonng/ddia/commits?author=seagullbird)
2. [第一章语法标点校正](https://github.com/Vonng/ddia/commit/973b12cd8f8fcdf4852f1eb1649ddd9d187e3644) by [@nevertiree](https://github.com/Vonng/ddia/commits?author=nevertiree)
3. [第六章部分校正](https://github.com/Vonng/ddia/commit/d4eb0852c0ec1e93c8aacc496c80b915bb1e6d48) 与[第十章的初翻](https://github.com/Vonng/ddia/commit/9de8dbd1bfe6fbb03b3bf6c1a1aa2291aed2490e) by [@MuAlex](https://github.com/Vonng/ddia/commits?author=MuAlex)
4. [第一部分](/part-i)前言，[ch2](/ch2)校正 by [@jiajiadebug](https://github.com/Vonng/ddia/commits?author=jiajiadebug)
5. [词汇表](/glossary)、[后记](/colophon)关于野猪的部分 by [@Chowss](https://github.com/Vonng/ddia/commits?author=Chowss)
6. [繁體中文](https://github.com/Vonng/ddia/pulls)版本与转换脚本 by [@afunTW](https://github.com/afunTW)
7. 多处翻译修正 by [@songzhibin97](https://github.com/Vonng/ddia/commits?author=songzhibin97) [@MamaShip](https://github.com/Vonng/ddia/commits?author=MamaShip) [@FangYuan33](https://github.com/Vonng/ddia/commits?author=FangYuan33)


感谢所有提出意见，作出贡献的朋友们，您可以在这里找到所有贡献的 [Issue 列表](https://github.com/Vonng/ddia/issues) 与 [PR 列表](https://github.com/Vonng/ddia/pulls)：

| ISSUE & Pull Requests                           | USER                                                       | Title                                                          |
|-------------------------------------------------|------------------------------------------------------------|----------------------------------------------------------------|
| [359](https://github.com/Vonng/ddia/pull/359)   | [@c25423](https://github.com/c25423)                       | ch10: 修正一处拼写错误                                                 |
| [358](https://github.com/Vonng/ddia/pull/358)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch4: 修正一处拼写错误                                                  |
| [356](https://github.com/Vonng/ddia/pull/356)   | [@lewiszlw](https://github.com/lewiszlw)                   | ch2: 修正一处标点错误                                                  |
| [355](https://github.com/Vonng/ddia/pull/355)   | [@DuroyGeorge](https://github.com/DuroyGeorge)             | ch12: 修正一处格式错误                                                 |
| [354](https://github.com/Vonng/ddia/pull/354)   | [@justlorain](https://github.com/justlorain)               | ch7: 修正一处参考链接                                                  |
| [353](https://github.com/Vonng/ddia/pull/353)   | [@fantasyczl](https://github.com/fantasyczl)               | ch3&9: 修正两处引用错误                                                |
| [352](https://github.com/Vonng/ddia/pull/352)   | [@fantasyczl](https://github.com/fantasyczl)               | 支持输出为 EPUB 格式                                                  |
| [349](https://github.com/Vonng/ddia/pull/349)   | [@xiyihan0](https://github.com/xiyihan0)                   | ch1: 修正一处格式错误                                                  |
| [348](https://github.com/Vonng/ddia/pull/348)   | [@omegaatt36](https://github.com/omegaatt36)               | ch3: 修正一处图像链接                                                  |
| [346](https://github.com/Vonng/ddia/issues/346) | [@Vermouth1995](https://github.com/Vermouth1995)           | ch1: 优化一处翻译                                                    |
| [343](https://github.com/Vonng/ddia/pull/343)   | [@kehao-chen](https://github.com/kehao-chen)               | ch10: 优化一处翻译                                                   |
| [341](https://github.com/Vonng/ddia/pull/341)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch3: 优化两处翻译                                                    |
| [340](https://github.com/Vonng/ddia/pull/340)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch2: 优化多处翻译                                                    |
| [338](https://github.com/Vonng/ddia/pull/338)   | [@YKIsTheBest](https://github.com/YKIsTheBest)             | ch1: 优化一处翻译                                                    |
| [335](https://github.com/Vonng/ddia/pull/335)   | [@kimi0230](https://github.com/kimi0230)                   | 修正一处繁体中文错误                                                     |
| [334](https://github.com/Vonng/ddia/pull/334)   | [@soulrrrrr](https://github.com/soulrrrrr)                 | ch2: 修正一处繁体中文错误                                                |
| [332](https://github.com/Vonng/ddia/pull/332)   | [@justlorain](https://github.com/justlorain)               | ch5: 修正一处翻译错误                                                  |
| [331](https://github.com/Vonng/ddia/pull/331)   | [@Lyianu](https://github.com/Lyianu)                       | ch9: 更正几处拼写错误                                                  |
| [330](https://github.com/Vonng/ddia/pull/330)   | [@Lyianu](https://github.com/Lyianu)                       | ch7: 优化一处翻译                                                    |
| [329](https://github.com/Vonng/ddia/issues/329) | [@Lyianu](https://github.com/Lyianu)                       | ch6: 指出一处翻译错误                                                  |
| [328](https://github.com/Vonng/ddia/pull/328)   | [@justlorain](https://github.com/justlorain)               | ch4: 更正一处翻译遗漏                                                  |
| [326](https://github.com/Vonng/ddia/pull/326)   | [@liangGTY](https://github.com/liangGTY)                   | ch1: 优化一处翻译                                                    |
| [323](https://github.com/Vonng/ddia/pull/323)   | [@marvin263](https://github.com/marvin263)                 | ch5: 优化一处翻译                                                    |
| [322](https://github.com/Vonng/ddia/pull/322)   | [@marvin263](https://github.com/marvin263)                 | ch8: 优化一处翻译                                                    |
| [304](https://github.com/Vonng/ddia/pull/304)   | [@spike014](https://github.com/spike014)                   | ch11: 优化一处翻译                                                   |
| [298](https://github.com/Vonng/ddia/pull/298)   | [@Makonike](https://github.com/Makonike)                   | ch11&12: 修正两处错误                                                |
| [284](https://github.com/Vonng/ddia/pull/284)   | [@WAangzE](https://github.com/WAangzE)                     | ch4: 更正一处列表错误                                                  |
| [283](https://github.com/Vonng/ddia/pull/283)   | [@WAangzE](https://github.com/WAangzE)                     | ch3: 更正一处错别字                                                   |
| [282](https://github.com/Vonng/ddia/pull/282)   | [@WAangzE](https://github.com/WAangzE)                     | ch2: 更正一处公式问题                                                  |
| [281](https://github.com/Vonng/ddia/pull/281)   | [@lyuxi99](https://github.com/lyuxi99)                     | 更正多处内部链接错误                                                     |
| [280](https://github.com/Vonng/ddia/pull/280)   | [@lyuxi99](https://github.com/lyuxi99)                     | ch9: 更正内部链接错误                                                  |
| [279](https://github.com/Vonng/ddia/issues/279) | [@codexvn](https://github.com/codexvn)                     | ch9: 指出公式在 GitHub Pages 显示的问题                                  |
| [278](https://github.com/Vonng/ddia/pull/278)   | [@LJlkdskdjflsa](https://github.com/LJlkdskdjflsa)         | 发现了繁体中文版本中的错误翻译                                                |
| [275](https://github.com/Vonng/ddia/pull/275)   | [@117503445](https://github.com/117503445)                 | 更正 LICENSE 链接                                                  |
| [274](https://github.com/Vonng/ddia/pull/274)   | [@uncle-lv](https://github.com/uncle-lv)                   | ch7: 修正错别字                                                     |
| [273](https://github.com/Vonng/ddia/pull/273)   | [@Sdot-Python](https://github.com/Sdot-Python)             | ch7: 统一了 write skew 的翻译                                        |
| [271](https://github.com/Vonng/ddia/pull/271)   | [@Makonike](https://github.com/Makonike)                   | ch6: 统一了 rebalancing 的翻译                                       |
| [270](https://github.com/Vonng/ddia/pull/270)   | [@Ynjxsjmh](https://github.com/Ynjxsjmh)                   | ch7: 修正不一致的翻译                                                  |
| [263](https://github.com/Vonng/ddia/pull/263)   | [@zydmayday](https://github.com/zydmayday)                 | ch5: 修正译文中的重复单词                                                |
| [260](https://github.com/Vonng/ddia/pull/260)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch4: 修正部分不准确的翻译                                                |
| [258](https://github.com/Vonng/ddia/pull/258)   | [@bestgrc](https://github.com/bestgrc)                     | ch3: 修正一处翻译错误                                                  |
| [257](https://github.com/Vonng/ddia/pull/257)   | [@UnderSam](https://github.com/UnderSam)                   | ch8: 修正一处拼写错误                                                  |
| [256](https://github.com/Vonng/ddia/pull/256)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可串行化”相关内容的多处翻译不当                                       |
| [255](https://github.com/Vonng/ddia/pull/255)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“可重复读”相关内容的多处翻译不当                                       |
| [253](https://github.com/Vonng/ddia/pull/253)   | [@AlphaWang](https://github.com/AlphaWang)                 | ch7: 修正“读已提交”相关内容的多处翻译不当                                       |
| [246](https://github.com/Vonng/ddia/pull/246)   | [@derekwu0101](https://github.com/derekwu0101)             | ch3: 修正繁体中文的转译错误                                               |
| [245](https://github.com/Vonng/ddia/pull/245)   | [@skyran1278](https://github.com/skyran1278)               | ch12: 修正繁体中文的转译错误                                              |
| [244](https://github.com/Vonng/ddia/pull/244)   | [@Axlgrep](https://github.com/Axlgrep)                     | ch9: 修正不通顺的翻译                                                  |
| [242](https://github.com/Vonng/ddia/pull/242)   | [@lynkeib](https://github.com/lynkeib)                     | ch9: 修正不通顺的翻译                                                  |
| [241](https://github.com/Vonng/ddia/pull/241)   | [@lynkeib](https://github.com/lynkeib)                     | ch8: 修正不正确的公式格式                                                |
| [240](https://github.com/Vonng/ddia/pull/240)   | [@8da2k](https://github.com/8da2k)                         | ch9: 修正不通顺的翻译                                                  |
| [239](https://github.com/Vonng/ddia/pull/239)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch7: 修正不一致的翻译                                                  |
| [237](https://github.com/Vonng/ddia/pull/237)   | [@zhangnew](https://github.com/zhangnew)                   | ch3: 修正错误的图片链接                                                 |
| [229](https://github.com/Vonng/ddia/pull/229)   | [@lis186](https://github.com/lis186)                       | 指出繁体中文的转译错误：复杂                                                 |
| [226](https://github.com/Vonng/ddia/pull/226)   | [@chroming](https://github.com/chroming)                   | ch1: 修正导航栏中的章节名称                                               |
| [220](https://github.com/Vonng/ddia/pull/220)   | [@skyran1278](https://github.com/skyran1278)               | ch9: 修正线性一致的繁体中文翻译                                             |
| [194](https://github.com/Vonng/ddia/pull/194)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正错误的翻译                                                   |
| [193](https://github.com/Vonng/ddia/pull/193)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 优化译文                                                      |
| [192](https://github.com/Vonng/ddia/pull/192)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | ch4: 修正不一致和不通顺的翻译                                              |
| [190](https://github.com/Vonng/ddia/pull/190)   | [@Pcrab](https://github.com/Pcrab)                         | ch1: 修正不准确的翻译                                                  |
| [187](https://github.com/Vonng/ddia/pull/187)   | [@narojay](https://github.com/narojay)                     | ch9: 修正生硬的翻译                                                   |
| [186](https://github.com/Vonng/ddia/pull/186)   | [@narojay](https://github.com/narojay)                     | ch8: 修正错别字                                                     |
| [185](https://github.com/Vonng/ddia/issues/185) | [@8da2k](https://github.com/8da2k)                         | 指出小标题跳转的问题                                                     |
| [184](https://github.com/Vonng/ddia/pull/184)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch10: 修正失效的网址                                                  |
| [183](https://github.com/Vonng/ddia/pull/183)   | [@OneSizeFitsQuorum](https://github.com/OneSizeFitsQuorum) | ch8: 修正错别字                                                     |
| [182](https://github.com/Vonng/ddia/issues/182) | [@lroolle](https://github.com/lroolle)                     | 建议docsify的主题风格                                                 |
| [181](https://github.com/Vonng/ddia/pull/181)   | [@YunfengGao](https://github.com/YunfengGao)               | ch2: 修正翻译错误                                                    |
| [180](https://github.com/Vonng/ddia/pull/180)   | [@skyran1278](https://github.com/skyran1278)               | ch3: 指出繁体中文的转译错误                                               |
| [177](https://github.com/Vonng/ddia/pull/177)   | [@exzhawk](https://github.com/exzhawk)                     | 支持 Github Pages 里的公式显示                                         |
| [176](https://github.com/Vonng/ddia/pull/176)   | [@haifeiWu](https://github.com/haifeiWu)                   | ch2: 语义网相关翻译更正                                                 |
| [175](https://github.com/Vonng/ddia/pull/175)   | [@cwr31](https://github.com/cwr31)                         | ch7: 不变式相关翻译更正                                                 |
| [174](https://github.com/Vonng/ddia/pull/174)   | [@BeBraveBeCurious](https://github.com/BeBraveBeCurious)   | README & preface: 更正不正确的中文用词和标点符号                              |
| [173](https://github.com/Vonng/ddia/pull/173)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正不完整的翻译                                                 |
| [171](https://github.com/Vonng/ddia/pull/171)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 修正重复的译文                                                  |
| [169](https://github.com/Vonng/ddia/pull/169)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch12: 更正不太通顺的翻译                                                |
| [166](https://github.com/Vonng/ddia/pull/166)   | [@bp4m4h94](https://github.com/bp4m4h94)                   | ch1: 发现错误的文献索引                                                 |
| [164](https://github.com/Vonng/ddia/pull/164)   | [@DragonDriver](https://github.com/DragonDriver)           | preface: 更正错误的标点符号                                             |
| [163](https://github.com/Vonng/ddia/pull/163)   | [@llmmddCoder](https://github.com/llmmddCoder)             | ch1: 更正错误字                                                     |
| [160](https://github.com/Vonng/ddia/pull/160)   | [@Zhayhp](https://github.com/Zhayhp)                       | ch2: 建议将 network model 翻译为网状模型                                 |
| [159](https://github.com/Vonng/ddia/pull/159)   | [@1ess](https://github.com/1ess)                           | ch4: 更正错误字                                                     |
| [157](https://github.com/Vonng/ddia/pull/157)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [155](https://github.com/Vonng/ddia/pull/155)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 更正不太通顺的翻译                                                 |
| [153](https://github.com/Vonng/ddia/pull/153)   | [@DavidZhiXing](https://github.com/DavidZhiXing)           | ch9: 修正缩略图的错别字                                                 |
| [152](https://github.com/Vonng/ddia/pull/152)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch7: 除重->去重                                                    |
| [151](https://github.com/Vonng/ddia/pull/151)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 修订sibling相关的翻译                                            |
| [147](https://github.com/Vonng/ddia/pull/147)   | [@ZvanYang](https://github.com/ZvanYang)                   | ch5: 更正一处不准确的翻译                                                |
| [145](https://github.com/Vonng/ddia/pull/145)   | [@Hookey](https://github.com/Hookey)                       | 识别了当前简繁转译过程中处理不当的地方，暂通过转换脚本规避                                  |
| [144](https://github.com/Vonng/ddia/issues/144) | [@secret4233](https://github.com/secret4233)               | ch7: 不翻译`next-key locking`                                     |
| [143](https://github.com/Vonng/ddia/issues/143) | [@imcheney](https://github.com/imcheney)                   | ch3: 更新残留的机翻段落                                                 |
| [142](https://github.com/Vonng/ddia/issues/142) | [@XIJINIAN](https://github.com/XIJINIAN)                   | 建议去除段首的制表符                                                     |
| [141](https://github.com/Vonng/ddia/issues/141) | [@Flyraty](https://github.com/Flyraty)                     | ch5: 发现一处错误格式的章节引用                                             |
| [140](https://github.com/Vonng/ddia/pull/140)   | [@Bowser1704](https://github.com/Bowser1704)               | ch5: 修正章节Summary中多处不通顺的翻译                                      |
| [139](https://github.com/Vonng/ddia/pull/139)   | [@Bowser1704](https://github.com/Bowser1704)               | ch2&ch3: 修正多处不通顺的或错误的翻译                                        |
| [137](https://github.com/Vonng/ddia/pull/137)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch5&ch6: 优化多处不通顺的或错误的翻译                                        |
| [134](https://github.com/Vonng/ddia/pull/134)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch4: 优化多处不通顺的或错误的翻译                                            |
| [133](https://github.com/Vonng/ddia/pull/133)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化多处错误的或不通顺的翻译                                            |
| [132](https://github.com/Vonng/ddia/pull/132)   | [@fuxuemingzhu](https://github.com/fuxuemingzhu)           | ch3: 优化一处容易产生歧义的翻译                                             |
| [131](https://github.com/Vonng/ddia/pull/131)   | [@rwwg4](https://github.com/rwwg4)                         | ch6: 修正两处错误的翻译                                                 |
| [129](https://github.com/Vonng/ddia/pull/129)   | [@anaer](https://github.com/anaer)                         | ch4: 修正两处强调文本和四处代码变量名称                                         |
| [128](https://github.com/Vonng/ddia/pull/128)   | [@meilin96](https://github.com/meilin96)                   | ch5: 修正一处错误的引用                                                 |
| [126](https://github.com/Vonng/ddia/pull/126)   | [@cwr31](https://github.com/cwr31)                         | ch10: 修正一处错误的翻译（功能 -> 函数）                                      |
| [125](https://github.com/Vonng/ddia/pull/125)   | [@dch1228](https://github.com/dch1228)                     | ch2: 优化 how best 的翻译（如何以最佳方式）                                  |
| [123](https://github.com/Vonng/ddia/pull/123)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 9, TOC in readme, glossary, etc.) |
| [121](https://github.com/Vonng/ddia/pull/121)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 5 to chapter 8)                   |
| [120](https://github.com/Vonng/ddia/pull/120)   | [@jiong-han](https://github.com/jiong-han)                 | Typo fix: 呲之以鼻 -> 嗤之以鼻                                         |
| [119](https://github.com/Vonng/ddia/pull/119)   | [@cclauss](https://github.com/cclauss)                     | Streamline file operations in convert()                        |
| [118](https://github.com/Vonng/ddia/pull/118)   | [@yingang](https://github.com/yingang)                     | translation updates (chapter 2 to chapter 4)                   |
| [117](https://github.com/Vonng/ddia/pull/117)   | [@feeeei](https://github.com/feeeei)                       | 统一每章的标题格式                                                      |
| [115](https://github.com/Vonng/ddia/pull/115)   | [@NageNalock](https://github.com/NageNalock)               | 第七章病句修改: 重复词语                                                  |
| [114](https://github.com/Vonng/ddia/pull/114)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | Update README.md: correct the book name                        |
| [113](https://github.com/Vonng/ddia/pull/113)   | [@lpxxn](https://github.com/lpxxn)                         | 修改语句                                                           |
| [112](https://github.com/Vonng/ddia/pull/112)   | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [110](https://github.com/Vonng/ddia/pull/110)   | [@lpxxn](https://github.com/lpxxn)                         | 读已写入数据                                                         |
| [107](https://github.com/Vonng/ddia/pull/107)   | [@abbychau](https://github.com/abbychau)                   | 單調鐘和好死还是赖活着                                                    |
| [106](https://github.com/Vonng/ddia/pull/106)   | [@enochii](https://github.com/enochii)                     | typo in ch2: fix braces typo                                   |
| [105](https://github.com/Vonng/ddia/pull/105)   | [@LiminCode](https://github.com/LiminCode)                 | Chronicle translation error                                    |
| [104](https://github.com/Vonng/ddia/pull/104)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | several advice for better translation                          |
| [103](https://github.com/Vonng/ddia/pull/103)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in ch4: should be 完成 rather than 完全                       |
| [102](https://github.com/Vonng/ddia/pull/102)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | ch4: better-translation: 扼杀 → 破坏                               |
| [101](https://github.com/Vonng/ddia/pull/101)   | [@Sunt-ing](https://github.com/Sunt-ing)                   | typo in Ch4: should be "改变" rathr than "盖面"                    |
| [100](https://github.com/Vonng/ddia/pull/100)   | [@LiminCode](https://github.com/LiminCode)                 | fix missing translation                                        |
| [99 ](https://github.com/Vonng/ddia/pull/99)    | [@mrdrivingduck](https://github.com/mrdrivingduck)         | ch6: fix the word rebalancing                                  |
| [98 ](https://github.com/Vonng/ddia/pull/98)    | [@jacklightChen](https://github.com/jacklightChen)         | fix ch7.md: fix wrong references                               |
| [97 ](https://github.com/Vonng/ddia/pull/97)    | [@jenac](https://github.com/jenac)                         | 96                                                             |
| [96 ](https://github.com/Vonng/ddia/pull/96)    | [@PragmaTwice](https://github.com/PragmaTwice)             | ch2: fix typo about 'may or may not be'                        |
| [95 ](https://github.com/Vonng/ddia/pull/95)    | [@EvanMu96](https://github.com/EvanMu96)                   | fix translation of "the battle cry" in ch5                     |
| [94 ](https://github.com/Vonng/ddia/pull/94)    | [@kemingy](https://github.com/kemingy)                     | ch6: fix markdown and punctuations                             |
| [93 ](https://github.com/Vonng/ddia/pull/93)    | [@kemingy](https://github.com/kemingy)                     | ch5: fix markdown and some typos                               |
| [92 ](https://github.com/Vonng/ddia/pull/92)    | [@Gilbert1024](https://github.com/Gilbert1024)             | Merge pull request #1 from Vonng/master                        |
| [88 ](https://github.com/Vonng/ddia/pull/88)    | [@kemingy](https://github.com/kemingy)                     | fix typo for ch1, ch2, ch3, ch4                                |
| [87 ](https://github.com/Vonng/ddia/pull/87)    | [@wynn5a](https://github.com/wynn5a)                       | Update ch3.md                                                  |
| [86 ](https://github.com/Vonng/ddia/pull/86)    | [@northmorn](https://github.com/northmorn)                 | Update ch1.md                                                  |
| [85 ](https://github.com/Vonng/ddia/pull/85)    | [@sunbuhui](https://github.com/sunbuhui)                   | fix ch2.md: fix ch2 ambiguous translation                      |
| [84 ](https://github.com/Vonng/ddia/pull/84)    | [@ganler](https://github.com/ganler)                       | Fix translation: use up                                        |
| [83 ](https://github.com/Vonng/ddia/pull/83)    | [@afunTW](https://github.com/afunTW)                       | Using OpenCC to convert from zh-cn to zh-tw                    |
| [82 ](https://github.com/Vonng/ddia/pull/82)    | [@kangni](https://github.com/kangni)                       | fix gitbook url                                                |
| [78 ](https://github.com/Vonng/ddia/pull/78)    | [@hanyu2](https://github.com/hanyu2)                       | Fix unappropriated translation                                 |
| [77 ](https://github.com/Vonng/ddia/pull/77)    | [@Ozarklake](https://github.com/Ozarklake)                 | fix typo                                                       |
| [75 ](https://github.com/Vonng/ddia/pull/75)    | [@2997ms](https://github.com/2997ms)                       | Fix typo                                                       |
| [74 ](https://github.com/Vonng/ddia/pull/74)    | [@2997ms](https://github.com/2997ms)                       | Update ch9.md                                                  |
| [70 ](https://github.com/Vonng/ddia/pull/70)    | [@2997ms](https://github.com/2997ms)                       | Update ch7.md                                                  |
| [67 ](https://github.com/Vonng/ddia/pull/67)    | [@jiajiadebug](https://github.com/jiajiadebug)             | fix issues in ch2 - ch9 and glossary                           |
| [66 ](https://github.com/Vonng/ddia/pull/66)    | [@blindpirate](https://github.com/blindpirate)             | Fix typo                                                       |
| [63 ](https://github.com/Vonng/ddia/pull/63)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch10.md                                                 |
| [62 ](https://github.com/Vonng/ddia/pull/62)    | [@ych](https://github.com/ych)                             | fix ch1.md typesetting problem                                 |
| [61 ](https://github.com/Vonng/ddia/pull/61)    | [@xianlaioy](https://github.com/xianlaioy)                 | docs:钟-->种，去掉ou                                                |
| [60 ](https://github.com/Vonng/ddia/pull/60)    | [@Zombo1296](https://github.com/Zombo1296)                 | 否则 -> 或者                                                       |
| [59 ](https://github.com/Vonng/ddia/pull/59)    | [@AlexanderMisel](https://github.com/AlexanderMisel)       | 呼叫->调用，显着->显著                                                  |
| [58 ](https://github.com/Vonng/ddia/pull/58)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch8.md                                                  |
| [55 ](https://github.com/Vonng/ddia/pull/55)    | [@saintube](https://github.com/saintube)                   | ch8: 修改链接错误                                                    |
| [54 ](https://github.com/Vonng/ddia/pull/54)    | [@Panmax](https://github.com/Panmax)                       | Update ch2.md                                                  |
| [53 ](https://github.com/Vonng/ddia/pull/53)    | [@ibyte2011](https://github.com/ibyte2011)                 | Update ch9.md                                                  |
| [52 ](https://github.com/Vonng/ddia/pull/52)    | [@hecenjie](https://github.com/hecenjie)                   | Update ch1.md                                                  |
| [51 ](https://github.com/Vonng/ddia/pull/51)    | [@latavin243](https://github.com/latavin243)               | fix 修正ch3 ch4几处翻译                                              |
| [50 ](https://github.com/Vonng/ddia/pull/50)    | [@AlexZFX](https://github.com/AlexZFX)                     | 几个疏漏和格式错误                                                      |
| [49 ](https://github.com/Vonng/ddia/pull/49)    | [@haifeiWu](https://github.com/haifeiWu)                   | Update ch1.md                                                  |
| [48 ](https://github.com/Vonng/ddia/pull/48)    | [@scaugrated](https://github.com/scaugrated)               | fix typo                                                       |
| [47 ](https://github.com/Vonng/ddia/pull/47)    | [@lzwill](https://github.com/lzwill)                       | Fixed typos in ch2                                             |
| [45 ](https://github.com/Vonng/ddia/pull/45)    | [@zenuo](https://github.com/zenuo)                         | 删除一个多余的右括号                                                     |
| [44 ](https://github.com/Vonng/ddia/pull/44)    | [@akxxsb](https://github.com/akxxsb)                       | 修正第七章底部链接错误                                                    |
| [43 ](https://github.com/Vonng/ddia/pull/43)    | [@baijinping](https://github.com/baijinping)               | "更假简单"->"更加简单"                                                 |
| [42 ](https://github.com/Vonng/ddia/pull/42)    | [@tisonkun](https://github.com/tisonkun)                   | 修复 ch1 中的无序列表格式                                                |
| [38 ](https://github.com/Vonng/ddia/pull/38)    | [@renjie-c](https://github.com/renjie-c)                   | 纠正多处的翻译小错误                                                     |
| [37 ](https://github.com/Vonng/ddia/pull/37)    | [@tankilo](https://github.com/tankilo)                     | fix translation mistakes in ch4.md                             |
| [36 ](https://github.com/Vonng/ddia/pull/36)    | [@wwek](https://github.com/wwek)                           | 1.修复多个链接错误 2.名词优化修订 3.错误修订                                     |
| [35 ](https://github.com/Vonng/ddia/pull/35)    | [@wwek](https://github.com/wwek)                           | fix ch7.md  to ch8.md  link error                              |
| [34 ](https://github.com/Vonng/ddia/pull/34)    | [@wwek](https://github.com/wwek)                           | Merge pull request #1 from Vonng/master                        |
| [33 ](https://github.com/Vonng/ddia/pull/33)    | [@wwek](https://github.com/wwek)                           | fix part-ii.md link error                                      |
| [32 ](https://github.com/Vonng/ddia/pull/32)    | [@JCYoky](https://github.com/JCYoky)                       | Update ch2.md                                                  |
| [31 ](https://github.com/Vonng/ddia/pull/31)    | [@elsonLee](https://github.com/elsonLee)                   | Update ch7.md                                                  |
| [26 ](https://github.com/Vonng/ddia/pull/26)    | [@yjhmelody](https://github.com/yjhmelody)                 | 修复一些明显错误                                                       |
| [25 ](https://github.com/Vonng/ddia/pull/25)    | [@lqbilbo](https://github.com/lqbilbo)                     | 修复链接错误                                                         |
| [24 ](https://github.com/Vonng/ddia/pull/24)    | [@artiship](https://github.com/artiship)                   | 修改词语顺序                                                         |
| [23 ](https://github.com/Vonng/ddia/pull/23)    | [@artiship](https://github.com/artiship)                   | 修正错别字                                                          |
| [22 ](https://github.com/Vonng/ddia/pull/22)    | [@artiship](https://github.com/artiship)                   | 纠正翻译错误                                                         |
| [21 ](https://github.com/Vonng/ddia/pull/21)    | [@zhtisi](https://github.com/zhtisi)                       | 修正目录和本章标题不符的情况                                                 |
| [20 ](https://github.com/Vonng/ddia/pull/20)    | [@rentiansheng](https://github.com/rentiansheng)           | Update ch7.md                                                  |
| [19 ](https://github.com/Vonng/ddia/pull/19)    | [@LHRchina](https://github.com/LHRchina)                   | 修复语句小bug                                                       |
| [16 ](https://github.com/Vonng/ddia/pull/16)    | [@MuAlex](https://github.com/MuAlex)                       | Master                                                         |
| [15 ](https://github.com/Vonng/ddia/pull/15)    | [@cg-zhou](https://github.com/cg-zhou)                     | Update translation progress                                    |
| [14 ](https://github.com/Vonng/ddia/pull/14)    | [@cg-zhou](https://github.com/cg-zhou)                     | Translate glossary                                             |
| [13 ](https://github.com/Vonng/ddia/pull/13)    | [@cg-zhou](https://github.com/cg-zhou)                     | 详细修改了后记中和印度野猪相关的描述                                             |
| [12 ](https://github.com/Vonng/ddia/pull/12)    | [@ibyte2011](https://github.com/ibyte2011)                 | 修改了部分翻译                                                        |
| [11 ](https://github.com/Vonng/ddia/pull/11)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 100%                                                       |
| [10 ](https://github.com/Vonng/ddia/pull/10)    | [@jiajiadebug](https://github.com/jiajiadebug)             | ch2 20%                                                        |
| [9  ](https://github.com/Vonng/ddia/pull/9)     | [@jiajiadebug](https://github.com/jiajiadebug)             | Preface, ch1, part-i translation minor fixes                   |
| [7  ](https://github.com/Vonng/ddia/pull/7)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 translation pull request                                   |
| [6  ](https://github.com/Vonng/ddia/pull/6)     | [@MuAlex](https://github.com/MuAlex)                       | Ch6 change version1                                            |
| [5  ](https://github.com/Vonng/ddia/pull/5)     | [@nevertiree](https://github.com/nevertiree)               | Chapter 01语法微调                                                 |
| [2  ](https://github.com/Vonng/ddia/pull/2)     | [@seagullbird](https://github.com/seagullbird)             | 序言初翻                                                           |


================================================
FILE: content/zh/glossary.md
================================================
---
title: 术语表
weight: 500
breadcrumbs: false
---

> 请注意：本术语表的定义刻意保持简短，旨在传达核心概念，而非覆盖术语的全部细节。更多内容请参阅正文对应章节。

### 异步（asynchronous）

不等待某件事完成（例如通过网络把数据发送到另一个节点），且不假设它会在多长时间内完成。参见“[同步与异步复制](/ch6#sec_replication_sync_async)”、“[同步网络与异步网络](/ch9#sec_distributed_sync_networks)”和“[系统模型与现实](/ch9#sec_distributed_system_model)”。

### 原子（atomic）

1. 在并发语境下：指一个操作看起来在某个单一时刻生效，其他并发进程不会看到它处于“半完成”状态。另见 *isolation*。
2. 在事务语境下：指一组写入要么全部提交、要么全部回滚，即使发生故障也不例外。参见“[原子性](/ch8#sec_transactions_acid_atomicity)”和“[两阶段提交（2PC）](/ch8#sec_transactions_2pc)”。

### 背压（backpressure）

当接收方跟不上时，强制发送方降速。也称为 *flow control*。参见“[系统过载后无法恢复时会发生什么](/ch2#sidebar_metastable)”。

### 批处理（batch process）

以一个固定（通常较大）数据集为输入、产出另一份数据且不修改输入的计算。参见[第 11 章](/ch11#ch_batch)。

### 有界（bounded）

具有已知上限或大小。例如可用于描述网络延迟（参见“[超时与无界延迟](/ch9#sec_distributed_queueing)”）和数据集（参见[第 12 章](/ch12#ch_stream)导言）。

### 拜占庭故障（Byzantine fault）

节点以任意错误方式行为，例如向不同节点发送相互矛盾或恶意消息。参见“[拜占庭故障](/ch9#sec_distributed_byzantine)”。

### 缓存（cache）

通过记住近期访问数据来加速后续读取的组件。缓存通常不完整：若未命中，需要回源到更慢但完整的底层数据存储。

### CAP 定理（CAP theorem）

一个在实践中经常被误解、且不太有直接指导价值的理论结果。参见“[CAP 定理](/ch10#the-cap-theorem)”。

### 因果关系（causality）

当一件事“先于”另一件事发生时产生的事件依赖关系。例如后续事件对先前事件的响应、建立在先前事件之上，或必须结合先前事件理解。参见“[happens-before 关系与并发](/ch6#sec_replication_happens_before)”。

### 共识（consensus）

分布式计算中的基本问题：让多个节点就某件事达成一致（例如谁是主节点）。这比直觉上要困难得多。参见“[共识](/ch10#sec_consistency_consensus)”。

### 数据仓库（data warehouse）

将多个 OLTP 系统的数据汇总并整理后，用于分析场景的数据库。参见“[数据仓库](/ch1#sec_introduction_dwh)”。

### 声明式（declarative）

描述“想要什么性质”，而非“如何一步步实现”。在数据库查询中，优化器接收声明式查询并决定最佳执行方式。参见“[术语：声明式查询语言](/ch3)”。

### 反规范化（denormalize）

在已规范化数据集中引入一定冗余（常见形式为缓存或索引）以换取更快读取。反规范化值可看作预计算结果，类似物化视图。参见“[规范化、反规范化与连接](/ch3#sec_datamodels_normalization)”。

### 派生数据（derived data）

通过可重复流程由其他数据生成的数据集，必要时可重新计算。通常用于加速某类读取。索引、缓存、物化视图都属于派生数据。参见“[记录系统与派生数据](/ch1#sec_introduction_derived)”。

### 确定性（deterministic）

一个函数在相同输入下总产生相同输出，不依赖随机数、当前时间、网络交互等不可预测因素。参见“[确定性的力量](/ch9#sidebar_distributed_determinism)”。

### 分布式（distributed）

系统在多个通过网络连接的节点上运行。其典型特征是 *部分失效*：一部分坏了，另一部分仍在工作，而软件往往难以精确知道哪里坏了。参见“[故障与部分失效](/ch9#sec_distributed_partial_failure)”。

### 持久性（durable）

以你相信不会丢失的方式存储数据，即使发生各种故障。参见“[持久性](/ch8#durability)”。

### ETL

Extract-Transform-Load（提取-转换-加载）：从源数据库抽取数据，转成更适合分析查询的形式，再加载到数据仓库或批处理系统。参见“[数据仓库](/ch1#sec_introduction_dwh)”。

### 故障切换（failover）

在单主系统中，将主角色从一个节点切到另一个节点的过程。参见“[处理节点故障](/ch6#sec_replication_failover)”。

### 容错（fault-tolerant）

出现故障（如机器崩溃、链路故障）后仍可自动恢复。参见“[可靠性与容错](/ch2#sec_introduction_reliability)”。

### 流量控制（flow control）

见 *backpressure*。

### 追随者（follower）

不直接接收客户端写入、仅应用来自主节点变更的副本。也称 *secondary*、*read replica* 或 *hot standby*。参见“[单主复制](/ch6#sec_replication_leader)”。

### 全文检索（full-text search）

按任意关键词搜索文本，通常支持近似拼写、同义词等能力。全文索引是支持此类查询的一种 *secondary index*。参见“[全文检索](/ch4#sec_storage_full_text)”。

### 图（graph）

由 *vertices*（可引用对象，也称 *nodes* 或 *entities*）和 *edges*（顶点间连接，也称 *relationships* 或 *arcs*）组成的数据结构。参见“[图状数据模型](/ch3#sec_datamodels_graph)”。

### 哈希（hash）

把输入映射成看似随机数字的函数。相同输入总得相同输出；不同输入通常输出不同，但也可能碰撞（*collision*）。参见“[按键的哈希分片](/ch7#sec_sharding_hash)”。

### 幂等（idempotent）

可安全重试的操作：执行多次与执行一次效果相同。参见“[幂等性](/ch12#sec_stream_idempotence)”。

### 索引（index）

一种可高效检索“某字段取某值”的记录的数据结构。参见“[OLTP 的存储与索引](/ch4#sec_storage_oltp)”。

### 隔离性（isolation）

在事务语境下，并发事务相互干扰的程度。*Serializable* 最强，也常用更弱隔离级别。参见“[隔离性](/ch8#sec_transactions_acid_isolation)”。

### 连接（join）

把具有关联关系的记录拼在一起。常见于一个记录引用另一个记录（外键、文档引用、图边）时，查询需要取到被引用对象。参见“[规范化、反规范化与连接](/ch3#sec_datamodels_normalization)”和“[JOIN 与 GROUP BY](/ch11#sec_batch_join)”。

### 领导者（leader）

当数据或服务跨多个节点复制时，被指定为可接受写入的副本。可通过协议选举或管理员指定。也称 *primary* 或 *source*。参见“[单主复制](/ch6#sec_replication_leader)”。

### 线性一致（linearizable）

表现得像系统里只有一份数据副本，且由原子操作更新。参见“[线性一致性](/ch10#sec_consistency_linearizability)”。

### 局部性（locality）

一种性能优化：把经常被一起访问的数据放在一起。参见“[读写的数据局部性](/ch3#sec_datamodels_document_locality)”。

### 锁（lock）

保证同一时刻只有一个线程/节点/事务访问某资源的机制；其他访问者需等待锁释放。参见“[两阶段锁（2PL）](/ch8#sec_transactions_2pl)”和“[分布式锁与租约](/ch9#sec_distributed_lock_fencing)”。

### 日志（log）

只追加写入的数据文件。*WAL* 用于崩溃恢复（参见“[让 B 树可靠](/ch4#sec_storage_btree_wal)”）；*log-structured* 存储把日志作为主存储格式（参见“[日志结构存储](/ch4#sec_storage_log_structured)”）；*replication log* 用于主从复制（参见“[单主复制](/ch6#sec_replication_leader)”）；*event log* 可表示数据流（参见“[基于日志的消息代理](/ch12#sec_stream_log)”）。

### 物化（materialize）

把计算结果提前算出并写下来，而不是按需即时计算。参见“[事件溯源与 CQRS](/ch3#sec_datamodels_events)”。

### 节点（node）

运行在某台计算机上的软件实例，通过网络与其他节点协作完成任务。

### 规范化（normalized）

数据结构中尽量避免冗余与重复。规范化数据库里某数据变化时通常只改一处，不需多处同步。参见“[规范化、反规范化与连接](/ch3#sec_datamodels_normalization)”。

### OLAP

Online Analytic Processing（在线分析处理）：典型访问模式是对大量记录做聚合（如 count/sum/avg）。参见“[事务系统与分析系统](/ch1#sec_introduction_analytics)”。

### OLTP

Online Transaction Processing（在线事务处理）：典型访问模式是快速读写少量记录，通常按键索引。参见“[事务系统与分析系统](/ch1#sec_introduction_analytics)”。

### 分片（sharding）

把单机装不下的大数据集或计算拆成更小部分并分散到多台机器上。也称 *partitioning*。参见[第 7 章](/ch7#ch_sharding)。

### 百分位（percentile）

通过统计多少值高于/低于某阈值来描述分布。例如某时段 95 分位响应时间为 *t*，表示 95% 请求耗时小于 *t*，5% 更长。参见“[描述性能](/ch2#sec_introduction_percentiles)”。

### 主键（primary key）

唯一标识一条记录的值（通常为数字或字符串）。在很多应用中由系统在创建时生成（顺序或随机），而非用户手工指定。另见 *secondary index*。

### 法定票数（quorum）

一个操作被判定成功前所需的最少投票节点数。参见“[读写法定票数](/ch6#sec_replication_quorum_condition)”。

### 再平衡（rebalance）

为均衡负载，把数据或服务从一个节点迁移到另一个节点。参见“[键值数据的分片](/ch7#sec_sharding_key_value)”。

### 复制（replication）

在多个节点（*replicas*）上保存同一份数据，以便部分节点不可达时仍可访问。参见[第 6 章](/ch6#ch_replication)。

### 模式（schema）

对数据结构（字段、类型等）的描述。数据是否符合模式可在生命周期不同阶段检查（参见“[文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)”），模式也可随时间演进（参见[第 5 章](/ch5#ch_encoding)）。

### 二级索引（secondary index）

与主存储并行维护的附加结构，用于高效检索满足某类条件的记录。参见“[多列索引与二级索引](/ch4#sec_storage_index_multicolumn)”和“[分片与二级索引](/ch7#sec_sharding_secondary_indexes)”。

### 可串行化（serializable）

一种 *isolation* 保证：多个事务并发执行时，行为等价于某个串行顺序逐个执行。参见“[可串行化](/ch8#sec_transactions_serializability)”。

### 无共享（shared-nothing）

一种架构：独立节点（各自 CPU、内存、磁盘）通过普通网络连接；相对的是共享内存或共享磁盘架构。参见“[共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing)”。

### 偏斜（skew）

1. 分片负载不均：某些分片请求/数据很多，另一些很少。也称 *hot spots*。参见“[负载偏斜与热点消除](/ch7#sec_sharding_skew)”。
2. 一种时序异常，导致事件呈现为非预期的非顺序。参见“[快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)”中的读偏斜、“[写偏斜与幻读](/ch8#sec_transactions_write_skew)”中的写偏斜、以及“[用于事件排序的时间戳](/ch9#sec_distributed_lww)”中的时钟偏斜。

### 脑裂（split brain）

两个节点同时认为自己是领导者，可能破坏系统保证。参见“[处理节点故障](/ch6#sec_replication_failover)”和“[少数服从多数](/ch9#sec_distributed_majority)”。

### 存储过程（stored procedure）

把事务逻辑编码到数据库服务器端执行，使事务过程中无需与客户端来回通信。参见“[实际串行执行](/ch8#sec_transactions_serial)”。

### 流处理（stream process）

持续运行的计算：消费无穷事件流并产出结果。参见[第 12 章](/ch12#ch_stream)。

### 同步（synchronous）

*asynchronous* 的反义词。

### 记录系统（system of record）

持有某类数据主权威版本的系统，也称 *source of truth*。数据变更首先写入这里，其他数据集可由其派生。参见“[记录系统与派生数据](/ch1#sec_introduction_derived)”。

### 超时（timeout）

最简单的故障检测方式之一：在一定时间内未收到响应即判定超时。但无法确定是远端节点故障还是网络问题导致。参见“[超时与无界延迟](/ch9#sec_distributed_queueing)”。

### 全序（total order）

一种可比较关系（如时间戳），任意两者都能判定大小。若存在不可比较元素，则称 *partial order*（偏序）。

### 事务（transaction）

把多次读写封装为一个逻辑单元，以简化错误处理与并发问题。参见[第 8 章](/ch8#ch_transactions)。

### 两阶段提交（two-phase commit, 2PC）

保证多个数据库节点对同一事务要么都 *atomically* 提交、要么都中止的算法。参见“[两阶段提交（2PC）](/ch8#sec_transactions_2pc)”。

### 两阶段锁（two-phase locking, 2PL）

实现 *serializable isolation* 的算法：事务对读写数据加锁并持有到事务结束。参见“[两阶段锁（2PL）](/ch8#sec_transactions_2pl)”。

### 无界（unbounded）

没有已知上限或大小。与 *bounded* 相反。


================================================
FILE: content/zh/indexes.md
================================================
---
title: 索引
weight: 550
breadcrumbs: false
---

### 符号

- 3FS（分布式文件系统）, [分布式文件系统](/ch11#sec_batch_dfs)

### A

- 中止（事务）, [事务](/ch8#ch_transactions), [原子性](/ch8#sec_transactions_acid_atomicity)
  - 级联, [没有脏读](/ch8#no-dirty-reads)
  - 在两阶段提交中, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
  - 乐观并发控制的性能, [可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
  - 重试已中止的事务, [处理错误和中止](/ch8#handling-errors-and-aborts)
- 抽象, [云服务的分层](/ch1#layering-of-cloud-services), [简单性：管理复杂度](/ch2#id38), [数据模型与查询语言](/ch3#ch_datamodels), [事务](/ch8#ch_transactions), [总结](/ch8#summary)
- 意外复杂性, [简单性：管理复杂度](/ch2#id38)
- 问责制, [责任与问责](/ch14#id371)
- 会计（财务数据）, [总结](/ch3#summary), [不可变事件的优点](/ch12#sec_stream_immutability_pros)
- Accumulo（数据库）
  - 宽列数据模型, [读写的数据局部性](/ch3#sec_datamodels_document_locality), [列压缩](/ch4#sec_storage_column_compression)
- ACID 属性（事务）, [ACID 的含义](/ch8#sec_transactions_acid)
  - 原子性, [原子性](/ch8#sec_transactions_acid_atomicity), [单对象与多对象操作](/ch8#sec_transactions_multi_object)
  - 一致性, [一致性](/ch8#sec_transactions_acid_consistency), [维护完整性，尽管软件有Bug](/ch13#id455)
  - 持久性, [使 B 树可靠](/ch4#sec_storage_btree_wal), [持久性](/ch8#durability)
  - 隔离性, [隔离性](/ch8#sec_transactions_acid_isolation), [单对象与多对象操作](/ch8#sec_transactions_multi_object)
- 确认（消息）, [确认与重新传递](/ch12#sec_stream_reordering)
- active/active replication（见 multi-leader replication）
- active/passive replication（见 基于领导者的复制）
- ActiveMQ（消息系统）, [消息代理](/ch5#message-brokers), [消息代理与数据库的对比](/ch12#id297)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
- ActiveRecord（对象关系映射器）, [对象关系映射（ORM）](/ch3#object-relational-mapping-orm), [处理错误和中止](/ch8#handling-errors-and-aborts)
- activity (workflows)（见 workflow engines）
- Actor 模型, [分布式 actor 框架](/ch5#distributed-actor-frameworks)
  - （另见 event-driven architecture）
  - 与流处理的比较, [事件驱动架构与 RPC](/ch12#sec_stream_actors_drpc)
- 自适应容量, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
- Advanced Message Queuing Protocol（见 AMQP）
- 航空航天系统, [拜占庭故障](/ch9#sec_distributed_byzantine)
- Aerospike（数据库）
  - 强一致性模式, [单对象写入](/ch8#sec_transactions_single_object)
- AGE（图数据库）, [Cypher 查询语言](/ch3#id57)
- 聚合
  - 数据立方体与物化视图, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
  - 在批处理中, [排序与内存聚合](/ch11#id275)
  - 在流处理中, [流分析](/ch12#id318)
- 聚合管道（MongoDB）, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization), [文档的查询语言](/ch3#query-languages-for-documents)
- 敏捷, [可演化性：让变化更容易](/ch2#sec_introduction_evolvability)
  - 最小化不可逆性, [批处理](/ch11#ch_batch), [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
  - 充满自信地快速前进, [端到端原则重现](/ch13#id456)
- 一致意见, [单值共识](/ch10#single-value-consensus), [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
  - （另见 共识）
- AI (artificial intelligence)（见 machine learning）
- AI Act (European Union), [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- Airbyte, [数据仓库](/ch1#sec_introduction_dwh)
- Airflow（工作流调度器）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows), [批处理](/ch11#ch_batch), [工作流调度](/ch11#sec_batch_workflows)
  - 云数据仓集成, [查询语言](/ch11#sec_batch_query_lanauges)
  - 用于 ETL, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 阿卡迈
  - 响应时间研究, [平均值、中位数与百分位点](/ch2#id24)
- 算法
  - 算法正确性, [定义算法的正确性](/ch9#defining-the-correctness-of-an-algorithm)
  - B树, [B 树](/ch4#sec_storage_b_trees)-[B 树变体](/ch4#b-tree-variants)
  - 分布式系统, [系统模型与现实](/ch9#sec_distributed_system_model)
  - 归并排序, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [混洗数据](/ch11#sec_shuffle)
  - 调度, [资源分配](/ch11#id279)
  - SSTable 与 LSM 树, [SSTable 文件格式](/ch4#the-sstable-file-format)-[压实策略](/ch4#sec_storage_lsm_compaction)
- 全互联复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)
- AllegroGraph（数据库）, [图数据模型](/ch3#sec_datamodels_graph)
  - SPARQL 查询语言, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- ALTER TABLE 语句（SQL）, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility), [编码与演化](/ch5#ch_encoding)
- 亚马逊
  - Dynamo（见 Dynamo（数据库））
  - 响应时间研究, [平均值、中位数与百分位点](/ch2#id24)
- Amazon Web Services (AWS)
  - Aurora（见 Aurora（云数据库））
  - ClockBound（见 ClockBound（时间同步））
  - 正确性测试, [形式化方法和随机测试](/ch9#sec_distributed_formal)
  - DynamoDB（见 DynamoDB（数据库））
  - EBS（见 EBS（虚拟块设备））
  - Kinesis（见 Kinesis（消息系统））
  - Neptune（见 Neptune（图数据库））
  - 网络可靠性, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - S3（见 S3（对象存储））
- 放大
  - 偏见, [偏见与歧视](/ch14#id370)
  - 故障, [维护派生状态](/ch13#id446)
  - 尾延迟, [响应时间指标的应用](/ch2#sec_introduction_slo_sla), [本地二级索引](/ch7#id166)
  - 写入放大, [写放大](/ch4#write-amplification)
- AMQP（高级消息队列协议）, [消息代理与数据库的对比](/ch12#id297)
  - （另见 messaging systems）
  - 与基于日志的消息传递的比较, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [重播旧消息](/ch12#sec_stream_replay)
  - 消息顺序, [确认与重新传递](/ch12#sec_stream_reordering)
- 分析系统, [分析型与事务型系统](/ch1#sec_introduction_analytics)
  - 作为衍生数据系统, [记录系统与派生数据](/ch1#sec_introduction_derived)
  - 来自运营系统的 ETL, [数据仓库](/ch1#sec_introduction_dwh)
  - 治理, [超越数据湖](/ch1#beyond-the-data-lake)
- 分析, [分析型与事务型系统](/ch1#sec_introduction_analytics)-[记录系统与派生数据](/ch1#sec_introduction_derived)
  - 与事务处理的比较, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
  - 数据规范化, [规范化的权衡](/ch3#trade-offs-of-normalization)
  - data warehousing（见 data warehousing）
  - predictive（见 predictive analytics）
  - 与批量处理的关系, [分析（Analytics）](/ch11#sec_batch_olap)-[分析（Analytics）](/ch11#sec_batch_olap)
  - 模式, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)-[星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
  - 用于查询的快照隔离, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - 流式分析, [流分析](/ch12#id318)
- 分析工程, [分析型与事务型系统](/ch1#sec_introduction_analytics)
- 反熵, [追赶错过的写入](/ch6#sec_replication_read_repair)
- Antithesis（确定性仿真测试）, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- Apache Accumulo（见 Accumulo）
- Apache ActiveMQ（见 ActiveMQ）
- Apache AGE（见 AGE）
- Apache Arrow（见 Arrow（数据格式））
- Apache Avro（见 Avro）
- Apache Beam（见 Beam）
- Apache BookKeeper（见 BookKeeper）
- Apache Cassandra（见 Cassandra）
- Apache Curator（见 Curator）
- Apache DataFusion（见 DataFusion（查询引擎））
- Apache Druid（见 Druid（数据库））
- Apache Flink（见 Flink（处理框架））
- Apache HBase（见 HBase）
- Apache Iceberg（见 Iceberg（表格式））
- Apache Jena（见 Jena）
- Apache Kafka（见 Kafka）
- Apache Lucene（见 Lucene）
- Apache Oozie（见 Oozie（工作流调度器））
- Apache ORC（见 ORC（数据格式））
- Apache Parquet（见 Parquet（数据格式））
- Apache Pig（查询语言）, [查询语言](/ch11#sec_batch_query_lanauges)
- Apache Pinot（见 Pinot（数据库））
- Apache Pulsar（见 Pulsar）
- Apache Qpid（见 Qpid）
- Apache Samza（见 Samza）
- Apache Solr（见 Solr）
- Apache Spark（见 Spark；见 Spark（处理框架））
- Apache Storm（见 Storm）
- Apache Superset（见 Superset（数据可视化软件））
- Apache Thrift（见 Thrift）
- Apache ZooKeeper（见 ZooKeeper）
- Apama （流式分析）, [复合事件处理](/ch12#id317)
- append-only files（见 logs）
- Application Programming Interfaces (APIs), [数据模型与查询语言](/ch3#ch_datamodels)
  - 用于改变流, [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 分布式事务, [XA 事务](/ch8#xa-transactions)
  - 用于服务, [流经服务的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)-[RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
    - （另见 services）
    - 可演化性, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
    - RESTful, [Web 服务](/ch5#sec_web_services)
- application state（见 状态）
- approximate search（见 similarity search）
- 归档存储（数据库数据）, [归档存储](/ch5#archival-storage)
- arcs（见 edges）
- ArcticDB（数据库）, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 算术平均值, [平均值、中位数与百分位点](/ch2#id24)
- 数组
  - 数组数据库, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 多维, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- Arrow（数据格式）, [列式存储](/ch4#sec_storage_column), [DataFrames](/ch11#id287)
- artificial intelligence（见 machine learning）
- ASCII text, [Protocol Buffers](/ch5#sec_encoding_protobuf)
- ASN.1 (schema language), [模式的优点](/ch5#sec_encoding_schemas)
- 关联表格, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many), [属性图](/ch3#id56)
- 异步网络, [不可靠的网络](/ch9#sec_distributed_networks), [术语表](/glossary)
  - 与同步网络的比较, [同步与异步网络](/ch9#sec_distributed_sync_networks)
  - 系统模型, [系统模型与现实](/ch9#sec_distributed_system_model)
- 异步复制, [同步复制与异步复制](/ch6#sec_replication_sync_async), [术语表](/glossary)
  - 故障切换时的数据丢失, [领导者故障：故障转移](/ch6#leader-failure-failover)
  - 从异步追随者读取, [复制延迟的问题](/ch6#sec_replication_lag)
  - 多个领导者, [多主复制](/ch6#sec_replication_multi_leader)
- 异步传输模式, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- 原子广播, [共享日志作为共识](/ch10#sec_consistency_shared_logs)
- 原子钟, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - （另见 clocks）
- 原子性, [术语表](/glossary)
  - 原子自增, [单对象写入](/ch8#sec_transactions_single_object)
  - 比较和设置, [条件写入（比较并设置）](/ch8#sec_transactions_compare_and_set), [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
    - （另见 比较和设置）
  - 异常数据, [规范化的权衡](/ch3#trade-offs-of-normalization)
  - 获取和添加/递增, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical), [共识](/ch10#sec_consistency_consensus), [获取并增加作为共识](/ch10#fetch-and-add-as-consensus)
  - 写入操作, [原子写操作](/ch8#atomic-write-operations)
- 原子性, [原子性](/ch8#sec_transactions_acid_atomicity), [单对象与多对象操作](/ch8#sec_transactions_multi_object), [术语表](/glossary)
  - 原子提交
    - 避开, [多分区请求处理](/ch13#id360), [无协调数据系统](/ch13#id454)
    - 阻塞与非阻塞, [三阶段提交](/ch8#three-phase-commit)
    - 在流处理中, [恰好一次消息处理](/ch8#sec_transactions_exactly_once), [再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited), [原子提交再现](/ch12#sec_stream_atomic_commit)
    - 维护衍生数据, [保持系统同步](/ch12#sec_stream_sync)
  - 分布式事务, [分布式事务](/ch8#sec_transactions_distributed)-[再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
  - 用于多对象事务, [单对象与多对象操作](/ch8#sec_transactions_multi_object)
  - 用于单对象写入, [单对象写入](/ch8#sec_transactions_single_object)
  - 与协商一致的关系, [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
- 可审计性, [信任但验证](/ch13#sec_future_verification)-[用于可审计数据系统的工具](/ch13#id366)
  - 设计, [为可审计性而设计](/ch13#id365)
  - 自动审计系统, [不要盲目信任承诺](/ch13#id364)
  - 通过不可改变性, [不可变事件的优点](/ch12#sec_stream_immutability_pros)
  - 可审计数据系统工具, [用于可审计数据系统的工具](/ch13#id366)
- Aurora（云数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native)
- Aurora DSQL（数据库）
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
- 自动缩放, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
- Automerge (CRDT library), [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 可用性, [可靠性与容错](/ch2#sec_introduction_reliability)
  - （另见 fault tolerance）
  - 在 CAP 定理中, [CAP 定理](/ch10#the-cap-theorem)
  - 领袖选举, [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 在服务级别协议（SLA）中, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- 可用区, [通过冗余容忍硬件故障](/ch2#tolerating-hardware-faults-through-redundancy), [读己之写](/ch6#sec_replication_ryw)
- Avro（数据格式）, [Avro](/ch5#sec_encoding_avro)-[动态生成的模式](/ch5#dynamically-generated-schemas)
  - 动态生成的模式, [动态生成的模式](/ch5#dynamically-generated-schemas)
  - 对象容器文件, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema), [归档存储](/ch5#archival-storage)
  - 读取者确定写入者模式, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
  - 模式演化, [写入者模式与读取者模式](/ch5#the-writers-schema-and-the-readers-schema)
  - 批量处理中的用途, [MapReduce](/ch11#sec_batch_mapreduce)
- awk（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis), [简单日志分析](/ch11#sec_batch_log_analysis), [分布式作业编排](/ch11#id278)
- Axon Framework, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- Azkaban（工作流调度器）, [批处理](/ch11#ch_batch)
- Azure Blob Storage（对象存储）, [云服务的分层](/ch1#layering-of-cloud-services), [设置新的副本](/ch6#sec_replication_new_replica)
  - 条件请求头, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
- Azure managed disks, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- Azure SQL DB（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native)
- Azure Storage, [对象存储](/ch11#id277)
- Azure Synapse Analytics（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native)
- Azure Virtual Machines
  - Spot 虚拟机, [故障处理](/ch11#id281)

### B

- B 树（索引）, [B 树](/ch4#sec_storage_b_trees)-[B 树变体](/ch4#b-tree-variants)
  - B+ trees, [B 树变体](/ch4#b-tree-variants)
  - 分支因子, [B 树](/ch4#sec_storage_b_trees)
  - comparison to LSM-trees, [比较 B 树与 LSM 树](/ch4#sec_storage_btree_lsm_comparison)-[磁盘空间使用](/ch4#disk-space-usage)
  - 崩溃恢复, [使 B 树可靠](/ch4#sec_storage_btree_wal)
  - 通过分割页面增长, [B 树](/ch4#sec_storage_b_trees)
  - 不可变变种, [B 树变体](/ch4#b-tree-variants), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 与分片分裂的相似性, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)
  - 变体, [B 树变体](/ch4#b-tree-variants)
- B2（对象存储）, [分布式文件系统](/ch11#sec_batch_dfs)
- Backblaze B2（见 B2（对象存储））
- 后端, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- 退避（指数）, [描述性能](/ch2#sec_introduction_percentiles), [处理错误和中止](/ch8#handling-errors-and-aborts)
- 背压, [描述性能](/ch2#sec_introduction_percentiles), [读取性能](/ch4#read-performance), [消息传递系统](/ch12#sec_stream_messaging), [术语表](/glossary)
  - 分批处理, [工作流调度](/ch11#sec_batch_workflows)
  - in TCP, [TCP 的局限性](/ch9#sec_distributed_tcp)
- 备份
  - 用于复制的数据库快照, [设置新的副本](/ch6#sec_replication_new_replica)
  - 在多用户系统中, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
  - 完整性, [不要盲目信任承诺](/ch13#id364)
  - 用于快照隔离, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - 使用对象存储, [设置新的副本](/ch6#sec_replication_new_replica)
  - 相对复制, [复制](/ch6#ch_replication)
- 向后兼容, [编码与演化](/ch5#ch_encoding)
- BadgerDB（数据库）
  - 可串行化事务, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)
- BASE, contrast to ACID, [ACID 的含义](/ch8#sec_transactions_acid)
- bash shell（Unix）, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)
- 批处理, [批处理](/ch11#ch_batch)-[本章小结](/ch11#id292), [术语表](/glossary)
  - 方案规划和职能规划, [MapReduce](/ch11#sec_batch_mapreduce)
  - 惠益, [批处理](/ch11#ch_batch)
  - 结合流处理, [统一批处理和流处理](/ch13#id338)
  - 与流处理的比较, [流处理](/ch12#sec_stream_processing)
  - 数据流引擎, [数据流引擎](/ch11#sec_batch_dataflow)-[数据流引擎](/ch11#sec_batch_dataflow)
  - 容错, [故障处理](/ch11#id281), [消息传递系统](/ch12#sec_stream_messaging)
  - 数据整合, [批处理与流处理](/ch13#sec_future_batch_streaming)-[统一批处理和流处理](/ch13#id338)
  - 图表和迭代处理, [机器学习](/ch11#id290)
  - high-level APIs and languages, [查询语言](/ch11#sec_batch_query_lanauges)-[查询语言](/ch11#sec_batch_query_lanauges)
  - 云数据仓库中, [查询语言](/ch11#sec_batch_query_lanauges)
  - 在分布式系统中, [分布式系统中的批处理](/ch11#sec_batch_distributed)
  - 加入和分组, [JOIN 与 GROUP BY](/ch11#sec_batch_join)-[JOIN 与 GROUP BY](/ch11#sec_batch_join)
  - 限制, [批处理](/ch11#ch_batch)
  - 基于日志的信息和, [重播旧消息](/ch12#sec_stream_replay)
  - 保持衍生状态, [维护派生状态](/ch13#id446)
  - 衡量业绩, [批处理](/ch11#ch_batch)
  - 模式, [批处理模型](/ch11#id431)
  - 资源分配, [资源分配](/ch11#id279)-[资源分配](/ch11#id279)
  - 资源管理员, [分布式作业编排](/ch11#id278)
  - 调度器, [分布式作业编排](/ch11#id278)
  - 服务衍生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)-[对外提供派生数据](/ch11#sec_batch_serving_derived)
  - 移动数据, [混洗数据](/ch11#sec_shuffle)-[混洗数据](/ch11#sec_shuffle)
  - 任务执行, [分布式作业编排](/ch11#id278)
  - 用例, [批处理用例](/ch11#sec_batch_output)-[对外提供派生数据](/ch11#sec_batch_serving_derived)
  - 使用 Unix 工具（例如）, [使用 Unix 工具的批处理](/ch11#sec_batch_unix)-[排序与内存聚合](/ch11#id275)
- 批处理框架
  - 与操作系统的比较, [分布式系统中的批处理](/ch11#sec_batch_distributed)
- Beam （数据流库）, [统一批处理和流处理](/ch13#id338)
- BERT (language model), [向量嵌入](/ch4#id92)
- 偏见, [偏见与歧视](/ch14#id370)
- bidirectional replication（见 multi-leader replication）
- 大泥球, [简单性：管理复杂度](/ch2#id38)
- 大数据
  - 对数据最小化, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [立法与自律](/ch14#sec_future_legislation)
- BigQuery（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native), [云数据仓库](/ch4#sec_cloud_data_warehouses), [批处理](/ch11#ch_batch)
  - DataFrames, [查询语言](/ch11#sec_batch_query_lanauges)
  - 分片与聚簇, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 移动数据, [混洗数据](/ch11#sec_shuffle)
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
- Bigtable（数据库）
  - 分片方案, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 存储布局, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - tablet（分片）, [分片](/ch7#ch_sharding)
  - 宽列数据模型, [读写的数据局部性](/ch3#sec_datamodels_document_locality), [列压缩](/ch4#sec_storage_column_compression)
- 二进制数据编码, [二进制编码](/ch5#binary-encoding)-[模式的优点](/ch5#sec_encoding_schemas)
  - Avro, [Avro](/ch5#sec_encoding_avro)-[动态生成的模式](/ch5#dynamically-generated-schemas)
  - MessagePack, [二进制编码](/ch5#binary-encoding)-[二进制编码](/ch5#binary-encoding)
  - Protocol Buffers, [Protocol Buffers](/ch5#sec_encoding_protobuf)-[字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
- 二进制编码
  - 基于模式, [模式的优点](/ch5#sec_encoding_schemas)
  - 由网络驱动程序, [模式的优点](/ch5#sec_encoding_schemas)
- binary strings, lack of support in JSON and XML, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
- 比特币（加密货币）, [用于可审计数据系统的工具](/ch13#id366)
  - 拜占庭容错, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 交易所中的并发错误, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
- 位图索引, [列压缩](/ch4#sec_storage_column_compression)
- BitTorrent uTP protocol, [TCP 的局限性](/ch9#sec_distributed_tcp)
- Bkd 树（索引）, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- 无指责的事后复盘, [人类与可靠性](/ch2#id31)
- Blazegraph（数据库）, [图数据模型](/ch3#sec_datamodels_graph)
  - SPARQL 查询语言, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- blob storage（见 object storage）
- 块, [分布式文件系统](/ch11#sec_batch_dfs)
- 块设备（磁盘）, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- 区块链, [总结](/ch3#summary)
  - 拜占庭容错, [拜占庭故障](/ch9#sec_distributed_byzantine), [共识](/ch10#sec_consistency_consensus), [用于可审计数据系统的工具](/ch13#id366)
- 阻塞式原子提交, [三阶段提交](/ch8#three-phase-commit)
- Bloom 过滤器（算法）, [布隆过滤器](/ch4#bloom-filters), [读取性能](/ch4#read-performance), [流分析](/ch12#id318)
- BookKeeper (replicated log), [将工作分配给节点](/ch10#allocating-work-to-nodes)
- 有界数据集, [流处理](/ch12#ch_stream), [术语表](/glossary)
  - （另见 batch processing）
- 有界延迟, [术语表](/glossary)
  - 在网络中, [同步与异步网络](/ch9#sec_distributed_sync_networks)
  - 进程暂停, [响应时间保证](/ch9#sec_distributed_clocks_realtime)
- 广播
  - 全序广播（见 shared logs）
- 无中介消息, [直接从生产者传递给消费者](/ch12#id296)
- Brubeck（指标聚合器）, [直接从生产者传递给消费者](/ch12#id296)
- BTM (transaction coordinator), [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
- 缓冲
  - Bufstream（消息系统）, [设置新的副本](/ch6#sec_replication_new_replica)
- Bufstream（消息系统）, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- 自建还是购买, [云服务与自托管](/ch1#sec_introduction_cloud)
- 突发的网络流量模式, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- 业务分析师, [分析型与事务型系统](/ch1#sec_introduction_analytics), [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- 商业数据处理, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
- 商业智能, [分析型与事务型系统](/ch1#sec_introduction_analytics)-[数据仓库](/ch1#sec_introduction_dwh)
- Business Process Execution Language (BPEL), [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- Business Process Model and Notation (BPMN), [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
  - 实例, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- 字节序列，将数据编码为, [编码数据的格式](/ch5#sec_encoding_formats)
- 拜占庭故障, [拜占庭故障](/ch9#sec_distributed_byzantine)-[弱形式的谎言](/ch9#weak-forms-of-lying), [系统模型与现实](/ch9#sec_distributed_system_model), [术语表](/glossary)
  - 拜占庭容错系统, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - Byzantine Generals Problem, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 与共识算法, [共识](/ch10#sec_consistency_consensus), [用于可审计数据系统的工具](/ch13#id366)

### C

- 缓存, [全内存存储](/ch4#sec_storage_inmemory), [术语表](/glossary)
  - 与物化视图, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
  - 作为衍生数据, [记录系统与派生数据](/ch1#sec_introduction_derived), [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
  - in CPUs, [查询执行：编译与向量化](/ch4#sec_storage_vectorized), [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 失效与维护, [保持系统同步](/ch12#sec_stream_sync), [维护物化视图](/ch12#sec_stream_mat_view)
  - 线性一致性, [线性一致性](/ch10#sec_consistency_linearizability)
  - 云中的本地磁盘, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- 日历同步, [同步引擎与本地优先软件](/ch6#sec_replication_offline_clients), [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- California Consumer Privacy Act (CCPA), [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- Camunda（工作流程引擎）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- 规范版本（数据）, [记录系统与派生数据](/ch1#sec_introduction_derived)
- CAP定理, [CAP 定理](/ch10#the-cap-theorem)-[CAP 定理](/ch10#the-cap-theorem), [术语表](/glossary)
- 容量规划, [云时代的运维](/ch1#sec_introduction_operations)
- Cap'n Proto（数据格式）, [编码数据的格式](/ch5#sec_encoding_formats)
- 碳排放, [分布式与单节点系统](/ch1#sec_introduction_distributed)
- 级联中止, [没有脏读](/ch8#no-dirty-reads)
- 级联故障, [软件故障](/ch2#software-faults), [运维：自动/手动再平衡](/ch7#sec_sharding_operations), [超时和无界延迟](/ch9#sec_distributed_queueing)
- Cassandra（数据库）
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307), [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 压实策略, [压实策略](/ch4#sec_storage_lsm_compaction)
  - consistency level ANY, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 哈希分片, [按键的哈希分片](/ch7#sec_sharding_hash), [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 最后写入胜利的冲突解决, [检测并发写入](/ch6#sec_replication_concurrent)
  - 无主复制, [无主复制](/ch6#sec_replication_leaderless)
  - 轻量事务, [单对象写入](/ch8#sec_transactions_single_object)
  - 线性一致性，缺乏, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 日志结构存储, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 多地区支持, [多地区操作](/ch6#multi-region-operation)
  - 二级索引, [本地二级索引](/ch7#id166)
  - 使用时钟, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations), [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - vnodes（分片）, [分片](/ch7#ch_sharding)
- cat（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis)
- 目录, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 因果上下文, [版本向量](/ch6#version-vectors)
  - （另见 causal dependencies）
- 因果依赖, ["先发生"关系与并发](/ch6#sec_replication_happens_before)-[版本向量](/ch6#version-vectors)
  - 捕获, [版本向量](/ch6#version-vectors), [排序事件以捕获因果关系](/ch13#sec_future_capture_causality), [读也是事件](/ch13#sec_future_read_events)
    - 通过全序, [全序的限制](/ch13#id335)
  - 事务中, [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)
  - 向朋友发送消息（例如）, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
- 因果关系, [术语表](/glossary)
  - 因果顺序
    - 与, [逻辑时钟](/ch10#sec_consistency_timestamps)
  - 与, [逻辑时钟](/ch10#sec_consistency_timestamps)-[使用逻辑时钟强制约束](/ch10#enforcing-constraints-using-logical-clocks)
  - 先发生关系, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
  - 在可串行化事务中, [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)-[检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
  - 与时钟不符, [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 排序事件以捕获, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 违反因果关系, [一致前缀读](/ch6#sec_replication_consistent_prefix), [不同拓扑的问题](/ch6#problems-with-different-topologies), [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 带有同步时钟, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
- 基于单元格的架构, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- CEP（见 复合事件处理）
- CephFS（分布式文件系统）, [批处理](/ch11#ch_batch), [对象存储](/ch11#id277)
- 证书透明性, [用于可审计数据系统的工具](/ch13#id366)
- cgroups, [分布式作业编排](/ch11#id278)
- 数据变更捕获, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication), [数据变更捕获](/ch12#sec_stream_cdc)
  - 变更流的 API 支持, [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 与事件溯源的比较, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
  - 执行, [数据变更捕获的实现](/ch12#id307)
  - 初始快照, [初始快照](/ch12#sec_stream_cdc_snapshot)
  - 日志压缩, [日志压缩](/ch12#sec_stream_log_compaction)
- 更改日志, [状态、流和不变性](/ch12#sec_stream_immutability)
  - 数据变更捕获, [数据变更捕获](/ch12#sec_stream_cdc)
  - 操作状态, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 在流连接中, [流表连接（流扩充）](/ch12#sec_stream_table_joins)
  - 日志压缩, [日志压缩](/ch12#sec_stream_log_compaction)
  - 保持衍生状态, [数据库与流](/ch12#sec_stream_databases)
- 混沌工程, [容错](/ch2#id27), [故障注入](/ch9#sec_fault_injection)
- 检查点
  - 在高性能计算中, [云计算与超级计算](/ch1#id17)
  - 在流处理器中, [微批次与存档点](/ch12#id329)
- 断路器（限制重试）, [描述性能](/ch2#sec_introduction_percentiles)
- 电路交换网络, [同步与异步网络](/ch9#sec_distributed_sync_networks)
- 循环缓冲器, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- 环形复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)
- Citus（数据库）
  - 哈希分片, [固定数量的分片](/ch7#fixed-number-of-shards)
- ClickHouse（数据库）, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - 增量视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
- 点击流数据,分析, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
- 客户端
  - 调用服务, [流经服务的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)
  - 脱机, [同步引擎与本地优先软件](/ch6#sec_replication_offline_clients), [有状态、可离线的客户端](/ch13#id347)
  - 推动状态更改到, [将状态变更推送给客户端](/ch13#id348)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
- ClockBound（时间同步）, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval)
  - use in YugabyteDB, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
- 时钟, [不可靠的时钟](/ch9#sec_distributed_clocks)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
  - 原子钟, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 置信区间, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval)-[用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 全球快照, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 混合逻辑时钟, [混合逻辑时钟](/ch10#hybrid-logical-clocks)
  - logical（见 logical clocks）
  - 偏斜, [最后写入胜利（丢弃并发写入）](/ch6#sec_replication_lww), [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations), [对同步时钟的依赖](/ch9#sec_distributed_clocks_relying)-[带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 时钟调速（slewing）, [单调时钟](/ch9#monotonic-clocks)
  - 同步和准确性, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)-[时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
  - synchronization using GPS, [不可靠的时钟](/ch9#sec_distributed_clocks), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy), [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 时间与单调时钟, [单调时钟与日历时钟](/ch9#sec_distributed_monotonic_timeofday)
  - 时间标记事件, [你用的是谁的时钟？](/ch12#id438)
- 云服务, [云服务与自托管](/ch1#sec_introduction_cloud)-[云计算与超级计算](/ch1#id17)
  - 可用区, [通过冗余容忍硬件故障](/ch2#tolerating-hardware-faults-through-redundancy), [读己之写](/ch6#sec_replication_ryw)
  - 数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 需要发现服务, [服务发现](/ch10#service-discovery)
  - 网络故障, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 利弊关系, [云服务的利弊](/ch1#sec_introduction_cloud_tradeoffs)-[云服务的利弊](/ch1#sec_introduction_cloud_tradeoffs)
  - 配额, [云时代的运维](/ch1#sec_introduction_operations)
  - regions（见 regions (geographic distribution)）
  - 无服务器, [微服务与无服务器](/ch1#sec_introduction_microservices)
  - 共享资源, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 对超级计算, [云计算与超级计算](/ch1#id17)
- 云原生, [云原生系统架构](/ch1#sec_introduction_cloud_native)-[云时代的运维](/ch1#sec_introduction_operations)
- Cloudflare
  - R2（见 R2（对象存储））
- 组合索引, [在索引中存储值](/ch4#sec_storage_index_heap)
- 聚集（记录排序）, [按哈希范围分片](/ch7#sharding-by-hash-range)
- CockroachDB（数据库）
  - 基于共识的复制, [单主复制](/ch6#sec_replication_leader)
  - 一致性模式, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 键范围分片, [分片](/ch7#ch_sharding), [按键的范围分片](/ch7#sec_sharding_key_range)
  - 可串行化事务, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)
  - 分片的二级索引, [全局二级索引](/ch7#id167)
  - 事务, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
  - 使用模型检查, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 代码生成
  - 用于查询执行, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 带有协议缓冲, [Protocol Buffers](/ch5#sec_encoding_protobuf)
- 协作编辑, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 列族（Bigtable）, [读写的数据局部性](/ch3#sec_datamodels_document_locality), [列压缩](/ch4#sec_storage_column_compression)
- 面向列的存储, [列式存储](/ch4#sec_storage_column)-[查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 列压缩, [列压缩](/ch4#sec_storage_column_compression)
  - Parquet, [列式存储](/ch4#sec_storage_column), [归档存储](/ch5#archival-storage)
  - 排序在, [列存储中的排序顺序](/ch4#sort-order-in-column-storage)-[列存储中的排序顺序](/ch4#sort-order-in-column-storage)
  - 向量化处理, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 宽列模型, [列压缩](/ch4#sec_storage_column_compression)
  - 写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
- comma-separated values（见 CSV）
- 命令查询责任分离, [事件溯源与 CQRS](/ch3#sec_datamodels_events)-[事件溯源与 CQRS](/ch3#sec_datamodels_events), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
- 命令（活动来源）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 提交（事务）, [事务](/ch8#ch_transactions)
  - 原子提交, [分布式事务](/ch8#sec_transactions_distributed)-[再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
    - （另见 原子性）
  - 读已提交隔离, [读已提交](/ch8#sec_transactions_read_committed)
  - three-phase commit (3PC), [三阶段提交](/ch8#three-phase-commit)
  - 两阶段提交, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)-[协调器故障](/ch8#coordinator-failure)
- 可交换操作, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
- 压实（Compaction）
  - 更改日志, [日志压缩](/ch12#sec_stream_log_compaction)
    - （另见 日志压缩）
    - 流运算符状态, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 日志结构存储, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
    - 问题, [读取性能](/ch4#read-performance)
    - 规模分级和分级办法, [压实策略](/ch4#sec_storage_lsm_compaction), [磁盘空间使用](/ch4#disk-space-usage)
- 比较和设置, [条件写入（比较并设置）](/ch8#sec_transactions_compare_and_set), [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 执行锁定, [协调服务](/ch10#sec_consistency_coordination)
  - 执行独特性限制, [约束与唯一性保证](/ch10#sec_consistency_uniqueness)
  - 在对象存储中, [设置新的副本](/ch6#sec_replication_new_replica)
  - 与协商一致的关系, [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable), [共识](/ch10#sec_consistency_consensus), [比较并设置作为共识](/ch10#compare-and-set-as-consensus)
  - 与隔离令牌（fencing token）的关系, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
  - 与事务的关系, [单对象写入](/ch8#sec_transactions_single_object)
- 兼容性, [编码与演化](/ch5#ch_encoding), [数据流的模式](/ch5#sec_encoding_dataflow)
  - 调用服务, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - 编码格式的属性, [总结](/ch5#summary)
  - 使用数据库, [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)-[归档存储](/ch5#archival-storage)
- 补偿事务, [不可变事件的优点](/ch12#sec_stream_immutability_pros), [宽松地解释约束](/ch13#id362)
- 编译, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- 复合事件处理, [复合事件处理](/ch12#id317)
- 复杂度
  - 在理论模型中提炼, [将系统模型映射到现实世界](/ch9#mapping-system-models-to-the-real-world)
  - 本质复杂度与偶然复杂度, [简单性：管理复杂度](/ch2#id38)
  - 使用抽象来隐藏, [数据模型与查询语言](/ch3#ch_datamodels)
  - 管理, [简单性：管理复杂度](/ch2#id38)
- composing data systems（见 unbundling databases）
- 压缩
  - in SSTables, [SSTable 文件格式](/ch4#the-sstable-file-format)
- 计算密集型应用程序, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- 电脑游戏, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 联合索引, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
  - 在哈希分片系统中, [按哈希范围分片](/ch7#sharding-by-hash-range)
- 并发
  - actor 编程模型, [分布式 actor 框架](/ch5#distributed-actor-frameworks), [事件驱动架构与 RPC](/ch12#sec_stream_actors_drpc)
    - （另见 event-driven architecture）
  - 事务隔离薄弱时出现的错误, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
  - 解决冲突, [处理写入冲突](/ch6#sec_replication_write_conflicts)-[处理写入冲突](/ch6#sec_replication_write_conflicts)
  - 定义, [处理写入冲突](/ch6#sec_replication_write_conflicts)
  - 检测并行写作, [检测并发写入](/ch6#sec_replication_concurrent)-[版本向量](/ch6#version-vectors)
  - 双写、 问题, [保持系统同步](/ch12#sec_stream_sync)
  - 发生关系前, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
  - 在复制系统中, [复制延迟的问题](/ch6#sec_replication_lag)-[版本向量](/ch6#version-vectors), [线性一致性](/ch10#sec_consistency_linearizability)-[线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)
  - 多版本并发控制, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 乐观并发控制, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 操作的顺序, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 通过事件日志减少, [并发控制](/ch12#sec_stream_concurrency), [数据流：应用代码与状态变化的交互](/ch13#id450)
  - 时间和相对性, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
  - 事务隔离, [隔离性](/ch8#sec_transactions_acid_isolation)
  - 写偏差, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts)
- 有条件写入, [条件写入（比较并设置）](/ch8#sec_transactions_compare_and_set)
  - 事务中, [单对象写入](/ch8#sec_transactions_single_object)
  - 在对象存储中, [设置新的副本](/ch6#sec_replication_new_replica)
- 会议管理系统（例如）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- conflict-free replicated datatypes (CRDTs), [CRDT 与操作变换](/ch6#sec_replication_crdts)
  - 用于无主复制, [捕获先发生关系](/ch6#capturing-the-happens-before-relationship)
  - 防止丢失更新, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
- 冲突
  - 避免, [冲突避免](/ch6#conflict-avoidance)
  - 因果关系, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
  - 冲突检测
    - 分布式事务, [XA 事务的问题](/ch8#problems-with-xa-transactions)
    - 在基于日志的系统中, [唯一性约束需要达成共识](/ch13#id452)
    - in serializable snapshot isolation (SSI), [检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
    - 在两阶段提交中, [系统性的承诺](/ch8#a-system-of-promises)
  - 解决冲突
    - 通过中止事务, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
    - 通过道歉, [宽松地解释约束](/ch13#id362)
    - 最后写入胜利, [用于事件排序的时间戳](/ch9#sec_distributed_lww)
    - 使用原子操作, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
  - 确定什么是冲突, [处理写入冲突](/ch6#sec_replication_write_conflicts), [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
  - 无领导复制, [检测并发写入](/ch6#sec_replication_concurrent)
  - 丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)-[冲突解决与复制](/ch8#conflict-resolution-and-replication)
  - 物化, [物化冲突](/ch8#materializing-conflicts)
  - 解决, [处理写入冲突](/ch6#sec_replication_write_conflicts)-[处理写入冲突](/ch6#sec_replication_write_conflicts)
    - 自动, [自动冲突解决](/ch6#automatic-conflict-resolution)
    - 在无主系统中, [检测并发写入](/ch6#sec_replication_concurrent)
    - 最后写入胜利, [最后写入胜利（丢弃并发写入）](/ch6#sec_replication_lww)
    - 使用自定义逻辑, [手动冲突解决](/ch6#manual-conflict-resolution), [捕获先发生关系](/ch6#capturing-the-happens-before-relationship)
  - 兄弟值（siblings）, [手动冲突解决](/ch6#manual-conflict-resolution), [捕获先发生关系](/ch6#capturing-the-happens-before-relationship)
    - 合并, [捕获先发生关系](/ch6#capturing-the-happens-before-relationship)
  - 写偏差, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts)
- Confluent
  - Freight（消息系统）, [设置新的副本](/ch6#sec_replication_new_replica), [磁盘空间使用](/ch12#sec_stream_disk_usage)
  - 模式注册表, [JSON 模式](/ch5#json-schema), [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
- 拥堵（网络）
  - 避免, [TCP 的局限性](/ch9#sec_distributed_tcp)
  - 限制时钟的准确性, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval)
  - 排队延迟, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
- 共识, [共识](/ch10#sec_consistency_consensus)-[总结](/ch10#summary), [术语表](/glossary)
  - 算法, [共识](/ch10#sec_consistency_consensus), [共识的实践](/ch10#sec_consistency_total_order)
  - 共识数, [获取并增加作为共识](/ch10#fetch-and-add-as-consensus)
  - 协调事务, [协调服务](/ch10#sec_consistency_coordination)-[服务发现](/ch10#service-discovery)
  - 费用, [共识的利弊](/ch10#pros-and-cons-of-consensus)
  - 无法实现, [共识](/ch10#sec_consistency_consensus)
  - 防止脑分裂, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - 重新配置, [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 与原子提交的关系, [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
  - relation to compare-and-set (CAS), [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable), [比较并设置作为共识](/ch10#compare-and-set-as-consensus)
  - 与获取和添加的关系, [获取并增加作为共识](/ch10#fetch-and-add-as-consensus)
  - 与复制有关, [使用共享日志](/ch10#sec_consistency_smr)
  - 与共享日志的关系, [共享日志作为共识](/ch10#sec_consistency_shared_logs)
  - 与独特性制约因素的关系, [唯一性约束需要达成共识](/ch13#id452)
  - 安全性与活性属性, [单值共识](/ch10#single-value-consensus)
  - 单值共识, [单值共识](/ch10#single-value-consensus)
- consent (GDPR), [同意与选择自由](/ch14#id375)
- 一致性, [一致性](/ch8#sec_transactions_acid_consistency), [及时性与完整性](/ch13#sec_future_integrity)
  - 跨越不同数据库, [领导者故障：故障转移](/ch6#leader-failure-failover), [保持系统同步](/ch12#sec_stream_sync), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views), [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions)
  - 因果关系, [一致前缀读](/ch6#sec_replication_consistent_prefix), [不同拓扑的问题](/ch6#problems-with-different-topologies), [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 一致前缀读, [一致前缀读](/ch6#sec_replication_consistent_prefix)-[一致前缀读](/ch6#sec_replication_consistent_prefix)
  - 一致的快照, [设置新的副本](/ch6#sec_replication_new_replica), [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)-[快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner), [初始快照](/ch12#sec_stream_cdc_snapshot), [创建索引](/ch13#id340)
    - （另见 snapshots）
  - 崩溃恢复, [使 B 树可靠](/ch4#sec_storage_btree_wal)
  - enforcing constraints（见 constraints）
  - 最终, [复制延迟的问题](/ch6#sec_replication_lag)
    - （另见 最终一致性）
  - in ACID transactions, [一致性](/ch8#sec_transactions_acid_consistency), [维护完整性，尽管软件有Bug](/ch13#id455)
  - 在 CAP 定理中, [CAP 定理](/ch10#the-cap-theorem)
  - 领袖选举, [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 微服务, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems)
  - 线性一致性, [复制延迟的解决方案](/ch6#id131), [线性一致性](/ch10#sec_consistency_linearizability)-[线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 含义, [一致性](/ch8#sec_transactions_acid_consistency)
  - 单调读, [单调读](/ch6#sec_replication_monotonic_reads)-[单调读](/ch6#sec_replication_monotonic_reads)
  - 二级指数, [多对象事务的需求](/ch8#sec_transactions_need), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation), [理解数据流](/ch13#id443), [创建索引](/ch13#id340)
  - 读后写, [读己之写](/ch6#sec_replication_ryw)-[读己之写](/ch6#sec_replication_ryw)
    - 在衍生数据系统中, [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions)
  - strong（见 线性一致性）
  - 及时性和完整性, [及时性与完整性](/ch13#sec_future_integrity)
  - 使用仲裁, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations), [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
- 一致性哈希, [一致性哈希](/ch7#sec_sharding_consistent_hashing)
- 一致前缀读, [一致前缀读](/ch6#sec_replication_consistent_prefix)
- 约束（数据库）, [一致性](/ch8#sec_transactions_acid_consistency), [写偏差的特征](/ch8#characterizing-write-skew)
  - 异步检查, [宽松地解释约束](/ch13#id362)
  - 避免协调, [无协调数据系统](/ch13#id454)
  - 确保幂等性, [操作标识符](/ch13#id355)
  - 在基于日志的系统中, [强制约束](/ch13#sec_future_constraints)-[多分区请求处理](/ch13#id360)
    - 跨多个分片, [多分区请求处理](/ch13#id360)
  - 在两阶段提交中, [分布式事务](/ch8#sec_transactions_distributed), [系统性的承诺](/ch8#a-system-of-promises)
  - 与协商一致的关系, [唯一性约束需要达成共识](/ch13#id452)
  - 需要线性一致性, [约束与唯一性保证](/ch10#sec_consistency_uniqueness)
- Consul（协调服务）, [协调服务](/ch10#sec_consistency_coordination)
  - 用于服务发现, [服务发现](/ch10#service-discovery)
- 消费者（消息流）, [消息代理](/ch5#message-brokers), [传递事件流](/ch12#sec_stream_transmit)
  - 背压, [消息传递系统](/ch12#sec_stream_messaging)
  - 消费者组, [多个消费者](/ch12#id298)
  - 日志中的消费者偏移量, [消费者偏移量](/ch12#sec_stream_log_offsets)
  - 失败, [确认与重新传递](/ch12#sec_stream_reordering), [消费者偏移量](/ch12#sec_stream_log_offsets)
  - 扇出, [时间线的物化与更新](/ch2#sec_introduction_materializing), [多个消费者](/ch12#id298), [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging)
  - 负载平衡, [多个消费者](/ch12#id298), [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging)
  - 未与生产者保持同步, [消息传递系统](/ch12#sec_stream_messaging), [磁盘空间使用](/ch12#sec_stream_disk_usage), [开展分拆工作](/ch13#sec_future_unbundling_favor)
- content models (JSON Schema), [JSON 模式](/ch5#json-schema)
- 争用
  - 事务之间, [处理错误和中止](/ch8#handling-errors-and-aborts)
  - 阻塞线程, [进程暂停](/ch9#sec_distributed_clocks_pauses)
  - 乐观并发控制的性能, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 两阶段锁定, [两阶段锁定的性能](/ch8#performance-of-two-phase-locking)
- 上下文切换, [延迟与响应时间](/ch2#id23), [进程暂停](/ch9#sec_distributed_clocks_pauses)
- 收敛, [自动冲突解决](/ch6#automatic-conflict-resolution)-[CRDT 与操作变换](/ch6#sec_replication_crdts)
- 协调
  - 避免, [无协调数据系统](/ch13#id454)
  - 跨数据中心, [全序的限制](/ch13#id335)
  - 跨区域, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 跨分片排序, [分片](/ch8#sharding), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner), [使用共享日志](/ch10#sec_consistency_smr), [多分区请求处理](/ch13#id360)
  - 将请求路由到分片, [请求路由](/ch7#sec_sharding_routing)
  - 服务, [锁定与领导者选举](/ch10#locking-and-leader-election), [协调服务](/ch10#sec_consistency_coordination)-[服务发现](/ch10#service-discovery)
- 协调者, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
  - 失效, [协调器故障](/ch8#coordinator-failure)
  - in XA transactions, [XA 事务](/ch8#xa-transactions)-[XA 事务的问题](/ch8#problems-with-xa-transactions)
  - 恢复, [从协调器故障中恢复](/ch8#recovering-from-coordinator-failure)
- 写时复制（B 树）, [B 树变体](/ch4#b-tree-variants), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
- 公共对象请求代理体系结构, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
- coronal mass ejection（见 solar storm）
- 正确性
  - 可审计性, [信任但验证](/ch13#sec_future_verification)-[用于可审计数据系统的工具](/ch13#id366)
  - 拜占庭断层承受力, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 处理部分失败, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 在基于日志的系统中, [强制约束](/ch13#sec_future_constraints)-[多分区请求处理](/ch13#id360)
  - 系统模型中的算法, [定义算法的正确性](/ch9#defining-the-correctness-of-an-algorithm)
  - 生成数据, [为可审计性而设计](/ch13#id365)
  - 不可变数据, [不可变事件的优点](/ch12#sec_stream_immutability_pros)
  - 个人资料, [责任与问责](/ch14#id371), [隐私与数据使用](/ch14#id457)
  - 时间, [不同拓扑的问题](/ch6#problems-with-different-topologies), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)-[用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 事务的, [一致性](/ch8#sec_transactions_acid_consistency), [追求正确性](/ch13#sec_future_correctness), [维护完整性，尽管软件有Bug](/ch13#id455)
  - 及时性和完整性, [及时性与完整性](/ch13#sec_future_integrity)-[无协调数据系统](/ch13#id454)
- 数据损坏
  - 检测, [端到端原则](/ch13#sec_future_e2e_argument), [不要盲目信任承诺](/ch13#id364)-[用于可审计数据系统的工具](/ch13#id366)
  - 由于病态内存访问, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - 辐射所致, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 由于脑裂, [领导者故障：故障转移](/ch6#leader-failure-failover), [分布式锁和租约](/ch9#sec_distributed_lock_fencing)
  - 由于事务隔离薄弱, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
  - 完整性作为不存在, [及时性与完整性](/ch13#sec_future_integrity)
  - 网络包, [弱形式的谎言](/ch9#weak-forms-of-lying)
  - 磁盘, [持久性](/ch8#durability)
  - 使用预写日志防止, [使 B 树可靠](/ch4#sec_storage_btree_wal)
  - 从中恢复, [批处理](/ch11#ch_batch), [不可变事件的优点](/ch12#sec_stream_immutability_pros)
- 余弦相似性（语义搜索）, [向量嵌入](/ch4#id92)
- Couchbase（数据库）
  - 文档数据模型, [关系模型与文档模型](/ch3#sec_datamodels_history)
  - 持久性, [全内存存储](/ch4#sec_storage_inmemory)
  - 哈希分片, [固定数量的分片](/ch7#fixed-number-of-shards)
  - 连接（JOIN）支持, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 再平衡, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
  - vBuckets（分片）, [分片](/ch7#ch_sharding)
- CouchDB（数据库）
  - 作为同步引擎, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
  - B 树存储, [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 解决冲突, [手动冲突解决](/ch6#manual-conflict-resolution)
- 耦合（松紧）, [可演化性：让变化更容易](/ch2#sec_introduction_evolvability)
- 覆盖索引, [在索引中存储值](/ch4#sec_storage_index_heap)
- CozoDB（数据库）, [Datalog：递归关系查询](/ch3#id62)
- CPUs
  - 缓存一致性和内存障碍, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 缓存与流水线, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 计算错误的结果, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - SIMD instructions, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- 崩溃停止与崩溃恢复故障, [系统模型与现实](/ch9#sec_distributed_system_model)
- CRDTs（见 conflict-free replicated datatypes）
- CREATE INDEX statement (SQL), [多列索引与二级索引](/ch4#sec_storage_index_multicolumn), [创建索引](/ch13#id340)
- 信用评级机构, [责任与问责](/ch14#id371)
- 加密粉碎（crypto-shredding）, [事件溯源与 CQRS](/ch3#sec_datamodels_events), [不变性的局限性](/ch12#sec_stream_immutability_limitations)
- 加密货币, [总结](/ch3#summary)
- 密码学
  - 防御攻击者, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 端到端加密和认证, [端到端原则](/ch13#sec_future_e2e_argument)
- CSV (comma-separated values), [OLTP 系统的存储与索引](/ch4#sec_storage_oltp), [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
- Curator (ZooKeeper recipes), [锁定与领导者选举](/ch10#locking-and-leader-election), [将工作分配给节点](/ch10#allocating-work-to-nodes)
- Cypher（查询语言）, [Cypher 查询语言](/ch3#id57)
  - comparison to SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)

### D

- Daft（处理框架）
  - DataFrames, [DataFrames](/ch11#id287)
  - 混洗数据, [混洗数据](/ch11#sec_shuffle)
- Dagster（工作流调度器）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows), [批处理](/ch11#ch_batch), [工作流调度](/ch11#sec_batch_workflows)
  - 云数据仓库集成, [查询语言](/ch11#sec_batch_query_lanauges)
- 仪表板（业务情报）, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
- Dask（处理框架）, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 数据目录, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 数据连接器, [数据仓库](/ch1#sec_introduction_dwh)
- 数据合同, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
  - 数据变更捕获, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
- data corruption（见 corruption of data）
- 数据立方体, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
- 数据工程, [分析型与事务型系统](/ch1#sec_introduction_analytics)
- 数据结构, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- data formats（见 编码）
- 数据基础设施, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- 数据集成, [数据集成](/ch13#sec_future_integration)-[统一批处理和流处理](/ch13#id338), [本章小结](/ch13#id367)
  - 批量和流处理, [批处理与流处理](/ch13#sec_future_batch_streaming)-[统一批处理和流处理](/ch13#id338)
    - 保持衍生状态, [维护派生状态](/ch13#id446)
    - 重新处理数据, [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
    - 统一, [统一批处理和流处理](/ch13#id338)
  - 通过分拆数据库, [分拆数据库](/ch13#sec_future_unbundling)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
    - 与联邦数据库的比较, [一切的元数据库](/ch13#id341)
  - 通过生成数据合并工具, [组合使用派生数据的工具](/ch13#id442)-[排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
    - 衍生数据与分布式事务, [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions)
    - 全序的限制, [全序的限制](/ch13#id335)
    - 排序事件以捕获因果关系, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
    - 关于数据流的推理, [理解数据流](/ch13#id443)
  - 需求, [记录系统与派生数据](/ch1#sec_introduction_derived)
  - 使用批量处理, [批处理](/ch11#ch_batch), [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 数据湖, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
  - 湖仓一体（data lakehouse）, [云数据仓库](/ch4#sec_cloud_data_warehouses), [分析（Analytics）](/ch11#sec_batch_olap)
- data locality（见 局部性）
- 数据网格, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 数据最小化, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [立法与自律](/ch14#sec_future_legislation)
- 数据模型, [数据模型与查询语言](/ch3#ch_datamodels)-[总结](/ch3#summary)
  - DataFrames and arrays, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 图状数据模型, [图数据模型](/ch3#sec_datamodels_graph)-[GraphQL](/ch3#id63)
    - Datalog 语言, [Datalog：递归关系查询](/ch3#id62)-[Datalog：递归关系查询](/ch3#id62)
    - 属性图, [属性图](/ch3#id56)
    - RDF and triple-stores, [三元组存储与 SPARQL](/ch3#id59)-[SPARQL 查询语言](/ch3#the-sparql-query-language)
  - 关系模型对文档模型, [关系模型与文档模型](/ch3#sec_datamodels_history)-[文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 支持多个, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 数据管道, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [记录系统与派生数据](/ch1#sec_introduction_derived), [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 数据产品, [超越数据湖](/ch1#beyond-the-data-lake)
- data protection regulations（见 GDPR）
- 数据驻留法律, [分布式与单节点系统](/ch1#sec_introduction_distributed), [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 数据科学, [分析型与事务型系统](/ch1#sec_introduction_analytics), [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- 数据仓, [数据仓库](/ch1#sec_introduction_dwh)
- 数据系统
  - 正确性、制约因素和完整性, [追求正确性](/ch13#sec_future_correctness)-[用于可审计数据系统的工具](/ch13#id366)
  - 数据集成, [数据集成](/ch13#sec_future_integration)-[统一批处理和流处理](/ch13#id338)
  - 使用目标, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
  - 多样性, 保持同步, [保持系统同步](/ch12#sec_stream_sync)
  - 可维护性, [可运维性](/ch2#sec_introduction_maintainability)-[可演化性：让变化更容易](/ch2#sec_introduction_evolvability)
  - 可能的错误, [事务](/ch8#ch_transactions)
  - 可靠性, [可靠性与容错](/ch2#sec_introduction_reliability)-[人类与可靠性](/ch2#id31)
    - 硬件故障, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
    - 人类错误, [人类与可靠性](/ch2#id31)
    - 重要性, [人类与可靠性](/ch2#id31)
    - 软件故障, [软件故障](/ch2#software-faults)
  - 可伸缩性, [可伸缩性](/ch2#sec_introduction_scalability)-[可伸缩性原则](/ch2#id35)
  - 分拆数据库, [分拆数据库](/ch13#sec_future_unbundling)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
  - 不可靠的时钟, [不可靠的时钟](/ch9#sec_distributed_clocks)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
- 数据仓库, [数据仓库](/ch1#sec_introduction_dwh), [术语表](/glossary)
  - 基于云的解决办法, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - ETL, [数据仓库](/ch1#sec_introduction_dwh), [保持系统同步](/ch12#sec_stream_sync)
  - 用于批处理, [批处理](/ch11#ch_batch)
  - 保持数据系统的同步, [保持系统同步](/ch12#sec_stream_sync)
  - 模式设计, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
  - 分片与聚集, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 缓慢变化的维度, [连接的时间依赖性](/ch12#sec_stream_join_time)
- 数据密集型应用, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- 数据库管理员, [云时代的运维](/ch1#sec_introduction_operations)
- 内部分布式事务, [跨不同系统的分布式事务](/ch8#sec_transactions_xa), [数据库内部的分布式事务](/ch8#sec_transactions_internal), [原子提交再现](/ch12#sec_stream_atomic_commit)
- 数据库
  - 归档存储, [归档存储](/ch5#archival-storage)
  - 与消息代理的比较, [消息代理与数据库的对比](/ch12#id297)
  - 数据流, [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)
  - 端到端原则, [端到端原则](/ch13#sec_future_e2e_argument)-[在数据系统中应用端到端思考](/ch13#id357)
    - 检查完整性, [端到端原则重现](/ch13#id456)
  - 与事件流的关系, [数据库与流](/ch12#sec_stream_databases)-[不变性的局限性](/ch12#sec_stream_immutability_limitations)
    - （另见 changelogs）
    - 变更流的 API 支持, [变更流的 API 支持](/ch12#sec_stream_change_api), [应用代码和状态的分离](/ch13#id344)
    - 数据变更捕获, [数据变更捕获](/ch12#sec_stream_cdc)-[变更流的 API 支持](/ch12#sec_stream_change_api)
    - 事件溯源, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
    - 保持系统同步, [保持系统同步](/ch12#sec_stream_sync)-[保持系统同步](/ch12#sec_stream_sync)
    - 不可改变事件哲学, [状态、流和不变性](/ch12#sec_stream_immutability)-[不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 分拆, [分拆数据库](/ch13#sec_future_unbundling)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
    - 构建数据存储技术, [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
    - 围绕数据流设计应用程序, [围绕数据流设计应用](/ch13#sec_future_dataflow)-[流处理器和服务](/ch13#id345)
    - 观察导出状态, [观察派生数据状态](/ch13#sec_future_observing)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
- 数据中心
  - 失败, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - geographically distributed（见 regions (geographic distribution)）
  - 多种使用和共享资源, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 网络架构, [云计算与超级计算](/ch1#id17)
  - 网络故障, [实践中的网络故障](/ch9#sec_distributed_network_faults)
- 数据流动, [数据流的模式](/ch5#sec_encoding_dataflow)-[分布式 actor 框架](/ch5#distributed-actor-frameworks), [围绕数据流设计应用](/ch13#sec_future_dataflow)-[流处理器和服务](/ch13#id345)
  - 数据流系统的正确性, [数据流系统的正确性](/ch13#id453)
  - 数据流引擎, [数据流引擎](/ch11#sec_batch_dataflow)
    - 与流处理的比较, [流处理](/ch12#sec_stream_processing)
    - DataFrames, [DataFrames](/ch11#id287)
    - 批次处理框架中的支持, [批处理](/ch11#ch_batch)
  - 事件驱动, [事件驱动的架构](/ch5#sec_encoding_dataflow_msg)-[分布式 actor 框架](/ch5#distributed-actor-frameworks)
  - 关于, [理解数据流](/ch13#id443)
  - 通过数据库, [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)
  - 通过服务, [流经服务的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)-[RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - workflow engines（见 workflow engines）
- DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 执行, [DataFrames](/ch11#id287)
  - 分批处理, [DataFrames](/ch11#id287)
  - 在笔记本中, [机器学习](/ch11#id290)
  - 批次处理框架中的支持, [批处理](/ch11#ch_batch)
- DataFusion（查询引擎）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- Datalog（查询语言）, [Datalog：递归关系查询](/ch3#id62)-[Datalog：递归关系查询](/ch3#id62)
- Datastream（数据变更捕获）, [变更流的 API 支持](/ch12#sec_stream_change_api)
- 数据类型
  - binary strings in XML and JSON, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
  - 无冲突, [CRDT 与操作变换](/ch6#sec_replication_crdts)
  - 在 Avro 编码中, [Avro](/ch5#sec_encoding_avro)
  - 在协议缓冲中, [字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
  - numbers in XML and JSON, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
- 日期和日期, [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- Datomic（数据库）
  - B 树存储, [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 数据模型, [图数据模型](/ch3#sec_datamodels_graph), [三元组存储与 SPARQL](/ch3#id59)
  - Datalog 查询语言, [Datalog：递归关系查询](/ch3#id62)
  - 切除, [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 事务语言, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 事务的串行执行, [实际串行执行](/ch8#sec_transactions_serial)
- Daylight Saving Time (DST), [日历时钟](/ch9#time-of-day-clocks)
- Db2（数据库）
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307)
- DBA (database administrator), [云时代的运维](/ch1#sec_introduction_operations)
- 死锁, [显式锁定](/ch8#explicit-locking)
  - 检测, 分布式事务, [XA 事务的问题](/ch8#problems-with-xa-transactions)
  - in two-phase locking (2PL), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
- Debezium（数据变更捕获）, [数据变更捕获的实现](/ch12#id307)
  - 用于 Cassandra, [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 数据集成, [分拆系统与集成系统](/ch13#id448)
- 声明式语言, [数据模型与查询语言](/ch3#ch_datamodels), [术语表](/glossary)
  - 与同步引擎, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
  - Datalog, [Datalog：递归关系查询](/ch3#id62)
  - 文档数据库中, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - recursive SQL queries, [SQL 中的图查询](/ch3#id58)
  - SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- DeepSeek
  - 3FS（见 3FS）
- 延迟
  - 有界网络延迟, [同步与异步网络](/ch9#sec_distributed_sync_networks)
  - 有界进程暂停, [响应时间保证](/ch9#sec_distributed_clocks_realtime)
  - 无界网络延迟, [超时和无界延迟](/ch9#sec_distributed_queueing)
  - 无界进程暂停, [进程暂停](/ch9#sec_distributed_clocks_pauses)
- 删除数据, [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - in LSM storage, [磁盘空间使用](/ch4#disk-space-usage)
  - 法律依据, [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- Delta Lake（表格式）, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 分片与聚簇, [按哈希范围分片](/ch7#sharding-by-hash-range)
- 非军事区（联网）, [对外提供派生数据](/ch11#sec_batch_serving_derived)
- 反规范化（数据表示）, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)-[多对一与多对多关系](/ch3#sec_datamodels_many_to_many), [术语表](/glossary)
  - 在派生数据系统中, [记录系统与派生数据](/ch1#sec_introduction_derived)
  - in event sourcing/CQRS, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 社会网络案例研究, [社交网络案例研究中的反规范化](/ch3#denormalization-in-the-social-networking-case-study)
  - 物化视图, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
  - 更新派生数据, [单对象与多对象操作](/ch8#sec_transactions_multi_object), [多对象事务的需求](/ch8#sec_transactions_need), [组合使用派生数据的工具](/ch13#id442)
  - 相对于规范化, [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
- 派生数据, [记录系统与派生数据](/ch1#sec_introduction_derived), [流处理](/ch12#ch_stream), [术语表](/glossary)
  - 批处理, [批处理](/ch11#ch_batch)
  - 事件溯源与 CQRS, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 从变化数据抓取, [数据变更捕获的实现](/ch12#id307)
  - 通过日志维护导出状态, [数据库与流](/ch12#sec_stream_databases)-[变更流的 API 支持](/ch12#sec_stream_change_api), [状态、流和不变性](/ch12#sec_stream_immutability)-[并发控制](/ch12#sec_stream_concurrency)
  - 通过对流的订阅来观察, [端到端的事件流](/ch13#id349)
  - 批量和流处理的产出, [批处理与流处理](/ch13#sec_future_batch_streaming)
  - 通过应用程序代码, [应用代码作为派生函数](/ch13#sec_future_dataflow_derivation)
  - 相对于分布式事务, [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions)
- 设计模式, [简单性：管理复杂度](/ch2#id38)
- 确定性操作, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs), [故障与部分失效](/ch9#sec_distributed_partial_failure), [术语表](/glossary)
  - 与幂等性, [幂等性](/ch12#sec_stream_idempotence), [理解数据流](/ch13#id443)
  - 计算派生数据, [维护派生状态](/ch13#id446), [数据流系统的正确性](/ch13#id453), [为可审计性而设计](/ch13#id365)
  - 在事件溯源中, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 状态机复制, [使用共享日志](/ch10#sec_consistency_smr), [数据库与流](/ch12#sec_stream_databases)
  - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)
  - 测试中, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 连接, [连接的时间依赖性](/ch12#sec_stream_join_time)
  - 使代码确定性, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 概览, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- 确定性模拟测试（DST）, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- DevOps, [云时代的运维](/ch1#sec_introduction_operations)
- 维度表, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- dimensional modeling（见 star schemas）
- directed acyclic graphs (DAG)
  - 工作流程, [工作流调度](/ch11#sec_batch_workflows)
    - （另见 workflow engines）
- 脏读, [没有脏读](/ch8#no-dirty-reads)
- 脏写（事务隔离）, [没有脏写](/ch8#sec_transactions_dirty_write)
- 分离（disaggregation）
  - 存储和计算, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- Discord（群聊）
  - GraphQL example, [GraphQL](/ch3#id63)
- 歧视, [偏见与歧视](/ch14#id370)
- disks（见 hard disks）
- 分布式 actor 框架, [分布式 actor 框架](/ch5#distributed-actor-frameworks)
- 分布式文件系统, [分布式文件系统](/ch11#sec_batch_dfs)-[分布式文件系统](/ch11#sec_batch_dfs)
  - 比较对象存储, [对象存储](/ch11#id277)
  - 由 Flink 使用, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
- 分布式账本, [总结](/ch3#summary)
- 分布式系统, [分布式系统的麻烦](/ch9#ch_distributed)-[总结](/ch9#summary), [术语表](/glossary)
  - 拜占庭故障, [拜占庭故障](/ch9#sec_distributed_byzantine)-[弱形式的谎言](/ch9#weak-forms-of-lying)
  - 检测网络故障, [检测故障](/ch9#id307)
  - 故障与部分失效, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 共识的形式化, [单值共识](/ch10#single-value-consensus)
  - 不可能性结果, [CAP 定理](/ch10#the-cap-theorem), [共识](/ch10#sec_consistency_consensus)
  - 故障切换的问题, [领导者故障：故障转移](/ch6#leader-failure-failover)
  - multi-region（见 regions (geographic distribution)）
  - 网络问题, [不可靠的网络](/ch9#sec_distributed_networks)-[我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
  - 问题, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems)
  - 法定人数,依赖, [多数派原则](/ch9#sec_distributed_majority)
  - 使用原因, [分布式与单节点系统](/ch1#sec_introduction_distributed), [复制](/ch6#ch_replication)
  - 同步时钟, 依赖, [对同步时钟的依赖](/ch9#sec_distributed_clocks_relying)-[用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 系统模型, [系统模型与现实](/ch9#sec_distributed_system_model)-[确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 使用时钟和时间, [不可靠的时钟](/ch9#sec_distributed_clocks)
- distributed transactions（见 transactions）
- Django（网络框架）, [处理错误和中止](/ch8#handling-errors-and-aborts)
- DMZ (demilitarized zone), [对外提供派生数据](/ch11#sec_batch_serving_derived)
- DNS (Domain Name System), [请求路由](/ch7#sec_sharding_routing), [服务发现](/ch10#service-discovery)
  - 用于负载平衡, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
- Docker（容器管理器）, [应用代码和状态的分离](/ch13#id344)
- 文档数据模型, [关系模型与文档模型](/ch3#sec_datamodels_history)-[文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 比较关系模式, [何时使用哪种模型](/ch3#sec_datamodels_document_summary)-[文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 多对象事务, 需要, [多对象事务的需求](/ch8#sec_transactions_need)
  - 分片二级索引, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)
  - 相对关系模式
    - 模式的趋同, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
    - 数据位置, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
- document-partitioned indexes（见 local secondary indexes）
- 领域驱动设计, [简单性：管理复杂度](/ch2#id38), [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 带点版本向量, [版本向量](/ch6#version-vectors)
- 复式记账, [总结](/ch3#summary)
- DRBD (Distributed Replicated Block Device), [单主复制](/ch6#sec_replication_leader)
- 漂移（时钟）, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- Druid（数据库）, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [列式存储](/ch4#sec_storage_column), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
  - 处理写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
  - 预汇总, [分析（Analytics）](/ch11#sec_batch_olap)
  - 服务衍生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
- Dryad（数据流引擎）, [数据流引擎](/ch11#sec_batch_dataflow)
- 双写、 问题, [保持系统同步](/ch12#sec_stream_sync)
- DuckDB（数据库）, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems), [压实策略](/ch4#sec_storage_lsm_compaction)
  - 面向列的存储, [列式存储](/ch4#sec_storage_column)
  - 用于 ETL, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 重复数据，抑制, [抑制重复](/ch13#id354)
  - （另见 幂等性）
  - using a unique ID, [操作标识符](/ch13#id355), [多分区请求处理](/ch13#id360)
- 持久性, [使 B 树可靠](/ch4#sec_storage_btree_wal), [持久性](/ch8#durability), [术语表](/glossary)
- 持久执行, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
  - 依赖确定性, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - Restate（见 Restate (workflow engine)）
  - Temporal（见 Temporal (workflow engine)）
- durable functions（见 workflow engines）
- 持续时间（时间）, [不可靠的时钟](/ch9#sec_distributed_clocks)
  - 用单调时钟测量, [单调时钟](/ch9#monotonic-clocks)
- 动态类型语言
  - 类比于读时模式, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
- Dynamo（数据库）, [无主复制](/ch6#sec_replication_leaderless)
- Dynamo-style databases（见 leaderless replication）
- DynamoDB（数据库）
  - 自动缩放, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
  - 哈希分片, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 分片二级索引, [全局二级索引](/ch7#id167)

### E

- EBS（虚拟块设备）, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
  - 比较对象存储, [设置新的副本](/ch6#sec_replication_new_replica)
- ECC（见 error-correcting codes）
- EDB Postgres Distributed（数据库）, [跨地域运行](/ch6#sec_replication_multi_dc)
- 边（图）, [图数据模型](/ch3#sec_datamodels_graph)
  - 属性图模型, [属性图](/ch3#id56)
- 编辑距离（全文搜索）, [全文检索](/ch4#sec_storage_full_text)
- 有效一次语义, [容错](/ch12#sec_stream_fault_tolerance), [恰好执行一次操作](/ch13#id353)
  - （另见 恰好一次语义）
  - 维护完整性, [数据流系统的正确性](/ch13#id453)
- Elastic Compute Cloud (EC2)
  - 竞价实例, [故障处理](/ch11#id281)
- 弹性, [分布式与单节点系统](/ch1#sec_introduction_distributed)
  - 云数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses), [查询语言](/ch11#sec_batch_query_lanauges)
- Elasticsearch（搜索服务器）
  - 本地二级索引, [本地二级索引](/ch7#id166)
  - percolator（流搜索）, [在流上搜索](/ch12#id320)
  - 对外提供派生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
  - 分片再平衡, [固定数量的分片](/ch7#fixed-number-of-shards)
  - 使用 Lucene, [全文检索](/ch4#sec_storage_full_text)
- Elm（编程语言）, [端到端的事件流](/ch13#id349)
- ELT (extract-load-transform), [数据仓库](/ch1#sec_introduction_dwh)
  - 与批量处理的关系, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- 易并行（算法）
  - 提取-转换-加载（ETL）（见 ETL）
  - MapReduce, [MapReduce](/ch11#sec_batch_mapreduce)
    - （另见 MapReduce）
- 嵌入式存储引擎, [压实策略](/ch4#sec_storage_lsm_compaction)
- 嵌入（向量表示）, [向量嵌入](/ch4#id92)
- 编码（数据格式）, [编码与演化](/ch5#ch_encoding)-[模式的优点](/ch5#sec_encoding_schemas)
  - Avro, [Avro](/ch5#sec_encoding_avro)-[动态生成的模式](/ch5#dynamically-generated-schemas)
  - binary variants of JSON and XML, [二进制编码](/ch5#binary-encoding)
  - 兼容性, [编码与演化](/ch5#ch_encoding)
    - 调用服务, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
    - 使用数据库, [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)-[归档存储](/ch5#archival-storage)
  - 定义, [编码数据的格式](/ch5#sec_encoding_formats)
  - JSON, XML, and CSV, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
  - 语言特定格式, [特定语言的格式](/ch5#id96)
  - 模式的优点, [模式的优点](/ch5#sec_encoding_schemas)
  - Protocol Buffers, [Protocol Buffers](/ch5#sec_encoding_protobuf)-[字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
  - 数据的表示, [编码数据的格式](/ch5#sec_encoding_formats)
- 端到端原则, [端到端原则](/ch13#sec_future_e2e_argument)-[在数据系统中应用端到端思考](/ch13#id357)
  - 检查完整性, [端到端原则重现](/ch13#id456)
  - 发布/订阅流, [端到端的事件流](/ch13#id349)
- 扩充（流）, [流表连接（流扩充）](/ch12#sec_stream_table_joins)
- Enterprise JavaBeans (EJB), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
- 企业软件, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- entities（见 vertices）
- 临时存储, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- 纪元（共识算法）, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
- 纪元（Unix 时间戳）, [日历时钟](/ch9#time-of-day-clocks)
- 纠删码（纠错）, [分布式文件系统](/ch11#sec_batch_dfs)
- 错误处理
  - 网络故障, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 事务中, [处理错误和中止](/ch8#handling-errors-and-aborts)
- 纠错码, [硬件与软件故障](/ch2#sec_introduction_hardware_faults), [分布式文件系统](/ch11#sec_batch_dfs)
- Esper (CEP engine), [复合事件处理](/ch12#id317)
- 本质复杂性, [简单性：管理复杂度](/ch2#id38)
- etcd（协调服务）, [协调服务](/ch10#sec_consistency_coordination)-[服务发现](/ch10#service-discovery)
  - 生成隔离令牌, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens), [协调服务](/ch10#sec_consistency_coordination)
  - 线性一致性操作, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable), [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 锁与领导者选举, [锁定与领导者选举](/ch10#locking-and-leader-election)
  - 用于服务发现, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery), [服务发现](/ch10#service-discovery)
  - 用于分片分配, [请求路由](/ch7#sec_sharding_routing)
  - 使用 Raft 算法, [单主复制](/ch6#sec_replication_leader)
- 以太坊（区块链）, [用于可审计数据系统的工具](/ch13#id366)
- 以太网（网络）, [云计算与超级计算](/ch1#id17), [不可靠的网络](/ch9#sec_distributed_networks), [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
  - 数据包校验和, [弱形式的谎言](/ch9#weak-forms-of-lying), [端到端原则](/ch13#sec_future_e2e_argument)
- 道德操守, [将事情做正确](/ch14)-[立法与自律](/ch14#sec_future_legislation)
  - 道德守则和专业实务, [将事情做正确](/ch14)
  - 立法和自律, [立法与自律](/ch14#sec_future_legislation)
  - 预测分析, [预测分析](/ch14#id369)-[反馈回路](/ch14#id372)
    - 扩大偏见, [偏见与歧视](/ch14#id370)
    - 反馈循环, [反馈回路](/ch14#id372)
  - 隐私和跟踪, [隐私与追踪](/ch14#id373)-[立法与自律](/ch14#sec_future_legislation)
    - 同意和选择自由, [同意与选择自由](/ch14#id375)
    - 数据作为资产和权力, [数据作为资产与权力](/ch14#id376)
    - 隐私的含义, [隐私与数据使用](/ch14#id457)
    - 监视, [监视](/ch14#id374)
  - 尊重、尊严与自主性, [立法与自律](/ch14#sec_future_legislation)
  - 意外后果, [将事情做正确](/ch14), [反馈回路](/ch14#id372)
- ETL, [数据仓库](/ch1#sec_introduction_dwh), [保持系统同步](/ch12#sec_stream_sync), [术语表](/glossary)
  - 与批量处理的关系, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)-[提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
  - 使用批量处理, [批处理](/ch11#ch_batch)
- 欧几里得距离（语义搜索）, [向量嵌入](/ch4#id92)
- European Union
  - AI Act（见 AI Act）
  - GDPR（见 GDPR）
- 事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)-[事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 与数据变更捕获, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
  - 与数据变更捕获的比较, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
  - 不可更改性和可审计性, [状态、流和不变性](/ch12#sec_stream_immutability), [为可审计性而设计](/ch13#id365)
  - 大型可靠数据系统, [操作标识符](/ch13#id355), [数据流系统的正确性](/ch13#id453)
  - 依赖决定性因素, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- event streams（见 streams）
- 事件驱动的架构, [事件驱动的架构](/ch5#sec_encoding_dataflow_msg)-[分布式 actor 框架](/ch5#distributed-actor-frameworks)
  - 分布式行为者框架, [分布式 actor 框架](/ch5#distributed-actor-frameworks)
- 事件, [传递事件流](/ch12#sec_stream_transmit)
  - 决定总顺序, [全序的限制](/ch13#id335)
  - 从事件日志中得出看法, [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
  - 事件时间与处理时间, [事件时间与处理时间](/ch12#id322), [微批次与存档点](/ch12#id329), [统一批处理和流处理](/ch13#id338)
  - 不可改变的优点, [不可变事件的优点](/ch12#sec_stream_immutability_pros), [为可审计性而设计](/ch13#id365)
  - 排序以捕获因果关系, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 读也是事件, [读也是事件](/ch13#sec_future_read_events)
  - 滞留事件, [处理滞留事件](/ch12#id323)
  - 流处理中的时间戳, [你用的是谁的时钟？](/ch12#id438)
- EventSource (browser API), [将状态变更推送给客户端](/ch13#id348)
- EventStoreDB（数据库）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 最终一致性, [复制](/ch6#ch_replication), [复制延迟的问题](/ch6#sec_replication_lag), [安全性与活性](/ch9#sec_distributed_safety_liveness)
  - （另见 conflicts）
  - 和长期不一致, [及时性与完整性](/ch13#sec_future_integrity)
  - 强最终一致性, [自动冲突解决](/ch6#automatic-conflict-resolution)
- 证据
  - 数据用作, [人类与可靠性](/ch2#id31)
- 可演化性, [可演化性：让变化更容易](/ch2#sec_introduction_evolvability), [编码与演化](/ch5#ch_encoding)
  - 调用服务, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - 事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 图表结构数据, [属性图](/ch3#id56)
  - 数据库, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility), [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)-[归档存储](/ch5#archival-storage), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views), [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
  - 重新处理数据, [应用演化后重新处理数据](/ch13#sec_future_reprocessing), [统一批处理和流处理](/ch13#id338)
  - Avro 的模式演化, [写入者模式与读取者模式](/ch5#the-writers-schema-and-the-readers-schema)
  - Protocol Buffers 的模式演化, [字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
  - 读时模式, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility), [编码与演化](/ch5#ch_encoding), [模式的优点](/ch5#sec_encoding_schemas)
- 恰好一次语义, [恰好一次消息处理](/ch8#sec_transactions_exactly_once), [再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited), [容错](/ch12#sec_stream_fault_tolerance), [恰好执行一次操作](/ch13#id353)
  - 与批量处理器对等, [统一批处理和流处理](/ch13#id338)
  - 维护完整性, [数据流系统的正确性](/ch13#id453)
  - 使用持久执行, [持久化执行](/ch5#durable-execution)
- 独占模式, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
- 指数退避, [描述性能](/ch2#sec_introduction_percentiles), [处理错误和中止](/ch8#handling-errors-and-aborts)
- ext4 (file system), [分布式文件系统](/ch11#sec_batch_dfs)
- eXtended Architecture transactions（见 XA 事务）
- 提取-转换-加载（见 ETL）

### F

- 脸书
  - Faiss（向量索引）, [向量嵌入](/ch4#id92)
  - React（用户界面库）, [端到端的事件流](/ch13#id349)
  - 社交图谱, [图数据模型](/ch3#sec_datamodels_graph)
- 事实
  - 事实表（星型模式）, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
  - 在 Datalog 中, [Datalog：递归关系查询](/ch3#id62)
  - 在事件溯源中, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 慢故障, [系统模型与现实](/ch9#sec_distributed_system_model)
- 失败停止模式, [系统模型与现实](/ch9#sec_distributed_system_model)
- 故障切换, [领导者故障：故障转移](/ch6#leader-failure-failover), [术语表](/glossary)
  - （另见 基于领导者的复制）
  - 无领导复制,没有, [当节点故障时写入数据库](/ch6#id287)
  - 领袖选举, [分布式锁和租约](/ch9#sec_distributed_lock_fencing), [共识](/ch10#sec_consistency_consensus), [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - 潜在问题, [领导者故障：故障转移](/ch6#leader-failure-failover)
- 失败
  - 通过经销事务扩充, [维护派生状态](/ch13#id446)
  - 检测失败, [检测故障](/ch9#id307)
    - 自动再平衡导致连锁故障, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
    - 超时和无限制延误, [超时和无界延迟](/ch9#sec_distributed_queueing), [网络拥塞和排队](/ch9#network-congestion-and-queueing)
    - 使用协调服务, [协调服务](/ch10#sec_consistency_coordination)
  - 与故障的区别, [可靠性与容错](/ch2#sec_introduction_reliability)
  - 部分失败, [故障与部分失效](/ch9#sec_distributed_partial_failure), [总结](/ch9#summary)
- Faiss（向量索引）, [向量嵌入](/ch4#id92)
- 假阳性（Bloom 过滤器）, [布隆过滤器](/ch4#bloom-filters)
- 扇出, [时间线的物化与更新](/ch2#sec_introduction_materializing), [多个消费者](/ch12#id298)
- 故障注入, [容错](/ch2#id27), [实践中的网络故障](/ch9#sec_distributed_network_faults), [故障注入](/ch9#sec_fault_injection)
- 故障隔离, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 容错, [可靠性与容错](/ch2#sec_introduction_reliability)-[人类与可靠性](/ch2#id31), [术语表](/glossary)
  - 共识的形式化, [单值共识](/ch10#single-value-consensus)
  - 容忍人为失误, [批处理](/ch11#ch_batch)
  - 分批处理, [故障处理](/ch11#id281)
  - 在基于日志的系统中, [在数据系统中应用端到端思考](/ch13#id357), [及时性与完整性](/ch13#sec_future_integrity)-[数据流系统的正确性](/ch13#id453)
  - 在流处理中, [容错](/ch12#sec_stream_fault_tolerance)-[失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
    - 原子提交, [原子提交再现](/ch12#sec_stream_atomic_commit)
    - 幂等性, [幂等性](/ch12#sec_stream_idempotence)
    - 保持衍生状态, [维护派生状态](/ch13#id446)
    - 微批次与存档点, [微批次与存档点](/ch12#id329)
    - 失败后重建状态, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 分布式事务, [XA 事务](/ch8#xa-transactions)-[再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
  - 基于领导和无领导者的复制, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 事务原子性, [原子性](/ch8#sec_transactions_acid_atomicity), [分布式事务](/ch8#sec_transactions_distributed)-[恰好一次消息处理](/ch8#sec_transactions_exactly_once)
- 故障
  - 拜占庭故障, [拜占庭故障](/ch9#sec_distributed_byzantine)-[弱形式的谎言](/ch9#weak-forms-of-lying)
  - 与失效的区别, [可靠性与容错](/ch2#sec_introduction_reliability)
  - 事务处理, [事务](/ch8#ch_transactions)
  - 超级计算机和云计算处理, [云计算与超级计算](/ch1#id17)
  - 硬件, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - 在分布式系统中, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - introducing deliberately（见 fault injection）
  - 网络故障, [实践中的网络故障](/ch9#sec_distributed_network_faults)-[检测故障](/ch9#id307)
    - 非对称故障, [多数派原则](/ch9#sec_distributed_majority)
    - 检测, [检测故障](/ch9#id307)
    - 容忍，在多主复制中, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 软件故障, [软件故障](/ch2#software-faults)
  - tolerating（见 fault tolerance）
- 特性工程（机器学习）, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- 联邦数据库, [一切的元数据库](/ch13#id341)
- Feldera（数据库）
  - 增量视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
- 围栏, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
- 屏障, [领导者故障：故障转移](/ch6#leader-failure-failover), [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)-[多副本隔离](/ch9#fencing-with-multiple-replicas)
  - 生成隔离令牌, [使用共享日志](/ch10#sec_consistency_smr), [协调服务](/ch10#sec_consistency_coordination)
  - 隔离令牌的属性, [定义算法的正确性](/ch9#defining-the-correctness-of-an-algorithm)
  - 流处理器写入数据库, [幂等性](/ch12#sec_stream_idempotence), [恰好执行一次操作](/ch13#id353)
- 获取和添加
  - 与协商一致的关系, [获取并增加作为共识](/ch10#fetch-and-add-as-consensus)
- 光纤通道（网络）, [分布式文件系统](/ch11#sec_batch_dfs)
- 字段标记（协议缓冲）, [Protocol Buffers](/ch5#sec_encoding_protobuf)-[字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
- Figma （图形软件）, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- filesystem in userspace (FUSE), [设置新的副本](/ch6#sec_replication_new_replica), [分布式文件系统](/ch11#sec_batch_dfs)
  - 在对象存储中, [对象存储](/ch11#id277)
- 财务数据
  - 会计分类账, [总结](/ch3#summary)
  - 不可改变性, [不可变事件的优点](/ch12#sec_stream_immutability_pros)
  - 时间序列数据, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- Fivetran, [数据仓库](/ch1#sec_introduction_dwh)
- FizzBee (specification language), [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 平面索引（向量索引）, [向量嵌入](/ch4#id92)
- FlatBuffers（数据格式）, [编码数据的格式](/ch5#sec_encoding_formats)
- Flink（处理框架）, [批处理](/ch11#ch_batch), [数据流引擎](/ch11#sec_batch_dataflow)
  - 成本效率, [查询语言](/ch11#sec_batch_query_lanauges)
  - DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes), [DataFrames](/ch11#id287)
  - 容错, [故障处理](/ch11#id281), [微批次与存档点](/ch12#id329), [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - FlinkML, [机器学习](/ch11#id290)
  - 数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [协调服务](/ch10#sec_consistency_coordination)
  - 集成批量和流处理, [统一批处理和流处理](/ch13#id338)
  - 查询优化器, [查询语言](/ch11#sec_batch_query_lanauges)
  - 混洗数据, [混洗数据](/ch11#sec_shuffle)
  - 流处理, [流分析](/ch12#id318)
  - streaming SQL support, [复合事件处理](/ch12#id317)
- 流量控制, [TCP 的局限性](/ch9#sec_distributed_tcp), [消息传递系统](/ch12#sec_stream_messaging), [术语表](/glossary)
- FLP result (on consensus), [共识](/ch10#sec_consistency_consensus)
- Flyte（工作流调度器）, [机器学习](/ch11#id290)
- 追随者, [单主复制](/ch6#sec_replication_leader), [术语表](/glossary)
  - （另见 基于领导者的复制）
- 形式化方法, [形式化方法和随机测试](/ch9#sec_distributed_formal)-[确定性模拟测试](/ch9#deterministic-simulation-testing)
- 向前兼容性, [编码与演化](/ch5#ch_encoding)
- 前向衰减（算法）, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- Fossil（版本控制系统）, [并发控制](/ch12#sec_stream_concurrency)
  - 避免, [不变性的局限性](/ch12#sec_stream_immutability_limitations)
- FoundationDB（数据库）
  - 一致性模式, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 确定性模拟测试, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 键范围分片, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 进程/核心模式, [分片的利与弊](/ch7#sec_sharding_reasons)
  - 可串行化事务, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi), [可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
  - 事务, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
- 分数索引, [何时使用哪种模型](/ch3#sec_datamodels_document_summary)
- 碎片化（B 树）, [磁盘空间使用](/ch4#disk-space-usage)
- 帧（计算机图形）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 前端 （网页开发）, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- FrostDB（数据库）
  - 确定性模拟测试（DST）, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- fsync （系统调用）, [使 B 树可靠](/ch4#sec_storage_btree_wal), [持久性](/ch8#durability)
- 全文检索, [全文检索](/ch4#sec_storage_full_text), [术语表](/glossary)
  - 与模糊索引, [全文检索](/ch4#sec_storage_full_text)
  - Lucene 存储引擎, [全文检索](/ch4#sec_storage_full_text)
  - 分片索引, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)
- Function as a Service (FaaS), [微服务与无服务器](/ch1#sec_introduction_microservices)
- 函数式编程
  - inspiration for MapReduce, [MapReduce](/ch11#sec_batch_mapreduce)
- 功能性需求, [定义非功能性需求](/ch2#ch_nonfunctional)
- FUSE（见 filesystem in userspace (FUSE)）
- 模糊测试, [形式化方法和随机测试](/ch9#sec_distributed_formal)
- fuzzy search（见 similarity search）

### G

- Gallina（规范语言）, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 游戏开发, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 垃圾收集
  - 不可改变性和, [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 进程暂停, [延迟与响应时间](/ch2#id23), [进程暂停](/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact), [多数派原则](/ch9#sec_distributed_majority)
    - （另见 process pauses）
- 加油站算法定价, [反馈回路](/ch14#id372)
- GDPR (regulation), [数据系统、法律与社会](/ch1#sec_introduction_compliance), [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 同意书, [同意与选择自由](/ch14#id375)
  - 数据最小化, [立法与自律](/ch14#sec_future_legislation)
  - 合法权益, [同意与选择自由](/ch14#id375)
  - 访问权, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
  - 删除权, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [磁盘空间使用](/ch4#disk-space-usage), [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- GenBank (genome database), [总结](/ch3#summary)
- General Data Protection Regulation（见 GDPR (regulation)）
- 基因组分析, [总结](/ch3#summary)
- geographic distribution（见 regions (geographic distribution)）
- 地理空间索引, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- Git（版本控制系统）, [并发控制](/ch12#sec_stream_concurrency)
  - 本地第一软件, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 合并冲突, [手动冲突解决](/ch6#manual-conflict-resolution)
- GitHub, postmortems, [领导者故障：故障转移](/ch6#leader-failure-failover), [领导者故障：故障转移](/ch6#leader-failure-failover), [将系统模型映射到现实世界](/ch9#mapping-system-models-to-the-real-world)
- 全局二级索引, [全局二级索引](/ch7#id167), [总结](/ch7#summary)
- globally unique identifiers（见 UUIDs）
- GlusterFS（分布式文件系统）, [批处理](/ch11#ch_batch), [分布式文件系统](/ch11#sec_batch_dfs), [对象存储](/ch11#id277)
- GNU Coreutils (Linux), [排序与内存聚合](/ch11#id275)
- Go（编程语言）
  - 垃圾收集, [限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
- GoldenGate (change data capture), [数据变更捕获的实现](/ch12#id307)
  - （另见 Oracle）
- 谷歌
  - BigQuery（见 BigQuery（数据库））
  - Bigtable（见 Bigtable（数据库））
  - Chubby（锁服务）, [协调服务](/ch10#sec_consistency_coordination)
  - Cloud Storage（对象存储）, [设置新的副本](/ch6#sec_replication_new_replica), [对象存储](/ch11#id277)
    - 请求先决条件, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
  - Compute Engine
    - 抢占式实例, [故障处理](/ch11#id281)
  - Dataflow（流处理）
    - 数据仓库集成, [云数据仓库](/ch4#sec_cloud_data_warehouses)
    - 混洗数据, [混洗数据](/ch11#sec_shuffle)
  - Dataflow（流处理器）, [流分析](/ch12#id318), [原子提交再现](/ch12#sec_stream_atomic_commit), [统一批处理和流处理](/ch13#id338)
    - （另见 Beam）
  - Datastream（数据变更捕获）, [变更流的 API 支持](/ch12#sec_stream_change_api)
  - Docs（协作编辑）, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDT 与操作变换](/ch6#sec_replication_crdts)
    - 操作转换, [CRDT 与操作变换](/ch6#sec_replication_crdts)
  - Dremel（查询引擎）, [列式存储](/ch4#sec_storage_column)
  - Firestore（数据库）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
  - MapReduce (batch processing), [批处理](/ch11#ch_batch)
    - （另见 MapReduce）
  - Percolator（事务系统）, [实现线性一致的 ID 生成器](/ch10#implementing-a-linearizable-id-generator)
  - 持久性磁盘（云服务）, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
  - Pub/Sub（消息系统）, [消息代理](/ch5#message-brokers), [消息代理与数据库的对比](/ch12#id297), [使用日志进行消息存储](/ch12#id300)
  - 响应时间研究, [平均值、中位数与百分位点](/ch2#id24)
  - Sheets（协作电子表格）, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps), [CRDT 与操作变换](/ch6#sec_replication_crdts)
  - Spanner（见 Spanner（数据库））
  - TrueTime (clock API), [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval)
- 流言协议, [请求路由](/ch7#sec_sharding_routing)
- 治理, [超越数据湖](/ch1#beyond-the-data-lake)
- 政府对数据的使用, [数据作为资产与权力](/ch14#id376)
- GPS (Global Positioning System)
  - 用于时钟同步, [不可靠的时钟](/ch9#sec_distributed_clocks), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy), [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
- GPT (language model), [向量嵌入](/ch4#id92)
- GPU (graphics processing unit), [云服务的分层](/ch1#layering-of-cloud-services), [分布式与单节点系统](/ch1#sec_introduction_distributed)
- gradual rollout（见 rolling upgrades）
- GraphQL（查询语言）, [GraphQL](/ch3#id63)
  - 验证, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
- 图, [术语表](/glossary)
  - 作为数据模型, [图数据模型](/ch3#sec_datamodels_graph)-[GraphQL](/ch3#id63)
    - 属性图, [属性图](/ch3#id56)
    - RDF and triple-stores, [三元组存储与 SPARQL](/ch3#id59)-[SPARQL 查询语言](/ch3#the-sparql-query-language)
  - DAGs（见 directed acyclic graphs）
  - 处理和分析, [机器学习](/ch11#id290)
  - 查询语言
    - Cypher, [Cypher 查询语言](/ch3#id57)
    - Datalog, [Datalog：递归关系查询](/ch3#id62)-[Datalog：递归关系查询](/ch3#id62)
    - GraphQL, [GraphQL](/ch3#id63)
    - Gremlin, [图数据模型](/ch3#sec_datamodels_graph)
    - recursive SQL queries, [SQL 中的图查询](/ch3#id58)
    - SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)-[SPARQL 查询语言](/ch3#the-sparql-query-language)
  - 遍历, [属性图](/ch3#id56)
- 灰色故障, [系统模型与现实](/ch9#sec_distributed_system_model)
  - 无领导复制, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
- Gremlin（图查询语言）, [图数据模型](/ch3#sec_datamodels_graph)
- grep（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis)
- gRPC (service calls), [微服务与无服务器](/ch1#sec_introduction_microservices), [Web 服务](/ch5#sec_web_services)
  - 前向和后向兼容性, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
- GUIDs（见 UUIDs）

### H

- Hadoop（数据基础设施）
  - 比较分布式数据库, [批处理](/ch11#ch_batch)
  - MapReduce（见 MapReduce）
  - NodeManager, [分布式作业编排](/ch11#id278)
  - YARN（见 YARN (job scheduler)）
- HANA（见 SAP HANA（数据库））
- 先发生关系（happens-before）, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
- 硬盘
  - 访问模式, [顺序与随机写入](/ch4#sidebar_sequential)
  - 检测损坏, [端到端原则](/ch13#sec_future_e2e_argument), [不要盲目信任承诺](/ch13#id364)
  - 其中的故障, [硬件与软件故障](/ch2#sec_introduction_hardware_faults), [持久性](/ch8#durability)
  - 顺序与随机写入, [顺序与随机写入](/ch4#sidebar_sequential)
  - 连续写入吞吐量, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- 硬件故障, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
- 散列函数
  - 在 Bloom 过滤器中, [布隆过滤器](/ch4#bloom-filters)
- 哈希连接
  - 在流处理中, [流表连接（流扩充）](/ch12#sec_stream_table_joins)
- 哈希分片, [按键的哈希分片](/ch7#sec_sharding_hash)-[一致性哈希](/ch7#sec_sharding_consistent_hashing), [总结](/ch7#summary)
  - 一致性哈希, [一致性哈希](/ch7#sec_sharding_consistent_hashing)
  - 哈希对 N 取模的问题, [哈希取模节点数](/ch7#hash-modulo-number-of-nodes)
  - 范围查询, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 合适的哈希函数, [按键的哈希分片](/ch7#sec_sharding_hash)
  - 使用固定数量的分片, [固定数量的分片](/ch7#fixed-number-of-shards)
- 哈希表, [日志结构存储](/ch4#sec_storage_log_structured)
- Hazelcast（内存数据网格）
  - FencedLock, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
  - Flake ID Generator, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
- HBase（数据库）
  - 由于缺乏围栏而出现错误, [分布式锁和租约](/ch9#sec_distributed_lock_fencing)
  - 键范围分片, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 日志结构存储, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 区域（分片）, [分片](/ch7#ch_sharding)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 大小分层压实, [压实策略](/ch4#sec_storage_lsm_compaction)
  - 宽列数据模型, [读写的数据局部性](/ch3#sec_datamodels_document_locality), [列压缩](/ch4#sec_storage_column_compression)
- HDFS (Hadoop Distributed File System), [批处理](/ch11#ch_batch), [分布式文件系统](/ch11#sec_batch_dfs)
  - （另见 distributed filesystems）
  - 检查数据完整性, [不要盲目信任承诺](/ch13#id364)
  - DataNode, [分布式文件系统](/ch11#sec_batch_dfs)
  - NameNode, [分布式文件系统](/ch11#sec_batch_dfs)
  - use in MapReduce, [MapReduce](/ch11#sec_batch_mapreduce)
  - 工作流程示例, [工作流调度](/ch11#sec_batch_workflows)
- HdrHistogram (numerical library), [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- head（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis), [分布式作业编排](/ch11#id278)
- 头顶点（属性图）, [属性图](/ch3#id56)
- 头部阻塞, [延迟与响应时间](/ch2#id23)
- 堆文件（数据库）, [在索引中存储值](/ch4#sec_storage_index_heap)
  - 多版本并发控制, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl)
- 热量管理, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
- 对冲请求, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
- 异构分布式事务, [跨不同系统的分布式事务](/ch8#sec_transactions_xa), [XA 事务的问题](/ch8#problems-with-xa-transactions)
- 启发式决策, [从协调器故障中恢复](/ch8#recovering-from-coordinator-failure)
- Hex（笔记本）, [机器学习](/ch11#id290)
- 六边形
  - 地理空间索引, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- Hibernate（对象关系映射器）, [对象关系映射（ORM）](/ch3#object-relational-mapping-orm)
- 层次模型, [关系模型与文档模型](/ch3#sec_datamodels_history)
- 分层可导航小世界（向量索引）, [向量嵌入](/ch4#id92)
- hierarchical queries（见 recursive common table expressions）
- high availability（见 fault tolerance）
- 高频交易, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- high-performance computing (HPC), [云计算与超级计算](/ch1#id17)
- 提示移交, [追赶错过的写入](/ch6#sec_replication_read_repair)
- 直方图, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- Hive（数据仓库）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 查询优化器, [查询语言](/ch11#sec_batch_query_lanauges)
- HNSW (vector index), [向量嵌入](/ch4#id92)
- 跳跃窗口（流处理）, [窗口的类型](/ch12#id324)
  - （另见 windows）
- Hoptimator（查询引擎）, [一切的元数据库](/ch13#id341)
- 地平线丑闻, [人类与可靠性](/ch2#id31)
  - 缺乏事务, [事务](/ch8#ch_transactions)
- horizontal scaling（见 scaling out）
  - 通过分片, [分片的利与弊](/ch7#sec_sharding_reasons)
- HornetQ（消息系统）, [消息代理](/ch5#message-brokers), [消息代理与数据库的对比](/ch12#id297)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
- 热键, [键值数据的分片](/ch7#sec_sharding_key_value)
- 热点, [键值数据的分片](/ch7#sec_sharding_key_value)
  - 由名人引起, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
  - 时间序列数据, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 缓解, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
- hot standbys（见 基于领导者的复制）
- HTAP（见 hybrid transactional/analytic processing）
- HTTP, use in APIs（见 services）
- 人类错误, [人类与可靠性](/ch2#id31), [实践中的网络故障](/ch9#sec_distributed_network_faults), [批处理](/ch11#ch_batch)
- 混合逻辑时钟, [混合逻辑时钟](/ch10#hybrid-logical-clocks)
- 混合事务/分析处理, [数据仓库](/ch1#sec_introduction_dwh), [分析型数据存储](/ch4#sec_storage_analytics)
- hydrating IDs (join), [社交网络案例研究中的反规范化](/ch3#denormalization-in-the-social-networking-case-study)
- 超图, [属性图](/ch3#id56)
- HyperLogLog (algorithm), [流分析](/ch12#id318)

### I

- I/O operations, waiting for, [进程暂停](/ch9#sec_distributed_clocks_pauses)
- IaaS（见 infrastructure as a service (IaaS)）
- IBM
  - Db2（数据库）
    - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
    - 可串行化隔离, [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - MQ（消息系统）, [消息代理与数据库的对比](/ch12#id297)
    - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - System R（数据库）, [事务到底是什么？](/ch8#sec_transactions_overview)
  - WebSphere（消息系统）, [消息代理](/ch5#message-brokers)
- Iceberg（表格式）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 对象存储的数据库, [设置新的副本](/ch6#sec_replication_new_replica)
  - 基于日志的消息代理存储, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- 幂等性, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc), [幂等性](/ch12#sec_stream_idempotence), [术语表](/glossary)
  - by giving operations unique IDs, [多分区请求处理](/ch13#id360)
  - by giving requests unique IDs, [操作标识符](/ch13#id355)
  - 用于恰好一次语义, [再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
  - 幂等操作, [恰好执行一次操作](/ch13#id353)
  - 在工作流引擎中, [持久化执行](/ch5#durable-execution)
- 不可变性
  - 好处, [不可变事件的优点](/ch12#sec_stream_immutability_pros), [为可审计性而设计](/ch13#id365)
  - 与擦除权, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [磁盘空间使用](/ch4#disk-space-usage)
  - 加密粉碎（crypto-shredding）, [事件溯源与 CQRS](/ch3#sec_datamodels_events), [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 从事件日志中派生状态, [状态、流和不变性](/ch12#sec_stream_immutability)-[不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 崩溃恢复, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 在 B 树中, [B 树变体](/ch4#b-tree-variants), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 在事件溯源中, [事件溯源与 CQRS](/ch3#sec_datamodels_events), [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
  - 限制, [并发控制](/ch12#sec_stream_concurrency)
- 阻抗不匹配, [对象关系不匹配](/ch3#sec_datamodels_document)
- 存疑, [协调器故障](/ch8#coordinator-failure)
  - 锁定, [存疑时持有锁](/ch8#holding-locks-while-in-doubt)
  - 孤儿事务, [从协调器故障中恢复](/ch8#recovering-from-coordinator-failure)
- 内存数据库, [全内存存储](/ch4#sec_storage_inmemory)
  - 持久性, [持久性](/ch8#durability)
  - 串行事务执行, [实际串行执行](/ch8#sec_transactions_serial)
- 事故
  - 导致错误定罪的会计软件错误, [人类与可靠性](/ch2#id31)
  - 无指责的事后复盘, [人类与可靠性](/ch2#id31)
  - 闰秒导致的崩溃, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
  - 并发 Bug 造成的数据损坏和经济损失, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
  - 硬盘上的数据损坏, [持久性](/ch8#durability)
  - 因最后写入胜利导致的数据丢失, [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 磁盘上无法读取的数据, [将系统模型映射到现实世界](/ch9#mapping-system-models-to-the-real-world)
  - 由于重用主键而泄露敏感数据, [领导者故障：故障转移](/ch6#leader-failure-failover)
  - 事务可串行化中的错误, [维护完整性，尽管软件有Bug](/ch13#id455)
  - gigabit network interface with 1 Kb/s throughput, [系统模型与现实](/ch9#sec_distributed_system_model)
  - 闰秒导致的崩溃, [软件故障](/ch2#software-faults)
  - 网络故障, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 网络接口只丢弃入站数据包, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 网络分区和全数据中心故障, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 网络故障处理不当, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 向前任伴侣发送消息, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 咬海底电缆的鲨鱼, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - split brain due to 1-minute packet delay, [领导者故障：故障转移](/ch6#leader-failure-failover), [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - SSD failure after 32,768 hours, [软件故障](/ch2#software-faults)
  - 线程争用导致服务宕机, [进程暂停](/ch9#sec_distributed_clocks_pauses)
  - 服务器机架中的振动, [延迟与响应时间](/ch2#id23)
  - 违反唯一性约束, [维护完整性，尽管软件有Bug](/ch13#id455)
- incremental view maintenance (IVM), [维护物化视图](/ch12#sec_stream_mat_view)
  - 数据整合, [分拆系统与集成系统](/ch13#id448)
- 索引, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp), [术语表](/glossary)
  - 与快照隔离, [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 作为衍生数据, [记录系统与派生数据](/ch1#sec_introduction_derived), [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
  - B树, [B 树](/ch4#sec_storage_b_trees)-[B 树变体](/ch4#b-tree-variants)
  - 聚集, [在索引中存储值](/ch4#sec_storage_index_heap)
  - comparison of B-trees and LSM-trees, [比较 B 树与 LSM 树](/ch4#sec_storage_btree_lsm_comparison)-[磁盘空间使用](/ch4#disk-space-usage)
  - 覆盖（包含列）, [在索引中存储值](/ch4#sec_storage_index_heap)
  - 创建, [创建索引](/ch13#id340)
  - 全文检索, [全文检索](/ch4#sec_storage_full_text)
  - 地理空间, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
  - 索引范围锁定, [索引范围锁](/ch8#sec_transactions_2pl_range)
  - 多列（联合）, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
  - 二级, [多列索引与二级索引](/ch4#sec_storage_index_multicolumn)
    - （另见 secondary indexes）
    - 双写问题, [保持系统同步](/ch12#sec_stream_sync), [理解数据流](/ch13#id443)
  - 分片与二级索引, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)-[全局二级索引](/ch7#id167), [总结](/ch7#summary)
  - 稀疏, [SSTable 文件格式](/ch4#the-sstable-file-format)
  - SSTable 与 LSM 树, [SSTable 文件格式](/ch4#the-sstable-file-format)-[压实策略](/ch4#sec_storage_lsm_compaction)
  - 数据变化时更新, [保持系统同步](/ch12#sec_stream_sync), [维护物化视图](/ch12#sec_stream_mat_view)
- Industrial Revolution, [回顾工业革命](/ch14#id377)
- InfiniBand (networks), [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- InfluxDB IOx (storage engine), [列式存储](/ch4#sec_storage_column)
- information retrieval（见 全文检索）
- infrastructure as a service (IaaS), [云服务与自托管](/ch1#sec_introduction_cloud), [云服务的分层](/ch1#layering-of-cloud-services)
- InnoDB (storage engine)
  - 主键上的聚集索引, [在索引中存储值](/ch4#sec_storage_index_heap)
  - 不防止丢失的更新, [自动检测丢失的更新](/ch8#automatically-detecting-lost-updates)
  - 防止写偏差, [写偏差的特征](/ch8#characterizing-write-skew), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 可串行化隔离, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
- 实例（云计算）, [云服务的分层](/ch1#layering-of-cloud-services)
- integrating different data systems（见 数据集成）
- 完整性, [及时性与完整性](/ch13#sec_future_integrity)
  - 避免协调的数据系统, [无协调数据系统](/ch13#id454)
  - 数据流系统的正确性, [数据流系统的正确性](/ch13#id453)
  - 在共识的形式化中, [单值共识](/ch10#single-value-consensus), [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
  - 完整性检查, [不要盲目信任承诺](/ch13#id364)
    - （另见 审计）
    - 端到端, [端到端原则](/ch13#sec_future_e2e_argument), [端到端原则重现](/ch13#id456)
    - 使用快照隔离, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - 尽管软件错误仍然维护, [维护完整性，尽管软件有Bug](/ch13#id455)
- Interface Definition Language (IDL), [Protocol Buffers](/ch5#sec_encoding_protobuf), [Avro](/ch5#sec_encoding_avro), [Web 服务](/ch5#sec_web_services)
- 不变式, [一致性](/ch8#sec_transactions_acid_consistency)
  - （另见 constraints）
- 倒排文件索引（向量索引）, [向量嵌入](/ch4#id92)
- 倒排索引, [全文检索](/ch4#sec_storage_full_text)
- 不可逆性，尽量减少, [可演化性：让变化更容易](/ch2#sec_introduction_evolvability), [事件溯源与 CQRS](/ch3#sec_datamodels_events), [批处理](/ch11#ch_batch)
- ISDN (Integrated Services Digital Network), [同步与异步网络](/ch9#sec_distributed_sync_networks)
- 隔离性
  - cgroups（见 cgroups）
- 隔离性, [隔离性](/ch8#sec_transactions_acid_isolation), [单对象与多对象操作](/ch8#sec_transactions_multi_object), [术语表](/glossary)
  - 正确性和, [追求正确性](/ch13#sec_future_correctness)
  - 用于单对象写入, [单对象写入](/ch8#sec_transactions_single_object)
  - 可串行化, [可串行化](/ch8#sec_transactions_serializability)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - 实际执行, [实际串行执行](/ch8#sec_transactions_serial)-[串行执行总结](/ch8#summary-of-serial-execution)
    - 可串行化快照隔离, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - 两阶段锁定, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)-[索引范围锁](/ch8#sec_transactions_2pl_range)
  - 违反, [单对象与多对象操作](/ch8#sec_transactions_multi_object)
  - 弱隔离级别, [弱隔离级别](/ch8#sec_transactions_isolation_levels)-[物化冲突](/ch8#materializing-conflicts)
    - 防止丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)-[冲突解决与复制](/ch8#conflict-resolution-and-replication)
    - 读已提交, [读已提交](/ch8#sec_transactions_read_committed)-[实现读已提交](/ch8#sec_transactions_read_committed_impl)
    - 快照隔离, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)-[快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- IVF (vector index), [向量嵌入](/ch4#id92)

### J

- Java 数据库连接（JDBC）
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - 网络驱动程序, [模式的优点](/ch5#sec_encoding_schemas)
- Java Enterprise Edition (EE), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc), [两阶段提交（2PC）](/ch8#sec_transactions_2pc), [XA 事务](/ch8#xa-transactions)
- Java Message Service (JMS), [消息代理与数据库的对比](/ch12#id297)
  - （另见 messaging systems）
  - 与基于日志的消息传递的对比, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [重播旧消息](/ch12#sec_stream_replay)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - 消息顺序, [确认与重新传递](/ch12#sec_stream_reordering)
- Java Transaction API (JTA), [两阶段提交（2PC）](/ch8#sec_transactions_2pc), [XA 事务](/ch8#xa-transactions)
- Java Virtual Machine (JVM)
  - 垃圾收集, [进程暂停](/ch9#sec_distributed_clocks_pauses), [限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
  - JIT compilation, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 批处理器中的进程重用, [数据流引擎](/ch11#sec_batch_dataflow)
- Jena (RDF framework), [RDF 数据模型](/ch3#the-rdf-data-model)
  - SPARQL 查询语言, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- Jepsen（容错测试）, [故障注入](/ch9#sec_fault_injection), [追求正确性](/ch13#sec_future_correctness)
- 抖动（网络延迟）, [平均值、中位数与百分位点](/ch2#id24), [网络拥塞和排队](/ch9#network-congestion-and-queueing)
- JMESPath（查询语言）, [查询语言](/ch11#sec_batch_query_lanauges)
- 连接表, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many), [属性图](/ch3#id56)
- 连接（join）, [术语表](/glossary)
  - 作为关系运算符表示, [查询语言](/ch11#sec_batch_query_lanauges)
  - handling GraphQL query, [GraphQL](/ch3#id63)
  - 应用程序代码, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization), [社交网络案例研究中的反规范化](/ch3#denormalization-in-the-social-networking-case-study)
  - in DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 关系数据库和文档数据库, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)
  - 与二级索引, [多列索引与二级索引](/ch4#sec_storage_index_multicolumn)
  - 排序合并连接, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
  - 流连接, [流连接](/ch12#sec_stream_joins)-[连接的时间依赖性](/ch12#sec_stream_join_time)
    - 流流连接, [流流连接（窗口连接）](/ch12#id440)
    - 流表连接, [流表连接（流扩充）](/ch12#sec_stream_table_joins)
    - 表表连接, [表表连接（维护物化视图）](/ch12#id326)
    - 时间依赖性, [连接的时间依赖性](/ch12#sec_stream_join_time)
  - 文档数据库中的支持, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
- JOTM (transaction coordinator), [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
- 日志（文件系统）, [使 B 树可靠](/ch4#sec_storage_btree_wal)
- JSON
  - 聚合管道（查询语言）, [文档的查询语言](/ch3#query-languages-for-documents)
  - Avro 模式的表示, [Avro](/ch5#sec_encoding_avro)
  - 二进制变体, [二进制编码](/ch5#binary-encoding)
  - 数据局部性, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - 文档数据模型, [关系模型与文档模型](/ch3#sec_datamodels_history)
  - 用于应用数据时的问题, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
  - GraphQL response, [GraphQL](/ch3#id63)
  - 在关系数据库中, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
  - 表示简历（示例）, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
  - 模式, [JSON 模式](/ch5#json-schema)
- JSON-LD, [三元组存储与 SPARQL](/ch3#id59)
- JsonPath（查询语言）, [查询语言](/ch11#sec_batch_query_lanauges)
- JuiceFS（分布式文件系统）, [分布式文件系统](/ch11#sec_batch_dfs), [对象存储](/ch11#id277)
- Jupyter（笔记本）, [机器学习](/ch11#id290)
- just-in-time (JIT) compilation, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)

### K

- Kafka（消息系统）, [消息代理](/ch5#message-brokers), [使用日志进行消息存储](/ch12#id300)
  - 消费者组, [多个消费者](/ch12#id298)
  - 数据集成, [分拆系统与集成系统](/ch13#id448)
  - 用于事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - Kafka Connect（数据库集成）, [数据变更捕获的实现](/ch12#id307), [变更流的 API 支持](/ch12#sec_stream_change_api), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
  - Kafka Streams（流处理器）, [流分析](/ch12#id318), [维护物化视图](/ch12#sec_stream_mat_view)
    - 恰好一次语义, [再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
    - 容错, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - ksqlDB (stream database), [维护物化视图](/ch12#sec_stream_mat_view)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 日志压缩, [日志压缩](/ch12#sec_stream_log_compaction), [维护物化视图](/ch12#sec_stream_mat_view)
  - 消息偏移量, [使用日志进行消息存储](/ch12#id300), [幂等性](/ch12#sec_stream_idempotence)
  - 分区, [分片](/ch7#ch_sharding)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 模式注册表, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
  - 对外提供派生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
  - 分层存储, [磁盘空间使用](/ch12#sec_stream_disk_usage)
  - 事务, [数据库内部的分布式事务](/ch8#sec_transactions_internal), [原子提交再现](/ch12#sec_stream_atomic_commit)
  - 不干净的领导者选举（unclean leader election）, [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 使用模型检查, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- kappa 架构, [统一批处理和流处理](/ch13#id338)
- 键值存储, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)
  - 与对象存储的比较, [对象存储](/ch11#id277)
  - 内存, [全内存存储](/ch4#sec_storage_inmemory)
  - LSM storage, [日志结构存储](/ch4#sec_storage_log_structured)-[磁盘空间使用](/ch4#disk-space-usage)
  - 分片, [键值数据的分片](/ch7#sec_sharding_key_value)-[偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
    - 按键的哈希, [按键的哈希分片](/ch7#sec_sharding_hash), [总结](/ch7#summary)
    - 按键的范围, [按键的范围分片](/ch7#sec_sharding_key_range), [总结](/ch7#summary)
    - 偏斜与热点, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
- Kinesis（消息系统）, [消息代理](/ch5#message-brokers), [使用日志进行消息存储](/ch12#id300)
  - 数据仓库集成, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- Kryo (Java), [特定语言的格式](/ch5#id96)
- ksqlDB (stream database), [维护物化视图](/ch12#sec_stream_mat_view)
- Kubernetes（集群管理器）, [云服务与自托管](/ch1#sec_introduction_cloud), [微服务与无服务器](/ch1#sec_introduction_microservices), [分布式作业编排](/ch11#id278), [应用代码和状态的分离](/ch13#id344)
  - Kubeflow, [机器学习](/ch11#id290)
  - kubelet, [分布式作业编排](/ch11#id278)
  - 算子, [分布式作业编排](/ch11#id278)
  - 使用 etcd, [请求路由](/ch7#sec_sharding_routing), [协调服务](/ch10#sec_consistency_coordination)
- KùzuDB (database), [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems), [图数据模型](/ch3#sec_datamodels_graph)
  - 作为嵌入式存储引擎, [压实策略](/ch4#sec_storage_lsm_compaction)
  - Cypher 查询语言, [Cypher 查询语言](/ch3#id57)

### L

- labeled property graphs（见 property graphs）
- Lambda 架构, [统一批处理和流处理](/ch13#id338)
- Lamport 时间戳, [Lamport 时间戳](/ch10#lamport-timestamps)
- Lance（数据格式）, [云数据仓库](/ch4#sec_cloud_data_warehouses), [列式存储](/ch4#sec_storage_column)
  - （另见 column-oriented storage）
- large language models (LLMs)
  - 预处理训练数据, [机器学习](/ch11#id290)
- 最后写入胜利, [最后写入胜利（丢弃并发写入）](/ch6#sec_replication_lww), [检测并发写入](/ch6#sec_replication_concurrent), [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 问题, [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 容易丢失更新, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
- 延迟, [延迟与响应时间](/ch2#id23)
  - （另见 响应时间）
  - 跨区域, [分布式与单节点系统](/ch1#sec_introduction_distributed)
  - 在两阶段锁定下的不稳定, [两阶段锁定的性能](/ch8#performance-of-two-phase-locking)
  - 网络延迟和资源利用, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
  - 通过对冲请求降低, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 响应时间对比, [延迟与响应时间](/ch2#id23)
  - 尾延迟, [平均值、中位数与百分位点](/ch2#id24), [响应时间指标的应用](/ch2#sec_introduction_slo_sla), [本地二级索引](/ch7#id166)
- law（见 legal matters）
- 分层（云服务）, [云服务的分层](/ch1#layering-of-cloud-services)
- 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)-[逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
  - （另见 复制）
  - 故障切换, [领导者故障：故障转移](/ch6#leader-failure-failover), [分布式锁和租约](/ch9#sec_distributed_lock_fencing)
  - 处理节点故障, [处理节点故障](/ch6#sec_replication_failover)
  - 实施复制日志
    - 数据变更捕获, [数据变更捕获](/ch12#sec_stream_cdc)-[变更流的 API 支持](/ch12#sec_stream_change_api)
      - （另见 changelogs）
    - 基于语句的, [基于语句的复制](/ch6#statement-based-replication)
    - 预写日志（WAL）传输, [预写日志（WAL）传输](/ch6#write-ahead-log-wal-shipping)
  - 操作的线性一致性, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 锁定和领导者选举, [锁定与领导者选举](/ch10#locking-and-leader-election)
  - 日志序列号, [设置新的副本](/ch6#sec_replication_new_replica), [消费者偏移量](/ch12#sec_stream_log_offsets)
  - 读扩展架构, [复制延迟的问题](/ch6#sec_replication_lag), [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 与共识的关系, [共识](/ch10#sec_consistency_consensus), [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus), [共识的利弊](/ch10#pros-and-cons-of-consensus)
  - 设置新的从节点, [设置新的副本](/ch6#sec_replication_new_replica)
  - 同步与异步, [同步复制与异步复制](/ch6#sec_replication_sync_async)-[同步复制与异步复制](/ch6#sec_replication_sync_async)
- 无主复制, [无主复制](/ch6#sec_replication_leaderless)-[版本向量](/ch6#version-vectors)
  - （另见 复制）
  - 追赶丢失的写入, [追赶错过的写入](/ch6#sec_replication_read_repair)
  - 检测并发写入, [检测并发写入](/ch6#sec_replication_concurrent)-[版本向量](/ch6#version-vectors)
    - 版本向量, [版本向量](/ch6#version-vectors)
  - 多区域, [多地区操作](/ch6#multi-region-operation)
  - 仲裁（quorum）, [读写仲裁](/ch6#sec_replication_quorum_condition)-[多地区操作](/ch6#multi-region-operation)
    - 一致性限制, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations)-[监控陈旧性](/ch6#monitoring-staleness), [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
- 闰秒, [软件故障](/ch2#software-faults), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
  - 在日历时钟中, [日历时钟](/ch9#time-of-day-clocks)
- 租约, [进程暂停](/ch9#sec_distributed_clocks_pauses)
  - 使用协调服务实现, [协调服务](/ch10#sec_consistency_coordination)
  - 需要隔离（fencing）, [分布式锁和租约](/ch9#sec_distributed_lock_fencing)
  - 与共识的关系, [单值共识](/ch10#single-value-consensus)
- 分类账（会计）, [总结](/ch3#summary)
  - 不可改变性, [不可变事件的优点](/ch12#sec_stream_immutability_pros)
- 遗留系统,维护, [可运维性](/ch2#sec_introduction_maintainability)
- 法律事项, [数据系统、法律与社会](/ch1#sec_introduction_compliance)-[数据系统、法律与社会](/ch1#sec_introduction_compliance)
  - 数据删除, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [磁盘空间使用](/ch4#disk-space-usage)
  - 数据驻留（data residency）, [分布式与单节点系统](/ch1#sec_introduction_distributed), [面向多租户的分片](/ch7#sec_sharding_multitenancy)
  - 隐私监管, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [立法与自律](/ch14#sec_future_legislation)
- legitimate interest (GDPR), [同意与选择自由](/ch14#id375)
- 分级压实, [压实策略](/ch4#sec_storage_lsm_compaction), [磁盘空间使用](/ch4#disk-space-usage)
- Levenshtein 自动机, [全文检索](/ch4#sec_storage_full_text)
- 跛行（部分故障）, [系统模型与现实](/ch9#sec_distributed_system_model)
- Linear（项目管理软件）, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 线性代数, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 线性可伸缩性, [描述负载](/ch2#id33)
- 线性一致性, [复制延迟的解决方案](/ch6#id131), [线性一致性](/ch10#sec_consistency_linearizability)-[线性一致性与网络延迟](/ch10#linearizability-and-network-delays), [术语表](/glossary)
  - 和共识, [共识](/ch10#sec_consistency_consensus)
  - 代价, [线性一致性的代价](/ch10#sec_linearizability_cost)-[线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
    - CAP定理, [CAP 定理](/ch10#the-cap-theorem)
    - memory on multi-core CPUs, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 定义, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)-[什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - ID generation, [线性一致的 ID 生成器](/ch10#sec_consistency_linearizable_id)
  - 在协调服务中, [协调服务](/ch10#sec_consistency_coordination)
  - 数据系统
    - 避免协调, [无协调数据系统](/ch13#id454)
  - 不同复制方法, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)-[线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
    - 使用法定人数, [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
  - 在共识系统中的读取, [共识的微妙之处](/ch10#subtleties-of-consensus)
  - 依赖, [依赖线性一致性](/ch10#sec_consistency_linearizability_usage)-[跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
    - 约束与唯一性, [约束与唯一性保证](/ch10#sec_consistency_uniqueness)
    - 跨通道时序依赖, [跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
    - 锁定和领导者选举, [锁定与领导者选举](/ch10#locking-and-leader-election)
  - 与可串行化的对比, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
- 链接数据, [三元组存储与 SPARQL](/ch3#id59)
- LinkedIn
  - Espresso（数据库）, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
  - LIquid（数据库）, [Datalog：递归关系查询](/ch3#id62)
  - 配置文件（例）, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
- Linux 闰秒 Bug, [软件故障](/ch2#software-faults), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- Litestream（备份工具）, [设置新的副本](/ch6#sec_replication_new_replica)
- 活性属性, [安全性与活性](/ch9#sec_distributed_safety_liveness)
- LLVM (compiler), [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- LMDB (storage engine), [压实策略](/ch4#sec_storage_lsm_compaction), [B 树变体](/ch4#b-tree-variants), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
- 负载
  - 应付, [可伸缩性原则](/ch2#id35)
  - 描述, [描述负载](/ch2#id33)
- 负载均衡, [描述性能](/ch2#sec_introduction_percentiles), [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
  - 硬件, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
  - 软件, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
  - 使用消息代理, [多个消费者](/ch12#id298)
- 负载卸除（load shedding）, [描述性能](/ch2#sec_introduction_percentiles)
- 本地二级索引, [本地二级索引](/ch7#id166), [总结](/ch7#summary)
- 本地优先软件, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 局部性, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships), [读写的数据局部性](/ch3#sec_datamodels_document_locality), [术语表](/glossary)
  - 在批处理中, [数据流引擎](/ch11#sec_batch_dataflow)
  - 在有状态客户端中, [同步引擎与本地优先软件](/ch6#sec_replication_offline_clients), [有状态、可离线的客户端](/ch13#id347)
  - 在流处理中, [流表连接（流扩充）](/ch12#sec_stream_table_joins), [失败后重建状态](/ch12#sec_stream_state_fault_tolerance), [流处理器和服务](/ch13#id345), [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
- 位置透明性, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
  - 在 actor 模型中, [分布式 actor 框架](/ch5#distributed-actor-frameworks)
- 锁定（lock-in）, [云服务的利弊](/ch1#sec_introduction_cloud_tradeoffs)
- 锁, [术语表](/glossary)
  - 死锁, [显式锁定](/ch8#explicit-locking), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 分布式锁定, [分布式锁和租约](/ch9#sec_distributed_lock_fencing)-[多副本隔离](/ch9#fencing-with-multiple-replicas), [锁定与领导者选举](/ch10#locking-and-leader-election)
    - 隔离令牌（fencing token）, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
    - 使用协调服务实现, [协调服务](/ch10#sec_consistency_coordination)
    - 与共识的关系, [单值共识](/ch10#single-value-consensus)
  - 用于事务隔离
    - 在快照隔离中, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl)
    - in two-phase locking (2PL), [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)-[索引范围锁](/ch8#sec_transactions_2pl_range)
    - 使操作原子化, [原子写操作](/ch8#atomic-write-operations)
    - 性能, [两阶段锁定的性能](/ch8#performance-of-two-phase-locking)
    - 防止脏写, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
    - 使用索引范围锁防止幻读, [索引范围锁](/ch8#sec_transactions_2pl_range), [检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
    - 读取锁（共享模式）, [实现读已提交](/ch8#sec_transactions_read_committed_impl), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
    - 共享模式和排他模式, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 分布式事务
    - 死锁检测, [XA 事务的问题](/ch8#problems-with-xa-transactions)
    - 存疑事务持有锁, [存疑时持有锁](/ch8#holding-locks-while-in-doubt)
  - 物化冲突, [物化冲突](/ch8#materializing-conflicts)
  - 通过明确锁定防止丢失更新, [显式锁定](/ch8#explicit-locking)
- 日志序列号, [设置新的副本](/ch6#sec_replication_new_replica), [消费者偏移量](/ch12#sec_stream_log_offsets)
- 逻辑时钟, [用于事件排序的时间戳](/ch9#sec_distributed_lww), [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)-[使用逻辑时钟强制约束](/ch10#enforcing-constraints-using-logical-clocks), [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 用于最后写入胜利, [最后写入胜利（丢弃并发写入）](/ch6#sec_replication_lww)
  - 读后写入一致性, [读己之写](/ch6#sec_replication_ryw)
  - 混合逻辑时钟, [混合逻辑时钟](/ch10#hybrid-logical-clocks)
  - 不足以强制约束, [使用逻辑时钟强制约束](/ch10#enforcing-constraints-using-logical-clocks)
  - Lamport 时间戳, [Lamport 时间戳](/ch10#lamport-timestamps)
- 逻辑复制, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
  - 用于数据变更捕获, [数据变更捕获的实现](/ch12#id307)
- LogicBlox（数据库）, [Datalog：递归关系查询](/ch3#id62)
- 日志（数据结构）, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp), [共享日志作为共识](/ch10#sec_consistency_shared_logs), [术语表](/glossary)
  - （另见 shared logs）
  - 不可变性的好处, [不可变事件的优点](/ch12#sec_stream_immutability_pros)
  - 与擦除权, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [磁盘空间使用](/ch4#disk-space-usage)
  - 压实（Compaction）, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [压实策略](/ch4#sec_storage_lsm_compaction), [日志压缩](/ch12#sec_stream_log_compaction), [状态、流和不变性](/ch12#sec_stream_immutability)
    - 流运算符状态, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 强制唯一性约束, [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
  - 基于日志的消息传递, [基于日志的消息代理](/ch12#sec_stream_log)-[重播旧消息](/ch12#sec_stream_replay)
    - 与传统消息传递的对比, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [重播旧消息](/ch12#sec_stream_replay)
    - 偏移量, [消费者偏移量](/ch12#sec_stream_log_offsets)
    - 磁盘空间使用情况, [磁盘空间使用](/ch12#sec_stream_disk_usage)
    - 重播旧消息, [重播旧消息](/ch12#sec_stream_replay), [应用演化后重新处理数据](/ch13#sec_future_reprocessing), [统一批处理和流处理](/ch13#id338)
    - 缓慢的消费者, [当消费者跟不上生产者时](/ch12#id459)
    - 使用日志存储消息, [使用日志进行消息存储](/ch12#id300)
  - 日志结构存储, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)-[压实策略](/ch4#sec_storage_lsm_compaction)
    - log-structured merge tree（见 LSM-trees）
  - 与共识的关系, [共享日志作为共识](/ch10#sec_consistency_shared_logs)
  - 复制, [单主复制](/ch6#sec_replication_leader), [复制日志的实现](/ch6#sec_replication_implementation)-[逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
    - 数据变更捕获, [数据变更捕获](/ch12#sec_stream_cdc)-[变更流的 API 支持](/ch12#sec_stream_change_api)
      - （另见 changelogs）
    - 与快照协调, [设置新的副本](/ch6#sec_replication_new_replica)
    - 逻辑（基于行）复制, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
    - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)
    - 预写日志（WAL）传输, [预写日志（WAL）传输](/ch6#write-ahead-log-wal-shipping)
  - 伸缩性限制, [全序的限制](/ch13#id335)
- Looker（商业智能软件）, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [分析（Analytics）](/ch11#sec_batch_olap)
- 松耦合, [开展分拆工作](/ch13#sec_future_unbundling_favor)
- lost updates（见 updates）
- Lotus Notes（同步引擎）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- LSM-trees (indexes), [SSTable 文件格式](/ch4#the-sstable-file-format)-[压实策略](/ch4#sec_storage_lsm_compaction)
  - 与B树的比较, [比较 B 树与 LSM 树](/ch4#sec_storage_btree_lsm_comparison)-[磁盘空间使用](/ch4#disk-space-usage)
- Lucene（存储引擎）, [全文检索](/ch4#sec_storage_full_text)
  - 相似性搜索, [全文检索](/ch4#sec_storage_full_text)
- LWW（见 最后写入胜利）

### M

- 机器学习
  - 批量推理, [机器学习](/ch11#id290)
  - data preparation with DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 删除训练数据, [数据系统、法律与社会](/ch1#sec_introduction_compliance)
  - 部署数据产品, [超越数据湖](/ch1#beyond-the-data-lake)
  - 道德考虑, [预测分析](/ch14#id369)
    - （另见 ethics）
  - 特征工程, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [机器学习](/ch11#id290)
  - 分析系统, [分析型与事务型系统](/ch1#sec_introduction_analytics)
  - 迭代处理, [机器学习](/ch11#id290)
  - LLMs（见 large language models (LLMs)）
  - 从训练数据派生的模型, [应用代码作为派生函数](/ch13#sec_future_dataflow_derivation)
  - 与批量处理的关系, [机器学习](/ch11#id290)-[机器学习](/ch11#id290)
  - 使用数据湖, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
  - using GPUs, [云服务的分层](/ch1#layering-of-cloud-services), [分布式与单节点系统](/ch1#sec_introduction_distributed)
  - 使用矩阵, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- MadSim（确定性模拟测试）, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- 万金油, [可伸缩性原则](/ch2#id35)
- 可维护性, [可运维性](/ch2#sec_introduction_maintainability)-[可演化性：让变化更容易](/ch2#sec_introduction_evolvability), [流式系统的哲学](/ch13#ch_philosophy)
  - 可演化性（见 可演化性）
  - 可操作性, [可运维性：让运维更轻松](/ch2#id37)
  - 简化和管理复杂性, [简单性：管理复杂度](/ch2#id38)
- 多对多关系, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many)
  - 建模为图, [图数据模型](/ch3#sec_datamodels_graph)
- 多对一关系, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many)
  - 在星型模式中, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- MapReduce (batch processing), [批处理](/ch11#ch_batch), [MapReduce](/ch11#sec_batch_mapreduce)-[MapReduce](/ch11#sec_batch_mapreduce)
  - 用户活动事件分析（示例）, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
  - 与流处理的比较, [流处理](/ch12#sec_stream_processing)
  - 缺点和限制, [MapReduce](/ch11#sec_batch_mapreduce)
  - 容错, [故障处理](/ch11#id281)
  - 高级工具, [查询语言](/ch11#sec_batch_query_lanauges)
  - map 和 reduce 函数, [MapReduce](/ch11#sec_batch_mapreduce)
  - 混洗数据, [混洗数据](/ch11#sec_shuffle)
  - 排序合并, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
  - 工作流程, [工作流调度](/ch11#sec_batch_workflows)
    - （另见 workflow engines）
- 编组（见 编码）
- MartenDB（数据库）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 主从复制（过时术语）, [单主复制](/ch6#sec_replication_leader)
- 物化, [术语表](/glossary)
  - 聚合值, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
  - 冲突, [物化冲突](/ch8#materializing-conflicts)
  - 物化视图, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
    - 作为派生数据, [记录系统与派生数据](/ch1#sec_introduction_derived), [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
    - 在事件溯源中, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
    - 增量视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
      - （另见 incremental view maintenance (IVM)）
    - 维护,使用流处理, [维护物化视图](/ch12#sec_stream_mat_view), [表表连接（维护物化视图）](/ch12#id326)
  - 社交网络时间线示例, [时间线的物化与更新](/ch2#sec_introduction_materializing)
- 物化视图, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
  - 增量视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
- 矩阵, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 稀疏, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- Maxwell（数据变更捕获）, [数据变更捕获的实现](/ch12#id307)
- 平均值, [平均值、中位数与百分位点](/ch2#id24)
- 媒体监测, [在流上搜索](/ch12#id320)
- 中位数, [平均值、中位数与百分位点](/ch2#id24)
- 会议室预订（例）, [写偏差的更多例子](/ch8#more-examples-of-write-skew), [谓词锁](/ch8#predicate-locks), [强制约束](/ch13#sec_future_constraints)
- Memcached（缓存服务器）, [全内存存储](/ch4#sec_storage_inmemory)
- Memgraph（数据库）, [图数据模型](/ch3#sec_datamodels_graph)
  - Cypher 查询语言, [Cypher 查询语言](/ch3#id57)
- 内存
  - 屏障, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 损坏, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - 内存数据库, [全内存存储](/ch4#sec_storage_inmemory)
    - 持久性, [持久性](/ch8#durability)
    - 串行事务执行, [实际串行执行](/ch8#sec_transactions_serial)
  - 数据的内存表示, [编码数据的格式](/ch5#sec_encoding_formats)
  - 内存表, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 随机比特翻转, [信任但验证](/ch13#sec_future_verification)
  - 索引的使用, [日志结构存储](/ch4#sec_storage_log_structured)
- 内存表, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
- Mercurial（版本控制系统）, [并发控制](/ch12#sec_stream_concurrency)
- 合并, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 合并排序的文件, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [混洗数据](/ch11#sec_shuffle)
- 默克尔树, [用于可审计数据系统的工具](/ch13#id366)
- Mesos（集群管理器）, [应用代码和状态的分离](/ch13#id344)
- message brokers（见 messaging systems）
- message-passing（见 event-driven architecture）
- MessagePack (encoding format), [二进制编码](/ch5#binary-encoding)
- 消息传递系统, [流处理](/ch12#ch_stream)-[重播旧消息](/ch12#sec_stream_replay)
  - （另见 streams）
  - 背压、缓冲或丢弃消息, [消息传递系统](/ch12#sec_stream_messaging)
  - 无代理消息传递, [直接从生产者传递给消费者](/ch12#id296)
  - 事件日志, [基于日志的消息代理](/ch12#sec_stream_log)-[重播旧消息](/ch12#sec_stream_replay)
    - 作为数据模型, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
    - 与传统消息传递的对比, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [重播旧消息](/ch12#sec_stream_replay)
    - 偏移量, [消费者偏移量](/ch12#sec_stream_log_offsets)
    - 重播旧消息, [重播旧消息](/ch12#sec_stream_replay), [应用演化后重新处理数据](/ch13#sec_future_reprocessing), [统一批处理和流处理](/ch13#id338)
    - 缓慢的消费者, [当消费者跟不上生产者时](/ch12#id459)
  - 恰好一次语义, [恰好一次消息处理](/ch8#sec_transactions_exactly_once), [再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited), [容错](/ch12#sec_stream_fault_tolerance)
  - 消息代理, [消息代理](/ch12#id433)-[确认与重新传递](/ch12#sec_stream_reordering)
    - 确认与重新传递, [确认与重新传递](/ch12#sec_stream_reordering)
    - 比较事件日志, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [重播旧消息](/ch12#sec_stream_replay)
    - 同一主题的多个消费者, [多个消费者](/ch12#id298)
    - versus RPC, [事件驱动的架构](/ch5#sec_encoding_dataflow_msg)
  - 消息丢失, [消息传递系统](/ch12#sec_stream_messaging)
  - 可靠性, [消息传递系统](/ch12#sec_stream_messaging)
  - 以日志为基础的信件中的独特性, [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
- 亚稳态故障, [描述性能](/ch2#sec_introduction_percentiles)
- 按量计费
  - 无服务器, [微服务与无服务器](/ch1#sec_introduction_microservices)
  - 存储, [云时代的运维](/ch1#sec_introduction_operations)
- 微批次, [微批次与存档点](/ch12#id329)
- 微服务, [微服务与无服务器](/ch1#sec_introduction_microservices)
  - （另见 services）
  - 跨服务的因果依赖, [全序的限制](/ch13#id335)
  - 松耦合, [开展分拆工作](/ch13#sec_future_unbundling_favor)
  - 与批处理/流处理器的关系, [批处理](/ch11#ch_batch), [流处理器和服务](/ch13#id345)
- 微软
  - Azure Blob Storage（见 Azure Blob Storage）
  - Azure managed disks, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
  - Azure Service Bus（消息系统）, [消息代理](/ch5#message-brokers), [消息代理与数据库的对比](/ch12#id297)
  - Azure SQL DB（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - Azure Storage, [对象存储](/ch11#id277)
  - Azure Stream Analytics, [流分析](/ch12#id318)
  - Azure Synapse Analytics（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - 分布式组件对象模型, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
  - MSDTC (transaction coordinator), [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
  - SQL Server（见 SQL Server）
- Microsoft Power BI（见 Power BI (business intelligence software)）
- 迁移（重写）数据, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility), [不同时间写入的不同值](/ch5#different-values-written-at-different-times), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views), [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
- MinIO（对象存储）, [分布式文件系统](/ch11#sec_batch_dfs)
- 移动应用程序, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
  - 嵌入式数据库, [压实策略](/ch4#sec_storage_lsm_compaction)
- 模型检查, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 取模运算符（%）, [哈希取模节点数](/ch7#hash-modulo-number-of-nodes)
- Mojo（编程语言）
  - 内存管理, [限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
- MongoDB（数据库）
  - 聚合管道, [文档的查询语言](/ch3#query-languages-for-documents)
  - 原子操作, [原子写操作](/ch8#atomic-write-operations)
  - BSON, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - 文档数据模型, [关系模型与文档模型](/ch3#sec_datamodels_history)
  - 哈希分片, [按键的哈希分片](/ch7#sec_sharding_hash), [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 在云中, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - 连接支持, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 连接（\$lookup 运算符）, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)
  - JSON Schema validation, [JSON 模式](/ch5#json-schema)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - ObjectIds, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
  - 基于范围的分片, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 二级索引, [本地二级索引](/ch7#id166)
  - 分片分裂, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)
  - 存储过程, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
- 监控, [云时代的运维](/ch1#sec_introduction_operations), [人类与可靠性](/ch2#id31), [可运维性：让运维更轻松](/ch2#id37)
- 单调时钟, [单调时钟](/ch9#monotonic-clocks)
- 单调读, [单调读](/ch6#sec_replication_monotonic_reads)
- Morel（查询语言）, [查询语言](/ch11#sec_batch_query_lanauges)
- MSMQ（消息系统）, [XA 事务](/ch8#xa-transactions)
- 多列索引, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- 多主复制, [多主复制](/ch6#sec_replication_multi_leader)-[处理写入冲突](/ch6#sec_replication_write_conflicts)
  - （另见 复制）
  - 协作编辑, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 冲突检测, [处理写入冲突](/ch6#sec_replication_write_conflicts)
  - 解决冲突, [处理写入冲突](/ch6#sec_replication_write_conflicts)
  - 用于多地区复制, [跨地域运行](/ch6#sec_replication_multi_dc), [线性一致性的代价](/ch10#sec_linearizability_cost)
  - 线性一致性，缺乏, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 可离线的客户端, [同步引擎与本地优先软件](/ch6#sec_replication_offline_clients)
  - 复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)-[不同拓扑的问题](/ch6#problems-with-different-topologies)
- 多对象事务, [单对象与多对象操作](/ch8#sec_transactions_multi_object)
  - 需求, [多对象事务的需求](/ch8#sec_transactions_need)
- Multi-Paxos (consensus algorithm), [共识的实践](/ch10#sec_consistency_total_order)
- 多读单写锁定, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
- 多表索引集群表, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
- 多版本并发控制, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl), [总结](/ch8#summary)
  - detecting stale MVCC reads, [检测陈旧的 MVCC 读取](/ch8#detecting-stale-mvcc-reads)
  - 索引和快照隔离, [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 使用同步时钟, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
- 多维数组, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 多租户, [存储与计算的分离](/ch1#sec_introduction_storage_compute), [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 通过分片, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
  - 使用嵌入式数据库, [压实策略](/ch4#sec_storage_lsm_compaction)
  - 与拜占庭容错的对比, [拜占庭故障](/ch9#sec_distributed_byzantine)
- 互斥, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
  - （另见 locks）
- MySQL（数据库）
  - archiving WAL to object stores, [设置新的副本](/ch6#sec_replication_new_replica)
  - 二进制日志坐标, [设置新的副本](/ch6#sec_replication_new_replica)
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307), [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 环形复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)
  - 一致的快照, [设置新的副本](/ch6#sec_replication_new_replica)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - global transaction identifiers (GTIDs), [设置新的副本](/ch6#sec_replication_new_replica)
  - 在云中, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - InnoDB storage engine（见 InnoDB）
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 基于行的复制, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
  - 分片（见 Vitess（数据库））
  - 快照隔离支持, [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - （另见 InnoDB）
  - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)

### N

- N+1 query problem, [对象关系映射（ORM）](/ch3#object-relational-mapping-orm)
- nanomsg（消息库）, [直接从生产者传递给消费者](/ch12#id296)
- Narayana（事务协调器）, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)
- NATS（消息系统）, [消息代理](/ch5#message-brokers)
- 自然语言处理, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- Neo4j（数据库）
  - Cypher 查询语言, [Cypher 查询语言](/ch3#id57)
  - 图数据模型, [图数据模型](/ch3#sec_datamodels_graph)
- Neon（数据库）, [设置新的副本](/ch6#sec_replication_new_replica)
- Nephele（数据流引擎）, [数据流引擎](/ch11#sec_batch_dataflow)
- Neptune（图数据库）, [图数据模型](/ch3#sec_datamodels_graph)
  - Cypher 查询语言, [Cypher 查询语言](/ch3#id57)
  - SPARQL 查询语言, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- netcode（游戏开发）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- Network Attached Storage (NAS), [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing), [分布式文件系统](/ch11#sec_batch_dfs)
- 网络模型（数据表示）, [关系模型与文档模型](/ch3#sec_datamodels_history)
- Network Time Protocol（见 网络时间协议）
- 网络
  - 拥堵和排队, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 数据中心网络拓扑, [云计算与超级计算](/ch1#id17)
  - faults（见 faults）
  - 线性一致性与网络延迟, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 网络分区, [实践中的网络故障](/ch9#sec_distributed_network_faults)
    - 在 CAP 定理中, [线性一致性的代价](/ch10#sec_linearizability_cost)
  - 超时和无界延迟, [超时和无界延迟](/ch9#sec_distributed_queueing)
- NewSQL, [关系模型与文档模型](/ch3#sec_datamodels_history), [复制延迟的解决方案](/ch6#id131)
  - 事务和, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
- next-key 锁定, [索引范围锁](/ch8#sec_transactions_2pl_range)
- NFS (network file system), [分布式文件系统](/ch11#sec_batch_dfs)
  - 在对象存储中, [对象存储](/ch11#id277)
- Nimble（数据格式）, [云数据仓库](/ch4#sec_cloud_data_warehouses), [列式存储](/ch4#sec_storage_column)
  - （另见 column-oriented storage）
- node (in graphs)（见 vertices）
- 节点（进程）, [分布式与单节点系统](/ch1#sec_introduction_distributed), [术语表](/glossary)
  - 在基于领导者的复制中处理宕机, [处理节点故障](/ch6#sec_replication_failover)
  - 失败的系统模型, [系统模型与现实](/ch9#sec_distributed_system_model)
- 吵闹的邻居, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
- 非阻塞原子提交, [三阶段提交](/ch8#three-phase-commit)
- 非确定性操作, [基于语句的复制](/ch6#statement-based-replication)
  - （另见 deterministic operations）
  - 在分布式系统中, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 工作流程引擎中, [持久化执行](/ch5#durable-execution)
  - 部分失败, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 非决定因素, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- 非功能性需求, [定义非功能性需求](/ch2#ch_nonfunctional), [总结](/ch2#summary)
- 不可重复读, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - （另见 读取偏差）
- 规范化, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)-[多对一与多对多关系](/ch3#sec_datamodels_many_to_many), [术语表](/glossary)
  - 外键引用, [多对象事务的需求](/ch8#sec_transactions_need)
  - 社交网络案例研究, [社交网络案例研究中的反规范化](/ch3#denormalization-in-the-social-networking-case-study)
  - 在记录系统中, [记录系统与派生数据](/ch1#sec_introduction_derived)
  - 与反规范化的对比, [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
- NoSQL, [关系模型与文档模型](/ch3#sec_datamodels_history), [复制延迟的解决方案](/ch6#id131), [分拆数据库](/ch13#sec_future_unbundling)
  - 事务和, [事务到底是什么？](/ch8#sec_transactions_overview)
- Notation3 (N3), [三元组存储与 SPARQL](/ch3#id59)
- 网络时间协议, [不可靠的时钟](/ch9#sec_distributed_clocks)
  - 准确性, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy), [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 对单调时钟的调整, [单调时钟](/ch9#monotonic-clocks)
  - 多个服务器地址, [弱形式的谎言](/ch9#weak-forms-of-lying)
- XML 与 JSON 编码中的数字, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
- NumPy (Python library), [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes), [列式存储](/ch4#sec_storage_column)
- NVMe (Non-Volatile Memory Express)（见 solid state drives (SSDs)）

### O

- 对象数据库, [关系模型与文档模型](/ch3#sec_datamodels_history)
- 对象存储, [云服务的分层](/ch1#layering-of-cloud-services), [对象存储](/ch11#id277)-[对象存储](/ch11#id277)
  - Azure Blob Storage（见 Azure Blob Storage）
  - 比较分布式文件系统, [对象存储](/ch11#id277)
  - 与键值存储的比较, [对象存储](/ch11#id277)
  - 以其为后端的数据库, [设置新的副本](/ch6#sec_replication_new_replica)
  - 备份, [复制](/ch6#ch_replication)
  - 用于云数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses), [写入列式存储](/ch4#writing-to-column-oriented-storage)
  - 数据库复制, [设置新的副本](/ch6#sec_replication_new_replica)
  - Google Cloud Storage（见 Google Cloud Storage）
  - 对象大小, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
  - S3（见 S3（对象存储））
  - storing LSM segment files, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 支持围栏, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
  - 数据湖中的使用, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- 对象关系映射（ORM）框架, [对象关系映射（ORM）](/ch3#object-relational-mapping-orm)
  - 处理错误和中止事务, [处理错误和中止](/ch8#handling-errors-and-aborts)
  - 不安全的读-修改-写周期代码, [原子写操作](/ch8#atomic-write-operations)
- 对象关系不匹配, [对象关系不匹配](/ch3#sec_datamodels_document)
- 可观察性, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems), [人类与可靠性](/ch2#id31), [可运维性：让运维更轻松](/ch2#id37)
- 观察者模式, [应用代码和状态的分离](/ch13#id344)
- OBT (one big table), [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics), [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- 离线系统, [批处理](/ch11#ch_batch)
  - （另见 batch processing）
- 离线优先应用, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps), [有状态、可离线的客户端](/ch13#id347)
- 偏移量
  - 分片日志中的消费者偏移量, [消费者偏移量](/ch12#sec_stream_log_offsets)
  - 分片日志中的消息, [使用日志进行消息存储](/ch12#id300)
- OLAP, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [术语表](/glossary)
  - 数据立方体, [物化视图与数据立方体](/ch4#sec_storage_materialized_views)
- OLTP, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [术语表](/glossary)
  - 分析查询与, [分析（Analytics）](/ch11#sec_batch_olap)
  - 数据规范化, [规范化的权衡](/ch3#trade-offs-of-normalization)
  - 工作负载特征, [实际串行执行](/ch8#sec_transactions_serial)
- 现场部署, [云服务与自托管](/ch1#sec_introduction_cloud)
  - 数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 大宽表（数据仓库模式）, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics), [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- 独热编码, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 一对少关系, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
- 一对多关系, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
  - JSON representation, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
- 在线系统, [批处理](/ch11#ch_batch)
  - （另见 services）
  - 相对于科学计算, [云计算与超级计算](/ch1#id17)
- 本体, [三元组存储与 SPARQL](/ch3#id59)
- Oozie（工作流调度器）, [批处理](/ch11#ch_batch)
- OpenAPI (service definition format), [微服务与无服务器](/ch1#sec_introduction_microservices), [Web 服务](/ch5#sec_web_services), [Web 服务](/ch5#sec_web_services)
  - use of JSON Schema, [JSON 模式](/ch5#json-schema)
- openCypher（见 Cypher（查询语言））
- OpenLink Virtuoso（见 Virtuoso（数据库））
- OpenStack
  - Swift（对象存储）, [对象存储](/ch11#id277)
- 可操作性, [可运维性：让运维更轻松](/ch2#id37)
- 操作系统与数据库, [分拆数据库](/ch13#sec_future_unbundling)
- 业务系统, [分析型与事务型系统](/ch1#sec_introduction_analytics)
  - （另见 在线事务处理）
  - 作为记录系统, [记录系统与派生数据](/ch1#sec_introduction_derived)
  - ETL into analytical systems, [数据仓库](/ch1#sec_introduction_dwh)
- 操作变换, [CRDT 与操作变换](/ch6#sec_replication_crdts)
- 运维团队, [云时代的运维](/ch1#sec_introduction_operations)
- 算子, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 在流处理中, [流处理](/ch12#sec_stream_processing)
- 乐观并发控制, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
- 乐观锁定, [条件写入（比较并设置）](/ch8#sec_transactions_compare_and_set)
- Oracle（数据库）
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - GoldenGate (change data capture), [数据变更捕获的实现](/ch12#id307)
  - 层次查询, [SQL 中的图查询](/ch3#id58), [SQL 中的图查询](/ch3#id58)
  - 缺乏可串行化, [隔离性](/ch8#sec_transactions_acid_isolation)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 多表索引集群表, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - 无法防止写偏差, [写偏差的特征](/ch8#characterizing-write-skew)
  - PL/SQL language, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 防止丢失更新, [自动检测丢失的更新](/ch8#automatically-detecting-lost-updates)
  - 读已提交隔离, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
  - Real Application Clusters (RAC), [锁定与领导者选举](/ch10#locking-and-leader-election)
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation), [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - TimesTen (in-memory database), [全内存存储](/ch4#sec_storage_inmemory)
  - WAL-based replication, [预写日志（WAL）传输](/ch6#write-ahead-log-wal-shipping)
- ORC（数据格式）, [云数据仓库](/ch4#sec_cloud_data_warehouses), [列式存储](/ch4#sec_storage_column)
  - （另见 column-oriented storage）
- 编排（服务部署）, [云服务与自托管](/ch1#sec_introduction_cloud), [微服务与无服务器](/ch1#sec_introduction_microservices)
  - 批量任务执行, [分布式作业编排](/ch11#id278)-[分布式作业编排](/ch11#id278)
  - 工作流程引擎, [批处理](/ch11#ch_batch)
- 顺序
  - 事件日志, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 全序的限制, [全序的限制](/ch13#id335)
  - 逻辑时间戳, [逻辑时钟](/ch10#sec_consistency_timestamps)
  - of auto-incrementing IDs, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
  - 共享日志, [共识的实践](/ch10#sec_consistency_total_order)-[共识的利弊](/ch10#pros-and-cons-of-consensus)
- Orkes（工作流程引擎）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- 孤儿页（B 树）, [使 B 树可靠](/ch4#sec_storage_btree_wal)
- 发件箱模式, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
- 异常值（响应时间）, [平均值、中位数与百分位点](/ch2#id24)
- 外包, [云服务与自托管](/ch1#sec_introduction_cloud)
- 超载, [描述性能](/ch2#sec_introduction_percentiles), [处理错误和中止](/ch8#handling-errors-and-aborts)

### P

- PACELC principle, [CAP 定理](/ch10#the-cap-theorem)
- 软件包管理器, [应用代码和状态的分离](/ch13#id344)
- 分组交换, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- 数据包
  - 损坏, [弱形式的谎言](/ch9#weak-forms-of-lying)
  - sending via UDP, [直接从生产者传递给消费者](/ch12#id296)
- PageRank (algorithm), [图数据模型](/ch3#sec_datamodels_graph), [查询语言](/ch11#sec_batch_query_lanauges), [机器学习](/ch11#id290)
- paging（见 virtual memory）
- pandas（Python 库）, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes), [列式存储](/ch4#sec_storage_column), [DataFrames](/ch11#id287)
- Parquet（数据格式）, [云数据仓库](/ch4#sec_cloud_data_warehouses), [列式存储](/ch4#sec_storage_column), [归档存储](/ch5#archival-storage), [查询语言](/ch11#sec_batch_query_lanauges)
  - （另见 column-oriented storage）
  - 对象存储的数据库, [设置新的副本](/ch6#sec_replication_new_replica)
  - 文档数据模型, [列式存储](/ch4#sec_storage_column)
  - 批量处理中的用途, [MapReduce](/ch11#sec_batch_mapreduce)
- 部分失败, [故障与部分失效](/ch9#sec_distributed_partial_failure), [总结](/ch9#summary)
  - 跛脚, [系统模型与现实](/ch9#sec_distributed_system_model)
- 部分同步（系统模型）, [系统模型与现实](/ch9#sec_distributed_system_model)
- 分区键, [分片的利与弊](/ch7#sec_sharding_reasons), [键值数据的分片](/ch7#sec_sharding_key_value)
- 分区（见 分片）
- Paxos（共识算法）, [共识](/ch10#sec_consistency_consensus), [共识的实践](/ch10#sec_consistency_total_order)
  - 选票编号, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - Multi-Paxos, [共识的实践](/ch10#sec_consistency_total_order)
- payment card industry (PCI), [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- PCI (payment card industry) compliance, [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- 百分位点, [平均值、中位数与百分位点](/ch2#id24), [术语表](/glossary)
  - 高效计算, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
  - 高百分数的重要性, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
  - use in service level agreements (SLAs), [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- Percolator (Google), [实现线性一致的 ID 生成器](/ch10#implementing-a-linearizable-id-generator)
- Percona XtraBackup (MySQL tool), [设置新的副本](/ch6#sec_replication_new_replica)
- 性能
  - 性能下降作为故障, [系统模型与现实](/ch9#sec_distributed_system_model)
  - 描述, [描述性能](/ch2#sec_introduction_percentiles)
  - 分布式事务, [跨不同系统的分布式事务](/ch8#sec_transactions_xa)
  - 内存数据库, [全内存存储](/ch4#sec_storage_inmemory)
  - 线性一致性, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
- 权限隔离, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 永久不一致, [及时性与完整性](/ch13#sec_future_integrity)
- 悲观并发控制, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
- pglogical (PostgreSQL extension), [跨地域运行](/ch6#sec_replication_multi_dc)
- pgvector（向量索引）, [向量嵌入](/ch4#id92)
- 幻读, [导致写偏差的幻读](/ch8#sec_transactions_phantom)
  - 物化冲突, [物化冲突](/ch8#materializing-conflicts)
  - 在可串行化中防止, [谓词锁](/ch8#predicate-locks)
- physical clocks（见 clocks）
- pickle（Python）, [特定语言的格式](/ch5#id96)
- Pinot（数据库）, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [列式存储](/ch4#sec_storage_column)
  - 处理写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
  - 预汇总, [分析（Analytics）](/ch11#sec_batch_olap)
  - 服务衍生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived), [对外提供派生数据](/ch11#sec_batch_serving_derived)
- 流水线执行
  - 数据仓库查询, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- 数据透视表, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 时间点, [不可靠的时钟](/ch9#sec_distributed_clocks)
- 点查询, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
- Polaris（数据目录）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 轮询, [表示用户、帖子与关注关系](/ch2#id20)
- 多存储系统（polystore）, [一切的元数据库](/ch13#id341)
- POSIX (portable operating system interface)
  - 符合的文件系统, [设置新的副本](/ch6#sec_replication_new_replica), [分布式文件系统](/ch11#sec_batch_dfs), [对象存储](/ch11#id277)
- 英国邮局 Horizon 丑闻, [人类与可靠性](/ch2#id31)
  - 缺乏事务, [事务](/ch8#ch_transactions)
- PostgreSQL（数据库）
  - archiving WAL to object stores, [设置新的副本](/ch6#sec_replication_new_replica)
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307), [变更流的 API 支持](/ch12#sec_stream_change_api)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - 外部数据包装器, [一切的元数据库](/ch13#id341)
  - 全文搜索支持, [组合使用派生数据的工具](/ch13#id442)
  - 在云中, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - JSON Schema validation, [JSON 模式](/ch5#json-schema)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 日志序列号, [设置新的副本](/ch6#sec_replication_new_replica)
  - 逻辑解码, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
  - 实现视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - MVCC implementation, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl), [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
  - 分区与分片的对比, [分片](/ch7#ch_sharding)
  - pgvector（向量索引）, [向量嵌入](/ch4#id92)
  - PL/pgSQL language, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - PostGIS geospatial indexes, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
  - 防止丢失更新, [自动检测丢失的更新](/ch8#automatically-detecting-lost-updates)
  - 防止写偏差, [写偏差的特征](/ch8#characterizing-write-skew), [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)
  - 读已提交隔离, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
  - 表示图, [属性图](/ch3#id56)
  - 可串行化快照隔离, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)
  - 分片（见 Citus（数据库））
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation), [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - WAL-based replication, [预写日志（WAL）传输](/ch6#write-ahead-log-wal-shipping)
- 倒排列表, [全文检索](/ch4#sec_storage_full_text)
  - 在分片索引中, [本地二级索引](/ch7#id166)
- 无责事后复盘, [人类与可靠性](/ch2#id31)
- PouchDB（数据库）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- Power BI (business intelligence software), [事务处理与分析的特征](/ch1#sec_introduction_oltp), [分析（Analytics）](/ch11#sec_batch_olap)
- 预汇总, [分析（Analytics）](/ch11#sec_batch_olap)
  - 服务衍生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
- 预分裂, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)
- Precision Time Protocol (PTP), [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- 谓词锁, [谓词锁](/ch8#predicate-locks)
- 预测分析, [分析型与事务型系统](/ch1#sec_introduction_analytics), [预测分析](/ch14#id369)-[反馈回路](/ch14#id372)
  - 放大偏见, [偏见与歧视](/ch14#id370)
  - ethics of（见 ethics）
  - 反馈循环, [反馈回路](/ch14#id372)
- 抢占, [资源分配](/ch11#id279)
  - 在分布式调度器中, [故障处理](/ch11#id281)
  - 线程, [进程暂停](/ch9#sec_distributed_clocks_pauses)
- Prefect（工作流调度器）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows), [批处理](/ch11#ch_batch), [工作流调度](/ch11#sec_batch_workflows)
  - 云数据仓库集成, [查询语言](/ch11#sec_batch_query_lanauges)
- Presto（查询引擎）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 主键, [多列索引与二级索引](/ch4#sec_storage_index_multicolumn), [术语表](/glossary)
  - 自动递增, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
  - 与分区键的对比, [按哈希范围分片](/ch7#sharding-by-hash-range)
- primary-backup replication（见 基于领导者的复制）
- 隐私, [隐私与追踪](/ch14#id373)-[立法与自律](/ch14#sec_future_legislation)
  - 同意和选择自由, [同意与选择自由](/ch14#id375)
  - 数据作为资产和权力, [数据作为资产与权力](/ch14#id376)
  - 删除数据, [不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - ethical considerations（见 ethics）
  - 立法和自律, [立法与自律](/ch14#sec_future_legislation)
  - 含义, [隐私与数据使用](/ch14#id457)
  - 法规, [数据系统、法律与社会](/ch1#sec_introduction_compliance)
  - 监视, [监视](/ch14#id374)
  - 跟踪行为数据, [隐私与追踪](/ch14#id373)
- 概率算法, [响应时间指标的应用](/ch2#sec_introduction_slo_sla), [流分析](/ch12#id318)
- 进程暂停, [进程暂停](/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
- 处理时间（事件）, [时间推理](/ch12#sec_stream_time)
- 生产者（消息流）, [传递事件流](/ch12#sec_stream_transmit)
- 产品分析, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
  - 面向列的存储, [列式存储](/ch4#sec_storage_column)
- 编程语言
  - 用于储存程序, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
- 投影（事件溯源）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- Prolog（语言）, [Datalog：递归关系查询](/ch3#id62)
  - （另见 Datalog）
- 属性图, [属性图](/ch3#id56)
  - Cypher 查询语言, [Cypher 查询语言](/ch3#id57)
  - Property Graph Query Language (PGQL), [SQL 中的图查询](/ch3#id58)
- 基于属性的测试, [人类与可靠性](/ch2#id31), [形式化方法和随机测试](/ch9#sec_distributed_formal)
- Protocol Buffers（数据格式）, [Protocol Buffers](/ch5#sec_encoding_protobuf)-[字段标签与模式演化](/ch5#field-tags-and-schema-evolution), [Protocol Buffers](/ch5#sec_encoding_protobuf)
  - 字段标签与模式演化, [字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
- 数据来源, [为可审计性而设计](/ch13#id365)
- 发布/订阅模式, [消息传递系统](/ch12#sec_stream_messaging)
- 发布者（消息流）, [传递事件流](/ch12#sec_stream_transmit)
- Pulsar（流处理平台）, [确认与重新传递](/ch12#sec_stream_reordering)
- PyTorch (machine learning library), [机器学习](/ch11#id290)

### Q

- Qpid（消息系统）, [消息代理与数据库的对比](/ch12#id297)
- quality of service (QoS), [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- Quantcast File System（分布式文件系统）, [对象存储](/ch11#id277)
- 查询引擎
  - 编译与向量化, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 在云数据仓库中, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 算子, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
  - 优化声明式查询, [数据模型与查询语言](/ch3#ch_datamodels)
- 查询语言
  - Cypher, [Cypher 查询语言](/ch3#id57)
  - Datalog, [Datalog：递归关系查询](/ch3#id62)
  - GraphQL, [GraphQL](/ch3#id63)
  - MongoDB aggregation pipeline, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization), [文档的查询语言](/ch3#query-languages-for-documents)
  - recursive SQL queries, [SQL 中的图查询](/ch3#id58)
  - SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)
  - SQL, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)
- 查询优化器, [查询语言](/ch11#sec_batch_query_lanauges)
- 查询计划, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- 排队延迟, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 头部阻塞, [延迟与响应时间](/ch2#id23)
  - 延迟与响应时间, [延迟与响应时间](/ch2#id23)
- 队列（消息）, [消息代理](/ch5#message-brokers)
- QUIC (protocol), [TCP 的局限性](/ch9#sec_distributed_tcp)
- 仲裁（法定人数）, [读写仲裁](/ch6#sec_replication_quorum_condition)-[多地区操作](/ch6#multi-region-operation), [术语表](/glossary)
  - 用于无主复制, [读写仲裁](/ch6#sec_replication_quorum_condition)
  - 在共识算法中, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - 一致性的局限, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations)-[监控陈旧性](/ch6#monitoring-staleness), [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
  - 在分布式系统中作出决定, [多数派原则](/ch9#sec_distributed_majority)
  - 监控陈旧性, [监控陈旧性](/ch6#monitoring-staleness)
  - 多地区复制, [多地区操作](/ch6#multi-region-operation)
  - 依赖持久性, [将系统模型映射到现实世界](/ch9#mapping-system-models-to-the-real-world)
- 配额, [云时代的运维](/ch1#sec_introduction_operations)

### R

- R（语言）, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes), [DataFrames](/ch11#id287)
- R 树（索引）, [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- R2（对象存储）, [云服务的分层](/ch1#layering-of-cloud-services), [分布式文件系统](/ch11#sec_batch_dfs)
- RabbitMQ（消息系统）, [消息代理](/ch5#message-brokers), [消息代理与数据库的对比](/ch12#id297)
  - 仲裁队列（复制）, [单主复制](/ch6#sec_replication_leader)
- 竞态条件, [隔离性](/ch8#sec_transactions_acid_isolation)
  - （另见 并发）
  - 通过线性一致性避免, [跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
  - 由双写引起, [保持系统同步](/ch12#sec_stream_sync)
  - 造成资金损失, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
  - 脏写, [没有脏写](/ch8#sec_transactions_dirty_write)
  - 在计数器递增中, [没有脏写](/ch8#sec_transactions_dirty_write)
  - 丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)-[冲突解决与复制](/ch8#conflict-resolution-and-replication)
  - 通过事件日志防止, [并发控制](/ch12#sec_stream_concurrency), [数据流：应用代码与状态变化的交互](/ch13#id450)
  - 通过可串行化隔离防止, [可串行化](/ch8#sec_transactions_serializability)
  - 弱事务隔离, [弱隔离级别](/ch8#sec_transactions_isolation_levels)
  - 写偏差, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts)
- Raft（共识算法）, [共识](/ch10#sec_consistency_consensus), [共识的实践](/ch10#sec_consistency_total_order)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 对网络问题的敏感性, [共识的利弊](/ch10#pros-and-cons-of-consensus)
  - 任期, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - 在 etcd 中的使用, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
- RAID (Redundant Array of Independent Disks), [存储与计算的分离](/ch1#sec_introduction_storage_compute), [通过冗余容忍硬件故障](/ch2#tolerating-hardware-faults-through-redundancy), [分布式文件系统](/ch11#sec_batch_dfs)
- Rails，模式迁移, [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
- RAM（见 memory）
- RAMCloud (in-memory storage), [全内存存储](/ch4#sec_storage_inmemory)
- 随机写入（访问模式）, [顺序与随机写入](/ch4#sidebar_sequential)
- 范围查询
  - 在 B 树上, [B 树](/ch4#sec_storage_b_trees), [读取性能](/ch4#read-performance)
  - in LSM-trees, [读取性能](/ch4#read-performance)
  - 在哈希映射中效率不高, [日志结构存储](/ch4#sec_storage_log_structured)
  - 使用哈希分片时, [按哈希范围分片](/ch7#sharding-by-hash-range)
- 排名算法, [机器学习](/ch11#id290)
- Ray（工作流调度器）, [机器学习](/ch11#id290)
- RDF (Resource Description Framework), [RDF 数据模型](/ch3#the-rdf-data-model)
  - querying with SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- 远程直接内存访问, [云服务的分层](/ch1#layering-of-cloud-services), [云计算与超级计算](/ch1#id17)
- React（用户界面库）, [端到端的事件流](/ch13#id349)
- 响应式编程, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 读已提交隔离级别, [读已提交](/ch8#sec_transactions_read_committed)-[实现读已提交](/ch8#sec_transactions_read_committed_impl)
  - 实现, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
  - 多版本并发控制, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl)
  - 没有脏读, [没有脏读](/ch8#no-dirty-reads)
  - 没有脏写, [没有脏写](/ch8#sec_transactions_dirty_write)
- 读模型（事件溯源）, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- 读路径, [观察派生数据状态](/ch13#sec_future_observing)
- 读修复（无主复制）, [追赶错过的写入](/ch6#sec_replication_read_repair)
  - 线性一致性, [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
- 只读副本（见 基于领导者的复制）
- 读取偏差, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation), [总结](/ch8#summary)
- 读未提交隔离级别, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
- 写后读一致性, [读己之写](/ch6#sec_replication_ryw), [及时性与完整性](/ch13#sec_future_integrity)
  - 跨设备, [读己之写](/ch6#sec_replication_ryw)
  - 在衍生数据系统中, [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions)
- 读-修改-写周期, [防止丢失更新](/ch8#sec_transactions_lost_update)
- 读扩展架构, [复制延迟的问题](/ch6#sec_replication_lag), [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 与分片的对比, [分片的利与弊](/ch7#sec_sharding_reasons)
- 读作为事件, [读也是事件](/ch13#sec_future_read_events)
- 实时
  - analytics（见 product analytics）
  - 协作编辑, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
  - 发布/订阅数据流, [端到端的事件流](/ch13#id349)
  - 响应时间保障, [响应时间保证](/ch9#sec_distributed_clocks_realtime)
  - 日历时钟, [日历时钟](/ch9#time-of-day-clocks)
- Realm（数据库）, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 分片再平衡, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)-[运维：自动/手动再平衡](/ch7#sec_sharding_operations), [术语表](/glossary)
  - （另见 分片）
  - 自动或手动再平衡, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
  - 固定数量的分片, [固定数量的分片](/ch7#fixed-number-of-shards)
  - 每个节点固定数量的分片, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 哈希取模 N 的问题, [哈希取模节点数](/ch7#hash-modulo-number-of-nodes)
- 新鲜度保证, [线性一致性](/ch10#sec_consistency_linearizability)
- 推荐引擎, [分析型与事务型系统](/ch1#sec_introduction_analytics)
  - building using DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 迭代处理, [机器学习](/ch11#id290)
- 重新配置（共识）, [共识的微妙之处](/ch10#subtleties-of-consensus)
- 记录, [MapReduce](/ch11#sec_batch_mapreduce)
  - 流处理中的事件, [传递事件流](/ch12#sec_stream_transmit)
- 递归查询
  - 在 Cypher 中, [Cypher 查询语言](/ch3#id57)
  - 在 Datalog 中, [Datalog：递归关系查询](/ch3#id62)
  - in SPARQL, [SPARQL 查询语言](/ch3#the-sparql-query-language)
  - lack of, in GraphQL, [GraphQL](/ch3#id63)
  - SQL common table expressions, [SQL 中的图查询](/ch3#id58)
- Red Hat
  - Apicurio Registry, [JSON 模式](/ch5#json-schema)
- 红黑树, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
- 重新传递（消息传递）, [确认与重新传递](/ch12#sec_stream_reordering)
- Redis（数据库）
  - 原子操作, [原子写操作](/ch8#atomic-write-operations)
  - CRDT support, [CRDT 与操作变换](/ch6#sec_replication_crdts)
  - 持久性, [全内存存储](/ch4#sec_storage_inmemory)
  - Lua 脚本, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 进程/核心模式, [分片的利与弊](/ch7#sec_sharding_reasons)
  - 单线程执行, [实际串行执行](/ch8#sec_transactions_serial)
- redo log（见 write-ahead log）
- Redpanda（消息系统）, [消息代理](/ch5#message-brokers), [设置新的副本](/ch6#sec_replication_new_replica)
  - 分层存储, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- Redshift（数据库）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 冗余
  - 硬件组件, [通过冗余容忍硬件故障](/ch2#tolerating-hardware-faults-through-redundancy)
  - 派生数据, [记录系统与派生数据](/ch1#sec_introduction_derived)
    - （另见 衍生数据）
- Reed--Solomon codes (error correction), [分布式文件系统](/ch11#sec_batch_dfs)
- 重构, [可演化性：让变化更容易](/ch2#sec_introduction_evolvability)
  - （另见 可演化性）
- 地区（地理分布）, [读己之写](/ch6#sec_replication_ryw)
  - （另见 datacenters）
  - 共识, [共识的利弊](/ch10#pros-and-cons-of-consensus)
  - 定义, [读己之写](/ch6#sec_replication_ryw)
  - 延迟, [分布式与单节点系统](/ch1#sec_introduction_distributed)
  - linearizable ID generation, [实现线性一致的 ID 生成器](/ch10#implementing-a-linearizable-id-generator)
  - 跨地区复制, [跨地域运行](/ch6#sec_replication_multi_dc)-[不同拓扑的问题](/ch6#problems-with-different-topologies), [线性一致性的代价](/ch10#sec_linearizability_cost), [全序的限制](/ch13#id335)
    - 无主（无领导者）, [多地区操作](/ch6#multi-region-operation)
    - 多主, [跨地域运行](/ch6#sec_replication_multi_dc)
- 区域（分片）, [分片](/ch7#ch_sharding)
- 寄存器, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
- regulation（见 legal matters）
- 关系数据模型, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [关系模型与文档模型](/ch3#sec_datamodels_history)-[文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 与文件模型的比较, [何时使用哪种模型](/ch3#sec_datamodels_document_summary)-[文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - graph queries in SQL, [SQL 中的图查询](/ch3#id58)
  - 内存数据库, [全内存存储](/ch4#sec_storage_inmemory)
  - 多对一与多对多关系, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many)
  - 多对象事务的需求, [多对象事务的需求](/ch8#sec_transactions_need)
  - 对象关系不匹配, [对象关系不匹配](/ch3#sec_datamodels_document)
  - 表示可重排序的列表, [何时使用哪种模型](/ch3#sec_datamodels_document_summary)
  - 与文档模型的对比
    - 模型的融合, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
    - 数据位置, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
- 关系数据库
  - 最终一致性, [复制延迟的问题](/ch6#sec_replication_lag)
  - 历史, [关系模型与文档模型](/ch3#sec_datamodels_history)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 逻辑日志, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
  - 与 Unix 哲学的比较, [分拆数据库](/ch13#sec_future_unbundling), [一切的元数据库](/ch13#id341)
  - 模式变更, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility), [编码与演化](/ch5#ch_encoding), [不同时间写入的不同值](/ch5#different-values-written-at-different-times)
  - 分片二级索引, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)
  - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)
  - B 树索引的使用, [B 树](/ch4#sec_storage_b_trees)
- relationships（见 edges）
- 可靠性, [可靠性与容错](/ch2#sec_introduction_reliability)-[人类与可靠性](/ch2#id31), [流式系统的哲学](/ch13#ch_philosophy)
  - 从不可靠的组件建立可靠的系统, [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 硬件故障, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - 人类错误, [人类与可靠性](/ch2#id31)
  - 重要性, [人类与可靠性](/ch2#id31)
  - 消息传递系统, [消息传递系统](/ch12#sec_stream_messaging)
  - 软件故障, [软件故障](/ch2#software-faults)
- Remote Method Invocation (Java RMI), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
- remote procedure calls (RPCs), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)-[RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - （另见 services）
  - 数据编码和演化, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - 问题, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
  - 使用 Avro, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
  - 与消息代理的对比, [事件驱动的架构](/ch5#sec_encoding_dataflow_msg)
- 可再生能源, [分布式与单节点系统](/ch1#sec_introduction_distributed)
- 可重复读（事务隔离）, [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
- 副本, [单主复制](/ch6#sec_replication_leader)
- 复制, [复制](/ch6#ch_replication)-[总结](/ch6#summary), [术语表](/glossary)
  - 持久性, [持久性](/ch8#durability)
  - 解决冲突, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
  - 一致性属性, [复制延迟的问题](/ch6#sec_replication_lag)-[复制延迟的解决方案](/ch6#id131)
    - 一致前缀读, [一致前缀读](/ch6#sec_replication_consistent_prefix)
    - 单调读, [单调读](/ch6#sec_replication_monotonic_reads)
    - 读己之写, [读己之写](/ch6#sec_replication_ryw)
  - 在分布式文件系统中, [分布式文件系统](/ch11#sec_batch_dfs)
  - 无主（无领导者）, [无主复制](/ch6#sec_replication_leaderless)-[版本向量](/ch6#version-vectors)
    - 检测并发写入, [检测并发写入](/ch6#sec_replication_concurrent)-[版本向量](/ch6#version-vectors)
    - 仲裁一致性的局限, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations)-[监控陈旧性](/ch6#monitoring-staleness), [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
  - 监控陈旧性, [监控陈旧性](/ch6#monitoring-staleness)
  - 多领导者, [多主复制](/ch6#sec_replication_multi_leader)-[处理写入冲突](/ch6#sec_replication_write_conflicts)
    - 跨多个区域, [跨地域运行](/ch6#sec_replication_multi_dc), [线性一致性的代价](/ch10#sec_linearizability_cost)
    - 解决冲突, [处理写入冲突](/ch6#sec_replication_write_conflicts)-[处理写入冲突](/ch6#sec_replication_write_conflicts)
    - 复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)-[不同拓扑的问题](/ch6#problems-with-different-topologies)
  - 使用原因, [分布式与单节点系统](/ch1#sec_introduction_distributed), [复制](/ch6#ch_replication)
  - 与分片, [分片](/ch7#ch_sharding)
  - 单主, [单主复制](/ch6#sec_replication_leader)-[逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
    - 故障切换, [领导者故障：故障转移](/ch6#leader-failure-failover)
    - 实施复制日志, [复制日志的实现](/ch6#sec_replication_implementation)-[逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
    - 与共识的关系, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus), [共识的利弊](/ch10#pros-and-cons-of-consensus)
    - 设置新的从节点, [设置新的副本](/ch6#sec_replication_new_replica)
    - 同步与异步, [同步复制与异步复制](/ch6#sec_replication_sync_async)-[同步复制与异步复制](/ch6#sec_replication_sync_async)
  - 状态机复制, [基于语句的复制](/ch6#statement-based-replication), [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs), [使用共享日志](/ch10#sec_consistency_smr), [数据库与流](/ch12#sec_stream_databases)
    - 事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
    - 对确定性的依赖, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 使用共识, [共识的利弊](/ch10#pros-and-cons-of-consensus)
  - 使用纠删码, [分布式文件系统](/ch11#sec_batch_dfs)
  - 使用对象存储, [设置新的副本](/ch6#sec_replication_new_replica)
  - 与备份的对比, [复制](/ch6#ch_replication)
  - 跨异构数据系统, [保持系统同步](/ch12#sec_stream_sync)
- replication logs（见 logs）
- representations of data（见 data models）
- 重新处理数据, [应用演化后重新处理数据](/ch13#sec_future_reprocessing), [统一批处理和流处理](/ch13#id338)
  - （另见 可演化性）
  - 基于日志的消息传递中, [重播旧消息](/ch12#sec_stream_replay)
- 请求对冲, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
- 请求标识符, [操作标识符](/ch13#id355), [多分区请求处理](/ch13#id360)
- 请求路由, [请求路由](/ch7#sec_sharding_routing)-[请求路由](/ch7#sec_sharding_routing)
  - 方法, [请求路由](/ch7#sec_sharding_routing)
- 数据驻留法规, [分布式与单节点系统](/ch1#sec_introduction_distributed), [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 弹性系统, [可靠性与容错](/ch2#sec_introduction_reliability)
  - （另见 fault tolerance）
- 资源隔离, [云计算与超级计算](/ch1#id17), [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 资源限制, [云时代的运维](/ch1#sec_introduction_operations)
- 响应时间
  - 作为性能指标, [描述性能](/ch2#sec_introduction_percentiles), [批处理](/ch11#ch_batch)
  - 保证, [响应时间保证](/ch9#sec_distributed_clocks_realtime)
  - 对用户的影响, [平均值、中位数与百分位点](/ch2#id24)
  - 在复制系统中, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 与延迟的对比, [延迟与响应时间](/ch2#id23)
  - 平均值和百分位数, [平均值、中位数与百分位点](/ch2#id24)
  - 用户体验, [平均值、中位数与百分位点](/ch2#id24)
- 责任和问责制, [责任与问责](/ch14#id371)
- 表述性状态传递, [Web 服务](/ch5#sec_web_services)
  - （另见 services）
- Restate（工作流引擎）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- RethinkDB（数据库）
  - 连接支持, [文档和关系数据库的融合](/ch3#convergence-of-document-and-relational-databases)
  - 键范围分片, [按键的范围分片](/ch7#sec_sharding_key_range)
- 重试风暴, [描述性能](/ch2#sec_introduction_percentiles), [软件故障](/ch2#software-faults)
- reverse ETL, [超越数据湖](/ch1#beyond-the-data-lake)
- Riak（数据库）
  - CRDT support, [CRDT 与操作变换](/ch6#sec_replication_crdts), [检测并发写入](/ch6#sec_replication_concurrent)
  - 点状版本向量, [版本向量](/ch6#version-vectors)
  - 流言协议, [请求路由](/ch7#sec_sharding_routing)
  - 哈希分片, [固定数量的分片](/ch7#fixed-number-of-shards)
  - 无主复制, [无主复制](/ch6#sec_replication_leaderless)
  - 缺乏线性一致性, [线性一致性与仲裁](/ch10#sec_consistency_quorum_linearizable)
  - 多区域支持, [多地区操作](/ch6#multi-region-operation)
  - 再平衡, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
  - 二级索引, [本地二级索引](/ch7#id166)
  - 草率法定人数, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 虚拟节点（分片）, [分片](/ch7#ch_sharding)
- 环形缓冲区, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- RisingWave（数据库）
  - 增量视图维护, [维护物化视图](/ch12#sec_stream_mat_view)
- 火箭, [拜占庭故障](/ch9#sec_distributed_byzantine)
- RocksDB (storage engine), [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 作为嵌入式存储引擎, [压实策略](/ch4#sec_storage_lsm_compaction)
  - 分层压实, [压实策略](/ch4#sec_storage_lsm_compaction)
  - 提供派生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
- 回滚（事务）, [事务](/ch8#ch_transactions)
- 滚动升级, [通过冗余容忍硬件故障](/ch2#tolerating-hardware-faults-through-redundancy), [编码与演化](/ch5#ch_encoding), [故障与部分失效](/ch9#sec_distributed_partial_failure)
  - 在多租户系统中, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- routing（见 request routing）
- 基于行的复制, [逻辑（基于行）日志复制](/ch6#logical-row-based-log-replication)
- 面向行存储, [列式存储](/ch4#sec_storage_column)
- Rowhammer（内存损坏）, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
- RPCs（见 remote procedure calls）
- 规则（Datalog）, [Datalog：递归关系查询](/ch3#id62)
- Rust（编程语言）
  - 内存管理, [限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)

### S

- S3（对象存储）, [云服务的分层](/ch1#layering-of-cloud-services), [设置新的副本](/ch6#sec_replication_new_replica), [批处理](/ch11#ch_batch), [分布式文件系统](/ch11#sec_batch_dfs), [对象存储](/ch11#id277)
  - 检查数据完整性, [不要盲目信任承诺](/ch13#id364)
  - 有条件写入, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
  - 对象大小, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
  - S3 Express One Zone, [对象存储](/ch11#id277), [对象存储](/ch11#id277)
  - use in MapReduce, [MapReduce](/ch11#sec_batch_mapreduce)
  - 工作流程示例, [工作流调度](/ch11#sec_batch_workflows)
- SaaS（见 软件即服务（SaaS））
- 安全性与活性属性, [安全性与活性](/ch9#sec_distributed_safety_liveness)
  - 在共识算法中, [单值共识](/ch10#single-value-consensus)
  - 事务中, [事务](/ch8#ch_transactions)
- sagas（见 compensating transactions）
- Samza （流处理器）, [流分析](/ch12#id318)
- SAP HANA（数据库）, [分析型数据存储](/ch4#sec_storage_analytics)
- 可伸缩性, [可伸缩性](/ch2#sec_introduction_scalability)-[可伸缩性原则](/ch2#id35), [流式系统的哲学](/ch13#ch_philosophy)
  - 自动缩放, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
  - 通过分片, [分片的利与弊](/ch7#sec_sharding_reasons)
  - 描述负载, [描述负载](/ch2#id33)
  - 描述性能, [描述性能](/ch2#sec_introduction_percentiles)
  - 线性, [描述负载](/ch2#id33)
  - 原则, [可伸缩性原则](/ch2#id35)
  - 复制和, [复制延迟的问题](/ch6#sec_replication_lag)
  - 纵向伸缩与横向伸缩, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing)
- 横向伸缩, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing)
  - （另见 shared-nothing architecture）
  - 通过分片, [分片的利与弊](/ch7#sec_sharding_reasons)
- 纵向伸缩, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing)
- 缓慢变化的维度, [连接的时间依赖性](/ch12#sec_stream_join_time)
- 调度
  - 算法, [资源分配](/ch11#id279)
  - 批量任务, [分布式作业编排](/ch11#id278)-[工作流调度](/ch11#sec_batch_workflows)
  - 组调度, [资源分配](/ch11#id279)
- 读时模式, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
  - 与可演化模式的比较, [模式的优点](/ch5#sec_encoding_schemas)
- 写时模式, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
- schemaless databases（见 schema-on-read）
- 模式, [术语表](/glossary)
  - Avro, [Avro](/ch5#sec_encoding_avro)-[动态生成的模式](/ch5#dynamically-generated-schemas)
    - 读取者确定写入者模式, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
    - 模式演化, [写入者模式与读取者模式](/ch5#the-writers-schema-and-the-readers-schema)
  - 动态生成, [动态生成的模式](/ch5#dynamically-generated-schemas)
  - 变化, [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
    - 影响应用程序代码, [编码与演化](/ch5#ch_encoding)
    - 兼容性检查, [但什么是写入者模式？](/ch5#but-what-is-the-writers-schema)
    - 数据库中, [流经数据库的数据流](/ch5#sec_encoding_dataflow_db)-[归档存储](/ch5#archival-storage)
    - 服务调用, [RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - 文档模型中的灵活性, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
  - 用于分析, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)-[星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
  - for JSON and XML, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json), [JSON 模式](/ch5#json-schema)
  - generation and migration using ORMs, [对象关系映射（ORM）](/ch3#object-relational-mapping-orm)
  - 优点, [模式的优点](/ch5#sec_encoding_schemas)
  - 迁移, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
  - Protocol Buffers, [Protocol Buffers](/ch5#sec_encoding_protobuf)-[字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
    - 模式演化, [字段标签与模式演化](/ch5#field-tags-and-schema-evolution)
  - 铁路上的模式迁移, [应用演化后重新处理数据](/ch13#sec_future_reprocessing)
  - 传统设计方法的谬误, [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views)
- 科学计算, [云计算与超级计算](/ch1#id17)
- scikit-learn（Python 库）, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- ScyllaDB（数据库）
  - 集群元数据, [请求路由](/ch7#sec_sharding_routing)
  - consistency level ANY, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
  - 哈希分片, [按键的哈希分片](/ch7#sec_sharding_hash), [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 最后写入胜利的冲突解决, [检测并发写入](/ch6#sec_replication_concurrent)
  - 无主复制, [无主复制](/ch6#sec_replication_leaderless)
  - 轻量事务, [单对象写入](/ch8#sec_transactions_single_object)
  - 缺乏线性一致性, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 日志结构存储, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 多区域支持, [多地区操作](/ch6#multi-region-operation)
  - 使用时钟, [仲裁一致性的局限](/ch6#sec_replication_quorum_limitations), [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 虚拟节点（分片）, [分片](/ch7#ch_sharding)
- search engines（见 全文检索）
- 搜索流, [在流上搜索](/ch12#id320)
- 备库（见 基于领导者的复制）
- 二级索引, [多列索引与二级索引](/ch4#sec_storage_index_multicolumn), [术语表](/glossary)
  - 多对多关系, [多对一与多对多关系](/ch3#sec_datamodels_many_to_many)
  - 双写问题, [保持系统同步](/ch12#sec_stream_sync), [理解数据流](/ch13#id443)
  - 分片, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)-[全局二级索引](/ch7#id167), [总结](/ch7#summary)
    - 全局, [全局二级索引](/ch7#id167)
    - 索引维护, [维护派生状态](/ch13#id446)
    - 本地, [本地二级索引](/ch7#id166)
  - 更新、事务隔离和, [多对象事务的需求](/ch8#sec_transactions_need)
- 二次排序, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
- sed（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis)
- 自托管, [云服务与自托管](/ch1#sec_introduction_cloud)
  - 数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- 自连接, [本章小结](/ch12#id332)
- 自验证系统, [不要盲目信任承诺](/ch13#id364)
- 语义搜索, [向量嵌入](/ch4#id92)
- 语义相似性, [向量嵌入](/ch4#id92)
- 语义网, [三元组存储与 SPARQL](/ch3#id59)
- 半同步复制, [同步复制与异步复制](/ch6#sec_replication_sync_async)
- 顺序写（访问模式）, [顺序与随机写入](/ch4#sidebar_sequential)
- 可串行化, [隔离性](/ch8#sec_transactions_acid_isolation), [弱隔离级别](/ch8#sec_transactions_isolation_levels), [可串行化](/ch8#sec_transactions_serializability)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation), [术语表](/glossary)
  - 与线性一致性的比较, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 悲观与乐观的并发控制, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
  - 串行执行, [实际串行执行](/ch8#sec_transactions_serial)-[串行执行总结](/ch8#summary-of-serial-execution)
    - 分片, [分片](/ch8#sharding)
    - 使用存储过程, [将事务封装在存储过程中](/ch8#encapsulating-transactions-in-stored-procedures), [使用共享日志](/ch10#sec_consistency_smr)
  - 可串行化快照隔离, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - detecting stale MVCC reads, [检测陈旧的 MVCC 读取](/ch8#detecting-stale-mvcc-reads)
    - 检测影响先前读取的写入, [检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
    - 分布式执行, [可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
    - performance of SSI, [可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - 防止写偏差, [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)-[检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
  - 严格可串行化, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
    - 及时性与完整性, [及时性与完整性](/ch13#sec_future_integrity)
  - 两阶段锁定, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)-[索引范围锁](/ch8#sec_transactions_2pl_range)
    - 索引范围锁定, [索引范围锁](/ch8#sec_transactions_2pl_range)
    - 性能, [两阶段锁定的性能](/ch8#performance-of-two-phase-locking)
- Serializable（Java）, [特定语言的格式](/ch5#id96)
- 序列化, [编码数据的格式](/ch5#sec_encoding_formats)
  - （另见 编码）
- 无服务器, [微服务与无服务器](/ch1#sec_introduction_microservices)
- 服务发现, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery), [请求路由](/ch7#sec_sharding_routing), [服务发现](/ch10#service-discovery)
  - 注册, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
  - using DNS, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery), [请求路由](/ch7#sec_sharding_routing), [服务发现](/ch10#service-discovery)
- 服务级别协议（SLA）, [响应时间指标的应用](/ch2#sec_introduction_slo_sla), [描述负载](/ch2#id33)
- 服务网格, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery)
- Service Organization Control (SOC), [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- 服务时间, [延迟与响应时间](/ch2#id23)
- 面向服务的体系结构, [微服务与无服务器](/ch1#sec_introduction_microservices)
  - （另见 services）
- 服务, [流经服务的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)-[RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
  - 微服务, [微服务与无服务器](/ch1#sec_introduction_microservices)
    - 跨服务的因果关系, [全序的限制](/ch13#id335)
    - 松耦合, [开展分拆工作](/ch13#sec_future_unbundling_favor)
  - 与批处理/流处理器的关系, [批处理](/ch11#ch_batch), [流处理器和服务](/ch13#id345)
  - remote procedure calls (RPCs), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)-[RPC 的数据编码与演化](/ch5#data-encoding-and-evolution-for-rpc)
    - 问题, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
  - 与数据库相似, [流经服务的数据流：REST 与 RPC](/ch5#sec_encoding_dataflow_rpc)
  - 网络服务, [Web 服务](/ch5#sec_web_services)
- 会话窗口（流处理）, [窗口的类型](/ch12#id324)
  - （另见 windows）
- 分片, [分片](/ch7#ch_sharding)-[总结](/ch7#summary), [术语表](/glossary)
  - 和共识, [使用共享日志](/ch10#sec_consistency_smr)
  - 复制, [分片](/ch7#ch_sharding)
  - 分散事务, [分布式事务](/ch8#sec_transactions_distributed)
  - 热分片, [键值数据的分片](/ch7#sec_sharding_key_value)
  - 在批处理中, [批处理](/ch11#ch_batch)
  - 键范围拆分, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)
  - 多分片操作, [多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
    - 强制约束, [多分区请求处理](/ch13#id360)
    - 二级索引维护, [维护派生状态](/ch13#id446)
  - 键值数据, [键值数据的分片](/ch7#sec_sharding_key_value)-[偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
    - 按键的范围, [按键的范围分片](/ch7#sec_sharding_key_range)
    - 偏斜与热点, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
  - 词源, [分片](/ch7#ch_sharding)
  - 分区键, [分片的利与弊](/ch7#sec_sharding_reasons), [键值数据的分片](/ch7#sec_sharding_key_value)
  - 再平衡
    - 键范围分片数据, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)
  - 重新平衡困难, [重新平衡键范围分片数据](/ch7#rebalancing-key-range-sharded-data)-[运维：自动/手动再平衡](/ch7#sec_sharding_operations)
    - 自动或人工重新平衡, [运维：自动/手动再平衡](/ch7#sec_sharding_operations)
    - Hash mod N的问题, [哈希取模节点数](/ch7#hash-modulo-number-of-nodes)
    - 使用固定数量的分片, [固定数量的分片](/ch7#fixed-number-of-shards)
    - 使用 N 个节点, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)-[请求路由](/ch7#sec_sharding_routing)
  - 二级索引, [分片与二级索引](/ch7#sec_sharding_secondary_indexes)-[全局二级索引](/ch7#id167)
    - 全局, [全局二级索引](/ch7#id167)
    - 本地, [本地二级索引](/ch7#id166)
  - 与事务的串行执行, [分片](/ch8#sharding)
  - 对分片数据排序, [混洗数据](/ch11#sec_shuffle)
- 共享日志, [共识的实践](/ch10#sec_consistency_total_order)-[共识的利弊](/ch10#pros-and-cons-of-consensus), [全序的限制](/ch13#id335), [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
  - 算法, [共识的实践](/ch10#sec_consistency_total_order)
  - 用于事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 用于消息传递, [基于日志的消息代理](/ch12#sec_stream_log)-[重播旧消息](/ch12#sec_stream_replay)
  - 与共识的关系, [共享日志作为共识](/ch10#sec_consistency_shared_logs)
  - 使用, [使用共享日志](/ch10#sec_consistency_smr)
- 共享模式, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
- 共享磁盘架构, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing), [分布式文件系统](/ch11#sec_batch_dfs)
- 共享内存架构, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing)
- 无共享架构, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing), [术语表](/glossary)
  - 分布式文件系统, [分布式文件系统](/ch11#sec_batch_dfs)
    - （另见 distributed filesystems）
  - 网络的使用, [不可靠的网络](/ch9#sec_distributed_networks)
- 鲨鱼
  - 咬海底电缆, [实践中的网络故障](/ch9#sec_distributed_network_faults)
  - 计数（例）, [文档的查询语言](/ch3#query-languages-for-documents)
- shredding (deletion)（见 crypto-shredding）
- 粉碎（列编码）, [列式存储](/ch4#sec_storage_column)
- 粉碎（关系模型）, [何时使用哪种模型](/ch3#sec_datamodels_document_summary)
- 混洗, [混洗数据](/ch11#sec_shuffle)-[混洗数据](/ch11#sec_shuffle)
- 兄弟, [手动冲突解决](/ch6#manual-conflict-resolution), [捕获先发生关系](/ch6#capturing-the-happens-before-relationship), [冲突解决与复制](/ch8#conflict-resolution-and-replication)
  - （另见 conflicts）
- 数据孤岛, [数据仓库](/ch1#sec_introduction_dwh)
- 相似性搜索
  - 编辑距离, [全文检索](/ch4#sec_storage_full_text)
  - 基因组数据, [总结](/ch3#summary)
- 简单性, [简单性：管理复杂度](/ch2#id38)
- Singer, [数据仓库](/ch1#sec_introduction_dwh)
- single-instruction-multi-data (SIMD) instructions, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- single-leader replication（见 基于领导者的复制）
- 单线程执行, [原子写操作](/ch8#atomic-write-operations), [实际串行执行](/ch8#sec_transactions_serial)
  - 在流处理中, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [并发控制](/ch12#sec_stream_concurrency), [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
- SingleStore（数据库）
  - 内存存储, [全内存存储](/ch4#sec_storage_inmemory)
- 站点可靠性工程师, [云时代的运维](/ch1#sec_introduction_operations)
- 大小分级压实, [压实策略](/ch4#sec_storage_lsm_compaction), [磁盘空间使用](/ch4#disk-space-usage)
- 偏斜, [术语表](/glossary)
  - 时钟偏差, [对同步时钟的依赖](/ch9#sec_distributed_clocks_relying)-[带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval), [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 事务隔离
    - 读取偏差, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation), [总结](/ch8#summary)
    - 写偏差, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts), [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)-[检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
      - （另见 写偏差）
  - 含义, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - 不平衡的工作量, [键值数据的分片](/ch7#sec_sharding_key_value)
    - 补偿, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
    - 由于名人, [偏斜的工作负载与缓解热点](/ch7#sec_sharding_skew)
    - 时间序列数据, [按键的范围分片](/ch7#sec_sharding_key_range)
- 跳表, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
- SLA（见 服务级别协议）
- Slack（分组聊天）
  - GraphQL example, [GraphQL](/ch3#id63)
- SlateDB（数据库）, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [设置新的副本](/ch6#sec_replication_new_replica)
- 滑动窗口（流处理）, [窗口的类型](/ch12#id324)
  - （另见 windows）
- 草率法定人数, [单主与无主复制的性能](/ch6#sec_replication_leaderless_perf)
- 缓慢变化的维度, [连接的时间依赖性](/ch12#sec_stream_join_time)
- 涂抹（闰秒调整）, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- 快照（数据库）
  - 作为备份, [复制](/ch6#ch_replication)
  - 计算衍生数据, [创建索引](/ch13#id340)
  - 变化数据捕获中, [初始快照](/ch12#sec_stream_cdc_snapshot)
  - 可串行化快照隔离, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
  - 新建副本, [设置新的副本](/ch6#sec_replication_new_replica)
  - 快照隔离与可重复读, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)-[快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
    - implementing with MVCC, [多版本并发控制（MVCC）](/ch8#sec_transactions_snapshot_impl)
    - indexes and MVCC, [索引与快照隔离](/ch8#indexes-and-snapshot-isolation)
    - 可见度规则, [观察一致快照的可见性规则](/ch8#sec_transactions_mvcc_visibility)
  - 全球快照同步时钟, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
- Snowflake（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native), [云服务的分层](/ch1#layering-of-cloud-services), [云数据仓库](/ch4#sec_cloud_data_warehouses), [批处理](/ch11#ch_batch)
  - 面向列的存储, [列式存储](/ch4#sec_storage_column)
  - 处理写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
  - 分片与聚簇, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - Snowpark, [查询语言](/ch11#sec_batch_query_lanauges)
- Snowflake (ID generator), [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
- 雪花模式, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- SOAP (web services), [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
- SOC2（见 Service Organization Control (SOC)）
- 社交图谱, [图数据模型](/ch3#sec_datamodels_graph)
- 社会
  - 的责任, [数据系统、法律与社会](/ch1#sec_introduction_compliance), [立法与自律](/ch14#sec_future_legislation)
- 社会技术系统, [人类与可靠性](/ch2#id31)
- 软件即服务（SaaS）, [数据系统架构中的权衡](/ch1#ch_tradeoffs), [云服务与自托管](/ch1#sec_introduction_cloud)
  - ETL from, [数据仓库](/ch1#sec_introduction_dwh)
  - 多租户, [面向多租户的分片](/ch7#sec_sharding_multitenancy)
- 软件错误, [软件故障](/ch2#software-faults)
  - 维护完整性, [维护完整性，尽管软件有Bug](/ch13#id455)
- 太阳风暴, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
- solid state drives (SSDs)
  - 访问模式, [顺序与随机写入](/ch4#sidebar_sequential)
  - 与对象存储的比较, [设置新的副本](/ch6#sec_replication_new_replica)
  - 检测数据损坏, [端到端原则](/ch13#sec_future_e2e_argument), [不要盲目信任承诺](/ch13#id364)
  - 故障率, [硬件与软件故障](/ch2#sec_introduction_hardware_faults)
  - 其中的故障, [持久性](/ch8#durability)
  - 固件错误, [软件故障](/ch2#software-faults)
  - 读取吞吐量, [读取性能](/ch4#read-performance)
  - 顺序写入与随机写入, [顺序与随机写入](/ch4#sidebar_sequential)
- Solr （搜索服务器）
  - 本地二级索引, [本地二级索引](/ch7#id166)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 使用 Lucene, [全文检索](/ch4#sec_storage_full_text)
- sort（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis), [简单日志分析](/ch11#sec_batch_log_analysis), [排序与内存聚合](/ch11#id275), [分布式作业编排](/ch11#id278)
- 排序归并连接（MapReduce）, [JOIN 与 GROUP BY](/ch11#sec_batch_join)
- Sorted String Tables（见 SSTables）
- 排序
  - 列存储中的排序顺序, [列存储中的排序顺序](/ch4#sort-order-in-column-storage)
- 真相来源（权威数据源）（见 systems of record）
- Spanner（数据库）
  - 一致性模式, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 数据位置, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - 在云层中, [云原生系统架构](/ch1#sec_introduction_cloud_native)
  - 使用时钟快照隔离, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 事务, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
  - TrueTime API, [带置信区间的时钟读数](/ch9#clock-readings-with-a-confidence-interval)
- Spark（处理框架）, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake), [云原生系统架构](/ch1#sec_introduction_cloud_native), [批处理](/ch11#ch_batch), [数据流引擎](/ch11#sec_batch_dataflow)
  - 成本效率, [查询语言](/ch11#sec_batch_query_lanauges)
  - DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes), [DataFrames](/ch11#id287)
  - 容错, [故障处理](/ch11#id281)
  - 数据仓库, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - high availability using ZooKeeper, [协调服务](/ch10#sec_consistency_coordination)
  - MLlib, [机器学习](/ch11#id290)
  - 查询优化器, [查询语言](/ch11#sec_batch_query_lanauges)
  - 移动数据, [混洗数据](/ch11#sec_shuffle)
  - Spark Streaming, [流分析](/ch12#id318)
    - 微批次, [微批次与存档点](/ch12#id329)
  - streaming SQL support, [复合事件处理](/ch12#id317)
  - 用于 ETL, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
- SPARQL（查询语言）, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- 稀疏索引, [SSTable 文件格式](/ch4#the-sstable-file-format)
- 稀疏矩阵, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 脑裂, [领导者故障：故障转移](/ch6#leader-failure-failover), [请求路由](/ch7#sec_sharding_routing), [术语表](/glossary)
  - 强制约束, [唯一性约束需要达成共识](/ch13#id452)
  - 在共识算法中, [共识](/ch10#sec_consistency_consensus), [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
  - 预防, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 使用隔离令牌来避免, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)-[多副本隔离](/ch9#fencing-with-multiple-replicas)
- 竞价实例, [故障处理](/ch11#id281)
- 电子表格, [数据系统架构中的权衡](/ch1#ch_tradeoffs), [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 数据流编程, [围绕数据流设计应用](/ch13#sec_future_dataflow)
  - 数据透视表, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- SQL (Structured Query Language), [简单性：管理复杂度](/ch2#id38), [关系模型与文档模型](/ch3#sec_datamodels_history), [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 用于分析, [数据仓库](/ch1#sec_introduction_dwh), [列式存储](/ch4#sec_storage_column)
  - 图查询, [SQL 中的图查询](/ch3#id58)
  - 隔离级别标准的问题, [快照隔离、可重复读和命名混淆](/ch8#snapshot-isolation-repeatable-read-and-naming-confusion)
  - 连接, [规范化、反规范化与连接](/ch3#sec_datamodels_normalization)
  - 简历（例）, [用于一对多关系的文档数据模型](/ch3#the-document-data-model-for-one-to-many-relationships)
  - 社会网络家庭时间表（例）, [表示用户、帖子与关注关系](/ch2#id20)
  - SQL injection vulnerability, [拜占庭故障](/ch9#sec_distributed_byzantine)
  - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)
  - 存储过程, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 批处理框架中的支持, [批处理](/ch11#ch_batch)
  - 视图, [Datalog：递归关系查询](/ch3#id62)
- SQL Server（数据库）
  - archiving WAL to object stores, [设置新的副本](/ch6#sec_replication_new_replica)
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307)
  - 数据存储支持, [分析型数据存储](/ch4#sec_storage_analytics)
  - 分布式事务支持, [XA 事务](/ch8#xa-transactions)
  - 基于领导者的复制, [单主复制](/ch6#sec_replication_leader)
  - 多领导复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 防止丢失更新, [自动检测丢失的更新](/ch8#automatically-detecting-lost-updates)
  - 防止写偏差, [写偏差的特征](/ch8#characterizing-write-skew), [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 读已提交隔离, [实现读已提交](/ch8#sec_transactions_read_committed_impl)
  - 可串行化隔离, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - T-SQL language, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
- SQLite（数据库）, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems), [压实策略](/ch4#sec_storage_lsm_compaction)
  - archiving WAL to object stores, [设置新的副本](/ch6#sec_replication_new_replica)
- SRE (site reliability engineer), [云时代的运维](/ch1#sec_introduction_operations)
- SSDs（见 solid state drives）
- SSTables (storage format), [SSTable 文件格式](/ch4#the-sstable-file-format)-[压实策略](/ch4#sec_storage_lsm_compaction)
  - 建造和维护, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - making LSM-Tree from, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
- 阶段发布（见 rolling upgrades）
- 陈旧性（旧数据）, [读己之写](/ch6#sec_replication_ryw)
  - 跨通道时序依赖, [跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
  - 无主数据库中, [当节点故障时写入数据库](/ch6#id287)
  - 多版本并发控制中, [检测陈旧的 MVCC 读取](/ch8#detecting-stale-mvcc-reads)
  - 监控, [监控陈旧性](/ch6#monitoring-staleness)
  - 客户端状态, [将状态变更推送给客户端](/ch13#id348)
  - 与线性一致性的对比, [线性一致性](/ch10#sec_consistency_linearizability)
  - 与及时性的对比, [及时性与完整性](/ch13#sec_future_integrity)
- standbys（见 基于领导者的复制）
- 星型复制拓扑, [多主复制拓扑](/ch6#sec_replication_topologies)
- 星型模式, [星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)-[星型与雪花型：分析模式](/ch3#sec_datamodels_analytics)
- 星球大战类比（事件时间与处理时间）, [事件时间与处理时间](/ch12#id322)
- 饥饿（调度）, [资源分配](/ch11#id279)
- 状态
  - 从不可变事件日志中派生, [状态、流和不变性](/ch12#sec_stream_immutability)
  - 状态变化与应用程序代码之间的相互作用, [数据流：应用代码与状态变化的交互](/ch13#id450)
  - 维护派生状态, [维护派生状态](/ch13#id446)
  - 由流处理器在流-流连接中维护, [流流连接（窗口连接）](/ch12#id440)
  - 观察导出状态, [观察派生数据状态](/ch13#sec_future_observing)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
  - 流处理器失败后重建, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 应用代码和, [应用代码和状态的分离](/ch13#id344)
- 状态机复制, [基于语句的复制](/ch6#statement-based-replication), [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs), [使用共享日志](/ch10#sec_consistency_smr), [数据库与流](/ch12#sec_stream_databases)
  - 事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 依赖决定性因素, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- 无状态系统, [数据系统架构中的权衡](/ch1#ch_tradeoffs)
- 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication)
  - 对确定性的依赖, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- 静态类型语言
  - 与写时模式的类比, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
- 统计和数字算法, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- StatsD (metrics aggregator), [直接从生产者传递给消费者](/ch12#id296)
- 股票行情推送, [直接从生产者传递给消费者](/ch12#id296)
- 爆彼之头, [领导者故障：故障转移](/ch6#leader-failure-failover)
  - 问题, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
- 停止所有处理（见 garbage collection）
- 存储
  - 组合使用数据存储技术, [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
- 存储区域网络（SAN）, [共享内存、共享磁盘与无共享架构](/ch2#sec_introduction_shared_nothing), [分布式文件系统](/ch11#sec_batch_dfs)
- 存储引擎, [存储与检索](/ch4#ch_storage)-[总结](/ch4#summary)
  - 面向列, [列式存储](/ch4#sec_storage_column)-[查询执行：编译与向量化](/ch4#sec_storage_vectorized)
    - 列压缩, [列压缩](/ch4#sec_storage_column_compression)-[列压缩](/ch4#sec_storage_column_compression)
    - 定义, [列式存储](/ch4#sec_storage_column)
    - Parquet, [云数据仓库](/ch4#sec_cloud_data_warehouses), [列式存储](/ch4#sec_storage_column), [归档存储](/ch5#archival-storage)
    - 其中的排序顺序, [列存储中的排序顺序](/ch4#sort-order-in-column-storage)-[列存储中的排序顺序](/ch4#sort-order-in-column-storage)
    - 宽列模型, [列压缩](/ch4#sec_storage_column_compression)
    - 写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
  - 内存存储, [全内存存储](/ch4#sec_storage_inmemory)
    - 持久性, [持久性](/ch8#durability)
  - 面向行, [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)-[全内存存储](/ch4#sec_storage_inmemory)
    - B树, [B 树](/ch4#sec_storage_b_trees)-[B 树变体](/ch4#b-tree-variants)
    - comparing B-trees and LSM-trees, [比较 B 树与 LSM 树](/ch4#sec_storage_btree_lsm_comparison)-[磁盘空间使用](/ch4#disk-space-usage)
    - 定义, [列式存储](/ch4#sec_storage_column)
    - 日志结构, [日志结构存储](/ch4#sec_storage_log_structured)-[压实策略](/ch4#sec_storage_lsm_compaction)
- 存储过程, [将事务封装在存储过程中](/ch8#encapsulating-transactions-in-stored-procedures)-[存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs), [术语表](/glossary)
  - 和共享日志, [使用共享日志](/ch10#sec_consistency_smr)
  - 利弊因素, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 类似于流处理器, [应用代码作为派生函数](/ch13#sec_future_dataflow_derivation)
- Storm（流处理器）, [流分析](/ch12#id318)
  - distributed RPC, [事件驱动架构与 RPC](/ch12#sec_stream_actors_drpc), [多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
  - Trident 状态处理, [幂等性](/ch12#sec_stream_idempotence)
- 滞留事件, [处理滞留事件](/ch12#id323)
- Stream Control Transmission Protocol (SCTP), [TCP 的局限性](/ch9#sec_distributed_tcp)
- 流处理, [流处理](/ch12#sec_stream_processing)-[本章小结](/ch12#id332), [术语表](/glossary)
  - 在作业中访问外部服务, [流表连接（流扩充）](/ch12#sec_stream_table_joins), [微批次与存档点](/ch12#id329), [幂等性](/ch12#sec_stream_idempotence), [恰好执行一次操作](/ch13#id353)
  - 与批处理相结合, [统一批处理和流处理](/ch13#id338)
  - 与批处理的比较, [流处理](/ch12#sec_stream_processing)
  - 复合事件处理, [复合事件处理](/ch12#id317)
  - 过失容忍, [容错](/ch12#sec_stream_fault_tolerance)-[失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
    - 原子提交, [原子提交再现](/ch12#sec_stream_atomic_commit)
    - 幂等性, [幂等性](/ch12#sec_stream_idempotence)
    - 微打斗和检查站, [微批次与存档点](/ch12#id329)
    - 失败后重建状态, [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 数据整合, [批处理与流处理](/ch13#sec_future_batch_streaming)-[统一批处理和流处理](/ch13#id338)
  - 用于事件溯源, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 维护派生状态, [维护派生状态](/ch13#id446)
  - 维护物化视图, [维护物化视图](/ch12#sec_stream_mat_view)
  - messaging systems（见 messaging systems）
  - 关于时间的推理, [时间推理](/ch12#sec_stream_time)-[窗口的类型](/ch12#id324)
    - 事件时间与处理时间, [事件时间与处理时间](/ch12#id322), [微批次与存档点](/ch12#id329), [统一批处理和流处理](/ch13#id338)
    - 知道窗口何时准备好, [处理滞留事件](/ch12#id323)
    - 窗口类型, [窗口的类型](/ch12#id324)
  - relation to databases（见 streams）
  - 与服务的关系, [流处理器和服务](/ch13#id345)
  - 与批次处理的关系, [批处理](/ch11#ch_batch)
  - 在流中搜索, [在流上搜索](/ch12#id320)
  - 单条执行, [日志与传统的消息传递相比](/ch12#sec_stream_logs_vs_messaging), [并发控制](/ch12#sec_stream_concurrency)
  - 流式分析, [流分析](/ch12#id318)
  - 串流连接, [流连接](/ch12#sec_stream_joins)-[连接的时间依赖性](/ch12#sec_stream_join_time)
    - 串流流连接, [流流连接（窗口连接）](/ch12#id440)
    - 串行表连接, [流表连接（流扩充）](/ch12#sec_stream_table_joins)
    - 表格连接, [表表连接（维护物化视图）](/ch12#id326)
    - 时间的依赖性, [连接的时间依赖性](/ch12#sec_stream_join_time)
- 流程, [流处理](/ch12#ch_stream)-[重播旧消息](/ch12#sec_stream_replay)
  - 端对端,向客户推进事件, [端到端的事件流](/ch13#id349)
  - messaging systems（见 messaging systems）
  - processing（见 流处理）
  - 与数据库的关系, [数据库与流](/ch12#sec_stream_databases)-[不变性的局限性](/ch12#sec_stream_immutability_limitations)
    - （另见 changelogs）
    - 变更流的 API 支持, [变更流的 API 支持](/ch12#sec_stream_change_api)
    - 数据变更捕获, [数据变更捕获](/ch12#sec_stream_cdc)-[变更流的 API 支持](/ch12#sec_stream_change_api)
    - 按时间分列的状态衍生物, [状态、流和不变性](/ch12#sec_stream_immutability)
    - 事件溯源, [数据变更捕获与事件溯源](/ch12#sec_stream_event_sourcing)
    - 保持系统同步, [保持系统同步](/ch12#sec_stream_sync)-[保持系统同步](/ch12#sec_stream_sync)
    - 不可改变事件哲学, [状态、流和不变性](/ch12#sec_stream_immutability)-[不变性的局限性](/ch12#sec_stream_immutability_limitations)
  - 专题, [传递事件流](/ch12#sec_stream_transmit)
- 严格的序列性, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 及时性与完整性, [及时性与完整性](/ch13#sec_future_integrity)
- 条纹（列编码）, [列式存储](/ch4#sec_storage_column)
- 强一致性（见 线性一致性）
- 强最终一致性, [自动冲突解决](/ch6#automatic-conflict-resolution)
- 强单副本可串行化, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
- 主语、谓语和宾语（三元组）, [三元组存储与 SPARQL](/ch3#id59)
- 订阅者, [传递事件流](/ch12#sec_stream_transmit)
  - （另见 consumers）
- 超级计算机, [云计算与超级计算](/ch1#id17)
- Superset（数据可视化软件）, [分析（Analytics）](/ch11#sec_batch_olap)
- 监视, [监视](/ch14#id374)
  - （另见 隐私）
- 寿司原则, [从数据仓库到数据湖](/ch1#from-data-warehouse-to-data-lake)
- 可持续性, [分布式与单节点系统](/ch1#sec_introduction_distributed)
- Swagger（服务定义格式）, [Web 服务](/ch5#sec_web_services)
- swapping to disk（见 virtual memory）
- Swift（编程语言）
  - 内存管理, [限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
- 同步引擎, [同步引擎与本地优先软件](/ch6#sec_replication_offline_clients)-[同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
  - 实例, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
  - 用于本地第一软件, [实时协作、离线优先和本地优先应用](/ch6#real-time-collaboration-offline-first-and-local-first-apps)
- 同步网络, [同步与异步网络](/ch9#sec_distributed_sync_networks), [术语表](/glossary)
  - 比较同步网络, [同步与异步网络](/ch9#sec_distributed_sync_networks)
  - 系统模型, [系统模型与现实](/ch9#sec_distributed_system_model)
- 同步复制, [同步复制与异步复制](/ch6#sec_replication_sync_async), [术语表](/glossary)
  - 有多个领导, [多主复制](/ch6#sec_replication_multi_leader)
- 系统管理员, [云时代的运维](/ch1#sec_introduction_operations)
- 系统模型, [知识、真相和谎言](/ch9#sec_distributed_truth), [系统模型与现实](/ch9#sec_distributed_system_model)-[确定性模拟测试](/ch9#deterministic-simulation-testing)
  - 假设, [信任但验证](/ch13#sec_future_verification)
  - 算法的正确性, [定义算法的正确性](/ch9#defining-the-correctness-of-an-algorithm)
  - 绘制真实世界的地图, [将系统模型映射到现实世界](/ch9#mapping-system-models-to-the-real-world)
  - 安全性与活性, [安全性与活性](/ch9#sec_distributed_safety_liveness)
- 记录系统, [记录系统与派生数据](/ch1#sec_introduction_derived), [术语表](/glossary)
  - 数据变更捕获, [数据变更捕获的实现](/ch12#id307), [理解数据流](/ch13#id443)
  - 事件日志, [事件溯源与 CQRS](/ch3#sec_datamodels_events)
  - 事件日志处理为, [状态、流和不变性](/ch12#sec_stream_immutability)
- 系统思维, [反馈回路](/ch14#id372)

### T

- t- digest（算法）, [响应时间指标的应用](/ch2#sec_introduction_slo_sla)
- 表格连接, [表表连接（维护物化视图）](/ch12#id326)
- Tableau（数据可视化软件）, [事务处理与分析的特征](/ch1#sec_introduction_oltp), [分析（Analytics）](/ch11#sec_batch_olap)
- tail（Unix 工具）, [使用日志进行消息存储](/ch12#id300)
- tail latency（见 延迟）
- 尾顶点（属性图）, [属性图](/ch3#id56)
- task (workflows)（见 workflow engines）
- TCP (Transmission Control Protocol), [TCP 的局限性](/ch9#sec_distributed_tcp)
  - 比较电路切换, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
  - comparison to UDP, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 连接失败, [检测故障](/ch9#id307)
  - 流量控制, [网络拥塞和排队](/ch9#network-congestion-and-queueing), [消息传递系统](/ch12#sec_stream_messaging)
  - 包检查和, [弱形式的谎言](/ch9#weak-forms-of-lying), [端到端原则](/ch13#sec_future_e2e_argument), [信任但验证](/ch13#sec_future_verification)
  - 可靠性和重复压制, [抑制重复](/ch13#id354)
  - 转发超时, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 用于事务会话, [单对象与多对象操作](/ch8#sec_transactions_multi_object)
- Temporal（工作流引擎）, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
- TensorFlow（机器学习库）, [机器学习](/ch11#id290)
- Teradata（数据库）, [云原生系统架构](/ch1#sec_introduction_cloud_native), [云数据仓库](/ch4#sec_cloud_data_warehouses)
- term-partitioned indexes（见 global secondary indexes）
- 终止（协商一致）, [单值共识](/ch10#single-value-consensus), [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
- 测试, [人类与可靠性](/ch2#id31)
- 抖动（内存不足）, [进程暂停](/ch9#sec_distributed_clocks_pauses)
- 线程（并发）
  - Actor 模型, [分布式 actor 框架](/ch5#distributed-actor-frameworks), [事件驱动架构与 RPC](/ch12#sec_stream_actors_drpc)
    - （另见 event-driven architecture）
  - 原子操作, [原子性](/ch8#sec_transactions_acid_atomicity)
  - 背景线程, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables)
  - 执行暂停, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable), [进程暂停](/ch9#sec_distributed_clocks_pauses)-[进程暂停](/ch9#sec_distributed_clocks_pauses)
  - 内存障碍, [线性一致性与网络延迟](/ch10#linearizability-and-network-delays)
  - 预设, [进程暂停](/ch9#sec_distributed_clocks_pauses)
  - single（见 single-threaded execution）
- 三阶段提交, [三阶段提交](/ch8#three-phase-commit)
- 三方关系, [属性图](/ch3#id56)
- Thrift（数据格式）, [Protocol Buffers](/ch5#sec_encoding_protobuf)
- 吞吐量, [描述性能](/ch2#sec_introduction_percentiles), [描述负载](/ch2#id33), [批处理](/ch11#ch_batch)
- TIBCO, [消息代理](/ch5#message-brokers)
  - Enterprise Message Service, [消息代理与数据库的对比](/ch12#id297)
  - StreamBase (stream analytics), [复合事件处理](/ch12#id317)
- TiDB（数据库）
  - 基于共识的复制, [单主复制](/ch6#sec_replication_leader)
  - Region（分片）, [分片](/ch7#ch_sharding)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 对外提供派生数据, [对外提供派生数据](/ch11#sec_batch_serving_derived)
  - 分片二级索引, [全局二级索引](/ch7#id167)
  - 快照隔离支持, [快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
  - 时间戳, [实现线性一致的 ID 生成器](/ch10#implementing-a-linearizable-id-generator)
  - 事务, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
  - 使用模型检查, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 分层存储, [设置新的副本](/ch6#sec_replication_new_replica), [磁盘空间使用](/ch12#sec_stream_disk_usage)
- TigerBeetle（数据库）, [总结](/ch3#summary)
  - 确定性模拟测试, [确定性模拟测试](/ch9#deterministic-simulation-testing)
- TigerGraph（数据库）
  - GSQL language, [SQL 中的图查询](/ch3#id58)
- Tigris（对象存储）, [分布式文件系统](/ch11#sec_batch_dfs)
- TileDB（数据库）, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- 时间
  - 并发与, ["先发生"关系与并发](/ch6#sec_replication_happens_before)
  - 跨渠道时间依赖性, [跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
  - 在分布式系统中, [不可靠的时钟](/ch9#sec_distributed_clocks)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
    - （另见 clocks）
    - 时钟同步和准确性, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
    - 依赖同步时钟, [对同步时钟的依赖](/ch9#sec_distributed_clocks_relying)-[用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 进程暂停, [进程暂停](/ch9#sec_distributed_clocks_pauses)-[限制垃圾回收的影响](/ch9#sec_distributed_gc_impact)
  - 流程处理器中的推理, [时间推理](/ch12#sec_stream_time)-[窗口的类型](/ch12#id324)
    - 事件时间与处理时间, [事件时间与处理时间](/ch12#id322), [微批次与存档点](/ch12#id329), [统一批处理和流处理](/ch13#id338)
    - 知道窗口何时准备好, [处理滞留事件](/ch12#id323)
    - 事件的时间戳, [你用的是谁的时钟？](/ch12#id438)
    - 窗口类型, [窗口的类型](/ch12#id324)
  - 分布式系统的系统模型, [系统模型与现实](/ch9#sec_distributed_system_model)
  - 串流中的时间依赖, [连接的时间依赖性](/ch12#sec_stream_join_time)
- 时间序列数据
  - as DataFrames, [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
  - 面向列的存储, [列式存储](/ch4#sec_storage_column)
- 每日时钟, [日历时钟](/ch9#time-of-day-clocks)
  - 混合逻辑时钟, [混合逻辑时钟](/ch10#hybrid-logical-clocks)
- 及时性, [及时性与完整性](/ch13#sec_future_integrity)
  - 协调-避免数据系统, [无协调数据系统](/ch13#id454)
  - 数据流系统的正确性, [数据流系统的正确性](/ch13#id453)
- 超时, [不可靠的网络](/ch9#sec_distributed_networks), [术语表](/glossary)
  - 动态配置, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 失败, [领导者故障：故障转移](/ch6#leader-failure-failover)
  - 长度, [超时和无界延迟](/ch9#sec_distributed_queueing)
- TimescaleDB（数据库）, [列式存储](/ch4#sec_storage_column)
- 时间戳, [逻辑时钟](/ch10#sec_consistency_timestamps)
  - 指定流处理中的事件, [你用的是谁的时钟？](/ch12#id438)
  - 读后写入一致性, [读己之写](/ch6#sec_replication_ryw)
  - 用于事务命令, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)
  - 执行制约因素不足, [使用逻辑时钟强制约束](/ch10#enforcing-constraints-using-logical-clocks)
  - 密钥范围, [按键的范围分片](/ch7#sec_sharding_key_range)
  - Lamport, [Lamport 时间戳](/ch10#lamport-timestamps)
  - 逻辑, [排序事件以捕获因果关系](/ch13#sec_future_capture_causality)
  - 命令事件, [用于事件排序的时间戳](/ch9#sec_distributed_lww)
  - 时间戳, [实现线性一致的 ID 生成器](/ch10#implementing-a-linearizable-id-generator)
- TLA+ (specification language), [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
- 令牌桶（限制重试）, [描述性能](/ch2#sec_introduction_percentiles)
- 墓碑, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [磁盘空间使用](/ch4#disk-space-usage), [日志压缩](/ch12#sec_stream_log_compaction)
- 主题（消息传递）, [消息代理](/ch5#message-brokers), [传递事件流](/ch12#sec_stream_transmit)
- 撕裂页（B 树）, [使 B 树可靠](/ch4#sec_storage_btree_wal)
- 全序, [术语表](/glossary)
  - broadcast（见 shared logs）
  - 限制, [全序的限制](/ch13#id335)
  - 在逻辑时间戳上, [逻辑时钟](/ch10#sec_consistency_timestamps)
- 追踪, [分布式系统的问题](/ch1#sec_introduction_dist_sys_problems)
- 跟踪行为数据, [隐私与追踪](/ch14#id373)
  - （另见 隐私）
- 权衡, [数据系统架构中的权衡](/ch1#ch_tradeoffs)-[数据系统、法律与社会](/ch1#sec_introduction_compliance)
- transaction coordinator（见 协调者）
- transaction manager（见 协调者）
- 事务处理, [事务处理与分析的特征](/ch1#sec_introduction_oltp)-[事务处理与分析的特征](/ch1#sec_introduction_oltp)
  - 与分析的比较, [事务处理与分析的特征](/ch1#sec_introduction_oltp)
  - 与数据存储的比较, [分析型数据存储](/ch4#sec_storage_analytics)
- 事务, [事务](/ch8#ch_transactions)-[总结](/ch8#summary), [术语表](/glossary)
  - ACID properties of, [ACID 的含义](/ch8#sec_transactions_acid)
    - 原子性, [原子性](/ch8#sec_transactions_acid_atomicity)
    - 一致性, [一致性](/ch8#sec_transactions_acid_consistency)
    - 持久性, [使 B 树可靠](/ch4#sec_storage_btree_wal), [持久性](/ch8#durability)
    - 隔离性, [隔离性](/ch8#sec_transactions_acid_isolation)
  - 数据完整性, [及时性与完整性](/ch13#sec_future_integrity)
  - 复制, [复制延迟的解决方案](/ch6#id131)
  - compensating（见 compensating transactions）
  - 概念, [事务到底是什么？](/ch8#sec_transactions_overview)
  - 分布式事务, [分布式事务](/ch8#sec_transactions_distributed)-[再谈恰好一次消息处理](/ch8#exactly-once-message-processing-revisited)
    - 避开, [派生数据与分布式事务](/ch13#sec_future_derived_vs_transactions), [开展分拆工作](/ch13#sec_future_unbundling_favor), [强制约束](/ch13#sec_future_constraints)-[无协调数据系统](/ch13#id454)
    - 失败放大, [维护派生状态](/ch13#id446)
    - 已磨损的系统, [分片的利与弊](/ch7#sec_sharding_reasons)
    - 可疑/不确定状况, [协调器故障](/ch8#coordinator-failure), [存疑时持有锁](/ch8#holding-locks-while-in-doubt)
    - 两阶段提交, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)-[三阶段提交](/ch8#three-phase-commit)
    - 使用, [跨不同系统的分布式事务](/ch8#sec_transactions_xa)-[恰好一次消息处理](/ch8#sec_transactions_exactly_once)
    - XA 事务, [XA 事务](/ch8#xa-transactions)-[XA 事务的问题](/ch8#problems-with-xa-transactions)
  - OLTP versus analytics queries, [分析（Analytics）](/ch11#sec_batch_olap)
  - 目标, [事务](/ch8#ch_transactions)
  - 可串行化, [可串行化](/ch8#sec_transactions_serializability)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - 实际执行, [实际串行执行](/ch8#sec_transactions_serial)-[串行执行总结](/ch8#summary-of-serial-execution)
    - 悲观与乐观的并发控制, [悲观并发控制与乐观并发控制](/ch8#pessimistic-versus-optimistic-concurrency-control)
    - 可串行化快照隔离, [可串行化快照隔离（SSI）](/ch8#sec_transactions_ssi)-[可串行化快照隔离的性能](/ch8#performance-of-serializable-snapshot-isolation)
    - 两阶段锁定, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)-[索引范围锁](/ch8#sec_transactions_2pl_range)
  - 单对象和多对象, [单对象与多对象操作](/ch8#sec_transactions_multi_object)-[处理错误和中止](/ch8#handling-errors-and-aborts)
    - 处理错误和中止, [处理错误和中止](/ch8#handling-errors-and-aborts)
    - 多对象事务的需要, [多对象事务的需求](/ch8#sec_transactions_need)
    - 单对象写入, [单对象写入](/ch8#sec_transactions_single_object)
  - 快照隔离（见 snapshots）
  - 严格的序列性, [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition)
  - 薄弱的隔离水平, [弱隔离级别](/ch8#sec_transactions_isolation_levels)-[物化冲突](/ch8#materializing-conflicts)
    - 防止丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)-[冲突解决与复制](/ch8#conflict-resolution-and-replication)
    - 读已提交, [读已提交](/ch8#sec_transactions_read_committed)-[快照隔离与可重复读](/ch8#sec_transactions_snapshot_isolation)
- 曲线（图）, [属性图](/ch3#id56)
- 字典树（trie，数据结构）, [构建和合并 SSTable](/ch4#constructing-and-merging-sstables), [全文检索](/ch4#sec_storage_full_text)
  - as SSTable index, [SSTable 文件格式](/ch4#the-sstable-file-format)
- 触发器（数据库）, [传递事件流](/ch12#sec_stream_transmit)
- Trino（数据仓库）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 联邦数据库, [一切的元数据库](/ch13#id341)
  - 查询优化器, [查询语言](/ch11#sec_batch_query_lanauges)
  - 用于 ETL, [提取-转换-加载（ETL）](/ch11#sec_batch_etl_usage)
  - 工作流程示例, [工作流调度](/ch11#sec_batch_workflows)
- 三元组存储, [三元组存储与 SPARQL](/ch3#id59)-[SPARQL 查询语言](/ch3#the-sparql-query-language)
  - SPARQL 查询语言, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- 滚动窗口（流处理）, [窗口的类型](/ch12#id324)
  - （另见 windows）
  - 在微批次中, [微批次与存档点](/ch12#id329)
- Turbopuffer（向量搜索）, [设置新的副本](/ch6#sec_replication_new_replica)
- Turtle (RDF data format), [三元组存储与 SPARQL](/ch3#id59)
- Twitter（见 X (social network)）
- 两阶段提交, [两阶段提交（2PC）](/ch8#sec_transactions_2pc)-[协调器故障](/ch8#coordinator-failure), [术语表](/glossary)
  - 与两阶段锁定混淆, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)
  - 协调器故障, [协调器故障](/ch8#coordinator-failure)
  - 协调器恢复, [从协调器故障中恢复](/ch8#recovering-from-coordinator-failure)
  - 如何运作, [系统性的承诺](/ch8#a-system-of-promises)
  - 性能开销, [跨不同系统的分布式事务](/ch8#sec_transactions_xa)
  - problems with XA transactions, [XA 事务的问题](/ch8#problems-with-xa-transactions)
  - 持有锁定的事务, [存疑时持有锁](/ch8#holding-locks-while-in-doubt)
- 两阶段锁定, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)-[索引范围锁](/ch8#sec_transactions_2pl_range), [什么使系统具有线性一致性？](/ch10#sec_consistency_lin_definition), [术语表](/glossary)
  - 与两阶段提交混淆, [两阶段锁定（2PL）](/ch8#sec_transactions_2pl)
  - 增长和缩小阶段, [两阶段锁定的实现](/ch8#implementation-of-two-phase-locking)
  - 索引范围锁定, [索引范围锁](/ch8#sec_transactions_2pl_range)
  - 业绩, [两阶段锁定的性能](/ch8#performance-of-two-phase-locking)
- 类型检查,动态对静态, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)

### U

- UDP (User Datagram Protocol)
  - comparison to TCP, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 多广播, [直接从生产者传递给消费者](/ch12#id296)
- Ultima Online（游戏）, [分片](/ch7#ch_sharding)
- 无界数据集, [流处理](/ch12#ch_stream), [术语表](/glossary)
  - （另见 streams）
- 无界延迟, [术语表](/glossary)
  - 在网络中, [超时和无界延迟](/ch9#sec_distributed_queueing)
  - 进程暂停, [进程暂停](/ch9#sec_distributed_clocks_pauses)
- 分拆数据库, [分拆数据库](/ch13#sec_future_unbundling)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
  - 构建数据存储技术, [组合使用数据存储技术](/ch13#id447)-[分拆系统与集成系统](/ch13#id448)
    - 联邦制与拆分制, [一切的元数据库](/ch13#id341)
  - 围绕数据流设计应用程序, [围绕数据流设计应用](/ch13#sec_future_dataflow)-[流处理器和服务](/ch13#id345)
  - 观察派生状态, [观察派生数据状态](/ch13#sec_future_observing)-[多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
    - 物化视图和缓存, [物化视图和缓存](/ch13#id451)
    - 多分区数据处理, [多分区数据处理](/ch13#sec_future_unbundled_multi_shard)
    - 将状态变更推送给客户端, [将状态变更推送给客户端](/ch13#id348)
- uncertain (transaction status)（见 存疑）
- 联合类型（Avro 中）, [模式演化规则](/ch5#schema-evolution-rules)
- uniq（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis), [简单日志分析](/ch11#sec_batch_log_analysis), [分布式作业编排](/ch11#id278)
- 唯一性约束
  - 异步检查, [宽松地解释约束](/ch13#id362)
  - 需要共识, [唯一性约束需要达成共识](/ch13#id452)
  - 需要线性一致性, [约束与唯一性保证](/ch10#sec_consistency_uniqueness)
  - 基于日志消息传递中的唯一性, [基于日志消息传递中的唯一性](/ch13#sec_future_uniqueness_log)
- Unity Catalog（数据目录）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
- universally unique identifiers（见 UUIDs）
- unix 哲学
  - 比较关系数据库, [分拆数据库](/ch13#sec_future_unbundling), [一切的元数据库](/ch13#id341)
  - 与流处理的比较, [流处理](/ch12#sec_stream_processing)
- unix 管道, [简单日志分析](/ch11#sec_batch_log_analysis)
  - 与分布式批量处理相比, [工作流调度](/ch11#sec_batch_workflows)
- UPDATE statement (SQL), [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
- 更新
  - 防止丢失更新, [防止丢失更新](/ch8#sec_transactions_lost_update)-[冲突解决与复制](/ch8#conflict-resolution-and-replication)
    - 原子写入操作, [原子写操作](/ch8#atomic-write-operations)
    - 自动检测丢失的更新, [自动检测丢失的更新](/ch8#automatically-detecting-lost-updates)
    - 比较和设置, [条件写入（比较并设置）](/ch8#sec_transactions_compare_and_set)
    - 冲突解决和推广, [冲突解决与复制](/ch8#conflict-resolution-and-replication)
    - 使用明确的锁定, [显式锁定](/ch8#explicit-locking)
  - 防止写入skew, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts)
- 使用量
  - 批量过程调度, [资源分配](/ch11#id279)
  - 通过预设增加, [故障处理](/ch11#id281)
  - 与暂时取舍, [我们不能简单地使网络延迟可预测吗？](/ch9#can-we-not-simply-make-network-delays-predictable)
- uTP protocol (BitTorrent), [TCP 的局限性](/ch9#sec_distributed_tcp)
- UUIDs, [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)

### V

- 有效性（共识）, [单值共识](/ch10#single-value-consensus), [原子提交作为共识](/ch10#atomic-commitment-as-consensus)
- vBuckets（分片）, [分片](/ch7#ch_sharding)
- 矢量时钟, [版本向量](/ch6#version-vectors)
  - （另见 版本向量）
  - 和 Lamport/hybrid 逻辑钟, [Lamport/混合逻辑时钟 vs. 向量时钟](/ch10#lamporthybrid-logical-clocks-vs-vector-clocks)
  - 和版本向量, [版本向量](/ch6#version-vectors)
- 向量嵌入, [向量嵌入](/ch4#id92)
- 矢量处理, [查询执行：编译与向量化](/ch4#sec_storage_vectorized)
- 供应商锁定, [云服务的利弊](/ch1#sec_introduction_cloud_tradeoffs)
- Venice（数据库）, [对外提供派生数据](/ch11#sec_batch_serving_derived)
- 核查, [信任但验证](/ch13#sec_future_verification)-[用于可审计数据系统的工具](/ch13#id366)
  - 避免盲目信任, [不要盲目信任承诺](/ch13#id364)
  - 设计可审计性, [为可审计性而设计](/ch13#id365)
  - 端对端完整性检查, [端到端原则重现](/ch13#id456)
  - 可审计数据系统工具, [用于可审计数据系统的工具](/ch13#id366)
- 版本控制系统
  - 合并冲突, [手动冲突解决](/ch6#manual-conflict-resolution)
  - 依赖不可改变的数据, [并发控制](/ch12#sec_stream_concurrency)
- 版本向量, [不同拓扑的问题](/ch6#problems-with-different-topologies), [版本向量](/ch6#version-vectors)
  - 点数, [版本向量](/ch6#version-vectors)
  - 对向量时钟, [版本向量](/ch6#version-vectors)
- Vertica（数据库）, [云数据仓库](/ch4#sec_cloud_data_warehouses)
  - 处理写入, [写入列式存储](/ch4#writing-to-column-oriented-storage)
- vertical scaling（见 scaling up）
- 顶点（图）, [图数据模型](/ch3#sec_datamodels_graph)
  - 属性图模型, [属性图](/ch3#id56)
- 电子游戏, [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- 视频转码（例如）, [跨通道时序依赖](/ch10#cross-channel-timing-dependencies)
- views (SQL queries), [Datalog：递归关系查询](/ch3#id62)
  - materialized views（见 物化）
- 视图戳复制, [共识](/ch10#sec_consistency_consensus), [共识的实践](/ch10#sec_consistency_total_order)
  - 使用模型检查, [模型检查与规范语言](/ch9#model-checking-and-specification-languages)
  - 视图编号, [从单主复制到共识](/ch10#from-single-leader-replication-to-consensus)
- 虚拟块设备, [存储与计算的分离](/ch1#sec_introduction_storage_compute)
- 虚拟文件系统, [分布式文件系统](/ch11#sec_batch_dfs)
  - 比较分布式文件系统, [分布式文件系统](/ch11#sec_batch_dfs)
- 虚拟机, [云服务的分层](/ch1#layering-of-cloud-services)
  - 上下文开关, [进程暂停](/ch9#sec_distributed_clocks_pauses)
  - 网络性能, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 吵闹的邻居, [网络拥塞和排队](/ch9#network-congestion-and-queueing)
  - 虚拟时钟在, [时钟同步和准确性](/ch9#sec_distributed_clock_accuracy)
- 虚拟内存
  - 因页面错误造成的进程暂停, [延迟与响应时间](/ch2#id23), [进程暂停](/ch9#sec_distributed_clocks_pauses)
- Virtuoso（数据库）, [SPARQL 查询语言](/ch3#the-sparql-query-language)
- VisiCalc (spreadsheets), [围绕数据流设计应用](/ch13#sec_future_dataflow)
- Vitess（数据库）
  - 键范围分片, [按键的范围分片](/ch7#sec_sharding_key_range)
- vnodes（分片）, [分片](/ch7#ch_sharding)
- 词汇, [三元组存储与 SPARQL](/ch3#id59)
- Voice over IP (VoIP), [网络拥塞和排队](/ch9#network-congestion-and-queueing)
- VoltDB（数据库）
  - 跨分片可串行化, [分片](/ch8#sharding)
  - 确定性存储过程, [存储过程的利弊](/ch8#sec_transactions_stored_proc_tradeoffs)
  - 内存存储, [全内存存储](/ch4#sec_storage_inmemory)
  - 每核一个进程的模型, [分片的利与弊](/ch7#sec_sharding_reasons)
  - 二级索引, [本地二级索引](/ch7#id166)
  - 事务的串行执行, [实际串行执行](/ch8#sec_transactions_serial)
  - 基于语句的复制, [基于语句的复制](/ch6#statement-based-replication), [失败后重建状态](/ch12#sec_stream_state_fault_tolerance)
  - 流处理中的事务, [原子提交再现](/ch12#sec_stream_atomic_commit)

### W

- 预写式日志, [使 B 树可靠](/ch4#sec_storage_btree_wal)
- WAL-G (backup tool), [设置新的副本](/ch6#sec_replication_new_replica)
- WarpStream（消息系统）, [磁盘空间使用](/ch12#sec_stream_disk_usage)
- web services（见 services）
- webhooks, [直接从生产者传递给消费者](/ch12#id296)
- webMethods（消息传递）, [消息代理](/ch5#message-brokers)
- WebSocket (protocol), [将状态变更推送给客户端](/ch13#id348)
- 宽柱数据模型, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - 相对于面向列的存储, [列压缩](/ch4#sec_storage_column_compression)
- 窗口（流程处理）, [流分析](/ch12#id318), [时间推理](/ch12#sec_stream_time)-[窗口的类型](/ch12#id324)
  - 更改日志的无限窗口, [维护物化视图](/ch12#sec_stream_mat_view), [流表连接（流扩充）](/ch12#sec_stream_table_joins)
  - 知道所有事件何时到来, [处理滞留事件](/ch12#id323)
  - 串流在窗口内连接, [流流连接（窗口连接）](/ch12#id440)
  - 窗口类型, [窗口的类型](/ch12#id324)
- WITH RECURSIVE syntax (SQL), [SQL 中的图查询](/ch3#id58)
- Word2Vec (language model), [向量嵌入](/ch4#id92)
- 工作流程引擎, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
  - Airflow（见 Airflow（工作流调度器））
  - 批处理, [工作流调度](/ch11#sec_batch_workflows)
  - Camunda（见 Camunda (workflow engine)）
  - Dagster（见 Dagster（工作流调度器））
  - 持久执行, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
  - 提取-转换-加载（ETL）（见 ETL）
  - 执行器, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows)
  - 编排器, [持久化执行与工作流](/ch5#sec_encoding_dataflow_workflows), [批处理](/ch11#ch_batch)
  - Orkes（见 Orkes (workflow engine)）
  - Prefect（见 Prefect（工作流调度器））
  - 依赖决定性因素, [确定性模拟测试](/ch9#deterministic-simulation-testing)
  - Restate（见 Restate (workflow engine)）
  - Temporal（见 Temporal (workflow engine)）
- 工作设置, [排序与内存聚合](/ch11#id275)
- 写入放大, [写放大](/ch4#write-amplification)
- 写路径, [观察派生数据状态](/ch13#sec_future_observing)
- 写偏差, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[物化冲突](/ch8#materializing-conflicts)
  - 特性, [写偏差与幻读](/ch8#sec_transactions_write_skew)-[导致写偏差的幻读](/ch8#sec_transactions_phantom), [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)
  - 实例, [写偏差与幻读](/ch8#sec_transactions_write_skew), [写偏差的更多例子](/ch8#more-examples-of-write-skew)
  - 物化冲突, [物化冲突](/ch8#materializing-conflicts)
  - 实际发生情况, [维护完整性，尽管软件有Bug](/ch13#id455)
  - 幻读, [导致写偏差的幻读](/ch8#sec_transactions_phantom)
  - 预防
    - 在快照隔离中, [基于过时前提的决策](/ch8#decisions-based-on-an-outdated-premise)-[检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
    - 双相锁定, [谓词锁](/ch8#predicate-locks)-[索引范围锁](/ch8#sec_transactions_2pl_range)
    - 选项, [写偏差的特征](/ch8#characterizing-write-skew)
- 预写式日志, [使 B 树可靠](/ch4#sec_storage_btree_wal), [预写日志（WAL）传输](/ch6#write-ahead-log-wal-shipping)
  - 持久执行, [持久化执行](/ch5#durable-execution)
- 写入（数据库）
  - 原子写入操作, [原子写操作](/ch8#atomic-write-operations)
  - 检测影响前读的写入, [检测影响先前读取的写入](/ch8#sec_detecting_writes_affect_reads)
  - 防止脏写, [没有脏写](/ch8#sec_transactions_dirty_write)
- WS-\* framework, [远程过程调用（RPC）的问题](/ch5#sec_problems_with_rpc)
- WS-AtomicTransaction (2PC), [两阶段提交（2PC）](/ch8#sec_transactions_2pc)

### X

- X（社交网络）
  - 构建首页时间线（示例）, [案例研究：社交网络首页时间线](/ch2#sec_introduction_twitter), [从同一事件日志中派生多个视图](/ch12#sec_stream_deriving_views), [表表连接（维护物化视图）](/ch12#id326), [物化视图和缓存](/ch13#id451)
    - 连接的开销, [社交网络案例研究中的反规范化](/ch3#denormalization-in-the-social-networking-case-study)
    - 描述负载, [描述负载](/ch2#id33)
    - 容错, [容错](/ch2#id27)
    - 性能度量, [描述性能](/ch2#sec_introduction_percentiles)
  - DistributedLog (event log), [使用日志进行消息存储](/ch12#id300)
  - Snowflake (ID generator), [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
- XA 事务, [两阶段提交（2PC）](/ch8#sec_transactions_2pc), [XA 事务](/ch8#xa-transactions)-[XA 事务的问题](/ch8#problems-with-xa-transactions)
  - 启发式决策, [从协调器故障中恢复](/ch8#recovering-from-coordinator-failure)
  - 问题, [XA 事务的问题](/ch8#problems-with-xa-transactions)
- xargs（Unix 工具）, [简单日志分析](/ch11#sec_batch_log_analysis)
- XFS (file system), [分布式文件系统](/ch11#sec_batch_dfs)
- XGBoost (machine learning library), [机器学习](/ch11#id290)
- XML
  - 二进制变体, [二进制编码](/ch5#binary-encoding)
  - 数据位置, [读写的数据局部性](/ch3#sec_datamodels_document_locality)
  - encoding RDF data, [RDF 数据模型](/ch3#the-rdf-data-model)
  - 应用数据的问题, [JSON、XML 及其二进制变体](/ch5#sec_encoding_json)
  - 关系数据库, [文档模型中的模式灵活性](/ch3#sec_datamodels_schema_flexibility)
  - XML databases, [关系模型与文档模型](/ch3#sec_datamodels_history), [文档的查询语言](/ch3#query-languages-for-documents)
- Xorq（查询引擎）, [一切的元数据库](/ch13#id341)
- XPath, [文档的查询语言](/ch3#query-languages-for-documents)
- XQuery, [文档的查询语言](/ch3#query-languages-for-documents)

### Y

- Yahoo
  - 响应时间研究, [平均值、中位数与百分位点](/ch2#id24)
- YARN (job scheduler), [分布式作业编排](/ch11#id278), [应用代码和状态的分离](/ch13#id344)
  - ApplicationMaster, [分布式作业编排](/ch11#id278)
- Yjs (CRDT library), [同步引擎的利弊](/ch6#pros-and-cons-of-sync-engines)
- YugabyteDB（数据库）
  - 哈希分片, [按哈希范围分片](/ch7#sharding-by-hash-range)
  - 键范围分片, [按键的范围分片](/ch7#sec_sharding_key_range)
  - 多主复制, [跨地域运行](/ch6#sec_replication_multi_dc)
  - 请求路由, [请求路由](/ch7#sec_sharding_routing)
  - 分片二级索引, [全局二级索引](/ch7#id167)
  - tablet（分片）, [分片](/ch7#ch_sharding)
  - 事务, [事务到底是什么？](/ch8#sec_transactions_overview), [数据库内部的分布式事务](/ch8#sec_transactions_internal)
  - 使用时钟同步, [用于全局快照的同步时钟](/ch9#sec_distributed_spanner)

### Z

- Zab（协商一致算法）, [共识](/ch10#sec_consistency_consensus), [共识的实践](/ch10#sec_consistency_total_order)
  - use in ZooKeeper, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
- 零拷贝, [编码数据的格式](/ch5#sec_encoding_formats)
- zero-disk architecture (ZDA), [设置新的副本](/ch6#sec_replication_new_replica)
- ZeroMQ (messaging library), [直接从生产者传递给消费者](/ch12#id296)
- 僵尸（脑裂）, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens)
- zones (cloud computing)（见 availability zones）
- ZooKeeper (coordination service), [协调服务](/ch10#sec_consistency_coordination)-[服务发现](/ch10#service-discovery)
  - 生成防护令牌（fencing token）, [隔离僵尸进程和延迟请求](/ch9#sec_distributed_fencing_tokens), [使用共享日志](/ch10#sec_consistency_smr), [协调服务](/ch10#sec_consistency_coordination)
  - 线性一致的操作, [实现线性一致性系统](/ch10#sec_consistency_implementing_linearizable)
  - 锁与领导者选举, [锁定与领导者选举](/ch10#locking-and-leader-election)
  - 观察者, [服务发现](/ch10#service-discovery)
  - 用于服务发现, [负载均衡器、服务发现和服务网格](/ch5#sec_encoding_service_discovery), [服务发现](/ch10#service-discovery)
  - 用于分片分配, [请求路由](/ch7#sec_sharding_routing)
  - 使用 Zab 算法, [共识](/ch10#sec_consistency_consensus)


================================================
FILE: content/zh/part-i.md
================================================
---
title: 第一部分：数据系统基础
weight: 100
breadcrumbs: false
---

{{< callout type="warning" >}}
当前页面来自本书第一版，第二版尚不可用
{{< /callout >}}

本书前五章介绍了数据系统底层的基础概念，无论是在单台机器上运行的单点数据系统，还是分布在多台机器上的分布式数据系统都适用。

1. [第一章](/ch1) 将介绍 **数据系统架构中的利弊权衡**。我们将讨论不同类型的数据系统（例如，分析型与事务型），以及它们在云环境中的运行方式。
2. [第二章](/ch2) 将介绍非功能性需求的定义。**可靠性，可伸缩性和可维护性** ，这些词汇到底意味着什么？如何实现这些目标？
3. [第三章](/ch3) 将对几种不同的 **数据模型和查询语言** 进行比较。从程序员的角度看，这是数据库之间最明显的区别。不同的数据模型适用于不同的应用场景。
4. [第四章](/ch4) 将深入 **存储引擎** 内部，研究数据库如何在磁盘上摆放数据。不同的存储引擎针对不同的负载进行优化，选择合适的存储引擎对系统性能有巨大影响。
5. [第五章](/ch5) 将对几种不同的 **数据编码** 进行比较。特别研究了这些格式在应用需求经常变化、模式需要随时间演变的环境中表现如何。

[第二部分](/part-ii) 将专门讨论在 **分布式数据系统** 中特有的问题。


## [1. 数据系统架构中的权衡](/ch1)
- [分析型与事务型系统](/ch1#sec_introduction_analytics)
- [云服务与自托管](/ch1#sec_introduction_cloud)
- [分布式与单节点系统](/ch1#sec_introduction_distributed)
- [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- [总结](/ch1#summary)

## [2. 定义非功能性需求](/ch2)
- [案例研究：社交网络首页时间线](/ch2#sec_introduction_twitter)
- [描述性能](/ch2#sec_introduction_percentiles)
- [可靠性与容错](/ch2#sec_introduction_reliability)
- [可伸缩性](/ch2#sec_introduction_scalability)
- [可运维性](/ch2#sec_introduction_maintainability)
- [总结](/ch2#summary)

## [3. 数据模型与查询语言](/ch3)
- [关系模型与文档模型](/ch3#sec_datamodels_history)
- [图数据模型](/ch3#sec_datamodels_graph)
- [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- [总结](/ch3#summary)

## [4. 存储与检索](/ch4)
- [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)
- [分析型数据存储](/ch4#sec_storage_analytics)
- [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- [总结](/ch4#summary)

## [5. 编码与演化](/ch5)
- [编码数据的格式](/ch5#sec_encoding_formats)
- [数据流的模式](/ch5#sec_encoding_dataflow)
- [总结](/ch5#summary)


================================================
FILE: content/zh/part-ii.md
================================================
---
title: 第二部分：分布式数据
weight: 200
breadcrumbs: false
---

{{< callout type="warning" >}}
当前页面来自本书第一版，第二版尚不可用
{{< /callout >}}

> 一个成功的技术，现实的优先级必须高于公关，你可以糊弄别人，但糊弄不了自然规律。
>
> —— 罗杰斯委员会报告（1986）
>

-------

在本书的 [第一部分](/part-i) 中，我们讨论了数据系统的各个方面，但仅限于数据存储在单台机器上的情况。
现在我们到了 [第二部分](/part-ii)，进入更高的层次，并提出一个问题：如果 **多台机器** 参与数据的存储和检索，会发生什么？

你可能会出于各种各样的原因，希望将数据库分布到多台机器上：

可伸缩性
: 如果你的数据量、读取负载、写入负载超出单台机器的处理能力，可以将负载分散到多台计算机上。

容错 / 高可用性
: 如果你的应用需要在单台机器（或多台机器，网络或整个数据中心）出现故障的情况下仍然能继续工作，则可使用多台机器，以提供冗余。一台故障时，另一台可以接管。

延迟
: 如果在世界各地都有用户，你也许会考虑在全球范围部署多个服务器，从而每个用户可以从地理上最近的数据中心获取服务，避免了等待网络数据包穿越半个世界。

## 伸缩至更高的负载

如果你需要的只是伸缩至更高的 **负载（load）**，最简单的方法就是购买更强大的机器（有时称为 **垂直伸缩**，即 vertical scaling，或 **向上伸缩**，即 scale up）。许多处理器，内存和磁盘可以在同一个操作系统下相互连接，快速的相互连接允许任意处理器访问内存或磁盘的任意部分。在这种 **共享内存架构（shared-memory architecture）** 中，所有的组件都可以看作一台单独的机器。

> [!NOTE]
> 在大型机中，尽管任意处理器都可以访问内存的任意部分，但总有一些内存区域与一些处理器更接近（称为 **非均匀内存访问（nonuniform memory access, NUMA）** [^1]）。为了有效利用这种架构特性，需要对处理进行细分，以便每个处理器主要访问临近的内存，这意味着即使表面上看起来只有一台机器在运行，**分区（partitioning）** 仍然是必要的。

共享内存方法的问题在于，成本增长速度快于线性增长：一台有着双倍处理器数量，双倍内存大小，双倍磁盘容量的机器，通常成本会远远超过原来的两倍。而且可能因为存在瓶颈，并不足以处理双倍的载荷。

共享内存架构可以提供有限的容错能力，高端机器可以使用热插拔的组件（不关机更换磁盘，内存模块，甚至处理器）—— 但它必然囿于单个地理位置的桎梏。

另一种方法是 **共享磁盘架构（shared-disk architecture）**，它使用多台具有独立处理器和内存的机器，但将数据存储在机器之间共享的磁盘阵列上，这些磁盘通过快速网络连接。这种架构用于某些数据仓库，但竞争和锁定的开销限制了共享磁盘方法的可伸缩性 [^2]。

> [!NOTE]
> 这类共享磁盘通常以网络附属存储（Network Attached Storage, NAS）或 **存储区网络（Storage Area Network, SAN）** 的形式提供。

### 无共享架构

相比之下，**无共享架构** [^3]（shared-nothing architecture，有时被称为 **水平伸缩**，即 horizontal scaling，或 **向外伸缩**，即 scaling out）已经相当普及。
在这种架构中，运行数据库软件的每台机器 / 虚拟机都称为 **节点（node）**。每个节点只使用各自的处理器，内存和磁盘。节点之间的任何协调，都是在软件层面使用传统网络实现的。

无共享系统不需要使用特殊的硬件，所以你可以用任意机器 —— 比如性价比最好的机器。你也许可以跨多个地理区域分布数据从而减少用户延迟，或者在损失一整个数据中心的情况下幸免于难。
随着云端虚拟机部署的出现，即使是小公司，现在无需 Google 级别的运维，也可以实现异地分布式架构。

在这一部分里，我们将重点放在无共享架构上。它不见得是所有场景的最佳选择，但它是最需要你谨慎从事的架构。
如果你的数据分布在多个节点上，你需要意识到这样一个分布式系统中约束和权衡 —— 数据库并不能魔术般地把这些东西隐藏起来。

虽然分布式无共享架构有许多优点，但它通常也会给应用带来额外的复杂度，有时也会限制你可用数据模型的表达力。
在某些情况下，一个简单的单线程程序可以比一个拥有超过 100 个 CPU 核的集群表现得更好 [^4]。另一方面，无共享系统可以非常强大。接下来的几章，将详细讨论分布式数据会带来的问题。


### 复制 vs 分区

数据分布在多个节点上有两种常见的方式：

复制（Replication）
: 在几个不同的节点上保存数据的相同副本，可能放在不同的位置。复制提供了冗余：如果一些节点不可用，剩余的节点仍然可以提供数据服务。复制也有助于改善性能。[第六章](/ch6) 将讨论复制。

分区 (Partitioning)
: 将一个大型数据库拆分成较小的子集（称为 **分区**，即 partitions），从而不同的分区可以指派给不同的 **节点**（nodes，亦称 **分片**，即 sharding）。[第七章](/ch7) 将讨论分区。

复制和分区是不同的机制，但它们经常同时使用。如 [图 II-1](#fig_replication_partitioning) 所示。

{{< figure src="/v1/ddia_part-ii_01.png" id="fig_replication_partitioning" caption="图 II-1 一个数据库切分为两个分区，每个分区都有两个副本" class="w-full my-4" >}}


理解了这些概念，就可以开始讨论在分布式系统中需要做出的困难抉择。[第八章](/ch8) 将讨论 **事务（Transaction）**，这对于了解数据系统中可能出现的各种问题，以及我们可以做些什么很有帮助。
[第九章](/ch9) 和 [第十章](/ch10) 将讨论分布式系统的根本局限性。

在本书的 [第三部分](/part-iii) 中，将讨论如何将多个（可能是分布式的）数据存储集成为一个更大的系统，以满足复杂的应用需求。但首先，我们来聊聊分布式的数据。


## [6. 复制](/ch6)
- [单主复制](/ch6#sec_replication_leader)
- [复制延迟的问题](/ch6#sec_replication_lag)
- [多主复制](/ch6#sec_replication_multi_leader)
- [无主复制](/ch6#sec_replication_leaderless)
- [总结](/ch6#summary)

## [7. 分片](/ch7)
- [分片的利与弊](/ch7#sec_sharding_reasons)
- [键值数据的分片](/ch7#sec_sharding_key_value)
- [请求路由](/ch7#sec_sharding_routing)
- [分片与二级索引](/ch7#sec_sharding_secondary_indexes)
- [总结](/ch7#summary)

## [8. 事务](/ch8)
- [事务到底是什么？](/ch8#sec_transactions_overview)
- [弱隔离级别](/ch8#sec_transactions_isolation_levels)
- [可串行化](/ch8#sec_transactions_serializability)
- [分布式事务](/ch8#sec_transactions_distributed)
- [总结](/ch8#summary)
- [参考](/ch8#参考)

## [9. 分布式系统的麻烦](/ch9)
- [故障与部分失效](/ch9#sec_distributed_partial_failure)
- [不可靠的网络](/ch9#sec_distributed_networks)
- [不可靠的时钟](/ch9#sec_distributed_clocks)
- [知识、真相和谎言](/ch9#sec_distributed_truth)
- [总结](/ch9#summary)

## [10. 一致性与共识](/ch10)
- [线性一致性](/ch10#sec_consistency_linearizability)
- [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
- [共识](/ch10#sec_consistency_consensus)
- [总结](/ch10#summary)


### 参考

[^1]: Ulrich Drepper: “[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf),” akka‐dia.org, November 21, 2007.
[^2]: Ben Stopford: “[Shared Nothing vs. Shared Disk Architectures: An Independent View](http://www.benstopford.com/2009/11/24/understanding-the-shared-nothing-architecture/),” benstopford.com, November 24, 2009.
[^3]: Michael Stonebraker: “[The Case for Shared Nothing](http://db.cs.berkeley.edu/papers/hpts85-nothing.pdf),” IEEE Database Engineering Bulletin, volume 9, number 1, pages 4–9, March 1986.
[^4]: Frank McSherry, Michael Isard, and Derek G. Murray: “[Scalability! But at What COST?](http://www.frankmcsherry.org/assets/COST.pdf),” at 15th USENIX Workshop on Hot Topics in Operating Systems (HotOS), May 2015.


================================================
FILE: content/zh/part-iii.md
================================================
---
title: 第三部分：派生数据
weight: 300
breadcrumbs: false
---

{{< callout type="warning" >}}
当前页面来自本书第一版，第二版尚不可用
{{< /callout >}}

在本书的 [第一部分](/part-i) 和 [第二部分](/part-ii) 中，我们自底向上地把所有关于分布式数据库的主要考量都过了一遍。从数据在磁盘上的布局，一直到出现故障时分布式系统一致性的局限。但所有的讨论都假定了应用中只用了一种数据库。

现实世界中的数据系统往往更为复杂。大型应用程序经常需要以多种方式访问和处理数据，没有一个数据库可以同时满足所有这些不同的需求。因此应用程序通常组合使用多种组件：数据存储、索引、缓存、分析系统等等，并实现在这些组件中移动数据的机制。

本书的最后一部分，会研究将多个不同数据系统（可能有着不同数据模型，并针对不同的访问模式进行优化）集成为一个协调一致的应用架构时，会遇到的问题。软件供应商经常会忽略这一方面的生态建设，并声称他们的产品能够满足你的所有需求。在现实世界中，集成不同的系统是实际应用中最重要的事情之一。

## 记录系统和派生数据系统

从高层次上看，存储和处理数据的系统可以分为两大类：

权威记录系统（System of record）
: **记录系统**，也被称为 **真相源（source of truth）**，持有数据的权威版本。当新的数据进入时（例如，用户输入）首先会记录在这里。
 每个事实正正好好表示一次（表示通常是 **规范化的**，即 normalized）。如果其他系统和 **记录系统** 之间存在任何差异，那么记录系统中的值是正确的（根据定义）。

派生数据系统（Derived data systems）
: **派生系统** 中的数据，通常是另一个系统中的现有数据以某种方式进行转换或处理的结果。如果丢失派生数据，可以从原始来源重新创建。
 典型的例子是 **缓存（cache）**：如果数据在缓存中，就可以由缓存提供服务；如果缓存不包含所需数据，则降级由底层数据库提供。非规范化的值，索引和物化视图亦属此类。在推荐系统中，预测汇总数据通常派生自用户日志。

从技术上讲，派生数据是 **冗余的（redundant）**，因为它重复了已有的信息。但是派生数据对于获得良好的只读查询性能通常是至关重要的。它通常是非规范化的。可以从单个源头派生出多个不同的数据集，使你能从不同的 “视角” 洞察数据。

并不是所有的系统都在其架构中明确区分 **记录系统** 和 **派生数据系统**，但是这是一种有用的区分方式，因为它明确了系统中的数据流：系统的哪一部分具有哪些输入和哪些输出，以及它们如何相互依赖。

大多数数据库，存储引擎和查询语言，本质上既不是记录系统也不是派生系统。数据库只是一个工具：如何使用它取决于你自己。**记录系统和派生数据系统之间的区别不在于工具，而在于应用程序中的使用方式。**

通过梳理数据的派生关系，可以清楚地理解一个令人困惑的系统架构。这将贯穿本书的这一部分。

## 章节概述

我们将从 [第十一章](/ch11) 开始，研究例如 MapReduce 这样 **面向批处理（batch-oriented）** 的数据流系统。对于建设大规模数据系统，我们将看到，它们提供了优秀的工具和思想。
[第十二章](/ch12) 将把这些思想应用到 **流式数据（data streams）** 中，使我们能用更低的延迟完成同样的任务。[第十三章](/ch13) 将探讨如何使用这些工具来构建可靠、可伸缩和可维护的应用。[第十四章](/ch14) 将以伦理、隐私与社会影响为主题，为全书收束。


## 索引

## [11. 批处理](/ch11)
- [使用 Unix 工具的批处理](/ch11#sec_batch_unix)
- [分布式系统中的批处理](/ch11#sec_batch_distributed)
- [批处理模型](/ch11#id431)
- [批处理用例](/ch11#sec_batch_output)
- [本章小结](/ch11#id292)
- [参考文献](/ch11#references)

## [12. 流处理](/ch12)
- [传递事件流](/ch12#sec_stream_transmit)
- [数据库与流](/ch12#sec_stream_databases)
- [流处理](/ch12#sec_stream_processing)
- [本章小结](/ch12#id332)
- [参考文献](/ch12#references)

## [13. 流式系统的哲学](/ch13)
- [数据集成](/ch13#sec_future_integration)
- [分拆数据库](/ch13#sec_future_unbundling)
- [追求正确性](/ch13#sec_future_correctness)
- [本章小结](/ch13#id367)
- [参考文献](/ch13#references)

## [14. 将事情做正确](/ch14)
- [预测分析](/ch14#id369)
- [隐私与追踪](/ch14#id373)
- [总结](/ch14#id594)
- [参考文献](/ch14#references)


================================================
FILE: content/zh/preface.md
================================================
---
title: 序言
weight: 50
breadcrumbs: false
---

{{< callout type="warning" >}}
当前页面来自本书第一版，第二版尚不可用
{{< /callout >}}

如果近几年从业于软件工程，特别是服务器端和后端系统开发，那么你很有可能已经被大量关于数据存储和处理的时髦词汇轰炸过了： NoSQL！大数据！Web-Scale！分片！最终一致性！ACID！CAP 定理！云服务！MapReduce！实时！

在最近十年中，我们看到了很多有趣的进展，关于数据库，分布式系统，以及在此基础上构建应用程序的方式。这些进展有着各种各样的驱动力：

* 谷歌、雅虎、亚马逊、脸书、领英、微软和推特等互联网公司正在和巨大的流量 / 数据打交道，这迫使他们去创造能有效应对如此规模的新工具。
* 企业需要变得敏捷，需要低成本地检验假设，需要通过缩短开发周期和保持数据模型的灵活性，快速地响应新的市场洞察。
* 免费和开源软件变得非常成功，在许多环境中比商业软件和定制软件更受欢迎。
* 处理器主频几乎没有增长，但是多核处理器已经成为标配，网络也越来越快。这意味着并行化程度只增不减。
* 即使你在一个小团队中工作，现在也可以构建分布在多台计算机甚至多个地理区域的系统，这要归功于譬如亚马逊网络服务（AWS）等基础设施即服务（IaaS）概念的践行者。
* 许多服务都要求高可用，因停电或维护导致的服务不可用，变得越来越难以接受。

**数据密集型应用（data-intensive applications）** 正在通过使用这些技术进步来推动可能性的边界。一个应用被称为 **数据密集型** 的，如果 **数据是其主要挑战**（数据量，数据复杂度或数据变化速度）—— 与之相对的是 **计算密集型**，即处理器速度是其瓶颈。

帮助数据密集型应用存储和处理数据的工具与技术，正迅速地适应这些变化。新型数据库系统（“NoSQL”）已经备受关注，而消息队列，缓存，搜索索引，批处理和流处理框架以及相关技术也非常重要。很多应用组合使用这些工具与技术。

这些生意盎然的时髦词汇体现出人们对新的可能性的热情，这是一件好事。但是作为软件工程师和架构师，如果要开发优秀的应用，我们还需要对各种层出不穷的技术及其利弊权衡有精准的技术理解。为了获得这种洞察，我们需要深挖时髦词汇背后的内容。

幸运的是，在技术迅速变化的背后总是存在一些持续成立的原则，无论你使用了特定工具的哪个版本。如果你理解了这些原则，就可以领会这些工具的适用场景，如何充分利用它们，以及如何避免其中的陷阱。这正是本书的初衷。

本书的目标是帮助你在飞速变化的数据处理和数据存储技术大观园中找到方向。本书并不是某个特定工具的教程，也不是一本充满枯燥理论的教科书。相反，我们将看到一些成功数据系统的样例：许多流行应用每天都要在生产中满足可伸缩性、性能、以及可靠性的要求，而这些技术构成了这些应用的基础。

我们将深入这些系统的内部，理清它们的关键算法，讨论背后的原则和它们必须做出的权衡。在这个过程中，我们将尝试寻找 **思考** 数据系统的有效方式 —— 不仅关于它们 **如何** 工作，还包括它们 **为什么** 以这种方式工作，以及哪些问题是我们需要问的。

阅读本书后，你能很好地决定哪种技术适合哪种用途，并了解如何将工具组合起来，为一个良好应用架构奠定基础。本书并不足以使你从头开始构建自己的数据库存储引擎，不过幸运的是这基本上很少有必要。你将获得对系统底层发生事情的敏锐直觉，这样你就有能力推理它们的行为，做出优秀的设计决策，并追踪任何可能出现的问题。


## 本书的目标读者

如果你开发的应用具有用于存储或处理数据的某种服务器 / 后端系统，而且使用网络（例如，Web 应用、移动应用或连接到互联网的传感器），那么本书就是为你准备的。

本书是为软件工程师，软件架构师，以及喜欢写代码的技术经理准备的。如果你需要对所从事系统的架构做出决策 —— 例如你需要选择解决某个特定问题的工具，并找出如何最好地使用这些工具，那么这本书对你尤有价值。但即使你无法选择你的工具，本书仍将帮助你更好地了解所使用工具的长处和短处。

你应当具有一些开发 Web 应用或网络服务的经验，且应当熟悉关系型数据库和 SQL。任何你了解的非关系型数据库和其他与数据相关工具都会有所帮助，但不是必需的。对常见网络协议如 TCP 和 HTTP 的大概理解是有帮助的。编程语言或框架的选择对阅读本书没有任何不同影响。

如果以下任意一条对你为真，你会发现这本书很有价值：

* 你想了解如何使数据系统可伸缩，例如，支持拥有数百万用户的 Web 或移动应用。
* 你需要提高应用程序的可用性（最大限度地减少停机时间），保持稳定运行。
* 你正在寻找使系统在长期运行过程易于维护的方法，即使系统规模增长，需求与技术也发生变化。
* 你对事物的运作方式有着天然的好奇心，并且希望知道一些主流网站和在线服务背后发生的事情。这本书剖析了各种数据库和数据处理系统的内幕，探索这些系统设计中的智慧是非常有趣的。

有时在讨论可伸缩的数据系统时，人们会说：“你又不在谷歌或亚马逊，别操心可伸缩性了，直接上关系型数据库”。这个陈述有一定的道理：为了不必要的伸缩性而设计程序，不仅会浪费不必要的精力，并且可能会把你锁死在一个不灵活的设计中。实际上这是一种 “过早优化” 的形式。不过，选择合适的工具确实很重要，而不同的技术各有优缺点。我们将看到，关系数据库虽然很重要，但绝不是数据处理的终章。


## 本书涉及的领域

本书并不会尝试告诉读者如何安装或使用特定的软件包或 API，因为已经有大量文档给出了详细的使用说明。相反，我们会讨论数据系统的基础 —— 各种原则与利弊权衡，并探讨了不同产品所做出的不同设计决策。

在电子书中包含了在线资源全文的链接。所有链接在出版时都进行了验证，但不幸的是，由于网络的自然规律，链接往往会频繁地破损。如果你遇到链接断开的情况，或者正在阅读本书的打印副本，可以使用搜索引擎查找参考文献。对于学术论文，你可以在 Google 学术中搜索标题，查找可以公开获取的 PDF 文件。或者，你也可以在 https://github.com/ept/ddia-references 中找到所有的参考资料，我们在那儿维护最新的链接。

我们主要关注的是数据系统的 **架构（architecture）**，以及它们被集成到数据密集型应用中的方式。本书没有足够的空间覆盖部署、运维、安全、管理等领域 —— 这些都是复杂而重要的主题，仅仅在本书中用粗略的注解讨论这些对它们很不公平。每个领域都值得用单独的书去讲。

本书中描述的许多技术都被涵盖在 **大数据（Big Data）** 这个时髦词的范畴中。然而 “大数据” 这个术语被滥用，缺乏明确定义，以至于在严肃的工程讨论中没有用处。这本书使用歧义更小的术语，如 “单节点” 之于 “分布式系统”，或 “在线 / 交互式系统” 之于 “离线 / 批处理系统”。

本书对 **自由和开源软件（FOSS）** 有一定偏好，因为阅读、修改和执行源码是了解某事物详细工作原理的好方法。开放的平台也可以降低供应商垄断的风险。然而在适当的情况下，我们也会讨论专有软件（闭源软件，软件即服务 SaaS，或一些在文献中描述过但未公开发行的公司内部软件）。

## 本书纲要

本书分为三部分：

1. 在 [第一部分](/v1/part-i) 中，我们会讨论设计数据密集型应用所赖的基本思想。我们从 [第一章](/v1/ch1) 开始，讨论我们实际要达到的目标：可靠性、可伸缩性和可维护性；我们该如何思考这些概念；以及如何实现它们。在 [第二章](/v1/ch2) 中，我们比较了几种不同的数据模型和查询语言，看看它们如何适用于不同的场景。在 [第三章](/v1/ch3) 中将讨论存储引擎：数据库如何在磁盘上摆放数据，以便能高效地再次找到它。[第四章](/v1/ch4) 转向数据编码（序列化），以及随时间演化的模式。

2. 在 [第二部分](/v1/part-ii) 中，我们从讨论存储在一台机器上的数据转向讨论分布在多台机器上的数据。这对于可伸缩性通常是必需的，但带来了各种独特的挑战。我们首先讨论复制（[第五章](/v1/ch5)）、分区 / 分片（[第六章](/v1/ch6)）和事务（[第七章](/v1/ch7)）。然后我们将探索关于分布式系统问题的更多细节（[第八章](/v1/ch8)），以及在分布式系统中实现一致性与共识意味着什么（[第九章](/v1/ch9)）。

3. 在 [第三部分](/v1/part-iii) 中，我们讨论那些从其他数据集派生出一些数据集的系统。派生数据经常出现在异构系统中：当没有单个数据库可以把所有事情都做得很好时，应用需要集成几种不同的数据库、缓存、索引等。在 [第十章](/v1/ch10) 中我们将从一种派生数据的批处理方法开始，然后在此基础上建立在 [第十一章](/v1/ch11) 中讨论的流处理。最后，在 [第十二章](/v1/ch12) 中，我们将所有内容汇总，讨论在将来构建可靠、可伸缩和可维护的应用程序的方法。


## 参考文献与延伸阅读

本书中讨论的大部分内容已经在其它地方以某种形式出现过了 —— 会议演示文稿、研究论文、博客文章、代码、BUG 跟踪器、邮件列表以及工程习惯中。本书总结了不同来源资料中最重要的想法，并在文本中包含了指向原始文献的链接。如果你想更深入地探索一个领域，那么每章末尾的参考文献都是很好的资源，其中大部分可以免费在线获取。


## O’Reilly Safari

[Safari](http://oreilly.com/safari) (formerly Safari Books Online) is a membership-based training and reference platform for enterprise, government, educators, and individuals.

Members have access to thousands of books, training videos, Learning Paths, interactive tutorials, and curated playlists from over 250 publishers, including O’Reilly Media, Harvard Business Review, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Adobe, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others.

For more information, please visit http://oreilly.com/safari.


## 联系我们

有关本书的评论和问题，请联系出版社：

O’Reilly Media, Inc.  
1005 Gravenstein Highway North  
Sebastopol, CA 95472  
800-998-9938（美国或加拿大）  
707-829-0515（国际或本地）  
707-829-0104（传真）

我们为本书提供了网页，会在上面列出勘误、示例以及任何补充信息。你可以访问：*http://bit.ly/designing-data-intensive-apps*。

如需发表评论或提出技术问题，请发送邮件至：*bookquestions@oreilly.com*。

有关 O’Reilly 图书、课程、会议和新闻的更多信息，请访问：*http://www.oreilly.com*。

* Facebook: [http://facebook.com/oreilly](http://facebook.com/oreilly)
* Twitter: [http://twitter.com/oreillymedia](http://twitter.com/oreillymedia)
* YouTube: [http://www.youtube.com/oreillymedia](http://www.youtube.com/oreillymedia)


## 致谢

本书融合了学术研究和工业实践的经验，融合并系统化了大量其他人的想法与知识。在计算领域，我们往往会被各种新鲜花样所吸引，但我认为前人完成的工作中，有太多值得我们学习的地方了。本书有 800 多处引用：文章、博客、讲座、文档等，对我来说这些都是宝贵的学习资源。我非常感谢这些材料的作者分享他们的知识。

我也从与人交流中学到了很多东西，很多人花费了宝贵的时间与我讨论想法并耐心解释。特别感谢 Joe Adler, Ross Anderson, Peter Bailis, Márton Balassi, Alastair Beresford, Mark Callaghan, Mat Clayton, Patrick Collison, Sean Cribbs, Shirshanka Das, Niklas Ekström, Stephan Ewen, Alan Fekete, Gyula Fóra, Camille Fournier, Andres Freund, John Garbutt, Seth Gilbert, Tom Haggett, Pat Helland, Joe Hellerstein, Jakob Homan, Heidi Howard, John Hugg, Julian Hyde, Conrad Irwin, Evan Jones, Flavio Junqueira, Jessica Kerr, Kyle Kingsbury, Jay Kreps, Carl Lerche, Nicolas Liochon, Steve Loughran, Lee Mallabone, Nathan Marz, Caitie McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede, Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann, Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, 以及 Brett Wooldridge。

更多人通过审阅草稿并提供反馈意见在本书的创作过程中做出了无价的贡献。我要特别感谢 Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj, David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard, Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski, Phil Potter, Hamid Ramazani, Sam Stokes, 以及 Ben Summers。当然对于本书中的任何遗留错误或难以接受的见解，我都承担全部责任。

为了帮助这本书落地，并且耐心地处理我缓慢的写作和不寻常的要求，我要对编辑 Marie Beaugureau，Mike Loukides，Ann Spencer 和 O'Reilly 的所有团队表示感谢。我要感谢 Rachel Head 帮我找到了合适的术语。我要感谢 Alastair Beresford，Susan Goodhue，Neha Narkhede 和 Kevin Scott，在其他工作事务之外给了我充分的创作时间和自由。

特别感谢 Shabbir Diwan 和 Edie Freedman，他们非常用心地为各章配了地图。他们提出了不落俗套的灵感，创作了这些地图，美丽而引人入胜，真是太棒了。

最后我要表达对家人和朋友们的爱，没有他们，我将无法走完这个将近四年的写作历程。你们是最棒的。


================================================
FILE: content/zh/toc.md
================================================
---
title: "目录"
linkTitle: "目录"
weight: 10
breadcrumbs: false
---


![](/title.jpg)


## [序言](/preface)
- [本书的目标读者](/preface#本书的目标读者)
- [本书涉及的领域](/preface#本书涉及的领域)
- [本书纲要](/preface#本书纲要)
- [参考文献与延伸阅读](/preface#参考文献与延伸阅读)
- [O’Reilly Safari](/preface#oreilly-safari)
- [致谢](/preface#致谢)

## [1. 数据系统架构中的权衡](/ch1)
- [分析型与事务型系统](/ch1#sec_introduction_analytics)
- [云服务与自托管](/ch1#sec_introduction_cloud)
- [分布式与单节点系统](/ch1#sec_introduction_distributed)
- [数据系统、法律与社会](/ch1#sec_introduction_compliance)
- [总结](/ch1#summary)

## [2. 定义非功能性需求](/ch2)
- [案例研究：社交网络首页时间线](/ch2#sec_introduction_twitter)
- [描述性能](/ch2#sec_introduction_percentiles)
- [可靠性与容错](/ch2#sec_introduction_reliability)
- [可伸缩性](/ch2#sec_introduction_scalability)
- [可运维性](/ch2#sec_introduction_maintainability)
- [总结](/ch2#summary)

## [3. 数据模型与查询语言](/ch3)
- [关系模型与文档模型](/ch3#sec_datamodels_history)
- [图数据模型](/ch3#sec_datamodels_graph)
- [事件溯源与 CQRS](/ch3#sec_datamodels_events)
- [数据框、矩阵与数组](/ch3#sec_datamodels_dataframes)
- [总结](/ch3#summary)

## [4. 存储与检索](/ch4)
- [OLTP 系统的存储与索引](/ch4#sec_storage_oltp)
- [分析型数据存储](/ch4#sec_storage_analytics)
- [多维索引与全文索引](/ch4#sec_storage_multidimensional)
- [总结](/ch4#summary)

## [5. 编码与演化](/ch5)
- [编码数据的格式](/ch5#sec_encoding_formats)
- [数据流的模式](/ch5#sec_encoding_dataflow)
- [总结](/ch5#summary)

## [6. 复制](/ch6)
- [单主复制](/ch6#sec_replication_leader)
- [复制延迟的问题](/ch6#sec_replication_lag)
- [多主复制](/ch6#sec_replication_multi_leader)
- [无主复制](/ch6#sec_replication_leaderless)
- [总结](/ch6#summary)

## [7. 分片](/ch7)
- [分片的利与弊](/ch7#sec_sharding_reasons)
- [键值数据的分片](/ch7#sec_sharding_key_value)
- [请求路由](/ch7#sec_sharding_routing)
- [分片与二级索引](/ch7#sec_sharding_secondary_indexes)
- [总结](/ch7#summary)

## [8. 事务](/ch8)
- [事务到底是什么？](/ch8#sec_transactions_overview)
- [弱隔离级别](/ch8#sec_transactions_isolation_levels)
- [可串行化](/ch8#sec_transactions_serializability)
- [分布式事务](/ch8#sec_transactions_distributed)
- [总结](/ch8#summary)
- [参考](/ch8#参考)

## [9. 分布式系统的麻烦](/ch9)
- [故障与部分失效](/ch9#sec_distributed_partial_failure)
- [不可靠的网络](/ch9#sec_distributed_networks)
- [不可靠的时钟](/ch9#sec_distributed_clocks)
- [知识、真相和谎言](/ch9#sec_distributed_truth)
- [总结](/ch9#summary)

## [10. 一致性与共识](/ch10)
- [线性一致性](/ch10#sec_consistency_linearizability)
- [ID 生成器和逻辑时钟](/ch10#sec_consistency_logical)
- [共识](/ch10#sec_consistency_consensus)
- [总结](/ch10#summary)

## [11. 批处理](/ch11)
- [使用 Unix 工具的批处理](/ch11#sec_batch_unix)
- [分布式系统中的批处理](/ch11#sec_batch_distributed)
- [批处理模型](/ch11#id431)
- [批处理用例](/ch11#sec_batch_output)
- [本章小结](/ch11#id292)
- [参考文献](/ch11#references)

## [12. 流处理](/ch12)
- [传递事件流](/ch12#sec_stream_transmit)
- [数据库与流](/ch12#sec_stream_databases)
- [流处理](/ch12#sec_stream_processing)
- [本章小结](/ch12#id332)
- [参考文献](/ch12#references)

## [13. 流式系统的哲学](/ch13)
- [数据集成](/ch13#sec_future_integration)
- [分拆数据库](/ch13#sec_future_unbundling)
- [追求正确性](/ch13#sec_future_correctness)
- [本章小结](/ch13#id367)
- [参考文献](/ch13#references)

## [14. 将事情做正确](/ch14)
- [预测分析](/ch14#id369)
- [隐私与追踪](/ch14#id373)
- [总结](/ch14#id594)
- [参考文献](/ch14#references)

## [术语表](/glossary)

## [后记](/colophon)
- [关于作者](/colophon#关于作者)
- [关于译者](/colophon#关于译者)
- [后记](/colophon#后记)


================================================
FILE: giscus.json
================================================
{
  "origins": [
    "https://vonng.github.io/ddia",
    "https://ddia.vonng.com",
    "https://ddia.pgsty.com",
    "http://localhost:1313"
  ],
  "originsRegex": [
    "http://localhost:[0-9]+"
  ]
}

================================================
FILE: go.mod
================================================
module github.com/Vonng/ddia

go 1.24.5

require github.com/imfing/hextra v0.11.0 // indirect


================================================
FILE: go.sum
================================================
github.com/imfing/hextra v0.9.7 h1:Zg5n24us36Bn/S/5mEUPkRW6uwE6vHHEqWSgN0bPXaM=
github.com/imfing/hextra v0.9.7/go.mod h1:cEfel3lU/bSx7lTE/+uuR4GJaphyOyiwNR3PTqFTXpI=
github.com/imfing/hextra v0.11.0 h1:2HswtfKD/TFg2VWp0hvsH5F3/WoEugiz8s3n2JFouqY=
github.com/imfing/hextra v0.11.0/go.mod h1:cEfel3lU/bSx7lTE/+uuR4GJaphyOyiwNR3PTqFTXpI=


================================================
FILE: hugo.yaml
================================================
# Canonical site URL, default language tag and site title.
baseURL: "https://ddia.vonng.com/"
languageCode: "zh-CN"
title: "设计数据密集型应用（第二版）"

# Generate robots.txt
enableRobotsTXT: true
# Read per-page Git commit information
enableGitInfo: true
# enableEmoji: false
# Content is Chinese: enable CJK-aware handling
hasCJKLanguage: true

# Google Analytics measurement ID.
services:
  googleAnalytics:
    ID: G-SGY2FPH1EG

# Output formats per page kind; only sections additionally emit RSS.
outputs:
  home:
    - HTML
  page:
    - HTML
  section:
    - HTML
    - RSS

# The Hextra theme is pulled in as a Hugo module.
module:
  imports:
    - path: github.com/imfing/hextra

# Each "language" is one edition/script variant of the book with its own
# content directory. The map keys (zh, tw, v1, v1_tw) are the Hugo language
# keys; languageCode is the language tag advertised for the rendered pages.
defaultContentLanguage: zh
languages:
  zh:
    languageName: 简体中文
    languageCode: zh
    contentDir: content/zh
    weight: 1
    title: 设计数据密集型应用（第二版）
  tw:
    # Traditional Chinese: the tag is zh-TW (a bare `tw` is the ISO 639-1
    # code for Twi), and the display name is written in Traditional script.
    languageName: 繁體中文
    languageCode: zh-TW
    contentDir: content/tw
    weight: 2
    title: 設計資料密集型應用（第二版）
  v1:
    languageName: 简体中文第一版
    languageCode: zh
    contentDir: content/v1
    weight: 3
    title: 设计数据密集型应用（第一版）
  v1_tw:
    languageName: 繁體中文第一版
    languageCode: zh-TW
    contentDir: content/v1_tw
    weight: 4
    title: 設計資料密集型應用（第一版）
  # English edition, currently disabled. Weight 5 keeps it after v1_tw.
  #en:
  #  languageName: English
  #  languageCode: en
  #  contentDir: content/en
  #  weight: 5
  #  title: Designing Data-Intensive Applications 2nd Edition

# Markdown rendering (Goldmark) and table-of-contents settings.
markup:
  goldmark:
    extensions:
      footnote: true         # enable footnote syntax: [^id] / [^id]: text
      linkify: true          # turn bare URL text into links automatically
      table: true            # enable Markdown tables
      taskList: true         # enable task lists [ ] / [x]
      typographer: true      # smart typography (quotes, dashes, etc.)
      passthrough:
        enable: true         # pass math delimiters through to Hextra's math renderer
        delimiters:
          block:
            - ['\[', '\]']
            - ['$$', '$$']
          inline:
            - ['\(', '\)']
    parser:
      attribute: true        # allow {#id .class key=val} after a heading, for explicit anchors
      autoHeadingID: true    # auto-generate heading IDs (a hand-written {#id} overrides it)
      autoHeadingIDType: github  # auto-ID scheme: github / blackfriday / none
    renderer:
      unsafe: true           # render raw HTML in Markdown (e.g. <a>, <details>) as-is
  tableOfContents:
    startLevel: 2            # ToC starts at h2
    endLevel: 4              # ToC ends at h4

# Navigation menus. `main` is the top navbar; `sidebar` holds extra links,
# grouped by `type: separator` entries and ordered by `weight`.
menu:
  main:
    - name: Search
      weight: 1
      params:
        type: search
    - name: GitHub
      weight: 2
      url: "https://github.com/Vonng/ddia"
      params:
        icon: github

  sidebar:

    # Group 1: editions / script variants of the book
    - identifier: ver
      name: 语言版本
      weight: 1
      params:
        type: separator
    - identifier: simplified-chinese
      name: "简体中文 ↗"
      url: "/"
      weight: 2
    - identifier: traditional-chinese
      name: "繁體中文 ↗"
      url: "/tw"
      weight: 3
    - identifier: chinese-1st-edition
      name: "简体中文初版 ↗"
      url: "/v1"
      weight: 4
    - identifier: traditional-1st-edition
      name: "繁體中文初版 ↗"
      url: "/v1_tw"
      weight: 5
    #- identifier: english
    #  name: "English ↗"
    #  url: "/en"
    #  weight: 5

    # Group 2: external reference links
    - identifier: more
      name: 参考链接
      params:
        type: separator
      weight: 6
    - identifier: vonng
      name: "博客：老冯云数 ↗"
      url: "https://vonng.com"
      weight: 7
    - identifier: pgint
      name: "PostgreSQL 内幕探索 ↗"
      url: "https://pgint.vonng.com/"
      weight: 8
    - identifier: pgint14
      name: "PostgreSQL 14 内参 ↗"
      url: "https://postgres-internals.cn/"
      weight: 9
    - identifier: pigsty-cc
      name: "Pigsty：开源 PG RDS ↗"
      url: "https://pigsty.cc/"
      weight: 10
    - identifier: pigsty-io
      name: "Pigsty: Free PG RDS ↗"
      url: "https://pigsty.io/"
      weight: 11
    - identifier: pgext
      name: "PG 扩展目录 ↗"
      url: "https://ext.pgsty.com/zh"
      weight: 12
    - identifier: ddia1
      name: "DDIA O'Reilly ↗"
      url: "https://www.oreilly.com/library/view/designing-data-intensive-applications/9781491903063/"
      weight: 13
    - identifier: ddia2
      name: "DDIA 2nd O'Reilly ↗"
      url: "https://www.oreilly.com/library/view/designing-data-intensive-applications/9781098119058/"
      weight: 14


# Theme (Hextra) parameters.
params:
  # Site description text.
  description: DDIA2，设计数据密集型应用（第二版）中文版翻译，设计数据密集型应用是一本关于数据系统设计的书籍，它深入探讨了数据密集型应用的架构和设计原则，涵盖了模型，存储，分区，事务，分布式系统、数据存储、流处理，批处理等各种主题。

  navbar:
    displayTitle: true
    displayLogo: true
    # The same image is used for both the light and the dark theme.
    logo:
      path: /logo.png
      dark: /logo.png
    width: wide

  page:
    # full (100%), wide (90rem), normal (80rem)
    width: full

  theme:
    # light | dark | system
    default: system
    displayToggle: true

  # NOTE(review): with `enable: false` the remaining footer options presumably
  # have no effect — confirm against the Hextra documentation.
  footer:
    enable: false
    displayCopyright: true
    displayPoweredBy: false
    width: normal

  # Display the last modification date
  displayUpdatedDate: true
  # Go reference-time layout, i.e. dates render as YYYY-MM-DD.
  dateFormat: "2006-01-02"

  # Search
  # flexsearch is enabled by default
  search:
    enable: true
    type: flexsearch

    flexsearch:
      # index page by: content | summary | heading | title
      index: content
      # full | forward | reverse | strict
      # https://github.com/nextapps-de/flexsearch/#tokenizer-prefix-search
      tokenize: forward

  # "Edit this page" links.
  # NOTE(review): each language has its own contentDir (content/zh,
  # content/tw, ...); confirm the generated links include that directory.
  editURL:
    enable: true
    base: "https://github.com/Vonng/ddia/edit/main/content"

  toc:
    displayTags: true

  highlight:
    copy:
      enable: true
      # hover | always
      display: hover

  # Page comments via giscus, using the repository and category below.
  comments:
    enable: true
    type: giscus
    giscus:
      repo: "Vonng/ddia"
      repoId: "MDEwOlJlcG9zaXRvcnkxMjA3MTA2NDQ"
      category: "Announcements"
      categoryId: "DIC_kwDOBzHl9M4CtlWB"
      mapping: pathname
      strict: 0
      reactionsEnabled: 1
      emitMetadata: 0
      inputPosition: bottom
      lang: zh-CN
      theme: preferred_color_scheme


================================================
FILE: i18n/en.yaml
================================================
copyright: '<a class="hx:flex hx:text-sm hx:items-center hx:gap-1 hx:text-current" target="_blank" rel="noopener noreferrer" title="Vonng" href="https://blog.vonng.com/en"><span>© 2025 Ruohang Feng</span></a>'


================================================
FILE: i18n/tw.yaml
================================================
# UI strings for the `tw` (Traditional Chinese) language.
# Keys mirror i18n/zh.yaml so every label has a Traditional Chinese form.
backToTop: "返回頂部"
changeLanguage: "切換語言"
changeTheme: "切換主題"
# HTML snippet, not plain text.
copyright: '<a class="hx:flex hx:text-sm hx:items-center hx:gap-1 hx:text-current" target="_blank" rel="noopener noreferrer" title="Vonng" href="https://blog.vonng.com/"><span>© 2025 馮若航</span></a>'
dark: "深色"
editThisPage: "在 GitHub 上編輯此頁 →"
lastUpdated: "最後更新於"
light: "淺色"
noResultsFound: "無結果"
onThisPage: "此頁上"
tags: "標籤"
readMore: "更多 →"
searchPlaceholder: "搜尋文檔..."
# Pagination labels; defined in zh.yaml but previously missing here.
previous: "上一頁"
next: "下一頁"

================================================
FILE: i18n/v2.yaml
================================================
# UI strings in Simplified Chinese.
# NOTE(review): hugo.yaml configures the language keys zh, tw, v1 and v1_tw;
# there is no `v2` key, so confirm whether this file is ever loaded. Its
# entries are currently identical to i18n/zh.yaml.
backToTop: "返回顶部"
changeLanguage: "切换语言"
changeTheme: "切换主题"
# HTML snippet, not plain text.
copyright: '<a class="hx:flex hx:text-sm hx:items-center hx:gap-1 hx:text-current" target="_blank" rel="noopener noreferrer" title="Vonng" href="https://blog.vonng.com/"><span>© 2025 冯若航</span></a>'
dark: "深色"
editThisPage: "在 GitHub 上编辑此页 →"
lastUpdated: "最后更新于"
light: "浅色"
noResultsFound: "无结果"
onThisPage: "此页上"
tags: "标签"
readMore: "更多 →"
searchPlaceholder: "搜索文档..."
previous: "上一页"
next: "下一页"

================================================
FILE: i18n/zh.yaml
================================================
# UI strings for the `zh` (Simplified Chinese) language.
backToTop: "返回顶部"
changeLanguage: "切换语言"
changeTheme: "切换主题"
# HTML snippet, not plain text.
copyright: '<a class="hx:flex hx:text-sm hx:items-center hx:gap-1 hx:text-current" target="_blank" rel="noopener noreferrer" title="Vonng" href="https://blog.vonng.com/"><span>© 2025 冯若航</span></a>'
dark: "深色"
editThisPage: "在 GitHub 上编辑此页 →"
lastUpdated: "最后更新于"
light: "浅色"
noResultsFound: "无结果"
onThisPage: "此页上"
tags: "标签"
readMore: "更多 →"
searchPlaceholder: "搜索文档..."
# Pagination labels.
previous: "上一页"
next: "下一页"

================================================
FILE: js/epub.css
================================================
/* This defines styles and classes used in the book */
@page {
  margin: 10px;
}
html, body, div, span, applet, object, iframe, h1, h2, h3, h4, h5, h6, p,
blockquote, pre, a, abbr, acronym, address, big, cite, code, del, dfn, em, img,
ins, kbd, q, s, samp, small, strike, strong, sub, sup, tt, var, b, u, i, center,
fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td,
article, aside, canvas, details, embed, figure, figcaption, footer, header,
hgroup, menu, nav, output, ruby, section, summary, time, mark, audio, video, ol,
ul, li, dl, dt, dd {
  margin: 0;
  padding: 0;
  border: 0;
  font-size: 100%;
  vertical-align: baseline;
}
html {
  line-height: 1.2;
  font-family: Georgia, serif;
  color: #1a1a1a;
}
p {
  text-indent: 0;
  margin: 1em 0;
  widows: 2;
  orphans: 2;
}
a, a:visited {
  color: #1a1a1a;
}
img {
  max-width: 100%;
}
sup {
  vertical-align: super;
  font-size: smaller;
}
sub {
  vertical-align: sub;
  font-size: smaller;
}
h1 {
  margin: 3em 0 0 0;
  font-size: 2em;
  page-break-before: always;
  line-height: 150%;
}
h2 {
  margin: 1.5em 0 0 0;
  font-size: 1.5em;
  line-height: 135%;
}
h3 {
  margin: 1.3em 0 0 0;
  font-size: 1.3em;
}
h4 {
  margin: 1.2em 0 0 0;
  font-size: 1.2em;
}
h5 {
  margin: 1.1em 0 0 0;
  font-size: 1.1em;
}
h6 {
  font-size: 1em;
}
h1, h2, h3, h4, h5, h6 {
  text-indent: 0;
  text-align: left;
  font-weight: bold;
  page-break-after: avoid;
  page-break-inside: avoid;
}

ol, ul {
  margin: 1em 0 0 1.7em;
}
li > ol, li > ul {
  margin-top: 0;
}
blockquote {
  margin: 1em 0 1em 1.7em;
}
code {
  font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace;
  font-size: 85%;
  margin: 0;
  hyphens: manual;
}
/*pre {*/
/*  margin: 1em 0;*/
/*  overflow: auto;*/
/*}*/
pre code {
  white-space: pre-wrap;
  word-wrap: break-word;
  background-color: #f5f5f5;
  padding: 1em;
}
.sourceCode {
  background-color: transparent;
  overflow: visible;
}
hr {
  background-color: #1a1a1a;
  border: none;
  height: 1px;
  margin: 1em 0;
}
table {
  margin: 1em 0;
  border-collapse: collapse;
  width: 100%;
  overflow-x: auto;
  display: block;
}
table caption {
  margin-bottom: 0.75em;
}
tbody {
  margin-top: 0.5em;
  border-top: 1px solid #1a1a1a;
  border-bottom: 1px solid #1a1a1a;
}
th, td {
  padding: 0.25em 0.5em 0.25em 0.5em;
}
th {
  border-top: 1px solid #1a1a1a;
}
header {
  margin-bottom: 4em;
  text-align: center;
}
#TOC li {
  list-style: none;
}
#TOC ul {
  padding-left: 1.3em;
}
#TOC > ul {
  padding-left: 0;
}
#TOC a:not(:hover) {
  text-decoration: none;
}
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}

/* This is the most compatible CSS, but it only allows two columns: */
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}
/* If you can rely on CSS3 support, use this instead: */
/* div.columns {
  display: flex;
  gap: min(4vw, 1.5em);
}
div.column {
  flex: auto;
  overflow-x: auto;
} */

div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}
ul.task-list {
  list-style: none;
}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
.display.math {
  display: block;
  text-align: center;
  margin: 0.5rem auto;
}

/* For title, author, and date on the cover page */
h1.title { }
p.author { }
p.date { }

nav#toc ol, nav#landmarks ol {
  padding: 0;
  margin-left: 1em;
}
nav#toc ol li, nav#landmarks ol li {
  list-style-type: none;
  margin: 0;
  padding: 0;
}
a.footnote-ref {
  vertical-align: super;
}
em, em em em, em em em em em {
  font-style: italic;
}
em em, em em em em {
  font-style: normal;
}
/* Quotation marks for <q>: curly double quotes at the outer level, curly
   single quotes when nested. `quotes` takes open/close string pairs. */
q {
  quotes: "“" "”" "‘" "’";
}
@media screen { /* Workaround for iBooks issue; see #6242 */
  .sourceCode {
    overflow: visible !important;
    white-space: pre-wrap !important;
  }
}


================================================
FILE: layouts/shortcodes/figure.html
================================================
{{- /*
  figure shortcode: renders a <figure> with an optional (linked) image,
  an optional title and an optional caption.

  Named parameters (all optional):
    src      image URL; without it no <img> and no trailing <br /> is emitted
    link     URL the image is wrapped in
    id       id attribute of the <figure>
    class    class attribute of the <figure>
    alt      alternative text of the image; defaults to the caption as plain text
    width    width attribute of the <img>
    height   height attribute of the <img>
    title    rendered as <h4>
    caption  Markdown, rendered inside <figcaption>
*/ -}}
{{- $src  := .Get "src" -}}
{{- $link := .Get "link" -}}
{{- /* Content usually passes only a caption, so reuse it (Markdown stripped) as alt text. */ -}}
{{- $alt  := .Get "alt" | default (.Get "caption" | markdownify | plainify) -}}

<figure
        {{ with .Get "id" }}id="{{ . }}"{{ end }}
{{ with .Get "class" }}class="{{ . }}"{{ end }}
>
{{- if $src -}}
{{- with $link }}<a href="{{ . }}">{{ end -}}
    <img
            src="{{ $src }}"
            alt="{{ $alt }}"
    {{ with .Get "width" }}width="{{ . }}"{{ end }}
    {{ with .Get "height" }}height="{{ . }}"{{ end }}
    />
    {{- if $link }}</a>{{ end -}}
{{- end -}}

{{ with .Get "title" }}<h4>{{ . }}</h4>{{ end }}
{{ with .Get "caption" }}<figcaption>{{ . | markdownify }}</figcaption>{{ end }}
</figure>
{{- if $src }}<br />{{- end -}}

================================================
FILE: metadata.yaml
================================================
---
title: 设计数据密集型应用
author: Martin Kleppmann
rights:  Creative Commons Non-Commercial Share Alike 3.0
language: zh
cover-image: ./static/title.jpg