[
  {
    "path": ".github/FUNDING.yml",
    "content": "# These are supported funding model platforms\n\npatreon: 0xAX\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/content-issue.yml",
    "content": "name: 📖 Content issue\ndescription: Report an issue with the content\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        Use this form to report an issue with the content.\n\n        When contributing, make sure to follow Contributing guidelines and Code of Conduct.\n        Thank you for your contribution!\n\n  - type: checkboxes\n    attributes:\n      label: Existing issues\n      description: Is there an existing issue for this? Search open and closed issues to avoid duplicates.\n      options:\n        - label: I have searched the existing issues.\n          required: true\n\n  - type: input\n    attributes:\n      label: Affected document\n      description: Name or paste a link to the document that contains an issue.\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Issue description\n      description: Explain what is unclear or confusing in the given document.\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Attachments\n      description: Include screenshots or links if applicable.\n    validations:\n      required: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/question.yml",
    "content": "name: ❓ Questions and discussions\ndescription: Ask a question or start a discussion with other community members.\nbody:\n  - type: markdown\n    attributes:\n      value: |\n        Use this form to ask a question or start a discussion with other community members.\n\n        When contributing, make sure to follow Contributing guidelines and Code of Conduct.\n        Thank you for your contribution!\n\n  - type: checkboxes\n    attributes:\n      label: Existing issues\n      description: Is there an existing issue for this? Search open and closed issues to avoid duplicates.\n      options:\n        - label: I have searched the existing issues.\n          required: true\n\n  - type: textarea\n    attributes:\n      label: Question\n      description: Ask a question you would like to discuss with the community.\n    validations:\n      required: false\n\n  - type: textarea\n    attributes:\n      label: Discussion\n      description: Start a discussion topic.\n    validations:\n      required: false\n\n  - type: textarea\n    attributes:\n      label: Attachments\n      description: Include screenshots, links, or example's output if applicable.\n    validations:\n      required: false\n"
  },
  {
    "path": ".github/dependabot.yaml",
    "content": "version: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\"\n    schedule:\n      interval: \"daily\"\n"
  },
  {
    "path": ".github/pull-request-template.md",
    "content": "<!-- Thank you for your contribution. When contributing to the project, remember to:\n- Read the Contribution guide.\n- Follow the Code of Conduct.\n-->\n\n**Description**\n\n<!-- In this section, provide a description of your changes. The context and justification let others understand your motivation and the purpose of the pull request. Follow the description with a list that summarises the most relevant changes included in the pull request. -->\n\nChanges proposed in this pull request:\n\n- ...\n- ...\n- ...\n\n**Related issues**\n\n<!-- Link the related issue here, if applicable. -->\n"
  },
  {
    "path": ".github/workflows/check-code-snippets.yaml",
    "content": "name: check code snippets\n\non:\n  workflow_dispatch:\n  push:\n    branches:\n      - main\n  pull_request:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}\n  cancel-in-progress: true\n\njobs:\n  check-code-snippets:\n    name: check-code-snippets\n    runs-on:\n      - ubuntu-22.04\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n      - name: Setup python\n        uses: actions/setup-python@v6\n        with:\n          python-version: '3.13'\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          pip install requests  \n      - name: Validate code snippets\n        run: |\n          python ./scripts/check_code_snippets.py .\n"
  },
  {
    "path": ".github/workflows/check-links.yaml",
    "content": "name: check links\n\non:\n  workflow_dispatch:\n  push:\n    branches:\n      - main\n      - master\n  pull_request:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}\n  cancel-in-progress: true\n\njobs:\n  check-links:\n    name: check-links\n    runs-on:\n      - ubuntu-22.04\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n\n      - name: Check links with lychee\n        uses: lycheeverse/lychee-action@v2\n        with:\n          # Check README.md and all files in Booting directory\n          args: |\n            --verbose\n            --no-progress\n            --max-retries 3\n            --timeout 20\n            README.md\n            'Booting/*.md'\n          fail: true\n        env:\n          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}\n"
  },
  {
    "path": ".github/workflows/generate-e-books.yaml",
    "content": "name: Generate e-books\n\non:\n  workflow_dispatch: {}\n\njobs:\n  build-for-pr:\n    # For every PR, build the same artifacts and make them accessible from the PR.\n    if: github.event_name == 'pull_request'\n    runs-on: ubuntu-latest\n\n    permissions:\n      contents: read\n      pull-requests: write\n\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n\n      - name: Export all supported book formats from the Docker container\n        run: |\n          make run\n          make export\n\n      - name: Copy generated files to host system\n        run: |\n          make cp\n          mkdir -p artifacts/\n          mv \"Linux Inside - 0xAX.epub\" \\\n             \"Linux Inside - 0xAX.mobi\" \\\n             \"Linux Inside - 0xAX.pdf\" \\\n             \"Linux Inside - 0xAX (A5).pdf\" \\\n             artifacts/\n\n      - name: Upload PR artifacts\n        uses: actions/upload-artifact@v7\n        with:\n          name: ebooks-${{ github.sha }}\n          path: artifacts/*\n          if-no-files-found: error\n          # Change the retention period here if necessary.\n          retention-days: 7\n\n      - name: Add a comment with a link to the generated artifacts.\n        # For forked PRs the token is read-only; skip commenting to avoid failures.\n        if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}\n        uses: actions/github-script@v8\n        env:\n          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\n        with:\n          script: |\n            const body = [\n              `E-books generated for this pull request available at: ${process.env.RUN_URL}`\n            ].join('\\n');\n            await github.rest.issues.createComment({\n              owner: context.repo.owner,\n              repo: context.repo.repo,\n              issue_number: context.issue.number,\n              body\n            
});\n"
  },
  {
    "path": ".github/workflows/release-e-books.yaml",
    "content": "name: Release e-books\n\non:\n  push:\n    tags:\n      - 'v*.*' # Create a release only when a new tag matching v*.* is pushed.\n    # To also create a release for each push to the main branch, uncomment the following 2 lines:\n    # branches:\n    #   - master\n  workflow_dispatch: {}  # For manual runs.\n\njobs:\n  release-ebooks:\n    runs-on: ubuntu-latest\n\n    permissions:\n      contents: write\n\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v6\n\n      - name: Export all supported book formats from the Docker container\n        run: |\n          make run\n          make export\n\n      - name: Copy generated files to host system\n        run: |\n          make cp\n          mkdir -p artifacts/\n          mv \"Linux Inside - 0xAX.epub\" \\\n             \"Linux Inside - 0xAX.mobi\" \\\n             \"Linux Inside - 0xAX.pdf\" \\\n             \"Linux Inside - 0xAX (A5).pdf\" \\\n             artifacts/\n          cp LICENSE artifacts/\n\n      - name: Prepare release metadata\n        # Use tag name when running on a tag, otherwise fall back to the short commit hash.\n        id: meta\n        env:\n          GITHUB_REF_TYPE: ${{ github.ref_type }}\n          GITHUB_REF_NAME: ${{ github.ref_name }}\n        run: |\n          DATE_UTC=\"$(date -u '+%m/%d/%Y %H:%M')\"\n          if [ \"${GITHUB_REF_TYPE}\" = \"tag\" ] && [ -n \"${GITHUB_REF_NAME}\" ]; then\n            LABEL=\"${GITHUB_REF_NAME}\"\n          else\n            LABEL=\"$(git rev-parse --short HEAD)\"\n          fi\n          echo \"release_name=${DATE_UTC} (${LABEL})\" >> \"$GITHUB_OUTPUT\"\n          echo \"tag_name=${LABEL}\" >> \"$GITHUB_OUTPUT\"\n\n      - name: Create GitHub release\n        uses: softprops/action-gh-release@v2\n        with:\n          files: artifacts/*\n          name: ${{ steps.meta.outputs.release_name }}\n          tag_name: ${{ steps.meta.outputs.tag_name }}\n          target_commitish: ${{ github.sha }}\n          
generate_release_notes: true\n          fail_on_unmatched_files: true\n"
  },
  {
    "path": ".gitignore",
    "content": "*.tex\nbuild\n"
  },
  {
    "path": "Booting/README.md",
    "content": "# Kernel Boot Process\n\nWelcome to the boot journey of the Linux kernel, from power-on to the first instruction of the decompressed kernel. This chapter walks the complete boot path step by step from the moment you power on your computer to the moment the Linux kernel loaded in the memory of your machine.\n\n## How to read\n\nThis chapter assumes you are comfortable with basic computer architecture and have a light familiarity with `C` programming language and x86_64 assembly syntax. You do not need to be a kernel expert, but being able to read short code snippets and recognize hardware terms will help.\n\nEach part of this chapter focuses on one boot phase. Read in order the first time, then revisit individual steps as references when you want to map a specific symbol or register setup to its place in the sequence. It is quite useful to have the source code of Linux kernel on your local computer to follow the details. You can obtain the source code using the following command:\n\n```bash\ngit clone git@github.com:torvalds/linux.git\n```\n\n## Notation used\n\nDuring reading this and other chapters, you may encounter special notation:\n\n- `CS`, `DS`, `SS`, `CR0`, `CR3`, `CR4`, `EFER` - refer to x86 segment and control registers\n- `0x...` - denotes hexadecimal values\n- `entry_*` and `startup_*` - are common prefixes for early boot symbols\n- `setup code` refers to the early part of the Linux kernel which executes preparation to load the kernel code itself into memory\n- `decompressor` refers to the part of the `setup code` that inflates the compressed kernel image into memory\n\n## What you will learn\n\n- The way a processor reaches the kernel entry point from firmware and the bootloader\n- Different modes of x86_64 processors\n- What the early setup code does before the kernel itself will be loaded into memory and start its work\n\n## Reading order\n\n1. 
[From the bootloader to kernel](linux-bootstrap-1.md) - from power-on to the first instruction in the kernel\n2. [First steps in the kernel setup code](linux-bootstrap-2.md) - early setup, heap init, parameter discovery (EDD, IST, and more)\n3. [Video mode initialization and transition to protected mode](linux-bootstrap-3.md) - video mode setup and the move to protected mode\n4. [Transition to 64-bit mode](linux-bootstrap-4.md) - preparation and the jump into long mode\n5. [Kernel Decompression](linux-bootstrap-5.md) - pre-decompression setup and the decompressor itself\n6. [Kernel load address randomization](linux-bootstrap-6.md) - how KASLR picks a load address\n\n## Kernel version\n\nThis chapter corresponds to `Linux kernel v6.19`.\n"
  },
  {
    "path": "Booting/linux-bootstrap-1.md",
    "content": "# Kernel Booting Process — Part 1\n\nIf you’ve read my earlier [posts](https://github.com/0xAX/asm) about [assembly language](https://en.wikipedia.org/wiki/Assembly_language) for Linux x86_64, you might see that I started to get interested in low-level programming. I’ve written a set of articles on assembly programming for [x86_64](https://en.wikipedia.org/wiki/X86-64) Linux and, in parallel, began exploring the Linux kernel source code. I’ve always been fascinated by what happens under the hood — how programs execute on a CPU, how they’re laid out in memory, how the kernel schedules processes and manages resources, how the network stack operates at a low level, and many other details. This series is a way of sharing my journey.\n\n> [!NOTE]\n> This is not official Linux kernel documentation, it is a learning project. I’m not a professional Linux kernel developer, and I don’t write kernel code as part of my daily job. Learning how the Linux kernel works is just my hobby. If you find anything unclear, spot an error, or have questions or suggestions, feel free to reach out - you always can ping me on X [0xAX](https://twitter.com/0xAX), send me an [email](mailto:anotherworldofworld@gmail.com) or open a new [issue](https://github.com/0xAX/linux-insides/issues/new). Your feedback is always welcome and appreciated.\n\nThe main goal of this series is to provide a guide to the Linux kernel for readers who want to begin learning how it works. We will explore not only what the kernel does, but will try to understand how and why it does it. Despite being considered to be understandable for anyone who is interested in Linux kernel, it is highly recommended to have some prior knowledge before starting to read these notes. If you want to experiment with the kernel code, first of all it is best to have a [Linux distribution](https://en.wikipedia.org/wiki/Linux_distribution) installed. 
Besides that, on these pages we will see much of [C](https://en.wikipedia.org/wiki/C_(programming_language)) and [assembly](https://en.wikipedia.org/wiki/Assembly_language) code, so the good understanding of these programming languages is highly required.\n\n> [!IMPORTANT]\n> I started writing this series when the latest version of the kernel was `3.18`. A lot has changed since then, and I am in the process of updating the content to reflect modern kernels where possible — now focusing on v6.16+. I’ll continue revising the posts as the kernel evolves.\n\nThat’s enough introduction — let’s dive into the Linux kernel!\n\n## The Magic Power Button - What happens next?\n\nAlthough this is a series of posts about Linux kernel, we will not jump straight into kernel code. First, let’s step back and look at what happens before the kernel even comes into play. Everything starts from the turning on a computer. And we will start from this point as well.\n\nWhen you press the \"magic\" power button on your laptop or desktop computer, the [motherboard](https://en.wikipedia.org/wiki/Motherboard) sends a signal to the [power supply](https://en.wikipedia.org/wiki/Power_supply). In response, the power supply delivers the proper amount of electricity to other components of the computer. Once the motherboard receives the [power good signal](https://en.wikipedia.org/wiki/Power_good_signal), it triggers the CPU to start. The CPU then performs a reset: it clears any leftover data in its registers and loads predefined values into each of them, preparing for the very first instructions of the boot process.\n\nEach **x86_64** processor begins execution in a special mode called [real mode](https://en.wikipedia.org/wiki/Real_mode). This mode exists for historical reasons - to be compatible with the earliest processors. 
Real mode is supported on all x86-compatible processors — from the original [8086](https://en.wikipedia.org/wiki/Intel_8086) to today’s modern 64-bit CPUs.\n\nThe **8086** was a 16-bit microprocessor. Basically it means that its general-purpose registers and instruction pointer were `16` bits wide. However, the chip was designed with a `20-bit` physical memory address bus — the set of electrical lines used to select memory locations. With `20` address lines, the CPU can form addresses from `0x00000` to `0xFFFFF`, giving access to exactly `1 MB` of physical memory or `2^20` bytes.\n\nBecause the registers on **8086** processors were only `16` bits wide, the largest value they could hold was `0xFFFF` which equals 64 KB. This means that, using just a single 16-bit value, the CPU could only directly address 64 KB of memory at a time. This leads us to the question - how can a processor with 16-bit registers access 20-bit addresses? The answer is [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation).\n\nTo make use of the entire 1 MB space provided by the 20-bit address bus, the **8086** used a scheme called [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation). All memory is divided into small, fixed-size segments of `65_536` bytes each. Instead of using just one value to identify a memory location, a CPU uses the two:\n\n1. Segment selector — identifies the starting point (base address) of a 64 KB segment. Represented by the value of the `cs` (code-segment) register.\n2. Offset — specifies how far into that segment the target address is. 
Represented by the value of the `ip` register.\n\nIn real mode, the base address for a given segment selector is calculated as:\n\n```\nBase Address = Segment Selector << 4\n```\n\nTo compute the final physical memory address, the CPU adds the base address to the offset:\n\n```\nPhysical Address = Base Address + Offset\n```\n\nFor example, if the value of the `cs:ip` is `0x2000:0x0010`, then the corresponding physical address will be:\n\n```python\n>>> hex((0x2000 << 4) + 0x0010)\n'0x20010'\n```\n\nIf we take the largest possible values for the segment selector and the offset - `0xFFFF:0xFFFF`, the resulting address will be:\n\n```python\n>>> hex((0xffff << 4) + 0xffff)\n'0x10ffef'\n```\n\nThis gives us the address `0x10FFEF`, which is `65_520` bytes past the 1 MB boundary. Since, in real mode on the original **8086** CPU, the CPU could only access the first 1 MB of memory, any address above `0xFFFFF` would wrap around back to the beginning of the address space. On modern **386+** CPUs the physical bus is wider even in real mode, but the address computation is still based on the `segment:offset` pair.\n\nNow that we understand the basics of real mode and its memory addressing limitations, let’s return to the state after a hardware reset.\n\n## First code executed after reset\n\nThe system has just been powered on, the reset signal has been released, and the processor is waking up to execute its first instructions. 
The [80386](https://en.wikipedia.org/wiki/Intel_80386) and later CPUs set the following [register](https://en.wikipedia.org/wiki/X86#x86_registers) values after a hardware reset:\n\n| Register           | Value        | Meaning                                                                        |\n| ------------------ | ------------ | ------------------------------------------------------------------------------ |\n| `ip`               | `0xFFF0`     | Instruction pointer; execution starts here within the current code segment     |\n| `cs` (selector)    | `0xF000`     | Visible code segment selector value after reset                                |\n| `cs` (base)        | `0xFFFF0000` | Hidden descriptor base address loaded into `cs` during reset                   |\n\nIn real mode, the base address is normally formed by shifting the 16-bit segment selector value 4 bits left to produce a 20-bit physical address. However, after the hardware reset the first instruction will be located at the special address. We may see that the segment selector in the `cs` register is loaded with `0xF000` but the hidden base address is loaded with `0xFFFF0000`. Instead of using the usual formula to get the address, the processor uses this value as the base address of the first instruction. Having the value of the base address and the offset (from the `ip` register), the starting address will be:\n\n```python\n>>> hex(0xffff0000 + 0xfff0)\n'0xfffffff0'\n```\n\nWe got `0xFFFFFFF0`, which is 16 bytes below 4GB. This is the very first address where the CPU starts the execution after reset. This address has special name - [reset vector](https://en.wikipedia.org/wiki/Reset_vector). It is the memory location at which the CPU expects to find the first instruction to execute after reset. 
Usually it contains a [jump](https://en.wikipedia.org/wiki/JMP_%28x86_instruction%29) (`jmp`) instruction which points to the [BIOS](https://en.wikipedia.org/wiki/BIOS) or [UEFI](https://en.wikipedia.org/wiki/UEFI) entry point. For example, if we take a look at the [source code](https://github.com/coreboot/coreboot/blob/main/src/cpu/x86/entry16.S) of the [coreboot](https://www.coreboot.org/), we will see it there:\n\n<!-- https://raw.githubusercontent.com/coreboot/coreboot/refs/heads/main/src/cpu/x86/entry16.S#L155-L159 -->\n```assembly\n  /* This is the first instruction the CPU runs when coming out of reset. */\n.section \".reset\", \"ax\", %progbits\n.globl _start\n_start:\n\tjmp\t\t_start16bit\n```\n\nTo prove that this code is located at the `0xFFFFFFF0` address, we may take a look at the [linker script](https://github.com/coreboot/coreboot/blob/master/src/arch/x86/bootblock.ld):\n\n<!-- https://raw.githubusercontent.com/coreboot/coreboot/refs/heads/master/src/arch/x86/bootblock.ld#L72-L78 -->\n```linker-script\n\t. = 0xfffffff0;\n\t_X86_RESET_VECTOR = .;\n\t.reset . : {\n\t\t*(.reset);\n\t\t. = _X86_RESET_VECTOR_FILLING;\n\t\tBYTE(0);\n\t}\n```\n\nThe address `0xFFFFFFF0` is much larger than `0xFFFFF` (1MB). How can the CPU access this address in real mode? The answer is simple. Most likely you have something more modern than an **8086** CPU with a 20-bit address bus. More modern processors start in real mode but with a 32-bit or 64-bit bus.\n\nWhen the CPU wakes up, it reads the jump at the `0xFFFFFFF0` address, jumps into the firmware, and the long chain of the boot process begins. This is the very first step on the way to boot the Linux kernel.\n\n## From Power-On to Bootloader\n\nWe stopped at the point when a CPU jumps from the reset vector to the firmware. On a legacy PC, that means the BIOS. On modern computers it is UEFI. 
In the next chapters we will see the booting processes on a legacy PC using the BIOS, and later UEFI.\n\nThe first job of the BIOS is to bring the system into a working state. It runs a series of hardware checks and initializations — memory tests, peripheral setup, chipset configuration — all part of the [POST](https://en.wikipedia.org/wiki/Power-on_self-test) routine. Once everything is checked, the next step is to find an operating system to boot. The BIOS doesn’t pick just a random disk. It follows a boot order, a list stored in its configuration.\n\nWhen the BIOS tries to boot from a hard drive, it looks for a [boot sector](https://en.wikipedia.org/wiki/Boot_sector). On hard drives partitioned with an [MBR partition layout](https://en.wikipedia.org/wiki/Master_boot_record), the boot code is stored in the first `446` bytes of the first sector, where each sector is `512` bytes. The final two bytes of the first sector must be `0x55` and `0xAA`. These last two bytes tell the BIOS something like \"yes - this device is bootable\". 
Once the BIOS finds the valid boot sector, it copies it into the fixed memory location at `0x7C00`, jumps there and starts executing it.\n\nIn general, real mode's memory map is as follows:\n\n| Address Range         | Description                          |\n|-----------------------|--------------------------------------|\n| 0x00000000–0x000003FF | Real Mode Interrupt Vector Table     |\n| 0x00000400–0x000004FF | BIOS Data Area                       |\n| 0x00000500–0x00007BFF | Unused                               |\n| 0x00007C00–0x00007DFF | Bootloader                           |\n| 0x00007E00–0x0009FFFF | Unused                               |\n| 0x000A0000–0x000BFFFF | Video RAM (VRAM) Memory              |\n| 0x000B0000–0x000B7FFF | Monochrome Video Memory              |\n| 0x000B8000–0x000BFFFF | Color Video Memory                   |\n| 0x000C0000–0x000C7FFF | Video ROM BIOS                       |\n| 0x000C8000–0x000EFFFF | BIOS Shadow Area                     |\n| 0x000F0000–0x000FFFFF | System BIOS                          |\n\nWe can do a simple experiment and create a very primitive boot code:\n\n```assembly\n;;\n;; Note: this example is written using NASM assembler\n;;\n[BITS 16]\n\nboot:\n    ;; Symbol to print\n    mov al, '!'\n    ;; TTY-style text output\n    mov ah, 0x0e\n    ;; Display page number\n    mov bh, 0x00\n    ;; Color\n    mov bl, 0x07\n    ;; Interrupt call\n    int 0x10\n    jmp $\n\ntimes 510-($-$$) db 0\n\ndb 0x55\ndb 0xaa\n```\n\nYou can build and run this code using the following commands:\n\n```bash\nnasm -f bin boot.S && qemu-system-x86_64 boot -nographic\n```\n\nThis will instruct the [QEMU](https://www.qemu.org/) virtual machine to use the `boot` binary that we just built as a disk image. 
Since the binary generated by the assembly code above fulfills the requirements of the boot sector (we end it with the magic sequence), QEMU will treat the binary as the master boot record (MBR) of a disk image.\n\nIf you did everything correctly, you will see something like this after running the command above:\n\n```\nSeaBIOS (version 1.17.0-5.fc42)\n\niPXE (https://ipxe.org) 00:03.0 CA00 PCI2.10 PnP PMM+06FCAEC0+06F0AEC0 CA00\n\nBooting from Hard Disk...\n!\n```\n\nOf course, a real-world boot sector has \"slightly\" more code, which loads an operating system instead of printing an exclamation mark, but it may be interesting to experiment. In this example, we can see that the code will be executed in `16-bit` real mode which is specified by the `[BITS 16]` directive. After starting, it calls the [0x10](https://en.wikipedia.org/wiki/INT_10H) interrupt, which just prints the `!` symbol. The `times` directive pads the binary with zeros up to the `510th` byte. In the end we \"hard-code\" the last two magic bytes `0x55` and `0xAA`. To exit from the virtual machine, you can press - `Ctrl+a x`.\n\nFrom this point onwards, the BIOS hands control over to the bootloader.\n\n## The Bootloader Stage\n\nThere are a number of different bootloaders that can boot the Linux kernel, such as [GRUB 2](https://www.gnu.org/software/grub/), [syslinux](http://www.syslinux.org/wiki/index.php/The_Syslinux_Project), [systemd-boot](https://github.com/ivandavidov/systemd-boot), and others. The Linux kernel has a [Boot protocol](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst) which specifies the requirements for a bootloader to implement Linux support. In this chapter, we will take a short look at how GRUB 2 loads it.\n\nContinuing from where we left off - the BIOS has now selected a boot device, found its boot sector, loaded it into memory and passed control to the code located there. 
The GRUB 2 bootloader consists of multiple [stages](https://www.gnu.org/software/grub/manual/grub/grub.html#Images). The first stage of the boot code is in the [boot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/boot.S) source code file. Due to the limited amount of space for the first boot sector, this code has only a single goal - to load the [core image](https://www.gnu.org/software/grub/manual/grub/html_node/Images.html) into memory and jump to it.\n\nThe core image starts with [diskboot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/diskboot.S), which is usually stored right after the first sector of the disk. The code from the `diskboot.S` file loads the rest of the core image into memory. The core image contains the code of the loader itself and drivers for reading different filesystems. After the whole core image is loaded into memory, the execution continues from the [grub_main](https://github.com/rhboot/grub2/blob/master/grub-core/kern/main.c) function. This is where GRUB sets up the environment it needs to operate:\n\n- Initializes the console so messages and menus can be displayed.\n- Sets the root device — the disk from which GRUB will read modules and configuration files.\n- Loads and parses the GRUB configuration file.\n- Loads required modules.\n\nOnce these tasks are complete, we may see the familiar GRUB menu where we can choose the operating system we want to load. When we select one of the menu entries, GRUB executes the [boot](https://www.gnu.org/software/grub/manual/grub/grub.html#boot) command which boots the selected operating system. So how does the loader load the Linux kernel? To answer this question, we need to get back to the Linux kernel boot protocol.\n\nAs we can read in the [documentation](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst), the bootloader must load the kernel into memory, fill some fields in the kernel setup header and pass control to the kernel code. 
The very first part of the kernel code is the so-called kernel setup header and setup code. The kernel setup header is a special structure embedded in the early Linux boot code and provides fields that describe how the kernel should be loaded and started. The setup header starts at the `0x01F1` offset from the beginning of the kernel image. We may look at the boot [linker script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) to confirm the value of this offset:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/setup.ld#L70-70 -->\n```linker-script\n\t. = ASSERT(hdr == 0x1f1, \"The setup header has the wrong offset!\");\n```\n\nThe kernel [setup header](https://github.com/torvalds/linux/blob/master/arch/x86/boot/header.S) is split into two parts and the first part starts from the following fields:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L233-L241 -->\n```assembly\n\t.globl\thdr\nhdr:\n\t\t.byte setup_sects - 1\nroot_flags:\t.word ROOT_RDONLY\nsyssize:\t.long ZO__edata / 16\nram_size:\t.word 0\t\t\t/* Obsolete */\nvid_mode:\t.word SVGA_MODE\nroot_dev:\t.word 0\t\t\t/* Default to major/minor 0/0 */\nboot_flag:\t.word 0xAA55\n```\n\nThe bootloader may fill some of these fields in the setup header which are marked as being type `write` or `modify` in the Linux boot protocol. The values set by the bootloader will be taken from its configuration or will be calculated during boot. Of course we will not go over full descriptions and explanations of all the fields of the kernel setup header. 
Instead, we will take a look closer at this or that field if we will meet it during our research of the kernel code.\n\nAccording to the Linux kernel boot protocol, memory will be mapped as follows after loading the kernel:\n\n```\n              ~                        ~\n              |  Protected-mode kernel |\n100000        +------------------------+\n              |  I/O memory hole       |\n0A0000        +------------------------+\n              |  Reserved for BIOS     |      Leave as much as possible unused\n              ~                        ~\n              |  Command line          |      (Can also be below the X+10000 mark)\nX+10000       +------------------------+\n              |  Stack/heap            |      For use by the kernel real-mode code.\nX+08000       +------------------------+\n              |  Kernel setup          |      The kernel real-mode code.\n              |  Kernel boot sector    |      The kernel legacy boot sector.\nX             +------------------------+\n              |  Boot loader           |      <- Boot sector entry point 0000:7C00\n001000        +------------------------+\n              |  Reserved for MBR/BIOS |\n000800        +------------------------+\n              |  Typically used by MBR |\n000600        +------------------------+\n              |  BIOS use only         |\n000000        +------------------------+\n\n... where the address X is as low as the design of the boot loader permits.\n```\n\nWe can see that when the bootloader transfers control to the kernel, execution starts right after the kernel’s boot sector — that is, at the address `X` plus the length of the boot sector. The value of this `X` depends on how the kernel loaded. 
For example if I try to load kernel just with [qemu](https://www.qemu.org/), the starting address of the kernel image is at `0x10000`:\n\n```bash\nhexdump -C /tmp/dump | grep MZ\n00010000  4d 5a 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |MZ..............|\n```\n\nLinux kernel image starts from `4D 5A` bytes as you may see in the beginning of the kernel setup code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L42-L46 -->\n```assembly\n\t.code16\n\t.section \".bstext\", \"ax\"\n#ifdef CONFIG_EFI_STUB\n\t# \"MZ\", MS-DOS header\n\t.word\tIMAGE_DOS_SIGNATURE\n```\n\nIf you want to get a similar memory dump, follow these steps. First of all, you need to build kernel. If you do not know how to do it, you can find detailed instruction [here](https://github.com/0xAX/linux-insides/blob/master/Misc/linux-misc-1.md). On the diagram above, we can see that the `Protected-mode` kernel starts from `0x100000`. Knowing this address we can start the kernel in the qemu virtual machine with the following command:\n\n```bash\nsudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \\\n                        -nographic                            \\\n                        -append \"console=ttyS0 nokaslr\"       \\\n                        -initrd /boot/initramfs-6.17.0-rc1-g8f5ae30d69d7.img -s -S\n```\n\nAfter the virtual machine is started, we can attach the debugger to it, set up a breakpoint on the entry point and get the dump:\n\n```bash\ngdb vmlinux\n(gdb) target remote :1234\n(gdb) hbreak *0x100000\n(gdb) c\nContinuing.\n\nBreakpoint 1, 0x0000000000100000 in ?? ()\n(gdb) dump binary memory /tmp/dump 0x0000 0x20000\n```\n\nAfter this you should be able to find your dump in the `/tmp/dump`.\n\nIf we try to load Linux kernel using GRUB 2 bootloader, this `X` address will be `0x90000`. Let's take a look how to do it and check. First of all you need to prepare image with kernel and GRUB 2. 
To do so execute the following commands:\n\n```bash\nqemu-img create hdd.img 64M\nparted hdd.img --script mklabel msdos\nparted hdd.img --script mkpart primary ext2 1MiB 100%\nparted hdd.img --script set 1 boot on\nLO_DEVICE=$(losetup -f)\nsudo losetup -P \"${LO_DEVICE}\" hdd.img\nsudo mkfs.ext2 \"${LO_DEVICE}\"p1\nsudo mount \"${LO_DEVICE}\"p1 /mnt/tmp\nsudo mkdir -p /mnt/tmp/boot/grub\nsudo grub2-install \\\n  --target=i386-pc \\\n  --boot-directory=/mnt/tmp/boot \\\n  \"${LO_DEVICE}\"\nsudo cp ./arch/x86/boot/bzImage /mnt/tmp/boot/\nsudo tee /mnt/tmp/boot/grub/grub.cfg > /dev/null <<EOF\nterminal_input serial\nterminal_output serial\nset timeout=0\nset default=0\nset debug=linux\n\nmenuentry \"Linux\" {\n    linux /boot/bzImage earlyprintk=serial,0x3f8,115200\n}\nEOF\nsudo umount /mnt/tmp\nsudo losetup -d \"${LO_DEVICE}\"\n```\n\nNow we can run qemu virtual machine with our image:\n\n```bash\nqemu-system-x86_64 -drive format=raw,file=hdd.img -m 256M -s -S -no-reboot -no-shutdown -vga virtio\n```\n\nConnect with [gdb](https://sourceware.org/gdb/) debugger and setup breakpoint:\n\n```\n$ gdb\n(gdb) target remote localhost:1234\nRemote debugging using localhost:1234\n(gdb) break *0x90200\nBreakpoint 1 at 0x90200\n(gdb) c\nContinuing.\n```\n\nIf you did everything correctly, you will see the GRUB 2 prompt in the qemu window. Execute the following commands:\n\n```\nset pager=1\nset debug=all\nlinux /boot/bzImage\nboot\n```\n\nDuring the execution of the `linux` command, you will see the debug line:\n\n```\nrelocator: min_addr = 0x0, max_addr = 0xffffffff, target = 0x90000\n```\n\nThat confirms that the kernel image will be loaded at the `0x90000` address. During execution of the `boot` command, the breakpoint should be caught. 
In debugger you can execute `i r` command and see that we are at the `0x9020:0x0000`\n\n```\nrip            0x0                 0x0\ncs             0x9020              36896\n```\n\nIf you continue to execute `s i` commands in the debugger CLI, you will go step by step through the early kernel setup code. If you exit from the debugger, you will see the continuation of the kernel loading procedure.\n\nIn addition, we can confirm this address using the same approach as in the example with QEMU above. We know that according to the Linux kernel boot protocol, the protected mode kernel is loaded at the `100000` address. We can set a breakpoint at this address and create a memory dump. To do this, run the QEMU virtual machine using the same command:\n\n```bash\nqemu-system-x86_64 -drive format=raw,file=hdd.img -m 256M -s -S -no-reboot -no-shutdown -vga virtio\n```\n\nAt the next step, attach with gdb to the virtual machine:\n\n```\n(gdb) target remote localhost:1234\nRemote debugging using localhost:1234\n0x000000000000fff0 in ?? ()\n(gdb) break *0x100000\nBreakpoint 1 at 0x100000\n(gdb) c\nContinuing.\n```\n\nAt the beginning, the breakpoint stops us at the GRUB code itself. Because of this, we need to continue in the debugger with the `c` command. Return to the QEMU window now, and execute these commands:\n\n```\nset pager=1\nset debug=all\nlinux /boot/bzImage\nboot\n```\n\nDuring the boot process, the debugger stops us the second time at the breakpoint which we set at the `100000` address:\n\n```\nBreakpoint 1, 0x0000000000100000 in ?? ()\n(gdb) c\nContinuing.\n```\n\nThis time, we are at the entry point of the Linux kernel in protected mode. 
Execute the next command in the debugger shell to get a memory dump:\n\n```\ndump binary memory /tmp/dump 0x0000 0x200000\n```\n\nNow we can inspect the memory dump at the `0x90000` address:\n\n```bash\n~$ hexdump -C /tmp/dump | grep 00090000\n00090000  4d 5a 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |MZ..............|\n```\n\nWe can see the same `MZ` bytes with which the Linux kernel setup code starts. In addition, we can inspect the memory at the `0x90200` offset to see that there is a kernel setup header:\n\n```bash\n~$ hexdump -C /tmp/dump | grep 00090200\n00090200  eb 6a 48 64 72 53 0f 02  00 00 00 00 00 10 00 43  |.jHdrS.........C|\n```\n\n## The Beginning of the Kernel Setup Stage\n\nThe bootloader has now loaded the Linux kernel and the kernel setup code into memory, filled the header fields, and then jumped to the corresponding memory address. Finally, we are in the kernel 🎉\n\nTechnically, the kernel itself hasn't run yet - only the early kernel setup code has. First, the kernel setup code must switch from real mode to [protected mode](https://en.wikipedia.org/wiki/Protected_mode), and then to [long mode](https://en.wikipedia.org/wiki/Long_mode), configure the kernel decompressor, and finally decompress the kernel and jump to it. 
Execution of the kernel setup code starts from [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/header.S) at the `_start` symbol:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L246-L256 -->\n```assembly\n_start:\n\t\t# Explicitly enter this as bytes, or the assembler\n\t\t# tries to generate a 3-byte jump here, which causes\n\t\t# everything else to push off to the wrong offset.\n\t\t.byte\t0xeb\t\t# short (2-byte) jump\n\t\t.byte\tstart_of_setup-1f\n1:\n\n\t# Part 2 of the header, from the old setup.S\n\n\t\t.ascii\t\"HdrS\"\t\t# header signature\n```\n\nThe very first instruction we encounter here is [jmp](https://en.wikipedia.org/wiki/JMP_(x86_instruction)) specified by the `0xEB` opcode. The second byte defines the offset to jump to. As described in the [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html):\n\n> The target operand specifies either an absolute offset (that is an offset from the base of the code segment) or a relative offset (a signed displacement relative to the current value of the instruction pointer in the EIP register).\n\nIf you’ve never met the `Nf` syntax before, `1f` means the next label `1` that will appear in the code. Immediately after those two bytes, we can see the label `1` located right before the beginning of the second part of the kernel setup header.\n\nAfter the second part of the kernel setup header, we can see the `.entrytext` section, which starts with the `start_of_setup` label. This is exactly the place where the execution will be continued:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L544-L547 -->\n```assembly\n# End of setup header #####################################################\n\n\t.section \".entrytext\", \"ax\"\nstart_of_setup:\n```\n\nBut from which point are we jumping? 
After the kernel setup code receives control from the bootloader, the first `jmp` instruction is located at the `0x200` bytes offset from the start of the loaded kernel image. This is mentioned in the Linux kernel boot protocol:\n\n> The kernel is started by jumping to the kernel entry point, which is located at *segment* offset 0x20 from the start of the real mode kernel.\n\nThis applies also to the GRUB 2 bootloader. We can see in its [source code](https://github.com/rhboot/grub2/blob/master/grub-core/loader/i386/pc/linux.c):\n\n```C\nsegment = grub_linux_real_target >> 4;\nstate.gs = state.fs = state.es = state.ds = state.ss = segment;\nstate.sp = GRUB_LINUX_SETUP_STACK;\nstate.cs = segment + 0x20;\nstate.ip = 0;\n```\n\nHere, `grub_linux_real_target` is the physical address where the kernel setup code will be loaded. As we saw in the [previous section](#the-magic-power-button---what-happens-next), this address was `0x90000`. Shifting it right by four divides it by `16`, converting a physical address into a segment value - that’s how real mode memory segmentation works.\n\nThen, GRUB sets the code segment specified by the `CS` register to `segment + 0x20` before starting execution. Why `0x20`? Let's remember that in real mode, physical addresses are computed as:\n\n```\nPhysical = (cs << 4) + ip\n```\n\nWith `segment = 0x9000`, setting `cs = 0x9000 + 0x20 = 0x9020` and `ip = 0` gives us:\n\n```\nPhysical = (0x9020 << 4) + 0 = 0x90200\n```\n\nThis means execution starts at physical address `0x90200` which is exactly `512` bytes offset from where the setup code was loaded. 
In other words, this is the offset at which the `jmp` instruction resides in the image.\n\nAfter the jump to the `start_of_setup` label, the kernel setup code enters the very first phase of its real work:\n\n- Unifying the segment registers\n- Establishing a valid stack\n- Clearing the `.bss` section\n- Transitioning into C code\n\nIn the next sections, we’ll walk through each of these steps in detail.\n\n### Aligning the segment registers\n\nReading the Linux kernel boot protocol for `x86_64`, we can see:\n\n> At entry, ds = es = ss should point to the start of the real-mode kernel code...\n\nThis is the first operation we can see after the `start_of_setup` label. First, the kernel setup code ensures that the `ds` and `es` segment registers point to the same address. Next, it clears the [direction flag](https://en.wikipedia.org/wiki/Direction_flag) using the `cld` instruction:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L546-L551 -->\n```assembly\n\t.section \".entrytext\", \"ax\"\nstart_of_setup:\n# Force %es = %ds\n\tmovw\t%ds, %ax\n\tmovw\t%ax, %es\n\tcld\n```\n\nWe need both of these steps to clear the [bss](https://en.wikipedia.org/wiki/.bss) section properly a bit later. From this point we are sure that both the `ds` and `es` segment registers hold the same value - `0x9000`.\n\n### Stack Setup\n\nWe need to prepare the environment for C code. The next step is to set up the stack. Let's take a look at the next lines of the code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L553-L561 -->\n```assembly\n# Apparently some ancient versions of LILO invoked the kernel with %ss != %ds,\n# which happened to work by accident for the old code.  Recalculate the stack\n# pointer if %ss is invalid.  
Otherwise leave it alone, LOADLIN sets up the\n# stack behind its own code, so we can't blindly put it directly past the heap.\n\n\tmovw\t%ss, %dx\n\tcmpw\t%ax, %dx\t# %ds == %ss?\n\tmovw\t%sp, %dx\n\tje\t2f\t\t# -> assume %sp is reasonably set\n```\n\nHere we compare the values of the `ss` and `ds` registers to make sure that they are equal, and fix `ss` otherwise.\n\nAccording to the comment in this code, only old versions of the [LILO](https://en.wikipedia.org/wiki/LILO_(bootloader)) bootloader can set these registers to different values. So we will skip all the \"edge cases\" and consider only a single case when the value of the `ss` register is equal to `ds`. Since the values of these registers are equal, we jump to the `2` label:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L572-L578 -->\n```assembly\n2:\t# Now %dx should point to the end of our stack space\n\tandw\t$~3, %dx\t# dword align (might as well...)\n\tjnz\t3f\n\tmovw\t$0xfffc, %dx\t# Make sure we're not zero\n3:\tmovw\t%ax, %ss\n\tmovzwl\t%dx, %esp\t# Clear upper half of %esp\n\tsti\t\t\t# Now we should have a working stack\n```\n\nAt this point, the `dx` register stores the stack pointer value, which should point to the top of the stack. The value of the stack pointer is `0x9000`. The GRUB 2 bootloader sets it during the loading of the Linux kernel image. The address is defined by:\n\n<!-- https://raw.githubusercontent.com/rhboot/grub2/refs/heads/master/include/grub/i386/linux.h#L34-L34 -->\n```C\n#define GRUB_LINUX_SETUP_STACK\t\t0x9000\n```\n\nAt the next step we align the address down to a four-byte boundary by clearing its two lowest bits. If the result is not zero, we jump to the label `3`. If the result is zero, we set the stack pointer to the `0xFFFC` value. The reason for this is that we cannot have a stack pointer equal to zero, as the stack grows down when something is pushed onto it. The `0xFFFC` value is the highest 4‑byte aligned address below `0x10000`. 
In every other case, we continue to use the aligned value.\n\nFrom this point we have a correct stack that starts at `0x9000:0x9000` and grows down:\n\n![early-stack](./images/early-stack.svg)\n\n### BSS Setup\n\nBefore the kernel's setup code can switch to C code, two final tasks must be done:\n\n- Verify the \"magic\" signature\n- Clear the `.bss` section\n\nThe first is the signature check:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L588-L589 -->\n```assembly\n\tcmpl\t$0x5a5aaa55, setup_sig\n\tjne\tsetup_bad\n```\n\nThis simply compares the [setup_sig](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) constant value placed by the linker with the magic number `0x5A5AAA55`. If they are not equal, the setup code reports a fatal error and stops execution. The main goal of this check is to ensure we are actually running a valid Linux kernel setup binary, loaded into the proper place by the bootloader.\n\nWith the magic number confirmed, and knowing our segment registers and stack are already in the proper state, the only initialization left is to clear the `.bss` section. This section of memory is used to store statically allocated, uninitialized data. Let's take a look at the initialization of this memory area:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L592-L597 -->\n```assembly\n\tmovw\t$__bss_start, %di\n\tmovw\t$_end+3, %cx\n\txorl\t%eax, %eax\n\tsubw\t%di, %cx\n\tshrw\t$2, %cx\n\trep stosl\n```\n\nThe main goal of this code is to clear, or in other words, to fill with zeros the memory area between `__bss_start` and `_end`. To fill this memory area with zeros, the `rep stos` instruction is used. This instruction puts the value of the `eax` register into the destination pointed to by `es:di`. That is why we unified the values of the `ds` and `es` registers at the beginning of the kernel setup code. 
The `rep` prefix specifies the repetition of the `stos` instruction based on the value of the `cx` register.\n\nTo clear this memory area, we first set the borders of this area - from the [__bss_start](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) to `_end + 3`. We add `3` bytes to the `_end` address because we are going to write zeros in double words, meaning four bytes at a time. Adding three bytes ensures that when we later divide by four, any remainder at the end of the memory area is still covered. After we set up the borders of the memory area and fill the `eax` with zero using the `xor` instruction, the `rep stosl` does its job.\n\nThe effect of this code is that zeros are written throughout the memory area from `__bss_start` to `_end`. To know their exact addresses, we can inspect the `setup.elf` file with the [readelf](https://en.wikipedia.org/wiki/Readelf) utility:\n\n```bash\n$ readelf -a arch/x86/boot/setup.elf  | grep bss\n  [12] .bss              NOBITS          00003f00 004efc 001380 00  WA  0   0 32\n   00     .bstext .header .entrytext .inittext .initdata .text .text32 .rodata .videocards .data .signature .bss\n   145: 00005280     0 NOTYPE  GLOBAL DEFAULT   12 __bss_end\n   169: 00003f00     0 NOTYPE  GLOBAL DEFAULT   12 __bss_start\n```\n\nThese are offsets inside the setup segment. Since in our case the kernel image is loaded at physical address `0x90000`, the symbols translate to:\n\n- `__bss_start` - `0x90000 + 0x3f00 = 0x93F00`\n- `__bss_end` - `0x90000 + 0x5280 = 0x95280`\n\nThe following diagram illustrates how the setup image, `.bss`, and the stack region are laid out in memory:\n\n![bss](./images/early-bss.svg)\n\n> [!IMPORTANT]\n> The addresses of the `__bss_start` and `__bss_end` may differ on your machine and depend on the Linux kernel version.\n\nWe can confirm it by running an experiment. 
Add a simple change to the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file, build the kernel with our change, and run the Linux kernel in the qemu virtual machine as we did before in this part. The change is:\n\n```diff\nmodified   arch/x86/boot/main.c\n@@ -11,6 +11,7 @@\n  * Main module for the real-mode kernel code\n  */\n #include <linux/build_bug.h>\n+#include <asm/sections.h>\n\n #include \"boot.h\"\n #include \"string.h\"\n@@ -173,6 +174,8 @@ void main(void)\n\tquery_edd();\n #endif\n\n+        printf(\"BSS start: %p. BSS end: %p\\n\", __bss_start, _end);\n+\n\t/* Set the video mode */\n\tset_video();\n```\n\nIf you did everything correctly, you will see an output similar to:\n\n```\nBSS start: 00003F00. BSS end: 00005280\n```\n\n### Jump to C code\n\nAt this point, we initialized the [stack](#stack-setup) and [.bss](#bss-setup) sections. The last assembly instruction is a jump to C code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/header.S#L600-L600 -->\n```assembly\n\tcalll\tmain\n```\n\nThe `main()` function is located in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file.\n\nWhat's happening there, we will see in the next chapter.\n\n## Conclusion\n\nThis is the end of the first part about Linux kernel insides. If you have questions or suggestions, feel free ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). 
In the next part, we will see the first C code that executes in the Linux kernel setup, the implementation of memory routines such as `memset`, `memcpy`, `earlyprintk`, early console implementation and initialization, and much more.\n\n## Links\n\nHere is the list of the links that you may find useful during reading of this chapter:\n\n- [Intel 80386 programmer's reference manual 1986](http://css.csail.mit.edu/6.858/2014/readings/i386.pdf)\n- [Minimal Boot Loader for Intel® Architecture](https://www.cs.cmu.edu/~410/doc/minimal_boot.pdf)\n- [Minimal Boot Loader in Assembler with comments](https://github.com/Stefan20162016/linux-insides-code/blob/master/bootloader.asm)\n- [8086](https://en.wikipedia.org/wiki/Intel_8086)\n- [80386](https://en.wikipedia.org/wiki/Intel_80386)\n- [Reset vector](https://en.wikipedia.org/wiki/Reset_vector)\n- [Real mode](https://en.wikipedia.org/wiki/Real_mode)\n- [Linux kernel boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.rst)\n- [Ralf Brown's Interrupt List](http://www.ctyme.com/intr/int.htm)\n- [Power supply](https://en.wikipedia.org/wiki/Power_supply)\n- [Power good signal](https://en.wikipedia.org/wiki/Power_good_signal)\n"
  },
  {
    "path": "Booting/linux-bootstrap-2.md",
    "content": "# Kernel booting process - Part 2\n\nWe have already started our journey into the Linux kernel in the previous [part](./linux-bootstrap-1.md), where we walked through the very early stages of the booting process and first assembly instructions of the Linux kernel code. Aside from different mechanisms, this code was responsible for preparing the environment for the [C](https://en.wikipedia.org/wiki/C_(programming_language)) programming language. At the end of the chapter, we reached a symbolic milestone - the very first call of a C function. This function has a classical name - `main` - and is defined in the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file.\n\nFrom here on, we will still see some assembly code on our way, but it will be more and more rare 🤓 Now it is time for more \"high-level\" logic!\n\nFrom the previous part, we know that the kernel setup code is still running in [real mode](https://en.wikipedia.org/wiki/Real_mode). Its primary task is to move the processor first into [protected mode](https://en.wikipedia.org/wiki/Protected_mode), and then into [long mode](https://en.wikipedia.org/wiki/Long_mode). Almost all of the C code we will see in the next chapters exists for this purpose - to prepare and complete these transitions.\n\nIn this part, we’ll keep digging through the kernel’s setup code and cover:\n\n- What protected mode is on x86 processors\n- Setup of early [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) and console\n- Detection of available memory\n- Validation of a CPU \n- Initialization of a keyboard \n\nTime to explore these steps in detail!\n\n## Protected mode\n\nThe Linux kernel for x86_64 operates in a special mode called - [long mode](http://en.wikipedia.org/wiki/Long_mode). One of the main goal of all the setup kernel code is to switch to this mode. 
But before we can move to this mode, the kernel must switch the CPU into [protected mode](https://en.wikipedia.org/wiki/Protected_mode).\n\nWhat is [protected mode](https://en.wikipedia.org/wiki/Protected_mode)? From the previous chapter we already know that the CPU currently operates in [real mode](https://en.wikipedia.org/wiki/Real_mode). For us, this mostly means memory segmentation. As a short reminder - to access a memory location, the combination of two CPU [registers](https://en.wikipedia.org/wiki/Processor_register) is used:\n\n- A segment register - `cs`, `ds`, `ss` or `es` - which defines the segment selector.\n- A general purpose register which specifies the offset within the segment.\n\nThe main motivation for switching from real mode is its memory addressing limitation. As we saw in the previous part, real mode can address only 2<sup>20</sup> bytes. This is just 1 MB of RAM. Obviously, modern software, including an operating system kernel, needs more. To break these constraints, a new processor mode was introduced - `protected mode`.\n\nProtected mode was introduced to the x86 architecture in 1982 and became the primary operating mode of Intel processors, starting with the [80286](http://en.wikipedia.org/wiki/Intel_80286) until the introduction of x86_64 and long mode. This mode brought many changes and improvements, but one of the most crucial was the memory management. The 20-bit address bus was replaced with a 32-bit address bus. It allowed access to 4 Gigabytes of memory in comparison to the 1 Megabyte in real mode.\n\nMemory management in protected mode is divided into two mostly independent mechanisms:\n\n- `Segmentation`\n- `Paging`\n\nFor now, our attention stays on segmentation. We’ll return to paging later, once we enter 64-bit long mode.\n\n### Memory segmentation in protected mode\n\nIn protected mode, memory segmentation is completely redesigned. Fixed 64 KB real mode segments are gone. 
Instead, each segment is now defined by a special data structure called a `Segment Descriptor` which specifies the properties of a memory segment. The segment descriptors are stored in a special structure called the `Global Descriptor Table` or `GDT`. Whenever a CPU needs to find an actual physical memory address, it consults this table. The GDT itself is just a block of memory. Its address is stored in the special CPU register called `gdtr`. This is a 48-bit register and consists of two parts:\n\n- The size of the Global Descriptor Table\n- The address of the Global Descriptor Table\n\nLater, we will see exactly how the Linux kernel builds and loads its GDT. For now, it’s enough to know that the CPU provides a dedicated instruction to load the table’s address into the `gdtr` register:\n\n```assembly\nlgdt gdt\n```\n\nAs mentioned above, the GDT contains `segment descriptors` which describe memory segments. Now let's see what segment descriptors look like. Each descriptor is 64 bits in size. The general scheme of a descriptor is:\n\n![segment-descriptor](./images/segment-descriptor.svg)\n\nDo not worry! I know it may look a little bit intimidating at first glance, especially in comparison to the relatively simple addressing in real mode, but we will go through it in detail. We will start from the bottom, from right to left.\n\nThe first field is `LIMIT 15:0`. It represents the first 16 bits of the segment limit. The second part is located at the bits `51:48`. This field provides information about the size of a segment. With a 20-bit limit field, it may seem that the max size of a memory segment can be 1 MB, but that is not the case. 
In addition, the max size of a segment depends on the `G` bit (bit `55`):\n\n- If `G=0` - the value of the `LIMIT` field is interpreted in bytes.\n- If `G=1` - the value of the `LIMIT` field is interpreted in 4 KB units called pages.\n\nBased on this, we can easily calculate that the max size of a segment is 4 GB.\n\nThe next field is `BASE`. We can see that it is split into three parts. The first part occupies bits from `16` to `31`, the second part occupies bits from `32` to `39`, and the third part occupies bits from `56` to `63`. The main goal of this field is to store the base address of a segment.\n\nThe remaining fields in a segment descriptor represent flags that control different aspects of a segment, such as the type of memory. Let's take a look at the description of these flags:\n\n- `Type` - describes the type of a memory segment.\n- `S` - distinguishes system segments from code and data segments.\n- `DPL` - provides information about the privilege level of a segment. It can be a value from `0` to `3`, where `0` is the level with the highest privileges.\n- `P` - tells the CPU whether a segment is present in memory.\n- `AVL` - available and reserved bits. It is ignored by the Linux kernel.\n- `L` - indicates whether a code segment contains 64-bit code.\n- `D / B` - has a different meaning depending on the type of a segment.\n  - For a code segment: Controls the default operand and address size. If the bit is clear, it is a 16-bit code segment. Otherwise it is a 32-bit code segment.\n  - For a stack segment or in other words a data segment pointed to by the `ss` register: Controls the default stack pointer size. If the bit is clear, it is a 16-bit stack segment and stack operations use the `sp` register. Otherwise it is a 32-bit stack segment and stack operations use the `esp` register.\n  - For an expand-down data segment: Specifies the upper bound of the segment. If the bit is clear, the upper bound is `0xFFFF` or 64 KB. 
Otherwise, it is `0xFFFFFFFF` or 4 GB.\n\nIf the `S` flag of a segment descriptor is set, the descriptor describes either a code or a data segment, otherwise it is a system segment. If the highest order bit of the `Type` flags is clear - this descriptor describes a data segment, otherwise a code segment. The remaining three bits of a data segment descriptor are interpreted as:\n\n- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit.\n- `Write-Enable` - determines whether a segment is writable or read-only.\n- `Expansion-Direction` - determines whether addresses are decreasing from the base address or not.\n\nFor a code segment, these three bits are interpreted as:\n\n- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit.\n- `Read-Enable` - determines whether a segment is execute-only or execute-read.\n- `Conforming` - determines how privilege level changes are handled when transferring execution to that segment.\n\nIn the tables below you can find full information about possible states of the flags for code and data segments.\n\nA data segment `Type` field:\n\n| E (Expand-Down) | W (Writable) | A (Accessed) | Description                       |\n| --------------- | ------------ | ------------ | --------------------------------- |\n| 0               | 0            | 0            | Read-Only                         |\n| 0               | 0            | 1            | Read-Only, accessed               |\n| 0               | 1            | 0            | Read/Write                        |\n| 0               | 1            | 1            | Read/Write, accessed              |\n| 1               | 0            | 0            | Read-Only, expand-down            |\n| 1               | 0            | 1            | Read-Only, expand-down, accessed  |\n| 1               | 1            | 0            | Read/Write, expand-down           |\n| 1               | 1            | 1   
         | Read/Write, expand-down, accessed |\n\nA code segment `Type` field:\n\n| C (Conforming) | R (Readable) | A (Accessed) | Description                        |\n| -------------- | ------------ | ------------ | ---------------------------------- |\n| 0              | 0            | 0            | Execute-Only                       |\n| 0              | 0            | 1            | Execute-Only, accessed             |\n| 0              | 1            | 0            | Execute/Read                       |\n| 0              | 1            | 1            | Execute/Read, accessed             |\n| 1              | 0            | 0            | Execute-Only, conforming           |\n| 1              | 1            | 0            | Execute/Read, conforming           |\n| 1              | 0            | 1            | Execute-Only, conforming, accessed |\n| 1              | 1            | 1            | Execute/Read, conforming, accessed |\n\nSo far, we’ve looked at how a segment descriptor defines the properties of a memory segment — its base, limit, type, and different flags. But how does the CPU actually refer to one of these descriptors during execution? Just like in real mode - using segment registers. In protected mode they contain segment selectors. However, in protected mode, a segment selector is handled differently. 
Each segment descriptor has an associated segment selector which is a 16-bit structure:\n\n![segment-selector](./images/segment-selector.svg)\n\nThe meaning of the fields is:\n\n- `Index` - the entry number of the descriptor in the descriptor table.\n- `TI` - indicates where to search for the descriptor.\n  - If the value of this bit is `0`, the descriptor will be searched for in the Global Descriptor Table.\n  - If the value of this bit is `1`, the descriptor will be searched for in the Local Descriptor Table.\n- `RPL` - the privilege level requested by the selector.\n\nWhen a program running in protected mode references memory, the CPU needs to calculate a proper physical address. The following steps are needed to get a physical address in protected mode:\n\n1. A segment selector is loaded into one of the segment registers.\n2. The CPU tries to find an associated segment descriptor in the Global Descriptor Table based on the `Index` value from the segment selector. If the descriptor is found, it is loaded into a special hidden part of this segment register.\n3. The physical address will be the base address from the segment descriptor plus the offset from the instruction pointer or memory location referenced within an executed instruction.\n\nIn the next part, we will see the transition into protected mode. But before the kernel can be switched to protected mode, we need to do some more preparations.\n\nLet's continue from the point where we stopped in the previous chapter.\n\n## Back to the Kernel: Entering main.c\n\nAs we have already mentioned in the beginning of this chapter, one of the kernel's first main goals is to switch the processor into protected mode. 
But before this can happen, the kernel needs to do some preparations.\n\nIf we look at the very beginning of the `main` function from the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c), the very first thing we will see is a call of the `init_default_io_ops` function.\n\nThis function is defined in the [arch/x86/boot/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/io.h) and looks like:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/io.h#L26-L31 -->\n```C\nstatic inline void init_default_io_ops(void)\n{\n\tpio_ops.f_inb  = __inb;\n\tpio_ops.f_outb = __outb;\n\tpio_ops.f_outw = __outw;\n}\n```\n\nThis function initializes function pointers for:\n\n- reading a byte from an I/O port\n- writing a byte to an I/O port\n- writing a word (16-bit) to an I/O port\n\nThese callbacks will be used to write data to the serial console which will be initialized at one of the next steps. All the operations will be executed with the help of the `inb`, `outb`, and `outw` macros which are defined in the same file:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/io.h#L37-L39 -->\n```C\n#define inb  pio_ops.f_inb\n#define outb pio_ops.f_outb\n#define outw pio_ops.f_outw\n```\n\nThe `__inb`, `__outb`, and `__outw` themselves are inline functions from the [arch/x86/include/asm/shared/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/shared/io.h):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/include/asm/shared/io.h#L7-L24 -->\n```C\n#define BUILDIO(bwl, bw, type)\t\t\t\t\t\t\\\nstatic __always_inline void __out##bwl(type value, u16 port)\t\t\\\n{\t\t\t\t\t\t\t\t\t\\\n\tasm volatile(\"out\" #bwl \" %\" #bw \"0, %w1\"\t\t\t\\\n\t\t     : : \"a\"(value), \"Nd\"(port));\t\t\t\\\n}\t\t\t\t\t\t\t\t\t\\\n\t\t\t\t\t\t\t\t\t\\\nstatic __always_inline type __in##bwl(u16 
port)\t\t\t\t\\\n{\t\t\t\t\t\t\t\t\t\\\n\ttype value;\t\t\t\t\t\t\t\\\n\tasm volatile(\"in\" #bwl \" %w1, %\" #bw \"0\"\t\t\t\\\n\t\t     : \"=a\"(value) : \"Nd\"(port));\t\t\t\\\n\treturn value;\t\t\t\t\t\t\t\\\n}\n\nBUILDIO(b, b, u8)\nBUILDIO(w, w, u16)\nBUILDIO(l,  , u32)\n```\n\nAll of these functions use `in` and `out` assembly instructions which send the given value to the given port or read the value from the given port. If the syntax is not familiar to you, you can read the chapter about [inline assembly](https://github.com/0xAX/linux-insides/blob/master/Theory/linux-theory-3.md).\n\nAfter initialization of callbacks for writing to a serial port, the next step is copying of the kernel setup header filled by a bootloader into the corresponding field of the C `boot_params` structure. This will make the fields from the kernel setup header more easily accessible. All the copying is handled by the `copy_boot_params` function with the help of `memcpy`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L39-L39 -->\n```C\n\tmemcpy(&boot_params.hdr, &hdr, sizeof(hdr));\n```\n\nDo not confuse this `memcpy` with the function from the C standard library - [memcpy](https://man7.org/linux/man-pages/man3/memcpy.3.html). During the time when the kernel is in the early initialization phase, there is no way to load any library. For this reason, an operating system kernel provides its own implementation of such functions. The kernel's `memcpy` is defined in the [copy.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/copy.S). 
If you have already started to miss assembly code, it is high time to bring some back:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/copy.S#L18-L32 -->\n```assembly\nSYM_FUNC_START_NOALIGN(memcpy)\n\tpushw\t%si\n\tpushw\t%di\n\tmovw\t%ax, %di\n\tmovw\t%dx, %si\n\tpushw\t%cx\n\tshrw\t$2, %cx\n\trep movsl\n\tpopw\t%cx\n\tandw\t$3, %cx\n\trep movsb\n\tpopw\t%di\n\tpopw\t%si\n\tretl\nSYM_FUNC_END(memcpy)\n```\n\nFirst of all, we can see that `memcpy` and other routines which are defined there, start and end with the two macros - `SYM_FUNC_START_NOALIGN` and `SYM_FUNC_END`. The `SYM_FUNC_START_NOALIGN` just specifies the given symbol name as [.globl](https://sourceware.org/binutils/docs/as.html#Global) to make it visible for other functions. The `SYM_FUNC_END` just expands to an empty string in our case.\n\nAlthough this function is written in assembly language, the implementation of `memcpy` is relatively simple. At first, it pushes values from the `si` and `di` registers to the stack to preserve their values because they will change during the `memcpy` execution. At the next step we may see handling of the function's parameters. The parameters of this function are passed through the `ax`, `dx`, and `cx` registers. This is because the kernel setup code is built with `-mregparm=3` option. So:\n\n- `ax` will contain the address of `boot_params.hdr`\n- `dx` will contain the address of `hdr`\n- `cx` will contain the size of `hdr` in bytes\n\nThe `rep movsl` instruction copies bytes from the memory pointed by the `si` register to the memory location pointed by the `di` register. At each iteration 4 bytes are copied. For this reason we divided the size of the setup header by 4 using `shrw` instruction. 
After this step we just copy the remaining bytes, that is, the remainder of the size divided by 4.\n\nFrom this point, the setup header is copied into a proper place and we can move forward.\n\n### Console initialization\n\nAs soon as the kernel setup header is copied into the `boot_params.hdr`, the next step is to initialize the serial console by calling the `console_init` function. Very soon we will be able to print something from within the kernel code!\n\nThe `console_init` function is defined in [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c). At the very first step it tries to find the `earlyprintk` option in the kernel's command line. If the search was successful, it parses the port address and [baud rate](https://en.wikipedia.org/wiki/Baud) and executes the initialization of the serial port.\n\n> [!NOTE]\n> If you want to know what other options you can pass in the kernel command line, you can find more information in [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst) document.\n\nLet's take a look at these two steps in detail.\n\nThe possible values of the `earlyprintk` command line option are:\n\n- `serial,0x3f8,115200`\n- `serial,ttyS0,115200`\n- `ttyS0,115200`\n\nThese parameters define the name of a serial port, the port number, and the [baud](https://en.wikipedia.org/wiki/Baud) rate.\n\nThe pointer to the kernel command line is stored in the kernel setup header that was copied in the previous section. The kernel setup code accesses it using `boot_params.hdr.cmd_line_ptr`. The `parse_earlyprintk` function tries to find the `earlyprintk` option in the kernel command line, parse it, and initialize the serial console with the given parameters. If the `earlyprintk` option is given and contains valid values, the initialization of the serial console takes place in the `early_serial_init` function. 
There is nothing specific to the Linux kernel in the initialization of a serial console, so we will skip this part. If you want to dive deeper, you can find more information [here](https://wiki.osdev.org/Serial_Ports#Port_Addresses) and study [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c) step by step.\n\nAfter the serial port initialization we can see the first output:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L142-L143 -->\n```C\n\tif (cmdline_find_option_bool(\"debug\"))\n\t\tputs(\"early console in setup code\\n\");\n```\n\nThe `puts` function uses the `inb` function that we have seen above during initialization of I/O callbacks.\n\nFrom this point we can print messages from the kernel setup code 🎉. Time to move to the next step.\n\n### Heap initialization\n\nWe have seen the initialization of the `stack` and `bss` memory areas in the previous chapter. The next step is to initialize the [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) memory area. The heap initialization takes place in the `init_heap` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L118-L131 -->\n```C\nstatic void init_heap(void)\n{\n\tchar *stack_end;\n\n\tif (boot_params.hdr.loadflags & CAN_USE_HEAP) {\n\t\tstack_end = (char *) (current_stack_pointer - STACK_SIZE);\n\t\theap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200);\n\t\tif (heap_end > stack_end)\n\t\t\theap_end = stack_end;\n\t} else {\n\t\t/* Boot protocol 2.00 only, no heap available */\n\t\tputs(\"WARNING: Ancient bootloader, some functionality may be limited!\\n\");\n\t}\n}\n```\n\nFirst of all, `init_heap` checks the `CAN_USE_HEAP` flag from the kernel setup header. 
We can find information about this flag in the kernel boot protocol:\n\n>   Bit 7 (write): CAN_USE_HEAP\n>\n>\tSet this bit to 1 to indicate that the value entered in the\n>\theap_end_ptr is valid.  If this field is clear, some setup code\n>\tfunctionality will be disabled.\n\nIf this bit is not set, we'll see the warning message. Otherwise, the heap memory area is initialized. The beginning of the heap is defined by the `HEAP` pointer, which points to the end of the kernel setup image:\n\n```C\nchar *HEAP = _end;\n```\n\nNow we need to initialize the size of the heap. There is another small hint in the Linux kernel boot protocol:\n\n> ============\t==================\n> Field name:\theap_end_ptr\n> Type:\t\twrite (obligatory)\n> Offset/size:\t0x224/2\n> Protocol:\t2.01+\n> ============\t==================\n>\n>  Set this field to the offset (from the beginning of the real-mode\n>  code) of the end of the setup stack/heap, minus 0x0200.\n\nThe GRUB bootloader sets this value to:\n\n```C\n#define GRUB_LINUX_HEAP_END_OFFSET\t(0x9000 - 0x200)\n```\n\nBased on these values, the end of the heap pointed to by `heap_end` will be at the `0x9000` offset from the beginning of the kernel setup (real-mode) code. To avoid the case when the heap and stack overlap, there is an additional check. It sets the end of the heap equal to the end of the stack if the first one is greater than the second. Having this, the heap memory area will be located above the `bss` area till the stack. So, the memory map will look like:\n\n![early-heap](./images/early-heap.svg)\n\nNow the heap is initialized, although we will see the usage of it in the next chapters.\n\n### CPU validation\n\nThe next step is the validation of the CPU on which the kernel is running. The kernel has to do it to make sure that all the required functionality will work correctly on the given CPU.\n\nThe `validate_cpu` function from [arch/x86/boot/cpu.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpu.c) validates the CPU. 
This function calls [`check_cpu`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpucheck.c), which checks the CPU model and its flags using the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction. It checks CPU flags such as the presence of [long mode](http://en.wikipedia.org/wiki/Long_mode), checks the processor's vendor, and makes preparations for certain vendors, such as turning on extensions like [SSE+SSE2](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/cpu.c#L60-L73 -->\n```C\nint validate_cpu(void)\n{\n\tu32 *err_flags;\n\tint cpu_level, req_level;\n\n\tcheck_cpu(&cpu_level, &req_level, &err_flags);\n\n\tif (cpu_level < req_level) {\n\t\tprintf(\"This kernel requires an %s CPU, \",\n\t\t       cpu_name(req_level));\n\t\tprintf(\"but only detected an %s CPU.\\n\",\n\t\t       cpu_name(cpu_level));\n\t\treturn -1;\n\t}\n```\n\nIf the level of CPU is less than the required level specified by the `CONFIG_X86_MINIMUM_CPU_FAMILY` kernel configuration option, the function returns an error and the kernel setup process is aborted.\n\n### Memory detection\n\nAfter the kernel has made sure that the CPU which it is running on is suitable, the next stage is to detect available memory in the system. This task is handled by the `detect_memory` function, which queries the system firmware to obtain a map of physical memory regions. To do this, the kernel uses the special BIOS service - `0xE820`, but the kernel can fall back to legacy BIOS services like `0xE801` or `0x88`. 
In this chapter, we will see only the implementation of the `0xE820` interface.\n\nThe `detect_memory` function is defined in the [arch/x86/boot/memory.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/memory.c) and, as just mentioned, tries to get the information about available memory:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/memory.c#L116-L123 -->\n```C\nvoid detect_memory(void)\n{\n\tdetect_memory_e820();\n\n\tdetect_memory_e801();\n\n\tdetect_memory_88();\n}\n```\n\nLet's look at the crucial part of the implementation of the `detect_memory_e820` function. First of all, the `detect_memory_e820` function initializes the `biosregs` structure with the special values related to the `0xE820` BIOS interface:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/memory.c#L25-L29 -->\n```C\n\tinitregs(&ireg);\n\tireg.ax  = 0xe820;\n\tireg.cx  = sizeof(buf);\n\tireg.edx = SMAP;\n\tireg.di  = (size_t)&buf;\n```\n\n- `ax` register contains the number of the BIOS service\n- `cx` register contains the size of the buffer which will contain the data about available memory\n- `di` register contains the address of the buffer which will contain memory data\n- `edx` register contains the `SMAP` magic number\n\nAfter registers are filled with the needed values, the kernel can ask the `0xE820` BIOS interface about the available memory. To do so, the kernel invokes `0x15` [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call), which returns information about one memory region. The kernel repeats this operation in a loop until it collects information about all available memory regions into the array of `boot_e820_entry` structures. 
This structure contains information about:\n\n- beginning address of the memory region\n- size of the memory region\n- type of the memory region\n\nThe structure is defined in [arch/x86/include/uapi/asm/setup_data.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/uapi/asm/setup_data.h):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/include/uapi/asm/setup_data.h#L45-L49 -->\n```C\nstruct boot_e820_entry {\n\t__u64 addr;\n\t__u64 size;\n\t__u32 type;\n} __attribute__((packed));\n```\n\nAfter the information is collected, the kernel prints a message about the available memory regions. You can find it in the [dmesg](https://en.wikipedia.org/wiki/Dmesg) output:\n\n```\n[    0.000000] e820: BIOS-provided physical RAM map:\n[    0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable\n[    0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved\n[    0.000000] BIOS-e820: [mem 0x00000000000f0000-0x00000000000fffff] reserved\n[    0.000000] BIOS-e820: [mem 0x0000000000100000-0x000000003ffdffff] usable\n[    0.000000] BIOS-e820: [mem 0x000000003ffe0000-0x000000003fffffff] reserved\n[    0.000000] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved\n```\n\n### Keyboard initialization\n\nOnce memory detection is complete, the kernel proceeds with initializing the keyboard using the `keyboard_init`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L64-L76 -->\n```C\nstatic void keyboard_init(void)\n{\n\tstruct biosregs ireg, oreg;\n\n\tinitregs(&ireg);\n\n\tireg.ah = 0x02;\t\t/* Get keyboard status */\n\tintcall(0x16, &ireg, &oreg);\n\tboot_params.kbd_status = oreg.al;\n\n\tireg.ax = 0x0305;\t/* Set keyboard repeat rate */\n\tintcall(0x16, &ireg, NULL);\n}\n```\n\nThis function performs two tasks using [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call) `0x16`:\n\n1. 
Gets the state of a keyboard which contains information about the state of certain modifier keys, for example, whether Caps Lock is active or not.\n2. Sets the keyboard repeat rate which determines how long a key must be held down before it begins repeating.\n\nAfter the BIOS interrupt was executed, the keyboard should be initialized. If you are wondering why we need a working keyboard at such an early stage, the answer is - it can be used during the selection of the video mode. We will see more details in the [next chapter](linux-bootstrap-3.md).\n\n### Gathering system information\n\nAfter we went through the most essential hardware interfaces like CPU, I/O, memory map, keyboard, the next couple of steps are to query the BIOS for additional information about the system. The information which the kernel is going to gather is not strictly required for entering protected mode, but it provides useful details that later parts of the kernel may rely on. \n\nThe following information is going to be collected:\n\n- Information about [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep)\n- Information about [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management)\n- Information about [Enhanced Disk Drive](https://en.wikipedia.org/wiki/INT_13H)\n\nAt this moment we will not dive into details about each of these queries, but will get back to them in the next parts when we will use this information. 
For now, let's just take a short look at these functions:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L163-L174 -->\n```C\n\t/* Query Intel SpeedStep (IST) information */\n\tquery_ist();\n\n\t/* Query APM information */\n#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)\n\tquery_apm_bios();\n#endif\n\n\t/* Query EDD information */\n#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)\n\tquery_edd();\n#endif\n```\n\nThe first one is getting information about the [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep). This information is obtained by calling the `0x15` BIOS interrupt, and the result is stored in the `boot_params` structure. The returned information describes the support of the Intel SpeedStep and settings around it. If it is supported, this information will be passed later by the kernel to the power management subsystems.\n\nThe next one is getting information about the [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management). The logic of this function is pretty similar to the one described above. It uses the same `0x15` BIOS interrupt to obtain information and store it in the `boot_params` structure. The returned information describes the support of the `APM` which was the power management subsystem before [ACPI](https://en.wikipedia.org/wiki/ACPI) started to be a standard.\n\nThe last function gets information about the `Enhanced Disk Drive` from the BIOS. The `0x13` BIOS interrupt is used to obtain this information. The returned information describes the disks and their characteristics like geometry and mapping information.\n\n## Conclusion\n\nThis is the end of the second part about Linux kernel insides. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). 
In the next part, we will continue to deal with the preparations before transitioning into protected mode and the transitioning itself.\n\n## Links\n\nHere is the list of the links that you may find useful during reading of this chapter:\n\n- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)\n- [Long mode](http://en.wikipedia.org/wiki/Long_mode)\n- [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)\n- [Linux serial console](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/serial-console.rst)\n- [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call)\n- [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep)\n- [APM](https://en.wikipedia.org/wiki/Advanced_Power_Management)\n- [EDD](https://en.wikipedia.org/wiki/Enhanced_Disk_Drive)\n- [Previous part](linux-bootstrap-1.md)\n"
  },
  {
    "path": "Booting/linux-bootstrap-3.md",
    "content": "# Kernel booting process. Part 3\n\nIn the previous [part](./linux-bootstrap-2.md), we have seen first pieces of C code that run in the Linux kernel. One of the main goal of this stage is to switch into the [protected mode](https://en.wikipedia.org/wiki/Protected_mode), but before this, we have seen some early setup code which executes early initialization procedures, such as:\n\n- Setup of console to be able to print messages from the kernel's setup code\n- Validation of CPU\n- Detection of available memory\n- Initialization of keyboard\n- Platform information\n\nIn this part, we continue to explore the next steps before transitioning to protected mode.\n\n## Video mode setup\n\nPreviously, we stopped right at the point where the kernel setup code was about to initialize the video mode. \n\nThe setup code is located in [arch/x86/boot/video.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video.c) and implemented by the `set_video` function. Now let's take a look at the implementation of the `set_video` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/video.c#L317-L343 -->\n```C\nvoid set_video(void)\n{\n\tu16 mode = boot_params.hdr.vid_mode;\n\n\tRESET_HEAP();\n\n\tstore_mode_params();\n\tsave_screen();\n\tprobe_cards(0);\n\n\tfor (;;) {\n\t\tif (mode == ASK_VGA)\n\t\t\tmode = mode_menu();\n\n\t\tif (!set_mode(mode))\n\t\t\tbreak;\n\n\t\tprintf(\"Undefined video mode number: %x\\n\", mode);\n\t\tmode = ASK_VGA;\n\t}\n\tboot_params.hdr.vid_mode = mode;\n\tvesa_store_edid();\n\tstore_mode_params();\n\n\tif (do_restore)\n\t\trestore_screen();\n}\n```\n\nIn the next section, let's try to understand what a video mode is and how this function initializes it.\n\n### Video modes\n\nA video mode is a predefined configuration of a screen that tells the video hardware information about:\n\n- resolution\n- color depth\n- text or graphic mode\n\nThe next goal of the kernel is to collect this 
information and initialize a suitable video mode. This allows the kernel to use a special API to print messages on the screen.\n\nThe implementation of the `set_video` function starts by getting the video mode from the `boot_params.hdr` structure:\n\n```C\nu16 mode = boot_params.hdr.vid_mode;\n```\n\n> [!NOTE] \n> Instead of old good standard C data types like `int`, `short`, `unsigned short`, Linux kernel provides own data types for numeric values. Here is the table that will help you to remember them:\n>\n> | Type | char | short | int | long | u8 | u16 | u32 | u64 |\n> |------|------|-------|-----|------|----|-----|-----|-----|\n> | Size |  1   |   2   |  4  |   8  |  1 |  2  |  4  |  8  |\n\nThe initial value of the video mode can be filled by the bootloader. This header field defined in the Linux kernel boot protocol:\n\n```\nOffset\tProto\tName\t\tMeaning\n/Size\n01FA/2\tALL\t    vid_mode\tVideo mode control\n```\n\nInformation about potential values for this field can be also found in the Linux kernel boot protocol document:\n\n```\nvga=<mode>\n\t<mode> here is either an integer (in C notation, either\n\tdecimal, octal, or hexadecimal) or one of the strings\n\t\"normal\" (meaning 0xFFFF), \"ext\" (meaning 0xFFFE) or \"ask\"\n\t(meaning 0xFFFD). This value should be entered into the\n\tvid_mode field, as it is used by the kernel before the command\n\tline is parsed.\n```\n\nThis tells us that we can add the `vga` option to the kernel's command line. As mentioned in the description above, this option can have different values. For example, it can be an integer number `0xFFFD` or `ask`. If you pass `ask` to `vga`, you see a menu with the possible video modes. 
We can test it using [QEMU](https://www.qemu.org/) virtual machine as we did in the previous chapters:\n\n```bash\nsudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage                \\\n                        -nographic                                           \\\n                        -append \"console=ttyS0 nokaslr vga=ask\"              \\\n                        -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img \n```\n\nIf you did everything correctly, after the kernel is loaded it will ask you to press the `ENTER`. By pressing on it you should see something like this:\n\n```\nBooting from ROM...\nProbing EDD (edd=off to disable)... ok\nPress <ENTER> to see video modes available, <SPACE> to continue, or wait 30 sec\nMode: Resolution:  Type: Mode: Resolution:  Type: Mode: Resolution:  Type: \n0 F00   80x25      VGA   1 F01   80x50      VGA   2 F02   80x43      VGA   \n3 F03   80x28      VGA   4 F05   80x30      VGA   5 F06   80x34      VGA   \n6 F07   80x60      VGA   7 340  320x200x32  VESA  8 341  640x400x32  VESA  \n9 342  640x480x32  VESA  a 343  800x600x32  VESA  b 344 1024x768x32  VESA  \nc 345 1280x1024x32 VESA  d 347 1600x1200x32 VESA  e 34C 1152x864x32  VESA  \nf 377 1280x768x32  VESA  g 37A 1280x800x32  VESA  h 37D 1280x960x32  VESA  \ni 380 1440x900x32  VESA  j 383 1400x1050x32 VESA  k 386 1680x1050x32 VESA  \nl 389 1920x1200x32 VESA  m 38C 2560x1600x32 VESA  n 38F 1280x720x32  VESA  \no 392 1920x1080x32 VESA  p 300  640x400x8   VESA  q 301  640x480x8   VESA  \nr 303  800x600x8   VESA  s 305 1024x768x8   VESA  t 307 1280x1024x8  VESA  \nu 30D  320x200x15  VESA  v 30E  320x200x16  VESA  w 30F  320x200x24  VESA  \nx 310  640x480x15  VESA  y 311  640x480x16  VESA  z 312  640x480x24  VESA  \n  313  800x600x15  VESA    314  800x600x16  VESA    315  800x600x24  VESA  \n  316 1024x768x15  VESA    317 1024x768x16  VESA    318 1024x768x24  VESA  \n  319 1280x1024x15 VESA    31A 1280x1024x16 VESA    31B 1280x1024x24 VESA  \n  31C 1600x1200x8  
VESA    31D 1600x1200x15 VESA    31E 1600x1200x16 VESA  \n  31F 1600x1200x24 VESA    346  320x200x8   VESA    348 1152x864x8   VESA  \n  349 1152x864x15  VESA    34A 1152x864x16  VESA    34B 1152x864x24  VESA  \n  375 1280x768x16  VESA    376 1280x768x24  VESA    378 1280x800x16  VESA  \n  379 1280x800x24  VESA    37B 1280x960x16  VESA    37C 1280x960x24  VESA  \n  37E 1440x900x16  VESA    37F 1440x900x24  VESA    381 1400x1050x16 VESA  \n  382 1400x1050x24 VESA    384 1680x1050x16 VESA    385 1680x1050x24 VESA  \n  387 1920x1200x16 VESA    388 1920x1200x24 VESA    38A 2560x1600x16 VESA  \n  38B 2560x1600x24 VESA    38D 1280x720x16  VESA    38E 1280x720x24  VESA  \n  390 1920x1080x16 VESA    391 1920x1080x24 VESA    393 1600x900x16  VESA  \n  394 1600x900x24  VESA    395 1600x900x32  VESA    396 2560x1440x16 VESA  \n  397 2560x1440x24 VESA    398 2560x1440x32 VESA    399 3840x2160x16 VESA  \n  200   40x25      VESA    201   40x25      VESA    202   80x25      VESA  \n  203   80x25      VESA    207   80x25      VESA    213  320x200x8   VESA  \nEnter a video mode or \"scan\" to scan for additional modes: \n```\n\n### Early heap API\n\nBefore proceeding further to investigate what the `set_video` function does, it will be useful to take a look at the API for the management of the kernel's early heap. \n\nAfter getting the video mode set by the bootloader, we can see resetting the heap value by the `RESET_HEAP` macro. The definition of this macro is in the [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/boot.h):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/boot.h#L174-L174 -->\n```C\n#define RESET_HEAP() ((void *)( HEAP = _end ))\n```\n\nIf you have read [part 2](./linux-bootstrap-2.md#kernel-booting-process-part-2), you should remember the initialization of the heap memory area. 
This memory area starts right after the end of [BSS](https://en.wikipedia.org/wiki/.bss) and lasts till the stack.\n\nThe kernel setup code provides a couple of utility macros and functions for managing the early heap. Let's take a look at some of them, especially at those relevant for this chapter.\n\nThe `RESET_HEAP` macro resets the heap by setting the `HEAP` variable to `_end`, which represents the end of the early setup kernel's image, including the early code, data, and BSS memory areas. By doing this, we set the heap pointer to the very beginning of the heap.\n\nThe next useful macro is:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/boot.h#L184-L185 -->\n```C\n#define GET_HEAP(type, n) \\\n\t((type *)__get_heap(sizeof(type),__alignof__(type),(n)))\n```\n\nThe goal of this macro is to allocate memory on the early heap. This macro calls the `__get_heap` function from the same header file with the following parameters:\n\n- Size of the data type to allocate on the heap\n- Alignment of the allocated memory area\n- Number of items to allocate, specified by the size of the first parameter\n\nThe implementation of `__get_heap` is:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/boot.h#L175-L183 -->\n```C\nstatic inline char *__get_heap(size_t s, size_t a, size_t n)\n{\n\tchar *tmp;\n\n\tHEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1));\n\ttmp = HEAP;\n\tHEAP += s*n;\n\treturn tmp;\n}\n```\n\nLet's try to understand how the `__get_heap` function works. First of all we can see here that `HEAP` pointer is assigned to the [aligned](https://en.wikipedia.org/wiki/Data_structure_alignment) address of the memory. The address is aligned based on the size of data type for which we want to allocate memory. 
After we have got the initial aligned address, we just move the `HEAP` pointer by the requested size.\n\nThe last but not least API of the early heap that we will see is the `heap_free` function which checks the availability of the given size of memory on the heap:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/boot.h#L187-L190 -->\n```C\nstatic inline bool heap_free(size_t n)\n{\n\treturn (int)(heap_end-HEAP) >= (int)n;\n}\n```\n\nAs you may see, the implementation of this function is pretty trivial. It just subtracts the current value of the heap pointer from the address which represents the end of heap memory area. The function returns `true` if there is enough memory for `n` or `false` otherwise.\n\n### Return to the setup of the video mode\n\nSince the kernel initialized the heap and the heap pointer is in the right place, we can move directly to video mode initialization.\n\nThe first step during the process of a video mode initialization is the `store_mode_params` function, which stores currently available video mode parameters in `boot_params.screen_info`. This structure is defined in [include/uapi/linux/screen_info.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/screen_info.h) header file and provides basic information about the screen and video mode:\n\n- The current position of the cursor\n- The BIOS video mode\n- The number of text rows and columns\n\nThe `store_mode_params` function asks the BIOS services about this information and stores it in this structure for later usage.\n\nThe next step is saving the current contents of the screen to the heap by calling the `save_screen` function. 
This function collects all the data that we got in the previous functions (like rows and columns) and stores it in the `saved_screen` structure, which is defined as:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/video.c#L233-L237 -->\n```C\nstatic struct saved_screen {\n\tint x, y;\n\tint curx, cury;\n\tu16 *data;\n} saved;\n```\n\nAfter the contents of the screen is saved, the next step is to collect currently available video modes in the system. This job is done by the `probe_cards` function defined in the [arch/x86/boot/video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). It goes over all `video_cards` and collects the information about them:\n\n```C\nfor (card = video_cards; card < video_cards_end; card++) {\n  /* collecting the number of video modes */\n}\n```\n\nThe `video_cards` is an array defined as:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/video.h#L81-L82 -->\n```C\n#define __videocard struct card_info __section(\".videocards\") __attribute__((used))\nextern struct card_info video_cards[], video_cards_end[];\n```\n\nThe `__videocard` macro allows to define structures which describe video cards and the linker will put them into the `video_cards` array. Example of such structure can be found in the [arch/x86/boot/video-vga.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-vga.c):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/video-vga.c#L282-L286 -->\n```C\nstatic __videocard video_vga = {\n\t.card_name\t= \"VGA\",\n\t.probe\t\t= vga_probe,\n\t.set_mode\t= vga_set_mode,\n};\n```\n\nAfter the `probe_cards` function is executed, we have a set of structures in our `video_cards` array, along with the known number of video modes they support. 
At the next step, the kernel setup code prints a menu with available video modes if the `vid_mode=ask` option was passed to the kernel command line, and sets up the video mode with all the parameters that we collected in the previous steps.\n\nThe video mode is set by the `set_mode` function which is defined in [video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). This function expects one parameter - the video mode identifier. This identifier is set by the bootloader or based on the choice of the video modes menu. The `set_mode` function goes over all available video cards defined in the `video_cards` array, and if the given mode belongs to the given card, the `card->set_mode()` callback is called to set up the video mode.\n\nLet's take a look at the example of setting up the [VGA](https://en.wikipedia.org/wiki/Video_Graphics_Array) video mode:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/video-vga.c#L191-L224 -->\n```C\nstatic int vga_set_mode(struct mode_info *mode)\n{\n\t/* Set the basic mode */\n\tvga_set_basic_mode();\n\n\t/* Override a possibly broken BIOS */\n\tforce_x = mode->x;\n\tforce_y = mode->y;\n\n\tswitch (mode->mode) {\n\tcase VIDEO_80x25:\n\t\tbreak;\n\tcase VIDEO_8POINT:\n\t\tvga_set_8font();\n\t\tbreak;\n\tcase VIDEO_80x43:\n\t\tvga_set_80x43();\n\t\tbreak;\n\tcase VIDEO_80x28:\n\t\tvga_set_14font();\n\t\tbreak;\n\tcase VIDEO_80x30:\n\t\tvga_set_80x30();\n\t\tbreak;\n\tcase VIDEO_80x34:\n\t\tvga_set_80x34();\n\t\tbreak;\n\tcase VIDEO_80x60:\n\t\tvga_set_80x60();\n\t\tbreak;\n\t}\n\n\treturn 0;\n}\n```\n\nThe `vga_set_mode` function is responsible for configuring the VGA display to a specific text mode, based on the settings which we collected in the previous steps. The `vga_set_basic_mode` function resets the VGA hardware into a standard text mode. The next statement sets up the video mode based on the video mode that was selected. 
Most of these functions have very similar implementation based on the `0x10` BIOS interrupt.\n\nAfter this step, the video mode is configured and we save all the information about it again for later use. Having done this, the video mode setup is complete and now we can take a look at the last preparation before we will see the switch into the protected mode.\n\n## Last preparation before transition into protected mode\n\nReturning to the [`main`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) function of the early kernel setup code, we finally can see:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/main.c#L179-L180 -->\n```C\n\t/* Do the last things and invoke protected mode */\n\tgo_to_protected_mode();\n```\n\nAs the comment says: `Do the last things and invoke protected mode`, so let's see what these last things are and switch into protected mode.\n\nThe `go_to_protected_mode` function is defined in [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pm.c). It contains routines that make the final preparations before we jump into protected mode, so let's look at it and try to understand what it does and how it works.\n\nThe very first function that we can see in `go_to_protected_mode` is the `realmode_switch_hook` function. This function invokes the real mode switch hook if it is present, or disables [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt) otherwise. The hooks are used if the bootloader runs in a hostile environment. You can read more about hooks in the [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) (see **ADVANCED BOOT LOADER HOOKS**). Interrupts must be disabled before switching to protected mode because otherwise the CPU could receive an interrupt when there is no valid interrupt table or handlers. 
Once the kernel sets up the protected-mode interrupt infrastructure, interrupts are enabled again.\n\nWe will consider only a standard use case, when the bootloader does not provide any hooks. In this case, we just disable non-maskable interrupts:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L28-L30 -->\n```C\n\t\tasm volatile(\"cli\");\n\t\toutb(0x80, 0x70); /* Disable NMI */\n\t\tio_delay();\n```\n\nAn interrupt is a signal to the CPU that is emitted by hardware or software. After getting such a signal, the CPU suspends the current instruction sequence, saves its state, and transfers control to the interrupt handler. After the interrupt handler has finished its work, it transfers control back to the interrupted instruction. Non-maskable interrupts (NMI) are interrupts that are always processed, independently of permission. They cannot be ignored and are typically used to signal non-recoverable hardware errors. We will not dive into the details of interrupts now, but we will discuss them in the next parts.\n\nAt the first line, there is an [inline assembly](../Theory/linux-theory-3.md) statement with the `cli` instruction, which clears the [interrupt flag](https://en.wikipedia.org/wiki/Interrupt_flag). After this, external interrupts are disabled. The next line disables NMI (non-maskable interrupt).\n\nLet's get back to the code. In the second line, we write the byte `0x80` to the port `0x70`. Port `0x70` is the CMOS/RTC index register, and setting its highest bit masks NMI. After that, a call to the `io_delay` function occurs. `io_delay` causes a little delay and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/boot.h#L39-L43 -->\n```C\nstatic inline void io_delay(void)\n{\n\tconst u16 DELAY_PORT = 0x80;\n\toutb(0, DELAY_PORT);\n}\n```\n\nWriting any byte to port `0x80` introduces a delay of 1 microsecond. This delay ensures that the change to the NMI mask has fully taken effect. 
After this delay, all interrupts are disabled.\n\nThe next step is the `enable_a20` function, which enables the [A20 line](http://en.wikipedia.org/wiki/A20_line). Enabling this line allows the kernel to have access to more than 1 megabyte of memory.\n\nThe `enable_a20` function is defined in [arch/x86/boot/a20.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/a20.c). It enables the `A20` gate using different approaches. The first is the `a20_test_short` function, which checks if `A20` is already enabled using the `a20_test` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/a20.c#L54-L74 -->\n```C\nstatic int a20_test(int loops)\n{\n\tint ok = 0;\n\tint saved, ctr;\n\n\tset_fs(0x0000);\n\tset_gs(0xffff);\n\n\tsaved = ctr = rdfs32(A20_TEST_ADDR);\n\n\twhile (loops--) {\n\t\twrfs32(++ctr, A20_TEST_ADDR);\n\t\tio_delay();\t/* Serialize and make delay constant */\n\t\tok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr;\n\t\tif (ok)\n\t\t\tbreak;\n\t}\n\n\twrfs32(saved, A20_TEST_ADDR);\n\treturn ok;\n}\n```\n\nTo verify whether the `A20` line is already enabled or not, the kernel performs a simple memory test. It begins by setting the `FS` register to `0x0000` and the `GS` register to `0xffff` values. By doing this, an access to `FS:0x200` (`A20_TEST_ADDR`) points into the very beginning of memory, while an access to `GS:0x210` refers to a location just past the one-megabyte boundary. If the `A20` line is disabled, the latter will wrap around and point to the same physical address.\n\nIf the `A20` gate is disabled, the kernel will try to enable it using different methods which you can find in the `enable_a20` function. For example, it can be done with a call to the `0x15` BIOS interrupt with the `AX` register set to `0x2401`. 
If the `enable_a20` function fails, the kernel prints an error message and calls the `die` function, which stops the kernel setup process.\n\nAfter the `A20` gate is successfully enabled, the `reset_coprocessor` function is called:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L48-L54 -->\n```C\nstatic void reset_coprocessor(void)\n{\n\toutb(0, 0xf0);\n\tio_delay();\n\toutb(0, 0xf1);\n\tio_delay();\n}\n```\n\nThis function resets the [math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit) to ensure it is in a clean state before switching to protected mode. The reset is performed by writing `0` to port `0xF0`, followed by writing `0` to port `0xF1`.\n\nThe next step is the `mask_all_interrupts` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L37-L43 -->\n```C\nstatic void mask_all_interrupts(void)\n{\n\toutb(0xff, 0xa1);\t/* Mask all interrupts on the secondary PIC */\n\tio_delay();\n\toutb(0xfb, 0x21);\t/* Mask all but cascade on the primary PIC */\n\tio_delay();\n}\n```\n\nThis function masks, or in other words forbids, all interrupts on the primary and secondary [PICs](https://en.wikipedia.org/wiki/Programmable_interrupt_controller). This is needed for safety: we forbid all the interrupts from the `PIC` so nothing can interrupt the CPU while the kernel is doing the transition into protected mode.\n\nAll the operations before this point were executed for a safe transition to the protected mode. The next operations will prepare the transition to the protected mode. Let's take a look at them.\n\n## Entering Protected Mode\n\nAt this point, we are very close to seeing the switch of the Linux kernel into protected mode. \n\nOnly two last steps remain:\n\n- Setting up the Interrupt Descriptor Table\n- Setting up the Global Descriptor Table\n\nAnd that’s all! 
Once these two structures are configured, the Linux kernel can make the jump into protected mode.\n\n### Set up the Interrupt Descriptor Table\n\nBefore the CPU can safely enter protected mode, it needs to know where to find the handlers that are triggered in the case of [interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt). In real mode, the CPU relies on the [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table). In protected mode, this mechanism changes to the Interrupt Descriptor Table.\n\nThe Interrupt Descriptor Table is a special structure located in memory that contains descriptors. This structure describes where the CPU can find handlers for interrupts and exceptions. We will see the full description of the Interrupt Descriptor Table and its entries later, because for now, we have disabled all interrupts in the previous steps. Let's take a look at the function that sets up a zero-filled Interrupt Descriptor Table:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L94-L98 -->\n```C\nstatic void setup_idt(void)\n{\n\tstatic const struct gdt_ptr null_idt = {0, 0};\n\tasm volatile(\"lidtl %0\" : : \"m\" (null_idt));\n}\n```\n\nAs we can see, it just loads the IDT (which is filled with zeros) using the `lidtl` instruction. The `null_idt` has type `gdt_ptr`, which is a structure defined in the same [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pm.c) file:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L60-L63 -->\n```C\nstruct gdt_ptr {\n\tu16 len;\n\tu32 ptr;\n} __attribute__((packed));\n```\n\nThis structure provides information about the pointer to the Interrupt Descriptor Table.\n\n### Set up Global Descriptor Table\n\nNext, we set up the Global Descriptor Table. As you may remember, the memory access is based on the `segment:offset` addressing in real mode. 
The protected mode introduces a different model based on the `Global Descriptor Table`. If you forgot the details about the Global Descriptor Table structure, you can find more information in the [previous chapter](./linux-bootstrap-2.md#protected-mode).\n\nInstead of fixed segment bases and limits, the CPU now looks for memory regions defined by descriptors located in the Global Descriptor Table. The goal of the kernel is to set up these descriptors.\n\nAll the work is done by the `setup_gdt` function, which is defined in the same source code file. Let's take a look at the definition of this function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pm.c#L65-L89 -->\n```C\nstatic void setup_gdt(void)\n{\n\t/* There are machines which are known to not boot with the GDT\n\t   being 8-byte unaligned.  Intel recommends 16 byte alignment. */\n\tstatic const u64 boot_gdt[] __attribute__((aligned(16))) = {\n\t\t/* CS: code, read/execute, 4 GB, base 0 */\n\t\t[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),\n\t\t/* DS: data, read/write, 4 GB, base 0 */\n\t\t[GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff),\n\t\t/* TSS: 32-bit tss, 104 bytes, base 4096 */\n\t\t/* We only have a TSS here to keep Intel VT happy;\n\t\t   we don't actually use it for anything. */\n\t\t[GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103),\n\t};\n\t/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead\n\t   of the gdt_ptr contents.  Thus, make it static so it will\n\t   stay in memory, at least long enough that we switch to the\n\t   proper kernel GDT. */\n\tstatic struct gdt_ptr gdt;\n\n\tgdt.len = sizeof(boot_gdt)-1;\n\tgdt.ptr = (u32)&boot_gdt + (ds() << 4);\n\n\tasm volatile(\"lgdtl %0\" : : \"m\" (gdt));\n}\n```\n\nThe initial memory descriptors are specified by the items of the `boot_gdt` array. 
The `setup_gdt` function just loads the pointer to the Global Descriptor Table filled with these items using the `lgdtl` instruction. Let's take a closer look at the memory descriptors definition.\n\nInitially, three memory descriptors are specified:\n\n- Code segment\n- Data segment\n- Task state segment\n\nWe will skip the description of the task state segment for now, as it was added there (according to the comment) to make [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x)) happy.\n\nThe other two segments correspond to the memory regions used by the kernel code and data sections. Both memory descriptors are defined using the `GDT_ENTRY` macro. This macro itself is defined in [arch/x86/include/asm/segment.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/segment.h) and expects three arguments:\n\n- `flags`\n- `base`\n- `limit`\n\nLet's take a look at the definition of the code memory segment:\n\n```C\n[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),\n```\n\nThe base address of this memory segment is defined as `0` and the limit as `0xFFFFF`. The `DESC_CODE32` value describes the flags of this segment. If we take a look at the flags, we can see that the granularity (bit `G`) of this segment is set to 4 KB units. This means that the segment covers addresses `0x00000000–0xFFFFFFFF`, which is the entire 4 GB linear address space. The same base address and limit are defined for the data segment. This is because the Linux kernel uses the so-called [flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model).\n\nBesides the granularity bit, the `DESC_CODE32` specifies other flags. Among them are the flags that mark the segment as 32-bit, readable, executable, and present in memory. The privilege level is set to the most privileged one (ring `0`), as the kernel needs.\n\nLooking at the documentation of the Global Descriptor Table and its entries, you can check all the initial segments by yourself. 
It is not so hard.\n\n## Transition into protected mode\n\nFinally, we are standing right before it – Interrupts are disabled, and the Interrupt Descriptor Table and Global Descriptor Table are initialized. Now the kernel can execute a jump into protected mode! But despite the good news, we need to return to the assembly again 😅\n\nThe transition to protected mode is defined in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S). Let's take a look at it:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L24-L45 -->\n```assembly\nSYM_FUNC_START_NOALIGN(protected_mode_jump)\n\tmovl\t%edx, %esi\t\t# Pointer to boot_params table\n\n\txorl\t%ebx, %ebx\n\tmovw\t%cs, %bx\n\tshll\t$4, %ebx\n\taddl\t%ebx, 2f\n\tjmp\t1f\t\t\t# Short jump to serialize on 386/486\n1:\n\n\tmovw\t$__BOOT_DS, %cx\n\tmovw\t$__BOOT_TSS, %di\n\n\tmovl\t%cr0, %edx\n\torb\t$X86_CR0_PE, %dl\t# Protected mode\n\tmovl\t%edx, %cr0\n\n\t# Transition to 32-bit mode\n\t.byte\t0x66, 0xea\t\t# ljmpl opcode\n2:\t.long\t.Lin_pm32\t\t# offset\n\t.word\t__BOOT_CS\t\t# segment\nSYM_FUNC_END(protected_mode_jump)\n```\n\nFirst of all, we preserve the address of the `boot_params` structure in the `esi` register since we continue to use parameters that the kernel got during boot in later stages.\n\nAfter this, we compute the physical base address of the current code segment and store it in the `ebx` register. Having it, we add it to the value stored at memory location `2f` so that the jump instruction to the first protected mode code will contain the proper offset.\n\nThe next jump to the label `1` may look quite unexpected. Why does the kernel even need this jump? Right now, the CPU works in real mode. While it is executing the current instruction, it may have already fetched several subsequent instruction bytes into its internal prefetch queue. 
At this moment, all prefetched instructions were fetched under the assumption that the processor is still operating in real mode. If we were to continue executing instructions that were prefetched before the jump to the protected mode, the processor could continue decoding and executing them without fully synchronizing its internal state with the new mode. The jump instruction prevents this.\n\nAt the next steps, we save the segment addresses of the data and task state in general-purpose registers `cx` and `di` and set the `PE` bit in the [control register](https://en.wikipedia.org/wiki/Control_register) `cr0`. From this point, the protected mode is turned on, and we just need to jump into it to set the proper value of the code segment:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L41-L44 -->\n```assembly\n\t# Transition to 32-bit mode\n\t.byte\t0x66, 0xea\t\t# ljmpl opcode\n2:\t.long\t.Lin_pm32\t\t# offset\n\t.word\t__BOOT_CS\t\t# segment\n```\n\nThe kernel is in protected mode now 🥳🥳🥳\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L47-L49 -->\n```assembly\n\t.code32\n\t.section \".text32\",\"ax\"\nSYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32)\n```\n\nLet's look at the first steps taken in the protected mode. First of all we set up the data segment with the data segment address that we preserved in the `cx` register at the previous step:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L50-L55 -->\n```assembly\n\t# Set up data segments for flat 32-bit mode\n\tmovl\t%ecx, %ds\n\tmovl\t%ecx, %es\n\tmovl\t%ecx, %fs\n\tmovl\t%ecx, %gs\n\tmovl\t%ecx, %ss\n```\n\nSince we are in protected mode, our segment bases point to zero. Because of this, the stack pointer will point somewhere below the kernel code, so we need to adjust it to at least its previous state. 
Before the jump, we stored the base address of the code segment in the `ebx` register, so now we can use this value to adjust the stack pointer:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L58-L58 -->\n```assembly\n\taddl\t%ebx, %esp\n```\n\nThe last step before the jump into actual 32-bit entry point is to clear the general purpose registers:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L65-L69 -->\n```assembly\n\txorl\t%ecx, %ecx\n\txorl\t%edx, %edx\n\txorl\t%ebx, %ebx\n\txorl\t%ebp, %ebp\n\txorl\t%edi, %edi\n```\n\nNow everything is ready. The kernel is in the protected mode and we can jump to the next code, address of which was passed in the `eax` register:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L74-L74 -->\n```assembly\n\tjmpl\t*%eax\t\t\t# Jump to the 32-bit entrypoint\n```\n\n## Conclusion\n\nThis is the end of the third part about Linux kernel insides. 
If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n## Links\n\nHere is the list of links that you may find useful while reading this chapter:\n\n- [QEMU](https://www.qemu.org/)\n- [VGA](http://en.wikipedia.org/wiki/Video_Graphics_Array)\n- [VESA BIOS Extensions](http://en.wikipedia.org/wiki/VESA_BIOS_Extensions)\n- [Data structure alignment](http://en.wikipedia.org/wiki/Data_structure_alignment)\n- [Non-maskable interrupt](http://en.wikipedia.org/wiki/Non-maskable_interrupt)\n- [A20](http://en.wikipedia.org/wiki/A20_line)\n- [Math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit)\n- [PIC](https://en.wikipedia.org/wiki/Programmable_interrupt_controller)\n- [Interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt)\n- [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table)\n- [Protected mode](https://en.wikipedia.org/wiki/Protected_mode)\n- [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x))\n- [Flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model)\n- [Previous part](linux-bootstrap-2.md)\n"
  },
  {
    "path": "Booting/linux-bootstrap-4.md",
    "content": "# Kernel booting process. Part 4\n\nIn the previous [part](./linux-bootstrap-3.md), we saw the transition from the [real mode](https://en.wikipedia.org/wiki/Real_mode) into [protected mode](http://en.wikipedia.org/wiki/Protected_mode). At this point, the two crucial things were changed: \n\n- The processor now can address up to four gigabytes of memory\n- The privilege levels were set for the memory access \n\nDespite this, the kernel is still in its early setup mode. There are many different things that the early setup code should prepare before we reach the main kernel's entry point. Right now, the processor operates in protected mode. However, protected mode is not the main mode in which `x86_64` processors should operate – it exists only for backward compatibility. The next crucial step is to switch to the native mode for `x86_64` - [long mode](https://en.wikipedia.org/wiki/Long_mode).\n\nThe main characteristic of this new mode (as with all the earlier modes) is the way it defines the memory model. In real mode, the memory model was relatively simple, and each memory location was formed based on the base address specified in a segment register, plus some offset. In protected mode, the global and local descriptor tables contain descriptors that describe memory areas. All the memory accesses in long mode are based on the new mechanism called [paging](https://en.wikipedia.org/wiki/Memory_paging). 
One of the crucial goals of the kernel setup code before it can switch to the long mode is to set up paging.\n\nIn this chapter, we will see how the kernel switches to long mode in detail.\n\n> [!NOTE]\n> There will be lots of assembly code in this part, so if you are not familiar with that, read another set of my [posts about assembly programming](https://github.com/0xAX/asm).\n\n## The 32-bit kernel entry point location\n\nThe last point where we stopped was the [jump](https://en.wikipedia.org/wiki/Branch_(computer_science)#Implementation) instruction to the kernel's entry point in protected mode. This jump was located in the [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S) and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/pmjump.S#L74-L74 -->\n```assembly\n\tjmpl\t*%eax\t\t\t# Jump to the 32-bit entrypoint\n```\n\nThe `eax` register contains the address of the `32-bit` entry point. What is this address? To answer this question, we can read the [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) document:\n\n> When using bzImage, the protected-mode kernel was relocated to 0x100000\n\nWe can make sure that this is the 32-bit entry point of the Linux kernel by using the [GNU GDB](https://sourceware.org/gdb/) debugger and running the Linux kernel in the [QEMU](https://www.qemu.org/) virtual machine. 
To do this, you can run the following command in one terminal:\n\n```bash\nsudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \\\n                        -nographic                            \\\n                        -append \"console=ttyS0 nokaslr\" -s -S \\\n                        -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img\n```\n\n> [!NOTE]\n> You need to pass your own kernel image and [initrd](https://en.wikipedia.org/wiki/Initial_ramdisk) image to the `-kernel` and `-initrd` command line options.\n\nAfter this, run the GNU GDB debugger in another terminal and pass the following commands:\n\n```\n$ gdb\n(gdb) target remote :1234\n(gdb) hbreak *0x100000\n(gdb) c\nContinuing.\n\nBreakpoint 1, 0x0000000000100000 in ?? ()\n```\n\nAs soon as the debugger stops at the [breakpoint](https://en.wikipedia.org/wiki/Breakpoint), we can inspect registers to be sure that the `eax` register contains `0x100000` - the address of the 32-bit kernel entry point:\n\n```\neax            0x100000\t1048576\necx            0x0\t    0\nedx            0x0\t    0\nebx            0x0\t    0\nesp            0x1ff5c\t0x1ff5c\nebp            0x0\t    0x0\nesi            0x14470\t83056\nedi            0x0\t    0\neip            0x100000\t0x100000\neflags         0x46\t    [ PF ZF ]\n```\n\nFrom the previous part, you may remember:\n\n> First of all, we preserve the address of `boot_params` structure in the `esi` register.\n\nSo the `esi` register has the pointer to the `boot_params`. Let's inspect it to make sure that it is really it. 
For example we can take a look at the command line string that we passed to the virtual machine:\n\n```\n(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr\n0x20000:\t\"console=ttyS0 nokaslr\"\n(gdb) ptype struct boot_params\ntype = struct boot_params {\n    struct screen_info screen_info;\n    struct apm_bios_info apm_bios_info;\n    __u8 _pad2[4];\n    __u64 tboot_addr;\n    struct ist_info ist_info;\n    __u64 acpi_rsdp_addr;\n    __u8 _pad3[8];\n    __u8 hd0_info[16];\n    __u8 hd1_info[16];\n    struct sys_desc_table sys_desc_table;\n    struct olpc_ofw_header olpc_ofw_header;\n    __u32 ext_ramdisk_image;\n    __u32 ext_ramdisk_size;\n    __u32 ext_cmd_line_ptr;\n    __u8 _pad4[112];\n    __u32 cc_blob_address;\n    struct edid_info edid_info;\n    struct efi_info efi_info;\n    __u32 alt_mem_k;\n    __u32 scratch;\n    __u8 e820_entries;\n    __u8 eddbuf_entries;\n    __u8 edd_mbr_sig_buf_entries;\n    __u8 kbd_status;\n    __u8 secure_boot;\n    __u8 _pad5[2];\n    __u8 sentinel;\n    __u8 _pad6[1];\n    struct setup_header hdr;\n    __u8 _pad7[36];\n    __u32 edd_mbr_sig_buffer[16];\n    struct boot_e820_entry e820_table[128];\n    __u8 _pad8[48];\n    struct edd_info eddbuf[6];\n    __u8 _pad9[276];\n}\n(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr\n0x20000:\t\"console=ttyS0 nokaslr\"\n```\n\nWe got it 🎉\n\nNow we know where we are, so let's take a look at the code and proceed with learning of the Linux kernel.\n\n## First steps in the protected mode\n\nThe `32-bit` entry point is defined in [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L81-L82 -->\n```assembly\n\t.code32\nSYM_FUNC_START(startup_32)\n```\n\nFirst of all, it is worth knowing why the directory is named `compressed`. 
It's because the kernel is in the [`bzImage`](https://en.wikipedia.org/wiki/Vmlinux#bzImage) file, which is a compressed package that contains the kernel image and kernel setup code. In all previous chapters, we were researching the kernel setup code. The next two big steps, which the kernel's setup code should do before we see the entry point of the kernel itself, are:\n\n- Switch to long mode\n- Decompress the kernel image and jump to its entry point\n\nIn this part, we will focus only on switching to long mode. The kernel image decompression will be covered in the next chapters. Returning to the current kernel code, you can find the following two files in the [arch/x86/boot/compressed](https://github.com/torvalds/linux/tree/master/arch/x86/boot/compressed) directory:\n\n- [head_32.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_32.S)\n- [head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S)\n\nWe will focus only on the `head_64.S` file. Yes, the file name contains the `64` suffix, despite the kernel being in the 32-bit protected mode at the moment. The explanation for this situation is simple. Let's look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/Makefile). We can see the following `make` goal here:\n\n```Makefile\nvmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \\\n\t$(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \\\n\t$(obj)/piggy.o $(obj)/cpuflags.o\n```\n\nThe first line contains the following target - `$(obj)/head_$(BITS).o`. This means that `make` will select the file during the kernel build process based on the `$(BITS)` value. 
This `make` variable is defined in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/Makefile) and its value depends on the kernel's configuration:\n\n```Makefile\nifeq ($(CONFIG_X86_32),y)\n        BITS := 32\n        ...\n        ...\nelse\n        BITS := 64\n        ...\n        ...\nendif\n```\n\nSince we are considering the kernel for the `x86_64` architecture, we assume that the `CONFIG_X86_64` is set to `y`. As a result, the `head_64.S` file will be used during the kernel build process. Let's start to investigate what the kernel does in this file.\n\n### Reload the segments if needed\n\nAs we already know, our start is in [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file. The entry point is defined by the `startup_32` symbol.\n\nAt the beginning of the `startup_32`, we can see the `cld` instruction, which clears the `DF` or [direction flag](https://en.wikipedia.org/wiki/Direction_flag) bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L81-L90 -->\n```assembly\n\t.code32\nSYM_FUNC_START(startup_32)\n\t/*\n\t * 32bit entry is 0 and it is ABI so immutable!\n\t * If we come here directly from a bootloader,\n\t * kernel(text+data+bss+brk) ramdisk, zero_page, command line\n\t * all need to be under the 4G limit.\n\t */\n\tcld\n\tcli\n```\n\nWhen the direction flag is clear, all string or copy-like operations used for copying data, like for example [stos](https://www.felixcloutier.com/x86/stos:stosb:stosw:stosd:stosq) or [scas](https://www.felixcloutier.com/x86/scas:scasb:scasw:scasd), will increment the index registers `esi` or `edi`. 
We need to clear the direction flag because later we will use string operations for tasks such as clearing space for page tables or copying data.\n\nThe next instruction is to disable interrupts - `cli`. We have already seen it in the previous chapter. The interrupts are disabled \"twice\" because modern bootloaders can load the kernel starting from this point, but not only one that we have seen in the [first chapter](./linux-bootstrap-1.md).\n\nAfter these two simple instructions, the next step is to calculate the difference between where the kernel is compiled to run, and where it actually was loaded. If we will take a look at the linker [script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/vmlinux.lds.S), we will see the following definition:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/vmlinux.lds.S#L19-L24 -->\n```linker-script\nSECTIONS\n{\n\t/* Be careful parts of head_64.S assume startup_32 is at\n\t * address 0.\n\t */\n\t. = 0;\n```\n\nThis means that the code in this section is compiled to run at the address zero. We also can see this in the output of `objdump` utility:\n\n```bash\n$ objdump -D /home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux | less\n\n/home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux:     file format elf64-x86-64\n\n\nDisassembly of section .head.text:\n\n0000000000000000 <startup_32>:\n   0:   fc                      cld\n   1:   fa                      cli\n```\n\nWe can see that both the linker script and the `objdump` utility indicate that the address of the `startup_32` function is `0`, but this is not where the kernel was loaded. This is the address that the code was compiled for, also known as the link-time address. Why was it done like that? The answer is – for simplicity. By telling the linker to set the address of the very first symbol to zero, each next symbol becomes a simple offset from 0. 
As we already know, the kernel was loaded at the `0x100000` address. The difference between the address where the kernel was loaded and the address with which the kernel was compiled is called the relocation delta. Once the delta is known, the code can reach any variable or function by adding this delta to their compile-time addresses.\n\nWe know both these addresses based on the experiment above, and as a result, we know the value of the delta. Now let's take a look at how the kernel calculates this difference:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L100-L104 -->\n```assembly\n\tleal\t(BP_scratch+4)(%esi), %esp\n\tcall\t1f\n1:\tpopl\t%ebp\n\tsubl\t$ rva(1b), %ebp\n```\n\nThe `call` instruction is used to get the physical address where the kernel is actually loaded. This trick works because after the `call` instruction is executed, the stack should have the return address on top. This return address will be exactly the address of the label `1`. \n\nIn the code above, the kernel sets up a temporary mini stack where the return address will be stored after the `call` instruction. Right after the call, we pop this address from the stack and save it in the `ebp` register. Using the last instruction, we subtract the difference between the address of the label `1` and the `startup_32` physical address using the `rva` macro and `subl` instruction, and store the result in the `ebp` register.\n\nThe `rva` macro is defined in the same source code file and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L79-L79 -->\n```assembly\n#define rva(X) ((X) - startup_32)\n```\n\nSchematically, it can be represented like this:\n\n![startup_32](./images/startup_32.svg)\n\nStarting from this moment, the `ebp` register contains the physical address of the `startup_32` symbol. 
Next, it will be used to calculate the offset to any other symbols or structures in memory.\n\nThe very first such structure that we need to access is the Global Descriptor Table. To switch to long mode, we need to update the previously loaded Global Descriptor Table with `64-bit` segments:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L106-L109 -->\n```assembly\n\tleal\trva(gdt)(%ebp), %eax\n\tmovl\t%eax, 2(%eax)\n\tlgdt\t(%eax)\n```\n\nKnowing now that the `ebp` register contains the physical address of the beginning of the kernel in protected mode, we calculate the offset to the `gdt` structure using it at the first line of code shown above. In the last two lines, we write this address to the `gdt` structure with offset `2`, and load the new Global Descriptor Table with the `lgdt` instruction.\n\nThe new Global Descriptor Table looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L495-L504 -->\n```assembly\nSYM_DATA_START_LOCAL(gdt)\n\t.word\tgdt_end - gdt - 1\n\t.long\t0\n\t.word\t0\n\t.quad\t0x00cf9a000000ffff\t/* __KERNEL32_CS */\n\t.quad\t0x00af9a000000ffff\t/* __KERNEL_CS */\n\t.quad\t0x00cf92000000ffff\t/* __KERNEL_DS */\n\t.quad\t0x0080890000000000\t/* TS descriptor */\n\t.quad   0x0000000000000000\t/* TS continued */\nSYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)\n```\n\nThe new Global Descriptor table contains five descriptors: \n\n- 32-bit kernel code segment\n- 64-bit kernel code segment\n- 32-bit kernel data segment\n- Task state descriptor\n- Second task state descriptor\n\nWe already saw loading the Global Descriptor Table in the previous [part](./linux-bootstrap-3.md#set-up-global-descriptor-table), and now we're doing almost the same, but we set descriptors to use `CS.L = 1` and `CS.D = 0` for execution in `64` bit mode.\n\nAfter the new Global Descriptor Table is loaded, the next step is to set up the 
stack:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L111-L119 -->\n```assembly\n\tmovl\t$__BOOT_DS, %eax\n\tmovl\t%eax, %ds\n\tmovl\t%eax, %es\n\tmovl\t%eax, %fs\n\tmovl\t%eax, %gs\n\tmovl\t%eax, %ss\n\n\t/* Setup a stack and load CS from current GDT */\n\tleal\trva(boot_stack_end)(%ebp), %esp\n```\n\nIn the previous step, we loaded a new Global Descriptor Table; however, all the segment registers may still have selectors from the old table. If those selectors point to invalid entries in the new Global Descriptor Table, the next memory access can cause a [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault). Setting them to `__BOOT_DS`, which is a well-known descriptor, should fix this potential fault and allow us to set the proper stack pointed by `boot_stack_end`.\n\nThe last action after we loaded the new Global Descriptor Table is to reload the `cs` descriptor:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L121-L125 -->\n```assembly\n\tpushl\t$__KERNEL32_CS\n\tleal\trva(1f)(%ebp), %eax\n\tpushl\t%eax\n\tlretl\n1:\n```\n\nSince, unlike the other segment registers, the `cs` register cannot be loaded using the `mov` instruction, a trick with the `lretl` instruction is used to set the `cs` with the correct value. This instruction fetches two values from the top of the stack, then puts the first value into the `eip` register and the second value into the `cs` register. From this moment, we have a proper kernel code selector and instruction pointer values.\n\nJust a couple of steps separate us from transitioning into the long mode. As mentioned at the beginning of this chapter, one of the most crucial steps is to set up `paging`. 
But before that, the kernel needs to do the last preparations, which we will see in the next sections.\n\n## Last steps before paging setup\n\nAs we mentioned in the previous section, there are a couple of additional steps before we can set up paging and switch to long mode. These steps are:\n\n- Verification of CPU\n- Calculation of the relocation address\n- Enabling `PAE` mode\n\nIn the next sections we will take a look at these steps.\n\n### CPU verification\n\nBefore the kernel can switch to long mode, it checks that it runs on a suitable `x86_64` processor by running this piece of code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L132-L136 -->\n```assembly\n\t/* Make sure cpu supports long mode. */\n\tcall\tverify_cpu\n\ttestl\t%eax, %eax\n\tjnz\t.Lno_longmode\n```\n\nThe `verify_cpu` function is defined in [arch/x86/kernel/verify_cpu.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/verify_cpu.S) and executes the [CPUID](https://en.wikipedia.org/wiki/CPUID) instruction to check the details of the processor on which the kernel is running. In our case, the most crucial check is for long mode and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) support. This function returns the result in the `eax` register. Its value is `0` on success and `1` on failure. If long mode is not supported by the current processor, the kernel jumps to the `.Lno_longmode` label, which stops the CPU with the `hlt` instruction:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L478-L483 -->\n```assembly\n\t.code32\nSYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)\n\t/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */\n1:\n\thlt\n\tjmp     1b\n```\n\nIf everything is ok, the kernel proceeds with its work.\n\n### Calculation of the kernel relocation address\n\nThe next step is to calculate the address for the kernel decompression. 
The kernel image mainly consists of two parts:\n\n- Kernel's setup and decompressor code\n- Chunk of compressed kernel code\n\nWe can see it looking at the [arch/x86/boot/compressed/vmlinux.lds.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/vmlinux.lds.S) linker script:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/vmlinux.lds.S#L19-L39 -->\n```linker-script\nSECTIONS\n{\n\t/* Be careful parts of head_64.S assume startup_32 is at\n\t * address 0.\n\t */\n\t. = 0;\n\t.head.text : {\n\t\t_head = . ;\n\t\tHEAD_TEXT\n\t\t_ehead = . ;\n\t}\n\t.rodata..compressed : {\n\t\t*(.rodata..compressed)\n\t}\n\t.text :\t{\n\t\t_text = .; \t/* Text */\n\t\t*(.text)\n\t\t*(.text.*)\n\t\t*(.noinstr.text)\n\t\t_etext = . ;\n\t}\n```\n\nThere are three sections at the beginning of the linker script above:\n\n- `.head.text` - section where we are now\n- `.rodata..compressed` - section with the compressed kernel image\n- `.text` - section with the decompressor code\n\nThe kernel decompression happens in-place, which is the same place where the compressed kernel is. This means that the parts of the decompressed kernel image will overwrite the parts of the compressed image during the decompression process. It may sound dangerous – if the decompressed part overwrites the decompressor code or the part of the compressed kernel image that is not decompressed yet, this will lead to code or image corruption.\n\nOne way to avoid this problem is to allocate a buffer for the decompressed kernel image and copy the compressed image outside of it. 
But this is not the most effective way in terms of memory consumption, and may not work on devices with not enough memory to hold both kernel images.\n\nThe second way to avoid this problem is to allocate a buffer for the decompressed kernel image, but copy the compressed image to the end of this buffer and leave some room at the beginning of this buffer for the parts of the decompressed kernel. Of course, the kernel decompressor must choose the right parameters, so the pointer to the end of the decompressed part does not move faster than the pointer to the part that is currently compressed.\n\nSchematically, it can be represented like this:\n\n![kernel-relocation](./images/kernel-relocation.svg)\n\nThe buffer for the decompressed kernel starts at the address specified by the `LOAD_PHYSICAL_ADDR` macro, which by default expands to the `0x1000000` address. Since the kernel was loaded below this address (at `0x100000`), the kernel setup code should copy itself, the compressed kernel image, and the decompressor code to this address. In addition, to have some room for the safe in-place decompression, it should calculate a special offset from the beginning of this buffer.\n\nWe can see this calculation in the following code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L146-L161 -->\n```assembly\n#ifdef CONFIG_RELOCATABLE\n\tmovl\t%ebp, %ebx\n\tmovl\tBP_kernel_alignment(%esi), %eax\n\tdecl\t%eax\n\taddl\t%eax, %ebx\n\tnotl\t%eax\n\tandl\t%eax, %ebx\n\tcmpl\t$LOAD_PHYSICAL_ADDR, %ebx\n\tjae\t1f\n#endif\n\tmovl\t$LOAD_PHYSICAL_ADDR, %ebx\n1:\n\n\t/* Target address to relocate to for decompression */\n\taddl\tBP_init_size(%esi), %ebx\n\tsubl\t$ rva(_end), %ebx\n```\n\nAlthough it may look scary, it is not as complex as it may seem. Let's take a closer look at it and try to understand what it does.\n\nThe `ebp` register contains the physical address where the protected-mode kernel was loaded. 
We know that this address is `0x100000`. This address is aligned to the two-megabyte boundary, and the resulting value is compared with `LOAD_PHYSICAL_ADDR`:\n\n- If this value is equal to or greater than `LOAD_PHYSICAL_ADDR`, we leave it as is. \n- Otherwise, we put the value of `LOAD_PHYSICAL_ADDR` (which is `0x1000000`) into the `ebx` register. \n\nAt this moment, we have the pointer to the beginning of the buffer where the kernel image is relocated and decompressed in the `ebx` register.\n\nThe last two lines are the most interesting. Using them, the kernel calculates the offset where to move the compressed kernel image with the decompressor for safe in-place decompression. At first, we add the `BP_init_size` to the `ebx` register. The `BP_init_size` is the maximum value between the size of the uncompressed kernel image code (from `_text` to `_end`) and the size of the kernel setup code + compressed kernel image + decompressor code. At this moment, the `ebx` register points to the end of the decompression buffer. On the last line of the code, we move this pointer back to the new place of the `startup_32` symbol within the decompression buffer.\n\nAs a result, we get something like this:\n\n![kernel-relocation](./images/kernel-relocation-2.svg)\n\nThe decompressor code decompresses the compressed kernel image starting from the beginning of the buffer and gradually overwrites the compressed kernel image. As mentioned above, the size of the gap between the beginning of the decompression buffer and `startup_32` must be safe enough not to overwrite still-compressed parts of the image with the decompressed ones. The calculation of this gap highly depends on the compression method the kernel uses and is encoded in `BP_init_size`. 
Here I will skip all the details about this calculation, but if you are interested, you can find more details in the comment located in the [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/header.S) file.\n\n### Enabling PAE mode\n\nThe next step before the kernel can switch the processor into the long mode is to set up the so-called [`PAE`](https://en.wikipedia.org/wiki/Physical_Address_Extension) mode:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L167-L170 -->\n```assembly\n\t/* Enable PAE mode */\n\tmovl\t%cr4, %eax\n\torl\t$X86_CR4_PAE, %eax\n\tmovl\t%eax, %cr4\n```\n\nKernel does it by setting the `X86_CR4_PAE` bit in the `cr4` [control register](https://en.wikipedia.org/wiki/Control_register). This tells the processor that the page table entries will be enlarged from `32` to `64` bits. We will see this process soon.\n\n## Set up paging\n\nAt this moment, we almost finished the preparations needed to switch the processor into 64-bit long mode. The next crucial step is to build [page tables](https://en.wikipedia.org/wiki/Page_table). But before we take a look at the process of page table setup, let's try to briefly understand what it is.\n\nIn protected mode, each memory access is interpreted through a segment descriptor stored in the Global Descriptor Table. The situation changes significantly in long mode.\n\nIn 64-bit mode, segmentation is disabled. The base and limit fields of most segment descriptors are ignored, and the processor treats the address space as a flat linear range. Of course, code, data, and stack segments still exist, but only formally. The processor still requires valid segment selectors, but they no longer perform address translation in the traditional sense.\n\nInstead, memory translation in long mode relies almost entirely on the mechanism called `paging`.\n\nEach program operates now with addresses that are called `virtual`. 
When a program references a virtual address, the processor interprets the address as a 64-bit linear address and translates it through the multi-level structure called page tables.\n\n> [!NOTE]\n> Modern x86_64 processors support five-level paging, but we will skip it in this post and focus on four-level paging.\n\nLet’s briefly see what happens when the processor needs to translate a virtual address into a physical one.\n\nIn four-level paging mode, a virtual address is 64 bits long. However, only the `48` bits are actually used for translation to a physical address. These `48` bits are divided into several parts:\n\n![early-page-table.svg](./images/early-page-table.svg)\n\nEach group of `9` bits selects an entry in one level of the page-table hierarchy. Since `9` bits can represent `512` values, each page table contains exactly `512` entries. Each entry of a page table occupies `8` bytes, so a single page table fits into one 4-kilobyte page.\n\nWhen the processor translates a virtual address, it performs the following steps:\n\n1. It reads the `cr3` control register to obtain the physical address of the top-level page table called `PML4`.\n2. It extracts bits `47–39` of the virtual address and uses them as an index of the `PML4` page table.\n3. The selected `PML4` entry contains the physical address of the next-level table called `PDPT`.\n4. Bits `38–30` are selected to find an entry in the `PDPT`.\n5. Bits `29–21` are selected to find an entry in the `PD`.\n6. Bits `20–12` select an entry in the `PT`.\n7. Bits `11–0` provide the offset inside the resulting physical page.\n\nIn addition to a physical address of the next-level table, each page table entry contains flags in first `12` bits. 
These flags are:\n\n| Bit   | Name                     | Description                                                                                                                                        |\n|-------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|\n| `P`   | Present                  | Indicates whether the page or page table entry is valid and exists in memory. If cleared, accessing the corresponding address causes a page fault. |\n| `RW`  | Read/Write               | Determines whether write operations are permitted. If cleared, the page is read-only; if set, writes are allowed (subject to privilege rules).     |\n| `US`  | User/Supervisor          | Controls privilege-level access. If cleared, the page is accessible only in supervisor mode. If set, it may also be accessed from user mode.       |\n| `PWT` | Page-Level Write-Through | Controls the caching policy. If set, write-through caching is used; otherwise, write-back caching is typically applied.                            |\n| `PCD` | Page Cache Disable       | Disables caching for the referenced page when set. Commonly used for memory-mapped I/O regions.                                                    |\n| `A`   | Accessed                 | Set automatically by the processor when the page-table entry is used during address translation. Useful for page replacement decisions.            |\n| `D`   | Dirty                    | Set automatically by the processor when a write operation occurs to a mapped page. Indicates that the page has been modified.                      |\n| `PS`  | Page Size                | Determines whether the entry maps a large page (e.g., 2 MiB or 1 GiB) instead of pointing to a lower-level page table.                             |\n| `NX`  | No-Execute               | Prevents instruction execution from the referenced page when set. 
Used to enforce executable/non-executable memory protections.                    |\n       \nYou might wonder how an 8-byte entry can contain both a 64-bit physical address of the next-level page table and flags at the same time. The reason is that each page table is aligned on a four-kilobyte boundary. As a result, the lower 12 bits of its physical address are always zero. These 12 bits are therefore used to store the flags.\n\nNow that we know how the processor translates a virtual address to a physical address using paging, it is time to take a look at the structure of page tables.\n\nA page table in x86_64 is a four-kilobyte memory area that contains 512 entries. Each entry occupies `8` bytes. In four-level paging mode with four-kilobyte pages, four such tables participate in the translation of a virtual address:\n\n| Level | Name   | Description                                                                                                                 |\n|-------|--------|-----------------------------------------------------------------------------------------------------------------------------|\n| 4     | `PML4` | The top-level page table. Each entry points to a Page Directory Pointer Table (`PDPT`).                                     |\n| 3     | `PDPT` | The third-level table. Each entry points to a Page Directory (`PD`) or, if the `PS` bit is set, directly maps a 1 GiB page. |\n| 2     | `PD`   | The second-level table. Each entry points to a Page Table (`PT`) or, if the `PS` bit is set, directly maps a 2 MiB page.    |\n| 1     | `PT`   | The first-level table. Each entry points directly to a 4 KiB physical memory page.                                          |\n\nEach table has the same internal structure. The only difference between them is how their entries are interpreted. As we already know, an entry in a page table is 64 bits wide. 
It contains two types of information:\n\n- A physical address of either the next-level page table or a physical memory page\n- A set of control flags that define access permissions and status information \n\nIf you are interested in this topic, you can find more information about page tables and page table entries structure in the [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html).\n\nNow that we know a little about paging, we can return to the kernel and update our knowledge by looking at the real code. Now we will see how the kernel builds the early page table to switch to long mode. But before we jump directly to the code, we need to remember one important thing. The kernel will be relocated to the address stored in the `ebx` register, as seen above. So, all structures, including the page tables, should be aligned to this address.\n\nThe page table structure for boot is defined in the same source code file and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L531-L533 -->\n```assembly\n\t.section \".pgtable\",\"aw\",@nobits\n\t.balign 4096\nSYM_DATA_LOCAL(pgtable,\t\t.fill BOOT_PGT_SIZE, 1, 0)\n```\n\nThe kernel needs to fill this structure with the proper page table entries for early 64-bit code. First of all, it fills the whole memory area occupied by the page tables with zeros for safety:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L200-L203 -->\n```assembly\n\tleal\trva(pgtable)(%ebx), %edi\n\txorl\t%eax, %eax\n\tmovl\t$(BOOT_INIT_PGT_SIZE/4), %ecx\n\trep\tstosl\n```\n\nAt the beginning, we set the address of the top of the page table to the `edi` register. After this, the kernel fills with zeros the memory area that will be occupied by the page table. 
The boot page table will have the following structure:\n\n- 1 level4 table\n- 1 level3 table\n- 4 level2 tables that map everything with 2M pages\n\nAfter the kernel clears the memory region reserved for the page tables, it starts populating it with entries. At the start, it fills the first and only entry of the top-level page table. The following snippet shows this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L206-L209 -->\n```assembly\n\tleal\trva(pgtable + 0)(%ebx), %edi\n\tleal\t0x1007 (%edi), %eax\n\tmovl\t%eax, 0(%edi)\n\taddl\t%edx, 4(%edi)\n```\n\nIn the code above, the kernel fills the first entry of the top-level page table with the address of the next-level page table, which is located at the `pgtable + 0x1000` address and has `0x7` flags. In our case, the flags `0x7` are:\n\n- Present\n- Read/Write\n- User\n\nIn the next step, the kernel builds four `Page Directory` entries in the `Page Directory Pointer` table with the same `Present+Read/Write/User` flags:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L212-L220 -->\n```assembly\n\tleal\trva(pgtable + 0x1000)(%ebx), %edi\n\tleal\t0x1007(%edi), %eax\n\tmovl\t$4, %ecx\n1:\tmovl\t%eax, 0x00(%edi)\n\taddl\t%edx, 0x04(%edi)\n\taddl\t$0x00001000, %eax\n\taddl\t$8, %edi\n\tdecl\t%ecx\n\tjnz\t1b\n```\n\nIn the code above, we can see the filling of the first four entries of the 3rd-level page table. The first entry of the 3rd level page table is located at the offset `0x1000` from the beginning of the top-level page table. The value of the `eax` register is similar to the 4th-level page table entry, with the difference that now it points to the 2nd-level page table. Next, the kernel fills the four entries of the 3rd-level page table in the \"loop\" until the value of the `ecx` register becomes zero. 
As soon as these page table entries are filled, the kernel proceeds to the next-level page table:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L223-L231 -->\n```assembly\n\tleal\trva(pgtable + 0x2000)(%ebx), %edi\n\tmovl\t$0x00000183, %eax\n\tmovl\t$2048, %ecx\n1:\tmovl\t%eax, 0(%edi)\n\taddl\t%edx, 4(%edi)\n\taddl\t$0x00200000, %eax\n\taddl\t$8, %edi\n\tdecl\t%ecx\n\tjnz\t1b\n```\n\nHere we already fill four page directory tables with `2048` entries in total. The first entry is located at the offset `0x2000` from the beginning of the top-level page table. Each entry maps a two-megabyte chunk of memory with the following flags (`0x183`):\n\n- Present\n- Read/Write\n- Large Page\n- Global\n\nThe two additional flags, `Global` and `Large Page`, tell the processor to keep [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) entry across reload of the value of the `cr3` register and use two-megabyte pages.\n\nThere is no need to populate the lowest-level page tables yet. Every entry in the 2nd-level page directory has the `Large Page` bit set, which means each entry directly maps a two-megabyte region of physical memory. During the address translation, the page-walk procedure stops at the 2nd-level page table, and the lower `21` bits of the virtual address are used as the offset inside that two-megabyte page.\n\nThe page tables are now fully prepared. The last remaining step is to actually enable paging. To do this, the processor must know where the top-level page table resides. As we know, this is done by loading the physical address of the top-level page table into the `cr3` control register:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L234-L235 -->\n```assembly\n\tleal\trva(pgtable)(%ebx), %eax\n\tmovl\t%eax, %cr3\n```\n\nFrom this moment, page tables that cover four gigabytes of memory are ready, and paging is enabled. 
The kernel is ready for transition into the long mode.\n\n## The transition into 64-bit mode\n\nOnly the last steps remain before the Linux kernel can switch the processor into the long mode. The first one is setting the `EFER.LME` flag in the special [model-specific register](http://en.wikipedia.org/wiki/Model-specific_register) located at the predefined address `0xC0000080`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L238-L241 -->\n```assembly\n\tmovl\t$MSR_EFER, %ecx\n\trdmsr\n\tbtsl\t$_EFER_LME, %eax\n\twrmsr\n```\n\nThis is the `Long Mode Enable` bit, and it is mandatory to set this bit to enable long mode.\n\nIn the next step, we can see the preparation for the jump to the long mode entrypoint. To do this jump, the kernel stores the kernel code segment selector along with the address of the long mode entrypoint on the stack:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L264-L266 -->\n```assembly\n\tleal\trva(startup_64)(%ebp), %eax\n\tpushl\t$__KERNEL_CS\n\tpushl\t%eax\n```\n\nSince the stack now contains the kernel code segment selector and the address of the entrypoint, the kernel executes the last instruction in protected mode:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L273-L273 -->\n```assembly\n\tlret\n```\n\nThe CPU extracts the address of `startup_64`, which is the long mode entrypoint from the stack, and jumps there:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L276-L278 -->\n```assembly\n\t.code64\n\t.org 0x200\nSYM_CODE_START(startup_64)\n```\n\nThe Linux kernel is now in 64-bit mode! 🎉\n\n## Conclusion\n\nThis is the end of the fourth part about Linux kernel insides. 
If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n## Links\n\nHere is the list of the links that you may find useful during reading of this chapter:\n\n- [Real mode](https://en.wikipedia.org/wiki/Real_mode)\n- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)\n- [Long mode](https://en.wikipedia.org/wiki/Long_mode)\n- [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt)\n- [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html)\n- [Paging](http://en.wikipedia.org/wiki/Paging)\n- [Virtual addresses](https://en.wikipedia.org/wiki/Virtual_address_space)\n- [Physical addresses](https://en.wikipedia.org/wiki/Physical_address)\n- [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register)\n- [Control registers](https://en.wikipedia.org/wiki/Control_register)\n- [Previous part](linux-bootstrap-3.md)\n"
  },
  {
    "path": "Booting/linux-bootstrap-5.md",
    "content": "# Kernel booting process. Part 5\n\nIn the previous [part](./linux-bootstrap-4.md), we saw the transition from the [protected mode](https://en.wikipedia.org/wiki/Protected_mode) into [long mode](https://en.wikipedia.org/wiki/Long_mode), but what we have in memory is not yet the kernel image ready to run. We are still in the kernel setup code, which should decompress the kernel and pass control to it. The next step before we see the Linux kernel entrypoint is kernel decompression.\n\n## First steps in the long mode\n\nThe point where we stopped in the previous chapter is the [lret](https://www.felixcloutier.com/x86/ret) instruction, which performed \"jump\" to the `64-bit` entry point located in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L276-L278 -->\n```assembly\n\t.code64\n\t.org 0x200\nSYM_CODE_START(startup_64)\n```\n\nThis is the first 64-bit code that we see. Before decompression, the kernel must complete a few final steps. 
These steps are:\n\n- Disabling the interrupts\n- Unification of the segment registers\n- Calculation of the kernel relocation address\n- Reload of the Global Descriptor Table\n- Load of the Interrupt Descriptor Table\n\nAll of this we will see in the next sections.\n\n### Disabling the interrupts\n\nThe `64-bit` entrypoint starts with the same two instructions as the `32-bit` one:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L290-L291 -->\n```assembly\n\tcld\n\tcli\n```\n\nAs we already know from the previous part, the first instruction clears the [direction flag](https://en.wikipedia.org/wiki/Direction_flag) bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register, and the second instruction disables [interrupts](https://en.wikipedia.org/wiki/Interrupt).\n\nJust as the bootloader can load the Linux kernel at the `32-bit` entrypoint instead of the [16-bit entry point](linux-bootstrap-1.md#the-beginning-of-the-kernel-setup-stage), the bootloader can also switch the processor into `64-bit` long mode by itself and load the kernel starting from the `64-bit` entry point. \n\nThe kernel executes these two instructions in case the bootloader didn't perform them before transferring the control to the kernel. 
The `direction flag` ensures that memory copying operations proceed in the correct direction, and disabling interrupts prevents them from disrupting the kernel decompression process.\n\n### Unification of the segment registers\n\nAfter these two instructions are executed, the next step is to unify segment registers:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L294-L299 -->\n```assembly\n\txorl\t%eax, %eax\n\tmovl\t%eax, %ds\n\tmovl\t%eax, %es\n\tmovl\t%eax, %ss\n\tmovl\t%eax, %fs\n\tmovl\t%eax, %gs\n```\n\nSegment registers are not used in long mode, so the kernel resets them to zero.\n\n### Calculation of the kernel relocation address\n\nThe next step is to compute the difference between the location the kernel was compiled to be loaded at and the location where it is actually loaded:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L315-L331 -->\n```assembly\n#ifdef CONFIG_RELOCATABLE\n\tleaq\tstartup_32(%rip) /* - $startup_32 */, %rbp\n\tmovl\tBP_kernel_alignment(%rsi), %eax\n\tdecl\t%eax\n\taddq\t%rax, %rbp\n\tnotq\t%rax\n\tandq\t%rax, %rbp\n\tcmpq\t$LOAD_PHYSICAL_ADDR, %rbp\n\tjae\t1f\n#endif\n\tmovq\t$LOAD_PHYSICAL_ADDR, %rbp\n1:\n\n\t/* Target address to relocate to for decompression */\n\tmovl\tBP_init_size(%rsi), %ebx\n\tsubl\t$ rva(_end), %ebx\n\taddq\t%rbp, %rbx\n```\n\nThis operation is very similar to what we have seen already in the [Calculation of the kernel relocation address](./linux-bootstrap-4.md#calculation-of-the-kernel-relocation-address) section of the previous chapter.\n\n> [!TIP]\n> It is highly recommended to read carefully [Calculation of the kernel relocation address](./linux-bootstrap-4.md#calculation-of-the-kernel-relocation-address) before trying to understand this code.\n\nThis piece of code is almost a 1:1 copy of what we have seen in protected mode. 
If you understood it back then, you shouldn't have any problems understanding it now. The main purpose of this code is to set up the `rbp` and `rbx` registers with the base address where the kernel will be decompressed, and the address where the kernel image with decompressor code should be relocated for safe decompression.\n\nThe only difference with the code from protected mode is that now, the kernel can use `rip` based addressing to get the address of the `startup_32`. So it does not need to do magic tricks with `call` and `popl` instructions like in protected mode. All the rest is just the same as what we already have seen in the previous chapter and is done for a single reason - if the bootloader loaded the kernel directly at the `64-bit` entrypoint, the protected mode code was skipped, so these addresses have not been calculated yet.\n\nAfter these addresses are obtained, the kernel sets up the stack for the decompressor code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L334-L334 -->\n```assembly\n\tleaq\trva(boot_stack_end)(%rbx), %rsp\n```\n\n### Reload of the Global Descriptor Table\n\nThe next step is to set up a new Global Descriptor Table. Yes, one more time 😊 There are at least two reasons to do this:\n\n1. The bootloader can load the Linux kernel starting from the `64-bit` entrypoint, and the kernel needs to set up its own Global Descriptor Table in case the one from the bootloader is not suitable.\n2. 
The kernel might be configured with support for the [5-level](https://en.wikipedia.org/wiki/Intel_5-level_paging) paging, and in this case, the kernel needs to jump to `32-bit` mode again to set it safely.\n\nThe \"new\" Global Descriptor Table has the same entries but is pointed by the `gdt64` symbol:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L489-L493 -->\n```assembly\n\t.data\nSYM_DATA_START_LOCAL(gdt64)\n\t.word\tgdt_end - gdt - 1\n\t.quad   gdt - gdt64\nSYM_DATA_END(gdt64)\n```\n\nThe single difference is that `lgdt` in `64-bit` mode loads `GDTR` register with size `10` bytes. In comparison, in `32-bit`, the size of `GDTR` is `6` bytes. To load the new Global Descriptor Table, the kernel writes its address to the `GDTR` register using the `lgdt` instruction:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L357-L368 -->\n```assembly\n\t/* Make sure we have GDT with 32-bit code segment */\n\tleaq\tgdt64(%rip), %rax\n\taddq\t%rax, 2(%rax)\n\tlgdt\t(%rax)\n\n\t/* Reload CS so IRET returns to a CS actually in the GDT */\n\tpushq\t$__KERNEL_CS\n\tleaq\t.Lon_kernel_cs(%rip), %rax\n\tpushq\t%rax\n\tlretq\n\n.Lon_kernel_cs:\n```\n\n### Load of the Interrupt Descriptor Table\n\nAfter the new Global Descriptor Table is loaded, the next step is to load the new `Interrupt Descriptor Table`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L369-L376 -->\n```assembly\n\t/*\n\t * RSI holds a pointer to a boot_params structure provided by the\n\t * loader, and this needs to be preserved across C function calls. 
So\n\t * move it into a callee saved register.\n\t */\n\tmovq\t%rsi, %r15\n\n\tcall\tload_stage1_idt\n```\n\nThe `load_stage1_idt` function is defined in [arch/x86/boot/compressed/idt_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/idt_64.c) and uses the `lidt` instruction to load the address of the new `Interrupt Descriptor Table`. At this moment, the `Interrupt Descriptor Table` has `NULL` entries to avoid handling the interrupts. As you can remember, the interrupts are disabled at this moment anyway. The valid interrupt handlers will be loaded after kernel relocation.\n\nThe next steps after this are highly related to the setup of `5-level` paging, if it is configured using the `CONFIG_PGTABLE_LEVELS=5` kernel configuration option. This feature extends the virtual address space beyond the traditional 4-level paging scheme, but it is still relatively uncommon in practice and not essential for understanding the mainline boot flow. As mentioned in the [previous chapter](./linux-bootstrap-4.md), for clarity and focus, we’ll set it aside and continue with the standard 4-level paging case.\n\n### Kernel relocation\n\nSince the calculation of the base address for the kernel relocation is done, the kernel setup code can copy the compressed kernel image and the decompressor code to the memory area pointed to by this address:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L419-L425 -->\n```assembly\n\tleaq\t(_bss-8)(%rip), %rsi\n\tleaq\trva(_bss-8)(%rbx), %rdi\n\tmovl\t$(_bss - startup_32), %ecx\n\tshrl\t$3, %ecx\n\tstd\n\trep\tmovsq\n\tcld\n```\n\nThe set of assembly instructions above copies the compressed kernel image and decompressor code to the memory area, which starts at the address pointed to by the `rbx` register. 
The code above copies the memory contents starting from `_bss - 8` down to the `startup_32` symbol, which includes:\n\n- `32-bit` kernel setup code\n- compressed kernel image \n- decompressor code\n\nBecause of the `std` instruction, the copying is performed in the backward order, from higher memory addresses to the lower.\n\nAfter the copying is performed, the kernel needs to reload the previously loaded `Global Descriptor Table` in case it was overwritten or corrupted during the copy procedure:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L432-L435 -->\n```assembly\n\tleaq\trva(gdt64)(%rbx), %rax\n\tleaq\trva(gdt)(%rbx), %rdx\n\tmovq\t%rdx, 2(%rax)\n\tlgdt\t(%rax)\n```\n\nAnd finally, the kernel jumps to the relocated code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L440-L441 -->\n```assembly\n\tleaq\trva(.Lrelocated)(%rbx), %rax\n\tjmp\t*%rax\n```\n\n## The last actions before the kernel decompression\n\nIn the previous section, we saw the kernel relocation. The very first task after this jump is to clear the `.bss` section. This step is needed because the `.bss` section holds all uninitialized global and static variables. By definition, they must be initialized with zeros in `C` code. By clearing it, the kernel ensures that all the following code, including the decompressor, begins with a proper `.bss` memory area without any possible garbage in it.\n\nThe following code does that:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L450-L455 -->\n```assembly\n\txorl\t%eax, %eax\n\tleaq    _bss(%rip), %rdi\n\tleaq    _ebss(%rip), %rcx\n\tsubq\t%rdi, %rcx\n\tshrq\t$3, %rcx\n\trep\tstosq\n```\n\nThe assembly code above should be pretty easy to understand if you read the previous parts. 
It clears the value of the `eax` register and uses its value to fill the memory region of the `.bss` section between the `_bss` and `_ebss` symbols.\n\nIn the next step, the kernel fills the new `Interrupt Descriptor Table` with the call:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L457-L457 -->\n```\n\tcall\tload_stage2_idt\n```\n\nThis function defined in the [arch/x86/boot/compressed/idt_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/idt_64.c) and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/idt_64.c#L59-L78 -->\n```C\nvoid load_stage2_idt(void)\n{\n\tboot_idt_desc.address = (unsigned long)boot_idt;\n\n\tset_idt_entry(X86_TRAP_PF, boot_page_fault);\n\tset_idt_entry(X86_TRAP_NMI, boot_nmi_trap);\n\n#ifdef CONFIG_AMD_MEM_ENCRYPT\n\t/*\n\t * Clear the second stage #VC handler in case guest types\n\t * needing #VC have not been detected.\n\t */\n\tif (sev_status & BIT(1))\n\t\tset_idt_entry(X86_TRAP_VC, boot_stage2_vc);\n\telse\n\t\tset_idt_entry(X86_TRAP_VC, NULL);\n#endif\n\n\tload_boot_idt(&boot_idt_desc);\n}\n```\n\nWe can skip the part of the code wrapped with `CONFIG_AMD_MEM_ENCRYPT` as it is not of main interest for us right now, but try to understand the rest of the function's body. It is similar to the first stage of the `Interrupt Descriptor Table`. It loads the entries of this table using the `lidt` instruction, which we already have seen before. The only single difference is that it sets up two interrupt handlers:\n\n- `PF` - Page fault interrupt handler\n- `NMI` - Non-maskable interrupt handler\n\nThe first interrupt handler is set because the `initialize_identity_maps` function (which we will see very soon) may trigger page fault exception. 
This exception can be triggered, for example, when [Address space layout randomization](https://en.wikipedia.org/wiki/Address_space_layout_randomization) is enabled and random physical and virtual addresses are chosen for which the page tables do not have an entry.\n\nThe second interrupt handler is needed to prevent a triple fault if a non-maskable interrupt arrives during kernel decompression. So at least a dummy NMI handler is needed.\n\nAfter the `Interrupt Descriptor Table` is re-loaded, the `initialize_identity_maps` function is called:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L460-L461 -->\n```assembly\n\tmovq\t%r15, %rdi\n\tcall\tinitialize_identity_maps\n```\n\nThis function is defined in [arch/x86/boot/compressed/ident_map_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/ident_map_64.c) and clears the memory area for the top-level page table identified by the `top_level_pgt` pointer to initialize a new page table. Yes, the kernel needs to initialize page tables one more time, even though we have already seen the initialization and setup of the early page tables in the [previous chapter](./linux-bootstrap-4.md#set-up-paging). The reason for \"one more\" page table is that if the kernel was loaded at the `64-bit` entrypoint, it uses the page table built by the bootloader. Since the kernel was relocated to a new place, the decompressor code can overwrite these page tables during decompression.\n\nThe new page table is built in a very similar way to the [previous page table](./linux-bootstrap-4.md#set-up-paging). Each [virtual address](https://en.wikipedia.org/wiki/Virtual_address_space) directly corresponds to the same [physical address](https://en.wikipedia.org/wiki/Physical_address). That is why it is called the identity mapping.\n\nNow let's take a look at the implementation of this function. 
It starts by initializing an instance of the `x86_mapping_info` structure called `mapping_info`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/ident_map_64.c#L119-L122 -->\n```C\n\tmapping_info.alloc_pgt_page = alloc_pgt_page;\n\tmapping_info.context = &pgt_data;\n\tmapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;\n\tmapping_info.kernpg_flag = _KERNPG_TABLE;\n```\n\nThis structure provides information about memory mappings and a callback to allocate space for page table entries. The `context` field is used for tracking the allocated page tables. The `page_flag` and `kernpg_flag` fields define various page attributes (such as `present`, `writable`, or `executable`), which are reflected in their names.\n\nIn the next step, the kernel reads the address of the top-level page table from the `cr3` [control register](https://en.wikipedia.org/wiki/Control_register) and compares it with the `_pgtable`. If you read the previous chapter, you remember that `_pgtable` is the page table initialized by the early kernel setup code before switching to long mode. If we came from the `startup_32`, and it is exactly our case, the `cr3` register contains the same address as `_pgtable`. 
In this case, the kernel reuses and extends this page table:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/ident_map_64.c#L142-L146 -->\n```C\n\ttop_level_pgt = read_cr3_pa();\n\tif (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {\n\t\tpgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;\n\t\tpgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;\n\t\tmemset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);\n```\n\nOtherwise, the new page table is built:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/ident_map_64.c#L147-L152 -->\n```C\n\t} else {\n\t\tpgt_data.pgt_buf = _pgtable;\n\t\tpgt_data.pgt_buf_size = BOOT_PGT_SIZE;\n\t\tmemset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);\n\t\ttop_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);\n\t}\n```\n\nAt this stage, new identity mappings are added to cover the essential regions needed for the kernel to continue the boot process:\n\n- the kernel image itself (from `_head` to `_end`)\n- the boot parameters provided by the bootloader\n- the kernel command line\n\nAll of the actual work is performed by the `kernel_add_identity_map` function defined in the same [file](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/ident_map_64.c):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/ident_map_64.c#L161-L166 -->\n```C\n\tkernel_add_identity_map((unsigned long)_head, (unsigned long)_end);\n\tboot_params_ptr = rmode;\n\tkernel_add_identity_map((unsigned long)boot_params_ptr,\n\t\t\t\t(unsigned long)(boot_params_ptr + 1));\n\tcmdline = get_cmd_line_ptr();\n\tkernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);\n```\n\nThe `kernel_add_identity_map` function walks the page table hierarchy and ensures that there are page table entries which provide a 1:1 mapping into the virtual address space. 
If such entries do not exist, new entries are allocated with the flags that we have seen during the initialization of the `mapping_info`.\n\nAfter all the identity mapping page table entries were initialized, the kernel updates the `cr3` control register with the address of the top page table:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/ident_map_64.c#L183-L183 -->\n```C\n\twrite_cr3(top_level_pgt);\n```\n\nAt this point, all the preparations needed to decompress the kernel image are done. Now the kernel decompressor code is ready to decompress the kernel:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L466-L475 -->\n```assembly\n\t/* pass struct boot_params pointer and output target address */\n\tmovq\t%r15, %rdi\n\tmovq\t%rbp, %rsi\n\tcall\textract_kernel\t\t/* returns kernel entry point in %rax */\n\n/*\n * Jump to the decompressed kernel.\n */\n\tmovq\t%r15, %rsi\n\tjmp\t*%rax\n```\n\nAfter the kernel is decompressed, the last instructions of the decompressor code transfer control to the Linux kernel by jumping to the address of the kernel's entrypoint. The early setup phase is complete, and the Linux kernel starts its job 🎉\n\nIn the next section, let's see how the kernel decompression works.\n\n## Kernel decompression\n\nRight now, we are finally at the last point before we see the kernel entrypoint. The last remaining step is only to decompress the kernel and switch control to it.\n\nThe kernel decompression is performed by the `extract_kernel` function defined in [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/misc.c). This function starts with the video mode and console initialization that we already saw in the previous parts. 
The kernel needs to do this again because it does not know if the kernel was loaded in the [real mode](https://en.wikipedia.org/wiki/Real_mode) or whether the bootloader used the `32-bit` or `64-bit` boot protocol.\n\nWe will skip all these initialization steps as we already saw them in the previous chapters. After the first initialization steps are done, the decompressor code stores the pointers to the start of the free heap memory and to the end of it:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L458-L459 -->\n```C\n\tfree_mem_ptr     = heap;\t/* Heap */\n\tfree_mem_end_ptr = heap + BOOT_HEAP_SIZE;\n```\n\nThe main reason to set up the heap borders is that the kernel decompressor code uses the heap intensively during decompression.\n\nAfter the initialization of the heap, the kernel calls the `choose_random_location` function from [arch/x86/boot/compressed/kaslr.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr.c). This function chooses the random location in memory to write the decompressed kernel to. This function performs work only if the address randomization is enabled. At this point, we will skip it and move to the next step, as it is not the most crucial point in the kernel decompression. If you are interested in what this function does, you can find more information in the [next chapter](./linux-bootstrap-6.md).\n\nNow let's get back to the `extract_kernel` function. Since we assume that the kernel address randomization is disabled, the address where the kernel image will be decompressed is stored in the `output` parameter without any change. 
The value from this variable is obtained from the `rbp` register as calculated in the previous steps.\n\nThe next action before the kernel is decompressed is to perform the sanitising checks:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L496-L512 -->\n```C\n\tif ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))\n\t\terror(\"Destination physical address inappropriately aligned\");\n\tif (virt_addr & (MIN_KERNEL_ALIGN - 1))\n\t\terror(\"Destination virtual address inappropriately aligned\");\n#ifdef CONFIG_X86_64\n\tif (heap > 0x3fffffffffffUL)\n\t\terror(\"Destination address too large\");\n\tif (virt_addr + needed_size > KERNEL_IMAGE_SIZE)\n\t\terror(\"Destination virtual address is beyond the kernel mapping area\");\n#else\n\tif (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))\n\t\terror(\"Destination address too large\");\n#endif\n#ifndef CONFIG_RELOCATABLE\n\tif (virt_addr != LOAD_PHYSICAL_ADDR)\n\t\terror(\"Destination virtual address changed when not relocatable\");\n#endif\n```\n\nAfter all these checks, we can see the familiar message on the screen of our computers:\n\n```\nDecompressing Linux...\n```\n\nThe kernel setup code starts decompression by calling the `decompress_kernel` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L521-L521 -->\n```C\n\tentry_offset = decompress_kernel(output, virt_addr, error);\n```\n\nThis function performs the following actions:\n\n1. Decompress the kernel\n2. Parse kernel ELF binary\n3. Handle relocations\n\nThe kernel decompression performed by the helper function `__decompress`. 
The implementation of this function depends on what compression algorithm was used to compress the kernel and located in one of the following files:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L63-L89 -->\n```C\n#ifdef CONFIG_KERNEL_GZIP\n#include \"../../../../lib/decompress_inflate.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_BZIP2\n#include \"../../../../lib/decompress_bunzip2.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_LZMA\n#include \"../../../../lib/decompress_unlzma.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_XZ\n#include \"../../../../lib/decompress_unxz.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_LZO\n#include \"../../../../lib/decompress_unlzo.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_LZ4\n#include \"../../../../lib/decompress_unlz4.c\"\n#endif\n\n#ifdef CONFIG_KERNEL_ZSTD\n#include \"../../../../lib/decompress_unzstd.c\"\n#endif\n```\n\nI will not describe here each implementation as this information is rather about compression algorithms rather than something specific to the Linux kernel.\n\nAfter the kernel is decompressed, two more functions are called: `parse_elf` and `handle_relocations`. Let's take a short look at them.\n\nThe kernel binary, which is called `vmlinux` is an [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) executable file. As a result, after decompression we have not just a \"piece\" of code on which we can jump but an ELF file with headers, program segments, debug symbols and other information. 
We can easily make sure in it inspecting the `vmlinux` with `readelf` utility:\n\n```bash\nreadelf -l vmlinux\n\nElf file type is EXEC (Executable file)\nEntry point 0x1000000\nThere are 5 program headers, starting at offset 64\n\nProgram Headers:\n  Type           Offset             VirtAddr           PhysAddr\n                 FileSiz            MemSiz              Flags  Align\n  LOAD           0x0000000000200000 0xffffffff81000000 0x0000000001000000\n                 0x0000000000893000 0x0000000000893000  R E    200000\n  LOAD           0x0000000000a93000 0xffffffff81893000 0x0000000001893000\n                 0x000000000016d000 0x000000000016d000  RW     200000\n  LOAD           0x0000000000c00000 0x0000000000000000 0x0000000001a00000\n                 0x00000000000152d8 0x00000000000152d8  RW     200000\n  LOAD           0x0000000000c16000 0xffffffff81a16000 0x0000000001a16000\n                 0x0000000000138000 0x000000000029b000  RWE    200000\n  ...\n  ...\n  ...\n```\n\nThe `parse_elf` function acts as a minimal [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) loader. It reads the ELF program headers of the decompressed kernel image and uses them to determine which segments must be loaded and where each segment should be placed in physical memory.\n\nAt this point, the `parse_elf` function has completed loading the decompressed kernel image into memory. Each `PT_LOAD` segment has been copied from the ELF file into its proper location. The kernel’s code, data, and other segments are now present at the chosen load address. However, it might not be sufficient to make the kernel fully runnable.\n\nThe kernel was originally linked assuming a specific base address. If the address space layout randomization is enabled, the kernel can instead be loaded at a different physical and virtual address. 
As a result, any absolute addresses embedded within the kernel image will still reflect the original link-time address rather than the actual load address. To resolve this, the kernel image includes a relocation table that identifies all locations containing such absolute references. \n\nThe `handle_relocations` function processes this table and adjusts each affected value by applying the relocation delta, which is the difference between the actual load address and the link-time base address. \n\nOnce the relocations are applied, the decompressor code jumps to the kernel entrypoint. Its address is stored in the `rax` register, as we already have seen above.\n\nNow we are in the kernel 🎉🎉🎉\n\nThe kernel entrypoint is the `startup_64` function from [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S). This is our next stop, but it will be in the next set of chapters - [Kernel initialization process](https://github.com/0xAX/linux-insides/tree/master/Initialization).\n\n## Conclusion\n\nThis is the end of the fifth part about the Linux kernel booting process. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n## Links\n\nHere is the list of the links that you can find useful when reading this chapter:\n\n- [Real mode](https://en.wikipedia.org/wiki/Real_mode)\n- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)\n- [Long mode](https://en.wikipedia.org/wiki/Long_mode)\n- [Flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model)\n- [Address space layout randomization](https://en.wikipedia.org/wiki/Address_space_layout_randomization)\n- [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format)\n- [Previous part](linux-bootstrap-4.md)\n"
  },
  {
    "path": "Booting/linux-bootstrap-6.md",
    "content": "# Kernel booting process. Part 6\n\nIn the [previous part](./linux-bootstrap-5.md), we finally left the setup code and reached the Linux kernel itself. We explored the last steps of the early boot process - from the kernel decompression to the hand-off to the Linux kernel entrypoint (the `startup_64` function). You may think this is the end of the set of posts about the Linux kernel booting process, but I'd like to come back one more time to the early setup code and look at one more important part of it - `KASLR` or Kernel Address Space Layout Randomization.\n\nAs you can remember from the previous parts, the entry point of the Linux kernel is the `startup_64` function defined in [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S). In normal cases, the kernel is loaded at the fixed, well-known address defined by the value of the `CONFIG_PHYSICAL_START` configuration option. The description and the default value of this option are defined in [arch/x86/Kconfig](https://github.com/torvalds/linux/blob/master/arch/x86/Kconfig):\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/Kconfig#L2021-L2025 -->\n```\nconfig PHYSICAL_START\n\thex \"Physical address where the kernel is loaded\" if (EXPERT || CRASH_DUMP)\n\tdefault \"0x1000000\"\n\thelp\n\t  This gives the physical address where the kernel is loaded.\n```\n\nHowever, modern systems rarely stick to predictable memory layouts for security reasons. Knowing the fixed address where the kernel was loaded can make it easier for attackers to guess the location of the kernel structures which can be exploited in various ways. To make such attacks harder, the Linux kernel provides support for [address space layout randomization](https://en.wikipedia.org/wiki/Address_space_layout_randomization) mechanism. \n\nTo enable this mechanism, the `CONFIG_RANDOMIZE_BASE` kernel configuration option should be enabled. 
If this mechanism is enabled, the kernel will not be decompressed and loaded at the given fixed address. Instead, each boot the kernel image will be placed at a different physical address. \n\nIn this part, we will look at how this mechanism works.\n\n## Choose random location for kernel image\n\nBefore we will start to investigate kernel's code, let's remember where we were and what we have seen. \n\nIn the [previous part](linux-bootstrap-5.md), we followed the kernel decompression code and transition to [long mode](https://en.wikipedia.org/wiki/Long_mode). The kernel's decompressor entrypoint is the `extract_kernel` function defined in [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/misc.c). At this point, the kernel image is about to be decompressed into the specific location in memory.\n\nBefore the kernel's decompressor actually begins to decompress the kernel image, it needs to decide where that image should be placed in memory. 
While we were going through the kernel's decompression code in the `extract_kernel`, we skipped the next function call:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L490-L493 -->\n```C\n\tchoose_random_location((unsigned long)input_data, input_len,\n\t\t\t\t(unsigned long *)&output,\n\t\t\t\tneeded_size,\n\t\t\t\t&virt_addr);\n```\n\nThis function is defined in [arch/x86/boot/compressed/kaslr.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr.c) and does nothing if the `nokaslr` option is passed on the kernel command line:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L861-L872 -->\n```C\nvoid choose_random_location(unsigned long input,\n\t\t\t    unsigned long input_size,\n\t\t\t    unsigned long *output,\n\t\t\t    unsigned long output_size,\n\t\t\t    unsigned long *virt_addr)\n{\n\tunsigned long random_addr, min_addr;\n\n\tif (cmdline_find_option_bool(\"nokaslr\")) {\n\t\twarn(\"KASLR disabled: 'nokaslr' on cmdline.\");\n\t\treturn;\n\t}\n```\n\nOtherwise, it selects a randomized address where the kernel image should be decompressed.\n\nAs we can see, this function takes five parameters:\n\n- `input` - beginning address of the compressed kernel image\n- `input_size` - size of the compressed kernel image\n- `output` - physical address where the kernel should be decompressed\n- `output_size` - size of the decompressed kernel image\n- `virt_addr` - virtual address where the kernel should be decompressed\n\nThe `extract_kernel` function receives the `output` parameter from the code that prepares the decompressor:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/head_64.S#L467-L469 -->\n```\n\tmovq\t%r15, %rdi\n\tmovq\t%rbp, %rsi\n\tcall\textract_kernel\t\t/* returns kernel entry point in %rax */\n```\n\nIf you read the previous chapters, you can remember that the 
starting address where the kernel image should be decompressed was calculated and stored in the `rbp` register.\n\nThe source of the values for the `input`, `input_size`, and `output_size` parameters is quite interesting. These values come from a little program called [mkpiggy](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/mkpiggy.c).\n\nIf you've ever tried compiling the Linux kernel yourself, you can find the output generated by this program in the `arch/x86/boot/compressed/piggy.S` assembly file, which contains all the parameters needed for decompression. In my case, this file looks like this:\n\n```assembly\n.section \".rodata..compressed\",\"a\",@progbits\n.globl z_input_len\nz_input_len = 14213122\n.globl z_output_len\nz_output_len = 36564556\n.globl input_data, input_data_end\ninput_data:\n.incbin \"arch/x86/boot/compressed/vmlinux.bin.lz4\"\ninput_data_end:\n.section \".rodata\",\"a\",@progbits\n.globl input_len\ninput_len:\n\t.long 14213122\n.globl output_len\noutput_len:\n\t.long 36564556\n```\n\nAt build time, the kernel's `vmlinux` image is compressed into `vmlinux.bin.{ALGO}` file. 
A small `mkpiggy` program gets the information about the compressed kernel image and generates this assembly file using the following code:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/mkpiggy.c#L52-L67 -->\n```C\n\tprintf(\".section \\\".rodata..compressed\\\",\\\"a\\\",@progbits\\n\");\n\tprintf(\".globl z_input_len\\n\");\n\tprintf(\"z_input_len = %lu\\n\", ilen);\n\tprintf(\".globl z_output_len\\n\");\n\tprintf(\"z_output_len = %lu\\n\", (unsigned long)olen);\n\n\tprintf(\".globl input_data, input_data_end\\n\");\n\tprintf(\"input_data:\\n\");\n\tprintf(\".incbin \\\"%s\\\"\\n\", argv[1]);\n\tprintf(\"input_data_end:\\n\");\n\n\tprintf(\".section \\\".rodata\\\",\\\"a\\\",@progbits\\n\");\n\tprintf(\".globl input_len\\n\");\n\tprintf(\"input_len:\\n\\t.long %lu\\n\", ilen);\n\tprintf(\".globl output_len\\n\");\n\tprintf(\"output_len:\\n\\t.long %lu\\n\", (unsigned long)olen);\n```\n\nThat is where the kernel setup code obtains the values of these parameters.\n\nThe last parameter of the `choose_random_location` function is the virtual base address for the decompressed kernel image. At this point during early boot it is set to the physical load address:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.c#L409-L409 -->\n```C\n\tunsigned long virt_addr = LOAD_PHYSICAL_ADDR;\n```\n\nWhy is a virtual address initialized with the value of the physical address? The answer is simple and can be found in the previous chapters. During decompression, the early boot-time page tables are set up as an identity map. 
In other words, at this early stage, each virtual address is equal to the corresponding physical address.\n\nThe value of `LOAD_PHYSICAL_ADDR` is the aligned value of the `CONFIG_PHYSICAL_START` configuration option, which we already saw at the beginning of this chapter:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/include/asm/page_types.h#L32-L32 -->\n```C\n#define LOAD_PHYSICAL_ADDR\t__ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1)\n```\n\nAt this point, we have examined all the parameters passed to the `choose_random_location` function. Now it is time to look inside the function. \n\nAs mentioned above, the first thing that this function does is check whether KASLR is disabled by the `nokaslr` option on the kernel's command line:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L869-L872 -->\n```C\n\tif (cmdline_find_option_bool(\"nokaslr\")) {\n\t\twarn(\"KASLR disabled: 'nokaslr' on cmdline.\");\n\t\treturn;\n\t}\n```\n\nIf this option is specified in the kernel command line, the function does nothing, and the kernel is decompressed at the fixed address. In this chapter, however, we focus on the case where this option is not provided, as that is the main topic under discussion. If the `nokaslr` option is not present, the function proceeds to find a random location in memory to decompress the kernel.\n\nThe very first step is to set a mark in the boot parameters that ASLR is enabled. 
This is done by setting a specific flag in the kernel’s boot header:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L874-L874 -->\n```C\n\tboot_params_ptr->hdr.loadflags |= KASLR_FLAG;\n```\n\nAfter marking that ASLR is enabled, the next task is to determine the upper memory limit which the system can use:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L876-L879 -->\n```C\n\tif (IS_ENABLED(CONFIG_X86_32))\n\t\tmem_limit = KERNEL_IMAGE_SIZE;\n\telse\n\t\tmem_limit = MAXMEM;\n```\n\nSince we consider only `x86_64` systems, the memory limit is `MAXMEM`, which is a macro defined in [arch/x86/include/asm/pgtable_64_types.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/pgtable_64_types.h):\n\n```C\n#define MAXMEM\t\t\t(1UL << MAX_PHYSMEM_BITS)\n```\n\nwhere `MAX_PHYSMEM_BITS` depends on whether [5-level paging](https://en.wikipedia.org/wiki/Intel_5-level_paging) is enabled or not. We will consider only 4-level paging, so in our case `MAXMEM` will expand to `1 << 46` bytes.\n\nWith the `mem_limit` value set, the decompressor and kernel code responsible for the address randomization will know how far they can safely go when calculating an address for the kernel image. But before a random address for the kernel image can be chosen, the kernel needs to make sure it does not overwrite something important.\n\n### Avoiding reserved memory ranges\n\nThe next step in the randomization process is to build a map of forbidden memory regions to prevent the kernel image from overwriting memory areas that are already in use. These may include, for example, the [initial ramdisk](https://en.wikipedia.org/wiki/Initial_ramdisk) or the kernel command line. 
To gather this information, we use this function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L882-L882 -->\n```C\n\tmem_avoid_init(input, input_size, *output);\n```\n\nIt collects the forbidden memory regions into the `mem_avoid` array, which has `mem_vector` type:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/misc.h#L97-L100 -->\n```C\nstruct mem_vector {\n\tu64 start;\n\tu64 size;\n};\n```\n\nFor this moment, the randomization code tries to avoid the memory regions specified by the `mem_avoid_index`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L86-L94 -->\n```C\nenum mem_avoid_index {\n\tMEM_AVOID_ZO_RANGE = 0,\n\tMEM_AVOID_INITRD,\n\tMEM_AVOID_CMDLINE,\n\tMEM_AVOID_BOOTPARAMS,\n\tMEM_AVOID_MEMMAP_BEGIN,\n\tMEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,\n\tMEM_AVOID_MAX,\n};\n```\n\nLet's look at the implementation of the `mem_avoid_init` function. As we know, the main goal of this function is to store information about reserved memory regions to avoid them when choosing a random address for the kernel image. There are no complex calculations in this function, and most of the reserved memory areas are known, as they are set by the bootloader or were already calculated at the previous steps during kernel setup. 
A typical example of the process of gathering information about the memory reserved regions looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L369-L374 -->\n```C\n\tinitrd_start  = (u64)boot_params_ptr->ext_ramdisk_image << 32;\n\tinitrd_start |= boot_params_ptr->hdr.ramdisk_image;\n\tinitrd_size  = (u64)boot_params_ptr->ext_ramdisk_size << 32;\n\tinitrd_size |= boot_params_ptr->hdr.ramdisk_size;\n\tmem_avoid[MEM_AVOID_INITRD].start = initrd_start;\n\tmem_avoid[MEM_AVOID_INITRD].size = initrd_size;\n```\n\nIn the code above, the start address of the initial ramdisk and its size are stored in the `mem_avoid` array. The same pattern repeats for other important memory areas, for example:\n\n- the setup header \n- the decompressor itself\n- the compressed kernel image\n\nAfter the `mem_avoid_init` function is executed, the decompressor code has a complete picture of the system’s reserved memory zones and avoids them during selecting a random address to load the kernel image.\n\nNow we can return to the `choose_random_location` function and finally see the process of the address randomization.\n\n### Physical address randomization\n\nThe whole process of finding a suitable random address to load the kernel image consists of two parts:\n\n- Find a random physical address\n- Find a random virtual address\n\nYou can remember that at this point, the kernel uses identity-mapped page tables. Having this in mind, you can ask why two different addresses are calculated if there is a `1:1` mapping anyway. The answer is that these two random addresses have different purposes. Physical address determines where the kernel image is loaded in memory. Virtual address determines the kernel's address in the virtual address space. 
Despite the decompressor code now running with identity mapping, all the symbol references in the kernel image are patched during the relocation process with a random virtual address and offset. If it turns out that there is no mapping between the newly chosen physical and virtual addresses in the current page tables, the [page fault](https://en.wikipedia.org/wiki/Page_fault) interrupt handler builds a new identity mapping. You can find more information in the [previous chapter](./linux-bootstrap-5.md#the-last-actions-before-the-kernel-decompression).\n\nBefore generating any random offset, the decompressor determines the lowest possible base address that the kernel can use:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L889-L891 -->\n```C\n\tmin_addr = min(*output, 512UL << 20);\n\t/* Make sure minimum is aligned. */\n\tmin_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);\n```\n\nThis address is the minimal aligned value between `512` megabytes and the starting address of the output buffer passed to the `extract_kernel` function. After obtaining this value, the kernel calls the next function, which returns a random physical address:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L894-L901 -->\n```C\n\trandom_addr = find_random_phys_addr(min_addr, output_size);\n\tif (!random_addr) {\n\t\twarn(\"Physical KASLR disabled: no suitable memory region!\");\n\t} else {\n\t\t/* Update the new physical address location. */\n\t\tif (*output != random_addr)\n\t\t\t*output = random_addr;\n\t}\n```\n\nThe `find_random_phys_addr` function is defined in the same [arch/x86/boot/compressed/kaslr.c](https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c) source code file as the `choose_random_location` function. This function starts from the sanity checks. 
The first check is that the kernel image will not extend beyond the memory limit:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L812-L813 -->\n```C\n\tif (minimum + image_size > mem_limit)\n\t\treturn 0;\n```\n\nThe next check is to verify that the number of memory regions specified via the `memmap` kernel command line option is not excessive:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L816-L819 -->\n```C\n\tif (memmap_too_large) {\n\t\tdebug_putstr(\"Aborted memory entries scan (more than 4 memmap= args)!\\n\");\n\t\treturn 0;\n\t}\n```\n\nAfter these sanity checks, the decompressor code begins scanning the system's available memory regions to find suitable candidates for the randomized address to decompress the kernel image. This is done with the help of the following functions:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L825-L828 -->\n```C\n\tif (!process_kho_entries(minimum, image_size) &&\n\t    !process_efi_entries(minimum, image_size))\n\t\tprocess_e820_entries(minimum, image_size);\n```\n\nThe scanning consists of three potential stages:\n\n1. Scan the memory regions that are not preserved by the [KHO](https://docs.kernel.org/next/kho/concepts.html).\n2. Scan the memory regions presented by the [EFI](https://en.wikipedia.org/wiki/Uefi) memory map.\n3. Fall back to scanning the memory regions reported by the [e820](https://en.wikipedia.org/wiki/E820) BIOS service.\n\nAll the memory regions that were found and accepted as suitable will be stored in the `slot_areas` array represented by the following structure:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L452-L455 -->\n```C\nstruct slot_area {\n\tu64 addr;\n\tunsigned long num;\n};\n```\n\nThe kernel will select a random index from this array to decompress the kernel to. 
The selection of the random index happens in the `slots_fetch_random` function:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L527-L549 -->\n```C\nstatic u64 slots_fetch_random(void)\n{\n\tunsigned long slot;\n\tunsigned int i;\n\n\t/* Handle case of no slots stored. */\n\tif (slot_max == 0)\n\t\treturn 0;\n\n\tslot = kaslr_get_random_long(\"Physical\") % slot_max;\n\n\tfor (i = 0; i < slot_area_index; i++) {\n\t\tif (slot >= slot_areas[i].num) {\n\t\t\tslot -= slot_areas[i].num;\n\t\t\tcontinue;\n\t\t}\n\t\treturn slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);\n\t}\n\n\tif (i == slot_area_index)\n\t\tdebug_putstr(\"slots_fetch_random() failed!?\\n\");\n\treturn 0;\n}\n```\n\nThe main goal of the `slots_fetch_random` function is to select a random memory slot from the list of possible locations that were gathered into the `slot_areas` array. Each entry of this array represents a contiguous free region of memory and the number of possible aligned kernel placements that fit in it.\n\nTo select a random address, this function generates a random number which is limited to the total number of the available slots. The random value is produced by the `kaslr_get_random_long` function which is defined in the same file. As its name suggests, this function returns a random `unsigned long` value, obtained using whatever entropy sources are available on the system. Depending on the hardware and the kernel configuration it can be:\n\n- the CPU’s [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n- the [rdrand](https://en.wikipedia.org/wiki/RdRand) instruction\n- The [i8254 programmable interval timer](https://en.wikipedia.org/wiki/Intel_8253)\n\nAfter obtaining the random value, the code goes through the `slot_areas` array to find a memory region with enough available slots. 
If such a memory region is found, the address of the selected slot within it (the region's starting address plus the slot index multiplied by `CONFIG_PHYSICAL_ALIGN`) is used as a random physical address for decompressing the kernel image.\n\nThe kernel checks the result of the `find_random_phys_addr` function and prints a warning message if this operation was not successful; otherwise, it assigns the obtained address to `output`:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L895-L901 -->\n```C\n\tif (!random_addr) {\n\t\twarn(\"Physical KASLR disabled: no suitable memory region!\");\n\t} else {\n\t\t/* Update the new physical address location. */\n\t\tif (*output != random_addr)\n\t\t\t*output = random_addr;\n\t}\n```\n\nAt this point, the kernel has successfully picked a random physical address. The final step is to obtain a random virtual address.\n\n### Virtual address randomization\n\nWith the physical address chosen, the decompressor now knows where to decompress the kernel image. Once the decompressed kernel starts running, it switches from the early-boot page tables to the full paging setup. 
The next and last step is to randomize the virtual base address:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L905-L907 -->\n```C\n\tif (IS_ENABLED(CONFIG_X86_64))\n\t\trandom_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);\n\t*virt_addr = random_addr;\n```\n\nThe function `find_random_virt_addr` is located in the same source code file and looks like this:\n\n<!-- https://raw.githubusercontent.com/torvalds/linux/refs/heads/master/arch/x86/boot/compressed/kaslr.c#L840-L855 -->\n```C\nstatic unsigned long find_random_virt_addr(unsigned long minimum,\n\t\t\t\t\t   unsigned long image_size)\n{\n\tunsigned long slots, random_addr;\n\n\t/*\n\t * There are how many CONFIG_PHYSICAL_ALIGN-sized slots\n\t * that can hold image_size within the range of minimum to\n\t * KERNEL_IMAGE_SIZE?\n\t */\n\tslots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;\n\n\trandom_addr = kaslr_get_random_long(\"Virtual\") % slots;\n\n\treturn random_addr * CONFIG_PHYSICAL_ALIGN + minimum;\n}\n```\n\nAs we can see, this function uses the same `kaslr_get_random_long` call to get a random memory slot.\n\nAt this point, both the physical and virtual base addresses are determined — randomized, aligned, and guaranteed to fit in available memory.\n\n## Conclusion\n\nThis is the end of the sixth part about Linux kernel insides. 
If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\nThe next chapter will be about kernel initialization and we will study the first steps taken in the Linux kernel initialization code.\n\n## Links\n\n- [Address Space Layout Randomization](https://en.wikipedia.org/wiki/Address_space_layout_randomization)\n- [Linux kernel boot protocol](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt)\n- [Long mode](https://en.wikipedia.org/wiki/Long_mode)\n- [Initial ramdisk](https://en.wikipedia.org/wiki/Initial_ramdisk)\n- [Four-level page tables](https://lwn.net/Articles/117749/)\n- [Five-level page tables](https://lwn.net/Articles/717293/)\n- [EFI](https://en.wikipedia.org/wiki/Unified_Extensible_Firmware_Interface)\n- [e820](https://en.wikipedia.org/wiki/E820)\n- [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n- [rdrand instruction](https://en.wikipedia.org/wiki/RdRand)\n- [Previous part](linux-bootstrap-5.md)\n"
  },
  {
    "path": "CODEOWNERS",
    "content": "# Owner of the repository\n* @0xAX\n\n# Documentation owners\n*.md @0xAX @klaudiagrz\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki edits, issues, 
and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\nkuleshovmail@gmail.com.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. 
Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\nhttps://www.contributor-covenant.org/version/2/0/code_of_conduct.html.\n\nCommunity Impact Guidelines were inspired by [Mozilla's code of conduct\nenforcement ladder](https://github.com/mozilla/diversity).\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see the FAQ at\nhttps://www.contributor-covenant.org/faq. Translations are available at\nhttps://www.contributor-covenant.org/translations.\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing\n\nThis document outlines the contribution workflow, starting from opening an issue, creating a pull request (PR), reviewing, and merging the PR. When working on this project, make sure to follow the [Code of Conduct](./CODE_OF_CONDUCT.md).\n\nThank you for your contribution.\n\n## New contributor guide \n\nIf you are a new open source contributor, here are some resources you may find useful before providing your first contributions:\n\n- [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github)\n- [Set up Git](https://docs.github.com/en/get-started/getting-started-with-git/set-up-git)\n- [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow)\n- [Collaborating with pull requests](https://docs.github.com/en/github/collaborating-with-pull-requests)\n\n**Working on your first pull request?** You can learn how from this free series [How to Contribute to an Open Source Project on GitHub](https://kcd.im/pull-request).\n\n## Create an issue \n\nIf you have any improvement ideas, notice a missing feature or a bug, create a GitHub issue by clicking **Issues -> New issue** in GitHub. Make sure to fill the issue template with a detailed description of the bug or suggested improvements. Provide proper argumentation and screenshots, if necessary.\n\nIf you find any existing issue to work on, you are welcome to open a PR with a fix.\n\n## Open a pull request\n\nIf you want to directly contribute to the project, create a pull reguest with the suggested changes. To do so:\n\n1. [Fork the repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#fork-an-example-repository).\n\n2. Make changes on your local copy of the forked repository.\n\n3. Commit and push the changes to GitHub. \n\n> [!IMPORTANT]  \n> Don't forget to update your fork. 
Since many contributors may be working on the same content based on the `master` branch, some merge conflicts may occur. Remember to rebase with `master` every time before pushing your changes and make sure your branch doesn't have any conflicts with `master`. If you run into any merge conflicts, read the [Resolve merge conflicts](https://github.com/skills/resolve-merge-conflicts) tutorial to learn how to resolve merge conflicts and other issues.\n\n4. Open a pull request in GitHub. Fill the pull request template with the reason and description for the provided changes. Link your pull request with the existing issue, if applicable. After submitting your PR, wait for the review from the project maintainers.\n\n## Review and approval process\n\nAfter you submit your PR, wait for the review. The project maintainers will evaluate your changes and provide feedback either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. Address the review suggestions and comments as soon as you can. If your PR looks good, the maintainers approve and merge it. \n\n## Contributors \n\nAll contributions get credit in [Contributors](contributors.md). Don't forget to add yourself there. \n"
  },
  {
    "path": "Cgroups/README.md",
    "content": "# Cgroups\n\nThis chapter describes `control groups` mechanism in the Linux kernel.\n\n* [Introduction](linux-cgroups-1.md)\n"
  },
  {
    "path": "Cgroups/linux-cgroups-1.md",
    "content": "Control Groups\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the first part of the new chapter of the [linux insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book and as you may guess by part's name - this part will cover [control groups](https://en.wikipedia.org/wiki/Cgroups) or `cgroups` mechanism in the Linux kernel.\n\n`Cgroups` are special mechanism provided by the Linux kernel which allows us to allocate kind of `resources` like processor time, number of processes per group, amount of memory per control group or combination of such resources for a process or set of processes. `Cgroups` are organized hierarchically and here this mechanism is similar to usual processes as they are hierarchical too and child `cgroups` inherit set of certain parameters from their parents. But actually they are not the same. The main difference between `cgroups` and normal processes is that many different hierarchies of control groups may exist simultaneously in one time while normal process tree is always single. This was not a casual step because each control group hierarchy is attached to set of control group `subsystems`.\n\nOne `control group subsystem` represents one kind of resources like a processor time or number of [pids](https://en.wikipedia.org/wiki/Process_identifier) or in other words number of processes for a `control group`. 
The Linux kernel provides support for the following twelve `control group subsystems`:\n\n* `cpuset` - assigns individual processor(s) and memory nodes to task(s) in a group;\n* `cpu` - uses the scheduler to provide cgroup tasks access to the processor resources;\n* `cpuacct` - generates reports about processor usage by a group;\n* `io` - sets limits on reading from and writing to [block devices](https://en.wikipedia.org/wiki/Device_file);\n* `memory` - sets limits on memory usage by task(s) from a group;\n* `devices` - allows access to devices by task(s) from a group;\n* `freezer` - allows suspending/resuming task(s) from a group;\n* `net_cls` - allows marking network packets from task(s) in a group;\n* `net_prio` - provides a way to dynamically set the priority of network traffic per network interface for a group;\n* `perf_event` - provides access to [perf events](https://en.wikipedia.org/wiki/Perf_\\(Linux\\)) to a group;\n* `hugetlb` - activates support for [huge pages](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt) for a group;\n* `pids` - sets a limit on the number of processes in a group.\n\nEach of these control group subsystems depends on a related configuration option. For example, the `cpuset` subsystem should be enabled via the `CONFIG_CPUSETS` kernel configuration option, the `io` subsystem via the `CONFIG_BLK_CGROUP` kernel configuration option, etc. 
All of these kernel configuration options may be found in the `General setup → Control Group support` menu:\n\n![menuconfig](images/menuconfig.png)\n\nYou may see enabled control groups on your computer via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem:\n\n```\n$ cat /proc/cgroups\n#subsys_name\thierarchy\tnum_cgroups\tenabled\ncpuset\t8\t1\t1\ncpu\t7\t66\t1\ncpuacct\t7\t66\t1\nblkio\t11\t66\t1\nmemory\t9\t94\t1\ndevices\t6\t66\t1\nfreezer\t2\t1\t1\nnet_cls\t4\t1\t1\nperf_event\t3\t1\t1\nnet_prio\t4\t1\t1\nhugetlb\t10\t1\t1\npids\t5\t69\t1\n```\n\nor via [sysfs](https://en.wikipedia.org/wiki/Sysfs):\n\n```\n$ ls -l /sys/fs/cgroup/\ntotal 0\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 blkio\nlrwxrwxrwx 1 root root 11 Dec  2 22:37 cpu -> cpu,cpuacct\nlrwxrwxrwx 1 root root 11 Dec  2 22:37 cpuacct -> cpu,cpuacct\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 cpu,cpuacct\ndr-xr-xr-x 2 root root  0 Dec  2 22:37 cpuset\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 devices\ndr-xr-xr-x 2 root root  0 Dec  2 22:37 freezer\ndr-xr-xr-x 2 root root  0 Dec  2 22:37 hugetlb\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 memory\nlrwxrwxrwx 1 root root 16 Dec  2 22:37 net_cls -> net_cls,net_prio\ndr-xr-xr-x 2 root root  0 Dec  2 22:37 net_cls,net_prio\nlrwxrwxrwx 1 root root 16 Dec  2 22:37 net_prio -> net_cls,net_prio\ndr-xr-xr-x 2 root root  0 Dec  2 22:37 perf_event\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 pids\ndr-xr-xr-x 5 root root  0 Dec  2 22:37 systemd\n```\n\nAs you already may guess that `control groups` mechanism is not such mechanism which was invented only directly to the needs of the Linux kernel, but mostly for userspace needs. To use a `control group`, we should create it at first. 
We may create a `cgroup` via two ways.\n\nThe first way is to create subdirectory in any subsystem from `/sys/fs/cgroup` and add a pid of a task to a `tasks` file which will be created automatically right after we will create the subdirectory.\n\nThe second way is to create/destroy/manage `cgroups` with utils from `libcgroup` library (`libcgroup-tools` in Fedora).\n\nLet's consider a simple example. Following [bash](https://www.gnu.org/software/bash/) script will print a line to `/dev/tty` device which represents control terminal for the current process:\n\n```shell\n#!/bin/bash\n\nwhile :\ndo\n    echo \"print line\" > /dev/tty\n    sleep 5\ndone\n```\n\nSo, if we will run this script we will see following result:\n\n```\n$ sudo chmod +x cgroup_test_script.sh\n~$ ./cgroup_test_script.sh\nprint line\nprint line\nprint line\n...\n...\n...\n```\n\nNow let's go to the place where `cgroupfs` is mounted on our computer. As we just saw, this is `/sys/fs/cgroup` directory, but you may mount it everywhere you want.\n\n```\n$ cd /sys/fs/cgroup\n```\n\nAnd now let's go to the `devices` subdirectory which represents kind of resources that allows or denies access to devices by tasks in a `cgroup`:\n\n```\n# cd devices\n```\n\nand create `cgroup_test_group` directory there:\n\n```\n# mkdir cgroup_test_group\n```\n\nAfter creation of the `cgroup_test_group` directory, following files will be generated there:\n\n```\n/sys/fs/cgroup/devices/cgroup_test_group$ ls -l\ntotal 0\n-rw-r--r-- 1 root root 0 Dec  3 22:55 cgroup.clone_children\n-rw-r--r-- 1 root root 0 Dec  3 22:55 cgroup.procs\n--w------- 1 root root 0 Dec  3 22:55 devices.allow\n--w------- 1 root root 0 Dec  3 22:55 devices.deny\n-r--r--r-- 1 root root 0 Dec  3 22:55 devices.list\n-rw-r--r-- 1 root root 0 Dec  3 22:55 notify_on_release\n-rw-r--r-- 1 root root 0 Dec  3 22:55 tasks\n```\n\nFor this moment we are interested in `tasks` and `devices.deny` files. 
The first `tasks` file should contain pid(s) of processes which will be attached to the `cgroup_test_group`. The second `devices.deny` file contains a list of denied devices. By default a newly created group has no limits on device access. To forbid a device (in our case it is `/dev/tty`) we should write to the `devices.deny` following line:\n\n```\n# echo \"c 5:0 w\" > devices.deny\n```\n\nLet's go step by step through this line. The first `c` letter represents type of a device. In our case the `/dev/tty` is `char device`. We can verify this from output of `ls` command:\n\n```\n~$ ls -l /dev/tty\ncrw-rw-rw- 1 root tty 5, 0 Dec  3 22:48 /dev/tty\n```\n\nsee the first `c` letter in a permissions list. The second part, `5:0`, is the major and minor numbers of the device. You can see these numbers in the output of `ls` too. And the last `w` letter forbids tasks to write to the specified device. So let's start the `cgroup_test_script.sh` script:\n\n```\n~$ ./cgroup_test_script.sh\nprint line\nprint line\nprint line\n...\n...\n```\n\nand add pid of this process to the `devices/tasks` file of our group:\n\n```\n# echo $(pidof -x cgroup_test_script.sh) > /sys/fs/cgroup/devices/cgroup_test_group/tasks\n```\n\nThe result of this action will be as expected:\n\n```\n~$ ./cgroup_test_script.sh\nprint line\nprint line\nprint line\nprint line\nprint line\nprint line\n./cgroup_test_script.sh: line 5: /dev/tty: Operation not permitted\n```\n\nA similar situation occurs when you run your [docker](https://en.wikipedia.org/wiki/Docker_(software)) containers, for example:\n\n```\n~$ docker ps\nCONTAINER ID        IMAGE               COMMAND                  CREATED             STATUS              PORTS                    NAMES\nfa2d2085cd1c        mariadb:10          \"docker-entrypoint...\"   12 days ago         Up 4 minutes        0.0.0.0:3306->3306/tcp   mysql-work\n\n~$ cat /sys/fs/cgroup/devices/docker/fa2d2085cd1c8d797002c77387d2061f56fefb470892f140d0dc511bd4d9bb61/tasks | 
head -3\n5501\n5584\n5585\n...\n...\n...\n```\n\nSo, during startup of a `docker` container, `docker` will create a `cgroup` for processes in this container:\n\n```\n$ docker exec -it mysql-work /bin/bash\n$ top\n  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND\n    1 mysql     20   0  963996 101268  15744 S   0.0  0.6   0:00.46 mysqld\n   71 root      20   0   20248   3028   2732 S   0.0  0.0   0:00.01 bash\n   77 root      20   0   21948   2424   2056 R   0.0  0.0   0:00.00 top\n```\n\nAnd we may see this `cgroup` on host machine:\n\n```\n$ systemd-cgls\n\nControl group /:\n-.slice\n├─docker\n│ └─fa2d2085cd1c8d797002c77387d2061f56fefb470892f140d0dc511bd4d9bb61\n│   ├─5501 mysqld\n│   └─6404 /bin/bash\n```\n\nNow we know a little about `control groups` mechanism, how to use it manually and what's the purpose of this mechanism. It's time to look inside of the Linux kernel source code and start to dive into implementation of this mechanism.\n\nEarly initialization of control groups\n--------------------------------------------------------------------------------\n\nNow after we just saw a little theory about `control groups` Linux kernel mechanism, we may start to dive into the source code of Linux kernel to get better acquainted with this mechanism. As always we will start from the initialization of `control groups`. Initialization of `cgroups` is divided into two parts in the Linux kernel: early and late. In this part we will consider only `early` part and `late` part will be considered in next parts.\n\nEarly initialization of `cgroups` starts from the call of the:\n\n```C\ncgroup_init_early();\n```\n\nfunction in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) during early initialization of the Linux kernel. 
This function is defined in the [kernel/cgroup/cgroup.c](https://github.com/torvalds/linux/blob/master/kernel/cgroup/cgroup.c) source code file and starts from the definition of two following local variables:\n\n```C\nint __init cgroup_init_early(void)\n{\n\tstatic struct cgroup_sb_opts __initdata opts;\n\tstruct cgroup_subsys *ss;\n    ...\n    ...\n    ...\n}\n```\n\nThe `cgroup_sb_opts` structure defined in the same source code file and looks:\n\n```C\nstruct cgroup_sb_opts {\n\tu16 subsys_mask;\n\tunsigned int flags;\n\tchar *release_agent;\n\tbool cpuset_clone_children;\n\tchar *name;\n\tbool none;\n};\n```\n\nwhich represents mount options of `cgroupfs`. For example we may create named cgroup hierarchy (with name `my_cgrp`) with the `name=` option and without any subsystems:\n\n```\n$ mount -t cgroup -oname=my_cgrp,none /mnt/cgroups\n```\n\nThe second variable - `ss` has type - `cgroup_subsys` structure which is defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file and as you may guess from the name of the type, it represents a `cgroup` subsystem. This structure contains various fields and callback functions like:\n\n```C\nstruct cgroup_subsys {\n    int (*css_online)(struct cgroup_subsys_state *css);\n    void (*css_offline)(struct cgroup_subsys_state *css);\n    ...\n    ...\n    ...\n    bool early_init:1;\n    int id;\n    const char *name;\n    struct cgroup_root *root;\n    ...\n    ...\n    ...\n}\n```\n\nWhere for example `css_online` and `css_offline` callbacks are called after a cgroup successfully will complete all allocations and a cgroup will be before releasing respectively. The `early_init` flags marks subsystems which may/should be initialized early. The `id` and `name` fields represents unique identifier in the array of registered subsystems for a cgroup and `name` of a subsystem respectively. 
The last - `root` field represents a pointer to the root of a cgroup hierarchy.\n\nOf course the `cgroup_subsys` structure is bigger and has other fields, but it is enough for now. Now as we got to know important structures related to `cgroups` mechanism, let's return to the `cgroup_init_early` function. Main purpose of this function is to do early initialization of some subsystems. As you already may guess, these `early` subsystems should have `cgroup_subsys->early_init = 1`. Let's look what subsystems may be initialized early.\n\nAfter the definition of the two local variables we may see following lines of code:\n\n```C\ninit_cgroup_root(&cgrp_dfl_root, &opts);\ncgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;\n```\n\nHere we may see call of the `init_cgroup_root` function which will execute initialization of the default unified hierarchy and after this we set `CSS_NO_REF` flag in state of this default `cgroup` to disable reference counting for this css. The `cgrp_dfl_root` is defined in the same source code file:\n\n```C\nstruct cgroup_root cgrp_dfl_root;\n```\n\nIts `cgrp` field is represented by the `cgroup` structure, which represents a `cgroup` as you may already guess and is defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file. We already know that a process is represented by the `task_struct` in the Linux kernel. The `task_struct` does not contain direct link to a `cgroup` where this task is attached. But it may be reached via the `cgroups` field of the `task_struct`, which points to a `css_set`. 
This `css_set` structure holds pointer to the array of subsystem states:\n\n```C\nstruct css_set {\n    ...\n    ...\n    ....\n    struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];\n    ...\n    ...\n    ...\n}\n```\n\nAnd via the `cgroup_subsys_state`, a process may get a `cgroup` that this process is attached to:\n\n```C\nstruct cgroup_subsys_state {\n    ...\n    ...\n    ...\n    struct cgroup *cgroup;\n    ...\n    ...\n    ...\n}\n```\n\nSo, the overall picture of `cgroups` related data structure is following:\n\n```\n+-------------+         +---------------------+    +------------->+---------------------+          +----------------+\n| task_struct |         |       css_set       |    |              | cgroup_subsys_state |          |     cgroup     |\n+-------------+         |                     |    |              +---------------------+          +----------------+\n|             |         |                     |    |              |                     |          |     flags      |\n|             |         |                     |    |              +---------------------+          |  cgroup.procs  |\n|             |         |                     |    |              |        cgroup       |--------->|       id       |\n|             |         |                     |    |              +---------------------+          |      ....      
|\n|-------------+         |---------------------+----+                                               +----------------+\n|   cgroups   | ------> | cgroup_subsys_state | array of cgroup_subsys_state\n|-------------+         +---------------------+------------------>+---------------------+          +----------------+\n|             |         |                     |                   | cgroup_subsys_state |          |      cgroup    |\n+-------------+         +---------------------+                   +---------------------+          +----------------+\n                                                                  |                     |          |      flags     |\n                                                                  +---------------------+          |   cgroup.procs |\n                                                                  |        cgroup       |--------->|        id      |\n                                                                  +---------------------+          |       ....     
|\n                                                                  |    cgroup_subsys    |          +----------------+\n                                                                  +---------------------+\n                                                                             |\n                                                                             |\n                                                                             ↓\n                                                                  +---------------------+\n                                                                  |    cgroup_subsys    |\n                                                                  +---------------------+\n                                                                  |         id          |\n                                                                  |        name         |\n                                                                  |      css_online     |\n                                                                  |      css_ofline     |\n                                                                  |        attach       |\n                                                                  |         ....        |\n                                                                  +---------------------+\n```\n\n\n\nSo, the `init_cgroup_root` fills the `cgrp_dfl_root` with the default values. The next thing is assigning initial `css_set` to the `init_task` which represents first process in the system:\n\n```C\nRCU_INIT_POINTER(init_task.cgroups, &init_css_set);\n```\n\nAnd the last big thing in the `cgroup_init_early` function is initialization of `early cgroups`. 
Here we go over all registered subsystems and assign unique identity number, name of a subsystem and call the `cgroup_init_subsys` function for subsystems which are marked as early:\n\n```C\nfor_each_subsys(ss, i) {\n\t\tss->id = i;\n\t\tss->name = cgroup_subsys_name[i];\n\n        if (ss->early_init)\n\t\t\tcgroup_init_subsys(ss, true);\n}\n```\n\nThe `for_each_subsys` here is a macro which is defined in the [kernel/cgroup/cgroup.c](https://github.com/torvalds/linux/blob/master/kernel/cgroup/cgroup.c) source code file and just expands to the `for` loop over `cgroup_subsys` array. Definition of this array may be found in the same source code file and it looks in a little unusual way:\n\n```C\n#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,\n    static struct cgroup_subsys *cgroup_subsys[] = {\n        #include <linux/cgroup_subsys.h>\n};\n#undef SUBSYS\n```\n\nIt is defined as `SUBSYS` macro which takes one argument (name of a subsystem) and defines `cgroup_subsys` array of cgroup subsystems. Additionally we may see that the array is initialized with content of the [linux/cgroup_subsys.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup_subsys.h) header file. If we will look inside of this header file we will see again set of the `SUBSYS` macros with the given subsystems names:\n\n```C\n#if IS_ENABLED(CONFIG_CPUSETS)\nSUBSYS(cpuset)\n#endif\n\n#if IS_ENABLED(CONFIG_CGROUP_SCHED)\nSUBSYS(cpu)\n#endif\n...\n...\n...\n```\n\nThis works because of `#undef` statement after first definition of the `SUBSYS` macro. Look at the `&_x ## _cgrp_subsys` expression. The `##` operator concatenates right and left expression in a `C` macro. So as we passed `cpuset`, `cpu` and etc., to the `SUBSYS` macro, somewhere `cpuset_cgrp_subsys`, `cpu_cgrp_subsys` should be defined. And that's true. 
If you will look in the [kernel/cgroup/cpuset.c](https://github.com/torvalds/linux/blob/master/kernel/cgroup/cpuset.c) source code file, you will see this definition:\n\n```C\nstruct cgroup_subsys cpuset_cgrp_subsys = {\n    ...\n    ...\n    ...\n\t.early_init\t= true,\n};\n```\n\nSo the last step in the `cgroup_init_early` function is initialization of early subsystems with the call of the `cgroup_init_subsys` function. Following early subsystems will be initialized:\n\n* `cpuset`;\n* `cpu`;\n* `cpuacct`.\n\nThe `cgroup_init_subsys` function does initialization of the given subsystem with the default values. For example sets root of hierarchy, allocates space for the given subsystem with the call of the `css_alloc` callback function, link a subsystem with a parent if it exists, add allocated subsystem to the initial process and etc.\n\nThat's all. From this moment early subsystems are initialized.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the first part which describes introduction into `Control groups` mechanism in the Linux kernel. We covered some theory and the first steps of initialization of stuffs related to `control groups` mechanism. In the next part we will continue to dive into the more practical aspects of `control groups`.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [control groups](https://en.wikipedia.org/wiki/Cgroups)\n* [PID](https://en.wikipedia.org/wiki/Process_identifier)\n* [cpuset](http://man7.org/linux/man-pages/man7/cpuset.7.html)\n* [block devices](https://en.wikipedia.org/wiki/Device_file)\n* [huge pages](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)\n* [sysfs](https://en.wikipedia.org/wiki/Sysfs)\n* [proc](https://en.wikipedia.org/wiki/Procfs)\n* [cgroups kernel documentation](https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt)\n* [cgroups v2](https://www.kernel.org/doc/Documentation/cgroup-v2.txt)\n* [bash](https://www.gnu.org/software/bash/)\n* [docker](https://en.wikipedia.org/wiki/Docker_\\(software\\))\n* [perf events](https://en.wikipedia.org/wiki/Perf_\\(Linux\\))\n* [Previous chapter](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-1)\n"
  },
  {
    "path": "Concepts/README.md",
    "content": "# Linux kernel concepts\n\nThis chapter describes various concepts which are used in the Linux kernel.\n\n* [Per-CPU variables](linux-cpu-1.md)\n* [CPU masks](linux-cpu-2.md)\n* [The initcall mechanism](linux-cpu-3.md)\n* [Notification Chains](linux-cpu-4.md)"
  },
  {
    "path": "Concepts/linux-cpu-1.md",
    "content": "Per-CPU variables\n================================================================================\n\nPer-CPU variables are one of the kernel features. You can understand the meaning of this feature by reading its name. We can create a variable and each processor core will have its own copy of this variable. In this part, we take a closer look at this feature and try to understand how it is implemented and how it works.\n\nThe kernel provides an API for creating per-cpu variables - the `DEFINE_PER_CPU` macro:\n\n```C\n#define DEFINE_PER_CPU(type, name) \\\n        DEFINE_PER_CPU_SECTION(type, name, \"\")\n```\n\nThis macro defined in the [include/linux/percpu-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/percpu-defs.h) as many other macros for work with per-cpu variables. Now we will see how this feature is implemented.\n\nTake a look at the `DEFINE_PER_CPU` definition. We see that it takes 2 parameters: `type` and `name`, so we can use it to create per-cpu variables, for example like this:\n\n```C\nDEFINE_PER_CPU(int, per_cpu_n)\n```\n\nWe pass the type and the name of our variable. `DEFINE_PER_CPU` calls the `DEFINE_PER_CPU_SECTION` macro and passes the same two parameters and empty string to it. 
Let's look at the definition of the `DEFINE_PER_CPU_SECTION`:\n\n```C\n#define DEFINE_PER_CPU_SECTION(type, name, sec)    \\\n         __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES  \\\n         __typeof__(type) name\n```\n\n```C\n#define __PCPU_ATTRS(sec)                                                \\\n         __percpu __attribute__((section(PER_CPU_BASE_SECTION sec)))     \\\n         PER_CPU_ATTRIBUTES\n```\n\nwhere `section` is:\n\n```C\n#define PER_CPU_BASE_SECTION \".data..percpu\"\n```\n\nAfter all macros are expanded we will get a global per-cpu variable:\n\n```C\n__attribute__((section(\".data..percpu\"))) int per_cpu_n\n```\n\nIt means that we will have a `per_cpu_n` variable in the `.data..percpu` section. We can find this section in the `vmlinux`:\n\n```\n.data..percpu 00013a58  0000000000000000  0000000001a5c000  00e00000  2**12\n              CONTENTS, ALLOC, LOAD, DATA\n```\n\nOk, now we know that when we use the `DEFINE_PER_CPU` macro, a per-cpu variable in the `.data..percpu` section will be created. When the kernel initializes it calls the `setup_per_cpu_areas` function which loads the `.data..percpu` section multiple times, one section per CPU.\n\nLet's look at the per-CPU areas initialization process. 
It starts in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) from the call of the `setup_per_cpu_areas` function which is defined in the [arch/x86/kernel/setup_percpu.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup_percpu.c).\n\n```C\npr_info(\"NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\\n\",\n        NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);\n```\n\nThe `setup_per_cpu_areas` starts from the output information about the maximum number of CPUs set during kernel configuration with the `CONFIG_NR_CPUS` configuration option, actual number of CPUs, `nr_cpumask_bits` is the same that `NR_CPUS` bit for the new `cpumask` operators and number of `NUMA` nodes.\n\nWe can see this output in the dmesg:\n\n```\n$ dmesg | grep percpu\n[    0.000000] setup_percpu: NR_CPUS:8 nr_cpumask_bits:8 nr_cpu_ids:8 nr_node_ids:1\n```\n\nIn the next step we check the `percpu` first chunk allocator. All percpu areas are allocated in chunks. The first chunk is used for the static percpu variables. The Linux kernel has `percpu_alloc` command line parameters which provides the type of the first chunk allocator. We can read about it in the kernel documentation:\n\n```\npercpu_alloc=\tSelect which percpu first chunk allocator to use.\n\t\tCurrently supported values are \"embed\" and \"page\".\n\t\tArchs may support subset or none of the\tselections.\n\t\tSee comments in mm/percpu.c for details on each\n\t\tallocator.  This parameter is primarily\tfor debugging\n\t\tand performance comparison.\n```\n\nThe [mm/percpu.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/percpu.c) contains the handler of this command line option:\n\n```C\nearly_param(\"percpu_alloc\", percpu_alloc_setup);\n```\n\nWhere the `percpu_alloc_setup` function sets the `pcpu_chosen_fc` variable depends on the `percpu_alloc` parameter value. 
By default the first chunk allocator is `auto`:\n\n```C\nenum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;\n```\n\nIf the `percpu_alloc` parameter is not given to the kernel command line, the `embed` allocator will be used which embeds the first percpu chunk into bootmem with the [memblock](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-1). The last allocator is the first chunk `page` allocator which maps the first chunk with `PAGE_SIZE` pages.\n\nAs I wrote above, first of all we make a check of the first chunk allocator type in the `setup_per_cpu_areas`. We check that first chunk allocator is not page:\n\n```C\nif (pcpu_chosen_fc != PCPU_FC_PAGE) {\n    ...\n    ...\n    ...\n}\n```\n\nIf it is not `PCPU_FC_PAGE`, we will use the `embed` allocator and allocate space for the first chunk with the `pcpu_embed_first_chunk` function:\n\n```C\nrc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,\n\t\t\t\t\t    dyn_size, atom_size,\n\t\t\t\t\t    pcpu_cpu_distance,\n\t\t\t\t\t    pcpu_fc_alloc, pcpu_fc_free);\n```\n\nAs shown above, the `pcpu_embed_first_chunk` function embeds the first percpu chunk into bootmem. We pass a couple of parameters to the `pcpu_embed_first_chunk`. 
They are as follows:\n\n* `PERCPU_FIRST_CHUNK_RESERVE` - the size of the reserved space for the static `percpu` variables;\n* `dyn_size` - minimum free size for dynamic allocation in bytes;\n* `atom_size` - all allocations are whole multiples of this and aligned to this parameter;\n* `pcpu_cpu_distance` - callback to determine distance between cpus;\n* `pcpu_fc_alloc` - function to allocate `percpu` page;\n* `pcpu_fc_free` - function to release `percpu` page.\n\nWe calculate all of these parameters before the call of the `pcpu_embed_first_chunk`:\n\n```C\nconst size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;\nsize_t atom_size;\n#ifdef CONFIG_X86_64\n\t\tatom_size = PMD_SIZE;\n#else\n\t\tatom_size = PAGE_SIZE;\n#endif\n```\n\nIf the first chunk allocator is `PCPU_FC_PAGE`, we will use the `pcpu_page_first_chunk` instead of the `pcpu_embed_first_chunk`. After that `percpu` areas up, we setup `percpu` offset and its segment for every CPU with the `setup_percpu_segment` function (only for `x86` systems) and move some early data from the arrays to the `percpu` variables (`x86_cpu_to_apicid`, `irq_stack_ptr` and etc...). After the kernel finishes the initialization process, we will have loaded N `.data..percpu` sections, where N is the number of CPUs, and the section used by the bootstrap processor will contain an uninitialized variable created with the `DEFINE_PER_CPU` macro.\n\nThe kernel provides an API for per-cpu variables manipulating:\n\n* get_cpu_var(var)\n* put_cpu_var(var)\n\n\nLet's look at the `get_cpu_var` implementation:\n\n```C\n#define get_cpu_var(var)     \\\n(*({                         \\\n         preempt_disable();  \\\n         this_cpu_ptr(&var); \\\n}))\n```\n\nThe Linux kernel is preemptible and accessing a per-cpu variable requires us to know which processor the kernel is running on. So, current code must not be preempted and moved to the another CPU while accessing a per-cpu variable. 
That's why, first of all we can see a call of the `preempt_disable` function then a call of the `this_cpu_ptr` macro, which looks like:\n\n```C\n#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)\n```\n\nand\n\n```C\n#define raw_cpu_ptr(ptr)        per_cpu_ptr(ptr, 0)\n```\n\nwhere `per_cpu_ptr` returns a pointer to the per-cpu variable for the given cpu (second parameter). After we've created a per-cpu variable and made modifications to it, we must call the `put_cpu_var` macro which enables preemption with a call of `preempt_enable` function. So the typical usage of a per-cpu variable is as follows:\n\n```C\nget_cpu_var(var);\n...\n//Do something with the 'var'\n...\nput_cpu_var(var);\n```\n\nLet's look at the `per_cpu_ptr` macro:\n\n```C\n#define per_cpu_ptr(ptr, cpu)                             \\\n({                                                        \\\n        __verify_pcpu_ptr(ptr);                           \\\n         SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)));  \\\n})\n```\n\nAs I wrote above, this macro returns a per-cpu variable for the given cpu. First of all it calls `__verify_pcpu_ptr`:\n\n```C\n#define __verify_pcpu_ptr(ptr) \\\ndo { \\\n\tconst void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \\\n\t(void)__vpp_verify; \\\n} while (0)\n```\n\nwhich checks that the given `ptr` is compatible with the `const void __percpu *` type.\n\nAfter this we can see the call of the `SHIFT_PERCPU_PTR` macro with two parameters. As first parameter we pass our ptr and for second parameter we pass the cpu number to the `per_cpu_offset` macro:\n\n```C\n#define per_cpu_offset(x) (__per_cpu_offset[x])\n```\n\nwhich expands to getting the `x` element from the `__per_cpu_offset` array:\n\n\n```C\nextern unsigned long __per_cpu_offset[NR_CPUS];\n```\n\nwhere `NR_CPUS` is the number of CPUs. The `__per_cpu_offset` array is filled with the distances between cpu-variable copies. For example all per-cpu data is `X` bytes in size, so if we access `__per_cpu_offset[Y]`, `X*Y` will be accessed. 
Let's look at the `SHIFT_PERCPU_PTR` implementation:\n\n```C\n#define SHIFT_PERCPU_PTR(__p, __offset)                                 \\\n         RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset))\n```\n\n`RELOC_HIDE` just returns offset `(typeof(ptr)) (__ptr + (off))` and it will return a pointer to the variable.\n\nThat's all! Of course it is not the full API, but a general overview. It can be hard to start with, but to understand per-cpu variables you mainly need to understand the  [include/linux/percpu-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/percpu-defs.h) magic.\n\nLet's again look at the algorithm of getting a pointer to a per-cpu variable:\n\n* The kernel creates multiple `.data..percpu` sections (one per-cpu) during initialization process;\n* All variables created with the `DEFINE_PER_CPU` macro will be relocated to the first section or for CPU0;\n* `__per_cpu_offset` array filled with the distance (`BOOT_PERCPU_OFFSET`) between `.data..percpu` sections;\n* When the `per_cpu_ptr` is called, for example for getting a pointer on a certain per-cpu variable for the third CPU, the `__per_cpu_offset` array will be accessed, where every index points to the required CPU.\n\nThat's all.\n"
  },
  {
    "path": "Concepts/linux-cpu-2.md",
    "content": "CPU masks\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\n`Cpumasks` is a special way provided by the Linux kernel to store information about CPUs in the system. The relevant source code and header files which contains API for `Cpumasks` manipulation:\n\n* [include/linux/cpumask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cpumask.h)\n* [lib/cpumask.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/cpumask.c)\n* [kernel/cpu.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/cpu.c)\n\nAs comment says from the [include/linux/cpumask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cpumask.h): Cpumasks provide a bitmap suitable for representing the set of CPU's in a system, one bit position per CPU number. We already saw a bit about cpumask in the `boot_cpu_init` function from the [Kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) part. This function makes first boot cpu online, active and etc...:\n\n```C\nset_cpu_online(cpu, true);\nset_cpu_active(cpu, true);\nset_cpu_present(cpu, true);\nset_cpu_possible(cpu, true);\n```\n\nBefore we consider implementation of these functions, let's consider all of these masks.\n\nThe `cpu_possible` is a set of cpu ID's which can be plugged in anytime during the life of that system boot or in other words mask of possible CPUs contains maximum number of CPUs which are possible in the system. 
It will be equal to the value of `NR_CPUS` which is set statically via the `CONFIG_NR_CPUS` kernel configuration option.\n\nThe `cpu_present` mask represents which CPUs are currently plugged in.\n\nThe `cpu_online` represents a subset of the `cpu_present` and indicates CPUs which are available for scheduling or in other words a bit from this mask tells the kernel if a processor may be utilized by the Linux kernel.\n\nThe last mask is `cpu_active`. Bits of this mask tell the Linux kernel if a task may be moved to a certain processor.\n\nAll of these masks depend on the `CONFIG_HOTPLUG_CPU` configuration option and if this option is disabled `possible == present` and `active == online`. The implementations of all of these functions are very similar. Every function checks the second parameter. If it is `true`, it calls `cpumask_set_cpu` otherwise it calls `cpumask_clear_cpu`.\n\nThere are two ways for a `cpumask` creation. First is to use `cpumask_t`. It is defined as:\n\n```C\ntypedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;\n```\n\nIt wraps the `cpumask` structure which contains one bitmask `bits` field. The `DECLARE_BITMAP` macro gets two parameters:\n\n* bitmap name;\n* number of bits.\n\nand creates an array of `unsigned long` with the given name. 
Its implementation is pretty easy:\n\n```C\n#define DECLARE_BITMAP(name,bits) \\\n        unsigned long name[BITS_TO_LONGS(bits)]\n```\n\nwhere `BITS_TO_LONGS`:\n\n```C\n#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))\n#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))\n```\n\nAs we are focusing on the `x86_64` architecture, `unsigned long` is 8 bytes in size, so with `NR_CPUS` equal to `8` our array will contain only one element:\n\n```\n(((8) + (64) - 1) / (64)) = 1\n```\n\n`NR_CPUS` macro represents the number of CPUs in the system and depends on the `CONFIG_NR_CPUS` macro which is defined in [include/linux/threads.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/threads.h) and looks like this:\n\n```C\n#ifndef CONFIG_NR_CPUS\n        #define CONFIG_NR_CPUS  1\n#endif\n\n#define NR_CPUS         CONFIG_NR_CPUS\n```\n\nThe second way to define cpumask is to use the `DECLARE_BITMAP` macro directly and the `to_cpumask` macro which converts the given bitmap to `struct cpumask *`:\n\n```C\n#define to_cpumask(bitmap)                                              \\\n        ((struct cpumask *)(1 ? (bitmap)                                \\\n                            : (void *)sizeof(__check_is_bitmap(bitmap))))\n```\n\nWe can see the ternary operator here, whose condition is `true` every time. `__check_is_bitmap` inline function is defined as:\n\n```C\nstatic inline int __check_is_bitmap(const unsigned long *bitmap)\n{\n        return 1;\n}\n```\n\nAnd returns `1` every time. We need it here for only one purpose: at compile time it checks that a given `bitmap` is a bitmap, or in other words it checks that a given `bitmap` has type - `unsigned long *`. 
So we just pass `cpu_possible_bits` to the `to_cpumask` macro for converting an array of `unsigned long` to the `struct cpumask *`.\n\ncpumask API\n--------------------------------------------------------------------------------\n\nAs we can define cpumask with one of the methods, the Linux kernel provides an API for manipulating a cpumask. Let's consider one of the functions presented above. For example `set_cpu_online`. This function takes two parameters:\n\n* Index of CPU;\n* CPU status;\n\nImplementation of this function looks as:\n\n```C\nvoid set_cpu_online(unsigned int cpu, bool online)\n{\n\tif (online) {\n\t\tcpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));\n\t\tcpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));\n\t} else {\n\t\tcpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));\n\t}\n}\n```\n\nFirst of all it checks the second `online` parameter and calls `cpumask_set_cpu` or `cpumask_clear_cpu` depending on it. Here we can see casting to the `struct cpumask *` of the second parameter in the `cpumask_set_cpu`. In our case it is `cpu_online_bits` which is a bitmap and defined as:\n\n```C\nstatic DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;\n```\n\nThe `cpumask_set_cpu` function makes only one call to the `set_bit` function:\n\n```C\nstatic inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)\n{\n        set_bit(cpumask_check(cpu), cpumask_bits(dstp));\n}\n```\n\nThe `set_bit` function takes two parameters too, and sets a given bit (first parameter) in the memory (second parameter or `cpu_online_bits` bitmap). We can see here that before `set_bit` is called, its two parameters will be passed to the\n\n* cpumask_check;\n* cpumask_bits.\n\nLet's consider these two macros. First of all, `cpumask_check` does nothing in our case and just returns the given parameter. 
The second `cpumask_bits` just returns the `bits` field from the given `struct cpumask *` structure:\n\n```C\n#define cpumask_bits(maskp) ((maskp)->bits)\n```\n\nNow let's look on the `set_bit` implementation:\n\n```C\n static __always_inline void\n set_bit(long nr, volatile unsigned long *addr)\n {\n         if (IS_IMMEDIATE(nr)) {\n                asm volatile(LOCK_PREFIX \"orb %1,%0\"\n                        : CONST_MASK_ADDR(nr, addr)\n                        : \"iq\" ((u8)CONST_MASK(nr))\n                        : \"memory\");\n        } else {\n                asm volatile(LOCK_PREFIX \"bts %1,%0\"\n                        : BITOP_ADDR(addr) : \"Ir\" (nr) : \"memory\");\n        }\n }\n```\n\nThis function looks scary, but it is not so hard as it seems. First of all it passes `nr` or number of the bit to the `IS_IMMEDIATE` macro which just calls the GCC internal `__builtin_constant_p` function:\n\n```C\n#define IS_IMMEDIATE(nr)    (__builtin_constant_p(nr))\n```\n\n`__builtin_constant_p` checks that given parameter is known constant at compile-time. As our `cpu` is not compile-time constant, the `else` clause will be executed:\n\n```C\nasm volatile(LOCK_PREFIX \"bts %1,%0\" : BITOP_ADDR(addr) : \"Ir\" (nr) : \"memory\");\n```\n\nLet's try to understand how it works step by step:\n\n`LOCK_PREFIX` is a x86 `lock` instruction. This instruction tells the cpu to occupy the system bus while the instruction(s) will be executed. This allows the CPU to synchronize memory access, preventing simultaneous access of multiple processors (or devices - the DMA controller for example) to one memory cell.\n\n`BITOP_ADDR` casts the given parameter to the `(*(volatile long *)` and adds `+m` constraints. `+` means that this operand is both read and written by the instruction. `m` shows that this is a memory operand. `BITOP_ADDR` is defined as:\n\n```C\n#define BITOP_ADDR(x) \"+m\" (*(volatile long *) (x))\n```\n\nNext is the `memory` clobber. 
It tells the compiler that the assembly code performs memory reads or writes to items other than those listed in the input and output operands (for example, accessing the memory pointed to by one of the input parameters).\n\n`Ir` - the operand may be either an immediate integer constant (`I`) or a general-purpose register (`r`).\n\n\nThe `bts` instruction sets a given bit in a bit string and stores the previous value of that bit in the `CF` flag. So we passed the cpu number which is zero in our case and after `set_bit` is executed, it sets the zero bit in the `cpu_online_bits` cpumask. It means that the first cpu is online at this moment.\n\nBesides the `set_cpu_*` API, cpumask of course provides another API for cpumasks manipulation. Let's consider it in short.\n\nAdditional cpumask API\n--------------------------------------------------------------------------------\n\ncpumask provides a set of macros for getting the numbers of CPUs in various states. For example:\n\n```C\n#define num_online_cpus()\tcpumask_weight(cpu_online_mask)\n```\n\nThis macro returns the amount of `online` CPUs. It calls the `cpumask_weight` function with the `cpu_online_mask` bitmap (read about it). The `cpumask_weight` function makes one call of the `bitmap_weight` function with two parameters:\n\n* cpumask bitmap;\n* `nr_cpumask_bits` - which is `NR_CPUS` in our case.\n\n```C\nstatic inline unsigned int cpumask_weight(const struct cpumask *srcp)\n{\n\treturn bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);\n}\n```\n\nand calculates the number of set bits in the given bitmap. 
Besides the `num_online_cpus`, cpumask provides macros for the all CPU states:\n\n* num_possible_cpus;\n* num_active_cpus;\n* cpu_online;\n* cpu_possible.\n\nand many more.\n\nBesides that the Linux kernel provides the following API for the manipulation of `cpumask`:\n\n* `for_each_cpu` - iterates over every cpu in a mask;\n* `for_each_cpu_not` - iterates over every cpu in a complemented mask;\n* `cpumask_clear_cpu` - clears a cpu in a cpumask;\n* `cpumask_test_cpu` - tests a cpu in a mask;\n* `cpumask_setall` - set all cpus in a mask;\n* `cpumask_size` - returns size to allocate for a 'struct cpumask' in bytes;\n\nand many many more...\n\nLinks\n--------------------------------------------------------------------------------\n\n* [cpumask documentation](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)\n"
  },
  {
    "path": "Concepts/linux-cpu-3.md",
    "content": "The initcall mechanism\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nAs you may understand from the title, this part will cover an interesting and important concept in the Linux kernel which is called `initcall`. We already saw definitions like these:\n\n```C\nearly_param(\"debug\", debug_kernel);\n```\n\nor\n\n```C\narch_initcall(init_pit_clocksource);\n```\n\nin some parts of the Linux kernel. Before we will see how this mechanism is implemented in the Linux kernel, we must know actually what is it and how the Linux kernel uses it. Definitions like these represent a [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) function which is called during Linux kernel initialization. Actually the main point of the `initcall` mechanism is to determine correct order of the built-in modules and subsystems initialization. For example let's look at the following function:\n\n```C\nstatic int __init nmi_warning_debugfs(void)\n{\n    debugfs_create_u64(\"nmi_longest_ns\", 0644,\n                       arch_debugfs_dir, &nmi_longest_ns);\n    return 0;\n}\nfs_initcall(nmi_warning_debugfs);\n```\n\nfrom the [arch/x86/kernel/nmi.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/nmi.c) source code file. As we may see it just creates the `nmi_longest_ns` [debugfs](https://en.wikipedia.org/wiki/Debugfs) file in the `arch_debugfs_dir` directory. Actually, this `debugfs` file may be created only after the `arch_debugfs_dir` will be created. Creation of this directory occurs during the architecture-specific initialization of the Linux kernel. 
Actually this directory will be created in the `arch_kdebugfs_init` function from the [arch/x86/kernel/kdebugfs.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/kdebugfs.c) source code file. Note that the `arch_kdebugfs_init` function is marked as `initcall` too:\n\n```C\narch_initcall(arch_kdebugfs_init);\n```\n\nThe Linux kernel calls all architecture-specific `initcalls` before the `fs` related `initcalls`. So, our `nmi_longest_ns` file will be created only after the `arch_debugfs_dir` directory is created. Actually, the Linux kernel provides eight levels of main `initcalls`:\n\n* `early`;\n* `core`;\n* `postcore`;\n* `arch`;\n* `subsys`;\n* `fs`;\n* `device`;\n* `late`.\n\nAll of their names are represented by the `initcall_level_names` array which is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file:\n\n```C\nstatic char *initcall_level_names[] __initdata = {\n\t\"early\",\n\t\"core\",\n\t\"postcore\",\n\t\"arch\",\n\t\"subsys\",\n\t\"fs\",\n\t\"device\",\n\t\"late\",\n};\n```\n\nAll functions which are marked as `initcall` by these identifiers, will be called in the same order presented in the `initcall_level_names` array, in other words, at first `early initcalls` will be called, then `core initcalls` and so forth. From this moment we know a little about `initcall` mechanism, so we can start to dive into the source code of the Linux kernel to see how this mechanism is implemented.\n\nImplementation initcall mechanism in the Linux kernel\n--------------------------------------------------------------------------------\n\nThe Linux kernel provides a set of macros from the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file to mark a given function as `initcall`. 
All of these macros are pretty simple:\n\n```C\n#define early_initcall(fn)\t\t__define_initcall(fn, early)\n#define core_initcall(fn)\t\t__define_initcall(fn, 1)\n#define postcore_initcall(fn)\t\t__define_initcall(fn, 2)\n#define arch_initcall(fn)\t\t__define_initcall(fn, 3)\n#define subsys_initcall(fn)\t\t__define_initcall(fn, 4)\n#define fs_initcall(fn)\t\t\t__define_initcall(fn, 5)\n#define device_initcall(fn)\t\t__define_initcall(fn, 6)\n#define late_initcall(fn)\t\t__define_initcall(fn, 7)\n```\n\nand as we may see these macros just expand to the call of the `__define_initcall` macro from the same header file. Moreover, the `__define_initcall` macro takes two arguments:\n\n* `fn` - callback function which will be called during call of `initcalls` of the certain level;\n* `id` - identifier to identify `initcall` to prevent error when two the same `initcalls` point to the same handler.\n\nThe implementation of the `__define_initcall` macro looks like:\n\n```C\n#define __define_initcall(fn, id) \\\n\tstatic initcall_t __initcall_##fn##id __used \\\n\t__attribute__((__section__(\".initcall\" #id \".init\"))) = fn; \\\n\tLTO_REFERENCE_INITCALL(__initcall_##fn##id)\n```\n\nTo understand the `__define_initcall` macro, first of all let's look at the `initcall_t` type. This type is defined in the same [header](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) file and it represents pointer to a function which returns [integer](https://en.wikipedia.org/wiki/Integer) which will be result of the `initcall`:\n\n```C\ntypedef int (*initcall_t)(void);\n```\n\nNow let's return to the `__define_initcall` macro. The [##](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html) provides ability to concatenate two symbols. 
In our case, the first line of the `__define_initcall` macro produces the definition of a given function, `__initcall_<function-name>_<id>`, which is located in the `.initcall <id> .init` [ELF section](http://www.skyfree.org/linux/references/ELF_Format.pdf) and marked with the `__used` attribute (see below). If we look at [include/asm-generic/vmlinux.lds.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/asm-generic/vmlinux.lds.h) header file, which represents data for the kernel [linker](https://en.wikipedia.org/wiki/Linker_%28computing%29) script, we will see that all of `initcalls` sections will be placed in the `.data` section:\n\n```C\n#define INIT_CALLS\t\t\t\t\t\\\n\t\tVMLINUX_SYMBOL(__initcall_start) = .;\t\\\n\t\t*(.initcallearly.init)\t\t\t\t\t\\\n\t\tINIT_CALLS_LEVEL(0)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(1)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(2)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(3)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(4)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(5)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(rootfs)\t\t\t\t\\\n\t\tINIT_CALLS_LEVEL(6)\t\t\t\t\t    \\\n\t\tINIT_CALLS_LEVEL(7)\t\t\t\t\t    \\\n\t\tVMLINUX_SYMBOL(__initcall_end) = .;\n\n#define INIT_DATA_SECTION(initsetup_align)\t\\\n\t.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {\t   \\\n        ...                                                \\\n        INIT_CALLS\t\t\t\t\t\t                   \\\n        ...                                                
\\\n\t}\n\n```\n\nand their names are going to be as follows (got from System.map):\n\n```\n...\nffffffff8320ce60 t __initcall_arch_kdebugfs_init3\n...\nffffffff8320d0e0 t __initcall_nmi_warning_debugfs5\n...\n```\n\nThe attribute `__used` is defined in the [include/linux/compiler-gcc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/compiler-gcc.h) header file and it expands to the definition of the following [`gcc`](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) attribute:\n\n```C\n#define __used   __attribute__((__used__))\n```\n\nwhich prevents `variable defined but not used` warning. The last line of the `__define_initcall` macro is:\n\n```C\nLTO_REFERENCE_INITCALL(__initcall_##fn##id)\n```\n\ndepends on the `CONFIG_LTO` kernel configuration option and just provides stub for the compiler [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization):\n\n```\n#ifdef CONFIG_LTO\n#define LTO_REFERENCE_INITCALL(x) \\\n        static __used __exit void *reference_##x(void)  \\\n        {                                               \\\n                return &x;                              \\\n        }\n#else\n#define LTO_REFERENCE_INITCALL(x)\n#endif\n```\n\nIn order to prevent any problem when there is no reference to a variable in a module, it will be moved to the end of the program. That's all about the `__define_initcall` macro. So, all of the `*_initcall` macros will be expanded during compilation of the Linux kernel, and all `initcalls` will be placed in their sections and all of them will be available from the `.data` section and the Linux kernel will know where to find a certain `initcall` to call it during initialization process.\n\nAs `initcalls` can be called by the Linux kernel, let's look how the Linux kernel does this. 
This process starts in the `do_basic_setup` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file:\n\n```C\nstatic void __init do_basic_setup(void)\n{\n    ...\n    ...\n    ...\n   \tdo_initcalls();\n    ...\n    ...\n    ...\n}\n```\n\nwhich is called during the initialization of the Linux kernel, right after main steps of initialization like memory manager related initialization, `CPU` subsystem and others are already finished. The `do_initcalls` function just goes through the array of `initcall` levels and call the `do_initcall_level` function for each level:\n\n```C\nstatic void __init do_initcalls(void)\n{\n\tint level;\n\n\tfor (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)\n\t\tdo_initcall_level(level);\n}\n```\n\nThe `initcall_levels` array is defined in the same source code [file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and contains pointers to the sections which were defined in the `__define_initcall` macro:\n\n```C\nstatic initcall_t *initcall_levels[] __initdata = {\n\t__initcall0_start,\n\t__initcall1_start,\n\t__initcall2_start,\n\t__initcall3_start,\n\t__initcall4_start,\n\t__initcall5_start,\n\t__initcall6_start,\n\t__initcall7_start,\n\t__initcall_end,\n};\n```\n\nIf you are interested, you can find these sections in the `arch/x86/kernel/vmlinux.lds` linker script which is generated after the Linux kernel compilation:\n\n```\n.init.data : AT(ADDR(.init.data) - 0xffffffff80000000) {\n    ...\n    ...\n    ...\n    ...\n    __initcall_start = .;\n    *(.initcallearly.init)\n    __initcall0_start = .;\n    *(.initcall0.init)\n    *(.initcall0s.init)\n    __initcall1_start = .;\n    ...\n    ...\n}\n```\n\nIf you are not familiar with this then you can know more about [linkers](https://en.wikipedia.org/wiki/Linker_%28computing%29) in the special 
[part](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3) of this book.\n\nAs we just saw, the `do_initcall_level` function takes one parameter - level of `initcall` - and does the following two things:\n\n* parses the `initcall_command_line` which is copy of usual kernel [command line](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst) which may contain parameters for modules with the `parse_args` function from the [kernel/params.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/params.c) source code file;\n\n* calls the `do_one_initcall` function for each `initcall` of the given level:\n\n```C\nfor (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)\n\t\tdo_one_initcall(*fn);\n```\n\nThe `do_one_initcall` does the main job for us. As we may see, this function takes one parameter which represents the `initcall` callback function and does the call of the given callback:\n\n```C\nint __init_or_module do_one_initcall(initcall_t fn)\n{\n\tint count = preempt_count();\n\tint ret;\n\tchar msgbuf[64];\n\n\tif (initcall_blacklisted(fn))\n\t\treturn -EPERM;\n\n\tif (initcall_debug)\n\t\tret = do_one_initcall_debug(fn);\n\telse\n\t\tret = fn();\n\n\tmsgbuf[0] = 0;\n\n\tif (preempt_count() != count) {\n\t\tsprintf(msgbuf, \"preemption imbalance \");\n\t\tpreempt_count_set(count);\n\t}\n\tif (irqs_disabled()) {\n\t\tstrlcat(msgbuf, \"disabled interrupts \", sizeof(msgbuf));\n\t\tlocal_irq_enable();\n\t}\n\tWARN(msgbuf[0], \"initcall %pF returned with %s\\n\", fn, msgbuf);\n\n\treturn ret;\n}\n```\n\nLet's try to understand what the `do_one_initcall` function does. First of all we save the current value of the [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) counter so that we can check it later to be sure that it is not imbalanced. 
After this step we can see the call of the `initcall_blacklisted` function which goes over the `blacklisted_initcalls` list which stores blacklisted `initcalls` and releases the given `initcall` if it is located in this list:\n\n```C\nlist_for_each_entry(entry, &blacklisted_initcalls, next) {\n\tif (!strcmp(fn_name, entry->buf)) {\n\t\tpr_debug(\"initcall %s blacklisted\\n\", fn_name);\n\t\tkfree(fn_name);\n\t\treturn true;\n\t}\n}\n```\n\nThis blacklist is filled during early Linux kernel initialization from the Linux kernel command line.\n\nAfter the blacklisted `initcalls` are handled, the next part of code directly calls the `initcall` callback:\n\n```C\nif (initcall_debug)\n\tret = do_one_initcall_debug(fn);\nelse\n\tret = fn();\n```\n\n`initcall_debug` variable defines if the call should be handled through the debug codepath (with more information being printed to the [kernel log buffer](https://en.wikipedia.org/wiki/Dmesg)) or not, where the callback will finally be executed. The `initcall_debug` variable is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file:\n\n```C\nbool initcall_debug;\n```\n\nThe value of the variable can be set from the kernel commands via the `initcall_debug` parameter, as we can read from the [documentation](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst) of the Linux kernel command line:\n\n```\ninitcall_debug\t[KNL] Trace initcalls as they are executed.  Useful\n                      for working out where the kernel is dying during\n                      startup.\n```\n\nAnd that's true. If we will look at the implementation of the `do_one_initcall_debug` function, we will see that it does the same as the `do_one_initcall` function, i.e. 
the `do_one_initcall_debug` function calls the given `initcall` and prints some information (like the [pid](https://en.wikipedia.org/wiki/Process_identifier) of the currently running task, duration of execution of the `initcall`, etc.) related to the execution of the given `initcall`:\n\n```C\nstatic int __init_or_module do_one_initcall_debug(initcall_t fn)\n{\n\tktime_t calltime, delta, rettime;\n\tunsigned long long duration;\n\tint ret;\n\n\tprintk(KERN_DEBUG \"calling  %pF @ %i\\n\", fn, task_pid_nr(current));\n\tcalltime = ktime_get();\n\tret = fn();\n\trettime = ktime_get();\n\tdelta = ktime_sub(rettime, calltime);\n\tduration = (unsigned long long) ktime_to_ns(delta) >> 10;\n\tprintk(KERN_DEBUG \"initcall %pF returned %d after %lld usecs\\n\",\n\t\t fn, ret, duration);\n\n\treturn ret;\n}\n```\n\nAfter the `initcall` has been called, either directly by the `do_one_initcall` function or through the `do_one_initcall_debug` function, we may see two checks at the end of the `do_one_initcall` function. The first one checks the amount of possible `__preempt_count_add` and `__preempt_count_sub` calls inside of the executed initcall, and if this value is not equal to the previous value of the preemptible counter, we add the `preemption imbalance` string to the message buffer and set correct value of the preemptible counter:\n\n```C\nif (preempt_count() != count) {\n\tsprintf(msgbuf, \"preemption imbalance \");\n\tpreempt_count_set(count);\n}\n```\n\nThe last check tests the state of local [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) and if they are disabled, we add the `disabled interrupts` string to the message buffer and enable `IRQs` for the current processor to make sure that `IRQs` are enabled after each `initcall` is completed (in case the callback disabled them and didn't enable them before exiting):\n\n```C\nif (irqs_disabled()) {\n\tstrlcat(msgbuf, \"disabled interrupts \", sizeof(msgbuf));\n\tlocal_irq_enable();\n}\n```\n\nThat's all. 
In this way the Linux kernel does initialization of many subsystems in a correct order. From now on, we know what is the `initcall` mechanism in the Linux kernel. In this part, we covered main general portion of the `initcall` mechanism but we left some important concepts. Let's make a short look at these concepts.\n\nFirst of all, we have missed one level of `initcalls`, this is `rootfs initcalls`. You can find definition of the `rootfs_initcall` in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file along with all similar macros which we saw in this part:\n\n```C\n#define rootfs_initcall(fn)\t\t__define_initcall(fn, rootfs)\n```\n\nAs we may understand from the macro's name, its main purpose is to store callbacks which are related to the [rootfs](https://en.wikipedia.org/wiki/Initramfs). Besides this goal, it may be useful to initialize other components after initialization related to filesystems level was already done, but before devices related initcalls. For example, the decompression of the [initramfs](https://en.wikipedia.org/wiki/Initramfs) which occurred in the `populate_rootfs` function from the [init/initramfs.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/initramfs.c) source code file:\n\n```C\nrootfs_initcall(populate_rootfs);\n```\n\nFrom this place, we may see familiar output:\n\n```\n[    0.199960] Unpacking initramfs...\n```\n\nBesides the `rootfs_initcall` level, there are additional `console_initcall`, `security_initcall` and other secondary `initcall` levels. The last thing that we have missed is the set of the `*_initcall_sync` levels. 
Almost each `*_initcall` macro that we have seen in this part has a companion macro with the `_sync` suffix:\n\n```C\n#define core_initcall_sync(fn)\t\t__define_initcall(fn, 1s)\n#define postcore_initcall_sync(fn)\t__define_initcall(fn, 2s)\n#define arch_initcall_sync(fn)\t\t__define_initcall(fn, 3s)\n#define subsys_initcall_sync(fn)\t__define_initcall(fn, 4s)\n#define fs_initcall_sync(fn)\t\t__define_initcall(fn, 5s)\n#define device_initcall_sync(fn)\t__define_initcall(fn, 6s)\n#define late_initcall_sync(fn)\t\t__define_initcall(fn, 7s)\n```\n\nThe main goal of these additional levels is to wait for completion of all modules related initialization routines for a certain level.\n\nAnother point worthy of mention is the `module_init(x)` macro, defined at [include/linux/module.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/module.h) as:\n\n```C\n#define module_init(x)   __initcall(x);\n```\n\nIf we follow and check what's the definition of `__initcall(x)` at [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) we can see that it's being set as a `device_initcall`:\n\n```C\n#define __initcall(fn) device_initcall(fn)\n```\n\nWith that we can conclude that when a function set as `__init` of certain module isn't explicitly added to a specific initcall category, but using `module_init()` macro, it is added to device initcall list by default.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIn this part we saw the important mechanism of the Linux kernel which allows to call a function which depends on the current state of the Linux kernel during its initialization.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create 
[issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29)\n* [debugfs](https://en.wikipedia.org/wiki/Debugfs)\n* [integer type](https://en.wikipedia.org/wiki/Integer)\n* [symbols concatenation](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html)\n* [GCC](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n* [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization)\n* [Introduction to linkers](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3)\n* [Linux kernel command line](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)\n* [Process identifier](https://en.wikipedia.org/wiki/Process_identifier)\n* [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [rootfs](https://en.wikipedia.org/wiki/Initramfs)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n"
  },
  {
    "path": "Concepts/linux-cpu-4.md",
    "content": "Notification Chains in Linux Kernel\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThe Linux kernel is huge piece of [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) code which consists from many different subsystems. Each subsystem has its own purpose which is independent of other subsystems. But often one subsystem wants to know something from other subsystem(s). There is special mechanism in the Linux kernel which allows to solve this problem partly. The name of this mechanism is - `notification chains` and its main purpose to provide a way for different subsystems to subscribe on asynchronous events from other subsystems. Note that this mechanism is only for communication inside kernel, but there are other mechanisms for communication between kernel and userspace.\n\nBefore we consider `notification chains` [API](https://en.wikipedia.org/wiki/Application_programming_interface) and implementation of this API, let's look at `Notification chains` mechanism from theoretical side as we did it in other parts of this book. Everything which is related to `notification chains` mechanism is located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file and [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file. So let's open them and start to dive.\n\nNotification Chains related data structures\n--------------------------------------------------------------------------------\n\nLet's start to consider `notification chains` mechanism from related data structures. 
As I wrote above, main data structures should be located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, so the Linux kernel provides generic API which does not depend on certain architecture. In general, the `notification chains` mechanism represents a list (that's why it's named `chains`) of [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) functions which are will be executed when an event will be occurred.\n\nAll of these callback functions are represented as `notifier_fn_t` type in the Linux kernel:\n\n```C\ntypedef\tint (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data);\n```\n\nSo we may see that it takes three following arguments:\n\n* `nb` - is linked list of function pointers (will see it now);\n* `action` - is type of an event. A notification chain may support multiple events, so we need this parameter to distinguish an event from other events;\n* `data` - is storage for private information. Actually it allows to provide additional data information about an event.\n\nAdditionally we may see that `notifier_fn_t` returns an integer value. 
This integer value may be one of:\n\n* `NOTIFY_DONE` - subscriber is not interested in notification;\n* `NOTIFY_OK` - notification was processed correctly;\n* `NOTIFY_BAD` - something went wrong;\n* `NOTIFY_STOP` - notification is done, but no further callbacks should be called for this event.\n\nAll of these results are defined as macros in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file:\n\n```C\n#define NOTIFY_DONE\t\t0x0000\n#define NOTIFY_OK\t\t0x0001\n#define NOTIFY_BAD\t\t(NOTIFY_STOP_MASK|0x0002)\n#define NOTIFY_STOP\t\t(NOTIFY_OK|NOTIFY_STOP_MASK)\n```\n\nWhere `NOTIFY_STOP_MASK` is represented by the:\n\n```C\n#define NOTIFY_STOP_MASK\t0x8000\n```\n\nmacro and means that the remaining callbacks in the chain will not be called for the current notification.\n\nEach part of the Linux kernel which wants to be notified on a certain event should provide its own `notifier_fn_t` callback function. Main role of the `notification chains` mechanism is to call certain callbacks when an asynchronous event occurs.\n\nThe main building block of the `notification chains` mechanism is the `notifier_block` structure:\n\n```C\nstruct notifier_block {\n\tnotifier_fn_t notifier_call;\n\tstruct notifier_block __rcu *next;\n\tint priority;\n};\n```\n\nwhich is defined in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) file. This struct contains pointer to callback function - `notifier_call`, link to the next notification callback and `priority` of a callback function as functions with higher priority are executed first.\n\nThe Linux kernel provides notification chains of four following types:\n\n* Blocking notifier chains;\n* SRCU notifier chains;\n* Atomic notifier chains;\n* Raw notifier chains.\n\nLet's consider all of these types of notification chains by order:\n\nIn the first case for the `blocking notifier chains`, callbacks will be called/executed in process context. 
This means that the calls in a notification chain may be blocked.\n\nThe second `SRCU notifier chains` represent alternative form of `blocking notifier chains`. In the first case, blocking notifier chains uses `rw_semaphore` synchronization primitive to protect chain links. `SRCU` notifier chains run in process context too, but uses special form of [RCU](https://en.wikipedia.org/wiki/Read-copy-update) mechanism which is permissible to block in an read-side critical section.\n\nIn the third case for the `atomic notifier chains` runs in interrupt or atomic context and protected by [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) synchronization primitive. The last `raw notifier chains` provides special type of notifier chains without any locking restrictions on callbacks. This means that protection rests on the shoulders of caller side. It is very useful when we want to protect our chain with very specific locking mechanism.\n\nIf we will look at the implementation of the `notifier_block` structure, we will see that it contains pointer to the `next` element from a notification chain list, but we have no head. Actually a head of such list is in separate structure depends on type of a notification chain. For example for the `blocking notifier chains`:\n\n```C\nstruct blocking_notifier_head {\n\tstruct rw_semaphore rwsem;\n\tstruct notifier_block __rcu *head;\n};\n```\n\nor for `atomic notification chains`:\n\n```C\nstruct atomic_notifier_head {\n\tspinlock_t lock;\n\tstruct notifier_block __rcu *head;\n};\n```\n\nNow as we know a little about `notification chains` mechanism let's consider implementation of its API.\n\nNotification Chains\n--------------------------------------------------------------------------------\n\nUsually there are two sides in a publish/subscriber mechanisms. One side who wants to get notifications and other side(s) who generates these notifications. We will consider notification chains mechanism from both sides. 
We will consider `blocking notification chains` in this part, because other types of notification chains are similar to it and differ mostly in protection mechanisms.\n\nBefore a notification producer is able to produce notification, first of all it should initialize head of a notification chain. For example let's consider notification chains related to kernel [loadable modules](https://en.wikipedia.org/wiki/Loadable_kernel_module). If we will look in the [kernel/module.c](https://github.com/torvalds/linux/blob/master/kernel/module.c) source code file, we will see following definition:\n\n```C\nstatic BLOCKING_NOTIFIER_HEAD(module_notify_list);\n```\n\nwhich defines head for loadable modules blocking notifier chain. The `BLOCKING_NOTIFIER_HEAD` macro is defined in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file. The same header file provides the `BLOCKING_INIT_NOTIFIER_HEAD` macro, which initializes an already existing head at run time:\n\n```C\n#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {\t\\\n\t\tinit_rwsem(&(name)->rwsem);\t                            \\\n\t\t(name)->head = NULL;\t\t                            \\\n\t} while (0)\n```\n\nSo we may see that it takes a pointer to a head of a blocking notifier chain, initializes its read/write [semaphore](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) and sets head to `NULL`. Besides the `BLOCKING_INIT_NOTIFIER_HEAD` macro, the Linux kernel additionally provides `ATOMIC_INIT_NOTIFIER_HEAD`, `RAW_INIT_NOTIFIER_HEAD` macros and `srcu_init_notifier` function for initialization of atomic and other types of notification chains.\n\nAfter initialization of a head of a notification chain, a subsystem which wants to receive notification from the given notification chain should register with certain function which depends on the type of notification. 
If you will look in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, you will see the following four functions for this:\n\n```C\nextern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int raw_notifier_chain_register(struct raw_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n```\n\nAs I already wrote above, we will cover only blocking notification chains in this part, so let's consider implementation of the `blocking_notifier_chain_register` function. Implementation of this function is located in the [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file and as we may see the `blocking_notifier_chain_register` takes two parameters:\n\n* `nh` - head of a notification chain;\n* `nb` - notification descriptor.\n\nBefore we get to it, let's look at the simplest of these registration functions, `raw_notifier_chain_register`:\n\n```C\nint raw_notifier_chain_register(struct raw_notifier_head *nh,\n\t\tstruct notifier_block *n)\n{\n\treturn notifier_chain_register(&nh->head, n);\n}\n```\n\nAs we may see it just returns result of the `notifier_chain_register` function from the same source code file and as we may understand this function does all job for us. 
Definition of the `blocking_notifier_chain_register` function looks like this:\n\n```C\nint blocking_notifier_chain_register(struct blocking_notifier_head *nh,\n\t\tstruct notifier_block *n)\n{\n\tint ret;\n\n\tif (unlikely(system_state == SYSTEM_BOOTING))\n\t\treturn notifier_chain_register(&nh->head, n);\n\n\tdown_write(&nh->rwsem);\n\tret = notifier_chain_register(&nh->head, n);\n\tup_write(&nh->rwsem);\n\treturn ret;\n}\n```\n\nAs we may see implementation of the `blocking_notifier_chain_register` is pretty simple. First of all there is a check of the current system state, and if the system is still in the booting state we just call the `notifier_chain_register` directly. Otherwise we do the same call of the `notifier_chain_register`, but as you may see this call is protected with a read/write semaphore. Now let's look at the implementation of the `notifier_chain_register` function:\n\n```C\nstatic int notifier_chain_register(struct notifier_block **nl,\n\t\tstruct notifier_block *n)\n{\n\twhile ((*nl) != NULL) {\n\t\tif (n->priority > (*nl)->priority)\n\t\t\tbreak;\n\t\tnl = &((*nl)->next);\n\t}\n\tn->next = *nl;\n\trcu_assign_pointer(*nl, n);\n\treturn 0;\n}\n```\n\nThis function just inserts new `notifier_block` (given by a subsystem which wants to get notifications) to the notification chain list. 
Besides subscribing on an event, subscriber may unsubscribe from a certain events with the set of `unsubscribe` functions:\n\n```C\nextern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n\nextern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,\n\t\tstruct notifier_block *nb);\n```\n\nWhen a producer of notifications wants to notify subscribers about an event, the `*.notifier_call_chain` function will be called. As you already may guess each type of notification chains provides own function to produce notification:\n\n```C\nextern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,\n\t\tunsigned long val, void *v);\n\nextern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,\n\t\tunsigned long val, void *v);\n\nextern int raw_notifier_call_chain(struct raw_notifier_head *nh,\n\t\tunsigned long val, void *v);\n\nextern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,\n\t\tunsigned long val, void *v);\n```\n\nLet's consider implementation of the `blocking_notifier_call_chain` function. This function is defined in the [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file:\n\n```C\nint blocking_notifier_call_chain(struct blocking_notifier_head *nh,\n\t\tunsigned long val, void *v)\n{\n\treturn __blocking_notifier_call_chain(nh, val, v, -1, NULL);\n}\n```\n\nand as we may see it just returns result of the `__blocking_notifier_call_chain` function. 
As we may see, the `blocking_notifier_call_chain` takes three parameters:\n\n* `nh` - head of notification chain list;\n* `val` - type of a notification;\n* `v` -  input parameter which may be used by handlers.\n\nBut the `__blocking_notifier_call_chain` function takes five parameters:\n\n```C\nint __blocking_notifier_call_chain(struct blocking_notifier_head *nh,\n\t\t\t\t   unsigned long val, void *v,\n\t\t\t\t   int nr_to_call, int *nr_calls)\n{\n    ...\n    ...\n    ...\n}\n```\n\nWhere `nr_to_call` and `nr_calls` are number of notifier functions to be called and number of sent notifications. As you may guess the main goal of the `__blocking_notifier_call_chain` function and other functions for other notification types is to call callback function when an event occurs. Implementation of the `__blocking_notifier_call_chain` is pretty simple, it just calls the `notifier_call_chain` function from the same source code file protected with read/write semaphore:\n\n```C\nint __blocking_notifier_call_chain(struct blocking_notifier_head *nh,\n\t\t\t\t   unsigned long val, void *v,\n\t\t\t\t   int nr_to_call, int *nr_calls)\n{\n\tint ret = NOTIFY_DONE;\n\n\tif (rcu_access_pointer(nh->head)) {\n\t\tdown_read(&nh->rwsem);\n\t\tret = notifier_call_chain(&nh->head, val, v, nr_to_call,\n\t\t\t\t\tnr_calls);\n\t\tup_read(&nh->rwsem);\n\t}\n\treturn ret;\n}\n```\n\nand returns its result. In this case all job is done by the `notifier_call_chain` function. Main purpose of this function is to inform registered notifiers about an asynchronous event:\n\n```C\nstatic int notifier_call_chain(struct notifier_block **nl,\n\t\t\t       unsigned long val, void *v,\n\t\t\t       int nr_to_call, int *nr_calls)\n{\n    ...\n    ...\n    ...\n    ret = nb->notifier_call(nb, val, v);\n    ...\n    ...\n    ...\n    return ret;\n}\n```\n\nThat's all. 
In general all looks pretty simple.\n\nNow let's consider a simple example related to [loadable modules](https://en.wikipedia.org/wiki/Loadable_kernel_module). As we already saw in this part, if we look in the [kernel/module.c](https://github.com/torvalds/linux/blob/master/kernel/module.c), there is the:\n\n```C\nstatic BLOCKING_NOTIFIER_HEAD(module_notify_list);\n```\n\ndefinition of the `module_notify_list` in the [kernel/module.c](https://github.com/torvalds/linux/blob/master/kernel/module.c) source code file. This definition determines head of list of blocking notifier chains related to kernel modules. There are at least three following events:\n\n* MODULE_STATE_LIVE\n* MODULE_STATE_COMING\n* MODULE_STATE_GOING\n\nin which some subsystems of the Linux kernel may be interested. For example, tracing of kernel module states. Instead of direct call of the `atomic_notifier_chain_register`, `blocking_notifier_chain_register`, etc., most notification chains come with a set of wrappers used to register to them. 
Registration for these module events is done with the help of the following wrapper:\n\n```C\nint register_module_notifier(struct notifier_block *nb)\n{\n\treturn blocking_notifier_chain_register(&module_notify_list, nb);\n}\n```\n\nIf we will look in the [kernel/tracepoint.c](https://github.com/torvalds/linux/blob/master/kernel/tracepoint.c) source code file, we will see such registration during initialization of [tracepoints](https://www.kernel.org/doc/Documentation/trace/tracepoints.txt):\n\n```C\nstatic __init int init_tracepoints(void)\n{\n\tint ret;\n\n\tret = register_module_notifier(&tracepoint_module_nb);\n\tif (ret)\n\t\tpr_warn(\"Failed to register tracepoint module enter notifier\\n\");\n\n\treturn ret;\n}\n```\n\nWhere `tracepoint_module_nb` provides callback function:\n\n```C\nstatic struct notifier_block tracepoint_module_nb = {\n\t.notifier_call = tracepoint_module_notify,\n\t.priority = 0,\n};\n```\n\nThis callback function is called when one of the `MODULE_STATE_LIVE`, `MODULE_STATE_COMING` or `MODULE_STATE_GOING` events occurs. For example, the `MODULE_STATE_LIVE` and the `MODULE_STATE_COMING` notifications will be sent during execution of the [init_module](http://man7.org/linux/man-pages/man2/init_module.2.html) [system call](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-1). 
Or for example `MODULE_STATE_GOING` will be sent during execution of the [delete_module](http://man7.org/linux/man-pages/man2/delete_module.2.html) `system call`:\n\n```C\nSYSCALL_DEFINE2(delete_module, const char __user *, name_user,\n\t\tunsigned int, flags)\n{\n    ...\n    ...\n    ...\n    blocking_notifier_call_chain(&module_notify_list,\n\t\t\t\t     MODULE_STATE_GOING, mod);\n    ...\n    ...\n    ...\n}\n```\n\nThus when one of these system call will be called from userspace, the Linux kernel will send certain notification depending on a system call and the `tracepoint_module_notify` callback function will be called.\n\nThat's all.\n\nLinks\n--------------------------------------------------------------------------------\n\n* [C programming language](https://en.wikipedia.org/wiki/C_%28programming_language%29)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29)\n* [RCU](https://en.wikipedia.org/wiki/Read-copy-update)\n* [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1)\n* [loadable modules](https://en.wikipedia.org/wiki/Loadable_kernel_module)\n* [semaphore](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3)\n* [tracepoints](https://www.kernel.org/doc/Documentation/trace/tracepoints.txt)\n* [system call](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-1)\n* [init_module system call](http://man7.org/linux/man-pages/man2/init_module.2.html)\n* [delete_module](http://man7.org/linux/man-pages/man2/delete_module.2.html)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-3)\n"
  },
  {
    "path": "DataStructures/README.md",
    "content": "Data Structures in the Linux Kernel\n========================================================================\n\nLinux kernel provides different implementations of data structures like doubly linked list, B+ tree, priority heap and many many more.\n\nThis part considers the following data structures and algorithms:\n\n  * [Doubly linked list](linux-datastructures-1.md)\n  * [Radix tree](linux-datastructures-2.md)\n  * [Bit arrays](linux-datastructures-3.md)\n"
  },
  {
    "path": "DataStructures/linux-datastructures-1.md",
    "content": "Data Structures in the Linux Kernel\n================================================================================\n\nDoubly linked list\n--------------------------------------------------------------------------------\n\nLinux kernel provides its own implementation of doubly linked list, which you can find in the [include/linux/list.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/list.h). We will start `Data Structures in the Linux kernel` from the doubly linked list data structure. Why? Because it is very popular in the kernel, just try to [search](http://lxr.free-electrons.com/ident?i=list_head)\n\nFirst of all, let's look on the main structure in the [include/linux/types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h):\n\n```C\nstruct list_head {\n\tstruct list_head *next, *prev;\n};\n```\n\nYou can note that it is different from many implementations of doubly linked list which you have seen. For example, this doubly linked list structure from the [glib](http://www.gnu.org/software/libc/) library looks like :\n\n```C\nstruct GList {\n  gpointer data;\n  GList *next;\n  GList *prev;\n};\n```\n\nUsually a linked list structure contains a pointer to the item. The implementation of linked list in Linux kernel does not. So the main question is - `where does the list store the data?`. The actual implementation of linked list in the kernel is - `Intrusive list`. An intrusive linked list does not contain data in its nodes - A node just contains pointers to the next and previous node and list nodes part of the data that are added to the list. This makes the data structure generic, so it does not care about entry data type anymore.\n\nFor example:\n\n```C\nstruct nmi_desc {\n    spinlock_t lock;\n    struct list_head head;\n};\n```\n\nLet's look at some examples to understand how `list_head` is used in the kernel. 
As I already wrote about, there are many, really many different places where lists are used in the kernel. Let's look for an example in miscellaneous character drivers. Misc character drivers API from the [drivers/char/misc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/char/misc.c) is used for writing small drivers for handling simple hardware or virtual devices. Those drivers share same major number:\n\n```C\n#define MISC_MAJOR              10\n```\n\nbut have their own minor number. For example you can see it with:\n\n```\nls -l /dev |  grep 10\ncrw-------   1 root root     10, 235 Mar 21 12:01 autofs\ndrwxr-xr-x  10 root root         200 Mar 21 12:01 cpu\ncrw-------   1 root root     10,  62 Mar 21 12:01 cpu_dma_latency\ncrw-------   1 root root     10, 203 Mar 21 12:01 cuse\ndrwxr-xr-x   2 root root         100 Mar 21 12:01 dri\ncrw-rw-rw-   1 root root     10, 229 Mar 21 12:01 fuse\ncrw-------   1 root root     10, 228 Mar 21 12:01 hpet\ncrw-------   1 root root     10, 183 Mar 21 12:01 hwrng\ncrw-rw----+  1 root kvm      10, 232 Mar 21 12:01 kvm\ncrw-rw----   1 root disk     10, 237 Mar 21 12:01 loop-control\ncrw-------   1 root root     10, 227 Mar 21 12:01 mcelog\ncrw-------   1 root root     10,  59 Mar 21 12:01 memory_bandwidth\ncrw-------   1 root root     10,  61 Mar 21 12:01 network_latency\ncrw-------   1 root root     10,  60 Mar 21 12:01 network_throughput\ncrw-r-----   1 root kmem     10, 144 Mar 21 12:01 nvram\nbrw-rw----   1 root disk      1,  10 Mar 21 12:01 ram10\ncrw--w----   1 root tty       4,  10 Mar 21 12:01 tty10\ncrw-rw----   1 root dialout   4,  74 Mar 21 12:01 ttyS10\ncrw-------   1 root root     10,  63 Mar 21 12:01 vga_arbiter\ncrw-------   1 root root     10, 137 Mar 21 12:01 vhci\n```\n\nNow let's have a close look at how lists are used in the misc device drivers. 
First of all, let's look on `miscdevice` structure:\n\n```C\nstruct miscdevice\n{\n      int minor;\n      const char *name;\n      const struct file_operations *fops;\n      struct list_head list;\n      struct device *parent;\n      struct device *this_device;\n      const char *nodename;\n      mode_t mode;\n};\n```\n\nWe can see the fourth field in the `miscdevice` structure - `list` which is a list of registered devices. In the beginning of the source code file we can see the definition of misc_list:\n\n```C\nstatic LIST_HEAD(misc_list);\n```\n\nwhich expands to the definition of variables with `list_head` type:\n\n```C\n#define LIST_HEAD(name) \\\n\tstruct list_head name = LIST_HEAD_INIT(name)\n```\n\nand initializes it with the `LIST_HEAD_INIT` macro, which sets previous and next entries with the address of variable - name:\n\n```C\n#define LIST_HEAD_INIT(name) { &(name), &(name) }\n```\n\nNow let's look on the `misc_register` function which registers a miscellaneous device. At the start it initializes `miscdevice->list` with the `INIT_LIST_HEAD` function:\n\n```C\nINIT_LIST_HEAD(&misc->list);\n```\n\nwhich does the same as the `LIST_HEAD_INIT` macro:\n\n```C\nstatic inline void INIT_LIST_HEAD(struct list_head *list)\n{\n\tlist->next = list;\n\tlist->prev = list;\n}\n```\n\nIn the next step after a device is created by the `device_create` function, we add it to the miscellaneous devices list with:\n\n```\nlist_add(&misc->list, &misc_list);\n```\n\nKernel `list.h` provides this API for the addition of a new entry to the list. 
Let's look at its implementation:\n\n```C\nstatic inline void list_add(struct list_head *new, struct list_head *head)\n{\n\t__list_add(new, head, head->next);\n}\n```\n\nIt just calls internal function `__list_add` with the 3 given parameters:\n\n* new  - new entry.\n* head - list head after which the new item will be inserted.\n* head->next - next item after list head.\n\nImplementation of the `__list_add` is pretty simple:\n\n```C\nstatic inline void __list_add(struct list_head *new,\n\t\t\t      struct list_head *prev,\n\t\t\t      struct list_head *next)\n{\n\tnext->prev = new;\n\tnew->next = next;\n\tnew->prev = prev;\n\tprev->next = new;\n}\n```\n\nHere we add a new item between `prev` and `next`. So `misc` list which we defined at the start with the `LIST_HEAD_INIT` macro will contain previous and next pointers to the `miscdevice->list`.\n\nThere is still one question: how to get list's entry. There is a special macro:\n\n```C\n#define list_entry(ptr, type, member) \\\n\tcontainer_of(ptr, type, member)\n```\n\nwhich gets three parameters:\n\n* ptr - the structure list_head pointer;\n* type - structure type;\n* member - the name of the list_head within the structure;\n\nFor example:\n\n```C\nconst struct miscdevice *p = list_entry(v, struct miscdevice, list)\n```\n\nAfter this we can access to any `miscdevice` field with `p->minor` or `p->name` and etc... Let's look on the `list_entry` implementation:\n\n```C\n#define list_entry(ptr, type, member) \\\n\tcontainer_of(ptr, type, member)\n```\n\nAs we can see it just calls `container_of` macro with the same arguments. At first sight, the `container_of` looks strange:\n\n```C\n#define container_of(ptr, type, member) ({                      \\\n    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \\\n    (type *)( (char *)__mptr - offsetof(type,member) );})\n```\n\nFirst of all you can note that it consists of two expressions in curly brackets. 
The compiler will evaluate the whole block in the curly braces and use the value of the last expression.\n\nFor example:\n\n```\n#include <stdio.h>\n\nint main() {\n\tint i = 0;\n\tprintf(\"i = %d\\n\", ({++i; ++i;}));\n\treturn 0;\n}\n```\n\nwill print `2`.\n\nThe next point is `typeof`, it's simple. As you can understand from its name, it just returns the type of the given variable. When I first saw the implementation of the `container_of` macro, the strangest thing I found was the zero in the `((type *)0)` expression. Actually this pointer magic calculates the offset of the given field from the address of the structure, but as we have `0` here, it will be just a zero offset along with the field width. Let's look at a simple example:\n\n```C\n#include <stdio.h>\n\nstruct s {\n        int field1;\n        char field2;\n\t\tchar field3;\n};\n\nint main() {\n\tprintf(\"%p\\n\", &((struct s*)0)->field3);\n\treturn 0;\n}\n```\n\nwill print `0x5`.\n\nThe next `offsetof` macro calculates offset from the beginning of the structure to the given structure's field. Its implementation is very similar to the previous code:\n\n```C\n#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)\n```\n\nLet's summarize all about `container_of` macro. The `container_of` macro returns the address of the structure by the given address of the structure's field with `list_head` type, the name of the structure field with `list_head` type and type of the container structure. At the first line this macro declares the `__mptr` pointer which points to the field of the structure that `ptr` points to and assigns `ptr` to it. Now `ptr` and `__mptr` point to the same address. Technically we don't need this line but it's useful for type checking. The first line ensures that the given structure (`type` parameter) has a member called `member`. In the second line it calculates offset of the field from the structure with the `offsetof` macro and subtracts it from the structure address. 
That's all.\n\nOf course `list_add` and `list_entry` are not the only functions which `<linux/list.h>` provides. Implementation of the doubly linked list provides the following API:\n\n* list_add\n* list_add_tail\n* list_del\n* list_replace\n* list_move\n* list_is_last\n* list_empty\n* list_cut_position\n* list_splice\n* list_for_each\n* list_for_each_entry\n\nand many more.\n"
  },
  {
    "path": "DataStructures/linux-datastructures-2.md",
    "content": "Data Structures in the Linux Kernel\n================================================================================\n\nRadix tree\n--------------------------------------------------------------------------------\n\nAs you already know Linux kernel provides many different libraries and functions which implement different data structures and algorithms. In this part we will consider one of these data structures - [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). There are two files which are related to `radix tree` implementation and API in the linux kernel:\n\n* [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h)\n* [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c)\n\nLets talk about what a `radix tree` is. Radix tree is a `compressed trie` where a [trie](http://en.wikipedia.org/wiki/Trie) is a data structure which implements an interface of an associative array and allows to store values as `key-value`. The keys are usually strings, but any data type can be used. A trie is different from an `n-tree` because of its nodes. Nodes of a trie do not store keys; instead, a node of a trie stores single character labels. The key which is related to a given node is derived by traversing from the root of the tree to this node. 
For example:\n\n\n```\n               +-----------+\n               |           |\n               |    \" \"    |\n               |           |\n        +------+-----------+------+\n        |                         |\n        |                         |\n   +----v------+            +-----v-----+\n   |           |            |           |\n   |    g      |            |     c     |\n   |           |            |           |\n   +-----------+            +-----------+\n        |                         |\n        |                         |\n   +----v------+            +-----v-----+\n   |           |            |           |\n   |    o      |            |     a     |\n   |           |            |           |\n   +-----------+            +-----------+\n                                  |\n                                  |\n                            +-----v-----+\n                            |           |\n                            |     t     |\n                            |           |\n                            +-----------+\n```\n\nSo in this example, we can see the `trie` with keys, `go` and `cat`. The compressed trie or `radix tree` differs from `trie` in that all intermediates nodes which have only one child are removed.\n\nRadix tree in Linux kernel is the data structure which maps values to integer keys. 
It is represented by the following structures from the file [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h):\n\n```C\nstruct radix_tree_root {\n         unsigned int            height;\n         gfp_t                   gfp_mask;\n         struct radix_tree_node  __rcu *rnode;\n};\n```\n\nThis structure presents the root of a radix tree and contains three fields:\n\n* `height`   - height of the tree;\n* `gfp_mask` - tells how memory allocations will be performed;\n* `rnode`    - pointer to the child node.\n\nThe first field we will discuss is `gfp_mask`:\n\nLow-level kernel memory allocation functions take a set of flags as - `gfp_mask`, which describes how that allocation is to be performed. These `GFP_` flags which control the allocation process can have following values: (`GFP_NOIO` flag) means allocation can block but must not initiate disk I/O; (`__GFP_HIGHMEM` flag) means either ZONE_HIGHMEM or ZONE_NORMAL memory can be used; (`GFP_ATOMIC` flag) means the allocation is high-priority and must not sleep, etc.\n\n* `GFP_NOIO` - allocation can block but must not initiate disk I/O;\n* `__GFP_HIGHMEM` - either ZONE_HIGHMEM or ZONE_NORMAL can be used;\n* `GFP_ATOMIC` - allocation process is high-priority and must not sleep;\n\netc.\n\nThe next field is `rnode`:\n\n```C\nstruct radix_tree_node {\n        unsigned int    path;\n        unsigned int    count;\n        union {\n                struct {\n                        struct radix_tree_node *parent;\n                        void *private_data;\n                };\n                struct rcu_head rcu_head;\n        };\n        /* For tree user */\n        struct list_head private_list;\n        void __rcu      *slots[RADIX_TREE_MAP_SIZE];\n        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];\n};\n```\n\nThis structure contains information about the offset in a parent and height from the bottom, count of 
the child nodes and fields for accessing and freeing a node. These fields are described below:\n\n* `path` - offset in parent & height from the bottom;\n* `count` - count of the child nodes;\n* `parent` - pointer to the parent node;\n* `private_data` - used by the user of a tree;\n* `rcu_head` - used for freeing a node;\n* `private_list` - used by the user of a tree;\n\nThe two last fields of the `radix_tree_node` - `tags` and `slots` are important and interesting. Every node can contain a set of slots which store pointers to the data. Empty slots in the Linux kernel radix tree implementation store `NULL`. Radix trees in the linux kernel also support tags which are associated with the `tags` fields in the `radix_tree_node` structure. Tags allow individual bits to be set on records which are stored in the radix tree.\n\nNow that we know about radix tree structure, it is time to look at its API.\n\nLinux kernel radix tree API\n---------------------------------------------------------------------------------\n\nWe start from the data structure initialization. There are two ways to initialize a new radix tree. The first is to use `RADIX_TREE` macro:\n\n```C\nRADIX_TREE(name, gfp_mask);\n```\n\nAs you can see we pass the `name` parameter, so with the `RADIX_TREE` macro we can define and initialize radix tree with the given name. Implementation of the `RADIX_TREE` is easy:\n\n```C\n#define RADIX_TREE(name, mask) \\\n         struct radix_tree_root name = RADIX_TREE_INIT(mask)\n\n#define RADIX_TREE_INIT(mask)   { \\\n        .height = 0,              \\\n        .gfp_mask = (mask),       \\\n        .rnode = NULL,            \\\n}\n```\n\nAt the beginning of the `RADIX_TREE` macro we define instance of the `radix_tree_root` structure with the given name and call `RADIX_TREE_INIT` macro with the given mask. 
The `RADIX_TREE_INIT` macro just initializes `radix_tree_root` structure with the default values and the given mask.\n\nThe second way is to define `radix_tree_root` structure by hand and pass a pointer to it with mask to the `INIT_RADIX_TREE` macro:\n\n```C\nstruct radix_tree_root my_radix_tree;\nINIT_RADIX_TREE(&my_radix_tree, gfp_mask_for_my_radix_tree);\n```\n\nwhere:\n\n```C\n#define INIT_RADIX_TREE(root, mask)  \\\ndo {                                 \\\n        (root)->height = 0;          \\\n        (root)->gfp_mask = (mask);   \\\n        (root)->rnode = NULL;        \\\n} while (0)\n```\n\nmakes the same initialization with default values as the `RADIX_TREE_INIT` macro does.\n\nThe next are two functions for inserting and deleting records to/from a radix tree:\n\n* `radix_tree_insert`;\n* `radix_tree_delete`;\n\nThe first `radix_tree_insert` function takes three parameters:\n\n* root of a radix tree;\n* index key;\n* data to insert;\n\nThe `radix_tree_delete` function takes the same set of parameters as the `radix_tree_insert`, but without data.\n\nSearching through a radix tree is implemented in three ways:\n\n* `radix_tree_lookup`;\n* `radix_tree_gang_lookup`;\n* `radix_tree_lookup_slot`.\n\nThe first `radix_tree_lookup` function takes two parameters:\n\n* root of a radix tree;\n* index key;\n\nThis function tries to find the given key in the tree and return the record associated with this key. The second `radix_tree_gang_lookup` function has the following signature:\n\n```C\nunsigned int radix_tree_gang_lookup(struct radix_tree_root *root,\n                                    void **results,\n                                    unsigned long first_index,\n                                    unsigned int max_items);\n```\n\nand returns number of records, sorted by the keys, starting from the first index. 
Number of the returned records will not be greater than `max_items` value.\n\nAnd the last `radix_tree_lookup_slot` function will return the slot which will contain the data.\n\nLinks\n---------------------------------------------------------------------------------\n\n* [Radix tree](http://en.wikipedia.org/wiki/Radix_tree)\n* [Trie](http://en.wikipedia.org/wiki/Trie)\n"
  },
  {
    "path": "DataStructures/linux-datastructures-3.md",
    "content": "Data Structures in the Linux Kernel\n================================================================================\n\nBit arrays and bit operations in the Linux kernel\n--------------------------------------------------------------------------------\n\nBesides different [linked](https://en.wikipedia.org/wiki/Linked_data_structure) and [tree](https://en.wikipedia.org/wiki/Tree_%28data_structure%29) based data structures, the Linux kernel provides [API](https://en.wikipedia.org/wiki/Application_programming_interface) for [bit arrays](https://en.wikipedia.org/wiki/Bit_array) or `bitmap`. Bit arrays are heavily used in the Linux kernel and following source code files contain common `API` for work with such structures:\n\n* [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c)\n* [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h)\n\nBesides these two files, there is also architecture-specific header file which provides optimized bit operations for certain architecture. We consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so in our case it will be:\n\n* [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h)\n\nheader file. As I just wrote above, the `bitmap` is heavily used in the Linux kernel. 
For example a `bit array` is used to store set of online/offline processors for systems which support [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) CPU (more about this you can read in the [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part), a `bit array` stores set of allocated [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) during initialization of the Linux kernel, etc.\n\nSo, the main goal of this part is to see how `bit arrays` are implemented in the Linux kernel. Let's start.\n\nDeclaration of bit array\n================================================================================\n\nBefore we look at the `API` for bitmap manipulation, we must know how to declare a bitmap in the Linux kernel. There are two common methods to declare your own bit array. The first simple way to declare a bit array is to define an array of `unsigned long`. For example:\n\n```C\nunsigned long my_bitmap[8];\n```\n\nThe second way is to use the `DECLARE_BITMAP` macro which is defined in the [include/linux/types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h) header file:\n\n```C\n#define DECLARE_BITMAP(name,bits) \\\n    unsigned long name[BITS_TO_LONGS(bits)]\n```\n\nWe can see that `DECLARE_BITMAP` macro takes two parameters:\n\n* `name` - name of bitmap;\n* `bits` - amount of bits in bitmap;\n\nand just expands to the definition of `unsigned long` array with `BITS_TO_LONGS(bits)` elements, where the `BITS_TO_LONGS` macro converts a given number of bits to number of `longs` or in other words it calculates how many `8`-byte elements are needed to hold `bits` bits:\n\n```C\n#define BITS_PER_BYTE           8\n#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))\n#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))\n```\n\nSo, for example `DECLARE_BITMAP(my_bitmap, 64)` will produce:\n\n```python\n>>> (((64) + (64) - 1) / 
(64))\n1\n```\n\nand:\n\n```C\nunsigned long my_bitmap[1];\n```\n\nAfter we are able to declare a bit array, we can start to use it.\n\nArchitecture-specific bit operations\n================================================================================\n\nWe already saw above a couple of source code and header files which provide [API](https://en.wikipedia.org/wiki/Application_programming_interface) for manipulation of bit arrays. The most important and widely used API of bit arrays is architecture-specific and located as we already know in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file.\n\nFirst of all let's look at the two most important functions:\n\n* `set_bit`;\n* `clear_bit`.\n\nI think that there is no need to explain what these function do. This is already must be clear from their name. Let's look on their implementation. If you will look into the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, you will note that each of these functions represented by two variants: [atomic](https://en.wikipedia.org/wiki/Linearizability) and not. Before we will start to dive into implementations of these functions, first of all we must to know a little about `atomic` operations.\n\nIn simple words atomic operations guarantees that two or more operations will not be performed on the same data concurrently. The `x86` architecture provides a set of atomic instructions, for example [xchg](http://x86.renejeschke.de/html/file_module_x86_id_328.html) instruction, [cmpxchg](http://x86.renejeschke.de/html/file_module_x86_id_41.html) instruction and etc. Besides atomic instructions, some of non-atomic instructions can be made atomic with the help of the [lock](http://x86.renejeschke.de/html/file_module_x86_id_159.html) instruction. 
It is enough to know about atomic operations for now, so we can begin to consider implementation of `set_bit` and `clear_bit` functions.\n\nFirst of all, let's start to consider `non-atomic` variants of these functions. Names of non-atomic `set_bit` and `clear_bit` start with double underscore. As we already know, all of these functions are defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and the first function is `__set_bit`:\n\n```C\nstatic inline void __set_bit(long nr, volatile unsigned long *addr)\n{\n\tasm volatile(\"bts %1,%0\" : ADDR : \"Ir\" (nr) : \"memory\");\n}\n```\n\nAs we can see it takes two arguments:\n\n* `nr` - number of bit in a bit array.\n* `addr` - address of a bit array where we need to set bit.\n\nNote that the `addr` parameter is defined with `volatile` keyword which tells the compiler that the value at the given address may be changed. The implementation of the `__set_bit` is pretty easy. As we can see, it just contains one line of [inline assembler](https://en.wikipedia.org/wiki/Inline_assembler) code. In our case we are using the [bts](http://x86.renejeschke.de/html/file_module_x86_id_25.html) instruction which selects a bit which is specified with the first operand (`nr` in our case) from the bit array, stores the value of the selected bit in the [CF](https://en.wikipedia.org/wiki/FLAGS_register) flags register and sets this bit.\n\nNote that we can see usage of the `nr`, but there is no `addr` here. You already might guess that the secret is in `ADDR`. The `ADDR` is the macro which is defined in the same header code file and expands to the string which contains value of the given address and `+m` constraint:\n\n```C\n#define ADDR\t\t\t\tBITOP_ADDR(addr)\n#define BITOP_ADDR(x) \"+m\" (*(volatile long *) (x))\n```\n\nBesides the `+m`, we can see other constraints in the `__set_bit` function. 
Let's look at them and try to understand what they mean:\n\n* `+m` - represents memory operand where `+` tells that the given operand will be input and output operand;\n* `I` - represents integer constant;\n* `r` - represents register operand\n\nBesides these constraints, we can also see the `memory` keyword which tells the compiler that this code will change value in memory. That's all. Now let's look at the `atomic` variant of the same function. It looks more complex than its `non-atomic` variant:\n\n```C\nstatic __always_inline void\nset_bit(long nr, volatile unsigned long *addr)\n{\n\tif (IS_IMMEDIATE(nr)) {\n\t\tasm volatile(LOCK_PREFIX \"orb %1,%0\"\n\t\t\t: CONST_MASK_ADDR(nr, addr)\n\t\t\t: \"iq\" ((u8)CONST_MASK(nr))\n\t\t\t: \"memory\");\n\t} else {\n\t\tasm volatile(LOCK_PREFIX \"bts %1,%0\"\n\t\t\t: BITOP_ADDR(addr) : \"Ir\" (nr) : \"memory\");\n\t}\n}\n```\n\nFirst of all note that this function takes the same set of parameters as `__set_bit`, but is additionally marked with the `__always_inline` attribute. The `__always_inline` is a macro which is defined in the [include/linux/compiler-gcc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/compiler-gcc.h) and just expands to the `always_inline` attribute:\n\n```C\n#define __always_inline inline __attribute__((always_inline))\n```\n\nwhich means that this function will be always inlined to reduce size of the Linux kernel image. Now let's try to understand implementation of the `set_bit` function. First of all we check a given number of bit at the beginning of the `set_bit` function. 
The `IS_IMMEDIATE` macro is defined in the same [header](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) file and expands to the call of the builtin [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) function:\n\n```C\n#define IS_IMMEDIATE(nr)\t\t(__builtin_constant_p(nr))\n```\n\nThe `__builtin_constant_p` builtin function returns `1` if the given parameter is known to be constant at compile-time and returns `0` in other case. We do not need to use slow `bts` instruction to set bit if the given number of bit is a compile-time constant. We can just apply [bitwise or](https://en.wikipedia.org/wiki/Bitwise_operation#OR) to the byte at the given address which contains the given bit, using a mask in which only that bit is `1` and all other bits are zero. In other case if the given number of bit is not known constant at compile-time, we do the same as we did in the `__set_bit` function. The `CONST_MASK_ADDR` macro:\n\n```C\n#define CONST_MASK_ADDR(nr, addr)\tBITOP_ADDR((void *)(addr) + ((nr)>>3))\n```\n\nexpands to the given address with offset to the byte which contains a given bit. For example we have address `0x1000` and the number of bit is `0x9`. So, as `0x9` is `one byte + one bit` our address will be `addr + 1`:\n\n```python\n>>> hex(0x1000 + (0x9 >> 3))\n'0x1001'\n```\n\nThe `CONST_MASK` macro represents our given number of bit as a byte mask in which only the bit at position `nr & 7` is `1` and other bits are `0`:\n\n```C\n#define CONST_MASK(nr)\t\t\t(1 << ((nr) & 7))\n```\n\n```python\n>>> bin(1 << (0x9 & 7))\n'0b10'\n```\n\nIn the end we just apply bitwise `or` for these values. 
So, for example if our address will be `0x4097` and we need to set `0x9` bit:\n\n```python\n>>> bin(0x4097)\n'0b100000010010111'\n>>> bin((0x4097 >> 0x9) | (1 << (0x9 & 7)))\n'0b100010'\n```\n\nthe `ninth` bit will be set.\n\nNote that all of these operations are marked with `LOCK_PREFIX` which expands to the [lock](http://x86.renejeschke.de/html/file_module_x86_id_159.html) prefix which guarantees atomicity of this operation.\n\nAs we already know, besides the `set_bit` and `__set_bit` operations, the Linux kernel provides two inverse functions to clear bit in atomic and non-atomic context. They are `clear_bit` and `__clear_bit`. Both of these functions are defined in the same [header file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) and take the same set of arguments. But not only arguments are similar. Generally these functions are very similar to the `set_bit` and `__set_bit`. Let's look at the implementation of the non-atomic `__clear_bit` function:\n\n```C\nstatic inline void __clear_bit(long nr, volatile unsigned long *addr)\n{\n\tasm volatile(\"btr %1,%0\" : ADDR : \"Ir\" (nr));\n}\n```\n\nYes. As we see, it takes the same set of arguments and contains very similar block of inline assembler. It just uses the [btr](http://x86.renejeschke.de/html/file_module_x86_id_24.html) instruction instead of `bts`. As we can understand from the function's name, it clears a given bit at the given address. The `btr` instruction acts like `bts`. 
This instruction also selects a given bit which is specified in the first operand, stores its value in the `CF` flag register and clears this bit in the given bit array which is specified with second operand.\n\nThe atomic variant of the `__clear_bit` is `clear_bit`:\n\n```C\nstatic __always_inline void\nclear_bit(long nr, volatile unsigned long *addr)\n{\n\tif (IS_IMMEDIATE(nr)) {\n\t\tasm volatile(LOCK_PREFIX \"andb %1,%0\"\n\t\t\t: CONST_MASK_ADDR(nr, addr)\n\t\t\t: \"iq\" ((u8)~CONST_MASK(nr)));\n\t} else {\n\t\tasm volatile(LOCK_PREFIX \"btr %1,%0\"\n\t\t\t: BITOP_ADDR(addr)\n\t\t\t: \"Ir\" (nr));\n\t}\n}\n```\n\nand as we can see it is very similar to `set_bit` and just contains two differences. The first difference is that it uses `btr` instruction to clear bit whereas the `set_bit` uses `bts` instruction to set bit. The second difference is that it uses negated mask and `and` instruction to clear bit in the given byte whereas the `set_bit` uses `or` instruction.\n\nThat's all. Now we can set and clear bit in any bit array and we can go to other operations on bitmasks.\n\nMost widely used operations on a bit arrays are set and clear bit in a bit array in the Linux kernel. But besides these operations it is useful to do additional operations on a bit array. Yet another widely used operation in the Linux kernel - is to know if a given bit is set or not in a bit array. We can achieve this with the help of the `test_bit` macro. This macro is defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and expands to the call of the `constant_test_bit` or `variable_test_bit` depending on bit number:\n\n```C\n#define test_bit(nr, addr)\t\t\t\\\n\t(__builtin_constant_p((nr))                 \\\n\t ? 
constant_test_bit((nr), (addr))\t        \\\n\t : variable_test_bit((nr), (addr)))\n```\n\nSo, if the `nr` is known in compile time constant, the `test_bit` will be expanded to the call of the `constant_test_bit` function or `variable_test_bit` in other case. Now let's look at implementations of these functions. Let's start from the `variable_test_bit`:\n\n```C\nstatic inline int variable_test_bit(long nr, volatile const unsigned long *addr)\n{\n\tint oldbit;\n\n\tasm volatile(\"bt %2,%1\\n\\t\"\n\t\t     \"sbb %0,%0\"\n\t\t     : \"=r\" (oldbit)\n\t\t     : \"m\" (*(unsigned long *)addr), \"Ir\" (nr));\n\n\treturn oldbit;\n}\n```\n\nThe `variable_test_bit` function takes similar set of arguments as `set_bit` and other function take. We also may see inline assembly code here which executes [bt](http://x86.renejeschke.de/html/file_module_x86_id_22.html) and [sbb](http://x86.renejeschke.de/html/file_module_x86_id_286.html) instruction. The `bt` or `bit test` instruction selects a given bit which is specified with first operand from the bit array which is specified with the second operand and stores its value in the [CF](https://en.wikipedia.org/wiki/FLAGS_register) bit of flags register. The second `sbb` instruction subtracts first operand from second and subtracts value of the `CF`. 
So, here we write a value of a given bit number from a given bit array to the `CF` bit of flags register and execute `sbb` instruction which calculates: `00000000 - CF` and writes the result to the `oldbit`.\n\nThe `constant_test_bit` function does the same as we saw in the `set_bit`:\n\n```C\nstatic __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)\n{\n\treturn ((1UL << (nr & (BITS_PER_LONG-1))) &\n\t\t(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;\n}\n```\n\nIt generates a mask in which only the bit at position `nr & (BITS_PER_LONG-1)` is `1` and other bits are `0` (similar to what we saw in `CONST_MASK`) and applies bitwise [and](https://en.wikipedia.org/wiki/Bitwise_operation#AND) to the `long` element of the array which contains a given bit number.\n\nThe next widely used bit array related operation is to change bit in a bit array. The Linux kernel provides two helpers for this:\n\n* `__change_bit`;\n* `change_bit`.\n\nAs you already can guess, these two variants are atomic and non-atomic as for example `set_bit` and `__set_bit`. For the start, let's look at the implementation of the `__change_bit` function:\n\n```C\nstatic inline void __change_bit(long nr, volatile unsigned long *addr)\n{\n    asm volatile(\"btc %1,%0\" : ADDR : \"Ir\" (nr));\n}\n```\n\nPretty easy, is it not? The implementation of the `__change_bit` is the same as `__set_bit`, but instead of `bts` instruction, we are using [btc](http://x86.renejeschke.de/html/file_module_x86_id_23.html). This instruction selects a given bit from a given bit array, stores its value in the `CF` and changes its value by applying the complement operation. 
So, a bit with value `1` will be `0` and vice versa:\n\n```python\n>>> int(not 1)\n0\n>>> int(not 0)\n1\n```\n\nThe atomic version of the `__change_bit` is the `change_bit` function:\n\n```C\nstatic inline void change_bit(long nr, volatile unsigned long *addr)\n{\n\tif (IS_IMMEDIATE(nr)) {\n\t\tasm volatile(LOCK_PREFIX \"xorb %1,%0\"\n\t\t\t: CONST_MASK_ADDR(nr, addr)\n\t\t\t: \"iq\" ((u8)CONST_MASK(nr)));\n\t} else {\n\t\tasm volatile(LOCK_PREFIX \"btc %1,%0\"\n\t\t\t: BITOP_ADDR(addr)\n\t\t\t: \"Ir\" (nr));\n\t}\n}\n```\n\nIt is similar on `set_bit` function, but also has two differences. The first difference is `xor` operation instead of `or` and the second is `btc` instead of `bts`.\n\nFor this moment we know the most important architecture-specific operations with bit arrays. Time to look at generic bitmap API.\n\nCommon bit operations\n================================================================================\n\nBesides the architecture-specific API from the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, the Linux kernel provides common API for manipulation of bit arrays. As we know from the beginning of this part, we can find it in the  [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and additionally in the [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c)  source code file. But before these source code files let's look into the [include/linux/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitops.h) header file which provides a set of useful macro. 
Let's look on some of them.\n\nFirst of all let's look at following four macros:\n\n* `for_each_set_bit`\n* `for_each_set_bit_from`\n* `for_each_clear_bit`\n* `for_each_clear_bit_from`\n\nAll of these macros provide iterator over certain set of bits in a bit array. The first macro iterates over bits which are set, the second does the same, but starts from a certain bits. The last two macros do the same, but iterates over clear bits. Let's look on implementation of the `for_each_set_bit` macro:\n\n```C\n#define for_each_set_bit(bit, addr, size) \\\n\tfor ((bit) = find_first_bit((addr), (size));\t\t\\\n\t     (bit) < (size);\t\t\t\t\t\\\n\t     (bit) = find_next_bit((addr), (size), (bit) + 1))\n```\n\nAs we may see it takes three arguments and expands to the loop from first set bit which is returned as result of the `find_first_bit` function and to the last bit number while it is less than given size.\n\nBesides these four macros, the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) provides API for rotation of `64-bit` or `32-bit` values and etc.\n\nThe next [header](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) file which provides API for manipulation with a bit arrays. For example it provides two functions:\n\n* `bitmap_zero`;\n* `bitmap_fill`.\n\nTo clear a bit array or fill it with `1`. Let's look at the implementation of the `bitmap_zero` function:\n\n```C\nstatic inline void bitmap_zero(unsigned long *dst, unsigned int nbits)\n{\n\tif (small_const_nbits(nbits))\n\t\t*dst = 0UL;\n\telse {\n\t\tunsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);\n\t\tmemset(dst, 0, len);\n\t}\n}\n```\n\nFirst of all we can see the check for `nbits`. 
The `small_const_nbits` is macro which defined in the same header [file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) and looks:\n\n```C\n#define small_const_nbits(nbits) \\\n\t(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)\n```\n\nAs we may see it checks that `nbits` is known constant in compile time and `nbits` value does not overflow `BITS_PER_LONG` or `64`. If bits number does not overflow amount of bits in a `long` value we can just set to zero. In other case we need to calculate how many `long` values do we need to fill our bit array and fill it with [memset](http://man7.org/linux/man-pages/man3/memset.3.html).\n\nThe implementation of the `bitmap_fill` function is similar on implementation of the `bitmap_zero` function, except we fill a given bit array with `0xff` values or `0b11111111`:\n\n```C\nstatic inline void bitmap_fill(unsigned long *dst, unsigned int nbits)\n{\n\tunsigned int nlongs = BITS_TO_LONGS(nbits);\n\tif (!small_const_nbits(nbits)) {\n\t\tunsigned int len = (nlongs - 1) * sizeof(unsigned long);\n\t\tmemset(dst, 0xff,  len);\n\t}\n\tdst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);\n}\n```\n\nBesides the `bitmap_fill` and `bitmap_zero` functions, the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file provides `bitmap_copy` which is similar on the `bitmap_zero`, but just uses [memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) instead of [memset](http://man7.org/linux/man-pages/man3/memset.3.html). Also it provides bitwise operations for bit array like `bitmap_and`, `bitmap_or`, `bitmap_xor` and etc. We will not consider implementation of these functions because it is easy to understand implementations of these functions if you understood all from this part. 
Anyway if you are interested in how these function are implemented, you may open [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and start to research.\n\nThat's all.\n\nLinks\n================================================================================\n\n* [bitmap](https://en.wikipedia.org/wiki/Bit_array)\n* [linked data structures](https://en.wikipedia.org/wiki/Linked_data_structure)\n* [tree data structures](https://en.wikipedia.org/wiki/Tree_%28data_structure%29)\n* [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)\n* [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [atomic operations](https://en.wikipedia.org/wiki/Linearizability)\n* [xchg instruction](http://x86.renejeschke.de/html/file_module_x86_id_328.html)\n* [cmpxchg instruction](http://x86.renejeschke.de/html/file_module_x86_id_41.html)\n* [lock instruction](http://x86.renejeschke.de/html/file_module_x86_id_159.html)\n* [bts instruction](http://x86.renejeschke.de/html/file_module_x86_id_25.html)\n* [btr instruction](http://x86.renejeschke.de/html/file_module_x86_id_24.html)\n* [bt instruction](http://x86.renejeschke.de/html/file_module_x86_id_22.html)\n* [sbb instruction](http://x86.renejeschke.de/html/file_module_x86_id_286.html)\n* [btc instruction](http://x86.renejeschke.de/html/file_module_x86_id_23.html)\n* [man memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html)\n* [man memset](http://man7.org/linux/man-pages/man3/memset.3.html)\n* [CF](https://en.wikipedia.org/wiki/FLAGS_register)\n* [inline assembler](https://en.wikipedia.org/wiki/Inline_assembler)\n* [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM kyselejsyrecek/gitbook:3.2.3\nCOPY ./ /srv/gitbook/\nEXPOSE 4000\nWORKDIR /srv/gitbook\nCMD [\"sh\", \"-c\", \"/usr/local/bin/gitbook serve\"]\n\n# Examples:\n#RUN gitbook pdf\n#RUN gitbook epub\n\n"
  },
  {
    "path": "Initialization/README.md",
    "content": "# Kernel initialization process\n\nYou will find here a couple of posts which describe the full cycle of kernel initialization from its first step after the kernel has been decompressed to the start of the first process run by the kernel itself.\n\n*Note* That there will not be a description of the all kernel initialization steps. Here will be only generic kernel part, without interrupts handling, ACPI, and many other parts. All parts which I have missed, will be described in other chapters.\n\n* [First steps after kernel decompression](linux-initialization-1.md) - describes first steps in the kernel.\n* [Early interrupt and exception handling](linux-initialization-2.md) - describes early interrupts initialization and early page fault handler.\n* [Last preparations before the kernel entry point](linux-initialization-3.md) - describes the last preparations before the call of the `start_kernel`.\n* [Kernel entry point](linux-initialization-4.md) - describes first steps in the kernel generic code.\n* [Continue of architecture-specific initializations](linux-initialization-5.md) - describes architecture-specific initialization.\n* [Architecture-specific initializations, again...](linux-initialization-6.md) - describes continue of the architecture-specific initialization process.\n* [The End of the architecture-specific initializations, almost...](linux-initialization-7.md) - describes the end of the `setup_arch` related stuff.\n* [Scheduler initialization](linux-initialization-8.md) - describes preparation before scheduler initialization and initialization of it.\n* [RCU initialization](linux-initialization-9.md) - describes the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update).\n* [End of the initialization](linux-initialization-10.md) - the last part about Linux kernel initialization.\n"
  },
  {
    "path": "Initialization/linux-initialization-1.md",
    "content": "Kernel initialization. Part 1.\n================================================================================\n\nFirst steps in the kernel code\n--------------------------------------------------------------------------------\n\nThe previous [post](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-6) was a last part of the Linux kernel [booting process](https://0xax.gitbook.io/linux-insides/summary/booting) chapter and now we are starting to dive into initialization process of the Linux kernel. After the image of the Linux kernel is decompressed and placed in a correct place in memory, it starts to work. All previous parts describe the work of the Linux kernel setup code which does preparation before the first bytes of the Linux kernel code will be executed. From now we are in the kernel and all parts of this chapter will be devoted to the initialization process of the kernel before it will launch process with [pid](https://en.wikipedia.org/wiki/Process_identifier) `1`. There are many things to do before the kernel will start first `init` process. Hope we will see all of the preparations before kernel will start in this big chapter. We will start from the kernel entry point, which is located in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) and will move further and further. 
We will see first preparations like early page tables initialization, switch to a new descriptor in kernel space and many many more, before we will see the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/master/init/main.c) will be called.\n\nIn the last [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-6) of the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/booting) we stopped at the jmp instruction from the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file:\n\n```assembly\njmp\t*%rax\n```\n\nAt this moment the `rax` register contains address of the Linux kernel entry point which was obtained as a result of the call of the `decompress_kernel` function from the [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/misc.c) source code file. So, our last instruction in the kernel setup code is a jump on the kernel entry point. We already know where the entry point of the Linux kernel is defined, so we are able to start to learn what Linux kernel does after the start.\n\nFirst steps in the kernel\n--------------------------------------------------------------------------------\n\nOkay, we got the address of the decompressed kernel image from the `decompress_kernel` function into `rax` register and just jumped there. 
As we already know the entry point of the decompressed kernel image starts in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) assembly source code file and at the beginning of it, we can see following definitions:\n\n```assembly\n    .text\n\t__HEAD\n\t.code64\n\t.globl startup_64\nstartup_64:\n\t...\n\t...\n\t...\n```\n\nWe can see definition of the `startup_64` routine that is defined in the `__HEAD` section, which is just a macro which expands to the definition of executable `.head.text` section:\n\n```C\n#define __HEAD\t\t.section\t\".head.text\",\"ax\"\n```\n\nWe can see definition of this section in the [arch/x86/kernel/vmlinux.lds.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/vmlinux.lds.S) linker script:\n\n```\n.text : AT(ADDR(.text) - LOAD_OFFSET) {\n\t_text = .;\n\t...\n\t...\n\t...\n} :text = 0x9090\n```\n\nThe ADDR keyword above returns the absolute address (here means virtual address) of the named section. The AT keyword above specifies the load address (here means physical address) of the section. The full syntax of section definition is defined in the [Using ld The GNU linker](https://ftp.gnu.org/old-gnu/Manuals/ld-2.9.1/html_node/ld_21.html). \n\n\nBesides the definition of the `.text` section, we can understand default virtual and physical addresses from the linker script. Note that address of the `_text` is location counter which is defined as:\n\n```\n. = __START_KERNEL;\n```\n\nfor [x86_64](https://en.wikipedia.org/wiki/X86-64). 
The definition of the `__START_KERNEL` macro is located in the [arch/x86/include/asm/page_types.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/page_types.h) header file and represented by the sum of the base virtual address of the kernel mapping and physical start:\n\n```C\n#define __START_KERNEL\t(__START_KERNEL_map + __PHYSICAL_START)\n\n#define __PHYSICAL_START  ALIGN(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN)\n```\n\nOr in other words:\n\n* Base physical address of the Linux kernel - `0x1000000`;\n* Base virtual address of the Linux kernel - `0xffffffff81000000`.\n\nAfter we sanitized CPU configuration, we call `__startup_64` function which is defined in [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head64.c):\n\n```assembly\n\tleaq\t_text(%rip), %rdi\n\tpushq\t%rsi\n\tcall\t__startup_64\n\tpopq\t%rsi\n```\n\n```C\nunsigned long __head __startup_64(unsigned long physaddr,\n\t\t\t\t struct boot_params *bp)\n{\n\tunsigned long load_delta, *p;\n\tunsigned long pgtable_flags;\n\tpgdval_t *pgd;\n\tp4dval_t *p4d;\n\tpudval_t *pud;\n\tpmdval_t *pmd, pmd_entry;\n\tpteval_t *mask_ptr;\n\tbool la57;\n\tint i;\n\tunsigned int *next_pgt_ptr;\n\t...\n\t...\n\t...\n}\n```\n\nSince [kASLR](https://en.wikipedia.org/wiki/Address_space_layout_randomization#Linux) is enabled, the address `startup_64` routine was loaded may be different from the address compiled to run at, so we need to calculate the delta with the following code:\n\n```C\n\tload_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);\n```\n\nAs a result, `load_delta` contains the delta between the address compiled to run at and the address actually loaded.\n\nAfter we got the delta, we check if `_text` address is correctly aligned for `2` megabytes. We will do it with the following code:\n\n```C\n\tif (load_delta & ~PMD_PAGE_MASK)\n\t\tfor (;;);\n```\n\nIf `_text` address is not aligned for `2` megabytes, we enter infinite loop. 
The `PMD_PAGE_MASK` indicates the mask for `Page middle directory` (read [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1) about it) and is defined as:\n\n```C\n#define PMD_PAGE_MASK           (~(PMD_PAGE_SIZE-1))\n```\n\nwhere `PMD_PAGE_SIZE` macro is defined as:\n\n```C\n#define PMD_PAGE_SIZE           (_AC(1, UL) << PMD_SHIFT)\n#define PMD_SHIFT\t\t21\n```\n\nAs we can easily calculate, `PMD_PAGE_SIZE` is `2` megabytes.\n\nIf [SME](https://en.wikipedia.org/wiki/Zen_%28microarchitecture%29#Enhanced_security_and_virtualization_support) is supported and enabled, we activate it and include the SME encryption mask in `load_delta`:\n\n```C\n\tsme_enable(bp);\n\tload_delta += sme_get_me_mask();\n```\n\nOkay, we did some early checks and now we can move on.\n\nFix base addresses of page tables\n--------------------------------------------------------------------------------\n\nIn the next step we fixup the physical addresses in the page table:\n\n```C\n\tpgd = fixup_pointer(&early_top_pgt, physaddr);\n\tpud = fixup_pointer(&level3_kernel_pgt, physaddr);\n\tpmd = fixup_pointer(level2_fixmap_pgt, physaddr);\n```\n\nSo, let's look at the definition of `fixup_pointer` function which returns physical address of the passed argument:\n\n```C\nstatic void __head *fixup_pointer(void *ptr, unsigned long physaddr)\n{\n\treturn ptr - (void *)_text + (void *)physaddr;\n}\n```\n\nNext we'll focus on `early_top_pgt` and the other page table symbols which we saw above. Let's try to understand what these symbols mean. 
First of all let's look at their definition:\n\n```assembly\nNEXT_PAGE(early_top_pgt)\n\t.fill\t512,8,0\n\t.fill\tPTI_USER_PGD_FILL,8,0\n\nNEXT_PAGE(level3_kernel_pgt)\n\t.fill\tL3_START_KERNEL,8,0\n\t.quad\tlevel2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC\n\t.quad\tlevel2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC\n\nNEXT_PAGE(level2_kernel_pgt)\n\tPMDS(0, __PAGE_KERNEL_LARGE_EXEC,\n\t\tKERNEL_IMAGE_SIZE/PMD_SIZE)\n\nNEXT_PAGE(level2_fixmap_pgt)\n\t.fill\t506,8,0\n\t.quad\tlevel1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC\n\t.fill\t5,8,0\n\nNEXT_PAGE(level1_fixmap_pgt)\n\t.fill\t512,8,0\n```\n\nLooks hard, but it isn't. First of all let's look at the `early_top_pgt`. It starts with the `4096` bytes of zeros (or `8192` bytes if `CONFIG_PAGE_TABLE_ISOLATION` is enabled), it means that we don't use the first `512` entries. And after this we can see `level3_kernel_pgt` entry. At the start of its definition, we can see that it is filled with the `4080` bytes of zeros (`L3_START_KERNEL` equals `510`). Subsequently, it stores two entries which map kernel space. Note that we subtract `__START_KERNEL_map` from `level2_kernel_pgt` and `level2_fixmap_pgt`. As we know `__START_KERNEL_map` is a base virtual address of the kernel text, so if we subtract `__START_KERNEL_map`, we will get physical addresses of the `level2_kernel_pgt` and `level2_fixmap_pgt`.\n\nNext let's look at `_KERNPG_TABLE_NOENC` and `_PAGE_TABLE_NOENC`, these are just page entry access rights:\n\n```C\n#define _KERNPG_TABLE_NOENC   (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \\\n\t\t\t       _PAGE_DIRTY)\n#define _PAGE_TABLE_NOENC     (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \\\n\t\t\t       _PAGE_ACCESSED | _PAGE_DIRTY)\n```\n\nThe `level2_kernel_pgt` is page table entry which contains pointer to the page middle directory which maps kernel space. 
It calls the `PMDS` macro which creates `512` megabytes from the `__START_KERNEL_map` for kernel `.text` (after these `512` megabytes will be module memory space).\n\nThe `level2_fixmap_pgt` holds virtual addresses which can refer to any physical addresses even under kernel space. They are represented by the `4048` bytes of zeros, the `level1_fixmap_pgt` entry, `8` megabytes reserved for [vsyscalls](https://lwn.net/Articles/446528/) mapping and `2` megabytes of hole.\n\nYou can read more about it in the [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1) part.\n\nNow, after we saw the definitions of these symbols, let's get back to the code. Next we initialize last entry of `pgd` with `level3_kernel_pgt`:\n\n```C\n\tpgd[pgd_index(__START_KERNEL_map)] = level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC;\n```\n\nAll of `p*d` addresses may be wrong if the `startup_64` is not equal to default `0x1000000` address. Remember that the `load_delta` contains delta between the address of the `startup_64` symbol which was obtained during kernel [linking](https://en.wikipedia.org/wiki/Linker_%28computing%29) and the actual address. So we add the delta to the certain entries of the `p*d`.\n\n```C\n\tpgd[pgd_index(__START_KERNEL_map)] += load_delta;\n\tpud[510] += load_delta;\n\tpud[511] += load_delta;\n\tpmd[506] += load_delta;\n```\n\nAfter all of this we will have:\n\n```\nearly_top_pgt[511] -> level3_kernel_pgt[0]\nlevel3_kernel_pgt[510] -> level2_kernel_pgt[0]\nlevel3_kernel_pgt[511] -> level2_fixmap_pgt[0]\nlevel2_kernel_pgt[0]   -> 512 MB kernel mapping\nlevel2_fixmap_pgt[506] -> level1_fixmap_pgt\n```\n\nNote that we didn't fixup base address of the `early_top_pgt` and some of other page table directories, because we will see this when building/filling structures of these page tables. 
As we corrected base addresses of the page tables, we can start to build them.\n\nIdentity mapping setup\n--------------------------------------------------------------------------------\n\nNow we can see the set up of identity mapping of early page tables. In Identity Mapped Paging, virtual addresses are mapped to physical addresses identically. Let's look at it in detail. First of all we replace `pud` and `pmd` with the pointer to first and second entry of `early_dynamic_pgts`:\n\n```C\n\tnext_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);\n\tpud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);\n\tpmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);\n```\n\nLet's look at the `early_dynamic_pgts` definition:\n\n```assembly\nNEXT_PAGE(early_dynamic_pgts)\n\t.fill\t512*EARLY_DYNAMIC_PAGE_TABLES,8,0\n```\n\nwhich will store temporary page tables for early kernel.\n\nNext we initialize `pgtable_flags` which will be used when initializing `p*d` entries later:\n\n```C\n\tpgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();\n```\n\n`sme_get_me_mask` function returns `sme_me_mask` which was initialized in `sme_enable` function.\n\nNext we fill two entries of `pgd` with `pud` plus `pgtable_flags` which we initialized above:\n\n```C\n\ti = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;\n\tpgd[i + 0] = (pgdval_t)pud + pgtable_flags;\n\tpgd[i + 1] = (pgdval_t)pud + pgtable_flags;\n```\n\n`PGDIR_SHIFT` indicates the mask for page global directory bits in a virtual address. Here we calculate modulo with `PTRS_PER_PGD` (which expands to `512`) so as not to access the index greater than `512`. 
There are macros for all types of page directories:\n\n```C\n#define PGDIR_SHIFT     39\n#define PTRS_PER_PGD\t512\n#define PUD_SHIFT       30\n#define PTRS_PER_PUD\t512\n#define PMD_SHIFT       21\n#define PTRS_PER_PMD\t512\n```\n\nWe do almost the same thing as above:\n\n```C\n\ti = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;\n\tpud[i + 0] = (pudval_t)pmd + pgtable_flags;\n\tpud[i + 1] = (pudval_t)pmd + pgtable_flags;\n```\n\nNext we initialize `pmd_entry` and filter out unsupported `__PAGE_KERNEL_*` bits:\n\n```C\n\tpmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;\n\tmask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);\n\tpmd_entry &= *mask_ptr;\n\tpmd_entry += sme_get_me_mask();\n\tpmd_entry += physaddr;\n```\n\nNext we fill all `pmd` entries to cover full size of the kernel:\n\n```C\n\tfor (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {\n\t\tint idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;\n\t\tpmd[idx] = pmd_entry + i * PMD_SIZE;\n\t}\n```\n\nNext we fixup the kernel text+data virtual addresses. Note that we might write invalid pmds, when the kernel is relocated (`cleanup_highmap` function fixes this up along with the mappings beyond `_end`).\n\n```C\n\tpmd = fixup_pointer(level2_kernel_pgt, physaddr);\n\tfor (i = 0; i < PTRS_PER_PMD; i++) {\n\t\tif (pmd[i] & _PAGE_PRESENT)\n\t\t\tpmd[i] += load_delta;\n\t}\n```\n\nNext we remove the memory encryption mask to obtain the true physical address (remember that `load_delta` includes the mask):\n\n```C\n\t*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();\n```\n\n`phys_base` must match the first entry in `level2_kernel_pgt`.\n\nAs final step of `__startup_64` function, we encrypt the kernel (if SME is active) and return the SME encryption mask to be used as a modifier for the initial page directory entry programmed into `cr3` register:\n\n```C\n\tsme_encrypt_kernel(bp);\n\treturn sme_get_me_mask();\n```\n\nNow let's get back to assembly code. 
We prepare for next paragraph with following code:\n\n```assembly\n\taddq\t$(early_top_pgt - __START_KERNEL_map), %rax\n\tjmp 1f\n```\n\nwhich adds physical address of `early_top_pgt` to `rax` register so that `rax` register contains sum of the address and the SME encryption mask.\n\nThat's all for now. Our early paging is prepared and we just need to finish last preparation before we will jump into kernel entry point.\n\nLast preparation before jump at the kernel entry point\n--------------------------------------------------------------------------------\n\nAfter that we jump to the label `1` we enable `PAE`, `PGE` (Paging Global Extension) and put the content of the `phys_base` (see above) to the `rax` register and fill `cr3` register with it:\n\n```assembly\n1:\n\tmovl\t$(X86_CR4_PAE | X86_CR4_PGE), %ecx\n\tmovq\t%rcx, %cr4\n\n\taddq\tphys_base(%rip), %rax\n\tmovq\t%rax, %cr3\n```\n\nIn the next step we check that CPU supports [NX](http://en.wikipedia.org/wiki/NX_bit) bit with:\n\n```assembly\n\tmovl\t$0x80000001, %eax\n\tcpuid\n\tmovl\t%edx,%edi\n```\n\nWe put `0x80000001` value to the `eax` and execute `cpuid` instruction for getting the extended processor info and feature bits. The result will be in the `edx` register which we put to the `edi`.\n\nNow we put `0xc0000080` or `MSR_EFER` to the `ecx` and execute `rdmsr` instruction for the reading model specific register.\n\n```assembly\n\tmovl\t$MSR_EFER, %ecx\n\trdmsr\n```\n\nThe result will be in the `edx:eax`. 
General view of the `EFER` is following:\n\n```\n63                                                                              32\n --------------------------------------------------------------------------------\n|                                                                               |\n|                                Reserved MBZ                                   |\n|                                                                               |\n --------------------------------------------------------------------------------\n31                            16  15      14      13   12  11   10  9  8 7  1   0\n --------------------------------------------------------------------------------\n|                              | T |       |       |    |   |   |   |   |   |   |\n| Reserved MBZ                 | C | FFXSR | LMSLE |SVME|NXE|LMA|MBZ|LME|RAZ|SCE|\n|                              | E |       |       |    |   |   |   |   |   |   |\n --------------------------------------------------------------------------------\n```\n\nWe will not see all fields in details here, but we will learn about this and other `MSRs` in a special part about it. As we read `EFER` to the `edx:eax`, we check `_EFER_SCE` or zero bit which is `System Call Extensions` with `btsl` instruction and set it to one. By the setting `SCE` bit we enable `SYSCALL` and `SYSRET` instructions. In the next step we check 20th bit in the `edi`, remember that this register stores result of the `cpuid` (see above). If bit `20` (the `NX` bit) is not set, we just write `EFER_SCE` to the model specific register.\n\n```assembly\n\tbtsl\t$_EFER_SCE, %eax\n\tbtl\t$20,%edi\n\tjnc     1f\n\tbtsl\t$_EFER_NX, %eax\n\tbtsq\t$_PAGE_BIT_NX,early_pmd_flags(%rip)\n1:\twrmsr\n```\n\nIf the [NX](https://en.wikipedia.org/wiki/NX_bit) bit is supported we enable `_EFER_NX`  and write it too, with the `wrmsr` instruction. 
After the [NX](https://en.wikipedia.org/wiki/NX_bit) bit is set, we set some bits in the `cr0` [control register](https://en.wikipedia.org/wiki/Control_register) with following assembly code:\n\n```assembly\n\tmovl\t$CR0_STATE, %eax\n\tmovq\t%rax, %cr0\n```\n\nspecifically the following bits:\n\n* `X86_CR0_PE` - system is in protected mode;\n* `X86_CR0_MP` - controls interaction of WAIT/FWAIT instructions with TS flag in CR0;\n* `X86_CR0_ET` - on the 386, it allowed to specify whether the external math coprocessor was an 80287 or 80387;\n* `X86_CR0_NE` - enable internal x87 floating point error reporting when set, else enables PC style x87 error detection;\n* `X86_CR0_WP` - when set, the CPU can't write to read-only pages when privilege level is 0;\n* `X86_CR0_AM` - alignment check enabled if AM set, AC flag (in EFLAGS register) set, and privilege level is 3;\n* `X86_CR0_PG` - enable paging.\n\nWe already know that to run any code, and even more [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) code from assembly, we need to setup a stack. As always, we are doing it by the setting of [stack pointer](https://en.wikipedia.org/wiki/Stack_register) to a correct place in memory and resetting [flags](https://en.wikipedia.org/wiki/FLAGS_register) register after this:\n\n```assembly\n\tmovq initial_stack(%rip), %rsp\n\tpushq $0\n\tpopfq\n```\n\nThe most interesting thing here is the `initial_stack`. 
This symbol is defined in the [source](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) code file and looks like:\n\n```assembly\nGLOBAL(initial_stack)\n    .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS\n```\n\nThe `THREAD_SIZE` macro is defined in the [arch/x86/include/asm/page_64_types.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/page_64_types.h) header file and depends on value of the `KASAN_STACK_ORDER` macro:\n\n```C\n#ifdef CONFIG_KASAN\n#define KASAN_STACK_ORDER 1\n#else\n#define KASAN_STACK_ORDER 0\n#endif\n\n#define THREAD_SIZE_ORDER       (2 + KASAN_STACK_ORDER)\n#define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)\n```\n\nWe consider when the [kasan](https://github.com/torvalds/linux/blob/master/Documentation/dev-tools/kasan.rst) is disabled and the `PAGE_SIZE` is `4096` bytes. So the `THREAD_SIZE` will expand to `16` kilobytes and represents size of the stack of a thread. Why `thread`? You may already know that each [process](https://en.wikipedia.org/wiki/Process_%28computing%29) may have [parent processes](https://en.wikipedia.org/wiki/Parent_process) and [child processes](https://en.wikipedia.org/wiki/Child_process). Actually, a parent process and child process differ in stack. A new kernel stack is allocated for a new process. In the Linux kernel this stack is represented by the [union](https://en.wikipedia.org/wiki/Union_type#C.2FC.2B.2B) with the `thread_info` structure.\n\nThe `init_thread_union` is represented by the `thread_union`. 
And the `thread_union` is defined in the [include/linux/sched.h](https://github.com/torvalds/linux/blob/master/include/linux/sched.h) file like the following:\n\n```C\nunion thread_union {\n#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK\n\tstruct task_struct task;\n#endif\n#ifndef CONFIG_THREAD_INFO_IN_TASK\n\tstruct thread_info thread_info;\n#endif\n\tunsigned long stack[THREAD_SIZE/sizeof(long)];\n};\n```\n\nThe `CONFIG_ARCH_TASK_STRUCT_ON_STACK` kernel configuration option is only enabled for `ia64` architecture, and the `CONFIG_THREAD_INFO_IN_TASK` kernel configuration option is enabled for `x86_64` architecture. Thus the `thread_info` structure will be placed in `task_struct` structure instead of the `thread_union` union.\n\nThe `init_thread_union` is placed in the [include/asm-generic/vmlinux.lds.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/vmlinux.lds.h) file as part of the `INIT_TASK_DATA` macro like the following:\n\n```C\n#define INIT_TASK_DATA(align)  \\\n\t. = ALIGN(align);      \\\n\t...                    
\\\n\tinit_thread_union = .; \\\n\t...\n```\n\nThis macro is used in the [arch/x86/kernel/vmlinux.lds.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/vmlinux.lds.S) file like the following:\n\n```\n.data : AT(ADDR(.data) - LOAD_OFFSET) {\n\t...\n\tINIT_TASK_DATA(THREAD_SIZE)\n\t...\n} :data\n```\n\nThat is, `init_thread_union` is initialized with the address which is aligned to `THREAD_SIZE` which is `16` kilobytes.\n\nNow we may understand this expression:\n\n```assembly\nGLOBAL(initial_stack)\n    .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS\n```\n\nthat `initial_stack` symbol points to the start of the `thread_union.stack` array + `THREAD_SIZE` which is 16 kilobytes and - `SIZEOF_PTREGS` which is a convention which helps the in-kernel unwinder reliably detect the end of the stack.\n\nAfter the early boot stack is set, we update the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) with the `lgdt` instruction:\n\n```assembly\nlgdt\tearly_gdt_descr(%rip)\n```\n\nwhere the `early_gdt_descr` is defined as:\n\n```assembly\nearly_gdt_descr:\n\t.word\tGDT_ENTRIES*8-1\nearly_gdt_descr_base:\n\t.quad\tINIT_PER_CPU_VAR(gdt_page)\n```\n\nWe need to reload `Global Descriptor Table` because now kernel works in the low userspace addresses, but soon kernel will work in its own space.\n\nNow let's look at the definition of `early_gdt_descr`. `GDT_ENTRIES` expands to `32` so that Global Descriptor Table contains `32` entries for kernel code, data, thread local storage segments and etc...\n\nNow let's look at the definition of `early_gdt_descr_base`. 
The `gdt_page` structure is defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/desc.h) as:\n\n```C\nstruct gdt_page {\n\tstruct desc_struct gdt[GDT_ENTRIES];\n} __attribute__((aligned(PAGE_SIZE)));\n```\n\nIt contains one field `gdt` which is array of the `desc_struct` structure which is defined as:\n\n```C\nstruct desc_struct {\n         union {\n                 struct {\n                         unsigned int a;\n                         unsigned int b;\n                 };\n                 struct {\n                         u16 limit0;\n                         u16 base0;\n                         unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;\n                         unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;\n                 };\n         };\n } __attribute__((packed));\n```\n\nwhich looks familiar `GDT` descriptor. Note that `gdt_page` structure is aligned to `PAGE_SIZE` which is `4096` bytes. Which means that `gdt` will occupy one page.\n\nNow let's try to understand what `INIT_PER_CPU_VAR` is. `INIT_PER_CPU_VAR` is a macro which is defined in the [arch/x86/include/asm/percpu.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/percpu.h) and just concatenates `init_per_cpu__` with the given parameter:\n\n```C\n#define INIT_PER_CPU_VAR(var) init_per_cpu__##var\n```\n\nAfter the `INIT_PER_CPU_VAR` macro will be expanded, we will have `init_per_cpu__gdt_page`. We can see the initialization of `init_per_cpu__gdt_page` in the [linker script](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/vmlinux.lds.S):\n\n```\n#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load\nINIT_PER_CPU(gdt_page);\n```\n\nAs we got `init_per_cpu__gdt_page` in `INIT_PER_CPU_VAR` and `INIT_PER_CPU` macro from linker script will be expanded we will get offset from the `__per_cpu_load`. 
After these calculations, we will have correct base address of the new GDT.\n\nGenerally per-CPU variables are a 2.6 kernel feature. You can understand what it is from its name. When we create `per-CPU` variable, each CPU will have its own copy of this variable. Here we are creating `gdt_page` per-CPU variable. There are many advantages for variables of this type, like there are no locks, because each CPU works with its own copy of variable and etc... So every core on multiprocessor will have its own `GDT` table and every entry in the table will represent a memory segment which can be accessed from the thread which ran on the core. You can read in details about `per-CPU` variables in the [Concepts/per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) post.\n\nAs we loaded new Global Descriptor Table, we reload segments as we did it every time:\n\n```assembly\n\txorl %eax,%eax\n\tmovl %eax,%ds\n\tmovl %eax,%ss\n\tmovl %eax,%es\n\tmovl %eax,%fs\n\tmovl %eax,%gs\n```\n\nAfter all of these steps we set up `gs` register so that it points to the `irqstack` which represents special stack where [interrupts](https://en.wikipedia.org/wiki/Interrupt) will be handled on:\n\n```assembly\n\tmovl\t$MSR_GS_BASE,%ecx\n\tmovl\tinitial_gs(%rip),%eax\n\tmovl\tinitial_gs+4(%rip),%edx\n\twrmsr\n```\n\nwhere `MSR_GS_BASE` is:\n\n```C\n#define MSR_GS_BASE             0xc0000101\n```\n\nWe need to put `MSR_GS_BASE` to the `ecx` register and load data from the `eax` and `edx` (which point to the `initial_gs`) with `wrmsr` instruction. We don't use `cs`, `es`, `ds` and `ss` segment registers for addressing in the 64-bit mode, but `fs` and `gs` registers can be used. `fs` and `gs` have a hidden part (as we saw it in the real mode for `cs`) and this part contains a descriptor which is mapped to [Model Specific Registers](https://en.wikipedia.org/wiki/Model-specific_register). So we can see above `0xc0000101` is a `gs.base` MSR address. 
When a [system call](https://en.wikipedia.org/wiki/System_call) or [interrupt](https://en.wikipedia.org/wiki/Interrupt) occurs, there is no kernel stack at the entry point, so the value of the `MSR_GS_BASE` will store address of the interrupt stack.\n\nIn the next step we put the address of the real mode bootparam structure to the `rdi` (remember `rsi` holds pointer to this structure from the start) and jump to the C code with:\n\n```assembly\n\tpushq\t$.Lafter_lret\t# put return address on stack for unwinder\n\txorq\t%rbp, %rbp\t# clear frame pointer\n\tmovq\tinitial_code(%rip), %rax\n\tpushq\t$__KERNEL_CS\t# set correct cs\n\tpushq\t%rax\t\t# target address in negative space\n\tlretq\n.Lafter_lret:\n```\n\nHere we put the address of the `initial_code` to the `rax` and push the return address, `__KERNEL_CS` and the address of the `initial_code` to the stack. After this we can see `lretq` instruction which means that after it return address will be extracted from stack (now there is address of the `initial_code`) and jump there. 
`initial_code` is defined in the same source code file and looks like:\n\n```assembly\n\t.balign\t8\n\tGLOBAL(initial_code)\n\t.quad\tx86_64_start_kernel\n\t...\n\t...\n\t...\n```\n\nAs we can see `initial_code` contains address of the `x86_64_start_kernel`, which is defined in the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head64.c) and looks like this:\n\n```C\nasmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)\n{\n\t...\n\t...\n\t...\n}\n```\n\nIt has one argument, `real_mode_data` (remember that we passed address of the real mode data to the `rdi` register previously).\n\nNext to start_kernel\n--------------------------------------------------------------------------------\n\nWe need to see last preparations before we can see \"kernel entry point\" - start_kernel function from the [init/main.c](https://github.com/torvalds/linux/blob/master/init/main.c).\n\nFirst of all we can see some checks in the `x86_64_start_kernel` function:\n\n```C\nBUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);\nBUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);\nBUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);\nBUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);\nBUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);\nBUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));\nMAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK)));\nBUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);\n```\n\nThere are checks for different things like virtual address of module space is not fewer than base address of the kernel text - `__START_KERNEL_map`, that kernel text with modules is not less than image of the kernel and etc... `BUILD_BUG_ON` is a macro which looks as:\n\n```C\n#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))\n```\n\nLet's try to understand how this trick works. 
Let's take for example first condition: `MODULES_VADDR < __START_KERNEL_map`. `!!condition` is the same as `condition != 0`. So it means if `MODULES_VADDR < __START_KERNEL_map` is true, we will get `1` in the `!!(condition)` or zero if not. After `2*!!(condition)` we will get either `2` or `0`. In the end of calculations we can get two different behaviors:\n\n* We will have compilation error, because try to get size of the char array with negative index (as can be in our case, because `MODULES_VADDR` can't be less than `__START_KERNEL_map` will be in our case);\n* No compilation errors.\n\nThat's all. So interesting C trick for getting compile error which depends on some constants.\n\nIn the next step we can see call of the `cr4_init_shadow` function which stores shadow copy of the `cr4` per cpu. Context switches can change bits in the `cr4` so we need to store `cr4` for each CPU. And after this we can see call of the `reset_early_page_tables` function where we reset all page global directory entries and write new pointer to the PGT in `cr3`:\n\n```C\n\tmemset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));\n\tnext_early_pgt = 0;\n\twrite_cr3(__sme_pa_nodebug(early_top_pgt));\n```\n\nSoon we will build new page tables. Here we can see that we zero all Page Global Directory entries. After this we set `next_early_pgt` to zero (we will see details about it in the next post) and write physical address of the `early_top_pgt` to the `cr3`.\n\nAfter this we clear `_bss` from the `__bss_start` to `__bss_stop` and also clear `init_top_pgt`. 
`init_top_pgt` is defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) like the following:\n\n```assembly\nNEXT_PGD_PAGE(init_top_pgt)\n\t.fill\t512,8,0\n\t.fill\tPTI_USER_PGD_FILL,8,0\n``` \n\nThis is exactly the same definition as `early_top_pgt`.\n\nThe next step will be setup of the early `IDT` handlers, but it's big concept so we will see it in the next post.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the first part about Linux kernel initialization.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\nIn the next part we will see initialization of the early interruption handlers, kernel space memory mapping and a lot more.\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Model Specific Register](http://en.wikipedia.org/wiki/Model-specific_register)\n* [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)\n* [Previous part - kernel load address randomization](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-6)\n* [NX](http://en.wikipedia.org/wiki/NX_bit)\n* [ASLR](http://en.wikipedia.org/wiki/Address_space_layout_randomization)\n"
  },
  {
    "path": "Initialization/linux-initialization-10.md",
    "content": "Kernel initialization. Part 10.\n================================================================================\n\nEnd of the Linux kernel initialization process\n================================================================================\n\nThis is tenth part of the chapter about Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the [previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and stopped on the call of the `acpi_early_init` function. This part will be the last part of the [Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) chapter, so let's finish it.\n\nAfter the call of the `acpi_early_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c), we can see the following code:\n\n```C\n#ifdef CONFIG_X86_ESPFIX64\n\tinit_espfix_bsp();\n#endif\n```\n\nHere we can see the call of the `init_espfix_bsp` function which depends on the `CONFIG_X86_ESPFIX64` kernel configuration option. As we can understand from the function name, it does something with the stack. This function is defined in the [arch/x86/kernel/espfix_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/espfix_64.c) and prevents leaking of `31:16` bits of the `esp` register during returning to 16-bit stack. 
First of all we install `espfix` page upper directory into the kernel page directory in the `init_espfix_bsp`:\n\n```C\npgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];\npgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);\n```\n\nWhere `ESPFIX_BASE_ADDR` is:\n\n```C\n#define PGDIR_SHIFT     39\n#define ESPFIX_PGD_ENTRY _AC(-2, UL)\n#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)\n```\n\nAlso we can find it in the [Documentation/x86/x86_64/mm](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt):\n\n```\n... unused hole ...\nffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks\n... unused hole ...\n```\n\nAfter we've filled page global directory with the `espfix` pud, the next step is call of the `init_espfix_random` and `init_espfix_ap` functions. The first function returns random locations for the `espfix` page and the second enables the `espfix` for the current CPU. After the `init_espfix_bsp` finished the work, we can see the call of the `thread_info_cache_init` function which defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c) and allocates cache for the `thread_info` if `THREAD_SIZE` is less than `PAGE_SIZE`:\n\n```C\n# if THREAD_SIZE >= PAGE_SIZE\n...\n...\n...\nvoid thread_info_cache_init(void)\n{\n        thread_info_cache = kmem_cache_create(\"thread_info\", THREAD_SIZE,\n                                              THREAD_SIZE, 0, NULL);\n        BUG_ON(thread_info_cache == NULL);\n}\n...\n...\n...\n#endif\n```\n\nAs we already know the `PAGE_SIZE` is `(_AC(1,UL) << PAGE_SHIFT)` or `4096` bytes and `THREAD_SIZE` is `(PAGE_SIZE << THREAD_SIZE_ORDER)` or `16384` bytes for the `x86_64`. The next function after the `thread_info_cache_init` is the `cred_init` from the [kernel/cred.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/cred.c). 
This function just allocates cache for the credentials (like `uid`, `gid`, etc.):\n\n```C\nvoid __init cred_init(void)\n{\n         cred_jar = kmem_cache_create(\"cred_jar\", sizeof(struct cred),\n                                     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);\n}\n```\n\nmore about credentials you can read in the [Documentation/security/credentials.rst](https://github.com/torvalds/linux/blob/master/Documentation/security/credentials.rst). Next step is the `fork_init` function from the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c). The `fork_init` function allocates cache for the `task_struct`. Let's look at the implementation of the `fork_init`. First of all we can see definitions of the `ARCH_MIN_TASKALIGN` macro and creation of a slab where task_structs will be allocated:\n\n```C\n#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR\n#ifndef ARCH_MIN_TASKALIGN\n#define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES\n#endif\n        task_struct_cachep =\n                kmem_cache_create(\"task_struct\", sizeof(struct task_struct),\n                        ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);\n#endif\n```\n\nAs we can see this code depends on the `CONFIG_ARCH_TASK_STRUCT_ALLOCATOR` kernel configuration option. This configuration option shows the presence of the `alloc_task_struct` for the given architecture. 
As `x86_64` has no `alloc_task_struct` function, this code will not work and even will not be compiled on the `x86_64`.\n\nAllocating cache for init task\n--------------------------------------------------------------------------------\n\nAfter this we can see the call of the `arch_task_cache_init` function in the `fork_init`:\n\n```C\nvoid arch_task_cache_init(void)\n{\n        task_xstate_cachep =\n                kmem_cache_create(\"task_xstate\", xstate_size,\n                                  __alignof__(union thread_xstate),\n                                  SLAB_PANIC | SLAB_NOTRACK, NULL);\n        setup_xstate_comp();\n}\n```\n\nThe `arch_task_cache_init` does initialization of the architecture-specific caches. In our case it is `x86_64`, so as we can see, the `arch_task_cache_init` allocates cache for the `task_xstate` which represents [FPU](http://en.wikipedia.org/wiki/Floating-point_unit) state and sets up offsets and sizes of all extended states in [xsave](http://www.felixcloutier.com/x86/XSAVES.html) area with the call of the `setup_xstate_comp` function. After the `arch_task_cache_init` we calculate default maximum number of threads with the:\n\n```C\nset_max_threads(MAX_THREADS);\n```\n\nwhere default maximum number of threads is:\n\n```C\n#define FUTEX_TID_MASK  0x3fffffff\n#define MAX_THREADS     FUTEX_TID_MASK\n```\n\nIn the end of the `fork_init` function we initialize [signal](http://www.win.tue.nl/~aeb/linux/lk/lk-5.html) handler:\n\n```C\ninit_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;\ninit_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;\ninit_task.signal->rlim[RLIMIT_SIGPENDING] =\n\t\tinit_task.signal->rlim[RLIMIT_NPROC];\n```\n\nAs we know the `init_task` is an instance of the `task_struct` structure, so it contains `signal` field which represents signal handler. It has following type `struct signal_struct`. On the first two lines we can see setting of the current and maximum limit of the `resource limits`. 
Every process has an associated set of resource limits. These limits specify amount of resources which current process can use. Here `rlim` is resource control limit and presented by the:\n\n```C\nstruct rlimit {\n        __kernel_ulong_t        rlim_cur;\n        __kernel_ulong_t        rlim_max;\n};\n```\n\nstructure from the [include/uapi/linux/resource.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/resource.h). In our case the resource is the `RLIMIT_NPROC` which is the maximum number of processes that user can own and `RLIMIT_SIGPENDING` - the maximum number of pending signals. We can see it in the:\n\n```C\ncat /proc/self/limits\nLimit                     Soft Limit           Hard Limit           Units\n...\n...\n...\nMax processes             63815                63815                processes\nMax pending signals       63815                63815                signals\n...\n...\n...\n```\n\nInitialization of the caches\n--------------------------------------------------------------------------------\n\nThe next function after the `fork_init` is the `proc_caches_init` from the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c). This function allocates caches for the memory descriptors (or `mm_struct` structure). 
At the beginning of the `proc_caches_init` we can see allocation of the different [SLAB](http://en.wikipedia.org/wiki/Slab_allocation) caches with the call of the `kmem_cache_create`:\n\n* `sighand_cachep` - manage information about installed signal handlers;\n* `signal_cachep` - manage information about process signal descriptor;\n* `files_cachep` - manage information about opened files;\n* `fs_cachep` - manage filesystem information.\n\nAfter this we allocate `SLAB` cache for the `mm_struct` structures:\n\n```C\nmm_cachep = kmem_cache_create(\"mm_struct\",\n                         sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,\n                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);\n```\n\n\nAfter this we allocate `SLAB` cache for the important `vm_area_struct` which is used by the kernel to manage virtual memory space:\n\n```C\nvm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);\n```\n\nNote, that we use `KMEM_CACHE` macro here instead of the `kmem_cache_create`. This macro is defined in the [include/linux/slab.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/slab.h) and just expands to the `kmem_cache_create` call:\n\n```C\n#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\\\n                sizeof(struct __struct), __alignof__(struct __struct),\\\n                (__flags), NULL)\n```\n\nThe `KMEM_CACHE` has one difference from `kmem_cache_create`. Take a look at the `__alignof__` operator. The `KMEM_CACHE` macro aligns `SLAB` objects to the alignment requirement of the given structure (`__alignof__(struct __struct)`), but a direct `kmem_cache_create` call uses the alignment value given by the caller. After this we can see the call of the `mmap_init` and `nsproxy_cache_init` functions. The first function initializes virtual memory area `SLAB` and the second function initializes `SLAB` for namespaces.\n\nThe next function after the `proc_caches_init` is `buffer_init`. 
This function is defined in the [fs/buffer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/buffer.c) source code file and allocate cache for the `buffer_head`. The `buffer_head` is a special structure which defined in the [include/linux/buffer_head.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/buffer_head.h) and used for managing buffers. In the start of the `buffer_init` function we allocate cache for the `struct buffer_head` structures with the call of the `kmem_cache_create` function as we did in the previous functions. And calculate the maximum size of the buffers in memory with:\n\n```C\nnrpages = (nr_free_buffer_pages() * 10) / 100;\nmax_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));\n```\n\nwhich will be equal to the `10%` of the `ZONE_NORMAL` (all RAM from the 4GB on the `x86_64`). The next function after the `buffer_init` is - `vfs_caches_init`. This function allocates `SLAB` caches and hashtable for different [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) caches. We already saw the `vfs_caches_init_early` function in the eighth part of the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8) which initialized caches for `dcache` (or directory-cache) and [inode](http://en.wikipedia.org/wiki/Inode) cache. The `vfs_caches_init` function makes post-early initialization of the `dcache` and `inode` caches, private data cache, hash tables for the mount points, etc. More details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) will be described in the separate part. After this we can see `signals_init` function. This function is defined in the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) and allocates a cache for the `sigqueue` structures which represents queue of the real time signals. 
The next function is `page_writeback_init`. This function initializes the ratio for the dirty pages. Every low-level page entry contains the `dirty` bit which indicates whether a page has been written to after being loaded into memory.\n\nCreation of the root for the procfs\n--------------------------------------------------------------------------------\n\nAfter all of these preparations we need to create the root for the [proc](http://en.wikipedia.org/wiki/Procfs) filesystem. We will do it with the call of the `proc_root_init` function from the [fs/proc/root.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/proc/root.c). At the start of the `proc_root_init` function we allocate the cache for the inodes and register a new filesystem in the system with the:\n\n```C\nerr = register_filesystem(&proc_fs_type);\n      if (err)\n                return;\n```\n\nAs I wrote above we will not dive into details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) and different filesystems in this chapter, but will see it in the chapter about the `VFS`. After we've registered a new filesystem in our system, we call the `proc_self_init` function from the [fs/proc/self.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/proc/self.c) and this function allocates `inode` number for the `self` (`/proc/self` directory refers to the process accessing the `/proc` filesystem). The next step after the `proc_self_init` is `proc_setup_thread_self` which sets up the `/proc/thread-self` directory which contains information about current thread. 
After this we create the `/proc/mounts` symlink to `self/mounts`, which will contain mount points, with the call of the\n\n```C\nproc_symlink(\"mounts\", NULL, \"self/mounts\");\n```\n\nand a couple of directories that depend on the different configuration options:\n\n```C\n#ifdef CONFIG_SYSVIPC\n        proc_mkdir(\"sysvipc\", NULL);\n#endif\n        proc_mkdir(\"fs\", NULL);\n        proc_mkdir(\"driver\", NULL);\n        proc_mkdir(\"fs/nfsd\", NULL);\n#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)\n        proc_mkdir(\"openprom\", NULL);\n#endif\n        proc_mkdir(\"bus\", NULL);\n        ...\n        ...\n        ...\n        if (!proc_mkdir(\"tty\", NULL))\n                 return;\n        proc_mkdir(\"tty/ldisc\", NULL);\n        ...\n        ...\n        ...\n```\n\nIn the end of the `proc_root_init` we call the `proc_sys_init` function which creates `/proc/sys` directory and initializes the [Sysctl](http://en.wikipedia.org/wiki/Sysctl).\n\nIt is the end of `start_kernel` function. I did not describe all functions which are called in the `start_kernel`. I skipped them, because they are not important for the generic kernel initialization stuff and depend only on different kernel configurations. They are `taskstats_init_early` which exports per-task statistics to the user-space, `delayacct_init` - initializes per-task delay accounting, `key_init` and `security_init` initialize different security stuff, `check_bugs` - fix some architecture-dependent bugs, `ftrace_init` function executes initialization of the [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt), `cgroup_init` makes initialization of the rest of the [cgroup](http://en.wikipedia.org/wiki/Cgroups) subsystem, etc. Many of these parts and subsystems will be described in the other chapters.\n\nThat's all. Finally we have passed through the long-long `start_kernel` function. But it is not the end of the Linux kernel initialization process. We haven't run the first process yet. 
In the end of the `start_kernel` we can see the last call of the - `rest_init` function. Let's go ahead.\n\nFirst steps after the start_kernel\n--------------------------------------------------------------------------------\n\nThe `rest_init` function is defined in the same source code file as `start_kernel` function, and this file is [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). In the beginning of the `rest_init` we can see call of the two following functions:\n\n```C\n\trcu_scheduler_starting();\n\tsmpboot_thread_init();\n```\n\nThe first `rcu_scheduler_starting` makes [RCU](http://en.wikipedia.org/wiki/Read-copy-update) scheduler active and the second `smpboot_thread_init` registers the `smpboot_thread_notifier` CPU notifier (more about it you can read in the [CPU hotplug documentation](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)). After this we can see the following calls:\n\n```C\nkernel_thread(kernel_init, NULL, CLONE_FS);\npid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);\n```\n\nHere the `kernel_thread` function (defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c)) creates new kernel thread. As we can see the `kernel_thread` function takes three arguments:\n\n* Function which will be executed in a new thread;\n* Parameter which will be passed to this function;\n* Flags.\n\nWe will not dive into details about `kernel_thread` implementation (we will see it in the chapter which describes the scheduler, just need to say that `kernel_thread` invokes [clone](http://www.tutorialspoint.com/unix_system_calls/clone.htm)). Now we only need to know that we create new kernel thread with `kernel_thread` function, parent and child of the thread will use shared information about filesystem and it will start to execute `kernel_init` function. A kernel thread differs from a user thread in that it runs in kernel mode. 
So with these two `kernel_thread` calls we create two new kernel threads with the `PID = 1` for `init` process and `PID = 2` for `kthreadd`. We already know what is `init` process. Let's look on the `kthreadd`. It is a special kernel thread which manages and helps different parts of the kernel to create another kernel thread. We can see it in the output of the `ps` util:\n\n```C\n$ ps -ef | grep kthreadd\nroot         2     0  0 Jan11 ?        00:00:00 [kthreadd]\n```\n\nLet's postpone `kernel_init` and `kthreadd` for now and go ahead in the `rest_init`. In the next step after we have created two new kernel threads we can see the following code:\n\n```C\n\trcu_read_lock();\n\tkthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);\n\trcu_read_unlock();\n```\n\nThe first `rcu_read_lock` function marks the beginning of an [RCU](http://en.wikipedia.org/wiki/Read-copy-update) read-side critical section and the `rcu_read_unlock` marks the end of an RCU read-side critical section. We call these functions because we need to protect the `find_task_by_pid_ns`. The `find_task_by_pid_ns` returns pointer to the `task_struct` by the given pid. So, here we are getting the pointer to the `task_struct` for `PID = 2` (we got it after `kthreadd` creation with the `kernel_thread`). In the next step we call `complete` function\n\n```C\ncomplete(&kthreadd_done);\n```\n\nand pass address of the `kthreadd_done`. The `kthreadd_done` defined as\n\n```C\nstatic __initdata DECLARE_COMPLETION(kthreadd_done);\n```\n\nwhere `DECLARE_COMPLETION` macro defined as:\n\n```C\n#define DECLARE_COMPLETION(work) \\\n         struct completion work = COMPLETION_INITIALIZER(work)\n```\n\nand expands to the definition of the `completion` structure. This structure is defined in the [include/linux/completion.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/completion.h) and presents `completions` concept. 
Completions is a code synchronization mechanism which provides race-free solution for the threads that must wait for some process to have reached a point or a specific state. Using completions consists of three parts: The first is definition of the `completion` structure and we did it with the `DECLARE_COMPLETION`. The second is call of the `wait_for_completion`. After the call of this function, a thread which called it will not continue to execute and will wait until another thread calls the `complete` function. Note that we call `wait_for_completion` with the `kthreadd_done` in the beginning of the `kernel_init_freeable`:\n\n```C\nwait_for_completion(&kthreadd_done);\n```\n\nAnd the last step is to call `complete` function as we saw it above. Thus the `kernel_init_freeable` function will not continue its execution until the `kthreadd` thread is set up. After the `kthreadd` was set, we can see three following functions in the `rest_init`:\n\n```C\n\tinit_idle_bootup_task(current);\n\tschedule_preempt_disabled();\n    cpu_startup_entry(CPUHP_ONLINE);\n```\n\nThe first `init_idle_bootup_task` function from the [kernel/sched/core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/core.c) sets the Scheduling class for the current process (`idle` class in our case):\n\n```C\nvoid init_idle_bootup_task(struct task_struct *idle)\n{\n         idle->sched_class = &idle_sched_class;\n}\n```\n\nwhere `idle` class is a low task priority and tasks can be run only when the processor doesn't have anything to run besides these tasks. The second function `schedule_preempt_disabled` disables preempt in `idle` tasks. And the third function `cpu_startup_entry` is defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c) and calls `cpu_idle_loop` from the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c). 
The `cpu_idle_loop` function works as a process with `PID = 0` and works in the background. Main purpose of the `cpu_idle_loop` is to consume the idle CPU cycles. When there is no process to run, this process starts to work. We have one process with `idle` scheduling class (we just set the `current` task to the `idle` with the call of the `init_idle_bootup_task` function), so the `idle` thread does not do useful work but just checks if there is an active task to switch to:\n\n```C\nstatic void cpu_idle_loop(void)\n{\n        ...\n        ...\n        ...\n        while (1) {\n                while (!need_resched()) {\n                ...\n                ...\n                ...\n                }\n        ...\n        }\n```\n\nMore about it will be in the chapter about scheduler. So for this moment the `start_kernel` calls the `rest_init` function which spawns an `init` (`kernel_init` function) process and becomes the `idle` process itself. Now it is time to look at the `kernel_init`. Execution of the `kernel_init` function starts from the call of the `kernel_init_freeable` function. The `kernel_init_freeable` function first of all waits for the completion of the `kthreadd` setup. 
I already wrote about it above:\n\n```C\nwait_for_completion(&kthreadd_done);\n```\n\nAfter this we set `gfp_allowed_mask` to `__GFP_BITS_MASK` which means that system is already running, set allowed [cpus/mems](https://www.kernel.org/doc/Documentation/cgroups/cpusets.txt) to all CPUs and [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) nodes with the `set_mems_allowed` function, allow `init` process to run on any CPU with the `set_cpus_allowed_ptr`, set pid for the `cad` or `Ctrl-Alt-Delete`, do preparation for booting of the other CPUs with the call of the `smp_prepare_cpus`, call early [initcalls](http://kernelnewbies.org/Documents/InitcallMechanism) with the `do_pre_smp_initcalls`, initialize `SMP` with the `smp_init` and initialize [lockup_detector](https://www.kernel.org/doc/Documentation/lockup-watchdogs.txt) with the call of the `lockup_detector_init` and initialize scheduler with the `sched_init_smp`.\n\nAfter this we can see the call of the following functions - `do_basic_setup`. Before we will call the `do_basic_setup` function, our kernel already initialized for this moment. As comment says:\n\n```\nNow we can finally start doing some real work..\n```\n\nThe `do_basic_setup` will reinitialize [cpuset](https://www.kernel.org/doc/Documentation/cgroups/cpusets.txt) to the active CPUs, initialize the `khelper` - which is a kernel thread which used for making calls out to userspace from within the kernel, initialize [tmpfs](http://en.wikipedia.org/wiki/Tmpfs), initialize `drivers` subsystem, enable the user-mode helper `workqueue`  and make post-early call of the `initcalls`. We can see opening of the `dev/console` and dup twice file descriptors from `0` to `2` after the `do_basic_setup`:\n\n\n```C\nif (sys_open((const char __user *) \"/dev/console\", O_RDWR, 0) < 0)\n\tpr_err(\"Warning: unable to open an initial console.\\n\");\n\n(void) sys_dup(0);\n(void) sys_dup(0);\n```\n\nWe are using two system calls here `sys_open` and `sys_dup`. 
In the next chapters we will see explanation and implementation of the different system calls. After we opened initial console, we check that `rdinit=` option was passed to the kernel command line or set default path of the ramdisk:\n\n```C\nif (!ramdisk_execute_command)\n\tramdisk_execute_command = \"/init\";\n```\n\nCheck user's permissions for the `ramdisk` and call the `prepare_namespace` function from the [init/do_mounts.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/do_mounts.c) which checks and mounts the [initrd](http://en.wikipedia.org/wiki/Initrd):\n\n```C\nif (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {\n\tramdisk_execute_command = NULL;\n\tprepare_namespace();\n}\n```\n\nThis is the end of the `kernel_init_freeable` function and we need return to the `kernel_init`. The next step after the `kernel_init_freeable` finished its execution is the `async_synchronize_full`. This function waits until all asynchronous function calls have been done and after it we will call the `free_initmem` which will release all memory occupied by the initialization stuff which located between `__init_begin` and `__init_end`. After this we protect `.rodata` with the `mark_rodata_ro` and update state of the system from the `SYSTEM_BOOTING` to the\n\n```C\nsystem_state = SYSTEM_RUNNING;\n```\n\nAnd tries to run the `init` process:\n\n```C\nif (ramdisk_execute_command) {\n\tret = run_init_process(ramdisk_execute_command);\n\tif (!ret)\n\t\treturn 0;\n\tpr_err(\"Failed to execute %s (error %d)\\n\",\n\t       ramdisk_execute_command, ret);\n}\n```\n\nFirst of all it checks the `ramdisk_execute_command` which we set in the `kernel_init_freeable` function and it will be equal to the value of the `rdinit=` kernel command line parameters or `/init` by default. 
The `run_init_process` function fills the first element of the `argv_init` array:\n\n```C\nstatic const char *argv_init[MAX_INIT_ARGS+2] = { \"init\", NULL, };\n```\n\nwhich represents arguments of the `init` program and call `do_execve` function:\n\n```C\nargv_init[0] = init_filename;\nreturn do_execve(getname_kernel(init_filename),\n\t(const char __user *const __user *)argv_init,\n\t(const char __user *const __user *)envp_init);\n```\n\nThe `do_execve` function is defined in the [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h) and runs program with the given file name and arguments. If we did not pass `rdinit=` option to the kernel command line, kernel starts to check the `execute_command` which is equal to value of the `init=` kernel command line parameter:\n\n```C\n\tif (execute_command) {\n\t\tret = run_init_process(execute_command);\n\t\tif (!ret)\n\t\t\treturn 0;\n\t\tpanic(\"Requested init %s failed (error %d).\",\n\t\t      execute_command, ret);\n\t}\n```\n\nIf we did not pass `init=` kernel command line parameter either, kernel tries to run one of the following executable files:\n\n```C\nif (!try_to_run_init_process(\"/sbin/init\") ||\n    !try_to_run_init_process(\"/etc/init\") ||\n    !try_to_run_init_process(\"/bin/init\") ||\n    !try_to_run_init_process(\"/bin/sh\"))\n\treturn 0;\n```\n\nOtherwise we finish with [panic](http://en.wikipedia.org/wiki/Kernel_panic):\n\n```C\npanic(\"No working init found.  Try passing init= option to kernel. \"\n      \"See Linux Documentation/init.txt for guidance.\");\n```\n\nThat's all! Linux kernel initialization process is finished!\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the tenth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). 
It is not only the `tenth` part, but also the last part which describes initialization of the Linux kernel. As I wrote in the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, we will go through all steps of the kernel initialization and we did it. We started at the first architecture-independent function - `start_kernel` and finished with the launch of the first `init` process in our system. I skipped details about different subsystems of the kernel, for example I almost did not cover scheduler, interrupts, exception handling, etc. From the next part we will start to dive to the different kernel subsystems. Hope it will be interesting.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [SLAB](http://en.wikipedia.org/wiki/Slab_allocation)\n* [xsave](http://www.felixcloutier.com/x86/XSAVES.html)\n* [FPU](http://en.wikipedia.org/wiki/Floating-point_unit)\n* [Documentation/security/credentials.txt](https://github.com/torvalds/linux/blob/master/Documentation/security/credentials.rst)\n* [Documentation/x86/x86_64/mm](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt)\n* [RCU](http://en.wikipedia.org/wiki/Read-copy-update)\n* [VFS](http://en.wikipedia.org/wiki/Virtual_file_system)\n* [inode](http://en.wikipedia.org/wiki/Inode)\n* [proc](http://en.wikipedia.org/wiki/Procfs)\n* [man proc](http://linux.die.net/man/5/proc)\n* [Sysctl](http://en.wikipedia.org/wiki/Sysctl)\n* [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt)\n* 
[cgroup](http://en.wikipedia.org/wiki/Cgroups)\n* [CPU hotplug documentation](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)\n* [completions - wait for completion handling](https://www.kernel.org/doc/Documentation/scheduler/completion.txt)\n* [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access)\n* [cpus/mems](https://www.kernel.org/doc/Documentation/cgroups/cpusets.txt)\n* [initcalls](http://kernelnewbies.org/Documents/InitcallMechanism)\n* [Tmpfs](http://en.wikipedia.org/wiki/Tmpfs)\n* [initrd](http://en.wikipedia.org/wiki/Initrd)\n* [panic](http://en.wikipedia.org/wiki/Kernel_panic)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9)\n"
  },
  {
    "path": "Initialization/linux-initialization-2.md",
    "content": "Kernel initialization. Part 2.\n================================================================================\n\nEarly interrupt and exception handling\n--------------------------------------------------------------------------------\n\nIn the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) we stopped before setting of early interrupt handlers. At this moment we are in the decompressed Linux kernel, we have basic [paging](https://en.wikipedia.org/wiki/Page_table) structure for early boot and our current goal is to finish early preparation before the main kernel code will start to work.\n\nWe already started to do this preparation in the previous [first](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) part of this [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization). We continue in this part and will know more about interrupt and exception handling.\n\nRemember that we stopped before following function:\n\n```C\n\tidt_setup_early_handler();\n```\n\nfrom the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head64.c) source code file. But before we start to sort out this function, we need to know about interrupts and handlers.\n\nSome theory\n--------------------------------------------------------------------------------\n\nAn interrupt is an event caused by software or hardware to the CPU. For example a user has pressed a key on the keyboard. On interrupt, CPU stops the current task and transfers control to the special routine which is called - [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler). An interrupt handler handles an interrupt and transfers control back to the previously stopped task. We can split interrupts into three types:\n\n* Software interrupts - when a software signals CPU that it needs kernel attention. 
These interrupts are generally used for system calls;\n* Hardware interrupts - when a hardware event happens, for example button is pressed on a keyboard;\n* Exceptions - interrupts generated by CPU, when the CPU detects error, for example division by zero or accessing a memory page which is not in RAM.\n\nEvery interrupt and exception is assigned a unique number which is called - `vector number`. `Vector number` can be any number from `0` to `255`. There is common practice to use first `32` vector numbers for exceptions, and vector numbers from `32` to `255` are used for user-defined interrupts.\n\nCPU uses vector number as an index in the `Interrupt Descriptor Table` (we will see description of it soon). CPU catches interrupts from the [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) or through its pins. Following table shows `0-31` exceptions:\n\n```\n----------------------------------------------------------------------------------------------\n|Vector|Mnemonic|Description         |Type |Error Code|Source                   |\n----------------------------------------------------------------------------------------------\n|0     | #DE    |Divide Error        |Fault|NO        |DIV and IDIV                          |\n|---------------------------------------------------------------------------------------------\n|1     | #DB    |Reserved            |F/T  |NO        |                                      |\n|---------------------------------------------------------------------------------------------\n|2     | ---    |NMI                 |INT  |NO        |external NMI                          |\n|---------------------------------------------------------------------------------------------\n|3     | #BP    |Breakpoint          |Trap |NO        |INT 3                                 |\n|---------------------------------------------------------------------------------------------\n|4     | #OF    |Overflow            |Trap |NO        
|INTO  instruction                     |\n|---------------------------------------------------------------------------------------------\n|5     | #BR    |Bound Range Exceeded|Fault|NO        |BOUND instruction                     |\n|---------------------------------------------------------------------------------------------\n|6     | #UD    |Invalid Opcode      |Fault|NO        |UD2 instruction                       |\n|---------------------------------------------------------------------------------------------\n|7     | #NM    |Device Not Available|Fault|NO        |Floating point or [F]WAIT             |\n|---------------------------------------------------------------------------------------------\n|8     | #DF    |Double Fault        |Abort|YES       |An instruction which can generate NMI |\n|---------------------------------------------------------------------------------------------\n|9     | ---    |Reserved            |Fault|NO        |                                      |\n|---------------------------------------------------------------------------------------------\n|10    | #TS    |Invalid TSS         |Fault|YES       |Task switch or TSS access             |\n|---------------------------------------------------------------------------------------------\n|11    | #NP    |Segment Not Present |Fault|NO        |Accessing segment register            |\n|---------------------------------------------------------------------------------------------\n|12    | #SS    |Stack-Segment Fault |Fault|YES       |Stack operations                      |\n|---------------------------------------------------------------------------------------------\n|13    | #GP    |General Protection  |Fault|YES       |Memory reference                      |\n|---------------------------------------------------------------------------------------------\n|14    | #PF    |Page fault          |Fault|YES       |Memory reference                      
|\n|---------------------------------------------------------------------------------------------\n|15    | ---    |Reserved            |     |NO        |                                      |\n|---------------------------------------------------------------------------------------------\n|16    | #MF    |x87 FPU fp error    |Fault|NO        |Floating point or [F]Wait             |\n|---------------------------------------------------------------------------------------------\n|17    | #AC    |Alignment Check     |Fault|YES       |Data reference                        |\n|---------------------------------------------------------------------------------------------\n|18    | #MC    |Machine Check       |Abort|NO        |                                      |\n|---------------------------------------------------------------------------------------------\n|19    | #XM    |SIMD fp exception   |Fault|NO        |SSE[2,3] instructions                 |\n|---------------------------------------------------------------------------------------------\n|20    | #VE    |Virtualization exc. |Fault|NO        |EPT violations                        |\n|---------------------------------------------------------------------------------------------\n|21-31 | ---    |Reserved            |INT  |NO        |External interrupts                   |\n----------------------------------------------------------------------------------------------\n```\n\nTo react on interrupt CPU uses special structure - Interrupt Descriptor Table or IDT. IDT is an array of 8-byte descriptors like Global Descriptor Table, but IDT entries are called `gates`. CPU multiplies vector number by 8 to find the IDT entry. But in 64-bit mode IDT is an array of 16-byte descriptors and CPU multiplies vector number by 16 to find the entry in the IDT. 
We remember from the previous part that CPU uses special `GDTR` register to locate Global Descriptor Table, so CPU uses special register `IDTR` for Interrupt Descriptor Table and `lidt` instruction for loading base address of the table into this register.\n\n64-bit mode IDT entry has following structure:\n\n```\n127                                                                             96\n --------------------------------------------------------------------------------\n|                                                                               |\n|                                Reserved                                       |\n|                                                                               |\n --------------------------------------------------------------------------------\n95                                                                              64\n --------------------------------------------------------------------------------\n|                                                                               |\n|                               Offset 63..32                                   |\n|                                                                               |\n --------------------------------------------------------------------------------\n63                               48 47      46  44   42    39             34    32\n --------------------------------------------------------------------------------\n|                                  |       |  D  |   |     |      |   |   |     |\n|       Offset 31..16              |   P   |  P  | 0 |Type |0 0 0 | 0 | 0 | IST |\n|                                  |       |  L  |   |     |      |   |   |     |\n --------------------------------------------------------------------------------\n31                                   16 15                                      0\n --------------------------------------------------------------------------------\n|                      
                |                                        |\n|          Segment Selector            |                 Offset 15..0           |\n|                                      |                                        |\n --------------------------------------------------------------------------------\n```\n\nWhere:\n\n* `Offset` - is offset to entry point of an interrupt handler;\n* `DPL` -    Descriptor Privilege Level;\n* `P` -      Segment Present flag;\n* `Segment selector` - a code segment selector in GDT or LDT (actually in linux, it must point to a valid descriptor in your GDT.)\n```C\n#define __KERNEL_CS\t(GDT_ENTRY_KERNEL_CS*8) // 0000 0000 0001 0000\n#define GDT_ENTRY_KERNEL_CS 2\n```\n* `IST` -    provides ability to switch to a new stack for interrupts handling.\n\nAnd the last `Type` field describes type of the `IDT` entry. There are three different kinds of gates for interrupts:\n\n* Task gate\n* Interrupt gate\n* Trap gate\n\nInterrupt and trap gates contain a far pointer to the entry point of the interrupt handler. Only one difference between these types is how CPU handles `IF` flag. If interrupt handler was accessed through interrupt gate, CPU clear the `IF` flag to prevent other interrupts while current interrupt handler executes. After that current interrupt handler executes, CPU sets the `IF` flag again with `iret` instruction.\n\nOther bits in the interrupt descriptor is reserved and must be 0. 
Now let's look at how the CPU handles interrupts:\n\n* The CPU saves the flags register, `CS`, and the instruction pointer on the stack.\n* If the interrupt causes an error code (like `#PF` for example), the CPU saves the error code on the stack after the instruction pointer;\n* After the interrupt handler executes, the `iret` instruction is used to return from it.\n\nNow let's get back to the code.\n\nFill and load IDT\n--------------------------------------------------------------------------------\n\nWe stopped at the following function:\n\n```C\n\tidt_setup_early_handler();\n```\n\n`idt_setup_early_handler` is defined in the [arch/x86/kernel/idt.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/idt.c) like the following:\n\n```C\nvoid __init idt_setup_early_handler(void)\n{\n\tint i;\n\n\tfor (i = 0; i < NUM_EXCEPTION_VECTORS; i++)\n\t\tset_intr_gate(i, early_idt_handler_array[i]);\n\n\tload_idt(&idt_descr);\n}\n```\n\nwhere `NUM_EXCEPTION_VECTORS` expands to `32`. As we can see, we're filling only the first 32 `IDT` entries in the loop, because all of the early setup runs with interrupts disabled, so there is no need to set up interrupt handlers for vectors greater than `31`. Here we call `set_intr_gate` in the loop, which takes two parameters:\n\n* Number of an interrupt or `vector number`;\n* Address of the idt handler.\n\nand inserts an interrupt gate into the `IDT` table which is represented by the `idt_table` array.\n\nThe `early_idt_handler_array` array is declared in the [arch/x86/include/asm/segment.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/segment.h) header file and contains addresses of the first `32` exception handlers:\n\n```C\n#define EARLY_IDT_HANDLER_SIZE   9\n#define NUM_EXCEPTION_VECTORS\t32\n\nextern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];\n```\n\nThe `early_idt_handler_array` is a `288`-byte array which contains an exception entry point every nine bytes. 
Every nine bytes of this array consist of two bytes optional instruction for pushing dummy error code if an exception does not provide it, two bytes instruction for pushing vector number to the stack and five bytes of `jump` to the common exception handler code. You will see more detail in the next paragraph.\n\nThe `set_intr_gate` function is defined in the [arch/x86/kernel/idt.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/idt.c) source file and looks:\n\n```C\nstatic void set_intr_gate(unsigned int n, const void *addr)\n{\n\tstruct idt_data data;\n\n\tBUG_ON(n > 0xFF);\n\n\tmemset(&data, 0, sizeof(data));\n\tdata.vector\t= n;\n\tdata.addr\t= addr;\n\tdata.segment\t= __KERNEL_CS;\n\tdata.bits.type\t= GATE_INTERRUPT;\n\tdata.bits.p\t= 1;\n\n        idt_setup_from_table(idt_table, &data, 1, false);\n}\n```\n\nFirst of all it checks that passed vector number is not greater than `255` with `BUG_ON` macro. We need to do this because we are limited to have up to `256` interrupts. After this, we fill the idt data with the given arguments and others, which will be passed to `idt_setup_from_table`. The `idt_setup_from_table` function is defined in the same file as the `set_intr_gate` function like the following:\n\n```C\nstatic void\nidt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)\n{\n\tgate_desc desc;\n\n\tfor (; size > 0; t++, size--) {\n\t\tdesc.offset_low    = (u16) t->addr;\n\t\tdesc.segment\t   = (u16) t->segment\n\t\tdesc.bits\t   = t->bits;\n\t\tdesc.offset_middle = (u16) (t->addr >> 16);\n\t\tdesc.offset_high   = (u32) (t->addr >> 32);\n\t\tdesc.reserved\t   = 0;\n\t\tmemcpy(&idt[t->vector], &desc, sizeof(desc));\n\t\tif (sys)\n\t\t\tset_bit(t->vector, system_vectors);\n\t}\n}\n```\n\nwhich fill temporary idt descriptor with the given arguments and others. And then we just copy it to the certain element of the `idt_table` array. 
`idt_table` is an array of idt entries:\n\n```C\ngate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;\n```\n\nNow we are moving back to main loop code. After main loop finishes, we can load `Interrupt Descriptor table` with the call of the:\n\n```C\n\tload_idt((const struct desc_ptr *)&idt_descr);\n```\n\nwhere `idt_descr` is:\n\n```C\nstruct desc_ptr idt_descr __ro_after_init = {\n\t.size\t\t= (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,\n\t.address\t= (unsigned long) idt_table,\n};\n```\n\nand `load_idt` just executes `lidt` instruction:\n\n```C\n\tasm volatile(\"lidt %0\"::\"m\" (idt_descr));\n```\n\nOkay, now we have filled and loaded `Interrupt Descriptor Table`, we know how the CPU acts during an interrupt. So now time to deal with interrupts handlers.\n\nEarly interrupts handlers\n--------------------------------------------------------------------------------\n\nAs you can read above, we filled `IDT` with the address of the `early_idt_handler_array`. In this section, we are going to look into it in detail. We can find it in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) assembly file:\n\n```assembly\nENTRY(early_idt_handler_array)\n\ti = 0\n\t.rept NUM_EXCEPTION_VECTORS\n\t.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0\n\t\tUNWIND_HINT_IRET_REGS\n\t\tpushq $0\t# Dummy error code, to make stack frame uniform\n\t.else\n\t\tUNWIND_HINT_IRET_REGS offset=8\n\t.endif\n\tpushq $i\t\t# 72(%rsp) Vector number\n\tjmp early_idt_handler_common\n\tUNWIND_HINT_IRET_REGS\n\ti = i + 1\n\t.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc\n\t.endr\n\tUNWIND_HINT_IRET_REGS offset=16\nEND(early_idt_handler_array)\n```\n\nWe can see here, interrupt handlers generation for the first `32` exceptions. We check here, if exception has an error code then we do nothing, if exception does not return error code, we push zero to the stack. We do it for that stack was uniform. 
After that we push `vector number` on the stack and jump on the `early_idt_handler_common` which is generic interrupt handler for now. After all, every nine bytes of the `early_idt_handler_array` array consists of optional push of an error code, push of `vector number` and jump instruction to `early_idt_handler_common`. We can see it in the output of the `objdump` util:\n\n```\n$ objdump -D vmlinux\n...\n...\n...\nffffffff81fe5000 <early_idt_handler_array>:\nffffffff81fe5000:       6a 00                   pushq  $0x0\nffffffff81fe5002:       6a 00                   pushq  $0x0\nffffffff81fe5004:       e9 17 01 00 00          jmpq   ffffffff81fe5120 <early_idt_handler_common>\nffffffff81fe5009:       6a 00                   pushq  $0x0\nffffffff81fe500b:       6a 01                   pushq  $0x1\nffffffff81fe500d:       e9 0e 01 00 00          jmpq   ffffffff81fe5120 <early_idt_handler_common>\nffffffff81fe5012:       6a 00                   pushq  $0x0\nffffffff81fe5014:       6a 02                   pushq  $0x2\n...\n...\n...\n```\n\nAs we may know, CPU pushes flag register, `CS` and `RIP` on the stack before calling interrupt handler. So before `early_idt_handler_common` will be executed, stack will contain following data:\n\n```\n|--------------------|\n| %rflags            |\n| %cs                |\n| %rip               |\n| error code         |\n| vector number      |<-- %rsp\n|--------------------|\n```\n\nNow let's look on the `early_idt_handler_common` implementation. It locates in the same [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head_64.S) assembly file. 
First of all we increment `early_recursion_flag` to prevent recursion in the `early_idt_handler_common`:\n\n```assembly\n\tincl early_recursion_flag(%rip)\n```\n\nThe `early_recursion_flag` is defined in the same assembly file as the `early_idt_handler_common` symbol as follows:\n\n```assembly\n\tearly_recursion_flag:\n\t\t.long 0\n```\n\nNext we save general registers on the stack:\n\n```assembly\n\tpushq %rsi\n\tmovq 8(%rsp), %rsi\n\tmovq %rdi, 8(%rsp)\n\tpushq %rdx\n\tpushq %rcx\n\tpushq %rax\n\tpushq %r8\n\tpushq %r9\n\tpushq %r10\n\tpushq %r11\n\tpushq %rbx\n\tpushq %rbp\n\tpushq %r12\n\tpushq %r13\n\tpushq %r14\n\tpushq %r15\n\tUNWIND_HINT_REGS\n```\n\nOkay, now the stack contains the following data:\n```\nHigh |-------------------------|\n     | %rflags                 |\n     | %cs                     |\n     | %rip                    |\n     | error code              |\n     | %rdi                    |\n     | %rsi                    |\n     | %rdx                    |\n     | %rcx                    |\n     | %rax                    |\n     | %r8                     |\n     | %r9                     |\n     | %r10                    |\n     | %r11                    |\n     | %rbx                    |\n     | %rbp                    |\n     | %r12                    |\n     | %r13                    |\n     | %r14                    |\n     | %r15                    |<-- %rsp\nLow  |-------------------------|\n```\n\nWe need to do it to prevent wrong values of registers when we return from the interrupt handler. After this we check the vector number, and if it is `#PF` or [Page Fault](https://en.wikipedia.org/wiki/Page_fault), we put the value from `cr2` into the `rdi` register and call `early_make_pgtable` (we'll see it soon):\n\n```assembly\n\tcmpq $14,%rsi            /* Page fault? */\n\tjnz 10f\n\tGET_CR2_INTO(%rdi)\n\tcall early_make_pgtable\n\tandl %eax,%eax           /* It is more efficient, the opcode is shorter than movl 1, %eax, only 2 bytes. 
*/\n\tjz 20f                   /* All good */\n```\n\notherwise we call the `early_fixup_exception` function, passing it the kernel stack pointer:\n\n```assembly\n10:\n\tmovq %rsp,%rdi\n\tcall early_fixup_exception\n```\n\nWe'll see the implementation of the `early_fixup_exception` function later.\n\n```assembly\n20:\n\tdecl early_recursion_flag(%rip)\n\tjmp restore_regs_and_return_to_kernel\n```\n\nAfter we decrement the `early_recursion_flag`, we restore the registers which we saved on the stack before and return from the handler with `iretq`.\n\nIt is the end of the interrupt handler. We will examine the page fault handling and the other exception handling in order.\n\nPage fault handling\n--------------------------------------------------------------------------------\n\nIn the previous paragraph we saw the early interrupt handler which checks if the vector number is a page fault and calls `early_make_pgtable` for building new page tables if it is. We need to have a `#PF` handler in this step because there are plans to add the ability to load the kernel above `4G` and to access the `boot_params` structure above 4G.\n\nYou can find the implementation of `early_make_pgtable` in [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/head64.c). It takes one parameter - the value of the `cr2` register, which contains the address that caused the page fault. 
Let's look on it:\n\n```C\nint __init early_make_pgtable(unsigned long address)\n{\n\tunsigned long physaddr = address - __PAGE_OFFSET;\n\tpmdval_t pmd;\n\n\tpmd = (physaddr & PMD_MASK) + early_pmd_flags;\n\n\treturn __early_make_pgtable(address, pmd);\n}\n```\n\n`__PAGE_OFFSET` is defined in the [arch/x86/include/asm/page_64_types.h](https://elixir.bootlin.com/linux/v3.10-rc1/source/arch/x86/include/asm/page_64_types.h#L33) header file, and the suffix `UL` forces the page offset to be a unsigned long data type.\n\n```C\n#define __PAGE_OFFSET           _AC(0xffff880000000000, UL) \n```\n\nAnd the `_AC` macro is defined in the [include/uapi/linux/const.h](https://elixir.bootlin.com/linux/v3.10-rc1/source/include/uapi/linux/const.h#L16) header file: \n\n```C\n/* Some constant macros are used in both assembler and\n * C code.  Therefore we cannot annotate them always with\n * 'UL' and other type specifiers unilaterally.  We\n * use the following macros to deal with this.\n *\n * Similarly, _AT() will cast an expression with a type in C, but\n * leave it unchanged in asm.\n */\n\n#ifdef __ASSEMBLY__\n#define _AC(X,Y)\tX\n#else\n#define __AC(X,Y)\t(X##Y)\n#define _AC(X,Y)\t__AC(X,Y)\n#endif\n```\nWhere `__PAGE_OFFSET` expands to `0xffff888000000000`. But, why is it possible to translate a virtual address to a physical address by subtracting `__PAGE_OFFSET`?  The answer is in the [Documentation/x86/x86_64/mm.rst](https://elixir.bootlin.com/linux/v5.10-rc5/source/Documentation/x86/x86_64/mm.rst#L45) documentation: \n\n```\n...\nffff888000000000 | -119.5  TB | ffffc87fffffffff |   64 TB | direct mapping of all physical memory (page_offset_base)\n...\n```\n\nAs explained above, the virtual address space `ffff888000000000-ffffc87fffffffff` is direct mapping of all physical memory. When the kernel wants to access all physical memory, it uses direct mapping.\n\nOkay, let's get back to discussing `early_make_pgtable`. 
We initialize `pmd` and pass it to the `__early_make_pgtable` function along with `address`. The `__early_make_pgtable` function is defined in the same file as the `early_make_pgtable` function as follows:\n\n```C\nint __init __early_make_pgtable(unsigned long address, pmdval_t pmd)\n{\n\tunsigned long physaddr = address - __PAGE_OFFSET;\n\tpgdval_t pgd, *pgd_p;\n\tp4dval_t p4d, *p4d_p;\n\tpudval_t pud, *pud_p;\n\tpmdval_t *pmd_p;\n\t...\n\t...\n\t...\n}\n```\n\nIt starts from the definition of some variables which have `*val_t` types. All of these types are declared as alias of `unsigned long` using `typedef`.\n\nAfter we made the check that we have no invalid address, we're getting the address of the Page Global Directory entry which contains base address of Page Upper Directory and put its value to the `pgd` variable:\n\n```C\nagain:\n\tpgd_p = &early_top_pgt[pgd_index(address)].pgd;\n\tpgd = *pgd_p;\n```\n\nAnd we check if `pgd` is presented. If it is, we assign the base address of the page upper directory table to `pud_p`:\n\n```C\n\tpud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);\n```\n\nwhere `PTE_PFN_MASK` is a macro which mask lower `12` bits of `(pte|pmd|pud|pgd)val_t`.\n\nIf `pgd` is not presented, we check if `next_early_pgt` is not greater than `EARLY_DYNAMIC_PAGE_TABLES` which is `64` and present a fixed number of buffers to set up new page tables on demand. If `next_early_pgt` is greater than `EARLY_DYNAMIC_PAGE_TABLES` we reset page tables and start again from `again` label. 
If `next_early_pgt` is less than `EARLY_DYNAMIC_PAGE_TABLES`, we assign the next entry of `early_dynamic_pgts` to `pud_p` and fill whole entry of the page upper directory with `0`, then fill the page global directory entry with the base address and some access rights:\n\n```C\n\tif (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {\n\t\treset_early_page_tables();\n\t\tgoto again;\n\t}\n\t\t\n\tpud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];\n\tmemset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);\n\t*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;\n```\n\nAnd we fix `pud_p` to point to correct entry and assign its value to `pud` with the following:\n\n```C\n\tpud_p += pud_index(address);\n\tpud = *pud_p;\n```\n\nAnd then we do the same routine as above, but to the page middle directory.\n\nIn the end we assign the given `pmd` which is passed by the `early_make_pgtable` function to the certain entry of page middle directory which maps kernel text+data virtual addresses:\n\n```C\n\tpmd_p[pmd_index(address)] = pmd;\n```\n\nAfter page fault handler finished its work, as a result, `early_top_pgt` contains entries which point to the valid addresses.\n\nOther exception handling\n--------------------------------------------------------------------------------\n\nIn early interrupt phase, exceptions other than page fault are handled by `early_fixup_exception` function which is defined in [arch/x86/mm/extable.c](https://github.com/torvalds/linux/blob/master/arch/x86/mm/extable.c) and takes two parameters - pointer to kernel stack which consists of saved registers and vector number:\n\n```C\nvoid __init early_fixup_exception(struct pt_regs *regs, int trapnr)\n{\n\t...\n\t...\n\t...\n}\n```\n\nFirst of all we need to make some checks as the following:\n\n```C\n\tif (trapnr == X86_TRAP_NMI)\n\t\treturn;\n\n\tif (early_recursion_flag > 2)\n\t\tgoto halt_loop;\n\n\tif (!xen_pv_domain() && regs->cs != __KERNEL_CS)\n\t\tgoto fail;\n```\n\nHere we just 
ignore [NMI](https://en.wikipedia.org/wiki/Non-maskable_interrupt) and make sure that we are not in a recursive situation.\n\nAfter that, we get into:\n\n```C\n\tif (fixup_exception(regs, trapnr))\n\t\treturn;\n```\n\nThe `fixup_exception` function finds the actual handler and calls it. It is defined in the same file as the `early_fixup_exception` function as follows:\n\n```C\nint fixup_exception(struct pt_regs *regs, int trapnr)\n{\n\tconst struct exception_table_entry *e;\n\tex_handler_t handler;\n\n\te = search_exception_tables(regs->ip);\n\tif (!e)\n\t\treturn 0;\n\n\thandler = ex_fixup_handler(e);\n\treturn handler(e, regs, trapnr);\n}\n```\n\nThe `ex_handler_t` is a function pointer type, which is defined like:\n\n```C\ntypedef bool (*ex_handler_t)(const struct exception_table_entry *,\n                            struct pt_regs *, int);\n```\n\nThe `search_exception_tables` function looks up the given address in the exception table (i.e. the contents of the ELF section, `__ex_table`). After that, we get the address of the actual handler with the `ex_fixup_handler` function. Finally, we call the actual handler. For more information about the exception table, you can refer to [Documentation/x86/exception-tables.txt](https://github.com/torvalds/linux/blob/master/Documentation/x86/exception-tables.txt).\n\nLet's get back to the `early_fixup_exception` function, the next step is:\n\n```C\n\tif (fixup_bug(regs, trapnr))\n\t\treturn;\n```\n\nThe `fixup_bug` function is defined in [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/traps.c). 
Let's have a look at the function implementation:\n\n```C\nint fixup_bug(struct pt_regs *regs, int trapnr)\n{\n\tif (trapnr != X86_TRAP_UD)\n\t\treturn 0;\n\n\tswitch (report_bug(regs->ip, regs)) {\n\tcase BUG_TRAP_TYPE_NONE:\n\tcase BUG_TRAP_TYPE_BUG:\n\t\tbreak;\n\n\tcase BUG_TRAP_TYPE_WARN:\n\t\tregs->ip += LEN_UD2;\n\t\treturn 1;\n\t}\n\n\treturn 0;\n}\n```\n\nAll this function does is return `1` if the exception was generated because `#UD` (or [Invalid Opcode](https://wiki.osdev.org/Exceptions#Invalid_Opcode)) occurred and the `report_bug` function returns `BUG_TRAP_TYPE_WARN`; otherwise it returns `0`.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the second part about the Linux kernel initialization process. If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see all the steps before the kernel entry point - the `start_kernel` function.\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [GNU assembly .rept](https://sourceware.org/binutils/docs-2.23/as/Rept.html)\n* [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt)\n* [Page table](https://en.wikipedia.org/wiki/Page_table)\n* [Interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler)\n* [Page Fault](https://en.wikipedia.org/wiki/Page_fault)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1)\n"
  },
  {
    "path": "Initialization/linux-initialization-3.md",
    "content": "Kernel initialization. Part 3.\n================================================================================\n\nLast preparations before the kernel entry point\n--------------------------------------------------------------------------------\n\nThis is the third part of the Linux kernel initialization process series. In the previous [part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md) we saw early interrupt and exception handling and will continue to dive into the Linux kernel initialization process in the current part. Our next point is 'kernel entry point' - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. Yes, technically it is not kernel's entry point but the start of the generic kernel code which does not depend on certain architecture. But before we call the `start_kernel` function, we must do some preparations. So let's continue.\n\nboot_params again\n--------------------------------------------------------------------------------\n\nIn the previous part we stopped at setting Interrupt Descriptor Table and loading it in the `IDTR` register. At the next step after this we can see a call of the `copy_bootdata` function:\n\n```C\ncopy_bootdata(__va(real_mode_data));\n```\n\nThis function takes one argument - virtual address of the `real_mode_data`. 
Remember that we passed the address of the `boot_params` structure from [arch/x86/include/uapi/asm/bootparam.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/bootparam.h#L114) to the `x86_64_start_kernel` function as the first argument in [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S):\n\n```\n\t/* rsi is pointer to real mode structure with interesting info.\n\t   pass it to C */\n\tmovq\t%rsi, %rdi\n```\n\nNow let's look at the `__va` macro. This macro is defined in [arch/x86/include/asm/page.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/page.h):\n\n```C\n#define __va(x)                 ((void *)((unsigned long)(x)+PAGE_OFFSET))\n```\n\nwhere `PAGE_OFFSET` is `__PAGE_OFFSET` which is `0xffff880000000000` and the base virtual address of the direct mapping of all physical memory. So we're getting the virtual address of the `boot_params` structure which comes from real mode, and passing it to the `copy_bootdata` function, where we copy `real_mode_data` to the `boot_params` which is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/d9919d43cbf6790d2bc0c0a2743c51fc25f26919/arch/x86/kernel/setup.c)\n\n```C\nstruct boot_params boot_params;\n```\n\nLet's look at the `copy_bootdata` implementation:\n\n```C\nstatic void __init copy_bootdata(char *real_mode_data)\n{\n\tchar * command_line;\n\tunsigned long cmd_line_ptr;\n\n\tmemcpy(&boot_params, real_mode_data, sizeof boot_params);\n\tsanitize_boot_params(&boot_params);\n\tcmd_line_ptr = get_cmd_line_ptr();\n\tif (cmd_line_ptr) {\n\t\tcommand_line = __va(cmd_line_ptr);\n\t\tmemcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);\n\t}\n}\n```\n\nFirst of all, note that this function is declared with the `__init` prefix. 
It means that this function will be used only during the initialization and used memory will be freed.\n\nWe can see the declaration of two variables for the kernel command line and the copying of `real_mode_data` to the `boot_params` with the `memcpy` function. The next call is to the `sanitize_boot_params` function, which zeroes some fields of the `boot_params` structure, like `ext_ramdisk_image`, for bootloaders which fail to initialize unknown fields in `boot_params` to zero. After this we're getting the address of the command line with the call of the `get_cmd_line_ptr` function:\n\n```C\nunsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;\ncmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;\nreturn cmd_line_ptr;\n```\n\nwhich gets the 64-bit address of the command line from the kernel boot header and returns it. In the last step we check `cmd_line_ptr`, get its virtual address and copy the command line to the `boot_command_line` which is just an array of bytes:\n\n```C\nextern char __initdata boot_command_line[];\n```\n\nAfter this we have copied the kernel command line and the `boot_params` structure. In the next step we can see the call of the `load_ucode_bsp` function which loads processor microcode, but we will not see it here.\n\nAfter the microcode was loaded we can see the check of the `console_loglevel` and the `early_printk` function which prints the `Kernel Alive` string. But you'll never see this output because `early_printk` is not initialized yet. It is a minor bug in the kernel and I sent the patch - [commit](http://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit/?id=91d8f0416f3989e248d3a3d3efb821eda10a85d2) and you will see it in the mainline soon. So you can skip this code.\n\nMove on init pages\n--------------------------------------------------------------------------------\n\nIn the next step, as we have copied the `boot_params` structure, we need to move from the early page tables to the page tables for the initialization process. 
We already set early page tables for switchover, you can read about it in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) and dropped all it in the `reset_early_page_tables` function (you can read about it in the previous part too) and kept only kernel high mapping. After this we call:\n\n```C\n\tclear_page(init_level4_pgt);\n```\n\nfunction and pass `init_level4_pgt` which also defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S) and looks:\n\n```assembly\nNEXT_PAGE(init_level4_pgt)\n\t.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE\n\t.org    init_level4_pgt + L4_PAGE_OFFSET*8, 0\n\t.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE\n\t.org    init_level4_pgt + L4_START_KERNEL*8, 0\n\t.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE\n```\n\nwhich maps first 2 gigabytes and 512 megabytes for the kernel code, data and bss. `clear_page` function defined in the [arch/x86/lib/clear_page_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/lib/clear_page_64.S) let's look on this function:\n\n```assembly\nENTRY(clear_page)\n\tCFI_STARTPROC\n\txorl %eax,%eax\n\tmovl $4096/64,%ecx\n\t.p2align 4\n\t.Lloop:\n    decl\t%ecx\n#define PUT(x) movq %rax,x*8(%rdi)\n\tmovq %rax,(%rdi)\n\tPUT(1)\n\tPUT(2)\n\tPUT(3)\n\tPUT(4)\n\tPUT(5)\n\tPUT(6)\n\tPUT(7)\n\tleaq 64(%rdi),%rdi\n\tjnz\t.Lloop\n\tnop\n\tret\n\tCFI_ENDPROC\n\t.Lclear_page_end:\n\tENDPROC(clear_page)\n```\n\nAs you can understand from the function name it clears or fills with zeros page tables. First of all note that this function starts with the `CFI_STARTPROC` and `CFI_ENDPROC` which are expands to GNU assembly directives:\n\n```C\n#define CFI_STARTPROC           .cfi_startproc\n#define CFI_ENDPROC             .cfi_endproc\n```\n\nand used for debugging. 
After `CFI_STARTPROC` macro we zero out `eax` register and put 64 to the `ecx` (it will be a counter). Next we can see loop which starts with the `.Lloop` label and it starts from the `ecx` decrement. After it we put zero from the `rax` register to the `rdi` which contains the base address of the `init_level4_pgt` now and do the same procedure seven times but every time move `rdi` offset on 8. After this we will have first 64 bytes of the `init_level4_pgt` filled with zeros. In the next step we put the address of the `init_level4_pgt` with 64-bytes offset to the `rdi` again and repeat all operations until `ecx` reaches zero. In the end we will have `init_level4_pgt` filled with zeros.\n\nAs we have `init_level4_pgt` filled with zeros, we set the last `init_level4_pgt` entry to kernel high mapping with the:\n\n```C\ninit_level4_pgt[511] = early_top_pgt[511];\n```\n\nRemember that we dropped all `early_top_pgt` entries in the `reset_early_page_tables` function and kept only kernel high mapping there.\n\nThe last step in the `x86_64_start_kernel` function is the call of the:\n\n```C\nx86_64_start_reservations(real_mode_data);\n```\n\nfunction with the `real_mode_data` as argument. The `x86_64_start_reservations` function is defined in the same source code file as the `x86_64_start_kernel` function and looks like this:\n\n```C\nvoid __init x86_64_start_reservations(char *real_mode_data)\n{\n\tif (!boot_params.hdr.version)\n\t\tcopy_bootdata(__va(real_mode_data));\n\n\treserve_ebda_region();\n\n\tstart_kernel();\n}\n```\n\nYou can see that it is the last function before we are in the kernel entry point - `start_kernel` function. 
Let's look what it does and how it works.\n\nLast step before kernel entry point\n--------------------------------------------------------------------------------\n\nFirst of all we can see in the `x86_64_start_reservations` function the check for `boot_params.hdr.version`:\n\n```C\nif (!boot_params.hdr.version)\n\tcopy_bootdata(__va(real_mode_data));\n```\n\nand if it is zero we call `copy_bootdata` function again with the virtual address of the `real_mode_data` (read about its implementation).\n\nIn the next step we can see the call of the `reserve_ebda_region` function which defined in the [arch/x86/kernel/head.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head.c). This function reserves memory block for the `EBDA` or Extended BIOS Data Area. The Extended BIOS Data Area located in the top of conventional memory and contains data about ports, disk parameters and etc...\n\nLet's look on the `reserve_ebda_region` function. It starts from the checking is paravirtualization enabled or not:\n\n```C\nif (paravirt_enabled())\n\treturn;\n```\n\nwe exit from the `reserve_ebda_region` function if paravirtualization is enabled because if it enabled the extended BIOS data area is absent. In the next step we need to get the end of the low memory:\n\n```C\nlowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);\nlowmem <<= 10;\n```\n\nWe're getting the virtual address of the BIOS low memory in kilobytes and convert it to bytes with shifting it on 10 (multiply on 1024 in other words). 
After this we need to get the address of the extended BIOS data area with the:\n\n```C\nebda_addr = get_bios_ebda();\n```\n\nwhere `get_bios_ebda` function is defined in the [arch/x86/include/asm/bios_ebda.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bios_ebda.h) and looks like:\n\n```C\nstatic inline unsigned int get_bios_ebda(void)\n{\n\tunsigned int address = *(unsigned short *)phys_to_virt(0x40E);\n\taddress <<= 4;\n\treturn address;\n}\n```\n\nLet's try to understand how it works. Here we can see that we are converting physical address `0x40E` to the virtual, where `0x0040:0x000e` is the segment which contains base address of the extended BIOS data area. Don't worry that we are using `phys_to_virt` function for converting a physical address to virtual address. You can note that previously we have used `__va` macro for the same purpose, but `phys_to_virt` is the same:\n\n```C\nstatic inline void *phys_to_virt(phys_addr_t address)\n{\n         return __va(address);\n}\n```\n\nonly with one difference: we pass argument with the `phys_addr_t` which depends on `CONFIG_PHYS_ADDR_T_64BIT`:\n\n```C\n#ifdef CONFIG_PHYS_ADDR_T_64BIT\n\ttypedef u64 phys_addr_t;\n#else\n\ttypedef u32 phys_addr_t;\n#endif\n```\n\nThis configuration option is enabled automatically on `x86_64`, so `phys_addr_t` is `u64` in our case. After we have read the 16-bit value stored at this virtual address, we shift it left by 4 to turn the real-mode segment into a physical address and return it. After this the `ebda_addr` variable contains the base address of the extended BIOS data area.\n\nIn the next step we check that the address of the extended BIOS data area and the low memory are not less than the `INSANE_CUTOFF` macro\n\n```C\nif (ebda_addr < INSANE_CUTOFF)\n\tebda_addr = LOWMEM_CAP;\n\nif (lowmem < INSANE_CUTOFF)\n\tlowmem = LOWMEM_CAP;\n```\n\nwhich is:\n\n```C\n#define INSANE_CUTOFF\t\t0x20000U\n```\n\nor 128 kilobytes. 
In the last step we get lower part in the low memory and extended BIOS data area and call `memblock_reserve` function which will reserve memory region for extended BIOS data between low memory and one megabyte mark:\n\n```C\nlowmem = min(lowmem, ebda_addr);\nlowmem = min(lowmem, LOWMEM_CAP);\nmemblock_reserve(lowmem, 0x100000 - lowmem);\n```\n\n`memblock_reserve` function is defined at [mm/memblock.c](https://github.com/torvalds/linux/blob/master/mm/memblock.c) and takes two parameters:\n\n* base physical address;\n* region size.\n\nand reserves memory region for the given base address and size. `memblock_reserve` is the first function in this book from Linux kernel memory manager framework. We will take a closer look on memory manager soon, but now let's look at its implementation.\n\nFirst touch of the Linux kernel memory manager framework\n--------------------------------------------------------------------------------\n\nIn the previous paragraph we stopped at the call of the `memblock_reserve` function and as I said before it is the first function from the memory manager framework. Let's try to understand how it works. 
`memblock_reserve` function just calls:\n\n```C\nmemblock_reserve_region(base, size, MAX_NUMNODES, 0);\n```\n\nfunction and passes 4 parameters there:\n\n* physical base address of the memory region;\n* size of the memory region;\n* maximum number of numa nodes;\n* flags.\n\nAt the start of the `memblock_reserve_region` body we can see definition of the `memblock_type` structure:\n\n```C\nstruct memblock_type *_rgn = &memblock.reserved;\n```\n\nwhich presents the type of the memory block and looks:\n\n```C\nstruct memblock_type {\n         unsigned long cnt;\n         unsigned long max;\n         phys_addr_t total_size;\n         struct memblock_region *regions;\n};\n```\n\nAs we need to reserve memory block for extended BIOS data area, the type of the current memory region is reserved where `memblock` structure is:\n\n```C\nstruct memblock {\n         bool bottom_up;\n         phys_addr_t current_limit;\n         struct memblock_type memory;\n         struct memblock_type reserved;\n#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP\n         struct memblock_type physmem;\n#endif\n};\n```\n\nand describes generic memory block. You can see that we initialize `_rgn` by assigning it to the address of the `memblock.reserved`. `memblock` is the global variable which looks:\n\n```C\nstruct memblock memblock __initdata_memblock = {\n\t.memory.regions\t\t= memblock_memory_init_regions,\n\t.memory.cnt\t\t= 1,\n\t.memory.max\t\t= INIT_MEMBLOCK_REGIONS,\n\t.reserved.regions\t= memblock_reserved_init_regions,\n\t.reserved.cnt\t\t= 1,\n\t.reserved.max\t\t= INIT_MEMBLOCK_REGIONS,\n#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP\n\t.physmem.regions\t= memblock_physmem_init_regions,\n\t.physmem.cnt\t\t= 1,\n\t.physmem.max\t\t= INIT_PHYSMEM_REGIONS,\n#endif\n\t.bottom_up\t\t= false,\n\t.current_limit\t\t= MEMBLOCK_ALLOC_ANYWHERE,\n};\n```\n\nWe will not dive into detail of this variable, but we will see all details about it in the parts about memory manager. 
Just note that `memblock` variable is defined with the `__initdata_memblock` which is:\n\n```C\n#define __initdata_memblock __meminitdata\n```\n\nand `__meminitdata` is:\n\n```C\n#define __meminitdata    __section(.meminit.data)\n```\n\nFrom this we can conclude that all memory blocks will be in the `.meminit.data` section. After we defined `_rgn` we print information about it with the `memblock_dbg` macro. You can enable it by passing `memblock=debug` to the kernel command line.\n\nAfter debugging lines were printed next is the call of the following function:\n\n```C\nmemblock_add_range(_rgn, base, size, nid, flags);\n```\n\nwhich adds new memory block region into the `.meminit.data` section. As we do not initialize `_rgn` but it just contains `&memblock.reserved`, we just fill passed `_rgn` with the base address of the extended BIOS data area region, size of this region and flags:\n\n```C\nif (type->regions[0].size == 0) {\n    WARN_ON(type->cnt != 1 || type->total_size);\n    type->regions[0].base = base;\n    type->regions[0].size = size;\n    type->regions[0].flags = flags;\n    memblock_set_region_node(&type->regions[0], nid);\n    type->total_size = size;\n    return 0;\n}\n```\n\nAfter we filled our region we can see the call of the `memblock_set_region_node` function with two parameters:\n\n* address of the filled memory region;\n* NUMA node id.\n\nwhere our regions are represented by the `memblock_region` structure:\n\n```C\nstruct memblock_region {\n    phys_addr_t base;\n\tphys_addr_t size;\n\tunsigned long flags;\n#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP\n    int nid;\n#endif\n};\n```\n\nNUMA node id depends on `MAX_NUMNODES` macro which is defined in the [include/linux/numa.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/numa.h):\n\n```C\n#define MAX_NUMNODES    (1 << NODES_SHIFT)\n```\n\nwhere `NODES_SHIFT` depends on `CONFIG_NODES_SHIFT` configuration parameter and is defined as:\n\n```C\n#ifdef CONFIG_NODES_SHIFT\n  
#define NODES_SHIFT     CONFIG_NODES_SHIFT\n#else\n  #define NODES_SHIFT     0\n#endif\n```\n\n`memblock_set_region_node` function just fills `nid` field from `memblock_region` with the given value:\n\n```C\nstatic inline void memblock_set_region_node(struct memblock_region *r, int nid)\n{\n         r->nid = nid;\n}\n```\n\nAfter this we will have first reserved `memblock` for the extended BIOS data area in the `.meminit.data` section. `reserve_ebda_region` function finished its work on this step and we can go back to the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c).\n\nWe finished all preparations before the kernel entry point! The last step in the `x86_64_start_reservations` function is the call of the:\n\n```C\nstart_kernel()\n```\n\nfunction from [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) file.\n\nThat's all for this part.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the third part about Linux kernel insides. In next part we will see the first initialization steps in the kernel entry point - `start_kernel` function. It will be the first step before we will see launch of the first `init` process.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [BIOS data area](http://stanislavs.org/helppc/bios_data_area.html)\n* [What is in the extended BIOS data area on a PC?](http://www.kryslix.com/nsfaq/Q.6.html)\n* [Previous part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md)\n"
  },
  {
    "path": "Initialization/linux-initialization-4.md",
    "content": "Kernel initialization. Part 4.\n================================================================================\n\nKernel entry point\n================================================================================\n\nIf you have read the previous part - [Last preparations before the kernel entry point](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md), you can remember that we finished all pre-initialization stuff and stopped right before the call to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). The `start_kernel` is the entry of the generic and architecture independent kernel code, although we will return to the `arch/` folder many times. If you look inside of the `start_kernel` function, you will see that this function is very big. For this moment it contains about `86` function calls. Yes, it's very big and of course this part will not cover all the processes that occur in this function. In the current part we will only start to do it. This part and all the next which will be in the [Kernel initialization process](https://github.com/0xAX/linux-insides/blob/master/Initialization/README.md) chapter will cover it.\n\nThe main purpose of the `start_kernel` to finish kernel initialization process and launch the first `init` process. Before the first process will be started, the `start_kernel` must do many things such as: to enable [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt), to initialize processor id, to enable early [cgroups](http://en.wikipedia.org/wiki/Cgroups) subsystem, to setup per-cpu areas, to initialize different caches in [vfs](http://en.wikipedia.org/wiki/Virtual_file_system), to initialize memory manager, rcu, vmalloc, scheduler, IRQs, ACPI and many many more. 
Only after these steps will we see the launch of the first `init` process in the last part of this chapter. So much kernel code awaits us, let's start.\n\n**NOTE: All parts from this big chapter `Linux Kernel initialization process` will not cover anything about debugging. There will be a separate chapter about kernel debugging tips.**\n\nA little about function attributes\n---------------------------------------------------------------------------------\n\nAs I wrote above, the `start_kernel` function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). This function defined with the `__init` attribute and as you already may know from other parts, all functions which are defined with this attribute are necessary during kernel initialization.\n\n```C\n#define __init      __section(.init.text) __cold notrace\n```\n\nAfter the initialization process have finished, the kernel will release these sections with a call to the `free_initmem` function. Note also that `__init` is defined with two attributes: `__cold` and `notrace`. The purpose of the first `cold` attribute is to mark that the function is rarely used and the compiler must optimize this function for size. The second `notrace` is defined as:\n\n```C\n#define notrace __attribute__((no_instrument_function))\n```\n\nwhere `no_instrument_function` says to the compiler not to generate profiling function calls.\n\nIn the definition of the `start_kernel` function, you can also see the `__visible` attribute which expands to the:\n\n```\n#define __visible __attribute__((externally_visible))\n```\n\nwhere `externally_visible` tells to the compiler that something uses this function or variable, to prevent marking this function/variable as `unusable`. 
You can find the definition of this and other macro attributes in [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h).\n\nFirst steps in the start_kernel\n--------------------------------------------------------------------------------\n\nAt the beginning of the `start_kernel` you can see the definition of these two variables:\n\n```C\nchar *command_line;\nchar *after_dashes;\n```\n\nThe first represents a pointer to the kernel command line and the second will contain the result of the `parse_args` function which parses an input string with parameters in the form `name=value`, looking for specific keywords and invoking the right handlers. We will not go into the details related with these two variables at this time, but will see it in the next parts. In the next step we can see a call to the `set_task_stack_end_magic` function. This function takes address of the `init_task` and sets `STACK_END_MAGIC` (`0x57AC6E9D`) as canary for it. `init_task` represents the initial task structure:\n\n```C\nstruct task_struct init_task = INIT_TASK(init_task);\n```\n\nwhere `task_struct` stores all the information about a process. I will not explain this structure in this book because it's very big. You can find its definition in [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h#L1278). At this moment `task_struct` contains more than `100` fields! Although you will not see the explanation of the `task_struct` in this book, we will use it very often since it is the fundamental structure which describes the `process` in the Linux kernel. I will describe the meaning of the fields of this structure as we meet them in practice.\n\nYou can see the definition of the `init_task` and it is initialized by the `INIT_TASK` macro. 
This macro is from [include/linux/init_task.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init_task.h) and it just fills the `init_task` with the values for the first process. For example it sets:\n\n* init process state to zero or `runnable`. A runnable process is one which is waiting only for a CPU to run on;\n* init process flags - `PF_KTHREAD` which means - kernel thread;\n* a list of runnable tasks;\n* process address space;\n* init process stack to the `&init_thread_info` which is `init_thread_union.thread_info` and `init_thread_union` has type - `thread_union` which contains `thread_info` and process stack:\n\n```C\nunion thread_union {\n    struct thread_info thread_info;\n    unsigned long stack[THREAD_SIZE/sizeof(long)];\n};\n```\n\nEvery process has its own stack and it is 16 kilobytes or 4 page frames in `x86_64`. We can note that it is defined as array of `unsigned long`. The next field of the `thread_union` is - `thread_info` defined as:\n\n```C\nstruct thread_info {\n        struct task_struct      *task;\n        struct exec_domain      *exec_domain;\n        __u32                   flags;\n        __u32                   status;\n        __u32                   cpu;\n        int                     saved_preempt_count;\n        mm_segment_t            addr_limit;\n        struct restart_block    restart_block;\n        void __user             *sysenter_return;\n        unsigned int            sig_on_uaccess_error:1;\n        unsigned int            uaccess_err:1;\n};\n```\n\nand occupies 52 bytes. The `thread_info` structure contains architecture-specific information on the thread. We know that on `x86_64` the stack grows down and `thread_union.thread_info` is stored at the bottom of the stack in our case. So the process stack is 16 kilobytes and `thread_info` is at the bottom. The remaining thread size will be `16 kilobytes - 52 bytes = 16332 bytes`. 
Note that `thread_union` represented as the [union](http://en.wikipedia.org/wiki/Union_type) and not structure, it means that `thread_info` and stack share the memory space.\n\nSchematically it can be represented as follows:\n\n```C\n+-----------------------+\n|                       |\n|                       |\n|        stack          |\n|                       |\n|_______________________|\n|          |            |\n|          |            |\n|          |            |\n|__________↓____________|             +--------------------+\n|                       |             |                    |\n|      thread_info      |<----------->|     task_struct    |\n|                       |             |                    |\n+-----------------------+             +--------------------+\n```\n\nhttp://www.quora.com/In-Linux-kernel-Why-thread_info-structure-and-the-kernel-stack-of-a-process-binds-in-union-construct\n\nSo the `INIT_TASK` macro fills these `task_struct's` fields and many many more. As I already wrote above, I will not describe all the fields and values in the `INIT_TASK` macro but we will see them soon.\n\nNow let's go back to the `set_task_stack_end_magic` function. This function defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c#L297) and sets a [canary](http://en.wikipedia.org/wiki/Stack_buffer_overflow) to the `init` process stack to prevent stack overflow.\n\n```C\nvoid set_task_stack_end_magic(struct task_struct *tsk)\n{\n\tunsigned long *stackend;\n\tstackend = end_of_stack(tsk);\n\t*stackend = STACK_END_MAGIC; /* for overflow detection */\n}\n```\n\nIts implementation is simple. `set_task_stack_end_magic` gets the end of the stack for the given `task_struct` with the `end_of_stack` function. Earlier (and now for all architectures besides `x86_64`) stack was located in the `thread_info` structure. 
So the end of a process stack depends on the `CONFIG_STACK_GROWSUP` configuration option. As we learn in `x86_64` architecture, the stack grows down. So the end of the process stack will be:\n\n```C\n(unsigned long *)(task_thread_info(p) + 1);\n```\n\nwhere `task_thread_info` just returns the stack which we filled with the `INIT_TASK` macro:\n\n```C\n#define task_thread_info(task)  ((struct thread_info *)(task)->stack)\n```\n\nFrom the Linux kernel `v4.9-rc1` release, `thread_info` structure may contain only flags, and the stack pointer resides in the `task_struct` structure which represents a thread in the Linux kernel. This depends on `CONFIG_THREAD_INFO_IN_TASK` kernel configuration option which is enabled by default for `x86_64`. You can verify this by looking at the [init/Kconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/Kconfig) configuration build file:\n\n```\nconfig THREAD_INFO_IN_TASK\n\tbool\n\thelp\n\t  Select this to move thread_info off the stack into task_struct.  To\n\t  make this work, an arch will need to remove all thread_info fields\n\t  except flags and fix any runtime bugs.\n\n\t  One subtle change that will be needed is to use try_get_task_stack()\n\t  and put_task_stack() in save_thread_stack_tsk() and get_wchan().\n```\n\nand [arch/x86/Kconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Kconfig):\n\n```\nconfig X86\n\tdef_bool y\n        ...\n        ...\n        ...\n        select THREAD_INFO_IN_TASK\n        ...\n        ...\n        ...\n```\n\nSo, in this way we may just get end of a thread stack from the given `task_struct` structure:\n\n```C\n#ifdef CONFIG_THREAD_INFO_IN_TASK\nstatic inline unsigned long *end_of_stack(const struct task_struct *task)\n{\n\treturn task->stack;\n}\n#endif\n```\n\nAs we got the end of the `init` process stack, we write `STACK_END_MAGIC` there. 
After `canary` is set, we can check it like this:\n\n```C\nif (*end_of_stack(task) != STACK_END_MAGIC) {\n        //\n        // handle stack overflow here\n        //\n}\n```\n\nThe next function after the `set_task_stack_end_magic` is `smp_setup_processor_id`. This function has an empty body for `x86_64`:\n\n```C\nvoid __init __weak smp_setup_processor_id(void)\n{\n}\n```\n\nas it is implemented only for some architectures, such as [s390](http://en.wikipedia.org/wiki/IBM_ESA/390) and [arm64](http://en.wikipedia.org/wiki/ARM_architecture#64.2F32-bit_architecture).\n\nThe next function in `start_kernel` is `debug_objects_early_init`. Implementation of this function is almost the same as `lockdep_init`, but fills hashes for object debugging. As I wrote above, we will not see the explanation of this and other functions which are for debugging purposes in this chapter.\n\nAfter the `debug_objects_early_init` function we can see the call of the `boot_init_stack_canary` function which fills `task_struct->stack_canary` with the `canary` value for the `-fstack-protector` gcc feature. This function depends on the `CONFIG_CC_STACKPROTECTOR` configuration option and if this option is disabled, `boot_init_stack_canary` does nothing, otherwise it generates random numbers based on random pool and the [TSC](http://en.wikipedia.org/wiki/Time_Stamp_Counter):\n\n```C\nget_random_bytes(&canary, sizeof(canary));\ntsc = __native_read_tsc();\ncanary += tsc + (tsc << 32UL);\n```\n\nAfter we got a random number, we fill the `stack_canary` field of `task_struct` with it:\n\n```C\ncurrent->stack_canary = canary;\n```\n\nand write this value to the top of the IRQ stack with the:\n\n```C\nthis_cpu_write(irq_stack_union.stack_canary, canary); // read below about this_cpu_write\n```\n\nAgain, we will not dive into details here, we will cover it in the part about [IRQs](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29). 
As `canary` is set, we disable local and early boot IRQs and register the bootstrap CPU in the CPU maps. We disable local IRQs (interrupts for current CPU) with the `local_irq_disable` macro which expands to the call of the `arch_local_irq_disable` function from [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h):\n\n```C\nstatic inline notrace void arch_local_irq_disable(void)\n{\n        native_irq_disable();\n}\n```\n\nWhere `native_irq_disable` is `cli` instruction for `x86_64`. As interrupts are disabled we can register the current CPU with the given ID in the CPU bitmap.\n\nThe first processor activation\n---------------------------------------------------------------------------------\n\nThe current function from the `start_kernel` is `boot_cpu_init`. This function initializes various CPU masks for the bootstrap processor. First of all it gets the bootstrap processor id with a call to:\n\n```C\nint cpu = smp_processor_id();\n```\n\nFor now it is just zero. If the `CONFIG_DEBUG_PREEMPT` configuration option is disabled, `smp_processor_id` just expands to the call of `raw_smp_processor_id` which expands to the:\n\n```C\n#define raw_smp_processor_id() (this_cpu_read(cpu_number))\n```\n\n`this_cpu_read`, like many other functions of this kind (`this_cpu_write`, `this_cpu_add` and etc...), is defined in the [include/linux/percpu-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/percpu-defs.h) and represents a `this_cpu` operation. These operations provide a way of optimizing access to the [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variables which are associated with the current processor. In our case it is `this_cpu_read`:\n\n```\n__pcpu_size_call_return(this_cpu_read_, pcp)\n```\n\nRemember that we have passed `cpu_number` as `pcp` to the `this_cpu_read` from the `raw_smp_processor_id`. 
Now let's look at the `__pcpu_size_call_return` implementation:\n\n```C\n#define __pcpu_size_call_return(stem, variable)                         \\\n({                                                                      \\\n        typeof(variable) pscr_ret__;                                    \\\n        __verify_pcpu_ptr(&(variable));                                 \\\n        switch(sizeof(variable)) {                                      \\\n        case 1: pscr_ret__ = stem##1(variable); break;                  \\\n        case 2: pscr_ret__ = stem##2(variable); break;                  \\\n        case 4: pscr_ret__ = stem##4(variable); break;                  \\\n        case 8: pscr_ret__ = stem##8(variable); break;                  \\\n        default:                                                        \\\n                __bad_size_call_parameter(); break;                     \\\n        }                                                               \\\n        pscr_ret__;                                                     \\\n})\n```\n\nYes, it looks a little strange but it's easy. First of all we can see the definition of the `pscr_ret__` variable with the `int` type. Why int? Ok, `variable` is `cpu_number` and it was declared as per-cpu int variable:\n\n```C\nDECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);\n```\n\nIn the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `__verify_pcpu_ptr` is used to verify that the given parameter is a per-cpu pointer. After that we set `pscr_ret__` value which depends on the size of the variable. Our `cpu_number` variable is `int`, so it's 4 bytes in size. It means that we will get `this_cpu_read_4(cpu_number)` in `pscr_ret__`. The last expression of the `__pcpu_size_call_return` macro, `pscr_ret__;`, is the value the whole macro evaluates to. `this_cpu_read_4` is a macro:\n\n```C\n#define this_cpu_read_4(pcp)       percpu_from_op(\"mov\", pcp)\n```\n\nwhich calls `percpu_from_op` and passes the `mov` instruction and the per-cpu variable there. 
`percpu_from_op` will expand to the inline assembly call:\n\n```C\nasm(\"movl %%gs:%1,%0\" : \"=r\" (pfo_ret__) : \"m\" (cpu_number))\n```\n\nLet's try to understand how it works and what it does. The `gs` segment register contains the base of per-cpu area. Here we just copy `cpu_number` which is in memory to the `pfo_ret__` with the `movl` instruction. Or in other words:\n\n```C\nthis_cpu_read(cpu_number)\n```\n\nis the same as:\n\n```C\nmovl %gs:$cpu_number, $pfo_ret__\n```\n\nAs we have not set up the per-cpu areas yet, there is only one of them - for the currently running CPU - so we will get `zero` as a result of the `smp_processor_id`.\n\nAs we got the current processor id, `boot_cpu_init` sets the given CPU online, active, present and possible with the:\n\n```C\nset_cpu_online(cpu, true);\nset_cpu_active(cpu, true);\nset_cpu_present(cpu, true);\nset_cpu_possible(cpu, true);\n```\n\nAll of these functions use the concept - `cpumask`. `cpu_possible` is a set of CPU ID's which can be plugged in at any time during the life of that system boot. `cpu_present` represents which CPUs are currently plugged in. `cpu_online` represents subset of the `cpu_present` and indicates CPUs which are available for scheduling. These masks depend on the `CONFIG_HOTPLUG_CPU` configuration option and if this option is disabled `possible == present` and `active == online`. The implementations of all of these functions are very similar. Every function checks the second parameter. If it is `true`, it calls `cpumask_set_cpu` or `cpumask_clear_cpu` otherwise.\n\nFor example let's look at `set_cpu_possible`. As we passed `true` as the second parameter, the:\n\n```C\ncpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));\n```\n\nwill be called. First of all let's try to understand the `to_cpumask` macro. This macro casts a bitmap to a `struct cpumask *`. CPU masks provide a bitmap suitable for representing the set of CPU's in a system, one bit position per CPU number. 
CPU mask presented by the `cpumask` structure:\n\n```C\ntypedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;\n```\n\nwhich is just bitmap declared with the `DECLARE_BITMAP` macro:\n\n```C\n#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]\n```\n\nAs we can see from its definition, the `DECLARE_BITMAP` macro expands to the array of `unsigned long`. Now let's look at how the `to_cpumask` macro is implemented:\n\n```C\n#define to_cpumask(bitmap)                                              \\\n        ((struct cpumask *)(1 ? (bitmap)                                \\\n                            : (void *)sizeof(__check_is_bitmap(bitmap))))\n```\n\nI don't know about you, but it looked really weird for me at the first time. We can see a ternary operator here which is `true` every time, but why the `__check_is_bitmap` here? It's simple, let's look at it:\n\n```C\nstatic inline int __check_is_bitmap(const unsigned long *bitmap)\n{\n        return 1;\n}\n```\n\nYeah, it just returns `1` every time. Actually we need in it here only for one purpose: at compile time it checks that the given `bitmap` is a bitmap, or in other words it checks that the given `bitmap` has a type of `unsigned long *`. So we just pass `cpu_possible_bits` to the `to_cpumask` macro for converting the array of `unsigned long` to the `struct cpumask *`. Now we can call `cpumask_set_cpu` function with the `cpu` - 0 and `struct cpumask *cpu_possible_bits`. This function makes only one call of the `set_bit` function which sets the given `cpu` in the cpumask. All of these `set_cpu_*` functions work on the same principle.\n\nIf you're not sure that this `set_cpu_*` operations and `cpumask` are not clear for you, don't worry about it. 
You can get more info by reading the special part about it - [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) or [documentation](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt).\n\nAs we activated the bootstrap processor, it's time to go to the next function in the `start_kernel.` Now it is `page_address_init`, but this function does nothing in our case, because it executes only when all `RAM` can't be mapped directly.\n\nPrint Linux banner\n---------------------------------------------------------------------------------\n\nThe next call is `pr_notice`:\n\n```C\n#define pr_notice(fmt, ...) \\\n    printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)\n```\n\nas you can see it just expands to the `printk` call. At this moment we use `pr_notice` to print the Linux banner:\n\n```C\npr_notice(\"%s\", linux_banner);\n```\n\nwhich is just the kernel version with some additional parameters:\n\n```\nLinux version 4.0.0-rc6+ (alex@localhost) (gcc version 4.9.1 (Ubuntu 4.9.1-16ubuntu6) ) #319 SMP\n```\n\nArchitecture-dependent parts of initialization\n---------------------------------------------------------------------------------\n\nThe next step is architecture-specific initialization. The Linux kernel does it with the call of the `setup_arch` function. This is a very big function like `start_kernel` and we do not have time to consider all of its implementation in this part. Here we'll only start to do it and continue in the next part. As it is `architecture-specific`, we need to go again to the `arch/` directory. 
The `setup_arch` function defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file and takes only one argument - address of the kernel command line.\n\nThis function starts from the reserving memory block for the kernel `_text` and `_data` which starts from the `_text` symbol (you can remember it from the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S#L46)) and ends before `__bss_stop`. We are using `memblock` for the reserving of memory block:\n\n```C\nmemblock_reserve(__pa_symbol(_text), (unsigned long)__bss_stop - (unsigned long)_text);\n```\n\nYou can read about `memblock` in the [Linux kernel memory management Part 1.](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-1). As you can remember `memblock_reserve` function takes two parameters:\n\n* base physical address of a memory block;\n* size of a memory block.\n\nWe can get the base physical address of the `_text` symbol with the `__pa_symbol` macro:\n\n```C\n#define __pa_symbol(x) \\\n\t__phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))\n```\n\nFirst of all it calls `__phys_reloc_hide` macro on the given parameter. The `__phys_reloc_hide` macro does nothing for `x86_64` and just returns the given parameter. Implementation of the `__phys_addr_symbol` macro is easy. 
It just subtracts the symbol address from the base address of the kernel text mapping base virtual address (you can remember that it is `__START_KERNEL_map`) and adds `phys_base` which is the base address of `_text`:\n\n```C\n#define __phys_addr_symbol(x) \\\n ((unsigned long)(x) - __START_KERNEL_map + phys_base)\n```\n\nAfter we got the physical address of the `_text` symbol, `memblock_reserve` can reserve a memory block from the `_text` to the `__bss_stop - _text`.\n\nReserve memory for initrd\n---------------------------------------------------------------------------------\n\nIn the next step after we reserved place for the kernel text and data is reserving place for the [initrd](http://en.wikipedia.org/wiki/Initrd). We will not see details about `initrd` in this post, you just may know that it is temporary root file system stored in memory and used by the kernel during its startup. The `early_reserve_initrd` function does all work. First of all this function gets the base address of the ram disk, its size and the end address with:\n\n```C\nu64 ramdisk_image = get_ramdisk_image();\nu64 ramdisk_size  = get_ramdisk_size();\nu64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);\n```\n\nAll of these parameters are taken from `boot_params`. If you have read the chapter about [Linux Kernel Booting Process](https://0xax.gitbook.io/linux-insides/summary/booting), you must remember that we filled the `boot_params` structure during boot time. The kernel setup header contains a couple of fields which describes ramdisk, for example:\n\n```\nField name:\tramdisk_image\nType:\t\twrite (obligatory)\nOffset/size:\t0x218/4\nProtocol:\t2.00+\n\n  The 32-bit linear address of the initial ramdisk or ramfs.  Leave at\n  zero if there is no initial ramdisk/ramfs.\n```\n\nSo we can get all the information that interests us from `boot_params`. 
For example let's look at `get_ramdisk_image`:\n\n```C\nstatic u64 __init get_ramdisk_image(void)\n{\n        u64 ramdisk_image = boot_params.hdr.ramdisk_image;\n\n        ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;\n\n        return ramdisk_image;\n}\n```\n\nHere we get the low 32 bits of the ramdisk address from `boot_params.hdr.ramdisk_image` and combine them with `boot_params.ext_ramdisk_image` shifted left by `32`. We need to do it because, as you can read in the [Documentation/x86/zero-page.txt](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/Documentation/x86/zero-page.txt):\n\n```\n0C0/004\tALL\text_ramdisk_image ramdisk_image high 32bits\n```\n\nSo after shifting `ext_ramdisk_image` left by 32 and OR-ing it in, we get a 64-bit address in `ramdisk_image` and we return it. `get_ramdisk_size` works on the same principle as `get_ramdisk_image`, but it uses `ext_ramdisk_size` instead of `ext_ramdisk_image`. After we got the ramdisk's size, base address and end address, we check that the bootloader provided a ramdisk with:\n\n```C\nif (!boot_params.hdr.type_of_loader ||\n    !ramdisk_image || !ramdisk_size)\n\treturn;\n```\n\nand reserve memory block with the calculated addresses for the initial ramdisk in the end:\n\n```C\nmemblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);\n```\n\nConclusion\n---------------------------------------------------------------------------------\n\nIt is the end of the fourth part about the Linux kernel initialization process. We started to dive in the kernel generic code from the `start_kernel` function in this part and stopped on the architecture-specific initialization in the `setup_arch`. In the next part we will continue with architecture-dependent initialization steps.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. 
If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [GCC function attributes](https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html)\n* [this_cpu operations](https://www.kernel.org/doc/Documentation/this_cpu_ops.txt)\n* [cpumask](http://www.crashcourse.ca/wiki/index.php/Cpumask)\n* [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [cgroups](http://en.wikipedia.org/wiki/Cgroups)\n* [stack buffer overflow](http://en.wikipedia.org/wiki/Stack_buffer_overflow)\n* [IRQs](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [initrd](http://en.wikipedia.org/wiki/Initrd)\n* [Previous part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md)\n"
  },
  {
    "path": "Initialization/linux-initialization-5.md",
    "content": "Kernel initialization. Part 5.\n================================================================================\n\nContinue of architecture-specific initialization\n================================================================================\n\nIn the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4), we stopped at the initialization of an architecture-specific stuff from the [setup_arch](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c#L856) function and now we will continue with it. As we reserved memory for the [initrd](http://en.wikipedia.org/wiki/Initrd), next step is the `olpc_ofw_detect` which detects [One Laptop Per Child support](http://wiki.laptop.org/go/OFW_FAQ). We will not consider platform related stuff in this book and will skip functions related with it. So let's go ahead. The next step is the `early_trap_init` function. This function initializes debug (`#DB` - raised when the `TF` flag of rflags is set) and `int3` (`#BP`) interrupts gate. If you don't know anything about interrupts, you can read about it in the [Early interrupt and exception handling](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2). In `x86` architecture `INT`, `INTO` and `INT3` are special instructions which allow a task to explicitly call an interrupt handler. The `INT3` instruction calls the breakpoint (`#BP`) handler. 
You may remember, we already saw it in the [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) about interrupts: and exceptions:\n\n```\n----------------------------------------------------------------------------------------------\n|Vector|Mnemonic|Description         |Type |Error Code|Source                   |\n----------------------------------------------------------------------------------------------\n|3     | #BP    |Breakpoint          |Trap |NO        |INT 3                    |\n----------------------------------------------------------------------------------------------\n```\n\nDebug interrupt `#DB` is the primary method of invoking debuggers. `early_trap_init` defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c). This functions sets `#DB` and `#BP` handlers and reloads [IDT](http://en.wikipedia.org/wiki/Interrupt_descriptor_table):\n\n```C\nvoid __init early_trap_init(void)\n{\n        set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);\n        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);\n        load_idt(&idt_descr);\n}\n```\n\nWe already saw implementation of the `set_intr_gate` in the previous part about interrupts. Here are two similar functions `set_intr_gate_ist` and `set_system_intr_gate_ist`. Both of these two functions take three parameters:\n\n* number of the interrupt;\n* base address of the interrupt/exception handler;\n* third parameter is - `Interrupt Stack Table`. `IST` is a new mechanism in the `x86_64` and part of the [TSS](http://en.wikipedia.org/wiki/Task_state_segment). Every active thread in kernel mode has own kernel stack which is `16` kilobytes. While a thread in user space, this kernel stack is empty.\n\nIn addition to per-thread stacks, there are a couple of specialized stacks associated with each CPU. 
You can read all about these stacks in the Linux kernel documentation - [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks). `x86_64` provides a feature which allows switching to a new `special` stack during events such as a non-maskable interrupt, etc. The name of this feature is `Interrupt Stack Table`. There can be up to 7 `IST` entries per CPU and every entry points to the dedicated stack. In our case this is `DEBUG_STACK`.\n\n`set_intr_gate_ist` and `set_system_intr_gate_ist` work by the same principle as `set_intr_gate` with only one difference. Both of these functions check the\ninterrupt number and call `_set_gate` inside:\n\n```C\nBUG_ON((unsigned)n > 0xFF);\n_set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);\n```\n\nas `set_intr_gate` does. But `set_intr_gate` calls `_set_gate` with [dpl](http://en.wikipedia.org/wiki/Privilege_level) - 0, and ist - 0, whereas `set_intr_gate_ist` and `set_system_intr_gate_ist` set `ist` as `DEBUG_STACK` and `set_system_intr_gate_ist` sets `dpl` as `0x3` which is the lowest privilege. When an interrupt occurs and the hardware loads such a descriptor, then hardware automatically sets the new stack pointer based on the IST value, then invokes the interrupt handler. All of the special kernel stacks will be set in the `cpu_init` function (we will see it later).\n\nAs the `#DB` and `#BP` gates are written to the `idt_descr`, we reload the `IDT` table with `load_idt` which just executes the `lidt` instruction. Now let's look at the interrupt handlers and try to understand how they work. Of course, I can't cover all interrupt handlers in this book and I do not see the point in this. 
It is very interesting to delve in the Linux kernel source code, so we will see how `debug` handler implemented in this part, and understand how other interrupt handlers are implemented will be your task.\n\n#DB handler\n--------------------------------------------------------------------------------\n\nAs you can read above, we passed address of the `#DB` handler as `&debug` in the `set_intr_gate_ist`. [lxr.free-electrons.com](http://lxr.free-electrons.com/ident) is a great resource for searching identifiers in the Linux kernel source code, but unfortunately you will not find `debug` handler with it. All of you can find, it is `debug` definition in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/traps.h):\n\n```C\nasmlinkage void debug(void);\n```\n\nWe can see `asmlinkage` attribute which tells to us that `debug` is function written with [assembly](http://en.wikipedia.org/wiki/Assembly_language). Yeah, again and again assembly :). Implementation of the `#DB` handler as other handlers is in this [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) and defined with the `idtentry` assembly macro:\n\n```assembly\nidtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK\n```\n\n`idtentry` is a macro which defines an interrupt/exception entry point. As you can see it takes five arguments:\n\n* name of the interrupt entry point;\n* name of the interrupt handler;\n* has interrupt error code or not;\n* paranoid  - if this parameter = 1, switch to special stack (read above);\n* shift_ist - stack to switch during interrupt.\n\nNow let's look on `idtentry` macro implementation. This macro defined in the same assembly file and defines `debug` function with the `ENTRY` macro. For the start `idtentry` macro checks that given parameters are correct in case if need to switch to the special stack. 
In the next step it checks that give interrupt returns error code. If interrupt does not return error code (in our case `#DB` does not return error code), it calls `INTR_FRAME` or `XCPT_FRAME` if interrupt has error code. Both of these macros `XCPT_FRAME` and `INTR_FRAME` do nothing and need only for the building initial frame state for interrupts. They uses `CFI` directives and used for debugging. More info you can find in the [CFI directives](https://sourceware.org/binutils/docs/as/CFI-directives.html). As comment from the [arch/x86/kernel/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) says: `CFI macros are used to generate dwarf2 unwind information for better backtraces. They don't change any code.` so we will ignore them.\n\n```assembly\n.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1\nENTRY(\\sym)\n\t/* Sanity check */\n\t.if \\shift_ist != -1 && \\paranoid == 0\n\t.error \"using shift_ist requires paranoid=1\"\n\t.endif\n\n\t.if \\has_error_code\n\tXCPT_FRAME\n\t.else\n\tINTR_FRAME\n\t.endif\n\t...\n\t...\n\t...\n```\n\nYou can remember from the previous part about early interrupts/exceptions handling that after interrupt occurs, current stack will have following format:\n\n```\n    +-----------------------+\n    |                       |\n+40 |         SS            |\n+32 |         RSP           |\n+24 |        RFLAGS         |\n+16 |         CS            |\n+8  |         RIP           |\n 0  |       Error Code      | <---- rsp\n    |                       |\n    +-----------------------+\n```\n\nThe next two macro from the `idtentry` implementation are:\n\n```assembly\n\tASM_CLAC\n\tPARAVIRT_ADJUST_EXCEPTION_FRAME\n```\n\nFirst `ASM_CLAC` macro depends on `CONFIG_X86_SMAP` configuration option and need for security reason, more about it you can read [here](https://lwn.net/Articles/517475/). 
The second `PARAVIRT_ADJUST_EXCEPTION_FRAME` macro is for handling Xen-type exceptions (this chapter is about kernel initialization and we will not consider virtualization stuff here).\n\nThe next piece of code checks if interrupt has error code or not and pushes `$-1` which is `0xffffffffffffffff` on `x86_64` on the stack if not:\n\n```assembly\n\t.ifeq \\has_error_code\n\tpushq_cfi $-1\n\t.endif\n```\n\nWe need to do it as `dummy` error code for stack consistency for all interrupts. In the next step we subtract from the stack pointer `$ORIG_RAX-R15`:\n\n```assembly\n\tsubq $ORIG_RAX-R15, %rsp\n```\n\nwhere `ORIG_RAX`, `R15` and other macros are defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/master/arch/x86/entry/calling.h) and `ORIG_RAX-R15` is 120 bytes. General purpose registers will occupy these 120 bytes because we need to store all registers on the stack during interrupt handling. After we set stack for general purpose registers, the next step is checking that interrupt came from userspace with:\n\n```assembly\ntestl $3, CS(%rsp)\njnz 1f\n```\n\nHere we check the first and second bits in the `CS`. You can remember that `CS` register contains segment selector where first two bits are `RPL`. All privilege levels are integers in the range 0–3, where the lowest number corresponds to the highest privilege. So if interrupt came from the kernel mode we call `save_paranoid` or jump on label `1` if not. 
In the `save_paranoid` we store all general purpose registers on the stack and switch user `gs` on kernel `gs` if need:\n\n```assembly\n\tmovl $1,%ebx\n\tmovl $MSR_GS_BASE,%ecx\n\trdmsr\n\ttestl %edx,%edx\n\tjs 1f\n\tSWAPGS\n\txorl %ebx,%ebx\n1:\tret\n```\n\nIn the next steps we put `pt_regs` pointer to the `rdi`, save error code in the `rsi` if it has and call interrupt handler which is - `do_debug` in our case from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c). `do_debug` like other handlers takes two parameters:\n\n* pt_regs - is a structure which presents set of CPU registers which are saved in the process' memory region;\n* error code - error code of interrupt.\n\nAfter interrupt handler finished its work, calls `paranoid_exit` which restores stack, switch on userspace if interrupt came from there and calls `iret`. That's all. Of course it is not all :), but we will see more deeply in the separate chapter about interrupts.\n\nThis is general view of the `idtentry` macro for `#DB` interrupt. All interrupts are similar to this implementation and defined with idtentry too. After `early_trap_init` finished its work, the next function is `early_cpu_init`. This function defined in the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c) and collects information about CPU and its vendor.\n\nEarly ioremap initialization\n--------------------------------------------------------------------------------\n\nThe next step is initialization of early `ioremap`. In general there are two ways to communicate with devices:\n\n* I/O Ports;\n* Device memory.\n\nWe already saw first method (`outb/inb` instructions) in the part about Linux kernel booting [process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3). The second method is to map I/O physical addresses to virtual addresses. 
When a physical address is accessed by the CPU, it may refer to a portion of physical RAM which can be mapped on memory of the I/O device. So `ioremap` used to map device memory into kernel address space.\n\nAs I wrote above next function is the `early_ioremap_init` which re-maps I/O memory to kernel address space so it can access it. We need to initialize early ioremap for early initialization code which needs to temporarily map I/O or memory regions before the normal mapping functions like `ioremap` are available. Implementation of this function is in the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). At the start of the `early_ioremap_init` we can see definition of the `pmd` pointer with `pmd_t` type (which presents page middle directory entry `typedef struct { pmdval_t pmd; } pmd_t;` where `pmdval_t` is `unsigned long`) and make a check that `fixmap` aligned in a correct way:\n\n```C\npmd_t *pmd;\nBUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));\n```\n\n`fixmap` - is fixed virtual address mappings which extends from `FIXADDR_START` to `FIXADDR_TOP`. Fixed virtual addresses are needed for subsystems that need to know the virtual address at compile time. After the check `early_ioremap_init` makes a call of the `early_ioremap_setup` function from the [mm/early_ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/early_ioremap.c). 
`early_ioremap_setup` fills `slot_virt` array of the `unsigned long` with virtual addresses with 512 temporary boot-time fix-mappings:\n\n```C\nfor (i = 0; i < FIX_BTMAPS_SLOTS; i++)\n    slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);\n```\n\nAfter this we get page middle directory entry for the `FIX_BTMAP_BEGIN` and put to the `pmd` variable, fills `bm_pte` with zeros which is boot time page tables and call `pmd_populate_kernel` function for setting given page table entry in the given page middle directory:\n\n```C\npmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));\nmemset(bm_pte, 0, sizeof(bm_pte));\npmd_populate_kernel(&init_mm, pmd, bm_pte);\n```\n\nThat's all for this. If you feeling puzzled, don't worry. There is special part about `ioremap` and `fixmaps` in the [Linux Kernel Memory Management. Part 2](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md) chapter.\n\nObtaining major and minor numbers for the root device\n--------------------------------------------------------------------------------\n\nAfter early `ioremap` was initialized, you can see the following code:\n\n```C\nROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);\n```\n\nThis code obtains major and minor numbers for the root device where `initrd` will be mounted later in the `do_mount_root` function. Major number of the device identifies a driver associated with the device. Minor number referred on the device controlled by driver. Note that `old_decode_dev` takes one parameter from the `boot_params_structure`. As we can read from the x86 Linux kernel boot protocol:\n\n```\nField name:\troot_dev\nType:\t\tmodify (optional)\nOffset/size:\t0x1fc/2\nProtocol:\tALL\n\n  The default root device device number.  The use of this field is\n  deprecated, use the \"root=\" option on the command line instead.\n```\n\nNow let's try to understand what `old_decode_dev` does. 
Actually it just calls `MKDEV` inside which generates `dev_t` from the given major and minor numbers. Its implementation is pretty simple:\n\n```C\nstatic inline dev_t old_decode_dev(u16 val)\n{\n         return MKDEV((val >> 8) & 255, val & 255);\n}\n```\n\nwhere `dev_t` is a kernel data type to present major/minor number pair.  But what's the strange `old_` prefix? For historical reasons, there are two ways of managing the major and minor numbers of a device. In the first way major and minor numbers occupied 2 bytes. You can see it in the previous code: 8 bits for major number and 8 bits for minor number. But there is a problem: only 256 major numbers and 256 minor numbers are possible. So 16-bit integer was replaced by 32-bit integer where 12 bits are reserved for major number and 20 bits for minor. You can see this in the `new_decode_dev` implementation:\n\n```C\nstatic inline dev_t new_decode_dev(u32 dev)\n{\n         unsigned major = (dev & 0xfff00) >> 8;\n         unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00);\n         return MKDEV(major, minor);\n}\n```\n\nAfter calculation we will get `0xfff` or 12 bits for `major` if `dev` is `0xffffffff` and `0xfffff` or 20 bits for `minor`. So in the end of execution of the `old_decode_dev` we will get major and minor numbers for the root device in `ROOT_DEV`.\n\nMemory map setup\n--------------------------------------------------------------------------------\n\nThe next point is the setup of the memory map with the call of the `setup_memory_map` function. But before this we set up different parameters as information about a screen (current row and column, video page and etc... 
(you can read about it in the [Video mode initialization and transition to protected mode](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3))), Extended display identification data, video mode, bootloader_type and etc...:\n\n```C\n\tscreen_info = boot_params.screen_info;\n\tedid_info = boot_params.edid_info;\n\tsaved_video_mode = boot_params.hdr.vid_mode;\n\tbootloader_type = boot_params.hdr.type_of_loader;\n\tif ((bootloader_type >> 4) == 0xe) {\n\t\tbootloader_type &= 0xf;\n\t\tbootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;\n\t}\n\tbootloader_version  = bootloader_type & 0xf;\n\tbootloader_version |= boot_params.hdr.ext_loader_ver << 4;\n```\n\nAll of these parameters we got during boot time and stored in the `boot_params` structure. After this we need to setup the end of the I/O memory. As you know one of the main purposes of the kernel is resource management. And one of the resource is memory. As we already know there are two ways to communicate with devices are I/O ports and device memory. All information about registered resources are available through:\n\n* /proc/ioports - provides a list of currently registered port regions used for input or output communication with a device;\n* /proc/iomem   - provides current map of the system's memory for each physical device.\n\nAt the moment we are interested in `/proc/iomem`:\n\n```\ncat /proc/iomem\n00000000-00000fff : reserved\n00001000-0009d7ff : System RAM\n0009d800-0009ffff : reserved\n000a0000-000bffff : PCI Bus 0000:00\n000c0000-000cffff : Video ROM\n000d0000-000d3fff : PCI Bus 0000:00\n000d4000-000d7fff : PCI Bus 0000:00\n000d8000-000dbfff : PCI Bus 0000:00\n000dc000-000dffff : PCI Bus 0000:00\n000e0000-000fffff : reserved\n  000e0000-000e3fff : PCI Bus 0000:00\n  000e4000-000e7fff : PCI Bus 0000:00\n  000f0000-000fffff : System ROM\n```\n\nAs you can see range of addresses are shown in hexadecimal notation with its owner. 
Linux kernel provides API for managing any resources in a general way. Global resources (for example PICs or I/O ports) can be divided into subsets - relating to any hardware bus slot. The main structure `resource`:\n\n```C\nstruct resource {\n        resource_size_t start;\n        resource_size_t end;\n        const char *name;\n        unsigned long flags;\n        struct resource *parent, *sibling, *child;\n};\n```\n\npresents abstraction for a tree-like subset of system resources. This structure provides range of addresses from `start` to `end` (`resource_size_t` is `phys_addr_t` or `u64` for `x86_64`) which a resource covers, `name` of a resource (you see these names in the `/proc/iomem` output) and `flags` of a resource (All resources flags defined in the [include/linux/ioport.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/ioport.h)). The last are three pointers to the `resource` structure. These pointers enable a tree-like structure:\n\n```\n+-------------+      +-------------+\n|             |      |             |\n|    parent   |------|    sibling  |\n|             |      |             |\n+-------------+      +-------------+\n       |\n       |\n+-------------+\n|             |\n|    child    |\n|             |\n+-------------+\n```\n\nEvery subset of resources has root range resources. For `iomem` it is `iomem_resource` which defined as:\n\n```C\nstruct resource iomem_resource = {\n        .name   = \"PCI mem\",\n        .start  = 0,\n        .end    = -1,\n        .flags  = IORESOURCE_MEM,\n};\nEXPORT_SYMBOL(iomem_resource);\n```\n\nTODO EXPORT_SYMBOL\n\n`iomem_resource` defines root addresses range for io memory with `PCI mem` name and `IORESOURCE_MEM` (`0x00000200`) as flags. As i wrote above our current point is setup the end address of the `iomem`. We will do it with:\n\n```C\niomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;\n```\n\nHere we shift `1` on `boot_cpu_data.x86_phys_bits`. 
`boot_cpu_data` is `cpuinfo_x86` structure which we filled during execution of the `early_cpu_init`. As you can understand from the name of the `x86_phys_bits` field, it represents the number of bits in the maximum physical address in the system. Note also that `iomem_resource` is passed to the `EXPORT_SYMBOL` macro. This macro exports the given symbol (`iomem_resource` in our case) for dynamic linking or in other words it makes a symbol accessible to dynamically loaded modules.\n\nAfter we set the end address of the root `iomem` resource address range, as I wrote above the next step will be setup of the memory map. It will be produced with the call of the `setup_memory_map` function:\n\n```C\nvoid __init setup_memory_map(void)\n{\n        char *who;\n\n        who = x86_init.resources.memory_setup();\n        memcpy(&e820_saved, &e820, sizeof(struct e820map));\n        printk(KERN_INFO \"e820: BIOS-provided physical RAM map:\\n\");\n        e820_print_map(who);\n}\n```\n\nFirst of all, look at the call of `x86_init.resources.memory_setup`. `x86_init` is a `x86_init_ops` structure which presents platform specific setup functions such as resources initialization, pci initialization, etc. The initialization of the `x86_init` is in the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c). 
I will not give here the full description because it is very long, but only one part which interests us for now:\n\n```C\nstruct x86_init_ops x86_init __initdata = {\n\t.resources = {\n            .probe_roms             = probe_roms,\n            .reserve_resources      = reserve_standard_io_resources,\n            .memory_setup           = default_machine_specific_memory_setup,\n    },\n    ...\n    ...\n    ...\n}\n```\n\nAs we can see here `memory_setup` field is `default_machine_specific_memory_setup` where we get the number of the [e820](http://en.wikipedia.org/wiki/E820) entries which we collected in the [boot time](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2), sanitize the BIOS e820 map and fill `e820map` structure with the memory regions. As all regions are collected, print of all regions with printk. You can find this print if you execute `dmesg` command and you can see something like this:\n\n```\n[    0.000000] e820: BIOS-provided physical RAM map:\n[    0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009d7ff] usable\n[    0.000000] BIOS-e820: [mem 0x000000000009d800-0x000000000009ffff] reserved\n[    0.000000] BIOS-e820: [mem 0x00000000000e0000-0x00000000000fffff] reserved\n[    0.000000] BIOS-e820: [mem 0x0000000000100000-0x00000000be825fff] usable\n[    0.000000] BIOS-e820: [mem 0x00000000be826000-0x00000000be82cfff] ACPI NVS\n[    0.000000] BIOS-e820: [mem 0x00000000be82d000-0x00000000bf744fff] usable\n[    0.000000] BIOS-e820: [mem 0x00000000bf745000-0x00000000bfff4fff] reserved\n[    0.000000] BIOS-e820: [mem 0x00000000bfff5000-0x00000000dc041fff] usable\n[    0.000000] BIOS-e820: [mem 0x00000000dc042000-0x00000000dc0d2fff] reserved\n[    0.000000] BIOS-e820: [mem 0x00000000dc0d3000-0x00000000dc138fff] usable\n[    0.000000] BIOS-e820: [mem 0x00000000dc139000-0x00000000dc27dfff] ACPI NVS\n[    0.000000] BIOS-e820: [mem 0x00000000dc27e000-0x00000000deffefff] reserved\n[    0.000000] BIOS-e820: [mem 
0x00000000defff000-0x00000000deffffff] usable\n...\n...\n...\n```\n\nCopying of the BIOS Enhanced Disk Device information\n--------------------------------------------------------------------------------\n\nThe next two steps is parsing of the `setup_data` with `parse_setup_data` function and copying BIOS EDD to the safe place. `setup_data` is a field from the kernel boot header and as we can read from the `x86` boot protocol:\n\n```\nField name:\tsetup_data\nType:\t\twrite (special)\nOffset/size:\t0x250/8\nProtocol:\t2.09+\n\n  The 64-bit physical pointer to NULL terminated single linked list of\n  struct setup_data. This is used to define a more extensible boot\n  parameters passing mechanism.\n```\n\nIt used for storing setup information for different types as device tree blob, EFI setup data and etc... In the second step we copy BIOS EDD information from the `boot_params` structure that we collected in the [arch/x86/boot/edd.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/edd.c) to the `edd` structure:\n\n```C\nstatic inline void __init copy_edd(void)\n{\n     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,\n            sizeof(edd.mbr_signature));\n     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));\n     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;\n     edd.edd_info_nr = boot_params.eddbuf_entries;\n}\n```\n\nMemory descriptor initialization\n--------------------------------------------------------------------------------\n\nThe next step is initialization of the memory descriptor of the init process. As you already can know every process has its own address space. This address space presented with special data structure which called `memory descriptor`. Directly in the Linux kernel source code memory descriptor presented with `mm_struct` structure. 
`mm_struct` contains many different fields related with the process address space as start/end address of the kernel code/data, start/end of the brk, number of memory areas, list of memory areas and etc... This structure defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mm_types.h). As every process has its own memory descriptor, `task_struct` structure contains it in the `mm` and `active_mm` field. And our first `init` process has it too. You can remember that we saw the part of initialization of the init `task_struct` with `INIT_TASK` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4):\n\n```C\n#define INIT_TASK(tsk)  \\\n{\n    ...\n\t...\n\t...\n\t.mm = NULL,         \\\n    .active_mm  = &init_mm, \\\n\t...\n}\n```\n\n`mm` points to the process address space and `active_mm` points to the active address space if process has no address space such as kernel threads (more about it you can read in the [documentation](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)). Now we fill memory descriptor of the initial process:\n\n```C\n\tinit_mm.start_code = (unsigned long) _text;\n\tinit_mm.end_code = (unsigned long) _etext;\n\tinit_mm.end_data = (unsigned long) _edata;\n\tinit_mm.brk = _brk_end;\n```\n\nwith the kernel's text, data and brk. 
`init_mm` is the memory descriptor of the initial process and defined as:\n\n```C\nstruct mm_struct init_mm = {\n    .mm_rb          = RB_ROOT,\n    .pgd            = swapper_pg_dir,\n    .mm_users       = ATOMIC_INIT(2),\n    .mm_count       = ATOMIC_INIT(1),\n    .mmap_sem       = __RWSEM_INITIALIZER(init_mm.mmap_sem),\n    .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),\n    .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),\n    INIT_MM_CONTEXT(init_mm)\n};\n```\n\nwhere `mm_rb` is a red-black tree of the virtual memory areas, `pgd` is a pointer to the page global directory, `mm_users` is address space users, `mm_count` is primary usage counter and `mmap_sem` is memory area semaphore. After we setup memory descriptor of the initial process, next step is initialization of the Intel Memory Protection Extensions with `mpx_mm_init`. The next step is initialization of the code/data/bss resources with:\n\n```C\n\tcode_resource.start = __pa_symbol(_text);\n\tcode_resource.end = __pa_symbol(_etext)-1;\n\tdata_resource.start = __pa_symbol(_etext);\n\tdata_resource.end = __pa_symbol(_edata)-1;\n\tbss_resource.start = __pa_symbol(__bss_start);\n\tbss_resource.end = __pa_symbol(__bss_stop)-1;\n```\n\nWe already know a little about `resource` structure (read above). Here we fill code/data/bss resources with their physical addresses. You can see it in the `/proc/iomem`:\n\n```C\n00100000-be825fff : System RAM\n  01000000-015bb392 : Kernel code\n  015bb393-01930c3f : Kernel data\n  01a11000-01ac3fff : Kernel bss\n```\n\nAll of these structures are defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and look like typical resource initialization:\n\n```C\nstatic struct resource code_resource = {\n\t.name\t= \"Kernel code\",\n\t.start\t= 0,\n\t.end\t= 0,\n\t.flags\t= IORESOURCE_BUSY | IORESOURCE_MEM\n};\n```\n\nThe last step which we will cover in this part will be `NX` configuration. 
`NX-bit` or no execute bit is 63-bit in the page directory entry which controls the ability to execute code from all physical pages mapped by the table entry. This bit can only be used/set when the `no-execute` page-protection mechanism is enabled by the setting `EFER.NXE` to 1. In the `x86_configure_nx` function we check that CPU has support of `NX-bit` and it does not disabled. After the check we fill `__supported_pte_mask` depend on it:\n\n```C\nvoid x86_configure_nx(void)\n{\n        if (cpu_has_nx && !disable_nx)\n                __supported_pte_mask |= _PAGE_NX;\n        else\n                __supported_pte_mask &= ~_PAGE_NX;\n}\n```\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the fifth part about Linux kernel initialization process. In this part we continued to dive in the `setup_arch` function which makes initialization of architecture-specific stuff. It was long part, but we are not finished with it. As I already wrote, the `setup_arch` is big function, and I am really not sure that we will cover all of it even in the next part. There were some new interesting concepts in this part like `Fix-mapped` addresses, ioremap and etc... Don't worry if they are unclear for you. There is a special part about these concepts - [Linux kernel memory management Part 2.](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md). In the next part we will continue with the initialization of the architecture-specific stuff and will see parsing of the early kernel parameters, early dump of the pci devices, `Desktop Management Interface` scanning and many many more.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [mm vs active_mm](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)\n* [e820](http://en.wikipedia.org/wiki/E820)\n* [Supervisor mode access prevention](https://lwn.net/Articles/517475/)\n* [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)\n* [TSS](http://en.wikipedia.org/wiki/Task_state_segment)\n* [IDT](http://en.wikipedia.org/wiki/Interrupt_descriptor_table)\n* [Memory mapped I/O](http://en.wikipedia.org/wiki/Memory-mapped_I/O)\n* [CFI directives](https://sourceware.org/binutils/docs/as/CFI-directives.html)\n* [PDF. dwarf4 specification](http://dwarfstd.org/doc/DWARF4.pdf)\n* [Call stack](http://en.wikipedia.org/wiki/Call_stack)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4)\n"
  },
  {
    "path": "Initialization/linux-initialization-6.md",
    "content": "Kernel initialization. Part 6.\n================================================================================\n\nArchitecture-specific initialization, again...\n================================================================================\n\nIn the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) we saw architecture-specific (`x86_64` in our case) initialization stuff from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and finished on `x86_configure_nx` function which sets the `_PAGE_NX` flag depends on support of [NX bit](http://en.wikipedia.org/wiki/NX_bit). As I wrote before `setup_arch` function and `start_kernel` are very big, so in this and in the next part we will continue to learn about architecture-specific initialization process. The next function after `x86_configure_nx` is `parse_early_param`. This function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and as you can understand from its name, this function parses kernel command line and setups different services depends on the given parameters (all kernel command line parameters you can find are in the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)). You may remember how we setup `earlyprintk` in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). On the early stage we looked for kernel parameters and their value with the `cmdline_find_option` function and `__cmdline_find_option`, `__cmdline_find_option_bool` helpers from the [arch/x86/boot/cmdline.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/cmdline.c). There we're in the generic kernel part which does not depend on architecture and here we use another approach. 
If you are reading Linux kernel source code, you may have already noticed calls like this:\n\n```C\nearly_param(\"gbpages\", parse_direct_gbpages_on);\n```\n\n`early_param` macro takes two parameters:\n\n* command line parameter name;\n* function which will be called if given parameter is passed.\n\nand defined as:\n\n```C\n#define early_param(str, fn) \\\n        __setup_param(str, fn, fn, 1)\n```\n\nin the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h). As you can see `early_param` macro just makes call of the `__setup_param` macro:\n\n```C\n#define __setup_param(str, unique_id, fn, early)                \\\n        static const char __setup_str_##unique_id[] __initconst \\\n                __aligned(1) = str; \\\n        static struct obs_kernel_param __setup_##unique_id      \\\n                __used __section(.init.setup)                   \\\n                __attribute__((aligned((sizeof(long)))))        \\\n                = { __setup_str_##unique_id, fn, early }\n```\n\nThis macro defines the `__setup_str_*` variable (where `*` is the given function name) and initializes it with the given command line parameter name. In the next line we can see the definition of the `__setup_*` variable, whose type is `obs_kernel_param`, and its initialization. `obs_kernel_param` structure is defined as:\n\n```C\nstruct obs_kernel_param {\n        const char *str;\n        int (*setup_func)(char *);\n        int early;\n};\n```\n\nand contains three fields:\n\n* name of the kernel parameter;\n* function which sets something up depending on the parameter;\n* field which determines whether the parameter is early (1) or not (0).\n\nNote that the `__setup_param` macro uses the `__section(.init.setup)` attribute. 
It means that all `__setup_str_*` will be placed in the `.init.setup` section, moreover, as we can see in the [include/asm-generic/vmlinux.lds.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/asm-generic/vmlinux.lds.h), they will be placed between `__setup_start` and `__setup_end`:\n\n```\n#define INIT_SETUP(initsetup_align)                \\\n                . = ALIGN(initsetup_align);        \\\n                VMLINUX_SYMBOL(__setup_start) = .; \\\n                *(.init.setup)                     \\\n                VMLINUX_SYMBOL(__setup_end) = .;\n```\n\nNow we know how parameters are defined, let's back to the `parse_early_param` implementation:\n\n```C\nvoid __init parse_early_param(void)\n{\n        static int done __initdata;\n        static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;\n\n        if (done)\n                return;\n\n        /* All fall through to do_early_param. */\n        strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);\n        parse_early_options(tmp_cmdline);\n        done = 1;\n}\n```\n\nThe `parse_early_param` function defines two static variables. First `done` check that `parse_early_param` already called and the second is temporary storage for kernel command line. After this we copy `boot_command_line` to the temporary command line which we just defined and call the `parse_early_options` function from the same source code `main.c` file. `parse_early_options` calls the `parse_args` function from the [kernel/params.c](https://github.com/torvalds/linux) where `parse_args` parses given command line and calls `do_early_param` function. This [function](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c#L413) goes from the ` __setup_start` to `__setup_end`, and calls the function from the `obs_kernel_param` if a parameter is early. 
After this all services which are depend on early command line parameters were setup and the next call after the `parse_early_param` is `x86_report_nx`. As I wrote in the beginning of this part, we already set `NX-bit` with the `x86_configure_nx`. The next `x86_report_nx` function from the [arch/x86/mm/setup_nx.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/setup_nx.c) just prints information about the `NX`. Note that we call `x86_report_nx` not right after the `x86_configure_nx`, but after the call of the `parse_early_param`. The answer is simple: we call it after the `parse_early_param` because the kernel support `noexec` parameter:\n\n```\nnoexec\t\t[X86]\n\t\t\tOn X86-32 available only on PAE configured kernels.\n\t\t\tnoexec=on: enable non-executable mappings (default)\n\t\t\tnoexec=off: disable non-executable mappings\n```\n\nWe can see it in the booting time:\n\n![NX](images/NX.png)\n\nAfter this we can see call of the:\n\n```C\n\tmemblock_x86_reserve_range_setup_data();\n```\n\nfunction. 
This function is defined in the same [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file and remaps memory for the `setup_data` and reserves memory block for the `setup_data` (more about `setup_data` you can read in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) and about `ioremap` and `memblock` you can read in the [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm)).\n\nIn the next step we can see following conditional statement:\n\n```C\n\tif (acpi_mps_check()) {\n#ifdef CONFIG_X86_LOCAL_APIC\n\t\tdisable_apic = 1;\n#endif\n\t\tsetup_clear_cpu_cap(X86_FEATURE_APIC);\n\t}\n```\n\nThe first `acpi_mps_check` function from the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c) depends on `CONFIG_X86_LOCAL_APIC` and `CONFIG_X86_MPPARSE` configuration options:\n\n```C\nint __init acpi_mps_check(void)\n{\n#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)\n        /* mptable code is not built-in*/\n        if (acpi_disabled || acpi_noirq) {\n                printk(KERN_WARNING \"MPS support code is not built-in.\\n\"\n                       \"Using acpi=off or acpi=noirq or pci=noacpi \"\n                       \"may have problem\\n\");\n                 return 1;\n        }\n#endif\n        return 0;\n}\n```\n\nIt checks the built-in `MPS` or [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification) table. If `CONFIG_X86_LOCAL_APIC` is set and `CONFIG_X86_MPPARSE` is not set, `acpi_mps_check` prints a warning message if one of the command line options `acpi=off`, `acpi=noirq` or `pci=noacpi` was passed to the kernel. 
If `acpi_mps_check` returns `1` it means that we disable local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) and clear the `X86_FEATURE_APIC` bit in the capability mask of the current CPU with the `setup_clear_cpu_cap` macro. (more about CPU mask you can read in the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)).\n\nEarly PCI dump\n--------------------------------------------------------------------------------\n\nIn the next step we make a dump of the [PCI](http://en.wikipedia.org/wiki/Conventional_PCI) devices with the following code:\n\n```C\n#ifdef CONFIG_PCI\n\tif (pci_early_dump_regs)\n\t\tearly_dump_pci_devices();\n#endif\n```\n\n`pci_early_dump_regs` variable is defined in the [arch/x86/pci/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/common.c) and its value depends on the kernel command line parameter: `pci=earlydump`. We can find definition of this parameter in the [drivers/pci/pci.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/pci/pci.c):\n\n```C\nearly_param(\"pci\", pci_setup);\n```\n\n`pci_setup` function gets the string after the `pci=` and analyzes it. This function calls `pcibios_setup` which is defined as `__weak` in the [drivers/pci/pci.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/pci/pci.c) and every architecture defines the same function which overrides the `__weak` analog. 
For example `x86_64` architecture-dependent version is in the [arch/x86/pci/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/common.c):\n\n```C\nchar *__init pcibios_setup(char *str) {\n        ...\n\t\t...\n\t\t...\n\t\t} else if (!strcmp(str, \"earlydump\")) {\n                pci_early_dump_regs = 1;\n                return NULL;\n        }\n\t\t...\n\t\t...\n\t\t...\n}\n```\n\nSo, if `CONFIG_PCI` option is set and we passed `pci=earlydump` option to the kernel command line, next function which will be called - `early_dump_pci_devices` from the [arch/x86/pci/early.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/early.c). This function checks `noearly` PCI parameter with:\n\n```C\nif (!early_pci_allowed())\n        return;\n```\n\nand returns if it was passed. Each PCI domain can host up to `256` buses and each bus hosts up to 32 devices. So, we goes in a loop:\n\n```C\nfor (bus = 0; bus < 256; bus++) {\n                for (slot = 0; slot < 32; slot++) {\n                        for (func = 0; func < 8; func++) {\n\t\t\t\t\t\t...\n\t\t\t\t\t\t...\n\t\t\t\t\t\t...\n                        }\n                }\n}\n```\n\nand read the `pci` config with the `read_pci_config` function.\n\nThat's all. 
We will not go deep in the `pci` details, but will see more details in the special `Drivers/PCI` part.\n\nFinish with memory parsing\n--------------------------------------------------------------------------------\n\nAfter the `early_dump_pci_devices`, there are a couple of functions related to available memory and [e820](http://en.wikipedia.org/wiki/E820) which we collected in the [First steps in the kernel setup](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2) part:\n\n```C\n\t/* update the e820_saved too */\n\te820_reserve_setup_data();\n\tfinish_e820_parsing();\n\t...\n\t...\n\t...\n\te820_add_kernel_range();\n\ttrim_bios_range();\n\tmax_pfn = e820_end_of_ram_pfn();\n\tearly_reserve_e820_mpc_new();\n```\n\nLet's look at it. As you can see the first function is `e820_reserve_setup_data`. This function does almost the same as `memblock_x86_reserve_range_setup_data` which we saw above, but it also calls `e820_update_range` which adds new regions to the `e820map` with the given type which is `E820_RESERVED_KERN` in our case. The next function is `finish_e820_parsing` which sanitizes `e820map` with the `sanitize_e820_map` function. Besides these two functions we can see a couple of functions related to the [e820](http://en.wikipedia.org/wiki/E820). You can see them in the listing above. `e820_add_kernel_range` function takes the physical address of the kernel start and end:\n\n```C\nu64 start = __pa_symbol(_text);\nu64 size = __pa_symbol(_end) - start;\n```\n\nchecks that `.text`, `.data` and `.bss` are marked as `E820_RAM` in the `e820map` and prints the warning message if not. The next function `trim_bios_range` marks the first 4096 bytes in `e820map` as `E820_RESERVED` and sanitizes it again with the call of the `sanitize_e820_map`. After this we get the last page frame number with the call of the `e820_end_of_ram_pfn` function. 
Every memory page has a unique number - `Page frame number`  and `e820_end_of_ram_pfn` function returns the maximum with the call of the `e820_end_pfn`:\n\n```C\nunsigned long __init e820_end_of_ram_pfn(void)\n{\n\treturn e820_end_pfn(MAX_ARCH_PFN);\n}\n```\n\nwhere `e820_end_pfn` takes maximum page frame number on the certain architecture (`MAX_ARCH_PFN` is `0x400000000` for `x86_64`). In the `e820_end_pfn` we go through the all `e820` slots and check that `e820` entry has `E820_RAM` or `E820_PRAM` type because we calculate page frame numbers only for these types, gets the base address and end address of the page frame number for the current `e820` entry and makes some checks for these addresses:\n\n```C\nfor (i = 0; i < e820.nr_map; i++) {\n\t\tstruct e820entry *ei = &e820.map[i];\n\t\tunsigned long start_pfn;\n\t\tunsigned long end_pfn;\n\n\t\tif (ei->type != E820_RAM && ei->type != E820_PRAM)\n\t\t\tcontinue;\n\n\t\tstart_pfn = ei->addr >> PAGE_SHIFT;\n\t\tend_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;\n\n        if (start_pfn >= limit_pfn)\n\t\t\tcontinue;\n\t\tif (end_pfn > limit_pfn) {\n\t\t\tlast_pfn = limit_pfn;\n\t\t\tbreak;\n\t\t}\n\t\tif (end_pfn > last_pfn)\n\t\t\tlast_pfn = end_pfn;\n}\n```\n\n```C\n\tif (last_pfn > max_arch_pfn)\n\t\tlast_pfn = max_arch_pfn;\n\n\tprintk(KERN_INFO \"e820: last_pfn = %#lx max_arch_pfn = %#lx\\n\",\n\t\t\t last_pfn, max_arch_pfn);\n\treturn last_pfn;\n```\n\nAfter this we check that `last_pfn` which we got in the loop is not greater that maximum page frame number for the certain architecture (`x86_64` in our case), print information about last page frame number and return it. We can see the `last_pfn` in the `dmesg` output:\n\n```\n...\n[    0.000000] e820: last_pfn = 0x41f000 max_arch_pfn = 0x400000000\n...\n```\n\nAfter this, as we have calculated the biggest page frame number, we calculate `max_low_pfn` which is the biggest page frame number in the `low memory` or below first `4` gigabytes. 
If installed more than 4 gigabytes of RAM, `max_low_pfn` will be result of the `e820_end_of_low_ram_pfn` function which does the same `e820_end_of_ram_pfn` but with 4 gigabytes limit, in other way `max_low_pfn` will be the same as `max_pfn`:\n\n```C\nif (max_pfn > (1UL<<(32 - PAGE_SHIFT)))\n\tmax_low_pfn = e820_end_of_low_ram_pfn();\nelse\n\tmax_low_pfn = max_pfn;\n\nhigh_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;\n```\n\nNext we calculate `high_memory` (defines the upper bound on direct map memory) with `__va` macro which returns a virtual address by the given physical memory.\n\nDMI scanning\n-------------------------------------------------------------------------------\n\nThe next step after manipulations with different memory regions and `e820` slots is collecting information about computer. We will get all information with the [Desktop Management Interface](http://en.wikipedia.org/wiki/Desktop_Management_Interface) and following functions:\n\n```C\ndmi_scan_machine();\ndmi_memdev_walk();\n```\n\nFirst is `dmi_scan_machine` defined in the [drivers/firmware/dmi_scan.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/firmware/dmi_scan.c). This function goes through the [System Management BIOS](http://en.wikipedia.org/wiki/System_Management_BIOS) structures and extracts information. There are two ways specified to gain access to the `SMBIOS` table: get the pointer to the `SMBIOS` table from the [EFI](http://en.wikipedia.org/wiki/Unified_Extensible_Firmware_Interface)'s configuration table and scanning the physical memory between `0xF0000` and `0xFFFFF` addresses, `0x10000` bytes totally. Let's look on the second approach. 
`dmi_scan_machine` function remaps the memory range from `0xF0000` to `0xFFFFF` (`0x10000` bytes in size) with the `dmi_early_remap` which just expands to the `early_ioremap`:\n\n```C\nvoid __init dmi_scan_machine(void)\n{\n\tchar __iomem *p, *q;\n\tchar buf[32];\n\t...\n\t...\n\t...\n\tp = dmi_early_remap(0xF0000, 0x10000);\n\tif (p == NULL)\n\t\t\tgoto error;\n```\n\nand iterates over this region in 16-byte steps, searching for the `_SM_` string:\n\n```C\nmemset(buf, 0, 16);\nfor (q = p; q < p + 0x10000; q += 16) {\n\t\tmemcpy_fromio(buf + 16, q, 16);\n\t\tif (!dmi_smbios3_present(buf) || !dmi_present(buf)) {\n\t\t\tdmi_available = 1;\n\t\t\tdmi_early_unmap(p, 0x10000);\n\t\t\tgoto out;\n\t\t}\n\t\tmemcpy(buf, buf + 16, 16);\n}\n```\n\n`_SM_` string must be between `0x000F0000` and `0x000FFFFF`. Here we copy 16 bytes to the `buf` with `memcpy_fromio` which is the same as `memcpy` and execute `dmi_smbios3_present` and `dmi_present` on the buffer. These functions check that the first 4 bytes are the `_SM_` string, get the `SMBIOS` version and get `_DMI_` attributes such as the `DMI` structure table length, table address, etc. After one of these functions finishes, you will see the result of it in the `dmesg` output:\n\n```\n[    0.000000] SMBIOS 2.7 present.\n[    0.000000] DMI: Gigabyte Technology Co., Ltd. Z97X-UD5H-BK/Z97X-UD5H-BK, BIOS F6 06/17/2014\n```\n\nIn the end of the `dmi_scan_machine`, we unmap the previously remapped memory:\n\n```C\ndmi_early_unmap(p, 0x10000);\n```\n\nThe second function is `dmi_memdev_walk`. As you can understand it goes over memory devices. 
Let's look on it:\n\n```C\nvoid __init dmi_memdev_walk(void)\n{\n\tif (!dmi_available)\n\t\treturn;\n\n\tif (dmi_walk_early(count_mem_devices) == 0 && dmi_memdev_nr) {\n\t\tdmi_memdev = dmi_alloc(sizeof(*dmi_memdev) * dmi_memdev_nr);\n\t\tif (dmi_memdev)\n\t\t\tdmi_walk_early(save_mem_devices);\n\t}\n}\n```\n\nIt checks that `DMI` available (we got it in the previous function - `dmi_scan_machine`) and collects information about memory devices with `dmi_walk_early` and `dmi_alloc` which defined as:\n\n```\n#ifdef CONFIG_DMI\nRESERVE_BRK(dmi_alloc, 65536);\n#endif\n```\n\n`RESERVE_BRK` defined in the [arch/x86/include/asm/setup.h](http://github.com/torvalds/linux/blob/master/arch/x86/include/asm/setup.h) and reserves space with given size in the `brk` section.\n\n-------------------------\n\tinit_hypervisor_platform();\n\tx86_init.resources.probe_roms();\n\tinsert_resource(&iomem_resource, &code_resource);\n\tinsert_resource(&iomem_resource, &data_resource);\n\tinsert_resource(&iomem_resource, &bss_resource);\n\tearly_gart_iommu_check();\n\n\nSMP config\n--------------------------------------------------------------------------------\n\nThe next step is parsing of the [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing) configuration. We do it with the call of the `find_smp_config` function which just calls function:\n\n```C\nstatic inline void find_smp_config(void)\n{\n        x86_init.mpparse.find_smp_config();\n}\n```\n\ninside. `x86_init.mpparse.find_smp_config` is the `default_find_smp_config` function from the [arch/x86/kernel/mpparse.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/mpparse.c). 
In the `default_find_smp_config` function we are scanning a couple of memory regions for `SMP` config and return if they are found:\n\n```C\nif (smp_scan_config(0x0, 0x400) ||\n            smp_scan_config(639 * 0x400, 0x400) ||\n            smp_scan_config(0xF0000, 0x10000))\n            return;\n```\n\nFirst of all `smp_scan_config` function defines a couple of variables:\n\n```C\nunsigned int *bp = phys_to_virt(base);\nstruct mpf_intel *mpf;\n```\n\nFirst is virtual address of the memory region where we will scan `SMP` config, second is the pointer to the `mpf_intel` structure. Let's try to understand what is it `mpf_intel`. All information stores in the multiprocessor configuration data structure. `mpf_intel` presents this structure and looks:\n\n```C\nstruct mpf_intel {\n        char signature[4];\n        unsigned int physptr;\n        unsigned char length;\n        unsigned char specification;\n        unsigned char checksum;\n        unsigned char feature1;\n        unsigned char feature2;\n        unsigned char feature3;\n        unsigned char feature4;\n        unsigned char feature5;\n};\n```\n\nAs we can read in the documentation - one of the main functions of the system BIOS is to construct the MP floating pointer structure and the MP configuration table. And operating system must have access to this information about the multiprocessor configuration and `mpf_intel` stores the physical address (look at second parameter) of the multiprocessor configuration table. So, `smp_scan_config` going in a loop through the given memory range and tries to find `MP floating pointer structure` there. 
It checks that the current position holds the `SMP` signature, verifies the checksum, and checks that `mpf->specification` is `1` or `4` (it must be `1` or `4` by the specification) in the loop:\n\n```C\nwhile (length > 0) {\nif ((*bp == SMP_MAGIC_IDENT) &&\n    (mpf->length == 1) &&\n    !mpf_checksum((unsigned char *)bp, 16) &&\n    ((mpf->specification == 1)\n    || (mpf->specification == 4))) {\n\n        mem = virt_to_phys(mpf);\n        memblock_reserve(mem, sizeof(*mpf));\n        if (mpf->physptr)\n            smp_reserve_memory(mpf);\n\t}\n}\n```\n\nreserves the given memory block with `memblock_reserve` if the search is successful, and reserves the physical address of the multiprocessor configuration table. You can find documentation about this in the [MultiProcessor Specification](http://www.intel.com/design/pentium/datashts/24201606.pdf). You can read more details in the special part about `SMP`.\n\nAdditional early memory initialization routines\n--------------------------------------------------------------------------------\n\nIn the next step of the `setup_arch` we can see the call of the `early_alloc_pgt_buf` function which allocates the page table buffer for early stage. The page table buffer will be placed in the `brk` area. Let's look at its implementation:\n\n```C\nvoid  __init early_alloc_pgt_buf(void)\n{\n        unsigned long tables = INIT_PGT_BUF_SIZE;\n        phys_addr_t base;\n\n        base = __pa(extend_brk(tables, PAGE_SIZE));\n\n        pgt_buf_start = base >> PAGE_SHIFT;\n        pgt_buf_end = pgt_buf_start;\n        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);\n}\n```\n\nFirst of all it gets the size of the page table buffer, it will be `INIT_PGT_BUF_SIZE` which is `(6 * PAGE_SIZE)` in the current Linux kernel 4.0. As we got the size of the page table buffer, we call `extend_brk` function with two parameters: size and align. As you can understand from its name, this function extends the `brk` area. 
As we can see in the linux kernel linker script `brk` is in memory right after the [BSS](http://en.wikipedia.org/wiki/.bss):\n\n```C\n\t. = ALIGN(PAGE_SIZE);\n\t.brk : AT(ADDR(.brk) - LOAD_OFFSET) {\n\t\t__brk_base = .;\n\t\t. += 64 * 1024;\t\t/* 64k alignment slop space */\n\t\t*(.brk_reservation)\t/* areas brk users have reserved */\n\t\t__brk_limit = .;\n\t}\n```\n\nOr we can find it with `readelf` util:\n\n![brk area](images/brk_area.png)\n\nAfter we get the physical address of the new `brk` with the `__pa` macro, we calculate the base address and the end of the page table buffer. In the next step as we got page table buffer, we reserve memory block for the brk area with the `reserve_brk` function:\n\n```C\nstatic void __init reserve_brk(void)\n{\n\tif (_brk_end > _brk_start)\n\t\tmemblock_reserve(__pa_symbol(_brk_start),\n\t\t\t\t _brk_end - _brk_start);\n\n\t_brk_start = 0;\n}\n```\n\nNote that in the end of the `reserve_brk`, we set `_brk_start` to zero, because after this we will not allocate it anymore. In the next step, after reserving the memory block for the `brk`, we need to unmap out-of-range memory areas in the kernel mapping with the `cleanup_highmap` function. Remember that kernel mapping is `__START_KERNEL_map` and `_end - _text` or `level2_kernel_pgt` maps the kernel `_text`, `data` and `bss`. 
In the start of the `cleanup_highmap` we define these parameters:\n\n```C\nunsigned long vaddr = __START_KERNEL_map;\nunsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;\npmd_t *pmd = level2_kernel_pgt;\npmd_t *last_pmd = pmd + PTRS_PER_PMD;\n```\n\nNow, as we defined start and end of the kernel mapping, we loop through all the kernel page middle directory entries and clear the entries which are not between `_text` and `end`:\n\n```C\nfor (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {\n        if (pmd_none(*pmd))\n            continue;\n        if (vaddr < (unsigned long) _text || vaddr > end)\n            set_pmd(pmd, __pmd(0));\n}\n```\n\nAfter this we set the limit for the `memblock` allocation with the `memblock_set_current_limit` function (you can read more about `memblock` in the [Linux kernel memory management Part 2](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md)), it will be `ISA_END_ADDRESS` or `0x100000` and fill the `memblock` information according to `e820` with the call of the `memblock_x86_fill` function. 
You can see the result of this function in the kernel initialization time:\n\n```\nMEMBLOCK configuration:\n memory size = 0x1fff7ec00 reserved size = 0x1e30000\n memory.cnt  = 0x3\n memory[0x0]\t[0x00000000001000-0x0000000009efff], 0x9e000 bytes flags: 0x0\n memory[0x1]\t[0x00000000100000-0x000000bffdffff], 0xbfee0000 bytes flags: 0x0\n memory[0x2]\t[0x00000100000000-0x0000023fffffff], 0x140000000 bytes flags: 0x0\n reserved.cnt  = 0x3\n reserved[0x0]\t[0x0000000009f000-0x000000000fffff], 0x61000 bytes flags: 0x0\n reserved[0x1]\t[0x00000001000000-0x00000001a57fff], 0xa58000 bytes flags: 0x0\n reserved[0x2]\t[0x0000007ec89000-0x0000007fffffff], 0x1377000 bytes flags: 0x0\n```\n\nThe rest functions after the `memblock_x86_fill` are: `early_reserve_e820_mpc_new` allocates additional slots in the `e820map` for MultiProcessor Specification table, `reserve_real_mode` - reserves low memory from `0x0` to 1 megabyte for the trampoline to the real mode (for rebooting, etc.), `trim_platform_memory_ranges` - trims certain memory regions started from `0x20050000`, `0x20110000`, etc. these regions must be excluded because [Sandy Bridge](http://en.wikipedia.org/wiki/Sandy_Bridge) has problems with these regions, `trim_low_memory_range` reserves the first 4 kilobyte page in `memblock`, `init_mem_mapping` function reconstructs direct memory mapping and setups the direct mapping of the physical memory at `PAGE_OFFSET`, `early_trap_pf_init` setups `#PF` handler (we will look on it in the chapter about interrupts) and `setup_real_mode` function setups trampoline to the [real mode](http://en.wikipedia.org/wiki/Real_mode) code.\n\nThat's all. You can note that this part will not cover all functions which are in the `setup_arch` (like `early_gart_iommu_check`, [mtrr](http://en.wikipedia.org/wiki/Memory_type_range_register) initialization, etc.). As I already wrote many times, `setup_arch` is big, and Linux kernel is big. That's why I can't cover every line in the linux kernel. 
I don't think that we missed something important, but you can say something like: each line of code is important. Yes, it's true, but I missed them anyway, because I think that it is not realistic to cover full linux kernel. Anyway we will often return to the idea that we have already seen, and if something is unfamiliar, we will cover this theme.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the sixth part about Linux kernel initialization process. In this part we continued to dive in the `setup_arch` function again and it was long part, but we are not finished with it. Yes, `setup_arch` is big, hope that next part will be the last part about this function.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification)\n* [NX bit](http://en.wikipedia.org/wiki/NX_bit)\n* [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)\n* [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm)\n* [PCI](http://en.wikipedia.org/wiki/Conventional_PCI)\n* [e820](http://en.wikipedia.org/wiki/E820)\n* [System Management BIOS](http://en.wikipedia.org/wiki/System_Management_BIOS)\n* [System Management BIOS](http://en.wikipedia.org/wiki/System_Management_BIOS)\n* 
[EFI](http://en.wikipedia.org/wiki/Unified_Extensible_Firmware_Interface)\n* [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing)\n* [MultiProcessor Specification](http://www.intel.com/design/pentium/datashts/24201606.pdf)\n* [BSS](http://en.wikipedia.org/wiki/.bss)\n* [SMBIOS specification](http://www.dmtf.org/sites/default/files/standards/documents/DSP0134v2.5Final.pdf)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5)\n"
  },
  {
    "path": "Initialization/linux-initialization-7.md",
    "content": "Kernel initialization. Part 7.\n================================================================================\n\nThe End of the architecture-specific initialization, almost...\n================================================================================\n\nThis is the seventh part of the Linux Kernel initialization process which covers insides of the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c#L861). As you can know from the previous [parts](https://0xax.gitbook.io/linux-insides/summary/initialization), the `setup_arch` function does some architecture-specific (in our case it is [x86_64](http://en.wikipedia.org/wiki/X86-64)) initialization stuff like reserving memory for kernel code/data/bss, early scanning of the [Desktop Management Interface](http://en.wikipedia.org/wiki/Desktop_Management_Interface), early dump of the [PCI](http://en.wikipedia.org/wiki/PCI) device and many many more. If you have read the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6), you can remember that we've finished it at the `setup_real_mode` function. In the next step, as we set limit of the [memblock](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-1) to the all mapped pages, we can see the call of the `setup_log_buf` function from the [kernel/printk/printk.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/printk/printk.c).\n\nThe `setup_log_buf` function setups kernel cyclic buffer and its length depends on the `CONFIG_LOG_BUF_SHIFT` configuration option. As we can read from the documentation of the `CONFIG_LOG_BUF_SHIFT` it can be between `12` and `21`. 
In the insides, buffer defined as array of chars:\n\n```C\n#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)\nstatic char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);\nstatic char *log_buf = __log_buf;\n```\n\nNow let's look on the implementation of the `setup_log_buf` function. It starts with check that current buffer is empty (It must be empty, because we just setup it) and another check that it is early setup. If setup of the kernel log buffer is not early, we call the `log_buf_add_cpu` function which increase size of the buffer for every CPU:\n\n```C\nif (log_buf != __log_buf)\n    return;\n\nif (!early && !new_log_buf_len)\n    log_buf_add_cpu();\n```\n\nWe will not research `log_buf_add_cpu` function, because as you can see in the `setup_arch`, we call `setup_log_buf` as:\n\n```C\nsetup_log_buf(1);\n```\n\nwhere `1` means that it is early setup. In the next step we check `new_log_buf_len` variable which is updated length of the kernel log buffer and allocate new space for the buffer with the `memblock_virt_alloc` function for it, or just return.\n\nAs kernel log buffer is ready, the next function is `reserve_initrd`. You can remember that we already called the `early_reserve_initrd` function in the fourth part of the [Kernel initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4). Now, as we reconstructed direct memory mapping in the `init_mem_mapping` function, we need to move [initrd](http://en.wikipedia.org/wiki/Initrd) into directly mapped memory. The `reserve_initrd` function starts from the definition of the base address and end address of the `initrd` and check that `initrd` is provided by a bootloader. All the same as what we saw in the `early_reserve_initrd`. 
But instead of the reserving place in the `memblock` area with the call of the `memblock_reserve` function, we get the mapped size of the direct memory area and check that the size of the `initrd` is not greater than this area with:\n\n```C\nmapped_size = memblock_mem_size(max_pfn_mapped);\nif (ramdisk_size >= (mapped_size>>1))\n    panic(\"initrd too large to handle, \"\n\t      \"disabling initrd (%lld needed, %lld available)\\n\",\n\t      ramdisk_size, mapped_size>>1);\n```\n\nYou can see here that we call `memblock_mem_size` function and pass the `max_pfn_mapped` to it, where `max_pfn_mapped` contains the highest direct mapped page frame number. If you do not remember what is `page frame number`, explanation is simple: First `12` bits of the virtual address represent offset in the physical page or page frame. If we right-shift out `12` bits of the virtual address, we'll discard offset part and will get `Page Frame Number`. In the `memblock_mem_size` we go through the all memblock `mem` (not reserved) regions and calculates size of the mapped pages and return it to the `mapped_size` variable (see code above). As we got amount of the direct mapped memory, we check that size of the `initrd` is not greater than mapped pages. If it is greater we just call `panic` which halts the system and prints famous [Kernel panic](http://en.wikipedia.org/wiki/Kernel_panic) message. In the next step we print information about the `initrd` size. We can see the result of this in the `dmesg` output:\n\n```C\n[0.000000] RAMDISK: [mem 0x36d20000-0x37687fff]\n```\n\nand relocate `initrd` to the direct mapping area with the `relocate_initrd` function. 
In the start of the `relocate_initrd` function we try to find a free area with the `memblock_find_in_range` function:\n\n```C\nrelocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), area_size, PAGE_SIZE);\n\nif (!relocated_ramdisk)\n    panic(\"Cannot find place for new RAMDISK of size %lld\\n\",\n\t       ramdisk_size);\n```\n\nThe `memblock_find_in_range` function tries to find a free area in a given range, in our case from `0` to the maximum mapped physical address and size must equal to the aligned size of the `initrd`. If we didn't find a area with the given size, we call `panic` again. If all is good, we start to relocated RAM disk to the down of the directly mapped memory in the next step.\n\nIn the end of the `reserve_initrd` function, we free memblock memory which occupied by the ramdisk with the call of the:\n\n```C\nmemblock_free(ramdisk_image, ramdisk_end - ramdisk_image);\n```\n\nAfter we relocated `initrd` ramdisk image, the next function is `vsmp_init` from the [arch/x86/kernel/vsmp_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/vsmp_64.c). This function initializes support of the `ScaleMP vSMP`. As I already wrote in the previous parts, this chapter will not cover non-related `x86_64` initialization parts (for example as the current or `ACPI`, etc.). So we will skip implementation of this for now and will back to it in the part which cover techniques of parallel computing.\n\nThe next function is `io_delay_init` from the [arch/x86/kernel/io_delay.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/io_delay.c). This function allows to override default I/O delay `0x80` port. 
We already saw I/O delay in the [Last preparation before transition into protected mode](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3), now let's look on the `io_delay_init` implementation:\n\n```C\nvoid __init io_delay_init(void)\n{\n    if (!io_delay_override)\n        dmi_check_system(io_delay_0xed_port_dmi_table);\n}\n```\n\nThis function checks the `io_delay_override` variable and, only if it is not set, calls `dmi_check_system` with the `io_delay_0xed_port_dmi_table` table, so the DMI-based check is skipped when the I/O delay type was already overridden. We can set the `io_delay_override` variable by passing `io_delay` option to the kernel command line. As we can read from the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst), `io_delay` option is:\n\n```\nio_delay=\t[X86] I/O delay method\n    0x80\n        Standard port 0x80 based delay\n    0xed\n        Alternate port 0xed based delay (needed on some systems)\n    udelay\n        Simple two microseconds delay\n    none\n        No delay\n```\n\nWe can see `io_delay` command line parameter setup with the `early_param` macro in the [arch/x86/kernel/io_delay.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/io_delay.c)\n\n```C\nearly_param(\"io_delay\", io_delay_param);\n```\n\nMore about `early_param` you can read in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6). So the `io_delay_param` function which setups `io_delay_override` variable will be called in the [do_early_param](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c#L413) function. 
`io_delay_param` function gets the argument of the `io_delay` kernel command line parameter and sets `io_delay_type` depends on it:\n\n```C\nstatic int __init io_delay_param(char *s)\n{\n        if (!s)\n                return -EINVAL;\n\n        if (!strcmp(s, \"0x80\"))\n                io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;\n        else if (!strcmp(s, \"0xed\"))\n                io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;\n        else if (!strcmp(s, \"udelay\"))\n                io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;\n        else if (!strcmp(s, \"none\"))\n                io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;\n        else\n                return -EINVAL;\n\n        io_delay_override = 1;\n        return 0;\n}\n```\n\nThe next functions are `acpi_boot_table_init`, `early_acpi_boot_init` and `initmem_init` after the `io_delay_init`, but as I wrote above we will not cover [ACPI](http://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) related stuff in this `Linux Kernel initialization process` chapter.\n\nAllocate area for DMA\n--------------------------------------------------------------------------------\n\nIn the next step we need to allocate area for the [Direct memory access](http://en.wikipedia.org/wiki/Direct_memory_access) with the `dma_contiguous_reserve` function which is defined in the [drivers/base/dma-contiguous.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/base/dma-contiguous.c). `DMA` is a special mode when devices communicate with memory without CPU. Note that we pass one parameter - `max_pfn_mapped << PAGE_SHIFT`, to the `dma_contiguous_reserve` function and as you can understand from this expression, this is limit of the reserved memory. Let's look on the implementation of this function. 
It starts from the definition of the following variables:\n\n```C\nphys_addr_t selected_size = 0;\nphys_addr_t selected_base = 0;\nphys_addr_t selected_limit = limit;\nbool fixed = false;\n```\n\nwhere first represents size in bytes of the reserved area, second is base address of the reserved area, third is end address of the reserved area and the last `fixed` parameter shows where to place reserved area. If `fixed` is `1` we just reserve area with the `memblock_reserve`, if it is `0` we allocate space with the `kmemleak_alloc`. In the next step we check `size_cmdline` variable and if it is not equal to `-1` we fill all variables which you can see above with the values from the `cma` kernel command line parameter:\n\n```C\nif (size_cmdline != -1) {\n   ...\n   ...\n   ...\n}\n```\n\nYou can find in this source code file definition of the early parameter:\n\n```C\nearly_param(\"cma\", early_cma);\n```\n\nwhere `cma` is:\n\n```\ncma=nn[MG]@[start[MG][-end[MG]]]\n\t\t[ARM,X86,KNL]\n\t\tSets the size of kernel global memory area for\n\t\tcontiguous memory allocations and optionally the\n\t\tplacement constraint by the physical address range of\n\t\tmemory allocations. A value of 0 disables CMA\n\t\taltogether. For more information, see\n\t\tinclude/linux/dma-contiguous.h\n```\n\nIf we will not pass `cma` option to the kernel command line, `size_cmdline` will be equal to `-1`. 
In this way we need to calculate size of the reserved area which depends on the following kernel configuration options:\n\n* `CONFIG_CMA_SIZE_SEL_MBYTES` - size in megabytes, default global `CMA` area, which is equal to `CMA_SIZE_MBYTES * SZ_1M` or `CONFIG_CMA_SIZE_MBYTES * 1M`;\n* `CONFIG_CMA_SIZE_SEL_PERCENTAGE` - percentage of total memory;\n* `CONFIG_CMA_SIZE_SEL_MIN` - use lower value;\n* `CONFIG_CMA_SIZE_SEL_MAX` - use higher value.\n\nAs we calculated the size of the reserved area, we reserve area with the call of the `dma_contiguous_reserve_area` function which first of all calls:\n\n```\nret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, res_cma);\n```\n\nfunction. The `cma_declare_contiguous` reserves contiguous area from the given base address with given size. After we reserved area for the `DMA`, next function is the `memblock_find_dma_reserve`. As you can understand from its name, this function counts the reserved pages in the `DMA` area. This part will not cover all details of the `CMA` and `DMA`, because they are big. We will see much more details in the special part in the Linux Kernel Memory management which covers contiguous memory allocators and areas.\n\nInitialization of the sparse memory\n--------------------------------------------------------------------------------\n\nThe next step is the call of the function - `x86_init.paging.pagetable_init`. If you try to find this function in the Linux kernel source code, in the end of your search, you will see the following macro:\n\n```C\n#define native_pagetable_init        paging_init\n```\n\nwhich expands as you can see to the call of the `paging_init` function from the [arch/x86/mm/init_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init_64.c). The `paging_init` function initializes sparse memory and zone sizes. First of all what's zones and what is it `Sparsemem`. 
The `Sparsemem` is a special foundation in the Linux kernel memory manager which used to split memory area into different memory banks in the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) systems. Let's look on the implementation of the `paging_init` function:\n\n```C\nvoid __init paging_init(void)\n{\n        sparse_memory_present_with_active_regions(MAX_NUMNODES);\n        sparse_init();\n\n        node_clear_state(0, N_MEMORY);\n        if (N_MEMORY != N_NORMAL_MEMORY)\n                node_clear_state(0, N_NORMAL_MEMORY);\n\n        zone_sizes_init();\n}\n```\n\nAs you can see there is call of the `sparse_memory_present_with_active_regions` function which records a memory area for every `NUMA` node to the array of the `mem_section` structure which contains a pointer to the structure of the array of `struct page`. The next `sparse_init` function allocates non-linear `mem_section` and `mem_map`. In the next step we clear state of the movable memory nodes and initialize sizes of zones. Every `NUMA` node is divided into a number of pieces which are called - `zones`. So, `zone_sizes_init` function from the [arch/x86/mm/init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init.c) initializes size of zones.\n\nAgain, this part and next parts do not cover this theme in full details. There will be special part about `NUMA`.\n\nvsyscall mapping\n--------------------------------------------------------------------------------\n\nThe next step after `SparseMem` initialization is setting of the `trampoline_cr4_features` which must contain content of the `cr4` [Control register](http://en.wikipedia.org/wiki/Control_register). 
First of all we need to check that current CPU has support of the `cr4` register and if it has, we save its content to the `trampoline_cr4_features` which is storage for `cr4` in the real mode:\n\n```C\nif (boot_cpu_data.cpuid_level >= 0) {\n    mmu_cr4_features = __read_cr4();\n\tif (trampoline_cr4_features)\n\t    *trampoline_cr4_features = mmu_cr4_features;\n}\n```\n\nThe next function which you can see is `map_vsyscall` from the [arch/x86/entry/vsyscall/vsyscall_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/entry/vsyscall/vsyscall_64.c). This function maps memory space for [vsyscalls](https://lwn.net/Articles/446528/) and depends on `CONFIG_X86_VSYSCALL_EMULATION` kernel configuration option. Actually `vsyscall` is a special segment which provides fast access to the certain system calls like `getcpu`, etc. Let's look on implementation of this function:\n\n```C\nvoid __init map_vsyscall(void)\n{\n        extern char __vsyscall_page;\n        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);\n\n        if (vsyscall_mode != NONE)\n                __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,\n                             vsyscall_mode == NATIVE\n                             ? PAGE_KERNEL_VSYSCALL\n                             : PAGE_KERNEL_VVAR);\n\n        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=\n                     (unsigned long)VSYSCALL_ADDR);\n}\n```\n\nIn the beginning of the `map_vsyscall` we can see definition of two variables. The first is extern variable `__vsyscall_page`. As an extern variable, it is defined somewhere in another source code file. Actually we can see definition of the `__vsyscall_page` in the [arch/x86/entry/vsyscall/vsyscall_emu_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/vsyscall/vsyscall_emu_64.S). 
The `__vsyscall_page` symbol points to the aligned calls of the `vsyscalls` as `gettimeofday`, etc.:\n\n```assembly\n\t.globl __vsyscall_page\n\t.balign PAGE_SIZE, 0xcc\n\t.type __vsyscall_page, @object\n__vsyscall_page:\n\n\tmov $__NR_gettimeofday, %rax\n\tsyscall\n\tret\n\n\t.balign 1024, 0xcc\n\tmov $__NR_time, %rax\n\tsyscall\n\tret\n    ...\n    ...\n    ...\n```\n\nThe second variable is `physaddr_vsyscall` which just stores physical address of the `__vsyscall_page` symbol. In the next step we check the `vsyscall_mode` variable, and if it is not equal to `NONE`, it is `EMULATE` by default:\n\n```C\nstatic enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;\n```\n\nAnd after this check we can see the call of the `__set_fixmap` function which calls `native_set_fixmap` with the same parameters:\n\n```C\nvoid native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)\n{\n        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));\n}\n\nvoid __native_set_fixmap(enum fixed_addresses idx, pte_t pte)\n{\n        unsigned long address = __fix_to_virt(idx);\n\n        if (idx >= __end_of_fixed_addresses) {\n                BUG();\n                return;\n        }\n        set_pte_vaddr(address, pte);\n        fixmaps_set++;\n}\n```\n\nHere we can see that `native_set_fixmap` makes value of `Page Table Entry` from the given physical address (physical address of the `__vsyscall_page` symbol in our case) and calls internal function - `__native_set_fixmap`. Internal function gets the virtual address of the given `fixed_addresses` index (`VSYSCALL_PAGE` in our case) and checks that given index is not greater than end of the fix-mapped addresses. After this we set page table entry with the call of the `set_pte_vaddr` function and increase count of the fix-mapped addresses. 
And in the end of the `map_vsyscall` we check that virtual address of the `VSYSCALL_PAGE` (which is first index in the `fixed_addresses`) is equal to `VSYSCALL_ADDR` which is `-10UL << 20` or `ffffffffff600000` with the `BUILD_BUG_ON` macro, which breaks the build if these two addresses differ:\n\n```C\nBUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=\n                     (unsigned long)VSYSCALL_ADDR);\n```\n\nNow `vsyscall` area is in the `fix-mapped` area. That's all about `map_vsyscall`, if you do not know anything about fix-mapped addresses, you can read [Fix-Mapped Addresses and ioremap](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2). We will see more about `vsyscalls` in the `vsyscalls and vdso` part.\n\nGetting the SMP configuration\n--------------------------------------------------------------------------------\n\nYou may remember how we made a search of the [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing) configuration in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6). Now we need to get the `SMP` configuration if we found it. For this we check `smp_found_config` variable which we set in the `smp_scan_config` function (read about it in the previous part) and call the `get_smp_config` function:\n\n```C\nif (smp_found_config)\n\tget_smp_config();\n```\n\nThe `get_smp_config` expands to the `x86_init.mpparse.default_get_smp_config` function which is defined in the [arch/x86/kernel/mpparse.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/mpparse.c). 
This function defines a pointer to the multiprocessor floating pointer structure - `mpf_intel` (you can read about it in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6)) and does some checks:\n\n```C\nstruct mpf_intel *mpf = mpf_found;\n\nif (!mpf)\n    return;\n\nif (acpi_lapic && early)\n   return;\n```\n\nHere we check that the multiprocessor configuration was found in the `smp_scan_config` function, and just return from the function if it was not. The next check is `acpi_lapic` and `early`. After these checks, we start to read the `SMP` configuration. As we finished reading it, the next step is - `prefill_possible_map` function which makes preliminary filling of the possible CPU's `cpumask` (more about it you can read in the [Introduction to the cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)).\n\nThe rest of the setup_arch\n--------------------------------------------------------------------------------\n\nHere we are getting to the end of the `setup_arch` function. The rest of the function of course is important, but details about this stuff will not be included in this part. We will just take a short look on these functions, because although they are important as I wrote above, they cover non-generic kernel features related with the `NUMA`, `SMP`, `ACPI` and `APICs`, etc. First of all, the next call is the `init_apic_mappings` function. As we can understand this function sets the address of the local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The next is `x86_io_apic_ops.init` and this function initializes I/O APIC. Please note that we will see all details related with `APIC` in the chapter about interrupts and exceptions handling. In the next step we reserve standard I/O resources like `DMA`, `TIMER`, `FPU`, etc., with the call of the `x86_init.resources.reserve_resources` function. 
Following is `mcheck_init` function initializes `Machine check Exception` and the last is `register_refined_jiffies` which registers [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29) (There will be separate chapter about timers in the kernel).\n\nSo that's all. Finally we have finished with the big `setup_arch` function in this part. Of course as I already wrote many times, we did not see full details about this function, but do not worry about it. We will be back more than once to this function from different chapters for understanding how different platform-dependent parts are initialized.\n\nThat's all, and now we can back to the `start_kernel` from the `setup_arch`.\n\nBack to the main.c\n================================================================================\n\nAs I wrote above, we have finished with the `setup_arch` function and now we can back to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As you may remember or saw yourself, `start_kernel` function as big as the `setup_arch`. So the couple of the next part will be dedicated to learning of this function. So, let's continue with it. After the `setup_arch` we can see the call of the `mm_init_cpumask` function. This function sets the [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) pointer to the memory descriptor `cpumask`. 
We can look on its implementation:\n\n```C\nstatic inline void mm_init_cpumask(struct mm_struct *mm)\n{\n#ifdef CONFIG_CPUMASK_OFFSTACK\n        mm->cpu_vm_mask_var = &mm->cpumask_allocation;\n#endif\n        cpumask_clear(mm->cpu_vm_mask_var);\n}\n```\n\nAs you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c), we pass memory descriptor of the init process to the `mm_init_cpumask`. If the `CONFIG_CPUMASK_OFFSTACK` configuration option is enabled, it first points `cpu_vm_mask_var` to the `cpumask_allocation` field, and then in any case we clear [TLB](http://en.wikipedia.org/wiki/Translation_lookaside_buffer) switch `cpumask`.\n\nIn the next step we can see the call of the following function:\n\n```C\nsetup_command_line(command_line);\n```\n\nThis function takes pointer to the kernel command line and allocates a couple of buffers to store command line. We need a couple of buffers, because one buffer is used for future reference and accessing to command line and one for parameter parsing. We will allocate space for the following buffers:\n\n* `saved_command_line` - will contain boot command line;\n* `initcall_command_line` - will contain boot command line and will be used in the `do_initcall_level`;\n* `static_command_line` - will contain command line for parameters parsing.\n\nWe will allocate space with the `memblock_virt_alloc` function. This function calls `memblock_virt_alloc_try_nid` which allocates boot memory block with `memblock_reserve` if [slab](http://en.wikipedia.org/wiki/Slab_allocation) is not available or uses `kzalloc_node` (more about it will be in the Linux memory management chapter). 
The `memblock_virt_alloc` uses `BOOTMEM_LOW_LIMIT` (physical address of the `(PAGE_OFFSET + 0x1000000)` value) and `BOOTMEM_ALLOC_ACCESSIBLE` (equal to the current value of the `memblock.current_limit`) as minimum address of the memory region and maximum address of the memory region.\n\nLet's look on the implementation of the `setup_command_line`:\n\n```C\nstatic void __init setup_command_line(char *command_line)\n{\n        saved_command_line =\n                memblock_virt_alloc(strlen(boot_command_line) + 1, 0);\n        initcall_command_line =\n                memblock_virt_alloc(strlen(boot_command_line) + 1, 0);\n        static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0);\n        strcpy(saved_command_line, boot_command_line);\n        strcpy(static_command_line, command_line);\n }\n ```\n\nHere we can see that we allocate space for the three buffers which will contain kernel command line for the different purposes (read above). And as we allocated space, we store `boot_command_line` in the `saved_command_line` and `command_line` (kernel command line from the `setup_arch`) to the `static_command_line`.\n\nThe next function after the `setup_command_line` is the `setup_nr_cpu_ids`. This function setting `nr_cpu_ids` (number of CPUs) according to the last bit in the `cpu_possible_mask` (more about it you can read in the chapter describes [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) concept). Let's look on its implementation:\n\n```C\nvoid __init setup_nr_cpu_ids(void)\n{\n        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;\n}\n```\n\nHere `nr_cpu_ids` represents number of CPUs, `NR_CPUS` represents the maximum number of CPUs which we can set in configuration time:\n\n![CONFIG_NR_CPUS](images/CONFIG_NR_CPUS.png)\n\nActually we need to call this function, because `NR_CPUS` can be greater than actual amount of the CPUs in the your computer. 
Here we can see that we call `find_last_bit` function and pass two parameters to it:\n\n* `cpu_possible_mask` bits;\n* maximum number of CPUS.\n\nIn the `setup_arch` we can find the call of the `prefill_possible_map` function which calculates and writes to the `cpu_possible_mask` actual number of the CPUs. We call the `find_last_bit` function which takes the address and maximum size to search and returns the bit number of the last set bit. We passed `cpu_possible_mask` bits and maximum number of the CPUs. First of all the `find_last_bit` function splits given `unsigned long` address to the [words](http://en.wikipedia.org/wiki/Word_%28computer_architecture%29):\n\n```C\nwords = size / BITS_PER_LONG;\n```\n\nwhere `BITS_PER_LONG` is `64` on the `x86_64`. As we got amount of words in the given size of the search data, we need to check whether the given size contains a partial word with the following check:\n\n```C\nif (size & (BITS_PER_LONG-1)) {\n         tmp = (addr[words] & (~0UL >> (BITS_PER_LONG\n                                 - (size & (BITS_PER_LONG-1)))));\n         if (tmp)\n                 goto found;\n}\n```\n\nif it contains a partial word, we mask the last word and check it. If the last word is not zero, it means that current word contains at least one set bit. We go to the `found` label:\n\n```C\nfound:\n    return words * BITS_PER_LONG + __fls(tmp);\n```\n\nHere you can see `__fls` function which returns the index of the last (most significant) set bit in a given word with help of the `bsr` instruction:\n\n```C\nstatic inline unsigned long __fls(unsigned long word)\n{\n        asm(\"bsr %1,%0\"\n            : \"=r\" (word)\n            : \"rm\" (word));\n        return word;\n}\n```\n\nThe `bsr` (bit scan reverse) instruction scans the given operand for the most significant set bit. 
If the last word is not partial we go through all the words in the given address, starting from the last one, and try to find the last set bit:\n\n```C\nwhile (words) {\n    tmp = addr[--words];\n    if (tmp) {\nfound:\n        return words * BITS_PER_LONG + __fls(tmp);\n    }\n}\n```\n\nHere we put the last word to the `tmp` variable and check that `tmp` contains at least one set bit. If a set bit is found, we return the number of this bit. If none of the words contains a set bit we just return the given size:\n\n```C\nreturn size;\n```\n\nAfter this `nr_cpu_ids` will contain the correct amount of the available CPUs.\n\nThat's all.\n\nConclusion\n================================================================================\n\nIt is the end of the seventh part about the Linux kernel initialization process. In this part, finally we have finished with the `setup_arch` function and returned to the `start_kernel` function. In the next part we will continue to learn generic kernel code from the `start_kernel` and will continue our way to the first `init` process.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n================================================================================\n\n* [Desktop Management Interface](http://en.wikipedia.org/wiki/Desktop_Management_Interface)\n* [x86_64](http://en.wikipedia.org/wiki/X86-64)\n* [initrd](http://en.wikipedia.org/wiki/Initrd)\n* [Kernel panic](http://en.wikipedia.org/wiki/Kernel_panic)\n* [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)\n* [ACPI](http://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface)\n* [Direct memory access](http://en.wikipedia.org/wiki/Direct_memory_access)\n* [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access)\n* [Control register](http://en.wikipedia.org/wiki/Control_register)\n* [vsyscalls](https://lwn.net/Articles/446528/)\n* [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing)\n* [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6)\n"
  },
  {
    "path": "Initialization/linux-initialization-8.md",
    "content": "Kernel initialization. Part 8.\n================================================================================\n\nScheduler initialization\n================================================================================\n\nThis is the eighth [part](https://0xax.gitbook.io/linux-insides/summary/initialization) of the Linux kernel initialization process chapter and we stopped on the `setup_nr_cpu_ids` function in the [previous part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-7.md).\n\nThe main point of this part is [scheduler](http://en.wikipedia.org/wiki/Scheduling_%28computing%29) initialization. But before we will start to learn initialization process of the scheduler, we need to do some stuff. The next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the `setup_per_cpu_areas` function. This function setups memory areas for the `percpu` variables, more about it you can read in the special part about the [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1). After `percpu` areas is up and running, the next step is the `smp_prepare_boot_cpu` function.\n\nThis function does some preparations for [symmetric multiprocessing](http://en.wikipedia.org/wiki/Symmetric_multiprocessing). Since this function is architecture specific, it is located in the [arch/x86/include/asm/smp.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/smp.h#L78) Linux kernel header file. Let's look at the definition of this function:\n\n```C\nstatic inline void smp_prepare_boot_cpu(void)\n{\n         smp_ops.smp_prepare_boot_cpu();\n}\n```\n\nWe may see here that it just calls the `smp_prepare_boot_cpu` callback of the `smp_ops` structure. 
If we look at the definition of instance of this structure from the [arch/x86/kernel/smp.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/smp.c) source code file, we will see that the `smp_prepare_boot_cpu` expands to the call of the `native_smp_prepare_boot_cpu` function:\n\n```C\nstruct smp_ops smp_ops = {\n    ...\n    ...\n    ...\n    smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,\n    ...\n    ...\n    ...\n}\nEXPORT_SYMBOL_GPL(smp_ops);\n```\n\nThe `native_smp_prepare_boot_cpu` function looks:\n\n```C\nvoid __init native_smp_prepare_boot_cpu(void)\n{\n        int me = smp_processor_id();\n        switch_to_new_gdt(me);\n        cpumask_set_cpu(me, cpu_callout_mask);\n        per_cpu(cpu_state, me) = CPU_ONLINE;\n}\n```\n\nand executes following things: first of all it gets the `id` of the current CPU (which is Bootstrap processor and its `id` is zero for this moment) with the `smp_processor_id` function. I will not explain how the `smp_processor_id` works, because we already saw it in the [Kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) part. After we've got processor `id` number we reload [Global Descriptor Table](http://en.wikipedia.org/wiki/Global_Descriptor_Table) for the given CPU with the `switch_to_new_gdt` function:\n\n```C\nvoid switch_to_new_gdt(int cpu)\n{\n        struct desc_ptr gdt_descr;\n\n        gdt_descr.address = (long)get_cpu_gdt_table(cpu);\n        gdt_descr.size = GDT_SIZE - 1;\n        load_gdt(&gdt_descr);\n        load_percpu_segment(cpu);\n}\n```\n\nThe `gdt_descr` variable represents pointer to the `GDT` descriptor here (we already saw definition of a `desc_ptr` structure in the [Early interrupt and exception handling](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) part). We get the address and the size of the `GDT` descriptor for the `CPU` with the given `id`. 
The `GDT_SIZE` is `256` or:\n\n```C\n#define GDT_SIZE (GDT_ENTRIES * 8)\n```\n\nand the address of the descriptor we will get with the `get_cpu_gdt_table`:\n\n```C\nstatic inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)\n{\n        return per_cpu(gdt_page, cpu).gdt;\n}\n```\n\nThe `get_cpu_gdt_table` uses `per_cpu` macro for getting value of a `gdt_page` percpu variable for the given CPU number (bootstrap processor with `id` - 0 in our case).\n\nYou may ask the following question: so, if we can access `gdt_page` percpu variable, where was it defined? Actually we already saw it in this book. If you have read the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, you can remember that we saw definition of the `gdt_page` in the [arch/x86/kernel/head_64.S](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/head_64.S):\n\n```assembly\nearly_gdt_descr:\n\t.word\tGDT_ENTRIES*8-1\nearly_gdt_descr_base:\n\t.quad\tINIT_PER_CPU_VAR(gdt_page)\n```\n\nand if we will look on the [linker](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/vmlinux.lds.S) file we can see that it locates after the `__per_cpu_load` symbol:\n\n```C\n#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load\nINIT_PER_CPU(gdt_page);\n```\n\nand filled `gdt_page` in the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c#L94):\n\n```C\nDEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {\n#ifdef CONFIG_X86_64\n\t[GDT_ENTRY_KERNEL32_CS]\t\t= GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),\n\t[GDT_ENTRY_KERNEL_CS]\t\t= GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),\n\t[GDT_ENTRY_KERNEL_DS]\t\t= GDT_ENTRY_INIT(0xc093, 0, 0xfffff),\n\t[GDT_ENTRY_DEFAULT_USER32_CS]\t= GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),\n\t[GDT_ENTRY_DEFAULT_USER_DS]\t= GDT_ENTRY_INIT(0xc0f3, 0, 
0xfffff),\n\t[GDT_ENTRY_DEFAULT_USER_CS]\t= GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),\n    ...\n    ...\n    ...\n```\n\nmore about `percpu` variables you can read in the [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) part. As we got address and size of the `GDT` descriptor we reload `GDT` with the `load_gdt` which just executes the `lgdt` instruction and load `percpu_segment` with the following function:\n\n```C\nvoid load_percpu_segment(int cpu) {\n    loadsegment(gs, 0);\n    wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));\n    load_stack_canary_segment();\n}\n```\n\nThe `gs` register (or `fs` register for `x86`) must contain the base address of the `percpu` area, so we are using `loadsegment` macro and pass `gs`. In the next step we write the base address of the [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) stack and setup stack [canary](http://en.wikipedia.org/wiki/Buffer_overflow_protection) (this is only for `x86_32`). After we load new `GDT`, we fill `cpu_callout_mask` bitmap with the current cpu and set cpu state as online with the setting `cpu_state` percpu variable for the current processor - `CPU_ONLINE`:\n\n```C\ncpumask_set_cpu(me, cpu_callout_mask);\nper_cpu(cpu_state, me) = CPU_ONLINE;\n```\n\nSo, what is `cpu_callout_mask` bitmap? As we initialized bootstrap processor (processor which is booted the first on `x86`) the other processors in a multiprocessor system are known as `secondary processors`. Linux kernel uses following two bitmasks:\n\n* `cpu_callout_mask`\n* `cpu_callin_mask`\n\nAfter bootstrap processor initialized, it updates the `cpu_callout_mask` to indicate which secondary processor can be initialized next. All other or secondary processors can do some initialization stuff before and check the `cpu_callout_mask` on the bootstrap processor bit. 
Only after the bootstrap processor filled the `cpu_callout_mask` with this secondary processor, it will continue the rest of its initialization. After that the certain processor finish its initialization process, the processor sets bit in the `cpu_callin_mask`. Once the bootstrap processor finds the bit in the `cpu_callin_mask` for the current secondary processor, this processor repeats the same procedure for initialization of one of the remaining secondary processors. In a short words it works as I described, but we will see more details in the chapter about `SMP`.\n\nThat's all. We did all `SMP` boot preparation.\n\nBuild zonelists\n-----------------------------------------------------------------------\n\nIn the next step we can see the call of the `build_all_zonelists` function. This function sets up the order of zones that allocations are preferred from. What are zones and what's order we will understand soon. For the start let's see how Linux kernel considers physical memory. Physical memory is split into banks which are called - `nodes`. If you have no hardware support for `NUMA`, you will see only one node:\n\n```\n$ cat /sys/devices/system/node/node0/numastat\nnuma_hit 72452442\nnuma_miss 0\nnuma_foreign 0\ninterleave_hit 12925\nlocal_node 72452442\nother_node 0\n```\n\nEvery `node` is presented by the `struct pglist_data` in the Linux kernel. Each node is divided into a number of special blocks which are called - `zones`. Every zone is presented by the `zone struct` in the linux kernel and has one of the type:\n\n* `ZONE_DMA` - 0-16M;\n* `ZONE_DMA32` - used for 32 bit devices that can only do DMA areas below 4G;\n* `ZONE_NORMAL` - all RAM from the 4GB on the `x86_64`;\n* `ZONE_HIGHMEM` - absent on the `x86_64`;\n* `ZONE_MOVABLE` - zone which contains movable pages.\n\nwhich are presented by the `zone_type` enum. 
We can get information about zones with the:\n\n```\n$ cat /proc/zoneinfo\nNode 0, zone      DMA\n  pages free     3975\n        min      3\n        low      3\n        ...\n        ...\nNode 0, zone    DMA32\n  pages free     694163\n        min      875\n        low      1093\n        ...\n        ...\nNode 0, zone   Normal\n  pages free     2529995\n        min      3146\n        low      3932\n        ...\n        ...\n```\n\nAs I wrote above all nodes are described with the `pglist_data` or `pg_data_t` structure in memory. This structure is defined in the [include/linux/mmzone.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mmzone.h). The `build_all_zonelists` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c) constructs an ordered `zonelist` (of different zones `DMA`, `DMA32`, `NORMAL`, `HIGH_MEMORY`, `MOVABLE`) which specifies the zones/nodes to visit when a selected `zone` or `node` cannot satisfy the allocation request. That's all. More about `NUMA` and multiprocessor systems will be in the special part.\n\nThe rest of the stuff before scheduler initialization\n--------------------------------------------------------------------------------\n\nBefore we start to dive into Linux kernel scheduler initialization process we must do a couple of things. The first thing is the `page_alloc_init` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c). 
This function looks pretty easy:\n\n```C\nvoid __init page_alloc_init(void)\n{\n    int ret;\n\n    ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,\n                                    \"mm/page_alloc:dead\", NULL,\n                                    page_alloc_cpu_dead);\n    WARN_ON(ret < 0);\n}\n```\n\nIt sets up the `startup` and `teardown` callbacks (third and fourth parameters) for the `CPUHP_PAGE_ALLOC_DEAD` cpu [hotplug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) state. Of course the implementation of this function depends on the `CONFIG_HOTPLUG_CPU` kernel configuration option and if this option is set, such callbacks will be set for all cpu(s) in the system depending on their `hotplug` states. [hotplug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) mechanism is a big theme and it will not be described in this book.\n\nAfter this function we can see the kernel command line in the initialization output:\n\n![kernel command line](images/kernel_command_line.png)\n\nAnd a couple of functions such as `parse_early_param` and `parse_args` which handle Linux kernel command line. You may remember that we already saw the call of the `parse_early_param` function in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the kernel initialization chapter, so why do we call it again? Answer is simple: we call this function in the architecture-specific code (`x86_64` in our case), but not all architectures call this function. And we need to call the second function `parse_args` to parse and handle non-early command line arguments.\n\nIn the next step we can see the call of the `jump_label_init` from the [kernel/jump_label.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/jump_label.c). 
This function initializes [jump label](https://lwn.net/Articles/412072/).\n\nAfter this we can see the call of the `setup_log_buf` function which sets up the [printk](http://www.makelinux.net/books/lkd2/ch18lev1sec3) log buffer. We already saw this function in the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the Linux kernel initialization process chapter.\n\nPID hash initialization\n--------------------------------------------------------------------------------\n\nThe next is `pidhash_init` function. As you know each process is assigned a unique number which is called - `process identification number` or `PID`. Each process generated with fork or clone is automatically assigned a new unique `PID` value by the kernel. The management of `PIDs` is centered around the two special data structures: `struct pid` and `struct upid`. First structure represents information about a `PID` in the kernel. The second structure represents the information that is visible in a specific namespace. All `PID` instances are stored in the special hash table:\n\n```C\nstatic struct hlist_head *pid_hash;\n```\n\nThis hash table is used to find the pid instance that belongs to a numeric `PID` value. So, `pidhash_init` initializes this hash table. In the start of the `pidhash_init` function we can see the call of the `alloc_large_system_hash`:\n\n```C\npid_hash = alloc_large_system_hash(\"PID\", sizeof(*pid_hash), 0, 18,\n                                   HASH_EARLY | HASH_SMALL,\n                                   &pidhash_shift, NULL,\n                                   0, 4096);\n```\n\nThe number of elements of the `pid_hash` depends on the `RAM` configuration, but it can be between `2^4` and `2^12`. 
The `pidhash_init` computes the size\nand allocates the required storage (which is `hlist` in our case - the same as [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1), but contains one pointer instead of two in the [struct hlist_head](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h)). The `alloc_large_system_hash` function allocates a large system hash table with `memblock_virt_alloc_nopanic` if we pass `HASH_EARLY` flag (as it is in our case) or with `__vmalloc` if we did not pass this flag.\n\nThe result we can see in the `dmesg` output:\n\n```\n$ dmesg | grep hash\n[    0.000000] PID hash table entries: 4096 (order: 3, 32768 bytes)\n...\n...\n...\n```\n\nThat's all. The rest of the stuff before scheduler initialization is the following functions: `vfs_caches_init_early` does early initialization of the [virtual file system](http://en.wikipedia.org/wiki/Virtual_file_system) (more about it will be in the chapter which will describe virtual file system), `sort_main_extable` sorts the kernel's built-in exception table entries which are between `__start___ex_table` and `__stop___ex_table`, and `trap_init` initializes trap handlers (more about last two functions we will know in the separate chapter about interrupts).\n\nThe last step before the scheduler initialization is initialization of the memory manager with the `mm_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As we can see, the `mm_init` function initializes different parts of the Linux kernel memory manager:\n\n```C\npage_ext_init_flatmem();\nmem_init();\nkmem_cache_init();\npercpu_init_late();\npgtable_init();\nvmalloc_init();\n```\n\nThe first is `page_ext_init_flatmem` which depends on the `CONFIG_SPARSEMEM` kernel configuration option and initializes extended data per page handling. 
The `mem_init` releases all `bootmem`, the `kmem_cache_init` initializes kernel cache, the `percpu_init_late` - replaces `percpu` chunks with those allocated by [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29), the `pgtable_init` - initializes the `page->ptl` kernel cache, the `vmalloc_init` - initializes `vmalloc`. Please, **NOTE** that we will not dive into details about all of these functions and concepts, but we will see all of them in the [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm) chapter.\n\nThat's all. Now we can look at the `scheduler`.\n\nScheduler initialization\n--------------------------------------------------------------------------------\n\nAnd now we come to the main purpose of this part - initialization of the task scheduler. I want to say again as I already did it many times, you will not see the full explanation of the scheduler here, there will be special separate chapter about this. Here will be described first initial scheduler mechanisms which are initialized first of all. So let's start.\n\nOur current point is the `sched_init` function from the [kernel/sched/core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/core.c) kernel source code file and as we can understand from the function's name, it initializes scheduler. Let's start to dive into this function and try to understand how the scheduler is initialized. At the start of the `sched_init` function we can see the following call:\n\n```C\nsched_clock_init();\n```\n\nThe `sched_clock_init` is a pretty easy function and as we may see it just sets the `sched_clock_running` variable:\n\n```C\nvoid sched_clock_init(void)\n{\n\tsched_clock_running = 1;\n}\n```\n\nthat will be used later. 
At the next step is initialization of the array of `waitqueues`:\n\n```C\nfor (i = 0; i < WAIT_TABLE_SIZE; i++)\n\tinit_waitqueue_head(bit_wait_table + i);\n```\n\nwhere `bit_wait_table` is defined as:\n\n```C\n#define WAIT_TABLE_BITS 8\n#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)\nstatic wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;\n```\n\nThe `bit_wait_table` is array of wait queues that will be used for wait/wake up of processes depends on the value of a designated bit. The next step after initialization of `waitqueues` array is calculating size of memory to allocate for the `root_task_group`. As we may see this size depends on two following kernel configuration options:\n\n```C\n#ifdef CONFIG_FAIR_GROUP_SCHED\n         alloc_size += 2 * nr_cpu_ids * sizeof(void **);\n#endif\n#ifdef CONFIG_RT_GROUP_SCHED\n         alloc_size += 2 * nr_cpu_ids * sizeof(void **);\n#endif\n```\n\n* `CONFIG_FAIR_GROUP_SCHED`;\n* `CONFIG_RT_GROUP_SCHED`.\n\nBoth of these options provide two different planning models. As we can read from the [documentation](https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt), the current scheduler - `CFS` or `Completely Fair Scheduler` use a simple concept. It models process scheduling as if the system has an ideal multitasking processor where each process would receive `1/n` processor time, where `n` is the number of the runnable processes. The scheduler uses the special set of rules. 
These rules determine when and how to select a new process to run and they are called `scheduling policy`.\n\nThe `Completely Fair Scheduler` supports following `normal` or in other words `non-real-time` scheduling policies:\n\n* `SCHED_NORMAL`;\n* `SCHED_BATCH`;\n* `SCHED_IDLE`.\n\nThe `SCHED_NORMAL` is used for the most normal applications, the amount of cpu each process consumes is mostly determined by the [nice](http://en.wikipedia.org/wiki/Nice_%28Unix%29) value, the `SCHED_BATCH` used for the 100% non-interactive tasks and the `SCHED_IDLE` runs tasks only when the processor has no task to run besides this task.\n\nThe `real-time` policies are also supported for the time-critical applications: `SCHED_FIFO` and `SCHED_RR`. If you've read something about the Linux kernel scheduler, you can know that it is modular. That means it supports different algorithms to schedule different types of processes. Usually this modularity is called `scheduler classes`. These modules encapsulate scheduling policy details and are handled by the scheduler core without knowing too much about them.\n\nNow let's get back to the our code and look on the two configuration options: `CONFIG_FAIR_GROUP_SCHED` and `CONFIG_RT_GROUP_SCHED`. The smallest unit that the scheduler works with is an individual task or thread. However, a process is not the only type of entity that the scheduler can operate with. Both of these options provide support for group scheduling. The first option provides support for group scheduling with the `completely fair scheduler` policies and the second with the `real-time` policies respectively.\n\nIn simple words, group scheduling is a feature that allows us to schedule a set of tasks as if they were a single task. For example, if you create a group with two tasks on the group, then this group is just like one normal task, from the kernel perspective. 
After a group is scheduled, the scheduler will pick a task from this group and it will be scheduled inside the group. So, such mechanism allows us to build hierarchies and manage their resources. Although a minimal unit of scheduling is a process, the Linux kernel scheduler does not use `task_struct` structure under the hood. There is special `sched_entity` structure that is used by the Linux kernel scheduler as scheduling unit.\n\nSo, the current goal is to calculate a space to allocate for a `sched_entity(ies)` of the root task group and we do it two times with:\n\n```C\n#ifdef CONFIG_FAIR_GROUP_SCHED\n         alloc_size += 2 * nr_cpu_ids * sizeof(void **);\n#endif\n#ifdef CONFIG_RT_GROUP_SCHED\n         alloc_size += 2 * nr_cpu_ids * sizeof(void **);\n#endif\n```\n\nThe first is for case when scheduling of task groups is enabled with `completely fair` scheduler and the second is for the same purpose by in a case of `real-time` scheduler. So here we calculate size which is equal to size of a pointer multiplied on amount of CPUs in the system and multiplied to `2`. 
We need to multiply this by `2` as we will need to allocate a space for two things:\n\n* scheduler entity structure;\n* `runqueue`.\n\nAfter we have calculated size, we allocate a space with the `kzalloc` function and set pointers of `sched_entity` and `runqueues` there:\n\n```C\nptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);\n\n#ifdef CONFIG_FAIR_GROUP_SCHED\n        root_task_group.se = (struct sched_entity **)ptr;\n        ptr += nr_cpu_ids * sizeof(void **);\n\n        root_task_group.cfs_rq = (struct cfs_rq **)ptr;\n        ptr += nr_cpu_ids * sizeof(void **);\n#endif\n#ifdef CONFIG_RT_GROUP_SCHED\n\t\troot_task_group.rt_se = (struct sched_rt_entity **)ptr;\n\t\tptr += nr_cpu_ids * sizeof(void **);\n\n\t\troot_task_group.rt_rq = (struct rt_rq **)ptr;\n\t\tptr += nr_cpu_ids * sizeof(void **);\n\n#endif\n```\n\nAs I already mentioned, the Linux group scheduling mechanism allows us to specify a hierarchy. The root of such hierarchies is the `root_task_group` task group structure. This structure contains many fields, but we are interested in `se`, `rt_se`, `cfs_rq` and `rt_rq` for this moment:\n\nThe first two point to scheduling entities: `se` to `sched_entity` structures and `rt_se` to `sched_rt_entity` structures. The `sched_entity` structure is defined in the [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h) kernel header file and is used by the scheduler as a unit of scheduling.\n\n```C\nstruct task_group {\n    ...\n    ...\n    struct sched_entity **se;\n    struct cfs_rq **cfs_rq;\n    ...\n    ...\n}\n```\n\nThe `cfs_rq` and `rt_rq` represent `run queues`. 
A `run queue` is a special `per-cpu` structure that is used by the Linux kernel scheduler to store `active` threads or in other words set of threads which potentially will be picked up by the scheduler to run.\n\nThe space is allocated and the next step is to initialize a `bandwidth` of CPU for `real-time` and `deadline` tasks:\n\n```C\ninit_rt_bandwidth(&def_rt_bandwidth,\n                  global_rt_period(), global_rt_runtime());\ninit_dl_bandwidth(&def_dl_bandwidth,\n                  global_rt_period(), global_rt_runtime());\n```\n\nAll groups have to be able to rely on the amount of CPU time. The two following structures: `def_rt_bandwidth` and `def_dl_bandwidth` represent default values of bandwidths for `real-time` and `deadline` tasks. We will not look at definition of these structures as it is not so important for now, but we are interested in two following values:\n\n* `sched_rt_period_us`;\n* `sched_rt_runtime_us`.\n\nThe first represents a period and the second represents quantum that is allocated for `real-time` tasks during `sched_rt_period_us`. You may see the global values of these parameters in:\n\n```\n$ cat /proc/sys/kernel/sched_rt_period_us\n1000000\n\n$ cat /proc/sys/kernel/sched_rt_runtime_us\n950000\n```\n\nThe values related to a group can be configured in `<cgroup>/cpu.rt_period_us` and `<cgroup>/cpu.rt_runtime_us`. Since no filesystem is mounted yet, the `def_rt_bandwidth` and the `def_dl_bandwidth` will be initialized with default values which will be returned by the `global_rt_period` and `global_rt_runtime` functions.\n\nThat's all with the bandwidths of `real-time` and `deadline` tasks and in the next step, depending on whether [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing) is enabled, we make initialization of the `root domain`:\n\n```C\n#ifdef CONFIG_SMP\n\tinit_defrootdomain();\n#endif\n```\n\nThe real-time scheduler requires global resources to make scheduling decisions. 
But unfortunately scalability bottlenecks appear as the number of CPUs increase. The concept of `root domains` was introduced for improving scalability and avoid such bottlenecks. Instead of bypassing over all `run queues`, the scheduler gets information about a CPU where/from to push/pull a `real-time` task from the `root_domain` structure. This structure is defined in the [kernel/sched/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/sched.h) kernel header file and just keeps track of CPUs that can be used to push or pull a process.\n\nAfter `root domain` initialization, we make initialization of the `bandwidth` for the `real-time` tasks of the `root task group` as we did the same above:\n```C\n#ifdef CONFIG_RT_GROUP_SCHED\n\tinit_rt_bandwidth(&root_task_group.rt_bandwidth,\n\t\t\tglobal_rt_period(), global_rt_runtime());\n#endif\n```\n\nwith the same default values.\n\nIn the next step, depends on the `CONFIG_CGROUP_SCHED` kernel configuration option we allocate `slab` cache for `task_group(s)` and initialize the `siblings` and `children` lists of the root task group. As we can read from the documentation, the `CONFIG_CGROUP_SCHED` is:\n\n```\nThis option allows you to create arbitrary task groups using the \"cgroup\" pseudo\nfilesystem and control the cpu bandwidth allocated to each such task group.\n```\n\nAs we finished with the lists initialization, we can see the call of the `autogroup_init` function:\n\n```C\n#ifdef CONFIG_CGROUP_SCHED\n         list_add(&root_task_group.list, &task_groups);\n         INIT_LIST_HEAD(&root_task_group.children);\n         INIT_LIST_HEAD(&root_task_group.siblings);\n         autogroup_init(&init_task);\n#endif\n```\n\nwhich initializes automatic process group scheduling. 
The `autogroup` feature is about automatic creation and population of a new task group during creation of a new session via [setsid](https://linux.die.net/man/2/setsid) call.\n\nAfter this we are going through the all `possible` CPUs (you can remember that `possible` CPUs are stored in the `cpu_possible_mask` bitmap that can ever be available in the system) and initialize a `runqueue` for each `possible` cpu:\n\n```C\nfor_each_possible_cpu(i) {\n    struct rq *rq;\n    ...\n    ...\n    ...\n```\n\nThe `rq` structure in the Linux kernel is defined in the [kernel/sched/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/sched.h#L625). As I already mentioned this above, a `run queue` is a fundamental data structure in a scheduling process. The scheduler uses it to determine who will be ran next. As you may see, this structure has many different fields and we will not cover all of them here, but we will look on them when they will be directly used.\n\nAfter initialization of `per-cpu` run queues with default values, we need to setup `load weight` of the first task in the system:\n\n```C\nset_load_weight(&init_task);\n```\n\nFirst of all let's try to understand what is it `load weight` of a process. If you will look at the definition of the `sched_entity` structure, you will see that it starts from the `load` field:\n\n```C\nstruct sched_entity {\n\tstruct load_weight\t\tload;\n    ...\n    ...\n    ...\n}\n```\n\nrepresented by the `load_weight` structure which just contains two fields that represent actual load weight of a scheduler entity and its invariant value:\n\n```C\nstruct load_weight {\n\tunsigned long\tweight;\n\tu32\t\t\t\tinv_weight;\n};\n```\n\nYou already may know that each process in the system has `priority`. The higher priority allows to get more time to run. A `load weight` of a process is a relation between priority of this process and timeslices of this process. 
Each process has three following fields related to priority:\n\n```C\nstruct task_struct {\n...\n...\n...\n\tint\t\t\t\tprio;\n\tint\t\t\t\tstatic_prio;\n\tint\t\t\t\tnormal_prio;\n...\n...\n...\n}\n```\n\nThe first one is `dynamic priority` which can be changed during lifetime of a process based on its static priority and interactivity of the process. The `static_prio` contains initial priority most likely well-known to you `nice value`. This value is not changed by the kernel if a user does not change it. The last one is `normal_prio` based on the value of the `static_prio` too, but also it depends on the scheduling policy of a process.\n\nSo the main goal of the `set_load_weight` function is to initialize `load_weight` fields for the `init` task:\n\n```C\nstatic void set_load_weight(struct task_struct *p)\n{\n\tint prio = p->static_prio - MAX_RT_PRIO;\n\tstruct load_weight *load = &p->se.load;\n\n\tif (idle_policy(p->policy)) {\n\t\tload->weight = scale_load(WEIGHT_IDLEPRIO);\n\t\tload->inv_weight = WMULT_IDLEPRIO;\n\t\treturn;\n\t}\n\n\tload->weight = scale_load(sched_prio_to_weight[prio]);\n\tload->inv_weight = sched_prio_to_wmult[prio];\n}\n```\n\nAs you may see we calculate initial `prio` from the initial value of the `static_prio` of the `init` task and use it as index of `sched_prio_to_weight` and `sched_prio_to_wmult` arrays to set `weight` and `inv_weight` values. These two arrays contain a `load weight` that depends on the priority value. In the case when a process is an `idle` process, we set minimal load weight.\n\nFor this moment we came to the end of initialization process of the Linux kernel scheduler. The last steps are: to make current process (it will be the first `init` process) `idle` that will be run when a cpu has no other process to run. 
Then we calculate the time of the next CPU load calculation and initialize the `fair` class:\n\n```C\n__init void init_sched_fair_class(void)\n{\n#ifdef CONFIG_SMP\n\topen_softirq(SCHED_SOFTIRQ, run_rebalance_domains);\n#endif\n}\n```\n\nHere we register a [soft irq](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9) that will call the `run_rebalance_domains` handler. After the `SCHED_SOFTIRQ` is triggered, the `run_rebalance_domains` will be called to rebalance a run queue on the current CPU.\n\nThe last two steps of the `sched_init` function are the initialization of scheduler statistics and setting the `scheduler_running` variable:\n\n```C\nscheduler_running = 1;\n```\n\nThat's all. Linux kernel scheduler is initialized. Of course, we have skipped many different details and explanations here, because we need to know and understand how different concepts (like process and process groups, runqueue, rcu, etc.) work in the Linux kernel, but we took a short look on the scheduler initialization process. We will look at all other details in the separate part which will be fully dedicated to the scheduler.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the eighth part about the Linux kernel initialization process. In this part, we looked on the initialization process of the scheduler and we will continue in the next part to dive in the linux kernel initialization process and will see initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and many other initialization stuff in the next part.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [high-resolution kernel timer](https://www.kernel.org/doc/Documentation/timers/hrtimers.txt)\n* [spinlock](http://en.wikipedia.org/wiki/Spinlock)\n* [Run queue](http://en.wikipedia.org/wiki/Run_queue)\n* [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm)\n* [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29)\n* [virtual file system](http://en.wikipedia.org/wiki/Virtual_file_system)\n* [Linux kernel hotplug documentation](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)\n* [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [Global Descriptor Table](http://en.wikipedia.org/wiki/Global_Descriptor_Table)\n* [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [SMP](http://en.wikipedia.org/wiki/Symmetric_multiprocessing)\n* [RCU](http://en.wikipedia.org/wiki/Read-copy-update)\n* [CFS Scheduler documentation](https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt)\n* [Real-Time group scheduling](https://www.kernel.org/doc/Documentation/scheduler/sched-rt-group.txt)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7)\n"
  },
  {
    "path": "Initialization/linux-initialization-9.md",
    "content": "Kernel initialization. Part 9.\n================================================================================\n\nRCU initialization\n================================================================================\n\nThis is ninth part of the [Linux Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the previous part we stopped at the [scheduler initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8). In this part we will continue to dive to the Linux kernel initialization process and the main purpose of this part will be to learn about initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). We can see that the next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) after the `sched_init` is the call of the `preempt_disable`. There are two macros:\n\n* `preempt_disable`\n* `preempt_enable`\n\nfor preemption disabling and enabling. First of all let's try to understand what is `preempt` in the context of an operating system kernel. In simple words, preemption is ability of the operating system kernel to preempt current task to run task with higher priority. Here we need to disable preemption because we will have only one `init` process for the early boot time and we don't need to stop it before we call `cpu_idle` function. The `preempt_disable` macro is defined in the [include/linux/preempt.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/preempt.h) and depends on the `CONFIG_PREEMPT_COUNT` kernel configuration option. This macro is implemented as:\n\n```C\n#define preempt_disable() \\\ndo { \\\n        preempt_count_inc(); \\\n        barrier(); \\\n} while (0)\n```\n\nand if `CONFIG_PREEMPT_COUNT` is not set just:\n\n```C\n#define preempt_disable()                       barrier()\n```\n\nLet's look on it. 
First of all we can see one difference between these macro implementations. The `preempt_disable` with `CONFIG_PREEMPT_COUNT` set contains the call of the `preempt_count_inc`. There is a special `percpu` variable which stores the number of held locks and `preempt_disable` calls:\n\n```C\nDECLARE_PER_CPU(int, __preempt_count);\n```\n\nIn the first implementation of the `preempt_disable` we increment this `__preempt_count`. There is an API for returning the value of the `__preempt_count`, it is the `preempt_count` function. As we called `preempt_disable`, first of all we increment preemption counter with the `preempt_count_inc` macro which expands to the:\n\n```\n#define preempt_count_inc() preempt_count_add(1)\n#define preempt_count_add(val)  __preempt_count_add(val)\n```\n\nwhere `preempt_count_add` calls the `raw_cpu_add_4` macro which adds `1` to the given `percpu` variable (`__preempt_count`) in our case (more about `percpu` variables you can read in the part about [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)). Ok, we increased `__preempt_count` and in the next step we can see the call of the `barrier` macro in both macros. The `barrier` macro inserts an optimization barrier. In the processors with `x86_64` architecture independent memory access operations can be performed in any order. That's why we need a way to tell the compiler and the processor to preserve a given order. This mechanism is a memory barrier. Let's consider a simple example:\n\n```C\npreempt_disable();\nfoo();\npreempt_enable();\n```\n\nCompiler can rearrange it as:\n\n```C\npreempt_disable();\npreempt_enable();\nfoo();\n```\n\nIn this case non-preemptible function `foo` can be preempted. As we put `barrier` macro in the `preempt_disable` and `preempt_enable` macros, it prevents the compiler from swapping `preempt_count_inc` with other statements. 
More about barriers you can read [here](http://en.wikipedia.org/wiki/Memory_barrier) and [here](https://www.kernel.org/doc/Documentation/memory-barriers.txt).\n\nIn the next step we can see following statement:\n\n```C\nif (WARN(!irqs_disabled(),\n\t \"Interrupts were enabled *very* early, fixing it\\n\"))\n\tlocal_irq_disable();\n```\n\nwhich check [IRQs](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) state, and disabling (with `cli` instruction for `x86_64`) if they are enabled.\n\nThat's all. Preemption is disabled and we can go ahead.\n\nInitialization of the integer ID management\n--------------------------------------------------------------------------------\n\nIn the next step we can see the call of the `idr_init_cache` function which defined in the [lib/idr.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/idr.c). The `idr` library is used in a various [places](http://lxr.free-electrons.com/ident?i=idr_find) in the Linux kernel to manage assigning integer `IDs` to objects and looking up objects by id.\n\nLet's look on the implementation of the `idr_init_cache` function:\n\n```C\nvoid __init idr_init_cache(void)\n{\n        idr_layer_cache = kmem_cache_create(\"idr_layer_cache\",\n                                sizeof(struct idr_layer), 0, SLAB_PANIC, NULL);\n}\n```\n\nHere we can see the call of the `kmem_cache_create`. We already called the `kmem_cache_init` in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c#L485). This function create generalized caches again using the `kmem_cache_alloc` (more about caches we will see in the [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm) chapter). In our case, as we are using `kmem_cache_t` which will be used by the [slab](http://en.wikipedia.org/wiki/Slab_allocation) allocator and `kmem_cache_create` creates it. 
As you can see we pass five parameters to the `kmem_cache_create`:\n\n* name of the cache;\n* size of the object to store in cache;\n* offset of the first object in the page;\n* flags;\n* constructor for the objects.\n\nand it will create `kmem_cache` for the integer IDs. Integer `IDs` are a commonly used pattern to map a set of integer IDs to a set of pointers. We can see usage of the integer IDs in the [i2c](http://en.wikipedia.org/wiki/I%C2%B2C) drivers subsystem. For example [drivers/i2c/i2c-core-base.c](https://github.com/torvalds/linux/blob/master/drivers/i2c/i2c-core-base.c) which represents the core of the `i2c` subsystem defines `ID` for the `i2c` adapter with the `DEFINE_IDR` macro:\n\n```C\nstatic DEFINE_IDR(i2c_adapter_idr);\n```\n\nand then uses it for the declaration of the `i2c` adapter:\n\n```C\nstatic int __i2c_add_numbered_adapter(struct i2c_adapter *adap)\n{\n  int     id;\n  ...\n  ...\n  ...\n  id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, GFP_KERNEL);\n  ...\n  ...\n  ...\n}\n```\n\nand `i2c_adapter_idr` presents dynamically calculated bus number.\n\nMore about integer ID management you can read [here](https://lwn.net/Articles/103209/).\n\nRCU initialization\n--------------------------------------------------------------------------------\n\nThe next step is [RCU](http://en.wikipedia.org/wiki/Read-copy-update) initialization with the `rcu_init` function and its implementation depends on two kernel configuration options:\n\n* `CONFIG_TINY_RCU`\n* `CONFIG_TREE_RCU`\n\nIn the first case `rcu_init` will be in the [kernel/rcu/tiny.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tiny.c) and in the second case it will be defined in the [kernel/rcu/tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c). 
We will see the implementation of the `tree rcu`, but first of all about the `RCU` in general.\n\n`RCU` or read-copy update is a scalable high-performance synchronization mechanism implemented in the Linux kernel. On the early stage the Linux kernel provided support and environment for the concurrently running applications, but all execution was serialized in the kernel using a single global lock. In our days linux kernel has no single global lock, but provides different mechanisms including [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure), [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) data structures and other. One of these mechanisms is - the `read-copy update`. The `RCU` technique is designed for rarely-modified data structures. The idea of the `RCU` is simple. For example we have a rarely-modified data structure. If somebody wants to change this data structure, we make a copy of this data structure and make all changes in the copy. In the same time all other users of the data structure use old version of it. Next, we need to choose safe moment when original version of the data structure will have no users and update it with the modified copy.\n\nOf course this description of the `RCU` is very simplified. To understand some details about `RCU`, first of all we need to learn some terminology. Data readers in the `RCU` executed in the [critical section](http://en.wikipedia.org/wiki/Critical_section). Every time when data reader get to the critical section, it calls the `rcu_read_lock`, and `rcu_read_unlock` on exit from the critical section. If the thread is not in the critical section, it will be in state which called - `quiescent state`. The moment when every thread is in the `quiescent state` called - `grace period`. If a thread wants to remove an element from the data structure, this occurs in two steps. 
The first step is `removal` - it atomically removes the element from the data structure, but does not release the physical memory. After this the thread-writer announces the removal and waits until the `grace period` is finished. From this moment, the removed element is no longer reachable by new thread-readers, although readers that were already in their critical sections may still be using it. After the `grace period` finished, the second step of the element removal will be started, it just removes the element from the physical memory.\n\nThere are a couple of implementations of the `RCU`. The old `RCU` is called classic, the new implementation is called `tree` RCU. As you may already understand, the `CONFIG_TREE_RCU` kernel configuration option enables tree `RCU`. Another is the `tiny` RCU which depends on `CONFIG_TINY_RCU` and `CONFIG_SMP=n`. We will see more details about the `RCU` in general in the separate chapter about synchronization primitives, but now let's look on the `rcu_init` implementation from the [kernel/rcu/tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c):\n\n```C\nvoid __init rcu_init(void)\n{\n         int cpu;\n\n         rcu_bootup_announce();\n         rcu_init_geometry();\n         rcu_init_one(&rcu_bh_state, &rcu_bh_data);\n         rcu_init_one(&rcu_sched_state, &rcu_sched_data);\n         __rcu_init_preempt();\n         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);\n\n         /*\n          * We don't need protection against CPU-hotplug here because\n          * this is called early in boot, before either interrupts\n          * or the scheduler are operational.\n          */\n         cpu_notifier(rcu_cpu_notify, 0);\n         pm_notifier(rcu_pm_notify, 0);\n         for_each_online_cpu(cpu)\n                 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);\n\n         rcu_early_boot_tests();\n}\n```\n\nIn the beginning of the `rcu_init` function we define `cpu` variable and call `rcu_bootup_announce`. 
The `rcu_bootup_announce` function is pretty simple:\n\n```C\nstatic void __init rcu_bootup_announce(void)\n{\n        pr_info(\"Hierarchical RCU implementation.\\n\");\n        rcu_bootup_announce_oddness();\n}\n```\n\nIt just prints information about the `RCU` with the `pr_info` function and `rcu_bootup_announce_oddness` which uses `pr_info` too, for printing different information about the current `RCU` configuration which depends on different kernel configuration options like `CONFIG_RCU_TRACE`, `CONFIG_PROVE_RCU`, `CONFIG_RCU_FANOUT_EXACT`, etc. In the next step, we can see the call of the `rcu_init_geometry` function. This function is defined in the same source code file and computes the node tree geometry depends on the amount of CPUs. Actually `RCU` provides scalability with extremely low internal RCU lock contention. What if a data structure will be read from the different CPUs? `RCU` API provides the `rcu_state` structure which presents RCU global state including node hierarchy. Hierarchy is presented by the:\n\n```\nstruct rcu_node node[NUM_RCU_NODES];\n```\n\narray of structures. As we can read in the comment of above definition:\n\n```\nThe root (first level) of the hierarchy is in ->node[0] (referenced by ->level[0]), the second\nlevel in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), and the third level\nin ->node[m+1] and following (->node[m+1] referenced by ->level[2]).  The number of levels is\ndetermined by the number of CPUs and by CONFIG_RCU_FANOUT.\n\nSmall systems will have a \"hierarchy\" consisting of a single rcu_node.\n```\n\nThe `rcu_node` structure is defined in the [kernel/rcu/tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.h) and contains information about current grace period, is grace period completed or not, CPUs or groups that need to switch in order for current grace period to proceed, etc. Every `rcu_node` contains a lock for a couple of CPUs. 
These `rcu_node` structures are embedded into a linear array in the `rcu_state` structure and represented as a tree with the root as the first element and covers all CPUs. As you can see the number of the rcu nodes determined by the `NUM_RCU_NODES` which depends on number of available CPUs:\n\n```C\n#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)\n#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)\n```\n\nwhere levels values depend on the `CONFIG_RCU_FANOUT_LEAF` configuration option. For example for the simplest case, one `rcu_node` will cover two CPU on machine with the eight CPUs:\n\n```\n+-----------------------------------------------------------------+\n|  rcu_state                                                      |\n|                 +----------------------+                        |\n|                 |         root         |                        |\n|                 |       rcu_node       |                        |\n|                 +----------------------+                        |\n|                    |                |                           |\n|               +----v-----+       +--v-------+                   |\n|               |          |       |          |                   |\n|               | rcu_node |       | rcu_node |                   |\n|               |          |       |          |                   |\n|         +------------------+     +----------------+             |\n|         |                  |        |             |             |\n|         |                  |        |             |             |\n|    +----v-----+    +-------v--+   +-v--------+  +-v--------+    |\n|    |          |    |          |   |          |  |          |    |\n|    | rcu_node |    | rcu_node |   | rcu_node |  | rcu_node |    |\n|    |          |    |          |   |          |  |          |    |\n|    +----------+    +----------+   +----------+  +----------+    |\n|         |                 |             |        
       |       |\n|         |                 |             |               |       |\n|         |                 |             |               |       |\n|         |                 |             |               |       |\n+---------|-----------------|-------------|---------------|-------+\n          |                 |             |               |\n+---------v-----------------v-------------v---------------v--------+\n|                 |                |               |               |\n|     CPU1        |      CPU3      |      CPU5     |     CPU7      |\n|                 |                |               |               |\n|     CPU2        |      CPU4      |      CPU6     |     CPU8      |\n|                 |                |               |               |\n+------------------------------------------------------------------+\n```\n\nSo, in the `rcu_init_geometry` function we just need to calculate the total number of `rcu_node` structures. We start to do it with the calculation of the `jiffies` till to the first and next `fqs` which is `force-quiescent-state` (read above about it):\n\n```C\nd = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;\nif (jiffies_till_first_fqs == ULONG_MAX)\n        jiffies_till_first_fqs = d;\nif (jiffies_till_next_fqs == ULONG_MAX)\n        jiffies_till_next_fqs = d;\n```\n\nwhere:\n\n```C\n#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))\n#define RCU_JIFFIES_FQS_DIV     256\n```\n\nAs we calculated these [jiffies](http://en.wikipedia.org/wiki/Jiffy_%28time%29), we check that previous defined `jiffies_till_first_fqs` and `jiffies_till_next_fqs` variables are equal to the [ULONG_MAX](http://www.rowleydownload.co.uk/avr/documentation/index.htm?http://www.rowleydownload.co.uk/avr/documentation/ULONG_MAX.htm) (their default values) and set they equal to the calculated value. 
As we did not touch these variables before, they are equal to the `ULONG_MAX`:\n\n```C\nstatic ulong jiffies_till_first_fqs = ULONG_MAX;\nstatic ulong jiffies_till_next_fqs = ULONG_MAX;\n```\n\nIn the next step of the `rcu_init_geometry`, we check that `rcu_fanout_leaf` didn't change (it has the same value as `CONFIG_RCU_FANOUT_LEAF` in compile-time) and equal to the value of the `CONFIG_RCU_FANOUT_LEAF` configuration option, we just return:\n\n```C\nif (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&\n    nr_cpu_ids == NR_CPUS)\n    return;\n```\n\nAfter this we need to compute the number of nodes that an `rcu_node` tree can handle with the given number of levels:\n\n```C\nrcu_capacity[0] = 1;\nrcu_capacity[1] = rcu_fanout_leaf;\nfor (i = 2; i <= MAX_RCU_LVLS; i++)\n    rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;\n```\n\nAnd in the last step we calculate the number of rcu_nodes at each level of the tree in the [loop](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c#L4094).\n\nAs we calculated geometry of the `rcu_node` tree, we need to go back to the `rcu_init` function and next step we need to initialize two `rcu_state` structures with the `rcu_init_one` function:\n\n```C\nrcu_init_one(&rcu_bh_state, &rcu_bh_data);\nrcu_init_one(&rcu_sched_state, &rcu_sched_data);\n```\n\nThe `rcu_init_one` function takes two arguments:\n\n* Global `RCU` state;\n* Per-CPU data for `RCU`.\n\nBoth variables defined in the [kernel/rcu/tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.h) with its `percpu` data:\n\n```\nextern struct rcu_state rcu_bh_state;\nDECLARE_PER_CPU(struct rcu_data, rcu_bh_data);\n```\n\nAbout these states you can read [here](http://lwn.net/Articles/264090/). As I wrote above we need to initialize `rcu_state` structures and `rcu_init_one` function will help us with it. 
After the `rcu_state` initialization, we can see the call of the ` __rcu_init_preempt` which depends on the `CONFIG_PREEMPT_RCU` kernel configuration option. It does the same as previous functions - initialization of the `rcu_preempt_state` structure with the `rcu_init_one` function which has `rcu_state` type. After this, in the `rcu_init`, we can see the call of the:\n\n```C\nopen_softirq(RCU_SOFTIRQ, rcu_process_callbacks);\n```\n\nfunction. This function registers a handler of the `pending interrupt`. Pending interrupt or `softirq` supposes that part of actions can be delayed for later execution when the system is less loaded. Pending interrupts is represented by the following structure:\n\n```C\nstruct softirq_action\n{\n        void    (*action)(struct softirq_action *);\n};\n```\n\nwhich is defined in the [include/linux/interrupt.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/interrupt.h) and contains only one field - handler of an interrupt. 
You can check about `softirqs` in the your system with the:\n\n```\n$ cat /proc/softirqs\n                    CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7\n          HI:          2          0          0          1          0          2          0          0\n       TIMER:     137779     108110     139573     107647     107408     114972      99653      98665\n      NET_TX:       1127          0          4          0          1          1          0          0\n      NET_RX:        334        221     132939       3076        451        361        292        303\n       BLOCK:       5253       5596          8        779       2016      37442         28       2855\nBLOCK_IOPOLL:          0          0          0          0          0          0          0          0\n     TASKLET:         66          0       2916        113          0         24      26708          0\n       SCHED:     102350      75950      91705      75356      75323      82627      69279      69914\n     HRTIMER:        510        302        368        260        219        255        248        246\n         RCU:      81290      68062      82979      69015      68390      69385      63304      63473\n```\n\nThe `open_softirq` function takes two parameters:\n\n* index of the interrupt;\n* interrupt handler.\n\nand adds interrupt handler to the array of the pending interrupts:\n\n```C\nvoid open_softirq(int nr, void (*action)(struct softirq_action *))\n{\n        softirq_vec[nr].action = action;\n}\n```\n\nIn our case the interrupt handler is - `rcu_process_callbacks` which is defined in the [kernel/rcu/tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c) and does the `RCU` core processing for the current CPU. 
After we registered `softirq` interrupt for the `RCU`, we can see the following code:\n\n```C\ncpu_notifier(rcu_cpu_notify, 0);\npm_notifier(rcu_pm_notify, 0);\nfor_each_online_cpu(cpu)\n    rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);\n```\n\nHere we can see registration of the `cpu` notifier which needs in systems which supports [CPU hotplug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) and we will not dive into details about this theme. The last function in the `rcu_init` is the `rcu_early_boot_tests`:\n\n```C\nvoid rcu_early_boot_tests(void)\n{\n        pr_info(\"Running RCU self tests\\n\");\n\n        if (rcu_self_test)\n                 early_boot_test_call_rcu();\n         if (rcu_self_test_bh)\n                 early_boot_test_call_rcu_bh();\n         if (rcu_self_test_sched)\n                early_boot_test_call_rcu_sched();\n}\n```\n\nwhich runs self tests for the `RCU`.\n\nThat's all. We saw initialization process of the `RCU` subsystem. As I wrote above, more about the `RCU` will be in the separate chapter about synchronization primitives.\n\nRest of the initialization process\n--------------------------------------------------------------------------------\n\nOk, we already passed the main theme of this part which is `RCU` initialization, but it is not the end of the Linux kernel initialization process. In the last paragraph of this theme we will see a couple of functions which work in the initialization time, but we will not dive into deep details around this function for different reasons. 
Some reasons not to dive into details are following:\n\n* They are not very important for the generic kernel initialization process and depend on the different kernel configuration;\n* They have the character of debugging and not important for now;\n* We will see many of this stuff in the separate parts/chapters.\n\nAfter we initialized `RCU`, the next step which you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the - `trace_init` function. As you can understand from its name, this function initialize [tracing](http://en.wikipedia.org/wiki/Tracing_%28software%29) subsystem. You can read more about Linux kernel trace system - [here](http://elinux.org/Kernel_Trace_Systems).\n\nAfter the `trace_init`, we can see the call of the `radix_tree_init`. If you are familiar with the different data structures, you can understand from the name of this function that it initializes kernel implementation of the [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). This function is defined in the [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c) and you can read more about it in the part about [Radix tree](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-2).\n\nIn the next step we can see the functions which are related to the `interrupts handling` subsystem, they are:\n\n* `early_irq_init`\n* `init_IRQ`\n* `softirq_init`\n\nWe will see explanation about this functions and their implementation in the special part about interrupts and exceptions handling. After this many different functions (like `init_timers`, `hrtimers_init`, `time_init`, etc.) which are related to different timing and timers stuff. 
We will see more about these functions in the chapter about timers.\n\nThe next couple of functions are related to the [perf](https://perf.wiki.kernel.org/index.php/Main_Page) events - `perf_event_init` (there will be a separate chapter about perf), and initialization of the `profiling` with the `profile_init`. After this we enable `irq` with the call of the:\n\n```C\nlocal_irq_enable();\n```\n\nwhich expands to the `sti` instruction, and then make the post initialization of the [SLAB](http://en.wikipedia.org/wiki/Slab_allocation) with the call of the `kmem_cache_init_late` function (as I wrote above, we will learn about the `SLAB` in the [Linux memory management](https://0xax.gitbook.io/linux-insides/summary/mm) chapter).\n\nAfter the post initialization of the `SLAB`, the next point is the initialization of the console with the `console_init` function from the [drivers/tty/tty_io.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/tty_io.c).\n\nAfter the console initialization, we can see the `lockdep_info` function which prints information about the [Lock dependency validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt). 
After this, we can see the initialization of the dynamic allocation of the `debug objects` with the `debug_objects_mem_init`, kernel memory leak [detector](https://www.kernel.org/doc/Documentation/kmemleak.txt) initialization with the `kmemleak_init`, `percpu` pageset setup with the `setup_per_cpu_pageset`, setup of the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) policy with the `numa_policy_init`, setting time for the scheduler with the `sched_clock_init`, `pidmap` initialization with the call of the `pidmap_init` function for the initial `PID` namespace, cache creation with the `anon_vma_init` for the private virtual memory areas and early initialization of the [ACPI](http://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) with the `acpi_early_init`.\n\nThis is the end of the ninth part of the [linux kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and here we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). In the last paragraph of this part (`Rest of the initialization process`) we went through many functions but did not dive into details about their implementations. Do not worry if you do not know anything about this stuff, or if you know of it but do not understand it yet. As I already wrote many times, we will see details of implementations in other parts or other chapters.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the ninth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). In this part, we looked at the initialization process of the `RCU` subsystem. 
In the next part we will continue to dive into the Linux kernel initialization process and I hope that we will finish with the `start_kernel` function and will go to the `rest_init` function from the same [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file and will see the start of the first process.\n\nIf you have any questions or suggestions, write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure)\n* [kmemleak](https://www.kernel.org/doc/Documentation/kmemleak.txt)\n* [ACPI](http://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface)\n* [IRQs](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [RCU](http://en.wikipedia.org/wiki/Read-copy-update)\n* [RCU documentation](https://github.com/torvalds/linux/tree/master/Documentation/RCU)\n* [integer ID management](https://lwn.net/Articles/103209/)\n* [Documentation/memory-barriers.txt](https://www.kernel.org/doc/Documentation/memory-barriers.txt)\n* [Runtime locking correctness validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm)\n* [slab](http://en.wikipedia.org/wiki/Slab_allocation)\n* [i2c](http://en.wikipedia.org/wiki/I%C2%B2C)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8)\n"
  },
  {
    "path": "Interrupts/README.md",
    "content": "# Interrupts and Interrupt Handling\n\nIn the following posts, we will cover interrupts and exceptions handling in the Linux kernel.\n\n* [Interrupts and Interrupt Handling. Part 1.](linux-interrupts-1.md) - describes interrupts and interrupt handling theory.\n* [Interrupts in the Linux Kernel](linux-interrupts-2.md) - describes stuffs related to interrupts and exceptions handling from the early stage.\n* [Early interrupt handlers](linux-interrupts-3.md) - describes early interrupt handlers.\n* [Interrupt handlers](linux-interrupts-4.md) - describes first non-early interrupt handlers.\n* [Implementation of exception handlers](linux-interrupts-5.md) - describes implementation of some exception handlers such as double fault, divide by zero etc.\n* [Handling non-maskable interrupts](linux-interrupts-6.md) - describes handling of non-maskable interrupts and remaining interrupt handlers from the architecture-specific part.\n* [External hardware interrupts](linux-interrupts-7.md) - describes early initialization of code which is related to handling external hardware interrupts.\n* [Non-early initialization of the IRQs](linux-interrupts-8.md) - describes non-early initialization of code which is related to handling external hardware interrupts.\n* [Softirq, Tasklets and Workqueues](linux-interrupts-9.md) - describes softirqs, tasklets and workqueues concepts.\n* [Last part](linux-interrupts-10.md) - this is the last part of the `Interrupts and Interrupt Handling` chapter and here we will see a real hardware driver and some interrupts related stuff.\n"
  },
  {
    "path": "Interrupts/linux-interrupts-1.md",
    "content": "Interrupts and Interrupt Handling. Part 1.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the first part of the new chapter of the [linux insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book. We have come a long way in the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) of this book. We started from the earliest [steps](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of kernel initialization and finished with the [launch](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-10) of the first `init` process. Yes, we saw several initialization steps which are related to the various kernel subsystems. But we did not dig deep into the details of these subsystems. With this chapter, we will try to understand how the various kernel subsystems work and how they are implemented. As you can already understand from the chapter's title, the first subsystem will be [interrupts](http://en.wikipedia.org/wiki/Interrupt).\n\nWhat is an Interrupt?\n--------------------------------------------------------------------------------\n\nWe have already heard of the word `interrupt` in several parts of this book. We even saw a couple of examples of interrupt handlers. In the current chapter we will start from the theory, i.e.\n\n* What are `interrupts` ?\n* What are `interrupt handlers`?\n\nWe will then continue to dig deeper into the details of `interrupts` and how the Linux kernel handles them.\n\nThe first question that arises in our mind when we come across word `interrupt` is `What is an interrupt?` An interrupt is an `event` raised by software or hardware when it needs the CPU's attention. For example, we press a button on the keyboard and what do we expect next? 
What should the operating system and computer do after this? To simplify matters, assume that each peripheral device has an interrupt line to the CPU. A device can use it to signal an interrupt to the CPU. However, interrupts are not signaled directly to the CPU. In the old machines there was a [PIC](http://en.wikipedia.org/wiki/Programmable_Interrupt_Controller) which is a chip responsible for sequentially processing multiple interrupt requests from multiple devices. In the new machines there is an [Advanced Programmable Interrupt Controller](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) commonly known as - `APIC`. An `APIC` consists of two separate devices:\n\n* `Local APIC`\n* `I/O APIC`\n\nThe first - `Local APIC` is located on each CPU core. The local APIC is responsible for handling the CPU-specific interrupt configuration. The local APIC is usually used to manage interrupts from the APIC-timer, thermal sensor and any other such locally connected I/O devices.\n\nThe second - `I/O APIC` provides multi-processor interrupt management. It is used to distribute external interrupts among the CPU cores. More about the local and I/O APICs will be covered later in this chapter. As you can understand, interrupts can occur at any time. When an interrupt occurs, the operating system must handle it immediately. But what does it mean `to handle an interrupt`? When an interrupt occurs, the  operating system must ensure the following steps:\n\n* The kernel must pause execution of the current process; (preempt current task);\n* The kernel must search for the handler of the interrupt and transfer control (execute interrupt handler);\n* After the interrupt handler completes execution, the interrupted process can resume execution.\n\nOf course there are numerous intricacies involved in this procedure of handling interrupts. 
But the above 3 steps form the basic skeleton of the procedure.\n\nAddresses of each of the interrupt handlers are maintained in a special location referred to as the - `Interrupt Descriptor Table` or `IDT`. The processor uses a unique number for recognizing the type of interruption or exception. This number is called - `vector number`. A vector number is an index in the `IDT`. There is a limited amount of the vector numbers and it can be from `0` to `255`. You can note the following range-check upon the vector number within the Linux kernel source-code:\n\n```C\nBUG_ON((unsigned)n > 0xFF);\n```\n\nYou can find this check within the Linux kernel source code related to interrupt setup (e.g. The `set_intr_gate` in [arch/x86/kernel/idt.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/idt.c)). The first 32 vector numbers from `0` to `31` are reserved by the processor and used for the processing of architecture-defined exceptions and interrupts. You can find the table with the description of these vector numbers in the second part of the Linux kernel initialization process - [Early interrupt and exception handling](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2). Vector numbers from `32` to `255` are designated as user-defined interrupts and are not reserved by the processor. These interrupts are generally assigned to external I/O devices to enable those devices to send interrupts to the processor.\n\nNow let's talk about the types of interrupts. Broadly speaking, we can split interrupts into 2 major classes:\n\n* External or hardware generated interrupts\n* Software-generated interrupts\n\nThe first - external interrupts are received through the `Local APIC` or pins on the processor which are connected to the `Local APIC`. The second - software-generated interrupts are caused by an exceptional condition in the processor itself (sometimes using special architecture-specific instructions). 
A common example of an exceptional condition is `division by zero`. Another example is exiting a program with the `syscall` instruction.\n\nAs mentioned earlier, an interrupt can occur at any time for a reason which the code and CPU have no control over. On the other hand, exceptions are `synchronous` with program execution and can be classified into 3 categories:\n\n* `Faults`\n* `Traps`\n* `Aborts`\n\nA `fault` is an exception reported before the execution of a \"faulty\" instruction (which can then be corrected). Once corrected, it allows the interrupted program to resume.\n\nNext, a `trap` is an exception, which is reported immediately following the execution of the `trap` instruction. Traps also allow the interrupted program to be continued just as a `fault` does.\n\nFinally, an `abort` is an exception that does not always report the exact instruction which caused the exception and does not allow the interrupted program to be resumed.\n\nAlso, we already know from the previous [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3) that interrupts can be classified as `maskable` and `non-maskable`. Maskable interrupts are interrupts which can be unblocked and blocked with the two following instructions for `x86_64` - `sti` and `cli`. We can find them in the Linux kernel source code:\n\n```C\nstatic inline void native_irq_disable(void)\n{\n        asm volatile(\"cli\": : :\"memory\");\n}\n```\n\nand\n\n```C\nstatic inline void native_irq_enable(void)\n{\n        asm volatile(\"sti\": : :\"memory\");\n}\n```\n\nThese two instructions modify the `IF` flag bit within the `RFLAGS` register. The `sti` instruction sets the `IF` flag and the `cli` instruction clears this flag. Non-maskable interrupts are always reported. Usually any failure in the hardware is mapped to such non-maskable interrupts.\n\nIf multiple exceptions or interrupts occur at the same time, the processor handles them in order of their predefined priorities. 
We can determine the priorities from the highest to the lowest in the following table:\n\n```\n+----------------------------------------------------------------+\n|              |                                                 |\n|   Priority   | Description                                     |\n|              |                                                 |\n+--------------+-------------------------------------------------+\n|              | Hardware Reset and Machine Checks               |\n|     1        | - RESET                                         |\n|              | - Machine Check                                 |\n+--------------+-------------------------------------------------+\n|              | Trap on Task Switch                             |\n|     2        | - T flag in TSS is set                          |\n|              |                                                 |\n+--------------+-------------------------------------------------+\n|              | External Hardware Interventions                 |\n|              | - FLUSH                                         |\n|     3        | - STOPCLK                                       |\n|              | - SMI                                           |\n|              | - INIT                                          |\n+--------------+-------------------------------------------------+\n|              | Traps on the Previous Instruction               |\n|     4        | - Breakpoints                                   |\n|              | - Debug Trap Exceptions                         |\n+--------------+-------------------------------------------------+\n|     5        | Nonmaskable Interrupts                          |\n+--------------+-------------------------------------------------+\n|     6        | Maskable Hardware Interrupts                    |\n+--------------+-------------------------------------------------+\n|     7        | Code Breakpoint Fault                           
|\n+--------------+-------------------------------------------------+\n|     8        | Faults from Fetching Next Instruction           |\n|              | Code-Segment Limit Violation                    |\n|              | Code Page Fault                                 |\n+--------------+-------------------------------------------------+\n|              | Faults from Decoding the Next Instruction       |\n|              | Instruction length > 15 bytes                   |\n|     9        | Invalid Opcode                                  |\n|              | Coprocessor Not Available                       |\n|              |                                                 |\n+--------------+-------------------------------------------------+\n|     10       | Faults on Executing an Instruction              |\n|              | Overflow                                        |\n|              | Bound error                                     |\n|              | Invalid TSS                                     |\n|              | Segment Not Present                             |\n|              | Stack fault                                     |\n|              | General Protection                              |\n|              | Data Page Fault                                 |\n|              | Alignment Check                                 |\n|              | x87 FPU Floating-point exception                |\n|              | SIMD floating-point exception                   |\n|              | Virtualization exception                        |\n+--------------+-------------------------------------------------+\n```\n\nNow that we know a little about the various types of interrupts and exceptions, it is time to move on to a more practical part. We start with the description of the `Interrupt Descriptor Table`. As mentioned earlier, the `IDT` stores entry points of the interrupts and exceptions handlers. 
The `IDT` is similar in structure to the `Global Descriptor Table` which we saw in the second part of the [Kernel booting process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). But of course it has some differences. Instead of `descriptors`, the `IDT` entries are called `gates`. In the `x86` architecture, it can contain one of the following gates:\n\n* Interrupt gates\n* Task gates\n* Trap gates\n\nOnly [long mode](http://en.wikipedia.org/wiki/Long_mode) interrupt gates and trap gates can be referenced in the `x86_64`. Like the `Global Descriptor Table`, the `Interrupt Descriptor table` is an array of 8-byte gates on `x86` and an array of 16-byte gates on `x86_64`. We can remember from the second part of the [Kernel booting process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2), that the `Global Descriptor Table` must contain a `NULL` descriptor as its first element. Unlike the `Global Descriptor Table`, the first element of the `Interrupt Descriptor Table` may contain a gate; a `NULL` entry is not mandatory. For example, you may remember that we have loaded the Interrupt Descriptor table with the `NULL` gates only in the earlier [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3) while transitioning into [protected mode](http://en.wikipedia.org/wiki/Protected_mode):\n\n```C\n/*\n * Set up the IDT\n */\nstatic void setup_idt(void)\n{\n\tstatic const struct gdt_ptr null_idt = {0, 0};\n\tasm volatile(\"lidtl %0\" : : \"m\" (null_idt));\n}\n```\n\nThis code is from the [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/pm.c). The `Interrupt Descriptor table` can be located anywhere in the linear address space and the base address of it must be aligned on an 8-byte boundary on `x86` or 16-byte boundary on `x86_64`. The base address of the `IDT` is stored in the special register - `IDTR`. 
There are two instructions on `x86`-compatible processors to modify the `IDTR` register:\n\n* `LIDT`\n* `SIDT`\n\nThe first instruction `LIDT` is used to load the base-address of the `IDT` i.e., the specified operand into the `IDTR`. The second instruction `SIDT` is used to read and store the contents of the `IDTR` into the specified operand. The `IDTR` register is 48-bits on the `x86` and contains the following information:\n\n```\n+-----------------------------------+----------------------+\n|                                   |                      |\n|     Base address of the IDT       |   Limit of the IDT   |\n|                                   |                      |\n+-----------------------------------+----------------------+\n47                                16 15                    0\n```\n\nLooking at the implementation of `setup_idt`, we have prepared a `null_idt` and loaded it to the `IDTR` register with the `lidt` instruction. Note that `null_idt` has `gdt_ptr` type which is defined as:\n\n```C\nstruct gdt_ptr {\n        u16 len;\n        u32 ptr;\n} __attribute__((packed));\n```\n\nHere we can see the definition of the structure with the two fields of 2-bytes and 4-bytes each (a total of 48-bits) as we can see in the diagram. Now let's look at the `IDT` entries structure. The `IDT` entries structure is an array of the 16-byte entries which are called gates in the `x86_64`. 
They have the following structure:\n\n```\n127                                                                             96\n+-------------------------------------------------------------------------------+\n|                                                                               |\n|                                Reserved                                       |\n|                                                                               |\n+--------------------------------------------------------------------------------\n95                                                                              64\n+-------------------------------------------------------------------------------+\n|                                                                               |\n|                               Offset 63..32                                   |\n|                                                                               |\n+-------------------------------------------------------------------------------+\n63                               48 47      46  44   42    39             34    32\n+-------------------------------------------------------------------------------+\n|                                  |       |  D  |   |     |      |   |   |     |\n|       Offset 31..16              |   P   |  P  | 0 |Type |0 0 0 | 0 | 0 | IST |\n|                                  |       |  L  |   |     |      |   |   |     |\n -------------------------------------------------------------------------------+\n31                                   16 15                                      0\n+-------------------------------------------------------------------------------+\n|                                      |                                        |\n|          Segment Selector            |                 Offset 15..0           |\n|                                      |                                        
|\n+-------------------------------------------------------------------------------+\n```\n\nTo form an index into the IDT, the processor scales the exception or interrupt vector by sixteen. The processor handles the occurrence of exceptions and interrupts just like it handles calls of a procedure when it sees the `call` instruction. A processor uses a unique number or `vector number` of the interrupt or the exception as the index to find the necessary `Interrupt Descriptor Table` entry. Now let's take a closer look at an `IDT` entry.\n\nAs we can see, `IDT` entry on the diagram consists of the following fields:\n\n* `0-15` bits  - offset from the segment selector which is used by the processor as the base address of the entry point of the interrupt handler;\n* `16-31` bits - base address of the segment select which contains the entry point of the interrupt handler;\n* `IST` - a new special mechanism in the `x86_64`, which is described below;\n* `DPL` - Descriptor Privilege Level;\n* `P` - Segment Present flag;\n* `48-63` bits - the second part of the handler base address;\n* `64-95` bits - the third part of the base address of the handler;\n* `96-127` bits - and the last bits are reserved by the CPU.\n\nAnd the last `Type` field describes the type of the `IDT` entry. There are three different kinds of handlers for interrupts:\n\n* Interrupt gate\n* Trap gate\n* Task gate\n\nThe `IST` or `Interrupt Stack Table` is a new mechanism in the `x86_64`. It is used as an alternative to the legacy stack-switch mechanism. Previously the `x86` architecture provided a mechanism to automatically switch stack frames in response to an interrupt. The `IST` is a modified version of the `x86` Stack switching mode. This mechanism unconditionally switches stacks when it is enabled and can be enabled for any interrupt in the `IDT` entry related with the certain interrupt (we will soon see it). From this we can understand that `IST` is not necessary for all interrupts. 
Some interrupts can continue to use the legacy stack switching mode. The `IST` mechanism provides up to seven `IST` pointers in the [Task State Segment](http://en.wikipedia.org/wiki/Task_state_segment) or `TSS` which is the special structure which contains information about a process. The `TSS` is used for stack switching during the execution of an interrupt or exception handler in the Linux kernel. Each pointer is referenced by an interrupt gate from the `IDT`.\n\nThe `Interrupt Descriptor Table` is represented by the array of the `gate_desc` structures:\n\n\n```C\nextern gate_desc idt_table[];\n```\n\nwhere `gate_desc` is a typedef of `gate_struct`, which is defined as:\n[/arch/x86/include/asm/desc_defs.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/desc_defs.h)\n\n```C\nstruct gate_struct {\n\tu16\t\toffset_low;\n\tu16\t\tsegment;\n\tstruct idt_bits\tbits;\n\tu16\t\toffset_middle;\n#ifdef CONFIG_X86_64\n\tu32\t\toffset_high;\n\tu32\t\treserved;\n#endif\n} __attribute__((packed));\n```\n\nEach active thread has a large stack in the Linux kernel for the `x86_64` architecture. The stack size is defined as `THREAD_SIZE` and is equal to:\n\n```C\n#define PAGE_SHIFT      12\n#define PAGE_SIZE       (_AC(1,UL) << PAGE_SHIFT)\n...\n...\n...\n#define THREAD_SIZE_ORDER       (2 + KASAN_STACK_ORDER)\n#define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)\n```\n\nThe `PAGE_SIZE` is `4096` bytes and the `THREAD_SIZE_ORDER` depends on the `KASAN_STACK_ORDER`. As we can see, the `KASAN_STACK_ORDER` depends on the `CONFIG_KASAN` kernel configuration parameter and is defined as:\n\n```C\n#ifdef CONFIG_KASAN\n    #define KASAN_STACK_ORDER 1\n#else\n    #define KASAN_STACK_ORDER 0\n#endif\n```\n\n`KASan` is a runtime memory [debugger](http://lwn.net/Articles/618180/). Thus, the `THREAD_SIZE` will be `16384` bytes if `CONFIG_KASAN` is disabled or `32768` if this kernel configuration option is enabled. These stacks contain useful data as long as a thread is alive or in a zombie state. 
While the thread is in user-space, the kernel stack is empty except for the `thread_info` structure (details about this structure are available in the fourth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) of the Linux kernel initialization process) at the end of the stack. The active or zombie threads aren't the only threads with their own stack. There also exist specialized stacks that are associated with each available CPU. These stacks are active when the kernel is executing on that CPU. When the user-space is executing on the CPU, these stacks do not contain any useful information. Each CPU has a few special per-cpu stacks as well. The first is the `interrupt stack` used for the external hardware interrupts. Its size is determined as follows:\n\n```C\n#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)\n#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)\n```\n\nOr `16384` bytes. The per-cpu interrupt stack is represented by the `irq_stack` struct and the `fixed_percpu_data` struct in the Linux kernel for `x86_64`:\n\n```C\n/* Per CPU interrupt stacks */\nstruct irq_stack {\n\tchar\t\tstack[IRQ_STACK_SIZE];\n} __aligned(IRQ_STACK_SIZE);\n```\n\n```C\n#ifdef CONFIG_X86_64\nstruct fixed_percpu_data {\n\t/*\n\t * GCC hardcodes the stack canary as %gs:40.  Since the\n\t * irq_stack is the object at %gs:0, we reserve the bottom\n\t * 48 bytes of the irq stack for the canary.\n\t */\n\tchar\t\tgs_base[40];\n\tunsigned long\tstack_canary;\n};\n...\n#endif\n```\n\nThe `irq_stack` struct contains a 16 kilobytes array.\nAlso, you can see that the fixed\\_percpu\\_data contains two fields:\n\n* `gs_base` - The `gs` register always points to the bottom of the `fixed_percpu_data`. On the `x86_64`, the `gs` register is shared by per-cpu area and stack canary (more about `per-cpu` variables you can read in the special [part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)).  
All per-cpu symbols are zero-based and the `gs` points to the base of the per-cpu area. You already know that the [segmented memory model](http://en.wikipedia.org/wiki/Memory_segmentation) is abolished in the long mode, but we can set the base address for the two segment registers - `fs` and `gs` with the [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register) and these registers can still be used as address registers. If you remember the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of the Linux kernel initialization process, you will recall that we have set the `gs` register:\n\n```assembly\n\tmovl\t$MSR_GS_BASE,%ecx\n\tmovl\tinitial_gs(%rip),%eax\n\tmovl\tinitial_gs+4(%rip),%edx\n\twrmsr\n```\n\nwhere `initial_gs` points to the `fixed_percpu_data`:\n\n```assembly\nSYM_DATA(initial_gs,\t.quad INIT_PER_CPU_VAR(fixed_percpu_data))\n```\n\n* `stack_canary` - [Stack canary](http://en.wikipedia.org/wiki/Stack_buffer_overflow#Stack_canaries) for the interrupt stack is a `stack protector`\nto verify that the stack hasn't been overwritten. Note that `gs_base` is a 40-byte array. `GCC` requires the stack canary to be at a fixed offset from the base of the `gs`, and this offset must be `40` for the `x86_64` and `20` for the `x86`.\n\nThe `fixed_percpu_data` is the first datum in the `percpu` area; we can see it in the `System.map`:\n\n```\n0000000000000000 D __per_cpu_start\n0000000000000000 D fixed_percpu_data\n00000000000001e0 A kexec_control_code_size\n0000000000001000 D cpu_debug_store\n0000000000002000 D irq_stack_backing_store\n0000000000006000 D cpu_tss_rw\n0000000000009000 D gdt_page\n000000000000a000 d exception_stacks\n...\n...\n...\n```\n\nWe can see its definition in the code:\n\n```C\nDECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;\n```\n\nNow, it's time to look at the initialization of the `fixed_percpu_data`. 
Besides the `fixed_percpu_data` definition, we can see the definition of the following per-cpu variables in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h):\n\n```C\nDECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);\n...\nDECLARE_PER_CPU(unsigned int, irq_count);\n...\n/* Per CPU softirq stack pointer */\nDECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);\n```\n\nThe first and third are the stack pointers for hardware and software interrupts. It is obvious from the name of the variables, that these point to the top of stacks. The second - `irq_count` is used to check if a CPU is already on an interrupt stack or not. Initialization of the `hardirq_stack_ptr` is located in the `irq_init_percpu_irqstack` function in [arch/x86/kernel/irq_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irq_64.c):\n\n```C\nint irq_init_percpu_irqstack(unsigned int cpu)\n{\n\tif (per_cpu(hardirq_stack_ptr, cpu))\n\t\treturn 0;\n\treturn map_irq_stack(cpu);\n}\n```\n\nHere we go over all the CPUs one-by-one and setup the `hardirq_stack_ptr`.  \nWhere `map_irq_stack` is called to initialize the `hardirq_stack_ptr`,  \nto point onto the `irq_stack_backing_store` of the current CPU with an offset of IRQ\\_STACK\\_SIZE,   \neither with guard pages or without when KASan is enabled.  
\n\n\nAfter the initialization of the interrupt stack, we need to initialize the `gs` register within [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c):\n\n```C\nvoid load_percpu_segment(int cpu)\n{\n        ...\n        ...\n        ...\n        __loadsegment_simple(gs, 0);\n        wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));\n        ...\n        load_stack_canary_segment();\n}\n```\n\nand as we already know the `gs` register points to the bottom of the interrupt stack.\n\n```assembly\n\tmovl\t$MSR_GS_BASE,%ecx\n\tmovl\tinitial_gs(%rip),%eax\n\tmovl\tinitial_gs+4(%rip),%edx\n\twrmsr\n\n    SYM_DATA(initial_gs,\n    .quad INIT_PER_CPU_VAR(fixed_percpu_data))\n```\n\nHere we can see the `wrmsr` instruction, which loads the data from `edx:eax` into the [Model specific register](http://en.wikipedia.org/wiki/Model-specific_register) pointed to by the `ecx` register. In our case the model specific register is `MSR_GS_BASE`, which contains the base address of the memory segment pointed to by the `gs` register. `edx:eax` points to the address of the `initial_gs`, which is the base address of our `fixed_percpu_data`.\n\nWe already know that `x86_64` has a feature called `Interrupt Stack Table` or `IST` and this feature provides the ability to switch to a new stack for events like a non-maskable interrupt, double fault, etc. There can be up to seven `IST` entries per-cpu. Some of them are:\n\n* `DOUBLEFAULT_STACK`\n* `NMI_STACK`\n* `DEBUG_STACK`\n* `MCE_STACK`\n\nor\n\n```C\n#define DOUBLEFAULT_STACK 1\n#define NMI_STACK 2\n#define DEBUG_STACK 3\n#define MCE_STACK 4\n```\n\nAll interrupt-gate descriptors, which switch to a new stack with the `IST`, are initialized within the `idt_setup_from_table` function. 
That function initializes every gate descriptor within the `struct idt_data def_idts[]` array.\nFor example:\n\n```C\nstatic const __initconst struct idt_data def_idts[] = {\n    ...\n\tINTG(X86_TRAP_NMI,\t\tnmi),\n    ...\n\tINTG(X86_TRAP_DF,\t\tdouble_fault),\n```\n\nwhere `nmi` and `double_fault` are entry points created at [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S):\n\n```assembly\nidtentry double_fault\t\t\tdo_double_fault\t\t\thas_error_code=1 paranoid=2 read_cr2=1\n...\n...\n...\nSYM_CODE_START(nmi)\n...\n...\n...\nSYM_CODE_END(nmi)\n```\n\nfor the given interrupt handlers declared at [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/traps.h):\n\n```C\nasmlinkage void nmi(void);\nasmlinkage void double_fault(void);\n```\n\nWhen an interrupt or an exception occurs, the new `ss` selector is forced to `NULL` and the `ss` selector’s `rpl` field is set to the new `cpl`. The old `ss`, `rsp`, register flags, `cs`, `rip` are pushed onto the new stack. In 64-bit mode, the size of interrupt stack-frame pushes is fixed at 8-bytes, so that we will get the following stack:\n\n```\n+---------------+\n|               |\n|      SS       | 40\n|      RSP      | 32\n|     RFLAGS    | 24\n|      CS       | 16\n|      RIP      | 8\n|   Error code  | 0\n|               |\n+---------------+\n```\n\nIf the `IST` field in the interrupt gate is not `0`, we read the `IST` pointer into `rsp`. If the interrupt vector number has an error code associated with it, we then push the error code onto the stack. If the interrupt vector number has no error code, we go ahead and push the dummy error code on to the stack. We need to do this to ensure stack consistency. Next, we load the segment-selector field from the gate descriptor into the CS register and must verify that the target code-segment is a 64-bit mode code segment by checking bit `21`, i.e. 
the `L` bit in the `Global Descriptor Table`. Finally, we load the offset field from the gate descriptor into `rip` which will be the entry-point of the interrupt handler. After this the interrupt handler begins to execute and when the interrupt handler finishes its execution, it must return control to the interrupted process with the `iret` instruction. The `iret` instruction unconditionally pops the stack pointer (`ss:rsp`) to restore the stack of the interrupted process and does not depend on the `cpl` change.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the first part of `Interrupts and Interrupt Handling` in the Linux kernel. We covered some theory and the first steps of initialization of stuff related to interrupts and exceptions. In the next part we will continue to dive into the more practical aspects of interrupts and interrupt handling.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [PIC](http://en.wikipedia.org/wiki/Programmable_Interrupt_Controller)\n* [Advanced Programmable Interrupt Controller](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [protected mode](http://en.wikipedia.org/wiki/Protected_mode)\n* [long mode](http://en.wikipedia.org/wiki/Long_mode)\n* [kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)\n* [Task State Segment](http://en.wikipedia.org/wiki/Task_state_segment)\n* [segmented memory model](http://en.wikipedia.org/wiki/Memory_segmentation)\n* [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register)\n* [Stack canary](http://en.wikipedia.org/wiki/Stack_buffer_overflow#Stack_canaries)\n* [Previous chapter](https://0xax.gitbook.io/linux-insides/summary/initialization)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-10.md",
    "content": "Interrupts and Interrupt Handling. Part 10.\n================================================================================\n\nLast part\n-------------------------------------------------------------------------------\n\nThis is the tenth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and interrupt handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9) we saw a little about deferred interrupts and related concepts like `softirq`, `tasklet` and `workqeue`. In this part we will continue to dive into this theme and now it's time to look at real hardware driver.\n\nLet's consider serial driver of the [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf) board for example and will look how this driver requests an [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) line,\nwhat happens when an interrupt is triggered and etc. The source code of this driver is placed in the [drivers/tty/serial/21285.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/21285.c) source code file. Ok, we have source code, let's start.\n\nInitialization of a kernel module\n--------------------------------------------------------------------------------\n\nWe will start to consider this driver as we usually did it with all new concepts that we saw in this book. We will start to consider it from the initialization. 
As you already may know, the Linux kernel provides two macros for initialization and finalization of a driver or a kernel module:\n\n* `module_init`;\n* `module_exit`.\n\nAnd we can find usage of these macros in our driver source code:\n\n```C\nmodule_init(serial21285_init);\nmodule_exit(serial21285_exit);\n```\n\nThe most part of device drivers can be compiled as a loadable kernel [module](https://en.wikipedia.org/wiki/Loadable_kernel_module) or in another way they can be statically linked into the Linux kernel. In the first case initialization of a device driver will be produced via the `module_init` and `module_exit` macros that are defined in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h):\n\n```C\n#define module_init(initfn)                                     \\\n        static inline initcall_t __inittest(void)               \\\n        { return initfn; }                                      \\\n        int init_module(void) __attribute__((alias(#initfn)));\n\n#define module_exit(exitfn)                                     \\\n        static inline exitcall_t __exittest(void)               \\\n        { return exitfn; }                                      \\\n        void cleanup_module(void) __attribute__((alias(#exitfn)));\n```\n\nand will be called by the [initcall](http://kernelnewbies.org/Documents/InitcallMechanism) functions:\n\n* `early_initcall`\n* `pure_initcall`\n* `core_initcall`\n* `postcore_initcall`\n* `arch_initcall`\n* `subsys_initcall`\n* `fs_initcall`\n* `rootfs_initcall`\n* `device_initcall`\n* `late_initcall`\n\nthat are called in the `do_initcalls` from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). 
Otherwise, if a device driver is statically linked into the Linux kernel, implementation of these macros will be following:\n\n```C\n#define module_init(x)  __initcall(x);\n#define module_exit(x)  __exitcall(x);\n```\n\nIn this way implementation of module loading placed in the [kernel/module.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/module.c) source code file and initialization occurs in the `do_init_module` function. We will not dive into details about loadable modules in this chapter, but will see it in the special chapter that will describe Linux kernel modules. Ok, the `module_init` macro takes one parameter - the `serial21285_init` in our case. As we can understand from function's name, this function does stuff related to the driver initialization. Let's look at it:\n\n```C\nstatic int __init serial21285_init(void)\n{\n\tint ret;\n\n\tprintk(KERN_INFO \"Serial: 21285 driver\\n\");\n\n\tserial21285_setup_ports();\n\n\tret = uart_register_driver(&serial21285_reg);\n\tif (ret == 0)\n\t\tuart_add_one_port(&serial21285_reg, &serial21285_port);\n\n\treturn ret;\n}\n```\n\nAs we can see, first of all it prints information about the driver to the kernel buffer and the call of the `serial21285_setup_ports` function. 
This function sets up the base [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter) clock of the `serial21285_port` device:\n\n```C\nunsigned int mem_fclk_21285 = 50000000;\n\nstatic void serial21285_setup_ports(void)\n{\n\tserial21285_port.uartclk = mem_fclk_21285 / 4;\n}\n```\n\nHere the `serial21285_reg` is the structure that describes the `uart` driver:\n\n```C\nstatic struct uart_driver serial21285_reg = {\n\t.owner\t\t\t= THIS_MODULE,\n\t.driver_name\t= \"ttyFB\",\n\t.dev_name\t\t= \"ttyFB\",\n\t.major\t\t\t= SERIAL_21285_MAJOR,\n\t.minor\t\t\t= SERIAL_21285_MINOR,\n\t.nr\t\t\t    = 1,\n\t.cons\t\t\t= SERIAL_21285_CONSOLE,\n};\n```\n\nIf the driver is registered successfully we attach the driver-defined port `serial21285_port` structure with the `uart_add_one_port` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c) source code file and return from the `serial21285_init` function:\n\n```C\nif (ret == 0)\n\tuart_add_one_port(&serial21285_reg, &serial21285_port);\n\nreturn ret;\n```\n\nThat's all. Our driver is initialized. When an `uart` port is opened with the call of the `uart_open` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c), it will call the `uart_startup` function to start up the serial port. This function will call the `startup` function that is part of the `uart_ops` structure. Each `uart` driver has the definition of this structure, in our case it is:\n\n```C\nstatic struct uart_ops serial21285_ops = {\n\t...\n\t.startup\t= serial21285_startup,\n\t...\n}\n```\n\nThis is the `serial21285_ops` structure. As we can see, the `.startup` field refers to the `serial21285_startup` function. 
Implementation of this function is very interesting for us, because it is related to the interrupts and interrupt handling.\n\nRequesting irq line\n--------------------------------------------------------------------------------\n\nLet's look at the implementation of the `serial21285_startup` function:\n\n```C\nstatic int serial21285_startup(struct uart_port *port)\n{\n\tint ret;\n\n\ttx_enabled(port) = 1;\n\trx_enabled(port) = 1;\n\n\tret = request_irq(IRQ_CONRX, serial21285_rx_chars, 0,\n\t\t\t  serial21285_name, port);\n\tif (ret == 0) {\n\t\tret = request_irq(IRQ_CONTX, serial21285_tx_chars, 0,\n\t\t\t\t  serial21285_name, port);\n\t\tif (ret)\n\t\t\tfree_irq(IRQ_CONRX, port);\n\t}\n\n\treturn ret;\n}\n```\n\nFirst of all about `TX` and `RX`. A serial bus of a device consists of just two wires: one for sending data and another for receiving. As such, serial devices should have two serial pins: the receiver - `RX`, and the transmitter - `TX`. With the call of the first two macros: `tx_enabled` and `rx_enabled`, we enable these wires. The following part of this function is of the greatest interest for us. Note the calls to the `request_irq` function. This function registers an interrupt handler and enables a given interrupt line. Let's look at the implementation of this function and get into the details. 
This function is defined in the [include/linux/interrupt.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/interrupt.h) header file and looks like this:\n\n```C\nstatic inline int __must_check\nrequest_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,\n            const char *name, void *dev)\n{\n        return request_threaded_irq(irq, handler, NULL, flags, name, dev);\n}\n```\n\nAs we can see, the `request_irq` function takes five parameters:\n\n* `irq` - the interrupt number that is being requested;\n* `handler` - the pointer to the interrupt handler;\n* `flags` - the bitmask options;\n* `name` - the name of the owner of an interrupt;\n* `dev` - the pointer used for shared interrupt lines;\n\nNow let's look at the calls of the `request_irq` function in our example. As we can see the first parameter is `IRQ_CONRX`. We know that it is the number of the interrupt, but what is `CONRX`? This macro is defined in the [arch/arm/mach-footbridge/include/mach/irqs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/arm/mach-footbridge/include/mach/irqs.h) header file. There we can find the full list of interrupts that the `21285` board can generate. Note that in the second call of the `request_irq` function we pass the `IRQ_CONTX` interrupt number. These two interrupts will handle `RX` and `TX` events in our driver. Implementation of these macros is easy:\n\n```C\n#define IRQ_CONRX               _DC21285_IRQ(0)\n#define IRQ_CONTX               _DC21285_IRQ(1)\n...\n...\n...\n#define _DC21285_IRQ(x)         (16 + (x))\n```\n\nThe [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) IRQs on this board are from `0` to `15`, so our interrupts will have the first two numbers after them: `16` and `17`. The second parameters for the two calls of the `request_irq` function are `serial21285_rx_chars` and `serial21285_tx_chars`. These functions will be called when an `RX` or `TX` interrupt occurs. 
We will not dive in this part into details of these functions, because this chapter covers the interrupts and interrupts handling but not device and drivers. The next parameter - `flags` and as we can see, it is zero in both calls of the `request_irq` function. All acceptable flags are defined as `IRQF_*` macros in the [include/linux/interrupt.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/interrupt.h). Some of it:\n\n* `IRQF_SHARED` - allows sharing the irq among several devices;\n* `IRQF_PERCPU` - an interrupt is per cpu;\n* `IRQF_NO_THREAD` - an interrupt cannot be threaded;\n* `IRQF_NOBALANCING` - excludes this interrupt from irq balancing;\n* `IRQF_IRQPOLL` - an interrupt is used for polling;\n* and etc.\n\nIn our case we pass `0`, so it will be `IRQF_TRIGGER_NONE`. This flag means that it does not imply any kind of edge or level triggered interrupt behaviour. To the fourth parameter (`name`), we pass the `serial21285_name` that defined as:\n\n```C\nstatic const char serial21285_name[] = \"Footbridge UART\";\n```\n\nand will be displayed in the output of the `/proc/interrupts`. And in the last parameter we pass the pointer to the our main `uart_port` structure. Now we know a little about `request_irq` function and its parameters, let's look at its implementation. As we can see above, the `request_irq` function just makes a call of the `request_threaded_irq` function inside. The `request_threaded_irq` function defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) source code file and allocates a given interrupt line. 
If we will look at this function, it starts from the definition of the `irqaction` and the `irq_desc`:\n\n```C\nint request_threaded_irq(unsigned int irq, irq_handler_t handler,\n                         irq_handler_t thread_fn, unsigned long irqflags,\n                         const char *devname, void *dev_id)\n{\n        struct irqaction *action;\n        struct irq_desc *desc;\n        int retval;\n\t\t...\n\t\t...\n\t\t...\n}\n```\n\nWe already saw the `irqaction` and the `irq_desc` structures in this chapter. The first structure represents per interrupt action descriptor and contains pointers to the interrupt handler, name of the device, interrupt number, etc. The second structure represents a descriptor of an interrupt and contains pointer to the `irqaction`, interrupt flags, etc. Note that the `request_threaded_irq` function called by the `request_irq` with the additional parameter: `irq_handler_t thread_fn`. If this parameter is not `NULL`, the `irq` thread will be created and the given `irq` handler will be executed in this thread. In the next step we need to make following checks:\n\n```C\nif (((irqflags & IRQF_SHARED) && !dev_id) ||\n            (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||\n            ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))\n               return -EINVAL;\n```\n\nFirst of all we check that real `dev_id` is passed for the shared interrupt and the `IRQF_COND_SUSPEND` only makes sense for shared interrupts. Otherwise we exit from this function with the `-EINVAL` error. 
After this we convert the given `irq` number to the `irq` descriptor with the help of the `irq_to_desc` function that defined in the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c) source code file and exit from this function with the `-EINVAL` error if it was not successful:\n\n```C\ndesc = irq_to_desc(irq);\nif (!desc)\n    return -EINVAL;\n```\n\nThe `irq_to_desc` function checks that given `irq` number is less than maximum number of IRQs and returns the irq descriptor where the `irq` number is offset from the `irq_desc` array:\n\n```C\nstruct irq_desc *irq_to_desc(unsigned int irq)\n{\n        return (irq < NR_IRQS) ? irq_desc + irq : NULL;\n}\n```\n\nAs we have converted `irq` number to the `irq` descriptor we make the check the status of the descriptor that an interrupt can be requested:\n\n```C\nif (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)))\n    return -EINVAL;\n```\n\nand exit with the `-EINVAL` otherwise. After this we check the given interrupt handler. If it was not passed to the `request_irq` function, we check the `thread_fn`. If both handlers are `NULL`, we return with the `-EINVAL`. If an interrupt handler was not passed to the `request_irq` function, but the `thread_fn` is not null, we set handler to the `irq_default_primary_handler`:\n\n```C\nif (!handler) {\n    if (!thread_fn)\n        return -EINVAL;\n\thandler = irq_default_primary_handler;\n}\n```\n\nIn the next step we allocate memory for our `irqaction` with the `kzalloc` function and return from the function if this operation was not successful:\n\n```C\naction = kzalloc(sizeof(struct irqaction), GFP_KERNEL);\nif (!action)\n    return -ENOMEM;\n```\n\nMore about `kzalloc` will be in the separate chapter about [memory management](https://0xax.gitbook.io/linux-insides/summary/mm) in the Linux kernel. 
As we allocated space for the `irqaction`, we start to initialize this structure with the values of interrupt handler, interrupt flags, device name, etc:\n\n```C\naction->handler = handler;\naction->thread_fn = thread_fn;\naction->flags = irqflags;\naction->name = devname;\naction->dev_id = dev_id;\n```\n\nIn the end of the `request_threaded_irq` function we call the `__setup_irq` function from the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and registers a given `irqaction`. Release memory for the `irqaction` and return:\n\n```C\nchip_bus_lock(desc);\nretval = __setup_irq(irq, desc, action);\nchip_bus_sync_unlock(desc);\n\nif (retval)\n\tkfree(action);\n\nreturn retval;\n```\n\nNote that the call of the `__setup_irq` function is placed between the `chip_bus_lock` and the `chip_bus_sync_unlock` functions. These functions lock/unlock access to slow buses (like [i2c](https://en.wikipedia.org/wiki/I%C2%B2C)) chips. Now let's look at the implementation of the `__setup_irq` function. In the beginning of the `__setup_irq` function we can see a couple of different checks. First of all we check that the given interrupt descriptor is not `NULL`, `irqchip` is not `NULL` and that given interrupt descriptor module owner is not `NULL`. After this we check if the interrupt is nested into another interrupt thread or not, and if it is nested we replace the `irq_default_primary_handler` with the `irq_nested_primary_handler`.\n\nIn the next step we create an irq handler thread with the `kthread_create` function, if the given interrupt is not nested and the `thread_fn` is not `NULL`:\n\n```C\nif (new->thread_fn && !nested) {\n\tstruct task_struct *t;\n\tt = kthread_create(irq_thread, new, \"irq/%d-%s\", irq, new->name);\n\t...\n}\n```\n\nAnd fill the rest of the given interrupt descriptor fields in the end. 
So, our `16` and `17` interrupt request lines are registered and the `serial21285_rx_chars` and `serial21285_tx_chars` functions will be invoked when an interrupt controller will get event related to these interrupts. Now let's look at what happens when an interrupt occurs.\n\nPrepare to handle an interrupt\n--------------------------------------------------------------------------------\n\nIn the previous paragraph we saw the requesting of the irq line for the given interrupt descriptor and registration of the `irqaction` structure for the given interrupt. We already know that when an interrupt event occurs, an interrupt controller notifies the processor about this event and processor tries to find appropriate interrupt gate for this interrupt. If you have read the eighth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) of this chapter, you may remember the `native_init_IRQ` function. This function makes initialization of the local [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The following part of this function is the most interesting part for us right now:\n\n```C\nfor_each_clear_bit_from(i, used_vectors, first_system_vector) {\n\tset_intr_gate(i, irq_entries_start +\n\t\t8 * (i - FIRST_EXTERNAL_VECTOR));\n}\n```\n\nHere we iterate over all the cleared bit of the `used_vectors` bitmap starting at `first_system_vector` that is:\n\n```C\nint first_system_vector = FIRST_SYSTEM_VECTOR; // 0xef\n```\n\nand set interrupt gates with the `i` vector number and the `irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)` start address. Only one thing is unclear here - the `irq_entries_start`. This symbol defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) assembly file and provides `irq` entries. 
Let's look at it:\n\n```assembly\n\t.align 8\nENTRY(irq_entries_start)\n    vector=FIRST_EXTERNAL_VECTOR\n    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)\n\tpushq\t$(~vector+0x80)\n    vector=vector+1\n\tjmp\tcommon_interrupt\n\t.align\t8\n    .endr\nEND(irq_entries_start)\n```\n\nHere we can see the [GNU assembler](https://en.wikipedia.org/wiki/GNU_Assembler) `.rept` directive which repeats the sequence of lines that are before `.endr` - `FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR` times. As we already know, the `FIRST_SYSTEM_VECTOR` is `0xef`, and the `FIRST_EXTERNAL_VECTOR` is equal to `0x20`. So, it will be repeated:\n\n```python\n>>> 0xef - 0x20\n207\n```\n\ntimes. In the body of the `.rept` directive each entry stub pushes the vector number onto the stack (note that we use negative numbers for the interrupt vector numbers, because positive numbers are already reserved to identify [system calls](https://en.wikipedia.org/wiki/System_call)), increases the `vector` variable and jumps to the `common_interrupt` label. In the `common_interrupt` we adjust the vector number on the stack and execute the `interrupt` macro with the `do_IRQ` parameter:\n\n```assembly\ncommon_interrupt:\n\taddq\t$-0x80, (%rsp)\n\tinterrupt do_IRQ\n```\n\nThe macro `interrupt` is defined in the same source code file. It saves [general purpose](https://en.wikipedia.org/wiki/Processor_register) registers on the stack, changes the userspace `gs` to the kernel one with the `SWAPGS` assembler instruction if needed, increases the [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) `irq_count` variable that shows that we are in an interrupt, and calls the `do_IRQ` function. This function is defined in the [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c) source code file and handles our device interrupt. Let's look at this function. 
The `do_IRQ` function takes one parameter - the `pt_regs` structure that stores values of the userspace registers:\n\n```C\n__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)\n{\n    struct pt_regs *old_regs = set_irq_regs(regs);\n    unsigned vector = ~regs->orig_ax;\n    unsigned irq;\n\n\tirq_enter();\n    exit_idle();\n\t...\n\t...\n\t...\n}\n```\n\nAt the beginning of this function we can see the call of the `set_irq_regs` function that returns the saved `per-cpu` irq register pointer and the calls of the `irq_enter` and `exit_idle` functions. The first function `irq_enter` enters an interrupt context by updating the `__preempt_count` variable and the second function - `exit_idle` checks that the current process is `idle` with [pid](https://en.wikipedia.org/wiki/Process_identifier) - `0` and notifies the `idle_notifier` with the `IDLE_END`.\n\nIn the next step we read the `irq` for the current cpu and call the `handle_irq` function:\n\n```C\nirq = __this_cpu_read(vector_irq[vector]);\n\nif (!handle_irq(irq, regs)) {\n\t...\n\t...\n\t...\n}\n...\n...\n...\n```\n\nThe `handle_irq` function is defined in the [arch/x86/kernel/irq_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq_64.c) source code file; it checks the given interrupt descriptor and calls the `generic_handle_irq_desc`:\n\n```C\ndesc = irq_to_desc(irq);\n\tif (unlikely(!desc))\n\t\treturn false;\ngeneric_handle_irq_desc(irq, desc);\n```\n\nWhere the `generic_handle_irq_desc` calls the interrupt handler:\n\n```C\nstatic inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)\n{\n       desc->handle_irq(irq, desc);\n}\n```\n\nBut stop... What is this `handle_irq` and why do we call our interrupt handler from the interrupt descriptor when we know that `irqaction` points to the actual interrupt handler? Actually the `irq_desc->handle_irq` is a high-level API for calling the interrupt handler routine. 
It is setup during initialization of the [device tree](https://en.wikipedia.org/wiki/Device_tree) and [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) initialization. The kernel selects correct function and call chain of the `irq->action(s)` there. In this way, the `serial21285_tx_chars` or the `serial21285_rx_chars` function will be executed after an interrupt occurs.\n\nIn the end of the `do_IRQ` function we call the `irq_exit` function that will exit from the interrupt context, the `set_irq_regs` with the old userspace registers and return:\n\n```C\nirq_exit();\nset_irq_regs(old_regs);\nreturn 1;\n```\n\nWe already know that when an `IRQ` finishes its work, deferred interrupts will be executed if they exist.\n\nExit from interrupt\n--------------------------------------------------------------------------------\n\nOk, the interrupt handler finished its execution and now we must return from the interrupt. When the work of the `do_IRQ` function is finished, we will return back to the assembler code in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) to the `ret_from_intr` label. First of all we disable interrupts with the `DISABLE_INTERRUPTS` macro that expands to the `cli` instruction and decreases value of the `irq_count` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable. 
Remember, this variable had value - `1`, when we were in interrupt context:\n\n```assembly\nDISABLE_INTERRUPTS(CLBR_NONE)\nTRACE_IRQS_OFF\ndecl\tPER_CPU_VAR(irq_count)\n```\n\nIn the last step we check the previous context (user or kernel), restore it in a correct way and exit from an interrupt with the:\n\n```assembly\nINTERRUPT_RETURN\n```\n\nwhere the `INTERRUPT_RETURN` macro is:\n\n```C\n#define INTERRUPT_RETURN\tjmp native_iret\n```\n\nand\n\n```assembly\nENTRY(native_iret)\n\n.global native_irq_return_iret\nnative_irq_return_iret:\n\tiretq\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the tenth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and as you have read in the beginning of this part - it is the last part of this chapter. This chapter started from the explanation of the theory of interrupts and we have learned what is it interrupt and kinds of interrupts, then we saw exceptions and handling of this kind of interrupts, deferred interrupts and finally we looked on the hardware interrupts and the handling of theirs in this part. Of course, this part and even this chapter does not cover full aspects of interrupts and interrupt handling in the Linux kernel. It is not realistic to do this. At least for me. It was the big part, I don't know how about you, but it was really big for me. This theme is much bigger than this chapter and I am not sure that somewhere there is a book that covers it. We have missed many part and aspects of interrupts and interrupt handling, but I think it will be good point to dive in the kernel code related to the interrupts and interrupts handling.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Serial driver documentation](https://www.kernel.org/doc/Documentation/serial/driver)\n* [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf)\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [module](https://en.wikipedia.org/wiki/Loadable_kernel_module)\n* [initcall](http://kernelnewbies.org/Documents/InitcallMechanism)\n* [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter)\n* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture)\n* [memory management](https://0xax.gitbook.io/linux-insides/summary/mm)\n* [i2c](https://en.wikipedia.org/wiki/I%C2%B2C)\n* [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [GNU assembler](https://en.wikipedia.org/wiki/GNU_Assembler)\n* [Processor register](https://en.wikipedia.org/wiki/Processor_register)\n* [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [pid](https://en.wikipedia.org/wiki/Process_identifier)\n* [device tree](https://en.wikipedia.org/wiki/Device_tree)\n* [system calls](https://en.wikipedia.org/wiki/System_call)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-2.md",
    "content": "Interrupts and Interrupt Handling. Part 2.\n================================================================================\n\nStart to dive into interrupt and exceptions handling in the Linux kernel\n--------------------------------------------------------------------------------\n\nWe saw some theory about interrupts and exception handling in the [introduction](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) and as I mentioned in that part, we will now start to dive into interrupts and exceptions within the Linux kernel source code. We'll commence by initializing the basic components as we did in the other chapters. But, we will not see the Linux kernel source code from the very early [code lines](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/header.S#L292), as this was presented in the example within the [Linux kernel booting process](https://0xax.gitbook.io/linux-insides/summary/booting) chapter. In the beginning we will deal with the first sections of the Linux kernel source code, which are related to interrupts and exceptions.\n\nIf you've read the previous parts, you can remember that the earliest place in the Linux kernel `x86_64` architecture-specific source code, which is related to the interrupt is located in the [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/pm.c) source code file and represents the first setup of the [Interrupt Descriptor Table](http://en.wikipedia.org/wiki/Interrupt_descriptor_table). 
It occurs right before the transition into the [protected mode](http://en.wikipedia.org/wiki/Protected_mode) in the `go_to_protected_mode` function by calling `setup_idt`:\n\n```C\nvoid go_to_protected_mode(void)\n{\n\t...\n\tsetup_idt();\n\t...\n}\n```\n\nThe `setup_idt` function is defined in the same source code file as the `go_to_protected_mode` function and just loads the address of the `NULL` interrupt descriptor table:\n\n```C\nstatic void setup_idt(void)\n{\n\tstatic const struct gdt_ptr null_idt = {0, 0};\n\tasm volatile(\"lidtl %0\" : : \"m\" (null_idt));\n}\n```\n\nwhere `gdt_ptr` represents a special 48-bit `GDTR` register, which must contain the base address of the `Global Descriptor Table`:\n\n```C\nstruct gdt_ptr {\n\tu16 len;\n\tu32 ptr;\n} __attribute__((packed));\n```\n\nOf course in our case the `gdt_ptr` does not represent the `GDTR` register, but `IDTR` since we set the `Interrupt Descriptor Table`. You will not find an `idt_ptr` structure, because if it had been in the Linux kernel source code, it would have been the same as a `gdt_ptr` but with a different name. It would make no sense to create two structures that only differ in their names. Note here that we do not fill the `Interrupt Descriptor Table` with entries, because it is too early to handle any interrupts or exceptions at this point. That's why we just fill the `IDT` with `NULL`.\n\nAfter the setup of the [Interrupt descriptor table](http://en.wikipedia.org/wiki/Interrupt_descriptor_table), [Global Descriptor Table](http://en.wikipedia.org/wiki/GDT) and other stuff we jump into [protected mode](http://en.wikipedia.org/wiki/Protected_mode) in the - [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/pmjump.S) file. 
You can read more about it in the [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3), which describes the transition to protected mode.\n\nThe entry to protected mode is located in the `boot_params.hdr.code32_start` and passed together with the `boot_params` to the `protected_mode_jump` function at the end of [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/pm.c):\n\n```C\nprotected_mode_jump(boot_params.hdr.code32_start,\n\t\t\t    (u32)&boot_params + (ds() << 4));\n```\n\nThe `protected_mode_jump` function is defined at [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/pmjump.S) and receives these two parameters within the `ax` and `dx` registers, using one of the [8086](http://en.wikipedia.org/wiki/Intel_8086) calling  [conventions](http://en.wikipedia.org/wiki/X86_calling_conventions#List_of_x86_calling_conventions):\n\n```assembly\nSYM_FUNC_START_NOALIGN(protected_mode_jump)\n\t...\n\t...\n\t...\n\t.byte\t0x66, 0xea\t\t# ljmpl opcode\n2:\t.long\t.Lin_pm32\t\t# offset\n\t.word\t__BOOT_CS\t\t# segment\nSYM_FUNC_END(protected_mode_jump)\n```\n\nwhere `in_pm32` contains a jump to the 32-bit entry point:\n\n```assembly\nSYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32)\n\t...\n\t...\n\tjmpl\t*%eax\t\t\t# Jump to the 32-bit entrypoint\nSYM_FUNC_END(.Lin_pm32)\n```\n\nAs you can remember the 32-bit entry point is in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S) assembly file, although it contains `_64` in its name. We can see the two similar files in the `arch/x86/boot/compressed` directory:\n\n* `arch/x86/boot/compressed/head_32.S`.\n* `arch/x86/boot/compressed/head_64.S`;\n\nBut the 32-bit mode entry point is the second file in our case. The first file is not even compiled for `x86_64`. 
Let's look at the [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/Makefile):\n\n```\nvmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \\\n...\n...\n```\n\nWe can see here that `head_*` depends on the `$(BITS)` variable, which is based on the architecture. The variable is defined within [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile):\n\n```\nifeq ($(CONFIG_X86_32),y)\n...\n\tBITS := 32\nelse\n\tBITS := 64\n\t...\nendif\n```\n\nNow as we jumped into `startup_32` from [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S), we will not encounter anything related to interrupt handling here. The code inside of `startup_32` makes necessary preparations, before transitioning into the [long mode](http://en.wikipedia.org/wiki/Long_mode) with a direct jump. The `long mode` entry is located in `startup_64` and it makes arrangements for the [kernel decompression](https://0xax.gitbooks.io/linux-insides/content/Booting/linux-bootstrap-5.html) that occurs in the `decompress_kernel` function inside of [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/misc.c). After the kernel is decompressed, we jump into `startup_64` defined at [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). In `startup_64` we start to build identity-mapped pages, check the [NX](http://en.wikipedia.org/wiki/NX_bit) bit, setup the `Extended Feature Enable Register` (see in links) and update the early `Global Descriptor Table` with the `lgdt` instruction. 
And proceed to setup `gs` register with the following code:\n\n```assembly\nmovl\t$MSR_GS_BASE,%ecx\nmovl\tinitial_gs(%rip),%eax\nmovl\tinitial_gs+4(%rip),%edx\nwrmsr\n```\n\nWe already saw this code in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1). First of all pay attention on the last `wrmsr` instruction. This instruction writes data from the `edx:eax` registers to the [model specific register](http://en.wikipedia.org/wiki/Model-specific_register) specified by the `ecx` register. We can see that `ecx` contains `$MSR_GS_BASE` which is declared in the [arch/x86/include/asm/msr-index.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/msr-index.h) and looks like:\n\n```C\n#define MSR_GS_BASE             0xc0000101\n```\n\nFrom this we can understand that `MSR_GS_BASE` defines the number of the `model specific register`. Since registers `cs`, `ds`, `es`, and `ss` are not used in the 64-bit mode, their fields are ignored. But we can access memory over `fs` and `gs` registers. The model specific register provides a `back door` to the hidden parts of these segment registers and allows to use 64-bit base address for segment register addressed by the `fs` and `gs`. So the `MSR_GS_BASE` is the hidden part and this part is mapped on the `GS.base` field. Let's look on the `initial_gs`:\n\n```assembly\nGLOBAL(initial_gs)\n\t.quad\tINIT_PER_CPU_VAR(irq_stack_union)\n```\n\nWe pass `irq_stack_union` symbol to the `INIT_PER_CPU_VAR` macro which just concatenates the `init_per_cpu__` prefix with the given symbol. In our case we will get the `init_per_cpu__irq_stack_union` symbol. Let's look at the [linker](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/vmlinux.lds.S) script. 
There we can see the following definition:\n\n```\n#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load\nINIT_PER_CPU(irq_stack_union);\n```\n\nIt tells us that the address of the `init_per_cpu__irq_stack_union` will be `irq_stack_union + __per_cpu_load`. Now we need to understand where `init_per_cpu__irq_stack_union` and `__per_cpu_load` are and what they mean. The first `irq_stack_union` is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) with the `DECLARE_INIT_PER_CPU` macro which expands to call the `init_per_cpu_var` macro:\n\n```C\nDECLARE_INIT_PER_CPU(irq_stack_union);\n\n#define DECLARE_INIT_PER_CPU(var) \\\n       extern typeof(per_cpu_var(var)) init_per_cpu_var(var)\n\n#define init_per_cpu_var(var)  init_per_cpu__##var\n```\n\nIf we expand all macros we will get the same `init_per_cpu__irq_stack_union` as we got after expanding the `INIT_PER_CPU` macro, but you can note that it is not just a symbol, but a variable. Let's look at the `typeof(per_cpu_var(var))` expression. Our `var` is `irq_stack_union` and the `per_cpu_var` macro is defined in the [arch/x86/include/asm/percpu.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/percpu.h):\n\n```C\n#define PER_CPU_VAR(var)        %__percpu_seg:var\n```\n\nwhere:\n\n```C\n#ifdef CONFIG_X86_64\n    #define __percpu_seg gs\n#endif\n```\n\nSo, we are accessing `gs:irq_stack_union` and getting its type which is `irq_stack_union`. Ok, we defined the first variable and know its address, now let's look at the second `__per_cpu_load` symbol. There are a couple of `per-cpu` variables which are located after this symbol. 
The `__per_cpu_load` is defined in the [include/asm-generic/sections.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/sections.h):\n\n```C\nextern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];\n```\n\nand presented base address of the `per-cpu` variables from the data area. So, we know the address of the `irq_stack_union`, `__per_cpu_load` and we know that `init_per_cpu__irq_stack_union` must be placed right after `__per_cpu_load`. And we can see it in the [System.map](http://en.wikipedia.org/wiki/System.map):\n\n```\n...\n...\n...\nffffffff819ed000 D __init_begin\nffffffff819ed000 D __per_cpu_load\nffffffff819ed000 A init_per_cpu__irq_stack_union\n...\n...\n...\n```\n\nNow we know about `initial_gs`, so let's look at the code:\n\n```assembly\nmovl\t$MSR_GS_BASE,%ecx\nmovl\tinitial_gs(%rip),%eax\nmovl\tinitial_gs+4(%rip),%edx\nwrmsr\n```\n\nHere we specified a model specific register with `MSR_GS_BASE`, put the content of the `initial_gs` to the `edx:eax` pair and execute the `wrmsr` instruction for filling the `gs` register with the base address of the `init_per_cpu__irq_stack_union` which will be at the bottom of the interrupt stack. After this we will jump to the C code on the `x86_64_start_kernel` from the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c). In the `x86_64_start_kernel` function we do the last preparations before we jump into the generic and architecture-independent kernel code and one of these preparations is filling the early `Interrupt Descriptor Table` with the interrupts handlers entries or `early_idt_handlers`. 
You can remember it, if you have read the part about the [Early interrupt and exception handling](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) and can remember following code:\n\n```C\nfor (i = 0; i < NUM_EXCEPTION_VECTORS; i++)\n\tset_intr_gate(i, early_idt_handlers[i]);\n\nload_idt((const struct desc_ptr *)&idt_descr);\n```\n\nbut I wrote `Early interrupt and exception handling` part when Linux kernel version was - `3.18`. For this day actual version of the Linux kernel is `4.1.0-rc6+` and ` Andy Lutomirski` sent the [patch](https://lkml.org/lkml/2015/6/2/106) and soon it will be in the mainline kernel that changes behaviour for the `early_idt_handlers`. **NOTE** While I wrote this part the [patch](https://github.com/torvalds/linux/commit/425be5679fd292a3c36cb1fe423086708a99f11a) already turned in the Linux kernel source code. Let's look on it. Now the same part looks like:\n\n```C\nfor (i = 0; i < NUM_EXCEPTION_VECTORS; i++)\n\tset_intr_gate(i, early_idt_handler_array[i]);\n\nload_idt((const struct desc_ptr *)&idt_descr);\n```\n\nAs you can see it has only one difference in the name of the array of the interrupts handlers entry points. Now it is `early_idt_handler_array`:\n\n```C\nextern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];\n```\n\nwhere `NUM_EXCEPTION_VECTORS` and `EARLY_IDT_HANDLER_SIZE` are defined as:\n\n```C\n#define NUM_EXCEPTION_VECTORS 32\n#define EARLY_IDT_HANDLER_SIZE 9\n```\n\nSo, the `early_idt_handler_array` is an array of the interrupts handlers entry points and contains one entry point on every nine bytes. You can remember that previous `early_idt_handlers` was defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). 
The `early_idt_handler_array` is defined in the same source code file too:\n\n```assembly\nENTRY(early_idt_handler_array)\n...\n...\n...\nENDPROC(early_idt_handler_array)\n```\n\nIt fills `early_idt_handler_array` with the `.rept NUM_EXCEPTION_VECTORS` and contains entry of the `early_make_pgtable` interrupt handler (you can read more about its implementation in the part about [Early interrupt and exception handling](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2)). For now, we have reached the end of the x86_64 architecture-specific code and the next part is the generic kernel code. You probably already know, that we will return to the architecture-specific code in the `setup_arch` function and other places, but this is the end of the `x86_64` early code.\n\nSetting stack canary for the interrupt stack\n-------------------------------------------------------------------------------\n\nThe next stop after the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S) is the biggest `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). If you've read the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you must remember it. This function does all initialization stuff before kernel will launch first `init` process with the [pid](https://en.wikipedia.org/wiki/Process_identifier) - `1`. The first thing that is related to the interrupts and exceptions handling is the call of the `boot_init_stack_canary` function.\n\nThis function sets the [canary](http://en.wikipedia.org/wiki/Stack_buffer_overflow#Stack_canaries) value to protect interrupt stack overflow. 
We already saw some details about the implementation of the `boot_init_stack_canary` in the previous part and now let's take a closer look at it. You can find the implementation of this function in the [arch/x86/include/asm/stackprotector.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/stackprotector.h) and it depends on the `CONFIG_CC_STACKPROTECTOR` kernel configuration option. If this option is not set this function will not do anything:\n\n```C\n#ifdef CONFIG_CC_STACKPROTECTOR\n...\n...\n...\n#else\nstatic inline void boot_init_stack_canary(void)\n{\n}\n#endif\n```\n\nIf the `CONFIG_CC_STACKPROTECTOR` kernel configuration option is set, the `boot_init_stack_canary` function starts with a check that the `stack_canary` field is located at an offset of forty bytes from the start of the `irq_stack_union` that represents the [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) interrupt stack:\n\n```C\n#ifdef CONFIG_X86_64\n        BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);\n#endif\n```\n\nAs we can read in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) the `irq_stack_union` is represented by the following union:\n\n```C\nunion irq_stack_union {\n\tchar irq_stack[IRQ_STACK_SIZE];\n\n    struct {\n\t\tchar gs_base[40];\n\t\tunsigned long stack_canary;\n\t};\n};\n```\n\nwhich is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h). We know that [union](http://en.wikipedia.org/wiki/Union_type) in the [C](http://en.wikipedia.org/wiki/C_%28programming_language%29) programming language is a data structure which stores only one field in memory at a time. We can see here that the structure has a first field - `gs_base` which is 40 bytes in size and represents the bottom of the `irq_stack`. 
So, after this our check with the `BUILD_BUG_ON` macro should end successfully. (you can read the first part about Linux kernel initialization [process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) if you're interested in the `BUILD_BUG_ON` macro).\n\nAfter this we calculate a new `canary` value based on the random number and [Time Stamp Counter](http://en.wikipedia.org/wiki/Time_Stamp_Counter):\n\n```C\nget_random_bytes(&canary, sizeof(canary));\ntsc = __native_read_tsc();\ncanary += tsc + (tsc << 32UL);\n```\n\nand write the `canary` value to the `irq_stack_union` with the `this_cpu_write` macro:\n\n```C\nthis_cpu_write(irq_stack_union.stack_canary, canary);\n```\n\nYou can read more about `this_cpu_*` operations in the [Linux kernel documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt).\n\nDisabling/Enabling local interrupts\n--------------------------------------------------------------------------------\n\nThe next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) which is related to the interrupts and interrupts handling after we have set the `canary` value to the interrupt stack - is the call of the `local_irq_disable` macro.\n\nThis macro is defined in the [include/linux/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqflags.h) header file and as you can understand, we can disable interrupts for the CPU with the call of this macro. Let's look at its implementation. 
First of all note that it depends on the `CONFIG_TRACE_IRQFLAGS_SUPPORT` kernel configuration option:\n\n```C\n#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT\n...\n#define local_irq_disable() \\\n         do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)\n...\n#else\n...\n#define local_irq_disable()     do { raw_local_irq_disable(); } while (0)\n...\n#endif\n```\n\nThey are both similar and as you can see have only one difference: the `local_irq_disable` macro contains a call of the `trace_hardirqs_off` when `CONFIG_TRACE_IRQFLAGS_SUPPORT` is enabled. There is a special feature in the [lockdep](http://lwn.net/Articles/321663/) subsystem - `irq-flags tracing` for tracing `hardirq` and `softirq` state. In our case the `lockdep` subsystem can give us interesting information about hard/soft irqs on/off events which occur in the system. The `trace_hardirqs_off` function is defined in the [kernel/locking/lockdep.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/lockdep.c):\n\n```C\nvoid trace_hardirqs_off(void)\n{\n         trace_hardirqs_off_caller(CALLER_ADDR0);\n}\nEXPORT_SYMBOL(trace_hardirqs_off);\n```\n\nand just calls the `trace_hardirqs_off_caller` function. The `trace_hardirqs_off_caller` checks the `hardirqs_enabled` field of the current process and increases the `redundant_hardirqs_off` if the call of the `local_irq_disable` was redundant or the `hardirqs_off_events` if it was not. 
These two fields and other `lockdep` statistic related fields are defined in the [kernel/locking/lockdep_internals.h](https://github.com/torvalds/linux/blob/master/kernel/locking/lockdep_internals.h) and located in the `lockdep_stats` structure:\n\n```C\nstruct lockdep_stats {\n...\n...\n...\nint     softirqs_off_events;\nint     redundant_softirqs_off;\n...\n...\n...\n}\n```\n\nIf you will set `CONFIG_DEBUG_LOCKDEP` kernel configuration option, the `lockdep_stats_debug_show` function will write all tracing information to the `/proc/lockdep`:\n\n```C\nstatic void lockdep_stats_debug_show(struct seq_file *m)\n{\n#ifdef CONFIG_DEBUG_LOCKDEP\n\tunsigned long long hi1 = debug_atomic_read(hardirqs_on_events),\n\t                         hi2 = debug_atomic_read(hardirqs_off_events),\n\t\t\t\t\t\t\t hr1 = debug_atomic_read(redundant_hardirqs_on),\n    ...\n\t...\n\t...\n    seq_printf(m, \" hardirq on events:             %11llu\\n\", hi1);\n    seq_printf(m, \" hardirq off events:            %11llu\\n\", hi2);\n    seq_printf(m, \" redundant hardirq ons:         %11llu\\n\", hr1);\n#endif\n}\n```\n\nand you can see its result with the:\n\n```\n$ sudo cat /proc/lockdep\n hardirq on events:             12838248974\n hardirq off events:            12838248979\n redundant hardirq ons:               67792\n redundant hardirq offs:         3836339146\n softirq on events:                38002159\n softirq off events:               38002187\n redundant softirq ons:                   0\n redundant softirq offs:                  0\n```\n\nOk, now we know a little about tracing, but more info will be in the separate part about `lockdep` and `tracing`. You can see that the both `local_irq_disable` macros have the same part - `raw_local_irq_disable`. 
This macro is defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) and expands to the call of the:\n\n```C\nstatic inline void native_irq_disable(void)\n{\n        asm volatile(\"cli\": : :\"memory\");\n}\n```\n\nAnd you already must remember that the `cli` instruction clears the [IF](http://en.wikipedia.org/wiki/Interrupt_flag) flag which determines ability of a processor to handle an interrupt or an exception. Besides the `local_irq_disable`, as you may already know, there is an inverse macro - `local_irq_enable`. This macro has the same tracing mechanism and is very similar to the `local_irq_disable`, but as you can understand from its name, it enables interrupts with the `sti` instruction:\n\n```C\nstatic inline void native_irq_enable(void)\n{\n        asm volatile(\"sti\": : :\"memory\");\n}\n```\n\nNow we know how `local_irq_disable` and `local_irq_enable` work. It was the first call of the `local_irq_disable` macro, but we will meet these macros many times in the Linux kernel source code. But for now we are in the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and we just disabled `local` interrupts. Why local and why did we do it? Previously the kernel provided a method to disable interrupts on all processors and it was called `cli`. This function was [removed](https://lwn.net/Articles/291956/) and now we have `local_irq_{enable,disable}` to enable or disable interrupts on the current processor. 
After we've disabled the interrupts with the `local_irq_disable` macro, we set the:\n\n```C\nearly_boot_irqs_disabled = true;\n```\n\nThe `early_boot_irqs_disabled` variable is defined in the [include/linux/kernel.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/kernel.h):\n\n```C\nextern bool early_boot_irqs_disabled;\n```\n\nand is used in different places. For example, it is used in the `smp_call_function_many` function from the [kernel/smp.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/smp.c) to check for a possible deadlock when interrupts are disabled:\n\n```C\nWARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()\n                     && !oops_in_progress && !early_boot_irqs_disabled);\n```\n\nEarly trap initialization during kernel initialization\n--------------------------------------------------------------------------------\n\nThe next functions after the `local_irq_disable` are `boot_cpu_init` and `page_address_init`, but they are not related to the interrupts and exceptions (you can read more about these functions in the chapter about Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization)). The next is the `setup_arch` function. As you can remember this function is located in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file and initializes a lot of architecture-dependent [stuff](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4). The first interrupts related function which we can see in the `setup_arch` is the `early_trap_init` function. 
This function defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and fills `Interrupt Descriptor Table` with the couple of entries:\n\n```C\nvoid __init early_trap_init(void)\n{\n        set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);\n        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);\n#ifdef CONFIG_X86_32\n        set_intr_gate(X86_TRAP_PF, page_fault);\n#endif\n        load_idt(&idt_descr);\n}\n```\n\nHere we can see calls of three different functions:\n\n* `set_intr_gate_ist`\n* `set_system_intr_gate_ist`\n* `set_intr_gate`\n\nAll of these functions defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) and do the similar thing but not the same. The first `set_intr_gate_ist` function inserts a new interrupt gate in the `IDT`. Let's look on its implementation:\n\n```C\nstatic inline void set_intr_gate_ist(int n, void *addr, unsigned ist)\n{\n        BUG_ON((unsigned)n > 0xFF);\n        _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);\n}\n```\n\nFirst of all we can see the check that `n` which is [vector number](http://en.wikipedia.org/wiki/Interrupt_vector_table) of the interrupt is not greater than `0xff` or 255. We need to check it because we remember from the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) that vector number of an interrupt must be between `0` and `255`. 
In the next step we can see the call of the `_set_gate` function that sets a given interrupt gate to the `IDT` table:\n\n```C\nstatic inline void _set_gate(int gate, unsigned type, void *addr,\n                             unsigned dpl, unsigned ist, unsigned seg)\n{\n        gate_desc s;\n\n        pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);\n        write_idt_entry(idt_table, gate, &s);\n        write_trace_idt_entry(gate, &s);\n}\n```\n\nHere we start from the `pack_gate` function which takes clean `IDT` entry represented by the `gate_desc` structure and fills it with the base address and limit, [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks), [Privilege level](http://en.wikipedia.org/wiki/Privilege_level), type of an interrupt which can be one of the following values:\n\n* `GATE_INTERRUPT`\n* `GATE_TRAP`\n* `GATE_CALL`\n* `GATE_TASK`\n\nand set the present bit for the given `IDT` entry:\n\n```C\nstatic inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,\n                             unsigned dpl, unsigned ist, unsigned seg)\n{\n        gate->offset_low        = PTR_LOW(func);\n        gate->segment           = __KERNEL_CS;\n        gate->ist               = ist;\n        gate->p                 = 1;\n        gate->dpl               = dpl;\n        gate->zero0             = 0;\n        gate->zero1             = 0;\n        gate->type              = type;\n        gate->offset_middle     = PTR_MIDDLE(func);\n        gate->offset_high       = PTR_HIGH(func);\n}\n```\n\nAfter this we write just filled interrupt gate to the `IDT` with the `write_idt_entry` macro which expands to the `native_write_idt_entry` and just copy the interrupt gate to the `idt_table` table by the given index:\n\n```C\n#define write_idt_entry(dt, entry, g)           native_write_idt_entry(dt, entry, g)\n\nstatic inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)\n{\n        
memcpy(&idt[entry], gate, sizeof(*gate));\n}\n```\n\nwhere `idt_table` is just an array of `gate_desc`:\n\n```C\nextern gate_desc idt_table[];\n```\n\nThat's all. The second `set_system_intr_gate_ist` function has only one difference from the `set_intr_gate_ist`:\n\n```C\nstatic inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)\n{\n        BUG_ON((unsigned)n > 0xFF);\n        _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);\n}\n```\n\nDo you see it? Look at the fourth parameter of the `_set_gate`. It is `0x3`. In the `set_intr_gate_ist` it was `0x0`. We know that this parameter represents `DPL` or privilege level. We also know that `0` is the highest privilege level and `3` is the lowest. Now we know how `set_system_intr_gate_ist`, `set_intr_gate_ist`, `set_intr_gate` work and we can return to the `early_trap_init` function. Let's look at it again:\n\n```C\nset_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);\nset_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);\n```\n\nWe set two `IDT` entries for the `#DB` interrupt and `int3`. These functions take the same set of parameters:\n\n* vector number of an interrupt;\n* address of an interrupt handler;\n* interrupt stack table index.\n\nThat's all. You will learn more about interrupts and handlers in the next parts.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the second part about interrupts and interrupt handling in the Linux kernel. We saw some theory in the previous part and started to dive into interrupts and exceptions handling in the current part. We have started from the earliest parts in the Linux kernel source code which are related to the interrupts. 
In the next part we will continue to dive into this interesting theme and will know more about interrupt handling process.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [IDT](http://en.wikipedia.org/wiki/Interrupt_descriptor_table)\n* [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)\n* [List of x86 calling conventions](http://en.wikipedia.org/wiki/X86_calling_conventions#List_of_x86_calling_conventions)\n* [8086](http://en.wikipedia.org/wiki/Intel_8086)\n* [Long mode](http://en.wikipedia.org/wiki/Long_mode)\n* [NX](http://en.wikipedia.org/wiki/NX_bit)\n* [Extended Feature Enable Register](http://en.wikipedia.org/wiki/Control_register#Additional_Control_registers_in_x86-64_series)\n* [Model-specific register](http://en.wikipedia.org/wiki/Model-specific_register)\n* [Process identifier](https://en.wikipedia.org/wiki/Process_identifier)\n* [lockdep](http://lwn.net/Articles/321663/)\n* [irqflags tracing](https://www.kernel.org/doc/Documentation/irqflags-tracing.txt)\n* [IF](http://en.wikipedia.org/wiki/Interrupt_flag)\n* [Stack canary](http://en.wikipedia.org/wiki/Stack_buffer_overflow#Stack_canaries)\n* [Union type](http://en.wikipedia.org/wiki/Union_type)\n* [this_cpu_* operations](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt)\n* [vector number](http://en.wikipedia.org/wiki/Interrupt_vector_table)\n* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)\n* [Privilege level](http://en.wikipedia.org/wiki/Privilege_level)\n* [Previous 
part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-3.md",
    "content": "Interrupts and Interrupt Handling. Part 3.\n================================================================================\n\nException Handling\n--------------------------------------------------------------------------------\n\nThis is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and an exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts) we stopped at the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blame/master/arch/x86/kernel/setup.c) source code file.\n\nWe already know that this function executes initialization of architecture-specific stuff. In our case the `setup_arch` function does [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related initializations. The `setup_arch` is big function, and in the previous part we stopped on the setting of the two exception handlers for the two following exceptions:\n\n* `#DB` - debug exception, transfers control from the interrupted process to the debug handler;\n* `#BP` - breakpoint exception, caused by the `int 3` instruction.\n\nThese exceptions allow the `x86_64` architecture to have early exception processing for the purpose of debugging via the [kgdb](https://en.wikipedia.org/wiki/KGDB).\n\nAs you can remember we set these exceptions handlers in the `early_trap_init` function:\n\n```C\nvoid __init early_trap_init(void)\n{\n        set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);\n        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);\n        load_idt(&idt_descr);\n}\n```\n\nfrom the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). 
We already saw implementation of the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions in the previous part and now we will look on the implementation of these two exception handlers.\n\nDebug and Breakpoint exceptions\n--------------------------------------------------------------------------------\n\nOk, we setup exception handlers in the `early_trap_init` function for the `#DB` and `#BP` exceptions and now is time to consider their implementations. But before we will do this, first of all let's look on details of these exceptions.\n\nThe first exceptions - `#DB` or `debug` exception occurs when a debug event occurs. For example - attempt to change the contents of a [debug register](http://en.wikipedia.org/wiki/X86_debug_register). Debug registers are special registers that were presented in `x86` processors starting from the [Intel 80386](http://en.wikipedia.org/wiki/Intel_80386) processor and as you can understand from name of this CPU extension, main purpose of these registers is debugging.\n\nThese registers allow to set breakpoints on the code and read or write data to trace it. Debug registers may be accessed only in the privileged mode and an attempt to read or write the debug registers when executing at any other privilege level causes a [general protection fault](https://en.wikipedia.org/wiki/General_protection_fault) exception. 
That's why we have used `set_intr_gate_ist` for the `#DB` exception, but not the `set_system_intr_gate_ist`.\n\nThe vector number of the `#DB` exceptions is `1` (we pass it as `X86_TRAP_DB`) and as we may read in specification, this exception has no error code:\n\n```\n+-----------------------------------------------------+\n|Vector|Mnemonic|Description         |Type |Error Code|\n+-----------------------------------------------------+\n|1     | #DB    |Reserved            |F/T  |NO        |\n+-----------------------------------------------------+\n```\n\nThe second exception is `#BP` or `breakpoint` exception occurs when processor executes the [int 3](http://en.wikipedia.org/wiki/INT_%28x86_instruction%29#INT_3) instruction. Unlike the `DB` exception, the `#BP` exception may occur in userspace. We can add it anywhere in our code, for example let's look on the simple program:\n\n```C\n// breakpoint.c\n#include <stdio.h>\n\nint main() {\n    int i;\n    while (i < 6){\n\t    printf(\"i equal to: %d\\n\", i);\n\t    __asm__(\"int3\");\n\t\t++i;\n    }\n}\n```\n\nIf we will compile and run this program, we will see following output:\n\n```\n$ gcc breakpoint.c -o breakpoint\n$ ./breakpoint\ni equal to: 0\nTrace/breakpoint trap\n```\n\nBut if will run it with gdb, we will see our breakpoint and can continue execution of our program:\n\n```\n$ gdb breakpoint\n...\n...\n...\n(gdb) run\nStarting program: /home/alex/breakpoints\ni equal to: 0\n\nProgram received signal SIGTRAP, Trace/breakpoint trap.\n0x0000000000400585 in main ()\n=> 0x0000000000400585 <main+31>:\t83 45 fc 01\tadd    DWORD PTR [rbp-0x4],0x1\n(gdb) c\nContinuing.\ni equal to: 1\n\nProgram received signal SIGTRAP, Trace/breakpoint trap.\n0x0000000000400585 in main ()\n=> 0x0000000000400585 <main+31>:\t83 45 fc 01\tadd    DWORD PTR [rbp-0x4],0x1\n(gdb) c\nContinuing.\ni equal to: 2\n\nProgram received signal SIGTRAP, Trace/breakpoint trap.\n0x0000000000400585 in main ()\n=> 0x0000000000400585 <main+31>:\t83 
45 fc 01\tadd    DWORD PTR [rbp-0x4],0x1\n...\n...\n...\n```\n\nFrom this moment we know a little about these two exceptions and we can move on to consideration of their handlers.\n\nPreparation before an exception handler\n--------------------------------------------------------------------------------\n\nAs you may have noted before, the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions take the addresses of exception handlers as their second parameter. In our case our two exception handlers will be:\n\n* `debug`;\n* `int3`.\n\nYou will not find these functions in the C code. All that can be found in the kernel's `*.c/*.h` files are the declarations of these functions, which are located in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h) kernel header file:\n\n```C\nasmlinkage void debug(void);\n```\n\nand\n\n```C\nasmlinkage void int3(void);\n```\n\nYou may note the `asmlinkage` directive in the declarations of these functions. The directive is a special specifier of the [gcc](http://en.wikipedia.org/wiki/GNU_Compiler_Collection). Actually, for `C` functions which are called from assembly, we need an explicit declaration of the function calling convention. In our case, if a function is declared with the `asmlinkage` specifier, then `gcc` will compile the function to retrieve parameters from the stack.\n\nSo, both handlers are defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly source code file with the `idtentry` macro:\n\n```assembly\nidtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK\n```\n\nand\n\n```assembly\nidtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK\n```\n\nEach exception handler may consist of two parts. The first part is the generic part and it is the same for all exception handlers. 
An exception handler should to save  [general purpose registers](https://en.wikipedia.org/wiki/Processor_register) on the stack, switch to kernel stack if an exception came from userspace and transfer control to the second part of an exception handler. The second part of an exception handler does certain work depends on certain exception. For example page fault exception handler should find virtual page for given address, invalid opcode exception handler should send `SIGILL` [signal](https://en.wikipedia.org/wiki/Unix_signal) and etc.\n\nAs we just saw, an exception handler starts from definition of the `idtentry` macro from the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) assembly source code file, so let's look at implementation of this macro. As we may see, the `idtentry` macro takes five arguments:\n\n* `sym` - defines global symbol with the `.globl name` which will be an an entry of exception handler;\n* `do_sym` - symbol name which represents a secondary entry of an exception handler;\n* `has_error_code` - information about existence of an error code of exception.\n\nThe last two parameters are optional:\n\n* `paranoid` - shows us how we need to check current mode (will see explanation in details later);\n* `shift_ist` - shows us is an exception running at `Interrupt Stack Table`.\n\nDefinition of the `.idtentry` macro looks:\n\n```assembly\n.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1\nENTRY(\\sym)\n...\n...\n...\nEND(\\sym)\n.endm\n```\n\nBefore we will consider internals of the `idtentry` macro, we should to know state of stack when an exception occurs. 
As we may read in the [Intel® 64 and IA-32 Architectures Software Developer’s Manual 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html), the state of the stack when an exception occurs is the following:\n\n```\n    +------------+\n+40 | %SS        |\n+32 | %RSP       |\n+24 | %RFLAGS    |\n+16 | %CS        |\n +8 | %RIP       |\n  0 | ERROR CODE | <-- %RSP\n    +------------+\n```\n\nNow we may start to consider the implementation of the `idtentry` macro. Both `#DB` and `#BP` exception handlers are defined as:\n\n```assembly\nidtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK\nidtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK\n```\n\nIf we look at these definitions, we can see that the compiler will generate two routines with the `debug` and `int3` names and both of these exception handlers will call the `do_debug` and `do_int3` secondary handlers after some preparation. The third parameter defines the existence of an error code and, as we may see, neither of our exceptions has one. As we may see on the diagram above, the processor pushes an error code on the stack if an exception provides it. In our case, the `debug` and `int3` exceptions do not have error codes. This may bring some difficulties because the stack will look different for exceptions which provide an error code and for exceptions which do not. That's why the implementation of the `idtentry` macro starts from putting a fake error code on the stack if an exception does not provide it:\n\n```assembly\n.ifeq \\has_error_code\n    pushq\t$-1\n.endif\n```\n\nBut it is not only a fake error code. Moreover, the `-1` also represents an invalid system call number, so that the system call restart logic will not be triggered.\n\nThe last two parameters of the `idtentry` macro, `shift_ist` and `paranoid`, allow us to know whether an exception handler runs on a stack from the `Interrupt Stack Table` or not. You already may know that each kernel thread in the system has its own stack. 
In addition to these stacks, there are some specialized stacks associated with each processor in the system. One of these stacks is the exception stack. The [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture provides a special feature called the `Interrupt Stack Table`. This feature allows the processor to switch to a new stack for designated events such as atomic exceptions like `double fault`, etc. So the `shift_ist` parameter tells us whether we need to switch to an `IST` stack for an exception handler or not.\n\nThe other parameter - `paranoid` - defines the method which helps us to know whether we came to an exception handler from userspace or not. The easiest way to determine this is via the `CPL` or `Current Privilege Level` in the `CS` segment register. If it is equal to `3`, we came from userspace, if zero we came from kernel space:\n\n```\ntestl $3,CS(%rsp)\njnz userspace\n...\n...\n...\n// we are from the kernel space\n```\n\nBut unfortunately this method does not give a 100% guarantee. As described in the kernel documentation:\n\n> if we are in an NMI/MCE/DEBUG/whatever super-atomic entry context,\n> which might have triggered right after a normal entry wrote CS to the\n> stack but before we executed SWAPGS, then the only safe way to check\n> for GS is the slower method: the RDMSR.\n\nIn other words, an `NMI`, for example, could happen inside the critical section around a [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html) instruction. In this case we should check the value of the `MSR_GS_BASE` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register) which stores a pointer to the start of the per-cpu area. 
So to check whether we came from userspace or not, we should check the value of the `MSR_GS_BASE` model specific register: if it is negative we came from kernel space, otherwise we came from userspace:\n\n```assembly\nmovl $MSR_GS_BASE,%ecx\nrdmsr\ntestl %edx,%edx\njs 1f\n```\n\nIn the first two lines of code we read the value of the `MSR_GS_BASE` model specific register into the `edx:eax` pair. We can't set a negative value to the `gs` from userspace. But on the other hand we know that the direct mapping of the physical memory starts from the `0xffff880000000000` virtual address. In this way, `MSR_GS_BASE` will contain an address from `0xffff880000000000` to `0xffffc7ffffffffff`. After the `rdmsr` instruction is executed, the smallest possible value in the `%edx` register will be `0xffff8800`, which is `-30720` when interpreted as a signed 4-byte integer. That's why the kernel space `gs`, which points to the start of the `per-cpu` area, will contain a negative value.\n\nAfter we push the fake error code on the stack, we should allocate space for general purpose registers with the:\n\n```assembly\nALLOC_PT_GPREGS_ON_STACK\n```\n\nmacro which is defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) header file. 
This macro just allocates 15*8 bytes of space on the stack to preserve general purpose registers:\n\n```assembly\n.macro ALLOC_PT_GPREGS_ON_STACK addskip=0\n    addq\t$-(15*8+\\addskip), %rsp\n.endm\n```\n\nSo the stack will look like this after execution of the `ALLOC_PT_GPREGS_ON_STACK`:\n\n```\n     +------------+\n+160 | %SS        |\n+152 | %RSP       |\n+144 | %RFLAGS    |\n+136 | %CS        |\n+128 | %RIP       |\n+120 | ERROR CODE |\n     |------------|\n+112 |            |\n+104 |            |\n +96 |            |\n +88 |            |\n +80 |            |\n +72 |            |\n +64 |            |\n +56 |            |\n +48 |            |\n +40 |            |\n +32 |            |\n +24 |            |\n +16 |            |\n  +8 |            |\n  +0 |            | <- %RSP\n     +------------+\n```\n\nAfter we allocated space for general purpose registers, we do some checks to understand whether an exception came from userspace or not and, if it did, we should move back to the interrupted process stack, or otherwise stay on the exception stack:\n\n```assembly\n.if \\paranoid\n    .if \\paranoid == 1\n\t    testb\t$3, CS(%rsp)\n\t    jnz\t1f\n\t.endif\n\tcall\tparanoid_entry\n.else\n\tcall\terror_entry\n.endif\n```\n\nLet's consider all of these three cases in turn.\n\nAn exception occurred in userspace\n--------------------------------------------------------------------------------\n\nFirst, let's consider a case when an exception has `paranoid=1` like our `debug` and `int3` exceptions. In this case we check the selector from the `CS` segment register and jump to the `1f` label if we came from userspace; otherwise `paranoid_entry` will be called.\n\nLet's consider the first case when we came from userspace to an exception handler. As described above we should jump to the `1` label. 
The `1` label starts from the call of the\n\n```assembly\ncall\terror_entry\n```\n\nroutine which saves all general purpose registers in the previously allocated area on the stack:\n\n```assembly\nSAVE_C_REGS 8\nSAVE_EXTRA_REGS 8\n```\n\nThese both macros are defined in the  [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) header file and just move values of general purpose registers to a certain place at the stack, for example:\n\n```assembly\n.macro SAVE_EXTRA_REGS offset=0\n\tmovq %r15, 0*8+\\offset(%rsp)\n\tmovq %r14, 1*8+\\offset(%rsp)\n\tmovq %r13, 2*8+\\offset(%rsp)\n\tmovq %r12, 3*8+\\offset(%rsp)\n\tmovq %rbp, 4*8+\\offset(%rsp)\n\tmovq %rbx, 5*8+\\offset(%rsp)\n.endm\n```\n\nAfter execution of `SAVE_C_REGS` and `SAVE_EXTRA_REGS` the stack will look:\n\n```\n     +------------+\n+160 | %SS        |\n+152 | %RSP       |\n+144 | %RFLAGS    |\n+136 | %CS        |\n+128 | %RIP       |\n+120 | ERROR CODE |\n     |------------|\n+112 | %RDI       |\n+104 | %RSI       |\n +96 | %RDX       |\n +88 | %RCX       |\n +80 | %RAX       |\n +72 | %R8        |\n +64 | %R9        |\n +56 | %R10       |\n +48 | %R11       |\n +40 | %RBX       |\n +32 | %RBP       |\n +24 | %R12       |\n +16 | %R13       |\n  +8 | %R14       |\n  +0 | %R15       | <- %RSP\n     +------------+\n```\n\nAfter the kernel saved general purpose registers at the stack, we should check that we came from userspace space again with:\n\n```assembly\ntestb\t$3, CS+8(%rsp)\njz\t.Lerror_kernelspace\n```\n\nbecause we may have potentially fault if as described in documentation truncated `%RIP` was reported. Anyway, in both cases the [SWAPGS](http://www.felixcloutier.com/x86/SWAPGS.html) instruction will be executed and values from `MSR_KERNEL_GS_BASE` and `MSR_GS_BASE` will be swapped. From this moment the `%gs` register will point to the base address of kernel structures. 
So, the `SWAPGS` instruction is called and it was the main point of the `error_entry` routine.\n\nNow we can go back to the `idtentry` macro. We may see the following assembler code after the call of `error_entry`:\n\n```assembly\nmovq\t%rsp, %rdi\ncall\tsync_regs\n```\n\nHere we put the stack pointer into the `%rdi` register which will be the first argument (according to [x86_64 ABI](https://www.uclibc.org/docs/psABI-x86_64.pdf)) of the `sync_regs` function and call this function which is defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) source code file:\n\n```C\nasmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)\n{\n\tstruct pt_regs *regs = task_pt_regs(current);\n\t*regs = *eregs;\n\treturn regs;\n}\n```\n\nThis function takes the result of the `task_pt_regs` macro which is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) header file, copies the registers saved on the exception stack (`*eregs`) to that location and returns it. The `task_pt_regs` macro expands to the address of the `pt_regs` structure located just below `thread.sp0`, which represents a pointer to the normal kernel stack:\n\n```C\n#define task_pt_regs(tsk)       ((struct pt_regs *)(tsk)->thread.sp0 - 1)\n```\n\nAs we came from userspace, this means that the exception handler will run in real process context. After we got the stack pointer from the `sync_regs` we switch stack:\n\n```assembly\nmovq\t%rax, %rsp\n```\n\nThe last two steps before an exception handler will call the secondary handler are:\n\n1. Passing a pointer to the `pt_regs` structure which contains preserved general purpose registers to the `%rdi` register:\n\n```assembly\nmovq\t%rsp, %rdi\n```\n\nas it will be passed as the first parameter of the secondary exception handler.\n\n2. 
Pass the error code to the `%rsi` register as it will be the second argument of an exception handler and set it to `-1` on the stack for the same purpose as we did before - to prevent restart of a system call:\n\n```\n.if \\has_error_code\n\tmovq\tORIG_RAX(%rsp), %rsi\n\tmovq\t$-1, ORIG_RAX(%rsp)\n.else\n\txorl\t%esi, %esi\n.endif\n```\n\nAdditionally you may see that we zeroed the `%esi` register above in the case when an exception does not provide an error code.\n\nIn the end we just call the secondary exception handler:\n\n```assembly\ncall\t\\do_sym\n```\n\nwhich:\n\n```C\ndotraplinkage void do_debug(struct pt_regs *regs, long error_code);\n```\n\nwill be for the `debug` exception and:\n\n```C\ndotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code);\n```\n\nwill be for the `int 3` exception. In this part we will not see implementations of secondary handlers, because they are very specific, but we will see some of them in one of the next parts.\n\nWe just considered the first case when an exception occurred in userspace. Let's consider the last two.\n\nAn exception with paranoid > 0 occurred in kernelspace\n--------------------------------------------------------------------------------\n\nIn this case an exception occurred in kernelspace and the `idtentry` macro is defined with `paranoid=1` for this exception. This value of `paranoid` means that we should use the slower way that we saw in the beginning of this part to check whether we really came from kernelspace or not. The `paranoid_entry` routine allows us to know this:\n\n```assembly\nENTRY(paranoid_entry)\n\tcld\n\tSAVE_C_REGS 8\n\tSAVE_EXTRA_REGS 8\n\tmovl\t$1, %ebx\n\tmovl\t$MSR_GS_BASE, %ecx\n\trdmsr\n\ttestl\t%edx, %edx\n\tjs\t1f\n\tSWAPGS\n\txorl\t%ebx, %ebx\n1:\tret\nEND(paranoid_entry)\n```\n\nAs you may see, this function does the same thing that we covered before. We use the second (slow) method to get information about the previous state of an interrupted task. 
As we checked this and executed `SWAPGS` in the case when we came from userspace, we should do the same that we did before: We need to put a pointer to a structure which holds general purpose registers into the `%rdi` (which will be the first parameter of a secondary handler) and put the error code, if an exception provides it, into the `%rsi` (which will be the second parameter of a secondary handler):\n\n```assembly\nmovq\t%rsp, %rdi\n\n.if \\has_error_code\n\tmovq\tORIG_RAX(%rsp), %rsi\n\tmovq\t$-1, ORIG_RAX(%rsp)\n.else\n\txorl\t%esi, %esi\n.endif\n```\n\nThe last step before a secondary handler of an exception will be called is cleanup of the new `IST` stack frame:\n\n```assembly\n.if \\shift_ist != -1\n\tsubq\t$EXCEPTION_STKSZ, CPU_TSS_IST(\\shift_ist)\n.endif\n```\n\nYou may remember that we passed the `shift_ist` as an argument of the `idtentry` macro. Here we check its value and if it's not equal to `-1`, we get a pointer to a stack from the `Interrupt Stack Table` by the `shift_ist` index and set it up.\n\nIn the end of this second way we just call the secondary exception handler as we did before:\n\n```assembly\ncall\t\\do_sym\n```\n\nThe last method is similar to both previous ones, but an exception occurred with `paranoid=0` and we may use the fast method of determining where we came from.\n\nExit from an exception handler\n--------------------------------------------------------------------------------\n\nAfter the secondary handler finishes its work, we will return to the `idtentry` macro and the next step will be a jump to the `error_exit`:\n\n```assembly\njmp\terror_exit\n```\n\nroutine. The `error_exit` function is defined in the same [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly source code file and the main goal of this function is to know where we came from (from userspace or kernelspace) and execute `SWAPGS` depending on this. 
Restore registers to previous state and execute `iret` instruction to transfer control to an interrupted task.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the third part about interrupts and interrupt handling in the Linux kernel. We saw the initialization of the [Interrupt descriptor table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table) in the previous part with the `#DB` and `#BP` gates and started to dive into preparation before control will be transferred to an exception handler and implementation of some interrupt handlers in this part. In the next part we will continue to dive into this theme and will go next by the `setup_arch` function and will try to understand interrupts handling related stuff.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Debug registers](http://en.wikipedia.org/wiki/X86_debug_register)\n* [Intel 80386](http://en.wikipedia.org/wiki/Intel_80386)\n* [INT 3](http://en.wikipedia.org/wiki/INT_%28x86_instruction%29#INT_3)\n* [gcc](http://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n* [TSS](http://en.wikipedia.org/wiki/Task_state_segment)\n* [GNU assembly .error directive](https://sourceware.org/binutils/docs/as/Error.html#Error)\n* [dwarf2](http://en.wikipedia.org/wiki/DWARF)\n* [CFI directives](https://sourceware.org/binutils/docs/as/CFI-directives.html)\n* [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [system call](http://en.wikipedia.org/wiki/System_call)\n* [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html)\n* [SIGTRAP](https://en.wikipedia.org/wiki/Unix_signal#SIGTRAP)\n* [Per-CPU variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [kgdb](https://en.wikipedia.org/wiki/KGDB)\n* [ACPI](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-4.md",
    "content": "Interrupts and Interrupt Handling. Part 4.\n================================================================================\n\nInitialization of non-early interrupt gates\n--------------------------------------------------------------------------------\n\nThis is fourth part about an interrupts and exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3) we saw first early `#DB` and `#BP` exceptions handlers from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). We stopped on the right after the `early_trap_init` function that called in the `setup_arch` function which defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/setup.c). In this part we will continue to dive into an interrupts and exceptions handling in the Linux kernel for `x86_64` and continue to do it from the place where we left off in the last part. First thing which is related to the interrupts and exceptions handling is the setup of the `#PF` or [page fault](https://en.wikipedia.org/wiki/Page_fault) handler with the `early_trap_pf_init` function. Let's start from it.\n\nEarly page fault handler\n--------------------------------------------------------------------------------\n\nThe `early_trap_pf_init` function defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). It uses `set_intr_gate` macro that fills [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table) with the given entry:\n\n```C\nvoid __init early_trap_pf_init(void)\n{\n#ifdef CONFIG_X86_64\n         set_intr_gate(X86_TRAP_PF, page_fault);\n#endif\n}\n```\n\nThis macro defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/desc.h). 
We already saw macros like this in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3) - `set_system_intr_gate` and `set_intr_gate_ist`. This macro checks that the given vector number is not greater than `255` (maximum vector number) and calls the `_set_gate` function as `set_system_intr_gate` and `set_intr_gate_ist` did:\n\n```C\n#define set_intr_gate(n, addr)                                  \\\ndo {                                                            \\\n        BUG_ON((unsigned)n > 0xFF);                             \\\n        _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0,        \\\n                  __KERNEL_CS);                                 \\\n        _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\\\n                        0, 0, __KERNEL_CS);                     \\\n} while (0)\n```\n\nThe `set_intr_gate` macro takes two parameters:\n\n* vector number of an interrupt;\n* address of an interrupt handler;\n\nIn our case they are:\n\n* `X86_TRAP_PF` - `14`;\n* `page_fault` - the interrupt handler entry point.\n\nThe `X86_TRAP_PF` is an element of the enum which is defined in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h):\n\n```C\nenum {\n\t...\n\t...\n\t...\n\t...\n\tX86_TRAP_PF,            /* 14, Page Fault */\n\t...\n\t...\n\t...\n}\n```\n\nWhen the `early_trap_pf_init` is called, the `set_intr_gate` will be expanded to the call of the `_set_gate` which will fill the `IDT` with the handler for the page fault. Now let's look at the implementation of the `page_fault` handler. The `page_fault` handler is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) assembly source code file like all exception handlers. 
Let's look on it:\n\n```assembly\ntrace_idtentry page_fault do_page_fault has_error_code=1\n```\n\nWe saw in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3) how `#DB` and `#BP` handlers defined. They were defined with the `idtentry` macro, but here we can see `trace_idtentry`. This macro defined in the same source code file and depends on the `CONFIG_TRACING` kernel configuration option:\n\n```assembly\n#ifdef CONFIG_TRACING\n.macro trace_idtentry sym do_sym has_error_code:req\nidtentry trace(\\sym) trace(\\do_sym) has_error_code=\\has_error_code\nidtentry \\sym \\do_sym has_error_code=\\has_error_code\n.endm\n#else\n.macro trace_idtentry sym do_sym has_error_code:req\nidtentry \\sym \\do_sym has_error_code=\\has_error_code\n.endm\n#endif\n```\n\nWe will not dive into exceptions [Tracing](https://en.wikipedia.org/wiki/Tracing_%28software%29) now. If `CONFIG_TRACING` is not set, we can see that `trace_idtentry` macro just expands to the normal `idtentry`. We already saw implementation of the `idtentry` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3), so let's start from the `page_fault` exception handler.\n\nAs we can see in the `idtentry` definition, the handler of the `page_fault` is `do_page_fault` function which defined in the [arch/x86/mm/fault.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/fault.c) and as all exceptions handlers it takes two arguments:\n\n* `regs` - `pt_regs` structure that holds state of an interrupted process;\n* `error_code` - error code of the page fault exception.\n\nLet's look inside this function. 
First of all we read content of the [cr2](https://en.wikipedia.org/wiki/Control_register) control register:\n\n```C\ndotraplinkage void notrace\ndo_page_fault(struct pt_regs *regs, unsigned long error_code)\n{\n\tunsigned long address = read_cr2();\n\t...\n\t...\n\t...\n}\n```\n\nThis register contains a linear address which caused `page fault`. In the next step we make a call of the `exception_enter` function from the [include/linux/context_tracking.h](https://github.com/torvalds/linux/blob/master/include/linux/context_tracking.h). The `exception_enter` and `exception_exit` are functions from context tracking subsystem in the Linux kernel used by the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) to remove its dependency on the timer tick while a processor runs in userspace. Almost in every exception handler we will see similar code:\n\n```C\nenum ctx_state prev_state;\nprev_state = exception_enter();\n...\n... // exception handler here\n...\nexception_exit(prev_state);\n```\n\nThe `exception_enter` function checks that `context tracking` is enabled with the `context_tracking_is_enabled` and if it is in enabled state, we get previous context with the `this_cpu_read` (more about `this_cpu_*` operations you can read in the [Documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt)). After this it calls `context_tracking_user_exit` function which informs the context tracking that the processor is exiting userspace mode and entering the kernel:\n\n```C\nstatic inline enum ctx_state exception_enter(void)\n{\n        enum ctx_state prev_ctx;\n\n        if (!context_tracking_is_enabled())\n                return 0;\n\n        prev_ctx = this_cpu_read(context_tracking.state);\n        context_tracking_user_exit();\n\n        return prev_ctx;\n}\n```\n\nThe state can be one of the:\n\n```C\nenum ctx_state {\n    IN_KERNEL = 0,\n\tIN_USER,\n} state;\n```\n\nAnd in the end we return previous context. 
Between the `exception_enter` and `exception_exit` we call the actual page fault handler:\n\n```C\n__do_page_fault(regs, error_code, address);\n```\n\nThe `__do_page_fault` is defined in the same source code file as `do_page_fault` - [arch/x86/mm/fault.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/fault.c). In the beginning of the `__do_page_fault` we check the state of the [kmemcheck](https://www.kernel.org/doc/Documentation/kmemcheck.txt) checker. The `kmemcheck` detects and warns about some uses of uninitialized memory. We need to check it because a page fault can be caused by kmemcheck:\n\n```C\nif (kmemcheck_active(regs))\n\t\tkmemcheck_hide(regs);\n\tprefetchw(&mm->mmap_sem);\n```\n\nAfter this we can see the call of the `prefetchw` which executes the instruction with the same [name](http://www.felixcloutier.com/x86/PREFETCHW.html), available on processors with the [X86_FEATURE_3DNOW](https://en.wikipedia.org/?title=3DNow!) feature, to fetch a [cache line](https://en.wikipedia.org/wiki/CPU_cache) for exclusive access. The main purpose of prefetching is to hide the latency of a memory access. In the next step we check whether the page fault occurred in the kernel space with the following condition:\n\n```C\nif (unlikely(fault_in_kernel_space(address))) {\n...\n...\n...\n}\n```\n\nwhere `fault_in_kernel_space` is:\n\n```C\nstatic int fault_in_kernel_space(unsigned long address)\n{\n        return address >= TASK_SIZE_MAX;\n}\n```\n\nThe `TASK_SIZE_MAX` macro expands to the:\n\n```C\n#define TASK_SIZE_MAX   ((1UL << 47) - PAGE_SIZE)\n```\n\nor `0x00007ffffffff000`. Pay attention to the `unlikely` macro. There are two macros in the Linux kernel:\n\n```C\n#define likely(x)      __builtin_expect(!!(x), 1)\n#define unlikely(x)    __builtin_expect(!!(x), 0)\n```\n\nYou can [often](http://lxr.free-electrons.com/ident?i=unlikely) find these macros in the code of the Linux kernel. The main purpose of these macros is optimization. 
Sometimes we need to check a condition in the code and we know that it will rarely be `true` or `false`. With these macros we can tell the compiler about this. For example:\n\n```C\nstatic int proc_root_readdir(struct file *file, struct dir_context *ctx)\n{\n        if (ctx->pos < FIRST_PROCESS_ENTRY) {\n                int error = proc_readdir(file, ctx);\n                if (unlikely(error <= 0))\n                        return error;\n...\n...\n...\n}\n```\n\nHere we can see the `proc_root_readdir` function which will be called when the Linux [VFS](https://en.wikipedia.org/wiki/Virtual_file_system) needs to read the `root` directory contents. If a condition is marked with `unlikely`, the compiler can put the `false` code right after the branch. Now let's get back to our address check. The comparison between the given address and `0x00007ffffffff000` tells us whether the page fault occurred in kernel mode or in user mode. After this the `__do_page_fault` routine will try to understand the problem that provoked the page fault exception and then will pass the address to the appropriate routine. It can be a `kmemcheck` fault, a spurious fault, a [kprobes](https://www.kernel.org/doc/Documentation/kprobes.txt) fault, etc. We will not dive into implementation details of the page fault exception handler in this part, because we need to know many different concepts which are provided by the Linux kernel, but we will see it in the chapter about the [memory management](https://0xax.gitbook.io/linux-insides/summary/mm) in the Linux kernel.\n\nBack to start_kernel\n--------------------------------------------------------------------------------\n\nThere are many different function calls after the `early_trap_pf_init` in the `setup_arch` function from different kernel subsystems, but none of them is related to interrupt and exception handling. 
So, we have to go back where we came from - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c#L492). The first things after the `setup_arch` is the `trap_init` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). This function makes initialization of the remaining exceptions handlers (remember that we already setup 3 handlers for the `#DB` - debug exception, `#BP` - breakpoint exception and `#PF` - page fault exception). The `trap_init` function starts from the check of the [Extended Industry Standard Architecture](https://en.wikipedia.org/wiki/Extended_Industry_Standard_Architecture):\n\n```C\n#ifdef CONFIG_EISA\n        void __iomem *p = early_ioremap(0x0FFFD9, 4);\n\n        if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))\n                EISA_bus = 1;\n        early_iounmap(p, 4);\n#endif\n```\n\nNote that it depends on the `CONFIG_EISA` kernel configuration parameter which represents `EISA` support. Here we use `early_ioremap` function to map `I/O` memory on the page tables. We use `readl` function to read first `4` bytes from the mapped region and if they are equal to `EISA` string we set `EISA_bus` to one. In the end we just unmap previously mapped region. More about `early_ioremap` you can read in the part which describes [Fix-Mapped Addresses and ioremap](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2).\n\nAfter this we start to fill the `Interrupt Descriptor Table` with the different interrupt gates. First of all we set `#DE` or `Divide Error` and `#NMI` or `Non-maskable Interrupt`:\n\n```C\nset_intr_gate(X86_TRAP_DE, divide_error);\nset_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);\n```\n\nWe use `set_intr_gate` macro to set the interrupt gate for the `#DE` exception and `set_intr_gate_ist` for the `#NMI`. 
You can remember that we already used these macros when we have set the interrupts gates for the page fault handler, debug handler and etc, you can find explanation of it in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). After this we setup exception gates for the following exceptions:\n\n```C\nset_system_intr_gate(X86_TRAP_OF, &overflow);\nset_intr_gate(X86_TRAP_BR, bounds);\nset_intr_gate(X86_TRAP_UD, invalid_op);\nset_intr_gate(X86_TRAP_NM, device_not_available);\n```\n\nHere we can see:\n\n* `#OF` or `Overflow` exception. This exception indicates that an overflow trap occurred when an special [INTO](http://x86.renejeschke.de/html/file_module_x86_id_142.html) instruction was executed;\n* `#BR` or `BOUND Range exceeded` exception. This exception indicates that a `BOUND-range-exceed` fault occurred when a [BOUND](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm) instruction was executed;\n* `#UD` or `Invalid Opcode` exception. Occurs when a processor attempted to execute invalid or reserved [opcode](https://en.wikipedia.org/?title=Opcode), processor attempted to execute instruction with invalid operand(s) and etc;\n* `#NM` or `Device Not Available` exception. Occurs when the processor tries to execute `x87 FPU` floating point instruction while `EM` flag in the [control register](https://en.wikipedia.org/wiki/Control_register#CR0) `cr0` was set.\n\nIn the next step we set the interrupt gate for the `#DF` or `Double fault` exception:\n\n```C\nset_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);\n```\n\nThis exception occurs when processor detected a second exception while calling an exception handler for a prior exception. In usual way when the processor detects another exception while trying to call an exception handler, the two exceptions can be handled serially. 
If the processor cannot handle them serially, it signals the double-fault or `#DF` exception.\n\nThe following set of the interrupt gates is:\n\n```C\nset_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);\nset_intr_gate(X86_TRAP_TS, &invalid_TSS);\nset_intr_gate(X86_TRAP_NP, &segment_not_present);\nset_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);\nset_intr_gate(X86_TRAP_GP, &general_protection);\nset_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);\nset_intr_gate(X86_TRAP_MF, &coprocessor_error);\nset_intr_gate(X86_TRAP_AC, &alignment_check);\n```\n\nHere we can see setup for the following exception handlers:\n\n* `#CSO` or `Coprocessor Segment Overrun` - this exception indicates that math [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) of an old processor detected a page or segment violation. Modern processors do not generate this exception\n* `#TS` or `Invalid TSS` exception - indicates that there was an error related to the [Task State Segment](https://en.wikipedia.org/wiki/Task_state_segment).\n* `#NP` or `Segment Not Present` exception indicates that the `present flag` of a segment or gate descriptor is clear during attempt to load one of `cs`, `ds`, `es`, `fs`, or `gs` register.\n* `#SS` or `Stack Fault` exception indicates one of the stack related conditions was detected, for example a not-present stack segment is detected when attempting to load the `ss` register.\n* `#GP` or `General Protection` exception indicates that the processor detected one of a class of protection violations called general-protection violations. There are many different conditions that can cause general-protection exception. 
For example loading the `ss`, `ds`, `es`, `fs`, or `gs` register with a segment selector for a system segment, writing to a code segment or a read-only data segment, referencing an entry in the `Interrupt Descriptor Table` (following an interrupt or exception) that is not an interrupt, trap, or task gate and many many more.\n* `Spurious Interrupt` - a hardware interrupt that is unwanted.\n* `#MF` or `x87 FPU Floating-Point Error` exception caused when the [x87 FPU](https://en.wikipedia.org/wiki/X86_instruction_listings#x87_floating-point_instructions) has detected a floating point error.\n* `#AC` or `Alignment Check` exception Indicates that the processor detected an unaligned memory operand when alignment checking was enabled.\n\nAfter that we setup this exception gates, we can see setup of the `Machine-Check` exception:\n\n```C\n#ifdef CONFIG_X86_MCE\n\tset_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);\n#endif\n```\n\nNote that it depends on the `CONFIG_X86_MCE` kernel configuration option and indicates that the processor detected an internal [machine error](https://en.wikipedia.org/wiki/Machine-check_exception) or a bus error, or that an external agent detected a bus error. The next exception gate is for the [SIMD](https://en.wikipedia.org/?title=SIMD) Floating-Point exception:\n\n```C\nset_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);\n```\n\nwhich indicates the processor has detected an `SSE` or `SSE2` or `SSE3` SIMD floating-point exception. 
There are six classes of numeric exception conditions that can occur while executing an SIMD floating-point instruction:\n\n* Invalid operation\n* Divide-by-zero\n* Denormal operand\n* Numeric overflow\n* Numeric underflow\n* Inexact result (Precision)\n\nIn the next step we fill the `used_vectors` array which defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/desc.h) header file and represents `bitmap`:\n\n```C\nDECLARE_BITMAP(used_vectors, NR_VECTORS);\n```\n\nof the first `32` interrupts (more about bitmaps in the Linux kernel you can read in the part which describes [cpumasks and bitmaps](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2))\n\n```C\nfor (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)\n\tset_bit(i, used_vectors)\n```\n\nwhere `FIRST_EXTERNAL_VECTOR` is:\n\n```C\n#define FIRST_EXTERNAL_VECTOR           0x20\n```\n\nAfter this we setup the interrupt gate for the `ia32_syscall` and add `0x80` to the `used_vectors` bitmap:\n\n```C\n#ifdef CONFIG_IA32_EMULATION\n        set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);\n        set_bit(IA32_SYSCALL_VECTOR, used_vectors);\n#endif\n```\n\nThere is `CONFIG_IA32_EMULATION` kernel configuration option on `x86_64` Linux kernels. This option provides ability to execute 32-bit processes in compatibility-mode. In the next parts we will see how it works, in the meantime we need only to know that there is yet another interrupt gate in the `IDT` with the vector number `0x80`. In the next step we maps `IDT` to the fixmap area:\n\n```C\n__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);\nidt_descr.address = fix_to_virt(FIX_RO_IDT);\n```\n\nand write its address to the `idt_descr.address` (more about fix-mapped addresses you can read in the second part of the [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2) chapter). 
After this we can see the call of the `cpu_init` function that defined in the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c). This function makes initialization of the all `per-cpu` state. In the beginning of the `cpu_init` we do the following things: First of all we wait while current cpu is initialized and than we call the `cr4_init_shadow` function which stores shadow copy of the `cr4` control register for the current cpu and load CPU microcode if need with the following function calls:\n\n```C\nwait_for_master_cpu(cpu);\ncr4_init_shadow();\nload_ucode_ap();\n```\n\nNext we get the `Task State Segment` for the current cpu and `orig_ist` structure which represents origin `Interrupt Stack Table` values with the:\n\n```C\nt = &per_cpu(cpu_tss, cpu);\noist = &per_cpu(orig_ist, cpu);\n```\n\nAs we got values of the `Task State Segment` and `Interrupt Stack Table` for the current processor, we clear following bits in the `cr4` control register:\n\n```C\ncr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);\n```\n\nwith this we disable `vm86` extension, virtual interrupts, timestamp ([RDTSC](https://en.wikipedia.org/wiki/Time_Stamp_Counter) can only be executed with the highest privilege) and debug extension. After this we reload the `Global Descriptor Table` and `Interrupt Descriptor table` with the:\n\n```C\n\tswitch_to_new_gdt(cpu);\n\tloadsegment(fs, 0);\n\tload_current_idt();\n```\n\nAfter this we setup array of the Thread-Local Storage Descriptors, configure [NX](https://en.wikipedia.org/wiki/NX_bit) and load CPU microcode. Now is time to setup and load `per-cpu` Task State Segments. 
We are going in a loop through the all exception stack which is `N_EXCEPTION_STACKS` or `4` and fill it with `Interrupt Stack Tables`:\n\n```C\n\tif (!oist->ist[0]) {\n\t\tchar *estacks = per_cpu(exception_stacks, cpu);\n\n\t\tfor (v = 0; v < N_EXCEPTION_STACKS; v++) {\n\t\t\testacks += exception_stack_sizes[v];\n\t\t\toist->ist[v] = t->x86_tss.ist[v] =\n\t\t\t\t\t(unsigned long)estacks;\n\t\t\tif (v == DEBUG_STACK-1)\n\t\t\t\tper_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;\n\t\t}\n\t}\n```\n\nAs we have filled `Task State Segments` with the `Interrupt Stack Tables` we can set `TSS` descriptor for the current processor and load it with the:\n\n```C\nset_tss_desc(cpu, t);\nload_TR_desc();\n```\n\nwhere `set_tss_desc` macro from the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) writes given  descriptor to the `Global Descriptor Table` of the given processor:\n\n```C\n#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)\nstatic inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)\n{\n        struct desc_struct *d = get_cpu_gdt_table(cpu);\n        tss_desc tss;\n        set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,\n                              IO_BITMAP_OFFSET + IO_BITMAP_BYTES +\n                              sizeof(unsigned long) - 1);\n        write_gdt_entry(d, entry, &tss, DESC_TSS);\n}\n```\n\nand `load_TR_desc` macro expands to the `ltr` or `Load Task Register` instruction:\n\n```C\n#define load_TR_desc()                          native_load_tr_desc()\nstatic inline void native_load_tr_desc(void)\n{\n        asm volatile(\"ltr %w0\"::\"q\" (GDT_ENTRY_TSS*8));\n}\n```\n\nIn the end of the `trap_init` function we can see the following code:\n\n```C\nset_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);\nset_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);\n...\n...\n...\n#ifdef CONFIG_X86_64\n        
memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);\n        set_nmi_gate(X86_TRAP_DB, &debug);\n        set_nmi_gate(X86_TRAP_BP, &int3);\n#endif\n```\n\nHere we copy `idt_table` to the `nmi_idt_table` and setup exception handlers for the `#DB` or `Debug exception` and `#BP` or `Breakpoint exception`. You can remember that we already set these interrupt gates in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3), so why do we need to setup it again? We setup it again because when we initialized it before in the `early_trap_init` function, the `Task State Segment` was not ready yet, but now it is ready after the call of the `cpu_init` function.\n\nThat's all. Soon we will consider all handlers of these interrupts/exceptions.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the fourth part about interrupts and interrupt handling in the Linux kernel. We saw the initialization of the [Task State Segment](https://en.wikipedia.org/wiki/Task_state_segment) in this part and initialization of the different interrupt handlers such as the `Divide Error` and `Page Fault` exceptions, etc. You can note that we saw just initialization stuff, and will dive into details about handlers for these exceptions. In the next part we will start to do it.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [page fault](https://en.wikipedia.org/wiki/Page_fault)\n* [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table)\n* [Tracing](https://en.wikipedia.org/wiki/Tracing_%28software%29)\n* [cr2](https://en.wikipedia.org/wiki/Control_register)\n* [RCU](https://en.wikipedia.org/wiki/Read-copy-update)\n* [this_cpu_* operations](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt)\n* [kmemcheck](https://www.kernel.org/doc/Documentation/kmemcheck.txt)\n* [prefetchw](http://www.felixcloutier.com/x86/PREFETCHW.html)\n* [3DNow](https://en.wikipedia.org/?title=3DNow!)\n* [CPU caches](https://en.wikipedia.org/wiki/CPU_cache)\n* [VFS](https://en.wikipedia.org/wiki/Virtual_file_system)\n* [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm)\n* [Fix-Mapped Addresses and ioremap](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2)\n* [Extended Industry Standard Architecture](https://en.wikipedia.org/wiki/Extended_Industry_Standard_Architecture)\n* [INT instruction](https://en.wikipedia.org/wiki/INT_%28x86_instruction%29)\n* [INTO](http://x86.renejeschke.de/html/file_module_x86_id_142.html)\n* [BOUND](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm)\n* [opcode](https://en.wikipedia.org/?title=Opcode)\n* [control register](https://en.wikipedia.org/wiki/Control_register#CR0)\n* [x87 FPU](https://en.wikipedia.org/wiki/X86_instruction_listings#x87_floating-point_instructions)\n* [MCE exception](https://en.wikipedia.org/wiki/Machine-check_exception)\n* [SIMD](https://en.wikipedia.org/?title=SIMD)\n* [cpumasks and bitmaps](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [NX](https://en.wikipedia.org/wiki/NX_bit)\n* [Task State 
Segment](https://en.wikipedia.org/wiki/Task_state_segment)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-5.md",
    "content": "Interrupts and Interrupt Handling. Part 5.\n================================================================================\n\nImplementation of exception handlers\n--------------------------------------------------------------------------------\n\nThis is the fifth part about an interrupts and exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) we stopped on the setting of interrupt gates to the [Interrupt descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table). We did it in the `trap_init` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c) source code file. We saw only setting of these interrupt gates in the previous part and in the current part we will see implementation of the exception handlers for these gates. The preparation before an exception handler will be executed is in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and occurs in the [idtentry](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S#L820) macro that defines exceptions entry points:\n\n```assembly\nidtentry divide_error\t\t     do_divide_error\t\t    has_error_code=0\nidtentry overflow\t\t     do_overflow\t\t    has_error_code=0\nidtentry invalid_op\t\t     do_invalid_op\t\t    has_error_code=0\nidtentry bounds\t\t\t     do_bounds\t\t\t    has_error_code=0\nidtentry device_not_available\t     do_device_not_available\t    has_error_code=0\nidtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0\nidtentry invalid_TSS\t\t     do_invalid_TSS\t\t    has_error_code=1\nidtentry segment_not_present\t     do_segment_not_present\t    has_error_code=1\nidtentry spurious_interrupt_bug\t     do_spurious_interrupt_bug      
has_error_code=0\nidtentry coprocessor_error\t     do_coprocessor_error\t    has_error_code=0\nidtentry alignment_check\t     do_alignment_check\t\t    has_error_code=1\nidtentry simd_coprocessor_error\t     do_simd_coprocessor_error      has_error_code=0\n```\n\nThe `idtentry` macro does following preparation before an actual exception handler (`do_divide_error` for the `divide_error`, `do_overflow` for the `overflow`, etc.) will get control. In another words the `idtentry` macro allocates place for the registers ([pt_regs](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/ptrace.h#L43) structure) on the stack, pushes dummy error code for the stack consistency if an interrupt/exception has no error code, checks the segment selector in the `cs` segment register and switches depends on the previous state (userspace or kernelspace). After all of these preparations it makes a call to an actual interrupt/exception handler:\n\n```assembly\n.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1\nENTRY(\\sym)\n\t...\n\t...\n\t...\n\tcall\t\\do_sym\n\t...\n\t...\n\t...\nEND(\\sym)\n.endm\n```\n\nAfter an exception handler will finish its work, the `idtentry` macro restores stack and general purpose registers of an interrupted task and executes [iret](http://x86.renejeschke.de/html/file_module_x86_id_145.html) instruction:\n\n```assembly\nENTRY(paranoid_exit)\n\t...\n\t...\n\t...\n\tRESTORE_EXTRA_REGS\n\tRESTORE_C_REGS\n\tREMOVE_PT_GPREGS_FROM_STACK 8\n\tINTERRUPT_RETURN\nEND(paranoid_exit)\n```\n\nwhere `INTERRUPT_RETURN` is:\n\n```assembly\n#define INTERRUPT_RETURN\tjmp native_iret\n...\nENTRY(native_iret)\n.global native_irq_return_iret\nnative_irq_return_iret:\niretq\n```\n\nMore about the `idtentry` macro you can read in the third part of the 
[https://0xax.gitbooks.io/linux-insides/content/Interrupts/linux-interrupts-3.html](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3) chapter. Ok, now we saw the preparation before an exception handler will be executed and now time to look on the handlers. First of all let's look on the following handlers:\n\n* divide_error\n* overflow\n* invalid_op\n* coprocessor_segment_overrun\n* invalid_TSS\n* segment_not_present\n* stack_segment\n* alignment_check\n\nAll these handlers defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) source code file with the `DO_ERROR` macro:\n\n```C\nDO_ERROR(X86_TRAP_DE,     SIGFPE,  \"divide error\",                divide_error)\nDO_ERROR(X86_TRAP_OF,     SIGSEGV, \"overflow\",                    overflow)\nDO_ERROR(X86_TRAP_UD,     SIGILL,  \"invalid opcode\",              invalid_op)\nDO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  \"coprocessor segment overrun\", coprocessor_segment_overrun)\nDO_ERROR(X86_TRAP_TS,     SIGSEGV, \"invalid TSS\",                 invalid_TSS)\nDO_ERROR(X86_TRAP_NP,     SIGBUS,  \"segment not present\",         segment_not_present)\nDO_ERROR(X86_TRAP_SS,     SIGBUS,  \"stack segment\",               stack_segment)\nDO_ERROR(X86_TRAP_AC,     SIGBUS,  \"alignment check\",             alignment_check)\n```\n\nAs we can see the `DO_ERROR` macro takes 4 parameters:\n\n* Vector number of an interrupt;\n* Signal number which will be sent to the interrupted process;\n* String which describes an exception;\n* Exception handler entry point.\n\nThis macro defined in the same source code file and expands to the function with the `do_handler` name:\n\n```C\n#define DO_ERROR(trapnr, signr, str, name)                              \\\ndotraplinkage void do_##name(struct pt_regs *regs, long error_code)     \\\n{                                                                       \\\n        do_error_trap(regs, 
error_code, str, trapnr, signr);            \\\n}\n```\n\nNote the `##` tokens. This is a special feature - [GCC macro Concatenation](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html#Concatenation) which concatenates two given tokens. For example, the first `DO_ERROR` in our example will expand to:\n\n```C\ndotraplinkage void do_divide_error(struct pt_regs *regs, long error_code)\n{\n\t...\n}\n```\n\nWe can see that all functions which are generated by the `DO_ERROR` macro just make a call to the `do_error_trap` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). Let's look at the implementation of the `do_error_trap` function.\n\nTrap handlers\n--------------------------------------------------------------------------------\n\nThe `do_error_trap` function starts and ends with the two following functions:\n\n```C\nenum ctx_state prev_state = exception_enter();\n...\n...\n...\nexception_exit(prev_state);\n```\n\nfrom the [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h). Context tracking is the Linux kernel subsystem which provides kernel boundary probes to keep track of the transitions between level contexts with two basic initial contexts: `user` or `kernel`. The `exception_enter` function checks that context tracking is enabled. After this, if it is enabled, the `exception_enter` reads the previous context and compares it with the `CONTEXT_KERNEL`. 
If the previous context is `user`, we call the `context_tracking_exit` function from the [kernel/context_tracking.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/context_tracking.c) which informs the context tracking subsystem that a processor is exiting user mode and entering the kernel mode:\n\n```C\nif (!context_tracking_is_enabled())\n\treturn 0;\n\nprev_ctx = this_cpu_read(context_tracking.state);\nif (prev_ctx != CONTEXT_KERNEL)\n\tcontext_tracking_exit(prev_ctx);\n\nreturn prev_ctx;\n```\n\nIf the previous context is not `user`, we just return it. The `prev_ctx` has the `enum ctx_state` type which is defined in the [include/linux/context_tracking_state.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking_state.h) and looks as:\n\n```C\nenum ctx_state {\n\tCONTEXT_KERNEL = 0,\n\tCONTEXT_USER,\n\tCONTEXT_GUEST,\n} state;\n```\n\nThe second function is `exception_exit`, defined in the same [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h) file. It checks that context tracking is enabled and calls the `context_tracking_enter` function if the previous context was `user`:\n\n```C\nstatic inline void exception_exit(enum ctx_state prev_ctx)\n{\n\tif (context_tracking_is_enabled()) {\n\t\tif (prev_ctx != CONTEXT_KERNEL)\n\t\t\tcontext_tracking_enter(prev_ctx);\n\t}\n}\n```\n\nThe `context_tracking_enter` function informs the context tracking subsystem that a processor is going to enter the user mode from the kernel mode. 
We can see the following code between the `exception_enter` and `exception_exit`:\n\n```C\nif (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=\n\t\tNOTIFY_STOP) {\n\tconditional_sti(regs);\n\tdo_trap(trapnr, signr, str, regs, error_code,\n\t\tfill_trap_info(regs, signr, trapnr, &info));\n}\n```\n\nFirst of all it calls the `notify_die` function which defined in the [kernel/notifier.c](https://github.com/torvalds/linux/tree/master/kernel/notifier.c). To get notified for [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic), [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops), [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt) or other events the caller needs to insert itself in the `notify_die` chain and the `notify_die` function does it. The Linux kernel has special mechanism that allows kernel to ask when something happens and this mechanism called `notifiers` or `notifier chains`. This mechanism used for example for the `USB` hotplug events (look on the [drivers/usb/core/notify.c](https://github.com/torvalds/linux/tree/master/drivers/usb/core/notify.c)), for the memory [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) (look on the [include/linux/memory.h](https://github.com/torvalds/linux/tree/master/include/linux/memory.h), the `hotplug_memory_notifier` macro, etc...), system reboots, etc. A notifier chain is thus a simple, singly-linked list. When a Linux kernel subsystem wants to be notified of specific events, it fills out a special `notifier_block` structure and passes it to the `notifier_chain_register` function. An event can be sent with the call of the `notifier_call_chain` function. 
First of all the `notify_die` function fills `die_args` structure with the trap number, trap string, registers and other values:\n\n```C\nstruct die_args args = {\n       .regs   = regs,\n       .str    = str,\n       .err    = err,\n       .trapnr = trap,\n       .signr  = sig,\n}\n```\n\nand returns the result of the `atomic_notifier_call_chain` function with the `die_chain`:\n\n```C\nstatic ATOMIC_NOTIFIER_HEAD(die_chain);\nreturn atomic_notifier_call_chain(&die_chain, val, &args);\n```\n\nwhich just expands to the `atomic_notifier_head` structure that contains lock and `notifier_block`:\n\n```C\nstruct atomic_notifier_head {\n        spinlock_t lock;\n        struct notifier_block __rcu *head;\n};\n```\n\nThe `atomic_notifier_call_chain` function calls each function in a notifier chain in turn and returns the value of the last notifier function called. If the `notify_die` in the `do_error_trap` does not return `NOTIFY_STOP` we execute `conditional_sti` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) that checks the value of the [interrupt flag](https://en.wikipedia.org/wiki/Interrupt_flag) and enables interrupt depends on it:\n\n```C\nstatic inline void conditional_sti(struct pt_regs *regs)\n{\n        if (regs->flags & X86_EFLAGS_IF)\n                local_irq_enable();\n}\n```\n\nmore about `local_irq_enable` macro you can read in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter. The next and last call in the `do_error_trap` is the `do_trap` function. First of all the `do_trap` function defined the `tsk` variable which has `task_struct` type and represents the current interrupted process. 
After the definition of the `tsk`, we can see the call of the `do_trap_no_signal` function:\n\n```C\nstruct task_struct *tsk = current;\n\nif (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))\n\treturn;\n```\n\nThe `do_trap_no_signal` function makes two checks:\n\n* Did we come from the [Virtual 8086](https://en.wikipedia.org/wiki/Virtual_8086_mode) mode;\n* Did we come from the kernelspace.\n\n```C\nif (v8086_mode(regs)) {\n\t...\n}\n\nif (!user_mode(regs)) {\n\t...\n}\n\nreturn -1;\n```\n\nWe will not consider first case because the [long mode](https://en.wikipedia.org/wiki/Long_mode) does not support the [Virtual 8086](https://en.wikipedia.org/wiki/Virtual_8086_mode) mode. In the second case we invoke `fixup_exception` function which will try to recover a fault and `die` if we can't:\n\n```C\nif (!fixup_exception(regs)) {\n\ttsk->thread.error_code = error_code;\n\ttsk->thread.trap_nr = trapnr;\n\tdie(str, regs, error_code);\n}\n```\n\nThe `die` function defined in the [arch/x86/kernel/dumpstack.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/dumpstack.c) source code file, prints useful information about stack, registers, kernel modules and caused kernel [oops](https://en.wikipedia.org/wiki/Linux_kernel_oops). If we came from the userspace the `do_trap_no_signal` function will return `-1` and the execution of the `do_trap` function will continue. If we passed through the `do_trap_no_signal` function and did not exit from the `do_trap` after this, it means that previous context was - `user`.  Most exceptions caused by the processor are interpreted by Linux as error conditions, for example division by zero, invalid opcode, etc. When an exception occurs the Linux kernel sends a [signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process that caused the exception to notify it of an incorrect condition. 
So, in the `do_trap` function we need to send a signal with the given number (`SIGFPE` for the divide error, `SIGILL` for an illegal instruction, etc.). First of all we save error code and vector number in the currently interrupted process by filling `thread.error_code` and `thread.trap_nr`:\n\n```C\ntsk->thread.error_code = error_code;\ntsk->thread.trap_nr = trapnr;\n```\n\nAfter this we make a check do we need to print information about unhandled signals for the interrupted process. We check that `show_unhandled_signals` variable is set, that `unhandled_signal` function from the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) will return unhandled signal(s) and [printk](https://en.wikipedia.org/wiki/Printk) rate limit:\n\n```C\n#ifdef CONFIG_X86_64\n\tif (show_unhandled_signals && unhandled_signal(tsk, signr) &&\n\t    printk_ratelimit()) {\n\t\tpr_info(\"%s[%d] trap %s ip:%lx sp:%lx error:%lx\",\n\t\t\ttsk->comm, tsk->pid, str,\n\t\t\tregs->ip, regs->sp, error_code);\n\t\tprint_vma_addr(\" in \", regs->ip);\n\t\tpr_cont(\"\\n\");\n\t}\n#endif\n```\n\nAnd send a given signal to interrupted process:\n\n```C\nforce_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);\n```\n\nThis is the end of the `do_trap`. We just saw generic implementation for eight different exceptions which are defined with the `DO_ERROR` macro. Now let's look at other exception handlers.\n\nDouble fault\n--------------------------------------------------------------------------------\n\nThe next exception is `#DF` or `Double fault`. This exception occurs when the processor detected a second exception while calling an exception handler for a prior exception. 
We set the trap gate for this exception in the previous part:\n\n```C\nset_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);\n```\n\nNote that this exception runs on the `DOUBLEFAULT_STACK` [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) which has index - `1`:\n\n```C\n#define DOUBLEFAULT_STACK 1\n```\n\nThe `double_fault` is handler for this exception and defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). The `double_fault` handler starts from the definition of two variables: string that describes exception and interrupted process, as other exception handlers:\n\n```C\nstatic const char str[] = \"double fault\";\nstruct task_struct *tsk = current;\n```\n\nThe handler of the double fault exception is split into two parts. The first part is the check which checks that a fault is a `non-IST` fault on the `espfix64` stack. Actually the `iret` instruction restores only the bottom `16` bits when returning to a `16` bit segment. The `espfix` feature solves this problem. So if it is a `non-IST` fault on the `espfix64` stack, we modify the stack to make it look like `General Protection Fault`:\n\n```C\nstruct pt_regs *normal_regs = task_pt_regs(current);\n\nmemmove(&normal_regs->ip, (void *)regs->sp, 5*8);\nnormal_regs->orig_ax = 0;\nregs->ip = (unsigned long)general_protection;\nregs->sp = (unsigned long)&normal_regs->orig_ax;\nreturn;\n```\n\nIn the second case we do almost the same that we did in the previous exception handlers. 
The first is the call of the `ist_enter` function that discards previous context, `user` in our case:\n\n```C\nist_enter(regs);\n```\n\nAnd after this we fill the interrupted process with the vector number of the `Double fault` exception and error code as we did it in the previous handlers:\n\n```C\ntsk->thread.error_code = error_code;\ntsk->thread.trap_nr = X86_TRAP_DF;\n```\n\nNext we print useful information about the double fault ([PID](https://en.wikipedia.org/wiki/Process_identifier) number, registers content):\n\n```C\n#ifdef CONFIG_DOUBLEFAULT\n\tdf_debug(regs, error_code);\n#endif\n```\n\nAnd die:\n\n```\n\tfor (;;)\n\t\tdie(str, regs, error_code);\n```\n\nThat's all.\n\nDevice not available exception handler\n--------------------------------------------------------------------------------\n\nThe next exception is the `#NM` or `Device not available`. The `Device not available` exception can occur depending on these things:\n\n* The processor executed an [x87 FPU](https://en.wikipedia.org/wiki/X87) floating-point instruction while the EM flag in [control register](https://en.wikipedia.org/wiki/Control_register) `cr0` was set;\n* The processor executed a `wait` or `fwait` instruction while the `MP` and `TS` flags of register `cr0` were set;\n* The processor executed an [x87 FPU](https://en.wikipedia.org/wiki/X87), [MMX](https://en.wikipedia.org/wiki/MMX_%28instruction_set%29) or [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) instruction while the `TS` flag in control register `cr0` was set and the `EM` flag is clear.\n\nThe handler of the `Device not available` exception is the `do_device_not_available` function and it defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c) source code file too. 
It starts and ends from the getting of the previous context, as other traps which we saw in the beginning of this part:\n\n```C\nenum ctx_state prev_state;\nprev_state = exception_enter();\n...\n...\n...\nexception_exit(prev_state);\n```\n\nIn the next step we check that `FPU` is not eager:\n\n```C\nBUG_ON(use_eager_fpu());\n```\n\nWhen we switch into a task or interrupt we may avoid loading the `FPU` state. If a task will use it, we catch `Device not Available exception` exception. If we loading the `FPU` state during task switching, the `FPU` is eager. In the next step we check `cr0` control register on the `EM` flag which can show us is `x87` floating point unit present (flag clear) or not (flag set):\n\n```C\n#ifdef CONFIG_MATH_EMULATION\n\tif (read_cr0() & X86_CR0_EM) {\n\t\tstruct math_emu_info info = { };\n\n\t\tconditional_sti(regs);\n\n\t\tinfo.regs = regs;\n\t\tmath_emulate(&info);\n\t\texception_exit(prev_state);\n\t\treturn;\n\t}\n#endif\n```\n\nIf the `x87` floating point unit not presented, we enable interrupts with the `conditional_sti`, fill the `math_emu_info` (defined in the [arch/x86/include/asm/math_emu.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/math_emu.h)) structure with the registers of an interrupt task and call `math_emulate` function from the [arch/x86/math-emu/fpu_entry.c](https://github.com/torvalds/linux/tree/master/arch/x86/math-emu/fpu_entry.c). As you can understand from function's name, it emulates `X87 FPU` unit (more about the `x87` we will know in the special chapter). In other way, if `X86_CR0_EM` flag is clear which means that `x87 FPU` unit is presented, we call the `fpu__restore` function from the [arch/x86/kernel/fpu/core.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/fpu/core.c) which copies the `FPU` registers from the `fpustate` to the live hardware registers. 
After this `FPU` instructions can be used:\n\n```C\nfpu__restore(&current->thread.fpu);\n```\n\nGeneral protection fault exception handler\n--------------------------------------------------------------------------------\n\nThe next exception is the `#GP` or `General protection fault`. This exception occurs when the processor detected one of a class of protection violations called `general-protection violations`. It can be:\n\n* Exceeding the segment limit when accessing the `cs`, `ds`, `es`, `fs` or `gs` segments;\n* Loading the `ss`, `ds`, `es`, `fs` or `gs` register with a segment selector for a system segment.;\n* Violating any of the privilege rules;\n* and other...\n\nThe exception handler for this exception is the `do_general_protection` from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). The `do_general_protection` function starts and ends as other exception handlers from the getting of the previous context:\n\n```C\nprev_state = exception_enter();\n...\nexception_exit(prev_state);\n```\n\nAfter this we enable interrupts if they were disabled and check that we came from the [Virtual 8086](https://en.wikipedia.org/wiki/Virtual_8086_mode) mode:\n\n```C\nconditional_sti(regs);\n\nif (v8086_mode(regs)) {\n\tlocal_irq_enable();\n\thandle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);\n\tgoto exit;\n}\n```\n\nAs long mode does not support this mode, we will not consider exception handling for this case. In the next step check that previous mode was kernel mode and try to fix the trap. 
If we can't fix the current general protection fault exception we fill the interrupted process with the vector number and error code of the exception and add it to the `notify_die` chain:\n\n```C\nif (!user_mode(regs)) {\n\tif (fixup_exception(regs))\n\t\tgoto exit;\n\n\ttsk->thread.error_code = error_code;\n\ttsk->thread.trap_nr = X86_TRAP_GP;\n\tif (notify_die(DIE_GPF, \"general protection fault\", regs, error_code,\n\t\t       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)\n\t\tdie(\"general protection fault\", regs, error_code);\n\tgoto exit;\n}\n```\n\nIf we can fix exception we go to the `exit` label which exits from exception state:\n\n```C\nexit:\n\texception_exit(prev_state);\n```\n\nIf we came from user mode we send `SIGSEGV` signal to the interrupted process from user mode as we did it in the `do_trap` function:\n\n```C\nif (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&\n\t\tprintk_ratelimit()) {\n\tpr_info(\"%s[%d] general protection ip:%lx sp:%lx error:%lx\",\n\t\ttsk->comm, task_pid_nr(tsk),\n\t\tregs->ip, regs->sp, error_code);\n\tprint_vma_addr(\" in \", regs->ip);\n\tpr_cont(\"\\n\");\n}\n\nforce_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the fifth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw implementation of some interrupt handlers in this part. 
In the next part we will continue to dive into interrupt and exception handlers and will see handler for the [Non-Maskable Interrupts](https://en.wikipedia.org/wiki/Non-maskable_interrupt), handling of the math [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) and [SIMD](https://en.wikipedia.org/wiki/SIMD) coprocessor exceptions and many many more.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Interrupt descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table)\n* [iret instruction](http://x86.renejeschke.de/html/file_module_x86_id_145.html)\n* [GCC macro Concatenation](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html#Concatenation)\n* [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic)\n* [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops)\n* [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt)\n* [hotplug](https://en.wikipedia.org/wiki/Hot_swapping)\n* [interrupt flag](https://en.wikipedia.org/wiki/Interrupt_flag)\n* [long mode](https://en.wikipedia.org/wiki/Long_mode)\n* [signal](https://en.wikipedia.org/wiki/Unix_signal)\n* [printk](https://en.wikipedia.org/wiki/Printk)\n* [coprocessor](https://en.wikipedia.org/wiki/Coprocessor)\n* [SIMD](https://en.wikipedia.org/wiki/SIMD)\n* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)\n* [PID](https://en.wikipedia.org/wiki/Process_identifier)\n* [x87 FPU](https://en.wikipedia.org/wiki/X87)\n* [control register](https://en.wikipedia.org/wiki/Control_register)\n* [MMX](https://en.wikipedia.org/wiki/MMX_%28instruction_set%29)\n* [Previous 
part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-6.md",
    "content": "Interrupts and Interrupt Handling. Part 6.\n================================================================================\n\nNon-maskable interrupt handler\n--------------------------------------------------------------------------------\n\nIt is sixth part of the [Interrupts and Interrupt Handling in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) we saw implementation of some exception handlers for the [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) exception, divide exception, invalid [opcode](https://en.wikipedia.org/wiki/Opcode) exceptions, etc. As I wrote in the previous part we will see implementations of the rest exceptions in this part. We will see implementation of the following handlers:\n\n* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) interrupt;\n* [BOUND](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm) Range Exceeded Exception;\n* [Coprocessor](https://en.wikipedia.org/wiki/Coprocessor) exception;\n* [SIMD](https://en.wikipedia.org/wiki/SIMD) coprocessor exception.\n\nin this part. So, let's start.\n\nNon-Maskable interrupt handling\n--------------------------------------------------------------------------------\n\nA [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) interrupt is a hardware interrupt that cannot be ignored by standard masking techniques. 
In a general way, a non-maskable interrupt can be generated in either of two ways:\n\n* External hardware asserts the non-maskable interrupt [pin](https://en.wikipedia.org/wiki/CPU_socket) on the CPU.\n* The processor receives a message on the system bus or the APIC serial bus with a delivery mode `NMI`.\n\nWhen the processor receives a `NMI` from one of these sources, the processor handles it immediately by calling the `NMI` handler pointed to by interrupt vector which has number `2` (see table in the first [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1)). We already filled the [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table) with the [vector number](https://en.wikipedia.org/wiki/Interrupt_vector_table), address of the `nmi` interrupt handler and `NMI_STACK` [Interrupt Stack Table entry](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/kernel-stacks):\n\n```C\nset_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);\n```\n\nin the `trap_init` function which defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) source code file. In the previous [parts](https://0xax.gitbook.io/linux-insides/summary/interrupts) we saw that entry points of the all interrupt handlers are defined with the:\n\n```assembly\n.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1\nENTRY(\\sym)\n...\n...\n...\nEND(\\sym)\n.endm\n```\n\nmacro from the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly source code file. But the handler of the `Non-Maskable` interrupts is not defined with this macro. 
It has own entry point:\n\n```assembly\nENTRY(nmi)\n...\n...\n...\nEND(nmi)\n```\n\nin the same [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file. Let's dive into it and will try to understand how `Non-Maskable` interrupt handler works. The `nmi` handler starts from the call of the:\n\n```assembly\nPARAVIRT_ADJUST_EXCEPTION_FRAME\n```\n\nmacro but we will not dive into details about it in this part, because this macro related to the [Paravirtualization](https://en.wikipedia.org/wiki/Paravirtualization) stuff which we will see in another chapter. After this save the content of the `rdx` register on the stack:\n\n```assembly\npushq\t%rdx\n```\n\nAnd check that `cs` was not the kernel segment when a non-maskable interrupt occurs:\n\n```assembly\ncmpl\t$__KERNEL_CS, 16(%rsp)\njne\tfirst_nmi\n```\n\nThe `__KERNEL_CS` macro defined in the [arch/x86/include/asm/segment.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/segment.h) and represented second descriptor in the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table):\n\n```C\n#define GDT_ENTRY_KERNEL_CS\t2\n#define __KERNEL_CS\t(GDT_ENTRY_KERNEL_CS*8)\n```\n\nmore about `GDT` you can read in the second [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2) of the Linux kernel booting process chapter. If `cs` is not kernel segment, it means that it is not nested `NMI` and we jump on the `first_nmi` label. Let's consider this case. First of all we restore the value of the `rdx` register which we saved on the top of the stack and push `1` to the stack in the `first_nmi` label:\n\n```assembly\nfirst_nmi:\n\tmovq\t(%rsp), %rdx\n\tpushq\t$1\n```\n\nWhy do we push `1` on the stack? As the comment says: `We allow breakpoints in NMIs`. 
On the [x86_64](https://en.wikipedia.org/wiki/X86-64), like other architectures, the CPU will not execute another `NMI` until the first `NMI` is completed. A `NMI` interrupt finished with the [iret](http://faydoc.tripod.com/cpu/iret.htm) instruction like other interrupts and exceptions do it. The `NMI` handler may trigger either a [page fault](https://en.wikipedia.org/wiki/Page_fault) or [breakpoint](https://en.wikipedia.org/wiki/Breakpoint) or another exception, and these use the `iret` instruction too. If this happens while in `NMI` context, the CPU will leave `NMI` context and a new `NMI` may come in. The `iret` used to return from those exceptions will re-enable `NMIs` and we will get nested non-maskable interrupts. The problem is that the `NMI` handler will not return to the state that it was, when the exception triggered, but instead it will return to a state that will allow new `NMIs` to preempt the running `NMI` handler. If another `NMI` comes in before the first NMI handler is complete, the new NMI will write all over the preempted `NMIs` stack. We can have nested `NMIs` where the next `NMI` is using the top of the stack of the previous `NMI`. It means that we cannot execute it because a nested non-maskable interrupt will corrupt stack of a previous non-maskable interrupt. That's why we have allocated space on the stack for temporary variable. We will check this variable that it was set when a previous `NMI` is executing and clear if it is not nested `NMI`. We push `1` here to the previously allocated space on the stack to denote that a `non-maskable` interrupt executed currently. Remember that when an `NMI` or another exception occurs we have the following [stack frame](https://en.wikipedia.org/wiki/Call_stack):\n\n```\n+------------------------+\n|         SS             |\n|         RSP            |\n|        RFLAGS          |\n|         CS             |\n|         RIP            |\n+------------------------+\n```\n\nand also an error code if an exception has it. 
So, after all of these manipulations our stack frame will look like this:\n\n```\n+------------------------+\n|         SS             |\n|         RSP            |\n|        RFLAGS          |\n|         CS             |\n|         RIP            |\n|         RDX            |\n|          1             |\n+------------------------+\n```\n\nIn the next step we allocate yet another `40` bytes on the stack:\n\n```assembly\nsubq\t$(5*8), %rsp\n```\n\nand pushes the copy of the original stack frame after the allocated space:\n\n```C\n.rept 5\npushq\t11*8(%rsp)\n.endr\n```\n\nwith the [.rept](http://tigcc.ticalc.org/doc/gnuasm.html#SEC116) assembly directive. We need in the copy of the original stack frame. Generally we need in two copies of the interrupt stack. First is `copied` interrupts stack: `saved` stack frame and `copied` stack frame. Now we pushes original stack frame to the `saved` stack frame which locates after the just allocated `40` bytes (`copied` stack frame). This stack frame is used to fixup the `copied` stack frame that a nested NMI may change. The second - `copied` stack frame modified by any nested `NMIs` to let the first `NMI` know that we triggered a second `NMI` and we should repeat the first `NMI` handler. 
Ok, we have made first copy of the original stack frame, now time to make second copy:\n\n```assembly\naddq\t$(10*8), %rsp\n\n.rept 5\npushq\t-6*8(%rsp)\n.endr\nsubq\t$(5*8), %rsp\n```\n\nAfter all of these manipulations our stack frame will be like this:\n\n```\n+-------------------------+\n| original SS             |\n| original Return RSP     |\n| original RFLAGS         |\n| original CS             |\n| original RIP            |\n+-------------------------+\n| temp storage for rdx    |\n+-------------------------+\n| NMI executing variable  |\n+-------------------------+\n| copied SS               |\n| copied Return RSP       |\n| copied RFLAGS           |\n| copied CS               |\n| copied RIP              |\n+-------------------------+\n| Saved SS                |\n| Saved Return RSP        |\n| Saved RFLAGS            |\n| Saved CS                |\n| Saved RIP               |\n+-------------------------+\n```\n\nAfter this we push dummy error code on the stack as we did it already in the previous exception handlers and allocate space for the general purpose registers on the stack:\n\n```assembly\npushq\t$-1\nALLOC_PT_GPREGS_ON_STACK\n```\n\nWe already saw implementation of the `ALLOC_PT_GPREGS_ON_STACK` macro in the third part of the interrupts [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). This macro defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) and yet another allocates `120` bytes on stack for the general purpose registers, from the `rdi` to the `r15`:\n\n```assembly\n.macro ALLOC_PT_GPREGS_ON_STACK addskip=0\naddq\t$-(15*8+\\addskip), %rsp\n.endm\n```\n\nAfter space allocation for the general registers we can see call of the `paranoid_entry`:\n\n```assembly\ncall\tparanoid_entry\n```\n\nWe can remember from the previous parts this label. 
It pushes general purpose registers on the stack, reads `MSR_GS_BASE` [Model Specific register](https://en.wikipedia.org/wiki/Model-specific_register) and checks its value. If the value of the `MSR_GS_BASE` is negative, we came from the kernel mode and just return from the `paranoid_entry`, in other way it means that we came from the usermode and need to execute `swapgs` instruction which will change user `gs` with the kernel `gs`:\n\n```assembly\nENTRY(paranoid_entry)\n\tcld\n\tSAVE_C_REGS 8\n\tSAVE_EXTRA_REGS 8\n\tmovl\t$1, %ebx\n\tmovl\t$MSR_GS_BASE, %ecx\n\trdmsr\n\ttestl\t%edx, %edx\n\tjs\t1f\n\tSWAPGS\n\txorl\t%ebx, %ebx\n1:\tret\nEND(paranoid_entry)\n```\n\nNote that after the `swapgs` instruction we zeroed the `ebx` register. Next time we will check content of this register and if we executed `swapgs` then `ebx` must contain `0` and `1` in other way. In the next step we store value of the `cr2` [control register](https://en.wikipedia.org/wiki/Control_register) to the `r12` register, because the `NMI` handler can cause `page fault` and corrupt the value of this control register:\n\n```assembly\nmovq\t%cr2, %r12\n```\n\nNow time to call actual `NMI` handler. We move the address of the `pt_regs` to the `rdi`, error code to the `rsi` and call the `do_nmi` handler:\n\n```assembly\nmovq\t%rsp, %rdi\nmovq\t$-1, %rsi\ncall\tdo_nmi\n```\n\nWe will back to the `do_nmi` little later in this part, but now let's look what occurs after the `do_nmi` will finish its execution. After the `do_nmi` handler will be finished we check the `cr2` register, because we could get a page fault while `do_nmi` was running and if we got it we restore original `cr2`, in other way we jump on the label `1`. After this we test content of the `ebx` register (remember it must contain `0` if we have used `swapgs` instruction and `1` if we didn't use it) and execute `SWAPGS_UNSAFE_STACK` if it contains `0` or jump to the `nmi_restore` label if it contains `1`. 
The `SWAPGS_UNSAFE_STACK` macro just expands to the `swapgs` instruction. In the `nmi_restore` label we restore general purpose registers, clear allocated space on the stack for this registers, clear our temporary variable and exit from the interrupt handler with the `INTERRUPT_RETURN` macro:\n\n```assembly\n\tmovq\t%cr2, %rcx\n\tcmpq\t%rcx, %r12\n\tje\t1f\n\tmovq\t%r12, %cr2\n1:\n\ttestl\t%ebx, %ebx\n\tjnz\tnmi_restore\nnmi_swapgs:\n\tSWAPGS_UNSAFE_STACK\nnmi_restore:\n\tRESTORE_EXTRA_REGS\n\tRESTORE_C_REGS\n\t/* Pop the extra iret frame at once */\n\tREMOVE_PT_GPREGS_FROM_STACK 6*8\n\t/* Clear the NMI executing stack variable */\n\tmovq\t$0, 5*8(%rsp)\n\tINTERRUPT_RETURN\n```\n\nwhere `INTERRUPT_RETURN` is defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/irqflags.h) and just expands to the `iret` instruction. That's all.\n\nNow let's consider case when another `NMI` interrupt occurred when previous `NMI` interrupt didn't finish its execution. You can remember from the beginning of this part that we've made a check that we came from userspace and jump on the `first_nmi` in this case:\n\n```assembly\ncmpl\t$__KERNEL_CS, 16(%rsp)\njne\tfirst_nmi\n```\n\nNote that in this case it is first `NMI` every time, because if the first `NMI` caught page fault, breakpoint or another exception it will be executed in the kernel mode. If we didn't come from userspace, first of all we test our temporary variable:\n\n```assembly\ncmpl\t$1, -8(%rsp)\nje\tnested_nmi\n```\n\nand if it is set to `1` we jump to the `nested_nmi` label. If it is not `1`, we test the `IST` stack. In the case of nested `NMIs` we check that we are above the `repeat_nmi`. In this case we ignore it, in other way we check that we above than `end_repeat_nmi` and jump on the `nested_nmi_out` label.\n\nNow let's look on the `do_nmi` exception handler. 
This function defined in the [arch/x86/kernel/nmi.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/nmi.c) source code file and takes two parameters:\n\n* address of the `pt_regs`;\n* error code.\n\nas all exception handlers. The `do_nmi` starts from the call of the `nmi_nesting_preprocess` function and ends with the call of the `nmi_nesting_postprocess`. The `nmi_nesting_preprocess` function checks that we likely do not work with the debug stack and if we on the debug stack set the `update_debug_stack` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable to `1` and call the `debug_stack_set_zero` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c). This function increases the `debug_stack_use_ctr` per-cpu variable and loads new `Interrupt Descriptor Table`:\n\n```C\nstatic inline void nmi_nesting_preprocess(struct pt_regs *regs)\n{\n        if (unlikely(is_debug_stack(regs->sp))) {\n                debug_stack_set_zero();\n                this_cpu_write(update_debug_stack, 1);\n        }\n}\n```\n\nThe `nmi_nesting_postprocess` function checks the `update_debug_stack` per-cpu variable which we set in the `nmi_nesting_preprocess` and resets debug stack or in another words it loads origin `Interrupt Descriptor Table`. After the call of the `nmi_nesting_preprocess` function, we can see the call of the `nmi_enter` in the `do_nmi`. The `nmi_enter` increases `lockdep_recursion` field of the interrupted process, update preempt counter and informs the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) subsystem about `NMI`. There is also `nmi_exit` function that does the same stuff as `nmi_enter`, but vice-versa. After the `nmi_enter` we increase `__nmi_count` in the `irq_stat` structure and call the `default_do_nmi` function. 
First of all in the `default_do_nmi` we check the address of the previous nmi and update address of the last nmi to the actual:\n\n```C\nif (regs->ip == __this_cpu_read(last_nmi_rip))\n    b2b = true;\nelse\n    __this_cpu_write(swallow_nmi, false);\n\n__this_cpu_write(last_nmi_rip, regs->ip);\n```\n\nAfter this first of all we need to handle CPU-specific `NMIs`:\n\n```C\nhandled = nmi_handle(NMI_LOCAL, regs, b2b);\n__this_cpu_add(nmi_stats.normal, handled);\n```\n\nAnd then non-specific `NMIs` depends on its reason:\n\n```C\nreason = x86_platform.get_nmi_reason();\nif (reason & NMI_REASON_MASK) {\n\tif (reason & NMI_REASON_SERR)\n\t\tpci_serr_error(reason, regs);\n\telse if (reason & NMI_REASON_IOCHK)\n\t\tio_check_error(reason, regs);\n\n\t__this_cpu_add(nmi_stats.external, 1);\n\treturn;\n}\n```\n\nThat's all.\n\nRange Exceeded Exception\n--------------------------------------------------------------------------------\n\nThe next exception is the `BOUND` range exceeded exception. The `BOUND` instruction determines if the first operand (array index) is within the bounds of an array specified the second operand (bounds operand). If the index is not within bounds, a `BOUND` range exceeded exception or `#BR` is occurred. The handler of the `#BR` exception is the `do_bounds` function that defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c). The `do_bounds` handler starts with the call of the `exception_enter` function and ends with the call of the `exception_exit`:\n\n```C\nprev_state = exception_enter();\n\nif (notify_die(DIE_TRAP, \"bounds\", regs, error_code,\n\t           X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)\n    goto exit;\n...\n...\n...\nexception_exit(prev_state);\nreturn;\n```\n\nAfter we have got the state of the previous context, we add the exception to the `notify_die` chain and if it will return `NOTIFY_STOP` we return from the exception. 
More about notify chains and the `context tracking` functions you can read in the [previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5). In the next step we enable interrupts if they were disabled with the `conditional_sti` function that checks `IF` flag and call the `local_irq_enable` depends on its value:\n\n```C\nconditional_sti(regs);\n\nif (!user_mode(regs))\n\tdie(\"bounds\", regs, error_code);\n```\n\nand check that if we didn't come from user mode we send `SIGSEGV` signal with the `die` function. After this we check is [MPX](https://en.wikipedia.org/wiki/Intel_MPX) enabled or not, and if this feature is disabled we jump on the `exit_trap` label:\n\n```C\nif (!cpu_feature_enabled(X86_FEATURE_MPX)) {\n\tgoto exit_trap;\n}\n```\n\nwhere we execute `do_trap` function (more about it you can find in the previous part):\n\n```C\nexit_trap:\n\tdo_trap(X86_TRAP_BR, SIGSEGV, \"bounds\", regs, error_code, NULL);\n\texception_exit(prev_state);\n```\n\nIf `MPX` feature is enabled we check the `BNDSTATUS` with the `get_xsave_field_ptr` function and if it is zero, it means that the `MPX` was not responsible for this exception:\n\n```C\nbndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);\nif (!bndcsr)\n\t\tgoto exit_trap;\n```\n\nAfter all of this, there is still only one way when `MPX` is responsible for this exception. We will not dive into the details about Intel Memory Protection Extensions in this part, but will see it in another chapter.\n\nCoprocessor exception and SIMD exception\n--------------------------------------------------------------------------------\n\nThe next two exceptions are [x87 FPU](https://en.wikipedia.org/wiki/X87) Floating-Point Error exception or `#MF` and [SIMD](https://en.wikipedia.org/wiki/SIMD) Floating-Point Exception or `#XF`. The first exception occurs when the `x87 FPU` has detected floating point error. For example divide by zero, numeric overflow, etc. 
The second exception occurs when the processor has detected [SSE/SSE2/SSE3](https://en.wikipedia.org/wiki/SSE3) `SIMD` floating-point exception. It can be the same as for the `x87 FPU`. The handlers for these exceptions are `do_coprocessor_error` and `do_simd_coprocessor_error` are defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and very similar on each other. They both make a call of the `math_error` function from the same source code file but pass different vector number. The `do_coprocessor_error` passes `X86_TRAP_MF` vector number to the `math_error`:\n\n```C\ndotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)\n{\n\tenum ctx_state prev_state;\n\n\tprev_state = exception_enter();\n\tmath_error(regs, error_code, X86_TRAP_MF);\n\texception_exit(prev_state);\n}\n```\n\nand `do_simd_coprocessor_error` passes `X86_TRAP_XF` to the `math_error` function:\n\n```C\ndotraplinkage void\ndo_simd_coprocessor_error(struct pt_regs *regs, long error_code)\n{\n\tenum ctx_state prev_state;\n\n\tprev_state = exception_enter();\n\tmath_error(regs, error_code, X86_TRAP_XF);\n\texception_exit(prev_state);\n}\n```\n\nFirst of all the `math_error` function defines current interrupted task, address of its FPU, string which describes an exception, add it to the `notify_die` chain and return from the exception handler if it will return `NOTIFY_STOP`:\n\n```C\n\tstruct task_struct *task = current;\n\tstruct fpu *fpu = &task->thread.fpu;\n\tsiginfo_t info;\n\tchar *str = (trapnr == X86_TRAP_MF) ? \"fpu exception\" :\n\t\t\t\t\t\t\"simd exception\";\n\n\tif (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)\n\t\treturn;\n```\n\nAfter this we check that we are from the kernel mode and if yes we will try to fix an exception with the `fixup_exception` function. 
If we cannot we fill the task with the exception's error code and vector number and die:\n\n```C\nif (!user_mode(regs)) {\n\tif (!fixup_exception(regs)) {\n\t\ttask->thread.error_code = error_code;\n\t\ttask->thread.trap_nr = trapnr;\n\t\tdie(str, regs, error_code);\n\t}\n\treturn;\n}\n```\n\nIf we came from the user mode, we save the `fpu` state, fill the task structure with the vector number of an exception and `siginfo_t` with the number of signal, `errno`, the address where exception occurred and signal code:\n\n```C\nfpu__save(fpu);\n\ntask->thread.trap_nr\t= trapnr;\ntask->thread.error_code = error_code;\ninfo.si_signo\t\t= SIGFPE;\ninfo.si_errno\t\t= 0;\ninfo.si_addr\t\t= (void __user *)uprobe_get_trap_addr(regs);\ninfo.si_code = fpu__exception_code(fpu, trapnr);\n```\n\nAfter this we check the signal code and if it is zero we return:\n\n```C\nif (!info.si_code)\n\treturn;\n```\n\nOtherwise we send the `SIGFPE` signal in the end:\n\n```C\nforce_sig_info(SIGFPE, &info, task);\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the sixth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw implementation of some exception handlers in this part, like `non-maskable` interrupt, [SIMD](https://en.wikipedia.org/wiki/SIMD) and [x87 FPU](https://en.wikipedia.org/wiki/X87) floating point exception. Finally, we finished with the `trap_init` function in this part and will go ahead in the next part. The next our point is the external interrupts and the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault)\n* [opcode](https://en.wikipedia.org/wiki/Opcode)\n* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt)\n* [BOUND instruction](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm)\n* [CPU socket](https://en.wikipedia.org/wiki/CPU_socket)\n* [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table)\n* [Interrupt Stack Table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/kernel-stacks)\n* [Paravirtualization](https://en.wikipedia.org/wiki/Paravirtualization)\n* [.rept](http://tigcc.ticalc.org/doc/gnuasm.html#SEC116)\n* [SIMD](https://en.wikipedia.org/wiki/SIMD)\n* [Coprocessor](https://en.wikipedia.org/wiki/Coprocessor)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [iret](http://faydoc.tripod.com/cpu/iret.htm)\n* [page fault](https://en.wikipedia.org/wiki/Page_fault)\n* [breakpoint](https://en.wikipedia.org/wiki/Breakpoint)\n* [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table)\n* [stack frame](https://en.wikipedia.org/wiki/Call_stack)\n* [Model Specific register](https://en.wikipedia.org/wiki/Model-specific_register)\n* [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [RCU](https://en.wikipedia.org/wiki/Read-copy-update)\n* [MPX](https://en.wikipedia.org/wiki/Intel_MPX)\n* [x87 FPU](https://en.wikipedia.org/wiki/X87)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-7.md",
    "content": "Interrupts and Interrupt Handling. Part 7.\n================================================================================\n\nIntroduction to external interrupts\n--------------------------------------------------------------------------------\n\nThis is the seventh part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) we have finished with the exceptions which are generated by the processor. In this part we will continue to dive to the interrupt handling and will start with the external hardware interrupt handling. As you can remember, in the previous part we have finished with the `trap_init` function from the [arch/x86/kernel/trap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and the next step is the call of the `early_irq_init` function from [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).\n\nInterrupts are signal that are sent across [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) or `Interrupt Request Line` by a hardware or software. External hardware interrupts allow devices like keyboard, mouse and etc, to indicate that it needs attention of the processor. Once the processor receives the `Interrupt Request`, it will temporary stop execution of the running program and invoke special routine which depends on an interrupt. We already know that this routine is called interrupt handler (or how we will call it `ISR` or `Interrupt Service Routine` from this part). The `ISR` or `Interrupt Handler Routine` can be found in Interrupt Vector table that is located at fixed address in the memory. After the interrupt is handled processor resumes the interrupted process. 
At the boot/initialization time, the Linux kernel identifies all devices in the machine, and appropriate interrupt handlers are loaded into the interrupt table. As we saw in the previous parts, most exceptions are handled simply by the sending a [Unix signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process. That's how the kernel can handle an exception quickly. Unfortunately we can not use this approach for the external hardware interrupts, because often they arrive after (and sometimes long after) the process to which they are related has been suspended. So it would make no sense to send a Unix signal to the current process. External interrupt handling depends on the type of an interrupt:\n\n* `I/O` interrupts;\n* Timer interrupts;\n* Interprocessor interrupts.\n\nI will try to describe all types of interrupts in this book.\n\nGenerally, a handler of an `I/O` interrupt must be flexible enough to service several devices at the same time. For example in the [PCI](https://en.wikipedia.org/wiki/Conventional_PCI) bus architecture several devices may share the same `IRQ` line. In the simplest way the Linux kernel must do following thing when an `I/O` interrupt occurred:\n\n* Save the value of an `IRQ` and the register's contents on the kernel stack;\n* Send an acknowledgment to the hardware controller which is servicing the `IRQ` line;\n* Execute the interrupt service routine (next we will call it `ISR`) which is associated with the device;\n* Restore registers and return from an interrupt;\n\nOk, we know a little theory and now let's start with the `early_irq_init` function. The implementation of the `early_irq_init` function is in the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function make early initialization of the `irq_desc` structure. The `irq_desc` structure is the foundation of interrupt management code in the Linux kernel. 
An array of this structure, which has the same name - `irq_desc`, keeps track of every interrupt request source in the Linux kernel. This structure is defined in the [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h) and as you can note it depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. This kernel configuration option enables support for sparse IRQs. The `irq_desc` structure contains many different fields:\n\n* `irq_common_data` - per irq and chip data passed down to chip functions;\n* `status_use_accessors` - contains status of the interrupt source which is combination of the values from the `enum` from the [include/linux/irq.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irq.h) and different macros which are defined in the same source code file;\n* `kstat_irqs` - irq stats per-cpu;\n* `handle_irq` - highlevel irq-events handler;\n* `action` - identifies the interrupt service routines to be invoked when the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) occurs;\n* `irq_count` - counter of interrupt occurrences on the IRQ line;\n* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once;\n* `last_unhandled` - aging timer for unhandled count;\n* `irqs_unhandled` - count of the unhandled interrupts;\n* `lock`  - a spin lock used to serialize the accesses to the `IRQ` descriptor;\n* `pending_mask` - pending rebalanced interrupts;\n* `owner` - an owner of interrupt descriptor. Interrupt descriptors can be allocated from modules. This field is needed to provide a refcount on the module which provides the interrupts;\n* etc.\n\nOf course it is not all fields of the `irq_desc` structure, because it is too long to describe each field of this structure, but we will see it all soon. 
Now let's start to dive into the implementation of the `early_irq_init` function.\n\nEarly external interrupts initialization\n--------------------------------------------------------------------------------\n\nNow, let's look on the implementation of the `early_irq_init` function. Note that implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. Now we consider implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` kernel configuration option is not set. This function starts from the declaration of the following variables: `irq` descriptors counter, loop counter, memory node and the `irq_desc` descriptor:\n\n```C\nint __init early_irq_init(void)\n{\n        int count, i, node = first_online_node;\n        struct irq_desc *desc;\n\t\t...\n\t\t...\n\t\t...\n}\n```\n\nThe `node` is an online [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) node which depends on the `MAX_NUMNODES` value which depends on the `CONFIG_NODES_SHIFT` kernel configuration parameter:\n\n```C\n#define MAX_NUMNODES    (1 << NODES_SHIFT)\n...\n...\n...\n#ifdef CONFIG_NODES_SHIFT\n    #define NODES_SHIFT     CONFIG_NODES_SHIFT\n#else\n    #define NODES_SHIFT     0\n#endif\n```\n\nAs I already wrote, implementation of the `first_online_node` macro depends on the `MAX_NUMNODES` value:\n\n```C\n#if MAX_NUMNODES > 1\n  #define first_online_node       first_node(node_states[N_ONLINE])\n#else\n  #define first_online_node       0\n#endif\n```\n\nThe `node_states` is the [enum](https://en.wikipedia.org/wiki/Enumerated_type) which is defined in the [include/linux/nodemask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/nodemask.h) and represent the set of the states of a node. In our case we are searching an online node and it will be `0` if `MAX_NUMNODES` is one or zero. 
If the `MAX_NUMNODES` is greater than one, the `node_states[N_ONLINE]` will return `1` and the `first_node` macro will be expanded to the call of the `__first_node` function which will return `minimal` or the first online node:\n\n```C\n#define first_node(src) __first_node(&(src))\n\nstatic inline int __first_node(const nodemask_t *srcp)\n{\n        return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));\n}\n```\n\nMore about this will be in the another chapter about the `NUMA`. The next step after the declaration of these local variables is the call of the:\n\n```C\ninit_irq_default_affinity();\n```\n\nfunction. The `init_irq_default_affinity` function defined in the same source code file and depends on the `CONFIG_SMP` kernel configuration option allocates a given [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) structure (in our case it is the `irq_default_affinity`):\n\n```C\n#if defined(CONFIG_SMP)\ncpumask_var_t irq_default_affinity;\n\nstatic void __init init_irq_default_affinity(void)\n{\n        alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);\n        cpumask_setall(irq_default_affinity);\n}\n#else\nstatic void __init init_irq_default_affinity(void)\n{\n}\n#endif\n```\n\nWe know that when a hardware, such as disk controller or keyboard, needs attention from the processor, it throws an interrupt. The interrupt tells to the processor that something has happened and that the processor should interrupt current process and handle an incoming event. In order to prevent multiple devices from sending the same interrupts, the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) system was established where each device in a computer system is assigned its own special IRQ so that its interrupts are unique. Linux kernel can assign certain `IRQs` to specific processors. 
This is known as `SMP IRQ affinity`, and it allows you to control how your system will respond to various hardware events (that's why it has certain implementation only if the `CONFIG_SMP` kernel configuration option is set). After we allocated `irq_default_affinity` cpumask, we can see `printk` output:\n\n```C\nprintk(KERN_INFO \"NR_IRQS:%d\\n\", NR_IRQS);\n```\n\nwhich prints `NR_IRQS`:\n\n```C\n~$ dmesg | grep NR_IRQS\n[    0.000000] NR_IRQS:4352\n```\n\nThe `NR_IRQS` is the maximum number of the `irq` descriptors or in another words maximum number of interrupts. Its value depends on the state of the `CONFIG_X86_IO_APIC` kernel configuration option. If the `CONFIG_X86_IO_APIC` is not set and the Linux kernel uses an old [PIC](https://en.wikipedia.org/wiki/Programmable_Interrupt_Controller) chip, the `NR_IRQS` is:\n\n```C\n#define NR_IRQS_LEGACY                    16\n\n#ifdef CONFIG_X86_IO_APIC\n...\n...\n...\n#else\n# define NR_IRQS                        NR_IRQS_LEGACY\n#endif\n```\n\nIn other way, when the `CONFIG_X86_IO_APIC` kernel configuration option is set, the `NR_IRQS` depends on the amount of the processors and amount of the interrupt vectors:\n\n```C\n#define CPU_VECTOR_LIMIT               (64 * NR_CPUS)\n#define NR_VECTORS                     256\n#define IO_APIC_VECTOR_LIMIT           ( 32 * MAX_IO_APICS )\n#define MAX_IO_APICS                   128\n\n# define NR_IRQS                                       \\\n        (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?     
\\\n                (NR_VECTORS + CPU_VECTOR_LIMIT)  :     \\\n                (NR_VECTORS + IO_APIC_VECTOR_LIMIT))\n...\n...\n...\n```\n\nWe remember from the previous parts, that the amount of processors we can set during Linux kernel configuration process with the `CONFIG_NR_CPUS` configuration option:\n\n![kernel](images/kernel.png)\n\nIn the first case (`CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT`), the `NR_IRQS` will be `NR_VECTORS + CPU_VECTOR_LIMIT`, in the second case (`CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT`), the `NR_IRQS` will be `NR_VECTORS + IO_APIC_VECTOR_LIMIT`. In my case the `NR_CPUS` is `8` as you can see in my configuration, the `CPU_VECTOR_LIMIT` is `512` and the `IO_APIC_VECTOR_LIMIT` is `4096`, so the second case applies. So `NR_IRQS` for my configuration is `256 + 4096 = 4352`:\n\n```\n~$ dmesg | grep NR_IRQS\n[    0.000000] NR_IRQS:4352\n```\n\nIn the next step we assign array of the IRQ descriptors to the `desc` variable which we defined in the start of the `early_irq_init` function and calculate count of the `irq_desc` array with the `ARRAY_SIZE` macro:\n\n```C\ndesc = irq_desc;\ncount = ARRAY_SIZE(irq_desc);\n```\n\nThe `irq_desc` array defined in the same source code file and looks like:\n\n```C\nstruct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {\n        [0 ... NR_IRQS-1] = {\n                .handle_irq     = handle_bad_irq,\n                .depth          = 1,\n                .lock           = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),\n        }\n};\n```\n\nThe `irq_desc` is array of the `irq` descriptors. It has three already initialized fields:\n\n* `handle_irq` - as I already wrote above, this field is the highlevel irq-event handler. 
In our case it initialized with the `handle_bad_irq` function that defined in the [kernel/irq/handle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/handle.c) source code file and handles spurious and unhandled IRQs;\n* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once;\n* `lock` - A spin lock used to serialize the accesses to the `IRQ` descriptor.\n\nAs we calculated count of the interrupts and initialized our `irq_desc` array, we start to fill descriptors in the loop:\n\n```C\nfor (i = 0; i < count; i++) {\n    desc[i].kstat_irqs = alloc_percpu(unsigned int);\n    alloc_masks(&desc[i], GFP_KERNEL, node);\n    raw_spin_lock_init(&desc[i].lock);\n    lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);\n\tdesc_set_defaults(i, &desc[i], node, NULL);\n}\n```\n\nWe are going through the all interrupt descriptors and do the following things:\n\nFirst of all we allocate [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable for the `irq` kernel statistic with the `alloc_percpu` macro. This macro allocates one instance of an object of the given type for every processor on the system. You can access kernel statistic from the userspace via `/proc/stat`:\n\n```\n~$ cat /proc/stat\ncpu  207907 68 53904 5427850 14394 0 394 0 0 0\ncpu0 25881 11 6684 679131 1351 0 18 0 0 0\ncpu1 24791 16 5894 679994 2285 0 24 0 0 0\ncpu2 26321 4 7154 678924 664 0 71 0 0 0\ncpu3 26648 8 6931 678891 414 0 244 0 0 0\n...\n...\n...\n```\n\nWhere the sixth column is the servicing interrupts. After this we allocate [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) for the given irq descriptor affinity and initialize the [spinlock](https://en.wikipedia.org/wiki/Spinlock) for the given interrupt descriptor. 
After this before the [critical section](https://en.wikipedia.org/wiki/Critical_section), the lock will be acquired with a call of the `raw_spin_lock` and unlocked with the call of the `raw_spin_unlock`. In the next step we call the `lockdep_set_class` macro which sets the [Lock validator](https://lwn.net/Articles/185666/) `irq_desc_lock_class` class for the lock of the given interrupt descriptor. More about `lockdep`, `spinlock` and other synchronization primitives will be described in the separate chapter.\n\nIn the end of the loop we call the `desc_set_defaults` function from the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function takes four parameters:\n\n* number of an irq;\n* interrupt descriptor;\n* online `NUMA` node;\n* owner of interrupt descriptor. Interrupt descriptors can be allocated from modules. This field is needed to provide a refcount on the module which provides the interrupts;\n\nand fills the rest of the `irq_desc` fields. The `desc_set_defaults` function fills interrupt number, `irq` chip, platform-specific per-chip private data for the chip methods, per-IRQ data for the `irq_chip` methods and [MSI](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) descriptor for the per `irq` and `irq` chip data:\n\n```C\ndesc->irq_data.irq = irq;\ndesc->irq_data.chip = &no_irq_chip;\ndesc->irq_data.chip_data = NULL;\ndesc->irq_data.handler_data = NULL;\ndesc->irq_data.msi_desc = NULL;\n...\n...\n...\n```\n\nThe `irq_data.chip` structure provides general `API` like the `irq_set_chip`, `irq_set_irq_type`, etc., for the irq controller [drivers](https://github.com/torvalds/linux/tree/master/drivers/irqchip). 
You can find it in the [kernel/irq/chip.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/chip.c) source code file.\n\nAfter this we set the status of the accessor for the given descriptor and set disabled state of the interrupts:\n\n```C\n...\n...\n...\nirq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);\nirqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);\n...\n...\n...\n```\n\nIn the next step we set the high level interrupt handlers to the `handle_bad_irq` which handles spurious and unhandled IRQs (as the hardware stuff is not initialized yet, we set this handler), set `irq_desc.desc` to `1` which means that an `IRQ` is disabled, reset count of the unhandled interrupts and interrupts in general:\n\n```C\n...\n...\n...\ndesc->handle_irq = handle_bad_irq;\ndesc->depth = 1;\ndesc->irq_count = 0;\ndesc->irqs_unhandled = 0;\ndesc->name = NULL;\ndesc->owner = owner;\n...\n...\n...\n```\n\nAfter this we go through the all [possible](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) processor with the [for_each_possible_cpu](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cpumask.h#L714) helper and set the `kstat_irqs` to zero for the given interrupt descriptor:\n\n```C\n\tfor_each_possible_cpu(cpu)\n\t\t*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;\n```\n\nand call the `desc_smp_init` function from the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c) that initializes `NUMA` node of the given interrupt descriptor, sets default `SMP` affinity and clears the `pending_mask` of the given interrupt descriptor depends on the value of the `CONFIG_GENERIC_PENDING_IRQ` kernel configuration option:\n\n```C\nstatic void desc_smp_init(struct irq_desc *desc, int node)\n{\n        desc->irq_data.node = node;\n        cpumask_copy(desc->irq_data.affinity, irq_default_affinity);\n#ifdef 
CONFIG_GENERIC_PENDING_IRQ\n        cpumask_clear(desc->pending_mask);\n#endif\n}\n```\n\nIn the end of the `early_irq_init` function we return the return value of the `arch_early_irq_init` function:\n\n```C\nreturn arch_early_irq_init();\n```\n\nThis function defined in the [kernel/apic/vector.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/vector.c) and contains only one call of the `arch_early_ioapic_init` function from the [kernel/apic/io_apic.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/io_apic.c). As we can understand from the `arch_early_ioapic_init` function's name, this function makes early initialization of the [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). First of all it make a check of the number of the legacy interrupts with the call of the `nr_legacy_irqs` function. If we have no legacy interrupts with the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) programmable interrupt controller we set `io_apic_irqs` to the `0xffffffffffffffff`:\n\n```C\nif (!nr_legacy_irqs())\n\tio_apic_irqs = ~0UL;\n```\n\nAfter this we are going through the all `I/O APICs` and allocate space for the registers with the call of the `alloc_ioapic_saved_registers`:\n\n```C\nfor_each_ioapic(i)\n\talloc_ioapic_saved_registers(i);\n```\n\nAnd in the end of the `arch_early_ioapic_init` function we are going through the all legacy IRQs (from `IRQ0` to `IRQ15`) in the loop and allocate space for the `irq_cfg` which represents configuration of an irq on the given `NUMA` node:\n\n```C\nfor (i = 0; i < nr_legacy_irqs(); i++) {\n    cfg = alloc_irq_and_cfg_at(i, node);\n    cfg->vector = IRQ0_VECTOR + i;\n    cpumask_setall(cfg->domain);\n}\n```\n\nThat's all.\n\nSparse IRQs\n--------------------------------------------------------------------------------\n\nWe already saw in the beginning of this part that implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel 
configuration option. Previously we saw implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` configuration option is not set, now let's look at its implementation when this option is set. Implementation of this function very similar, but little differ. We can see the same definition of variables and call of the `init_irq_default_affinity` in the beginning of the `early_irq_init` function:\n\n```C\n#ifdef CONFIG_SPARSE_IRQ\nint __init early_irq_init(void)\n{\n    int i, initcnt, node = first_online_node;\n\tstruct irq_desc *desc;\n\n\tinit_irq_default_affinity();\n\t...\n\t...\n\t...\n}\n#else\n...\n...\n...\n```\n\nBut after this we can see the following call:\n\n```C\ninitcnt = arch_probe_nr_irqs();\n```\n\nThe `arch_probe_nr_irqs` function defined in the [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/vector.c) and calculates count of the pre-allocated IRQs and update `nr_irqs` with this number. But stop. Why are there pre-allocated IRQs? There is alternative form of interrupts called - [Message Signaled Interrupts](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) available in the [PCI](https://en.wikipedia.org/wiki/Conventional_PCI). Instead of assigning a fixed number of the interrupt request, the device is allowed to record a message at a particular address of RAM, in fact, the display on the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs). `MSI` permits a device to allocate `1`, `2`, `4`, `8`, `16` or `32` interrupts and `MSI-X` permits a device to allocate up to `2048` interrupts. Now we know that IRQs can be pre-allocated. More about `MSI` will be in a next part, but now let's look on the `arch_probe_nr_irqs` function. 
We can see the check which assign amount of the interrupt vectors for the each processor in the system to the `nr_irqs` if it is greater and calculate the `nr` which represents number of `MSI` interrupts:\n\n```C\nint nr_irqs = NR_IRQS;\n\nif (nr_irqs > (NR_VECTORS * nr_cpu_ids))\n\tnr_irqs = NR_VECTORS * nr_cpu_ids;\n\nnr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;\n```\n\nTake a look on the `gsi_top` variable. Each `APIC` is identified with its own `ID` and with the offset where its `IRQ` starts. It is called `GSI` base or `Global System Interrupt` base. So the `gsi_top` represents it. We get the `Global System Interrupt` base from the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) (you can remember that we have parsed this table in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux kernel initialization process chapter).\n\nAfter this we update the `nr` depending on the value of the `gsi_top`:\n\n```C\n#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)\n        if (gsi_top <= NR_IRQS_LEGACY)\n                nr +=  8 * nr_cpu_ids;\n        else\n                nr += gsi_top * 16;\n#endif\n```\n\nUpdate the `nr_irqs` if it is greater than `nr` and return the number of the legacy IRQs:\n\n```C\nif (nr < nr_irqs)\n    nr_irqs = nr;\n\nreturn nr_legacy_irqs();\n}\n```\n\nThe next after the `arch_probe_nr_irqs` is printing information about number of `IRQs`:\n\n```C\nprintk(KERN_INFO \"NR_IRQS:%d nr_irqs:%d %d\\n\", NR_IRQS, nr_irqs, initcnt);\n```\n\nWe can find it in the [dmesg](https://en.wikipedia.org/wiki/Dmesg) output:\n\n```\n$ dmesg | grep NR_IRQS\n[    0.000000] NR_IRQS:4352 nr_irqs:488 16\n```\n\nAfter this we do some checks that `nr_irqs` and `initcnt` values is not greater than maximum allowable number of `irqs`:\n\n```C\nif (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))\n    nr_irqs = IRQ_BITMAP_BITS;\n\nif (WARN_ON(initcnt > IRQ_BITMAP_BITS))\n 
   initcnt = IRQ_BITMAP_BITS;\n```\n\nwhere `IRQ_BITMAP_BITS` is equal to the `NR_IRQS` if the `CONFIG_SPARSE_IRQ` is not set and `NR_IRQS + 8196` in other way. In the next step we are going over all interrupt descriptors which need to be allocated in the loop and allocate space for the descriptor and insert to the `irq_desc_tree` [radix tree](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-2):\n\n```C\nfor (i = 0; i < initcnt; i++) {\n    desc = alloc_desc(i, node, NULL);\n    set_bit(i, allocated_irqs);\n\tirq_insert_desc(i, desc);\n}\n```\n\nIn the end of the `early_irq_init` function we return the value of the call of the `arch_early_irq_init` function as we did it already in the previous variant when the `CONFIG_SPARSE_IRQ` option was not set:\n\n```C\nreturn arch_early_irq_init();\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the seventh part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we started to dive into external hardware interrupts in this part. We saw early initialization of the `irq_desc` structure which represents description of an external interrupt and contains information about it like list of irq actions, information about interrupt handler, interrupt's owner, count of the unhandled interrupt and etc. In the next part we will continue to research external interrupts.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [numa](https://en.wikipedia.org/wiki/Non-uniform_memory_access)\n* [Enum type](https://en.wikipedia.org/wiki/Enumerated_type)\n* [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [spinlock](https://en.wikipedia.org/wiki/Spinlock)\n* [critical section](https://en.wikipedia.org/wiki/Critical_section)\n* [Lock validator](https://lwn.net/Articles/185666/)\n* [MSI](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts)\n* [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs)\n* [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259)\n* [PIC](https://en.wikipedia.org/wiki/Programmable_Interrupt_Controller)\n* [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification)\n* [radix tree](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-2)\n* [dmesg](https://en.wikipedia.org/wiki/Dmesg)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-8.md",
    "content": "Interrupts and Interrupt Handling. Part 8.\n================================================================================\n\nNon-early initialization of the IRQs\n--------------------------------------------------------------------------------\n\nThis is the eighth part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-7) we started to dive into the external hardware [interrupts](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29). We looked on the implementation of the `early_irq_init` function from the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c) source code file and saw the initialization of the `irq_desc` structure in this function. Remind that `irq_desc` structure (defined in the [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h#L46) is the foundation of interrupt management code in the Linux kernel and represents an interrupt descriptor. In this part we will continue to dive into the initialization stuff which is related to the external hardware interrupts.\n\nRight after the call of the `early_irq_init` function in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) we can see the call of the `init_IRQ` function. This function is architecture-specific and defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c). 
The `init_IRQ` function makes initialization of the `vector_irq` [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable that defined in the same [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) source code file:\n\n```C\n...\nDEFINE_PER_CPU(vector_irq_t, vector_irq) = {\n         [0 ... NR_VECTORS - 1] = -1,\n};\n...\n```\n\nand represents `percpu` array of the interrupt vector numbers. The `vector_irq_t` defined in the [arch/x86/include/asm/hw_irq.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/hw_irq.h) and expands to the:\n\n```C\ntypedef int vector_irq_t[NR_VECTORS];\n```\n\nwhere `NR_VECTORS` is count of the vector number and as you can remember from the first [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) of this chapter it is `256` for the [x86_64](https://en.wikipedia.org/wiki/X86-64):\n\n```C\n#define NR_VECTORS                       256\n```\n\nSo, in the start of the `init_IRQ` function we fill the `vector_irq` [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) array with the vector number of the `legacy` interrupts:\n\n```C\nvoid __init init_IRQ(void)\n{\n\tint i;\n\n\tfor (i = 0; i < nr_legacy_irqs(); i++)\n\t\tper_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;\n...\n...\n...\n}\n```\n\nThis `vector_irq` will be used during the first steps of an external hardware interrupt handling in the `do_IRQ` function from the [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c):\n\n```C\n__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)\n{\n\t...\n\t...\n\t...\n\tirq = __this_cpu_read(vector_irq[vector]);\n\n\tif (!handle_irq(irq, regs)) {\n\t\t...\n\t\t...\n\t\t...\n\t}\n\n\texiting_irq();\n\t...\n\t...\n\treturn 1;\n}\n```\n\nWhy is `legacy` here? 
Actually all interrupts are handled by the modern [IO-APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs) controller. But these interrupts (from `0x30` to `0x3f`) are handled by legacy interrupt-controllers like [Programmable Interrupt Controller](https://en.wikipedia.org/wiki/Programmable_Interrupt_Controller). If these interrupts are handled by the `I/O APIC` then this vector space will be freed and re-used. Let's look at this code more closely. First of all the `nr_legacy_irqs` is defined in the [arch/x86/include/asm/i8259.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/i8259.h) and just returns the `nr_legacy_irqs` field from the `legacy_pic` structure:\n\n```C\nstatic inline int nr_legacy_irqs(void)\n{\n        return legacy_pic->nr_legacy_irqs;\n}\n```\n\nThis structure is defined in the same header file and represents a non-modern programmable interrupt controller:\n\n```C\nstruct legacy_pic {\n        int nr_legacy_irqs;\n        struct irq_chip *chip;\n        void (*mask)(unsigned int irq);\n        void (*unmask)(unsigned int irq);\n        void (*mask_all)(void);\n        void (*restore_mask)(void);\n        void (*init)(int auto_eoi);\n        int (*irq_pending)(unsigned int irq);\n        void (*make_irq)(unsigned int irq);\n};\n```\n\nThe actual default maximum number of the legacy interrupts is represented by the `NR_IRQS_LEGACY` macro from the [arch/x86/include/asm/irq_vectors.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irq_vectors.h):\n\n```C\n#define NR_IRQS_LEGACY                    16\n```\n\nIn the loop we are accessing the `vector_irq` per-cpu array with the `per_cpu` macro by the `IRQ0_VECTOR + i` index and write the legacy vector number there. 
The `IRQ0_VECTOR` macro is defined in the [arch/x86/include/asm/irq_vectors.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irq_vectors.h) header file and expands to the `0x30`:\n\n```C\n#define FIRST_EXTERNAL_VECTOR           0x20\n\n#define IRQ0_VECTOR                     ((FIRST_EXTERNAL_VECTOR + 16) & ~15)\n```\n\nWhy is `0x30` here? You can remember from the first [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) of this chapter that first 32 vector numbers from `0` to `31` are reserved by the processor and used for the processing of architecture-defined exceptions and interrupts. Vector numbers from `0x30` to `0x3f` are reserved for the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture). So, it means that we fill the `vector_irq` from the `IRQ0_VECTOR` which is equal to the `0x30` (`48`) up to the `IRQ0_VECTOR + 16` (before the `0x40`).\n\nIn the end of the `init_IRQ` function we can see the call of the following function:\n\n```C\nx86_init.irqs.intr_init();\n```\n\nfrom the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c) source code file. If you have read [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you can remember the `x86_init` structure. This structure contains a couple of fields which point to the functions related to the platform setup (`x86_64` in our case), for example `resources` - related with the memory resources, `mpparse` - related with the parsing of the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table, etc. 
As we can see the `x86_init` also contains the `irqs` field which contains the three following fields:\n\n```C\nstruct x86_init_ops x86_init __initdata\n{\n\t...\n\t...\n\t...\n    .irqs = {\n                .pre_vector_init        = init_ISA_irqs,\n                .intr_init              = native_init_IRQ,\n                .trap_init              = x86_init_noop,\n\t},\n\t...\n\t...\n\t...\n}\n```\n\nNow, we are interesting in the `native_init_IRQ`. As we can note, the name of the `native_init_IRQ` function contains the `native_` prefix which means that this function is architecture-specific. It defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) and executes general initialization of the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs) and initialization of the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) irqs. Let's look at the implementation of the `native_init_IRQ` function and try to understand what occurs there. The `native_init_IRQ` function starts from the execution of the following function:\n\n```C\nx86_init.irqs.pre_vector_init();\n```\n\nAs we can see above, the `pre_vector_init` points to the `init_ISA_irqs` function that defined in the same [source code](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) file and as we can understand from the function's name, it makes initialization of the `ISA` related interrupts. The `init_ISA_irqs` function starts from the definition of the `chip` variable which has a `irq_chip` type:\n\n```C\nvoid __init init_ISA_irqs(void)\n{\n\tstruct irq_chip *chip = legacy_pic->chip;\n\t...\n\t...\n\t...\n```\n\nThe `irq_chip` structure defined in the [include/linux/irq.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irq.h) header file and represents hardware interrupt chip descriptor. 
It contains:\n\n* `name` - name of a device. Used in the `/proc/interrupts`:\n\n```C\n$ cat /proc/interrupts\n           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7\n  0:         16          0          0          0          0          0          0          0   IO-APIC   2-edge      timer\n  1:          2          0          0          0          0          0          0          0   IO-APIC   1-edge      i8042\n  8:          1          0          0          0          0          0          0          0   IO-APIC   8-edge      rtc0\n```\n\nlook at the last column;\n\n* `(*irq_mask)(struct irq_data *data)`  - mask an interrupt source;\n* `(*irq_ack)(struct irq_data *data)` - start of a new interrupt;\n* `(*irq_startup)(struct irq_data *data)` - start up the interrupt;\n* `(*irq_shutdown)(struct irq_data *data)` - shutdown the interrupt\n* etc.\n\nfields. Note that the `irq_data` structure represents set of the per irq chip data passed down to chip functions. It contains `mask` - precomputed bitmask for accessing the chip registers, `irq` - interrupt number, `hwirq` - hardware interrupt number, local to the interrupt domain chip low level interrupt hardware access, etc.\n\nAfter this depends on the `CONFIG_X86_64` and `CONFIG_X86_LOCAL_APIC` kernel configuration option call the `init_bsp_APIC` function from the [arch/x86/kernel/apic/apic.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/apic.c):\n\n```C\n#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)\n\tinit_bsp_APIC();\n#endif\n```\n\nThis function makes initialization of the [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) of `bootstrap processor` (or processor which starts first). 
It starts from the check that we did not find [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) config (read more about it in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux kernel initialization process chapter) and that the processor has `APIC`:\n\n```C\nif (smp_found_config || !cpu_has_apic)\n\treturn;\n```\n\nOtherwise, we return from this function. In the next step we call the `clear_local_APIC` function from the same source code file that shuts down the local `APIC` (more on it in the `Advanced Programmable Interrupt Controller` chapter) and enable `APIC` of the first processor by setting the `APIC_SPIV_APIC_ENABLED` bit in the `unsigned int value`:\n\n```C\nvalue = apic_read(APIC_SPIV);\nvalue &= ~APIC_VECTOR_MASK;\nvalue |= APIC_SPIV_APIC_ENABLED;\n```\n\nand writing it with the help of the `apic_write` function:\n\n```C\napic_write(APIC_SPIV, value);\n```\n\nAfter we have enabled `APIC` for the bootstrap processor, we return to the `init_ISA_irqs` function and in the next step we initialize legacy `Programmable Interrupt Controller` and set the legacy chip and handler for each legacy irq:\n\n```C\nlegacy_pic->init(0);\n\nfor (i = 0; i < nr_legacy_irqs(); i++)\n    irq_set_chip_and_handler(i, chip, handle_level_irq);\n```\n\nWhere can we find the `init` function? 
The `legacy_pic` defined in the [arch/x86/kernel/i8259.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/i8259.c) and it is:\n\n```C\nstruct legacy_pic *legacy_pic = &default_legacy_pic;\n```\n\nWhere the `default_legacy_pic` is:\n\n```C\nstruct legacy_pic default_legacy_pic = {\n\t...\n\t...\n\t...\n\t.init = init_8259A,\n\t...\n\t...\n\t...\n}\n```\n\nThe `init_8259A` function defined in the same source code file and executes initialization of the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) `Programmable Interrupt Controller` (more about it will be in the separate chapter about `Programmable Interrupt Controllers` and `APIC`).\n\nNow we can return to the `native_init_IRQ` function, after the `init_ISA_irqs` function finished its work. The next step is the call of the `apic_intr_init` function that allocates special interrupt gates which are used by the [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) architecture for the [Inter-processor interrupt](https://en.wikipedia.org/wiki/Inter-processor_interrupt). The `alloc_intr_gate` macro from the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) used for the interrupt descriptor allocation:\n\n```C\n#define alloc_intr_gate(n, addr)                        \\\ndo {                                                    \\\n        alloc_system_vector(n);                         \\\n        set_intr_gate(n, addr);                         \\\n} while (0)\n```\n\nAs we can see, first of all it expands to the call of the `alloc_system_vector` function that checks the given vector number in the `used_vectors` bitmap (read previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-7) about it) and if it is not set in the `used_vectors` bitmap we set it. 
After this we test that the `first_system_vector` is greater than given interrupt vector number and if it is greater we assign it:\n\n```C\nif (!test_bit(vector, used_vectors)) {\n\tset_bit(vector, used_vectors);\n    if (first_system_vector > vector)\n\t\tfirst_system_vector = vector;\n} else {\n\tBUG();\n}\n```\n\nWe already saw the `set_bit` macro, now let's look at the `test_bit` and the `first_system_vector`. The first `test_bit` macro defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/bitops.h) and looks like this:\n\n```C\n#define test_bit(nr, addr)                      \\\n        (__builtin_constant_p((nr))             \\\n         ? constant_test_bit((nr), (addr))      \\\n         : variable_test_bit((nr), (addr)))\n```\n\nWe can see the [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation) here makes a test with the [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) built-in function `__builtin_constant_p` tests that given vector number (`nr`) is known at compile time. If you're feeling misunderstanding of the `__builtin_constant_p`, we can make simple test:\n\n```C\n#include <stdio.h>\n\n#define PREDEFINED_VAL 1\n\nint main() {\n\tint i = 5;\n\tprintf(\"__builtin_constant_p(i) is %d\\n\", __builtin_constant_p(i));\n\tprintf(\"__builtin_constant_p(PREDEFINED_VAL) is %d\\n\", __builtin_constant_p(PREDEFINED_VAL));\n\tprintf(\"__builtin_constant_p(100) is %d\\n\", __builtin_constant_p(100));\n\n\treturn 0;\n}\n```\n\nand look at the result:\n\n```\n$ gcc test.c -o test\n$ ./test\n__builtin_constant_p(i) is 0\n__builtin_constant_p(PREDEFINED_VAL) is 1\n__builtin_constant_p(100) is 1\n```\n\nNow I think it must be clear for you. Let's get back to the `test_bit` macro. 
If the `__builtin_constant_p` returns non-zero, we call the `constant_test_bit` function:\n\n```C\nstatic inline int constant_test_bit(int nr, const void *addr)\n{\n\tconst u32 *p = (const u32 *)addr;\n\n\treturn ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;\n}\n```\n\nand the `variable_test_bit` in other way:\n\n```C\nstatic inline int variable_test_bit(int nr, const void *addr)\n{\n        u8 v;\n        const u32 *p = (const u32 *)addr;\n\n        asm(\"btl %2,%1; setc %0\" : \"=qm\" (v) : \"m\" (*p), \"Ir\" (nr));\n        return v;\n}\n```\n\nWhat's the difference between these two functions and why do we need two different functions for the same purpose? As you already can guess main purpose is optimization. If we write simple example with these functions:\n\n```C\n#define CONST 25\n\nint main() {\n\tint nr = 24;\n\tvariable_test_bit(nr, (int*)0x10000000);\n\tconstant_test_bit(CONST, (int*)0x10000000);\n\treturn 0;\n}\n```\n\nand will look at the assembly output of our example we will see following assembly code:\n\n```assembly\npushq\t%rbp\nmovq\t%rsp, %rbp\n\nmovl\t$268435456, %esi\nmovl\t$25, %edi\ncall\tconstant_test_bit\n```\n\nfor the `constant_test_bit`, and:\n\n```assembly\npushq\t%rbp\nmovq\t%rsp, %rbp\n\nsubq\t$16, %rsp\nmovl\t$24, -4(%rbp)\nmovl\t-4(%rbp), %eax\nmovl\t$268435456, %esi\nmovl\t%eax, %edi\ncall\tvariable_test_bit\n```\n\nfor the `variable_test_bit`. These two code listings start with the same part, first of all we save base of the current stack frame in the `%rbp` register. But after this code for both examples is different. In the first example we put `$268435456` (here the `$268435456` is our second parameter - `0x10000000`) to the `esi` and `$25` (our first parameter) to the `edi` register and call `constant_test_bit`. 
We put function parameters to the `esi` and `edi` registers because as we are learning Linux kernel for the `x86_64` architecture we use `System V AMD64 ABI` [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions). All is pretty simple. When we are using predefined constant, the compiler can just substitute its value. Now let's look at the second part. As you can see here, the compiler can not substitute value from the `nr` variable. In this case compiler must calculate its offset on the program's [stack frame](https://en.wikipedia.org/wiki/Call_stack). We subtract `16` from the `rsp` register to allocate stack for the local variables data and put the `$24` (value of the `nr` variable) to the `rbp` with offset `-4`. Our stack frame will be like this:\n\n```\n         <- stack grows\n\n\t          %[rbp]\n                 |\n+----------+ +---------+ +---------+ +--------+\n|          | |         | | return  | |        |\n|    nr    |-|         |-|         |-|  argc  |\n|          | |         | | address | |        |\n+----------+ +---------+ +---------+ +--------+\n                 |\n              %[rsp]\n```\n\nAfter this we put this value to the `eax`, so `eax` register now contains value of the `nr`. 
In the end we do the same as in the first example: we put the `$268435456` (the second parameter of the `variable_test_bit` function) to the `esi` register and the value of the `eax` (value of `nr`, the first parameter of the `variable_test_bit` function) to the `edi` register.\n\nThe next step after the `apic_intr_init` function finishes its work is setting interrupt gates from the `FIRST_EXTERNAL_VECTOR` or `0x20` up to `0x100`:\n\n```C\ni = FIRST_EXTERNAL_VECTOR;\n\n#ifndef CONFIG_X86_LOCAL_APIC\n#define first_system_vector NR_VECTORS\n#endif\n\nfor_each_clear_bit_from(i, used_vectors, first_system_vector) {\n\tset_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR));\n}\n```\n\nBut as we are using the `for_each_clear_bit_from` helper, we set only non-initialized interrupt gates. After this we use the same `for_each_clear_bit_from` helper to fill the non-filled interrupt gates in the interrupt table with the `spurious_interrupt`:\n\n```C\n#ifdef CONFIG_X86_LOCAL_APIC\nfor_each_clear_bit_from(i, used_vectors, NR_VECTORS)\n    set_intr_gate(i, spurious_interrupt);\n#endif\n```\n\nWhere the `spurious_interrupt` function represents interrupt handler for the `spurious` interrupt. Here the `used_vectors` is the `unsigned long` that contains already initialized interrupt gates. We already filled first `32` interrupt vectors in the `trap_init` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file:\n\n```C\nfor (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)\n    set_bit(i, used_vectors);\n```\n\nYou can remember how we did it in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) of this chapter.\n\nIn the end of the `native_init_IRQ` function we can see the following check:\n\n```C\nif (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())\n\tsetup_irq(2, &irq2);\n```\n\nFirst of all let's deal with the condition. 
The `acpi_ioapic` variable represents existence of [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs). It is defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c). This variable is set in the `acpi_set_irq_model_ioapic` function that is called during the processing of the `Multiple APIC Description Table`. This occurs during initialization of the architecture-specific stuff in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) (more about it we will know in the other chapter about [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)). Note that the value of the `acpi_ioapic` variable depends on the `CONFIG_ACPI` and `CONFIG_X86_LOCAL_APIC` Linux kernel configuration options. If these options were not set, this variable will be just zero:\n\n```C\n#define acpi_ioapic 0\n```\n\nThe second condition - `!of_ioapic && nr_legacy_irqs()` checks that we do not use [Open Firmware](https://en.wikipedia.org/wiki/Open_Firmware) `I/O APIC` and that there are legacy interrupts, i.e. a legacy interrupt controller is present. We already know about the `nr_legacy_irqs`. The second is the `of_ioapic` variable defined in the [arch/x86/kernel/devicetree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/devicetree.c) and initialized in the `dtb_ioapic_setup` function that builds information about `APICs` in the [devicetree](https://en.wikipedia.org/wiki/Device_tree). Note that the `of_ioapic` variable depends on the `CONFIG_OF` Linux kernel configuration option. If this option is not set, the value of the `of_ioapic` will be zero too:\n\n```C\n#ifdef CONFIG_OF\nextern int of_ioapic;\n...\n...\n...\n#else\n#define of_ioapic 0\n...\n...\n...\n#endif\n```\n\nIf the condition returns non-zero value we call the:\n\n```C\nsetup_irq(2, &irq2);\n```\n\nfunction. First of all about the `irq2`. 
The `irq2` is the `irqaction` structure that is defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irqinit.c) source code file and represents `IRQ 2` line that is used to query devices connected in cascade:\n\n```C\nstatic struct irqaction irq2 = {\n\t.handler = no_action,\n    .name = \"cascade\",\n    .flags = IRQF_NO_THREAD,\n};\n```\n\nSome time ago interrupt controller consisted of two chips and one was connected to second. The second chip was connected to the first chip via this `IRQ 2` line. This chip serviced lines from `8` to `15` and after this lines of the first chip. So, for example [Intel 8259A](https://en.wikipedia.org/wiki/Intel_8259) has following lines:\n\n* `IRQ 0`  - system timer;\n* `IRQ 1`  - keyboard;\n* `IRQ 2`  - used for devices which are cascade connected;\n* `IRQ 8`  - [RTC](https://en.wikipedia.org/wiki/Real-time_clock);\n* `IRQ 9`  - reserved;\n* `IRQ 10` - reserved;\n* `IRQ 11` - reserved;\n* `IRQ 12` - `ps/2` mouse;\n* `IRQ 13` - coprocessor;\n* `IRQ 14` - hard drive controller;\n* `IRQ 15` - reserved;\n* `IRQ 3`  - `COM2` and `COM4`;\n* `IRQ 4`  - `COM1` and `COM3`;\n* `IRQ 5`  - `LPT2`;\n* `IRQ 6`  - drive controller;\n* `IRQ 7`  - `LPT1`.\n\nThe `setup_irq` function is defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and takes two parameters:\n\n* vector number of an interrupt;\n* `irqaction` structure related with an interrupt.\n\nThis function initializes interrupt descriptor from the given vector number at the beginning:\n\n```C\nstruct irq_desc *desc = irq_to_desc(irq);\n```\n\nAnd calls the `__setup_irq` function that sets up given interrupt:\n\n```C\nchip_bus_lock(desc);\nretval = __setup_irq(irq, desc, act);\nchip_bus_sync_unlock(desc);\nreturn retval;\n```\n\nNote that the interrupt descriptor is locked while the `__setup_irq` function works. 
The `__setup_irq` function does many different things: it creates a handler thread when a thread function is supplied and the interrupt does not nest into another interrupt thread, sets the flags of the chip, fills the `irqaction` structure and many many more.\n\nIn addition to all of the above, it creates the `/proc/irq/vector_number` directory and fills it, but if you are using modern computer all values will be zero there:\n\n```\n$ cat /proc/irq/2/node\n0\n\n$ cat /proc/irq/2/affinity_hint\n00\n\n$ cat /proc/irq/2/spurious\ncount 0\nunhandled 0\nlast_unhandled 0 ms\n```\n\nbecause probably `APIC` handles interrupts on the machine.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the eighth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we continued to dive into external hardware interrupts in this part. In the previous part we started to do it and saw early initialization of the `IRQs`. In this part we already saw non-early interrupts initialization in the `init_IRQ` function. We saw initialization of the `vector_irq` per-cpu array which stores vector numbers of the interrupts and will be used during interrupt handling and initialization of other stuff which is related to the external hardware interrupts.\n\nIn the next part we will continue to learn interrupts handling related stuff and will see initialization of the `softirqs`.\n\nIf you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, And I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259)\n* [Programmable Interrupt Controller](https://en.wikipedia.org/wiki/Programmable_Interrupt_Controller)\n* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture)\n* [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification)\n* [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs)\n* [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs)\n* [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing)\n* [Inter-processor interrupt](https://en.wikipedia.org/wiki/Inter-processor_interrupt)\n* [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation)\n* [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n* [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions)\n* [PDF. System V Application Binary Interface AMD64](http://x86-64.org/documentation/abi.pdf)\n* [Call stack](https://en.wikipedia.org/wiki/Call_stack)\n* [Open Firmware](https://en.wikipedia.org/wiki/Open_Firmware)\n* [devicetree](https://en.wikipedia.org/wiki/Device_tree)\n* [RTC](https://en.wikipedia.org/wiki/Real-time_clock)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-7)\n"
  },
  {
    "path": "Interrupts/linux-interrupts-9.md",
    "content": "Interrupts and Interrupt Handling. Part 9.\n================================================================================\n\nIntroduction to deferred interrupts (Softirq, Tasklets and Workqueues)\n--------------------------------------------------------------------------------\n\nIt is the nine part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) we saw implementation of the `init_IRQ` from that defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irqinit.c) source code file. So, we will continue to dive into the initialization stuff which is related to the external hardware interrupts in this part.\n\nInterrupts may have different important characteristics and there are two among them:\n\n* Handler of an interrupt must execute quickly;\n* Sometime an interrupt handler must do a large amount of work.\n\nAs you can understand, it is almost impossible to make so that both characteristics were valid. Because of these, previously the handling of interrupts was split into two parts:\n\n* Top half;\n* Bottom half;\n\nIn the past there was one way to defer interrupt handling in Linux kernel. And it was called: `the bottom half` of the processor, but now it is already not actual. Now this term has remained as a common noun referring to all the different ways of organizing deferred processing of an interrupt.The deferred processing of an interrupt suggests that some of the actions for an interrupt may be postponed to a later execution when the system will be less loaded. As you can suggest, an interrupt handler can do large amount of work that is impermissible as it executes in the context where interrupts are disabled. 
That's why processing of an interrupt can be split in two different parts. In the first part, the main handler of an interrupt does only minimal and the most important job. After this it schedules the second part and finishes its work. When the system is less busy and context of the processor allows to handle interrupts, the second part starts its work and finishes to process remaining part of a deferred interrupt.\n\nThere are three types of `deferred interrupts` in the Linux kernel:\n\n* `softirqs`;\n* `tasklets`;\n* `workqueues`;\n\nAnd we will see a description of all of these types in this part. As I said, we saw only a little bit about this theme, so, now is time to dive deep into details about this theme.\n\nSoftirqs\n----------------------------------------------------------------------------------\n\nWith the advent of parallelisms in the Linux kernel, all new schemes of implementation of the bottom half handlers are built on the performance of the processor specific kernel thread that called `ksoftirqd` (will be discussed below). Each processor has its own thread that is called `ksoftirqd/n` where the `n` is the number of the processor. We can see it in the output of the `systemd-cgls` util:\n\n```\n$ systemd-cgls -k | grep ksoft\n├─   3 [ksoftirqd/0]\n├─  13 [ksoftirqd/1]\n├─  18 [ksoftirqd/2]\n├─  23 [ksoftirqd/3]\n├─  28 [ksoftirqd/4]\n├─  33 [ksoftirqd/5]\n├─  38 [ksoftirqd/6]\n├─  43 [ksoftirqd/7]\n```\n\nThe `spawn_ksoftirqd` function starts these threads. As we can see this function called as early [initcall](https://kernelnewbies.org/Documents/InitcallMechanism):\n\n```C\nearly_initcall(spawn_ksoftirqd);\n```\n\nSoftirqs are determined statically at compile-time of the Linux kernel and the `open_softirq` function takes care of `softirq` initialization. 
The `open_softirq` function is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c):\n\n\n```C\nvoid open_softirq(int nr, void (*action)(struct softirq_action *))\n{\n\tsoftirq_vec[nr].action = action;\n}\n```\n\nand as we can see this function uses two parameters:\n\n* the index of the `softirq_vec` array;\n* a pointer to the softirq function to be executed;\n\nFirst of all let's look at the `softirq_vec` array:\n\n```C\nstatic struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;\n```\n\nIt is defined in the same source code file. As we can see, the `softirq_vec` array contains `NR_SOFTIRQS` (that is, `10`) elements of the `softirq_action` type, one for each kind of `softirq`. First of all, let's talk about its elements. In the current version of the Linux kernel there are ten softirq vectors defined: two for tasklet processing, two for networking, two for the block layer, two for timers, and one each for the scheduler and read-copy-update processing. 
All of these kinds are represented by the following enum:\n\n```C\nenum\n{\n        HI_SOFTIRQ=0,\n        TIMER_SOFTIRQ,\n        NET_TX_SOFTIRQ,\n        NET_RX_SOFTIRQ,\n        BLOCK_SOFTIRQ,\n        BLOCK_IOPOLL_SOFTIRQ,\n        TASKLET_SOFTIRQ,\n        SCHED_SOFTIRQ,\n        HRTIMER_SOFTIRQ,\n        RCU_SOFTIRQ,\n        NR_SOFTIRQS\n};\n```\n\nAll names of these kinds of softirqs are represented by the following array:\n\n```C\nconst char * const softirq_to_name[NR_SOFTIRQS] = {\n        \"HI\", \"TIMER\", \"NET_TX\", \"NET_RX\", \"BLOCK\", \"BLOCK_IOPOLL\",\n        \"TASKLET\", \"SCHED\", \"HRTIMER\", \"RCU\"\n};\n```\n\nOr we can see it in the output of the `/proc/softirqs`:\n\n```\n~$ cat /proc/softirqs\n                    CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7\n          HI:          5          0          0          0          0          0          0          0\n       TIMER:     332519     310498     289555     272913     282535     279467     282895     270979\n      NET_TX:       2320          0          0          2          1          1          0          0\n      NET_RX:     270221        225        338        281        311        262        430        265\n       BLOCK:     134282         32         40         10         12          7          8          8\nBLOCK_IOPOLL:          0          0          0          0          0          0          0          0\n     TASKLET:     196835          2          3          0          0          0          0          0\n       SCHED:     161852     146745     129539     126064     127998     128014     120243     117391\n     HRTIMER:          0          0          0          0          0          0          0          0\n         RCU:     337707     289397     251874     239796     254377     254898     267497     256624\n```\n\nAs we can see the `softirq_vec` array has `softirq_action` types. 
This is the main data structure related to the `softirq` mechanism, so all `softirqs` are represented by the `softirq_action` structure. The `softirq_action` structure consists of a single field only: an action pointer to the softirq function:\n\n```C\nstruct softirq_action\n{\n         void    (*action)(struct softirq_action *);\n};\n```\n\nSo, after this we can understand that the `open_softirq` function fills the `softirq_vec` array with the given `softirq_action`. For a deferred interrupt registered with the `open_softirq` function to be queued for execution, it should be activated by the call of the `raise_softirq` function. This function takes only one parameter -- a softirq index `nr`. Let's look at its implementation:\n\n```C\nvoid raise_softirq(unsigned int nr)\n{\n        unsigned long flags;\n\n        local_irq_save(flags);\n        raise_softirq_irqoff(nr);\n        local_irq_restore(flags);\n}\n```\n\nHere we can see the call of the `raise_softirq_irqoff` function between the `local_irq_save` and the `local_irq_restore` macros. The `local_irq_save` macro is defined in the [include/linux/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqflags.h) header file; it saves the state of the [IF](https://en.wikipedia.org/wiki/Interrupt_flag) flag of the [eflags](https://en.wikipedia.org/wiki/FLAGS_register) register and disables interrupts on the local processor. The `local_irq_restore` macro is defined in the same header file and does the opposite thing: it restores the saved state of the `interrupt flag`, which enables interrupts again if they were enabled before. We disable interrupts here because a `softirq` interrupt runs in the interrupt context and that one softirq (and no others) will be run.\n\nThe `raise_softirq_irqoff` function marks the softirq as deferred by setting the bit corresponding to the given index `nr` in the `softirq` bit mask (`__softirq_pending`) of the local processor. 
It does it with the help of the:\n\n```C\n__raise_softirq_irqoff(nr);\n```\n\nmacro. After this, it checks the result of the `in_interrupt` that returns `irq_count` value. We already saw the `irq_count` in the first [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) of this chapter and it is used to check if a CPU is already on an interrupt stack or not. We just exit from the `raise_softirq_irqoff`, restore `IF` flag and enable interrupts on the local processor, if we are in the interrupt context, otherwise  we call the `wakeup_softirqd`:\n\n```C\nif (!in_interrupt())\n\twakeup_softirqd();\n```\n\nWhere the `wakeup_softirqd` function activates the `ksoftirqd` kernel thread of the local processor:\n\n```C\nstatic void wakeup_softirqd(void)\n{\n\tstruct task_struct *tsk = __this_cpu_read(ksoftirqd);\n\n    if (tsk && tsk->state != TASK_RUNNING)\n        wake_up_process(tsk);\n}\n```\n\nEach `ksoftirqd` kernel thread runs the `run_ksoftirqd` function that checks existence of deferred interrupts and calls the `__do_softirq` function depending on the result of the check. This function reads the `__softirq_pending` softirq bit mask of the local processor and executes the deferrable functions corresponding to every bit set. During execution of a deferred function, new pending `softirqs` might occur. The main problem here that execution of the userspace code can be delayed for a long time while the `__do_softirq` function will handle deferred interrupts. For this purpose, it has the limit of the time when it must be finished:\n\n```C\nunsigned long end = jiffies + MAX_SOFTIRQ_TIME;\n...\n...\n...\nrestart:\nwhile ((softirq_bit = ffs(pending))) {\n\t...\n\th->action(h);\n\t...\n}\n...\n...\n...\npending = local_softirq_pending();\nif (pending) {\n\tif (time_before(jiffies, end) && !need_resched() &&\n\t\t--max_restart)\n            goto restart;\n}\n...\n```\n\nChecks of the existence of the deferred interrupts are performed periodically. 
There are several points where these checks occur. The main point is the call of the `do_IRQ` function defined in [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c), which provides the main means for actual interrupt processing in the Linux kernel. When `do_IRQ` finishes handling an interrupt, it calls the `exiting_irq` function from the [arch/x86/include/asm/apic.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/apic.h) that expands to the call of the `irq_exit` function. `irq_exit` checks for deferred interrupts and the current context and calls the `invoke_softirq` function:\n\n```C\nif (!in_interrupt() && local_softirq_pending())\n    invoke_softirq();\n```\n\nthat also executes `__do_softirq`. To summarize, each `softirq` goes through the following stages:\n\n* Registration of a `softirq` with the `open_softirq` function.\n* Activation of a `softirq` by marking it as deferred with the `raise_softirq` function.\n* Triggering of all marked `softirqs` the next time the Linux kernel schedules a round of executions of deferrable functions.\n* Execution of the deferred functions that have the same type.\n\nAs I already wrote, the `softirqs` are statically allocated and it is a problem for a kernel module that can be loaded. The second concept, which is built on top of `softirq` -- the `tasklets` -- solves this problem.\n\nTasklets\n--------------------------------------------------------------------------------\n\nIf you read the source code of the Linux kernel that is related to the `softirq`, you will notice that it is used very rarely. The preferable way to implement deferrable functions is `tasklets`. 
As I already wrote above the `tasklets` are built on top of the `softirq` concept and generally on top of two `softirqs`:\n\n* `TASKLET_SOFTIRQ`;\n* `HI_SOFTIRQ`.\n\nIn short words, `tasklets` are `softirqs` that can be allocated and initialized at runtime and unlike `softirqs`, tasklets that have the same type cannot be run on multiple processors at a time. Ok, now we know a little bit about the `softirqs`, of course previous text does not cover all aspects about this, but now we can directly look on the code and to know more about the `softirqs` step by step on practice and to know about `tasklets`. Let's return back to the implementation of the `softirq_init` function that we talked about in the beginning of this part. This function is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file, let's look on its implementation:\n\n```C\nvoid __init softirq_init(void)\n{\n        int cpu;\n\n        for_each_possible_cpu(cpu) {\n                per_cpu(tasklet_vec, cpu).tail =\n                        &per_cpu(tasklet_vec, cpu).head;\n                per_cpu(tasklet_hi_vec, cpu).tail =\n                        &per_cpu(tasklet_hi_vec, cpu).head;\n        }\n\n        open_softirq(TASKLET_SOFTIRQ, tasklet_action);\n        open_softirq(HI_SOFTIRQ, tasklet_hi_action);\n}\n```\n\nWe can see definition of the integer `cpu` variable at the beginning of the `softirq_init` function. Next we will use it as parameter for the `for_each_possible_cpu` macro that goes through the all possible processors in the system. If the `possible processor` is the new terminology for you, you can read more about it the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) chapter. In short words, `possible cpus` is the set of processors that can be plugged in anytime during the life of that system boot. 
All `possible processors` stored in the `cpu_possible_bits` bitmap, you can find its definition in the [kernel/cpu.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/cpu.c):\n\n```C\nstatic DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;\n...\n...\n...\nconst struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);\n```\n\nOk, we defined the integer `cpu` variable and go through the all possible processors with the `for_each_possible_cpu` macro and makes initialization of the two following [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variables:\n\n* `tasklet_vec`;\n* `tasklet_hi_vec`;\n\nThese two `per-cpu` variables defined in the same source [code](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) file as the `softirq_init` function and represent two `tasklet_head` structures:\n\n```C\nstatic DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);\nstatic DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);\n```\n\nWhere `tasklet_head` structure represents a list of `Tasklets` and contains two fields, head and tail:\n\n```C\nstruct tasklet_head {\n        struct tasklet_struct *head;\n        struct tasklet_struct **tail;\n};\n```\n\nThe `tasklet_struct` structure is defined in the [include/linux/interrupt.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/interrupt.h) and represents the `Tasklet`. Previously we did not see this word in this book. Let's try to understand what the `tasklet` is. Actually, the tasklet is one of mechanisms to handle deferred interrupt. 
Let's look at the implementation of the `tasklet_struct` structure:\n\n```C\nstruct tasklet_struct\n{\n        struct tasklet_struct *next;\n        unsigned long state;\n        atomic_t count;\n        void (*func)(unsigned long);\n        unsigned long data;\n};\n```\n\nAs we can see this structure contains five fields, they are:\n\n* `next` - next tasklet in the scheduling queue;\n* `state` - state of the tasklet;\n* `count` - represents the current state of the tasklet, active or not;\n* `func` - main callback of the tasklet;\n* `data` - parameter of the callback.\n\nIn our case, we initialize only two per-CPU tasklet vectors: `tasklet_vec` for normal-priority tasklets and `tasklet_hi_vec` for high-priority tasklets. These vectors are implemented as linked lists, with each CPU maintaining its own instance.\nAfter setting up the tasklet vectors, we register two softirq handlers using the `open_softirq` function that is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file:\n\n```C\nopen_softirq(TASKLET_SOFTIRQ, tasklet_action);\nopen_softirq(HI_SOFTIRQ, tasklet_hi_action);\n```\n\nat the end of the `softirq_init` function. The main purpose of the `open_softirq` function is the initialization of `softirq`. We already saw the implementation of the `open_softirq` function earlier in this part: it stores the given function in the `softirq_vec` array at the given index.\n\nIn our case these functions are `tasklet_action` and `tasklet_hi_action`: the `softirq` function associated with the `HI_SOFTIRQ` softirq is named `tasklet_hi_action` and the `softirq` function associated with the `TASKLET_SOFTIRQ` is named `tasklet_action`. The Linux kernel provides an API for the manipulation of `tasklets`. 
First of all it is the `tasklet_init` function that takes `tasklet_struct`, function and parameter for it and initializes the given `tasklet_struct` with the given data:\n\n```C\nvoid tasklet_init(struct tasklet_struct *t,\n                  void (*func)(unsigned long), unsigned long data)\n{\n    t->next = NULL;\n    t->state = 0;\n    atomic_set(&t->count, 0);\n    t->func = func;\n    t->data = data;\n}\n```\n\nThere are additional methods to initialize a tasklet statically with the two following macros:\n\n```C\nDECLARE_TASKLET(name, func, data);\nDECLARE_TASKLET_DISABLED(name, func, data);\n```\n\nThe Linux kernel provides three following functions to mark a tasklet as ready to run:\n\n```C\nvoid tasklet_schedule(struct tasklet_struct *t);\nvoid tasklet_hi_schedule(struct tasklet_struct *t);\nvoid tasklet_hi_schedule_first(struct tasklet_struct *t);\n```\n\nThe first function schedules a tasklet with the normal priority, the second with the high priority and the third out of turn. Implementation of the all of these three functions is similar, so we will consider only the first -- `tasklet_schedule`. Let's look on its implementation:\n\n```C\nstatic inline void tasklet_schedule(struct tasklet_struct *t)\n{\n    if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))\n        __tasklet_schedule(t);\n}\n\nvoid __tasklet_schedule(struct tasklet_struct *t)\n{\n        unsigned long flags;\n\n        local_irq_save(flags);\n        t->next = NULL;\n        *__this_cpu_read(tasklet_vec.tail) = t;\n        __this_cpu_write(tasklet_vec.tail, &(t->next));\n        raise_softirq_irqoff(TASKLET_SOFTIRQ);\n        local_irq_restore(flags);\n}\n```\n\nAs we can see it checks and sets the state of the given tasklet to the `TASKLET_STATE_SCHED` and executes the `__tasklet_schedule` with the given tasklet. The `__tasklet_schedule` looks very similar to the `raise_softirq` function that we saw above. It saves the `interrupt flag` and disables interrupts at the beginning. 
After this, it updates `tasklet_vec` with the new tasklet and calls the `raise_softirq_irqoff` function that we saw above. When the Linux kernel scheduler will decide to run deferred functions, the `tasklet_action` function will be called for deferred functions which are associated with the `TASKLET_SOFTIRQ` and `tasklet_hi_action` for deferred functions which are associated with the `HI_SOFTIRQ`. These functions are very similar and there is only one difference between them -- `tasklet_action` uses `tasklet_vec` and `tasklet_hi_action` uses `tasklet_hi_vec`.\n\nLet's look on the implementation of the `tasklet_action` function:\n\n```C\nstatic void tasklet_action(struct softirq_action *a)\n{\n    local_irq_disable();\n    list = __this_cpu_read(tasklet_vec.head);\n    __this_cpu_write(tasklet_vec.head, NULL);\n    __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));\n    local_irq_enable();\n\n    while (list) {\n\t\tif (tasklet_trylock(t)) {\n\t        t->func(t->data);\n            tasklet_unlock(t);\n\t    }\n\t\t...\n\t\t...\n\t\t...\n    }\n}\n```\n\nIn the beginning of the `tasklet_action` function, we disable interrupts for the local processor with the help of the `local_irq_disable` macro (you can read about this macro in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter). In the next step, we take a head of the list that contains tasklets with normal priority and set this per-cpu list to `NULL` because all tasklets must be executed in a general way. After this we enable interrupts for the local processor and go through the list of tasklets in the loop. 
In every iteration of the loop we call the `tasklet_trylock` function for the given tasklet, which sets the `TASKLET_STATE_RUN` bit in the state of the given tasklet:\n\n```C\nstatic inline int tasklet_trylock(struct tasklet_struct *t)\n{\n    return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);\n}\n```\n\nIf this operation was successful we execute tasklet's action (it was set in the `tasklet_init`) and call the `tasklet_unlock` function that clears tasklet's `TASKLET_STATE_RUN` state.\n\nIn general, that's all about the `tasklets` concept. Of course this does not cover `tasklets` in full, but I think that it is a good point from where you can continue to learn this concept.\n\nThe `tasklets` are a [widely](http://lxr.free-electrons.com/ident?i=tasklet_init) used concept in the Linux kernel, but as I wrote in the beginning of this part there is a third mechanism for deferred functions -- `workqueue`. In the next paragraph we will see what it is.\n\nWorkqueues\n--------------------------------------------------------------------------------\n\nThe `workqueue` is another concept for handling deferred functions. It is similar to `tasklets` with some differences. Workqueue functions run in the context of a kernel process, but `tasklet` functions run in the software interrupt context. This means that `workqueue` functions do not have to be atomic, unlike `tasklet` functions. Tasklets always run on the processor from which they were originally submitted. Workqueues work in the same way, but only by default. 
The `workqueue` concept is represented by the:\n\n```C\nstruct worker_pool {\n    spinlock_t              lock;\n    int                     cpu;\n    int                     node;\n    int                     id;\n    unsigned int            flags;\n\n    struct list_head        worklist;\n    int                     nr_workers;\n...\n...\n...\n```\n\nstructure that is defined in the [kernel/workqueue.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/workqueue.c) source code file in the Linux kernel. I will not write the full source code of this structure here, because it has quite a lot of fields, but we will consider some of those fields.\n\nIn its most basic form, the work queue subsystem is an interface for creating kernel threads to handle work that is queued from elsewhere. All of these kernel threads are called `worker threads`. The work items of a work queue are represented by the `work_struct` structure that is defined in the [include/linux/workqueue.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/workqueue.h). Let's look at this structure:\n\n```C\nstruct work_struct {\n    atomic_long_t data;\n    struct list_head entry;\n    work_func_t func;\n#ifdef CONFIG_LOCKDEP\n    struct lockdep_map lockdep_map;\n#endif\n};\n```\n\nHere are two things that we are interested in: `func` -- the function that will be scheduled by the `workqueue` and `data` -- the parameter of this function. The Linux kernel provides special per-cpu threads that are called `kworker`:\n\n```\nsystemd-cgls -k | grep kworker\n├─    5 [kworker/0:0H]\n├─   15 [kworker/1:0H]\n├─   20 [kworker/2:0H]\n├─   25 [kworker/3:0H]\n├─   30 [kworker/4:0H]\n...\n...\n...\n```\n\nThese threads can be used to schedule the deferred functions of the workqueues (as `ksoftirqd` for `softirqs`). Besides this we can create a new separate worker thread for a `workqueue`. 
The Linux kernel provides following macros for the creation of workqueue:\n\n```C\n#define DECLARE_WORK(n, f) \\\n    struct work_struct n = __WORK_INITIALIZER(n, f)\n```\n\nfor static creation. It takes two parameters: name of the workqueue and the workqueue function. For creation of workqueue in runtime, we can use the:\n\n```C\n#define INIT_WORK(_work, _func)       \\\n    __INIT_WORK((_work), (_func), 0)\n\n#define __INIT_WORK(_work, _func, _onstack)                     \\\n    do {                                                        \\\n            __init_work((_work), _onstack);                     \\\n            (_work)->data = (atomic_long_t) WORK_DATA_INIT();   \\\n            INIT_LIST_HEAD(&(_work)->entry);                    \\\n             (_work)->func = (_func);                           \\\n    } while (0)\n```\n\nmacro that takes `work_struct` structure that has to be created and the function to be scheduled in this workqueue. After a `work` was created with the one of these macros, we need to put it to the `workqueue`. We can do it with the help of the `queue_work` or the `queue_delayed_work` functions:\n\n```C\nstatic inline bool queue_work(struct workqueue_struct *wq,\n                              struct work_struct *work)\n{\n    return queue_work_on(WORK_CPU_UNBOUND, wq, work);\n}\n```\n\nThe `queue_work` function just calls the `queue_work_on` function that queues work on specific processor. Note that in our case we pass the `WORK_CPU_UNBOUND` to the `queue_work_on` function. It is a part of the `enum` that is defined in the [include/linux/workqueue.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/workqueue.h) and represents workqueue which are not bound to any specific processor. 
The `queue_work_on` function tests and sets the `WORK_STRUCT_PENDING_BIT` bit of the given `work` and executes the `__queue_work` function with the `workqueue` for the given processor and given `work`:\n\n```C\nbool queue_work_on(int cpu, struct workqueue_struct *wq,\n           struct work_struct *work)\n{\n    bool ret = false;\n    ...\n    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {\n        __queue_work(cpu, wq, work);\n        ret = true;\n    }\n    ...\n    return ret;\n}\n```\n\nThe `__queue_work` function gets the `work pool`. Yes, the `work pool` not `workqueue`. Actually, all `works` are not placed in the `workqueue`, but in the `work pool` that is represented by the `worker_pool` structure in the Linux kernel. The `workqueue_struct` structure has the `pwqs` field which is a list of `pool_workqueue` structures. When we create a `workqueue`, a `pool_workqueue` is allocated for each processor. Each `pool_workqueue` is associated with a `worker_pool`, which is allocated on the same processor and corresponds to the type of priority queue. Through them `workqueue` interacts with `worker_pool`. 
So in the `__queue_work` function we set the cpu to the current processor with the `raw_smp_processor_id` (you can find information about this macro in the fourth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) of the Linux kernel initialization process chapter), getting the `pool_workqueue` for the given `workqueue_struct` and insert the given `work` to the given `workqueue`:\n\n```C\nstatic void __queue_work(int cpu, struct workqueue_struct *wq,\n                         struct work_struct *work)\n{\n...\n...\n...\nif (req_cpu == WORK_CPU_UNBOUND)\n    cpu = raw_smp_processor_id();\n\nif (!(wq->flags & WQ_UNBOUND))\n    pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);\nelse\n    pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));\n...\n...\n...\ninsert_work(pwq, work, worklist, work_flags);\n```\n\nAs we can create `works` and `workqueue`, we need to know when they are executed. As I already wrote, all `works` are executed by the kernel thread. When this kernel thread is scheduled, it starts to execute `works` from the given `workqueue`. Each worker thread executes a loop inside the `worker_thread` function. This thread makes many different things and part of these things are similar to what we saw before in this part. As it starts executing, it removes all `work_struct` or `works` from its `workqueue`.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt is the end of the ninth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we continued to dive into external hardware interrupts in this part. In the previous part we saw initialization of the `IRQs` and main `irq_desc` structure. 
In this part we saw three concepts: the `softirq`, `tasklet` and `workqueue` that are used for the deferred functions.\n\nThe next part will be the last part of the `Interrupts and Interrupt Handling` chapter and we will look at a real hardware driver and will try to learn how it works with the interrupts subsystem.\n\nIf you have any questions or suggestions, write me a comment or ping me at [twitter](https://twitter.com/0xAX).\n\n**Please note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [initcall](https://kernelnewbies.org/Documents/InitcallMechanism)\n* [IF](https://en.wikipedia.org/wiki/Interrupt_flag)\n* [eflags](https://en.wikipedia.org/wiki/FLAGS_register)\n* [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [Workqueue](https://github.com/torvalds/linux/blob/6f0d349d922ba44e4348a17a78ea51b7135965b1/Documentation/core-api/workqueue.rst)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8)\n"
  },
  {
    "path": "KernelStructures/.gitkeep",
    "content": ""
  },
  {
    "path": "KernelStructures/README.md",
    "content": "# Internal `system` structures of the Linux kernel\n\nThis is not usual chapter of `linux-insides`. As you may understand from the title, it mostly describes\ninternal `system` structures of the Linux kernel. Like `Interrupt Descriptor Table`, `Global Descriptor\nTable` and many many more.\n\nMost of information is taken from official [Intel](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html) and [AMD](http://developer.amd.com/resources/developer-guides-manuals/) manuals.\n"
  },
  {
    "path": "KernelStructures/linux-kernelstructure-1.md",
    "content": "interrupt-descriptor table (IDT)\n================================================================================\n\nThree general interrupt & exceptions sources:\n\n* Exceptions - sync;\n* Software interrupts - sync;\n* External interrupts - async.\n\nTypes of Exceptions:\n\n* Faults - are precise exceptions reported on the boundary `before` the instruction causing the exception. The saved `%rip` points to the faulting instruction;\n* Traps - are precise exceptions reported on the boundary `following` the instruction causing the exception. The same with `%rip`;\n* Aborts - are imprecise exceptions. Because they are imprecise, aborts typically do not allow reliable program restart.\n\n`Maskable` interrupts trigger the interrupt-handling mechanism only when `RFLAGS.IF=1`. Otherwise they are held pending for as long as the `RFLAGS.IF` bit is cleared to 0.\n\n`Nonmaskable` interrupts (NMI) are unaffected by the value of the 'RFLAGS.IF' bit. However, the occurrence of an NMI masks further NMIs until an IRET instruction is executed.\n\nSpecific exception and interrupt sources are assigned a fixed vector-identification number (also called an “interrupt vector” or simply “vector”). The interrupt vector is used by the interrupt-handling mechanism to locate the system-software service routine assigned to the exception or interrupt. Up to\n256 unique interrupt vectors are available. The first 32 vectors are reserved for predefined exception and interrupt conditions. 
They are defined in the [arch/x86/include/asm/traps.h](http://lxr.free-electrons.com/source/arch/x86/include/asm/traps.h#L121) header file:\n\n```\n/* Interrupts/Exceptions */\nenum {\n\tX86_TRAP_DE = 0,\t/*  0, Divide-by-zero */\n\tX86_TRAP_DB,\t\t/*  1, Debug */\n\tX86_TRAP_NMI,\t\t/*  2, Non-maskable Interrupt */\n\tX86_TRAP_BP,\t\t/*  3, Breakpoint */\n\tX86_TRAP_OF,\t\t/*  4, Overflow */\n\tX86_TRAP_BR,\t\t/*  5, Bound Range Exceeded */\n\tX86_TRAP_UD,\t\t/*  6, Invalid Opcode */\n\tX86_TRAP_NM,\t\t/*  7, Device Not Available */\n\tX86_TRAP_DF,\t\t/*  8, Double Fault */\n\tX86_TRAP_OLD_MF,\t/*  9, Coprocessor Segment Overrun */\n\tX86_TRAP_TS,\t\t/* 10, Invalid TSS */\n\tX86_TRAP_NP,\t\t/* 11, Segment Not Present */\n\tX86_TRAP_SS,\t\t/* 12, Stack Segment Fault */\n\tX86_TRAP_GP,\t\t/* 13, General Protection Fault */\n\tX86_TRAP_PF,\t\t/* 14, Page Fault */\n\tX86_TRAP_SPURIOUS,\t/* 15, Spurious Interrupt */\n\tX86_TRAP_MF,\t\t/* 16, x87 Floating-Point Exception */\n\tX86_TRAP_AC,\t\t/* 17, Alignment Check */\n\tX86_TRAP_MC,\t\t/* 18, Machine Check */\n\tX86_TRAP_XF,\t\t/* 19, SIMD Floating-Point Exception */\n\tX86_TRAP_IRET = 32,\t/* 32, IRET Exception */\n};\n```\n\nError Codes\n--------------------------------------------------------------------------------\n\nThe processor exception-handling mechanism reports error and status information for some exceptions using an error code. The error code is pushed onto the stack by the exception-mechanism during the control transfer into the exception handler. 
The error code has two formats:\n\n* most error-reporting exceptions format;\n* page fault format.\n\nHere is format of selector error code:\n\n```\n31                           16 15                                  3   2   1   0\n+-------------------------------------------------------------------------------+\n|                              |                                    | T | I | E |\n|           Reserved           |             Selector Index         | - | D | X |\n|                              |                                    | I | T | T |\n+-------------------------------------------------------------------------------+\n```\n\nWhere:\n\n* `EXT` - If this bit is set to 1, the exception source is external to the processor. If cleared to 0, the exception source is internal to the processor;\n* `IDT` - If this bit is set to 1, the error-code selector-index field references a gate descriptor located in the `interrupt-descriptor table`. If cleared to 0, the selector-index field references a descriptor in either the `global-descriptor table` or local-descriptor table `LDT`, as indicated by the `TI` bit;\n* `TI` - If this bit is set to 1, the error-code selector-index field references a descriptor in the `LDT`. 
If cleared to 0, the selector-index field references a descriptor in the `GDT`.\n* `Selector Index` - The selector-index field specifies the index into either the `GDT`, `LDT`, or `IDT`, as specified by the `IDT` and `TI` bits.\n\nPage-Fault Error Code format is:\n\n```\n31                                                              4   3   2   1   0\n+-------------------------------------------------------------------------------+\n|                                                         |     | R | U | R | - |\n|                       Reserved                          | I/D | S | - | - | P |\n|                                                         |     | V | S | W | - |\n+-------------------------------------------------------------------------------+\n```\n\nWhere:\n\n* `I/D` - If this bit is set to 1, it indicates that the access that caused the page fault was an instruction fetch;\n* `RSV` - If this bit is set to 1, the page fault is a result of the processor reading a 1 from a reserved field within a page-translation-table entry;\n* `U/S` - If this bit is cleared to 0, an access in supervisor mode (`CPL=0, 1, or 2`) caused the page fault. If this bit is set to 1, an access in user mode (CPL=3) caused the page fault;\n* `R/W` - If this bit is cleared to 0, the access that caused the page fault is a memory read. If this bit is set to 1, the memory access that caused the page fault was a write;\n* `P` - If this bit is cleared to 0, the page fault was caused by a not-present page. 
If this bit is set to 1, the page fault was caused by a page-protection violation.\n\nInterrupt Control Transfers\n--------------------------------------------------------------------------------\n\nThe IDT may contain any of three kinds of gate descriptors:\n\n* `Task Gate` - contains the segment selector for a TSS for an exception and/or interrupt handler task;\n* `Interrupt Gate` - contains segment selector and offset that the processor uses to transfer program execution to a handler procedure in an interrupt handler code segment;\n* `Trap Gate` - contains segment selector and offset that the processor uses to transfer program execution to a handler procedure in an exception handler code segment.\n\nGeneral format of gates is:\n\n```\n127                                                                             96\n+-------------------------------------------------------------------------------+\n|                                                                               |\n|                                Reserved                                       |\n|                                                                               |\n+--------------------------------------------------------------------------------\n95                                                                              64\n+-------------------------------------------------------------------------------+\n|                                                                               |\n|                               Offset 63..32                                   |\n|                                                                               |\n+-------------------------------------------------------------------------------+\n63                               48 47      46  44   42    39             34    32\n+-------------------------------------------------------------------------------+\n|                                  |       |  D  |   |     |      |   |   |     |\n|   
    Offset 31..16              |   P   |  P  | 0 |Type |0 0 0 | 0 | 0 | IST |\n|                                  |       |  L  |   |     |      |   |   |     |\n -------------------------------------------------------------------------------+\n31                                   16 15                                      0\n+-------------------------------------------------------------------------------+\n|                                      |                                        |\n|          Segment Selector            |                 Offset 15..0           |\n|                                      |                                        |\n+-------------------------------------------------------------------------------+\n```\n\nWhere\n\n* `Selector` - Segment Selector for destination code segment;\n* `Offset` - Offset to handler procedure entry point;\n* `DPL` - Descriptor Privilege Level;\n* `P` - Segment Present flag;\n* `IST` - Interrupt Stack Table;\n* `TYPE` - one of: Local descriptor-table (LDT) segment descriptor, Task-state segment (TSS) descriptor, Call-gate descriptor, Interrupt-gate descriptor, Trap-gate descriptor or Task-gate descriptor.\n\nAn `IDT` descriptor is represented by the following structure in the Linux kernel (only for `x86_64`):\n\n```C\nstruct gate_struct64 {\n\tu16 offset_low;\n\tu16 segment;\n\tunsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;\n\tu16 offset_middle;\n\tu32 offset_high;\n\tu32 zero1;\n} __attribute__((packed));\n```\n\nwhich is defined in the [arch/x86/include/asm/desc_defs.h](http://lxr.free-electrons.com/source/arch/x86/include/asm/desc_defs.h#L51) header file.\n\nA task gate descriptor does not contain `IST` field and its format differs from interrupt/trap gates:\n\n```C\nstruct ldttss_desc64 {\n\tu16 limit0;\n\tu16 base0;\n\tunsigned base1 : 8, type : 5, dpl : 2, p : 1;\n\tunsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;\n\tu32 base3;\n\tu32 zero1;\n} __attribute__((packed));\n```\n\nExceptions 
During a Task Switch\n--------------------------------------------------------------------------------\n\nAn exception can occur during a task switch while loading a segment selector. Page faults can also occur when accessing a TSS. In these cases, the hardware task-switch mechanism completes loading the new task state from the TSS, and then triggers the appropriate exception mechanism.\n\n**In long mode, an exception cannot occur during a task switch, because the hardware task-switch mechanism is disabled.**\n\nNonmaskable interrupt\n--------------------------------------------------------------------------------\n\n**TODO**\n\nAPI\n--------------------------------------------------------------------------------\n\n**TODO**\n\nInterrupt Stack Table\n--------------------------------------------------------------------------------\n\n**TODO**\n"
  },
  {
    "path": "LICENSE",
    "content": "Attribution-NonCommercial-ShareAlike 4.0 International\n\n=======================================================================\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright\nand certain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n     Considerations for licensors: Our public licenses are\n     intended for use by those authorized to give the public\n     permission to use material in ways otherwise restricted by\n     copyright and certain other rights. Our licenses are\n     irrevocable. Licensors should read and understand the terms\n     and conditions of the license they choose before applying it.\n     Licensors should also secure all rights necessary before\n     applying our licenses so that the public can reuse the\n     material as expected. Licensors should clearly mark any\n     material not subject to the license. This includes other CC-\n     licensed material, or material used under an exception or\n     limitation to copyright. 
More considerations for licensors:\n\twiki.creativecommons.org/Considerations_for_licensors\n\n     Considerations for the public: By using one of our public\n     licenses, a licensor grants the public permission to use the\n     licensed material under specified terms and conditions. If\n     the licensor's permission is not necessary for any reason--for\n     example, because of any applicable exception or limitation to\n     copyright--then that use is not regulated by the license. Our\n     licenses grant only permissions under copyright and certain\n     other rights that a licensor has authority to grant. Use of\n     the licensed material may still be restricted for other\n     reasons, including because others have copyright or other\n     rights in the material. A licensor may make special requests,\n     such as asking that all changes be marked or described.\n     Although not required by our licenses, you are encouraged to\n     respect those requests where reasonable. More_considerations\n     for the public: \n\twiki.creativecommons.org/Considerations_for_licensees\n\n=======================================================================\n\nCreative Commons Attribution-NonCommercial-ShareAlike 4.0 International\nPublic License\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial-ShareAlike 4.0 International Public License\n(\"Public License\"). To the extent this Public License may be\ninterpreted as a contract, You are granted the Licensed Rights in\nconsideration of Your acceptance of these terms and conditions, and the\nLicensor grants You such rights in consideration of benefits the\nLicensor receives from making the Licensed Material available under\nthese terms and conditions.\n\n\nSection 1 -- Definitions.\n\n  a. 
Adapted Material means material subject to Copyright and Similar\n     Rights that is derived from or based upon the Licensed Material\n     and in which the Licensed Material is translated, altered,\n     arranged, transformed, or otherwise modified in a manner requiring\n     permission under the Copyright and Similar Rights held by the\n     Licensor. For purposes of this Public License, where the Licensed\n     Material is a musical work, performance, or sound recording,\n     Adapted Material is always produced where the Licensed Material is\n     synched in timed relation with a moving image.\n\n  b. Adapter's License means the license You apply to Your Copyright\n     and Similar Rights in Your contributions to Adapted Material in\n     accordance with the terms and conditions of this Public License.\n\n  c. BY-NC-SA Compatible License means a license listed at\n     creativecommons.org/compatiblelicenses, approved by Creative\n     Commons as essentially the equivalent of this Public License.\n\n  d. Copyright and Similar Rights means copyright and/or similar rights\n     closely related to copyright including, without limitation,\n     performance, broadcast, sound recording, and Sui Generis Database\n     Rights, without regard to how the rights are labeled or\n     categorized. For purposes of this Public License, the rights\n     specified in Section 2(b)(1)-(2) are not Copyright and Similar\n     Rights.\n\n  e. Effective Technological Measures means those measures that, in the\n     absence of proper authority, may not be circumvented under laws\n     fulfilling obligations under Article 11 of the WIPO Copyright\n     Treaty adopted on December 20, 1996, and/or similar international\n     agreements.\n\n  f. Exceptions and Limitations means fair use, fair dealing, and/or\n     any other exception or limitation to Copyright and Similar Rights\n     that applies to Your use of the Licensed Material.\n\n  g. 
License Elements means the license attributes listed in the name\n     of a Creative Commons Public License. The License Elements of this\n     Public License are Attribution, NonCommercial, and ShareAlike.\n\n  h. Licensed Material means the artistic or literary work, database,\n     or other material to which the Licensor applied this Public\n     License.\n\n  i. Licensed Rights means the rights granted to You subject to the\n     terms and conditions of this Public License, which are limited to\n     all Copyright and Similar Rights that apply to Your use of the\n     Licensed Material and that the Licensor has authority to license.\n\n  j. Licensor means the individual(s) or entity(ies) granting rights\n     under this Public License.\n\n  k. NonCommercial means not primarily intended for or directed towards\n     commercial advantage or monetary compensation. For purposes of\n     this Public License, the exchange of the Licensed Material for\n     other material subject to Copyright and Similar Rights by digital\n     file-sharing or similar means is NonCommercial provided there is\n     no payment of monetary compensation in connection with the\n     exchange.\n\n  l. Share means to provide material to the public by any means or\n     process that requires permission under the Licensed Rights, such\n     as reproduction, public display, public performance, distribution,\n     dissemination, communication, or importation, and to make material\n     available to the public including in ways that members of the\n     public may access the material from a place and at a time\n     individually chosen by them.\n\n  m. Sui Generis Database Rights means rights other than copyright\n     resulting from Directive 96/9/EC of the European Parliament and of\n     the Council of 11 March 1996 on the legal protection of databases,\n     as amended and/or succeeded, as well as other essentially\n     equivalent rights anywhere in the world.\n\n  n. 
You means the individual or entity exercising the Licensed Rights\n     under this Public License. Your has a corresponding meaning.\n\n\nSection 2 -- Scope.\n\n  a. License grant.\n\n       1. Subject to the terms and conditions of this Public License,\n          the Licensor hereby grants You a worldwide, royalty-free,\n          non-sublicensable, non-exclusive, irrevocable license to\n          exercise the Licensed Rights in the Licensed Material to:\n\n            a. reproduce and Share the Licensed Material, in whole or\n               in part, for NonCommercial purposes only; and\n\n            b. produce, reproduce, and Share Adapted Material for\n               NonCommercial purposes only.\n\n       2. Exceptions and Limitations. For the avoidance of doubt, where\n          Exceptions and Limitations apply to Your use, this Public\n          License does not apply, and You do not need to comply with\n          its terms and conditions.\n\n       3. Term. The term of this Public License is specified in Section\n          6(a).\n\n       4. Media and formats; technical modifications allowed. The\n          Licensor authorizes You to exercise the Licensed Rights in\n          all media and formats whether now known or hereafter created,\n          and to make technical modifications necessary to do so. The\n          Licensor waives and/or agrees not to assert any right or\n          authority to forbid You from making technical modifications\n          necessary to exercise the Licensed Rights, including\n          technical modifications necessary to circumvent Effective\n          Technological Measures. For purposes of this Public License,\n          simply making modifications authorized by this Section 2(a)\n          (4) never produces Adapted Material.\n\n       5. Downstream recipients.\n\n            a. Offer from the Licensor -- Licensed Material. 
Every\n               recipient of the Licensed Material automatically\n               receives an offer from the Licensor to exercise the\n               Licensed Rights under the terms and conditions of this\n               Public License.\n\n            b. Additional offer from the Licensor -- Adapted Material.\n               Every recipient of Adapted Material from You\n               automatically receives an offer from the Licensor to\n               exercise the Licensed Rights in the Adapted Material\n               under the conditions of the Adapter's License You apply.\n\n            c. No downstream restrictions. You may not offer or impose\n               any additional or different terms or conditions on, or\n               apply any Effective Technological Measures to, the\n               Licensed Material if doing so restricts exercise of the\n               Licensed Rights by any recipient of the Licensed\n               Material.\n\n       6. No endorsement. Nothing in this Public License constitutes or\n          may be construed as permission to assert or imply that You\n          are, or that Your use of the Licensed Material is, connected\n          with, or sponsored, endorsed, or granted official status by,\n          the Licensor or others designated to receive attribution as\n          provided in Section 3(a)(1)(A)(i).\n\n  b. Other rights.\n\n       1. Moral rights, such as the right of integrity, are not\n          licensed under this Public License, nor are publicity,\n          privacy, and/or other similar personality rights; however, to\n          the extent possible, the Licensor waives and/or agrees not to\n          assert any such rights held by the Licensor to the limited\n          extent necessary to allow You to exercise the Licensed\n          Rights, but not otherwise.\n\n       2. Patent and trademark rights are not licensed under this\n          Public License.\n\n       3. 
To the extent possible, the Licensor waives any right to\n          collect royalties from You for the exercise of the Licensed\n          Rights, whether directly or through a collecting society\n          under any voluntary or waivable statutory or compulsory\n          licensing scheme. In all other cases the Licensor expressly\n          reserves any right to collect such royalties, including when\n          the Licensed Material is used other than for NonCommercial\n          purposes.\n\n\nSection 3 -- License Conditions.\n\nYour exercise of the Licensed Rights is expressly made subject to the\nfollowing conditions.\n\n  a. Attribution.\n\n       1. If You Share the Licensed Material (including in modified\n          form), You must:\n\n            a. retain the following if it is supplied by the Licensor\n               with the Licensed Material:\n\n                 i. identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if\n                    designated);\n\n                ii. a copyright notice;\n\n               iii. a notice that refers to this Public License;\n\n                iv. a notice that refers to the disclaimer of\n                    warranties;\n\n                 v. a URI or hyperlink to the Licensed Material to the\n                    extent reasonably practicable;\n\n            b. indicate if You modified the Licensed Material and\n               retain an indication of any previous modifications; and\n\n            c. indicate the Licensed Material is licensed under this\n               Public License, and include the text of, or the URI or\n               hyperlink to, this Public License.\n\n       2. 
You may satisfy the conditions in Section 3(a)(1) in any\n          reasonable manner based on the medium, means, and context in\n          which You Share the Licensed Material. For example, it may be\n          reasonable to satisfy the conditions by providing a URI or\n          hyperlink to a resource that includes the required\n          information.\n       3. If requested by the Licensor, You must remove any of the\n          information required by Section 3(a)(1)(A) to the extent\n          reasonably practicable.\n\n  b. ShareAlike.\n\n     In addition to the conditions in Section 3(a), if You Share\n     Adapted Material You produce, the following conditions also apply.\n\n       1. The Adapter's License You apply must be a Creative Commons\n          license with the same License Elements, this version or\n          later, or a BY-NC-SA Compatible License.\n\n       2. You must include the text of, or the URI or hyperlink to, the\n          Adapter's License You apply. You may satisfy this condition\n          in any reasonable manner based on the medium, means, and\n          context in which You Share Adapted Material.\n\n       3. You may not offer or impose any additional or different terms\n          or conditions on, or apply any Effective Technological\n          Measures to, Adapted Material that restrict exercise of the\n          rights granted under the Adapter's License You apply.\n\n\nSection 4 -- Sui Generis Database Rights.\n\nWhere the Licensed Rights include Sui Generis Database Rights that\napply to Your use of the Licensed Material:\n\n  a. for the avoidance of doubt, Section 2(a)(1) grants You the right\n     to extract, reuse, reproduce, and Share all or a substantial\n     portion of the contents of the database for NonCommercial purposes\n     only;\n\n  b. 
if You include all or a substantial portion of the database\n     contents in a database in which You have Sui Generis Database\n     Rights, then the database in which You have Sui Generis Database\n     Rights (but not its individual contents) is Adapted Material,\n     including for purposes of Section 3(b); and\n\n  c. You must comply with the conditions in Section 3(a) if You Share\n     all or a substantial portion of the contents of the database.\n\nFor the avoidance of doubt, this Section 4 supplements and does not\nreplace Your obligations under this Public License where the Licensed\nRights include other Copyright and Similar Rights.\n\n\nSection 5 -- Disclaimer of Warranties and Limitation of Liability.\n\n  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE\n     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS\n     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF\n     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,\n     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,\n     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR\n     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,\n     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT\n     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT\n     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.\n\n  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE\n     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,\n     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,\n     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,\n     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR\n     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN\n     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR\n     DAMAGES. 
WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR\n     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.\n\n  c. The disclaimer of warranties and limitation of liability provided\n     above shall be interpreted in a manner that, to the extent\n     possible, most closely approximates an absolute disclaimer and\n     waiver of all liability.\n\n\nSection 6 -- Term and Termination.\n\n  a. This Public License applies for the term of the Copyright and\n     Similar Rights licensed here. However, if You fail to comply with\n     this Public License, then Your rights under this Public License\n     terminate automatically.\n\n  b. Where Your right to use the Licensed Material has terminated under\n     Section 6(a), it reinstates:\n\n       1. automatically as of the date the violation is cured, provided\n          it is cured within 30 days of Your discovery of the\n          violation; or\n\n       2. upon express reinstatement by the Licensor.\n\n     For the avoidance of doubt, this Section 6(b) does not affect any\n     right the Licensor may have to seek remedies for Your violations\n     of this Public License.\n\n  c. For the avoidance of doubt, the Licensor may also offer the\n     Licensed Material under separate terms or conditions or stop\n     distributing the Licensed Material at any time; however, doing so\n     will not terminate this Public License.\n\n  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n     License.\n\n\nSection 7 -- Other Terms and Conditions.\n\n  a. The Licensor shall not be bound by any additional or different\n     terms or conditions communicated by You unless expressly agreed.\n\n  b. Any arrangements, understandings, or agreements regarding the\n     Licensed Material not stated herein are separate from and\n     independent of the terms and conditions of this Public License.\n\n\nSection 8 -- Interpretation.\n\n  a. 
For the avoidance of doubt, this Public License does not, and\n     shall not be interpreted to, reduce, limit, restrict, or impose\n     conditions on any use of the Licensed Material that could lawfully\n     be made without permission under this Public License.\n\n  b. To the extent possible, if any provision of this Public License is\n     deemed unenforceable, it shall be automatically reformed to the\n     minimum extent necessary to make it enforceable. If the provision\n     cannot be reformed, it shall be severed from this Public License\n     without affecting the enforceability of the remaining terms and\n     conditions.\n\n  c. No term or condition of this Public License will be waived and no\n     failure to comply consented to unless expressly agreed to by the\n     Licensor.\n\n  d. Nothing in this Public License constitutes or may be interpreted\n     as a limitation upon, or waiver of, any privileges and immunities\n     that apply to the Licensor or You, including from the legal\n     processes of any jurisdiction or authority.\n\n=======================================================================\n\nCreative Commons is not a party to its public\nlicenses. Notwithstanding, Creative Commons may elect to apply one of\nits public licenses to material it publishes and in those instances\nwill be considered the “Licensor.” The text of the Creative Commons\npublic licenses is dedicated to the public domain under the CC0 Public\nDomain Dedication. 
Except for the limited purpose of indicating that\nmaterial is shared under a Creative Commons public license or as\notherwise permitted by the Creative Commons policies published at\ncreativecommons.org/policies, Creative Commons does not authorize the\nuse of the trademark \"Creative Commons\" or any other trademark or logo\nof Creative Commons without its prior written consent including,\nwithout limitation, in connection with any unauthorized modifications\nto any of its public licenses or any other arrangements,\nunderstandings, or agreements concerning use of licensed material. For\nthe avoidance of doubt, this paragraph does not form part of the\npublic licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n"
  },
  {
    "path": "LINKS.md",
    "content": "Useful links\n========================\n\nLinux boot\n------------------------\n\n* [Linux/x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt)\n* [Linux kernel parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)\n\nProtected mode\n------------------------\n\n* [64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html)\n\nMemory management in the Linux kernel\n--------------------------------------\n\n* [Notes on the linux kernel VM subsystem by @lorenzo-stoakes](https://github.com/lorenzo-stoakes/linux-vm-notes)\n\nSerial programming\n------------------------\n\n* [8250 UART Programming](http://en.wikibooks.org/wiki/Serial_Programming/8250_UART_Programming#UART_Registers)\n* [Serial ports on OSDEV](http://wiki.osdev.org/Serial_Ports)\n\nVGA\n------------------------\n\n* [Video Graphics Array (VGA)](http://en.wikipedia.org/wiki/Video_Graphics_Array)\n\nIO\n------------------------\n\n* [IO port programming](http://www.tldp.org/HOWTO/text/IO-Port-Programming)\n\nGCC and GAS\n------------------------\n\n* [GCC type attributes](https://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html)\n* [Assembler Directives](http://www.chemie.fu-berlin.de/chemnet/use/info/gas/gas_toc.html#TOC65)\n\n\nImportant data structures\n--------------------------\n\n* [task_struct definition](http://lxr.free-electrons.com/source/include/linux/sched.h#L1274)\n\n\nUseful links\n------------------------\n\n* [Linux x86 Program Start Up](http://dbp-consulting.com/tutorials/debugging/linuxProgramStartup.html)\n* [Memory Layout in Program Execution (32 bits)](http://fgiasson.com/articles/memorylayout.txt)\n"
  },
  {
    "path": "MM/README.md",
    "content": "# Linux kernel memory management\n\nThis chapter describes memory management in the Linux kernel. Here you will find a\ncouple of posts which describe different parts of the Linux memory management framework:\n\n* [Memblock](linux-mm-1.md) - describes the early `memblock` allocator.\n* [Fix-Mapped Addresses and ioremap](linux-mm-2.md) - describes `fix-mapped` addresses and early `ioremap`.\n* [kmemcheck](linux-mm-3.md) - the third part describes the `kmemcheck` tool.\n"
  },
  {
    "path": "MM/linux-mm-1.md",
    "content": "Linux kernel memory management Part 1.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nMemory management is one of the most complex (and I think that it is the most complex) parts of the operating system kernel. In the [last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part we stopped right before the call of the `start_kernel` function. This function initializes all the kernel features (including architecture-dependent features) before the kernel runs the first `init` process. You may remember that we built early page tables, identity page tables and fixmap page tables at boot time. No complicated memory management is working yet. When the `start_kernel` function is called we will see the transition to more complex data structures and techniques for memory management. For a good understanding of the initialization process in the Linux kernel we need to have a clear understanding of these techniques. This chapter will provide an overview of the different parts of the Linux kernel memory management framework and its API, starting from the `memblock`.\n\nMemblock\n--------------------------------------------------------------------------------\n\nMemblock is one of the methods of managing memory regions during the early bootstrap period while the usual kernel memory allocators are not up and\nrunning yet. Previously it was called `Logical Memory Block`, but with the [patch](https://lkml.org/lkml/2010/7/13/68) by Yinghai Lu, it was renamed to `memblock`. The Linux kernel for the `x86_64` architecture uses this method. We already met `memblock` in the [Last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part. Now it's time to get better acquainted with it. 
We will see how it is implemented.\n\nWe will start to learn `memblock` from the data structures. Definitions of all logical-memory-block-related data structures can be found in the [include/linux/memblock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/memblock.h) header file.\n\nThe first structure has the same name as this part and it is:\n\n```C\nstruct memblock {\n         bool bottom_up;\n         phys_addr_t current_limit;\n         struct memblock_type memory;   --> array of memblock_region\n         struct memblock_type reserved; --> array of memblock_region\n#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP\n         struct memblock_type physmem;\n#endif\n};\n```\n\nThis structure contains five fields. First is `bottom_up` which allows allocating memory in bottom-up mode when it is `true`. Next field is `current_limit`. This field describes the limit size of the memory block. The next three fields describe the type of the memory block. It can be: reserved, memory and physical memory (physical memory is available if the `CONFIG_HAVE_MEMBLOCK_PHYS_MAP` configuration option is enabled). Now we see yet another data structure - `memblock_type`. Let's look at its definition:\n\n```C\nstruct memblock_type {\n\tunsigned long cnt;\n\tunsigned long max;\n\tphys_addr_t total_size;\n\tstruct memblock_region *regions;\n};\n```\n\nThis structure provides information about the memory type. It contains fields which describe the number of memory regions inside the current memory block, the size of all memory regions, the size of the allocated array of the memory regions, and a pointer to the array of the `memblock_region` structures. `memblock_region` is a structure which describes a memory region. 
Its definition is:\n\n```C\nstruct memblock_region {\n        phys_addr_t base;\n        phys_addr_t size;\n        unsigned long flags;\n#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP\n        int nid;\n#endif\n};\n```\n\n`memblock_region` provides the base address and size of the memory region as well as a flags field which can have the following values:\n\n```C\nenum {\n    MEMBLOCK_NONE\t= 0x0,\t/* No special request */\n    MEMBLOCK_HOTPLUG\t= 0x1,\t/* hotpluggable region */\n    MEMBLOCK_MIRROR\t= 0x2,\t/* mirrored region */\n    MEMBLOCK_NOMAP\t= 0x4,\t/* don't add to kernel direct mapping */\n};\n```\n\nAlso `memblock_region` provides an integer field - [numa](http://en.wikipedia.org/wiki/Non-uniform_memory_access) node selector, if the `CONFIG_HAVE_MEMBLOCK_NODE_MAP` configuration option is enabled.\n\nSchematically we can imagine it as:\n\n```\n+---------------------------+   +---------------------------+\n|         memblock          |   |                           |\n|  _______________________  |   |                           |\n| |        memory         | |   |       Array of the        |\n| |      memblock_type    |-|-->|      memblock_region      |\n| |_______________________| |   |                           |\n|                           |   +---------------------------+\n|  _______________________  |   +---------------------------+\n| |       reserved        | |   |                           |\n| |      memblock_type    |-|-->|       Array of the        |\n| |_______________________| |   |      memblock_region      |\n|                           |   |                           |\n+---------------------------+   +---------------------------+\n```\n\nThese three structures: `memblock`, `memblock_type` and `memblock_region` are main in the `Memblock`. 
Now we know about it and can look at Memblock initialization process.\n\nMemblock initialization\n--------------------------------------------------------------------------------\n\nAs all API of the `memblock` are described in the [include/linux/memblock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/memblock.h) header file, all implementations of these functions are in the [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c) source code file. Let's look at the top of the source code file and we will see the initialization of the `memblock` structure:\n\n```C\nstruct memblock memblock __initdata_memblock = {\n\t.memory.regions\t\t= memblock_memory_init_regions,\n\t.memory.cnt\t\t    = 1,\n\t.memory.max\t\t    = INIT_MEMBLOCK_REGIONS,\n\n\t.reserved.regions\t= memblock_reserved_init_regions,\n\t.reserved.cnt\t\t= 1,\n\t.reserved.max\t\t= INIT_MEMBLOCK_REGIONS,\n\n#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP\n\t.physmem.regions\t= memblock_physmem_init_regions,\n\t.physmem.cnt\t\t= 1,\n\t.physmem.max\t\t= INIT_PHYSMEM_REGIONS,\n#endif\n\t.bottom_up\t\t    = false,\n\t.current_limit\t\t= MEMBLOCK_ALLOC_ANYWHERE,\n};\n```\n\nHere we can see initialization of the `memblock` structure which has the same name as structure - `memblock`. First of all note the `__initdata_memblock`. Definition of this macro looks like:\n\n```C\n#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK\n    #define __init_memblock __meminit\n    #define __initdata_memblock __meminitdata\n#else\n    #define __init_memblock\n    #define __initdata_memblock\n#endif\n```\n\nYou can see that it depends on `CONFIG_ARCH_DISCARD_MEMBLOCK`. 
If this configuration option is enabled, memblock code will be put into the `.init` section and will be released after the kernel is booted up.\n\nNext we can see the initialization of the `memblock_type memory`, `memblock_type reserved` and `memblock_type physmem` fields of the `memblock` structure. Here we are interested only in the `memblock_type.regions` initialization process. Note that every `memblock_type` field is initialized with an array of `memblock_region`s:\n\n```C\nstatic struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;\nstatic struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;\n#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP\nstatic struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;\n#endif\n```\n\nEvery array contains 128 memory regions. We can see it in the `INIT_MEMBLOCK_REGIONS` macro definition:\n\n```C\n#define INIT_MEMBLOCK_REGIONS   128\n```\n\nNote that all arrays are also defined with the `__initdata_memblock` macro which we already saw in the `memblock` structure initialization (read above if you've forgotten).\n\nThe last two fields describe that `bottom_up` allocation is disabled and the limit of the current Memblock is:\n\n```C\n#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)\n```\n\nwhich is `0xffffffffffffffff`.\n\nOn this step the initialization of the `memblock` structure has been finished and we can have a look at the Memblock API.\n\nMemblock API\n--------------------------------------------------------------------------------\n\nOk we have finished with the initialization of the `memblock` structure and now we can look at the Memblock API and its implementation. As I said above, the implementation of `memblock` is taking place fully in [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c). 
To understand how `memblock` works and how it is implemented, let's look at its usage first. There are a couple of [places](http://lxr.free-electrons.com/ident?i=memblock) in the Linux kernel where memblock is used. For example let's take `memblock_x86_fill` function from the [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c#L1061). This function goes through the memory map provided by the [e820](http://en.wikipedia.org/wiki/E820) and adds memory regions reserved by the kernel to the `memblock` with the `memblock_add` function. Since we have met the `memblock_add` function first, let's start from it.\n\nThis function takes a physical base address and the size of the memory region as arguments and add them to the `memblock`. The `memblock_add` function does not do anything special in its body, but just calls the:\n\n```C\nmemblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);\n```\n\nfunction. We pass the memory block type - `memory`, the physical base address and the size of the memory region, the maximum number of nodes which is 1 if `CONFIG_NODES_SHIFT` is not set in the configuration file or `1 << CONFIG_NODES_SHIFT` if it is set, and the flags. The `memblock_add_range` function adds a new memory region to the memory block. It starts by checking the size of the given region and if it is zero it just returns. After this, `memblock_add_range` checks the existence of the memory regions in the `memblock` structure with the given `memblock_type`. If there are no memory regions, we just fill a new `memory_region` with the given values and return (we already saw the implementation of this in the [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)). 
If `memblock_type` is not empty, we start to add a new memory region to the `memblock` with the given `memblock_type`.\n\nFirst of all we get the end of the memory region with the:\n\n```C\nphys_addr_t end = base + memblock_cap_size(base, &size);\n```\n\n`memblock_cap_size` adjusts `size` so that `base + size` will not overflow. Its implementation is pretty easy:\n\n```C\nstatic inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)\n{\n\treturn *size = min(*size, (phys_addr_t)ULLONG_MAX - base);\n}\n```\n\n`memblock_cap_size` returns the new size which is the smaller of the given size and `ULLONG_MAX - base`.\n\nNow that we have the end address of the new memory region, `memblock_add_range` checks for overlap and merge conditions with memory regions that have been added before. Insertion of the new memory region to the `memblock` consists of two steps:\n\n* Adding of non-overlapping parts of the new memory area as separate regions;\n* Merging of all neighboring regions.\n\nWe are going through all the already stored memory regions and checking for overlap with the new region:\n\n```C\n\tfor (i = 0; i < type->cnt; i++) {\n\t\tstruct memblock_region *rgn = &type->regions[i];\n\t\tphys_addr_t rbase = rgn->base;\n\t\tphys_addr_t rend = rbase + rgn->size;\n\n\t\tif (rbase >= end)\n\t\t\tbreak;\n\t\tif (rend <= base)\n\t\t\tcontinue;\n        ...\n\t\t...\n\t\t...\n\t}\n```\n\nIf the new memory region does not overlap with regions which are already stored in the `memblock`, we insert this region into the memblock. This is the first step: we check whether the new region can fit into the memory block and call `memblock_double_array` if it cannot:\n\n```C\nwhile (type->cnt + nr_new > type->max)\n\tif (memblock_double_array(type, obase, size) < 0)\n\t\treturn -ENOMEM;\n\tinsert = true;\n\tgoto repeat;\n```\n\n`memblock_double_array` doubles the size of the given regions array. Then we set `insert` to `true` and go to the `repeat` label. 
In the second step, starting from the `repeat` label we go through the same loop and insert the current memory region into the memory block with the `memblock_insert_region` function:\n\n```C\n\tif (base < end) {\n\t\tnr_new++;\n\t\tif (insert)\n\t\t\tmemblock_insert_region(type, i, base, end - base,\n\t\t\t\t\t       nid, flags);\n\t}\n```\n\nSince we set `insert` to `true` in the first step, now `memblock_insert_region` will be called. `memblock_insert_region` has almost the same implementation that we saw when we inserted a new region to the empty `memblock_type` (see above). This function gets the last memory region:\n\n```C\nstruct memblock_region *rgn = &type->regions[idx];\n```\n\nand copies the memory area with `memmove`:\n\n```C\nmemmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));\n```\n\nAfter this fills `memblock_region` fields of the new memory region base, size, etc. and increases size of the `memblock_type`. In the end of the execution, `memblock_add_range` calls `memblock_merge_regions` which merges neighboring compatible regions in the second step.\n\nIn the second case the new memory region can overlap already stored regions. For example we already have `region1` in the `memblock`:\n\n```\n0                    0x1000\n+-----------------------+\n|                       |\n|                       |\n|        region1        |\n|                       |\n|                       |\n+-----------------------+\n```\n\nAnd now we want to add `region2` to the `memblock` with the following base address and size:\n\n```\n0x100                 0x2000\n+-----------------------+\n|                       |\n|                       |\n|        region2        |\n|                       |\n|                       |\n+-----------------------+\n```\n\nIn this case set the base address of the new memory region as the end address of the overlapped region with:\n\n```C\nbase = min(rend, end);\n```\n\nSo it will be `0x1000` in our case. 
And insert it as we did it already in the second step with:\n\n```\nif (base < end) {\n\tnr_new++;\n\tif (insert)\n\t\tmemblock_insert_region(type, i, base, end - base, nid, flags);\n}\n```\n\nIn this case we insert `overlapping portion` (we insert only the higher portion, because the lower portion is already in the overlapped memory region), then the remaining portion and merge these portions with `memblock_merge_regions`. As I said above `memblock_merge_regions` function merges neighboring compatible regions. It goes through all memory regions from the given `memblock_type`, takes two neighboring memory regions - `type->regions[i]` and `type->regions[i + 1]` and checks whether these regions have different flags, belong to different nodes, or the end address of the first region is not equal to the base address of the second region. If any of these conditions is true, the two regions cannot be merged and we move on to the next pair:\n\n```C\nwhile (i < type->cnt - 1) {\n\tstruct memblock_region *this = &type->regions[i];\n\tstruct memblock_region *next = &type->regions[i + 1];\n\tif (this->base + this->size != next->base ||\n\t    memblock_get_region_node(this) !=\n\t    memblock_get_region_node(next) ||\n\t    this->flags != next->flags) {\n\t\tBUG_ON(this->base + this->size > next->base);\n\t\ti++;\n\t\tcontinue;\n\t}\n```\n\nIf none of these conditions are true, we update the size of the first region with the size of the next region:\n\n```C\nthis->size += next->size;\n```\n\nAs we update the size of the first memory region with the size of the next memory region, we move all memory regions which are after the (`next`) memory region one index backwards with the `memmove` function:\n\n```C\nmemmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));\n```\n\nThe `memmove` here moves all regions which are located after the `next` region to the base address of the `next` region. 
In the end we just decrease the count of the memory regions which belong to the `memblock_type`:\n\n```C\ntype->cnt--;\n```\n\nAfter this we will get two memory regions merged into one:\n\n```\n0                                             0x2000\n+------------------------------------------------+\n|                                                |\n|                                                |\n|                   region1                      |\n|                                                |\n|                                                |\n+------------------------------------------------+\n```\n\nAs we decreased counts of regions in a memblock with certain type, increased size of the `this` region and shifted all regions which are located after `next` region to its place.\n\nThat's all. This is the whole principle of the work of the `memblock_add_range` function.\n\nThere is also `memblock_reserve` function which does the same as `memblock_add`, but with one difference. It stores `memblock_type.reserved` in the memblock instead of `memblock_type.memory`.\n\nOf course this is not the full API. Memblock provides APIs not only for adding `memory` and `reserved` memory regions, but also:\n\n* `memblock_remove` - removes memory region from memblock;\n* `memblock_find_in_range` - finds free area in given range;\n* `memblock_free` - releases memory region in memblock;\n* `for_each_mem_range` - iterates through memblock areas.\n\nand many more....\n\nGetting info about memory regions\n--------------------------------------------------------------------------------\n\nMemblock also provides an API for getting information about allocated memory regions in the `memblock`. It is split in two parts:\n\n* `get_allocated_memblock_memory_regions_info` - getting info about memory regions;\n* `get_allocated_memblock_reserved_regions_info` - getting info about reserved regions.\n\nImplementation of these functions is easy. 
Let's look at `get_allocated_memblock_reserved_regions_info` for example:\n\n```C\nphys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(\n\t\t\t\t\tphys_addr_t *addr)\n{\n\tif (memblock.reserved.regions == memblock_reserved_init_regions)\n\t\treturn 0;\n\n\t*addr = __pa(memblock.reserved.regions);\n\n\treturn PAGE_ALIGN(sizeof(struct memblock_region) *\n\t\t\t  memblock.reserved.max);\n}\n```\n\nFirst of all this function checks that `memblock` contains reserved memory regions. If `memblock` does not contain reserved memory regions we just return zero. Otherwise we write the physical address of the reserved memory regions array to the given address and return aligned size of the allocated array. Note that there is `PAGE_ALIGN` macro used for align. Actually it depends on size of page:\n\n```C\n#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)\n```\n\nImplementation of the `get_allocated_memblock_memory_regions_info` function is the same. It has only one difference, `memblock_type.memory` used instead of `memblock_type.reserved`.\n\nMemblock debugging\n--------------------------------------------------------------------------------\n\nThere are many calls to `memblock_dbg` in the memblock implementation. If you pass the `memblock=debug` option to the kernel command line, this function will be called. Actually `memblock_dbg` is just a macro which expands to `printk`:\n\n```C\n#define memblock_dbg(fmt, ...) \\\n         if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)\n```\n\nFor example you can see a call of this macro in the `memblock_reserve` function:\n\n```C\nmemblock_dbg(\"memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\\n\",\n\t\t     (unsigned long long)base,\n\t\t     (unsigned long long)base + size - 1,\n\t\t     flags, (void *)_RET_IP_);\n```\n\nAnd you will see something like this:\n\n![Memblock](images/memblock.png)\n\nMemblock also has support in [debugfs](http://en.wikipedia.org/wiki/Debugfs). 
If you run the kernel on another architecture than `X86` you can access:\n\n* `/sys/kernel/debug/memblock/memory`\n* `/sys/kernel/debug/memblock/reserved`\n* `/sys/kernel/debug/memblock/physmem`\n\nto get a dump of the `memblock` contents.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the first part about Linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [e820](http://en.wikipedia.org/wiki/E820)\n* [numa](http://en.wikipedia.org/wiki/Non-uniform_memory_access)\n* [debugfs](http://en.wikipedia.org/wiki/Debugfs)\n* [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)\n"
  },
  {
    "path": "MM/linux-mm-2.md",
    "content": "Linux kernel memory management Part 2.\n================================================================================\n\nFix-Mapped Addresses and ioremap\n--------------------------------------------------------------------------------\n\n`Fix-Mapped` addresses are a set of special compile-time addresses whose corresponding physical addresses do not have to be a linear address minus `__START_KERNEL_map`. Each fix-mapped address maps one page frame and the kernel uses them as pointers that never change their address. That is the main point of these addresses. As the comment says: `to have a constant address at compile time, but to set the physical address only in the boot process`. You can remember that in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1), we already set the `level2_fixmap_pgt`:\n\n```assembly\nNEXT_PAGE(level2_fixmap_pgt)\n\t.fill\t506,8,0\n\t.quad\tlevel1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE\n\t.fill\t5,8,0\n\nNEXT_PAGE(level1_fixmap_pgt)\n\t.fill\t512,8,0\n```\n\nAs you can see `level2_fixmap_pgt` is right after the `level2_kernel_pgt` which is kernel code+data+bss. Every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses` enum from the [arch/x86/include/asm/fixmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/fixmap.h). For example it contains entries for `VSYSCALL_PAGE` - if emulation of legacy vsyscall page is enabled, `FIX_APIC_BASE` for local [apic](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller), etc. 
In virtual memory fix-mapped area is placed in the modules area:\n\n```\n       +-----------+-----------------+---------------+------------------+\n       |           |                 |               |                  |\n       |kernel text|      kernel     |               |    vsyscalls     |\n       | mapping   |       text      |    Modules    |    fix-mapped    |\n       |from phys 0|       data      |               |    addresses     |\n       |           |                 |               |                  |\n       +-----------+-----------------+---------------+------------------+\n__START_KERNEL_map   __START_KERNEL    MODULES_VADDR            0xffffffffffffffff\n```\n\nBase virtual address and size of the `fix-mapped` area are presented by the two following macro:\n\n```C\n#define FIXADDR_SIZE\t(__end_of_permanent_fixed_addresses << PAGE_SHIFT)\n#define FIXADDR_START\t(FIXADDR_TOP - FIXADDR_SIZE)\n```\n\nHere `__end_of_permanent_fixed_addresses` is an element of the `fixed_addresses` enum and as I wrote above, every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses`. `PAGE_SHIFT` determines the size of a page. For example size of the one page we can get with the `1 << PAGE_SHIFT` expression.\n\nIn our case we need to get the size of the fix-mapped area, but not only of one page, that's why we are using `__end_of_permanent_fixed_addresses` for getting the size of the fix-mapped area. The `__end_of_permanent_fixed_addresses` is the last index of the `fixed_addresses` enum or in other words the `__end_of_permanent_fixed_addresses` contains amount of pages in a fixed-mapped area. So if we multiply the value of the `__end_of_permanent_fixed_addresses` on a page size value we will get size of fix-mapped area. In my case it's a little more than `536` kilobytes. 
In your case it might be a different number, because the size depends on amount of the fix-mapped addresses which depends on your kernel configuration.\n\nThe second `FIXADDR_START` macro just subtracts the fix-mapped area size from the last address of the fix-mapped area to get its base virtual address. `FIXADDR_TOP` is a rounded up address from the base address of the [vsyscall](https://lwn.net/Articles/446528/) space:\n\n```C\n#define FIXADDR_TOP     (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - PAGE_SIZE)\n```\n\nThe `fixed_addresses` enums are used as indexes to get the virtual addresses by the `fix_to_virt` function. Implementation of this function is easy:\n\n```C\nstatic __always_inline unsigned long fix_to_virt(const unsigned int idx)\n{\n        BUILD_BUG_ON(idx >= __end_of_fixed_addresses);\n        return __fix_to_virt(idx);\n}\n```\n\nfirst of all it checks that the index given for the `fixed_addresses` enum is not greater or equal than `__end_of_fixed_addresses` with the `BUILD_BUG_ON` macro and then returns the result of the `__fix_to_virt` macro:\n\n```C\n#define __fix_to_virt(x)        (FIXADDR_TOP - ((x) << PAGE_SHIFT))\n```\n\nHere we shift left the given index of a `fix-mapped` area on the `PAGE_SHIFT` which determines size of a page as I wrote above and subtract it from the `FIXADDR_TOP` which is the highest address of the `fix-mapped` area:\n\n```\n+-----------------+\n|    PAGE 1       | FIXADDR_TOP (virt address)\n|    PAGE 2       |\n|    PAGE 3       |\n|    PAGE 4 (idx) | x - 4\n|    PAGE 5       |\n+-----------------+\n```\n\nThere is an inverse function for getting an index of a fix-mapped area corresponding to the given virtual address:\n\n```C\nstatic inline unsigned long virt_to_fix(const unsigned long vaddr)\n{\n        BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);\n        return __virt_to_fix(vaddr);\n}\n```\n\nThe `virt_to_fix` takes a virtual address, checks that this address is between `FIXADDR_START` and 
`FIXADDR_TOP` and calls the `__virt_to_fix` macro which is implemented as:\n\n```C\n#define __virt_to_fix(x)        ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)\n```\n\nAs we may see, the `__virt_to_fix` macro clears the first `12` bits in the given virtual address, subtracts it from the last address of the `fix-mapped` area (`FIXADDR_TOP`) and shifts the result right by `PAGE_SHIFT` which is `12`. Let me explain how it works.\n\nAs in previous example (in `__fix_to_virt` macro), we start from the top of the fix-mapped area. We also go back to bottom from the top to search an index of a fix-mapped area corresponding to the given virtual address. As you may see, first of all we will clear the first `12` bits in the given virtual address with `x & PAGE_MASK` expression. This allows us to get base address of page. We need to do this for case when the given virtual address points somewhere in a beginning/middle or end of a page, but not to the base address of it. At the next step we subtract this from the `FIXADDR_TOP`, which gives us the offset of the corresponding page from the top of the fix-mapped area. In the end we just shift this offset right by `PAGE_SHIFT`, i.e. divide it by the page size. This gives us index of a fix-mapped area corresponding to the given virtual address. It may look hard, but if you go through this step by step, you will see that the `__virt_to_fix` macro is pretty easy.\n\nThat's all. For this moment we know a little about `fix-mapped` addresses, but this is enough to go next.\n\n`Fix-mapped` addresses are used in different [places](http://lxr.free-electrons.com/ident?i=fix_to_virt) in the Linux kernel. `IDT` descriptor stored there, [Intel Trusted Execution Technology](http://en.wikipedia.org/wiki/Trusted_Execution_Technology) UUID stored in the `fix-mapped` area started from `FIX_TBOOT_BASE` index, [Xen](http://en.wikipedia.org/wiki/Xen) bootmap and many more... 
We already saw a little about `fix-mapped` addresses in the fifth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) of the Linux kernel initialization. We use `fix-mapped` area in the early `ioremap` initialization. Let's look at it more closely and try to understand what `ioremap` is, how it is implemented in the kernel and how it is related to the `fix-mapped` addresses.\n\nioremap\n--------------------------------------------------------------------------------\n\nThe Linux kernel provides many different primitives to manage memory. For this moment we will touch `I/O memory`. Every device is controlled by reading/writing from/to its registers. For example a driver can turn off/on a device by writing to its registers or get the state of a device by reading from its registers. Besides registers, many devices have buffers where a driver can write something or read from there. As we know for this moment there are two ways to access device's registers and data buffers:\n\n* through the I/O ports;\n* mapping of all the registers to the memory address space;\n\nIn the first case every control register of a device is assigned an input/output port number. A device driver can read from a port and write to it with the two instructions `in` and `out` which we already saw. 
If you want to know about currently registered port regions, you can learn about them by accessing `/proc/ioports`:\n\n```\n$ cat /proc/ioports\n0000-0cf7 : PCI Bus 0000:00\n  0000-001f : dma1\n  0020-0021 : pic1\n  0040-0043 : timer0\n  0050-0053 : timer1\n  0060-0060 : keyboard\n  0064-0064 : keyboard\n  0070-0077 : rtc0\n  0080-008f : dma page reg\n  00a0-00a1 : pic2\n  00c0-00df : dma2\n  00f0-00ff : fpu\n    00f0-00f0 : PNP0C04:00\n  03c0-03df : vesafb\n  03f8-03ff : serial\n  04d0-04d1 : pnp 00:06\n  0800-087f : pnp 00:01\n  0a00-0a0f : pnp 00:04\n  0a20-0a2f : pnp 00:04\n  0a30-0a3f : pnp 00:04\n0cf8-0cff : PCI conf1\n0d00-ffff : PCI Bus 0000:00\n...\n...\n...\n```\n\n`/proc/ioports` provides information about which driver uses which address of a `I/O` port region. All of these memory regions, for example `0000-0cf7`, were claimed with the `request_region` function from the [include/linux/ioport.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/ioport.h). Actually `request_region` is a macro which is defined as:\n\n```C\n#define request_region(start,n,name)   __request_region(&ioport_resource, (start), (n), (name), 0)\n```\n\nAs we can see it takes three parameters:\n\n* `start` -  begin of region;\n* `n`     -  length of region;\n* `name`  -  name of requester.\n\n`request_region` allocates an `I/O` port region. Very often the `check_region` function is called before the `request_region` to check that the given address range is available and the `release_region` function to release the memory region. `request_region` returns a pointer to the `resource` structure. The `resource` structure represents an abstraction for a tree-like subset of system resources. 
We already saw the `resource` structure in the fifth part of the kernel [initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) process and it looks as follows:\n\n```C\nstruct resource {\n        resource_size_t start;\n        resource_size_t end;\n        const char *name;\n        unsigned long flags;\n        struct resource *parent, *sibling, *child;\n};\n```\n\nand contains start and end addresses of the resource, the name, etc. Every `resource` structure contains pointers to the `parent`, `sibling` and `child` resources. As it has a parent and a child, it means that every subset of resources has a root `resource` structure. For example, for `I/O` ports it is the `ioport_resource` structure:\n\n```C\nstruct resource ioport_resource = {\n         .name   = \"PCI IO\",\n         .start  = 0,\n         .end    = IO_SPACE_LIMIT,\n        .flags  = IORESOURCE_IO,\n};\nEXPORT_SYMBOL(ioport_resource);\n```\n\nOr for `iomem`, it is the `iomem_resource` structure:\n\n```C\nstruct resource iomem_resource = {\n        .name   = \"PCI mem\",\n        .start  = 0,\n        .end    = -1,\n        .flags  = IORESOURCE_MEM,\n};\n```\n\nAs I have mentioned before, `request_region` is used to register I/O port regions and this macro is used in many [places](http://lxr.free-electrons.com/ident?i=request_region) in the kernel. For example let's look at [drivers/char/rtc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/char/rtc.c). This source code file provides the [Real Time Clock](http://en.wikipedia.org/wiki/Real-time_clock) interface in the Linux kernel. Like every kernel module, the `rtc` module contains a `module_init` definition:\n\n```C\nmodule_init(rtc_init);\n```\n\nwhere `rtc_init` is the `rtc` initialization function. This function is defined in the same `rtc.c` source code file. 
In the `rtc_init` function we can see a couple of calls to the `rtc_request_region` functions, which wrap `request_region` for example:\n\n```C\nr = rtc_request_region(RTC_IO_EXTENT);\n```\n\nwhere `rtc_request_region` calls:\n\n```C\nr = request_region(RTC_PORT(0), size, \"rtc\");\n```\n\nHere `RTC_IO_EXTENT` is the size of the memory region and it is `0x8`, `\"rtc\"` is the name of the region and `RTC_PORT` is:\n\n```C\n#define RTC_PORT(x)     (0x70 + (x))\n```\n\nSo with the `request_region(RTC_PORT(0), size, \"rtc\")` we register a memory region that starts at `0x70` and has a size of `0x8`. Let's look at `/proc/ioports`:\n\n```\n~$ sudo cat /proc/ioports | grep rtc\n0070-0077 : rtc0\n```\n\nSo, we got it! Ok, that was it for the I/O ports. The second way to communicate with drivers is through the use of `I/O` memory. As I have mentioned above this works by mapping the control registers and the memory of a device to the memory address space. `I/O` memory is a set of contiguous addresses which are provided by a device to the CPU through a bus. None of the memory-mapped I/O addresses are used by the kernel directly. There is a special `ioremap` function which allows us to convert the physical address on a bus to a kernel virtual address. In other words, `ioremap` maps I/O physical memory regions to make them accessible from the kernel. The `ioremap` function takes two parameters:\n\n* start of the memory region;\n* size of the memory region;\n\nThe I/O memory mapping API provides functions to check, request and release memory regions as I/O memory. 
There are three functions for that:\n\n* `request_mem_region`\n* `release_mem_region`\n* `check_mem_region`\n\n```\n~$ sudo cat /proc/iomem\n...\n...\n...\nbe826000-be82cfff : ACPI Non-volatile Storage\nbe82d000-bf744fff : System RAM\nbf745000-bfff4fff : reserved\nbfff5000-dc041fff : System RAM\ndc042000-dc0d2fff : reserved\ndc0d3000-dc138fff : System RAM\ndc139000-dc27dfff : ACPI Non-volatile Storage\ndc27e000-deffefff : reserved\ndefff000-deffffff : System RAM\ndf000000-dfffffff : RAM buffer\ne0000000-feafffff : PCI Bus 0000:00\n  e0000000-efffffff : PCI Bus 0000:01\n    e0000000-efffffff : 0000:01:00.0\n  f7c00000-f7cfffff : PCI Bus 0000:06\n    f7c00000-f7c0ffff : 0000:06:00.0\n    f7c10000-f7c101ff : 0000:06:00.0\n      f7c10000-f7c101ff : ahci\n  f7d00000-f7dfffff : PCI Bus 0000:03\n    f7d00000-f7d3ffff : 0000:03:00.0\n      f7d00000-f7d3ffff : alx\n...\n...\n...\n```\n\nPart of these addresses are from the call of the `e820_reserve_resources` function. We can find a call to this function in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and the function itself is defined in [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c). `e820_reserve_resources` goes through the [e820](http://en.wikipedia.org/wiki/E820) map and inserts memory regions into the root `iomem` resource structure. 
All `e820` memory regions which are inserted into the `iomem` resource have the following types:\n\n```C\nstatic inline const char *e820_type_to_string(int e820_type)\n{\n\tswitch (e820_type) {\n\tcase E820_RESERVED_KERN:\n\tcase E820_RAM:\treturn \"System RAM\";\n\tcase E820_ACPI:\treturn \"ACPI Tables\";\n\tcase E820_NVS:\treturn \"ACPI Non-volatile Storage\";\n\tcase E820_UNUSABLE:\treturn \"Unusable memory\";\n\tdefault:\treturn \"reserved\";\n\t}\n}\n```\n\nand we can see them in the `/proc/iomem` (read above).\n\nNow let's try to understand how `ioremap` works. We already know a little about `ioremap`, we saw it in the fifth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) about Linux kernel initialization. If you have read this part, you can remember the call of the `early_ioremap_init` function from the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). Initialization of the `ioremap` is split into two parts: there is the early part which we can use before the normal `ioremap` is available and the normal `ioremap` which is available after `vmalloc` initialization and the call of `paging_init`. We do not know anything about `vmalloc` for now, so let's consider early initialization of the `ioremap`. First of all `early_ioremap_init` checks that `fixmap` is aligned on page middle directory boundary:\n\n```C\nBUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));\n```\n\nmore about `BUILD_BUG_ON` you can read in the first part about [Linux Kernel initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1). So `BUILD_BUG_ON` macro raises a compilation error if the given expression is true. 
In the next step after this check, we can see the call of the `early_ioremap_setup` function from the [mm/early_ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/early_ioremap.c). This function presents generic initialization of the `ioremap`. `early_ioremap_setup` function fills the `slot_virt` array with the virtual addresses of the early fixmaps. All early fixmaps are after `__end_of_permanent_fixed_addresses` in memory. They start at `FIX_BTMAP_BEGIN` (top) and end with `FIX_BTMAP_END` (down). Actually there are `512` temporary boot-time mappings, used by early `ioremap`:\n\n```\n#define NR_FIX_BTMAPS\t\t64\n#define FIX_BTMAPS_SLOTS\t8\n#define TOTAL_FIX_BTMAPS\t(NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)\n```\n\nand `early_ioremap_setup`:\n\n```C\nvoid __init early_ioremap_setup(void)\n{\n        int i;\n\n        for (i = 0; i < FIX_BTMAPS_SLOTS; i++)\n                if (WARN_ON(prev_map[i]))\n                        break;\n\n        for (i = 0; i < FIX_BTMAPS_SLOTS; i++)\n                slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);\n}\n```\n\nthe `slot_virt` and other arrays are defined in the same source code file:\n\n```C\nstatic void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;\nstatic unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;\nstatic unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;\n```\n\n`slot_virt` contains the virtual addresses of the `fix-mapped` areas, `prev_map` array contains addresses of the early ioremap areas. Note that I wrote above: `Actually there are 512 temporary boot-time mappings, used by early ioremap` and you can see that all arrays are defined with the `__initdata` attribute which means that this memory will be released after the kernel initialization process. 
After `early_ioremap_setup` has finished its work, we're getting page middle directory where early ioremap begins with the `early_ioremap_pmd` function which just gets the base address of the page global directory and calculates the page middle directory for the given address:\n\n```C\nstatic inline pmd_t * __init early_ioremap_pmd(unsigned long addr)\n{\n\tpgd_t *base = __va(read_cr3_pa());\n\tpgd_t *pgd = &base[pgd_index(addr)];\n\tpud_t *pud = pud_offset(pgd, addr);\n\tpmd_t *pmd = pmd_offset(pud, addr);\n\treturn pmd;\n}\n```\n\nAfter this we fill `bm_pte` (early ioremap page table entries) with zeros and call the `pmd_populate_kernel` function:\n\n```C\npmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));\nmemset(bm_pte, 0, sizeof(bm_pte));\npmd_populate_kernel(&init_mm, pmd, bm_pte);\n```\n\n`pmd_populate_kernel` takes three parameters:\n\n* `init_mm` - memory descriptor of the `init` process (you can read about it in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5));\n* `pmd`     - page middle directory of the beginning of the `ioremap` fixmaps;\n* `bm_pte`  - early `ioremap` page table entries array which defined as:\n\n```C\nstatic pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;\n```\n\nThe `pmd_populate_kernel` function is defined in the [arch/x86/include/asm/pgalloc.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/pgalloc.h) and populates the page middle directory (`pmd`) provided as an argument with the given page table entries (`bm_pte`):\n\n```C\nstatic inline void pmd_populate_kernel(struct mm_struct *mm,\n                                       pmd_t *pmd, pte_t *pte)\n{\n        paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);\n        set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));\n}\n```\n\nwhere `set_pmd` is:\n\n```C\n#define set_pmd(pmdp, pmd)              native_set_pmd(pmdp, pmd)\n```\n\nand `native_set_pmd` is:\n\n```C\nstatic inline void 
native_set_pmd(pmd_t *pmdp, pmd_t pmd)\n{\n        *pmdp = pmd;\n}\n```\n\nThat's all. Early `ioremap` is ready to use. There are a couple of checks in the `early_ioremap_init` function, but they are not so important, anyway initialization of the `ioremap` is finished.\n\nUse of early ioremap\n--------------------------------------------------------------------------------\n\nAs soon as early `ioremap` has been set up successfully, we can use it. It provides two functions:\n\n* early_ioremap\n* early_iounmap\n\nfor mapping/unmapping of I/O physical address to virtual address. Both functions depend on the `CONFIG_MMU` configuration option. [Memory management unit](http://en.wikipedia.org/wiki/Memory_management_unit) is a special block of memory management. The main purpose of this block is the translation of virtual addresses to physical addresses. The memory management unit knows about the high-level page table addresses (`pgd`) from the `cr3` control register. If `CONFIG_MMU` option is set to `n`, `early_ioremap` just returns the given physical address and `early_iounmap` does nothing. If `CONFIG_MMU` option is set to `y`, `early_ioremap` calls `__early_ioremap` which takes three parameters:\n\n* `phys_addr` - base physical address of the `I/O` memory region to map on virtual addresses;\n* `size`      - size of the `I/O` memory region;\n* `prot`      - page table entry bits.\n\nFirst of all in the `__early_ioremap`, we go through all early ioremap fixmap slots and search for the first free one in the `prev_map` array. 
When we find it we remember its number in the `slot` variable and set up size:\n\n```C\nslot = -1;\nfor (i = 0; i < FIX_BTMAPS_SLOTS; i++) {\n\tif (!prev_map[i]) {\n\t\tslot = i;\n\t\tbreak;\n\t}\n}\n...\n...\n...\nprev_size[slot] = size;\nlast_addr = phys_addr + size - 1;\n```\n\n\nIn the next step we can see the following code:\n\n```C\noffset = phys_addr & ~PAGE_MASK;\nphys_addr &= PAGE_MASK;\nsize = PAGE_ALIGN(last_addr + 1) - phys_addr;\n```\n\nHere we are using `PAGE_MASK` for clearing all bits in the `phys_addr` except the first 12 bits. `PAGE_MASK` macro is defined as:\n\n```C\n#define PAGE_MASK       (~(PAGE_SIZE-1))\n```\n\nWe know that size of a page is 4096 bytes or `1000000000000` in binary. `PAGE_SIZE - 1` will be `111111111111`, but with `~`, we will get `000000000000`, but as we use `~PAGE_MASK` we will get `111111111111` again. On the second line we do the same but clear the first 12 bits, and on the third line we get the page-aligned size of the area. We get an aligned area and now we need to get the number of pages which are occupied by the new `ioremap` area and calculate the fix-mapped index from `fixed_addresses` in the next steps:\n\n```C\nnrpages = size >> PAGE_SHIFT;\nidx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;\n```\n\nNow we can fill `fix-mapped` area with the given physical addresses. 
On every iteration in the loop, we call the `__early_set_fixmap` function from the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c), increase the given physical address by the page size which is `4096` bytes and update the `addresses` index and the number of pages:\n\n```C\nwhile (nrpages > 0) {\n\t__early_set_fixmap(idx, phys_addr, prot);\n\tphys_addr += PAGE_SIZE;\n\t--idx;\n    --nrpages;\n}\n```\n\nThe `__early_set_fixmap` function gets the page table entry (stored in the `bm_pte`, see above) for the given physical address with:\n\n```C\npte = early_ioremap_pte(addr);\n```\n\nIn the next step of `early_ioremap_pte` we check the given page flags with the `pgprot_val` macro and call `set_pte` or `pte_clear` depending on the flags given:\n\n```C\nif (pgprot_val(flags))\n\t\tset_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));\n\telse\n\t\tpte_clear(&init_mm, addr, pte);\n```\n\nAs you can see above, we passed `FIXMAP_PAGE_IO` as flags to the `__early_ioremap`. `FIXMAP_PAGE_IO` expands to the:\n\n```C\n(__PAGE_KERNEL_EXEC | _PAGE_NX)\n```\n\nflags, so we call `set_pte` function to set the page table entry which works in the same manner as `set_pmd` but for PTEs (read above about it). As we have set all `PTEs` in the loop, we can now take a look at the call of the `__flush_tlb_one` function:\n\n```C\n__flush_tlb_one(addr);\n```\n\nThis function is defined in [arch/x86/include/asm/tlbflush.h](https://github.com/torvalds/linux) and calls `__flush_tlb_single` or `__flush_tlb` depending on the value of `cpu_has_invlpg`:\n\n```C\nstatic inline void __flush_tlb_one(unsigned long addr)\n{\n        if (cpu_has_invlpg)\n                __flush_tlb_single(addr);\n        else\n                __flush_tlb();\n}\n```\n\nThe `__flush_tlb_one` function invalidates the given address in the [TLB](http://en.wikipedia.org/wiki/Translation_lookaside_buffer). 
As you just saw we updated the paging structure, but `TLB` is not informed of the changes, that's why we need to do it manually. There are two ways to do it. The first is to update the `cr3` control register and the `__flush_tlb` function does this:\n\n```C\nnative_write_cr3(__native_read_cr3());\n```\n\nThe second method is to use the `invlpg` instruction to invalidate the `TLB` entry. Let's look at the `__flush_tlb_one` implementation. As you can see, first of all the function checks `cpu_has_invlpg` which is defined as:\n\n```C\n#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)\n# define cpu_has_invlpg         1\n#else\n# define cpu_has_invlpg         (boot_cpu_data.x86 > 3)\n#endif\n```\n\nIf a CPU supports the `invlpg` instruction, we call the `__flush_tlb_single` macro which expands to the call of `__native_flush_tlb_single`:\n\n```C\nstatic inline void __native_flush_tlb_single(unsigned long addr)\n{\n        asm volatile(\"invlpg (%0)\" ::\"r\" (addr) : \"memory\");\n}\n```\n\nor call `__flush_tlb` which just updates the `cr3` register as we have seen. After this step execution of the `__early_set_fixmap` function is finished and we can go back to the `__early_ioremap` implementation. When we have set up the fixmap area for the given address, we need to save the base virtual address of the I/O remapped area in the `prev_map` using the `slot` index:\n\n```C\nprev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);\n```\n\nand return it.\n\nThe second function, `early_iounmap`, unmaps an `I/O` memory region. This function takes two parameters: base address and size of a `I/O` region and generally looks very similar to `early_ioremap`. It also goes through fixmap slots and looks for a slot with the given address. After that, it gets the index of the fixmap slot and calls `__late_clear_fixmap` or `__early_set_fixmap` depending on the `after_paging_init` value. 
It calls `__early_set_fixmap` with one difference to how `early_ioremap` does it: `early_iounmap` passes `zero` as physical address. And in the end it sets the address of the I/O memory region to `NULL`:\n\n```C\nprev_map[slot] = NULL;\n```\n\nThat's all about `fixmaps` and `ioremap`. Of course this part does not cover all features of `ioremap`, only early ioremap but there is also normal ioremap. But we need to know more things before we study that in more detail.\n\nSo, this is the end!\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the second part about Linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [apic](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [vsyscall](https://lwn.net/Articles/446528/)\n* [Intel Trusted Execution Technology](http://en.wikipedia.org/wiki/Trusted_Execution_Technology)\n* [Xen](http://en.wikipedia.org/wiki/Xen)\n* [Real Time Clock](http://en.wikipedia.org/wiki/Real-time_clock)\n* [e820](http://en.wikipedia.org/wiki/E820)\n* [Memory management unit](http://en.wikipedia.org/wiki/Memory_management_unit)\n* [TLB](http://en.wikipedia.org/wiki/Translation_lookaside_buffer)\n* [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)\n* [Linux kernel memory management Part 1.](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-1)\n"
  },
  {
    "path": "MM/linux-mm-3.md",
    "content": "Linux kernel memory management Part 3.\n================================================================================\n\nIntroduction to the kmemcheck in the Linux kernel\n--------------------------------------------------------------------------------\n\nThis is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/mm) which describes [memory management](https://en.wikipedia.org/wiki/Memory_management) in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2) of this chapter we met two memory management related concepts:\n\n* `Fix-Mapped Addresses`;\n* `ioremap`.\n\nThe first concept represents special area in [virtual memory](https://en.wikipedia.org/wiki/Virtual_memory), whose corresponding physical mapping is calculated in [compile-time](https://en.wikipedia.org/wiki/Compile_time). The second concept provides ability to map input/output related memory to virtual memory.\n\nFor example if you will look at the output of the `/proc/iomem`:\n\n```\n$ sudo cat /proc/iomem\n\n00000000-00000fff : reserved\n00001000-0009d7ff : System RAM\n0009d800-0009ffff : reserved\n000a0000-000bffff : PCI Bus 0000:00\n000c0000-000cffff : Video ROM\n000d0000-000d3fff : PCI Bus 0000:00\n000d4000-000d7fff : PCI Bus 0000:00\n000d8000-000dbfff : PCI Bus 0000:00\n000dc000-000dffff : PCI Bus 0000:00\n000e0000-000fffff : reserved\n...\n...\n...\n```\n\nyou will see map of the system's memory for each physical device. Here the first column displays the memory registers used by each of the different types of memory. The second column lists the kind of memory located within those registers. 
Or for example:\n\n```\n$ sudo cat /proc/ioports\n\n0000-0cf7 : PCI Bus 0000:00\n  0000-001f : dma1\n  0020-0021 : pic1\n  0040-0043 : timer0\n  0050-0053 : timer1\n  0060-0060 : keyboard\n  0064-0064 : keyboard\n  0070-0077 : rtc0\n  0080-008f : dma page reg\n  00a0-00a1 : pic2\n  00c0-00df : dma2\n  00f0-00ff : fpu\n    00f0-00f0 : PNP0C04:00\n  03c0-03df : vga+\n  03f8-03ff : serial\n  04d0-04d1 : pnp 00:06\n  0800-087f : pnp 00:01\n  0a00-0a0f : pnp 00:04\n  0a20-0a2f : pnp 00:04\n  0a30-0a3f : pnp 00:04\n...\n...\n...\n```\n\ncan show us lists of currently registered port regions used for input or output communication with a device. All memory-mapped I/O addresses are not used by the kernel directly. So, before the Linux kernel can use such memory, it must map it to the virtual memory space which is the main purpose of the `ioremap` mechanism. Note that we saw only early `ioremap` in the previous [part](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2). Soon we will look at the implementation of the non-early `ioremap` function. But before this we must learn other things, like different types of memory allocators and etc., because otherwise it will be very difficult to understand it.\n\nSo, before we will move on to the non-early [memory management](https://en.wikipedia.org/wiki/Memory_management) of the Linux kernel, we will see some mechanisms which provide special abilities for [debugging](https://en.wikipedia.org/wiki/Debugging), check of [memory leaks](https://en.wikipedia.org/wiki/Memory_leak), memory control and etc. It will be easier to understand how memory management arranged in the Linux kernel after learning of all of these things.\n\nAs you already may guess from the title of this part, we will start to consider memory mechanisms from the [kmemcheck](https://www.kernel.org/doc/Documentation/kmemcheck.txt). 
As we always did in other [chapters](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), we will start to consider from theoretical side and will learn what is `kmemcheck` mechanism in general and only after this, we will see how it is implemented in the Linux kernel.\n\nSo let's start. What is `kmemcheck` in the Linux kernel? As you may guess from the name of this mechanism, the `kmemcheck` checks memory. That's true. Main point of the `kmemcheck` mechanism is to check whether some kernel code accesses `uninitialized memory`. Let's take following simple [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) program:\n\n```C\n#include <stdlib.h>\n#include <stdio.h>\n\nstruct A {\n        int a;\n};\n\nint main(int argc, char **argv) {\n        struct A *a = malloc(sizeof(struct A));\n        printf(\"a->a = %d\\n\", a->a);\n        return 0;\n}\n```\n\nHere we allocate memory for the `A` structure and try to print the value of the `a` field. If we compile this program without additional options:\n\n```\ngcc test.c -o test\n```\n\nThe [compiler](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) will not show us a warning that the `a` field is uninitialized. 
But if we run this program with the [valgrind](https://en.wikipedia.org/wiki/Valgrind) tool, we will see the following output:\n\n```\n~$   valgrind --leak-check=yes ./test\n==28469== Memcheck, a memory error detector\n==28469== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.\n==28469== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info\n==28469== Command: ./test\n==28469==\n==28469== Conditional jump or move depends on uninitialised value(s)\n==28469==    at 0x4E820EA: vfprintf (in /usr/lib64/libc-2.22.so)\n==28469==    by 0x4E88D48: printf (in /usr/lib64/libc-2.22.so)\n==28469==    by 0x4005B9: main (in /home/alex/test)\n==28469==\n==28469== Use of uninitialised value of size 8\n==28469==    at 0x4E7E0BB: _itoa_word (in /usr/lib64/libc-2.22.so)\n==28469==    by 0x4E8262F: vfprintf (in /usr/lib64/libc-2.22.so)\n==28469==    by 0x4E88D48: printf (in /usr/lib64/libc-2.22.so)\n==28469==    by 0x4005B9: main (in /home/alex/test)\n...\n...\n...\n```\n\nActually the `kmemcheck` mechanism does for the kernel what `valgrind` does for userspace programs. It checks for the use of uninitialized memory.\n\nTo enable this mechanism in the Linux kernel, you need to enable the `CONFIG_KMEMCHECK` kernel configuration option in the:\n\n```\nKernel hacking\n  -> Memory Debugging\n```\n\nmenu of the Linux kernel configuration:\n\n![kernel configuration menu](images/kernel_configuration_menu1.png)\n\nWe may not only enable support of the `kmemcheck` mechanism in the Linux kernel, but it also provides some configuration options for us. We will see all of these options in the next paragraph of this part. Last note before we will consider how does the `kmemcheck` check memory. Now this mechanism is implemented only for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. 
You can be sure if you will look in the [arch/x86/Kconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Kconfig) `x86` related kernel configuration file, you will see following lines:\n\n```\nconfig X86\n  ...\n  ...\n  ...\n  select HAVE_ARCH_KMEMCHECK\n  ...\n  ...\n  ...\n```\n\nSo, there isn't anything which is specific for other architectures.\n\nOk, so we know that `kmemcheck` provides mechanism to check usage of `uninitialized memory` in the Linux kernel and how to enable it. How it does these checks? When the Linux kernel tries to allocate some memory i.e. something is called like this:\n\n```C\nstruct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL);\n```\n\nor in other words somebody wants to access a [page](https://en.wikipedia.org/wiki/Page_%28computer_memory%29), a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception is generated. This is achieved by the fact that the `kmemcheck` marks memory pages as `non-present` (more about this you can read in the special part which is devoted to [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)). If a `page fault` exception occurred, the exception handler knows about it and in a case when the `kmemcheck` is enabled it transfers control to it. After the `kmemcheck` will finish its checks, the page will be marked as `present` and the interrupted code will be able to continue execution. There is little subtlety in this chain. When the first instruction of interrupted code will be executed, the `kmemcheck` will mark the page as `non-present` again. In this way next access to memory will be caught again.\n\nWe just considered the `kmemcheck` mechanism from theoretical side. 
Now let's consider how it is implemented in the Linux kernel.\n\nImplementation of the `kmemcheck` mechanism in the Linux kernel\n--------------------------------------------------------------------------------\n\nSo, now we know what is it `kmemcheck` and what it does in the Linux kernel. Time to see at its implementation in the Linux kernel. Implementation of the `kmemcheck` is split in two parts. The first is generic part is located in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and the second [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture-specific part is located in the [arch/x86/mm/kmemcheck](https://github.com/torvalds/linux/tree/master/arch/x86/mm/kmemcheck) directory.\n\nLet's start from the initialization of this mechanism. We already know that to enable the `kmemcheck` mechanism in the Linux kernel, we must enable the `CONFIG_KMEMCHECK` kernel configuration option. But besides this, we need to pass one of following parameters:\n\n * kmemcheck=0 (disabled)\n * kmemcheck=1 (enabled)\n * kmemcheck=2 (one-shot mode)\n\nto the Linux kernel command line. The first two are clear, but the last needs a little explanation. This option switches the `kmemcheck` in a special mode when it will be turned off after detecting the first use of uninitialized memory. Actually this mode is enabled by default in the Linux kernel:\n\n![kernel configuration menu](images/kernel_configuration_menu2.png)\n\nWe know from the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the chapter which describes initialization of the Linux kernel that the kernel command line is parsed during initialization of the Linux kernel in `do_initcall_level`, `do_early_param` functions. Actually the `kmemcheck` subsystem consists from two stages. The first stage is early. 
If we will look at the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file, we will see the `param_kmemcheck` function which will be called during early command line parsing:\n\n```C\nstatic int __init param_kmemcheck(char *str)\n{\n\tint val;\n\tint ret;\n\n\tif (!str)\n\t\treturn -EINVAL;\n\n\tret = kstrtoint(str, 0, &val);\n\tif (ret)\n\t\treturn ret;\n\tkmemcheck_enabled = val;\n\treturn 0;\n}\n\nearly_param(\"kmemcheck\", param_kmemcheck);\n```\n\nAs we already saw, the `param_kmemcheck` may have one of the following values: `0` (disabled), `1` (enabled) or `2` (one-shot). The implementation of the `param_kmemcheck` is pretty simple. We just convert string value of the `kmemcheck` command line option to integer representation and set it to the `kmemcheck_enabled` variable.\n\nThe second stage will be executed during initialization of the Linux kernel, or more precisely during initialization of early [initcalls](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-3). The second stage is represented by the `kmemcheck_init`:\n\n```C\nint __init kmemcheck_init(void)\n{\n    ...\n    ...\n    ...\n}\n\nearly_initcall(kmemcheck_init);\n```\n\nMain goal of the `kmemcheck_init` function is to call the `kmemcheck_selftest` function and check its result:\n\n```C\nif (!kmemcheck_selftest()) {\n\tprintk(KERN_INFO \"kmemcheck: self-tests failed; disabling\\n\");\n\tkmemcheck_enabled = 0;\n\treturn -EINVAL;\n}\n\nprintk(KERN_INFO \"kmemcheck: Initialized\\n\");\n```\n\nand return with the `EINVAL` if this check fails. The `kmemcheck_selftest` function checks sizes of different memory access related [opcodes](https://en.wikipedia.org/wiki/Opcode) like `rep movsb`, `movzwq` and etc. 
If sizes of opcodes are equal to expected sizes, the `kmemcheck_selftest` will return `true` and `false` otherwise.\n\nSo when somebody calls:\n\n```C\nstruct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL);\n```\n\nthrough a series of different function calls the `kmem_getpages` function will be called. This function is defined in the [mm/slab.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/slab.c) source code file and main goal of this function tries to allocate [pages](https://en.wikipedia.org/wiki/Paging) with the given flags. In the end of this function we can see following code:\n\n```C\nif (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {\n\tkmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);\n\n    if (cachep->ctor)\n\t\tkmemcheck_mark_uninitialized_pages(page, nr_pages);\n\telse\n\t\tkmemcheck_mark_unallocated_pages(page, nr_pages);\n}\n```\n\nSo, here we check that the if `kmemcheck` is enabled and the `SLAB_NOTRACK` bit is not set in flags we set `non-present` bit for the just allocated page. The `SLAB_NOTRACK` bit tell us to not track uninitialized memory. Additionally we check if a cache object has constructor (details will be considered in next parts) we mark allocated page as uninitialized or unallocated otherwise. The `kmemcheck_alloc_shadow` function is defined in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and does following things:\n\n```C\nvoid kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)\n{\n    struct page *shadow;\n\n   \tshadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);\n\n   \tfor(i = 0; i < pages; ++i)\n\t\tpage[i].shadow = page_address(&shadow[i]);\n\n   \tkmemcheck_hide_pages(page, pages);\n}\n```\n\nFirst of all it allocates memory space for the shadow bits. 
If this bit is set in a page, this means that this page is tracked by the `kmemcheck`. After we allocated space for the shadow bit, we fill all allocated pages with this bit. In the end we just call the `kmemcheck_hide_pages` function with the pointer to the allocated page and number of these pages. The `kmemcheck_hide_pages` is architecture-specific function, so its implementation is located in the [arch/x86/mm/kmemcheck/kmemcheck.c](https://github.com/torvalds/linux/tree/master/arch/x86/mm/kmemcheck/kmemcheck.c) source code file. The main goal of this function is to set `non-present` bit in given pages. Let's look at the implementation of this function:\n\n```C\nvoid kmemcheck_hide_pages(struct page *p, unsigned int n)\n{\n\tunsigned int i;\n\n\tfor (i = 0; i < n; ++i) {\n\t\tunsigned long address;\n\t\tpte_t *pte;\n\t\tunsigned int level;\n\n\t\taddress = (unsigned long) page_address(&p[i]);\n\t\tpte = lookup_address(address, &level);\n\t\tBUG_ON(!pte);\n\t\tBUG_ON(level != PG_LEVEL_4K);\n\n\t\tset_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));\n\t\tset_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));\n\t\t__flush_tlb_one(address);\n\t}\n}\n```\n\nHere we go through all pages and try to get `page table entry` for each page. If this operation was successful, we unset present bit and set hidden bit in each page. In the end we flush [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer), because some pages was changed. From this point allocated pages are tracked by the `kmemcheck`. 
Now, as the `present` bit is unset, a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception will occur right after `kmalloc` returns a pointer to the allocated space and code tries to access this memory.\n\nAs you may remember from the [second part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) of the Linux kernel initialization chapter, the `page fault` handler is located in the [arch/x86/mm/fault.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/fault.c) source code file and represented by the `do_page_fault` function. We can see the following check at the beginning of the `do_page_fault` function:\n\n```C\nstatic noinline void\n__do_page_fault(struct pt_regs *regs, unsigned long error_code,\n\t\tunsigned long address)\n{\n    ...\n    ...\n    ...\n\tif (kmemcheck_active(regs))\n\t\tkmemcheck_hide(regs);\n    ...\n    ...\n    ...\n}\n```\n\nThe `kmemcheck_active` gets the `kmemcheck_context` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) structure and returns the result of comparison of the `balance` field of this structure with zero:\n\n```\nbool kmemcheck_active(struct pt_regs *regs)\n{\n\tstruct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);\n\n\treturn data->balance > 0;\n}\n```\n\nThe `kmemcheck_context` is a structure which describes the current state of the `kmemcheck` mechanism. It stores uninitialized addresses, the number of such addresses, etc. The `balance` field of this structure represents the current state of `kmemcheck`, or in other words it tells us whether `kmemcheck` has already hidden pages or not yet. If the `data->balance` is greater than zero, the `kmemcheck_hide` function will be called. This means that `kmemcheck` has already set the `present` bit for the given pages and now we need to hide the pages again so that the next step causes a page fault. This function will hide addresses of pages again by unsetting the `present` bit. 
This means that one session of `kmemcheck` has already finished and a new page fault has occurred. At the first step the `kmemcheck_active` will return false as the `data->balance` is zero at the start and the `kmemcheck_hide` will not be called. Next, we may see the following line of code in the `do_page_fault`:\n\n```C\nif (kmemcheck_fault(regs, address, error_code))\n\t\treturn;\n```\n\nFirst of all the `kmemcheck_fault` function checks that the fault occurred for the correct reason. At first we check the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) and check that we are in normal kernel mode:\n\n```C\nif (regs->flags & X86_VM_MASK)\n\t\treturn false;\nif (regs->cs != __KERNEL_CS)\n\t\treturn false;\n```\n\nIf these checks weren't successful we return from the `kmemcheck_fault` function as it was not a `kmemcheck` related page fault. After this we try to look up a `page table entry` related to the faulted address and if we can't find it we return:\n\n```C\npte = kmemcheck_pte_lookup(address);\nif (!pte)\n\treturn false;\n```\n\nThe last two steps of the `kmemcheck_fault` function are to call the `kmemcheck_access` function, which checks access to the given page, and to show addresses again by setting the present bit in the given page. The `kmemcheck_access` function does the main job. It checks the current instruction which caused a page fault. 
If it finds an error, the context of this error will be saved by `kmemcheck` to the ring queue:\n\n```C\nstatic struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];\n```\n\nThe `kmemcheck` mechanism declares special [tasklet](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9):\n\n```C\nstatic DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);\n```\n\nwhich runs the `do_wakeup` function from the [arch/x86/mm/kmemcheck/error.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/kmemcheck/error.c) source code file when it is scheduled to run.\n\nThe `do_wakeup` function will call the `kmemcheck_error_recall` function which will print errors collected by `kmemcheck`. As we already saw the:\n\n```C\nkmemcheck_show(regs);\n```\n\nfunction will be called in the end of the `kmemcheck_fault` function. This function will set present bit for the given pages again:\n\n```C\nif (unlikely(data->balance != 0)) {\n\tkmemcheck_show_all();\n\tkmemcheck_error_save_bug(regs);\n\tdata->balance = 0;\n\treturn;\n}\n```\n\nWhere the `kmemcheck_show_all` function calls the `kmemcheck_show_addr` for each address:\n\n```C\nstatic unsigned int kmemcheck_show_all(void)\n{\n\tstruct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);\n\tunsigned int i;\n\tunsigned int n;\n\n\tn = 0;\n\tfor (i = 0; i < data->n_addrs; ++i)\n\t\tn += kmemcheck_show_addr(data->addr[i]);\n\n\treturn n;\n}\n```\n\nby the call of the `kmemcheck_show_addr`:\n\n```C\nint kmemcheck_show_addr(unsigned long address)\n{\n\tpte_t *pte;\n\n\tpte = kmemcheck_pte_lookup(address);\n\tif (!pte)\n\t\treturn 0;\n\n\tset_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));\n\t__flush_tlb_one(address);\n\treturn 1;\n}\n```\n\nIn the end of the `kmemcheck_show` function we set the [TF](https://en.wikipedia.org/wiki/Trap_flag) flag if it wasn't set:\n\n```C\nif (!(regs->flags & X86_EFLAGS_TF))\n\tdata->flags = regs->flags;\n```\n\nWe need to do it because 
we need to hide pages again after the first instruction following the handled page fault has been executed. When the `TF` flag is set, the processor switches into single-step mode, so a `debug` exception will occur after the first instruction is executed. From this moment pages will be hidden again and execution will continue. As the pages are hidden from this moment, a page fault exception will occur again and `kmemcheck` will continue to check/collect errors and print them from time to time.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the third part about Linux kernel [memory management](https://en.wikipedia.org/wiki/Memory_management). If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see yet another memory debugging related tool - `kmemleak`.\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [memory management](https://en.wikipedia.org/wiki/Memory_management)\n* [debugging](https://en.wikipedia.org/wiki/Debugging)\n* [memory leaks](https://en.wikipedia.org/wiki/Memory_leak)\n* [kmemcheck documentation](https://www.kernel.org/doc/Documentation/kmemcheck.txt)\n* [valgrind](https://en.wikipedia.org/wiki/Valgrind)\n* [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)\n* [page fault](https://en.wikipedia.org/wiki/Page_fault)\n* [initcalls](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-3)\n* [opcode](https://en.wikipedia.org/wiki/Opcode)\n* [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer)\n* [per-cpu variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [flags register](https://en.wikipedia.org/wiki/FLAGS_register)\n* [tasklet](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2)\n"
  },
  {
    "path": "Makefile",
    "content": "### HELP\n\n.PHONY: help\nhelp: ## Print help\n\t@egrep \"(^### |^\\S+:.*##\\s)\" Makefile | sed 's/^###\\s*//' | sed 's/^\\(\\S*\\)\\:.*##\\s*\\(.*\\)/  \\1 - \\2/'\n\n### DOCKER\n\n.PHONY: run\nrun: image ## docker run ...\n\t(docker stop linux-insides-book 2>&1) > /dev/null || true\n\tdocker run --detach -p 4000:4000 --name linux-insides-book --hostname linux-insides-book linux-insides-book \n\n.PHONY: start\nstart: ## start the docker container ...\n\tdocker start linux-insides-book\n\n.PHONY: image\nimage: ## docker image build ...\n\tdocker image build --rm --squash --label linux-insides --tag linux-insides-book:latest -f Dockerfile . 2> /dev/null || \\\n\tdocker image build --rm --label linux-insides --tag linux-insides-book:latest -f Dockerfile . \n\n.PHONY: sh\nsh: ## run interactive shell inside an already running docker container ...\n\tdocker exec -it linux-insides-book bash\n\n.PHONY: rm\nrm: ## remove the docker container ...\n\t(docker stop linux-insides-book 2>&1) > /dev/null || true\n\t(docker rm linux-insides-book 2>&1) > /dev/null || true\n\n.PHONY: logs\nlogs: ## gather logs from the docker container ...\n\tdocker logs linux-insides-book\n\n.PHONY: export\nexport: ## run e-book generation inside an already running docker container ...\n\tdocker exec linux-insides-book /bin/bash -c ' \\\n\tfind . -type f -name '*.svg' -a ! 
\\( -path \"./.github/*\" -o -path \"./_book/*\" \\) -print0 | while IFS= read -r -d \"\" svg_file; do \\\n    output_file=\"$${svg_file%.svg}.png\"; \\\n    chapter_dir=$$(dirname $$(dirname \"$$svg_file\")); \\\n    svg_relative_path=\"$${svg_file#$$chapter_dir/}\"; \\\n    output_relative_path=\"$${output_file#$$chapter_dir/}\"; \\\n    inkscape --export-png=\"$$output_file\" \\\n             --export-area-page \\\n             --export-dpi=150 \\\n             \"$$svg_file\"; \\\n    find \"$$chapter_dir\" -maxdepth 1 -type f -name \"*.md\" -print0 | xargs -0 sed -i \"s|\\\\([/ \\\\t\\\\(]\\\\)$${svg_relative_path}|\\\\1$${output_relative_path}|g\"; \\\n\tdone; \\\n\tgitbook epub; \\\n\tgitbook mobi; \\\n\tgitbook pdf; \\\n\tmv book.pdf book-A4.pdf; \\\n\tmv book-A5.json book.json; \\\n\tgitbook pdf; \\\n\tmv book.pdf book-A5.pdf; \\\n\tmv book-A4.pdf book.pdf'\n\n.PHONY: cp\ncp: ## copy all exported e-book formats to current working directory ...\n\tdocker cp linux-insides-book:/srv/gitbook/book.epub \"Linux Inside - 0xAX.epub\"\n\tdocker cp linux-insides-book:/srv/gitbook/book.mobi \"Linux Inside - 0xAX.mobi\"\n\tdocker cp linux-insides-book:/srv/gitbook/book.pdf \"Linux Inside - 0xAX.pdf\"\n\tdocker cp linux-insides-book:/srv/gitbook/book-A5.pdf \"Linux Inside - 0xAX (A5).pdf\"\n\n.PHONY: clean\nclean: ## remove all exported e-book files ...\n\trm \"Linux Inside - 0xAX.epub\" \\\n\t\t \"Linux Inside - 0xAX.mobi\" \\\n\t\t \"Linux Inside - 0xAX.pdf\" \\\n\t\t \"Linux Inside - 0xAX (A5).pdf\"\n\n### LAUNCH BROWSER\n\n.PHONY: browse\nbrowse: ## Launch broweser\n\t@timeout 60 sh -c 'until nc -z 127.0.0.1 4000; do sleep 1; done' || true\n\t@(uname | grep Darwin > /dev/null) && open http://127.0.0.1:4000 || true\n\t@(uname | grep Linux > /dev/null) && xdg-open http://127.0.0.1:4000 || true\n"
  },
  {
    "path": "Misc/README.md",
    "content": "# Misc\n\nThis chapter contains parts which are not directly related to the Linux kernel source code and implementation of different subsystems.\n"
  },
  {
    "path": "Misc/linux-misc-1.md",
    "content": "Linux kernel development\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nAs you already may know, I've started a series of [blog posts](https://0xax.github.io/categories/assembler/) about assembler programming for `x86_64` architecture in the last year. I have never written a line of low-level code before this moment, except for a couple of toy `Hello World` examples in university. It was a long time ago and, as I already said, I didn't write low-level code at all. Some time ago I became interested in such things. I understood that I can write programs, but didn't actually understand how my program is arranged.\n\nAfter writing some assembler code I began to understand how my program looks after compilation, **approximately**. But anyway, I didn't understand many other things. For example: what occurs when the `syscall` instruction is executed in my assembler, what occurs when the `printf` function starts to work or how can my program talk with other computers via network. [Assembler](https://en.wikipedia.org/wiki/Assembly_language#Assembler) programming language didn't give me answers to my questions and I decided to go deeper in my research. I started to learn from the source code of the Linux kernel and tried to understand the things that I'm interested in. The source code of the Linux kernel didn't give me the answers to **all** of my questions, but now my knowledge about the Linux kernel and the processes around it is much better.\n\nI'm writing this part nine and a half months after I've started to learn from the source code of the Linux kernel and published the first [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-1) of this book. Now it contains forty parts and it is not the end. I decided to write this series about the Linux kernel mostly for myself. 
As you know the Linux kernel is a very large piece of code and it is easy to forget what this or that part of the Linux kernel means and how it implements something. But soon the [linux-insides](https://github.com/0xAX/linux-insides) repo became popular and after nine months it has `9096` stars:\n\n![github](images/github.png)\n\nIt seems that people are interested in the insides of the Linux kernel. Besides this, in all the time that I have been writing `linux-insides`, I have received many questions from different people about how to begin contributing to the Linux kernel. Generally people are interested in contributing to open source projects and the Linux kernel is not an exception:\n\n![google-linux](images/google_linux.png)\n\nSo, it seems that people are interested in the Linux kernel development process. I thought it would be strange if a book about the Linux kernel did not contain a part describing how to take part in the Linux kernel development and that's why I decided to write it. You will not find information about why you should be interested in contributing to the Linux kernel in this part. But if you are interested in how to start with Linux kernel development, this part is for you.\n\nLet's start.\n\nHow to start with Linux kernel\n---------------------------------------------------------------------------------\n\nFirst of all, let's see how to get, build, and run the Linux kernel. You can run your custom build of the Linux kernel in two ways:\n\n* Run the Linux kernel on a virtual machine;\n* Run the Linux kernel on real hardware.\n\nI'll provide descriptions for both methods. Before we start doing anything with the Linux kernel, we need to get it. There are a couple of ways to do this depending on your purpose. 
If you just want to update the current version of the Linux kernel on your computer, you can use the instructions specific to your Linux [distro](https://en.wikipedia.org/wiki/Linux_distribution).\n\nIn the first case you just need to download new version of the Linux kernel with the [package manager](https://en.wikipedia.org/wiki/Package_manager). For example, to upgrade the version of the Linux kernel to `4.1` for [Ubuntu (Vivid Vervet)](http://releases.ubuntu.com/15.04/), you will just need to execute the following commands:\n\n```\n$ sudo add-apt-repository ppa:kernel-ppa/ppa\n$ sudo apt-get update\n```\n\nAfter this execute this command:\n\n```\n$ apt-cache showpkg linux-headers\n```\n\nand choose the version of the Linux kernel in which you are interested. In the end execute the next command and replace `${version}` with the version that you chose in the output of the previous command:\n\n```\n$ sudo apt-get install linux-headers-${version} linux-headers-${version}-generic linux-image-${version}-generic --fix-missing\n```\n\nand reboot your system. After the reboot you will see the new kernel in the [grub](https://en.wikipedia.org/wiki/GNU_GRUB) menu.\n\nIn the other way if you are interested in the Linux kernel development, you will need to get the source code of the Linux kernel. You can find it on the [kernel.org](https://kernel.org/) website and download an archive with the Linux kernel source code. Actually the Linux kernel development process is fully built around `git` [version control system](https://en.wikipedia.org/wiki/Version_control). So you can get it with `git` from the `kernel.org`:\n\n```\n$ git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git\n```\n\nI don't know how about you, but I prefer `github`. 
There is a [mirror](https://github.com/torvalds/linux) of the Linux kernel mainline repository, so you can clone it with:\n\n```\n$ git clone git@github.com:torvalds/linux.git\n```\n\nI  use my own [fork](https://github.com/0xAX/linux) for development and when I want to pull updates from the main repository I just execute the following command:\n\n```\n$ git checkout master\n$ git pull upstream master\n```\n\nNote that the remote name of the main repository is `upstream`. To add a new remote with the main Linux repository you can execute:\n\n```\ngit remote add upstream git@github.com:torvalds/linux.git\n```\n\nAfter this you will have two remotes:\n\n```\n~/dev/linux (master) $ git remote -v\norigin\tgit@github.com:0xAX/linux.git (fetch)\norigin\tgit@github.com:0xAX/linux.git (push)\nupstream\thttps://github.com/torvalds/linux.git (fetch)\nupstream\thttps://github.com/torvalds/linux.git (push)\n```\n\nOne is of your fork (`origin`) and the second is for the main repository (`upstream`).\n\nNow that we have a local copy of the Linux kernel source code, we need to configure and build it. The Linux kernel can be configured in different ways. The simplest way is to just copy the configuration file of the already installed kernel that is located in the `/boot` directory:\n\n```\n$ sudo cp /boot/config-$(uname -r) ~/dev/linux/.config\n```\n\nIf your current Linux kernel was built with the support for access to the `/proc/config.gz` file, you can copy your actual kernel configuration file with this command:\n\n```\n$ cat /proc/config.gz | gunzip > ~/dev/linux/.config\n```\n\nIf you are not satisfied with the standard kernel configuration that is provided by the maintainers of your distro, you can configure the Linux kernel manually. There are a couple of ways to do it. The Linux kernel root [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile) provides a set of targets that allows you to configure it. 
For example `menuconfig` provides a menu-driven interface for the kernel configuration:\n\n![menuconfig](images/menuconfig.png)\n\nThe `defconfig` argument generates the default kernel configuration file for the current architecture, for example [x86_64 defconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/configs/x86_64_defconfig). You can pass the `ARCH` command line argument to `make` to build `defconfig` for the given architecture:\n\n```\n$ make ARCH=arm64 defconfig\n```\n\nThe `allnoconfig`, `allyesconfig` and `allmodconfig` arguments allow you to generate a new configuration file where all options will be disabled, enabled, and enabled as modules respectively. The `nconfig` command line argument provides an `ncurses` based program with a menu to configure the Linux kernel:\n\n![nconfig](images/nconfig.png)\n\nThere is even `randconfig` to generate a random Linux kernel configuration file. I will not write about how to configure the Linux kernel or which options to enable because it makes no sense to do so for two reasons: First of all I do not know your hardware and second, if you know your hardware, the only remaining task is to find out how to use programs for kernel configuration, and all of them are pretty simple to use.\n\nOK, we now have the source code of the Linux kernel and configured it. The next step is the compilation of the Linux kernel. 
The simplest way to compile the Linux kernel is to just execute:\n\n```\n$ make\nscripts/kconfig/conf  --silentoldconfig Kconfig\n#\n# configuration written to .config\n#\n  CHK     include/config/kernel.release\n  UPD     include/config/kernel.release\n  CHK     include/generated/uapi/linux/version.h\n  CHK     include/generated/utsrelease.h\n  ...\n  ...\n  ...\n  OBJCOPY arch/x86/boot/vmlinux.bin\n  AS      arch/x86/boot/header.o\n  LD      arch/x86/boot/setup.elf\n  OBJCOPY arch/x86/boot/setup.bin\n  BUILD   arch/x86/boot/bzImage\n  Setup is 15740 bytes (padded to 15872 bytes).\nSystem is 4342 kB\nCRC 82703414\nKernel: arch/x86/boot/bzImage is ready  (#73)\n```\n\nTo increase the speed of kernel compilation you can pass the `-jN` command line argument to `make`, where `N` specifies the number of commands to run simultaneously:\n\n```\n$ make -j8\n```\n\nIf you want to build the Linux kernel for an architecture that differs from your current one, the simplest way to do it is to pass two arguments:\n\n* `ARCH` command line argument and the name of the target architecture;\n* `CROSS_COMPILE` command line argument and the cross-compiler tool prefix;\n\nFor example if we want to compile the Linux kernel for the [arm64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64_features) with the default kernel configuration file, we need to execute the following commands:\n\n```\n$ make -j4 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig\n$ make -j4 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu-\n```\n\nAs a result of compilation we can see the compressed kernel - `arch/x86/boot/bzImage`. 
Now that we have compiled the kernel, we can either install it on our computer or just run it in an emulator.\n\nInstalling Linux kernel\n--------------------------------------------------------------------------------\n\nAs I already wrote we will consider two ways to launch new kernel: in the first case we can install and run the new version of the Linux kernel on the real hardware and the second is launch the Linux kernel on a virtual machine. In the previous paragraph we saw how to build the Linux kernel from source code and as a result we have got compressed image:\n\n```\n...\n...\n...\nKernel: arch/x86/boot/bzImage is ready  (#73)\n```\n\nAfter we have got the [bzImage](https://en.wikipedia.org/wiki/Vmlinux#bzImage) we need to install `headers`, `modules` of the new Linux kernel with the:\n\n```\n$ sudo make headers_install\n$ sudo make modules_install\n```\n\nand directly the kernel itself:\n\n```\n$ sudo make install\n```\n\nFrom this moment we have installed new version of the Linux kernel and now we must tell the `bootloader` about it. Of course we can add it manually by the editing of the `/boot/grub2/grub.cfg` configuration file, but I prefer to use a script for this purpose. I'm using two different Linux distros: Fedora and Ubuntu. There are two different ways to update the [grub](https://en.wikipedia.org/wiki/GNU_GRUB) configuration file. 
I'm using following script for this purpose:\n\n```shell\n#!/bin/bash\n\nsource \"term-colors\"\n\nDISTRIBUTIVE=$(cat /etc/*-release | grep NAME | head -1 | sed -n -e 's/NAME\\=//p')\necho -e \"Distributive: ${Green}${DISTRIBUTIVE}${Color_Off}\"\n\nif [[ \"$DISTRIBUTIVE\" == \"Fedora\" ]] ;\nthen\n    su -c 'grub2-mkconfig -o /boot/grub2/grub.cfg'\nelse\n    sudo update-grub\nfi\n\necho \"${Green}Done.${Color_Off}\"\n```\n\nThis is the last step of the new Linux kernel installation and after this you can reboot your computer and select new version of the kernel during boot.\n\nThe second case is to launch new Linux kernel in the virtual machine. I prefer [qemu](https://en.wikipedia.org/wiki/QEMU). First of all we need to build initial ramdisk - [initrd](https://en.wikipedia.org/wiki/Initrd) for this. The `initrd` is a temporary root file system that is used by the Linux kernel during initialization process while other filesystems are not mounted. We can build `initrd` with the following commands:\n\nFirst of all we need to download [busybox](https://en.wikipedia.org/wiki/BusyBox) and run `menuconfig` for its configuration:\n\n```shell\n$ mkdir initrd\n$ cd initrd\n$ curl http://busybox.net/downloads/busybox-1.23.2.tar.bz2 | tar xjf -\n$ cd busybox-1.23.2/\n$ make menuconfig\n$ make -j4\n```\n\n`busybox` is an executable file - `/bin/busybox` that contains a set of standard tools like [coreutils](https://en.wikipedia.org/wiki/GNU_Core_Utilities). In the `busybox` menu we need to enable: `Build BusyBox as a static binary (no shared libs)` option:\n\n![busysbox menu](https://i.imgur.com/TxPRCzQ.png)\n\nWe can find this menu in the:\n\n```\nBusybox Settings\n--> Build Options\n```\n\nAfter this we exit from the `busybox` configuration menu and execute following commands for building and installation of it:\n\n```\n$ make -j4\n$ sudo make install\n```\n\nNow that `busybox` is installed, we can begin building our `initrd`. 
To do this, we go to the previous `initrd` directory and:\n\n```\n$ cd ..\n$ mkdir -p initramfs\n$ cd initramfs\n$ mkdir -pv {bin,sbin,etc,proc,sys,usr/{bin,sbin}}\n$ cp -av ../busybox-1.23.2/_install/* .\n```\n\ncopy `busybox` files to the `bin`, `sbin` and other directories. Now we need to create an executable `init` file that will be executed as the first process in the system. My `init` file just mounts [procfs](https://en.wikipedia.org/wiki/Procfs) and [sysfs](https://en.wikipedia.org/wiki/Sysfs) filesystems and executes a shell:\n\n```shell\n#!/bin/sh\n\nmount -t proc none /proc\nmount -t sysfs none /sys\n\nexec /bin/sh\n```\n\nNow we can create an archive that will be our `initrd`:\n\n```\n$ find . -print0 | cpio --null -ov --format=newc | gzip -9 > ~/dev/initrd_x86_64.gz\n```\n\nWe can now run our kernel in the virtual machine. As I already wrote I prefer [qemu](https://en.wikipedia.org/wiki/QEMU) for this. We can run our kernel with the following command:\n\n```\n$ qemu-system-x86_64 -snapshot -m 8G -serial stdio -kernel ~/dev/linux/arch/x86_64/boot/bzImage -initrd ~/dev/initrd_x86_64.gz -append \"root=/dev/sda1 ignore_loglevel\"\n```\n\n![qemu](images/qemu.png)\n\nFrom now on we can run the Linux kernel in the virtual machine and this means that we can begin to change and test the kernel.\n\nConsider using [ivandavidov/minimal](https://github.com/ivandavidov/minimal) or [Buildroot](https://buildroot.org/) to automate the process of generating initrd.\n\nGetting started with the Linux Kernel Development\n---------------------------------------------------------------------------------\n\nThe main point of this paragraph is to answer two questions: what to do and what not to do before sending your first patch to the Linux kernel. Please, do not confuse this `to do` with `todo`. I have no answer what you can fix in the Linux kernel. 
I just want to tell you my workflow during experimenting with the Linux kernel source code.\n\nFirst of all I pull the latest updates from Linus's repo with the following commands:\n\n```\n$ git checkout master\n$ git pull upstream master\n```\n\nAs soon as your local copy of the Linux kernel source code is in sync with the [mainline](https://github.com/torvalds/linux) repository, we can start to apply changes to it. As I already wrote, I have no advice for where you should start and what `TODO` to choose within the Linux kernel. But the best place for newbies is the `staging` tree. In other words the set of drivers from the [drivers/staging](https://github.com/torvalds/linux/tree/master/drivers/staging) directory. The maintainer of this tree is [Greg Kroah-Hartman](https://en.wikipedia.org/wiki/Greg_Kroah-Hartman) and the `staging` drivers are a good target for trivial patch fixes. Let's look at this simple example, that describes how to generate a patch, check it and send it to the [Linux kernel mailing list](https://lkml.org/).\n\nIf we look in the driver for the [Digi International EPCA PCI](https://github.com/torvalds/linux/tree/master/drivers/staging/dgap) based devices, we will see the `dgap_sindex` function on line 295:\n\n```C\nstatic char *dgap_sindex(char *string, char *group)\n{\n\tchar *ptr;\n\n\tif (!string || !group)\n\t\treturn NULL;\n\n\tfor (; *string; string++) {\n\t\tfor (ptr = group; *ptr; ptr++) {\n\t\t\tif (*ptr == *string)\n\t\t\t\treturn string;\n\t\t}\n\t}\n\n\treturn NULL;\n}\n```\n\nThis function looks for a match of any character in the group and returns that position. During research of the source code of the Linux kernel, I have noted that the [lib/string.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/string.c#L473) source code file contains the implementation of the `strpbrk` function that does the same thing as `dgap_sindex`. 
It is not a good idea to use a custom implementation of a function that already exists, so we can remove the `dgap_sindex` function from the [drivers/staging/dgap/dgap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/staging/dgap/dgap.c) source code file and use the `strpbrk` instead.\n\nFirst of all let's create new `git` branch based on the current master that synced with the Linux kernel mainline repo:\n\n```\n$ git checkout -b \"dgap-remove-dgap_sindex\"\n```\n\nAnd now we can replace the `dgap_sindex` with the `strpbrk`. After we did all changes we need to recompile the Linux kernel or just [dgap](https://github.com/torvalds/linux/tree/master/drivers/staging/dgap) directory. Do not forget to enable this driver in the kernel configuration. You can find it in the:\n\n```\nDevice Drivers\n--> Staging drivers\n----> Digi EPCA PCI products\n```\n\n![dgap menu](images/dgap_menu.png)\n\nNow is time to make commit. I'm using following combination for this:\n\n```\n$ git add .\n$ git commit -s -v\n```\n\nAfter the last command an editor will be opened that will be chosen from `$GIT_EDITOR` or `$EDITOR` environment variable. The `-s` command line argument will add `Signed-off-by` line by the committer at the end of the commit log message. You can find this line in the end of each commit message, for example - [00cc1633](https://github.com/torvalds/linux/commit/00cc1633816de8c95f337608a1ea64e228faf771). The main point of this line is the tracking of who did a change. The `-v` option show unified diff between the HEAD commit and what would be committed at the bottom of the commit message. It is not necessary, but very useful sometimes. A couple of words about commit message. Actually a commit message consists from two parts:\n\nThe first part is on the first line and contains short description of changes. 
It starts from the `[PATCH]` prefix followed by a subsystem, driver or architecture name and, after the `:` symbol, a short description. In our case it will be something like this:\n\n```\n[PATCH] staging/dgap: Use strpbrk() instead of dgap_sindex()\n```\n\nAfter the short description we usually have an empty line and the full description of the commit. In our case it will be:\n\n```\nThe <linux/string.h> provides strpbrk() function that does the same that the\ndgap_sindex(). Let's use already defined function instead of writing custom.\n```\n\nAnd the `Signed-off-by` line at the end of the commit message. Note that each line of a commit message must not be longer than `80` symbols and the commit message must describe your changes in detail. Do not just write a commit message like: `Custom function removed`, you need to describe what you did and why. The patch reviewers must know what they review. Besides this, commit messages in this form are very helpful. Each time when we can't understand something, we can use [git blame](http://git-scm.com/docs/git-blame) to read the description of changes.\n\nAfter we have committed the changes, it is time to generate the patch. We can do it with the `format-patch` command:\n\n```\n$ git format-patch master\n0001-staging-dgap-Use-strpbrk-instead-of-dgap_sindex.patch\n```\n\nWe've passed the name of the branch (`master` in this case) to the `format-patch` command that will generate a patch with the last changes that are in the `dgap-remove-dgap_sindex` branch and are not in the `master` branch. As you can note, the `format-patch` command generates a file that contains the last changes and has a name that is based on the commit short description. If you want to generate a patch with a custom name, you can use the `--stdout` option:\n\n```\n$ git format-patch master --stdout > dgap-patch-1.patch\n```\n\nThe last step after we have generated our patch is to send it to the Linux kernel mailing list. 
Of course, you can use any email client, `git` provides a special command for this: `git send-email`. Before you send your patch, you need to know where to send it. Yes, you can just send it to the Linux kernel mailing list address which is `linux-kernel@vger.kernel.org`, but it is very likely that the patch will be ignored, because of the large flow of messages. The better choice would be to send the patch to the maintainers of the subsystem where you have made changes. To find the names of these maintainers use the `get_maintainer.pl` script. All you need to do is pass the file or directory where you wrote code.\n\n```\n$ ./scripts/get_maintainer.pl -f drivers/staging/dgap/dgap.c\nLidza Louina <lidza.louina@gmail.com> (maintainer:DIGI EPCA PCI PRODUCTS)\nMark Hounschell <markh@compro.net> (maintainer:DIGI EPCA PCI PRODUCTS)\nDaeseok Youn <daeseok.youn@gmail.com> (maintainer:DIGI EPCA PCI PRODUCTS)\nGreg Kroah-Hartman <gregkh@linuxfoundation.org> (supporter:STAGING SUBSYSTEM)\ndriverdev-devel@linuxdriverproject.org (open list:DIGI EPCA PCI PRODUCTS)\ndevel@driverdev.osuosl.org (open list:STAGING SUBSYSTEM)\nlinux-kernel@vger.kernel.org (open list)\n```\n\nYou will see the set of the names and related emails. Now we can send our patch with:\n\n```\n$ git send-email --to \"Lidza Louina <lidza.louina@gmail.com>\" \\\n  --cc \"Mark Hounschell <markh@compro.net>\"                   \\\n  --cc \"Daeseok Youn <daeseok.youn@gmail.com>\"                \\\n  --cc \"Greg Kroah-Hartman <gregkh@linuxfoundation.org>\"      \\\n  --cc \"driverdev-devel@linuxdriverproject.org\"               \\\n  --cc \"devel@driverdev.osuosl.org\"                           \\\n  --cc \"linux-kernel@vger.kernel.org\"\n```\n\nThat's all. The patch is sent and now you only have to wait for feedback from the Linux kernel developers. 
After you send a patch and a maintainer accepts it, you will find it in the maintainer's repository (for example [patch](https://git.kernel.org/cgit/linux/kernel/git/gregkh/staging.git/commit/?h=staging-testing&id=b9f7f1d0846f15585b8af64435b6b706b25a5c0b) that you saw in this part) and after some time the maintainer will send a pull request to Linus and you will see your patch in the mainline repository.\n\nThat's all.\n\nSome advice\n--------------------------------------------------------------------------------\n\nIn the end of this part I want to give you some advice that will describe what to do and what not to do during development of the Linux kernel:\n\n* Think, Think, Think. And think again before you decide to send a patch.\n\n* Each time when you have changed something in the Linux kernel source code - compile it. After any changes. Again and again. Nobody likes changes that don't even compile.\n\n* The Linux kernel has a coding style [guide](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/CodingStyle) and you need to comply with it. There is great script which can help to check your changes. This script is - [scripts/checkpatch.pl](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/checkpatch.pl). 
Just pass source code file with changes to it and you will see:\n\n```\n$ ./scripts/checkpatch.pl -f drivers/staging/dgap/dgap.c\nWARNING: Block comments use * on subsequent lines\n#94: FILE: drivers/staging/dgap/dgap.c:94:\n+/*\n+     SUPPORTED PRODUCTS\n\nCHECK: spaces preferred around that '|' (ctx:VxV)\n#143: FILE: drivers/staging/dgap/dgap.c:143:\n+\t{ PPCM,        PCI_DEV_XEM_NAME,     64, (T_PCXM|T_PCLITE|T_PCIBUS) },\n\n```\n\nAlso you can see problematic places with the help of the `git diff`:\n\n![git diff](images/git_diff.png)\n\n* [Linus doesn't accept github pull requests](https://github.com/torvalds/linux/pull/17#issuecomment-5654674)\n\n* If your change consists of several different and unrelated changes, you need to split the changes via separate commits. The `git format-patch` command will generate patches for each commit and the subject of each patch will contain a `[PATCH n/m]` prefix where `n` is the number of the patch and `m` is the total number of patches in the series. If you are planning to send a series of patches it will be helpful to pass the `--cover-letter` option to the `git format-patch` command. This will generate an additional file that will contain the cover letter that you can use to describe what your patchset changes. It is also a good idea to use the `--in-reply-to` option in the `git send-email` command. This option allows you to send your patch series in reply to your cover message. The structure of your patch series will look like this for a maintainer:\n\n```\n|--> cover letter\n  |----> patch_1\n  |----> patch_2\n```\n\nYou need to pass the `message-id`, which you can find in the output of `git send-email`, as the argument of the `--in-reply-to` option.\n\nIt's important that your email be in the [plain text](https://en.wikipedia.org/wiki/Plain_text) format. 
Generally, `send-email` and `format-patch` are very useful during development, so look at the documentation for the commands and you'll find some useful options such as: [git send-email](http://git-scm.com/docs/git-send-email) and [git format-patch](http://git-scm.com/docs/git-format-patch).\n\n* Do not be surprised if you do not get an immediate answer after you send your patch. Maintainers can be very busy.\n\n* The [scripts](https://github.com/torvalds/linux/tree/master/scripts) directory contains many different useful scripts that are related to Linux kernel development. We already saw two scripts from this directory: the `checkpatch.pl` and the `get_maintainer.pl` scripts. Outside of those scripts, you can find the [stackusage](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/stackusage) script that will print usage of the stack, [extract-vmlinux](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/extract-vmlinux) for extracting an uncompressed kernel image, and many others. Outside of the `scripts` directory you can find some very useful [scripts](https://github.com/lorenzo-stoakes/kernel-scripts) by [Lorenzo Stoakes](https://twitter.com/ljsloz) for kernel development.\n\n* Subscribe to the Linux kernel mailing list. There are a large number of letters every day on `lkml`, but it is very useful to read them and understand things such as the current state of the Linux kernel. Other than `lkml` there are [set](http://vger.kernel.org/vger-lists.html) mailing listings which are related to the different Linux kernel subsystems.\n\n* If your patch is not accepted the first time and you receive feedback from Linux kernel developers, make your changes and resend the patch with the `[PATCH vN]` prefix (where `N` is the number of patch version). 
For example:\n\n```\n[PATCH v2] staging/dgap: Use strpbrk() instead of dgap_sindex()\n```\n\nAlso it must contain a changelog that describes all changes from previous patch versions. Of course, this is not an exhaustive list of requirements for Linux kernel development, but some of the most important items were addressed.\n\nHappy Hacking!\n\nConclusion\n--------------------------------------------------------------------------------\n\nI hope this will help others join the Linux kernel community!\nIf you have any questions or suggestions, write me at [email](mailto:kuleshovmail@gmail.com) or ping [me](https://twitter.com/0xAX) on twitter.\n\nPlease note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please let me know via email or send a PR.\n\nLinks\n--------------------------------------------------------------------------------\n\n* [blog posts about assembly programming for x86_64](https://0xax.github.io/categories/assembler/)\n* [Assembler](https://en.wikipedia.org/wiki/Assembly_language#Assembler)\n* [distro](https://en.wikipedia.org/wiki/Linux_distribution)\n* [package manager](https://en.wikipedia.org/wiki/Package_manager)\n* [grub](https://en.wikipedia.org/wiki/GNU_GRUB)\n* [kernel.org](https://kernel.org/)\n* [version control system](https://en.wikipedia.org/wiki/Version_control)\n* [arm64](https://en.wikipedia.org/wiki/ARM_architecture#AArch64_features)\n* [bzImage](https://en.wikipedia.org/wiki/Vmlinux#bzImage)\n* [qemu](https://en.wikipedia.org/wiki/QEMU)\n* [initrd](https://en.wikipedia.org/wiki/Initrd)\n* [busybox](https://en.wikipedia.org/wiki/BusyBox)\n* [coreutils](https://en.wikipedia.org/wiki/GNU_Core_Utilities)\n* [procfs](https://en.wikipedia.org/wiki/Procfs)\n* [sysfs](https://en.wikipedia.org/wiki/Sysfs)\n* [Linux kernel mail listing archive](https://lkml.org/)\n* [Linux kernel coding style 
guide](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/CodingStyle)\n* [How to Get Your Change Into the Linux Kernel](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/SubmittingPatches)\n* [Linux Kernel Newbies](http://kernelnewbies.org/)\n* [plain text](https://en.wikipedia.org/wiki/Plain_text)\n"
  },
  {
    "path": "Misc/linux-misc-2.md",
    "content": "Process of the Linux kernel building\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nI won't tell you how to build and install a custom Linux kernel on your machine. If you need help with this, you can find many [resources](https://encrypted.google.com/search?q=building+linux+kernel#q=building+linux+kernel+from+source+code) that will help you do it. Instead, we will learn what occurs when you execute `make` in the root directory of the Linux kernel source code.\n\nWhen I started to study the source code of the Linux kernel, the [makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile) was the first file that I opened. And it was scary :). The [makefile](https://en.wikipedia.org/wiki/Make_%28software%29) contained `1591` lines of code when I wrote this part and the kernel was the [4.2.0-rc3](https://github.com/torvalds/linux/commit/52721d9d3334c1cb1f76219a161084094ec634dc) release.\n\nThis makefile is the top makefile in the Linux kernel source code and the kernel building starts here. Yes, it is big, but moreover, if you've read the source code of the Linux kernel you may have noted that all directories containing source code has its own makefile. Of course it is not possible to describe how each source file is compiled and linked, so we will only study the standard compilation case. You will not find here building of the kernel's documentation, cleaning of the kernel source code, [tags](https://en.wikipedia.org/wiki/Ctags) generation, [cross-compilation](https://en.wikipedia.org/wiki/Cross_compiler) related stuff, etc... 
We will start from the `make` execution with the standard kernel configuration file and will finish with the building of the [bzImage](https://en.wikipedia.org/wiki/Vmlinux#bzImage).\n\nIt would be better if you're already familiar with the [make](https://en.wikipedia.org/wiki/Make_%28software%29) util, but I will try to describe every piece of code in this part anyway.\n\nSo let's start.\n\nPreparation before the kernel compilation\n---------------------------------------------------------------------------------\n\nThere are many things to prepare before the kernel compilation can be started. The main point here is to find and configure\nthe type of compilation, to parse command line arguments that are passed to `make`, etc... So let's dive into the top `Makefile` of Linux kernel.\n\nThe top `Makefile` of Linux kernel is responsible for building two major products: [vmlinux](https://en.wikipedia.org/wiki/Vmlinux) (the resident kernel image) and the modules (any module files). The [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile) of the Linux kernel starts with the definition of following variables:\n\n```Makefile\nVERSION = 4\nPATCHLEVEL = 2\nSUBLEVEL = 0\nEXTRAVERSION = -rc3\nNAME = Hurr durr I'm a sheep\n```\n\nThese variables determine the current version of Linux kernel and are used in different places, for example in the forming of the `KERNELVERSION` variable in the same `Makefile`:\n\n```Makefile\nKERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION)\n```\n\nAfter this we can see a couple of `ifeq` conditions that check some of the parameters passed to `make`. The Linux kernel `makefiles` provides a special `make help` target that prints all available targets and some of the command line arguments that can be passed to `make`. For example : `make V=1` => verbose build. 
The first `ifeq` checks whether the `V=n` option is passed to `make`:\n\n```Makefile\nifeq (\"$(origin V)\", \"command line\")\n  KBUILD_VERBOSE = $(V)\nendif\nifndef KBUILD_VERBOSE\n  KBUILD_VERBOSE = 0\nendif\n\nifeq ($(KBUILD_VERBOSE),1)\n  quiet =\n  Q =\nelse\n  quiet=quiet_\n  Q = @\nendif\n\nexport quiet Q KBUILD_VERBOSE\n```\n\nIf this option is passed to `make`, we set the `KBUILD_VERBOSE` variable to the value of `V` option. Otherwise we set the `KBUILD_VERBOSE` variable to zero. After this we check the value of `KBUILD_VERBOSE` variable and set values of the `quiet` and `Q` variables depending on the value of `KBUILD_VERBOSE` variable. The `@` symbols suppress the output of command. And if it is present before a command the output will be something like this: `CC scripts/mod/empty.o` instead of `Compiling .... scripts/mod/empty.o`. In the end we just export all of these variables. The next `ifeq` statement checks that `O=/dir` option was passed to the `make`. This option allows to locate all output files in the given `dir`:\n\n```Makefile\nifeq ($(KBUILD_SRC),)\n\nifeq (\"$(origin O)\", \"command line\")\n  KBUILD_OUTPUT := $(O)\nendif\n\nifneq ($(KBUILD_OUTPUT),)\nsaved-output := $(KBUILD_OUTPUT)\nKBUILD_OUTPUT := $(shell mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) \\\n\t\t\t\t\t\t\t\t&& /bin/pwd)\n$(if $(KBUILD_OUTPUT),, \\\n     $(error failed to create output directory \"$(saved-output)\"))\n\nsub-make: FORCE\n\t$(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \\\n\t-f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))\n\nskip-makefile := 1\nendif # ifneq ($(KBUILD_OUTPUT),)\nendif # ifeq ($(KBUILD_SRC),)\n```\n\nWe check the `KBUILD_SRC` that represents the top directory of the kernel source code and whether it is empty (it is empty when the makefile is executed for the first time). We then set the `KBUILD_OUTPUT` variable to the value passed with the `O` option (if this option was passed). 
In the next step we check this `KBUILD_OUTPUT` variable and if it is set, we do the following things:\n\n* Store the value of `KBUILD_OUTPUT` in the temporary `saved-output` variable;\n* Try to create the given output directory;\n* Check that the directory was created, otherwise print an error message;\n* If the custom output directory was created successfully, execute `make` again with the new directory (see the `-C` option).\n\nThe next `ifeq` statements check whether the `C` or `M` options were passed to `make`:\n\n```Makefile\nifeq (\"$(origin C)\", \"command line\")\n  KBUILD_CHECKSRC = $(C)\nendif\nifndef KBUILD_CHECKSRC\n  KBUILD_CHECKSRC = 0\nendif\n\nifeq (\"$(origin M)\", \"command line\")\n  KBUILD_EXTMOD := $(M)\nendif\n```\n\nThe `C` option tells the `makefile` that we need to check all `c` source code with a tool provided by the `$CHECK` environment variable, by default it is [sparse](https://en.wikipedia.org/wiki/Sparse). The second `M` option provides build for the external modules (we will not see this case in this part). We also check whether the `KBUILD_SRC` variable is set, and if it isn't, we set the `srctree` variable to `.`:\n\n```Makefile\nifeq ($(KBUILD_SRC),)\n        srctree := .\nendif\n\nobjtree\t:= .\nsrc\t\t:= $(srctree)\nobj\t\t:= $(objtree)\n\nexport srctree objtree VPATH\n```\n\nThat tells `Makefile` that the kernel source tree will be in the current directory where `make` was executed. We then set `objtree` and other variables to this directory and export them. 
The next step is to get value for the `SUBARCH` variable that represents what the underlying architecture is:\n\n```Makefile\nSUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \\\n\t\t\t\t  -e s/sun4u/sparc64/ \\\n\t\t\t\t  -e s/arm.*/arm/ -e s/sa110/arm/ \\\n\t\t\t\t  -e s/s390x/s390/ -e s/parisc64/parisc/ \\\n\t\t\t\t  -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \\\n\t\t\t\t  -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ )\n```\n\nAs you can see, it executes the [uname](https://en.wikipedia.org/wiki/Uname) util that prints information about machine, operating system and architecture. As it gets the output of `uname`, it parses the output and assigns the result to the `SUBARCH` variable. Now that we have `SUBARCH`, we set the `SRCARCH` variable that provides the directory of the certain architecture and `hdr-arch` that provides the directory for the header files:\n\n```Makefile\nifeq ($(ARCH),i386)\n        SRCARCH := x86\nendif\nifeq ($(ARCH),x86_64)\n        SRCARCH := x86\nendif\n\nhdr-arch  := $(SRCARCH)\n```\n\nNote `ARCH` is an alias for `SUBARCH`. In the next step we set the `KCONFIG_CONFIG` variable that represents path to the kernel configuration file and if it was not set before, it is set to `.config` by default:\n\n```Makefile\nKCONFIG_CONFIG\t?= .config\nexport KCONFIG_CONFIG\n```\n\nand the [shell](https://en.wikipedia.org/wiki/Shell_%28computing%29) that will be used during kernel compilation:\n\n```Makefile\nCONFIG_SHELL := $(shell if [ -x \"$$BASH\" ]; then echo $$BASH; \\\n\t  else if [ -x /bin/bash ]; then echo /bin/bash; \\\n\t  else echo sh; fi ; fi)\n```\n\nThe next set of variables are related to the compilers used during Linux kernel compilation. 
We set the host compilers for the `c` and `c++` and the flags to be used with them:\n\n```Makefile\nHOSTCC       = gcc\nHOSTCXX      = g++\nHOSTCFLAGS   = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89\nHOSTCXXFLAGS = -O2\n```\n\nNext we get to the `CC` variable that represents compiler too, so why do we need the `HOST*` variables? `CC` is the target compiler that will be used during kernel compilation, but `HOSTCC` will be used during compilation of the set of the `host` programs (we will see it soon). After this we can see the definition of `KBUILD_MODULES` and `KBUILD_BUILTIN` variables that are used to determine what to compile (modules, kernel, or both):\n\n```Makefile\nKBUILD_MODULES :=\nKBUILD_BUILTIN := 1\n\nifeq ($(MAKECMDGOALS),modules)\n  KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1)\nendif\n```\n\nHere we can see definition of these variables and the value of `KBUILD_BUILTIN` variable will depend on the `CONFIG_MODVERSIONS` kernel configuration parameter if we pass only `modules` to `make`. The next step is to include the `kbuild` file.\n\n```Makefile\ninclude scripts/Kbuild.include\n```\n\nThe [Kbuild](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/kbuild/kbuild.txt) or `Kernel Build System` is a special infrastructure to manage building the kernel and its modules. `kbuild` files have the same syntax as makefiles. The [scripts/Kbuild.include](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/Kbuild.include) file provides some generic definitions for the `kbuild` system. 
After including this `kbuild` file (back in [makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile)) we can see the definitions of the variables that are related to the different tools used during kernel and module compilation (like linker, compilers, utils from the [binutils](http://www.gnu.org/software/binutils/), etc...):\n\n```Makefile\nAS\t\t= $(CROSS_COMPILE)as\nLD\t\t= $(CROSS_COMPILE)ld\nCC\t\t= $(CROSS_COMPILE)gcc\nCPP\t\t= $(CC) -E\nAR\t\t= $(CROSS_COMPILE)ar\nNM\t\t= $(CROSS_COMPILE)nm\nSTRIP\t\t= $(CROSS_COMPILE)strip\nOBJCOPY\t\t= $(CROSS_COMPILE)objcopy\nOBJDUMP\t\t= $(CROSS_COMPILE)objdump\nAWK\t\t= awk\n...\n...\n...\n```\n\nWe then define two other variables: `USERINCLUDE` and `LINUXINCLUDE`, which specify paths to header file directories (public for users in the first case and for kernel in the second case):\n\n```Makefile\nUSERINCLUDE    := \\\n\t\t-I$(srctree)/arch/$(hdr-arch)/include/uapi \\\n\t\t-Iarch/$(hdr-arch)/include/generated/uapi \\\n\t\t-I$(srctree)/include/uapi \\\n\t\t-Iinclude/generated/uapi \\\n        -include $(srctree)/include/linux/kconfig.h\n\nLINUXINCLUDE    := \\\n\t\t-I$(srctree)/arch/$(hdr-arch)/include \\\n\t\t...\n```\n\nAnd the standard flags for the C compiler:\n\n```Makefile\nKBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \\\n\t\t   -fno-strict-aliasing -fno-common \\\n\t\t   -Werror-implicit-function-declaration \\\n\t\t   -Wno-format-security \\\n\t\t   -std=gnu89\n```\n\nThese are not the final compilation flags, as they can be updated in other makefiles (for example kbuilds from `arch/`). After all of these, all variables will be exported to be available in the other makefiles. 
The `RCS_FIND_IGNORE` and the `RCS_TAR_IGNORE` variables contain files that will be ignored in the version control system:\n\n```Makefile\nexport RCS_FIND_IGNORE := \\( -name SCCS -o -name BitKeeper -o -name .svn -o    \\\n\t\t\t  -name CVS -o -name .pc -o -name .hg -o -name .git \\) \\\n\t\t\t  -prune -o\nexport RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn \\\n\t\t\t --exclude CVS --exclude .pc --exclude .hg --exclude .git\n```\n\nWith that, we have finished all preparations. The next step is building the `vmlinux` target.\n\nDirectly to the kernel build\n--------------------------------------------------------------------------------\n\nWe have now finished all the preparations, and next step in the main makefile is related to the kernel build. Before this moment, nothing has been printed to the terminal by `make`. But now the first steps of the compilation are started. We need to go to line [598](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile#L598) of the Linux kernel top makefile and we will find the `vmlinux` target there:\n\n```Makefile\nall: vmlinux\n\tinclude arch/$(SRCARCH)/Makefile\n```\n\nDon't worry that we have missed many lines in Makefile that are between `export RCS_FIND_IGNORE.....` and `all: vmlinux.....`. This part of the makefile is responsible for the `make *.config` targets and as I wrote in the beginning of this part we will see only building of the kernel in a general way.\n\nThe `all:` target is the default when no target is given on the command line. You can see here that we include architecture specific makefile there (in our case it will be [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). From this moment we will continue from this makefile. 
As we can see `all` target depends on the `vmlinux` target that is defined a little lower in the top makefile:\n\n```Makefile\nvmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE\n```\n\nThe `vmlinux` is the Linux kernel in a statically linked executable file format. The [scripts/link-vmlinux.sh](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/link-vmlinux.sh) script links and combines different compiled subsystems into vmlinux. The second target is the `vmlinux-deps` that defined as:\n\n```Makefile\nvmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)\n```\n\nand consists from the set of the `built-in.o` from each top directory of the Linux kernel. Later, when we will go through all directories in the Linux kernel, the `Kbuild` will compile all the `$(obj-y)` files.  It then calls `$(LD) -r` to merge these files into one `built-in.o` file. For this moment we have no `vmlinux-deps`, so the `vmlinux` target will not be executed now. For me `vmlinux-deps` contains following files:\n\n```\narch/x86/kernel/vmlinux.lds arch/x86/kernel/head_64.o\narch/x86/kernel/head64.o    arch/x86/kernel/head.o\ninit/built-in.o             usr/built-in.o\narch/x86/built-in.o         kernel/built-in.o\nmm/built-in.o               fs/built-in.o\nipc/built-in.o              security/built-in.o\ncrypto/built-in.o           block/built-in.o\nlib/lib.a                   arch/x86/lib/lib.a\nlib/built-in.o              arch/x86/lib/built-in.o\ndrivers/built-in.o          sound/built-in.o\nfirmware/built-in.o         arch/x86/pci/built-in.o\narch/x86/power/built-in.o   arch/x86/video/built-in.o\nnet/built-in.o\n```\n\nThe next target that can be executed is following:\n\n```Makefile\n$(sort $(vmlinux-deps)): $(vmlinux-dirs) ;\n$(vmlinux-dirs): prepare scripts\n\t$(Q)$(MAKE) $(build)=$@\n```\n\nAs we can see `vmlinux-dirs` depends on two targets: `prepare` and `scripts`. 
`prepare` is defined in the top `Makefile` of the Linux kernel and executes three stages of preparations:\n\n```Makefile\nprepare: prepare0\nprepare0: archprepare FORCE\n\t$(Q)$(MAKE) $(build)=.\narchprepare: archheaders archscripts prepare1 scripts_basic\n\nprepare1: prepare2 $(version_h) include/generated/utsrelease.h \\\n                   include/config/auto.conf\n\t$(cmd_crmodverdir)\nprepare2: prepare3 outputmakefile asm-generic\n```\n\nThe first `prepare0` expands to the `archprepare` that expands to the `archheaders` and `archscripts` that defined in the `x86_64` specific [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). Let's look on it. The `x86_64` specific makefile starts from the definition of the variables that are related to the architecture-specific configs ([defconfig](https://github.com/torvalds/linux/tree/master/arch/x86/configs), etc...). After this it defines flags for the compiling of the [16-bit](https://en.wikipedia.org/wiki/Real_mode) code, calculating of the `BITS` variable that can be `32` for `i386` or `64` for the `x86_64` flags for the assembly source code, flags for the linker and many many more (all definitions you can find in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). The first target is `archheaders` in the makefile and it generates syscall table:\n\n```Makefile\narchheaders:\n\t$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all\n```\n\nAnd the second target is `archscripts` in this makefile is:\n\n```Makefile\narchscripts: scripts_basic\n\t$(Q)$(MAKE) $(build)=arch/x86/tools relocs\n```\n\nWe can see that it depends on the `scripts_basic` target from the top [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile). 
First we can see the `scripts_basic` target that executes make for the [scripts/basic](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/basic/Makefile) makefile:\n\n```Makefile\nscripts_basic:\n\t$(Q)$(MAKE) $(build)=scripts/basic\n```\n\nThe `scripts/basic/Makefile` contains targets for compilation of the two host programs: `fixdep` and `bin2c`:\n\n```Makefile\nhostprogs-y\t:= fixdep\nhostprogs-$(CONFIG_BUILD_BIN2C)     += bin2c\nalways\t\t:= $(hostprogs-y)\n\n$(addprefix $(obj)/,$(filter-out fixdep,$(always))): $(obj)/fixdep\n```\n\nThe first program is `fixdep`, which optimizes the list of dependencies generated by [gcc](https://gcc.gnu.org/) that tells make when to remake a source code file. The second program is `bin2c`, which depends on the value of the `CONFIG_BUILD_BIN2C` kernel configuration option and is a very little C program that converts a binary on stdin to a C include on stdout. You can note here a strange notation: `hostprogs-y`, etc... This notation is used in all `kbuild` files and you can read more about it in the [documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/kbuild/makefiles.txt). In our case `hostprogs-y` tells `kbuild` that there is one host program named `fixdep` that will be built from `fixdep.c` that is located in the same directory where the `Makefile` is. 
The first output after we execute `make` in our terminal will be the result of this `kbuild` file:\n\n```\n$ make\n  HOSTCC  scripts/basic/fixdep\n```\n\nOnce the `scripts_basic` target has been executed, the `archscripts` target will execute `make` for the [arch/x86/tools](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/tools/Makefile) makefile with the `relocs` target:\n\n```Makefile\n$(Q)$(MAKE) $(build)=arch/x86/tools relocs\n```\n\nThe `relocs_32.c` and the `relocs_64.c` files that contain [relocation](https://en.wikipedia.org/wiki/Relocation_%28computing%29) information will be compiled and we will see it in the `make` output:\n\n```\n  HOSTCC  arch/x86/tools/relocs_32.o\n  HOSTCC  arch/x86/tools/relocs_64.o\n  HOSTCC  arch/x86/tools/relocs_common.o\n  HOSTLD  arch/x86/tools/relocs\n```\n\nAfter `relocs.c` is compiled, the `version.h` is checked:\n\n```Makefile\n$(version_h): $(srctree)/Makefile FORCE\n\t$(call filechk,version.h)\n\t$(Q)rm -f $(old_version_h)\n```\n\nWe can see it in the output:\n\n```\nCHK     include/config/kernel.release\n```\n\nand the building of the `generic` assembly headers with the `asm-generic` target from the `arch/x86/include/generated/asm` that is generated in the top Makefile of the Linux kernel. After the `asm-generic` target the `archprepare` will be done, so the `prepare0` target will be executed. As I wrote above:\n\n```Makefile\nprepare0: archprepare FORCE\n\t$(Q)$(MAKE) $(build)=.\n```\n\nNote the `build` variable here. 
It is defined in the [scripts/Kbuild.include](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/Kbuild.include) and looks like this:\n\n```Makefile\nbuild := -f $(srctree)/scripts/Makefile.build obj\n```\n\nOr in our case, where it is the current source directory - `.`:\n\n```Makefile\n$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.build obj=.\n```\n\nThe [scripts/Makefile.build](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/Makefile.build) tries to find the `Kbuild` file in the directory given via the `obj` parameter, includes this `Kbuild` file:\n\n```Makefile\ninclude $(kbuild-file)\n```\n\nand builds targets from it. In our case `.` contains the [Kbuild](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Kbuild) file that generates the `kernel/bounds.s` and the `arch/x86/kernel/asm-offsets.s`. After this the `prepare` target has finished its work. The `vmlinux-dirs` also depends on the second target - `scripts` that compiles the following programs: `file2alias`, `mk_elfconfig`, `modpost`, etc... After scripts/host-programs compilation our `vmlinux-dirs` target can be executed. First of all let's try to understand what `vmlinux-dirs` contains. 
In my case it contains paths of the following kernel directories:\n\n```\ninit usr arch/x86 kernel mm fs ipc security crypto block\ndrivers sound firmware arch/x86/pci arch/x86/power\narch/x86/video net lib arch/x86/lib\n```\n\nWe can find definition of the `vmlinux-dirs` in the top [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile) of the Linux kernel:\n\n```Makefile\nvmlinux-dirs\t:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \\\n\t\t     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \\\n\t\t     $(net-y) $(net-m) $(libs-y) $(libs-m)))\n\ninit-y\t\t:= init/\ndrivers-y\t:= drivers/ sound/ firmware/\nnet-y\t\t:= net/\nlibs-y\t\t:= lib/\n...\n...\n...\n```\n\nHere we remove the trailing `/` symbol from each directory with the help of the `patsubst` and `filter` functions and put the result into the `vmlinux-dirs`. So we have list of directories in the `vmlinux-dirs` and the following code:\n\n```Makefile\n$(vmlinux-dirs): prepare scripts\n\t$(Q)$(MAKE) $(build)=$@\n```\n\nThe `$@` here stands for the current target, that is, each directory from `vmlinux-dirs` in turn. This means that `make` will go recursively over all directories from the `vmlinux-dirs` and their internal directories (depending on the configuration) and will execute `make` in there. We can see it in the output:\n\n```\n  CC      init/main.o\n  CHK     include/generated/compile.h\n  CC      init/version.o\n  CC      init/do_mounts.o\n  ...\n  CC      arch/x86/crypto/glue_helper.o\n  AS      arch/x86/crypto/aes-x86_64-asm_64.o\n  CC      arch/x86/crypto/aes_glue.o\n  ...\n  AS      arch/x86/entry/entry_64.o\n  AS      arch/x86/entry/thunk_64.o\n  CC      arch/x86/entry/syscall_64.o\n```\n\nSource code in each directory will be compiled and linked to the `built-in.o`:\n\n```\n$ find . -name built-in.o\n./arch/x86/crypto/built-in.o\n./arch/x86/crypto/sha-mb/built-in.o\n./arch/x86/net/built-in.o\n./init/built-in.o\n./usr/built-in.o\n...\n...\n```\n\nOk, all `built-in.o`(s) are built, now we can go back to the `vmlinux` target. 
As you remember, the `vmlinux` target is in the top Makefile of the Linux kernel. Before the linking of the `vmlinux` it builds [samples](https://github.com/torvalds/linux/tree/master/samples), [Documentation](https://github.com/torvalds/linux/tree/master/Documentation), etc... but I will not describe it here as I wrote in the beginning of this part.\n\n```Makefile\nvmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE\n    ...\n    ...\n    +$(call if_changed,link-vmlinux)\n```\n\nAs you can see, the main purpose of it is to call the [scripts/link-vmlinux.sh](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/scripts/link-vmlinux.sh) script, which links all the `built-in.o`(s) into one statically linked executable and creates the [System.map](https://en.wikipedia.org/wiki/System.map). In the end we will see the following output:\n\n```\n  LINK    vmlinux\n  LD      vmlinux.o\n  MODPOST vmlinux.o\n  GEN     .version\n  CHK     include/generated/compile.h\n  UPD     include/generated/compile.h\n  CC      init/version.o\n  LD      init/built-in.o\n  KSYM    .tmp_kallsyms1.o\n  KSYM    .tmp_kallsyms2.o\n  LD      vmlinux\n  SORTEX  vmlinux\n  SYSMAP  System.map\n```\n\nand `vmlinux` and `System.map` in the root of the Linux kernel source tree:\n\n```\n$ ls vmlinux System.map\nSystem.map  vmlinux\n```\n\nThat's all, `vmlinux` is ready. The next step is creation of the [bzImage](https://en.wikipedia.org/wiki/Vmlinux#bzImage).\n\nBuilding bzImage\n--------------------------------------------------------------------------------\n\nThe `bzImage` file is the compressed Linux kernel image. We can get it by executing `make bzImage` after `vmlinux` is built. That, or we can just execute `make` without any argument and we will get `bzImage` anyway because it is the default image:\n\n```Makefile\nall: bzImage\n```\n\nin the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). 
Let's look at this target; it will help us to understand how this image is built. As I already said, the `bzImage` target is defined in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile) and looks like this:\n\n```Makefile\nbzImage: vmlinux\n\t$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)\n\t$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot\n\t$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@\n```\n\nWe can see here that first of all `make` is called for the boot directory, which in our case is:\n\n```Makefile\nboot := arch/x86/boot\n```\n\nThe main goal now is to build the source code in the `arch/x86/boot` and `arch/x86/boot/compressed` directories, build `setup.bin` and `vmlinux.bin`, and build the `bzImage` from them in the end. The first target in the [arch/x86/boot/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/Makefile) is the `$(obj)/setup.elf`:\n\n```Makefile\n$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE\n\t$(call if_changed,ld)\n```\n\nWe already have the `setup.ld` linker script in the `arch/x86/boot` directory and the `SETUP_OBJS` variable that expands to the object files built from all the source files in the `boot` directory. 
We can see the first output:\n\n```Makefile\n  AS      arch/x86/boot/bioscall.o\n  CC      arch/x86/boot/cmdline.o\n  AS      arch/x86/boot/copy.o\n  HOSTCC  arch/x86/boot/mkcpustr\n  CPUSTR  arch/x86/boot/cpustr.h\n  CC      arch/x86/boot/cpu.o\n  CC      arch/x86/boot/cpuflags.o\n  CC      arch/x86/boot/cpucheck.o\n  CC      arch/x86/boot/early_serial_console.o\n  CC      arch/x86/boot/edd.o\n```\n\nThe next source file is [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/header.S), but we can't build it now because this target depends on the following two header files:\n\n```Makefile\n$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h\n```\n\nThe first is `voffset.h`, generated by the `sed` script that gets two addresses from the `vmlinux` with the `nm` util:\n\n```C\n#define VO__end 0xffffffff82ab0000\n#define VO__text 0xffffffff81000000\n```\n\nThey are the start and the end of the kernel. The second is `zoffset.h`, which depends on the `vmlinux` target from the [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/Makefile):\n\n```Makefile\n$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE\n\t$(call if_changed,zoffset)\n```\n\nThe `$(obj)/compressed/vmlinux` target depends on the `vmlinux-objs-y` that compiles source code files from the [arch/x86/boot/compressed](https://github.com/torvalds/linux/tree/master/arch/x86/boot/compressed) directory and generates `vmlinux.bin`, `vmlinux.bin.bz2`, and compiles the `mkpiggy` program. 
We can see this in the output:\n\n```Makefile\n  LDS     arch/x86/boot/compressed/vmlinux.lds\n  AS      arch/x86/boot/compressed/head_64.o\n  CC      arch/x86/boot/compressed/misc.o\n  CC      arch/x86/boot/compressed/string.o\n  CC      arch/x86/boot/compressed/cmdline.o\n  OBJCOPY arch/x86/boot/compressed/vmlinux.bin\n  BZIP2   arch/x86/boot/compressed/vmlinux.bin.bz2\n  HOSTCC  arch/x86/boot/compressed/mkpiggy\n```\n\nHere `vmlinux.bin` is the `vmlinux` file with debugging information and comments stripped, and `vmlinux.bin.bz2` is the compressed `vmlinux.bin.all` + the `u32` size of `vmlinux.bin.all`. The `vmlinux.bin.all` is `vmlinux.bin + vmlinux.relocs`, where `vmlinux.relocs` is the `vmlinux` that was handled by the `relocs` program (see above). Once we have these files, the `piggy.S` assembly file will be generated with the `mkpiggy` program and compiled:\n\n```Makefile\n  MKPIGGY arch/x86/boot/compressed/piggy.S\n  AS      arch/x86/boot/compressed/piggy.o\n```\n\nThis assembly file will contain the computed offset from the compressed kernel. 
After this we can see that `zoffset` generated:\n\n```Makefile\n  ZOFFSET arch/x86/boot/zoffset.h\n```\n\nAs the `zoffset.h` and the `voffset.h` are generated, compilation of the source code files from the [arch/x86/boot](https://github.com/torvalds/linux/tree/master/arch/x86/boot/) can be continued:\n\n```Makefile\n  AS      arch/x86/boot/header.o\n  CC      arch/x86/boot/main.o\n  CC      arch/x86/boot/mca.o\n  CC      arch/x86/boot/memory.o\n  CC      arch/x86/boot/pm.o\n  AS      arch/x86/boot/pmjump.o\n  CC      arch/x86/boot/printf.o\n  CC      arch/x86/boot/regs.o\n  CC      arch/x86/boot/string.o\n  CC      arch/x86/boot/tty.o\n  CC      arch/x86/boot/video.o\n  CC      arch/x86/boot/video-mode.o\n  CC      arch/x86/boot/video-vga.o\n  CC      arch/x86/boot/video-vesa.o\n  CC      arch/x86/boot/video-bios.o\n```\n\nAs all source code files will be compiled, they will be linked to the `setup.elf`:\n\n```Makefile\n  LD      arch/x86/boot/setup.elf\n```\n\nor:\n\n```\nld -m elf_x86_64   -T arch/x86/boot/setup.ld arch/x86/boot/a20.o arch/x86/boot/bioscall.o arch/x86/boot/cmdline.o arch/x86/boot/copy.o arch/x86/boot/cpu.o arch/x86/boot/cpuflags.o arch/x86/boot/cpucheck.o arch/x86/boot/early_serial_console.o arch/x86/boot/edd.o arch/x86/boot/header.o arch/x86/boot/main.o arch/x86/boot/mca.o arch/x86/boot/memory.o arch/x86/boot/pm.o arch/x86/boot/pmjump.o arch/x86/boot/printf.o arch/x86/boot/regs.o arch/x86/boot/string.o arch/x86/boot/tty.o arch/x86/boot/video.o arch/x86/boot/video-mode.o arch/x86/boot/version.o arch/x86/boot/video-vga.o arch/x86/boot/video-vesa.o arch/x86/boot/video-bios.o -o arch/x86/boot/setup.elf\n```\n\nThe last two things is the creation of the `setup.bin` that will contain compiled code from the `arch/x86/boot/*` directory:\n\n```\nobjcopy  -O binary arch/x86/boot/setup.elf arch/x86/boot/setup.bin\n```\n\nand the creation of the `vmlinux.bin` from the `vmlinux`:\n\n```\nobjcopy  -O binary -R .note -R .comment -S 
arch/x86/boot/compressed/vmlinux arch/x86/boot/vmlinux.bin\n```\n\nIn the end we compile host program: [arch/x86/boot/tools/build.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/tools/build.c) that will create our `bzImage` from the `setup.bin` and the `vmlinux.bin`:\n\n```\narch/x86/boot/tools/build arch/x86/boot/setup.bin arch/x86/boot/vmlinux.bin arch/x86/boot/zoffset.h arch/x86/boot/bzImage\n```\n\nActually the `bzImage` is the concatenated `setup.bin` and the `vmlinux.bin`. In the end we will see the output which is familiar to all who once built the Linux kernel from source:\n\n```\nSetup is 16268 bytes (padded to 16384 bytes).\nSystem is 4704 kB\nCRC 94a88f9a\nKernel: arch/x86/boot/bzImage is ready  (#5)\n```\n\nThat's all.\n\nConclusion\n================================================================================\n\nIt is the end of this part and here we saw all steps from the execution of the `make` command to the generation of the `bzImage`. I know, the Linux kernel makefiles and process of the Linux kernel building may seem confusing at first glance, but it is not so hard. 
Hope this part will help you understand the process of building the Linux kernel.\n\nLinks\n================================================================================\n\n* [GNU make util](https://en.wikipedia.org/wiki/Make_%28software%29)\n* [Linux kernel top Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Makefile)\n* [cross-compilation](https://en.wikipedia.org/wiki/Cross_compiler)\n* [Ctags](https://en.wikipedia.org/wiki/Ctags)\n* [sparse](https://en.wikipedia.org/wiki/Sparse)\n* [bzImage](https://en.wikipedia.org/wiki/Vmlinux#bzImage)\n* [uname](https://en.wikipedia.org/wiki/Uname)\n* [shell](https://en.wikipedia.org/wiki/Shell_%28computing%29)\n* [Kbuild](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/kbuild/kbuild.txt)\n* [binutils](http://www.gnu.org/software/binutils/)\n* [gcc](https://gcc.gnu.org/)\n* [Documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/kbuild/makefiles.txt)\n* [System.map](https://en.wikipedia.org/wiki/System.map)\n* [Relocation](https://en.wikipedia.org/wiki/Relocation_%28computing%29)\n* [The Linux Kernel](https://www.kernel.org/doc/html/latest/driver-api/device_link.html)\n"
  },
  {
    "path": "Misc/linux-misc-3.md",
    "content": "Introduction\n---------------\n\nDuring the writing of the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book I have received many emails with questions related to the [linker](https://en.wikipedia.org/wiki/Linker_%28computing%29) script and linker-related subjects. So I've decided to write this to cover some aspects of the linker and the linking of object files.\n\nIf we open the `Linker` page on Wikipedia, we will see following definition:\n\n>In computer science, a linker or link editor is a computer program that takes one or more object files generated by a compiler and combines them into a single executable file, library file, or another object file.\n\nIf you've written at least one program on C in your life, you will have seen files with the `*.o` extension. These files are [object files](https://en.wikipedia.org/wiki/Object_file). Object files are blocks of machine code and data with placeholder addresses that reference data and functions in other object files or libraries, as well as a list of its own functions and data. The main purpose of the linker is collect/handle the code and data of each object file, turning it into the final executable file or library. In this post we will try to go through all aspects of this process. 
Let's start.\n\nLinking process\n---------------\n\nLet's create a simple project with the following structure:\n\n```\n*-linkers\n*--main.c\n*--lib.c\n*--lib.h\n```\n\nOur `main.c` source code file contains:\n\n```C\n#include <stdio.h>\n\n#include \"lib.h\"\n\nint main(int argc, char **argv) {\n\tprintf(\"factorial of 5 is: %d\\n\", factorial(5));\n\treturn 0;\n}\n```\n\nThe `lib.c` file contains:\n\n```C\nint factorial(int base) {\n\tint res,i = 1;\n\n\tif (base == 0) {\n\t\treturn 1;\n\t}\n\n\twhile (i <= base) {\n\t\tres *= i;\n\t\ti++;\n\t}\n\n\treturn res;\n}\n```\n\nAnd the `lib.h` file contains:\n\n```C\n#ifndef LIB_H\n#define LIB_H\n\nint factorial(int base);\n\n#endif\n```\n\nNow let's compile only the `main.c` source code file with:\n\n```\n$ gcc -c main.c\n```\n\nIf we look inside the outputted object file with the `nm` util, we will see the\nfollowing output:\n\n```\n$ nm -A main.o\nmain.o:                 U factorial\nmain.o:0000000000000000 T main\nmain.o:                 U printf\n```\n\nThe `nm` util allows us to see the list of symbols from the given object file. It consists of three columns: the first is the name of the given object file and the address of any resolved symbols. The second column contains a character that represents the status of the given symbol. In this case the `U` means `undefined` and the `T` denotes that the symbols are placed in the `.text` section of the object. The `nm` utility shows us here that we have three symbols in the `main.c` source code file:\n\n* `factorial` - the factorial function defined in the `lib.c` source code file. It is marked as `undefined` here because we compiled only the `main.c` source code file, and it does not know anything about code from the `lib.c` file for now;\n* `main` - the main function;\n* `printf` - the function from the [glibc](https://en.wikipedia.org/wiki/GNU_C_Library) library. 
`main.c` does not know anything about it for now either.\n\nWhat can we understand from the output of `nm` so far? The `main.o` object file contains the local symbol `main` at address `0000000000000000` (it will be filled with the correct address after it is linked), and two unresolved symbols. We can see all of this information in the disassembly output of the `main.o` object file:\n\n```\n$ objdump -S main.o\n\nmain.o:     file format elf64-x86-64\nDisassembly of section .text:\n\n0000000000000000 <main>:\n   0:\t55                   \tpush   %rbp\n   1:\t48 89 e5             \tmov    %rsp,%rbp\n   4:\t48 83 ec 10          \tsub    $0x10,%rsp\n   8:\t89 7d fc             \tmov    %edi,-0x4(%rbp)\n   b:\t48 89 75 f0          \tmov    %rsi,-0x10(%rbp)\n   f:\tbf 05 00 00 00       \tmov    $0x5,%edi\n  14:\te8 00 00 00 00       \tcallq  19 <main+0x19>\n  19:\t89 c6                \tmov    %eax,%esi\n  1b:\tbf 00 00 00 00       \tmov    $0x0,%edi\n  20:\tb8 00 00 00 00       \tmov    $0x0,%eax\n  25:\te8 00 00 00 00       \tcallq  2a <main+0x2a>\n  2a:\tb8 00 00 00 00       \tmov    $0x0,%eax\n  2f:\tc9                   \tleaveq\n  30:\tc3                   \tretq\n```\n\nHere we are interested only in the two `callq` operations. The two `callq` operations contain `linker stubs`, or the function name and offset from it to the next instruction. These stubs will be updated to the real addresses of the functions. We can see these functions' names within the following `objdump` output:\n\n```\n$ objdump -S -r main.o\n\n...\n  14:\te8 00 00 00 00       \tcallq  19 <main+0x19>\n  15: R_X86_64_PC32\t               factorial-0x4\n  19:\t89 c6                \tmov    %eax,%esi\n...\n  25:\te8 00 00 00 00       \tcallq  2a <main+0x2a>\n  26:   R_X86_64_PC32\t               printf-0x4\n  2a:\tb8 00 00 00 00       \tmov    $0x0,%eax\n...\n```\n\nThe `-r` or `--reloc ` flags of the `objdump` util print the `relocation` entries of the file. 
Now let's look in more detail at the relocation process.\n\nRelocation\n------------\n\nRelocation is the process of connecting symbolic references with symbolic definitions. Let's look at the previous snippet from the `objdump` output:\n\n```\n  14:\te8 00 00 00 00       \tcallq  19 <main+0x19>\n  15:   R_X86_64_PC32\t               factorial-0x4\n  19:\t89 c6                \tmov    %eax,%esi\n```\n\nNote the `e8 00 00 00 00` on the first line. The `e8` is the [opcode](https://en.wikipedia.org/wiki/Opcode) of the `call`, and the remainder of the line is a relative offset. So the `e8 00 00 00 00` contains a one-byte operation code followed by a four-byte address. Note that the `00 00 00 00` is 4-bytes. Why only 4-bytes if an address can be 8-bytes in a `x86_64` (64-bit) machine? Actually, we compiled the `main.c` source code file with the `-mcmodel=small`! From the `gcc` man page:\n\n```\n-mcmodel=small\n\nGenerate code for the small code model: the program and its symbols must be linked in the lower 2 GB of the address space. Pointers are 64 bits. Programs can be statically or dynamically linked. This is the default code model.\n```\n\nOf course we didn't pass this option to the `gcc` when we compiled the `main.c`, but it is the default. We know that our program will be linked in the lower 2 GB of the address space from the `gcc` manual extract above. Four bytes is therefore enough for this. So we have the opcode of the `call` instruction and an unknown address. 
When we compile `main.c` with all its dependencies to an executable file, and then look at the factorial call, we see:\n\n```\n$ gcc main.c lib.c -o factorial | objdump -S factorial | grep factorial\n\nfactorial:     file format elf64-x86-64\n...\n...\n0000000000400506 <main>:\n\t40051a:\te8 18 00 00 00       \tcallq  400537 <factorial>\n...\n...\n0000000000400537 <factorial>:\n\t400550:\t75 07                \tjne    400559 <factorial+0x22>\n\t400557:\teb 1b                \tjmp    400574 <factorial+0x3d>\n\t400559:\teb 0e                \tjmp    400569 <factorial+0x32>\n\t40056f:\t7e ea                \tjle    40055b <factorial+0x24>\n...\n...\n```\n\nAs we can see in the previous output, the address of the `main` function is `0x0000000000400506`. Why doesn't it start from `0x0`? You may already know that standard C programs are linked with the `glibc` C standard library (assuming the `-nostdlib` was not passed to the `gcc`). The compiled code for a program includes constructor functions to initialize data in the program when the program is started. These functions need to be called before the program is started, or in other words before the `main` function is called. To make the initialization and termination functions work, the compiler must output something in the assembler code to cause those functions to be called at the appropriate time. Execution of this program will start from the code placed in the special `.init` section. We can see this in the beginning of the objdump output:\n\n```\nobjdump -S factorial | less\n\nfactorial:     file format elf64-x86-64\n\nDisassembly of section .init:\n\n00000000004003a8 <_init>:\n  4003a8:       48 83 ec 08             sub    $0x8,%rsp\n  4003ac:       48 8b 05 a5 05 20 00    mov    0x2005a5(%rip),%rax        # 600958 <_DYNAMIC+0x1d0>\n```\n\nNote that it starts at the `0x00000000004003a8` address relative to the `glibc` code. 
We can check it also in the [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) output by running `readelf`:\n\n```\n$ readelf -d factorial | grep \\(INIT\\)\n 0x000000000000000c (INIT)               0x4003a8\n ```\n\nSo, the address of the `main` function is `0000000000400506` and is offset from the `.init` section. As we can see from the output, the address of the `factorial` function is `0x0000000000400537` and binary code for the call of the `factorial` function now is `e8 18 00 00 00`. We already know that `e8` is opcode for the `call` instruction, the next `18 00 00 00` (note that address represented as little endian for `x86_64`, so it is `00 00 00 18`) is the offset from the `callq` to the `factorial` function:\n\n```python\n>>> hex(0x40051a + 0x18 + 0x5) == hex(0x400537)\nTrue\n```\n\nSo we add `0x18` and `0x5` to the address of the `call` instruction. The offset is measured from the address of the following instruction. Our call instruction is 5-bytes long (`e8 18 00 00 00`) and the `0x18` is the offset after the call instruction to the `factorial` function. A compiler generally creates each object file with the program addresses starting at zero. But if a program is created from multiple object files, these will overlap.\n\nWhat we have seen in this section is the `relocation` process. This process assigns load addresses to the various parts of the program, adjusting the code and data in the program to reflect the assigned addresses.\n\nOk, now that we know a little about linkers and relocation, it is time to learn more about linkers by linking our object files.\n\nGNU linker\n-----------------\n\nAs you can understand from the title, I will use [GNU linker](https://en.wikipedia.org/wiki/GNU_linker) or just `ld` in this post. 
Of course we can use `gcc` to link our `factorial` project:\n\n```\n$ gcc main.c lib.o -o factorial\n```\n\nand after it we will get executable file - `factorial` as a result:\n\n```\n./factorial\nfactorial of 5 is: 120\n```\n\nBut `gcc` does not link object files. Instead it uses `collect2` which is just wrapper for the `GNU ld` linker:\n\n```\n~$ /usr/lib/gcc/x86_64-linux-gnu/4.9/collect2 --version\ncollect2 version 4.9.3\n/usr/bin/ld --version\nGNU ld (GNU Binutils for Debian) 2.25\n...\n...\n...\n```\n\nOk, we can use gcc and it will produce executable file of our program for us. But let's look how to use `GNU ld` linker for the same purpose. First of all let's try to link these object files with the following example:\n\n```\nld main.o lib.o -o factorial\n```\n\nTry to do it and you will get following error:\n\n```\n$ ld main.o lib.o -o factorial\nld: warning: cannot find entry symbol _start; defaulting to 00000000004000b0\nmain.o: In function `main':\nmain.c:(.text+0x26): undefined reference to `printf'\n```\n\nHere we can see two problems:\n\n* Linker can't find `_start` symbol;\n* Linker does not know anything about `printf` function.\n\nFirst of all let's try to understand what is this `_start` entry symbol that appears to be required for our program to run? When I started to learn programming I learned that the `main` function is the entry point of the program. I think you learned this too :) But it actually isn't the entry point, it's `_start` instead. The `_start` symbol is defined in the `crt1.o` object file. 
We can find it with the following command:\n\n```\n$ objdump -S /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o\n\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o:     file format elf64-x86-64\n\n\nDisassembly of section .text:\n\n0000000000000000 <_start>:\n   0:\t31 ed                \txor    %ebp,%ebp\n   2:\t49 89 d1             \tmov    %rdx,%r9\n   ...\n   ...\n   ...\n```\n\nWe pass this object file to the `ld` command as its first argument (see above). Now let's try to link it and look at the result:\n\n```\nld /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o \\\nmain.o lib.o -o factorial\n\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o: In function `_start':\n/tmp/buildd/glibc-2.19/csu/../sysdeps/x86_64/start.S:115: undefined reference to `__libc_csu_fini'\n/tmp/buildd/glibc-2.19/csu/../sysdeps/x86_64/start.S:116: undefined reference to `__libc_csu_init'\n/tmp/buildd/glibc-2.19/csu/../sysdeps/x86_64/start.S:122: undefined reference to `__libc_start_main'\nmain.o: In function `main':\nmain.c:(.text+0x26): undefined reference to `printf'\n```\n\nUnfortunately we will see even more errors. We can see here the old error about the undefined `printf` and another three undefined references:\n\n* `__libc_csu_fini`\n* `__libc_csu_init`\n* `__libc_start_main`\n\nThe `_start` symbol is defined in the [sysdeps/x86_64/start.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/start.S;h=0d27a38e9c02835ce17d1c9287aa01be222e72eb;hb=HEAD) assembly file in the `glibc` source code. We can find the following assembly code lines there:\n\n```assembly\nmov $__libc_csu_fini, %R8_LP\nmov $__libc_csu_init, %RCX_LP\n...\ncall __libc_start_main\n```\n\nHere we pass the addresses of the entry points of the `.init` and `.fini` sections, which contain the code that is executed when the program is run and the code that is executed when the program terminates. 
And in the end we see the call of the `main` function from our program. These three symbols are defined in the [csu/elf-init.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/elf-init.c;hb=1d4bbc54bd4f7d85d774871341b49f4357af1fb7) source code file. The following two object files:\n\n* `crtn.o`;\n* `crti.o`.\n\ndefine the function prologs/epilogs for the .init and .fini sections (with the `_init` and `_fini` symbols respectively).\n\nThe `crtn.o` object file contains these `.init` and `.fini` sections:\n\n```\n$ objdump -S /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o\n\n0000000000000000 <.init>:\n   0:\t48 83 c4 08          \tadd    $0x8,%rsp\n   4:\tc3                   \tretq\n\nDisassembly of section .fini:\n\n0000000000000000 <.fini>:\n   0:\t48 83 c4 08          \tadd    $0x8,%rsp\n   4:\tc3                   \tretq\n```\n\nAnd the `crti.o` object file contains the `_init` and `_fini` symbols. Let's try to link again with these two object files:\n\n```\n$ ld \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crti.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o main.o lib.o \\\n-o factorial\n```\n\nAnd anyway we will get the same errors. Now we need to pass `-lc` option to the `ld`. This option will search for the standard library in the paths present in the `$LD_LIBRARY_PATH` environment variable. Let's try to link again with the `-lc` option:\n\n```\n$ ld \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crti.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o main.o lib.o -lc \\\n-o factorial\n```\n\nFinally we get an executable file, but if we try to run it, we will get strange results:\n\n```\n$ ./factorial\nbash: ./factorial: No such file or directory\n```\n\nWhat's the problem here? 
Let's look on the executable file with the [readelf](https://sourceware.org/binutils/docs/binutils/readelf.html) util:\n\n```\n$ readelf -l factorial\n\nElf file type is EXEC (Executable file)\nEntry point 0x4003c0\nThere are 7 program headers, starting at offset 64\n\nProgram Headers:\n  Type           Offset             VirtAddr           PhysAddr\n                 FileSiz            MemSiz              Flags  Align\n  PHDR           0x0000000000000040 0x0000000000400040 0x0000000000400040\n                 0x0000000000000188 0x0000000000000188  R E    8\n  INTERP         0x00000000000001c8 0x00000000004001c8 0x00000000004001c8\n                 0x000000000000001c 0x000000000000001c  R      1\n      [Requesting program interpreter: /lib64/ld-linux-x86-64.so.2]\n  LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000\n                 0x0000000000000610 0x0000000000000610  R E    200000\n  LOAD           0x0000000000000610 0x0000000000600610 0x0000000000600610\n                 0x00000000000001cc 0x00000000000001cc  RW     200000\n  DYNAMIC        0x0000000000000610 0x0000000000600610 0x0000000000600610\n                 0x0000000000000190 0x0000000000000190  RW     8\n  NOTE           0x00000000000001e4 0x00000000004001e4 0x00000000004001e4\n                 0x0000000000000020 0x0000000000000020  R      4\n  GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000\n                 0x0000000000000000 0x0000000000000000  RW     10\n\n Section to Segment mapping:\n  Segment Sections...\n   00\n   01     .interp\n   02     .interp .note.ABI-tag .hash .dynsym .dynstr .gnu.version .gnu.version_r .rela.dyn .rela.plt .init .plt .text .fini .rodata .eh_frame\n   03     .dynamic .got .got.plt .data\n   04     .dynamic\n   05     .note.ABI-tag\n   06\n```\n\nNote on the strange line:\n\n```\n  INTERP         0x00000000000001c8 0x00000000004001c8 0x00000000004001c8\n                 0x000000000000001c 0x000000000000001c  R      1\n      
[Requesting program interpreter: /lib64/ld-linux-x86-64.so.2]\n```\n\nThe `.interp` section in the `elf` file holds the path name of a program interpreter or in another words the `.interp` section simply contains an `ascii` string that is the name of the dynamic linker. The dynamic linker is the part of Linux that loads and links shared libraries needed by an executable when it is executed, by copying the content of libraries from disk to RAM. As we can see in the output of the `readelf` command it is placed in the `/lib64/ld-linux-x86-64.so.2` file for the `x86_64` architecture. Now let's add the `-dynamic-linker` option with the path of `ld-linux-x86-64.so.2` to the `ld` call and will see the following results:\n\n```\n$ gcc -c main.c lib.c\n\n$ ld \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crti.o \\\n/usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o main.o lib.o \\\n-dynamic-linker /lib64/ld-linux-x86-64.so.2 \\\n-lc -o factorial\n```\n\nNow we can run it as normal executable file:\n\n```\n$ ./factorial\n\nfactorial of 5 is: 120\n```\n\nIt works! With the first line we compile the `main.c` and the `lib.c` source code files to object files. We will get the `main.o` and the `lib.o` after execution of the `gcc`:\n\n```\n$ file lib.o main.o\nlib.o:  ELF 64-bit LSB relocatable, x86-64, version 1 (SYSV), not stripped\nmain.o: ELF 64-bit LSB relocatable, x86-64, version 1 (SYSV), not stripped\n```\n\nand after this we link object files of our program with the needed system object files and libraries. We just saw a simple example of how to compile and link a C program with the `gcc` compiler and `GNU ld` linker. In this example we have used a couple command line options of the `GNU linker`, but it supports much more command line options than `-o`, `-dynamic-linker`, etc... Moreover `GNU ld` has its own language that allows to control the linking process. 
In the next two paragraphs we will look into it.\n\nUseful command line options of the GNU linker\n----------------------------------------------\n\nAs I already wrote and as you can see in the manual of the `GNU linker`, it has a big set of command line options. We've seen a couple of options in this post: `-o <output>` - that tells `ld` to produce an output file called `output` as the result of linking, `-l<name>` that adds the archive or object file specified by the name, `-dynamic-linker` that specifies the name of the dynamic linker. Of course `ld` supports many more command line options, let's look at some of them.\n\nThe first useful command line option is `@file`. In this case the `file` specifies the name of the file from which command line options will be read. For example we can create a file with the name `linker.ld`, put there our command line arguments from the previous example and execute it with:\n\n```\n$ ld @linker.ld\n```\n\nThe next command line option is `-b` or `--format`. This command line option specifies the format of the input object files: `ELF`, `DJGPP/COFF`, etc. There is a command line option for the same purpose but for the output file: `--oformat=output-format`.\n\nThe next command line option is `--defsym`. The full format of this command line option is `--defsym=symbol=expression`. It allows us to create a global symbol in the output file containing the absolute address given by expression. 
We can find the following case where this command line option can be useful: in the Linux kernel source code, and more precisely in the Makefile that is related to the kernel decompression for the ARM architecture - [arch/arm/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/arm/boot/compressed/Makefile), we can find the following definition:\n\n```\nLDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)\n```\n\nAs we already know, it defines the `_kernel_bss_size` symbol with the size of the `.bss` section in the output file. This symbol will be used in the first [assembly file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/arm/boot/compressed/head.S) that will be executed during kernel decompression:\n\n```assembly\nldr r5, =_kernel_bss_size\n```\n\nThe next command line option is `-shared`, which allows us to create a shared library. The `-M` or `-Map <filename>` command line option prints the link map with information about symbols (`-M` prints it to the standard output, `-Map` writes it to the given file). In our case:\n\n```\n$ ld -M @linker.ld\n...\n...\n...\n.text           0x00000000004003c0      0x112\n *(.text.unlikely .text.*_unlikely .text.unlikely.*)\n *(.text.exit .text.exit.*)\n *(.text.startup .text.startup.*)\n *(.text.hot .text.hot.*)\n *(.text .stub .text.* .gnu.linkonce.t.*)\n .text          0x00000000004003c0       0x2a /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crt1.o\n...\n...\n...\n .text          0x00000000004003ea       0x31 main.o\n                0x00000000004003ea                main\n .text          0x000000000040041b       0x3f lib.o\n                0x000000000040041b                factorial\n```\n\nOf course the `GNU linker` supports the standard command line options `--help` and `--version`, which print the usage help of `ld` and its version. That's all about command line options of the `GNU linker`. 
Of course it is not the full set of command line options supported by the `ld` util. You can find the complete documentation of the `ld` util in the manual.\n\nControl Language linker\n----------------------------------------------\n\nAs I wrote previously, `ld` has support for its own language. It accepts Linker Command Language files written in a superset of AT&T's Link Editor Command Language syntax, to provide explicit and total control over the linking process. Let's look on its details.\n\nWith the linker language we can control:\n\n* input files;\n* output files;\n* file formats\n* addresses of sections;\n* etc...\n\nCommands written in the linker control language are usually placed in a file called linker script. We can pass it to `ld` with the `-T` command line option. The main command in a linker script is the `SECTIONS` command. Each linker script must contain this command and it determines the `map` of the output file. The special variable `.` contains current position of the output. Let's write a simple assembly program and we will look at how we can use a linker script to control linking of this program. We will take a hello world program for this example:\n\n```assembly\n.data\n        msg:    .ascii  \"hello, world!\\n\"\n\n.text\n\n.global _start\n\n_start:\n        mov    $1,%rax\n        mov    $1,%rdi\n        mov    $msg,%rsi\n        mov    $14,%rdx\n        syscall\n\n        mov    $60,%rax\n        mov    $0,%rdi\n        syscall\n```\n\nWe can compile and link it with the following commands:\n\n```\n$ as -o hello.o hello.asm\n$ ld -o hello hello.o\n```\n\nOur program consists from two sections: `.text` contains code of the program and `.data` contains initialized variables. Let's write simple linker script and try to link our `hello.asm` assembly file with it. Our script is:\n\n```\n/*\n * Linker script for the factorial\n */\nOUTPUT(hello)\nOUTPUT_FORMAT(\"elf64-x86-64\")\nINPUT(hello.o)\n\nSECTIONS\n{\n\t. 
= 0x200000;\n\t.text : {\n\t      *(.text)\n\t}\n\n\t. = 0x400000;\n\t.data : {\n\t      *(.data)\n\t}\n}\n```\n\nOn the first three lines you can see a comment written in `C` style. After it the `OUTPUT` and the `OUTPUT_FORMAT` commands specify the name of our executable file and its format. The next command, `INPUT`, specifies the input file to the `ld` linker. Then, we can see the main `SECTIONS` command, which, as I already wrote, must be present in every linker script. The `SECTIONS` command represents the set and order of the sections which will be in the output file. At the beginning of the `SECTIONS` command we can see following line `. = 0x200000`. I already wrote above that `.` command points to the current position of the output. This line says that the code should be loaded at address `0x200000` and the line `. = 0x400000` says that data section should be loaded at address `0x400000`. The second line after the `. = 0x200000` defines `.text` as an output section. We can see `*(.text)` expression inside it. The `*` symbol is wildcard that matches any file name. In other words, the `*(.text)` expression says all `.text` input sections in all input files. We can rewrite it as `hello.o(.text)` for our example. After the following location counter `. 
= 0x400000`, we can see definition of the data section.\n\nWe can compile and link it with the following command:\n\n```\n$ as -o hello.o hello.S && ld -T linker.script && ./hello\nhello, world!\n```\n\nIf we look inside it with the `objdump` util, we can see that `.text` section starts from the address `0x200000` and the `.data` sections starts from the address `0x400000`:\n\n```\n$ objdump -D hello\n\nDisassembly of section .text:\n\n0000000000200000 <_start>:\n  200000:\t48 c7 c0 01 00 00 00 \tmov    $0x1,%rax\n  ...\n\nDisassembly of section .data:\n\n0000000000400000 <msg>:\n  400000:\t68 65 6c 6c 6f       \tpushq  $0x6f6c6c65\n  ...\n```\n\nApart from the commands we have already seen, there are a few others. The first is the `ASSERT(exp, message)` that ensures that given expression is not zero. If it is zero, then exit the linker with an error code and print the given error message. If you've read about Linux kernel booting process in the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book, you may know that the setup header of the Linux kernel has offset `0x1f1`. In the linker script of the Linux kernel we can find a check for this:\n\n```\n. = ASSERT(hdr == 0x1f1, \"The setup header has the wrong offset!\");\n```\n\nThe `INCLUDE filename` command allows to include external linker script symbols in the current one. In a linker script we can assign a value to a symbol. `ld` supports a couple of assignment operators:\n\n* symbol = expression   ;\n* symbol += expression  ;\n* symbol -= expression  ;\n* symbol *= expression  ;\n* symbol /= expression  ;\n* symbol <<= expression ;\n* symbol >>= expression ;\n* symbol &= expression  ;\n* symbol |= expression  ;\n\nAs you can note all operators are C assignment operators. For example we can use it in our linker script as:\n\n```\nSTART_ADDRESS = 0x200000;\nDATA_OFFSET   = 0x200000;\n\nSECTIONS\n{\n\t. = START_ADDRESS;\n\t.text : {\n\t      *(.text)\n\t}\n\n\t. 
= START_ADDRESS + DATA_OFFSET;\n\t.data : {\n\t      *(.data)\n\t}\n}\n```\n\nAs you already may have noted, the syntax for expressions in the linker script language is identical to that of C expressions. Besides this, the linker control language supports the following builtin functions:\n\n* `ABSOLUTE` - returns the absolute (non-relocatable, as opposed to non-negative) value of the given expression;\n* `ADDR` - takes the section and returns its address;\n* `ALIGN` - returns the value of the location counter (`.` operator) aligned to the next boundary that is a multiple of the given expression;\n* `DEFINED` - returns `1` if the given symbol is defined in the global symbol table of the linker and `0` otherwise;\n* `MAX` and `MIN` - return maximum and minimum of the two given expressions;\n* `NEXT` - returns the next unallocated address that is a multiple of the given expression;\n* `SIZEOF` - returns the size in bytes of the given named section.\n\nThat's all.\n\nConclusion\n-----------------\n\nThis is the end of the post about linkers. We learned many things about linkers in this post, such as what a linker is and why it is needed, how to use it, etc.\n\nIf you have any questions or suggestions, write me an [email](mailto:kuleshovmail@gmail.com) or ping [me](https://twitter.com/0xAX) on twitter.\n\nPlease note that English is not my first language, and I am really sorry for any inconvenience. 
If you find any mistakes please let me know via email or send a PR.\n\nLinks\n-----------------\n\n* [Book about Linux kernel insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md)\n* [linker](https://en.wikipedia.org/wiki/Linker_%28computing%29)\n* [object files](https://en.wikipedia.org/wiki/Object_file)\n* [glibc](https://en.wikipedia.org/wiki/GNU_C_Library)\n* [opcode](https://en.wikipedia.org/wiki/Opcode)\n* [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format)\n* [GNU linker](https://en.wikipedia.org/wiki/GNU_linker)\n* [My posts about assembly programming for x86_64](https://0xax.github.io/categories/assembler/)\n* [readelf](https://sourceware.org/binutils/docs/binutils/readelf.html)\n"
  },
  {
    "path": "Misc/linux-misc-4.md",
    "content": "Program startup process in userspace\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nDespite the [linux-insides](https://www.gitbook.com/book/0xax/linux-insides/details) described mostly Linux kernel related stuff, I have decided to write this one part which mostly relates to userspace.\n\nThere is already fourth [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4) of [System calls](https://en.wikipedia.org/wiki/System_call) chapter which describes what the Linux kernel does when we want to start a program. In this part I want to explore what happens when we run a program on a Linux machine from userspace perspective.\n\nI don't know how about you, but in my university I learn that a `C` program starts executing from the function which is called `main`. And that's partly true. Whenever we are starting to write new program, we start our program from the following lines of code:\n\n```C\nint main(int argc, char *argv[]) {\n\t// Entry point is here\n}\n```\n\nBut if you are interested in low-level programming, you may already know that the `main` function isn't the actual entry point of a program. You will believe it's true after you look at this simple program in debugger:\n\n```C\nint main(int argc, char *argv[]) {\n\treturn 0;\n}\n```\n\nLet's compile this and run in [gdb](https://www.gnu.org/software/gdb/):\n\n```\n$ gcc -ggdb program.c -o program\n$ gdb ./program\nThe target architecture is assumed to be i386:x86-64:intel\nReading symbols from ./program...done.\n```\n\nLet's execute gdb `info` subcommand with `files` argument. 
The `info files` prints information about debugging targets and memory spaces occupied by different sections.\n\n```\n(gdb) info files\nSymbols from \"/home/alex/program\".\nLocal exec file:\n\t`/home/alex/program', file type elf64-x86-64.\n\tEntry point: 0x400430\n\t0x0000000000400238 - 0x0000000000400254 is .interp\n\t0x0000000000400254 - 0x0000000000400274 is .note.ABI-tag\n\t0x0000000000400274 - 0x0000000000400298 is .note.gnu.build-id\n\t0x0000000000400298 - 0x00000000004002b4 is .gnu.hash\n\t0x00000000004002b8 - 0x0000000000400318 is .dynsym\n\t0x0000000000400318 - 0x0000000000400357 is .dynstr\n\t0x0000000000400358 - 0x0000000000400360 is .gnu.version\n\t0x0000000000400360 - 0x0000000000400380 is .gnu.version_r\n\t0x0000000000400380 - 0x0000000000400398 is .rela.dyn\n\t0x0000000000400398 - 0x00000000004003c8 is .rela.plt\n\t0x00000000004003c8 - 0x00000000004003e2 is .init\n\t0x00000000004003f0 - 0x0000000000400420 is .plt\n\t0x0000000000400420 - 0x0000000000400428 is .plt.got\n\t0x0000000000400430 - 0x00000000004005e2 is .text\n\t0x00000000004005e4 - 0x00000000004005ed is .fini\n\t0x00000000004005f0 - 0x0000000000400610 is .rodata\n\t0x0000000000400610 - 0x0000000000400644 is .eh_frame_hdr\n\t0x0000000000400648 - 0x000000000040073c is .eh_frame\n\t0x0000000000600e10 - 0x0000000000600e18 is .init_array\n\t0x0000000000600e18 - 0x0000000000600e20 is .fini_array\n\t0x0000000000600e20 - 0x0000000000600e28 is .jcr\n\t0x0000000000600e28 - 0x0000000000600ff8 is .dynamic\n\t0x0000000000600ff8 - 0x0000000000601000 is .got\n\t0x0000000000601000 - 0x0000000000601028 is .got.plt\n\t0x0000000000601028 - 0x0000000000601034 is .data\n\t0x0000000000601034 - 0x0000000000601038 is .bss\n```\n\nNote on `Entry point: 0x400430` line. Now we know the actual address of entry point of our program. 
Let's put a breakpoint by this address, run our program and see what happens:\n\n```\n(gdb) break *0x400430\nBreakpoint 1 at 0x400430\n(gdb) run\nStarting program: /home/alex/program\n\nBreakpoint 1, 0x0000000000400430 in _start ()\n```\n\nInteresting. We don't see execution of the `main` function here, but we have seen that another function is called. This function is `_start` and as our debugger shows us, it is the actual entry point of our program. Where is this function from? Who does call `main` and when is it called? I will try to answer all these questions in the following post.\n\nHow the kernel starts a new program\n--------------------------------------------------------------------------------\n\nFirst of all, let's take a look at the following simple `C` program:\n\n```C\n// program.c\n\n#include <stdlib.h>\n#include <stdio.h>\n\nstatic int x = 1;\n\nint y = 2;\n\nint main(int argc, char *argv[]) {\n\tint z = 3;\n\n\tprintf(\"x + y + z = %d\\n\", x + y + z);\n\n\treturn EXIT_SUCCESS;\n}\n```\n\nWe can be sure that this program works as we expect. Let's compile it:\n\n```\n$ gcc -Wall program.c -o sum\n```\n\nand run:\n\n```\n$ ./sum\nx + y + z = 6\n```\n\nOk, everything looks pretty good up to now. You may already know that there is a special family of functions - [exec*](http://man7.org/linux/man-pages/man3/execl.3.html). As we read in the man page:\n\n> The exec() family of functions replaces the current process image with a new process image.\n\nAll the `exec*` functions are simple frontends to the [execve](http://man7.org/linux/man-pages/man2/execve.2.html) system call. 
If you have read the fourth [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4) of the chapter which describes [system calls](https://en.wikipedia.org/wiki/System_call), you may know that the [execve](http://linux.die.net/man/2/execve) system call is defined in the [fs/exec.c](https://github.com/torvalds/linux/blob/08e4e0d0456d0ca8427b2d1ddffa30f1c3e774d7/fs/exec.c#L1888) source code file and looks like:\n\n```C\nSYSCALL_DEFINE3(execve,\n\t\tconst char __user *, filename,\n\t\tconst char __user *const __user *, argv,\n\t\tconst char __user *const __user *, envp)\n{\n\treturn do_execve(getname(filename), argv, envp);\n}\n```\n\nIt takes an executable file name, a set of command line arguments, and a set of environment variables. As you may guess, everything is done by the `do_execve` function. I will not describe the implementation of the `do_execve` function in detail because you can read about this [here](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4). In short, the `do_execve` function does many checks, for example that `filename` is valid and that the limit of launched processes is not exceeded in our system. After all of these checks, this function parses our executable file which is represented in [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) format, creates a memory descriptor for the newly executed file and fills it with the appropriate values, such as the areas for the stack and the heap. When the setup of the new binary image is done, the `start_thread` function will set up one new process. 
This function is architecture-specific and for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, its definition is located in the [arch/x86/kernel/process_64.c](https://github.com/torvalds/linux/blob/08e4e0d0456d0ca8427b2d1ddffa30f1c3e774d7/arch/x86/kernel/process_64.c#L239) source code file.\n\nThe `start_thread` function sets new values for the [segment registers](https://en.wikipedia.org/wiki/X86_memory_segmentation) and the program execution address. From this point, our new process is ready to start. Once the [context switch](https://en.wikipedia.org/wiki/Context_switch) is done, control returns to userspace with the new values of registers and the new executable starts executing.\n\nThat's all from the kernel side. The Linux kernel prepares the binary image for execution and, once this preparation is finished, returns control to userspace, where execution of the image starts right after the context switch. But it does not answer our questions like where does `_start` come from and others. Let's try to answer these questions in the next paragraph.\n\nHow does a program start in userspace\n--------------------------------------------------------------------------------\n\nIn the previous paragraph we saw how an executable file is prepared to run by the Linux kernel. Let's look at the same, but from the userspace side. We already know that the entry point of each program is its `_start` function. But where is this function from? It may come from a library. But if you remember correctly we didn't link our program with any libraries during compilation of our program:\n\n```\n$ gcc -Wall program.c -o sum\n```\n\nYou may guess that `_start` comes from the [standard library](https://en.wikipedia.org/wiki/Standard_library) and that's true. If you try to compile our program again and pass the `-v` option to gcc which will enable `verbose mode`, you will see a long output. 
The full output is not interesting for us, let's look at the following steps:\n\nFirst of all, our program should be compiled with `gcc`:\n\n```\n$ gcc -v -ggdb program.c -o sum\n...\n...\n...\n/usr/libexec/gcc/x86_64-redhat-linux/6.1.1/cc1 -quiet -v program.c -quiet -dumpbase program.c -mtune=generic -march=x86-64 -auxbase test -ggdb -version -o /tmp/ccvUWZkF.s\n...\n...\n...\n```\n\nThe `cc1` compiler will compile our `C` source code and an produce assembly named `/tmp/ccvUWZkF.s` file. After this we can see that our assembly file will be compiled into object file with the `GNU as` assembler:\n\n```\n$ gcc -v -ggdb program.c -o sum\n...\n...\n...\nas -v --64 -o /tmp/cc79wZSU.o /tmp/ccvUWZkF.s\n...\n...\n...\n```\n\nIn the end our object file will be linked by `collect2`:\n\n```\n$ gcc -v -ggdb program.c -o sum\n...\n...\n...\n/usr/libexec/gcc/x86_64-redhat-linux/6.1.1/collect2 -plugin /usr/libexec/gcc/x86_64-redhat-linux/6.1.1/liblto_plugin.so -plugin-opt=/usr/libexec/gcc/x86_64-redhat-linux/6.1.1/lto-wrapper -plugin-opt=-fresolution=/tmp/ccLEGYra.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --build-id --no-add-needed --eh-frame-hdr --hash-style=gnu -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o test /usr/lib/gcc/x86_64-redhat-linux/6.1.1/../../../../lib64/crt1.o /usr/lib/gcc/x86_64-redhat-linux/6.1.1/../../../../lib64/crti.o /usr/lib/gcc/x86_64-redhat-linux/6.1.1/crtbegin.o -L/usr/lib/gcc/x86_64-redhat-linux/6.1.1 -L/usr/lib/gcc/x86_64-redhat-linux/6.1.1/../../../../lib64 -L/lib/../lib64 -L/usr/lib/../lib64 -L. -L/usr/lib/gcc/x86_64-redhat-linux/6.1.1/../../.. 
/tmp/cc79wZSU.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/x86_64-redhat-linux/6.1.1/crtend.o /usr/lib/gcc/x86_64-redhat-linux/6.1.1/../../../../lib64/crtn.o\n...\n...\n...\n```\n\nYes, we can see a long set of command line options which are passed to the linker. Let's go from another way. We know that our program depends on `stdlib`:\n\n```\n$ ldd program\n\tlinux-vdso.so.1 (0x00007ffc9afd2000)\n\tlibc.so.6 => /lib64/libc.so.6 (0x00007f56b389b000)\n\t/lib64/ld-linux-x86-64.so.2 (0x0000556198231000)\n```\n\nas we use some stuff from there like `printf` and etc. But not only. That's why we will get an error when we pass `-nostdlib` option to the compiler:\n\n```\n$ gcc -nostdlib program.c -o program\n/usr/bin/ld: warning: cannot find entry symbol _start; defaulting to 000000000040017c\n/tmp/cc02msGW.o: In function `main':\n/home/alex/program.c:11: undefined reference to `printf'\ncollect2: error: ld returned 1 exit status\n```\n\nBesides other errors, we also see that `_start` symbol is undefined. So now we are sure that the `_start` function comes from standard library. But even if we link it with the standard library, it will not be compiled successfully anyway:\n\n```\n$ gcc -nostdlib -lc -ggdb program.c -o program\n/usr/bin/ld: warning: cannot find entry symbol _start; defaulting to 0000000000400350\n```\n\nOk, the compiler does not complain about undefined reference of standard library functions anymore as we linked our program with `/usr/lib64/libc.so.6`, but the `_start` symbol isn't resolved yet. Let's return to the verbose output of `gcc` and look at the parameters of `collect2`. The most important thing that we may see is that our program is linked not only with the standard library, but also with some object files. The first object file is: `/lib64/crt1.o`. 
And if we look inside this object file with `objdump`, we will see the `_start` symbol:\n\n```\n$ objdump -d /lib64/crt1.o\n\n/lib64/crt1.o:     file format elf64-x86-64\n\n\nDisassembly of section .text:\n\n0000000000000000 <_start>:\n   0:\t31 ed                \txor    %ebp,%ebp\n   2:\t49 89 d1             \tmov    %rdx,%r9\n   5:\t5e                   \tpop    %rsi\n   6:\t48 89 e2             \tmov    %rsp,%rdx\n   9:\t48 83 e4 f0          \tand    $0xfffffffffffffff0,%rsp\n   d:\t50                   \tpush   %rax\n   e:\t54                   \tpush   %rsp\n   f:\t49 c7 c0 00 00 00 00 \tmov    $0x0,%r8\n  16:\t48 c7 c1 00 00 00 00 \tmov    $0x0,%rcx\n  1d:\t48 c7 c7 00 00 00 00 \tmov    $0x0,%rdi\n  24:\te8 00 00 00 00       \tcallq  29 <_start+0x29>\n  29:\tf4                   \thlt\n```\n\nAs `crt1.o` is a relocatable object file that has not been linked yet, we see only zero placeholders here instead of the real addresses of the referenced functions. Let's look at the source code of the `_start` function. As this function is architecture specific, the implementation of `_start` is located in the [sysdeps/x86_64/start.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/start.S;h=f1b961f5ba2d6a1ebffee0005f43123c4352fbf4;hb=HEAD) assembly file.\n\nThe `_start` function starts by clearing the `ebp` register, as the [ABI](https://software.intel.com/sites/default/files/article/402129/mpx-linux64-abi.pdf) suggests.\n\n```assembly\nxorl %ebp, %ebp\n```\n\nAnd after this we put the address of the termination function into the `r9` register:\n\n```assembly\nmov %RDX_LP, %R9_LP\n```\n\nAs described in the [ELF](http://flint.cs.yale.edu/cs422/doc/ELF_Format.pdf) specification:\n\n> After the dynamic linker has built the process image and performed the relocations, each shared object\n> gets the opportunity to execute some initialization code.\n> ...\n> Similarly, shared objects may have termination functions, which are executed with the atexit (BA_OS)\n> mechanism after the base process begins its termination sequence.\n\nSo we need to put the 
address of the termination function to the `r9` register as it will be passed to `__libc_start_main` in future as sixth argument. Note that the address of the termination function initially is located in the `rdx` register. Other registers besides `rdx` and `rsp` contain unspecified values. Actually the main point of the `_start` function is to call `__libc_start_main`. So the next action is to prepare for this function.\n\nThe signature of the `__libc_start_main` function is located in the [csu/libc-start.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/libc-start.c;h=9a56dcbbaeb7ef85c495b4df9ab1d0b13454c043;hb=HEAD#l107) source code file. Let's look on it:\n\n```C\nSTATIC int LIBC_START_MAIN (int (*main) (int, char **, char **),\n \t\t\t                int argc,\n\t\t\t                char **argv,\n \t\t\t                __typeof (main) init,\n\t\t\t                void (*fini) (void),\n\t\t\t                void (*rtld_fini) (void),\n\t\t\t                void *stack_end)\n```\n\nIt takes the address of the `main` function of a program, `argc` and `argv`. `init` and `fini` functions are constructor and destructor of the program. The `rtld_fini` is the termination function which will be called after the program will be exited to terminate and free its dynamic section. The last parameter of the `__libc_start_main` is a pointer to the stack of the program. Before we can call the `__libc_start_main` function, all of these parameters must be prepared and passed to it. Let's return to the [sysdeps/x86_64/start.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/start.S;h=f1b961f5ba2d6a1ebffee0005f43123c4352fbf4;hb=HEAD) assembly file and continue to see what happens before the `__libc_start_main` function will be called from there.\n\nWe can get all the arguments we need for `__libc_start_main` function from the stack. 
At the very beginning, when `_start` is called, our stack looks like:\n\n```\n+-----------------+\n|       NULL      |\n+-----------------+\n|       ...       |\n|       envp      |\n|       ...       |\n+-----------------+\n|       NULL      |\n+-----------------+\n|       ...       |\n|       argv      |\n|       ...       |\n+-----------------+\n|       argc      | <- rsp\n+-----------------+\n```\n\nAfter we cleared the `ebp` register and saved the address of the termination function in the `r9` register, we pop an element from the stack to the `rsi` register, so after this `rsp` will point to the `argv` array and `rsi` will contain the count of command line arguments passed to the program:\n\n```\n+-----------------+\n|       NULL      |\n+-----------------+\n|       ...       |\n|       envp      |\n|       ...       |\n+-----------------+\n|       NULL      |\n+-----------------+\n|       ...       |\n|       argv      |\n|       ...       | <- rsp\n+-----------------+\n```\n\nAfter this we move the address of the `argv` array to the `rdx` register:\n\n```assembly\npopq %rsi\nmov %RSP_LP, %RDX_LP\n```\n\nFrom this moment we have `argc` and `argv`. We still need to put pointers to the constructor and destructor in the appropriate registers and pass a pointer to the stack. In the first lines of the following code we align the stack to a `16`-byte boundary, as suggested in the [ABI](https://software.intel.com/sites/default/files/article/402129/mpx-linux64-abi.pdf), and push `rax`, which contains garbage:\n\n```assembly\nand  $~15, %RSP_LP\npushq %rax\n\npushq %rsp\nmov $__libc_csu_fini, %R8_LP\nmov $__libc_csu_init, %RCX_LP\nmov $main, %RDI_LP\n```\n\nAfter aligning the stack we push the address of the stack, move the addresses of the destructor and constructor to the `r8` and `rcx` registers respectively, and the address of the `main` symbol to `rdi`. 
From this moment we can call the `__libc_start_main` function from the [csu/libc-start.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/libc-start.c;h=0fb98f1606bab475ab5ba2d0fe08c64f83cce9df;hb=HEAD).\n\nBefore we look at the `__libc_start_main` function, let's add the `/lib64/crt1.o` and try to compile our program again:\n\n```\n$ gcc -nostdlib /lib64/crt1.o -lc -ggdb program.c -o program\n/lib64/crt1.o: In function `_start':\n(.text+0x12): undefined reference to `__libc_csu_fini'\n/lib64/crt1.o: In function `_start':\n(.text+0x19): undefined reference to `__libc_csu_init'\ncollect2: error: ld returned 1 exit status\n```\n\nNow we see another error: both the `__libc_csu_fini` and `__libc_csu_init` functions are not found. We know that the addresses of these two functions are passed to `__libc_start_main` as parameters and that these functions are the destructor and constructor of our program. But what do `constructor` and `destructor` mean in terms of a `C` program? We already saw the quote from the [ELF](http://flint.cs.yale.edu/cs422/doc/ELF_Format.pdf) specification:\n\n> After the dynamic linker has built the process image and performed the relocations, each shared object\n> gets the opportunity to execute some initialization code.\n> ...\n> Similarly, shared objects may have termination functions, which are executed with the atexit (BA_OS)\n> mechanism after the base process begins its termination sequence.\n\nSo the linker creates two special sections besides usual sections like `.text`, `.data` and others:\n\n* `.init`\n* `.fini`\n\nWe can find them with the `readelf` util:\n\n```\n$ readelf -e test | grep init\n  [11] .init             PROGBITS         00000000004003c8  000003c8\n\n$ readelf -e test | grep fini\n  [15] .fini             PROGBITS         0000000000400504  00000504\n```\n\nThese sections will be placed at the start and at the end of the binary image respectively, and contain routines which are called constructor and destructor. 
The main point of these routines is to do some initialization/finalization like initialization of global variables, such as [errno](http://man7.org/linux/man-pages/man3/errno.3.html), allocation and deallocation of memory for system routines and etc., before the actual code of a program is executed.\n\nYou may infer from the names of these functions, they will be called before the `main` function and after the `main` function. Definitions of `.init` and `.fini` sections are located in the `/lib64/crti.o` and if we add this object file:\n\n```\n$ gcc -nostdlib /lib64/crt1.o /lib64/crti.o  -lc -ggdb program.c -o program\n```\n\nwe will not get any errors. But let's try to run our program and see what happens:\n\n```\n$ ./program\nSegmentation fault (core dumped)\n```\n\nYeah, we got segmentation fault. Let's look inside of the `lib64/crti.o` with `objdump`:\n\n```\n$ objdump -D /lib64/crti.o\n\n/lib64/crti.o:     file format elf64-x86-64\n\n\nDisassembly of section .init:\n\n0000000000000000 <_init>:\n   0:\t48 83 ec 08          \tsub    $0x8,%rsp\n   4:\t48 8b 05 00 00 00 00 \tmov    0x0(%rip),%rax        # b <_init+0xb>\n   b:\t48 85 c0             \ttest   %rax,%rax\n   e:\t74 05                \tje     15 <_init+0x15>\n  10:\te8 00 00 00 00       \tcallq  15 <_init+0x15>\n\nDisassembly of section .fini:\n\n0000000000000000 <_fini>:\n   0:\t48 83 ec 08          \tsub    $0x8,%rsp\n```\n\nAs I wrote above, the `/lib64/crti.o` object file contains definition of the `.init` and `.fini` section, but also we can see here the stub for function. 
Let's look at the source code which is placed in the [sysdeps/x86_64/crti.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/crti.S;h=e9d86ed08ab134a540e3dae5f97a9afb82cdb993;hb=HEAD) source code file:\n\n```assembly\n\t.section .init,\"ax\",@progbits\n\t.p2align 2\n\t.globl _init\n\t.type _init, @function\n_init:\n\tsubq $8, %rsp\n\tmovq PREINIT_FUNCTION@GOTPCREL(%rip), %rax\n\ttestq %rax, %rax\n\tje .Lno_weak_fn\n\tcall *%rax\n.Lno_weak_fn:\n\tcall PREINIT_FUNCTION\n```\n\nIt contains the definition of the `.init` section and assembly code does 16-byte stack alignment and next we move address of the `PREINIT_FUNCTION` and if it is zero we don't call it:\n\n```\n00000000004003c8 <_init>:\n  4003c8:       48 83 ec 08             sub    $0x8,%rsp\n  4003cc:       48 8b 05 25 0c 20 00    mov    0x200c25(%rip),%rax        # 600ff8 <_DYNAMIC+0x1d0>\n  4003d3:       48 85 c0                test   %rax,%rax\n  4003d6:       74 05                   je     4003dd <_init+0x15>\n  4003d8:       e8 43 00 00 00          callq  400420 <__libc_start_main@plt+0x10>\n  4003dd:       48 83 c4 08             add    $0x8,%rsp\n  4003e1:       c3                      retq\n```\n\nwhere the `PREINIT_FUNCTION` is the `__gmon_start__` which does setup for profiling. You may note that we have no return instruction in the [sysdeps/x86_64/crti.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/crti.S;h=e9d86ed08ab134a540e3dae5f97a9afb82cdb993;hb=HEAD). Actually that's why we got a segmentation fault. 
The epilogue of `_init` and `_fini` is placed in the [sysdeps/x86_64/crtn.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/crtn.S;h=e9d86ed08ab134a540e3dae5f97a9afb82cdb993;hb=HEAD) assembly file:\n\n```assembly\n.section .init,\"ax\",@progbits\naddq $8, %rsp\nret\n\n.section .fini,\"ax\",@progbits\naddq $8, %rsp\nret\n```\n\nand if we add it to the compilation, our program will be successfully compiled and run!\n\n```\n$ gcc -nostdlib /lib64/crt1.o /lib64/crti.o /lib64/crtn.o  -lc -ggdb program.c -o program\n\n$ ./program\nx + y + z = 6\n```\n\nConclusion\n--------------------------------------------------------------------------------\n\nNow let's return to the `_start` function and try to go through the full chain of calls before the `main` of our program is called.\n\nThe `_start` is always placed at the beginning of the `.text` section in our programs by the linker, which uses the default `ld` script:\n\n```\n$ ld --verbose | grep ENTRY\nENTRY(_start)\n```\n\nThe `_start` function is defined in the [sysdeps/x86_64/start.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/start.S;h=f1b961f5ba2d6a1ebffee0005f43123c4352fbf4;hb=HEAD) assembly file and does preparation like getting `argc/argv` from the stack, stack preparation, etc., before the `__libc_start_main` function is called. 
The `__libc_start_main` function from the [csu/libc-start.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/libc-start.c;h=0fb98f1606bab475ab5ba2d0fe08c64f83cce9df;hb=HEAD) source code file registers the constructor and destructor of the application, which will be called before `main` and after it, starts up threading, does some security related actions like setting the stack canary if needed, calls initialization related routines and in the end it calls the `main` function of our application and exits with its result:\n\n```C\nresult = main (argc, argv, __environ MAIN_AUXVEC_PARAM);\nexit (result);\n```\n\nThat's all.\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [gdb](https://www.gnu.org/software/gdb/)\n* [execve](http://linux.die.net/man/2/execve)\n* [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [segment registers](https://en.wikipedia.org/wiki/X86_memory_segmentation)\n* [context switch](https://en.wikipedia.org/wiki/Context_switch)\n* [System V ABI](https://software.intel.com/sites/default/files/article/402129/mpx-linux64-abi.pdf)\n"
  },
  {
    "path": "README.md",
    "content": "# Linux insides\n\nThis repository contains a book-in-progress about the Linux kernel and its insides.\n\nThe goal of this project is simple – to share knowledge about the Linux kernel internals and related low-level topics. If you’re curious about what’s under the hood, see the [Table of Contents](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md).\n\n## Chapter status (kernel v6.19.0)\n\n> [!IMPORTANT]\n> I started writing this series when the latest version of the kernel was `3.18`. A lot has changed since then, and I am in progress of updating the content to reflect modern kernels (v6.18+). I’ll continue revising the posts as the kernel evolves.\n\n- [x] Booting (updated for v6.19.0)\n- [ ] Initialization (pending v6.18.0 review)\n- [ ] Interrupts (pending v6.18.0 review)\n- [ ] System calls (pending v6.18.0 review)\n- [ ] Timers and time management (pending v6.18.0 review)\n- [ ] Synchronization primitives (pending v6.18.0 review)\n- [ ] Memory management (pending v6.18.0 review)\n- [ ] Cgroups (pending v6.18.0 review)\n- [ ] SMP (pending v6.18.0 review)\n- [ ] Concepts (pending v6.18.0 review)\n- [ ] Data Structures in the Linux Kernel (pending v6.18.0 review)\n- [ ] Theory (pending v6.18.0 review)\n- [ ] Initial ram disk (pending v6.18.0 review)\n- [ ] Misc (pending v6.18.0 review)\n- [ ] KernelStructures (pending v6.18.0 review)\n\n## Requirements\n\n- Prior knowledge about the [Assembly language](https://en.wikipedia.org/wiki/Assembly_language)\n- Proficiency with the [C programming language](https://en.wikipedia.org/wiki/C_(programming_language))\n- Additionally, you can find lots of useful information about x86_64 processors in [Intel Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html)\n\n> [!TIP]\n> You can get started with Assembler with my other series of posts about the [Assembly programming](https://github.com/0xAX/asm).\n\n## Translations\n\nThanks to the 
volunteers, the posts about Linux are translated into different languages.\n\n> [!NOTE]\n> The translations may diverge from the original content.\n\n  * [Brazilian Portuguese](https://github.com/mauri870/linux-insides)\n  * [Chinese](https://github.com/hust-open-atom-club/linux-insides-zh)\n  * [Japanese](https://github.com/tkmru/linux-insides-ja)\n  * [Korean](https://github.com/junsooo/linux-insides-ko)\n  * [Russian](https://github.com/proninyaroslav/linux-insides-ru)\n  * [Spanish](https://github.com/leolas95/linux-insides)\n  * [Turkish](https://github.com/ayyucedemirbas/linux-insides_Turkish)\n\n## Contribution\n\nRead the [Contribution guide](./CONTRIBUTING.md) to learn how to contribute to the project. When contributing, make sure to follow the [Code of Conduct](./CODE_OF_CONDUCT.md).\n\nIf you have any questions or suggestions, feel free to ping me at Twitter [@0xAX](https://twitter.com/0xAX), add an [issue](https://github.com/0xAX/linux-insides/issues/new), or drop me an [email](mailto:anotherworldofworld@gmail.com).\n\n## Mailing list\n\nThere is a Google group mailing list (`kernelhacking@googlegroups.com`) for learning the kernel source code.\n\nTo join the group, send an email to `kernelhacking+subscribe@googlegroups.com`. You will receive a confirmation email. After replying to it, you will be added to the mailing list.\n\n> [!TIP]\n> If you have a Google account, you can simply open the [archive page](https://groups.google.com/forum/#!forum/kernelhacking) and click **Apply to join group**. You will be approved automatically.\n\n## License\n\nThis project is licensed under the [BY-NC-SA Creative Commons](http://creativecommons.org/licenses/by-nc-sa/4.0/).\n\n## Author\n\nThe technical content is written by [@0xAX](https://x.com/0xAX).\n\nAdditional big thanks to [@klaudiagrz](https://github.com/klaudiagrz) for text improvements.\n"
  },
  {
    "path": "SUMMARY.md",
    "content": "### Summary\n\n* [Booting](Booting/README.md)\n    * [From bootloader to kernel](Booting/linux-bootstrap-1.md)\n    * [First steps in the kernel setup code](Booting/linux-bootstrap-2.md)\n    * [Video mode initialization and transition to protected mode](Booting/linux-bootstrap-3.md)\n    * [Transition to 64-bit mode](Booting/linux-bootstrap-4.md)\n    * [Kernel decompression](Booting/linux-bootstrap-5.md)\n    * [Kernel load address randomization](Booting/linux-bootstrap-6.md)\n* [Initialization](Initialization/README.md)\n    * [First steps in the kernel](Initialization/linux-initialization-1.md)\n    * [Early interrupts handler](Initialization/linux-initialization-2.md)\n    * [Last preparations before the kernel entry point](Initialization/linux-initialization-3.md)\n    * [Kernel entry point](Initialization/linux-initialization-4.md)\n    * [Continue architecture-specific boot-time initializations](Initialization/linux-initialization-5.md)\n    * [Architecture-specific initializations, again...](Initialization/linux-initialization-6.md)\n    * [End of the architecture-specific initializations, almost...](Initialization/linux-initialization-7.md)\n    * [Scheduler initialization](Initialization/linux-initialization-8.md)\n    * [RCU initialization](Initialization/linux-initialization-9.md)\n    * [End of initialization](Initialization/linux-initialization-10.md)\n* [Interrupts](Interrupts/README.md)\n    * [Introduction](Interrupts/linux-interrupts-1.md)\n    * [Start to dive into interrupts](Interrupts/linux-interrupts-2.md)\n    * [Interrupt handlers](Interrupts/linux-interrupts-3.md)\n    * [Initialization of non-early interrupt gates](Interrupts/linux-interrupts-4.md)\n    * [Implementation of some exception handlers](Interrupts/linux-interrupts-5.md)\n    * [Handling Non-Maskable interrupts](Interrupts/linux-interrupts-6.md)\n    * [Dive into external hardware interrupts](Interrupts/linux-interrupts-7.md)\n    * [Initialization of external 
hardware interrupts structures](Interrupts/linux-interrupts-8.md)\n    * [Softirq, Tasklets and Workqueues](Interrupts/linux-interrupts-9.md)\n    * [Last part](Interrupts/linux-interrupts-10.md)\n* [System calls](SysCall/README.md)\n    * [Introduction to system calls](SysCall/linux-syscall-1.md)\n    * [How the Linux kernel handles a system call](SysCall/linux-syscall-2.md)\n    * [vsyscall and vDSO](SysCall/linux-syscall-3.md)\n    * [How the Linux kernel runs a program](SysCall/linux-syscall-4.md)\n    * [Implementation of the open system call](SysCall/linux-syscall-5.md)\n    * [Limits on resources in Linux](SysCall/linux-syscall-6.md)\n* [Timers and time management](Timers/README.md)\n    * [Introduction](Timers/linux-timers-1.md)\n    * [Clocksource framework](Timers/linux-timers-2.md)\n    * [The tick broadcast framework and dyntick](Timers/linux-timers-3.md)\n    * [Introduction to timers](Timers/linux-timers-4.md)\n    * [Clockevents framework](Timers/linux-timers-5.md)\n    * [x86 related clock sources](Timers/linux-timers-6.md)\n    * [Time related system calls](Timers/linux-timers-7.md)\n* [Synchronization primitives](SyncPrim/README.md)\n    * [Introduction to spinlocks](SyncPrim/linux-sync-1.md)\n    * [Queued spinlocks](SyncPrim/linux-sync-2.md)\n    * [Semaphores](SyncPrim/linux-sync-3.md)\n    * [Mutex](SyncPrim/linux-sync-4.md)\n    * [Reader/Writer semaphores](SyncPrim/linux-sync-5.md)\n    * [SeqLock](SyncPrim/linux-sync-6.md)\n    * [RCU]()\n    * [Lockdep]()\n* [Memory management](MM/README.md)\n    * [Memblock](MM/linux-mm-1.md)\n    * [Fixmaps and ioremap](MM/linux-mm-2.md)\n    * [kmemcheck](MM/linux-mm-3.md)\n* [Cgroups](Cgroups/README.md)\n    * [Introduction to Control Groups](Cgroups/linux-cgroups-1.md)\n* [SMP]()\n* [Concepts](Concepts/README.md)\n    * [Per-CPU variables](Concepts/linux-cpu-1.md)\n    * [Cpumasks](Concepts/linux-cpu-2.md)\n    * [The initcall mechanism](Concepts/linux-cpu-3.md)\n    * [Notification 
Chains](Concepts/linux-cpu-4.md)\n* [Data Structures in the Linux Kernel](DataStructures/README.md)\n    * [Doubly linked list](DataStructures/linux-datastructures-1.md)\n    * [Radix tree](DataStructures/linux-datastructures-2.md)\n    * [Bit arrays](DataStructures/linux-datastructures-3.md)\n* [Theory](Theory/README.md)\n    * [Paging](Theory/linux-theory-1.md)\n    * [Elf64](Theory/linux-theory-2.md)\n    * [Inline assembly](Theory/linux-theory-3.md)\n    * [CPUID]()\n    * [MSR]()\n* [Initial ram disk]()\n   * [initrd]()\n* [Misc](Misc/README.md)\n    * [Linux kernel development](Misc/linux-misc-1.md)\n    * [How the kernel is compiled](Misc/linux-misc-2.md)\n    * [Linkers](Misc/linux-misc-3.md)\n    * [Program startup process in userspace](Misc/linux-misc-4.md)\n    * [Write and Submit your first Linux kernel Patch]()\n    * [Data types in the kernel]()\n* [KernelStructures](KernelStructures/README.md)\n    * [IDT](KernelStructures/linux-kernelstructure-1.md)\n* [Useful links](LINKS.md)\n* [Contributors](contributors.md)\n"
  },
  {
    "path": "Scripts/README.md",
    "content": "# Scripts\n\n## Description\n\n`get_all_links.py` : collects the links from all Markdown files under the given path and checks whether each one is live or dead (requires a network connection)\n\n`latex.sh` : a script for converting Markdown files in each of the subdirectories into a unified PDF typeset in LaTeX\n\n## Usage\n\n`get_all_links.py` :\n\n```\n./get_all_links.py ../\n```\n\n`latex.sh` :\n\n```\n./latex.sh\n```\n"
  },
  {
    "path": "Scripts/get_all_links.py",
    "content": "#!/usr/bin/env python\n\nfrom __future__ import print_function\nfrom socket import timeout\n\nimport os\nimport sys\nimport codecs\nimport re\n\nimport markdown\n\ntry:\n    # compatible for python2\n    from urllib2 import urlopen\n    from urllib2 import HTTPError\n    from urllib2 import URLError\nexcept ImportError:\n    # compatible for python3\n    from urllib.request import urlopen\n    from urllib.error import HTTPError\n    from urllib.error import URLError\n\ndef check_live_url(url):\n\n    result = False\n    try:\n        ret = urlopen(url, timeout=2)\n        result = (ret.code == 200)\n    except HTTPError as e:\n        print(e, file=sys.stderr)\n    except URLError as e:\n        print(e, file=sys.stderr)\n    except timeout as e:\n        print(e, file=sys.stderr)\n    except Exception as e:\n        print(e, file=sys.stderr)\n\n    return result\n\n\ndef main(path):\n\n    filenames = []\n    for (dirpath, dnames, fnames) in os.walk(path):\n        for fname in fnames:\n            if fname.endswith('.md'):\n                filenames.append(os.sep.join([dirpath, fname]))\n\n    urls = []\n\n    for filename in filenames:\n        fd = codecs.open(filename, mode=\"r\", encoding=\"utf-8\")\n        for line in fd.readlines():\n            refs = re.findall(r'(?<=<a href=\")[^\"]*', markdown.markdown(line))\n            for ref in refs:\n                if ref not in urls:\n                    urls.append(ref)\n        fd.close()\n\n    for url in urls:\n        if not url.startswith(\"http\"):\n            print(\"markdown file name: \" + url)\n            continue\n        if check_live_url(url):\n            print(url)\n        else:\n            print(url, file=sys.stderr)\n\n\nif __name__ == '__main__':\n\n    if len(sys.argv) == 2:\n        main(sys.argv[1])\n    else:\n        print(\"Choose one path as argument one\")\n"
  },
  {
    "path": "Scripts/latex.sh",
    "content": "# latex.sh\n# A script for converting Markdown files in each of the subdirectories into a unified PDF typeset in LaTeX. \n# Requires TexLive, Pandoc templates and pdfunite. Not necessary if you just want to read the PDF, only if you're compiling it yourself.\n\n#!/bin/bash\nrm -r build \nmkdir build\nfor D in ../*; do\n    if [ -d \"$D\" ]\n    then\n        name=$(basename \"$D\")\n        echo \"Converting $name . . .\"\n        pandoc \"$D\"/README.md \"$D\"/linux-*.md \\\n           -o build/\"$name\".tex --template default\n    fi\ndone\n\ncd ./build || exit 1\nfor f in *.tex\ndo\n    pdflatex -interaction=nonstopmode \"$f\"\ndone\n\ncd ../ || exit 1\npandoc ../README.md ../SUMMARY.md ../CONTRIBUTING.md ../contributors.md \\\n   -o ./build/Preface.tex --template default\n\npdfunite ./build/*.pdf LinuxKernelInsides.pdf\n"
  },
  {
    "path": "SyncPrim/README.md",
    "content": "# Synchronization primitives in the Linux kernel.\n\nThis chapter describes synchronization primitives in the Linux kernel.\n\n* [Introduction to spinlocks](linux-sync-1.md) - the first part of this chapter describes the implementation of the spinlock mechanism in the Linux kernel.\n* [Queued spinlocks](linux-sync-2.md) - the second part describes another type of spinlocks - queued spinlocks.\n* [Semaphores](linux-sync-3.md) - this part describes the implementation of the `semaphore` synchronization primitive in the Linux kernel.\n* [Mutual exclusion](linux-sync-4.md) - this part describes the `mutex` synchronization primitive in the Linux kernel.\n* [Reader/Writer semaphores](linux-sync-5.md) - this part describes a special type of semaphores - `reader/writer` semaphores.\n* [Sequential locks](linux-sync-6.md) - this part describes sequential locks in the Linux kernel.\n"
  },
  {
    "path": "SyncPrim/linux-sync-1.md",
    "content": "Synchronization primitives in the Linux kernel. Part 1.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis part opens a new chapter in the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book. Timers and time management related stuff was described in the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/). Now it's time to move on to the next topic. As you probably recognized from the title, this chapter will describe the [synchronization](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) primitives in the Linux kernel.\n\nAs always, we will try to know what a `synchronization primitive` in general is before we deal with any synchronization-related issues. Actually, a synchronization primitive is a software mechanism, that ensures that two or more [parallel](https://en.wikipedia.org/wiki/Parallel_computing) processes or threads are not running simultaneously on the same code segment. For example, let's look at the following piece of code:\n\n```C\nmutex_lock(&clocksource_mutex);\n...\n...\n...\nclocksource_enqueue(cs);\nclocksource_enqueue_watchdog(cs);\nclocksource_select();\n...\n...\n...\nmutex_unlock(&clocksource_mutex);\n```\n\nfrom the [kernel/time/clocksource.c](https://github.com/torvalds/linux/blob/master/kernel/time/clocksource.c) source code file. This code is from the `__clocksource_register_scale` function which adds the given [clocksource](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2) to the clock sources list. This function produces different operations on a list with registered clock sources. For example, the `clocksource_enqueue` function adds the given clock source to the list with registered clocksources - `clocksource_list`. 
Note that these lines of code are wrapped in two functions: `mutex_lock` and `mutex_unlock`, each of which takes one parameter - the `clocksource_mutex` in our case.\n\nThese functions represent locking and unlocking based on the [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) synchronization primitive. Once `mutex_lock` is executed, it prevents two or more threads from executing this code until `mutex_unlock` is executed by the owner of the mutex. In other words, we prevent parallel operations on the `clocksource_list`. Why do we need a `mutex` here? What if two parallel processes try to register a clock source? As we already know, the `clocksource_enqueue` function adds the given clock source to the `clocksource_list` list right after a clock source in the list which has the biggest rating (a registered clock source which has the highest frequency in the system):\n\n```C\nstatic void clocksource_enqueue(struct clocksource *cs)\n{\n\tstruct list_head *entry = &clocksource_list;\n\tstruct clocksource *tmp;\n\n\tlist_for_each_entry(tmp, &clocksource_list, list) {\n\t\tif (tmp->rating < cs->rating)\n\t\t\tbreak;\n\t\tentry = &tmp->list;\n\t}\n\tlist_add(&cs->list, entry);\n}\n```\n\nIf two parallel processes try to do this simultaneously, both processes may find the same `entry`, and a [race condition](https://en.wikipedia.org/wiki/Race_condition) may occur; in other words, the second process which executes `list_add` will overwrite a clock source from the first process.\n\nBesides this simple example, synchronization primitives are ubiquitous in the Linux kernel. If we go through the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) or other chapters again, or if we look at the Linux kernel source code in general, we will meet many places like this. We will not consider how `mutex` is implemented in the Linux kernel. 
Actually, the Linux kernel provides a set of different synchronization primitives like:\n\n* `mutex`;\n* `semaphores`; \n* `seqlocks`;\n* `atomic operations`;\n* etc.\n\nWe will start this chapter from the `spinlock`.\n\nSpinlocks in the Linux kernel.\n--------------------------------------------------------------------------------\n\nThe `spinlock` is a low-level synchronization mechanism which in simple words, represents a variable which can be in two states:\n\n* `acquired`;\n* `released`.\n\nEach process which wants to acquire a `spinlock`, must write a value which represents `spinlock acquired` state to this variable and write `spinlock released` state to the variable. If a process tries to execute code which is protected by a `spinlock`, it will be locked while a process which holds this lock will release it. In this case all related operations must be [atomic](https://en.wikipedia.org/wiki/Linearizability) to prevent [race conditions](https://en.wikipedia.org/wiki/Race_condition) state. The `spinlock` is represented by the `spinlock_t` type in the Linux kernel. If we will look at the Linux kernel code, we will see that this type is [widely](http://lxr.free-electrons.com/ident?i=spinlock_t) used. The `spinlock_t` is defined as:\n\n```C\ntypedef struct spinlock {\n        union {\n              struct raw_spinlock rlock;\n \n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))\n                struct {\n                        u8 __padding[LOCK_PADSIZE];\n                        struct lockdep_map dep_map;\n                };\n#endif\n        };\n} spinlock_t;\n```\n\nand located in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock_types.h) header file. We may see that its implementation depends on the state of the `CONFIG_DEBUG_LOCK_ALLOC` kernel configuration option. We will skip this now, because all debugging related stuff will be in the end of this part. 
So, if the `CONFIG_DEBUG_LOCK_ALLOC` kernel configuration option is disabled, the `spinlock_t` contains [union](https://en.wikipedia.org/wiki/Union_type#C.2FC.2B.2B) with one field which is - `raw_spinlock`:\n\n```C\ntypedef struct spinlock {\n        union {\n              struct raw_spinlock rlock;\n        };\n} spinlock_t;\n```\n\nThe `raw_spinlock` structure defined in the [same](https://github.com/torvalds/linux/blob/master/include/linux/spinlock_types.h) header file represents the implementation of `normal` spinlock. Let's look how the `raw_spinlock` structure is defined:\n\n```C\ntypedef struct raw_spinlock {\n        arch_spinlock_t raw_lock;\n#ifdef CONFIG_DEBUG_SPINLOCK\n\tunsigned int magic, owner_cpu;\n\tvoid *owner;\n#endif\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n\tstruct lockdep_map dep_map;\n#endif\n} raw_spinlock_t;\n```\n\nwhere the `arch_spinlock_t` represents architecture-specific `spinlock` implementation. As we mentioned above, we will skip debugging kernel configuration options. As we focus on [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture in this book, the `arch_spinlock_t` that we will consider is defined in the [include/asm-generic/qspinlock_types.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock_types.h) header file and looks:\n\n```C\ntypedef struct qspinlock {\n        union {\n\t\tatomic_t val;\n\t\tstruct {\n\t\t\tu8\tlocked;\n\t\t\tu8\tpending;\n\t\t};\n\t\tstruct {\n\t\t\tu16\tlocked_pending;\n\t\t\tu16\ttail;\n\t\t};\n        };\n} arch_spinlock_t;\n```\n\nWe will not stop on this structures for now. Let's look at the operations on a `spinlock`. 
The Linux kernel provides following main operations on a `spinlock`:\n\n* `spin_lock_init` - produces initialization of the given `spinlock`;\n* `spin_lock` - acquires given `spinlock`;\n* `spin_lock_bh` - disables software [interrupts](https://en.wikipedia.org/wiki/Interrupt) and acquire given `spinlock`;\n* `spin_lock_irqsave` and `spin_lock_irq` - disable interrupts on local processor, preserve/not preserve previous interrupt state in the `flags` and acquire given `spinlock`;\n* `spin_unlock` - releases given `spinlock`;\n* `spin_unlock_bh` - releases given `spinlock` and enables software interrupts;\n* `spin_is_locked` - returns the state of the given `spinlock`;\n* and etc.\n\nLet's look on the implementation of the `spin_lock_init` macro. As I already wrote, this and other macro are defined in the [include/linux/spinlock.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock.h) header file and the `spin_lock_init` macro looks:\n\n```C\n#define spin_lock_init(_lock)\t\t\t\\\ndo {\t\t\t\t\t\t\\\n\tspinlock_check(_lock);\t\t        \\\n\traw_spin_lock_init(&(_lock)->rlock);\t\\\n} while (0)\n```\n\nAs we may see, the `spin_lock_init` macro takes a `spinlock` and executes two operations: check the given `spinlock` and execute the `raw_spin_lock_init`. The implementation of the `spinlock_check` is pretty easy, this function just returns the `raw_spinlock_t` of the given `spinlock` to be sure that we got exactly `normal` raw spinlock:\n\n```C\nstatic __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)\n{\n\treturn &lock->rlock;\n}\n```\n\nThe `raw_spin_lock_init` macro:\n\n```C\n# define raw_spin_lock_init(lock)\t\t\\\ndo {\t\t\t\t\t\t\\\n    *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock);\t\\\n} while (0)\t\t\t\t\t\\\n```\n\nassigns the value of the `__RAW_SPIN_LOCK_UNLOCKED` with the given `spinlock` to the given `raw_spinlock_t`. 
As we may understand from the name of the `__RAW_SPIN_LOCK_UNLOCKED` macro, this macro initializes the given `spinlock` and sets it to the `released` state. This macro is defined in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock_types.h) header file and expands to the following macros:\n\n```C\n#define __RAW_SPIN_LOCK_UNLOCKED(lockname)      \\\n         (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)\n\n#define __RAW_SPIN_LOCK_INITIALIZER(lockname)\t\t\t\\\n         {                                                      \\\n             .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,             \\\n             SPIN_DEBUG_INIT(lockname)                          \\\n             SPIN_DEP_MAP_INIT(lockname)                        \\\n         }\n```\n\nAs I already wrote above, we will not consider stuff which is related to debugging of synchronization primitives. In this case we will not consider the `SPIN_DEBUG_INIT` and the `SPIN_DEP_MAP_INIT` macros. So the `__RAW_SPIN_LOCK_UNLOCKED` macro will be expanded to:\n\n```C\n*(&(_lock)->rlock) = __ARCH_SPIN_LOCK_UNLOCKED;\n```\n\nwhere the `__ARCH_SPIN_LOCK_UNLOCKED` is:\n\n```C\n#define __ARCH_SPIN_LOCK_UNLOCKED       { { .val = ATOMIC_INIT(0) } }\n```\n\nfor the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. So, after the expansion of the `spin_lock_init` macro, a given `spinlock` will be initialized and its state will be - `unlocked`.\n\nFrom this moment we know how to initialize a `spinlock`, now let's consider the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulating `spinlocks`. The first is:\n\n```C\nstatic __always_inline void spin_lock(spinlock_t *lock)\n{\n\traw_spin_lock(&lock->rlock);\n}\n```\n\nfunction which allows us to `acquire` a `spinlock`. 
The `raw_spin_lock` macro is defined in the same header file and expands to the call of `_raw_spin_lock`:\n\n```C\n#define raw_spin_lock(lock)\t_raw_spin_lock(lock)\n```\n\nWhere `_raw_spin_lock` is defined depends on whether `CONFIG_SMP` option is set and `CONFIG_INLINE_SPIN_LOCK` option is set. If the [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) is disabled, `_raw_spin_lock` is defined in the [include/linux/spinlock_api_up.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock_api_up.h) header file as a macro and looks like:\n\n```C\n#define _raw_spin_lock(lock)\t__LOCK(lock)\n```\n\nIf the SMP is enabled and `CONFIG_INLINE_SPIN_LOCK` is set, it is defined in [include/linux/spinlock_api_smp.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock_api_smp.h) header file as the following:\n\n```C\n#define _raw_spin_lock(lock) __raw_spin_lock(lock)\n```\n\nIf the SMP is enabled and `CONFIG_INLINE_SPIN_LOCK` is not set, it is defined in [kernel/locking/spinlock.c](https://github.com/torvalds/linux/blob/master/kernel/locking/spinlock.c) source code file as the following:\n\n```C\nvoid __lockfunc _raw_spin_lock(raw_spinlock_t *lock)\n{\n\t__raw_spin_lock(lock);\n}\n```\n\nHere we will consider the latter form of `_raw_spin_lock`. 
The `__raw_spin_lock` function looks:\n\n```C\nstatic inline void __raw_spin_lock(raw_spinlock_t *lock)\n{\n        preempt_disable();\n        spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);\n        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);\n}\n```\n\nAs you may see, first of all we disable [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) by the call of the `preempt_disable` macro from the [include/linux/preempt.h](https://github.com/torvalds/linux/blob/master/include/linux/preempt.h) (more about this you may read in the ninth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) of the Linux kernel initialization process chapter). When we unlock the given `spinlock`, preemption will be enabled again:\n\n```C\nstatic inline void __raw_spin_unlock(raw_spinlock_t *lock)\n{\n        ...\n        ...\n        ...\n        preempt_enable();\n}\n```\n\nWe need to do this to prevent the process from other processes to preempt it while it is spinning on a lock. 
The `spin_acquire` macro which through a chain of other macros expands to the call of the:\n\n```C\n#define spin_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)\n#define lock_acquire_exclusive(l, s, t, n, i)           lock_acquire(l, s, t, 0, 1, n, i)\n```\n\nThe `lock_acquire` function:\n\n```C\nvoid lock_acquire(struct lockdep_map *lock, unsigned int subclass,\n                  int trylock, int read, int check,\n                  struct lockdep_map *nest_lock, unsigned long ip)\n{\n         unsigned long flags;\n\n         if (unlikely(current->lockdep_recursion))\n                return;\n \n         raw_local_irq_save(flags);\n         check_flags(flags);\n \n         current->lockdep_recursion = 1;\n         trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);\n         __lock_acquire(lock, subclass, trylock, read, check,\n                        irqs_disabled_flags(flags), nest_lock, ip, 0, 0);\n         current->lockdep_recursion = 0;\n         raw_local_irq_restore(flags);\n}\n```\n\nAs I wrote above, we will not consider stuff here which is related to debugging or tracing. The main point of the `lock_acquire` function is to disable hardware interrupts by the call of the `raw_local_irq_save` macro, because the given spinlock might be acquired with enabled hardware interrupts. In this way the process will not be preempted. Note that in the end of the `lock_acquire` function we will enable hardware interrupts again with the help of the `raw_local_irq_restore` macro. As you already may guess, the main work will be in the `__lock_acquire` function which is defined in the [kernel/locking/lockdep.c](https://github.com/torvalds/linux/blob/master/kernel/locking/lockdep.c) source code file.\n\nThe `__lock_acquire` function looks big. We will try to understand what this function does, but not in this part. 
Actually this function is mostly related to the Linux kernel [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) and it is not the topic of this part. If we return to the definition of the `__raw_spin_lock` function, we will see that it contains the following call at the end:\n\n```C\nLOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);\n```\n\nThe `LOCK_CONTENDED` macro is defined in the [include/linux/lockdep.h](https://github.com/torvalds/linux/blob/master/include/linux/lockdep.h) header file and just calls the given function with the given `spinlock`:\n\n```C\n#define LOCK_CONTENDED(_lock, try, lock) \\\n         lock(_lock)\n```\n\nIn our case, the `lock` is the `do_raw_spin_lock` function from the [include/linux/spinlock.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock.h) header file and the `_lock` is the given `raw_spinlock_t`:\n\n```C\nstatic inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)\n{\n        __acquire(lock);\n         arch_spin_lock(&lock->raw_lock);\n}\n```\n\nThe `__acquire` here is just a [Sparse](https://en.wikipedia.org/wiki/Sparse) related macro and we are not interested in it at this moment. The `arch_spin_lock` macro is defined in the [include/asm-generic/qspinlock.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock.h) header file as the following:\n\n```C\n#define arch_spin_lock(l)               queued_spin_lock(l)\n```\n\nWe stop here for this part. In the next part, we'll dive into how queued spinlocks work and related concepts.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis concludes the first part covering synchronization primitives in the Linux kernel. In this part, we met the first synchronization primitive provided by the Linux kernel - the `spinlock`. 
In the next part we will continue to dive into this interesting theme and will see other `synchronization` related stuff.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Concurrent computing](https://en.wikipedia.org/wiki/Concurrent_computing)\n* [Synchronization](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29)\n* [Clocksource framework](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2)\n* [Mutex](https://en.wikipedia.org/wiki/Mutual_exclusion)\n* [Race condition](https://en.wikipedia.org/wiki/Race_condition)\n* [Atomic operations](https://en.wikipedia.org/wiki/Linearizability)\n* [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64) \n* [Interrupts](https://en.wikipedia.org/wiki/Interrupt)\n* [Preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) \n* [Linux kernel lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [Sparse](https://en.wikipedia.org/wiki/Sparse)\n* [xadd instruction](http://x86.renejeschke.de/html/file_module_x86_id_327.html)\n* [NOP](https://en.wikipedia.org/wiki/NOP)\n* [Memory barriers](https://www.kernel.org/doc/Documentation/memory-barriers.txt)\n* [Previous chapter](https://0xax.gitbook.io/linux-insides/summary/timers/)\n"
  },
  {
    "path": "SyncPrim/linux-sync-2.md",
    "content": "Synchronization primitives in the Linux kernel. Part 2.\n================================================================================\n\nQueued Spinlocks\n--------------------------------------------------------------------------------\n\nThis is the second part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) which describes synchronization primitives in the Linux kernel.  In the first [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) of this chapter we meet the first [spinlock](https://en.wikipedia.org/wiki/Spinlock). We will continue to learn about this synchronization primitive here. If you have read the previous part, you may remember that besides normal spinlocks, the Linux kernel provides a special type of `spinlocks` - `queued spinlocks`. Here we will try to understand what this concept represents.\n\nWe saw the [API](https://en.wikipedia.org/wiki/Application_programming_interface) of `spinlock` in the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1):\n\n* `spin_lock_init` - produces initialization of the given `spinlock`;\n* `spin_lock` - acquires given `spinlock`;\n* `spin_lock_bh` - disables software [interrupts](https://en.wikipedia.org/wiki/Interrupt) and acquire given `spinlock`;\n* `spin_lock_irqsave` and `spin_lock_irq` - disable interrupts on local processor and preserve/not preserve previous interrupt state in the `flags`;\n* `spin_unlock` - releases given `spinlock` and acquire given `spinlock`;\n* `spin_unlock_bh` - releases given `spinlock` and enables software interrupts;\n* `spin_is_locked` - returns the state of the given `spinlock`;\n* and etc.\n\nAnd we know that all of these macros with the `arch_*` prefix which are defined in the [include/linux/spinlock.h](https://github.com/torvalds/linux/blob/master/include/linux/spinlock.h) header file will be expanded to the call of the functions  from the 
[include/asm-generic/qspinlock.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock.h):\n\n```C\n#define arch_spin_is_locked(l)          queued_spin_is_locked(l)\n#define arch_spin_is_contended(l)       queued_spin_is_contended(l)\n#define arch_spin_value_unlocked(l)     queued_spin_value_unlocked(l)\n#define arch_spin_lock(l)               queued_spin_lock(l)\n#define arch_spin_trylock(l)            queued_spin_trylock(l)\n#define arch_spin_unlock(l)             queued_spin_unlock(l)\n```\n\nBefore we consider how queued spinlocks and their [API](https://en.wikipedia.org/wiki/Application_programming_interface) are implemented, let's first take a look at the theory.\n\nIntroduction to queued spinlocks\n-------------------------------------------------------------------------------\n\nQueued spinlocks is a [locking mechanism](https://en.wikipedia.org/wiki/Lock_%28computer_science%29) in the Linux kernel which is replacement for the standard `spinlocks`. At least this is true for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. If we will look at the following kernel configuration file - [kernel/Kconfig.locks](https://github.com/torvalds/linux/blob/master/kernel/Kconfig.locks), we will see following configuration entries:\n\n```\nconfig ARCH_USE_QUEUED_SPINLOCKS\n\tbool\n\nconfig QUEUED_SPINLOCKS\n\tdef_bool y if ARCH_USE_QUEUED_SPINLOCKS\n\tdepends on SMP\n```\n\nThis means that the `CONFIG_QUEUED_SPINLOCKS` kernel configuration option will be enabled by default if the `ARCH_USE_QUEUED_SPINLOCKS` is enabled. 
We may see that the `ARCH_USE_QUEUED_SPINLOCKS` is enabled by default in the `x86_64` specific kernel configuration file - [arch/x86/Kconfig](https://github.com/torvalds/linux/blob/master/arch/x86/Kconfig):\n\n```\nconfig X86\n    ...\n    ...\n    ...\n    select ARCH_USE_QUEUED_SPINLOCKS\n    ...\n    ...\n    ...\n```\n\nBefore we start to consider what queued spinlock concept is, let's look on other types of `spinlocks`. For the start let's consider how a `normal` spinlock is implemented. Usually, the implementation of a `normal` spinlock is based on the [test and set](https://en.wikipedia.org/wiki/Test-and-set) instruction. The principle of how this instruction works is pretty simple. It writes a value to the memory location and returns the old value from it. Together these instructions are atomic i.e. non-interruptible instructions. So if the first thread starts to execute this instruction, second thread will wait until the first processor has finished its instruction. A basic lock can be built on top of this mechanism. Schematically it may look like this:\n\n```C\nint lock(lock)\n{\n    while (test_and_set(lock) == 1)\n        ;\n    return 0;\n}\n\nint unlock(lock)\n{\n    lock=0;\n\n    return lock;\n}\n```\n\nThe first thread will execute the `test_and_set` which will set the `lock` to `1`. When the second thread calls the `lock` function, it will spin in the `while` loop, until the first thread calls the `unlock` function and the `lock` will be equal to `0`. This implementation is not very good for performance reasons, due to (at least) two problems. The first problem is that this implementation may be unfair since other threads which arrived later at the lock may acquire it first. The second problem is that all threads which want to acquire a lock must execute many `atomic` operations like `test_and_set` on a variable which is in shared memory. 
This leads to the cache invalidation as the cache of the processor will store `lock=1`, but the value of the `lock` in memory may not be `1` after a thread will release this lock.\n\nThe topic of this part is `queued spinlocks`. This approach may help to solve both of these problems. The `queued spinlocks` allows each processor to spin while checking its own memory location. The basic principle of a queue-based spinlock can best be understood by studying a classic queue-based spinlock implementation called the [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) lock. Before we look at implementation of the `queued spinlocks` in the Linux kernel, we will try to understand how `MCS` lock works.\n\nThe basic idea of the `MCS` lock is that a thread spins on a local variable and each processor in the system has its own copy of this variable (see the previous paragraph). In other words this concept is built on top of the [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variables concept in the Linux kernel.\n\nWhen the first thread wants to acquire a lock, it registers itself in the `queue`. In other words it will be added to the special `queue` and will acquire lock, because it is free for now. When the second thread wants to acquire the same lock before the first thread releases it, this thread adds its own copy of the lock variable into this `queue`. In this case the first thread will contain a `next` field which will point to the second thread. From this moment, the second thread will wait until the first thread releases its lock and notifies `next` thread about this event. 
The first thread will be deleted from the `queue` and the second thread will be owner of a lock.\n\nSchematically we can represent it like:\n\nEmpty queue:\n\n```\n+---------+\n|         |\n|  Queue  |\n|         |\n+---------+\n```\n\nFirst thread tries to acquire a lock:\n\n```\n+---------+     +----------------------------+\n|         |     |                            |\n|  Queue  |---->| First thread acquired lock |\n|         |     |                            |\n+---------+     +----------------------------+\n```\n\nSecond thread tries to acquire a lock:\n\n```\n+---------+     +----------------------------------------+     +-------------------------+\n|         |     |                                        |     |                         |\n|  Queue  |---->|  Second thread waits for first thread  |<----| First thread holds lock |\n|         |     |                                        |     |                         |\n+---------+     +----------------------------------------+     +-------------------------+\n```\n\nOr the pseudocode:\n\n```C\nvoid lock(...)\n{\n    lock.next = NULL;\n    ancestor = put_lock_to_queue_and_return_ancestor(queue, lock);\n\n    // if we have ancestor, the lock is already acquired and we\n    // need to wait until it is released\n    if (ancestor)\n    {\n        lock.is_locked = 1;\n        ancestor.next = lock;\n\n        while (lock.is_locked == true)\n            ;\n    }\n\n    // otherwise we are owner of the lock and may exit\n}\n\nvoid unlock(...)\n{\n    // do we need to notify somebody or we are alone in the\n    // queue?\n    if (lock.next != NULL) {\n        // the while loop from the lock() function will be\n        // finished\n        lock.next.is_locked = false;\n    }\n\n    // So, we have no next threads in the queue to notify about\n    // lock releasing event. 
Let's just put `0` to the lock, will\n    // delete ourself from the queue and exit.\n}\n```\n\nThat's all we'll say about the theory of the `queued spinlocks`.  Now let's consider how this mechanism is implemented in the Linux kernel. Unlike above pseudocode, the implementation of the `queued spinlocks` looks complex and tangled. But the study with attention will lead to success.\n\nAPI of queued spinlocks\n-------------------------------------------------------------------------------\n\nNow that we know a little about `queued spinlocks` from the theoretical side, it's time to see the implementation of this mechanism in the Linux kernel. As we saw above, the [include/asm-generic/qspinlock.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock.h) header file provides a set of macros which represents the API for a spinlock acquiring, releasing, etc:\n\n```C\n#define arch_spin_is_locked(l)          queued_spin_is_locked(l)\n#define arch_spin_is_contended(l)       queued_spin_is_contended(l)\n#define arch_spin_value_unlocked(l)     queued_spin_value_unlocked(l)\n#define arch_spin_lock(l)               queued_spin_lock(l)\n#define arch_spin_trylock(l)            queued_spin_trylock(l)\n#define arch_spin_unlock(l)             queued_spin_unlock(l)\n```\n\nAll of these macros expand to the call of functions from the same header file. Additionally, we saw the `qspinlock` structure from the [include/asm-generic/qspinlock_types.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock_types.h) header file which represents a queued spinlock in the Linux kernel:\n\n```C\ntypedef struct qspinlock {\n\tunion {\n\t\tatomic_t val;\n\n\t\tstruct {\n\t\t\tu8\tlocked;\n\t\t\tu8\tpending;\n\t\t};\n\t\tstruct {\n\t\t\tu16\tlocked_pending;\n\t\t\tu16\ttail;\n\t\t};\n\t};\n} arch_spinlock_t;\n```\n\nThe `val` field represents the state of a given `spinlock`. 
This `4` bytes field consists of the following parts:\n\n* `0-7` - locked byte;\n* `8` - pending bit;\n* `9-15` - not used;\n* `16-17` - two bit index which represents entry of the `per-cpu` array of the `MCS` lock (will see it soon);\n* `18-31` - contains number of processor which indicates tail of the queue.\n\nBefore we move on to consider the `API` of `queued spinlocks`, notice the `val` field of the `qspinlock` structure has type - `atomic_t` which represents atomic variable aka a \"one operation at a time\" variable. So, all operations with this field will be [atomic](https://en.wikipedia.org/wiki/Linearizability). For example let's look at the reading value of the `val` API:\n\n```C\nstatic __always_inline int queued_spin_is_locked(struct qspinlock *lock)\n{\n\treturn atomic_read(&lock->val);\n}\n```\n\nOk, now we know the data structure which represents a queued spinlock in the Linux kernel and now is the time to look at the implementation of the main function from the `queued spinlocks` [API](https://en.wikipedia.org/wiki/Application_programming_interface):\n\n```C\n#define arch_spin_lock(l)               queued_spin_lock(l)\n```\n\nYes, this function is - `queued_spin_lock`. As we may understand from the function's name, it allows a thread to acquire a lock. This function is defined in the [include/asm-generic/qspinlock.h](https://github.com/torvalds/linux/blob/master/include/asm-generic/qspinlock.h) header file and its implementation is:\n\n```C\nstatic __always_inline void queued_spin_lock(struct qspinlock *lock)\n{\n        u32 val;\n\n        val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);\n        if (likely(val == 0))\n                 return;\n        queued_spin_lock_slowpath(lock, val);\n}\n```\n\nLooks pretty easy, except for the `queued_spin_lock_slowpath` function. We see that `queued_spin_lock` takes only one parameter. In our case this parameter represents `queued spinlock`, which will be locked. 
Let's consider the situation where `queue` with locks is empty for now and the first thread wanted to acquire lock. As we may see the `queued_spin_lock` function starts from the call of the `atomic_cmpxchg_acquire` macro. As you may guess from its name, it executes atomic [CMPXCHG](http://x86.renejeschke.de/html/file_module_x86_id_41.html) instruction. Ultimately, the `atomic_cmpxchg_acquire` macro expands to the call of the `__raw_cmpxchg` macro almost like the following:\n\n```C\n#define __raw_cmpxchg(ptr, old, new, size, lock)\t\t\\\n({\t\t\t\t\t\t\t\t\\\n\t__typeof__(*(ptr)) __ret;\t\t\t\t\\\n\t__typeof__(*(ptr)) __old = (old);\t\t\t\\\n\t__typeof__(*(ptr)) __new = (new);\t\t\t\\\n\t\t\t\t\t\t\t\t\\\n\tvolatile u32 *__ptr = (volatile u32 *)(ptr);\t\t\\\n\tasm volatile(lock \"cmpxchgl %2,%1\"\t\t\t\\\n\t\t     : \"=a\" (__ret), \"+m\" (*__ptr)\t\t\\\n\t\t     : \"r\" (__new), \"0\" (__old)\t\t\t\\\n\t\t     : \"memory\");\t\t\t\t\\\n\t\t\t\t\t\t\t\t\\\n\t__ret;\t\t\t\t\t\t\t\\\n})\n```\n\nwhich compares the `old` with the value pointed to by `ptr`.  If they are equal, it stores the `new` in the memory location which is pointed by the `ptr` and returns the initial value in this memory location.\n\nLet's back to the `queued_spin_lock` function. Assuming that we are the first one who tried to acquire the lock, the `val` will be zero and we will return from the `queued_spin_lock` function:\n\n```C\n\tval = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);\n\tif (likely(val == 0))\n\t\treturn;\n```\n\nSo far, we've only considered uncontended case (i.e. fast-path). Now let's consider contended case (i.e. slow-path). Suppose that one thread tried to acquire a lock, but the lock is already held, then `queued_spin_lock_slowpath` will be called. 
The `queued_spin_lock_slowpath` function is defined in the [kernel/locking/qspinlock.c](https://github.com/torvalds/linux/blob/master/kernel/locking/qspinlock.c) source code file:\n\n```C\nvoid queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)\n{\n\t...\n\t...\n\t...\n\tif (val == _Q_PENDING_VAL) {\n\t\tint cnt = _Q_PENDING_LOOPS;\n\t\tval = atomic_cond_read_relaxed(&lock->val,\n\t\t\t\t\t       (VAL != _Q_PENDING_VAL) || !cnt--);\n\t}\n\t...\n\t...\n\t...\n}\n```\n\nwhich waits for in-progress lock acquisition to be done with a bounded number of spins so that we guarantee forward progress. Above, we saw that the lock contains - pending bit. This bit represents thread which wanted to acquire lock, but it is already acquired by the other thread and `queue` is empty at the same time. In this case, the pending bit will be set and the `queue` will not be touched. This is done for optimization, because there is no need for the unnecessary latency which will be caused by the cache invalidation in a touching of own `mcs_spinlock` array.\n\nIf we observe contention, then we have no choice other than queueing, so jump to `queue` label that we'll see later:\n\n```C\n\tif (val & ~_Q_LOCKED_MASK)\n\t\tgoto queue;\n```\n\nSo, the lock is already held. That is, we set the pending bit of the lock:\n\n```C\n\tval = queued_fetch_set_pending_acquire(lock);\n```\n\nAgain if we observe contention, undo the pending and queue.\n\n```C\n\tif (unlikely(val & ~_Q_LOCKED_MASK)) {\n\t\tif (!(val & _Q_PENDING_MASK))\n\t\t\tclear_pending(lock);\n\t\tgoto queue;\n\t}\n```\n\nNow, we're pending, wait for the lock owner to release it.\n\n```C\n\tif (val & _Q_LOCKED_MASK)\n\t\tatomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));\n```\n\nWe are allowed to take the lock. So, we clear the pending bit and set the locked bit. 
Now we have nothing to do with the `queued_spin_lock_slowpath` function, return from it.\n\n```C\n\tclear_pending_set_locked(lock);\n\treturn;\n```\n\nBefore diving into queueing, we'll see about `MCS` lock mechanism first. As we already know, each processor in the system has own copy of the lock. The lock is represented by the following structure:\n\n```C\nstruct mcs_spinlock {\n       struct mcs_spinlock *next;\n       int locked;\n       int count;\n};\n```\n\nfrom the [kernel/locking/mcs_spinlock.h](https://github.com/torvalds/linux/blob/master/kernel/locking/mcs_spinlock.h) header file. The first field represents a pointer to the next thread in the `queue`. The second field represents the state of the current thread in the `queue`, where `1` is `lock` already acquired and `0` in other way. And the last field of the `mcs_spinlock` structure represents nested locks. To understand what nested lock is, imagine situation when a thread acquired lock, but was interrupted by the hardware [interrupt](https://en.wikipedia.org/wiki/Interrupt) and an [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler) tries to take a lock too. For this case, each processor has not just copy of the `mcs_spinlock` structure but array of these structures:\n\n```C\nstatic DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);\n```\n\nThis array allows to make four attempts of a lock acquisition for the four events in following contexts:\n\n* normal task context;\n* hardware interrupt context;\n* software interrupt context;\n* non-maskable interrupt context.\n\nNotice that we did not touch `queue` yet. We do not need it, because for two threads it just leads to unnecessary latency for memory access. In other case, the first thread may release it lock before this moment. In this case the `lock->val` will contain `_Q_LOCKED_VAL | _Q_PENDING_VAL` and we will start to build `queue`. 
We start to build `queue` by the getting the local copy of the `qnodes` array of the processor which executes thread and calculate `tail` which will indicate the tail of the `queue` and `idx` which represents an index of the `qnodes` array:\n\n```C\nqueue:\n\tnode = this_cpu_ptr(&qnodes[0].mcs);\n\tidx = node->count++;\n\ttail = encode_tail(smp_processor_id(), idx);\n\n\tnode = grab_mcs_node(node, idx);\n```\n\nAfter this, we set `locked` to zero because this thread didn't acquire lock yet and `next` to `NULL` because we don't know anything about other `queue` entries:\n\n```C\n\tnode->locked = 0;\n\tnode->next = NULL;\n```\n\nWe already touched `per-cpu` copy of the queue for the processor which executes current thread which wants to acquire lock, this means that owner of the lock may released it before this moment. So we may try to acquire lock again by the call of the `queued_spin_trylock` function:\n\n```C\n\tif (queued_spin_trylock(lock))\n\t\tgoto release;\n```\n\nIt does the almost same thing `queued_spin_lock` function does.\n\nIf the lock was successfully acquired we jump to the `release` label to release a node of the `queue`:\n\n```C\nrelease:\n\t__this_cpu_dec(qnodes[0].mcs.count);\n```\n\nbecause we no need in it anymore as lock is acquired. If the `queued_spin_trylock` was unsuccessful, we update tail of the queue:\n\n```C\n\told = xchg_tail(lock, tail);\n\tnext = NULL;\n```\n\nand retrieve previous tail. The next step is to check that `queue` is not empty. In this case we need to link previous entry with the new. While waiting for the MCS lock, the next pointer may have been set by another lock waiter. 
We optimistically load the next pointer & prefetch the cacheline for writing to reduce latency in the upcoming MCS unlock operation:\n\n```C\n\tif (old & _Q_TAIL_MASK) {\n\t\tprev = decode_tail(old);\n\t\tWRITE_ONCE(prev->next, node);\n\n\t\tarch_mcs_spin_lock_contended(&node->locked);\n\n\t\tnext = READ_ONCE(node->next);\n\t\tif (next)\n\t\t\tprefetchw(next);\n\t}\n```\n\nIf the new node was added, we prefetch cache line from memory pointed by the next queue entry with the [PREFETCHW](http://www.felixcloutier.com/x86/PREFETCHW.html) instruction. We preload this pointer now for optimization purpose. We just became a head of queue and this means that there is upcoming `MCS` unlock operation and the next entry will be touched.\n\nYes, from this moment we are in the head of the `queue`. But before we are able to acquire a lock, we need to wait at least two events: current owner of a lock will release it and the second thread with `pending` bit will acquire a lock too:\n\n```C\n\tval = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));\n```\n\nAfter both threads will release a lock, the head of the `queue` will hold a lock. In the end we just need to update the tail of the `queue` and remove current head from it.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the second part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock`. In this part we saw another implementation of the `spinlock` mechanism - `queued spinlock`. 
In the next part we will continue to dive into synchronization primitives in the Linux kernel.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [spinlock](https://en.wikipedia.org/wiki/Spinlock)\n* [interrupt](https://en.wikipedia.org/wiki/Interrupt)\n* [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [Test and Set](https://en.wikipedia.org/wiki/Test-and-set)\n* [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf)\n* [per-cpu variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [atomic instruction](https://en.wikipedia.org/wiki/Linearizability)\n* [CMPXCHG instruction](http://x86.renejeschke.de/html/file_module_x86_id_41.html)\n* [LOCK instruction](http://x86.renejeschke.de/html/file_module_x86_id_159.html)\n* [NOP instruction](https://en.wikipedia.org/wiki/NOP)\n* [PREFETCHW instruction](http://www.felixcloutier.com/x86/PREFETCHW.html)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1)\n"
  },
  {
    "path": "SyncPrim/linux-sync-3.md",
    "content": "Synchronization primitives in the Linux kernel. Part 3.\n================================================================================\n\nSemaphores\n--------------------------------------------------------------------------------\n\nThis is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) which describes synchronization primitives in the Linux kernel and in the previous part we saw special type of [spinlocks](https://en.wikipedia.org/wiki/Spinlock) - `queued spinlocks`. The previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-2) was the last part which describes `spinlocks` related stuff. So we need to go ahead.\n\nThe next [synchronization primitive](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) after `spinlock` which we will see in this part is [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29). We will start from theoretical side and will learn what is it `semaphore` and only after this, we will see how it is implemented in the Linux kernel as we did in the previous part.\n\nSo, let's start.\n\nIntroduction to the semaphores in the Linux kernel\n--------------------------------------------------------------------------------\n\nSo, what is it `semaphore`? As you may guess - `semaphore` is yet another mechanism for support of thread or process synchronization. The Linux kernel already provides implementation of one synchronization mechanism - `spinlocks`, why do we need in yet another one? To answer on this question we need to know details of both of these mechanisms. We already familiar with the `spinlocks`, so let's start from this mechanism.\n\n`spinlock` creates a lock which will be acquired to protect a shared resource from being modified by more than one process. As a result, other processes that try to acquire the current lock get stopped (aka \"spin-in-place\" or busy waiting). 
[Context switch](https://en.wikipedia.org/wiki/Context_switch) is not allowed because [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) is disabled to avoid [deadlocks](https://en.wikipedia.org/wiki/Deadlock). As a result, `spinlock` should only be used if the lock will only be acquired for a very short period of time, otherwise amount of busy waiting accumulated by other processes results in extremely inefficient operation. For locks that need to be acquired for a relatively long period of time, we turn to `semaphore`.\n\n[semaphores](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) is a good solution for locks which may be acquired for a long time. In other way this mechanism is not optimal for locks that acquired for a short time. To understand this, we need to know what is `semaphore`.\n\nAs usual synchronization primitive, a `semaphore` is based on a variable. This variable may be incremented or decremented and it's state will represent ability to acquire lock. Notice that value of the variable is not limited to `0` and `1`. There are two types of `semaphores`:\n\n* `binary semaphore`;\n* `normal semaphore`.\n\nIn the first case, value of `semaphore` may be only `1` or `0`. In the second case value of `semaphore` any non-negative number. If the value of `semaphore` is greater than `1` it is called as `counting semaphore` and it allows to acquire a lock to more than `1` process. This allows us to keep records of available resources, when `spinlock` allows to hold a lock only on one task. Besides all of this, one more important thing that `semaphore` allows to sleep. 
Moreover when processes waits for a lock which is acquired by other process, the [scheduler](https://en.wikipedia.org/wiki/Scheduling_%28computing%29) may switch on another process.\n\nSemaphore API\n--------------------------------------------------------------------------------\n\nSo, we know a little about `semaphores` from theoretical side, let's look on its implementation in the Linux kernel. All `semaphore` [API](https://en.wikipedia.org/wiki/Application_programming_interface) is located in the [include/linux/semaphore.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/semaphore.h) header file.\n\nWe may see that the `semaphore` mechanism is represented by the following structure:\n\n```C\nstruct semaphore {\n\traw_spinlock_t\t\tlock;\n\tunsigned int\t\tcount;\n\tstruct list_head\twait_list;\n};\n```\n\nin the Linux kernel. The `semaphore` structure consists of three fields:\n\n* `lock` - `spinlock` for a `semaphore` data protection;\n* `count` - amount available resources;\n* `wait_list` - list of processes which are waiting to acquire a lock.\n\nBefore we will consider an [API](https://en.wikipedia.org/wiki/Application_programming_interface) of the `semaphore` mechanism in the Linux kernel, we need to know how to initialize a `semaphore`. Actually the Linux kernel provides two approaches to execute initialization of the given `semaphore` structure. These methods allows to initialize a `semaphore` in a:\n\n* `statically`;\n* `dynamically`.\n\nways. Let's look at the first approach. We are able to initialize a `semaphore` statically with the `DEFINE_SEMAPHORE` macro:\n\n```C\n#define DEFINE_SEMAPHORE(name)  \\\n         struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)\n```\n\nas we may see, the `DEFINE_SEMAPHORE` macro provides ability to initialize only `binary` semaphore. 
The `DEFINE_SEMAPHORE` macro expands to the definition of the `semaphore` structure which is initialized with the `__SEMAPHORE_INITIALIZER` macro. Let's look at the implementation of this macro:\n\n```C\n#define __SEMAPHORE_INITIALIZER(name, n)              \\\n{                                                                       \\\n        .lock           = __RAW_SPIN_LOCK_UNLOCKED((name).lock),        \\\n        .count          = n,                                            \\\n        .wait_list      = LIST_HEAD_INIT((name).wait_list),             \\\n}\n```\n\nThe `__SEMAPHORE_INITIALIZER` macro takes the name of the future `semaphore` structure and does initialization of the fields of this structure. First of all we initialize a `spinlock` of the given `semaphore` with the `__RAW_SPIN_LOCK_UNLOCKED` macro. As you may remember from the [previous](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) parts, the `__RAW_SPIN_LOCK_UNLOCKED` is defined in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/spinlock_types.h) header file and expands to the `__ARCH_SPIN_LOCK_UNLOCKED` macro which just expands to zero or unlocked state:\n\n```C\n#define __ARCH_SPIN_LOCK_UNLOCKED       { { 0 } }\n```\n\nThe last two fields of the `semaphore` structure `count` and `wait_list` are initialized with the given value which represents count of available resources and empty [list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1).\n\nThe second way to initialize a `semaphore` structure is to pass the `semaphore` and number of available resources to the `sema_init` function which is defined in the [include/linux/semaphore.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/semaphore.h) header file:\n\n```C\nstatic inline void sema_init(struct semaphore *sem, int val)\n{\n       static struct 
lock_class_key __key;\n       *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);\n       lockdep_init_map(&sem->lock.dep_map, \"semaphore->lock\", &__key, 0);\n}\n```\n\nLet's consider implementation of this function. It looks pretty easy and actually it does almost the same. This function executes initialization of the given `semaphore` with the `__SEMAPHORE_INITIALIZER` macro which we just saw. As I already wrote in the previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), we will skip the stuff which is related to the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel.\n\nSo, from now we are able to initialize a `semaphore` let's look at how to lock and unlock. The Linux kernel provides following [API](https://en.wikipedia.org/wiki/Application_programming_interface) to manipulate `semaphores`:\n\n```\nvoid down(struct semaphore *sem);\nvoid up(struct semaphore *sem);\nint  down_interruptible(struct semaphore *sem);\nint  down_killable(struct semaphore *sem);\nint  down_trylock(struct semaphore *sem);\nint  down_timeout(struct semaphore *sem, long jiffies);\n```\n\nThe first two functions: `down` and `up` are for acquiring and releasing of the given `semaphore`. The `down_interruptible` function tries to acquire a `semaphore`. If this try was successful, the `count` field of the given `semaphore` will be decremented and lock will be acquired, in other way the task will be switched to the blocked state or in other words the `TASK_INTERRUPTIBLE` flag will be set. This `TASK_INTERRUPTIBLE` flag means that the process may be returned to the running state by a [signal](https://en.wikipedia.org/wiki/Unix_signal).\n\nThe `down_killable` function does the same as the `down_interruptible` function, but sets the `TASK_KILLABLE` flag for the current process. 
This means that the waiting process may be interrupted by the kill signal.\n\nThe `down_trylock` function is similar to the `spin_trylock` function. This function tries to acquire a lock and exits if this operation was unsuccessful. In this case the process which wants to acquire a lock will not wait. The last `down_timeout` function tries to acquire a lock. It will be interrupted in a waiting state when the given timeout expires. Additionally, you may notice that the timeout is in [jiffies](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1).\n\nWe just saw definitions of the `semaphore` [API](https://en.wikipedia.org/wiki/Application_programming_interface). We will start from the `down` function. This function is defined in the [kernel/locking/semaphore.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file. Let's look at the implementation of this function:\n\n```C\nvoid down(struct semaphore *sem)\n{\n        unsigned long flags;\n\n        raw_spin_lock_irqsave(&sem->lock, flags);\n        if (likely(sem->count > 0))\n                sem->count--;\n        else\n                __down(sem);\n        raw_spin_unlock_irqrestore(&sem->lock, flags);\n}\nEXPORT_SYMBOL(down);\n```\n\nWe may see the definition of the `flags` variable at the beginning of the `down` function. This variable will be passed to the `raw_spin_lock_irqsave` and `raw_spin_unlock_irqrestore` macros which are defined in the [include/linux/spinlock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/spinlock.h) header file and protect a counter of the given `semaphore` here. 
Actually both of these macros do the same as the `spin_lock` and `spin_unlock` macros, but additionally they save/restore the current value of the interrupt flags and disable [interrupts](https://en.wikipedia.org/wiki/Interrupt).\n\nAs you already may guess, the main work is done between the `raw_spin_lock_irqsave` and `raw_spin_unlock_irqrestore` macros in the `down` function. We compare the value of the `semaphore` counter with zero and if it is bigger than zero, we may decrement this counter. This means that we already acquired the lock. Otherwise the counter is zero. This means that all available resources are already in use and we need to wait to acquire this lock. As we may see, the `__down` function will be called in this case.\n\nThe `__down` function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file and its implementation looks:\n\n```C\nstatic noinline void __sched __down(struct semaphore *sem)\n{\n        __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);\n}\n```\n\nThe `__down` function just calls the `__down_common` function with three parameters:\n\n* `semaphore`;\n* `flag` - for the task;\n* `timeout` - maximum timeout to wait `semaphore`.\n\nBefore we will consider implementation of the `__down_common` function, notice that the implementation of the `down_interruptible`, `down_killable` and `down_timeout` functions is based on the `__down_common` too:\n\n```C\nstatic noinline int __sched __down_interruptible(struct semaphore *sem)\n{\n        return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);\n}\n```\n\nThe `__down_killable`:\n\n```C\nstatic noinline int __sched __down_killable(struct semaphore *sem)\n{\n        return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);\n}\n```\n\nAnd the `__down_timeout`:\n\n```C\nstatic noinline int __sched __down_timeout(struct semaphore *sem, long timeout)\n{\n        return __down_common(sem, 
TASK_UNINTERRUPTIBLE, timeout);\n}\n```\n\nNow let's look at the implementation of the `__down_common` function. This function is defined in the [kernel/locking/semaphore.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file too and starts from the definition of the two following local variables:\n\n```C\nstruct task_struct *task = current;\nstruct semaphore_waiter waiter;\n```\n\nThe first represents current task for the local processor which wants to acquire a lock. The `current` is a macro which is defined in the [arch/x86/include/asm/current.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/current.h) header file:\n\n```C\n#define current get_current()\n```\n\nWhere the `get_current` function returns value of the `current_task` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable:\n\n```C\nDECLARE_PER_CPU(struct task_struct *, current_task);\n\nstatic __always_inline struct task_struct *get_current(void)\n{\n        return this_cpu_read_stable(current_task);\n}\n```\n\nThe second variable is `waiter` represents an entry of a `semaphore.wait_list` list:\n\n```C\nstruct semaphore_waiter {\n        struct list_head list;\n        struct task_struct *task;\n        bool up;\n};\n```\n\nNext we add current task to the `wait_list` and fill `waiter` fields after definition of these variables:\n\n```C\nlist_add_tail(&waiter.list, &sem->wait_list);\nwaiter.task = task;\nwaiter.up = false;\n```\n\nIn the next step we join into the following infinite loop:\n\n```C\nfor (;;) {\n        if (signal_pending_state(state, task))\n            goto interrupted;\n\n        if (unlikely(timeout <= 0))\n            goto timed_out;\n\n        __set_task_state(task, state);\n\n        raw_spin_unlock_irq(&sem->lock);\n        timeout = schedule_timeout(timeout);\n        raw_spin_lock_irq(&sem->lock);\n\n        if 
(waiter.up)\n            return 0;\n}\n```\n\nIn the previous piece of code we set `waiter.up` to `false`. So, a task will spin in this loop while `up` will not be set to `true`. This loop starts from the check that the current task is in the `pending` state or in other words flags of this task contains `TASK_INTERRUPTIBLE` or `TASK_WAKEKILL` flag. As I already wrote above a task may be interrupted by [signal](https://en.wikipedia.org/wiki/Unix_signal) during wait of ability to acquire a lock. The `signal_pending_state` function is defined in the [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h) source code file and looks:\n\n```C\nstatic inline int signal_pending_state(long state, struct task_struct *p)\n{\n         if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))\n                 return 0;\n         if (!signal_pending(p))\n                 return 0;\n\n         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);\n}\n```\n\nWe check that the `state` [bitmask](https://en.wikipedia.org/wiki/Mask_%28computing%29) contains `TASK_INTERRUPTIBLE` or `TASK_WAKEKILL` bits and if the bitmask does not contain this bit we exit. At the next step we check that the given task has a pending signal and exit if there is not. In the end we just check `TASK_INTERRUPTIBLE` bit in the `state` bitmask again or the [SIGKILL](https://en.wikipedia.org/wiki/Unix_signal#SIGKILL) signal. So, if our task has a pending signal, we will jump at the `interrupted` label:\n\n```C\ninterrupted:\n    list_del(&waiter.list);\n    return -EINTR;\n```\n\nwhere we delete task from the list of lock waiters and return the `-EINTR` [error code](https://en.wikipedia.org/wiki/Errno.h). 
If a task has no pending signal, we check the given timeout and if it is less or equal zero:\n\n```C\nif (unlikely(timeout <= 0))\n    goto timed_out;\n```\n\nwe jump at the `timed_out` label:\n\n```C\ntimed_out:\n    list_del(&waiter.list);\n    return -ETIME;\n```\n\nWhere we do almost the same that we did in the `interrupted` label. We delete task from the list of lock waiters, but return the `-ETIME` error code. If a task has no pending signal and the given timeout is not expired yet, the given `state` will be set in the given task:\n\n```C\n__set_task_state(task, state);\n```\n\nand call the `schedule_timeout` function:\n\n```C\nraw_spin_unlock_irq(&sem->lock);\ntimeout = schedule_timeout(timeout);\nraw_spin_lock_irq(&sem->lock);\n```\n\nwhich is defined in the [kernel/time/timer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file. The `schedule_timeout` function makes the current task sleep until the given timeout.\n\nThat is all about the `__down_common` function. A task which wants to acquire a lock which is already acquired by another task will be spun in the infinite loop while it will not be interrupted by a signal, the given timeout will not be expired or the task which holds a lock will not release it. Now let's look at the implementation of the `up` function.\n\nThe `up` function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file as `down` function. As we already know, the main purpose of this function is to release a lock. 
This function looks:\n\n```C\nvoid up(struct semaphore *sem)\n{\n        unsigned long flags;\n\n        raw_spin_lock_irqsave(&sem->lock, flags);\n        if (likely(list_empty(&sem->wait_list)))\n                sem->count++;\n        else\n                __up(sem);\n        raw_spin_unlock_irqrestore(&sem->lock, flags);\n}\nEXPORT_SYMBOL(up);\n```\n\nIt looks almost the same as the `down` function. There are only two differences here. First of all we increment a counter of a `semaphore` if the list of waiters is empty. Otherwise we call the `__up` function from the same source code file. If the list of waiters is not empty we need to allow the first task from the list to acquire a lock:\n\n```C\nstatic noinline void __sched __up(struct semaphore *sem)\n{\n        struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,\n                                                struct semaphore_waiter, list);\n        list_del(&waiter->list);\n        waiter->up = true;\n        wake_up_process(waiter->task);\n}\n```\n\nHere we take the first task from the list of waiters, delete it from the list, and set its `waiter->up` to true. From this point the infinite loop from the `__down_common` function will be stopped. The `wake_up_process` function will be called in the end of the `__up` function. As you remember we called the `schedule_timeout` function in the infinite loop of the `__down_common` function. The `schedule_timeout` function makes the current task sleep until the given timeout expires. So, as our process may sleep right now, we need to wake it up. 
That's why we call the `wake_up_process` function from the [kernel/sched/core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/core.c) source code file.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the third part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the two previous parts we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock` and used for a very short time locks. In this part we saw yet another synchronization primitive - [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) which is used for long time locks as it leads to [context switch](https://en.wikipedia.org/wiki/Context_switch). In the next part we will continue to dive into synchronization primitives in the Linux kernel and will see next synchronization primitive - [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion).\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [spinlocks](https://en.wikipedia.org/wiki/Spinlock)\n* [synchronization primitive](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29)\n* [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29)\n* [context switch](https://en.wikipedia.org/wiki/Context_switch)\n* [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29)\n* [deadlocks](https://en.wikipedia.org/wiki/Deadlock)\n* [scheduler](https://en.wikipedia.org/wiki/Scheduling_%28computing%29)\n* [Doubly linked list in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1)\n* [jiffies](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1)\n* [interrupts](https://en.wikipedia.org/wiki/Interrupt)\n* [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [bitmask](https://en.wikipedia.org/wiki/Mask_%28computing%29)\n* [SIGKILL](https://en.wikipedia.org/wiki/Unix_signal#SIGKILL)\n* [errno](https://en.wikipedia.org/wiki/Errno.h)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-2)\n"
  },
  {
    "path": "SyncPrim/linux-sync-4.md",
    "content": "Synchronization primitives in the Linux kernel. Part 4.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the fourth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) which describes synchronization primitives in the Linux kernel and in the previous parts we finished to consider different types [spinlocks](https://en.wikipedia.org/wiki/Spinlock) and [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) synchronization primitives. We will continue to learn [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) in this part and consider yet another one which is called - [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) which is stands for `MUTual EXclusion`.\n\nAs in all previous parts of this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), we will try to consider this synchronization primitive from the theoretical side and only than we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) provided by the Linux kernel to manipulate with `mutexes`.\n\nSo, let's start.\n\nConcept of `mutex`\n--------------------------------------------------------------------------------\n\nWe already familiar with the [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) synchronization primitive from the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3). It represented by the:\n\n```C\nstruct semaphore {\n\traw_spinlock_t\t\tlock;\n\tunsigned int\t\tcount;\n\tstruct list_head\twait_list;\n};\n```\n\nstructure which holds information about state of a [lock](https://en.wikipedia.org/wiki/Lock_%28computer_science%29) and list of a lock waiters. 
Depending on the value of the `count` field, a `semaphore` can provide access to a resource to more than one process wishing to access this resource. The [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) concept is very similar to a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) concept. But it has some differences. The main difference between `semaphore` and `mutex` synchronization primitive is that `mutex` has more strict semantic. Unlike a `semaphore`, only one [process](https://en.wikipedia.org/wiki/Process_%28computing%29) may hold `mutex` at one time and only the `owner` of a `mutex` may release or unlock it. An additional difference is in the implementation of the `lock` [API](https://en.wikipedia.org/wiki/Application_programming_interface). The `semaphore` synchronization primitive forces rescheduling of processes which are in waiters list. The implementation of the `mutex` lock `API` allows avoiding this situation and its expensive [context switches](https://en.wikipedia.org/wiki/Context_switch).\n\nThe `mutex` synchronization primitive is represented by the following:\n\n```C\nstruct mutex {\n        atomic_t                count;\n        spinlock_t              wait_lock;\n        struct list_head        wait_list;\n#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)\n        struct task_struct      *owner;\n#endif\n#ifdef CONFIG_MUTEX_SPIN_ON_OWNER\n        struct optimistic_spin_queue osq;\n#endif\n#ifdef CONFIG_DEBUG_MUTEXES\n        void                    *magic;\n#endif\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n        struct lockdep_map      dep_map;\n#endif\n};\n```\n\nstructure in the Linux kernel. This structure is defined in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and contains a set of fields similar to the `semaphore` structure. The first field of the `mutex` structure is - `count`. 
Value of this field represents state of a `mutex`. In a case when the value of the `count` field is `1`, a `mutex` is in `unlocked` state. When the value of the `count` field is `zero`, a `mutex` is in the `locked` state. Additionally value of the `count` field may be `negative`. In this case a `mutex` is in the `locked` state and has possible waiters.\n\nThe next two fields of the `mutex` structure - `wait_lock` and `wait_list` are [spinlock](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) for the protection of a `wait queue` and list of waiters which represents this `wait queue` for a certain lock. As you may notice, the similarity of the `mutex` and `semaphore` structures ends. Remaining fields of the `mutex` structure, as we may see, depend on different configuration options of the Linux kernel.\n\nThe first field - `owner` represents [process](https://en.wikipedia.org/wiki/Process_%28computing%29) which acquired a lock. As we may see, existence of this field in the `mutex` structure depends on the `CONFIG_DEBUG_MUTEXES` or `CONFIG_MUTEX_SPIN_ON_OWNER` kernel configuration options. Main point of this field and the next `osq` fields is support of `optimistic spinning` which we will see later. The last two fields - `magic` and `dep_map` are used only in [debugging](https://en.wikipedia.org/wiki/Debugging) mode. The `magic` field is used to store `mutex` related information for debugging and the second field - `dep_map` (of type `struct lockdep_map`) is for [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel.\n\nNow, after we have considered the `mutex` structure, we may consider how this synchronization primitive works in the Linux kernel. As you may guess, a process which wants to acquire a lock must decrease the value of the `mutex->count` if possible. And if a process wants to release a lock, it must increase the same value. That's true. 
But as you may also guess, it is not so simple in the Linux kernel.\n\nActually, when a process tries to acquire a `mutex`, there are three possible paths:\n\n* `fastpath`;\n* `midpath`;\n* `slowpath`.\n\nwhich may be taken, depending on the current state of the `mutex`. The first path or `fastpath` is the fastest as you may understand from its name. Everything is easy in this case. Nobody acquired a `mutex`, so the value of the `count` field of the `mutex` structure may be directly decremented. In a case of unlocking of a `mutex`, the algorithm is the same. A process just increments the value of the `count` field of the `mutex` structure. Of course, all of these operations must be [atomic](https://en.wikipedia.org/wiki/Linearizability).\n\nYes, this looks pretty easy. But what happens if a process wants to acquire a `mutex` which is already acquired by another process? In this case, the control will be transferred to the second path - `midpath`. The `midpath` or `optimistic spinning` tries to [spin](https://en.wikipedia.org/wiki/Spinlock) with the already familiar [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) while the lock owner is running. This path will be executed only if there are no other processes ready to run that have higher priority. This path is called `optimistic` because the waiting task will not sleep or be rescheduled. This avoids an expensive [context switch](https://en.wikipedia.org/wiki/Context_switch).\n\nIn the last case, when the `fastpath` and `midpath` may not be executed, the last path - `slowpath` will be executed. This path acts like a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) lock. 
If the lock is unable to be acquired by a process, this process will be added to `wait queue` which is represented by the following:\n\n```C\nstruct mutex_waiter {\n        struct list_head        list;\n        struct task_struct      *task;\n#ifdef CONFIG_DEBUG_MUTEXES\n        void                    *magic;\n#endif\n};\n```\n\nstructure from the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and will sleep. Before we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) which is provided by the Linux kernel for manipulation of `mutexes`, let's consider the `mutex_waiter` structure. If you have read the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) of this chapter, you may notice that the `mutex_waiter` structure is similar to the `semaphore_waiter` structure from the [kernel/locking/semaphore.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file:\n\n```C\nstruct semaphore_waiter {\n        struct list_head list;\n        struct task_struct *task;\n        bool up;\n};\n```\n\nIt also contains `list` and `task` fields which represent entry of the mutex wait queue. The one difference here that the `mutex_waiter` does not contains `up` field, but contains the `magic` field which depends on the `CONFIG_DEBUG_MUTEXES` kernel configuration option and used to store a `mutex` related information for debugging purpose.\n\nNow we know what is a `mutex` and how it is represented the Linux kernel. 
In this case, we may go ahead and start to look at the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulation of `mutexes`.\n\nMutex API\n--------------------------------------------------------------------------------\n\nOk, in the previous paragraph we knew what is a `mutex` synchronization primitive and saw the `mutex` structure which represents `mutex` in the Linux kernel. Now it's time to consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) for manipulation of mutexes. Description of the `mutex` API is located in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file. As always, before we will consider how to acquire and release a `mutex`, we need to know how to initialize it.\n\nThere are two approaches to initializing a `mutex`. The first is to do it statically. For this purpose the Linux kernel provides following:\n\n```C\n#define DEFINE_MUTEX(mutexname) \\\n        struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)\n```\n\nmacro. Let's consider implementation of this macro. As we may see, the `DEFINE_MUTEX` macro takes name for the `mutex` and expands to the definition of the new `mutex` structure. Additionally new `mutex` structure get initialized with the `__MUTEX_INITIALIZER` macro. 
Let's look at the implementation of the `__MUTEX_INITIALIZER`:\n\n```C\n#define __MUTEX_INITIALIZER(lockname)         \\\n{                                                             \\\n       .count = ATOMIC_INIT(1),                               \\\n       .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock), \\\n       .wait_list = LIST_HEAD_INIT(lockname.wait_list)        \\\n}\n```\n\nThis macro is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and as we may understand it initializes fields of the `mutex` structure to their initial values. The `count` field get initialized with the `1` which represents `unlocked` state of a mutex. The `wait_lock` [spinlock](https://en.wikipedia.org/wiki/Spinlock) get initialized to the unlocked state and the last field `wait_list` to empty [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1).\n\nThe second approach allows us to initialize a `mutex` dynamically. To do this we need to call the `__mutex_init` function from the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. Actually, the `__mutex_init` function is rarely called directly. Instead of the `__mutex_init`, the:\n\n```C\n# define mutex_init(mutex)                \\\ndo {                                                    \\\n        static struct lock_class_key __key;             \\\n                                                        \\\n        __mutex_init((mutex), #mutex, &__key);          \\\n} while (0)\n```\n\nmacro is used. We may see that the `mutex_init` macro just defines the `lock_class_key` and call the `__mutex_init` function. 
Let's look at the implementation of this function:\n\n```C\nvoid\n__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)\n{\n        atomic_set(&lock->count, 1);\n        spin_lock_init(&lock->wait_lock);\n        INIT_LIST_HEAD(&lock->wait_list);\n        mutex_clear_owner(lock);\n#ifdef CONFIG_MUTEX_SPIN_ON_OWNER\n        osq_lock_init(&lock->osq);\n#endif\n        debug_mutex_init(lock, name, key);\n}\n```\n\nAs we may see the `__mutex_init` function takes three arguments:\n\n* `lock` - a mutex itself;\n* `name` - name of mutex for debugging purpose;\n* `key`  - key for [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt).\n\nAt the beginning of the `__mutex_init` function, we may see initialization of the `mutex` state. We set it to `unlocked` state with the `atomic_set` function which atomically sets the variable to the given value. After this we may see initialization of the `spinlock` to the unlocked state which will protect `wait queue` of the `mutex` and initialization of the `wait queue` of the `mutex`. After this we clear owner of the `lock` and initialize optimistic queue by the call of the `osq_lock_init` function from the [include/linux/osq_lock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/osq_lock.h) header file. 
This function just sets the tail of the optimistic queue to the unlocked state:\n\n```C\nstatic inline bool osq_is_locked(struct optimistic_spin_queue *lock)\n{\n        return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;\n}\n```\n\nIn the end of the `__mutex_init` function we may see the call of the `debug_mutex_init` function, but as I already wrote in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), we will not consider debugging related stuff in this chapter.\n\nAfter the `mutex` structure is initialized, we may go ahead and will look at the `lock` and `unlock` API of `mutex` synchronization primitive. Implementation of `mutex_lock` and `mutex_unlock` functions is located in the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. First of all let's start from the implementation of the `mutex_lock`. It looks:\n\n```C\nvoid __sched mutex_lock(struct mutex *lock)\n{\n        might_sleep();\n        __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);\n        mutex_set_owner(lock);\n}\n```\n\nWe may see the call of the `might_sleep` macro from the [include/linux/kernel.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/kernel.h) header file at the beginning of the `mutex_lock` function. Implementation of this macro depends on the `CONFIG_DEBUG_ATOMIC_SLEEP` kernel configuration option and if this option is enabled, this macro just prints a stack trace if it was executed in [atomic](https://en.wikipedia.org/wiki/Linearizability) context. This macro is helper for debugging purposes. In other way this macro does nothing.\n\nAfter the `might_sleep` macro, we may see the call of the `__mutex_fastpath_lock` function. 
This function is architecture-specific and as we consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture in this book, the implementation of the `__mutex_fastpath_lock` is located in the [arch/x86/include/asm/mutex_64.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/mutex_64.h) header file. As we may understand from the name of the `__mutex_fastpath_lock` function, this function will try to acquire lock in a fast path or in other words this function will try to decrement the value of the `count` of the given mutex.\n\nImplementation of the `__mutex_fastpath_lock` function consists of two parts. The first part is [inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3) statement. Let's look at it:\n\n```C\nasm_volatile_goto(LOCK_PREFIX \"   decl %0\\n\"\n                              \"   jns %l[exit]\\n\"\n                              : : \"m\" (v->counter)\n                              : \"memory\", \"cc\"\n                              : exit);\n```\n\nFirst of all, let's pay attention to the `asm_volatile_goto`. This macro is defined in the [include/linux/compiler-gcc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/compiler-gcc.h) header file and just expands to the two inline assembly statements:\n\n```C\n#define asm_volatile_goto(x...) do { asm goto(x); asm (\"\"); } while (0)\n```\n\nThe first assembly statement contains `goto` specificator and the second empty inline assembly statement is [barrier](https://en.wikipedia.org/wiki/Memory_barrier). Now let's return to the our inline assembly statement. 
As we may see it starts from the definition of the `LOCK_PREFIX` macro which just expands to the [lock](http://x86.renejeschke.de/html/file_module_x86_id_159.html) instruction:\n\n```C\n#define LOCK_PREFIX LOCK_PREFIX_HERE \"\\n\\tlock; \"\n```\n\nAs we already know from the previous parts, this instruction allows to execute prefixed instruction [atomically](https://en.wikipedia.org/wiki/Linearizability). So, at the first step in the our assembly statement we try decrement value of the given `mutex->counter`. At the next step the [jns](http://unixwiz.net/techtips/x86-jumps.html) instruction will execute jump at the `exit` label if the value of the decremented `mutex->counter` is not negative. The `exit` label is the second part of the `__mutex_fastpath_lock` function and it just points to the exit from this function:\n\n```C\nexit:\n        return;\n```\n\nFor this moment the implementation of the `__mutex_fastpath_lock` function looks pretty easy. But the value of the `mutex->counter` may be negative after decrement. In this case the: \n\n```C\nfail_fn(v);\n```\n\nwill be called after our inline assembly statement. The `fail_fn` is the second parameter of the `__mutex_fastpath_lock` function and represents pointer to function which represents `midpath/slowpath` paths to acquire the given lock. In our case the `fail_fn` is the `__mutex_lock_slowpath` function. Before we look at the implementation of the `__mutex_lock_slowpath` function, let's finish with the implementation of the `mutex_lock` function. In the simplest way, the lock will be acquired successfully by a process and the `__mutex_fastpath_lock` will be finished. In this case, we just call the\n\n```C\nmutex_set_owner(lock);\n```\n\nin the end of the `mutex_lock`. 
The `mutex_set_owner` function is defined in the [kernel/locking/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.h) header file and just sets owner of a lock to the current process:\n\n```C\nstatic inline void mutex_set_owner(struct mutex *lock)\n{\n        lock->owner = current;\n}\n```\n\nOtherwise, let's consider the situation when a process which wants to acquire a lock is unable to do it, because another process already acquired the same lock. We already know that the `__mutex_lock_slowpath` function will be called in this case. Let's consider implementation of this function. This function is defined in the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file and starts from the obtaining of the proper mutex by the mutex state given from the `__mutex_fastpath_lock` with the `container_of` macro:\n\n```C\n__visible void __sched\n__mutex_lock_slowpath(atomic_t *lock_count)\n{\n        struct mutex *lock = container_of(lock_count, struct mutex, count);\n\n        __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,\n                            NULL, _RET_IP_, NULL, 0);\n}\n```\n\nand calls the `__mutex_lock_common` function with the obtained `mutex`. The `__mutex_lock_common` function starts from [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) disabling until rescheduling:\n\n```C\npreempt_disable();\n```\n\nAfter this comes the stage of optimistic spinning. As we already know this stage depends on the `CONFIG_MUTEX_SPIN_ON_OWNER` kernel configuration option. 
If this option is disabled, we skip this stage and move to the last path - `slowpath` of a `mutex` acquisition:\n\n```C\nif (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {\n        preempt_enable();\n        return 0;\n}\n```\n\nFirst of all, `mutex_optimistic_spin` checks that we don't need to reschedule or in other words there are no other tasks ready to run that have higher priority. If this check was successful we need to update `MCS` lock wait queue with the current spinner. In this way only one spinner can compete for the mutex at one time:\n\n```C\nosq_lock(&lock->osq)\n```\n\nAt the next step we start to spin in the next loop:\n\n```C\nwhile (true) {\n    owner = READ_ONCE(lock->owner);\n\n    if (owner && !mutex_spin_on_owner(lock, owner))\n        break;\n\n    if (mutex_try_to_acquire(lock)) {\n        lock_acquired(&lock->dep_map, ip);\n\n        mutex_set_owner(lock);\n        osq_unlock(&lock->osq);\n        return true;\n    }\n}\n```\n\nand try to acquire a lock. First of all we try to take the current owner and, if the owner exists (it may not exist in a case when a process already released a mutex), we wait for it in the `mutex_spin_on_owner` function until the owner releases the lock. If a new task with higher priority has appeared while we were waiting for the lock owner, we break the loop and go to sleep. Otherwise, the owner may have already released the lock, so we try to acquire it with the `mutex_try_to_acquire` function. If this operation finished successfully, we set new owner for the given mutex, remove ourselves from the `MCS` wait queue and exit from the `mutex_optimistic_spin` function. 
At this stage, a lock will be acquired by a process and we enable [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) and exit from the `__mutex_lock_common` function:\n\n```C\nif (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {\n    preempt_enable();\n    return 0;\n}\n\n```\n\nThat's all for this case.\n\nIn other case all may not be so successful. For example new task may occur during we spinning in the loop from the `mutex_optimistic_spin` or even we may not get to this loop from the `mutex_optimistic_spin` in a case when there were task(s) with higher priority before this loop. Or finally the `CONFIG_MUTEX_SPIN_ON_OWNER` kernel configuration option disabled. In this case the `mutex_optimistic_spin` will do nothing:\n\n```C\n#ifndef CONFIG_MUTEX_SPIN_ON_OWNER\nstatic bool mutex_optimistic_spin(struct mutex *lock,\n                                  struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)\n{\n    return false;\n}\n#endif\n```\n\nIn all of these cases, the `__mutex_lock_common` function will act like a `semaphore`. We try to acquire a lock again because the owner of a lock might already release a lock before this time:\n\n```C\nif (!mutex_is_locked(lock) &&\n   (atomic_xchg_acquire(&lock->count, 0) == 1))\n      goto skip_wait;\n```\n\nIn a failure case the process which wants to acquire a lock will be added to the waiters list\n\n```C\nlist_add_tail(&waiter.list, &lock->wait_list);\nwaiter.task = task;\n```\n\nIn a successful case we update the owner of a lock, enable preemption and exit from the `__mutex_lock_common` function:\n\n```C\nskip_wait:\n        mutex_set_owner(lock);\n        preempt_enable();\n        return 0;\n```\n\nIn this case a lock will be acquired. 
If we can't acquire a lock for now, we enter into the following loop:\n\n```C\nfor (;;) {\n\n    if (atomic_read(&lock->count) >= 0 && (atomic_xchg_acquire(&lock->count, -1) == 1))\n        break;\n\n    if (unlikely(signal_pending_state(state, task))) {\n        ret = -EINTR;\n        goto err;\n    }\n\n    __set_task_state(task, state);\n\n     schedule_preempt_disabled();\n}\n```\n\nwhere we try to acquire a lock again and exit if this operation was successful. Yes, we try to acquire a lock again right after the unsuccessful try before the loop. We need to do it to make sure that we get a wakeup once a lock will be unlocked. Besides this, it allows us to acquire a lock after sleep. Otherwise we check the current process for pending [signals](https://en.wikipedia.org/wiki/Unix_signal) and exit if the process was interrupted by a `signal` while waiting for the lock acquisition. If we reach the end of the loop body, we didn't acquire the lock, so we set the task state to the given `state` (`TASK_UNINTERRUPTIBLE` in our case) and go to sleep with the call of the `schedule_preempt_disabled` function.\n\nThat's all. We have considered all three possible paths through which a process may pass when it will want to acquire a lock. Now let's consider how `mutex_unlock` is implemented. 
When the `mutex_unlock` is called by a process which wants to release a lock, the `__mutex_fastpath_unlock` will be called from the [arch/x86/include/asm/mutex_64.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/mutex_64.h) header file:\n\n```C\nvoid __sched mutex_unlock(struct mutex *lock)\n{\n    __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);\n}\n```\n\nImplementation of the `__mutex_fastpath_unlock` function is very similar to the implementation of the `__mutex_fastpath_lock` function:\n\n```C\nstatic inline void __mutex_fastpath_unlock(atomic_t *v,\n                                           void (*fail_fn)(atomic_t *))\n{\n       asm_volatile_goto(LOCK_PREFIX \"   incl %0\\n\"\n                         \"   jg %l[exit]\\n\"\n                         : : \"m\" (v->counter)\n                         : \"memory\", \"cc\"\n                         : exit);\n       fail_fn(v);\nexit:\n       return;\n}\n```\n\nActually, there is only one difference. We increment the value of the `mutex->count`. If the result is greater than zero, the `mutex` is in the `unlocked` state with nobody waiting and we just exit. Otherwise the `mutex` is released, but there is something in the `wait queue` and we need to update it. In this case the `fail_fn` function will be called which is `__mutex_unlock_slowpath`. 
The `__mutex_unlock_slowpath` function just gets the correct `mutex` instance by the given `mutex->count` and calls the `__mutex_unlock_common_slowpath` function:\n\n```C\n__mutex_unlock_slowpath(atomic_t *lock_count)\n{\n      struct mutex *lock = container_of(lock_count, struct mutex, count);\n\n      __mutex_unlock_common_slowpath(lock, 1);\n}\n```\n\nIn the `__mutex_unlock_common_slowpath` function we will get the first entry from the wait queue if the wait queue is not empty and wake up related process:\n\n```C\nif (!list_empty(&lock->wait_list)) {\n    struct mutex_waiter *waiter =\n           list_entry(lock->wait_list.next, struct mutex_waiter, list);\n                wake_up_process(waiter->task);\n}\n```\n\nAfter this, a mutex will be released by previous process and will be acquired by another process from a wait queue.\n\nThat's all. We have considered main `API` for manipulation with `mutexes`: `mutex_lock` and `mutex_unlock`. Besides this the Linux kernel provides following API:\n\n* `mutex_lock_interruptible`;\n* `mutex_lock_killable`;\n* `mutex_trylock`.\n\nand corresponding versions of `unlock` prefixed functions. This part will not describe this `API`, because it is similar to corresponding `API` of `semaphores`. More about it you may read in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3).\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the fourth part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In this part we met with new synchronization primitive which is called - `mutex`. From the theoretical side, this synchronization primitive very similar on a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29). Actually, `mutex` represents binary semaphore. 
But its implementation differs from the implementation of `semaphore` in the Linux kernel. In the next part we will continue to dive into synchronization primitives in the Linux kernel.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Mutex](https://en.wikipedia.org/wiki/Mutual_exclusion)\n* [Spinlock](https://en.wikipedia.org/wiki/Spinlock)\n* [Semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29)\n* [Synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [Locking mechanism](https://en.wikipedia.org/wiki/Lock_%28computer_science%29)\n* [Context switches](https://en.wikipedia.org/wiki/Context_switch)\n* [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [Atomic](https://en.wikipedia.org/wiki/Linearizability)\n* [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf)\n* [Doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [Inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3)\n* [Memory barrier](https://en.wikipedia.org/wiki/Memory_barrier)\n* [Lock instruction](http://x86.renejeschke.de/html/file_module_x86_id_159.html)\n* [JNS instruction](http://unixwiz.net/techtips/x86-jumps.html)\n* [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29)\n* 
[Unix signals](https://en.wikipedia.org/wiki/Unix_signal)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3)\n"
  },
  {
    "path": "SyncPrim/linux-sync-5.md",
    "content": "Synchronization primitives in the Linux kernel. Part 5.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the fifth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) which describes synchronization primitives in the Linux kernel and in the previous parts we finished to consider different types [spinlocks](https://en.wikipedia.org/wiki/Spinlock), [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) and [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) synchronization primitives. We will continue to learn [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) in this part and start to consider special type of synchronization primitives - [readers–writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock).\n\nThe first synchronization primitive of this type will be already familiar for us - [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29). As in all previous parts of this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), before we will consider implementation of the `reader/writer semaphores` in the Linux kernel, we will start from the theoretical side and will try to understand what is the difference between `reader/writer semaphores` and `normal semaphores`.\n\nSo, let's start.\n\nReader/Writer semaphore\n--------------------------------------------------------------------------------\n\nActually there are two types of operations may be performed on the data. We may read data and make changes in data. Two fundamental operations - `read` and `write`. Usually (but not always), `read` operation is performed more often than `write` operation. 
In this case, it would be logical to lock data in such way, that several processes may read the locked data at the same time, on condition that no one will change the data. The [readers/writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) allows us to get this lock.\n\nWhen a process wants to write something into data, all other `writer` and `reader` processes will be blocked until the process which acquired the lock releases it. When a process reads data, other processes which want to read the same data too, will not be locked and will be able to do this. As you may guess, implementation of the `reader/writer semaphore` is based on the implementation of the `normal semaphore`. We are already familiar with the [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) synchronization primitive from the third [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) of this chapter. From the theoretical side everything looks pretty simple. Let's look how `reader/writer semaphore` is represented in the Linux kernel.\n\nThe `semaphore` is represented by the:\n\n```C\nstruct semaphore {\n\traw_spinlock_t\t\tlock;\n\tunsigned int\t\tcount;\n\tstruct list_head\twait_list;\n};\n```\n\nstructure. If you will look in the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file, you will find definition of the `rw_semaphore` structure which represents `reader/writer semaphore` in the Linux kernel. 
Let's look at the definition of this structure:\n\n```C\n#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK\n#include <linux/rwsem-spinlock.h>\n#else\nstruct rw_semaphore {\n        long count;\n        struct list_head wait_list;\n        raw_spinlock_t wait_lock;\n#ifdef CONFIG_RWSEM_SPIN_ON_OWNER\n        struct optimistic_spin_queue osq;\n        struct task_struct *owner;\n#endif\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n        struct lockdep_map      dep_map;\n#endif\n};\n```\n\nBefore we will consider fields of the `rw_semaphore` structure, we may notice, that declaration of the `rw_semaphore` structure depends on the `CONFIG_RWSEM_GENERIC_SPINLOCK` kernel configuration option. This option is disabled for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture by default. We can be sure in this by looking at the corresponding kernel configuration file. In our case, this configuration file is - [arch/x86/um/Kconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/um/Kconfig):\n\n```\nconfig RWSEM_XCHGADD_ALGORITHM\n\tdef_bool 64BIT\n\nconfig RWSEM_GENERIC_SPINLOCK\n\tdef_bool !RWSEM_XCHGADD_ALGORITHM\n```\n\nSo, as this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) describes only [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related stuff, we will skip the case when the `CONFIG_RWSEM_GENERIC_SPINLOCK` kernel configuration is enabled and consider definition of the `rw_semaphore` structure only from the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file.\n\nIf we will take a look at the definition of the `rw_semaphore` structure, we will notice that first three fields are the same that in the `semaphore` structure. 
It contains `count` field which represents amount of available resources, the `wait_list` field which represents [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1) of processes which are waiting to acquire a lock and `wait_lock` [spinlock](https://en.wikipedia.org/wiki/Spinlock) for protection of this list. Notice that `rw_semaphore.count` field is `long` type unlike the same field in the `semaphore` structure.\n\nThe `count` field of a `rw_semaphore` structure may have following values:\n\n* `0x0000000000000000` - `reader/writer semaphore` is in unlocked state and no one is waiting for a lock;\n* `0x000000000000000X` - `X` readers are active or attempting to acquire a lock and no writer waiting;\n* `0xffffffff0000000X` - may represent different cases. The first is - `X` readers are active or attempting to acquire a lock with waiters for the lock. The second is - one writer attempting a lock, no waiters for the lock. And the last - one writer is active and no waiters for the lock;\n* `0xffffffff00000001` - may represented two different cases. The first is - one reader is active or attempting to acquire a lock and exist waiters for the lock. The second case is one writer is active or attempting to acquire a lock and no waiters for the lock;\n* `0xffffffff00000000` - represents situation when there are readers or writers are queued, but no one is active or is in the process of acquire of a lock;\n* `0xfffffffe00000001` - a writer is active or attempting to acquire a lock and waiters are in queue.\n\nSo, besides the `count` field, all of these fields are similar to fields of the `semaphore` structure. Last three fields depend on the two configuration options of the Linux kernel: the `CONFIG_RWSEM_SPIN_ON_OWNER` and `CONFIG_DEBUG_LOCK_ALLOC`. 
The first two fields may be familiar to us from the declaration of the [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) structure from the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). The first `osq` field represents [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) spinner for `optimistic spinning` and the second represents process which is current owner of a lock.\n\nThe last field of the `rw_semaphore` structure is - `dep_map` - debugging related, and as I already wrote in previous parts, we will skip debugging related stuff in this chapter.\n\nThat's all. Now we know a little about what a `reader/writer lock` is in general and what a `reader/writer semaphore` is in particular. Additionally we saw how a `reader/writer semaphore` is represented in the Linux kernel. In this case, we may go ahead and start to look at the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulation of `reader/writer semaphores`.\n\nReader/Writer semaphore API\n--------------------------------------------------------------------------------\n\nSo, we know a little about `reader/writer semaphores` from theoretical side, let's look at their implementation in the Linux kernel. All `reader/writer semaphores` related [API](https://en.wikipedia.org/wiki/Application_programming_interface) is located in the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file.\n\nAs always, before we consider an [API](https://en.wikipedia.org/wiki/Application_programming_interface) of the `reader/writer semaphore` mechanism in the Linux kernel, we need to know how to initialize the `rw_semaphore` structure. 
As we already saw in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), all [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) may be initialized in two ways:\n\n* `statically`;\n* `dynamically`.\n\nAnd `reader/writer semaphore` is not an exception. First of all, let's take a look at the first approach. We may initialize `rw_semaphore` structure with the help of the `DECLARE_RWSEM` macro in compile time. This macro is defined in the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file and looks:\n\n```C\n#define DECLARE_RWSEM(name) \\\n        struct rw_semaphore name = __RWSEM_INITIALIZER(name)\n```\n\nAs we may see, the `DECLARE_RWSEM` macro just expands to the definition of the `rw_semaphore` structure with the given name. Additionally new `rw_semaphore` structure is initialized with the value of the `__RWSEM_INITIALIZER` macro:\n\n```C\n#define __RWSEM_INITIALIZER(name)              \\\n{                                                              \\\n        .count = RWSEM_UNLOCKED_VALUE,                         \\\n        .wait_list = LIST_HEAD_INIT((name).wait_list),         \\\n        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)  \\\n         __RWSEM_OPT_INIT(name)                                \\\n         __RWSEM_DEP_MAP_INIT(name)\n}\n```\n\nand expands to the initialization of fields of `rw_semaphore` structure. 
First of all we initialize `count` field of the `rw_semaphore` structure to the `unlocked` state with `RWSEM_UNLOCKED_VALUE` macro from the [arch/x86/include/asm/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/rwsem.h) architecture specific header file:\n\n```C\n#define RWSEM_UNLOCKED_VALUE            0x00000000L\n```\n\nAfter this we initialize list of a lock waiters with the empty linked list and [spinlock](https://en.wikipedia.org/wiki/Spinlock) for protection of this list with the `unlocked` state too. The `__RWSEM_OPT_INIT` macro depends on the state of the `CONFIG_RWSEM_SPIN_ON_OWNER` kernel configuration option and if this option is enabled it expands to the initialization of the `osq` and `owner` fields of the `rw_semaphore` structure. As we already saw above, the `CONFIG_RWSEM_SPIN_ON_OWNER` kernel configuration option is enabled by default for [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so let's take a look at the definition of the `__RWSEM_OPT_INIT` macro:\n\n```C\n#ifdef CONFIG_RWSEM_SPIN_ON_OWNER\n    #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL\n#else\n    #define __RWSEM_OPT_INIT(lockname)\n#endif\n```\n\nAs we may see, the `__RWSEM_OPT_INIT` macro initializes the [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) with `unlocked` state and initial `owner` of a lock with `NULL`. From this moment, a `rw_semaphore` structure will be initialized in a compile time and may be used for data protection.\n\nThe second way to initialize a `rw_semaphore` structure is `dynamically` or use the `init_rwsem` macro from the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file. 
This macro declares an instance of the `lock_class_key` which is related to the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel and to the call of the `__init_rwsem` function with the given `reader/writer semaphore`:\n\n```C\n#define init_rwsem(sem)                         \\\ndo {                                                            \\\n        static struct lock_class_key __key;                     \\\n                                                                \\\n        __init_rwsem((sem), #sem, &__key);                      \\\n} while (0)\n```\n\nIf you will start definition of the `__init_rwsem` function, you will notice that there are couple of source code files which contain it. As you may guess, sometimes we need to initialize additional fields of the `rw_semaphore` structure, like the `osq` and `owner`. But sometimes not. All of this depends on some kernel configuration options. If we will look at the [kernel/locking/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/Makefile) makefile, we will see following lines:\n\n```Makefile\nobj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o\nobj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o\n```\n\nAs we already know, the Linux kernel for `x86_64` architecture has enabled `CONFIG_RWSEM_XCHGADD_ALGORITHM` kernel configuration option by default:\n\n```\nconfig RWSEM_XCHGADD_ALGORITHM\n\tdef_bool 64BIT\n```\n\nin the [arch/x86/um/Kconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/um/Kconfig) kernel configuration file. In this case, implementation of the `__init_rwsem` function will be located in the [kernel/locking/rwsem.c](https://github.com/torvalds/linux/blob/master/kernel/locking/rwsem.c) source code file for us. 
Let's take a look at this function:\n\n```C\nvoid __init_rwsem(struct rw_semaphore *sem, const char *name,\n                    struct lock_class_key *key)\n{\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n        debug_check_no_locks_freed((void *)sem, sizeof(*sem));\n        lockdep_init_map(&sem->dep_map, name, key, 0);\n#endif\n        sem->count = RWSEM_UNLOCKED_VALUE;\n        raw_spin_lock_init(&sem->wait_lock);\n        INIT_LIST_HEAD(&sem->wait_list);\n#ifdef CONFIG_RWSEM_SPIN_ON_OWNER\n        sem->owner = NULL;\n        osq_lock_init(&sem->osq);\n#endif\n}\n```\n\nWe may see here almost the same as in `__RWSEM_INITIALIZER` macro with difference that all of this will be executed in [runtime](https://en.wikipedia.org/wiki/Run_time_%28program_lifecycle_phase%29).\n\nSo, from now we are able to initialize a `reader/writer semaphore` let's look at the `lock` and `unlock` API. The Linux kernel provides following primary [API](https://en.wikipedia.org/wiki/Application_programming_interface) to manipulate `reader/writer semaphores`:\n\n* `void down_read(struct rw_semaphore *sem)` - lock for reading;\n* `int down_read_trylock(struct rw_semaphore *sem)` - try lock for reading;\n* `void down_write(struct rw_semaphore *sem)` - lock for writing;\n* `int down_write_trylock(struct rw_semaphore *sem)` - try lock for writing;\n* `void up_read(struct rw_semaphore *sem)` - release a read lock;\n* `void up_write(struct rw_semaphore *sem)` - release a write lock;\n\nLet's start as always from the locking. First of all let's consider implementation of the `down_write` function which executes a try of acquiring of a lock for `write`. 
This function is defined in the [kernel/locking/rwsem.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/rwsem.c) source code file and starts from the call of the macro from the [include/linux/kernel.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/kernel.h) header file:\n\n```C\nvoid __sched down_write(struct rw_semaphore *sem)\n{\n        might_sleep();\n        rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);\n\n        LOCK_CONTENDED(sem, __down_write_trylock, __down_write);\n        rwsem_set_owner(sem);\n}\n```\n\nWe already met the `might_sleep` macro in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). In short, implementation of the `might_sleep` macro depends on the `CONFIG_DEBUG_ATOMIC_SLEEP` kernel configuration option and if this option is enabled, this macro just prints a stack trace if it was executed in [atomic](https://en.wikipedia.org/wiki/Linearizability) context. As this macro is mostly for debugging purpose we will skip it and will go ahead. Additionally we will skip the next macro from the `down_write` function - `rwsem_acquire` which is related to the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel, because this is topic of other part.\n\nThe only two things that remained in the `down_write` function are the call of the `LOCK_CONTENDED` macro which is defined in the [include/linux/lockdep.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/lockdep.h) header file and setting of owner of a lock with the `rwsem_set_owner` function which sets owner to currently running process:\n\n```C\nstatic inline void rwsem_set_owner(struct rw_semaphore *sem)\n{\n        sem->owner = current;\n}\n```\n\nAs you already may guess, the `LOCK_CONTENDED` macro does all job for us. 
Let's look at the implementation of the `LOCK_CONTENDED` macro:\n\n```C\n#define LOCK_CONTENDED(_lock, try, lock) \\\n        lock(_lock)\n```\n\nAs we may see it just calls the `lock` function which is third parameter of the `LOCK_CONTENDED` macro with the given `rw_semaphore`. In our case the third parameter of the `LOCK_CONTENDED` macro is the `__down_write` function which is architecture specific function and located in the [arch/x86/include/asm/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/rwsem.h) header file. Let's look at the implementation of the `__down_write` function:\n\n```C\nstatic inline void __down_write(struct rw_semaphore *sem)\n{\n        __down_write_nested(sem, 0);\n}\n```\n\nwhich just executes a call of the `__down_write_nested` function from the same source code file. Let's take a look at the implementation of the `__down_write_nested` function:\n\n```C\nstatic inline void __down_write_nested(struct rw_semaphore *sem, int subclass)\n{\n        long tmp;\n\n        asm volatile(\"# beginning down_write\\n\\t\"\n                     LOCK_PREFIX \"  xadd      %1,(%2)\\n\\t\"\n                     \"  test \" __ASM_SEL(%w1,%k1) \",\" __ASM_SEL(%w1,%k1) \"\\n\\t\"\n                     \"  jz        1f\\n\"\n                     \"  call call_rwsem_down_write_failed\\n\"\n                     \"1:\\n\"\n                     \"# ending down_write\"\n                     : \"+m\" (sem->count), \"=d\" (tmp)\n                     : \"a\" (sem), \"1\" (RWSEM_ACTIVE_WRITE_BIAS)\n                     : \"memory\", \"cc\");\n}\n```\n\nAs for other synchronization primitives which we saw in this chapter, usually `lock/unlock` functions consists only from an [inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3) statement. As we may see, in our case the same for `__down_write_nested` function. Let's try to understand what does this function do. 
The first line of our assembly statement is just a comment, let's skip it. The second line contains `LOCK_PREFIX` which will be expanded to the [LOCK](http://x86.renejeschke.de/html/file_module_x86_id_159.html) instruction as we already know. The next [xadd](http://x86.renejeschke.de/html/file_module_x86_id_327.html) instruction executes `add` and `exchange` operations. In other words, `xadd` instruction adds value of the `RWSEM_ACTIVE_WRITE_BIAS`:\n\n```C\n#define RWSEM_ACTIVE_WRITE_BIAS         (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)\n\n#define RWSEM_WAITING_BIAS              (-RWSEM_ACTIVE_MASK-1)\n#define RWSEM_ACTIVE_BIAS               0x00000001L\n```\n\nor `0xffffffff00000001` to the `count` of the given `reader/writer semaphore` and returns previous value of it. After this we check the active mask in the previous value of the `rw_semaphore->count`. If it was zero, this means that nobody held the lock before, so we acquired it. Otherwise we call the `call_rwsem_down_write_failed` function from the [arch/x86/lib/rwsem.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/lib/rwsem.S) assembly file. 
The `call_rwsem_down_write_failed` function just calls the `rwsem_down_write_failed` function from the [kernel/locking/rwsem-xadd.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/rwsem-xadd.c) source code file after first saving the general purpose registers:\n\n```assembly\nENTRY(call_rwsem_down_write_failed)\n\tFRAME_BEGIN\n\tsave_common_regs\n\tmovq %rax,%rdi\n\tcall rwsem_down_write_failed\n\trestore_common_regs\n\tFRAME_END\n\tret\n    ENDPROC(call_rwsem_down_write_failed)\n```\n\nThe `rwsem_down_write_failed` function starts from the [atomic](https://en.wikipedia.org/wiki/Linearizability) update of the `count` value:\n\n```C\n __visible\nstruct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)\n{\n    count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);\n    ...\n    ...\n    ...\n}\n```\n\nwith the `-RWSEM_ACTIVE_WRITE_BIAS` value. The `rwsem_atomic_update` function is defined in the [arch/x86/include/asm/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/rwsem.h) header file and implements exchange and add logic:\n\n```C\nstatic inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)\n{\n        return delta + xadd(&sem->count, delta);\n}\n```\n\nHere `xadd` atomically adds the given delta to the `count` and returns the old value of the count. After this, `rwsem_atomic_update` just returns the sum of the given `delta` and the old value of the `count` field, i.e. the new value of the `count`. In our case we undo write bias from the `count` as we didn't acquire a lock. After this step we try to do `optimistic spinning` by the call of the `rwsem_optimistic_spin` function:\n\n```C\nif (rwsem_optimistic_spin(sem))\n      return sem;\n```\n\nWe will skip implementation of the `rwsem_optimistic_spin` function, as it is similar to the `mutex_optimistic_spin` function which we saw in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). 
In short words we check existence other tasks ready to run that have higher priority in the `rwsem_optimistic_spin` function. If there are such tasks, the process will be added to the [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) `waitqueue` and start to spin in the loop until a lock will be able to be acquired. If `optimistic spinning` is disabled, a process will be added to the `wait_list` and marked as waiting for write:\n\n```C\nwaiter.task = current;\nwaiter.type = RWSEM_WAITING_FOR_WRITE;\n\nif (list_empty(&sem->wait_list))\n    waiting = false;\n\nlist_add_tail(&waiter.list, &sem->wait_list);\n```\n\nwaiters list and start to wait until it will successfully acquire the lock. After we have added a process to the waiters list which was empty before this moment, we update the value of the `rw_semaphore->count` with the `RWSEM_WAITING_BIAS`:\n\n```C\ncount = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);\n```\n\nwith this we mark `rw_semaphore->counter` that it is already locked and exists/waits one `writer` which wants to acquire the lock. In other way we try to wake `reader` processes from the `wait queue` that were queued before this `writer` process and there are no active readers. In the end of the `rwsem_down_write_failed` a `writer` process will go to sleep which didn't acquire a lock in the following loop:\n\n```C\nwhile (true) {\n    if (rwsem_try_write_lock(count, sem))\n        break;\n    raw_spin_unlock_irq(&sem->wait_lock);\n    do {\n        schedule();\n        set_current_state(TASK_UNINTERRUPTIBLE);\n    } while ((count = sem->count) & RWSEM_ACTIVE_MASK);\n    raw_spin_lock_irq(&sem->wait_lock);\n}\n```\n\nI will skip explanation of this loop as we already met similar functional in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4).\n\nThat's all. From this moment, our `writer` process will acquire or not acquire a lock depends on the value of the `rw_semaphore->count` field. 
Now if we will look at the implementation of the `down_read` function which executes a try of acquiring of a lock. We will see similar actions which we saw in the `down_write` function. This function calls different debugging and lock validator related functions/macros:\n\n```C\nvoid __sched down_read(struct rw_semaphore *sem)\n{\n        might_sleep();\n        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);\n\n        LOCK_CONTENDED(sem, __down_read_trylock, __down_read);\n}\n```\n\nand does all job in the `__down_read` function. The `__down_read` consists of inline assembly statement:\n\n```C\nstatic inline void __down_read(struct rw_semaphore *sem)\n{\n         asm volatile(\"# beginning down_read\\n\\t\"\n                     LOCK_PREFIX _ASM_INC \"(%1)\\n\\t\"\n                     \"  jns        1f\\n\"\n                     \"  call call_rwsem_down_read_failed\\n\"\n                     \"1:\\n\\t\"\n                     \"# ending down_read\\n\\t\"\n                     : \"+m\" (sem->count)\n                     : \"a\" (sem)\n                     : \"memory\", \"cc\");\n}\n```\n\nwhich increments value of the given `rw_semaphore->count` and calls the `call_rwsem_down_read_failed` if this value is negative. In other way we jump at the label `1:` and exit. After this `read` lock will be successfully acquired. Notice that we check a sign of the `count` value as it may be negative, because as you may remember most significant [word](https://en.wikipedia.org/wiki/Word_%28computer_architecture%29) of the `rw_semaphore->count` contains negated number of active writers.\n\nLet's consider case when a process wants to acquire a lock for `read` operation, but it is already locked. In this case the `call_rwsem_down_read_failed` function from the [arch/x86/lib/rwsem.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/lib/rwsem.S)  assembly file will be called. 
If you will look at the implementation of this function, you will notice that it does the same that `call_rwsem_down_write_failed` function does. Except it calls the `rwsem_down_read_failed` function instead of `rwsem_down_write_failed`. Now let's consider implementation of the `rwsem_down_read_failed` function. It starts from the adding a process to the `wait queue` and updating of value of the `rw_semaphore->counter`:\n\n```C\nlong adjustment = -RWSEM_ACTIVE_READ_BIAS;\n\nwaiter.task = tsk;\nwaiter.type = RWSEM_WAITING_FOR_READ;\n\nif (list_empty(&sem->wait_list))\n    adjustment += RWSEM_WAITING_BIAS;\nlist_add_tail(&waiter.list, &sem->wait_list);\n\ncount = rwsem_atomic_update(adjustment, sem);\n```\n\nNotice that if the `wait queue` was empty before we clear the `rw_semaphore->counter` and undo `read` bias in other way. At the next step we check that there are no active locks and we are first in the `wait queue` we need to join currently active `reader` processes. In other way we go to sleep until a lock will not be able to acquired.\n\nThat's all. Now we know how `reader` and `writer` processes will behave in different cases during a lock acquisition. Now let's take a short look at `unlock` operations. The `up_read` and `up_write` functions allows us to unlock a `reader` or `writer` lock. First of all let's take a look at the implementation of the `up_write` function which is defined in the [kernel/locking/rwsem.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/rwsem.c) source code file:\n\n```C\nvoid up_write(struct rw_semaphore *sem)\n{\n        rwsem_release(&sem->dep_map, 1, _RET_IP_);\n\n        rwsem_clear_owner(sem);\n        __up_write(sem);\n}\n```\n\nFirst of all it calls the `rwsem_release` macro which is related to the lock validator of the Linux kernel, so we will skip it now. 
And at the next line the `rwsem_clear_owner` function which as you may understand from the name of this function, just clears the `owner` field of the given `rw_semaphore`:\n\n```C\nstatic inline void rwsem_clear_owner(struct rw_semaphore *sem)\n{\n\tsem->owner = NULL;\n}\n```\n\nThe `__up_write` function does all job of unlocking of the lock. The `__up_write` is architecture-specific function, so for our case it will be located in the [arch/x86/include/asm/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/rwsem.h) source code file. If we will take a look at the implementation of this function, we will see that it does almost the same that `__down_write` function, but conversely. Instead of adding of the `RWSEM_ACTIVE_WRITE_BIAS` to the `count`, we subtract the same value and check the `sign` of the previous value.\n\nIf the previous value of the `rw_semaphore->count` is not negative, a writer process released a lock and now it may be acquired by someone else. In other case, the `rw_semaphore->count` will contain negative values. This means that there is at least one `writer` in a wait queue. In this case the `call_rwsem_wake` function will be called. This function acts like similar functions which we already saw above. It stores general purpose registers at the stack for preserving and calls the `rwsem_wake` function.\n\nFirst of all the `rwsem_wake` function checks if a spinner is present. In this case it will just acquire a lock which is just released by lock owner. In other case there must be someone in the `wait queue` and we need to wake either a writer process if it exists at the top of the `wait queue` or all `reader` processes. The `up_read` function which releases a `reader` lock acts in similar way like `up_write`, but with a little difference. 
Instead of subtracting of `RWSEM_ACTIVE_WRITE_BIAS` from the `rw_semaphore->count`, it subtracts `1` from it, because less significant word of the `count` contains the number of active locks. After this it checks `sign` of the `count` and calls the `rwsem_wake` like `__up_write` if the `count` is negative or in other way lock will be successfully released.\n\nThat's all. We have considered API for manipulation with `reader/writer semaphore`: `up_read/up_write` and `down_read/down_write`. We saw that the Linux kernel provides additional API, besides these functions, like the `down_read_trylock`, `down_write_trylock` and etc. But I will not consider implementation of these functions in this part because it must be similar to what we have seen in this part except for a few subtleties.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the fifth part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In this part we met with special type of `semaphore` - `readers/writer` semaphore which provides access to data for multiple processes to read or for one process to write. In the next part we will continue to dive into synchronization primitives in the Linux kernel.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29)\n* [Readers/Writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock)\n* [Spinlocks](https://en.wikipedia.org/wiki/Spinlock)\n* [Semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29)\n* [Mutex](https://en.wikipedia.org/wiki/Mutual_exclusion)\n* [x86_64 architecture](https://en.wikipedia.org/wiki/X86-64)\n* [Doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1)\n* [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [Linux kernel lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [Atomic operations](https://en.wikipedia.org/wiki/Linearizability)\n* [Inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3)\n* [XADD instruction](http://x86.renejeschke.de/html/file_module_x86_id_327.html)\n* [LOCK instruction](http://x86.renejeschke.de/html/file_module_x86_id_159.html)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4)\n"
  },
  {
    "path": "SyncPrim/linux-sync-6.md",
    "content": "Synchronization primitives in the Linux kernel. Part 6.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the sixth part of the chapter which describes [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_(computer_science)) in the Linux kernel and in the previous parts we finished to consider different [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) synchronization primitives. We will continue to learn synchronization primitives in this part and start to consider a similar synchronization primitive which can be used to avoid the `writer starvation` problem. The name of this synchronization primitive is - `seqlock` or `sequential locks`.\n\nWe know from the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-5) that [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) is a special lock mechanism which allows concurrent access for read-only operations, but an exclusive lock is needed for writing or modifying data. As we may guess, it may lead to a problem which is called `writer starvation`. In other words, a writer process can't acquire a lock as long as at least one reader process which acquired a lock holds it. 
So, in the situation when contention is high, it will lead to a situation when a writer process which wants to acquire a lock will wait for it for a long time.\n\nThe `seqlock` synchronization primitive can help solve this problem.\n\nAs in all previous parts of this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), we will try to consider this synchronization primitive from the theoretical side and only then we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) provided by the Linux kernel to manipulate the `seqlocks`.\n\nSo, let's start.\n\nSequential lock\n--------------------------------------------------------------------------------\n\nSo, what is a `seqlock` synchronization primitive and how does it work? Let's try to answer these questions in this paragraph. Actually `sequential locks` were introduced in the Linux kernel 2.6.x. Main point of this synchronization primitive is to provide fast and lock-free access to shared resources. Since the heart of `sequential lock` synchronization primitive is [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) synchronization primitive, `sequential locks` work in situations where the protected resources are small and simple. Additionally write access must be rare and also should be fast.\n\nWork of this synchronization primitive is based on the sequence of events counter. Actually a `sequential lock` allows free access to a resource for readers, but each reader must check existence of conflicts with a writer. This synchronization primitive introduces a special counter. The main algorithm of work of `sequential locks` is simple: Each writer which acquired a sequential lock increments this counter and additionally acquires a [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1). 
When this writer finishes, it will release the acquired spinlock to give access to other writers and increment the counter of a sequential lock again.\n\nRead only access works on the following principle, it gets the value of a `sequential lock` counter before it will enter into [critical section](https://en.wikipedia.org/wiki/Critical_section) and compares it with the value of the same `sequential lock` counter at the exit of critical section. If their values are equal, this means that there weren't writers for this period. If their values are not equal, this means that a writer has incremented the counter during the [critical section](https://en.wikipedia.org/wiki/Critical_section). This conflict means that reading of protected data must be repeated.\n\nThat's all. As we may see principle of work of `sequential locks` is simple.\n\n```C\nunsigned int seq_counter_value;\n\ndo {\n    seq_counter_value = get_seq_counter_val(&the_lock);\n    //\n    // do as we want here\n    //\n} while (__retry__);\n```\n\nActually the Linux kernel does not provide `get_seq_counter_val()` function. Here it is just a stub. Like a `__retry__` too. As I already wrote above, we will see actual the [API](https://en.wikipedia.org/wiki/Application_programming_interface) for this in the next paragraph of this part.\n\nOk, now we know what a `seqlock` synchronization primitive is and how it is represented in the Linux kernel. In this case, we may go ahead and start to look at the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulation of synchronization primitives of this type.\n\nSequential lock API\n--------------------------------------------------------------------------------\n\nSo, now we know a little about `sequential lock` synchronization primitive from theoretical side, let's look at its implementation in the Linux kernel. 
All `sequential locks` [API](https://en.wikipedia.org/wiki/Application_programming_interface) are located in the [include/linux/seqlock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/seqlock.h) header file.\n\nFirst of all we may see that the a `sequential lock` mechanism is represented by the following type:\n\n```C\ntypedef struct {\n\tstruct seqcount seqcount;\n\tspinlock_t lock;\n} seqlock_t;\n```\n\nAs we may see the `seqlock_t` provides two fields. These fields represent a sequential lock counter, description of which we saw above and also a [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) which will protect data from other writers. Note that the `seqcount` counter represented as `seqcount` type. The `seqcount` is structure:\n\n```C\ntypedef struct seqcount {\n\tunsigned sequence;\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n\tstruct lockdep_map dep_map;\n#endif\n} seqcount_t;\n```\n\nwhich holds counter of a sequential lock and [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) related field.\n\nAs always in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), before we will consider an [API](https://en.wikipedia.org/wiki/Application_programming_interface) of `sequential lock` mechanism in the Linux kernel, we need to know how to initialize an instance of `seqlock_t`.\n\nWe saw in the previous parts that often the Linux kernel provides two approaches to execute initialization of the given synchronization primitive. The same situation with the `seqlock_t` structure. These approaches allows to initialize a `seqlock_t` in two following:\n\n* `statically`;\n* `dynamically`.\n\nways. Let's look at the first approach. 
We are able to initialize a `seqlock_t` statically with the `DEFINE_SEQLOCK` macro:\n\n```C\n#define DEFINE_SEQLOCK(x) \\\n\t\tseqlock_t x = __SEQLOCK_UNLOCKED(x)\n```\n\nwhich is defined in the [include/linux/seqlock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/seqlock.h) header file. As we may see, the `DEFINE_SEQLOCK` macro takes one argument and expands to the definition and initialization of the `seqlock_t` structure. Initialization occurs with the help of the `__SEQLOCK_UNLOCKED` macro which is defined in the same source code file. Let's look at the implementation of this macro:\n\n```C\n#define __SEQLOCK_UNLOCKED(lockname)\t\t\t\\\n\t{\t\t\t\t\t\t\\\n\t\t.seqcount = SEQCNT_ZERO(lockname),\t\\\n\t\t.lock =\t__SPIN_LOCK_UNLOCKED(lockname)\t\\\n\t}\n```\n\nAs we may see, the `__SEQLOCK_UNLOCKED` macro executes initialization of fields of the given `seqlock_t` structure. The first field is `seqcount` initialized with the `SEQCNT_ZERO` macro which expands to the:\n\n```C\n#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)}\n```\n\nSo we just initialize counter of the given sequential lock to zero and additionally we can see [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) related initialization which depends on the state of the `CONFIG_DEBUG_LOCK_ALLOC` kernel configuration option:\n\n```C\n#ifdef CONFIG_DEBUG_LOCK_ALLOC\n# define SEQCOUNT_DEP_MAP_INIT(lockname) \\\n    .dep_map = { .name = #lockname } \\\n    ...\n    ...\n    ...\n#else\n# define SEQCOUNT_DEP_MAP_INIT(lockname)\n    ...\n    ...\n    ...\n#endif\n```\n\nAs I already wrote in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) we will not consider [debugging](https://en.wikipedia.org/wiki/Debugging) and [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) related stuff in this part. 
So for now we just skip the `SEQCOUNT_DEP_MAP_INIT` macro. The second field of the given `seqlock_t` is `lock` initialized with the `__SPIN_LOCK_UNLOCKED` macro which is defined in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/spinlock_types.h) header file. We will not consider implementation of this macro here as it just initializes [rawspinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) with architecture-specific methods (More about spinlocks you may read in first parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim)).\n\nWe have considered the first way to initialize a sequential lock. Let's consider second way to do the same, but do it dynamically. We can initialize a sequential lock with the `seqlock_init` macro which is defined in the same  [include/linux/seqlock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/seqlock.h) header file.\n\nLet's look at the implementation of this macro:\n\n```C\n#define seqlock_init(x)\t\t\t\t\t\\\n\tdo {\t\t\t\t\t\t\\\n\t\tseqcount_init(&(x)->seqcount);\t\t\\\n\t\tspin_lock_init(&(x)->lock);\t\t\\\n\t} while (0)\n```\n\nAs we may see, the `seqlock_init` expands into two macros. The first macro `seqcount_init` takes counter of the given sequential lock and expands to the call of the `__seqcount_init` function:\n\n```C\n# define seqcount_init(s)\t\t\t\t\\\n\tdo {\t\t\t\t\t\t\\\n\t\tstatic struct lock_class_key __key;\t\\\n\t\t__seqcount_init((s), #s, &__key);\t\\\n\t} while (0)\n```\n\nfrom the same header file. This function\n\n```C\nstatic inline void __seqcount_init(seqcount_t *s, const char *name,\n\t\t\t\t\t  struct lock_class_key *key)\n{\n    lockdep_init_map(&s->dep_map, name, key, 0);\n    s->sequence = 0;\n}\n```\n\njust initializes counter of the given `seqcount_t` with zero. 
The second call from the `seqlock_init` macro is the call of the `spin_lock_init` macro which we saw in the [first part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) of this chapter.\n\nSo, now we know how to initialize a `sequential lock`, now let's look at how to use it. The Linux kernel provides following [API](https://en.wikipedia.org/wiki/Application_programming_interface) to manipulate `sequential locks`:\n\n```C\nstatic inline unsigned read_seqbegin(const seqlock_t *sl);\nstatic inline unsigned read_seqretry(const seqlock_t *sl, unsigned start);\nstatic inline void write_seqlock(seqlock_t *sl);\nstatic inline void write_sequnlock(seqlock_t *sl);\nstatic inline void write_seqlock_irq(seqlock_t *sl);\nstatic inline void write_sequnlock_irq(seqlock_t *sl);\nstatic inline void read_seqlock_excl(seqlock_t *sl)\nstatic inline void read_sequnlock_excl(seqlock_t *sl)\n```\n\nand others. Before we move on to considering the implementation of this [API](https://en.wikipedia.org/wiki/Application_programming_interface), we must know that there actually are two types of readers. The first type of reader never blocks a writer process. In this case writer will not wait for readers. The second type of reader which can lock. In this case, the locking reader will block the writer as it will wait while reader will not release its lock.\n\nFirst of all let's consider the first type of readers. 
The `read_seqbegin` function begins a seq-read [critical section](https://en.wikipedia.org/wiki/Critical_section).\n\nAs we may see this function just returns value of the `read_seqcount_begin` function:\n\n```C\nstatic inline unsigned read_seqbegin(const seqlock_t *sl)\n{\n\treturn read_seqcount_begin(&sl->seqcount);\n}\n```\n\nIn its turn the `read_seqcount_begin` function calls the `raw_read_seqcount_begin` function:\n\n```C\nstatic inline unsigned read_seqcount_begin(const seqcount_t *s)\n{\n\treturn raw_read_seqcount_begin(s);\n}\n```\n\nwhich just returns value of the `sequential lock` counter:\n\n```C\nstatic inline unsigned raw_read_seqcount(const seqcount_t *s)\n{\n\tunsigned ret = READ_ONCE(s->sequence);\n\tsmp_rmb();\n\treturn ret;\n}\n```\n\nAfter we have the initial value of the given `sequential lock` counter and did some stuff, we know from the previous paragraph of this function, that we need to compare it with the current value of the counter the same `sequential lock` before we will exit from the critical section. We can achieve this by the call of the `read_seqretry` function. This function takes a `sequential lock`, start value of the counter and through a chain of functions:\n\n```C\nstatic inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)\n{\n\treturn read_seqcount_retry(&sl->seqcount, start);\n}\n\nstatic inline int read_seqcount_retry(const seqcount_t *s, unsigned start)\n{\n\tsmp_rmb();\n\treturn __read_seqcount_retry(s, start);\n}\n```\n\nit calls the `__read_seqcount_retry` function:\n\n```C\nstatic inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)\n{\n\treturn unlikely(s->sequence != start);\n}\n```\n\nwhich just compares value of the counter of the given `sequential lock` with the initial value of this counter. If the initial value of the counter which is obtained from `read_seqbegin()` function is odd, this means that a writer was in the middle of updating the data when our reader began to act. 
In this case the value of the data can be in inconsistent state, so we need to try to read it again.\n\nThis is a common pattern in the Linux kernel. For example, you may remember the `jiffies` concept from the [first part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) of the [timers and time management in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/timers/) chapter. The sequential lock is used to obtain value of `jiffies` at [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture:\n\n```C\nu64 get_jiffies_64(void)\n{\n\tunsigned long seq;\n\tu64 ret;\n\n\tdo {\n\t\tseq = read_seqbegin(&jiffies_lock);\n\t\tret = jiffies_64;\n\t} while (read_seqretry(&jiffies_lock, seq));\n\treturn ret;\n}\n```\n\nHere we just read the value of the counter of the `jiffies_lock` sequential lock and then we write value of the `jiffies_64` system variable to the `ret`. As here we may see `do/while` loop, the body of the loop will be executed at least one time. So, as the body of loop was executed, we read and compare the current value of the counter of the `jiffies_lock` with the initial value. If these values are not equal, execution of the loop will be repeated, else `get_jiffies_64` will return its value in `ret`.\n\nWe just saw the first type of readers which do not block writer and other readers. Let's consider second type. It does not update value of a `sequential lock` counter, but just locks `spinlock`:\n\n```C\nstatic inline void read_seqlock_excl(seqlock_t *sl)\n{\n\tspin_lock(&sl->lock);\n}\n```\n\nSo, no other reader or writer can access the protected data. When a reader finishes, the lock must be unlocked with the:\n\n```C\nstatic inline void read_sequnlock_excl(seqlock_t *sl)\n{\n\tspin_unlock(&sl->lock);\n}\n```\n\nfunction.\n\nNow we know how `sequential lock` works for readers. Let's consider how a writer acts when it wants to acquire a `sequential lock` to modify data. 
To acquire a `sequential lock`, writer should use `write_seqlock` function. If we look at the implementation of this function:\n\n```C\nstatic inline void write_seqlock(seqlock_t *sl)\n{\n\tspin_lock(&sl->lock);\n\twrite_seqcount_begin(&sl->seqcount);\n}\n```\n\nWe will see that it acquires `spinlock` to prevent access from other writers and calls the `write_seqcount_begin` function. This function just increments value of the `sequential lock` counter:\n\n```C\nstatic inline void raw_write_seqcount_begin(seqcount_t *s)\n{\n\ts->sequence++;\n\tsmp_wmb();\n}\n```\n\nWhen a writer process will finish to modify data, the `write_sequnlock` function must be called to release a lock and give access to other writers or readers. Let's consider the implementation of the `write_sequnlock` function. It looks pretty simple:\n\n```C\nstatic inline void write_sequnlock(seqlock_t *sl)\n{\n\twrite_seqcount_end(&sl->seqcount);\n\tspin_unlock(&sl->lock);\n}\n```\n\nFirst of all it just calls `write_seqcount_end` function to increase value of the counter of the `sequential` lock again:\n\n```C\nstatic inline void raw_write_seqcount_end(seqcount_t *s)\n{\n\tsmp_wmb();\n\ts->sequence++;\n}\n```\n\nand in the end we just call the `spin_unlock` macro to give access for other readers or writers.\n\nThat's all about `sequential lock` mechanism in the Linux kernel. Of course we did not consider full [API](https://en.wikipedia.org/wiki/Application_programming_interface) of this mechanism in this part. But all other functions are based on these which we described here. 
For example, Linux kernel also provides some safe macros/functions to use `sequential lock` mechanism in [interrupt handlers](https://en.wikipedia.org/wiki/Interrupt_handler) of [softirq](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9): `write_seqlock_irq` and `write_sequnlock_irq`:\n\n```C\nstatic inline void write_seqlock_irq(seqlock_t *sl)\n{\n\tspin_lock_irq(&sl->lock);\n\twrite_seqcount_begin(&sl->seqcount);\n}\n\nstatic inline void write_sequnlock_irq(seqlock_t *sl)\n{\n\twrite_seqcount_end(&sl->seqcount);\n\tspin_unlock_irq(&sl->lock);\n}\n```\n\nAs we may see, these functions differ only in how the spinlock is acquired and released. They call `spin_lock_irq` and `spin_unlock_irq` instead of `spin_lock` and `spin_unlock`.\n\nOr for example `write_seqlock_irqsave` and `write_sequnlock_irqrestore` functions which are the same but use the `spin_lock_irqsave` and `spin_unlock_irqrestore` macros to use in [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_(PC_architecture)) handlers.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the sixth part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In this part we met with new synchronization primitive which is called - `sequential lock`. From the theoretical side, this synchronization primitive is very similar to a [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) synchronization primitive, but allows to avoid `writer-starving` issue.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_\\(computer_science\\))\n* [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock)\n* [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1)\n* [critical section](https://en.wikipedia.org/wiki/Critical_section)\n* [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [debugging](https://en.wikipedia.org/wiki/Debugging)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [Timers and time management in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/timers/)\n* [interrupt handlers](https://en.wikipedia.org/wiki/Interrupt_handler)\n* [softirq](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9)\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_\\(PC_architecture\\))\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-5)\n"
  },
  {
    "path": "SysCall/README.md",
    "content": "# System calls\n\nThis chapter describes the `system call` concept in the Linux kernel.\n\n* [Introduction to system call concept](linux-syscall-1.md) - this part is introduction to the `system call` concept in the Linux kernel.\n* [How the Linux kernel handles a system call](linux-syscall-2.md) - this part describes how the Linux kernel handles a system call from a userspace application.\n* [vsyscall and vDSO](linux-syscall-3.md) - third part describes `vsyscall` and `vDSO` concepts.\n* [How the Linux kernel runs a program](linux-syscall-4.md) - this part describes startup process of a program.\n* [Implementation of the open system call](linux-syscall-5.md) - this part describes implementation of the [open](http://man7.org/linux/man-pages/man2/open.2.html) system call.\n* [Limits on resources in Linux](linux-syscall-6.md) - this part describes implementation of the [getrlimit/setrlimit](https://linux.die.net/man/2/getrlimit) system calls.\n"
  },
  {
    "path": "SysCall/linux-syscall-1.md",
    "content": "System calls in the Linux kernel. Part 1.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis post opens up a new chapter in [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book, and as you may understand from the title, this chapter will be devoted to the [System call](https://en.wikipedia.org/wiki/System_call) concept in the Linux kernel. The choice of topic for this chapter is not accidental. In the previous [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) we saw interrupts and interrupt handling. The concept of system calls is very similar to that of interrupts. This is because the most common way to implement system calls is as software interrupts. We will see many different aspects that are related to the system call concept. For example, we will learn what's happening when a system call occurs from userspace. We will see an implementation of a couple system call handlers in the Linux kernel, [VDSO](https://en.wikipedia.org/wiki/VDSO) and [vsyscall](https://lwn.net/Articles/446528/) concepts and many many more.\n\nBefore we dive into Linux system call implementation, it is good to know some theory about system calls. Let's do it in the following paragraph.\n\nSystem call. What is it?\n--------------------------------------------------------------------------------\n\nA system call is just a userspace request of a kernel service. Yes, the operating system kernel provides many services. When your program wants to write to or read from a file, start to listen for connections on a [socket](https://en.wikipedia.org/wiki/Network_socket), delete or create directory, or even to finish its work, a program uses a system call. 
In other words, a system call is just a [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) kernel space function that user space programs call to handle some request.\n\nThe Linux kernel provides a set of these functions and each architecture provides its own set. For example: the [x86_64](https://en.wikipedia.org/wiki/X86-64) provides [322](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl) system calls and the [x86](https://en.wikipedia.org/wiki/X86) provides [358](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_32.tbl) different system calls. Ok, a system call is just a function. Let's look on a simple `Hello world` example that's written in the assembly programming language:\n\n```assembly\n.data\n\nmsg:\n    .ascii \"Hello, world!\\n\"\n    len = . - msg\n\n.text\n    .global _start\n\n_start:\n\tmovq  $1, %rax\n    movq  $1, %rdi\n    movq  $msg, %rsi\n    movq  $len, %rdx\n    syscall\n\n    movq  $60, %rax\n    xorq  %rdi, %rdi\n    syscall\n```\n\nWe can compile the above with the following commands:\n\n```\n$ gcc -c test.S\n$ ld -o test test.o\n```\n\nand run it as follows:\n\n```\n./test\nHello, world!\n```\n\nOk, what do we see here? This simple code represents `Hello world` assembly program for the Linux `x86_64` architecture. We can see two sections here:\n\n* `.data`\n* `.text`\n\nThe first section - `.data` stores initialized data of our program (`Hello world` string and its length in our case). The second section - `.text` contains the code of our program. We can split the code of our program into two parts: first part will be before the first `syscall` instruction and the second part will be between first and second `syscall` instructions. First of all what does the `syscall` instruction do in our code and generally? 
As we can read in the [64-ia-32-architectures-software-developer-vol-2b-manual](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html):\n\n```\nSYSCALL invokes an OS system-call handler at privilege level 0. It does so by\nloading RIP from the IA32_LSTAR MSR (after saving the address of the instruction\nfollowing SYSCALL into RCX). (The WRMSR instruction ensures that the\nIA32_LSTAR MSR always contain a canonical address.)\n...\n...\n...\nSYSCALL loads the CS and SS selectors with values derived from bits 47:32 of the\nIA32_STAR MSR. However, the CS and SS descriptor caches are not loaded from the\ndescriptors (in GDT or LDT) referenced by those selectors.\n\nInstead, the descriptor caches are loaded with fixed values. It is the respon-\nsibility of OS software to ensure that the descriptors (in GDT or LDT) referenced\nby those selector values correspond to the fixed values loaded into the descriptor\ncaches; the SYSCALL instruction does not ensure this correspondence.\n```\n\nTo summarize, the `syscall` instruction jumps to the address stored in the `MSR_LSTAR` [Model specific register](https://en.wikipedia.org/wiki/Model-specific_register) (Long system target address register). The kernel is responsible for providing its own custom function for handling syscalls as well as writing the address of this handler function to the `MSR_LSTAR` register upon system startup.\nThe custom function is `entry_SYSCALL_64`, which is defined in [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S#L98). 
The address of this syscall handling function is written to the `MSR_LSTAR` register during startup in [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c#L1335).\n```C\nwrmsrl(MSR_LSTAR, entry_SYSCALL_64);\n```\n\nSo, the `syscall` instruction invokes a handler of a given system call. But how does it know which handler to call? Actually it gets this information from the general purpose [registers](https://en.wikipedia.org/wiki/Processor_register). As you can see in the system call [table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl), each system call has a unique number. In our example the first system call is `write`, which writes data to the given file. Let's look in the system call table and try to find the `write` system call. As we can see, the [write](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L10) system call has number `1`. We pass the number of this system call through the `rax` register in our example. The next general purpose registers: `%rdi`, `%rsi`, and `%rdx` take the three parameters of the `write` syscall. In our case, they are:\n\n* [File descriptor](https://en.wikipedia.org/wiki/File_descriptor) (`1` is [stdout](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_.28stdout.29) in our case)\n* Pointer to our string\n* Size of data\n\nYes, you heard right. Parameters for a system call. As I already wrote above, a system call is just a `C` function in the kernel space. In our case the first system call is `write`. 
This system call defined in the [fs/read_write.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/read_write.c) source code file and looks like:\n\n```C\nSYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,\n\t\tsize_t, count)\n{\n\t...\n\t...\n\t...\n}\n```\n\nOr in other words:\n\n```C\nssize_t write(int fd, const void *buf, size_t nbytes);\n```\n\nDon't worry about the `SYSCALL_DEFINE3` macro for now, we'll come back to it.\n\nThe second part of our example is the same, but we call another system call. In this case we call the [exit](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L69) system call. This system call gets only one parameter:\n\n* Return value\n\nand handles the way our program exits. We can pass the program name of our program to the [strace](https://en.wikipedia.org/wiki/Strace) util and we will see our system calls:\n\n```\n$ strace test\nexecve(\"./test\", [\"./test\"], [/* 62 vars */]) = 0\nwrite(1, \"Hello, world!\\n\", 14Hello, world!\n)         = 14\n_exit(0)                                = ?\n\n+++ exited with 0 +++\n```\n\nIn the first line of the `strace` output, we can see the [execve](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L68) system call that executes our program, and the second and third are system calls that we have used in our program: `write` and `exit`. Note that we pass the parameter through the general purpose registers in our example. The order of the registers is not accidental. The order of the registers is defined by the following agreement - [x86-64 calling conventions](https://en.wikipedia.org/wiki/X86_calling_conventions#x86-64_calling_conventions). This, and the other agreement for the `x86_64` architecture are explained in the special document - [System V Application Binary Interface. 
PDF](https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-r252.pdf). In a general way, argument(s) of a function are placed either in registers or pushed on the stack. The right order is:\n\n* `rdi`\n* `rsi`\n* `rdx`\n* `rcx`\n* `r8`\n* `r9`\n\nfor the first six parameters of a function. If a function has more than six arguments, the remaining parameters will be placed on the stack.\n\nWe do not use system calls in our code directly, but our program uses them when we want to print something, check access to a file or just write or read something to it.\n\nFor example:\n\n```C\n#include <stdio.h>\n\nint main(int argc, char **argv)\n{\n   FILE *fp;\n   char buff[255];\n\n   fp = fopen(\"test.txt\", \"r\");\n   fgets(buff, 255, fp);\n   printf(\"%s\\n\", buff);\n   fclose(fp);\n\n   return 0;\n}\n```\n\nThere are no `fopen`, `fgets`, `printf`, and `fclose` system calls in the Linux kernel, but `open`, `read`, `write`, and `close` instead. I think you know that `fopen`, `fgets`, `printf`, and `fclose` are defined in the `C` [standard library](https://en.wikipedia.org/wiki/GNU_C_Library). Actually, these functions are just wrappers for the system calls. We do not call system calls directly in our code, but instead use these [wrapper](https://en.wikipedia.org/wiki/Wrapper_function) functions from the standard library. The main reason of this is simple: a system call must be performed quickly, very quickly. As a system call must be quick, it must be small. The standard library takes responsibility to perform system calls with the correct parameters and makes different checks before it will call the given system call. 
Let's compile our program with the following command:\n\n```\n$ gcc test.c -o test\n```\n\nand examine it with the [ltrace](https://en.wikipedia.org/wiki/Ltrace) util:\n\n```\n$ ltrace ./test\n__libc_start_main([ \"./test\" ] <unfinished ...>\nfopen(\"test.txt\", \"r\")                                             = 0x602010\nfgets(\"Hello World!\\n\", 255, 0x602010)                             = 0x7ffd2745e700\nputs(\"Hello World!\\n\"Hello World!\n\n)                                                                  = 14\nfclose(0x602010)                                                   = 0\n+++ exited (status 0) +++\n```\n\nThe `ltrace` util displays a set of userspace calls of a program. The `fopen` function opens the given text file, the `fgets` function reads file content to the `buff` buffer, the `puts` function prints the buffer to `stdout`, and the `fclose` function closes the file given by the file descriptor. And as I already wrote, all of these functions call an appropriate system call. For example, `puts` calls the `write` system call inside; we can see it if we add the `-S` option to the `ltrace` program:\n\n```\nwrite@SYS(1, \"Hello World!\\n\\n\", 14) = 14\n```\n\nYes, system calls are ubiquitous. Each program needs to open/write/read files and network connections, allocate memory, and many other things that can be provided only by the kernel. The [proc](https://en.wikipedia.org/wiki/Procfs) file system contains special files in a format: `/proc/${pid}/syscall` that exposes the system call number and argument registers for the system call currently being executed by the process. 
For example, pid 1 is [systemd](https://en.wikipedia.org/wiki/Systemd) for me:\n\n```\n$ sudo cat /proc/1/comm\nsystemd\n\n$ sudo cat /proc/1/syscall\n232 0x4 0x7ffdf82e11b0 0x1f 0xffffffff 0x100 0x7ffdf82e11bf 0x7ffdf82e11a0 0x7f9114681193\n```\n\nThis is the system call with number `232`, which is the [epoll_wait](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L241) system call that waits for an I/O event on an [epoll](https://en.wikipedia.org/wiki/Epoll) file descriptor. Or for example `emacs` editor where I'm writing this part:\n\n```\n$ ps ax | grep emacs\n2093 ?        Sl     2:40 emacs\n\n$ sudo cat /proc/2093/comm\nemacs\n\n$ sudo cat /proc/2093/syscall\n270 0xf 0x7fff068a5a90 0x7fff068a5b10 0x0 0x7fff068a59c0 0x7fff068a59d0 0x7fff068a59b0 0x7f777dd8813c\n```\n\nThis is the system call with the number `270`, which is the [sys_pselect6](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L279) system call that allows `emacs` to monitor multiple file descriptors.\n\nNow we know a little about system calls: what they are and why we need them. So let's look at the `write` system call that our program used.\n\nImplementation of write system call\n--------------------------------------------------------------------------------\n\nLet's look at the implementation of this system call directly in the source code of the Linux kernel. 
As we already know, the `write` system call is defined in the [fs/read_write.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/read_write.c) source code file and looks like this:\n\n```C\nSYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,\n\t\tsize_t, count)\n{\n\tstruct fd f = fdget_pos(fd);\n\tssize_t ret = -EBADF;\n\n\tif (f.file) {\n\t\tloff_t pos = file_pos_read(f.file);\n\t\tret = vfs_write(f.file, buf, count, &pos);\n\t\tif (ret >= 0)\n\t\t\tfile_pos_write(f.file, pos);\n\t\tfdput_pos(f);\n\t}\n\n\treturn ret;\n}\n```\n\nFirst of all, the `SYSCALL_DEFINE3` macro is defined in the [include/linux/syscalls.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/syscalls.h) header file and expands to the definition of the `sys_name(...)` function. Let's look at this macro:\n\n```C\n#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)\n\n#define SYSCALL_DEFINEx(x, sname, ...)                \\\n        SYSCALL_METADATA(sname, x, __VA_ARGS__)       \\\n        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)\n```\n\nAs we can see the `SYSCALL_DEFINE3` macro takes `name` parameter which will represent name of a system call and variadic number of parameters. This macro just expands to the `SYSCALL_DEFINEx` macro that takes the number of the parameters the given system call, the `_##name` stub for the future name of the system call (more about tokens concatenation with the `##` you can read in the [documentation](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html) of [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)). Next we can see the `SYSCALL_DEFINEx` macro. This macro expands to the two following macros:\n\n* `SYSCALL_METADATA`;\n* `__SYSCALL_DEFINEx`.\n\nImplementation of the first macro `SYSCALL_METADATA` depends on the `CONFIG_FTRACE_SYSCALLS` kernel configuration option. 
As we can understand from the name of this option, it allows to enable tracer to catch the syscall entry and exit events. If this kernel configuration option is enabled, the `SYSCALL_METADATA` macro executes initialization of the `syscall_metadata` structure that defined in the [include/trace/syscall.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/trace/syscall.h) header file and contains different useful fields as name of a system call, number of a system call in the system call [table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl), number of parameters of a system call, list of parameter types and etc:\n\n```C\n#define SYSCALL_METADATA(sname, nb, ...)                             \\\n\t...                                                              \\\n\t...                                                              \\\n\t...                                                              \\\n    struct syscall_metadata __used                                   \\\n              __syscall_meta_##sname = {                             \\\n                    .name           = \"sys\"#sname,                   \\\n                    .syscall_nr     = -1,                            \\\n                    .nb_args        = nb,                            \\\n                    .types          = nb ? types_##sname : NULL,     \\\n                    .args           = nb ? 
args_##sname : NULL,      \\\n                    .enter_event    = &event_enter_##sname,          \\\n                    .exit_event     = &event_exit_##sname,           \\\n                    .enter_fields   = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \\\n             };                                                                            \\\n\n    static struct syscall_metadata __used                           \\\n              __attribute__((section(\"__syscalls_metadata\")))       \\\n             *__p_syscall_meta_##sname = &__syscall_meta_##sname;\n```\n\nIf the `CONFIG_FTRACE_SYSCALLS` kernel option is not enabled during kernel configuration, the `SYSCALL_METADATA` macro expands to an empty string:\n\n```C\n#define SYSCALL_METADATA(sname, nb, ...)\n```\n\nThe second macro `__SYSCALL_DEFINEx` expands to the definition of the five following functions:\n\n```C\n#define __SYSCALL_DEFINEx(x, name, ...)                                 \\\n        asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \\\n                __attribute__((alias(__stringify(SyS##name))));         \\\n                                                                        \\\n        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \\\n                                                                        \\\n        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \\\n                                                                        \\\n        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \\\n        {                                                               \\\n                long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \\\n                __MAP(x,__SC_TEST,__VA_ARGS__);                         \\\n                __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \\\n                return ret;                                             \\\n        }                             
                                  \\\n                                                                        \\\n        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))\n```\n\nThe first `sys##name` is definition of the syscall handler function with the given name - `sys_system_call_name`. The `__SC_DECL` macro takes the `__VA_ARGS__` and combines call input parameter system type and the parameter name, because the macro definition is unable to determine the parameter types. And the `__MAP` macro applies `__SC_DECL` macro to the `__VA_ARGS__` arguments. The other functions that are generated by the `__SYSCALL_DEFINEx` macro are need to protect from the [CVE-2009-0029](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2009-0029) and we will not dive into details about this here. Ok, as result of the `SYSCALL_DEFINE3` macro, we will have:\n\n```C\nasmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count);\n```\n\nNow we know a little about the system call's definition and we can go back to the implementation of the `write` system call. Let's look on the implementation of this system call again:\n\n```C\nSYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,\n\t\tsize_t, count)\n{\n\tstruct fd f = fdget_pos(fd);\n\tssize_t ret = -EBADF;\n\n\tif (f.file) {\n\t\tloff_t pos = file_pos_read(f.file);\n\t\tret = vfs_write(f.file, buf, count, &pos);\n\t\tif (ret >= 0)\n\t\t\tfile_pos_write(f.file, pos);\n\t\tfdput_pos(f);\n\t}\n\n\treturn ret;\n}\n```\n\nAs we already know and can see from the code, it takes three arguments:\n\n* `fd`    - file descriptor;\n* `buf`   - buffer to write;\n* `count` - length of buffer to write.\n\nand writes data from a buffer declared by the user to a given device or a file. Note that the second parameter `buf`, defined with the `__user` attribute. The main purpose of this attribute is for checking the Linux kernel code with the [sparse](https://en.wikipedia.org/wiki/Sparse) util. 
It is defined in the [include/linux/compiler.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/compiler.h) header file and depends on the `__CHECKER__` definition in the Linux kernel. That's all about useful meta-information related to our `sys_write` system call, let's try to understand how this system call is implemented. As we can see it starts from the definition of the `f` structure that has `fd` structure type that represents file descriptor in the Linux kernel and we put the result of the call of the `fdget_pos` function. The `fdget_pos` function defined in the same [source](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/read_write.c) code file and just expands the call of the `__to_fd` function:\n\n```C\nstatic inline struct fd fdget_pos(int fd)\n{\n        return __to_fd(__fdget_pos(fd));\n}\n```\n\nThe main purpose of the `fdget_pos` is to convert the given file descriptor which is just a number to the `fd` structure. Through the long chain of function calls, the `fdget_pos` function gets the file descriptor table of the current process, `current->files`, and tries to find a corresponding file descriptor number there. As we got the `fd` structure for the given file descriptor number, we check it and return if it does not exist. We get the current position in the file with the call of the `file_pos_read` function that just returns `f_pos` field of our file:\n\n```C\nstatic inline loff_t file_pos_read(struct file *file)\n{\n        return file->f_pos;\n}\n```\n\nand calls the `vfs_write` function. The `vfs_write` function defined in the [fs/read_write.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/read_write.c) source code file and does the work for us - writes given buffer to the given file starting from the given position. 
We will not dive into details about the `vfs_write` function, because this function is only weakly related to the `system call` concept and is mostly about the [Virtual file system](https://en.wikipedia.org/wiki/Virtual_file_system) concept which we will see in another chapter. After the `vfs_write` has finished its work, we check the result and if it was finished successfully we change the position in the file with the `file_pos_write` function:\n\n```C\nif (ret >= 0)\n\tfile_pos_write(f.file, pos);\n```\n\nthat just updates `f_pos` with the given position in the given file:\n\n```C\nstatic inline void file_pos_write(struct file *file, loff_t pos)\n{\n        file->f_pos = pos;\n}\n```\n\nAt the end of our `write` system call handler, we can see the call of the following function:\n\n```C\nfdput_pos(f);\n```\n\nwhich unlocks the `f_pos_lock` mutex that protects the file position during concurrent writes from threads that share the file descriptor.\n\nThat's all.\n\nWe have seen the partial implementation of one system call provided by the Linux kernel. Of course we have missed some parts in the implementation of the `write` system call, because as I mentioned above, we will see only system calls related stuff in this chapter and will not see other stuff related to other subsystems, such as [Virtual file system](https://en.wikipedia.org/wiki/Virtual_file_system).\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis concludes the first part covering system call concepts in the Linux kernel. 
We have covered the theory of system calls so far and in the next part we will continue to dive into this topic, touching Linux kernel code related to system calls.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [vdso](https://en.wikipedia.org/wiki/VDSO)\n* [vsyscall](https://lwn.net/Articles/446528/)\n* [general purpose registers](https://en.wikipedia.org/wiki/Processor_register)\n* [socket](https://en.wikipedia.org/wiki/Network_socket)\n* [C programming language](https://en.wikipedia.org/wiki/C_%28programming_language%29)\n* [x86](https://en.wikipedia.org/wiki/X86)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [x86-64 calling conventions](https://en.wikipedia.org/wiki/X86_calling_conventions#x86-64_calling_conventions)\n* [System V Application Binary Interface. PDF](http://www.x86-64.org/documentation/abi.pdf)\n* [GCC](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n* [Intel manual. 
PDF](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html)\n* [system call table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl)\n* [GCC macro documentation](https://gcc.gnu.org/onlinedocs/cpp/Concatenation.html)\n* [file descriptor](https://en.wikipedia.org/wiki/File_descriptor)\n* [stdout](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_.28stdout.29)\n* [strace](https://en.wikipedia.org/wiki/Strace)\n* [standard library](https://en.wikipedia.org/wiki/GNU_C_Library)\n* [wrapper functions](https://en.wikipedia.org/wiki/Wrapper_function)\n* [ltrace](https://en.wikipedia.org/wiki/Ltrace)\n* [sparse](https://en.wikipedia.org/wiki/Sparse)\n* [proc file system](https://en.wikipedia.org/wiki/Procfs)\n* [Virtual file system](https://en.wikipedia.org/wiki/Virtual_file_system)\n* [systemd](https://en.wikipedia.org/wiki/Systemd)\n* [epoll](https://en.wikipedia.org/wiki/Epoll)\n* [Previous chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts)\n"
  },
  {
    "path": "SysCall/linux-syscall-2.md",
    "content": "System calls in the Linux kernel. Part 2.\n================================================================================\n\nHow does the Linux kernel handle a system call\n--------------------------------------------------------------------------------\n\nThe previous [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-1) was the first part of the chapter that describes the [system call](https://en.wikipedia.org/wiki/System_call) concepts in the Linux kernel.\nIn the previous part we learned what a system call is in the Linux kernel, and in operating systems in general. This was introduced from a user-space perspective, and part of the [write](http://man7.org/linux/man-pages/man2/write.2.html) system call implementation was discussed. In this part we continue our look at system calls, starting with some theory before moving onto the Linux kernel code.\n\nA user application does not make the system call directly from our applications. We did not write the `Hello world!` program like:\n\n```C\nint main(int argc, char **argv)\n{\n\t...\n\t...\n\t...\n\tsys_write(fd1, buf, strlen(buf));\n\t...\n\t...\n}\n```\n\nWe can use something similar with the help of [C standard library](https://en.wikipedia.org/wiki/GNU_C_Library) and it will look something like this:\n\n```C\n#include <unistd.h>\n\nint main(int argc, char **argv)\n{\n\t...\n\t...\n\t...\n\twrite(fd1, buf, strlen(buf));\n\t...\n\t...\n}\n```\n\nBut anyway, `write` is not a direct system call and not a kernel function. An application must fill general purpose registers with the correct values in the correct order and use the `syscall` instruction to make the actual system call. 
In this part we will look at what occurs in the Linux kernel when the `syscall` instruction is met by the processor.\n\nInitialization of the system calls table\n--------------------------------------------------------------------------------\n\nFrom the previous part we know that system call concept is very similar to an interrupt. Furthermore, system calls are implemented as software interrupts. So, when the processor handles a `syscall` instruction from a user application, this instruction causes an exception which transfers control to an exception handler. As we know, all exception handlers (or in other words kernel [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) functions that will react on an exception) are placed in the kernel code. But how does the Linux kernel search for the address of the necessary system call handler for the related system call? The Linux kernel contains a special table called the `system call table`. The system call table is represented by the `sys_call_table` array in the Linux kernel which is defined in the [arch/x86/entry/syscall_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscall_64.c) source code file. Let's look at its implementation:\n\n```C\nasmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {\n\t[0 ... __NR_syscall_max] = &sys_ni_syscall,\n    #include <asm/syscalls_64.h>\n};\n```\n\nAs we can see, the `sys_call_table` is an array of `__NR_syscall_max + 1` size where the `__NR_syscall_max` macro represents the maximum number of system calls for the given [architecture](https://en.wikipedia.org/wiki/List_of_CPU_architectures). This book is about the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so for our case the `__NR_syscall_max` is `547` and this is the correct number at the time of writing (current Linux kernel version is `5.0.0-rc7`). 
We can see this macro in the header file generated by [Kbuild](https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt) during kernel compilation - `include/generated/asm-offsets.h`:\n\n```C\n#define __NR_syscall_max 547\n```\n\nThere will be the same number of system calls in the [arch/x86/entry/syscalls/syscall_64.tbl](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl#L331) for the `x86_64`. There are two important topics here; the type of the `sys_call_table` array, and the initialization of elements in this array. First of all, the type. The `sys_call_ptr_t` represents a pointer to a system call handler. It is defined as [typedef](https://en.wikipedia.org/wiki/Typedef) for a function pointer that returns nothing and does not take arguments:\n\n```C\ntypedef void (*sys_call_ptr_t)(void);\n```\n\nThe second thing is the initialization of the `sys_call_table` array. As we can see in the code above, all elements of our array that contain pointers to the system call handlers point to the `sys_ni_syscall`. The `sys_ni_syscall` function represents not-implemented system calls. To start with, all elements of the `sys_call_table` array point to the not-implemented system call. This is the correct initial behaviour, because we only initialize storage of the pointers to the system call handlers, it is populated later on. Implementation of the `sys_ni_syscall` is pretty easy, it just returns [-errno](http://man7.org/linux/man-pages/man3/errno.3.html) or `-ENOSYS` in our case:\n\n```C\nasmlinkage long sys_ni_syscall(void)\n{\n\treturn -ENOSYS;\n}\n```\n\nThe `-ENOSYS` error tells us that:\n\n```\nENOSYS          Function not implemented (POSIX.1)\n```\n\nAlso a note on `...` in the initialization of the `sys_call_table`. 
We can do it with a [GCC](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) compiler extension called - [Designated Initializers](https://gcc.gnu.org/onlinedocs/gcc/Designated-Inits.html). This extension allows us to initialize elements in non-fixed order. As you can see, we include the `asm/syscalls_64.h` header at the end of the array. This header file is generated by the special script at [arch/x86/entry/syscalls/syscalltbl.sh](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscalltbl.sh) and generates our header file from the [syscall table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl). The `asm/syscalls_64.h` contains definitions of the following macros:\n\n```C\n__SYSCALL_COMMON(0, sys_read, sys_read)\n__SYSCALL_COMMON(1, sys_write, sys_write)\n__SYSCALL_COMMON(2, sys_open, sys_open)\n__SYSCALL_COMMON(3, sys_close, sys_close)\n__SYSCALL_COMMON(5, sys_newfstat, sys_newfstat)\n...\n...\n...\n```\n\nThe `__SYSCALL_COMMON` macro is defined in the same source code [file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscall_64.c) and expands to the `__SYSCALL_64` macro which expands to the function definition:\n\n```C\n#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)\n#define __SYSCALL_64(nr, sym, compat) [nr] = sym,\n```\n\nSo, after this, our `sys_call_table` takes the following form:\n\n```C\nasmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {\n\t[0 ... 
__NR_syscall_max] = &sys_ni_syscall,\n\t[0] = sys_read,\n\t[1] = sys_write,\n\t[2] = sys_open,\n\t...\n\t...\n\t...\n};\n```\n\nAfter this all elements that point to the non-implemented system calls will contain the address of the `sys_ni_syscall` function that just returns `-ENOSYS` as we saw above, and other elements will point to the `sys_syscall_name` functions.\n\nAt this point, we have filled the system call table and the Linux kernel knows where each system call handler is. But the Linux kernel does not call a `sys_syscall_name` function immediately after it is instructed to handle a system call from a user space application. Remember the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and interrupt handling. When the Linux kernel gets the control to handle an interrupt, it had to do some preparations like save user space registers, switch to a new stack and many more tasks before it will call an interrupt handler. There is the same situation with the system call handling. The preparation for handling a system call is the first thing, but before the Linux kernel will start these preparations, the entry point of a system call must be initialized and only the Linux kernel knows how to perform this preparation. In the next paragraph we will see the process of the initialization of the system call entry in the Linux kernel.\n\nInitialization of the system call entry\n--------------------------------------------------------------------------------\n\nWhen a system call occurs in the system, where are the first bytes of code that starts to handle it? 
As we can read in the Intel manual - [64-ia-32-architectures-software-developer-vol-2b-manual](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html):\n\n```\nSYSCALL invokes an OS system-call handler at privilege level 0.\nIt does so by loading RIP from the IA32_LSTAR MSR\n```\n\nIt means that we need to put the system call entry into the `IA32_LSTAR` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register). This operation takes place during the Linux kernel initialization process. If you have read the fourth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) of the chapter that describes interrupts and interrupt handling in the Linux kernel, you know that the Linux kernel calls the `trap_init` function during the initialization process. This function is defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/traps.c) source code file and executes the initialization of the `non-early` exception handlers like divide error, [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) error, etc. Besides the initialization of the `non-early` exception handlers, this function calls the `cpu_init` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/cpu/common.c) source code file which besides initialization of `per-cpu` state, calls the `syscall_init` function from the same source code file.\n\nThis function performs the initialization of the system call entry point. Let's look at the implementation of this function. It does not take parameters and first of all it fills two model specific registers:\n\n```C\nwrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);\nwrmsrl(MSR_LSTAR, entry_SYSCALL_64);\n```\n\nThe first model specific register - `MSR_STAR` contains `63:48` bits of the user code segment. 
These bits will be loaded to the `CS` and `SS` segment registers for the `sysret` instruction which provides functionality to return from a system call to user code with the related privilege. Also the `MSR_STAR` contains `47:32` bits from the kernel code that will be used as the base selector for `CS` and `SS` segment registers when user space applications execute a system call. In the second line of code we fill the `MSR_LSTAR` register with the `entry_SYSCALL_64` symbol that represents system call entry. The `entry_SYSCALL_64` is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and contains code related to the preparation performed before a system call handler is executed (I already wrote about these preparations, read above). We will not consider the `entry_SYSCALL_64` now, but will return to it later in this chapter.\n\nAfter we have set the entry point for system calls, we need to set the following model specific registers:\n\n* `MSR_CSTAR` - target `rip` for the compatibility mode callers;\n* `MSR_IA32_SYSENTER_CS` - target `cs` for the `sysenter` instruction;\n* `MSR_IA32_SYSENTER_ESP` - target `esp` for the `sysenter` instruction;\n* `MSR_IA32_SYSENTER_EIP` - target `eip` for the `sysenter` instruction.\n\nThe values of these model specific registers depend on the `CONFIG_IA32_EMULATION` kernel configuration option. If this kernel configuration option is enabled, it allows legacy 32-bit programs to run under a 64-bit kernel. 
In the first case, if the `CONFIG_IA32_EMULATION` kernel configuration option is enabled, we fill these model specific registers with the entry point for the system calls in the compatibility mode:\n\n```C\nwrmsrl(MSR_CSTAR, entry_SYSCALL_compat);\n```\n\nand with the kernel code segment, put zero to the stack pointer and write the address of the `entry_SYSENTER_compat` symbol to the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter):\n\n```C\nwrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);\nwrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);\nwrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);\n```\n\nOtherwise, if the `CONFIG_IA32_EMULATION` kernel configuration option is disabled, we write the `ignore_sysret` symbol to the `MSR_CSTAR`:\n\n```C\nwrmsrl(MSR_CSTAR, ignore_sysret);\n```\n\nthat is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and just returns `-ENOSYS` error code:\n\n```assembly\nENTRY(ignore_sysret)\n\tmov\t$-ENOSYS, %eax\n\tsysret\nEND(ignore_sysret)\n```\n\nNow we need to fill `MSR_IA32_SYSENTER_CS`, `MSR_IA32_SYSENTER_ESP`, `MSR_IA32_SYSENTER_EIP` model specific registers as we did in the previous code when the `CONFIG_IA32_EMULATION` kernel configuration option was enabled. 
In this case (when the `CONFIG_IA32_EMULATION` configuration option is not set) we fill the `MSR_IA32_SYSENTER_ESP` and the `MSR_IA32_SYSENTER_EIP` with zero and put the invalid segment of the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) to the `MSR_IA32_SYSENTER_CS` model specific register:\n\n```C\nwrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);\nwrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);\nwrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);\n```\n\nYou can read more about the `Global Descriptor Table` in the second [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2) of the chapter that describes the booting process of the Linux kernel.\n\nAt the end of the `syscall_init` function, we just mask flags in the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) by writing the set of flags to the `MSR_SYSCALL_MASK` model specific register:\n\n```C\nwrmsrl(MSR_SYSCALL_MASK,\n\t   X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|\n\t   X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);\n```\n\nThese flags will be cleared during syscall initialization. That's all, it is the end of the `syscall_init` function and it means that system call entry is ready to work. Now we can see what will occur when a user application executes the `syscall` instruction.\n\nPreparation before system call handler is called\n--------------------------------------------------------------------------------\n\nAs I already wrote, before a system call or an interrupt handler is called by the Linux kernel we need to do some preparations. 
The `idtentry` macro performs the preparations required before an exception handler is executed, the `interrupt` macro performs the preparations required before an interrupt handler is called and the `entry_SYSCALL_64` will do the preparations required before a system call handler is executed.\n\nThe `entry_SYSCALL_64` is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and starts from the following macro:\n\n```assembly\nSWAPGS_UNSAFE_STACK\n```\n\nThis macro is defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) header file and expands to the `swapgs` instruction:\n\n```C\n#define SWAPGS_UNSAFE_STACK\tswapgs\n```\n\nwhich exchanges the current GS base register value with the value contained in the `MSR_KERNEL_GS_BASE` model specific register. In other words we moved it on to the kernel stack. 
After this we save the old stack pointer in the `rsp_scratch` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable and set up the stack pointer to point to the top of stack for the current processor:\n\n```assembly\nmovq\t%rsp, PER_CPU_VAR(rsp_scratch)\nmovq\tPER_CPU_VAR(cpu_current_top_of_stack), %rsp\n```\n\nIn the next step we push the stack segment and the old stack pointer to the stack:\n\n```assembly\npushq\t$__USER_DS\npushq\tPER_CPU_VAR(rsp_scratch)\n```\n\nAfter this we enable interrupts, because interrupts are `off` on entry and save the general purpose [registers](https://en.wikipedia.org/wiki/Processor_register) (besides `bp`, `bx` and from `r12` to `r15`), flags, `-ENOSYS` for the non-implemented system call and code segment register on the stack:\n\n```assembly\nENABLE_INTERRUPTS(CLBR_NONE)\n\npushq\t%r11\npushq\t$__USER_CS\npushq\t%rcx\npushq\t%rax\npushq\t%rdi\npushq\t%rsi\npushq\t%rdx\npushq\t%rcx\npushq\t$-ENOSYS\npushq\t%r8\npushq\t%r9\npushq\t%r10\npushq\t%r11\nsub\t$(6*8), %rsp\n```\n\nWhen a system call occurs from the user's application, general purpose registers have the following state:\n\n* `rax` - contains system call number;\n* `rcx` - contains return address to the user space;\n* `r11` - contains register flags;\n* `rdi` - contains first argument of a system call handler;\n* `rsi` - contains second argument of a system call handler;\n* `rdx` - contains third argument of a system call handler;\n* `r10` - contains fourth argument of a system call handler;\n* `r8`  - contains fifth argument of a system call handler;\n* `r9`  - contains sixth argument of a system call handler.\n\nOther general purpose registers (such as `rbp`, `rbx` and from `r12` to `r15`) are callee-preserved in [C ABI](http://www.x86-64.org/documentation/abi.pdf). 
So we push register flags on the top of the stack, then user code segment, return address to the user space, system call number, first three arguments, dump error code for the non-implemented system call and other arguments on the stack.\n\nIn the next step we check the `_TIF_WORK_SYSCALL_ENTRY` in the current `thread_info`:\n\n```assembly\ntestl\t$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)\njnz\ttracesys\n```\n\nThe `_TIF_WORK_SYSCALL_ENTRY` macro is defined in the [arch/x86/include/asm/thread_info.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/thread_info.h) header file and provides set of the thread information flags that are related to the system calls tracing:\n\n```C\n#define _TIF_WORK_SYSCALL_ENTRY \\\n    (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |   \\\n    _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |     \\\n    _TIF_NOHZ)\n```\n\nWe will not consider debugging/tracing related stuff in this chapter, but will see it in the separate chapter that will be devoted to the debugging and tracing techniques in the Linux kernel. After the `tracesys` label, the next label is the `entry_SYSCALL_64_fastpath`. 
In the `entry_SYSCALL_64_fastpath` we check the `__SYSCALL_MASK` that is defined in the [arch/x86/include/asm/unistd.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/unistd.h) header file as:\n\n```C\n# ifdef CONFIG_X86_X32_ABI\n#  define __SYSCALL_MASK (~(__X32_SYSCALL_BIT))\n# else\n#  define __SYSCALL_MASK (~0)\n# endif\n```\n\nwhere the `__X32_SYSCALL_BIT` is\n\n```C\n#define __X32_SYSCALL_BIT\t0x40000000\n```\n\nAs we can see the `__SYSCALL_MASK` depends on the `CONFIG_X86_X32_ABI` kernel configuration option and represents the mask for the 32-bit [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) in the 64-bit kernel.\n\nSo we check the value of the `__SYSCALL_MASK` and if the `CONFIG_X86_X32_ABI` is disabled we compare the value of the `rax` register to the maximum syscall number (`__NR_syscall_max`), alternatively if the `CONFIG_X86_X32_ABI` is enabled we mask the `eax` register with the `__X32_SYSCALL_BIT` and do the same comparison:\n\n```assembly\n#if __SYSCALL_MASK == ~0\n\tcmpq\t$__NR_syscall_max, %rax\n#else\n\tandl\t$__SYSCALL_MASK, %eax\n\tcmpl\t$__NR_syscall_max, %eax\n#endif\n```\n\nAfter this we check the result of the last comparison with the `ja` instruction that executes if `CF` and `ZF` flags are zero:\n\n```assembly\nja\t1f\n```\n\nand if we have the correct system call for this, we move the fourth argument from the `r10` to the `rcx` to keep [x86_64 C ABI](http://www.x86-64.org/documentation/abi.pdf) compliant and execute the `call` instruction with the address of a system call handler:\n\n```assembly\nmovq\t%r10, %rcx\ncall\t*sys_call_table(, %rax, 8)\n```\n\nNote, the `sys_call_table` is an array that we saw above in this part. As we already know the `rax` general purpose register contains the number of a system call and each element of the `sys_call_table` is 8-bytes. 
So we use the `*sys_call_table(, %rax, 8)` notation to find the correct offset in the `sys_call_table` array for the given system call handler.\n\nThat's all. We did all the required preparations and the system call handler was called for the given system call, for example `sys_read`, `sys_write` or other system call handler that is defined with the `SYSCALL_DEFINE[N]` macro in the Linux kernel code.\n\nExit from a system call\n--------------------------------------------------------------------------------\n\nAfter a system call handler finishes its work, we will return back to the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S), right after where we have called the system call handler:\n\n```assembly\ncall\t*sys_call_table(, %rax, 8)\n```\n\nThe next step after we've returned from a system call handler is to put the return value of a system call handler on to the stack. We know that a system call returns the result to the user program in the general purpose `rax` register, so we are moving its value on to the stack after the system call handler has finished its work:\n\n```assembly\nmovq\t%rax, RAX(%rsp)\n```\n\non the `RAX` place.\n\nAfter this we can see the call of the `LOCKDEP_SYS_EXIT` macro from the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h):\n\n```assembly\nLOCKDEP_SYS_EXIT\n```\n\nThe implementation of this macro depends on the `CONFIG_DEBUG_LOCK_ALLOC` kernel configuration option that allows us to debug locks on exit from a system call. And again, we will not consider it in this chapter, but will return to it in a separate one. 
In the end of the `entry_SYSCALL_64` function we restore all general purpose registers besides `rcx` and `r11`, because the `rcx` register must contain the return address to the application that called system call and the `r11` register contains the old [flags register](https://en.wikipedia.org/wiki/FLAGS_register). After all general purpose registers are restored, we fill `rcx` with the return address, `r11` register with the flags and `rsp` with the old stack pointer:\n\n```assembly\nRESTORE_C_REGS_EXCEPT_RCX_R11\n\nmovq\tRIP(%rsp), %rcx\nmovq\tEFLAGS(%rsp), %r11\nmovq\tRSP(%rsp), %rsp\n\nUSERGS_SYSRET64\n```\n\nIn the end we just call the `USERGS_SYSRET64` macro that expands to the call of the `swapgs` instruction which exchanges again the user `GS` and kernel `GS` and the `sysretq` instruction which executes on exit from a system call handler:\n\n```C\n#define USERGS_SYSRET64\t\t\t\t\\\n\tswapgs;\t           \t\t\t\t\\\n\tsysretq;\n```\n\nNow we know what occurs when a user application calls a system call. The full path of this process is as follows:\n\n* User application contains code that fills general purpose register with the values (system call number and arguments of this system call);\n* Processor switches from the user mode to kernel mode and starts execution of the system call entry - `entry_SYSCALL_64`;\n* `entry_SYSCALL_64` switches to the kernel stack and saves some general purpose registers, old stack and code segment, flags and etc... 
on the stack;\n* `entry_SYSCALL_64` checks the system call number in the `rax` register, searches a system call handler in the `sys_call_table` and calls it, if the number of a system call is correct;\n* If the system call number is not correct, it jumps to the exit from the system call;\n* After a system call handler finishes its work, restore general purpose registers, old stack, flags and return address and exit from the `entry_SYSCALL_64` with the `sysretq` instruction.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the second part about the system calls concept in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-1) we saw theory about this concept from the user application view. In this part we continued to dive into the stuff which is related to the system call concept and saw what the Linux kernel does when a system call occurs.\n\nIf you have questions or suggestions, feel free to ping me on twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [write](http://man7.org/linux/man-pages/man2/write.2.html)\n* [C standard library](https://en.wikipedia.org/wiki/GNU_C_Library)\n* [list of cpu architectures](https://en.wikipedia.org/wiki/List_of_CPU_architectures)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [kbuild](https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt)\n* [typedef](https://en.wikipedia.org/wiki/Typedef)\n* [errno](http://man7.org/linux/man-pages/man3/errno.3.html)\n* [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection)\n* [model specific register](https://en.wikipedia.org/wiki/Model-specific_register)\n* [intel 2b manual](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html)\n* [coprocessor](https://en.wikipedia.org/wiki/Coprocessor)\n* [instruction pointer](https://en.wikipedia.org/wiki/Program_counter)\n* [flags register](https://en.wikipedia.org/wiki/FLAGS_register)\n* [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table)\n* [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [general purpose registers](https://en.wikipedia.org/wiki/Processor_register)\n* [ABI](https://en.wikipedia.org/wiki/Application_binary_interface)\n* [x86_64 C ABI](http://www.x86-64.org/documentation/abi.pdf)\n* [previous chapter](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-1)\n"
  },
  {
    "path": "SysCall/linux-syscall-3.md",
    "content": "System calls in the Linux kernel. Part 3.\n================================================================================\n\nvsyscalls and vDSO\n--------------------------------------------------------------------------------\n\nThis is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syscall) that describes system calls in the Linux kernel and we saw preparations after a system call caused by a userspace application and process of handling of a system call in the previous [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-2). In this part we will look at two concepts that are very close to the system call concept, they are called `vsyscall` and `vdso`.\n\nWe already know what `system call`s are. They are special routines in the Linux kernel which userspace applications ask to do privileged tasks, like to read or to write to a file, to open a socket, etc. As you may know, invoking a system call is an expensive operation in the Linux kernel, because the processor must interrupt the currently executing task and switch context to kernel mode, subsequently jumping again into userspace after the system call handler finishes its work. These two mechanisms - `vsyscall` and `vdso` are designed to speed up this process for certain system calls and in this part we will try to understand how these mechanisms work.\n\nIntroduction to vsyscalls\n--------------------------------------------------------------------------------\n\nThe `vsyscall` or `virtual system call` is the first and oldest mechanism in the Linux kernel that is designed to accelerate execution of certain system calls. The principle of work of the `vsyscall` concept is simple. The Linux kernel maps into user space a page that contains some variables and the implementation of some system calls. 
We can find information about this memory space in the Linux kernel [documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt) for the [x86_64](https://en.wikipedia.org/wiki/X86-64):\n\n```\nffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls\n```\n\nor:\n\n```\n~$ sudo cat /proc/1/maps | grep vsyscall\nffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]\n```\n\nAfter this, these system calls will be executed in userspace and this means that there will not be [context switching](https://en.wikipedia.org/wiki/Context_switch). Mapping of the `vsyscall` page occurs in the `map_vsyscall` function that is defined in the [arch/x86/entry/vsyscall/vsyscall_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vsyscall/vsyscall_64.c) source code file. This function is called during the Linux kernel initialization in the `setup_arch` function that is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file (we saw this function in the fifth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) of the Linux kernel initialization process chapter).\n\nNote that implementation of the `map_vsyscall` function depends on the `CONFIG_X86_VSYSCALL_EMULATION` kernel configuration option:\n\n```C\n#ifdef CONFIG_X86_VSYSCALL_EMULATION\nextern void map_vsyscall(void);\n#else\nstatic inline void map_vsyscall(void) {}\n#endif\n```\n\nAs we can read in the help text, the `CONFIG_X86_VSYSCALL_EMULATION` configuration option: `Enable vsyscall emulation`. Why emulate `vsyscall`? Actually, the `vsyscall` is a legacy [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) due to security reasons. 
Virtual system calls have fixed addresses, meaning that `vsyscall` page is still at the same location every time and the location of this page is determined in the `map_vsyscall` function. Let's look at the implementation of this function:\n\n```C\nvoid __init map_vsyscall(void)\n{\n    extern char __vsyscall_page;\n    unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);\n\t...\n\t...\n\t...\n}\n```\n\nAs we can see, at the beginning of the `map_vsyscall` function we get the physical address of the `vsyscall` page with the `__pa_symbol` macro (we already saw the implementation of this macro in the fourth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) of the Linux kernel initialization process). The `__vsyscall_page` symbol is defined in the [arch/x86/entry/vsyscall/vsyscall_emu_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vsyscall/vsyscall_emu_64.S) assembly source code file and has the following [virtual address](https://en.wikipedia.org/wiki/Virtual_address_space):\n\n```\nffffffff81881000 D __vsyscall_page\n```\n\nin the `.data..page_aligned, aw` [section](https://en.wikipedia.org/wiki/Memory_segmentation) and contains calls to the following three system calls:\n\n* `gettimeofday`;\n* `time`;\n* `getcpu`.\n\nOr:\n\n```assembly\n__vsyscall_page:\n\tmov $__NR_gettimeofday, %rax\n\tsyscall\n\tret\n\n\t.balign 1024, 0xcc\n\tmov $__NR_time, %rax\n\tsyscall\n\tret\n\n\t.balign 1024, 0xcc\n\tmov $__NR_getcpu, %rax\n\tsyscall\n\tret\n```\n\nLet's go back to the implementation of the `map_vsyscall` function and return to the implementation of the `__vsyscall_page` later. 
After we received the physical address of the `__vsyscall_page`, we check the value of the `vsyscall_mode` variable and set the [fix-mapped](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2) address for the `vsyscall` page with the `__set_fixmap` macro:\n\n```C\nif (vsyscall_mode != NONE)\n\t__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,\n                 vsyscall_mode == NATIVE\n                             ? PAGE_KERNEL_VSYSCALL\n                             : PAGE_KERNEL_VVAR);\n```\n\nThe `__set_fixmap` takes three arguments: the first is the index of the `fixed_addresses` [enum](https://en.wikipedia.org/wiki/Enumerated_type). In our case `VSYSCALL_PAGE` is the first element of the `fixed_addresses` enum for the `x86_64` architecture:\n\n```C\nenum fixed_addresses {\n...\n...\n...\n#ifdef CONFIG_X86_VSYSCALL_EMULATION\n\tVSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,\n#endif\n...\n...\n...\n```\n\nIt is equal to `511`. The second argument is the physical address of the page that has to be mapped and the third argument is the flags of the page. Note that the flags of the `VSYSCALL_PAGE` depend on the `vsyscall_mode` variable. It will be `PAGE_KERNEL_VSYSCALL` if the `vsyscall_mode` variable is `NATIVE` and the `PAGE_KERNEL_VVAR` otherwise. Both macros (the `PAGE_KERNEL_VSYSCALL` and the `PAGE_KERNEL_VVAR`) will be expanded to the following flags:\n\n```C\n#define __PAGE_KERNEL_VSYSCALL          (__PAGE_KERNEL_RX | _PAGE_USER)\n#define __PAGE_KERNEL_VVAR              (__PAGE_KERNEL_RO | _PAGE_USER)\n```\n\nthat represent access rights to the `vsyscall` page. Both sets of flags include the same `_PAGE_USER` flag, which means that the page can be accessed by a user-mode process running at lower privilege levels. The second flag depends on the value of the `vsyscall_mode` variable. The first flag (`__PAGE_KERNEL_VSYSCALL`) will be set in the case where `vsyscall_mode` is `NATIVE`. This means virtual system calls will be native `syscall` instructions. 
Otherwise, the vsyscall will have `PAGE_KERNEL_VVAR` if the `vsyscall_mode` variable is `emulate`. In this case virtual system calls will be turned into traps and are emulated reasonably. The `vsyscall_mode` variable gets its value in the `vsyscall_setup` function:\n\n```C\nstatic int __init vsyscall_setup(char *str)\n{\n\tif (str) {\n\t\tif (!strcmp(\"emulate\", str))\n\t\t\tvsyscall_mode = EMULATE;\n\t\telse if (!strcmp(\"native\", str))\n\t\t\tvsyscall_mode = NATIVE;\n\t\telse if (!strcmp(\"none\", str))\n\t\t\tvsyscall_mode = NONE;\n\t\telse\n\t\t\treturn -EINVAL;\n\n\t\treturn 0;\n\t}\n\n\treturn -EINVAL;\n}\n```\n\nThat will be called during early kernel parameters parsing:\n\n```C\nearly_param(\"vsyscall\", vsyscall_setup);\n```\n\nYou can read more about the `early_param` macro in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the chapter that describes process of the initialization of the Linux kernel.\n\nAt the end of the `map_vsyscall` function we just check that virtual address of the `vsyscall` page is equal to the value of the `VSYSCALL_ADDR` with the [BUILD_BUG_ON](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) macro:\n\n```C\nBUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=\n             (unsigned long)VSYSCALL_ADDR);\n```\n\nThat's all. `vsyscall` page is set up. The result of all of the above is the following: If we pass `vsyscall=native` parameter to the kernel command line, virtual system calls will be handled as native `syscall` instructions in the [arch/x86/entry/vsyscall/vsyscall_emu_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vsyscall/vsyscall_emu_64.S). The [glibc](https://en.wikipedia.org/wiki/GNU_C_Library) knows addresses of the virtual system call handlers. 
Note that virtual system call handlers are aligned by `1024` (or `0x400`) bytes:\n\n```assembly\n__vsyscall_page:\n\tmov $__NR_gettimeofday, %rax\n\tsyscall\n\tret\n\n\t.balign 1024, 0xcc\n\tmov $__NR_time, %rax\n\tsyscall\n\tret\n\n\t.balign 1024, 0xcc\n\tmov $__NR_getcpu, %rax\n\tsyscall\n\tret\n```\n\nAnd the start address of the `vsyscall` page is the `ffffffffff600000` every time. So, the [glibc](https://en.wikipedia.org/wiki/GNU_C_Library) knows the addresses of the all virtual system call handlers. You can find definition of these addresses in the `glibc` source code:\n\n```C\n#define VSYSCALL_ADDR_vgettimeofday   0xffffffffff600000\n#define VSYSCALL_ADDR_vtime \t      0xffffffffff600400\n#define VSYSCALL_ADDR_vgetcpu\t      0xffffffffff600800\n```\n\nAll virtual system call requests will fall into the `__vsyscall_page` + `VSYSCALL_ADDR_vsyscall_name` offset, put the number of a virtual system call to the `rax` general purpose [register](https://en.wikipedia.org/wiki/Processor_register) and the native for the x86_64 `syscall` instruction will be executed.\n\nIn the second case, if we pass `vsyscall=emulate` parameter to the kernel command line, an attempt to perform virtual system call handler will cause a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception. Of course, remember, the `vsyscall` page has `__PAGE_KERNEL_VVAR` access rights that forbid execution. The `do_page_fault` function is the `#PF` or page fault handler. It tries to understand the reason of the last page fault. And one of the reason can be situation when virtual system call called and `vsyscall` mode is `emulate`. 
In this case `vsyscall` will be handled by the `emulate_vsyscall` function that is defined in the [arch/x86/entry/vsyscall/vsyscall_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vsyscall/vsyscall_64.c) source code file.\n\nThe `emulate_vsyscall` function gets the number of a virtual system call and checks it. If the number is invalid, it prints an error and sends a [segmentation fault](https://en.wikipedia.org/wiki/Segmentation_fault) signal:\n\n```C\n...\n...\n...\nvsyscall_nr = addr_to_vsyscall_nr(address);\nif (vsyscall_nr < 0) {\n\twarn_bad_vsyscall(KERN_WARNING, regs, \"misaligned vsyscall...);\n\tgoto sigsegv;\n}\n...\n...\n...\nsigsegv:\n\tforce_sig(SIGSEGV, current);\n\treturn true;\n```\n\nAfter it has checked the number of the virtual system call, it performs some other checks, like `access_ok` violations, and executes the system call function depending on the number of the virtual system call:\n\n```C\nswitch (vsyscall_nr) {\n\tcase 0:\n\t\tret = sys_gettimeofday(\n\t\t\t(struct timeval __user *)regs->di,\n\t\t\t(struct timezone __user *)regs->si);\n\t\tbreak;\n\t...\n\t...\n\t...\n}\n```\n\nIn the end we put the result of the `sys_gettimeofday` or another virtual system call handler to the `ax` general purpose register, as we did with the normal system calls, restore the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter) register and add `8` bytes to the [stack pointer](https://en.wikipedia.org/wiki/Stack_register) register. This operation emulates the `ret` instruction.\n\n```C\n\tregs->ax = ret;\n\ndo_ret:\n\tregs->ip = caller;\n\tregs->sp += 8;\n\treturn true;\n```\n\nThat's all. Now let's look at the modern concept - `vDSO`.\n\nIntroduction to vDSO\n--------------------------------------------------------------------------------\n\nAs I already wrote above, `vsyscall` is an obsolete concept that has been replaced by the `vDSO` or `virtual dynamic shared object`. 
The main difference between the `vsyscall` and `vDSO` mechanisms is that `vDSO` maps memory pages into each process in a shared object [form](https://en.wikipedia.org/wiki/Library_%28computing%29#Shared_libraries), but `vsyscall` is static in memory and has the same address every time. For the `x86_64` architecture it is called -`linux-vdso.so.1`. All userspace applications that dynamically link to `glibc` will use the `vDSO` automatically. For example:\n\n```\n~$ ldd /bin/uname\n\tlinux-vdso.so.1 (0x00007ffe014b7000)\n\tlibc.so.6 => /lib64/libc.so.6 (0x00007fbfee2fe000)\n\t/lib64/ld-linux-x86-64.so.2 (0x00005559aab7c000)\n```\n\nOr:\n\n```\n~$ sudo cat /proc/1/maps | grep vdso\n7fff39f73000-7fff39f75000 r-xp 00000000 00:00 0       [vdso]\n```\n\nHere we can see that [uname](https://en.wikipedia.org/wiki/Uname) util was linked with the three libraries:\n\n* `linux-vdso.so.1`;\n* `libc.so.6`;\n* `ld-linux-x86-64.so.2`.\n\nThe first provides `vDSO` functionality, the second is `C` [standard library](https://en.wikipedia.org/wiki/C_standard_library) and the third is the program interpreter (more about this you can read in the part that describes [linkers](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3)). So, the `vDSO` solves limitations of the `vsyscall`. Implementation of the `vDSO` is similar to `vsyscall`.\n\nInitialization of the `vDSO` occurs in the `init_vdso` function that defined in the [arch/x86/entry/vdso/vma.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vma.c) source code file. This function starts from the initialization of the `vDSO` images for 32-bits and 64-bits depends on the `CONFIG_X86_X32_ABI` kernel configuration option:\n\n```C\nstatic int __init init_vdso(void)\n{\n\tinit_vdso_image(&vdso_image_64);\n\n#ifdef CONFIG_X86_X32_ABI\n\tinit_vdso_image(&vdso_image_x32);\n#endif\n```\n\nBoth functions initialize the `vdso_image` structure. 
This structure is defined in the two generated source code files: the [arch/x86/entry/vdso/vdso-image-64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-64.c) and the [arch/x86/entry/vdso/vdso-image-32.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-32.c). These source code files are generated by the [vdso2c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso2c.c) program from the different source code files and represent different approaches to call a system call like `int 0x80`, `sysenter`, etc. The full set of the images depends on the kernel configuration.\n\nFor example for the `x86_64` Linux kernel it will contain `vdso_image_64`:\n\n```C\n#ifdef CONFIG_X86_64\nextern const struct vdso_image vdso_image_64;\n#endif\n```\n\nBut for the `x32` ABI - `vdso_image_x32`:\n\n```C\n#ifdef CONFIG_X86_X32\nextern const struct vdso_image vdso_image_x32;\n#endif\n```\n\nIf our kernel is configured for the `x86` architecture or for the `x86_64` with compatibility mode, we will have the ability to call a system call with the `int 0x80` interrupt or with the `sysenter` instruction and, if compatibility mode is enabled, also with the native `syscall` instruction:\n\n```C\n#if defined CONFIG_X86_32 || defined CONFIG_COMPAT\n  extern const struct vdso_image vdso_image_32_int80;\n#ifdef CONFIG_COMPAT\n  extern const struct vdso_image vdso_image_32_syscall;\n#endif\n extern const struct vdso_image vdso_image_32_sysenter;\n#endif\n```\n\nAs we can understand from the name of the `vdso_image` structure, it represents an image of the `vDSO` for a certain mode of the system call entry. 
This structure contains information about size in bytes of the `vDSO` area that's always a multiple of `PAGE_SIZE` (`4096` bytes), pointer to the text mapping, start and end address of the `alternatives` (set of instructions with better alternatives for the certain type of the processor), etc. For example `vdso_image_64` looks like this:\n\n```C\nconst struct vdso_image vdso_image_64 = {\n\t.data = raw_data,\n\t.size = 8192,\n\t.text_mapping = {\n\t\t.name = \"[vdso]\",\n\t\t.pages = pages,\n\t},\n\t.alt = 3145,\n\t.alt_len = 26,\n\t.sym_vvar_start = -8192,\n\t.sym_vvar_page = -8192,\n\t.sym_hpet_page = -4096,\n};\n```\n\nWhere the `raw_data` contains raw binary code of the 64-bit `vDSO` system calls which are `2` page size:\n\n```C\nstatic struct page *pages[2];\n```\n\nor 8 Kilobytes.\n\nThe `init_vdso_image` function is defined in the same source code file and just initializes the `vdso_image.text_mapping.pages`. First of all this function calculates the number of pages and initializes each `vdso_image.text_mapping.pages[number_of_page]` with the `virt_to_page` macro that converts given address to the `page` structure:\n\n```C\nvoid __init init_vdso_image(const struct vdso_image *image)\n{\n\tint i;\n\tint npages = (image->size) / PAGE_SIZE;\n\n\tfor (i = 0; i < npages; i++)\n\t\timage->text_mapping.pages[i] =\n\t\t\tvirt_to_page(image->data + i*PAGE_SIZE);\n\t...\n\t...\n\t...\n}\n```\n\nThe `init_vdso` function passed to the `subsys_initcall` macro adds the given function to the `initcalls` list. All functions from this list will be called in the `do_initcalls` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file:\n\n```C\nsubsys_initcall(init_vdso);\n```\n\nOk, we just saw initialization of the `vDSO` and initialization of `page` structures that are related to the memory pages that contain `vDSO` system calls. But to where do their pages map? 
Actually they are mapped by the kernel, when it loads a binary into memory. The Linux kernel calls the `arch_setup_additional_pages` function from the [arch/x86/entry/vdso/vma.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vma.c) source code file that checks that `vDSO` is enabled for the `x86_64` and calls the `map_vdso` function:\n\n```C\nint arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)\n{\n\tif (!vdso64_enabled)\n\t\treturn 0;\n\n\treturn map_vdso(&vdso_image_64, true);\n}\n```\n\nThe `map_vdso` function is defined in the same source code file and maps pages for the `vDSO` and for the shared `vDSO` variables. That's all. The main difference between the `vsyscall` and the `vDSO` concepts is that `vsyscall` has a static address of `ffffffffff600000` and implements three system calls, whereas the `vDSO` is loaded dynamically and implements five system calls, as defined in [arch/x86/entry/vdso/vdso.lds.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/vdso/vdso.lds.S):\n\n* `__vdso_clock_gettime`;\n* `__vdso_getcpu`;\n* `__vdso_gettimeofday`;\n* `__vdso_time`;\n* `__vdso_clock_getres`.\n\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the third part about the system calls concept in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-2) we discussed the implementation of the preparation from the Linux kernel side, before a system call will be handled and implementation of the `exit` process from a system call handler. 
In this part we continued to dive into the stuff which is related to the system call concept and learned two new concepts that are very similar to the system call - the `vsyscall` and the `vDSO`.\n\nAfter all of these three parts, we know almost all things that are related to system calls, we know what system call is and why user applications need them.  We also know what occurs when a user application calls a system call and how the kernel handles system calls.\n\nThe next part will be the last part in this [chapter](https://0xax.gitbook.io/linux-insides/summary/syscall) and we will see what occurs when a user runs the program.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [x86_64 memory map](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [context switching](https://en.wikipedia.org/wiki/Context_switch)\n* [ABI](https://en.wikipedia.org/wiki/Application_binary_interface)\n* [virtual address](https://en.wikipedia.org/wiki/Virtual_address_space)\n* [Segmentation](https://en.wikipedia.org/wiki/Memory_segmentation)\n* [enum](https://en.wikipedia.org/wiki/Enumerated_type)\n* [fix-mapped addresses](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2)\n* [glibc](https://en.wikipedia.org/wiki/GNU_C_Library)\n* [BUILD_BUG_ON](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1)\n* [Processor 
register](https://en.wikipedia.org/wiki/Processor_register)\n* [Page fault](https://en.wikipedia.org/wiki/Page_fault)\n* [segmentation fault](https://en.wikipedia.org/wiki/Segmentation_fault)\n* [instruction pointer](https://en.wikipedia.org/wiki/Program_counter)\n* [stack pointer](https://en.wikipedia.org/wiki/Stack_register)\n* [uname](https://en.wikipedia.org/wiki/Uname)\n* [Linkers](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-2)\n"
  },
  {
    "path": "SysCall/linux-syscall-4.md",
    "content": "System calls in the Linux kernel. Part 4.\n================================================================================\n\nHow does the Linux kernel run a program\n--------------------------------------------------------------------------------\n\nThis is the fourth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/syscall) that describes [system calls](https://en.wikipedia.org/wiki/System_call) in the Linux kernel and as I wrote in the conclusion of the [previous](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-3) - this part will be last in this chapter. In the previous part we stopped at the two new concepts:\n\n* `vsyscall`;\n* `vDSO`;\n\nthat are related and very similar on system call concept.\n\nThis part will be last part in this chapter and as you can understand from the part's title - we will see what does occur in the Linux kernel when we run our programs. So, let's start.\n\nhow do we launch our programs?\n--------------------------------------------------------------------------------\n\nThere are many different ways to launch an application from a user perspective. For example we can run a program from the [shell](https://en.wikipedia.org/wiki/Unix_shell) or double-click on the application icon. It does not matter. The Linux kernel handles application launch regardless how we do launch this application.\n\nIn this part we will consider the way when we just launch an application from the shell. As you know, the standard way to launch an application from shell is the following: We just launch a [terminal emulator](https://en.wikipedia.org/wiki/Terminal_emulator) application and just write the name of the program and pass or not arguments to our program, for example:\n\n![ls shell](images/ls_shell.png)\n\nLet's consider what does occur when we launch an application from the shell, what does shell do when we write program name, what does Linux kernel do etc. 
But before we will start to consider these interesting things, I want to warn that this book is about the Linux kernel. That's why we will see Linux kernel insides related stuff mostly in this part. We will not consider in details what does shell do, we will not consider complex cases, for example subshells etc.\n\nMy default shell is - [bash](https://en.wikipedia.org/wiki/Bash_%28Unix_shell%29), so I will consider how do bash shell launches a program. So let's start. The `bash` shell as well as any program that written with [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) programming language starts from the [main](https://en.wikipedia.org/wiki/Entry_point) function. If you will look on the source code of the `bash` shell, you will find the `main` function in the [shell.c](https://github.com/bminor/bash/blob/bc007799f0e1362100375bb95d952d28de4c62fb/shell.c#L357) source code file. This function makes many different things before the main thread loop of the `bash` started to work. For example this function:\n\n* checks and tries to open `/dev/tty`;\n* check that shell running in debug mode;\n* parses command line arguments;\n* reads shell environment;\n* loads `.bashrc`, `.profile` and other configuration files;\n* and many many more.\n\nAfter all of these operations we can see the call of the `reader_loop` function. This function defined in the [eval.c](https://github.com/bminor/bash/blob/bc007799f0e1362100375bb95d952d28de4c62fb/eval.c#L67) source code file and represents main thread loop or in other words it reads and executes commands. As the `reader_loop` function made all checks and read the given program name and arguments, it calls the `execute_command` function from the [execute_cmd.c](https://github.com/bminor/bash/blob/bc007799f0e1362100375bb95d952d28de4c62fb/execute_cmd.c#L378) source code file. 
The `execute_command` function through the chain of the functions calls:\n\n```\nexecute_command\n--> execute_command_internal\n----> execute_simple_command\n------> execute_disk_command\n--------> shell_execve\n```\n\nmakes different checks like do we need to start `subshell`, was it builtin `bash` function or not etc. As I already wrote above, we will not consider all details about things that are not related to the Linux kernel. In the end of this process, the `shell_execve` function calls the `execve` system call:\n\n```C\nexecve (command, args, env);\n```\n\nThe `execve` system call has the following signature:\n\n```\nint execve(const char *filename, char *const argv [], char *const envp[]);\n```\n\nand executes a program by the given filename, with the given arguments and [environment variables](https://en.wikipedia.org/wiki/Environment_variable). This system call is the first in our case and only, for example:\n\n```\n$ strace ls\nexecve(\"/bin/ls\", [\"ls\"], [/* 62 vars */]) = 0\n\n$ strace echo\nexecve(\"/bin/echo\", [\"echo\"], [/* 62 vars */]) = 0\n\n$ strace uname\nexecve(\"/bin/uname\", [\"uname\"], [/* 62 vars */]) = 0\n```\n\nSo, a user application (`bash` in our case) calls the system call and as we already know the next step is Linux kernel.\n\nexecve system call\n--------------------------------------------------------------------------------\n\nWe saw preparation before a system call called by a user application and after a system call handler finished its work in the second [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-2) of this chapter. We stopped at the call of the `execve` system call in the previous paragraph. 
This system call is defined in the [fs/exec.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/exec.c) source code file and as we already know it takes three arguments:\n\n```\nSYSCALL_DEFINE3(execve,\n\t\tconst char __user *, filename,\n\t\tconst char __user *const __user *, argv,\n\t\tconst char __user *const __user *, envp)\n{\n\treturn do_execve(getname(filename), argv, envp);\n}\n```\n\nImplementation of the `execve` is pretty simple here, as we can see it just returns the result of the `do_execve` function. The `do_execve` function is defined in the same source code file and does the following things:\n\n* initializes two pointers to userspace data with the given arguments and environment variables;\n* returns the result of the `do_execveat_common`.\n\nWe can see its implementation:\n\n```C\nstruct user_arg_ptr argv = { .ptr.native = __argv };\nstruct user_arg_ptr envp = { .ptr.native = __envp };\nreturn do_execveat_common(AT_FDCWD, filename, argv, envp, 0);\n```\n\nThe `do_execveat_common` function does the main work - it executes a new program. This function takes a similar set of arguments, but as you can see it takes five arguments instead of three. The first argument is the file descriptor that represents the directory with our application, in our case the `AT_FDCWD` means that the given pathname is interpreted relative to the current working directory of the calling process. The fifth argument is flags. In our case we passed `0` to the `do_execveat_common`. These flags are checked in a later step, so we will see them later.\n\nFirst of all the `do_execveat_common` function checks the `filename` pointer and returns the corresponding error code if it is an error pointer. 
After this we check flags of the current process that limit of running processes is not exceeded:\n\n```C\nif (IS_ERR(filename))\n\treturn PTR_ERR(filename);\n\nif ((current->flags & PF_NPROC_EXCEEDED) &&\n\tatomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {\n\tretval = -EAGAIN;\n\tgoto out_ret;\n}\n\ncurrent->flags &= ~PF_NPROC_EXCEEDED;\n```\n\nIf these two checks were successful we unset `PF_NPROC_EXCEEDED` flag in the flags of the current process to prevent fail of the `execve`. You can see that in the next step we call the `unshare_files` function that defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c) and unshares the files of the current task and check the result of this function:\n\n```C\nretval = unshare_files(&displaced);\nif (retval)\n\tgoto out_ret;\n```\n\nWe need to call this function to eliminate potential leak of the execve'd binary's [file descriptor](https://en.wikipedia.org/wiki/File_descriptor). In the next step we start preparation of the `bprm` that represented by the `struct linux_binprm` structure (defined in the [include/linux/binfmts.h](https://github.com/torvalds/linux/blob/master/include/linux/binfmts.h) header file). The `linux_binprm` structure is used to hold the arguments that are used when loading binaries. 
For example it contains the `vma` field which has `vm_area_struct` type and represents a single memory area over a contiguous interval in a given address space where our application will be loaded, the `mm` field which is the memory descriptor of the binary, a pointer to the top of memory and many other different fields.\n\nFirst of all we allocate memory for this structure with the `kzalloc` function and check the result of the allocation:\n\n```C\nbprm = kzalloc(sizeof(*bprm), GFP_KERNEL);\nif (!bprm)\n\tgoto out_files;\n```\n\nAfter this we start to prepare the `binprm` credentials with the call of the `prepare_bprm_creds` function:\n\n```C\nretval = prepare_bprm_creds(bprm);\n\tif (retval)\n\t\tgoto out_free;\n\ncheck_unsafe_exec(bprm);\ncurrent->in_execve = 1;\n```\n\nInitialization of the `binprm` credentials in other words is initialization of the `cred` structure that is stored inside of the `linux_binprm` structure. The `cred` structure contains the security context of a task, for example the [real uid](https://en.wikipedia.org/wiki/User_identifier#Real_user_ID) of the task, the real [gid](https://en.wikipedia.org/wiki/Group_identifier) of the task, the `uid` and `gid` for the [virtual file system](https://en.wikipedia.org/wiki/Virtual_file_system) operations etc. 
In the next step, as we have finished the preparation of the `bprm` credentials, we check that we can now safely execute a program with the call of the `check_unsafe_exec` function and set the current process to the `in_execve` state.\n\nAfter all of these operations we call the `do_open_execat` function that checks the flags that we passed to the `do_execveat_common` function (remember that we have `0` in the `flags`), searches and opens the executable file on disk, checks that we will not load a binary file from `noexec` mount points (we need to avoid executing a binary from filesystems that do not contain executable binaries like [proc](https://en.wikipedia.org/wiki/Procfs) or [sysfs](https://en.wikipedia.org/wiki/Sysfs)), initializes the `file` structure and returns a pointer to this structure. Next we can see the call of the `sched_exec` function:\n\n```C\nfile = do_open_execat(fd, filename, flags);\nretval = PTR_ERR(file);\nif (IS_ERR(file))\n\tgoto out_unmark;\n\nsched_exec();\n```\n\nThe `sched_exec` function is used to determine the least loaded processor that can execute the new program and to migrate the current process to it.\n\nAfter this we need to check the [file descriptor](https://en.wikipedia.org/wiki/File_descriptor) of the given executable binary. 
We try to check does the name of the our binary file starts from the `/` symbol or does the path of the given executable binary is interpreted relative to the current working directory of the calling process or in other words file descriptor is `AT_FDCWD` (read above about this).\n\nIf one of these checks is successful we set the binary parameter filename:\n\n```C\nbprm->file = file;\n\nif (fd == AT_FDCWD || filename->name[0] == '/') {\n\tbprm->filename = filename->name;\n}\n```\n\nOtherwise if the filename is empty we set the binary parameter filename to the `/dev/fd/%d` or `/dev/fd/%d/%s` depends on the filename of the given executable binary which means that we will execute the file to which the file descriptor refers:\n\n```C\n} else {\n\tif (filename->name[0] == '\\0')\n\t\tpathbuf = kasprintf(GFP_TEMPORARY, \"/dev/fd/%d\", fd);\n\telse\n\t\tpathbuf = kasprintf(GFP_TEMPORARY, \"/dev/fd/%d/%s\",\n\t\t                    fd, filename->name);\n\tif (!pathbuf) {\n\t\tretval = -ENOMEM;\n\t\tgoto out_unmark;\n\t}\n\n\tbprm->filename = pathbuf;\n}\n\nbprm->interp = bprm->filename;\n```\n\nNote that we set not only the `bprm->filename` but also `bprm->interp` that will contain name of the program interpreter. For now we just write the same name there, but later it will be updated with the real name of the program interpreter depends on binary format of a program. You can read above that we already prepared `cred` for the `linux_binprm`. The next step is initialization of other fields of the `linux_binprm`.  First of all we call the `bprm_mm_init` function and pass the `bprm` to it:\n\n```C\nretval = bprm_mm_init(bprm);\nif (retval)\n\tgoto out_unmark;\n```\n\nThe `bprm_mm_init` defined in the same source code file and as we can understand from the function's name, it makes initialization of the memory descriptor or in other words the `bprm_mm_init` function initializes `mm_struct` structure. 
This structure is defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/master/include/linux/mm_types.h) header file and represents the address space of a process. We will not consider the implementation of the `bprm_mm_init` function because we do not yet know many important things related to the Linux kernel memory manager, but we just need to know that this function initializes `mm_struct` and populates it with a temporary stack `vm_area_struct`.\n\nAfter this we calculate the count of the command line arguments which were passed to our executable binary, the count of the environment variables and set it to the `bprm->argc` and `bprm->envc` respectively:\n\n```C\nbprm->argc = count(argv, MAX_ARG_STRINGS);\nif ((retval = bprm->argc) < 0)\n\tgoto out;\n\nbprm->envc = count(envp, MAX_ARG_STRINGS);\nif ((retval = bprm->envc) < 0)\n\tgoto out;\n```\n\nAs you can see we do these operations with the help of the `count` function that is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/exec.c) source code file and calculates the count of strings in the `argv` array. The `MAX_ARG_STRINGS` macro is defined in the [include/uapi/linux/binfmts.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/binfmts.h) header file and as we can understand from the macro's name, it represents the maximum number of strings that can be passed to the `execve` system call. The value of the `MAX_ARG_STRINGS`:\n\n```C\n#define MAX_ARG_STRINGS 0x7FFFFFFF\n```\n\nAfter we calculated the number of the command line arguments and environment variables, we call the `prepare_binprm` function. We already called a function with a similar name before this moment. That function is called `prepare_bprm_creds` and we remember that it initializes the `cred` structure in the `linux_binprm`. 
Now the `prepare_binprm` function:\n\n```C\nretval = prepare_binprm(bprm);\nif (retval < 0)\n\tgoto out;\n```\n\nfills the `linux_binprm` structure with the `uid` from [inode](https://en.wikipedia.org/wiki/Inode) and reads `128` bytes from the binary executable file. We read only the first `128` bytes from the executable file because we need to check the type of our executable. We will read the rest of the executable file in a later step. After the preparation of the `linux_binprm` structure we copy the filename of the executable binary file, command line arguments and environment variables to the `linux_binprm` with calls of the `copy_strings_kernel` and `copy_strings` functions:\n\n```C\nretval = copy_strings_kernel(1, &bprm->filename, bprm);\nif (retval < 0)\n\tgoto out;\n\nretval = copy_strings(bprm->envc, envp, bprm);\nif (retval < 0)\n\tgoto out;\n\nretval = copy_strings(bprm->argc, argv, bprm);\nif (retval < 0)\n\tgoto out;\n```\n\nAnd set the pointer to the top of new program's stack that we set in the `bprm_mm_init` function:\n\n```C\nbprm->exec = bprm->p;\n```\n\nThe top of the stack will contain the program filename and we store this filename to the `exec` field of the `linux_binprm` structure.\n\nNow that we have filled the `linux_binprm` structure, we call the `exec_binprm` function:\n\n```C\nretval = exec_binprm(bprm);\nif (retval < 0)\n\tgoto out;\n```\n\nFirst of all, in the `exec_binprm` we store the [pid](https://en.wikipedia.org/wiki/Process_identifier) of the current task and its `pid` as seen from the [namespace](https://en.wikipedia.org/wiki/Cgroups) of its parent:\n\n```C\nold_pid = current->pid;\nrcu_read_lock();\nold_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));\nrcu_read_unlock();\n```\n\nand call the:\n\n```C\nsearch_binary_handler(bprm);\n```\n\nfunction. This function goes through the list of handlers that contains different binary formats. 
Currently the Linux kernel supports the following binary formats:\n\n* `binfmt_script` - support for interpreted scripts that start with the [#!](https://en.wikipedia.org/wiki/Shebang_%28Unix%29) line;\n* `binfmt_misc` - support for different binary formats, according to runtime configuration of the Linux kernel;\n* `binfmt_elf` - support for [elf](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) format;\n* `binfmt_aout` - support for [a.out](https://en.wikipedia.org/wiki/A.out) format;\n* `binfmt_flat` - support for [flat](https://en.wikipedia.org/wiki/Binary_file#Structure) format;\n* `binfmt_elf_fdpic` - support for [elf](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) [FDPIC](http://elinux.org/UClinux_Shared_Library#FDPIC_ELF) binaries;\n* `binfmt_em86` - support for Intel [elf](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) binaries running on [Alpha](https://en.wikipedia.org/wiki/DEC_Alpha) machines.\n\nSo, the `search_binary_handler` tries to call the `load_binary` function of each handler and passes `linux_binprm` to it. 
If the binary handler supports the given executable file format, it starts to prepare the executable binary for execution:\n\n```C\nint search_binary_handler(struct linux_binprm *bprm)\n{\n\t...\n\t...\n\t...\n\tlist_for_each_entry(fmt, &formats, lh) {\n\t\tretval = fmt->load_binary(bprm);\n\t\tif (retval < 0 && !bprm->mm) {\n\t\t\tforce_sigsegv(SIGSEGV, current);\n\t\t\treturn retval;\n\t\t}\n\t}\n\n\treturn retval;\n```\n\nWhere the `load_binary`, for example for the [elf](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) format, checks the magic number (each `elf` binary file contains a magic number in the header) in the `linux_binprm` buffer (remember that we read the first `128` bytes from the executable binary file) and exits if it is not an `elf` binary:\n\n```C\nstatic int load_elf_binary(struct linux_binprm *bprm)\n{\n\t...\n\t...\n\t...\n\tloc->elf_ex = *((struct elfhdr *)bprm->buf);\n\n\tif (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)\n\t\tgoto out;\n```\n\nIf the given executable file is in `elf` format, the `load_elf_binary` continues to execute. The `load_elf_binary` does many different things to prepare the executable file for execution. For example it checks the architecture and type of the executable file:\n\n```C\nif (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)\n\tgoto out;\nif (!elf_check_arch(&loc->elf_ex))\n\tgoto out;\n```\n\nand exits if the architecture is wrong or the file is neither an executable nor a shared object. Then it tries to load the `program header table`:\n\n```C\nelf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);\nif (!elf_phdata)\n\tgoto out;\n```\n\nthat describes [segments](https://en.wikipedia.org/wiki/Memory_segmentation). It reads the `program interpreter` and the libraries that are linked with our executable binary file from disk and loads them into memory. 
The `program interpreter` specified in the `.interp` section of the executable file and as you can read in the part that describes [Linkers](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3) it is - `/lib64/ld-linux-x86-64.so.2` for the `x86_64`. It setups the stack and map `elf` binary into the correct location in memory. It maps the [bss](https://en.wikipedia.org/wiki/.bss) and the [brk](http://man7.org/linux/man-pages/man2/sbrk.2.html) sections and does many many other different things to prepare executable file to execute.\n\nIn the end of the execution of the `load_elf_binary` we call the `start_thread` function and pass three arguments to it:\n\n```C\n\tstart_thread(regs, elf_entry, bprm->p);\n\tretval = 0;\nout:\n\tkfree(loc);\nout_ret:\n\treturn retval;\n```\n\nThese arguments are:\n\n* Set of [registers](https://en.wikipedia.org/wiki/Processor_register) for the new task;\n* Address of the entry point of the new task;\n* Address of the top of the stack for the new task.\n\nAs we can understand from the function's name, it starts new thread, but it is not so. The `start_thread` function just prepares new task's registers to be ready to run. 
Let's look on the implementation of this function:\n\n```C\nvoid\nstart_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)\n{\n        start_thread_common(regs, new_ip, new_sp,\n                            __USER_CS, __USER_DS, 0);\n}\n```\n\nAs we can see the `start_thread` function just makes a call of the `start_thread_common` function that will do all for us:\n\n```C\nstatic void\nstart_thread_common(struct pt_regs *regs, unsigned long new_ip,\n                    unsigned long new_sp,\n                    unsigned int _cs, unsigned int _ss, unsigned int _ds)\n{\n        loadsegment(fs, 0);\n        loadsegment(es, _ds);\n        loadsegment(ds, _ds);\n        load_gs_index(0);\n        regs->ip                = new_ip;\n        regs->sp                = new_sp;\n        regs->cs                = _cs;\n        regs->ss                = _ss;\n        regs->flags             = X86_EFLAGS_IF;\n        force_iret();\n}\n```\n\nThe `start_thread_common` function fills `fs` segment register with zero and `es` and `ds` with the value of the data segment register. After this we set new values to the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter), `cs` segments etc. In the end of the `start_thread_common` function we can see the `force_iret` macro that forces a system call return via `iret` instruction. Ok, we prepared new thread to run in userspace and now we can return from the `exec_binprm` and now we are in the `do_execveat_common` again. After the `exec_binprm` will finish its execution we release memory for structures that was allocated before and return.\n\nAfter we returned from the `execve` system call handler, execution of our program will be started. We can do it, because all context related information is already configured for this purpose. As we saw the `execve` system call does not return control to a process, but code, data and other segments of the caller process are just overwritten of the program segments. 
The exit from our application will be implemented through the `exit` system call.\n\nThat's all. From this point our program will be executed.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the fourth part of the chapter about the system calls concept in the Linux kernel. We saw almost everything related to the `system call` concept in these four parts. We started from the understanding of the `system call` concept: we learned what it is and why user applications need it. Next we saw how Linux handles a system call from a user application. We met two concepts similar to the `system call` concept, `vsyscall` and `vDSO`, and finally we saw how the Linux kernel runs a user program.\n\nIf you have questions or suggestions, feel free to ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [System call](https://en.wikipedia.org/wiki/System_call)\n* [shell](https://en.wikipedia.org/wiki/Unix_shell)\n* [bash](https://en.wikipedia.org/wiki/Bash_%28Unix_shell%29)\n* [entry point](https://en.wikipedia.org/wiki/Entry_point)\n* [C](https://en.wikipedia.org/wiki/C_%28programming_language%29)\n* [environment variables](https://en.wikipedia.org/wiki/Environment_variable)\n* [file descriptor](https://en.wikipedia.org/wiki/File_descriptor)\n* [real uid](https://en.wikipedia.org/wiki/User_identifier#Real_user_ID)\n* [virtual file system](https://en.wikipedia.org/wiki/Virtual_file_system)\n* [procfs](https://en.wikipedia.org/wiki/Procfs)\n* [sysfs](https://en.wikipedia.org/wiki/Sysfs)\n* [inode](https://en.wikipedia.org/wiki/Inode)\n* [pid](https://en.wikipedia.org/wiki/Process_identifier)\n* [namespace](https://en.wikipedia.org/wiki/Cgroups)\n* [#!](https://en.wikipedia.org/wiki/Shebang_%28Unix%29)\n* [elf](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format)\n* [a.out](https://en.wikipedia.org/wiki/A.out)\n* [flat](https://en.wikipedia.org/wiki/Binary_file#Structure)\n* [Alpha](https://en.wikipedia.org/wiki/DEC_Alpha)\n* [FDPIC](http://elinux.org/UClinux_Shared_Library#FDPIC_ELF)\n* [segments](https://en.wikipedia.org/wiki/Memory_segmentation)\n* [Linkers](https://0xax.gitbook.io/linux-insides/summary/misc/linux-misc-3)\n* [Processor register](https://en.wikipedia.org/wiki/Processor_register)\n* [instruction pointer](https://en.wikipedia.org/wiki/Program_counter)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-3)\n"
  },
  {
    "path": "SysCall/linux-syscall-5.md",
    "content": "How does the `open` system call work\n--------------------------------------------------------------------------------\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is the fifth part of the chapter that describes [system calls](https://en.wikipedia.org/wiki/System_call) mechanism in the Linux kernel. Previous parts of this chapter described the mechanism of system calls in general. I will now try to describe the implementation of different system calls in the Linux kernel. Previous parts from this chapter and parts of other chapters of the book mostly described deep parts of the Linux kernel that are barely visible or  invisible from userspace. However, the greatness of the Linux kernel is not its singular existence, but its ability to enable our code to perform various useful functions such as reading/writing from/to files without the knowledge of details such as sectors, tracks and other nitty gritties of the disk layout. For eg., the kernel allows programs to send data over networks without our having to encapsulate network packets by hand etc.\n\nI don't know how about you, but the inner workings of the operating system both fascinate and excite my curiosity greatly. As you may know, our programs interact with the kernel through a special mechanism called [system call](https://en.wikipedia.org/wiki/System_call). I will hence attempt to describe the implementation and behavior of system calls such as `read`, `write`, `open`, `close`, `dup` etc. in a series of articles.\n\nLet me start with the description of the simplest (and commonly used) [open](http://man7.org/linux/man-pages/man2/open.2.html) system call. 
if you have done any `C` programming at all, you should know that a file must be opened using the `open` system call before we are able to read/write to it.\n\n```C\n#include <fcntl.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <unistd.h>\n#include <sys/stat.h>\n#include <sys/types.h>\n\nint main(int argc, char *argv) {\n        int fd = open(\"test\", O_RDONLY);\n\n        if fd < 0 {\n                perror(\"Opening of the file is failed\\n\");\n        }\n        else {\n                printf(\"file successfully opened\\n\");\n        }\n\n        close(fd);\n        return 0;\n}\n```\n\nIn this case, `open` is a function from standard library, but not the system call. The standard library will call the related system call for us. The `open` call will return a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor) which is just a unique number within our process which is associated with the opened file. Now as we opened a file and got file descriptor as result of `open` call, we may start to interact with this file. We can write into, read from it and etc. List of opened file by a process is available via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem: \n\n```\n$ sudo ls /proc/1/fd/\n\n0  10  12  14  16  2   21  23  25  27  29  30  32  34  36  38  4   41  43  45  47  49  50  53  55  58  6   61  63  67  8\n1  11  13  15  19  20  22  24  26  28  3   31  33  35  37  39  40  42  44  46  48  5   51  54  57  59  60  62  65  7   9\n```\n\nI am not going to describe more details about the `open` routine from the userspace view in this post, but mostly from the kernel side. 
If you are not very familiar with it, you can get more info in the [man page](http://man7.org/linux/man-pages/man2/open.2.html).\n\nSo let's start.\n\nDefinition of the open system call\n--------------------------------------------------------------------------------\n\nIf you have read the [fourth part](https://github.com/0xAX/linux-insides/blob/master/SysCall/linux-syscall-4.md) of the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book, you should know that system calls are defined with the help of the `SYSCALL_DEFINE` macro. The `open` system call is no exception.\n\nThe definition of the `open` system call is located in the [fs/open.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file and looks pretty small at first glance:\n\n```C\nSYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)\n{\n\tif (force_o_largefile())\n\t\tflags |= O_LARGEFILE;\n\n\treturn do_sys_open(AT_FDCWD, filename, flags, mode);\n}\n```\n\nAs you may guess, the `do_sys_open` function from the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file does the main job. But before this function is called, let's consider the `if` clause from which the implementation of the `open` system call starts:\n\n```C\nif (force_o_largefile())\n\tflags |= O_LARGEFILE;\n```\n\nHere we apply the `O_LARGEFILE` flag to the flags which were passed to the `open` system call when `force_o_largefile()` returns true.\nWhat is `O_LARGEFILE`? 
We may read this in the [man page](http://man7.org/linux/man-pages/man2/open.2.html) for the `open(2)` system call:\n\n> O_LARGEFILE\n>\n> (LFS) Allow files whose sizes cannot be represented in an off_t (but can be represented in an off64_t) to be opened.\n\nAs we may read in the [GNU C Library Reference Manual](https://www.gnu.org/software/libc/manual/html_mono/libc.html#File-Position-Primitive):\n\n> off_t\n>\n>    This is a signed integer type used to represent file sizes.\n>    In the GNU C Library, this type is no narrower than int.\n>    If the source is compiled with _FILE_OFFSET_BITS == 64 this\n>    type is transparently replaced by off64_t.\n\nand\n\n> off64_t\n>\n>    This type is used similar to off_t. The difference is that\n>    even on 32 bit machines, where the off_t type would have 32 bits,\n>    off64_t has 64 bits and so is able to address files up to 2^63 bytes\n>    in length. When compiling with _FILE_OFFSET_BITS == 64 this type\n>    is available under the name off_t.\n\nSo it is not hard to guess that the `off_t`, `off64_t` and `O_LARGEFILE` are about a file size. In the case of the Linux kernel, the `O_LARGEFILE` is used  to disallow opening large files on 32bit systems if the caller didn't specify `O_LARGEFILE` flag during opening of a file. On 64bit systems we force on this flag in open system call. 
And the `force_o_largefile` macro from the [include/linux/fcntl.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fcntl.h#L7) Linux kernel header file confirms this:\n\n```C\n#ifndef force_o_largefile\n#define force_o_largefile() (BITS_PER_LONG != 32)\n#endif\n```\n\nThis macro may be architecture-specific as for example for [IA-64](https://en.wikipedia.org/wiki/IA-64) architecture, but in our case the [x86_64](https://en.wikipedia.org/wiki/X86-64) does not provide definition of the `force_o_largefile` and it will be used from [include/linux/fcntl.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fcntl.h#L7).\n\nSo, as we may see the `force_o_largefile` is just a macro which expands to the `true` value in our case of [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. As we are considering 64-bit architecture, the `force_o_largefile` will be expanded to `true` and the `O_LARGEFILE` flag will be added to the set of flags which were passed to the `open` system call.\n\nNow as we considered meaning of the `O_LARGEFILE` flag and `force_o_largefile` macro, we can proceed to the consideration of the implementation of the `do_sys_open` function. 
As I wrote above, this function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file and looks:\n\n```C\nlong do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)\n{\n\tstruct open_flags op;\n\tint fd = build_open_flags(flags, mode, &op);\n\tstruct filename *tmp;\n\n\tif (fd)\n\t\treturn fd;\n\n\ttmp = getname(filename);\n\tif (IS_ERR(tmp))\n\t\treturn PTR_ERR(tmp);\n\n\tfd = get_unused_fd_flags(flags);\n\tif (fd >= 0) {\n\t\tstruct file *f = do_filp_open(dfd, tmp, &op);\n\t\tif (IS_ERR(f)) {\n\t\t\tput_unused_fd(fd);\n\t\t\tfd = PTR_ERR(f);\n\t\t} else {\n\t\t\tfsnotify_open(f);\n\t\t\tfd_install(fd, f);\n\t\t}\n\t}\n\tputname(tmp);\n\treturn fd;\n}\n```\n\nLet's try to understand how the `do_sys_open` works step by step.\n\nopen(2) flags\n--------------------------------------------------------------------------------\n\nAs you know the `open` system call takes set of `flags` as second argument that control opening a file and `mode` as third argument that specifies permission the permissions of a file if it is created. The `do_sys_open` function starts from the call of the `build_open_flags` function which does some checks that set of the given flags is valid and handles different conditions of flags and mode.\n\nLet's look at the implementation of the `build_open_flags`. 
This function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) kernel file and takes three arguments:\n\n* flags - flags that control opening of a file;\n* mode - permissions for newly created file;\n\nThe last argument - `op` is represented with the `open_flags` structure:\n\n```C\nstruct open_flags {\n        int open_flag;\n        umode_t mode;\n        int acc_mode;\n        int intent;\n        int lookup_flags;\n};\n```\n\nwhich is defined in the [fs/internal.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/internal.h#L99) header file and as we may see it holds information about flags and access mode for internal kernel purposes. As you already may guess the main goal of the `build_open_flags` function is to fill an instance of this structure.\n\nImplementation of the `build_open_flags` function starts from the definition of local variables and one of them is:\n\n```C\nint acc_mode = ACC_MODE(flags);\n```\n\nThis local variable represents access mode and its initial value will be equal to the value of expanded `ACC_MODE` macro. This macro is defined in the [include/linux/fs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fs.h) and looks pretty interesting:\n\n```C\n#define ACC_MODE(x) (\"\\004\\002\\006\\006\"[(x)&O_ACCMODE])\n#define O_ACCMODE   00000003\n```\n\nThe `\"\\004\\002\\006\\006\"` is an array of four chars:\n\n```\n\"\\004\\002\\006\\006\" == {'\\004', '\\002', '\\006', '\\006'}\n```\n\nSo, the `ACC_MODE` macro just expands to the accession to this array by `[(x) & O_ACCMODE]` index. As we just saw, the `O_ACCMODE` is `00000003`. 
By applying `x & O_ACCMODE` we will take the two least significant bits which are represents `read`, `write` or `read/write` access modes:\n\n```C\n#define O_RDONLY        00000000\n#define O_WRONLY        00000001\n#define O_RDWR          00000002\n```\n\nAfter getting value from the array by the calculated index, the `ACC_MODE` will be expanded to access mode mask of a file which will hold `MAY_WRITE`, `MAY_READ` and other information.\n\nWe may see following condition after we have calculated initial access mode:\n\n```C\nif (flags & (O_CREAT | __O_TMPFILE))\n\top->mode = (mode & S_IALLUGO) | S_IFREG;\nelse\n\top->mode = 0;\n```\n\nHere we reset permissions in `open_flags` instance if an open file wasn't temporary and wasn't open for creation. This is because:\n\n> if  neither O_CREAT nor O_TMPFILE is specified, then mode is ignored.\n\nIn other case if `O_CREAT` or `O_TMPFILE` were passed we canonicalize it to a regular file because a directory should be created with the [opendir](http://man7.org/linux/man-pages/man3/opendir.3.html) system call.\n\nAt the next step we check that a file is not tried to be opened via [fanotify](http://man7.org/linux/man-pages/man7/fanotify.7.html) and without the `O_CLOEXEC` flag:\n\n```C\nflags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;\n```\n\nWe do this to not leak a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor). By default, the new file descriptor is set to remain open across an `execve` system call, but the `open` system call supports `O_CLOEXEC` flag that can be used to change this default behaviour. 
So we do this to prevent leaking of a file descriptor when one thread opens a file to set `O_CLOEXEC` flag and in the same time the second process does a [fork](https://en.wikipedia.org/wiki/Fork_(system_call)) + [execve](https://en.wikipedia.org/wiki/Exec_(system_call)) and as you may remember that child will have copies of the parent's set of open file descriptors.\n\nAt the next step we check that if our flags contains `O_SYNC` flag, we apply `O_DSYNC` flag too:\n\n```\nif (flags & __O_SYNC)\n\tflags |= O_DSYNC;\n```\n\nThe `O_SYNC` flag guarantees that the any write call will not return before all data has been transferred to the disk. The `O_DSYNC` is like `O_SYNC` except that there is no requirement to wait for any metadata (like `atime`, `mtime` and etc.) changes will be written. We apply `O_DSYNC` in a case of `__O_SYNC` because it is implemented as `__O_SYNC|O_DSYNC` in the Linux kernel.\n\nAfter this we must be sure that if a user wants to create temporary file, the flags should contain `O_TMPFILE_MASK` or in other words it should contain or `O_CREAT` or `O_TMPFILE` or both and also it should be writeable:\n\n```C\nif (flags & __O_TMPFILE) {\n\tif ((flags & O_TMPFILE_MASK) != O_TMPFILE)\n\t\treturn -EINVAL;\n\tif (!(acc_mode & MAY_WRITE))\n\t\treturn -EINVAL;\n} else if (flags & O_PATH) {\n       \tflags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;\n        acc_mode = 0;\n}\n```\n\nas it is written in in the manual page:\n\n> O_TMPFILE  must  be  specified  with one of O_RDWR or O_WRONLY\n\nIf we didn't pass `O_TMPFILE` for creation of a temporary file, we check the `O_PATH` flag at the next condition. The `O_PATH` flag allows us to obtain a file descriptor that may be used for two following purposes:\n\n* to indicate a location in the filesystem tree;\n* to perform operations that act purely at the file descriptor level.\n\nSo, in this case the file itself is not opened, but operations like `dup`, `fcntl` and other can be used. 
So, if all file content related operations like `read`, `write` and other are not permitted, only `O_DIRECTORY | O_NOFOLLOW | O_PATH` flags can be used. We have finished with flags for now in the `build_open_flags` and we may fill our `open_flags->open_flag` with them:\n\n```C\nop->open_flag = flags;\n```\n\nNow we have filled the `open_flag` field which represents flags that will control opening of a file and `mode` that will represent the permissions of a new file if we open a file for creation. There are still a few fields to fill in our `open_flags` structure. The next is `op->acc_mode` which represents access mode to an opened file. We already filled the `acc_mode` local variable with the initial value at the beginning of the `build_open_flags` and now we check last two flags related to access mode:\n\n```C\nif (flags & O_TRUNC)\n        acc_mode |= MAY_WRITE;\nif (flags & O_APPEND)\n\tacc_mode |= MAY_APPEND;\nop->acc_mode = acc_mode;\n```\n\nThese flags are - `O_TRUNC` that will truncate an opened file to length `0` if it existed before we open it and the `O_APPEND` flag that allows us to open a file in `append mode`. So the opened file will be appended during write but not overwritten.\n\nThe next field of the `open_flags` structure is - `intent`. It allows us to know about our intention or in other words what we really want to do with a file: open it, create it, rename it or something else. So we set it to zero if our flags contain the `O_PATH` flag as we can't do anything related to a file content with this flag:\n\n```C\nop->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;\n```\n\nor just to `LOOKUP_OPEN` intention. 
Additionally we set the `LOOKUP_CREATE` intention if we want to create a new file, and the `LOOKUP_EXCL` intention if the `O_EXCL` flag was also passed to be sure that the file didn't exist before:\n\n```C\nif (flags & O_CREAT) {\n\top->intent |= LOOKUP_CREATE;\n\tif (flags & O_EXCL)\n\t\top->intent |= LOOKUP_EXCL;\n}\n```\n\nThe last field of the `open_flags` structure is the `lookup_flags`:\n\n```C\nif (flags & O_DIRECTORY)\n\tlookup_flags |= LOOKUP_DIRECTORY;\nif (!(flags & O_NOFOLLOW))\n\tlookup_flags |= LOOKUP_FOLLOW;\nop->lookup_flags = lookup_flags;\n\nreturn 0;\n```\n\nWe fill it with `LOOKUP_DIRECTORY` if we want to open a directory and with `LOOKUP_FOLLOW` unless the `O_NOFOLLOW` flag was passed, i.e. if we want to follow (open) a [symlink](https://en.wikipedia.org/wiki/Symbolic_link). That's all. It is the end of the `build_open_flags` function. The `open_flags` structure is filled with modes and flags for a file opening and we can return back to the `do_sys_open`.\n\nActual opening of a file\n--------------------------------------------------------------------------------\n\nAt the next step after `build_open_flags` function is finished and we have formed flags and modes for our file we should get the `filename` structure with the help of the `getname` function by name of a file which was passed to the `open` system call:\n\n```C\ntmp = getname(filename);\nif (IS_ERR(tmp))\n\treturn PTR_ERR(tmp);\n```\n\nThe `getname` function is defined in the [fs/namei.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c) source code file and looks like this:\n\n```C\nstruct filename *\ngetname(const char __user * filename)\n{\n        return getname_flags(filename, 0, NULL);\n}\n```\n\nSo, it just calls the `getname_flags` function and returns its result. The main goal of the `getname_flags` function is to copy a file path given from userland to kernel space. 
The `filename` structure is defined in the [include/linux/fs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fs.h) Linux kernel header file and contains following fields:\n\n* name - pointer to a file path in kernel space;\n* uptr - original pointer from userland;\n* aname - filename from [audit](https://linux.die.net/man/8/auditd) context;\n* refcnt - reference counter;\n* iname - a filename in a case when it will be less than `PATH_MAX`.\n\nAs I already wrote above, the main goal of the `getname_flags` function is to copy name of a file which was passed to the `open` system call from user space to kernel space with the strncpy_from_user function. The next step after a filename will be copied to kernel space is getting of new non-busy file descriptor:\n\n```C\nfd = get_unused_fd_flags(flags);\n```\n\nThe `get_unused_fd_flags` function takes table of open files of the current process, minimum (`0`) and maximum (`RLIMIT_NOFILE`) possible number of a file descriptor in the system and flags that we have passed to the `open` system call and allocates file descriptor and mark it busy in the file descriptor table of the current process. The `get_unused_fd_flags` function sets or clears the `O_CLOEXEC` flag depends on its state in the passed flags.\n\nThe last and main step in the `do_sys_open` is the `do_filp_open` function:\n\n```C\nstruct file *f = do_filp_open(dfd, tmp, &op);\n\nif (IS_ERR(f)) {\n\tput_unused_fd(fd);\n\tfd = PTR_ERR(f);\n} else {\n\tfsnotify_open(f);\n\tfd_install(fd, f);\n}\n```\n\nThe main goal of this function is to resolve given path name into `file` structure which represents an opened file of a process. 
If something goes wrong and the `do_filp_open` function fails, we should free the new file descriptor with the `put_unused_fd`; otherwise the `file` structure returned by the `do_filp_open` will be stored in the file descriptor table of the current process.\n\nNow let's take a short look at the implementation of the `do_filp_open` function. This function is defined in the [fs/namei.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c) Linux kernel source code file and starts from initialization of the `nameidata` structure. This structure will provide a link to a file [inode](https://en.wikipedia.org/wiki/Inode). Actually this is one of the main points of the `do_filp_open` function: to acquire an `inode` by the filename given to the `open` system call. After the `nameidata` structure is initialized, the `path_openat` function will be called:\n\n```C\nfilp = path_openat(&nd, op, flags | LOOKUP_RCU);\n\nif (unlikely(filp == ERR_PTR(-ECHILD)))\n\tfilp = path_openat(&nd, op, flags);\nif (unlikely(filp == ERR_PTR(-ESTALE)))\n\tfilp = path_openat(&nd, op, flags | LOOKUP_REVAL);\n```\n\nNote that it may be called up to three times. First, the Linux kernel tries to open the file in [RCU](https://www.kernel.org/doc/Documentation/RCU/whatisRCU.txt) mode. This is the most efficient way to open a file. If this attempt fails, the kernel falls back to the normal mode. The third call is relatively rare and is likely to be used only on the [nfs](https://en.wikipedia.org/wiki/Network_File_System) file system. The `path_openat` function executes `path lookup` or in other words it tries to find a `dentry` (what the Linux kernel uses to keep track of the hierarchy of files in directories) corresponding to a path.\n\nThe `path_openat` function starts from the call of the `get_empty_filp()` function that allocates a new `file` structure with some additional checks, such as whether we exceed the allowed amount of opened files in the system, etc. 
After we have allocated a new `file` structure we call the `do_tmpfile` or `do_o_path` functions if we have passed `O_TMPFILE | O_CREAT` or `O_PATH` flags during call of the `open` system call. Both these cases are quite specific, so let's consider the quite usual case when we want to open an already existing file and want to read/write from/to it.\n\nIn this case the `path_init` function will be called. This function performs some preparatory work before actual path lookup. This includes search of start position of path traversal and its metadata like `inode` of the path, `dentry inode`, etc. This can be `root` directory - `/` or current directory as in our case, because we use `AT_FDCWD` as starting point (see call of the `do_sys_open` at the beginning of the post).\n\nThe next step after the `path_init` is the [loop](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c#L3457) which executes the `link_path_walk` and `do_last`. The first function executes name resolution or in other words this function starts the process of walking along a given path. It handles everything step by step except the last component of a file path. This handling includes checking of permissions and getting a file component. Once a file component is obtained, it is passed to `walk_component` that updates current directory entry from the `dcache` or asks the underlying filesystem. This repeats until all of the path's components have been handled in this way. After the `link_path_walk` is executed, the `do_last` function will populate a `file` structure based on the result of the `link_path_walk`. 
As we reached the last component of the given file path the `vfs_open` function from the `do_last` will be called.\n\nThis function is defined in the [fs/open.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) Linux kernel source code file and the main goal of this function is to call an `open` operation of the underlying filesystem.\n\nThat's all for now. We didn't consider the **full** implementation of the `open` system call. We skipped some parts like handling the case when we want to open a file from another filesystem with a different mount point, resolving symlinks, etc., but it should not be so hard to follow this stuff. This stuff is not included in the **generic** implementation of the open system call and depends on the underlying filesystem. If you are interested, you may look up the `file_operations.open` callback function for a certain [filesystem](https://github.com/torvalds/linux/tree/master/fs).\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the fifth part of the implementation of different system calls in the Linux kernel. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will continue to dive into system calls in the Linux kernel and see the implementation of the [read](http://man7.org/linux/man-pages/man2/read.2.html) system call.\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [open](http://man7.org/linux/man-pages/man2/open.2.html)\n* [file descriptor](https://en.wikipedia.org/wiki/File_descriptor)\n* [proc](https://en.wikipedia.org/wiki/Procfs)\n* [GNU C Library Reference Manual](https://www.gnu.org/software/libc/manual/html_mono/libc.html#File-Position-Primitive)\n* [IA-64](https://en.wikipedia.org/wiki/IA-64)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [opendir](http://man7.org/linux/man-pages/man3/opendir.3.html)\n* [fanotify](http://man7.org/linux/man-pages/man7/fanotify.7.html)\n* [fork](https://en.wikipedia.org/wiki/Fork_\\(system_call\\))\n* [execve](https://en.wikipedia.org/wiki/Exec_\\(system_call\\))\n* [symlink](https://en.wikipedia.org/wiki/Symbolic_link)\n* [audit](https://linux.die.net/man/8/auditd)\n* [inode](https://en.wikipedia.org/wiki/Inode)\n* [RCU](https://www.kernel.org/doc/Documentation/RCU/whatisRCU.txt)\n* [read](http://man7.org/linux/man-pages/man2/read.2.html)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4)\n"
  },
  {
    "path": "SysCall/linux-syscall-6.md",
    "content": "Limits on resources in Linux\n================================================================================\n\nEach process in the system uses certain amount of different resources like files, CPU time, memory and so on.\n\nSuch resources are not infinite and each process and we should have an instrument to manage it. Sometimes it is useful to know current limits for a certain resource or to change its value. In this post we will consider such instruments that allow us to get information about limits for a process and increase or decrease such limits.\n\nWe will start from userspace view and then we will look how it is implemented in the Linux kernel.\n\nThere are three main fundamental [system calls](https://en.wikipedia.org/wiki/System_call) to manage resource limit for a process:\n\n  * `getrlimit`\n  * `setrlimit`\n  * `prlimit`\n\nThe first two allows a process to read and set limits on a system resource. The last one is extension for previous functions. The `prlimit` allows to set and read the resource limits of a process specified by [PID](https://en.wikipedia.org/wiki/Process_identifier). Definitions of these functions looks:\n\nThe `getrlimit` is:\n\n```C\nint getrlimit(int resource, struct rlimit *rlim);\n```\n\nThe `setrlimit` is:\n\n```C\nint setrlimit(int resource, const struct rlimit *rlim);\n```\n\nAnd the definition of the `prlimit` is:\n\n```C\nint prlimit(pid_t pid, int resource, const struct rlimit *new_limit,\n            struct rlimit *old_limit);\n```\n\nIn the first two cases, functions takes two parameters:\n\n  * `resource` - represents resource type (we will see available types later);\n  * `rlim` - combination of `soft` and `hard` limits.\n\nThere are two types of limits:\n\n  * `soft`\n  * `hard`\n\nThe first provides actual limit for a resource of a process. The second is a ceiling value of a `soft` limit and can be set only by superuser. 
So, `soft` limit can never exceed related `hard` limit.\n\nBoth these values are combined in the `rlimit` structure:\n\n```C\nstruct rlimit {\n    rlim_t rlim_cur;\n    rlim_t rlim_max;\n};\n```\n\nThe last one function looks a little bit complex and takes `4` arguments. Besides `resource` argument, it takes:\n\n  * `pid` - specifies an ID of a process on which the `prlimit` should be executed;\n  * `new_limit` - provides new limits values if it is not `NULL`;\n  * `old_limit` - current `soft` and `hard` limits will be placed here if it is not `NULL`.\n\nExactly `prlimit` function is used by [ulimit](https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-ulimit) util. We can verify this with the help of [strace](https://linux.die.net/man/1/strace) util.\n\nFor example:\n\n```\n~$ strace ulimit -s 2>&1 | grep rl\n\nprlimit64(0, RLIMIT_NPROC, NULL, {rlim_cur=63727, rlim_max=63727}) = 0\nprlimit64(0, RLIMIT_NOFILE, NULL, {rlim_cur=1024, rlim_max=4*1024}) = 0\nprlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0\n```\n\nHere we can see `prlimit64`, but not the `prlimit`. The fact is that we see underlying system call here instead of library call.\n\nNow let's look at list of available resources:\n\n| Resource          | Description\n|-------------------|------------------------------------------------------------------------------------------|\n| RLIMIT_CPU        | CPU time limit given in seconds                                                          |\n| RLIMIT_FSIZE      | the maximum size of files that a process may create                                      |\n| RLIMIT_DATA       | the maximum  size  of  the process's data segment                                        |\n| RLIMIT_STACK      | the maximum size of the process stack in bytes                                           |\n| RLIMIT_CORE       | the maximum size of a [core](http://man7.org/linux/man-pages/man5/core.5.html) file.     
|\n| RLIMIT_RSS        | the number of bytes that can be allocated for a process in RAM                           |\n| RLIMIT_NPROC      | the maximum number of processes that can be created by a user                            |\n| RLIMIT_NOFILE     | the maximum number of a file descriptor that can be opened by a process                  |\n| RLIMIT_MEMLOCK    | the maximum number of bytes of memory that may be locked into RAM by [mlock](http://man7.org/linux/man-pages/man2/mlock.2.html).|\n| RLIMIT_AS         | the maximum size of virtual memory in bytes.                                             |\n| RLIMIT_LOCKS      | the maximum number [flock](https://linux.die.net/man/1/flock) and locking related [fcntl](http://man7.org/linux/man-pages/man2/fcntl.2.html) calls|\n| RLIMIT_SIGPENDING | maximum number of [signals](http://man7.org/linux/man-pages/man7/signal.7.html) that may be queued for a user of the calling process|\n| RLIMIT_MSGQUEUE   | the number of bytes that can be allocated for [POSIX message queues](http://man7.org/linux/man-pages/man7/mq_overview.7.html) |\n| RLIMIT_NICE       | the maximum [nice](https://linux.die.net/man/1/nice) value that can be set by a process  |\n| RLIMIT_RTPRIO     | maximum real-time priority value                                                         |\n| RLIMIT_RTTIME     | maximum number of microseconds that a process may be scheduled under real-time scheduling policy without making blocking system call|\n\nIf you're looking into source code of open source projects, you will note that reading or updating of a resource limit is quite widely used operation.\n\nFor example: [systemd](https://github.com/systemd/systemd/blob/01a45898fce8def67d51332bccc410eb1e8710e7/src/core/main.c)\n\n```C\n/* Don't limit the coredump size */\n(void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));\n```\n\nOr 
[haproxy](https://github.com/haproxy/haproxy/blob/25f067ccec52f53b0248a05caceb7841a3cb99df/src/haproxy.c):\n\n```C\ngetrlimit(RLIMIT_NOFILE, &limit);\nif (limit.rlim_cur < global.maxsock) {\n\tWarning(\"[%s.main()] FD limit (%d) too low for maxconn=%d/maxsock=%d. Please raise 'ulimit-n' to %d or more to avoid any trouble.\\n\",\n\t\targv[0], (int)limit.rlim_cur, global.maxconn, global.maxsock, global.maxsock);\n}\n```\n\nWe've just saw a little bit about resources limits related stuff in the userspace, now let's look at the same system calls in the Linux kernel.\n\nLimits on resource in the Linux kernel\n--------------------------------------------------------------------------------\n\nBoth implementation of `getrlimit` system call and `setrlimit` looks similar. Both they execute `do_prlimit` function that is core implementation of the `prlimit` system call and copy from/to given `rlimit` from/to userspace:\n\nThe `getrlimit`:\n\n```C\nSYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)\n{\n\tstruct rlimit value;\n\tint ret;\n\n\tret = do_prlimit(current, resource, NULL, &value);\n\tif (!ret)\n\t\tret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;\n\n\treturn ret;\n}\n```\n\nand `setrlimit`:\n\n```C\nSYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)\n{\n\tstruct rlimit new_rlim;\n\n\tif (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))\n\t\treturn -EFAULT;\n\treturn do_prlimit(current, resource, &new_rlim, NULL);\n}\n```\n\nImplementations of these system calls are defined in the [kernel/sys.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sys.c) kernel source code file.\n\nFirst of all the `do_prlimit` function executes a check that the given resource is valid:\n\n```C\nif (resource >= RLIM_NLIMITS)\n\treturn -EINVAL;\n```\n\nand in a failure case returns `-EINVAL` error. 
If this check passes and the new limits were passed as a non-`NULL` value, the following two checks:\n\n```C\nif (new_rlim) {\n\tif (new_rlim->rlim_cur > new_rlim->rlim_max)\n\t\treturn -EINVAL;\n\tif (resource == RLIMIT_NOFILE &&\n\t\t\tnew_rlim->rlim_max > sysctl_nr_open)\n\t\treturn -EPERM;\n}\n```\n\nverify that the given `soft` limit does not exceed the `hard` limit and, when the given resource is the maximum number of file descriptors, that the hard limit is not greater than the `sysctl_nr_open` value. The value of the `sysctl_nr_open` can be found via [procfs](https://en.wikipedia.org/wiki/Procfs):\n\n```\n~$ cat /proc/sys/fs/nr_open\n1048576\n```\n\nAfter all of these checks we lock `tasklist` to be sure that [signal](http://man7.org/linux/man-pages/man7/signal.7.html) handlers related things will not be destroyed while we are updating limits for a given resource:\n\n```C\nread_lock(&tasklist_lock);\n...\n...\n...\nread_unlock(&tasklist_lock);\n```\n\nWe need to do this because the `prlimit` system call allows us to update limits of another task by the given pid. As the task list is locked, we take the `rlimit` instance that is responsible for the given resource limit of the given process:\n\n```C\nrlim = tsk->signal->rlim + resource;\n```\n\nwhere the `tsk->signal->rlim` is just an array of `struct rlimit` that represents certain resources. And if the `new_rlim` is not `NULL` we just update its value. If `old_rlim` is not `NULL` we fill it:\n\n```C\nif (old_rlim)\n    *old_rlim = *rlim;\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the sixth part that describes implementation of the system calls in the Linux kernel. 
If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system calls](https://en.wikipedia.org/wiki/System_call)\n* [PID](https://en.wikipedia.org/wiki/Process_identifier)\n* [ulimit](https://www.gnu.org/software/bash/manual/html_node/Bash-Builtins.html#index-ulimit)\n* [strace](https://linux.die.net/man/1/strace)\n* [POSIX message queues](http://man7.org/linux/man-pages/man7/mq_overview.7.html)\n"
  },
  {
    "path": "Theory/README.md",
    "content": "# Theory\n\nThis chapter describes various theoretical concepts and concepts which are not directly related to practice but useful to know.\n\n* [Paging](linux-theory-1.md)\n* [Elf64 format](linux-theory-2.md)\n* [Inline assembly](linux-theory-3.md)\n"
  },
  {
    "path": "Theory/linux-theory-1.md",
    "content": "Paging\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nIn the fifth [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-5) of the series `Linux kernel booting process` we learned about what the kernel does in its earliest stage. In the next step the kernel will initialize different things like `initrd` mounting, lockdep initialization, and many many other things, before we can see how the kernel runs the first init process.\n\nYeah, there will be many different things, but many many and once again many work with **memory**.\n\nIn my view, memory management is one of the most complex parts of the Linux kernel and system programming in general. This is why we need to get acquainted with paging, before we proceed with the kernel initialization stuff.\n\n`Paging` is a mechanism that translates a linear memory address to a physical address. If you have read the previous parts of this book, you may remember that we saw segmentation in real mode when physical addresses are calculated by shifting a segment register by four and adding an offset. We also saw segmentation in protected mode, where we used the descriptor tables and base addresses from descriptors with offsets to calculate the physical addresses. Now we will see paging in 64-bit mode.\n\nAs the Intel manual says:\n\n> Paging provides a mechanism for implementing a conventional demand-paged, virtual-memory system where sections of a program’s execution environment are mapped into physical memory as needed.\n\nSo... In this post I will try to explain the theory behind paging. 
Of course it will be closely related to the `x86_64` version of the Linux kernel, but we will not go into too much details (at least in this post).\n\nEnabling paging\n--------------------------------------------------------------------------------\n\nThere are three paging modes:\n\n* 32-bit paging;\n* PAE paging;\n* IA-32e paging.\n\nWe will only explain the last mode here. To enable the `IA-32e paging` paging mode we need to do the following things:\n\n* set the `CR0.PG` bit;\n* set the `CR4.PAE` bit;\n* set the `IA32_EFER.LME` bit.\n\nWe already saw where those bits were set in [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S):\n\n```assembly\nmovl\t$(X86_CR0_PG | X86_CR0_PE), %eax\nmovl\t%eax, %cr0\n```\n\nand\n\n```assembly\nmovl\t$MSR_EFER, %ecx\nrdmsr\nbtsl\t$_EFER_LME, %eax\nwrmsr\n```\n\nPaging structures\n--------------------------------------------------------------------------------\n\nPaging divides the linear address space into fixed-size pages. Pages can be mapped into the physical address space or external storage. This fixed size is `4096` bytes for the `x86_64` Linux kernel. To perform the translation from linear address to physical address, special structures are used. Every structure is `4096` bytes and contains `512` entries (this only for `PAE` and `IA32_EFER.LME` modes). Paging structures are hierarchical and the Linux kernel uses 4 level of paging in the `x86_64` architecture. The CPU uses a part of linear addresses to identify the entry in another paging structure which is at the lower level, physical memory region (`page frame`) or physical address in this region (`page offset`). The address of the top level paging structure located in the `cr3` register. 
We have already seen this in [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S):\n\n```assembly\nleal\tpgtable(%ebx), %eax\nmovl\t%eax, %cr3\n```\n\nWe build the page table structures and put the address of the top-level structure in the `cr3` register. Here `cr3` is used to store the address of the top-level structure, the `PML4` or `Page Global Directory` as it is called in the Linux kernel. `cr3` is 64-bit register and has the following structure:\n\n```\n63                  52 51                                                        32\n --------------------------------------------------------------------------------\n|                     |                                                          |\n|    Reserved MBZ     |            Address of the top level structure            |\n|                     |                                                          |\n --------------------------------------------------------------------------------\n31                                  12 11            5     4     3 2             0\n --------------------------------------------------------------------------------\n|                                     |               |  P  |  P  |              |\n|  Address of the top level structure |   Reserved    |  C  |  W  |    Reserved  |\n|                                     |               |  D  |  T  |              |\n --------------------------------------------------------------------------------\n```\n\nThese fields have the following meanings:\n\n* Bits 63:52 - reserved must be 0.\n* Bits 51:12 - stores the address of the top level paging structure;\n* Bits 11: 5 - reserved must be 0;\n* Bits 4 : 3 - PWT or Page-Level Writethrough and PCD or Page-level cache disable indicate. 
These bits control the way the page or Page Table is handled by the hardware cache;\n* Bits 2 : 0 - ignored;\n\nThe linear address translation works as follows:\n\n* A given linear address arrives at the [MMU](http://en.wikipedia.org/wiki/Memory_management_unit) instead of the memory bus.\n* The 64-bit linear address is split into several parts. Only the low 48 bits are significant, which means that `2^48` bytes or 256 TBytes of linear-address space may be accessed at any given time.\n* The `cr3` register stores the address of the top-level (level-4) paging structure.\n* `47:39` bits of the given linear address store an index into the paging structure level-4, `38:30` bits store an index into the paging structure level-3, `29:21` bits store an index into the paging structure level-2, `20:12` bits store an index into the paging structure level-1 and `11:0` bits provide the offset into the physical page in bytes.\n\nSchematically, we can imagine it like this:\n\n![4-level paging](images/4_level_paging.png)\n\nEvery access to a linear address is either a supervisor-mode access or a user-mode access. This access is determined by the `CPL` (current privilege level). If `CPL < 3` it is a supervisor mode access level, otherwise it is a user mode access level. 
For example, the top level page table entry contains access bits and has the following structure (See [arch/x86/include/asm/pgtable_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/pgtable_types.h) for the bit offset definitions):\n\n```\n63  62                  52 51                                                    32\n --------------------------------------------------------------------------------\n| N |                     |                                                     |\n|   |     Available       |     Address of the paging structure on lower level  |\n| X |                     |                                                     |\n --------------------------------------------------------------------------------\n31                                              12 11  9 8 7 6 5   4   3 2 1     0\n --------------------------------------------------------------------------------\n|                                                |     | M |I| | P | P |U|W|    |\n| Address of the paging structure on lower level | AVL | B |G|A| C | W | | |  P |\n|                                                |     | Z |N| | D | T |S|R|    |\n --------------------------------------------------------------------------------\n```\n\nWhere:\n\n* 63 bit - N/X bit (No Execute Bit) which presents ability to execute the code from physical pages mapped by the table entry;\n* 62:52 bits - ignored by CPU, used by system software;\n* 51:12 bits - stores physical address of the lower level paging structure;\n* 11: 9 bits - ignored by CPU;\n* MBZ - must be zero bits;\n* Ignored bits;\n* A - accessed bit indicates was physical page or page structure accessed;\n* PWT and PCD used for cache;\n* U/S - user/supervisor bit controls user access to all the physical pages mapped by this table entry;\n* R/W - read/write bit controls read/write access to all the physical pages mapped by this table entry;\n* P - present bit. 
Current bit indicates was page table or physical page loaded into primary memory or not.\n\nOk, we know about the paging structures and their entries. Now let's see some details about 4-level paging in the Linux kernel.\n\nPaging structures in the Linux kernel\n--------------------------------------------------------------------------------\n\nAs we've seen, the Linux kernel in `x86_64` uses 4-level page tables. Their names are:\n\n* Page Global Directory\n* Page Upper  Directory\n* Page Middle Directory\n* Page Table Entry\n\nAfter you've compiled and installed the Linux kernel, you can see the `System.map` file which stores the virtual addresses of the functions that are used by the kernel. For example:\n\n```\n$ grep \"start_kernel\" System.map\nffffffff81efe497 T x86_64_start_kernel\nffffffff81efeaa2 T start_kernel\n```\n\nWe can see `0xffffffff81efe497` here. I doubt you really have that much RAM installed. But anyway, `start_kernel` and `x86_64_start_kernel` will be executed. The address space in `x86_64` is `2^64` wide, but it's too large, that's why a smaller address space is used, only 48-bits wide. So we have a situation where the physical address space is limited to 48 bits, but addressing still performs with 64 bit pointers. How is this problem solved? Look at this diagram:\n\n```\n0xffffffffffffffff  +-----------+\n                    |           |\n                    |           | Kernelspace\n                    |           |\n0xffff800000000000  +-----------+\n                    |           |\n                    |           |\n                    |   hole    |\n                    |           |\n                    |           |\n0x00007fffffffffff  +-----------+\n                    |           |\n                    |           |  Userspace\n                    |           |\n0x0000000000000000  +-----------+\n```\n\nThis solution is `sign extension`. Here we can see that the lower 48 bits of a virtual address can be used for addressing. 
Bits `63:48` can be either only zeroes or only ones. Note that the virtual address space is split into 2 parts:\n\n* Kernel space\n* Userspace\n\nUserspace occupies the lower part of the virtual address space, from `0x0000000000000000` to `0x00007fffffffffff` and kernel space occupies the highest part from `0xffff800000000000` to `0xffffffffffffffff`. Note that bits `63:47` are 0 for userspace and 1 for kernel space. All addresses which are in kernel space or in userspace, or in other words whose higher `63:48` bits are all zeroes or all ones, are called `canonical` addresses. There is a `non-canonical` area between these memory regions. Together these two memory regions (kernel space and user space) are exactly `2^48` bytes in size. We can find the virtual memory map with 4 level page tables in the [Documentation/x86/x86_64/mm.txt](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt):\n\n```\n0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm\nhole caused by [48:63] sign extension\nffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor\nffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory\nffffc80000000000 - ffffc8ffffffffff (=40 bits) hole\nffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space\nffffe90000000000 - ffffe9ffffffffff (=40 bits) hole\nffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)\n... unused hole ...\nffffec0000000000 - fffffc0000000000 (=44 bits) kasan shadow memory (16TB)\n... unused hole ...\nffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks\n... 
unused hole ...\nffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0\nffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space\nffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls\nffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole\n```\n\nWe can see here the memory map for user space, kernel space and the non-canonical area in-between them. The user space memory map is simple. Let's take a closer look at the kernel space. We can see that it starts from the guard hole which is reserved for the hypervisor. We can find the definition of this guard hole in [arch/x86/include/asm/page_64_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/page_64_types.h):\n\n```C\n#define __PAGE_OFFSET _AC(0xffff880000000000, UL)\n```\n\nPreviously this guard hole and `__PAGE_OFFSET` was from `0xffff800000000000` to `0xffff87ffffffffff` to prevent access to non-canonical area, but was later extended by 3 bits for the hypervisor.\n\nNext is the lowest usable address in kernel space - `ffff880000000000`. This virtual memory region is for direct mapping of all the physical memory. After the memory space which maps all the physical addresses, the guard hole. It needs to be between the direct mapping of all the physical memory and the vmalloc area. After the virtual memory map for the first terabyte and the unused hole after it, we can see the `kasan` shadow memory. It was added by [commit](https://github.com/torvalds/linux/commit/ef7f0d6a6ca8c9e4b27d78895af86c2fbfaeedb2) and provides the kernel address sanitizer. After the next unused hole we can see the `esp` fixup stacks (we will talk about it in other parts of this book) and the start of the kernel text mapping from the physical address - `0`. 
We can find the definition of this address in the same file as the `__PAGE_OFFSET`:\n\n```C\n#define __START_KERNEL_map      _AC(0xffffffff80000000, UL)\n```\n\nUsually kernel's `.text` starts here with the `CONFIG_PHYSICAL_START` offset. We have seen it in the post about [ELF64](https://github.com/0xAX/linux-insides/blob/master/Theory/ELF.md):\n\n```\nreadelf -s vmlinux | grep ffffffff81000000\n     1: ffffffff81000000     0 SECTION LOCAL  DEFAULT    1\n 65099: ffffffff81000000     0 NOTYPE  GLOBAL DEFAULT    1 _text\n 90766: ffffffff81000000     0 NOTYPE  GLOBAL DEFAULT    1 startup_64\n```\n\nHere I checked a `vmlinux` built with `CONFIG_PHYSICAL_START` set to `0x1000000`. So we have the start point of the kernel `.text` - `0xffffffff80000000` and offset - `0x1000000`, so the resulting virtual address will be `0xffffffff80000000 + 0x1000000 = 0xffffffff81000000`.\n\nAfter the kernel `.text` region there are the virtual memory regions for kernel modules and `vsyscalls`, and an unused hole of 2 megabytes.\n\nWe've seen how the virtual memory map in the kernel is laid out and how a virtual address is translated into a physical one. Let's take the following address as an example:\n\n```\n0xffffffff81000000\n```\n\nIn binary it will be:\n\n```\n1111111111111111 111111111 111111110 000001000 000000000 000000000000\n      63:48        47:39     38:30     29:21     20:12      11:0\n```\n\nThis virtual address is split in parts as described above:\n\n* `63:48` - bits not used;\n* `47:39` - bits store an index into the paging structure level-4;\n* `38:30` - bits store an index into the paging structure level-3;\n* `29:21` - bits store an index into the paging structure level-2;\n* `20:12` - bits store an index into the paging structure level-1;\n* `11:0`  - bits provide the offset into the physical page in bytes.\n\nThat is all. 
Now you know a little about theory of `paging` and we can go ahead in the kernel source code and see the first initialization steps.\n\nConclusion\n--------------------------------------------------------------------------------\n\nIt's the end of this short part about paging theory. Of course this post doesn't cover every detail of paging, but soon we'll see in practice how the Linux kernel builds paging structures and works with them.\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you've found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [Paging on Wikipedia](http://en.wikipedia.org/wiki/Paging)\n* [Intel 64 and IA-32 architectures software developer's manual volume 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html)\n* [MMU](http://en.wikipedia.org/wiki/Memory_management_unit)\n* [ELF64](https://github.com/0xAX/linux-insides/blob/master/Theory/ELF.md)\n* [Documentation/x86/x86_64/mm.txt](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt)\n* [Last part - Kernel booting process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-5)\n"
  },
  {
    "path": "Theory/linux-theory-2.md",
    "content": "Executable and Linkable Format\n================================================================================\n\nELF (Executable and Linkable Format) is a standard file format for executable files, object code, shared libraries and core dumps. Linux and many UNIX-like operating systems use this format. Let's look at the structure of the ELF-64 Object File Format and some definitions in the Linux kernel source code which related with it.\n\nAn ELF object file consists of the following parts:\n\n* ELF header - describes the main characteristics of the object file: type, CPU architecture, the virtual address of the entry point, the size and offset of the remaining parts, etc...;\n* Program header table - lists the available segments and their attributes. Program header table need loaders for placing sections of the file as virtual memory segments;\n* Section header table - contains the description of the sections.\n\nNow let's have a closer look on these components.\n\n**ELF header**\n\nThe ELF header is located at the beginning of the object file. Its main purpose is to locate all other parts of the object file. The file header contains the following fields:\n\n* ELF identification - array of bytes which helps identify the file as an ELF object file and also provides information about general object file characteristic;\n* Object file type - identifies the object file type. 
This field can describe that ELF file is a relocatable object file, an executable file, etc...;\n* Target architecture;\n* Version of the object file format;\n* Virtual address of the program entry point;\n* File offset of the program header table;\n* File offset of the section header table;\n* Size of an ELF header;\n* Size of a program header table entry;\n* and other fields...\n\nYou can find the `elf64_hdr` structure which presents ELF64 header in the Linux kernel source code:\n\n```C\ntypedef struct elf64_hdr {\n\tunsigned char\te_ident[EI_NIDENT];\n\tElf64_Half e_type;\n\tElf64_Half e_machine;\n\tElf64_Word e_version;\n\tElf64_Addr e_entry;\n\tElf64_Off e_phoff;\n\tElf64_Off e_shoff;\n\tElf64_Word e_flags;\n\tElf64_Half e_ehsize;\n\tElf64_Half e_phentsize;\n\tElf64_Half e_phnum;\n\tElf64_Half e_shentsize;\n\tElf64_Half e_shnum;\n\tElf64_Half e_shstrndx;\n} Elf64_Ehdr;\n```\n\nThis structure defined in the [elf.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/elf.h#L220)\n\n**Sections**\n\nAll data stores in a sections in an Elf object file. Sections identified by index in the section header table. 
Section header contains following fields:\n\n* Section name;\n* Section type;\n* Section attributes;\n* Virtual address in memory;\n* Offset in file;\n* Size of section;\n* Link to other section;\n* Miscellaneous information;\n* Address alignment boundary;\n* Size of entries, if section has table;\n\nAnd presented with the following `elf64_shdr` structure in the Linux kernel:\n\n```C\ntypedef struct elf64_shdr {\n\tElf64_Word sh_name;\n\tElf64_Word sh_type;\n\tElf64_Xword sh_flags;\n\tElf64_Addr sh_addr;\n\tElf64_Off sh_offset;\n\tElf64_Xword sh_size;\n\tElf64_Word sh_link;\n\tElf64_Word sh_info;\n\tElf64_Xword sh_addralign;\n\tElf64_Xword sh_entsize;\n} Elf64_Shdr;\n```\n\n[elf.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/elf.h#L312)\n\n**Program header table**\n\nAll sections are grouped into segments in an executable or shared object file. Program header is an array of structures which describe every segment. It looks like:\n\n```C\ntypedef struct elf64_phdr {\n\tElf64_Word p_type;\n\tElf64_Word p_flags;\n\tElf64_Off p_offset;\n\tElf64_Addr p_vaddr;\n\tElf64_Addr p_paddr;\n\tElf64_Xword p_filesz;\n\tElf64_Xword p_memsz;\n\tElf64_Xword p_align;\n} Elf64_Phdr;\n```\n\nin the Linux kernel source code.\n\n`elf64_phdr` defined in the same [elf.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/elf.h#L254).\n\nThe ELF object file also contains other fields/structures which you can find in the [Documentation](http://www.uclibc.org/docs/elf-64-gen.pdf). Now let's a look at the `vmlinux` ELF object.\n\nvmlinux\n--------------------------------------------------------------------------------\n\n`vmlinux` is also a relocatable ELF object file . We can take a look at it with the `readelf` utility. 
First of all let's look at the header:\n\n```\n$ readelf -h  vmlinux\nELF Header:\n  Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00\n  Class:                             ELF64\n  Data:                              2's complement, little endian\n  Version:                           1 (current)\n  OS/ABI:                            UNIX - System V\n  ABI Version:                       0\n  Type:                              EXEC (Executable file)\n  Machine:                           Advanced Micro Devices X86-64\n  Version:                           0x1\n  Entry point address:               0x1000000\n  Start of program headers:          64 (bytes into file)\n  Start of section headers:          381608416 (bytes into file)\n  Flags:                             0x0\n  Size of this header:               64 (bytes)\n  Size of program headers:           56 (bytes)\n  Number of program headers:         5\n  Size of section headers:           64 (bytes)\n  Number of section headers:         73\n  Section header string table index: 70\n```\n\nHere we can see that `vmlinux` is a 64-bit executable file.\n\nWe can read from the [Documentation/x86/x86_64/mm.txt](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt#L21):\n\n```\nffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0\n```\n\nWe can then look this address up in the `vmlinux` ELF object with:\n\n```\n$ readelf -s vmlinux | grep ffffffff81000000\n     1: ffffffff81000000     0 SECTION LOCAL  DEFAULT    1\n 65099: ffffffff81000000     0 NOTYPE  GLOBAL DEFAULT    1 _text\n 90766: ffffffff81000000     0 NOTYPE  GLOBAL DEFAULT    1 startup_64\n```\n\nNote that the address of the `startup_64` routine is not `ffffffff80000000`, but `ffffffff81000000` and now I'll explain why.\n\nWe can see following definition in the 
[arch/x86/kernel/vmlinux.lds.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/vmlinux.lds.S):\n\n```\n    . = __START_KERNEL;\n\t...\n\t...\n\t..\n\t/* Text and read-only data */\n\t.text :  AT(ADDR(.text) - LOAD_OFFSET) {\n\t\t_text = .;\n\t\t...\n\t\t...\n\t\t...\n\t}\n```\n\nWhere `__START_KERNEL` is:\n\n```\n#define __START_KERNEL\t\t(__START_KERNEL_map + __PHYSICAL_START)\n```\n\n`__START_KERNEL_map` is the value from the documentation - `ffffffff80000000` and `__PHYSICAL_START` is `0x1000000`. That's why address of the `startup_64` is `ffffffff81000000`.\n\nAnd at last we can get program headers from `vmlinux` with the following command:\n\n```\nreadelf -l vmlinux\n\nElf file type is EXEC (Executable file)\nEntry point 0x1000000\nThere are 5 program headers, starting at offset 64\n\nProgram Headers:\n  Type           Offset             VirtAddr           PhysAddr\n                 FileSiz            MemSiz              Flags  Align\n  LOAD           0x0000000000200000 0xffffffff81000000 0x0000000001000000\n                 0x0000000000cfd000 0x0000000000cfd000  R E    200000\n  LOAD           0x0000000001000000 0xffffffff81e00000 0x0000000001e00000\n                 0x0000000000100000 0x0000000000100000  RW     200000\n  LOAD           0x0000000001200000 0x0000000000000000 0x0000000001f00000\n                 0x0000000000014d98 0x0000000000014d98  RW     200000\n  LOAD           0x0000000001315000 0xffffffff81f15000 0x0000000001f15000\n                 0x000000000011d000 0x0000000000279000  RWE    200000\n  NOTE           0x0000000000b17284 0xffffffff81917284 0x0000000001917284\n                 0x0000000000000024 0x0000000000000024         4\n\n Section to Segment mapping:\n  Segment Sections...\n   00     .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw\n          .tracedata __ksymtab __ksymtab_gpl __kcrctab __kcrctab_gpl\n\t\t  __ksymtab_strings __param __modver\n   01     .data .vvar\n   
02     .data..percpu\n   03     .init.text .init.data .x86_cpu_dev.init .altinstructions\n          .altinstr_replacement .iommu_table .apicdrivers .exit.text\n\t\t  .smp_locks .data_nosave .bss .brk\n```\n\nHere we can see five segments with sections list. You can find all of these sections in the generated linker script at - `arch/x86/kernel/vmlinux.lds`.\n\nThat's all. Of course it's not a full description of ELF (Executable and Linkable Format), but if you want to know more, you can find the documentation - [here](http://www.uclibc.org/docs/elf-64-gen.pdf)\n"
  },
  {
    "path": "Theory/linux-theory-3.md",
    "content": "Inline assembly\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nWhile reading source code in the [Linux kernel](https://github.com/torvalds/linux), I often see statements like this:\n\n```C\n__asm__(\"andq %%rsp,%0; \":\"=r\" (ti) : \"0\" (CURRENT_MASK));\n```\n\nYes, this is [inline assembly](https://en.wikipedia.org/wiki/Inline_assembler) or in other words assembler code which is integrated in a high level programming language. In this case the high level programming language is [C](https://en.wikipedia.org/wiki/C_%28programming_language%29). Yes, the `C` programming language is not very high-level, but still.\n\nIf you are familiar with the [assembly](https://en.wikipedia.org/wiki/Assembly_language) programming language, you may notice that `inline assembly` is not very different from normal assembler. Moreover, the special form of inline assembly which is called `basic form` is exactly the same. For example:\n\n```C\n__asm__(\"movq %rax, %rsp\");\n```\n\nor:\n\n```C\n__asm__(\"hlt\");\n```\n\nThe same code (of course without `__asm__` prefix) you might see in plain assembly code. Yes, this is very similar, but not so simple as it might seem at first glance. Actually, the [GCC](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) supports two forms of inline assembly statements:\n\n* `basic`;\n* `extended`.\n\nThe basic form consists of only two things: the `__asm__` keyword and the string with valid assembler instructions. For example it may look something like this:\n\n```C\n__asm__(\"movq    $3, %rax\\t\\n\"\n        \"movq    %rsi, %rdi\");\n```\n\nThe `asm` keyword may be used in place of `__asm__`, however `__asm__` is portable whereas the `asm` keyword is a `GNU` [extension](https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html). 
In further examples I will only use the `__asm__` variant.\n\nIf you know assembly programming language this looks pretty familiar. The main problem is in the second form of inline assembly statements - `extended`. This form allows us to pass parameters to an assembly statement, perform [jumps](https://en.wikipedia.org/wiki/Branch_%28computer_science%29) etc. Does not sound difficult, but requires knowledge of special rules in addition to knowledge of the assembly language. Every time I see yet another piece of inline assembly code in the Linux kernel, I need to refer to the official [documentation](https://gcc.gnu.org/onlinedocs/) of `GCC` to remember how a particular `qualifier` behaves or what the meaning of `=&r` is for example.\n\nI've decided to write this part to consolidate my knowledge related to the inline assembly, as inline assembly statements are quite common in the Linux kernel and we may see them in [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) parts sometimes. I thought that it would be useful if we have a special part which contains information on more important aspects of the inline assembly. Of course you may find comprehensive information about inline assembly in the official [documentation](https://gcc.gnu.org/onlinedocs/gcc/Using-Assembly-Language-with-C.html#Using-Assembly-Language-with-C), but I like to put everything in one place.\n\n** Note: This part will not provide guide for assembly programming. It is not intended to teach you to write programs with assembler or to know what one or another assembler instruction means. Just a little memo for extended asm. **\n\nIntroduction to extended inline assembly\n--------------------------------------------------------------------------------\n\nSo, let's start. As I already mentioned above, the `basic` assembly statement consists of the `asm` or `__asm__` keyword and set of assembly instructions. This form is in no way different from \"normal\" assembly. 
The most interesting part is inline assembler with operands, or `extended` assembler. An extended assembly statement looks more complicated and consists of more than two parts:\n\n```assembly\n__asm__ [volatile] [goto] (AssemblerTemplate\n                           [ : OutputOperands ]\n                           [ : InputOperands  ]\n                           [ : Clobbers       ]\n                           [ : GotoLabels     ]);\n```\n\nAll parameters which are marked with squared brackets are optional. You may notice that if we skip the optional parameters and the modifiers `volatile` and `goto` we obtain the `basic` form.\n\nLet's start to consider this in order. The first optional `qualifier` is `volatile`. This specifier tells the compiler that an assembly statement may produce `side effects`. In this case we need to prevent compiler optimizations related to the given assembly statement. In simple terms the `volatile` specifier instructs the compiler not to modify the statement and place it exactly where it was in the original code. As an example let's look at the following function from the [Linux kernel](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h):\n\n```C\nstatic inline void native_load_gdt(const struct desc_ptr *dtr)\n{\n\tasm volatile(\"lgdt %0\"::\"m\" (*dtr));\n}\n```\n\nHere we see the `native_load_gdt` function which loads a base address from the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) to the `GDTR` register with the `lgdt` instruction. This assembly statement is marked with `volatile` qualifier. It is very important that the compiler does not change the original place of this assembly statement in the resulting code. Otherwise the `GDTR` register may contain wrong address for the `Global Descriptor Table` or the address may be correct, but the structure has not been filled yet. 
This can lead to an exception being generated, preventing the kernel from booting correctly.\n\nThe second optional `qualifier` is the `goto`. This qualifier tells the compiler that the given assembly statement may perform a jump to one of the labels which are listed in the `GotoLabels`. For example:\n\n```C\n__asm__ goto(\"jmp %l[label]\" : : : : label);\n```\n\nSince we finished with these two qualifiers, let's look at the main part of an assembly statement body. As we have seen above, the main part of an assembly statement consists of the following four parts:\n\n* set of assembly instructions;\n* output parameters;\n* input parameters;\n* clobbers.\n\nThe first represents a string which contains a set of valid assembly instructions which may be separated by the `\\t\\n` sequence. Names of processor [registers](https://en.wikipedia.org/wiki/Processor_register) must be prefixed with the `%%` sequence in `extended` form and other symbols like immediates must start with the `$` symbol. The `OutputOperands` and `InputOperands` are comma-separated lists of [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) variables which may be provided with \"constraints\" and the `Clobbers` is a list of registers or other values which are modified by the assembler instructions from the `AssemblerTemplate` beyond those listed in the `OutputOperands`. Before we dive into the examples we have to know a little bit about `constraints`. A constraint is a string which specifies placement of an operand. 
For example the value of an operand may be written to a processor register or read from memory etc.\n\nConsider the following simple example:\n\n```C\n#include <stdio.h>\n\nint main(void)\n{\n        unsigned long a = 5;\n        unsigned long b = 10;\n        unsigned long sum = 0;\n\n        __asm__(\"addq %1,%2\" : \"=r\" (sum) : \"r\" (a), \"0\" (b));\n        printf(\"a + b = %lu\\n\", sum);\n        return 0;\n}\n```\n\nLet's compile and run it to be sure that it works as expected:\n\n```\n$ gcc test.c -o test\n./test\na + b = 15\n```\n\nOk, great. It works. Now let's look at this example in detail. Here we see a simple `C` program which calculates the sum of two variables placing the result into the `sum` variable and in the end we print the result. This example consists of three parts. The first is the assembly statement with the [add](http://x86.renejeschke.de/html/file_module_x86_id_5.html) instruction. It adds the value of the source operand together with the value of the destination operand and stores the result in the destination operand. In our case:\n\n```assembly\naddq %1, %2\n```\n\nwill be expanded to the:\n\n```assembly\naddq a, b\n```\n\nVariables and expressions which are listed in the `OutputOperands` and `InputOperands` may be matched in the `AssemblerTemplate`. An input/output operand is designated as `%N` where the `N` is the number of operand from left to right beginning from `zero`. The second part of the our assembly statement is located after the first `:` symbol and contains the definition of the output value:\n\n```assembly\n\"=r\" (sum)\n```\n\nNotice that the `sum` is marked with two special symbols: `=r`. This is the first constraint that we have encountered. The actual constraint here is only `r` itself. The `=` symbol is `modifier` which denotes output value. This tells to compiler that the previous value will be discarded and replaced by the new data. 
Besides the `=` modifier, `GCC` provides support for following three modifiers:\n\n* `+` - an operand is read and written by an instruction;\n* `&` - output register shouldn't overlap an input register and should be used only for output;\n* `%` - tells the compiler that operands may be [commutative](https://en.wikipedia.org/wiki/Commutative_property).\n\nNow let's go back to the `r` qualifier. As I mentioned above, a qualifier denotes the placement of an operand. The `r` symbol means a value will be stored in one of the [general purpose register](https://en.wikipedia.org/wiki/Processor_register). The last part of our assembly statement:\n\n```assembly\n\"r\" (a), \"0\" (b)\n```\n\nThese are input operands - variables `a` and `b`. We already know what the `r` qualifier does. Now we can have a look at the constraint for the variable `b`. The `0` or any other digit from `1` to `9` is called \"matching constraint\". With this a single operand can be used for multiple roles. The value of the constraint is the source operand index. In our case `0` will match `sum`. If we look at assembly output of our program:\n\n```C\n0000000000400400 <main>:\n  ...\n  ...\n  ...\n  4004fe:       48 c7 45 f8 05 00 00    movq   $0x5,-0x8(%rbp)\n  400506:       48 c7 45 f0 0a 00 00    movq   $0xa,-0x10(%rbp)\n\n  400516:       48 8b 55 f8             mov    -0x8(%rbp),%rdx\n  40051a:       48 8b 45 f0             mov    -0x10(%rbp),%rax\n  40051e:       48 01 d0                add    %rdx,%rax\n```\n\nFirst of all our values `5` and `10` will be put at the stack and then these values will be moved to the two general purpose registers: `%rdx` and `%rax`.\n\nThis way the `%rax` register is used for storing the value of the `b` as well as storing the result of the calculation. **NOTE** that I've used `gcc 6.3.1` version, so the resulted code of your compiler may differ.\n\nWe have looked at input and output parameters of an inline assembly statement. 
Before we move on to other constraints supported by `gcc`, there is one remaining part of the inline assembly statement we have not discussed yet - `clobbers`.\n\nClobbers\n--------------------------------------------------------------------------------\n\nAs mentioned above, the \"clobbered\" part should contain a comma-separated list of registers whose content will be modified by the assembler code. This is useful if our assembly expression needs additional registers for calculation. If we add clobbered registers to the inline assembly statement, the compiler take this into account and the register in question will not simultaneously be used by the compiler.\n\nConsider the example from before, but we will add an additional, simple assembler instruction:\n\n```C\n__asm__(\"movq $100, %%rdx\\t\\n\"\n        \"addq %1,%2\" : \"=r\" (sum) : \"r\" (a), \"0\" (b));\n```\n\nIf we look at the assembly output:\n\n```C\n0000000000400400 <main>:\n  ...\n  ...\n  ...\n  4004fe:       48 c7 45 f8 05 00 00    movq   $0x5,-0x8(%rbp)\n  400506:       48 c7 45 f0 0a 00 00    movq   $0xa,-0x10(%rbp)\n\n  400516:       48 8b 55 f8             mov    -0x8(%rbp),%rdx\n  40051a:       48 8b 45 f0             mov    -0x10(%rbp),%rax\n\n  40051e:       48 c7 c2 64 00 00 00    mov    $0x64,%rdx\n  400525:       48 01 d0                add    %rdx,%rax\n```\n\nwe will see that the `%rdx` register is overwritten with `0x64` or `100` and the result will be `110` instead of `15`. 
Now if we add the `%rdx` register to the list of `clobbered` registers:\n\n```C\n__asm__(\"movq $100, %%rdx\\t\\n\"\n        \"addq %1,%2\" : \"=r\" (sum) : \"r\" (a), \"0\" (b) : \"%rdx\");\n```\n\nand look at the assembler output again:\n\n```C\n0000000000400400 <main>:\n  4004fe:       48 c7 45 f8 05 00 00    movq   $0x5,-0x8(%rbp)\n  400506:       48 c7 45 f0 0a 00 00    movq   $0xa,-0x10(%rbp)\n\n  400516:       48 8b 4d f8             mov    -0x8(%rbp),%rcx\n  40051a:       48 8b 45 f0             mov    -0x10(%rbp),%rax\n\n  40051e:       48 c7 c2 64 00 00 00    mov    $0x64,%rdx\n  400525:       48 01 c8                add    %rcx,%rax\n```\n\nthe `%rcx` register will be used for `sum` calculation, preserving the intended semantics of the program. Besides general purpose registers, we may pass two special specifiers. They are:\n\n* `cc`;\n* `memory`.\n\nThe first - `cc` indicates that an assembler code modifies [flags](https://en.wikipedia.org/wiki/FLAGS_register) register. This is typically used if the assembly within contains arithmetic or logic instructions:\n\n```C\n__asm__(\"incq %0\" ::\"\"(variable): \"cc\");\n```\n\nThe second `memory` specifier tells the compiler that the given inline assembly statement executes read/write operations on memory not specified by operands in the output list. This prevents the compiler from keeping memory values loaded and cached in registers. Let's take a look at the following example:\n\n```C\n#include <stdio.h>\n\nint main(void)\n{\n        unsigned long a[3] = {10000000000, 0, 1};\n        unsigned long b = 5;\n\n        __asm__ volatile(\"incq %0\" :: \"m\" (a[0]));\n\n        printf(\"a[0] - b = %lu\\n\", a[0] - b);\n        return 0;\n}\n```\n\nThis example may be artificial, but it illustrates the main idea. Here we have an array of integers and one integer variable. The example is pretty simple, we take the first element of `a` and increment its value. 
After this we subtract the value of `b` from the first element of `a`.
If we look at the assembly output again:\n\n```assembly\n00000000004004f6 <main>:\n  400404:       48 b8 00 e4 0b 54 02    movabs $0x2540be400,%rax\n  40040b:       00 00 00\n  40040e:       48 89 04 24             mov    %rax,(%rsp)\n  400412:       48 c7 44 24 08 00 00    movq   $0x0,0x8(%rsp)\n  400419:       00 00\n  40041b:       48 c7 44 24 10 01 00    movq   $0x1,0x10(%rsp)\n  400422:       00 00\n  400424:       48 ff 04 24             incq   (%rsp)\n  400428:       48 8b 04 24             mov    (%rsp),%rax\n  400431:       48 8d 70 fb             lea    -0x5(%rax),%rsi\n```\n\nwe will see one difference here which is in the last two lines:\n\n```assembly\n  400428:       48 8b 04 24             mov    (%rsp),%rax\n  400431:       48 8d 70 fb             lea    -0x5(%rax),%rsi\n```\n\nInstead of constant folding, `GCC` now preserves calculations in the assembly and places the value of `a[0]` in the `%rax` register afterwards. In the end it just subtracts the constant value of `b` from the `%rax` register and puts the result to the `%rsi`.\n\nBesides the `memory` specifier, we also see a new constraint here - `m`. This constraint tells the compiler to use the address of `a[0]`, instead of its value. So, now we are finished with `clobbers` and we may continue by looking at other constraints supported by `GCC` besides `r` and `m` which we have already seen.\n\nConstraints\n---------------------------------------------------------------------------------\n\nNow that we are finished with all three parts of an inline assembly statement, let's return to constraints. We already saw some constraints in the previous parts, like `r` which represents a `register` operand, `m` which represents a memory operand and `0-9` which represent a reused, indexed operand. Besides these `GCC` provides support for other constraints. 
For example the `i` constraint represents an `immediate` integer operand with known value:\n\n```C\n#include <stdio.h>\n\nint main(void)\n{\n        int a = 0;\n\n        __asm__(\"movl %1, %0\" : \"=r\"(a) : \"i\"(100));\n        printf(\"a = %d\\n\", a);\n        return 0;\n}\n```\n\nThe result is:\n\n```\n~$ gcc test.c -o test\n~$ ./test\na = 100\n```\n\nOr for example `I` which represents an immediate 32-bit integer. The difference between `i` and `I` is that `i` is general, whereas `I` is strictly specified to 32-bit integer data. For example if you try to compile the following code:\n\n```C\nunsigned long test_asm(int nr)\n{\n        unsigned long a = 0;\n\n        __asm__(\"movq %1, %0\" : \"=r\"(a) : \"I\"(0xffffffffffff));\n        return a;\n}\n```\n\nyou will get an error:\n\n```\n$ gcc -O3 test.c -o test\ntest.c: In function ‘test_asm’:\ntest.c:7:9: warning: asm operand 1 probably doesn’t match constraints\n         __asm__(\"movq %1, %0\" : \"=r\"(a) : \"I\"(0xffffffffffff));\n         ^\ntest.c:7:9: error: impossible constraint in ‘asm’\n```\n\nwhen at the same time:\n\n```C\nunsigned long test_asm(int nr)\n{\n        unsigned long a = 0;\n\n        __asm__(\"movq %1, %0\" : \"=r\"(a) : \"i\"(0xffffffffffff));\n        return a;\n}\n```\n\nworks perfectly:\n\n```\n~$ gcc -O3 test.c -o test\n~$ echo $?\n0\n```\n\n`GCC` also supports `J`, `K`, `N` constraints for integer constants in the range of 0-63 bits, signed 8-bit integer constants and unsigned 8-bit integer constants respectively. The `o` constraint represents a memory operand with an `offsetable` memory address. 
For example:\n\n```C\n#include <stdio.h>\n\nint main(void)\n{\n        static unsigned long arr[3] = {0, 1, 2};\n        static unsigned long element;\n\n        __asm__ volatile(\"movq 16+%1, %0\" : \"=r\"(element) : \"o\"(arr));\n        printf(\"%lu\\n\", element);\n        return 0;\n}\n```\n\nThe result, as expected:\n\n```\n~$ gcc -O3 test.c -o test\n~$ ./test\n2\n```\n\nAll of these constraints may be combined (so long as they do not conflict). In this case the compiler will choose the best one for a certain situation. For example:\n\n```C\nunsigned long a = 10;\nunsigned long b = 20;\n\nvoid main(void)\n{\n    __asm__ (\"movq %1,%0\" : \"=mr\"(b) : \"rm\"(a));\n}\n```\n\nwill use a memory operand:\n\n```assembly\nmain:\n        movq a(%rip),b(%rip)\n        ret\nb:\n        .quad   20\na:\n        .quad   10\n```\n\ninstead of direct usage of general purpose registers.\n\nThat's about all of the commonly used constraints in inline assembly statements. You can find more in the official [documentation](https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html#Simple-Constraints).\n\nArchitecture specific constraints\n--------------------------------------------------------------------------------\n\nBefore we finish, let's look at the set of special constraints. These constrains are architecture specific and as this book is specific to the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, we will look at constraints related to it. First of all the set of `a` ... `d` and also `S` and `D` constraints represent [generic purpose](https://en.wikipedia.org/wiki/Processor_register) registers. In this case the `a` constraint corresponds to `%al`, `%ax`, `%eax` or `%rax` register depending on instruction size. The `S` and `D` constraints are `%si` and `%di` registers respectively. For example let's take our previous example. We can see in its assembly output that value of the `a` variable is stored in the `%eax` register. 
Now let's look at the assembly output of the same assembly, but with another constraint:
  },
  {
    "path": "Timers/README.md",
    "content": "# Timers and time management\n\nThis chapter describes timers and time management related concepts in the Linux kernel.\n\n* [Introduction](linux-timers-1.md) - An introduction to the timers in the Linux kernel.\n* [Introduction to the clocksource framework](linux-timers-2.md) - Describes `clocksource` framework in the Linux kernel.\n* [The tick broadcast framework and dyntick](linux-timers-3.md) - Describes tick broadcast framework and dyntick concept.\n* [Introduction to timers](linux-timers-4.md) - Describes timers in the Linux kernel.\n* [Introduction to the clockevents framework](linux-timers-5.md) - Describes yet another clock/time management related framework : `clockevents`.\n* [x86 related clock sources](linux-timers-6.md) - Describes `x86_64` related clock sources.\n* [Time related system calls in the Linux kernel](linux-timers-7.md) - Describes time related system calls.\n"
  },
  {
    "path": "Timers/linux-timers-1.md",
    "content": "Timers and time management in the Linux kernel. Part 1.\n================================================================================\n\nIntroduction\n--------------------------------------------------------------------------------\n\nThis is yet another post that opens a new chapter in the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book. The previous [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4) described [system call](https://en.wikipedia.org/wiki/System_call) concepts, and now it's time to start new chapter. As one might understand from the title, this chapter will be devoted to the `timers` and `time management` in the Linux kernel. The choice of topic for the current chapter is not accidental. Timers (and generally, time management) are very important and widely used in the Linux kernel. The Linux kernel uses timers for various tasks, for example different timeouts in the [TCP](https://en.wikipedia.org/wiki/Transmission_Control_Protocol) implementation, the kernel knowing current time, scheduling asynchronous functions, next event interrupt scheduling and many many more.\n\nSo, we will start to learn implementation of the different time management related stuff in this part. We will see different types of timers and how different Linux kernel subsystems use them. As always, we will start from the earliest part of the Linux kernel and go through the initialization process of the Linux kernel. We already did it in the special [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) which describes the initialization process of the Linux kernel, but as you may remember we missed some things there. 
And one of them is the initialization of timers.\n\nLet's start.\n\nInitialization of non-standard PC hardware clock\n--------------------------------------------------------------------------------\n\nAfter the Linux kernel was decompressed (more about this you can read in the [Kernel decompression](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-5) part) the architecture non-specific code starts to work in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. After initialization of the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt), initialization of [cgroups](https://en.wikipedia.org/wiki/Cgroups) and setting [canary](https://en.wikipedia.org/wiki/Buffer_overflow_protection) value we can see the call of the `setup_arch` function.\n\nAs you may remember, this function (defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c#L842)) prepares/initializes architecture-specific stuff (for example it reserves a place for [bss](https://en.wikipedia.org/wiki/.bss) section, reserves a place for [initrd](https://en.wikipedia.org/wiki/Initrd), parses kernel command line, and many, many other things). Besides this, we can find some time management related functions there.\n\nThe first is:\n\n```C\nx86_init.timers.wallclock_init();\n```\n\nWe already saw `x86_init` structure in the chapter that describes initialization of the Linux kernel. This structure contains pointers to the default setup functions for the different platforms like [Intel MID](https://en.wikipedia.org/wiki/Mobile_Internet_device#Intel_MID_platforms), [Intel CE4100](http://www.wpgholdings.com/epaper/US/newsRelease_20091215/255874.pdf), etc. 
The `x86_init` structure is defined in the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c#L36), and as you can see it determines standard PC hardware by default.\n\nAs we can see, the `x86_init` structure has the `x86_init_ops` type that provides a set of functions for platform specific setup like reserving standard resources, platform specific memory setup, initialization of interrupt handlers, etc. This structure looks like:\n\n```C\nstruct x86_init_ops {\n\tstruct x86_init_resources       resources;\n    struct x86_init_mpparse         mpparse;\n    struct x86_init_irqs            irqs;\n    struct x86_init_oem             oem;\n    struct x86_init_paging          paging;\n    struct x86_init_timers          timers;\n    struct x86_init_iommu           iommu;\n    struct x86_init_pci             pci;\n};\n```\n\nNote the `timers` field that has the `x86_init_timers` type. We can understand by its name that this field is related to time management and timers. `x86_init_timers` contains four fields which are all pointers to functions that return [void](https://en.wikipedia.org/wiki/Void_type):\n\n* `setup_percpu_clockev` - set up the per cpu clock event device for the boot cpu;\n* `tsc_pre_init` - platform function called before [TSC](https://en.wikipedia.org/wiki/Time_Stamp_Counter) init;\n* `timer_init` - initialize the platform timer;\n* `wallclock_init` - initialize the wallclock device.\n\nSo, as we already know, in our case the `wallclock_init` executes initialization of the wallclock device. 
If we look on the `x86_init` structure, we see that `wallclock_init` points to the `x86_init_noop`:\n\n```C\nstruct x86_init_ops x86_init __initdata = {\n\t...\n\t...\n\t...\n\t.timers = {\n\t\t.wallclock_init\t\t    = x86_init_noop,\n\t},\n\t...\n\t...\n\t...\n}\n```\n\nWhere the `x86_init_noop` is just a function that does nothing:\n\n```C\nvoid __cpuinit x86_init_noop(void) { }\n```\n\nfor the standard PC hardware. Actually, the `wallclock_init` function is used in the [Intel MID](https://en.wikipedia.org/wiki/Mobile_Internet_device#Intel_MID_platforms) platform. Initialization of the `x86_init.timers.wallclock_init` is located in the [arch/x86/platform/intel-mid/intel-mid.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/platform/intel-mid/intel-mid.c) source code file in the `x86_intel_mid_early_setup` function:\n\n```C\nvoid __init x86_intel_mid_early_setup(void)\n{\n\t...\n\t...\n\t...\n\tx86_init.timers.wallclock_init = intel_mid_rtc_init;\n\t...\n\t...\n\t...\n}\n```\n\nImplementation of the `intel_mid_rtc_init` function is in the [arch/x86/platform/intel-mid/intel_mid_vrtc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/platform/intel-mid/intel_mid_vrtc.c) source code file and looks pretty simple. 
First of all, this function parses the [Simple Firmware Interface](https://en.wikipedia.org/wiki/Simple_Firmware_Interface) M-Real-Time-Clock table to collect such devices into the `sfi_mrtc_array` array, and then it sets the `get_wallclock` and `set_wallclock` platform callbacks:\n\n```C\nvoid __init intel_mid_rtc_init(void)\n{\n\tunsigned long vrtc_paddr;\n\n\tsfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);\n\n\tvrtc_paddr = sfi_mrtc_array[0].phys_addr;\n\tif (!sfi_mrtc_num || !vrtc_paddr)\n\t\treturn;\n\n\tvrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,\n\t\t\t\t\t\t\t\tvrtc_paddr);\n\n    x86_platform.get_wallclock = vrtc_get_time;\n\tx86_platform.set_wallclock = vrtc_set_mmss;\n}\n```\n\nThat's all, after this a device based on `Intel MID` will be able to get time from the hardware clock. As I already wrote, the standard PC [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture does not need a platform-specific `wallclock_init`: it points to `x86_init_noop`, which just does nothing when it is called. We just saw initialization of the [real time clock](https://en.wikipedia.org/wiki/Real-time_clock) for the [Intel MID](https://en.wikipedia.org/wiki/Mobile_Internet_device#Intel_MID_platforms) architecture, now it's time to return to the general `x86_64` architecture and look at the time management related stuff there.\n\nAcquainted with jiffies\n--------------------------------------------------------------------------------\n\nIf we return to the `setup_arch` function (which is located, as you remember, in the  [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c#L842) source code file), we see the next call of the time management related function:\n\n```C\nregister_refined_jiffies(CLOCK_TICK_RATE);\n```\n\nBefore we look at the implementation of this function, we must know about [jiffy](https://en.wikipedia.org/wiki/Jiffy_%28time%29). 
As we can read on wikipedia:\n\n```\nJiffy is an informal term for any unspecified short period of time\n```\n\nThis definition is very similar to the `jiffy` in the Linux kernel. There is global variable with the `jiffies` which holds the number of ticks that have occurred since the system booted. The Linux kernel sets this variable to zero:\n\n```C\nextern unsigned long volatile __jiffy_data jiffies;\n```\n\nduring initialization process. This global variable will be increased each time during timer interrupt. Besides this, near the `jiffies` variable we can see the definition of the similar variable\n\n```C\nextern u64 jiffies_64;\n```\n\nActually, only one of these variables is in use in the Linux kernel, and it depends on the processor type. For the [x86_64](https://en.wikipedia.org/wiki/X86-64) it will be `u64` use and for the [x86](https://en.wikipedia.org/wiki/X86) it's `unsigned long`. We see this looking at the [arch/x86/kernel/vmlinux.lds.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/vmlinux.lds.S) linker script:\n\n```\n#ifdef CONFIG_X86_32\n...\njiffies = jiffies_64;\n...\n#else\n...\njiffies_64 = jiffies;\n...\n#endif\n```\n\nIn the case of `x86_32` the `jiffies` will be the lower `32` bits of the `jiffies_64` variable. Schematically, we can imagine it as follows\n\n```\n                    jiffies_64\n+-----------------------------------------------------+\n|                       |                             |\n|                       |                             |\n|                       |       jiffies on `x86_32`   |\n|                       |                             |\n|                       |                             |\n+-----------------------------------------------------+\n63                     31                             0\n```\n\nNow we know a little theory about `jiffies` and can return to our function. 
There is no architecture-specific implementation for our function - the `register_refined_jiffies`. This function is located in the generic kernel code - [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file. Main point of the `register_refined_jiffies` is registration of the jiffy `clocksource`. Before we look on the implementation of the `register_refined_jiffies` function, we must know what `clocksource` is. As we can read in the comments:\n\n```\nThe `clocksource` is hardware abstraction for a free-running counter.\n```\n\nI'm not sure about you, but that description didn't give a good understanding about the `clocksource` concept. Let's try to understand what is it, but we will not go deeper because this topic will be described in a separate part in much more detail. The main point of the `clocksource` is timekeeping abstraction or in very simple words - it provides a time value to the kernel. We already know about the `jiffies` interface that represents number of ticks that have occurred since the system booted. It is represented by a global variable in the Linux kernel and increases each timer interrupt. The Linux kernel can use `jiffies` for time measurement. So why do we need in separate context like the `clocksource`? Actually, different hardware devices provide different clock sources that are varied in their capabilities. The availability of more precise techniques for time intervals measurement is hardware-dependent.\n\nFor example `x86` has on-chip a 64-bit counter that is called [Time Stamp  Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) and its frequency can be equal to processor frequency. Or for example the [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer), that consists of a `64-bit` counter of at least `10 MHz` frequency. Two different timers and they are both for `x86`. 
If we will add timers from other architectures, this only makes this problem more complex. The Linux kernel provides the `clocksource` concept to solve the problem.\n\nThe clocksource concept is represented by the `clocksource` structure in the Linux kernel. This structure is defined in the [include/linux/clocksource.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clocksource.h) header file and contains a couple of fields that describe a time counter. For example, it contains - `name` field which is the name of a counter, `flags` field that describes different properties of a counter, pointers to the `suspend` and `resume` functions, and many more.\n\nLet's look at the `clocksource` structure for jiffies that is defined in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file:\n\n```C\nstatic struct clocksource clocksource_jiffies = {\n\t.name\t\t= \"jiffies\",\n\t.rating\t\t= 1,\n\t.read\t\t= jiffies_read,\n\t.mask\t\t= 0xffffffff,\n\t.mult\t\t= NSEC_PER_JIFFY << JIFFIES_SHIFT,\n\t.shift\t\t= JIFFIES_SHIFT,\n\t.max_cycles\t= 10,\n};\n```\n\nWe can see the definition of the default name here - `jiffies`. The next is the `rating` field, which allows the best registered clock source to be chosen by the clock source management code available for the specified hardware. The `rating` may have following value:\n\n* `1-99`    - Only available for bootup and testing purposes;\n* `100-199` - Functional for real use, but not desired.\n* `200-299` - A correct and usable clocksource.\n* `300-399` - A reasonably fast and accurate clocksource.\n* `400-499` - The ideal clocksource. A must-use where available;\n\nFor example, rating of the [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) is `300`, but rating of the [high precision event timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) is `250`. 
The next field is `read` - it is pointer to the function that allows it to read clocksource's cycle value; or in other words, it just returns `jiffies` variable with `cycle_t` type:\n\n```C\nstatic cycle_t jiffies_read(struct clocksource *cs)\n{\n        return (cycle_t) jiffies;\n}\n```\n\nthat is just 64-bit unsigned type:\n\n```C\ntypedef u64 cycle_t;\n```\n\nThe next field is the `mask` value, which ensures that subtraction between counters values from non `64 bit` counters do not need special overflow logic. In our case the mask is `0xffffffff` and it is `32` bits. This means that `jiffy` wraps around to zero after `42` seconds:\n\n```python\n>>> 0xffffffff\n4294967295\n# 42 nanoseconds\n>>> 42 * pow(10, -9)\n4.2000000000000006e-08\n# 43 nanoseconds\n>>> 43 * pow(10, -9)\n4.3e-08\n```\n\nThe next two fields `mult` and `shift` are used to convert the clocksource's period to nanoseconds per cycle. When the kernel calls the `clocksource.read` function, this function returns a value in `machine` time units represented with `cycle_t` data type that we saw just now. To convert this return value to [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond) we need these two fields: `mult` and `shift`. The `clocksource` provides the `clocksource_cyc2ns` function that will do it for us with the following expression:\n\n```C\n((u64) cycles * mult) >> shift;\n```\n\nAs we can see the `mult` field is equal:\n\n```C\nNSEC_PER_JIFFY << JIFFIES_SHIFT\n\n#define NSEC_PER_JIFFY  ((NSEC_PER_SEC+HZ/2)/HZ)\n#define NSEC_PER_SEC    1000000000L\n```\n\nby default, and the `shift` is\n\n```C\n#if HZ < 34\n  #define JIFFIES_SHIFT   6\n#elif HZ < 67\n  #define JIFFIES_SHIFT   7\n#else\n  #define JIFFIES_SHIFT   8\n#endif\n```\n\nThe `jiffies` clock source uses the `NSEC_PER_JIFFY` multiplier conversion to specify the nanosecond over cycle ratio. Note that values of the  `JIFFIES_SHIFT` and `NSEC_PER_JIFFY` depend on `HZ` value. The `HZ` represents the frequency of the system timer. 
This macro defined in the [include/asm-generic/param.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/asm-generic/param.h) and depends on the `CONFIG_HZ` kernel configuration option. The value of `HZ` differs for each supported architecture, but for `x86` it's defined like:\n\n```C\n#define HZ\t\tCONFIG_HZ\n```\n\nWhere `CONFIG_HZ` can be one of the following values:\n\n![HZ](images/HZ.png)\n\nThis means that in our case the timer interrupt frequency is `250 HZ` or occurs `250` times per second or one timer interrupt each `4ms`.\n\nThe last field that we can see in the definition of the `clocksource_jiffies` structure is the - `max_cycles` that holds the maximum cycle value that can safely be multiplied without potentially causing an overflow.\n\nOk, we just saw definition of the `clocksource_jiffies` structure, also we know a little about `jiffies` and `clocksource`, now it is time to get back to the implementation of the our function. In the beginning of this part we have stopped on the call of the:\n\n```C\nregister_refined_jiffies(CLOCK_TICK_RATE);\n```\n\nfunction from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c#L842) source code file.\n\nAs I already wrote, the main purpose of the `register_refined_jiffies` function is to register `refined_jiffies` clocksource. We already saw the `clocksource_jiffies` structure represents standard `jiffies` clock source. Now, if you look in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file, you will find yet another clock source definition:\n\n```C\nstruct clocksource refined_jiffies;\n```\n\nThere is one difference between `refined_jiffies` and `clocksource_jiffies`: The standard `jiffies` based clock source is the lowest common denominator clock source which should function on all systems. 
As we already know, the `jiffies` global variable will be increased during each timer interrupt. This means that the standard `jiffies` based clock source has the same resolution as the timer interrupt frequency. From this we can understand that the standard `jiffies` based clock source may suffer from inaccuracies. The `refined_jiffies` uses `CLOCK_TICK_RATE` as the base of `jiffies` shift.\n\nLet's look at the implementation of this function. First of all, we can see that the `refined_jiffies` clock source is based on the `clocksource_jiffies` structure:\n\n```C\nint register_refined_jiffies(long cycles_per_second)\n{\n\tu64 nsec_per_tick, shift_hz;\n\tlong cycles_per_tick;\n\n\trefined_jiffies = clocksource_jiffies;\n\trefined_jiffies.name = \"refined-jiffies\";\n\trefined_jiffies.rating++;\n\t...\n\t...\n\t...\n```\n\nHere we can see that we update the name of the `refined_jiffies` to `refined-jiffies` and increase the rating of this structure. As you remember, the `clocksource_jiffies` has rating - `1`, so our `refined_jiffies` clocksource will have rating - `2`. This means that the `refined_jiffies` will be the best selection for clock source management code.\n\nIn the next step we need to calculate the number of cycles per tick:\n\n```C\ncycles_per_tick = (cycles_per_second + HZ/2)/HZ;\n```\n\nNote that we have used the `NSEC_PER_SEC` macro as the base of the standard `jiffies` multiplier. Here we are using the `cycles_per_second` which is the first parameter of the `register_refined_jiffies` function. We've passed the `CLOCK_TICK_RATE` macro to the `register_refined_jiffies` function. 
This macro is defined in the [arch/x86/include/asm/timex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/timex.h) header file and expands to:\n\n```C\n#define CLOCK_TICK_RATE         PIT_TICK_RATE\n```\n\nwhere the `PIT_TICK_RATE` macro expands to the frequency of the [Intel 8253](https://en.wikipedia.org/wiki/Intel_8253) programmable interval timer:\n\n```C\n#define PIT_TICK_RATE 1193182ul\n```\n\nAfter this we calculate `shift_hz` for the `register_refined_jiffies` that will store `hz << 8`, or in other words the frequency of the system timer shifted left by `8` bits. We shift the `cycles_per_second` (the frequency of the programmable interval timer) left by `8` bits in order to get extra accuracy:\n\n```C\nshift_hz = (u64)cycles_per_second << 8;\nshift_hz += cycles_per_tick/2;\ndo_div(shift_hz, cycles_per_tick);\n```\n\nIn the next step we calculate the number of nanoseconds per tick by shifting `NSEC_PER_SEC` left by `8` bits too, as we did with `shift_hz`, and do the same calculation as before:\n\n```C\nnsec_per_tick = (u64)NSEC_PER_SEC << 8;\nnsec_per_tick += (u32)shift_hz/2;\ndo_div(nsec_per_tick, (u32)shift_hz);\n```\n\nThe result is shifted left by `JIFFIES_SHIFT` and stored as the multiplier of the `refined_jiffies` clock source:\n\n```C\nrefined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;\n```\n\nAt the end of the `register_refined_jiffies` function we register the new clock source with the `__clocksource_register` function that is defined in the [include/linux/clocksource.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clocksource.h) header file and return:\n\n```C\n__clocksource_register(&refined_jiffies);\nreturn 0;\n```\n\nThe clock source management code provides the API for clock source registration and selection. As we can see, clock sources are registered by calling the `__clocksource_register` function during kernel initialization or from a kernel module. 
During registration, the clock source management code will choose the best clock source available in the system using the `clocksource.rating` field which we already saw when we initialized `clocksource` structure for `jiffies`.\n\nUsing the jiffies\n--------------------------------------------------------------------------------\n\nWe just saw initialization of two `jiffies` based clock sources in the previous paragraph:\n\n* standard `jiffies` based clock source;\n* refined  `jiffies` based clock source;\n\nDon't worry if you don't understand the calculations here. They look frightening at first. Soon, step by step we will learn these things. So, we just saw initialization of `jiffies` based clock sources and also we know that the Linux kernel has the global variable `jiffies` that holds the number of ticks that have occurred since the kernel started to work. Now, let's look how to use it. To use `jiffies` we just can use the `jiffies` global variable by its name or with the call of the `get_jiffies_64` function. This function defined in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file and just returns full `64-bit` value of the `jiffies`:\n\n```C\nu64 get_jiffies_64(void)\n{\n\tunsigned long seq;\n\tu64 ret;\n\n\tdo {\n\t\tseq = read_seqbegin(&jiffies_lock);\n\t\tret = jiffies_64;\n\t} while (read_seqretry(&jiffies_lock, seq));\n\treturn ret;\n}\nEXPORT_SYMBOL(get_jiffies_64);\n```\n\nNote that the `get_jiffies_64` function is not implemented as `jiffies_read` for example:\n\n```C\nstatic cycle_t jiffies_read(struct clocksource *cs)\n{\n\treturn (cycle_t) jiffies;\n}\n```\n\nWe can see that implementation of the `get_jiffies_64` is more complex. The reading of the `jiffies_64` variable is implemented using [seqlocks](https://en.wikipedia.org/wiki/Seqlock). 
Actually this is done for machines that cannot atomically read the full 64-bit values.\n\nIf we can access the `jiffies` or the `jiffies_64` variable we can convert it to `human` time units. To get the number of seconds that have elapsed, we can use the following expression:\n\n```C\njiffies / HZ\n```\n\nSo, if we know this, we can get any time units. For example:\n\n```C\n/* Thirty seconds from now */\njiffies + 30*HZ\n\n/* Two minutes from now */\njiffies + 120*HZ\n\n/* One millisecond from now */\njiffies + HZ / 1000\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis concludes the first part covering time and time management related concepts in the Linux kernel. We first met two concepts and their initialization: `jiffies` and `clocksource`. In the next part we will continue to dive into this interesting theme, and as I already wrote in this part, we will try to understand the insides of these and other time management concepts in the Linux kernel.\n\nIf you have questions or suggestions, feel free to ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you find any mistakes, please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [TCP](https://en.wikipedia.org/wiki/Transmission_Control_Protocol)\n* [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt)\n* [cgroups](https://en.wikipedia.org/wiki/Cgroups)\n* [bss](https://en.wikipedia.org/wiki/.bss)\n* [initrd](https://en.wikipedia.org/wiki/Initrd)\n* [Intel MID](https://en.wikipedia.org/wiki/Mobile_Internet_device#Intel_MID_platforms)\n* [TSC](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n* [void](https://en.wikipedia.org/wiki/Void_type)\n* [Simple Firmware Interface](https://en.wikipedia.org/wiki/Simple_Firmware_Interface)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [real time clock](https://en.wikipedia.org/wiki/Real-time_clock)\n* [Jiffy](https://en.wikipedia.org/wiki/Jiffy_%28time%29)\n* [high precision event timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer)\n* [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond)\n* [Intel 8253](https://en.wikipedia.org/wiki/Intel_8253)\n* [seqlocks](https://en.wikipedia.org/wiki/Seqlock)\n* [clocksource documentation](https://www.kernel.org/doc/Documentation/timers/timekeeping.txt)\n* [Previous chapter](https://0xax.gitbook.io/linux-insides/summary/syscall)\n"
  },
  {
    "path": "Timers/linux-timers-2.md",
    "content": "Timers and time management in the Linux kernel. Part 2.\n================================================================================\n\nIntroduction to the `clocksource` framework\n--------------------------------------------------------------------------------\n\nThe previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) was the first part in the current [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) that describes timers and time management related stuff in the Linux kernel. We got acquainted with two concepts in the previous part:\n\n  * `jiffies`\n  * `clocksource`\n\nThe first is the global variable that is defined in the [include/linux/jiffies.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/jiffies.h) header file and represents the counter that is increased during each timer interrupt. So if we can access this global variable and we know the timer interrupt rate we can convert `jiffies` to the human time units. As we already know the timer interrupt rate represented by the compile-time constant that is called `HZ` in the Linux kernel. The value of `HZ` is equal to the value of the `CONFIG_HZ` kernel configuration option and if we will look into the [arch/x86/configs/x86_64_defconfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/configs/x86_64_defconfig) kernel configuration file, we will see that:\n\n```\nCONFIG_HZ_1000=y\n```\n\nkernel configuration option is set. This means that value of `CONFIG_HZ` will be `1000` by default for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. So, if we divide the value of `jiffies` by the value of `HZ`:\n\n```\njiffies / HZ\n```\n\nwe will get the amount of seconds that elapsed since the beginning of the moment the Linux kernel started to work or in other words we will get the system [uptime](https://en.wikipedia.org/wiki/Uptime). 
Since `HZ` represents the amount of timer interrupts in a second, we can set a value for some time in the future. For example:\n\n```C\n/* one minute from now */\nunsigned long later = jiffies + 60*HZ;\n\n/* five minutes from now */\nunsigned long later = jiffies + 5*60*HZ;\n```\n\nThis is a very common practice in the Linux kernel. For example, if you will look into the [arch/x86/kernel/smpboot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/smpboot.c) source code file, you will find the `do_boot_cpu` function. This function boots all processors besides bootstrap processor. You can find a snippet that waits ten seconds for a response from the application processor:\n\n```C\nif (!boot_error) {\n\ttimeout = jiffies + 10*HZ;\n\twhile (time_before(jiffies, timeout)) {\n\t\t...\n\t\t...\n\t\t...\n\t\tudelay(100);\n\t}\n\t...\n\t...\n\t...\n}\n```\n\nWe assign `jiffies + 10*HZ` value to the `timeout` variable here. As I think you already understood, this means a ten seconds timeout. 
After this we are entering a loop where we use the `time_before` macro to compare the current `jiffies` value and our timeout.\n\nOr for example if we look into the [sound/isa/sscape.c](https://github.com/torvalds/linux/blob/master/sound/isa/sscape.c) source code file which represents the driver for the [Ensoniq Soundscape Elite](https://en.wikipedia.org/wiki/Ensoniq_Soundscape_Elite) sound card, we will see the `obp_startup_ack` function that waits up to a given timeout for the On-Board Processor to return its start-up acknowledgement sequence:\n\n```C\nstatic int obp_startup_ack(struct soundscape *s, unsigned timeout)\n{\n\tunsigned long end_time = jiffies + msecs_to_jiffies(timeout);\n\n\tdo {\n\t\t...\n\t\t...\n\t\t...\n\t\tx = host_read_unsafe(s->io_base);\n\t\t...\n\t\t...\n\t\t...\n\t\tif (x == 0xfe || x == 0xff)\n\t\t\treturn 1;\n\t\tmsleep(10);\n\t} while (time_before(jiffies, end_time));\n\n\treturn 0;\n}\n```\n\nAs you can see, the `jiffies` variable is very widely used in the Linux kernel [code](http://lxr.free-electrons.com/ident?i=jiffies). As I already wrote, we met yet another new time management related concept in the previous part - `clocksource`. We have only seen a short description of this concept and the API for a `clocksource` registration. Let's take a closer look in this part.\n\nIntroduction to `clocksource`\n--------------------------------------------------------------------------------\n\nThe `clocksource` concept represents the generic API for clock sources management in the Linux kernel. Why do we need a separate framework for this? Let's go back to the beginning. The `time` concept is the fundamental concept in the Linux kernel and other operating system kernels. And the timekeeping is one of the necessities to use this concept. For example Linux kernel must know and update the time elapsed since system startup, it must determine how long the current process has been running for every processor and many many more. 
Where the Linux kernel can get information about time? First of all it is Real Time Clock or [RTC](https://en.wikipedia.org/wiki/Real-time_clock) that represents the nonvolatile device. You can find a set of architecture-independent real time clock drivers in the Linux kernel in the [drivers/rtc](https://github.com/torvalds/linux/tree/master/drivers/rtc) directory. Besides this, each architecture can provide a driver for the architecture-dependent real time clock, for example - `CMOS/RTC` - [arch/x86/kernel/rtc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/rtc.c) for the [x86](https://en.wikipedia.org/wiki/X86) architecture. The second is system timer - timer that excites [interrupts](https://en.wikipedia.org/wiki/Interrupt) with a periodic rate. For example, for [IBM PC](https://en.wikipedia.org/wiki/IBM_Personal_Computer) compatibles it was - [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer).\n\nWe already know that for timekeeping purposes we can use `jiffies` in the Linux kernel. The `jiffies` can be considered as read only global variable which is updated with `HZ` frequency. We know that the `HZ` is a compile-time kernel parameter whose reasonable range is from `100` to `1000` [Hz](https://en.wikipedia.org/wiki/Hertz). So, it is guaranteed to have an interface for time measurement  with `1` - `10` milliseconds resolution. Besides standard `jiffies`, we saw the `refined_jiffies` clock source in the previous part that is based on the `i8253/i8254` [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer) tick rate which is almost `1193182` hertz. So we can get something about `1` microsecond resolution with the `refined_jiffies`. 
Nowadays, [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond) are the favorite choice for the time value units of the given `clocksource`.\n\nThe availability of more precise techniques for time intervals measurement is hardware-dependent. We have just learned a little about `x86`-specific timer hardware. But each architecture provides its own timer(s) hardware. Earlier, each architecture had its own implementation for this purpose. The solution to this problem is an abstraction layer and an associated API in a common code framework for managing various clock sources, independent of the timer interrupt. This common code framework became the `clocksource` framework.\n\nThe generic timeofday and `clocksource` management framework moved a lot of timekeeping code into the architecture independent portion of the code, with the architecture-dependent portion reduced to defining and managing low-level hardware pieces of clocksources. It takes a large amount of effort to measure the time interval on different architectures with different hardware, and it is very complex. The implementation of each clock related service is strongly associated with an individual hardware device and, as you can understand, it results in similar implementations for different architectures.\n\nWithin this framework, each clock source is required to maintain a representation of time as a monotonically increasing value. As we can see in the Linux kernel code, nanoseconds are the favorite choice for the time value units of a clock source at this time. 
One of the main point of the clock source framework is to allow a user to select clock source among a range of available hardware devices supporting clock functions when configuring the system and selecting, accessing and scaling different clock sources.\n\nThe `clocksource` structure\n--------------------------------------------------------------------------------\n\nThe fundamental of the `clocksource` framework is the `clocksource` structure that defined in the [include/linux/clocksource.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clocksource.h) header file. We already saw some fields that are provided by the `clocksource` structure in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1). Let's look on the full definition of this structure and try to describe all of its fields:\n\n```C\nstruct clocksource {\n\tcycle_t (*read)(struct clocksource *cs);\n\tcycle_t mask;\n\tu32 mult;\n\tu32 shift;\n\tu64 max_idle_ns;\n\tu32 maxadj;\n#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA\n\tstruct arch_clocksource_data archdata;\n#endif\n\tu64 max_cycles;\n\tconst char *name;\n\tstruct list_head list;\n\tint rating;\n\tint (*enable)(struct clocksource *cs);\n\tvoid (*disable)(struct clocksource *cs);\n\tunsigned long flags;\n\tvoid (*suspend)(struct clocksource *cs);\n\tvoid (*resume)(struct clocksource *cs);\n#ifdef CONFIG_CLOCKSOURCE_WATCHDOG\n\tstruct list_head wd_list;\n\tcycle_t cs_last;\n\tcycle_t wd_last;\n#endif\n\tstruct module *owner;\n} ____cacheline_aligned;\n```\n\nWe already saw the first field of the `clocksource` structure in the previous part - it is a pointer to the `read` function that returns best counter selected by the clocksource framework. 
For example we use `jiffies_read` function to read `jiffies` value:\n\n```C\nstatic struct clocksource clocksource_jiffies = {\n\t...\n\t.read\t\t= jiffies_read,\n\t...\n}\n```\n\nwhere `jiffies_read` just returns:\n\n```C\nstatic cycle_t jiffies_read(struct clocksource *cs)\n{\n\treturn (cycle_t) jiffies;\n}\n```\n\nOr the `read_tsc` function:\n\n```C\nstatic struct clocksource clocksource_tsc = {\n\t...\n    .read                   = read_tsc,\n\t...\n};\n```\n\nfor the [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) reading.\n\nThe next field is `mask` that allows to ensure that subtraction between counters values from non `64 bit` counters do not need special overflow logic. After the `mask` field, we can see two fields: `mult` and `shift`. These are the fields that are base of mathematical functions that are provide ability to convert time values specific to each clock source. In other words these two fields help us to convert an abstract machine time units of a counter to nanoseconds.\n\nAfter these two fields we can see the `64` bits `max_idle_ns` field represents max idle time permitted by the clocksource in nanoseconds. We need in this field for the Linux kernel with enabled `CONFIG_NO_HZ` kernel configuration option. This kernel configuration option enables the Linux kernel to run without a regular timer tick (we will see full explanation of this in other part). The problem that dynamic tick allows the kernel to sleep for periods longer than a single tick, moreover sleep time could be unlimited. The `max_idle_ns` field represents this sleeping limit.\n\nThe next field after the `max_idle_ns` is the `maxadj` field which is the maximum adjustment value to `mult`. The main formula by which we convert cycles to the nanoseconds:\n\n```C\n((u64) cycles * mult) >> shift;\n```\n\nis not `100%` accurate. 
Instead the number is taken as close as possible to a nanosecond and `maxadj` helps to correct this and allows clocksource API to avoid `mult` values that might overflow when adjusted. The next four fields are pointers to the function:\n\n* `enable` - optional function to enable clocksource;\n* `disable` - optional function to disable clocksource;\n* `suspend` - suspend function for the clocksource;\n* `resume` - resume function for the clocksource;\n\nThe next field is the `max_cycles` and as we can understand from its name, this field represents maximum cycle value before potential overflow. And the last field is `owner` represents reference to a kernel [module](https://en.wikipedia.org/wiki/Loadable_kernel_module) that is owner of a clocksource. This is all. We just went through all the standard fields of the `clocksource` structure. But you might have noted that we missed some fields of the `clocksource` structure. We can divide all of missed field on two types: Fields of the first type are already known for us. For example, they are `name` field that represents name of a `clocksource`, the `rating` field that helps to the Linux kernel to select the best clocksource and etc. The second type, fields which are dependent from the different Linux kernel configuration options. Let's look on these fields.\n\nThe first field is the `archdata`. This field has `arch_clocksource_data` type and depends on the `CONFIG_ARCH_CLOCKSOURCE_DATA` kernel configuration option. This field is actual only for the [x86](https://en.wikipedia.org/wiki/X86) and [IA64](https://en.wikipedia.org/wiki/IA-64) architectures for this moment. And again, as we can understand from the field's name, it represents architecture-specific data for a clock source. For example, it represents `vDSO` clock mode:\n\n```C\nstruct arch_clocksource_data {\n    int vclock_mode;\n};\n```\n\nfor the `x86` architectures. 
Where the `vDSO` clock mode can be one of the:\n\n```C\n#define VCLOCK_NONE 0\n#define VCLOCK_TSC  1\n#define VCLOCK_HPET 2\n#define VCLOCK_PVCLOCK 3\n```\n\nThe last three fields, `wd_list`, `cs_last` and `wd_last`, depend on the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. First of all let's try to understand what a `watchdog` is. In simple words, a watchdog is a timer that is used for detection of computer malfunctions and recovering from them. All of these three fields contain watchdog related data that is used by the `clocksource` framework. If we grep the Linux kernel source code, we will see that only the [arch/x86/KConfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Kconfig#L54) kernel configuration file contains the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. So, why do `x86` and `x86_64` need a [watchdog](https://en.wikipedia.org/wiki/Watchdog_timer)? You already may know that all `x86` processors have a special 64-bit register - [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter). This register contains the number of [cycles](https://en.wikipedia.org/wiki/Clock_rate) since the reset. Sometimes the time stamp counter needs to be verified against another clock source. We will not see initialization of the `watchdog` timer in this part, before this we must learn more about timers.\n\nThat's all. From this moment we know all fields of the `clocksource` structure. This knowledge will help us to learn insides of the `clocksource` framework.\n\nNew `clocksource` registration\n--------------------------------------------------------------------------------\n\nWe saw only one function from the `clocksource` framework in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1). This function was - `__clocksource_register`. 
This function is defined in the [include/linux/clocksource.h](https://github.com/torvalds/linux/tree/master/include/linux/clocksource.h) header file and as we can understand from the function's name, main point of this function is to register new clocksource. If we will look on the implementation of the `__clocksource_register` function, we will see that it just makes call of the `__clocksource_register_scale` function and returns its result:\n\n```C\nstatic inline int __clocksource_register(struct clocksource *cs)\n{\n\treturn __clocksource_register_scale(cs, 1, 0);\n}\n```\n\nBefore we will see implementation of the `__clocksource_register_scale` function, we can see that `clocksource` provides additional API for a new clock source registration:\n\n```C\nstatic inline int clocksource_register_hz(struct clocksource *cs, u32 hz)\n{\n        return __clocksource_register_scale(cs, 1, hz);\n}\n\nstatic inline int clocksource_register_khz(struct clocksource *cs, u32 khz)\n{\n        return __clocksource_register_scale(cs, 1000, khz);\n}\n```\n\nAnd all of these functions do the same. They return value of the `__clocksource_register_scale` function but with different set of parameters. The `__clocksource_register_scale` function is defined in the [kernel/time/clocksource.c](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c) source code file. To understand difference between these functions, let's look on the parameters of the `__clocksource_register_scale` function. As we can see, this function takes three parameters:\n\n* `cs` - clocksource to be installed;\n* `scale` - scale factor of a clock source. 
In other words, if we will multiply value of this parameter on frequency, we will get `hz` of a clocksource;\n* `freq` - clock source frequency divided by scale.\n\nNow let's look on the implementation of the `__clocksource_register_scale` function:\n\n```C\nint __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)\n{\n        __clocksource_update_freq_scale(cs, scale, freq);\n        mutex_lock(&clocksource_mutex);\n        clocksource_enqueue(cs);\n        clocksource_enqueue_watchdog(cs);\n        clocksource_select();\n        mutex_unlock(&clocksource_mutex);\n        return 0;\n}\n```\n\nFirst of all we can see that the `__clocksource_register_scale` function starts from the call of the `__clocksource_update_freq_scale` function that defined in the same source code file and updates given clock source with the new frequency. Let's look on the implementation of this function. In the first step we need to check given frequency and if it was not passed as `zero`, we need to calculate `mult` and `shift` parameters for the given clock source. Why do we need to check value of the `frequency`? Actually it can be zero. If you attentively looked on the implementation of the `__clocksource_register` function, you may have noticed that we passed `frequency` as `0`. We will do it only for some clock sources that have self defined `mult` and `shift` parameters. Look in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) and you will see that we saw calculation of the `mult` and `shift` for `jiffies`. The `__clocksource_update_freq_scale` function will do it for us for other clock sources.\n\nSo in the start of the `__clocksource_update_freq_scale` function we check the value of the `frequency` parameter and if it is not zero we need to calculate `mult` and `shift` for the given clock source. 
Let's look on the `mult` and `shift` calculation:\n\n```C\nvoid __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)\n{\n        u64 sec;\n\n\t\tif (freq) {\n             sec = cs->mask;\n             do_div(sec, freq);\n             do_div(sec, scale);\n\n             if (!sec)\n                   sec = 1;\n             else if (sec > 600 && cs->mask > UINT_MAX)\n                   sec = 600;\n\n             clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,\n                                    NSEC_PER_SEC / scale, sec * scale);\n\t    }\n\t    ...\n        ...\n        ...\n}\n```\n\nHere we can see calculation of the maximum number of seconds which we can run before a `clocksource` counter will overflow. First of all we fill the `sec` variable with the value of a clock source mask. Remember that a clock source's mask represents maximum amount of bits that are valid for the given clock source. After this, we can see two division operations. At first we divide our `sec` variable on a `clocksource` frequency and then on scale factor. The `freq` parameter shows us how many timer interrupts will be occurred in one second. So, we divide `mask` value that represents maximum number of a counter (for example `jiffy`) on the frequency of a timer and will get the maximum number of seconds for the certain `clocksource`. The second division operation will give us maximum number of seconds for the certain `clocksource`depends on its scale factor which can be `1` hertz or `1` kilohertz (10^3 Hz).\n\nAfter we have got maximum number of seconds, we check this value and set it to `1` or `600` depends on the result at the next step. These values is maximum sleeping time for a clocksource in seconds. In the next step we can see call of the `clocks_calc_mult_shift`. Main point of this function is calculation of the `mult` and `shift` values for a given clock source. 
In the end of the `__clocksource_update_freq_scale` function we check that just calculated `mult` value of a given clock source will not cause overflow after adjustment, update the `max_idle_ns` and `max_cycles` values of a given clock source with the maximum nanoseconds that can be converted to a clock source counter and print result to the kernel buffer:\n\n```C\npr_info(\"%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\\n\",\n\tcs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);\n```\n\nthat we can see in the [dmesg](https://en.wikipedia.org/wiki/Dmesg) output:\n\n```\n$ dmesg | grep \"clocksource:\"\n[    0.000000] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1910969940391419 ns\n[    0.000000] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns\n[    0.094084] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275000 ns\n[    0.205302] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns\n[    1.452979] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x7350b459580, max_idle_ns: 881591204237 ns\n```\n\nAfter the `__clocksource_update_freq_scale` function will finish its work, we can return back to the `__clocksource_register_scale` function that will register new clock source. We can see the call of the following three functions:\n\n```C\nmutex_lock(&clocksource_mutex);\nclocksource_enqueue(cs);\nclocksource_enqueue_watchdog(cs);\nclocksource_select();\nmutex_unlock(&clocksource_mutex);\n```\n\nNote that before the first will be called, we lock the `clocksource_mutex` [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion). The point of the `clocksource_mutex` mutex is to protect `curr_clocksource` variable which represents currently selected `clocksource` and `clocksource_list` variable which represents list that contains registered `clocksources`. 
Now, let's look on these three functions.\n\nThe first `clocksource_enqueue` function and other two defined in the same source code [file](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c). We go through all already registered `clocksources` or in other words we go through all elements of the `clocksource_list` and tries to find best place for a given `clocksource`:\n\n```C\n/*\n * Enqueue the clocksource sorted by rating\n */\nstatic void clocksource_enqueue(struct clocksource *cs)\n{\n\tstruct list_head *entry = &clocksource_list;\n\tstruct clocksource *tmp;\n\n\tlist_for_each_entry(tmp, &clocksource_list, list) {\n\t\t/* Keep track of the place, where to insert */\n\t\tif (tmp->rating < cs->rating)\n\t\t\tbreak;\n\t\tentry = &tmp->list;\n\t}\n\tlist_add(&cs->list, entry);\n}\n```\n\nIn the end we just insert new clocksource to the `clocksource_list`. The second function - `clocksource_enqueue_watchdog` does almost the same that previous function, but it inserts new clock source to the `wd_list` depends on flags of a clock source and starts new [watchdog](https://en.wikipedia.org/wiki/Watchdog_timer) timer. As I already wrote, we will not consider `watchdog` related stuff in this part but will do it in next parts.\n\nThe last function is the `clocksource_select`. As we can understand from the function's name, main point of this function - select the best `clocksource` from registered clocksources. This function consists only from the call of the function helper:\n\n```C\nstatic void clocksource_select(void)\n{\n\treturn __clocksource_select(false);\n}\n```\n\nNote that the `__clocksource_select` function takes one parameter (`false` in our case). This [bool](https://en.wikipedia.org/wiki/Boolean_data_type) parameter shows how to traverse the `clocksource_list`. In our case we pass `false` that is meant that we will go through all entries of the `clocksource_list`. 
We already know that the `clocksource` with the best rating will be the first in the `clocksource_list` after the call of the `clocksource_enqueue` function, so we can easily get it from this list. After we have found a clock source with the best rating, we switch to it:\n\n```C\nif (curr_clocksource != best && !timekeeping_notify(best)) {\n\tpr_info(\"Switched to clocksource %s\\n\", best->name);\n\tcurr_clocksource = best;\n}\n```\n\nThe result of this operation we can see in the `dmesg` output:\n\n```\n$ dmesg | grep Switched\n[    0.199688] clocksource: Switched to clocksource hpet\n[    2.452966] clocksource: Switched to clocksource tsc\n```\n\nNote that we can see two clock sources in the `dmesg` output (`hpet` and `tsc` in our case). Yes, actually there can be many different clock sources on a particular hardware. So the Linux kernel knows about all registered clock sources and switches to a clock source with a better rating each time after registration of a new clock source.\n\nIf we look at the bottom of the [kernel/time/clocksource.c](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c) source code file, we will see that it has a [sysfs](https://en.wikipedia.org/wiki/Sysfs) interface. Main initialization occurs in the `init_clocksource_sysfs` function which will be called during device `initcalls`. 
Let's look on the implementation of the `init_clocksource_sysfs` function:\n\n```C\nstatic struct bus_type clocksource_subsys = {\n\t.name = \"clocksource\",\n\t.dev_name = \"clocksource\",\n};\n\nstatic int __init init_clocksource_sysfs(void)\n{\n\tint error = subsys_system_register(&clocksource_subsys, NULL);\n\n\tif (!error)\n\t\terror = device_register(&device_clocksource);\n\tif (!error)\n\t\terror = device_create_file(\n\t\t\t\t&device_clocksource,\n\t\t\t\t&dev_attr_current_clocksource);\n\tif (!error)\n\t\terror = device_create_file(&device_clocksource,\n\t\t\t\t\t   &dev_attr_unbind_clocksource);\n\tif (!error)\n\t\terror = device_create_file(\n\t\t\t\t&device_clocksource,\n\t\t\t\t&dev_attr_available_clocksource);\n\treturn error;\n}\ndevice_initcall(init_clocksource_sysfs);\n```\n\nFirst of all we can see that it registers a `clocksource` subsystem with the call of the `subsys_system_register` function. In other words, after the call of this function, we will have following directory:\n\n```\n$ pwd\n/sys/devices/system/clocksource\n```\n\nAfter this step, we can see registration of the `device_clocksource` device which is represented by the following structure:\n\n```C\nstatic struct device device_clocksource = {\n\t.id\t= 0,\n\t.bus\t= &clocksource_subsys,\n};\n```\n\nand creation of three files:\n\n* `dev_attr_current_clocksource`;\n* `dev_attr_unbind_clocksource`;\n* `dev_attr_available_clocksource`.\n\nThese files will provide information about current clock source in the system, available clock sources in the system and interface which allows to unbind the clock source.\n\nAfter the `init_clocksource_sysfs` function is executed, we will be able to find some information about available clock sources in the:\n\n```\n$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource\ntsc hpet acpi_pm\n```\n\nOr for example information about current clock source in the system:\n\n```\n$ cat 
/sys/devices/system/clocksource/clocksource0/current_clocksource\ntsc\n```\n\nIn the previous part, we saw API for the registration of the `jiffies` clock source, but didn't dive into details about the `clocksource` framework. In this part we did it and saw implementation of the new clock source registration and selection of a clock source with the best rating value in the system. Of course, this is not all API that `clocksource` framework provides. There are a couple of additional functions like `clocksource_unregister` for removing given clock source from the `clocksource_list`, etc. But I will not describe these functions in this part, because they are not important for us right now. Anyway if you are interested in them, you can find them in the [kernel/time/clocksource.c](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c).\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the second part of the chapter that describes timers and timer management related stuff in the Linux kernel. In the previous part we got acquainted with the following two concepts: `jiffies` and `clocksource`. In this part we saw some examples of the `jiffies` usage and learned more details about the `clocksource` concept.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n-------------------------------------------------------------------------------\n\n* [x86](https://en.wikipedia.org/wiki/X86)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [uptime](https://en.wikipedia.org/wiki/Uptime)\n* [Ensoniq Soundscape Elite](https://en.wikipedia.org/wiki/Ensoniq_Soundscape_Elite)\n* [RTC](https://en.wikipedia.org/wiki/Real-time_clock)\n* [interrupts](https://en.wikipedia.org/wiki/Interrupt)\n* [IBM PC](https://en.wikipedia.org/wiki/IBM_Personal_Computer)\n* [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer)\n* [Hz](https://en.wikipedia.org/wiki/Hertz)\n* [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond)\n* [dmesg](https://en.wikipedia.org/wiki/Dmesg)\n* [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n* [loadable kernel module](https://en.wikipedia.org/wiki/Loadable_kernel_module)\n* [IA64](https://en.wikipedia.org/wiki/IA-64)\n* [watchdog](https://en.wikipedia.org/wiki/Watchdog_timer)\n* [clock rate](https://en.wikipedia.org/wiki/Clock_rate)\n* [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion)\n* [sysfs](https://en.wikipedia.org/wiki/Sysfs)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1)\n"
  },
  {
    "path": "Timers/linux-timers-3.md",
    "content": "Timers and time management in the Linux kernel. Part 3.\n================================================================================\n\nThe tick broadcast framework and dyntick\n--------------------------------------------------------------------------------\n\nThis is third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel and we stopped on the `clocksource` framework in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2). We have started to consider this framework because it is closely related to the special counters which are provided by the Linux kernel. One of these counters which we already saw in the first [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1.md) of this chapter is - `jiffies`. As I already wrote in the first part of this chapter, we will consider time management related stuff step by step during the Linux kernel initialization. Previous step was call of the:\n\n```C\nregister_refined_jiffies(CLOCK_TICK_RATE);\n```\n\nfunction which is defined in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file and executes initialization of the `refined_jiffies` clock source for us. Recall that this function is called from the `setup_arch` function that is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c) source code and executes architecture-specific ([x86_64](https://en.wikipedia.org/wiki/X86-64) in our case) initialization. 
Look at the implementation of the `setup_arch` and you will note that the call of the `register_refined_jiffies` is the last step before the `setup_arch` function finishes its work.\n\nThere are many different `x86_64` specific things already configured after the end of the `setup_arch` execution. For example some early [interrupt](https://en.wikipedia.org/wiki/Interrupt) handlers are already able to handle interrupts, memory space is reserved for the [initrd](https://en.wikipedia.org/wiki/Initrd), [DMI](https://en.wikipedia.org/wiki/Desktop_Management_Interface) is scanned, the Linux kernel log buffer is already set and this means that the [printk](https://en.wikipedia.org/wiki/Printk) function is able to work, [e820](https://en.wikipedia.org/wiki/E820) is parsed and the Linux kernel already knows about available memory and many other architecture specific things (if you are interested, you can read more about the `setup_arch` function and Linux kernel initialization process in the second [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) of this book).\n\nNow, the `setup_arch` finished its work and we can go back to the generic Linux kernel code. Recall that the `setup_arch` function was called from the `start_kernel` function which is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. So, we shall return to this function. You can see that there are many different functions that are called right after `setup_arch` function inside of the `start_kernel` function, but since our chapter is devoted to timers and time management related stuff, we will skip all code which is not related to this topic. The first function which is related to the time management in the Linux kernel is:\n\n```C\ntick_init();\n```\n\nin the `start_kernel`. 
The `tick_init` function defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and does two things:\n\n* Initialization of `tick broadcast` framework related data structures;\n* Initialization of `full` tickless mode related data structures.\n\nWe didn't see anything related to the `tick broadcast` framework in this book and didn't know anything about tickless mode in the Linux kernel. So, the main point of this part is to look on these concepts and to know what they are.\n\nThe idle process\n--------------------------------------------------------------------------------\n\nFirst of all, let's look on the implementation of the `tick_init` function. As I already wrote, this function is defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and consists from the two calls of following functions:\n\n```C\nvoid __init tick_init(void)\n{\n\ttick_broadcast_init();\n\ttick_nohz_init();\n}\n```\n\nAs you can understand from the paragraph's title, we are interesting only in the `tick_broadcast_init` function for now. This function defined in the [kernel/time/tick-broadcast.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-broadcast.c) source code file and executes initialization of the `tick broadcast` framework related data structures. Before we will look on the implementation of the `tick_broadcast_init` function and will try to understand what does this function do, we need to know about `tick broadcast` framework.\n\nMain point of a central processor is to execute programs. But sometimes a processor may be in a special state when it is not being used by any program. This special state is called - [idle](https://en.wikipedia.org/wiki/Idle_%28CPU%29). 
When the processor has no anything to execute, the Linux kernel launches `idle` task. We already saw a little about this in the last part of the [Linux kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-10). When the Linux kernel will finish all initialization processes in the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file, it will call the `rest_init` function from the same source code file. Main point of this function is to launch kernel `init` thread and the `kthreadd` thread, to call the `schedule` function to start task scheduling and to go to sleep by calling the `cpu_idle_loop` function that defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/idle.c) source code file.\n\nThe `cpu_idle_loop` function represents infinite loop which checks the need for rescheduling on each iteration. After the scheduler finds something to execute, the `idle` process will finish its work and the control will be moved to a new runnable task with the call of the `schedule_preempt_disabled` function:\n\n```C\nstatic void cpu_idle_loop(void)\n{\n\twhile (1) {\n\t\twhile (!need_resched()) {\n\t\t\t...\n\t\t\t...\n\t\t\t...\n\t\t    /* the main idle function */\n\t\t\tcpuidle_idle_call();\n\t\t}\n\t\t...\n\t\t...\n\t\t...\n\t\tschedule_preempt_disabled();\n\t}\n}\n```\n\nOf course, we will not consider full implementation of the `cpu_idle_loop` function and details of the `idle` state in this part, because it is not related to our topic. But there is one interesting moment for us. We know that the processor can execute only one task in one time. How does the Linux kernel decide to reschedule and stop `idle` process if the processor executes infinite loop in the `cpu_idle_loop`? The answer is system timer interrupts. 
When an interrupt occurs, the processor stops the `idle` thread and transfers control to an interrupt handler. After the system timer interrupt handler will be handled, the `need_resched` will return true and the Linux kernel will stop `idle` process and will transfer control to the current runnable task. But handling of the system timer interrupts is not effective for [power management](https://en.wikipedia.org/wiki/Power_management), because if a processor is in `idle` state,  there is little point in sending it a system timer interrupt.\n\nBy default, there is the `CONFIG_HZ_PERIODIC` kernel configuration option which is enabled in the Linux kernel and tells to handle each interrupt of the system timer. To solve this problem, the Linux kernel provides two additional ways of managing scheduling-clock interrupts:\n\nThe first is to omit scheduling-clock ticks on idle processors. To enable this behaviour in the Linux kernel, we need to enable the `CONFIG_NO_HZ_IDLE` kernel configuration option. This option allows Linux kernel to avoid sending timer interrupts to idle processors. In this case periodic timer interrupts will be replaced with on-demand interrupts. This mode is called - `dyntick-idle` mode. But if the kernel does not handle interrupts of a system timer, how can the kernel decide if the system has nothing to do?\n\nWhenever the idle task is selected to run, the periodic tick is disabled with the call of the `tick_nohz_idle_enter` function that defined in the [kernel/time/tick-sched.c](https://github.com/torvalds/linux/blob/master/kernel/time/tick-sched.c) source code file and enabled with the call of the `tick_nohz_idle_exit` function. There is special concept in the Linux kernel which is called - `clock event devices` that are used to schedule the next interrupt. This concept provides API for devices which can deliver interrupts at a specific time in the future and represented by the `clock_event_device` structure in the Linux kernel. 
We will not dive into implementation of the `clock_event_device` structure now. We will see it in the next part of this chapter. But there is one interesting moment for us right now.\n\nThe second way is to omit scheduling-clock ticks on processors that are either in `idle` state or that have only one runnable task or in other words busy processor. We can enable this feature with the `CONFIG_NO_HZ_FULL` kernel configuration option and it allows the kernel to reduce the number of timer interrupts significantly.\n\nBesides the `cpu_idle_loop`, idle processor can be in a sleeping state. The Linux kernel provides special `cpuidle` framework. Main point of this framework is to put an idle processor to sleeping states. The name of the set of these states is - `C-states`. But how will a processor be woken if its local timer is disabled? The Linux kernel provides `tick broadcast` framework for this. The main point of this framework is to assign a timer which is not affected by the `C-states`. This timer will wake a sleeping processor.\n\nNow, after some theory we can return to the implementation of our function. Let's recall that the `tick_init` function just calls the following two functions:\n\n```C\nvoid __init tick_init(void)\n{\n\ttick_broadcast_init();\n\ttick_nohz_init();\n}\n```\n\nLet's consider the first function. The first `tick_broadcast_init` function is defined in the [kernel/time/tick-broadcast.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-broadcast.c) source code file and executes initialization of the `tick broadcast` framework related data structures. 
Let's look on the implementation of the `tick_broadcast_init` function:\n\n```C\nvoid __init tick_broadcast_init(void)\n{\n        zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);\n        zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);\n        zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);\n#ifdef CONFIG_TICK_ONESHOT\n         zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);\n         zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);\n         zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);\n#endif\n}\n```\n\nAs we can see, the `tick_broadcast_init` function allocates different [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) with the help of the `zalloc_cpumask_var` function. The `zalloc_cpumask_var` function defined in the [lib/cpumask.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/cpumask.c) source code file and expands to the call of the following function:\n\n```C\nbool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)\n{\n        return alloc_cpumask_var(mask, flags | __GFP_ZERO);\n}\n```\n\nUltimately, the memory space will be allocated for the given `cpumask` with the certain flags with the help of the `kmalloc_node` function:\n\n```C\n*mask = kmalloc_node(cpumask_size(), flags, node);\n```\n\nNow let's look on the `cpumasks` that will be initialized in the `tick_broadcast_init` function. 
As we can see, the `tick_broadcast_init` function will initialize six `cpumasks`, and moreover, initialization of the last three `cpumasks` will depend on the `CONFIG_TICK_ONESHOT` kernel configuration option.\n\nThe first three `cpumasks` are:\n\n* `tick_broadcast_mask` - the bitmap which represents list of processors that are in a sleeping mode;\n* `tick_broadcast_on` - the bitmap that stores numbers of processors which are in a periodic broadcast state;\n* `tmpmask` - this bitmap for temporary usage.\n\nAs we already know, the next three `cpumasks` depends on the `CONFIG_TICK_ONESHOT` kernel configuration option. Actually each clock event devices can be in one of two modes:\n\n* `periodic` - clock events devices that support periodic events;\n* `oneshot`  - clock events devices that capable of issuing events that happen only once.\n\nThe Linux kernel defines two mask for such clock events devices in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file:\n\n```C\n#define CLOCK_EVT_FEAT_PERIODIC        0x000001\n#define CLOCK_EVT_FEAT_ONESHOT         0x000002\n```\n\nSo, the last three `cpumasks` are:\n\n* `tick_broadcast_oneshot_mask` - stores numbers of processors that must be notified;\n* `tick_broadcast_pending_mask` - stores numbers of processors that pending broadcast;\n* `tick_broadcast_force_mask`   - stores numbers of processors with enforced broadcast.\n\nWe have initialized six `cpumasks` in the `tick broadcast` framework, and now we can proceed to implementation of this framework.\n\nThe `tick broadcast` framework\n--------------------------------------------------------------------------------\n\nHardware may provide some clock source devices. When a processor sleeps and its local timer stopped, there must be additional clock source device that will handle awakening of a processor. 
The Linux kernel uses these `special` clock source devices which can raise an interrupt at a specified time. We already know that such timers called `clock events` devices in the Linux kernel. Besides `clock events` devices, each processor in the system has its own local timer which is programmed to issue interrupt at the time of the next deferred task. Also these timers can be programmed to do a periodical job, like updating `jiffies` and etc. These timers represented by the `tick_device` structure in the Linux kernel. This structure defined in the [kernel/time/tick-sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-sched.h) header file and looks:\n\n```C\nstruct tick_device {\n        struct clock_event_device *evtdev;\n        enum tick_device_mode mode;\n};\n```\n\nNote, that the `tick_device` structure contains two fields. The first field - `evtdev` represents pointer to the `clock_event_device` structure that is defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and represents descriptor of a clock event device. A `clock event` device allows to register an event that will happen in the future. As I already wrote, we will not consider `clock_event_device` structure and related API in this part, but will see it in the next part.\n\nThe second field of the `tick_device` structure represents mode of the `tick_device`. As we already know, the mode can be one of the:\n\n```C\nenum tick_device_mode {\n        TICKDEV_MODE_PERIODIC,\n        TICKDEV_MODE_ONESHOT,\n};\n```\n\nEach `clock events` device in the system registers itself by the call of the `clockevents_register_device` function or `clockevents_config_and_register` function during initialization process of the Linux kernel. 
During the registration of a new `clock events` device, the Linux kernel calls the `tick_check_new_device` function that defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/master/kernel/time/tick-common.c) source code file and checks the given `clock events` device should be used by the Linux kernel. After all checks, the `tick_check_new_device` function executes a call of the:\n\n```C\ntick_install_broadcast_device(newdev);\n```\n\nfunction that checks that the given `clock event` device can be broadcast device and install it, if the given device can be broadcast device. Let's look on the implementation of the `tick_install_broadcast_device` function:\n\n```C\nvoid tick_install_broadcast_device(struct clock_event_device *dev)\n{\n\tstruct clock_event_device *cur = tick_broadcast_device.evtdev;\n\n\tif (!tick_check_broadcast_device(cur, dev))\n\t\treturn;\n\n\tif (!try_module_get(dev->owner))\n\t\treturn;\n\n\tclockevents_exchange_device(cur, dev);\n\n\tif (cur)\n\t\tcur->event_handler = clockevents_handle_noop;\n\n\ttick_broadcast_device.evtdev = dev;\n\n\tif (!cpumask_empty(tick_broadcast_mask))\n\t\ttick_broadcast_start_periodic(dev);\n\n\tif (dev->features & CLOCK_EVT_FEAT_ONESHOT)\n\t\ttick_clock_notify();\n}\n```\n\nFirst of all we get the current `clock event` device from the `tick_broadcast_device`. The `tick_broadcast_device` defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/master/kernel/time/tick-common.c) source code file:\n\n```C\nstatic struct tick_device tick_broadcast_device;\n```\n\nand represents external clock device that keeps track of events for a processor. The first step after we get the current clock device is the call of the `tick_check_broadcast_device` function which checks that a given clock events device can be utilized as broadcast device. 
The main point of the `tick_check_broadcast_device` function is to check value of the `features` field of the given `clock events` device. As we can understand from the name of this field, the `features` field contains a clock event device features. Available values defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and can be one of the `CLOCK_EVT_FEAT_PERIODIC` - which represents a clock events device which supports periodic events and etc. So, the `tick_check_broadcast_device` function check `features` flags for `CLOCK_EVT_FEAT_ONESHOT`, `CLOCK_EVT_FEAT_DUMMY` and other flags and returns `false` if the given clock events device has one of these features. In other way the `tick_check_broadcast_device` function compares `ratings` of the given clock event device and current clock event device and returns the best.\n\nAfter the `tick_check_broadcast_device` function, we can see the call of the `try_module_get` function that checks module owner of the clock events. We need to do it to be sure that the given `clock events` device was correctly initialized. 
The next step is the call of the `clockevents_exchange_device` function that defined in the [kernel/time/clockevents.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/clockevents.c) source code file and will release old clock events device and replace the previous functional handler with a dummy handler.\n\nIn the last step of the `tick_install_broadcast_device` function we check that the `tick_broadcast_mask` is not empty and start the given `clock events` device in periodic mode with the call of the `tick_broadcast_start_periodic` function:\n\n```C\nif (!cpumask_empty(tick_broadcast_mask))\n\ttick_broadcast_start_periodic(dev);\n\nif (dev->features & CLOCK_EVT_FEAT_ONESHOT)\n\ttick_clock_notify();\n```\n\nThe `tick_broadcast_mask` filled in the `tick_device_uses_broadcast` function that checks a `clock events` device during registration of this `clock events` device:\n\n```C\nint cpu = smp_processor_id();\n\nint tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)\n{\n\t...\n\t...\n\t...\n\tif (!tick_device_is_functional(dev)) {\n\t\t...\n\t\tcpumask_set_cpu(cpu, tick_broadcast_mask);\n\t\t...\n\t}\n\t...\n\t...\n\t...\n}\n```\n\nMore about the `smp_processor_id` macro you can read in the fourth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) of the Linux kernel initialization process chapter.\n\nThe `tick_broadcast_start_periodic` function check the given `clock event` device and call the `tick_setup_periodic` function:\n\n```\nstatic void tick_broadcast_start_periodic(struct clock_event_device *bc)\n{\n\tif (bc)\n\t\ttick_setup_periodic(bc, 1);\n}\n```\n\nthat defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and sets broadcast handler for the given `clock event` device by the call of the following 
function:\n\n```C\ntick_set_periodic_handler(dev, broadcast);\n```\n\nThis function checks the second parameter which represents broadcast state (`on` or `off`) and sets the broadcast handler depends on its value:\n\n```C\nvoid tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)\n{\n\tif (!broadcast)\n\t\tdev->event_handler = tick_handle_periodic;\n\telse\n\t\tdev->event_handler = tick_handle_periodic_broadcast;\n}\n```\n\nWhen an `clock event` device will issue an interrupt, the `dev->event_handler` will be called. For example, let's look on the interrupt handler of the [high precision event timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) which is located in the [arch/x86/kernel/hpet.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/hpet.c) source code file:\n\n```C\nstatic irqreturn_t hpet_interrupt_handler(int irq, void *data)\n{\n\tstruct hpet_dev *dev = (struct hpet_dev *)data;\n\tstruct clock_event_device *hevt = &dev->evt;\n\n\tif (!hevt->event_handler) {\n\t\tprintk(KERN_INFO \"Spurious HPET timer interrupt on HPET timer %d\\n\",\n\t\t\t\tdev->num);\n\t\treturn IRQ_HANDLED;\n\t}\n\n\thevt->event_handler(hevt);\n\treturn IRQ_HANDLED;\n}\n```\n\nThe `hpet_interrupt_handler` gets the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) specific data and check the event handler of the `clock event` device. Recall that we just set in the `tick_set_periodic_handler` function. 
So the `tick_handle_periodic_broadcast` function will be called in the end of the high precision event timer interrupt handler.\n\nThe `tick_handle_periodic_broadcast` function calls the\n\n```C\nbc_local = tick_do_periodic_broadcast();\n```\n\nfunction which stores numbers of processors which have asked to be woken up in the temporary `cpumask` and call the `tick_do_broadcast` function:\n\n```C\ncpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);\nreturn tick_do_broadcast(tmpmask);\n```\n\nThe `tick_do_broadcast` calls the `broadcast` function of the given clock events which sends [IPI](https://en.wikipedia.org/wiki/Inter-processor_interrupt) interrupt to the set of the processors. In the end we can call the event handler of the given `tick_device`:\n\n```C\nif (bc_local)\n\ttd->evtdev->event_handler(td->evtdev);\n```\n\nwhich actually represents interrupt handler of the local timer of a processor. After this a processor will wake up. That is all about `tick broadcast` framework in the Linux kernel. We have missed some aspects of this framework, for example reprogramming of a `clock event` device and broadcast with the oneshot timer and etc. But the Linux kernel is very big, it is not realistic to cover all aspects of it. I think it will be interesting to dive into it yourself.\n\nIf you remember, we have started this part with the call of the `tick_init` function. We just consider the `tick_broadcast_init` function and related theory, but the `tick_init` function contains another call of a function and this function is - `tick_nohz_init`. Let's look on the implementation of this function.\n\nInitialization of dyntick related data structures\n--------------------------------------------------------------------------------\n\nWe already saw some information about `dyntick` concept in this part and we know that this concept allows kernel to disable system timer interrupts in the `idle` state. 
The `tick_nohz_init` function makes initialization of the different data structures which are related to this concept. This function defined in the [kernel/time/tick-sched.c](https://github.com/torvalds/linux/blob/master/kernel/time/tick-sched.c) source code file and starts from the check of the value of the `tick_nohz_full_running` variable which represents state of the tick-less mode for the `idle` state and the state when system timer interrupts are disabled during a processor has only one runnable task:\n\n```C\nif (!tick_nohz_full_running) {\n    if (tick_nohz_init_all() < 0)\n    return;\n}\n```\n\nIf this mode is not running we call the `tick_nohz_init_all` function that defined in the same source code file and check its result. The `tick_nohz_init_all` function tries to allocate the `tick_nohz_full_mask` with the call of the `alloc_cpumask_var` that will allocate space for a `tick_nohz_full_mask`. The `tick_nohz_full_mask` will store numbers of processors that have enabled full `NO_HZ`. 
After successful allocation of the `tick_nohz_full_mask` we set all bits in the `tick_nohz_full_mask`, set the `tick_nohz_full_running` and return result to the `tick_nohz_init` function:\n\n```C\nstatic int tick_nohz_init_all(void)\n{\n        int err = -1;\n#ifdef CONFIG_NO_HZ_FULL_ALL\n        if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {\n                WARN(1, \"NO_HZ: Can't allocate full dynticks cpumask\\n\");\n                return err;\n        }\n        err = 0;\n        cpumask_setall(tick_nohz_full_mask);\n        tick_nohz_full_running = true;\n#endif\n        return err;\n}\n```\n\nIn the next step we try to allocate a memory space for the `housekeeping_mask`:\n\n```C\nif (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {\n\tWARN(1, \"NO_HZ: Can't allocate not-full dynticks cpumask\\n\");\n\tcpumask_clear(tick_nohz_full_mask);\n\ttick_nohz_full_running = false;\n\treturn;\n}\n```\n\nThis `cpumask` will store numbers of processors for `housekeeping` or in other words we need at least one processor that will not be in `NO_HZ` mode, because it will do timekeeping and etc. After this we check the result of the architecture-specific `arch_irq_work_has_interrupt` function. This function checks ability to send inter-processor interrupt for the certain architecture. We need to check this, because system timer of a processor will be disabled during `NO_HZ` mode, so there must be at least one online processor which can send inter-processor interrupt to awake offline processor. 
This function defined in the [arch/x86/include/asm/irq_work.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irq_work.h) header file for the [x86_64](https://en.wikipedia.org/wiki/X86-64) and just checks that a processor has [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) from the [CPUID](https://en.wikipedia.org/wiki/CPUID):\n\n```C\nstatic inline bool arch_irq_work_has_interrupt(void)\n{\n    return cpu_has_apic;\n}\n```\n\nIf a processor has not `APIC`, the Linux kernel prints warning message, clears the `tick_nohz_full_mask` cpumask, copies numbers of all possible processors in the system to the `housekeeping_mask` and resets the value of the `tick_nohz_full_running` variable:\n\n```C\nif (!arch_irq_work_has_interrupt()) {\n\tpr_warning(\"NO_HZ: Can't run full dynticks because arch doesn't \"\n\t\t   \"support irq work self-IPIs\\n\");\n\tcpumask_clear(tick_nohz_full_mask);\n\tcpumask_copy(housekeeping_mask, cpu_possible_mask);\n\ttick_nohz_full_running = false;\n\treturn;\n}\n```\n\nAfter this step, we get the number of the current processor by the call of the `smp_processor_id` and check this processor in the `tick_nohz_full_mask`. If the `tick_nohz_full_mask` contains a given processor we clear appropriate bit in the `tick_nohz_full_mask`:\n\n```C\ncpu = smp_processor_id();\n\nif (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {\n\tpr_warning(\"NO_HZ: Clearing %d from nohz_full range for timekeeping\\n\", cpu);\n\tcpumask_clear_cpu(cpu, tick_nohz_full_mask);\n}\n```\n\nBecause this processor will be used for timekeeping. After this step we put all numbers of processors that are in the `cpu_possible_mask` and not in the `tick_nohz_full_mask`:\n\n```C\ncpumask_andnot(housekeeping_mask,\n\t       cpu_possible_mask, tick_nohz_full_mask);\n```\n\nAfter this operation, the `housekeeping_mask` will contain all processors of the system except a processor for timekeeping. 
In the last step of the `tick_nohz_init` function, we are going through all processors that are defined in the `tick_nohz_full_mask` and call the following function for each processor:\n\n```C\nfor_each_cpu(cpu, tick_nohz_full_mask)\n\tcontext_tracking_cpu_set(cpu);\n```\n\nThe `context_tracking_cpu_set` function defined in the [kernel/context_tracking.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/context_tracking.c) source code file and main point of this function is to set the `context_tracking.active` [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable to `true`. When the `active` field will be set to `true` for the certain processor, all [context switches](https://en.wikipedia.org/wiki/Context_switch) will be ignored by the Linux kernel context tracking subsystem for this processor.\n\nThat's all. This is the end of the `tick_nohz_init` function. After this `NO_HZ` related data structures will be initialized. We didn't see API of the `NO_HZ` mode, but will see it soon.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the third part of the chapter that describes timers and timer management related stuff in the Linux kernel. In the previous part we got acquainted with the `clocksource` concept in the Linux kernel which represents framework for managing different clock source in an interrupt and hardware characteristics independent way. We continued to look on the Linux kernel initialization process in a time management context in this part and got acquainted with two new concepts for us: the `tick broadcast` framework and `tick-less` mode. 
The first concept helps the Linux kernel to deal with processors which are in deep sleep and the second concept represents the mode in which kernel may work to improve power management of `idle` processors.\n\nIn the next part we will continue to dive into timer management related things in the Linux kernel and will see new concept for us - `timers`.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n-------------------------------------------------------------------------------\n\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [initrd](https://en.wikipedia.org/wiki/Initrd)\n* [interrupt](https://en.wikipedia.org/wiki/Interrupt)\n* [DMI](https://en.wikipedia.org/wiki/Desktop_Management_Interface)\n* [printk](https://en.wikipedia.org/wiki/Printk)\n* [CPU idle](https://en.wikipedia.org/wiki/Idle_%28CPU%29)\n* [power management](https://en.wikipedia.org/wiki/Power_management)\n* [NO_HZ documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/timers/NO_HZ.txt)\n* [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [high precision event timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer)\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)\n* [IPI](https://en.wikipedia.org/wiki/Inter-processor_interrupt)\n* [CPUID](https://en.wikipedia.org/wiki/CPUID)\n* [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [context 
switches](https://en.wikipedia.org/wiki/Context_switch)\n* [Previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2)\n"
  },
  {
    "path": "Timers/linux-timers-4.md",
    "content": "Timers and time management in the Linux kernel. Part 4.\n================================================================================\n\nTimers\n--------------------------------------------------------------------------------\n\nThis is fourth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-3) we knew about the `tick broadcast` framework and `NO_HZ` mode in the Linux kernel. We will continue to dive into the time management related stuff in the Linux kernel in this part and will be acquainted with yet another concept in the Linux kernel - `timers`. Before we will look at timers in the Linux kernel, we have to learn some theory about this concept. Note that we will consider software timers in this part.\n\nThe Linux kernel provides a `software timer` concept to allow to kernel functions could be invoked at future moment. Timers are widely used in the Linux kernel. For example, look in the [net/netfilter/ipset/ip_set_list_set.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/net/netfilter/ipset/ip_set_list_set.c) source code file. This source code file provides implementation of the framework for the managing of groups of [IP](https://en.wikipedia.org/wiki/Internet_Protocol) addresses.\n\nWe can find the `list_set` structure that contains `gc` filed in this source code file:\n\n```C\nstruct list_set {\n\t...\n\tstruct timer_list gc;\n\t...\n};\n```\n\nNot that the `gc` filed has `timer_list` type. This structure defined in the [include/linux/timer.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/timer.h) header file and main point of this structure is to store `dynamic` timers in the Linux kernel. 
Actually, the Linux kernel provides two types of timers called dynamic timers and interval timers. First type of timers is used by the kernel, and the second can be used by user mode. The `timer_list` structure contains actual `dynamic` timers. The `list_set` contains `gc` timer in our example represents timer for garbage collection. This timer will be initialized in the `list_set_gc_init` function:\n\n```C\nstatic void\nlist_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))\n{\n\tstruct list_set *map = set->data;\n\t...\n\t...\n\t...\n\tmap->gc.function = gc;\n\tmap->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;\n\t...\n\t...\n\t...\n}\n```\n\nA function that is pointed by the `gc` pointer, will be called after timeout which is equal to the `map->gc.expires`.\n\nOk, we will not dive into this example with the [netfilter](https://en.wikipedia.org/wiki/Netfilter), because this chapter is not about [network](https://en.wikipedia.org/wiki/Computer_network) related stuff. But we saw that timers are widely used in the Linux kernel and learned that they represent concept which allows to functions to be called in future.\n\nNow let's continue to research source code of Linux kernel which is related to the timers and time management stuff as we did it in all previous chapters.\n\nIntroduction to dynamic timers in the Linux kernel\n--------------------------------------------------------------------------------\n\nAs I already wrote, we knew about the `tick broadcast` framework and `NO_HZ` mode in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-3). They will be initialized in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file by the call of the `tick_init` function. 
If we will look at this source code file, we will see that the next time management related function is:\n\n```C\ninit_timers();\n```\n\nThis function defined in the [kernel/time/timer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file and contains calls of four functions:\n\n```C\nvoid __init init_timers(void)\n{\n\tinit_timer_cpus();\n\tinit_timer_stats();\n\ttimer_register_cpu_notifier();\n\topen_softirq(TIMER_SOFTIRQ, run_timer_softirq);\n}\n```\n\nLet's look on implementation of each function. The first function is `init_timer_cpus` defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file and just calls the `init_timer_cpu` function for each possible processor in the system:\n\n```C\nstatic void __init init_timer_cpus(void)\n{\n\tint cpu;\n\n\tfor_each_possible_cpu(cpu)\n\t\tinit_timer_cpu(cpu);\n}\n```\n\nIf you do not know or do not remember what is it a `possible` cpu, you can read the special [part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) of this book which describes `cpumask` concept in the Linux kernel. In short words, a `possible` processor is a processor which can be plugged in anytime during the life of the system.\n\nThe `init_timer_cpu` function does main work for us, namely it executes initialization of the `tvec_base` structure for each processor. This structure defined in the [kernel/time/timer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file and stores data related to a `dynamic` timer for a certain processor. 
Let's look on the definition of this structure:\n\n```C\nstruct tvec_base {\n\tspinlock_t lock;\n\tstruct timer_list *running_timer;\n\tunsigned long timer_jiffies;\n\tunsigned long next_timer;\n\tunsigned long active_timers;\n\tunsigned long all_timers;\n\tint cpu;\n\tbool migration_enabled;\n\tbool nohz_active;\n\tstruct tvec_root tv1;\n\tstruct tvec tv2;\n\tstruct tvec tv3;\n\tstruct tvec tv4;\n\tstruct tvec tv5;\n} ____cacheline_aligned;\n```\n\nThe `tvec_base` structure contains following fields: The `lock` for `tvec_base` protection, the next `running_timer` field points to the currently running timer for the certain processor, the `timer_jiffies` field represents the earliest expiration time (it will be used by the Linux kernel to find already expired timers). The next field - `next_timer` contains the next pending timer for a next timer [interrupt](https://en.wikipedia.org/wiki/Interrupt) in a case when a processor goes to sleep and the `NO_HZ` mode is enabled in the Linux kernel. The `active_timers` field provides accounting of non-deferrable timers or in other words all timers that will not be stopped when a processor goes to sleep. The `all_timers` field tracks total number of timers or `active_timers` + deferrable timers. The `cpu` field represents number of a processor which owns timers. The `migration_enabled` and `nohz_active` fields represent opportunity of timers migration to another processor and status of the `NO_HZ` mode respectively.\n\nThe last five fields of the `tvec_base` structure represent lists of dynamic timers. The first `tv1` field has:\n\n```C\n#define TVR_SIZE (1 << TVR_BITS)\n#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)\n\n...\n...\n...\n\nstruct tvec_root {\n\tstruct hlist_head vec[TVR_SIZE];\n};\n```\n\ntype. Note that the value of the `TVR_SIZE` depends on the `CONFIG_BASE_SMALL` kernel configuration option:\n\n![base small](images/base_small.png)\n\nthat reduces size of the kernel data structures if disabled. 
The `tv1` is an array that may contain `64` or `256` elements where each element represents a dynamic timer that will decay within the next `255` system timer interrupts. Next three fields: `tv2`, `tv3` and `tv4` are lists with dynamic timers too, but they store dynamic timers which will decay within the next `2^14 - 1`, `2^20 - 1` and `2^26` respectively. The last `tv5` field represents list which stores dynamic timers with a large expiring period.\n\nSo, now we saw the `tvec_base` structure and description of its fields and we can look on the implementation of the `init_timer_cpu` function. As I already wrote, this function defined in the [kernel/time/timer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file and executes initialization of the `tvec_bases`:\n\n```C\nstatic void __init init_timer_cpu(int cpu)\n{\n\tstruct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);\n\n\tbase->cpu = cpu;\n\tspin_lock_init(&base->lock);\n\n\tbase->timer_jiffies = jiffies;\n\tbase->next_timer = base->timer_jiffies;\n}\n```\n\nThe `tvec_bases` represents [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable which represents main data structure for a dynamic timer for a given processor. This `per-cpu` variable defined in the same source code file:\n\n```C\nstatic DEFINE_PER_CPU(struct tvec_base, tvec_bases);\n```\n\nFirst of all we're getting the address of the `tvec_bases` for the given processor to `base` variable and as we got it, we are starting to initialize some of the `tvec_base` fields in the `init_timer_cpu` function. 
After initialization of the `per-cpu` dynamic timers with the [jiffies](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) and the number of a possible processor, we need to initialize a `tstats_lookup_lock` [spinlock](https://en.wikipedia.org/wiki/Spinlock) in the `init_timer_stats` function:\n\n```C\nvoid __init init_timer_stats(void)\n{\n\tint cpu;\n\n\tfor_each_possible_cpu(cpu)\n\t\traw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));\n}\n```\n\nThe `tstats_lookup_lock` variable represents a `per-cpu` raw spinlock:\n\n```C\nstatic DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);\n```\n\nwhich will be used for protection of operation with statistics of timers that can be accessed through the [procfs](https://en.wikipedia.org/wiki/Procfs):\n\n```C\nstatic int __init init_tstats_procfs(void)\n{\n\tstruct proc_dir_entry *pe;\n\n\tpe = proc_create(\"timer_stats\", 0644, NULL, &tstats_fops);\n\tif (!pe)\n\t\treturn -ENOMEM;\n\treturn 0;\n}\n```\n\nFor example:\n\n```\n$ cat /proc/timer_stats\nTimerstats sample period: 3.888770 s\n  12,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)\n  15,     1 swapper          hcd_submit_urb (rh_timer_func)\n   4,   959 kedac            schedule_timeout (process_timeout)\n   1,     0 swapper          page_writeback_init (wb_timer_fn)\n  28,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)\n  22,  2948 IRQ 4            tty_flip_buffer_push (delayed_work_timer_fn)\n  ...\n  ...\n  ...\n```\n\nThe next step after initialization of the `tstats_lookup_lock` spinlock is the call of the `timer_register_cpu_notifier` function. 
This function depends on the `CONFIG_HOTPLUG_CPU` kernel configuration option which enables support for [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) processors in the Linux kernel.\n\nWhen a processor is logically offlined, a notification will be sent to the Linux kernel with the `CPU_DEAD` or the `CPU_DEAD_FROZEN` event by the call of the `cpu_notifier` macro:\n\n```C\n#ifdef CONFIG_HOTPLUG_CPU\n...\n...\nstatic inline void timer_register_cpu_notifier(void)\n{\n\tcpu_notifier(timer_cpu_notify, 0);\n}\n...\n...\n#else\n...\n...\nstatic inline void timer_register_cpu_notifier(void) { }\n...\n...\n#endif /* CONFIG_HOTPLUG_CPU */\n```\n\nIn this case the `timer_cpu_notify` will be called which checks an event type and will call the `migrate_timers` function:\n\n```C\nstatic int timer_cpu_notify(struct notifier_block *self,\n\t                        unsigned long action, void *hcpu)\n{\n\tswitch (action) {\n\tcase CPU_DEAD:\n\tcase CPU_DEAD_FROZEN:\n\t\tmigrate_timers((long)hcpu);\n\t\tbreak;\n\tdefault:\n\t\tbreak;\n\t}\n\n\treturn NOTIFY_OK;\n}\n```\n\nThis chapter will not describe `hotplug` related events in the Linux kernel source code, but if you are interested in such things, you can find the implementation of the `migrate_timers` function in the [kernel/time/timer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/timer.c) source code file.\n\nThe last step in the `init_timers` function is the call of the:\n\n```C\nopen_softirq(TIMER_SOFTIRQ, run_timer_softirq);\n```\n\nfunction. The `open_softirq` function may be already familiar to you if you have read the ninth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9) about the interrupts and interrupt handling in the Linux kernel. 
In short, the `open_softirq` function is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file and executes initialization of the deferred interrupt handler.\n\nIn our case the deferred function is the `run_timer_softirq` function that will be called after a hardware interrupt in the `do_IRQ` function which is defined in the [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c) source code file. The main point of this function is to handle a software dynamic timer. The Linux kernel does not do this thing during the hardware timer interrupt handling because this is a time consuming operation.\n\nLet's look at the implementation of the `run_timer_softirq` function:\n\n```C\nstatic void run_timer_softirq(struct softirq_action *h)\n{\n\tstruct tvec_base *base = this_cpu_ptr(&tvec_bases);\n\n\tif (time_after_eq(jiffies, base->timer_jiffies))\n\t\t__run_timers(base);\n}\n```\n\nAt the beginning of the `run_timer_softirq` function we get a `dynamic` timer for the current processor and compare the current value of the [jiffies](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) with the value of the `timer_jiffies` for the current structure by the call of the `time_after_eq` macro which is defined in the [include/linux/jiffies.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/jiffies.h) header file:\n\n```C\n#define time_after_eq(a,b)          \\\n    (typecheck(unsigned long, a) && \\\n     typecheck(unsigned long, b) && \\\n    ((long)((a) - (b)) >= 0))\n```\n\nRecall that the `timer_jiffies` field of the `tvec_base` structure represents the relative time when functions delayed by the given timer will be executed. 
So we compare these two values and if the current time represented by the `jiffies` is greater than `base->timer_jiffies`, we call the `__run_timers` function that defined in the same source code file. Let's look on the implementation of this function.\n\nAs I just wrote, the `__run_timers` function runs all expired timers for a given processor. This function starts from the acquiring of the `tvec_base's`  lock to protect the `tvec_base` structure\n\n```C\nstatic inline void __run_timers(struct tvec_base *base)\n{\n\tstruct timer_list *timer;\n\n\tspin_lock_irq(&base->lock);\n\t...\n\t...\n\t...\n\tspin_unlock_irq(&base->lock);\n}\n```\n\nAfter this it starts the loop while the `timer_jiffies` will not be greater than the `jiffies`:\n\n```C\nwhile (time_after_eq(jiffies, base->timer_jiffies)) {\n\t...\n\t...\n\t...\n}\n```\n\nWe can find many different manipulations in the our loop, but the main point is to find expired timers and call delayed functions. First of all we need to calculate the `index` of the `base->tv1` list that stores the next timer to be handled with the following expression:\n\n```C\nindex = base->timer_jiffies & TVR_MASK;\n```\n\nwhere the `TVR_MASK` is a mask for the getting of the `tvec_root->vec` elements. As we got the index with the next timer which must be handled we check its value. 
If the index is zero, we go through all lists in our cascade table `tv2`, `tv3`, etc., and rehash them with the call of the `cascade` function:\n\n```C\nif (!index &&\n\t(!cascade(base, &base->tv2, INDEX(0))) &&\n\t\t(!cascade(base, &base->tv3, INDEX(1))) &&\n\t\t\t\t!cascade(base, &base->tv4, INDEX(2)))\n\t\tcascade(base, &base->tv5, INDEX(3));\n```\n\nAfter this we increase the value of the `base->timer_jiffies`:\n\n```C\n++base->timer_jiffies;\n```\n\nIn the last step we are executing a corresponding function for each timer from the list in the following loop:\n\n```C\nhlist_move_list(base->tv1.vec + index, head);\n\nwhile (!hlist_empty(head)) {\n\t...\n\t...\n\t...\n\ttimer = hlist_entry(head->first, struct timer_list, entry);\n\tfn = timer->function;\n\tdata = timer->data;\n\n\tspin_unlock(&base->lock);\n\tcall_timer_fn(timer, fn, data);\n\tspin_lock(&base->lock);\n\n\t...\n\t...\n\t...\n}\n```\n\nwhere the `call_timer_fn` just calls the given function:\n\n```C\nstatic void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),\n\t                      unsigned long data)\n{\n\t...\n\t...\n\t...\n\tfn(data);\n\t...\n\t...\n\t...\n}\n```\n\nThat's all. The Linux kernel has infrastructure for `dynamic timers` from this moment. We will not dive into this interesting theme. As I already wrote, `timers` are a [widely](http://lxr.free-electrons.com/ident?i=timer_list) used concept in the Linux kernel, and neither one part nor two parts can fully cover how they are implemented and how they work. 
But now we know about this concept, why the Linux kernel needs it and some data structures around it.\n\nNow let's look at the usage of `dynamic timers` in the Linux kernel.\n\nUsage of dynamic timers\n--------------------------------------------------------------------------------\n\nAs you may have already noticed, if the Linux kernel provides a concept, it also provides an API for managing this concept, and the `dynamic timers` concept is no exception here. To use a timer in the Linux kernel code, we must define a variable with a `timer_list` type. We can initialize our `timer_list` structure in two ways. The first is to use the `init_timer` macro that is defined in the [include/linux/timer.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/timer.h) header file:\n\n```C\n#define init_timer(timer)    \\\n\t__init_timer((timer), 0)\n\n#define __init_timer(_timer, _flags)   \\\n         init_timer_key((_timer), (_flags), NULL, NULL)\n```\n\nwhere the `init_timer_key` function just calls the:\n\n```C\ndo_init_timer(timer, flags, name, key);\n```\n\nfunction which fills the given `timer` with default values. The second way is to use the:\n\n```C\n#define TIMER_INITIALIZER(_function, _expires, _data)\t\t\\\n\t__TIMER_INITIALIZER((_function), (_expires), (_data), 0)\n```\n\nmacro which will initialize the given `timer_list` structure too.\n\nAfter a `dynamic timer` is initialized we can start this `timer` with the call of the:\n\n```C\nvoid add_timer(struct timer_list * timer);\n```\n\nfunction and stop it with the:\n\n```C\nint del_timer(struct timer_list * timer);\n```\n\nfunction.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the fourth part of the chapter that describes timers and timer management related stuff in the Linux kernel. 
In the previous part we got acquainted with the two new concepts: the `tick broadcast` framework and the `NO_HZ` mode. In this part we continued to dive into time management related stuff and got acquainted with the new concept - `dynamic timer` or software timer. We didn't see the implementation of the `dynamic timers` management code in detail in this part but saw data structures and API around this concept.\n\nIn the next part we will continue to dive into timer management related things in the Linux kernel and will see new concept for us - `timers`.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n-------------------------------------------------------------------------------\n\n* [IP](https://en.wikipedia.org/wiki/Internet_Protocol)\n* [netfilter](https://en.wikipedia.org/wiki/Netfilter)\n* [network](https://en.wikipedia.org/wiki/Computer_network)\n* [cpumask](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [interrupt](https://en.wikipedia.org/wiki/Interrupt)\n* [jiffies](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1)\n* [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)\n* [spinlock](https://en.wikipedia.org/wiki/Spinlock)\n* [procfs](https://en.wikipedia.org/wiki/Procfs)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-3)\n"
  },
  {
    "path": "Timers/linux-timers-5.md",
    "content": "Timers and time management in the Linux kernel. Part 5.\n================================================================================\n\nIntroduction to the `clockevents` framework\n--------------------------------------------------------------------------------\n\nThis is the fifth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel. As you might have noted from the title of this part, the `clockevents` framework will be discussed. We already saw one framework in the [second](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2) part of this chapter. It was the `clocksource` framework. Both of these frameworks represent timekeeping abstractions in the Linux kernel.\n\nAt first let's refresh your memory and try to remember what the `clocksource` framework is and what its purpose is. The main goal of the `clocksource` framework is to provide `timeline`. As described in the [documentation](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/Documentation/timers/timekeeping.txt):\n\n> For example issuing the command 'date' on a Linux system will eventually read the clock source to determine exactly what time it is.\n\nThe Linux kernel supports many different clock sources. You can find some of them in the [drivers/clocksource](https://github.com/torvalds/linux/tree/master/drivers/clocksource). For example old good [Intel 8253](https://en.wikipedia.org/wiki/Intel_8253) - [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer) with `1193182` Hz frequency, yet another one - [ACPI PM](http://uefi.org/sites/default/files/resources/ACPI_5.pdf) timer with `3579545` Hz frequency. Besides the [drivers/clocksource](https://github.com/torvalds/linux/tree/master/drivers/clocksource) directory, each architecture may provide own architecture-specific clock sources. 
For example [x86](https://en.wikipedia.org/wiki/X86) architecture provides [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer), or for example [powerpc](https://en.wikipedia.org/wiki/PowerPC) provides access to the processor timer through `timebase` register.\n\nEach clock source provides monotonic atomic counter. As I already wrote, the Linux kernel supports a huge set of different clock source and each clock source has own parameters like [frequency](https://en.wikipedia.org/wiki/Frequency). The main goal of the `clocksource` framework is to provide [API](https://en.wikipedia.org/wiki/Application_programming_interface) to select best available clock source in the system i.e. a clock source with the highest frequency. Additional goal of the `clocksource` framework is to represent an atomic counter provided by a clock source in human units. In this time, nanoseconds are the favorite choice for the time value units of the given clock source in the Linux kernel.\n\nThe `clocksource` framework represented by the `clocksource` structure which is defined in the [include/linux/clocksource.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clocksource.h) header code file which contains `name` of a clock source, rating of certain clock source in the system (a clock source with the higher frequency has the biggest rating in the system), `list` of all registered clock source in the system, `enable` and `disable` fields to enable and disable a clock source, pointer to the `read` function which must return an atomic counter of a clock source and etc.\n\nAdditionally the `clocksource` structure provides two fields: `mult` and `shift` which are needed for translation of an atomic counter which is provided by a certain clock source to the human units, i.e. [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond). 
Translation occurs via following formula:\n\n```\nns ~= (clocksource * mult) >> shift\n```\n\nAs we already know, besides the `clocksource` structure, the `clocksource` framework provides an API for registration of clock source with different frequency scale factor:\n\n```C\nstatic inline int clocksource_register_hz(struct clocksource *cs, u32 hz)\nstatic inline int clocksource_register_khz(struct clocksource *cs, u32 khz)\n```\n\nA clock source unregistration:\n\n```C\nint clocksource_unregister(struct clocksource *cs)\n```\n\nand etc.\n\nAdditionally to the `clocksource` framework, the Linux kernel provides `clockevents` framework. As described in the [documentation](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/Documentation/timers/timekeeping.txt):\n\n> Clock events are the conceptual reverse of clock sources\n\nThe main goal of the `clockevents` framework is to manage clock event devices, or in other words - to manage devices that allow registering an event, i.e. an [interrupt](https://en.wikipedia.org/wiki/Interrupt), that is going to happen at a defined point of time in the future.\n\nNow we know a little about the `clockevents` framework in the Linux kernel, and now it is time to look at its [API](https://en.wikipedia.org/wiki/Application_programming_interface).\n\nAPI of `clockevents` framework\n-------------------------------------------------------------------------------\n\nThe main structure which describes a clock event device is the `clock_event_device` structure. This structure is defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and contains a huge set of fields. 
As well as the `clocksource` structure, it has a `name` field which contains a human readable name of a clock event device, for example [local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) timer:\n\n```C\nstatic struct clock_event_device lapic_clockevent = {\n    .name                   = \"lapic\",\n    ...\n    ...\n    ...\n}\n```\n\nThe structure also stores the `event_handler`, `set_next_event` and `next_event` fields for a certain clock event device, which are an [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler), a setter of the next event and local storage for the next event respectively. Yet another field of the `clock_event_device` structure is the `features` field. Its value may be one of the following generic features:\n\n```C\n#define CLOCK_EVT_FEAT_PERIODIC\t0x000001\n#define CLOCK_EVT_FEAT_ONESHOT\t\t0x000002\n```\n\nWhere the `CLOCK_EVT_FEAT_PERIODIC` represents device which may be programmed to generate events periodically. The `CLOCK_EVT_FEAT_ONESHOT` represents device which may generate an event only once. Besides these two features, there are also architecture-specific features. For example [x86_64](https://en.wikipedia.org/wiki/X86-64) supports two additional features:\n\n```C\n#define CLOCK_EVT_FEAT_C3STOP\t\t0x000008\n```\n\nThe first `CLOCK_EVT_FEAT_C3STOP` means that a clock event device will be stopped in the [C3](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface#Device_states) state. Additionally the `clock_event_device` structure has `mult` and `shift` fields as well as `clocksource` structure. The `clocksource` structure also contains other fields, but we will consider it later.\n\nAfter we considered part of the `clock_event_device` structure, it is time to look at the `API` of the `clockevents` framework. To work with a clock event device, first of all we need to initialize `clock_event_device` structure and register a clock events device. 
The `clockevents` framework provides following `API` for registration of clock event devices:\n\n```C\nvoid clockevents_register_device(struct clock_event_device *dev)\n{\n   ...\n   ...\n   ...\n}\n```\n\nThis function defined in the [kernel/time/clockevents.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/clockevents.c) source code file and as we may see, the `clockevents_register_device` function takes only one parameter:\n\n* address of a `clock_event_device` structure which represents a clock event device.\n\nSo, to register a clock event device, at first we need to initialize `clock_event_device` structure with parameters of a certain clock event device. Let's take a look at one random clock event device in the Linux kernel source code. We can find one in the [drivers/clocksource](https://github.com/torvalds/linux/tree/master/drivers/clocksource) directory or try to take a look at an architecture-specific clock event device. Let's take for example - [Periodic Interval Timer (PIT) for at91sam926x](http://www.atmel.com/Images/doc6062.pdf). You can find its implementation in the [drivers/clocksource](https://github.com/torvalds/linux/tree/master/drivers/clocksource/timer-atmel-pit.c).\n\nFirst of all let's look at initialization of the `clock_event_device` structure. 
This occurs in the `at91sam926x_pit_common_init` function:\n\n```C\nstruct pit_data {\n    ...\n    ...\n    struct clock_event_device       clkevt;\n    ...\n    ...\n};\n\nstatic void __init at91sam926x_pit_common_init(struct pit_data *data)\n{\n    ...\n    ...\n    ...\n    data->clkevt.name = \"pit\";\n    data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;\n    data->clkevt.shift = 32;\n    data->clkevt.mult = div_sc(pit_rate, NSEC_PER_SEC, data->clkevt.shift);\n    data->clkevt.rating = 100;\n    data->clkevt.cpumask = cpumask_of(0);\n\n    data->clkevt.set_state_shutdown = pit_clkevt_shutdown;\n    data->clkevt.set_state_periodic = pit_clkevt_set_periodic;\n    data->clkevt.resume = at91sam926x_pit_resume;\n    data->clkevt.suspend = at91sam926x_pit_suspend;\n    ...\n}\n```\n\nHere we can see that `at91sam926x_pit_common_init` takes one parameter - pointer to the `pit_data` structure which contains `clock_event_device` structure which will contain clock event related information of the `at91sam926x` [periodic Interval Timer](https://en.wikipedia.org/wiki/Programmable_interval_timer). At the start we fill `name` of the timer device and its `features`. In our case we deal with periodic timer which as we already know may be programmed to generate events periodically.\n\nThe next two fields `shift` and `mult` are familiar to us. They will be used to translate counter of our timer to nanoseconds. After this we set rating of the timer  to `100`. This means if there will not be timers with higher rating in the system, this timer will be used for timekeeping. The next field - `cpumask` indicates for which processors in the system the device will work. In our case, the device will work for the first processor. 
The `cpumask_of` macro is defined in the [include/linux/cpumask.h](https://github.com/torvalds/linux/tree/master/include/linux/cpumask.h) header file and just expands to the call of the:\n\n```C\n#define cpumask_of(cpu) (get_cpu_mask(cpu))\n```\n\nWhere the `get_cpu_mask` returns the cpumask containing just a given `cpu` number. More about `cpumasks` concept you may read in the [CPU masks in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part. In the last four lines of code we set callbacks for the clock event device suspend/resume, device shutdown and update of the clock event device state.\n\nAfter we finished with the initialization of the `at91sam926x` periodic timer, we can register it by the call of the following function:\n\n```C\nclockevents_register_device(&data->clkevt);\n```\n\nNow we can consider implementation of the `clockevents_register_device` function. As I already wrote above, this function is defined in the [kernel/time/clockevents.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/clockevents.c) source code file and starts from the initialization of the initial event device state:\n\n```C\nclockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);\n```\n\nActually, an event device may be in one of these states:\n\n```C\nenum clock_event_state {\n\tCLOCK_EVT_STATE_DETACHED,\n\tCLOCK_EVT_STATE_SHUTDOWN,\n\tCLOCK_EVT_STATE_PERIODIC,\n\tCLOCK_EVT_STATE_ONESHOT,\n\tCLOCK_EVT_STATE_ONESHOT_STOPPED,\n};\n```\n\nWhere:\n\n* `CLOCK_EVT_STATE_DETACHED` - a clock event device is not used by `clockevents` framework. 
Actually it is the initial state of all clock event devices;\n* `CLOCK_EVT_STATE_SHUTDOWN` - a clock event device is powered-off;\n* `CLOCK_EVT_STATE_PERIODIC` - a clock event device may be programmed to generate event periodically;\n* `CLOCK_EVT_STATE_ONESHOT`  - a clock event device may be programmed to generate event only once;\n* `CLOCK_EVT_STATE_ONESHOT_STOPPED` - a clock event device was programmed to generate event only once and now it is temporarily stopped.\n\nThe implementation of the `clockevent_set_state` function is pretty easy:\n\n```C\nstatic inline void clockevent_set_state(struct clock_event_device *dev,\n\t\t\t\t\tenum clock_event_state state)\n{\n\tdev->state_use_accessors = state;\n}\n```\n\nAs we can see, it just fills the `state_use_accessors` field of the given `clock_event_device` structure with the given value which in our case is `CLOCK_EVT_STATE_DETACHED`. Actually all clock event devices have this initial state during registration. The `state_use_accessors` field of the `clock_event_device` structure provides `current` state of the clock event device.\n\nAfter we have set initial state of the given `clock_event_device` structure we check that the `cpumask` of the given clock event device is not zero:\n\n```C\nif (!dev->cpumask) {\n\tWARN_ON(num_possible_cpus() > 1);\n\tdev->cpumask = cpumask_of(smp_processor_id());\n}\n```\n\nRemember that we have set the `cpumask` of the `at91sam926x` periodic timer to first processor. If the `cpumask` field is zero, we check the number of possible processors in the system and print a warning message if it is greater than one. Additionally we set the `cpumask` of the given clock event device to the current processor. 
If you are interested in how the `smp_processor_id` macro is implemented, you can read more about it in the fourth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4) of the Linux kernel initialization process chapter.\n\nAfter this check we lock the actual code of the clock event device registration by the call following macros:\n\n```C\nraw_spin_lock_irqsave(&clockevents_lock, flags);\n...\n...\n...\nraw_spin_unlock_irqrestore(&clockevents_lock, flags);\n```\n\nAdditionally the `raw_spin_lock_irqsave` and the `raw_spin_unlock_irqrestore` macros disable local interrupts, however interrupts on other processors still may occur. We need to do it to prevent potential [deadlock](https://en.wikipedia.org/wiki/Deadlock) if we adding new clock event device to the list of clock event devices and an interrupt occurs from other clock event device.\n\nWe can see following code of clock event device registration between the `raw_spin_lock_irqsave` and `raw_spin_unlock_irqrestore` macros:\n\n```C\nlist_add(&dev->list, &clockevent_devices);\ntick_check_new_device(dev);\nclockevents_notify_released();\n```\n\nFirst of all we add the given clock event device to the list of clock event devices which is represented by the `clockevent_devices`:\n\n```C\nstatic LIST_HEAD(clockevent_devices);\n```\n\nAt the next step we call the `tick_check_new_device` function which is defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and checks do the new registered clock event device should be used or not. The `tick_check_new_device` function checks the given `clock_event_device` gets the current registered tick device which is represented by the `tick_device` structure and compares their ratings and features. 
Actually `CLOCK_EVT_STATE_ONESHOT` is preferred:\n\n```C\nstatic bool tick_check_preferred(struct clock_event_device *curdev,\n\t\t\t\t struct clock_event_device *newdev)\n{\n\tif (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {\n\t\tif (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))\n\t\t\treturn false;\n\t\tif (tick_oneshot_mode_active())\n\t\t\treturn false;\n\t}\n\n\treturn !curdev ||\n\t\tnewdev->rating > curdev->rating ||\n\t       !cpumask_equal(curdev->cpumask, newdev->cpumask);\n}\n```\n\nIf the new registered clock event device is more preferred than old tick device, we exchange old and new registered devices and install new device:\n\n```C\nclockevents_exchange_device(curdev, newdev);\ntick_setup_device(td, newdev, cpu, cpumask_of(cpu));\n```\n\nThe `clockevents_exchange_device` function releases or in other words deleted the old clock event device from the `clockevent_devices` list. The next function - `tick_setup_device` as we may understand from its name, setups new tick device. This function check the mode of the new registered clock event device and call the `tick_setup_periodic` function or the `tick_setup_oneshot` depends on the tick device mode:\n\n```C\nif (td->mode == TICKDEV_MODE_PERIODIC)\n\ttick_setup_periodic(newdev, 0);\nelse\n\ttick_setup_oneshot(newdev, handler, next_event);\n```\n\nBoth of this functions calls the `clockevents_switch_state` to change state of the clock event device and the `clockevents_program_event` function to set next event of clock event device based on delta between the maximum and minimum difference current time and time for the next event. 
The `tick_setup_periodic`:\n\n```C\nclockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);\nclockevents_program_event(dev, next, false))\n```\n\nand the `tick_setup_oneshot`:\n\n```C\nclockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);\nclockevents_program_event(newdev, next_event, true);\n```\n\nThe `clockevents_switch_state` function checks that the clock event device is not in the given state and calls the `__clockevents_switch_state` function from the same source code file:\n\n```C\nif (clockevent_get_state(dev) != state) {\n\tif (__clockevents_switch_state(dev, state))\n\t\treturn;\n```\n\nThe `__clockevents_switch_state` function just makes a call of the certain callback depending on the given state:\n\n```C\nstatic int __clockevents_switch_state(struct clock_event_device *dev,\n\t\t\t\t      enum clock_event_state state)\n{\n\tif (dev->features & CLOCK_EVT_FEAT_DUMMY)\n\t\treturn 0;\n\n\tswitch (state) {\n\tcase CLOCK_EVT_STATE_DETACHED:\n\tcase CLOCK_EVT_STATE_SHUTDOWN:\n\t\tif (dev->set_state_shutdown)\n\t\t\treturn dev->set_state_shutdown(dev);\n\t\treturn 0;\n\n\tcase CLOCK_EVT_STATE_PERIODIC:\n\t\tif (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))\n\t\t\treturn -ENOSYS;\n\t\tif (dev->set_state_periodic)\n\t\t\treturn dev->set_state_periodic(dev);\n\t\treturn 0;\n    ...\n    ...\n    ...\n```\n\nIn our case for `at91sam926x` periodic timer, the state is the `CLOCK_EVT_FEAT_PERIODIC`:\n\n```C\ndata->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;\ndata->clkevt.set_state_periodic = pit_clkevt_set_periodic;\n```\n\nSo, the `pit_clkevt_set_periodic` callback will be called. 
If we will read the documentation of the [Periodic Interval Timer (PIT) for at91sam926x](http://www.atmel.com/Images/doc6062.pdf), we will see that there is `Periodic Interval Timer Mode Register` which allows us to control of periodic interval timer.\n\nIt looks like:\n\n```\n31                                                   25        24\n+---------------------------------------------------------------+\n|                                          |  PITIEN  |  PITEN  |\n+---------------------------------------------------------------+\n23                            19                               16\n+---------------------------------------------------------------+\n|                             |               PIV               |\n+---------------------------------------------------------------+\n15                                                              8\n+---------------------------------------------------------------+\n|                            PIV                                |\n+---------------------------------------------------------------+\n7                                                               0\n+---------------------------------------------------------------+\n|                            PIV                                |\n+---------------------------------------------------------------+\n```\n\nWhere `PIV` or `Periodic Interval Value` - defines the value compared with the primary `20-bit` counter of the Periodic Interval Timer. The `PITEN` or `Period Interval Timer Enabled` if the bit is `1` and the `PITIEN` or `Periodic Interval Timer Interrupt Enable` if the bit is `1`. So, to set periodic mode, we need to set `24`, `25` bits in the `Periodic Interval Timer Mode Register`. 
And we are doing it in the `pit_clkevt_set_periodic` function:\n\n```C\nstatic int pit_clkevt_set_periodic(struct clock_event_device *dev)\n{\n        struct pit_data *data = clkevt_to_pit_data(dev);\n        ...\n        ...\n        ...\n        pit_write(data->base, AT91_PIT_MR,\n                  (data->cycle - 1) | AT91_PIT_PITEN | AT91_PIT_PITIEN);\n\n        return 0;\n}\n```\n\nWhere the `AT91_PIT_MR`, `AT91_PIT_PITEN` and the `AT91_PIT_PITIEN` are declared as:\n\n```C\n#define AT91_PIT_MR             0x00\n#define AT91_PIT_PITIEN       BIT(25)\n#define AT91_PIT_PITEN        BIT(24)\n```\n\nAfter the setup of the new clock event device is finished, we can return to the `clockevents_register_device` function. The last function in the `clockevents_register_device` function is:\n\n```C\nclockevents_notify_released();\n```\n\nThis function checks the `clockevents_released` list which contains released clock event devices (remember that they may occur after the call of the `clockevents_exchange_device` function). If this list is not empty, we go through the clock event devices in the `clockevents_released` list, remove each of them from it, add it back to the `clockevent_devices` list and check it again with the `tick_check_new_device` function:\n\n```C\nstatic void clockevents_notify_released(void)\n{\n\tstruct clock_event_device *dev;\n\n\twhile (!list_empty(&clockevents_released)) {\n\t\tdev = list_entry(clockevents_released.next,\n\t\t\t\t struct clock_event_device, list);\n\t\tlist_del(&dev->list);\n\t\tlist_add(&dev->list, &clockevent_devices);\n\t\ttick_check_new_device(dev);\n\t}\n}\n```\n\nThat's all. From this moment we have registered new clock event device. So the usage of the `clockevents` framework is simple and clear. Architectures register their clock event devices in the clock events core. Users of the clockevents core can get clock event devices for their use. 
The `clockevents` framework provides notification mechanisms for various clock related management events like a clock event device registered or unregistered, a processor is offlined in system which supports [CPU hotplug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) and so on.\n\nWe saw implementation only of the `clockevents_register_device` function. But generally, the clock event layer [API](https://en.wikipedia.org/wiki/Application_programming_interface) is small. Besides the `API` for clock event device registration, the `clockevents` framework provides functions to schedule the next event interrupt, clock event device notification service and support for suspend and resume for clock event devices.\n\nIf you want to know more about `clockevents` API you can start to research following source code and header files: [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c), [kernel/time/clockevents.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/clockevents.c) and [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h).\n\nThat's all.\n\nConclusion\n-------------------------------------------------------------------------------\n\nThis is the end of the fifth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) that describes timers and timer management related stuff in the Linux kernel. In the previous part we got acquainted with the `timers` concept. 
In this part we continued to learn time management related stuff in the Linux kernel and saw a little about yet another framework - `clockevents`.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n-------------------------------------------------------------------------------\n\n* [timekeeping documentation](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/Documentation/timers/timekeeping.txt)\n* [Intel 8253](https://en.wikipedia.org/wiki/Intel_8253)\n* [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer)\n* [ACPI pdf](http://uefi.org/sites/default/files/resources/ACPI_5.pdf)\n* [x86](https://en.wikipedia.org/wiki/X86)\n* [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer)\n* [powerpc](https://en.wikipedia.org/wiki/PowerPC)\n* [frequency](https://en.wikipedia.org/wiki/Frequency)\n* [API](https://en.wikipedia.org/wiki/Application_programming_interface)\n* [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond)\n* [interrupt](https://en.wikipedia.org/wiki/Interrupt)\n* [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler)\n* [local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)\n* [C3 state](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface#Device_states) \n* [Periodic Interval Timer (PIT) for at91sam926x](http://www.atmel.com/Images/doc6062.pdf)\n* [CPU masks in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)\n* [deadlock](https://en.wikipedia.org/wiki/Deadlock)\n* [CPU 
hotplug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-3)\n"
  },
  {
    "path": "Timers/linux-timers-6.md",
    "content": "Timers and time management in the Linux kernel. Part 6.\n================================================================================\n\nx86_64 related clock sources\n--------------------------------------------------------------------------------\n\nThis is sixth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-5) we saw `clockevents` framework and now we will continue to dive into time management related stuff in the Linux kernel. This part will describe implementation of [x86](https://en.wikipedia.org/wiki/X86) architecture related clock sources (more about `clocksource` concept you can read in the [second part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2) of this chapter).\n\nFirst of all we must know what clock sources may be used at `x86` architecture. It is easy to know from the [sysfs](https://en.wikipedia.org/wiki/Sysfs) or from content of the `/sys/devices/system/clocksource/clocksource0/available_clocksource`. 
The `/sys/devices/system/clocksource/clocksourceN` provides two special files to achieve this:\n\n* `available_clocksource` - provides information about available clock sources in the system;\n* `current_clocksource`   - provides information about currently used clock source in the system.\n\nSo, let's look:\n\n```\n$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource \ntsc hpet acpi_pm \n```\n\nWe can see that there are three registered clock sources in my system:\n\n* `tsc` - [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter);\n* `hpet` - [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer);\n* `acpi_pm` - [ACPI Power Management Timer](http://uefi.org/sites/default/files/resources/ACPI_5.pdf).\n\nNow let's look at the second file which provides best clock source (a clock source which has the best rating in the system):\n\n```\n$ cat /sys/devices/system/clocksource/clocksource0/current_clocksource \ntsc\n```\n\nFor me it is [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter). As we may know from the [second part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2) of this chapter, which describes internals of the `clocksource` framework in the Linux kernel, the best clock source in a system is a clock source with the best (highest) rating or in other words with the highest [frequency](https://en.wikipedia.org/wiki/Frequency).\n\nFrequency of the [ACPI](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) power management timer is `3.579545 MHz`. Frequency of the [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) is at least `10 MHz`. And the frequency of the [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) depends on processor. For example On older processors, the `Time Stamp Counter` was counting internal processor clock cycles. 
This means its frequency changed when the processor's frequency scaling changed. The situation has changed for newer processors. Newer processors have an `invariant Time Stamp counter` that increments at a constant rate in all operational states of processor. Actually we can get its frequency in the output of the `/proc/cpuinfo`. For example for the first processor in the system:\n\n```\n$ cat /proc/cpuinfo\n...\nmodel name\t: Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz\n...\n```\n\nAnd although Intel manual says that the frequency of the `Time Stamp Counter`, while constant, is not necessarily the maximum qualified frequency of the processor, or the frequency given in the brand string, anyway we may see that it will be much more than frequency of the `ACPI PM` timer or `High Precision Event Timer`. And we can see that the clock source with the best rating or highest frequency is current in the system.\n\nYou can note that besides these three clock sources, we don't see two other clock sources that are already familiar to us in the output of the `/sys/devices/system/clocksource/clocksource0/available_clocksource`. These clock sources are `jiffies` and `refined_jiffies`. We don't see them because this file shows only high resolution clock sources or in other words clock sources with the [CLOCK_SOURCE_VALID_FOR_HRES](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clocksource.h#L113) flag.\n\nAs I already wrote above, we will consider all of these three clock sources in this part. 
We will consider it in order of their initialization or:\n\n* `hpet`;\n* `acpi_pm`;\n* `tsc`.\n\nWe can make sure that the order is exactly like this in the output of the [dmesg](https://en.wikipedia.org/wiki/Dmesg) util:\n\n```\n$ dmesg | grep clocksource\n[    0.000000] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1910969940391419 ns\n[    0.000000] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns\n[    0.094369] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275000 ns\n[    0.186498] clocksource: Switched to clocksource hpet\n[    0.196827] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff, max_idle_ns: 2085701024 ns\n[    1.413685] tsc: Refined TSC clocksource calibration: 3999.981 MHz\n[    1.413688] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x73509721780, max_idle_ns: 881591102108 ns\n[    2.413748] clocksource: Switched to clocksource tsc\n```\n\nThe first clock source is the [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer), so let's start from it.\n\nHigh Precision Event Timer\n--------------------------------------------------------------------------------\n\nThe implementation of the `High Precision Event Timer` for the [x86](https://en.wikipedia.org/wiki/X86) architecture is located in the [arch/x86/kernel/hpet.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/hpet.c) source code file. Its initialization starts from the call of the `hpet_enable` function. This function is called during Linux kernel initialization. 
If we look into the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file, we will see that after all the architecture-specific stuff is initialized, the early console is disabled and the time management subsystem is ready, the following function is called:\n\n```C\nif (late_time_init)\n\tlate_time_init();\n```\n\nwhich does initialization of the late architecture specific timers after the early jiffy counter is already initialized. The definition of the `late_time_init` function for the `x86` architecture is located in the [arch/x86/kernel/time.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/time.c) source code file. It looks pretty easy:\n\n```C\nstatic __init void x86_late_time_init(void)\n{\n\tx86_init.timers.timer_init();\n\ttsc_init();\n}\n```\n\nAs we may see, it does initialization of the `x86` related timer and initialization of the `Time Stamp Counter`. We will look at the second one later in this part, but now let's consider the call of the `x86_init.timers.timer_init` function. The `timer_init` points to the `hpet_time_init` function from the same source code file. 
We can verify this by looking at the definition of the `x86_init` structure from the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c):\n\n```C\nstruct x86_init_ops x86_init __initdata = {\n   ...\n   ...\n   ...\n   .timers = {\n\t\t.setup_percpu_clockev\t= setup_boot_APIC_clock,\n\t\t.timer_init\t\t= hpet_time_init,\n\t\t.wallclock_init\t\t= x86_init_noop,\n   },\n   ...\n   ...\n   ...\n```\n\nThe `hpet_time_init` function does setup of the [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer) if we can not enable `High Precision Event Timer` and sets up the default timer [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) for the enabled timer:\n\n```C\nvoid __init hpet_time_init(void)\n{\n\tif (!hpet_enable())\n\t\tsetup_pit_timer();\n\tsetup_default_timer_irq();\n}\n```\n\nFirst of all the `hpet_enable` function checks whether we can enable `High Precision Event Timer` in the system by the call of the `is_hpet_capable` function and if we can, we map a virtual address space for it:\n\n```C\nint __init hpet_enable(void)\n{\n\tif (!is_hpet_capable())\n\t\treturn 0;\n\n    hpet_set_mapping();\n}\n```\n\nThe `is_hpet_capable` function checks that we didn't pass `hpet=disable` to the kernel command line and the `hpet_address` is received from the [ACPI HPET](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) table. 
The `hpet_set_mapping` function just maps the virtual address spaces for the timer registers:\n\n```C\nhpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);\n```\n\nAs we can read in the  [IA-PC HPET (High Precision Event Timers) Specification](http://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/software-developers-hpet-spec-1-0a.pdf):\n\n> The timer register space is 1024 bytes\n\nSo, the `HPET_MMAP_SIZE` is `1024` bytes too:\n\n```C\n#define HPET_MMAP_SIZE\t\t1024\n```\n\nAfter we mapped virtual space for the `High Precision Event Timer`, we read `HPET_ID` register to get number of the timers:\n\n```C\nid = hpet_readl(HPET_ID);\n\nlast = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;\n```\n\nWe need to get this number to allocate correct amount of space for the `General Configuration Register` of the `High Precision Event Timer`:\n\n```C\ncfg = hpet_readl(HPET_CFG);\n\nhpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), GFP_KERNEL);\n```\n\nAfter the space is allocated for the configuration register of the `High Precision Event Timer`, we allow to main counter to run, and allow timer interrupts if they are enabled by the setting of `HPET_CFG_ENABLE` bit in the configuration register for all timers. In the end we just register new clock source by the call of the `hpet_clocksource_register` function:\n\n```C\nif (hpet_clocksource_register())\n\tgoto out_nohpet;\n```\n\nwhich just calls already familiar\n\n```C\nclocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);\n```\n\nfunction. 
Where the `clocksource_hpet` is the `clocksource` structure with the rating `250` (remember rating of the previous `refined_jiffies` clock source was `2`), name - `hpet` and `read_hpet` callback for the reading of atomic counter provided by the `High Precision Event Timer`:\n\n```C\nstatic struct clocksource clocksource_hpet = {\n\t.name\t\t= \"hpet\",\n\t.rating\t\t= 250,\n\t.read\t\t= read_hpet,\n\t.mask\t\t= HPET_MASK,\n\t.flags\t\t= CLOCK_SOURCE_IS_CONTINUOUS,\n\t.resume\t\t= hpet_resume_counter,\n\t.archdata\t= { .vclock_mode = VCLOCK_HPET },\n};\n```\n\nAfter the `clocksource_hpet` is registered, we can return to the `hpet_time_init()` function from the [arch/x86/kernel/time.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/time.c) source code file. We can remember that the last step is the call of the:\n\n```C\nsetup_default_timer_irq();\n```\n\nfunction in the `hpet_time_init()`. The `setup_default_timer_irq` function checks existence of `legacy` IRQs or in other words support for the [i8259](https://en.wikipedia.org/wiki/Intel_8259) and sets up [IRQ0](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29#Master_PIC) depending on this.\n\nThat's all. From this moment the [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) clock source is registered in the Linux kernel `clock source` framework and may be used from generic kernel code via the `read_hpet`:\n\n```C\nstatic cycle_t read_hpet(struct clocksource *cs)\n{\n\treturn (cycle_t)hpet_readl(HPET_COUNTER);\n}\n```\n\nfunction which just reads and returns atomic counter from the `Main Counter Register`.\n\nACPI PM timer\n--------------------------------------------------------------------------------\n\nThe second clock source is [ACPI Power Management Timer](http://uefi.org/sites/default/files/resources/ACPI_5.pdf). 
Implementation of this clock source is located in the [drivers/clocksource/acpi_pm.c](https://github.com/torvalds/linux/blob/master/drivers/clocksource/acpi_pm.c) source code file and starts from the call of the `init_acpi_pm_clocksource` function during `fs` [initcall](https://kernelnewbies.org/Documents/InitcallMechanism).\n\nIf we will look at implementation of the `init_acpi_pm_clocksource` function, we will see that it starts from the check of the value of `pmtmr_ioport` variable:\n\n```C\nstatic int __init init_acpi_pm_clocksource(void)\n{\n    ...\n    ...\n    ...\n\tif (!pmtmr_ioport)\n\t\treturn -ENODEV;\n    ...\n    ...\n    ...\n```\n\nThis `pmtmr_ioport` variable contains extended address of the `Power Management Timer Control Register Block`. It gets its value in the `acpi_parse_fadt` function which is defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c) source code file. This function parses `FADT` or `Fixed ACPI Description Table` [ACPI](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) table and tries to get the values of the `X_PM_TMR_BLK` field which contains extended address of the `Power Management Timer Control Register Block`, represented in `Generic Address Structure` format:\n\n```C\nstatic int __init acpi_parse_fadt(struct acpi_table_header *table)\n{\n#ifdef CONFIG_X86_PM_TIMER\n        ...\n        ...\n        ...\n\t\tpmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;\n        ...\n        ...\n        ...\n#endif\n\treturn 0;\n}\n```\n\nSo, if the `CONFIG_X86_PM_TIMER` Linux kernel configuration option is disabled or something going wrong in the `acpi_parse_fadt` function, we can't access the `Power Management Timer` register and return from the `init_acpi_pm_clocksource`. 
Otherwise, if the value of the `pmtmr_ioport` variable is not zero, we check rate of this timer and register this clock source by the call of the:\n\n```C\nclocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);\n```\n    \nfunction. After the call of the `clocksource_register_hz`, the `acpi_pm` clock source will be registered in the `clocksource` framework of the Linux kernel:\n\n```C\nstatic struct clocksource clocksource_acpi_pm = {\n\t.name\t\t= \"acpi_pm\",\n\t.rating\t\t= 200,\n\t.read\t\t= acpi_pm_read,\n\t.mask\t\t= (cycle_t)ACPI_PM_MASK,\n\t.flags\t\t= CLOCK_SOURCE_IS_CONTINUOUS,\n};\n```\n\nwith the rating - `200` and the `acpi_pm_read` callback to read atomic counter provided by the `acpi_pm` clock source. The `acpi_pm_read` function just executes `read_pmtmr` function:\n\n```C\nstatic cycle_t acpi_pm_read(struct clocksource *cs)\n{\n\treturn (cycle_t)read_pmtmr();\n}\n```\n\nwhich reads value of the `Power Management Timer` register. This register has following structure:\n\n```\n+-------------------------------+----------------------------------+\n|                               |                                  |\n|  upper eight bits of a        |      running count of the        |\n| 32-bit power management timer |     power management timer       |\n|                               |                                  |\n+-------------------------------+----------------------------------+\n31          E_TMR_VAL           24               TMR_VAL           0\n```\n\nAddress of this register is stored in the `Fixed ACPI Description Table` [ACPI](https://en.wikipedia.org/wiki/Advanced_Configuration_and_Power_Interface) table and we already have it in the `pmtmr_ioport`. So, the implementation of the `read_pmtmr` function is pretty easy:\n\n```C\nstatic inline u32 read_pmtmr(void)\n{\n\treturn inl(pmtmr_ioport) & ACPI_PM_MASK;\n}\n```\n\nWe just read the value of the `Power Management Timer` register and keep only its lower `24` bits.\n\nThat's all. 
Now we move to the last clock source in this part - `Time Stamp Counter`.\n\nTime Stamp Counter\n--------------------------------------------------------------------------------\n\nThe third and last clock source in this part is - [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) clock source and its implementation is located in the [arch/x86/kernel/tsc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/tsc.c) source code file. We already saw the `x86_late_time_init` function in this part and initialization of the [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter) starts from this place. This function calls the `tsc_init()` function from the [arch/x86/kernel/tsc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/tsc.c) source code file.\n\nAt the beginning of the `tsc_init` function we can see a check that a processor has support of the `Time Stamp Counter`:\n\n```C\nvoid __init tsc_init(void)\n{\n\tu64 lpj;\n\tint cpu;\n\n\tif (!cpu_has_tsc) {\n\t\tsetup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);\n\t\treturn;\n\t}\n    ...\n    ...\n    ...\n```\n\nThe `cpu_has_tsc` macro expands to the call of the `cpu_has` macro:\n\n```C\n#define cpu_has_tsc\t\tboot_cpu_has(X86_FEATURE_TSC)\n\n#define boot_cpu_has(bit)\tcpu_has(&boot_cpu_data, bit)\n\n#define cpu_has(c, bit)\t\t\t\t\t\t\t\\\n\t(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :\t\\\n\t test_cpu_cap(c, bit))\n```\n\nwhich checks the given bit (the `X86_FEATURE_TSC` in our case) in the `boot_cpu_data` array which is filled during early Linux kernel initialization. 
If the processor has support of the `Time Stamp Counter`, we get the frequency of the `Time Stamp Counter` by the call of the `calibrate_tsc` function from the same source code file which tries to get frequency from different sources like [Model Specific Register](https://en.wikipedia.org/wiki/Model-specific_register), calibrate over [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer) and so on. After this we initialize frequency and scale factor for all the processors in the system:\n\n```C\ntsc_khz = x86_platform.calibrate_tsc();\ncpu_khz = tsc_khz;\n\nfor_each_possible_cpu(cpu) {\n\tcyc2ns_init(cpu);\n\tset_cyc2ns_scale(cpu_khz, cpu);\n}\n```\n\nbecause only first bootstrap processor will call the `tsc_init`. After this we check that `Time Stamp Counter` is not disabled:\n\n```\nif (tsc_disabled > 0)\n\treturn;\n...\n...\n...\ncheck_system_tsc_reliable();\n```\n\nand call the `check_system_tsc_reliable` function which sets the `tsc_clocksource_reliable` if bootstrap processor has the `X86_FEATURE_TSC_RELIABLE` feature. Note that we went through the `tsc_init` function, but did not register our clock source. Actual registration of the `Time Stamp Counter` clock source occurs in the:\n\n```C\nstatic int __init init_tsc_clocksource(void)\n{\n\tif (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)\n\t\treturn 0;\n    ...\n    ...\n    ...\n    if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {\n\t\tclocksource_register_khz(&clocksource_tsc, tsc_khz);\n\t\treturn 0;\n\t}\n```\n\nfunction. This function is called during the `device` [initcall](https://kernelnewbies.org/Documents/InitcallMechanism). 
We do it to be sure that the `Time Stamp Counter` clock source will be registered after the  [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) clock source.\n\nAfter these all three clock sources will be registered in the `clocksource` framework and the `Time Stamp Counter` clock source will be selected as active, because it has the highest rating among other clock sources:\n\n```C\nstatic struct clocksource clocksource_tsc = {\n\t.name                   = \"tsc\",\n\t.rating                 = 300,\n\t.read                   = read_tsc,\n\t.mask                   = CLOCKSOURCE_MASK(64),\n\t.flags                  = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY,\n\t.archdata               = { .vclock_mode = VCLOCK_TSC },\n};\n```\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the sixth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) that describes timers and timer management related stuff in the Linux kernel. In the previous part got acquainted with the `clockevents` framework. In this part we continued to learn time management related stuff in the Linux kernel and saw a little about three different clock sources which are used in the [x86](https://en.wikipedia.org/wiki/X86) architecture. The next part will be last part of this [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) and we will see some user space related stuff, i.e. how some time related [system calls](https://en.wikipedia.org/wiki/System_call) implemented in the Linux kernel.\n\nIf you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. 
If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\nLinks\n--------------------------------------------------------------------------------\n\n* [x86](https://en.wikipedia.org/wiki/X86)\n* [sysfs](https://en.wikipedia.org/wiki/Sysfs)\n* [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n* [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer)\n* [ACPI Power Management Timer (PDF)](http://uefi.org/sites/default/files/resources/ACPI_5.pdf)\n* [frequency](https://en.wikipedia.org/wiki/Frequency).\n* [dmesg](https://en.wikipedia.org/wiki/Dmesg)\n* [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer)\n* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) \n* [IA-PC HPET (High Precision Event Timers) Specification](http://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/software-developers-hpet-spec-1-0a.pdf)\n* [IRQ0](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29#Master_PIC)\n* [i8259](https://en.wikipedia.org/wiki/Intel_8259)\n* [initcall](https://kernelnewbies.org/Documents/InitcallMechanism)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-5)\n"
  },
  {
    "path": "Timers/linux-timers-7.md",
    "content": "Timers and time management in the Linux kernel. Part 7.\n================================================================================\n\nTime related system calls in the Linux kernel\n--------------------------------------------------------------------------------\n\nThis is the seventh and last part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/), which describes timers and time management related stuff in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-6), we discussed timers in the context of [x86_64](https://en.wikipedia.org/wiki/X86-64): [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) and [Time Stamp Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter). Internal time management is an interesting part of the Linux kernel, but of course not only the kernel needs the `time` concept. Our programs also need to know time. In this part, we will consider implementation of some time management related [system calls](https://en.wikipedia.org/wiki/System_call). These system calls are:\n\n* `clock_gettime`;\n* `gettimeofday`;\n* `nanosleep`.\n\nWe will start from a simple userspace [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) program and see the whole way from the call of the [standard library](https://en.wikipedia.org/wiki/Standard_library) function to the implementation of certain system calls. As each [architecture](https://github.com/torvalds/linux/tree/master/arch) provides its own implementation of certain system calls, we will consider only [x86_64](https://en.wikipedia.org/wiki/X86-64) specific implementations of system calls, as this book is related to this architecture.\n\nAdditionally, we will not consider the concept of system calls in this part, but only implementations of these three system calls in the Linux kernel. 
If you are interested in what is a `system call`, there is a special [chapter](https://0xax.gitbook.io/linux-insides/summary/syscall) about this.\n\nSo, let's start from the `gettimeofday` system call.\n\nImplementation of the `gettimeofday` system call\n--------------------------------------------------------------------------------\n\nAs we can understand from the name `gettimeofday`, this function returns the current time. First of all, let's look at the following simple example:\n\n```C\n#include <time.h>\n#include <sys/time.h>\n#include <stdio.h>\n\nint main(int argc, char **argv)\n{\n    char buffer[40];\n    struct timeval time;\n        \n    gettimeofday(&time, NULL);\n\n    strftime(buffer, 40, \"Current date/time: %m-%d-%Y/%T\", localtime(&time.tv_sec));\n    printf(\"%s\\n\",buffer);\n\n    return 0;\n}\n```\n\nAs you can see, here we call the `gettimeofday` function, which takes two parameters. The first parameter is a pointer to the `timeval` structure, which represents an elapsed time:\n\n```C\nstruct timeval {\n    time_t      tv_sec;     /* seconds */\n    suseconds_t tv_usec;    /* microseconds */\n};\n```\n\nThe second parameter of the `gettimeofday` function is a pointer to the `timezone` structure which represents a timezone. In our example, we pass address of the `timeval time` to the `gettimeofday` function, the Linux kernel fills the given `timeval` structure and returns it back to us. Additionally, we format the time with the `strftime` function to get something more human readable than elapsed microseconds. Let's see the result:\n\n```C\n~$ gcc date.c -o date\n~$ ./date\nCurrent date/time: 03-26-2016/16:42:02\n```\n\nAs you may already know, a userspace application does not call a system call directly from the kernel space. Before the actual system call entry will be called, we call a function from the standard library. In my case it is [glibc](https://en.wikipedia.org/wiki/GNU_C_Library), so I will consider this case. 
The implementation of the `gettimeofday` function is located in the [sysdeps/unix/sysv/linux/x86/gettimeofday.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86/gettimeofday.c;h=36f7c26ffb0e818709d032c605fec8c4bd22a14e;hb=HEAD) source code file. As you already may know, the `gettimeofday` is not a usual system call. It is located in the special area which is called `vDSO` (you can read more about it in the [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-3), which describes this concept).\n\nThe `glibc` implementation of `gettimeofday` tries to resolve the given symbol; in our case this symbol is `__vdso_gettimeofday` by the call of the `_dl_vdso_vsym` internal function. If the symbol cannot be resolved, it returns `NULL` and we fallback to the call of the usual system call:\n\n```C\nreturn (_dl_vdso_vsym (\"__vdso_gettimeofday\", &linux26)\n  ?: (void*) (&__gettimeofday_syscall));\n```\n\nThe `gettimeofday` entry is located in the [arch/x86/entry/vdso/vclock_gettime.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vclock_gettime.c) source code file. 
As we can see the `gettimeofday` is a weak alias of the `__vdso_gettimeofday`:\n\n```C\nint gettimeofday(struct timeval *, struct timezone *)\n\t__attribute__((weak, alias(\"__vdso_gettimeofday\")));\n```\n\nThe `__vdso_gettimeofday` is defined in the same source code file and calls the `do_realtime` function if the given `timeval` is not null:\n\n```C\nnotrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)\n{\n\tif (likely(tv != NULL)) {\n\t\tif (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))\n\t\t\treturn vdso_fallback_gtod(tv, tz);\n\t\ttv->tv_usec /= 1000;\n\t}\n\tif (unlikely(tz != NULL)) {\n\t\ttz->tz_minuteswest = gtod->tz_minuteswest;\n\t\ttz->tz_dsttime = gtod->tz_dsttime;\n\t}\n\n\treturn 0;\n}\n```\n\nIf the `do_realtime` will fail, we fallback to the real system call via call the `syscall` instruction and passing the `__NR_gettimeofday` system call number and the given `timeval` and `timezone`:\n\n```C\nnotrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)\n{\n\tlong ret;\n\n\tasm(\"syscall\" : \"=a\" (ret) :\n\t    \"0\" (__NR_gettimeofday), \"D\" (tv), \"S\" (tz) : \"memory\");\n\treturn ret;\n}\n```\n\nThe `do_realtime` function gets the time data from the `vsyscall_gtod_data` structure which is defined in the [arch/x86/include/asm/vgtod.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/vgtod.h#L16) header file and contains mapping of the `timespec` structure and a couple of fields which are related to the current clock source in the system. 
This function fills the given `timeval` structure with values from the `vsyscall_gtod_data` which contains time related data which is updated via timer interrupt.\n\nFirst of all we try to access the `gtod` (`global time of day`) - the `vsyscall_gtod_data` structure - via the call of the `gtod_read_begin` and will continue to do it until it is successful:\n\n```C\ndo {\n\tseq = gtod_read_begin(gtod);\n\tmode = gtod->vclock_mode;\n\tts->tv_sec = gtod->wall_time_sec;\n\tns = gtod->wall_time_snsec;\n\tns += vgetsns(&mode);\n\tns >>= gtod->shift;\n} while (unlikely(gtod_read_retry(gtod, seq)));\n\nts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);\nts->tv_nsec = ns;\n```\n\nAs we got access to the `gtod`, we fill the `ts->tv_sec` with the `gtod->wall_time_sec` which stores current time in seconds gotten from the [real time clock](https://en.wikipedia.org/wiki/Real-time_clock) during initialization of the timekeeping subsystem in the Linux kernel and the same value but in nanoseconds. In the end of this code we just fill the given `timespec` structure with the resulting values.\n\nThat's all about the `gettimeofday` system call. The next system call in our list is the `clock_gettime`.\n\nImplementation of the clock_gettime system call\n--------------------------------------------------------------------------------\n\nThe `clock_gettime` function gets the time of the clock which is specified by the first parameter and stores it in the structure given by the second parameter. 
Generally the `clock_gettime` function takes two parameters:\n\n* `clk_id` - clock identifier;\n* `timespec` - address of the `timespec` structure which represents elapsed time.\n\nLet's look at the following simple example:\n\n```C\n#include <time.h>\n#include <sys/time.h>\n#include <stdio.h>\n\nint main(int argc, char **argv)\n{\n    struct timespec elapsed_from_boot;\n\n    clock_gettime(CLOCK_BOOTTIME, &elapsed_from_boot);\n\n    printf(\"%d - seconds elapsed from boot\\n\", elapsed_from_boot.tv_sec);\n    \n    return 0;\n}\n```\n\nwhich prints `uptime` information:\n\n```C\n~$ gcc uptime.c -o uptime\n~$ ./uptime\n14180 - seconds elapsed from boot\n```\n\nWe can easily check the result with the help of the [uptime](https://en.wikipedia.org/wiki/Uptime#Using_uptime) util:\n\n```\n~$ uptime\nup  3:56\n```\n\nThe `elapsed_from_boot.tv_sec` represents elapsed time in seconds, so:\n\n```python\n>>> 14180 / 60\n236\n>>> 14180 / 60 / 60\n3\n>>> 14180 / 60 % 60\n56\n```\n\nThe `clk_id` may be one of the following:\n\n* `CLOCK_REALTIME` - system wide clock which measures real or wall-clock time;\n* `CLOCK_REALTIME_COARSE` - faster version of the `CLOCK_REALTIME`;\n* `CLOCK_MONOTONIC` - represents monotonic time since some unspecified starting point; \n* `CLOCK_MONOTONIC_COARSE` - faster version of the `CLOCK_MONOTONIC`;\n* `CLOCK_MONOTONIC_RAW` - the same as the `CLOCK_MONOTONIC` but provides non [NTP](https://en.wikipedia.org/wiki/Network_Time_Protocol) adjusted time. \n* `CLOCK_BOOTTIME` - the same as the `CLOCK_MONOTONIC` but plus time that the system was suspended;\n* `CLOCK_PROCESS_CPUTIME_ID` - per-process time consumed by all threads in the process;\n* `CLOCK_THREAD_CPUTIME_ID` - thread-specific clock.\n\nThe `clock_gettime` is not a usual syscall either; like the `gettimeofday`, this system call is placed in the `vDSO` area. 
Entry of this system call is located in the same source code file - [arch/x86/entry/vdso/vclock_gettime.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vclock_gettime.c) as for `gettimeofday`.\n\nThe implementation of the `clock_gettime` depends on the clock id. If we have passed the `CLOCK_REALTIME` clock id, the `do_realtime` function will be called:\n\n```C\nnotrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)\n{\n\tswitch (clock) {\n\tcase CLOCK_REALTIME:\n\t\tif (do_realtime(ts) == VCLOCK_NONE)\n\t\t\tgoto fallback;\n\t\tbreak;\n    ...\n    ...\n    ...\nfallback:\n\treturn vdso_fallback_gettime(clock, ts);\n}\n```\n\nIn other cases, the `do_{name_of_clock_id}` function is called. Implementations of some of them are similar. For example if we pass the `CLOCK_MONOTONIC` clock id:\n\n```C\n...\n...\n...\ncase CLOCK_MONOTONIC:\n\tif (do_monotonic(ts) == VCLOCK_NONE)\n\t\tgoto fallback;\n\tbreak;\n...\n...\n...\n```\n\nthe `do_monotonic` function will be called which is very similar to the implementation of the `do_realtime`:\n\n```C\nnotrace static int __always_inline do_monotonic(struct timespec *ts)\n{\n\tdo {\n\t\tseq = gtod_read_begin(gtod);\n\t\tmode = gtod->vclock_mode;\n\t\tts->tv_sec = gtod->monotonic_time_sec;\n\t\tns = gtod->monotonic_time_snsec;\n\t\tns += vgetsns(&mode);\n\t\tns >>= gtod->shift;\n\t} while (unlikely(gtod_read_retry(gtod, seq)));\n\n\tts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);\n\tts->tv_nsec = ns;\n\n\treturn mode;\n}\n```\n\nWe already saw a little about the implementation of this function in the previous paragraph about the `gettimeofday`. 
There is only one difference here, that the `sec` and `nsec` of our `timespec` value will be based on the `gtod->monotonic_time_sec` instead of `gtod->wall_time_sec` which maps the value of the `tk->tkr_mono.xtime_nsec` or number of [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond) elapsed.\n\nThat's all.\n\nImplementation of the `nanosleep` system call\n--------------------------------------------------------------------------------\n\nThe last system call in our list is the `nanosleep`. As you can understand from its name, this function provides `sleeping` ability. Let's look on the following simple example:\n\n```C\n#include <time.h>\n#include <stdlib.h>\n#include <stdio.h>\n\nint main (void)\n{    \n   struct timespec ts = {5,0};\n\n   printf(\"sleep five seconds\\n\");\n   nanosleep(&ts, NULL);\n   printf(\"end of sleep\\n\");\n\n   return 0;\n}\n```\n\nIf we will compile and run it, we will see the first line\n\n```\n~$ gcc sleep_test.c -o sleep\n~$ ./sleep\nsleep five seconds\nend of sleep\n```\n\nand the second line after five seconds.\n\nThe `nanosleep` is not located in the `vDSO` area like the `gettimeofday` and the `clock_gettime` functions. So, let's look how the `real` system call which is located in the kernel space will be called by the standard library. The implementation of the `nanosleep` system call will be called with the help of the [syscall](http://www.felixcloutier.com/x86/SYSCALL.html) instruction. 
Before the execution of the `syscall` instruction, parameters of the system call must be put in processor [registers](https://en.wikipedia.org/wiki/Processor_register) according to the order which is described in the [System V Application Binary Interface](http://www.x86-64.org/documentation/abi.pdf) or in other words:\n\n* `rdi` - first parameter;\n* `rsi` - second parameter;\n* `rdx` - third parameter;\n* `r10` - fourth parameter;\n* `r8` - fifth parameter;\n* `r9` - sixth parameter.\n\nThe `nanosleep` system call has two parameters - two pointers to the `timespec` structures. The system call suspends the calling thread until the given timeout has elapsed. Additionally it will finish if a signal interrupts its execution. It takes two parameters, the first is `timespec` which represents timeout for the sleep. The second parameter is the pointer to the `timespec` structure too and it contains remainder of time if the call of the `nanosleep` was interrupted.\n\nAs `nanosleep` has two parameters:\n\n```C\nint nanosleep(const struct timespec *req, struct timespec *rem);\n```\n\nTo call the system call, we need to put the `req` parameter into the `rdi` register, and the `rem` parameter into the `rsi` register. The [glibc](https://en.wikipedia.org/wiki/GNU_C_Library) does this job in the `INTERNAL_SYSCALL` macro which is located in the [sysdeps/unix/sysv/linux/x86_64/sysdep.h](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sysdep.h;h=d023d68174d3dfb4e698160b31ae31ad291802e1;hb=HEAD) header file.\n\n```C\n# define INTERNAL_SYSCALL(name, err, nr, args...) 
\\\n  INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)\n```\n\nwhich takes the name of the system call, storage for possible error during execution of system call, number of the system call (all `x86_64` system calls you can find in the [system calls table](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl)) and arguments of certain system call. The `INTERNAL_SYSCALL` macro just expands to the call of the `INTERNAL_SYSCALL_NCS` macro, which prepares arguments of system call (puts them into the processor registers in correct order), executes `syscall` instruction and returns the result:\n\n```C\n# define INTERNAL_SYSCALL_NCS(name, err, nr, args...)      \\\n  ({\t\t\t\t\t\t\t\t\t                                      \\\n    unsigned long int resultvar;\t\t\t\t\t                          \\\n    LOAD_ARGS_##nr (args)\t\t\t\t\t\t                              \\\n    LOAD_REGS_##nr\t\t\t\t\t\t\t                                  \\\n    asm volatile (\t\t\t\t\t\t\t                                  \\\n    \"syscall\\n\\t\"\t\t\t\t\t\t\t                                  \\\n    : \"=a\" (resultvar)\t\t\t\t\t\t\t                              \\\n    : \"0\" (name) ASM_ARGS_##nr : \"memory\", REGISTERS_CLOBBERED_BY_SYSCALL);   \\\n    (long int) resultvar; })\n```\n\nThe `LOAD_ARGS_##nr` macro calls the `LOAD_ARGS_N` macro where the `N` is number of arguments of the system call. In our case, it will be the `LOAD_ARGS_2` macro. 
Ultimately all of these macros will be expanded to the following:\n\n```C\n# define LOAD_REGS_TYPES_1(t1, a1)\t\t\t\t\t   \\\n  register t1 _a1 asm (\"rdi\") = __arg1;\t\t\t\t\t   \\\n  LOAD_REGS_0\n\n# define LOAD_REGS_TYPES_2(t1, a1, t2, a2)\t\t\t\t   \\\n  register t2 _a2 asm (\"rsi\") = __arg2;\t\t\t\t\t   \\\n  LOAD_REGS_TYPES_1(t1, a1)\n...\n...\n...\n```\n\nAfter the `syscall` instruction will be executed, the [context switch](https://en.wikipedia.org/wiki/Context_switch) will occur and the kernel will transfer execution to the system call handler. The system call handler for the `nanosleep` system call is located in the [kernel/time/hrtimer.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/hrtimer.c) source code file and defined with the `SYSCALL_DEFINE2` macro helper:\n\n```C\nSYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,\n\t\tstruct timespec __user *, rmtp)\n{\n\tstruct timespec tu;\n\n\tif (copy_from_user(&tu, rqtp, sizeof(tu)))\n\t\treturn -EFAULT;\n\n\tif (!timespec_valid(&tu))\n\t\treturn -EINVAL;\n\n\treturn hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);\n}\n```\n\nMore about the `SYSCALL_DEFINE2` macro you may read in the [chapter](https://0xax.gitbook.io/linux-insides/summary/syscall) about system calls. If we look at the implementation of the `nanosleep` system call, first of all we will see that it starts from the call of the `copy_from_user` function. This function copies the given data from the userspace to kernelspace. 
In our case we copy the timeout value for the sleep to the kernelspace `timespec` structure and check that the given `timespec` is valid by the call of the `timespec_valid` function:\n\n```C\nstatic inline bool timespec_valid(const struct timespec *ts)\n{\n\tif (ts->tv_sec < 0)\n\t\treturn false;\n\tif ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)\n\t\treturn false;\n\treturn true;\n}\n```\n\nwhich just checks that the given `timespec` does not represent a date before `1970` and nanoseconds do not overflow `1` second. The `nanosleep` function ends with the call of the `hrtimer_nanosleep` function from the same source code file. The `hrtimer_nanosleep` function creates a [timer](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-4) and calls the `do_nanosleep` function. The `do_nanosleep` does the main job for us. This function provides a loop:\n\n```C\ndo {\n\tset_current_state(TASK_INTERRUPTIBLE);\n\thrtimer_start_expires(&t->timer, mode);\n\n\tif (likely(t->task))\n\t\tfreezable_schedule();\n    \n} while (t->task && !signal_pending(current));\n\n__set_current_state(TASK_RUNNING);\nreturn t->task == NULL;\n```\n\nWhich freezes the current task during sleep. After we set the `TASK_INTERRUPTIBLE` flag for the current task, the `hrtimer_start_expires` function starts the given high-resolution timer on the current processor. When the given high-resolution timer expires, the task will be running again.\n\nThat's all.\n\nConclusion\n--------------------------------------------------------------------------------\n\nThis is the end of the seventh part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) that describes timers and timer management related stuff in the Linux kernel. In the previous part we saw [x86_64](https://en.wikipedia.org/wiki/X86-64) specific clock sources. As I wrote in the beginning, this part is the last part of this chapter. 
We saw important time management related concepts like the `clocksource` and `clockevents` frameworks, the `jiffies` counter, etc., in this chapter. Of course this does not cover all of the time management in the Linux kernel. Many parts of it are mostly related to scheduling, which we will see in another chapter. \n\nIf you have questions or suggestions, feel free to ping me on twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).\n\n**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**\n\n\nLinks\n--------------------------------------------------------------------------------\n\n* [system call](https://en.wikipedia.org/wiki/System_call)\n* [C programming language](https://en.wikipedia.org/wiki/C_%28programming_language%29)\n* [standard library](https://en.wikipedia.org/wiki/Standard_library)\n* [glibc](https://en.wikipedia.org/wiki/GNU_C_Library)\n* [real time clock](https://en.wikipedia.org/wiki/Real-time_clock)\n* [NTP](https://en.wikipedia.org/wiki/Network_Time_Protocol)\n* [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond)\n* [register](https://en.wikipedia.org/wiki/Processor_register)\n* [System V Application Binary Interface](http://www.x86-64.org/documentation/abi.pdf)\n* [context switch](https://en.wikipedia.org/wiki/Context_switch)\n* [Introduction to timers in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-4)\n* [uptime](https://en.wikipedia.org/wiki/Uptime#Using_uptime)\n* [system calls table for x86_64](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/syscalls/syscall_64.tbl)\n* [High Precision Event Timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer)\n* [Time Stamp 
Counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter)\n* [x86_64](https://en.wikipedia.org/wiki/X86-64)\n* [previous part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-6)\n"
  },
  {
    "path": "book-A5.json",
    "content": "{\n    \"title\": \"Linux Insides\",\n    \"author\" : \"0xAX\",\n    \"pdf\": {\n        \"paperSize\": \"a5\",\n        \"margin\":\n        {\n            \"top\": 48,\n            \"bottom\": 48,\n            \"right\": 28,\n            \"left\": 28\n        }\n    }\n}\n"
  },
  {
    "path": "book.json",
    "content": "{\n    \"title\": \"Linux Insides\",\n    \"author\" : \"0xAX\"\n}\n"
  },
  {
    "path": "contributors.md",
    "content": "# Contributors\n\nSpecial thanks to all the people who helped to develop this project:\n\n* [Akash Shende](https://github.com/akash0x53)\n* [Jakub Kramarz](https://github.com/jkramarz)\n* [ckrooss](https://github.com/ckrooss)\n* [ecksun](https://github.com/ecksun)\n* [Maciek Makowski](https://github.com/mmakowski)\n* [Thomas Marcelis](https://github.com/ThomasMarcelis)\n* [Chris Costes](https://github.com/ccostes)\n* [nathansoz](https://github.com/nathansoz)\n* [RubanDeventhiran](https://github.com/RubanDeventhiran)\n* [fuzhli](https://github.com/fuzhli)\n* [andars](https://github.com/andars)\n* [Alexandru Pana](https://github.com/alexpana)\n* [Bogdan Rădulescu](https://github.com/bogdanr)\n* [zil](https://github.com/zil)\n* [codelitt](https://github.com/codelitt)\n* [gulyasm](https://github.com/gulyasm)\n* [alx741](https://github.com/alx741)\n* [Haddayn](https://github.com/Haddayn)\n* [Daniel Campoverde Carrión](https://github.com/alx741)\n* [Guillaume Gomez](https://github.com/GuillaumeGomez)\n* [Leandro Moreira](https://github.com/leandromoreira)\n* [Jonatan Pålsson](https://github.com/jonte)\n* [George Horrell](https://github.com/georgehorrell)\n* [Ciro Santilli](https://github.com/cirosantilli)\n* [Kevin Soules](https://github.com/eax64)\n* [Fabio Pozzi](https://github.com/fabiopozzi)\n* [Kevin Swinton](https://github.com/kevinjswinton)\n* [Leandro Moreira](https://github.com/leandromoreira)\n* [LYF610400210](https://github.com/LYF610400210)\n* [Cam Cope](https://github.com/ccope)\n* [Miquel Sabaté Solà](https://github.com/mssola)\n* [Michael Aquilina](https://github.com/MichaelAquilina)\n* [Gabriel Sullice](https://github.com/gabesullice)\n* [Michael Drüing](https://github.com/darkstar)\n* [Alexander Polakov](https://github.com/polachok)\n* [Anton Davydov](https://github.com/davydovanton)\n* [Arpan Kapoor](https://github.com/arpankapoor)\n* [Brandon Fosdick](https://github.com/bfoz)\n* [Ashleigh 
Newman-Jones](https://github.com/anewmanjones)\n* [Terrell Russell](https://github.com/trel)\n* [Mario](https://github.com/bedna-KU)\n* [Ewoud Kohl van Wijngaarden](https://github.com/ekohl)\n* [Jochen Maes](https://github.com/sejo)\n* [Brother-Lal](https://github.com/Brother-Lal)\n* [Brian McKenna](https://github.com/puffnfresh)\n* [Josh Triplett](https://github.com/joshtriplett)\n* [James Flowers](https://github.com/comjf)\n* [Alexander Harding](https://github.com/aeharding)\n* [Dzmitry Plashchynski](https://github.com/plashchynski)\n* [Simarpreet Singh](https://github.com/simar7)\n* [umatomba](https://github.com/umatomba)\n* [Vaibhav Tulsyan](https://github.com/xennygrimmato)\n* [Brandon Wamboldt](https://github.com/brandonwamboldt)\n* [Maxime Leboeuf](https://github.com/leboeuf)\n* [Maximilien Richer](https://github.com/halfa)\n* [marmeladema](https://github.com/marmeladema)\n* [Anisse Astier](https://github.com/anisse)\n* [TheCodeArtist](https://github.com/TheCodeArtist)\n* [Ehsun N](https://github.com/imehsunn)\n* [Adam Shannon](https://github.com/adamdecaf)\n* [Donny Nadolny](https://github.com/dnadolny)\n* [Ehsun N](https://github.com/imehsunn)\n* [Waqar Ahmed](https://github.com/Waqar144)\n* [Ian Miell](https://github.com/ianmiell)\n* [DongLiang Mu](https://github.com/mudongliang)\n* [Johan Manuel](https://github.com/29jm)\n* [Brian Rak](https://github.com/brakthehack)\n* [Robin Peiremans](https://github.com/rpeiremans)\n* [xiaoqiang zhao](https://github.com/hitmoon)\n* [aouelete](https://github.com/aouelete)\n* [Dennis Birkholz](https://github.com/dennisbirkholz)\n* [Anton Tyurin](https://github.com/noxiouz)\n* [Bogdan Kulbida](https://github.com/kulbida)\n* [Matt Hudgins](https://github.com/mhudgins)\n* [Ruth Grace Wong](https://github.com/ruthgrace)\n* [Jeremy Lacomis](https://github.com/jlacomis)\n* [Dubyah](https://github.com/Dubyah)\n* [Matthieu Tardy](https://github.com/c0riolis)\n* [michaelian ennis](https://github.com/mennis)\n* [Amitay 
Stern](https://github.com/amist)\n* [Matt Todd](https://github.com/mtodd)\n* [Piyush Pangtey](https://github.com/pangteypiyush)\n* [Alfred Agrell](https://github.com/Alcaro)\n* [Jakub Wilk](https://github.com/jwilk)\n* [Justus Adam](https://github.com/JustusAdam)\n* [Roy Wellington Ⅳ](https://github.com/thanatos)\n* [Jonathan Rennison](https://github.com/JGRennison)\n* [Mack Stump](https://github.com/rmbreak)\n* [Pushpinder Singh](https://github.com/PrinceDhaliwal)\n* [Xiaoqin Hu](https://github.com/huxq)\n* [Jeremy Cline](https://github.com/jeremycline)\n* [Kavindra Nikhurpa](https://github.com/kavi-nikhurpa)\n* [Connor Mullen](https://github.com/mullen3)\n* [Alex Gonzalez](https://github.com/alex-gonz)\n* [Tim Konick](https://github.com/tijko)\n* [Anastas Stoyanovsky](https://github.com/anastasds)\n* [Faiz Halde](https://github.com/7coder7)\n* [Andrew Hayes](https://github.com/AndrewRussellHayes)\n* [Matthew Fernandez](https://github.com/Smattr)\n* [Yoshihiro YUNOMAE](https://github.com/yunomae)\n* [paulch](https://github.com/paulch)\n* [Nathan Dautenhahn](https://github.com/ndauten)\n* [Sachin Patil](https://github.com/psachin)\n* [Stéphan Gorget](https://github.com/phantez)\n* [Adrian Reyes](https://github.com/int3rrupt)\n* [Chandan Rai](https://github.com/crowchirp)\n* [JB Cayrou](https://github.com/jbcayrou)\n* [Cornelius Diekmann](https://github.com/diekmann)\n* [Andrés Rojas](https://github.com/c0r3dump3d)\n* [Beomsu Kim](https://github.com/0xF0D0)\n* [Firo Yang](https://github.com/firogh)\n* [Edward Hu](https://github.com/BDHU)\n* [WarpspeedSCP](https://github.com/WarpspeedSCP)\n* [Gabriela Moldovan](https://github.com/gabi-250)\n* [kuritonasu](https://github.com/kuritonasu/)\n* [Miles Frain](https://github.com/milesfrain)\n* [Horace Heaven](https://github.com/horaceheaven)\n* [Miha Zidar](https://github.com/zidarsk8)\n* [Ivan Kovnatsky](https://github.com/sevenfourk)\n* [Takuya Yamamoto](https://github.com/tkyymmt)\n* 
[Dragonly](https://github.com/dragonly)\n* [Blameying](https://github.com/Blameying)\n* [Junsoo Lee](https://github.com/junsooo)\n* [SeongJae Park](https://github.com/sjp38)\n* [Stefan20162016](https://github.com/stefan20162016)\n* [Marco Torsello](https://github.com/md1512)\n* [Bruno Meneguele](https://github.com/bmeneguele)\n* [Sebastian Fricke](https://github.com/initBasti)\n* [Zhouyi Zhou](https://github.com/zhouzhouyi-hub)\n* [Mingzhe Yang](https://github.com/Mutated1994)\n* [Yuxin Wu](https://github.com/chaffz)\n* [Biao Ding](https://github.com/SmallPond)\n* [Arfy slowy](https://github.com/slowy07)\n* [Junbo Jiang](https://github.com/junbo42)\n* [Dexter Plameras](https://github.com/dexterp)\n* [Jun Duan](https://github.com/waltforme)\n* [Guochao Xie](https://github.com/XieGuochao)\n* [Davide Benini](https://github.com/beninidavide/)\n* [kyselejsyrecek](https://github.com/kyselejsyrecek)\n* [kianmeng](https://github.com/kianmeng)\n"
  },
  {
    "path": "lychee.toml",
    "content": "# Lychee link checker configuration\n# See https://github.com/lycheeverse/lychee for all options\n\n# Maximum number of retries per link\nmax_retries = 3\n\n# Timeout per request in seconds\ntimeout = 20\n\n# Exclude these URLs from checking (regex patterns)\nexclude = [\n    \"twitter\\\\.com\",\n    \"x\\\\.com\",\n    \"osdev\\\\.org\",\n    \"intel.com/*\"\n]\n\n# Accept status codes as valid\naccept = [\n    200,   # OK\n    204,   # No Content\n    206,   # Partial Content (for range requests)\n]\n\n# User agent string\nuser_agent = \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\"\n\n# Check external links\nscheme = [\"https\", \"http\"]\n"
  },
  {
    "path": "scripts/check_code_snippets.py",
    "content": "\"\"\"\nA script that takes the lines of the Linux kernel source code from the comments\nin the markdown files that are attached to the code and checks their validity.\n\"\"\"\nimport os\nimport re\nimport sys\nfrom typing import Optional, Tuple\n\nimport requests\n\nexclude_dirs = [\"./.github\"]\n\ndef __split_url_and_range__(url: str) -> Tuple[str, Optional[int], Optional[int]]:\n    base, frag = url.split(\"#\", 1)\n    m = re.match(r'L(\\d+)(?:-L?(\\d+))?$', frag)\n    start = int(m.group(1))\n    end = int(m.group(2)) if m.group(2) else None\n    return base, start, end\n\ndef __fetch_raw__(source: str) -> str:\n    r = requests.get(source, timeout=5.0)\n    return r.text\n\ndef __handle_md__(md: str):\n    in_code = False\n    code = ''\n    content = ''\n\n    md_lines = md.splitlines()\n\n    for line in md_lines:\n        if in_code:\n            if re.search(\"^```[a-zA-Z].*\", line):\n                continue\n\n            if re.search(\"^```$\", line):\n                in_code = False\n                continue\n\n            code += line + '\\n'\n            continue\n\n        if line.startswith(\"<!--\"):\n            in_code = True\n            (uri, start, end) = __split_url_and_range__(line.split(' ')[1])\n            content = \"\\n\".join(__fetch_raw__(uri).splitlines()[start-1:end]).rstrip()\n            continue\n\n        if code != '':\n            if code.rstrip() != content:\n                print(\"Error in\", sys.argv[1])\n                print(\"Code in book:\")\n                print(code)\n                print(\"Code from github:\")\n                print(content)\n                sys.exit(1)\n\n            code = ''\n            content = ''\n            continue\n\ndef __main__():\n    md_files = []\n\n    for root, _dirs, files in os.walk(sys.argv[1]):\n        for name in files:\n            if name.endswith('.md'):\n                md_files.append(os.path.join(root, name))\n            else:\n                
continue\n\n    for md in md_files:\n        print(\"Checking code in the\", md)\n        if os.path.dirname(md) in exclude_dirs:\n            continue\n\n        with open(md, \"r\", encoding=\"utf-8\") as f:\n            md = f.read()\n\n        __handle_md__(md)\n\nif __name__ == \"__main__\":\n    __main__()\n"
  }
]