Repository: certik/fastGPT
Branch: main
Commit: db135c7d2f4b
Files: 31
Total size: 67.0 KB

Directory structure:
gitextract_nupenw_f/

├── .github/
│   └── workflows/
│       └── CI.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── build.sh
├── chat.f90
├── ci/
│   ├── build.sh
│   └── build_lfortran.sh
├── cmake/
│   ├── FindOMP.cmake
│   ├── FindOPENBLAS.cmake
│   └── UserOverride.cmake
├── comparison/
│   └── encode_input.py
├── create_model.py
├── driver.f90
├── environment.yml
├── fpm.toml
├── gpt2.f90
├── input
├── linalg_accelerate.c
├── linalg_c.f90
├── linalg_f.f90
├── linalg_openblas.c
├── main.f90
├── omp.f90
├── omp_dummy.f90
├── pt.py
├── tests/
│   ├── test_basic_input.f90
│   ├── test_chat.f90
│   └── test_more_inputs.f90
└── tokenizer.f90

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/CI.yml
================================================
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main


jobs:

  gfortran:
    name: GFortran (${{ matrix.os }})
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["macos-latest", "ubuntu-latest"]

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: mamba-org/setup-micromamba@v2.0.2
        with:
          micromamba-version: '2.0.4-0'
          environment-file: environment.yml
          create-args: >-
            ${{ matrix.os == 'macos-latest' && 'gfortran=14.2.0' || '' }}

      - name: Install GGUF
        shell: bash -e -x -l {0}
        run: |
            git clone https://github.com/ggerganov/llama.cpp
            cd llama.cpp
            git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
            cd gguf-py
            pip install .
            cd ../..

      - name: Build and run
        shell: bash -l {0}
        run: |
            ci/build.sh

  lfortran:
    name: LFortran (${{ matrix.os }})
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["macos-latest", "ubuntu-latest"]

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - uses: mamba-org/setup-micromamba@v2.0.2
        with:
          micromamba-version: '2.0.4-0'
          environment-file: environment.yml
          create-args: >-
            lfortran=0.60.0
            ${{ matrix.os == 'ubuntu-latest' && 'llvm-openmp=11.1.0' || '' }}

      - name: Build and run
        shell: bash -l {0}
        run: |
            ci/build_lfortran.sh


================================================
FILE: .gitignore
================================================
gpt2/
encoder.json
input.dat
model.dat
vocab.bpe


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)

set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_SOURCE_DIR}/cmake/UserOverride.cmake)

if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release
        CACHE STRING "Build type (Debug, Release)")
endif()

project(fastGPT)
enable_language(Fortran)

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/mod_files)

# Make sure that CMAKE_BUILD_TYPE is either Debug or Release:
if (NOT CMAKE_BUILD_TYPE MATCHES "Debug|Release")
    message(FATAL_ERROR "CMAKE_BUILD_TYPE must be one of: Debug, Release (current value: '${CMAKE_BUILD_TYPE}')")
endif ()

if (APPLE)
    set(DEFAULT_FASTGPT_BLAS "Accelerate")
else()
    set(DEFAULT_FASTGPT_BLAS "Fortran")
endif()
set(FASTGPT_BLAS ${DEFAULT_FASTGPT_BLAS}
    CACHE STRING "The BLAS library that fastGPT should use")
if (NOT FASTGPT_BLAS MATCHES "Accelerate|OpenBLAS|Fortran")
    message(FATAL_ERROR "FASTGPT_BLAS must be one of: OpenBLAS, Accelerate, Fortran (current value: '${FASTGPT_BLAS}')")
endif ()
if (FASTGPT_BLAS STREQUAL "Accelerate")
    find_package(OMP)
    if(NOT OMP_FOUND)
        find_package(OpenMP REQUIRED COMPONENTS Fortran)
    endif()
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    find_package(OPENBLAS REQUIRED)
    find_package(OMP)
    if(NOT OMP_FOUND)
        find_package(OpenMP REQUIRED COMPONENTS Fortran)
    endif()
else()
    # pass
endif()

enable_testing()

set(SRC
    gpt2.f90
    tokenizer.f90
    driver.f90
    )
if (FASTGPT_BLAS STREQUAL "Accelerate")
    list(APPEND SRC
        linalg_accelerate.c
        linalg_c.f90
        omp.f90
    )
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    list(APPEND SRC
        linalg_openblas.c
        linalg_c.f90
        omp.f90
    )
else()
    list(APPEND SRC
        linalg_f.f90
        omp_dummy.f90
    )
endif()
add_library(fastgpt ${SRC})
if (FASTGPT_BLAS STREQUAL "Accelerate")
    target_link_options(fastgpt PUBLIC -framework accelerate)
    target_link_libraries(fastgpt PUBLIC "$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>")
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    target_link_libraries(fastgpt p::openblas
        "$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>")
endif()

add_executable(gpt2 main.f90)
target_link_libraries(gpt2 fastgpt)

add_executable(chat chat.f90)
target_link_libraries(chat fastgpt)

add_executable(test_basic_input tests/test_basic_input.f90)
target_link_libraries(test_basic_input fastgpt)
add_test(test_basic_input ${PROJECT_BINARY_DIR}/test_basic_input)

add_executable(test_more_inputs tests/test_more_inputs.f90)
target_link_libraries(test_more_inputs fastgpt)
add_test(test_more_inputs ${PROJECT_BINARY_DIR}/test_more_inputs)

add_executable(test_chat tests/test_chat.f90)
target_link_libraries(test_chat fastgpt)
add_test(test_chat ${PROJECT_BINARY_DIR}/test_chat)

if(NOT PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
    # Git auto-ignore out-of-source build directory
    file(GENERATE OUTPUT .gitignore CONTENT "*")
endif()

message("\n")
message("Configuration results")
message("---------------------")
message("Fortran compiler: ${CMAKE_Fortran_COMPILER}")
message("Build type: ${CMAKE_BUILD_TYPE}")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    message("Fortran compiler flags: ${CMAKE_Fortran_FLAGS_DEBUG}")
else ()
    message("Fortran compiler flags: ${CMAKE_Fortran_FLAGS_RELEASE}")
endif ()
message("Installation prefix: ${CMAKE_INSTALL_PREFIX}")
message("FASTGPT_BLAS: ${FASTGPT_BLAS}")


================================================
FILE: LICENSE
================================================
Copyright 2023 Ondřej Čertík

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: README.md
================================================
# fastGPT

The progression of GPT-2 codes from the original to "minimal", "nano" and
"pico":

* [openai/gpt-2](https://github.com/openai/gpt-2)
* [karpathy/minGPT](https://github.com/karpathy/mingpt)
* [karpathy/nanoGPT](https://github.com/karpathy/nanogpt)
* [jaymody/picoGPT](https://github.com/jaymody/picoGPT)

`fastGPT` is very similar to `picoGPT` (very small and readable), but it is
also fast (see the Benchmarks section below). The speed and readability is
achieved by using Fortran. I wrote a
[blog post](https://ondrejcertik.com/blog/2023/03/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/)
introducing fastGPT.

`fastGPT` features:
* Fast? ✅
* Training code? ❌
* Batch inference? ❌
* top-p sampling? ❌ top-k? ❌ temperature? ❌ categorical sampling?! ❌ greedy? ✅
* Readable? ✅
* Small? ✅

A quick breakdown of each of the files:

* `gpt2.f90`: the actual GPT-2 model and a decoder
* `main.f90`: the main driver
* `create_model.py`: downloads the TensorFlow model and converts to the GGUF
  format (`model.gguf`)
* `encode_input.py`: encodes the text input into tokens (input file for `gpt2`)
* Matmul implementations
    * `linalg_f.f90` native Fortran
    * `linalg_c.f90`, `linalg_accelerate.c` macOS Accelerate Framework
* `pt.py`: a reference script to run PyTorch (returns the same answer)

## Getting Started

### Install prerequisites:
```bash
    mamba env create -f environment.yml
    conda activate fastgpt
```

### Configure and build
#### Fortran Package Manager (fpm)
```bash
    fpm build
```

#### CMake 
```bash
    FC=gfortran cmake .
    make
```

### Download the GPT2 model weights

    curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf

You can also download 355M for the `gpt-medium` model.

Now you can modify the `input` file to change the input string and set other
parameters.

### Run 
(requires `model.gguf` and `input` in the current directory)

If you built with `cmake`, execute
```bash
    ./gpt2
```
Alternatively, if you built with `fpm`, execute
```bash
    fpm run chatgpt2
```
to launch an interactive chat session
```bash
    fpm run gpt2
```
or to launch a session with predetermined prompts.

### Create the GGUF file

Create the `model.gguf` file from a given GPT-2 model. Supported sizes (and the
corresponding names to be used in `pt.py`, and the approximate download size):
"124M" (`gpt2`, 0.5GB), "355M" (`gpt-medium`, 1.5GB), "774M" (`gpt-large`,
3GB), "1558M" (`gpt-xl`, 6GB). This will download the model and cache it for
subsequent runs:
```python
    python create_model.py --models_dir "models" --model_size "124M"
```

This script depends on the `gguf` Python library, that you can install using:
```bash
    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
    git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
    cd gguf-py
    pip install .
```
The `gguf` library is available in pip and conda, but we currently require the
latest version that is not available there yet.

We used this script to create several GGUF files and uploaded them to:
https://huggingface.co/certik/fastGPT, so that you can just download the
pre-generated files.

### Example Output

The above `./gpt2` command prints on Apple M1 Max:
```
$ ./gpt2
Loading the model...
    done. Time:   0.111s

Model parameters:
n_vocab = 50257
n_ctx   =  1024
n_embd  =   768
n_layer =    12
n_head  =    12

Input text
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine

Encoding: tokenizing input text into tokens (currently slow)...
    done. Time:   0.074s

Input parameters:
n_seq                =  19
n_tokens_to_generate =  20

Input tokens:
 36235 39141 18765  1143   326  9061   561   530  1110  1716   845  3665    11   475   772   339   714   407  5967

Decoded input as text:
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine

Running model...
 how they would be able to do so.

"I think that the most important thing is
    done. Time:   0.304s (1.01x)

Output tokens:
   703   484   561   307  1498   284   466   523    13   198   198     1    40   892   326   262   749  1593  1517   318

Decoded output as text:
 how they would be able to do so.

"I think that the most important thing is
```

### Chat interface

Here is an example chat using the largest 1558M model:

```
$ ./chat
Your name is fastGPT and you are an AI bot. The user will ask you questions and you answer in a nice, truthful, short way.
User: What is the capital of Czechia?
fastGPT: Prague.
User: How many legs does a dog have?
fastGPT: Four.
User: What color does the sky have?
fastGPT: Blue.
User: What can you type a document on?
fastGPT: A typewriter.
User: What can you drive in?
fastGPT: A car.
User: What can you fly in?
fastGPT: A plane.
User: What continent is Germany in?
fastGPT: Europe.
User: When did Second World War start?
fastGPT: 1939.
User: When did it end?
fastGPT: 1945.
User: When did the U.S. enter the Second World War?
fastGPT: 1941.
User: When did the First World War start?
fastGPT: 1914.
User: When did it end?
fastGPT: 1918.
User: When did the Mexican-American war start?
fastGPT: 1846.
User: When did it end?
fastGPT: 1848.
User: What color is snow?
fastGPT: White.
User: What color do plants usually have?
fastGPT: Green.
User: What is your name?
fastGPT: fastGPT.
```


### BLAS Implementation

You can choose which BLAS implementation to use for `matmul` using:
* `-DFASTGPT_BLAS=OpenBLAS`: Use OpenBLAS
* `-DFASTGPT_BLAS=Accelerate`: Use the macOS Accelerate Framework
* `-DFASTGPT_BLAS=Fortran`: Use the default Fortran's intrinsic `matmul`

## Benchmarks

On Apple M1 Max, inference of the above input file (20 tokens):

                                    1 core  2 cores  4 cores  8 cores

    fastGPT (Accelerate, fast_tanh) 0.288s

    fastGPT (Accelerate)            0.299s
    PyTorch (Accelerate)            0.346s

    fastGPT (OpenBLAS)              0.837s  0.514s    0.341s   0.339s
    PyTorch (OpenBLAS)              0.873s  0.539s    0.386s   0.392s

    fastGPT (Accelerate, no cache)  0.717s
    picoGPT (Accelerate, no cache)  0.765s
    PyTorch (Accelerate, no cache)  0.787s

    fastGPT (OpenBLAS, no cache)    2.343s  1.603s    1.209s   1.018s
    PyTorch (OpenBLAS, no cache)    2.356s  1.520s    1.104s   0.997s
    picoGPT (OpenBLAS, no cache)    2.427s  1.645s    1.272s   1.081s

Total run (includes loading the model and Python imports):

    fastGPT (Accelerate, fast_tanh): 0.401s
    picoGPT (8 cores):               3.445s
    PyTorch (OpenBLAS, 4 cores):     4.867s

## TODO

* [ ] Parallelization:
  * [ ] Over heads: https://github.com/certik/fastGPT/issues/2
  * [ ] MPI: https://github.com/certik/fastGPT/issues/5
* [ ] Other sampling methods: https://github.com/certik/fastGPT/issues/8
* [ ] Batching: https://github.com/certik/fastGPT/issues/7
* [x] Improve the UI:
  * [x] Implement the input tokenizer in Fortran: https://github.com/certik/fastGPT/issues/1
  * [x] Show the words as they are generated: https://github.com/certik/fastGPT/issues/6


================================================
FILE: build.sh
================================================
#!/bin/bash

set -ex

FC=gfortran cmake -Bbuild
cmake --build build --parallel
python create_model.py --models_dir "../gpt2/models" --model_size "124M"
python encode_input.py \
    "Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
    -n 20
build/gpt2


================================================
FILE: chat.f90
================================================
program chatgpt2
use driver, only: chat
implicit none
call chat()
end program


================================================
FILE: ci/build.sh
================================================
#!/bin/bash

set -ex

cmake .
make
mkdir models
python create_model.py --models_dir "models" --model_size "124M"
./gpt2
ctest

make clean
rm CMakeCache.txt
cmake -DFASTGPT_BLAS=OpenBLAS .
make
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm model.gguf
curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm gpt2
python pt.py


================================================
FILE: ci/build_lfortran.sh
================================================
#!/bin/bash

set -ex

curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf

mkdir lf
cd lf
FC=lfortran CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Debug ..
make VERBOSE=1
ln -s ../model.gguf .
ln -s ../input .
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs
cd ..

mkdir lf-fast
cd lf-fast
FC="lfortran --fast" CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Release ..
make VERBOSE=1
ln -s ../model.gguf .
ln -s ../input .
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs
cd ..


================================================
FILE: cmake/FindOMP.cmake
================================================
find_path(OMP_INCLUDE_DIR omp.h)
find_library(OMP_LIBRARY omp)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OMP DEFAULT_MSG OMP_INCLUDE_DIR
    OMP_LIBRARY)

add_library(p::omp INTERFACE IMPORTED)
set_property(TARGET p::omp PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    ${OMP_INCLUDE_DIR})
set_property(TARGET p::omp PROPERTY INTERFACE_LINK_LIBRARIES
    ${OMP_LIBRARY})


================================================
FILE: cmake/FindOPENBLAS.cmake
================================================
find_path(OPENBLAS_INCLUDE_DIR NAMES cblas.h PATHS /usr/include/openblas)

find_library(OPENBLAS_LIBRARY NAMES openblas)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_INCLUDE_DIR
    OPENBLAS_LIBRARY)

add_library(p::openblas INTERFACE IMPORTED)
set_property(TARGET p::openblas PROPERTY INTERFACE_INCLUDE_DIRECTORIES
    ${OPENBLAS_INCLUDE_DIR})
set_property(TARGET p::openblas PROPERTY INTERFACE_LINK_LIBRARIES
    ${OPENBLAS_LIBRARY})


================================================
FILE: cmake/UserOverride.cmake
================================================
# This overrides the default CMake Debug and Release compiler options.
# The user can still specify different options by setting the
# CMAKE_Fortran_FLAGS_[RELEASE,DEBUG] variables (on the command line or in the
# CMakeList.txt). This files serves as better CMake defaults and should only be
# modified if the default values are to be changed. Project specific compiler
# flags should be set in the CMakeList.txt by setting the CMAKE_Fortran_FLAGS_*
# variables.
if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
    # gfortran
    set(common "-Wall -Wextra -Wimplicit-interface -fPIC")
    set(CMAKE_Fortran_FLAGS_RELEASE_INIT "${common} -O3 -march=native -ffast-math -funroll-loops")
    set(CMAKE_Fortran_FLAGS_DEBUG_INIT   "${common} -g -fcheck=all -fbacktrace")
elseif (CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
    # ifort
    set(common "-warn all")
    set(CMAKE_Fortran_FLAGS_RELEASE_INIT "${common} -xHOST -O3 -no-prec-div -static")
    set(CMAKE_Fortran_FLAGS_DEBUG_INIT   "${common} -check all")
endif ()


================================================
FILE: comparison/encode_input.py
================================================
"""
This script implements the encoding of an input string into tokens.
It requires two files in the current directory: encoder.json, vocab.bpe
It creates the file input.dat which contains the input tokens and how many
tokens to generate.

TODO: save the information from encoder.json and vocab.pge into model.dat and
implement this encoder in Fortran.

Most of this file were taken from:

https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py

And it is licensed under:

Modified MIT License

Software Copyright (c) 2019 OpenAI

We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
The above copyright notice and this permission notice need not be included
with content created by the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
"""

import numpy as np
import json
import os
import regex as re

def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:
    def __init__(self, encoder, bpe_merges):
        self.encoder = encoder
        self.byte_encoder = bytes_to_unicode()
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i) # can trigger ValueError
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token))
        return bpe_tokens


def get_encoder():
    with open("encoder.json") as f:
        encoder = json.load(f)
    with open("vocab.bpe", encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)

def main(prompt: str, n_tokens_to_generate: int = 40):
    encoder = get_encoder()
    input_ids = np.array(encoder.encode(prompt), dtype=np.int32)

    print("Saving the input into `input.dat`")
    g = open("input.dat", "w")
    np.array([len(input_ids), n_tokens_to_generate], dtype=np.int32).tofile(g)
    input_ids.tofile(g)
    print(input_ids)

if __name__ == "__main__":
    import fire
    fire.Fire(main)


================================================
FILE: create_model.py
================================================
"""
This script loads the specified GPT-2 model from OpenAI using TensorFlow,
converts it into our custom format and saves it to `model.gguf`, which contains
everything (all the parameters, all the weights, encoding/decoding
information).

Parts of this script were taken from the picoGPT project: https://github.com/jaymody/picoGPT

Those are licensed as:

MIT License

Copyright (c) 2023 Jay Mody

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

"""

from time import monotonic as clock
import os
import json
import re
from shutil import copyfile

import numpy as np
import gguf
import requests
import tensorflow as tf
from tqdm import tqdm

def download_gpt2_files(model_size, model_dir):
    assert model_size in ["124M", "355M", "774M", "1558M"]
    for filename in [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe",
    ]:
        url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
        r = requests.get(f"{url}/{model_size}/{filename}", stream=True)
        r.raise_for_status()

        with open(os.path.join(model_dir, filename), "wb") as f:
            file_size = int(r.headers["content-length"])
            chunk_size = 1000
            with tqdm(
                ncols=100,
                desc="Fetching " + filename,
                total=file_size,
                unit_scale=True,
            ) as pbar:
                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    pbar.update(chunk_size)


def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
    def set_in_nested_dict(d, keys, val):
        if not keys:
            return val
        if keys[0] not in d:
            d[keys[0]] = {}
        d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val)
        return d

    init_vars = tf.train.list_variables(tf_ckpt_path)
    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
    for name, _ in init_vars:
        array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))
        name = name.removeprefix("model/")
        if name.startswith("h"):
            m = re.match(r"h([0-9]+)/(.*)", name)
            n = int(m[1])
            sub_name = m[2]
            set_in_nested_dict(params["blocks"][n], sub_name.split("/"), array)
        else:
            set_in_nested_dict(params, name.split("/"), array)

    return params


def load_encoder_hparams_and_params(model_size, models_dir):
    assert model_size in ["124M", "355M", "774M", "1558M"]

    model_dir = os.path.join(models_dir, model_size)
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if not tf_ckpt_path:  # download files if necessary
        os.makedirs(model_dir, exist_ok=True)
        download_gpt2_files(model_size, model_dir)
        tf_ckpt_path = tf.train.latest_checkpoint(model_dir)

    hparams = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)

    return hparams, params

def convert(params, n_head, n_ctx, idx, decoder_txt,
        vocab_idx, vocab_txt, byte_decoder):
    t1 = clock()
    blocks = params["blocks"]
    n_embd = blocks[0]["ln_1"]["b"].size
    n_layer = len(blocks)
    mlp_fc_w = np.empty((n_layer,n_embd,4*n_embd), dtype=np.float32)
    mlp_fc_b = np.empty((n_layer,4*n_embd), dtype=np.float32)
    mlp_proj_w = np.empty((n_layer,4*n_embd,n_embd), dtype=np.float32)
    mlp_proj_b = np.empty((n_layer,n_embd), dtype=np.float32)
    attn_w = np.empty((n_layer,n_embd,3*n_embd), dtype=np.float32)
    attn_b = np.empty((n_layer,3*n_embd), dtype=np.float32)
    attn_proj_w = np.empty((n_layer,n_embd,n_embd), dtype=np.float32)
    attn_proj_b = np.empty((n_layer,n_embd), dtype=np.float32)
    ln1_g = np.empty((n_layer,n_embd), dtype=np.float32)
    ln1_b = np.empty((n_layer,n_embd), dtype=np.float32)
    ln2_g = np.empty((n_layer,n_embd), dtype=np.float32)
    ln2_b = np.empty((n_layer,n_embd), dtype=np.float32)
    for i, block in enumerate(blocks):
        mlp_fc_w[i,:,:] = block["mlp"]["c_fc"]["w"]
        mlp_fc_b[i,:] = block["mlp"]["c_fc"]["b"]
        mlp_proj_w[i,:,:] = block["mlp"]["c_proj"]["w"]
        mlp_proj_b[i,:] = block["mlp"]["c_proj"]["b"]
        attn_w[i,:,:] = block["attn"]["c_attn"]["w"]
        attn_b[i,:] = block["attn"]["c_attn"]["b"]
        attn_proj_w[i,:,:] = block["attn"]["c_proj"]["w"]
        attn_proj_b[i,:] = block["attn"]["c_proj"]["b"]
        ln1_g[i,:] = block["ln_1"]["g"]
        ln1_b[i,:] = block["ln_1"]["b"]
        ln2_g[i,:] = block["ln_2"]["g"]
        ln2_b[i,:] = block["ln_2"]["b"]
    wte = params["wte"]
    wpe = params["wpe"]
    lnf_g = params["ln_f"]["g"]
    lnf_b = params["ln_f"]["b"]
    t2 = clock()
    print("Transform time: ", t2-t1)
    t1 = clock()

    n_vocab = np.size(wte, 0)
    assert np.size(wte, 1) == n_embd

    model_type = 0xfa51697 # fastGPT
    model_version = 2
    header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,
        len(idx),len(decoder_txt.encode("utf-8")),
        len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32)

    # Save the model to GGUF
    def save_gguf(data_offset_name, data_offset_value):
        g = gguf.GGUFWriter("model.gguf", None)
        g.add_int32(data_offset_name, data_offset_value)
        g.add_tensor("header", header)
        g.add_tensor("wte", wte); g.add_tensor("wpe", wpe)
        g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b)
        g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b)
        g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b)
        g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b",
                attn_proj_b)
        g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g)
        g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g)
        g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g)
        g.add_tensor("idx", idx)
        g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"),
            dtype=np.int8))
        g.add_tensor("vocab_idx", vocab_idx)
        g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"),
            dtype=np.int8))
        g.add_tensor("byte_decoder", byte_decoder)
        g.write_header_to_file()
        g.write_kv_data_to_file()
        g.write_tensors_to_file()
        g.close()

    data_offset_name = "general.data_offset"
    save_gguf(data_offset_name, 0)

    g = gguf.GGUFReader("model.gguf")
    data_offset = g.tensors[0].data_offset
    # * .offset: the offset of the kv entry
    # * 8: The i64 length of the key string
    # * 4: The i32 type of the value
    assert g.fields[data_offset_name].offset == 24
    offset_offset = g.fields[data_offset_name].offset + 8 + \
        len(data_offset_name) + 4
    print("offset offset:", offset_offset)
    print("data offset:", data_offset)

    save_gguf(data_offset_name, data_offset)

    t2 = clock()
    print("Save time: ", t2-t1)


def load_decoder(filename):
    D = json.load(open(filename))
    D2 = {v: k for k, v in D.items()}
    i = 0
    decoder = []
    while True:
        if i not in D2:
            break
        decoder.append(D2[i])
        i += 1
    return decoder

def load_vocab(filename):
    D = open(filename).read()
    D = D.split("\n")
    D = D[1:]
    return D

def decoder_idx(decoder):
    i = 0
    idx = np.empty(len(decoder)+1, dtype=np.int32)
    idx[0] = i
    for n, t in enumerate(decoder):
        i += len(t.encode("utf-8"))
        idx[n+1] = i
    assert idx[-1] == len("".join(decoder).encode("utf-8"))
    return idx

def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    btu = dict(zip(bs, cs))
    byte_decoder = {v: k for k, v in btu.items()}
    bd = np.zeros(324, dtype=np.int32)
    for y in byte_decoder:
        x = ord(y)
        bd[x] = byte_decoder[y]
    bd2 = np.zeros(256, dtype=np.int32)
    for i in range(np.size(bd)):
        bd2[bd[i]] = i
    return bd2

def main(model_size: str = "124M", models_dir: str = "models"):
    print("Loading model")
    # load encoder, hparams, and params from the released open-ai gpt-2 files
    t1 = clock()
    hparams, params = load_encoder_hparams_and_params(model_size, models_dir)
    decoder = load_decoder(os.path.join(models_dir, model_size, "encoder.json"))
    vocab = load_vocab(os.path.join(models_dir, model_size, "vocab.bpe"))
    t2 = clock()
    print("  Done. Loading time: ", t2-t1)

    # generate output ids
    print("Converting model, saving to `model.gguf`")
    t1 = clock()
    decoder_txt = "".join(decoder)
    idx = decoder_idx(decoder)
    vocab_txt = "".join(vocab)
    vocab_idx = decoder_idx(vocab)
    byte_decoder = bytes_to_unicode()
    convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt,
            vocab_idx, vocab_txt, byte_decoder)
    t2 = clock()
    print("  Done. Time: ", t2-t1)


if __name__ == "__main__":
    import fire
    fire.Fire(main)


================================================
FILE: driver.f90
================================================
module driver
use gpt2_mod, only: generate, model_t
use tokenizer, only: encode, decode, string
use omp, only: omp_get_wtime
implicit none

integer, parameter :: sp = kind(0.0)
integer, parameter :: dp = kind(0.d0)
character(1), parameter :: LF = achar(10)

contains

subroutine load_input(filename, input_txt, n_tokens_to_generate)
! Load the input from a namelist `filename`
character(*), intent(in) :: filename
character(:), allocatable, intent(out) :: input_txt
integer, intent(out) :: n_tokens_to_generate
character(1024) :: input_txt2
integer :: u, ios
namelist / input_fastGPT / n_tokens_to_generate
allocate(character(0) :: input_txt)
input_txt = ""
open(newunit=u, file=filename, status="old")
read(u, input_fastGPT)
do
    read(u, "(a)", iostat=ios) input_txt2
    if (ios /= 0) exit
    if (len(input_txt) > 0) input_txt = input_txt // char(10)
    input_txt = input_txt // trim(input_txt2)
end do
close(u)
end subroutine

! Skips `amount` bytes from the current position
subroutine fskip(u, amount)
integer, intent(in) :: u, amount
character, allocatable :: tmp(:)
! Note: the code below is equivalent to the non-standard: fseek(u, amount, 1)
! Let's allocate on heap, in case the skip is large
allocate(tmp(amount))
read(u) tmp
end subroutine

! Aligns file position in `u` to 32 byte boundary after `A` was read
subroutine align_i4(u, A)
integer, intent(in) :: u
integer, intent(in) :: A(..)
integer :: n, alignment
alignment = 32
n = size(A)*4
call fskip(u, alignment-modulo(n,alignment))
end subroutine

subroutine align_str(u, A)
integer, intent(in) :: u
character, intent(in) :: A(:)
integer :: n, alignment
alignment = 32
n = size(A)
if (modulo(n, alignment) /= 0) then
    call fskip(u, alignment-modulo(n,alignment))
end if
end subroutine

subroutine load_model(filename, m)
character(*), intent(in) :: filename
type(model_t), intent(out) :: m
! We use the following fastGPT model type number
!   fastGPT (digits look similar to the letters they represent)
! 0xfa51697 = 262477463

! We read the offset to the data section at this position, which is the first
! variable in the metadata, the name is "general.data_offset", type i32.
integer, parameter :: offset_offset = &
    ! header
    4 + & ! u8[4] magic
    4 + & ! u32 version
    8 + & ! u64 n_arrays
    8 + & ! u64 n_kv
    ! kv
    8 + & ! u64 n_str
    19 + & ! len("general.data_offset")
    4 ! u32 type of value
integer, parameter :: current_model_mark = 262477463
integer, parameter :: current_model_version = 2
integer :: model_mark
integer :: u
integer :: data_offset
open(newunit=u, file=filename, form="unformatted", access="stream", status="old")
call fskip(u, offset_offset)
read(u) data_offset
! Alternatively we could have done: rewind(u); call fskip(u, data_offset)
call fskip(u, data_offset-offset_offset-4)
read(u) model_mark
if (model_mark /= current_model_mark) then
    print *, "Found:", model_mark
    print *, "Expected:", current_model_mark
    error stop "Invalid fastGPT model file"
end if
read(u) m%model_file_version
if (m%model_file_version /= current_model_version) then
    print *, "Found:", m%model_file_version
    print *, "Expected:", current_model_version
    error stop "Incompatible model version"
end if
read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &
    m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder
call fskip(u, 16) ! Pad the 12 element i32 array to 32 byte boundary
allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &
    m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &
    m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &
    m%attn_w(3*m%n_embd,m%n_embd,m%n_layer), m%attn_b(3*m%n_embd,m%n_layer), &
    m%attn_proj_w(m%n_embd,m%n_embd,m%n_layer), m%attn_proj_b(m%n_embd,m%n_layer), &
    m%ln1_b(m%n_embd,m%n_layer), m%ln1_g(m%n_embd,m%n_layer), &
    m%ln2_b(m%n_embd,m%n_layer), m%ln2_g(m%n_embd,m%n_layer), &
    m%lnf_b(m%n_embd), m%lnf_g(m%n_embd), &
    m%decoder_idx(0:m%n_decoder_idx-1), m%decoder_txt(m%n_decoder_txt), &
    m%vocab_idx(0:m%n_vocab_idx-1), m%vocab_txt(m%n_vocab_txt), &
    m%byte_encoder(0:m%n_byte_encoder-1))
read(u) m%wte, m%wpe, &
    m%mlp_fc_w, m%mlp_fc_b, &
    m%mlp_proj_w, m%mlp_proj_b, &
    m%attn_w, m%attn_b, &
    m%attn_proj_w, m%attn_proj_b, &
    m%ln1_b, m%ln1_g, &
    m%ln2_b, m%ln2_g, &
    m%lnf_b, m%lnf_g, &
    m%decoder_idx
call align_i4(u, m%decoder_idx)
read(u) m%decoder_txt
call align_str(u, m%decoder_txt)
read(u) m%vocab_idx
call align_i4(u, m%vocab_idx)
read(u) m%vocab_txt
call align_str(u, m%vocab_txt)
read(u) m%byte_encoder
close(u)
end subroutine

subroutine gpt2_driver(input, output, m)
integer, allocatable, intent(out) :: input(:), output(:)
type(model_t), intent(out) :: m
character(:), allocatable :: input_txt
integer :: n_tokens_to_generate
real(dp) :: t1, t2
call load_input("input", input_txt, n_tokens_to_generate)

! Load the model
print "(a)", "Loading the model..."
call cpu_time(t1)
call load_model("model.gguf", m)
call cpu_time(t2)
print "(a,f8.3,a,i2)", "    done. Time:", t2-t1, "s, Model file version:", m%model_file_version
print *
print "(a)", "Model parameters:"
print "(a,i6)", "n_vocab =", m%n_vocab
print "(a,i6)", "n_ctx   =", m%n_ctx
print "(a,i6)", "n_embd  =", m%n_embd
print "(a,i6)", "n_layer =", m%n_layer
print "(a,i6)", "n_head  =", m%n_head
print *

call gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output)
endsubroutine

subroutine gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output)
character(*), intent(in) :: input_txt
integer, intent(in) :: n_tokens_to_generate
type(model_t), intent(in) :: m
integer, allocatable, intent(out) :: input(:), output(:)
integer, allocatable :: byte_decoder(:)
integer :: n_seq
character(:), allocatable :: output_txt
real(dp) :: t1, t2, t1o, t2o
integer :: i
logical :: use_cache

! Compute byte_decoder:
allocate(byte_decoder(0:maxval(m%byte_encoder)))
byte_decoder = 0
do i = 0, size(m%byte_encoder)-1
    byte_decoder(m%byte_encoder(i)) = i
end do

print "(a)", "Input text"
print "(a)", input_txt

print *
print "(a)",  "Encoding: tokenizing input text into tokens (currently slow)..."
call cpu_time(t1)
input = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, &
    m%byte_encoder)
call cpu_time(t2)
n_seq = size(input)
print "(a,f8.3,a)", "    done. Time:", t2-t1, "s"
print *
print "(a)", "Input parameters:"
print "(a,i4)", "n_seq                =", n_seq
print "(a,i4)", "n_tokens_to_generate =", n_tokens_to_generate
print *
print "(a)", "Input tokens:"
print "(1000(i6))", input
print *

if (n_seq + n_tokens_to_generate >= m%n_ctx) then
    print *, "The maximum sequence length of the model was surpassed."
    print *, "Make the input and/or number of tokens to generate shorter."
    error stop
end if

print "(a)", "Decoded input as text:"
!print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder)
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder)
print "(a)", output_txt
print *

if (input_txt /= output_txt) then
    error stop "The decoded input text does not agree with the input text"
end if

print "(a)", "Running model..."
call cpu_time(t1)
t1o = omp_get_wtime()
use_cache = .true.
call generate(output, n_tokens_to_generate, m, size(input), input, use_cache, &
    byte_decoder)
print *
t2o = omp_get_wtime()
call cpu_time(t2)
print "(a,f8.3,a,f4.2,a)", "    done. Time:", t2o-t1o, "s (", (t2-t1)/(t2o-t1o), "x)"
print *
print "(a)", "Output tokens:"
print "(1000(i6))", output
output_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder)
print *
print "(a)", "Decoded output as text:"
print "(a)", output_txt
end subroutine

subroutine gpt2_driver3(input_txt, n_tokens_to_generate, stop_text, m, output_txt)
character(*), intent(in) :: input_txt, stop_text
integer, intent(in) :: n_tokens_to_generate
type(model_t), intent(in) :: m
integer, allocatable :: input(:), output(:)
integer, allocatable :: byte_decoder(:)
integer :: n_seq
character(:), allocatable, intent(out) :: output_txt
integer :: i
logical :: use_cache
! TODO: move the decoder into model_t
! Compute byte_decoder:
allocate(byte_decoder(0:maxval(m%byte_encoder)))
byte_decoder = 0
do i = 0, size(m%byte_encoder)-1
    byte_decoder(m%byte_encoder(i)) = i
end do
input = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, &
    m%byte_encoder)
n_seq = size(input)
if (n_seq + n_tokens_to_generate >= m%n_ctx) then
    print *, "The maximum sequence length of the model was surpassed."
    print *, "Make the input and/or number of tokens to generate shorter."
    error stop
end if
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder)
if (input_txt /= output_txt) then
    error stop "The decoded input text does not agree with the input text"
end if
use_cache = .true.
call generate(output, n_tokens_to_generate, m, size(input), input, use_cache, &
    byte_decoder, stop_text)
output_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder)
end subroutine

function get_prompt() result(input)
character(:), allocatable :: input
character(1024) :: tmp
integer ::ios
read(*,"(a)",iostat=ios) tmp
if (ios == 0) then
    input = trim(tmp)
else
    input = ""
end if
end function

subroutine chat(inputs)
type(string), optional, intent(in) :: inputs(:)
type(model_t) :: m
character(:), allocatable :: prompt, input, output
integer :: i, n_prompts
call load_model("model.gguf", m)
prompt = "Your name is fastGPT and you are an AI bot. The user will ask you &
&questions and you answer in a nice, truthful, short way." // LF // "&
&User: What is the capital of Czechia?" // LF // "&
&fastGPT: Prague." // LF // "&
&User: How many legs does a dog have?" // LF // "&
&fastGPT: Four." // LF // "&
&User:"
write(*,"(a)",advance="no") prompt
if (present(inputs)) then
    n_prompts = size(inputs)
else
    n_prompts = 1024
end if
do i = 1, n_prompts
    write(*,"(a)",advance="no")  " "
    if (present(inputs)) then
        input = inputs(i)%s
        write(*,"(a)") input
    else
        input = get_prompt()
        if (input == "") exit
    end if
    write(*,"(a)",advance="no") "fastGPT:"
    prompt = prompt // " " // input // LF // "fastGPT:"
    call gpt2_driver3(prompt, 200, "User:", m, output)
    prompt = prompt // output
end do
print *
end subroutine

end module


================================================
FILE: environment.yml
================================================
name: fastgpt
channels:
  - conda-forge
dependencies:
  - python=3.9
  - numpy=1.24.2
  - tensorflow=2.11.0
  - tqdm=4.65.0
  - fire=0.4.0
  - regex=2022.10.31
  #- gfortran=14.2.0
  - cmake=3.25.2
  - transformers=4.26.1
  - openblas=0.3.21


================================================
FILE: fpm.toml
================================================
name = "fastGPT"


================================================
FILE: gpt2.f90
================================================
module gpt2_mod
use linalg, only: matmul_2d, matmul_2d_t
use tokenizer, only: decode
implicit none

integer, parameter :: sp = kind(0.0)
real(sp), parameter :: pi = 3.14159265358979323846_sp

! This derived type contains all the data of the GPT-2 model, including all
! weights, model parameters, and encoder/decoder data
type :: model_t
    integer :: n_vocab, n_ctx, n_embd, n_layer, n_head, &
        n_decoder_idx, n_decoder_txt, &
        n_vocab_idx, n_vocab_txt, n_byte_encoder
    real(sp), allocatable :: wte(:,:), wpe(:,:), &
        mlp_fc_w(:,:,:), mlp_fc_b(:,:), &
        mlp_proj_w(:,:,:), mlp_proj_b(:,:), &
        attn_w(:,:,:), attn_b(:,:), &
        attn_proj_w(:,:,:), attn_proj_b(:,:), &
        ln1_b(:,:), ln1_g(:,:), &
        ln2_b(:,:), ln2_g(:,:), &
        lnf_b(:), lnf_g(:)
    integer, allocatable :: decoder_idx(:), vocab_idx(:), byte_encoder(:)
    character, allocatable :: decoder_txt(:), vocab_txt(:)
    integer :: model_file_version
end type

contains

elemental real(sp) function fast_tanh(x) result(y)
real(sp), intent(in) :: x
real(sp) :: x2
if (x > 5) then
    y = 1
elseif (x < -5) then
    y = -1
else
    x2 = x*x
    y = x * (0.98569772605911309407 + x2 *(-0.2794500993392901382 &
        + x2 * (6.8280504526399188164e-2 + x2 * (-1.0972014877337651823e-2 &
        + x2 * (1.1132367134444316902e-3 + x2 * (-7.018851897305717565e-5 &
        + x2 * (2.656616768082727089e-6 + x2 * (-5.5138381821615909058e-8 &
        + x2 * 4.8162484477588665996e-10))))))))
end if
end function

elemental real(sp) function gelu(x) result(y)
real(sp), intent(in) :: x
y = 0.5_sp * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715_sp * x**3)))
end function

function softmax(x) result(y)
real(sp), intent(in) :: x(:,:)
real(sp) :: y(size(x,1),size(x,2))
integer :: i
do i = 1, size(x,2)
    y(:,i) = exp(x(:,i) - maxval(x(:,i)))
    y(:,i) = y(:,i) / sum(y(:,i))
end do
end function

function layer_norm(x, g, b, eps) result(y)
real(sp), intent(in) :: x(:,:), g(:), b(:), eps
real(sp) :: y(size(x,1),size(x,2))
real(sp) :: mean(size(x,2)), variance(size(x,2))
integer :: i
do i = 1, size(x,2)
    mean(i) = sum(x(:,i)) / size(x,1)
    variance(i) = sum((x(:,i) - mean(i))**2) / size(x,1)
end do
!do i = 1, size(x,1)
!    y(i,:) = (x(i,:) - mean(:)) / sqrt(variance(:) + eps)
!    y(i,:) = g(i) * y(i,:) + b(i)
!end do
do i = 1, size(x,2)
    y(:,i) = (x(:,i) - mean(i)) / sqrt(variance(i) + eps)
    y(:,i) = g(:) * y(:,i) + b(:)
end do
end function

function linear(x, w, b) result(y)
real(sp), intent(in) :: x(:,:), w(:,:), b(:)
real(sp) :: y(size(b,1),size(x,2))
integer :: i
!y = matmul(w, x) + spread(b, 2, size(x,2))
!y = matmul(w, x)
call matmul_2d(w, x, y)
do i = 1, size(y,2)
    y(:,i) = y(:,i) + b(:)
end do
end function

function ffn(x, fc_w, fc_b, proj_w, proj_b) result(y)
real(sp), intent(in) :: x(:,:), fc_w(:,:), fc_b(:), proj_w(:,:), proj_b(:)
real(sp) :: y(size(x,1),size(x,2))
!real(sp) :: a(4*size(x,1),size(x,2))
!a = gelu(linear(x, fc_w, fc_b))
y = linear(gelu(linear(x, fc_w, fc_b)), proj_w, proj_b)
end function

function attention(n_embd_head,n_seq,n_seq_x, q, k, v, mask) result(y)
integer, intent(in) :: n_embd_head, n_seq, n_seq_x
real(sp), intent(in) :: q(n_embd_head,n_seq_x), k(n_embd_head,n_seq), v(n_embd_head,n_seq), mask(n_seq,n_seq_x)
real(sp) :: y(n_embd_head,n_seq_x)
real(sp) :: tmp(n_seq,n_seq_x)
!tmp = matmul(transpose(k), q)
!call matmul_2d(transpose(k), q, tmp)
call matmul_2d_t(k, q, tmp)
call matmul_2d(v, softmax(tmp / sqrt(real(n_embd_head,sp)) + mask), y)
end function

function mha(n_seq, n_seq_x, n_embd, x, attn_w, attn_b, proj_w, proj_b, n_head, &
            use_kv_cache, kv_cache) &
        result(y)
integer, intent(in) :: n_seq, n_seq_x, n_embd
real(sp), intent(in) :: x(n_embd,n_seq_x), &
    attn_w(3*n_embd,n_embd), attn_b(3*n_embd), &
    proj_w(n_embd,n_embd), proj_b(n_embd)
real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2)
integer, intent(in) :: n_head
logical, intent(in) :: use_kv_cache
real(sp) :: y(n_embd,n_seq_x)
real(sp) :: causal_mask(n_seq,n_seq_x)
real(sp) :: x2(3*n_embd,n_seq_x)
real(sp) :: q(n_embd/n_head,n_seq_x), k(n_embd/n_head,n_seq), v(n_embd/n_head,n_seq)
real(sp) :: yy(n_embd/n_head,n_seq_x)
integer :: i, j, l
! Mask
if (use_kv_cache) then
    causal_mask = 0
else
    do j = 1, n_seq
    do i = 1, n_seq
        if (i > j) then
            causal_mask(i,j) = -1e10_sp
        else
            causal_mask(i,j) = 0
        end if
    end do
    end do
end if
x2 = linear(x, attn_w, attn_b)
if (use_kv_cache) then
    do j = 1, n_embd
        kv_cache(j,n_seq,1) = x2((2-1)*n_embd+j,1)
        kv_cache(j,n_seq,2) = x2((3-1)*n_embd+j,1)
    end do
else
    do i = 1, n_seq
    do j = 1, n_embd
        kv_cache(j,i,1) = x2((2-1)*n_embd+j,i)
        kv_cache(j,i,2) = x2((3-1)*n_embd+j,i)
    end do
    end do
end if
! Perform attention over each head
do l = 1, n_head
    do i = 1, n_seq_x
    do j = 1, n_embd/n_head
        q(j,i) = x2((l-1)*n_embd/n_head+j,i)
    end do
    end do
    do i = 1, n_seq
    do j = 1, n_embd/n_head
        k(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,1)
        v(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,2)
    end do
    end do
    yy = attention(n_embd/n_head, n_seq, n_seq_x, q, k, v, causal_mask)
    do i = 1, n_seq_x
    do j = 1, n_embd/n_head
        y((l-1)*n_embd/n_head+j,i) = yy(j,i)
    end do
    end do
end do
! Out projection
y = linear(y, proj_w, proj_b)
end function


function transformer_block(n_seq, n_seq_x, n_embd, x, mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, &
        attn_w, attn_b, attn_proj_w, attn_proj_b, ln1_g, ln1_b, ln2_g, ln2_b, &
        n_head, use_kv_cache, kv_cache) result(y)
real(sp), intent(in) :: x(n_embd,n_seq_x), &
    mlp_fc_w(:,:), mlp_fc_b(:), &
    mlp_proj_w(:,:), mlp_proj_b(:), &
    attn_w(:,:), attn_b(:), attn_proj_w(:,:), attn_proj_b(:), &
    ln1_g(:), ln1_b(:), ln2_g(:), ln2_b(:)
integer, intent(in) :: n_head
integer, intent(in) :: n_seq, n_seq_x, n_embd
real(sp) :: y(n_embd,n_seq_x)
logical, intent(in) :: use_kv_cache
real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2)
y = x + mha(n_seq, n_seq_x, n_embd, layer_norm(x, ln1_g, ln1_b, 1e-5_sp), &
    attn_w, attn_b, attn_proj_w, attn_proj_b, n_head, use_kv_cache, kv_cache)
y = y + ffn(layer_norm(y, ln2_g, ln2_b, 1e-5_sp), &
    mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b)
end function

function gpt2(n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head, input, &
        wte, wpe, &
        mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, &
        attn_w, attn_b, attn_proj_w, attn_proj_b, &
        ln1_g, ln1_b, ln2_g, ln2_b, lnf_g, lnf_b, &
        use_kv_cache, kv_cache) result(y)
integer, intent(in) :: n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head
integer, intent(in) :: input(n_seq)
real(sp), intent(in) :: wte(n_embd,n_vocab), wpe(n_embd,n_ctx), &
    mlp_fc_w(4*n_embd,n_embd,n_layer), mlp_fc_b(4*n_embd,n_layer), &
    mlp_proj_w(n_embd,4*n_embd,n_layer), mlp_proj_b(n_embd,n_layer), &
    attn_w(3*n_embd,n_embd,n_layer), attn_b(3*n_embd,n_layer), &
    attn_proj_w(n_embd,n_embd,n_layer), attn_proj_b(n_embd,n_layer), &
    ln1_b(n_embd,n_layer), ln1_g(n_embd,n_layer), &
    ln2_b(n_embd,n_layer), ln2_g(n_embd,n_layer), &
    lnf_b(n_embd), lnf_g(n_embd)
logical, intent(in) :: use_kv_cache
real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2,n_layer)
real(sp) :: y(n_vocab,n_seq_x)
real(sp) :: x(n_embd,n_seq_x)
integer :: i
if (use_kv_cache) then
    i = n_seq
    x(:,1) = wte(:,input(i)+1) + wpe(:,i)
else
    do i = 1, n_seq
        x(:,i) = wte(:,input(i)+1) + wpe(:,i)
    end do
end if
do i = 1, n_layer
    x = transformer_block(n_seq, n_seq_x, n_embd, x, &
        mlp_fc_w(:,:,i), mlp_fc_b(:,i), &
        mlp_proj_w(:,:,i), mlp_proj_b(:,i), &
        attn_w(:,:,i), attn_b(:,i), attn_proj_w(:,:,i), attn_proj_b(:,i), &
        ln1_g(:,i), ln1_b(:,i), ln2_g(:,i), ln2_b(:,i), &
        n_head, use_kv_cache, kv_cache(:,:,:,i))
end do
x = layer_norm(x, lnf_g, lnf_b, 1e-5)
!y = matmul(transpose(wte), x)
call matmul_2d_t(wte, x, y)
end function

subroutine generate(output, n_tokens_to_generate, m, &
        n_seq, input, &
        use_cache, &
        byte_decoder, stop_text)
integer, intent(in) :: n_seq, n_tokens_to_generate
type(model_t), intent(in) :: m
integer, intent(in) :: input(n_seq)
logical, intent(in) :: use_cache
integer, intent(in) :: byte_decoder(:)
character(*), intent(in), optional :: stop_text ! Stop if you see this text
integer, allocatable, intent(out) :: output(:)
real(sp), allocatable :: logits(:,:)
integer :: i
integer :: n_seq2, n_seq_x
integer :: next_id
integer :: input2(size(input)+n_tokens_to_generate)
logical :: use_kv_cache
real(sp) :: kv_cache(m%n_embd,n_seq+n_tokens_to_generate,2,m%n_layer)
real(sp), allocatable :: kv_cache2(:,:,:,:)
character(:), allocatable :: output_txt, last_token
if (present(stop_text)) then
    allocate(character(0) :: output_txt)
    output_txt = ""
end if
input2(:n_seq) = input
do i = 1, n_tokens_to_generate
    if (use_cache) then
        use_kv_cache = (i > 1) ! Use cache for subsequent tokens
    else
        use_kv_cache = .false.
    end if
    n_seq2 = n_seq+i-1
    if (use_kv_cache) then
        n_seq_x = 1
    else
        n_seq_x = n_seq2
    end if
    allocate(kv_cache2(m%n_embd,n_seq2,2,m%n_layer))
    kv_cache2(:,:,:,:) = kv_cache(:,:n_seq2,:,:)
    allocate(logits(m%n_vocab, n_seq_x))
    logits = gpt2(m%n_vocab, m%n_ctx, n_seq2, n_seq_x, m%n_embd, m%n_layer, &
            m%n_head, &
            input2(:n_seq2), &
            m%wte, m%wpe, &
            m%mlp_fc_w, m%mlp_fc_b, m%mlp_proj_w, m%mlp_proj_b, &
            m%attn_w, m%attn_b, m%attn_proj_w, m%attn_proj_b, &
            m%ln1_g, m%ln1_b, m%ln2_g, m%ln2_b, m%lnf_g, m%lnf_b, use_kv_cache,&
            kv_cache2)
    kv_cache(:,:n_seq2,:,:) = kv_cache2(:,:,:,:)
    deallocate(kv_cache2)
    next_id = maxloc(logits(:,n_seq_x), dim=1)-1
    input2(n_seq2+1) = next_id
    last_token = decode([next_id], m%decoder_idx, &
        m%decoder_txt, byte_decoder)
    write(*, fmt="(a)", advance="no") last_token
    if (present(stop_text)) then
        output_txt = output_txt // last_token
        if (output_txt(len(output_txt)-len(stop_text)+1:len(output_txt)) == stop_text) then
            exit
        end if
    end if
    deallocate(logits)
end do
allocate(output(n_seq2 - n_seq + 1))
output(:) = input2(n_seq+1:n_seq2+1)
end subroutine

end module


================================================
FILE: input
================================================
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine


================================================
FILE: linalg_accelerate.c
================================================
/*
This file provides matvec implementation using the macOS Accelerate
Framework, which seems to be the most optimized matrix matrix multiplication
on macOS.
*/
#include <CoreFoundation/CFAttributedString.h>
#include <Accelerate/Accelerate.h>

void acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {
    //A[m][k]
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, m, B, k, 0.0, C, m);
}

void acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {
    //A[k][m] (to be transposed)
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, A, k, B, k, 0.0, C, m);
}


================================================
FILE: linalg_c.f90
================================================
module linalg
! C implementation of the matmul routines
use iso_c_binding, only: c_int, c_float
implicit none

integer, parameter :: sp = kind(0.0)

interface
    subroutine acc_sgemm(m, n, k, A, B, C) bind(c)
    import :: c_int, c_float
    implicit none
    integer(c_int), value, intent(in) :: m, n, k
    real(c_float), intent(in) :: A(m,k), B(k,n)
    real(c_float), intent(out) :: C(m,n)
    end subroutine

    subroutine acc_sgemm_t(m, n, k, A, B, C) bind(c)
    import :: c_int, c_float
    implicit none
    integer(c_int), value, intent(in) :: m, n, k
    real(c_float), intent(in) :: A(k,m), B(k,n)
    real(c_float), intent(out) :: C(m,n)
    end subroutine
end interface

contains

    subroutine matmul_2d(A, B, C)
    ! C = matmul(A, B)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    call acc_sgemm(size(A,1), size(B,2), size(A,2), A, B, C)
    end subroutine

    subroutine matmul_2d_t(A, B, C)
    ! C = matmul(transpose(A), B)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    call acc_sgemm_t(size(A,2), size(B,2), size(A,1), A, B, C)
    end subroutine

end module


================================================
FILE: linalg_f.f90
================================================
module linalg
! Pure Fortran implementation of the matmul routines
implicit none

integer, parameter :: sp = kind(0.0)

contains

    subroutine matmul_2d(A, B, C)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    C = matmul(A, B)
    end subroutine

    subroutine matmul_2d_t(A, B, C)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    C = matmul(transpose(A), B)
    end subroutine

end module


================================================
FILE: linalg_openblas.c
================================================
/*
This file provides matvec implementation using OpenBLAS.
*/
#include <cblas.h>

void acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {
    //A[m][k]
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, m, B, k, 0.0, C, m);
}

void acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {
    //A[k][m] (to be transposed)
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, A, k, B, k, 0.0, C, m);
}


================================================
FILE: main.f90
================================================
program gpt2
use driver, only: gpt2_driver, model_t
implicit none
integer, allocatable :: input(:), output(:)
type(model_t) :: m
call gpt2_driver(input, output, m)
end program


================================================
FILE: omp.f90
================================================
module omp
implicit none
private
public :: omp_get_wtime

integer, parameter :: dp = kind(0.d0)

interface
    real(dp) function omp_get_wtime()
    import :: dp
    end function
end interface

end module


================================================
FILE: omp_dummy.f90
================================================
module omp
implicit none
private
public :: omp_get_wtime

integer, parameter :: dp = kind(0.d0)

contains

real(dp) function omp_get_wtime()
omp_get_wtime = 0
end function

end module


================================================
FILE: pt.py
================================================
from time import monotonic as clock
import os; os.environ["OMP_NUM_THREADS"] = "1"
print("Importing")
t1 = clock()
from transformers import pipeline
t2 = clock()
print("  Time: ", t2-t1)
print("Loading")
t1 = clock()
generator = pipeline('text-generation', model='gpt2')
t2 = clock()
print("  Time: ", t2-t1)
text="Alan Turing theorized that computers would one day become very powerful, but even he could not imagine"
print("Generating")
t1 = clock()
g = generator(text, do_sample=False, max_new_tokens=20, use_cache=True)
t2 = clock()
print("  Time: ", t2-t1)
output = g[0]["generated_text"]
print(output)


================================================
FILE: tests/test_basic_input.f90
================================================
program test_basic_input
use driver, only: gpt2_driver, model_t
implicit none

type(model_t) :: m

integer, parameter :: input_ref(*) = [36235, 39141, 18765, 1143, 326, 9061, &
    561, 530, 1110, 1716, 845, 3665, 11, 475, 772, 339, 714, 407, 5967]
integer, parameter :: output_ref(*) = [703, 484, 561, 307, 1498, 284, 466, &
    523, 13, 198, 198, 1, 40, 892, 326, 262, 749, 1593, 1517, 318]
integer, allocatable :: input(:), output(:)

call gpt2_driver(input, output, m)

print *
print *, "TESTS:"

if (all(input == input_ref)) then
    print *, "Input tokens agree with reference results"
else
    print *, "Input tokens DO NOT agree with reference results"
    error stop
end if

if (all(output == output_ref)) then
    print *, "Output tokens agree with reference results"
else
    print *, "Output tokens DO NOT agree with reference results"
    error stop
end if


end program


================================================
FILE: tests/test_chat.f90
================================================
program test_chat
use driver, only: chat
use tokenizer, only: string
implicit none
type(string), allocatable :: inputs(:)
inputs = [ &
    string("What color does the sky have?"), &
    string("What can you type a document on?"), &
    string("What can you drive in?"), &
    string("What can you fly in?"), &
    string("What continent is Germany in?"), &
    string("When did Second World War start?"), &
    string("When did it end?"), &
    string("When did the U.S. enter the Second World War?"), &
    string("When did the First World War start?"), &
    string("When did it end?"), &
    string("When did the Mexican-American war start?"), &
    string("When did it end?"), &
    string("What color is snow?"), &
    string("What color do plants usually have?") &
    ]
call chat(inputs(:3))
end program


================================================
FILE: tests/test_more_inputs.f90
================================================
program test_more_inputs
use driver, only: gpt2_driver2, model_t, load_model
implicit none

type(model_t) :: m
integer, parameter :: input_ref(*) = [46, 358, 129, 247, 68, 73, 34754, 234, &
    861, 8836, 74, 373, 4642, 287]
integer, parameter :: output_ref(*) = [1248, 5332, 287, 262, 7404, 286, &
    25370, 254, 368, 83, 6557, 81, 11]
integer, allocatable :: input(:), output(:)

call load_model("model.gguf", m)

call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
print *
print *, "TESTS:"
call test(input, input_ref, "Input")
call test(output, output_ref, "Output")

call gpt2_driver2("San Francisco is", 8, m, input, output)
print *
print *, "TESTS:"
call test(input, [15017, 6033, 318], "Input")
call test(output, [257, 1748, 286, 517, 621, 352, 1510, 661], "Output")

call gpt2_driver2("Cars are", 13, m, input, output)
print *
print *, "TESTS:"
call test(input, [34, 945, 389], "Input")
call test(output, [407, 3142, 284, 307, 973, 287, 262, 7647, 1256, 286, &
    257, 7072, 13], "Output")

contains

subroutine test(a, a_ref, text)
integer, intent(in) :: a(:), a_ref(:)
character(*), intent(in) :: text
if (all(a == a_ref)) then
    print *, text, ": OK"
else
    print *, text, ": FAIL"
    error stop
end if
end subroutine

end program


================================================
FILE: tokenizer.f90
================================================
module tokenizer
implicit none

type :: string
    character(:), allocatable :: s
end type

contains

function c2s(x) result(y)
character, intent(in) :: x(:)
character(:), allocatable :: y
integer :: i
allocate(character(size(x)) :: y)
do i = 1, size(x)
    y(i:i) = x(i)
end do
end function

function next_token(input, i) result(y)
! TODO: tokenize exactly according to this regex:
! re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
! Right now we are more greedy, but the bpe() tokenizer seems to still return
! exactly the same tokens for most inputs (it is not clear if for all inputs).
character(*), intent(in) :: input
integer, intent(inout) :: i
character(:), allocatable :: y
if (i > len(input)) then
    y = ""
else if (input(i:i) == " ") then
    y = tokenize_word(input, i)
else if (input(i:i) == "," .or. input(i:i) == ".") then
    y = input(i:i)
    i = i + 1
else
    y = tokenize_word(input, i)
end if
end function

function tokenize_word(input, i) result(y)
character(*), intent(in) :: input
integer, intent(inout) :: i
character(:), allocatable :: y
integer :: i0
i0 = i
if (input(i:i) == " ") then
    i = i + 1
end if
do
    if (i > len(input)) then
        y = input(i0:i-1)
        exit
    end if
    if (input(i:i) == " " .or. input(i:i) == "," .or. input(i:i) == ".") then
        y = input(i0:i-1)
        exit
    end if
    i = i + 1
end do
end function

function word_idx(word, idx, decoder_txt) result(token)
character(*), intent(in) :: word
integer, intent(in) :: idx(0:)
character, intent(in) :: decoder_txt(:)
integer :: token
integer :: i
! This is O(n) search instead of O(1) lookup in a dictionary, so it is slow
do i = 0, ubound(idx,1)-1
    if (c2s(decoder_txt(idx(i)+1:idx(i+1))) == word) then
        token = i
        return
    end if
end do
token = -1
end function

subroutine codepoint_to_utf8(s, c)
! UTF-32 -> UTF-8
character(:), allocatable, intent(inout) :: s
integer, intent(in) :: c
integer :: d1, d2
if (c < 128) then
    s = s // achar(c)
else if (c < 2048) then
    d1 = ior(ishft(c, -6), 192)
    d2 = iand(ior(c, 128), 191)
    s = s // achar(d1) // achar(d2)
else
    error stop "UTF-32 range not supported"
end if
end subroutine

function utf8_to_codepoint(s, i) result(c)
! UTF-8 -> UTF-32
character(*), intent(in) :: s
integer, intent(inout) :: i
integer :: c, d
c = iachar(s(i:i))
if (c >= 128) then
    i = i + 1
    d = iachar(s(i:i))
    c = ior(ishft(iand(c, 31), 6), iand(d, 63))
end if
if (c >= 2048) then
    error stop "UTF-8 range not supported"
end if
end function

function merge_pair(intokens, idx) result(tokens)
! Merge the pair `idx`
type(string), intent(in) :: intokens(:)
integer, intent(in) :: idx
type(string), allocatable :: tokens(:)
allocate(tokens(size(intokens)-1))
tokens(:idx-1) = intokens(:idx-1)
tokens(idx)%s = intokens(idx)%s // intokens(idx+1)%s
tokens(idx+1:) = intokens(idx+2:)
end function

function merge_utf8_pairs(intokens) result(tokens)
! Merge all UTF-8 character pairs
type(string), intent(in) :: intokens(:)
type(string), allocatable :: tokens(:), tmp_tokens(:)
integer :: i, j
logical :: one_more_pass
allocate(tokens(size(intokens)))
tokens = intokens
one_more_pass = .true.
j = 1
do while(one_more_pass)
    one_more_pass = .false.
    do i = j, size(tokens)-1
        if (len(tokens(i)%s) == 1 .and. iachar(tokens(i)%s(1:1)) >= 128) then
            tmp_tokens = merge_pair(tokens, i)
            deallocate(tokens)
            call move_alloc(tmp_tokens, tokens)
            one_more_pass = .true.
            j = i + 1
            exit
        end if
    end do
end do
!print *, "tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
end function

function bpe(token, vocab_idx, vocab_txt) result(tokens)
! Takes a token as a string, and returns bpe tokens as an array of strings
character(*), intent(in) :: token
integer, intent(in) :: vocab_idx(0:)
character, intent(in) :: vocab_txt(:)
type(string), allocatable :: tokens(:), tmp_tokens(:)
integer, allocatable :: pair_scores(:)
integer :: not_found, merge_pair_idx
integer :: i
not_found = size(vocab_idx) + 10
allocate(tokens(len(token)))
do i = 1, len(token)
    tokens(i)%s = token(i:i)
end do
tmp_tokens = merge_utf8_pairs(tokens)
deallocate(tokens)
call move_alloc(tmp_tokens, tokens)
do
    !print *, "tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
    if (size(tokens) == 1) then
        ! The token pairs were either all merged into one word, or the input
        ! token was a one character word, either way we are done:
        exit
    end if
    allocate(pair_scores(size(tokens)-1))
    ! Loop over pairs
    do i = 1, size(tokens)-1
        pair_scores(i) = word_idx(tokens(i)%s // " " // tokens(i+1)%s, vocab_idx, vocab_txt)
        if (pair_scores(i) == -1) pair_scores(i) = not_found
    end do
    merge_pair_idx = minloc(pair_scores, 1)
    if (pair_scores(merge_pair_idx) == not_found) then
        ! No token pair can be merged, so we are done:
        exit
    end if
    !print *, pair_scores
    !print *, merge_pair_idx, pair_scores(merge_pair_idx)
    tmp_tokens = merge_pair(tokens, merge_pair_idx)
    deallocate(tokens)
    call move_alloc(tmp_tokens, tokens)
    deallocate(pair_scores)
end do
!print *, "final tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
end function

function encode(input, idx, decoder_txt, vocab_idx, vocab_txt, byte_encoder) &
        result(tokens2)
character(*), intent(in) :: input
integer, intent(in) :: idx(0:), vocab_idx(0:), byte_encoder(0:)
character, intent(in) :: decoder_txt(:), vocab_txt(:)
integer, parameter :: max_tokens = 2048
integer :: tokens(max_tokens)
integer, allocatable :: tokens2(:)
character(:), allocatable :: tmp, tmp2
type(string), allocatable :: bpe_tokens(:)
integer :: i, j, c, n_tokens
n_tokens = 0
i = 1
do
    tmp = next_token(input, i)
    if (tmp == "") exit
    tmp2 = ""
    do j = 1, len(tmp)
        c = iachar(tmp(j:j))
        c = byte_encoder(c)
        ! c is UTF-32 (4 bytes), but only the range [0, 324] is used
        ! Encode c from UTF-32 to UTF-8. Due to the limited range
        ! either one or two bytes of UTF-8 are appended to tmp2:
        call codepoint_to_utf8(tmp2, c)
    end do
    if (allocated(bpe_tokens)) deallocate(bpe_tokens)
    bpe_tokens = bpe(tmp2, vocab_idx, vocab_txt)
    do j = 1, size(bpe_tokens)
        n_tokens = n_tokens + 1
        if (n_tokens > max_tokens) error stop "exceeded max_tokens"
        tokens(n_tokens) = word_idx(bpe_tokens(j)%s, idx, decoder_txt)
    end do
    deallocate(tmp2)
end do
allocate(tokens2(n_tokens))
tokens2(:) = tokens(:n_tokens)
end function

function decode(tokens, idx, decoder_txt, byte_decoder) result(output)
integer, intent(in) :: tokens(:), idx(0:), byte_decoder(0:)
character, intent(in) :: decoder_txt(:)
character(:), allocatable :: output
character(:), allocatable :: output2, tmp
integer :: i, c
allocate(character(0) :: output2) ! Fix GFortran warning
output2 = ""
do i = 1, size(tokens)
    if (tokens(i) < 0) error stop "tokens(i) < 0"
    output2 = output2 // c2s(decoder_txt(idx(tokens(i))+1:idx(tokens(i)+1)))
end do
i = 1
output = ""
do
    ! Decode UTF-8 (one or more bytes) to UTF-32 code point (always 4 bytes),
    ! However for GPT-2 it seems only range 0-323 is used from UTF-32.
    c = utf8_to_codepoint(output2, i)
    ! [0,324] -> [0,255]
    if (c < 0 .or. c > ubound(byte_decoder,1)) then
        print *, "Codepoint out of range for byte decoder:", c, ubound(byte_decoder,1)
        error stop
    end if
    tmp = achar(byte_decoder(c))
    output = output // tmp
    if (i == len(output2)) exit
    i = i + 1
end do
end function

end module