Repository: certik/fastGPT Branch: main Commit: db135c7d2f4b Files: 31 Total size: 67.0 KB Directory structure: gitextract_nupenw_f/ ├── .github/ │ └── workflows/ │ └── CI.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── build.sh ├── chat.f90 ├── ci/ │ ├── build.sh │ └── build_lfortran.sh ├── cmake/ │ ├── FindOMP.cmake │ ├── FindOPENBLAS.cmake │ └── UserOverride.cmake ├── comparison/ │ └── encode_input.py ├── create_model.py ├── driver.f90 ├── environment.yml ├── fpm.toml ├── gpt2.f90 ├── input ├── linalg_accelerate.c ├── linalg_c.f90 ├── linalg_f.f90 ├── linalg_openblas.c ├── main.f90 ├── omp.f90 ├── omp_dummy.f90 ├── pt.py ├── tests/ │ ├── test_basic_input.f90 │ ├── test_chat.f90 │ └── test_more_inputs.f90 └── tokenizer.f90 ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/CI.yml ================================================ name: CI on: push: branches: - main pull_request: branches: - main jobs: gfortran: name: GFortran (${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["macos-latest", "ubuntu-latest"] steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - uses: mamba-org/setup-micromamba@v2.0.2 with: micromamba-version: '2.0.4-0' environment-file: environment.yml create-args: >- ${{ matrix.os == 'macos-latest' && 'gfortran=14.2.0' || '' }} - name: Install GGUF shell: bash -e -x -l {0} run: | git clone https://github.com/ggerganov/llama.cpp cd llama.cpp git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc cd gguf-py pip install . cd ../.. - name: Build and run shell: bash -l {0} run: | ci/build.sh lfortran: name: LFortran (${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["macos-latest", "ubuntu-latest"] steps: - uses: actions/checkout@v3 with: fetch-depth: 0 - uses: mamba-org/setup-micromamba@v2.0.2 with: micromamba-version: '2.0.4-0' environment-file: environment.yml create-args: >- lfortran=0.60.0 ${{ matrix.os == 'ubuntu-latest' && 'llvm-openmp=11.1.0' || '' }} - name: Build and run shell: bash -l {0} run: | ci/build_lfortran.sh ================================================ FILE: .gitignore ================================================ gpt2/ encoder.json input.dat model.dat vocab.bpe ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_SOURCE_DIR}/cmake/UserOverride.cmake) if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (Debug, Release)") endif() project(fastGPT) enable_language(Fortran) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/mod_files) # Make sure that CMAKE_BUILD_TYPE is either Debug or Release: if (NOT CMAKE_BUILD_TYPE MATCHES "Debug|Release") message(FATAL_ERROR "CMAKE_BUILD_TYPE must be one of: Debug, Release (current value: '${CMAKE_BUILD_TYPE}')") endif () if (APPLE) set(DEFAULT_FASTGPT_BLAS "Accelerate") else() set(DEFAULT_FASTGPT_BLAS "Fortran") endif() set(FASTGPT_BLAS ${DEFAULT_FASTGPT_BLAS} CACHE STRING "The BLAS library that fastGPT should use") if (NOT FASTGPT_BLAS MATCHES "Accelerate|OpenBLAS|Fortran") message(FATAL_ERROR "FASTGPT_BLAS must be one of: OpenBLAS, Accelerate, Fortran (current value: '${FASTGPT_BLAS}')") endif () if (FASTGPT_BLAS STREQUAL "Accelerate") find_package(OMP) 
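    # Fall back to CMake's built-in OpenMP detection when the bare `omp`
    # library from cmake/FindOMP.cmake is not found.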
    if(NOT OMP_FOUND)
        find_package(OpenMP REQUIRED COMPONENTS Fortran)
    endif()
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    find_package(OPENBLAS REQUIRED)
    find_package(OMP)
    if(NOT OMP_FOUND)
        find_package(OpenMP REQUIRED COMPONENTS Fortran)
    endif()
else()
    # pass
endif()

enable_testing()

set(SRC
    gpt2.f90
    tokenizer.f90
    driver.f90
)
if (FASTGPT_BLAS STREQUAL "Accelerate")
    list(APPEND SRC
        linalg_accelerate.c
        linalg_c.f90
        omp.f90
    )
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    list(APPEND SRC
        linalg_openblas.c
        linalg_c.f90
        omp.f90
    )
else()
    list(APPEND SRC
        linalg_f.f90
        omp_dummy.f90
    )
endif()
add_library(fastgpt ${SRC})
if (FASTGPT_BLAS STREQUAL "Accelerate")
    target_link_options(fastgpt PUBLIC -framework accelerate)
    target_link_libraries(fastgpt PUBLIC "$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>")
elseif (FASTGPT_BLAS STREQUAL "OpenBLAS")
    target_link_libraries(fastgpt p::openblas "$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>")
endif()

add_executable(gpt2 main.f90)
target_link_libraries(gpt2 fastgpt)

add_executable(chat chat.f90)
target_link_libraries(chat fastgpt)

add_executable(test_basic_input tests/test_basic_input.f90)
target_link_libraries(test_basic_input fastgpt)
add_test(test_basic_input ${PROJECT_BINARY_DIR}/test_basic_input)

add_executable(test_more_inputs tests/test_more_inputs.f90)
target_link_libraries(test_more_inputs fastgpt)
add_test(test_more_inputs ${PROJECT_BINARY_DIR}/test_more_inputs)

add_executable(test_chat tests/test_chat.f90)
target_link_libraries(test_chat fastgpt)
add_test(test_chat ${PROJECT_BINARY_DIR}/test_chat)

if(NOT PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
    # Git auto-ignore out-of-source build directory
    file(GENERATE OUTPUT .gitignore CONTENT "*")
endif()

message("\n")
message("Configuration results")
message("---------------------")
message("Fortran compiler: ${CMAKE_Fortran_COMPILER}")
message("Build type: ${CMAKE_BUILD_TYPE}")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    message("Fortran compiler flags: ${CMAKE_Fortran_FLAGS_DEBUG}")
else ()
    message("Fortran compiler flags: ${CMAKE_Fortran_FLAGS_RELEASE}")
endif ()
message("Installation prefix: ${CMAKE_INSTALL_PREFIX}")
message("FASTGPT_BLAS: ${FASTGPT_BLAS}")

================================================
FILE: LICENSE
================================================
Copyright 2023 Ondřej Čertík

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# fastGPT

The progression of GPT-2 codes from the original to "minimal", "nano" and "pico":

* [openai/gpt-2](https://github.com/openai/gpt-2)
* [karpathy/minGPT](https://github.com/karpathy/mingpt)
* [karpathy/nanoGPT](https://github.com/karpathy/nanogpt)
* [jaymody/picoGPT](https://github.com/jaymody/picoGPT)

`fastGPT` is very similar to `picoGPT` (very small and readable), but it is also fast (see the Benchmarks section below). The speed and readability are achieved by using Fortran. I wrote a [blog post](https://ondrejcertik.com/blog/2023/03/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/) introducing fastGPT.

`fastGPT` features:

* Fast? ✅
* Training code? ❌
* Batch inference? ❌
* top-p sampling? ❌ top-k? ❌ temperature? ❌ categorical sampling?! ❌ greedy? ✅
* Readable? ✅
* Small? ✅

A quick breakdown of each of the files:

* `gpt2.f90`: the actual GPT-2 model and a decoder
* `main.f90`: the main driver
* `create_model.py`: downloads the TensorFlow model and converts it to the GGUF format (`model.gguf`)
* `encode_input.py`: encodes the text input into tokens (input file for `gpt2`)
* Matmul implementations
    * `linalg_f.f90` native Fortran
    * `linalg_c.f90`, `linalg_accelerate.c` macOS Accelerate Framework
    * `linalg_c.f90`, `linalg_openblas.c` OpenBLAS
* `pt.py`: a reference script to run PyTorch (returns the same answer)

## Getting Started

### Install prerequisites

```bash
mamba env create -f environment.yml
conda activate fastgpt
```

### Configure and build

#### Fortran Package Manager (fpm)

```bash
fpm build
```

#### CMake

```bash
FC=gfortran cmake .
make
```

### Download the GPT-2 model weights

```bash
curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf
```

You can also download the 355M weights for the `gpt2-medium` model. Now you can modify the `input` file to change the input string and set other parameters.

### Run (requires `model.gguf` and `input` in the current directory)

If you built with `cmake`, execute

```bash
./gpt2
```

Alternatively, if you built with `fpm`, execute

```bash
fpm run gpt2
```

to launch a session with the predetermined prompt from the `input` file, or

```bash
fpm run chatgpt2
```

to launch an interactive chat session.

### Create the GGUF file

Create the `model.gguf` file from a given GPT-2 model. Supported sizes (with the corresponding names to be used in `pt.py` and the approximate download size): "124M" (`gpt2`, 0.5GB), "355M" (`gpt2-medium`, 1.5GB), "774M" (`gpt2-large`, 3GB), "1558M" (`gpt2-xl`, 6GB). This will download the model and cache it for subsequent runs:

```bash
python create_model.py --models_dir "models" --model_size "124M"
```

This script depends on the `gguf` Python library, which you can install using:

```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
cd gguf-py
pip install .
```

The `gguf` library is available on pip and conda, but we currently require a newer version than is released there. We used this script to create several GGUF files and uploaded them to https://huggingface.co/certik/fastGPT, so you can just download the pre-generated files.

### Example Output

The above `./gpt2` command prints on Apple M1 Max:

```
$ ./gpt2
Loading the model... done.
Time: 0.111s Model parameters: n_vocab = 50257 n_ctx = 1024 n_embd = 768 n_layer = 12 n_head = 12 Input text Alan Turing theorized that computers would one day become very powerful, but even he could not imagine Encoding: tokenizing input text into tokens (currently slow)... done. Time: 0.074s Input parameters: n_seq = 19 n_tokens_to_generate = 20 Input tokens: 36235 39141 18765 1143 326 9061 561 530 1110 1716 845 3665 11 475 772 339 714 407 5967 Decoded input as text: Alan Turing theorized that computers would one day become very powerful, but even he could not imagine Running model... how they would be able to do so. "I think that the most important thing is done. Time: 0.304s (1.01x) Output tokens: 703 484 561 307 1498 284 466 523 13 198 198 1 40 892 326 262 749 1593 1517 318 Decoded output as text: how they would be able to do so. "I think that the most important thing is ``` ### Chat interface Here is an example chat using the largest 1558M model: ``` $ ./chat Your name is fastGPT and you are an AI bot. The user will ask you questions and you answer in a nice, truthful, short way. User: What is the capital of Czechia? fastGPT: Prague. User: How many legs does a dog have? fastGPT: Four. User: What color does the sky have? fastGPT: Blue. User: What can you type a document on? fastGPT: A typewriter. User: What can you drive in? fastGPT: A car. User: What can you fly in? fastGPT: A plane. User: What continent is Germany in? fastGPT: Europe. User: When did Second World War start? fastGPT: 1939. User: When did it end? fastGPT: 1945. User: When did the U.S. enter the Second World War? fastGPT: 1941. User: When did the First World War start? fastGPT: 1914. User: When did it end? fastGPT: 1918. User: When did the Mexican-American war start? fastGPT: 1846. User: When did it end? fastGPT: 1848. User: What color is snow? fastGPT: White. User: What color do plants usually have? fastGPT: Green. User: What is your name? fastGPT: fastGPT. 
```

### BLAS Implementation

You can choose which BLAS implementation to use for `matmul` using:

* `-DFASTGPT_BLAS=OpenBLAS`: Use OpenBLAS
* `-DFASTGPT_BLAS=Accelerate`: Use the macOS Accelerate Framework (the default on macOS)
* `-DFASTGPT_BLAS=Fortran`: Use Fortran's intrinsic `matmul` (the default on other platforms)

## Benchmarks

On Apple M1 Max, inference of the above input file (20 tokens):

|                                 | 1 core | 2 cores | 4 cores | 8 cores |
|---------------------------------|--------|---------|---------|---------|
| fastGPT (Accelerate, fast_tanh) | 0.288s |         |         |         |
| fastGPT (Accelerate)            | 0.299s |         |         |         |
| PyTorch (Accelerate)            | 0.346s |         |         |         |
| fastGPT (OpenBLAS)              | 0.837s | 0.514s  | 0.341s  | 0.339s  |
| PyTorch (OpenBLAS)              | 0.873s | 0.539s  | 0.386s  | 0.392s  |
| fastGPT (Accelerate, no cache)  | 0.717s |         |         |         |
| picoGPT (Accelerate, no cache)  | 0.765s |         |         |         |
| PyTorch (Accelerate, no cache)  | 0.787s |         |         |         |
| fastGPT (OpenBLAS, no cache)    | 2.343s | 1.603s  | 1.209s  | 1.018s  |
| PyTorch (OpenBLAS, no cache)    | 2.356s | 1.520s  | 1.104s  | 0.997s  |
| picoGPT (OpenBLAS, no cache)    | 2.427s | 1.645s  | 1.272s  | 1.081s  |

Total run (includes loading the model and Python imports):

* fastGPT (Accelerate, fast_tanh): 0.401s
* picoGPT (8 cores): 3.445s
* PyTorch (OpenBLAS, 4 cores): 4.867s

## TODO

* [ ] Parallelization:
    * [ ] Over heads: https://github.com/certik/fastGPT/issues/2
    * [ ] MPI: https://github.com/certik/fastGPT/issues/5
* [ ] Other sampling methods: https://github.com/certik/fastGPT/issues/8
* [ ] Batching: https://github.com/certik/fastGPT/issues/7
* [x] Improve the UI:
    * [x] Implement the input tokenizer in Fortran: https://github.com/certik/fastGPT/issues/1
    * [x] Show the words as they are generated: https://github.com/certik/fastGPT/issues/6

================================================
FILE: build.sh
================================================
#!/bin/bash

set -ex

FC=gfortran cmake -Bbuild
cmake --build build --parallel
python create_model.py --models_dir "../gpt2/models" --model_size "124M"
python encode_input.py \
    "Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
    -n 20
build/gpt2

================================================
FILE: chat.f90
================================================
program chatgpt2
use driver, only: chat
implicit none
call chat()
end program

================================================
FILE: ci/build.sh
================================================
#!/bin/bash

set -ex

cmake .
make
mkdir models
python create_model.py --models_dir "models" --model_size "124M"
./gpt2
ctest

make clean
rm CMakeCache.txt
cmake -DFASTGPT_BLAS=OpenBLAS .
make
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2

rm model.gguf
curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
rm gpt2

python pt.py

================================================
FILE: ci/build_lfortran.sh
================================================
#!/bin/bash

set -ex

curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf

mkdir lf
cd lf
FC=lfortran CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Debug ..
make VERBOSE=1
ln -s ../model.gguf .
ln -s ../input .
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs
cd ..

mkdir lf-fast
cd lf-fast
FC="lfortran --fast" CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Release ..
make VERBOSE=1
ln -s ../model.gguf .
ln -s ../input .
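# Time the optimized (lfortran --fast) build single-threaded, same as the Debug build above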
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input
time OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs
cd ..

================================================
FILE: cmake/FindOMP.cmake
================================================
find_path(OMP_INCLUDE_DIR omp.h)
find_library(OMP_LIBRARY omp)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OMP DEFAULT_MSG OMP_INCLUDE_DIR OMP_LIBRARY)

add_library(p::omp INTERFACE IMPORTED)
set_property(TARGET p::omp PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${OMP_INCLUDE_DIR})
set_property(TARGET p::omp PROPERTY INTERFACE_LINK_LIBRARIES ${OMP_LIBRARY})

================================================
FILE: cmake/FindOPENBLAS.cmake
================================================
find_path(OPENBLAS_INCLUDE_DIR NAMES cblas.h PATHS /usr/include/openblas)
find_library(OPENBLAS_LIBRARY NAMES openblas)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_INCLUDE_DIR OPENBLAS_LIBRARY)

add_library(p::openblas INTERFACE IMPORTED)
set_property(TARGET p::openblas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${OPENBLAS_INCLUDE_DIR})
set_property(TARGET p::openblas PROPERTY INTERFACE_LINK_LIBRARIES ${OPENBLAS_LIBRARY})

================================================
FILE: cmake/UserOverride.cmake
================================================
# This overrides the default CMake Debug and Release compiler options.
# The user can still specify different options by setting the
# CMAKE_Fortran_FLAGS_[RELEASE,DEBUG] variables (on the command line or in the
# CMakeLists.txt). This file serves as better CMake defaults and should only be
# modified if the default values are to be changed. Project specific compiler
# flags should be set in the CMakeLists.txt by setting the CMAKE_Fortran_FLAGS_*
# variables.

if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
    # gfortran
    set(common "-Wall -Wextra -Wimplicit-interface -fPIC")
    set(CMAKE_Fortran_FLAGS_RELEASE_INIT "${common} -O3 -march=native -ffast-math -funroll-loops")
    set(CMAKE_Fortran_FLAGS_DEBUG_INIT "${common} -g -fcheck=all -fbacktrace")
elseif (CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
    # ifort
    set(common "-warn all")
    set(CMAKE_Fortran_FLAGS_RELEASE_INIT "${common} -xHOST -O3 -no-prec-div -static")
    set(CMAKE_Fortran_FLAGS_DEBUG_INIT "${common} -check all")
endif ()

================================================
FILE: comparison/encode_input.py
================================================
"""
This script implements the encoding of an input string into tokens.

It requires two files in the current directory: encoder.json, vocab.bpe

It creates the file input.dat which contains the input tokens and how many
tokens to generate.

TODO: save the information from encoder.json and vocab.bpe into model.dat and
implement this encoder in Fortran.

Most of this file was taken from:
https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py
And it is licensed under:

Modified MIT License

Software Copyright (c) 2019 OpenAI

We don’t claim ownership of the content you create with GPT-2, so it is yours
to do with as you please. We only ask that you use GPT-2 responsibly and
clearly indicate your content was created using GPT-2.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software. The above copyright
notice and this permission notice need not be included with content created
by the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import numpy as np
import json
import os
import regex as re


def bytes_to_unicode():
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:
    def __init__(self, encoder, bpe_merges):
        self.encoder = encoder
        self.byte_encoder = bytes_to_unicode()
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)  # can trigger ValueError
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token))
        return bpe_tokens


def get_encoder():
    with open("encoder.json") as f:
        encoder = json.load(f)
    with open("vocab.bpe", encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)


def main(prompt: str, n_tokens_to_generate: int = 40):
    encoder = get_encoder()
    input_ids = np.array(encoder.encode(prompt), dtype=np.int32)
    print("Saving the input into `input.dat`")
    g = open("input.dat", "wb")  # binary mode: tofile() writes raw bytes
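    # input.dat layout: two int32 values (number of input tokens, number of
    # tokens to generate), followed by the int32 token ids themselves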
np.array([len(input_ids), n_tokens_to_generate], dtype=np.int32).tofile(g) input_ids.tofile(g) print(input_ids) if __name__ == "__main__": import fire fire.Fire(main) ================================================ FILE: create_model.py ================================================ """ This script loads the specified GPT-2 model from OpenAI using TensorFlow, converts it into our custom format and saves it to `model.gguf`, which contains everything (all the parameters, all the weights, encoding/decoding information). Parts of this script were taken from the picoGPT project: https://github.com/jaymody/picoGPT Those are licensed as: MIT License Copyright (c) 2023 Jay Mody Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ from time import monotonic as clock import os import json import re from shutil import copyfile import numpy as np import gguf import requests import tensorflow as tf from tqdm import tqdm def download_gpt2_files(model_size, model_dir): assert model_size in ["124M", "355M", "774M", "1558M"] for filename in [ "checkpoint", "encoder.json", "hparams.json", "model.ckpt.data-00000-of-00001", "model.ckpt.index", "model.ckpt.meta", "vocab.bpe", ]: url = "https://openaipublic.blob.core.windows.net/gpt-2/models" r = requests.get(f"{url}/{model_size}/{filename}", stream=True) r.raise_for_status() with open(os.path.join(model_dir, filename), "wb") as f: file_size = int(r.headers["content-length"]) chunk_size = 1000 with tqdm( ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True, ) as pbar: # 1k for chunk_size, since Ethernet packet size is around 1500 bytes for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk) pbar.update(chunk_size) def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams): def set_in_nested_dict(d, keys, val): if not keys: return val if keys[0] not in d: d[keys[0]] = {} d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val) return d init_vars = tf.train.list_variables(tf_ckpt_path) params = {"blocks": [{} for _ in range(hparams["n_layer"])]} for name, _ in init_vars: array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name)) name = name.removeprefix("model/") if name.startswith("h"): m = re.match(r"h([0-9]+)/(.*)", name) n = int(m[1]) sub_name = m[2] set_in_nested_dict(params["blocks"][n], sub_name.split("/"), array) else: set_in_nested_dict(params, name.split("/"), array) return params def load_encoder_hparams_and_params(model_size, models_dir): assert model_size in ["124M", "355M", "774M", "1558M"] model_dir = os.path.join(models_dir, model_size) tf_ckpt_path = 
tf.train.latest_checkpoint(model_dir) if not tf_ckpt_path: # download files if necessary os.makedirs(model_dir, exist_ok=True) download_gpt2_files(model_size, model_dir) tf_ckpt_path = tf.train.latest_checkpoint(model_dir) hparams = json.load(open(os.path.join(model_dir, "hparams.json"))) params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams) return hparams, params def convert(params, n_head, n_ctx, idx, decoder_txt, vocab_idx, vocab_txt, byte_decoder): t1 = clock() blocks = params["blocks"] n_embd = blocks[0]["ln_1"]["b"].size n_layer = len(blocks) mlp_fc_w = np.empty((n_layer,n_embd,4*n_embd), dtype=np.float32) mlp_fc_b = np.empty((n_layer,4*n_embd), dtype=np.float32) mlp_proj_w = np.empty((n_layer,4*n_embd,n_embd), dtype=np.float32) mlp_proj_b = np.empty((n_layer,n_embd), dtype=np.float32) attn_w = np.empty((n_layer,n_embd,3*n_embd), dtype=np.float32) attn_b = np.empty((n_layer,3*n_embd), dtype=np.float32) attn_proj_w = np.empty((n_layer,n_embd,n_embd), dtype=np.float32) attn_proj_b = np.empty((n_layer,n_embd), dtype=np.float32) ln1_g = np.empty((n_layer,n_embd), dtype=np.float32) ln1_b = np.empty((n_layer,n_embd), dtype=np.float32) ln2_g = np.empty((n_layer,n_embd), dtype=np.float32) ln2_b = np.empty((n_layer,n_embd), dtype=np.float32) for i, block in enumerate(blocks): mlp_fc_w[i,:,:] = block["mlp"]["c_fc"]["w"] mlp_fc_b[i,:] = block["mlp"]["c_fc"]["b"] mlp_proj_w[i,:,:] = block["mlp"]["c_proj"]["w"] mlp_proj_b[i,:] = block["mlp"]["c_proj"]["b"] attn_w[i,:,:] = block["attn"]["c_attn"]["w"] attn_b[i,:] = block["attn"]["c_attn"]["b"] attn_proj_w[i,:,:] = block["attn"]["c_proj"]["w"] attn_proj_b[i,:] = block["attn"]["c_proj"]["b"] ln1_g[i,:] = block["ln_1"]["g"] ln1_b[i,:] = block["ln_1"]["b"] ln2_g[i,:] = block["ln_2"]["g"] ln2_b[i,:] = block["ln_2"]["b"] wte = params["wte"] wpe = params["wpe"] lnf_g = params["ln_f"]["g"] lnf_b = params["ln_f"]["b"] t2 = clock() print("Transform time: ", t2-t1) t1 = clock() n_vocab = np.size(wte, 0) assert np.size(wte, 1) == n_embd model_type = 0xfa51697 # fastGPT model_version = 2 header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head, len(idx),len(decoder_txt.encode("utf-8")), len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32) # Save the model to GGUF def save_gguf(data_offset_name, data_offset_value): g = gguf.GGUFWriter("model.gguf", None) g.add_int32(data_offset_name, data_offset_value) g.add_tensor("header", header) g.add_tensor("wte", wte); g.add_tensor("wpe", wpe) g.add_tensor("mlp_fc_w", mlp_fc_w); g.add_tensor("mlp_fc_b", mlp_fc_b) g.add_tensor("mlp_proj_w", mlp_proj_w); g.add_tensor("mlp_proj_b", mlp_proj_b) g.add_tensor("attn_w", attn_w); g.add_tensor("attn_b", attn_b) g.add_tensor("attn_proj_w", attn_proj_w); g.add_tensor("attn_proj_b", attn_proj_b) g.add_tensor("ln1_b", ln1_b); g.add_tensor("ln1_g", ln1_g) g.add_tensor("ln2_b", ln2_b); g.add_tensor("ln2_g", ln2_g) g.add_tensor("lnf_b", lnf_b); g.add_tensor("lnf_g", lnf_g) g.add_tensor("idx", idx) g.add_tensor("decoder_txt", np.frombuffer(decoder_txt.encode("utf-8"), dtype=np.int8)) g.add_tensor("vocab_idx", vocab_idx) g.add_tensor("vocab_txt", np.frombuffer(vocab_txt.encode("utf-8"), dtype=np.int8)) g.add_tensor("byte_decoder", byte_decoder) g.write_header_to_file() g.write_kv_data_to_file() g.write_tensors_to_file() g.close() data_offset_name = "general.data_offset" save_gguf(data_offset_name, 0) g = gguf.GGUFReader("model.gguf") data_offset = g.tensors[0].data_offset # * .offset: the offset of the kv entry # * 8: The 
i64 length of the key string # * 4: The i32 type of the value assert g.fields[data_offset_name].offset == 24 offset_offset = g.fields[data_offset_name].offset + 8 + \ len(data_offset_name) + 4 print("offset offset:", offset_offset) print("data offset:", data_offset) save_gguf(data_offset_name, data_offset) t2 = clock() print("Save time: ", t2-t1) def load_decoder(filename): D = json.load(open(filename)) D2 = {v: k for k, v in D.items()} i = 0 decoder = [] while True: if i not in D2: break decoder.append(D2[i]) i += 1 return decoder def load_vocab(filename): D = open(filename).read() D = D.split("\n") D = D[1:] return D def decoder_idx(decoder): i = 0 idx = np.empty(len(decoder)+1, dtype=np.int32) idx[0] = i for n, t in enumerate(decoder): i += len(t.encode("utf-8")) idx[n+1] = i assert idx[-1] == len("".join(decoder).encode("utf-8")) return idx def bytes_to_unicode(): bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] btu = dict(zip(bs, cs)) byte_decoder = {v: k for k, v in btu.items()} bd = np.zeros(324, dtype=np.int32) for y in byte_decoder: x = ord(y) bd[x] = byte_decoder[y] bd2 = np.zeros(256, dtype=np.int32) for i in range(np.size(bd)): bd2[bd[i]] = i return bd2 def main(model_size: str = "124M", models_dir: str = "models"): print("Loading model") # load encoder, hparams, and params from the released open-ai gpt-2 files t1 = clock() hparams, params = load_encoder_hparams_and_params(model_size, models_dir) decoder = load_decoder(os.path.join(models_dir, model_size, "encoder.json")) vocab = load_vocab(os.path.join(models_dir, model_size, "vocab.bpe")) t2 = clock() print(" Done. Loading time: ", t2-t1) # generate output ids print("Converting model, saving to `model.gguf`") t1 = clock() decoder_txt = "".join(decoder) idx = decoder_idx(decoder) vocab_txt = "".join(vocab) vocab_idx = decoder_idx(vocab) byte_decoder = bytes_to_unicode() convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt, vocab_idx, vocab_txt, byte_decoder) t2 = clock() print(" Done. Time: ", t2-t1) if __name__ == "__main__": import fire fire.Fire(main) ================================================ FILE: driver.f90 ================================================ module driver use gpt2_mod, only: generate, model_t use tokenizer, only: encode, decode, string use omp, only: omp_get_wtime implicit none integer, parameter :: sp = kind(0.0) integer, parameter :: dp = kind(0.d0) character(1), parameter :: LF = achar(10) contains subroutine load_input(filename, input_txt, n_tokens_to_generate) ! Load the input from a namelist `filename` character(*), intent(in) :: filename character(:), allocatable, intent(out) :: input_txt integer, intent(out) :: n_tokens_to_generate character(1024) :: input_txt2 integer :: u, ios namelist / input_fastGPT / n_tokens_to_generate allocate(character(0) :: input_txt) input_txt = "" open(newunit=u, file=filename, status="old") read(u, input_fastGPT) do read(u, "(a)", iostat=ios) input_txt2 if (ios /= 0) exit if (len(input_txt) > 0) input_txt = input_txt // char(10) input_txt = input_txt // trim(input_txt2) end do close(u) end subroutine ! Skips `amount` bytes from the current position subroutine fskip(u, amount) integer, intent(in) :: u, amount character, allocatable :: tmp(:) ! Note: the code below is equivalent to the non-standard: fseek(u, amount, 1) ! 
Let's allocate on heap, in case the skip is large allocate(tmp(amount)) read(u) tmp end subroutine ! Aligns file position in `u` to 32 byte boundary after `A` was read subroutine align_i4(u, A) integer, intent(in) :: u integer, intent(in) :: A(..) integer :: n, alignment alignment = 32 n = size(A)*4 call fskip(u, alignment-modulo(n,alignment)) end subroutine subroutine align_str(u, A) integer, intent(in) :: u character, intent(in) :: A(:) integer :: n, alignment alignment = 32 n = size(A) if (modulo(n, alignment) /= 0) then call fskip(u, alignment-modulo(n,alignment)) end if end subroutine subroutine load_model(filename, m) character(*), intent(in) :: filename type(model_t), intent(out) :: m ! We use the following fastGPT model type number ! fastGPT (digits look similar to the letters they represent) ! 0xfa51697 = 262477463 ! We read the offset to the data section at this position, which is the first ! variable in the metadata, the name is "general.data_offset", type i32. integer, parameter :: offset_offset = & ! header 4 + & ! u8[4] magic 4 + & ! u32 version 8 + & ! u64 n_arrays 8 + & ! u64 n_kv ! kv 8 + & ! u64 n_str 19 + & ! len("general.data_offset") 4 ! u32 type of value integer, parameter :: current_model_mark = 262477463 integer, parameter :: current_model_version = 2 integer :: model_mark integer :: u integer :: data_offset open(newunit=u, file=filename, form="unformatted", access="stream", status="old") call fskip(u, offset_offset) read(u) data_offset ! Alternatively we could have done: rewind(u); call fskip(u, data_offset) call fskip(u, data_offset-offset_offset-4) read(u) model_mark if (model_mark /= current_model_mark) then print *, "Found:", model_mark print *, "Expected:", current_model_mark error stop "Invalid fastGPT model file" end if read(u) m%model_file_version if (m%model_file_version /= current_model_version) then print *, "Found:", m%model_file_version print *, "Expected:", current_model_version error stop "Incompatible model version" end if read(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, & m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder call fskip(u, 16) ! 
Pad the 12 element i32 array to 32 byte boundary allocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), & m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), & m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), & m%attn_w(3*m%n_embd,m%n_embd,m%n_layer), m%attn_b(3*m%n_embd,m%n_layer), & m%attn_proj_w(m%n_embd,m%n_embd,m%n_layer), m%attn_proj_b(m%n_embd,m%n_layer), & m%ln1_b(m%n_embd,m%n_layer), m%ln1_g(m%n_embd,m%n_layer), & m%ln2_b(m%n_embd,m%n_layer), m%ln2_g(m%n_embd,m%n_layer), & m%lnf_b(m%n_embd), m%lnf_g(m%n_embd), & m%decoder_idx(0:m%n_decoder_idx-1), m%decoder_txt(m%n_decoder_txt), & m%vocab_idx(0:m%n_vocab_idx-1), m%vocab_txt(m%n_vocab_txt), & m%byte_encoder(0:m%n_byte_encoder-1)) read(u) m%wte, m%wpe, & m%mlp_fc_w, m%mlp_fc_b, & m%mlp_proj_w, m%mlp_proj_b, & m%attn_w, m%attn_b, & m%attn_proj_w, m%attn_proj_b, & m%ln1_b, m%ln1_g, & m%ln2_b, m%ln2_g, & m%lnf_b, m%lnf_g, & m%decoder_idx call align_i4(u, m%decoder_idx) read(u) m%decoder_txt call align_str(u, m%decoder_txt) read(u) m%vocab_idx call align_i4(u, m%vocab_idx) read(u) m%vocab_txt call align_str(u, m%vocab_txt) read(u) m%byte_encoder close(u) end subroutine subroutine gpt2_driver(input, output, m) integer, allocatable, intent(out) :: input(:), output(:) type(model_t), intent(out) :: m character(:), allocatable :: input_txt integer :: n_tokens_to_generate real(dp) :: t1, t2 call load_input("input", input_txt, n_tokens_to_generate) ! Load the model print "(a)", "Loading the model..." call cpu_time(t1) call load_model("model.gguf", m) call cpu_time(t2) print "(a,f8.3,a,i2)", " done. Time:", t2-t1, "s, Model file version:", m%model_file_version print * print "(a)", "Model parameters:" print "(a,i6)", "n_vocab =", m%n_vocab print "(a,i6)", "n_ctx =", m%n_ctx print "(a,i6)", "n_embd =", m%n_embd print "(a,i6)", "n_layer =", m%n_layer print "(a,i6)", "n_head =", m%n_head print * call gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output) endsubroutine subroutine gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output) character(*), intent(in) :: input_txt integer, intent(in) :: n_tokens_to_generate type(model_t), intent(in) :: m integer, allocatable, intent(out) :: input(:), output(:) integer, allocatable :: byte_decoder(:) integer :: n_seq character(:), allocatable :: output_txt real(dp) :: t1, t2, t1o, t2o integer :: i logical :: use_cache ! Compute byte_decoder: allocate(byte_decoder(0:maxval(m%byte_encoder))) byte_decoder = 0 do i = 0, size(m%byte_encoder)-1 byte_decoder(m%byte_encoder(i)) = i end do print "(a)", "Input text" print "(a)", input_txt print * print "(a)", "Encoding: tokenizing input text into tokens (currently slow)..." call cpu_time(t1) input = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, & m%byte_encoder) call cpu_time(t2) n_seq = size(input) print "(a,f8.3,a)", " done. Time:", t2-t1, "s" print * print "(a)", "Input parameters:" print "(a,i4)", "n_seq =", n_seq print "(a,i4)", "n_tokens_to_generate =", n_tokens_to_generate print * print "(a)", "Input tokens:" print "(1000(i6))", input print * if (n_seq + n_tokens_to_generate >= m%n_ctx) then print *, "The maximum sequence length of the model was surpassed." print *, "Make the input and/or number of tokens to generate shorter." error stop end if print "(a)", "Decoded input as text:" !print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder) allocate(character(0) :: output_txt) ! 
Fix GFortran warning output_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder) print "(a)", output_txt print * if (input_txt /= output_txt) then error stop "The decoded input text does not agree with the input text" end if print "(a)", "Running model..." call cpu_time(t1) t1o = omp_get_wtime() use_cache = .true. call generate(output, n_tokens_to_generate, m, size(input), input, use_cache, & byte_decoder) print * t2o = omp_get_wtime() call cpu_time(t2) print "(a,f8.3,a,f4.2,a)", " done. Time:", t2o-t1o, "s (", (t2-t1)/(t2o-t1o), "x)" print * print "(a)", "Output tokens:" print "(1000(i6))", output output_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder) print * print "(a)", "Decoded output as text:" print "(a)", output_txt end subroutine subroutine gpt2_driver3(input_txt, n_tokens_to_generate, stop_text, m, output_txt) character(*), intent(in) :: input_txt, stop_text integer, intent(in) :: n_tokens_to_generate type(model_t), intent(in) :: m integer, allocatable :: input(:), output(:) integer, allocatable :: byte_decoder(:) integer :: n_seq character(:), allocatable, intent(out) :: output_txt integer :: i logical :: use_cache ! TODO: move the decoder into model_t ! Compute byte_decoder: allocate(byte_decoder(0:maxval(m%byte_encoder))) byte_decoder = 0 do i = 0, size(m%byte_encoder)-1 byte_decoder(m%byte_encoder(i)) = i end do input = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, & m%byte_encoder) n_seq = size(input) if (n_seq + n_tokens_to_generate >= m%n_ctx) then print *, "The maximum sequence length of the model was surpassed." print *, "Make the input and/or number of tokens to generate shorter." error stop end if allocate(character(0) :: output_txt) ! Fix GFortran warning output_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder) if (input_txt /= output_txt) then error stop "The decoded input text does not agree with the input text" end if use_cache = .true. call generate(output, n_tokens_to_generate, m, size(input), input, use_cache, & byte_decoder, stop_text) output_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder) end subroutine function get_prompt() result(input) character(:), allocatable :: input character(1024) :: tmp integer ::ios read(*,"(a)",iostat=ios) tmp if (ios == 0) then input = trim(tmp) else input = "" end if end function subroutine chat(inputs) type(string), optional, intent(in) :: inputs(:) type(model_t) :: m character(:), allocatable :: prompt, input, output integer :: i, n_prompts call load_model("model.gguf", m) prompt = "Your name is fastGPT and you are an AI bot. The user will ask you & &questions and you answer in a nice, truthful, short way." // LF // "& &User: What is the capital of Czechia?" // LF // "& &fastGPT: Prague." // LF // "& &User: How many legs does a dog have?" // LF // "& &fastGPT: Four." 
// LF // "& &User:" write(*,"(a)",advance="no") prompt if (present(inputs)) then n_prompts = size(inputs) else n_prompts = 1024 end if do i = 1, n_prompts write(*,"(a)",advance="no") " " if (present(inputs)) then input = inputs(i)%s write(*,"(a)") input else input = get_prompt() if (input == "") exit end if write(*,"(a)",advance="no") "fastGPT:" prompt = prompt // " " // input // LF // "fastGPT:" call gpt2_driver3(prompt, 200, "User:", m, output) prompt = prompt // output end do print * end subroutine end module ================================================ FILE: environment.yml ================================================ name: fastgpt channels: - conda-forge dependencies: - python=3.9 - numpy=1.24.2 - tensorflow=2.11.0 - tqdm=4.65.0 - fire=0.4.0 - regex=2022.10.31 #- gfortran=14.2.0 - cmake=3.25.2 - transformers=4.26.1 - openblas=0.3.21 ================================================ FILE: fpm.toml ================================================ name = "fastGPT" ================================================ FILE: gpt2.f90 ================================================ module gpt2_mod use linalg, only: matmul_2d, matmul_2d_t use tokenizer, only: decode implicit none integer, parameter :: sp = kind(0.0) real(sp), parameter :: pi = 3.14159265358979323846_sp ! This derived type contains all the data of the GPT-2 model, including all ! weights, model parameters, and encoder/decoder data type :: model_t integer :: n_vocab, n_ctx, n_embd, n_layer, n_head, & n_decoder_idx, n_decoder_txt, & n_vocab_idx, n_vocab_txt, n_byte_encoder real(sp), allocatable :: wte(:,:), wpe(:,:), & mlp_fc_w(:,:,:), mlp_fc_b(:,:), & mlp_proj_w(:,:,:), mlp_proj_b(:,:), & attn_w(:,:,:), attn_b(:,:), & attn_proj_w(:,:,:), attn_proj_b(:,:), & ln1_b(:,:), ln1_g(:,:), & ln2_b(:,:), ln2_g(:,:), & lnf_b(:), lnf_g(:) integer, allocatable :: decoder_idx(:), vocab_idx(:), byte_encoder(:) character, allocatable :: decoder_txt(:), vocab_txt(:) integer :: model_file_version end type contains elemental real(sp) function fast_tanh(x) result(y) real(sp), intent(in) :: x real(sp) :: x2 if (x > 5) then y = 1 elseif (x < -5) then y = -1 else x2 = x*x y = x * (0.98569772605911309407 + x2 *(-0.2794500993392901382 & + x2 * (6.8280504526399188164e-2 + x2 * (-1.0972014877337651823e-2 & + x2 * (1.1132367134444316902e-3 + x2 * (-7.018851897305717565e-5 & + x2 * (2.656616768082727089e-6 + x2 * (-5.5138381821615909058e-8 & + x2 * 4.8162484477588665996e-10)))))))) end if end function elemental real(sp) function gelu(x) result(y) real(sp), intent(in) :: x y = 0.5_sp * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715_sp * x**3))) end function function softmax(x) result(y) real(sp), intent(in) :: x(:,:) real(sp) :: y(size(x,1),size(x,2)) integer :: i do i = 1, size(x,2) y(:,i) = exp(x(:,i) - maxval(x(:,i))) y(:,i) = y(:,i) / sum(y(:,i)) end do end function function layer_norm(x, g, b, eps) result(y) real(sp), intent(in) :: x(:,:), g(:), b(:), eps real(sp) :: y(size(x,1),size(x,2)) real(sp) :: mean(size(x,2)), variance(size(x,2)) integer :: i do i = 1, size(x,2) mean(i) = sum(x(:,i)) / size(x,1) variance(i) = sum((x(:,i) - mean(i))**2) / size(x,1) end do !do i = 1, size(x,1) ! y(i,:) = (x(i,:) - mean(:)) / sqrt(variance(:) + eps) ! 
y(i,:) = g(i) * y(i,:) + b(i) !end do do i = 1, size(x,2) y(:,i) = (x(:,i) - mean(i)) / sqrt(variance(i) + eps) y(:,i) = g(:) * y(:,i) + b(:) end do end function function linear(x, w, b) result(y) real(sp), intent(in) :: x(:,:), w(:,:), b(:) real(sp) :: y(size(b,1),size(x,2)) integer :: i !y = matmul(w, x) + spread(b, 2, size(x,2)) !y = matmul(w, x) call matmul_2d(w, x, y) do i = 1, size(y,2) y(:,i) = y(:,i) + b(:) end do end function function ffn(x, fc_w, fc_b, proj_w, proj_b) result(y) real(sp), intent(in) :: x(:,:), fc_w(:,:), fc_b(:), proj_w(:,:), proj_b(:) real(sp) :: y(size(x,1),size(x,2)) !real(sp) :: a(4*size(x,1),size(x,2)) !a = gelu(linear(x, fc_w, fc_b)) y = linear(gelu(linear(x, fc_w, fc_b)), proj_w, proj_b) end function function attention(n_embd_head,n_seq,n_seq_x, q, k, v, mask) result(y) integer, intent(in) :: n_embd_head, n_seq, n_seq_x real(sp), intent(in) :: q(n_embd_head,n_seq_x), k(n_embd_head,n_seq), v(n_embd_head,n_seq), mask(n_seq,n_seq_x) real(sp) :: y(n_embd_head,n_seq_x) real(sp) :: tmp(n_seq,n_seq_x) !tmp = matmul(transpose(k), q) !call matmul_2d(transpose(k), q, tmp) call matmul_2d_t(k, q, tmp) call matmul_2d(v, softmax(tmp / sqrt(real(n_embd_head,sp)) + mask), y) end function function mha(n_seq, n_seq_x, n_embd, x, attn_w, attn_b, proj_w, proj_b, n_head, & use_kv_cache, kv_cache) & result(y) integer, intent(in) :: n_seq, n_seq_x, n_embd real(sp), intent(in) :: x(n_embd,n_seq_x), & attn_w(3*n_embd,n_embd), attn_b(3*n_embd), & proj_w(n_embd,n_embd), proj_b(n_embd) real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2) integer, intent(in) :: n_head logical, intent(in) :: use_kv_cache real(sp) :: y(n_embd,n_seq_x) real(sp) :: causal_mask(n_seq,n_seq_x) real(sp) :: x2(3*n_embd,n_seq_x) real(sp) :: q(n_embd/n_head,n_seq_x), k(n_embd/n_head,n_seq), v(n_embd/n_head,n_seq) real(sp) :: yy(n_embd/n_head,n_seq_x) integer :: i, j, l ! Mask if (use_kv_cache) then causal_mask = 0 else do j = 1, n_seq do i = 1, n_seq if (i > j) then causal_mask(i,j) = -1e10_sp else causal_mask(i,j) = 0 end if end do end do end if x2 = linear(x, attn_w, attn_b) if (use_kv_cache) then do j = 1, n_embd kv_cache(j,n_seq,1) = x2((2-1)*n_embd+j,1) kv_cache(j,n_seq,2) = x2((3-1)*n_embd+j,1) end do else do i = 1, n_seq do j = 1, n_embd kv_cache(j,i,1) = x2((2-1)*n_embd+j,i) kv_cache(j,i,2) = x2((3-1)*n_embd+j,i) end do end do end if ! Perform attention over each head do l = 1, n_head do i = 1, n_seq_x do j = 1, n_embd/n_head q(j,i) = x2((l-1)*n_embd/n_head+j,i) end do end do do i = 1, n_seq do j = 1, n_embd/n_head k(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,1) v(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,2) end do end do yy = attention(n_embd/n_head, n_seq, n_seq_x, q, k, v, causal_mask) do i = 1, n_seq_x do j = 1, n_embd/n_head y((l-1)*n_embd/n_head+j,i) = yy(j,i) end do end do end do ! 
Out projection y = linear(y, proj_w, proj_b) end function function transformer_block(n_seq, n_seq_x, n_embd, x, mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, & attn_w, attn_b, attn_proj_w, attn_proj_b, ln1_g, ln1_b, ln2_g, ln2_b, & n_head, use_kv_cache, kv_cache) result(y) real(sp), intent(in) :: x(n_embd,n_seq_x), & mlp_fc_w(:,:), mlp_fc_b(:), & mlp_proj_w(:,:), mlp_proj_b(:), & attn_w(:,:), attn_b(:), attn_proj_w(:,:), attn_proj_b(:), & ln1_g(:), ln1_b(:), ln2_g(:), ln2_b(:) integer, intent(in) :: n_head integer, intent(in) :: n_seq, n_seq_x, n_embd real(sp) :: y(n_embd,n_seq_x) logical, intent(in) :: use_kv_cache real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2) y = x + mha(n_seq, n_seq_x, n_embd, layer_norm(x, ln1_g, ln1_b, 1e-5_sp), & attn_w, attn_b, attn_proj_w, attn_proj_b, n_head, use_kv_cache, kv_cache) y = y + ffn(layer_norm(y, ln2_g, ln2_b, 1e-5_sp), & mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b) end function function gpt2(n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head, input, & wte, wpe, & mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, & attn_w, attn_b, attn_proj_w, attn_proj_b, & ln1_g, ln1_b, ln2_g, ln2_b, lnf_g, lnf_b, & use_kv_cache, kv_cache) result(y) integer, intent(in) :: n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head integer, intent(in) :: input(n_seq) real(sp), intent(in) :: wte(n_embd,n_vocab), wpe(n_embd,n_ctx), & mlp_fc_w(4*n_embd,n_embd,n_layer), mlp_fc_b(4*n_embd,n_layer), & mlp_proj_w(n_embd,4*n_embd,n_layer), mlp_proj_b(n_embd,n_layer), & attn_w(3*n_embd,n_embd,n_layer), attn_b(3*n_embd,n_layer), & attn_proj_w(n_embd,n_embd,n_layer), attn_proj_b(n_embd,n_layer), & ln1_b(n_embd,n_layer), ln1_g(n_embd,n_layer), & ln2_b(n_embd,n_layer), ln2_g(n_embd,n_layer), & lnf_b(n_embd), lnf_g(n_embd) logical, intent(in) :: use_kv_cache real(sp), intent(inout) :: kv_cache(n_embd,n_seq,2,n_layer) real(sp) :: y(n_vocab,n_seq_x) real(sp) :: x(n_embd,n_seq_x) integer :: i if (use_kv_cache) then i = n_seq x(:,1) = wte(:,input(i)+1) + wpe(:,i) else do i = 1, n_seq x(:,i) = wte(:,input(i)+1) + wpe(:,i) end do end if do i = 1, n_layer x = transformer_block(n_seq, n_seq_x, n_embd, x, & mlp_fc_w(:,:,i), mlp_fc_b(:,i), & mlp_proj_w(:,:,i), mlp_proj_b(:,i), & attn_w(:,:,i), attn_b(:,i), attn_proj_w(:,:,i), attn_proj_b(:,i), & ln1_g(:,i), ln1_b(:,i), ln2_g(:,i), ln2_b(:,i), & n_head, use_kv_cache, kv_cache(:,:,:,i)) end do x = layer_norm(x, lnf_g, lnf_b, 1e-5) !y = matmul(transpose(wte), x) call matmul_2d_t(wte, x, y) end function subroutine generate(output, n_tokens_to_generate, m, & n_seq, input, & use_cache, & byte_decoder, stop_text) integer, intent(in) :: n_seq, n_tokens_to_generate type(model_t), intent(in) :: m integer, intent(in) :: input(n_seq) logical, intent(in) :: use_cache integer, intent(in) :: byte_decoder(:) character(*), intent(in), optional :: stop_text ! Stop if you see this text integer, allocatable, intent(out) :: output(:) real(sp), allocatable :: logits(:,:) integer :: i integer :: n_seq2, n_seq_x integer :: next_id integer :: input2(size(input)+n_tokens_to_generate) logical :: use_kv_cache real(sp) :: kv_cache(m%n_embd,n_seq+n_tokens_to_generate,2,m%n_layer) real(sp), allocatable :: kv_cache2(:,:,:,:) character(:), allocatable :: output_txt, last_token if (present(stop_text)) then allocate(character(0) :: output_txt) output_txt = "" end if input2(:n_seq) = input do i = 1, n_tokens_to_generate if (use_cache) then use_kv_cache = (i > 1) ! Use cache for subsequent tokens else use_kv_cache = .false. 
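            ! Note: with caching enabled, use_kv_cache is false only on the first
            ! iteration, which runs the whole prompt through the model and fills
            ! the KV cache; every later iteration feeds just the newest token and
            ! reuses the cached K and V values.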
end if
n_seq2 = n_seq+i-1
if (use_kv_cache) then
    n_seq_x = 1
else
    n_seq_x = n_seq2
end if
allocate(kv_cache2(m%n_embd,n_seq2,2,m%n_layer))
kv_cache2(:,:,:,:) = kv_cache(:,:n_seq2,:,:)
allocate(logits(m%n_vocab, n_seq_x))
logits = gpt2(m%n_vocab, m%n_ctx, n_seq2, n_seq_x, m%n_embd, m%n_layer, &
    m%n_head, &
    input2(:n_seq2), &
    m%wte, m%wpe, &
    m%mlp_fc_w, m%mlp_fc_b, m%mlp_proj_w, m%mlp_proj_b, &
    m%attn_w, m%attn_b, m%attn_proj_w, m%attn_proj_b, &
    m%ln1_g, m%ln1_b, m%ln2_g, m%ln2_b, m%lnf_g, m%lnf_b, use_kv_cache,&
    kv_cache2)
kv_cache(:,:n_seq2,:,:) = kv_cache2(:,:,:,:)
deallocate(kv_cache2)
next_id = maxloc(logits(:,n_seq_x), dim=1)-1
input2(n_seq2+1) = next_id
last_token = decode([next_id], m%decoder_idx, &
    m%decoder_txt, byte_decoder)
write(*, fmt="(a)", advance="no") last_token
if (present(stop_text)) then
    output_txt = output_txt // last_token
    if (output_txt(len(output_txt)-len(stop_text)+1:len(output_txt)) == stop_text) then
        exit
    end if
end if
deallocate(logits)
end do
allocate(output(n_seq2 - n_seq + 1))
output(:) = input2(n_seq+1:n_seq2+1)
end subroutine

end module

================================================
FILE: input
================================================
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine

================================================
FILE: linalg_accelerate.c
================================================
/*
This file provides a matmul implementation using the macOS Accelerate
Framework, which seems to be the most optimized matrix matrix multiplication
on macOS.
*/
#include <Accelerate/Accelerate.h>

void acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {
    //A[m][k]
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
        m, n, k, 1.0, A, m, B, k, 0.0, C, m);
}

void acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {
    //A[k][m] (to be transposed)
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
        m, n, k, 1.0, A, k, B, k, 0.0, C, m);
}

================================================
FILE: linalg_c.f90
================================================
module linalg
! C implementation of the matmul routines
use iso_c_binding, only: c_int, c_float
implicit none
integer, parameter :: sp = kind(0.0)

interface

    subroutine acc_sgemm(m, n, k, A, B, C) bind(c)
    import :: c_int, c_float
    implicit none
    integer(c_int), value, intent(in) :: m, n, k
    real(c_float), intent(in) :: A(m,k), B(k,n)
    real(c_float), intent(out) :: C(m,n)
    end subroutine

    subroutine acc_sgemm_t(m, n, k, A, B, C) bind(c)
    import :: c_int, c_float
    implicit none
    integer(c_int), value, intent(in) :: m, n, k
    real(c_float), intent(in) :: A(k,m), B(k,n)
    real(c_float), intent(out) :: C(m,n)
    end subroutine

end interface

contains

    subroutine matmul_2d(A, B, C)
    ! C = matmul(A, B)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    call acc_sgemm(size(A,1), size(B,2), size(A,2), A, B, C)
    end subroutine

    subroutine matmul_2d_t(A, B, C)
    ! C = matmul(transpose(A), B)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    call acc_sgemm_t(size(A,2), size(B,2), size(A,1), A, B, C)
    end subroutine

end module

================================================
FILE: linalg_f.f90
================================================
module linalg
! Pure Fortran implementation of the matmul routines
implicit none
integer, parameter :: sp = kind(0.0)

contains

    subroutine matmul_2d(A, B, C)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    C = matmul(A, B)
    end subroutine

    subroutine matmul_2d_t(A, B, C)
    real(sp), intent(in) :: A(:,:), B(:,:)
    real(sp), intent(out) :: C(:,:)
    C = matmul(transpose(A), B)
    end subroutine

end module

================================================
FILE: linalg_openblas.c
================================================
/*
This file provides a matmul implementation using OpenBLAS.
*/
#include <cblas.h>

void acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {
    //A[m][k]
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
        m, n, k, 1.0, A, m, B, k, 0.0, C, m);
}

void acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {
    //A[k][m] (to be transposed)
    //B[k][n]
    //C[m][n]
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
        m, n, k, 1.0, A, k, B, k, 0.0, C, m);
}

================================================
FILE: main.f90
================================================
program gpt2
use driver, only: gpt2_driver, model_t
implicit none
integer, allocatable :: input(:), output(:)
type(model_t) :: m
call gpt2_driver(input, output, m)
end program

================================================
FILE: omp.f90
================================================
module omp
implicit none
private
public :: omp_get_wtime

integer, parameter :: dp = kind(0.d0)

interface
    real(dp) function omp_get_wtime()
    import :: dp
    end function
end interface

end module

================================================
FILE: omp_dummy.f90
================================================
module omp
implicit none
private
public :: omp_get_wtime

integer, parameter :: dp = kind(0.d0)

contains

    real(dp) function omp_get_wtime()
    omp_get_wtime = 0
    end function

end module

================================================
FILE: pt.py
================================================
from time import monotonic as clock
import os; os.environ["OMP_NUM_THREADS"] = "1"

print("Importing")
t1 = clock()
from transformers import pipeline
t2 = clock()
print(" Time: ", t2-t1)

print("Loading")
t1 = clock()
generator = pipeline('text-generation', model='gpt2')
t2 = clock()
print(" Time: ", t2-t1)

text = "Alan Turing theorized that computers would one day become very powerful, but even he could not imagine"
print("Generating")
t1 = clock()
g = generator(text, do_sample=False, max_new_tokens=20, use_cache=True)
t2 = clock()
print(" Time: ", t2-t1)
output = g[0]["generated_text"]
print(output)

================================================
FILE: tests/test_basic_input.f90
================================================
program test_basic_input
use driver, only: gpt2_driver, model_t
implicit none

type(model_t) :: m
integer, parameter :: input_ref(*) = [36235, 39141, 18765, 1143, 326, 9061, &
    561, 530, 1110, 1716, 845, 3665, 11, 475, 772, 339, 714, 407, 5967]
integer, parameter :: output_ref(*) = [703, 484, 561, 307, 1498, 284, 466, &
    523, 13, 198, 198, 1, 40, 892, 326, 262, 749, 1593, 1517, 318]
integer, allocatable :: input(:), output(:)

call gpt2_driver(input, output, m)

print *
print *, "TESTS:"
if (all(input == input_ref)) then
    print *, "Input tokens agree with reference results"
else
    print *, "Input tokens DO NOT agree with reference results"
    error stop
end if
if (all(output == output_ref)) then
    print *, "Output tokens agree with reference results"
else
    print *, "Output tokens DO NOT agree with reference results"
    error

================================================
FILE: tests/test_chat.f90
================================================
program test_chat
use driver, only: chat
use tokenizer, only: string
implicit none
type(string), allocatable :: inputs(:)
inputs = [ &
    string("What color does the sky have?"), &
    string("What can you type a document on?"), &
    string("What can you drive in?"), &
    string("What can you fly in?"), &
    string("What continent is Germany in?"), &
    string("When did Second World War start?"), &
    string("When did it end?"), &
    string("When did the U.S. enter the Second World War?"), &
    string("When did the First World War start?"), &
    string("When did it end?"), &
    string("When did the Mexican-American war start?"), &
    string("When did it end?"), &
    string("What color is snow?"), &
    string("What color do plants usually have?") &
    ]
! Only the first three questions are used in the test:
call chat(inputs(:3))
end program


================================================
FILE: tests/test_more_inputs.f90
================================================
program test_more_inputs
use driver, only: gpt2_driver2, model_t, load_model
implicit none
type(model_t) :: m
integer, parameter :: input_ref(*) = [46, 358, 129, 247, 68, 73, 34754, 234, &
    861, 8836, 74, 373, 4642, 287]
integer, parameter :: output_ref(*) = [1248, 5332, 287, 262, 7404, 286, &
    25370, 254, 368, 83, 6557, 81, 11]
integer, allocatable :: input(:), output(:)
call load_model("model.gguf", m)

call gpt2_driver2("Ondřej Čertík was born in", 13, m, input, output)
print *
print *, "TESTS:"
call test(input, input_ref, "Input")
call test(output, output_ref, "Output")

call gpt2_driver2("San Francisco is", 8, m, input, output)
print *
print *, "TESTS:"
call test(input, [15017, 6033, 318], "Input")
call test(output, [257, 1748, 286, 517, 621, 352, 1510, 661], "Output")

call gpt2_driver2("Cars are", 13, m, input, output)
print *
print *, "TESTS:"
call test(input, [34, 945, 389], "Input")
call test(output, [407, 3142, 284, 307, 973, 287, 262, 7647, 1256, 286, &
    257, 7072, 13], "Output")

contains

subroutine test(a, a_ref, text)
integer, intent(in) :: a(:), a_ref(:)
character(*), intent(in) :: text
if (all(a == a_ref)) then
    print *, text, ": OK"
else
    print *, text, ": FAIL"
    error stop
end if
end subroutine

end program
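The tokenizer.f90 module that follows exposes next_token, which walks the
input string greedily via an inout cursor, splitting on spaces, commas and
periods. A minimal usage sketch (a hypothetical demo program, not part of the
repository):

    program demo_next_token
    use tokenizer, only: next_token
    implicit none
    character(:), allocatable :: tok
    integer :: i
    i = 1
    do
        tok = next_token("Alan Turing theorized, briefly.", i)
        if (tok == "") exit
        ! Prints 'Alan', ' Turing', ' theorized', ',', ' briefly', '.'
        print *, "'" // tok // "'"
    end do
    end program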

================================================
FILE: tokenizer.f90
================================================
module tokenizer
implicit none

type :: string
    character(:), allocatable :: s
end type

contains

function c2s(x) result(y)
! Converts a character array to a character string
character, intent(in) :: x(:)
character(:), allocatable :: y
integer :: i
allocate(character(size(x)) :: y)
do i = 1, size(x)
    y(i:i) = x(i)
end do
end function

function next_token(input, i) result(y)
! TODO: tokenize exactly according to this regex:
! re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
! Right now we are more greedy, but the bpe() tokenizer seems to still return
! exactly the same tokens for most inputs (it is not clear if for all inputs).
character(*), intent(in) :: input
integer, intent(inout) :: i
character(:), allocatable :: y
if (i > len(input)) then
    y = ""
else if (input(i:i) == " ") then
    y = tokenize_word(input, i)
else if (input(i:i) == "," .or. input(i:i) == ".") then
    y = input(i:i)
    i = i + 1
else
    y = tokenize_word(input, i)
end if
end function

function tokenize_word(input, i) result(y)
character(*), intent(in) :: input
integer, intent(inout) :: i
character(:), allocatable :: y
integer :: i0
i0 = i
if (input(i:i) == " ") then
    i = i + 1
end if
do
    if (i > len(input)) then
        y = input(i0:i-1)
        exit
    end if
    if (input(i:i) == " " .or. input(i:i) == "," .or. input(i:i) == ".") then
        y = input(i0:i-1)
        exit
    end if
    i = i + 1
end do
end function

function word_idx(word, idx, decoder_txt) result(token)
character(*), intent(in) :: word
integer, intent(in) :: idx(0:)
character, intent(in) :: decoder_txt(:)
integer :: token
integer :: i
! This is O(n) search instead of O(1) lookup in a dictionary, so it is slow
do i = 0, ubound(idx,1)-1
    if (c2s(decoder_txt(idx(i)+1:idx(i+1))) == word) then
        token = i
        return
    end if
end do
token = -1
end function

subroutine codepoint_to_utf8(s, c)
! UTF-32 -> UTF-8
character(:), allocatable, intent(inout) :: s
integer, intent(in) :: c
integer :: d1, d2
if (c < 128) then
    s = s // achar(c)
else if (c < 2048) then
    d1 = ior(ishft(c, -6), 192)
    d2 = iand(ior(c, 128), 191)
    s = s // achar(d1) // achar(d2)
else
    error stop "UTF-32 range not supported"
end if
end subroutine

function utf8_to_codepoint(s, i) result(c)
! UTF-8 -> UTF-32
character(*), intent(in) :: s
integer, intent(inout) :: i
integer :: c, d
c = iachar(s(i:i))
if (c >= 128) then
    i = i + 1
    d = iachar(s(i:i))
    c = ior(ishft(iand(c, 31), 6), iand(d, 63))
end if
if (c >= 2048) then
    error stop "UTF-8 range not supported"
end if
end function

function merge_pair(intokens, idx) result(tokens)
! Merge the pair `idx`
type(string), intent(in) :: intokens(:)
integer, intent(in) :: idx
type(string), allocatable :: tokens(:)
allocate(tokens(size(intokens)-1))
tokens(:idx-1) = intokens(:idx-1)
tokens(idx)%s = intokens(idx)%s // intokens(idx+1)%s
tokens(idx+1:) = intokens(idx+2:)
end function

function merge_utf8_pairs(intokens) result(tokens)
! Merge all UTF-8 character pairs
type(string), intent(in) :: intokens(:)
type(string), allocatable :: tokens(:), tmp_tokens(:)
integer :: i, j
logical :: one_more_pass
allocate(tokens(size(intokens)))
tokens = intokens
one_more_pass = .true.
j = 1
do while (one_more_pass)
    one_more_pass = .false.
    do i = j, size(tokens)-1
        if (len(tokens(i)%s) == 1 .and. iachar(tokens(i)%s(1:1)) >= 128) then
            tmp_tokens = merge_pair(tokens, i)
            deallocate(tokens)
            call move_alloc(tmp_tokens, tokens)
            one_more_pass = .true.
            j = i + 1
            exit
        end if
    end do
end do
!print *, "tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
end function

function bpe(token, vocab_idx, vocab_txt) result(tokens)
! Takes a token as a string, and returns bpe tokens as an array of strings
character(*), intent(in) :: token
integer, intent(in) :: vocab_idx(0:)
character, intent(in) :: vocab_txt(:)
type(string), allocatable :: tokens(:), tmp_tokens(:)
integer, allocatable :: pair_scores(:)
integer :: not_found, merge_pair_idx
integer :: i
not_found = size(vocab_idx) + 10
allocate(tokens(len(token)))
do i = 1, len(token)
    tokens(i)%s = token(i:i)
end do
tmp_tokens = merge_utf8_pairs(tokens)
deallocate(tokens)
call move_alloc(tmp_tokens, tokens)
do
    !print *, "tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
    if (size(tokens) == 1) then
        ! The token pairs were either all merged into one word, or the input
        ! token was a one character word; either way we are done:
        exit
    end if
    allocate(pair_scores(size(tokens)-1))
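    ! Pair scoring: each adjacent pair is looked up in the BPE vocabulary and
    ! scored by its index there; a lower index means higher merge priority.
    ! For example, for tokens ["h", "e", "llo"] the candidate pairs are
    ! "h e" and "e llo", and whichever occurs earlier in vocab is merged first.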
    ! Loop over pairs
    do i = 1, size(tokens)-1
        pair_scores(i) = word_idx(tokens(i)%s // " " // tokens(i+1)%s, vocab_idx, vocab_txt)
        if (pair_scores(i) == -1) pair_scores(i) = not_found
    end do
    merge_pair_idx = minloc(pair_scores, 1)
    if (pair_scores(merge_pair_idx) == not_found) then
        ! No token pair can be merged, so we are done:
        exit
    end if
    !print *, pair_scores
    !print *, merge_pair_idx, pair_scores(merge_pair_idx)
    tmp_tokens = merge_pair(tokens, merge_pair_idx)
    deallocate(tokens)
    call move_alloc(tmp_tokens, tokens)
    deallocate(pair_scores)
end do
!print *, "final tokens = ", (tokens(i)%s // " ", i=1,size(tokens))
end function

function encode(input, idx, decoder_txt, vocab_idx, vocab_txt, byte_encoder) &
        result(tokens2)
character(*), intent(in) :: input
integer, intent(in) :: idx(0:), vocab_idx(0:), byte_encoder(0:)
character, intent(in) :: decoder_txt(:), vocab_txt(:)
integer, parameter :: max_tokens = 2048
integer :: tokens(max_tokens)
integer, allocatable :: tokens2(:)
character(:), allocatable :: tmp, tmp2
type(string), allocatable :: bpe_tokens(:)
integer :: i, j, c, n_tokens
n_tokens = 0
i = 1
do
    tmp = next_token(input, i)
    if (tmp == "") exit
    tmp2 = ""
    do j = 1, len(tmp)
        c = iachar(tmp(j:j))
        c = byte_encoder(c)
        ! c is UTF-32 (4 bytes), but only the range [0, 324] is used.
        ! Encode c from UTF-32 to UTF-8. Due to the limited range,
        ! either one or two bytes of UTF-8 are appended to tmp2:
        call codepoint_to_utf8(tmp2, c)
    end do
    if (allocated(bpe_tokens)) deallocate(bpe_tokens)
    bpe_tokens = bpe(tmp2, vocab_idx, vocab_txt)
    do j = 1, size(bpe_tokens)
        n_tokens = n_tokens + 1
        if (n_tokens > max_tokens) error stop "exceeded max_tokens"
        tokens(n_tokens) = word_idx(bpe_tokens(j)%s, idx, decoder_txt)
    end do
    deallocate(tmp2)
end do
allocate(tokens2(n_tokens))
tokens2(:) = tokens(:n_tokens)
end function

function decode(tokens, idx, decoder_txt, byte_decoder) result(output)
integer, intent(in) :: tokens(:), idx(0:), byte_decoder(0:)
character, intent(in) :: decoder_txt(:)
character(:), allocatable :: output
character(:), allocatable :: output2, tmp
integer :: i, c
allocate(character(0) :: output2) ! Fix GFortran warning
output2 = ""
do i = 1, size(tokens)
    if (tokens(i) < 0) error stop "tokens(i) < 0"
    output2 = output2 // c2s(decoder_txt(idx(tokens(i))+1:idx(tokens(i)+1)))
end do
i = 1
output = ""
do
    ! Decode UTF-8 (one or more bytes) to a UTF-32 code point (always 4 bytes).
    ! However, for GPT-2 it seems only the range 0-323 of UTF-32 is used.
    c = utf8_to_codepoint(output2, i)
    ! [0,324] -> [0,255]
    if (c < 0 .or. c > ubound(byte_decoder,1)) then
        print *, "Codepoint out of range for byte decoder:", c, ubound(byte_decoder,1)
        error stop
    end if
    tmp = achar(byte_decoder(c))
    output = output // tmp
    if (i == len(output2)) exit
    i = i + 1
end do
end function

end module
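Since tokenizer.f90 declares nothing private, helpers such as the two UTF
conversion routines are accessible from outside the module. A minimal
round-trip sketch (a hypothetical demo program, not part of the repository)
illustrating the one- or two-byte encoding that encode/decode rely on:

    program demo_utf8_roundtrip
    use tokenizer, only: codepoint_to_utf8, utf8_to_codepoint
    implicit none
    character(:), allocatable :: s
    integer :: i, c
    s = ""
    call codepoint_to_utf8(s, 322)  ! codepoint 322 encodes to the two bytes C5 82
    i = 1
    c = utf8_to_codepoint(s, i)
    print *, len(s), c              ! prints 2 and 322
    end program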