[
  {
    "path": ".github/workflows/CI.yml",
    "content": "name: CI\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - main\n\n\njobs:\n\n  gfortran:\n    name: GFortran (${{ matrix.os }})\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [\"macos-latest\", \"ubuntu-latest\"]\n\n    steps:\n      - uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n\n      - uses: mamba-org/setup-micromamba@v2.0.2\n        with:\n          micromamba-version: '2.0.4-0'\n          environment-file: environment.yml\n          create-args: >-\n            ${{ matrix.os == 'macos-latest' && 'gfortran=14.2.0' || '' }}\n\n      - name: Install GGUF\n        shell: bash -e -x -l {0}\n        run: |\n            git clone https://github.com/ggerganov/llama.cpp\n            cd llama.cpp\n            git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc\n            cd gguf-py\n            pip install .\n            cd ../..\n\n      - name: Build and run\n        shell: bash -l {0}\n        run: |\n            ci/build.sh\n\n  lfortran:\n    name: LFortran (${{ matrix.os }})\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [\"macos-latest\", \"ubuntu-latest\"]\n\n    steps:\n      - uses: actions/checkout@v3\n        with:\n          fetch-depth: 0\n\n      - uses: mamba-org/setup-micromamba@v2.0.2\n        with:\n          micromamba-version: '2.0.4-0'\n          environment-file: environment.yml\n          create-args: >-\n            lfortran=0.60.0\n            ${{ matrix.os == 'ubuntu-latest' && 'llvm-openmp=11.1.0' || '' }}\n\n      - name: Build and run\n        shell: bash -l {0}\n        run: |\n            ci/build_lfortran.sh\n"
  },
  {
    "path": ".gitignore",
    "content": "gpt2/\nencoder.json\ninput.dat\nmodel.dat\nvocab.bpe\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.13 FATAL_ERROR)\n\nset(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_SOURCE_DIR}/cmake/UserOverride.cmake)\n\nif (NOT CMAKE_BUILD_TYPE)\n    set(CMAKE_BUILD_TYPE Release\n        CACHE STRING \"Build type (Debug, Release)\")\nendif()\n\nproject(fastGPT)\nenable_language(Fortran)\n\nset(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)\nset(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/mod_files)\n\n# Make sure that CMAKE_BUILD_TYPE is either Debug or Release:\nif (NOT CMAKE_BUILD_TYPE MATCHES \"Debug|Release\")\n    message(FATAL_ERROR \"CMAKE_BUILD_TYPE must be one of: Debug, Release (current value: '${CMAKE_BUILD_TYPE}')\")\nendif ()\n\nif (APPLE)\n    set(DEFAULT_FASTGPT_BLAS \"Accelerate\")\nelse()\n    set(DEFAULT_FASTGPT_BLAS \"Fortran\")\nendif()\nset(FASTGPT_BLAS ${DEFAULT_FASTGPT_BLAS}\n    CACHE STRING \"The BLAS library that fastGPT should use\")\nif (NOT FASTGPT_BLAS MATCHES \"Accelerate|OpenBLAS|Fortran\")\n    message(FATAL_ERROR \"FASTGPT_BLAS must be one of: OpenBLAS, Accelerate, Fortran (current value: '${FASTGPT_BLAS}')\")\nendif ()\nif (FASTGPT_BLAS STREQUAL \"Accelerate\")\n    find_package(OMP)\n    if(NOT OMP_FOUND)\n        find_package(OpenMP REQUIRED COMPONENTS Fortran)\n    endif()\nelseif (FASTGPT_BLAS STREQUAL \"OpenBLAS\")\n    find_package(OPENBLAS REQUIRED)\n    find_package(OMP)\n    if(NOT OMP_FOUND)\n        find_package(OpenMP REQUIRED COMPONENTS Fortran)\n    endif()\nelse()\n    # pass\nendif()\n\nenable_testing()\n\nset(SRC\n    gpt2.f90\n    tokenizer.f90\n    driver.f90\n    )\nif (FASTGPT_BLAS STREQUAL \"Accelerate\")\n    list(APPEND SRC\n        linalg_accelerate.c\n        linalg_c.f90\n        omp.f90\n    )\nelseif (FASTGPT_BLAS STREQUAL \"OpenBLAS\")\n    list(APPEND SRC\n        linalg_openblas.c\n        linalg_c.f90\n        omp.f90\n    )\nelse()\n    list(APPEND SRC\n        linalg_f.f90\n        omp_dummy.f90\n    )\nendif()\nadd_library(fastgpt ${SRC})\nif (FASTGPT_BLAS STREQUAL \"Accelerate\")\n    target_link_options(fastgpt PUBLIC -framework accelerate)\n    target_link_libraries(fastgpt PUBLIC \"$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>\")\nelseif (FASTGPT_BLAS STREQUAL \"OpenBLAS\")\n    target_link_libraries(fastgpt p::openblas\n        \"$<IF:$<BOOL:${OMP_FOUND}>,p::omp,OpenMP::OpenMP_Fortran>\")\nendif()\n\nadd_executable(gpt2 main.f90)\ntarget_link_libraries(gpt2 fastgpt)\n\nadd_executable(chat chat.f90)\ntarget_link_libraries(chat fastgpt)\n\nadd_executable(test_basic_input tests/test_basic_input.f90)\ntarget_link_libraries(test_basic_input fastgpt)\nadd_test(test_basic_input ${PROJECT_BINARY_DIR}/test_basic_input)\n\nadd_executable(test_more_inputs tests/test_more_inputs.f90)\ntarget_link_libraries(test_more_inputs fastgpt)\nadd_test(test_more_inputs ${PROJECT_BINARY_DIR}/test_more_inputs)\n\nadd_executable(test_chat tests/test_chat.f90)\ntarget_link_libraries(test_chat fastgpt)\nadd_test(test_chat ${PROJECT_BINARY_DIR}/test_chat)\n\nif(NOT PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)\n    # Git auto-ignore out-of-source build directory\n    file(GENERATE OUTPUT .gitignore CONTENT \"*\")\nendif()\n\nmessage(\"\\n\")\nmessage(\"Configuration results\")\nmessage(\"---------------------\")\nmessage(\"Fortran compiler: ${CMAKE_Fortran_COMPILER}\")\nmessage(\"Build type: ${CMAKE_BUILD_TYPE}\")\nif (CMAKE_BUILD_TYPE STREQUAL \"Debug\")\n    message(\"Fortran compiler flags: ${CMAKE_Fortran_FLAGS_DEBUG}\")\nelse ()\n    message(\"Fortran compiler flags: 
${CMAKE_Fortran_FLAGS_RELEASE}\")\nendif ()\nmessage(\"Installation prefix: ${CMAKE_INSTALL_PREFIX}\")\nmessage(\"FASTGPT_BLAS: ${FASTGPT_BLAS}\")\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright 2023 Ondřej Čertík\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# fastGPT\n\nThe progression of GPT-2 codes from the original to \"minimal\", \"nano\" and\n\"pico\":\n\n* [openai/gpt-2](https://github.com/openai/gpt-2)\n* [karpathy/minGPT](https://github.com/karpathy/mingpt)\n* [karpathy/nanoGPT](https://github.com/karpathy/nanogpt)\n* [jaymody/picoGPT](https://github.com/jaymody/picoGPT)\n\n`fastGPT` is very similar to `picoGPT` (very small and readable), but it is\nalso fast (see the Benchmarks section below). The speed and readability is\nachieved by using Fortran. I wrote a\n[blog post](https://ondrejcertik.com/blog/2023/03/fastgpt-faster-than-pytorch-in-300-lines-of-fortran/)\nintroducing fastGPT.\n\n`fastGPT` features:\n* Fast? ✅\n* Training code? ❌\n* Batch inference? ❌\n* top-p sampling? ❌ top-k? ❌ temperature? ❌ categorical sampling?! ❌ greedy? ✅\n* Readable? ✅\n* Small? ✅\n\nA quick breakdown of each of the files:\n\n* `gpt2.f90`: the actual GPT-2 model and a decoder\n* `main.f90`: the main driver\n* `create_model.py`: downloads the TensorFlow model and converts to the GGUF\n  format (`model.gguf`)\n* `encode_input.py`: encodes the text input into tokens (input file for `gpt2`)\n* Matmul implementations\n    * `linalg_f.f90` native Fortran\n    * `linalg_c.f90`, `linalg_accelerate.c` macOS Accelerate Framework\n* `pt.py`: a reference script to run PyTorch (returns the same answer)\n\n## Getting Started\n\n### Install prerequisites:\n```bash\n    mamba env create -f environment.yml\n    conda activate fastgpt\n```\n\n### Configure and build\n#### Fortran Package Manager (fpm)\n```bash\n    fpm build\n```\n\n#### CMake \n```bash\n    FC=gfortran cmake .\n    make\n```\n\n### Download the GPT2 model weights\n\n    curl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf\n\nYou can also download 355M for the `gpt-medium` model.\n\nNow you can modify the `input` file to change the input string and set other\nparameters.\n\n### Run \n(requires `model.gguf` and `input` in the current directory)\n\nIf you built with `cmake`, execute\n```bash\n    ./gpt2\n```\nAlternatively, if you built with `fpm`, execute\n```bash\n    fpm run chatgpt2\n```\nto launch an interactive chat session\n```bash\n    fpm run gpt2\n```\nor to launch a session with predetermined prompts.\n\n### Create the GGUF file\n\nCreate the `model.gguf` file from a given GPT-2 model. Supported sizes (and the\ncorresponding names to be used in `pt.py`, and the approximate download size):\n\"124M\" (`gpt2`, 0.5GB), \"355M\" (`gpt-medium`, 1.5GB), \"774M\" (`gpt-large`,\n3GB), \"1558M\" (`gpt-xl`, 6GB). This will download the model and cache it for\nsubsequent runs:\n```python\n    python create_model.py --models_dir \"models\" --model_size \"124M\"\n```\n\nThis script depends on the `gguf` Python library, that you can install using:\n```bash\n    git clone https://github.com/ggerganov/llama.cpp\n    cd llama.cpp\n    git checkout 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc\n    cd gguf-py\n    pip install .\n```\nThe `gguf` library is available in pip and conda, but we currently require the\nlatest version that is not available there yet.\n\nWe used this script to create several GGUF files and uploaded them to:\nhttps://huggingface.co/certik/fastGPT, so that you can just download the\npre-generated files.\n\n### Example Output\n\nThe above `./gpt2` command prints on Apple M1 Max:\n```\n$ ./gpt2\nLoading the model...\n    done. 
Time:   0.111s\n\nModel parameters:\nn_vocab = 50257\nn_ctx   =  1024\nn_embd  =   768\nn_layer =    12\nn_head  =    12\n\nInput text\nAlan Turing theorized that computers would one day become very powerful, but even he could not imagine\n\nEncoding: tokenizing input text into tokens (currently slow)...\n    done. Time:   0.074s\n\nInput parameters:\nn_seq                =  19\nn_tokens_to_generate =  20\n\nInput tokens:\n 36235 39141 18765  1143   326  9061   561   530  1110  1716   845  3665    11   475   772   339   714   407  5967\n\nDecoded input as text:\nAlan Turing theorized that computers would one day become very powerful, but even he could not imagine\n\nRunning model...\n how they would be able to do so.\n\n\"I think that the most important thing is\n    done. Time:   0.304s (1.01x)\n\nOutput tokens:\n   703   484   561   307  1498   284   466   523    13   198   198     1    40   892   326   262   749  1593  1517   318\n\nDecoded output as text:\n how they would be able to do so.\n\n\"I think that the most important thing is\n```\n\n### Chat interface\n\nHere is an example chat using the largest 1558M model:\n\n```\n$ ./chat\nYour name is fastGPT and you are an AI bot. The user will ask you questions and you answer in a nice, truthful, short way.\nUser: What is the capital of Czechia?\nfastGPT: Prague.\nUser: How many legs does a dog have?\nfastGPT: Four.\nUser: What color does the sky have?\nfastGPT: Blue.\nUser: What can you type a document on?\nfastGPT: A typewriter.\nUser: What can you drive in?\nfastGPT: A car.\nUser: What can you fly in?\nfastGPT: A plane.\nUser: What continent is Germany in?\nfastGPT: Europe.\nUser: When did Second World War start?\nfastGPT: 1939.\nUser: When did it end?\nfastGPT: 1945.\nUser: When did the U.S. 
enter the Second World War?\nfastGPT: 1941.\nUser: When did the First World War start?\nfastGPT: 1914.\nUser: When did it end?\nfastGPT: 1918.\nUser: When did the Mexican-American war start?\nfastGPT: 1846.\nUser: When did it end?\nfastGPT: 1848.\nUser: What color is snow?\nfastGPT: White.\nUser: What color do plants usually have?\nfastGPT: Green.\nUser: What is your name?\nfastGPT: fastGPT.\n```\n\n\n### BLAS Implementation\n\nYou can choose which BLAS implementation to use for `matmul` using:\n* `-DFASTGPT_BLAS=OpenBLAS`: Use OpenBLAS\n* `-DFASTGPT_BLAS=Accelerate`: Use the macOS Accelerate Framework\n* `-DFASTGPT_BLAS=Fortran`: Use Fortran's intrinsic `matmul` (the default on non-macOS platforms)\n\n## Benchmarks\n\nOn Apple M1 Max, inference of the above input file (20 tokens):\n\n                                    1 core  2 cores  4 cores  8 cores\n\n    fastGPT (Accelerate, fast_tanh) 0.288s\n\n    fastGPT (Accelerate)            0.299s\n    PyTorch (Accelerate)            0.346s\n\n    fastGPT (OpenBLAS)              0.837s  0.514s    0.341s   0.339s\n    PyTorch (OpenBLAS)              0.873s  0.539s    0.386s   0.392s\n\n    fastGPT (Accelerate, no cache)  0.717s\n    picoGPT (Accelerate, no cache)  0.765s\n    PyTorch (Accelerate, no cache)  0.787s\n\n    fastGPT (OpenBLAS, no cache)    2.343s  1.603s    1.209s   1.018s\n    PyTorch (OpenBLAS, no cache)    2.356s  1.520s    1.104s   0.997s\n    picoGPT (OpenBLAS, no cache)    2.427s  1.645s    1.272s   1.081s\n\nTotal run (includes loading the model and Python imports):\n\n    fastGPT (Accelerate, fast_tanh): 0.401s\n    picoGPT (8 cores):               3.445s\n    PyTorch (OpenBLAS, 4 cores):     4.867s\n\n## TODO\n\n* [ ] Parallelization:\n  * [ ] Over heads: https://github.com/certik/fastGPT/issues/2\n  * [ ] MPI: https://github.com/certik/fastGPT/issues/5\n* [ ] Other sampling methods: https://github.com/certik/fastGPT/issues/8\n* [ ] Batching: https://github.com/certik/fastGPT/issues/7\n* [x] Improve the UI:\n  * [x] Implement the input tokenizer in Fortran: https://github.com/certik/fastGPT/issues/1\n  * [x] Show the words as they are generated: https://github.com/certik/fastGPT/issues/6\n"
  },
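A quick way to sanity-check a downloaded `model.gguf` is to read back the custom `header` tensor that `create_model.py` writes; here is a minimal sketch, assuming the `gguf-py` reader API from the llama.cpp commit pinned in the README (`GGUFReader`, with tensors exposing `.name` and `.data`):

```python
# Hedged sketch: print the model parameters stored in model.gguf's "header"
# tensor (field order taken from create_model.py in this repo).
import gguf

r = gguf.GGUFReader("model.gguf")
header = next(t for t in r.tensors if t.name == "header").data
# header = [model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head, ...]
_, version, n_vocab, n_ctx, n_embd, n_layer, n_head = (int(x) for x in header[:7])
print(f"version={version} n_vocab={n_vocab} n_ctx={n_ctx} "
      f"n_embd={n_embd} n_layer={n_layer} n_head={n_head}")
```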
  {
    "path": "build.sh",
    "content": "#!/bin/bash\n\nset -ex\n\nFC=gfortran cmake -Bbuild\ncmake --build build --parallel\npython create_model.py --models_dir \"../gpt2/models\" --model_size \"124M\"\npython encode_input.py \\\n    \"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine\" \\\n    -n 20\nbuild/gpt2\n"
  },
  {
    "path": "chat.f90",
    "content": "program chatgpt2\nuse driver, only: chat\nimplicit none\ncall chat()\nend program\n"
  },
  {
    "path": "ci/build.sh",
    "content": "#!/bin/bash\n\nset -ex\n\ncmake .\nmake\nmkdir models\npython create_model.py --models_dir \"models\" --model_size \"124M\"\n./gpt2\nctest\n\nmake clean\nrm CMakeCache.txt\ncmake -DFASTGPT_BLAS=OpenBLAS .\nmake\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2\n\nrm model.gguf\ncurl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2\n\nrm gpt2\npython pt.py\n"
  },
  {
    "path": "ci/build_lfortran.sh",
    "content": "#!/bin/bash\n\nset -ex\n\ncurl -o model.gguf -L https://huggingface.co/certik/fastGPT/resolve/main/model_fastgpt_124M_v2.gguf\n\nmkdir lf\ncd lf\nFC=lfortran CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Debug ..\nmake VERBOSE=1\nln -s ../model.gguf .\nln -s ../input .\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs\ncd ..\n\nmkdir lf-fast\ncd lf-fast\nFC=\"lfortran --fast\" CMAKE_PREFIX_PATH=$CONDA_PREFIX cmake -DFASTGPT_BLAS=OpenBLAS -DCMAKE_BUILD_TYPE=Release ..\nmake VERBOSE=1\nln -s ../model.gguf .\nln -s ../input .\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./gpt2\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_basic_input\ntime OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 ./test_more_inputs\ncd ..\n"
  },
  {
    "path": "cmake/FindOMP.cmake",
    "content": "find_path(OMP_INCLUDE_DIR omp.h)\nfind_library(OMP_LIBRARY omp)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(OMP DEFAULT_MSG OMP_INCLUDE_DIR\n    OMP_LIBRARY)\n\nadd_library(p::omp INTERFACE IMPORTED)\nset_property(TARGET p::omp PROPERTY INTERFACE_INCLUDE_DIRECTORIES\n    ${OMP_INCLUDE_DIR})\nset_property(TARGET p::omp PROPERTY INTERFACE_LINK_LIBRARIES\n    ${OMP_LIBRARY})\n"
  },
  {
    "path": "cmake/FindOPENBLAS.cmake",
    "content": "find_path(OPENBLAS_INCLUDE_DIR NAMES cblas.h PATHS /usr/include/openblas)\n\nfind_library(OPENBLAS_LIBRARY NAMES openblas)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_INCLUDE_DIR\n    OPENBLAS_LIBRARY)\n\nadd_library(p::openblas INTERFACE IMPORTED)\nset_property(TARGET p::openblas PROPERTY INTERFACE_INCLUDE_DIRECTORIES\n    ${OPENBLAS_INCLUDE_DIR})\nset_property(TARGET p::openblas PROPERTY INTERFACE_LINK_LIBRARIES\n    ${OPENBLAS_LIBRARY})\n"
  },
  {
    "path": "cmake/UserOverride.cmake",
    "content": "# This overrides the default CMake Debug and Release compiler options.\n# The user can still specify different options by setting the\n# CMAKE_Fortran_FLAGS_[RELEASE,DEBUG] variables (on the command line or in the\n# CMakeList.txt). This files serves as better CMake defaults and should only be\n# modified if the default values are to be changed. Project specific compiler\n# flags should be set in the CMakeList.txt by setting the CMAKE_Fortran_FLAGS_*\n# variables.\nif (CMAKE_Fortran_COMPILER_ID STREQUAL \"GNU\")\n    # gfortran\n    set(common \"-Wall -Wextra -Wimplicit-interface -fPIC\")\n    set(CMAKE_Fortran_FLAGS_RELEASE_INIT \"${common} -O3 -march=native -ffast-math -funroll-loops\")\n    set(CMAKE_Fortran_FLAGS_DEBUG_INIT   \"${common} -g -fcheck=all -fbacktrace\")\nelseif (CMAKE_Fortran_COMPILER_ID MATCHES \"^Intel\")\n    # ifort\n    set(common \"-warn all\")\n    set(CMAKE_Fortran_FLAGS_RELEASE_INIT \"${common} -xHOST -O3 -no-prec-div -static\")\n    set(CMAKE_Fortran_FLAGS_DEBUG_INIT   \"${common} -check all\")\nendif ()\n"
  },
  {
    "path": "comparison/encode_input.py",
    "content": "\"\"\"\nThis script implements the encoding of an input string into tokens.\nIt requires two files in the current directory: encoder.json, vocab.bpe\nIt creates the file input.dat which contains the input tokens and how many\ntokens to generate.\n\nTODO: save the information from encoder.json and vocab.pge into model.dat and\nimplement this encoder in Fortran.\n\nMost of this file were taken from:\n\nhttps://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py\n\nAnd it is licensed under:\n\nModified MIT License\n\nSoftware Copyright (c) 2019 OpenAI\n\nWe don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.\nWe only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and\nassociated documentation files (the \"Software\"), to deal in the Software without restriction,\nincluding without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,\nand/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,\nsubject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included\nin all copies or substantial portions of the Software.\nThe above copyright notice and this permission notice need not be included\nwith content created by the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,\nINCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\nBE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\nTORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE\nOR OTHER DEALINGS IN THE SOFTWARE.\n\"\"\"\n\nimport numpy as np\nimport json\nimport os\nimport regex as re\n\ndef bytes_to_unicode():\n    bs = list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n    cs = bs[:]\n    n = 0\n    for b in range(2**8):\n        if b not in bs:\n            bs.append(b)\n            cs.append(2**8 + n)\n            n += 1\n    cs = [chr(n) for n in cs]\n    return dict(zip(bs, cs))\n\n\ndef get_pairs(word):\n    \"\"\"Return set of symbol pairs in a word.\n    Word is represented as tuple of symbols (symbols being variable-length strings).\n    \"\"\"\n    pairs = set()\n    prev_char = word[0]\n    for char in word[1:]:\n        pairs.add((prev_char, char))\n        prev_char = char\n    return pairs\n\n\nclass Encoder:\n    def __init__(self, encoder, bpe_merges):\n        self.encoder = encoder\n        self.byte_encoder = bytes_to_unicode()\n        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))\n\n        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions\n        self.pat = re.compile(r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\")\n\n    def bpe(self, token):\n        word = tuple(token)\n        pairs = get_pairs(word)\n\n        if not pairs:\n            return token\n\n        while True:\n            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float(\"inf\")))\n            if bigram not in self.bpe_ranks:\n                break\n            
first, second = bigram\n            new_word = []\n            i = 0\n            while i < len(word):\n                try:\n                    j = word.index(first, i) # can trigger ValueError\n                    new_word.extend(word[i:j])\n                    i = j\n                except ValueError:\n                    new_word.extend(word[i:])\n                    break\n\n                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:\n                    new_word.append(first + second)\n                    i += 2\n                else:\n                    new_word.append(word[i])\n                    i += 1\n            new_word = tuple(new_word)\n            word = new_word\n            if len(word) == 1:\n                break\n            else:\n                pairs = get_pairs(word)\n        return word\n\n    def encode(self, text):\n        bpe_tokens = []\n        for token in re.findall(self.pat, text):\n            token = \"\".join(self.byte_encoder[b] for b in token.encode(\"utf-8\"))\n            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token))\n        return bpe_tokens\n\n\ndef get_encoder():\n    with open(\"encoder.json\") as f:\n        encoder = json.load(f)\n    with open(\"vocab.bpe\", encoding=\"utf-8\") as f:\n        bpe_data = f.read()\n    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split(\"\\n\")[1:-1]]\n    return Encoder(encoder=encoder, bpe_merges=bpe_merges)\n\ndef main(prompt: str, n_tokens_to_generate: int = 40):\n    encoder = get_encoder()\n    input_ids = np.array(encoder.encode(prompt), dtype=np.int32)\n\n    print(\"Saving the input into `input.dat`\")\n    g = open(\"input.dat\", \"w\")\n    np.array([len(input_ids), n_tokens_to_generate], dtype=np.int32).tofile(g)\n    input_ids.tofile(g)\n    print(input_ids)\n\nif __name__ == \"__main__\":\n    import fire\n    fire.Fire(main)\n"
  },
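The merge loop in `Encoder.bpe` above repeatedly replaces the lowest-ranked adjacent pair until no ranked pair is left. A self-contained toy run of the same idea, using a made-up two-rule merge table rather than GPT-2's real `vocab.bpe`:

```python
# Toy BPE merge walk-through (hypothetical ranks, not GPT-2's real merges).
def get_pairs(word):
    return {(a, b) for a, b in zip(word, word[1:])}

bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1}
word = ("l", "o", "w")
while len(word) > 1:
    # pick the adjacent pair with the best (lowest) merge rank
    bigram = min(get_pairs(word), key=lambda p: bpe_ranks.get(p, float("inf")))
    if bigram not in bpe_ranks:
        break
    first, second = bigram
    merged, i = [], 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == bigram:
            merged.append(first + second)
            i += 2
        else:
            merged.append(word[i])
            i += 1
    word = tuple(merged)
print(word)  # ('l','o','w') -> ('lo','w') -> ('low',)
```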
  {
    "path": "create_model.py",
    "content": "\"\"\"\nThis script loads the specified GPT-2 model from OpenAI using TensorFlow,\nconverts it into our custom format and saves it to `model.gguf`, which contains\neverything (all the parameters, all the weights, encoding/decoding\ninformation).\n\nParts of this script were taken from the picoGPT project: https://github.com/jaymody/picoGPT\n\nThose are licensed as:\n\nMIT License\n\nCopyright (c) 2023 Jay Mody\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n\"\"\"\n\nfrom time import monotonic as clock\nimport os\nimport json\nimport re\nfrom shutil import copyfile\n\nimport numpy as np\nimport gguf\nimport requests\nimport tensorflow as tf\nfrom tqdm import tqdm\n\ndef download_gpt2_files(model_size, model_dir):\n    assert model_size in [\"124M\", \"355M\", \"774M\", \"1558M\"]\n    for filename in [\n        \"checkpoint\",\n        \"encoder.json\",\n        \"hparams.json\",\n        \"model.ckpt.data-00000-of-00001\",\n        \"model.ckpt.index\",\n        \"model.ckpt.meta\",\n        \"vocab.bpe\",\n    ]:\n        url = \"https://openaipublic.blob.core.windows.net/gpt-2/models\"\n        r = requests.get(f\"{url}/{model_size}/{filename}\", stream=True)\n        r.raise_for_status()\n\n        with open(os.path.join(model_dir, filename), \"wb\") as f:\n            file_size = int(r.headers[\"content-length\"])\n            chunk_size = 1000\n            with tqdm(\n                ncols=100,\n                desc=\"Fetching \" + filename,\n                total=file_size,\n                unit_scale=True,\n            ) as pbar:\n                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes\n                for chunk in r.iter_content(chunk_size=chunk_size):\n                    f.write(chunk)\n                    pbar.update(chunk_size)\n\n\ndef load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):\n    def set_in_nested_dict(d, keys, val):\n        if not keys:\n            return val\n        if keys[0] not in d:\n            d[keys[0]] = {}\n        d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val)\n        return d\n\n    init_vars = tf.train.list_variables(tf_ckpt_path)\n    params = {\"blocks\": [{} for _ in range(hparams[\"n_layer\"])]}\n    for name, _ in init_vars:\n        array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))\n        name = name.removeprefix(\"model/\")\n        if name.startswith(\"h\"):\n            m = re.match(r\"h([0-9]+)/(.*)\", name)\n            n = int(m[1])\n            sub_name = m[2]\n            
set_in_nested_dict(params[\"blocks\"][n], sub_name.split(\"/\"), array)\n        else:\n            set_in_nested_dict(params, name.split(\"/\"), array)\n\n    return params\n\n\ndef load_encoder_hparams_and_params(model_size, models_dir):\n    assert model_size in [\"124M\", \"355M\", \"774M\", \"1558M\"]\n\n    model_dir = os.path.join(models_dir, model_size)\n    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n    if not tf_ckpt_path:  # download files if necessary\n        os.makedirs(model_dir, exist_ok=True)\n        download_gpt2_files(model_size, model_dir)\n        tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n\n    hparams = json.load(open(os.path.join(model_dir, \"hparams.json\")))\n    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)\n\n    return hparams, params\n\ndef convert(params, n_head, n_ctx, idx, decoder_txt,\n        vocab_idx, vocab_txt, byte_decoder):\n    t1 = clock()\n    blocks = params[\"blocks\"]\n    n_embd = blocks[0][\"ln_1\"][\"b\"].size\n    n_layer = len(blocks)\n    mlp_fc_w = np.empty((n_layer,n_embd,4*n_embd), dtype=np.float32)\n    mlp_fc_b = np.empty((n_layer,4*n_embd), dtype=np.float32)\n    mlp_proj_w = np.empty((n_layer,4*n_embd,n_embd), dtype=np.float32)\n    mlp_proj_b = np.empty((n_layer,n_embd), dtype=np.float32)\n    attn_w = np.empty((n_layer,n_embd,3*n_embd), dtype=np.float32)\n    attn_b = np.empty((n_layer,3*n_embd), dtype=np.float32)\n    attn_proj_w = np.empty((n_layer,n_embd,n_embd), dtype=np.float32)\n    attn_proj_b = np.empty((n_layer,n_embd), dtype=np.float32)\n    ln1_g = np.empty((n_layer,n_embd), dtype=np.float32)\n    ln1_b = np.empty((n_layer,n_embd), dtype=np.float32)\n    ln2_g = np.empty((n_layer,n_embd), dtype=np.float32)\n    ln2_b = np.empty((n_layer,n_embd), dtype=np.float32)\n    for i, block in enumerate(blocks):\n        mlp_fc_w[i,:,:] = block[\"mlp\"][\"c_fc\"][\"w\"]\n        mlp_fc_b[i,:] = block[\"mlp\"][\"c_fc\"][\"b\"]\n        mlp_proj_w[i,:,:] = block[\"mlp\"][\"c_proj\"][\"w\"]\n        mlp_proj_b[i,:] = block[\"mlp\"][\"c_proj\"][\"b\"]\n        attn_w[i,:,:] = block[\"attn\"][\"c_attn\"][\"w\"]\n        attn_b[i,:] = block[\"attn\"][\"c_attn\"][\"b\"]\n        attn_proj_w[i,:,:] = block[\"attn\"][\"c_proj\"][\"w\"]\n        attn_proj_b[i,:] = block[\"attn\"][\"c_proj\"][\"b\"]\n        ln1_g[i,:] = block[\"ln_1\"][\"g\"]\n        ln1_b[i,:] = block[\"ln_1\"][\"b\"]\n        ln2_g[i,:] = block[\"ln_2\"][\"g\"]\n        ln2_b[i,:] = block[\"ln_2\"][\"b\"]\n    wte = params[\"wte\"]\n    wpe = params[\"wpe\"]\n    lnf_g = params[\"ln_f\"][\"g\"]\n    lnf_b = params[\"ln_f\"][\"b\"]\n    t2 = clock()\n    print(\"Transform time: \", t2-t1)\n    t1 = clock()\n\n    n_vocab = np.size(wte, 0)\n    assert np.size(wte, 1) == n_embd\n\n    model_type = 0xfa51697 # fastGPT\n    model_version = 2\n    header = np.array([model_type, model_version, n_vocab, n_ctx, n_embd, n_layer, n_head,\n        len(idx),len(decoder_txt.encode(\"utf-8\")),\n        len(vocab_idx),len(vocab_txt.encode(\"utf-8\")),len(byte_decoder)], dtype=np.int32)\n\n    # Save the model to GGUF\n    def save_gguf(data_offset_name, data_offset_value):\n        g = gguf.GGUFWriter(\"model.gguf\", None)\n        g.add_int32(data_offset_name, data_offset_value)\n        g.add_tensor(\"header\", header)\n        g.add_tensor(\"wte\", wte); g.add_tensor(\"wpe\", wpe)\n        g.add_tensor(\"mlp_fc_w\", mlp_fc_w); g.add_tensor(\"mlp_fc_b\", mlp_fc_b)\n        g.add_tensor(\"mlp_proj_w\", mlp_proj_w); g.add_tensor(\"mlp_proj_b\", 
mlp_proj_b)\n        g.add_tensor(\"attn_w\", attn_w); g.add_tensor(\"attn_b\", attn_b)\n        g.add_tensor(\"attn_proj_w\", attn_proj_w); g.add_tensor(\"attn_proj_b\",\n                attn_proj_b)\n        g.add_tensor(\"ln1_b\", ln1_b); g.add_tensor(\"ln1_g\", ln1_g)\n        g.add_tensor(\"ln2_b\", ln2_b); g.add_tensor(\"ln2_g\", ln2_g)\n        g.add_tensor(\"lnf_b\", lnf_b); g.add_tensor(\"lnf_g\", lnf_g)\n        g.add_tensor(\"idx\", idx)\n        g.add_tensor(\"decoder_txt\", np.frombuffer(decoder_txt.encode(\"utf-8\"),\n            dtype=np.int8))\n        g.add_tensor(\"vocab_idx\", vocab_idx)\n        g.add_tensor(\"vocab_txt\", np.frombuffer(vocab_txt.encode(\"utf-8\"),\n            dtype=np.int8))\n        g.add_tensor(\"byte_decoder\", byte_decoder)\n        g.write_header_to_file()\n        g.write_kv_data_to_file()\n        g.write_tensors_to_file()\n        g.close()\n\n    data_offset_name = \"general.data_offset\"\n    save_gguf(data_offset_name, 0)\n\n    g = gguf.GGUFReader(\"model.gguf\")\n    data_offset = g.tensors[0].data_offset\n    # * .offset: the offset of the kv entry\n    # * 8: The i64 length of the key string\n    # * 4: The i32 type of the value\n    assert g.fields[data_offset_name].offset == 24\n    offset_offset = g.fields[data_offset_name].offset + 8 + \\\n        len(data_offset_name) + 4\n    print(\"offset offset:\", offset_offset)\n    print(\"data offset:\", data_offset)\n\n    save_gguf(data_offset_name, data_offset)\n\n    t2 = clock()\n    print(\"Save time: \", t2-t1)\n\n\ndef load_decoder(filename):\n    D = json.load(open(filename))\n    D2 = {v: k for k, v in D.items()}\n    i = 0\n    decoder = []\n    while True:\n        if i not in D2:\n            break\n        decoder.append(D2[i])\n        i += 1\n    return decoder\n\ndef load_vocab(filename):\n    D = open(filename).read()\n    D = D.split(\"\\n\")\n    D = D[1:]\n    return D\n\ndef decoder_idx(decoder):\n    i = 0\n    idx = np.empty(len(decoder)+1, dtype=np.int32)\n    idx[0] = i\n    for n, t in enumerate(decoder):\n        i += len(t.encode(\"utf-8\"))\n        idx[n+1] = i\n    assert idx[-1] == len(\"\".join(decoder).encode(\"utf-8\"))\n    return idx\n\ndef bytes_to_unicode():\n    bs = list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n    cs = bs[:]\n    n = 0\n    for b in range(2**8):\n        if b not in bs:\n            bs.append(b)\n            cs.append(2**8 + n)\n            n += 1\n    cs = [chr(n) for n in cs]\n    btu = dict(zip(bs, cs))\n    byte_decoder = {v: k for k, v in btu.items()}\n    bd = np.zeros(324, dtype=np.int32)\n    for y in byte_decoder:\n        x = ord(y)\n        bd[x] = byte_decoder[y]\n    bd2 = np.zeros(256, dtype=np.int32)\n    for i in range(np.size(bd)):\n        bd2[bd[i]] = i\n    return bd2\n\ndef main(model_size: str = \"124M\", models_dir: str = \"models\"):\n    print(\"Loading model\")\n    # load encoder, hparams, and params from the released open-ai gpt-2 files\n    t1 = clock()\n    hparams, params = load_encoder_hparams_and_params(model_size, models_dir)\n    decoder = load_decoder(os.path.join(models_dir, model_size, \"encoder.json\"))\n    vocab = load_vocab(os.path.join(models_dir, model_size, \"vocab.bpe\"))\n    t2 = clock()\n    print(\"  Done. 
Loading time: \", t2-t1)\n\n    # generate output ids\n    print(\"Converting model, saving to `model.gguf`\")\n    t1 = clock()\n    decoder_txt = \"\".join(decoder)\n    idx = decoder_idx(decoder)\n    vocab_txt = \"\".join(vocab)\n    vocab_idx = decoder_idx(vocab)\n    byte_decoder = bytes_to_unicode()\n    convert(params, hparams[\"n_head\"], hparams[\"n_ctx\"], idx, decoder_txt,\n            vocab_idx, vocab_txt, byte_decoder)\n    t2 = clock()\n    print(\"  Done. Time: \", t2-t1)\n\n\nif __name__ == \"__main__\":\n    import fire\n    fire.Fire(main)\n"
  },
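The `offset_offset` that this script prints is the same constant (55) that `load_model` in `driver.f90` builds up term by term; the arithmetic can be checked in isolation (a sketch of the GGUF layout assumed by both files):

```python
# Recompute where the "general.data_offset" value sits in model.gguf:
# u8[4] magic + u32 version + u64 n_tensors + u64 n_kv, then the first kv
# pair's u64 key length, the key bytes, and the u32 value-type tag.
key = "general.data_offset"
gguf_header = 4 + 4 + 8 + 8
offset_offset = gguf_header + 8 + len(key) + 4
assert offset_offset == 55  # matches the parameter in driver.f90
```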
  {
    "path": "driver.f90",
    "content": "module driver\nuse gpt2_mod, only: generate, model_t\nuse tokenizer, only: encode, decode, string\nuse omp, only: omp_get_wtime\nimplicit none\n\ninteger, parameter :: sp = kind(0.0)\ninteger, parameter :: dp = kind(0.d0)\ncharacter(1), parameter :: LF = achar(10)\n\ncontains\n\nsubroutine load_input(filename, input_txt, n_tokens_to_generate)\n! Load the input from a namelist `filename`\ncharacter(*), intent(in) :: filename\ncharacter(:), allocatable, intent(out) :: input_txt\ninteger, intent(out) :: n_tokens_to_generate\ncharacter(1024) :: input_txt2\ninteger :: u, ios\nnamelist / input_fastGPT / n_tokens_to_generate\nallocate(character(0) :: input_txt)\ninput_txt = \"\"\nopen(newunit=u, file=filename, status=\"old\")\nread(u, input_fastGPT)\ndo\n    read(u, \"(a)\", iostat=ios) input_txt2\n    if (ios /= 0) exit\n    if (len(input_txt) > 0) input_txt = input_txt // char(10)\n    input_txt = input_txt // trim(input_txt2)\nend do\nclose(u)\nend subroutine\n\n! Skips `amount` bytes from the current position\nsubroutine fskip(u, amount)\ninteger, intent(in) :: u, amount\ncharacter, allocatable :: tmp(:)\n! Note: the code below is equivalent to the non-standard: fseek(u, amount, 1)\n! Let's allocate on heap, in case the skip is large\nallocate(tmp(amount))\nread(u) tmp\nend subroutine\n\n! Aligns file position in `u` to 32 byte boundary after `A` was read\nsubroutine align_i4(u, A)\ninteger, intent(in) :: u\ninteger, intent(in) :: A(..)\ninteger :: n, alignment\nalignment = 32\nn = size(A)*4\ncall fskip(u, alignment-modulo(n,alignment))\nend subroutine\n\nsubroutine align_str(u, A)\ninteger, intent(in) :: u\ncharacter, intent(in) :: A(:)\ninteger :: n, alignment\nalignment = 32\nn = size(A)\nif (modulo(n, alignment) /= 0) then\n    call fskip(u, alignment-modulo(n,alignment))\nend if\nend subroutine\n\nsubroutine load_model(filename, m)\ncharacter(*), intent(in) :: filename\ntype(model_t), intent(out) :: m\n! We use the following fastGPT model type number\n!   fastGPT (digits look similar to the letters they represent)\n! 0xfa51697 = 262477463\n\n! We read the offset to the data section at this position, which is the first\n! variable in the metadata, the name is \"general.data_offset\", type i32.\ninteger, parameter :: offset_offset = &\n    ! header\n    4 + & ! u8[4] magic\n    4 + & ! u32 version\n    8 + & ! u64 n_arrays\n    8 + & ! u64 n_kv\n    ! kv\n    8 + & ! u64 n_str\n    19 + & ! len(\"general.data_offset\")\n    4 ! u32 type of value\ninteger, parameter :: current_model_mark = 262477463\ninteger, parameter :: current_model_version = 2\ninteger :: model_mark\ninteger :: u\ninteger :: data_offset\nopen(newunit=u, file=filename, form=\"unformatted\", access=\"stream\", status=\"old\")\ncall fskip(u, offset_offset)\nread(u) data_offset\n! Alternatively we could have done: rewind(u); call fskip(u, data_offset)\ncall fskip(u, data_offset-offset_offset-4)\nread(u) model_mark\nif (model_mark /= current_model_mark) then\n    print *, \"Found:\", model_mark\n    print *, \"Expected:\", current_model_mark\n    error stop \"Invalid fastGPT model file\"\nend if\nread(u) m%model_file_version\nif (m%model_file_version /= current_model_version) then\n    print *, \"Found:\", m%model_file_version\n    print *, \"Expected:\", current_model_version\n    error stop \"Incompatible model version\"\nend if\nread(u) m%n_vocab, m%n_ctx, m%n_embd, m%n_layer, m%n_head, m%n_decoder_idx, &\n    m%n_decoder_txt, m%n_vocab_idx, m%n_vocab_txt, m%n_byte_encoder\ncall fskip(u, 16) ! 
Pad the 12 element i32 array to 32 byte boundary\nallocate(m%wte(m%n_embd,m%n_vocab), m%wpe(m%n_embd,m%n_ctx), &\n    m%mlp_fc_w(4*m%n_embd,m%n_embd,m%n_layer), m%mlp_fc_b(4*m%n_embd,m%n_layer), &\n    m%mlp_proj_w(m%n_embd,4*m%n_embd,m%n_layer), m%mlp_proj_b(m%n_embd,m%n_layer), &\n    m%attn_w(3*m%n_embd,m%n_embd,m%n_layer), m%attn_b(3*m%n_embd,m%n_layer), &\n    m%attn_proj_w(m%n_embd,m%n_embd,m%n_layer), m%attn_proj_b(m%n_embd,m%n_layer), &\n    m%ln1_b(m%n_embd,m%n_layer), m%ln1_g(m%n_embd,m%n_layer), &\n    m%ln2_b(m%n_embd,m%n_layer), m%ln2_g(m%n_embd,m%n_layer), &\n    m%lnf_b(m%n_embd), m%lnf_g(m%n_embd), &\n    m%decoder_idx(0:m%n_decoder_idx-1), m%decoder_txt(m%n_decoder_txt), &\n    m%vocab_idx(0:m%n_vocab_idx-1), m%vocab_txt(m%n_vocab_txt), &\n    m%byte_encoder(0:m%n_byte_encoder-1))\nread(u) m%wte, m%wpe, &\n    m%mlp_fc_w, m%mlp_fc_b, &\n    m%mlp_proj_w, m%mlp_proj_b, &\n    m%attn_w, m%attn_b, &\n    m%attn_proj_w, m%attn_proj_b, &\n    m%ln1_b, m%ln1_g, &\n    m%ln2_b, m%ln2_g, &\n    m%lnf_b, m%lnf_g, &\n    m%decoder_idx\ncall align_i4(u, m%decoder_idx)\nread(u) m%decoder_txt\ncall align_str(u, m%decoder_txt)\nread(u) m%vocab_idx\ncall align_i4(u, m%vocab_idx)\nread(u) m%vocab_txt\ncall align_str(u, m%vocab_txt)\nread(u) m%byte_encoder\nclose(u)\nend subroutine\n\nsubroutine gpt2_driver(input, output, m)\ninteger, allocatable, intent(out) :: input(:), output(:)\ntype(model_t), intent(out) :: m\ncharacter(:), allocatable :: input_txt\ninteger :: n_tokens_to_generate\nreal(dp) :: t1, t2\ncall load_input(\"input\", input_txt, n_tokens_to_generate)\n\n! Load the model\nprint \"(a)\", \"Loading the model...\"\ncall cpu_time(t1)\ncall load_model(\"model.gguf\", m)\ncall cpu_time(t2)\nprint \"(a,f8.3,a,i2)\", \"    done. Time:\", t2-t1, \"s, Model file version:\", m%model_file_version\nprint *\nprint \"(a)\", \"Model parameters:\"\nprint \"(a,i6)\", \"n_vocab =\", m%n_vocab\nprint \"(a,i6)\", \"n_ctx   =\", m%n_ctx\nprint \"(a,i6)\", \"n_embd  =\", m%n_embd\nprint \"(a,i6)\", \"n_layer =\", m%n_layer\nprint \"(a,i6)\", \"n_head  =\", m%n_head\nprint *\n\ncall gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output)\nendsubroutine\n\nsubroutine gpt2_driver2(input_txt, n_tokens_to_generate, m, input, output)\ncharacter(*), intent(in) :: input_txt\ninteger, intent(in) :: n_tokens_to_generate\ntype(model_t), intent(in) :: m\ninteger, allocatable, intent(out) :: input(:), output(:)\ninteger, allocatable :: byte_decoder(:)\ninteger :: n_seq\ncharacter(:), allocatable :: output_txt\nreal(dp) :: t1, t2, t1o, t2o\ninteger :: i\nlogical :: use_cache\n\n! Compute byte_decoder:\nallocate(byte_decoder(0:maxval(m%byte_encoder)))\nbyte_decoder = 0\ndo i = 0, size(m%byte_encoder)-1\n    byte_decoder(m%byte_encoder(i)) = i\nend do\n\nprint \"(a)\", \"Input text\"\nprint \"(a)\", input_txt\n\nprint *\nprint \"(a)\",  \"Encoding: tokenizing input text into tokens (currently slow)...\"\ncall cpu_time(t1)\ninput = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, &\n    m%byte_encoder)\ncall cpu_time(t2)\nn_seq = size(input)\nprint \"(a,f8.3,a)\", \"    done. 
Time:\", t2-t1, \"s\"\nprint *\nprint \"(a)\", \"Input parameters:\"\nprint \"(a,i4)\", \"n_seq                =\", n_seq\nprint \"(a,i4)\", \"n_tokens_to_generate =\", n_tokens_to_generate\nprint *\nprint \"(a)\", \"Input tokens:\"\nprint \"(1000(i6))\", input\nprint *\n\nif (n_seq + n_tokens_to_generate >= m%n_ctx) then\n    print *, \"The maximum sequence length of the model was surpassed.\"\n    print *, \"Make the input and/or number of tokens to generate shorter.\"\n    error stop\nend if\n\nprint \"(a)\", \"Decoded input as text:\"\n!print \"(a)\", decode(input, decoder_idx, decoder_txt, byte_decoder)\nallocate(character(0) :: output_txt) ! Fix GFortran warning\noutput_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder)\nprint \"(a)\", output_txt\nprint *\n\nif (input_txt /= output_txt) then\n    error stop \"The decoded input text does not agree with the input text\"\nend if\n\nprint \"(a)\", \"Running model...\"\ncall cpu_time(t1)\nt1o = omp_get_wtime()\nuse_cache = .true.\ncall generate(output, n_tokens_to_generate, m, size(input), input, use_cache, &\n    byte_decoder)\nprint *\nt2o = omp_get_wtime()\ncall cpu_time(t2)\nprint \"(a,f8.3,a,f4.2,a)\", \"    done. Time:\", t2o-t1o, \"s (\", (t2-t1)/(t2o-t1o), \"x)\"\nprint *\nprint \"(a)\", \"Output tokens:\"\nprint \"(1000(i6))\", output\noutput_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder)\nprint *\nprint \"(a)\", \"Decoded output as text:\"\nprint \"(a)\", output_txt\nend subroutine\n\nsubroutine gpt2_driver3(input_txt, n_tokens_to_generate, stop_text, m, output_txt)\ncharacter(*), intent(in) :: input_txt, stop_text\ninteger, intent(in) :: n_tokens_to_generate\ntype(model_t), intent(in) :: m\ninteger, allocatable :: input(:), output(:)\ninteger, allocatable :: byte_decoder(:)\ninteger :: n_seq\ncharacter(:), allocatable, intent(out) :: output_txt\ninteger :: i\nlogical :: use_cache\n! TODO: move the decoder into model_t\n! Compute byte_decoder:\nallocate(byte_decoder(0:maxval(m%byte_encoder)))\nbyte_decoder = 0\ndo i = 0, size(m%byte_encoder)-1\n    byte_decoder(m%byte_encoder(i)) = i\nend do\ninput = encode(input_txt, m%decoder_idx, m%decoder_txt, m%vocab_idx, m%vocab_txt, &\n    m%byte_encoder)\nn_seq = size(input)\nif (n_seq + n_tokens_to_generate >= m%n_ctx) then\n    print *, \"The maximum sequence length of the model was surpassed.\"\n    print *, \"Make the input and/or number of tokens to generate shorter.\"\n    error stop\nend if\nallocate(character(0) :: output_txt) ! Fix GFortran warning\noutput_txt = decode(input, m%decoder_idx, m%decoder_txt, byte_decoder)\nif (input_txt /= output_txt) then\n    error stop \"The decoded input text does not agree with the input text\"\nend if\nuse_cache = .true.\ncall generate(output, n_tokens_to_generate, m, size(input), input, use_cache, &\n    byte_decoder, stop_text)\noutput_txt = decode(output, m%decoder_idx, m%decoder_txt, byte_decoder)\nend subroutine\n\nfunction get_prompt() result(input)\ncharacter(:), allocatable :: input\ncharacter(1024) :: tmp\ninteger ::ios\nread(*,\"(a)\",iostat=ios) tmp\nif (ios == 0) then\n    input = trim(tmp)\nelse\n    input = \"\"\nend if\nend function\n\nsubroutine chat(inputs)\ntype(string), optional, intent(in) :: inputs(:)\ntype(model_t) :: m\ncharacter(:), allocatable :: prompt, input, output\ninteger :: i, n_prompts\ncall load_model(\"model.gguf\", m)\nprompt = \"Your name is fastGPT and you are an AI bot. 
The user will ask you &\n&questions and you answer in a nice, truthful, short way.\" // LF // \"&\n&User: What is the capital of Czechia?\" // LF // \"&\n&fastGPT: Prague.\" // LF // \"&\n&User: How many legs does a dog have?\" // LF // \"&\n&fastGPT: Four.\" // LF // \"&\n&User:\"\nwrite(*,\"(a)\",advance=\"no\") prompt\nif (present(inputs)) then\n    n_prompts = size(inputs)\nelse\n    n_prompts = 1024\nend if\ndo i = 1, n_prompts\n    write(*,\"(a)\",advance=\"no\")  \" \"\n    if (present(inputs)) then\n        input = inputs(i)%s\n        write(*,\"(a)\") input\n    else\n        input = get_prompt()\n        if (input == \"\") exit\n    end if\n    write(*,\"(a)\",advance=\"no\") \"fastGPT:\"\n    prompt = prompt // \" \" // input // LF // \"fastGPT:\"\n    call gpt2_driver3(prompt, 200, \"User:\", m, output)\n    prompt = prompt // output\nend do\nprint *\nend subroutine\n\nend module\n"
  },
  {
    "path": "environment.yml",
    "content": "name: fastgpt\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.9\n  - numpy=1.24.2\n  - tensorflow=2.11.0\n  - tqdm=4.65.0\n  - fire=0.4.0\n  - regex=2022.10.31\n  #- gfortran=14.2.0\n  - cmake=3.25.2\n  - transformers=4.26.1\n  - openblas=0.3.21\n"
  },
  {
    "path": "fpm.toml",
    "content": "name = \"fastGPT\"\n"
  },
  {
    "path": "gpt2.f90",
    "content": "module gpt2_mod\nuse linalg, only: matmul_2d, matmul_2d_t\nuse tokenizer, only: decode\nimplicit none\n\ninteger, parameter :: sp = kind(0.0)\nreal(sp), parameter :: pi = 3.14159265358979323846_sp\n\n! This derived type contains all the data of the GPT-2 model, including all\n! weights, model parameters, and encoder/decoder data\ntype :: model_t\n    integer :: n_vocab, n_ctx, n_embd, n_layer, n_head, &\n        n_decoder_idx, n_decoder_txt, &\n        n_vocab_idx, n_vocab_txt, n_byte_encoder\n    real(sp), allocatable :: wte(:,:), wpe(:,:), &\n        mlp_fc_w(:,:,:), mlp_fc_b(:,:), &\n        mlp_proj_w(:,:,:), mlp_proj_b(:,:), &\n        attn_w(:,:,:), attn_b(:,:), &\n        attn_proj_w(:,:,:), attn_proj_b(:,:), &\n        ln1_b(:,:), ln1_g(:,:), &\n        ln2_b(:,:), ln2_g(:,:), &\n        lnf_b(:), lnf_g(:)\n    integer, allocatable :: decoder_idx(:), vocab_idx(:), byte_encoder(:)\n    character, allocatable :: decoder_txt(:), vocab_txt(:)\n    integer :: model_file_version\nend type\n\ncontains\n\nelemental real(sp) function fast_tanh(x) result(y)\nreal(sp), intent(in) :: x\nreal(sp) :: x2\nif (x > 5) then\n    y = 1\nelseif (x < -5) then\n    y = -1\nelse\n    x2 = x*x\n    y = x * (0.98569772605911309407 + x2 *(-0.2794500993392901382 &\n        + x2 * (6.8280504526399188164e-2 + x2 * (-1.0972014877337651823e-2 &\n        + x2 * (1.1132367134444316902e-3 + x2 * (-7.018851897305717565e-5 &\n        + x2 * (2.656616768082727089e-6 + x2 * (-5.5138381821615909058e-8 &\n        + x2 * 4.8162484477588665996e-10))))))))\nend if\nend function\n\nelemental real(sp) function gelu(x) result(y)\nreal(sp), intent(in) :: x\ny = 0.5_sp * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715_sp * x**3)))\nend function\n\nfunction softmax(x) result(y)\nreal(sp), intent(in) :: x(:,:)\nreal(sp) :: y(size(x,1),size(x,2))\ninteger :: i\ndo i = 1, size(x,2)\n    y(:,i) = exp(x(:,i) - maxval(x(:,i)))\n    y(:,i) = y(:,i) / sum(y(:,i))\nend do\nend function\n\nfunction layer_norm(x, g, b, eps) result(y)\nreal(sp), intent(in) :: x(:,:), g(:), b(:), eps\nreal(sp) :: y(size(x,1),size(x,2))\nreal(sp) :: mean(size(x,2)), variance(size(x,2))\ninteger :: i\ndo i = 1, size(x,2)\n    mean(i) = sum(x(:,i)) / size(x,1)\n    variance(i) = sum((x(:,i) - mean(i))**2) / size(x,1)\nend do\n!do i = 1, size(x,1)\n!    y(i,:) = (x(i,:) - mean(:)) / sqrt(variance(:) + eps)\n!    
y(i,:) = g(i) * y(i,:) + b(i)\n!end do\ndo i = 1, size(x,2)\n    y(:,i) = (x(:,i) - mean(i)) / sqrt(variance(i) + eps)\n    y(:,i) = g(:) * y(:,i) + b(:)\nend do\nend function\n\nfunction linear(x, w, b) result(y)\nreal(sp), intent(in) :: x(:,:), w(:,:), b(:)\nreal(sp) :: y(size(b,1),size(x,2))\ninteger :: i\n!y = matmul(w, x) + spread(b, 2, size(x,2))\n!y = matmul(w, x)\ncall matmul_2d(w, x, y)\ndo i = 1, size(y,2)\n    y(:,i) = y(:,i) + b(:)\nend do\nend function\n\nfunction ffn(x, fc_w, fc_b, proj_w, proj_b) result(y)\nreal(sp), intent(in) :: x(:,:), fc_w(:,:), fc_b(:), proj_w(:,:), proj_b(:)\nreal(sp) :: y(size(x,1),size(x,2))\n!real(sp) :: a(4*size(x,1),size(x,2))\n!a = gelu(linear(x, fc_w, fc_b))\ny = linear(gelu(linear(x, fc_w, fc_b)), proj_w, proj_b)\nend function\n\nfunction attention(n_embd_head,n_seq,n_seq_x, q, k, v, mask) result(y)\ninteger, intent(in) :: n_embd_head, n_seq, n_seq_x\nreal(sp), intent(in) :: q(n_embd_head,n_seq_x), k(n_embd_head,n_seq), v(n_embd_head,n_seq), mask(n_seq,n_seq_x)\nreal(sp) :: y(n_embd_head,n_seq_x)\nreal(sp) :: tmp(n_seq,n_seq_x)\n!tmp = matmul(transpose(k), q)\n!call matmul_2d(transpose(k), q, tmp)\ncall matmul_2d_t(k, q, tmp)\ncall matmul_2d(v, softmax(tmp / sqrt(real(n_embd_head,sp)) + mask), y)\nend function\n\nfunction mha(n_seq, n_seq_x, n_embd, x, attn_w, attn_b, proj_w, proj_b, n_head, &\n            use_kv_cache, kv_cache) &\n        result(y)\ninteger, intent(in) :: n_seq, n_seq_x, n_embd\nreal(sp), intent(in) :: x(n_embd,n_seq_x), &\n    attn_w(3*n_embd,n_embd), attn_b(3*n_embd), &\n    proj_w(n_embd,n_embd), proj_b(n_embd)\nreal(sp), intent(inout) :: kv_cache(n_embd,n_seq,2)\ninteger, intent(in) :: n_head\nlogical, intent(in) :: use_kv_cache\nreal(sp) :: y(n_embd,n_seq_x)\nreal(sp) :: causal_mask(n_seq,n_seq_x)\nreal(sp) :: x2(3*n_embd,n_seq_x)\nreal(sp) :: q(n_embd/n_head,n_seq_x), k(n_embd/n_head,n_seq), v(n_embd/n_head,n_seq)\nreal(sp) :: yy(n_embd/n_head,n_seq_x)\ninteger :: i, j, l\n! Mask\nif (use_kv_cache) then\n    causal_mask = 0\nelse\n    do j = 1, n_seq\n    do i = 1, n_seq\n        if (i > j) then\n            causal_mask(i,j) = -1e10_sp\n        else\n            causal_mask(i,j) = 0\n        end if\n    end do\n    end do\nend if\nx2 = linear(x, attn_w, attn_b)\nif (use_kv_cache) then\n    do j = 1, n_embd\n        kv_cache(j,n_seq,1) = x2((2-1)*n_embd+j,1)\n        kv_cache(j,n_seq,2) = x2((3-1)*n_embd+j,1)\n    end do\nelse\n    do i = 1, n_seq\n    do j = 1, n_embd\n        kv_cache(j,i,1) = x2((2-1)*n_embd+j,i)\n        kv_cache(j,i,2) = x2((3-1)*n_embd+j,i)\n    end do\n    end do\nend if\n! Perform attention over each head\ndo l = 1, n_head\n    do i = 1, n_seq_x\n    do j = 1, n_embd/n_head\n        q(j,i) = x2((l-1)*n_embd/n_head+j,i)\n    end do\n    end do\n    do i = 1, n_seq\n    do j = 1, n_embd/n_head\n        k(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,1)\n        v(j,i) = kv_cache((l-1)*n_embd/n_head+j,i,2)\n    end do\n    end do\n    yy = attention(n_embd/n_head, n_seq, n_seq_x, q, k, v, causal_mask)\n    do i = 1, n_seq_x\n    do j = 1, n_embd/n_head\n        y((l-1)*n_embd/n_head+j,i) = yy(j,i)\n    end do\n    end do\nend do\n! 
Out projection\ny = linear(y, proj_w, proj_b)\nend function\n\n\nfunction transformer_block(n_seq, n_seq_x, n_embd, x, mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, &\n        attn_w, attn_b, attn_proj_w, attn_proj_b, ln1_g, ln1_b, ln2_g, ln2_b, &\n        n_head, use_kv_cache, kv_cache) result(y)\nreal(sp), intent(in) :: x(n_embd,n_seq_x), &\n    mlp_fc_w(:,:), mlp_fc_b(:), &\n    mlp_proj_w(:,:), mlp_proj_b(:), &\n    attn_w(:,:), attn_b(:), attn_proj_w(:,:), attn_proj_b(:), &\n    ln1_g(:), ln1_b(:), ln2_g(:), ln2_b(:)\ninteger, intent(in) :: n_head\ninteger, intent(in) :: n_seq, n_seq_x, n_embd\nreal(sp) :: y(n_embd,n_seq_x)\nlogical, intent(in) :: use_kv_cache\nreal(sp), intent(inout) :: kv_cache(n_embd,n_seq,2)\ny = x + mha(n_seq, n_seq_x, n_embd, layer_norm(x, ln1_g, ln1_b, 1e-5_sp), &\n    attn_w, attn_b, attn_proj_w, attn_proj_b, n_head, use_kv_cache, kv_cache)\ny = y + ffn(layer_norm(y, ln2_g, ln2_b, 1e-5_sp), &\n    mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b)\nend function\n\nfunction gpt2(n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head, input, &\n        wte, wpe, &\n        mlp_fc_w, mlp_fc_b, mlp_proj_w, mlp_proj_b, &\n        attn_w, attn_b, attn_proj_w, attn_proj_b, &\n        ln1_g, ln1_b, ln2_g, ln2_b, lnf_g, lnf_b, &\n        use_kv_cache, kv_cache) result(y)\ninteger, intent(in) :: n_vocab, n_ctx, n_seq, n_seq_x, n_embd, n_layer, n_head\ninteger, intent(in) :: input(n_seq)\nreal(sp), intent(in) :: wte(n_embd,n_vocab), wpe(n_embd,n_ctx), &\n    mlp_fc_w(4*n_embd,n_embd,n_layer), mlp_fc_b(4*n_embd,n_layer), &\n    mlp_proj_w(n_embd,4*n_embd,n_layer), mlp_proj_b(n_embd,n_layer), &\n    attn_w(3*n_embd,n_embd,n_layer), attn_b(3*n_embd,n_layer), &\n    attn_proj_w(n_embd,n_embd,n_layer), attn_proj_b(n_embd,n_layer), &\n    ln1_b(n_embd,n_layer), ln1_g(n_embd,n_layer), &\n    ln2_b(n_embd,n_layer), ln2_g(n_embd,n_layer), &\n    lnf_b(n_embd), lnf_g(n_embd)\nlogical, intent(in) :: use_kv_cache\nreal(sp), intent(inout) :: kv_cache(n_embd,n_seq,2,n_layer)\nreal(sp) :: y(n_vocab,n_seq_x)\nreal(sp) :: x(n_embd,n_seq_x)\ninteger :: i\nif (use_kv_cache) then\n    i = n_seq\n    x(:,1) = wte(:,input(i)+1) + wpe(:,i)\nelse\n    do i = 1, n_seq\n        x(:,i) = wte(:,input(i)+1) + wpe(:,i)\n    end do\nend if\ndo i = 1, n_layer\n    x = transformer_block(n_seq, n_seq_x, n_embd, x, &\n        mlp_fc_w(:,:,i), mlp_fc_b(:,i), &\n        mlp_proj_w(:,:,i), mlp_proj_b(:,i), &\n        attn_w(:,:,i), attn_b(:,i), attn_proj_w(:,:,i), attn_proj_b(:,i), &\n        ln1_g(:,i), ln1_b(:,i), ln2_g(:,i), ln2_b(:,i), &\n        n_head, use_kv_cache, kv_cache(:,:,:,i))\nend do\nx = layer_norm(x, lnf_g, lnf_b, 1e-5)\n!y = matmul(transpose(wte), x)\ncall matmul_2d_t(wte, x, y)\nend function\n\nsubroutine generate(output, n_tokens_to_generate, m, &\n        n_seq, input, &\n        use_cache, &\n        byte_decoder, stop_text)\ninteger, intent(in) :: n_seq, n_tokens_to_generate\ntype(model_t), intent(in) :: m\ninteger, intent(in) :: input(n_seq)\nlogical, intent(in) :: use_cache\ninteger, intent(in) :: byte_decoder(:)\ncharacter(*), intent(in), optional :: stop_text ! 
Stop if you see this text\ninteger, allocatable, intent(out) :: output(:)\nreal(sp), allocatable :: logits(:,:)\ninteger :: i\ninteger :: n_seq2, n_seq_x\ninteger :: next_id\ninteger :: input2(size(input)+n_tokens_to_generate)\nlogical :: use_kv_cache\nreal(sp) :: kv_cache(m%n_embd,n_seq+n_tokens_to_generate,2,m%n_layer)\nreal(sp), allocatable :: kv_cache2(:,:,:,:)\ncharacter(:), allocatable :: output_txt, last_token\nif (present(stop_text)) then\n    allocate(character(0) :: output_txt)\n    output_txt = \"\"\nend if\ninput2(:n_seq) = input\ndo i = 1, n_tokens_to_generate\n    if (use_cache) then\n        use_kv_cache = (i > 1) ! Use cache for subsequent tokens\n    else\n        use_kv_cache = .false.\n    end if\n    n_seq2 = n_seq+i-1\n    if (use_kv_cache) then\n        n_seq_x = 1\n    else\n        n_seq_x = n_seq2\n    end if\n    allocate(kv_cache2(m%n_embd,n_seq2,2,m%n_layer))\n    kv_cache2(:,:,:,:) = kv_cache(:,:n_seq2,:,:)\n    allocate(logits(m%n_vocab, n_seq_x))\n    logits = gpt2(m%n_vocab, m%n_ctx, n_seq2, n_seq_x, m%n_embd, m%n_layer, &\n            m%n_head, &\n            input2(:n_seq2), &\n            m%wte, m%wpe, &\n            m%mlp_fc_w, m%mlp_fc_b, m%mlp_proj_w, m%mlp_proj_b, &\n            m%attn_w, m%attn_b, m%attn_proj_w, m%attn_proj_b, &\n            m%ln1_g, m%ln1_b, m%ln2_g, m%ln2_b, m%lnf_g, m%lnf_b, use_kv_cache,&\n            kv_cache2)\n    kv_cache(:,:n_seq2,:,:) = kv_cache2(:,:,:,:)\n    deallocate(kv_cache2)\n    next_id = maxloc(logits(:,n_seq_x), dim=1)-1\n    input2(n_seq2+1) = next_id\n    last_token = decode([next_id], m%decoder_idx, &\n        m%decoder_txt, byte_decoder)\n    write(*, fmt=\"(a)\", advance=\"no\") last_token\n    if (present(stop_text)) then\n        output_txt = output_txt // last_token\n        ! Only compare once enough text was generated (avoids an\n        ! out-of-bounds substring in the first few iterations)\n        if (len(output_txt) >= len(stop_text)) then\n            if (output_txt(len(output_txt)-len(stop_text)+1:len(output_txt)) == stop_text) then\n                exit\n            end if\n        end if\n    end if\n    deallocate(logits)\nend do\nallocate(output(n_seq2 - n_seq + 1))\noutput(:) = input2(n_seq+1:n_seq2+1)\nend subroutine\n\nend module\n"
  },
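  {
    "path": "greedy_demo.f90",
    "content": "program greedy_demo\n! Illustrative sketch (hypothetical file, not part of the CMake build): shows\n! the greedy next-token selection used in generate() in gpt2.f90. The argmax\n! over a logits column is shifted by one because GPT-2 token ids are 0-based\n! while Fortran arrays are 1-based.\nimplicit none\ninteger, parameter :: sp = kind(0.0)\nreal(sp) :: logits(5)\ninteger :: next_id\nlogits = [0.1_sp, 2.5_sp, -1.0_sp, 0.7_sp, 0.3_sp]\n! maxloc returns the 1-based index 2, so the corresponding token id is 1:\nnext_id = maxloc(logits, dim=1) - 1\nprint *, \"next token id:\", next_id\nend program\n"
  },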
  {
    "path": "input",
    "content": "&input_fastGPT\nn_tokens_to_generate = 20\n/\nAlan Turing theorized that computers would one day become very powerful, but even he could not imagine\n"
  },
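  {
    "path": "input_demo.f90",
    "content": "program input_demo\n! Hypothetical sketch (not part of the CMake build) of how the `input` file\n! is laid out: a Fortran namelist header followed by the free-text prompt.\n! The actual reader lives in driver.f90; the names used here are assumptions.\nimplicit none\ninteger :: n_tokens_to_generate, u, ios\ncharacter(1024) :: prompt\nnamelist /input_fastGPT/ n_tokens_to_generate\nopen(newunit=u, file=\"input\", status=\"old\")\n! Read the namelist header (everything between &input_fastGPT and /):\nread(u, nml=input_fastGPT)\n! The rest of the file is the prompt itself:\nread(u, \"(a)\", iostat=ios) prompt\nclose(u)\nprint *, \"n_tokens_to_generate =\", n_tokens_to_generate\nprint *, \"prompt: \", trim(prompt)\nend program\n"
  },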
  {
    "path": "linalg_accelerate.c",
    "content": "/*\nThis file provides matvec implementation using the macOS Accelerate\nFramework, which seems to be the most optimized matrix matrix multiplication\non macOS.\n*/\n#include <CoreFoundation/CFAttributedString.h>\n#include <Accelerate/Accelerate.h>\n\nvoid acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {\n    //A[m][k]\n    //B[k][n]\n    //C[m][n]\n    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, m, B, k, 0.0, C, m);\n}\n\nvoid acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {\n    //A[k][m] (to be transposed)\n    //B[k][n]\n    //C[m][n]\n    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, A, k, B, k, 0.0, C, m);\n}\n"
  },
  {
    "path": "linalg_c.f90",
    "content": "module linalg\n! C implementation of the matmul routines\nuse iso_c_binding, only: c_int, c_float\nimplicit none\n\ninteger, parameter :: sp = kind(0.0)\n\ninterface\n    subroutine acc_sgemm(m, n, k, A, B, C) bind(c)\n    import :: c_int, c_float\n    implicit none\n    integer(c_int), value, intent(in) :: m, n, k\n    real(c_float), intent(in) :: A(m,k), B(k,n)\n    real(c_float), intent(out) :: C(m,n)\n    end subroutine\n\n    subroutine acc_sgemm_t(m, n, k, A, B, C) bind(c)\n    import :: c_int, c_float\n    implicit none\n    integer(c_int), value, intent(in) :: m, n, k\n    real(c_float), intent(in) :: A(k,m), B(k,n)\n    real(c_float), intent(out) :: C(m,n)\n    end subroutine\nend interface\n\ncontains\n\n    subroutine matmul_2d(A, B, C)\n    ! C = matmul(A, B)\n    real(sp), intent(in) :: A(:,:), B(:,:)\n    real(sp), intent(out) :: C(:,:)\n    call acc_sgemm(size(A,1), size(B,2), size(A,2), A, B, C)\n    end subroutine\n\n    subroutine matmul_2d_t(A, B, C)\n    ! C = matmul(transpose(A), B)\n    real(sp), intent(in) :: A(:,:), B(:,:)\n    real(sp), intent(out) :: C(:,:)\n    call acc_sgemm_t(size(A,2), size(B,2), size(A,1), A, B, C)\n    end subroutine\n\nend module\n"
  },
  {
    "path": "linalg_f.f90",
    "content": "module linalg\n! Pure Fortran implementation of the matmul routines\nimplicit none\n\ninteger, parameter :: sp = kind(0.0)\n\ncontains\n\n    subroutine matmul_2d(A, B, C)\n    real(sp), intent(in) :: A(:,:), B(:,:)\n    real(sp), intent(out) :: C(:,:)\n    C = matmul(A, B)\n    end subroutine\n\n    subroutine matmul_2d_t(A, B, C)\n    real(sp), intent(in) :: A(:,:), B(:,:)\n    real(sp), intent(out) :: C(:,:)\n    C = matmul(transpose(A), B)\n    end subroutine\n\nend module\n"
  },
  {
    "path": "linalg_openblas.c",
    "content": "/*\nThis file provides matvec implementation using OpenBLAS.\n*/\n#include <cblas.h>\n\nvoid acc_sgemm(int m, int n, int k, float *A, float *B, float *C) {\n    //A[m][k]\n    //B[k][n]\n    //C[m][n]\n    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, m, B, k, 0.0, C, m);\n}\n\nvoid acc_sgemm_t(int m, int n, int k, float *A, float *B, float *C) {\n    //A[k][m] (to be transposed)\n    //B[k][n]\n    //C[m][n]\n    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0, A, k, B, k, 0.0, C, m);\n}\n"
  },
  {
    "path": "main.f90",
    "content": "program gpt2\nuse driver, only: gpt2_driver, model_t\nimplicit none\ninteger, allocatable :: input(:), output(:)\ntype(model_t) :: m\ncall gpt2_driver(input, output, m)\nend program\n"
  },
  {
    "path": "omp.f90",
    "content": "module omp\nimplicit none\nprivate\npublic :: omp_get_wtime\n\ninteger, parameter :: dp = kind(0.d0)\n\ninterface\n    real(dp) function omp_get_wtime()\n    import :: dp\n    end function\nend interface\n\nend module\n"
  },
  {
    "path": "omp_dummy.f90",
    "content": "module omp\nimplicit none\nprivate\npublic :: omp_get_wtime\n\ninteger, parameter :: dp = kind(0.d0)\n\ncontains\n\nreal(dp) function omp_get_wtime()\nomp_get_wtime = 0\nend function\n\nend module\n"
  },
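  {
    "path": "omp_timing_demo.f90",
    "content": "program omp_timing_demo\n! Minimal sketch (hypothetical file, not part of the CMake build) showing how\n! the omp wrapper module is used for wall-clock timing. When omp_dummy.f90 is\n! linked instead of omp.f90, omp_get_wtime() always returns 0.\nuse omp, only: omp_get_wtime\nimplicit none\ninteger, parameter :: dp = kind(0.d0)\nreal(dp) :: t1, t2\nt1 = omp_get_wtime()\n! ... the code being timed would go here ...\nt2 = omp_get_wtime()\nprint *, \"Elapsed seconds:\", t2 - t1\nend program\n"
  },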
  {
    "path": "pt.py",
    "content": "from time import monotonic as clock\nimport os; os.environ[\"OMP_NUM_THREADS\"] = \"1\"\nprint(\"Importing\")\nt1 = clock()\nfrom transformers import pipeline\nt2 = clock()\nprint(\"  Time: \", t2-t1)\nprint(\"Loading\")\nt1 = clock()\ngenerator = pipeline('text-generation', model='gpt2')\nt2 = clock()\nprint(\"  Time: \", t2-t1)\ntext=\"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine\"\nprint(\"Generating\")\nt1 = clock()\ng = generator(text, do_sample=False, max_new_tokens=20, use_cache=True)\nt2 = clock()\nprint(\"  Time: \", t2-t1)\noutput = g[0][\"generated_text\"]\nprint(output)\n"
  },
  {
    "path": "tests/test_basic_input.f90",
    "content": "program test_basic_input\nuse driver, only: gpt2_driver, model_t\nimplicit none\n\ntype(model_t) :: m\n\ninteger, parameter :: input_ref(*) = [36235, 39141, 18765, 1143, 326, 9061, &\n    561, 530, 1110, 1716, 845, 3665, 11, 475, 772, 339, 714, 407, 5967]\ninteger, parameter :: output_ref(*) = [703, 484, 561, 307, 1498, 284, 466, &\n    523, 13, 198, 198, 1, 40, 892, 326, 262, 749, 1593, 1517, 318]\ninteger, allocatable :: input(:), output(:)\n\ncall gpt2_driver(input, output, m)\n\nprint *\nprint *, \"TESTS:\"\n\nif (all(input == input_ref)) then\n    print *, \"Input tokens agree with reference results\"\nelse\n    print *, \"Input tokens DO NOT agree with reference results\"\n    error stop\nend if\n\nif (all(output == output_ref)) then\n    print *, \"Output tokens agree with reference results\"\nelse\n    print *, \"Output tokens DO NOT agree with reference results\"\n    error stop\nend if\n\n\nend program\n"
  },
  {
    "path": "tests/test_chat.f90",
    "content": "program test_chat\nuse driver, only: chat\nuse tokenizer, only: string\nimplicit none\ntype(string), allocatable :: inputs(:)\ninputs = [ &\n    string(\"What color does the sky have?\"), &\n    string(\"What can you type a document on?\"), &\n    string(\"What can you drive in?\"), &\n    string(\"What can you fly in?\"), &\n    string(\"What continent is Germany in?\"), &\n    string(\"When did Second World War start?\"), &\n    string(\"When did it end?\"), &\n    string(\"When did the U.S. enter the Second World War?\"), &\n    string(\"When did the First World War start?\"), &\n    string(\"When did it end?\"), &\n    string(\"When did the Mexican-American war start?\"), &\n    string(\"When did it end?\"), &\n    string(\"What color is snow?\"), &\n    string(\"What color do plants usually have?\") &\n    ]\ncall chat(inputs(:3))\nend program\n"
  },
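  {
    "path": "tests/test_linalg.f90",
    "content": "program test_linalg\n! Illustrative check (hypothetical file, not registered in CMakeLists.txt) of\n! the matmul_2d and matmul_2d_t wrappers against the intrinsic matmul. It\n! should pass with any of the three linalg backends (Fortran, OpenBLAS,\n! Accelerate).\nuse linalg, only: matmul_2d, matmul_2d_t, sp\nimplicit none\nreal(sp) :: A(3,2), At(2,3), B(3,4), C(2,4)\ncall random_number(A)\ncall random_number(B)\nAt = transpose(A)\n! C = matmul(At, B)\ncall matmul_2d(At, B, C)\nif (maxval(abs(C - matmul(At, B))) > 1e-4_sp) error stop \"matmul_2d FAIL\"\n! C = matmul(transpose(A), B)\ncall matmul_2d_t(A, B, C)\nif (maxval(abs(C - matmul(transpose(A), B))) > 1e-4_sp) error stop \"matmul_2d_t FAIL\"\nprint *, \"linalg: OK\"\nend program\n"
  },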
  {
    "path": "tests/test_more_inputs.f90",
    "content": "program test_more_inputs\nuse driver, only: gpt2_driver2, model_t, load_model\nimplicit none\n\ntype(model_t) :: m\ninteger, parameter :: input_ref(*) = [46, 358, 129, 247, 68, 73, 34754, 234, &\n    861, 8836, 74, 373, 4642, 287]\ninteger, parameter :: output_ref(*) = [1248, 5332, 287, 262, 7404, 286, &\n    25370, 254, 368, 83, 6557, 81, 11]\ninteger, allocatable :: input(:), output(:)\n\ncall load_model(\"model.gguf\", m)\n\ncall gpt2_driver2(\"Ondřej Čertík was born in\", 13, m, input, output)\nprint *\nprint *, \"TESTS:\"\ncall test(input, input_ref, \"Input\")\ncall test(output, output_ref, \"Output\")\n\ncall gpt2_driver2(\"San Francisco is\", 8, m, input, output)\nprint *\nprint *, \"TESTS:\"\ncall test(input, [15017, 6033, 318], \"Input\")\ncall test(output, [257, 1748, 286, 517, 621, 352, 1510, 661], \"Output\")\n\ncall gpt2_driver2(\"Cars are\", 13, m, input, output)\nprint *\nprint *, \"TESTS:\"\ncall test(input, [34, 945, 389], \"Input\")\ncall test(output, [407, 3142, 284, 307, 973, 287, 262, 7647, 1256, 286, &\n    257, 7072, 13], \"Output\")\n\ncontains\n\nsubroutine test(a, a_ref, text)\ninteger, intent(in) :: a(:), a_ref(:)\ncharacter(*), intent(in) :: text\nif (all(a == a_ref)) then\n    print *, text, \": OK\"\nelse\n    print *, text, \": FAIL\"\n    error stop\nend if\nend subroutine\n\nend program\n"
  },
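  {
    "path": "tests/test_utf8.f90",
    "content": "program test_utf8\n! Illustrative round-trip check (hypothetical file, not registered in\n! CMakeLists.txt) of the UTF-8 helpers in tokenizer.f90; codepoints in\n! [0, 2047] are the range these helpers support.\nuse tokenizer, only: codepoint_to_utf8, utf8_to_codepoint\nimplicit none\ncharacter(:), allocatable :: s\ninteger :: c, c2, i\ndo c = 0, 2047\n    s = \"\"\n    call codepoint_to_utf8(s, c)\n    i = 1\n    c2 = utf8_to_codepoint(s, i)\n    if (c2 /= c) error stop \"UTF-8 round trip FAIL\"\nend do\nprint *, \"UTF-8 round trip: OK\"\nend program\n"
  },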
  {
    "path": "tokenizer.f90",
    "content": "module tokenizer\nimplicit none\n\ntype :: string\n    character(:), allocatable :: s\nend type\n\ncontains\n\nfunction c2s(x) result(y)\ncharacter, intent(in) :: x(:)\ncharacter(:), allocatable :: y\ninteger :: i\nallocate(character(size(x)) :: y)\ndo i = 1, size(x)\n    y(i:i) = x(i)\nend do\nend function\n\nfunction next_token(input, i) result(y)\n! TODO: tokenize exactly according to this regex:\n! re.compile(r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\")\n! Right now we are more greedy, but the bpe() tokenizer seems to still return\n! exactly the same tokens for most inputs (it is not clear if for all inputs).\ncharacter(*), intent(in) :: input\ninteger, intent(inout) :: i\ncharacter(:), allocatable :: y\nif (i > len(input)) then\n    y = \"\"\nelse if (input(i:i) == \" \") then\n    y = tokenize_word(input, i)\nelse if (input(i:i) == \",\" .or. input(i:i) == \".\") then\n    y = input(i:i)\n    i = i + 1\nelse\n    y = tokenize_word(input, i)\nend if\nend function\n\nfunction tokenize_word(input, i) result(y)\ncharacter(*), intent(in) :: input\ninteger, intent(inout) :: i\ncharacter(:), allocatable :: y\ninteger :: i0\ni0 = i\nif (input(i:i) == \" \") then\n    i = i + 1\nend if\ndo\n    if (i > len(input)) then\n        y = input(i0:i-1)\n        exit\n    end if\n    if (input(i:i) == \" \" .or. input(i:i) == \",\" .or. input(i:i) == \".\") then\n        y = input(i0:i-1)\n        exit\n    end if\n    i = i + 1\nend do\nend function\n\nfunction word_idx(word, idx, decoder_txt) result(token)\ncharacter(*), intent(in) :: word\ninteger, intent(in) :: idx(0:)\ncharacter, intent(in) :: decoder_txt(:)\ninteger :: token\ninteger :: i\n! This is O(n) search instead of O(1) lookup in a dictionary, so it is slow\ndo i = 0, ubound(idx,1)-1\n    if (c2s(decoder_txt(idx(i)+1:idx(i+1))) == word) then\n        token = i\n        return\n    end if\nend do\ntoken = -1\nend function\n\nsubroutine codepoint_to_utf8(s, c)\n! UTF-32 -> UTF-8\ncharacter(:), allocatable, intent(inout) :: s\ninteger, intent(in) :: c\ninteger :: d1, d2\nif (c < 128) then\n    s = s // achar(c)\nelse if (c < 2048) then\n    d1 = ior(ishft(c, -6), 192)\n    d2 = iand(ior(c, 128), 191)\n    s = s // achar(d1) // achar(d2)\nelse\n    error stop \"UTF-32 range not supported\"\nend if\nend subroutine\n\nfunction utf8_to_codepoint(s, i) result(c)\n! UTF-8 -> UTF-32\ncharacter(*), intent(in) :: s\ninteger, intent(inout) :: i\ninteger :: c, d\nc = iachar(s(i:i))\nif (c >= 128) then\n    i = i + 1\n    d = iachar(s(i:i))\n    c = ior(ishft(iand(c, 31), 6), iand(d, 63))\nend if\nif (c >= 2048) then\n    error stop \"UTF-8 range not supported\"\nend if\nend function\n\nfunction merge_pair(intokens, idx) result(tokens)\n! Merge the pair `idx`\ntype(string), intent(in) :: intokens(:)\ninteger, intent(in) :: idx\ntype(string), allocatable :: tokens(:)\nallocate(tokens(size(intokens)-1))\ntokens(:idx-1) = intokens(:idx-1)\ntokens(idx)%s = intokens(idx)%s // intokens(idx+1)%s\ntokens(idx+1:) = intokens(idx+2:)\nend function\n\nfunction merge_utf8_pairs(intokens) result(tokens)\n! Merge all UTF-8 character pairs\ntype(string), intent(in) :: intokens(:)\ntype(string), allocatable :: tokens(:), tmp_tokens(:)\ninteger :: i, j\nlogical :: one_more_pass\nallocate(tokens(size(intokens)))\ntokens = intokens\none_more_pass = .true.\nj = 1\ndo while(one_more_pass)\n    one_more_pass = .false.\n    do i = j, size(tokens)-1\n        if (len(tokens(i)%s) == 1 .and. 
iachar(tokens(i)%s(1:1)) >= 128) then\n            tmp_tokens = merge_pair(tokens, i)\n            deallocate(tokens)\n            call move_alloc(tmp_tokens, tokens)\n            one_more_pass = .true.\n            j = i + 1\n            exit\n        end if\n    end do\nend do\n!print *, \"tokens = \", (tokens(i)%s // \" \", i=1,size(tokens))\nend function\n\nfunction bpe(token, vocab_idx, vocab_txt) result(tokens)\n! Takes a token as a string, and returns bpe tokens as an array of strings\ncharacter(*), intent(in) :: token\ninteger, intent(in) :: vocab_idx(0:)\ncharacter, intent(in) :: vocab_txt(:)\ntype(string), allocatable :: tokens(:), tmp_tokens(:)\ninteger, allocatable :: pair_scores(:)\ninteger :: not_found, merge_pair_idx\ninteger :: i\nnot_found = size(vocab_idx) + 10\nallocate(tokens(len(token)))\ndo i = 1, len(token)\n    tokens(i)%s = token(i:i)\nend do\ntmp_tokens = merge_utf8_pairs(tokens)\ndeallocate(tokens)\ncall move_alloc(tmp_tokens, tokens)\ndo\n    !print *, \"tokens = \", (tokens(i)%s // \" \", i=1,size(tokens))\n    if (size(tokens) == 1) then\n        ! The token pairs were either all merged into one word, or the input\n        ! token was a one character word, either way we are done:\n        exit\n    end if\n    allocate(pair_scores(size(tokens)-1))\n    ! Loop over pairs\n    do i = 1, size(tokens)-1\n        pair_scores(i) = word_idx(tokens(i)%s // \" \" // tokens(i+1)%s, vocab_idx, vocab_txt)\n        if (pair_scores(i) == -1) pair_scores(i) = not_found\n    end do\n    merge_pair_idx = minloc(pair_scores, 1)\n    if (pair_scores(merge_pair_idx) == not_found) then\n        ! No token pair can be merged, so we are done:\n        exit\n    end if\n    !print *, pair_scores\n    !print *, merge_pair_idx, pair_scores(merge_pair_idx)\n    tmp_tokens = merge_pair(tokens, merge_pair_idx)\n    deallocate(tokens)\n    call move_alloc(tmp_tokens, tokens)\n    deallocate(pair_scores)\nend do\n!print *, \"final tokens = \", (tokens(i)%s // \" \", i=1,size(tokens))\nend function\n\nfunction encode(input, idx, decoder_txt, vocab_idx, vocab_txt, byte_encoder) &\n        result(tokens2)\ncharacter(*), intent(in) :: input\ninteger, intent(in) :: idx(0:), vocab_idx(0:), byte_encoder(0:)\ncharacter, intent(in) :: decoder_txt(:), vocab_txt(:)\ninteger, parameter :: max_tokens = 2048\ninteger :: tokens(max_tokens)\ninteger, allocatable :: tokens2(:)\ncharacter(:), allocatable :: tmp, tmp2\ntype(string), allocatable :: bpe_tokens(:)\ninteger :: i, j, c, n_tokens\nn_tokens = 0\ni = 1\ndo\n    tmp = next_token(input, i)\n    if (tmp == \"\") exit\n    tmp2 = \"\"\n    do j = 1, len(tmp)\n        c = iachar(tmp(j:j))\n        c = byte_encoder(c)\n        ! c is UTF-32 (4 bytes), but only the range [0, 324] is used\n        ! Encode c from UTF-32 to UTF-8. Due to the limited range\n        ! 
either one or two bytes of UTF-8 are appended to tmp2:\n        call codepoint_to_utf8(tmp2, c)\n    end do\n    if (allocated(bpe_tokens)) deallocate(bpe_tokens)\n    bpe_tokens = bpe(tmp2, vocab_idx, vocab_txt)\n    do j = 1, size(bpe_tokens)\n        n_tokens = n_tokens + 1\n        if (n_tokens > max_tokens) error stop \"exceeded max_tokens\"\n        tokens(n_tokens) = word_idx(bpe_tokens(j)%s, idx, decoder_txt)\n    end do\n    deallocate(tmp2)\nend do\nallocate(tokens2(n_tokens))\ntokens2(:) = tokens(:n_tokens)\nend function\n\nfunction decode(tokens, idx, decoder_txt, byte_decoder) result(output)\ninteger, intent(in) :: tokens(:), idx(0:), byte_decoder(0:)\ncharacter, intent(in) :: decoder_txt(:)\ncharacter(:), allocatable :: output\ncharacter(:), allocatable :: output2, tmp\ninteger :: i, c\nallocate(character(0) :: output2) ! Fix GFortran warning\noutput2 = \"\"\ndo i = 1, size(tokens)\n    if (tokens(i) < 0) error stop \"tokens(i) < 0\"\n    output2 = output2 // c2s(decoder_txt(idx(tokens(i))+1:idx(tokens(i)+1)))\nend do\ni = 1\noutput = \"\"\ndo\n    ! Decode UTF-8 (one or more bytes) to UTF-32 code point (always 4 bytes),\n    ! However for GPT-2 it seems only range 0-323 is used from UTF-32.\n    c = utf8_to_codepoint(output2, i)\n    ! [0,324] -> [0,255]\n    if (c < 0 .or. c > ubound(byte_decoder,1)) then\n        print *, \"Codepoint out of range for byte decoder:\", c, ubound(byte_decoder,1)\n        error stop\n    end if\n    tmp = achar(byte_decoder(c))\n    output = output // tmp\n    if (i == len(output2)) exit\n    i = i + 1\nend do\nend function\n\nend module\n"
  }
]