Repository: kuvaus/LlamaGPTJ-chat Branch: main Commit: e022976f0460 Files: 32 Total size: 196.2 KB Directory structure: gitextract_vuj2yh60/ ├── .github/ │ └── workflows/ │ ├── cmake-release.yml │ ├── cmake.yml │ └── cmake_branch.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake/ │ └── config.h.in ├── gpt4all-backend/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gptj/ │ │ └── placeholder │ ├── gptj.cpp │ ├── gptj.h │ ├── llama/ │ │ └── placeholder │ ├── llamamodel.cpp │ ├── llamamodel.h │ ├── llmodel.h │ ├── llmodel_c.cpp │ ├── llmodel_c.h │ ├── mpt.cpp │ ├── mpt.h │ ├── scripts/ │ │ └── convert_mpt_hf_to_ggml.py │ ├── utils.cpp │ └── utils.h ├── prompt_template_sample.txt └── src/ ├── CMakeLists.txt ├── chat.cpp ├── header.h ├── parse_json.h └── utils.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/cmake-release.yml ================================================ name: CMake-release on: push: tags: - 'v*' env: BUILD_TYPE: Release permissions: contents: read actions: write jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - ubuntu-latest - macos-latest - windows-latest instructions: - avx - avx2 steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} - name: Prepare binary run: | if ("${{ matrix.instructions }}" -eq "avx"){ if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\chat.exe chat.exe mv chat.exe chat-windows-latest-avx.exe shasum -a 256 -b chat-windows-latest-avx.exe > shasum-chat-windows-latest-avx.sha256 } else { cp ${{github.workspace}}/build/bin/chat chat mv chat chat-${{ matrix.os }}-${{ matrix.instructions }} shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256 } } else { if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\chat.exe chat.exe mv chat.exe chat-windows-latest-avx2.exe shasum -a 256 -b chat-windows-latest-avx2.exe > shasum-chat-windows-latest-avx2.sha256 } else { cp ${{github.workspace}}/build/bin/chat chat mv chat chat-${{ matrix.os }}-${{ matrix.instructions }} shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256 } } shell: pwsh - name: Upload binary uses: actions/upload-artifact@v2 with: name: chat-${{ matrix.os }}-${{ matrix.instructions }} path: chat-${{ matrix.os }}-${{ matrix.instructions }}* - name: Upload shasums uses: actions/upload-artifact@v2 
with: name: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }} path: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}* release: needs: build runs-on: ubuntu-latest steps: - name: Create Release id: create_release uses: actions/create-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref }} release_name: Release ${{ github.ref }} draft: false prerelease: false - name: Download artifacts uses: actions/download-artifact@v2 with: path: artifacts - name: Upload artifacts uses: softprops/action-gh-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref_name }} name: Release ${{ github.ref_name }} draft: false prerelease: false files: | artifacts/**/* # # This part filters the CHANGELOG.md using python # Then it adds FILTERED_CHANGELOG.md to release notes # - name: Checkout repository uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: python-version: 3.x - name: Filter CHANGELOG.md uses: jannekem/run-python-script-action@v1 with: script: | filtered_lines = [] start_processing = False with open('CHANGELOG.md', 'r') as file: for line in file: if line.startswith("#### [v"): if start_processing: break else: file.readline() file.readline() start_processing = True continue if start_processing: filtered_lines.append(line) with open('FILTERED_CHANGELOG.md', 'w') as file: file.writelines(filtered_lines) - name: Generate release notes uses: softprops/action-gh-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref_name }} name: Release ${{ github.ref_name }} body_path: FILTERED_CHANGELOG.md draft: false prerelease: false ================================================ FILE: .github/workflows/cmake.yml ================================================ name: CMake on: push: branches: [ "main" ] env: BUILD_TYPE: Release jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - ubuntu-latest - macos-latest - windows-latest instructions: - avx - avx2 steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} ================================================ FILE: .github/workflows/cmake_branch.yml ================================================ name: CMake on: push: branches: - '*' - '!main' env: BUILD_TYPE: Release jobs: build: runs-on: ${{ matrix.config.os }} strategy: fail-fast: false matrix: config: - { os: 'ubuntu-latest', instructions: 'avx' } - { os: 'ubuntu-latest', instructions: 'avx2' } - { os: 'macos-latest', instructions: 'avx' } - { os: 'macos-latest', instructions: 'avx2' } - { os: 'windows-latest', build: 'msvc', instructions: 'avx' } - { os: 'windows-latest', 
build: 'msvc', instructions: 'avx2' } - { os: 'windows-latest', build: 'mingw', instructions: 'avx' } - { os: 'windows-latest', build: 'mingw', instructions: 'avx2' } steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Configure CMake if: matrix.build == 'msvc' run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} - name: Build if: matrix.build == 'msvc' run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test if: matrix.build == 'msvc' working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} - name: Prepare binary if: matrix.build == 'msvc' run: | if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\Release\chat.exe chat-msvc.exe mv chat-msvc.exe chat-windows-latest-msvc.exe } shell: pwsh - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } elseif ("${{ matrix.arch }}" -eq "aarch64") { } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} ================================================ FILE: .gitignore ================================================ # Folders build/ tmp/ # Visual Studio Code .vscode # MacOS .DS_Store # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app .cache ================================================ FILE: .gitmodules ================================================ [submodule "llama.cpp"] path = gpt4all-backend/llama.cpp url = https://github.com/manyoso/llama.cpp #url = https://github.com/ggerganov/llama.cpp ================================================ FILE: CHANGELOG.md ================================================ ## Changelog #### [Upcoming](https://github.com/kuvaus/LlamaGPTJ-chat/compare/v0.3.0...HEAD) #### [v0.3.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.3.0) > 26 June 2023 - Add this [changelog](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/CHANGELOG.md) :) - Add sha256 hashes on release so you can verify the binaries - All binaries are automatically generated with Github actions - Add signal handling for SIGHUP (macOS, Linux) and CTRL_CLOSE_EVENT (Windows) to fix issue [`#16`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/16) - This allows you to run chat as a subprocess. The chat subprocess now quits properly if parent app is closed. - Version information - Fix segfault on`/help` #### [v0.2.9](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.9) > 22 June 2023 - [Pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) from [@154pinkchairs](https://github.com/154pinkchairs/) merged. Thanks. 
:) - The pull request [`#18`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) has the two fixes below: - Properly handle file paths including tildes [`18e9f36`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/18e9f36) - Handle buffer allocation errors [`6800dfb`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/6800dfb) - Better debug mode compilation. May fix issue [`#9`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/9) #### [v0.2.8](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.8) > 16 June 2023 - Adds `--save_dir` option so you can change save directory location - Default location is `./saves` on the same directory as the chat binary - See issue [`#13`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/13) for more details #### [v0.2.7](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.7) > 15 June 2023 - Fixes for old macOS. - Use `-DOLD_MACOS=ON` option when compiling with CMake. - Tested to compile on High Sierra and Xcode 10 #### [v0.2.6](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.6) > 14 June 2023 - You can name saves with `./save NAME` and `./load NAME` - You can toggle saving and loading off with `--no-saves` flag #### [v0.2.5](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.5) > 13 June 2023 - Save/load state with `./save` and `./load` - Reset context with `./reset`, help with `./help` - Makes a `./saves` folder - Note that a single save can take up to 2Gb - You can wrap the AI response with tokens using `--b_token` and `--e_token` - See issue [`#12`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/12) for more details #### [v0.2.4](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.4) > 5 June 2023 - Fix when using json to specify names for logfiles. Fixes issue [`#11`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/11) #### [v0.2.3](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.3) > 4 June 2023 - Fix said ability to reset context... :) #### [v0.2.2](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.2) > 3 June 2023 - Ability to reset context #### [v0.2.1](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.1) > 30 May 2023 - Save and load chat logs - Use `--save_log` and `--load_log` - AVX512 option for compilation `-DAVX512=ON` #### [v0.2.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.0) > 17 May 2023 - Update gpt4all backend to v0.1.1 [`61a963a`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/61a963a3d220ef157a8504ddde708f33dc2946eb) - Full Windows Visual Studio compatibility. Finally fixes issue [`#1`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/1) - Builds from source on aarch64 Linux. Fixes issue [`#3`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/3) - Full MPT support. Fixes issue [`#4`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/4) #### v0.1.9 > 16 May 2023 - Code cleaning and reordering - `llmodel_create_model()` function #### v0.1.8 > 13 May 2023 - Add support for MPT models - Uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all) #### v0.1.7 > 12 May 2023 - First [pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2) from [@itz-coffee](https://github.com/itz-coffee/) merged. Thanks. 
:) - The pull request [`#2`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2) adds the feature below: - Add --no-animation flag [`fdc2ac3`](https///github.com/kuvaus/LlamaGPTJ-chat/commit/fdc2ac3) - Support for old macOS #### v0.1.6 > 4 May 2023 - Parse parameters from json files - Use `-j FNAME` or`--load_json FNAME` #### v0.1.5 > 3 May 2023 - MinGW compilation on Windows #### v0.1.4 > 1 May 2023 - v0.1.4 had no tags - It was part of `cmake-release.yml` rewrite to enable MinGW [`e7e1ebf`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/e7e1ebf97d696d069bbc0ae7f0ed078739fb6642) #### v0.1.3 > 1 May 2023 - Add loading of prompt template files - Use `--load_template` for loading - See `prompt_template_sample.txt` for a sample #### v0.1.2 > 30 April 2023 - Automatic memory handling for the model #### v0.1.1 > 29 April 2023 - Windows compilation fixes #### v0.1.0 > 29 April 2023 - Before this, progress was in [GPTJ-chat](https://github.com/kuvaus/GPTJ-chat/) and [Llama-chat](https://github.com/kuvaus/Llama-chat/) - First version ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required (VERSION 3.2) if(APPLE) option(OLD_MACOS "Using old macos" OFF) option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON) if(BUILD_UNIVERSAL AND NOT OLD_MACOS) # Build a Universal binary on macOS set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE) else() # Build for the host architecture on macOS set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE) endif() if (OLD_MACOS) add_definitions(-DOLD_MACOS) endif() endif() project(LlamaGPTJ-chat) set(VERSION_MAJOR 0) set(VERSION_MINOR 3) set(VERSION_PATCH 0) set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") set(CMAKE_EXPORT_COMPILE_COMMANDS "on") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLAMA_STANDALONE ON) else() set(LLAMA_STANDALONE OFF) endif() # options option(AVX2 "enable AVX2" ON) option(AVX512 "enable AVX512" OFF) option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX2 "llama: enable AVX2" ${AVX2}) option(LLAMA_AVX512 "llama: enable AVX512" ${AVX512}) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" ${AVX512}) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" ${AVX512}) option(LLAMA_FMA "llama: enable FMA" ${AVX2}) # sanitizers #set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE) if(APPLE) elseif(UNIX) if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") option(AVX2 "enable AVX2" OFF) option(LLAMA_AVX "llama: enable AVX" OFF) option(LLAMA_AVX2 "llama: enable AVX2" OFF) option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) set(BUILD_SHARED_LIBS ON FORCE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mno-outline-atomics") endif() endif() if (GGML_SANITIZE_THREAD) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") endif() if (GGML_SANITIZE_ADDRESS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") endif() if (GGML_SANITIZE_UNDEFINED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") endif() if (AVX512) 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") endif() #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native") # dependencies set(CMAKE_C_STANDARD 17) set(CMAKE_CXX_STANDARD 20) find_package(Threads REQUIRED) # main # Include static libs for compatibility: if(APPLE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-search_paths_first -lSystem") elseif(UNIX) if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static") endif() elseif(WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static") endif() # Generate a header file with the version number configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in" "${CMAKE_CURRENT_BINARY_DIR}/config.h" ) # Include the binary directory for the generated header file include_directories("${CMAKE_CURRENT_BINARY_DIR}") add_subdirectory(gpt4all-backend/llama.cpp) add_subdirectory(gpt4all-backend) add_subdirectory(src) ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Jukka Maatta Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![CMake](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml/badge.svg)](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml) # LlamaGPTJ-chat Simple command line chat program for [GPT-J](https://en.wikipedia.org/wiki/GPT-J), [LLaMA](https://en.wikipedia.org/wiki/LLaMA) and [MPT](https://www.mosaicml.com/blog/mpt-7b) models written in C++. Based on [llama.cpp](https://github.com/ggerganov/llama.cpp) and uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all) for full compatibility. LlamaGPTJ-chat demo > **Warning** > Very early progress, might have bugs # Table of contents * [Installation](#installation) * [Usage](#usage) * [GPT-J, LLaMA, and MPT models](#gpt-j-llama-and-mpt-models) * [Detailed command list](#detailed-command-list) * [Useful features](#useful-features) * [License](#license) ## Installation Since the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. The [Releases](https://github.com/kuvaus/LlamaGPTJ-chat/releases) link has ready-made binaries. 
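Each release binary is published alongside a sha256 checksum file (see the release workflow and the v0.3.0 changelog entry), so a downloaded asset can be verified before running it. A minimal sketch, assuming you grabbed the Linux AVX2 asset and its checksum file; substitute the asset name matching your OS and instruction set:

```sh
# Verify the downloaded binary against the published checksum file
shasum -a 256 -c shasum-chat-ubuntu-latest-avx2.sha256
# Make it executable and run it
chmod +x chat-ubuntu-latest-avx2
./chat-ubuntu-latest-avx2 -h
```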
AVX2 is faster and works on most newer computers. If you run the program, it will check and print if your computer has AVX2 support. ### Download ```sh git clone --recurse-submodules https://github.com/kuvaus/LlamaGPTJ-chat cd LlamaGPTJ-chat ``` You need to also download a model file, see [supported models](#gpt-j-llama-and-mpt-models) for details and links. ### Build Since the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. On most systems, you only need this to build: ```sh mkdir build cd build cmake .. cmake --build . --parallel ``` > **Note** > > If you have an old processor, you can turn AVX2 instructions OFF in the build step with `-DAVX2=OFF` flag. > > If you have a new processor, you can turn AVX512 instructions ON in the build step with `-DAVX512=ON` flag. > > On old macOS, set `-DBUILD_UNIVERSAL=OFF` to make the build x86 only instead of the universal Intel/ARM64 binary. > On really old macOS, set `-DOLD_MACOS=ON`. This disables `/save` and `/load` but compiles on old Xcode. > > On Windows you can now use Visual Studio (MSVC) or MinGW. If you want MinGW build instead, set `-G "MinGW Makefiles"`. > > On ARM64 Linux there are no ready-made binaries, but you can now build it from source. ## Usage After compiling, the binary is located at: ```sh build/bin/chat ``` But you're free to move it anywhere. Simple command for 4 threads to get started: ```sh ./chat -m "/path/to/modelfile/ggml-vicuna-13b-1.1-q4_2.bin" -t 4 ``` or ```sh ./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 ``` Happy chatting! ## GPT-J, LLaMA, and MPT models Current backend supports the GPT-J, LLaMA and MPT models. ### GPT-J model You need to download a GPT-J model first. Here are direct links to models: >- The default version is **v1.0**: [ggml-gpt4all-j.bin](https://gpt4all.io/models/ggml-gpt4all-j.bin) >- At the time of writing the newest is **1.3-groovy**: [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) They're around 3.8 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on GPT-J models from [gpt4all.io](https://gpt4all.io/) or [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) github. ### LLaMA model Alternatively you need to download a LLaMA model first. The original weights are for research purposes and you can apply for access [here](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/). Below are direct links to derived models: >- Vicuna 7b **v1.1**: [ggml-vicuna-7b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-7b-1.1-q4_2.bin) >- Vicuna 13b **v1.1**: [ggml-vicuna-13b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-13b-1.1-q4_2.bin) >- GPT-4-All **l13b-snoozy**: [ggml-gpt4all-l13b-snoozy.bin](https://gpt4all.io/models/ggml-gpt4all-l13b-snoozy.bin) The LLaMA models are quite large: the 7B parameter versions are around 4.2 Gb and 13B parameter 8.2 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on LLaMA models from the [whitepaper](https://arxiv.org/abs/2302.13971) or META AI [website](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/). ### MPT model You can also download and use an MPT model instead. 
Here are direct links to MPT-7B models: >- MPT-7B base model pre-trained by Mosaic ML: [ggml-mpt-7b-base.bin](https://gpt4all.io/models/ggml-mpt-7b-base.bin) >- MPT-7B instruct model trained by Mosaic ML: [ggml-mpt-7b-instruct.bin](https://gpt4all.io/models/ggml-mpt-7b-instruct.bin) >- Non-commercial MPT-7B chat model trained by Mosaic ML: [ggml-mpt-7b-chat.bin](https://gpt4all.io/models/ggml-mpt-7b-chat.bin) They're around 4.9 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on MPT models from MosaicML [website](https://www.mosaicml.com/blog/mpt-7b) or [mosaicml/llm-foundry](https://github.com/mosaicml/llm-foundry) github. ## Detailed command list You can view the help and full parameter list with: ` ./chat -h ` ```sh usage: ./bin/chat [options] A simple chat program for GPT-J, LLaMA, and MPT models. You can set specific initial prompt with the -p flag. Runs default in interactive and continuous mode. Type '/reset' to reset the chat context. Type '/save','/load' to save network state into a binary file. Type '/save NAME','/load NAME' to rename saves. Default: --save_name NAME. Type '/help' to show this help dialog. Type 'quit', 'exit' or, 'Ctrl+C' to quit. options: -h, --help show this help message and exit -v, --version show version and license information --run-once disable continuous mode --no-interactive disable interactive mode altogether (uses given prompt only) --no-animation disable chat animation --no-saves disable '/save','/load' functionality -s SEED, --seed SEED RNG seed for --random-prompt (default: -1) -t N, --threads N number of threads to use during computation (default: 4) -p PROMPT, --prompt PROMPT prompt to start generation with (default: empty) --random-prompt start with a randomized prompt. -n N, --n_predict N number of tokens to predict (default: 200) --top_k N top-k sampling (default: 40) --top_p N top-p sampling (default: 0.9) --temp N temperature (default: 0.9) --n_ctx N number of tokens in context window (default: 0) -b N, --batch_size N batch size for prompt processing (default: 20) --repeat_penalty N repeat_penalty (default: 1.1) --repeat_last_n N last n tokens to penalize (default: 64) --context_erase N percent of context to erase (default: 0.8) --b_token optional beginning wrap token for response (default: empty) --e_token optional end wrap token for response (default: empty) -j, --load_json FNAME load options instead from json at FNAME (default: empty/no) --load_template FNAME load prompt template from a txt file at FNAME (default: empty/no) --save_log FNAME save chat log to a file at FNAME (default: empty/no) --load_log FNAME load chat log from a file at FNAME (default: empty/no) --save_dir DIR directory for saves (default: ./saves) --save_name NAME save/load model state binary at save_dir/NAME.bin (current: model_state) context is saved to save_dir/NAME.ctx (current: model_state) -m FNAME, --model FNAME model path (current: ./models/ggml-vicuna-13b-1.1-q4_2.bin) ``` ## Useful features Here are some handy features and details on how to achieve them using command line options. ### Save/load chat log and read output from other apps By default, the program prints the chat to standard (stdout) output, so if you're including the program into your app, it only needs to read stdout. You can also save the whole chat log to a text file with `--save_log` option. 
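A minimal sketch of that save/load round trip across two sessions (model path and log filename are placeholders):

```sh
# First session: chat as usual and write the full chat log to a text file
./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 --save_log chatlog.txt
# Later session: start with the previous conversation loaded back in
./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 --load_log chatlog.txt
```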
There's an elementary way to remember your past conversation by simply loading the saved chat log with `--load_log` option when you start a new session. ### Run the program once without user interaction If you only need the program to run once without any user interactions, one way is to set prompt with `-p "prompt"` and using `--no-interactive` and `--no-animation` flags. The program will read the prompt, print the answer, and close. ### Add AI personalities and characters If you want a personality for your AI, you can change `prompt_template_sample.txt` and use `--load_template` to load the modified file. The only constant is that your input during chat will be put on the `%1` line. Instructions, prompt, response, and everything else can be replaced any way you want. Having different `personality_template.txt` files is an easy way to add different AI characters. With _some_ models, giving both AI and user names instead of `Prompt:` and `Response:`, can make the conversation flow more naturally as the AI tries to mimic a conversation between two people. ### Ability to reset chat context You can reset the chat at any time during chatting by typing `/reset` in the input field. This will clear the AI's memory of past conversations, logits, and tokens. You can then start the chat from a blank slate without having to reload the whole model again. ### Load all parameters using JSON You can also fetch parameters from a json file with `--load_json "/path/to/file.json"` flag. Different models might perform better or worse with different input parameters so using json files is a handy way to store and load all the settings at once. The JSON file loader is designed to be simple in order to prevent any external dependencies, and as a result, the JSON file must follow a specific format. Here is a simple example: ```javascript {"top_p": 1.0, "top_k": 50400, "temp": 0.9, "n_batch": 9} ``` This is useful when you want to store different temperature and sampling settings. And a more detailed one: ```javascript { "top_p": 1.0, "top_k": 50400, "temp": 0.9, "n_batch": 20, "threads": 12, "prompt": "Once upon a time", "load_template": "/path/to/prompt_template_sample.txt", "model": "/path/to/ggml-gpt4all-j-v1.3-groovy.bin", "no-interactive": "true" } ``` This one loads the prompt from the json, uses a specific template, and runs the program once in no-interactive mode so user does not have to press any input. ## License This project is licensed under the MIT [License](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/LICENSE) ================================================ FILE: cmake/config.h.in ================================================ #ifndef CONFIG_H #define CONFIG_H #define VERSION "@VERSION_MAJOR@" "." "@VERSION_MINOR@" "." "@VERSION_PATCH@" #endif // CONFIG_H ================================================ FILE: gpt4all-backend/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.16) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) #if(APPLE) # option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON) # if(BUILD_UNIVERSAL) # # Build a Universal binary on macOS # # This requires that the found Qt library is compiled as Universal binaries. 
# set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE) # else() # # Build for the host architecture on macOS # set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE) # endif() #endif() # Include the binary directory for the generated header file #include_directories("${CMAKE_CURRENT_BINARY_DIR}") #set(LLMODEL_VERSION_MAJOR 0) #set(LLMODEL_VERSION_MINOR 1) #set(LLMODEL_VERSION_PATCH 1) #set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}") #project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C) set(CMAKE_CXX_STANDARD_REQUIRED ON) #set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE) #set(BUILD_SHARED_LIBS ON FORCE) set(CMAKE_VERBOSE_MAKEFILE ON) if (GPT4ALL_AVX_ONLY) set(LLAMA_AVX2 OFF CACHE BOOL "llama: enable AVX2" FORCE) set(LLAMA_F16C OFF CACHE BOOL "llama: enable F16C" FORCE) set(LLAMA_FMA OFF CACHE BOOL "llama: enable FMA" FORCE) endif() #add_subdirectory(llama.cpp) add_library(llmodel gptj.h gptj.cpp llamamodel.h llamamodel.cpp llama.cpp/examples/common.cpp llmodel.h llmodel_c.h llmodel_c.cpp mpt.h mpt.cpp utils.h utils.cpp ) target_link_libraries(llmodel PRIVATE llama) #set_target_properties(llmodel PROPERTIES # VERSION ${PROJECT_VERSION} # SOVERSION ${PROJECT_VERSION_MAJOR}) #set(COMPONENT_NAME_MAIN ${PROJECT_NAME}) #set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install) ================================================ FILE: gpt4all-backend/README.md ================================================ # GPT4ALL Backend This directory contains the C/C++ model backend used by GPT4All for inference on the CPU. This backend acts as a universal library/wrapper for all models that the GPT4All ecosystem supports. Language bindings are built on top of this universal library. The native GPT4all Chat application directly uses this library for all inference. # What models are supported by the GPT4All ecosystem? Currently, there are three different model architectures that are supported: 1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b) 2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama) 3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b) # Why so many different architectures? What differentiates them? One of the major differences is license. Currently, the LLAMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base models allow commercial usage. In the early advent of the recent explosion of activity in open source local models, the llama models have generally been seen as performing better, but that is changing quickly. Every week - even every day! - new models are released with some of the GPTJ and MPT models competitive in performance/quality with LLAMA. What's more, there are some very nice architectural innovations with the MPT models that could lead to new performance/quality gains. # How does GPT4All make these models available for CPU inference? By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of this library. The original github repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a LLAMA based version [here](https://github.com/ggerganov/llama.cpp). 
Currently, this backend is using the latter as a submodule. # Does that mean GPT4All is compatible with all llama.cpp models and vice versa? Unfortunately, no, for three reasons: 1. The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has introduced [a compatibility-breaking](https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1) re-quantization method recently. This is a breaking change that renders all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp since that change. 2. The GPT4All backend has the llama.cpp submodule specifically pinned to a version prior to this breaking change. 3. The GPT4All backend currently supports MPT based models as an added feature. Neither llama.cpp nor the original ggml repo support this architecture as of this writing; however, efforts are underway to make MPT available in the ggml repo, which you can follow [here](https://github.com/ggerganov/ggml/pull/145). # What is being done to make them more compatible? A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check, or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue. # What about GPU inference? In newer versions of llama.cpp, there has been some added support for NVIDIA GPUs for inference. We're investigating how to incorporate this into our downloadable installers. # Ok, so bottom line... how do I make my model on huggingface compatible with the GPT4All ecosystem right now? 1. Check to make sure the huggingface model is available in one of our three supported architectures 2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLAMA based models 3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory # Check back for updates as we'll try to keep this updated as things change! 
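Since the compatibility story above hinges on which llama.cpp revision the backend is pinned to, a quick way to see the pinned commit is a generic git command (not specific to this project), run from the repository root:

```sh
# Print the commit the llama.cpp submodule is currently pinned to
git submodule status gpt4all-backend/llama.cpp
```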
================================================ FILE: gpt4all-backend/gptj/placeholder ================================================ ================================================ FILE: gpt4all-backend/gptj.cpp ================================================ #include "gptj.h" #include "llama.cpp/ggml.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include // default hparams (GPT-J 6B) static const size_t MB = 1024*1024; struct gptj_hparams { int32_t n_vocab = 50400; int32_t n_ctx = 2048; int32_t n_embd = 4096; int32_t n_head = 16; int32_t n_layer = 28; int32_t n_rot = 64; int32_t f16 = 1; }; struct gptj_layer { // normalization struct ggml_tensor * ln_1_g; struct ggml_tensor * ln_1_b; // attention struct ggml_tensor * c_attn_q_proj_w; struct ggml_tensor * c_attn_k_proj_w; struct ggml_tensor * c_attn_v_proj_w; struct ggml_tensor * c_attn_proj_w; // ff struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_proj_w; struct ggml_tensor * c_mlp_proj_b; }; struct gptj_buffer { uint8_t * addr = NULL; size_t size = 0; void resize(size_t size) { delete[] addr; addr = new uint8_t[size]; this->size = size; } ~gptj_buffer() { fflush(stdout); delete[] addr; } }; struct gptj_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; struct ggml_context * ctx = NULL; gptj_buffer buf; int n; // number of tokens currently in the cache ~gptj_kv_cache() { if (ctx) { ggml_free(ctx); } } }; struct gptj_model { gptj_hparams hparams; // normalization struct ggml_tensor * ln_f_g; struct ggml_tensor * ln_f_b; struct ggml_tensor * wte; // position embedding struct ggml_tensor * lmh_g; // language model head struct ggml_tensor * lmh_b; // language model bias std::vector layers; // key + value memory struct gptj_kv_cache kv_self; // struct ggml_context * ctx; std::map tensors; gptj_buffer buf; ~gptj_model() { if (ctx) { ggml_free(ctx); } } }; static bool kv_cache_init( const struct gptj_hparams & hparams, struct gptj_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int64_t n_mem = (int64_t)n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); if (!cache.ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); return false; } cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } // load the model's weights from a stream bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); // verify magic { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); if (magic != 0x67676d6c) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } } // load hparams { auto & hparams = model.hparams; fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) 
&hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); fin.read((char *) &hparams.f16, sizeof(hparams.f16)); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_rot = %d\n", __func__, hparams.n_rot); printf("%s: f16 = %d\n", __func__, hparams.f16); } // load vocab { int32_t n_vocab = 0; fin.read((char *) &n_vocab, sizeof(n_vocab)); if (n_vocab != model.hparams.n_vocab) { fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); return false; } std::string word; for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); word.resize(len); fin.read((char *) word.data(), len); vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } } // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation ggml_type wtype = GGML_TYPE_COUNT; switch (model.hparams.f16) { case 0: wtype = GGML_TYPE_F32; break; case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; case 5: wtype = GGML_TYPE_Q4_2; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", __func__, fname.c_str(), model.hparams.f16); return false; } } const ggml_type wtype2 = GGML_TYPE_F32; auto & ctx = model.ctx; size_t ctx_size = 0; { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v ctx_size += (5 + 10*n_layer)*256; // object overhead printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { struct ggml_init_params params = { .mem_size = ctx_size, .mem_buffer = NULL, }; model.ctx = ggml_init(params); if (!model.ctx) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } } // prepare memory for the weights { const auto & hparams 
= model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; model.layers.resize(n_layer); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); // map by name model.tensors["transformer.wte.weight"] = model.wte; model.tensors["transformer.ln_f.weight"] = model.ln_f_g; model.tensors["transformer.ln_f.bias"] = model.ln_f_b; model.tensors["lm_head.weight"] = model.lmh_g; model.tensors["lm_head.bias"] = model.lmh_b; for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // map by name model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b; model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w; model.tensors["transformer.h." 
+ std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b; } } // key + value memory { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); ggml_free(ctx); return false; } const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v); printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } // load weights { int n_tensors = 0; size_t total_size = 0; printf("%s: ", __func__); while (true) { int32_t n_dims; int32_t length; int32_t ftype; fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast(&length), sizeof(length)); fin.read(reinterpret_cast(&ftype), sizeof(ftype)); if (fin.eof()) { break; } int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; } std::string name(length, 0); fin.read(&name[0], length); if (model.tensors.find(name.data()) == model.tensors.end()) { fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); return false; } auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return false; } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lu, %lu], expected [%d, %d]\n", __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); return false; } if (0) { static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } size_t bpe = 0; switch (ftype) { case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; default: { fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); return false; } }; if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); } } printf(" done\n"); printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } return true; } // load the model's weights from a file path bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); return false; } bool loaded = gptj_model_load(fname, fin, model, vocab); fin.close(); return loaded; } // evaluate the transformer // // - model: the model // - n_threads: number of threads to use // - n_past: the context size so far // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // // The GPT-J model requires about 16MB of memory per input token. // bool gptj_eval( gptj_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_head = hparams.n_head; const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; const int d_key = n_embd/n_head; const size_t init_buf_size = 1024u*MB; if (!model.buf.addr || model.buf.size < init_buf_size) model.buf.resize(init_buf_size); if (mem_per_token > 0 && mem_per_token*N > model.buf.size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new); // reallocate model.buf.resize(buf_size_new); if (model.buf.addr == nullptr) { fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size); return false; } } struct ggml_init_params params = { .mem_size = model.buf.size, .mem_buffer = model.buf.addr, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = { .n_threads = n_threads }; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); // wte struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; // norm { cur = ggml_norm(ctx0, inpL); // cur = ln_1_g*cur + ln_1_b cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); } struct ggml_tensor * inpSA = cur; // self-attention { struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur); // store key and value to memory { struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_self.v, N*n_embd, (ggml_element_size(model.kv_self.v)*n_embd)*(il*n_ctx + n_past)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, 
ggml_rope(ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), n_past, n_rot, 0), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ggml_tensor * K = ggml_permute(ctx0, ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), n_past, n_rot, 1), 0, 2, 1, 3); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = ggml_cpy(ctx0, ggml_permute(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), ggml_new_tensor_3d(ctx0, model.kv_self.v->type, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); } struct ggml_tensor * inpFF = cur; // feed-forward network // this is independent of the self-attention result, so it could be done in parallel to the self-attention { // note here we pass inpSA instead of cur cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, inpSA); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation cur = ggml_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // self-attention + FF cur = ggml_add(ctx0, cur, inpFF); // input for next layer inpL = ggml_add(ctx0, cur, inpL); } // norm { inpL = ggml_norm(ctx0, inpL); // inpL = ln_f_g*inpL + ln_f_b inpL = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_g, inpL), inpL), ggml_repeat(ctx0, model.ln_f_b, inpL)); } // lm_head { inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); inpL = ggml_add(ctx0, ggml_repeat(ctx0, model.lmh_b, inpL), inpL); } // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); // run the computation ggml_build_forward_expand(&gf, inpL); ggml_graph_compute (ctx0, &gf); //if (n_past%100 == 0) { // ggml_graph_print (&gf); // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); return true; } #define GPTJ_MAX_RNG_STATE 64*1024 size_t gptj_get_state_size(const gptj_model &model) { // we don't know size of 
rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = GPTJ_MAX_RNG_STATE; const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); const size_t s_kv = model.kv_self.buf.size; const size_t s_total = ( + s_rng_size + s_rng + s_kv_size + s_kv_ntok + s_kv ); fflush(stdout); return s_total; } size_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest) { uint8_t * out = dest; fflush(stdout); // copy rng { std::stringstream rng_ss; rng_ss << rng; const size_t rng_size = rng_ss.str().size(); char rng_buf[GPTJ_MAX_RNG_STATE]; memset(&rng_buf[0], 0, GPTJ_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); memcpy(out, &rng_buf[0], GPTJ_MAX_RNG_STATE); out += GPTJ_MAX_RNG_STATE; } // copy kv cache { const size_t kv_size = model.kv_self.buf.size; const int kv_ntok = model.kv_self.n; memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size; } } const size_t written = out - dest; const size_t expected = gptj_get_state_size(model); assert(written == expected); fflush(stdout); return written; } size_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src) { const uint8_t * in = src; // set rng { size_t rng_size; char rng_buf[GPTJ_MAX_RNG_STATE]; memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); memcpy(&rng_buf[0], in, GPTJ_MAX_RNG_STATE); in += GPTJ_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> *rng; assert(rng_ss.fail() == false); } // set kv cache { size_t kv_size; int kv_ntok; memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size); memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); if (kv_size) { assert(model->kv_self.buf.size == kv_size); void * k_data = model->kv_self.k->data; // remember data pointers void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size; model->kv_self.k->data = k_data; // restore correct data pointers model->kv_self.v->data = v_data; } model->kv_self.n = kv_ntok; } const size_t nread = in - src; const size_t expected = gptj_get_state_size(*model); assert(nread == expected); fflush(stdout); return nread; } struct GPTJPrivate { const std::string modelPath; bool modelLoaded; gpt_vocab vocab; gptj_model *model = nullptr; int64_t n_threads = 0; size_t mem_per_token = 0; std::mt19937 rng; }; GPTJ::GPTJ() : d_ptr(new GPTJPrivate) { d_ptr->model = new gptj_model; d_ptr->modelLoaded = false; } bool GPTJ::loadModel(const std::string &modelPath) { std::mt19937 rng(time(NULL)); d_ptr->rng = rng; auto fin = std::ifstream(modelPath, std::ios::binary); // load the model if (!gptj_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) { std::cerr << "GPT-J ERROR: failed to load model from " << modelPath; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; fflush(stdout); return true; } void GPTJ::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t GPTJ::threadCount() const { return d_ptr->n_threads; } GPTJ::~GPTJ() { delete 
d_ptr->model; } bool GPTJ::isModelLoaded() const { return d_ptr->modelLoaded; } size_t GPTJ::stateSize() const { return gptj_get_state_size(*d_ptr->model); } size_t GPTJ::saveState(uint8_t *dest) const { return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest); } size_t GPTJ::restoreState(const uint8_t *src) { return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src); } void GPTJ::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n"; return; } const int64_t t_main_start_us = ggml_time_us(); int64_t t_sample_us = 0; int64_t t_predict_us = 0; int64_t t_prompt_us = 0; // tokenize the prompt std::vector embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt); // save the context size promptCtx.n_ctx = d_ptr->model->hparams.n_ctx; if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << "GPT-J ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // determine the required inference memory per token: static bool initialized = false; static std::vector p_instruct; static std::vector r_instruct; if (!initialized) { gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits, d_ptr->mem_per_token); initialized = true; } // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
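// With the default contextErase of 0.75f (see llmodel.h), roughly the oldest three quarters of the
// window are dropped here, e.g. a 2048-token context gives erasePoint = 1536; n_past is then reset
// to the surviving token count and recalculateContext() re-evaluates those tokens in n_batch-sized
// chunks to rebuild the KV cache before this batch continues.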
std::cerr << "GPTJ: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } t_prompt_us += ggml_time_us() - t_start_prompt_us; int p_instructFound = 0; int r_instructFound = 0; std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const int n_vocab = d_ptr->model->hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); id = gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.logits, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty, d_ptr->rng); t_sample_us += ggml_time_us() - t_start_sample_us; } // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "GPTJ: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } const int64_t t_start_predict_us = ggml_time_us(); if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to predict next token\n"; return; } t_predict_us += ggml_time_us() - t_start_predict_us; promptCtx.n_past += 1; // display text ++totalPredictions; if (id == 50256 /*end of text*/) goto stop_generating; const std::string str = d_ptr->vocab.id_to_token[id]; // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { goto stop_generating; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, d_ptr->vocab.id_to_token[t])) goto stop_generating; } cachedTokens.clear(); } stop_generating: #if 0 // report timing { const int64_t t_main_end_us = ggml_time_us(); std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n"; std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: prompt time = " << t_prompt_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/totalPredictions << " ms per token\n"; std::cout << "GPT-J INFO: total time = " << (t_main_end_us - t_main_start_us)/1000.0f << " ms\n"; fflush(stdout); } #endif return; } void GPTJ::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPTJ ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/gptj.h ================================================ #ifndef GPTJ_H #define GPTJ_H #include #include #include #include "llmodel.h" class GPTJPrivate; class GPTJ : public LLModel { public: GPTJ(); ~GPTJ(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void 
prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: GPTJPrivate *d_ptr; }; #endif // GPTJ_H ================================================ FILE: gpt4all-backend/llama/placeholder ================================================ ================================================ FILE: gpt4all-backend/llamamodel.cpp ================================================ #include "llamamodel.h" #include "llama.cpp/examples/common.h" #include "llama.cpp/llama.h" #include "llama.cpp/ggml.h" #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include struct LLamaPrivate { const std::string modelPath; bool modelLoaded; llama_context *ctx = nullptr; llama_context_params params; int64_t n_threads = 0; }; LLamaModel::LLamaModel() : d_ptr(new LLamaPrivate) { d_ptr->modelLoaded = false; } bool LLamaModel::loadModel(const std::string &modelPath) { // load the model d_ptr->params = llama_context_default_params(); gpt_params params; d_ptr->params.n_ctx = 2048; d_ptr->params.n_parts = params.n_parts; d_ptr->params.seed = params.seed; d_ptr->params.f16_kv = params.memory_f16; d_ptr->params.use_mmap = params.use_mmap; #if defined (__APPLE__) d_ptr->params.use_mlock = true; #else d_ptr->params.use_mlock = params.use_mlock; #endif d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params); if (!d_ptr->ctx) { std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; fflush(stderr); return true; } void LLamaModel::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t LLamaModel::threadCount() const { return d_ptr->n_threads; } LLamaModel::~LLamaModel() { llama_free(d_ptr->ctx); } bool LLamaModel::isModelLoaded() const { return d_ptr->modelLoaded; } size_t LLamaModel::stateSize() const { return llama_get_state_size(d_ptr->ctx); } size_t LLamaModel::saveState(uint8_t *dest) const { return llama_copy_state_data(d_ptr->ctx, dest); } size_t LLamaModel::restoreState(const uint8_t *src) { return llama_set_state_data(d_ptr->ctx, src); } void LLamaModel::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "LLAMA ERROR: prompt won't work with an unloaded model!\n"; return; } gpt_params params; params.prompt = prompt; // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); // tokenize the prompt auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false); // save the context size promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx); if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "The prompt size exceeds the context window size and cannot be processed."); std::cerr << "LLAMA ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = 
std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // number of tokens to keep when resetting context params.n_keep = (int)embd_inp.size(); // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... std::cerr << "LLAMA: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); llama_token id = llama_sample_top_p_top_k(d_ptr->ctx, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty); // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
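// Sampling above is delegated to llama.cpp's llama_sample_top_p_top_k() rather than the
// gpt_sample_top_k_top_p() used by the GPT-J and MPT backends; the window recycling below is the
// same idea as in those backends: drop the oldest contextErase fraction of tokens, reset n_past,
// and rebuild the KV cache through recalculateContext().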
std::cerr << "LLAMA: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } if (llama_eval(d_ptr->ctx, &id, 1, promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to predict next token\n"; return; } promptCtx.n_past += 1; // display text ++totalPredictions; if (id == llama_token_eos()) return; const std::string str = llama_token_to_str(d_ptr->ctx, id); // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { return; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, llama_token_to_str(d_ptr->ctx, t))) return; } cachedTokens.clear(); } } void LLamaModel::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/llamamodel.h ================================================ #ifndef LLAMAMODEL_H #define LLAMAMODEL_H #include #include #include #include "llmodel.h" class LLamaPrivate; class LLamaModel : public LLModel { public: LLamaModel(); ~LLamaModel(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: LLamaPrivate *d_ptr; }; #endif // LLAMAMODEL_H ================================================ FILE: gpt4all-backend/llmodel.h ================================================ #ifndef LLMODEL_H #define LLMODEL_H #include #include #include #include class LLModel { public: explicit LLModel() {} virtual ~LLModel() {} virtual bool loadModel(const std::string &modelPath) = 0; virtual bool 
isModelLoaded() const = 0; virtual size_t stateSize() const { return 0; } virtual size_t saveState(uint8_t *dest) const { return 0; } virtual size_t restoreState(const uint8_t *src) { return 0; } struct PromptContext { std::vector logits; // logits of current context std::vector tokens; // current tokens in the context window int32_t n_past = 0; // number of tokens in past conversation int32_t n_ctx = 0; // number of tokens possible in context window int32_t n_predict = 200; int32_t top_k = 40; float top_p = 0.9f; float temp = 0.9f; int32_t n_batch = 9; float repeat_penalty = 1.10f; int32_t repeat_last_n = 64; // last n tokens to penalize float contextErase = 0.75f; // percent of context to erase if we exceed the context // window }; virtual void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) = 0; virtual void setThreadCount(int32_t n_threads) {} virtual int32_t threadCount() const { return 1; } protected: virtual void recalculateContext(PromptContext &promptCtx, std::function recalculate) = 0; }; #endif // LLMODEL_H ================================================ FILE: gpt4all-backend/llmodel_c.cpp ================================================ #include "llmodel_c.h" #include "gptj.h" #include "llamamodel.h" #include "mpt.h" struct LLModelWrapper { LLModel *llModel = nullptr; LLModel::PromptContext promptContext; }; llmodel_model llmodel_gptj_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new GPTJ; return reinterpret_cast(wrapper); } void llmodel_gptj_destroy(llmodel_model gptj) { LLModelWrapper *wrapper = reinterpret_cast(gptj); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_mpt_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new MPT; return reinterpret_cast(wrapper); } void llmodel_mpt_destroy(llmodel_model mpt) { LLModelWrapper *wrapper = reinterpret_cast(mpt); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_llama_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new LLamaModel; return reinterpret_cast(wrapper); } void llmodel_llama_destroy(llmodel_model llama) { LLModelWrapper *wrapper = reinterpret_cast(llama); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_model_create(const char *model_path) { uint32_t magic; llmodel_model model; FILE *f = fopen(model_path, "rb"); fread(&magic, sizeof(magic), 1, f); if (magic == 0x67676d6c) { model = llmodel_gptj_create(); } else if (magic == 0x67676a74) { model = llmodel_llama_create(); } else if (magic == 0x67676d6d) { model = llmodel_mpt_create(); } else {fprintf(stderr, "Invalid model file\n");} fclose(f); return model; } void llmodel_model_destroy(llmodel_model model) { LLModelWrapper *wrapper = reinterpret_cast(model); const std::type_info &modelTypeInfo = typeid(*wrapper->llModel); if (modelTypeInfo == typeid(GPTJ)) { llmodel_gptj_destroy(model); } if (modelTypeInfo == typeid(LLamaModel)) { llmodel_llama_destroy(model); } if (modelTypeInfo == typeid(MPT)) { llmodel_mpt_destroy(model); } } bool llmodel_loadModel(llmodel_model model, const char *model_path) { LLModelWrapper *wrapper = reinterpret_cast(model); return wrapper->llModel->loadModel(model_path); } bool llmodel_isModelLoaded(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return llm->isModelLoaded(); } uint64_t llmodel_get_state_size(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return 
llm->stateSize(); } uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest) { const auto *llm = reinterpret_cast(model)->llModel; return llm->saveState(dest); } uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src) { LLModelWrapper *wrapper = reinterpret_cast(model); return wrapper->llModel->restoreState(src); } // Wrapper functions for the C callbacks bool prompt_wrapper(int32_t token_id, void *user_data) { llmodel_prompt_callback callback = reinterpret_cast(user_data); return callback(token_id); } bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) { llmodel_response_callback callback = reinterpret_cast(user_data); return callback(token_id, response.c_str()); } bool recalculate_wrapper(bool is_recalculating, void *user_data) { llmodel_recalculate_callback callback = reinterpret_cast(user_data); return callback(is_recalculating); } void llmodel_prompt(llmodel_model model, const char *prompt, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, llmodel_prompt_context *ctx) { LLModelWrapper *wrapper = reinterpret_cast(model); // Create std::function wrappers that call the C function pointers std::function prompt_func = std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast(prompt_callback)); std::function response_func = std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast(response_callback)); std::function recalc_func = std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast(recalculate_callback)); // Copy the C prompt context wrapper->promptContext.n_past = ctx->n_past; wrapper->promptContext.n_ctx = ctx->n_ctx; wrapper->promptContext.n_predict = ctx->n_predict; wrapper->promptContext.top_k = ctx->top_k; wrapper->promptContext.top_p = ctx->top_p; wrapper->promptContext.temp = ctx->temp; wrapper->promptContext.n_batch = ctx->n_batch; wrapper->promptContext.repeat_penalty = ctx->repeat_penalty; wrapper->promptContext.repeat_last_n = ctx->repeat_last_n; wrapper->promptContext.contextErase = ctx->context_erase; // Call the C++ prompt method wrapper->llModel->prompt(prompt, prompt_func, response_func, recalc_func, wrapper->promptContext); // Update the C context by giving access to the wrappers raw pointers to std::vector data // which involves no copies ctx->logits = wrapper->promptContext.logits.data(); ctx->logits_size = wrapper->promptContext.logits.size(); ctx->tokens = wrapper->promptContext.tokens.data(); ctx->tokens_size = wrapper->promptContext.tokens.size(); // Update the rest of the C prompt context ctx->n_past = wrapper->promptContext.n_past; ctx->n_ctx = wrapper->promptContext.n_ctx; ctx->n_predict = wrapper->promptContext.n_predict; ctx->top_k = wrapper->promptContext.top_k; ctx->top_p = wrapper->promptContext.top_p; ctx->temp = wrapper->promptContext.temp; ctx->n_batch = wrapper->promptContext.n_batch; ctx->repeat_penalty = wrapper->promptContext.repeat_penalty; ctx->repeat_last_n = wrapper->promptContext.repeat_last_n; ctx->context_erase = wrapper->promptContext.contextErase; } void llmodel_setThreadCount(llmodel_model model, int32_t n_threads) { LLModelWrapper *wrapper = reinterpret_cast(model); wrapper->llModel->setThreadCount(n_threads); } int32_t llmodel_threadCount(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return llm->threadCount(); } ================================================ FILE: gpt4all-backend/llmodel_c.h 
================================================ #ifndef LLMODEL_C_H #define LLMODEL_C_H #include #include #include #ifdef __cplusplus extern "C" { #endif /** * Opaque pointer to the underlying model. */ typedef void *llmodel_model; /** * llmodel_prompt_context structure for holding the prompt context. * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the * raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined * behavior. */ typedef struct { float *logits; // logits of current context size_t logits_size; // the size of the raw logits vector int32_t *tokens; // current tokens in the context window size_t tokens_size; // the size of the raw tokens vector int32_t n_past; // number of tokens in past conversation int32_t n_ctx; // number of tokens possible in context window int32_t n_predict; // number of tokens to predict int32_t top_k; // top k logits to sample from float top_p; // nucleus sampling probability threshold float temp; // temperature to adjust model's output distribution int32_t n_batch; // number of predictions to generate in parallel float repeat_penalty; // penalty factor for repeated tokens int32_t repeat_last_n; // last n tokens to penalize float context_erase; // percent of context to erase if we exceed the context window } llmodel_prompt_context; /** * Callback type for prompt processing. * @param token_id The token id of the prompt. * @return a bool indicating whether the model should keep processing. */ typedef bool (*llmodel_prompt_callback)(int32_t token_id); /** * Callback type for response. * @param token_id The token id of the response. * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string. * @return a bool indicating whether the model should keep generating. */ typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response); /** * Callback type for recalculation of context. * @param whether the model is recalculating the context. * @return a bool indicating whether the model should keep generating. */ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating); /** * Create a GPTJ instance. * @return A pointer to the GPTJ instance. */ llmodel_model llmodel_gptj_create(); /** * Destroy a GPTJ instance. * @param gptj A pointer to the GPTJ instance. */ void llmodel_gptj_destroy(llmodel_model gptj); /** * Create a MPT instance. * @return A pointer to the MPT instance. */ llmodel_model llmodel_mpt_create(); /** * Destroy a MPT instance. * @param gptj A pointer to the MPT instance. */ void llmodel_mpt_destroy(llmodel_model mpt); /** * Create a LLAMA instance. * @return A pointer to the LLAMA instance. */ llmodel_model llmodel_llama_create(); /** * Destroy a LLAMA instance. * @param llama A pointer to the LLAMA instance. */ void llmodel_llama_destroy(llmodel_model llama); /** * Create a llmodel instance. * Recognises correct model type from file at model_path * @param model_path A string representing the path to the model file. * @return A pointer to the llmodel_model instance. */ llmodel_model llmodel_model_create(const char *model_path); /** * Destroy a llmodel instance. * Recognises correct model type using type info * @param model a pointer to a llmodel_model instance. */ void llmodel_model_destroy(llmodel_model model); /** * Load a model from a file. * @param model A pointer to the llmodel_model instance. * @param model_path A string representing the path to the model file. 
* @return true if the model was loaded successfully, false otherwise. */ bool llmodel_loadModel(llmodel_model model, const char *model_path); /** * Check if a model is loaded. * @param model A pointer to the llmodel_model instance. * @return true if the model is loaded, false otherwise. */ bool llmodel_isModelLoaded(llmodel_model model); /** * Get the size of the internal state of the model. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @return the size in bytes of the internal state of the model */ uint64_t llmodel_get_state_size(llmodel_model model); /** * Saves the internal state of the model to the specified destination address. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @param dest A pointer to the destination. * @return the number of bytes copied */ uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest); /** * Restores the internal state of the model using data from the specified address. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @param src A pointer to the src. * @return the number of bytes read */ uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src); /** * Generate a response using the model. * @param model A pointer to the llmodel_model instance. * @param prompt A string representing the input prompt. * @param prompt_callback A callback function for handling the processing of prompt. * @param response_callback A callback function for handling the generated response. * @param recalculate_callback A callback function for handling recalculation requests. * @param ctx A pointer to the llmodel_prompt_context structure. */ void llmodel_prompt(llmodel_model model, const char *prompt, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, llmodel_prompt_context *ctx); /** * Set the number of threads to be used by the model. * @param model A pointer to the llmodel_model instance. * @param n_threads The number of threads to be used. */ void llmodel_setThreadCount(llmodel_model model, int32_t n_threads); /** * Get the number of threads currently being used by the model. * @param model A pointer to the llmodel_model instance. * @return The number of threads currently being used. 
*/ int32_t llmodel_threadCount(llmodel_model model); #ifdef __cplusplus } #endif #endif // LLMODEL_C_H ================================================ FILE: gpt4all-backend/mpt.cpp ================================================ #include "mpt.h" #include "llama.cpp/ggml.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include #include static const size_t MB = 1024*1024; // default hparams (MPT 7B) struct mpt_hparams { int32_t n_vocab = 50432; int32_t n_ctx = 2048; int32_t n_embd = 4096; int32_t n_head = 32; int32_t n_layer = 32; float alibi_bias_max = 8; float clip_qkv = 0; int32_t expand = 4; int32_t f16 = 1; }; struct mpt_layer { // normalization struct ggml_tensor * norm_1_w; struct ggml_tensor * norm_2_w; // attention struct ggml_tensor * attn_Wqkv_w; struct ggml_tensor * attn_out_proj_w; // ff struct ggml_tensor * ffn_up_proj_w; struct ggml_tensor * ffn_down_proj_w; }; struct mpt_buffer { uint8_t * addr = NULL; size_t size = 0; void resize(size_t size) { delete[] addr; addr = new uint8_t[size]; this->size = size; } ~mpt_buffer() { fflush(stdout); delete[] addr; } }; struct mpt_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; struct ggml_context * ctx = NULL; mpt_buffer buf; int n; // number of tokens currently in the cache ~mpt_kv_cache() { if (ctx) { ggml_free(ctx); } } }; struct mpt_model { mpt_hparams hparams; // normalization struct ggml_tensor * norm_f_w; struct ggml_tensor * wte; // position embedding // mpt does weight tying std::vector layers; struct mpt_kv_cache kv_self; struct ggml_context * ctx; std::map tensors; mpt_buffer buf; ~mpt_model() { if (ctx) { ggml_free(ctx); } } }; static bool kv_cache_init( const struct mpt_hparams & hparams, struct mpt_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int64_t n_mem = (int64_t)n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); if (!cache.ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); return false; } cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } // load the model's weights from a stream bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); // verify magic { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); if (magic != 0x67676d6d) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } } // load hparams { auto & hparams = model.hparams; fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); 
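// f16 (the ftype flag) is the last header field read here; the full header written by
// scripts/convert_mpt_hf_to_ggml.py is: magic 0x67676d6d, n_vocab, n_ctx, n_layer, n_head, n_embd,
// alibi_bias_max (float), clip_qkv (float), f16.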
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); printf("%s: ftype = %d\n", __func__, hparams.f16); } // load vocab { int32_t n_vocab = model.hparams.n_vocab; fin.read((char *) &n_vocab, sizeof(n_vocab)); if (n_vocab != model.hparams.n_vocab) { fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); return false; } std::string word; for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); bool special = false; if (len & (1<<31)) { len = len &~ (1<<31); special = true; } if (len > 0) { word.resize(len); fin.read((char *) word.data(), len); vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } if(special) { vocab.add_special_token(word); } } } // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation ggml_type wtype = GGML_TYPE_COUNT; switch (model.hparams.f16) { case 0: wtype = GGML_TYPE_F32; break; case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; case 5: wtype = GGML_TYPE_Q4_2; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", __func__, fname.c_str(), model.hparams.f16); return false; } } auto & ctx = model.ctx; size_t ctx_size = 0; { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_w ctx_size += n_embd*n_vocab*ggml_type_sizef(GGML_TYPE_F32); // wte ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_1_w ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_2_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // attn_Wqkv_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // attn_out_proj_w ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_up_proj_w ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_down_proj_w ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v // TODO probably less now? 
ctx_size += (5 + 10*n_layer)*256; // object overhead printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { struct ggml_init_params params = { .mem_size = ctx_size, .mem_buffer = NULL, .no_alloc = false, }; model.ctx = ggml_init(params); if (!model.ctx) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } } // prepare memory for the weights { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; model.layers.resize(n_layer); model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model.norm_f_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // map by name model.tensors["transformer.wte.weight"] = model.wte; model.tensors["transformer.norm_f.weight"] = model.norm_f_w; for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.norm_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.attn_Wqkv_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd * 3); layer.attn_out_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.ffn_up_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, expand*n_embd); layer.ffn_down_proj_w = ggml_new_tensor_2d(ctx, wtype, expand*n_embd, n_embd); // map by name model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_w; model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_w; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.attn_Wqkv_w; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.attn_out_proj_w; model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj_w; model.tensors["transformer.blocks." 
+ std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj_w; } } // key + value memory { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); ggml_free(ctx); return false; } const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v); printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } // load weights { int n_tensors = 0; size_t total_size = 0; printf("%s: ", __func__); while (true) { int32_t n_dims; int32_t length; int32_t ttype; fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast(&length), sizeof(length)); fin.read(reinterpret_cast(&ttype), sizeof(ttype)); if (fin.eof()) { break; } int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; } std::string name(length, 0); fin.read(&name[0], length); if (model.tensors.find(name.data()) == model.tensors.end()) { fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); return false; } auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return false; } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); return false; } // for debugging if (0) { printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } const size_t bpe = ggml_type_size(ggml_type(ttype)); if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); } } printf(" done\n"); printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } return true; } // load the model's weights from a file path bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); return false; } bool loaded = mpt_model_load(fname, fin, model, vocab); fin.close(); return loaded; } bool mpt_eval( mpt_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_head = hparams.n_head; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; const int d_key = n_embd/n_head; const size_t init_buf_size = 1024u*MB; if (!model.buf.addr || model.buf.size < init_buf_size) model.buf.resize(init_buf_size); if (mem_per_token > 0 && mem_per_token*N > model.buf.size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new); // reallocate model.buf.resize(buf_size_new); if (model.buf.addr == nullptr) { fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size); return false; } } struct ggml_init_params params = { .mem_size = model.buf.size, .mem_buffer = model.buf.addr, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = { .n_threads = n_threads }; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); // wte struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; struct ggml_tensor * cur = inpSA; // self-attention { // norm1 cur = ggml_norm(ctx0, cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_w, cur), cur); // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].attn_Wqkv_w, cur); // TODO: clip_qkv struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*ggml_element_size(cur)*n_embd)); struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*ggml_element_size(cur)*n_embd)); struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*ggml_element_size(cur)*n_embd)); // TODO: qk_ln? 
(seems to be False in MPT-7B configs) { Vcur = ggml_transpose(ctx0, Vcur); struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(model.kv_self.v), (il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N), 0, 2, 1, 3); struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // Alibi struct ggml_tensor * KQ_scaled_biased = ggml_alibi(ctx0, ggml_cont(ctx0, KQ_scaled), n_past, n_head); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_biased, n_past); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V = ggml_view_3d(ctx0, model.kv_self.v, n_past + N, n_embd/n_head, n_head, n_ctx*ggml_element_size(model.kv_self.v), n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].attn_out_proj_w, cur); } // residual struct ggml_tensor * resSA = ggml_add(ctx0, cur, inpSA); // feed-forward network { cur = resSA; // norm2 cur = ggml_norm(ctx0, cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_w, cur), cur); // ffn cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj_w, cur); cur = ggml_gelu(ctx0, cur); cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj_w, cur); } // self-attention + FF inpL = ggml_add(ctx0, cur, resSA); } struct ggml_tensor * out = inpL; // -> logits { out = ggml_norm(ctx0, out); out = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_w, out), out); out = ggml_mul_mat(ctx0, model.wte, out); } // run the computation ggml_build_forward_expand(&gf, out); ggml_graph_compute (ctx0, &gf); // return result for just the last token embd_w.resize(n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(out) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); return true; } #define MPT_MAX_RNG_STATE 64*1024 size_t mpt_get_state_size(const mpt_model &model) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. 
// for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = MPT_MAX_RNG_STATE; const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); const size_t s_kv = model.kv_self.buf.size; const size_t s_total = ( + s_rng_size + s_rng + s_kv_size + s_kv_ntok + s_kv ); fflush(stdout); return s_total; } size_t mpt_copy_state_data(const mpt_model &model, const std::mt19937 &rng, uint8_t *dest) { uint8_t * out = dest; fflush(stdout); // copy rng { std::stringstream rng_ss; rng_ss << rng; const size_t rng_size = rng_ss.str().size(); char rng_buf[MPT_MAX_RNG_STATE]; memset(&rng_buf[0], 0, MPT_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); memcpy(out, &rng_buf[0], MPT_MAX_RNG_STATE); out += MPT_MAX_RNG_STATE; } // copy kv cache { const size_t kv_size = model.kv_self.buf.size; const int kv_ntok = model.kv_self.n; memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size; } } const size_t written = out - dest; const size_t expected = mpt_get_state_size(model); assert(written == expected); fflush(stdout); return written; } size_t mpt_set_state_data(mpt_model *model, std::mt19937 *rng, const uint8_t *src) { const uint8_t * in = src; // set rng { size_t rng_size; char rng_buf[MPT_MAX_RNG_STATE]; memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); memcpy(&rng_buf[0], in, MPT_MAX_RNG_STATE); in += MPT_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> *rng; assert(rng_ss.fail() == false); } // set kv cache { size_t kv_size; int kv_ntok; memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size); memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); if (kv_size) { assert(model->kv_self.buf.size == kv_size); void * k_data = model->kv_self.k->data; // remember data pointers void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size; model->kv_self.k->data = k_data; // restore correct data pointers model->kv_self.v->data = v_data; } model->kv_self.n = kv_ntok; } const size_t nread = in - src; const size_t expected = mpt_get_state_size(*model); assert(nread == expected); fflush(stdout); return nread; } struct MPTPrivate { const std::string modelPath; bool modelLoaded; gpt_vocab vocab; mpt_model *model = nullptr; int64_t n_threads = 0; size_t mem_per_token = 0; std::mt19937 rng; bool has_im_end = false; }; MPT::MPT() : d_ptr(new MPTPrivate) { d_ptr->model = new mpt_model; d_ptr->modelLoaded = false; } bool MPT::loadModel(const std::string &modelPath) { std::mt19937 rng(time(NULL)); d_ptr->rng = rng; auto fin = std::ifstream(modelPath, std::ios::binary); // load the model if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) { std::cerr << "GPT-J ERROR: failed to load model from " << modelPath; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; d_ptr->has_im_end = d_ptr->vocab.token_to_id.find("<|im_end|>") != d_ptr->vocab.token_to_id.end(); fflush(stdout); return true; } void MPT::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t MPT::threadCount() const { return d_ptr->n_threads; } MPT::~MPT() { delete 
d_ptr->model; } bool MPT::isModelLoaded() const { return d_ptr->modelLoaded; } size_t MPT::stateSize() const { return mpt_get_state_size(*d_ptr->model); } size_t MPT::saveState(uint8_t *dest) const { return mpt_copy_state_data(*d_ptr->model, d_ptr->rng, dest); } size_t MPT::restoreState(const uint8_t *src) { return mpt_set_state_data(d_ptr->model, &d_ptr->rng, src); } void MPT::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n"; return; } const int64_t t_main_start_us = ggml_time_us(); int64_t t_sample_us = 0; int64_t t_predict_us = 0; int64_t t_prompt_us = 0; // tokenize the prompt std::vector embd_inp = gpt_tokenize(d_ptr->vocab, prompt); // save the context size promptCtx.n_ctx = d_ptr->model->hparams.n_ctx; if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << "GPT-J ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // determine the required inference memory per token: static bool initialized = false; static std::vector p_instruct; static std::vector r_instruct; if (!initialized) { mpt_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits, d_ptr->mem_per_token); initialized = true; } // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "MPT: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } t_prompt_us += ggml_time_us() - t_start_prompt_us; int p_instructFound = 0; int r_instructFound = 0; std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const int n_vocab = d_ptr->model->hparams.n_vocab; int id = 0; { const int64_t t_start_sample_us = ggml_time_us(); const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); id = gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.logits, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty, d_ptr->rng); t_sample_us += ggml_time_us() - t_start_sample_us; } // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "MPT: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } const int64_t t_start_predict_us = ggml_time_us(); if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to predict next token\n"; return; } t_predict_us += ggml_time_us() - t_start_predict_us; promptCtx.n_past += 1; // display text ++totalPredictions; // mpt-7b-chat has special token for end if (d_ptr->has_im_end && id == d_ptr->vocab.token_to_id["<|im_end|>"]) goto stop_generating; if (id == 0 /*end of text*/) goto stop_generating; const std::string str = d_ptr->vocab.id_to_token[id]; // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { goto stop_generating; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, d_ptr->vocab.id_to_token[t])) goto stop_generating; } cachedTokens.clear(); } stop_generating: #if 0 // report timing { const int64_t t_main_end_us = ggml_time_us(); std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n"; std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: prompt time = " << t_prompt_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/totalPredictions << " ms per token\n"; std::cout << "GPT-J INFO: total time = " << (t_main_end_us - t_main_start_us)/1000.0f << " ms\n"; fflush(stdout); } #endif return; } void MPT::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "MPT ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/mpt.h ================================================ #ifndef MPT_H #define MPT_H #include #include #include #include "llmodel.h" class MPTPrivate; class MPT : public LLModel { public: MPT(); ~MPT(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const 
override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: MPTPrivate *d_ptr; }; #endif // MPT_H ================================================ FILE: gpt4all-backend/scripts/convert_mpt_hf_to_ggml.py ================================================ # Convert Hugging Face fine-tuned bloom-like models to ggml format # # Usage: # # python3 models/convert-h5-to-ggml.py # # This script is similar to "convert-pt-to-ggml.py" # import io import os import sys import struct import json import code import torch import numpy as np from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py def bytes_to_unicode(): """ Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8+n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) if len(sys.argv) < 3: print("Usage: python convert-hf-to-ggml.py model_name dir-output [use-f32]") print(" model_name: name of the model to convert. 
Example: 'bigscience/bloomz-560m'") print(" dir-output: directory where the output file will be written") print(" use-f32: if present, use float32 instead of float16") sys.exit(1) model_name = sys.argv[1] dir_out = sys.argv[2] # make sure the output directory exists os.makedirs(dir_out, exist_ok=True) # possible data types # ftype == 0 -> float32 # ftype == 1 -> float16 # # map from ftype to string ftype_str = ["f32", "f16"] ftype = 1 if len(sys.argv) > 3: ftype = 0 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) hparams = config.to_dict() print("Loading model: ", model_name) model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True) print("Model loaded: ", model_name) fname_out = dir_out + f"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin" fout = open(fname_out, "wb") vocab = tokenizer.vocab hparams["multiple_of"] = 1 fout.write(struct.pack("I", 0x67676d6d)) # magic: ggml in hex fout.write(struct.pack("I", model.config.vocab_size)) fout.write(struct.pack("I", model.config.max_seq_len)) fout.write(struct.pack("I", model.config.n_layers)) fout.write(struct.pack("I", model.config.n_heads)) fout.write(struct.pack("I", model.config.d_model)) fout.write(struct.pack("f", model.config.attn_config['alibi_bias_max'])) clip_qkv = model.config.attn_config['clip_qkv'] fout.write(struct.pack("f", clip_qkv if clip_qkv is not None else 0)) fout.write(struct.pack("I", ftype)) # # Is this correct?? # dot_token = tokenizer.encode(".")[0] # write tokens to ggml file dot_token = tokenizer.encode('.')[0] fout.write(struct.pack("I", model.config.vocab_size)) for i in range(model.config.vocab_size): text = tokenizer.decode([dot_token, i]).encode('utf-8') # remove the first byte (it's always '.') text = text[1:] enclen = len(text) if i in tokenizer.all_special_ids: print(f"special token: {text}") enclen = enclen | 1<<31 fout.write(struct.pack("I", enclen)) fout.write(text) list_vars = model.state_dict() for name in list_vars.keys(): data = list_vars[name].squeeze().numpy() print("Processing variable: " + name + " with shape: ", data.shape) n_dims = len(data.shape); # ftype == 0 -> float32, ftype == 1 -> float16 ftype_cur = 0; if ftype != 0: # Keep token embeddings in fp32 if name[-7:] == ".weight" and n_dims == 2 and ".wte" not in name: print(" Converting to float16") data = data.astype(np.float16) ftype_cur = 1 else: print(" Converting to float32") data = data.astype(np.float32) ftype_cur = 0 else: if data.dtype != np.float32: print(" Converting to float32") data = data.astype(np.float32) ftype_cur = 0 # header str = name.encode('utf-8') fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(str); # data data.tofile(fout) fout.close() print("Done. 
Output file: " + fname_out) print("") ================================================ FILE: gpt4all-backend/utils.cpp ================================================ #include "utils.h" #include #include void replace(std::string & str, const std::string & needle, const std::string & replacement) { size_t pos = 0; while ((pos = str.find(needle, pos)) != std::string::npos) { str.replace(pos, needle.length(), replacement); pos += replacement.length(); } } std::map json_parse(const std::string & fname) { std::map result; // read file into string std::string json; { std::ifstream ifs(fname); if (!ifs) { fprintf(stderr, "Failed to open %s\n", fname.c_str()); exit(1); } json = std::string((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); } if (json[0] != '{') { return result; } // parse json { bool has_key = false; bool in_token = false; std::string str_key = ""; std::string str_val = ""; int n = json.size(); for (int i = 1; i < n; ++i) { if (!in_token) { if (json[i] == ' ') continue; if (json[i] == '"') { in_token = true; continue; } } else { if (json[i] == '\\' && i+1 < n) { if (has_key == false) { str_key += json[i]; } else { str_val += json[i]; } ++i; } else if (json[i] == '"') { if (has_key == false) { has_key = true; ++i; while (json[i] == ' ') ++i; ++i; // : while (json[i] == ' ') ++i; if (json[i] != '\"') { while (json[i] != ',' && json[i] != '}') { str_val += json[i++]; } has_key = false; } else { in_token = true; continue; } } else { has_key = false; } ::replace(str_key, "\\u0120", " " ); // \u0120 -> space ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line ::replace(str_key, "\\\"", "\""); // \\\" -> " try { result[str_key] = std::stoi(str_val); } catch (...) { //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); } str_key = ""; str_val = ""; in_token = false; continue; } if (has_key == false) { str_key += json[i]; } else { str_val += json[i]; } } } } return result; } std::vector gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) { std::vector words; // first split the text into words { std::string str = text; std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; std::regex re(pat); std::smatch m; while (std::regex_search(str, m, re)) { for (auto x : m) { words.push_back(x); } str = m.suffix(); } } // find the longest tokens that form the words: std::vector tokens; for (const auto & word : words) { if (word.size() == 0) continue; int i = 0; int n = word.size(); while (i < n) { int j = n; while (j > i) { auto it = vocab.token_to_id.find(word.substr(i, j-i)); if (it != vocab.token_to_id.end()) { tokens.push_back(it->second); i = j; break; } --j; } if (i == n) { break; } if (j == i) { auto sub = word.substr(i, 1); if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { tokens.push_back(vocab.token_to_id.at(sub)); } else { fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); } ++i; } } } return tokens; } std::string regex_escape(const std::string &s) { static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])"); return std::regex_replace(s, metacharacters, "\\$&"); } std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { // Generate the subpattern from the special_tokens vector if it's not empty if (!vocab.special_tokens.empty()) { std::vector out; std::vector chunks; std::string str = text; std::string special_tokens_subpattern; for (const auto &token : 
vocab.special_tokens) { if (!special_tokens_subpattern.empty()) { special_tokens_subpattern += "|"; } special_tokens_subpattern += regex_escape(token); } std::regex re(special_tokens_subpattern); std::smatch m; while (std::regex_search(str, m, re)) { auto tok = vocab.token_to_id.find(m.str()); if (tok != vocab.token_to_id.end()) { auto tokid = tok->second; auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix()); out.insert(out.end(), pfxtoks.begin(), pfxtoks.end()); out.push_back(tokid); str = m.suffix(); } } if (!str.empty()) { auto tokrest = gpt_tokenize_inner(vocab, str); out.insert(out.end(), tokrest.begin(), tokrest.end()); } return out; } else { return gpt_tokenize_inner(vocab, text); } } bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); vocab.token_to_id = ::json_parse(fname); for (const auto & kv : vocab.token_to_id) { vocab.id_to_token[kv.second] = kv.first; } printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); // print the vocabulary //for (auto kv : vocab.token_to_id) { // printf("'%s' -> %d\n", kv.first.data(), kv.second); //} return true; } gpt_vocab::id gpt_sample_top_k_top_p( const gpt_vocab & vocab, const size_t actualVocabSize, const int32_t * last_n_tokens_data, int last_n_tokens_size, const std::vector logits, int top_k, double top_p, double temp, float repeat_penalty, std::mt19937 & rng) { int n_logits = actualVocabSize; const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size); const auto * plogits = logits.data() + logits.size() - n_logits; std::vector> logits_id; logits_id.reserve(n_logits); { const float scale = 1.0f/temp; for (int i = 0; i < n_logits; ++i) { // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability if (plogits[i] < 0.0f) { logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); } else { logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); } } else { logits_id.push_back(std::make_pair(plogits[i]*scale, i)); } } } // find the top K tokens std::partial_sort( logits_id.begin(), logits_id.begin() + top_k, logits_id.end(), [](const std::pair & a, const std::pair & b) { return a.first > b.first; }); logits_id.resize(top_k); double maxl = -INFINITY; for (const auto & kv : logits_id) { maxl = std::max(maxl, kv.first); } // compute probs for the top K tokens std::vector probs; probs.reserve(logits_id.size()); double sum = 0.0; for (const auto & kv : logits_id) { double p = exp(kv.first - maxl); probs.push_back(p); sum += p; } // normalize the probs for (auto & p : probs) { p /= sum; } if (top_p < 1.0f) { double cumsum = 0.0f; for (int i = 0; i < top_k; i++) { cumsum += probs[i]; if (cumsum >= top_p) { top_k = i + 1; probs.resize(top_k); logits_id.resize(top_k); break; } } cumsum = 1.0/cumsum; for (int i = 0; i < (int) probs.size(); i++) { probs[i] *= cumsum; } } //printf("\n"); //for (int i = 0; i < (int) probs.size(); i++) { // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); //} //exit(0); std::discrete_distribution<> dist(probs.begin(), probs.end()); int idx = dist(rng); return logits_id[idx].second; } ================================================ FILE: 
gpt4all-backend/utils.h ================================================ // Various helper functions and utilities #pragma once #include #include #include #include #include // // CLI argument parsing // struct gpt_params { int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 200; // new tokens to predict // sampling parameters int32_t top_k = 40; float top_p = 0.9f; float temp = 0.9f; int32_t n_batch = 8; // batch size for prompt processing std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path std::string prompt; }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); // // Vocab utils // struct gpt_vocab { using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; std::vector special_tokens; void add_special_token(const std::string &token) { special_tokens.push_back(token); } }; void replace(std::string & str, const std::string & needle, const std::string & replacement); // poor-man's JSON parsing std::map json_parse(const std::string & fname); // split text into tokens // // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 // // Regex (Python): // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" // // Regex (C++): // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); // load the tokens from encoder.json bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); // sample next token given probabilities for each embedding // // - consider only the top K tokens // - from them, consider only the top tokens with cumulative probability > P // // TODO: not sure if this implementation is correct // gpt_vocab::id gpt_sample_top_k_top_p( const gpt_vocab & vocab, const size_t actualVocabSize, const int32_t * last_n_tokens_data, int last_n_tokens_size, const std::vector logits, int top_k, double top_p, double temp, float repeat_penalty, std::mt19937 & rng); ================================================ FILE: prompt_template_sample.txt ================================================ ### Instruction: The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response. 
### Prompt: %1 ### Response: ================================================ FILE: src/CMakeLists.txt ================================================ add_executable(chat chat.cpp header.h utils.h parse_json.h ../gpt4all-backend/llmodel_c.h) target_link_libraries(chat PRIVATE llmodel llama) ================================================ FILE: src/chat.cpp ================================================ #include "./header.h" #include "../gpt4all-backend/llmodel_c.h" #include "./utils.h" #include "./parse_json.h" ////////////////////////////////////////////////////////////////////////// //////////// ANIMATION //////////// ////////////////////////////////////////////////////////////////////////// std::atomic<bool> stop_display{false}; void display_frames() { const char* frames[] = {".", ":", "'", ":"}; int frame_index = 0; ConsoleState con_st; con_st.use_color = true; while (!stop_display) { set_console_color(con_st, PROMPT); std::cerr << "\r" << frames[frame_index % 4] << std::flush; frame_index++; set_console_color(con_st, DEFAULT); if (!stop_display){ std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::cerr << "\r" << " " << std::flush; std::cerr << "\r" << std::flush; } } } void display_loading() { while (!stop_display) { for (int i=0; i < 14; i++){ fprintf(stdout, "."); fflush(stdout); std::this_thread::sleep_for(std::chrono::milliseconds(200)); if (stop_display){ break; } } std::cout << "\r" << " " << "\r" << std::flush; } std::cout << "\r" << " " << std::flush; } ////////////////////////////////////////////////////////////////////////// //////////// /ANIMATION //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// CHAT FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// #ifndef OLD_MACOS bool save_state_to_binary(llmodel_model& model, uint8_t *dest, chatParams& params, std::string &filename, uint64_t model_size) { if (params.save_dir == "") { std::filesystem::path directory_path(params.path+"saves"); if (!std::filesystem::is_directory(directory_path)) { if (!std::filesystem::create_directory(directory_path)) { std::cerr << "Error creating directory" << std::endl; return false; } } params.save_dir = params.path+"saves"; } //sanity check that we're not trying to overwrite binaries of wrong size //empty binaries are allowed, so are previous saves of same model type if (std::filesystem::exists(params.save_dir+"/"+filename+".bin")) { uint64_t file_size = std::filesystem::file_size(params.save_dir+"/"+filename+".bin"); if ((file_size == model_size) || (file_size == 0)) { //continue } else { std::cerr << "You are trying to overwrite existing binary of different size!
" << params.save_dir+"/"+filename+".bin" << std::endl; return 0; } } // create an output file stream std::ofstream outfile; // open the file in binary mode outfile.open(params.save_dir+"/"+filename+".bin", std::ios::binary); // check if the file stream is open if (!outfile.is_open()) { std::cerr << "Error opening file " << params.save_dir+"/"+filename+".bin" << std::endl; return false; } // write the model data to the file stream uint64_t copied_bytes = llmodel_save_state_data(model, dest); outfile.write(reinterpret_cast(dest), copied_bytes); // close the file stream outfile.close(); return true; } bool load_state_from_binary(llmodel_model& model, chatParams& params, std::string &filename, uint64_t model_size) { if (params.save_dir == "") { params.save_dir = params.path+"saves"; } //sanity check that we're not trying to load binaries of wrong size //only binaries that are saves of same model type are allowed if (std::filesystem::exists(params.save_dir+"/"+filename+".bin")) { uint64_t file_size = std::filesystem::file_size(params.save_dir+"/"+filename+".bin"); if (file_size == model_size) { //continue } else { std::cerr << "You are trying to load a binary of wrong size! " << params.save_dir+"/"+filename+".bin" << std::endl; return 0; } } // create an input file stream std::ifstream infile; // open the file in binary mode infile.open(params.save_dir+"/"+filename+".bin", std::ios::binary); // check if the file stream is open if (!infile.is_open()) { std::cerr << "Error opening file " << params.save_dir+"/"+filename+".bin" << std::endl; return false; } // get the size of the file infile.seekg(0, std::ios::end); uint64_t file_size = infile.tellg(); infile.seekg(0, std::ios::beg); // allocate a buffer to hold the file data uint8_t* buffer = new uint8_t[file_size]; try { buffer = new uint8_t[file_size]; } catch (std::bad_alloc& ba) { std::cerr << "Failed to allocate buffer: " << ba.what() << std::endl; return false; } // read the file data into the buffer infile.read(reinterpret_cast(buffer), file_size); infile.close(); // restore the internal state of the model using the buffer data llmodel_restore_state_data(model, buffer); delete[] buffer; return true; } bool save_ctx_to_binary(llmodel_prompt_context& prompt_context, chatParams& params, std::string &filename) { if (params.save_dir == "") { std::filesystem::path directory_path(params.path+"saves"); if (!std::filesystem::is_directory(directory_path)) { if (!std::filesystem::create_directory(directory_path)) { std::cerr << "Error creating directory" << std::endl; return false; } } params.save_dir = params.path+"saves"; } std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + ".ctx"); std::string fullPath = filePath.string(); // Open the binary file for writing FILE* file = fopen(fullPath.c_str(), "wb"); if (!file) { std::cerr << "Error opening file: " << fullPath << std::endl; return false; } // Write the struct to the file using fwrite fwrite(&prompt_context, sizeof(prompt_context), 1, file); // Close the file fclose(file); return true; } llmodel_prompt_context load_ctx_from_binary(chatParams& params, std::string &filename) { if (params.save_dir == "") { params.save_dir = params.path+"saves"; } // Construct the file path with home directory expansion std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + ".ctx"); std::string fullPath = filePath.string(); // Open the binary file for reading FILE* file = fopen(fullPath.c_str(), "rb"); if (!file) { std::cerr << "Error opening file: " << 
fullPath << std::endl; exit(EXIT_FAILURE); } // Read the struct from the file using fread llmodel_prompt_context prompt_context; fread(&prompt_context, sizeof(prompt_context), 1, file); // Close the file fclose(file); return prompt_context; } #endif std::string get_input(ConsoleState& con_st, std::string& input, chatParams &params, llmodel_prompt_context &prompt_context, llmodel_model& model) { set_console_color(con_st, USER_INPUT); std::cout << "\n> "; std::getline(std::cin, input); std::istringstream iss(input); std::string input1, input2; std::getline(iss, input1, ' '); std::getline(iss, input2, ' '); set_console_color(con_st, DEFAULT); if (input == "/reset") { //reset the logits, tokens and past conversation prompt_context.logits = params.logits; prompt_context.logits_size = params.logits_size; prompt_context.tokens = params.tokens; prompt_context.tokens_size = params.tokens_size; prompt_context.n_past = params.n_past; prompt_context.n_ctx = params.n_ctx; //get new input using recursion set_console_color(con_st, PROMPT); std::cout << "Chat context reset."; return get_input(con_st, input, params, prompt_context, model); } #ifndef OLD_MACOS if ((input == "/save" || input1 == "/save") && (params.no_saves == false)) { std::string filename = params.save_name; if (input2 != "" && (input2.find("..") == std::string::npos) ) { filename = input2; } bool success1 = false; bool success2 = false; uint64_t model_size = llmodel_get_state_size(model); uint8_t *dest = new uint8_t[model_size]; success1 = save_state_to_binary(model, dest, params, filename, model_size); delete[] dest; success2 = save_ctx_to_binary(prompt_context, params, filename); //get new input using recursion set_console_color(con_st, PROMPT); if (success1 && success2) { std::cout << "Model data saved to: " << params.save_dir+"/"+filename+".bin" << " size: " << floor(model_size/10000000)/100.0 << " Gb"; } return get_input(con_st, input, params, prompt_context, model); } if ((input == "/load" || input1 == "/load") && (params.no_saves == false)) { std::string filename = params.save_name; if (input2 != "" && (input2.find("..") == std::string::npos) ) { filename = input2; } //reset the logits, tokens and past conversation free(prompt_context.logits); free(prompt_context.tokens); prompt_context.logits = params.logits; prompt_context.logits_size = params.logits_size; prompt_context.tokens = params.tokens; prompt_context.tokens_size = params.tokens_size; prompt_context.n_past = params.n_past; prompt_context.n_ctx = params.n_ctx; bool success = false; uint64_t model_size = llmodel_get_state_size(model); prompt_context = load_ctx_from_binary(params, filename); success = load_state_from_binary(model, params, filename, model_size); model_size = llmodel_get_state_size(model); //get new input using recursion set_console_color(con_st, PROMPT); if (success) { std::cout << "Model data loaded from: " << params.save_dir+"/"+filename+".bin" << " size: " << floor(model_size/10000000)/100.0 << " Gb"; } return get_input(con_st, input, params, prompt_context, model); } #endif if (input == "/help"){ set_console_color(con_st, DEFAULT); std::cout << std::endl; char emptystring[] = ""; char* emptyargv[] = {emptystring}; int emptyargc = sizeof(emptyargv) / sizeof(char*); print_usage(emptyargc, emptyargv, params); return get_input(con_st, input, params, prompt_context, model); } if (input == "/about"){ set_console_color(con_st, DEFAULT); std::cout << std::endl; print_version(); return get_input(con_st, input, params, prompt_context, model); } if (input == "exit" ||
input == "quit" || input == "/exit" || input == "/quit") { llmodel_model_destroy(model); exit(0); } return input; } std::string hashstring = ""; std::string answer = ""; ////////////////////////////////////////////////////////////////////////// //////////// /CHAT FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// MAIN PROGRAM //////////// ////////////////////////////////////////////////////////////////////////// int main(int argc, char* argv[]) { ConsoleState con_st; con_st.use_color = true; set_console_color(con_st, DEFAULT); set_console_color(con_st, PROMPT); set_console_color(con_st, BOLD); std::cout << APPNAME; set_console_color(con_st, DEFAULT); set_console_color(con_st, PROMPT); std::cout << " (v. " << VERSION << ")"; set_console_color(con_st, DEFAULT); std::cout << "" << std::endl; check_avx_support_at_startup(); chatParams params; //convert the default model path into Windows format if on WIN32 #ifdef _WIN32 std::filesystem::path p(params.model); params.model = p.make_preferred().string(); #endif //get all parameters from cli arguments or json parse_params(argc, argv, params); //Create a prompt_context and copy all params from chatParams to prompt_context llmodel_prompt_context prompt_context = { .logits = params.logits, .logits_size = params.logits_size, .tokens = params.tokens, .tokens_size = params.tokens_size, .n_past = params.n_past, .n_ctx = params.n_ctx, .n_predict = params.n_predict, .top_k = params.top_k, .top_p = params.top_p, .temp = params.temp, .n_batch = params.n_batch, .repeat_penalty = params.repeat_penalty, .repeat_last_n = params.repeat_last_n, .context_erase = params.context_erase, }; //Subprocess signal handling #ifdef _WIN32 SetConsoleCtrlHandler(console_ctrl_handler, TRUE); #else signal(SIGHUP, handle_sighup); #endif ////////////////////////////////////////////////////////////////////////// //////////// LOAD THE MODEL //////////// ////////////////////////////////////////////////////////////////////////// //animation std::future future; stop_display = true; if(params.use_animation) {stop_display = false; future = std::async(std::launch::async, display_loading);} //handle stderr for now //this is just to prevent printing unnecessary details during model loading. int stderr_copy = dup(fileno(stderr)); #ifdef _WIN32 std::freopen("NUL", "w", stderr); #else std::freopen("/dev/null", "w", stderr); #endif llmodel_model model = llmodel_model_create(params.model.c_str()); std::cout << "\r" << APPNAME << ": loading " << params.model.c_str() << std::endl; //bring back stderr for now dup2(stderr_copy, fileno(stderr)); close(stderr_copy); //check if model is loaded auto check_model = llmodel_loadModel(model, params.model.c_str()); if (check_model == false) { if(params.use_animation) { stop_display = true; future.wait(); stop_display= false; } std::cerr << "Error loading: " << params.model.c_str() << std::endl; std::cout << "Press any key to exit..." << std::endl; std::cin.get(); return 0; } else { if(params.use_animation) { stop_display = true; future.wait(); } std::cout << "\r" << APPNAME << ": done loading!" 
<< std::flush; } ////////////////////////////////////////////////////////////////////////// //////////// /LOAD THE MODEL //////////// ////////////////////////////////////////////////////////////////////////// set_console_color(con_st, PROMPT); std::cout << "\n" << params.prompt.c_str() << std::endl; set_console_color(con_st, DEFAULT); //load prompt template from file instead if (params.load_template != "") { std::tie(params.default_prefix, params.default_header, params.default_footer) = read_prompt_template_file(params.load_template); } //load chat log from a file if (params.load_log != "") { if (params.prompt == "") { params.prompt = params.default_prefix + read_chat_log(params.load_log) + params.default_header; } else { params.prompt = params.default_prefix + read_chat_log(params.load_log) + params.default_header + params.prompt; } } else { params.prompt = params.default_prefix + params.default_header + params.prompt; } ////////////////////////////////////////////////////////////////////////// //////////// PROMPT LAMBDA FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// auto prompt_callback = [](int32_t token_id) { // You can handle prompt here if needed return true; }; auto response_callback = [](int32_t token_id, const char *responsechars) { if (!(responsechars == nullptr || responsechars[0] == '\0')) { // stop the animation, printing response if (stop_display == false) { stop_display = true; std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::cerr << "\r" << " " << std::flush; std::cerr << "\r" << std::flush; if (answer != "") {std::cout << answer;} } std::cout << responsechars << std::flush; answer += responsechars; } return true; }; auto recalculate_callback = [](bool is_recalculating) { // You can handle recalculation requests here if needed return is_recalculating; }; ////////////////////////////////////////////////////////////////////////// //////////// PROMPT TEXT AND GET RESPONSE //////////// ////////////////////////////////////////////////////////////////////////// llmodel_setThreadCount(model, params.n_threads); std::string input = ""; //main chat loop. if (!params.no_interactive && !sighup_received) { input = get_input(con_st, input, params, prompt_context, model); //Interactive mode. We have a prompt. if (params.prompt != "") { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.prompt + " " + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.prompt + " " + input + params.default_footer).c_str(), answer.c_str()); } //Interactive mode. Else get prompt from input. 
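// For reference, every llmodel_prompt() call in this interactive chat path receives a
// string assembled from the template pieces defined in src/header.h. Illustrative sketch
// only (assuming the default prefix/header/footer, an initial prompt of Hello passed with
// -p, and Hi typed at the "> " prompt):
//   "### Instruction:\n The prompt below is a question to answer, ... write an
//    appropriate response.\n### Prompt: Hello Hi\n### Response: "
// i.e. params.prompt (prefix + header + initial prompt) + " " + user input + footer.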
} else { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); } } //Interactive and continuous mode. Get prompt from input. while (!params.run_once && !sighup_received) { answer = ""; //New prompt. We stored previous answer in memory so clear it. input = get_input(con_st, input, params, prompt_context, model); if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); } } //No-interactive mode. Get the answer once from prompt and print it. 
} else { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.prompt + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.prompt + params.default_footer).c_str(), answer.c_str()); } std::cout << std::endl; } set_console_color(con_st, DEFAULT); llmodel_model_destroy(model); return 0; } ================================================ FILE: src/header.h ================================================ #pragma once #ifndef HEADER_H #define HEADER_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include //For paths //Commented out to support really old xcode #ifndef OLD_MACOS #include #endif //For Windows MSVC compilation #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include #include #include #include "config.h" #include #include #include // chatParams contains all the parameters you can import from json or with cli arguments // it also contains the initial value for PromptContext struct chatParams { //std::vector logits, // logits of current context //std::vector tokens, // current tokens in the context window //These are in the prompt context, maybe add as parameters too. 
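// The fields in this first group mirror the members of llmodel_prompt_context: main() in
// chat.cpp copies them field-for-field into the prompt_context handed to the backend, and
// the /reset and /load commands in get_input assign them back to return the context to
// this initial, empty state.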
float *logits = nullptr; // logits of current context size_t logits_size = 0; // the size of the raw logits vector int32_t *tokens = nullptr; // current tokens in the context window size_t tokens_size = 0; // the size of the raw tokens vector int32_t n_past = 0; // number of tokens in past conversation //Parameters below you can import from json or with cli arguments int32_t n_ctx = 0; // number of tokens possible in context window int32_t n_predict = 200; // number of tokens to predict int32_t top_k = 40; // top k logits to sample from float top_p = 0.95; // nucleus sampling probability threshold float temp = 0.28; // temperature to adjust model's output distribution int32_t n_batch = 9; // number of predictions to generate in parallel float repeat_penalty = 1.1; // penalty factor for repeated tokens int32_t repeat_last_n = 64; // last n tokens to penalize float context_erase = 0.75; // percent of context to erase if we exceed the context window //Parameters below are not inside prompt_context, but handled separately int32_t seed = -1; int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); std::string model = "./models/ggml-vicuna-13b-1.1-q4_2.bin"; std::string prompt = ""; //template prefix, header, and footer std::string default_prefix = "### Instruction:\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response."; std::string default_header = "\n### Prompt: "; std::string default_footer = "\n### Response: "; //You can toggle chat interactivity with these parameters bool no_interactive = false; bool use_animation = true; bool run_once = false; bool no_saves = false; std::string b_token = ""; //beginning wrap token std::string e_token = ""; //ending wrap token std::string load_template = ""; //template file location std::string load_json = ""; //json file location std::string save_log = ""; //saved chat log file location std::string load_log = ""; //loaded chat log file location std::string save_name = "model_state"; //model state binary name std::string save_dir = ""; //saves directory name //program binary path std::string path = ""; }; enum ConsoleColor { DEFAULT = 0, PROMPT, USER_INPUT, BOLD }; struct ConsoleState { bool use_color = false; ConsoleColor color = DEFAULT; }; std::string APPNAME = "LlamaGPTJ-chat"; //utils.h functions void set_console_color(ConsoleState &con_st, ConsoleColor color); std::string random_prompt(int32_t seed); void print_usage(int argc, char** argv, const chatParams& params); bool parse_params(int argc, char** argv, chatParams& params); //parse_json.h functions void get_params_from_json(chatParams& params); #endif ================================================ FILE: src/parse_json.h ================================================ #pragma once #ifndef PARSE_JSON_H #define PARSE_JSON_H #include "header.h" //helper function to convert string to bool bool stob(const std::string& str) { std::string lowerStr = str; std::transform(str.begin(), str.end(), lowerStr.begin(), ::tolower); if (lowerStr == "true") { return true; } else if (lowerStr == "false") { return false; } else { throw std::invalid_argument("Invalid boolean string"); } } std::string readFile(const std::string& filename) { std::ifstream inFile(filename); if (!inFile) { std::cerr << "Unable to open file: " << filename << std::endl; return ""; } std::stringstream buffer; buffer << inFile.rdbuf(); inFile.close(); return buffer.str(); } std::map parse_json_string(const std::string& jsonString) 
{ std::map resultMap; std::regex pattern("\"([^\"]+)\":\\s*([^\"]+|\"[^\"]+\")"); std::smatch match; std::string::const_iterator searchStart(jsonString.cbegin()); while (std::regex_search(searchStart, jsonString.cend(), match, pattern)) { resultMap[match[1]] = match[2]; searchStart = match.suffix().first; } return resultMap; } std::string removeQuotes(const std::string& input) { std::string result = input; result.erase(std::remove(result.begin(), result.end(), '\"'), result.end()); return result; } void get_params_from_json(chatParams& params) { std::map parsed = parse_json_string(readFile(params.load_json)); if (parsed.find("top_p") != parsed.end()) params.top_p = std::stof(parsed["top_p"]); if (parsed.find("top_k") != parsed.end()) params.top_k = std::stoi(parsed["top_k"]); if (parsed.find("temp") != parsed.end()) params.temp = std::stof(parsed["temp"]); if (parsed.find("n_predict") != parsed.end()) params.n_predict = std::stoi(parsed["n_predict"]); if (parsed.find("n_batch") != parsed.end()) params.n_batch = std::stoi(parsed["n_batch"]); if (parsed.find("n_ctx") != parsed.end()) params.n_ctx = std::stoi(parsed["n_ctx"]); if (parsed.find("seed") != parsed.end()) params.seed = std::stoi(parsed["seed"]); if (parsed.find("threads") != parsed.end()) params.n_threads = std::stoi(parsed["threads"]); if (parsed.find("model") != parsed.end()) params.model = removeQuotes(parsed["model"]); if (parsed.find("prompt") != parsed.end()) params.prompt = removeQuotes(parsed["prompt"]); if (parsed.find("no-interactive") != parsed.end()) params.no_interactive = stob(removeQuotes(parsed["no-interactive"])); if (parsed.find("run-once") != parsed.end()) params.run_once = stob(removeQuotes(parsed["run-once"])); if (parsed.find("no-animation") != parsed.end()) params.use_animation = !stob(removeQuotes(parsed["no-animation"])); if (parsed.find("no-saves") != parsed.end()) params.no_saves = stob(removeQuotes(parsed["no-saves"])); if (parsed.find("repeat_penalty") != parsed.end()) params.repeat_penalty = std::stof(parsed["repeat_penalty"]); if (parsed.find("repeat_last_n") != parsed.end()) params.repeat_last_n = std::stoi(parsed["repeat_last_n"]); if (parsed.find("context_erase") != parsed.end()) params.context_erase = std::stof(parsed["context_erase"]); if (parsed.find("b_token") != parsed.end()) params.b_token = removeQuotes(parsed["b_token"]); if (parsed.find("e_token") != parsed.end()) params.e_token = removeQuotes(parsed["e_token"]); if (parsed.find("load_template") != parsed.end()) params.load_template = removeQuotes(parsed["load_template"]); if (parsed.find("save_log") != parsed.end()) params.save_log = removeQuotes(parsed["save_log"]); if (parsed.find("load_log") != parsed.end()) params.load_log = removeQuotes(parsed["load_log"]); if (parsed.find("save_dir") != parsed.end()) params.save_dir = removeQuotes(parsed["save_dir"]); if (parsed.find("save_name") != parsed.end()) params.save_name = removeQuotes(parsed["save_name"]);} #endif ================================================ FILE: src/utils.h ================================================ #pragma once #ifndef UTILS_H #define UTILS_H #include "header.h" //Need this for Windows colors #ifdef _WIN32 #include #endif bool containsSubstring(const std::string &str, const std::string &substr) { return str.find(substr) != std::string::npos; } void check_avx_support_at_startup() { #if defined(__x86_64__) || defined(__i386__) const bool avx(__builtin_cpu_supports("avx")); const bool avx2(__builtin_cpu_supports("avx2")); const bool 
avx512(__builtin_cpu_supports("avx512f")); const bool fma(__builtin_cpu_supports("fma")); if (avx512 && avx && avx2 && fma) {std::cout << "Your computer supports AVX512" << std::endl;} else if (avx && avx2 && fma) {std::cout << "Your computer supports AVX2" << std::endl;} else if (avx) {std::cout << "Your computer only supports AVX1" << std::endl;} else {std::cout << "Your computer does not support AVX1 or AVX2\nThe program will likely not run." << std::endl;} #ifdef OLD_MACOS std::cout << "Compiled with OLD_MACOS flag. /save and /load features turned off." << std::endl; #endif #endif } ////////////////////////////////////////////////////////////////////////// //////////// SIGNAL HANDLING //////////// ////////////////////////////////////////////////////////////////////////// volatile sig_atomic_t sighup_received = 0; void handle_sighup(int signal) { #ifndef _WIN32 if (signal == SIGHUP) { sighup_received = 1; } #endif } #ifdef _WIN32 BOOL WINAPI console_ctrl_handler(DWORD ctrl_type) { switch (ctrl_type) { case CTRL_C_EVENT: case CTRL_CLOSE_EVENT: sighup_received = 1; return TRUE; default: return FALSE; } } #endif ////////////////////////////////////////////////////////////////////////// //////////// /SIGNAL HANDLING //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// READ PROMPT TEMPLATE FILE //////////// ////////////////////////////////////////////////////////////////////////// //This is a bit messy function but it should parse the template file into prefix, header, and footer. //Chat will then prompt the model with (prefix + header + input/prompt + footer) std::tuple read_prompt_template_file(const std::string& file_path) { std::string prefix, header, footer; std::ifstream file(file_path); std::vector lines; std::string line; //store all lines of header template into a vector if (file.is_open()) { while (std::getline(file, line)) { lines.push_back(line); } file.close(); } else { std::cerr << "Unable to open the prompt template file." << std::endl; std::cerr << "Reverting to default prompt template." << std::endl; return std::make_tuple("### Instruction:\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.", "\n### Prompt: ", "\n### Response: "); } //find line containing %1 and store its index. int input_index; for (size_t i = 0; i < lines.size(); ++i) { if (lines[i].find("%1") != std::string::npos) { input_index = i; } } //Special case of having only %1 in template file. if (input_index == 0) { header = ""; prefix = ""; footer = ""; //If there is only 1 line above %1, that will be ### header. } else if (input_index == 1) { header = lines[0]; prefix = " "; } else { //Put lines above the header-line into prefix. prefix = lines[0]; for (size_t i = 1; i < input_index-1; ++i) { prefix = prefix + "\n" + lines[i]; } prefix = prefix + " "; //store header-line (line above input-line) header = "\n" + lines[input_index-1] + " "; //Put lines below the input-line into footer. 
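// Worked example (an illustrative four-line template, not the bundled sample file
// verbatim):
//   line 0: "### Instruction: answer the question below."
//   line 1: "### Prompt:"
//   line 2: "%1"
//   line 3: "### Response:"
// Here input_index is 2, so prefix becomes line 0 plus a trailing space, header becomes
// "\n" + line 1 + " ", and the loop below collects line 3 into footer after its leading "\n".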
footer = "\n"; for (size_t i = input_index+1; i < lines.size(); ++i) { footer = footer + lines[i]+" "; } } return std::make_tuple(prefix, header, footer); } ////////////////////////////////////////////////////////////////////////// //////////// /READ PROMPT TEMPLATE FILE //////////// ////////////////////////////////////////////////////////////////////////// void save_chat_log(std::string save_log, std::string prompt, std::string answer) { std::ofstream logfile(save_log, std::ios::app); if (logfile.is_open()) { logfile << prompt; logfile << answer+"\n"; logfile.close(); } } std::string read_chat_log(std::string load_log) { std::ifstream ifs(load_log); std::string content((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); return content; } std::string pathname_directory(const std::string &pathname) { std::size_t len = pathname.find_last_of("/\\"); return len == std::string::npos ? "": pathname.substr(0, len); } void set_console_color(ConsoleState &con_st, ConsoleColor color) { if (con_st.use_color && con_st.color != color) { //Windows handles colors differently. #ifdef _WIN32 WORD windows_colors[] = { 7, 14, 10, 15 }; HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); SetConsoleTextAttribute(hConsole, windows_colors[color]); #else //ANSI colors, works for unix. const char* ansi_colors[] = { //DEFAULT, PROMPT, USER_INPUT, BOLD //default, yellow, bright_green, bold "\x1b[0m", "\x1b[33m", "\x1b[1m\x1b[32m", "\x1b[1m" }; printf("%s", ansi_colors[color]); #endif con_st.color = color; } } std::string random_prompt(int32_t seed) { const std::vector prompts = { "So", "Once upon a time", "When", "The", "After", "If", "import", "He", "She", "They" }; std::mt19937 rng(seed); return prompts[rng() % prompts.size()]; } void print_version() { //Version/about page //Contains License information for distributions in binary form std::string mit_license = R"(MIT License Big thanks to contributors, testers, and commenters on Github. And to you, dear user! Happy chatting! :) )"; std::cout << "\n\n" << APPNAME << " version " << VERSION << "\n\n" << "Made by kuvaus" << "\n\n" << mit_license << std::endl; //std::cout << mit_license << std::endl; } void print_usage(int argc, char** argv, const chatParams& params) { // Print usage information fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "A simple chat program for GPT-J, LLaMA, and MPT models.\n"); fprintf(stderr, "You can set specific initial prompt with the -p flag.\n"); fprintf(stderr, "Runs default in interactive and continuous mode.\n"); fprintf(stderr, "Type '/reset' to reset the chat context.\n"); fprintf(stderr, "Type '/save','/load' to save network state into a binary file.\n"); fprintf(stderr, "Type '/save NAME','/load NAME' to rename saves. 
Default: --save_name NAME.\n"); fprintf(stderr, "Type '/help' to show this help dialog.\n"); fprintf(stderr, "Type 'quit', 'exit' or, 'Ctrl+C' to quit.\n"); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -v, --version show version and license information\n"); fprintf(stderr, " --run-once disable continuous mode\n"); fprintf(stderr, " --no-interactive disable interactive mode altogether (uses given prompt only)\n"); fprintf(stderr, " --no-animation disable chat animation\n"); fprintf(stderr, " --no-saves disable '/save','/load' functionality\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed for --random-prompt (default: -1)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: empty)\n"); fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); fprintf(stderr, " --n_ctx N number of tokens in context window (default: %d)\n", params.n_ctx); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --repeat_penalty N repeat_penalty (default: %.1f)\n", params.repeat_penalty); fprintf(stderr, " --repeat_last_n N last n tokens to penalize (default: %d)\n", params.repeat_last_n); fprintf(stderr, " --context_erase N percent of context to erase (default: %.1f)\n", params.context_erase); fprintf(stderr, " --b_token optional beginning wrap token for response (default: empty)\n"); fprintf(stderr, " --e_token optional end wrap token for response (default: empty)\n"); fprintf(stderr, " -j, --load_json FNAME\n"); fprintf(stderr, " load options instead from json at FNAME (default: empty/no)\n"); fprintf(stderr, " --load_template FNAME\n"); fprintf(stderr, " load prompt template from a txt file at FNAME (default: empty/no)\n"); fprintf(stderr, " --save_log FNAME\n"); fprintf(stderr, " save chat log to a file at FNAME (default: empty/no)\n"); fprintf(stderr, " --load_log FNAME\n"); fprintf(stderr, " load chat log from a file at FNAME (default: empty/no)\n"); fprintf(stderr, " --save_dir DIR\n"); fprintf(stderr, " directory for saves (default: %s/saves)\n", pathname_directory(argv[0]).c_str()); fprintf(stderr, " --save_name NAME\n"); fprintf(stderr, " save/load model state binary at save_dir/NAME.bin (current: %s)\n", params.save_name.c_str()); fprintf(stderr, " context is saved to save_dir/NAME.ctx (current: %s)\n", params.save_name.c_str()); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (current: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); } bool parse_params(int argc, char** argv, chatParams& params) { // Parse command-line arguments for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg == "-j" || arg == "--load_json") { params.load_json = argv[++i]; if (!params.load_json.empty()) { std::cout << APPNAME << ": parsing options from json: " << params.load_json << std::endl; get_params_from_json(params); } else { std::cout << APPNAME << ": trying to parse options from json but got empty 
filename." << std::endl; } } else if (arg == "--run-once") { params.run_once = true; } else if (arg == "--no-interactive") { params.no_interactive = true; } else if (arg == "--no-animation") { params.use_animation = false; } else if (arg == "--no-saves") { params.no_saves = true; } else if (arg == "-s" || arg == "--seed") { params.seed = static_cast(std::stoi(argv[++i])); } else if (arg == "-t" || arg == "--threads") { params.n_threads = static_cast(std::stoi(argv[++i])); } else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "--random-prompt") { params.prompt = random_prompt(params.seed); } else if (arg == "-n" || arg == "--n_predict") { params.n_predict = static_cast(std::stoi(argv[++i])); } else if (arg == "--top_k") { params.top_k = static_cast(std::stoi(argv[++i])); } else if (arg == "--top_p") { params.top_p = static_cast(std::stof(argv[++i])); } else if (arg == "--temp") { params.temp = static_cast(std::stof(argv[++i])); } else if (arg == "-b" || arg == "--batch_size") { params.n_batch = static_cast(std::stoi(argv[++i])); } else if (arg == "--n_ctx") { params.n_ctx = static_cast(std::stoi(argv[++i])); } else if (arg == "--repeat_penalty") { params.repeat_penalty = static_cast(std::stof(argv[++i])); } else if (arg == "--repeat_last_n") { params.repeat_last_n = static_cast(std::stoi(argv[++i])); } else if (arg == "--context_erase") { params.context_erase = static_cast(std::stof(argv[++i])); } else if (arg == "--b_token") { params.b_token = argv[++i]; } else if (arg == "--e_token") { params.e_token = argv[++i]; } else if (arg == "--load_template") { params.load_template = argv[++i]; } else if (arg == "--save_log") { params.save_log = argv[++i]; } else if (arg == "--load_log") { params.load_log = argv[++i]; } else if (arg == "--save_dir") { params.save_dir = argv[++i]; } else if (arg == "--save_name") { params.save_name = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, params); exit(0); } else if (arg == "-v" || arg == "--version") { print_version(); exit(0); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv, params); exit(0); } } //get path to program params.path = pathname_directory(argv[0]); params.path.append("/"); return true; } #endif