[
  {
    "path": ".github/workflows/cmake-release.yml",
    "content": "name: CMake-release\n\non:\n  push:\n    tags:\n      - 'v*'\n\nenv:\n  BUILD_TYPE: Release\n\npermissions:\n  contents: read\n  actions: write\n\njobs:\n  build:\n    runs-on: ${{ matrix.os }}\n\n    strategy:\n      fail-fast: false\n      matrix:\n        os:\n          - ubuntu-latest\n          - macos-latest\n          - windows-latest\n        instructions:\n          - avx\n          - avx2\n\n    steps:\n    - uses: actions/checkout@v3\n      with:\n          submodules: recursive\n\n    - name: Setup MinGW\n      if: matrix.os == 'windows-latest'\n      run: |\n        choco install mingw -y -libwinpthread\n        echo \"C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append\n\n    - name: Configure CMake\n      run: |\n        if (\"${{ matrix.os }}\" -eq \"windows-latest\") {\n          $env:PATH += \";C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\"\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G \"MinGW Makefiles\"\n        } else {\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }}\n        }\n      shell: pwsh\n      \n    - name: Build\n      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}\n      \n    - name: Test\n      working-directory: ${{github.workspace}}/build\n      run: ctest -C ${{env.BUILD_TYPE}}\n\n    - name: Prepare binary\n      run: |\n        if (\"${{ matrix.instructions }}\" -eq \"avx\"){\n          if (\"${{ matrix.os }}\" -eq \"windows-latest\") {\n            cp ${{github.workspace}}\\build\\bin\\chat.exe chat.exe\n            mv chat.exe chat-windows-latest-avx.exe\n            shasum -a 256 -b chat-windows-latest-avx.exe > shasum-chat-windows-latest-avx.sha256\n          } else {\n    
        cp ${{github.workspace}}/build/bin/chat chat\n            mv chat chat-${{ matrix.os }}-${{ matrix.instructions }}\n            shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256\n          }\n        } else {\n          if (\"${{ matrix.os }}\" -eq \"windows-latest\") {\n            cp ${{github.workspace}}\\build\\bin\\chat.exe chat.exe\n            mv chat.exe chat-windows-latest-avx2.exe\n            shasum -a 256 -b chat-windows-latest-avx2.exe > shasum-chat-windows-latest-avx2.sha256\n          } else {\n            cp ${{github.workspace}}/build/bin/chat chat\n            mv chat chat-${{ matrix.os }}-${{ matrix.instructions }}\n            shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256\n          }\n        }\n      shell: pwsh\n\n    - name: Upload binary\n      uses: actions/upload-artifact@v2\n      with:\n        name: chat-${{ matrix.os }}-${{ matrix.instructions }}\n        path: chat-${{ matrix.os }}-${{ matrix.instructions }}*\n\n    - name: Upload shasums\n      uses: actions/upload-artifact@v2\n      with:\n        name: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}\n        path: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}*\n  release:\n    needs: build\n    runs-on: ubuntu-latest\n\n    steps:\n    - name: Create Release\n      id: create_release\n      uses: actions/create-release@v1\n      env:\n        GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }}\n      with:\n        tag_name: ${{ github.ref }}\n        release_name: Release ${{ github.ref }}\n        draft: false\n        prerelease: false\n\n    - name: Download artifacts\n      uses: actions/download-artifact@v2\n      with:\n        path: artifacts\n\n    - name: Upload artifacts\n      uses: softprops/action-gh-release@v1\n      env:\n        GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }}\n      with:\n      
  tag_name: ${{ github.ref_name }}\n        name: Release ${{ github.ref_name }}\n        draft: false\n        prerelease: false\n        files: |\n          artifacts/**/*\n\n    #\n    # This part filters the CHANGELOG.md using python\n    # Then it adds FILTERED_CHANGELOG.md to release notes\n    #\n\n    - name: Checkout repository\n      uses: actions/checkout@v3\n\n    - name: Set up Python\n      uses: actions/setup-python@v4\n      with:\n        python-version: 3.x\n\n    - name: Filter CHANGELOG.md\n      uses: jannekem/run-python-script-action@v1\n      with:\n        script: |\n          filtered_lines = []\n          start_processing = False\n\n          with open('CHANGELOG.md', 'r') as file:\n            for line in file:\n              if line.startswith(\"#### [v\"):\n                if start_processing:\n                  break\n                else:\n                  file.readline()\n                  file.readline()\n                  start_processing = True\n                  continue\n              if start_processing:\n                filtered_lines.append(line)\n      \n            with open('FILTERED_CHANGELOG.md', 'w') as file:\n              file.writelines(filtered_lines)\n\n    - name: Generate release notes\n      uses: softprops/action-gh-release@v1\n      env:\n        GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }}\n      with:\n        tag_name: ${{ github.ref_name }}\n        name: Release ${{ github.ref_name }}\n        body_path: FILTERED_CHANGELOG.md\n        draft: false\n        prerelease: false\n\n"
  },
  {
    "path": ".github/workflows/cmake.yml",
    "content": "name: CMake\n\non:\n  push:\n    branches: [ \"main\" ]\n\nenv:\n  BUILD_TYPE: Release\n\njobs:\n  build:\n    runs-on: ${{ matrix.os }}\n    \n    strategy:\n      fail-fast: false\n      matrix:\n        os:\n          - ubuntu-latest\n          - macos-latest\n          - windows-latest\n        instructions:\n          - avx\n          - avx2\n\n    steps:\n    - uses: actions/checkout@v3\n      with:\n          submodules: recursive\n\n    - name: Setup MinGW\n      if: matrix.os == 'windows-latest'\n      run: |\n        choco install mingw -y -libwinpthread\n        echo \"C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append\n\n    - name: Configure CMake\n      run: |\n        if (\"${{ matrix.os }}\" -eq \"windows-latest\") {\n          $env:PATH += \";C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\"\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G \"MinGW Makefiles\"\n        } else {\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }}\n        }\n      shell: pwsh\n\n    - name: Build\n      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}\n\n    - name: Test\n      working-directory: ${{github.workspace}}/build\n      run: ctest -C ${{env.BUILD_TYPE}}\n\n\n\n"
  },
  {
    "path": ".github/workflows/cmake_branch.yml",
    "content": "name: CMake\n\non:\n  push:\n    branches:\n      - '*'\n      - '!main'\n\nenv:\n  BUILD_TYPE: Release\n\njobs:\n  build:\n    runs-on: ${{ matrix.config.os }}\n    \n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n          - { os: 'ubuntu-latest', instructions: 'avx' }\n          - { os: 'ubuntu-latest', instructions: 'avx2' }\n          - { os: 'macos-latest', instructions: 'avx' }\n          - { os: 'macos-latest', instructions: 'avx2' }\n          - { os: 'windows-latest', build: 'msvc', instructions: 'avx' }\n          - { os: 'windows-latest', build: 'msvc', instructions: 'avx2' }\n          - { os: 'windows-latest', build: 'mingw', instructions: 'avx' }\n          - { os: 'windows-latest', build: 'mingw', instructions: 'avx2' }\n\n    steps:\n    - uses: actions/checkout@v3\n      with:\n          submodules: recursive\n\n    - name: Configure CMake\n      if: matrix.build == 'msvc'\n      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}\n\n    - name: Build\n      if: matrix.build == 'msvc'\n      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}\n\n    - name: Test\n      if: matrix.build == 'msvc'\n      working-directory: ${{github.workspace}}/build\n      run: ctest -C ${{env.BUILD_TYPE}}\n\n    - name: Prepare binary\n      if: matrix.build == 'msvc'\n      run: |\n        if (\"${{ matrix.os }}\" -eq \"windows-latest\") {\n          cp ${{github.workspace}}\\build\\bin\\Release\\chat.exe chat-msvc.exe\n          mv chat-msvc.exe chat-windows-latest-msvc.exe\n        }\n      shell: pwsh\n      \n    - name: Setup MinGW\n      if: matrix.os == 'windows-latest'\n      run: |\n        choco install mingw -y -libwinpthread\n        echo \"C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append\n\n    - name: Configure CMake\n      run: |\n        if (\"${{ matrix.os }}\" -eq 
\"windows-latest\") {\n          $env:PATH += \";C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw64\\bin\"\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G \"MinGW Makefiles\"\n        } elseif (\"${{ matrix.arch }}\" -eq \"aarch64\") {\n        } else {\n          cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }}\n        }\n      shell: pwsh\n\n    - name: Build\n      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}\n\n    - name: Test\n      working-directory: ${{github.workspace}}/build\n      run: ctest -C ${{env.BUILD_TYPE}}\n    \n\n\n\n"
  },
  {
    "path": ".gitignore",
    "content": "# Folders\nbuild/\ntmp/\n\n# Visual Studio Code\n.vscode\n\n# MacOS \n.DS_Store\n\n# Prerequisites\n*.d\n\n# Compiled Object files\n*.slo\n*.lo\n*.o\n*.obj\n\n# Precompiled Headers\n*.gch\n*.pch\n\n# Compiled Dynamic libraries\n*.so\n*.dylib\n*.dll\n\n# Fortran module files\n*.mod\n*.smod\n\n# Compiled Static libraries\n*.lai\n*.la\n*.a\n*.lib\n\n# Executables\n*.exe\n*.out\n*.app\n\n.cache\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"llama.cpp\"]\n    path = gpt4all-backend/llama.cpp\n    url = https://github.com/manyoso/llama.cpp\n    #url = https://github.com/ggerganov/llama.cpp\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "\n## Changelog\n\n#### [Upcoming](https://github.com/kuvaus/LlamaGPTJ-chat/compare/v0.3.0...HEAD)\n\n\n#### [v0.3.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.3.0)\n\n> 26 June 2023\n\n- Add this [changelog](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/CHANGELOG.md) :)\n- Add sha256 hashes on release so you can verify the binaries\n- All binaries are automatically generated with Github actions\n- Add signal handling for SIGHUP (macOS, Linux) and CTRL_CLOSE_EVENT (Windows) to fix issue [`#16`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/16)\n- This allows you to run chat as a subprocess. The chat subprocess now quits properly if parent app is closed.\n- Version information\n- Fix segfault on`/help`\n\n#### [v0.2.9](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.9)\n\n> 22 June 2023\n\n- [Pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) from [@154pinkchairs](https://github.com/154pinkchairs/) merged. Thanks. :)\n- The pull request [`#18`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) has the two fixes below:\n- Properly handle file paths including tildes [`18e9f36`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/18e9f36)\n- Handle buffer allocation errors [`6800dfb`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/6800dfb)\n- Better debug mode compilation. 
May fix issue [`#9`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/9)\n\n#### [v0.2.8](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.8)\n> 16 June 2023\n\n- Adds `--save_dir` option so you can change save directory location\n- Default location is `./saves` on the same directory as the chat binary\n- See issue [`#13`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/13) for more details\n\n#### [v0.2.7](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.7)\n> 15 June 2023\n\n- Fixes for old macOS.\n- Use `-DOLD_MACOS=ON` option when compiling with CMake.\n- Tested to compile on High Sierra and Xcode 10\n\n#### [v0.2.6](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.6)\n> 14 June 2023\n\n- You can name saves with `./save NAME` and `./load NAME`\n- You can toggle saving and loading off with `--no-saves` flag\n\n#### [v0.2.5](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.5)\n> 13 June 2023\n\n- Save/load state with `./save` and `./load`\n- Reset context with `./reset`, help with `./help`\n- Makes a `./saves` folder\n- Note that a single save can take up to 2Gb\n- You can wrap the AI response with tokens using `--b_token` and `--e_token`\n- See issue [`#12`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/12) for more details\n\n#### [v0.2.4](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.4)\n> 5 June 2023\n\n- Fix when using json to specify names for logfiles. Fixes issue [`#11`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/11)\n\n#### [v0.2.3](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.3)\n> 4 June 2023\n\n- Fix said ability to reset context... 
:)\n\n#### [v0.2.2](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.2)\n> 3 June 2023\n\n- Ability to reset context\n\n#### [v0.2.1](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.1)\n> 30 May 2023\n\n- Save and load chat logs\n- Use `--save_log` and `--load_log`\n- AVX512 option for compilation `-DAVX512=ON`\n\n#### [v0.2.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.0)\n> 17 May 2023\n\n- Update gpt4all backend to v0.1.1 [`61a963a`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/61a963a3d220ef157a8504ddde708f33dc2946eb)\n- Full Windows Visual Studio compatibility. Finally fixes issue [`#1`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/1)\n- Builds from source on aarch64 Linux. Fixes issue [`#3`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/3)\n- Full MPT support. Fixes issue [`#4`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/4)\n\n#### v0.1.9\n> 16 May 2023\n\n- Code cleaning and reordering\n- `llmodel_create_model()` function\n\n#### v0.1.8\n> 13 May 2023\n\n- Add support for MPT models\n- Uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all)\n\n#### v0.1.7\n> 12 May 2023\n\n- First [pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2)  from [@itz-coffee](https://github.com/itz-coffee/) merged. Thanks. 
:)\n- The pull request [`#2`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2) adds the feature below:\n- Add --no-animation flag [`fdc2ac3`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/fdc2ac3)\n- Support for old macOS\n\n#### v0.1.6\n> 4 May 2023\n\n- Parse parameters from json files\n- Use `-j FNAME` or `--load_json FNAME`\n\n#### v0.1.5\n> 3 May 2023\n\n- MinGW compilation on Windows\n\n#### v0.1.4\n> 1 May 2023\n\n- v0.1.4 had no tags\n- It was part of `cmake-release.yml` rewrite to enable MinGW [`e7e1ebf`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/e7e1ebf97d696d069bbc0ae7f0ed078739fb6642)\n\n#### v0.1.3\n> 1 May 2023\n\n- Add loading of prompt template files\n- Use `--load_template` for loading\n- See `prompt_template_sample.txt` for a sample\n\n\n#### v0.1.2\n> 30 April 2023\n\n- Automatic memory handling for the model\n\n#### v0.1.1\n> 29 April 2023\n\n- Windows compilation fixes\n\n#### v0.1.0\n> 29 April 2023\n\n\n- Before this, progress was in [GPTJ-chat](https://github.com/kuvaus/GPTJ-chat/) and [Llama-chat](https://github.com/kuvaus/Llama-chat/)\n- First version\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required (VERSION 3.2)\n\nif(APPLE)\n  option(OLD_MACOS       \"Using old macos\"                   OFF) \n  option(BUILD_UNIVERSAL \"Build a Universal binary on macOS\" ON)\n  if(BUILD_UNIVERSAL AND NOT OLD_MACOS)\n    # Build a Universal binary on macOS\n    set(CMAKE_OSX_ARCHITECTURES \"arm64;x86_64\" CACHE STRING \"\" FORCE)\n  else()\n    # Build for the host architecture on macOS\n    set(CMAKE_OSX_ARCHITECTURES \"${CMAKE_HOST_SYSTEM_PROCESSOR}\" CACHE STRING \"\" FORCE)\n  endif()\n  if (OLD_MACOS)\n    add_definitions(-DOLD_MACOS)\n  endif()\nendif()\n\nproject(LlamaGPTJ-chat)\n\nset(VERSION_MAJOR 0)\nset(VERSION_MINOR 3)\nset(VERSION_PATCH 0)\nset(VERSION \"${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}\")\n\nset(CMAKE_EXPORT_COMPILE_COMMANDS \"on\")\nset(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)\nset(CMAKE_INSTALL_RPATH \"${CMAKE_INSTALL_PREFIX}/lib\")\n\n\nif(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)\n    set(LLAMA_STANDALONE ON)\nelse()\n    set(LLAMA_STANDALONE OFF)\nendif()\n\n\n# options\noption(AVX2                         \"enable AVX2\"                                           ON)\noption(AVX512                       \"enable AVX512\"                                         OFF)\n\noption(LLAMA_AVX                    \"llama: enable AVX\"                                     ON)\noption(LLAMA_AVX2                   \"llama: enable AVX2\"                                    ${AVX2})\noption(LLAMA_AVX512                 \"llama: enable AVX512\"                                  ${AVX512})\noption(LLAMA_AVX512_VBMI            \"llama: enable AVX512-VBMI\"                             ${AVX512})\noption(LLAMA_AVX512_VNNI            \"llama: enable AVX512-VNNI\"                             ${AVX512})\noption(LLAMA_FMA                    \"llama: enable FMA\"                                     ${AVX2})\n\n\n# sanitizers\n#set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL \"llama: build examples\" 
FORCE)\nif(APPLE)\nelseif(UNIX)\n    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES \"aarch64\")\n    option(AVX2                         \"enable AVX2\"                                           OFF)\n    option(LLAMA_AVX                    \"llama: enable AVX\"                                     OFF)\n    option(LLAMA_AVX2                   \"llama: enable AVX2\"                                    OFF)\n    option(LLAMA_AVX512                 \"llama: enable AVX512\"                                  OFF)\n    option(LLAMA_AVX512_VBMI            \"llama: enable AVX512-VBMI\"                             OFF)\n    option(LLAMA_AVX512_VNNI            \"llama: enable AVX512-VNNI\"                             OFF)\n    set(BUILD_SHARED_LIBS ON FORCE)\n    set(CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS} -mno-outline-atomics\")\n    endif()\nendif()\n\nif (GGML_SANITIZE_THREAD)\n    set(CMAKE_C_FLAGS   \"${CMAKE_C_FLAGS}   -fsanitize=thread\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsanitize=thread\")\nendif()\n\nif (GGML_SANITIZE_ADDRESS)\n    set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}     -fsanitize=address -fno-omit-frame-pointer\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer\")\nendif()\n\nif (GGML_SANITIZE_UNDEFINED)\n    set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}     -fsanitize=undefined\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsanitize=undefined\")\nendif()\nif (AVX512)\n    set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}     -mavx512vl\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -mavx512vl\")\nendif()\n\n#set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS} -ffast-math\")\n#set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS} -march=native\")\n#set(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS} -mcpu=native\")\n\n# dependencies\n\nset(CMAKE_C_STANDARD   17)\nset(CMAKE_CXX_STANDARD 20)\n\nfind_package(Threads REQUIRED)\n\n# main\n\n# Include static libs for compatibility:\nif(APPLE)\n  set(CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS} -Wl,-search_paths_first 
-lSystem\")\nelseif(UNIX)\n    if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES \"aarch64\")\n        set(CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static\")\n    endif()\nelseif(WIN32)\n  set(CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static\")\nendif()\n\n# Generate a header file with the version number\nconfigure_file(\n  \"${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in\"\n  \"${CMAKE_CURRENT_BINARY_DIR}/config.h\"\n)\n\n# Include the binary directory for the generated header file\ninclude_directories(\"${CMAKE_CURRENT_BINARY_DIR}\")\n\nadd_subdirectory(gpt4all-backend/llama.cpp)\nadd_subdirectory(gpt4all-backend)\nadd_subdirectory(src)\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2023 Jukka Maatta\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "[![CMake](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml/badge.svg)](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml)\n# LlamaGPTJ-chat\nSimple command line chat program for [GPT-J](https://en.wikipedia.org/wiki/GPT-J), [LLaMA](https://en.wikipedia.org/wiki/LLaMA) and [MPT](https://www.mosaicml.com/blog/mpt-7b) models written in C++. Based on [llama.cpp](https://github.com/ggerganov/llama.cpp) and uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all) for full compatibility.\n\n<img alt=\"LlamaGPTJ-chat demo\" src=\"https://user-images.githubusercontent.com/22169537/234323778-64365dc9-8bd9-4a48-b7de-ec0280a5fb4e.gif\" width=\"600\" />\n\n> **Warning**\n> Very early progress, might have bugs\n\n# Table of contents\n<!-- TOC -->\n* [Installation](#installation)\n* [Usage](#usage)\n* [GPT-J, LLaMA, and MPT models](#gpt-j-llama-and-mpt-models)\n* [Detailed command list](#detailed-command-list)\n* [Useful features](#useful-features)\n* [License](#license)\n<!-- TOC -->\n\n## Installation\nSince the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. The [Releases](https://github.com/kuvaus/LlamaGPTJ-chat/releases) link has ready-made binaries. AVX2 is faster and works on most newer computers. If you run the program, it will check and print if your computer has AVX2 support.\n\n### Download\n```sh\ngit clone --recurse-submodules https://github.com/kuvaus/LlamaGPTJ-chat\ncd LlamaGPTJ-chat\n```\nYou need to also download a model file, see [supported models](#gpt-j-llama-and-mpt-models) for details and links.\n\n### Build\nSince the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. \nOn most systems, you only need this to build:\n```sh\nmkdir build\ncd build\ncmake ..\ncmake --build . 
--parallel\n```\n> **Note**\n> \n> If you have an old processor, you can turn AVX2 instructions OFF in the build step with `-DAVX2=OFF` flag.\n> \n> If you have a new processor, you can turn AVX512 instructions ON in the build step with `-DAVX512=ON` flag.\n> \n> On old macOS, set `-DBUILD_UNIVERSAL=OFF` to make the build x86 only instead of the universal Intel/ARM64 binary.\n> On really old macOS, set `-DOLD_MACOS=ON`. This disables `/save` and `/load` but compiles on old Xcode.\n> \n> On Windows you can now use Visual Studio (MSVC) or MinGW. If you want MinGW build instead, set `-G \"MinGW Makefiles\"`.\n>\n> On ARM64 Linux there are no ready-made binaries, but you can now build it from source.\n\n## Usage\n\nAfter compiling, the binary is located at:\n\n```sh\nbuild/bin/chat\n```\nBut you're free to move it anywhere. Simple command for 4 threads to get started:\n```sh\n./chat -m \"/path/to/modelfile/ggml-vicuna-13b-1.1-q4_2.bin\" -t 4\n```\nor\n```sh\n./chat -m \"/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin\" -t 4\n```\n\nHappy chatting!\n\n\n## GPT-J, LLaMA, and MPT models\nCurrent backend supports the GPT-J, LLaMA and MPT models.\n\n### GPT-J model\nYou need to download a GPT-J model first. Here are direct links to models:\n\n>- The default version is **v1.0**: [ggml-gpt4all-j.bin](https://gpt4all.io/models/ggml-gpt4all-j.bin)\n>- At the time of writing the newest is **1.3-groovy**: [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin)\n\nThey're around 3.8 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on GPT-J models from [gpt4all.io](https://gpt4all.io/) or [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) github.\n\n### LLaMA model\nAlternatively you need to download a LLaMA model first. The original weights are for research purposes and you can apply for access [here](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/). 
Below are direct links to derived models:\n\n>- Vicuna 7b **v1.1**: [ggml-vicuna-7b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-7b-1.1-q4_2.bin)\n>- Vicuna 13b **v1.1**: [ggml-vicuna-13b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-13b-1.1-q4_2.bin)\n>- GPT-4-All **l13b-snoozy**: [ggml-gpt4all-l13b-snoozy.bin](https://gpt4all.io/models/ggml-gpt4all-l13b-snoozy.bin)\n\nThe LLaMA models are quite large: the 7B parameter versions are around 4.2 Gb and 13B parameter 8.2 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on LLaMA models from the [whitepaper](https://arxiv.org/abs/2302.13971) or META AI [website](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/).\n\n### MPT model\nYou can also download and use an MPT model instead. Here are direct links to MPT-7B models:\n>- MPT-7B base model pre-trained by Mosaic ML: [ggml-mpt-7b-base.bin](https://gpt4all.io/models/ggml-mpt-7b-base.bin)\n>- MPT-7B instruct model trained by Mosaic ML: [ggml-mpt-7b-instruct.bin](https://gpt4all.io/models/ggml-mpt-7b-instruct.bin)\n>- Non-commercial MPT-7B chat model  trained by Mosaic ML: [ggml-mpt-7b-chat.bin](https://gpt4all.io/models/ggml-mpt-7b-chat.bin)\n\nThey're around 4.9 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. 
You can get more details on MPT models from MosaicML [website](https://www.mosaicml.com/blog/mpt-7b) or [mosaicml/llm-foundry](https://github.com/mosaicml/llm-foundry) github.\n\n## Detailed command list\nYou can view the help and full parameter list with:\n`\n./chat -h\n`\n\n```sh\nusage: ./bin/chat [options]\n\nA simple chat program for GPT-J, LLaMA, and MPT models.\nYou can set specific initial prompt with the -p flag.\nRuns default in interactive and continuous mode.\nType '/reset' to reset the chat context.\nType '/save','/load' to save network state into a binary file.\nType '/save NAME','/load NAME' to rename saves. Default: --save_name NAME.\nType '/help' to show this help dialog.\nType 'quit', 'exit' or, 'Ctrl+C' to quit.\n\noptions:\n  -h, --help            show this help message and exit\n  -v, --version         show version and license information\n  --run-once            disable continuous mode\n  --no-interactive      disable interactive mode altogether (uses given prompt only)\n  --no-animation        disable chat animation\n  --no-saves            disable '/save','/load' functionality\n  -s SEED, --seed SEED  RNG seed for --random-prompt (default: -1)\n  -t N, --threads    N  number of threads to use during computation (default: 4)\n  -p PROMPT, --prompt PROMPT\n                        prompt to start generation with (default: empty)\n  --random-prompt       start with a randomized prompt.\n  -n N, --n_predict  N  number of tokens to predict (default: 200)\n  --top_k            N  top-k sampling (default: 40)\n  --top_p            N  top-p sampling (default: 0.9)\n  --temp             N  temperature (default: 0.9)\n  --n_ctx            N  number of tokens in context window (default: 0)\n  -b N, --batch_size N  batch size for prompt processing (default: 20)\n  --repeat_penalty   N  repeat_penalty (default: 1.1)\n  --repeat_last_n    N  last n tokens to penalize  (default: 64)\n  --context_erase    N  percent of context to erase  (default: 0.8)\n  
--b_token             optional beginning wrap token for response (default: empty)\n  --e_token             optional end wrap token for response (default: empty)\n  -j,   --load_json FNAME\n                        load options instead from json at FNAME (default: empty/no)\n  --load_template   FNAME\n                        load prompt template from a txt file at FNAME (default: empty/no)\n  --save_log        FNAME\n                        save chat log to a file at FNAME (default: empty/no)\n  --load_log        FNAME\n                        load chat log from a file at FNAME (default: empty/no)\n  --save_dir        DIR\n                        directory for saves (default: ./saves)\n  --save_name       NAME\n                        save/load model state binary at save_dir/NAME.bin (current: model_state)\n                        context is saved to save_dir/NAME.ctx (current: model_state)\n  -m FNAME, --model FNAME\n                        model path (current: ./models/ggml-vicuna-13b-1.1-q4_2.bin)\n```\n## Useful features\nHere are some handy features and details on how to achieve them using command line options.\n\n### Save/load chat log and read output from other apps\nBy default, the program prints the chat to standard (stdout) output, so if you're including the program into your app, it only needs to read stdout. You can also save the whole chat log to a text file with `--save_log` option. There's an elementary way to remember your past conversation by simply loading the saved chat log with `--load_log` option when you start a new session.\n\n### Run the program once without user interaction\nIf you only need the program to run once without any user interactions, one way is to set prompt with `-p \"prompt\"` and using `--no-interactive` and `--no-animation` flags. 
The program will read the prompt, print the answer, and close.\n\n### Add AI personalities and characters\nIf you want a personality for your AI, you can change `prompt_template_sample.txt` and use `--load_template` to load the modified file. The only constant is that your input during chat will be put on the `%1` line. Instructions, prompt, response, and everything else can be replaced any way you want. Having different `personality_template.txt` files is an easy way to add different AI characters. With _some_ models, giving both AI and user names instead of `Prompt:` and `Response:`, can make the conversation flow more naturally as the AI tries to mimic a conversation between two people.\n\n### Ability to reset chat context\nYou can reset the chat at any time during chatting by typing `/reset` in the input field. This will clear the AI's memory of past conversations, logits, and tokens. You can then start the chat from a blank slate without having to reload the whole model again.\n\n### Load all parameters using JSON\nYou can also fetch parameters from a json file with `--load_json \"/path/to/file.json\"` flag. Different models might perform better or worse with different input parameters so using json files is a handy way to store and load all the settings at once. The JSON file loader is designed to be simple in order to prevent any external dependencies, and as a result, the JSON file must follow a specific format. 
Here is a simple example:\n\n```javascript\n{\"top_p\": 1.0, \"top_k\": 50400, \"temp\": 0.9, \"n_batch\": 9}\n```\nThis is useful when you want to store different temperature and sampling settings.\n\nAnd a more detailed one:\n```javascript\n{\n\"top_p\": 1.0,\n\"top_k\": 50400,\n\"temp\": 0.9,\n\"n_batch\": 20,\n\"threads\": 12,\n\"prompt\": \"Once upon a time\",\n\"load_template\": \"/path/to/prompt_template_sample.txt\",\n\"model\": \"/path/to/ggml-gpt4all-j-v1.3-groovy.bin\",\n\"no-interactive\": \"true\"\n}\n```\nThis one loads the prompt from the JSON, uses a specific template, and runs the program once in no-interactive mode so the user does not have to press any input.\n\n## License\n\nThis project is licensed under the MIT [License](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/LICENSE)\n"
  },
  {
    "path": "cmake/config.h.in",
    "content": "#ifndef CONFIG_H\n#define CONFIG_H\n\n#define VERSION \"@VERSION_MAJOR@\" \".\" \"@VERSION_MINOR@\" \".\" \"@VERSION_PATCH@\"\n\n#endif // CONFIG_H\n"
  },
  {
    "path": "gpt4all-backend/CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.16)\nset(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)\n\n#if(APPLE)\n#  option(BUILD_UNIVERSAL \"Build a Universal binary on macOS\" ON)\n#  if(BUILD_UNIVERSAL)\n#    # Build a Universal binary on macOS\n#    # This requires that the found Qt library is compiled as Universal binaries.\n#    set(CMAKE_OSX_ARCHITECTURES \"arm64;x86_64\" CACHE STRING \"\" FORCE)\n#  else()\n#    # Build for the host architecture on macOS\n#    set(CMAKE_OSX_ARCHITECTURES \"${CMAKE_HOST_SYSTEM_PROCESSOR}\" CACHE STRING \"\" FORCE)\n#  endif()\n#endif()\n\n# Include the binary directory for the generated header file\n#include_directories(\"${CMAKE_CURRENT_BINARY_DIR}\")\n\n#set(LLMODEL_VERSION_MAJOR 0)\n#set(LLMODEL_VERSION_MINOR 1)\n#set(LLMODEL_VERSION_PATCH 1)\n#set(LLMODEL_VERSION \"${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}\")\n#project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)\n\nset(CMAKE_CXX_STANDARD_REQUIRED ON)\n\n#set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL \"llama: build examples\" FORCE)\n#set(BUILD_SHARED_LIBS ON FORCE)\n\nset(CMAKE_VERBOSE_MAKEFILE ON)\nif (GPT4ALL_AVX_ONLY)\n    set(LLAMA_AVX2 OFF CACHE BOOL \"llama: enable AVX2\" FORCE)\n    set(LLAMA_F16C OFF CACHE BOOL \"llama: enable F16C\" FORCE)\n    set(LLAMA_FMA  OFF CACHE BOOL \"llama: enable FMA\" FORCE)\nendif()\n\n#add_subdirectory(llama.cpp)\n\nadd_library(llmodel\n    gptj.h gptj.cpp\n    llamamodel.h llamamodel.cpp\n    llama.cpp/examples/common.cpp\n    llmodel.h llmodel_c.h llmodel_c.cpp\n    mpt.h mpt.cpp\n    utils.h utils.cpp\n)\n\ntarget_link_libraries(llmodel\n    PRIVATE llama)\n\n#set_target_properties(llmodel PROPERTIES\n#                              VERSION ${PROJECT_VERSION}\n#                              SOVERSION ${PROJECT_VERSION_MAJOR})\n\n#set(COMPONENT_NAME_MAIN ${PROJECT_NAME})\n#set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install)\n"
  },
  {
    "path": "gpt4all-backend/README.md",
    "content": "# GPT4ALL Backend\nThis directory contains the C/C++ model backend used by GPT4All for inference on the CPU. This backend acts as a universal library/wrapper for all models that the GPT4All ecosystem supports. Language bindings are built on top of this universal library. The native GPT4all Chat application directly uses this library for all inference.\n\n# What models are supported by the GPT4All ecosystem?\n\nCurrently, there are three different model architectures that are supported:\n\n1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)\n2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)\n3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b)\n\n# Why so many different architectures? What differentiates them?\n\nOne of the major differences is license. Currently, the LLAMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base models allow commercial usage. In the early advent of the recent explosion of activity in open source local models, the llama models have generally been seen as performing better, but that is changing quickly. Every week - even every day! - new models are released with some of the GPTJ and MPT models competitive in performance/quality with LLAMA. What's more, there are some very nice architectural innovations with the MPT models that could lead to new performance/quality gains.\n\n# How does GPT4All make these models available for CPU inference?\n\nBy leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of this library. 
The original GitHub repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a LLAMA based version [here](https://github.com/ggerganov/llama.cpp). Currently, this backend is using the latter as a submodule.\n\n# Does that mean GPT4All is compatible with all llama.cpp models and vice versa?\n\nUnfortunately, no for three reasons:\n\n1. The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has introduced [a compatibility breaking](https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1) re-quantization method recently. This is a breaking change that renders all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp since that change.\n2. The GPT4All backend has the llama.cpp submodule specifically pinned to a version prior to this breaking change.\n3. The GPT4All backend currently supports MPT based models as an added feature. Neither llama.cpp nor the original ggml repo support this architecture as of this writing, however efforts are underway to make MPT available in the ggml repo which you can follow [here.](https://github.com/ggerganov/ggml/pull/145)\n\n# What is being done to make them more compatible?\n\nA few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.\n\n# What about GPU inference?\n\nIn newer versions of llama.cpp, there has been some added support for NVIDIA GPUs for inference. 
We're investigating how to incorporate this into our downloadable installers.\n\n# Ok, so bottom line... how do I make my model on huggingface compatible with the GPT4All ecosystem right now?\n\n1. Check to make sure the huggingface model is available in one of our three supported architectures\n2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLAMA based models\n3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory \n\n# Check back for updates as we'll try to keep this updated as things change!\n"
  },
  {
    "path": "gpt4all-backend/gptj/placeholder",
    "content": ""
  },
  {
    "path": "gpt4all-backend/gptj.cpp",
    "content": "#include \"gptj.h\"\n#include \"llama.cpp/ggml.h\"\n\n#include \"utils.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <map>\n#include <string>\n#include <vector>\n#include <iostream>\n#if defined(_WIN32) && defined(_MSC_VER)\n    #define WIN32_LEAN_AND_MEAN\n    #ifndef NOMINMAX\n        #define NOMINMAX\n    #endif\n    #include <windows.h>\n    #include <io.h>\n    #include <stdio.h>\n#else\n    #include <unistd.h>\n#endif\n#include <sstream>\n#include <unordered_set>\n\n// default hparams (GPT-J 6B)\nstatic const size_t MB = 1024*1024;\n\nstruct gptj_hparams {\n    int32_t n_vocab = 50400;\n    int32_t n_ctx   = 2048;\n    int32_t n_embd  = 4096;\n    int32_t n_head  = 16;\n    int32_t n_layer = 28;\n    int32_t n_rot   = 64;\n    int32_t f16     = 1;\n};\n\nstruct gptj_layer {\n    // normalization\n    struct ggml_tensor * ln_1_g;\n    struct ggml_tensor * ln_1_b;\n\n    // attention\n    struct ggml_tensor * c_attn_q_proj_w;\n    struct ggml_tensor * c_attn_k_proj_w;\n    struct ggml_tensor * c_attn_v_proj_w;\n\n    struct ggml_tensor * c_attn_proj_w;\n\n    // ff\n    struct ggml_tensor * c_mlp_fc_w;\n    struct ggml_tensor * c_mlp_fc_b;\n\n    struct ggml_tensor * c_mlp_proj_w;\n    struct ggml_tensor * c_mlp_proj_b;\n};\n\nstruct gptj_buffer {\n    uint8_t * addr = NULL;\n    size_t size = 0;\n\n    void resize(size_t size) {\n        delete[] addr;\n        addr = new uint8_t[size];\n        this->size = size;\n    }\n\n    ~gptj_buffer() {\n        fflush(stdout);\n        delete[] addr;\n    }\n};\n\nstruct gptj_kv_cache {\n    struct ggml_tensor * k;\n    struct ggml_tensor * v;\n\n    struct ggml_context * ctx = NULL;\n\n    gptj_buffer buf;\n\n    int n; // number of tokens currently in the cache\n\n    ~gptj_kv_cache() {\n        if (ctx) {\n            ggml_free(ctx);\n        }\n    }\n};\n\nstruct gptj_model {\n    gptj_hparams hparams;\n\n    // normalization\n    
struct ggml_tensor * ln_f_g;\n    struct ggml_tensor * ln_f_b;\n\n    struct ggml_tensor * wte; // position embedding\n\n    struct ggml_tensor * lmh_g; // language model head\n    struct ggml_tensor * lmh_b; // language model bias\n\n    std::vector<gptj_layer> layers;\n\n    // key + value memory\n    struct gptj_kv_cache kv_self;\n\n    //\n    struct ggml_context * ctx;\n    std::map<std::string, struct ggml_tensor *> tensors;\n\n    gptj_buffer buf;\n\n    ~gptj_model() {\n        if (ctx) {\n            ggml_free(ctx);\n        }\n    }\n};\n\nstatic bool kv_cache_init(\n        const struct gptj_hparams & hparams,\n             struct gptj_kv_cache & cache,\n                         ggml_type   wtype,\n                               int   n_ctx) {\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n\n    const int64_t n_mem      = (int64_t)n_layer*n_ctx;\n    const int64_t n_elements = n_embd*n_mem;\n\n    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);\n\n    struct ggml_init_params params;\n    params.mem_size   = cache.buf.size;\n    params.mem_buffer = cache.buf.addr;\n    params.no_alloc   = false;\n\n    cache.ctx = ggml_init(params);\n\n    if (!cache.ctx) {\n        fprintf(stderr, \"%s: failed to allocate memory for kv cache\\n\", __func__);\n        return false;\n    }\n\n    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);\n    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);\n\n    return true;\n}\n\n// load the model's weights from a stream\nbool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) {\n    printf(\"%s: loading model from '%s' - please wait ...\\n\", __func__, fname.c_str());\n\n    // verify magic\n    {\n        uint32_t magic;\n        fin.read((char *) &magic, sizeof(magic));\n        if (magic != 0x67676d6c) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad magic)\\n\", __func__, 
fname.c_str());\n            return false;\n        }\n    }\n\n    // load hparams\n    {\n        auto & hparams = model.hparams;\n\n        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));\n        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));\n        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));\n        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));\n        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));\n        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));\n        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));\n\n        printf(\"%s: n_vocab = %d\\n\", __func__, hparams.n_vocab);\n        printf(\"%s: n_ctx   = %d\\n\", __func__, hparams.n_ctx);\n        printf(\"%s: n_embd  = %d\\n\", __func__, hparams.n_embd);\n        printf(\"%s: n_head  = %d\\n\", __func__, hparams.n_head);\n        printf(\"%s: n_layer = %d\\n\", __func__, hparams.n_layer);\n        printf(\"%s: n_rot   = %d\\n\", __func__, hparams.n_rot);\n        printf(\"%s: f16     = %d\\n\", __func__, hparams.f16);\n    }\n\n    // load vocab\n    {\n        int32_t n_vocab = 0;\n        fin.read((char *) &n_vocab, sizeof(n_vocab));\n\n        if (n_vocab != model.hparams.n_vocab) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad vocab size %d != %d)\\n\",\n                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);\n            return false;\n        }\n\n        std::string word;\n        for (int i = 0; i < n_vocab; i++) {\n            uint32_t len;\n            fin.read((char *) &len, sizeof(len));\n\n            word.resize(len);\n            fin.read((char *) word.data(), len);\n\n            vocab.token_to_id[word] = i;\n            vocab.id_to_token[i] = word;\n        }\n    }\n\n    // for the big tensors, we have the option to store the data in 16-bit floats or quantized\n    // in order to save memory and also to speed up the computation\n    ggml_type 
wtype = GGML_TYPE_COUNT;\n    switch (model.hparams.f16) {\n        case 0: wtype = GGML_TYPE_F32;  break;\n        case 1: wtype = GGML_TYPE_F16;  break;\n        case 2: wtype = GGML_TYPE_Q4_0; break;\n        case 3: wtype = GGML_TYPE_Q4_1; break;\n        case 5: wtype = GGML_TYPE_Q4_2; break;\n        default:\n                {\n                    fprintf(stderr, \"%s: invalid model file '%s' (bad f16 value %d)\\n\",\n                            __func__, fname.c_str(), model.hparams.f16);\n                    return false;\n                }\n    }\n\n    const ggml_type wtype2 = GGML_TYPE_F32;\n\n    auto & ctx = model.ctx;\n\n    size_t ctx_size = 0;\n\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n        const int n_vocab = hparams.n_vocab;\n\n        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g\n        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b\n\n        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte\n\n        ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype);         // lmh_g\n        ctx_size +=        n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b\n\n        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g\n        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b\n\n        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w\n        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w\n        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w\n\n        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w\n\n        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w\n        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b\n\n        ctx_size 
+= n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w\n        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b\n\n        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k\n        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v\n\n        ctx_size += (5 + 10*n_layer)*256; // object overhead\n\n        printf(\"%s: ggml ctx size = %6.2f MB\\n\", __func__, ctx_size/(1024.0*1024.0));\n    }\n\n    // create the ggml context\n    {\n        struct ggml_init_params params = {\n            .mem_size   = ctx_size,\n            .mem_buffer = NULL,\n        };\n\n        model.ctx = ggml_init(params);\n        if (!model.ctx) {\n            fprintf(stderr, \"%s: ggml_init() failed\\n\", __func__);\n            return false;\n        }\n    }\n\n    // prepare memory for the weights\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n        const int n_vocab = hparams.n_vocab;\n\n        model.layers.resize(n_layer);\n\n        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);\n\n        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n\n        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);\n        model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);\n\n        // map by name\n        model.tensors[\"transformer.wte.weight\"] = model.wte;\n\n        model.tensors[\"transformer.ln_f.weight\"] = model.ln_f_g;\n        model.tensors[\"transformer.ln_f.bias\"]   = model.ln_f_b;\n\n        model.tensors[\"lm_head.weight\"] = model.lmh_g;\n        model.tensors[\"lm_head.bias\"]   = model.lmh_b;\n\n        for (int i = 0; i < n_layer; ++i) {\n            auto & layer = 
model.layers[i];\n\n            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);\n            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);\n\n            layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);\n            layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);\n            layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);\n\n            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);\n\n            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);\n            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);\n\n            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);\n            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);\n\n            // map by name\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".ln_1.weight\"]          = layer.ln_1_g;\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".ln_1.bias\"]            = layer.ln_1_b;\n\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".attn.q_proj.weight\"]   = layer.c_attn_q_proj_w;\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".attn.k_proj.weight\"]   = layer.c_attn_k_proj_w;\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".attn.v_proj.weight\"]   = layer.c_attn_v_proj_w;\n\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".attn.out_proj.weight\"] = layer.c_attn_proj_w;\n\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".mlp.fc_in.weight\"]     = layer.c_mlp_fc_w;\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".mlp.fc_in.bias\"]       = layer.c_mlp_fc_b;\n\n            model.tensors[\"transformer.h.\" + std::to_string(i) + 
\".mlp.fc_out.weight\"]    = layer.c_mlp_proj_w;\n            model.tensors[\"transformer.h.\" + std::to_string(i) + \".mlp.fc_out.bias\"]      = layer.c_mlp_proj_b;\n        }\n    }\n\n    // key + value memory\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n\n        const int n_mem      = n_layer*n_ctx;\n        const int n_elements = n_embd*n_mem;\n\n        if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {\n            fprintf(stderr, \"%s: kv_cache_init() failed for self-attention cache\\n\", __func__);\n            ggml_free(ctx);\n            return false;\n        }\n\n        const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);\n        printf(\"%s: kv self size  = %7.2f MB\\n\", __func__, memory_size / 1024.0 / 1024.0);\n    }\n\n    // load weights\n    {\n        int n_tensors = 0;\n        size_t total_size = 0;\n\n        printf(\"%s: \", __func__);\n\n        while (true) {\n            int32_t n_dims;\n            int32_t length;\n            int32_t ftype;\n\n            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));\n            fin.read(reinterpret_cast<char *>(&length), sizeof(length));\n            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));\n\n            if (fin.eof()) {\n                break;\n            }\n\n            int32_t nelements = 1;\n            int32_t ne[2] = { 1, 1 };\n            for (int i = 0; i < n_dims; ++i) {\n                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));\n                nelements *= ne[i];\n            }\n\n            std::string name(length, 0);\n            fin.read(&name[0], length);\n\n            if (model.tensors.find(name.data()) == model.tensors.end()) {\n                fprintf(stderr, \"%s: unknown tensor '%s' in model file\\n\", __func__, 
name.data());\n                return false;\n            }\n\n            auto tensor = model.tensors[name.data()];\n            if (ggml_nelements(tensor) != nelements) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong size in model file\\n\", __func__, name.data());\n                return false;\n            }\n\n            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong shape in model file: got [%lu, %lu], expected [%d, %d]\\n\",\n                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);\n                return false;\n            }\n\n            if (0) {\n                static const char * ftype_str[] = { \"f32\", \"f16\", \"q4_0\", \"q4_1\", };\n                printf(\"%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\\n\", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));\n            }\n\n            size_t bpe = 0;\n\n            switch (ftype) {\n                case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;\n                case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;\n                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;\n                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;\n                default:\n                        {\n                            fprintf(stderr, \"%s: unknown ftype %d in model file\\n\", __func__, ftype);\n                            return false;\n                        }\n            };\n\n            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\\n\",\n                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);\n                return false;\n            }\n\n            fin.read(reinterpret_cast<char 
*>(tensor->data), ggml_nbytes(tensor));\n\n            //printf(\"%42s - [%5d, %5d], type = %6s, %6.2f MB\\n\", name.data(), ne[0], ne[1], ftype == 0 ? \"float\" : \"f16\", ggml_nbytes(tensor)/1024.0/1024.0);\n            total_size += ggml_nbytes(tensor);\n            if (++n_tensors % 8 == 0) {\n                printf(\".\");\n                fflush(stdout);\n            }\n        }\n\n        printf(\" done\\n\");\n\n        printf(\"%s: model size = %8.2f MB / num tensors = %d\\n\", __func__, total_size/1024.0/1024.0, n_tensors);\n    }\n\n    return true;\n}\n\n// load the model's weights from a file path\nbool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {\n\n    auto fin = std::ifstream(fname, std::ios::binary);\n    if (!fin) {\n        fprintf(stderr, \"%s: failed to open '%s'\\n\", __func__, fname.c_str());\n        return false;\n    }\n\n    bool loaded = gptj_model_load(fname, fin, model, vocab);\n    fin.close();\n    return loaded;\n}\n\n// evaluate the transformer\n//\n//   - model:     the model\n//   - n_threads: number of threads to use\n//   - n_past:    the context size so far\n//   - embd_inp:  the embeddings of the tokens in the context\n//   - embd_w:    the predicted logits for the next token\n//\n// The GPT-J model requires about 16MB of memory per input token.\n//\nbool gptj_eval(\n        gptj_model & model,\n        const int n_threads,\n        const int n_past,\n        const std::vector<gpt_vocab::id> & embd_inp,\n              std::vector<float>         & embd_w,\n              size_t                     & mem_per_token) {\n    const int N = embd_inp.size();\n\n    const auto & hparams = model.hparams;\n\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n    const int n_ctx   = hparams.n_ctx;\n    const int n_head  = hparams.n_head;\n    const int n_vocab = hparams.n_vocab;\n    const int n_rot   = hparams.n_rot;\n\n    const int d_key = n_embd/n_head;\n\n    
const size_t init_buf_size = 1024u*MB;\n    if (!model.buf.addr || model.buf.size < init_buf_size)\n        model.buf.resize(init_buf_size);\n\n    if (mem_per_token > 0 && mem_per_token*N > model.buf.size) {\n        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead\n        printf(\"\\n%s: reallocating buffer from %zu to %zu bytes\\n\", __func__, model.buf.size, buf_size_new);\n\n        // reallocate\n        model.buf.resize(buf_size_new);\n        if (model.buf.addr == nullptr) {\n            fprintf(stderr, \"%s: failed to allocate %zu bytes\\n\", __func__, model.buf.size);\n            return false;\n        }\n    }\n\n    struct ggml_init_params params = {\n        .mem_size   = model.buf.size,\n        .mem_buffer = model.buf.addr,\n    };\n\n    struct ggml_context * ctx0 = ggml_init(params);\n    struct ggml_cgraph gf = { .n_threads = n_threads };\n\n    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);\n    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));\n\n    // wte\n    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);\n\n    for (int il = 0; il < n_layer; ++il) {\n        struct ggml_tensor * cur;\n\n        // norm\n        {\n            cur = ggml_norm(ctx0, inpL);\n\n            // cur = ln_1_g*cur + ln_1_b\n            cur = ggml_add(ctx0,\n                    ggml_mul(ctx0,\n                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),\n                        cur),\n                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));\n        }\n\n        struct ggml_tensor * inpSA = cur;\n\n        // self-attention\n        {\n            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);\n            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);\n            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);\n\n     
       // store key and value to memory\n            {\n                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));\n                struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_self.v, N*n_embd, (ggml_element_size(model.kv_self.v)*n_embd)*(il*n_ctx + n_past));\n\n                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));\n                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));\n            }\n\n            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)\n            struct ggml_tensor * Q =\n                ggml_permute(ctx0,\n                        ggml_rope(ctx0,\n                            ggml_cpy(ctx0,\n                                Qcur,\n                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),\n                            n_past, n_rot, 0),\n                        0, 2, 1, 3);\n\n            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)\n            struct ggml_tensor * K =\n                ggml_permute(ctx0,\n                        ggml_rope(ctx0,\n                            ggml_reshape_3d(ctx0,\n                                ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),\n                                n_embd/n_head, n_head, n_past + N),\n                            n_past, n_rot, 1),\n                        0, 2, 1, 3);\n\n            // K * Q\n            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);\n\n            // KQ_scaled = KQ / sqrt(n_embd/n_head)\n            struct ggml_tensor * KQ_scaled =\n                ggml_scale(ctx0,\n                        KQ,\n                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))\n                        );\n\n            // KQ_masked = mask_past(KQ_scaled)\n            struct ggml_tensor * 
KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);\n\n            // KQ = soft_max(KQ_masked)\n            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);\n\n            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()\n            struct ggml_tensor * V_trans =\n                ggml_cpy(ctx0,\n                        ggml_permute(ctx0,\n                            ggml_reshape_3d(ctx0,\n                                ggml_view_1d(ctx0, model.kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd),\n                                n_embd/n_head, n_head, n_past + N),\n                            1, 2, 0, 3),\n                        ggml_new_tensor_3d(ctx0, model.kv_self.v->type, n_past + N, n_embd/n_head, n_head));\n\n            // KQV = transpose(V) * KQ_soft_max\n            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);\n\n            // KQV_merged = KQV.permute(0, 2, 1, 3)\n            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);\n\n            // cur = KQV_merged.contiguous().view(n_embd, N)\n            cur = ggml_cpy(ctx0,\n                    KQV_merged,\n                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));\n\n            // projection (no bias)\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].c_attn_proj_w,\n                    cur);\n        }\n\n        struct ggml_tensor * inpFF = cur;\n\n        // feed-forward network\n        // this is independent of the self-attention result, so it could be done in parallel to the self-attention\n        {\n            // note here we pass inpSA instead of cur\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].c_mlp_fc_w,\n                    inpSA);\n\n            cur = ggml_add(ctx0,\n                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),\n                    cur);\n\n            // 
GELU activation\n            cur = ggml_gelu(ctx0, cur);\n\n            // projection\n            // cur = proj_w*cur + proj_b\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].c_mlp_proj_w,\n                    cur);\n\n            cur = ggml_add(ctx0,\n                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),\n                    cur);\n        }\n\n        // self-attention + FF\n        cur  = ggml_add(ctx0, cur, inpFF);\n\n        // input for next layer\n        inpL = ggml_add(ctx0, cur, inpL);\n    }\n\n    // norm\n    {\n        inpL = ggml_norm(ctx0, inpL);\n\n        // inpL = ln_f_g*inpL + ln_f_b\n        inpL = ggml_add(ctx0,\n                ggml_mul(ctx0,\n                    ggml_repeat(ctx0, model.ln_f_g, inpL),\n                    inpL),\n                ggml_repeat(ctx0, model.ln_f_b, inpL));\n    }\n\n    // lm_head\n    {\n        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);\n\n        inpL = ggml_add(ctx0,\n                ggml_repeat(ctx0, model.lmh_b, inpL),\n                inpL);\n    }\n\n    // logits -> probs\n    //inpL = ggml_soft_max(ctx0, inpL);\n\n    // run the computation\n    ggml_build_forward_expand(&gf, inpL);\n    ggml_graph_compute       (ctx0, &gf);\n\n    //if (n_past%100 == 0) {\n    //    ggml_graph_print   (&gf);\n    //    ggml_graph_dump_dot(&gf, NULL, \"gpt-2.dot\");\n    //}\n\n    //embd_w.resize(n_vocab*N);\n    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);\n\n    // return result for just the last token\n    embd_w.resize(n_vocab);\n    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);\n\n    if (mem_per_token == 0) {\n        mem_per_token = ggml_used_mem(ctx0)/N;\n    }\n    //printf(\"used_mem = %zu\\n\", ggml_used_mem(ctx0));\n\n    ggml_free(ctx0);\n\n    return true;\n}\n\n#define GPTJ_MAX_RNG_STATE 64*1024\n\nsize_t gptj_get_state_size(const gptj_model &model)\n{\n    // we don't know size 
of rng until we actually serialize it. so reserve more than enough memory for its serialized state.\n    // for reference, std::mt19937(1337) serializes to 6701 bytes.\n    const size_t s_rng_size        = sizeof(size_t);\n    const size_t s_rng             = GPTJ_MAX_RNG_STATE;\n    const size_t s_kv_size         = sizeof(size_t);\n    const size_t s_kv_ntok         = sizeof(int);\n    const size_t s_kv              = model.kv_self.buf.size;\n    const size_t s_total = (\n        + s_rng_size\n        + s_rng\n        + s_kv_size\n        + s_kv_ntok\n        + s_kv\n    );\n    fflush(stdout);\n    return s_total;\n}\n\nsize_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest)\n{\n    uint8_t * out = dest;\n    fflush(stdout);\n    // copy rng\n    {\n        std::stringstream rng_ss;\n        rng_ss << rng;\n\n        const size_t rng_size = rng_ss.str().size();\n        char rng_buf[GPTJ_MAX_RNG_STATE];\n\n        memset(&rng_buf[0], 0, GPTJ_MAX_RNG_STATE);\n        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());\n\n        memcpy(out, &rng_size,   sizeof(rng_size));   out += sizeof(rng_size);\n        memcpy(out, &rng_buf[0], GPTJ_MAX_RNG_STATE); out += GPTJ_MAX_RNG_STATE;\n    }\n\n    // copy kv cache\n    {\n        const size_t kv_size = model.kv_self.buf.size;\n        const int    kv_ntok = model.kv_self.n;\n\n        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);\n        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);\n\n        if (kv_size) {\n            memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;\n        }\n    }\n\n    const size_t written  = out - dest;\n    const size_t expected = gptj_get_state_size(model);\n    assert(written == expected);\n    fflush(stdout);\n    return written;\n}\n\nsize_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src)\n{\n    const uint8_t * in = src;\n\n    // set rng\n    {\n        size_t 
rng_size;\n        char   rng_buf[GPTJ_MAX_RNG_STATE];\n\n        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);\n        memcpy(&rng_buf[0], in, GPTJ_MAX_RNG_STATE); in += GPTJ_MAX_RNG_STATE;\n\n        std::stringstream rng_ss;\n        rng_ss.str(std::string(&rng_buf[0], rng_size));\n        rng_ss >> *rng;\n\n        assert(rng_ss.fail() == false);\n    }\n\n    // set kv cache\n    {\n        size_t kv_size;\n        int kv_ntok;\n\n        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);\n        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);\n\n        if (kv_size) {\n            assert(model->kv_self.buf.size == kv_size);\n\n            void * k_data = model->kv_self.k->data; // remember data pointers\n            void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy\n\n            memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;\n\n            model->kv_self.k->data = k_data; // restore correct data pointers\n            model->kv_self.v->data = v_data;\n\n        }\n\n        model->kv_self.n = kv_ntok;\n    }\n\n    const size_t nread    = in - src;\n    const size_t expected = gptj_get_state_size(*model);\n    assert(nread == expected);\n    fflush(stdout);\n    return nread;\n}\n\nstruct GPTJPrivate {\n    const std::string modelPath;\n    bool modelLoaded;\n    gpt_vocab vocab;\n    gptj_model *model = nullptr;\n    int64_t n_threads = 0;\n    size_t mem_per_token = 0;\n    std::mt19937 rng;\n};\n\nGPTJ::GPTJ()\n    : d_ptr(new GPTJPrivate) {\n\n    d_ptr->model = new gptj_model;\n    d_ptr->modelLoaded = false;\n}\n\nbool GPTJ::loadModel(const std::string &modelPath) {\n    std::mt19937 rng(time(NULL));\n    d_ptr->rng = rng;\n\n    auto fin = std::ifstream(modelPath, std::ios::binary);\n\n    // load the model\n    if (!gptj_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) {\n        std::cerr << \"GPT-J ERROR: failed to load 
model from \" <<  modelPath;\n        return false;\n    }\n\n    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());\n    d_ptr->modelLoaded = true;\n    fflush(stdout);\n    return true;\n}\n\nvoid GPTJ::setThreadCount(int32_t n_threads) {\n    d_ptr->n_threads = n_threads;\n}\n\nint32_t GPTJ::threadCount() const\n{\n    return d_ptr->n_threads;\n}\n\nGPTJ::~GPTJ()\n{\n    delete d_ptr->model;\n}\n\nbool GPTJ::isModelLoaded() const\n{\n    return d_ptr->modelLoaded;\n}\n\nsize_t GPTJ::stateSize() const\n{\n    return gptj_get_state_size(*d_ptr->model);\n}\n\nsize_t GPTJ::saveState(uint8_t *dest) const\n{\n    return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest);\n}\n\nsize_t GPTJ::restoreState(const uint8_t *src)\n{\n    return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src);\n}\n\nvoid GPTJ::prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &promptCtx) {\n\n    if (!isModelLoaded()) {\n        std::cerr << \"GPT-J ERROR: prompt won't work with an unloaded model!\\n\";\n        return;\n    }\n\n    const int64_t t_main_start_us = ggml_time_us();\n\n    int64_t t_sample_us  = 0;\n    int64_t t_predict_us = 0;\n    int64_t t_prompt_us = 0;\n\n    // tokenize the prompt\n    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt);\n\n    // save the context size\n    promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;\n\n    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {\n        responseCallback(-1, \"ERROR: The prompt size exceeds the context window size and cannot be processed.\");\n        std::cerr << \"GPT-J ERROR: The prompt is\" << embd_inp.size() <<\n            \"tokens and the context window is\" << promptCtx.n_ctx << \"!\\n\";\n        return;\n    }\n\n    promptCtx.n_predict = std::min(promptCtx.n_predict, 
promptCtx.n_ctx - (int) embd_inp.size());\n    promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);\n\n    // determine the required inference memory per token:\n    static bool initialized = false;\n    static std::vector<gpt_vocab::id> p_instruct;\n    static std::vector<gpt_vocab::id> r_instruct;\n    if (!initialized) {\n        gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits,\n            d_ptr->mem_per_token);\n        initialized = true;\n    }\n\n    // process the prompt in batches\n    size_t i = 0;\n    const int64_t t_start_prompt_us = ggml_time_us();\n    while (i < embd_inp.size()) {\n        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());\n        std::vector<gpt_vocab::id> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) {\n            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"GPTJ: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n            assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n        }\n\n        if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"GPT-J ERROR: Failed to process prompt\\n\";\n            return;\n        }\n\n        size_t tokens = batch_end - i;\n        for (size_t t = 0; t < tokens; ++t) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            
promptCtx.tokens.push_back(batch.at(t));\n            if (!promptCallback(batch.at(t)))\n                return;\n        }\n        promptCtx.n_past += batch.size();\n        i = batch_end;\n    }\n    t_prompt_us += ggml_time_us() - t_start_prompt_us;\n\n    int p_instructFound = 0;\n    int r_instructFound = 0;\n\n    std::string cachedResponse;\n    std::vector<gpt_vocab::id> cachedTokens;\n    std::unordered_set<std::string> reversePrompts\n        = { \"### Instruction\", \"### Prompt\", \"### Response\", \"### Human\", \"### Assistant\", \"### Context\" };\n\n    // predict next tokens\n    int32_t totalPredictions = 0;\n    for (int i = 0; i < promptCtx.n_predict; i++) {\n\n        // sample next token\n        const int n_vocab = d_ptr->model->hparams.n_vocab;\n        gpt_vocab::id id = 0;\n        {\n            const int64_t t_start_sample_us = ggml_time_us();\n            const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());\n            id = gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab,\n                promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,\n                n_prev_toks,\n                promptCtx.logits,\n                promptCtx.top_k, promptCtx.top_p, promptCtx.temp,\n                promptCtx.repeat_penalty,\n                d_ptr->rng);\n\n            t_sample_us += ggml_time_us() - t_start_sample_us;\n        }\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {\n            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"GPTJ: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n   
         assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);\n        }\n\n        const int64_t t_start_predict_us = ggml_time_us();\n        if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"GPT-J ERROR: Failed to predict next token\\n\";\n            return;\n        }\n        t_predict_us += ggml_time_us() - t_start_predict_us;\n\n        promptCtx.n_past += 1;\n        // display text\n        ++totalPredictions;\n\n        if (id == 50256 /*end of text*/)\n            goto stop_generating;\n\n        const std::string str = d_ptr->vocab.id_to_token[id];\n\n        // Check if the provided str is part of our reverse prompts\n        bool foundPartialReversePrompt = false;\n        const std::string completed = cachedResponse + str;\n        if (reversePrompts.find(completed) != reversePrompts.end()) {\n            goto stop_generating;\n        }\n\n        // Check if it partially matches our reverse prompts and if so, cache\n        for (auto s : reversePrompts) {\n            if (s.compare(0, completed.size(), completed) == 0) {\n                foundPartialReversePrompt = true;\n                cachedResponse = completed;\n                break;\n            }\n        }\n\n        // Regardless the token gets added to our cache\n        cachedTokens.push_back(id);\n\n        // Continue if we have found a partial match\n        if (foundPartialReversePrompt)\n            continue;\n\n        // Empty the cache\n        for (auto t : cachedTokens) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            promptCtx.tokens.push_back(t);\n            if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))\n                goto stop_generating;\n        }\n        cachedTokens.clear();\n    }\n\nstop_generating:\n\n#if 0\n    // report timing\n    {\n        const int64_t 
t_main_end_us = ggml_time_us();\n\n        std::cout << \"GPT-J INFO: mem per token = \" << mem_per_token << \" bytes\\n\";\n        std::cout << \"GPT-J INFO:   sample time = \" << t_sample_us/1000.0f << \" ms\\n\";\n        std::cout << \"GPT-J INFO:   prompt time = \" << t_prompt_us/1000.0f << \" ms\\n\";\n        std::cout << \"GPT-J INFO:  predict time = \" << t_predict_us/1000.0f << \" ms / \" << t_predict_us/1000.0f/totalPredictions << \" ms per token\\n\";\n        std::cout << \"GPT-J INFO:    total time = \" << (t_main_end_us - t_main_start_us)/1000.0f << \" ms\\n\";\n        fflush(stdout);\n    }\n#endif\n\n    return;\n}\n\nvoid GPTJ::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)\n{\n    size_t i = 0;\n    promptCtx.n_past = 0;\n    while (i < promptCtx.tokens.size()) {\n        size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());\n        std::vector<gpt_vocab::id> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);\n\n        assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n\n        if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"GPTJ ERROR: Failed to process prompt\\n\";\n            goto stop_generating;\n        }\n        promptCtx.n_past += batch.size();\n        if (!recalculate(true))\n            goto stop_generating;\n        i = batch_end;\n    }\n    assert(promptCtx.n_past == promptCtx.tokens.size());\n\nstop_generating:\n    recalculate(false);\n}\n"
  },
  {
    "path": "gpt4all-backend/gptj.h",
    "content": "#ifndef GPTJ_H\n#define GPTJ_H\n\n#include <string>\n#include <functional>\n#include <vector>\n#include \"llmodel.h\"\n\nclass GPTJPrivate;\nclass GPTJ : public LLModel {\npublic:\n    GPTJ();\n    ~GPTJ();\n\n    bool loadModel(const std::string &modelPath) override;\n    bool isModelLoaded() const override;\n    size_t stateSize() const override;\n    size_t saveState(uint8_t *dest) const override;\n    size_t restoreState(const uint8_t *src) override;\n    void prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &ctx) override;\n    void setThreadCount(int32_t n_threads) override;\n    int32_t threadCount() const override;\n\nprotected:\n    void recalculateContext(PromptContext &promptCtx,\n        std::function<bool(bool)> recalculate) override;\n\nprivate:\n    GPTJPrivate *d_ptr;\n};\n\n#endif // GPTJ_H\n"
  },
  {
    "path": "gpt4all-backend/llama/placeholder",
    "content": ""
  },
  {
    "path": "gpt4all-backend/llamamodel.cpp",
    "content": "#include \"llamamodel.h\"\n\n#include \"llama.cpp/examples/common.h\"\n#include \"llama.cpp/llama.h\"\n#include \"llama.cpp/ggml.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <map>\n#include <string>\n#include <vector>\n#include <iostream>\n#if defined(_WIN32) && defined(_MSC_VER)\n    #define WIN32_LEAN_AND_MEAN\n    #ifndef NOMINMAX\n        #define NOMINMAX\n    #endif\n    #include <windows.h>\n    #include <io.h>\n    #include <stdio.h>\n#else\n    #include <unistd.h>\n#endif\n#include <random>\n#include <thread>\n#include <unordered_set>\n\nstruct LLamaPrivate {\n    const std::string modelPath;\n    bool modelLoaded;\n    llama_context *ctx = nullptr;\n    llama_context_params params;\n    int64_t n_threads = 0;\n};\n\nLLamaModel::LLamaModel()\n    : d_ptr(new LLamaPrivate) {\n\n    d_ptr->modelLoaded = false;\n}\n\nbool LLamaModel::loadModel(const std::string &modelPath)\n{\n    // load the model\n    d_ptr->params = llama_context_default_params();\n\n    gpt_params params;\n    d_ptr->params.n_ctx      = 2048;\n    d_ptr->params.n_parts    = params.n_parts;\n    d_ptr->params.seed       = params.seed;\n    d_ptr->params.f16_kv     = params.memory_f16;\n    d_ptr->params.use_mmap   = params.use_mmap;\n#if defined (__APPLE__)\n    d_ptr->params.use_mlock  = true;\n#else\n    d_ptr->params.use_mlock  = params.use_mlock;\n#endif\n\n    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);\n    if (!d_ptr->ctx) {\n        std::cerr << \"LLAMA ERROR: failed to load model from \" <<  modelPath << std::endl;\n        return false;\n    }\n\n    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());\n    d_ptr->modelLoaded = true;\n    fflush(stderr);\n    return true;\n}\n\nvoid LLamaModel::setThreadCount(int32_t n_threads) {\n    d_ptr->n_threads = n_threads;\n}\n\nint32_t LLamaModel::threadCount() const\n{\n    return 
d_ptr->n_threads;\n}\n\nLLamaModel::~LLamaModel()\n{\n    llama_free(d_ptr->ctx);\n}\n\nbool LLamaModel::isModelLoaded() const\n{\n    return d_ptr->modelLoaded;\n}\n\nsize_t LLamaModel::stateSize() const\n{\n    return llama_get_state_size(d_ptr->ctx);\n}\n\nsize_t LLamaModel::saveState(uint8_t *dest) const\n{\n    return llama_copy_state_data(d_ptr->ctx, dest);\n}\n\nsize_t LLamaModel::restoreState(const uint8_t *src)\n{\n    return llama_set_state_data(d_ptr->ctx, src);\n}\n\nvoid LLamaModel::prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &promptCtx) {\n\n    if (!isModelLoaded()) {\n        std::cerr << \"LLAMA ERROR: prompt won't work with an unloaded model!\\n\";\n        return;\n    }\n\n    gpt_params params;\n    params.prompt = prompt;\n\n    // Add a space in front of the first character to match OG llama tokenizer behavior\n    params.prompt.insert(0, 1, ' ');\n\n    // tokenize the prompt\n    auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false);\n\n    // save the context size\n    promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx);\n\n    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {\n        responseCallback(-1, \"The prompt size exceeds the context window size and cannot be processed.\");\n        std::cerr << \"LLAMA ERROR: The prompt is\" << embd_inp.size() <<\n            \"tokens and the context window is\" << promptCtx.n_ctx << \"!\\n\";\n        return;\n    }\n\n    promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());\n    promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);\n\n    // number of tokens to keep when resetting context\n    params.n_keep = (int)embd_inp.size();\n\n    // process the prompt in batches\n    size_t i = 0;\n    const int64_t t_start_prompt_us = ggml_time_us();\n 
   while (i < embd_inp.size()) {\n        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());\n        std::vector<llama_token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) {\n            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"LLAMA: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n            assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n        }\n\n        if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) {\n            std::cerr << \"LLAMA ERROR: Failed to process prompt\\n\";\n            return;\n        }\n\n        size_t tokens = batch_end - i;\n        for (size_t t = 0; t < tokens; ++t) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            promptCtx.tokens.push_back(batch.at(t));\n            if (!promptCallback(batch.at(t)))\n                return;\n        }\n        promptCtx.n_past += batch.size();\n        i = batch_end;\n    }\n\n    std::string cachedResponse;\n    std::vector<llama_token> cachedTokens;\n    std::unordered_set<std::string> reversePrompts\n        = { \"### Instruction\", \"### Prompt\", \"### Response\", \"### Human\", \"### Assistant\", \"### Context\" };\n\n    // predict next tokens\n    int32_t totalPredictions = 0;\n    for (int i = 0; i < promptCtx.n_predict; i++) {\n        // sample next token\n        const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, 
promptCtx.tokens.size());\n        llama_token id = llama_sample_top_p_top_k(d_ptr->ctx,\n            promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,\n            n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,\n            promptCtx.repeat_penalty);\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {\n            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"LLAMA: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n            assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);\n        }\n\n        if (llama_eval(d_ptr->ctx, &id, 1, promptCtx.n_past, d_ptr->n_threads)) {\n            std::cerr << \"LLAMA ERROR: Failed to predict next token\\n\";\n            return;\n        }\n\n        promptCtx.n_past += 1;\n        // display text\n        ++totalPredictions;\n        if (id == llama_token_eos())\n            return;\n\n        const std::string str = llama_token_to_str(d_ptr->ctx, id);\n\n        // Check if the provided str is part of our reverse prompts\n        bool foundPartialReversePrompt = false;\n        const std::string completed = cachedResponse + str;\n        if (reversePrompts.find(completed) != reversePrompts.end()) {\n            return;\n        }\n\n        // Check if it partially matches our reverse prompts and if so, cache\n        for (auto s : reversePrompts) {\n            if (s.compare(0, completed.size(), completed) == 0) {\n                foundPartialReversePrompt = true;\n                cachedResponse = completed;\n                break;\n            }\n        }\n\n        // Regardless the token 
gets added to our cache\n        cachedTokens.push_back(id);\n\n        // Continue if we have found a partial match\n        if (foundPartialReversePrompt)\n            continue;\n\n        // Empty the cache\n        for (auto t : cachedTokens) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            promptCtx.tokens.push_back(t);\n            if (!responseCallback(t, llama_token_to_str(d_ptr->ctx, t)))\n                return;\n        }\n        cachedTokens.clear();\n    }\n}\n\nvoid LLamaModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)\n{\n    size_t i = 0;\n    promptCtx.n_past = 0;\n    while (i < promptCtx.tokens.size()) {\n        size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());\n        std::vector<llama_token> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);\n\n        assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n\n        if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) {\n            std::cerr << \"LLAMA ERROR: Failed to process prompt\\n\";\n            goto stop_generating;\n        }\n        promptCtx.n_past += batch.size();\n        if (!recalculate(true))\n            goto stop_generating;\n        i = batch_end;\n    }\n    assert(promptCtx.n_past == promptCtx.tokens.size());\n\nstop_generating:\n    recalculate(false);\n}\n"
  },
  {
    "path": "gpt4all-backend/llamamodel.h",
    "content": "#ifndef LLAMAMODEL_H\n#define LLAMAMODEL_H\n\n#include <string>\n#include <functional>\n#include <vector>\n#include \"llmodel.h\"\n\nclass LLamaPrivate;\nclass LLamaModel : public LLModel {\npublic:\n    LLamaModel();\n    ~LLamaModel();\n\n    bool loadModel(const std::string &modelPath) override;\n    bool isModelLoaded() const override;\n    size_t stateSize() const override;\n    size_t saveState(uint8_t *dest) const override;\n    size_t restoreState(const uint8_t *src) override;\n    void prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &ctx) override;\n    void setThreadCount(int32_t n_threads) override;\n    int32_t threadCount() const override;\n\nprotected:\n    void recalculateContext(PromptContext &promptCtx,\n        std::function<bool(bool)> recalculate) override;\n\nprivate:\n    LLamaPrivate *d_ptr;\n};\n\n#endif // LLAMAMODEL_H"
  },
  {
    "path": "gpt4all-backend/llmodel.h",
    "content": "#ifndef LLMODEL_H\n#define LLMODEL_H\n\n#include <string>\n#include <functional>\n#include <vector>\n#include <cstdint>\n\nclass LLModel {\npublic:\n    explicit LLModel() {}\n    virtual ~LLModel() {}\n\n    virtual bool loadModel(const std::string &modelPath) = 0;\n    virtual bool isModelLoaded() const = 0;\n    virtual size_t stateSize() const { return 0; }\n    virtual size_t saveState(uint8_t *dest) const { return 0; }\n    virtual size_t restoreState(const uint8_t *src) { return 0; }\n    struct PromptContext {\n        std::vector<float> logits;      // logits of current context\n        std::vector<int32_t> tokens;    // current tokens in the context window\n        int32_t n_past = 0;             // number of tokens in past conversation\n        int32_t n_ctx = 0;              // number of tokens possible in context window\n        int32_t n_predict = 200;\n        int32_t top_k = 40;\n        float   top_p = 0.9f;\n        float   temp = 0.9f;\n        int32_t n_batch = 9;\n        float   repeat_penalty = 1.10f;\n        int32_t repeat_last_n = 64;     // last n tokens to penalize\n        float   contextErase = 0.75f;   // percent of context to erase if we exceed the context\n                                        // window\n    };\n    virtual void prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &ctx) = 0;\n    virtual void setThreadCount(int32_t n_threads) {}\n    virtual int32_t threadCount() const { return 1; }\n\nprotected:\n    virtual void recalculateContext(PromptContext &promptCtx,\n        std::function<bool(bool)> recalculate) = 0;\n};\n\n#endif // LLMODEL_H\n"
  },
  {
    "path": "gpt4all-backend/llmodel_c.cpp",
    "content": "#include \"llmodel_c.h\"\n\n#include \"gptj.h\"\n#include \"llamamodel.h\"\n#include \"mpt.h\"\n\nstruct LLModelWrapper {\n    LLModel *llModel = nullptr;\n    LLModel::PromptContext promptContext;\n};\n\nllmodel_model llmodel_gptj_create()\n{\n    LLModelWrapper *wrapper = new LLModelWrapper;\n    wrapper->llModel = new GPTJ;\n    return reinterpret_cast<void*>(wrapper);\n}\n\nvoid llmodel_gptj_destroy(llmodel_model gptj)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(gptj);\n    delete wrapper->llModel;\n    delete wrapper;\n}\n\nllmodel_model llmodel_mpt_create()\n{\n    LLModelWrapper *wrapper = new LLModelWrapper;\n    wrapper->llModel = new MPT;\n    return reinterpret_cast<void*>(wrapper);\n}\n\nvoid llmodel_mpt_destroy(llmodel_model mpt)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(mpt);\n    delete wrapper->llModel;\n    delete wrapper;\n}\n\nllmodel_model llmodel_llama_create()\n{\n    LLModelWrapper *wrapper = new LLModelWrapper;\n    wrapper->llModel = new LLamaModel;\n    return reinterpret_cast<void*>(wrapper);\n}\n\nvoid llmodel_llama_destroy(llmodel_model llama)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(llama);\n    delete wrapper->llModel;\n    delete wrapper;\n}\n\nllmodel_model llmodel_model_create(const char *model_path) {\n\n    uint32_t magic;\n    llmodel_model model;\n    FILE *f = fopen(model_path, \"rb\");\n    fread(&magic, sizeof(magic), 1, f);\n\n    if (magic == 0x67676d6c) { model = llmodel_gptj_create();  }\n    else if (magic == 0x67676a74) { model = llmodel_llama_create(); }\n    else if (magic == 0x67676d6d) { model = llmodel_mpt_create();   }\n    else  {fprintf(stderr, \"Invalid model file\\n\");}\n    fclose(f);\n    return model;\n}\n\nvoid llmodel_model_destroy(llmodel_model model) {\n\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);\n    const std::type_info &modelTypeInfo = typeid(*wrapper->llModel);\n\n    if 
(modelTypeInfo == typeid(GPTJ))       { llmodel_gptj_destroy(model);  }\n    if (modelTypeInfo == typeid(LLamaModel)) { llmodel_llama_destroy(model); }\n    if (modelTypeInfo == typeid(MPT))        { llmodel_mpt_destroy(model);   }\n}\n\nbool llmodel_loadModel(llmodel_model model, const char *model_path)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);\n    return wrapper->llModel->loadModel(model_path);\n}\n\nbool llmodel_isModelLoaded(llmodel_model model)\n{\n    const auto *llm = reinterpret_cast<LLModelWrapper*>(model)->llModel;\n    return llm->isModelLoaded();\n}\n\nuint64_t llmodel_get_state_size(llmodel_model model)\n{\n    const auto *llm = reinterpret_cast<LLModelWrapper*>(model)->llModel;\n    return llm->stateSize();\n}\n\nuint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)\n{\n    const auto *llm = reinterpret_cast<LLModelWrapper*>(model)->llModel;\n    return llm->saveState(dest);\n}\n\nuint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);\n    return wrapper->llModel->restoreState(src);\n}\n\n// Wrapper functions for the C callbacks\nbool prompt_wrapper(int32_t token_id, void *user_data) {\n    llmodel_prompt_callback callback = reinterpret_cast<llmodel_prompt_callback>(user_data);\n    return callback(token_id);\n}\n\nbool response_wrapper(int32_t token_id, const std::string &response, void *user_data) {\n    llmodel_response_callback callback = reinterpret_cast<llmodel_response_callback>(user_data);\n    return callback(token_id, response.c_str());\n}\n\nbool recalculate_wrapper(bool is_recalculating, void *user_data) {\n    llmodel_recalculate_callback callback = reinterpret_cast<llmodel_recalculate_callback>(user_data);\n    return callback(is_recalculating);\n}\n\nvoid llmodel_prompt(llmodel_model model, const char *prompt,\n                    llmodel_prompt_callback prompt_callback,\n                    
llmodel_response_callback response_callback,\n                    llmodel_recalculate_callback recalculate_callback,\n                    llmodel_prompt_context *ctx)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);\n\n    // Create std::function wrappers that call the C function pointers\n    std::function<bool(int32_t)> prompt_func =\n        std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast<void*>(prompt_callback));\n    std::function<bool(int32_t, const std::string&)> response_func =\n        std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast<void*>(response_callback));\n    std::function<bool(bool)> recalc_func =\n        std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast<void*>(recalculate_callback));\n\n    // Copy the C prompt context\n    wrapper->promptContext.n_past = ctx->n_past;\n    wrapper->promptContext.n_ctx = ctx->n_ctx;\n    wrapper->promptContext.n_predict = ctx->n_predict;\n    wrapper->promptContext.top_k = ctx->top_k;\n    wrapper->promptContext.top_p = ctx->top_p;\n    wrapper->promptContext.temp = ctx->temp;\n    wrapper->promptContext.n_batch = ctx->n_batch;\n    wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;\n    wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;\n    wrapper->promptContext.contextErase = ctx->context_erase;\n\n    // Call the C++ prompt method\n    wrapper->llModel->prompt(prompt, prompt_func, response_func, recalc_func, wrapper->promptContext);\n\n    // Update the C context by giving access to the wrappers raw pointers to std::vector data\n    // which involves no copies\n    ctx->logits = wrapper->promptContext.logits.data();\n    ctx->logits_size = wrapper->promptContext.logits.size();\n    ctx->tokens = wrapper->promptContext.tokens.data();\n    ctx->tokens_size = wrapper->promptContext.tokens.size();\n\n    // Update the rest of the C prompt context\n    ctx->n_past = 
wrapper->promptContext.n_past;\n    ctx->n_ctx = wrapper->promptContext.n_ctx;\n    ctx->n_predict = wrapper->promptContext.n_predict;\n    ctx->top_k = wrapper->promptContext.top_k;\n    ctx->top_p = wrapper->promptContext.top_p;\n    ctx->temp = wrapper->promptContext.temp;\n    ctx->n_batch = wrapper->promptContext.n_batch;\n    ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;\n    ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;\n    ctx->context_erase = wrapper->promptContext.contextErase;\n}\n\nvoid llmodel_setThreadCount(llmodel_model model, int32_t n_threads)\n{\n    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);\n    wrapper->llModel->setThreadCount(n_threads);\n}\n\nint32_t llmodel_threadCount(llmodel_model model)\n{\n    const auto *llm = reinterpret_cast<LLModelWrapper*>(model)->llModel;\n    return llm->threadCount();\n}\n"
  },
  {
    "path": "gpt4all-backend/llmodel_c.h",
    "content": "#ifndef LLMODEL_C_H\n#define LLMODEL_C_H\n\n#include <stdint.h>\n#include <stddef.h>\n#include <stdbool.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/**\n * Opaque pointer to the underlying model.\n */\ntypedef void *llmodel_model;\n\n/**\n * llmodel_prompt_context structure for holding the prompt context.\n * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the\n * raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined\n * behavior.\n */\ntypedef struct {\n    float *logits;          // logits of current context\n    size_t logits_size;     // the size of the raw logits vector\n    int32_t *tokens;        // current tokens in the context window\n    size_t tokens_size;     // the size of the raw tokens vector\n    int32_t n_past;         // number of tokens in past conversation\n    int32_t n_ctx;          // number of tokens possible in context window\n    int32_t n_predict;      // number of tokens to predict\n    int32_t top_k;          // top k logits to sample from\n    float top_p;            // nucleus sampling probability threshold\n    float temp;             // temperature to adjust model's output distribution\n    int32_t n_batch;        // number of predictions to generate in parallel\n    float repeat_penalty;   // penalty factor for repeated tokens\n    int32_t repeat_last_n;  // last n tokens to penalize\n    float context_erase;    // percent of context to erase if we exceed the context window\n} llmodel_prompt_context;\n\n/**\n * Callback type for prompt processing.\n * @param token_id The token id of the prompt.\n * @return a bool indicating whether the model should keep processing.\n */\ntypedef bool (*llmodel_prompt_callback)(int32_t token_id);\n\n/**\n * Callback type for response.\n * @param token_id The token id of the response.\n * @param response The response string. 
NOTE: a token_id of -1 indicates the string is an error string.\n * @return a bool indicating whether the model should keep generating.\n */\ntypedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);\n\n/**\n * Callback type for recalculation of context.\n * @param is_recalculating whether the model is recalculating the context.\n * @return a bool indicating whether the model should keep generating.\n */\ntypedef bool (*llmodel_recalculate_callback)(bool is_recalculating);\n\n/**\n * Create a GPTJ instance.\n * @return A pointer to the GPTJ instance.\n */\nllmodel_model llmodel_gptj_create();\n\n/**\n * Destroy a GPTJ instance.\n * @param gptj A pointer to the GPTJ instance.\n */\nvoid llmodel_gptj_destroy(llmodel_model gptj);\n\n/**\n * Create a MPT instance.\n * @return A pointer to the MPT instance.\n */\nllmodel_model llmodel_mpt_create();\n\n/**\n * Destroy a MPT instance.\n * @param mpt A pointer to the MPT instance.\n */\nvoid llmodel_mpt_destroy(llmodel_model mpt);\n\n/**\n * Create a LLAMA instance.\n * @return A pointer to the LLAMA instance.\n */\nllmodel_model llmodel_llama_create();\n\n/**\n * Destroy a LLAMA instance.\n * @param llama A pointer to the LLAMA instance.\n */\nvoid llmodel_llama_destroy(llmodel_model llama);\n\n/**\n * Create a llmodel instance.\n * Recognises correct model type from file at model_path\n * @param model_path A string representing the path to the model file. 
\n * @return A pointer to the llmodel_model instance.\n */\nllmodel_model llmodel_model_create(const char *model_path);\n\n/**\n * Destroy a llmodel instance.\n * Recognises correct model type using type info\n * @param model a pointer to a llmodel_model instance.\n */\nvoid llmodel_model_destroy(llmodel_model model);\n\n\n/**\n * Load a model from a file.\n * @param model A pointer to the llmodel_model instance.\n * @param model_path A string representing the path to the model file.\n * @return true if the model was loaded successfully, false otherwise.\n */\nbool llmodel_loadModel(llmodel_model model, const char *model_path);\n\n/**\n * Check if a model is loaded.\n * @param model A pointer to the llmodel_model instance.\n * @return true if the model is loaded, false otherwise.\n */\nbool llmodel_isModelLoaded(llmodel_model model);\n\n/**\n * Get the size of the internal state of the model.\n * NOTE: This state data is specific to the type of model you have created.\n * @param model A pointer to the llmodel_model instance.\n * @return the size in bytes of the internal state of the model\n */\nuint64_t llmodel_get_state_size(llmodel_model model);\n\n/**\n * Saves the internal state of the model to the specified destination address.\n * NOTE: This state data is specific to the type of model you have created.\n * @param model A pointer to the llmodel_model instance.\n * @param dest A pointer to the destination.\n * @return the number of bytes copied\n */\nuint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);\n\n/**\n * Restores the internal state of the model using data from the specified address.\n * NOTE: This state data is specific to the type of model you have created.\n * @param model A pointer to the llmodel_model instance.\n * @param src A pointer to the src.\n * @return the number of bytes read\n */\nuint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);\n\n/**\n * Generate a response using the model.\n * @param model A 
pointer to the llmodel_model instance.\n * @param prompt A string representing the input prompt.\n * @param prompt_callback A callback function for handling the processing of prompt.\n * @param response_callback A callback function for handling the generated response.\n * @param recalculate_callback A callback function for handling recalculation requests.\n * @param ctx A pointer to the llmodel_prompt_context structure.\n */\nvoid llmodel_prompt(llmodel_model model, const char *prompt,\n                    llmodel_prompt_callback prompt_callback,\n                    llmodel_response_callback response_callback,\n                    llmodel_recalculate_callback recalculate_callback,\n                    llmodel_prompt_context *ctx);\n\n/**\n * Set the number of threads to be used by the model.\n * @param model A pointer to the llmodel_model instance.\n * @param n_threads The number of threads to be used.\n */\nvoid llmodel_setThreadCount(llmodel_model model, int32_t n_threads);\n\n/**\n * Get the number of threads currently being used by the model.\n * @param model A pointer to the llmodel_model instance.\n * @return The number of threads currently being used.\n */\nint32_t llmodel_threadCount(llmodel_model model);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif // LLMODEL_C_H\n"
  },
  {
    "path": "gpt4all-backend/mpt.cpp",
    "content": "#include \"mpt.h\"\n#include \"llama.cpp/ggml.h\"\n\n#include \"utils.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <map>\n#include <random>\n#include <string>\n#include <vector>\n#include <iostream>\n#if defined(_WIN32) && defined(_MSC_VER)\n    #define WIN32_LEAN_AND_MEAN\n    #ifndef NOMINMAX\n        #define NOMINMAX\n    #endif\n    #include <windows.h>\n    #include <io.h>\n    #include <stdio.h>\n#else\n    #include <unistd.h>\n#endif\n#include <sstream>\n#include <thread>\n#include <unordered_set>\n#include <regex>\n\nstatic const size_t MB = 1024*1024;\n\n// default hparams (MPT 7B)\nstruct mpt_hparams {\n    int32_t n_vocab      = 50432;\n    int32_t n_ctx        = 2048;\n    int32_t n_embd       = 4096;\n    int32_t n_head       = 32;\n    int32_t n_layer      = 32;\n    float alibi_bias_max = 8;\n    float clip_qkv       = 0;\n    int32_t expand       = 4;\n    int32_t f16          = 1;\n};\n\nstruct mpt_layer {\n    // normalization\n    struct ggml_tensor * norm_1_w;\n    struct ggml_tensor * norm_2_w;\n\n    // attention\n    struct ggml_tensor * attn_Wqkv_w;\n    struct ggml_tensor * attn_out_proj_w;\n\n    // ff\n    struct ggml_tensor * ffn_up_proj_w;\n    struct ggml_tensor * ffn_down_proj_w;\n};\n\nstruct mpt_buffer {\n    uint8_t * addr = NULL;\n    size_t size = 0;\n\n    void resize(size_t size) {\n        delete[] addr;\n        addr = new uint8_t[size];\n        this->size = size;\n    }\n\n    ~mpt_buffer() {\n        fflush(stdout);\n        delete[] addr;\n    }\n};\n\nstruct mpt_kv_cache {\n    struct ggml_tensor * k;\n    struct ggml_tensor * v;\n\n    struct ggml_context * ctx = NULL;\n\n    mpt_buffer buf;\n\n    int n; // number of tokens currently in the cache\n\n    ~mpt_kv_cache() {\n        if (ctx) {\n            ggml_free(ctx);\n        }\n    }\n};\n\nstruct mpt_model {\n    mpt_hparams hparams;\n\n    // normalization\n    struct ggml_tensor * 
norm_f_w;\n\n    struct ggml_tensor * wte; // position embedding\n\n    // mpt does weight tying\n\n    std::vector<mpt_layer> layers;\n\n    struct mpt_kv_cache kv_self;\n    struct ggml_context * ctx;\n    std::map<std::string, struct ggml_tensor *> tensors;\n\n\n    mpt_buffer buf;\n\n    ~mpt_model() {\n        if (ctx) {\n            ggml_free(ctx);\n        }\n    }\n};\n\nstatic bool kv_cache_init(\n        const struct mpt_hparams & hparams,\n             struct mpt_kv_cache & cache,\n                         ggml_type   wtype,\n                               int   n_ctx) {\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n\n    const int64_t n_mem      = (int64_t)n_layer*n_ctx;\n    const int64_t n_elements = n_embd*n_mem;\n\n    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);\n\n    struct ggml_init_params params;\n    params.mem_size   = cache.buf.size;\n    params.mem_buffer = cache.buf.addr;\n    params.no_alloc   = false;\n\n    cache.ctx = ggml_init(params);\n\n    if (!cache.ctx) {\n        fprintf(stderr, \"%s: failed to allocate memory for kv cache\\n\", __func__);\n        return false;\n    }\n\n    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);\n    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);\n\n    return true;\n}\n\n// load the model's weights from a stream\nbool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab) {\n    printf(\"%s: loading model from '%s' - please wait ...\\n\", __func__, fname.c_str());\n\n    // verify magic\n    {\n        uint32_t magic;\n        fin.read((char *) &magic, sizeof(magic));\n        if (magic != 0x67676d6d) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad magic)\\n\", __func__, fname.c_str());\n            return false;\n        }\n    }\n\n    // load hparams\n    {\n        auto & hparams = model.hparams;\n\n        fin.read((char *) &hparams.n_vocab, 
sizeof(hparams.n_vocab));\n        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));\n        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));\n        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));\n        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));\n        fin.read((char *) &hparams.alibi_bias_max,  sizeof(hparams.alibi_bias_max));\n        fin.read((char *) &hparams.clip_qkv,  sizeof(hparams.clip_qkv));\n        fin.read((char *) &hparams.f16,   sizeof(hparams.f16));\n\n        printf(\"%s: n_vocab        = %d\\n\", __func__, hparams.n_vocab);\n        printf(\"%s: n_ctx          = %d\\n\", __func__, hparams.n_ctx);\n        printf(\"%s: n_embd         = %d\\n\", __func__, hparams.n_embd);\n        printf(\"%s: n_head         = %d\\n\", __func__, hparams.n_head);\n        printf(\"%s: n_layer        = %d\\n\", __func__, hparams.n_layer);\n        printf(\"%s: alibi_bias_max = %f\\n\", __func__, hparams.alibi_bias_max);\n        printf(\"%s: clip_qkv       = %f\\n\", __func__, hparams.clip_qkv);\n        printf(\"%s: ftype          = %d\\n\", __func__, hparams.f16);\n    }\n\n    // load vocab\n    {\n        int32_t n_vocab = model.hparams.n_vocab;\n        fin.read((char *) &n_vocab, sizeof(n_vocab));\n\n        if (n_vocab != model.hparams.n_vocab) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad vocab size %d != %d)\\n\",\n                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);\n            return false;\n        }\n\n        std::string word;\n        for (int i = 0; i < n_vocab; i++) {\n            uint32_t len;\n            fin.read((char *) &len, sizeof(len));\n            bool special = false;\n            if (len & (1<<31)) {\n                len = len &~ (1<<31);\n                special = true;\n            }\n\n            if (len > 0) {\n                word.resize(len);\n                fin.read((char *) word.data(), len);\n                
vocab.token_to_id[word] = i;\n                vocab.id_to_token[i] = word;\n            }\n\n            if(special) {\n                vocab.add_special_token(word);\n            }\n        }\n    }\n\n    // for the big tensors, we have the option to store the data in 16-bit floats or quantized\n    // in order to save memory and also to speed up the computation\n    ggml_type wtype = GGML_TYPE_COUNT;\n    switch (model.hparams.f16) {\n        case 0: wtype = GGML_TYPE_F32;  break;\n        case 1: wtype = GGML_TYPE_F16;  break;\n        case 2: wtype = GGML_TYPE_Q4_0; break;\n        case 3: wtype = GGML_TYPE_Q4_1; break;\n        case 5: wtype = GGML_TYPE_Q4_2; break;\n        default:\n                {\n                    fprintf(stderr, \"%s: invalid model file '%s' (bad f16 value %d)\\n\",\n                            __func__, fname.c_str(), model.hparams.f16);\n                    return false;\n                }\n    }\n\n    auto & ctx = model.ctx;\n\n    size_t ctx_size = 0;\n\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n        const int n_vocab = hparams.n_vocab;\n        const int expand  = hparams.expand;\n\n\n        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_w\n\n        ctx_size += n_embd*n_vocab*ggml_type_sizef(GGML_TYPE_F32); // wte\n\n        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_1_w\n        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_2_w\n\n        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // attn_Wqkv_w\n        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // attn_out_proj_w\n\n        ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype));  // ffn_up_proj_w\n        ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_down_proj_w\n\n        ctx_size += 
n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k\n        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v\n\n        // TODO probably less now?\n        ctx_size += (5 + 10*n_layer)*256; // object overhead\n\n        printf(\"%s: ggml ctx size = %6.2f MB\\n\", __func__, ctx_size/(1024.0*1024.0));\n    }\n\n    // create the ggml context\n    {\n        struct ggml_init_params params = {\n            .mem_size   = ctx_size,\n            .mem_buffer = NULL,\n            .no_alloc   = false,\n        };\n\n        model.ctx = ggml_init(params);\n        if (!model.ctx) {\n            fprintf(stderr, \"%s: ggml_init() failed\\n\", __func__);\n            return false;\n        }\n    }\n\n    // prepare memory for the weights\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n        const int n_vocab = hparams.n_vocab;\n        const int expand  = hparams.expand;\n\n        model.layers.resize(n_layer);\n\n        model.wte    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);\n        model.norm_f_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n\n        // map by name\n        model.tensors[\"transformer.wte.weight\"] = model.wte;\n        model.tensors[\"transformer.norm_f.weight\"] = model.norm_f_w;\n\n        for (int i = 0; i < n_layer; ++i) {\n            auto & layer = model.layers[i];\n\n            layer.norm_1_w        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n            layer.norm_2_w        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n\n            layer.attn_Wqkv_w     = ggml_new_tensor_2d(ctx, wtype,        n_embd, n_embd * 3);\n            layer.attn_out_proj_w = ggml_new_tensor_2d(ctx, wtype,        n_embd, n_embd);\n            layer.ffn_up_proj_w   = ggml_new_tensor_2d(ctx, wtype,        n_embd, expand*n_embd);\n            
layer.ffn_down_proj_w = ggml_new_tensor_2d(ctx, wtype, expand*n_embd, n_embd);\n\n            // map by name\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".norm_1.weight\"]        = layer.norm_1_w;\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".norm_2.weight\"]        = layer.norm_2_w;\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".attn.Wqkv.weight\"]     = layer.attn_Wqkv_w;\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".attn.out_proj.weight\"] = layer.attn_out_proj_w;\n\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".ffn.up_proj.weight\"]   = layer.ffn_up_proj_w;\n            model.tensors[\"transformer.blocks.\" + std::to_string(i) + \".ffn.down_proj.weight\"] = layer.ffn_down_proj_w;\n        }\n    }\n\n    // key + value memory\n    {\n        const auto & hparams = model.hparams;\n\n        const int n_embd  = hparams.n_embd;\n        const int n_layer = hparams.n_layer;\n        const int n_ctx   = hparams.n_ctx;\n\n        const int n_mem      = n_layer*n_ctx;\n        const int n_elements = n_embd*n_mem;\n\n        if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {\n            fprintf(stderr, \"%s: kv_cache_init() failed for self-attention cache\\n\", __func__);\n            ggml_free(ctx);\n            return false;\n        }\n\n        const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);\n        printf(\"%s: kv self size  = %7.2f MB\\n\", __func__, memory_size / 1024.0 / 1024.0);\n    }\n\n    // load weights\n    {\n        int n_tensors = 0;\n        size_t total_size = 0;\n\n        printf(\"%s: \", __func__);\n\n        while (true) {\n            int32_t n_dims;\n            int32_t length;\n            int32_t ttype;\n\n            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));\n            fin.read(reinterpret_cast<char 
*>(&length), sizeof(length));\n            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));\n\n            if (fin.eof()) {\n                break;\n            }\n\n            int32_t nelements = 1;\n            int32_t ne[2] = { 1, 1 };\n            for (int i = 0; i < n_dims; ++i) {\n                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));\n                nelements *= ne[i];\n            }\n\n            std::string name(length, 0);\n            fin.read(&name[0], length);\n\n            if (model.tensors.find(name.data()) == model.tensors.end()) {\n                fprintf(stderr, \"%s: unknown tensor '%s' in model file\\n\", __func__, name.data());\n                return false;\n            }\n\n            auto tensor = model.tensors[name.data()];\n            if (ggml_nelements(tensor) != nelements) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong size in model file\\n\", __func__, name.data());\n                return false;\n            }\n\n            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\\n\",\n                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);\n                return false;\n            }\n\n            // for debugging\n            if (0) {\n                printf(\"%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\\n\", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));\n            }\n\n            const size_t bpe = ggml_type_size(ggml_type(ttype));\n\n            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {\n                fprintf(stderr, \"%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\\n\",\n                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);\n                return 
false;\n            }\n\n            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));\n\n            //printf(\"%42s - [%5d, %5d], type = %6s, %6.2f MB\\n\", name.data(), ne[0], ne[1], ttype == 0 ? \"float\" : \"f16\", ggml_nbytes(tensor)/1024.0/1024.0);\n            total_size += ggml_nbytes(tensor);\n            if (++n_tensors % 8 == 0) {\n                printf(\".\");\n                fflush(stdout);\n            }\n        }\n\n        printf(\" done\\n\");\n\n        printf(\"%s: model size = %8.2f MB / num tensors = %d\\n\", __func__, total_size/1024.0/1024.0, n_tensors);\n    }\n\n    return true;\n}\n\n// load the model's weights from a file path\nbool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {\n\n    auto fin = std::ifstream(fname, std::ios::binary);\n    if (!fin) {\n        fprintf(stderr, \"%s: failed to open '%s'\\n\", __func__, fname.c_str());\n        return false;\n    }\n\n    bool loaded = mpt_model_load(fname, fin, model, vocab);\n    fin.close();\n    return loaded;\n}\n\nbool mpt_eval(\n        mpt_model & model,\n        const int n_threads,\n        const int n_past,\n        const std::vector<int>           & embd_inp,\n              std::vector<float>         & embd_w,\n              size_t                     & mem_per_token) {\n    const int N = embd_inp.size();\n\n    const auto & hparams = model.hparams;\n\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n    const int n_ctx   = hparams.n_ctx;\n    const int n_head  = hparams.n_head;\n    const int n_vocab = hparams.n_vocab;\n    const int expand  = hparams.expand;\n\n    const int d_key = n_embd/n_head;\n\n    const size_t init_buf_size = 1024u*MB;\n    if (!model.buf.addr || model.buf.size < init_buf_size)\n        model.buf.resize(init_buf_size);\n\n    if (mem_per_token > 0 && mem_per_token*N > model.buf.size) {\n        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to 
account for ggml object overhead\n        // printf(\"\\n%s: reallocating buffer from %zu to %zu bytes\\n\", __func__, model.buf.size, buf_size_new);\n\n        // reallocate\n        model.buf.resize(buf_size_new);\n        if (model.buf.addr == nullptr) {\n            fprintf(stderr, \"%s: failed to allocate %zu bytes\\n\", __func__, model.buf.size);\n            return false;\n        }\n    }\n\n    struct ggml_init_params params = {\n        .mem_size   = model.buf.size,\n        .mem_buffer = model.buf.addr,\n    };\n\n    struct ggml_context * ctx0 = ggml_init(params);\n    struct ggml_cgraph gf = { .n_threads = n_threads };\n\n    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);\n    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));\n\n    // wte\n    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);\n\n    for (int il = 0; il < n_layer; ++il) {\n\n        struct ggml_tensor * inpSA = inpL;\n        struct ggml_tensor * cur = inpSA;\n        // self-attention\n        {\n\n            // norm1\n            cur = ggml_norm(ctx0, cur);\n            cur = ggml_mul(ctx0,\n                    ggml_repeat(ctx0, model.layers[il].norm_1_w, cur),\n                    cur);\n            // compute QKV\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].attn_Wqkv_w,\n                    cur);\n\n            // TODO: clip_qkv\n            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*ggml_element_size(cur)*n_embd));\n            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*ggml_element_size(cur)*n_embd));\n            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*ggml_element_size(cur)*n_embd));\n\n            // TODO: qk_ln? 
(seems to be False in MPT-7B configs)\n            {\n                Vcur = ggml_transpose(ctx0, Vcur);\n\n                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));\n                struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd,\n                                        (   n_ctx)*ggml_element_size(model.kv_self.v),\n                                        (il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v));\n\n                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));\n                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));\n            }\n            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)\n            struct ggml_tensor * Q =\n                ggml_permute(ctx0,\n                        ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N),\n                        0, 2, 1, 3);\n\n            struct ggml_tensor * K =\n                ggml_permute(ctx0,\n                        ggml_reshape_3d(ctx0,\n                            ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),\n                            n_embd/n_head, n_head, n_past + N),\n                        0, 2, 1, 3);\n\n            // K * Q\n            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);\n\n            // KQ_scaled = KQ / sqrt(n_embd/n_head)\n            struct ggml_tensor * KQ_scaled =\n                ggml_scale(ctx0,\n                        KQ,\n                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))\n                        );\n\n\n            // Alibi\n            struct ggml_tensor * KQ_scaled_biased = ggml_alibi(ctx0, ggml_cont(ctx0, KQ_scaled), n_past, n_head);\n\n            // KQ_masked = mask_past(KQ_scaled)\n            struct ggml_tensor * KQ_masked = 
ggml_diag_mask_inf(ctx0, KQ_scaled_biased, n_past);\n\n            // KQ = soft_max(KQ_masked)\n            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);\n\n            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()\n            struct ggml_tensor * V =\n                ggml_view_3d(ctx0, model.kv_self.v,\n                        n_past + N, n_embd/n_head, n_head,\n                        n_ctx*ggml_element_size(model.kv_self.v),\n                        n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head,\n                        il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd);\n\n            // KQV = transpose(V) * KQ_soft_max\n            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);\n\n            // KQV_merged = KQV.permute(0, 2, 1, 3)\n            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);\n\n            // cur = KQV_merged.contiguous().view(n_embd, N)\n            cur = ggml_cpy(ctx0,\n                    KQV_merged,\n                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));\n\n            // projection (no bias)\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].attn_out_proj_w,\n                    cur);\n        }\n\n\n        // residual\n        struct ggml_tensor * resSA = ggml_add(ctx0, cur, inpSA);\n        // feed-forward network\n        {\n            cur = resSA;\n            // norm2\n            cur = ggml_norm(ctx0, cur);\n            cur = ggml_mul(ctx0,\n                    ggml_repeat(ctx0, model.layers[il].norm_2_w, cur),\n                    cur);\n            // ffn\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].ffn_up_proj_w,\n                    cur);\n            cur = ggml_gelu(ctx0, cur);\n            cur = ggml_mul_mat(ctx0,\n                    model.layers[il].ffn_down_proj_w,\n                    cur);\n\n        }\n\n        // self-attention + 
FF\n        inpL = ggml_add(ctx0, cur, resSA);\n    }\n\n    struct ggml_tensor * out = inpL;\n    // -> logits\n    {\n        out = ggml_norm(ctx0, out);\n        out = ggml_mul(ctx0,\n                    ggml_repeat(ctx0, model.norm_f_w, out),\n                    out);\n        out = ggml_mul_mat(ctx0, model.wte, out);\n    }\n\n\n    // run the computation\n    ggml_build_forward_expand(&gf, out);\n    ggml_graph_compute       (ctx0, &gf);\n\n\n    // return result for just the last token\n    embd_w.resize(n_vocab);\n    memcpy(embd_w.data(), (float *) ggml_get_data(out) + (n_vocab*(N-1)), sizeof(float)*n_vocab);\n\n    if (mem_per_token == 0) {\n        mem_per_token = ggml_used_mem(ctx0)/N;\n    }\n    //printf(\"used_mem = %zu\\n\", ggml_used_mem(ctx0));\n\n    ggml_free(ctx0);\n\n    return true;\n}\n\n\n#define MPT_MAX_RNG_STATE 64*1024\n\nsize_t mpt_get_state_size(const mpt_model &model)\n{\n    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.\n    // for reference, std::mt19937(1337) serializes to 6701 bytes.\n    const size_t s_rng_size        = sizeof(size_t);\n    const size_t s_rng             = MPT_MAX_RNG_STATE;\n    const size_t s_kv_size         = sizeof(size_t);\n    const size_t s_kv_ntok         = sizeof(int);\n    const size_t s_kv              = model.kv_self.buf.size;\n    const size_t s_total = (\n        + s_rng_size\n        + s_rng\n        + s_kv_size\n        + s_kv_ntok\n        + s_kv\n    );\n    fflush(stdout);\n    return s_total;\n}\n\nsize_t mpt_copy_state_data(const mpt_model &model, const std::mt19937 &rng, uint8_t *dest)\n{\n    uint8_t * out = dest;\n    fflush(stdout);\n    // copy rng\n    {\n        std::stringstream rng_ss;\n        rng_ss << rng;\n\n        const size_t rng_size = rng_ss.str().size();\n        char rng_buf[MPT_MAX_RNG_STATE];\n\n        memset(&rng_buf[0], 0, MPT_MAX_RNG_STATE);\n        memcpy(&rng_buf[0], rng_ss.str().data(), 
rng_ss.str().size());\n\n        memcpy(out, &rng_size,   sizeof(rng_size));   out += sizeof(rng_size);\n        memcpy(out, &rng_buf[0], MPT_MAX_RNG_STATE); out += MPT_MAX_RNG_STATE;\n    }\n\n    // copy kv cache\n    {\n        const size_t kv_size = model.kv_self.buf.size;\n        const int    kv_ntok = model.kv_self.n;\n\n        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);\n        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);\n\n        if (kv_size) {\n            memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;\n        }\n    }\n\n    const size_t written  = out - dest;\n    const size_t expected = mpt_get_state_size(model);\n    assert(written == expected);\n    fflush(stdout);\n    return written;\n}\n\nsize_t mpt_set_state_data(mpt_model *model, std::mt19937 *rng, const uint8_t *src)\n{\n    const uint8_t * in = src;\n\n    // set rng\n    {\n        size_t rng_size;\n        char   rng_buf[MPT_MAX_RNG_STATE];\n\n        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);\n        memcpy(&rng_buf[0], in, MPT_MAX_RNG_STATE); in += MPT_MAX_RNG_STATE;\n\n        std::stringstream rng_ss;\n        rng_ss.str(std::string(&rng_buf[0], rng_size));\n        rng_ss >> *rng;\n\n        assert(rng_ss.fail() == false);\n    }\n\n    // set kv cache\n    {\n        size_t kv_size;\n        int kv_ntok;\n\n        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);\n        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);\n\n        if (kv_size) {\n            assert(model->kv_self.buf.size == kv_size);\n\n            void * k_data = model->kv_self.k->data; // remember data pointers\n            void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy\n\n            memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;\n\n            model->kv_self.k->data = k_data; // restore correct data pointers\n            
model->kv_self.v->data = v_data;\n\n        }\n\n        model->kv_self.n = kv_ntok;\n    }\n\n    const size_t nread    = in - src;\n    const size_t expected = mpt_get_state_size(*model);\n    assert(nread == expected);\n    fflush(stdout);\n    return nread;\n}\n\nstruct MPTPrivate {\n    const std::string modelPath;\n    bool modelLoaded;\n    gpt_vocab vocab;\n    mpt_model *model = nullptr;\n    int64_t n_threads = 0;\n    size_t mem_per_token = 0;\n    std::mt19937 rng;\n    bool has_im_end = false;\n};\n\nMPT::MPT()\n    : d_ptr(new MPTPrivate) {\n\n    d_ptr->model = new mpt_model;\n    d_ptr->modelLoaded = false;\n}\n\nbool MPT::loadModel(const std::string &modelPath) {\n    std::mt19937 rng(time(NULL));\n    d_ptr->rng = rng;\n\n    auto fin = std::ifstream(modelPath, std::ios::binary);\n\n    // load the model\n    if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) {\n        std::cerr << \"GPT-J ERROR: failed to load model from \" <<  modelPath;\n        return false;\n    }\n\n    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());\n    d_ptr->modelLoaded = true;\n    d_ptr->has_im_end = d_ptr->vocab.token_to_id.find(\"<|im_end|>\") != d_ptr->vocab.token_to_id.end();\n    fflush(stdout);\n    return true;\n}\n\nvoid MPT::setThreadCount(int32_t n_threads) {\n    d_ptr->n_threads = n_threads;\n}\n\nint32_t MPT::threadCount() const\n{\n    return d_ptr->n_threads;\n}\n\nMPT::~MPT()\n{\n    delete d_ptr->model;\n}\n\nbool MPT::isModelLoaded() const\n{\n    return d_ptr->modelLoaded;\n}\n\nsize_t MPT::stateSize() const\n{\n    return mpt_get_state_size(*d_ptr->model);\n}\n\nsize_t MPT::saveState(uint8_t *dest) const\n{\n    return mpt_copy_state_data(*d_ptr->model, d_ptr->rng, dest);\n}\n\nsize_t MPT::restoreState(const uint8_t *src)\n{\n    return mpt_set_state_data(d_ptr->model, &d_ptr->rng, src);\n}\n\nvoid MPT::prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        
std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &promptCtx) {\n\n    if (!isModelLoaded()) {\n        std::cerr << \"GPT-J ERROR: prompt won't work with an unloaded model!\\n\";\n        return;\n    }\n\n    const int64_t t_main_start_us = ggml_time_us();\n\n    int64_t t_sample_us  = 0;\n    int64_t t_predict_us = 0;\n    int64_t t_prompt_us = 0;\n\n    // tokenize the prompt\n    std::vector<int> embd_inp = gpt_tokenize(d_ptr->vocab, prompt);\n\n    // save the context size\n    promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;\n\n    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {\n        responseCallback(-1, \"ERROR: The prompt size exceeds the context window size and cannot be processed.\");\n        std::cerr << \"GPT-J ERROR: The prompt is\" << embd_inp.size() <<\n            \"tokens and the context window is\" << promptCtx.n_ctx << \"!\\n\";\n        return;\n    }\n\n    promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());\n    promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);\n\n    // determine the required inference memory per token:\n    static bool initialized = false;\n    static std::vector<int> p_instruct;\n    static std::vector<int> r_instruct;\n    if (!initialized) {\n         mpt_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits,\n            d_ptr->mem_per_token);\n        initialized = true;\n    }\n\n    // process the prompt in batches\n    size_t i = 0;\n    const int64_t t_start_prompt_us = ggml_time_us();\n    while (i < embd_inp.size()) {\n        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());\n        std::vector<int> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) {\n            const int32_t erasePoint = 
promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"MPT: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n            assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n        }\n\n        if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"GPT-J ERROR: Failed to process prompt\\n\";\n            return;\n        }\n\n        size_t tokens = batch_end - i;\n        for (size_t t = 0; t < tokens; ++t) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            promptCtx.tokens.push_back(batch.at(t));\n            if (!promptCallback(batch.at(t)))\n                return;\n        }\n        promptCtx.n_past += batch.size();\n        i = batch_end;\n    }\n    t_prompt_us += ggml_time_us() - t_start_prompt_us;\n\n    int p_instructFound = 0;\n    int r_instructFound = 0;\n\n    std::string cachedResponse;\n    std::vector<int> cachedTokens;\n    std::unordered_set<std::string> reversePrompts\n        = { \"### Instruction\", \"### Prompt\", \"### Response\", \"### Human\", \"### Assistant\", \"### Context\" };\n\n    // predict next tokens\n    int32_t totalPredictions = 0;\n    for (int i = 0; i < promptCtx.n_predict; i++) {\n\n        // sample next token\n        const int n_vocab = d_ptr->model->hparams.n_vocab;\n        int id = 0;\n        {\n            const int64_t t_start_sample_us = ggml_time_us();\n            const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());\n            id = 
gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab,\n                promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,\n                n_prev_toks,\n                promptCtx.logits,\n                promptCtx.top_k, promptCtx.top_p, promptCtx.temp,\n                promptCtx.repeat_penalty,\n                d_ptr->rng);\n\n            t_sample_us += ggml_time_us() - t_start_sample_us;\n        }\n\n        // Check if the context has run out...\n        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {\n            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;\n            // Erase the first percentage of context from the tokens...\n            std::cerr << \"MPT: reached the end of the context window so resizing\\n\";\n            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);\n            promptCtx.n_past = promptCtx.tokens.size();\n            recalculateContext(promptCtx, recalculateCallback);\n            assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);\n        }\n\n        const int64_t t_start_predict_us = ggml_time_us();\n        if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"GPT-J ERROR: Failed to predict next token\\n\";\n            return;\n        }\n        t_predict_us += ggml_time_us() - t_start_predict_us;\n\n        promptCtx.n_past += 1;\n        // display text\n        ++totalPredictions;\n\n        // mpt-7b-chat has special token for end\n        if (d_ptr->has_im_end && id == d_ptr->vocab.token_to_id[\"<|im_end|>\"])\n            goto stop_generating;\n\n        if (id == 0 /*end of text*/)\n            goto stop_generating;\n\n        const std::string str = d_ptr->vocab.id_to_token[id];\n\n        // Check if the provided str is part of our reverse prompts\n        bool foundPartialReversePrompt = false;\n        const std::string completed = cachedResponse + 
str;\n        if (reversePrompts.find(completed) != reversePrompts.end()) {\n            goto stop_generating;\n        }\n\n        // Check if it partially matches our reverse prompts and if so, cache\n        for (auto s : reversePrompts) {\n            if (s.compare(0, completed.size(), completed) == 0) {\n                foundPartialReversePrompt = true;\n                cachedResponse = completed;\n                break;\n            }\n        }\n\n        // Regardless the token gets added to our cache\n        cachedTokens.push_back(id);\n\n        // Continue if we have found a partial match\n        if (foundPartialReversePrompt)\n            continue;\n\n        // Empty the cache\n        for (auto t : cachedTokens) {\n            if (promptCtx.tokens.size() == promptCtx.n_ctx)\n                promptCtx.tokens.erase(promptCtx.tokens.begin());\n            promptCtx.tokens.push_back(t);\n            if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))\n                goto stop_generating;\n        }\n        cachedTokens.clear();\n    }\n\nstop_generating:\n\n#if 0\n    // report timing\n    {\n        const int64_t t_main_end_us = ggml_time_us();\n\n        std::cout << \"GPT-J INFO: mem per token = \" << mem_per_token << \" bytes\\n\";\n        std::cout << \"GPT-J INFO:   sample time = \" << t_sample_us/1000.0f << \" ms\\n\";\n        std::cout << \"GPT-J INFO:   prompt time = \" << t_prompt_us/1000.0f << \" ms\\n\";\n        std::cout << \"GPT-J INFO:  predict time = \" << t_predict_us/1000.0f << \" ms / \" << t_predict_us/1000.0f/totalPredictions << \" ms per token\\n\";\n        std::cout << \"GPT-J INFO:    total time = \" << (t_main_end_us - t_main_start_us)/1000.0f << \" ms\\n\";\n        fflush(stdout);\n    }\n#endif\n\n    return;\n}\n\nvoid MPT::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)\n{\n    size_t i = 0;\n    promptCtx.n_past = 0;\n    while (i < promptCtx.tokens.size()) {\n        size_t 
batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());\n        std::vector<int> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);\n\n        assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);\n\n        if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits,\n            d_ptr->mem_per_token)) {\n            std::cerr << \"MPT ERROR: Failed to process prompt\\n\";\n            goto stop_generating;\n        }\n        promptCtx.n_past += batch.size();\n        if (!recalculate(true))\n            goto stop_generating;\n        i = batch_end;\n    }\n    assert(promptCtx.n_past == promptCtx.tokens.size());\n\nstop_generating:\n    recalculate(false);\n}\n"
  },
  {
    "path": "gpt4all-backend/mpt.h",
    "content": "#ifndef MPT_H\n#define MPT_H\n\n#include <string>\n#include <functional>\n#include <vector>\n#include \"llmodel.h\"\n\nclass MPTPrivate;\nclass MPT : public LLModel {\npublic:\n    MPT();\n    ~MPT();\n\n    bool loadModel(const std::string &modelPath) override;\n    bool isModelLoaded() const override;\n    size_t stateSize() const override;\n    size_t saveState(uint8_t *dest) const override;\n    size_t restoreState(const uint8_t *src) override;\n    void prompt(const std::string &prompt,\n        std::function<bool(int32_t)> promptCallback,\n        std::function<bool(int32_t, const std::string&)> responseCallback,\n        std::function<bool(bool)> recalculateCallback,\n        PromptContext &ctx) override;\n    void setThreadCount(int32_t n_threads) override;\n    int32_t threadCount() const override;\n\nprotected:\n    void recalculateContext(PromptContext &promptCtx,\n        std::function<bool(bool)> recalculate) override;\n\nprivate:\n    MPTPrivate *d_ptr;\n};\n\n#endif // MPT_H\n"
  },
  {
    "path": "gpt4all-backend/scripts/convert_mpt_hf_to_ggml.py",
    "content": "# Convert Hugging Face fine-tuned bloom-like models to ggml format\n#\n# Usage:\n#\n#   python3 models/convert-h5-to-ggml.py \n#\n# This script is similar to \"convert-pt-to-ggml.py\"\n#\n\nimport io\nimport os\nimport sys\nimport struct\nimport json\nimport code\nimport torch\nimport numpy as np\n\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM\n\n# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py\ndef bytes_to_unicode():\n    \"\"\"\n    Returns list of utf-8 byte and a corresponding list of unicode strings.\n    The reversible bpe codes work on unicode strings.\n    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.\n    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.\n    This is a significant percentage of your normal, say, 32K bpe vocab.\n    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.\n    And avoids mapping to whitespace/control characters the bpe code barfs on.\n    \"\"\"\n    bs = list(range(ord(\"!\"), ord(\"~\")+1))+list(range(ord(\"¡\"), ord(\"¬\")+1))+list(range(ord(\"®\"), ord(\"ÿ\")+1))\n    cs = bs[:]\n    n = 0\n    for b in range(2**8):\n        if b not in bs:\n            bs.append(b)\n            cs.append(2**8+n)\n            n += 1\n    cs = [chr(n) for n in cs]\n    return dict(zip(bs, cs))\n\nif len(sys.argv) < 3:\n    print(\"Usage: python convert-hf-to-ggml.py model_name dir-output [use-f32]\")\n    print(\"  model_name: name of the model to convert. 
Example: 'bigscience/bloomz-560m'\")\n    print(\"  dir-output: directory where the output file will be written\")\n    print(\"  use-f32:    if present, use float32 instead of float16\")\n    sys.exit(1)\n\nmodel_name = sys.argv[1]\ndir_out = sys.argv[2]\n\n# make sure the output directory exists\nos.makedirs(dir_out, exist_ok=True)\n\n# possible data types\n#   ftype == 0 -> float32\n#   ftype == 1 -> float16\n#\n# map from ftype to string\nftype_str = [\"f32\", \"f16\"]\nftype = 1\nif len(sys.argv) > 3:\n    ftype = 0\n\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nconfig = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\nhparams = config.to_dict()\nprint(\"Loading model: \", model_name)\nmodel = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True)\nprint(\"Model loaded: \", model_name)\n\n\nfname_out = dir_out + f\"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin\"\nfout = open(fname_out, \"wb\")\nvocab = tokenizer.vocab\n\nhparams[\"multiple_of\"] = 1\nfout.write(struct.pack(\"I\", 0x67676d6d)) # magic: ggml in hex\nfout.write(struct.pack(\"I\", model.config.vocab_size))\nfout.write(struct.pack(\"I\", model.config.max_seq_len))\nfout.write(struct.pack(\"I\", model.config.n_layers))\nfout.write(struct.pack(\"I\", model.config.n_heads))\nfout.write(struct.pack(\"I\", model.config.d_model))\nfout.write(struct.pack(\"f\", model.config.attn_config['alibi_bias_max']))\nclip_qkv = model.config.attn_config['clip_qkv']\nfout.write(struct.pack(\"f\",  clip_qkv if clip_qkv is not None else 0))\nfout.write(struct.pack(\"I\", ftype))\n\n# # Is this correct??\n# dot_token = tokenizer.encode(\".\")[0]\n# write tokens to ggml file \ndot_token = tokenizer.encode('.')[0]\nfout.write(struct.pack(\"I\", model.config.vocab_size))\n\nfor i in range(model.config.vocab_size):\n    text = 
tokenizer.decode([dot_token, i]).encode('utf-8')\n    # remove the first byte (it's always '.')\n    text = text[1:]\n    enclen = len(text)\n    if i in tokenizer.all_special_ids:\n        print(f\"special token: {text}\")\n        enclen = enclen | 1<<31\n    fout.write(struct.pack(\"I\", enclen))\n    fout.write(text)\n    \nlist_vars = model.state_dict()\nfor name in list_vars.keys():\n    data = list_vars[name].squeeze().numpy()\n    print(\"Processing variable: \" + name + \" with shape: \", data.shape)\n\n    n_dims = len(data.shape);\n\n    # ftype == 0 -> float32, ftype == 1 -> float16\n    ftype_cur = 0;\n    if ftype != 0:\n        # Keep token embeddings in fp32\n        if name[-7:] == \".weight\" and n_dims == 2 and \".wte\" not in name:\n            print(\"  Converting to float16\")\n            data = data.astype(np.float16)\n            ftype_cur = 1\n        else:\n            print(\"  Converting to float32\")\n            data = data.astype(np.float32)\n            ftype_cur = 0\n    else:\n        if data.dtype != np.float32:\n            print(\"  Converting to float32\")\n            data = data.astype(np.float32)\n            ftype_cur = 0\n\n    # header\n    str = name.encode('utf-8')\n    fout.write(struct.pack(\"iii\", n_dims, len(str), ftype_cur))\n    for i in range(n_dims):\n        fout.write(struct.pack(\"i\", data.shape[n_dims - 1 - i]))\n    fout.write(str);\n\n    # data\n    data.tofile(fout)\n\nfout.close()\n\nprint(\"Done. Output file: \" + fname_out)\nprint(\"\")"
  },
  {
    "path": "gpt4all-backend/utils.cpp",
    "content": "#include \"utils.h\"\n\n#include <fstream>\n#include <regex>\n\nvoid replace(std::string & str, const std::string & needle, const std::string & replacement) {\n    size_t pos = 0;\n    while ((pos = str.find(needle, pos)) != std::string::npos) {\n        str.replace(pos, needle.length(), replacement);\n        pos += replacement.length();\n    }\n}\n\nstd::map<std::string, int32_t> json_parse(const std::string & fname) {\n    std::map<std::string, int32_t> result;\n\n    // read file into string\n    std::string json;\n    {\n        std::ifstream ifs(fname);\n        if (!ifs) {\n            fprintf(stderr, \"Failed to open %s\\n\", fname.c_str());\n            exit(1);\n        }\n\n        json = std::string((std::istreambuf_iterator<char>(ifs)),\n                (std::istreambuf_iterator<char>()));\n    }\n\n    if (json[0] != '{') {\n        return result;\n    }\n\n    // parse json\n    {\n        bool has_key  = false;\n        bool in_token = false;\n\n        std::string str_key = \"\";\n        std::string str_val = \"\";\n\n        int n = json.size();\n        for (int i = 1; i < n; ++i) {\n            if (!in_token) {\n                if (json[i] == ' ') continue;\n                if (json[i] == '\"') {\n                    in_token = true;\n                    continue;\n                }\n            } else {\n                if (json[i] == '\\\\' && i+1 < n) {\n                    if (has_key == false) {\n                        str_key += json[i];\n                    } else {\n                        str_val += json[i];\n                    }\n                    ++i;\n                } else if (json[i] == '\"') {\n                    if (has_key == false) {\n                        has_key = true;\n                        ++i;\n                        while (json[i] == ' ') ++i;\n                        ++i; // :\n                        while (json[i] == ' ') ++i;\n                        if (json[i] != '\\\"') {\n               
             while (json[i] != ',' && json[i] != '}') {\n                                str_val += json[i++];\n                            }\n                            has_key = false;\n                        } else {\n                            in_token = true;\n                            continue;\n                        }\n                    } else {\n                        has_key = false;\n                    }\n\n                    ::replace(str_key, \"\\\\u0120\", \" \" ); // \\u0120 -> space\n                    ::replace(str_key, \"\\\\u010a\", \"\\n\"); // \\u010a -> new line\n                    ::replace(str_key, \"\\\\\\\"\",    \"\\\"\"); // \\\\\\\"   -> \"\n\n                    try {\n                        result[str_key] = std::stoi(str_val);\n                    } catch (...) {\n                        //fprintf(stderr, \"%s: ignoring key '%s' with value '%s'\\n\", fname.c_str(), str_key.c_str(), str_val.c_str());\n\n                    }\n                    str_key = \"\";\n                    str_val = \"\";\n                    in_token = false;\n                    continue;\n                }\n                if (has_key == false) {\n                    str_key += json[i];\n                } else {\n                    str_val += json[i];\n                }\n            }\n        }\n    }\n\n    return result;\n}\n\nstd::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {\n    std::vector<std::string> words;\n\n    // first split the text into words\n    {\n        std::string str = text;\n        std::string pat = R\"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s[:alpha:][:digit:]]+|\\s+(?!\\S)|\\s+)\";\n\n        std::regex re(pat);\n        std::smatch m;\n\n        while (std::regex_search(str, m, re)) {\n            for (auto x : m) {\n                words.push_back(x);\n            }\n            str = m.suffix();\n        }\n    }\n\n    // find the longest 
tokens that form the words:\n    std::vector<gpt_vocab::id> tokens;\n    for (const auto & word : words) {\n        if (word.size() == 0) continue;\n\n        int i = 0;\n        int n = word.size();\n        while (i < n) {\n            int j = n;\n            while (j > i) {\n                auto it = vocab.token_to_id.find(word.substr(i, j-i));\n                if (it != vocab.token_to_id.end()) {\n                    tokens.push_back(it->second);\n                    i = j;\n                    break;\n                }\n                --j;\n            }\n            if (i == n) {\n                break;\n            }\n            if (j == i) {\n                auto sub = word.substr(i, 1);\n                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {\n                    tokens.push_back(vocab.token_to_id.at(sub));\n                } else {\n                    fprintf(stderr, \"%s: unknown token '%s'\\n\", __func__, sub.data());\n                }\n                ++i;\n            }\n        }\n    }\n\n    return tokens;\n}\n\nstd::string regex_escape(const std::string &s) {\n  static const std::regex metacharacters(R\"([\\.\\^\\$\\-\\+\\(\\)\\[\\]\\{\\}\\|\\?\\*])\");\n  return std::regex_replace(s, metacharacters, \"\\\\$&\");\n}\n\nstd::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {\n    // Generate the subpattern from the special_tokens vector if it's not empty\n    if (!vocab.special_tokens.empty()) {\n        std::vector<gpt_vocab::id> out;\n        std::vector<std::string> chunks;\n        std::string str = text;\n        std::string special_tokens_subpattern;\n        for (const auto &token : vocab.special_tokens) {\n            if (!special_tokens_subpattern.empty()) {\n                special_tokens_subpattern += \"|\";\n            }\n            special_tokens_subpattern += regex_escape(token);\n        }\n        std::regex re(special_tokens_subpattern);\n        std::smatch m;\n      
  while (std::regex_search(str, m, re)) {\n            auto tok = vocab.token_to_id.find(m.str());\n            if (tok != vocab.token_to_id.end()) {\n                auto tokid = tok->second;\n                auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());\n                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());\n                out.push_back(tokid);\n                str = m.suffix();\n            } else {\n                // matched special token missing from vocab: stop to avoid an infinite loop,\n                // the remaining text is tokenized by the fallback below\n                break;\n            }\n        }\n        if (!str.empty()) {\n            auto tokrest = gpt_tokenize_inner(vocab, str);\n            out.insert(out.end(), tokrest.begin(), tokrest.end());\n        }\n        return out;\n    } else {\n        return gpt_tokenize_inner(vocab, text);\n    }\n}\n\n\nbool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {\n    printf(\"%s: loading vocab from '%s'\\n\", __func__, fname.c_str());\n\n    vocab.token_to_id = ::json_parse(fname);\n\n    for (const auto & kv : vocab.token_to_id) {\n        vocab.id_to_token[kv.second] = kv.first;\n    }\n\n    printf(\"%s: vocab size = %d\\n\", __func__, (int) vocab.token_to_id.size());\n\n    // print the vocabulary\n    //for (auto kv : vocab.token_to_id) {\n    //    printf(\"'%s' -> %d\\n\", kv.first.data(), kv.second);\n    //}\n\n    return true;\n}\n\ngpt_vocab::id gpt_sample_top_k_top_p(\n        const gpt_vocab & vocab,\n        const size_t actualVocabSize,\n        const int32_t * last_n_tokens_data,\n        int   last_n_tokens_size,\n        const std::vector<float> logits,\n        int    top_k,\n        double top_p,\n        double temp,\n        float repeat_penalty,\n        std::mt19937 & rng) {\n    int n_logits = actualVocabSize;\n\n    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);\n    const auto * plogits = logits.data() + logits.size() - n_logits;\n\n    std::vector<std::pair<double, gpt_vocab::id>> logits_id;\n    logits_id.reserve(n_logits);\n\n    {\n        const float scale = 1.0f/temp;\n    
    for (int i = 0; i < n_logits; ++i) {\n            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)\n            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main\n            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {\n                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability\n                if (plogits[i] < 0.0f) {\n                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));\n                } else {\n                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));\n                }\n            } else {\n                logits_id.push_back(std::make_pair(plogits[i]*scale, i));\n            }\n        }\n    }\n\n    // find the top K tokens\n    std::partial_sort(\n            logits_id.begin(),\n            logits_id.begin() + top_k, logits_id.end(),\n            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {\n        return a.first > b.first;\n    });\n\n    logits_id.resize(top_k);\n\n    double maxl = -INFINITY;\n    for (const auto & kv : logits_id) {\n        maxl = std::max(maxl, kv.first);\n    }\n\n    // compute probs for the top K tokens\n    std::vector<double> probs;\n    probs.reserve(logits_id.size());\n\n    double sum = 0.0;\n    for (const auto & kv : logits_id) {\n        double p = exp(kv.first - maxl);\n        probs.push_back(p);\n        sum += p;\n    }\n\n    // normalize the probs\n    for (auto & p : probs) {\n        p /= sum;\n    }\n\n    if (top_p < 1.0f) {\n        double cumsum = 0.0f;\n        for (int i = 0; i < top_k; i++) {\n            cumsum += probs[i];\n            if (cumsum >= top_p) {\n                top_k = i + 1;\n                probs.resize(top_k);\n                logits_id.resize(top_k);\n                break;\n            }\n        }\n\n      
  cumsum = 1.0/cumsum;\n        for (int i = 0; i < (int) probs.size(); i++) {\n            probs[i] *= cumsum;\n        }\n    }\n\n    //printf(\"\\n\");\n    //for (int i = 0; i < (int) probs.size(); i++) {\n    //    printf(\"%d: '%s' %f\\n\", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);\n    //}\n    //exit(0);\n\n    std::discrete_distribution<> dist(probs.begin(), probs.end());\n    int idx = dist(rng);\n\n    return logits_id[idx].second;\n}"
  },
  {
    "path": "gpt4all-backend/utils.h",
    "content": "// Various helper functions and utilities\n\n#pragma once\n\n#include <string>\n#include <map>\n#include <vector>\n#include <random>\n#include <thread>\n\n//\n// CLI argument parsing\n//\n\nstruct gpt_params {\n    int32_t seed      = -1; // RNG seed\n    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());\n    int32_t n_predict = 200; // new tokens to predict\n\n    // sampling parameters\n    int32_t top_k = 40;\n    float   top_p = 0.9f;\n    float   temp  = 0.9f;\n\n    int32_t n_batch = 8; // batch size for prompt processing\n\n    std::string model = \"models/gpt-2-117M/ggml-model.bin\"; // model path\n    std::string prompt;\n};\n\nbool gpt_params_parse(int argc, char ** argv, gpt_params & params);\n\nvoid gpt_print_usage(int argc, char ** argv, const gpt_params & params);\n\nstd::string gpt_random_prompt(std::mt19937 & rng);\n\n//\n// Vocab utils\n//\n\nstruct gpt_vocab {\n    using id    = int32_t;\n    using token = std::string;\n\n    std::map<token, id> token_to_id;\n    std::map<id, token> id_to_token;\n    std::vector<std::string> special_tokens;\n\n    void add_special_token(const std::string &token) {\n        special_tokens.push_back(token);\n    }\n};\n\nvoid replace(std::string & str, const std::string & needle, const std::string & replacement);\n\n// poor-man's JSON parsing\nstd::map<std::string, int32_t> json_parse(const std::string & fname);\n\n// split text into tokens\n//\n// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53\n//\n// Regex (Python):\n// r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\"\n//\n// Regex (C++):\n// R\"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s[:alpha:][:digit:]]+|\\s+(?!\\S)|\\s+)\"\n//\nstd::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);\n\n// load the tokens from encoder.json\nbool gpt_vocab_init(const std::string 
& fname, gpt_vocab & vocab);\n\n// sample next token given probabilities for each embedding\n//\n//   - consider only the top K tokens\n//   - from them, consider only the top tokens with cumulative probability > P\n//\n// TODO: not sure if this implementation is correct\n//\ngpt_vocab::id gpt_sample_top_k_top_p(\n        const gpt_vocab & vocab,\n        const size_t actualVocabSize,\n        const int32_t * last_n_tokens_data,\n        int   last_n_tokens_size,\n        const std::vector<float> logits,\n        int    top_k,\n        double top_p,\n        double temp,\n        float repeat_penalty,\n        std::mt19937 & rng);\n"
  },
  {
    "path": "prompt_template_sample.txt",
    "content": "### Instruction:\nThe prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.\n### Prompt:\n%1\n### Response:"
  },
  {
    "path": "src/CMakeLists.txt",
    "content": "\nadd_executable(chat chat.cpp header.h utils.h parse_json.h ../gpt4all-backend/llmodel_c.h)\ntarget_link_libraries(chat PRIVATE  llmodel llama)\n\n\n"
  },
  {
    "path": "src/chat.cpp",
    "content": "#include \"./header.h\"\n#include \"../gpt4all-backend/llmodel_c.h\"\n#include \"./utils.h\"\n#include \"./parse_json.h\"\n\n//////////////////////////////////////////////////////////////////////////\n////////////                    ANIMATION                     ////////////\n//////////////////////////////////////////////////////////////////////////\n\nstd::atomic<bool> stop_display{false}; \n\nvoid display_frames() {\n    const char* frames[] = {\".\", \":\", \"'\", \":\"};\n    int frame_index = 0;\n    ConsoleState con_st;\n    con_st.use_color = true;\n    while (!stop_display) {\n        set_console_color(con_st, PROMPT);\n        std::cerr << \"\\r\" << frames[frame_index % 4] << std::flush;\n        frame_index++;\n        set_console_color(con_st, DEFAULT);\n        if (!stop_display){\n            std::this_thread::sleep_for(std::chrono::milliseconds(200));\n            std::cerr << \"\\r\" << \" \" << std::flush;\n            std::cerr << \"\\r\" << std::flush;\n        }\n    }\n}\n\nvoid display_loading() {\n\n    while (!stop_display) {\n\n\n        for (int i=0; i < 14; i++){\n                fprintf(stdout, \".\");\n                fflush(stdout);\n                std::this_thread::sleep_for(std::chrono::milliseconds(200));\n                if (stop_display){ break; }\n        }\n        \n        std::cout << \"\\r\" << \"               \" << \"\\r\" << std::flush;\n    }\n    std::cout << \"\\r\" << \" \" << std::flush;\n\n}\n\n//////////////////////////////////////////////////////////////////////////\n////////////                   /ANIMATION                     ////////////\n//////////////////////////////////////////////////////////////////////////\n\n\n//////////////////////////////////////////////////////////////////////////\n////////////                 CHAT FUNCTIONS                   ////////////\n//////////////////////////////////////////////////////////////////////////\n\n#ifndef OLD_MACOS\n\nbool 
save_state_to_binary(llmodel_model& model, uint8_t *dest, chatParams& params, std::string &filename, uint64_t model_size) {\n\t\n  if (params.save_dir == \"\") {\n\tstd::filesystem::path directory_path(params.path+\"saves\");\n    if (!std::filesystem::is_directory(directory_path)) {\n        if (!std::filesystem::create_directory(directory_path)) {\n            std::cerr << \"Error creating directory\" << std::endl;\n            return false;\n        }\n    }\n  \tparams.save_dir = params.path+\"saves\";\n  }\n  \n  //sanity check that we're not trying to overwrite binaries of wrong size\n  //empty binaries are allowed, so are previous saves of same model type\n  if (std::filesystem::exists(params.save_dir+\"/\"+filename+\".bin\")) {\n  \tuint64_t file_size = std::filesystem::file_size(params.save_dir+\"/\"+filename+\".bin\");\n  \tif ((file_size == model_size) || (file_size == 0)) {\n  \t//continue\n  \t} else {\n  \t\tstd::cerr << \"You are trying to overwrite existing binary of different size! 
\" << params.save_dir+\"/\"+filename+\".bin\" << std::endl;\n  \t\treturn 0;\n  \t}\n  }\n  \n  // create an output file stream\n  std::ofstream outfile;\n  // open the file in binary mode\n  outfile.open(params.save_dir+\"/\"+filename+\".bin\", std::ios::binary);\n\n  // check if the file stream is open\n  if (!outfile.is_open()) {\n    std::cerr << \"Error opening file \" << params.save_dir+\"/\"+filename+\".bin\" << std::endl;\n    return false;\n  }\n\n  // write the model data to the file stream\n  uint64_t copied_bytes = llmodel_save_state_data(model, dest);\n  outfile.write(reinterpret_cast<char *>(dest), copied_bytes);\n\n  // close the file stream\n  outfile.close();\n  return true;\n}\n\nbool load_state_from_binary(llmodel_model& model, chatParams& params, std::string &filename, uint64_t model_size) {\n\n  if (params.save_dir == \"\") {\n  \tparams.save_dir = params.path+\"saves\";\n  }\n\n  //sanity check that we're not trying to load binaries of wrong size\n  //only binaries that are saves of same model type are allowed\n  if (std::filesystem::exists(params.save_dir+\"/\"+filename+\".bin\")) {\n  \tuint64_t file_size = std::filesystem::file_size(params.save_dir+\"/\"+filename+\".bin\");\n  \tif (file_size == model_size) {\n  \t//continue\n  \t} else {\n  \t\tstd::cerr << \"You are trying to load a binary of wrong size! 
\" << params.save_dir+\"/\"+filename+\".bin\" << std::endl;\n  \t\treturn 0;\n  \t}\n  }  \n\n  // create an input file stream\n  std::ifstream infile;\n  // open the file in binary mode\n  infile.open(params.save_dir+\"/\"+filename+\".bin\", std::ios::binary);\n\n  // check if the file stream is open\n  if (!infile.is_open()) {\n    std::cerr << \"Error opening file \" << params.save_dir+\"/\"+filename+\".bin\" << std::endl;\n    return false;\n  }\n\n  // get the size of the file\n  infile.seekg(0, std::ios::end);\n  uint64_t file_size = infile.tellg();\n  infile.seekg(0, std::ios::beg);\n\n  // allocate a buffer to hold the file data\n  uint8_t* buffer = nullptr;\n  try {\n    buffer = new uint8_t[file_size];\n  } catch (std::bad_alloc& ba) {\n    std::cerr << \"Failed to allocate buffer: \" << ba.what() << std::endl;\n    return false;\n  }\n\n  // read the file data into the buffer\n  infile.read(reinterpret_cast<char*>(buffer), file_size);\n  infile.close();\n\n  // restore the internal state of the model using the buffer data\n  llmodel_restore_state_data(model, buffer); \n  delete[] buffer;\n  return true;\n}\n\nbool save_ctx_to_binary(llmodel_prompt_context& prompt_context, chatParams& params, std::string &filename) {\n\t\n  if (params.save_dir == \"\") {\n    std::filesystem::path directory_path(params.path+\"saves\");\n    if (!std::filesystem::is_directory(directory_path)) {\n        if (!std::filesystem::create_directory(directory_path)) {\n            std::cerr << \"Error creating directory\" << std::endl;\n            return false;\n        }\n    }\n  params.save_dir = params.path+\"saves\";\n  }\n\n  std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + \".ctx\");\n  std::string fullPath = filePath.string();\n\t\n    // Open the binary file for writing\n  FILE* file = fopen(fullPath.c_str(), \"wb\");\n    if (!file) {\n        std::cerr << \"Error opening file: \" << fullPath << std::endl;\n        
return false;\n    }\n\n    // Write the struct to the file using fwrite\n    fwrite(&prompt_context, sizeof(prompt_context), 1, file);\n\n    // Close the file\n    fclose(file);\n    return true;\n}\n\nllmodel_prompt_context load_ctx_from_binary(chatParams& params, std::string &filename) {\n\n    if (params.save_dir == \"\") {\n        params.save_dir = params.path+\"saves\";\n    }\n\t  // Construct the file path with home directory expansion\n    std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + \".ctx\");\n    std::string fullPath = filePath.string();\n\n    // Open the binary file for reading\n    FILE* file = fopen(fullPath.c_str(), \"rb\");\n        if (!file) {\n        std::cerr << \"Error opening file: \" << fullPath << std::endl;\n        exit(EXIT_FAILURE);\n    }\n\n    // Read the struct from the file using fread\n    llmodel_prompt_context prompt_context;\n    fread(&prompt_context, sizeof(prompt_context), 1, file);\n\n    // Close the file\n    fclose(file);\n\n    return prompt_context;\n}\n#endif\n\nstd::string get_input(ConsoleState& con_st, std::string& input, chatParams &params, llmodel_prompt_context &prompt_context, llmodel_model& model) {\n    set_console_color(con_st, USER_INPUT);\n\n    std::cout << \"\\n> \";\n    std::getline(std::cin, input);\n    \n    std::istringstream iss(input);\n    std::string input1, input2;\n    std::getline(iss, input1, ' ');\n    std::getline(iss, input2, ' ');\n    set_console_color(con_st, DEFAULT);\n    \n    if (input == \"/reset\") {\n    \t//reset the logits, tokens and past conversation\n        prompt_context.logits = params.logits;\n        prompt_context.logits_size = params.logits_size;\n        prompt_context.tokens = params.tokens;\n        prompt_context.tokens_size = params.tokens_size;\n        prompt_context.n_past = params.n_past;\n        prompt_context.n_ctx = params.n_ctx;\n        \n        //get new input using recursion\n        
set_console_color(con_st, PROMPT);\n        std::cout << \"Chat context reset.\";\n        return get_input(con_st, input, params, prompt_context, model);\n    }\n    #ifndef OLD_MACOS\n    if ((input == \"/save\" || input1 == \"/save\") && (params.no_saves == false)) {\n    \tstd::string filename = params.save_name;\n    \tif (input2 != \"\" && (input2.find(\"..\") == std::string::npos) ) { filename = input2; }\n    \t\n    \t\n        bool success1 = false;\n        bool success2 = false;\n    \t\n    \tuint64_t model_size = llmodel_get_state_size(model);\n\t\tuint8_t *dest = new uint8_t[model_size];\n    \tsuccess1 = save_state_to_binary(model, dest, params, filename, model_size);\n    \tdelete[] dest;\n    \tsuccess2 = save_ctx_to_binary(prompt_context, params, filename);\n    \t\n    \t//get new input using recursion\n        set_console_color(con_st, PROMPT);\n        if (success1 && success2) { std::cout << \"Model data saved to: \" << params.save_dir+\"/\"+filename+\".bin\" << \" size: \" << floor(model_size/10000000)/100.0 << \" Gb\"; }\n        return get_input(con_st, input, params, prompt_context, model);\n    }\n    \n    if ((input == \"/load\" || input1 == \"/load\") && (params.no_saves == false)) {\n    \tstd::string filename = params.save_name;\n    \tif (input2 != \"\" && (input2.find(\"..\") == std::string::npos) ) { filename = input2; }\n    \t//reset the logits, tokens and past conversation\n    \tfree(prompt_context.logits);\n    \tfree(prompt_context.tokens);\n        prompt_context.logits = params.logits;\n        prompt_context.logits_size = params.logits_size;\n        prompt_context.tokens = params.tokens;\n        prompt_context.tokens_size = params.tokens_size;\n        prompt_context.n_past = params.n_past;\n        prompt_context.n_ctx = params.n_ctx;\n        \n        bool success = false;\n        \n        uint64_t model_size = llmodel_get_state_size(model);\n    \tprompt_context = load_ctx_from_binary(params, filename);\n    
\tsuccess = load_state_from_binary(model, params, filename, model_size);\n    \tmodel_size = llmodel_get_state_size(model);\n    \t\n    \t//get new input using recursion\n        set_console_color(con_st, PROMPT);\n        if (success) { std::cout << \"Model data loaded from: \" << params.save_dir+\"/\"+filename+\".bin\" << \" size: \" << floor(model_size/10000000)/100.0 << \" Gb\"; }\n        return get_input(con_st, input, params, prompt_context, model);\n    }\n    #endif\n    \n    if (input == \"/help\"){\n    \tset_console_color(con_st, DEFAULT);\n    \tstd::cout << std::endl;\n        char emptystring[] = \"\";\n        char* emptyargv[] = {emptystring};\n        int emptyargc = sizeof(emptyargv) / sizeof(char*);\n    \tprint_usage(emptyargc, emptyargv, params);\n        return get_input(con_st, input, params, prompt_context, model);\n    }\n    \n    if (input == \"/about\"){\n    \tset_console_color(con_st, DEFAULT);\n    \tstd::cout << std::endl;\n    \tprint_version();\n    \treturn get_input(con_st, input, params, prompt_context, model);\n    }\n       \n    if (input == \"exit\" || input == \"quit\" || input == \"/exit\" || input == \"/quit\") {       \n        llmodel_model_destroy(model);\n        exit(0);\n    }\n    \n    return input;\n}\n\nstd::string hashstring = \"\";\nstd::string answer = \"\";\n\n//////////////////////////////////////////////////////////////////////////\n////////////                /CHAT FUNCTIONS                   ////////////\n//////////////////////////////////////////////////////////////////////////\n\n\n\n//////////////////////////////////////////////////////////////////////////\n////////////                  MAIN PROGRAM                    ////////////\n//////////////////////////////////////////////////////////////////////////\n\n\nint main(int argc, char* argv[]) {\n\n\n    ConsoleState con_st;\n    con_st.use_color = true;\n    set_console_color(con_st, DEFAULT);\n\n    set_console_color(con_st, PROMPT);\n    
set_console_color(con_st, BOLD);\n    std::cout << APPNAME;\n    set_console_color(con_st, DEFAULT);\n    set_console_color(con_st, PROMPT);\n    std::cout << \" (v. \" << VERSION << \")\";\n    set_console_color(con_st, DEFAULT);\n    std::cout << \"\" << std::endl;\n    check_avx_support_at_startup();\n\n    chatParams params;\n    //convert the default model path into Windows format if on WIN32\n    #ifdef _WIN32\n        std::filesystem::path p(params.model);\n        params.model = p.make_preferred().string();\n    #endif\n \n    //get all parameters from cli arguments or json\n    parse_params(argc, argv, params);\n    \n    //Create a prompt_context and copy all params from chatParams to prompt_context\n    llmodel_prompt_context prompt_context = {\n     .logits = params.logits,\n     .logits_size = params.logits_size,\n     .tokens = params.tokens,\n     .tokens_size = params.tokens_size,\n     .n_past = params.n_past,\n     .n_ctx = params.n_ctx,\n     .n_predict = params.n_predict,\n     .top_k = params.top_k,\n     .top_p = params.top_p,\n     .temp = params.temp,\n     .n_batch = params.n_batch,\n     .repeat_penalty = params.repeat_penalty,  \n     .repeat_last_n = params.repeat_last_n,\n     .context_erase = params.context_erase,\n    }; \n\n    //Subprocess signal handling\n    #ifdef _WIN32\n        SetConsoleCtrlHandler(console_ctrl_handler, TRUE);\n    #else\n        signal(SIGHUP, handle_sighup);\n    #endif\n \n    //////////////////////////////////////////////////////////////////////////\n    ////////////                 LOAD THE MODEL                   ////////////\n    ////////////////////////////////////////////////////////////////////////// \n\n    //animation\n    std::future<void> future;\n    stop_display = true;\n    if(params.use_animation) {stop_display = false; future = std::async(std::launch::async, display_loading);}\n\n    //handle stderr for now\n    //this is just to prevent printing unnecessary details during model loading.\n   
 int stderr_copy = dup(fileno(stderr));\n    #ifdef _WIN32\n        std::freopen(\"NUL\", \"w\", stderr);\n    #else\n        std::freopen(\"/dev/null\", \"w\", stderr);\n    #endif\n\n\n    llmodel_model model = llmodel_model_create(params.model.c_str());\n    std::cout << \"\\r\" << APPNAME << \": loading \" << params.model.c_str()  << std::endl;\n    \n    //bring back stderr for now\n    dup2(stderr_copy, fileno(stderr));\n    close(stderr_copy);\n    \n    \n\n    //check if model is loaded\n    auto check_model = llmodel_loadModel(model, params.model.c_str());\n\n    if (check_model == false) {\n        if(params.use_animation) {\n            stop_display = true;\n            future.wait();\n            stop_display= false;\n        }\n\n        std::cerr << \"Error loading: \" << params.model.c_str() << std::endl;\n        std::cout << \"Press any key to exit...\" << std::endl;\n        std::cin.get();\n        return 0;\n    } else {\n        if(params.use_animation) {\n            stop_display = true;\n            future.wait();\n        }\n        std::cout << \"\\r\" << APPNAME << \": done loading!\" << std::flush;   \n    }\n    //////////////////////////////////////////////////////////////////////////\n    ////////////                /LOAD THE MODEL                   ////////////\n    ////////////////////////////////////////////////////////////////////////// \n\n\n\n    set_console_color(con_st, PROMPT);\n    std::cout << \"\\n\" << params.prompt.c_str() << std::endl;\n    set_console_color(con_st, DEFAULT);\n\n    //load prompt template from file instead\n    if (params.load_template != \"\") {\n        std::tie(params.default_prefix, params.default_header, params.default_footer) = read_prompt_template_file(params.load_template);\n    }\n    \n    //load chat log from a file\n    if (params.load_log != \"\") {\n    \tif (params.prompt == \"\") {\n        \tparams.prompt = params.default_prefix + read_chat_log(params.load_log) + 
params.default_header;\n        } else {\n        \tparams.prompt = params.default_prefix + read_chat_log(params.load_log) + params.default_header + params.prompt;\n        }\n    } else {\n    \tparams.prompt = params.default_prefix + params.default_header + params.prompt;\n    }\n    \n    //////////////////////////////////////////////////////////////////////////\n    ////////////            PROMPT LAMBDA FUNCTIONS               ////////////\n    //////////////////////////////////////////////////////////////////////////\n\n\n    auto prompt_callback = [](int32_t token_id)  {\n\t    // You can handle prompt here if needed\n\t    return true;\n\t};\n\n\n    auto response_callback = [](int32_t token_id, const char *responsechars) {\n    \n        if (!(responsechars == nullptr || responsechars[0] == '\\0')) {\n\t    // stop the animation, printing response\n        if (stop_display == false) {\n\t        stop_display = true;\n            std::this_thread::sleep_for(std::chrono::milliseconds(200));\n            std::cerr << \"\\r\" << \" \" << std::flush;\n            std::cerr << \"\\r\" << std::flush;\n            if (answer != \"\") {std::cout << answer;}\n        }\n            \n\t\t\tstd::cout << responsechars << std::flush;\n\t        answer += responsechars;\n\t    }\n\t            \n\t    return true;\n\t};\n\t\n    auto recalculate_callback = [](bool is_recalculating) {\n        // You can handle recalculation requests here if needed\n        return is_recalculating;\n    };\n\n\n    //////////////////////////////////////////////////////////////////////////\n    ////////////         PROMPT TEXT AND GET RESPONSE             ////////////\n    //////////////////////////////////////////////////////////////////////////\n\n    llmodel_setThreadCount(model, params.n_threads);\n\n    std::string input = \"\";\n\n    //main chat loop.\n    if (!params.no_interactive && !sighup_received) {\n        input = get_input(con_st, input, params, prompt_context, model);\n\n  
      //Interactive mode. We have a prompt.\n        if (params.prompt != \"\") {\n            if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); }\n            if (params.b_token != \"\"){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} }\n            llmodel_prompt(model, (params.prompt + \" \" + input + params.default_footer).c_str(),\n            prompt_callback, response_callback, recalculate_callback, &prompt_context);\n            if (params.e_token != \"\"){std::cout << params.e_token; answer = answer + params.e_token; }\n            if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; }\n            if (params.save_log != \"\"){ save_chat_log(params.save_log, (params.prompt + \" \" + input + params.default_footer).c_str(), answer.c_str()); }\n\n        //Interactive mode. Else get prompt from input.\n        } else {\n            if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); }\n            if (params.b_token != \"\"){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} }\n            llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(),\n            prompt_callback, response_callback, recalculate_callback, &prompt_context);\n            if (params.e_token != \"\"){std::cout << params.e_token; answer = answer + params.e_token; }\n            if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; }\n            if (params.save_log != \"\"){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); }\n        }\n        //Interactive and continuous mode. 
Get prompt from input.\n\n        while (!params.run_once && !sighup_received) {\n            answer = \"\"; //New prompt. We stored previous answer in memory so clear it.\n            input = get_input(con_st, input, params, prompt_context, model);\n            if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); }\n            if (params.b_token != \"\"){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} }\n            llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), \n            prompt_callback, response_callback, recalculate_callback, &prompt_context);\n            if (params.e_token != \"\"){std::cout << params.e_token; answer = answer + params.e_token; }\n            if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; }\n            if (params.save_log != \"\"){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); }\n\n        }\n\n    //No-interactive mode. 
Get the answer once from prompt and print it.\n    } else {\n        if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); }\n        if (params.b_token != \"\"){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} }\n        llmodel_prompt(model, (params.prompt + params.default_footer).c_str(), \n        prompt_callback, response_callback, recalculate_callback, &prompt_context);\n        if (params.e_token != \"\"){std::cout << params.e_token; answer = answer + params.e_token; }\n        if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; }\n        if (params.save_log != \"\"){ save_chat_log(params.save_log, (params.prompt + params.default_footer).c_str(), answer.c_str()); }\n        std::cout << std::endl;\n    }\n\n\n    set_console_color(con_st, DEFAULT);\n    llmodel_model_destroy(model);\n    return 0;\n}\n"
  },
  {
    "path": "src/header.h",
    "content": "#pragma once\n\n#ifndef HEADER_H\n#define HEADER_H\n\n\n\n#include <cstdio>\n\n#include <cassert>\n#include <cmath>\n#include <string>\n#include <vector>\n#include <random>\n#include <thread>\n#include <iostream>\n#include <map>\n#include <sstream>\n#include <fstream>\n#include <regex>\n#include <cstring>\n#include <functional>\n#include <csignal>\n\n//For paths\n//Commented out to support really old xcode\n#ifndef OLD_MACOS\n    #include <filesystem>\n#endif\n\n//For Windows MSVC compilation\n#if defined(_WIN32) && defined(_MSC_VER)\n    #define WIN32_LEAN_AND_MEAN\n    #ifndef NOMINMAX\n        #define NOMINMAX\n    #endif\n    #include <windows.h>\n    #include <io.h>\n    #include <stdio.h>\n#else\n    #include <unistd.h>\n#endif\n\n\n#include <typeinfo>\n#include <future>\n#include <chrono>\n#include <atomic>\n#include <fcntl.h>\n#include \"config.h\"\n\n#include <stdint.h>\n#include <stddef.h>\n#include <stdbool.h>\n\n\n// chatParams contains all the parameters you can import from json or with cli arguments\n// it also contains the initial value for PromptContext\nstruct chatParams {\n        //std::vector<float> logits,          // logits of current context \n        //std::vector<int32_t> tokens,        // current tokens in the context window \n\n        //These are in the prompt context, maybe add as parameters too.    
\n        float *logits = nullptr;            // logits of current context\n        size_t logits_size = 0;             // the size of the raw logits vector\n        int32_t *tokens = nullptr;          // current tokens in the context window\n        size_t tokens_size = 0;             // the size of the raw tokens vector\n        int32_t n_past = 0;                 // number of tokens in past conversation\n        //Parameters below you can import from json or with cli arguments\n        int32_t n_ctx = 0;                  // number of tokens possible in context window\n        int32_t n_predict = 200;            // number of tokens to predict\n        int32_t top_k = 40;                 // top k logits to sample from\n        float top_p = 0.95;                 // nucleus sampling probability threshold\n        float temp = 0.28;                  // temperature to adjust model's output distribution\n        int32_t n_batch = 9;                // number of predictions to generate in parallel\n        float repeat_penalty = 1.1;         // penalty factor for repeated tokens\n        int32_t repeat_last_n = 64;         // last n tokens to penalize\n        float context_erase = 0.75;         // percent of context to erase if we exceed the context window\n        //Parameters below are not inside prompt_context, but handled separately\n        int32_t seed = -1; \n        int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); \n        std::string model = \"./models/ggml-vicuna-13b-1.1-q4_2.bin\";\n        std::string prompt = \"\";\n        //template prefix, header, and footer\n        std::string default_prefix = \"### Instruction:\\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.\";\n        std::string default_header = \"\\n### Prompt: \";\n        std::string default_footer = \"\\n### Response: \";\n        //You can toggle chat interactivity 
with these parameters\n        bool no_interactive = false;\n        bool use_animation = true;\n        bool run_once = false;\n        bool no_saves = false;\n        std::string b_token = \"\";           //beginning wrap token\n        std::string e_token = \"\";           //ending wrap token\n        std::string load_template = \"\";     //template file location\n        std::string load_json = \"\";         //json file location\n        std::string save_log = \"\";          //saved chat log file location\n        std::string load_log = \"\";          //loaded chat log file location\n        std::string save_name = \"model_state\";  //model state binary name\n        std::string save_dir  = \"\";         //saves directory name\n        //program binary path\n        std::string path = \"\";\n};\n\nenum ConsoleColor {\n    DEFAULT = 0,\n    PROMPT,\n    USER_INPUT,\n    BOLD\n};\n\nstruct ConsoleState {\n    bool use_color = false;\n    ConsoleColor color = DEFAULT;\n};\n\nstd::string APPNAME = \"LlamaGPTJ-chat\";\n\n//utils.h functions\nvoid set_console_color(ConsoleState &con_st, ConsoleColor color);\nstd::string random_prompt(int32_t seed);\nvoid print_usage(int argc, char** argv, const chatParams& params);\nbool parse_params(int argc, char** argv, chatParams& params);\n\n//parse_json.h functions\nvoid get_params_from_json(chatParams& params);\n\n#endif"
  },
  {
    "path": "src/parse_json.h",
    "content": "#pragma once\n\n#ifndef PARSE_JSON_H\n#define PARSE_JSON_H\n\n#include \"header.h\" \n\n//helper function to convert string to bool\nbool stob(const std::string& str) {\n    std::string lowerStr = str;\n    std::transform(str.begin(), str.end(), lowerStr.begin(), ::tolower);\n    if (lowerStr == \"true\") {\n        return true;\n    } else if (lowerStr == \"false\") {\n        return false;\n    } else {\n        throw std::invalid_argument(\"Invalid boolean string\");\n    }\n}\n\nstd::string readFile(const std::string& filename) {\n    std::ifstream inFile(filename);\n    if (!inFile) {\n        std::cerr << \"Unable to open file: \" << filename << std::endl;\n        return \"\";\n    }\n    std::stringstream buffer;\n    buffer << inFile.rdbuf();\n    inFile.close();\n    return buffer.str();\n}\n\nstd::map<std::string, std::string> parse_json_string(const std::string& jsonString) {\n    std::map<std::string, std::string> resultMap;\n    std::regex pattern(\"\\\"([^\\\"]+)\\\":\\\\s*([^\\\"]+|\\\"[^\\\"]+\\\")\");\n    std::smatch match;\n    std::string::const_iterator searchStart(jsonString.cbegin());\n\n    while (std::regex_search(searchStart, jsonString.cend(), match, pattern)) {\n        resultMap[match[1]] = match[2];\n        searchStart = match.suffix().first;\n    }\n    return resultMap;\n}\n\nstd::string removeQuotes(const std::string& input) {\n    std::string result = input;\n    result.erase(std::remove(result.begin(), result.end(), '\\\"'), result.end());\n    return result;\n}\n\nvoid get_params_from_json(chatParams& params) {\n    std::map<std::string, std::string> parsed = parse_json_string(readFile(params.load_json));\n\n    if (parsed.find(\"top_p\") != parsed.end())\n        params.top_p = std::stof(parsed[\"top_p\"]);\n    if (parsed.find(\"top_k\") != parsed.end())\n        params.top_k = std::stoi(parsed[\"top_k\"]);\n    if (parsed.find(\"temp\") != parsed.end())\n        params.temp = std::stof(parsed[\"temp\"]);\n    
if (parsed.find(\"n_predict\") != parsed.end())\n        params.n_predict = std::stoi(parsed[\"n_predict\"]);\n    if (parsed.find(\"n_batch\") != parsed.end())\n        params.n_batch = std::stoi(parsed[\"n_batch\"]);\n    if (parsed.find(\"n_ctx\") != parsed.end())\n        params.n_ctx = std::stoi(parsed[\"n_ctx\"]); \n    if (parsed.find(\"seed\") != parsed.end())\n        params.seed = std::stoi(parsed[\"seed\"]);\n    if (parsed.find(\"threads\") != parsed.end())\n        params.n_threads = std::stoi(parsed[\"threads\"]);\n    if (parsed.find(\"model\") != parsed.end())\n        params.model = removeQuotes(parsed[\"model\"]);\n\n    if (parsed.find(\"prompt\") != parsed.end())\n        params.prompt = removeQuotes(parsed[\"prompt\"]);\n    if (parsed.find(\"no-interactive\") != parsed.end())\n        params.no_interactive = stob(removeQuotes(parsed[\"no-interactive\"]));    \n    if (parsed.find(\"run-once\") != parsed.end())\n        params.run_once = stob(removeQuotes(parsed[\"run-once\"]));        \n    if (parsed.find(\"no-animation\") != parsed.end())\n        params.use_animation = !stob(removeQuotes(parsed[\"no-animation\"]));\n    if (parsed.find(\"no-saves\") != parsed.end())\n        params.no_saves = stob(removeQuotes(parsed[\"no-saves\"]));\n\n    if (parsed.find(\"repeat_penalty\") != parsed.end())\n        params.repeat_penalty = std::stof(parsed[\"repeat_penalty\"]);\n    if (parsed.find(\"repeat_last_n\") != parsed.end())\n        params.repeat_last_n = std::stoi(parsed[\"repeat_last_n\"]);\n    if (parsed.find(\"context_erase\") != parsed.end())\n        params.context_erase = std::stof(parsed[\"context_erase\"]);\n    if (parsed.find(\"b_token\") != parsed.end())\n        params.b_token = removeQuotes(parsed[\"b_token\"]);\n    if (parsed.find(\"e_token\") != parsed.end())\n        params.e_token = removeQuotes(parsed[\"e_token\"]);              \n    if (parsed.find(\"load_template\") != parsed.end())\n        params.load_template = 
removeQuotes(parsed[\"load_template\"]);   \n    if (parsed.find(\"save_log\") != parsed.end())\n        params.save_log = removeQuotes(parsed[\"save_log\"]);\n    if (parsed.find(\"load_log\") != parsed.end())\n        params.load_log = removeQuotes(parsed[\"load_log\"]);\n    if (parsed.find(\"save_dir\") != parsed.end())\n        params.save_dir = removeQuotes(parsed[\"save_dir\"]);\n    if (parsed.find(\"save_name\") != parsed.end())\n        params.save_name = removeQuotes(parsed[\"save_name\"]);}\n\n\n#endif"
  },
  {
    "path": "src/utils.h",
    "content": "#pragma once\n\n#ifndef UTILS_H\n#define UTILS_H\n\n#include \"header.h\" \n\n//Need this for Windows colors\n#ifdef _WIN32\n    #include <windows.h> \n#endif\n\nbool containsSubstring(const std::string &str, const std::string &substr) {\n    return str.find(substr) != std::string::npos;\n}\n\nvoid check_avx_support_at_startup() {\n#if defined(__x86_64__) || defined(__i386__)\n    const bool avx(__builtin_cpu_supports(\"avx\"));\n    const bool avx2(__builtin_cpu_supports(\"avx2\"));\n    const bool avx512(__builtin_cpu_supports(\"avx512f\"));\n    const bool fma(__builtin_cpu_supports(\"fma\"));\n    if (avx512 && avx && avx2 && fma) {std::cout << \"Your computer supports AVX512\" << std::endl;}\n    else if (avx && avx2 && fma)      {std::cout << \"Your computer supports AVX2\" << std::endl;}\n    else if (avx)                     {std::cout << \"Your computer only supports AVX1\" << std::endl;}\n    else                    {std::cout << \"Your computer does not support AVX1 or AVX2\\nThe program will likely not run.\" << std::endl;} \n    #ifdef OLD_MACOS\n    std::cout << \"Compiled with OLD_MACOS flag. 
/save and /load features turned off.\" << std::endl;\n    #endif\n#endif\n}\n\n\n//////////////////////////////////////////////////////////////////////////\n////////////                 SIGNAL HANDLING                  ////////////\n//////////////////////////////////////////////////////////////////////////\n\n\n\nvolatile sig_atomic_t sighup_received = 0;\n\nvoid handle_sighup(int signal) {\n    #ifndef _WIN32\n    if (signal == SIGHUP) {\n        sighup_received = 1;\n    }\n    #endif\n}\n\n#ifdef _WIN32\nBOOL WINAPI console_ctrl_handler(DWORD ctrl_type) {\n    switch (ctrl_type) {\n        case CTRL_C_EVENT:\n        case CTRL_CLOSE_EVENT:\n            sighup_received = 1;\n            return TRUE;\n        default:\n            return FALSE;\n    }\n}\n#endif\n\n//////////////////////////////////////////////////////////////////////////\n////////////                /SIGNAL HANDLING                  ////////////\n//////////////////////////////////////////////////////////////////////////\n\n//////////////////////////////////////////////////////////////////////////\n////////////            READ PROMPT TEMPLATE FILE             ////////////\n//////////////////////////////////////////////////////////////////////////\n\n//This is a bit messy function but it should parse the template file into prefix, header, and footer.\n//Chat will then prompt the model with (prefix + header + input/prompt +  footer)\nstd::tuple<std::string, std::string, std::string> read_prompt_template_file(const std::string& file_path) {\n    std::string prefix, header, footer;\n    std::ifstream file(file_path);\n\n    std::vector<std::string> lines;\n    std::string line;\n\n    //store all lines of header template into a vector\n    if (file.is_open()) {\n        while (std::getline(file, line)) {\n            lines.push_back(line);\n        }\n        file.close();\n    } else {\n        std::cerr << \"Unable to open the prompt template file.\" << std::endl;\n        std::cerr << \"Reverting 
to default prompt template.\" << std::endl;\n        return std::make_tuple(\"### Instruction:\\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.\", \"\\n### Prompt: \", \"\\n### Response: \"); \n    }\n\n    //find line containing %1 and store its index.\n    int input_index = 0; //default to 0 so a template without %1 is handled safely instead of reading an uninitialized value\n    for (size_t i = 0; i < lines.size(); ++i) {\n        if (lines[i].find(\"%1\") != std::string::npos) {\n            input_index = i;\n        }\n    }\n    //Special case of having only %1 in template file.\n    if (input_index == 0) {\n        header = \"\";\n        prefix = \"\";\n        footer = \"\";\n    //If there is only 1 line above %1, that will be ### header.\n    } else if (input_index == 1) {\n        header = lines[0];\n        prefix = \" \";\n    } else {\n        \n        //Put lines above the header-line into prefix.\n        prefix = lines[0];\n        for (size_t i = 1; i < input_index-1; ++i) {\n            prefix = prefix + \"\\n\" + lines[i];\n        }\n        prefix = prefix  + \" \";\n\n        //store header-line (line above input-line)\n        header = \"\\n\" + lines[input_index-1] + \" \";\n\n        //Put lines below the input-line into footer.\n        footer = \"\\n\";\n        for (size_t i = input_index+1; i < lines.size(); ++i) {\n             footer = footer + lines[i]+\" \";\n        }\n    }\n\n    return std::make_tuple(prefix, header, footer);\n}\n\n\n//////////////////////////////////////////////////////////////////////////\n////////////           /READ PROMPT TEMPLATE FILE             ////////////\n//////////////////////////////////////////////////////////////////////////\n\nvoid save_chat_log(std::string save_log, std::string prompt, std::string answer) {\n  std::ofstream logfile(save_log, std::ios::app);\n  if (logfile.is_open()) {\n    logfile << prompt;\n    logfile << answer+\"\\n\";\n    logfile.close();\n    }\n}\n\n\nstd::string 
read_chat_log(std::string load_log) {\n\n    std::ifstream ifs(load_log);\n    std::string content((std::istreambuf_iterator<char>(ifs)),\n                         std::istreambuf_iterator<char>());\n    return content;\n}\n\nstd::string pathname_directory(const std::string &pathname)\n{\n    std::size_t len = pathname.find_last_of(\"/\\\\\");\n    return len == std::string::npos ? \"\": pathname.substr(0, len);\n}\n\n\nvoid set_console_color(ConsoleState &con_st, ConsoleColor color) {\n    if (con_st.use_color && con_st.color != color) {\n        //Windows handles colors differently.\n        #ifdef _WIN32\n          WORD windows_colors[] = {\n            7, 14, 10, 15\n        };          \n            HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);\n            SetConsoleTextAttribute(hConsole, windows_colors[color]);\n        #else\n        //ANSI colors, works for unix.\n        const char* ansi_colors[] = {\n            //DEFAULT, PROMPT, USER_INPUT,   BOLD\n            //default, yellow, bright_green, bold\n            \"\\x1b[0m\", \"\\x1b[33m\", \"\\x1b[1m\\x1b[32m\", \"\\x1b[1m\"\n        };\n        printf(\"%s\", ansi_colors[color]);\n        #endif\n        con_st.color = color;\n    }\n}\n\nstd::string random_prompt(int32_t seed) {\n    const std::vector<std::string> prompts = {\n        \"So\", \"Once upon a time\", \"When\", \"The\", \"After\", \"If\", \"import\", \"He\", \"She\", \"They\"\n    };\n\n    std::mt19937 rng(seed);\n    return prompts[rng() % prompts.size()];\n}\n\nvoid print_version() {\n\t//Version/about page\n\t//Contains License information for distributions in binary form\nstd::string mit_license = R\"(MIT License\n\nBig thanks to contributors, testers, and commenters on Github.\nAnd to you, dear user!\n\nHappy chatting! 
:)\n)\";\nstd::cout << \"\\n\\n\" << APPNAME << \" version \" << VERSION << \"\\n\\n\" << \"Made by kuvaus\" << \"\\n\\n\" << mit_license << std::endl; \n//std::cout << mit_license << std::endl; \n}\n\nvoid print_usage(int argc, char** argv, const chatParams& params) {\n    // Print usage information\n    fprintf(stderr, \"usage: %s [options]\\n\", argv[0]);\n    fprintf(stderr, \"\\n\");\n    fprintf(stderr, \"A simple chat program for GPT-J, LLaMA, and MPT models.\\n\");\n    fprintf(stderr, \"You can set specific initial prompt with the -p flag.\\n\");\n    fprintf(stderr, \"Runs default in interactive and continuous mode.\\n\");\n    fprintf(stderr, \"Type '/reset' to reset the chat context.\\n\");\n    fprintf(stderr, \"Type '/save','/load' to save network state into a binary file.\\n\");\n    fprintf(stderr, \"Type '/save NAME','/load NAME' to rename saves. Default: --save_name NAME.\\n\");\n    fprintf(stderr, \"Type '/help' to show this help dialog.\\n\");\n    fprintf(stderr, \"Type 'quit', 'exit' or, 'Ctrl+C' to quit.\\n\");\n    fprintf(stderr, \"\\n\");\n    fprintf(stderr, \"options:\\n\");\n    fprintf(stderr, \"  -h, --help            show this help message and exit\\n\");\n    fprintf(stderr, \"  -v, --version         show version and license information\\n\");\n    fprintf(stderr, \"  --run-once            disable continuous mode\\n\");\n    fprintf(stderr, \"  --no-interactive      disable interactive mode altogether (uses given prompt only)\\n\");\n    fprintf(stderr, \"  --no-animation        disable chat animation\\n\");\n    fprintf(stderr, \"  --no-saves            disable '/save','/load' functionality\\n\");\n    fprintf(stderr, \"  -s SEED, --seed SEED  RNG seed for --random-prompt (default: -1)\\n\");\n    fprintf(stderr, \"  -t N, --threads    N  number of threads to use during computation (default: %d)\\n\", params.n_threads);\n    fprintf(stderr, \"  -p PROMPT, --prompt PROMPT\\n\");\n    fprintf(stderr, \"                        prompt 
to start generation with (default: empty)\\n\");\n    fprintf(stderr, \"  --random-prompt       start with a randomized prompt.\\n\");\n    fprintf(stderr, \"  -n N, --n_predict  N  number of tokens to predict (default: %d)\\n\", params.n_predict);\n    fprintf(stderr, \"  --top_k            N  top-k sampling (default: %d)\\n\", params.top_k);\n    fprintf(stderr, \"  --top_p            N  top-p sampling (default: %.1f)\\n\", params.top_p);\n    fprintf(stderr, \"  --temp             N  temperature (default: %.1f)\\n\", params.temp);\n    fprintf(stderr, \"  --n_ctx            N  number of tokens in context window (default: %d)\\n\", params.n_ctx);\n    fprintf(stderr, \"  -b N, --batch_size N  batch size for prompt processing (default: %d)\\n\", params.n_batch);\n    fprintf(stderr, \"  --repeat_penalty   N  repeat_penalty (default: %.1f)\\n\", params.repeat_penalty);\n    fprintf(stderr, \"  --repeat_last_n    N  last n tokens to penalize  (default: %d)\\n\", params.repeat_last_n);\n    fprintf(stderr, \"  --context_erase    N  percent of context to erase  (default: %.1f)\\n\", params.context_erase);\n    fprintf(stderr, \"  --b_token             optional beginning wrap token for response (default: empty)\\n\");\n    fprintf(stderr, \"  --e_token             optional end wrap token for response (default: empty)\\n\");\n    fprintf(stderr, \"  -j,   --load_json FNAME\\n\");\n    fprintf(stderr, \"                        load options instead from json at FNAME (default: empty/no)\\n\");\n    fprintf(stderr, \"  --load_template   FNAME\\n\");\n    fprintf(stderr, \"                        load prompt template from a txt file at FNAME (default: empty/no)\\n\");\n    fprintf(stderr, \"  --save_log        FNAME\\n\");\n    fprintf(stderr, \"                        save chat log to a file at FNAME (default: empty/no)\\n\");\n    fprintf(stderr, \"  --load_log        FNAME\\n\");\n    fprintf(stderr, \"                        load chat log from a file at FNAME (default: 
empty/no)\\n\");\n    fprintf(stderr, \"  --save_dir        DIR\\n\");\n    fprintf(stderr, \"                        directory for saves (default: %s/saves)\\n\", pathname_directory(argv[0]).c_str());\n    fprintf(stderr, \"  --save_name       NAME\\n\");\n    fprintf(stderr, \"                        save/load model state binary at save_dir/NAME.bin (current: %s)\\n\", params.save_name.c_str());\n    fprintf(stderr, \"                        context is saved to save_dir/NAME.ctx (current: %s)\\n\", params.save_name.c_str());\n    fprintf(stderr, \"  -m FNAME, --model FNAME\\n\");\n    fprintf(stderr, \"                        model path (current: %s)\\n\", params.model.c_str());\n    fprintf(stderr, \"\\n\");\n}\n\nbool parse_params(int argc, char** argv, chatParams& params) { \n\n    // Parse command-line arguments\n    for (int i = 1; i < argc; i++) {\n        std::string arg = argv[i];\n\n        if (arg == \"-j\" || arg == \"--load_json\") {\n            params.load_json = argv[++i];\n            if (!params.load_json.empty()) {\n                std::cout << APPNAME << \": parsing options from json: \" << params.load_json << std::endl;\n                get_params_from_json(params);\n            } else {\n                std::cout << APPNAME << \": trying to parse options from json but got empty filename.\" << std::endl;\n            }\n        } else if (arg == \"--run-once\") {\n            params.run_once = true;\n        } else if (arg == \"--no-interactive\") {\n            params.no_interactive = true;\n        } else if (arg == \"--no-animation\") {\n            params.use_animation = false;\n        } else if (arg == \"--no-saves\") {\n            params.no_saves = true;\n        } else if (arg == \"-s\" || arg == \"--seed\") {\n            params.seed = static_cast<int32_t>(std::stoi(argv[++i]));\n        } else if (arg == \"-t\" || arg == \"--threads\") {\n            params.n_threads = static_cast<int32_t>(std::stoi(argv[++i]));\n        } else if 
(arg == \"-p\" || arg == \"--prompt\") {\n            params.prompt = argv[++i];\n        } else if (arg == \"--random-prompt\") {\n            params.prompt = random_prompt(params.seed);\n        } else if (arg == \"-n\" || arg == \"--n_predict\") {\n            params.n_predict = static_cast<int32_t>(std::stoi(argv[++i]));\n        } else if (arg == \"--top_k\") {\n            params.top_k = static_cast<int32_t>(std::stoi(argv[++i]));\n        } else if (arg == \"--top_p\") {\n            params.top_p = static_cast<float>(std::stof(argv[++i]));\n        } else if (arg == \"--temp\") {\n            params.temp = static_cast<float>(std::stof(argv[++i]));\n        } else if (arg == \"-b\" || arg == \"--batch_size\") {\n            params.n_batch = static_cast<int32_t>(std::stoi(argv[++i]));\n        } else if (arg == \"--n_ctx\") {\n            params.n_ctx = static_cast<int>(std::stoi(argv[++i]));\n        } else if (arg == \"--repeat_penalty\") {\n            params.repeat_penalty = static_cast<float>(std::stof(argv[++i]));\n        } else if (arg == \"--repeat_last_n\") {\n            params.repeat_last_n = static_cast<int>(std::stoi(argv[++i]));\n        } else if (arg == \"--context_erase\") {\n            params.context_erase = static_cast<float>(std::stof(argv[++i]));\n        } else if (arg == \"--b_token\") {\n            params.b_token = argv[++i];\n        } else if (arg == \"--e_token\") {\n            params.e_token = argv[++i];\n        } else if (arg == \"--load_template\") {\n            params.load_template = argv[++i];\n        } else if (arg == \"--save_log\") {\n            params.save_log = argv[++i];\n        } else if (arg == \"--load_log\") {\n            params.load_log = argv[++i];\n        } else if (arg == \"--save_dir\") {\n            params.save_dir = argv[++i];    \n        } else if (arg == \"--save_name\") {\n            params.save_name = argv[++i];\n        } else if (arg == \"-m\" || arg == \"--model\") {\n            
params.model = argv[++i];\n        } else if (arg == \"-h\" || arg == \"--help\") {\n            print_usage(argc, argv, params);\n            exit(0);\n        } else if (arg == \"-v\" || arg == \"--version\") {\n            print_version();\n            exit(0);\n        } else {\n            fprintf(stderr, \"error: unknown argument: %s\\n\", arg.c_str());\n            print_usage(argc, argv, params);\n            exit(0);\n        }\n    }\n    //get path to program\n    params.path = pathname_directory(argv[0]);\n\tparams.path.append(\"/\");\n\n\t\n    return true;\n}\n\n\n#endif\n"
  }
]