Repository: kuvaus/LlamaGPTJ-chat Branch: main Commit: e022976f0460 Files: 32 Total size: 196.2 KB Directory structure: gitextract_vuj2yh60/ ├── .github/ │ └── workflows/ │ ├── cmake-release.yml │ ├── cmake.yml │ └── cmake_branch.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake/ │ └── config.h.in ├── gpt4all-backend/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gptj/ │ │ └── placeholder │ ├── gptj.cpp │ ├── gptj.h │ ├── llama/ │ │ └── placeholder │ ├── llamamodel.cpp │ ├── llamamodel.h │ ├── llmodel.h │ ├── llmodel_c.cpp │ ├── llmodel_c.h │ ├── mpt.cpp │ ├── mpt.h │ ├── scripts/ │ │ └── convert_mpt_hf_to_ggml.py │ ├── utils.cpp │ └── utils.h ├── prompt_template_sample.txt └── src/ ├── CMakeLists.txt ├── chat.cpp ├── header.h ├── parse_json.h └── utils.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/cmake-release.yml ================================================ name: CMake-release on: push: tags: - 'v*' env: BUILD_TYPE: Release permissions: contents: read actions: write jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - ubuntu-latest - macos-latest - windows-latest instructions: - avx - avx2 steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} - name: Prepare binary run: | if ("${{ matrix.instructions }}" -eq "avx"){ if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\chat.exe chat.exe mv chat.exe chat-windows-latest-avx.exe shasum -a 256 -b chat-windows-latest-avx.exe > shasum-chat-windows-latest-avx.sha256 } else { cp ${{github.workspace}}/build/bin/chat chat mv chat chat-${{ matrix.os }}-${{ matrix.instructions }} shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256 } } else { if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\chat.exe chat.exe mv chat.exe chat-windows-latest-avx2.exe shasum -a 256 -b chat-windows-latest-avx2.exe > shasum-chat-windows-latest-avx2.sha256 } else { cp ${{github.workspace}}/build/bin/chat chat mv chat chat-${{ matrix.os }}-${{ matrix.instructions }} shasum -a 256 -b chat-${{ matrix.os }}-${{ matrix.instructions }} > shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}.sha256 } } shell: pwsh - name: Upload binary uses: actions/upload-artifact@v2 with: name: chat-${{ matrix.os }}-${{ matrix.instructions }} path: chat-${{ matrix.os }}-${{ matrix.instructions }}* - name: Upload shasums uses: actions/upload-artifact@v2 
with: name: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }} path: shasum-chat-${{ matrix.os }}-${{ matrix.instructions }}* release: needs: build runs-on: ubuntu-latest steps: - name: Create Release id: create_release uses: actions/create-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref }} release_name: Release ${{ github.ref }} draft: false prerelease: false - name: Download artifacts uses: actions/download-artifact@v2 with: path: artifacts - name: Upload artifacts uses: softprops/action-gh-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref_name }} name: Release ${{ github.ref_name }} draft: false prerelease: false files: | artifacts/**/* # # This part filters the CHANGELOG.md using python # Then it adds FILTERED_CHANGELOG.md to release notes # - name: Checkout repository uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: python-version: 3.x - name: Filter CHANGELOG.md uses: jannekem/run-python-script-action@v1 with: script: | filtered_lines = [] start_processing = False with open('CHANGELOG.md', 'r') as file: for line in file: if line.startswith("#### [v"): if start_processing: break else: file.readline() file.readline() start_processing = True continue if start_processing: filtered_lines.append(line) with open('FILTERED_CHANGELOG.md', 'w') as file: file.writelines(filtered_lines) - name: Generate release notes uses: softprops/action-gh-release@v1 env: GITHUB_TOKEN: ${{ secrets.DEPLOY_KEY }} with: tag_name: ${{ github.ref_name }} name: Release ${{ github.ref_name }} body_path: FILTERED_CHANGELOG.md draft: false prerelease: false ================================================ FILE: .github/workflows/cmake.yml ================================================ name: CMake on: push: branches: [ "main" ] env: BUILD_TYPE: Release jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - ubuntu-latest - macos-latest - windows-latest instructions: - avx - avx2 steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} ================================================ FILE: .github/workflows/cmake_branch.yml ================================================ name: CMake on: push: branches: - '*' - '!main' env: BUILD_TYPE: Release jobs: build: runs-on: ${{ matrix.config.os }} strategy: fail-fast: false matrix: config: - { os: 'ubuntu-latest', instructions: 'avx' } - { os: 'ubuntu-latest', instructions: 'avx2' } - { os: 'macos-latest', instructions: 'avx' } - { os: 'macos-latest', instructions: 'avx2' } - { os: 'windows-latest', build: 'msvc', instructions: 'avx' } - { os: 'windows-latest', 
build: 'msvc', instructions: 'avx2' } - { os: 'windows-latest', build: 'mingw', instructions: 'avx' } - { os: 'windows-latest', build: 'mingw', instructions: 'avx2' } steps: - uses: actions/checkout@v3 with: submodules: recursive - name: Configure CMake if: matrix.build == 'msvc' run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} - name: Build if: matrix.build == 'msvc' run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test if: matrix.build == 'msvc' working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} - name: Prepare binary if: matrix.build == 'msvc' run: | if ("${{ matrix.os }}" -eq "windows-latest") { cp ${{github.workspace}}\build\bin\Release\chat.exe chat-msvc.exe mv chat-msvc.exe chat-windows-latest-msvc.exe } shell: pwsh - name: Setup MinGW if: matrix.os == 'windows-latest' run: | choco install mingw -y -libwinpthread echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Configure CMake run: | if ("${{ matrix.os }}" -eq "windows-latest") { $env:PATH += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin" cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} -G "MinGW Makefiles" } elseif ("${{ matrix.arch }}" -eq "aarch64") { } else { cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DAVX2=${{ matrix.instructions == 'avx2' && 'ON' || 'OFF' }} } shell: pwsh - name: Build run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} - name: Test working-directory: ${{github.workspace}}/build run: ctest -C ${{env.BUILD_TYPE}} ================================================ FILE: .gitignore ================================================ # Folders build/ tmp/ # Visual Studio Code .vscode # MacOS .DS_Store # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app .cache ================================================ FILE: .gitmodules ================================================ [submodule "llama.cpp"] path = gpt4all-backend/llama.cpp url = https://github.com/manyoso/llama.cpp #url = https://github.com/ggerganov/llama.cpp ================================================ FILE: CHANGELOG.md ================================================ ## Changelog #### [Upcoming](https://github.com/kuvaus/LlamaGPTJ-chat/compare/v0.3.0...HEAD) #### [v0.3.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.3.0) > 26 June 2023 - Add this [changelog](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/CHANGELOG.md) :) - Add sha256 hashes on release so you can verify the binaries - All binaries are automatically generated with Github actions - Add signal handling for SIGHUP (macOS, Linux) and CTRL_CLOSE_EVENT (Windows) to fix issue [`#16`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/16) - This allows you to run chat as a subprocess. The chat subprocess now quits properly if parent app is closed. - Version information - Fix segfault on`/help` #### [v0.2.9](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.9) > 22 June 2023 - [Pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) from [@154pinkchairs](https://github.com/154pinkchairs/) merged. Thanks. 
:) - The pull request [`#18`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/18) has the two fixes below: - Properly handle file paths including tildes [`18e9f36`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/18e9f36) - Handle buffer allocation errors [`6800dfb`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/6800dfb) - Better debug mode compilation. May fix issue [`#9`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/9) #### [v0.2.8](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.8) > 16 June 2023 - Adds `--save_dir` option so you can change save directory location - Default location is `./saves` on the same directory as the chat binary - See issue [`#13`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/13) for more details #### [v0.2.7](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.7) > 15 June 2023 - Fixes for old macOS. - Use `-DOLD_MACOS=ON` option when compiling with CMake. - Tested to compile on High Sierra and Xcode 10 #### [v0.2.6](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.6) > 14 June 2023 - You can name saves with `./save NAME` and `./load NAME` - You can toggle saving and loading off with `--no-saves` flag #### [v0.2.5](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.5) > 13 June 2023 - Save/load state with `./save` and `./load` - Reset context with `./reset`, help with `./help` - Makes a `./saves` folder - Note that a single save can take up to 2Gb - You can wrap the AI response with tokens using `--b_token` and `--e_token` - See issue [`#12`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/12) for more details #### [v0.2.4](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.4) > 5 June 2023 - Fix when using json to specify names for logfiles. Fixes issue [`#11`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/11) #### [v0.2.3](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.3) > 4 June 2023 - Fix said ability to reset context... :) #### [v0.2.2](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.2) > 3 June 2023 - Ability to reset context #### [v0.2.1](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.1) > 30 May 2023 - Save and load chat logs - Use `--save_log` and `--load_log` - AVX512 option for compilation `-DAVX512=ON` #### [v0.2.0](https://github.com/kuvaus/LlamaGPTJ-chat/releases/tag/v0.2.0) > 17 May 2023 - Update gpt4all backend to v0.1.1 [`61a963a`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/61a963a3d220ef157a8504ddde708f33dc2946eb) - Full Windows Visual Studio compatibility. Finally fixes issue [`#1`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/1) - Builds from source on aarch64 Linux. Fixes issue [`#3`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/3) - Full MPT support. Fixes issue [`#4`](https://github.com/kuvaus/LlamaGPTJ-chat/issues/4) #### v0.1.9 > 16 May 2023 - Code cleaning and reordering - `llmodel_create_model()` function #### v0.1.8 > 13 May 2023 - Add support for MPT models - Uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all) #### v0.1.7 > 12 May 2023 - First [pull request](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2) from [@itz-coffee](https://github.com/itz-coffee/) merged. Thanks. 
:) - The pull request [`#2`](https://github.com/kuvaus/LlamaGPTJ-chat/pull/2) adds the feature below: - Add --no-animation flag [`fdc2ac3`](https///github.com/kuvaus/LlamaGPTJ-chat/commit/fdc2ac3) - Support for old macOS #### v0.1.6 > 4 May 2023 - Parse parameters from json files - Use `-j FNAME` or`--load_json FNAME` #### v0.1.5 > 3 May 2023 - MinGW compilation on Windows #### v0.1.4 > 1 May 2023 - v0.1.4 had no tags - It was part of `cmake-release.yml` rewrite to enable MinGW [`e7e1ebf`](https://github.com/kuvaus/LlamaGPTJ-chat/commit/e7e1ebf97d696d069bbc0ae7f0ed078739fb6642) #### v0.1.3 > 1 May 2023 - Add loading of prompt template files - Use `--load_template` for loading - See `prompt_template_sample.txt` for a sample #### v0.1.2 > 30 April 2023 - Automatic memory handling for the model #### v0.1.1 > 29 April 2023 - Windows compilation fixes #### v0.1.0 > 29 April 2023 - Before this, progress was in [GPTJ-chat](https://github.com/kuvaus/GPTJ-chat/) and [Llama-chat](https://github.com/kuvaus/Llama-chat/) - First version ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required (VERSION 3.2) if(APPLE) option(OLD_MACOS "Using old macos" OFF) option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON) if(BUILD_UNIVERSAL AND NOT OLD_MACOS) # Build a Universal binary on macOS set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE) else() # Build for the host architecture on macOS set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE) endif() if (OLD_MACOS) add_definitions(-DOLD_MACOS) endif() endif() project(LlamaGPTJ-chat) set(VERSION_MAJOR 0) set(VERSION_MINOR 3) set(VERSION_PATCH 0) set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") set(CMAKE_EXPORT_COMPILE_COMMANDS "on") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLAMA_STANDALONE ON) else() set(LLAMA_STANDALONE OFF) endif() # options option(AVX2 "enable AVX2" ON) option(AVX512 "enable AVX512" OFF) option(LLAMA_AVX "llama: enable AVX" ON) option(LLAMA_AVX2 "llama: enable AVX2" ${AVX2}) option(LLAMA_AVX512 "llama: enable AVX512" ${AVX512}) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" ${AVX512}) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" ${AVX512}) option(LLAMA_FMA "llama: enable FMA" ${AVX2}) # sanitizers #set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE) if(APPLE) elseif(UNIX) if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") option(AVX2 "enable AVX2" OFF) option(LLAMA_AVX "llama: enable AVX" OFF) option(LLAMA_AVX2 "llama: enable AVX2" OFF) option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) set(BUILD_SHARED_LIBS ON FORCE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mno-outline-atomics") endif() endif() if (GGML_SANITIZE_THREAD) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") endif() if (GGML_SANITIZE_ADDRESS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") endif() if (GGML_SANITIZE_UNDEFINED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") endif() if (AVX512) 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") endif() #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math") #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native") # dependencies set(CMAKE_C_STANDARD 17) set(CMAKE_CXX_STANDARD 20) find_package(Threads REQUIRED) # main # Include static libs for compatibility: if(APPLE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-search_paths_first -lSystem") elseif(UNIX) if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static") endif() elseif(WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -static") endif() # Generate a header file with the version number configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in" "${CMAKE_CURRENT_BINARY_DIR}/config.h" ) # Include the binary directory for the generated header file include_directories("${CMAKE_CURRENT_BINARY_DIR}") add_subdirectory(gpt4all-backend/llama.cpp) add_subdirectory(gpt4all-backend) add_subdirectory(src) ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Jukka Maatta Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![CMake](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml/badge.svg)](https://github.com/kuvaus/LlamaGPTJ-chat/actions/workflows/cmake.yml) # LlamaGPTJ-chat Simple command line chat program for [GPT-J](https://en.wikipedia.org/wiki/GPT-J), [LLaMA](https://en.wikipedia.org/wiki/LLaMA) and [MPT](https://www.mosaicml.com/blog/mpt-7b) models written in C++. Based on [llama.cpp](https://github.com/ggerganov/llama.cpp) and uses [gpt4all-backend](https://github.com/nomic-ai/gpt4all) for full compatibility. LlamaGPTJ-chat demo > **Warning** > Very early progress, might have bugs # Table of contents * [Installation](#installation) * [Usage](#usage) * [GPT-J, LLaMA, and MPT models](#gpt-j-llama-and-mpt-models) * [Detailed command list](#detailed-command-list) * [Useful features](#useful-features) * [License](#license) ## Installation Since the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. The [Releases](https://github.com/kuvaus/LlamaGPTJ-chat/releases) link has ready-made binaries. 
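Each release binary is published alongside a sha256 checksum file (see the release workflow and the v0.3.0 changelog entry), so a downloaded asset can be verified before running it. A minimal sketch, assuming you grabbed the Linux AVX2 asset and its checksum file; substitute the asset name matching your OS and instruction set:

```sh
# Verify the downloaded binary against the published checksum file
shasum -a 256 -c shasum-chat-ubuntu-latest-avx2.sha256
# Make it executable and run it
chmod +x chat-ubuntu-latest-avx2
./chat-ubuntu-latest-avx2 -h
```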
AVX2 is faster and works on most newer computers. If you run the program, it will check and print if your computer has AVX2 support. ### Download ```sh git clone --recurse-submodules https://github.com/kuvaus/LlamaGPTJ-chat cd LlamaGPTJ-chat ``` You need to also download a model file, see [supported models](#gpt-j-llama-and-mpt-models) for details and links. ### Build Since the program is made using c++ it should build and run on most Linux, MacOS and Windows systems. On most systems, you only need this to build: ```sh mkdir build cd build cmake .. cmake --build . --parallel ``` > **Note** > > If you have an old processor, you can turn AVX2 instructions OFF in the build step with `-DAVX2=OFF` flag. > > If you have a new processor, you can turn AVX512 instructions ON in the build step with `-DAVX512=ON` flag. > > On old macOS, set `-DBUILD_UNIVERSAL=OFF` to make the build x86 only instead of the universal Intel/ARM64 binary. > On really old macOS, set `-DOLD_MACOS=ON`. This disables `/save` and `/load` but compiles on old Xcode. > > On Windows you can now use Visual Studio (MSVC) or MinGW. If you want MinGW build instead, set `-G "MinGW Makefiles"`. > > On ARM64 Linux there are no ready-made binaries, but you can now build it from source. ## Usage After compiling, the binary is located at: ```sh build/bin/chat ``` But you're free to move it anywhere. Simple command for 4 threads to get started: ```sh ./chat -m "/path/to/modelfile/ggml-vicuna-13b-1.1-q4_2.bin" -t 4 ``` or ```sh ./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 ``` Happy chatting! ## GPT-J, LLaMA, and MPT models Current backend supports the GPT-J, LLaMA and MPT models. ### GPT-J model You need to download a GPT-J model first. Here are direct links to models: >- The default version is **v1.0**: [ggml-gpt4all-j.bin](https://gpt4all.io/models/ggml-gpt4all-j.bin) >- At the time of writing the newest is **1.3-groovy**: [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) They're around 3.8 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on GPT-J models from [gpt4all.io](https://gpt4all.io/) or [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) github. ### LLaMA model Alternatively you need to download a LLaMA model first. The original weights are for research purposes and you can apply for access [here](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/). Below are direct links to derived models: >- Vicuna 7b **v1.1**: [ggml-vicuna-7b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-7b-1.1-q4_2.bin) >- Vicuna 13b **v1.1**: [ggml-vicuna-13b-1.1-q4_2.bin](https://gpt4all.io/models/ggml-vicuna-13b-1.1-q4_2.bin) >- GPT-4-All **l13b-snoozy**: [ggml-gpt4all-l13b-snoozy.bin](https://gpt4all.io/models/ggml-gpt4all-l13b-snoozy.bin) The LLaMA models are quite large: the 7B parameter versions are around 4.2 Gb and 13B parameter 8.2 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on LLaMA models from the [whitepaper](https://arxiv.org/abs/2302.13971) or META AI [website](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/). ### MPT model You can also download and use an MPT model instead. 
Here are direct links to MPT-7B models: >- MPT-7B base model pre-trained by Mosaic ML: [ggml-mpt-7b-base.bin](https://gpt4all.io/models/ggml-mpt-7b-base.bin) >- MPT-7B instruct model trained by Mosaic ML: [ggml-mpt-7b-instruct.bin](https://gpt4all.io/models/ggml-mpt-7b-instruct.bin) >- Non-commercial MPT-7B chat model trained by Mosaic ML: [ggml-mpt-7b-chat.bin](https://gpt4all.io/models/ggml-mpt-7b-chat.bin) They're around 4.9 Gb each. The chat program stores the model in RAM on runtime so you need enough memory to run. You can get more details on MPT models from MosaicML [website](https://www.mosaicml.com/blog/mpt-7b) or [mosaicml/llm-foundry](https://github.com/mosaicml/llm-foundry) github. ## Detailed command list You can view the help and full parameter list with: ` ./chat -h ` ```sh usage: ./bin/chat [options] A simple chat program for GPT-J, LLaMA, and MPT models. You can set specific initial prompt with the -p flag. Runs default in interactive and continuous mode. Type '/reset' to reset the chat context. Type '/save','/load' to save network state into a binary file. Type '/save NAME','/load NAME' to rename saves. Default: --save_name NAME. Type '/help' to show this help dialog. Type 'quit', 'exit' or, 'Ctrl+C' to quit. options: -h, --help show this help message and exit -v, --version show version and license information --run-once disable continuous mode --no-interactive disable interactive mode altogether (uses given prompt only) --no-animation disable chat animation --no-saves disable '/save','/load' functionality -s SEED, --seed SEED RNG seed for --random-prompt (default: -1) -t N, --threads N number of threads to use during computation (default: 4) -p PROMPT, --prompt PROMPT prompt to start generation with (default: empty) --random-prompt start with a randomized prompt. -n N, --n_predict N number of tokens to predict (default: 200) --top_k N top-k sampling (default: 40) --top_p N top-p sampling (default: 0.9) --temp N temperature (default: 0.9) --n_ctx N number of tokens in context window (default: 0) -b N, --batch_size N batch size for prompt processing (default: 20) --repeat_penalty N repeat_penalty (default: 1.1) --repeat_last_n N last n tokens to penalize (default: 64) --context_erase N percent of context to erase (default: 0.8) --b_token optional beginning wrap token for response (default: empty) --e_token optional end wrap token for response (default: empty) -j, --load_json FNAME load options instead from json at FNAME (default: empty/no) --load_template FNAME load prompt template from a txt file at FNAME (default: empty/no) --save_log FNAME save chat log to a file at FNAME (default: empty/no) --load_log FNAME load chat log from a file at FNAME (default: empty/no) --save_dir DIR directory for saves (default: ./saves) --save_name NAME save/load model state binary at save_dir/NAME.bin (current: model_state) context is saved to save_dir/NAME.ctx (current: model_state) -m FNAME, --model FNAME model path (current: ./models/ggml-vicuna-13b-1.1-q4_2.bin) ``` ## Useful features Here are some handy features and details on how to achieve them using command line options. ### Save/load chat log and read output from other apps By default, the program prints the chat to standard (stdout) output, so if you're including the program into your app, it only needs to read stdout. You can also save the whole chat log to a text file with `--save_log` option. 
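A minimal sketch of that save/load round trip across two sessions (model path and log filename are placeholders):

```sh
# First session: chat as usual and write the full chat log to a text file
./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 --save_log chatlog.txt
# Later session: start with the previous conversation loaded back in
./chat -m "/path/to/modelfile/ggml-gpt4all-j-v1.3-groovy.bin" -t 4 --load_log chatlog.txt
```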
There's an elementary way to remember your past conversation by simply loading the saved chat log with `--load_log` option when you start a new session. ### Run the program once without user interaction If you only need the program to run once without any user interactions, one way is to set prompt with `-p "prompt"` and using `--no-interactive` and `--no-animation` flags. The program will read the prompt, print the answer, and close. ### Add AI personalities and characters If you want a personality for your AI, you can change `prompt_template_sample.txt` and use `--load_template` to load the modified file. The only constant is that your input during chat will be put on the `%1` line. Instructions, prompt, response, and everything else can be replaced any way you want. Having different `personality_template.txt` files is an easy way to add different AI characters. With _some_ models, giving both AI and user names instead of `Prompt:` and `Response:`, can make the conversation flow more naturally as the AI tries to mimic a conversation between two people. ### Ability to reset chat context You can reset the chat at any time during chatting by typing `/reset` in the input field. This will clear the AI's memory of past conversations, logits, and tokens. You can then start the chat from a blank slate without having to reload the whole model again. ### Load all parameters using JSON You can also fetch parameters from a json file with `--load_json "/path/to/file.json"` flag. Different models might perform better or worse with different input parameters so using json files is a handy way to store and load all the settings at once. The JSON file loader is designed to be simple in order to prevent any external dependencies, and as a result, the JSON file must follow a specific format. Here is a simple example: ```javascript {"top_p": 1.0, "top_k": 50400, "temp": 0.9, "n_batch": 9} ``` This is useful when you want to store different temperature and sampling settings. And a more detailed one: ```javascript { "top_p": 1.0, "top_k": 50400, "temp": 0.9, "n_batch": 20, "threads": 12, "prompt": "Once upon a time", "load_template": "/path/to/prompt_template_sample.txt", "model": "/path/to/ggml-gpt4all-j-v1.3-groovy.bin", "no-interactive": "true" } ``` This one loads the prompt from the json, uses a specific template, and runs the program once in no-interactive mode so user does not have to press any input. ## License This project is licensed under the MIT [License](https://github.com/kuvaus/LlamaGPTJ-chat/blob/main/LICENSE) ================================================ FILE: cmake/config.h.in ================================================ #ifndef CONFIG_H #define CONFIG_H #define VERSION "@VERSION_MAJOR@" "." "@VERSION_MINOR@" "." "@VERSION_PATCH@" #endif // CONFIG_H ================================================ FILE: gpt4all-backend/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.16) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) #if(APPLE) # option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON) # if(BUILD_UNIVERSAL) # # Build a Universal binary on macOS # # This requires that the found Qt library is compiled as Universal binaries. 
# set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE) # else() # # Build for the host architecture on macOS # set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE) # endif() #endif() # Include the binary directory for the generated header file #include_directories("${CMAKE_CURRENT_BINARY_DIR}") #set(LLMODEL_VERSION_MAJOR 0) #set(LLMODEL_VERSION_MINOR 1) #set(LLMODEL_VERSION_PATCH 1) #set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}") #project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C) set(CMAKE_CXX_STANDARD_REQUIRED ON) #set(LLAMA_BUILD_EXAMPLES ON CACHE BOOL "llama: build examples" FORCE) #set(BUILD_SHARED_LIBS ON FORCE) set(CMAKE_VERBOSE_MAKEFILE ON) if (GPT4ALL_AVX_ONLY) set(LLAMA_AVX2 OFF CACHE BOOL "llama: enable AVX2" FORCE) set(LLAMA_F16C OFF CACHE BOOL "llama: enable F16C" FORCE) set(LLAMA_FMA OFF CACHE BOOL "llama: enable FMA" FORCE) endif() #add_subdirectory(llama.cpp) add_library(llmodel gptj.h gptj.cpp llamamodel.h llamamodel.cpp llama.cpp/examples/common.cpp llmodel.h llmodel_c.h llmodel_c.cpp mpt.h mpt.cpp utils.h utils.cpp ) target_link_libraries(llmodel PRIVATE llama) #set_target_properties(llmodel PROPERTIES # VERSION ${PROJECT_VERSION} # SOVERSION ${PROJECT_VERSION_MAJOR}) #set(COMPONENT_NAME_MAIN ${PROJECT_NAME}) #set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install) ================================================ FILE: gpt4all-backend/README.md ================================================ # GPT4ALL Backend This directory contains the C/C++ model backend used by GPT4All for inference on the CPU. This backend acts as a universal library/wrapper for all models that the GPT4All ecosystem supports. Language bindings are built on top of this universal library. The native GPT4all Chat application directly uses this library for all inference. # What models are supported by the GPT4All ecosystem? Currently, there are three different model architectures that are supported: 1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b) 2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama) 3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b) # Why so many different architectures? What differentiates them? One of the major differences is license. Currently, the LLAMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base models allow commercial usage. In the early advent of the recent explosion of activity in open source local models, the llama models have generally been seen as performing better, but that is changing quickly. Every week - even every day! - new models are released with some of the GPTJ and MPT models competitive in performance/quality with LLAMA. What's more, there are some very nice architectural innovations with the MPT models that could lead to new performance/quality gains. # How does GPT4All make these models available for CPU inference? By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of this library. The original github repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a LLAMA based version [here](https://github.com/ggerganov/llama.cpp). 
Currently, this backend is using the latter as a submodule. # Does that mean GPT4All is compatible with all llama.cpp models and vice versa? Unfortunately, no, for three reasons: 1. The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has introduced [a compatibility-breaking](https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1) re-quantization method recently. This is a breaking change that renders all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp since that change. 2. The GPT4All backend has the llama.cpp submodule specifically pinned to a version prior to this breaking change. 3. The GPT4All backend currently supports MPT based models as an added feature. Neither llama.cpp nor the original ggml repo support this architecture as of this writing; however, efforts are underway to make MPT available in the ggml repo, which you can follow [here](https://github.com/ggerganov/ggml/pull/145). # What is being done to make them more compatible? A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check, or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue. # What about GPU inference? In newer versions of llama.cpp, there has been some added support for NVIDIA GPUs for inference. We're investigating how to incorporate this into our downloadable installers. # Ok, so bottom line... how do I make my model on huggingface compatible with the GPT4All ecosystem right now? 1. Check to make sure the huggingface model is available in one of our three supported architectures 2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLAMA based models 3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory # Check back for updates as we'll try to keep this updated as things change! 
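Since the compatibility story above hinges on which llama.cpp revision the backend is pinned to, a quick way to see the pinned commit is a generic git command (not specific to this project), run from the repository root:

```sh
# Print the commit the llama.cpp submodule is currently pinned to
git submodule status gpt4all-backend/llama.cpp
```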
================================================ FILE: gpt4all-backend/gptj/placeholder ================================================ ================================================ FILE: gpt4all-backend/gptj.cpp ================================================ #include "gptj.h" #include "llama.cpp/ggml.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include // default hparams (GPT-J 6B) static const size_t MB = 1024*1024; struct gptj_hparams { int32_t n_vocab = 50400; int32_t n_ctx = 2048; int32_t n_embd = 4096; int32_t n_head = 16; int32_t n_layer = 28; int32_t n_rot = 64; int32_t f16 = 1; }; struct gptj_layer { // normalization struct ggml_tensor * ln_1_g; struct ggml_tensor * ln_1_b; // attention struct ggml_tensor * c_attn_q_proj_w; struct ggml_tensor * c_attn_k_proj_w; struct ggml_tensor * c_attn_v_proj_w; struct ggml_tensor * c_attn_proj_w; // ff struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_proj_w; struct ggml_tensor * c_mlp_proj_b; }; struct gptj_buffer { uint8_t * addr = NULL; size_t size = 0; void resize(size_t size) { delete[] addr; addr = new uint8_t[size]; this->size = size; } ~gptj_buffer() { fflush(stdout); delete[] addr; } }; struct gptj_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; struct ggml_context * ctx = NULL; gptj_buffer buf; int n; // number of tokens currently in the cache ~gptj_kv_cache() { if (ctx) { ggml_free(ctx); } } }; struct gptj_model { gptj_hparams hparams; // normalization struct ggml_tensor * ln_f_g; struct ggml_tensor * ln_f_b; struct ggml_tensor * wte; // position embedding struct ggml_tensor * lmh_g; // language model head struct ggml_tensor * lmh_b; // language model bias std::vector layers; // key + value memory struct gptj_kv_cache kv_self; // struct ggml_context * ctx; std::map tensors; gptj_buffer buf; ~gptj_model() { if (ctx) { ggml_free(ctx); } } }; static bool kv_cache_init( const struct gptj_hparams & hparams, struct gptj_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int64_t n_mem = (int64_t)n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); if (!cache.ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); return false; } cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } // load the model's weights from a stream bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); // verify magic { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); if (magic != 0x67676d6c) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } } // load hparams { auto & hparams = model.hparams; fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) 
&hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); fin.read((char *) &hparams.f16, sizeof(hparams.f16)); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_rot = %d\n", __func__, hparams.n_rot); printf("%s: f16 = %d\n", __func__, hparams.f16); } // load vocab { int32_t n_vocab = 0; fin.read((char *) &n_vocab, sizeof(n_vocab)); if (n_vocab != model.hparams.n_vocab) { fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); return false; } std::string word; for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); word.resize(len); fin.read((char *) word.data(), len); vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } } // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation ggml_type wtype = GGML_TYPE_COUNT; switch (model.hparams.f16) { case 0: wtype = GGML_TYPE_F32; break; case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; case 5: wtype = GGML_TYPE_Q4_2; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", __func__, fname.c_str(), model.hparams.f16); return false; } } const ggml_type wtype2 = GGML_TYPE_F32; auto & ctx = model.ctx; size_t ctx_size = 0; { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v ctx_size += (5 + 10*n_layer)*256; // object overhead printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { struct ggml_init_params params = { .mem_size = ctx_size, .mem_buffer = NULL, }; model.ctx = ggml_init(params); if (!model.ctx) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } } // prepare memory for the weights { const auto & hparams 
= model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; model.layers.resize(n_layer); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); // map by name model.tensors["transformer.wte.weight"] = model.wte; model.tensors["transformer.ln_f.weight"] = model.ln_f_g; model.tensors["transformer.ln_f.bias"] = model.ln_f_b; model.tensors["lm_head.weight"] = model.lmh_g; model.tensors["lm_head.bias"] = model.lmh_b; for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // map by name model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b; model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b; model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w; model.tensors["transformer.h." 
+ std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b; } } // key + value memory { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); ggml_free(ctx); return false; } const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v); printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } // load weights { int n_tensors = 0; size_t total_size = 0; printf("%s: ", __func__); while (true) { int32_t n_dims; int32_t length; int32_t ftype; fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast(&length), sizeof(length)); fin.read(reinterpret_cast(&ftype), sizeof(ftype)); if (fin.eof()) { break; } int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; } std::string name(length, 0); fin.read(&name[0], length); if (model.tensors.find(name.data()) == model.tensors.end()) { fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); return false; } auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return false; } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lu, %lu], expected [%d, %d]\n", __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); return false; } if (0) { static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } size_t bpe = 0; switch (ftype) { case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; default: { fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); return false; } }; if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); } } printf(" done\n"); printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } return true; } // load the model's weights from a file path bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); return false; } bool loaded = gptj_model_load(fname, fin, model, vocab); fin.close(); return loaded; } // evaluate the transformer // // - model: the model // - n_threads: number of threads to use // - n_past: the context size so far // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // // The GPT-J model requires about 16MB of memory per input token. // bool gptj_eval( gptj_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_head = hparams.n_head; const int n_vocab = hparams.n_vocab; const int n_rot = hparams.n_rot; const int d_key = n_embd/n_head; const size_t init_buf_size = 1024u*MB; if (!model.buf.addr || model.buf.size < init_buf_size) model.buf.resize(init_buf_size); if (mem_per_token > 0 && mem_per_token*N > model.buf.size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new); // reallocate model.buf.resize(buf_size_new); if (model.buf.addr == nullptr) { fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size); return false; } } struct ggml_init_params params = { .mem_size = model.buf.size, .mem_buffer = model.buf.addr, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = { .n_threads = n_threads }; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); // wte struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; // norm { cur = ggml_norm(ctx0, inpL); // cur = ln_1_g*cur + ln_1_b cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); } struct ggml_tensor * inpSA = cur; // self-attention { struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur); // store key and value to memory { struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_self.v, N*n_embd, (ggml_element_size(model.kv_self.v)*n_embd)*(il*n_ctx + n_past)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, 
ggml_rope(ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), n_past, n_rot, 0), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) struct ggml_tensor * K = ggml_permute(ctx0, ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), n_past, n_rot, 1), 0, 2, 1, 3); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = ggml_cpy(ctx0, ggml_permute(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), ggml_new_tensor_3d(ctx0, model.kv_self.v->type, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); } struct ggml_tensor * inpFF = cur; // feed-forward network // this is independent of the self-attention result, so it could be done in parallel to the self-attention { // note here we pass inpSA instead of cur cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, inpSA); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation cur = ggml_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // self-attention + FF cur = ggml_add(ctx0, cur, inpFF); // input for next layer inpL = ggml_add(ctx0, cur, inpL); } // norm { inpL = ggml_norm(ctx0, inpL); // inpL = ln_f_g*inpL + ln_f_b inpL = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_g, inpL), inpL), ggml_repeat(ctx0, model.ln_f_b, inpL)); } // lm_head { inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); inpL = ggml_add(ctx0, ggml_repeat(ctx0, model.lmh_b, inpL), inpL); } // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); // run the computation ggml_build_forward_expand(&gf, inpL); ggml_graph_compute (ctx0, &gf); //if (n_past%100 == 0) { // ggml_graph_print (&gf); // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); return true; } #define GPTJ_MAX_RNG_STATE 64*1024 size_t gptj_get_state_size(const gptj_model &model) { // we don't know size of 
rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = GPTJ_MAX_RNG_STATE; const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); const size_t s_kv = model.kv_self.buf.size; const size_t s_total = ( + s_rng_size + s_rng + s_kv_size + s_kv_ntok + s_kv ); fflush(stdout); return s_total; } size_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest) { uint8_t * out = dest; fflush(stdout); // copy rng { std::stringstream rng_ss; rng_ss << rng; const size_t rng_size = rng_ss.str().size(); char rng_buf[GPTJ_MAX_RNG_STATE]; memset(&rng_buf[0], 0, GPTJ_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); memcpy(out, &rng_buf[0], GPTJ_MAX_RNG_STATE); out += GPTJ_MAX_RNG_STATE; } // copy kv cache { const size_t kv_size = model.kv_self.buf.size; const int kv_ntok = model.kv_self.n; memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size; } } const size_t written = out - dest; const size_t expected = gptj_get_state_size(model); assert(written == expected); fflush(stdout); return written; } size_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src) { const uint8_t * in = src; // set rng { size_t rng_size; char rng_buf[GPTJ_MAX_RNG_STATE]; memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); memcpy(&rng_buf[0], in, GPTJ_MAX_RNG_STATE); in += GPTJ_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> *rng; assert(rng_ss.fail() == false); } // set kv cache { size_t kv_size; int kv_ntok; memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size); memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); if (kv_size) { assert(model->kv_self.buf.size == kv_size); void * k_data = model->kv_self.k->data; // remember data pointers void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size; model->kv_self.k->data = k_data; // restore correct data pointers model->kv_self.v->data = v_data; } model->kv_self.n = kv_ntok; } const size_t nread = in - src; const size_t expected = gptj_get_state_size(*model); assert(nread == expected); fflush(stdout); return nread; } struct GPTJPrivate { const std::string modelPath; bool modelLoaded; gpt_vocab vocab; gptj_model *model = nullptr; int64_t n_threads = 0; size_t mem_per_token = 0; std::mt19937 rng; }; GPTJ::GPTJ() : d_ptr(new GPTJPrivate) { d_ptr->model = new gptj_model; d_ptr->modelLoaded = false; } bool GPTJ::loadModel(const std::string &modelPath) { std::mt19937 rng(time(NULL)); d_ptr->rng = rng; auto fin = std::ifstream(modelPath, std::ios::binary); // load the model if (!gptj_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) { std::cerr << "GPT-J ERROR: failed to load model from " << modelPath; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; fflush(stdout); return true; } void GPTJ::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t GPTJ::threadCount() const { return d_ptr->n_threads; } GPTJ::~GPTJ() { delete 
d_ptr->model; } bool GPTJ::isModelLoaded() const { return d_ptr->modelLoaded; } size_t GPTJ::stateSize() const { return gptj_get_state_size(*d_ptr->model); } size_t GPTJ::saveState(uint8_t *dest) const { return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest); } size_t GPTJ::restoreState(const uint8_t *src) { return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src); } void GPTJ::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n"; return; } const int64_t t_main_start_us = ggml_time_us(); int64_t t_sample_us = 0; int64_t t_predict_us = 0; int64_t t_prompt_us = 0; // tokenize the prompt std::vector embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt); // save the context size promptCtx.n_ctx = d_ptr->model->hparams.n_ctx; if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << "GPT-J ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // determine the required inference memory per token: static bool initialized = false; static std::vector p_instruct; static std::vector r_instruct; if (!initialized) { gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits, d_ptr->mem_per_token); initialized = true; } // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
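// With the default contextErase of 0.75f (see llmodel.h), roughly the oldest three quarters of the
// window are dropped here, e.g. a 2048-token context gives erasePoint = 1536; n_past is then reset
// to the surviving token count and recalculateContext() re-evaluates those tokens in n_batch-sized
// chunks to rebuild the KV cache before this batch continues.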
std::cerr << "GPTJ: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } t_prompt_us += ggml_time_us() - t_start_prompt_us; int p_instructFound = 0; int r_instructFound = 0; std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const int n_vocab = d_ptr->model->hparams.n_vocab; gpt_vocab::id id = 0; { const int64_t t_start_sample_us = ggml_time_us(); const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); id = gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.logits, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty, d_ptr->rng); t_sample_us += ggml_time_us() - t_start_sample_us; } // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "GPTJ: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } const int64_t t_start_predict_us = ggml_time_us(); if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to predict next token\n"; return; } t_predict_us += ggml_time_us() - t_start_predict_us; promptCtx.n_past += 1; // display text ++totalPredictions; if (id == 50256 /*end of text*/) goto stop_generating; const std::string str = d_ptr->vocab.id_to_token[id]; // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { goto stop_generating; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, d_ptr->vocab.id_to_token[t])) goto stop_generating; } cachedTokens.clear(); } stop_generating: #if 0 // report timing { const int64_t t_main_end_us = ggml_time_us(); std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n"; std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: prompt time = " << t_prompt_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/totalPredictions << " ms per token\n"; std::cout << "GPT-J INFO: total time = " << (t_main_end_us - t_main_start_us)/1000.0f << " ms\n"; fflush(stdout); } #endif return; } void GPTJ::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (!gptj_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPTJ ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/gptj.h ================================================ #ifndef GPTJ_H #define GPTJ_H #include #include #include #include "llmodel.h" class GPTJPrivate; class GPTJ : public LLModel { public: GPTJ(); ~GPTJ(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void 
prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: GPTJPrivate *d_ptr; }; #endif // GPTJ_H ================================================ FILE: gpt4all-backend/llama/placeholder ================================================ ================================================ FILE: gpt4all-backend/llamamodel.cpp ================================================ #include "llamamodel.h" #include "llama.cpp/examples/common.h" #include "llama.cpp/llama.h" #include "llama.cpp/ggml.h" #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include struct LLamaPrivate { const std::string modelPath; bool modelLoaded; llama_context *ctx = nullptr; llama_context_params params; int64_t n_threads = 0; }; LLamaModel::LLamaModel() : d_ptr(new LLamaPrivate) { d_ptr->modelLoaded = false; } bool LLamaModel::loadModel(const std::string &modelPath) { // load the model d_ptr->params = llama_context_default_params(); gpt_params params; d_ptr->params.n_ctx = 2048; d_ptr->params.n_parts = params.n_parts; d_ptr->params.seed = params.seed; d_ptr->params.f16_kv = params.memory_f16; d_ptr->params.use_mmap = params.use_mmap; #if defined (__APPLE__) d_ptr->params.use_mlock = true; #else d_ptr->params.use_mlock = params.use_mlock; #endif d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params); if (!d_ptr->ctx) { std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; fflush(stderr); return true; } void LLamaModel::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t LLamaModel::threadCount() const { return d_ptr->n_threads; } LLamaModel::~LLamaModel() { llama_free(d_ptr->ctx); } bool LLamaModel::isModelLoaded() const { return d_ptr->modelLoaded; } size_t LLamaModel::stateSize() const { return llama_get_state_size(d_ptr->ctx); } size_t LLamaModel::saveState(uint8_t *dest) const { return llama_copy_state_data(d_ptr->ctx, dest); } size_t LLamaModel::restoreState(const uint8_t *src) { return llama_set_state_data(d_ptr->ctx, src); } void LLamaModel::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "LLAMA ERROR: prompt won't work with an unloaded model!\n"; return; } gpt_params params; params.prompt = prompt; // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); // tokenize the prompt auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false); // save the context size promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx); if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "The prompt size exceeds the context window size and cannot be processed."); std::cerr << "LLAMA ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = 
std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // number of tokens to keep when resetting context params.n_keep = (int)embd_inp.size(); // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... std::cerr << "LLAMA: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); llama_token id = llama_sample_top_p_top_k(d_ptr->ctx, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty); // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
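// Sampling above is delegated to llama.cpp's llama_sample_top_p_top_k() rather than the
// gpt_sample_top_k_top_p() used by the GPT-J and MPT backends; the window recycling below is the
// same idea as in those backends: drop the oldest contextErase fraction of tokens, reset n_past,
// and rebuild the KV cache through recalculateContext().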
std::cerr << "LLAMA: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } if (llama_eval(d_ptr->ctx, &id, 1, promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to predict next token\n"; return; } promptCtx.n_past += 1; // display text ++totalPredictions; if (id == llama_token_eos()) return; const std::string str = llama_token_to_str(d_ptr->ctx, id); // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { return; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, llama_token_to_str(d_ptr->ctx, t))) return; } cachedTokens.clear(); } } void LLamaModel::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) { std::cerr << "LLAMA ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/llamamodel.h ================================================ #ifndef LLAMAMODEL_H #define LLAMAMODEL_H #include #include #include #include "llmodel.h" class LLamaPrivate; class LLamaModel : public LLModel { public: LLamaModel(); ~LLamaModel(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: LLamaPrivate *d_ptr; }; #endif // LLAMAMODEL_H ================================================ FILE: gpt4all-backend/llmodel.h ================================================ #ifndef LLMODEL_H #define LLMODEL_H #include #include #include #include class LLModel { public: explicit LLModel() {} virtual ~LLModel() {} virtual bool loadModel(const std::string &modelPath) = 0; virtual bool 
isModelLoaded() const = 0; virtual size_t stateSize() const { return 0; } virtual size_t saveState(uint8_t *dest) const { return 0; } virtual size_t restoreState(const uint8_t *src) { return 0; } struct PromptContext { std::vector logits; // logits of current context std::vector tokens; // current tokens in the context window int32_t n_past = 0; // number of tokens in past conversation int32_t n_ctx = 0; // number of tokens possible in context window int32_t n_predict = 200; int32_t top_k = 40; float top_p = 0.9f; float temp = 0.9f; int32_t n_batch = 9; float repeat_penalty = 1.10f; int32_t repeat_last_n = 64; // last n tokens to penalize float contextErase = 0.75f; // percent of context to erase if we exceed the context // window }; virtual void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) = 0; virtual void setThreadCount(int32_t n_threads) {} virtual int32_t threadCount() const { return 1; } protected: virtual void recalculateContext(PromptContext &promptCtx, std::function recalculate) = 0; }; #endif // LLMODEL_H ================================================ FILE: gpt4all-backend/llmodel_c.cpp ================================================ #include "llmodel_c.h" #include "gptj.h" #include "llamamodel.h" #include "mpt.h" struct LLModelWrapper { LLModel *llModel = nullptr; LLModel::PromptContext promptContext; }; llmodel_model llmodel_gptj_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new GPTJ; return reinterpret_cast(wrapper); } void llmodel_gptj_destroy(llmodel_model gptj) { LLModelWrapper *wrapper = reinterpret_cast(gptj); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_mpt_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new MPT; return reinterpret_cast(wrapper); } void llmodel_mpt_destroy(llmodel_model mpt) { LLModelWrapper *wrapper = reinterpret_cast(mpt); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_llama_create() { LLModelWrapper *wrapper = new LLModelWrapper; wrapper->llModel = new LLamaModel; return reinterpret_cast(wrapper); } void llmodel_llama_destroy(llmodel_model llama) { LLModelWrapper *wrapper = reinterpret_cast(llama); delete wrapper->llModel; delete wrapper; } llmodel_model llmodel_model_create(const char *model_path) { uint32_t magic; llmodel_model model; FILE *f = fopen(model_path, "rb"); fread(&magic, sizeof(magic), 1, f); if (magic == 0x67676d6c) { model = llmodel_gptj_create(); } else if (magic == 0x67676a74) { model = llmodel_llama_create(); } else if (magic == 0x67676d6d) { model = llmodel_mpt_create(); } else {fprintf(stderr, "Invalid model file\n");} fclose(f); return model; } void llmodel_model_destroy(llmodel_model model) { LLModelWrapper *wrapper = reinterpret_cast(model); const std::type_info &modelTypeInfo = typeid(*wrapper->llModel); if (modelTypeInfo == typeid(GPTJ)) { llmodel_gptj_destroy(model); } if (modelTypeInfo == typeid(LLamaModel)) { llmodel_llama_destroy(model); } if (modelTypeInfo == typeid(MPT)) { llmodel_mpt_destroy(model); } } bool llmodel_loadModel(llmodel_model model, const char *model_path) { LLModelWrapper *wrapper = reinterpret_cast(model); return wrapper->llModel->loadModel(model_path); } bool llmodel_isModelLoaded(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return llm->isModelLoaded(); } uint64_t llmodel_get_state_size(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return 
llm->stateSize(); } uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest) { const auto *llm = reinterpret_cast(model)->llModel; return llm->saveState(dest); } uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src) { LLModelWrapper *wrapper = reinterpret_cast(model); return wrapper->llModel->restoreState(src); } // Wrapper functions for the C callbacks bool prompt_wrapper(int32_t token_id, void *user_data) { llmodel_prompt_callback callback = reinterpret_cast(user_data); return callback(token_id); } bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) { llmodel_response_callback callback = reinterpret_cast(user_data); return callback(token_id, response.c_str()); } bool recalculate_wrapper(bool is_recalculating, void *user_data) { llmodel_recalculate_callback callback = reinterpret_cast(user_data); return callback(is_recalculating); } void llmodel_prompt(llmodel_model model, const char *prompt, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, llmodel_prompt_context *ctx) { LLModelWrapper *wrapper = reinterpret_cast(model); // Create std::function wrappers that call the C function pointers std::function prompt_func = std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast(prompt_callback)); std::function response_func = std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast(response_callback)); std::function recalc_func = std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast(recalculate_callback)); // Copy the C prompt context wrapper->promptContext.n_past = ctx->n_past; wrapper->promptContext.n_ctx = ctx->n_ctx; wrapper->promptContext.n_predict = ctx->n_predict; wrapper->promptContext.top_k = ctx->top_k; wrapper->promptContext.top_p = ctx->top_p; wrapper->promptContext.temp = ctx->temp; wrapper->promptContext.n_batch = ctx->n_batch; wrapper->promptContext.repeat_penalty = ctx->repeat_penalty; wrapper->promptContext.repeat_last_n = ctx->repeat_last_n; wrapper->promptContext.contextErase = ctx->context_erase; // Call the C++ prompt method wrapper->llModel->prompt(prompt, prompt_func, response_func, recalc_func, wrapper->promptContext); // Update the C context by giving access to the wrappers raw pointers to std::vector data // which involves no copies ctx->logits = wrapper->promptContext.logits.data(); ctx->logits_size = wrapper->promptContext.logits.size(); ctx->tokens = wrapper->promptContext.tokens.data(); ctx->tokens_size = wrapper->promptContext.tokens.size(); // Update the rest of the C prompt context ctx->n_past = wrapper->promptContext.n_past; ctx->n_ctx = wrapper->promptContext.n_ctx; ctx->n_predict = wrapper->promptContext.n_predict; ctx->top_k = wrapper->promptContext.top_k; ctx->top_p = wrapper->promptContext.top_p; ctx->temp = wrapper->promptContext.temp; ctx->n_batch = wrapper->promptContext.n_batch; ctx->repeat_penalty = wrapper->promptContext.repeat_penalty; ctx->repeat_last_n = wrapper->promptContext.repeat_last_n; ctx->context_erase = wrapper->promptContext.contextErase; } void llmodel_setThreadCount(llmodel_model model, int32_t n_threads) { LLModelWrapper *wrapper = reinterpret_cast(model); wrapper->llModel->setThreadCount(n_threads); } int32_t llmodel_threadCount(llmodel_model model) { const auto *llm = reinterpret_cast(model)->llModel; return llm->threadCount(); } ================================================ FILE: gpt4all-backend/llmodel_c.h 
================================================ #ifndef LLMODEL_C_H #define LLMODEL_C_H #include #include #include #ifdef __cplusplus extern "C" { #endif /** * Opaque pointer to the underlying model. */ typedef void *llmodel_model; /** * llmodel_prompt_context structure for holding the prompt context. * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the * raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined * behavior. */ typedef struct { float *logits; // logits of current context size_t logits_size; // the size of the raw logits vector int32_t *tokens; // current tokens in the context window size_t tokens_size; // the size of the raw tokens vector int32_t n_past; // number of tokens in past conversation int32_t n_ctx; // number of tokens possible in context window int32_t n_predict; // number of tokens to predict int32_t top_k; // top k logits to sample from float top_p; // nucleus sampling probability threshold float temp; // temperature to adjust model's output distribution int32_t n_batch; // number of predictions to generate in parallel float repeat_penalty; // penalty factor for repeated tokens int32_t repeat_last_n; // last n tokens to penalize float context_erase; // percent of context to erase if we exceed the context window } llmodel_prompt_context; /** * Callback type for prompt processing. * @param token_id The token id of the prompt. * @return a bool indicating whether the model should keep processing. */ typedef bool (*llmodel_prompt_callback)(int32_t token_id); /** * Callback type for response. * @param token_id The token id of the response. * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string. * @return a bool indicating whether the model should keep generating. */ typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response); /** * Callback type for recalculation of context. * @param whether the model is recalculating the context. * @return a bool indicating whether the model should keep generating. */ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating); /** * Create a GPTJ instance. * @return A pointer to the GPTJ instance. */ llmodel_model llmodel_gptj_create(); /** * Destroy a GPTJ instance. * @param gptj A pointer to the GPTJ instance. */ void llmodel_gptj_destroy(llmodel_model gptj); /** * Create a MPT instance. * @return A pointer to the MPT instance. */ llmodel_model llmodel_mpt_create(); /** * Destroy a MPT instance. * @param gptj A pointer to the MPT instance. */ void llmodel_mpt_destroy(llmodel_model mpt); /** * Create a LLAMA instance. * @return A pointer to the LLAMA instance. */ llmodel_model llmodel_llama_create(); /** * Destroy a LLAMA instance. * @param llama A pointer to the LLAMA instance. */ void llmodel_llama_destroy(llmodel_model llama); /** * Create a llmodel instance. * Recognises correct model type from file at model_path * @param model_path A string representing the path to the model file. * @return A pointer to the llmodel_model instance. */ llmodel_model llmodel_model_create(const char *model_path); /** * Destroy a llmodel instance. * Recognises correct model type using type info * @param model a pointer to a llmodel_model instance. */ void llmodel_model_destroy(llmodel_model model); /** * Load a model from a file. * @param model A pointer to the llmodel_model instance. * @param model_path A string representing the path to the model file. 
* @return true if the model was loaded successfully, false otherwise. */ bool llmodel_loadModel(llmodel_model model, const char *model_path); /** * Check if a model is loaded. * @param model A pointer to the llmodel_model instance. * @return true if the model is loaded, false otherwise. */ bool llmodel_isModelLoaded(llmodel_model model); /** * Get the size of the internal state of the model. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @return the size in bytes of the internal state of the model */ uint64_t llmodel_get_state_size(llmodel_model model); /** * Saves the internal state of the model to the specified destination address. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @param dest A pointer to the destination. * @return the number of bytes copied */ uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest); /** * Restores the internal state of the model using data from the specified address. * NOTE: This state data is specific to the type of model you have created. * @param model A pointer to the llmodel_model instance. * @param src A pointer to the src. * @return the number of bytes read */ uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src); /** * Generate a response using the model. * @param model A pointer to the llmodel_model instance. * @param prompt A string representing the input prompt. * @param prompt_callback A callback function for handling the processing of prompt. * @param response_callback A callback function for handling the generated response. * @param recalculate_callback A callback function for handling recalculation requests. * @param ctx A pointer to the llmodel_prompt_context structure. */ void llmodel_prompt(llmodel_model model, const char *prompt, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, llmodel_prompt_context *ctx); /** * Set the number of threads to be used by the model. * @param model A pointer to the llmodel_model instance. * @param n_threads The number of threads to be used. */ void llmodel_setThreadCount(llmodel_model model, int32_t n_threads); /** * Get the number of threads currently being used by the model. * @param model A pointer to the llmodel_model instance. * @return The number of threads currently being used. 
*/ int32_t llmodel_threadCount(llmodel_model model); #ifdef __cplusplus } #endif #endif // LLMODEL_C_H ================================================ FILE: gpt4all-backend/mpt.cpp ================================================ #include "mpt.h" #include "llama.cpp/ggml.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #include #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include #include static const size_t MB = 1024*1024; // default hparams (MPT 7B) struct mpt_hparams { int32_t n_vocab = 50432; int32_t n_ctx = 2048; int32_t n_embd = 4096; int32_t n_head = 32; int32_t n_layer = 32; float alibi_bias_max = 8; float clip_qkv = 0; int32_t expand = 4; int32_t f16 = 1; }; struct mpt_layer { // normalization struct ggml_tensor * norm_1_w; struct ggml_tensor * norm_2_w; // attention struct ggml_tensor * attn_Wqkv_w; struct ggml_tensor * attn_out_proj_w; // ff struct ggml_tensor * ffn_up_proj_w; struct ggml_tensor * ffn_down_proj_w; }; struct mpt_buffer { uint8_t * addr = NULL; size_t size = 0; void resize(size_t size) { delete[] addr; addr = new uint8_t[size]; this->size = size; } ~mpt_buffer() { fflush(stdout); delete[] addr; } }; struct mpt_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; struct ggml_context * ctx = NULL; mpt_buffer buf; int n; // number of tokens currently in the cache ~mpt_kv_cache() { if (ctx) { ggml_free(ctx); } } }; struct mpt_model { mpt_hparams hparams; // normalization struct ggml_tensor * norm_f_w; struct ggml_tensor * wte; // position embedding // mpt does weight tying std::vector layers; struct mpt_kv_cache kv_self; struct ggml_context * ctx; std::map tensors; mpt_buffer buf; ~mpt_model() { if (ctx) { ggml_free(ctx); } } }; static bool kv_cache_init( const struct mpt_hparams & hparams, struct mpt_kv_cache & cache, ggml_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int64_t n_mem = (int64_t)n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); struct ggml_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; cache.ctx = ggml_init(params); if (!cache.ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); return false; } cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } // load the model's weights from a stream bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); // verify magic { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); if (magic != 0x67676d6d) { fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); return false; } } // load hparams { auto & hparams = model.hparams; fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max)); fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv)); 
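// f16 (the ftype flag) is the last header field read here; the full header written by
// scripts/convert_mpt_hf_to_ggml.py is: magic 0x67676d6d, n_vocab, n_ctx, n_layer, n_head, n_embd,
// alibi_bias_max (float), clip_qkv (float), f16.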
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max); printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv); printf("%s: ftype = %d\n", __func__, hparams.f16); } // load vocab { int32_t n_vocab = model.hparams.n_vocab; fin.read((char *) &n_vocab, sizeof(n_vocab)); if (n_vocab != model.hparams.n_vocab) { fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); return false; } std::string word; for (int i = 0; i < n_vocab; i++) { uint32_t len; fin.read((char *) &len, sizeof(len)); bool special = false; if (len & (1<<31)) { len = len &~ (1<<31); special = true; } if (len > 0) { word.resize(len); fin.read((char *) word.data(), len); vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } if(special) { vocab.add_special_token(word); } } } // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation ggml_type wtype = GGML_TYPE_COUNT; switch (model.hparams.f16) { case 0: wtype = GGML_TYPE_F32; break; case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; case 5: wtype = GGML_TYPE_Q4_2; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", __func__, fname.c_str(), model.hparams.f16); return false; } } auto & ctx = model.ctx; size_t ctx_size = 0; { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_w ctx_size += n_embd*n_vocab*ggml_type_sizef(GGML_TYPE_F32); // wte ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_1_w ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_2_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // attn_Wqkv_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // attn_out_proj_w ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_up_proj_w ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_down_proj_w ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v // TODO probably less now? 
ctx_size += (5 + 10*n_layer)*256; // object overhead printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { struct ggml_init_params params = { .mem_size = ctx_size, .mem_buffer = NULL, .no_alloc = false, }; model.ctx = ggml_init(params); if (!model.ctx) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } } // prepare memory for the weights { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; model.layers.resize(n_layer); model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model.norm_f_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // map by name model.tensors["transformer.wte.weight"] = model.wte; model.tensors["transformer.norm_f.weight"] = model.norm_f_w; for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.norm_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.attn_Wqkv_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd * 3); layer.attn_out_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.ffn_up_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, expand*n_embd); layer.ffn_down_proj_w = ggml_new_tensor_2d(ctx, wtype, expand*n_embd, n_embd); // map by name model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_w; model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_w; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.attn_Wqkv_w; model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.attn_out_proj_w; model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj_w; model.tensors["transformer.blocks." 
+ std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj_w; } } // key + value memory { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); ggml_free(ctx); return false; } const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v); printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } // load weights { int n_tensors = 0; size_t total_size = 0; printf("%s: ", __func__); while (true) { int32_t n_dims; int32_t length; int32_t ttype; fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast(&length), sizeof(length)); fin.read(reinterpret_cast(&ttype), sizeof(ttype)); if (fin.eof()) { break; } int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; } std::string name(length, 0); fin.read(&name[0], length); if (model.tensors.find(name.data()) == model.tensors.end()) { fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); return false; } auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return false; } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); return false; } // for debugging if (0) { printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } const size_t bpe = ggml_type_size(ggml_type(ttype)); if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); } } printf(" done\n"); printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } return true; } // load the model's weights from a file path bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); return false; } bool loaded = mpt_model_load(fname, fin, model, vocab); fin.close(); return loaded; } bool mpt_eval( mpt_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { const int N = embd_inp.size(); const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; const int n_head = hparams.n_head; const int n_vocab = hparams.n_vocab; const int expand = hparams.expand; const int d_key = n_embd/n_head; const size_t init_buf_size = 1024u*MB; if (!model.buf.addr || model.buf.size < init_buf_size) model.buf.resize(init_buf_size); if (mem_per_token > 0 && mem_per_token*N > model.buf.size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new); // reallocate model.buf.resize(buf_size_new); if (model.buf.addr == nullptr) { fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size); return false; } } struct ggml_init_params params = { .mem_size = model.buf.size, .mem_buffer = model.buf.addr, }; struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = { .n_threads = n_threads }; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); // wte struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; struct ggml_tensor * cur = inpSA; // self-attention { // norm1 cur = ggml_norm(ctx0, cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_w, cur), cur); // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].attn_Wqkv_w, cur); // TODO: clip_qkv struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*ggml_element_size(cur)*n_embd)); struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*ggml_element_size(cur)*n_embd)); struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*ggml_element_size(cur)*n_embd)); // TODO: qk_ln? 
(seems to be False in MPT-7B configs) { Vcur = ggml_transpose(ctx0, Vcur); struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(model.kv_self.v), (il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) struct ggml_tensor * Q = ggml_permute(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N), 0, 2, 1, 3); struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // Alibi struct ggml_tensor * KQ_scaled_biased = ggml_alibi(ctx0, ggml_cont(ctx0, KQ_scaled), n_past, n_head); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_biased, n_past); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V = ggml_view_3d(ctx0, model.kv_self.v, n_past + N, n_embd/n_head, n_head, n_ctx*ggml_element_size(model.kv_self.v), n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].attn_out_proj_w, cur); } // residual struct ggml_tensor * resSA = ggml_add(ctx0, cur, inpSA); // feed-forward network { cur = resSA; // norm2 cur = ggml_norm(ctx0, cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_w, cur), cur); // ffn cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj_w, cur); cur = ggml_gelu(ctx0, cur); cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj_w, cur); } // self-attention + FF inpL = ggml_add(ctx0, cur, resSA); } struct ggml_tensor * out = inpL; // -> logits { out = ggml_norm(ctx0, out); out = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_w, out), out); out = ggml_mul_mat(ctx0, model.wte, out); } // run the computation ggml_build_forward_expand(&gf, out); ggml_graph_compute (ctx0, &gf); // return result for just the last token embd_w.resize(n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(out) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); return true; } #define MPT_MAX_RNG_STATE 64*1024 size_t mpt_get_state_size(const mpt_model &model) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. 
// for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = MPT_MAX_RNG_STATE; const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); const size_t s_kv = model.kv_self.buf.size; const size_t s_total = ( + s_rng_size + s_rng + s_kv_size + s_kv_ntok + s_kv ); fflush(stdout); return s_total; } size_t mpt_copy_state_data(const mpt_model &model, const std::mt19937 &rng, uint8_t *dest) { uint8_t * out = dest; fflush(stdout); // copy rng { std::stringstream rng_ss; rng_ss << rng; const size_t rng_size = rng_ss.str().size(); char rng_buf[MPT_MAX_RNG_STATE]; memset(&rng_buf[0], 0, MPT_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); memcpy(out, &rng_buf[0], MPT_MAX_RNG_STATE); out += MPT_MAX_RNG_STATE; } // copy kv cache { const size_t kv_size = model.kv_self.buf.size; const int kv_ntok = model.kv_self.n; memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size; } } const size_t written = out - dest; const size_t expected = mpt_get_state_size(model); assert(written == expected); fflush(stdout); return written; } size_t mpt_set_state_data(mpt_model *model, std::mt19937 *rng, const uint8_t *src) { const uint8_t * in = src; // set rng { size_t rng_size; char rng_buf[MPT_MAX_RNG_STATE]; memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size); memcpy(&rng_buf[0], in, MPT_MAX_RNG_STATE); in += MPT_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> *rng; assert(rng_ss.fail() == false); } // set kv cache { size_t kv_size; int kv_ntok; memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size); memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok); if (kv_size) { assert(model->kv_self.buf.size == kv_size); void * k_data = model->kv_self.k->data; // remember data pointers void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size; model->kv_self.k->data = k_data; // restore correct data pointers model->kv_self.v->data = v_data; } model->kv_self.n = kv_ntok; } const size_t nread = in - src; const size_t expected = mpt_get_state_size(*model); assert(nread == expected); fflush(stdout); return nread; } struct MPTPrivate { const std::string modelPath; bool modelLoaded; gpt_vocab vocab; mpt_model *model = nullptr; int64_t n_threads = 0; size_t mem_per_token = 0; std::mt19937 rng; bool has_im_end = false; }; MPT::MPT() : d_ptr(new MPTPrivate) { d_ptr->model = new mpt_model; d_ptr->modelLoaded = false; } bool MPT::loadModel(const std::string &modelPath) { std::mt19937 rng(time(NULL)); d_ptr->rng = rng; auto fin = std::ifstream(modelPath, std::ios::binary); // load the model if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) { std::cerr << "GPT-J ERROR: failed to load model from " << modelPath; return false; } d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); d_ptr->modelLoaded = true; d_ptr->has_im_end = d_ptr->vocab.token_to_id.find("<|im_end|>") != d_ptr->vocab.token_to_id.end(); fflush(stdout); return true; } void MPT::setThreadCount(int32_t n_threads) { d_ptr->n_threads = n_threads; } int32_t MPT::threadCount() const { return d_ptr->n_threads; } MPT::~MPT() { delete 
d_ptr->model; } bool MPT::isModelLoaded() const { return d_ptr->modelLoaded; } size_t MPT::stateSize() const { return mpt_get_state_size(*d_ptr->model); } size_t MPT::saveState(uint8_t *dest) const { return mpt_copy_state_data(*d_ptr->model, d_ptr->rng, dest); } size_t MPT::restoreState(const uint8_t *src) { return mpt_set_state_data(d_ptr->model, &d_ptr->rng, src); } void MPT::prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &promptCtx) { if (!isModelLoaded()) { std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n"; return; } const int64_t t_main_start_us = ggml_time_us(); int64_t t_sample_us = 0; int64_t t_predict_us = 0; int64_t t_prompt_us = 0; // tokenize the prompt std::vector embd_inp = gpt_tokenize(d_ptr->vocab, prompt); // save the context size promptCtx.n_ctx = d_ptr->model->hparams.n_ctx; if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << "GPT-J ERROR: The prompt is" << embd_inp.size() << "tokens and the context window is" << promptCtx.n_ctx << "!\n"; return; } promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size()); promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx); // determine the required inference memory per token: static bool initialized = false; static std::vector p_instruct; static std::vector r_instruct; if (!initialized) { mpt_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, promptCtx.logits, d_ptr->mem_per_token); initialized = true; } // process the prompt in batches size_t i = 0; const int64_t t_start_prompt_us = ggml_time_us(); while (i < embd_inp.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size()); std::vector batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); // Check if the context has run out... if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "MPT: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); } if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to process prompt\n"; return; } size_t tokens = batch_end - i; for (size_t t = 0; t < tokens; ++t) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(batch.at(t)); if (!promptCallback(batch.at(t))) return; } promptCtx.n_past += batch.size(); i = batch_end; } t_prompt_us += ggml_time_us() - t_start_prompt_us; int p_instructFound = 0; int r_instructFound = 0; std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" }; // predict next tokens int32_t totalPredictions = 0; for (int i = 0; i < promptCtx.n_predict; i++) { // sample next token const int n_vocab = d_ptr->model->hparams.n_vocab; int id = 0; { const int64_t t_start_sample_us = ggml_time_us(); const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size()); id = gpt_sample_top_k_top_p(d_ptr->vocab, n_vocab, promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks, n_prev_toks, promptCtx.logits, promptCtx.top_k, promptCtx.top_p, promptCtx.temp, promptCtx.repeat_penalty, d_ptr->rng); t_sample_us += ggml_time_us() - t_start_sample_us; } // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; // Erase the first percentage of context from the tokens... 
std::cerr << "MPT: reached the end of the context window so resizing\n"; promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } const int64_t t_start_predict_us = ggml_time_us(); if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, { id }, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "GPT-J ERROR: Failed to predict next token\n"; return; } t_predict_us += ggml_time_us() - t_start_predict_us; promptCtx.n_past += 1; // display text ++totalPredictions; // mpt-7b-chat has special token for end if (d_ptr->has_im_end && id == d_ptr->vocab.token_to_id["<|im_end|>"]) goto stop_generating; if (id == 0 /*end of text*/) goto stop_generating; const std::string str = d_ptr->vocab.id_to_token[id]; // Check if the provided str is part of our reverse prompts bool foundPartialReversePrompt = false; const std::string completed = cachedResponse + str; if (reversePrompts.find(completed) != reversePrompts.end()) { goto stop_generating; } // Check if it partially matches our reverse prompts and if so, cache for (auto s : reversePrompts) { if (s.compare(0, completed.size(), completed) == 0) { foundPartialReversePrompt = true; cachedResponse = completed; break; } } // Regardless the token gets added to our cache cachedTokens.push_back(id); // Continue if we have found a partial match if (foundPartialReversePrompt) continue; // Empty the cache for (auto t : cachedTokens) { if (promptCtx.tokens.size() == promptCtx.n_ctx) promptCtx.tokens.erase(promptCtx.tokens.begin()); promptCtx.tokens.push_back(t); if (!responseCallback(t, d_ptr->vocab.id_to_token[t])) goto stop_generating; } cachedTokens.clear(); } stop_generating: #if 0 // report timing { const int64_t t_main_end_us = ggml_time_us(); std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n"; std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: prompt time = " << t_prompt_us/1000.0f << " ms\n"; std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/totalPredictions << " ms per token\n"; std::cout << "GPT-J INFO: total time = " << (t_main_end_us - t_main_start_us)/1000.0f << " ms\n"; fflush(stdout); } #endif return; } void MPT::recalculateContext(PromptContext &promptCtx, std::function recalculate) { size_t i = 0; promptCtx.n_past = 0; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx); if (!mpt_eval(*d_ptr->model, d_ptr->n_threads, promptCtx.n_past, batch, promptCtx.logits, d_ptr->mem_per_token)) { std::cerr << "MPT ERROR: Failed to process prompt\n"; goto stop_generating; } promptCtx.n_past += batch.size(); if (!recalculate(true)) goto stop_generating; i = batch_end; } assert(promptCtx.n_past == promptCtx.tokens.size()); stop_generating: recalculate(false); } ================================================ FILE: gpt4all-backend/mpt.h ================================================ #ifndef MPT_H #define MPT_H #include #include #include #include "llmodel.h" class MPTPrivate; class MPT : public LLModel { public: MPT(); ~MPT(); bool loadModel(const std::string &modelPath) override; bool isModelLoaded() const override; size_t stateSize() const 
override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; void prompt(const std::string &prompt, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, PromptContext &ctx) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; protected: void recalculateContext(PromptContext &promptCtx, std::function recalculate) override; private: MPTPrivate *d_ptr; }; #endif // MPT_H ================================================ FILE: gpt4all-backend/scripts/convert_mpt_hf_to_ggml.py ================================================ # Convert Hugging Face fine-tuned bloom-like models to ggml format # # Usage: # # python3 models/convert-h5-to-ggml.py # # This script is similar to "convert-pt-to-ggml.py" # import io import os import sys import struct import json import code import torch import numpy as np from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py def bytes_to_unicode(): """ Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8+n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) if len(sys.argv) < 3: print("Usage: python convert-hf-to-ggml.py model_name dir-output [use-f32]") print(" model_name: name of the model to convert. 
Example: 'bigscience/bloomz-560m'") print(" dir-output: directory where the output file will be written") print(" use-f32: if present, use float32 instead of float16") sys.exit(1) model_name = sys.argv[1] dir_out = sys.argv[2] # make sure the output directory exists os.makedirs(dir_out, exist_ok=True) # possible data types # ftype == 0 -> float32 # ftype == 1 -> float16 # # map from ftype to string ftype_str = ["f32", "f16"] ftype = 1 if len(sys.argv) > 3: ftype = 0 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) hparams = config.to_dict() print("Loading model: ", model_name) model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True) print("Model loaded: ", model_name) fname_out = dir_out + f"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin" fout = open(fname_out, "wb") vocab = tokenizer.vocab hparams["multiple_of"] = 1 fout.write(struct.pack("I", 0x67676d6d)) # magic: ggml in hex fout.write(struct.pack("I", model.config.vocab_size)) fout.write(struct.pack("I", model.config.max_seq_len)) fout.write(struct.pack("I", model.config.n_layers)) fout.write(struct.pack("I", model.config.n_heads)) fout.write(struct.pack("I", model.config.d_model)) fout.write(struct.pack("f", model.config.attn_config['alibi_bias_max'])) clip_qkv = model.config.attn_config['clip_qkv'] fout.write(struct.pack("f", clip_qkv if clip_qkv is not None else 0)) fout.write(struct.pack("I", ftype)) # # Is this correct?? # dot_token = tokenizer.encode(".")[0] # write tokens to ggml file dot_token = tokenizer.encode('.')[0] fout.write(struct.pack("I", model.config.vocab_size)) for i in range(model.config.vocab_size): text = tokenizer.decode([dot_token, i]).encode('utf-8') # remove the first byte (it's always '.') text = text[1:] enclen = len(text) if i in tokenizer.all_special_ids: print(f"special token: {text}") enclen = enclen | 1<<31 fout.write(struct.pack("I", enclen)) fout.write(text) list_vars = model.state_dict() for name in list_vars.keys(): data = list_vars[name].squeeze().numpy() print("Processing variable: " + name + " with shape: ", data.shape) n_dims = len(data.shape); # ftype == 0 -> float32, ftype == 1 -> float16 ftype_cur = 0; if ftype != 0: # Keep token embeddings in fp32 if name[-7:] == ".weight" and n_dims == 2 and ".wte" not in name: print(" Converting to float16") data = data.astype(np.float16) ftype_cur = 1 else: print(" Converting to float32") data = data.astype(np.float32) ftype_cur = 0 else: if data.dtype != np.float32: print(" Converting to float32") data = data.astype(np.float32) ftype_cur = 0 # header str = name.encode('utf-8') fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(str); # data data.tofile(fout) fout.close() print("Done. 
Output file: " + fname_out) print("") ================================================ FILE: gpt4all-backend/utils.cpp ================================================ #include "utils.h" #include #include void replace(std::string & str, const std::string & needle, const std::string & replacement) { size_t pos = 0; while ((pos = str.find(needle, pos)) != std::string::npos) { str.replace(pos, needle.length(), replacement); pos += replacement.length(); } } std::map json_parse(const std::string & fname) { std::map result; // read file into string std::string json; { std::ifstream ifs(fname); if (!ifs) { fprintf(stderr, "Failed to open %s\n", fname.c_str()); exit(1); } json = std::string((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); } if (json[0] != '{') { return result; } // parse json { bool has_key = false; bool in_token = false; std::string str_key = ""; std::string str_val = ""; int n = json.size(); for (int i = 1; i < n; ++i) { if (!in_token) { if (json[i] == ' ') continue; if (json[i] == '"') { in_token = true; continue; } } else { if (json[i] == '\\' && i+1 < n) { if (has_key == false) { str_key += json[i]; } else { str_val += json[i]; } ++i; } else if (json[i] == '"') { if (has_key == false) { has_key = true; ++i; while (json[i] == ' ') ++i; ++i; // : while (json[i] == ' ') ++i; if (json[i] != '\"') { while (json[i] != ',' && json[i] != '}') { str_val += json[i++]; } has_key = false; } else { in_token = true; continue; } } else { has_key = false; } ::replace(str_key, "\\u0120", " " ); // \u0120 -> space ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line ::replace(str_key, "\\\"", "\""); // \\\" -> " try { result[str_key] = std::stoi(str_val); } catch (...) { //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); } str_key = ""; str_val = ""; in_token = false; continue; } if (has_key == false) { str_key += json[i]; } else { str_val += json[i]; } } } } return result; } std::vector gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) { std::vector words; // first split the text into words { std::string str = text; std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; std::regex re(pat); std::smatch m; while (std::regex_search(str, m, re)) { for (auto x : m) { words.push_back(x); } str = m.suffix(); } } // find the longest tokens that form the words: std::vector tokens; for (const auto & word : words) { if (word.size() == 0) continue; int i = 0; int n = word.size(); while (i < n) { int j = n; while (j > i) { auto it = vocab.token_to_id.find(word.substr(i, j-i)); if (it != vocab.token_to_id.end()) { tokens.push_back(it->second); i = j; break; } --j; } if (i == n) { break; } if (j == i) { auto sub = word.substr(i, 1); if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { tokens.push_back(vocab.token_to_id.at(sub)); } else { fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); } ++i; } } } return tokens; } std::string regex_escape(const std::string &s) { static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])"); return std::regex_replace(s, metacharacters, "\\$&"); } std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { // Generate the subpattern from the special_tokens vector if it's not empty if (!vocab.special_tokens.empty()) { std::vector out; std::vector chunks; std::string str = text; std::string special_tokens_subpattern; for (const auto &token : 
vocab.special_tokens) { if (!special_tokens_subpattern.empty()) { special_tokens_subpattern += "|"; } special_tokens_subpattern += regex_escape(token); } std::regex re(special_tokens_subpattern); std::smatch m; while (std::regex_search(str, m, re)) { auto tok = vocab.token_to_id.find(m.str()); if (tok != vocab.token_to_id.end()) { auto tokid = tok->second; auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix()); out.insert(out.end(), pfxtoks.begin(), pfxtoks.end()); out.push_back(tokid); str = m.suffix(); } } if (!str.empty()) { auto tokrest = gpt_tokenize_inner(vocab, str); out.insert(out.end(), tokrest.begin(), tokrest.end()); } return out; } else { return gpt_tokenize_inner(vocab, text); } } bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); vocab.token_to_id = ::json_parse(fname); for (const auto & kv : vocab.token_to_id) { vocab.id_to_token[kv.second] = kv.first; } printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); // print the vocabulary //for (auto kv : vocab.token_to_id) { // printf("'%s' -> %d\n", kv.first.data(), kv.second); //} return true; } gpt_vocab::id gpt_sample_top_k_top_p( const gpt_vocab & vocab, const size_t actualVocabSize, const int32_t * last_n_tokens_data, int last_n_tokens_size, const std::vector logits, int top_k, double top_p, double temp, float repeat_penalty, std::mt19937 & rng) { int n_logits = actualVocabSize; const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size); const auto * plogits = logits.data() + logits.size() - n_logits; std::vector> logits_id; logits_id.reserve(n_logits); { const float scale = 1.0f/temp; for (int i = 0; i < n_logits; ++i) { // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability if (plogits[i] < 0.0f) { logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); } else { logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); } } else { logits_id.push_back(std::make_pair(plogits[i]*scale, i)); } } } // find the top K tokens std::partial_sort( logits_id.begin(), logits_id.begin() + top_k, logits_id.end(), [](const std::pair & a, const std::pair & b) { return a.first > b.first; }); logits_id.resize(top_k); double maxl = -INFINITY; for (const auto & kv : logits_id) { maxl = std::max(maxl, kv.first); } // compute probs for the top K tokens std::vector probs; probs.reserve(logits_id.size()); double sum = 0.0; for (const auto & kv : logits_id) { double p = exp(kv.first - maxl); probs.push_back(p); sum += p; } // normalize the probs for (auto & p : probs) { p /= sum; } if (top_p < 1.0f) { double cumsum = 0.0f; for (int i = 0; i < top_k; i++) { cumsum += probs[i]; if (cumsum >= top_p) { top_k = i + 1; probs.resize(top_k); logits_id.resize(top_k); break; } } cumsum = 1.0/cumsum; for (int i = 0; i < (int) probs.size(); i++) { probs[i] *= cumsum; } } //printf("\n"); //for (int i = 0; i < (int) probs.size(); i++) { // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); //} //exit(0); std::discrete_distribution<> dist(probs.begin(), probs.end()); int idx = dist(rng); return logits_id[idx].second; } ================================================ FILE: 
gpt4all-backend/utils.h ================================================ // Various helper functions and utilities #pragma once #include #include #include #include #include // // CLI argument parsing // struct gpt_params { int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 200; // new tokens to predict // sampling parameters int32_t top_k = 40; float top_p = 0.9f; float temp = 0.9f; int32_t n_batch = 8; // batch size for prompt processing std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path std::string prompt; }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); // // Vocab utils // struct gpt_vocab { using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; std::vector special_tokens; void add_special_token(const std::string &token) { special_tokens.push_back(token); } }; void replace(std::string & str, const std::string & needle, const std::string & replacement); // poor-man's JSON parsing std::map json_parse(const std::string & fname); // split text into tokens // // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 // // Regex (Python): // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" // // Regex (C++): // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); // load the tokens from encoder.json bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); // sample next token given probabilities for each embedding // // - consider only the top K tokens // - from them, consider only the top tokens with cumulative probability > P // // TODO: not sure if this implementation is correct // gpt_vocab::id gpt_sample_top_k_top_p( const gpt_vocab & vocab, const size_t actualVocabSize, const int32_t * last_n_tokens_data, int last_n_tokens_size, const std::vector logits, int top_k, double top_p, double temp, float repeat_penalty, std::mt19937 & rng); ================================================ FILE: prompt_template_sample.txt ================================================ ### Instruction: The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response. 
### Prompt: %1 ### Response: ================================================ FILE: src/CMakeLists.txt ================================================ add_executable(chat chat.cpp header.h utils.h parse_json.h ../gpt4all-backend/llmodel_c.h) target_link_libraries(chat PRIVATE llmodel llama) ================================================ FILE: src/chat.cpp ================================================ #include "./header.h" #include "../gpt4all-backend/llmodel_c.h" #include "./utils.h" #include "./parse_json.h" ////////////////////////////////////////////////////////////////////////// //////////// ANIMATION //////////// ////////////////////////////////////////////////////////////////////////// std::atomic<bool> stop_display{false}; void display_frames() { const char* frames[] = {".", ":", "'", ":"}; int frame_index = 0; ConsoleState con_st; con_st.use_color = true; while (!stop_display) { set_console_color(con_st, PROMPT); std::cerr << "\r" << frames[frame_index % 4] << std::flush; frame_index++; set_console_color(con_st, DEFAULT); if (!stop_display){ std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::cerr << "\r" << " " << std::flush; std::cerr << "\r" << std::flush; } } } void display_loading() { while (!stop_display) { for (int i=0; i < 14; i++){ fprintf(stdout, "."); fflush(stdout); std::this_thread::sleep_for(std::chrono::milliseconds(200)); if (stop_display){ break; } } std::cout << "\r" << " " << "\r" << std::flush; } std::cout << "\r" << " " << std::flush; } ////////////////////////////////////////////////////////////////////////// //////////// /ANIMATION //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// CHAT FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// #ifndef OLD_MACOS bool save_state_to_binary(llmodel_model& model, uint8_t *dest, chatParams& params, std::string &filename, uint64_t model_size) { if (params.save_dir == "") { std::filesystem::path directory_path(params.path+"saves"); if (!std::filesystem::is_directory(directory_path)) { if (!std::filesystem::create_directory(directory_path)) { std::cerr << "Error creating directory" << std::endl; return false; } } params.save_dir = params.path+"saves"; } //sanity check that we're not trying to overwrite binaries of wrong size //empty binaries are allowed, so are previous saves of same model type if (std::filesystem::exists(params.save_dir+"/"+filename+".bin")) { uint64_t file_size = std::filesystem::file_size(params.save_dir+"/"+filename+".bin"); if ((file_size == model_size) || (file_size == 0)) { //continue } else { std::cerr << "You are trying to overwrite existing binary of different size!
" << params.save_dir+"/"+filename+".bin" << std::endl; return 0; } } // create an output file stream std::ofstream outfile; // open the file in binary mode outfile.open(params.save_dir+"/"+filename+".bin", std::ios::binary); // check if the file stream is open if (!outfile.is_open()) { std::cerr << "Error opening file " << params.save_dir+"/"+filename+".bin" << std::endl; return false; } // write the model data to the file stream uint64_t copied_bytes = llmodel_save_state_data(model, dest); outfile.write(reinterpret_cast(dest), copied_bytes); // close the file stream outfile.close(); return true; } bool load_state_from_binary(llmodel_model& model, chatParams& params, std::string &filename, uint64_t model_size) { if (params.save_dir == "") { params.save_dir = params.path+"saves"; } //sanity check that we're not trying to load binaries of wrong size //only binaries that are saves of same model type are allowed if (std::filesystem::exists(params.save_dir+"/"+filename+".bin")) { uint64_t file_size = std::filesystem::file_size(params.save_dir+"/"+filename+".bin"); if (file_size == model_size) { //continue } else { std::cerr << "You are trying to load a binary of wrong size! " << params.save_dir+"/"+filename+".bin" << std::endl; return 0; } } // create an input file stream std::ifstream infile; // open the file in binary mode infile.open(params.save_dir+"/"+filename+".bin", std::ios::binary); // check if the file stream is open if (!infile.is_open()) { std::cerr << "Error opening file " << params.save_dir+"/"+filename+".bin" << std::endl; return false; } // get the size of the file infile.seekg(0, std::ios::end); uint64_t file_size = infile.tellg(); infile.seekg(0, std::ios::beg); // allocate a buffer to hold the file data uint8_t* buffer = new uint8_t[file_size]; try { buffer = new uint8_t[file_size]; } catch (std::bad_alloc& ba) { std::cerr << "Failed to allocate buffer: " << ba.what() << std::endl; return false; } // read the file data into the buffer infile.read(reinterpret_cast(buffer), file_size); infile.close(); // restore the internal state of the model using the buffer data llmodel_restore_state_data(model, buffer); delete[] buffer; return true; } bool save_ctx_to_binary(llmodel_prompt_context& prompt_context, chatParams& params, std::string &filename) { if (params.save_dir == "") { std::filesystem::path directory_path(params.path+"saves"); if (!std::filesystem::is_directory(directory_path)) { if (!std::filesystem::create_directory(directory_path)) { std::cerr << "Error creating directory" << std::endl; return false; } } params.save_dir = params.path+"saves"; } std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + ".ctx"); std::string fullPath = filePath.string(); // Open the binary file for writing FILE* file = fopen(fullPath.c_str(), "wb"); if (!file) { std::cerr << "Error opening file: " << fullPath << std::endl; return false; } // Write the struct to the file using fwrite fwrite(&prompt_context, sizeof(prompt_context), 1, file); // Close the file fclose(file); return true; } llmodel_prompt_context load_ctx_from_binary(chatParams& params, std::string &filename) { if (params.save_dir == "") { params.save_dir = params.path+"saves"; } // Construct the file path with home directory expansion std::filesystem::path filePath = std::filesystem::path(params.save_dir) / (filename + ".ctx"); std::string fullPath = filePath.string(); // Open the binary file for reading FILE* file = fopen(fullPath.c_str(), "rb"); if (!file) { std::cerr << "Error opening file: " << 
fullPath << std::endl; exit(EXIT_FAILURE); } // Read the struct from the file using fread llmodel_prompt_context prompt_context; fread(&prompt_context, sizeof(prompt_context), 1, file); // Close the file fclose(file); return prompt_context; } #endif std::string get_input(ConsoleState& con_st, std::string& input, chatParams &params, llmodel_prompt_context &prompt_context, llmodel_model& model) { set_console_color(con_st, USER_INPUT); std::cout << "\n> "; std::getline(std::cin, input); std::istringstream iss(input); std::string input1, input2; std::getline(iss, input1, ' '); std::getline(iss, input2, ' '); set_console_color(con_st, DEFAULT); if (input == "/reset") { //reset the logits, tokens and past conversation prompt_context.logits = params.logits; prompt_context.logits_size = params.logits_size; prompt_context.tokens = params.tokens; prompt_context.tokens_size = params.tokens_size; prompt_context.n_past = params.n_past; prompt_context.n_ctx = params.n_ctx; //get new input using recursion set_console_color(con_st, PROMPT); std::cout << "Chat context reset."; return get_input(con_st, input, params, prompt_context, model); } #ifndef OLD_MACOS if ((input == "/save" || input1 == "/save") && (params.no_saves == false)) { std::string filename = params.save_name; if (input2 != "" && (input2.find("..") == std::string::npos) ) { filename = input2; } bool success1 = false; bool success2 = false; uint64_t model_size = llmodel_get_state_size(model); uint8_t *dest = new uint8_t[model_size]; success1 = save_state_to_binary(model, dest, params, filename, model_size); delete[] dest; success2 = save_ctx_to_binary(prompt_context, params, filename); //get new input using recursion set_console_color(con_st, PROMPT); if (success1 && success2) { std::cout << "Model data saved to: " << params.save_dir+"/"+filename+".bin" << " size: " << floor(model_size/10000000)/100.0 << " Gb"; } return get_input(con_st, input, params, prompt_context, model); } if ((input == "/load" || input1 == "/load") && (params.no_saves == false)) { std::string filename = params.save_name; if (input2 != "" && (input2.find("..") == std::string::npos) ) { filename = input2; } //reset the logits, tokens and past conversation free(prompt_context.logits); free(prompt_context.tokens); prompt_context.logits = params.logits; prompt_context.logits_size = params.logits_size; prompt_context.tokens = params.tokens; prompt_context.tokens_size = params.tokens_size; prompt_context.n_past = params.n_past; prompt_context.n_ctx = params.n_ctx; bool success = false; uint64_t model_size = llmodel_get_state_size(model); prompt_context = load_ctx_from_binary(params, filename); success = load_state_from_binary(model, params, filename, model_size); model_size = llmodel_get_state_size(model); //get new input using recursion set_console_color(con_st, PROMPT); if (success) { std::cout << "Model data loaded from: " << params.save_dir+"/"+filename+".bin" << " size: " << floor(model_size/10000000)/100.0 << " Gb"; } return get_input(con_st, input, params, prompt_context, model); } #endif if (input == "/help"){ set_console_color(con_st, DEFAULT); std::cout << std::endl; char emptystring[] = ""; char* emptyargv[] = {emptystring}; int emptyargc = sizeof(emptyargv) / sizeof(char*); print_usage(emptyargc, emptyargv, params); return get_input(con_st, input, params, prompt_context, model); } if (input == "/about"){ set_console_color(con_st, DEFAULT); std::cout << std::endl; print_version(); return get_input(con_st, input, params, prompt_context, model); } if (input == "exit" ||
input == "quit" || input == "/exit" || input == "/quit") { llmodel_model_destroy(model); exit(0); } return input; } std::string hashstring = ""; std::string answer = ""; ////////////////////////////////////////////////////////////////////////// //////////// /CHAT FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// MAIN PROGRAM //////////// ////////////////////////////////////////////////////////////////////////// int main(int argc, char* argv[]) { ConsoleState con_st; con_st.use_color = true; set_console_color(con_st, DEFAULT); set_console_color(con_st, PROMPT); set_console_color(con_st, BOLD); std::cout << APPNAME; set_console_color(con_st, DEFAULT); set_console_color(con_st, PROMPT); std::cout << " (v. " << VERSION << ")"; set_console_color(con_st, DEFAULT); std::cout << "" << std::endl; check_avx_support_at_startup(); chatParams params; //convert the default model path into Windows format if on WIN32 #ifdef _WIN32 std::filesystem::path p(params.model); params.model = p.make_preferred().string(); #endif //get all parameters from cli arguments or json parse_params(argc, argv, params); //Create a prompt_context and copy all params from chatParams to prompt_context llmodel_prompt_context prompt_context = { .logits = params.logits, .logits_size = params.logits_size, .tokens = params.tokens, .tokens_size = params.tokens_size, .n_past = params.n_past, .n_ctx = params.n_ctx, .n_predict = params.n_predict, .top_k = params.top_k, .top_p = params.top_p, .temp = params.temp, .n_batch = params.n_batch, .repeat_penalty = params.repeat_penalty, .repeat_last_n = params.repeat_last_n, .context_erase = params.context_erase, }; //Subprocess signal handling #ifdef _WIN32 SetConsoleCtrlHandler(console_ctrl_handler, TRUE); #else signal(SIGHUP, handle_sighup); #endif ////////////////////////////////////////////////////////////////////////// //////////// LOAD THE MODEL //////////// ////////////////////////////////////////////////////////////////////////// //animation std::future future; stop_display = true; if(params.use_animation) {stop_display = false; future = std::async(std::launch::async, display_loading);} //handle stderr for now //this is just to prevent printing unnecessary details during model loading. int stderr_copy = dup(fileno(stderr)); #ifdef _WIN32 std::freopen("NUL", "w", stderr); #else std::freopen("/dev/null", "w", stderr); #endif llmodel_model model = llmodel_model_create(params.model.c_str()); std::cout << "\r" << APPNAME << ": loading " << params.model.c_str() << std::endl; //bring back stderr for now dup2(stderr_copy, fileno(stderr)); close(stderr_copy); //check if model is loaded auto check_model = llmodel_loadModel(model, params.model.c_str()); if (check_model == false) { if(params.use_animation) { stop_display = true; future.wait(); stop_display= false; } std::cerr << "Error loading: " << params.model.c_str() << std::endl; std::cout << "Press any key to exit..." << std::endl; std::cin.get(); return 0; } else { if(params.use_animation) { stop_display = true; future.wait(); } std::cout << "\r" << APPNAME << ": done loading!" 
<< std::flush; } ////////////////////////////////////////////////////////////////////////// //////////// /LOAD THE MODEL //////////// ////////////////////////////////////////////////////////////////////////// set_console_color(con_st, PROMPT); std::cout << "\n" << params.prompt.c_str() << std::endl; set_console_color(con_st, DEFAULT); //load prompt template from file instead if (params.load_template != "") { std::tie(params.default_prefix, params.default_header, params.default_footer) = read_prompt_template_file(params.load_template); } //load chat log from a file if (params.load_log != "") { if (params.prompt == "") { params.prompt = params.default_prefix + read_chat_log(params.load_log) + params.default_header; } else { params.prompt = params.default_prefix + read_chat_log(params.load_log) + params.default_header + params.prompt; } } else { params.prompt = params.default_prefix + params.default_header + params.prompt; } ////////////////////////////////////////////////////////////////////////// //////////// PROMPT LAMBDA FUNCTIONS //////////// ////////////////////////////////////////////////////////////////////////// auto prompt_callback = [](int32_t token_id) { // You can handle prompt here if needed return true; }; auto response_callback = [](int32_t token_id, const char *responsechars) { if (!(responsechars == nullptr || responsechars[0] == '\0')) { // stop the animation, printing response if (stop_display == false) { stop_display = true; std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::cerr << "\r" << " " << std::flush; std::cerr << "\r" << std::flush; if (answer != "") {std::cout << answer;} } std::cout << responsechars << std::flush; answer += responsechars; } return true; }; auto recalculate_callback = [](bool is_recalculating) { // You can handle recalculation requests here if needed return is_recalculating; }; ////////////////////////////////////////////////////////////////////////// //////////// PROMPT TEXT AND GET RESPONSE //////////// ////////////////////////////////////////////////////////////////////////// llmodel_setThreadCount(model, params.n_threads); std::string input = ""; //main chat loop. if (!params.no_interactive && !sighup_received) { input = get_input(con_st, input, params, prompt_context, model); //Interactive mode. We have a prompt. if (params.prompt != "") { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.prompt + " " + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.prompt + " " + input + params.default_footer).c_str(), answer.c_str()); } //Interactive mode. Else get prompt from input. 
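// For reference, every llmodel_prompt() call in this interactive chat path receives a
// string assembled from the template pieces defined in src/header.h. Illustrative sketch
// only (assuming the default prefix/header/footer, an initial prompt of Hello passed with
// -p, and Hi typed at the "> " prompt):
//   "### Instruction:\n The prompt below is a question to answer, ... write an
//    appropriate response.\n### Prompt: Hello Hi\n### Response: "
// i.e. params.prompt (prefix + header + initial prompt) + " " + user input + footer.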
} else { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); } } //Interactive and continuous mode. Get prompt from input. while (!params.run_once && !sighup_received) { answer = ""; //New prompt. We stored previous answer in memory so clear it. input = get_input(con_st, input, params, prompt_context, model); if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.default_prefix + params.default_header + input + params.default_footer).c_str(), answer.c_str()); } } //No-interactive mode. Get the answer once from prompt and print it. 
} else { if (params.use_animation){ stop_display = false; future = std::async(std::launch::async, display_frames); } if (params.b_token != ""){answer = answer + params.b_token; if(!params.use_animation) {std::cout << params.b_token;} } llmodel_prompt(model, (params.prompt + params.default_footer).c_str(), prompt_callback, response_callback, recalculate_callback, &prompt_context); if (params.e_token != ""){std::cout << params.e_token; answer = answer + params.e_token; } if (params.use_animation){ stop_display = true; future.wait(); stop_display = false; } if (params.save_log != ""){ save_chat_log(params.save_log, (params.prompt + params.default_footer).c_str(), answer.c_str()); } std::cout << std::endl; } set_console_color(con_st, DEFAULT); llmodel_model_destroy(model); return 0; } ================================================ FILE: src/header.h ================================================ #pragma once #ifndef HEADER_H #define HEADER_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include //For paths //Commented out to support really old xcode #ifndef OLD_MACOS #include #endif //For Windows MSVC compilation #if defined(_WIN32) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include #else #include #endif #include #include #include #include #include #include "config.h" #include #include #include // chatParams contains all the parameters you can import from json or with cli arguments // it also contains the initial value for PromptContext struct chatParams { //std::vector logits, // logits of current context //std::vector tokens, // current tokens in the context window //These are in the prompt context, maybe add as parameters too. 
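// The fields in this first group mirror the members of llmodel_prompt_context: main() in
// chat.cpp copies them field-for-field into the prompt_context handed to the backend, and
// the /reset and /load commands in get_input assign them back to return the context to
// this initial, empty state.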
float *logits = nullptr; // logits of current context size_t logits_size = 0; // the size of the raw logits vector int32_t *tokens = nullptr; // current tokens in the context window size_t tokens_size = 0; // the size of the raw tokens vector int32_t n_past = 0; // number of tokens in past conversation //Parameters below you can import from json or with cli arguments int32_t n_ctx = 0; // number of tokens possible in context window int32_t n_predict = 200; // number of tokens to predict int32_t top_k = 40; // top k logits to sample from float top_p = 0.95; // nucleus sampling probability threshold float temp = 0.28; // temperature to adjust model's output distribution int32_t n_batch = 9; // number of predictions to generate in parallel float repeat_penalty = 1.1; // penalty factor for repeated tokens int32_t repeat_last_n = 64; // last n tokens to penalize float context_erase = 0.75; // percent of context to erase if we exceed the context window //Parameters below are not inside prompt_context, but handled separately int32_t seed = -1; int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); std::string model = "./models/ggml-vicuna-13b-1.1-q4_2.bin"; std::string prompt = ""; //template prefix, header, and footer std::string default_prefix = "### Instruction:\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response."; std::string default_header = "\n### Prompt: "; std::string default_footer = "\n### Response: "; //You can toggle chat interactivity with these parameters bool no_interactive = false; bool use_animation = true; bool run_once = false; bool no_saves = false; std::string b_token = ""; //beginning wrap token std::string e_token = ""; //ending wrap token std::string load_template = ""; //template file location std::string load_json = ""; //json file location std::string save_log = ""; //saved chat log file location std::string load_log = ""; //loaded chat log file location std::string save_name = "model_state"; //model state binary name std::string save_dir = ""; //saves directory name //program binary path std::string path = ""; }; enum ConsoleColor { DEFAULT = 0, PROMPT, USER_INPUT, BOLD }; struct ConsoleState { bool use_color = false; ConsoleColor color = DEFAULT; }; std::string APPNAME = "LlamaGPTJ-chat"; //utils.h functions void set_console_color(ConsoleState &con_st, ConsoleColor color); std::string random_prompt(int32_t seed); void print_usage(int argc, char** argv, const chatParams& params); bool parse_params(int argc, char** argv, chatParams& params); //parse_json.h functions void get_params_from_json(chatParams& params); #endif ================================================ FILE: src/parse_json.h ================================================ #pragma once #ifndef PARSE_JSON_H #define PARSE_JSON_H #include "header.h" //helper function to convert string to bool bool stob(const std::string& str) { std::string lowerStr = str; std::transform(str.begin(), str.end(), lowerStr.begin(), ::tolower); if (lowerStr == "true") { return true; } else if (lowerStr == "false") { return false; } else { throw std::invalid_argument("Invalid boolean string"); } } std::string readFile(const std::string& filename) { std::ifstream inFile(filename); if (!inFile) { std::cerr << "Unable to open file: " << filename << std::endl; return ""; } std::stringstream buffer; buffer << inFile.rdbuf(); inFile.close(); return buffer.str(); } std::map parse_json_string(const std::string& jsonString) 
{ std::map resultMap; std::regex pattern("\"([^\"]+)\":\\s*([^\"]+|\"[^\"]+\")"); std::smatch match; std::string::const_iterator searchStart(jsonString.cbegin()); while (std::regex_search(searchStart, jsonString.cend(), match, pattern)) { resultMap[match[1]] = match[2]; searchStart = match.suffix().first; } return resultMap; } std::string removeQuotes(const std::string& input) { std::string result = input; result.erase(std::remove(result.begin(), result.end(), '\"'), result.end()); return result; } void get_params_from_json(chatParams& params) { std::map parsed = parse_json_string(readFile(params.load_json)); if (parsed.find("top_p") != parsed.end()) params.top_p = std::stof(parsed["top_p"]); if (parsed.find("top_k") != parsed.end()) params.top_k = std::stoi(parsed["top_k"]); if (parsed.find("temp") != parsed.end()) params.temp = std::stof(parsed["temp"]); if (parsed.find("n_predict") != parsed.end()) params.n_predict = std::stoi(parsed["n_predict"]); if (parsed.find("n_batch") != parsed.end()) params.n_batch = std::stoi(parsed["n_batch"]); if (parsed.find("n_ctx") != parsed.end()) params.n_ctx = std::stoi(parsed["n_ctx"]); if (parsed.find("seed") != parsed.end()) params.seed = std::stoi(parsed["seed"]); if (parsed.find("threads") != parsed.end()) params.n_threads = std::stoi(parsed["threads"]); if (parsed.find("model") != parsed.end()) params.model = removeQuotes(parsed["model"]); if (parsed.find("prompt") != parsed.end()) params.prompt = removeQuotes(parsed["prompt"]); if (parsed.find("no-interactive") != parsed.end()) params.no_interactive = stob(removeQuotes(parsed["no-interactive"])); if (parsed.find("run-once") != parsed.end()) params.run_once = stob(removeQuotes(parsed["run-once"])); if (parsed.find("no-animation") != parsed.end()) params.use_animation = !stob(removeQuotes(parsed["no-animation"])); if (parsed.find("no-saves") != parsed.end()) params.no_saves = stob(removeQuotes(parsed["no-saves"])); if (parsed.find("repeat_penalty") != parsed.end()) params.repeat_penalty = std::stof(parsed["repeat_penalty"]); if (parsed.find("repeat_last_n") != parsed.end()) params.repeat_last_n = std::stoi(parsed["repeat_last_n"]); if (parsed.find("context_erase") != parsed.end()) params.context_erase = std::stof(parsed["context_erase"]); if (parsed.find("b_token") != parsed.end()) params.b_token = removeQuotes(parsed["b_token"]); if (parsed.find("e_token") != parsed.end()) params.e_token = removeQuotes(parsed["e_token"]); if (parsed.find("load_template") != parsed.end()) params.load_template = removeQuotes(parsed["load_template"]); if (parsed.find("save_log") != parsed.end()) params.save_log = removeQuotes(parsed["save_log"]); if (parsed.find("load_log") != parsed.end()) params.load_log = removeQuotes(parsed["load_log"]); if (parsed.find("save_dir") != parsed.end()) params.save_dir = removeQuotes(parsed["save_dir"]); if (parsed.find("save_name") != parsed.end()) params.save_name = removeQuotes(parsed["save_name"]);} #endif ================================================ FILE: src/utils.h ================================================ #pragma once #ifndef UTILS_H #define UTILS_H #include "header.h" //Need this for Windows colors #ifdef _WIN32 #include #endif bool containsSubstring(const std::string &str, const std::string &substr) { return str.find(substr) != std::string::npos; } void check_avx_support_at_startup() { #if defined(__x86_64__) || defined(__i386__) const bool avx(__builtin_cpu_supports("avx")); const bool avx2(__builtin_cpu_supports("avx2")); const bool 
avx512(__builtin_cpu_supports("avx512f")); const bool fma(__builtin_cpu_supports("fma")); if (avx512 && avx && avx2 && fma) {std::cout << "Your computer supports AVX512" << std::endl;} else if (avx && avx2 && fma) {std::cout << "Your computer supports AVX2" << std::endl;} else if (avx) {std::cout << "Your computer only supports AVX1" << std::endl;} else {std::cout << "Your computer does not support AVX1 or AVX2\nThe program will likely not run." << std::endl;} #ifdef OLD_MACOS std::cout << "Compiled with OLD_MACOS flag. /save and /load features turned off." << std::endl; #endif #endif } ////////////////////////////////////////////////////////////////////////// //////////// SIGNAL HANDLING //////////// ////////////////////////////////////////////////////////////////////////// volatile sig_atomic_t sighup_received = 0; void handle_sighup(int signal) { #ifndef _WIN32 if (signal == SIGHUP) { sighup_received = 1; } #endif } #ifdef _WIN32 BOOL WINAPI console_ctrl_handler(DWORD ctrl_type) { switch (ctrl_type) { case CTRL_C_EVENT: case CTRL_CLOSE_EVENT: sighup_received = 1; return TRUE; default: return FALSE; } } #endif ////////////////////////////////////////////////////////////////////////// //////////// /SIGNAL HANDLING //////////// ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// //////////// READ PROMPT TEMPLATE FILE //////////// ////////////////////////////////////////////////////////////////////////// //This is a bit messy function but it should parse the template file into prefix, header, and footer. //Chat will then prompt the model with (prefix + header + input/prompt + footer) std::tuple read_prompt_template_file(const std::string& file_path) { std::string prefix, header, footer; std::ifstream file(file_path); std::vector lines; std::string line; //store all lines of header template into a vector if (file.is_open()) { while (std::getline(file, line)) { lines.push_back(line); } file.close(); } else { std::cerr << "Unable to open the prompt template file." << std::endl; std::cerr << "Reverting to default prompt template." << std::endl; return std::make_tuple("### Instruction:\n The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.", "\n### Prompt: ", "\n### Response: "); } //find line containing %1 and store its index. int input_index; for (size_t i = 0; i < lines.size(); ++i) { if (lines[i].find("%1") != std::string::npos) { input_index = i; } } //Special case of having only %1 in template file. if (input_index == 0) { header = ""; prefix = ""; footer = ""; //If there is only 1 line above %1, that will be ### header. } else if (input_index == 1) { header = lines[0]; prefix = " "; } else { //Put lines above the header-line into prefix. prefix = lines[0]; for (size_t i = 1; i < input_index-1; ++i) { prefix = prefix + "\n" + lines[i]; } prefix = prefix + " "; //store header-line (line above input-line) header = "\n" + lines[input_index-1] + " "; //Put lines below the input-line into footer. 
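// Worked example (an illustrative four-line template, not the bundled sample file
// verbatim):
//   line 0: "### Instruction: answer the question below."
//   line 1: "### Prompt:"
//   line 2: "%1"
//   line 3: "### Response:"
// Here input_index is 2, so prefix becomes line 0 plus a trailing space, header becomes
// "\n" + line 1 + " ", and the loop below collects line 3 into footer after its leading "\n".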
footer = "\n"; for (size_t i = input_index+1; i < lines.size(); ++i) { footer = footer + lines[i]+" "; } } return std::make_tuple(prefix, header, footer); } ////////////////////////////////////////////////////////////////////////// //////////// /READ PROMPT TEMPLATE FILE //////////// ////////////////////////////////////////////////////////////////////////// void save_chat_log(std::string save_log, std::string prompt, std::string answer) { std::ofstream logfile(save_log, std::ios::app); if (logfile.is_open()) { logfile << prompt; logfile << answer+"\n"; logfile.close(); } } std::string read_chat_log(std::string load_log) { std::ifstream ifs(load_log); std::string content((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); return content; } std::string pathname_directory(const std::string &pathname) { std::size_t len = pathname.find_last_of("/\\"); return len == std::string::npos ? "": pathname.substr(0, len); } void set_console_color(ConsoleState &con_st, ConsoleColor color) { if (con_st.use_color && con_st.color != color) { //Windows handles colors differently. #ifdef _WIN32 WORD windows_colors[] = { 7, 14, 10, 15 }; HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); SetConsoleTextAttribute(hConsole, windows_colors[color]); #else //ANSI colors, works for unix. const char* ansi_colors[] = { //DEFAULT, PROMPT, USER_INPUT, BOLD //default, yellow, bright_green, bold "\x1b[0m", "\x1b[33m", "\x1b[1m\x1b[32m", "\x1b[1m" }; printf("%s", ansi_colors[color]); #endif con_st.color = color; } } std::string random_prompt(int32_t seed) { const std::vector prompts = { "So", "Once upon a time", "When", "The", "After", "If", "import", "He", "She", "They" }; std::mt19937 rng(seed); return prompts[rng() % prompts.size()]; } void print_version() { //Version/about page //Contains License information for distributions in binary form std::string mit_license = R"(MIT License Big thanks to contributors, testers, and commenters on Github. And to you, dear user! Happy chatting! :) )"; std::cout << "\n\n" << APPNAME << " version " << VERSION << "\n\n" << "Made by kuvaus" << "\n\n" << mit_license << std::endl; //std::cout << mit_license << std::endl; } void print_usage(int argc, char** argv, const chatParams& params) { // Print usage information fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "A simple chat program for GPT-J, LLaMA, and MPT models.\n"); fprintf(stderr, "You can set specific initial prompt with the -p flag.\n"); fprintf(stderr, "Runs default in interactive and continuous mode.\n"); fprintf(stderr, "Type '/reset' to reset the chat context.\n"); fprintf(stderr, "Type '/save','/load' to save network state into a binary file.\n"); fprintf(stderr, "Type '/save NAME','/load NAME' to rename saves. 
Default: --save_name NAME.\n"); fprintf(stderr, "Type '/help' to show this help dialog.\n"); fprintf(stderr, "Type 'quit', 'exit' or, 'Ctrl+C' to quit.\n"); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -v, --version show version and license information\n"); fprintf(stderr, " --run-once disable continuous mode\n"); fprintf(stderr, " --no-interactive disable interactive mode altogether (uses given prompt only)\n"); fprintf(stderr, " --no-animation disable chat animation\n"); fprintf(stderr, " --no-saves disable '/save','/load' functionality\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed for --random-prompt (default: -1)\n"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: empty)\n"); fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); fprintf(stderr, " --n_ctx N number of tokens in context window (default: %d)\n", params.n_ctx); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --repeat_penalty N repeat_penalty (default: %.1f)\n", params.repeat_penalty); fprintf(stderr, " --repeat_last_n N last n tokens to penalize (default: %d)\n", params.repeat_last_n); fprintf(stderr, " --context_erase N percent of context to erase (default: %.1f)\n", params.context_erase); fprintf(stderr, " --b_token optional beginning wrap token for response (default: empty)\n"); fprintf(stderr, " --e_token optional end wrap token for response (default: empty)\n"); fprintf(stderr, " -j, --load_json FNAME\n"); fprintf(stderr, " load options instead from json at FNAME (default: empty/no)\n"); fprintf(stderr, " --load_template FNAME\n"); fprintf(stderr, " load prompt template from a txt file at FNAME (default: empty/no)\n"); fprintf(stderr, " --save_log FNAME\n"); fprintf(stderr, " save chat log to a file at FNAME (default: empty/no)\n"); fprintf(stderr, " --load_log FNAME\n"); fprintf(stderr, " load chat log from a file at FNAME (default: empty/no)\n"); fprintf(stderr, " --save_dir DIR\n"); fprintf(stderr, " directory for saves (default: %s/saves)\n", pathname_directory(argv[0]).c_str()); fprintf(stderr, " --save_name NAME\n"); fprintf(stderr, " save/load model state binary at save_dir/NAME.bin (current: %s)\n", params.save_name.c_str()); fprintf(stderr, " context is saved to save_dir/NAME.ctx (current: %s)\n", params.save_name.c_str()); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (current: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); } bool parse_params(int argc, char** argv, chatParams& params) { // Parse command-line arguments for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg == "-j" || arg == "--load_json") { params.load_json = argv[++i]; if (!params.load_json.empty()) { std::cout << APPNAME << ": parsing options from json: " << params.load_json << std::endl; get_params_from_json(params); } else { std::cout << APPNAME << ": trying to parse options from json but got empty 
filename." << std::endl; } } else if (arg == "--run-once") { params.run_once = true; } else if (arg == "--no-interactive") { params.no_interactive = true; } else if (arg == "--no-animation") { params.use_animation = false; } else if (arg == "--no-saves") { params.no_saves = true; } else if (arg == "-s" || arg == "--seed") { params.seed = static_cast(std::stoi(argv[++i])); } else if (arg == "-t" || arg == "--threads") { params.n_threads = static_cast(std::stoi(argv[++i])); } else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "--random-prompt") { params.prompt = random_prompt(params.seed); } else if (arg == "-n" || arg == "--n_predict") { params.n_predict = static_cast(std::stoi(argv[++i])); } else if (arg == "--top_k") { params.top_k = static_cast(std::stoi(argv[++i])); } else if (arg == "--top_p") { params.top_p = static_cast(std::stof(argv[++i])); } else if (arg == "--temp") { params.temp = static_cast(std::stof(argv[++i])); } else if (arg == "-b" || arg == "--batch_size") { params.n_batch = static_cast(std::stoi(argv[++i])); } else if (arg == "--n_ctx") { params.n_ctx = static_cast(std::stoi(argv[++i])); } else if (arg == "--repeat_penalty") { params.repeat_penalty = static_cast(std::stof(argv[++i])); } else if (arg == "--repeat_last_n") { params.repeat_last_n = static_cast(std::stoi(argv[++i])); } else if (arg == "--context_erase") { params.context_erase = static_cast(std::stof(argv[++i])); } else if (arg == "--b_token") { params.b_token = argv[++i]; } else if (arg == "--e_token") { params.e_token = argv[++i]; } else if (arg == "--load_template") { params.load_template = argv[++i]; } else if (arg == "--save_log") { params.save_log = argv[++i]; } else if (arg == "--load_log") { params.load_log = argv[++i]; } else if (arg == "--save_dir") { params.save_dir = argv[++i]; } else if (arg == "--save_name") { params.save_name = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, params); exit(0); } else if (arg == "-v" || arg == "--version") { print_version(); exit(0); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv, params); exit(0); } } //get path to program params.path = pathname_directory(argv[0]); params.path.append("/"); return true; } #endif