Repository: jlko/semantic_uncertainty Branch: master Commit: a8d9aa8cecd5 Files: 20 Total size: 144.0 KB Directory structure: gitextract_1q6fc1mb/ ├── .gitignore ├── LICENSE ├── README.md ├── environment.yaml ├── environment_export.yaml ├── notebooks/ │ └── example_evaluation.ipynb └── semantic_uncertainty/ ├── analyze_results.py ├── compute_uncertainty_measures.py ├── generate_answers.py └── uncertainty/ ├── __init__.py ├── data/ │ └── data_utils.py ├── models/ │ ├── __init__.py │ ├── base_model.py │ └── huggingface_models.py ├── uncertainty_measures/ │ ├── p_ik.py │ ├── p_true.py │ └── semantic_entropy.py └── utils/ ├── eval_utils.py ├── openai.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ figures/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # Slurm *.out # VSCODE .vscode ================================================ FILE: LICENSE ================================================ The Clear BSD License Copyright (c) 2023 All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ # Detecting Hallucinations in Large Language Models Using Semantic Entropy This repository contains the code necessary to reproduce the short-phrase and sentence-length experiments of the Nature submission 'Detecting Hallucinations in Large Language Models Using Semantic Entropy'. This repository builds on the original, now deprecated codebase for semantic uncertainty at [https://github.com/lorenzkuhn/semantic_uncertainty](https://github.com/lorenzkuhn/semantic_uncertainty). ## System Requirements We here discuss hardware and software system requirements. ### Hardware Dependencies Generally speaking, our experiments require modern computer hardware which is suited for usage with large language models (LLMs). Requirements regarding the system's CPU and RAM size are relatively modest: any reasonably modern system should suffice, e.g. a system with an Intel 10th generation CPU and 16 GB of system memory or better. More importantly, all our experiments make use of one or more Graphics Processor Units (GPUs) to speed up LLM inference. Without a GPU, it is not feasible to reproduce our results in a reasonable amount of time. The particular GPU necessary depends on the choice of LLM: LLMs with more parameters require GPUs with more memory. For smaller models (7B parameters), desktop GPUs such as the Nvidia TitanRTX (24 GB) are sufficient. For larger models (13B), GPUs with more memory, such as the Nvidia A100 server GPU, are required. Our largest models with 70B parameters require the use of two Nvidia A100 GPUs (2x80GB) simultaneously. One can reduce the precision to float16 or int8 to reduce memory requirements without significantly affecting model predictions and their accuracy. We use float16 for 70B models by default, and int8 mode can be enabled for any model by suffixing the model name with `-int8`. ### Software Dependencies Our code relies on Python 3.11 with PyTorch 2.1. Our systems run the Ubuntu 20.04.6 LTS (GNU/Linux 5.15.0-89-generic x86_64) operating system. In [environment_export.yaml](environment_export.yaml) we list the exact versions for all Python packages used in our experiments. We generally advise against trying to install from this exact export of our conda environment. Please see below for installation instructions. Although we have not tested this, we would expect our code to be compatible with other operating systems, Python versions, and versions of the Python libraries that we use. ## Installation Guide To install Python with all necessary dependencies, we recommend the use of conda, and we refer to [https://conda.io/](https://conda.io/) for an installation guide. After installing conda, you can set up and activate a new conda environment with all required packages by executing the following commands from the root folder of this repository in a shell: ``` conda-env update -f environment.yaml conda activate semantic_uncertainty ``` The installation should take around 15 minutes. Our experiments rely on [Weights & Biases (wandb)](https://wandb.ai/) to log and save individual runs. While wandb will be installed automatically with the above conda script, you may need to log in with your wandb API key upon initial execution. Our experiments rely on Hugging Face for all LLM models and most of the datasets. It may be necessary to set the environment variable `HUGGING_FACE_HUB_TOKEN` to the token associated with your Hugging Face account. Further, it may be necessary to [apply for access](https://huggingface.co/meta-llama) to use the official repository of Meta's LLaMa-2 models. We further recommend setting the `XDG_CACHE_HOME` environment variable to a directory on a device with sufficient space, as models and datasets will be downloaded to this folder. Our experiments with sentence-length generation use GPT models from the OpenAI API. Please set the environment variable `OPENAI_API_KEY` to your OpenAI API key in order to use these models. Note that OpenAI charges a cost per input token and per generated token. Costs for reproducing our results vary depending on experiment configuration, but, without any guarantee, should lie somewhere between 5 and 30 USD per run. For almost all tasks, the dataset is downloaded automatically from the Hugging Face Datasets library upon first execution. The only exception is BioASQ (task b, BioASQ11, 2023), for which the data needs to be [downloaded](http://participants-area.bioasq.org/datasets) manually and stored at `$SCRATCH_DIR/$USER/semantic_uncertainty/data/bioasq/training11b.json`, where `$SCRATCH_DIR` defaults to `.`. ## Demo Execute ``` python semantic_uncertainty/generate_answers.py --model_name=Llama-2-7b-chat --dataset=trivia_qa ``` to reproduce results for short-phrase generation with LLaMa-2 Chat (7B) on the BioASQ dataset. The expected runtime of this demo is 1 hour using an Nvidia A100 GPU (80 GB), 24 cores of a Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz, and 192 GB of RAM. Runtime may be longer upon first execution, as the LLM needs to be downloaded from Hugging Face first. To evaluate the run and obtain a barplot similar to those of the paper, open the Jupyter notebook in [notebooks/example_evaluation.ipynb](notebooks/example_evaluation.ipynb), populate the `wandb_id` variable in the first cell with the id assigned to your demo run, and execute all cells of the notebook. We refer to [https://jupyter.org/](https://jupyter.org/) for more information on how to start the Jupter notebook server. ## Further Instructions ### Repository Structure We here give an overview of the various components of the code. By default, a standard run executes the following three scripts in order: * `generate_answers.py`: Sample responses (and their likelihods/hidden states) from the models for a set of input questions. * `compute_uncertainty_measures.py`: Compute uncertainty metrics given responses. * `analyze_results.py`: Compute aggregate performance metrics given uncertainties. It is possible to run these scripts individually, e.g. when recomputing results, and we are happy to provide guidance on how to do so upon request. ### Reproducing the Experiments To reproduce the experiments of the paper, one needs to execute ``` python generate_answers.py --model_name=$MODEL --dataset=$DATASET $EXTRA_CFG ``` for all combinations of models and datasets, and where `$EXTRA_CFG` is defined to either activate short-phrase or sentence-length generations and their associated hyperparameters. Concretely, * `$MODEL` is one of: `[Llama-2-7b, Llama-2-13b, Llama-2-70b, Llama-2-7b-chat, Llama-2-13b-chat, Llama-2-70b-chat, falcon-7b, falcon-40b, falcon-7b-instruct, falcon-40b-instruct, Mistral-7B-v0.1, Mistral-7B-Instruct-v0.1]`, * `$DATASET` is one of `[trivia_qa, squad, bioasq, nq, svamp]`, * and `$EXTRA_CFG=''` is empty for short-phrase generations and `EXTRA_CFG=--num_few_shot=0 --model_max_new_tokens=100 --brief_prompt=chat --metric=llm_gpt-4 --entailment_model=gpt-3.5 --no-compute_accuracy_at_all_temps` for sentence-length generations. The results for any run can be obtained by passing the associated `wandb_id` to an evaluation notebook identical to the demo in [notebooks/example_evaluation.ipynb](notebooks/example_evaluation.ipynb). ================================================ FILE: environment.yaml ================================================ name: semantic_uncertainty channels: - pytorch - nvidia - defaults dependencies: - python=3.11 - pip - pytorch - torchvision - torchaudio - pytorch-cuda=11.8 - numpy - transformers - evaluate - datasets - pip: - transformers>=4.31 - scikit-learn - pandas - flake8 - omegaconf - jupyterlab - notebook - matplotlib - seaborn - tqdm - ipywidgets - scipy - wandb - tokenizers>=0.13.3 - accelerate - ml_collections - torchmetrics - openai - tiktoken - einops - bitsandbytes - nltk - tenacity - sentencepiece - safetensors ================================================ FILE: environment_export.yaml ================================================ name: semantic_uncertainty_export channels: - pytorch - nvidia - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - abseil-cpp=20211102.0=hd4dd3e8_0 - aiohttp=3.8.5=py311h5eee18b_0 - aiosignal=1.2.0=pyhd3eb1b0_0 - arrow=1.2.3=py311h06a4308_1 - arrow-cpp=11.0.0=h374c478_2 - async-timeout=4.0.2=py311h06a4308_0 - attrs=23.1.0=py311h06a4308_0 - aws-c-common=0.6.8=h5eee18b_1 - aws-c-event-stream=0.1.6=h6a678d5_6 - aws-checksums=0.1.11=h5eee18b_2 - aws-sdk-cpp=1.8.185=h721c034_1 - binaryornot=0.4.4=pyhd3eb1b0_1 - blas=1.0=mkl - boost-cpp=1.82.0=hdb19cb5_2 - bottleneck=1.3.5=py311hbed6279_0 - brotlipy=0.7.0=py311h5eee18b_1002 - bzip2=1.0.8=h7b6447c_0 - c-ares=1.19.1=h5eee18b_0 - ca-certificates=2023.08.22=h06a4308_0 - certifi=2023.11.17=py311h06a4308_0 - cffi=1.15.1=py311h5eee18b_3 - chardet=4.0.0=py311h06a4308_1003 - charset-normalizer=2.0.4=pyhd3eb1b0_0 - click=8.0.4=py311h06a4308_0 - cookiecutter=1.7.3=pyhd3eb1b0_0 - cryptography=41.0.3=py311hdda0065_0 - cuda-cudart=11.8.89=0 - cuda-cupti=11.8.87=0 - cuda-libraries=11.8.0=0 - cuda-nvrtc=11.8.89=0 - cuda-nvtx=11.8.86=0 - cuda-runtime=11.8.0=0 - datasets=2.12.0=py311h06a4308_0 - dill=0.3.6=py311h06a4308_0 - evaluate=0.4.0=py311h06a4308_0 - ffmpeg=4.3=hf484d3e_0 - filelock=3.9.0=py311h06a4308_0 - freetype=2.12.1=h4a9f257_0 - frozenlist=1.3.3=py311h5eee18b_0 - fsspec=2023.9.2=py311h06a4308_0 - gflags=2.2.2=he6710b0_0 - giflib=5.2.1=h5eee18b_3 - glog=0.5.0=h2531618_0 - gmp=6.2.1=h295c915_3 - gmpy2=2.1.2=py311hc9b5ff0_0 - gnutls=3.6.15=he1e5248_0 - grpc-cpp=1.48.2=he1ff14a_1 - huggingface_hub=0.17.3=py311h06a4308_0 - icu=73.1=h6a678d5_0 - idna=3.4=py311h06a4308_0 - intel-openmp=2023.1.0=hdb19cb5_46305 - jinja2=3.1.2=py311h06a4308_0 - jinja2-time=0.2.0=pyhd3eb1b0_3 - jpeg=9e=h5eee18b_1 - krb5=1.20.1=h143b758_1 - lame=3.100=h7b6447c_0 - lcms2=2.12=h3be6417_0 - ld_impl_linux-64=2.38=h1181459_1 - lerc=3.0=h295c915_0 - libboost=1.82.0=h109eef0_2 - libbrotlicommon=1.0.9=h5eee18b_7 - libbrotlidec=1.0.9=h5eee18b_7 - libbrotlienc=1.0.9=h5eee18b_7 - libcublas=11.11.3.6=0 - libcufft=10.9.0.58=0 - libcufile=1.8.0.34=0 - libcurand=10.3.4.52=0 - libcurl=8.4.0=h251f7ec_0 - libcusolver=11.4.1.48=0 - libcusparse=11.7.5.86=0 - libdeflate=1.17=h5eee18b_1 - libedit=3.1.20221030=h5eee18b_0 - libev=4.33=h7f8727e_1 - libevent=2.1.12=hdbd6064_1 - libffi=3.4.4=h6a678d5_0 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libiconv=1.16=h7f8727e_2 - libidn2=2.3.4=h5eee18b_0 - libjpeg-turbo=2.0.0=h9bf148f_0 - libnghttp2=1.57.0=h2d74bed_0 - libnpp=11.8.0.86=0 - libnvjpeg=11.9.0.86=0 - libpng=1.6.39=h5eee18b_0 - libprotobuf=3.20.3=he621ea3_0 - libssh2=1.10.0=hdbd6064_2 - libstdcxx-ng=11.2.0=h1234567_1 - libtasn1=4.19.0=h5eee18b_0 - libthrift=0.15.0=h1795dd8_2 - libtiff=4.5.1=h6a678d5_0 - libunistring=0.9.10=h27cfd23_0 - libuuid=1.41.5=h5eee18b_0 - libwebp=1.3.2=h11a3e52_0 - libwebp-base=1.3.2=h5eee18b_0 - llvm-openmp=14.0.6=h9e868ea_0 - lz4-c=1.9.4=h6a678d5_0 - markupsafe=2.1.1=py311h5eee18b_0 - mkl=2023.1.0=h213fc3f_46343 - mkl-service=2.4.0=py311h5eee18b_1 - mkl_fft=1.3.8=py311h5eee18b_0 - mkl_random=1.2.4=py311hdb19cb5_0 - mpc=1.1.0=h10f8cd9_1 - mpfr=4.0.2=hb69a4c5_1 - mpmath=1.3.0=py311h06a4308_0 - multidict=6.0.2=py311h5eee18b_0 - multiprocess=0.70.14=py311h06a4308_0 - ncurses=6.4=h6a678d5_0 - nettle=3.7.3=hbbd107a_1 - networkx=3.1=py311h06a4308_0 - numexpr=2.8.7=py311h65dcdc2_0 - numpy=1.26.0=py311h08b1b3b_0 - numpy-base=1.26.0=py311hf175353_0 - openh264=2.1.1=h4ff587b_0 - openjpeg=2.4.0=h3ad879b_0 - openssl=3.0.12=h7f8727e_0 - orc=1.7.4=hb3bc3d3_1 - packaging=23.1=py311h06a4308_0 - pillow=10.0.1=py311ha6cbd5a_0 - pip=23.3.1=py311h06a4308_0 - poyo=0.5.0=pyhd3eb1b0_0 - pyarrow=11.0.0=py311hd8e8d9b_1 - pycparser=2.21=pyhd3eb1b0_0 - pyopenssl=23.2.0=py311h06a4308_0 - pysocks=1.7.1=py311h06a4308_0 - python=3.11.5=h955ad1f_0 - python-dateutil=2.8.2=pyhd3eb1b0_0 - python-slugify=5.0.2=pyhd3eb1b0_0 - python-tzdata=2023.3=pyhd3eb1b0_0 - python-xxhash=2.0.2=py311h5eee18b_1 - pytorch=2.1.1=py3.11_cuda11.8_cudnn8.7.0_0 - pytorch-cuda=11.8=h7e8668a_5 - pytorch-mutex=1.0=cuda - pytz=2023.3.post1=py311h06a4308_0 - pyyaml=6.0=py311h5eee18b_1 - re2=2022.04.01=h295c915_0 - readline=8.2=h5eee18b_0 - requests=2.31.0=py311h06a4308_0 - responses=0.13.3=pyhd3eb1b0_0 - setuptools=68.0.0=py311h06a4308_0 - six=1.16.0=pyhd3eb1b0_1 - snappy=1.1.9=h295c915_0 - sqlite=3.41.2=h5eee18b_0 - sympy=1.11.1=py311h06a4308_0 - tbb=2021.8.0=hdb19cb5_0 - text-unidecode=1.3=pyhd3eb1b0_0 - tk=8.6.12=h1ccaba5_0 - torchaudio=2.1.1=py311_cu118 - torchtriton=2.1.0=py311 - torchvision=0.16.1=py311_cu118 - typing-extensions=4.7.1=py311h06a4308_0 - typing_extensions=4.7.1=py311h06a4308_0 - tzdata=2023c=h04d1e81_0 - unidecode=1.2.0=pyhd3eb1b0_0 - urllib3=1.26.16=py311h06a4308_0 - utf8proc=2.6.1=h27cfd23_0 - wheel=0.41.2=py311h06a4308_0 - xxhash=0.8.0=h7f8727e_3 - xz=5.4.2=h5eee18b_0 - yaml=0.2.5=h7b6447c_0 - yarl=1.8.1=py311h5eee18b_0 - zlib=1.2.13=h5eee18b_0 - zstd=1.5.5=hc292b87_0 - pip: - absl-py==2.0.0 - accelerate==0.25.0 - annotated-types==0.6.0 - antlr4-python3-runtime==4.9.3 - anyio==3.7.1 - appdirs==1.4.4 - argon2-cffi==23.1.0 - argon2-cffi-bindings==21.2.0 - asttokens==2.4.0 - async-lru==2.0.4 - babel==2.13.0 - backcall==0.2.0 - beautifulsoup4==4.12.2 - bitsandbytes==0.41.2.post2 - bleach==6.1.0 - comm==0.1.4 - contextlib2==21.6.0 - contourpy==1.1.1 - cycler==0.12.1 - debugpy==1.8.0 - decorator==5.1.1 - defusedxml==0.7.1 - distro==1.8.0 - docker-pycreds==0.4.0 - einops==0.7.0 - executing==2.0.0 - fastjsonschema==2.18.1 - flake8==6.1.0 - fonttools==4.43.1 - fqdn==1.5.1 - gitdb==4.0.11 - gitpython==3.1.40 - h11==0.14.0 - httpcore==1.0.1 - httpx==0.25.1 - ipykernel==6.25.2 - ipython==8.16.1 - ipywidgets==8.1.1 - isoduration==20.11.0 - jedi==0.19.1 - joblib==1.3.2 - json5==0.9.14 - jsonpointer==2.4 - jsonschema==4.19.1 - jsonschema-specifications==2023.7.1 - jupyter-client==8.4.0 - jupyter-core==5.4.0 - jupyter-events==0.8.0 - jupyter-lsp==2.2.0 - jupyter-server==2.8.0 - jupyter-server-terminals==0.4.4 - jupyterlab==4.0.9 - jupyterlab-pygments==0.2.2 - jupyterlab-server==2.25.0 - jupyterlab-widgets==3.0.9 - kiwisolver==1.4.5 - lightning-utilities==0.9.0 - matplotlib==3.8.2 - matplotlib-inline==0.1.6 - mccabe==0.7.0 - mistune==3.0.2 - ml-collections==0.1.1 - nbclient==0.8.0 - nbconvert==7.9.2 - nbformat==5.9.2 - nest-asyncio==1.5.8 - nltk==3.8.1 - notebook==7.0.6 - notebook-shim==0.2.3 - omegaconf==2.3.0 - openai==1.3.7 - overrides==7.4.0 - pandas==2.1.3 - pandocfilters==1.5.0 - parso==0.8.3 - pathtools==0.1.2 - pexpect==4.8.0 - pickleshare==0.7.5 - platformdirs==3.11.0 - prometheus-client==0.17.1 - prompt-toolkit==3.0.39 - protobuf==4.24.4 - psutil==5.9.6 - ptyprocess==0.7.0 - pure-eval==0.2.2 - pycodestyle==2.11.1 - pydantic==2.4.2 - pydantic-core==2.10.1 - pyflakes==3.1.0 - pygments==2.16.1 - pyparsing==3.1.1 - python-json-logger==2.0.7 - pyzmq==25.1.1 - referencing==0.30.2 - regex==2023.10.3 - rfc3339-validator==0.1.4 - rfc3986-validator==0.1.1 - rpds-py==0.10.6 - safetensors==0.4.1 - scikit-learn==1.3.2 - scipy==1.11.4 - seaborn==0.13.0 - send2trash==1.8.2 - sentencepiece==0.1.99 - sentry-sdk==1.32.0 - setproctitle==1.3.3 - smmap==5.0.1 - sniffio==1.3.0 - soupsieve==2.5 - stack-data==0.6.3 - tenacity==8.2.3 - terminado==0.17.1 - threadpoolctl==3.2.0 - tiktoken==0.5.2 - tinycss2==1.2.1 - tokenizers==0.15.0 - torchmetrics==1.2.1 - tornado==6.3.3 - tqdm==4.66.1 - traitlets==5.11.2 - transformers==4.35.2 - uri-template==1.3.0 - wandb==0.16.0 - wcwidth==0.2.8 - webcolors==1.13 - webencodings==0.5.1 - websocket-client==1.6.4 - widgetsnbextension==4.0.9 ================================================ FILE: notebooks/example_evaluation.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e9698d3d-f63c-4d8c-af6e-4e9996b3fe28", "metadata": {}, "outputs": [], "source": [ "# Fill in the wandb_id assigned to your demo run!\n", "\n", "wandb_id = 'YOUR_ID'\n", "if wandb_id == 'YOUR_ID':\n", " raise ValueError('Need to provide wandb_id of demo run!')" ] }, { "cell_type": "code", "execution_count": 2, "id": "0ff8faff-38fa-49cb-be73-624dc88fcd13", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "import json\n", "import wandb\n", "import pandas as pd\n", "from matplotlib import pyplot as plt" ] }, { "cell_type": "code", "execution_count": 3, "id": "32bfe743-5d8c-4ae6-804d-c5800a4209f1", "metadata": {}, "outputs": [], "source": [ "# Helper Functions\n", "\n", "def restore_file(wandb_id, filename='wandb-summary.json'):\n", " files_dir = 'notebooks/restored_files' \n", " os.system(f'mkdir -p {files_dir}')\n", "\n", " api = wandb.Api()\n", " run = api.run(f'semantic_uncertainty/{wandb_id}')\n", "\n", " path = f'{files_dir}/{filename}'\n", " os.system(f'rm -rf {path}')\n", " run.file(filename).download(root=files_dir, replace=True, exist_ok=False)\n", " with open(path, 'r') as f:\n", " out = json.load(f)\n", " return out\n", "\n", "def get_uncertainty_df(metrics):\n", " data = []\n", " for method in metrics['uncertainty']:\n", " for metric in metrics['uncertainty'][method]:\n", " mean = metrics['uncertainty'][method][metric]['mean']\n", " data.append([method, metric, mean])\n", " df = pd.DataFrame(data, columns=['method', 'metric', 'means'])\n", " main_methods = ['semantic_entropy', 'cluster_assignment_entropy', 'regular_entropy', 'p_false', 'p_ik']\n", " df = df.set_index('method').loc[main_methods].reset_index()\n", " main_names = ['Semantic entropy', 'Discrete Semantic Entropy', 'Naive Entropy', 'p(True)', 'Embedding Regression']\n", " conversion = dict(zip(main_methods, main_names))\n", " df['method'] = df.method.map(lambda x: conversion[x])\n", " return df" ] }, { "cell_type": "code", "execution_count": 4, "id": "56409203-8734-4b9e-8618-638a26be7ccb", "metadata": {}, "outputs": [], "source": [ "results = restore_file(wandb_id)\n", "unc_df = get_uncertainty_df(results)" ] }, { "cell_type": "code", "execution_count": 5, "id": "6acb4184-0c9a-4ec5-b860-e050bf7544a2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.6, 0.8)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAJhCAYAAAC6vU9RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABsVElEQVR4nO3deXhMZ/8/8PdksklkI7IKidorsQQR1N7YvlSpnRBLlVinWkmL2CqqT1Hl4UFsbYny0FoTxE5IxRolCBFLFpEmkaSyzfz+8DNP50xCtJk5k5z367rmusx97jnzmbmb5p1z7nMfmUqlUoGIiIiI1IzELoCIiIjI0DAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCYgekFavXg13d3eYm5vDx8cHMTExr+2/YsUKNGjQAFWqVIGbmxtmzJiBFy9evNU+X7x4gcDAQFSvXh1Vq1bFgAEDkJqaWu6fjYiIiComUQPSjh07oFAoEBISgkuXLqFp06bo3r070tLSSuy/bds2BAUFISQkBDdv3kRYWBh27NiBL7744q32OWPGDOzbtw87d+7EyZMn8eTJE/Tv31/nn5eIiIgqBpmYN6v18fFBq1atsGrVKgCAUqmEm5sbpkyZgqCgIK3+kydPxs2bNxEVFaVu+/TTT3HhwgWcOXOmTPvMyspCjRo1sG3bNnz00UcAgFu3bqFRo0aIjo5GmzZtdP2xiYiIyMAZi/XGBQUFiI2NRXBwsLrNyMgI3bp1Q3R0dImvadu2LX788UfExMSgdevWuHfvHg4ePIiRI0eWeZ+xsbEoLCxEt27d1H0aNmyIWrVqvTYg5efnIz8/X/1cqVQiIyMD1atXh0wm+/tfBBEREemNSqXC8+fP4eLiAiOj0k+kiRaQ0tPTUVxcDEdHR412R0dH3Lp1q8TXDBs2DOnp6Wjfvj1UKhWKiorwySefqE+xlWWfKSkpMDU1ha2trVaflJSUUusNDQ3F/Pnz3/ZjEhERkQF6+PAhatasWep20QLS33HixAksXrwY//73v+Hj44O7d+9i2rRpWLhwIebMmaPT9w4ODoZCoVA/z8rKQq1atXD//n1YWVnp9L2JiIiofDx//hweHh5v/N0tWkCyt7eHXC7XunosNTUVTk5OJb5mzpw5GDlyJMaNGwcA8PT0RG5uLj7++GN8+eWXZdqnk5MTCgoKkJmZqXEU6XXvCwBmZmYwMzPTaq9WrRqsra3L9JmJiIhIXCYmJgDwxukxol3FZmpqCm9vb40J10qlElFRUfD19S3xNXl5eVrnC+VyOYCX5xTLsk9vb2+YmJho9ImPj0dSUlKp70tERETSIuopNoVCgVGjRqFly5Zo3bo1VqxYgdzcXAQEBAAA/P394erqitDQUABAnz59sGzZMjRv3lx9im3OnDno06ePOii9aZ82NjYYO3YsFAqF+ujPlClT4OvryyvYiIiICIDIAWnw4MF4+vQp5s6di5SUFDRr1gwRERHqSdZJSUkaR4xmz54NmUyG2bNn4/Hjx6hRowb69OmDr776qsz7BIDly5fDyMgIAwYMQH5+Prp3745///vf+vvgREREZNBEXQepIsvOzoaNjQ2ysrI4B4mIiLS8utq6uLhY7FIkRS6Xw9jYuNQ5RmX9/V2hrmIjIiKqCAoKCpCcnIy8vDyxS5EkCwsLODs7w9TU9G/vgwGJiIioHCmVSty/fx9yuRwuLi4wNTXlgsJ6olKpUFBQgKdPn+L+/fuoV6/eaxeDfB0GJCIionJUUFCgvs2VhYWF2OVITpUqVWBiYoIHDx6goKAA5ubmf2s/ot6sloiIqLL6u0cu6J8rj++eo0dEREQkwIBEREREJMCARERERCTASdpERER64h50QK/vl7ikt17frzLhESQiIiIiAQYkIiIiAgB06tQJU6ZMwfTp02FnZwdHR0esX79efU9TKysr1K1bF4cOHVK/Ji4uDj179kTVqlXh6OiIkSNHIj09Xb09IiIC7du3h62tLapXr47/+7//Q0JCgnp7YmIiZDIZdu/ejc6dO8PCwgJNmzZFdHS0us+DBw/Qp08f2NnZwdLSEu+++y4OHjyo0++CAYmIiIjUtmzZAnt7e8TExGDKlCmYOHEiBg4ciLZt2+LSpUvw8/PDyJEjkZeXh8zMTHTp0gXNmzfHxYsXERERgdTUVAwaNEi9v9zcXCgUCly8eBFRUVEwMjLChx9+CKVSqfG+X375JWbOnIkrV66gfv36GDp0KIqKigAAgYGByM/Px6lTp3D9+nV8/fXXqFq1qk6/B96L7W/ivdiIiKgkL168wP379+Hh4aG1SKGhz0Hq1KkTiouLcfr0aQBAcXExbGxs0L9/f2zduhUAkJKSAmdnZ0RHR+Po0aM4ffo0IiMj1ft49OgR3NzcEB8fj/r162u9R3p6OmrUqIHr16+jSZMmSExMhIeHBzZs2ICxY8cCAH7//Xe8++67uHnzJho2bAgvLy8MGDAAISEhZfocrxuDsv7+5hEkIiIiUvPy8lL/Wy6Xo3r16vD09FS3OTo6AgDS0tJw9epVHD9+HFWrVlU/GjZsCADq02h37tzB0KFDUadOHVhbW8Pd3R0AkJSUVOr7Ojs7q98DAKZOnYpFixahXbt2CAkJwbVr18r5U2tjQCIiIiI1ExMTjecymUyj7dV95ZRKJXJyctCnTx9cuXJF43Hnzh106NABANCnTx9kZGRg/fr1uHDhAi5cuADg5S1ZSnvfv74HAIwbNw737t3DyJEjcf36dbRs2RLff/99OX9yTbzMn4iIiP6WFi1a4L///S/c3d1hbKwdKZ49e4b4+HisX78e7733HgDgzJkzf+u93Nzc8Mknn+CTTz5BcHAw1q9fjylTpvyj+l+HR5CIiIjobwkMDERGRgaGDh2K3377DQkJCYiMjERAQACKi4thZ2eH6tWrY926dbh79y6OHTsGhULx1u8zffp0REZG4v79+7h06RKOHz+ORo0a6eAT/Q+PIBEREelJZVu40cXFBWfPnsWsWbPg5+eH/Px81K5dGz169ICRkRFkMhnCw8MxdepUNGnSBA0aNMDKlSvRqVOnt3qf4uJiBAYG4tGjR7C2tkaPHj2wfPly3Xyo/49Xsf1NvIqNiIhK8rorqEg/eBUbERERkQ4wIBEREREJMCARERERCTAgEREREQkwIBEREekAr4EST3l89wxIRERE5ejVitB5eXkiVyJdr7574argb4PrIBEREZUjuVwOW1tb9X3ELCws1LfOIN1SqVTIy8tDWloabG1tIZfL//a+GJCIiIjKmZOTE4D/3WyV9MvW1lY9Bn8XAxIREVE5k8lkcHZ2hoODAwoLC8UuR1JMTEz+0ZGjVxiQiIiIdEQul5fLL2vSP07SJiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhIwiIC0evVquLu7w9zcHD4+PoiJiSm1b6dOnSCTybQevXv3VvcpabtMJsM333yj7uPu7q61fcmSJTr9nERERFQxiL5Q5I4dO6BQKLB27Vr4+PhgxYoV6N69O+Lj4+Hg4KDVf/fu3SgoKFA/f/bsGZo2bYqBAweq25KTkzVec+jQIYwdOxYDBgzQaF+wYAHGjx+vfm5lZVVeH4uIiIgqMNED0rJlyzB+/HgEBAQAANauXYsDBw5g48aNCAoK0upfrVo1jefh4eGwsLDQCEjC+6/8+uuv6Ny5M+rUqaPRbmVl9Y/v1UJERESVj6gBqaCgALGxsQgODla3GRkZoVu3boiOji7TPsLCwjBkyBBYWlqWuD01NRUHDhzAli1btLYtWbIECxcuRK1atTBs2DDMmDEDxsYlfyX5+fnIz89XP8/OzgYAFBYW8j47REREFURZf2eLGpDS09NRXFwMR0dHjXZHR0fcunXrja+PiYlBXFwcwsLCSu2zZcsWWFlZoX///hrtU6dORYsWLVCtWjWcO3cOwcHBSE5OxrJly0rcT2hoKObPn6/VfvjwYVhYWLyxViIiIhJfXl5emfqJfortnwgLC4Onpydat25dap+NGzdi+PDhMDc312hXKBTqf3t5ecHU1BQTJkxAaGgozMzMtPYTHBys8Zrs7Gy4ubnBz88P1tbW5fBpiIiISNdenQF6E1EDkr29PeRyOVJTUzXaU1NT3zg3KDc3F+Hh4ViwYEGpfU6fPo34+Hjs2LHjjbX4+PigqKgIiYmJaNCggdZ2MzOzEoOTiYkJTExM3rh/IiIiEl9Zf2eLepm/qakpvL29ERUVpW5TKpWIioqCr6/va1+7c+dO5OfnY8SIEaX2CQsLg7e3N5o2bfrGWq5cuQIjI6MSr5wjIiIiaRH9FJtCocCoUaPQsmVLtG7dGitWrEBubq76qjZ/f3+4uroiNDRU43VhYWHo168fqlevXuJ+s7OzsXPnTnz77bda26Kjo3HhwgV07twZVlZWiI6OxowZMzBixAjY2dmV/4ckIiKiCkX0gDR48GA8ffoUc+fORUpKCpo1a4aIiAj1xO2kpCQYGWke6IqPj8eZM2dw+PDhUvcbHh4OlUqFoUOHam0zMzNDeHg45s2bh/z8fHh4eGDGjBkac4yIiIhIumQqlUoldhEVUXZ2NmxsbJCVlcVJ2kRERBVEWX9/G8StRoiIiIgMCQMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRgOj3YqPXcw86IHYJ/1jikt5il0BERPRWeASJiIiISIABiYiIiEiAAYmIiIhIgAGJiIiISIABiYiIiEiAAYmIiIhIgJf5E5URl1wgIpIOHkEiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhIwiIC0evVquLu7w9zcHD4+PoiJiSm1b6dOnSCTybQevXv3VvcZPXq01vYePXpo7CcjIwPDhw+HtbU1bG1tMXbsWOTk5OjsMxIREVHFIXpA2rFjBxQKBUJCQnDp0iU0bdoU3bt3R1paWon9d+/ejeTkZPUjLi4OcrkcAwcO1OjXo0cPjX7bt2/X2D58+HDcuHEDR44cwf79+3Hq1Cl8/PHHOvucREREVHGIHpCWLVuG8ePHIyAgAI0bN8batWthYWGBjRs3lti/WrVqcHJyUj+OHDkCCwsLrYBkZmam0c/Ozk697ebNm4iIiMCGDRvg4+OD9u3b4/vvv0d4eDiePHmi089LREREhs9YzDcvKChAbGwsgoOD1W1GRkbo1q0boqOjy7SPsLAwDBkyBJaWlhrtJ06cgIODA+zs7NClSxcsWrQI1atXBwBER0fD1tYWLVu2VPfv1q0bjIyMcOHCBXz44Yda75Ofn4/8/Hz18+zsbABAYWEhCgsLy/6h35KZXKWzfeuLLr8ffeJYEBFVfGX9/6CoASk9PR3FxcVwdHTUaHd0dMStW7fe+PqYmBjExcUhLCxMo71Hjx7o378/PDw8kJCQgC+++AI9e/ZEdHQ05HI5UlJS4ODgoPEaY2NjVKtWDSkpKSW+V2hoKObPn6/VfvjwYVhYWLyx1r9raWud7VpvDh48KHYJ5YJjQURU8eXl5ZWpn6gB6Z8KCwuDp6cnWrfW/M01ZMgQ9b89PT3h5eWFd955BydOnEDXrl3/1nsFBwdDoVCon2dnZ8PNzQ1+fn6wtrb+ex+gDJrMi9TZvvUlbl53sUsoFxwLIqKK79UZoDcRNSDZ29tDLpcjNTVVoz01NRVOTk6vfW1ubi7Cw8OxYMGCN75PnTp1YG9vj7t376Jr165wcnLSmgReVFSEjIyMUt/XzMwMZmZmWu0mJiYwMTF5Yw1/V36xTGf71hddfj/6xLEgIqr4yvr/QVEnaZuamsLb2xtRUVHqNqVSiaioKPj6+r72tTt37kR+fj5GjBjxxvd59OgRnj17BmdnZwCAr68vMjMzERsbq+5z7NgxKJVK+Pj4/M1PQ0RERJWF6FexKRQKrF+/Hlu2bMHNmzcxceJE5ObmIiAgAADg7++vMYn7lbCwMPTr10898fqVnJwcfPbZZzh//jwSExMRFRWFDz74AHXr1kX37i9PLzRq1Ag9evTA+PHjERMTg7Nnz2Ly5MkYMmQIXFxcdP+hiYiIyKCJPgdp8ODBePr0KebOnYuUlBQ0a9YMERER6onbSUlJMDLSzHHx8fE4c+YMDh8+rLU/uVyOa9euYcuWLcjMzISLiwv8/PywcOFCjVNkP/30EyZPnoyuXbvCyMgIAwYMwMqVK3X7YYmIiKhCkKlUqop/7bIIsrOzYWNjg6ysLJ1O0nYPOqCzfetL4pLeb+5UAXAsiIgqvrL+/hb9FBsRERGRoWFAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISMBa7ACKit+UedEDsEspF4pLeYpdARKXgESQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgGDCEirV6+Gu7s7zM3N4ePjg5iYmFL7durUCTKZTOvRu/fLq0EKCwsxa9YseHp6wtLSEi4uLvD398eTJ0809uPu7q61jyVLluj0cxIREVHFIHpA2rFjBxQKBUJCQnDp0iU0bdoU3bt3R1paWon9d+/ejeTkZPUjLi4OcrkcAwcOBADk5eXh0qVLmDNnDi5duoTdu3cjPj4effv21drXggULNPY1ZcoUnX5WIiIiqhhEXwdp2bJlGD9+PAICAgAAa9euxYEDB7Bx40YEBQVp9a9WrZrG8/DwcFhYWKgDko2NDY4cOaLRZ9WqVWjdujWSkpJQq1YtdbuVlRWcnJzKVGd+fj7y8/PVz7OzswG8PGJVWFhYpn38HWZylc72rS+6/H70iWNhOCrDWACVZzyIKpKy/tzJVCqVaP+nKSgogIWFBXbt2oV+/fqp20eNGoXMzEz8+uuvb9yHp6cnfH19sW7dulL7HD16FH5+fsjMzIS1tTWAl6fYXrx4gcLCQtSqVQvDhg3DjBkzYGxccmacN28e5s+fr9W+bds2WFhYvLFOIiIiEl9eXh6GDRuGrKwsdSYoiahHkNLT01FcXAxHR0eNdkdHR9y6deuNr4+JiUFcXBzCwsJK7fPixQvMmjULQ4cO1fgipk6dihYtWqBatWo4d+4cgoODkZycjGXLlpW4n+DgYCgUCvXz7OxsuLm5wc/P77Vf8D/VZF6kzvatL3HzuotdQrngWBiOyjAWQOUZD6KK5NUZoDcR/RTbPxEWFgZPT0+0bt26xO2FhYUYNGgQVCoV1qxZo7Htr2HHy8sLpqammDBhAkJDQ2FmZqa1LzMzsxLbTUxMYGJi8g8/Senyi2U627e+6PL70SeOheGoDGMBVJ7xIKpIyvpzJ+okbXt7e8jlcqSmpmq0p6amvnFuUG5uLsLDwzF27NgSt78KRw8ePMCRI0feeJTHx8cHRUVFSExMfKvPQERERJWPqAHJ1NQU3t7eiIqKUrcplUpERUXB19f3ta/duXMn8vPzMWLECK1tr8LRnTt3cPToUVSvXv2NtVy5cgVGRkZwcHB4+w9CRERElYrop9gUCgVGjRqFli1bonXr1lixYgVyc3PVV7X5+/vD1dUVoaGhGq8LCwtDv379tMJPYWEhPvroI1y6dAn79+9HcXExUlJSALy8As7U1BTR0dG4cOECOnfuDCsrK0RHR2PGjBkYMWIE7Ozs9PPBiYiIyGCJHpAGDx6Mp0+fYu7cuUhJSUGzZs0QERGhnridlJQEIyPNA13x8fE4c+YMDh8+rLW/x48fY+/evQCAZs2aaWw7fvw4OnXqBDMzM4SHh2PevHnIz8+Hh4cHZsyYoTEviYiIiKRL9IAEAJMnT8bkyZNL3HbixAmttgYNGqC01Qnc3d1L3fZKixYtcP78+beuk4iIiKRB9JW0iYiIiAwNAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZEAAxIRERGRAAMSERERkQADEhEREZHAWwWk7OxsKJVKrfbi4mJkZ2eXW1FEREREYipzQNqzZw9atmyJFy9eaG178eIFWrVqhX379pVrcURERERiKHNAWrNmDT7//HNYWFhobbO0tMSsWbOwatWqci2OiIiISAxlDkhxcXHo1KlTqds7dOiA69evl0dNRERERKIqc0D6448/UFRUVOr2wsJC/PHHH+VSFBEREZGYyhyQ3N3dcfHixVK3X7x4EbVr1y6XooiIiIjEVOaA1L9/f3z55ZdITU3V2paSkoLZs2djwIAB5VocERERkRiMy9oxKCgIv/76K+rVq4cRI0agQYMGAIBbt27hp59+gpubG4KCgnRWKBEREZG+lDkgWVlZ4ezZswgODsaOHTvU841sbW0xYsQIfPXVV7CystJZoURERET6UuaABAA2Njb497//jdWrVyM9PR0qlQo1atSATCbTVX1EREREevdWAemV69ev4/bt2wCABg0awNPTs1yLIiIiIhLTWwWkmJgYjB07Fr///jtUKhUAQCaT4d1330VYWBhatWqlkyKJiIiI9KnMV7H9/vvv6Nq1K6pUqYIff/wRly5dwqVLl/DDDz/AzMwMXbt2xe+//67LWomIiIj0osxHkObNm4f3338f//3vfzXmHDVr1gxDhw5F//79MW/ePPz88886KZSIiIhIX8ockI4fP45Dhw6VOCFbJpPhiy++QK9evcq1OCIiIiIxlPkU2/Pnz+Ho6FjqdicnJzx//rxciiIiIiISU5kDUu3atRETE1Pq9gsXLvBWI0RERFQplDkgDRkyBAqFAnFxcVrbrl+/jpkzZ2Lw4MHlWhwRERGRGMo8Byk4OBhHjx5Fs2bN8P7776NRo0ZQqVS4efMmjh49itatW+OLL77QZa1EREREelHmI0jm5uY4fvw4vvrqKyQnJ2Pt2rX4z3/+g5SUFCxatAjHjx+Hubn53ypi9erVcHd3h7m5OXx8fF57Kq9Tp06QyWRaj969e6v7qFQqzJ07F87OzqhSpQq6deuGO3fuaOwnIyMDw4cPh7W1NWxtbTF27Fjk5OT8rfqJiIiocilzQAIAU1NTzJo1C1euXEFeXh7y8vJw5coVBAUFwczM7G8VsGPHDigUCoSEhODSpUto2rQpunfvjrS0tBL77969G8nJyepHXFwc5HI5Bg4cqO6zdOlSrFy5EmvXrsWFCxdgaWmJ7t2748WLF+o+w4cPx40bN3DkyBHs378fp06dwscff/y3PgMRERFVLm8VkF4nOTkZkydPfuvXLVu2DOPHj0dAQAAaN26MtWvXwsLCAhs3biyxf7Vq1eDk5KR+HDlyBBYWFuqApFKpsGLFCsyePRsffPABvLy8sHXrVjx58gS//PILAODmzZuIiIjAhg0b4OPjg/bt2+P7779HeHg4njx58re/AyIiIqoc3upWIzdu3MDx48dhamqKQYMGwdbWFunp6Vi0aBH+85//oE6dOm/15gUFBYiNjUVwcLC6zcjICN26dUN0dHSZ9hEWFoYhQ4bA0tISAHD//n2kpKSgW7du6j42Njbw8fFBdHQ0hgwZgujoaNja2qJly5bqPt26dYORkREuXLiADz/8UOt98vPzkZ+fr36enZ0NACgsLERhYeFbfe63YSZX6Wzf+qLL70efOBaGozKMBVB5xoOoIinrz12ZA9LevXvx0UcfoaioCMDL01jr16/HoEGD4O3tjT179qBHjx5vVWR6ejqKi4u11ldydHTErVu33vj6mJgYxMXFISwsTN2WkpKi3odwn6+2paSkwMHBQWO7sbExqlWrpu4jFBoaivnz52u1Hz58GBYWFm+s9e9a2lpnu9abgwcPil1CueBYGI7KMBZA5RkPoookLy+vTP3KHJAWLVqEwMBALFy4EBs2bIBCocDUqVNx8OBB0W5SGxYWBk9PT7Rurfv/WwYHB0OhUKifZ2dnw83NDX5+frC2ttbZ+zaZF6mzfetL3LzuYpdQLjgWhqMyjAVQecaDqCJ5dQboTcockOLj47Ft2zZUrVoVU6ZMwcyZM7F8+fJ/FI7s7e0hl8uRmpqq0Z6amgonJ6fXvjY3Nxfh4eFYsGCBRvur16WmpsLZ2Vljn82aNVP3EU4CLyoqQkZGRqnva2ZmVuJEdBMTE5iYmLy21n8iv1j71i4VjS6/H33iWBiOyjAWQOUZD6KKpKw/d291q5FXR0rkcjmqVKny1nOOhExNTeHt7Y2oqCh1m1KpRFRUFHx9fV/72p07dyI/Px8jRozQaPfw8ICTk5PGPrOzs3HhwgX1Pn19fZGZmYnY2Fh1n2PHjkGpVMLHx+cffSYiIiKq+N5qknZkZCRsbGwA/C/ICFfW7tu371sVoFAoMGrUKLRs2RKtW7fGihUrkJubi4CAAACAv78/XF1dERoaqvG6sLAw9OvXD9WrV9dol8lkmD59OhYtWoR69erBw8MDc+bMgYuLC/r16wcAaNSoEXr06IHx48dj7dq1KCwsxOTJkzFkyBC4uLi8Vf1ERERU+bxVQBo1apTG8wkTJmg8l8lkKC4ufqsCBg8ejKdPn2Lu3LlISUlBs2bNEBERoZ5knZSUBCMjzQNd8fHxOHPmDA4fPlziPj///HPk5ubi448/RmZmJtq3b4+IiAiNhSx/+uknTJ48GV27doWRkREGDBiAlStXvlXtREREVDnJVCpV5bheVs+ys7NhY2ODrKwsnU7Sdg86oLN960vikt5v7lQBcCwMR2UYC6DyjAdRRVLW39/ltlAkERERUWVR5lNspZ1+srGxQf369d84qZqIiIiooihzQFq+fHmJ7ZmZmcjKykLbtm2xd+9eVKtWrdyKIyIiIhJDmU+x3b9/v8THH3/8gbt370KpVGL27Nm6rJWIiIhIL8plDlKdOnWwZMmSUq8qIyIiIqpIym2Sdq1atUq9jxkRERFRRVJuAen69euoXbt2ee2OiIiISDRlnqRd2s3dsrKyEBsbi08//VRrIUkiIiKiiqjMAcnW1hYyWck3iJTJZBg3bhyCgoLKrTAiIiIisZQ5IB0/frzEdmtra9SrVw9Vq1ZFXFwcmjRpUm7FEREREYmhzAGpY8eOJbY/f/4c27ZtQ1hYGC5evPjW92IjIiIiMjR/e5L2qVOnMGrUKDg7O+Nf//oXOnfujPPnz5dnbURERESiKPMRJABISUnB5s2bERYWhuzsbAwaNAj5+fn45Zdf0LhxY13VSERERKRXZT6C1KdPHzRo0ADXrl3DihUr8OTJE3z//fe6rI2IiIhIFGU+gnTo0CFMnToVEydORL169XRZExEREZGoynwE6cyZM3j+/Dm8vb3h4+ODVatWIT09XZe1EREREYmizAGpTZs2WL9+PZKTkzFhwgSEh4fDxcUFSqUSR44cwfPnz3VZJxEREZHevPVVbJaWlhgzZgzOnDmD69ev49NPP8WSJUvg4OCAvn376qJGIiIiIr36R/dia9CgAZYuXYpHjx5h+/bt5VUTERERkajK5Wa1crkc/fr1w969e8tjd0RERESiKpeARERERFSZMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQmIHpBWr14Nd3d3mJubw8fHBzExMa/tn5mZicDAQDg7O8PMzAz169fHwYMH1dvd3d0hk8m0HoGBgeo+nTp10tr+ySef6OwzEhERUcViLOab79ixAwqFAmvXroWPjw9WrFiB7t27Iz4+Hg4ODlr9CwoK8P7778PBwQG7du2Cq6srHjx4AFtbW3Wf3377DcXFxerncXFxeP/99zFw4ECNfY0fPx4LFixQP7ewsCj/D0hEREQVkqgBadmyZRg/fjwCAgIAAGvXrsWBAwewceNGBAUFafXfuHEjMjIycO7cOZiYmAB4ecTor2rUqKHxfMmSJXjnnXfQsWNHjXYLCws4OTmV46chIiKiykK0gFRQUIDY2FgEBwer24yMjNCtWzdER0eX+Jq9e/fC19cXgYGB+PXXX1GjRg0MGzYMs2bNglwuL/E9fvzxRygUCshkMo1tP/30E3788Uc4OTmhT58+mDNnzmuPIuXn5yM/P1/9PDs7GwBQWFiIwsLCt/rsb8NMrtLZvvVFl9+PPnEsDEdlGAug8owHUUVS1p870QJSeno6iouL4ejoqNHu6OiIW7dulfiae/fu4dixYxg+fDgOHjyIu3fvYtKkSSgsLERISIhW/19++QWZmZkYPXq0RvuwYcNQu3ZtuLi44Nq1a5g1axbi4+Oxe/fuUusNDQ3F/PnztdoPHz6s09NzS1vrbNd689c5YhUZx8JwVIaxACrPeBBVJHl5eWXqJ1OpVKL8KfbkyRO4urri3Llz8PX1Vbd//vnnOHnyJC5cuKD1mvr16+PFixe4f/+++ojRsmXL8M033yA5OVmrf/fu3WFqaop9+/a9tpZjx46ha9euuHv3Lt55550S+5R0BMnNzQ3p6emwtrYu02f+O5rMi9TZvvUlbl53sUsoFxwLw1EZxgKoPONBVJFkZ2fD3t4eWVlZr/39LdoRJHt7e8jlcqSmpmq0p6amljo3yNnZGSYmJhqn0xo1aoSUlBQUFBTA1NRU3f7gwQMcPXr0tUeFXvHx8QGA1wYkMzMzmJmZabWbmJio50PpQn6x7M2dDJwuvx994lgYjsowFkDlGQ+iiqSsP3eiXeZvamoKb29vREVFqduUSiWioqI0jij9Vbt27XD37l0olUp12+3bt+Hs7KwRjgBg06ZNcHBwQO/evd9Yy5UrVwC8DGBEREREoq6DpFAosH79emzZsgU3b97ExIkTkZubq76qzd/fX2MS98SJE5GRkYFp06bh9u3bOHDgABYvXqyxxhHwMmht2rQJo0aNgrGx5kGyhIQELFy4ELGxsUhMTMTevXvh7++PDh06wMvLS/cfmoiIiAyeqJf5Dx48GE+fPsXcuXORkpKCZs2aISIiQj1xOykpCUZG/8twbm5uiIyMxIwZM+Dl5QVXV1dMmzYNs2bN0tjv0aNHkZSUhDFjxmi9p6mpKY4ePYoVK1YgNzcXbm5uGDBgAGbPnq3bD0tEREQVhqgBCQAmT56MyZMnl7jtxIkTWm2+vr44f/78a/fp5+eH0uaeu7m54eTJk29dJxEREUmH6LcaISIiIjI0DEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCoq+DREREFZd70AGxSygXiUvefFsqkhYeQSIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhJgQCIiIiISYEAiIiIiEmBAIiIiIhIQPSCtXr0a7u7uMDc3h4+PD2JiYl7bPzMzE4GBgXB2doaZmRnq16+PgwcPqrfPmzcPMplM49GwYUONfbx48QKBgYGoXr06qlatigEDBiA1NVUnn4+IiIgqHlED0o4dO6BQKBASEoJLly6hadOm6N69O9LS0krsX1BQgPfffx+JiYnYtWsX4uPjsX79eri6umr0e/fdd5GcnKx+nDlzRmP7jBkzsG/fPuzcuRMnT57EkydP0L9/f519TiIiIqpYjMV882XLlmH8+PEICAgAAKxduxYHDhzAxo0bERQUpNV/48aNyMjIwLlz52BiYgIAcHd31+pnbGwMJyenEt8zKysLYWFh2LZtG7p06QIA2LRpExo1aoTz58+jTZs25fTpiIiIqKISLSAVFBQgNjYWwcHB6jYjIyN069YN0dHRJb5m79698PX1RWBgIH799VfUqFEDw4YNw6xZsyCXy9X97ty5AxcXF5ibm8PX1xehoaGoVasWACA2NhaFhYXo1q2bun/Dhg1Rq1YtREdHlxqQ8vPzkZ+fr36enZ0NACgsLERhYeHf/yLewEyu0tm+9UWX348+cSwMR2UYC6ByjAfHgiqaso61aAEpPT0dxcXFcHR01Gh3dHTErVu3SnzNvXv3cOzYMQwfPhwHDx7E3bt3MWnSJBQWFiIkJAQA4OPjg82bN6NBgwZITk7G/Pnz8d577yEuLg5WVlZISUmBqakpbG1ttd43JSWl1HpDQ0Mxf/58rfbDhw/DwsLiLT992S1trbNd681f54hVZBwLw1EZxgKoHOPBsaCKJi8vr0z9RD3F9raUSiUcHBywbt06yOVyeHt74/Hjx/jmm2/UAalnz57q/l5eXvDx8UHt2rXx888/Y+zYsX/7vYODg6FQKNTPs7Oz4ebmBj8/P1hbW//9D/UGTeZF6mzf+hI3r7vYJZQLjoXhqAxjAVSO8eBYUEXz6gzQm4gWkOzt7SGXy7WuHktNTS11/pCzszNMTEw0Tqc1atQIKSkpKCgogKmpqdZrbG1tUb9+fdy9excA4OTkhIKCAmRmZmocRXrd+wKAmZkZzMzMtNpNTEzU86F0Ib9YprN964suvx994lgYjsowFkDlGA+OBVU0ZR1r0a5iMzU1hbe3N6KiotRtSqUSUVFR8PX1LfE17dq1w927d6FUKtVtt2/fhrOzc4nhCABycnKQkJAAZ2dnAIC3tzdMTEw03jc+Ph5JSUmlvi8RERFJi6iX+SsUCqxfvx5btmzBzZs3MXHiROTm5qqvavP399eYxD1x4kRkZGRg2rRpuH37Ng4cOIDFixcjMDBQ3WfmzJk4efIkEhMTce7cOXz44YeQy+UYOnQoAMDGxgZjx46FQqHA8ePHERsbi4CAAPj6+vIKNiIiIgIg8hykwYMH4+nTp5g7dy5SUlLQrFkzREREqCduJyUlwcjofxnOzc0NkZGRmDFjBry8vODq6opp06Zh1qxZ6j6PHj3C0KFD8ezZM9SoUQPt27fH+fPnUaNGDXWf5cuXw8jICAMGDEB+fj66d++Of//73/r74ERERGTQRJ+kPXnyZEyePLnEbSdOnNBq8/X1xfnz50vdX3h4+Bvf09zcHKtXr8bq1avLXCcRERFJh+i3GiEiIiIyNAxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAsZiF0BERET/nHvQAbFLKBeJS3qLXQIAHkEiIiIi0sKARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCARERERCTAgEREREQkIHpAWr16Ndzd3WFubg4fHx/ExMS8tn9mZiYCAwPh7OwMMzMz1K9fHwcPHlRvDw0NRatWrWBlZQUHBwf069cP8fHxGvvo1KkTZDKZxuOTTz7RyecjIiKiikfUgLRjxw4oFAqEhITg0qVLaNq0Kbp37460tLQS+xcUFOD9999HYmIidu3ahfj4eKxfvx6urq7qPidPnkRgYCDOnz+PI0eOoLCwEH5+fsjNzdXY1/jx45GcnKx+LF26VKeflYiIiCoOYzHffNmyZRg/fjwCAgIAAGvXrsWBAwewceNGBAUFafXfuHEjMjIycO7cOZiYmAAA3N3dNfpERERoPN+8eTMcHBwQGxuLDh06qNstLCzg5ORUzp+IiIiIKgPRAlJBQQFiY2MRHBysbjMyMkK3bt0QHR1d4mv27t0LX19fBAYG4tdff0WNGjUwbNgwzJo1C3K5vMTXZGVlAQCqVaum0f7TTz/hxx9/hJOTE/r06YM5c+bAwsKi1Hrz8/ORn5+vfp6dnQ0AKCwsRGFhYdk+9N9gJlfpbN/6osvvR584FoajMowFUDnGg2NhODgW5bt/mUqlEuUbffLkCVxdXXHu3Dn4+vqq2z///HOcPHkSFy5c0HpNw4YNkZiYiOHDh2PSpEm4e/cuJk2ahKlTpyIkJESrv1KpRN++fZGZmYkzZ86o29etW4fatWvDxcUF165dw6xZs9C6dWvs3r271HrnzZuH+fPna7Vv27bttcGKiIiIDEdeXh6GDRuGrKwsWFtbl9pP1FNsb0upVMLBwQHr1q2DXC6Ht7c3Hj9+jG+++abEgBQYGIi4uDiNcAQAH3/8sfrfnp6ecHZ2RteuXZGQkIB33nmnxPcODg6GQqFQP8/Ozoabmxv8/Pxe+wX/U03mReps3/oSN6+72CWUC46F4agMYwFUjvHgWBgOjkXZvDoD9CaiBSR7e3vI5XKkpqZqtKemppY6N8jZ2RkmJiYap9MaNWqElJQUFBQUwNTUVN0+efJk7N+/H6dOnULNmjVfW4uPjw8A4O7du6UGJDMzM5iZmWm1m5iYqOdD6UJ+sUxn+9YXXX4/+sSxMByVYSyAyjEeHAvDwbEo3/2LdhWbqakpvL29ERUVpW5TKpWIiorSOOX2V+3atcPdu3ehVCrVbbdv34azs7M6HKlUKkyePBl79uzBsWPH4OHh8cZarly5AuBlACMiIiIS9TJ/hUKB9evXY8uWLbh58yYmTpyI3Nxc9VVt/v7+GpO4J06ciIyMDEybNg23b9/GgQMHsHjxYgQGBqr7BAYG4scff8S2bdtgZWWFlJQUpKSk4M8//wQAJCQkYOHChYiNjUViYiL27t0Lf39/dOjQAV5eXvr9AoiIiMggiToHafDgwXj69Cnmzp2LlJQUNGvWDBEREXB0dAQAJCUlwcjofxnOzc0NkZGRmDFjBry8vODq6opp06Zh1qxZ6j5r1qwB8HIxyL/atGkTRo8eDVNTUxw9ehQrVqxAbm4u3NzcMGDAAMyePVv3H5iIiIgqBNEnaU+ePBmTJ08ucduJEye02nx9fXH+/PlS9/emi/Lc3Nxw8uTJt6qRiIiIpEX0W40QERERGRoGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkIiIiIgEGJCIiIiIBBiQiIiIiAdED0urVq+Hu7g5zc3P4+PggJibmtf0zMzMRGBgIZ2dnmJmZoX79+jh48OBb7fPFixcIDAxE9erVUbVqVQwYMACpqanl/tmIiIioYhI1IO3YsQMKhQIhISG4dOkSmjZtiu7duyMtLa3E/gUFBXj//feRmJiIXbt2IT4+HuvXr4erq+tb7XPGjBnYt28fdu7ciZMnT+LJkyfo37+/zj8vERERVQzGYr75smXLMH78eAQEBAAA1q5diwMHDmDjxo0ICgrS6r9x40ZkZGTg3LlzMDExAQC4u7u/1T6zsrIQFhaGbdu2oUuXLgCATZs2oVGjRjh//jzatGlTYq35+fnIz89XP8/KygIAZGRkoLCw8J99Ea9hXJSrs33ry7Nnz8QuoVxwLAxHZRgLoHKMB8fCcHAsyub58+cAAJVK9fqOKpHk5+er5HK5as+ePRrt/v7+qr59+5b4mp49e6qGDx+uGj9+vMrBwUH17rvvqr766itVUVFRmfcZFRWlAqD6448/NPrUqlVLtWzZslLrDQkJUQHggw8++OCDDz4qwePhw4evzSmiHUFKT09HcXExHB0dNdodHR1x69atEl9z7949HDt2DMOHD8fBgwdx9+5dTJo0CYWFhQgJCSnTPlNSUmBqagpbW1utPikpKaXWGxwcDIVCoX6uVCqRkZGB6tWrQyaTvc1HNxjZ2dlwc3PDw4cPYW1tLXY5ksfxMBwcC8PBsTAclWUsVCoVnj9/DhcXl9f2E/UU29tSKpVwcHDAunXrIJfL4e3tjcePH+Obb75BSEiITt/bzMwMZmZmGm3CkFVRWVtbV+j/2Csbjofh4FgYDo6F4agMY2FjY/PGPqIFJHt7e8jlcq2rx1JTU+Hk5FTia5ydnWFiYgK5XK5ua9SoEVJSUlBQUFCmfTo5OaGgoACZmZkaAed170tERETSItpVbKampvD29kZUVJS6TalUIioqCr6+viW+pl27drh79y6USqW67fbt23B2doapqWmZ9unt7Q0TExONPvHx8UhKSir1fYmIiEhaRL3MX6FQYP369diyZQtu3ryJiRMnIjc3V30Fmr+/P4KDg9X9J06ciIyMDEybNg23b9/GgQMHsHjxYgQGBpZ5nzY2Nhg7diwUCgWOHz+O2NhYBAQEwNfXt9Qr2CorMzMzhISEaJ06JHFwPAwHx8JwcCwMh+TG4rVTuPXg+++/V9WqVUtlamqqat26ter8+fPqbR07dlSNGjVKo/+5c+dUPj4+KjMzM1WdOnU0rmIryz5VKpXqzz//VE2aNEllZ2ensrCwUH344Yeq5ORknX1GIiIiqlhkKtWbFgIgIiIikhbRbzVCREREZGgYkIiIiIgEGJCIiIiIBBiQiIiIiAQYkCSkY8eO2Lp1K/7880+xSyFwPAxJSEgIHjx4IHYZRGRAGJAkpHnz5pg5cyacnJwwfvx4nD9/XuySJI3jYTh+/fVXvPPOO+jatSu2bduG/Px8sUsiEl1ubi7mzJmDtm3bom7duqhTp47Go7LjZf4SU1RUhL1792LLli04dOgQ6tatizFjxmDkyJFaN/kl3eN4GI7Lly9j06ZN2L59O4qKijBkyBCMGTMGrVq1Ers0ScvPz5fOwoQGZujQoTh58iRGjhwJZ2dnrRuzT5s2TaTK9IMBScLS0tKwbt06fPXVVyguLkavXr0wdepUdOnSRezSJInjYRgKCwuxb98+bNq0CZGRkWjYsCHGjh2L0aNHl+kGl/TPHDp0COHh4Th9+jQePnwIpVIJS0tLNG/eHH5+fggICHjjXdipfNja2uLAgQNo166d2KWIgqfYJComJgYhISH49ttv4eDggODgYNjb2+P//u//MHPmTLHLkxyOh+FQqVQoLCxEQUEBVCoV7OzssGrVKri5uWHHjh1il1dp7dmzB/Xr18eYMWNgbGyMWbNmYffu3YiMjMSGDRvQsWNHHD16FHXq1MEnn3yCp0+fil1ypWdnZ4dq1aqJXYZ4xFvEm/QtNTVV9a9//Uv17rvvqkxNTVUDBgxQHTp0SKVUKtV9Tp8+rbK0tBSxSungeBiWixcvqgIDA1XVqlVTOTs7q2bNmqW6c+eOevvKlStVDg4OIlZYubVp00a1f/9+VXFx8Wv7PXr0SDVr1izVsmXL9FSZdP3www+qjz76SJWbmyt2KaLgKTYJMTU1xTvvvIMxY8Zg9OjRqFGjhlaf7OxsfPDBBzh+/LgIFUoLx8NweHp64tatW/Dz88P48ePRp08fyOVyjT7p6elwcHCAUqkUqUoi/WrevDkSEhKgUqng7u4OExMTje2XLl0SqTL9MBa7ANKfqKgovPfee6/tY21tzV/GesLxMByDBg3CmDFj4OrqWmofe3t7hiM9KygowP379/HOO+/A2Ji/rvStX79+YpcgKh5BkqC0tDTEx8cDABo0aAAHBweRK5I2jodhefW/ROEVO6Q/eXl5mDJlCrZs2QIAuH37NurUqYMpU6bA1dUVQUFBIldIUsBJ2hLy/PlzjBw5Eq6urujYsSM6duwIV1dXjBgxAllZWWKXJzkcD8MSFhaGJk2awNzcHObm5mjSpAk2bNggdlmSFBwcjKtXr+LEiRMwNzdXt3fr1o0T5UUQGxuLH3/8ET/++CMuX74sdjl6w4AkIePGjcOFCxewf/9+ZGZmIjMzE/v378fFixcxYcIEscuTHI6H4Zg7dy6mTZuGPn36YOfOndi5cyf69OmDGTNmYO7cuWKXJzm//PILVq1ahfbt22scyXv33XeRkJAgYmXSkpaWhi5duqBVq1aYOnUqpk6dCm9vb3Tt2lUaVxGKOUOc9MvCwkJ1+vRprfZTp06pLCwsRKhI2jgehsPe3l61bds2rfZt27apqlevLkJF0lalShVVQkKCSqVSqapWrar+95UrV1TW1tZiliYpgwYNUrVs2VL1+++/q9tu3LihatmypWrIkCEiVqYfPIIkIdWrVy9xoTsbGxvY2dmJUJG0cTwMR2FhIVq2bKnV7u3tjaKiIhEqkraWLVviwIED6uevjiJt2LABvr6+YpUlOREREfj3v/+NRo0aqdsaN26M1atX49ChQyJWph8MSBIye/ZsKBQKpKSkqNtSUlLw2WefYc6cOSJWJk0cD8MxcuRIrFmzRqt93bp1GD58uAgVSdvixYvxxRdfYOLEiSgqKsJ3330HPz8/bNq0CV999ZXY5UmGUqnUurQfAExMTCRxRSevYpOQ5s2b4+7du8jPz0etWrUAAElJSTAzM0O9evU0+lb29S0MAcfDcEyZMgVbt26Fm5sb2rRpAwC4cOECkpKS4O/vr/FLYtmyZWKVKSkJCQlYsmQJrl69ipycHLRo0QKzZs2Cp6en2KVJxgcffIDMzExs375dfXuXx48fY/jw4bCzs8OePXtErlC3uLCEhEh9TQtDw/EwHHFxcWjRogUAqCcB29vbw97eHnFxcep+vPRff9555x2sX79e7DIkbdWqVejbty/c3d3h5uYGAHj48CGaNGmCH3/8UeTqdI9HkIiIyKAkJSW9dvurI66keyqVCkePHsWtW7cAAI0aNUK3bt1Erko/GJAkKDY2Fjdv3gTw8rLZ5s2bi1yRtHE8DMujR48AADVr1hS5EukyMjJ67dG64uJiPVZDUsVTbBKSlpaGIUOG4MSJE7C1tQUAZGZmonPnzggPDy/xXmCkOxwPw6FUKrFo0SJ8++23yMnJAQBYWVnh008/xZdffgkjI17Pok/CxQgLCwtx+fJlLFu2jJO0dWzlypX4+OOPYW5ujpUrV76279SpU/VUlTh4BElCBg8ejHv37mHr1q3qyzZ///13jBo1CnXr1sX27dtFrlBaOB6GIzg4GGFhYZg/fz7atWsHADhz5gzmzZuH8ePH85eygThw4AC++eYbnDhxQuxSKi0PDw9cvHgR1atXh4eHR6n9ZDIZ7t27p8fK9I8BSUJsbGxw9OhRtGrVSqM9JiYGfn5+yMzMFKcwieJ4GA4XFxesXbsWffv21Wj/9ddfMWnSJDx+/Fikyuiv7t69i6ZNmyI3N1fsUkgCeIpNQqS+poWh4XgYjoyMDDRs2FCrvWHDhsjIyBChImnLzs7WeK5SqZCcnIx58+ZpLYFB+lNcXIzr16+jdu3akljMlifWJaRLly6YNm0anjx5om57/PgxZsyYga5du4pYmTRxPAxH06ZNsWrVKq32VatWoWnTpiJUJG22traws7NTP6pVq4bGjRsjOjq6xAU9STemT5+OsLAwAC/DUYcOHdCiRQu4ublJ4jQnT7FJyMOHD9G3b1/cuHFDa02LvXv38qodPeN4GI6TJ0+id+/eqFWrlvpWFtHR0Xj48CEOHjyI9957T+QKpeXkyZMaz42MjFCjRg3UrVsXxsY88aEvNWvWxC+//IKWLVvil19+QWBgII4fP44ffvgBx44dw9mzZ8UuUacYkCRGymtaGCKOh+F48uQJVq9erTEWkyZNUq8gTPpRWFiICRMmYM6cOa+dJEy6Z25ujrt376JmzZr4+OOPYWFhgRUrVuD+/fto2rSp1qnQyoYBSSIKCwtRpUoVXLlyBU2aNBG7HMnjeBiOwsJC9OjRA2vXruX8FgNhY2ODK1euMCCJrHbt2li/fj26du0KDw8PrFmzBr1798aNGzfQvn17/PHHH2KXqFOcgyQRJiYmqFWrFhdYMxAcD8NhYmKCa9euiV0G/UW/fv3wyy+/iF2G5AUEBGDQoEFo0qQJZDKZ+uj2hQsXSryoobLhESQJCQsLw+7du/HDDz+gWrVqYpcjeRwPwzFjxgyYmZlhyZIlYpdCgHrRzq5du8Lb2xuWlpYa2yv7AoWGZNeuXXj48CEGDhyonhe5ZcsW2Nra4oMPPhC5Ot1iQJKQV3ePLywsRO3atbX+p8M7xusXx8NwTJkyBVu3bkW9evVK/IW8bNkykSqTljp16uC3335Dy5YtS+0jhQUKDVlmZqZ65f/KjpcDSMgHH3zAu5EbEI6H4YiLi0OLFi0AALdv3xa5GulKTExEcXEx7t+/L3YpBODrr7+Gu7s7Bg8eDAAYNGgQ/vvf/8LZ2RkHDx6El5eXyBXqFo8gERGRQTAyMkJKSgocHBzELoXw8rYjP/30E9q2bYsjR45g0KBB2LFjB37++WckJSXh8OHDYpeoUzyCJCGvDl9Xr15doz0zMxMtWrTgYWs943gYjjFjxuC7776DlZWVRntubi6mTJmCjRs3ilSZ9ERGRsLGxua1fYS3hCHdSElJUa/Rtn//fgwaNAh+fn5wd3eHj4+PyNXpHo8gSUhpf52lpqbCzc0NBQUFIlUmTRwPwyGXy5GcnKw1Funp6XByckJRUZFIlUmLkdGbL6yWyWS8+lNPXFxcsGvXLrRt2xYNGjTAokWLMHDgQMTHx6NVq1aVfh0kHkGSgL1796r/LfzrrLi4GFFRUVxvRI84HoYjOzsbKpUKKpUKz58/h7m5uXpbcXExDh48yNM9esZTbIajf//+GDZsGOrVq4dnz56hZ8+eAIDLly+jbt26IlenewxIEtCvXz8AL//yGjVqlMY2ExMTuLu749tvvxWhMmnieBgOW1tbyGQyyGQy1K9fX2u7TCbD/PnzRahMmnjRgmFZvnw53N3d8fDhQyxduhRVq1YFACQnJ2PSpEkiV6d7PMUmIR4eHvjtt99gb28vdikEjochOHnyJFQqFbp06YL//ve/GutRmZqaonbt2rzViB5xkjYZEgYkIpK8Bw8ewM3NrUxzYEh3AgICsHLlSq3J8iSeH374Af/5z39w7949REdHo3bt2lixYgU8PDwq/UKRPMUmMVFRUYiKikJaWhqUSqXGNl6po38cD8NQu3ZtZGZmIiYmpsSx8Pf3F6ky6cjNzcWmTZveqr9wQU8qX2vWrMHcuXMxffp0fPXVV+rJ8ba2tlixYkWlD0g8giQh8+fPx4IFC9CyZUs4Oztrne/fs2ePSJVJE8fDcOzbtw/Dhw9HTk4OrK2tNcZCJpMhIyNDxOqkwdnZGdOmTcOoUaPg7OxcYh+VSoWjR49i2bJl6NChA4KDg/VcpbQ0btwYixcvRr9+/WBlZYWrV6+iTp06iIuLQ6dOnZCeni52iTrFgCQhzs7OWLp0KUaOHCl2KQSOhyGpX78+evXqhcWLF8PCwkLsciQpPj4eX3zxBfbv349mzZqhZcuWcHFxgbm5Of744w/8/vvviI6OhrGxMYKDgzFhwgTI5XKxy67UqlSpglu3bqF27doaAenOnTvw8vLCn3/+KXaJOsVTbBJSUFCAtm3bil0G/X8cD8Px+PFjTJ06leFIRA0aNMB///tfJCUlYefOnTh9+jTOnTuHP//8E/b29mjevDnWr1+Pnj17MhjpiYeHB65cuYLatWtrtEdERKBRo0YiVaU/DEgSMm7cOGzbtg1z5swRuxQCx8OQdO/eHRcvXkSdOnXELkXyatWqhU8//RSffvopgJen1QAuASAGhUKBwMBAvHjxAiqVCjExMdi+fTtCQ0OxYcMGscvTOQYkCXnx4gXWrVuHo0ePwsvLCyYmJhrbecdy/eJ4GI7evXvjs88+w++//w5PT0+tseCtLfQvLCwMy5cvx507dwAA9erVw/Tp0zFu3DiRK5OOcePGoUqVKpg9ezby8vIwbNgwuLi44LvvvsOQIUPELk/nOAdJQjp37lzqNplMhmPHjumxGuJ4GI7XXd7PW1vo39y5c7Fs2TJMmTIFvr6+AIDo6GisWrUKM2bMwIIFC0SusPIrKirCtm3b0L17dzg6OiIvLw85OTmSWqOKAYmIiAxKjRo1sHLlSgwdOlSjffv27ZgyZUqlv3rKUFhYWODmzZtac5CkgquiSdDdu3cRGRmpvgKBGdnwpKWliV0CkWgKCwvRsmVLrXZvb2/eOFiPWrdujcuXL4tdhmgYkCTk2bNn6Nq1q/qS5uTkZADA2LFj1RMiSfcsLCzw9OlT9fPevXurxwIAUlNTS10HhspXr169kJWVpX6+ZMkSZGZmqp8/e/YMjRs3FqEyaRs5ciTWrFmj1b5u3ToMHz5chIqkadKkSfj000+xatUqREdH49q1axqPyo6n2CTE398faWlp2LBhAxo1aqRe0yIyMhIKhQI3btwQu0RJEN5v6q/riwD/C0jC1Zyp/MnlciQnJ6vHwtraGleuXNEYCxcXF85B0rMpU6Zg69atcHNzQ5s2bQAAFy5cQFJSEvz9/TUm0fNiBt0paW6eTCaDSqWSxNw8XsUmIYcPH0ZkZCRq1qyp0V6vXj08ePBApKqoJLykWT+Efx/y70XDEBcXhxYtWgAAEhISAAD29vawt7dHXFycuh9/TnTr/v37YpcgKgYkCcnNzS1xIbyMjAyYmZmJUBERkbbjx4+LXQIBkp2c/QoDkoS899572Lp1KxYuXAjg5V9fSqUSS5cufe0l51S+ZDKZ1r2++JewOEr67jkWRC/t3bu3xHaZTAZzc3PUrVsXHh4eeq5KfzgHSULi4uLQtWtXtGjRAseOHUPfvn1x48YNZGRk4OzZs3jnnXfELlESjIyMYGNjo/5FnJmZCWtra/X5fpVKhezs7Ep/ft8QGBkZoWfPnuojqPv27UOXLl3Ud4nPz89HREQEx4IkycjISD3n6K/+Og+pffv2+OWXX2BnZydSlbrDgCQxWVlZWLVqFa5evYqcnBy0aNECgYGBvGpKj7Zs2VKmfqNGjdJxJRQQEFCmfps2bdJxJUSGJyoqCl9++SW++uortG7dGgAQExODOXPmYPbs2bCxscGECRPg4+ODsLAwkastfwxIREREpKVJkyZYt26d1k21z549i48//hg3btzA0aNHMWbMGCQlJYlUpe5wHSQiIiLSkpCQAGtra612a2tr3Lt3D8DLq6Ar68rmDEhERESkxdvbG5999pnGwrZPnz7F559/jlatWgEA7ty5Azc3N7FK1ClexUZERERawsLC8MEHH6BmzZrqEPTw4UPUqVMHv/76KwAgJycHs2fPFrNMneEcJCIiIiqRUqnE4cOHcfv2bQBAgwYN8P7775e4ynZlw4AkIffv30dRURHq1aun0X7nzh2YmJjA3d1dnMKIiMigvXjxAmZmZpJaJ6zyR0BSGz16NM6dO6fVfuHCBYwePVr/BUncgAED8PXXX2u1L126FAMHDhShImn74Ycf0K5dO7i4uKhvvbNixQr1qQQiqVEqlVi4cCFcXV1RtWpV9a1H5syZUykv6xdiQJKQy5cvo127dlrtbdq0wZUrV/RfkMSdOnUKvXr10mrv2bMnTp06JUJF0rVmzRooFAr06tULmZmZ6oUhbW1tsWLFCnGLIxLJokWLsHnzZixduhSmpqbq9iZNmmDDhg0iVqYfDEgSIpPJ8Pz5c632rKwsrhQsgpycHI3/6bxiYmKC7OxsESqSru+//x7r16/Hl19+Cblcrm5v2bIlrl+/LmJlROLZunUr1q1bh+HDh2v8XDRt2hS3bt0SsTL9YECSkA4dOiA0NFQjDBUXFyM0NBTt27cXsTJp8vT0xI4dO7Taw8PD0bhxYxEqkq779++jefPmWu1mZmbIzc0VoSIi8T1+/Bh169bValcqlSgsLBShIv3iZf4S8vXXX6NDhw5o0KAB3nvvPQDA6dOnkZ2djWPHjolcnfTMmTMH/fv3R0JCArp06QLg5dL+27dvx86dO0WuTlo8PDxw5coVrbuXR0REoFGjRiJVRSSuxo0b4/Tp01o/F7t27SrxD4rKhgFJQho3boxr166p78VWpUoV+Pv7Y/LkyahWrZrY5UlOnz598Msvv2Dx4sXYtWsXqlSpAi8vLxw9ehQdO3YUuzxJUSgUCAwMxIsXL6BSqRATE4Pt27cjNDRUEnMtiEoyd+5cjBo1Co8fP4ZSqcTu3bsRHx+PrVu3Yv/+/WKXp3O8zJ+ICMBPP/2EefPmISEhAQDg4uKC+fPnY+zYsSJXRiSe06dPY8GCBRo3OJ87dy78/PzELk3nGJAquWvXrqFJkyYwMjLCtWvXXtvXy8tLT1URGa68vDzk5OTAwcFB7FKIDNbFixfRsmVLscvQKQakSs7IyAgpKSlwcHCAkZERZDIZShpymUzGK9n0oFq1arh9+zbs7e1hZ2f32kXXMjIy9FiZtC1atAjDhw+Hh4eH2KUQGYycnBzI5XJUqVJF3XblyhXMmTMHBw8erPS/MzgHqZK7f/8+atSoof43iWv58uWwsrJS/1tKq9Iasp07dyIkJAQ+Pj4YMWIEBg0aBHt7e7HLIhLFw4cPMWjQIMTExEAul2Py5MlYtGgRPvnkE+zYsQMffvhhiYsOVzY8giQhp06dQtu2bWFsrJmLi4qKcO7cOXTo0EGkyojEd+PGDfz0008IDw/Ho0eP8P7772P48OHo168fLCwsxC6PSG+GDBmC+Ph4jB07Frt378bJkyfRokUL+Pj4ICgoCDVr1hS7RL1gQJIQuVyO5ORkrbkVz549g4ODQ6U/XGpoOB6G6+zZs9i2bRt27tyJFy9ecOFOkhQXFxfs3r0bbdq0QVpaGpycnLBs2TJMnz5d7NL0igtFSohKpSrxlM6zZ89gaWkpQkXSVtrfJvn5+SWusE36Y2lpiSpVqsDU1FQSC+IR/VVqaqp6Pp6DgwMsLCzQs2dPkavSP85BkoD+/fsDeDkRe/To0TAzM1NvKy4uxrVr19C2bVuxypOclStXAng5Hhs2bEDVqlXV24qLi3Hq1Ck0bNhQrPIk6/79+9i2bRu2bduG+Ph4dOzYEfPnz8dHH30kdmlEemdkZKTxbyn+0caAJAE2NjYAXh6xsLKy0rgiwdTUFG3atMH48ePFKk9yli9fDuDleKxdu1bjHkempqZwd3fH2rVrxSpPktq0aYPffvsNXl5eCAgIwNChQ+Hq6ip2WUSiUKlUqF+/vvqMQ05ODpo3b64RmoDKf6UtA5IEbNq0CQDg7u6OmTNn8nSayF5dTdi5c2fs3r0bdnZ2IldEXbt2xcaNG3kPPCL873eG1HGSNhEREZEAjyBJSGpqKmbOnImoqCikpaVpTRLmVVP6VVxcjM2bN6vHQ6lUamznDYR1S6FQYOHChbC0tIRCoXht32XLlumpKiIyFAxIEjJ69GgkJSVhzpw5cHZ25iKFIps2bRo2b96M3r17o0mTJhwPPbt8+bL6CrXLly+X2o/jQiRNPMUmIVZWVjh9+jSaNWsmdikEwN7eHlu3bkWvXr3ELoWIiAS4DpKEuLm5lbr2Dumfqakp6tatK3YZRERUAh5BkpDDhw/j22+/xX/+8x+4u7uLXY7kffvtt7h37x5WrVrF0zgG4OLFi/j555+RlJSEgoICjW27d+8WqSoiEgsDkoTY2dkhLy8PRUVFsLCwgImJicb2yr6mhaH58MMPcfz4cVSrVg3vvvuu1njwl7L+hIeHw9/fH927d8fhw4fh5+eH27dvIzU1FR9++CEveyZJKu3iBZlMBnNzc9StWxcffPABqlWrpufK9IMBSUK2bNny2u2jRo3SUyUEAAEBAa/dzl/K+uPl5YUJEyYgMDAQVlZWuHr1Kjw8PDBhwgQ4Oztj/vz5YpdIpHedO3fGpUuXUFxcjAYNGgAAbt++DblcjoYNGyI+Ph4ymQxnzpyplGuIMSARkeRZWlrixo0bcHd3R/Xq1XHixAl4enri5s2b6NKlC5KTk8UukUjvVqxYgdOnT2PTpk2wtrYGAGRlZWHcuHFo3749xo8fj2HDhuHPP/9EZGSkyNWWP07SlqhXdyj/64NIquzs7PD8+XMAgKurK+Li4gAAmZmZyMvLE7M0ItF88803WLhwoTocAS9vXTVv3jwsXboUFhYWmDt3LmJjY0WsUne4DpKE5ObmYtasWfj555/x7Nkzre1cKFL/du3aVerE4EuXLolUlfR06NABR44cgaenJwYOHIhp06bh2LFjOHLkCLp27Sp2eUSiyMrKQlpamtbps6dPn6r/qLa1tdX6f1dlwSNIEvL555/j2LFjWLNmDczMzLBhwwbMnz8fLi4u2Lp1q9jlSc7KlSsREBAAR0dHXL58Ga1bt0b16tVx79499OzZU+zyJGXVqlUYMmQIAODLL7+EQqFAamoqBgwYgLCwMJGrIxLHBx98gDFjxmDPnj149OgRHj16hD179mDs2LHo168fACAmJgb169cXt1Ad4RwkCalVqxa2bt2KTp06wdraGpcuXULdunXxww8/YPv27Th48KDYJUpKw4YNERISgqFDh6onBtepUwdz585FRkYGVq1aJXaJRCRhOTk5mDFjBrZu3YqioiIAgLGxMUaNGoXly5fD0tISV65cAYBKuQAxA5KEVK1aFb///jtq1aqFmjVrYvfu3WjdujXu378PT09P5OTkiF2ipFhYWODmzZuoXbs2HBwccOTIETRt2hR37txBmzZtSjwNSkSkbzk5Obh37x4AoE6dOqhatarIFekHT7FJSJ06dXD//n0AL49e/PzzzwCAffv2wdbWVsTKpMnJyUm99lStWrVw/vx5AMD9+/e54rmeGBkZQS6Xv/ZhbMypmiRtVatWhZeXF7y8vCQTjgBO0paUgIAAXL16FR07dkRQUBD69OmDVatWobCwkHcrF0GXLl2wd+9eNG/eHAEBAZgxYwZ27dqFixcvon///mKXJwl79uwpdVt0dDRWrlwJpVKpx4qIDEdubi6WLFmCqKgopKWlaf0svDqqVFnxFJuEPXjwALGxsahbty68vLzELkdylEollEql+ghFeHg4zp07h3r16mHChAkwNTUVuUJpio+PR1BQEPbt24fhw4djwYIFqF27tthlEend0KFDcfLkSYwcORLOzs5at0SaNm2aSJXpBwMSERGAJ0+eICQkBFu2bEH37t0RGhqKJk2aiF0WkWhsbW1x4MABtGvXTuxSRMFTbBLz22+/4fjx4yUeLuVpNv178eIFrl27VuJ49O3bV6SqpCUrKwuLFy/G999/j2bNmiEqKgrvvfee2GURic7Ozq7S3metLBiQJGTx4sWYPXs2GjRoAEdHR43DpbybvP5FRETA398f6enpWttkMhkX7tSDpUuX4uuvv4aTkxO2b9+ODz74QOySiAzGwoULMXfuXGzZsgUWFhZil6N3PMUmIY6Ojvj6668xevRosUshAPXq1YOfnx/mzp0LR0dHscuRJCMjI1SpUgXdunWDXC4vtd/u3bv1WBWRYWjevDkSEhKgUqng7u4OExMTje2VfbV/HkGSECMjI8meSzZEqampUCgUDEci8vf359FTolK8Wi1bqngESUKWLl2KJ0+eYMWKFWKXQgDGjBmDdu3aYezYsWKXQkREAgxIEqJUKtG7d2/cvn0bjRs31jpcytMI+pWXl4eBAweiRo0a8PT01BqPqVOnilQZERHxFJuETJ06FcePH0fnzp1RvXp1nloQ2fbt23H48GGYm5vjxIkTWpPmGZCISN+qVauG27dvw97eHnZ2dq/9PfHqTgCVFY8gSYiVlRXCw8PRu3dvsUshvLzVyNSpUxEUFAQjI971h4jEt2XLFgwZMgRmZmbYsmXLa/uOGjVKT1WJgwFJQmrXro3IyEg0bNhQ7FIIL/9S++233/DOO++IXQoREQkwIEnIpk2bEBERgU2bNklyTQtDM2PGDNSoUQNffPGF2KUQEQEAsrOzy9zX2tpah5WIjwFJQqS+poWhmTp1KrZu3YqmTZvCy8tLazy4sjkR6ZuRkVGZ56dW9sVsOUlbQqS+poWhuX79Opo3bw4AiIuL09jGCfREJIbjx4+r/52YmIigoCCMHj0avr6+AIDo6Ghs2bIFoaGhYpWoNzyCRERERFq6du2KcePGYejQoRrt27Ztw7p163DixAlxCtMTXjojMZmZmdiwYQOCg4PVl2heunQJjx8/Frky6bp79y4iIyPx559/AgD4NwsRGYLo6Gi0bNlSq71ly5aIiYkRoSL9YkCSkGvXrqF+/fr4+uuv8a9//QuZmZkAXi4QGRwcLG5xEvTs2TN07doV9evXR69evZCcnAwAGDt2LD799FORqyMiqXNzc8P69eu12jds2AA3NzcRKtIvBiQJUSgUGD16NO7cuQNzc3N1e69evXDq1CkRK5OmGTNmwMTEBElJSRpXFQ4ePBgREREiVkZEBCxfvhzff/89PD09MW7cOIwbNw5eXl74/vvvsXz5crHL0zkGJAn57bffMGHCBK12V1dXpKSkiFCRtB0+fBhff/01atasqdFer149PHjwQKSqiIhe6tWrF27fvo0+ffogIyMDGRkZ6NOnD27fvo1evXqJXZ7O8So2CTEzMytxjYvbt2+jRo0aIlQkbbm5uSWuR5WRkQEzMzMRKiIi0uTm5obFixeLXYYoGJAkpG/fvliwYAF+/vlnAC8vJU9KSsKsWbMwYMAAkauTnvfeew9bt27FwoULAbwcD6VSiaVLl6Jz584iV0dEUnTt2rUy9/Xy8tJhJeLjZf4SkpWVhY8++ggXL17E8+fP4eLigpSUFPj6+uLgwYOwtLQUu0RJiYuLQ9euXdGiRQscO3YMffv2xY0bN5CRkYGzZ8/yFiREpHevFopUqVQa67G9igp/bavsC0UyIEnQ2bNncfXqVeTk5KBFixbo1q2b2CVJVlZWFlatWqUxHoGBgXB2dha7NCKSoL/Of7x8+TJmzpyJzz77TGOhyG+//RZLly6t9IsPMyARERGRltatW2PevHlaE7IPHjyIOXPmIDY2VqTK9INXsUlAdHQ09u/fr9G2detWeHh4wMHBAR9//DHy8/NFqk560tPTta5Su3HjBgICAjBo0CBs27ZNpMqIiP7n+vXr8PDw0Gr38PDA77//LkJF+sWAJAELFizAjRs31M+vX7+OsWPHolu3bggKCsK+ffskcV8dQzFlyhSsXLlS/TwtLQ3vvfcefvvtN+Tn52P06NH44YcfRKyQiAho1KgRQkNDUVBQoG4rKChAaGgoGjVqJGJl+sGr2CTgypUr6iulACA8PBw+Pj7qFVLd3NwQEhKCefPmiVShtJw/fx6bN29WP9+6dSuqVauGK1euwNjYGP/617+wevVqjBw5UrwiiUjy1q5diz59+qBmzZrqK9auXbsGmUyGffv2iVyd7jEgScAff/wBR0dH9fOTJ0+iZ8+e6uetWrXCw4cPxShNklJSUuDu7q5+fuzYMfTv3x/Gxi9/HPv27csjekQkutatW+PevXv46aefcOvWLQAvV/ofNmyYJK56ZkCSAEdHR9y/fx9ubm4oKCjApUuXMH/+fPX258+fw8TERMQKpcXa2hqZmZmoXbs2ACAmJgZjx45Vb5fJZJwTRkQGwdLSEh9//LHYZYiCc5AkoFevXggKCsLp06cRHBwMCwsLvPfee+rt165d45o7etSmTRusXLkSSqUSu3btwvPnz9GlSxf19tu3b0viRpBEZPh++OEHtG/fHi4uLuqLS5YvX45ff/1V5Mp0jwFJAhYuXAhjY2N07NgR69evx/r162FqaqrevnHjRvj5+YlYobQsXLgQe/fuRZUqVTB48GB8/vnnsLOzU28PDw9Hx44dRayQiAhYs2YNFAoFevbsiT/++EO9MKSdnR1WrFghbnF6wHWQJCQrKwtVq1aFXC7XaM/IyEDVqlU1QhPpVnp6Os6ePQsnJyf4+PhobDtw4AAaN25c4uW1RET60rhxYyxevBj9+vWDlZUVrl69ijp16iAuLg6dOnVCenq62CXqFAMSERERaalSpQpu3bqF2rVrawSkO3fuwMvLC3/++afYJeoUT7ERERGRFg8PD1y5ckWrPSIigusgERERkTQpFAoEBgbixYsXUKlUiImJwfbt2xEaGooNGzaIXZ7O8RQbERERleinn37CvHnzkJCQAABwcXHB/PnzNZYmqawYkIiIiOi18vLykJOTAwcHB7FL0RvOQSISUUJCAmbPno2hQ4ciLS0NAHDo0CGNe+cREYkpLS0NsbGxiI+Px9OnT8UuR28YkIhEcvLkSXh6euLChQvYvXs3cnJyAABXr15FSEiIyNURkdQ9f/4cI0eOhIuLCzp27IiOHTvCxcUFI0aMQFZWltjl6RwDEpFIgoKCsGjRIhw5ckRjDaouXbrg/PnzIlZGRASMGzcOFy5cwIEDB5CZmYnMzEzs378fFy9exIQJE8QuT+c4B4lIJFWrVsX169fh4eGhscZIYmIiGjZsiBcvXohdIhFJmKWlJSIjI9G+fXuN9tOnT6NHjx7Izc0VqTL94BEkIpHY2toiOTlZq/3y5ctwdXUVoSIiov+pXr06bGxstNptbGw0bo9UWTEgEYlkyJAhmDVrFlJSUiCTyaBUKnH27FnMnDkT/v7+YpdHRBI3e/ZsKBQKpKSkqNtSUlLw2WefYc6cOSJWph88xUYkkoKCAgQGBmLz5s0oLi6GsbExiouLMWzYMGzevFnrnnlERLrWvHlzyGQy9fM7d+4gPz8ftWrVAgAkJSXBzMwM9erVw6VLl8QqUy8YkIhE9vDhQ1y/fh05OTlo3rw56tWrJ3ZJRCRR8+fPL3Pfyn61LQMSkUgWLFiAmTNnwsLCQqP9zz//xDfffIO5c+eKVBkRETEgEYlELpcjOTlZa2XaZ8+ewcHBAcXFxSJVRkSkKScnB0qlUqPN2tpapGr0g5O0iUSiUqk0zvW/cvXqVVSrVk2EioiI/uf+/fvo3bs3LC0t1Veu2dnZwdbWVhJXsRmLXQCR1NjZ2UEmk0Emk6F+/foaIam4uBg5OTn45JNPRKyQiAgYMWIEVCoVNm7cCEdHxxL/oKvMeIqNSM+2bNkClUqFMWPGYMWKFRrrjJiamsLd3R2+vr4iVkhE9HIx29jYWDRo0EDsUkTBI0hEejZq1CgAgIeHB9q1awdjY/4YEpHhadWqFR4+fCjZgMQjSEQiSkhIwKZNm5CQkIDvvvsODg4OOHToEGrVqoV3331X7PKISMISEhLwySefYMSIEWjSpAlMTEw0tnt5eYlUmX4wIBGJ5OTJk+jZsyfatWuHU6dO4ebNm6hTpw6WLFmCixcvYteuXWKXSEQSdv78eQwbNgyJiYnqNplMpr7ApLJfacuARCQSX19fDBw4EAqFQuNmtTExMejfvz8ePXokdolEJGGNGzdGo0aN8Pnnn5c4Sbt27doiVaYfnPxAJJLr169j27ZtWu0ODg5IT08XoSIiov958OAB9u7di7p164pdiii4DhKRSGxtbZGcnKzVfvnyZbi6uopQERHR/3Tp0gVXr14VuwzR8AgSkUiGDBmCWbNmYefOnZDJZFAqlTh79ixmzpwJf39/scsjIonr06cPZsyYgevXr8PT01Nrknbfvn1Fqkw/OAeJSCQFBQUIDAzE5s2bUVxcDGNjYxQXF2PYsGHYvHkz5HK52CUSkYQZGZV+komTtIlIJ1QqFR4+fIgaNWogPT0d169fR05ODpo3b4569eqJXR4RkeQxIBGJQKlUwtzcHDdu3GAgIiIyQJykTSQCIyMj1KtXD8+ePRO7FCIiDb169UJWVpb6+ZIlS5CZmal+/uzZMzRu3FiEyvSLAYlIJEuWLMFnn32GuLg4sUshIlKLjIxEfn6++vnixYuRkZGhfl5UVIT4+HgxStMrXsVGJBJ/f3/k5eWhadOmMDU1RZUqVTS2//V/SERE+iKceSPVmTgMSEQiWbFihdglEBFRKRiQiEQyatQosUsgItIik8m0bisifC4FDEhEIjl48CDkcjm6d++u0X748GEUFxejZ8+eIlVGRFKmUqkwevRomJmZAQBevHiBTz75BJaWlgCgMT+pMuMkbSKRBAUFlbjQmlKpRFBQkAgVERG9PLrt4OAAGxsb2NjYYMSIEXBxcVE/d3BwkMRq/1wHiUgkVapUwc2bN+Hu7q7RnpiYiHfffRe5ubniFEZERDyCRCQWGxsb3Lt3T6v97t276kPZREQkDgYkIpF88MEHmD59OhISEtRtd+/exaefflrpbwJJRGToeIqNSCRZWVno0aMHLl68iJo1awIAHj16hPfeew+7d++Gra2tuAUSEUkYAxKRiFQqFY4cOYKrV6+iSpUq8PLyQocOHcQui4hI8hiQiAxIZmYmjxwRERkAzkEiEsnXX3+NHTt2qJ8PGjQI1atXh6urK65evSpiZURExIBEJJK1a9fCzc0NAHDkyBEcOXIEhw4dQs+ePfHZZ5+JXB0RkbRxJW0ikaSkpKgD0v79+zFo0CD4+fnB3d0dPj4+IldHRCRtPIJEJBI7Ozs8fPgQABAREYFu3boBeDlxu6QVtomISH94BIlIJP3798ewYcNQr149PHv2TH3vtcuXL6Nu3boiV0dEJG0MSEQiWb58Odzd3fHw4UMsXboUVatWBQAkJydj0qRJIldHRCRtvMyfiIiISIBHkIj0aO/evejZsydMTEywd+/e1/bl7UaIiMTDI0hEemRkZISUlBQ4ODjAyKj0ayRkMhknahMRiYgBiYiIiEiAl/kTERERCXAOEpEIlEolNm/ejN27dyMxMREymQweHh746KOPMHLkSMhkMrFLJCKSNJ5iI9IzlUqFPn364ODBg2jatCkaNmwIlUqFmzdv4vr16+jbty9++eUXscskIpI0HkEi0rPNmzfj1KlTiIqKQufOnTW2HTt2DP369cPWrVvh7+8vUoVERMQjSER65ufnhy5duiAoKKjE7YsXL8bJkycRGRmp58qIiOgVTtIm0rNr166hR48epW7v2bMnrl69qseKiIhIiAGJSM8yMjLg6OhY6nZHR0f88ccfeqyIiIiEGJCI9Ky4uBjGxqVP/5PL5SgqKtJjRUREJMRJ2kR6plKpMHr0aJiZmZW4PT8/X88VERGREAMSkZ6NGjXqjX14BRsRkbh4FRsRERGRAOcgEREREQkwIBEREREJMCARERERCTAgEREREQkwIBEREREJMCAREZXgxIkTkMlkyMzMLPd9y2Qy/PLLL+W+XyIqPwxIRCR5nTp1wvTp08Uug4gMCAMSERERkQADEhFVKJ06dcKUKVMwffp02NnZwdHREevXr0dubi4CAgJgZWWFunXr4tChQ+rXxMXFoWfPnqhatSocHR0xcuRIpKenAwBGjx6NkydP4rvvvoNMJoNMJkNiYqL6tbGxsWjZsiUsLCzQtm1bxMfHa9SzZs0avPPOOzA1NUWDBg3www8/aGy/c+cOOnToAHNzczRu3BhHjhzR3ZdDROWGAYmIKpwtW7bA3t4eMTExmDJlCiZOnIiBAweibdu2uHTpEvz8/DBy5Ejk5eUhMzMTXbp0QfPmzXHx4kVEREQgNTUVgwYNAgB899138PX1xfjx45GcnIzk5GS4ubmp3+vLL7/Et99+i4sXL8LY2BhjxoxRb9uzZw+mTZuGTz/9FHFxcZgwYQICAgJw/PhxAIBSqUT//v1hamqKCxcuYO3atZg1a5Z+vywi+ntUREQVSMeOHVXt27dXPy8qKlJZWlqqRo4cqW5LTk5WAVBFR0erFi5cqPLz89PYx8OHD1UAVPHx8ep9Tps2TaPP8ePHVQBUR48eVbcdOHBABUD1559/qlQqlapt27aq8ePHa7xu4MCBql69eqlUKpUqMjJSZWxsrHr8+LF6+6FDh1QAVHv27Pn7XwIR6RyPIBFRhePl5aX+t1wuR/Xq1eHp6aluc3R0BACkpaXh6tWrOH78OKpWrap+NGzYEACQkJDwVu/l7Oys3i8A3Lx5E+3atdPo365dO9y8eVO93c3NDS4uLurtvr6+b/VZiUgcxmIXQET0tkxMTDSey2QyjTaZTAbg5SmunJwc9OnTB19//bXWfl4FnrK+11/3S0SVG48gEVGl1qJFC9y4cQPu7u6oW7euxsPS0hIAYGpqiuLi4rfed6NGjXD27FmNtrNnz6Jx48bq7Q8fPkRycrJ6+/nz5//BpyEifWFAIqJKLTAwEBkZGRg6dCh+++03JCQkIDIyEgEBAepQ5O7ujgsXLiAxMRHp6ellPkL02WefYfPmzVizZg3u3LmDZcuWYffu3Zg5cyYAoFu3bqhfvz5GjRqFq1ev4vTp0/jyyy919lmJqPwwIBFRpebi4oKzZ8+iuLgYfn5+8PT0xPTp02Frawsjo5f/C5w5cybkcjkaN26MGjVqICkpqUz77tevH7777jv861//wrvvvov//Oc/2LRpEzp16gQAMDIywp49e/Dnn3+idevWGDduHL766itdfVQiKkcylUqlErsIIiIiIkPCI0hEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAIMSEREREQCDEhEREREAgxIRERERAL/D4fJzXnMM118AAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "metric = 'AUROC'\n", "unc_df.set_index('metric').loc[metric].plot.bar(x='method', y='means')\n", "plt.gca().set_ylabel(metric)\n", "plt.gca().grid(axis='y')\n", "plt.gca().set_ylim(0.6, 0.8)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: semantic_uncertainty/analyze_results.py ================================================ """Compute overall performance metrics from predicted uncertainties.""" import argparse import functools import logging import os import pickle import numpy as np import wandb from uncertainty.utils import utils from uncertainty.utils.eval_utils import ( bootstrap, compatible_bootstrap, auroc, accuracy_at_quantile, area_under_thresholded_accuracy) utils.setup_logger() result_dict = {} UNC_MEAS = 'uncertainty_measures.pkl' def init_wandb(wandb_runid, assign_new_wandb_id, experiment_lot, entity): """Initialize wandb session.""" user = os.environ['USER'] slurm_jobid = os.getenv('SLURM_JOB_ID') scratch_dir = os.getenv('SCRATCH_DIR', '.') kwargs = dict( entity=entity, project='semantic_uncertainty', dir=f'{scratch_dir}/{user}/uncertainty', notes=f'slurm_id: {slurm_jobid}, experiment_lot: {experiment_lot}', ) if not assign_new_wandb_id: # Restore wandb session. wandb.init( id=wandb_runid, resume=True, **kwargs) wandb.restore(UNC_MEAS) else: api = wandb.Api() wandb.init(**kwargs) old_run = api.run(f'{entity}/semantic_uncertainty/{wandb_runid}') old_run.file(UNC_MEAS).download( replace=True, exist_ok=False, root=wandb.run.dir) def analyze_run( wandb_runid, assign_new_wandb_id=False, answer_fractions_mode='default', experiment_lot=None, entity=None): """Analyze the uncertainty measures for a given wandb run id.""" logging.info('Analyzing wandb_runid `%s`.', wandb_runid) # Set up evaluation metrics. if answer_fractions_mode == 'default': answer_fractions = [0.8, 0.9, 0.95, 1.0] elif answer_fractions_mode == 'finegrained': answer_fractions = [round(i, 3) for i in np.linspace(0, 1, 20+1)] else: raise ValueError rng = np.random.default_rng(41) eval_metrics = dict(zip( ['AUROC', 'area_under_thresholded_accuracy', 'mean_uncertainty'], list(zip( [auroc, area_under_thresholded_accuracy, np.mean], [compatible_bootstrap, compatible_bootstrap, bootstrap] )), )) for answer_fraction in answer_fractions: key = f'accuracy_at_{answer_fraction}_answer_fraction' eval_metrics[key] = [ functools.partial(accuracy_at_quantile, quantile=answer_fraction), compatible_bootstrap] if wandb.run is None: init_wandb( wandb_runid, assign_new_wandb_id=assign_new_wandb_id, experiment_lot=experiment_lot, entity=entity) elif wandb.run.id != wandb_runid: raise ValueError # Load the results dictionary from a pickle file. with open(f'{wandb.run.dir}/{UNC_MEAS}', 'rb') as file: results_old = pickle.load(file) result_dict = {'performance': {}, 'uncertainty': {}} # First: Compute simple accuracy metrics for model predictions. all_accuracies = dict() all_accuracies['accuracy'] = 1 - np.array(results_old['validation_is_false']) for name, target in all_accuracies.items(): result_dict['performance'][name] = {} result_dict['performance'][name]['mean'] = np.mean(target) result_dict['performance'][name]['bootstrap'] = bootstrap(np.mean, rng)(target) rum = results_old['uncertainty_measures'] if 'p_false' in rum and 'p_false_fixed' not in rum: # Restore log probs true: y = 1 - x --> x = 1 - y. # Convert to probs --> np.exp(1 - y). # Convert to p_false --> 1 - np.exp(1 - y). rum['p_false_fixed'] = [1 - np.exp(1 - x) for x in rum['p_false']] # Next: Uncertainty Measures. # Iterate through the dictionary and compute additional metrics for each measure. for measure_name, measure_values in rum.items(): logging.info('Computing for uncertainty measure `%s`.', measure_name) # Validation accuracy. validation_is_falses = [ results_old['validation_is_false'], results_old['validation_unanswerable'] ] logging_names = ['', '_UNANSWERABLE'] # Iterate over predictions of 'falseness' or 'answerability'. for validation_is_false, logging_name in zip(validation_is_falses, logging_names): name = measure_name + logging_name result_dict['uncertainty'][name] = {} validation_is_false = np.array(validation_is_false) validation_accuracy = 1 - validation_is_false if len(measure_values) > len(validation_is_false): # This can happen, but only for p_false. if 'p_false' not in measure_name: raise ValueError logging.warning( 'More measure values for %s than in validation_is_false. Len(measure values): %d, Len(validation_is_false): %d', measure_name, len(measure_values), len(validation_is_false)) measure_values = measure_values[:len(validation_is_false)] fargs = { 'AUROC': [validation_is_false, measure_values], 'area_under_thresholded_accuracy': [validation_accuracy, measure_values], 'mean_uncertainty': [measure_values]} for answer_fraction in answer_fractions: fargs[f'accuracy_at_{answer_fraction}_answer_fraction'] = [validation_accuracy, measure_values] for fname, (function, bs_function) in eval_metrics.items(): metric_i = function(*fargs[fname]) result_dict['uncertainty'][name][fname] = {} result_dict['uncertainty'][name][fname]['mean'] = metric_i logging.info("%s for measure name `%s`: %f", fname, name, metric_i) result_dict['uncertainty'][name][fname]['bootstrap'] = bs_function( function, rng)(*fargs[fname]) wandb.log(result_dict) logging.info( 'Analysis for wandb_runid `%s` finished. Full results dict: %s', wandb_runid, result_dict ) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--wandb_runids', nargs='+', type=str, help='Wandb run ids of the datasets to evaluate on.') parser.add_argument('--assign_new_wandb_id', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--answer_fractions_mode', type=str, default='default') parser.add_argument( "--experiment_lot", type=str, default='Unnamed Experiment', help="Keep default wandb clean.") parser.add_argument( "--entity", type=str, help="Wandb entity.") args, unknown = parser.parse_known_args() if unknown: raise ValueError(f'Unkown args: {unknown}') wandb_runids = args.wandb_runids for wid in wandb_runids: logging.info('Evaluating wandb_runid `%s`.', wid) analyze_run( wid, args.assign_new_wandb_id, args.answer_fractions_mode, experiment_lot=args.experiment_lot, entity=args.entity) ================================================ FILE: semantic_uncertainty/compute_uncertainty_measures.py ================================================ """Compute uncertainty measures after generating answers.""" from collections import defaultdict import logging import os import pickle import numpy as np import wandb from analyze_results import analyze_run from uncertainty.data.data_utils import load_ds from uncertainty.uncertainty_measures.p_ik import get_p_ik from uncertainty.uncertainty_measures.semantic_entropy import get_semantic_ids from uncertainty.uncertainty_measures.semantic_entropy import logsumexp_by_id from uncertainty.uncertainty_measures.semantic_entropy import predictive_entropy from uncertainty.uncertainty_measures.semantic_entropy import predictive_entropy_rao from uncertainty.uncertainty_measures.semantic_entropy import cluster_assignment_entropy from uncertainty.uncertainty_measures.semantic_entropy import context_entails_response from uncertainty.uncertainty_measures.semantic_entropy import EntailmentDeberta from uncertainty.uncertainty_measures.semantic_entropy import EntailmentGPT4 from uncertainty.uncertainty_measures.semantic_entropy import EntailmentGPT35 from uncertainty.uncertainty_measures.semantic_entropy import EntailmentGPT4Turbo from uncertainty.uncertainty_measures.semantic_entropy import EntailmentLlama from uncertainty.uncertainty_measures import p_true as p_true_utils from uncertainty.utils import utils utils.setup_logger() EXP_DETAILS = 'experiment_details.pkl' def main(args): if args.train_wandb_runid is None: args.train_wandb_runid = args.eval_wandb_runid user = os.environ['USER'] scratch_dir = os.getenv('SCRATCH_DIR', '.') wandb_dir = f'{scratch_dir}/{user}/uncertainty' slurm_jobid = os.getenv('SLURM_JOB_ID', None) project = "semantic_uncertainty" if not args.debug else "semantic_uncertainty_debug" if args.assign_new_wandb_id: logging.info('Assign new wandb_id.') api = wandb.Api() old_run = api.run(f'{args.restore_entity_eval}/{project}/{args.eval_wandb_runid}') wandb.init( entity=args.entity, project=project, dir=wandb_dir, notes=f'slurm_id: {slurm_jobid}, experiment_lot: {args.experiment_lot}', # For convenience, keep any 'generate_answers' configs from old run, # but overwrite the rest! # NOTE: This means any special configs affecting this script must be # called again when calling this script! config={**old_run.config, **args.__dict__}, ) def restore(filename): old_run.file(filename).download( replace=True, exist_ok=False, root=wandb.run.dir) class Restored: name = f'{wandb.run.dir}/{filename}' return Restored else: logging.info('Reuse active wandb id.') def restore(filename): class Restored: name = f'{wandb.run.dir}/{filename}' return Restored if args.train_wandb_runid != args.eval_wandb_runid: logging.info( "Distribution shift for p_ik. Training on embeddings from run %s but evaluating on run %s", args.train_wandb_runid, args.eval_wandb_runid) is_ood_eval = True # pylint: disable=invalid-name api = wandb.Api() old_run_train = api.run(f'{args.restore_entity_train}/semantic_uncertainty/{args.train_wandb_runid}') filename = 'train_generations.pkl' old_run_train.file(filename).download( replace=True, exist_ok=False, root=wandb.run.dir) with open(f'{wandb.run.dir}/{filename}', "rb") as infile: train_generations = pickle.load(infile) wandb.config.update( {"ood_training_set": old_run_train.config['dataset']}, allow_val_change=True) else: is_ood_eval = False # pylint: disable=invalid-name if args.compute_p_ik or args.compute_p_ik_answerable: train_generations_pickle = restore('train_generations.pkl') with open(train_generations_pickle.name, 'rb') as infile: train_generations = pickle.load(infile) wandb.config.update({"is_ood_eval": is_ood_eval}, allow_val_change=True) # Load entailment model. if args.compute_predictive_entropy: logging.info('Beginning loading for entailment model.') if args.entailment_model == 'deberta': entailment_model = EntailmentDeberta() elif args.entailment_model == 'gpt-4': entailment_model = EntailmentGPT4(args.entailment_cache_id, args.entailment_cache_only) elif args.entailment_model == 'gpt-3.5': entailment_model = EntailmentGPT35(args.entailment_cache_id, args.entailment_cache_only) elif args.entailment_model == 'gpt-4-turbo': entailment_model = EntailmentGPT4Turbo(args.entailment_cache_id, args.entailment_cache_only) elif 'llama' in args.entailment_model.lower(): entailment_model = EntailmentLlama(args.entailment_cache_id, args.entailment_cache_only, args.entailment_model) else: raise ValueError logging.info('Entailment model loading complete.') if args.compute_p_true_in_compute_stage: # This is usually not called. old_exp = restore(EXP_DETAILS) with open(old_exp.name, "rb") as infile: old_exp = pickle.load(infile) if args.reuse_entailment_model: pt_model = entailment_model.model else: pt_model = utils.init_model(old_exp['args']) pt_train_dataset, pt_validation_dataset = load_ds( old_exp['args'].dataset, add_options=old_exp['args'].use_mc_options, seed=args.random_seed) del pt_validation_dataset # Reduce num generations used in p_true if needed! if not args.use_all_generations: if args.use_num_generations == -1: raise ValueError num_gen = args.use_num_generations else: num_gen = args.num_generations p_true_few_shot_prompt, p_true_responses, len_p_true = p_true_utils.construct_few_shot_prompt( model=pt_model, dataset=pt_train_dataset, indices=old_exp['p_true_indices'], prompt=old_exp['prompt'], brief=old_exp['BRIEF'], brief_always=old_exp['args'].brief_always and old_exp['args'].enable_brief, make_prompt=utils.get_make_prompt(old_exp['args']), num_generations=num_gen, metric=utils.get_metric(old_exp['args'].metric)) del p_true_responses wandb.config.update( {'p_true_num_fewshot': len_p_true}, allow_val_change=True) wandb.log(dict(len_p_true=len_p_true)) logging.info('Generated few-shot prompt for p_true.') logging.info(80*'#') logging.info('p_true_few_shot_prompt: %s', p_true_few_shot_prompt) logging.info(80*'#') if args.recompute_accuracy: # This is usually not enabled. logging.warning('Recompute accuracy enabled. This does not apply to precomputed p_true!') metric = utils.get_metric(args.metric) # Restore outputs from `generate_answrs.py` run. result_dict_pickle = restore('uncertainty_measures.pkl') with open(result_dict_pickle.name, "rb") as infile: result_dict = pickle.load(infile) result_dict['semantic_ids'] = [] validation_generations_pickle = restore('validation_generations.pkl') with open(validation_generations_pickle.name, 'rb') as infile: validation_generations = pickle.load(infile) entropies = defaultdict(list) validation_embeddings, validation_is_true, validation_answerable = [], [], [] p_trues = [] count = 0 # pylint: disable=invalid-name def is_answerable(generation): return len(generation['reference']['answers']['text']) > 0 # Loop over datapoints and compute validation embeddings and entropies. for idx, tid in enumerate(validation_generations): example = validation_generations[tid] question = example['question'] context = example['context'] full_responses = example["responses"] most_likely_answer = example['most_likely_answer'] if not args.use_all_generations: if args.use_num_generations == -1: raise ValueError responses = [fr[0] for fr in full_responses[:args.use_num_generations]] else: responses = [fr[0] for fr in full_responses] if args.recompute_accuracy: logging.info('Recomputing accuracy!') if is_answerable(example): acc = metric(most_likely_answer['response'], example, None) else: acc = 0.0 # pylint: disable=invalid-name validation_is_true.append(acc) logging.info('Recomputed accuracy!') else: validation_is_true.append(most_likely_answer['accuracy']) validation_answerable.append(is_answerable(example)) validation_embeddings.append(most_likely_answer['embedding']) logging.info('validation_is_true: %f', validation_is_true[-1]) if args.compute_predictive_entropy: # Token log likelihoods. Shape = (n_sample, n_tokens) if not args.use_all_generations: log_liks = [r[1] for r in full_responses[:args.use_num_generations]] else: log_liks = [r[1] for r in full_responses] for i in log_liks: assert i if args.compute_context_entails_response: # Compute context entails answer baseline. entropies['context_entails_response'].append(context_entails_response( context, responses, entailment_model)) if args.condition_on_question and args.entailment_model == 'deberta': responses = [f'{question} {r}' for r in responses] # Compute semantic ids. semantic_ids = get_semantic_ids( responses, model=entailment_model, strict_entailment=args.strict_entailment, example=example) result_dict['semantic_ids'].append(semantic_ids) # Compute entropy from frequencies of cluster assignments. entropies['cluster_assignment_entropy'].append(cluster_assignment_entropy(semantic_ids)) # Length normalization of generation probabilities. log_liks_agg = [np.mean(log_lik) for log_lik in log_liks] # Compute naive entropy. entropies['regular_entropy'].append(predictive_entropy(log_liks_agg)) # Compute semantic entropy. log_likelihood_per_semantic_id = logsumexp_by_id(semantic_ids, log_liks_agg, agg='sum_normalized') pe = predictive_entropy_rao(log_likelihood_per_semantic_id) entropies['semantic_entropy'].append(pe) # pylint: disable=invalid-name log_str = 'semantic_ids: %s, avg_token_log_likelihoods: %s, entropies: %s' entropies_fmt = ', '.join([f'{i}:{j[-1]:.2f}' for i, j in entropies.items()]) # pylint: enable=invalid-name logging.info(80*'#') logging.info('NEW ITEM %d at id=`%s`.', idx, tid) logging.info('Context:') logging.info(example['context']) logging.info('Question:') logging.info(question) logging.info('True Answers:') logging.info(example['reference']) logging.info('Low Temperature Generation:') logging.info(most_likely_answer['response']) logging.info('Low Temperature Generation Accuracy:') logging.info(most_likely_answer['accuracy']) logging.info('High Temp Generation:') logging.info([r[0] for r in full_responses]) logging.info('High Temp Generation:') logging.info(log_str, semantic_ids, log_liks_agg, entropies_fmt) if args.compute_p_true_in_compute_stage: p_true = p_true_utils.calculate_p_true( pt_model, question, most_likely_answer['response'], responses, p_true_few_shot_prompt, hint=old_exp['args'].p_true_hint) p_trues.append(p_true) logging.info('p_true: %s', np.exp(p_true)) count += 1 if count >= args.num_eval_samples: logging.info('Breaking out of main loop.') break logging.info('Accuracy on original task: %f', np.mean(validation_is_true)) validation_is_false = [1.0 - is_t for is_t in validation_is_true] result_dict['validation_is_false'] = validation_is_false validation_unanswerable = [1.0 - is_a for is_a in validation_answerable] result_dict['validation_unanswerable'] = validation_unanswerable logging.info('Unanswerable prop on validation: %f', np.mean(validation_unanswerable)) if 'uncertainty_measures' not in result_dict: result_dict['uncertainty_measures'] = dict() if args.compute_predictive_entropy: result_dict['uncertainty_measures'].update(entropies) if args.compute_p_ik or args.compute_p_ik_answerable: # Assemble training data for embedding classification. train_is_true, train_embeddings, train_answerable = [], [], [] for tid in train_generations: most_likely_answer = train_generations[tid]['most_likely_answer'] train_embeddings.append(most_likely_answer['embedding']) train_is_true.append(most_likely_answer['accuracy']) train_answerable.append(is_answerable(train_generations[tid])) train_is_false = [0.0 if is_t else 1.0 for is_t in train_is_true] train_unanswerable = [0.0 if is_t else 1.0 for is_t in train_answerable] logging.info('Unanswerable prop on p_ik training: %f', np.mean(train_unanswerable)) if args.compute_p_ik: logging.info('Starting training p_ik on train embeddings.') # Train classifier of correct/incorrect from embeddings. p_ik_predictions = get_p_ik( train_embeddings=train_embeddings, is_false=train_is_false, eval_embeddings=validation_embeddings, eval_is_false=validation_is_false) result_dict['uncertainty_measures']['p_ik'] = p_ik_predictions logging.info('Finished training p_ik on train embeddings.') if args.compute_p_ik_answerable: # Train classifier of answerable/unanswerable. p_ik_predictions = get_p_ik( train_embeddings=train_embeddings, is_false=train_unanswerable, eval_embeddings=validation_embeddings, eval_is_false=validation_unanswerable) result_dict['uncertainty_measures']['p_ik_unanswerable'] = p_ik_predictions if args.compute_p_true_in_compute_stage: result_dict['uncertainty_measures']['p_false'] = [1 - p for p in p_trues] result_dict['uncertainty_measures']['p_false_fixed'] = [1 - np.exp(p) for p in p_trues] utils.save(result_dict, 'uncertainty_measures.pkl') if args.compute_predictive_entropy: entailment_model.save_prediction_cache() if args.analyze_run: # Follow up with computation of aggregate performance metrics. logging.info(50 * '#X') logging.info('STARTING `analyze_run`!') analyze_run(wandb.run.id) logging.info(50 * '#X') logging.info('FINISHED `analyze_run`!') if __name__ == '__main__': parser = utils.get_parser(stages=['compute']) args, unknown = parser.parse_known_args() # pylint: disable=invalid-name if unknown: raise ValueError(f'Unkown args: {unknown}') logging.info("Args: %s", args) main(args) ================================================ FILE: semantic_uncertainty/generate_answers.py ================================================ """Sample answers from LLMs on QA task.""" import gc import os import logging import random from tqdm import tqdm import numpy as np import torch import wandb from uncertainty.data.data_utils import load_ds from uncertainty.utils import utils from uncertainty.uncertainty_measures import p_true as p_true_utils from compute_uncertainty_measures import main as main_compute utils.setup_logger() def main(args): # Setup run. if args.dataset == 'svamp': if not args.use_context: logging.info('Forcing `use_context=True` for svamp dataset.') args.use_context = True elif args.dataset == 'squad': if not args.answerable_only: logging.info('Forcing `answerable_only=True` for squad dataset.') args.answerable_only = True experiment_details = {'args': args} random.seed(args.random_seed) user = os.environ['USER'] slurm_jobid = os.getenv('SLURM_JOB_ID', None) scratch_dir = os.getenv('SCRATCH_DIR', '.') if not os.path.exists(f"{scratch_dir}/{user}/uncertainty"): os.makedirs(f"{scratch_dir}/{user}/uncertainty") wandb.init( entity=args.entity, project="semantic_uncertainty" if not args.debug else "semantic_uncertainty_debug", dir=f"{scratch_dir}/{user}/uncertainty", config=args, notes=f'slurm_id: {slurm_jobid}, experiment_lot: {args.experiment_lot}', ) logging.info('Finished wandb init.') # Get accuracy metric. metric = utils.get_metric(args.metric) # Load dataset. train_dataset, validation_dataset = load_ds( args.dataset, add_options=args.use_mc_options, seed=args.random_seed) if args.ood_train_dataset is not None: logging.warning( 'Using OOD dataset %s to construct few-shot prompts and train p_ik.', args.ood_train_dataset) # Get indices of answerable and unanswerable questions and construct prompt. train_dataset, _ = load_ds(args.ood_train_dataset, add_options=args.use_mc_options) if not isinstance(train_dataset, list): logging.info('Train dataset: %s', train_dataset) # Get indices of answerable and unanswerable questions and construct prompt. answerable_indices, unanswerable_indices = utils.split_dataset(train_dataset) if args.answerable_only: unanswerable_indices = [] val_answerable, val_unanswerable = utils.split_dataset(validation_dataset) del val_unanswerable validation_dataset = [validation_dataset[i] for i in val_answerable] prompt_indices = random.sample(answerable_indices, args.num_few_shot) experiment_details['prompt_indices'] = prompt_indices remaining_answerable = list(set(answerable_indices) - set(prompt_indices)) # Create Few-Shot prompt. make_prompt = utils.get_make_prompt(args) BRIEF = utils.BRIEF_PROMPTS[args.brief_prompt] arg = args.brief_always if args.enable_brief else True prompt = utils.construct_fewshot_prompt_from_indices( train_dataset, prompt_indices, BRIEF, arg, make_prompt) experiment_details['prompt'] = prompt experiment_details['BRIEF'] = BRIEF logging.info('Prompt is: %s', prompt) # Initialize model. model = utils.init_model(args) # Initialize prompt for p_true baseline. if args.compute_p_true: logging.info(80*'#') logging.info('Constructing few-shot prompt for p_true.') p_true_indices = random.sample(answerable_indices, args.p_true_num_fewshot) remaining_answerable = list(set(remaining_answerable) - set(p_true_indices)) p_true_few_shot_prompt, p_true_responses, len_p_true = p_true_utils.construct_few_shot_prompt( model=model, dataset=train_dataset, indices=p_true_indices, prompt=prompt, brief=BRIEF, brief_always=args.brief_always and args.enable_brief, make_prompt=make_prompt, num_generations=args.num_generations, metric=metric) wandb.config.update( {'p_true_num_fewshot': len_p_true}, allow_val_change=True) wandb.log(dict(len_p_true=len_p_true)) experiment_details['p_true_indices'] = p_true_indices experiment_details['p_true_responses'] = p_true_responses experiment_details['p_true_few_shot_prompt'] = p_true_few_shot_prompt logging.info('Finished constructing few-shot prompt for p_true.') logging.info(80*'#') logging.info('p_true_few_shot_prompt: %s', p_true_few_shot_prompt) logging.info(80*'#') # Start answer generation. logging.info(80 * '=') logging.info('Generating answers: ') logging.info(80 * '=') for dataset_split in ['train', 'validation']: logging.info(80 * 'x') logging.info('Starting with dataset_split %s.', dataset_split) logging.info(80 * 'x') # This will store all input data and model predictions. accuracies, generations, results_dict, p_trues = [], {}, {}, [] if dataset_split == 'train': if not args.get_training_set_generations: logging.info('Skip training data.') continue dataset = train_dataset possible_indices = list(set(remaining_answerable) | set(unanswerable_indices)) else: dataset = validation_dataset possible_indices = range(0, len(dataset)) # Evaluate over random subset of the datasets. indices = random.sample(possible_indices, min(args.num_samples, len(dataset))) experiment_details[dataset_split] = {'indices': indices} if args.num_samples > len(dataset): logging.warning('Not enough samples in dataset. Using all %d samples.', len(dataset)) it = 0 for index in tqdm(indices): if (it + 1 % 10) == 0: gc.collect() torch.cuda.empty_cache() it += 1 # Grab example at index. example = dataset[index] question, context = example["question"], example['context'] generations[example['id']] = {'question': question, 'context': context} correct_answer = example['answers']['text'] current_input = make_prompt( context, question, None, BRIEF, args.brief_always and args.enable_brief) local_prompt = prompt + current_input logging.info('Current input: '.ljust(15) + current_input) full_responses = [] # We sample one low temperature answer on which we will compute the # accuracy and args.num_generation high temperature answers which will # be used to estimate the entropy variants. if dataset_split == 'train' and args.get_training_set_generations_most_likely_only: num_generations = 1 else: num_generations = args.num_generations + 1 for i in range(num_generations): # Temperature for first generation is always `0.1`. temperature = 0.1 if i == 0 else args.temperature predicted_answer, token_log_likelihoods, embedding = model.predict( local_prompt, temperature) embedding = embedding.cpu() if embedding is not None else None # Only compute accuracy if question is answerable. compute_acc = args.compute_accuracy_at_all_temps or (i == 0) if correct_answer and compute_acc: acc = metric(predicted_answer, example, model) else: acc = 0.0 # pylint: disable=invalid-name if i == 0: logging.info('Iteration ' + str(it) + ': ' + 80*'#') if args.use_context: logging.info('context: '.ljust(15) + str(context)) logging.info('question: '.ljust(15) + question) logging.info('low-t prediction: '.ljust(15) + predicted_answer) logging.info('correct answer: '.ljust(15) + str(correct_answer)) logging.info('accuracy: '.ljust(15) + str(acc)) accuracies.append(acc) most_likely_answer_dict = { 'response': predicted_answer, 'token_log_likelihoods': token_log_likelihoods, 'embedding': embedding, 'accuracy': acc} generations[example['id']].update({ 'most_likely_answer': most_likely_answer_dict, 'reference': utils.get_reference(example)}) else: logging.info('high-t prediction '.ljust(15) + str(i) + ' : ' + predicted_answer) # Aggregate predictions over num_generations. full_responses.append( (predicted_answer, token_log_likelihoods, embedding, acc)) # Append all predictions for this example to `generations`. generations[example['id']]['responses'] = full_responses if args.compute_p_true and dataset_split == 'validation': # Already compute p_true here. Avoid cost of generations in compute_uncertainty script. p_true = p_true_utils.calculate_p_true( model, question, most_likely_answer_dict['response'], [r[0] for r in full_responses], p_true_few_shot_prompt, hint=args.p_true_hint) p_trues.append(p_true) logging.info('p_true: %s', p_true) # Save generations for that split. utils.save(generations, f'{dataset_split}_generations.pkl') # Log overall accuracy. accuracy = np.mean(accuracies) print(f"Overall {dataset_split} split accuracy: {accuracy}") wandb.log({f"{dataset_split}_accuracy": accuracy}) if dataset_split == 'validation': if args.compute_p_true: results_dict['uncertainty_measures'] = { 'p_false': [1 - p for p in p_trues], 'p_false_fixed': [1 - np.exp(p) for p in p_trues], } utils.save(results_dict, 'uncertainty_measures.pkl') utils.save(experiment_details, 'experiment_details.pkl') logging.info('Run complete.') del model if __name__ == '__main__': parser = utils.get_parser() args, unknown = parser.parse_known_args() logging.info('Starting new run with args: %s', args) if unknown: raise ValueError(f'Unkown args: {unknown}') if args.compute_uncertainties: args.assign_new_wandb_id = False # First sample generations from LLM. logging.info('STARTING `generate_answers`!') main(args) logging.info('FINISHED `generate_answers`!') if args.compute_uncertainties: # Follow with uncertainty calculation script by default. args.assign_new_wandb_id = False gc.collect() torch.cuda.empty_cache() logging.info(50 * '#X') logging.info('STARTING `compute_uncertainty_measures`!') main_compute(args) logging.info('FINISHED `compute_uncertainty_measures`!') ================================================ FILE: semantic_uncertainty/uncertainty/__init__.py ================================================ ================================================ FILE: semantic_uncertainty/uncertainty/data/data_utils.py ================================================ """Data Loading Utilities.""" import os import json import hashlib import datasets def load_ds(dataset_name, seed, add_options=None): """Load dataset.""" user = os.environ['USER'] train_dataset, validation_dataset = None, None if dataset_name == "squad": dataset = datasets.load_dataset("squad_v2") train_dataset = dataset["train"] validation_dataset = dataset["validation"] elif dataset_name == 'svamp': dataset = datasets.load_dataset('ChilleD/SVAMP') train_dataset = dataset["train"] validation_dataset = dataset["test"] reformat = lambda x: { 'question': x['Question'], 'context': x['Body'], 'type': x['Type'], 'equation': x['Equation'], 'id': x['ID'], 'answers': {'text': [str(x['Answer'])]}} train_dataset = [reformat(d) for d in train_dataset] validation_dataset = [reformat(d) for d in validation_dataset] elif dataset_name == 'nq': dataset = datasets.load_dataset("nq_open") train_dataset = dataset["train"] validation_dataset = dataset["validation"] md5hash = lambda s: str(int(hashlib.md5(s.encode('utf-8')).hexdigest(), 16)) reformat = lambda x: { 'question': x['question']+'?', 'answers': {'text': x['answer']}, 'context': '', 'id': md5hash(str(x['question'])), } train_dataset = [reformat(d) for d in train_dataset] validation_dataset = [reformat(d) for d in validation_dataset] elif dataset_name == "trivia_qa": dataset = datasets.load_dataset('TimoImhof/TriviaQA-in-SQuAD-format')['unmodified'] dataset = dataset.train_test_split(test_size=0.2, seed=seed) train_dataset = dataset['train'] validation_dataset = dataset['test'] elif dataset_name == "bioasq": # http://participants-area.bioasq.org/datasets/ we are using training 11b # could also download from here https://zenodo.org/records/7655130 scratch_dir = os.getenv('SCRATCH_DIR', '.') path = f"{scratch_dir}/{user}/semantic_uncertainty/data/bioasq/training11b.json" with open(path, "rb") as file: data = json.load(file) questions = data["questions"] dataset_dict = { "question": [], "answers": [], "id": [] } for question in questions: if "exact_answer" not in question: continue dataset_dict["question"].append(question["body"]) if "exact_answer" in question: if isinstance(question['exact_answer'], list): exact_answers = [ ans[0] if isinstance(ans, list) else ans for ans in question['exact_answer'] ] else: exact_answers = [question['exact_answer']] dataset_dict["answers"].append({ "text": exact_answers, "answer_start": [0] * len(question["exact_answer"]) }) else: dataset_dict["answers"].append({ "text": question["ideal_answer"], "answer_start": [0] }) dataset_dict["id"].append(question["id"]) dataset_dict["context"] = [None] * len(dataset_dict["id"]) dataset = datasets.Dataset.from_dict(dataset_dict) # Split into training and validation set. dataset = dataset.train_test_split(test_size=0.8, seed=seed) train_dataset = dataset['train'] validation_dataset = dataset['test'] else: raise ValueError return train_dataset, validation_dataset ================================================ FILE: semantic_uncertainty/uncertainty/models/__init__.py ================================================ ================================================ FILE: semantic_uncertainty/uncertainty/models/base_model.py ================================================ from abc import ABC, abstractmethod from typing import List, Text STOP_SEQUENCES = ['\n\n\n\n', '\n\n\n', '\n\n', '\n', 'Question:', 'Context:'] class BaseModel(ABC): stop_sequences: List[Text] @abstractmethod def predict(self, input_data, temperature): pass @abstractmethod def get_p_true(self, input_data): pass ================================================ FILE: semantic_uncertainty/uncertainty/models/huggingface_models.py ================================================ """Implement HuggingfaceModel models.""" import copy import logging from collections import Counter import torch import accelerate from transformers import AutoTokenizer from transformers import AutoConfig from transformers import AutoModelForCausalLM from transformers import BitsAndBytesConfig from transformers import StoppingCriteria from transformers import StoppingCriteriaList from huggingface_hub import snapshot_download from uncertainty.models.base_model import BaseModel from uncertainty.models.base_model import STOP_SEQUENCES class StoppingCriteriaSub(StoppingCriteria): """Stop generations when they match a particular text or token.""" def __init__(self, stops, tokenizer, match_on='text', initial_length=None): super().__init__() self.stops = stops self.initial_length = initial_length self.tokenizer = tokenizer self.match_on = match_on if self.match_on == 'tokens': self.stops = [torch.tensor(self.tokenizer.encode(i)).to('cuda') for i in self.stops] print(self.stops) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): del scores # `scores` arg is required by StoppingCriteria but unused by us. for stop in self.stops: if self.match_on == 'text': generation = self.tokenizer.decode(input_ids[0][self.initial_length:], skip_special_tokens=False) match = stop in generation elif self.match_on == 'tokens': # Can be dangerous due to tokenizer ambiguities. match = stop in input_ids[0][-len(stop):] else: raise if match: return True return False def remove_split_layer(device_map_in): """Modify device maps s.t. individual layers are not spread across devices.""" device_map = copy.deepcopy(device_map_in) destinations = list(device_map.keys()) counts = Counter(['.'.join(i.split('.')[:2]) for i in destinations]) found_split = False for layer, count in counts.items(): if count == 1: continue if found_split: # Only triggers if we find more than one split layer. raise ValueError( 'More than one split layer.\n' f'Currently at layer {layer}.\n' f'In map: {device_map_in}\n' f'Out map: {device_map}\n') logging.info(f'Split layer is {layer}.') # Remove split for that layer. for name in list(device_map.keys()): if name.startswith(layer): print(f'pop {name}') device = device_map.pop(name) device_map[layer] = device found_split = True return device_map class HuggingfaceModel(BaseModel): """Hugging Face Model.""" def __init__(self, model_name, stop_sequences=None, max_new_tokens=None): if max_new_tokens is None: raise self.max_new_tokens = max_new_tokens if stop_sequences == 'default': stop_sequences = STOP_SEQUENCES if 'llama' in model_name.lower(): if model_name.endswith('-8bit'): kwargs = {'quantization_config': BitsAndBytesConfig( load_in_8bit=True,)} model_name = model_name[:-len('-8bit')] eightbit = True else: kwargs = {} eightbit = False if 'Llama-2' in model_name: base = 'meta-llama' model_name = model_name + '-hf' else: base = 'huggyllama' self.tokenizer = AutoTokenizer.from_pretrained( f"{base}/{model_name}", device_map="auto", token_type_ids=None) llama65b = '65b' in model_name and base == 'huggyllama' llama2_70b = '70b' in model_name and base == 'meta-llama' if ('7b' in model_name or '13b' in model_name) or eightbit: self.model = AutoModelForCausalLM.from_pretrained( f"{base}/{model_name}", device_map="auto", max_memory={0: '80GIB'}, **kwargs,) elif llama2_70b or llama65b: path = snapshot_download( repo_id=f'{base}/{model_name}', allow_patterns=['*.json', '*.model', '*.safetensors'], ignore_patterns=['pytorch_model.bin.index.json'] ) config = AutoConfig.from_pretrained(f"{base}/{model_name}") with accelerate.init_empty_weights(): self.model = AutoModelForCausalLM.from_config(config) self.model.tie_weights() max_mem = 15 * 4686198491 device_map = accelerate.infer_auto_device_map( self.model.model, max_memory={0: max_mem, 1: max_mem}, dtype='float16' ) device_map = remove_split_layer(device_map) full_model_device_map = {f"model.{k}": v for k, v in device_map.items()} full_model_device_map["lm_head"] = 0 self.model = accelerate.load_checkpoint_and_dispatch( self.model, path, device_map=full_model_device_map, dtype='float16', skip_keys='past_key_values') else: raise ValueError elif 'mistral' in model_name.lower(): if model_name.endswith('-8bit'): kwargs = {'quantization_config': BitsAndBytesConfig( load_in_8bit=True,)} model_name = model_name[:-len('-8bit')] if model_name.endswith('-4bit'): kwargs = {'quantization_config': BitsAndBytesConfig( load_in_4bit=True,)} model_name = model_name[:-len('-4bit')] else: kwargs = {} model_id = f'mistralai/{model_name}' self.tokenizer = AutoTokenizer.from_pretrained( model_id, device_map='auto', token_type_ids=None, clean_up_tokenization_spaces=False) self.model = AutoModelForCausalLM.from_pretrained( model_id, device_map='auto', max_memory={0: '80GIB'}, **kwargs, ) elif 'falcon' in model_name: model_id = f'tiiuae/{model_name}' self.tokenizer = AutoTokenizer.from_pretrained( model_id, device_map='auto', token_type_ids=None, clean_up_tokenization_spaces=False) kwargs = {'quantization_config': BitsAndBytesConfig( load_in_8bit=True,)} self.model = AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, device_map='auto', **kwargs, ) else: raise ValueError self.model_name = model_name self.stop_sequences = stop_sequences + [self.tokenizer.eos_token] self.token_limit = 4096 if 'Llama-2' in model_name else 2048 def predict(self, input_data, temperature, return_full=False): # Implement prediction. inputs = self.tokenizer(input_data, return_tensors="pt").to("cuda") if 'llama' in self.model_name.lower() or 'falcon' in self.model_name or 'mistral' in self.model_name.lower(): if 'token_type_ids' in inputs: # Some HF models have changed. del inputs['token_type_ids'] pad_token_id = self.tokenizer.eos_token_id else: pad_token_id = None if self.stop_sequences is not None: stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub( stops=self.stop_sequences, initial_length=len(inputs['input_ids'][0]), tokenizer=self.tokenizer)]) else: stopping_criteria = None logging.debug('temperature: %f', temperature) with torch.no_grad(): outputs = self.model.generate( **inputs, max_new_tokens=self.max_new_tokens, return_dict_in_generate=True, output_scores=True, output_hidden_states=True, temperature=temperature, do_sample=True, stopping_criteria=stopping_criteria, pad_token_id=pad_token_id, ) if len(outputs.sequences[0]) > self.token_limit: raise ValueError( 'Generation exceeding token limit %d > %d', len(outputs.sequences[0]), self.token_limit) full_answer = self.tokenizer.decode( outputs.sequences[0], skip_special_tokens=True) if return_full: return full_answer # For some models, we need to remove the input_data from the answer. if full_answer.startswith(input_data): input_data_offset = len(input_data) else: raise ValueError('Have not tested this in a while.') # Remove input from answer. answer = full_answer[input_data_offset:] # Remove stop_words from answer. stop_at = len(answer) sliced_answer = answer if self.stop_sequences is not None: for stop in self.stop_sequences: if answer.endswith(stop): stop_at = len(answer) - len(stop) sliced_answer = answer[:stop_at] break if not all([stop not in sliced_answer for stop in self.stop_sequences]): error_msg = 'Error: Stop words not removed successfully!' error_msg += f'Answer: >{answer}< ' error_msg += f'Sliced Answer: >{sliced_answer}<' if 'falcon' not in self.model_name.lower(): raise ValueError(error_msg) else: logging.error(error_msg) # Remove whitespaces from answer (in particular from beginning.) sliced_answer = sliced_answer.strip() # Get the number of tokens until the stop word comes up. # Note: Indexing with `stop_at` already excludes the stop_token. # Note: It's important we do this with full answer, since there might be # non-trivial interactions between the input_data and generated part # in tokenization (particularly around whitespaces.) token_stop_index = self.tokenizer(full_answer[:input_data_offset + stop_at], return_tensors="pt")['input_ids'].shape[1] n_input_token = len(inputs['input_ids'][0]) n_generated = token_stop_index - n_input_token if n_generated == 0: logging.warning('Only stop_words were generated. For likelihoods and embeddings, taking stop word instead.') n_generated = 1 # Get the last hidden state (last layer) and the last token's embedding of the answer. # Note: We do not want this to be the stop token. # outputs.hidden_state is a tuple of len = n_generated_tokens. # The first hidden state is for the input tokens and is of shape # (n_layers) x (batch_size, input_size, hidden_size). # (Note this includes the first generated token!) # The remaining hidden states are for the remaining generated tokens and is of shape # (n_layers) x (batch_size, 1, hidden_size). # Note: The output embeddings have the shape (batch_size, generated_length, hidden_size). # We do not get embeddings for input_data! We thus subtract the n_tokens_in_input from # token_stop_index to arrive at the right output. if 'decoder_hidden_states' in outputs.keys(): hidden = outputs.decoder_hidden_states else: hidden = outputs.hidden_states if len(hidden) == 1: logging.warning( 'Taking first and only generation for hidden! ' 'n_generated: %d, n_input_token: %d, token_stop_index %d, ' 'last_token: %s, generation was: %s', n_generated, n_input_token, token_stop_index, self.tokenizer.decode(outputs['sequences'][0][-1]), full_answer, ) last_input = hidden[0] elif ((n_generated - 1) >= len(hidden)): # If access idx is larger/equal. logging.error( 'Taking last state because n_generated is too large' 'n_generated: %d, n_input_token: %d, token_stop_index %d, ' 'last_token: %s, generation was: %s, slice_answer: %s', n_generated, n_input_token, token_stop_index, self.tokenizer.decode(outputs['sequences'][0][-1]), full_answer, sliced_answer ) last_input = hidden[-1] else: last_input = hidden[n_generated - 1] # Then access last layer for input last_layer = last_input[-1] # Then access last token in input. last_token_embedding = last_layer[:, -1, :].cpu() # Get log_likelihoods. # outputs.scores are the logits for the generated token. # outputs.scores is a tuple of len = n_generated_tokens. # Each entry is shape (bs, vocabulary size). # outputs.sequences is the sequence of all tokens: input and generated. transition_scores = self.model.compute_transition_scores( outputs.sequences, outputs.scores, normalize_logits=True) # Transition_scores[0] only contains the scores for the first generated tokens. log_likelihoods = [score.item() for score in transition_scores[0]] if len(log_likelihoods) == 1: logging.warning('Taking first and only generation for log likelihood!') log_likelihoods = log_likelihoods else: log_likelihoods = log_likelihoods[:n_generated] if len(log_likelihoods) == self.max_new_tokens: logging.warning('Generation interrupted by max_token limit.') if len(log_likelihoods) == 0: raise ValueError return sliced_answer, log_likelihoods, last_token_embedding def get_p_true(self, input_data): """Get the probability of the model anwering A (True) for the given input.""" input_data += ' A' tokenized_prompt_true = self.tokenizer(input_data, return_tensors='pt').to('cuda')['input_ids'] # The computation of the negative log likelihoods follows: # https://huggingface.co/docs/transformers/perplexity. target_ids_true = tokenized_prompt_true.clone() # Set all target_ids except the last one to -100. target_ids_true[0, :-1] = -100 with torch.no_grad(): model_output_true = self.model(tokenized_prompt_true, labels=target_ids_true) loss_true = model_output_true.loss return -loss_true.item() ================================================ FILE: semantic_uncertainty/uncertainty/uncertainty_measures/p_ik.py ================================================ """Predict model correctness from linear classifier.""" import logging import torch import wandb from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score from sklearn.metrics import roc_auc_score from sklearn.model_selection import train_test_split def get_p_ik(train_embeddings, is_false, eval_embeddings=None, eval_is_false=None): """Fit linear classifier to embeddings to predict model correctness.""" logging.info('Accuracy of model on Task: %f.', 1 - torch.tensor(is_false).mean()) # pylint: disable=no-member # Convert the list of tensors to a 2D tensor. train_embeddings_tensor = torch.cat(train_embeddings, dim=0) # pylint: disable=no-member # Convert the tensor to a numpy array. embeddings_array = train_embeddings_tensor.cpu().numpy() # Split the data into training and test sets. X_train, X_test, y_train, y_test = train_test_split( # pylint: disable=invalid-name embeddings_array, is_false, test_size=0.2, random_state=42) # pylint: disable=invalid-name # Fit a logistic regression model. model = LogisticRegression() model.fit(X_train, y_train) # Predict deterministically and probabilistically and compute accuracy and auroc for all splits. X_eval = torch.cat(eval_embeddings, dim=0).cpu().numpy() # pylint: disable=no-member,invalid-name y_eval = eval_is_false Xs = [X_train, X_test, X_eval] # pylint: disable=invalid-name ys = [y_train, y_test, y_eval] # pylint: disable=invalid-name suffixes = ['train_train', 'train_test', 'eval'] metrics, y_preds_proba = {}, {} for suffix, X, y_true in zip(suffixes, Xs, ys): # pylint: disable=invalid-name # If suffix is eval, we fit a new model on the entire training data set # rather than just a split of the training data set. if suffix == 'eval': model = LogisticRegression() model.fit(embeddings_array, is_false) convergence = { 'n_iter': model.n_iter_[0], 'converged': (model.n_iter_ < model.max_iter)[0]} y_pred = model.predict(X) y_pred_proba = model.predict_proba(X) y_preds_proba[suffix] = y_pred_proba acc_p_ik_train = accuracy_score(y_true, y_pred) auroc_p_ik_train = roc_auc_score(y_true, y_pred_proba[:, 1]) split_metrics = { f'acc_p_ik_{suffix}': acc_p_ik_train, f'auroc_p_ik_{suffix}': auroc_p_ik_train} metrics.update(split_metrics) logging.info('Metrics for p_ik classifier: %s.', metrics) wandb.log({**metrics, **convergence}) # Return model predictions on the eval set. return y_preds_proba['eval'][:, 1] ================================================ FILE: semantic_uncertainty/uncertainty/uncertainty_measures/p_true.py ================================================ """Compute p_true uncertainty metric.""" import logging def construct_few_shot_prompt( *, model, dataset, indices, prompt, brief, brief_always, make_prompt, num_generations, metric): """Construct few shot prompt for p_true uncertainty metric.""" # Call model n_shots many times. few_shot_prompt = [] all_responses = dict() for it, i in enumerate(indices): prompt_candidate = [] example = dataset[i] question = example["question"] context = example["context"] if it != 0: prompt_candidate += ['\n'] prompt_candidate += ['Question: ' + question] prompt_candidate += ['\nBrainstormed Answers: '] current_question = make_prompt(context, question, None, brief, brief_always) local_prompt = prompt + current_question logging.info('P_TRUE >> Current Question: '.ljust(25) + current_question) responses = [] for j in range(num_generations + 1): if j == 0: temperature = 0.1 else: temperature = 1.0 response, _, _ = model.predict(local_prompt, temperature) logging.info('P_TRUE >> Current Response: '.ljust(25) + response) responses.append(response) prompt_candidate += [f'{response.strip()} \n'] if j == 0: # Save most likely response and compute correctness metric for it. most_likely_response = response is_correct = metric(response, example, model) answers = [answer for answer in example['answers']['text']] logging.info('P_TRUE >> LOW-T >> true answer: '.ljust(35) + str(answers)) logging.info('P_TRUE >> LOW-T >> acc: '.ljust(35) + str(is_correct)) all_responses[i] = dict( responses=responses, most_likely_response=most_likely_response, is_correct=is_correct) prompt_candidate += ['Possible answer: ' + most_likely_response + '\n'] prompt_candidate += ['Is the possible answer:\n'] prompt_candidate += ['A) True\n'] prompt_candidate += ['B) False\n'] prompt_candidate += ['The possible answer is:'] prompt_candidate += [' A' if is_correct else ' B'] prompt_len = len(model.tokenizer.encode(''.join(few_shot_prompt + prompt_candidate))) # At test time, get a maximum of `num_generations * model.token_limit` extra tokens # 200 buffer for question and 'Possible Answer'. max_input_len = prompt_len + num_generations * model.max_new_tokens + 200 if max_input_len < model.token_limit: few_shot_prompt.extend(prompt_candidate) else: logging.warning('Cutting of p_true prompt at length %d.', it) break return ''.join(few_shot_prompt), all_responses, it def calculate_p_true( model, question, most_probable_answer, brainstormed_answers, few_shot_prompt, hint=False): """Calculate p_true uncertainty metric.""" if few_shot_prompt: prompt = few_shot_prompt + '\n' else: prompt = '' prompt += 'Question: ' + question prompt += '\nBrainstormed Answers: ' for answer in brainstormed_answers + [most_probable_answer]: prompt += answer.strip() + '\n' prompt += 'Possible answer: ' + most_probable_answer + '\n' if not hint: prompt += 'Is the possible answer:\n' prompt += 'A) True\n' prompt += 'B) False\n' prompt += 'The possible answer is:' else: prompt += 'Do the brainstormed answers match the possible answer? Respond with A if they do, if they do not respond with B. Answer:' log_prob = model.get_p_true(prompt) return log_prob ================================================ FILE: semantic_uncertainty/uncertainty/uncertainty_measures/semantic_entropy.py ================================================ """Implement semantic entropy.""" import os import pickle import logging import numpy as np import wandb import torch import torch.nn.functional as F from transformers import AutoModelForSequenceClassification, AutoTokenizer from uncertainty.models.huggingface_models import HuggingfaceModel from uncertainty.utils import openai as oai from uncertainty.utils import utils DEVICE = "cuda" if torch.cuda.is_available() else "cpu" class BaseEntailment: def save_prediction_cache(self): pass class EntailmentDeberta(BaseEntailment): def __init__(self): self.tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge-mnli") self.model = AutoModelForSequenceClassification.from_pretrained( "microsoft/deberta-v2-xlarge-mnli").to(DEVICE) def check_implication(self, text1, text2, *args, **kwargs): inputs = self.tokenizer(text1, text2, return_tensors="pt").to(DEVICE) # The model checks if text1 -> text2, i.e. if text2 follows from text1. # check_implication('The weather is good', 'The weather is good and I like you') --> 1 # check_implication('The weather is good and I like you', 'The weather is good') --> 2 outputs = self.model(**inputs) logits = outputs.logits # Deberta-mnli returns `neutral` and `entailment` classes at indices 1 and 2. largest_index = torch.argmax(F.softmax(logits, dim=1)) # pylint: disable=no-member prediction = largest_index.cpu().item() if os.environ.get('DEBERTA_FULL_LOG', False): logging.info('Deberta Input: %s -> %s', text1, text2) logging.info('Deberta Prediction: %s', prediction) return prediction class EntailmentLLM(BaseEntailment): entailment_file = 'entailment_cache.pkl' def __init__(self, entailment_cache_id, entailment_cache_only): self.prediction_cache = self.init_prediction_cache(entailment_cache_id) self.entailment_cache_only = entailment_cache_only def init_prediction_cache(self, entailment_cache_id): if entailment_cache_id is None: return dict() logging.info('Restoring prediction cache from %s', entailment_cache_id) api = wandb.Api() run = api.run(entailment_cache_id) run.file(self.entailment_file).download( replace=True, exist_ok=False, root=wandb.run.dir) with open(f'{wandb.run.dir}/{self.entailment_file}', "rb") as infile: return pickle.load(infile) def save_prediction_cache(self): # Write the dictionary to a pickle file. utils.save(self.prediction_cache, self.entailment_file) def check_implication(self, text1, text2, example=None): if example is None: raise ValueError prompt = self.equivalence_prompt(text1, text2, example['question']) logging.info('%s input: %s', self.name, prompt) hashed = oai.md5hash(prompt) if hashed in self.prediction_cache: logging.info('Restoring hashed instead of predicting with model.') response = self.prediction_cache[hashed] else: if self.entailment_cache_only: raise ValueError response = self.predict(prompt, temperature=0.02) self.prediction_cache[hashed] = response logging.info('%s prediction: %s', self.name, response) binary_response = response.lower()[:30] if 'entailment' in binary_response: return 2 elif 'neutral' in binary_response: return 1 elif 'contradiction' in binary_response: return 0 else: logging.warning('MANUAL NEUTRAL!') return 1 class EntailmentGPT4(EntailmentLLM): def __init__(self, entailment_cache_id, entailment_cache_only): super().__init__(entailment_cache_id, entailment_cache_only) self.name = 'gpt-4' def equivalence_prompt(self, text1, text2, question): prompt = f"""We are evaluating answers to the question \"{question}\"\n""" prompt += "Here are two possible answers:\n" prompt += f"Possible Answer 1: {text1}\nPossible Answer 2: {text2}\n" prompt += "Does Possible Answer 1 semantically entail Possible Answer 2? Respond with entailment, contradiction, or neutral.""" return prompt def predict(self, prompt, temperature): return oai.predict(prompt, temperature, model=self.name) class EntailmentGPT35(EntailmentGPT4): def __init__(self, entailment_cache_id, entailment_cache_only): super().__init__(entailment_cache_id, entailment_cache_only) self.name = 'gpt-3.5' class EntailmentGPT4Turbo(EntailmentGPT4): def __init__(self, entailment_cache_id, entailment_cache_only): super().__init__(entailment_cache_id, entailment_cache_only) self.name = 'gpt-4-turbo' class EntailmentLlama(EntailmentLLM): def __init__(self, entailment_cache_id, entailment_cache_only, name): super().__init__(entailment_cache_id, entailment_cache_only) self.name = name self.model = HuggingfaceModel( name, stop_sequences='default', max_new_tokens=30) def equivalence_prompt(self, text1, text2, question): prompt = f"""We are evaluating answers to the question \"{question}\"\n""" prompt += "Here are two possible answers:\n" prompt += f"Possible Answer 1: {text1}\nPossible Answer 2: {text2}\n" prompt += "Does Possible Answer 1 semantically entail Possible Answer 2? Respond only with entailment, contradiction, or neutral.\n""" prompt += "Response:""" return prompt def predict(self, prompt, temperature): predicted_answer, _, _ = self.model.predict(prompt, temperature) return predicted_answer def context_entails_response(context, responses, model): votes = [] for response in responses: votes.append(model.check_implication(context, response)) return 2 - np.mean(votes) def get_semantic_ids(strings_list, model, strict_entailment=False, example=None): """Group list of predictions into semantic meaning.""" def are_equivalent(text1, text2): implication_1 = model.check_implication(text1, text2, example=example) implication_2 = model.check_implication(text2, text1, example=example) # pylint: disable=arguments-out-of-order assert (implication_1 in [0, 1, 2]) and (implication_2 in [0, 1, 2]) if strict_entailment: semantically_equivalent = (implication_1 == 2) and (implication_2 == 2) else: implications = [implication_1, implication_2] # Check if none of the implications are 0 (contradiction) and not both of them are neutral. semantically_equivalent = (0 not in implications) and ([1, 1] != implications) return semantically_equivalent # Initialise all ids with -1. semantic_set_ids = [-1] * len(strings_list) # Keep track of current id. next_id = 0 for i, string1 in enumerate(strings_list): # Check if string1 already has an id assigned. if semantic_set_ids[i] == -1: # If string1 has not been assigned an id, assign it next_id. semantic_set_ids[i] = next_id for j in range(i+1, len(strings_list)): # Search through all remaining strings. If they are equivalent to string1, assign them the same id. if are_equivalent(string1, strings_list[j]): semantic_set_ids[j] = next_id next_id += 1 assert -1 not in semantic_set_ids return semantic_set_ids def logsumexp_by_id(semantic_ids, log_likelihoods, agg='sum_normalized'): """Sum probabilities with the same semantic id. Log-Sum-Exp because input and output probabilities in log space. """ unique_ids = sorted(list(set(semantic_ids))) assert unique_ids == list(range(len(unique_ids))) log_likelihood_per_semantic_id = [] for uid in unique_ids: # Find positions in `semantic_ids` which belong to the active `uid`. id_indices = [pos for pos, x in enumerate(semantic_ids) if x == uid] # Gather log likelihoods at these indices. id_log_likelihoods = [log_likelihoods[i] for i in id_indices] if agg == 'sum_normalized': # log_lik_norm = id_log_likelihoods - np.prod(log_likelihoods) log_lik_norm = id_log_likelihoods - np.log(np.sum(np.exp(log_likelihoods))) logsumexp_value = np.log(np.sum(np.exp(log_lik_norm))) else: raise ValueError log_likelihood_per_semantic_id.append(logsumexp_value) return log_likelihood_per_semantic_id def predictive_entropy(log_probs): """Compute MC estimate of entropy. `E[-log p(x)] ~= -1/N sum_i log p(x_i)`, i.e. the average token likelihood. """ entropy = -np.sum(log_probs) / len(log_probs) return entropy def predictive_entropy_rao(log_probs): entropy = -np.sum(np.exp(log_probs) * log_probs) return entropy def cluster_assignment_entropy(semantic_ids): """Estimate semantic uncertainty from how often different clusters get assigned. We estimate the categorical distribution over cluster assignments from the semantic ids. The uncertainty is then given by the entropy of that distribution. This estimate does not use token likelihoods, it relies soley on the cluster assignments. If probability mass is spread of between many clusters, entropy is larger. If probability mass is concentrated on a few clusters, entropy is small. Input: semantic_ids: List of semantic ids, e.g. [0, 1, 2, 1]. Output: cluster_entropy: Entropy, e.g. (-p log p).sum() for p = [1/4, 2/4, 1/4]. """ n_generations = len(semantic_ids) counts = np.bincount(semantic_ids) probabilities = counts/n_generations assert np.isclose(probabilities.sum(), 1) entropy = - (probabilities * np.log(probabilities)).sum() return entropy ================================================ FILE: semantic_uncertainty/uncertainty/utils/eval_utils.py ================================================ """Functions for performance evaluation, mainly used in analyze_results.py.""" import numpy as np import scipy from sklearn import metrics # pylint: disable=missing-function-docstring def bootstrap(function, rng, n_resamples=1000): def inner(data): bs = scipy.stats.bootstrap( (data, ), function, n_resamples=n_resamples, confidence_level=0.9, random_state=rng) return { 'std_err': bs.standard_error, 'low': bs.confidence_interval.low, 'high': bs.confidence_interval.high } return inner def auroc(y_true, y_score): fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) del thresholds return metrics.auc(fpr, tpr) def accuracy_at_quantile(accuracies, uncertainties, quantile): cutoff = np.quantile(uncertainties, quantile) select = uncertainties <= cutoff return np.mean(accuracies[select]) def area_under_thresholded_accuracy(accuracies, uncertainties): quantiles = np.linspace(0.1, 1, 20) select_accuracies = np.array([accuracy_at_quantile(accuracies, uncertainties, q) for q in quantiles]) dx = quantiles[1] - quantiles[0] area = (select_accuracies * dx).sum() return area # Need wrappers because scipy expects 1D data. def compatible_bootstrap(func, rng): def helper(y_true_y_score): # this function is called in the bootstrap y_true = np.array([i['y_true'] for i in y_true_y_score]) y_score = np.array([i['y_score'] for i in y_true_y_score]) out = func(y_true, y_score) return out def wrap_inputs(y_true, y_score): return [{'y_true': i, 'y_score': j} for i, j in zip(y_true, y_score)] def converted_func(y_true, y_score): y_true_y_score = wrap_inputs(y_true, y_score) return bootstrap(helper, rng=rng)(y_true_y_score) return converted_func ================================================ FILE: semantic_uncertainty/uncertainty/utils/openai.py ================================================ import os import hashlib from tenacity import retry, wait_random_exponential, retry_if_not_exception_type from openai import OpenAI CLIENT = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', False)) class KeyError(Exception): """OpenAIKey not provided in environment variable.""" pass @retry(retry=retry_if_not_exception_type(KeyError), wait=wait_random_exponential(min=1, max=10)) def predict(prompt, temperature=1.0, model='gpt-4'): """Predict with GPT models.""" if not CLIENT.api_key: raise KeyError('Need to provide OpenAI API key in environment variable `OPENAI_API_KEY`.') if isinstance(prompt, str): messages = [ {'role': 'user', 'content': prompt}, ] else: messages = prompt if model == 'gpt-4': model = 'gpt-4-0613' elif model == 'gpt-4-turbo': model = 'gpt-4-1106-preview' elif model == 'gpt-3.5': model = 'gpt-3.5-turbo-1106' output = CLIENT.chat.completions.create( model=model, messages=messages, max_tokens=200, temperature=temperature, ) response = output.choices[0].message.content return response def md5hash(string): return int(hashlib.md5(string.encode('utf-8')).hexdigest(), 16) ================================================ FILE: semantic_uncertainty/uncertainty/utils/utils.py ================================================ """Utility functions.""" import os import logging import argparse import pickle import wandb from evaluate import load from uncertainty.models.huggingface_models import HuggingfaceModel from uncertainty.utils import openai as oai BRIEF_PROMPTS = { 'default': "Answer the following question as briefly as possible.\n", 'chat': 'Answer the following question in a single brief but complete sentence.\n'} def get_parser(stages=['generate', 'compute']): entity = os.getenv('WANDB_SEM_UNC_ENTITY', None) parser = argparse.ArgumentParser() parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="Keep default wandb clean.") parser.add_argument('--entity', type=str, default=entity) parser.add_argument('--random_seed', type=int, default=10) parser.add_argument( "--metric", type=str, default="squad", choices=['squad', 'llm', 'llm_gpt-3.5', 'llm_gpt-4'], help="Metric to assign accuracy to generations.") parser.add_argument( "--compute_accuracy_at_all_temps", action=argparse.BooleanOptionalAction, default=True, help="Compute accuracy at all temperatures or only t<<1.") parser.add_argument( "--experiment_lot", type=str, default='Unnamed Experiment', help="Keep default wandb clean.") if 'generate' in stages: parser.add_argument( "--model_name", type=str, default="Llama-2-7b-chat", help="Model name", ) parser.add_argument( "--model_max_new_tokens", type=int, default=50, help="Max number of tokens generated.", ) parser.add_argument( "--dataset", type=str, default="trivia_qa", choices=['trivia_qa', 'squad', 'bioasq', 'nq', 'svamp'], help="Dataset to use") parser.add_argument( "--ood_train_dataset", type=str, default=None, choices=['trivia_qa', 'squad', 'bioasq', 'nq', 'svamp'], help="Dataset to use to assemble few-shot prompt, p_true prompt, and train p_ik.") parser.add_argument( "--num_samples", type=int, default=400, help="Number of samples to use") parser.add_argument( "--num_few_shot", type=int, default=5, help="Number of few shot examples to use") parser.add_argument( "--p_true_num_fewshot", type=int, default=20, help="Number of few shot examples to use") parser.add_argument( "--p_true_hint", default=False, action=argparse.BooleanOptionalAction, help="Get generations for training set?") parser.add_argument( "--num_generations", type=int, default=10, help="Number of generations to use") parser.add_argument( "--temperature", type=float, default=1.0, help="Temperature") parser.add_argument( "--use_mc_options", type=bool, default=True, help="Include MC options question?") parser.add_argument( "--get_training_set_generations", default=True, action=argparse.BooleanOptionalAction, help="Get generations for training set?") parser.add_argument( "--use_context", default=False, action=argparse.BooleanOptionalAction, help="Get generations for training set?") parser.add_argument( "--get_training_set_generations_most_likely_only", default=True, action=argparse.BooleanOptionalAction, help=( "Only get embedding of most likely answer for training set. " "This is all that's needed for p_true.")) parser.add_argument('--compute_p_true', default=True, action=argparse.BooleanOptionalAction) parser.add_argument( "--brief_always", default=False, action=argparse.BooleanOptionalAction) parser.add_argument( "--enable_brief", default=True, action=argparse.BooleanOptionalAction) parser.add_argument( "--brief_prompt", default='default', type=str) parser.add_argument( "--prompt_type", default='default', type=str) parser.add_argument( "--compute_uncertainties", default=True, action=argparse.BooleanOptionalAction, help='Trigger compute_uncertainty_measures.py') parser.add_argument( "--answerable_only", default=False, action=argparse.BooleanOptionalAction, help='Exclude unanswerable questions.') if 'compute' in stages: parser.add_argument('--recompute_accuracy', default=False, action=argparse.BooleanOptionalAction) parser.add_argument('--eval_wandb_runid', type=str, help='wandb run id of the dataset to evaluate on') parser.add_argument('--train_wandb_runid', type=str, default=None, help='wandb run id of the dataset from which training embeddings and p_true samples will be taken') parser.add_argument('--num_eval_samples', type=int, default=int(1e19)) parser.add_argument('--compute_predictive_entropy', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--compute_p_ik', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--compute_p_ik_answerable', default=False, action=argparse.BooleanOptionalAction) parser.add_argument('--compute_context_entails_response', default=False, action=argparse.BooleanOptionalAction) parser.add_argument('--analyze_run', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--assign_new_wandb_id', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--restore_entity_eval', type=str, default=entity) parser.add_argument('--restore_entity_train', type=str, default=entity) parser.add_argument('--condition_on_question', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--strict_entailment', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--use_all_generations', default=True, action=argparse.BooleanOptionalAction) parser.add_argument('--use_num_generations', type=int, default=-1) parser.add_argument("--entailment_model", default='deberta', type=str) parser.add_argument( "--entailment_cache_id", default=None, type=str, help='Restore entailment predictions from previous run for GPT-4/LLaMa-Entailment.') parser.add_argument('--entailment_cache_only', default=False, action=argparse.BooleanOptionalAction) parser.add_argument('--compute_p_true_in_compute_stage', default=False, action=argparse.BooleanOptionalAction) parser.add_argument('--reuse_entailment_model', default=False, action=argparse.BooleanOptionalAction, help='Use entailment model as p_true model.') return parser def setup_logger(): """Setup logger to always print time and level.""" logging.basicConfig( format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logging.getLogger().setLevel(logging.INFO) # logging.DEBUG def construct_fewshot_prompt_from_indices(dataset, example_indices, brief, brief_always, make_prompt): """Given a dataset and indices, construct a fewshot prompt.""" if not brief_always: prompt = brief else: prompt = '' for example_index in example_indices: example = dataset[example_index] context = example["context"] question = example["question"] answer = example["answers"]["text"][0] prompt = prompt + make_prompt(context, question, answer, brief, brief_always) return prompt def split_dataset(dataset): """Get indices of answerable and unanswerable questions.""" def clen(ex): return len(ex["answers"]["text"]) answerable_indices = [i for i, ex in enumerate(dataset) if clen(ex) > 0] unanswerable_indices = [i for i, ex in enumerate(dataset) if clen(ex) == 0] # union == full dataset assert set(answerable_indices) | set( unanswerable_indices) == set(range(len(dataset))) # no overlap assert set(answerable_indices) - \ set(unanswerable_indices) == set(answerable_indices) return answerable_indices, unanswerable_indices def model_based_metric(predicted_answer, example, model): if 'answers' in example: correct_answers = example['answers']['text'] elif 'reference' in example: correct_answers = example['reference']['answers']['text'] else: raise ValueError prompt = f'We are assessing the quality of answers to the following question: {example["question"]}\n' if len(correct_answers) == 1: prompt += f"The expected answer is: {correct_answers[0]}.\n" else: prompt += f"The following are expected answers to this question: {correct_answers}.\n" prompt += f"The proposed answer is: {predicted_answer}\n" if len(correct_answers) == 1: prompt += "Within the context of the question, does the proposed answer mean the same as the expected answer?" else: prompt += "Within the context of the question, does the proposed answer mean the same as any of the expected answers?" prompt += " Respond only with yes or no.\nResponse:" if 'gpt' in model.model_name.lower(): predicted_answer = model.predict(prompt, 0.01) else: predicted_answer, _, _ = model.predict(prompt, 0.01) if 'yes' in predicted_answer.lower(): return 1.0 elif 'no' in predicted_answer.lower(): return 0.0 else: logging.warning('Redo llm check.') predicted_answer, _, _ = model.predict(prompt, 1) if 'yes' in predicted_answer.lower(): return 1.0 elif 'no' in predicted_answer.lower(): return 0.0 logging.warning('Answer neither no nor yes. Defaulting to no!') return 0.0 def llm_metric(predicted_answer, example, model): return model_based_metric(predicted_answer, example, model) def get_gpt_metric(metric_name): model_name = '_'.join(metric_name.split('_')[1:]) class EntailmentGPT(): def __init__(self, model_name): self.model_name = model_name def predict(self, prompt, temperature): return oai.predict(prompt, temperature, model=self.model_name) gpt_model = EntailmentGPT(model_name) def gpt_metric(predicted_answer, example, model): del model return model_based_metric(predicted_answer, example, gpt_model) return gpt_metric def get_reference(example): if 'answers' not in example: example = example['reference'] answers = example['answers'] answer_starts = answers.get('answer_start', []) reference = {'answers': {'answer_start': answer_starts, 'text': answers['text']}, 'id': example['id']} return reference def init_model(args): mn = args.model_name if 'llama' in mn.lower() or 'falcon' in mn or 'mistral' in mn.lower(): model = HuggingfaceModel( mn, stop_sequences='default', max_new_tokens=args.model_max_new_tokens) else: raise ValueError(f'Unknown model_name `{mn}`.') return model def get_make_prompt(args): if args.prompt_type == 'default': def make_prompt(context, question, answer, brief, brief_always): prompt = '' if brief_always: prompt += brief if args.use_context and (context is not None): prompt += f"Context: {context}\n" prompt += f"Question: {question}\n" if answer: prompt += f"Answer: {answer}\n\n" else: prompt += 'Answer:' return prompt else: raise ValueError return make_prompt def get_metric(metric): if metric == 'squad': squad_metric = load("squad_v2") def metric(response, example, *args, **kwargs): # Compatibility with recomputation. if 'id' in example: exid = example['id'] elif 'id' in example['reference']: exid = example['reference']['id'] else: raise ValueError prediction = {'prediction_text': response, 'no_answer_probability': 0.0, 'id': exid} results = squad_metric.compute( predictions=[prediction], references=[get_reference(example)]) return 1.0 if (results['f1'] >= 50.0) else 0.0 # Reuses the globally active model for these. elif metric == 'llm': metric = llm_metric elif metric == 'llm_gpt-3.5': metric = get_gpt_metric(metric) elif metric == 'llm_gpt-4': metric = get_gpt_metric(metric) else: raise ValueError return metric def save(object, file): with open(f'{wandb.run.dir}/{file}', 'wb') as f: pickle.dump(object, f) wandb.save(f'{wandb.run.dir}/{file}')