gitextract_wvb_jtw0/

├── .gitignore
├── LICENCE
├── LICENSE
├── README.md
├── apply_delta.py
├── bbh.py
├── crass.py
├── docs/
│   ├── alignment_leaderboard.csv
│   ├── index.html
│   ├── models.md
│   ├── problem_solving_leaderboard.csv
│   └── writing_leaderboard.csv
├── drop.py
├── hhh.py
├── human_eval/
│   ├── __init__.py
│   ├── data.py
│   ├── evaluate_functional_correctness.py
│   ├── evaluation.py
│   ├── execution.py
│   └── main.py
├── lm_eval/
│   ├── __init__.py
│   ├── base.py
│   ├── evaluator.py
│   ├── main.py
│   ├── metrics.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── dummy.py
│   │   ├── gpt2.py
│   │   ├── gpt3.py
│   │   ├── llama.py
│   │   └── textsynth.py
│   ├── tasks/
│   │   ├── __init__.py
│   │   ├── anli.py
│   │   ├── arc.py
│   │   ├── arithmetic.py
│   │   ├── blimp.py
│   │   ├── cbt.py
│   │   ├── coqa.py
│   │   ├── drop.py
│   │   ├── glue.py
│   │   ├── gsm8k.py
│   │   ├── headqa.py
│   │   ├── hellaswag.py
│   │   ├── hendrycks_ethics.py
│   │   ├── hendrycks_math.py
│   │   ├── hendrycks_test.py
│   │   ├── lambada.py
│   │   ├── lambada_cloze.py
│   │   ├── lambada_multilingual.py
│   │   ├── mathqa.py
│   │   ├── mc_taco.py
│   │   ├── naturalqs.py
│   │   ├── openbookqa.py
│   │   ├── pile.py
│   │   ├── piqa.py
│   │   ├── prost.py
│   │   ├── pubmedqa.py
│   │   ├── qa4mre.py
│   │   ├── qasper.py
│   │   ├── quac.py
│   │   ├── race.py
│   │   ├── sciq.py
│   │   ├── squad.py
│   │   ├── storycloze.py
│   │   ├── superglue.py
│   │   ├── swag.py
│   │   ├── translation.py
│   │   ├── triviaqa.py
│   │   ├── truthfulqa.py
│   │   ├── webqs.py
│   │   ├── wikitext.py
│   │   ├── winogrande.py
│   │   └── wsc273.py
│   └── utils.py
├── main.py
├── mmlu.py
├── modeling.py
├── quant/
│   ├── __init__.py
│   ├── custom_autotune.py
│   ├── fused_attn.py
│   ├── fused_mlp.py
│   ├── quant_linear.py
│   ├── quantizer.py
│   └── triton_norm.py
├── red-eval/
│   ├── LICENSE
│   ├── README.md
│   ├── api_keys/
│   │   ├── chatgpt_api_key.json
│   │   └── gpt4_api_key.json
│   ├── generate_responses.py
│   ├── gpt4_as_judge.py
│   ├── hamrful_questions/
│   │   ├── dangerousqa.json
│   │   └── harmfulqs.json
│   ├── red_prompts/
│   │   ├── cot.txt
│   │   ├── cou.txt
│   │   ├── standard.txt
│   │   └── suffix.txt
│   ├── requirements.txt
│   └── results/
│       ├── dangerousqa_chatgpt_cou.json
│       ├── dangerousqa_chatgpt_cou_clean.json
│       ├── dangerousqa_gpt4_cou.json
│       ├── dangerousqa_gpt4_cou_clean.json
│       ├── dangerousqa_gpt4_cou_clean_labelled.xlsx
│       ├── dangerousqa_vicuna-7b-v1.3_cou_clean.json
│       └── dangerousqa_vicuna-7b-v1.3_cou_clean_labelled.xlsx
├── requirements.txt
└── subjective.py