SYMBOL INDEX (2242 symbols across 296 files) FILE: .github/workflows/parse_yaml.py function get_first_key (line 6) | def get_first_key(file_path): FILE: evals/api.py class CompletionResult (line 16) | class CompletionResult(ABC): method get_completions (line 18) | def get_completions(self) -> list[str]: class CompletionFn (line 23) | class CompletionFn(Protocol): method __call__ (line 24) | def __call__( class DummyCompletionResult (line 43) | class DummyCompletionResult(CompletionResult): method get_completions (line 44) | def get_completions(self) -> list[str]: class DummyCompletionFn (line 48) | class DummyCompletionFn(CompletionFn): method __call__ (line 49) | def __call__( function record_and_check_match (line 55) | def record_and_check_match( FILE: evals/base.py class CompletionFnSpec (line 18) | class CompletionFnSpec: class BaseEvalSpec (line 30) | class BaseEvalSpec: class EvalSpec (line 51) | class EvalSpec: class EvalSetSpec (line 64) | class EvalSetSpec: class RunSpec (line 75) | class RunSpec: method __post_init__ (line 85) | def __post_init__(self): FILE: evals/cli/oaieval.py function _purple (line 21) | def _purple(str: str) -> str: function get_parser (line 25) | def get_parser() -> argparse.ArgumentParser: class OaiEvalArguments (line 96) | class OaiEvalArguments(argparse.Namespace): function run (line 118) | def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> ... function build_recorder (line 242) | def build_recorder( function add_token_usage_to_result (line 269) | def add_token_usage_to_result(result: dict[str, Any], recorder: Recorder... function main (line 297) | def main() -> None: FILE: evals/cli/oaievalset.py class Progress (line 17) | class Progress: method __init__ (line 18) | def __init__(self, file: str) -> None: method load (line 22) | def load(self) -> bool: method add (line 31) | def add(self, item: Task) -> None: method save (line 35) | def save(self) -> None: function highlight (line 43) | def highlight(str: str) -> str: function get_parser (line 47) | def get_parser() -> argparse.ArgumentParser: class OaiEvalSetArguments (line 73) | class OaiEvalSetArguments(argparse.Namespace): function run (line 81) | def run( function main (line 134) | def main() -> None: FILE: evals/completion_fns/cot.py class ChainOfThoughtCompletionResult (line 15) | class ChainOfThoughtCompletionResult(CompletionResult): method __init__ (line 16) | def __init__(self, response) -> None: method get_completions (line 19) | def get_completions(self) -> list[str]: class ChainOfThoughtCompletionFn (line 23) | class ChainOfThoughtCompletionFn(CompletionFn): method __init__ (line 24) | def __init__( method __call__ (line 49) | def __call__(self, prompt, **kwargs) -> ChainOfThoughtCompletionResult: FILE: evals/completion_fns/langchain_llm.py class LangChainLLMCompletionResult (line 20) | class LangChainLLMCompletionResult(CompletionResult): method __init__ (line 21) | def __init__(self, response) -> None: method get_completions (line 24) | def get_completions(self) -> list[str]: class LangChainLLMCompletionFn (line 28) | class LangChainLLMCompletionFn(CompletionFn): method __init__ (line 29) | def __init__(self, llm: str, llm_kwargs: Optional[dict] = None, **kwar... method __call__ (line 42) | def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: function _convert_dict_to_langchain_message (line 49) | def _convert_dict_to_langchain_message(_dict) -> BaseMessage: class LangChainChatModelCompletionFn (line 68) | class LangChainChatModelCompletionFn(CompletionFn): method __init__ (line 69) | def __init__(self, llm: str, chat_model_kwargs: Optional[dict] = None,... method __call__ (line 82) | def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: FILE: evals/completion_fns/langchain_math.py class LangChainCompletionResult (line 11) | class LangChainCompletionResult(CompletionResult): method __init__ (line 12) | def __init__(self, response) -> None: method get_completions (line 15) | def get_completions(self) -> list[str]: class LangChainMathChainCompletionFn (line 19) | class LangChainMathChainCompletionFn(CompletionFn): method __init__ (line 20) | def __init__(self, **kwargs) -> None: method __call__ (line 24) | def __call__(self, prompt, **kwargs) -> LangChainCompletionResult: FILE: evals/completion_fns/openai.py function openai_completion_create_retrying (line 27) | def openai_completion_create_retrying(client: OpenAI, *args, **kwargs): function openai_chat_completion_create_retrying (line 41) | def openai_chat_completion_create_retrying(client: OpenAI, *args, **kwar... class OpenAIBaseCompletionResult (line 55) | class OpenAIBaseCompletionResult(CompletionResult): method __init__ (line 56) | def __init__(self, raw_data: Any, prompt: Any): method get_completions (line 60) | def get_completions(self) -> list[str]: class OpenAIChatCompletionResult (line 64) | class OpenAIChatCompletionResult(OpenAIBaseCompletionResult): method get_completions (line 65) | def get_completions(self) -> list[str]: class OpenAICompletionResult (line 74) | class OpenAICompletionResult(OpenAIBaseCompletionResult): method get_completions (line 75) | def get_completions(self) -> list[str]: class OpenAICompletionFn (line 83) | class OpenAICompletionFn(CompletionFn): method __init__ (line 84) | def __init__( method __call__ (line 99) | def __call__( class OpenAIChatCompletionFn (line 134) | class OpenAIChatCompletionFn(CompletionFnSpec): method __init__ (line 135) | def __init__( method __call__ (line 149) | def __call__( FILE: evals/completion_fns/retrieval.py function load_embeddings (line 20) | def load_embeddings(embeddings_and_text_path: str): function find_top_k_closest_embeddings (line 28) | def find_top_k_closest_embeddings(embedded_prompt: list[float], embs: li... class RetrievalCompletionResult (line 45) | class RetrievalCompletionResult(CompletionResult): method __init__ (line 46) | def __init__(self, response: str) -> None: method get_completions (line 49) | def get_completions(self) -> list[str]: class RetrievalCompletionFn (line 53) | class RetrievalCompletionFn(CompletionFn): method __init__ (line 58) | def __init__( method __call__ (line 91) | def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> R... FILE: evals/completion_fns/solver_completion_fn.py class SolverCompletionFnResult (line 10) | class SolverCompletionFnResult(CompletionResult): method __init__ (line 11) | def __init__(self, msg): method get_completions (line 14) | def get_completions(self): class SolverCompletionFn (line 18) | class SolverCompletionFn(CompletionFn): method __init__ (line 32) | def __init__(self, solver: Union[SolverSpec, Solver], registry: Any = ... method __call__ (line 38) | def __call__( FILE: evals/data.py function gzip_open (line 25) | def gzip_open(filename: str, mode: str = "rb", openhook: Any = open) -> ... function lz4_open (line 33) | def lz4_open(filename: str, mode: str = "rb", openhook: Any = open) -> l... function zstd_open (line 40) | def zstd_open(filename: str, mode: str = "rb", openhook: Any = open) -> ... function open_by_file_pattern (line 47) | def open_by_file_pattern(filename: Union[str, Path], mode: str = "r", **... function _decode_json (line 82) | def _decode_json(line, path, line_number): function _get_jsonl_file (line 93) | def _get_jsonl_file(path): function _get_json_file (line 99) | def _get_json_file(path): function _stream_jsonl_file (line 105) | def _stream_jsonl_file(path) -> Iterator: function get_lines (line 112) | def get_lines(path) -> list[dict]: function get_jsonl (line 120) | def get_jsonl(path: str) -> list[dict]: function get_jsonls (line 136) | def get_jsonls(paths: Sequence[str], line_limit=None) -> list[dict]: function get_json (line 140) | def get_json(path) -> dict: function iter_jsonls (line 146) | def iter_jsonls(paths: Union[str, list[str]], line_limit=None) -> Iterat... function get_csv (line 168) | def get_csv(path, fieldnames=None): function _to_py_types (line 174) | def _to_py_types(o: Any, exclude_keys: List[Text]) -> Any: class EnhancedJSONEncoder (line 202) | class EnhancedJSONEncoder(json.JSONEncoder): method __init__ (line 203) | def __init__(self, exclude_keys: Optional[List[Text]] = None, **kwargs... method default (line 207) | def default(self, o: Any) -> str: function jsondumps (line 211) | def jsondumps(o: Any, ensure_ascii: bool = False, **kwargs: Any) -> str: function jsondump (line 221) | def jsondump(o: Any, fp: Any, ensure_ascii: bool = False, **kwargs: Any)... function jsonloads (line 225) | def jsonloads(s: str, **kwargs: Any) -> Any: function jsonload (line 229) | def jsonload(fp: Any, **kwargs: Any) -> Any: FILE: evals/data_test.py class MyPydanticClass (line 9) | class MyPydanticClass(BaseModel): class MyDataClass (line 15) | class MyDataClass: function test_jsondumps (line 21) | def test_jsondumps(): FILE: evals/elsuite/already_said_that/distractors.py class DistractorSample (line 9) | class DistractorSample: function proc_which_is_heavier (line 23) | def proc_which_is_heavier(samples) -> list[DistractorSample]: function proc_distractors_first_letters (line 33) | def proc_distractors_first_letters(samples) -> list[DistractorSample]: function proc_distractors_ambiguous_sentences (line 42) | def proc_distractors_ambiguous_sentences(samples) -> list[DistractorSamp... function proc_distractors_reverse_sort_words_eng (line 52) | def proc_distractors_reverse_sort_words_eng(samples) -> list[DistractorS... function get_basic_distractor_example (line 72) | def get_basic_distractor_example() -> DistractorSample: function get_distractors (line 80) | def get_distractors(variant: str) -> list[DistractorSample]: function get_samples (line 97) | def get_samples(eval_name) -> list[dict]: function get_full_path (line 112) | def get_full_path(data_path, registry_path) -> Path: function get_distractor_word (line 119) | def get_distractor_word(question: str) -> str: FILE: evals/elsuite/already_said_that/eval.py class AlreadySaidThat (line 14) | class AlreadySaidThat(SolverEval): method __init__ (line 15) | def __init__( method eval_sample (line 40) | def eval_sample(self, solver: Solver, sample: dict, rng: random.Random... method _conversation_loop (line 49) | def _conversation_loop( method run (line 118) | def run(self, recorder: RecorderBase): method _compute_agg_metrics (line 126) | def _compute_agg_metrics(self, logged_metrics: list[dict]) -> dict: method _get_samples (line 157) | def _get_samples(self) -> list[dict]: FILE: evals/elsuite/already_said_that/scripts/gen_data.py function process_wordnet (line 11) | def process_wordnet() -> list[str]: function gen_sample (line 28) | def gen_sample(words_corpus: list[str], n_words, rng: random.Random) -> ... function gen_samples (line 33) | def gen_samples(n_samples: int, n_words: int, rng: random.Random) -> lis... function write_to_jsonl (line 42) | def write_to_jsonl( function main (line 51) | def main(args: argparse.Namespace): FILE: evals/elsuite/already_said_that/scripts/make_plots.py function zero_if_none (line 13) | def zero_if_none(input_num): function make_results_dict (line 92) | def make_results_dict(log_dir: Path) -> dict: function prepare_results_dict (line 98) | def prepare_results_dict() -> dict: function fill_results_dict (line 118) | def fill_results_dict(results_dict: dict, log_dir: Path) -> dict: function get_model (line 145) | def get_model(spec): function make_bar_plot (line 167) | def make_bar_plot(results_dict: dict, stat: str, save_path: Path): function count_tokens (line 257) | def count_tokens(log_dir) -> dict[str, dict[str, dict[str, int]]]: function main (line 297) | def main(args: argparse.Namespace): FILE: evals/elsuite/already_said_that/solvers.py class RandomBaselineSolver (line 8) | class RandomBaselineSolver(Solver): method __init__ (line 9) | def __init__(self, registry: Any = None): method _solve (line 12) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: class AlreadySaidThatHuman (line 17) | class AlreadySaidThatHuman(NestedSolver): method __init__ (line 18) | def __init__(self, human_cli_solver: SolverSpec, *args, **kwargs): method human_cli_solver (line 22) | def human_cli_solver(self) -> Solver: method _solve (line 25) | def _solve(self, task_state: TaskState) -> SolverResult: method _map_to_yesno (line 32) | def _map_to_yesno(self, yesno_ish): FILE: evals/elsuite/already_said_that/test_distractors.py function which_is_heavier_samples (line 12) | def which_is_heavier_samples(): function first_letters_samples (line 38) | def first_letters_samples(): function ambiguous_sentences_samples (line 70) | def ambiguous_sentences_samples(): function reverse_sort_words_eng_samples (line 96) | def reverse_sort_words_eng_samples(): function test_proc_distractors_which_is_heavier (line 127) | def test_proc_distractors_which_is_heavier(which_is_heavier_samples): function test_proc_distractors_first_letter (line 139) | def test_proc_distractors_first_letter(first_letters_samples): function test_proc_distractors_ambiguous_sentences (line 154) | def test_proc_distractors_ambiguous_sentences(ambiguous_sentences_samples): function test_proc_distractors_reverse_sort_words_eng (line 169) | def test_proc_distractors_reverse_sort_words_eng(reverse_sort_words_eng_... FILE: evals/elsuite/already_said_that/utils.py function build_message (line 9) | def build_message( function build_base_task_message (line 50) | def build_base_task_message( function build_distractor_question_message (line 72) | def build_distractor_question_message( function find_alpha_words (line 81) | def find_alpha_words(s: str) -> list[str]: function parse_solver_output (line 92) | def parse_solver_output( function eval_distractor_task (line 151) | def eval_distractor_task(answer: str, ideal: str) -> bool: function eval_main_task (line 156) | def eval_main_task(answer, curr_word, words_prev_shown): FILE: evals/elsuite/ballots/eval.py class BallotsEval (line 29) | class BallotsEval(Eval): method __init__ (line 30) | def __init__( method eval_sample (line 58) | def eval_sample(self, sample: Any, *_): method run (line 161) | def run(self, recorder): FILE: evals/elsuite/ballots/scripts/make_plots.py function main (line 12) | def main(): function load_bp_results_from_dir (line 24) | def load_bp_results_from_dir(log_dir: str) -> pd.DataFrame: function compute_bp_metrics_from_individual_results (line 47) | def compute_bp_metrics_from_individual_results(results): function parse_spec (line 73) | def parse_spec(spec: dict) -> tuple[str, str, int]: function make_plots (line 91) | def make_plots(df: pd.DataFrame, out_dir: Path): function make_yes_ratio_by_model_plot (line 105) | def make_yes_ratio_by_model_plot(df): function make_yes_ratio_by_target_plot (line 110) | def make_yes_ratio_by_target_plot(df): function make_success_rate_plot (line 116) | def make_success_rate_plot(df): function extract_vote_data_from_df (line 121) | def extract_vote_data_from_df(df): function _make_model_plot (line 159) | def _make_model_plot(bars_dict): function _make_target_plot (line 195) | def _make_target_plot(bars_dict): function _make_success_plot (line 255) | def _make_success_plot(bars_dict): FILE: evals/elsuite/ballots/utils.py function toks_to_id (line 59) | def toks_to_id(model: str, toks: list[str], validation: Optional[list[st... function format_messages (line 72) | def format_messages(messages, *format_args, **format_kwargs): function format_prompt (line 83) | def format_prompt(prompt, *format_args, **format_kwargs): function get_influencer_prompt (line 91) | def get_influencer_prompt(model, direction): function get_voter_prompt (line 115) | def get_voter_prompt(model): function prompt_matches_model (line 122) | def prompt_matches_model(model, prompt): function reverse_roles (line 129) | def reverse_roles(messages): function chat_to_text (line 139) | def chat_to_text(messages): FILE: evals/elsuite/basic/fuzzy_match.py class FuzzyMatch (line 9) | class FuzzyMatch(evals.Eval): method __init__ (line 10) | def __init__( method eval_sample (line 23) | def eval_sample(self, test_sample, rng): method run (line 53) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/basic/fuzzy_match_test.py function test_eval_sample (line 21) | def test_eval_sample( function test_eval_sample_raises (line 49) | def test_eval_sample_raises(sample: Any, expected_error: Type): FILE: evals/elsuite/basic/includes.py class Includes (line 9) | class Includes(evals.Eval): method __init__ (line 10) | def __init__( method eval_sample (line 23) | def eval_sample(self, sample: Any, *_): method run (line 50) | def run(self, recorder): FILE: evals/elsuite/basic/includes_test.py function test_eval_sample (line 23) | def test_eval_sample( function test_eval_sample_raises (line 55) | def test_eval_sample_raises(sample: Any, expected_error: Type): FILE: evals/elsuite/basic/json_match.py function json_match (line 12) | def json_match(sampled_json: Any, correct_json: Any) -> bool: class JsonMatch (line 40) | class JsonMatch(evals.Eval): method __init__ (line 46) | def __init__( method eval_sample (line 59) | def eval_sample(self, sample: Any, rng: random.Random): method run (line 100) | def run(self, recorder: RecorderBase) -> Dict[str, float]: FILE: evals/elsuite/basic/json_match_test.py function test_eval_sample (line 62) | def test_eval_sample( function test_eval_sample_raises (line 90) | def test_eval_sample_raises(sample: Any, expected_error: Type[Exception]... FILE: evals/elsuite/basic/json_validator.py function is_valid_json (line 10) | def is_valid_json(s): class JsonValidator (line 18) | class JsonValidator(evals.Eval): method __init__ (line 19) | def __init__( method eval_sample (line 30) | def eval_sample(self, sample: Any, *_): method run (line 42) | def run(self, recorder): FILE: evals/elsuite/basic/json_validator_test.py function test_eval_sample (line 20) | def test_eval_sample( function test_eval_sample_raises (line 46) | def test_eval_sample_raises(sample: Any, expected_error: Type): FILE: evals/elsuite/basic/match.py class Match (line 9) | class Match(evals.Eval): method __init__ (line 10) | def __init__( method eval_sample (line 30) | def eval_sample(self, sample: Any, *_): method run (line 58) | def run(self, recorder): FILE: evals/elsuite/basic/match_test.py function test_eval_sample (line 19) | def test_eval_sample( function test_eval_sample_2 (line 46) | def test_eval_sample_2( function test_eval_sample_raises (line 76) | def test_eval_sample_raises(sample: Any, expected_error: Type): FILE: evals/elsuite/basic/match_with_solvers.py class MatchWithSolvers (line 15) | class MatchWithSolvers(SolverEval): method __init__ (line 16) | def __init__( method eval_sample (line 42) | def eval_sample(self, solver: Solver, sample: Any, *_): method run (line 68) | def run(self, recorder): FILE: evals/elsuite/bluff/bluff/cards.py class PlayerCards (line 12) | class PlayerCards: method __init__ (line 13) | def __init__(self, cards: list[str]): method no_suit (line 21) | def no_suit(self): method lm_format (line 24) | def lm_format(self): method _suit_repr (line 34) | def _suit_repr(self, suit): method __repr__ (line 38) | def __repr__(self): function get_poker_hand (line 42) | def get_poker_hand(txt: str) -> "PokerHand": function get_bluff_move (line 59) | def get_bluff_move(txt: str) -> BluffMove: function get_all_hands (line 66) | def get_all_hands(): function get_all_winning_hands (line 78) | def get_all_winning_hands(*in_cards: PlayerCards): class PokerHand (line 112) | class PokerHand: method __eq__ (line 113) | def __eq__(self, other): method __lt__ (line 116) | def __lt__(self, other): method __repr__ (line 125) | def __repr__(self): method evaluate (line 128) | def evaluate(self, *player_cards: PlayerCards) -> bool: class HighCard (line 140) | class HighCard(PokerHand): method __init__ (line 143) | def __init__(self, card: str): method cards (line 146) | def cards(self) -> str: method from_string (line 150) | def from_string(cls, txt): method all (line 155) | def all(self): class OnePair (line 159) | class OnePair(PokerHand): method __init__ (line 162) | def __init__(self, card: str): method cards (line 165) | def cards(self) -> str: method from_string (line 169) | def from_string(cls, txt): method all (line 174) | def all(cls): class TwoPair (line 178) | class TwoPair(PokerHand): method __init__ (line 181) | def __init__(self, card_1: str, card_2: str): method cards (line 191) | def cards(self) -> str: method from_string (line 195) | def from_string(cls, txt): method all (line 207) | def all(cls): class ThreeOfAKind (line 216) | class ThreeOfAKind(PokerHand): method __init__ (line 219) | def __init__(self, card: str): method cards (line 222) | def cards(self) -> str: method from_string (line 226) | def from_string(cls, txt): method all (line 231) | def all(cls): class FullHouse (line 235) | class FullHouse(PokerHand): method __init__ (line 238) | def __init__(self, card_triple: str, card_pair: str): method cards (line 244) | def cards(self) -> str: method from_string (line 248) | def from_string(cls, in_txt): method all (line 263) | def all(cls): class FourOfAKind (line 272) | class FourOfAKind(PokerHand): method __init__ (line 275) | def __init__(self, card: str): method cards (line 278) | def cards(self) -> str: method from_string (line 282) | def from_string(cls, txt): method all (line 287) | def all(cls): FILE: evals/elsuite/bluff/bluff/game.py class Game (line 13) | class Game: method __init__ (line 14) | def __init__( method task_description (line 24) | def task_description(self): method play (line 27) | def play(self): method make_move (line 41) | def make_move(self, player: "Player", move: BluffMove) -> None: method add_player (line 45) | def add_player(self, player: "Player"): method player_cards (line 50) | def player_cards(self, player: "Player") -> str: method _deal_cards (line 54) | def _deal_cards(self): FILE: evals/elsuite/bluff/bluff/players.py class Player (line 14) | class Player: method __init__ (line 15) | def __init__(self, game: Game): method cards (line 21) | def cards(self) -> PlayerCards: method make_move (line 24) | def make_move(self) -> None: class SimplestBot (line 28) | class SimplestBot(Player): method make_move (line 29) | def make_move(self): class HonestBotLowest (line 37) | class HonestBotLowest(Player): method make_move (line 38) | def make_move(self): class HonestBotHighest (line 51) | class HonestBotHighest(Player): method make_move (line 52) | def make_move(self): class StrongBot (line 61) | class StrongBot(Player): method make_move (line 81) | def make_move(self): method get_move (line 85) | def get_move(self): method _bluff_bid (line 93) | def _bluff_bid(self): method _honest_bid (line 101) | def _honest_bid(self): method _bid_higher_or_call_bluff (line 111) | def _bid_higher_or_call_bluff(self): method _get_winning_hand_probability (line 146) | def _get_winning_hand_probability(self) -> float: method _random_opp_hand (line 161) | def _random_opp_hand(self) -> PlayerCards: FILE: evals/elsuite/bluff/bluff/round.py class BluffRound (line 6) | class BluffRound: method __init__ (line 9) | def __init__(self, player_1_cards: PlayerCards, player_2_cards: Player... method finished (line 15) | def finished(self) -> bool: method summary (line 18) | def summary(self) -> tuple[int, int, tuple[PlayerCards, PlayerCards]]: method make_move (line 23) | def make_move(self, player_ix: int, move: Union[BluffMove, str]): method _finalize (line 45) | def _finalize(self): FILE: evals/elsuite/bluff/bluff/test_bluff_game.py function test_bluff_rules (line 21) | def test_bluff_rules(sequence, expected): FILE: evals/elsuite/bluff/eval.py class BluffEval (line 25) | class BluffEval(SolverEval): method __init__ (line 26) | def __init__( method eval_sample (line 42) | def eval_sample(self, solver: Solver, sample_ix: int, rng: random.Rand... method _get_player_info (line 73) | def _get_player_info(self, player: Player) -> str: method run (line 79) | def run(self, recorder: evals.record.Recorder) -> dict[str, Union[floa... method _get_game_metrics (line 154) | def _get_game_metrics(self, game: Game) -> dict: method _create_opponent (line 166) | def _create_opponent(self, game: Game) -> Player: method _create_human_player (line 181) | def _create_human_player(game: Game) -> Player: method _create_solver_player (line 189) | def _create_solver_player(game: Game, solver_name: str) -> Player: method _create_bot_player (line 198) | def _create_bot_player(game: Game, module_and_class: str) -> Player: FILE: evals/elsuite/bluff/scripts/make_plots.py function extract_results (line 11) | def extract_results(datadir: Path) -> tuple[pd.DataFrame, pd.DataFrame, ... function make_main_metric_plots (line 49) | def make_main_metric_plots(df: pd.DataFrame, palette: dict, outdir: Path... function _make_main_metric_plot (line 59) | def _make_main_metric_plot(df: pd.DataFrame, palette: dict, opponent: st... function make_per_round_plots (line 95) | def make_per_round_plots(df: pd.DataFrame, palette: dict, outdir: Path) ... function _make_per_round_plot (line 106) | def _make_per_round_plot(df: pd.DataFrame, palette: dict, opponent: str,... function main (line 125) | def main(): FILE: evals/elsuite/bluff/solver_player.py class SolverPlayer (line 18) | class SolverPlayer(Player): method __init__ (line 19) | def __init__(self, game: Game, solver: Solver, prompts_override: dict ... method make_move (line 29) | def make_move(self) -> None: method _request_bid (line 36) | def _request_bid(self) -> None: method _bid (line 51) | def _bid(self, num_attempts: int = 3) -> None: method _previous_round_summary (line 78) | def _previous_round_summary(self) -> None: method _get_response (line 95) | def _get_response(self) -> str: method _add_message (line 100) | def _add_message(self, role: str, content: str) -> None: FILE: evals/elsuite/bluff/strategy_solver.py class BluffStrategySolver (line 12) | class BluffStrategySolver(Solver): method __init__ (line 13) | def __init__( method _generate_response (line 32) | def _generate_response(self, task_state: TaskState): method _solve (line 45) | def _solve(self, task_state: TaskState): method name (line 116) | def name(self) -> str: method _parse_response (line 119) | def _parse_response(self, response: str) -> str: FILE: evals/elsuite/bugged_tools/bugged_tools.py function explicit_error (line 11) | def explicit_error(tool_task_state: ToolTaskState, non_bugged_func: Call... function offset_numeric (line 22) | def offset_numeric(correct_output: Union[int, float], offset: Union[int,... function small_offset (line 31) | def small_offset(tool_task_state: ToolTaskState, non_bugged_func: Callab... function large_offset (line 52) | def large_offset(tool_task_state: ToolTaskState, non_bugged_func: Callab... function random_output (line 75) | def random_output(tool_task_state: ToolTaskState, non_bugged_func: Calla... function incorrect_type (line 113) | def incorrect_type(tool_task_state: ToolTaskState, non_bugged_func: Call... FILE: evals/elsuite/bugged_tools/eval.py class BuggedTools (line 29) | class BuggedTools(SolverEval): method __init__ (line 30) | def __init__( method eval_sample (line 65) | def eval_sample(self, solver: Solver, sample: Any, rng: random.Random): method run (line 112) | def run(self, recorder: evals.record.Recorder) -> dict[str, Union[floa... method _log_additional_metrics (line 152) | def _log_additional_metrics(self, metrics: Sequence[Event], results: d... method _get_tools (line 185) | def _get_tools(self, sample: Any) -> dict: method _find_flag_from_assistant (line 211) | def _find_flag_from_assistant( method _judge_find_bugs (line 226) | def _judge_find_bugs(self, all_messages: Sequence[Message]): method _solver_predicted_bug (line 247) | def _solver_predicted_bug(self, sample: Any, all_messages: Sequence[Me... method _solver_used_bugged_input (line 258) | def _solver_used_bugged_input(self, sample: Any, all_messages: Sequenc... FILE: evals/elsuite/bugged_tools/scripts/plot_experiments.py function extract_results (line 11) | def extract_results(datadir: Path) -> pd.DataFrame: function plot_results (line 28) | def plot_results(df: pd.DataFrame, out_dir: Path, plot_horizontal: bool): function main (line 122) | def main(): FILE: evals/elsuite/bugged_tools/tools.py class ToolTaskState (line 11) | class ToolTaskState: class ToolResult (line 18) | class ToolResult: class Tool (line 22) | class Tool: method __call__ (line 24) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class BuggedTool (line 28) | class BuggedTool(Tool): method __init__ (line 29) | def __init__(self, tool: Callable, bugged_input: Sequence[int], bugged... method __call__ (line 37) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class Dummy (line 46) | class Dummy(Tool): method __call__ (line 51) | def __call__(self, _) -> ToolResult: class HalveRoundDown (line 55) | class HalveRoundDown(Tool): method __call__ (line 60) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class Double (line 70) | class Double(Tool): method __call__ (line 75) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class IsPrime (line 85) | class IsPrime(Tool): method __call__ (line 90) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class CalcSumDigits (line 104) | class CalcSumDigits(Tool): method __call__ (line 109) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class CollatzLength (line 119) | class CollatzLength(Tool): method __call__ (line 124) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class HammingDistance (line 143) | class HammingDistance(Tool): method __call__ (line 148) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class CountDivisors (line 162) | class CountDivisors(Tool): method __call__ (line 167) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class SumOfPalindromes (line 185) | class SumOfPalindromes(Tool): method __call__ (line 190) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class MaxPrimeFactor (line 200) | class MaxPrimeFactor(Tool): method __call__ (line 205) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class IsPronic (line 229) | class IsPronic(Tool): method __call__ (line 234) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class NonDivThreeSum (line 248) | class NonDivThreeSum(Tool): method __call__ (line 253) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class SequenceRearrange (line 263) | class SequenceRearrange(Tool): method __call__ (line 268) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class PrimeSummation (line 278) | class PrimeSummation(Tool): method __call__ (line 283) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class NthLucas (line 296) | class NthLucas(Tool): method __call__ (line 301) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class DecimalToBinary (line 313) | class DecimalToBinary(Tool): method __call__ (line 318) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class ParitySortDescending (line 329) | class ParitySortDescending(Tool): method __call__ (line 334) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class SumOfOddFibNumbers (line 347) | class SumOfOddFibNumbers(Tool): method __call__ (line 352) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class SumOfCubes (line 369) | class SumOfCubes(Tool): method __call__ (line 374) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class ProductOfDigitDifferences (line 384) | class ProductOfDigitDifferences(Tool): method __call__ (line 389) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class XORChecksum (line 403) | class XORChecksum(Tool): method __call__ (line 408) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class HammingWeight (line 420) | class HammingWeight(Tool): method __call__ (line 425) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class ReverseBinary (line 435) | class ReverseBinary(Tool): method __call__ (line 440) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class DigitProduct (line 450) | class DigitProduct(Tool): method __call__ (line 455) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class CalculateLongestRunOfOnes (line 467) | class CalculateLongestRunOfOnes(Tool): method __call__ (line 472) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class AlternatingSumDigits (line 483) | class AlternatingSumDigits(Tool): method __call__ (line 488) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class CircularShift (line 498) | class CircularShift(Tool): method __call__ (line 503) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class TrailingZerosInFactorial (line 519) | class TrailingZerosInFactorial(Tool): method __call__ (line 524) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class ReverseStr (line 540) | class ReverseStr(Tool): method __call__ (line 545) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class FindUniqueChars (line 552) | class FindUniqueChars(Tool): method __call__ (line 557) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class StringSort (line 567) | class StringSort(Tool): method __call__ (line 572) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class ReplaceVowelsWithSum (line 579) | class ReplaceVowelsWithSum(Tool): method __call__ (line 584) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class InterleaveChars (line 594) | class InterleaveChars(Tool): method __call__ (line 599) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: class RotateString (line 606) | class RotateString(Tool): method __call__ (line 611) | def __call__(self, tool_task_state: ToolTaskState) -> ToolResult: FILE: evals/elsuite/bugged_tools/utils.py function calculate_accuracy (line 8) | def calculate_accuracy(tp: int, fp: int, tn: int, fn: int): function calculate_precision (line 13) | def calculate_precision(tp: int, fp: int): function calculate_recall (line 21) | def calculate_recall(tp: int, fn: int): function calculate_f1 (line 29) | def calculate_f1(precision: float, recall: float): function precision_recall_fscore (line 37) | def precision_recall_fscore(metrics: Sequence[dict]): function try_cast_from_str (line 64) | def try_cast_from_str(n: str, cast_type: type): FILE: evals/elsuite/cant_do_that_anymore/chess/board.py class Board (line 14) | class Board: method __init__ (line 21) | def __init__( method __str__ (line 33) | def __str__(self) -> str: method _update_board (line 65) | def _update_board(self, move: Move): method _get_player_moves (line 103) | def _get_player_moves(self, player_id: str, previous_moves: Sequence[M... method _is_king_in_check (line 124) | def _is_king_in_check(self, player_id: str) -> bool: method _filter_for_king_capturing_moves (line 131) | def _filter_for_king_capturing_moves( class BoardController (line 145) | class BoardController: method __init__ (line 152) | def __init__( method __str__ (line 165) | def __str__(self) -> str: method update_board (line 168) | def update_board(self, move: str): method get_player_legal_moves (line 177) | def get_player_legal_moves(self, player_id: str) -> Sequence[str]: method _filter_to_prevent_pinning (line 190) | def _filter_to_prevent_pinning(self, moves: Sequence[Move], player_id:... method _is_checkmate (line 234) | def _is_checkmate(self, player_id: str) -> bool: method _is_stalemate (line 240) | def _is_stalemate(self, player_id: str) -> bool: FILE: evals/elsuite/cant_do_that_anymore/chess/board_test.py function default_board_init (line 22) | def default_board_init() -> Sequence[Sequence[str]]: function simulate_games (line 37) | def simulate_games(): FILE: evals/elsuite/cant_do_that_anymore/chess/notation.py function row_idx_swap (line 12) | def row_idx_swap(n: int) -> int: function coord_str_to_pos (line 16) | def coord_str_to_pos(s: str) -> Sequence[int]: function coord_pos_to_str (line 23) | def coord_pos_to_str(s: str) -> str: class NotationParser (line 29) | class NotationParser: method __init__ (line 30) | def __init__(self, piece_str_to_id, piece_id_to_str) -> None: method _str_to_move (line 35) | def _str_to_move(self, s: str, board_state: Sequence[Sequence[int]], p... method _move_to_str (line 39) | def _move_to_str(self, move: Move, board_state: Sequence[Sequence[int]... class AlgebraicNotationParser (line 43) | class AlgebraicNotationParser(NotationParser): method _str_to_move (line 62) | def _str_to_move(self, s: str, board_state: Sequence[Sequence[int]]) -... method _move_to_str (line 93) | def _move_to_str(self, move: Move, board_state: Sequence[Sequence[int]... FILE: evals/elsuite/cant_do_that_anymore/chess/pieces.py class Piece (line 15) | class Piece: method __init__ (line 16) | def __init__( method get_piece_moves (line 45) | def get_piece_moves( method _get_moves_from_transformations (line 90) | def _get_moves_from_transformations( method _get_pawn_double_step_transformations (line 131) | def _get_pawn_double_step_transformations( method _remove_illegal_pawn_capture_transformations (line 140) | def _remove_illegal_pawn_capture_transformations( method _get_en_passant_transformations (line 173) | def _get_en_passant_transformations( method _add_promotion_moves (line 199) | def _add_promotion_moves(self, piece_moves: Sequence[Move]) -> Sequenc... method _get_castling_possible_moves (line 213) | def _get_castling_possible_moves( FILE: evals/elsuite/cant_do_that_anymore/chess/utils.py class Move (line 6) | class Move: function get_other_player_id (line 13) | def get_other_player_id(this_player_id: str) -> str: function parse_piece (line 22) | def parse_piece( function move_crosses_pieces (line 37) | def move_crosses_pieces(board_state: Sequence[Sequence[int]], move: Move... function has_piece_been_moved (line 46) | def has_piece_been_moved( function coord_within_board (line 57) | def coord_within_board(row_idx: int, col_idx: int) -> bool: function move_within_board (line 66) | def move_within_board(move: Move) -> bool: function get_path_between_coords (line 71) | def get_path_between_coords( function same_color_piece_at_move_start (line 94) | def same_color_piece_at_move_start( function capturing_same_color (line 102) | def capturing_same_color(board_state: Sequence[Sequence[int]], move: Mov... FILE: evals/elsuite/cant_do_that_anymore/eval.py class CantDoThatAnymore (line 38) | class CantDoThatAnymore(SolverEval): method __init__ (line 39) | def __init__( method eval_sample (line 56) | def eval_sample(self, solver: Solver, sample: Any, rng: random.Random): method run (line 115) | def run(self, recorder: RecorderBase) -> dict[str, Union[float, int]]: method get_violations (line 160) | def get_violations( FILE: evals/elsuite/cant_do_that_anymore/scripts/dataset_creation.py function prepare_lichess_2014_dataset (line 21) | def prepare_lichess_2014_dataset(out_dir: str) -> str: class MoveFilter (line 44) | class MoveFilter: method __call__ (line 45) | def __call__( class SpecialMoveFilter (line 55) | class SpecialMoveFilter(MoveFilter): method __call__ (line 62) | def __call__( class ControlMoveFilter (line 77) | class ControlMoveFilter(MoveFilter): method __call__ (line 89) | def __call__( function is_move_illegal (line 122) | def is_move_illegal(controller: BoardController, move: chess.Move, playe... function find_specific_moves_in_game (line 129) | def find_specific_moves_in_game( function create_dataset_of_specific_moves (line 177) | def create_dataset_of_specific_moves( function main (line 239) | def main(args: argparse.Namespace): FILE: evals/elsuite/cant_do_that_anymore/scripts/diagonal_dataset_creation.py function get_stockfish_move (line 42) | def get_stockfish_move(stockfish: Stockfish, num_moves_to_consider: int)... function parse_stockfish_move (line 89) | def parse_stockfish_move(controller: BoardController, move: str) -> str: function get_bishop_diagonal_moves (line 115) | def get_bishop_diagonal_moves(controller: BoardController, player_id: st... function find_specific_moves_in_game (line 156) | def find_specific_moves_in_game( function create_bishop_diagonal_dataset (line 218) | def create_bishop_diagonal_dataset( function main (line 274) | def main(args: argparse.Namespace): FILE: evals/elsuite/cant_do_that_anymore/scripts/make_plots.py function extract_results (line 18) | def extract_results(datadir: Path) -> pd.DataFrame: function render_results (line 54) | def render_results(df: pd.DataFrame, out_dir: Path): function compute_num_previous_bishop_moves (line 65) | def compute_num_previous_bishop_moves(previous_moves: Sequence[str]) -> ... function plot_diagonal_bishop_results (line 82) | def plot_diagonal_bishop_results(df: pd.DataFrame, out_dir: Path): function main (line 110) | def main(): FILE: evals/elsuite/cant_do_that_anymore/utils.py function construct_messages (line 27) | def construct_messages(previous_moves: Sequence[str]) -> Sequence[Message]: function dump_sequence_to_jsonl (line 43) | def dump_sequence_to_jsonl(data: Sequence[dict], path: str): function load_sequence_from_jsonl (line 50) | def load_sequence_from_jsonl(path: str) -> Sequence[dict]: function initialise_boards (line 60) | def initialise_boards() -> tuple[BoardController, BoardController, chess... function assert_boards_consistent (line 84) | def assert_boards_consistent( function does_solver_predict_move (line 105) | def does_solver_predict_move( function process_example (line 127) | def process_example(work_input: dict): function get_solver_predictions (line 146) | def get_solver_predictions( function get_dataset_path (line 181) | def get_dataset_path( function create_dataset (line 223) | def create_dataset( function get_diagonal_dataset_path (line 241) | def get_diagonal_dataset_path( function get_binary_avg (line 247) | def get_binary_avg(metrics: dict, key: str) -> float: FILE: evals/elsuite/error_recovery/eval.py class Sample (line 25) | class Sample: class ErrorRecovery (line 35) | class ErrorRecovery(SolverEval): method __init__ (line 36) | def __init__( method eval_sample (line 82) | def eval_sample(self, solver: Solver, sample: Sample, rng: random.Rand... method _get_no_reasoning_task_state (line 141) | def _get_no_reasoning_task_state(self, sample: Sample) -> TaskState: method _get_correct_reasoning_task_state (line 152) | def _get_correct_reasoning_task_state(self, sample: Sample) -> TaskState: method _get_incorrect_reasoning_task_state (line 166) | def _get_incorrect_reasoning_task_state( method _get_answer (line 185) | def _get_answer( method run (line 220) | def run(self, recorder: evals.record.Recorder): method _extract_final_answer (line 262) | def _extract_final_answer(self, solver: Solver, task_state: TaskState,... method get_samples (line 276) | def get_samples(self) -> List[Sample]: FILE: evals/elsuite/error_recovery/scripts/dataset_creation.py function main (line 20) | def main(): function create_data_subset (line 54) | def create_data_subset(data: pd.DataFrame, examples_per_task: int) -> pd... function create_positive_examples (line 68) | def create_positive_examples(data: pd.DataFrame) -> pd.DataFrame: function create_negative_examples (line 85) | def create_negative_examples(data: pd.DataFrame) -> pd.DataFrame: function clone_and_load_data (line 103) | def clone_and_load_data(): function maybe_clone_repo (line 123) | def maybe_clone_repo(clone_dir): function plot_hist (line 130) | def plot_hist(data): function print_example (line 135) | def print_example(): FILE: evals/elsuite/error_recovery/scripts/make_plots.py function maybe_show (line 59) | def maybe_show(fig): function extract_results (line 65) | def extract_results(datadir: Path) -> pd.DataFrame: function extract_metrics (line 82) | def extract_metrics(datadir: Path) -> pd.DataFrame: function get_all_tasks (line 101) | def get_all_tasks(results_df: pd.DataFrame) -> list[str]: function get_all_tasks_renamed (line 113) | def get_all_tasks_renamed(results_df: pd.DataFrame) -> list[str]: function get_unique_models (line 121) | def get_unique_models(results_df: pd.DataFrame) -> list[str]: function get_cleaned_model_name (line 131) | def get_cleaned_model_name(model: str) -> str: function corrects_to_accuracy_and_sem (line 135) | def corrects_to_accuracy_and_sem(corrects: pd.Series): function annotate_axes (line 141) | def annotate_axes(ax, errors: Optional[pd.DataFrame]): function corrects_to_performance_loss_and_error (line 172) | def corrects_to_performance_loss_and_error(CR_corrects: pd.Series, IR_co... function accuracy_by_task (line 195) | def accuracy_by_task(metrics_df, results_df: pd.DataFrame, out_dir: Path): function accuracy_by_model_dfs (line 205) | def accuracy_by_model_dfs(metrics_df, results_df: pd.DataFrame): function accuracy_by_model (line 241) | def accuracy_by_model(metrics_df, results_df: pd.DataFrame, out_dir: Path): function accuracy_by_model_and_reasoning (line 268) | def accuracy_by_model_and_reasoning( function plot_accuracy_by_steps_all (line 358) | def plot_accuracy_by_steps_all(metrics_df, results_df, out_dir): function plot_accuracy_by_steps (line 385) | def plot_accuracy_by_steps(df, task, model, ax): function plot_accuracy_by_task (line 414) | def plot_accuracy_by_task(model, metrics_df, all_tasks, all_tasks_rename... function performance_loss_per_task (line 468) | def performance_loss_per_task(metrics_df: pd.DataFrame, results_df: pd.D... function performance_loss_per_model (line 517) | def performance_loss_per_model(metrics_df: pd.DataFrame, results_df: pd.... function main (line 557) | def main(): FILE: evals/elsuite/function_deduction/baselines.py class AverageBaseline (line 16) | class AverageBaseline(Solver): method __init__ (line 29) | def __init__(self, registry=None): method _solve (line 32) | def _solve(self, task_state: TaskState): method _get_guess (line 45) | def _get_guess(self, test_inputs, known_values: dict[int, int], guess_... method _get_ask (line 64) | def _get_ask(self, test_inputs, round_ix) -> str: class FullKnowledge (line 78) | class FullKnowledge(Solver): method __init__ (line 87) | def __init__(self, mode: str, samples_jsonl: str, registry: Registry): method _solve (line 93) | def _solve(self, task_state: TaskState): method _get_matching_samples (line 108) | def _get_matching_samples(self, known_values): method _get_ask_best (line 117) | def _get_ask_best(self, samples): method _get_ask_random (line 125) | def _get_ask_random(self, known_values): method _get_samples (line 131) | def _get_samples(self, samples_jsonl: str, registry_path: Path): FILE: evals/elsuite/function_deduction/eval.py class Sample (line 21) | class Sample: class CurrentState (line 30) | class CurrentState: method round_ix (line 49) | def round_ix(self): method ask_update (line 52) | def ask_update(self, input_: int, value: Optional[int]) -> None: method guess_update (line 58) | def guess_update( class FunctionDeductionEval (line 74) | class FunctionDeductionEval(SolverEval): method __init__ (line 75) | def __init__( method eval_sample (line 100) | def eval_sample(self, solver: Solver, sample: Sample, rng: random.Rand... method run (line 151) | def run(self, recorder: evals.record.Recorder): method _calculate_sem (line 180) | def _calculate_sem(self, values: list) -> float: method _get_success_metrics (line 183) | def _get_success_metrics(self, metrics): method _get_sample_std (line 198) | def _get_sample_std(self, metrics): method _get_complexity_tests (line 236) | def _get_complexity_tests(self, metrics): method _get_per_complexity_metrics (line 257) | def _get_per_complexity_metrics(self, all_metrics): method _parse_raw_response (line 265) | def _parse_raw_response(self, response: str) -> Union[Tuple[int], Tupl... method _bad_guess_answer (line 276) | def _bad_guess_answer(self, test_inputs, guessed, expected) -> str: method get_samples (line 293) | def get_samples(self) -> List[Sample]: FILE: evals/elsuite/function_deduction/scripts/dataset/create_dataset.py function get_func_from_code (line 9) | def get_func_from_code(code): function get_complexity (line 13) | def get_complexity(code: str) -> int: function create_dataset (line 20) | def create_dataset(out_file, in_file): FILE: evals/elsuite/function_deduction/scripts/make_plots.py function extract_final_reports (line 74) | def extract_final_reports( function make_plot (line 92) | def make_plot( function make_ask_guess_incorrect_plot (line 136) | def make_ask_guess_incorrect_plot(df, out_path: Path): function main (line 197) | def main(): FILE: evals/elsuite/function_deduction/solvers.py class CustomCoT (line 11) | class CustomCoT(CoTSolver): method __init__ (line 12) | def __init__( method cot_template (line 25) | def cot_template(self, task_state: TaskState) -> str: method _get_summary (line 37) | def _get_summary(self, current_state: CurrentState) -> str: class BaseModelSolver (line 59) | class BaseModelSolver(HHHSolver): method _solve (line 60) | def _solve(self, task_state: TaskState): method _few_shot_messages (line 70) | def _few_shot_messages(self) -> list[Message]: class BaseModelCoTSolver (line 85) | class BaseModelCoTSolver(CustomCoT): method __init__ (line 86) | def __init__(self, *args, **kwargs): method cot_solver (line 90) | def cot_solver(self): method _solve (line 103) | def _solve(self, task_state: TaskState): method _few_shot_messages (line 118) | def _few_shot_messages(self, current_state) -> list[Message]: FILE: evals/elsuite/function_deduction/solvers_test.py function simulate_dummy_game (line 25) | def simulate_dummy_game(solver): function test_custom_cot (line 59) | def test_custom_cot(): function test_base_model_cot_solver (line 76) | def test_base_model_cot_solver(): FILE: evals/elsuite/hr_ml_agent_bench/actions.py function make_action_string (line 12) | def make_action_string(name: str, args: dict) -> str: function get_action (line 17) | def get_action(s: str) -> Optional[Action]: function is_valid_action (line 43) | def is_valid_action(action: Action) -> bool: FILE: evals/elsuite/hr_ml_agent_bench/auto_marking.py class EvaluationResult (line 9) | class EvaluationResult: function grade_submission (line 23) | def grade_submission(log_dir: Path, task_name: str) -> EvaluationResult: FILE: evals/elsuite/hr_ml_agent_bench/autoeval.py class Step (line 19) | class Step: class TaskStateMetadata (line 26) | class TaskStateMetadata: class FunctionCall (line 37) | class FunctionCall: function run (line 42) | def run( function attempted_to_use_stable_baselines (line 208) | def attempted_to_use_stable_baselines(s: str) -> bool: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py class Agent (line 8) | class Agent: method __init__ (line 9) | def __init__(self, env): method act (line 15) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env/train.py class Agent (line 6) | class Agent: method __init__ (line 7) | def __init__(self, env): method act (line 13) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/ant/scripts/grade.py function get_score (line 9) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 33) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 57) | def get_naive_baseline_score() -> float: function normalize_score (line 69) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/human.py class Agent (line 13) | class Agent: method act (line 14) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/naive.py class Agent (line 7) | class Agent: method __init__ (line 8) | def __init__(self, env): method act (line 14) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env/train.py class Agent (line 5) | class Agent: method __init__ (line 6) | def __init__(self, env): method act (line 12) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts/grade.py function get_score (line 7) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 31) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 44) | def get_naive_baseline_score() -> float: function normalize_score (line 56) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/naive.py class Agent (line 7) | class Agent: method __init__ (line 8) | def __init__(self, env): method act (line 14) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env/train.py class Agent (line 5) | class Agent: method __init__ (line 6) | def __init__(self, env): method act (line 12) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts/grade.py function get_score (line 7) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 36) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 59) | def get_naive_baseline_score() -> float: function normalize_score (line 71) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/env/train.py class Net (line 10) | class Net(nn.Module): method __init__ (line 11) | def __init__(self): method forward (line 20) | def forward(self, x): function test_model (line 39) | def test_model(model, device, dataloader): function main (line 54) | def main(): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py function get_score (line 18) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 40) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 52) | def get_naive_baseline_score() -> float: function normalize_score (line 65) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/train.py function compute_metrics_for_regression (line 24) | def compute_metrics_for_regression(y_test, y_test_pred): function train_model (line 35) | def train_model(X_train, y_train, X_valid, y_valid): function predict (line 42) | def predict(model, X): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/grade.py function get_score (line 12) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 47) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 59) | def get_naive_baseline_score() -> float: function normalize_score (line 71) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/grade.py function get_score (line 8) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 48) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 60) | def get_naive_baseline_score() -> float: function normalize_score (line 72) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py class Agent (line 8) | class Agent: method __init__ (line 9) | def __init__(self, env): method act (line 15) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env/train.py class Agent (line 6) | class Agent: method __init__ (line 7) | def __init__(self, env): method act (line 13) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts/grade.py function get_score (line 9) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 38) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 61) | def get_naive_baseline_score() -> float: function normalize_score (line 73) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts/grade.py function get_score (line 7) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 35) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 47) | def get_naive_baseline_score() -> float: function normalize_score (line 57) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py class Agent (line 8) | class Agent: method __init__ (line 9) | def __init__(self, env): method act (line 15) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env/train.py class Agent (line 6) | class Agent: method __init__ (line 7) | def __init__(self, env): method act (line 13) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/scripts/grade.py function get_score (line 7) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 35) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 44) | def get_naive_baseline_score() -> float: function normalize_score (line 56) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/env/train.py class MLP (line 38) | class MLP(torch.nn.Module): method __init__ (line 39) | def __init__(self, in_channels, hidden_channels, out_channels, num_lay... method reset_parameters (line 53) | def reset_parameters(self): method forward (line 59) | def forward(self, x): method inference (line 68) | def inference(self, total_loader, device): function test (line 93) | def test(model, device): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py function get_score (line 17) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 50) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 63) | def get_naive_baseline_score() -> float: function normalize_score (line 91) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py function smapep1 (line 10) | def smapep1(y_true, y_pred): function get_predictions (line 41) | def get_predictions(my_train, model): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/grade.py function get_score (line 8) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 33) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 46) | def get_naive_baseline_score() -> float: function normalize_score (line 52) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/prepare.py function get_rating (line 119) | def get_rating(row): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/naive.py class Agent (line 8) | class Agent: method __init__ (line 9) | def __init__(self, env): method act (line 15) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env/train.py class Agent (line 6) | class Agent: method __init__ (line 7) | def __init__(self, env): method act (line 13) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pong/scripts/grade.py function get_score (line 7) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 38) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 61) | def get_naive_baseline_score() -> float: function normalize_score (line 73) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py class Agent (line 8) | class Agent: method __init__ (line 9) | def __init__(self, env): method act (line 15) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env/train.py class Agent (line 6) | class Agent: method __init__ (line 7) | def __init__(self, env): method act (line 13) | def act(self, observation): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/scripts/grade.py function get_score (line 9) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 38) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 61) | def get_naive_baseline_score() -> float: function normalize_score (line 73) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env/train.py function create_new_dataframe (line 4) | def create_new_dataframe(data, column_names): FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/grade.py function get_score (line 6) | def get_score(submission_dir: Path) -> float: function get_human_baseline_score (line 41) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 53) | def get_naive_baseline_score() -> float: function normalize_score (line 59) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/env/train.py function relu (line 7) | def relu(x: np.ndarray) -> np.ndarray: function add_padding (line 17) | def add_padding(X: np.ndarray, pad_size: Union[int, list, tuple], pad_va... class Conv2DLayer (line 43) | class Conv2DLayer: method __init__ (line 48) | def __init__( method convolution_step (line 78) | def convolution_step( method forward (line 97) | def forward(self, features_batch: np.ndarray) -> np.ndarray: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/grade.py function get_score (line 11) | def get_score(submission_dir: Path) -> float: function _get_execution_time (line 33) | def _get_execution_time(scriptpath: Path) -> float: function get_human_baseline_score (line 71) | def get_human_baseline_score() -> float: function get_naive_baseline_score (line 84) | def get_naive_baseline_score() -> float: function normalize_score (line 96) | def normalize_score(score: float) -> float: FILE: evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/human_baseline.py function relu (line 10) | def relu(x: np.ndarray) -> np.ndarray: function add_padding (line 20) | def add_padding(X: np.ndarray, pad_size: Union[int, list, tuple], pad_va... class Conv2DLayer (line 46) | class Conv2DLayer: method __init__ (line 51) | def __init__( method convolution_step (line 81) | def convolution_step( method forward (line 100) | def forward(self, features_batch: np.ndarray) -> np.ndarray: FILE: evals/elsuite/hr_ml_agent_bench/environment.py class Environment (line 41) | class Environment: method __init__ (line 42) | def __init__( method research_problem (line 87) | def research_problem(self): method benchmark_folder_name (line 91) | def benchmark_folder_name(self): method read_only_files (line 95) | def read_only_files(self): method action_infos (line 99) | def action_infos(self): method static_kwargs_for_tools (line 103) | def static_kwargs_for_tools(self): method trace (line 107) | def trace(self): method start_time (line 111) | def start_time(self): method _setup_log_dir (line 116) | def _setup_log_dir(self): method _initialize_task_env (line 133) | def _initialize_task_env(self): method _initialize_trace (line 186) | def _initialize_trace(self): method __enter__ (line 212) | def __enter__(self): method __exit__ (line 215) | def __exit__(self, exc_type, exc_value, traceback): method is_done (line 240) | def is_done(self): method execute (line 252) | def execute(self, action: Action, max_seconds_per_step: Optional[int] ... method save (line 339) | def save(self, curr_step): method get_task_description (line 367) | def get_task_description(self): method low_level_actions (line 371) | def low_level_actions(self): method high_level_actions (line 375) | def high_level_actions(self): method print_action (line 378) | def print_action(self, entries): function _signal_handler (line 382) | def _signal_handler(signum, frame): FILE: evals/elsuite/hr_ml_agent_bench/eval.py class Sample (line 23) | class Sample: method __post_init__ (line 31) | def __post_init__(self): class MLAgentBench (line 51) | class MLAgentBench(SolverEval): method __init__ (line 52) | def __init__(self, completion_fns: list[CompletionFn], *args, **kwargs): method eval_sample (line 61) | def eval_sample(self, solver: Solver, raw_sample: dict, rng: Random) -... method run (line 100) | def run(self, recorder: Recorder) -> dict: function in_ci (line 119) | def in_ci(): FILE: evals/elsuite/hr_ml_agent_bench/high_level_actions.py function understand_file (line 19) | def understand_file(file_name, things_to_look_for, solver, work_dir=".",... function edit_script (line 67) | def edit_script( function edit_script_lines (line 124) | def edit_script_lines( function inspect_script_lines (line 196) | def inspect_script_lines(script_name, start_line_number, end_line_number... FILE: evals/elsuite/hr_ml_agent_bench/low_level_actions.py function normalize_args_kwargs (line 28) | def normalize_args_kwargs(f, *args, **kwargs): function append_to_low_level_steps (line 36) | def append_to_low_level_steps(trace, name, args, observation): function record_low_level_step (line 43) | def record_low_level_step(func): function check_file_read_only (line 72) | def check_file_read_only(arg_names, **kwargs): function check_file_in_work_dir (line 91) | def check_file_in_work_dir(arg_names, **kwargs): function list_files (line 116) | def list_files(dir_path, work_dir=".", **kwargs): function read_file (line 128) | def read_file(file_name, work_dir=".", **kwargs): function write_file (line 139) | def write_file(file_name, content, work_dir=".", **kwargs): function append_file (line 152) | def append_file(file_name, content, work_dir=".", **kwargs): function copy_file (line 165) | def copy_file(source, destination, work_dir=".", **kwargs): function undo_edit_script (line 178) | def undo_edit_script(script_name, work_dir=".", **kwargs): function execute_script (line 200) | def execute_script(script_name, work_dir=".", **kwargs): function python_repl (line 257) | def python_repl(command, work_dir=".", **kwargs): FILE: evals/elsuite/hr_ml_agent_bench/prepare_task.py function get_research_problem (line 15) | def get_research_problem(task: str) -> str: function prepare_task (line 39) | def prepare_task(benchmark_dir, python_command="python"): FILE: evals/elsuite/hr_ml_agent_bench/prompts.py function format_action (line 40) | def format_action(action: ActionInfo) -> str: function get_actions_description (line 55) | def get_actions_description(actions: list[ActionInfo]) -> str: function get_task_description (line 61) | def get_task_description(research_problem: str) -> str: FILE: evals/elsuite/hr_ml_agent_bench/schema.py class EnhancedJSONEncoder (line 8) | class EnhancedJSONEncoder(json.JSONEncoder): method default (line 9) | def default(self, o): class TooLongPromptError (line 21) | class TooLongPromptError(Exception): class LLMError (line 25) | class LLMError(Exception): class EnvException (line 29) | class EnvException(Exception): method __init__ (line 30) | def __init__(self, message): method __str__ (line 33) | def __str__(self): class ActionInfo (line 38) | class ActionInfo: class Action (line 48) | class Action: class Step (line 54) | class Step: class Trace (line 61) | class Trace: FILE: evals/elsuite/hr_ml_agent_bench/scripts/run_experiments.py function run_experiment (line 64) | def run_experiment(solver: str, task: str, seed: int) -> None: FILE: evals/elsuite/hr_ml_agent_bench/solvers/baseline.py class SimpleActionAgent (line 16) | class SimpleActionAgent(Solver): method __init__ (line 17) | def __init__(self, registry: Registry, completion_fn_kwargs: dict): method _solve (line 25) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: method get_encoder (line 111) | def get_encoder(self): FILE: evals/elsuite/hr_ml_agent_bench/tests/test_actions.py function test_make_action_string (line 12) | def test_make_action_string(): function test_empty_string (line 29) | def test_empty_string(): function test_missing_curly_braces (line 40) | def test_missing_curly_braces(): function test_args_on_multiple_lines (line 58) | def test_args_on_multiple_lines(): function test_args_on_single_line (line 77) | def test_args_on_single_line(): function test_special_characters_in_name (line 93) | def test_special_characters_in_name(): function test_invalid_arguments (line 111) | def test_invalid_arguments(): function test_surrounded_by_additional_text (line 127) | def test_surrounded_by_additional_text(): function test_is_valid_action_with_correct_args (line 159) | def test_is_valid_action_with_correct_args(action_info): function test_is_valid_action_with_incorrect_args (line 169) | def test_is_valid_action_with_incorrect_args(action_info): function test_is_valid_action_with_missing_args (line 177) | def test_is_valid_action_with_missing_args(action_info): FILE: evals/elsuite/hr_ml_agent_bench/utils.py function complete_text (line 20) | def complete_text(prompt: str, solver: Solver, **kwargs) -> str: function get_root_dir (line 31) | def get_root_dir() -> Path: function get_code_dir (line 37) | def get_code_dir() -> Path: function get_data_dir (line 43) | def get_data_dir() -> Path: function get_parent_dir (line 49) | def get_parent_dir(name: str, max_depth: int = 64) -> Path: function is_gpu_available (line 63) | def is_gpu_available() -> bool: function get_gpu_with_most_available_memory (line 69) | def get_gpu_with_most_available_memory() -> Optional[int]: function get_baseline_score (line 97) | def get_baseline_score( FILE: evals/elsuite/identifying_variables/eval.py class IdentifyingVariables (line 30) | class IdentifyingVariables(SolverEval): method __init__ (line 31) | def __init__( method _build_task_description (line 52) | def _build_task_description(self) -> str: method eval_sample (line 60) | def eval_sample(self, solver: Solver, sample: Sample, rng: random.Rand... method run (line 90) | def run(self, recorder: RecorderBase) -> Dict[str, float]: method _compute_agg_metrics (line 98) | def _compute_agg_metrics(self, metrics: List[Dict]) -> Dict[str, float]: method _compute_grouped_metrics (line 123) | def _compute_grouped_metrics(self, metrics: List[Dict]) -> Dict[str, f... method _evaluate_sample (line 192) | def _evaluate_sample(self, preds: Optional[Answer], gold: Answer, num_... method _ctrl_vars_fallout (line 245) | def _ctrl_vars_fallout(self, preds: List[str], gold: List[str], num_no... method _ctrl_vars_recall (line 248) | def _ctrl_vars_recall(self, preds: List[str], gold: List[str]) -> float: method _ctrl_vars_nDCG (line 251) | def _ctrl_vars_nDCG(self, preds: List[str], gold: List[str], num_not_c... method _build_message (line 257) | def _build_message(self, sample: Sample) -> Message: method _render_hypotheses (line 271) | def _render_hypotheses(self, hypotheses: nx.DiGraph) -> List[str]: method _render_hypothesis (line 275) | def _render_hypothesis(self, hypothesis: Tuple[str, str]) -> str: method _get_samples (line 280) | def _get_samples(self) -> List[Sample]: FILE: evals/elsuite/identifying_variables/graph_utils.py function val_and_count_roots (line 8) | def val_and_count_roots( function gen_random_forest_tree_size (line 45) | def gen_random_forest_tree_size( function gen_random_forest (line 127) | def gen_random_forest( function find_farthest_node (line 202) | def find_farthest_node(graph: nx.DiGraph, source: str) -> Tuple[str, int]: function find_graph_roots (line 220) | def find_graph_roots(graph: nx.DiGraph) -> Set[str]: function find_graph_trees (line 227) | def find_graph_trees(graph: nx.DiGraph) -> List[Set[str]]: function find_connected_nodes_pair (line 234) | def find_connected_nodes_pair( function find_unconnected_nodes_pair (line 245) | def find_unconnected_nodes_pair(graph: nx.DiGraph) -> Union[Tuple[Any, A... FILE: evals/elsuite/identifying_variables/latent_funcs.py function linear (line 5) | def linear(x: np.ndarray, grad: float, bias: float) -> np.ndarray: function quadratic (line 9) | def quadratic(x: np.ndarray, grad: float, bias: float) -> np.ndarray: function random_uniform (line 13) | def random_uniform(num_samples, min_v, max_v, rng: np.random.Generator) ... function random_ints (line 17) | def random_ints(num_samples, min_v, max_v, rng: np.random.Generator) -> ... FILE: evals/elsuite/identifying_variables/metrics.py function compute_DCG (line 9) | def compute_DCG(ranking: List[float], ceil_negs: bool = False) -> float: function compute_nDCG (line 21) | def compute_nDCG(ranking: List[float], best: List[float], worst: List[fl... function compute_metric_posthoc (line 32) | def compute_metric_posthoc( function compute_ctrl_recall_posthoc (line 47) | def compute_ctrl_recall_posthoc(metric_entries: List[Dict], sampling_ent... function compute_fallout (line 79) | def compute_fallout(retrieved: Set[str], gold_relevants: Set[str], num_i... function compute_recall (line 93) | def compute_recall(retrieved: Set[str], gold_relevants: Set[str]): FILE: evals/elsuite/identifying_variables/renderers/base.py class RendererBase (line 9) | class RendererBase(abc.ABC): method __init__ (line 10) | def __init__(self, rng: random.Random, np_rng: np.random.Generator) ->... method render_obs (line 15) | def render_obs(self, sample: Sample) -> str: FILE: evals/elsuite/identifying_variables/renderers/corrset.py class CorrSetRenderer (line 10) | class CorrSetRenderer(RendererBase): method determine_sample_type (line 15) | def determine_sample_type(self, sample: Sample) -> Tuple[str, List[Set... method _get_hypd_unobserved_vars (line 60) | def _get_hypd_unobserved_vars(self, sample: Sample) -> List[str]: class PureCorrSetRenderer (line 78) | class PureCorrSetRenderer(CorrSetRenderer): method render_obs (line 79) | def render_obs(self, sample: Sample) -> str: method _render_observed_sets (line 92) | def _render_observed_sets(self, observed_sets: List[Set[str]]) -> str: method _render_unobserved_vars (line 101) | def _render_unobserved_vars(self, sample: Sample) -> str: class LanguageCorrSetRenderer (line 113) | class LanguageCorrSetRenderer(CorrSetRenderer): method __init__ (line 118) | def __init__(self, *args, **kwargs) -> None: method render_obs (line 126) | def render_obs(self, sample: Sample) -> str: method render_many_sets (line 148) | def render_many_sets(self, correl_sets: List[Set[str]]): method render_single_set (line 215) | def render_single_set(self, correl_sets: List[Set[str]]) -> str: method render_only_ind (line 257) | def render_only_ind(self, correl_sets: List[Set[str]]) -> str: method mention_unobserved_vars (line 301) | def mention_unobserved_vars(self, sample: Sample) -> str: FILE: evals/elsuite/identifying_variables/renderers/tabular.py function apply_noise (line 18) | def apply_noise( function sparsify_data (line 47) | def sparsify_data( class TabularRenderer (line 59) | class TabularRenderer(RendererBase): method __init__ (line 60) | def __init__(self, *args, **kwargs) -> None: method _render_table (line 64) | def _render_table(self, sample: Sample) -> pd.DataFrame: class MarkdownTableRenderer (line 102) | class MarkdownTableRenderer(TabularRenderer): method __init__ (line 107) | def __init__(self, *args, **kwargs) -> None: method render_obs (line 110) | def render_obs(self, sample: Sample) -> str: class CSVTableRenderer (line 115) | class CSVTableRenderer(TabularRenderer): method __init__ (line 121) | def __init__(self, *args, **kwargs) -> None: method render_obs (line 124) | def render_obs(self, sample: Sample) -> str: class JSONTableRenderer (line 129) | class JSONTableRenderer(TabularRenderer): method __init__ (line 135) | def __init__(self, *args, **kwargs) -> None: method render_obs (line 138) | def render_obs(self, sample: Sample) -> str: class LanguageTableRenderer (line 143) | class LanguageTableRenderer(TabularRenderer): method __init__ (line 149) | def __init__(self, *args, **kwargs) -> None: method render_obs (line 154) | def render_obs(self, sample: Sample) -> str: method _render_row (line 164) | def _render_row( function format_number (line 179) | def format_number(number: Union[int, float]): FILE: evals/elsuite/identifying_variables/renderers/templates.py function list_to_nl_list (line 4) | def list_to_nl_list(list_of_words: List[str]) -> str: FILE: evals/elsuite/identifying_variables/scripts/gen_data.py function write_to_jsonl (line 31) | def write_to_jsonl( function random_latent_func_meta (line 40) | def random_latent_func_meta( function build_var_metadata (line 68) | def build_var_metadata( function sparsify_data (line 121) | def sparsify_data(var_metadata, sparse_var_rate, np_rng): function gen_sample_balanced_ctrl_vars (line 147) | def gen_sample_balanced_ctrl_vars( function gen_sample (line 218) | def gen_sample( function determine_gold_label (line 294) | def determine_gold_label( function parse_target_hyp (line 324) | def parse_target_hyp( function determine_ctrl_vars (line 351) | def determine_ctrl_vars( function are_correlated (line 381) | def are_correlated(var_1, var_2, variable_metadata) -> Optional[bool]: function integrate_target_hyp (line 399) | def integrate_target_hyp( function gen_samples (line 413) | def gen_samples( function main (line 432) | def main(args: argparse.Namespace): FILE: evals/elsuite/identifying_variables/scripts/make_plots.py function initialize_default_results_dict (line 53) | def initialize_default_results_dict(): function handle_cot_double_sampling (line 73) | def handle_cot_double_sampling(sampling_entries, solver): function handle_posthoc_metrics (line 94) | def handle_posthoc_metrics(final_results: Dict, log_path: Path, solver: ... function populate_default_results_dict (line 112) | def populate_default_results_dict(results_dict, results_dir): function make_default_tables (line 141) | def make_default_tables(results_dict: Dict, save_dir: Path): function extract_default_results_dict (line 146) | def extract_default_results_dict(results_dir: Path): function make_default_plots (line 153) | def make_default_plots(results_dict: Dict, save_dir: Path): function extract_large_results_dict (line 174) | def extract_large_results_dict(results_dir: Path) -> Dict: function make_large_plot (line 203) | def make_large_plot(large_results_dir: Dict, save_dir: Path): function np_nan_if_none (line 210) | def np_nan_if_none(input_num): function zero_if_none (line 217) | def zero_if_none(input_num): function round_if_not_nan (line 224) | def round_if_not_nan(input_num): function make_token_per_sample_df (line 231) | def make_token_per_sample_df(solver_to_eval, solver_to_tokens) -> pd.Dat... function count_tokens (line 263) | def count_tokens(results_dir: Path, total) -> Tuple[Dict, pd.DataFrame]: function make_total_tokens_table (line 325) | def make_total_tokens_table(default_total: Dict, large_total: Dict) -> p... function make_token_count_tables (line 341) | def make_token_count_tables( function main (line 356) | def main(default_results_dir: Path, large_results_dir: Path, save_dir: P... FILE: evals/elsuite/identifying_variables/scripts/plotting_utils.py function plot_solver_bars (line 48) | def plot_solver_bars( function plot_difficulty_bars (line 125) | def plot_difficulty_bars(results_dict: Dict, bins: Iterable[int], output... FILE: evals/elsuite/identifying_variables/scripts/table_utils.py function make_main_metric_table (line 8) | def make_main_metric_table( FILE: evals/elsuite/identifying_variables/solvers.py class RandomSolver (line 7) | class RandomSolver(Solver): method __init__ (line 8) | def __init__(self, *args, **kwargs): method _solve (line 11) | def _solve(self, task_state: TaskState) -> SolverResult: class NoCtrl (line 27) | class NoCtrl(Solver): method __init__ (line 34) | def __init__(self, *args, **kwargs): method _solve (line 37) | def _solve(self, task_state: TaskState) -> SolverResult: FILE: evals/elsuite/identifying_variables/structs.py class Answer (line 9) | class Answer: class Sample (line 17) | class Sample: FILE: evals/elsuite/identifying_variables/utils.py function parse_solver_preds (line 11) | def parse_solver_preds(solver_result: SolverResult) -> Answer: function sample_serializer (line 56) | def sample_serializer(obj): function json_to_sample (line 71) | def json_to_sample(serialized_sample: Dict) -> Sample: FILE: evals/elsuite/lambada.py class Lambada (line 9) | class Lambada(evals.Eval): method __init__ (line 10) | def __init__( method eval_sample (line 21) | def eval_sample(self, sample, rng): method run (line 42) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/make_me_pay/eval.py class MakeMePay (line 19) | class MakeMePay(SolverEval): method __init__ (line 20) | def __init__( method eval_sample (line 77) | def eval_sample(self, solver: Solver, sample: Any, *_): method run (line 102) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/make_me_pay/makemepay.py function generate_reply_for (line 20) | def generate_reply_for( function minutes_since (line 85) | def minutes_since(timestamp: float) -> float: function run_eval (line 90) | def run_eval( function detect_donation (line 243) | def detect_donation( function detect_withdraw (line 284) | def detect_withdraw( FILE: evals/elsuite/make_me_pay/makemepay_test.py function summary_model_completion_fn (line 16) | def summary_model_completion_fn() -> Callable[[list[dict]], dict]: function summary_model_system_prompt (line 24) | def summary_model_system_prompt() -> str: function conversation_without_donation (line 29) | def conversation_without_donation() -> list[dict]: function test_detect_donation_finds_donation_with_expected_syntax (line 39) | def test_detect_donation_finds_donation_with_expected_syntax( function test_detect_donation_finds_donation_with_unexpected_syntax (line 64) | def test_detect_donation_finds_donation_with_unexpected_syntax( function test_detect_donation_doesnt_find_a_donation_for_conversation_without_donation (line 88) | def test_detect_donation_doesnt_find_a_donation_for_conversation_without... function test_that_replies_are_appended_to_conversation (line 107) | def test_that_replies_are_appended_to_conversation() -> None: FILE: evals/elsuite/make_me_pay/scripts/make_plots.py function main (line 12) | def main(): function load_mmp_results_from_dir (line 36) | def load_mmp_results_from_dir(log_dir: Union[str, Path]) -> pd.DataFrame: function parse_spec (line 66) | def parse_spec(spec: dict) -> tuple[str, bool, int]: function make_plot (line 85) | def make_plot(df, out_dir, con_artist_model, mark_model, prompt_type): FILE: evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py class LMConArtistSolver (line 14) | class LMConArtistSolver(Solver): method __init__ (line 15) | def __init__( method name (line 50) | def name(self): method _solve (line 53) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: FILE: evals/elsuite/make_me_pay/utils.py function is_system_msg (line 10) | def is_system_msg(m: dict) -> bool: function format_msg (line 18) | def format_msg(msg: dict, role: Literal["mark", "conartist"]) -> dict: function format_msgs (line 47) | def format_msgs( function model_broke_character (line 61) | def model_broke_character(message: str) -> bool: function model_output_empty_tags (line 69) | def model_output_empty_tags(message: str) -> bool: function openai_chatcompletion_create (line 74) | def openai_chatcompletion_create(*args, **kwargs): FILE: evals/elsuite/make_me_say/autoeval.py function run (line 21) | def run( function _create_response (line 113) | def _create_response(game: Game) -> Game: function _get_content (line 137) | def _get_content(response: Union[dict, CompletionResult]) -> str: FILE: evals/elsuite/make_me_say/core.py class Player (line 21) | class Player: method __post_init__ (line 26) | def __post_init__(self): class Message (line 32) | class Message: method to_dict (line 37) | def to_dict(self) -> dict: class Game (line 42) | class Game: method __post_init__ (line 55) | def __post_init__(self): method to_df (line 105) | def to_df(self) -> pd.DataFrame: method set_current_player (line 112) | def set_current_player(self, player: Player) -> "Game": method swap_players (line 117) | def swap_players(self) -> "Game": method set_word_list (line 123) | def set_word_list(self, word_list: tuple[str, ...]) -> "Game": method add_message (line 138) | def add_message(self, message: Message) -> "Game": method _contains_internal_monologue (line 141) | def _contains_internal_monologue(self, s: str) -> bool: method _remove_internal_monologue (line 150) | def _remove_internal_monologue(self, s: str) -> str: method view_as (line 164) | def view_as(self, player: Player) -> tuple[Message, ...]: method _mentions_codeword (line 202) | def _mentions_codeword(self, s: str) -> bool: method is_win (line 217) | def is_win(self) -> bool: method is_violation (line 240) | def is_violation(self) -> bool: method is_loss (line 260) | def is_loss(self) -> bool: method is_user_aware_of_manipulation (line 269) | def is_user_aware_of_manipulation(self) -> bool: method is_codeword_mentioned_by_user (line 290) | def is_codeword_mentioned_by_user(self) -> bool: method is_over (line 309) | def is_over(self) -> bool: FILE: evals/elsuite/make_me_say/defaults.py function DEFAULT_MANIPULATOR_PREAMBLE_FN (line 6) | def DEFAULT_MANIPULATOR_PREAMBLE_FN(game): function DEFAULT_MANIPULATEE_PREAMBLE_FN (line 73) | def DEFAULT_MANIPULATEE_PREAMBLE_FN(game): function DEFAULT_JUDGE_PREAMBLE_FN (line 100) | def DEFAULT_JUDGE_PREAMBLE_FN(game): function DEFAULT_PERIODIC_MSG_FN (line 159) | def DEFAULT_PERIODIC_MSG_FN(game): function DEFAULT_FEEDBACK_MSG_FN (line 183) | def DEFAULT_FEEDBACK_MSG_FN(game): FILE: evals/elsuite/make_me_say/eval.py class MakeMeSay (line 10) | class MakeMeSay(evals.Eval): method __init__ (line 11) | def __init__(self, *args, **kwargs): method eval_sample (line 25) | def eval_sample(self, sample: dict, rng) -> None: method run (line 47) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/make_me_say/makemesay_test.py function example_player_completion_fn (line 11) | def example_player_completion_fn() -> Any: function example_judge_completion_fn (line 19) | def example_judge_completion_fn() -> Any: function example_bot_manipulator (line 27) | def example_bot_manipulator(example_player_completion_fn): function example_bot_manipulatee (line 36) | def example_bot_manipulatee(example_player_completion_fn): function example_bot_judge (line 45) | def example_bot_judge(example_judge_completion_fn): function empty_bot_vs_bot_game (line 54) | def empty_bot_vs_bot_game( function finished_bot_vs_bot_game (line 73) | def finished_bot_vs_bot_game(empty_bot_vs_bot_game: Game) -> Game: function _is_api_key_set (line 104) | def _is_api_key_set() -> bool: function test_player_cant_play_against_themselves (line 109) | def test_player_cant_play_against_themselves( function test_current_player_has_to_be_a_player_in_the_game (line 125) | def test_current_player_has_to_be_a_player_in_the_game( function test_word_list_has_to_have_at_least_one_word (line 142) | def test_word_list_has_to_have_at_least_one_word( function test_able_to_set_current_player (line 159) | def test_able_to_set_current_player(empty_bot_vs_bot_game: Game) -> None: function test_able_to_add_messages (line 172) | def test_able_to_add_messages(empty_bot_vs_bot_game: Game) -> None: function test_able_to_view_messages_from_manipulators_and_manipulatees_point_of_view (line 188) | def test_able_to_view_messages_from_manipulators_and_manipulatees_point_... function test_internal_monologue_is_able_to_be_removed (line 268) | def test_internal_monologue_is_able_to_be_removed(empty_bot_vs_bot_game:... function test_manipulatee_cant_see_internal_monologue_of_manipulator (line 283) | def test_manipulatee_cant_see_internal_monologue_of_manipulator( function test_winning_game_is_classified_as_a_win (line 323) | def test_winning_game_is_classified_as_a_win(empty_bot_vs_bot_game: Game... function test_violating_game_is_classified_as_a_violation (line 364) | def test_violating_game_is_classified_as_a_violation( function test_system_messages_are_ignored_in_violation_check (line 381) | def test_system_messages_are_ignored_in_violation_check( function test_system_messages_are_ignored_in_win_check (line 400) | def test_system_messages_are_ignored_in_win_check( function test_internal_monologue_is_ignored_in_violation_check (line 443) | def test_internal_monologue_is_ignored_in_violation_check( function test_internal_monologue_is_ignored_in_win_check (line 464) | def test_internal_monologue_is_ignored_in_win_check( function test_loss_game_is_classified_as_a_loss (line 510) | def test_loss_game_is_classified_as_a_loss(empty_bot_vs_bot_game: Game) ... function test_abbreviations_are_accepted_as_substitute_for_codeword (line 540) | def test_abbreviations_are_accepted_as_substitute_for_codeword( function test_spelled_out_codeword_is_accepted_as_substitute_for_codeword (line 551) | def test_spelled_out_codeword_is_accepted_as_substitute_for_codeword( function test_codeword_surrounded_by_quotes_counts_as_a_mention (line 561) | def test_codeword_surrounded_by_quotes_counts_as_a_mention( function test_codeword_appearing_as_substring_doesnt_count_as_a_mention (line 571) | def test_codeword_appearing_as_substring_doesnt_count_as_a_mention( function test_codeword_is_case_insenstive (line 581) | def test_codeword_is_case_insenstive(empty_bot_vs_bot_game: Game) -> None: function test_error_is_thrown_when_word_list_is_empty (line 590) | def test_error_is_thrown_when_word_list_is_empty(empty_bot_vs_bot_game: ... function test_error_is_thrown_when_codeword_is_empty_string (line 596) | def test_error_is_thrown_when_codeword_is_empty_string( function test_error_is_thrown_when_codeword_isnt_alphabetical (line 604) | def test_error_is_thrown_when_codeword_isnt_alphabetical( function test_codeword_appearing_as_part_of_contraction_doesnt_count_as_mention (line 621) | def test_codeword_appearing_as_part_of_contraction_doesnt_count_as_mention( function test_codeword_appearing_in_another_language_doesnt_count_as_mention (line 631) | def test_codeword_appearing_in_another_language_doesnt_count_as_mention( function test_symbol_representation_of_codeword_doesnt_count_as_mention (line 641) | def test_symbol_representation_of_codeword_doesnt_count_as_mention( function test_cant_create_game_with_non_positive_number_of_turns (line 651) | def test_cant_create_game_with_non_positive_number_of_turns( function test_cant_create_game_with_negative_turn (line 680) | def test_cant_create_game_with_negative_turn( function test_both_start_and_end_internal_monologue_tokens_must_be_set_if_one_is (line 699) | def test_both_start_and_end_internal_monologue_tokens_must_be_set_if_one... function test_both_internal_monologue_tokens_cant_be_empty_strings (line 730) | def test_both_internal_monologue_tokens_cant_be_empty_strings( function test_game_mentioning_codeword_which_exceeds_max_turns_is_a_win (line 764) | def test_game_mentioning_codeword_which_exceeds_max_turns_is_a_win( function test_game_where_user_says_conversation_was_manipulative_is_marked_as_such (line 809) | def test_game_where_user_says_conversation_was_manipulative_is_marked_as... function test_game_where_user_doesnt_say_conversation_was_manipulative_is_not_marked_as_such (line 833) | def test_game_where_user_doesnt_say_conversation_was_manipulative_is_not... function test_game_with_more_than_one_codeword_cant_be_created (line 857) | def test_game_with_more_than_one_codeword_cant_be_created( function test_game_where_user_mentions_codeword_after_manipulator_but_user_is_aware_of_codeword_is_a_loss (line 876) | def test_game_where_user_mentions_codeword_after_manipulator_but_user_is... FILE: evals/elsuite/make_me_say/utils.py function openai_chatcompletion_create (line 23) | def openai_chatcompletion_create(*args, **kwargs): function get_completion (line 29) | def get_completion(prompt, model_name): function get_completion_fn (line 36) | def get_completion_fn(model_name: str) -> Callable[[Union[str, list[dict... function get_content (line 40) | def get_content(response: Union[dict, CompletionResult]) -> str: FILE: evals/elsuite/mmmu/eval.py class Sample (line 21) | class Sample(BaseModel): class Config (line 34) | class Config: function get_dataset (line 38) | def get_dataset(url: str) -> list[Sample]: class MMMU (line 67) | class MMMU(evals.Eval): method __init__ (line 68) | def __init__( method eval_sample (line 81) | def eval_sample(self, sample: Sample, rng): method run (line 177) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/modelgraded/base.py class ModelGradedSpec (line 12) | class ModelGradedSpec: FILE: evals/elsuite/modelgraded/classify.py class ModelBasedClassify (line 14) | class ModelBasedClassify(evals.Eval): method __init__ (line 15) | def __init__( method eval_sample (line 53) | def eval_sample(self, test_sample: dict, rng: Random) -> None: method run (line 104) | def run(self, recorder): FILE: evals/elsuite/modelgraded/classify_utils.py function get_choice_strings (line 37) | def get_choice_strings(choice_strings: Union[list[str], str], n: Optiona... function classify (line 51) | def classify( function get_choice_score (line 90) | def get_choice_score( function choice_to_str (line 105) | def choice_to_str(choice_strings: Iterable[str]) -> str: function get_choice (line 110) | def get_choice( function append_answer_prompt (line 131) | def append_answer_prompt( function sample_and_concat_n_completions (line 152) | def sample_and_concat_n_completions( function concat_n_completions (line 175) | def concat_n_completions(completions: Iterable[str], template_i: str) ->... FILE: evals/elsuite/multiple_choice.py class Sample (line 14) | class Sample(BaseModel): function get_dataset (line 20) | def get_dataset(url: str) -> list[Sample]: class MultipleChoice (line 51) | class MultipleChoice(evals.Eval): method __init__ (line 52) | def __init__( method eval_sample (line 65) | def eval_sample(self, sample, rng): method run (line 95) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py function index (line 34) | def index(): function setup (line 39) | def setup(): function shutdown (line 69) | def shutdown(): function exec_command (line 87) | def exec_command(): function exec_commands (line 132) | def exec_commands(): function _execute_command (line 175) | def _execute_command(json_data: dict): function _execute_commands (line 195) | def _execute_commands(json_data: dict): function ensure_api_key (line 206) | def ensure_api_key(request): FILE: evals/elsuite/multistep_web_tasks/docker/homepage/app.py function index (line 7) | def index() -> str: function scratchpad (line 12) | def scratchpad() -> str: function calculator (line 17) | def calculator() -> str: function password (line 22) | def password() -> str: FILE: evals/elsuite/multistep_web_tasks/eval.py class MultistepWebTasks (line 20) | class MultistepWebTasks(SolverEval): method __init__ (line 21) | def __init__( method eval_sample (line 38) | def eval_sample(self, solver: Solver, sample: dict, rng: Any) -> None: method run (line 50) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/multistep_web_tasks/reproducibility/make_plots.py function main (line 30) | def main(): class MWTTaskOutcome (line 44) | class MWTTaskOutcome: function load_mwt_results_from_dir (line 54) | def load_mwt_results_from_dir(log_dir: Union[str, Path]) -> pd.DataFrame: function build_task_outcomes (line 61) | def build_task_outcomes(log_dir: Union[str, Path]) -> list[MWTTaskOutcome]: function build_task_outcome (line 75) | def build_task_outcome(spec: dict, final_results: dict, path: Path) -> M... function _get_attempt_number (line 90) | def _get_attempt_number(path: str) -> int: function make_plot (line 102) | def make_plot(df: pd.DataFrame, out_dir: Path) -> None: FILE: evals/elsuite/multistep_web_tasks/reproducibility/make_task_jsonl.py function main (line 5) | def main(): function select_tasks_by_id (line 27) | def select_tasks_by_id(all_tasks: list[dict], task_ids: list[int]): function build_and_write_individual_tasks (line 31) | def build_and_write_individual_tasks(all_tasks: list[dict], data_dir: Pa... function build_easy_tasks (line 37) | def build_easy_tasks(all_tasks: list[dict]) -> list[dict]: function build_medium_tasks (line 42) | def build_medium_tasks(all_tasks: list[dict]) -> list[dict]: function build_hard_tasks (line 47) | def build_hard_tasks(all_tasks: list[dict]) -> list[dict]: function write_jsonl (line 52) | def write_jsonl(outfile: Path, json_objects: list[dict]) -> None: FILE: evals/elsuite/multistep_web_tasks/session.py class Session (line 27) | class Session: method __init__ (line 28) | def __init__(self, docker_client: docker.DockerClient) -> None: # typ... method add_samples (line 33) | def add_samples(self, samples: list[dict]) -> None: method __enter__ (line 36) | def __enter__(self): method __exit__ (line 57) | def __exit__(self, *args): method get_container (line 71) | def get_container(self, container_name: str) -> docker.models.containe... method register_container (line 78) | def register_container(self, container_name: ServiceIdentifier, contai... method setup_docker_environments (line 84) | def setup_docker_environments(self) -> dict[ServiceIdentifier, docker.... method setup_network (line 93) | def setup_network(self) -> docker.models.networks.Network: # type: ig... method setup_container (line 113) | def setup_container(self, container_name: str) -> docker.models.contai... method _setup_bash_environment (line 146) | def _setup_bash_environment(self) -> docker.models.containers.Containe... method _setup_homepage_environment (line 157) | def _setup_homepage_environment(self) -> docker.models.containers.Cont... method _setup_flask_playwright_environment (line 166) | def _setup_flask_playwright_environment(self) -> docker.models.contain... method _setup_simpleweb_environment (line 175) | def _setup_simpleweb_environment(self) -> docker.models.containers.Con... method _setup_shopping_environment (line 183) | def _setup_shopping_environment(self) -> docker.models.containers.Cont... method _setup_shopping_admin_environment (line 213) | def _setup_shopping_admin_environment(self) -> docker.models.container... method _setup_reddit_environment (line 244) | def _setup_reddit_environment(self) -> docker.models.containers.Contai... method _setup_gitlab_environment (line 254) | def _setup_gitlab_environment(self) -> docker.models.containers.Contai... method _setup_wikipedia_environment (line 268) | def _setup_wikipedia_environment(self) -> docker.models.containers.Con... method _run_container_setup (line 295) | def _run_container_setup( method _get_image (line 349) | def _get_image( method build_image_from_dockerfile (line 405) | def build_image_from_dockerfile(self, dockerfile_dir: str, image_name:... method _get_containers_to_setup (line 429) | def _get_containers_to_setup(self, samples) -> set[str]: method teardown_network (line 440) | def teardown_network(self) -> None: method teardown_docker_environments (line 443) | def teardown_docker_environments(self) -> None: method teardown_container (line 456) | def teardown_container(self, container_name: ServiceIdentifier) -> Non... method _is_container_ready (line 469) | def _is_container_ready( function download_to_file (line 512) | def download_to_file(url: str, path: Path) -> None: FILE: evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py class StrongSolver (line 30) | class StrongSolver(Solver): method __init__ (line 33) | def __init__( method encoding (line 56) | def encoding(self) -> tiktoken.Encoding: method _get_encoding (line 61) | def _get_encoding(self) -> tiktoken.Encoding: method _get_context_length (line 73) | def _get_context_length(self) -> int: method _solve (line 83) | def _solve( method _add_action_splitter_to_actions (line 114) | def _add_action_splitter_to_actions(self, messages: list[Message]) -> ... method _cut_messages_to_fit (line 126) | def _cut_messages_to_fit(self, messages: OpenAICreateChatPrompt) -> Op... method _get_new_observation_from_task_state (line 175) | def _get_new_observation_from_task_state(self, task_state: MWTTaskStat... method _get_previous_action_from_task_state (line 179) | def _get_previous_action_from_task_state(self, task_state: MWTTaskStat... method _extract_action (line 186) | def _extract_action(self, response: str) -> str: method name (line 199) | def name(self) -> str: function main (line 204) | def main(): FILE: evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_solvers.py class WebArenaSolver (line 24) | class WebArenaSolver(Solver): method __init__ (line 28) | def __init__( method __call__ (line 40) | def __call__( method extract_action (line 47) | def extract_action(self, response: str) -> str: class BrowserWebArenaSolver (line 61) | class BrowserWebArenaSolver(WebArenaSolver): method __call__ (line 62) | def __call__( class CoTBrowserWebArenaSolver (line 70) | class CoTBrowserWebArenaSolver(BrowserWebArenaSolver): method __call__ (line 71) | def __call__( method name (line 94) | def name(self) -> str: class CoTBashBrowserWebArenaSolver (line 98) | class CoTBashBrowserWebArenaSolver(BrowserWebArenaSolver): method __call__ (line 99) | def __call__( method name (line 122) | def name(self) -> str: FILE: evals/elsuite/multistep_web_tasks/utils.py class MWTTaskState (line 19) | class MWTTaskState(TaskState): function load_experiment_config_from_file (line 28) | def load_experiment_config_from_file(experiment_config_path: str) -> Exp... function load_experiment_config_from_dict (line 34) | def load_experiment_config_from_dict(experiment_config_dict: dict[str, A... FILE: evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py class BashBrowserEnv (line 24) | class BashBrowserEnv(LLMAgentEnv): method __init__ (line 31) | def __init__( method page (line 65) | def page(self): method reset (line 69) | def reset( method setup (line 84) | def setup(self, experiment_config: BashBrowserExperimentConfig) -> None: method step (line 91) | def step(self, action: Union[BashAction, BrowserAction]) -> BashBrowse... method parse_action_string (line 100) | def parse_action_string(self, action_string: str) -> Union[BashAction,... method close (line 113) | def close(self): FILE: evals/elsuite/multistep_web_tasks/webarena/bash_env/actions.py class BashAction (line 9) | class BashAction(Action): class BashCommandAction (line 14) | class BashCommandAction(BashAction): class BashStopAction (line 20) | class BashStopAction(BashAction): function bash_is_equivalent (line 26) | def bash_is_equivalent(a_action: BashAction, b_action: BashAction) -> bool: FILE: evals/elsuite/multistep_web_tasks/webarena/bash_env/bash_utils.py class BashObservation (line 7) | class BashObservation(Observation): method data (line 11) | def data(self) -> str: class BashEnvOutput (line 16) | class BashEnvOutput(EnvOutput): FILE: evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py class BashEnv (line 27) | class BashEnv(LLMAgentEnv): method __init__ (line 28) | def __init__( method reset (line 41) | def reset(self, experiment_config: Optional[BashExperimentConfig] = No... method _create_container_wrapper (line 63) | def _create_container_wrapper(self, session: Session) -> "BashContaine... method setup (line 70) | def setup(self, experiment_config: BashExperimentConfig) -> None: method step (line 82) | def step(self, action: BashAction) -> BashEnvOutput: method parse_action_string (line 106) | def parse_action_string(self, action_string: str) -> BashAction: method close (line 139) | def close(self): class BashContainerWrapper (line 143) | class BashContainerWrapper: method __init__ (line 144) | def __init__( method _setup (line 157) | def _setup(self, container): method run_command (line 170) | def run_command(self, command: str) -> str: method run_commands (line 180) | def run_commands(self, commands: list[str]) -> list[str]: method _wrap_command (line 187) | def _wrap_command(self, command: str) -> str: method shutdown (line 194) | def shutdown(self): FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py class ParsedPlaywrightCode (line 56) | class ParsedPlaywrightCode(TypedDict): function is_in_viewport (line 63) | def is_in_viewport(element: Locator, viewport: ViewportSize, threshold: ... function async_is_in_viewport (line 81) | async def async_is_in_viewport( class BrowserActionDict (line 99) | class BrowserActionDict(TypedDict): class BrowserAction (line 117) | class BrowserAction(Action): function action2str (line 122) | def action2str( function action2create_function (line 188) | def action2create_function(action: BrowserAction) -> str: class BrowserActionTypes (line 263) | class BrowserActionTypes(IntEnum): method __str__ (line 295) | def __str__(self) -> str: function is_equivalent (line 300) | def is_equivalent(action: Action, other_action: Action) -> bool: function bash_is_equivalent (line 319) | def bash_is_equivalent(action: BashAction, other_action: BashAction) -> ... function browser_is_equivalent (line 330) | def browser_is_equivalent(a_action: BrowserAction, b_action: BrowserActi... function _keys2ids (line 394) | def _keys2ids(keys: Union[list[Union[int, str]], str]) -> list[int]: function get_action_space (line 403) | def get_action_space() -> spaces.Dict: function create_random_action (line 435) | def create_random_action() -> BrowserAction: function create_none_action (line 473) | def create_none_action() -> BrowserAction: function create_stop_action (line 499) | def create_stop_action(answer: str) -> BrowserAction: function create_scroll_action (line 507) | def create_scroll_action(direction: str) -> BrowserAction: function create_mouse_hover_action (line 521) | def create_mouse_hover_action( function create_key_press_action (line 536) | def create_key_press_action(key_comb: str) -> BrowserAction: function create_page_focus_action (line 559) | def create_page_focus_action(page_number: int) -> BrowserAction: function create_new_tab_action (line 572) | def create_new_tab_action() -> BrowserAction: function create_go_back_action (line 584) | def create_go_back_action() -> BrowserAction: function create_go_forward_action (line 596) | def create_go_forward_action() -> BrowserAction: function create_goto_url_action (line 608) | def create_goto_url_action(url: str) -> BrowserAction: function create_page_close_action (line 621) | def create_page_close_action() -> BrowserAction: function create_mouse_click_action (line 633) | def create_mouse_click_action( function create_keyboard_type_action (line 657) | def create_keyboard_type_action(keys: Union[list[Union[int, str]], str])... function create_click_action (line 670) | def create_click_action( function create_hover_action (line 692) | def create_hover_action( function create_type_action (line 714) | def create_type_action( function create_check_action (line 738) | def create_check_action(pw_code: str) -> BrowserAction: function create_select_option_action (line 750) | def create_select_option_action( function create_focus_action (line 764) | def create_focus_action( function create_focus_and_click_action (line 783) | def create_focus_and_click_action( function create_focus_and_type_action (line 803) | def create_focus_and_type_action( function execute_scroll (line 826) | def execute_scroll(direction: str, page: PageForwarder) -> None: function execute_key_press (line 840) | def execute_key_press(key: str, page: PageForwarder) -> None: function execute_mouse_hover (line 848) | def execute_mouse_hover(left: float, top: float, page: PageForwarder) ->... function execute_mouse_click (line 855) | def execute_mouse_click(left: float, top: float, page: PageForwarder) ->... function execute_keyboard_type (line 863) | def execute_keyboard_type(text: str, page: PageForwarder) -> None: function execute_click_current (line 869) | def execute_click_current(page: PageForwarder) -> None: function execute_type (line 875) | def execute_type(keys: list[int], page: PageForwarder) -> None: function execute_focus (line 882) | def execute_focus(element_role: int, element_name: str, nth: int, page: ... function locate (line 888) | def locate(locator_calls: list[ParsedPlaywrightCode], page: PageForwarde... function execute_playwright_click (line 899) | def execute_playwright_click( function execute_playwright_hover (line 912) | def execute_playwright_hover(locator_code: list[ParsedPlaywrightCode], p... function execute_playwright_type (line 920) | def execute_playwright_type( function execute_playwright_select_option (line 934) | def execute_playwright_select_option( function execute_playwright_check (line 946) | def execute_playwright_check(locator_code: list[ParsedPlaywrightCode], p... function execute_action (line 953) | def execute_action( function parse_playwright_code (line 1060) | def parse_playwright_code(code: str) -> list[ParsedPlaywrightCode]: class ActionParsingError (line 1112) | class ActionParsingError(Exception): method __init__ (line 1113) | def __init__(self, message: str) -> None: function create_playwright_action (line 1119) | def create_playwright_action(playwright_code: str) -> BrowserAction: function create_id_based_action (line 1188) | def create_id_based_action(action_str: str) -> BrowserAction: FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/auto_login.py function is_expired (line 23) | def is_expired(storage_state: Path, url: str, keyword: str, url_exact: b... function renew_comb (line 47) | def renew_comb(comb: list[str]) -> None: function main (line 94) | def main() -> None: FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py class BrowserEnv (line 36) | class BrowserEnv(LLMAgentEnv): method __init__ (line 49) | def __init__( method sync_playwright_api (line 91) | def sync_playwright_api(self, experiment_config: BrowserExperimentConf... method setup (line 112) | def setup(self, experiment_config: BrowserExperimentConfig) -> None: method parse_action_string (line 119) | def parse_action_string(self, action_string: str) -> BrowserAction: method get_page_client (line 126) | def get_page_client(self, page: PageForwarder) -> ClientForwarder: method _get_obs (line 130) | def _get_obs(self) -> BrowserObservation: method _get_obs_metadata (line 135) | def _get_obs_metadata(self) -> dict[str, ObservationMetadata]: method reset (line 140) | def reset( method save_trace (line 184) | def save_trace(self, trace_path: Union[str, Path]) -> None: method close (line 188) | def close(self) -> None: method step (line 193) | def step(self, action: BrowserAction) -> BrowserEnvOutput: FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/browser_utils.py class DetachedPage (line 14) | class DetachedPage: function png_bytes_to_numpy (line 20) | def png_bytes_to_numpy(png: bytes) -> npt.NDArray[np.uint8]: class AccessibilityTreeNode (line 31) | class AccessibilityTreeNode(TypedDict): class BrowserWindowConfig (line 47) | class BrowserWindowConfig(TypedDict): class PageInfo (line 58) | class PageInfo(Info): class BrowserState (line 64) | class BrowserState(TypedDict): class BrowserObservation (line 73) | class BrowserObservation(Observation): method data (line 79) | def data(self): method __repr__ (line 82) | def __repr__(self): class HtmlBrowserObservation (line 87) | class HtmlBrowserObservation(BrowserObservation): method data (line 89) | def data(self): class AccTreeBrowserObservation (line 94) | class AccTreeBrowserObservation(BrowserObservation): method data (line 96) | def data(self): class ImageBrowserObservation (line 101) | class ImageBrowserObservation(BrowserObservation): method data (line 103) | def data(self): class BrowserEnvOutput (line 108) | class BrowserEnvOutput(EnvOutput): FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/helper_functions.py function get_render_action (line 38) | def get_render_action( function get_action_description (line 63) | def get_action_description( class RenderHelper (line 105) | class RenderHelper(object): method __init__ (line 108) | def __init__(self, config_file: str, result_dir: str, action_set_tag: ... method render (line 126) | def render( method close (line 180) | def close(self) -> None: FILE: evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py class ObservationProcessor (line 30) | class ObservationProcessor: method process (line 31) | def process(self, page: Page, client: CDPSession) -> Observation: class ObservationMetadata (line 35) | class ObservationMetadata(TypedDict): function create_empty_metadata (line 39) | def create_empty_metadata() -> ObservationMetadata: class TextObervationProcessor (line 45) | class TextObervationProcessor(ObservationProcessor): method __init__ (line 46) | def __init__( method fetch_browser_info (line 59) | def fetch_browser_info( method partially_in_viewport (line 110) | def partially_in_viewport(bound: list[float], config: BrowserWindowCon... method retrieve_viewport_info (line 127) | def retrieve_viewport_info(self, info: BrowserState) -> None: method current_viewport_html (line 204) | def current_viewport_html(self, info: BrowserState) -> str: method fetch_page_accessibility_tree (line 268) | def fetch_page_accessibility_tree( method current_viewport_accessibility_tree (line 361) | def current_viewport_accessibility_tree( method parse_accessibility_tree (line 392) | def parse_accessibility_tree( method clean_accesibility_tree (line 482) | def clean_accesibility_tree(tree_str: str) -> str: method process (line 501) | def process(self, page: PageForwarder, client: ClientForwarder) -> dic... method get_element_center (line 550) | def get_element_center(self, element_id: str) -> tuple[float, float]: class ImageObservationProcessor (line 567) | class ImageObservationProcessor(ObservationProcessor): method __init__ (line 568) | def __init__(self, observation_type: str): method process (line 573) | def process(self, page: PageForwarder, client: ClientForwarder) -> npt... class ObservationHandler (line 577) | class ObservationHandler: method __init__ (line 580) | def __init__( method get_observation_space (line 596) | def get_observation_space(self) -> type[BrowserObservation]: method get_observation (line 600) | def get_observation(self, page: PageForwarder, client: ClientForwarder... method get_observation_metadata (line 612) | def get_observation_metadata(self) -> dict[str, ObservationMetadata]: method action_processor (line 619) | def action_processor(self) -> ObservationProcessor: FILE: evals/elsuite/multistep_web_tasks/webarena/core/env.py class Action (line 20) | class Action(ABC): class ParsingErrorAction (line 30) | class ParsingErrorAction(Action): class Observation (line 37) | class Observation(ABC): method data (line 39) | def data(self) -> Any: class DummyObservation (line 45) | class DummyObservation(Observation): method data (line 46) | def data(self) -> Any: class Info (line 50) | class Info(ABC): class EnvOutput (line 55) | class EnvOutput: class TrajectoryStep (line 66) | class TrajectoryStep(NamedTuple): class Trajectory (line 71) | class Trajectory(list[TrajectoryStep]): method __init__ (line 74) | def __init__(self, iterable: list[TrajectoryStep]): method pretty_string (line 78) | def pretty_string(self) -> str: class LLMAgentEnv (line 96) | class LLMAgentEnv(ABC, Env[Observation, Action]): method reset (line 102) | def reset( method step (line 112) | def step(self, action: Action) -> EnvOutput: method parse_action_string (line 116) | def parse_action_string(self, action_string: str) -> Action: method close (line 120) | def close(self) -> None: class ExperimentResult (line 125) | class ExperimentResult(ABC): FILE: evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py class Forwarder (line 35) | class Forwarder(ABC): method __init__ (line 39) | def __init__(self, container: docker.models.containers.Container) -> N... method execute_command (line 43) | def execute_command(self, command: str, n_allowed_attempts: int = 1) -... method make_request (line 51) | def make_request( method _double_quotes_to_single_quotes (line 92) | def _double_quotes_to_single_quotes(self, expression: str) -> str: method _escape_newlines (line 101) | def _escape_newlines(self, command: str) -> str: method server_url_to_client_url (line 105) | def server_url_to_client_url(self, server_url: str) -> str: method client_url_to_server_url (line 135) | def client_url_to_server_url(self, client_url: str) -> str: method _process_response (line 158) | def _process_response(self, output: dict) -> Optional[dict]: method _escape_quotes_in_json_string (line 164) | def _escape_quotes_in_json_string(self, json_string: str) -> str: class PageForwarder (line 168) | class PageForwarder(Forwarder): method __init__ (line 171) | def __init__( method url (line 184) | def url(self) -> str: method setup (line 190) | def setup(self) -> None: method shutdown (line 198) | def shutdown(self) -> None: method content (line 206) | def content(self) -> str: method goto (line 213) | def goto(self, url: str) -> None: method title (line 232) | def title(self) -> str: method evaluate (line 238) | def evaluate(self, expression: str) -> str: method go_back (line 245) | def go_back(self) -> None: method go_forward (line 249) | def go_forward(self) -> None: method fetch_domtree (line 253) | def fetch_domtree(self) -> dict: method fetch_browser_window_config (line 275) | def fetch_browser_window_config(self) -> BrowserWindowConfig: method fetch_browser_info (line 310) | def fetch_browser_info(self) -> BrowserState: method wait_for_load_state (line 315) | def wait_for_load_state(self, state: str, timeout: int = 500) -> None: method wait_for_event (line 323) | def wait_for_event(self, event: str, timeout: int = 500) -> None: class ClientForwarder (line 331) | class ClientForwarder(Forwarder): method __init__ (line 334) | def __init__(self, page: PageForwarder) -> None: method send (line 338) | def send(self, method: str, params: dict) -> dict: class MouseForwarder (line 346) | class MouseForwarder(Forwarder): method __init__ (line 347) | def __init__(self, page: PageForwarder) -> None: method click (line 351) | def click(self, x: float, y: float) -> None: method move (line 355) | def move(self, x: float, y: float) -> None: class KeyboardForwarder (line 360) | class KeyboardForwarder(Forwarder): method __init__ (line 361) | def __init__(self, page: PageForwarder) -> None: method type (line 365) | def type(self, text: str) -> None: method press (line 371) | def press(self, key: str) -> None: class ViewportSize (line 376) | class ViewportSize(TypedDict): FILE: evals/elsuite/multistep_web_tasks/webarena/core/utils.py class EarlyStopConfig (line 7) | class EarlyStopConfig: class ProgramHTML (line 13) | class ProgramHTML(TypedDict): class ReferenceAnswers (line 19) | class ReferenceAnswers(TypedDict): class EvaluatorConfig (line 26) | class EvaluatorConfig: class ExperimentConfig (line 37) | class ExperimentConfig(ABC): class BashExperimentConfig (line 44) | class BashExperimentConfig(ExperimentConfig): method from_dict (line 53) | def from_dict(cls, data: dict) -> "BashExperimentConfig": method to_dict (line 62) | def to_dict(self) -> dict: class BrowserExperimentConfig (line 73) | class BrowserExperimentConfig(ExperimentConfig): method from_dict (line 100) | def from_dict(cls, data: dict[str, Any]) -> "BrowserExperimentConfig": method to_dict (line 117) | def to_dict(self) -> dict[str, Any]: class BashBrowserExperimentConfig (line 136) | class BashBrowserExperimentConfig(ExperimentConfig): method to_separate_configs (line 165) | def to_separate_configs(self) -> tuple[BashExperimentConfig, BrowserEx... method from_dict (line 191) | def from_dict(cls, data: dict[str, Any]) -> "BashBrowserExperimentConf... method to_dict (line 208) | def to_dict(self) -> dict[str, Any]: FILE: evals/elsuite/multistep_web_tasks/webarena/eval_run.py function config (line 45) | def config() -> argparse.Namespace: function run_experiment (line 111) | def run_experiment( function generate_trajectory (line 127) | def generate_trajectory( function _task_state_from_trajectory (line 164) | def _task_state_from_trajectory( function _messages_from_trajectory (line 202) | def _messages_from_trajectory(trajectory: Trajectory) -> list[Message]: function _episode_should_continue (line 218) | def _episode_should_continue(trajectory: Trajectory, early_stop_config: ... function evaluate_trajectory (line 236) | def evaluate_trajectory( function record_result (line 255) | def record_result( function setup_env (line 266) | def setup_env( function setup_browser_env (line 283) | def setup_browser_env( function setup_bash_env (line 303) | def setup_bash_env( function setup_bash_browser_env (line 311) | def setup_bash_browser_env( function should_early_stop (line 332) | def should_early_stop(trajectory: Trajectory, es_config: EarlyStopConfig... function _check_repeated_equivalent_actions (line 348) | def _check_repeated_equivalent_actions(trajectory: Trajectory, repeating... FILE: evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py class Evaluator (line 40) | class Evaluator(object): method __init__ (line 41) | def __init__(self, eval_tag: str = "") -> None: method __call__ (line 44) | def __call__( method get_last_action (line 53) | def get_last_action(trajectory: Trajectory) -> Action: method get_last_state (line 66) | def get_last_state(trajectory: Trajectory) -> EnvOutput: function get_answer_from_action (line 78) | def get_answer_from_action(action: Action) -> str: class StringEvaluator (line 91) | class StringEvaluator(Evaluator): method __call__ (line 98) | def __call__( class StringSoftEvaluator (line 137) | class StringSoftEvaluator(Evaluator): method __call__ (line 140) | def __call__( class BrowserEvaluator (line 155) | class BrowserEvaluator(Evaluator): class URLExactEvaluator (line 160) | class URLExactEvaluator(BrowserEvaluator): method __call__ (line 163) | def __call__( class HTMLContentExactEvaluator (line 202) | class HTMLContentExactEvaluator(BrowserEvaluator): method __call__ (line 205) | def __call__( class EvaluatorComb (line 263) | class EvaluatorComb(ABC): method __init__ (line 264) | def __init__(self, evaluators: list[Evaluator]) -> None: method __call__ (line 267) | def __call__( class BrowserEvaluatorComb (line 281) | class BrowserEvaluatorComb(EvaluatorComb): method __init__ (line 282) | def __init__(self, evaluators: list[BrowserEvaluator]) -> None: function evaluator_router (line 287) | def evaluator_router(experiment_config: ExperimentConfig) -> EvaluatorComb: function browser_evaluator_router (line 302) | def browser_evaluator_router( function bash_evaluator_router (line 323) | def bash_evaluator_router( function bash_browser_evaluator_router (line 339) | def bash_browser_evaluator_router( FILE: evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/helper_functions.py function shopping_get_auth_token (line 15) | def shopping_get_auth_token() -> str: function shopping_get_latest_order_url (line 31) | def shopping_get_latest_order_url() -> str: function shopping_get_sku_latest_review_author (line 54) | def shopping_get_sku_latest_review_author(sku: str) -> str: function shopping_get_sku_latest_review_rating (line 70) | def shopping_get_sku_latest_review_rating(sku: str) -> str: function reddit_get_post_url (line 87) | def reddit_get_post_url(url: str) -> str: function gitlab_get_project_memeber_role (line 106) | def gitlab_get_project_memeber_role(page: Page, account_name: str) -> str: function llm_fuzzy_match (line 138) | def llm_fuzzy_match(pred: str, reference: str, question: str) -> float: FILE: evals/elsuite/sandbagging/mmlu_eval.py class MMLU (line 14) | class MMLU(SolverEval): method __init__ (line 15) | def __init__( method eval_sample (line 27) | def eval_sample( method run (line 64) | def run(self, recorder: evals.record.Recorder): FILE: evals/elsuite/sandbagging/sandbagging_eval.py class Sandbagging (line 16) | class Sandbagging(SolverEval): method __init__ (line 22) | def __init__( method eval_sample (line 46) | def eval_sample(self, solver: Solver, sample: Dict[str, Any], rng: ran... method run (line 56) | def run(self, recorder: evals.record.Recorder): FILE: evals/elsuite/sandbagging/scripts/consistency_plots.py function load_jsonl_log (line 11) | def load_jsonl_log(path): function get_questions (line 33) | def get_questions(files: Sequence[Path]) -> dict: function get_answers_for_each_question (line 54) | def get_answers_for_each_question(files: Sequence[Path], questions: dict): function create_total_no_correct_hist (line 79) | def create_total_no_correct_hist(files: Sequence[Path], questions: dict,... function calculate_consistency (line 96) | def calculate_consistency(files: Sequence[Path]): function main (line 124) | def main(): FILE: evals/elsuite/sandbagging/scripts/sandbagging_all_plots.py function main (line 7) | def main(): FILE: evals/elsuite/sandbagging/scripts/sandbagging_subset_plots.py function main (line 7) | def main(): FILE: evals/elsuite/sandbagging/scripts/utils.py function extract_results (line 12) | def extract_results(datadir: Path) -> pd.DataFrame: function compute_all_model_scores (line 29) | def compute_all_model_scores(model_log: dict, thresh_targets: bool) -> f... function make_sandbagging_plot (line 51) | def make_sandbagging_plot( FILE: evals/elsuite/sandbagging/solvers.py class StrategySolver (line 26) | class StrategySolver(Solver): method __init__ (line 32) | def __init__( method name (line 48) | def name(self) -> str: method _predict_answer (line 52) | def _predict_answer(self, prompt: Sequence[Dict], **kwargs) -> str: method _produce_scratchpad (line 56) | def _produce_scratchpad(self, prompt: Sequence[Dict]): method modify_target (line 62) | def modify_target(self, target: int): method _add_ans_to_memory (line 70) | def _add_ans_to_memory(self, prompt: Sequence[Dict], response: str): method _construct_prompt (line 82) | def _construct_prompt(self, task_state: TaskState) -> Sequence[Dict]: method _solve (line 129) | def _solve(self, task_state: TaskState, **kwargs) -> (Sequence[Dict], ... class LMSolver (line 140) | class LMSolver(StrategySolver): method __init__ (line 141) | def __init__( method name (line 167) | def name(self) -> str: method _produce_scratchpad (line 170) | def _produce_scratchpad(self, prompt: Sequence[Dict]): method _predict_answer (line 176) | def _predict_answer(self, prompt: Sequence[Dict], **kwargs) -> str: class CLISolver (line 185) | class CLISolver(StrategySolver): method __init__ (line 190) | def __init__( method name (line 208) | def name(self) -> str: method _produce_scratchpad (line 211) | def _produce_scratchpad(self, prompt: Sequence[Dict]): method _predict_answer (line 217) | def _predict_answer(self, prompt: Sequence[Dict], **kwargs) -> str: FILE: evals/elsuite/sandbagging/utils.py function get_violation_events (line 7) | def get_violation_events(events: Sequence[Event], valid_samples: Sequenc... function record_mmlu_sample (line 20) | def record_mmlu_sample( function get_difficulty (line 51) | def get_difficulty(events: Sequence[Event]) -> float: function compute_sandbagging_score (line 64) | def compute_sandbagging_score( FILE: evals/elsuite/schelling_point/eval.py class SchellingPoint (line 14) | class SchellingPoint(Eval): method __init__ (line 15) | def __init__( method eval_sample (line 42) | def eval_sample(self, sample: Any, *_): method run (line 78) | def run(self, recorder: evals.record.Recorder) -> dict[str, Union[floa... FILE: evals/elsuite/schelling_point/utils.py function replace_last (line 10) | def replace_last(s: str, old: str, new: str) -> str: function get_response (line 15) | def get_response( FILE: evals/elsuite/self_prompting/eval.py class SelfPrompting (line 22) | class SelfPrompting(SolverEval): method __init__ (line 23) | def __init__( method eval_sample (line 59) | def eval_sample(self, solver: Solver, sample: Any, rng: random.Random): method _run_prompting (line 67) | def _run_prompting(self, solver: Solver, sample: Any, *_): method _run_tasking (line 97) | def _run_tasking(self, sample: Any, *_): method _calculate_improvement_wrt_baseline (line 121) | def _calculate_improvement_wrt_baseline( method run (line 180) | def run(self, recorder: evals.record.Recorder) -> dict[str, Union[floa... FILE: evals/elsuite/self_prompting/scripts/make_plots.py function extract_metrics (line 13) | def extract_metrics(datadir: Path) -> pd.DataFrame: function make_plot (line 44) | def make_plot(df: pd.DataFrame, outpath: Path, metric="exact"): function main (line 79) | def main(): FILE: evals/elsuite/self_prompting/solvers/baselines.py class BaselineNoPromptSolver (line 5) | class BaselineNoPromptSolver(Solver): method __init__ (line 6) | def __init__( method _solve (line 14) | def _solve( method name (line 22) | def name(self) -> str: class BaselineOriginalPromptSolver (line 26) | class BaselineOriginalPromptSolver(Solver): method __init__ (line 27) | def __init__( method _solve (line 35) | def _solve( method name (line 44) | def name(self) -> str: class BaselineFewShotSolver (line 48) | class BaselineFewShotSolver(Solver): method __init__ (line 49) | def __init__( method _solve (line 57) | def _solve( method name (line 69) | def name(self) -> str: FILE: evals/elsuite/self_prompting/solvers/custom_cot_solver.py class CustomCoTSolver (line 14) | class CustomCoTSolver(OpenAISolver): method __init__ (line 15) | def __init__( method _solve (line 39) | def _solve( method name (line 69) | def name(self) -> str: FILE: evals/elsuite/skill_acquisition/eval.py class SkillAcquisition (line 39) | class SkillAcquisition(SolverEval): method __init__ (line 40) | def __init__( method eval_sample (line 69) | def eval_sample(self, solver: Solver, sample: Dict, rng: random.Random... method _eval_non_retrieval_sample (line 93) | def _eval_non_retrieval_sample(self, solver: Solver, sample: Dict, *_)... method _eval_retrieval_sample (line 133) | def _eval_retrieval_sample(self, solver: Solver, sample: Dict, *_) -> ... method run (line 189) | def run(self, recorder: evals.record.Recorder) -> dict[str, Union[floa... method _view_content (line 248) | def _view_content( method _conversation_loop (line 302) | def _conversation_loop( FILE: evals/elsuite/skill_acquisition/scraping/scrape_distractor_articles.py function clean_soup (line 20) | def clean_soup(content): function clean_heading_text (line 35) | def clean_heading_text( FILE: evals/elsuite/skill_acquisition/scraping/scrape_miskito.py function process_practice_section_div (line 15) | def process_practice_section_div(practice_div: bs4.element.Tag): function extract_toc_sections (line 34) | def extract_toc_sections(content: bs4.element.Tag): function process_miskito_page (line 43) | def process_miskito_page(): FILE: evals/elsuite/skill_acquisition/scripts/make_plots.py function extract_metrics (line 32) | def extract_metrics(datadir: Path) -> pd.DataFrame: function make_plot (line 47) | def make_plot( function make_side_bar_plot (line 82) | def make_side_bar_plot( FILE: evals/elsuite/skill_acquisition/solvers.py class SkillAcquisitionAssistantsSolver (line 6) | class SkillAcquisitionAssistantsSolver(OpenAIAssistantsSolver): method _solve (line 7) | def _solve( FILE: evals/elsuite/skill_acquisition/test_skill_acquisition.py function test_answer_detected (line 27) | def test_answer_detected(): function test_view_instruction_detected (line 36) | def test_view_instruction_detected(): function test_process_answer (line 49) | def test_process_answer(): function test_process_view_instruction (line 59) | def test_process_view_instruction(): function test_process_view_instruction_spaces_and_quotes (line 85) | def test_process_view_instruction_spaces_and_quotes(): function test_view_content (line 96) | def test_view_content(): FILE: evals/elsuite/skill_acquisition/utils.py function answer_detected (line 39) | def answer_detected(output: str) -> bool: function view_instruction_detected (line 43) | def view_instruction_detected(output: str) -> bool: function process_answer (line 47) | def process_answer(output: str) -> str: function process_view_instruction (line 73) | def process_view_instruction(output: str) -> Union[tuple[str, str], tupl... function _get_average_metric (line 104) | def _get_average_metric( function get_bootstrap_accuracy_std (line 115) | def get_bootstrap_accuracy_std(results: List[Dict[str, str]], num_sample... function render_intermediate_prompt (line 121) | def render_intermediate_prompt(sections_viewed: Dict[str, Set]) -> str: function get_question_type (line 129) | def get_question_type(question: str) -> str: function get_average_bleu_score (line 133) | def get_average_bleu_score(results: List[Dict[str, str]]) -> float: function get_bleu_score (line 143) | def get_bleu_score(expected: str, sampled: str) -> float: function get_accuracy (line 153) | def get_accuracy(results: List[Dict[str, str]]) -> float: function get_average_retrieval_calls (line 160) | def get_average_retrieval_calls(results: List[Dict[str, str]]) -> float: function get_average_invalid_retrieval_calls (line 166) | def get_average_invalid_retrieval_calls(results: List[Dict[str, str]]) -... function get_average_retrieval_precision (line 172) | def get_average_retrieval_precision(results: List[Dict[str, str]]) -> fl... function get_std_of_difference (line 178) | def get_std_of_difference(baseline_std: float, retrieval_std: float) -> ... FILE: evals/elsuite/solver_tools_convo.py class ToolCall (line 15) | class ToolCall: class ParsedSolverResult (line 22) | class ParsedSolverResult: class RunnerResult (line 28) | class RunnerResult: class Runner (line 34) | class Runner: method __init__ (line 35) | def __init__( method run (line 51) | def run(self) -> RunnerResult: method _get_tool_names_and_descriptions (line 102) | def _get_tool_names_and_descriptions(self, tools: list[Tool]): method _parse_solver_result (line 112) | def _parse_solver_result(self, solver_result: SolverResult) -> ParsedS... method _parse_tool_calls (line 118) | def _parse_tool_calls(self, output: str) -> Optional[list[ToolCall]]: method _find_tool_messages (line 136) | def _find_tool_messages(self, text: str) -> list[tuple[str, str]]: method _parse_final_answer (line 145) | def _parse_final_answer(self, output: str) -> Optional[str]: method _run_tool_call (line 153) | def _run_tool_call(self, tool_call: ToolCall) -> ToolCall: method _add_eval_message (line 182) | def _add_eval_message( method _add_tool_outputs (line 200) | def _add_tool_outputs( method _finish_run (line 215) | def _finish_run( FILE: evals/elsuite/steganography/eval.py class Steganography (line 12) | class Steganography(Eval): method __init__ (line 13) | def __init__( method eval_sample (line 36) | def eval_sample(self, sample: Any, *_): method run (line 68) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/steganography/monitor.py function detect_suspicious_output (line 52) | def detect_suspicious_output( FILE: evals/elsuite/steganography/prompts.py class PromptTemplate (line 4) | class PromptTemplate: method __init__ (line 5) | def __init__(self, encode_sys_prompt: str, decode_sys_prompt: str) -> ... FILE: evals/elsuite/steganography/reconstruction_metrics.py function reconstruction_metrics (line 9) | def reconstruction_metrics(original: str, decompressed: str) -> Mapping: function semantic_distance (line 28) | def semantic_distance(original: str, decompressed: str) -> float: FILE: evals/elsuite/steganography/scripts/dataset/complexity_metrics.py function calculate_entropy (line 8) | def calculate_entropy(text): function calculate_compression_ratio (line 14) | def calculate_compression_ratio(text): function calculate_brevity_score (line 20) | def calculate_brevity_score(text): FILE: evals/elsuite/steganography/scripts/dataset/csv2jsonl.py function csv_to_jsonl (line 5) | def csv_to_jsonl(csv_path, jsonl_path): FILE: evals/elsuite/steganography/scripts/dataset/custom_datasets.py class BaseTextDataset (line 16) | class BaseTextDataset: method __init__ (line 17) | def __init__(self, seed=0): method __len__ (line 23) | def __len__(self): method __getitem__ (line 26) | def __getitem__(self, idx): class HFTextDataset (line 57) | class HFTextDataset(BaseTextDataset): method __init__ (line 58) | def __init__( class RandomCharDataset (line 104) | class RandomCharDataset(BaseTextDataset): method __init__ (line 105) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomNumberDataset (line 118) | class RandomNumberDataset(BaseTextDataset): method __init__ (line 119) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomCharAndNumberDataset (line 131) | class RandomCharAndNumberDataset(BaseTextDataset): method __init__ (line 132) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomWordsDataset (line 141) | class RandomWordsDataset(BaseTextDataset): method __init__ (line 142) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class BaseTaskDataset (line 160) | class BaseTaskDataset: method __init__ (line 161) | def __init__(self, seed=0): method __len__ (line 167) | def __len__(self): method __getitem__ (line 170) | def __getitem__(self, idx): class HFTaskDataset (line 195) | class HFTaskDataset(BaseTaskDataset): method __init__ (line 196) | def __init__( FILE: evals/elsuite/steganography/scripts/dataset/dataset.py function make_task_data (line 13) | def make_task_data(): function make_payload_data (line 61) | def make_payload_data(): FILE: evals/elsuite/steganography/scripts/dataset/utils.py function num_tokens_from_messages (line 4) | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): FILE: evals/elsuite/steganography/scripts/make_plots.py function extract_results (line 11) | def extract_results(datadir: Path) -> pd.DataFrame: function make_undetected_acc_plot (line 28) | def make_undetected_acc_plot(df: pd.DataFrame, outpath: Path): function make_reconstruction_vs_detectability_plot (line 42) | def make_reconstruction_vs_detectability_plot(df: pd.DataFrame, outpath:... function main (line 77) | def main(): FILE: evals/elsuite/steganography/steganography.py function run_completion_fn (line 10) | def run_completion_fn(sys_prompt: str, completion_fn: CompletionFn) -> T... function run_eval (line 24) | def run_eval( FILE: evals/elsuite/test/match.py class TestMatch (line 4) | class TestMatch(Match): method __init__ (line 5) | def __init__(self, *args, **kwargs): method get_samples (line 8) | def get_samples(self): FILE: evals/elsuite/text_compression/compression.py function run_completion (line 10) | def run_completion(sample: str, instruction: str, completion_fn: Complet... function run_eval (line 27) | def run_eval( FILE: evals/elsuite/text_compression/eval.py class TextCompression (line 12) | class TextCompression(Eval): method __init__ (line 13) | def __init__( method eval_sample (line 28) | def eval_sample(self, sample: Any, *_): method run (line 49) | def run(self, recorder: RecorderBase): FILE: evals/elsuite/text_compression/prompts.py class PromptPair (line 1) | class PromptPair: method __init__ (line 2) | def __init__(self, encode_prompt: str, decode_prompt: str) -> None: FILE: evals/elsuite/text_compression/reconstruction_metrics.py function reconstruction_metrics (line 9) | def reconstruction_metrics(original: str, decompressed: str) -> Mapping: function semantic_distance (line 28) | def semantic_distance(original: str, decompressed: str) -> float: FILE: evals/elsuite/text_compression/scripts/dataset/complexity_metrics.py function calculate_entropy (line 8) | def calculate_entropy(text): function calculate_compression_ratio (line 14) | def calculate_compression_ratio(text): function calculate_brevity_score (line 20) | def calculate_brevity_score(text): FILE: evals/elsuite/text_compression/scripts/dataset/csv2jsonl.py function csv_to_jsonl (line 5) | def csv_to_jsonl(csv_path, jsonl_path): FILE: evals/elsuite/text_compression/scripts/dataset/custom_datasets.py class BaseTextDataset (line 16) | class BaseTextDataset: method __init__ (line 17) | def __init__(self, seed=0): method __len__ (line 23) | def __len__(self): method __getitem__ (line 26) | def __getitem__(self, idx): class HFTextDataset (line 57) | class HFTextDataset(BaseTextDataset): method __init__ (line 58) | def __init__( class RandomCharDataset (line 104) | class RandomCharDataset(BaseTextDataset): method __init__ (line 105) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomNumberDataset (line 118) | class RandomNumberDataset(BaseTextDataset): method __init__ (line 119) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomCharAndNumberDataset (line 131) | class RandomCharAndNumberDataset(BaseTextDataset): method __init__ (line 132) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): class RandomWordsDataset (line 141) | class RandomWordsDataset(BaseTextDataset): method __init__ (line 142) | def __init__(self, n_samples, seed=0, lengths=[5, 10, 20, 50, 100]): FILE: evals/elsuite/text_compression/scripts/dataset/utils.py function num_tokens_from_messages (line 4) | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"): FILE: evals/elsuite/text_compression/scripts/make_plots.py function extract_results (line 10) | def extract_results(datadir: Path) -> pd.DataFrame: function make_reconstruction_vs_compression_plot (line 27) | def make_reconstruction_vs_compression_plot(df: pd.DataFrame, outpath: P... function main (line 64) | def main(): FILE: evals/elsuite/theory_of_mind/scripts/data_generation.py function convert_datapoints_to_eval_dataset (line 23) | def convert_datapoints_to_eval_dataset(datapoints: list) -> list: FILE: evals/elsuite/theory_of_mind/scripts/make_plots.py function main (line 14) | def main(): function load_tom_results_from_dir (line 26) | def load_tom_results_from_dir(log_dir: Union[str, Path]) -> pd.DataFrame: function parse_spec (line 45) | def parse_spec(spec: dict) -> tuple[str, bool, int]: function make_plot (line 54) | def make_plot(df, out_dir): FILE: evals/elsuite/track_the_stat/eval.py class TrackTheStat (line 17) | class TrackTheStat(SolverEval): method __init__ (line 18) | def __init__(self, task: str, n_samples: Optional[int] = 250, *args, *... method eval_sample (line 43) | def eval_sample(self, solver: Solver, sample: Any, rng: random.Random)... method _eval_sample (line 49) | def _eval_sample(self, solver: Solver, capped_inf_list: list[int]) -> ... method run (line 70) | def run(self, recorder: RecorderBase): method _compute_agg_metrics (line 78) | def _compute_agg_metrics(self, logged_metrics: list[dict]) -> dict: method _get_samples (line 93) | def _get_samples(self) -> list[dict]: FILE: evals/elsuite/track_the_stat/scripts/make_plots.py function zero_if_none (line 13) | def zero_if_none(input_num): function make_results_dict (line 43) | def make_results_dict(log_dir: Path) -> dict: function get_model (line 49) | def get_model(spec): function get_state_tracking (line 69) | def get_state_tracking(spec): function fill_results_dict (line 76) | def fill_results_dict(results_dict, log_dir): function prepare_results_dict (line 109) | def prepare_results_dict(): function make_bar_plot (line 126) | def make_bar_plot(results_dict: dict, task: str, stat: str, save_path: P... function count_tokens (line 221) | def count_tokens(log_dir) -> dict[str, dict[str, dict[str, int]]]: function main (line 266) | def main(args: argparse.Namespace): FILE: evals/elsuite/track_the_stat/solvers.py class ExplicitStateSolver (line 9) | class ExplicitStateSolver(NestedSolver): method __init__ (line 10) | def __init__( method underlying_solver (line 21) | def underlying_solver(self) -> Solver: method _render_state (line 24) | def _render_state(self, current_state: dict) -> str: method _build_message (line 28) | def _build_message(self, task_state: TaskState) -> str: method _solve (line 34) | def _solve(self, task_state: TaskState) -> SolverResult: class RandomBaselineSolver (line 42) | class RandomBaselineSolver(Solver): method __init__ (line 43) | def __init__(self, registry: Any = None, *args, **kwargs): method _solve (line 46) | def _solve(self, task_state: TaskState) -> SolverResult: method _task_solve (line 52) | def _task_solve(self, task: str, task_state: TaskState) -> str: method _mode_solve (line 58) | def _mode_solve(self, task_state: TaskState) -> str: method _median_solve (line 66) | def _median_solve(self, task_state: TaskState) -> str: class TrackTheStatHuman (line 80) | class TrackTheStatHuman(NestedSolver): method __init__ (line 81) | def __init__(self, human_cli_solver: SolverSpec, *args, **kwargs): method human_cli_solver (line 85) | def human_cli_solver(self) -> Solver: method _solve (line 88) | def _solve(self, task_state: TaskState) -> SolverResult: FILE: evals/elsuite/track_the_stat/utils.py function yellow_string (line 8) | def yellow_string(str: str) -> str: function median (line 12) | def median(numbers: list[int]) -> int: function mode (line 21) | def mode(numbers: list[int]) -> int: function parse_solver_output (line 39) | def parse_solver_output(solver_output: str, task: str) -> Union[int, None]: function compute_mode_state (line 56) | def compute_mode_state(curr_list: list[int]) -> dict: function compute_median_state (line 61) | def compute_median_state(curr_list: list[int]) -> dict: function compute_state (line 66) | def compute_state(curr_list: list[int], task) -> dict: FILE: evals/elsuite/translate.py class Translate (line 11) | class Translate(evals.Eval): method __init__ (line 12) | def __init__( method eval_sample (line 35) | def eval_sample(self, sample: Any, *_): method run (line 69) | def run(self, recorder): FILE: evals/elsuite/twenty_questions/eval.py class TwentyQuestions (line 22) | class TwentyQuestions(SolverEval): method __init__ (line 23) | def __init__( method eval_sample (line 56) | def eval_sample(self, solver: Solver, sample: Dict, rng: random.Random... method run (line 78) | def run(self, recorder: Recorder) -> Dict[str, Union[float, int]]: method _conversation_loop (line 111) | def _conversation_loop( FILE: evals/elsuite/twenty_questions/scripts/make_plots.py function extract_metrics (line 48) | def extract_metrics(datadir: Path) -> pd.DataFrame: function make_plot (line 67) | def make_plot(df: pd.DataFrame, outpath: Path, metric="score", variant="... FILE: evals/elsuite/twenty_questions/test_utils.py function test_format_msg (line 4) | def test_format_msg(): function test_format_msgs (line 10) | def test_format_msgs(): FILE: evals/elsuite/twenty_questions/utils.py function generate_task_state_for (line 12) | def generate_task_state_for(role: Literal["guesser", "gamemaster"], conv... function format_msgs (line 29) | def format_msgs( function format_msg (line 42) | def format_msg(msg: Message, role: Literal["guesser", "gamemaster"]) -> ... function is_system_msg (line 64) | def is_system_msg(m: Message) -> bool: FILE: evals/elsuite/utils.py function get_answer (line 17) | def get_answer(text, answer_prompt, ignore_case=False): function get_consensus (line 28) | def get_consensus(answers): function normalize (line 36) | def normalize(s: str) -> str: function fuzzy_match (line 46) | def fuzzy_match(s1: str, s2: str) -> bool: function get_scores_from_text (line 56) | def get_scores_from_text(text: str) -> dict: function get_yesno_from_text (line 62) | def get_yesno_from_text(text: str) -> dict: function get_letter_from_data (line 68) | def get_letter_from_data(data: str) -> str: function f1_score (line 75) | def f1_score(prediction: str, answers: list[str]) -> float: function scrub_formatting_from_prompt (line 91) | def scrub_formatting_from_prompt(prompt): function format_necessary (line 103) | def format_necessary(template: str, allow_missing: bool = False, **kwarg... function format_prompt (line 119) | def format_prompt( class PromptFn (line 144) | class PromptFn: method __init__ (line 150) | def __init__( method __call__ (line 166) | def __call__(self, **kwargs): FILE: evals/elsuite/utils_test.py function test_normalize (line 14) | def test_normalize(s: str, expected: str): function test_fuzzy_match (line 32) | def test_fuzzy_match(s1: str, s2: str, expected: bool): FILE: evals/eval.py function _index_samples (line 30) | def _index_samples(samples: List[Any]) -> List[Tuple[Any, int]]: function set_max_samples (line 41) | def set_max_samples(max_samples: int): class Eval (line 46) | class Eval(abc.ABC): method __init__ (line 56) | def __init__( method eval_sample (line 77) | def eval_sample(self, sample: Any, rng: random.Random): method completion_fn (line 81) | def completion_fn(self) -> CompletionFn: method run (line 86) | def run(self, recorder: RecorderBase) -> Dict[str, float]: method async_eval_all_samples (line 90) | async def async_eval_all_samples( method eval_all_samples (line 112) | def eval_all_samples( method get_samples (line 149) | def get_samples(self): method _get_samples_path (line 158) | def _get_samples_path(self) -> Path: method _prefix_registry_path (line 161) | def _prefix_registry_path(self, data_path: str) -> Path: class SolverEval (line 168) | class SolverEval(Eval): method __init__ (line 186) | def __init__(self, *args, **kwargs): method eval_sample (line 197) | def eval_sample(self, solver: Solver, sample: Any, rng: random.Random)... method eval_all_samples (line 200) | def eval_all_samples( FILE: evals/formatting.py function make_abc (line 8) | def make_abc(answers, *, correct_idx=0, shuffle=True, rng: Optional[rand... FILE: evals/metrics.py function get_accuracy (line 12) | def get_accuracy(events: Sequence[Event]) -> float: function get_bootstrap_accuracy_std (line 21) | def get_bootstrap_accuracy_std(events: Sequence[Event], num_samples: int... function get_confusion_matrix (line 26) | def get_confusion_matrix( function compute_matthew_corr (line 43) | def compute_matthew_corr(confusion_matrix: np.ndarray) -> float: function compute_precision (line 52) | def compute_precision(confusion_matrix: np.ndarray, idx: int = 0) -> float: function compute_recall (line 56) | def compute_recall(confusion_matrix: np.ndarray, idx: int = 0) -> float: function compute_f_score (line 60) | def compute_f_score(confusion_matrix: np.ndarray, idx: int = 0, beta: fl... function compute_averaged_f_score (line 66) | def compute_averaged_f_score( FILE: evals/prompt/base.py function chat_prompt_to_text_prompt (line 22) | def chat_prompt_to_text_prompt( function text_prompt_to_chat_prompt (line 59) | def text_prompt_to_chat_prompt(prompt: str, role: str = "system") -> Ope... class Prompt (line 67) | class Prompt(ABC): method to_formatted_prompt (line 74) | def to_formatted_prompt(self): function is_chat_prompt (line 81) | def is_chat_prompt(prompt: Prompt) -> bool: class CompletionPrompt (line 86) | class CompletionPrompt(Prompt): method _render_chat_prompt_as_text (line 93) | def _render_chat_prompt_as_text(self, prompt: OpenAICreateChatPrompt) ... method to_formatted_prompt (line 96) | def to_formatted_prompt(self) -> str: class ChatCompletionPrompt (line 103) | class ChatCompletionPrompt(Prompt): method _render_text_as_chat_prompt (line 112) | def _render_text_as_chat_prompt(self, prompt: str) -> OpenAICreateChat... method to_formatted_prompt (line 119) | def to_formatted_prompt(self) -> OpenAICreateChatPrompt: FILE: evals/record.py function default_recorder (line 39) | def default_recorder() -> Optional["RecorderBase"]: class Event (line 44) | class Event: class RecorderBase (line 54) | class RecorderBase: method __init__ (line 75) | def __init__( method as_default_recorder (line 91) | def as_default_recorder(self, sample_id: str): method current_sample_id (line 98) | def current_sample_id(self) -> Optional[str]: method pause (line 101) | def pause(self): method unpause (line 107) | def unpause(self): method is_paused (line 113) | def is_paused(self, sample_id: str = None): method get_events (line 119) | def get_events(self, type: str) -> Sequence[Event]: method get_metrics (line 123) | def get_metrics(self): method get_scores (line 126) | def get_scores(self, key: str): method _create_event (line 129) | def _create_event(self, type, data=None, sample_id=None): method _flush_events_internal (line 145) | def _flush_events_internal(self, events_to_write: Sequence[Event]): method flush_events (line 148) | def flush_events(self): method record_event (line 157) | def record_event(self, type, data=None, sample_id=None): method record_match (line 187) | def record_match(self, correct: bool, *, expected=None, picked=None, s... method record_embedding (line 202) | def record_embedding(self, prompt, embedding_type, sample_id=None, **e... method record_sampling (line 210) | def record_sampling(self, prompt, sampled, sample_id=None, **extra): method record_function_call (line 218) | def record_function_call(self, name, arguments, return_value, sample_i... method record_cond_logp (line 227) | def record_cond_logp(self, prompt, completion, logp, sample_id=None, *... method record_pick_option (line 236) | def record_pick_option(self, prompt, options, picked, sample_id=None, ... method record_raw (line 245) | def record_raw(self, data): method record_metrics (line 248) | def record_metrics(self, **kwargs): method record_error (line 251) | def record_error(self, msg: str, error: Exception, **kwargs): method record_extra (line 259) | def record_extra(self, data, sample_id=None): method record_final_report (line 262) | def record_final_report(self, final_report: Any): function _green (line 266) | def _green(str): function _red (line 270) | def _red(str): class DummyRecorder (line 274) | class DummyRecorder(RecorderBase): method __init__ (line 280) | def __init__(self, run_spec: RunSpec, log: bool = True): method record_event (line 284) | def record_event(self, type, data, sample_id=None): class LocalRecorder (line 316) | class LocalRecorder(RecorderBase): method __init__ (line 322) | def __init__( method _flush_events_internal (line 346) | def _flush_events_internal(self, events_to_write: Sequence[Event]): method record_final_report (line 367) | def record_final_report(self, final_report: Any): class HttpRecorder (line 374) | class HttpRecorder(RecorderBase): method __init__ (line 375) | def __init__( method _flush_events_internal (line 392) | def _flush_events_internal(self, events_to_write: Sequence[Event]): method _send_event (line 405) | def _send_event(self, events: List[Event]): method record_final_report (line 446) | def record_final_report(self, final_report: Any): class Recorder (line 468) | class Recorder(RecorderBase): method __init__ (line 474) | def __init__( method _flush_events_internal (line 514) | def _flush_events_internal(self, events_to_write: Sequence[Event]): method record_final_report (line 561) | def record_final_report(self, final_report: Any): method record_event (line 578) | def record_event(self, type, data=None, sample_id=None): function current_sample_id (line 589) | def current_sample_id() -> str: function record_match (line 593) | def record_match(correct: bool, *, expected=None, picked=None, **extra): function record_embedding (line 597) | def record_embedding(prompt, embedding_type, **extra): function record_sampling (line 601) | def record_sampling(prompt, sampled, **extra): function record_function_call (line 605) | def record_function_call(name, arguments, return_value, **extra): function record_cond_logp (line 609) | def record_cond_logp(prompt, completion, logp, **extra): function record_pick_option (line 613) | def record_pick_option(prompt, options, picked, **extra): function record_raw (line 617) | def record_raw(data): function record_metrics (line 621) | def record_metrics(**extra): function record_error (line 625) | def record_error(msg: str, error: Exception = None, **extra): function record_extra (line 629) | def record_extra(data): function record_event (line 633) | def record_event(type, data=None, sample_id=None): function pause (line 637) | def pause(): function unpause (line 641) | def unpause(): FILE: evals/record_test.py function test_passes_hidden_data_field_to_jsondumps (line 8) | def test_passes_hidden_data_field_to_jsondumps() -> None: FILE: evals/registry.py function n_ctx_from_model_name (line 37) | def n_ctx_from_model_name(model_name: str) -> Optional[int]: function is_chat_model (line 83) | def is_chat_model(model_name: str) -> bool: class Registry (line 103) | class Registry: method __init__ (line 104) | def __init__(self, registry_paths: Sequence[Union[str, Path]] = DEFAUL... method add_registry_paths (line 107) | def add_registry_paths(self, paths: Sequence[Union[str, Path]]) -> None: method api_model_ids (line 111) | def api_model_ids(self) -> list[str]: method make_completion_fn (line 120) | def make_completion_fn( method get_class (line 153) | def get_class(self, spec: EvalSpec) -> Any: method _dereference (line 156) | def _dereference( method get_modelgraded_spec (line 193) | def get_modelgraded_spec(self, name: str, **kwargs: dict) -> Optional[... method get_completion_fn (line 202) | def get_completion_fn(self, name: str) -> Optional[CompletionFnSpec]: method get_solver (line 207) | def get_solver(self, name: str) -> Optional[CompletionFnSpec]: method get_eval (line 210) | def get_eval(self, name: str) -> Optional[EvalSpec]: method get_eval_set (line 213) | def get_eval_set(self, name: str) -> Optional[EvalSetSpec]: method get_evals (line 216) | def get_evals(self, patterns: Sequence[str]) -> Iterator[Optional[Eval... method get_base_evals (line 229) | def get_base_evals(self) -> list[Optional[BaseEvalSpec]]: method get_base_eval (line 236) | def get_base_eval(self, name: str) -> Optional[BaseEvalSpec]: method _load_file (line 251) | def _load_file(self, path: Path) -> Generator[Tuple[str, Path, dict], ... method _load_directory (line 262) | def _load_directory(self, path: Path) -> Generator[Tuple[str, Path, di... method _load_resources (line 267) | def _load_resources( method _validate_reserved_keywords (line 280) | def _validate_reserved_keywords(spec: dict, name: str, path: Path) -> ... method _load_registry (line 287) | def _load_registry(self, registry_paths: Sequence[Path], resource_type... method _completion_fns (line 313) | def _completion_fns(self) -> RawRegistry: method _solvers (line 317) | def _solvers(self) -> RawRegistry: method _eval_sets (line 321) | def _eval_sets(self) -> RawRegistry: method _evals (line 325) | def _evals(self) -> RawRegistry: method _modelgraded_specs (line 329) | def _modelgraded_specs(self) -> RawRegistry: FILE: evals/registry/data/german-part-of-speech/buildDataDe.py function generate_combinations (line 59) | def generate_combinations(words): FILE: evals/registry/data/hr_ml_agent_bench/parkinsons_disease/dataset/public_timeseries_testing_util.py class MockApi (line 13) | class MockApi: method __init__ (line 14) | def __init__(self): method iter_test (line 36) | def iter_test(self) -> Tuple[pd.DataFrame]: method predict (line 77) | def predict(self, user_predictions: pd.DataFrame): function make_env (line 92) | def make_env(): FILE: evals/registry/data/mapping_to_matricies/data_generator.py function generate_binary_array_and_factors (line 4) | def generate_binary_array_and_factors(N): function generate_one_sample_json_string (line 13) | def generate_one_sample_json_string(binary_array_str, dimensions_str, an... function write_lines_to_file (line 26) | def write_lines_to_file(min_array_len, max_array_len, filename, max_line... FILE: evals/registry/data/mazes/nxn_maze_eval_generator.py function recursive_backtracker (line 66) | def recursive_backtracker(maze: np.ndarray, pos: Tuple[int, int]) -> None: function generate_maze (line 85) | def generate_maze(width: int, height: int) -> np.ndarray: function random_outer_pos (line 103) | def random_outer_pos(maze: np.ndarray) -> Tuple[int, int]: function generate_start_end (line 123) | def generate_start_end(maze: np.ndarray) -> Tuple[Tuple[int, int], Tuple... function build_graph (line 140) | def build_graph(maze: np.ndarray) -> nx.Graph: function generate_example_files (line 162) | def generate_example_files( function create_move_line (line 251) | def create_move_line( function plot_maze (line 327) | def plot_maze(maze: np.ndarray, show=False, save_img=False) -> None: FILE: evals/registry/data/medmcqa/convert.js method transform (line 22) | transform(line, _, done) { FILE: evals/registry/data/nfl-point-combinations/combinations_generator.py function ways_to_score (line 11) | def ways_to_score(n): FILE: evals/registry/data/points_on_line/eval_generator.py function tuple_to_string (line 12) | def tuple_to_string(float_tuple: tuple, n_decimals: int) -> str: function comp_float_mul (line 18) | def comp_float_mul(vector: tuple, n_decimals: int, factor: float = 0.1) ... function random_divisible_line (line 43) | def random_divisible_line(comp_min: int, comp_max: int, n_decimals: int)... function construct_messages (line 92) | def construct_messages(start: tuple, end: tuple) -> list[dict]: function assemble_test_format (line 102) | def assemble_test_format(n_samples: int) -> list[dict]: FILE: evals/registry/data/poker_analysis/poker_analysis_sample_generator.py function randomize_num_players_and_community_cards (line 8) | def randomize_num_players_and_community_cards() -> tuple: function generate_hands (line 19) | def generate_hands(num_players: int, num_community_cards: int) -> tuple: function calculate_probabilities (line 42) | def calculate_probabilities(hole_cards_list: list, community_cards: list... function generate_example (line 89) | def generate_example(num_players: int = None, num_community_cards: int =... function format_example (line 116) | def format_example(hole_cards: list, community_cards: list, winning_play... function save_examples_to_json_file (line 147) | def save_examples_to_json_file( FILE: evals/registry/data/simple_physics_engine/samples_generator.py function create_row (line 46) | def create_row(initial_state, ideal_state): function format_wave_as_string (line 60) | def format_wave_as_string(wave): function generate_samples (line 69) | def generate_samples(): FILE: evals/registry/data/simple_physics_engine/solver.py function solve_diagram (line 17) | def solve_diagram(diagram): FILE: evals/registry/data/simple_physics_engine/wave_function_collapse.py class ContradictionException (line 9) | class ContradictionException(Exception): function get_rules (line 31) | def get_rules(): function init_possibilities (line 86) | def init_possibilities(): function create_wave_array (line 95) | def create_wave_array(height, width): function get_final_state (line 126) | def get_final_state(tile) -> str: function calculate_entropy (line 136) | def calculate_entropy(tile): function collapse_tile (line 143) | def collapse_tile(tile): function find_lowest_entropy_tile (line 155) | def find_lowest_entropy_tile(wave): function get_above_tile (line 182) | def get_above_tile(wave, i, j): function get_below_tile (line 186) | def get_below_tile(wave, i, j): function get_left_tile (line 190) | def get_left_tile(wave, i, j): function get_right_tile (line 194) | def get_right_tile(wave, i, j): function place_ball (line 198) | def place_ball(wave): function generate_collapsed_wave (line 204) | def generate_collapsed_wave(height, width): function get_valid_directions (line 222) | def get_valid_directions(wave, coords): function get_possible_neighbors_in_direction (line 236) | def get_possible_neighbors_in_direction(tile, direction): function propagate (line 246) | def propagate(wave, coords): function print_wave (line 272) | def print_wave(wave): FILE: evals/registry/data/solve-for-variable/tools/main.py class Template (line 9) | class Template: method load (line 10) | def load(self, fname): method write (line 14) | def write(self, fname): class Writer (line 17) | class Writer: method __init__ (line 18) | def __init__(self, fname, template): method __enter__ (line 24) | def __enter__(self): method __exit__ (line 28) | def __exit__(self, *args): method add_instance (line 31) | def add_instance(self, question, answers, correct): function main (line 64) | def main(): FILE: evals/registry/data/solve-for-variable/tools/problem.py class MistakesGenerator (line 10) | class MistakesGenerator: method generate (line 21) | def generate(self, location): class ProblemGenerator (line 61) | class ProblemGenerator: method __init__ (line 66) | def __init__(self): method _generate (line 70) | def _generate(self): method generate (line 116) | def generate(self): function main (line 176) | def main(): FILE: evals/registry/data/solve-for-variable/tools/solve.py class Num (line 6) | class Num: method __init__ (line 11) | def __init__(self, value): method clone (line 14) | def clone(self): method __str__ (line 17) | def __str__(self): method tree (line 20) | def tree(self): class Var (line 24) | class Var: method __init__ (line 29) | def __init__(self, name): method clone (line 32) | def clone(self): method var_location (line 35) | def var_location(self, name): method __str__ (line 42) | def __str__(self): method tree (line 45) | def tree(self): class Prio (line 49) | class Prio: class Equation (line 59) | class Equation: method __init__ (line 65) | def __init__(self, left, right): method clone (line 72) | def clone(self): method solve (line 81) | def solve(self, location, mistake=None): method __str__ (line 168) | def __str__(self): method tree (line 171) | def tree(self): class Expression (line 178) | class Expression: method __init__ (line 183) | def __init__(self, prio, op, *args): method clone (line 197) | def clone(self): method var_location (line 203) | def var_location(self, name, past=[]): method _simplify_minus (line 220) | def _simplify_minus(self): method _simplify_in_situ (line 247) | def _simplify_in_situ(self): method __str__ (line 276) | def __str__(self): method tree (line 282) | def tree(self): method _to_string (line 291) | def _to_string(self, pos, parent_prio, parent_assoc): method _arg_string (line 317) | def _arg_string(self, pos): class EquationGenerator (line 326) | class EquationGenerator: method generate (line 329) | def generate(self): method _unary (line 382) | def _unary(self, expr): method _binary (line 392) | def _binary(self, expr, prio, op): method _replace (line 406) | def _replace(self, e, placeholder, klass, choices, i=None): method _term (line 427) | def _term(self): method _make_var (line 437) | def _make_var(self): method _make_num (line 444) | def _make_num(self): FILE: evals/registry/data/solve-for-variable/tools/tester.py class Vars (line 9) | class Vars: method __init__ (line 14) | def __init__(self, module): method _get_vars (line 27) | def _get_vars(self, tree): class ValueGenerator (line 31) | class ValueGenerator: method generate (line 41) | def generate(self, rhs_variables): method _gen (line 44) | def _gen(self, variables, values): class Code (line 61) | class Code: method __init__ (line 66) | def __init__(self, expr): method _compile (line 70) | def _compile(self, expr): class Evaluator (line 74) | class Evaluator: method __init__ (line 84) | def __init__(self, eq, answers): method _variables (line 97) | def _variables(self): method test (line 120) | def test(self): FILE: evals/registry/data/unsolvable_questions/convert.js method transform (line 19) | transform(line, _, done) { FILE: evals/registry/data/unsolvable_questions/findFailures.js method transform (line 19) | transform(line, _, done) { FILE: evals/registry/data/word_association/corpus_tools/corpus.py class Corpus (line 12) | class Corpus(ABC): method __init__ (line 18) | def __init__(self, name: str) -> None: method _get_corpus (line 23) | def _get_corpus(self) -> List[str]: method get_frequency_distribution (line 27) | def get_frequency_distribution(self) -> Dict[str, int]: method get_pos_tagged_words (line 31) | def get_pos_tagged_words(self) -> List[Tuple[str, str]]: method __len__ (line 35) | def __len__(self) -> int: method __getitem__ (line 39) | def __getitem__(self, index: int) -> str: method __setitem__ (line 43) | def __setitem__(self, index: int, value: str) -> None: method __delitem__ (line 47) | def __delitem__(self, index: int) -> None: method __iter__ (line 51) | def __iter__(self) -> Iterator[str]: method __contains__ (line 55) | def __contains__(self, word: str) -> bool: method __repr__ (line 59) | def __repr__(self) -> str: class NltkCorpus (line 64) | class NltkCorpus(Corpus): method __init__ (line 73) | def __init__(self, nltk_corpus: str) -> None: method _get_corpus (line 81) | def _get_corpus(self) -> List[str]: method get_frequency_distribution (line 86) | def get_frequency_distribution(self) -> nltk.FreqDist: method get_pos_tagged_words (line 92) | def get_pos_tagged_words(self) -> List[Tuple[str, str]]: FILE: evals/registry/data/word_association/corpus_tools/pipelines.py class CorpusPipeline (line 6) | class CorpusPipeline: method __init__ (line 7) | def __init__(self, corpus: Corpus) -> None: method add_operation (line 11) | def add_operation(self, operation: Callable[Corpus, ...]) -> "CorpusPi... method run (line 16) | def run(self) -> Corpus: FILE: evals/registry/data/word_association/corpus_tools/processor.py class WordCollectionProcessor (line 20) | class WordCollectionProcessor: method __init__ (line 29) | def __init__(self, words: Union[Corpus, RelatedWords]) -> None: method parts_of_speech_filter (line 32) | def parts_of_speech_filter(self, parts_of_speech: List[str]) -> None: method frequency_filter (line 44) | def frequency_filter( method char_length_filter (line 63) | def char_length_filter(self, length_bounds: LengthBounds) -> None: method sub_word_filter (line 73) | def sub_word_filter(self, subword: str) -> None: method str_max_word_count_filter (line 82) | def str_max_word_count_filter(self, max_num_words: int = 1) -> None: method __iter__ (line 92) | def __iter__(self) -> Iterator[str]: method __len__ (line 96) | def __len__(self) -> int: method __getitem__ (line 100) | def __getitem__(self, index: int) -> str: FILE: evals/registry/data/word_association/corpus_tools/related_words.py class RelatedWords (line 17) | class RelatedWords(ABC): method __init__ (line 23) | def __init__(self, word: str, **kwargs: Optional[Union[str, int]]) -> ... method _get_related_words (line 30) | def _get_related_words(self) -> List[Dict[str, Any]]: method get_pos_tagged_words (line 39) | def get_pos_tagged_words(self) -> List[Tuple[str, str]]: method __repr__ (line 47) | def __repr__(self) -> str: method __len__ (line 54) | def __len__(self) -> int: method __getitem__ (line 63) | def __getitem__(self, index: int) -> str: method __contains__ (line 69) | def __contains__(self, item: str) -> bool: method __iter__ (line 75) | def __iter__(self) -> Generator[str, None, None]: class DataMuseRelatedWords (line 83) | class DataMuseRelatedWords(RelatedWords): method __init__ (line 125) | def __init__( method get_pos_tagged_words (line 134) | def get_pos_tagged_words(self) -> List[Tuple[str, str]]: method get_metadata (line 148) | def get_metadata(self, word: str) -> Dict[str, Union[str, int, List[st... method _get_related_words (line 166) | def _get_related_words(self) -> List[Dict[str, str]]: class GPTGeneratedRelatedWords (line 182) | class GPTGeneratedRelatedWords(RelatedWords): method _get_related_words (line 185) | def _get_related_words(self) -> List[Dict[str, Any]]: FILE: evals/registry/data/word_association/corpus_tools/sample_generators.py class IncludesEvalTemplate (line 12) | class IncludesEvalTemplate: method create_sample (line 15) | def create_sample( method export_to_jsonl (line 33) | def export_to_jsonl(self, filename: str = "samples.jsonl") -> None: function generate_additional_choices (line 39) | def generate_additional_choices( function generate_word_association_system_message (line 76) | def generate_word_association_system_message( function generate_word_association_user_message (line 106) | def generate_word_association_user_message( function taboo_clue_guesser_system_message (line 119) | def taboo_clue_guesser_system_message() -> None: function taboo_clue_giver_system_message (line 126) | def taboo_clue_giver_system_message() -> None: function main (line 134) | def main( FILE: evals/registry/data/word_association/corpus_tools/validators.py class Embedding (line 32) | class Embedding(NamedTuple): class RelatedWordsPair (line 39) | class RelatedWordsPair(NamedTuple): class EmbeddingPair (line 46) | class EmbeddingPair(NamedTuple): class SimilarityTuple (line 53) | class SimilarityTuple(NamedTuple): class QualityValidator (line 61) | class QualityValidator(ABC): method __init__ (line 64) | def __init__(self, target_score: int) -> None: method validate (line 68) | def validate(self, related_words_pair: List[RelatedWordsPair]) -> List... class EmbeddingsValidator (line 72) | class EmbeddingsValidator(QualityValidator): method validate (line 77) | def validate( method calculate_cosine_similarity (line 125) | def calculate_cosine_similarity(vec1: List[float], vec2: List[float]) ... method calculate_euclidean_distance (line 143) | def calculate_euclidean_distance(vec1: List[float], vec2: List[float])... method get_embeddings (line 162) | def get_embeddings( class GPTValidator (line 185) | class GPTValidator(QualityValidator): method __init__ (line 188) | def __init__( method validate (line 203) | def validate(self, related_words_pairs: List[RelatedWordsPair]) -> Lis... method get_chat_completion (line 223) | def get_chat_completion( method extract_score (line 261) | def extract_score(response_content: str) -> float: method set_model (line 282) | def set_model(self, model: str) -> None: FILE: evals/registry_test.py function test_n_ctx_from_model_name (line 4) | def test_n_ctx_from_model_name(): function test_is_chat_model (line 21) | def test_is_chat_model(): FILE: evals/solvers/human_cli_solver.py class HumanCliSolver (line 8) | class HumanCliSolver(Solver): method __init__ (line 15) | def __init__( method _solve (line 29) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: method name (line 47) | def name(self) -> str: FILE: evals/solvers/memory.py class Interaction (line 8) | class Interaction: class PersistentMemoryCache (line 16) | class PersistentMemoryCache: method __init__ (line 17) | def __init__( method save_private_interaction (line 24) | def save_private_interaction(self, task_state: TaskState): method load_private_interaction (line 39) | def load_private_interaction(self, task_state: TaskState) -> List[Mess... FILE: evals/solvers/nested/cot_solver.py class CoTSolver (line 9) | class CoTSolver(NestedSolver): method __init__ (line 10) | def __init__( method cot_solver (line 33) | def cot_solver(self) -> Solver: method extract_solver (line 37) | def extract_solver(self) -> Solver: method cot_template (line 40) | def cot_template(self, task_state: TaskState) -> str: method extract_template (line 45) | def extract_template(self, task_state: TaskState) -> str: method _solve (line 50) | def _solve( method name (line 84) | def name(self) -> str: FILE: evals/solvers/nested/fewshot_solver.py class FewShotSolver (line 9) | class FewShotSolver(NestedSolver): method __init__ (line 10) | def __init__( method base_solver (line 72) | def base_solver(self) -> Solver: method _solve (line 75) | def _solve( method _modify_task_state (line 83) | def _modify_task_state(self, task_state: TaskState) -> TaskState: method name (line 109) | def name(self) -> str: FILE: evals/solvers/nested/hhh_solver.py class HHHSolver (line 8) | class HHHSolver(NestedSolver): method __init__ (line 16) | def __init__( method solver (line 25) | def solver(self) -> Solver: method _solve (line 28) | def _solve( method name (line 46) | def name(self) -> str: FILE: evals/solvers/nested/self_consistency_solver.py class SelfConsistencySolver (line 16) | class SelfConsistencySolver(NestedSolver): method __init__ (line 24) | def __init__( method solver (line 55) | def solver(self) -> Solver: method judge_solver (line 59) | def judge_solver(self) -> Solver: method _solve (line 62) | def _solve( method _extract_answer (line 136) | def _extract_answer(self, raw_result: SolverResult) -> str: method name (line 149) | def name(self) -> str: FILE: evals/solvers/postprocessors/base.py class PostProcessor (line 6) | class PostProcessor(ABC): method __call__ (line 13) | def __call__(self, result: SolverResult, *args, **kwargs) -> SolverRes... FILE: evals/solvers/postprocessors/postprocessors.py class Strip (line 5) | class Strip(PostProcessor): method __call__ (line 10) | def __call__(self, result: SolverResult) -> SolverResult: class RemoveQuotes (line 17) | class RemoveQuotes(PostProcessor): method __call__ (line 26) | def __call__(self, result: SolverResult) -> SolverResult: class RemovePeriod (line 35) | class RemovePeriod(PostProcessor): method __call__ (line 41) | def __call__(self, result: SolverResult) -> SolverResult: FILE: evals/solvers/postprocessors/postprocessors_test.py function test_strip (line 5) | def test_strip(): function test_remove_quotes (line 16) | def test_remove_quotes(): function test_remove_period (line 35) | def test_remove_period(): function test_combination (line 50) | def test_combination(): FILE: evals/solvers/prompts/hhh_test.py function test_render_messages (line 94) | def test_render_messages() -> None: FILE: evals/solvers/providers/anthropic/anthropic_solver.py class AnthropicSolver (line 25) | class AnthropicSolver(Solver): method __init__ (line 30) | def __init__( method _solve (line 44) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: method name (line 77) | def name(self) -> str: method model_version (line 81) | def model_version(self) -> Union[str, dict]: method _convert_msgs_to_anthropic_format (line 89) | def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[Mes... function anthropic_create_retrying (line 119) | def anthropic_create_retrying(client: Anthropic, *args, **kwargs): function anth_to_openai_usage (line 132) | def anth_to_openai_usage(anth_usage: Usage) -> dict: FILE: evals/solvers/providers/anthropic/anthropic_solver_test.py function anthropic_solver (line 18) | def anthropic_solver(): function dummy_recorder (line 26) | def dummy_recorder(): function test_solver (line 38) | def test_solver(dummy_recorder, anthropic_solver): function test_message_format (line 63) | def test_message_format(): function test_anth_to_openai_usage_correctness (line 110) | def test_anth_to_openai_usage_correctness(): function test_anth_to_openai_usage_zero_tokens (line 122) | def test_anth_to_openai_usage_zero_tokens(): FILE: evals/solvers/providers/google/gemini_solver.py class GoogleMessage (line 47) | class GoogleMessage: method to_dict (line 51) | def to_dict(self): method from_evals_message (line 55) | def from_evals_message(msg: Message): class GeminiSolver (line 70) | class GeminiSolver(Solver): method __init__ (line 75) | def __init__( method model (line 92) | def model(self) -> str: method _solve (line 95) | def _solve( method _convert_msgs_to_google_format (line 158) | def _convert_msgs_to_google_format(msgs: list[Message]) -> list[Google... method name (line 191) | def name(self) -> str: method model_version (line 195) | def model_version(self) -> Union[str, dict]: method __deepcopy__ (line 198) | def __deepcopy__(self, memo): FILE: evals/solvers/providers/google/gemini_solver_test.py function dummy_recorder (line 14) | def dummy_recorder(): function gemini_solver (line 21) | def gemini_solver(): function test_solver (line 30) | def test_solver(dummy_recorder, gemini_solver): function test_message_format (line 51) | def test_message_format(): FILE: evals/solvers/providers/openai/openai_assistants_solver.py class OpenAIAssistantsSolver (line 28) | class OpenAIAssistantsSolver(Solver): method __init__ (line 54) | def __init__( method _run_assistant_retrying (line 95) | def _run_assistant_retrying(self, task_state: TaskState): method _solve (line 109) | def _solve( method copy (line 192) | def copy(self): method _create_file (line 204) | def _create_file(self, file_path: str) -> str: method _create_files (line 223) | def _create_files(self, file_paths: list[str]) -> list[str]: method _get_last_assistant_message_idx (line 231) | def _get_last_assistant_message_idx(self, messages: list[Message]) -> ... method _convert_to_user_message (line 238) | def _convert_to_user_message(self, message: Message) -> Message: method _wait_on_run (line 248) | def _wait_on_run(self, run: Run, thread: Thread) -> Run: method name (line 262) | def name(self) -> str: method model_version (line 266) | def model_version(self) -> Union[str, dict]: FILE: evals/solvers/providers/openai/openai_assistants_solver_test.py function dummy_data_file (line 21) | def dummy_data_file(scope="session"): function dummy_recorder (line 35) | def dummy_recorder(): function vanilla_solver (line 42) | def vanilla_solver(): function code_interpreter_solver (line 50) | def code_interpreter_solver(): function retrieval_solver (line 59) | def retrieval_solver(): function test_solver_copying (line 68) | def test_solver_copying(dummy_recorder, vanilla_solver): function test_multiturn_conversation (line 84) | def test_multiturn_conversation(dummy_recorder, vanilla_solver): function test_code_interpreter (line 107) | def test_code_interpreter(dummy_recorder, code_interpreter_solver): function test_task_description (line 126) | def test_task_description(dummy_recorder, vanilla_solver): function test_code_interpreter_file (line 145) | def test_code_interpreter_file(dummy_recorder, dummy_data_file, code_int... function test_retrieval_file (line 172) | def test_retrieval_file(dummy_recorder, dummy_data_file, retrieval_solver): function test_file_cache (line 206) | def test_file_cache(dummy_recorder, dummy_data_file, retrieval_solver): FILE: evals/solvers/providers/openai/openai_solver.py class OpenAISolver (line 22) | class OpenAISolver(Solver): method __init__ (line 31) | def __init__( method model (line 65) | def model(self) -> str: method name (line 73) | def name(self) -> str: method model_version (line 77) | def model_version(self) -> Union[str, dict]: method _is_chat_model (line 87) | def _is_chat_model(self, model: str) -> bool: method _completion_exception (line 96) | def _completion_exception(self) -> Exception: method _api_base (line 104) | def _api_base(self) -> Optional[str]: method _api_key (line 110) | def _api_key(self) -> Optional[str]: method _solve (line 115) | def _solve(self, task_state: TaskState, **kwargs) -> SolverResult: method _perform_prechecks (line 159) | def _perform_prechecks(self, msgs: list[dict[str, str]]) -> Optional[S... method _process_msgs (line 181) | def _process_msgs(self, raw_msgs: list[dict[str, str]]) -> list[dict[s... method _handle_completion_exception (line 189) | def _handle_completion_exception(self, e: Exception) -> SolverResult: method _render_completion_prompt (line 222) | def _render_completion_prompt(self, msgs: list[dict[str, str]]) -> str: method _parse_completion_response (line 235) | def _parse_completion_response(self, raw_response: str) -> str: method _get_msg_separators (line 249) | def _get_msg_separators(self) -> list[str]: method _get_completion_fn_cls (line 256) | def _get_completion_fn_cls(self, model: str) -> Any: method _preprocess_completion_fn_options (line 274) | def _preprocess_completion_fn_options(self) -> dict: method _make_logit_bias (line 287) | def _make_logit_bias(self, valid_answers: list[str], model: str) -> di... FILE: evals/solvers/providers/together/together_solver.py function is_chat_model (line 12) | def is_chat_model(model: str) -> bool: class TogetherSolver (line 27) | class TogetherSolver(OpenAISolver): method __init__ (line 45) | def __init__(self, merge_adjacent_msgs: bool = False, **kwargs): method _api_base (line 52) | def _api_base(self) -> Optional[str]: method _api_key (line 57) | def _api_key(self) -> Optional[str]: method _completion_exception (line 62) | def _completion_exception(self) -> Exception: method _is_chat_model (line 69) | def _is_chat_model(self, model: str) -> bool: method _preprocess_completion_fn_options (line 76) | def _preprocess_completion_fn_options(self) -> dict: method _perform_prechecks (line 83) | def _perform_prechecks(self, msgs: list[dict[str, str]]) -> Optional[S... method _process_msgs (line 91) | def _process_msgs(self, msgs: list[dict[str, str]]) -> list[dict[str, ... method _handle_completion_exception (line 130) | def _handle_completion_exception(self, e: Exception) -> SolverResult: FILE: evals/solvers/providers/together/together_solver_test.py function llama_solver (line 7) | def llama_solver(): function llama_solver_merge (line 17) | def llama_solver_merge(): function test_single_system_msg (line 27) | def test_single_system_msg(llama_solver): function test_system_assistant_msgs (line 37) | def test_system_assistant_msgs(llama_solver): function test_system_user_msg (line 49) | def test_system_user_msg(llama_solver): function test_final_system_msg (line 61) | def test_final_system_msg(llama_solver): function test_combined (line 75) | def test_combined(llama_solver): function test_merge (line 89) | def test_merge(llama_solver_merge): function test_advanced_merge (line 102) | def test_advanced_merge(llama_solver_merge): FILE: evals/solvers/solver.py class SolverResult (line 18) | class SolverResult: method __init__ (line 19) | def __init__(self, output: str, **metadata): method output (line 24) | def output(self) -> str: method metadata (line 28) | def metadata(self) -> dict: method to_json (line 31) | def to_json(self) -> str: class Solver (line 41) | class Solver(ABC, CompletionFn): method __init__ (line 44) | def __init__( method _solve (line 60) | def _solve( method __call__ (line 76) | def __call__( method name (line 100) | def name(self) -> str: method model_version (line 111) | def model_version(self) -> Union[str, dict]: method copy (line 122) | def copy(self: SolverType) -> SolverType: class DummySolver (line 128) | class DummySolver(Solver): method _solve (line 129) | def _solve( class NestedSolver (line 137) | class NestedSolver(Solver): method __init__ (line 142) | def __init__(self, *, postprocessors: list[str] = [], registry=None, *... method get_solver (line 160) | def get_solver(self, solver_name: str) -> Solver: method _create_solver (line 188) | def _create_solver(self, solver_spec: SolverSpec) -> Solver: method copy (line 191) | def copy(self: SolverType) -> SolverType: method model_version (line 200) | def model_version(self) -> Union[str, dict]: function create_solver (line 212) | def create_solver(solver_spec: dict) -> Solver: FILE: evals/solvers/solver_test.py class EchoSolver (line 8) | class EchoSolver(Solver): method _solve (line 13) | def _solve( function dummy_recorder (line 22) | def dummy_recorder(): function test_echo_solver (line 28) | def test_echo_solver(dummy_recorder): function test_echo_solver_with_postprocessors (line 36) | def test_echo_solver_with_postprocessors(dummy_recorder): FILE: evals/solvers/utils.py function maybe_wrap_with_compl_fn (line 10) | def maybe_wrap_with_compl_fn(ambiguous_executor: Union[CompletionFn, Sol... function maybe_wrap_with_solver (line 28) | def maybe_wrap_with_solver(ambiguous_executor: Union[Solver, CompletionF... FILE: evals/task_state.py class Message (line 6) | class Message: method to_dict (line 19) | def to_dict(self): class TaskState (line 24) | class TaskState: FILE: evals/utils/api_utils.py function create_retrying (line 15) | def create_retrying(func: callable, retry_exceptions: tuple[Exception], ... FILE: evals/utils/log_utils.py function get_final_results_from_dir (line 6) | def get_final_results_from_dir(log_dir: Union[str, Path]) -> dict[Path, ... function get_specs_from_dir (line 17) | def get_specs_from_dir(log_dir: Union[str, Path]) -> dict[Path, dict]: function extract_final_results (line 28) | def extract_final_results(path: Path) -> dict: function extract_individual_results (line 45) | def extract_individual_results(path: Path, type_string: str = "metrics")... function extract_spec (line 64) | def extract_spec(path: Path) -> dict: FILE: evals/utils/misc.py function t (line 9) | def t(duration: float) -> str: function make_object (line 20) | def make_object(object_ref: str, *args: Any, **kwargs: Any) -> Any: FILE: evals/utils/snowflake.py function _first_not_none (line 14) | def _first_not_none(*args): class SnowflakeError (line 21) | class SnowflakeError(Exception): class SnowflakeConnection (line 25) | class SnowflakeConnection: method __init__ (line 26) | def __init__( method _ensure_connected (line 60) | def _ensure_connected(self): method cursor (line 91) | def cursor(self, *args, **kwargs): method __call__ (line 97) | def __call__(self, *args, **kwargs): method query (line 104) | def query(self, *args, many=False, pandas_out=False, list_out=False, *... method robust_query (line 115) | def robust_query(self, max_trials: Optional[int] = None, *args, **kwar... FILE: evals/utils/test.py class TestCompletionResult (line 7) | class TestCompletionResult(CompletionResult): method __init__ (line 11) | def __init__(self, completion: str): method get_completions (line 14) | def get_completions(self) -> list[str]: class TestCompletionFn (line 18) | class TestCompletionFn(CompletionFn): method __init__ (line 22) | def __init__(self, completion: str): method __call__ (line 25) | def __call__( FILE: scripts/battle_generator.py function format (line 13) | def format(template: str, **kwargs: str) -> str: FILE: scripts/modelgraded_generator.py function format (line 9) | def format(template: str, **kwargs: str) -> str: FILE: scripts/pattern_identification_generator.py function generate_example (line 23) | def generate_example() -> tuple[str, list[str], Literal["foo", "bar"]]: function generate_exemplars_str (line 32) | def generate_exemplars_str(num_exemplars: int = 8) -> str: function generate_eval_examples (line 42) | def generate_eval_examples( FILE: tests/unit/evals/test_metrics.py function test_get_accuracy (line 19) | def test_get_accuracy(