SYMBOL INDEX (41 symbols across 5 files) FILE: minbpe/base.py function get_stats (line 13) | def get_stats(ids, counts=None): function merge (line 25) | def merge(ids, pair, idx): function replace_control_characters (line 44) | def replace_control_characters(s: str) -> str: function render_token (line 57) | def render_token(t: bytes) -> str: class Tokenizer (line 66) | class Tokenizer: method __init__ (line 69) | def __init__(self): method train (line 76) | def train(self, text, vocab_size, verbose=False): method encode (line 80) | def encode(self, text): method decode (line 84) | def decode(self, ids): method _build_vocab (line 88) | def _build_vocab(self): method save (line 97) | def save(self, file_prefix): method load (line 140) | def load(self, model_file): FILE: minbpe/basic.py class BasicTokenizer (line 15) | class BasicTokenizer(Tokenizer): method __init__ (line 17) | def __init__(self): method train (line 20) | def train(self, text, vocab_size, verbose=False): method decode (line 51) | def decode(self, ids): method encode (line 57) | def encode(self, text): FILE: minbpe/gpt4.py function bpe (line 11) | def bpe(mergeable_ranks, token, max_rank): function recover_merges (line 29) | def recover_merges(mergeable_ranks): class GPT4Tokenizer (line 57) | class GPT4Tokenizer(RegexTokenizer): method __init__ (line 60) | def __init__(self): method _encode_chunk (line 81) | def _encode_chunk(self, text_bytes): method decode (line 87) | def decode(self, ids): method train (line 95) | def train(self, text, vocab_size, verbose=False): method save (line 103) | def save(self, file_prefix): method load (line 106) | def load(self, model_file): method save_vocab (line 109) | def save_vocab(self, vocab_file): FILE: minbpe/regex.py class RegexTokenizer (line 22) | class RegexTokenizer(Tokenizer): method __init__ (line 24) | def __init__(self, pattern=None): method train (line 36) | def train(self, text, vocab_size, verbose=False): method register_special_tokens (line 72) | def register_special_tokens(self, special_tokens): method decode (line 78) | def decode(self, ids): method _encode_chunk (line 92) | def _encode_chunk(self, text_bytes): method encode_ordinary (line 111) | def encode_ordinary(self, text): method encode (line 123) | def encode(self, text, allowed_special="none_raise"): FILE: tests/test_tokenizer.py function unpack (line 17) | def unpack(text): function test_encode_decode_identity (line 54) | def test_encode_decode_identity(tokenizer_factory, text): function test_gpt4_tiktoken_equality (line 63) | def test_gpt4_tiktoken_equality(text): function test_gpt4_tiktoken_equality_special_tokens (line 72) | def test_gpt4_tiktoken_equality_special_tokens(): function test_wikipedia_example (line 81) | def test_wikipedia_example(tokenizer_factory): function test_save_load (line 110) | def test_save_load(special_tokens):