gitextract_lbfgru6r/

├── .gitignore
├── LICENSE
├── README.md
├── configs/
│   ├── data/
│   │   ├── cc-100.yaml
│   │   └── wikipedia.yaml
│   └── model/
│       ├── bert_base_character.yaml
│       ├── bert_base_wordpiece.yaml
│       ├── bert_large_character.yaml
│       └── bert_large_wordpiece.yaml
├── convert_tf2_ckpt_for_all_frameworks.py
├── create_pretraining_data.py
├── hf_model_configs/
│   ├── bert_base_character/
│   │   ├── config.json
│   │   └── tokenizer_config.json
│   ├── bert_base_wordpiece/
│   │   ├── config.json
│   │   └── tokenizer_config.json
│   ├── bert_large_character/
│   │   ├── config.json
│   │   └── tokenizer_config.json
│   └── bert_large_wordpiece/
│       ├── config.json
│       └── tokenizer_config.json
├── japanese_tokenizers/
│   ├── implementations.py
│   └── pre_tokenizers.py
├── make_alphabet_from_unidic.py
├── make_corpus_wiki.py
├── masked_lm_example.ipynb
├── merge_split_corpora.py
├── model_configs/
│   ├── bert_base_character/
│   │   └── config.json
│   ├── bert_base_wordpiece/
│   │   └── config.json
│   ├── bert_large_character/
│   │   └── config.json
│   └── bert_large_wordpiece/
│       └── config.json
├── requirements.txt
├── tokenization.py
└── train_tokenizer.py