Repository: cl-tohoku/bert-japanese Branch: main Commit: e4c8b003abd0 Files: 32 Total size: 96.2 KB Directory structure: gitextract_lbfgru6r/ ├── .gitignore ├── LICENSE ├── README.md ├── configs/ │ ├── data/ │ │ ├── cc-100.yaml │ │ └── wikipedia.yaml │ └── model/ │ ├── bert_base_character.yaml │ ├── bert_base_wordpiece.yaml │ ├── bert_large_character.yaml │ └── bert_large_wordpiece.yaml ├── convert_tf2_ckpt_for_all_frameworks.py ├── create_pretraining_data.py ├── hf_model_configs/ │ ├── bert_base_character/ │ │ ├── config.json │ │ └── tokenizer_config.json │ ├── bert_base_wordpiece/ │ │ ├── config.json │ │ └── tokenizer_config.json │ ├── bert_large_character/ │ │ ├── config.json │ │ └── tokenizer_config.json │ └── bert_large_wordpiece/ │ ├── config.json │ └── tokenizer_config.json ├── japanese_tokenizers/ │ ├── implementations.py │ └── pre_tokenizers.py ├── make_alphabet_from_unidic.py ├── make_corpus_wiki.py ├── masked_lm_example.ipynb ├── merge_split_corpora.py ├── model_configs/ │ ├── bert_base_character/ │ │ └── config.json │ ├── bert_base_wordpiece/ │ │ └── config.json │ ├── bert_large_character/ │ │ └── config.json │ └── bert_large_wordpiece/ │ └── config.json ├── requirements.txt ├── tokenization.py └── train_tokenizer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Pretrained Japanese BERT models This is a repository of pretrained Japanese BERT models. The models are available in [Transformers](https://github.com/huggingface/transformers) by Hugging Face. - Model hub: https://huggingface.co/tohoku-nlp This version of README contains information for the following models: - [`tohoku-nlp/bert-base-japanese-v3`](https://huggingface.co/tohoku-nlp/bert-base-japanese-v3) - [`tohoku-nlp/bert-base-japanese-char-v3`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v3) - [`tohoku-nlp/bert-large-japanese-v2`](https://huggingface.co/tohoku-nlp/bert-large-japanese-v2) - [`tohoku-nlp/bert-large-japanese-char-v2`](https://huggingface.co/tohoku-nlp/bert-large-japanese-char-v2) For information and codes for the following models, refer to the [v2.0](https://github.com/cl-tohoku/bert-japanese/tree/v2.0) tag of this repository: - [`tohoku-nlp/bert-base-japanese-v2`](https://huggingface.co/tohoku-nlp/bert-base-japanese-v2) - [`tohoku-nlp/bert-base-japanese-char-v2`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v2) - [`tohoku-nlp/bert-large-japanese`](https://huggingface.co/tohoku-nlp/bert-large-japanese) - [`tohoku-nlp/bert-large-japanese-char`](https://huggingface.co/tohoku-nlp/bert-large-japanese-char) For information and codes for the following models, refer 
to the [v1.0](https://github.com/cl-tohoku/bert-japanese/tree/v1.0) tag of this repository: - [`tohoku-nlp/bert-base-japanese`](https://huggingface.co/tohoku-nlp/bert-base-japanese) - [`tohoku-nlp/bert-base-japanese-whole-word-masking`](https://huggingface.co/tohoku-nlp/bert-base-japanese-whole-word-masking) - [`tohoku-nlp/bert-base-japanese-char`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char) - [`tohoku-nlp/bert-base-japanese-char-whole-word-masking`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-whole-word-masking) ## Model Architecture The architecture of our models are the same as the original BERT models proposed by Google. - **BERT-base** models consist of 12 layers, 768 dimensions of hidden states, and 12 attention heads. - **BERT-large** models consist of 24 layers, 1024 dimensions of hidden states, and 16 attention heads. ## Training Data The models are trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 
### Generating corpus files ```sh # For CC-100 $ mkdir -p $WORK_DIR/corpus/cc-100 $ python merge_split_corpora.py \ --input_files $DATA_DIR/cc-100/ja.txt.xz \ --output_dir $WORK_DIR/corpus/cc-100 \ --num_files 64 # For Wikipedia $ mkdir -p $WORK_DIR/corpus/wikipedia $ python make_corpus_wiki.py \ --input_file $DATA_DIR/wikipedia/cirrussearch/20230102/jawiki-20230102-cirrussearch-content.json.gz \ --output_file $WORK_DIR/corpus/wikipedia/corpus.txt.gz \ --min_sentence_length 10 \ --max_sentence_length 200 \ --mecab_option '-r <path to mecabrc> -d <path to mecab-ipadic-neologd dictionary>' $ python merge_split_corpora.py \ --input_files $WORK_DIR/corpus/wikipedia/corpus.txt.gz \ --output_dir $WORK_DIR/corpus/wikipedia \ --num_files 8 # Sample 10M sentences for training tokenizers $ cat $WORK_DIR/corpus/wikipedia/corpus_*.txt|grep -a -v '^$'|shuf|head -n 10000000 > $WORK_DIR/corpus/wikipedia/corpus_sampled.txt ``` ## Tokenization For each of BERT-base and BERT-large, we provide two models with different tokenization methods. - For **`wordpiece`** models, the texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. The vocabulary size is 32768. - For **`character`** models, the texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into characters. The vocabulary size is 7027, which covers all the characters present in Unidic 2.1.2 dictionary. We used [unidic-lite](https://github.com/polm/unidic-lite) dictionary for tokenization. 
### Generating a set of characters ```sh $ mkdir -p $WORK_DIR/tokenizers/alphabet $ python make_alphabet_from_unidic.py \ --lex_file $DATA_DIR/unidic-mecab-2.1.2_src/lex.csv \ --output_file $WORK_DIR/tokenizers/alphabet/unidic_lite.txt ``` ### Training tokenizers ```sh # WordPiece $ python train_tokenizer.py \ --input_files $WORK_DIR/corpus/wikipedia/corpus_sampled.txt \ --output_dir $WORK_DIR/tokenizers/wordpiece_unidic_lite \ --pre_tokenizer_type mecab \ --mecab_dic_type unidic_lite \ --vocab_size 32768 \ --limit_alphabet 7012 \ --initial_alphabet_file $WORK_DIR/tokenizers/alphabet/unidic_lite.txt \ --num_unused_tokens 10 \ --wordpieces_prefix '##' # Character $ mkdir $WORK_DIR/tokenizers/character_unidic_lite $ head -n 7027 $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt > $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt ``` ### Generating pretraining data ```sh # WordPiece on CC-100 # Each process takes about 2h50m and 60GB RAM, producing 15.2M instances $ mkdir -p $WORK_DIR/pretraining_data/wordpiece_unidic_lite/cc-100 $ seq -f %02g 1 64|xargs -L 1 -I {} -P 2 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/cc-100/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/wordpiece_unidic_lite/cc-100/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type wordpiece \ --mecab_dic_type unidic_lite \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 128 \ --max_predictions_per_seq 19 \ --masked_lm_prob 0.15 \ --dupe_factor 5 # WordPiece on Wikipedia # Each process takes about 7h30m and 138GB RAM, producing 18.4M instances $ mkdir -p $WORK_DIR/pretraining_data/wordpiece_unidic_lite/wikipedia $ seq -f %02g 1 8|xargs -L 1 -I {} -P 1 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/wikipedia/corpus_{}.txt \ --output_file 
$WORK_DIR/pretraining_data/wordpiece_unidic_lite/wikipedia/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type wordpiece \ --mecab_dic_type unidic_lite \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 512 \ --max_predictions_per_seq 76 \ --masked_lm_prob 0.15 \ --dupe_factor 30 # Character on CC-100 # Each process takes about 3h30m and 82GB RAM, producing 18.4M instances $ mkdir -p $WORK_DIR/pretraining_data/character_unidic_lite/cc-100 $ seq -f %02g 1 64|xargs -L 1 -I {} -P 2 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/cc-100/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/character_unidic_lite/cc-100/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type character \ --mecab_dic_type unidic_lite \ --vocab_has_no_subword_prefix \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 128 \ --max_predictions_per_seq 19 \ --masked_lm_prob 0.15 \ --dupe_factor 5 # Character on Wikipedia # Each process takes about 10h30m and 205GB RAM, producing 23.7M instances $ mkdir -p $WORK_DIR/pretraining_data/character_unidic_lite/wikipedia $ seq -f %02g 1 8|xargs -L 1 -I {} -P 1 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/wikipedia/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/character_unidic_lite/wikipedia/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type character \ --mecab_dic_type unidic_lite \ --vocab_has_no_subword_prefix \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 512 \ --max_predictions_per_seq 76 \ --masked_lm_prob 0.15 \ --dupe_factor 30 ``` ## Training We trained the models first on the 
CC-100 corpus and then on the Wikipedia corpus. Generally speaking, the texts of Wikipedia are much cleaner than those of CC-100, but the amount of text is much smaller. We expect that our two-stage training scheme lets the model be trained on a large amount of text while preserving the quality of language that the model eventually learns. For training of the MLM (masked language modeling) objective, we introduced **whole word masking** in which all subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. To conduct training of each model, we used a v3-8 instance of Cloud TPUs provided by [TensorFlow Research Cloud program](https://www.tensorflow.org/tfrc/). The training took about 16 and 56 days for BERT-base and BERT-large models, respectively. ### Creating a TPU VM and connecting to it **Note:** We set the runtime version of the TPU as `2.11.0`, where TensorFlow v2.11 is used. It is important to specify the same version if you wish to reuse our codes, otherwise it may not work properly. Here we use [Google Cloud CLI](https://cloud.google.com/cli). ```sh $ gcloud compute tpus tpu-vm create <tpu-name> --zone=<zone> --accelerator-type=v3-8 --version=tpu-vm-tf-2.11.0 $ gcloud compute tpus tpu-vm ssh <tpu-name> --zone=<zone> ``` ### Training of the models The following commands are executed in the TPU VM. It is recommended that you run the commands in a Tmux session. **Note:** All the necessary files (i.e., pretraining data and config files) need to be stored in a Google Cloud Storage (GCS) bucket in advance. 
#### BERT-base, WordPiece ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/wordpiece_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/wordpiece_unidic_lite" # Start training on CC-100 # It will take 6 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_base_wordpiece.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 10 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_base_wordpiece.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_base/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-base, Character ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/character_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/character_unidic_lite" # Start training on CC-100 # It will take 6 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ 
--experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_base_character.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 10 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_base_character.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_base/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-large, WordPiece ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/wordpiece_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/wordpiece_unidic_lite" # Start training on CC-100 # It will take 23 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_large_wordpiece.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 33 days to finish on 
a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_large_wordpiece.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_large/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-large, Character ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/character_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/character_unidic_lite" # Start training on CC-100 # It will take 23 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_large_character.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 33 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_large_character.yaml \ 
--params_override="task.init_checkpoint=$MODEL_DIR/bert_large/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` ### Deleting a TPU VM ```sh $ gcloud compute tpus tpu-vm delete --zone= ``` ## Model Conversion You can convert the TensorFlow model checkpoint to a PyTorch model file. **Note:** The model conversion script is designed for the models trained with TensorFlow v2.11.0. The script may not work for models trained with a different version of TensorFlow. ```sh # For BERT-base, WordPiece $ VOCAB_FILE=$WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_base_wordpiece/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_base_wordpiece $ TF_CKPT_PATH=$WORK_DIR/model/wordpiece_unidic_lite/bert_base/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-base-japanese-v3 # For BERT-base, Character $ VOCAB_FILE=$WORK_DIR/tokenizers/character_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_base_character/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_base_character $ TF_CKPT_PATH=$WORK_DIR/model/character_unidic_lite/bert_base/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-base-japanese-char-v3 # For BERT-large, WordPiece $ VOCAB_FILE=$WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_large_wordpiece/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_large_wordpiece $ TF_CKPT_PATH=$WORK_DIR/model/wordpiece_unidic_lite/bert_large/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-large-japanese-v2 # For BERT-large, Character $ VOCAB_FILE=$WORK_DIR/tokenizers/character_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_large_character/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_large_character $ 
TF_CKPT_PATH=$WORK_DIR/model/character_unidic_lite/bert_large/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-large-japanese-char-v2 # Run the model conversion script $ mkdir -p $OUTPUT_DIR $ python convert_tf2_checkpoint_to_all_frameworks.py \ --tf_checkpoint_path $TF_CKPT_PATH \ --tf_config_file $TF_CONFIG_FILE \ --output_path $OUTPUT_DIR $ cp $HF_CONFIG_DIR/* $OUTPUT_DIR $ cp $VOCAB_FILE $OUTPUT_DIR ``` ## Model Performances We evaluated our models' performances on the [JGLUE](https://github.com/yahoojapan/JGLUE) benchmark tasks. For each task, the model is fine-tuned on the training set and evaluated on the development set (the test sets are not publicly available as of this writing.) The hyperparameters were searched within the same set of values as the ones specified in the [JGLUE fine-tuning README](https://github.com/yahoojapan/JGLUE/tree/main/fine-tuning). The results of our (informal) experiments are below. **Note:** These results should be viewed as informative only, since each setting was experimented with only one fixed random seed. | Model | MARC-ja | JSTS | JNLI | JSQuAD | JCommonsenseQA | | :------------------------------------- | :-----: | :----------------: | :---: | :-----------: | :------------: | | | Acc. | Pearson / Spearman | Acc. | EM / F1 | Acc. 
| | `bert-base-japanese-v2` | 0.958 | 0.910 / 0.871 | 0.901 | 0.869 / 0.939 | 0.803 | | `bert-base-japanese-v3` **New!** | 0.962 | 0.919 / 0.881 | 0.907 | 0.880 / 0.946 | 0.848 | | `bert-base-japanese-char-v2` | 0.957 | 0.891 / 0.851 | 0.896 | 0.870 / 0.938 | 0.724 | | `bert-base-japanese-char-v3` **New!** | 0.959 | 0.914 / 0.875 | 0.903 | 0.871 / 0.939 | 0.786 | | `bert-large-japanese` | 0.958 | 0.913 / 0.874 | 0.902 | 0.881 / 0.946 | 0.823 | | `bert-large-japanese-v2` **New!** | 0.960 | 0.926 / 0.893 | 0.929 | 0.893 / 0.956 | 0.893 | | `bert-large-japanese-char` | 0.958 | 0.883 / 0.842 | 0.899 | 0.870 / 0.938 | 0.753 | | `bert-large-japanese-char-v2` **New!** | 0.961 | 0.921 / 0.884 | 0.910 | 0.892 / 0.952 | 0.859 | ## Licenses The pretrained models and the codes in this repository are distributed under the Apache License 2.0. ## Related Work - Original BERT models and codes by Google Research Team - https://github.com/google-research/bert (for TensorFlow v1) - https://github.com/tensorflow/models/tree/master/official/nlp (for TensorFlow v2) ## Acknowledgments The distributed models are trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 
================================================ FILE: configs/data/cc-100.yaml ================================================ task: init_checkpoint: '' train_data: drop_remainder: true global_batch_size: 2048 is_training: true max_predictions_per_seq: 19 seq_length: 128 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true validation_data: drop_remainder: false global_batch_size: 2048 is_training: false max_predictions_per_seq: 19 seq_length: 128 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true ================================================ FILE: configs/data/wikipedia.yaml ================================================ task: init_checkpoint: '' train_data: drop_remainder: true global_batch_size: 512 is_training: true max_predictions_per_seq: 76 seq_length: 512 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true validation_data: drop_remainder: false global_batch_size: 512 is_training: false max_predictions_per_seq: 76 seq_length: 512 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true ================================================ FILE: configs/model/bert_base_character.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 768 initializer_range: 0.02 intermediate_size: 3072 max_position_embeddings: 512 num_attention_heads: 12 num_layers: 12 type_vocab_size: 2 vocab_size: 7027 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.0001 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 
summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: configs/model/bert_base_wordpiece.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 768 initializer_range: 0.02 intermediate_size: 3072 max_position_embeddings: 512 num_attention_heads: 12 num_layers: 12 type_vocab_size: 2 vocab_size: 32768 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.0001 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: configs/model/bert_large_character.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 1024 initializer_range: 0.02 intermediate_size: 4096 max_position_embeddings: 512 num_attention_heads: 16 num_layers: 24 type_vocab_size: 2 vocab_size: 7027 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.00005 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 
validation_steps: 64 ================================================ FILE: configs/model/bert_large_wordpiece.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 1024 initializer_range: 0.02 intermediate_size: 4096 max_position_embeddings: 512 num_attention_heads: 16 num_layers: 24 type_vocab_size: 2 vocab_size: 32768 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.00005 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: convert_tf2_ckpt_for_all_frameworks.py ================================================ # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse
import logging
import os
import re

import tensorflow as tf
import torch
from transformers import BertConfig, BertForPreTraining, FlaxBertForPreTraining, TFBertForPreTraining

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_tf2_weights_in_bert(model, tf_checkpoint_path, config):
    """Copy weights from a TF2 (tensorflow-models) checkpoint into ``model`` in place.

    Args:
        model: A ``transformers.BertForPreTraining`` instance to fill in.
        tf_checkpoint_path: Path prefix of the TensorFlow 2.x checkpoint.
        config: The ``BertConfig`` describing the model architecture.

    Returns:
        The same ``model`` instance with its parameters overwritten.

    Raises:
        ValueError: If a checkpoint tensor's shape does not match the shape of
            the corresponding PyTorch parameter.
    """
    tf_checkpoint_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from %s", tf_checkpoint_path)

    for full_name, shape in tf.train.list_variables(tf_checkpoint_path):
        pointer = model
        trace = []  # dotted path of the target PyTorch parameter, for logging

        if len(shape) == 0:
            logger.info("Skipping non-tensor variable: %s", full_name)
            continue
        if "optimizer/" in full_name:
            logger.info("Skipping optimizer weights: %s", full_name)
            continue

        split_name = full_name.split("/")
        name = split_name.pop(0)

        if name == "encoder":
            pointer = getattr(pointer, "bert")
            trace.append("bert")
            name = split_name.pop(0)
            if name.startswith("layer_with_weights"):
                layer_num = int(name.split("-")[-1])
                # layer 0 would be the word embedding, which is not saved as an
                # encoder layer with tensorflow-models 2.10.0; it is recovered
                # from the masked-LM embedding table at the bottom of this loop.
                if layer_num == 1:  # position_embedding
                    trace.extend(["embeddings", "position_embeddings"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "position_embeddings")
                elif layer_num == 2:  # type_embeddings
                    trace.extend(["embeddings", "token_type_embeddings"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "token_type_embeddings")
                elif layer_num == 3:  # embeddings/layer_norm
                    trace.extend(["embeddings", "LayerNorm"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "LayerNorm")
                elif layer_num >= 4 and layer_num < config.num_hidden_layers + 4:
                    # transformer/layer_x (offset by the 4 embedding-side layers)
                    trace.extend(["encoder", "layer", str(layer_num - 4)])
                    pointer = getattr(pointer, "encoder")
                    pointer = getattr(pointer, "layer")
                    pointer = pointer[layer_num - 4]
                elif layer_num == config.num_hidden_layers + 4:
                    # pooler_transform (not trained with tensorflow-models 2.10.0)
                    continue
                else:
                    logger.warning("Skipping unknown weight name: %s", full_name)
                    continue
        elif name == "masked_lm":
            trace.extend(["cls", "predictions"])
            pointer = getattr(pointer, "cls")
            pointer = getattr(pointer, "predictions")
            name = split_name.pop(0)
            if name == "dense":
                trace.extend(["transform", "dense"])
                pointer = getattr(pointer, "transform")
                pointer = getattr(pointer, "dense")
            elif name == "embedding_table":
                trace.extend(["decoder", "weight"])
                pointer = getattr(pointer, "decoder")
                pointer = getattr(pointer, "weight")
            elif name == "layer_norm":
                trace.extend(["transform", "LayerNorm"])
                pointer = getattr(pointer, "transform")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "output_bias.Sbias":
                # NOTE(review): this literal looks garbled; the checkpoint
                # component is presumably "output_bias" — confirm against a
                # real checkpoint before relying on this branch.
                trace.extend(["bias"])
                pointer = getattr(pointer, "bias")
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)
                continue
        elif name == "model":
            names = split_name[:3]
            split_name = split_name[3:]
            if names == ["classification_heads", "0", "out_proj"]:
                trace.extend(["cls", "seq_relationship"])
                pointer = getattr(pointer, "cls")
                pointer = getattr(pointer, "seq_relationship")
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)
                continue
        elif name == "next_sentence..pooler_dense":
            trace.extend(["bert", "pooler", "dense"])
            pointer = getattr(pointer, "bert")
            pointer = getattr(pointer, "pooler")
            pointer = getattr(pointer, "dense")
        else:
            logger.warning("Skipping unknown weight name: %s", full_name)
            continue

        # Walk the remaining path components down to the concrete parameter.
        for name in split_name:
            if name == "_attention_layer":  # self-attention layer
                trace.append("attention")
                pointer = getattr(pointer, "attention")
            elif name == "_attention_layer_norm":  # output attention norm
                trace.extend(["attention", "output", "LayerNorm"])
                pointer = getattr(pointer, "attention")
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "_attention_output_dense":  # output attention dense
                trace.extend(["attention", "output", "dense"])
                pointer = getattr(pointer, "attention")
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "dense")
            elif name == "_intermediate_dense":  # attention intermediate dense
                trace.extend(["intermediate", "dense"])
                pointer = getattr(pointer, "intermediate")
                pointer = getattr(pointer, "dense")
            elif name == "_output_dense":  # output dense
                trace.extend(["output", "dense"])
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "dense")
            elif name == "_output_layer_norm":  # output layer norm
                trace.extend(["output", "LayerNorm"])
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "_key_dense":  # attention key
                trace.extend(["self", "key"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "key")
            elif name == "_query_dense":  # attention query
                trace.extend(["self", "query"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "query")
            elif name == "_value_dense":  # attention value
                trace.extend(["self", "value"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "value")
            elif name == "dense":  # shared dense layer name
                trace.append("dense")
                pointer = getattr(pointer, "dense")
            elif name in ["bias", "beta"]:  # biases / norm betas
                trace.append("bias")
                pointer = getattr(pointer, "bias")
            elif name in ["kernel", "gamma"]:  # weights / norm gammas
                trace.append("weight")
                pointer = getattr(pointer, "weight")
            elif name == "embeddings":  # embedding weights
                trace.append("weight")
                pointer = getattr(pointer, "weight")
            elif name == ".ATTRIBUTES":
                # full variable name ends with .ATTRIBUTES/VARIABLE_VALUE
                break
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)

        logger.info("Loading TF weight %s with shape %s", full_name, shape)
        array = tf.train.load_variable(tf_checkpoint_path, full_name)

        # For certain attention layers a reshape to the PyTorch parameter
        # layout is necessary before copying.
        trace = ".".join(trace)
        if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or \
                re.match(r"(\S+)\.attention\.output\.dense\.weight", trace):
            array = array.reshape(pointer.data.shape)
        if "kernel" in full_name:
            array = array.transpose()

        if pointer.shape == array.shape:
            pointer.data = torch.from_numpy(array)
        else:
            raise ValueError(
                f"Shape mismatch in layer {full_name}: "
                f"Model expects shape {pointer.shape} but layer contains shape: {array.shape}"
            )
        logger.info("Successfully set variable %s to PyTorch layer %s", full_name, trace)

        if full_name == "masked_lm/embedding_table/.ATTRIBUTES/VARIABLE_VALUE":
            # The masked-LM embedding table doubles as the word embedding
            # matrix, so copy the same tensor into the input embeddings too.
            word_embeddings_pointer = model.bert.embeddings.word_embeddings.weight
            word_embeddings_trace = "bert.embeddings.word_embeddings.weight"
            if word_embeddings_pointer.shape == array.shape:
                word_embeddings_pointer.data = torch.from_numpy(array)
            else:
                raise ValueError(
                    f"Shape mismatch in layer {full_name}: "
                    f"Model expects shape {word_embeddings_pointer.shape} but layer contains shape: {array.shape}"
                )
            logger.info("Successfully set variable %s to PyTorch layer %s", full_name, word_embeddings_trace)

    return model


def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, tf_config_path, output_path):
    """Convert a TF2 BERT checkpoint and save it in PyTorch, TF, and Flax formats.

    Args:
        tf_checkpoint_path: Path prefix of the TensorFlow 2.x checkpoint.
        tf_config_path: Path to the JSON config describing the architecture.
        output_path: Directory where all three model formats are written.
    """
    # Initialize a PyTorch model with the requested architecture.
    logger.info("Loading model based on config from %s...", tf_config_path)
    config = BertConfig.from_json_file(tf_config_path)
    model = BertForPreTraining(config)

    # Copy the checkpoint weights into the PyTorch model.
    logger.info("Loading weights from checkpoint %s...", tf_checkpoint_path)
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Save the model in PyTorch format.
    logger.info("Saving PyTorch model to %s...", output_path)
    model.save_pretrained(output_path)

    # Round-trip through the saved PyTorch weights for the TensorFlow format.
    logger.info("Reloading the saved model in TensorFlow format and saving to %s...", output_path)
    tf_model = TFBertForPreTraining.from_pretrained(output_path, from_pt=True)
    tf_model.save_pretrained(output_path)

    # And again for the JAX/Flax format.
    logger.info("Reloading the saved model in JAX/Flax format and saving to %s...", output_path)
    flax_model = FlaxBertForPreTraining.from_pretrained(output_path, from_pt=True)
    flax_model.save_pretrained(output_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tf_checkpoint_path",
        type=str,
        required=True,
        help="Path to the TensorFlow 2.x checkpoint path."
    )
    parser.add_argument(
        "--tf_config_file",
        type=str,
        required=True,
        help="The config json file corresponding to the BERT model. This specifies the model architecture.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to the output PyTorch model (must include filename).",
    )
    args = parser.parse_args()
    convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.tf_config_file, args.output_path)
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
import collections
import random

# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf

from tokenization import BertJapaneseTokenizer

FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
flags.DEFINE_string("word_tokenizer_type", None,
                    "Word tokenizer type (basic, mecab).")
flags.DEFINE_string("subword_tokenizer_type", None,
                    "Tokenizer type (wordpiece, character).")
flags.DEFINE_string("mecab_dic_type", None,
                    "Dictionary type for MecabTokenizer.")
flags.DEFINE_bool("vocab_has_no_subword_prefix", False,
                  "Whether the vocabulary contains no subword prefix.")
flags.DEFINE_bool(
    "do_whole_word_mask", False,
    "Whether to use whole word masking rather than per-WordPiece masking.")
flags.DEFINE_bool(
    "gzip_compress", False,
    "Whether to use `GZIP` compress option to get compressed TFRecord files.")
flags.DEFINE_bool(
    "use_v2_feature_names", False,
    "Whether to use the feature names consistent with the models.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20,
                     "Maximum number of masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345,
                     "Random seed for data generation.")
flags.DEFINE_integer(
    "dupe_factor", 10,
    "Number of times to duplicate the input data (with different masks).")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float(
    "short_seq_prob", 0.1,
    "Probability of creating sequences which are shorter than the "
    "maximum length.")


class TrainingInstance(object):
  """A single training instance (sentence pair)."""

  def __init__(self, tokens, segment_ids, masked_lm_positions,
               masked_lm_labels, is_random_next):
    self.tokens = tokens
    self.segment_ids = segment_ids
    self.is_random_next = is_random_next
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels

  def __str__(self):
    # Human-readable dump of every field, one per line, with a trailing
    # blank line (the format the original BERT scripts log).
    parts = [
        "tokens: %s" % " ".join(self.tokens),
        "segment_ids: %s" % " ".join(str(x) for x in self.segment_ids),
        "is_random_next: %s" % self.is_random_next,
        "masked_lm_positions: %s" % " ".join(
            str(x) for x in self.masked_lm_positions),
        "masked_lm_labels: %s" % " ".join(self.masked_lm_labels),
    ]
    return "\n".join(parts) + "\n\n"

  def __repr__(self):
    return self.__str__()


def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files,
                                    gzip_compress, use_v2_feature_names):
  """Creates TF example files from `TrainingInstance`s."""
  writers = [
      tf.io.TFRecordWriter(
          output_file, options="GZIP" if gzip_compress else "")
      for output_file in output_files
  ]

  writer_index = 0
  total_written = 0
  for (inst_index, instance) in enumerate(instances):
    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(instance.segment_ids)
    assert len(input_ids) <= max_seq_length

    # Zero-pad everything up to max_seq_length.
    while len(input_ids) < max_seq_length:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    masked_lm_positions = list(instance.masked_lm_positions)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    # Pad the masked-LM slots; padded slots get weight 0.0 so they are ignored.
    while len(masked_lm_positions) < max_predictions_per_seq:
      masked_lm_positions.append(0)
      masked_lm_ids.append(0)
      masked_lm_weights.append(0.0)

    next_sentence_label = 1 if instance.is_random_next else 0

    features = collections.OrderedDict()
    if use_v2_feature_names:
      features["input_word_ids"] = create_int_feature(input_ids)
      features["input_type_ids"] = create_int_feature(segment_ids)
    else:
      features["input_ids"] = create_int_feature(input_ids)
      features["segment_ids"] = create_int_feature(segment_ids)

    features["input_mask"] = create_int_feature(input_mask)
    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
    features["next_sentence_labels"] = create_int_feature(
        [next_sentence_label])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))

    # Round-robin over the output files.
    writers[writer_index].write(tf_example.SerializeToString())
    writer_index = (writer_index + 1) % len(writers)

    total_written += 1

    # Log the first few instances for eyeballing.
    if inst_index < 20:
      logging.info("*** Example ***")
      logging.info("tokens: %s", " ".join(instance.tokens))

      for feature_name in features.keys():
        feature = features[feature_name]
        values = []
        if feature.int64_list.value:
          values = feature.int64_list.value
        elif feature.float_list.value:
          values = feature.float_list.value
        logging.info("%s: %s", feature_name,
                     " ".join([str(x) for x in values]))

  for writer in writers:
    writer.close()

  logging.info("Wrote %d total instances", total_written)


def create_int_feature(values):
  """Wraps an iterable of ints as a tf.train.Feature."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
  """Wraps an iterable of floats as a tf.train.Feature."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))


def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng,
                              do_whole_word_mask=False):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.io.gfile.GFile(input_file, "rb") as reader:
      while True:
        line = reader.readline().decode("utf-8", "ignore")
        if not line:
          break
        line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.get_vocab().keys())
  instances = []
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
              do_whole_word_mask))

  rng.shuffle(instances)
  return instances


def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
    do_whole_word_mask=False):
  """Creates `TrainingInstance`s for a single document."""
  document = all_documents[document_index]

  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

  # We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

  # We DON'T just concatenate all of the tokens from a document into a long
  # sequence and choose an arbitrary split point because this would make the
  # next sentence prediction task too easy. Instead, we split the input into
  # segments "A" and "B" based on the actual "sentences" provided by the user
  # input.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is how many segments from `current_chunk` go into the `A`
        # (first) sentence.
        a_end = 1
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)

        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])

        tokens_b = []
        # Random next
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            if random_document_index != document_index:
              break

          random_document = all_documents[random_document_index]
          random_start = rng.randint(0, len(random_document) - 1)
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Assemble [CLS] A [SEP] B [SEP] with matching segment ids.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        (tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words,
             rng, do_whole_word_mask)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      current_chunk = []
      current_length = 0
    i += 1

  return instances


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng,
                                 do_whole_word_mask):
  """Creates the predictions for the masked LM objective."""
  cand_indexes = []
  for (i, token) in enumerate(tokens):
    if token == "[CLS]" or token == "[SEP]":
      continue
    # Whole Word Masking means that if we mask all of the wordpieces
    # corresponding to an original word. When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequent
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    #
    # Note that Whole Word Masking does *not* change the training code
    # at all -- we still predict each WordPiece independently, softmaxed
    # over the entire vocabulary.
    if (do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])

  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # If adding a whole-word mask would exceed the maximum number of
    # predictions, then just skip this candidate.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    is_any_index_covered = False
    for index in index_set:
      if index in covered_indexes:
        is_any_index_covered = True
        break
    if is_any_index_covered:
      continue
    for index in index_set:
      covered_indexes.add(index)

      masked_token = None
      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      else:
        # 10% of the time, keep original
        if rng.random() < 0.5:
          masked_token = tokens[index]
        # 10% of the time, replace with random word
        else:
          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  assert len(masked_lms) <= num_to_predict
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = []
  masked_lm_labels = []
  for p in masked_lms:
    masked_lm_positions.append(p.index)
    masked_lm_labels.append(p.label)

  return (output_tokens, masked_lm_positions, masked_lm_labels)


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Truncates a pair of sequences to a maximum sequence length."""
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_num_tokens:
      break

    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    assert len(trunc_tokens) >= 1

    # We want to sometimes truncate from the front and sometimes from the
    # back to add more randomness and avoid biases.
    if rng.random() < 0.5:
      del trunc_tokens[0]
    else:
      trunc_tokens.pop()


def main(_):
  tokenizer = BertJapaneseTokenizer(
      FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      word_tokenizer_type=FLAGS.word_tokenizer_type,
      subword_tokenizer_type=FLAGS.subword_tokenizer_type,
      mecab_kwargs={"mecab_dic": FLAGS.mecab_dic_type},
      vocab_has_no_subword_prefix=FLAGS.vocab_has_no_subword_prefix,
  )

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, rng, FLAGS.do_whole_word_mask)

  output_files = FLAGS.output_file.split(",")
  logging.info("*** Writing to output files ***")
  for output_file in output_files:
    logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files,
                                  FLAGS.gzip_compress,
                                  FLAGS.use_v2_feature_names)


if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  app.run(main)
hf_model_configs/bert_base_character/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_base_wordpiece/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: hf_model_configs/bert_base_wordpiece/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_large_character/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: hf_model_configs/bert_large_character/tokenizer_config.json ================================================ { "tokenizer_class": 
"BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_large_wordpiece/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: hf_model_configs/bert_large_wordpiece/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: japanese_tokenizers/implementations.py ================================================ # Copyright 2020 The HuggingFace Inc. team. # Copyright 2023 Masatoshi Suzuki (@singletongue). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from typing import Dict, Optional, Union

from tokenizers import AddedToken, normalizers, pre_tokenizers
from tokenizers.implementations import BertWordPieceTokenizer

from .pre_tokenizers import MeCabPreTokenizer


class JapaneseWordPieceTokenizer(BertWordPieceTokenizer):
    """BertWordPieceTokenizer specialized for Japanese.

    Adds NFKC+strip normalization, registers ``[unused*]`` special tokens,
    and optionally replaces the pre-tokenizer with a MeCab morphological
    splitter.

    Args:
        vocab: Path to a vocab file or a token->id mapping (None to train).
        unk_token/sep_token/cls_token/pad_token/mask_token: Special tokens.
        num_unused_tokens: How many ``[unused{i}]`` tokens to register.
        pre_tokenizer_type: "mecab" (morphological) or "whitespace".
        mecab_dic_type: MeCab dictionary name, forwarded to MeCabPreTokenizer
            (only meaningful when pre_tokenizer_type == "mecab").
        wordpieces_prefix: Subword continuation prefix (default "##").

    Raises:
        ValueError: If ``pre_tokenizer_type`` is not "mecab" or "whitespace".
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        num_unused_tokens: int = 10,
        pre_tokenizer_type: str = "mecab",
        mecab_dic_type: str = "unidic_lite",
        wordpieces_prefix: str = "##",
    ) -> None:
        super().__init__(
            vocab=vocab,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            wordpieces_prefix=wordpieces_prefix,
        )
        # Reserve [unused0]..[unusedN-1] so downstream users can repurpose them.
        self._tokenizer.add_special_tokens([f"[unused{i}]" for i in range(num_unused_tokens)])
        # NFKC unifies full-width/half-width forms, common in Japanese text.
        self._tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Strip()])

        # NOTE(review): "model" string differs from the class name
        # (BertWordPieceJapaneseTokenizer vs JapaneseWordPieceTokenizer) —
        # kept as-is since it is a serialized identifier.
        parameters = {
            "model": "BertWordPieceJapaneseTokenizer",
            "pre_tokenizer_type": pre_tokenizer_type,
            "mecab_dic_type": mecab_dic_type,
        }
        if pre_tokenizer_type == "mecab":
            self._tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(MeCabPreTokenizer(mecab_dic_type))
            # FIX: dropped a redundant re-assignment of
            # parameters["mecab_dic_type"] here — it is already set in the
            # dict literal above with the same value.
        elif pre_tokenizer_type == "whitespace":
            self._tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            raise ValueError("Invalid pre_tokenizer_type is specified.")
        self._parameters.update(parameters)
import os
from typing import List, Optional

from tokenizers import NormalizedString, PreTokenizedString


class MeCabPreTokenizer:
    """Custom pre-tokenizer that splits text into MeCab morphemes.

    Designed to be wrapped with ``pre_tokenizers.PreTokenizer.custom(...)``
    from the ``tokenizers`` library.

    Args:
        mecab_dic: One of "unidic_lite", "unidic", "ipadic", or None to use
            whatever ``mecab_option`` specifies.
        mecab_option: Extra MeCab command-line options, appended after the
            dictionary flags.

    Raises:
        ValueError: If ``mecab_dic`` is not a recognized dictionary name.
    """

    def __init__(self, mecab_dic: Optional[str] = None, mecab_option: Optional[str] = None) -> None:
        # Imported lazily so the module can be loaded without fugashi installed.
        import fugashi

        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "unidic_lite":
                import unidic_lite

                dic_dir = unidic_lite.DICDIR
            elif mecab_dic == "unidic":
                import unidic

                dic_dir = unidic.DICDIR
            elif mecab_dic == "ipadic":
                import ipadic

                dic_dir = ipadic.DICDIR
            else:
                raise ValueError("Invalid mecab_dic is specified.")

            # -d: dictionary directory, -r: resource file (mecabrc).
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f"-d {dic_dir} -r {mecabrc} " + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)

    def mecab_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split one NormalizedString into per-morpheme sub-slices.

        Slicing the NormalizedString (rather than building new strings)
        preserves the offset mapping back to the original text.
        """
        # PERF FIX: the original called str(normalized_string) twice per token
        # inside the loop; the text is loop-invariant, so convert it once.
        text = str(normalized_string)
        splits = []
        cursor = 0
        for token in self.mecab(text):
            # Search from `cursor` so repeated surfaces map to the right span.
            start = text.index(token.surface, cursor)
            end = start + len(token.surface)
            splits.append(normalized_string[start:end])
            cursor = end
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Entry point called by the tokenizers library."""
        pretok.split(self.mecab_split)
import argparse
import csv
from unicodedata import normalize


def main(args):
    """Extract the printable single-character alphabet from a UniDic lex CSV.

    Reads the surface form (first CSV column) of every entry, NFKC-normalizes
    each character, keeps those that remain a single printable character, and
    writes the sorted, deduplicated set to the output file, one per line.
    """
    alphabet = set()
    with open(args.lex_file, newline="") as lex_file:
        for row in csv.reader(lex_file):
            # An empty first field means the surface itself was a double
            # quote, which the CSV layer stripped — restore it.
            surface = row[0] or '"'
            for raw_char in surface:
                normalized = normalize("NFKC", raw_char)
                # Drop characters that expand under NFKC or are unprintable.
                if len(normalized) == 1 and normalized.isprintable():
                    alphabet.add(normalized)

    with open(args.output_file, "w") as out:
        for char in sorted(alphabet):
            print(char, file=out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lex_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    main(parser.parse_args())
import argparse
import gzip
import json
import os
import re
import unicodedata


class MeCabSentenceSplitter:
    """Splits Japanese text into sentences using MeCab morpheme analysis.

    A sentence boundary is placed after any token whose feature string
    contains both 記号 (symbol) and 句点 (period).
    """

    def __init__(self, mecab_option=None):
        # Imported lazily so the pure-text helpers below stay usable
        # without the MeCab stack installed.
        import fugashi

        if mecab_option is None:
            import unidic_lite

            dic_dir = unidic_lite.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f"-d {dic_dir} -r {mecabrc}"

        self.mecab = fugashi.GenericTagger(mecab_option)

    def __call__(self, text):
        """Return the list of sentences found in `text` (substrings of it)."""
        sentences = []
        start = 0
        end = 0
        for line in self.mecab.parse(text).split("\n"):
            if line == "EOS":
                # Flush any trailing text that lacks a closing period.
                if len(text[start:]) > 0:
                    sentences.append(text[start:])
                break

            token, token_info = line.split("\t", maxsplit=1)
            # Track position in the original text to slice exact spans.
            end = text.index(token, end) + len(token)
            if "記号" in token_info and "句点" in token_info:
                sentences.append(text[start:end])
                start = end

        return sentences


def filter_text(text):
    """Return False for text that should be dropped from the corpus."""
    # filter out text containing equations
    if "\displaystyle" in text:
        return False

    return True


def preprocess_text(text, title=None):
    """Normalize and strip Wikipedia-extraction artifacts from `text`.

    Applies NFKC normalization, removes invisible characters, citation
    markers, template remnants, navigation breadcrumbs (when `title` is
    given), footnotes, and annotations, and collapses all whitespace.
    """
    text = unicodedata.normalize("NFKC", text)
    # remove invisible characters
    text = "".join(c for c in text if c.isprintable())
    # remove templates
    text = re.sub(r"\[\d+?\]", "", text)
    text = re.sub(r"\[要.+?\]", "", text)
    text = re.sub(r"\{\{+[^{}]+?\}\}+", "", text)
    # remove navigation
    if title is not None:
        text = re.sub(r"^.+? \> " + re.escape(title), "", text)
    # remove footnotes
    text = re.sub(r" \^ .+", "", text)
    # remove annotations
    text = re.sub(r"\[(要出典|リンク切れ|.+?\?)\]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def main(args):
    """Convert a gzip'd JSONL Wikipedia dump into a one-sentence-per-line
    gzip'd corpus, with blank lines separating pages."""
    # FIX: deferred third-party import — the module-level helpers above are
    # now importable without tqdm installed.
    from tqdm import tqdm

    sent_splitter = MeCabSentenceSplitter(args.mecab_option)

    with gzip.open(args.input_file, "rt") as f, gzip.open(args.output_file, "wt") as fo:
        for line in tqdm(f):
            item = json.loads(line)
            # Elasticsearch-style dumps interleave {"index": ...} metadata rows.
            if "index" in item:
                continue

            title = item["title"]
            text = item["text"]
            text = preprocess_text(text, title=title)
            is_processed = False
            for sentence in sent_splitter(text):
                sentence = sentence.strip()
                if len(sentence) < args.min_sentence_length:
                    continue
                if len(sentence) > args.max_sentence_length:
                    continue
                if not filter_text(sentence):
                    continue

                # preprocess_text collapsed all whitespace, so this must hold.
                assert "\n" not in text
                assert sentence != ""
                print(sentence, file=fo)
                is_processed = True

            if is_processed:
                # insert a newline for separating pages
                print("", file=fo)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--mecab_option", type=str)
    parser.add_argument("--min_sentence_length", type=int, default=10)
    parser.add_argument("--max_sentence_length", type=int, default=1000)
    args = parser.parse_args()
    main(args)
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "2023-05-19 10:03:53.353302: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "import torch\n", "from transformers import AutoModelForMaskedLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [] }, "outputs": [], "source": [ "model_name_or_path = \"cl-tohoku/bert-base-japanese-v3\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "tags": [] }, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [] }, "outputs": [], "source": [ "input_ids = tokenizer.encode(f\"青葉山で{tokenizer.mask_token}の研究をしています。\", return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 2, 22033, 1872, 457, 4, 464, 12605, 500, 441, 456,\n", " 422, 12995, 385, 3]])\n" ] } ], "source": [ "print(input_ids)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['[CLS]', '青葉', '山', 'で', '[MASK]', 'の', '研究', 'を', 'し', 'て', 'い', 'ます', '。', '[SEP]']\n" ] } ], "source": [ "print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n" ] } ], "source": [ "masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].tolist()\n", "print(masked_index)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CLS] 青葉 山 で 雪 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 山 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 花 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 植物 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 星 の 研究 を し て い ます 。 [SEP]\n" ] } ], "source": [ "result = model(input_ids)\n", "pred_ids = result[0][:, masked_index].topk(5).indices.tolist()[0]\n", "for pred_id in 
pred_ids:\n", " output_ids = input_ids.tolist()[0]\n", " output_ids[masked_index] = pred_id\n", " print(tokenizer.decode(output_ids))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "file_extension": ".py", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "mimetype": "text/x-python", "name": "python", "npconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3 }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: merge_split_corpora.py ================================================ # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse import gzip import logging import lzma import os import random from tqdm import tqdm logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) random.seed(0) def _open_file(filename): if filename.endswith(".xz"): return lzma.open(filename, "rt") elif filename.endswith(".gz"): return gzip.open(filename, "rt") else: return open(filename) def main(args): output_files = [] for i in range(1, args.num_files + 1): output_path = os.path.join(args.output_dir, f"corpus_{i:02d}.txt") output_file = open(output_path, "w") output_files.append(output_file) output_index = random.randint(1, args.num_files) for input_path in args.input_files: logger.info("Processing %s", input_path) with _open_file(input_path) as f: for line in tqdm(f): line = " ".join(line.strip().split()) print(line, file=output_files[output_index]) if line == "": output_index = random.randrange(args.num_files) for output_file in output_files: output_file.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--input_files", type=str, nargs="+", required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--num_files", type=int, required=True) args = parser.parse_args() main(args) ================================================ FILE: model_configs/bert_base_character/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "max_position_embeddings": 512, "num_attention_heads": 12, "num_hidden_layers": 12, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: model_configs/bert_base_wordpiece/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, 
"intermediate_size": 3072, "max_position_embeddings": 512, "num_attention_heads": 12, "num_hidden_layers": 12, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: model_configs/bert_large_character/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "max_position_embeddings": 512, "num_attention_heads": 16, "num_hidden_layers": 24, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: model_configs/bert_large_wordpiece/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "max_position_embeddings": 512, "num_attention_heads": 16, "num_hidden_layers": 24, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: requirements.txt ================================================ flax==0.6.10 fugashi==1.2.1 ipadic==1.0.0 tensorflow==2.11.1 tqdm==4.64.1 tokenizers==0.13.2 torch==1.13.1 transformers==4.30.0 unidic==1.1.0 unidic_lite==1.0.8 ================================================ FILE: tokenization.py ================================================ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
import unicodedata

from transformers.models.bert_japanese.tokenization_bert_japanese import (
    BertJapaneseTokenizer as BertJapaneseTokenizerBase,
    CharacterTokenizer as CharacterTokenizerBase,
)


class BertJapaneseTokenizer(BertJapaneseTokenizerBase):
    """HF BertJapaneseTokenizer extended to support vocab files whose
    entries are stored WITHOUT the "##" subword-continuation prefix.

    When `vocab_has_no_subword_prefix` is True, id lookup strips a leading
    "##" from tokens before consulting the vocab.
    """

    def __init__(
        self,
        vocab_file,
        spm_file=None,
        do_lower_case=False,
        do_word_tokenize=True,
        do_subword_tokenize=True,
        word_tokenizer_type="basic",
        subword_tokenizer_type="wordpiece",
        vocab_has_no_subword_prefix=False,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        mecab_kwargs=None,
        sudachi_kwargs=None,
        jumanpp_kwargs=None,
        **kwargs,
    ):
        # All arguments except vocab_has_no_subword_prefix are forwarded
        # unchanged to the upstream transformers implementation.
        super().__init__(
            vocab_file,
            spm_file=spm_file,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            mecab_kwargs=mecab_kwargs,
            sudachi_kwargs=sudachi_kwargs,
            jumanpp_kwargs=jumanpp_kwargs,
            **kwargs,
        )
        self.vocab_has_no_subword_prefix = vocab_has_no_subword_prefix
        # Replace the upstream character tokenizer with the local subclass
        # below, which checks vocab membership on the bare (unprefixed) char.
        if do_subword_tokenize and subword_tokenizer_type == "character":
            self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def _convert_token_to_id(self, token):
        # Vocab stored without "##": strip the prefix before lookup so
        # continuation subwords still resolve; unknown tokens map to [UNK].
        if self.vocab_has_no_subword_prefix and token.startswith("##"):
            token = token[len("##"):]
        return self.vocab.get(token, self.vocab.get(self.unk_token))


class CharacterTokenizer(CharacterTokenizerBase):
    """Character tokenizer that emits "##"-prefixed tokens for non-initial
    characters but checks vocab membership on the bare character.

    This pairs with BertJapaneseTokenizer._convert_token_to_id above when
    the vocab omits "##" prefixes.
    """

    def __init__(self, vocab, unk_token, normalize_text=True):
        super().__init__(vocab, unk_token, normalize_text=normalize_text)

    def tokenize(self, text):
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for i, char in enumerate(text):
            # Membership is tested on the raw character — the vocab is
            # assumed to contain unprefixed characters only.
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            # Mark every non-initial character as a continuation subword.
            if i > 0:
                char = "##" + char

            output_tokens.append(char)

        return output_tokens
import argparse
import logging
import os

from japanese_tokenizers.implementations import JapaneseWordPieceTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main(args):
    """Train a Japanese WordPiece tokenizer and save its vocab to output_dir."""
    tokenizer = JapaneseWordPieceTokenizer(
        num_unused_tokens=args.num_unused_tokens,
        pre_tokenizer_type=args.pre_tokenizer_type,
        mecab_dic_type=args.mecab_dic_type,
    )
    # FIX: variable was misspelled "speical_tokens" (local name only).
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += [f"[unused{i}]" for i in range(args.num_unused_tokens)]

    if args.initial_alphabet_file is not None:
        logger.info("Loading the initial alphabet from file")
        # FIX: close the alphabet file deterministically (was a bare open()
        # left to the garbage collector).
        with open(args.initial_alphabet_file) as f:
            initial_alphabet = [line.rstrip("\n") for line in f]
        logger.info("The size of the initial alphabet: %d", len(initial_alphabet))
    else:
        initial_alphabet = []

    logger.info("Training the tokenizer")
    tokenizer.train(
        args.input_files,
        vocab_size=args.vocab_size,
        limit_alphabet=args.limit_alphabet,
        initial_alphabet=initial_alphabet,
        special_tokens=special_tokens,
        wordpieces_prefix=args.wordpieces_prefix,
    )

    logger.info("Saving the tokenizer to files")
    os.makedirs(args.output_dir, exist_ok=True)
    tokenizer.save_model(args.output_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_files", type=str, nargs="+", required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--pre_tokenizer_type", choices=("mecab", "whitespace"), required=True)
    parser.add_argument("--mecab_dic_type", choices=("unidic_lite", "unidic", "ipadic"), default="unidic_lite")
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--limit_alphabet", type=int, default=1000)
    parser.add_argument("--initial_alphabet_file", type=str)
    parser.add_argument("--num_unused_tokens", type=int, default=10)
    parser.add_argument("--wordpieces_prefix", type=str, default="##")
    args = parser.parse_args()
    main(args)