Repository: cl-tohoku/bert-japanese Branch: main Commit: e4c8b003abd0 Files: 32 Total size: 96.2 KB Directory structure: gitextract_lbfgru6r/ ├── .gitignore ├── LICENSE ├── README.md ├── configs/ │ ├── data/ │ │ ├── cc-100.yaml │ │ └── wikipedia.yaml │ └── model/ │ ├── bert_base_character.yaml │ ├── bert_base_wordpiece.yaml │ ├── bert_large_character.yaml │ └── bert_large_wordpiece.yaml ├── convert_tf2_ckpt_for_all_frameworks.py ├── create_pretraining_data.py ├── hf_model_configs/ │ ├── bert_base_character/ │ │ ├── config.json │ │ └── tokenizer_config.json │ ├── bert_base_wordpiece/ │ │ ├── config.json │ │ └── tokenizer_config.json │ ├── bert_large_character/ │ │ ├── config.json │ │ └── tokenizer_config.json │ └── bert_large_wordpiece/ │ ├── config.json │ └── tokenizer_config.json ├── japanese_tokenizers/ │ ├── implementations.py │ └── pre_tokenizers.py ├── make_alphabet_from_unidic.py ├── make_corpus_wiki.py ├── masked_lm_example.ipynb ├── merge_split_corpora.py ├── model_configs/ │ ├── bert_base_character/ │ │ └── config.json │ ├── bert_base_wordpiece/ │ │ └── config.json │ ├── bert_large_character/ │ │ └── config.json │ └── bert_large_wordpiece/ │ └── config.json ├── requirements.txt ├── tokenization.py └── train_tokenizer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Pretrained Japanese BERT models This is a repository of pretrained Japanese BERT models. The models are available in [Transformers](https://github.com/huggingface/transformers) by Hugging Face. - Model hub: https://huggingface.co/tohoku-nlp This version of README contains information for the following models: - [`tohoku-nlp/bert-base-japanese-v3`](https://huggingface.co/tohoku-nlp/bert-base-japanese-v3) - [`tohoku-nlp/bert-base-japanese-char-v3`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v3) - [`tohoku-nlp/bert-large-japanese-v2`](https://huggingface.co/tohoku-nlp/bert-large-japanese-v2) - [`tohoku-nlp/bert-large-japanese-char-v2`](https://huggingface.co/tohoku-nlp/bert-large-japanese-char-v2) For information and codes for the following models, refer to the [v2.0](https://github.com/cl-tohoku/bert-japanese/tree/v2.0) tag of this repository: - [`tohoku-nlp/bert-base-japanese-v2`](https://huggingface.co/tohoku-nlp/bert-base-japanese-v2) - [`tohoku-nlp/bert-base-japanese-char-v2`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-v2) - [`tohoku-nlp/bert-large-japanese`](https://huggingface.co/tohoku-nlp/bert-large-japanese) - [`tohoku-nlp/bert-large-japanese-char`](https://huggingface.co/tohoku-nlp/bert-large-japanese-char) For information and codes for the following models, refer 
to the [v1.0](https://github.com/cl-tohoku/bert-japanese/tree/v1.0) tag of this repository: - [`tohoku-nlp/bert-base-japanese`](https://huggingface.co/tohoku-nlp/bert-base-japanese) - [`tohoku-nlp/bert-base-japanese-whole-word-masking`](https://huggingface.co/tohoku-nlp/bert-base-japanese-whole-word-masking) - [`tohoku-nlp/bert-base-japanese-char`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char) - [`tohoku-nlp/bert-base-japanese-char-whole-word-masking`](https://huggingface.co/tohoku-nlp/bert-base-japanese-char-whole-word-masking) ## Model Architecture The architecture of our models are the same as the original BERT models proposed by Google. - **BERT-base** models consist of 12 layers, 768 dimensions of hidden states, and 12 attention heads. - **BERT-large** models consist of 24 layers, 1024 dimensions of hidden states, and 16 attention heads. ## Training Data The models are trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia. For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023. The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively. For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7). 
### Generating corpus files ```sh # For CC-100 $ mkdir -p $WORK_DIR/corpus/cc-100 $ python merge_split_corpora.py \ --input_files $DATA_DIR/cc-100/ja.txt.xz \ --output_dir $WORK_DIR/corpus/cc-100 \ --num_files 64 # For Wikipedia $ mkdir -p $WORK_DIR/corpus/wikipedia $ python make_corpus_wiki.py \ --input_file $DATA_DIR/wikipedia/cirrussearch/20230102/jawiki-20230102-cirrussearch-content.json.gz \ --output_file $WORK_DIR/corpus/wikipedia/corpus.txt.gz \ --min_sentence_length 10 \ --max_sentence_length 200 \ --mecab_option '-r <path to mecabrc> -d <path to mecab-ipadic-neologd dictionary>' $ python merge_split_corpora.py \ --input_files $WORK_DIR/corpus/wikipedia/corpus.txt.gz \ --output_dir $WORK_DIR/corpus/wikipedia \ --num_files 8 # Sample 10M sentences for training tokenizers $ cat $WORK_DIR/corpus/wikipedia/corpus_*.txt|grep -a -v '^$'|shuf|head -n 10000000 > $WORK_DIR/corpus/wikipedia/corpus_sampled.txt ``` ## Tokenization For each of BERT-base and BERT-large, we provide two models with different tokenization methods. - For **`wordpiece`** models, the texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm. The vocabulary size is 32768. - For **`character`** models, the texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into characters. The vocabulary size is 7027, which covers all the characters present in Unidic 2.1.2 dictionary. We used [unidic-lite](https://github.com/polm/unidic-lite) dictionary for tokenization. 
### Generating a set of characters ```sh $ mkdir -p $WORK_DIR/tokenizers/alphabet $ python make_alphabet_from_unidic.py \ --lex_file $DATA_DIR/unidic-mecab-2.1.2_src/lex.csv \ --output_file $WORK_DIR/tokenizers/alphabet/unidic_lite.txt ``` ### Training tokenizers ```sh # WordPiece $ python train_tokenizer.py \ --input_files $WORK_DIR/corpus/wikipedia/corpus_sampled.txt \ --output_dir $WORK_DIR/tokenizers/wordpiece_unidic_lite \ --pre_tokenizer_type mecab \ --mecab_dic_type unidic_lite \ --vocab_size 32768 \ --limit_alphabet 7012 \ --initial_alphabet_file $WORK_DIR/tokenizers/alphabet/unidic_lite.txt \ --num_unused_tokens 10 \ --wordpieces_prefix '##' # Character $ mkdir $WORK_DIR/tokenizers/character_unidic_lite $ head -n 7027 $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt > $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt ``` ### Generating pretraining data ```sh # WordPiece on CC-100 # Each process takes about 2h50m and 60GB RAM, producing 15.2M instances $ mkdir -p $WORK_DIR/pretraining_data/wordpiece_unidic_lite/cc-100 $ seq -f %02g 1 64|xargs -L 1 -I {} -P 2 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/cc-100/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/wordpiece_unidic_lite/cc-100/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type wordpiece \ --mecab_dic_type unidic_lite \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 128 \ --max_predictions_per_seq 19 \ --masked_lm_prob 0.15 \ --dupe_factor 5 # WordPiece on Wikipedia # Each process takes about 7h30m and 138GB RAM, producing 18.4M instances $ mkdir -p $WORK_DIR/pretraining_data/wordpiece_unidic_lite/wikipedia $ seq -f %02g 1 8|xargs -L 1 -I {} -P 1 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/wikipedia/corpus_{}.txt \ --output_file 
$WORK_DIR/pretraining_data/wordpiece_unidic_lite/wikipedia/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type wordpiece \ --mecab_dic_type unidic_lite \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 512 \ --max_predictions_per_seq 76 \ --masked_lm_prob 0.15 \ --dupe_factor 30 # Character on CC-100 # Each process takes about 3h30m and 82GB RAM, producing 18.4M instances $ mkdir -p $WORK_DIR/pretraining_data/character_unidic_lite/cc-100 $ seq -f %02g 1 64|xargs -L 1 -I {} -P 2 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/cc-100/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/character_unidic_lite/cc-100/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type character \ --mecab_dic_type unidic_lite \ --vocab_has_no_subword_prefix \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 128 \ --max_predictions_per_seq 19 \ --masked_lm_prob 0.15 \ --dupe_factor 5 # Character on Wikipedia # Each process takes about 10h30m and 205GB RAM, producing 23.7M instances $ mkdir -p $WORK_DIR/pretraining_data/character_unidic_lite/wikipedia $ seq -f %02g 1 8|xargs -L 1 -I {} -P 1 \ python create_pretraining_data.py \ --input_file $WORK_DIR/corpus/wikipedia/corpus_{}.txt \ --output_file $WORK_DIR/pretraining_data/character_unidic_lite/wikipedia/pretraining_data_{}.tfrecord.gz \ --vocab_file $WORK_DIR/tokenizers/character_unidic_lite/vocab.txt \ --word_tokenizer_type mecab \ --subword_tokenizer_type character \ --mecab_dic_type unidic_lite \ --vocab_has_no_subword_prefix \ --do_whole_word_mask \ --gzip_compress \ --use_v2_feature_names \ --max_seq_length 512 \ --max_predictions_per_seq 76 \ --masked_lm_prob 0.15 \ --dupe_factor 30 ``` ## Training We trained the models first on the 
CC-100 corpus and then on the Wikipedia corpus. Generally speaking, the texts of Wikipedia are much cleaner than those of CC-100, but the amount of text is much smaller. We expect that our two-stage training scheme lets the model be trained on a large amount of text while preserving the quality of language that the model eventually learns. For training of the MLM (masked language modeling) objective, we introduced **whole word masking** in which all subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. To conduct training of each model, we used a v3-8 instance of Cloud TPUs provided by [TensorFlow Research Cloud program](https://www.tensorflow.org/tfrc/). The training took about 16 and 56 days for BERT-base and BERT-large models, respectively. ### Creating a TPU VM and connecting to it **Note:** We set the runtime version of the TPU as `2.11.0`, where TensorFlow v2.11 is used. It is important to specify the same version if you wish to reuse our codes, otherwise it may not work properly. Here we use [Google Cloud CLI](https://cloud.google.com/cli). ```sh $ gcloud compute tpus tpu-vm create <tpu-name> --zone=<zone> --accelerator-type=v3-8 --version=tpu-vm-tf-2.11.0 $ gcloud compute tpus tpu-vm ssh <tpu-name> --zone=<zone> ``` ### Training of the models The following commands are executed in the TPU VM. It is recommended that you run the commands in a Tmux session. **Note:** All the necessary files (i.e., pretraining data and config files) need to be stored in a Google Cloud Storage (GCS) bucket in advance. 
#### BERT-base, WordPiece ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/wordpiece_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/wordpiece_unidic_lite" # Start training on CC-100 # It will take 6 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_base_wordpiece.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 10 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_base_wordpiece.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_base/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-base, Character ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/character_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/character_unidic_lite" # Start training on CC-100 # It will take 6 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ 
--experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_base_character.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 10 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_base/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_base_character.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_base/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-large, WordPiece ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/wordpiece_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/wordpiece_unidic_lite" # Start training on CC-100 # It will take 23 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_large_wordpiece.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 33 days to finish on 
a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_large_wordpiece.yaml \ --params_override="task.init_checkpoint=$MODEL_DIR/bert_large/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` #### BERT-large, Character ```sh (vm)$ cd /usr/share/tpu/models/ (vm)$ pip3 install -r official/requirements.txt (vm)$ export PYTHONPATH=/usr/share/tpu/models (vm)$ CONFIG_DIR="gs:///bert-japanese/configs" (vm)$ DATA_DIR="gs:///bert-japanese/pretraining_data/character_unidic_lite" (vm)$ MODEL_DIR="gs:///bert-japanese/model/character_unidic_lite" # Start training on CC-100 # It will take 23 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100 \ --config_file=$CONFIG_DIR/data/cc-100.yaml \ --config_file=$CONFIG_DIR/model/bert_large_character.yaml \ --params_override="task.train_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/cc-100/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" # Continue training on Wikipedia # It will take 33 days to finish on a v3-8 TPU (vm)$ python3 official/nlp/train.py \ --tpu=local \ --experiment=bert/pretraining \ --mode=train_and_eval \ --model_dir=$MODEL_DIR/bert_large/training/cc-100_wikipedia \ --config_file=$CONFIG_DIR/data/wikipedia.yaml \ --config_file=$CONFIG_DIR/model/bert_large_character.yaml \ 
--params_override="task.init_checkpoint=$MODEL_DIR/bert_large/training/cc-100,task.train_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,task.validation_data.input_path=$DATA_DIR/wikipedia/pretraining_data_*.tfrecord,runtime.distribution_strategy=tpu" ``` ### Deleting a TPU VM ```sh $ gcloud compute tpus tpu-vm delete --zone= ``` ## Model Conversion You can convert the TensorFlow model checkpoint to a PyTorch model file. **Note:** The model conversion script is designed for the models trained with TensorFlow v2.11.0. The script may not work for models trained with a different version of TensorFlow. ```sh # For BERT-base, WordPiece $ VOCAB_FILE=$WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_base_wordpiece/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_base_wordpiece $ TF_CKPT_PATH=$WORK_DIR/model/wordpiece_unidic_lite/bert_base/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-base-japanese-v3 # For BERT-base, Character $ VOCAB_FILE=$WORK_DIR/tokenizers/character_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_base_character/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_base_character $ TF_CKPT_PATH=$WORK_DIR/model/character_unidic_lite/bert_base/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-base-japanese-char-v3 # For BERT-large, WordPiece $ VOCAB_FILE=$WORK_DIR/tokenizers/wordpiece_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_large_wordpiece/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_large_wordpiece $ TF_CKPT_PATH=$WORK_DIR/model/wordpiece_unidic_lite/bert_large/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-large-japanese-v2 # For BERT-large, Character $ VOCAB_FILE=$WORK_DIR/tokenizers/character_unidic_lite/vocab.txt $ TF_CONFIG_FILE=model_configs/bert_large_character/config.json $ HF_CONFIG_DIR=hf_model_configs/bert_large_character $ 
TF_CKPT_PATH=$WORK_DIR/model/character_unidic_lite/bert_large/training/cc-100_wikipedia/ckpt-1000000 $ OUTPUT_DIR=$WORK_DIR/hf_model/bert-large-japanese-char-v2 # Run the model conversion script $ mkdir -p $OUTPUT_DIR $ python convert_tf2_checkpoint_to_all_frameworks.py \ --tf_checkpoint_path $TF_CKPT_PATH \ --tf_config_file $TF_CONFIG_FILE \ --output_path $OUTPUT_DIR $ cp $HF_CONFIG_DIR/* $OUTPUT_DIR $ cp $VOCAB_FILE $OUTPUT_DIR ``` ## Model Performances We evaluated our models' performances on the [JGLUE](https://github.com/yahoojapan/JGLUE) benchmark tasks. For each task, the model is fine-tuned on the training set and evaluated on the development set (the test sets are not publicly available as of this writing.) The hyperparameters were searched within the same set of values as the ones specified in the [JGLUE fine-tuning README](https://github.com/yahoojapan/JGLUE/tree/main/fine-tuning). The results of our (informal) experiments are below. **Note:** These results should be viewed as informative only, since each setting was experimented with only one fixed random seed. | Model | MARC-ja | JSTS | JNLI | JSQuAD | JCommonsenseQA | | :------------------------------------- | :-----: | :----------------: | :---: | :-----------: | :------------: | | | Acc. | Pearson / Spearman | Acc. | EM / F1 | Acc. 
| | `bert-base-japanese-v2` | 0.958 | 0.910 / 0.871 | 0.901 | 0.869 / 0.939 | 0.803 | | `bert-base-japanese-v3` **New!** | 0.962 | 0.919 / 0.881 | 0.907 | 0.880 / 0.946 | 0.848 | | `bert-base-japanese-char-v2` | 0.957 | 0.891 / 0.851 | 0.896 | 0.870 / 0.938 | 0.724 | | `bert-base-japanese-char-v3` **New!** | 0.959 | 0.914 / 0.875 | 0.903 | 0.871 / 0.939 | 0.786 | | `bert-large-japanese` | 0.958 | 0.913 / 0.874 | 0.902 | 0.881 / 0.946 | 0.823 | | `bert-large-japanese-v2` **New!** | 0.960 | 0.926 / 0.893 | 0.929 | 0.893 / 0.956 | 0.893 | | `bert-large-japanese-char` | 0.958 | 0.883 / 0.842 | 0.899 | 0.870 / 0.938 | 0.753 | | `bert-large-japanese-char-v2` **New!** | 0.961 | 0.921 / 0.884 | 0.910 | 0.892 / 0.952 | 0.859 | ## Licenses The pretrained models and the codes in this repository are distributed under the Apache License 2.0. ## Related Work - Original BERT models and codes by Google Research Team - https://github.com/google-research/bert (for TensorFlow v1) - https://github.com/tensorflow/models/tree/master/official/nlp (for TensorFlow v2) ## Acknowledgments The distributed models are trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program. 
================================================ FILE: configs/data/cc-100.yaml ================================================ task: init_checkpoint: '' train_data: drop_remainder: true global_batch_size: 2048 is_training: true max_predictions_per_seq: 19 seq_length: 128 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true validation_data: drop_remainder: false global_batch_size: 2048 is_training: false max_predictions_per_seq: 19 seq_length: 128 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true ================================================ FILE: configs/data/wikipedia.yaml ================================================ task: init_checkpoint: '' train_data: drop_remainder: true global_batch_size: 512 is_training: true max_predictions_per_seq: 76 seq_length: 512 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true validation_data: drop_remainder: false global_batch_size: 512 is_training: false max_predictions_per_seq: 76 seq_length: 512 use_next_sentence_label: true use_position_id: false use_v2_feature_names: true ================================================ FILE: configs/model/bert_base_character.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 768 initializer_range: 0.02 intermediate_size: 3072 max_position_embeddings: 512 num_attention_heads: 12 num_layers: 12 type_vocab_size: 2 vocab_size: 7027 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.0001 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 
summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: configs/model/bert_base_wordpiece.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 768 initializer_range: 0.02 intermediate_size: 3072 max_position_embeddings: 512 num_attention_heads: 12 num_layers: 12 type_vocab_size: 2 vocab_size: 32768 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.0001 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: configs/model/bert_large_character.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 1024 initializer_range: 0.02 intermediate_size: 4096 max_position_embeddings: 512 num_attention_heads: 16 num_layers: 24 type_vocab_size: 2 vocab_size: 7027 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.00005 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 
validation_steps: 64 ================================================ FILE: configs/model/bert_large_wordpiece.yaml ================================================ task: model: cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}] encoder: type: bert bert: attention_dropout_rate: 0.1 dropout_rate: 0.1 hidden_activation: gelu hidden_size: 1024 initializer_range: 0.02 intermediate_size: 4096 max_position_embeddings: 512 num_attention_heads: 16 num_layers: 24 type_vocab_size: 2 vocab_size: 32768 trainer: checkpoint_interval: 20000 max_to_keep: 5 optimizer_config: learning_rate: polynomial: cycle: false decay_steps: 1000000 end_learning_rate: 0.0 initial_learning_rate: 0.00005 power: 1.0 type: polynomial optimizer: type: adamw warmup: polynomial: power: 1 warmup_steps: 10000 type: polynomial steps_per_loop: 1000 summary_interval: 1000 train_steps: 1000000 validation_interval: 1000 validation_steps: 64 ================================================ FILE: convert_tf2_ckpt_for_all_frameworks.py ================================================ # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse
import logging
import os
import re

import tensorflow as tf
import torch
from transformers import BertConfig, BertForPreTraining, FlaxBertForPreTraining, TFBertForPreTraining

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_tf2_weights_in_bert(model, tf_checkpoint_path, config):
    """Copy weights from a TF2 (tensorflow-models) checkpoint into ``model`` in place.

    Args:
        model: A ``transformers.BertForPreTraining`` instance to fill in.
        tf_checkpoint_path: Path prefix of the TensorFlow 2.x checkpoint.
        config: The ``BertConfig`` describing the model architecture.

    Returns:
        The same ``model`` instance with its parameters overwritten.

    Raises:
        ValueError: If a checkpoint tensor's shape does not match the shape of
            the corresponding PyTorch parameter.
    """
    tf_checkpoint_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from %s", tf_checkpoint_path)

    for full_name, shape in tf.train.list_variables(tf_checkpoint_path):
        pointer = model
        trace = []  # dotted path of the target PyTorch parameter, for logging

        if len(shape) == 0:
            logger.info("Skipping non-tensor variable: %s", full_name)
            continue
        if "optimizer/" in full_name:
            logger.info("Skipping optimizer weights: %s", full_name)
            continue

        split_name = full_name.split("/")
        name = split_name.pop(0)

        if name == "encoder":
            pointer = getattr(pointer, "bert")
            trace.append("bert")
            name = split_name.pop(0)
            if name.startswith("layer_with_weights"):
                layer_num = int(name.split("-")[-1])
                # layer 0 would be the word embedding, which is not saved as an
                # encoder layer with tensorflow-models 2.10.0; it is recovered
                # from the masked-LM embedding table at the bottom of this loop.
                if layer_num == 1:  # position_embedding
                    trace.extend(["embeddings", "position_embeddings"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "position_embeddings")
                elif layer_num == 2:  # type_embeddings
                    trace.extend(["embeddings", "token_type_embeddings"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "token_type_embeddings")
                elif layer_num == 3:  # embeddings/layer_norm
                    trace.extend(["embeddings", "LayerNorm"])
                    pointer = getattr(pointer, "embeddings")
                    pointer = getattr(pointer, "LayerNorm")
                elif layer_num >= 4 and layer_num < config.num_hidden_layers + 4:
                    # transformer/layer_x (offset by the 4 embedding-side layers)
                    trace.extend(["encoder", "layer", str(layer_num - 4)])
                    pointer = getattr(pointer, "encoder")
                    pointer = getattr(pointer, "layer")
                    pointer = pointer[layer_num - 4]
                elif layer_num == config.num_hidden_layers + 4:
                    # pooler_transform (not trained with tensorflow-models 2.10.0)
                    continue
                else:
                    logger.warning("Skipping unknown weight name: %s", full_name)
                    continue
        elif name == "masked_lm":
            trace.extend(["cls", "predictions"])
            pointer = getattr(pointer, "cls")
            pointer = getattr(pointer, "predictions")
            name = split_name.pop(0)
            if name == "dense":
                trace.extend(["transform", "dense"])
                pointer = getattr(pointer, "transform")
                pointer = getattr(pointer, "dense")
            elif name == "embedding_table":
                trace.extend(["decoder", "weight"])
                pointer = getattr(pointer, "decoder")
                pointer = getattr(pointer, "weight")
            elif name == "layer_norm":
                trace.extend(["transform", "LayerNorm"])
                pointer = getattr(pointer, "transform")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "output_bias.Sbias":
                # NOTE(review): this literal looks garbled; the checkpoint
                # component is presumably "output_bias" — confirm against a
                # real checkpoint before relying on this branch.
                trace.extend(["bias"])
                pointer = getattr(pointer, "bias")
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)
                continue
        elif name == "model":
            names = split_name[:3]
            split_name = split_name[3:]
            if names == ["classification_heads", "0", "out_proj"]:
                trace.extend(["cls", "seq_relationship"])
                pointer = getattr(pointer, "cls")
                pointer = getattr(pointer, "seq_relationship")
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)
                continue
        elif name == "next_sentence..pooler_dense":
            trace.extend(["bert", "pooler", "dense"])
            pointer = getattr(pointer, "bert")
            pointer = getattr(pointer, "pooler")
            pointer = getattr(pointer, "dense")
        else:
            logger.warning("Skipping unknown weight name: %s", full_name)
            continue

        # Walk the remaining path components down to the concrete parameter.
        for name in split_name:
            if name == "_attention_layer":  # self-attention layer
                trace.append("attention")
                pointer = getattr(pointer, "attention")
            elif name == "_attention_layer_norm":  # output attention norm
                trace.extend(["attention", "output", "LayerNorm"])
                pointer = getattr(pointer, "attention")
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "_attention_output_dense":  # output attention dense
                trace.extend(["attention", "output", "dense"])
                pointer = getattr(pointer, "attention")
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "dense")
            elif name == "_intermediate_dense":  # attention intermediate dense
                trace.extend(["intermediate", "dense"])
                pointer = getattr(pointer, "intermediate")
                pointer = getattr(pointer, "dense")
            elif name == "_output_dense":  # output dense
                trace.extend(["output", "dense"])
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "dense")
            elif name == "_output_layer_norm":  # output layer norm
                trace.extend(["output", "LayerNorm"])
                pointer = getattr(pointer, "output")
                pointer = getattr(pointer, "LayerNorm")
            elif name == "_key_dense":  # attention key
                trace.extend(["self", "key"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "key")
            elif name == "_query_dense":  # attention query
                trace.extend(["self", "query"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "query")
            elif name == "_value_dense":  # attention value
                trace.extend(["self", "value"])
                pointer = getattr(pointer, "self")
                pointer = getattr(pointer, "value")
            elif name == "dense":  # shared dense layer name
                trace.append("dense")
                pointer = getattr(pointer, "dense")
            elif name in ["bias", "beta"]:  # biases / norm betas
                trace.append("bias")
                pointer = getattr(pointer, "bias")
            elif name in ["kernel", "gamma"]:  # weights / norm gammas
                trace.append("weight")
                pointer = getattr(pointer, "weight")
            elif name == "embeddings":  # embedding weights
                trace.append("weight")
                pointer = getattr(pointer, "weight")
            elif name == ".ATTRIBUTES":
                # full variable name ends with .ATTRIBUTES/VARIABLE_VALUE
                break
            else:
                logger.warning("Skipping unknown weight name: %s", full_name)

        logger.info("Loading TF weight %s with shape %s", full_name, shape)
        array = tf.train.load_variable(tf_checkpoint_path, full_name)

        # For certain attention layers a reshape to the PyTorch parameter
        # layout is necessary before copying.
        trace = ".".join(trace)
        if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or \
                re.match(r"(\S+)\.attention\.output\.dense\.weight", trace):
            array = array.reshape(pointer.data.shape)
        if "kernel" in full_name:
            array = array.transpose()

        if pointer.shape == array.shape:
            pointer.data = torch.from_numpy(array)
        else:
            raise ValueError(
                f"Shape mismatch in layer {full_name}: "
                f"Model expects shape {pointer.shape} but layer contains shape: {array.shape}"
            )
        logger.info("Successfully set variable %s to PyTorch layer %s", full_name, trace)

        if full_name == "masked_lm/embedding_table/.ATTRIBUTES/VARIABLE_VALUE":
            # The masked-LM embedding table doubles as the word embedding
            # matrix, so copy the same tensor into the input embeddings too.
            word_embeddings_pointer = model.bert.embeddings.word_embeddings.weight
            word_embeddings_trace = "bert.embeddings.word_embeddings.weight"
            if word_embeddings_pointer.shape == array.shape:
                word_embeddings_pointer.data = torch.from_numpy(array)
            else:
                raise ValueError(
                    f"Shape mismatch in layer {full_name}: "
                    f"Model expects shape {word_embeddings_pointer.shape} but layer contains shape: {array.shape}"
                )
            logger.info("Successfully set variable %s to PyTorch layer %s", full_name, word_embeddings_trace)

    return model


def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, tf_config_path, output_path):
    """Convert a TF2 BERT checkpoint and save it in PyTorch, TF, and Flax formats.

    Args:
        tf_checkpoint_path: Path prefix of the TensorFlow 2.x checkpoint.
        tf_config_path: Path to the JSON config describing the architecture.
        output_path: Directory where all three model formats are written.
    """
    # Initialize a PyTorch model with the requested architecture.
    logger.info("Loading model based on config from %s...", tf_config_path)
    config = BertConfig.from_json_file(tf_config_path)
    model = BertForPreTraining(config)

    # Copy the checkpoint weights into the PyTorch model.
    logger.info("Loading weights from checkpoint %s...", tf_checkpoint_path)
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Save the model in PyTorch format.
    logger.info("Saving PyTorch model to %s...", output_path)
    model.save_pretrained(output_path)

    # Round-trip through the saved PyTorch weights for the TensorFlow format.
    logger.info("Reloading the saved model in TensorFlow format and saving to %s...", output_path)
    tf_model = TFBertForPreTraining.from_pretrained(output_path, from_pt=True)
    tf_model.save_pretrained(output_path)

    # And again for the JAX/Flax format.
    logger.info("Reloading the saved model in JAX/Flax format and saving to %s...", output_path)
    flax_model = FlaxBertForPreTraining.from_pretrained(output_path, from_pt=True)
    flax_model.save_pretrained(output_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tf_checkpoint_path",
        type=str,
        required=True,
        help="Path to the TensorFlow 2.x checkpoint path."
    )
    parser.add_argument(
        "--tf_config_file",
        type=str,
        required=True,
        help="The config json file corresponding to the BERT model. This specifies the model architecture.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to the output PyTorch model (must include filename).",
    )
    args = parser.parse_args()
    convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.tf_config_file, args.output_path)
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
import collections
import random

# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf

from tokenization import BertJapaneseTokenizer

FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
flags.DEFINE_string("word_tokenizer_type", None,
                    "Word tokenizer type (basic, mecab).")
flags.DEFINE_string("subword_tokenizer_type", None,
                    "Tokenizer type (wordpiece, character).")
flags.DEFINE_string("mecab_dic_type", None,
                    "Dictionary type for MecabTokenizer.")
flags.DEFINE_bool("vocab_has_no_subword_prefix", False,
                  "Whether the vocabulary contains no subword prefix.")
flags.DEFINE_bool(
    "do_whole_word_mask", False,
    "Whether to use whole word masking rather than per-WordPiece masking.")
flags.DEFINE_bool(
    "gzip_compress", False,
    "Whether to use `GZIP` compress option to get compressed TFRecord files.")
flags.DEFINE_bool(
    "use_v2_feature_names", False,
    "Whether to use the feature names consistent with the models.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20,
                     "Maximum number of masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345,
                     "Random seed for data generation.")
flags.DEFINE_integer(
    "dupe_factor", 10,
    "Number of times to duplicate the input data (with different masks).")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float(
    "short_seq_prob", 0.1,
    "Probability of creating sequences which are shorter than the "
    "maximum length.")


class TrainingInstance(object):
  """A single training instance (sentence pair)."""

  def __init__(self, tokens, segment_ids, masked_lm_positions,
               masked_lm_labels, is_random_next):
    self.tokens = tokens
    self.segment_ids = segment_ids
    self.is_random_next = is_random_next
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels

  def __str__(self):
    # Human-readable dump of every field, one per line, with a trailing
    # blank line (the format the original BERT scripts log).
    parts = [
        "tokens: %s" % " ".join(self.tokens),
        "segment_ids: %s" % " ".join(str(x) for x in self.segment_ids),
        "is_random_next: %s" % self.is_random_next,
        "masked_lm_positions: %s" % " ".join(
            str(x) for x in self.masked_lm_positions),
        "masked_lm_labels: %s" % " ".join(self.masked_lm_labels),
    ]
    return "\n".join(parts) + "\n\n"

  def __repr__(self):
    return self.__str__()


def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files,
                                    gzip_compress, use_v2_feature_names):
  """Creates TF example files from `TrainingInstance`s."""
  writers = [
      tf.io.TFRecordWriter(
          output_file, options="GZIP" if gzip_compress else "")
      for output_file in output_files
  ]

  writer_index = 0
  total_written = 0
  for (inst_index, instance) in enumerate(instances):
    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(instance.segment_ids)
    assert len(input_ids) <= max_seq_length

    # Zero-pad everything up to max_seq_length.
    while len(input_ids) < max_seq_length:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    masked_lm_positions = list(instance.masked_lm_positions)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    # Pad the masked-LM slots; padded slots get weight 0.0 so they are ignored.
    while len(masked_lm_positions) < max_predictions_per_seq:
      masked_lm_positions.append(0)
      masked_lm_ids.append(0)
      masked_lm_weights.append(0.0)

    next_sentence_label = 1 if instance.is_random_next else 0

    features = collections.OrderedDict()
    if use_v2_feature_names:
      features["input_word_ids"] = create_int_feature(input_ids)
      features["input_type_ids"] = create_int_feature(segment_ids)
    else:
      features["input_ids"] = create_int_feature(input_ids)
      features["segment_ids"] = create_int_feature(segment_ids)

    features["input_mask"] = create_int_feature(input_mask)
    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
    features["next_sentence_labels"] = create_int_feature(
        [next_sentence_label])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))

    # Round-robin over the output files.
    writers[writer_index].write(tf_example.SerializeToString())
    writer_index = (writer_index + 1) % len(writers)

    total_written += 1

    # Log the first few instances for eyeballing.
    if inst_index < 20:
      logging.info("*** Example ***")
      logging.info("tokens: %s", " ".join(instance.tokens))

      for feature_name in features.keys():
        feature = features[feature_name]
        values = []
        if feature.int64_list.value:
          values = feature.int64_list.value
        elif feature.float_list.value:
          values = feature.float_list.value
        logging.info("%s: %s", feature_name,
                     " ".join([str(x) for x in values]))

  for writer in writers:
    writer.close()

  logging.info("Wrote %d total instances", total_written)


def create_int_feature(values):
  """Wraps an iterable of ints as a tf.train.Feature."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
  """Wraps an iterable of floats as a tf.train.Feature."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))


def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng,
                              do_whole_word_mask=False):
  """Create `TrainingInstance`s from raw text."""
  all_documents = [[]]

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    with tf.io.gfile.GFile(input_file, "rb") as reader:
      while True:
        line = reader.readline().decode("utf-8", "ignore")
        if not line:
          break
        line = line.strip()

        # Empty lines are used as document delimiters
        if not line:
          all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
          all_documents[-1].append(tokens)

  # Remove empty documents
  all_documents = [x for x in all_documents if x]
  rng.shuffle(all_documents)

  vocab_words = list(tokenizer.get_vocab().keys())
  instances = []
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
              do_whole_word_mask))

  rng.shuffle(instances)
  return instances


def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
    do_whole_word_mask=False):
  """Creates `TrainingInstance`s for a single document."""
  document = all_documents[document_index]

  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

  # We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

  # We DON'T just concatenate all of the tokens from a document into a long
  # sequence and choose an arbitrary split point because this would make the
  # next sentence prediction task too easy. Instead, we split the input into
  # segments "A" and "B" based on the actual "sentences" provided by the user
  # input.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is how many segments from `current_chunk` go into the `A`
        # (first) sentence.
        a_end = 1
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)

        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])

        tokens_b = []
        # Random next
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            if random_document_index != document_index:
              break

          random_document = all_documents[random_document_index]
          random_start = rng.randint(0, len(random_document) - 1)
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Assemble [CLS] A [SEP] B [SEP] with matching segment ids.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        (tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words,
             rng, do_whole_word_mask)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      current_chunk = []
      current_length = 0
    i += 1

  return instances


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng,
                                 do_whole_word_mask):
  """Creates the predictions for the masked LM objective."""
  cand_indexes = []
  for (i, token) in enumerate(tokens):
    if token == "[CLS]" or token == "[SEP]":
      continue
    # Whole Word Masking means that if we mask all of the wordpieces
    # corresponding to an original word. When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequent
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    #
    # Note that Whole Word Masking does *not* change the training code
    # at all -- we still predict each WordPiece independently, softmaxed
    # over the entire vocabulary.
    if (do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])

  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # If adding a whole-word mask would exceed the maximum number of
    # predictions, then just skip this candidate.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    is_any_index_covered = False
    for index in index_set:
      if index in covered_indexes:
        is_any_index_covered = True
        break
    if is_any_index_covered:
      continue
    for index in index_set:
      covered_indexes.add(index)

      masked_token = None
      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      else:
        # 10% of the time, keep original
        if rng.random() < 0.5:
          masked_token = tokens[index]
        # 10% of the time, replace with random word
        else:
          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  assert len(masked_lms) <= num_to_predict
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = []
  masked_lm_labels = []
  for p in masked_lms:
    masked_lm_positions.append(p.index)
    masked_lm_labels.append(p.label)

  return (output_tokens, masked_lm_positions, masked_lm_labels)


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Truncates a pair of sequences to a maximum sequence length."""
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_num_tokens:
      break

    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    assert len(trunc_tokens) >= 1

    # We want to sometimes truncate from the front and sometimes from the
    # back to add more randomness and avoid biases.
    if rng.random() < 0.5:
      del trunc_tokens[0]
    else:
      trunc_tokens.pop()


def main(_):
  tokenizer = BertJapaneseTokenizer(
      FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      word_tokenizer_type=FLAGS.word_tokenizer_type,
      subword_tokenizer_type=FLAGS.subword_tokenizer_type,
      mecab_kwargs={"mecab_dic": FLAGS.mecab_dic_type},
      vocab_has_no_subword_prefix=FLAGS.vocab_has_no_subword_prefix,
  )

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, rng, FLAGS.do_whole_word_mask)

  output_files = FLAGS.output_file.split(",")
  logging.info("*** Writing to output files ***")
  for output_file in output_files:
    logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files,
                                  FLAGS.gzip_compress,
                                  FLAGS.use_v2_feature_names)


if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  app.run(main)
hf_model_configs/bert_base_character/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_base_wordpiece/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: hf_model_configs/bert_base_wordpiece/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_large_character/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: hf_model_configs/bert_large_character/tokenizer_config.json ================================================ { "tokenizer_class": 
"BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "character", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: hf_model_configs/bert_large_wordpiece/config.json ================================================ { "architectures": [ "BertForPreTraining" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: hf_model_configs/bert_large_wordpiece/tokenizer_config.json ================================================ { "tokenizer_class": "BertJapaneseTokenizer", "model_max_length": 512, "do_lower_case": false, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "mecab_kwargs": { "mecab_dic": "unidic_lite" } } ================================================ FILE: japanese_tokenizers/implementations.py ================================================ # Copyright 2020 The HuggingFace Inc. team. # Copyright 2023 Masatoshi Suzuki (@singletongue). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from typing import Dict, Optional, Union

from tokenizers import AddedToken, normalizers, pre_tokenizers
from tokenizers.implementations import BertWordPieceTokenizer

from .pre_tokenizers import MeCabPreTokenizer


class JapaneseWordPieceTokenizer(BertWordPieceTokenizer):
    """BertWordPieceTokenizer specialized for Japanese.

    Adds NFKC+strip normalization, registers ``[unused*]`` special tokens,
    and optionally replaces the pre-tokenizer with a MeCab morphological
    splitter.

    Args:
        vocab: Path to a vocab file or a token->id mapping (None to train).
        unk_token/sep_token/cls_token/pad_token/mask_token: Special tokens.
        num_unused_tokens: How many ``[unused{i}]`` tokens to register.
        pre_tokenizer_type: "mecab" (morphological) or "whitespace".
        mecab_dic_type: MeCab dictionary name, forwarded to MeCabPreTokenizer
            (only meaningful when pre_tokenizer_type == "mecab").
        wordpieces_prefix: Subword continuation prefix (default "##").

    Raises:
        ValueError: If ``pre_tokenizer_type`` is not "mecab" or "whitespace".
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        num_unused_tokens: int = 10,
        pre_tokenizer_type: str = "mecab",
        mecab_dic_type: str = "unidic_lite",
        wordpieces_prefix: str = "##",
    ) -> None:
        super().__init__(
            vocab=vocab,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            wordpieces_prefix=wordpieces_prefix,
        )
        # Reserve [unused0]..[unusedN-1] so downstream users can repurpose them.
        self._tokenizer.add_special_tokens([f"[unused{i}]" for i in range(num_unused_tokens)])
        # NFKC unifies full-width/half-width forms, common in Japanese text.
        self._tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Strip()])

        # NOTE(review): "model" string differs from the class name
        # (BertWordPieceJapaneseTokenizer vs JapaneseWordPieceTokenizer) —
        # kept as-is since it is a serialized identifier.
        parameters = {
            "model": "BertWordPieceJapaneseTokenizer",
            "pre_tokenizer_type": pre_tokenizer_type,
            "mecab_dic_type": mecab_dic_type,
        }
        if pre_tokenizer_type == "mecab":
            self._tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(MeCabPreTokenizer(mecab_dic_type))
            # FIX: dropped a redundant re-assignment of
            # parameters["mecab_dic_type"] here — it is already set in the
            # dict literal above with the same value.
        elif pre_tokenizer_type == "whitespace":
            self._tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            raise ValueError("Invalid pre_tokenizer_type is specified.")
        self._parameters.update(parameters)
import os
from typing import List, Optional

from tokenizers import NormalizedString, PreTokenizedString


class MeCabPreTokenizer:
    """Custom pre-tokenizer that splits text into MeCab morphemes.

    Designed to be wrapped with ``pre_tokenizers.PreTokenizer.custom(...)``
    from the ``tokenizers`` library.

    Args:
        mecab_dic: One of "unidic_lite", "unidic", "ipadic", or None to use
            whatever ``mecab_option`` specifies.
        mecab_option: Extra MeCab command-line options, appended after the
            dictionary flags.

    Raises:
        ValueError: If ``mecab_dic`` is not a recognized dictionary name.
    """

    def __init__(self, mecab_dic: Optional[str] = None, mecab_option: Optional[str] = None) -> None:
        # Imported lazily so the module can be loaded without fugashi installed.
        import fugashi

        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "unidic_lite":
                import unidic_lite

                dic_dir = unidic_lite.DICDIR
            elif mecab_dic == "unidic":
                import unidic

                dic_dir = unidic.DICDIR
            elif mecab_dic == "ipadic":
                import ipadic

                dic_dir = ipadic.DICDIR
            else:
                raise ValueError("Invalid mecab_dic is specified.")

            # -d: dictionary directory, -r: resource file (mecabrc).
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f"-d {dic_dir} -r {mecabrc} " + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)

    def mecab_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Split one NormalizedString into per-morpheme sub-slices.

        Slicing the NormalizedString (rather than building new strings)
        preserves the offset mapping back to the original text.
        """
        # PERF FIX: the original called str(normalized_string) twice per token
        # inside the loop; the text is loop-invariant, so convert it once.
        text = str(normalized_string)
        splits = []
        cursor = 0
        for token in self.mecab(text):
            # Search from `cursor` so repeated surfaces map to the right span.
            start = text.index(token.surface, cursor)
            end = start + len(token.surface)
            splits.append(normalized_string[start:end])
            cursor = end
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Entry point called by the tokenizers library."""
        pretok.split(self.mecab_split)
import argparse
import csv
from unicodedata import normalize


def main(args):
    """Extract the printable single-character alphabet from a UniDic lex CSV.

    Reads the surface form (first CSV column) of every entry, NFKC-normalizes
    each character, keeps those that remain a single printable character, and
    writes the sorted, deduplicated set to the output file, one per line.
    """
    alphabet = set()
    with open(args.lex_file, newline="") as lex_file:
        for row in csv.reader(lex_file):
            # An empty first field means the surface itself was a double
            # quote, which the CSV layer stripped — restore it.
            surface = row[0] or '"'
            for raw_char in surface:
                normalized = normalize("NFKC", raw_char)
                # Drop characters that expand under NFKC or are unprintable.
                if len(normalized) == 1 and normalized.isprintable():
                    alphabet.add(normalized)

    with open(args.output_file, "w") as out:
        for char in sorted(alphabet):
            print(char, file=out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lex_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    main(parser.parse_args())
import argparse
import gzip
import json
import os
import re
import unicodedata


class MeCabSentenceSplitter:
    """Splits Japanese text into sentences using MeCab morpheme analysis.

    A sentence boundary is placed after any token whose feature string
    contains both 記号 (symbol) and 句点 (period).
    """

    def __init__(self, mecab_option=None):
        # Imported lazily so the pure-text helpers below stay usable
        # without the MeCab stack installed.
        import fugashi

        if mecab_option is None:
            import unidic_lite

            dic_dir = unidic_lite.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f"-d {dic_dir} -r {mecabrc}"

        self.mecab = fugashi.GenericTagger(mecab_option)

    def __call__(self, text):
        """Return the list of sentences found in `text` (substrings of it)."""
        sentences = []
        start = 0
        end = 0
        for line in self.mecab.parse(text).split("\n"):
            if line == "EOS":
                # Flush any trailing text that lacks a closing period.
                if len(text[start:]) > 0:
                    sentences.append(text[start:])
                break

            token, token_info = line.split("\t", maxsplit=1)
            # Track position in the original text to slice exact spans.
            end = text.index(token, end) + len(token)
            if "記号" in token_info and "句点" in token_info:
                sentences.append(text[start:end])
                start = end

        return sentences


def filter_text(text):
    """Return False for text that should be dropped from the corpus."""
    # filter out text containing equations
    if "\displaystyle" in text:
        return False

    return True


def preprocess_text(text, title=None):
    """Normalize and strip Wikipedia-extraction artifacts from `text`.

    Applies NFKC normalization, removes invisible characters, citation
    markers, template remnants, navigation breadcrumbs (when `title` is
    given), footnotes, and annotations, and collapses all whitespace.
    """
    text = unicodedata.normalize("NFKC", text)
    # remove invisible characters
    text = "".join(c for c in text if c.isprintable())
    # remove templates
    text = re.sub(r"\[\d+?\]", "", text)
    text = re.sub(r"\[要.+?\]", "", text)
    text = re.sub(r"\{\{+[^{}]+?\}\}+", "", text)
    # remove navigation
    if title is not None:
        text = re.sub(r"^.+? \> " + re.escape(title), "", text)
    # remove footnotes
    text = re.sub(r" \^ .+", "", text)
    # remove annotations
    text = re.sub(r"\[(要出典|リンク切れ|.+?\?)\]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def main(args):
    """Convert a gzip'd JSONL Wikipedia dump into a one-sentence-per-line
    gzip'd corpus, with blank lines separating pages."""
    # FIX: deferred third-party import — the module-level helpers above are
    # now importable without tqdm installed.
    from tqdm import tqdm

    sent_splitter = MeCabSentenceSplitter(args.mecab_option)

    with gzip.open(args.input_file, "rt") as f, gzip.open(args.output_file, "wt") as fo:
        for line in tqdm(f):
            item = json.loads(line)
            # Elasticsearch-style dumps interleave {"index": ...} metadata rows.
            if "index" in item:
                continue

            title = item["title"]
            text = item["text"]
            text = preprocess_text(text, title=title)
            is_processed = False
            for sentence in sent_splitter(text):
                sentence = sentence.strip()
                if len(sentence) < args.min_sentence_length:
                    continue
                if len(sentence) > args.max_sentence_length:
                    continue
                if not filter_text(sentence):
                    continue

                # preprocess_text collapsed all whitespace, so this must hold.
                assert "\n" not in text
                assert sentence != ""
                print(sentence, file=fo)
                is_processed = True

            if is_processed:
                # insert a newline for separating pages
                print("", file=fo)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--mecab_option", type=str)
    parser.add_argument("--min_sentence_length", type=int, default=10)
    parser.add_argument("--max_sentence_length", type=int, default=1000)
    args = parser.parse_args()
    main(args)
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "2023-05-19 10:03:53.353302: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "import torch\n", "from transformers import AutoModelForMaskedLM, AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [] }, "outputs": [], "source": [ "model_name_or_path = \"cl-tohoku/bert-base-japanese-v3\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "tags": [] }, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [] }, "outputs": [], "source": [ "input_ids = tokenizer.encode(f\"青葉山で{tokenizer.mask_token}の研究をしています。\", return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 2, 22033, 1872, 457, 4, 464, 12605, 500, 441, 456,\n", " 422, 12995, 385, 3]])\n" ] } ], "source": [ "print(input_ids)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['[CLS]', '青葉', '山', 'で', '[MASK]', 'の', '研究', 'を', 'し', 'て', 'い', 'ます', '。', '[SEP]']\n" ] } ], "source": [ "print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n" ] } ], "source": [ "masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].tolist()\n", "print(masked_index)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[CLS] 青葉 山 で 雪 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 山 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 花 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 植物 の 研究 を し て い ます 。 [SEP]\n", "[CLS] 青葉 山 で 星 の 研究 を し て い ます 。 [SEP]\n" ] } ], "source": [ "result = model(input_ids)\n", "pred_ids = result[0][:, masked_index].topk(5).indices.tolist()[0]\n", "for pred_id in 
pred_ids:\n", " output_ids = input_ids.tolist()[0]\n", " output_ids[masked_index] = pred_id\n", " print(tokenizer.decode(output_ids))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "file_extension": ".py", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "mimetype": "text/x-python", "name": "python", "npconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3 }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: merge_split_corpora.py ================================================ # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse import gzip import logging import lzma import os import random from tqdm import tqdm logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) random.seed(0) def _open_file(filename): if filename.endswith(".xz"): return lzma.open(filename, "rt") elif filename.endswith(".gz"): return gzip.open(filename, "rt") else: return open(filename) def main(args): output_files = [] for i in range(1, args.num_files + 1): output_path = os.path.join(args.output_dir, f"corpus_{i:02d}.txt") output_file = open(output_path, "w") output_files.append(output_file) output_index = random.randint(1, args.num_files) for input_path in args.input_files: logger.info("Processing %s", input_path) with _open_file(input_path) as f: for line in tqdm(f): line = " ".join(line.strip().split()) print(line, file=output_files[output_index]) if line == "": output_index = random.randrange(args.num_files) for output_file in output_files: output_file.close() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--input_files", type=str, nargs="+", required=True) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument("--num_files", type=int, required=True) args = parser.parse_args() main(args) ================================================ FILE: model_configs/bert_base_character/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "max_position_embeddings": 512, "num_attention_heads": 12, "num_hidden_layers": 12, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: model_configs/bert_base_wordpiece/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, 
"intermediate_size": 3072, "max_position_embeddings": 512, "num_attention_heads": 12, "num_hidden_layers": 12, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: model_configs/bert_large_character/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "max_position_embeddings": 512, "num_attention_heads": 16, "num_hidden_layers": 24, "type_vocab_size": 2, "vocab_size": 7027 } ================================================ FILE: model_configs/bert_large_wordpiece/config.json ================================================ { "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 4096, "max_position_embeddings": 512, "num_attention_heads": 16, "num_hidden_layers": 24, "type_vocab_size": 2, "vocab_size": 32768 } ================================================ FILE: requirements.txt ================================================ flax==0.6.10 fugashi==1.2.1 ipadic==1.0.0 tensorflow==2.11.1 tqdm==4.64.1 tokenizers==0.13.2 torch==1.13.1 transformers==4.30.0 unidic==1.1.0 unidic_lite==1.0.8 ================================================ FILE: tokenization.py ================================================ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright 2023 Masatoshi Suzuki (@singletongue) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
import unicodedata

from transformers.models.bert_japanese.tokenization_bert_japanese import (
    BertJapaneseTokenizer as BertJapaneseTokenizerBase,
    CharacterTokenizer as CharacterTokenizerBase,
)


class BertJapaneseTokenizer(BertJapaneseTokenizerBase):
    """HF BertJapaneseTokenizer extended to support vocab files whose
    entries are stored WITHOUT the "##" subword-continuation prefix.

    When `vocab_has_no_subword_prefix` is True, id lookup strips a leading
    "##" from tokens before consulting the vocab.
    """

    def __init__(
        self,
        vocab_file,
        spm_file=None,
        do_lower_case=False,
        do_word_tokenize=True,
        do_subword_tokenize=True,
        word_tokenizer_type="basic",
        subword_tokenizer_type="wordpiece",
        vocab_has_no_subword_prefix=False,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        mecab_kwargs=None,
        sudachi_kwargs=None,
        jumanpp_kwargs=None,
        **kwargs,
    ):
        # All arguments except vocab_has_no_subword_prefix are forwarded
        # unchanged to the upstream transformers implementation.
        super().__init__(
            vocab_file,
            spm_file=spm_file,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            mecab_kwargs=mecab_kwargs,
            sudachi_kwargs=sudachi_kwargs,
            jumanpp_kwargs=jumanpp_kwargs,
            **kwargs,
        )
        self.vocab_has_no_subword_prefix = vocab_has_no_subword_prefix
        # Replace the upstream character tokenizer with the local subclass
        # below, which checks vocab membership on the bare (unprefixed) char.
        if do_subword_tokenize and subword_tokenizer_type == "character":
            self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def _convert_token_to_id(self, token):
        # Vocab stored without "##": strip the prefix before lookup so
        # continuation subwords still resolve; unknown tokens map to [UNK].
        if self.vocab_has_no_subword_prefix and token.startswith("##"):
            token = token[len("##"):]
        return self.vocab.get(token, self.vocab.get(self.unk_token))


class CharacterTokenizer(CharacterTokenizerBase):
    """Character tokenizer that emits "##"-prefixed tokens for non-initial
    characters but checks vocab membership on the bare character.

    This pairs with BertJapaneseTokenizer._convert_token_to_id above when
    the vocab omits "##" prefixes.
    """

    def __init__(self, vocab, unk_token, normalize_text=True):
        super().__init__(vocab, unk_token, normalize_text=normalize_text)

    def tokenize(self, text):
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for i, char in enumerate(text):
            # Membership is tested on the raw character — the vocab is
            # assumed to contain unprefixed characters only.
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            # Mark every non-initial character as a continuation subword.
            if i > 0:
                char = "##" + char

            output_tokens.append(char)

        return output_tokens
import argparse
import logging
import os

from japanese_tokenizers.implementations import JapaneseWordPieceTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main(args):
    """Train a Japanese WordPiece tokenizer and save its vocab to output_dir."""
    tokenizer = JapaneseWordPieceTokenizer(
        num_unused_tokens=args.num_unused_tokens,
        pre_tokenizer_type=args.pre_tokenizer_type,
        mecab_dic_type=args.mecab_dic_type,
    )
    # FIX: variable was misspelled "speical_tokens" (local name only).
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += [f"[unused{i}]" for i in range(args.num_unused_tokens)]

    if args.initial_alphabet_file is not None:
        logger.info("Loading the initial alphabet from file")
        # FIX: close the alphabet file deterministically (was a bare open()
        # left to the garbage collector).
        with open(args.initial_alphabet_file) as f:
            initial_alphabet = [line.rstrip("\n") for line in f]
        logger.info("The size of the initial alphabet: %d", len(initial_alphabet))
    else:
        initial_alphabet = []

    logger.info("Training the tokenizer")
    tokenizer.train(
        args.input_files,
        vocab_size=args.vocab_size,
        limit_alphabet=args.limit_alphabet,
        initial_alphabet=initial_alphabet,
        special_tokens=special_tokens,
        wordpieces_prefix=args.wordpieces_prefix,
    )

    logger.info("Saving the tokenizer to files")
    os.makedirs(args.output_dir, exist_ok=True)
    tokenizer.save_model(args.output_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_files", type=str, nargs="+", required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--pre_tokenizer_type", choices=("mecab", "whitespace"), required=True)
    parser.add_argument("--mecab_dic_type", choices=("unidic_lite", "unidic", "ipadic"), default="unidic_lite")
    parser.add_argument("--vocab_size", type=int, required=True)
    parser.add_argument("--limit_alphabet", type=int, default=1000)
    parser.add_argument("--initial_alphabet_file", type=str)
    parser.add_argument("--num_unused_tokens", type=int, default=10)
    parser.add_argument("--wordpieces_prefix", type=str, default="##")
    args = parser.parse_args()
    main(args)